How to use AttentionMechanism with MultiRNNCell and dynamic_decode? - tensorflow

I want to create a multi-layered dynamic RNN-based decoder that uses an attention mechanism. To do this, I first create an attention mechanism:
attention_mechanism = BahdanauAttention(num_units=ATTENTION_UNITS,
memory=encoder_outputs,
normalize=True)
Then I use the AttentionWrapper to wrap a LSTM cell with the attention mechanism:
attention_wrapper = AttentionWrapper(cell=self._create_lstm_cell(DECODER_SIZE),
attention_mechanism=attention_mechanism,
output_attention=False,
alignment_history=True,
attention_layer_size=ATTENTION_LAYER_SIZE)
where self._create_lstm_cell is defined as follows:
@staticmethod
def _create_lstm_cell(cell_size):
    """Return a new ``BasicLSTMCell`` with ``cell_size`` units.

    Declared as a static method: the surrounding snippets call it as
    ``self._create_lstm_cell(...)`` while the signature takes no ``self``.
    """
    return BasicLSTMCell(cell_size)
I then do some bookkeeping (e.g. creating my MultiRNNCell, creating an initial state, creating a TrainingHelper, etc.)
# Zero state of the attention-wrapped cell only (not of the whole layer stack).
attention_zero = attention_wrapper.zero_state(batch_size=tf.flags.FLAGS.batch_size, dtype=tf.float32)
# define initial state: keep the zero attention fields, replace the wrapped
# cell's state with the first entry of encoder_final_states
initial_state = attention_zero.clone(cell_state=encoder_final_states[0])
training_helper = TrainingHelper(inputs=self.y, # feed in ground truth
sequence_length=self.y_lengths) # feed in sequence lengths
# Layer stack: the attention wrapper first, then NUMBER_OF_DECODER_LAYERS - 1
# residual LSTM layers.
layered_cell = MultiRNNCell(
[attention_wrapper] + [ResidualWrapper(self._create_lstm_cell(cell_size=DECODER_SIZE))
for _ in range(NUMBER_OF_DECODER_LAYERS - 1)])
# NOTE(review): the decoder cell is the whole MultiRNNCell, but initial_state is
# the state of the attention wrapper alone - presumably this mismatch is what
# produces the reported AttributeError; confirm that MultiRNNCell expects one
# state per layer.
decoder = BasicDecoder(cell=layered_cell,
helper=training_helper,
initial_state=initial_state)
# Decoding is capped at max_number_of_scans // 12 steps.
decoder_outputs, decoder_final_state, decoder_final_sequence_lengths = dynamic_decode(decoder=decoder,
maximum_iterations=tf.flags.FLAGS.max_number_of_scans // 12,
impute_finished=True)
But I receive the following error: AttributeError: 'LSTMStateTuple' object has no attribute 'attention'.
What is the correct way to add an attention mechanism to a MultiRNNCell dynamic decoder?

Have you tried using the attention wrapper provided by tf.contrib?
Here is an example using both an attention wrapper and dropout:
# Build one LSTM cell per layer, each wrapped first with attention and then
# with output dropout.
cells = []
for _ in range(n_layers):
    lstm = tf.contrib.rnn.LSTMCell(n_hidden, state_is_tuple=True)
    attentive = tf.contrib.rnn.AttentionCellWrapper(
        lstm, attn_length=40, state_is_tuple=True)
    cells.append(tf.contrib.rnn.DropoutWrapper(attentive, output_keep_prob=0.5))

# Stack the layers and create the matching all-zero initial state.
cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
init_state = cell.zero_state(batch_size, tf.float32)

You need to create the multi-layer cell first and then wrap the whole stack in a single AttentionWrapper. Below is an example:
def decoding_layer(dec_input, encoder_state,
                   target_sequence_length, max_target_sequence_length,
                   rnn_size,
                   num_layers, target_vocab_to_int, target_vocab_size,
                   batch_size, keep_prob, decoding_embedding_size, encoder_outputs):
    """
    Create decoding layer

    The ``num_layers`` LSTM cells are stacked first and the whole stack is then
    wrapped in a single ``AttentionWrapper``.

    :param dec_input: Decoder input
    :param encoder_state: Encoder state
    :param target_sequence_length: The lengths of each sequence in the target batch
    :param max_target_sequence_length: Maximum length of target sequences
    :param rnn_size: RNN Size
    :param num_layers: Number of layers
    :param target_vocab_to_int: Dictionary to go from the target words to an id
    :param target_vocab_size: Size of target vocabulary
    :param batch_size: The size of the batch
    :param keep_prob: Dropout keep probability
    :param decoding_embedding_size: Decoding embedding size
    :param encoder_outputs: Encoder outputs, used as the attention memory
    :return: Tuple of (Training BasicDecoderOutput, Inference BasicDecoderOutput)
    """
    # 1. Decoder Embedding
    dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)

    # 2. Construct the decoder cell
    def create_cell(rnn_size):
        lstm_cell = tf.contrib.rnn.LSTMCell(
            rnn_size,
            initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=keep_prob)

    dec_cell = tf.contrib.rnn.MultiRNNCell([create_cell(rnn_size) for _ in range(num_layers)])

    # 3. Attention: wrap the stacked cell as a whole.
    attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units=rnn_size, memory=encoder_outputs)
    attn_cell = tf.contrib.seq2seq.AttentionWrapper(
        dec_cell, attention_mechanism,
        # Integer division: "/" would produce a float layer size on Python 3.
        attention_layer_size=rnn_size // 2)
    # Start from the wrapper's zero state and swap in the encoder state as the
    # state of the wrapped (stacked) cell.
    attn_zero = attn_cell.zero_state(batch_size, tf.float32)
    attn_zero = attn_zero.clone(cell_state=encoder_state)

    # 4. Projection to the target vocabulary
    output_layer = Dense(target_vocab_size,
                         kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

    # 5. Training and inference decoders use the same "decode" scope; the
    # second one is opened with reuse=True.
    with tf.variable_scope("decode"):
        train_decoder_out = decoding_layer_train(attn_zero, attn_cell, dec_embed_input,
                                                 target_sequence_length, max_target_sequence_length,
                                                 output_layer, keep_prob)
    with tf.variable_scope("decode", reuse=True):
        infer_decoder_out = decoding_layer_infer(attn_zero, attn_cell, dec_embeddings,
                                                 target_vocab_to_int['<GO>'], target_vocab_to_int['<EOS>'],
                                                 max_target_sequence_length,
                                                 target_vocab_size, output_layer, batch_size, keep_prob)
    return (train_decoder_out, infer_decoder_out)

Related

How to connect multi-layered Bi-directional LSTM encoder to a decoder?

I'm making a seq2seq model which uses a Bi-LSTM as the encoder and an attention mechanism in the decoder. With a single LSTM layer the model works fine. My encoder looks something like this.
Encoder:
def encoding_layer(self, rnn_inputs, rnn_size, num_layers, keep_prob,
                   source_vocab_size,
                   encoding_embedding_size,
                   source_sequence_length,
                   emb_matrix):
    """Single-layer bidirectional LSTM encoder.

    Embeds ``rnn_inputs`` with ``emb_matrix``, runs a bidirectional dynamic RNN
    and concatenates the forward and backward results.

    ``num_layers``, ``source_vocab_size`` and ``encoding_embedding_size`` are
    accepted but not used in this body.

    :return: ``(concat_outputs, encoder_final_state)`` - the outputs joined on
        axis 2 and an ``LSTMStateTuple`` whose ``c`` and ``h`` are the
        forward/backward states joined on axis 1
    """
    embedded = tf.nn.embedding_lookup(emb_matrix, rnn_inputs)

    # NOTE(review): one cell object is passed as both cell_fw and cell_bw -
    # confirm that sharing it between directions is intended.
    shared_cell = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(rnn_size), keep_prob)
    directional_outputs, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=shared_cell,
        cell_bw=shared_cell,
        inputs=embedded,
        sequence_length=source_sequence_length,
        dtype=tf.float32)

    concat_outputs = tf.concat(directional_outputs, 2)
    encoder_final_state = tf.nn.rnn_cell.LSTMStateTuple(
        c=tf.concat([state_fw.c, state_bw.c], 1),
        h=tf.concat([state_fw.h, state_bw.h], 1))
    return concat_outputs, encoder_final_state
Decoder :
def decoding_layer_train(self, encoder_outputs, encoder_state, dec_cell, dec_embed_input,
                         target_sequence_length, max_summary_length,
                         output_layer, keep_prob, rnn_size, batch_size,
                         source_sequence_length=None):
    """
    Build the attention-based training decoder and run it

    :param encoder_outputs: Encoder outputs, used as the attention memory
    :param encoder_state: Encoder final state, used as the decoder cell's initial state
    :param dec_cell: Ignored - a new decoder cell is always built in this function
    :param dec_embed_input: Embedded decoder input
    :param target_sequence_length: The lengths of each sequence in the target batch
    :param max_summary_length: Maximum number of decoding iterations
    :param output_layer: Layer applied to the decoder output
    :param keep_prob: Dropout keep probability
    :param rnn_size: Encoder RNN size (doubled internally)
    :param batch_size: The size of the batch
    :param source_sequence_length: Lengths of the source sequences, used to mask
        the attention memory. When omitted, ``target_sequence_length`` is used
        as before, for backward compatibility.
    :return: The first value returned by ``dynamic_decode``
    """
    # Presumably doubled to match the concatenated forward/backward encoder state.
    rnn_size = 2 * rnn_size
    # The dec_cell argument is overwritten here.
    dec_cell = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(rnn_size), keep_prob)
    train_helper = tf.contrib.seq2seq.TrainingHelper(dec_embed_input, target_sequence_length)

    # The memory is the encoder output, so its mask should follow the source
    # lengths; fall back to the previous behaviour when none are supplied.
    if source_sequence_length is None:
        memory_lengths = target_sequence_length
    else:
        memory_lengths = source_sequence_length
    attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
        rnn_size, encoder_outputs,
        memory_sequence_length=memory_lengths)
    attention_cell = tf.contrib.seq2seq.AttentionWrapper(
        dec_cell, attention_mechanism,
        # Integer division: "/" would produce a float layer size on Python 3.
        attention_layer_size=rnn_size // 2)

    # Zero attention state with the encoder state as the wrapped cell's state.
    state = attention_cell.zero_state(dtype=tf.float32, batch_size=batch_size)
    state = state.clone(cell_state=encoder_state)
    decoder = tf.contrib.seq2seq.BasicDecoder(cell=attention_cell, helper=train_helper,
                                              initial_state=state,
                                              output_layer=output_layer)
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder, impute_finished=True, maximum_iterations=max_summary_length)
    return outputs
With the above single-layer Bi-LSTM configuration my model works fine. But now I want to use a multi-layered Bi-LSTM encoder and decoder. So, in the encoder and decoder, I change the cell to:
stacked_cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(rnn_size), keep_prob) for _ in range(num_layers)])
After changing cell I am getting this error:
AttributeError: 'tuple' object has no attribute 'c'
here,
num_layers = 2
rnn_size = 128
embedding_size = 50
So, I want to know what exactly is returned as the state in the second case, and how to pass that state to the decoder.
Full code: https://github.com/sainimohit23/Text-Summarization

Multilayer Seq2Seq model with LSTM in Keras

I was making a seq2seq model in keras. I had built single layer encoder and decoder and they were working fine. But now I want to extend it to multi layer encoder and decoder.
I am building it using Keras Functional API.
Training:-
Code for encoder:-
encoder_input=Input(shape=(None,vec_dimension))
encoder_lstm=LSTM(vec_dimension,return_state=True,return_sequences=True)(encoder_input)
encoder_lstm=LSTM(vec_dimension,return_state=True)(encoder_lstm)
encoder_output,encoder_h,encoder_c=encoder_lstm
Code for decoder:-
# The decoder starts from the encoder's final hidden and cell state.
encoder_state = [encoder_h, encoder_c]
decoder_input = Input(shape=(None, vec_dimension))
# First decoder layer. The closing parenthesis of the LSTM(...) constructor
# was missing, which made this line a syntax error.
decoder_lstm = LSTM(vec_dimension, return_state=True, return_sequences=True)(decoder_input, initial_state=encoder_state)
# Second decoder layer, fed with whatever the first layer call returned.
decoder_lstm = LSTM(vec_dimension, return_state=True, return_sequences=True)(decoder_lstm)
decoder_output, _, _ = decoder_lstm
For testing :-
encoder_model=Model(inputs=encoder_input,outputs=encoder_state)
decoder_state_input_h=Input(shape=(None,vec_dimension))
decoder_state_input_c=Input(shape=(None,vec_dimension))
decoder_states_input=[decoder_state_input_h,decoder_state_input_c]
decoder_output,decoder_state_h,decoder_state_c =decoder_lstm #(decoder_input,initial_state=decoder_states_input)
decoder_states=[decoder_state_h,decoder_state_c]
decoder_model=Model(inputs=[decoder_input]+decoder_states_input,outputs=[decoder_output]+decoder_states)
Now when I try to increase the number of layers in the decoder, training works fine, but testing doesn't work and throws an error.
Actually, the problem is that when making it multi-layer I shifted the initial_state to a middle layer, whereas it used to be specified at the end. So
when I call it during testing, it throws errors.
RuntimeError: Graph disconnected: cannot obtain value for tensor Tensor("input_64:0", shape=(?, ?, 150), dtype=float32) at layer "input_64".The following previous layers were accessed without issue: []
How should I pass initial_state=decoder_states_input, which is meant for the input layer, so that it doesn't throw an error?
How should I pass initial_state=decoder_states_input in the final layer for the first Input layer?
EDIT:-
In that code I have tried to make multiple layers of decoder LSTM. But that's giving error.
When working with single layer.The correct codes are:-
Encoder(Training):-
encoder_input=Input(shape=(None,vec_dimension))
encoder_lstm =LSTM(vec_dimension,return_state=True)(encoder_input)
encoder_output,encoder_h,encoder_c=encoder_lstm
Decoder(Training):-
encoder_state=[encoder_h,encoder_c]
decoder_input=Input(shape=(None,vec_dimension))
decoder_lstm= LSTM(vec_dimension, return_state=True, return_sequences=True)
decoder_output,_,_=decoder_lstm(decoder_input,initial_state=encoder_state)
Decoder(Testing)
decoder_output,decoder_state_h,decoder_state_c=decoder_lstm( decoder_input, initial_state=decoder_states_input)
decoder_states=[decoder_state_h,decoder_state_c]
decoder_output,decoder_state_h,decoder_state_c=decoder_lstm (decoder_input,initial_state=decoder_states_input)
decoder_model=Model(inputs=[decoder_input]+decoder_states_input,outputs=[decoder_output]+decoder_states)
EDIT - Updated to use the functional API model in Keras vs. the RNN
from keras.models import Model
from keras.layers import Input, LSTM, Dense, RNN
layers = [256,128] # NOTE(review): not referenced anywhere in the visible code - looks like a leftover from an earlier LSTMCell/RNN version; confirm before removing
# Encoder: two stacked LSTMs. The first returns the full sequence for the
# second; both return their final states.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
e_outputs, h1, c1 = LSTM(latent_dim, return_state=True, return_sequences=True)(encoder_inputs)
_, h2, c2 = LSTM(latent_dim, return_state=True)(e_outputs)
# States are kept in layer order: [h, c] of layer 1, then [h, c] of layer 2.
encoder_states = [h1, c1, h2, c2]
# Decoder: two stacked LSTMs. The layer objects are kept in out_layer1 /
# out_layer2 so they can be called again on other inputs.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
out_layer1 = LSTM(latent_dim, return_sequences=True, return_state=True)
# Decoder layer 1 starts from the state of encoder layer 1.
d_outputs, dh1, dc1 = out_layer1(decoder_inputs,initial_state= [h1, c1])
out_layer2 = LSTM(latent_dim, return_sequences=True, return_state=True)
# Decoder layer 2 starts from the state of encoder layer 2.
final, dh2, dc2 = out_layer2(d_outputs, initial_state= [h2, c2])
# Softmax over the decoder tokens, applied to the second layer's output.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(final)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()
And here is the inference setup:
encoder_model = Model(encoder_inputs, encoder_states)
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_state_input_h1 = Input(shape=(latent_dim,))
decoder_state_input_c1 = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c,
decoder_state_input_h1, decoder_state_input_c1]
d_o, state_h, state_c = out_layer1(
decoder_inputs, initial_state=decoder_states_inputs[:2])
d_o, state_h1, state_c1 = out_layer2(
d_o, initial_state=decoder_states_inputs[-2:])
decoder_states = [state_h, state_c, state_h1, state_c1]
decoder_outputs = decoder_dense(d_o)
decoder_model = Model(
[decoder_inputs] + decoder_states_inputs,
[decoder_outputs] + decoder_states)
decoder_model.summary()
Lastly, if you are following the Keras seq2seq example, you will have to change the prediction script, as there are multiple hidden states that need to be managed vs. just two of them in the single-layer example. There will be twice as many state tensors as layers (one h and one c per layer).
# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
(i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
(i, char) for char, i in target_token_index.items())
def decode_sequence(input_seq):
    """Greedily decode ``input_seq`` with the two-layer inference models.

    Uses the module-level ``encoder_model`` / ``decoder_model`` and token
    tables. Decoding stops when the newline character is sampled or when the
    decoded text becomes longer than ``max_decoder_seq_length``.

    :param input_seq: Input passed to ``encoder_model.predict``
        (the loop assumes a batch of size 1)
    :return: The decoded string
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Length-1 target sequence holding only the start character.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.

    decoded_sentence = ''
    while True:
        # Four state tensors come back: h and c for each of the two layers.
        output_tokens, h, c, h1, c1 = decoder_model.predict(
            [target_seq] + states_value)

        # Take the most probable token.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Stop on the stop character or once the maximum length is exceeded.
        if sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length:
            break

        # Feed the sampled token and the new states into the next step.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c, h1, c1]
    return decoded_sentence
for seq_index in range(100):
# Take one sequence (part of the training set)
# for trying out decoding.
input_seq = encoder_input_data[seq_index: seq_index + 1]
decoded_sentence = decode_sequence(input_seq)
print('-')
print('Input sentence:', input_texts[seq_index])
print('Target sentence:', target_texts[seq_index])
print('Decoded sentence:', decoded_sentence)
I've generalized Jeremy Wortz's awesome answer to create the model from a list, 'latent_dims', which will be 'len(latent_dims)' deep, as opposed to a fixed 2-deep.
Starting with the 'latent_dims' declaration:
# latent_dims is a list which defines the depth of the encoder/decoder, as well as how large
# the layers should be: one entry per layer.
# NOTE(review): the training code that follows walks this list from the last entry to the first for
# BOTH stacks, so [a,b,c] yields layer sizes c, b, a in the encoder and in the decoder alike.
latent_dims = [1024, 512, 256]
Creating the model for training:
# Encoder: one LSTM per entry of latent_dims, created from the last entry to
# the first. Every layer but the final one (j == 0) returns its full sequence
# so the next layer can consume it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
outputs = encoder_inputs
encoder_states = []
for j in reversed(range(len(latent_dims))):
    outputs, h, c = LSTM(latent_dims[j], return_state=True, return_sequences=bool(j))(outputs)
    encoder_states.extend([h, c])

# Decoder: layer j has the same size as the j-th encoder layer created above
# and is initialised with that layer's [h, c] pair.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
outputs = decoder_inputs
output_layers = []
for j in range(len(latent_dims)):
    decoder_layer = LSTM(latent_dims[len(latent_dims) - j - 1], return_sequences=True, return_state=True)
    output_layers.append(decoder_layer)
    outputs, dh, dc = decoder_layer(outputs, initial_state=encoder_states[2*j:2*(j+1)])

decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
For inference it's as follows:
# Define sampling models (modified for n-layer deep network)
encoder_model = Model(encoder_inputs, encoder_states)
d_outputs = decoder_inputs
decoder_states_inputs = []
decoder_states = []
for j in range(len(latent_dims))[::-1]:
current_state_inputs = [Input(shape=(latent_dims[j],)) for _ in range(2)]
temp = output_layers[len(latent_dims)-j-1](d_outputs, initial_state=current_state_inputs)
d_outputs, cur_states = temp[0], temp[1:]
decoder_states += cur_states
decoder_states_inputs += current_state_inputs
decoder_outputs = decoder_dense(d_outputs)
decoder_model = Model(
[decoder_inputs] + decoder_states_inputs,
[decoder_outputs] + decoder_states)
And finally a few modifications to Jeremy Wortz's 'decode_sequence' function are implemented to get the following:
def decode_sequence(input_seq, encoder_model, decoder_model):
    """Greedily decode ``input_seq`` with inference models of any depth.

    Every value ``decoder_model.predict`` returns after the first one is fed
    back as state on the next step, so the number of layers is not fixed here.

    :param input_seq: Input passed to ``encoder_model.predict``
        (the loop assumes a batch of size 1)
    :param encoder_model: Model producing the initial list of state values
    :param decoder_model: Model mapping ``[target_seq] + states`` to
        ``[output_tokens] + new states``
    :return: The decoded string
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Length-1 target sequence holding only the start character.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.

    # Characters are collected in a list and joined once at the end.
    sampled_chars = []
    finished = False
    while not finished:
        prediction = decoder_model.predict([target_seq] + states_value)
        output_tokens = prediction[0]
        states_value = prediction[1:]

        # Take the most probable token.
        sampled_token_index = np.argmax(output_tokens[0, 0])
        sampled_char = reverse_target_char_index[sampled_token_index]
        sampled_chars.append(sampled_char)

        # Stop on the stop character or once the maximum length is exceeded.
        finished = (sampled_char == '\n'
                    or len(sampled_chars) > max_decoder_seq_length)

        # Feed the sampled token into the next step.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
    return "".join(sampled_chars)

How to use tf.contrib.seq2seq.Helper for non-embedding data?

I'm trying to use tf.contrib.seq2seq module to do forecasting on some data (just float32 vectors) but all the examples I found using the seq2seq module from TensorFlow are used for translation and therefore embeddings.
I'm struggling to understand exactly what tf.contrib.seq2seq.Helper is doing for the Seq2Seq architecture and how I can use the CustomHelper in my case.
This is what I've done for now:
import tensorflow as tf
from tensorflow.python.layers import core as layers_core
input_seq_len = 15 # Sequence length as input
input_dim = 1 # Nb of features in input
output_seq_len = forecast_len = 20 # horizon length for forecasting
output_dim = 1 # nb of features to forecast
encoder_units = 200 # nb of units in each cell for the encoder
decoder_units = 200 # nb of units in each cell for the decoder
attention_units = 100
batch_size = 8
graph = tf.Graph()
with graph.as_default():
learning_ = tf.placeholder(tf.float32)
with tf.variable_scope('Seq2Seq'):
# Placeholder for encoder input
enc_input = tf.placeholder(tf.float32, [None, input_seq_len, input_dim])
# Placeholder for decoder output - Targets
target = tf.placeholder(tf.float32, [None, output_seq_len, output_dim])
### BUILD THE ENCODER
# Build RNN cell
encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_units)
initial_state = encoder_cell.zero_state(batch_size, dtype=tf.float32)
# Run Dynamic RNN
# encoder_outputs: [batch_size, seq_size, num_units]
# encoder_state: [batch_size, num_units]
encoder_outputs, encoder_state = tf.nn.dynamic_rnn(encoder_cell, enc_input, initial_state=initial_state)
## Attention layer
attention_mechanism_bahdanau = tf.contrib.seq2seq.BahdanauAttention(
num_units = attention_units, # depth of query mechanism
memory = encoder_outputs, # hidden states to attend (output of RNN)
normalize=False, # normalize energy term
name='BahdanauAttention')
attention_mechanism_luong = tf.contrib.seq2seq.LuongAttention(
num_units = encoder_units,
memory = encoder_outputs,
scale=False,
name='LuongAttention'
)
### BUILD THE DECODER
# Simple Dense layer to project from rnn_dim to the desired output_dim
projection = layers_core.Dense(output_dim, use_bias=True, name="output_projection")
helper = tf.contrib.seq2seq.TrainingHelper(target, sequence_length=[output_seq_len for _ in range(batch_size)])
## This is where I don't really know what to do in my case, is this function changing my data into [ GO, data, END] ?
decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(decoder_units)
attention_cell = tf.contrib.seq2seq.AttentionWrapper(
cell = decoder_cell,
attention_mechanism = attention_mechanism_luong, # Instance of AttentionMechanism
attention_layer_size = attention_units,
name="attention_wrapper")
initial_state = attention_cell.zero_state(batch_size=batch_size, dtype=tf.float32)
initial_state = initial_state.clone(cell_state=encoder_state)
decoder = tf.contrib.seq2seq.BasicDecoder(attention_cell, initial_state=initial_state, helper=helper, output_layer=projection)
outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=decoder)
# Loss function:
loss = 0.5*tf.reduce_sum(tf.square(outputs[0] - target), -1)
loss = tf.reduce_mean(loss, 1)
loss = tf.reduce_mean(loss)
# Optimizer
optimizer = tf.train.AdamOptimizer(learning_).minimize(loss)
I understood that Training state and Inference state are quite different for the Seq2seq architecture but I don't know how to use the Helpers from the module in order to distinguish both.
I'm using this module because it's quite useful for Attention Layers.
How can I use the Helper in order to create a ['Go' , [input_sequence]] for the decoder ?

How to use multilayered bidirectional LSTM in Tensorflow?

I want to know how to use multilayered bidirectional LSTM in Tensorflow.
I have already implemented a bidirectional LSTM, but I want to compare this model with a multi-layer version of it.
How should I add some code in this part?
x = tf.unstack(tf.transpose(x, perm=[1, 0, 2]))
#print(x[0].get_shape())
# Define lstm cells with tensorflow
# Forward direction cell
lstm_fw_cell = rnn.BasicLSTMCell(n_hidden, forget_bias=1.0)
# Backward direction cell
lstm_bw_cell = rnn.BasicLSTMCell(n_hidden, forget_bias=1.0)
# Get lstm cell output
try:
outputs, _, _ = rnn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, x,
dtype=tf.float32)
except Exception: # Old TensorFlow version only returns outputs not states
outputs = rnn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, x,
dtype=tf.float32)
# Linear activation, using rnn inner loop last output
outputs = tf.stack(outputs, axis=1)
outputs = tf.reshape(outputs, (batch_size*n_steps, n_hidden*2))
outputs = tf.matmul(outputs, weights['out']) + biases['out']
outputs = tf.reshape(outputs, (batch_size, n_steps, n_classes))
You can use two different approaches to apply multilayer bilstm model:
1) Use the output of the previous BiLSTM layer as the input to the next BiLSTM layer. First create lists of forward and backward cells (cell_forw and cell_back), each of length num_layers, then:
for n in range(num_layers):
cell_fw = cell_forw[n]
cell_bw = cell_back[n]
state_fw = cell_fw.zero_state(batch_size, tf.float32)
state_bw = cell_bw.zero_state(batch_size, tf.float32)
(output_fw, output_bw), last_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, output,
initial_state_fw=state_fw,
initial_state_bw=state_bw,
scope='BLSTM_'+ str(n),
dtype=tf.float32)
output = tf.concat([output_fw, output_bw], axis=2)
2) Another approach worth a look is the stacked BiLSTM.
This is primarily same as the first answer but with a little variation of usage of scope name and with added dropout wrappers. It also takes care of the error the first answer gives about variable scope.
def bidirectional_lstm(input_data, num_layers, rnn_size, keep_prob):
    """Stack ``num_layers`` bidirectional LSTM layers.

    Each layer is built inside its own ``encoder_<layer>`` variable scope, and
    its forward and backward outputs are concatenated on axis 2 to form the
    input of the next layer.

    :param input_data: Input to the first layer
    :param num_layers: Number of bidirectional layers
    :param rnn_size: Number of units of every LSTM cell
    :param keep_prob: Input keep probability of the dropout wrappers
    :return: The concatenated output of the last layer
    """
    def _dropout_lstm():
        # NOTE(review): the two positional arguments used originally are the
        # mean and the standard deviation, not a (min, max) range - confirm
        # that mean=-0.1 / stddev=0.1 is what was intended.
        init = tf.truncated_normal_initializer(mean=-0.1, stddev=0.1, seed=2)
        lstm = tf.contrib.rnn.LSTMCell(rnn_size, initializer=init)
        return tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob=keep_prob)

    output = input_data
    for layer in range(num_layers):
        # A separate variable scope per layer keeps the weights from being
        # shared among the layers. To share them, use one common scope name
        # (e.g. "encoder") and keep reuse=tf.AUTO_REUSE.
        with tf.variable_scope('encoder_{}'.format(layer), reuse=tf.AUTO_REUSE):
            cell_fw = _dropout_lstm()
            cell_bw = _dropout_lstm()
            outputs, states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, output, dtype=tf.float32)
            # Concat the forward and backward outputs
            output = tf.concat(outputs, 2)
    return output
On top of Taras's answer. Here is another example using just 2-layer Bidirectional RNN with GRU cells
embedding_weights = tf.Variable(tf.random_uniform([vocabulary_size, state_size], -1.0, 1.0))
embedding_vectors = tf.nn.embedding_lookup(embedding_weights, tokens)
#First BLSTM
cell = tf.nn.rnn_cell.GRUCell(state_size)
cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=1-dropout)
(forward_output, backward_output), _ = \
tf.nn.bidirectional_dynamic_rnn(cell, cell, inputs=embedding_vectors,
sequence_length=lengths, dtype=tf.float32,scope='BLSTM_1')
outputs = tf.concat([forward_output, backward_output], axis=2)
#Second BLSTM using the output of previous layer as an input.
cell2 = tf.nn.rnn_cell.GRUCell(state_size)
cell2 = tf.nn.rnn_cell.DropoutWrapper(cell2, output_keep_prob=1-dropout)
(forward_output, backward_output), _ = \
tf.nn.bidirectional_dynamic_rnn(cell2, cell2, inputs=outputs,
sequence_length=lengths, dtype=tf.float32,scope='BLSTM_2')
outputs = tf.concat([forward_output, backward_output], axis=2)
BTW, don't forget to use a different scope name for each layer. Hope this helps.
As @Taras pointed out, you can use:
(1) tf.nn.bidirectional_dynamic_rnn()
(2) tf.contrib.rnn.stack_bidirectional_dynamic_rnn().
All previous answers only capture (1), so I give some details on (2), in particular since it usually outperforms (1). For an intuition about the different connectivities
see here.
Let's say you want to create a stack of 3 BLSTM layers, each with 64 nodes:
num_layers = 3
num_nodes = 64
# Define LSTM cells
enc_fw_cells = [LSTMCell(num_nodes)for layer in range(num_layers)]
enc_bw_cells = [LSTMCell(num_nodes) for layer in range(num_layers)]
# Connect LSTM cells bidirectionally and stack
(all_states, fw_state, bw_state) = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
cells_fw=enc_fw_cells, cells_bw=enc_bw_cells, inputs=input_embed, dtype=tf.float32)
# Concatenate results
for k in range(num_layers):
if k == 0:
con_c = tf.concat((fw_state[k].c, bw_state[k].c), 1)
con_h = tf.concat((fw_state[k].h, bw_state[k].h), 1)
else:
con_c = tf.concat((con_c, fw_state[k].c, bw_state[k].c), 1)
con_h = tf.concat((con_h, fw_state[k].h, bw_state[k].h), 1)
output = tf.contrib.rnn.LSTMStateTuple(c=con_c, h=con_h)
In this case, I use the final states of the stacked biRNN rather than the states at all timesteps (saved in all_states), since I was using an encoding decoding scheme, where the above code was only the encoder.

character level bidirectional language model in tensorflow

Inspired from Andrej Karpathy Char-RNN, There is a Tensorflow implementation of char-rnn sherjilozair/char-rnn-tensorflow: Multi-layer Recurrent Neural Networks (LSTM, RNN) for character-level language models in Python using Tensorflow. I want to implement bidirectional character level language model from this code. I change the model.py and wrote a simple code:
# Bidirectional character-level language model.
# NOTE(review): leading indentation is missing from this listing, so the extent
# of the `with` and `if training:` bodies cannot be read off the text.
class Model:
def __init__(self, input_data, targets, seq_length=Config.max_seq_length, training=True):
# Pick the RNN cell class named by Config.model; unknown names raise.
if Config.model == 'rnn':
cell_fn = rnn.BasicRNNCell
elif Config.model == 'gru':
cell_fn = rnn.GRUCell
elif Config.model == 'lstm':
cell_fn = rnn.BasicLSTMCell
elif Config.model == 'nas':
cell_fn = rnn.NASCell
else:
raise Exception("model type not supported: {}".format(Config.model))
# One forward and one backward cell per layer, stacked into two MultiRNNCells.
fw_cells = []
bw_cells = []
for _ in range(Config.num_layers):
fw_cell = cell_fn(Config.rnn_size)
bw_cell = cell_fn(Config.rnn_size)
fw_cells.append(fw_cell)
bw_cells.append(bw_cell)
self.fw_cell = rnn.MultiRNNCell(fw_cells, state_is_tuple=True)
self.bw_cell = rnn.MultiRNNCell(bw_cells, state_is_tuple=True)
self.input_data, self.targets = input_data, targets
with tf.variable_scope('rnnlm'):
# The projection takes rnn_size*2 input features, matching the reshape of
# the bidirectional outputs to [-1, rnn_size*2] further down.
softmax_w = tf.get_variable("softmax_w", [Config.rnn_size*2, Config.vocab_size])
softmax_b = tf.get_variable("softmax_b", [Config.vocab_size])
embedding = tf.get_variable("embedding", [Config.vocab_size, Config.rnn_size])
inputs = tf.nn.embedding_lookup(embedding, self.input_data)
# Split along axis 1 into a list of seq_length tensors for the static RNN.
inputs = tf.unstack(inputs, num=seq_length, axis=1)
outputs, _, _ = tf.nn.static_bidirectional_rnn(self.fw_cell, self.bw_cell, inputs,
dtype=tf.float32, scope='rnnlm')
# NOTE(review): the logits are computed directly from the bidirectional outputs -
# confirm that the backward direction cannot already see the character being predicted.
output = tf.reshape(tf.concat(outputs, 1), [-1, Config.rnn_size*2])
self.logits = tf.matmul(output, softmax_w) + softmax_b
self.probs = tf.nn.softmax(self.logits)
# Learning rate variable, created with value 0.0.
# NOTE(review): no assignment to it is visible in this block - confirm it is set before training.
self.lr = tf.Variable(0.0, trainable=False)
if training:
# Per-example loss weights are sign(target id) - presumably this masks id 0 as padding; verify.
loss = legacy_seq2seq.sequence_loss_by_example(
[self.logits],
[tf.reshape(self.targets, [-1])],
[tf.sign(tf.cast(tf.reshape(self.targets, [-1]), dtype=tf.float32))])
with tf.name_scope('cost'):
self.cost = tf.reduce_mean(loss)
# Gradients of the cost w.r.t. all trainable variables, clipped by global norm.
tvars = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), Config.grad_clip)
with tf.name_scope('optimizer'):
optimizer = tf.train.AdamOptimizer(self.lr)
self.train_op = optimizer.apply_gradients(zip(grads, tvars))
In the training phase, I see fast convergence. After nearly 3000 iterations, the loss reaches 0.003. In the test phase, the probability of every character is 1.0. I think there is a mistake.
I would be glad to get some help finding my mistake.
You should use the preceding and following outputs to predict the probability of the current word. In your case, you used the current RNN output to predict the probability of the current word.
Looks like you set self.lr = tf.Variable(0.0, trainable=False). Try changing this to a nonzero value. If you are reading probabilities from self.probs during the testing phase, this should be normalized appropriately.