Multilayer Seq2Seq model with LSTM in Keras - tensorflow

I was making a seq2seq model in keras. I had built single layer encoder and decoder and they were working fine. But now I want to extend it to multi layer encoder and decoder.
I am building it using Keras Functional API.
Training:-
Code for encoder:-
encoder_input=Input(shape=(None,vec_dimension))
encoder_lstm=LSTM(vec_dimension,return_state=True,return_sequences=True)(encoder_input)
encoder_lstm=LSTM(vec_dimension,return_state=True)(encoder_lstm)
encoder_output,encoder_h,encoder_c=encoder_lstm
Code for decoder:-
encoder_state=[encoder_h,encoder_c]
decoder_input=Input(shape=(None,vec_dimension))
decoder_lstm= LSTM(vec_dimension,return_state=True,return_sequences=True (decoder_input,initial_state=encoder_state)
decoder_lstm=LSTM(vec_dimension,return_state=True,return_sequences=True)(decoder_lstm)
decoder_output,_,_=decoder_lstm
For testing :-
encoder_model=Model(inputs=encoder_input,outputs=encoder_state)
decoder_state_input_h=Input(shape=(None,vec_dimension))
decoder_state_input_c=Input(shape=(None,vec_dimension))
decoder_states_input=[decoder_state_input_h,decoder_state_input_c]
decoder_output,decoder_state_h,decoder_state_c =decoder_lstm #(decoder_input,initial_state=decoder_states_input)
decoder_states=[decoder_state_h,decoder_state_c]
decoder_model=Model(inputs=[decoder_input]+decoder_states_input,outputs=[decoder_output]+decoder_states)
Now when I try to increase the no. of layers in the decoder for training then training works fine but for testing it dosen't works and throws error.
Actually the problem is when making it multi layer i had shifted the initial_state to a middle layer which used to be specified at the end.So
when I am calling it during testing, it is throwing errors.
RuntimeError: Graph disconnected: cannot obtain value for tensor Tensor("input_64:0", shape=(?, ?, 150), dtype=float32) at layer "input_64".The following previous layers were accessed without issue: []
How should I pass the initial_state=decoder_states_input which is for the input layer so that it doesn't throws error.
How should I pass the initial_state=decoder_states_input in the end layer for for the first Input layer??
EDIT:-
In that code I have tried to make multiple layers of decoder LSTM. But that's giving error.
When working with single layer.The correct codes are:-
Encoder(Training):-
encoder_input=Input(shape=(None,vec_dimension))
encoder_lstm =LSTM(vec_dimension,return_state=True)(encoder_input)
encoder_output,encoder_h,encoder_c=encoder_lstm
Decoder(Training):-
encoder_state=[encoder_h,encoder_c]
decoder_input=Input(shape=(None,vec_dimension))
decoder_lstm= LSTM(vec_dimension, return_state=True, return_sequences=True)
decoder_output,_,_=decoder_lstm(decoder_input,initial_state=encoder_state)
Decoder(Testing)
decoder_output,decoder_state_h,decoder_state_c=decoder_lstm( decoder_input, initial_state=decoder_states_input)
decoder_states=[decoder_state_h,decoder_state_c]
decoder_output,decoder_state_h,decoder_state_c=decoder_lstm (decoder_input,initial_state=decoder_states_input)
decoder_model=Model(inputs=[decoder_input]+decoder_states_input,outputs=[decoder_output]+decoder_states)

EDIT - Updated to use the functional API model in Keras vs. the RNN
from keras.models import Model
from keras.layers import Input, LSTM, Dense, RNN
layers = [256,128] # we loop LSTMCells then wrap them in an RNN layer
encoder_inputs = Input(shape=(None, num_encoder_tokens))
e_outputs, h1, c1 = LSTM(latent_dim, return_state=True, return_sequences=True)(encoder_inputs)
_, h2, c2 = LSTM(latent_dim, return_state=True)(e_outputs)
encoder_states = [h1, c1, h2, c2]
decoder_inputs = Input(shape=(None, num_decoder_tokens))
out_layer1 = LSTM(latent_dim, return_sequences=True, return_state=True)
d_outputs, dh1, dc1 = out_layer1(decoder_inputs,initial_state= [h1, c1])
out_layer2 = LSTM(latent_dim, return_sequences=True, return_state=True)
final, dh2, dc2 = out_layer2(d_outputs, initial_state= [h2, c2])
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(final)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()
And here is the inference setup:
encoder_model = Model(encoder_inputs, encoder_states)
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_state_input_h1 = Input(shape=(latent_dim,))
decoder_state_input_c1 = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c,
decoder_state_input_h1, decoder_state_input_c1]
d_o, state_h, state_c = out_layer1(
decoder_inputs, initial_state=decoder_states_inputs[:2])
d_o, state_h1, state_c1 = out_layer2(
d_o, initial_state=decoder_states_inputs[-2:])
decoder_states = [state_h, state_c, state_h1, state_c1]
decoder_outputs = decoder_dense(d_o)
decoder_model = Model(
[decoder_inputs] + decoder_states_inputs,
[decoder_outputs] + decoder_states)
decoder_model.summary()
Lastly, if you are following the Keras seq2seq example, you will have to change the prediction script as there are multiple hidden states that need to be managed vs. just two of them in the single-layer example. There will be 2x the number of layer hidden states
# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
(i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
(i, char) for char, i in target_token_index.items())
def decode_sequence(input_seq):
# Encode the input as state vectors.
states_value = encoder_model.predict(input_seq)
# Generate empty target sequence of length 1.
target_seq = np.zeros((1, 1, num_decoder_tokens))
# Populate the first character of target sequence with the start character.
target_seq[0, 0, target_token_index['\t']] = 1.
# Sampling loop for a batch of sequences
# (to simplify, here we assume a batch of size 1).
stop_condition = False
decoded_sentence = ''
while not stop_condition:
output_tokens, h, c, h1, c1 = decoder_model.predict(
[target_seq] + states_value) #######NOTICE THE ADDITIONAL HIDDEN STATES
# Sample a token
sampled_token_index = np.argmax(output_tokens[0, -1, :])
sampled_char = reverse_target_char_index[sampled_token_index]
decoded_sentence += sampled_char
# Exit condition: either hit max length
# or find stop character.
if (sampled_char == '\n' or
len(decoded_sentence) > max_decoder_seq_length):
stop_condition = True
# Update the target sequence (of length 1).
target_seq = np.zeros((1, 1, num_decoder_tokens))
target_seq[0, 0, sampled_token_index] = 1.
# Update states
states_value = [h, c, h1, c1]#######NOTICE THE ADDITIONAL HIDDEN STATES
return decoded_sentence
for seq_index in range(100):
# Take one sequence (part of the training set)
# for trying out decoding.
input_seq = encoder_input_data[seq_index: seq_index + 1]
decoded_sentence = decode_sequence(input_seq)
print('-')
print('Input sentence:', input_texts[seq_index])
print('Target sentence:', target_texts[seq_index])
print('Decoded sentence:', decoded_sentence)

I've generalized Jeremy Wortz's awesome answer to create the model from a list, 'latent_dims', which will be 'len(latent_dims)' deep, as opposed to a fixed 2-deep.
Starting with the 'latent_dims' declaration:
# latent_dims is an array which defines the depth of the encoder/decoder, as well as how large
# the layers should be. So an array of sizes [a,b,c] would produce a depth-3 encoder and decoder
# with layer sizes equal to [a,b,c] and [c,b,a] respectively.
latent_dims = [1024, 512, 256]
Creating the model for training:
# Define an input sequence and process it by going through a len(latent_dims)-layer deep encoder
encoder_inputs = Input(shape=(None, num_encoder_tokens))
outputs = encoder_inputs
encoder_states = []
for j in range(len(latent_dims))[::-1]:
outputs, h, c = LSTM(latent_dims[j], return_state=True, return_sequences=bool(j))(outputs)
encoder_states += [h, c]
# Set up the decoder, setting the initial state of each layer to the state of the layer in the encoder
# which is it's mirror (so for encoder: a->b->c, you'd have decoder initial states: c->b->a).
decoder_inputs = Input(shape=(None, num_decoder_tokens))
outputs = decoder_inputs
output_layers = []
for j in range(len(latent_dims)):
output_layers.append(
LSTM(latent_dims[len(latent_dims) - j - 1], return_sequences=True, return_state=True)
)
outputs, dh, dc = output_layers[-1](outputs, initial_state=encoder_states[2*j:2*(j+1)])
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(outputs)
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
For inference it's as follows:
# Define sampling models (modified for n-layer deep network)
encoder_model = Model(encoder_inputs, encoder_states)
d_outputs = decoder_inputs
decoder_states_inputs = []
decoder_states = []
for j in range(len(latent_dims))[::-1]:
current_state_inputs = [Input(shape=(latent_dims[j],)) for _ in range(2)]
temp = output_layers[len(latent_dims)-j-1](d_outputs, initial_state=current_state_inputs)
d_outputs, cur_states = temp[0], temp[1:]
decoder_states += cur_states
decoder_states_inputs += current_state_inputs
decoder_outputs = decoder_dense(d_outputs)
decoder_model = Model(
[decoder_inputs] + decoder_states_inputs,
[decoder_outputs] + decoder_states)
And finally a few modifications to Jeremy Wortz's 'decode_sequence' function are implemented to get the following:
def decode_sequence(input_seq, encoder_model, decoder_model):
# Encode the input as state vectors.
states_value = encoder_model.predict(input_seq)
# Generate empty target sequence of length 1.
target_seq = np.zeros((1, 1, num_decoder_tokens))
# Populate the first character of target sequence with the start character.
target_seq[0, 0, target_token_index['\t']] = 1.
# Sampling loop for a batch of sequences
# (to simplify, here we assume a batch of size 1).
stop_condition = False
decoded_sentence = [] #Creating a list then using "".join() is usually much faster for string creation
while not stop_condition:
to_split = decoder_model.predict([target_seq] + states_value)
output_tokens, states_value = to_split[0], to_split[1:]
# Sample a token
sampled_token_index = np.argmax(output_tokens[0, 0])
sampled_char = reverse_target_char_index[sampled_token_index]
decoded_sentence.append(sampled_char)
# Exit condition: either hit max length
# or find stop character.
if sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length:
stop_condition = True
# Update the target sequence (of length 1).
target_seq = np.zeros((1, 1, num_decoder_tokens))
target_seq[0, 0, sampled_token_index] = 1.
return "".join(decoded_sentence)

Related

Building Keras Seq2Seq Inference Model

I am building a Seq2Seq Model with the Encoder-Decoder Architecture. The model is aimed to summarise input text. The training model has been built and the training seems fine. Below is the source code of the training model, which is similar to the Keras documentation.
from tensorflow import keras
#Hyperparameters
latent_dim = 256
batch_size = 32 # Batch size for training.
epochs = 10 # Number of epochs to train for.
# Define an input sequence and process it.
encoder_inputs = keras.Input(shape=(max_encoder_seq_length,))
enc_embedding = keras.layers.Embedding(input_dim=num_encoder_tokens, output_dim=128,)
enc_embedding_context = enc_embedding(encoder_inputs)
encoder = keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(enc_embedding_context)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = keras.Input(shape=(max_decoder_seq_length,))
dec_embedding = keras.layers.Embedding(input_dim=num_decoder_tokens, output_dim=128,)
dec_embedding_context = dec_embedding(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_embedding_context, initial_state=encoder_states)
decoder_dense = keras.layers.TimeDistributed(keras.layers.Dense(num_decoder_tokens, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
The model summary.
I am facing a problem when building the inference model. Initially, the last layer of the decoder model connected to the second output tensor of the last second layer (it should be connecting to the first output tensor). And each time the code below was executed, the index keep on increment.
encoder_inputs = model.input[0] # input_1
encoder_outputs, state_h_enc, state_c_enc = model.layers[4].output # lstm
encoder_states = [state_h_enc, state_c_enc]
encoder_model = keras.Model(encoder_inputs, encoder_states)
decoder_inputs = model.input[1] # input_2
decoder_embedding = model.layers[3].output
# Get the state from encoder
decoder_state_input_h = keras.Input(shape=(latent_dim,))
decoder_state_input_c = keras.Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model.layers[5] #lstm_1
decoder_outputs_lstm, state_h_dec, state_c_dec = decoder_lstm(
decoder_embedding, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = model.layers[6]
decoder_outputs = decoder_dense(decoder_outputs_lstm)
decoder_model = keras.Model(
[decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)
The decoder model summary was as below.
The encoder model seems fine.

why seq2seq model return negative loss if I used a pre-trained embedding model

I am following this example code to build a seq2seq model using keras. https://github.com/keras-team/keras/blob/master/examples/lstm_seq2seq.py
when I train that code it works normally fine and the results are good. But when I try to train it using a pre-trained embedding model, the loss and the crossentropy always get negative values.
I have tried to use only a dataset of 5 examples to make the model overfit over them, just to make sure it works correct, but the loss and the crossentropy still negative.
I use FastText embedding model, here is the code to load the dataset with the embedding vectors:
encoder_input_data = np.zeros(
(input_texts_len, max_encoder_seq_length,vector_length),
dtype='float32')
decoder_input_data = np.zeros(
(input_texts_len, max_decoder_seq_length,vector_length),
dtype='float32')
decoder_target_data = np.zeros(
(input_texts_len, max_decoder_seq_length,vector_length),
dtype='float32')
padding = np.zeros((vector_length),dtype='float32')
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
for t, word in enumerate(input_text):
encoder_input_data[i, t] = w2v.get_vector(word)
encoder_input_data[i, t + 1:] = padding
for t, word in enumerate(target_text):
decoder_input_data[i, t] = w2v.get_vector(word)
if t > 0:
decoder_target_data[i, t - 1] = w2v.get_vector(word)
decoder_input_data[i, t + 1:] = padding
decoder_target_data[i, t] = padding
Here is the model code itself:
encoder_inputs = Input(shape=(max_encoder_seq_length,vec_leng,))
x = Masking(mask_value=0.0)(encoder_inputs)
encoder = LSTM(latent_dim,name='lstm_1')
encoder_outputs, state_h, state_c = encoder(x)
encoder_states = [state_h, state_c]
decoder_inputs = Input(shape=(max_decoder_seq_length,vec_leng,))
a = Masking(mask_value=0.0) (decoder_inputs)
decoder_lstm = LSTM(latent_dim,name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(a, initial_state=encoder_states)
# Attention layer
attn_layer = AttentionLayer(name='attention_layer')
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])
decoder_concat_input = Concatenate(axis=-1)([decoder_outputs, attn_out])
decoder_dense = Dense(vec_leng, activation='softmax')
dense_time = TimeDistributed(decoder_dense, name='time_distributed_layer')
decoder_pred = dense_time(decoder_concat_input)
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_pred, name='main_model')
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs, state_h, encoder_states], name='encoder_model')
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
encoder_states_ = Input(batch_shape=(1,max_encoder_seq_length, latent_dim))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
a = Input(shape=(max_decoder_seq_length,vec_leng,))
decoder_outputs, state_h, state_c = decoder_lstm(a, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
attn_inf_out, attn_inf_states = attn_layer([encoder_states_, decoder_outputs])
decoder_inf_concat = Concatenate(axis=-1)([decoder_outputs, attn_inf_out])
decoder_inf_pred = TimeDistributed(decoder_dense)(decoder_inf_concat)
decoder_model = Model(
[encoder_states_, decoder_states_inputs, a],
[decoder_inf_pred, attn_inf_states, decoder_states], name='decoder_model')
and here is the training prints:
what is the reason that I got these negative values? and how to solve them?
You get negative loss values because your target vectors elements are not correct, your one_hot target vector elements must be 1 or 0 integers.

Why doesn't connect to the expected layer in Keras model

I want to construct a variational autoencoder in Keras (2.2.4, with TensorFlow backend), here is my code:
dims = [1000, 256, 64, 32]
x_inputs = Input(shape=(dims[0],), name='inputs')
h = x_inputs
# internal layers in encoder
for i in range(n_stacks-1):
h = Dense(dims[i + 1], activation='relu', kernel_initializer='glorot_uniform', name='encoder_%d' % i)(h)
# hidden layer
z_mean = Dense(dims[-1], kernel_initializer='glorot_uniform', name='z_mean')(h)
z_log_var = Dense(dims[-1], kernel_initializer='glorot_uniform', name='z_log_var')(h)
z = Lambda(sampling, output_shape=(dims[-1],), name='z')([z_mean, z_log_var])
encoder = Model(inputs=x_inputs, outputs=z, name='encoder')
encoder_z_mean = Model(inputs=x_inputs, outputs=z_mean, name='encoder_z_mean')
# internal layers in decoder
latent_inputs = Input(shape=(dims[-1],), name='latent_inputs')
h = latent_inputs
for i in range(n_stacks-1, 0, -1):
h = Dense(dims[i], activation='relu', kernel_initializer='glorot_uniform', name='decoder_%d' % i)(h)
# output
outputs = Dense(dims[0], activation='relu', kernel_initializer='glorot_uniform' name='mean')
decoder = Model(inputs=latent_inputs, outputs=outputs, name='decoder')
ae_output = decoder(encoder_z_mean(x_inputs))
ae = Model(inputs=x_inputs, outputs=ae_output, name='ae')
ae.summary()
vae_output = decoder(encoder(x_inputs))
vae = Model(inputs=x_inputs, outputs=vae_output, name='vae')
vae.summary()
The problem is I can print the summary of the "ae" and "vae" models, but when I train the ae model, it says
tensorflow.python.framework.errors_impl.InvalidArgumentError: You must feed a value for placeholder tensor 'latent_inputs' with dtype float and shape [?,32]
In the model "decoder" is supposed to connect to the output of "encoder_z_mean" layer in the ae model. But when I print the summary of the "ae" model, "decoder" is actually connected to "encoder_z_mean[1][0]". Should it be "encoder_z_mean[0][0]"?
A few corrections:
x_inputs is already the input of the encoders, don't call it again with encoder_z_mean(x_inputs) or with encoder(x_inputs)
Besides creating a second node (the 1 that you are worried with, and that is not a problem), it may be the source of the error because it's not an extra input, but the same input
A healthy usage of this would need the creation of a new Input(...) tensor to be called
The last Dense layer is not being called on a tensor. You probably want (h) there.
Do it this way:
# output - called h in the last layer
outputs = Dense(dims[0], activation='relu', kernel_initializer='glorot_uniform' name='mean')(h)
#unchanged
decoder = Model(inputs=latent_inputs, outputs=outputs, name='decoder')
#adjusted inputs
ae_output = decoder(encoder_z_mean.output)
ae = Model(encoder_z_mean.input, ae_output, name='ae')
ae.summary()
vae_output = decoder(encoder.output)
vae = Model(encoder.input, vae_output, name='vae')
vae.summary()
It's possible that the [1][0] still occurs with the decoder, but this is not a problem at all. It means that the decoder itself has its own input node (number 0), and you created an extra input node (number 1) when you called it with the output of another model. This is harmless. The node 1 will be used while node 0 will be ignored.

How to use tf.contrib.seq2seq.Helper for non-embedding data?

I'm trying to use tf.contrib.seq2seq module to do forecasting on some data (just float32 vectors) but all the examples I found using the seq2seq module from TensorFlow are used for translation and therefore embeddings.
I'm struggling to understand exactly what tf.contrib.seq2seq.Helper is doing for the Seq2Seq architecture and how I can use the CustomHelper in my case.
This is what I've done for now:
import tensorflow as tf
from tensorflow.python.layers import core as layers_core
input_seq_len = 15 # Sequence length as input
input_dim = 1 # Nb of features in input
output_seq_len = forecast_len = 20 # horizon length for forecasting
output_dim = 1 # nb of features to forecast
encoder_units = 200 # nb of units in each cell for the encoder
decoder_units = 200 # nb of units in each cell for the decoder
attention_units = 100
batch_size = 8
graph = tf.Graph()
with graph.as_default():
learning_ = tf.placeholder(tf.float32)
with tf.variable_scope('Seq2Seq'):
# Placeholder for encoder input
enc_input = tf.placeholder(tf.float32, [None, input_seq_len, input_dim])
# Placeholder for decoder output - Targets
target = tf.placeholder(tf.float32, [None, output_seq_len, output_dim])
### BUILD THE ENCODER
# Build RNN cell
encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_units)
initial_state = encoder_cell.zero_state(batch_size, dtype=tf.float32)
# Run Dynamic RNN
# encoder_outputs: [batch_size, seq_size, num_units]
# encoder_state: [batch_size, num_units]
encoder_outputs, encoder_state = tf.nn.dynamic_rnn(encoder_cell, enc_input, initial_state=initial_state)
## Attention layer
attention_mechanism_bahdanau = tf.contrib.seq2seq.BahdanauAttention(
num_units = attention_units, # depth of query mechanism
memory = encoder_outputs, # hidden states to attend (output of RNN)
normalize=False, # normalize energy term
name='BahdanauAttention')
attention_mechanism_luong = tf.contrib.seq2seq.LuongAttention(
num_units = encoder_units,
memory = encoder_outputs,
scale=False,
name='LuongAttention'
)
### BUILD THE DECODER
# Simple Dense layer to project from rnn_dim to the desired output_dim
projection = layers_core.Dense(output_dim, use_bias=True, name="output_projection")
helper = tf.contrib.seq2seq.TrainingHelper(target, sequence_length=[output_seq_len for _ in range(batch_size)])
## This is where I don't really know what to do in my case, is this function changing my data into [ GO, data, END] ?
decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(decoder_units)
attention_cell = tf.contrib.seq2seq.AttentionWrapper(
cell = decoder_cell,
attention_mechanism = attention_mechanism_luong, # Instance of AttentionMechanism
attention_layer_size = attention_units,
name="attention_wrapper")
initial_state = attention_cell.zero_state(batch_size=batch_size, dtype=tf.float32)
initial_state = initial_state.clone(cell_state=encoder_state)
decoder = tf.contrib.seq2seq.BasicDecoder(attention_cell, initial_state=initial_state, helper=helper, output_layer=projection)
outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=decoder)
# Loss function:
loss = 0.5*tf.reduce_sum(tf.square(outputs[0] - target), -1)
loss = tf.reduce_mean(loss, 1)
loss = tf.reduce_mean(loss)
# Optimizer
optimizer = tf.train.AdamOptimizer(learning_).minimize(loss)
I understood that Training state and Inference state are quite different for the Seq2seq architecture but I don't know how to use the Helpers from the module in order to distinguish both.
I'm using this module because it's quite useful for Attention Layers.
How can I use the Helper in order to create a ['Go' , [input_sequence]] for the decoder ?

How to use AttentionMechanism with MultiRNNCell and dynamic_decode?

I want to create a multi-layered dynamic RNN-based decoder that uses an attention mechanism. To do this, I first create an attention mechanism:
attention_mechanism = BahdanauAttention(num_units=ATTENTION_UNITS,
memory=encoder_outputs,
normalize=True)
Then I use the AttentionWrapper to wrap a LSTM cell with the attention mechanism:
attention_wrapper = AttentionWrapper(cell=self._create_lstm_cell(DECODER_SIZE),
attention_mechanism=attention_mechanism,
output_attention=False,
alignment_history=True,
attention_layer_size=ATTENTION_LAYER_SIZE)
where self._create_lstm_cell is defined as follows:
#staticmethod
def _create_lstm_cell(cell_size):
return BasicLSTMCell(cell_size)
I then do some bookkeeping (e.g. creating my MultiRNNCell, creating an initial state, creating a TrainingHelper, etc.)
attention_zero = attention_wrapper.zero_state(batch_size=tf.flags.FLAGS.batch_size, dtype=tf.float32)
# define initial state
initial_state = attention_zero.clone(cell_state=encoder_final_states[0])
training_helper = TrainingHelper(inputs=self.y, # feed in ground truth
sequence_length=self.y_lengths) # feed in sequence lengths
layered_cell = MultiRNNCell(
[attention_wrapper] + [ResidualWrapper(self._create_lstm_cell(cell_size=DECODER_SIZE))
for _ in range(NUMBER_OF_DECODER_LAYERS - 1)])
decoder = BasicDecoder(cell=layered_cell,
helper=training_helper,
initial_state=initial_state)
decoder_outputs, decoder_final_state, decoder_final_sequence_lengths = dynamic_decode(decoder=decoder,
maximum_iterations=tf.flags.FLAGS.max_number_of_scans // 12,
impute_finished=True)
But I receive the following error: AttributeError: 'LSTMStateTuple' object has no attribute 'attention'.
What is the correct way to add an attention mechanism to a MultiRNNCell dynamic decoder?
Have you tried using the attention wrapper provided by tf.contrib?
Here is an example using both an attention wrapper and dropout:
cells = []
for i in range(n_layers):
cell = tf.contrib.rnn.LSTMCell(n_hidden, state_is_tuple=True)
cell = tf.contrib.rnn.AttentionCellWrapper(
cell, attn_length=40, state_is_tuple=True)
cell = tf.contrib.rnn.DropoutWrapper(cell,output_keep_prob=0.5)
cells.append(cell)
cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
init_state = cell.zero_state(batch_size, tf.float32)
What you need to do is that you create the MultiLayer cell then you wrap it with an AttentionWrapper , below is an example :
def decoding_layer(dec_input, encoder_state,
target_sequence_length, max_target_sequence_length,
rnn_size,
num_layers, target_vocab_to_int, target_vocab_size,
batch_size, keep_prob, decoding_embedding_size , encoder_outputs):
"""
Create decoding layer
:param dec_input: Decoder input
:param encoder_state: Encoder state
:param target_sequence_length: The lengths of each sequence in the target batch
:param max_target_sequence_length: Maximum length of target sequences
:param rnn_size: RNN Size
:param num_layers: Number of layers
:param target_vocab_to_int: Dictionary to go from the target words to an id
:param target_vocab_size: Size of target vocabulary
:param batch_size: The size of the batch
:param keep_prob: Dropout keep probability
:param decoding_embedding_size: Decoding embedding size
:return: Tuple of (Training BasicDecoderOutput, Inference BasicDecoderOutput)
"""
# 1. Decoder Embedding
dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)
# 2. Construct the decoder cell
def create_cell(rnn_size):
lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size,
initializer=tf.random_uniform_initializer(-0.1,0.1,seed=2))
drop = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=keep_prob)
return drop
dec_cell = tf.contrib.rnn.MultiRNNCell([create_cell(rnn_size) for _ in range(num_layers)])
#dec_cell = tf.contrib.rnn.MultiRNNCell(cells_a)
#attention details
attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units=rnn_size, memory=encoder_outputs)
attn_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, attention_mechanism , attention_layer_size=rnn_size/2)
attn_zero = attn_cell.zero_state(batch_size , tf.float32 )
attn_zero = attn_zero.clone(cell_state = encoder_state)
#new_state = tf.contrib.seq2seq.AttentionWrapperState(cell_state = encoder_state, attention = attn_zero , time = 0 ,alignments=None , alignment_history=())
"""out_cell = tf.contrib.rnn.OutputProjectionWrapper(
attn_cell, target_vocab_size, reuse=True
)"""
#end of attention
#tensor_util.make_tensor_proto(attn_cell)
output_layer = Dense(target_vocab_size,
kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))
with tf.variable_scope("decode"):
train_decoder_out = decoding_layer_train(attn_zero, attn_cell, dec_embed_input,
target_sequence_length, max_target_sequence_length, output_layer, keep_prob)
with tf.variable_scope("decode", reuse=True):
infer_decoder_out = decoding_layer_infer(attn_zero, attn_cell, dec_embeddings,
target_vocab_to_int['<GO>'], target_vocab_to_int['<EOS>'], max_target_sequence_length,
target_vocab_size, output_layer, batch_size, keep_prob)
return (train_decoder_out, infer_decoder_out)