How to connect multi-layered Bi-directional LSTM encoder to a decoder? - tensorflow

I'm making a seq2seq model that uses a Bi-LSTM as the encoder and an attention mechanism in the decoder. With a single LSTM layer the model works fine. My encoder looks something like this.
Encoder:
def encoding_layer(self, rnn_inputs, rnn_size, num_layers, keep_prob,
                   source_vocab_size,
                   encoding_embedding_size,
                   source_sequence_length,
                   emb_matrix):

    embed = tf.nn.embedding_lookup(emb_matrix, rnn_inputs)
    stacked_cells = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(rnn_size), keep_prob)

    outputs, state = tf.nn.bidirectional_dynamic_rnn(cell_fw=stacked_cells,
                                                     cell_bw=stacked_cells,
                                                     inputs=embed,
                                                     sequence_length=source_sequence_length,
                                                     dtype=tf.float32)

    concat_outputs = tf.concat(outputs, 2)

    cell_state_fw, cell_state_bw = state
    cell_state_final = tf.concat([cell_state_fw.c, cell_state_bw.c], 1)
    hidden_state_final = tf.concat([cell_state_fw.h, cell_state_bw.h], 1)
    encoder_final_state = tf.nn.rnn_cell.LSTMStateTuple(c=cell_state_final, h=hidden_state_final)

    return concat_outputs, encoder_final_state
Decoder :
def decoding_layer_train(self, encoder_outputs, encoder_state, dec_cell, dec_embed_input,
                         target_sequence_length, max_summary_length,
                         output_layer, keep_prob, rnn_size, batch_size):

    rnn_size = 2 * rnn_size
    dec_cell = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(rnn_size), keep_prob)

    train_helper = tf.contrib.seq2seq.TrainingHelper(dec_embed_input, target_sequence_length)

    attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(rnn_size, encoder_outputs,
                                                               memory_sequence_length=target_sequence_length)

    attention_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, attention_mechanism,
                                                         attention_layer_size=rnn_size/2)

    state = attention_cell.zero_state(dtype=tf.float32, batch_size=batch_size)
    state = state.clone(cell_state=encoder_state)

    decoder = tf.contrib.seq2seq.BasicDecoder(cell=attention_cell, helper=train_helper,
                                              initial_state=state,
                                              output_layer=output_layer)
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, impute_finished=True,
                                                      maximum_iterations=max_summary_length)
    return outputs
With the above single-layer Bi-LSTM configuration my model works fine. Now I want to use a multi-layered Bi-LSTM encoder and decoder, so in both the encoder and the decoder I change the cell to:
stacked_cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(rnn_size), keep_prob) for _ in range(num_layers)])
After changing the cell I get this error:
AttributeError: 'tuple' object has no attribute 'c'
here,
num_layers = 2
rnn_size = 128
embedding_size = 50
So I want to know what exactly is returned as state in the second case, and how to pass that state to the decoder.
Full code: https://github.com/sainimohit23/Text-Summarization
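For reference, here is a minimal sketch (an illustration, not the project's code) of what bidirectional_dynamic_rnn returns as state when each direction is a MultiRNNCell, and one way to merge it into a per-layer tuple of LSTMStateTuples that a multi-layer decoder cell of size 2 * rnn_size can accept:

# Sketch: with MultiRNNCell, `state` is a pair (fw_state, bw_state), and each
# element is itself a tuple of LSTMStateTuples, one per layer. That is why
# accessing `.c` directly fails with "'tuple' object has no attribute 'c'".
fw_state, bw_state = state

encoder_final_state = tuple(
    tf.nn.rnn_cell.LSTMStateTuple(
        c=tf.concat([fw_state[layer].c, bw_state[layer].c], 1),
        h=tf.concat([fw_state[layer].h, bw_state[layer].h], 1))
    for layer in range(num_layers))

# encoder_final_state now has one LSTMStateTuple of width 2 * rnn_size per layer,
# matching a decoder built as
# MultiRNNCell([LSTMCell(2 * rnn_size) for _ in range(num_layers)]),
# and can be passed on via AttentionWrapper's zero_state(...).clone(cell_state=...).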

Related

Building Keras Seq2Seq Inference Model

I am building a seq2seq model with an encoder-decoder architecture. The model is meant to summarise input text. The training model has been built and training seems fine. Below is the source code of the training model, which is similar to the Keras documentation example.
from tensorflow import keras
#Hyperparameters
latent_dim = 256
batch_size = 32 # Batch size for training.
epochs = 10 # Number of epochs to train for.
# Define an input sequence and process it.
encoder_inputs = keras.Input(shape=(max_encoder_seq_length,))
enc_embedding = keras.layers.Embedding(input_dim=num_encoder_tokens, output_dim=128,)
enc_embedding_context = enc_embedding(encoder_inputs)
encoder = keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(enc_embedding_context)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = keras.Input(shape=(max_decoder_seq_length,))
dec_embedding = keras.layers.Embedding(input_dim=num_decoder_tokens, output_dim=128,)
dec_embedding_context = dec_embedding(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_embedding_context, initial_state=encoder_states)
decoder_dense = keras.layers.TimeDistributed(keras.layers.Dense(num_decoder_tokens, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
The model summary.
I am facing a problem when building the inference model. Initially, the last layer of the decoder model was connected to the second output tensor of the second-to-last layer (it should connect to the first output tensor), and each time the code below was executed, the layer index kept incrementing.
encoder_inputs = model.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = model.layers[4].output  # lstm
encoder_states = [state_h_enc, state_c_enc]
encoder_model = keras.Model(encoder_inputs, encoder_states)

decoder_inputs = model.input[1]  # input_2
decoder_embedding = model.layers[3].output

# Get the state from encoder
decoder_state_input_h = keras.Input(shape=(latent_dim,))
decoder_state_input_c = keras.Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_lstm = model.layers[5]  # lstm_1
decoder_outputs_lstm, state_h_dec, state_c_dec = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]

decoder_dense = model.layers[6]
decoder_outputs = decoder_dense(decoder_outputs_lstm)

decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)
The decoder model summary was as below.
The encoder model seems fine.
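As an aside (not part of the original post), the drifting index happens because every re-execution in the same session rebuilds layers and shifts their positions and name suffixes. A more robust sketch is to fetch layers by name; the names used here ('lstm', 'lstm_1', 'time_distributed') are assumptions, so check model.summary() for the real ones:

# Sketch: look up layers by name instead of by positional index, so re-running
# the cell never picks up the wrong tensor.
encoder_lstm = model.get_layer('lstm')
encoder_outputs, state_h_enc, state_c_enc = encoder_lstm.output

decoder_lstm = model.get_layer('lstm_1')
decoder_dense = model.get_layer('time_distributed')

Calling keras.backend.clear_session() before rebuilding the training model also resets the layer name counters, which keeps the names stable across runs.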

" ValueError: Expecting KerasTensor which is from tf.keras.Input()". Error in prediction with dropout function

I am trying to estimate uncertainty in a regression problem by using dropout during testing, as per Yarin Gal's article. I created a class using Keras' backend function, as provided by a Stack Overflow answer. The class takes an NN model as input and randomly drops neurons during testing to give a stochastic estimate, rather than a deterministic output, for time-series forecasting.
I create a simple encoder-decoder model as shown below for the forecasting with 0.1 dropout during training:
input_sequence = Input(shape=(lookback, train_x.shape[2]))
encoder = LSTM(128, return_sequences=False)(input_sequence)
r_vec = RepeatVector(forward_pred)(encoder)
decoder = LSTM(128, return_sequences=True, dropout=0.1)(r_vec)  # maybe use dropout=0.1
output = TimeDistributed(Dense(train_y.shape[2], activation='linear'))(decoder)

# optimiser = optimizers.Adam(clipnorm=1)
enc_dec_model = Model(input_sequence, output)
enc_dec_model.compile(loss="mean_squared_error",
                      optimizer="adam",
                      metrics=['mean_squared_error'])
enc_dec_model.summary()
After that, I define and call the DropoutPrediction class.
# Define the class:
class KerasDropoutPrediction(object):
    def __init__(self, model):
        self.f = K.function(
            [model.layers[0].input,
             K.learning_phase()],
            [model.layers[-1].output])

    def predict(self, x, n_iter=10):
        result = []
        for _ in range(n_iter):
            result.append(self.f([x, 1]))
        result = np.array(result).reshape(n_iter, x.shape[0], x.shape[1]).T
        return result
# Call the object:
kdp = KerasDropoutPrediction(enc_dec_model)
y_pred_do = kdp.predict(x_test,n_iter=100)
y_pred_do_mean = y_pred_do.mean(axis=1)
However, in the line
kdp = KerasDropoutPrediction(enc_dec_model), when I call the LSTM model,
I get the following error message, which says the input has to be a Keras tensor. Can anyone help me with this error?
Error Message:
ValueError: Found unexpected instance while processing input tensors for keras functional model. Expecting KerasTensor which is from tf.keras.Input() or output from keras layer call(). Got: 0
To activate dropout at inference time, you simply have to specify training=True (TF > 2.0) in the layer of interest (the last LSTM layer in your case).
with training=False
inp = Input(shape=(10, 1))
x = LSTM(1, dropout=0.3)(inp, training=False)
m = Model(inp, x)
# m.compile(...)
# m.fit(...)

X = np.random.uniform(0, 1, (1, 10, 1))
output = []
for i in range(0, 100):
    output.append(m.predict(X))  # always the same
with training=True
inp = Input(shape=(10, 1))
x = LSTM(1, dropout=0.3)(inp, training=True)
m = Model(inp, x)
# m.compile(...)
# m.fit(...)

X = np.random.uniform(0, 1, (1, 10, 1))
output = []
for i in range(0, 100):
    output.append(m.predict(X))  # always different
In your example, this becomes:
input_sequence = Input(shape=(lookback, train_x.shape[2]))
encoder = LSTM(128, return_sequences=False)(input_sequence)
r_vec = RepeatVector(forward_pred)(encoder)
decoder = LSTM(128, return_sequences=True, dropout=0.1)(r_vec, training=True)
output = TimeDistributed(Dense(train_y.shape[2], activation='linear'))(decoder)

enc_dec_model = Model(input_sequence, output)
enc_dec_model.compile(
    loss="mean_squared_error",
    optimizer="adam",
    metrics=['mean_squared_error']
)
enc_dec_model.fit(train_x, train_y, epochs=10, batch_size=32)
and the KerasDropoutPrediction:
class KerasDropoutPrediction(object):
    def __init__(self, model):
        self.model = model

    def predict(self, X, n_iter=10):
        result = []
        for _ in range(n_iter):
            result.append(self.model.predict(X))
        result = np.array(result)
        return result

kdp = KerasDropoutPrediction(enc_dec_model)
y_pred_do = kdp.predict(test_x, n_iter=100)
y_pred_do_mean = y_pred_do.mean(axis=0)
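As a small follow-up (not in the original answer), the spread across the n_iter stochastic passes is the uncertainty estimate itself: the mean gives the point forecast and the standard deviation an approximate confidence band.

# Axis 0 of `y_pred_do` indexes the n_iter Monte Carlo dropout passes.
y_pred_do_mean = y_pred_do.mean(axis=0)  # point forecast
y_pred_do_std = y_pred_do.std(axis=0)    # predictive uncertainty

# A rough 95% band around the forecast:
lower = y_pred_do_mean - 2 * y_pred_do_std
upper = y_pred_do_mean + 2 * y_pred_do_std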

Decoder does not accept Output of Bidirectional Encoder

I'm trying to implement an encoder-decoder model with TensorFlow. The encoder is a bidirectional cell.
def encoder(hidden_units, encoder_embedding, sequence_length):
    forward_cell = tf.contrib.rnn.LSTMCell(hidden_units)
    backward_cell = tf.contrib.rnn.LSTMCell(hidden_units)
    bi_outputs, final_states = tf.nn.bidirectional_dynamic_rnn(forward_cell, backward_cell,
                                                               encoder_embedding,
                                                               sequence_length=sequence_length,
                                                               dtype=tf.float32)
    encoder_outputs = tf.concat(bi_outputs, 2)

    forward_cell_state, backward_cell_state = final_states
    cell_state_final = tf.concat([forward_cell_state.c, backward_cell_state.c], 1)
    hidden_state_final = tf.concat([forward_cell_state.h, backward_cell_state.h], 1)
    encoder_final_state = tf.nn.rnn_cell.LSTMStateTuple(c=cell_state_final, h=hidden_state_final)

    return encoder_outputs, encoder_final_state
Something goes wrong between the encoder and the decoder. I get an error like ValueError: Shapes (?, 42) and (12, 21) are not compatible ...
The decoder has an attention mechanism and looks like this:
def decoder(decoder_embedding, vocab_size, hidden_units, sequence_length,
            encoder_output, encoder_state, batchsize):
    projection_layer = Dense(vocab_size)
    helper = tf.contrib.seq2seq.TrainingHelper(decoder_embedding, sequence_length=sequence_length)

    # Decoder
    decoder_cell = tf.contrib.rnn.LSTMCell(hidden_units)

    # Attention mechanism
    attention_mechanism = tf.contrib.seq2seq.LuongAttention(hidden_units, encoder_output)
    attn_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                    attention_layer_size=hidden_units)

    # Initial attention
    attn_zero = attn_cell.zero_state(batch_size=batchsize, dtype=tf.float32)
    ini_state = attn_zero.clone(cell_state=encoder_state)

    decoder = tf.contrib.seq2seq.BasicDecoder(cell=attn_cell, initial_state=ini_state,
                                              helper=helper, output_layer=projection_layer)
    decoder_outputs, _final_state, _final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(decoder)

    return decoder_outputs
How can this be fixed?
The problem is that the effective hidden size of the encoder (the concatenated forward and backward states) is twice as large as that of the decoder, and you are trying to use Luong-style attention. It computes the attention energies (state similarities) as dot products between the decoder state and all encoder states, and a dot product requires both vectors to have the same dimensionality.
You have several options:
Use Bahdanau-style attention, which has an additional non-linear layer that projects both states into a shared encoder-decoder space.
Change the dimensionality of your encoder or decoder so that the hidden states match, i.e. make the decoder's hidden_units twice the encoder's per-direction hidden_units (or halve the encoder's).
Add a linear dense layer after the encoder output that projects the concatenated encoder outputs down to the decoder's hidden dimension (a sketch follows below).
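A minimal sketch of the third option, assuming the TF 1.x contrib API and the variable names from the encoder/decoder functions above (this is an illustration, not the poster's code):

# Sketch: project the bidirectional encoder outputs and final state
# (both of width 2 * hidden_units) down to the decoder's hidden_units
# before building the Luong attention and cloning the initial state.
encoder_outputs_proj = tf.layers.dense(encoder_outputs, hidden_units, use_bias=False)

encoder_state_proj = tf.nn.rnn_cell.LSTMStateTuple(
    c=tf.layers.dense(encoder_final_state.c, hidden_units, use_bias=False),
    h=tf.layers.dense(encoder_final_state.h, hidden_units, use_bias=False))

# Then pass encoder_outputs_proj as the attention memory and use
# encoder_state_proj in attn_zero.clone(cell_state=...).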

character level bidirectional language model in tensorflow

Inspired by Andrej Karpathy's char-rnn, there is a TensorFlow implementation, sherjilozair/char-rnn-tensorflow: Multi-layer Recurrent Neural Networks (LSTM, RNN) for character-level language models in Python using TensorFlow. I want to implement a bidirectional character-level language model starting from this code. I changed model.py and wrote this simple code:
class Model:
    def __init__(self, input_data, targets, seq_length=Config.max_seq_length, training=True):
        if Config.model == 'rnn':
            cell_fn = rnn.BasicRNNCell
        elif Config.model == 'gru':
            cell_fn = rnn.GRUCell
        elif Config.model == 'lstm':
            cell_fn = rnn.BasicLSTMCell
        elif Config.model == 'nas':
            cell_fn = rnn.NASCell
        else:
            raise Exception("model type not supported: {}".format(Config.model))

        fw_cells = []
        bw_cells = []
        for _ in range(Config.num_layers):
            fw_cell = cell_fn(Config.rnn_size)
            bw_cell = cell_fn(Config.rnn_size)
            fw_cells.append(fw_cell)
            bw_cells.append(bw_cell)
        self.fw_cell = rnn.MultiRNNCell(fw_cells, state_is_tuple=True)
        self.bw_cell = rnn.MultiRNNCell(bw_cells, state_is_tuple=True)

        self.input_data, self.targets = input_data, targets

        with tf.variable_scope('rnnlm'):
            softmax_w = tf.get_variable("softmax_w", [Config.rnn_size * 2, Config.vocab_size])
            softmax_b = tf.get_variable("softmax_b", [Config.vocab_size])
            embedding = tf.get_variable("embedding", [Config.vocab_size, Config.rnn_size])
            inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        inputs = tf.unstack(inputs, num=seq_length, axis=1)
        outputs, _, _ = tf.nn.static_bidirectional_rnn(self.fw_cell, self.bw_cell, inputs,
                                                       dtype=tf.float32, scope='rnnlm')
        output = tf.reshape(tf.concat(outputs, 1), [-1, Config.rnn_size * 2])

        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)
        self.lr = tf.Variable(0.0, trainable=False)

        if training:
            loss = legacy_seq2seq.sequence_loss_by_example(
                [self.logits],
                [tf.reshape(self.targets, [-1])],
                [tf.sign(tf.cast(tf.reshape(self.targets, [-1]), dtype=tf.float32))])
            with tf.name_scope('cost'):
                self.cost = tf.reduce_mean(loss)
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), Config.grad_clip)
            with tf.name_scope('optimizer'):
                optimizer = tf.train.AdamOptimizer(self.lr)
            self.train_op = optimizer.apply_gradients(zip(grads, tvars))
In the training phase, I see very fast convergence: after about 3000 iterations, the loss reaches 0.003. In the test phase, the probability of every character is 1.0. I think there is a mistake.
I would be glad to get some help finding my mistake.
A bidirectional language model should use the preceding (forward) output and the following (backward) output to predict the probability of the current character. In your case, you use the bidirectional RNN output at the current position to predict the current character, and the backward pass has already seen that character, so the network can simply copy it. That is why the loss collapses and the test probabilities go to 1.0 (see the sketch below).
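A minimal sketch of this fix (an illustration, not the original answer's code): keep the forward and backward halves of the outputs separate and shift them so that position t is predicted from the forward output at t-1 and the backward output at t+1, i.e. the network never sees the character it is predicting:

# `outputs` from static_bidirectional_rnn is a list (length seq_length) of
# tensors of shape [batch, 2 * rnn_size] with forward and backward halves
# concatenated. Split them and shift by one step in each direction.
fw_outputs = [o[:, :Config.rnn_size] for o in outputs]
bw_outputs = [o[:, Config.rnn_size:] for o in outputs]

zero = tf.zeros_like(fw_outputs[0])

# Position t gets the forward output from t-1 and the backward output from t+1.
shifted = [tf.concat([fw_prev, bw_next], 1)
           for fw_prev, bw_next in zip([zero] + fw_outputs[:-1],
                                       bw_outputs[1:] + [zero])]

output = tf.reshape(tf.concat(shifted, 1), [-1, Config.rnn_size * 2])
# ...then feed `output` into the softmax projection as before.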
It also looks like you set self.lr = tf.Variable(0.0, trainable=False). Try changing this to a nonzero value. If you are reading probabilities from self.probs during the testing phase, they should be normalized appropriately.

How to use AttentionMechanism with MultiRNNCell and dynamic_decode?

I want to create a multi-layered dynamic RNN-based decoder that uses an attention mechanism. To do this, I first create an attention mechanism:
attention_mechanism = BahdanauAttention(num_units=ATTENTION_UNITS,
                                        memory=encoder_outputs,
                                        normalize=True)
Then I use the AttentionWrapper to wrap an LSTM cell with the attention mechanism:
attention_wrapper = AttentionWrapper(cell=self._create_lstm_cell(DECODER_SIZE),
                                     attention_mechanism=attention_mechanism,
                                     output_attention=False,
                                     alignment_history=True,
                                     attention_layer_size=ATTENTION_LAYER_SIZE)
where self._create_lstm_cell is defined as follows:
@staticmethod
def _create_lstm_cell(cell_size):
    return BasicLSTMCell(cell_size)
I then do some bookkeeping (e.g. creating my MultiRNNCell, creating an initial state, creating a TrainingHelper, etc.):
attention_zero = attention_wrapper.zero_state(batch_size=tf.flags.FLAGS.batch_size, dtype=tf.float32)

# define initial state
initial_state = attention_zero.clone(cell_state=encoder_final_states[0])

training_helper = TrainingHelper(inputs=self.y,                  # feed in ground truth
                                 sequence_length=self.y_lengths)  # feed in sequence lengths

layered_cell = MultiRNNCell(
    [attention_wrapper] + [ResidualWrapper(self._create_lstm_cell(cell_size=DECODER_SIZE))
                           for _ in range(NUMBER_OF_DECODER_LAYERS - 1)])

decoder = BasicDecoder(cell=layered_cell,
                       helper=training_helper,
                       initial_state=initial_state)

decoder_outputs, decoder_final_state, decoder_final_sequence_lengths = dynamic_decode(
    decoder=decoder,
    maximum_iterations=tf.flags.FLAGS.max_number_of_scans // 12,
    impute_finished=True)
But I receive the following error: AttributeError: 'LSTMStateTuple' object has no attribute 'attention'.
What is the correct way to add an attention mechanism to a MultiRNNCell dynamic decoder?
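One likely cause, with a minimal sketch of a fix (an assumption based on the error, using the TF 1.x contrib API from the question, not tested against the full model): when the AttentionWrapper is the first layer inside a MultiRNNCell, the decoder's initial state has to be a per-layer tuple whose first element is an AttentionWrapperState. Passing a single cloned attention state makes the MultiRNNCell unpack it field by field, so the AttentionWrapper ends up receiving a bare LSTMStateTuple and fails when it looks for .attention.

# Sketch: build a per-layer initial state for the MultiRNNCell. Layer 0 is the
# AttentionWrapper, so its entry is an AttentionWrapperState cloned from the
# encoder state; the remaining layers keep their zero states.
multi_zero = layered_cell.zero_state(batch_size=tf.flags.FLAGS.batch_size, dtype=tf.float32)
initial_state = (multi_zero[0].clone(cell_state=encoder_final_states[0]),) + tuple(multi_zero[1:])

decoder = BasicDecoder(cell=layered_cell,
                       helper=training_helper,
                       initial_state=initial_state)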
Have you tried using the attention wrapper provided by tf.contrib?
Here is an example using both an attention wrapper and dropout:
cells = []
for i in range(n_layers):
    cell = tf.contrib.rnn.LSTMCell(n_hidden, state_is_tuple=True)
    cell = tf.contrib.rnn.AttentionCellWrapper(
        cell, attn_length=40, state_is_tuple=True)
    cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=0.5)
    cells.append(cell)

cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
init_state = cell.zero_state(batch_size, tf.float32)
What you need to do is create the multi-layer cell first and then wrap it with an AttentionWrapper; below is an example:
def decoding_layer(dec_input, encoder_state,
                   target_sequence_length, max_target_sequence_length,
                   rnn_size,
                   num_layers, target_vocab_to_int, target_vocab_size,
                   batch_size, keep_prob, decoding_embedding_size, encoder_outputs):
    """
    Create decoding layer
    :param dec_input: Decoder input
    :param encoder_state: Encoder state
    :param target_sequence_length: The lengths of each sequence in the target batch
    :param max_target_sequence_length: Maximum length of target sequences
    :param rnn_size: RNN Size
    :param num_layers: Number of layers
    :param target_vocab_to_int: Dictionary to go from the target words to an id
    :param target_vocab_size: Size of target vocabulary
    :param batch_size: The size of the batch
    :param keep_prob: Dropout keep probability
    :param decoding_embedding_size: Decoding embedding size
    :return: Tuple of (Training BasicDecoderOutput, Inference BasicDecoderOutput)
    """
    # 1. Decoder Embedding
    dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)

    # 2. Construct the decoder cell
    def create_cell(rnn_size):
        lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size,
                                            initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        drop = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=keep_prob)
        return drop

    dec_cell = tf.contrib.rnn.MultiRNNCell([create_cell(rnn_size) for _ in range(num_layers)])
    #dec_cell = tf.contrib.rnn.MultiRNNCell(cells_a)

    # attention details
    attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units=rnn_size, memory=encoder_outputs)
    attn_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, attention_mechanism, attention_layer_size=rnn_size/2)

    attn_zero = attn_cell.zero_state(batch_size, tf.float32)
    attn_zero = attn_zero.clone(cell_state=encoder_state)

    #new_state = tf.contrib.seq2seq.AttentionWrapperState(cell_state=encoder_state, attention=attn_zero, time=0, alignments=None, alignment_history=())

    """out_cell = tf.contrib.rnn.OutputProjectionWrapper(
        attn_cell, target_vocab_size, reuse=True
    )"""
    # end of attention
    #tensor_util.make_tensor_proto(attn_cell)

    output_layer = Dense(target_vocab_size,
                         kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

    with tf.variable_scope("decode"):
        train_decoder_out = decoding_layer_train(attn_zero, attn_cell, dec_embed_input,
                                                 target_sequence_length, max_target_sequence_length,
                                                 output_layer, keep_prob)

    with tf.variable_scope("decode", reuse=True):
        infer_decoder_out = decoding_layer_infer(attn_zero, attn_cell, dec_embeddings,
                                                 target_vocab_to_int['<GO>'], target_vocab_to_int['<EOS>'],
                                                 max_target_sequence_length,
                                                 target_vocab_size, output_layer, batch_size, keep_prob)

    return (train_decoder_out, infer_decoder_out)