I'm trying to use tf.contrib.seq2seq module to do forecasting on some data (just float32 vectors) but all the examples I found using the seq2seq module from TensorFlow are used for translation and therefore embeddings.
I'm struggling to understand exactly what tf.contrib.seq2seq.Helper is doing for the Seq2Seq architecture and how I can use the CustomHelper in my case.
This is what I've done for now:
import tensorflow as tf
from tensorflow.python.layers import core as layers_core
# Forecasting graph (TF 1.x): an LSTM encoder followed by an attention-wrapped
# LSTM decoder built with tf.contrib.seq2seq. Inputs and targets are raw
# float32 vectors, so no embedding layer is involved.
input_seq_len = 15  # Sequence length as input
input_dim = 1  # Nb of features in input
output_seq_len = forecast_len = 20  # horizon length for forecasting
output_dim = 1  # nb of features to forecast
encoder_units = 200  # nb of units in each cell for the encoder
decoder_units = 200  # nb of units in each cell for the decoder
attention_units = 100
batch_size = 8

graph = tf.Graph()
with graph.as_default():
    learning_ = tf.placeholder(tf.float32)

    # NOTE(review): the pasted code had lost its indentation; the extent of
    # this variable scope is an assumption -- confirm against the original.
    with tf.variable_scope('Seq2Seq'):
        # Placeholder for encoder input
        enc_input = tf.placeholder(tf.float32, [None, input_seq_len, input_dim])
        # Placeholder for decoder output - Targets
        target = tf.placeholder(tf.float32, [None, output_seq_len, output_dim])

        # Build RNN cell
        encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_units)
        initial_state = encoder_cell.zero_state(batch_size, dtype=tf.float32)

        # Run Dynamic RNN
        # encoder_outputs: [batch_size, seq_size, num_units]
        # encoder_state: [batch_size, num_units]
        encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
            encoder_cell, enc_input, initial_state=initial_state)

        ## Attention layer
        # Both mechanisms are constructed; only the Luong one is passed to
        # the AttentionWrapper below.
        attention_mechanism_bahdanau = tf.contrib.seq2seq.BahdanauAttention(
            num_units=attention_units,  # depth of query mechanism
            memory=encoder_outputs,  # hidden states to attend (output of RNN)
            normalize=False,  # normalize energy term
        )  # closing parenthesis was missing in the paste
        attention_mechanism_luong = tf.contrib.seq2seq.LuongAttention(
            num_units=encoder_units,
            memory=encoder_outputs,
        )  # closing parenthesis was missing in the paste

        # Simple Dense layer to project from rnn_dim to the desired output_dim
        projection = layers_core.Dense(output_dim, use_bias=True, name="output_projection")

        helper = tf.contrib.seq2seq.TrainingHelper(
            target, sequence_length=[output_seq_len for _ in range(batch_size)])
        ## This is where I don't really know what to do in my case, is this function changing my data into [ GO, data, END] ?

        decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(decoder_units)
        attention_cell = tf.contrib.seq2seq.AttentionWrapper(
            cell=decoder_cell,
            attention_mechanism=attention_mechanism_luong,  # Instance of AttentionMechanism
            attention_layer_size=attention_units,
        )  # closing parenthesis was missing in the paste

        # Start from the wrapper's zero state, then replace its cell state
        # with the encoder's final state.
        initial_state = attention_cell.zero_state(batch_size=batch_size, dtype=tf.float32)
        initial_state = initial_state.clone(cell_state=encoder_state)

        decoder = tf.contrib.seq2seq.BasicDecoder(
            attention_cell, initial_state=initial_state, helper=helper,
            output_layer=projection)
        outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=decoder)

    # Loss function: half squared error summed over the feature axis, then
    # averaged over the time axis and over the batch.
    loss = 0.5 * tf.reduce_sum(tf.square(outputs[0] - target), -1)
    loss = tf.reduce_mean(loss, 1)
    loss = tf.reduce_mean(loss)

    # Optimizer
    optimizer = tf.train.AdamOptimizer(learning_).minimize(loss)
I understood that Training state and Inference state are quite different for the Seq2seq architecture but I don't know how to use the Helpers from the module in order to distinguish both.
I'm using this module because it's quite useful for Attention Layers.
How can I use the Helper in order to create a ['Go' , [input_sequence]] for the decoder ?


Keras Bidirectional LSTM seq2seq inference model expects 3 inputs but only receives 1, even though I am passing in 3 inputs

I am creating a language model with a bidirectional-LSTM seq2seq model.
I have created the model and trained it successfully:
# Training model: a bidirectional-LSTM encoder whose forward/backward states
# are concatenated and used as the initial state of a decoder LSTM with
# twice as many units.
lstm_units = 100
# Set up embedding layer using pretrained weights
embedding_layer = Embedding(total_words+1, emb_dimension, input_length=max_input_len, weights=[embedding_matrix], name="Embedding")
# Encoder
encoder_input_x = Input(shape=(None,), name="Enc_x_Input")
encoder_embedding_x = embedding_layer(encoder_input_x)
# With return_state=True the Bidirectional wrapper is unpacked here into the
# output plus the h and c states of the forward and backward LSTMs.
encoder_lstm_x, enc_state_h_fwd, enc_state_c_fwd, enc_state_h_bwd, enc_state_c_bwd = Bidirectional(LSTM(lstm_units, dropout=0.5, return_state=True, name="Enc_LSTM1"), name="Enc_Bi1")(encoder_embedding_x) # pass hidden activation and memory cell states forward
encoder_state_h = Concatenate()([enc_state_h_fwd, enc_state_h_bwd])
encoder_state_c = Concatenate()([enc_state_c_fwd, enc_state_c_bwd])
encoder_states = [encoder_state_h, encoder_state_c] # package states to pass to decoder
# Decoder
decoder_input_x = Input(shape=(None,), name="Dec_x_Input")
# The same embedding layer instance is shared by encoder and decoder.
decoder_embedding_x = embedding_layer(decoder_input_x)
decoder_lstm_layer = LSTM(lstm_units*2, return_state=True, return_sequences=True, dropout=0.5, name="Dec_LSTM1") # The layer is created first and called separately so the same instance can be reused by the inference model.
decoder_lstm_x, _, _ = decoder_lstm_layer(decoder_embedding_x, initial_state=encoder_states) # we pass in encoder states
decoder_dense_layer = TimeDistributed(Dense(total_words+1, activation="softmax", name="Dec_Softmax")) # Kept in a variable so the inference model can reuse it, as with the LSTM above.
decoder_output_x = decoder_dense_layer(decoder_lstm_x)
model = Model(inputs=[encoder_input_x, decoder_input_x], outputs=decoder_output_x)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
I then set up the inference model:
# Inference Encoder
inf_encoder_model = Model(encoder_input_x, encoder_states) # Here we are creating a model using layers from the model we built earlier.
# The encoder model outputs the encoder_states, i.e. the concatenated h and c values from the BLSTM
# Inference Decoder
# Create new inputs for decoder state
inf_dec_state_h_input = Input(shape=(2*lstm_units,), name="Dec_h_state_input") # These must be sized to fit both the FWD and BWD h values from the BLSTM
inf_dec_state_c_input = Input(shape=(2*lstm_units,), name="Dec_c_state_input")
inf_dec_state_input = [inf_dec_state_h_input, inf_dec_state_c_input] # package states to pass to decoder
# Decoder LSTM + Dense
inf_decoder_lstm_x, inf_dec_state_h, inf_dec_state_c = decoder_lstm_layer(decoder_embedding_x, initial_state=inf_dec_state_input) # Reuses the embedded decoder input from training; the initial state now comes from the new state inputs.
inf_decoder_states = [inf_dec_state_h, inf_dec_state_c] # I think when we loop inference, we'll pass these states back in as the input instead of the encoder states
inf_decoder_output = decoder_dense_layer(inf_decoder_lstm_x)
decoder_model = Model([decoder_input_x] + inf_dec_state_input, [inf_decoder_output] + inf_decoder_states) # we reuse the decoder_input_x from the training model
The decoder model for inference is set up to take the decoder inputs + the c and h states which are output from the encoder.
When running the inference loop using this code:
# Encode one training example; `states` is the [h, c] list returned by the encoder model.
states = inf_encoder_model.predict(x_inputs[700])
# Generate an all-zero target array of shape (max_output_len, 1).
# NOTE(review): the surrounding text describes target_seq as shape (1, 60), which does not match this call -- confirm which is intended.
target_seq = np.zeros((max_output_len, 1), dtype=int)
# Populate the first character of target sequence with the start character.
target_seq[0, 0] = 4 # 4 is the start of sequence token used during training
# Get prediction
prediction, h, c = decoder_model.predict([target_seq] + states)
it gives me a long error that ends with:
ValueError: Layer Dec_LSTM1 expects 3 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'model_16/Embedding/embedding_lookup/Identity_1:0' shape=(None, 1, 100) dtype=float32>]
The encoder states seem to be fine; a list containing 2 arrays, the h and c values, each with shape (60, 200). The target_seq is an array of shape (1, 60). x_inputs[700] is training data, also of shape (1, 60).
Why is the model.predict line suggesting I am giving it 1 input tensor when I am giving it a list containing 3 arrays?

How to define and use a custom loss function in keras

I have a model in Keras. The model uses binary cross-entropy (log loss). However, I want to create my own custom binary cross-entropy (log loss) function for it.
here is my model
def get_model(train, num_users, num_items, layers=[20, 10, 5, 2]):
    """Build a Keras model that scores a (user, item) pair with a sigmoid output.

    Each user id is mapped to that user's row of the training matrix and each
    item id to that item's column; both vectors pass through a linear Dense
    "embedding", are concatenated, and feed a single sigmoid unit.

    Args:
        train: Training data handed to ``getTrainMatrix``.
        num_users: Number of users; length of an item's rating vector.
        num_items: Number of items; length of a user's rating vector.
        layers: Layer sizes; only ``layers[0]`` is used here (each embedding
            gets ``layers[0] // 2`` units). The default list is never mutated.

    Returns:
        The uncompiled Keras ``Model`` taking ``[user_input, item_input]``.
    """
    num_layer = len(layers)  # Number of layers in the MLP (unused in this excerpt)

    # Build the matrix once and reuse it for both orientations.
    # NOTE(review): assumes getTrainMatrix has no side effects -- confirm.
    train_matrix = getTrainMatrix(train)
    user_matrix = K.constant(train_matrix)
    item_matrix = K.constant(train_matrix.T)

    # Input variables
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')

    # Look up the rating vector for each id, then drop the extra axis
    # introduced by the (1,)-shaped inputs.
    user_rating = Lambda(lambda x: tf.gather(user_matrix, tf.to_int32(x)))(user_input)
    item_rating = Lambda(lambda x: tf.gather(item_matrix, tf.to_int32(x)))(item_input)
    user_rating = Reshape((num_items, ))(user_rating)
    item_rating = Reshape((num_users, ))(item_rating)

    MLP_Embedding_User = Dense(layers[0]//2, activation="linear", name='user_embedding')
    MLP_Embedding_Item = Dense(layers[0]//2, activation="linear", name='item_embedding')
    user_latent = MLP_Embedding_User(user_rating)
    item_latent = MLP_Embedding_Item(item_rating)

    # The 0-th layer is the concatenation of embedding layers
    vector = concatenate([user_latent, item_latent])

    # Final prediction layer
    # NOTE(review): the tail of this call and of the Model(...) call below was
    # cut off in the paste; `name='prediction')(vector)` and
    # `outputs=prediction` are reconstructions -- confirm against the original.
    prediction = Dense(1, activation='sigmoid',
                       kernel_initializer=initializers.lecun_normal(),
                       name='prediction')(vector)
    model_ = Model(inputs=[user_input, item_input],
                   outputs=prediction)
    return model_
Here is the call to the compile function.
model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy')
Now my question is: how do I define a custom binary cross-entropy loss for it?

How to use tensorflow seq2seq without embeddings?

I have been working on LSTMs for time-series forecasting using TensorFlow. Now I want to try sequence-to-sequence (seq2seq). On the official site there is a tutorial which shows NMT with embeddings. So, how can I use this new seq2seq module without embeddings (directly using time-series "sequences")?
# 1. Encoder
encoder_cell = tf.contrib.rnn.BasicLSTMCell(LSTM_SIZE)
# NOTE(review): the argument list of this call was cut off in the paste.
# `x` (the input sequence named in the question) and dtype=tf.float32 are
# reconstructions -- confirm against the original.
encoder_outputs, encoder_state = tf.nn.static_rnn(
    encoder_cell, x, dtype=tf.float32)
# Decoder
decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(LSTM_SIZE)
# decoder_emb_inp and decoder_lengths are the two arguments the question
# asks about; they are not defined in this snippet.
helper = tf.contrib.seq2seq.TrainingHelper(
    decoder_emb_inp, decoder_lengths, time_major=True)
decoder = tf.contrib.seq2seq.BasicDecoder(
    decoder_cell, helper, encoder_state)
# Dynamic decoding
outputs, _ = tf.contrib.seq2seq.dynamic_decode(decoder)
outputs = outputs[-1]
# output is result of linear activation of last layer of RNN
weight = tf.Variable(tf.random_normal([LSTM_SIZE, N_OUTPUTS]))
bias = tf.Variable(tf.random_normal([N_OUTPUTS]))
predictions = tf.matmul(outputs, weight) + bias
What should be the args for TrainingHelper() if I use input_seq=x and output_seq=label?
decoder_emb_inp ???
decoder_lengths ???
Where input_seq are the first 8 point of the sequence, and output_seq are the last 2 point of the sequence.
Thanks in advance!
I got it to work for no embedding using a very rudimentary InferenceHelper:
# InferenceHelper that feeds each decoder output straight back in as the next
# input. sample_shape, sample_dtype and start_inputs are required constructor
# arguments that were missing from the pasted snippet.
# NOTE(review): sample_shape=[1] matches the dim=1 example in this post;
# tf.float32 and start_tokens are reconstructions -- confirm.
inference_helper = tf.contrib.seq2seq.InferenceHelper(
    sample_fn=lambda outputs: outputs,
    sample_shape=[1],
    sample_dtype=tf.float32,
    start_inputs=start_tokens,
    end_fn=lambda sample_ids: False)
My inputs are floats with the shape [batch_size, time, dim]. For the example below dim would be 1, but this can easily be extended to more dimensions. Here's the relevant part of the code:
# Output projection from the decoder cell's output to the data dimension.
# NOTE(review): the initializer name was lost in the paste; only its
# mean/stddev arguments survived. tf.truncated_normal_initializer is a
# reconstruction -- confirm.
projection_layer = tf.layers.Dense(
    units=1,  # = dim
    kernel_initializer=tf.truncated_normal_initializer(
        mean=0.0, stddev=0.1))

# Training Decoder
training_decoder_output = None
with tf.variable_scope("decode"):
    # output_data doesn't exist during prediction phase.
    if output_data is not None:
        # Prepend the "go" token
        # NOTE(review): the guard tests `output_data` but the concat uses
        # `target_data`; left as posted -- confirm which name is intended.
        go_tokens = tf.constant(go_token, shape=[batch_size, 1, 1])
        dec_input = tf.concat([go_tokens, target_data], axis=1)

        # Helper for the training process.
        # NOTE(review): the `inputs` argument was lost in the paste;
        # dec_input (built just above) is the reconstruction.
        training_helper = tf.contrib.seq2seq.TrainingHelper(
            inputs=dec_input,
            sequence_length=[output_size] * batch_size)

        # Basic decoder
        training_decoder = tf.contrib.seq2seq.BasicDecoder(
            dec_cell, training_helper, enc_state, projection_layer)

        # Perform dynamic decoding using the decoder
        # NOTE(review): any arguments after impute_finished were lost in the
        # paste; the call is simply closed here.
        training_decoder_output = tf.contrib.seq2seq.dynamic_decode(
            training_decoder, impute_finished=True)

# Inference Decoder
# Reuses the same parameters trained by the training process.
with tf.variable_scope("decode", reuse=tf.AUTO_REUSE):
    start_tokens = tf.constant(
        go_token, shape=[batch_size, 1])

    # The sample_ids are the actual output in this case (not dealing with any logits here).
    # My end_fn is always False because I'm working with a generator that will stop giving
    # more data. You may extend the end_fn as you wish. E.g. you can append end_tokens
    # and make end_fn be true when the sample_id is the end token.
    # NOTE(review): sample_dtype and start_inputs are required by
    # InferenceHelper and were lost in the paste; the values are reconstructions.
    inference_helper = tf.contrib.seq2seq.InferenceHelper(
        sample_fn=lambda outputs: outputs,
        sample_shape=[1],  # again because dim=1
        sample_dtype=tf.float32,
        start_inputs=start_tokens,
        end_fn=lambda sample_ids: False)

    # Basic decoder
    # NOTE(review): the remaining arguments were lost in the paste; they
    # mirror the training decoder's.
    inference_decoder = tf.contrib.seq2seq.BasicDecoder(
        dec_cell, inference_helper, enc_state, projection_layer)

    # Perform dynamic decoding using the decoder.
    # end_fn never signals completion, so maximum_iterations is what bounds
    # the decode loop. NOTE(review): output_size is a reconstruction.
    inference_decoder_output = tf.contrib.seq2seq.dynamic_decode(
        inference_decoder, impute_finished=True,
        maximum_iterations=output_size)
Have a look at this question. Also I found this tutorial to be very useful to understand seq2seq models, although it does use embeddings. So replace their GreedyEmbeddingHelper by an InferenceHelper like the one I posted above.
P.s. I posted the full code at

MXNET custom loss function and eval_metric

How do I create a custom loss function in MXNET? For example, instead of computing cross-entropy loss for one label (using standard mx.sym.SoftmaxOutput layer which computes cross-entropy loss and returns a symbol that can be passed as a loss symbol to the fit function), I want to compute weighted cross-entropy loss for each possible label. The MXNET tutorials mention using
mx.symbol.MakeLoss(scalar_loss_symbol, normalization='batch')
However, when I use MakeLoss function, the standard eval_metric - "acc" does not work (obviously as the model doesn't know what is my predicted probability vector). Therefore I need to write my own eval_metric.
Further, at the time of prediction, I need to predict the probability vector as well, which cannot be accessed unless I group the final probability vector with the loss symbol and block_grad on it.
The code below is a modification of the MXNET tutorial where the standard SoftmaxOutput loss function is rewritten for a custom weighted loss function and required custom eval_metric is written.
import logging
import mxnet as mx
import numpy as np
# MNIST LeNet whose SoftmaxOutput head is replaced by a hand-written
# cross-entropy loss (MakeLoss) over one-hot label vectors.
mnist = mx.test_utils.get_mnist()
batch_size = 100

# One-hot encode the integer training labels: one row per sample, one column
# per class.
weighted_train_labels = np.zeros(
    (mnist['train_label'].shape[0], np.max(mnist['train_label']) + 1))
weighted_train_labels[np.arange(mnist['train_label'].shape[0]), mnist['train_label']] = 1
# NOTE(review): the constructor name was lost in the paste;
# mx.io.NDArrayIter(mnist[...], ...) is a reconstruction -- confirm.
train_iter = mx.io.NDArrayIter(
    mnist['train_data'], {'label': weighted_train_labels}, batch_size, shuffle=True)

weighted_test_labels = np.zeros(
    (mnist['test_label'].shape[0], np.max(mnist['test_label']) + 1))
weighted_test_labels[np.arange(mnist['test_label'].shape[0]), mnist['test_label']] = 1
val_iter = mx.io.NDArrayIter(
    mnist['test_data'], {'label': weighted_test_labels}, batch_size)

data = mx.sym.var('data')
# first conv layer
conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=20)
tanh1 = mx.sym.Activation(data=conv1, act_type="tanh")
pool1 = mx.sym.Pooling(data=tanh1, pool_type="max", kernel=(2, 2), stride=(2, 2))
# second conv layer
conv2 = mx.sym.Convolution(data=pool1, kernel=(5, 5), num_filter=50)
tanh2 = mx.sym.Activation(data=conv2, act_type="tanh")
pool2 = mx.sym.Pooling(data=tanh2, pool_type="max", kernel=(2, 2), stride=(2, 2))
# first fullc layer
flatten = mx.sym.flatten(data=pool2)
fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=500)
tanh3 = mx.sym.Activation(data=fc1, act_type="tanh")
# second fullc
fc2 = mx.sym.FullyConnected(data=tanh3, num_hidden=10)
# softmax loss
#lenet = mx.sym.SoftmaxOutput(data=fc2, name='softmax')
label = mx.sym.var('label')
# Despite its name, `softmax` holds log-probabilities (log_softmax).
softmax = mx.sym.log_softmax(data=fc2)
# Gradient-blocked copy of the log-probabilities, grouped below as a second
# output next to the loss.
softmax_output = mx.sym.BlockGrad(data=softmax, name='softmax')
# Cross-entropy: negative sum over classes and samples of label * log-prob.
ce = -mx.sym.sum(mx.sym.sum(mx.sym.broadcast_mul(softmax, label), 1))
lenet = mx.symbol.MakeLoss(ce, normalization='batch')
sym = mx.sym.Group([softmax_output, lenet])
# Call list_outputs so the output names are printed, not the bound method.
print(sym.list_outputs())
def custom_metric(label, softmax):
    """Return the fraction of rows whose arg-max class matches the label's.

    Args:
        label: 2-D array of per-class label weights, one row per sample.
        softmax: 2-D array of per-class scores, one row per sample. Only the
            position of each row's maximum is used.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when the batch is empty.
    """
    num_samples = label.shape[0]
    if num_samples == 0:
        # Avoid dividing by zero on an empty batch.
        return 0.0
    hits = np.count_nonzero(np.argmax(softmax, 1) == np.argmax(label, 1))
    return hits / float(num_samples)
# Wrap custom_metric so it is evaluated on the 'softmax_output' output against
# the 'label' input.
eval_metrics = mx.metric.CustomMetric(
    custom_metric, name='custom-accuracy',
    output_names=['softmax_output'], label_names=['label'])
# No trailing comma after the call: with one, lenet_model would be a 1-tuple
# instead of a Module.
lenet_model = mx.mod.Module(
    symbol=sym, context=mx.gpu(), data_names=['data'], label_names=['label'])
#batch_end_callback = mx.callback.Speedometer(batch_size, 100),

How to use tf.contrib.seq2seq.BahdanauAttention

I am trying to write simple code for a seq2seq model with attention in TF 1.1. I am not sure what the parameter "depth of query mechanism" means. I am getting an error when creating the attention mechanism:
TypeError: int() argument must be a string, a bytes-like object or a number, not 'TensorShape'
Here is my code. Am I on the right track? I could not find any detailed documentation.
import tensorflow as tf
from tensorflow.contrib.rnn import LSTMCell, LSTMStateTuple, BasicLSTMCell, DropoutWrapper, MultiRNNCell, EmbeddingWrapper, static_rnn
import tensorflow.contrib.seq2seq as seq2seq
import attention_wrapper as wrapper
sess = tf.InteractiveSession()

## Place holders
# One int32 placeholder per time step (static_rnn below takes a list).
# NOTE(review): no shape argument is passed; one may have been lost in the
# paste -- confirm.
encode_input = [tf.placeholder(tf.int32,
                               name="ei_%i" % i)
                for i in range(input_seq_length)]
labels = [tf.placeholder(tf.int32,
                         name="l_%i" % i)
          for i in range(output_seq_length)]
# Decoder input: an all-zero "GO" step followed by the labels shifted right
# by one. tf.int32 replaces np.int32 because numpy is never imported here.
decode_input = [tf.zeros_like(encode_input[0], dtype=tf.int32, name="GO")] + labels[:-1]

############ Encoder
lstm_cell = BasicLSTMCell(embedding_dim)
encoder_cell = EmbeddingWrapper(lstm_cell, embedding_classes=input_vocab_size, embedding_size=embedding_dim)
encoder_outputs, encoder_state = static_rnn(encoder_cell, encode_input, dtype=tf.float32)

############ Decoder
# Attention Mechanisms. Bahdanau is additive style attention
attn_mech = tf.contrib.seq2seq.BahdanauAttention(
    num_units=input_seq_length,  # depth of query mechanism
    memory=encoder_outputs,  # hidden states to attend (output of RNN)
    normalize=False,  # normalize energy term
)  # closing parenthesis was missing in the paste
lstm_cell_decoder = BasicLSTMCell(embedding_dim)
# Attention Wrapper: adds the attention mechanism to the cell
attn_cell = wrapper.AttentionWrapper(
    cell=lstm_cell_decoder,  # Instance of RNNCell
    attention_mechanism=attn_mech,  # Instance of AttentionMechanism
    attention_size=embedding_dim,  # Int, depth of attention (output) tensor
    attention_history=False,  # whether to store history in final output
)  # closing parenthesis was missing in the paste
# Decoder setup
# NOTE(review): `helper` is not defined anywhere in this snippet, and the
# decoder is given lstm_cell_decoder rather than attn_cell -- left as posted.
decoder = tf.contrib.seq2seq.BasicDecoder(
    cell=lstm_cell_decoder,
    helper=helper,  # A Helper instance
    initial_state=encoder_state,  # initial state of decoder
    output_layer=None)  # instance of tf.layers.Layer, like Dense
# Perform dynamic decoding with decoder object
outputs, final_state = tf.contrib.seq2seq.dynamic_decode(decoder)