Masking Layer and Mask_Zero in Embedding in Keras - tensorflow

I have deployed a model where i have a sequence of pages visited by the customer and then the numerical features like time spent on that page etc. Now, to pass the model in keras, I encode the pages information in the form of embeddings and concatenate it with other numerical features and pass it to a LSTM Cell in Keras. Here, to ignore the padded value in embeddings, I am using a mask_zero = True argument in the keras layer and to ignore numerical features for that timestamp, using the masking layer in keras to skip if value = -99 (have used the -99 as the padded value for the numerical feature). Below is the model summary.
from keras.layers import Input, Embedding, Dot, Reshape, Dense, Dropout,Concatenate,Masking
##Input for the Sequential Data
input0 = Input(name=str(inputs[0]),shape=[max_len])
input1 = Input(name=str(inputs[1]),shape=[max_len])
input2 = Input(name=str(inputs[2]),shape=[max_len])
input3 = Input(name=str(inputs[3]),shape=[max_len])
##Input Profiles for the Timespent on each page
input_ts0 = Input(name=str(inputs_ts[0]),shape=[max_len,1])
input_ts1 = Input(name=str(inputs_ts[1]),shape=[max_len,1])
input_ts2 = Input(name=str(inputs_ts[2]),shape=[max_len,1])
input_ts3 = Input(name=str(inputs_ts[3]),shape=[max_len,1])
##Embedding Layer
embed0 = Embedding(def_val+1,50,input_length=max_len,mask_zero=True)(input0)
embed1 = Embedding(def_val+1,50,input_length=max_len,mask_zero=True)(input1)
embed2 = Embedding(def_val+1,50,input_length=max_len,mask_zero=True)(input2)
embed3 = Embedding(def_val+1,50,input_length=max_len,mask_zero=True)(input3)
##concatenate the embedding and time spent on each page
ts_eve_concat0 = Concatenate(name='Concatenated_eve_ts0')([embed0,input_ts0])
ts_eve_concat1 = Concatenate(name='Concatenated_eve_ts1')([embed1,input_ts1])
ts_eve_concat2 = Concatenate(name='Concatenated_eve_ts2')([embed2,input_ts2])
ts_eve_concat3 = Concatenate(name='Concatenated_eve_ts3')([embed3,input_ts3])
##Masking the TS input where there is no information
masking_0 = Masking(mask_value = -99)(ts_eve_concat0)
masking_1 = Masking(mask_value = -99)(ts_eve_concat1)
masking_2 = Masking(mask_value = -99)(ts_eve_concat2)
masking_3 = Masking(mask_value = -99)(ts_eve_concat3)
#LSTM on all the individual layers
lstm0 = LSTM(32)(masking_0)
lstm1 = LSTM(32)(masking_1)
lstm2 = LSTM(32)(masking_2)
lstm3 = LSTM(32)(masking_3)
##Concatenate all the LSTM Layers
concat_lstm = Concatenate(name='Concatenated_lstm')([lstm0,lstm1,lstm2,lstm3])
layer = Dense(64,name='FC1')(concat_lstm)
layer = Activation('relu')(layer)
layer = Dropout(0.3)(layer)
layer = Dense(32,name='FC2',activation='relu')(layer)
layer = Dropout(0.3)(layer)
layer = Dense(1,name='out_layer')(layer)
layer = Activation('sigmoid')(layer)
Is this approach correct or do I need to send the information in some other manner

Related

How can i extract the encoded part of multi-modal autoencoder and convert the .h5 model to a numpy array?

I am making a deep multimodal autoencoder model which takes two inputs and produces a two outputs (which are the reconstructed inputs). The two inputs are with shape of (1000, 50) and (1000,60) respectively and the model has 3 hidden layers and aim to concatenate the two latent layer of input1 and input2.
I would like to extract the encoded part of my model and save the data as a numpy array.
here is the complete code of the model :
input_X = Input(shape=(X[0].shape))
dense_X = Dense(40,activation='relu')(input_X)
dense1_X = Dense(20,activation='relu')(dense_X)
latent_X= Dense(2,activation='relu')(dense1_X)
input_X1 = Input(shape=(X1[0].shape))
dense_X1 = Dense(40,activation='relu')(input_X1)
dense1_X1 = Dense(20,activation='relu')(dense_X1)
latent_X1= Dense(2,activation='relu')(dense1_X1)
Concat_X_X1 = concatenate([latent_X, latent_X1])
decoding_X = Dense(20,activation='relu')(Concat_X_X1)
decoding1_X = Dense(40,activation='relu')(decoding_X)
output_X = Dense(X[0].shape[0],activation='sigmoid')(decoding1_X)
decoding_X1 = Dense(20,activation='relu')(Concat_X_X1)
decoding1_X1 = Dense(40,activation='relu')(decoding_X1)
output_X1 = Dense(X1[0].shape[0],activation='sigmoid')(decoding1_X1)
multi_modal_autoencoder = Model([input_X, input_X1], [output_X, output_X1], name='multi_modal_autoencoder')
encoder = Model([input_X, input_X1], Concat_X_X1)
encoder.save('encoder.h5')
multi_modal_autoencoder.compile(optimizer=keras.optimizers.Adam(lr=0.001),loss='mse')
model = multi_modal_autoencoder.fit([X,X1], [X, X1], epochs=70, batch_size=150)
With h5py package you can get into your .h5 file and extract exactly what you want:
f = h5py.File('encoder.h5', 'r')
keys = list(f.keys())
values = f.get('some_key')
You can hierarchically use .get many times to go deeper into your .h5 file to extract what you need.

Keras Functional API Multiple Input Shape Errors

My goal is to use a CNN to go through a picture, then add an array of extra data before the dense layers.
picIn = keras.Input(shape=x[0].shape)
conv1 = layers.Conv2D(32,kernel_size=3,padding='same',use_bias=False)(picIn)
batch1 = layers.BatchNormalization()(conv1)
leaky1 = layers.LeakyReLU(alpha=.3)(batch1)
conv2 = layers.Conv2D(32,kernel_size=3,padding='same',use_bias=False)(leaky1)
batch2 = layers.BatchNormalization()(conv2)
leaky2 = layers.LeakyReLU(alpha=.3)(batch2)
cdrop1 = layers.Dropout(.20)(leaky2)
conv3= layers.Conv2D(64,kernel_size=3,padding='same',use_bias=False)(cdrop1)
batch3 = layers.BatchNormalization()(conv3)
leaky3 = layers.LeakyReLU(alpha=.3)(batch3)
conv4 = layers.Conv2D(64,kernel_size=3,padding='same',use_bias=False)(leaky3)
batch4 = layers.BatchNormalization()(conv4)
leaky4 = layers.LeakyReLU(alpha=.3)(batch4)
cdrop2 = layers.Dropout(.20)(leaky4)
flat1 = layers.Flatten()(cdrop2)
rtheta1 = rtheta[trainCut]
rtheta1 = rtheta1.reshape(467526,1)
rtheta2 = rtheta[testCut]
rtheta2 = rtheta2.reshape(82247,1)
ip2 = keras.Input(shape=rtheta1.shape)
flat2 = layers.Flatten()(ip2)
merge = layers.Concatenate()([flat1,flat2])
hidden1 = layers.Dense(512,use_bias=False)(merge)
batch5 = layers.BatchNormalization()(hidden1)
leaky5 = layers.LeakyReLU(alpha=.3)(batch5)
ddrop1 = layers.Dropout(.20)(leaky5)
hidden2 = layers.Dense(512,use_bias=False)(ddrop1)
batch6 = layers.BatchNormalization()(hidden2)
leaky6 = layers.LeakyReLU(alpha=.3)(batch6)
ddrop2 = layers.Dropout(.20)(leaky6)
hidden3 = layers.Dense(512,use_bias=False)(merge)
batch7 = layers.BatchNormalization()(hidden1)
leaky7 = layers.LeakyReLU(alpha=.3)(batch5)
ddrop3 = layers.Dropout(.20)(leaky5)
output = layers.Dense(1)(ddrop3)
model = keras.Model(inputs = [picIn,ip2], outputs = output)
H = model.fit(x =[ x[trainCut],rtheta[trainCut]],y= y[trainCut],batch_size=args.bsize,validation_data=([x[testCut],rtheta[testCut]], y[testCut]),epochs=args.epochs)
I always get an error related to the shape of the inputs
Input 0 of layer dense is incompatible with the layer: expected axis -1 of input shape to have value 473926 but received input with shape [None, 6401]
Model was constructed with shape (None, 467526, 1) for input Tensor("input_2:0", shape=(None, 467526, 1), dtype=float32), but it was called on an input with incompatible shape (None, 1, 1).
Im confused on what exactly to do here.
x[traincut] is a matrix of size (467526,10,10,2)
rtheta1 is (467526,1) and so is y[traincut]
The validation data is the same except it is 82247 instead of 467526.
I have tried it without flattening after ip2 and I get a different error but I think the core issue is still the same.
Any help would be appreciated. Thanks!
Edit: The data was not the right shape, obviously, but I figured out how to fix it.
Are you ensuring that all of your training data's shape is uniform before you put it through and into the first tensor?

model size too big with my attention model implementation?

I am implementing Minh-Thang Luong's attention model to build a english to chinese translater.And the model i trained has abnormally big size(980 MB).Minh-Thang Luong's original paper
this is model parameters
state size:120
source language vocabulary size:400000
source language word embedding size:400000*50
target language vocabulary size:20000
target language word embedding size:20000*300
This is my model implementation in tensorflow.
import tensorflow as tf
src_vocab_size=400000
src_w2v_dim=50
tgt_vocab_size=20000
tgt_w2v_dim=300
state_size=120
with tf.variable_scope('net_encode'):
ph_src_embedding = tf.placeholder(dtype=tf.float32,shape=[src_vocab_size,src_w2v_dim],name='src_vocab_embedding_placeholder')
#src_word_emb = tf.Variable(initial_value=ph_src_embedding,dtype=tf.float32,trainable=False, name='src_vocab_embedding_variable')
encoder_X_ix = tf.placeholder(shape=(None, None), dtype=tf.int32)
encoder_X_len = tf.placeholder(shape=(None), dtype=tf.int32)
encoder_timestep = tf.shape(encoder_X_ix)[1]
encoder_X = tf.nn.embedding_lookup(ph_src_embedding, encoder_X_ix)
batchsize = tf.shape(encoder_X_ix)[0]
encoder_Y_ix = tf.placeholder(shape=[None, None],dtype=tf.int32)
encoder_Y_onehot = tf.one_hot(encoder_Y_ix, src_vocab_size)
enc_cell = tf.nn.rnn_cell.LSTMCell(state_size)
enc_initstate = enc_cell.zero_state(batchsize,dtype=tf.float32)
enc_outputs, enc_final_states = tf.nn.dynamic_rnn(enc_cell,encoder_X,encoder_X_len,enc_initstate)
enc_pred = tf.layers.dense(enc_outputs, units=src_vocab_size)
encoder_loss = tf.losses.softmax_cross_entropy(encoder_Y_onehot,enc_pred)
encoder_trainop = tf.train.AdamOptimizer(0.001).minimize(encoder_loss)
with tf.variable_scope('net_decode'):
ph_tgt_embedding = tf.placeholder(dtype=tf.float32, shape=[tgt_vocab_size, tgt_w2v_dim],
name='tgt_vocab_embedding_placeholder')
#tgt_word_emb = tf.Variable(initial_value=ph_tgt_embedding, dtype=tf.float32, trainable=False, name='tgt_vocab_embedding_variable')
decoder_X_ix = tf.placeholder(shape=(None, None), dtype=tf.int32)
decoder_timestep = tf.shape(decoder_X_ix)[1]
decoder_X_len = tf.placeholder(shape=(None), dtype=tf.int32)
decoder_X = tf.nn.embedding_lookup(ph_tgt_embedding, decoder_X_ix)
decoder_Y_ix = tf.placeholder(shape=[None, None],dtype=tf.int32)
decoder_Y_onehot = tf.one_hot(decoder_Y_ix, tgt_vocab_size)
dec_cell = tf.nn.rnn_cell.LSTMCell(state_size)
dec_outputs, dec_final_state = tf.nn.dynamic_rnn(dec_cell,decoder_X,decoder_X_len,enc_final_states)
tile_enc = tf.tile(tf.expand_dims(enc_outputs,1),[1,decoder_timestep,1,1]) # [batchsize,decoder_len,encoder_len,state_size]
tile_dec = tf.tile(tf.expand_dims(dec_outputs, 2), [1, 1, encoder_timestep, 1]) # [batchsize,decoder_len,encoder_len,state_size]
enc_dec_cat = tf.concat([tile_enc,tile_dec],-1) # [batchsize,decoder_len,encoder_len,state_size*2]
weights = tf.nn.softmax(tf.layers.dense(enc_dec_cat,units=1),axis=-2) # [batchsize,decoder_len,encoder_len,1]
weighted_enc = tf.tile(weights, [1, 1, 1, state_size])*tf.tile(tf.expand_dims(enc_outputs,1),[1,decoder_timestep,1,1]) # [batchsize,decoder_len,encoder_len,state_size]
attention = tf.reduce_sum(weighted_enc,axis=2,keepdims=False) # [batchsize,decoder_len,state_size]
dec_attention_cat = tf.concat([dec_outputs,attention],axis=-1) # [batchsize,decoder_len,state_size*2]
dec_pred = tf.layers.dense(dec_attention_cat,units=tgt_vocab_size) # [batchsize,decoder_len,tgt_vocab_size]
pred_ix = tf.argmax(dec_pred,axis=-1) # [batchsize,decoder_len]
decoder_loss = tf.losses.softmax_cross_entropy(decoder_Y_onehot,dec_pred)
total_loss = encoder_loss + decoder_loss
decoder_trainop = tf.train.AdamOptimizer(0.001).minimize(total_loss)
_l0 = tf.summary.scalar('decoder_loss',decoder_loss)
_l1 = tf.summary.scalar('encoder_loss',encoder_loss)
log_all = tf.summary.merge_all()
writer = tf.summary.FileWriter(log_path,graph=tf.get_default_graph())
this is a run down of model parameters size that i can think of so far
encoder cell
=(50*120+120*120+120)*4
=(src_lang_embedding_size*statesize+statesize*statesize+statesize)*(forget gate,remember gate,new state,output gate)
=(kernelsize_for_input+kernelsize_for_previous_state+bias)*(forget gate,remember gate,new state,output gate)
=82080 floats
encoder dense layer
=120*400000
=statesize*src_lang_vocabulary_size
=48000000 floats
decoder cell
=(300*120+120*120+120)*4
=(target_lang_embedding_size*statesize+statesize*statesize+statesize)*(forget gate,remember gate,new state,output gate)
=(kernelsize_for_input+kernelsize_for_previous_state+bias)*(forget gate,remember gate,new state,output gate)
=202080 floats
dense layer that compute attention weights
=(120+120)*1
=(encoder_output_size+decoder_output_size)*(1 unit)
=240 floats
decoder dense layer
=(120+120)*20000
=(attention_vector_size+decoder_outputsize)*target_lang_vocabulary_size
=4800000 floats
summing them all gets 212 MB,but the actual model size is 980 MB.So where is wrong?
You are only computing the number of trainable parameters, these are not the only numbers you need to accommodate in the GPU memory.
You are using Adam optimizer so, you need to store gradients for all your parameters and momentums for all the parameters. This means that you need to store each parameter 3 times, this gives you 636 MB.
Then, you need to accommodate all the intermediate states of the network for the forward and the backward pass.
Let's say have a batch size of b and source and the target length of 50, then you have (at least, I might have something forgotten):
b×l×50 source embeddings,
b×l×300 target embeddings,
b×l×5×120 encoder states,
b×l×400000 encoder logits,
b×l×5×300 decoder states,
b×l×120 intermediate attention states,
b×l×20000 output logits.
This is in total 421970×b×l floats that you need to store for your forward and backward pass.
Btw. source vocabulary 400k is a tremendously large number, I don't believe most of them is frequent enough to learn anything meaningful about them. You should use pre-processing (i.e., SentencePiece) that would reduce your vocabulary to a reasonable size.

How to use previous output and hidden states from LSTM for the attention mechanism?

I am currently trying to code the attention mechanism from this paper: "Effective Approaches to Attention-based Neural Machine Translation", Luong, Pham, Manning (2015). (I use global attention with the dot score).
However, I am unsure on how to input the hidden and output states from the lstm decode. The issue is that the input of the lstm decoder at time t depends on quantities that I need to compute using the output and hidden states from t-1.
Here is the relevant part of the code:
with tf.variable_scope('data'):
prob = tf.placeholder_with_default(1.0, shape=())
X_or = tf.placeholder(shape = [batch_size, timesteps_1, num_input], dtype = tf.float32, name = "input")
X = tf.unstack(X_or, timesteps_1, 1)
y = tf.placeholder(shape = [window_size,1], dtype = tf.float32, name = "label_annotation")
logits = tf.zeros((1,1), tf.float32)
with tf.variable_scope('lstm_cell_encoder'):
rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [hidden_size, hidden_size]]
multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
lstm_outputs, lstm_state = tf.contrib.rnn.static_rnn(cell=multi_rnn_cell,inputs=X,dtype=tf.float32)
concat_lstm_outputs = tf.stack(tf.squeeze(lstm_outputs))
last_encoder_state = lstm_state[-1]
with tf.variable_scope('lstm_cell_decoder'):
initial_input = tf.unstack(tf.zeros(shape=(1,1,hidden_size2)))
rnn_decoder_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, state_is_tuple = True)
# Compute the hidden and output of h_1
for index in range(window_size):
output_decoder, state_decoder = tf.nn.static_rnn(rnn_decoder_cell, initial_input, initial_state=last_encoder_state, dtype=tf.float32)
# Compute the score for source output vector
scores = tf.matmul(concat_lstm_outputs, tf.reshape(output_decoder[-1],(hidden_size,1)))
attention_coef = tf.nn.softmax(scores)
context_vector = tf.reduce_sum(tf.multiply(concat_lstm_outputs, tf.reshape(attention_coef, (window_size, 1))),0)
context_vector = tf.reshape(context_vector, (1,hidden_size))
# compute the tilda hidden state \tilde{h}_t=tanh(W[c_t, h_t]+b_t)
concat_context = tf.concat([context_vector, output_decoder[-1]], axis = 1)
W_tilde = tf.Variable(tf.random_normal(shape = [hidden_size*2, hidden_size2], stddev = 0.1), name = "weights_tilde", trainable = True)
b_tilde = tf.Variable(tf.zeros([1, hidden_size2]), name="bias_tilde", trainable = True)
hidden_tilde = tf.nn.tanh(tf.matmul(concat_context, W_tilde)+b_tilde) # hidden_tilde is [1*64]
# update for next time step
initial_input = tf.unstack(tf.reshape(hidden_tilde, (1,1,hidden_size2)))
last_encoder_state = state_decoder
# predict the target
W_target = tf.Variable(tf.random_normal(shape = [hidden_size2, 1], stddev = 0.1), name = "weights_target", trainable = True)
logit = tf.matmul(hidden_tilde, W_target)
logits = tf.concat([logits, logit], axis = 0)
logits = logits[1:]
The part inside the loop is what I am unsure of. Does tensorflow remember the computational graph when I overwrite the variable "initial_input" and "last_encoder_state"?
I think your model will be much simplified if you use tf.contrib.seq2seq.AttentionWrapper with one of implementations: BahdanauAttention or LuongAttention.
This way it'll be possible to wire the attention vector on a cell level, so that cell output is already after attention applied. Example from the seq2seq tutorial:
cell = LSTMCell(512)
attention_mechanism = tf.contrib.seq2seq.LuongAttention(512, encoder_outputs)
attn_cell = tf.contrib.seq2seq.AttentionWrapper(cell, attention_mechanism, attention_size=256)
Note that this way you won't need a loop of window_size, because tf.nn.static_rnn or tf.nn.dynamic_rnn will instantiate the cells wrapped with attention.
Regarding your question: you should distinguish python variables and tensorflow graph nodes: you can assign last_encoder_state to a different tensor, the original graph node won't change because of this. This is flexible, but can be also misleading in the result network - you might think that you connect an LSTM to one tensor, but it's actually the other. In general, you shouldn't do that.

(De-)Convutional lstm autoencoder - error jumps

I'm trying to build a convolutional lstm autoencoder (that also predicts future and past) with Tensorflow, and it works to a certain degree, but the error sometimes jumps back up, so essentially, it never converges.
The model is as follows:
The encoder starts with a 64x64 frame from a 20 frame bouncing mnist video for each time step of the lstm. Every stacking layer of LSTM halfs it and increases the depth via 2x2 convolutions with a stride of 2. (so -->32x32x3 -->...--> 1x1x96)
On the other hand, the lstm performs 3x3 convolutions with a stride of 1 on its state. Both results are concatenated to form the new state. In the same way, the decoder uses transposed convolutions to go back to the original format. Then the squared error is calculated.
The error starts at around 2700 and it takes around 20 hours (geforce1060) to get down to ~1700. At which point the jumping back up (and it sometimes jumps back up to 2300 or even ridiculous values like 440300) happens often enough that I can't really get any lower. Also at that point, it can usually pinpoint where the number should be, but its too fuzzy to actually make out the digit...
I tried different learning rates and optimizers, so if anybody knows why that jumping happens, that'd make me happy :)
Here is a graph of the loss with epochs:
import tensorflow as tf
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
#based on code by loliverhennigh (Github)
class ConvCell(tf.contrib.rnn.RNNCell):
count = 0 #exists only to remove issues with variable scope
def __init__(self, shape, num_features, transpose = False):
self.shape = shape
self.num_features = num_features
self._state_is_tuple = True
self._transpose = transpose
ConvCell.count+=1
self.count = ConvCell.count
#property
def state_size(self):
return (tf.contrib.rnn.LSTMStateTuple(self.shape[0:4],self.shape[0:4]))
#property
def output_size(self):
return tf.TensorShape(self.shape[1:4])
#here comes to the actual conv lstm implementation, if transpose = true, it performs a deconvolution on the input
def __call__(self, inputs, state, scope=None):
with tf.variable_scope(scope or type(self).__name__+str(self.count)):
c, h = state
state_shape = h.shape
input_shape = inputs.shape
#filter variables and convolutions on data coming from the same cell, a time step previous
h_filters = tf.get_variable("h_filters",[3,3,state_shape[3],self.num_features])
h_filters_gates = tf.get_variable("h_filters_gates",[3,3,state_shape[3],3])
h_partial = tf.nn.conv2d(h,h_filters,[1,1,1,1],'SAME')
h_partial_gates = tf.nn.conv2d(h,h_filters_gates,[1,1,1,1],'SAME')
c_filters = tf.get_variable("c_filters",[3,3,state_shape[3],3])
c_partial = tf.nn.conv2d(c,c_filters,[1,1,1,1],'SAME')
#filters and convolutions/deconvolutions on data coming fromthe cell input
if self._transpose:
x_filters = tf.get_variable("x_filters",[2,2,self.num_features,input_shape[3]])
x_filters_gates = tf.get_variable("x_filters_gates",[2,2,3,input_shape[3]])
x_partial = tf.nn.conv2d_transpose(inputs,x_filters,[int(state_shape[0]),int(state_shape[1]),int(state_shape[2]),self.num_features],[1,2,2,1],'VALID')
x_partial_gates = tf.nn.conv2d_transpose(inputs,x_filters_gates,[int(state_shape[0]),int(state_shape[1]),int(state_shape[2]),3],[1,2,2,1],'VALID')
else:
x_filters = tf.get_variable("x_filters",[2,2,input_shape[3],self.num_features])
x_filters_gates = tf.get_variable("x_filters_gates",[2,2,input_shape[3],3])
x_partial = tf.nn.conv2d(inputs,x_filters,[1,2,2,1],'VALID')
x_partial_gates = tf.nn.conv2d(inputs,x_filters_gates,[1,2,2,1],'VALID')
#some more lstm gate business
gate_bias = tf.get_variable("gate_bias",[1,1,1,3])
h_bias = tf.get_variable("h_bias",[1,1,1,self.num_features*2])
gates = h_partial_gates + x_partial_gates + c_partial + gate_bias
i,f,o = tf.split(gates,3,axis=3)
#concatenate the units coming from the spacial and the temporal dimension to build a unified state
concat = tf.concat([h_partial,x_partial],3) + h_bias
new_c = tf.nn.relu(concat)*tf.sigmoid(i)+c*tf.sigmoid(f)
new_h = new_c * tf.sigmoid(o)
new_state = tf.contrib.rnn.LSTMStateTuple(new_c,new_h)
return new_h, new_state #its redundant, but thats how tensorflow likes it, apparently
#global variables
LEARNING_RATE = 0.005
ITERATIONS_PER_EPOCH = 80
BATCH_SIZE = 75
TEST = False #manual switch to go from training to testing
if TEST:
BATCH_SIZE = 1
inputs = tf.placeholder(tf.float32, (20, BATCH_SIZE, 64, 64,1))
shape0 = [BATCH_SIZE,64,64,2]
shape1 = [BATCH_SIZE,32,32,6]
shape2 = [BATCH_SIZE,16,16,12]
shape3 = [BATCH_SIZE,8,8,24]
shape4 = [BATCH_SIZE,4,4,48]
shape5 = [BATCH_SIZE,2,2,96]
shape6 = [BATCH_SIZE,1,1,192]
#apparently tf.multirnncell has very specific requirements for the initial states oO
initial_state1 = (tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape1),tf.zeros(shape1)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape2),tf.zeros(shape2)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape3),tf.zeros(shape3)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape4),tf.zeros(shape4)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape5),tf.zeros(shape5)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape6),tf.zeros(shape6)))
initial_state2 = (tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape5),tf.zeros(shape5)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape4),tf.zeros(shape4)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape3),tf.zeros(shape3)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape2),tf.zeros(shape2)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape1),tf.zeros(shape1)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape0),tf.zeros(shape0)))
#encoding part of the autoencoder graph
cell1 = ConvCell(shape1,3)
cell2 = ConvCell(shape2,6)
cell3 = ConvCell(shape3,12)
cell4 = ConvCell(shape4,24)
cell5 = ConvCell(shape5,48)
cell6 = ConvCell(shape6,96)
mcell = tf.contrib.rnn.MultiRNNCell([cell1,cell2,cell3,cell4,cell5,cell6])
rnn_outputs, rnn_states = tf.nn.dynamic_rnn(mcell, inputs[0:20,:,:,:],initial_state=initial_state1,dtype=tf.float32, time_major=True)
#decoding part of the autoencoder graph, forward block and backwards block
cell9a = ConvCell(shape5,48,transpose = True)
cell10a = ConvCell(shape4,24,transpose = True)
cell11a = ConvCell(shape3,12,transpose = True)
cell12a = ConvCell(shape2,6,transpose = True)
cell13a = ConvCell(shape1,3,transpose = True)
cell14a = ConvCell(shape0,1,transpose = True)
mcella = tf.contrib.rnn.MultiRNNCell([cell9a,cell10a,cell11a,cell12a,cell13a,cell14a])
cell9b = ConvCell(shape5,48,transpose = True)
cell10b = ConvCell(shape4,24,transpose = True)
cell11b= ConvCell(shape3,12,transpose = True)
cell12b = ConvCell(shape2,6,transpose = True)
cell13b = ConvCell(shape1,3,transpose = True)
cell14b = ConvCell(shape0,1,transpose = True)
mcellb = tf.contrib.rnn.MultiRNNCell([cell9b,cell10b,cell11b,cell12b,cell13b,cell14b])
def PredictionLayer(rnn_outputs,viewPoint = 11, reverse = False):
predLength = viewPoint-2 if reverse else 20-viewPoint #vision is the input for the decoder
vision = tf.concat([rnn_outputs[viewPoint-1:viewPoint,:,:,:],tf.zeros([predLength,BATCH_SIZE,1,1,192])],0)
if reverse:
rnn_outputs2, rnn_states = tf.nn.dynamic_rnn(mcellb, vision, initial_state = initial_state2, time_major=True)
else:
rnn_outputs2, rnn_states = tf.nn.dynamic_rnn(mcella, vision, initial_state = initial_state2, time_major=True)
mean = tf.reduce_mean(rnn_outputs2,4)
if TEST:
return mean
if reverse:
return tf.reduce_sum(tf.square(mean-inputs[viewPoint-2::-1,:,:,:,0]))
else:
return tf.reduce_sum(tf.square(mean-inputs[viewPoint-1:20,:,:,:,0]))
if TEST:
mean = tf.concat([PredictionLayer(rnn_outputs,11,True)[::-1,:,:,:],createPredictionLayer(rnn_outputs,11)],0)
else: #training part of the graph
error = tf.zeros([1])
for i in range(8,15): #range size of 7 or less works, 9 or more does not, no idea why
error += PredictionLayer(rnn_outputs, i)
error += PredictionLayer(rnn_outputs, i, True)
train_fn = tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE).minimize(error)
################################################################################
## TRAINING LOOP ##
################################################################################
#code based on siemanko/tf_lstm.py (Github)
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
saver = tf.train.Saver(restore_sequentially=True, allow_empty=True,)
session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
session.run(tf.global_variables_initializer())
vids = np.load("mnist_test_seq.npy") #20/10000/64/64 , moving mnist dataset from http://www.cs.toronto.edu/~nitish/unsupervised_video/
vids = vids[:,0:6000,:,:] #training set
saver.restore(session,tf.train.latest_checkpoint('./conv_lstm_multiples_v2/'))
#saver.restore(session,'.\conv_lstm_multiples\iteration-74')
for epoch in range(1000):
if TEST:
break
epoch_error = 0
#randomize batches each epoch
vids = np.swapaxes(vids,0,1)
np.random.shuffle(vids)
vids = np.swapaxes(vids,0,1)
for i in range(ITERATIONS_PER_EPOCH):
#running the graph and feeding data
err,_ = session.run([error, train_fn], {inputs: np.expand_dims(vids[:,i*BATCH_SIZE:(i+1)*BATCH_SIZE,:,:],axis=4)})
print(err)
epoch_error += err
#training error each epoch and regular saving
epoch_error /= (ITERATIONS_PER_EPOCH*BATCH_SIZE*4096*20*7)
if (epoch+1) % 5 == 0:
saver.save(session,'.\conv_lstm_multiples_v2\iteration',global_step=epoch)
print("saved")
print("Epoch %d, train error: %f" % (epoch, epoch_error))
#testing
plt.ion()
f, axarr = plt.subplots(2)
vids = np.load("mnist_test_seq.npy")
for i in range(6000,10000):
img = session.run([mean], {inputs: np.expand_dims(vids[:,i:i+1,:,:],axis=4)})
for j in range(20):
axarr[0].imshow(img[0][j,0,:,:])
axarr[1].imshow(vids[j,i,:,:])
plt.show()
plt.pause(0.1)
Usually this happens when gradients' magnitude is very high at some point and causes your network parameters to change a lot. To verify that it is indeed the case, you can produce the same plot of gradient magnitudes and see if they jump right before the loss jump. Assuming this is the case, the classic approach is to use gradient clipping (or go all the way to natural gradient).