Can embedding_rnn_seq2seq function return all states? - tensorflow

I am playing Seq2Seq model use embedding_rnn_seq2seq function ,
i read document say embedding_rnn_seq2seq return outputs and state that in each time-step ,
but i try to get state only can get one step
here is my model
seq_length = 100
batch_size = 128
vocab_size = 12
memory_dim = 100
enc_inp = [tf.placeholder(tf.int32, shape=(None,), name="inp%i" % t) for t in range(seq_length)]
labels = [tf.placeholder(tf.int32, shape=(None,), name="labels%i" % t) for t in range(seq_length)]
dec_inp = ([tf.zeros_like(labels[0], dtype=np.int32, name="GO")] + labels[:-1])
weights = [tf.ones_like(labels_t, dtype=tf.float32) for labels_t in labels]
cell = rnn_cell.GRUCell(memory_dim)
dec_outputs, dec_memory = seq2seq.embedding_rnn_seq2seq(enc_inp,dec_inp,cell,vocab_size,vocab_size,vocab_size)
loss = seq2seq.sequence_loss(dec_outputs, labels, weights, vocab_size)
try to get state (dec_memory)
dec_memory_batch = sess.run(dec_memory , feed_dict)
only return a one-step memory_dim size vector , maybe is last step state
So , do anyone have some advice ?

Related

Expected to see 3 array(s), but instead got the following list of 1 arrays:

I am trying to train a triple loss model using a fit_generator. it requires three input and no output. so i have a function that generates hard triplets. the output from the triplets generator has a shape of (3,5,279) which is 3 inputs(anchor,positive and negative) for 5 batches and a total of 279 features. When i run the fit_generator it throws this error that "the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 3 array(s), but instead got the following list of 1 arrays" meanwhile i have passed a list of three arrays. the code is below. it works when i use the fit, however, i want to always call the generator function to generate my triplets as my batches. thanks in advance..this has taken me three days
def load_data():
path = "arrhythmia_data.txt"
f = open( path, "r")
data = []
#remove line breaker, comma separate and store in array
for line in f:
line = line.replace('\n','').replace('?','0')
line = line.split(",")
data.append(line)
f.close()
data = np.array(data).astype(np.float64)
#print(data.shape)
#create the class labels for input data
Y_train = data[:,-1:]
train = data[:,:-1]
normaliser = preprocessing.MinMaxScaler()
train = normaliser.fit_transform(train)
val = train[320:,:]
train = train[:320,:]
#create one hot encoding of the class labels of the data and separate them into train and test data
lb = LabelBinarizer()
encode = lb.fit_transform(Y_train)
nb_classes = int(len(encode[0]))
#one_hot_labels = keras.utils.to_categorical(labels, num_classes=10) this could also be used for one hot encoding
Y_val_e = encode[320:,:]
Y_train_e = encode[:320,:]
print(Y_train_e[0])
print(np.argmax(Y_train_e[0]))
val_in = []
train_in = []
#grouping and sorting the input data based on label id or name
for n in range(nb_classes):
images_class_n = np.asarray([row for idx,row in enumerate(train) if np.argmax(Y_train_e[idx])==n])
train_in.append(images_class_n)
images_class_n = np.asarray([row for idx,row in enumerate(val) if np.argmax(Y_val_e[idx])==n])
val_in.append(images_class_n)
#print(train_in[0].shape)
return train_in,val_in,Y_train_e,Y_val_e,nb_classes
train_in,val,Y_train,Y_val,nb_classes = load_data()
input_shape = (train_in[0].shape[1],)
def build_network(input_shape , embeddingsize):
'''
Define the neural network to learn image similarity
Input :
input_shape : shape of input images
embeddingsize : vectorsize used to encode our picture
'''
#in_ = Input(train.shape)
net = Sequential()
net.add(Dense(128, activation='relu', input_shape=input_shape))
net.add(Dense(128, activation='relu'))
net.add(Dense(256, activation='relu'))
net.add(Dense(4096, activation='sigmoid'))
net.add(Dense(embeddingsize, activation= None))
#Force the encoding to live on the d-dimentional hypershpere
net.add(Lambda(lambda x: K.l2_normalize(x,axis=-1)))
return net
class TripletLossLayer(Layer):
def __init__(self, alpha, **kwargs):
self.alpha = alpha
super(TripletLossLayer, self).__init__(**kwargs)
def triplet_loss(self, inputs):
anchor, positive, negative = inputs
p_dist = K.sum(K.square(anchor-positive), axis=-1)
n_dist = K.sum(K.square(anchor-negative), axis=-1)
return K.sum(K.maximum(p_dist - n_dist + self.alpha, 0), axis=0)
def call(self, inputs):
loss = self.triplet_loss(inputs)
self.add_loss(loss)
return loss
def build_model(input_shape, network, margin=0.2):
'''
Define the Keras Model for training
Input :
input_shape : shape of input images
network : Neural network to train outputing embeddings
margin : minimal distance between Anchor-Positive and Anchor-Negative for the lossfunction (alpha)
'''
# Define the tensors for the three input images
anchor_input = Input(input_shape, name="anchor_input")
positive_input = Input(input_shape, name="positive_input")
negative_input = Input(input_shape, name="negative_input")
# Generate the encodings (feature vectors) for the three images
encoded_a = network(anchor_input)
encoded_p = network(positive_input)
encoded_n = network(negative_input)
#TripletLoss Layer
loss_layer = TripletLossLayer(alpha=margin,name='triplet_loss_layer')([encoded_a,encoded_p,encoded_n])
# Connect the inputs with the outputs
network_train = Model(inputs=[anchor_input,positive_input,negative_input],outputs=loss_layer)
# return the model
return network_train
def get_batch_random(batch_size,s="train"):
# initialize result
triplets=[np.zeros((batch_size,m)) for i in range(3)]
for i in range(batch_size):
#Pick one random class for anchor
anchor_class = np.random.randint(0, nb_classes)
nb_sample_available_for_class_AP = X[anchor_class].shape[0]
#Pick two different random pics for this class => A and P. You can use same anchor as P if there is one one element for anchor
if nb_sample_available_for_class_AP<=1:
continue
[idx_A,idx_P] = np.random.choice(nb_sample_available_for_class_AP,size=2 ,replace=False)
#Pick another class for N, different from anchor_class
negative_class = (anchor_class + np.random.randint(1,nb_classes)) % nb_classes
nb_sample_available_for_class_N = X[negative_class].shape[0]
#Pick a random pic for this negative class => N
idx_N = np.random.randint(0, nb_sample_available_for_class_N)
triplets[0][i,:] = X[anchor_class][idx_A,:]
triplets[1][i,:] = X[anchor_class][idx_P,:]
triplets[2][i,:] = X[negative_class][idx_N,:]
return np.array(triplets)
def get_batch_hard(draw_batch_size,hard_batchs_size,norm_batchs_size,network,s="train"):
if s == 'train':
X = train_in
else:
X = val
#m, features = X[0].shape
#while True:
#Step 1 : pick a random batch to study
studybatch = get_batch_random(draw_batch_size,X)
#Step 2 : compute the loss with current network : d(A,P)-d(A,N). The alpha parameter here is omited here since we want only to order them
studybatchloss = np.zeros((draw_batch_size))
#Compute embeddings for anchors, positive and negatives
A = network.predict(studybatch[0])
P = network.predict(studybatch[1])
N = network.predict(studybatch[2])
#Compute d(A,P)-d(A,N)
studybatchloss = np.sum(np.square(A-P),axis=1) - np.sum(np.square(A-N),axis=1)
#Sort by distance (high distance first) and take the
selection = np.argsort(studybatchloss)[::-1][:hard_batchs_size]
#Draw other random samples from the batch
selection2 = np.random.choice(np.delete(np.arange(draw_batch_size),selection),norm_batchs_size,replace=False)
selection = np.append(selection,selection2)
triplets = [studybatch[0][selection,:], studybatch[1][selection,:],studybatch[2][selection,:]]
triplets = triplets.reshape(triplets.shape[0],triplets.shape[1],triplets.shape[2])
yield triplets
network = build_network(input_shape,embeddingsize=10)
hard = get_batch_hard(5,4,1,network,s="train")
network_train = build_model(input_shape,network)
optimizer = Adam(lr = 0.00006)
network_train.compile(loss=None,optimizer=optimizer)
#this works
#history = network_train.fit(hard,epochs=100,steps_per_epoch=1, verbose=2)
history = network_train.fit_generator(hard,epochs=10,steps_per_epoch=16, verbose=2)
# error:: the list of Numpy arrays that you are passing to your model is not the size the model
expected. Expected to see 3 array(s), but instead got the following list of 1 arrays:
I think that's beacause in your generator you are yielding the 3 inputs array in one list, you need to yield the 3 arrays independently:
triplet_1 = studybatch[0][selection,:]
triplet_2 = studybatch[1][selection,:]
triplet_3 = studybatch[2][selection,:]
yield [triplet_1, triplet_2, triplet_3]

How to extract cell state from a LSTM at each timestep in Keras?

Is there a way in Keras to retrieve the cell state (i.e., c vector) of a LSTM layer at every timestep of a given input?
It seems the return_state argument returns the last cell state after the computation is done, but I need also the intermediate ones. Also, I don't want to pass these cell states to the next layer, I only want to be able to access them.
Preferably using TensorFlow as backend.
Thanks
I was looking for a solution to this issue and after reading the guidance for creating your own custom RNN Cell in tf.keras (https://www.tensorflow.org/api_docs/python/tf/keras/layers/AbstractRNNCell), I believe the following is the most concise and easy to read way of doing this for Tensorflow 2:
import tensorflow as tf
from tensorflow.keras.layers import LSTMCell
class LSTMCellReturnCellState(LSTMCell):
def call(self, inputs, states, training=None):
real_inputs = inputs[:,:self.units] # decouple [h, c]
outputs, [h,c] = super().call(real_inputs, states, training=training)
return tf.concat([h, c], axis=1), [h,c]
num_units = 512
test_input = tf.random.uniform([5,100,num_units])
rnn = tf.keras.layers.RNN(LSTMCellReturnCellState(num_units),
return_sequences=True, return_state=True)
whole_seq_output, final_memory_state, final_carry_state = rnn(test_input)
print(whole_seq_output.shape)
>>> (5,100,1024)
# Hidden state sequence
h_seq = whole_seq_output[:,:,:num_units] # (5,100,512)
# Cell state sequence
c_seq = whole_seq_output[:,:,num_units:] # (5,100,512)
As mentioned in an above solution, you can see the advantage of this is that it can be easily wrapped into tf.keras.layers.RNN as a drop-in for the normal LSTMCell.
Here is a Colab Notebook with the code running as expected for tensorflow==2.6.0
I know it's pretty late, I hope this can help.
what you are asking, technically, is possible by modifying the LSTM-cell in call method. I modify it and make it return 4 dimension instead of 3 when you give return_sequences=True.
Code
from keras.layers.recurrent import _generate_dropout_mask
class Mod_LSTMCELL(LSTMCell):
def call(self, inputs, states, training=None):
if 0 < self.dropout < 1 and self._dropout_mask is None:
self._dropout_mask = _generate_dropout_mask(
K.ones_like(inputs),
self.dropout,
training=training,
count=4)
if (0 < self.recurrent_dropout < 1 and
self._recurrent_dropout_mask is None):
self._recurrent_dropout_mask = _generate_dropout_mask(
K.ones_like(states[0]),
self.recurrent_dropout,
training=training,
count=4)
# dropout matrices for input units
dp_mask = self._dropout_mask
# dropout matrices for recurrent units
rec_dp_mask = self._recurrent_dropout_mask
h_tm1 = states[0] # previous memory state
c_tm1 = states[1] # previous carry state
if self.implementation == 1:
if 0 < self.dropout < 1.:
inputs_i = inputs * dp_mask[0]
inputs_f = inputs * dp_mask[1]
inputs_c = inputs * dp_mask[2]
inputs_o = inputs * dp_mask[3]
else:
inputs_i = inputs
inputs_f = inputs
inputs_c = inputs
inputs_o = inputs
x_i = K.dot(inputs_i, self.kernel_i)
x_f = K.dot(inputs_f, self.kernel_f)
x_c = K.dot(inputs_c, self.kernel_c)
x_o = K.dot(inputs_o, self.kernel_o)
if self.use_bias:
x_i = K.bias_add(x_i, self.bias_i)
x_f = K.bias_add(x_f, self.bias_f)
x_c = K.bias_add(x_c, self.bias_c)
x_o = K.bias_add(x_o, self.bias_o)
if 0 < self.recurrent_dropout < 1.:
h_tm1_i = h_tm1 * rec_dp_mask[0]
h_tm1_f = h_tm1 * rec_dp_mask[1]
h_tm1_c = h_tm1 * rec_dp_mask[2]
h_tm1_o = h_tm1 * rec_dp_mask[3]
else:
h_tm1_i = h_tm1
h_tm1_f = h_tm1
h_tm1_c = h_tm1
h_tm1_o = h_tm1
i = self.recurrent_activation(x_i + K.dot(h_tm1_i,
self.recurrent_kernel_i))
f = self.recurrent_activation(x_f + K.dot(h_tm1_f,
self.recurrent_kernel_f))
c = f * c_tm1 + i * self.activation(x_c + K.dot(h_tm1_c,
self.recurrent_kernel_c))
o = self.recurrent_activation(x_o + K.dot(h_tm1_o,
self.recurrent_kernel_o))
else:
if 0. < self.dropout < 1.:
inputs *= dp_mask[0]
z = K.dot(inputs, self.kernel)
if 0. < self.recurrent_dropout < 1.:
h_tm1 *= rec_dp_mask[0]
z += K.dot(h_tm1, self.recurrent_kernel)
if self.use_bias:
z = K.bias_add(z, self.bias)
z0 = z[:, :self.units]
z1 = z[:, self.units: 2 * self.units]
z2 = z[:, 2 * self.units: 3 * self.units]
z3 = z[:, 3 * self.units:]
i = self.recurrent_activation(z0)
f = self.recurrent_activation(z1)
c = f * c_tm1 + i * self.activation(z2)
o = self.recurrent_activation(z3)
h = o * self.activation(c)
if 0 < self.dropout + self.recurrent_dropout:
if training is None:
h._uses_learning_phase = True
return tf.expand_dims(tf.concat([h,c],axis=0),0), [h, c]
Sample code
# create a cell
test = Mod_LSTMCELL(100)
# Input timesteps=10, features=7
in1 = Input(shape=(10,7))
out1 = RNN(test, return_sequences=True)(in1)
M = Model(inputs=[in1],outputs=[out1])
M.compile(keras.optimizers.Adam(),loss='mse')
ans = M.predict(np.arange(7*10,dtype=np.float32).reshape(1, 10, 7))
print(ans.shape)
# state_h
print(ans[0,0,0,:])
# state_c
print(ans[0,0,1,:])
First, this is not possible do with the tf.keras.layers.LSTM. You have to use LSTMCell instead or subclass LSTM. Second, there is no need to subclass LSTMCell to get the sequence of cell states. LSTMCell already returns a list of the hidden state (h) and cell state (c) everytime you call it.
For those not familiar with LSTMCell, it takes in the current [h, c] tensors, and the input at the current timestep (it cannot take in a sequence of times) and returns the activations, and the updated [h,c].
Here is an example of showing how to use LSTMCell to process a sequence of timesteps and to return the accumulated cell states.
# example inputs
inputs = tf.convert_to_tensor(np.random.rand(3, 4), dtype='float32') # 3 timesteps, 4 features
h_c = [tf.zeros((1,2)), tf.zeros((1,2))] # must initialize hidden/cell state for lstm cell
h_c = tf.convert_to_tensor(h_c, dtype='float32')
lstm = tf.keras.layers.LSTMCell(2)
# example of how you accumulate cell state over repeated calls to LSTMCell
inputs = tf.unstack(inputs, axis=0)
c_states = []
for cur_inputs in inputs:
out, h_c = lstm(tf.expand_dims(cur_inputs, axis=0), h_c)
h, c = h_c
c_states.append(c)
You can access the states of any RNN by setting return_sequences = True in the initializer. You can find more information about this parameter here.

Tensorflow: Saving/importing checkpoint works without error, but all imported variables have value 'none'

I am training a deep CNN for image augmentation and have run into a very odd issue.
My network architecture is fully convolutional and implements several small "u-shaped" components, wherein feature maps are down/upsampled in order to be processed throughout a "top layer." In the top layer, there are several nodes where the network "guesses" the output image, and then adds the output of the lower layers to the features derived from the guess. The loss function I have penalizes error in the final prediction as well as these guesses.
The network is defined thusly:
def convnet(x, weights, biases):
#TOP LAYER
conv0_1 = conv3dWrap(x, weights['wConv0_1'], biases['bConv0_1'],[1,1,1,1,1])
conv0_2 = conv3dWrap(conv0_1, weights['wConv0_2'], biases['bConv0_2'],[1,1,1,1,1])
#MID LAYER DOWN SAMPLE
conv1_1 = conv3dWrap(conv0_2, weights['wConv1_1'], biases['bConv1_1'],[1,2,2,2,1])
conv1_2 = conv3dWrap(conv1_1, weights['wConv1_2'], biases['bConv1_2'],[1,1,1,1,1])
#BOTTOM LAYER DOWN SAMPLE
conv2_1 = conv3dWrap(conv1_2, weights['wConv2_1'], biases['bConv2_1'],[1,2,2,2,1])
conv2_2 = conv3dWrap(conv2_1, weights['wConv2_2'], biases['bConv2_2'],[1,1,1,1,1])
conv2_3 = conv3dWrap(conv2_2, weights['wConv2_3'], biases['bConv2_3'],[1,1,1,1,1])
convTrans2_1 = conv3dTransWrap(conv2_3,weights['wTConv2_1'], biases['bTConv2_1'], [4,2,32,32,64],[1,2,2,2,1])
#MID LAYER UPSAMPLE
conv1_3 = conv3dWrap(tf.add(convTrans2_1,conv1_2),weights['wConv1_3'], biases['bConv1_3'],[1,1,1,1,1])
conv1_4 = conv3dWrap(conv1_3, weights['wConv1_4'], biases['bConv1_4'],[1,1,1,1,1])
convTrans1_1 = conv3dTransWrap(conv1_4, weights['wTConv1_1'], biases['bTConv1_1'], [4,4,64,64,32],[1,2,2,2,1])
#TOP LAYER AGAIN
conv0_3 = conv3dWrap(tf.add(conv0_2,convTrans1_1), weights['wConv0_3'], biases['bConv0_3'],[1,1,1,1,1])
conv0_4 = conv3dWrap(conv0_3, weights['wConv0_4'], biases['bConv0_4'],[1,1,1,1,1])
recon0_1 = reconWrap(conv0_3, weights['wReconDS0_1'], biases['bReconDS0_1'],[1,1,1,1,1])
print(recon0_1.shape)
catRecon0_1 = tf.add(conv0_4,tf.contrib.keras.backend.repeat_elements(recon0_1,32,4))
conv0_5 = conv3dWrap(catRecon0_1, weights['wConv0_5'], biases['bConv0_5'],[1,1,1,1,1])
#MID LAYER AGAIN
conv1_5 = conv3dWrap(conv0_5, weights['wConv1_5'], biases['bConv1_5'],[1,2,2,2,1])
conv1_6 = conv3dWrap(conv1_5, weights['wConv1_6'], biases['bConv1_6'],[1,1,1,1,1])
#BOTTOM LAYER
conv2_4 = conv3dWrap(conv1_6, weights['wConv2_4'], biases['bConv2_4'],[1,2,2,2,1])
conv2_5 = conv3dWrap(conv2_4, weights['wConv2_5'], biases['bConv2_5'],[1,1,1,1,1])
conv2_6 = conv3dWrap(conv2_5, weights['wConv2_6'], biases['bConv2_6'],[1,1,1,1,1])
convTrans2_2 = conv3dTransWrap(conv2_6,weights['wTConv2_2'], biases['bTConv2_2'], [4,2,32,32,64],[1,2,2,2,1])
#MID LAYER UPSAMPLE
conv1_7 = conv3dWrap(tf.add(convTrans2_2,conv1_6),weights['wConv1_7'], biases['bConv1_7'],[1,1,1,1,1])
conv1_8 = conv3dWrap(conv1_7, weights['wConv1_8'], biases['bConv1_8'],[1,1,1,1,1])
convTrans1_2 = conv3dTransWrap(conv1_8,weights['wTConv1_2'], biases['bTConv1_2'], [4,4,64,64,32],[1,2,2,2,1])
#TOP LAYER
conv0_6 = conv3dWrap(tf.add(conv0_5,convTrans1_2), weights['wConv0_6'], biases['bConv0_6'],[1,1,1,1,1])
recon0_2 = reconWrap(conv0_6, weights['wReconDS0_2'], biases['bReconDS0_2'],[1,1,1,1,1])
catRecon0_2 = tf.add(conv0_6,tf.contrib.keras.backend.repeat_elements(recon0_2,32,4))
conv0_7 = conv3dWrap(catRecon0_2, weights['wConv0_7'], biases['bConv0_7'],[1,1,1,1,1])
#MID LAYER
conv1_9 = conv3dWrap(conv0_7, weights['wConv1_9'], biases['bConv1_9'],[1,2,2,2,1])
conv1_10 = conv3dWrap(conv1_9, weights['wConv1_10'], biases['bConv1_10'],[1,1,1,1,1])
#BOTTOM LAYER
conv2_7 = conv3dWrap(conv1_10, weights['wConv2_7'], biases['bConv2_7'],[1,2,2,2,1])
conv2_8 = conv3dWrap(conv2_7, weights['wConv2_8'], biases['bConv2_8'],[1,1,1,1,1])
conv2_9 = conv3dWrap(conv2_8, weights['wConv2_9'], biases['bConv2_9'],[1,1,1,1,1])
convTrans2_3 = conv3dTransWrap(conv2_9, weights['wTConv2_3'], biases['bTConv2_3'], [4,2,32,32,64],[1,2,2,2,1])
#MID LAYER UPSAMPLE
conv1_11 = conv3dWrap(tf.add(convTrans2_3,conv1_10),weights['wConv1_11'], biases['bConv1_11'],[1,1,1,1,1])
conv1_12 = conv3dWrap(conv1_11, weights['wConv1_12'], biases['bConv1_12'],[1,1,1,1,1])
convTrans1_3 = conv3dTransWrap(conv1_12,weights['wTConv1_3'], biases['bTConv1_3'], [4,4,64,64,32],[1,2,2,2,1])
#TOP LAYER
conv0_8 = conv3dWrap(tf.add(conv0_7,convTrans1_3), weights['wConv0_8'], biases['bConv0_8'],[1,1,1,1,1])
recon0_3 = reconWrap(conv0_8, weights['wReconDS0_3'], biases['bReconDS0_3'],[1,1,1,1,1])
catRecon0_3 = tf.add(conv0_8,tf.contrib.keras.backend.repeat_elements(recon0_3,32,4))
conv0_9 = conv3dWrap(catRecon0_3, weights['wConv0_9'], biases['bConv0_9'],[1,1,1,1,1])
print(recon0_3.shape)
#MID LAYER
conv1_13 = conv3dWrap(conv0_9, weights['wConv1_13'], biases['bConv1_13'],[1,2,2,2,1])
conv1_14 = conv3dWrap(conv1_13, weights['wConv1_14'], biases['bConv1_14'],[1,1,1,1,1])
#BOTTOM LAYER
conv2_10 = conv3dWrap(conv1_14, weights['wConv2_10'], biases['bConv2_10'],[1,2,2,2,1])
conv2_11 = conv3dWrap(conv2_10, weights['wConv2_11'], biases['bConv2_11'],[1,1,1,1,1])
conv2_12 = conv3dWrap(conv2_11, weights['wConv2_12'], biases['bConv2_12'],[1,1,1,1,1])
convTrans2_4 = conv3dTransWrap(conv2_12, weights['wTConv2_4'], biases['bTConv2_4'], [4,2,32,32,64],[1,2,2,2,1])
#MID LAYER UPSAMPLE
conv1_15 = conv3dWrap(tf.add(convTrans2_4,conv1_14),weights['wConv1_15'], biases['bConv1_15'],[1,1,1,1,1])
conv1_16 = conv3dWrap(conv1_15, weights['wConv1_16'], biases['bConv1_16'],[1,1,1,1,1])
convTrans1_4 = conv3dTransWrap(conv1_16,weights['wTConv1_4'], biases['bTConv1_4'], [4,4,64,64,32],[1,2,2,2,1])
#TOP LAYER
conv0_10 = conv3dWrap(tf.add(conv0_9,convTrans1_4), weights['wConv0_10'], biases['bConv0_10'],[1,1,1,1,1])
#OUTPUT
convOUT = reconWrap(conv0_10, weights['wConvOUT'], biases['bConvOUT'],[1,1,1,1,1])
print(convOUT.shape)
return recon0_1, recon0_2, recon0_3, convOUT
Where all of the "wrappers" are as follows:
def conv3dWrap(x, W, b, strides):
x = tf.nn.conv3d(x, W, strides, padding='SAME')
x = tf.nn.bias_add(x, b)
return tf.nn.relu(x)
def reconWrap(x, W, b, strides):
x = tf.nn.conv3d(x, W, strides, padding='SAME')
x = tf.nn.bias_add(x, b)
return x
def conv3dTransWrap(x, W, b, shape, strides):
x = tf.nn.conv3d_transpose(x, W, shape, strides, padding='SAME')
x = tf.nn.bias_add(x,b)
return tf.nn.relu(x)
My weights and biases are stored in dictionaries that are defined before starting the training:
weights={
#TOP LAYER
'wConv0_1': tf.Variable(tf.random_normal([4, 3, 3, 1, 5]), name='wC0_1'),
'wConv0_2': tf.Variable(tf.random_normal([4, 3, 3, 5, 32]), name='wC0_2'),
'wConv0_3': tf.Variable(tf.random_normal([4, 3, 3, 32, 32]), name='wC0_3'),
'wConv0_4': tf.Variable(tf.random_normal([4, 3, 3, 32, 32]), name='wC0_4'),
'wReconDS0_1': tf.Variable(tf.random_normal([1, 1, 1, 32, 1]) , name='wR0_1') ...... #THIS CONTINUES FOR QUITE AWHILE
Then, I begin the training like this:
def train_cnn(x):
epochLosses=[]
print('Beginning Training!')
print(NUM_EPOCHS)
r1,r2,r3,pred = convNet(x, weights, biases)
cost = (tf.losses.mean_squared_error(y,pred)
+ 0.25* ((tf.losses.mean_squared_error(y,r1))
+ (tf.losses.mean_squared_error(y,r2))
+ (tf.losses.mean_squared_error(y,r3))))
regularizer= 0.01*tf.nn.l2_loss((weights['wConv0_1'])+
0.01*tf.nn.l2_loss(weights['wConv0_2'])+
0.01*tf.nn.l2_loss(weights['wConv0_3'])+
0.01*tf.nn.l2_loss(weights['wConv0_4'])+
0.01*tf.nn.l2_loss(weights['wReconDS0_1'])+
0.01*tf.nn.l2_loss(weights['wConv0_5'])+
0.01*tf.nn.l2_loss(weights['wConv0_6'])+
0.01*tf.nn.l2_loss(weights['wReconDS0_2'])+
0.01*tf.nn.l2_loss(weights['wReconDS0_3'])+
0.01*tf.nn.l2_loss(weights['wConv0_7'])+
0.01*tf.nn.l2_loss(weights['wConv0_8'])+
0.01*tf.nn.l2_loss(weights['wConv0_9'])+
0.01*tf.nn.l2_loss(weights['wConv0_10'])+
0.01*tf.nn.l2_loss(weights['wConvOUT'])+
0.01*tf.nn.l2_loss(weights['wConv1_1'])+
0.01*tf.nn.l2_loss(weights['wConv1_2'])+
0.01*tf.nn.l2_loss(weights['wConv1_3'])+
0.01*tf.nn.l2_loss(weights['wConv1_4'])+
0.01*tf.nn.l2_loss(weights['wConv1_5'])+
0.01*tf.nn.l2_loss(weights['wConv1_6'])+
0.01*tf.nn.l2_loss(weights['wConv1_7'])+
0.01*tf.nn.l2_loss(weights['wConv1_8'])+
0.01*tf.nn.l2_loss(weights['wConv1_9'])+
0.01*tf.nn.l2_loss(weights['wConv1_10'])+
0.01*tf.nn.l2_loss(weights['wConv1_11'])+
0.01*tf.nn.l2_loss(weights['wConv1_12'])+
0.01*tf.nn.l2_loss(weights['wConv1_13'])+
0.01*tf.nn.l2_loss(weights['wConv1_14'])+
0.01*tf.nn.l2_loss(weights['wConv1_15'])+
0.01*tf.nn.l2_loss(weights['wConv1_16'])+
0.01*tf.nn.l2_loss(weights['wTConv1_1'])+
0.01*tf.nn.l2_loss(weights['wTConv1_2'])+
0.01*tf.nn.l2_loss(weights['wTConv1_3'])+
0.01*tf.nn.l2_loss(weights['wTConv1_4'])+
0.01*tf.nn.l2_loss(weights['wConv2_1'])+
0.01*tf.nn.l2_loss(weights['wConv2_2'])+
0.01*tf.nn.l2_loss(weights['wConv2_3'])+
0.01*tf.nn.l2_loss(weights['wConv2_4'])+
0.01*tf.nn.l2_loss(weights['wConv2_5'])+
0.01*tf.nn.l2_loss(weights['wConv2_6'])+
0.01*tf.nn.l2_loss(weights['wConv2_7'])+
0.01*tf.nn.l2_loss(weights['wConv2_8'])+
0.01*tf.nn.l2_loss(weights['wConv2_9'])+
0.01*tf.nn.l2_loss(weights['wConv2_10'])+
0.01*tf.nn.l2_loss(weights['wConv2_11'])+
0.01*tf.nn.l2_loss(weights['wConv2_12'])+
0.01*tf.nn.l2_loss(weights['wTConv2_1'])+
0.01*tf.nn.l2_loss(weights['wTConv2_2'])+
0.01*tf.nn.l2_loss(weights['wTConv2_3'])+
0.01*tf.nn.l2_loss(weights['wTConv2_4']))
cost=cost+regularizer
optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(cost)
saver = tf.train.Saver()
sess = tf.Session()
sess.run(tf.global_variables_initializer())
valLosses=[]
epochLosses=[]
print('Beginning Session!')
writer = tf.summary.FileWriter ( './GRAPH' , sess.graph)
sess.run(tf.global_variables_initializer())
Finally, I go ahead and do some stuff for loading in the batches and, once they're ready, I do the following (for each pass, I won't do the saving every pass once I have the weight importing working):
_, c = sess.run([optimizer, cost], feed_dict = {x: inBatch,y: gsBatch})
epoch_loss += c
save_path = saver.save(sess, "./CHKPT/model.cpkt")
So when I go ahead and import this model
sess = tf.Session()
x = tf.placeholder(dtype=tf.float32)
new_saver = tf.train.import_meta_graph('./CHKPT/model.cpkt.meta')
sess.run(tf.global_variables_initializer())
a,b,c,pred = convNet(x, weights, biases)
I am met with the following error:
ValueError: Tried to convert 'filter' to a tensor and failed. Error: None values not supported.
When I look at the imported weights and biases, each of them have value 'None'. Not only is this odd, but the network 'runs' incredibly quickly during training, far far more quickly than I'd expect. I am worried that no legitimate computations are occurring.
This must not be the case, but, I am almost positive I am following the saving/loading process I've used for many other networks verbatim. Can anyone shed some light on what might be happening here?
Edit: I'm also very new to TF, and it's likely there are non-idealities in my code. If you see anything outside of the saving/importing that isn't kosher please let me know.
Running sess.run(tf.global_variables_initializer()) will reinitialize every tensor and delete their loaded values. Skip calling tf.global_variables_initializer() when you load a model. The initialization is done by the saver.
You are also missing the restore call (import_meta_graph() only loads the saver object).
new_saver = tf.train.import_meta_graph('./CHKPT/model.cpkt.meta')
new_saver.restore(sess, './CHKPT/model.cpkt')
Thereafter when you run:
a,b,c,pred = convNet(x, weights, biases)
you create an all new network and never use the loaded one.
Instead you have to find the tensors you need inside tf.global_variables() after restoring the model. For example by searching for them by name.

Implementing attention with beam search in tensorflow

I have written my own code with reference to this wonderful tutorial and I am not able to get results when using attention with beam search as per my understanding in the class AttentionModel the _build_decoder_cell function creates separate decoder cell and attention wrapper for inference mode , assuming this ( which i think is incorrect and cant find a way around it ) ,
with tf.name_scope("Decoder"):
mem_units = 2*dim
dec_cell = tf.contrib.rnn.BasicLSTMCell( 2*dim )
beam_cel = tf.contrib.rnn.BasicLSTMCell( 2*dim )
beam_width = 3
out_layer = Dense( output_vocab_size )
with tf.name_scope("Training"):
attn_mech = tf.contrib.seq2seq.BahdanauAttention( num_units = mem_units, memory = enc_rnn_out, normalize=True)
attn_cell = tf.contrib.seq2seq.AttentionWrapper( cell = dec_cell,attention_mechanism = attn_mech )
batch_size = tf.shape(enc_rnn_out)[0]
initial_state = attn_cell.zero_state( batch_size = batch_size , dtype=tf.float32 )
initial_state = initial_state.clone(cell_state = enc_rnn_state)
helper = tf.contrib.seq2seq.TrainingHelper( inputs = emb_x_y , sequence_length = seq_len )
decoder = tf.contrib.seq2seq.BasicDecoder( cell = attn_cell, helper = helper, initial_state = initial_state ,output_layer=out_layer )
outputs, final_state, final_sequence_lengths= tf.contrib.seq2seq.dynamic_decode(decoder=decoder,impute_finished=True)
training_logits = tf.identity(outputs.rnn_output )
training_pred = tf.identity(outputs.sample_id )
with tf.name_scope("Inference"):
enc_rnn_out_beam = tf.contrib.seq2seq.tile_batch( enc_rnn_out , beam_width )
seq_len_beam = tf.contrib.seq2seq.tile_batch( seq_len , beam_width )
enc_rnn_state_beam = tf.contrib.seq2seq.tile_batch( enc_rnn_state , beam_width )
batch_size_beam = tf.shape(enc_rnn_out_beam)[0] # now batch size is beam_width times
# start tokens mean be the original batch size so divide
start_tokens = tf.tile(tf.constant([27], dtype=tf.int32), [ batch_size_beam//beam_width ] )
end_token = 0
attn_mech_beam = tf.contrib.seq2seq.BahdanauAttention( num_units = mem_units, memory = enc_rnn_out_beam, normalize=True)
cell_beam = tf.contrib.seq2seq.AttentionWrapper(cell=beam_cel,attention_mechanism=attn_mech_beam,attention_layer_size=mem_units)
initial_state_beam = cell_beam.zero_state(batch_size=batch_size_beam,dtype=tf.float32).clone(cell_state=enc_rnn_state_beam)
my_decoder = tf.contrib.seq2seq.BeamSearchDecoder( cell = cell_beam,
embedding = emb_out,
start_tokens = start_tokens,
end_token = end_token,
initial_state = initial_state_beam,
beam_width = beam_width
,output_layer=out_layer)
beam_output, t1 , t2 = tf.contrib.seq2seq.dynamic_decode( my_decoder,
maximum_iterations=maxlen )
beam_logits = tf.no_op()
beam_sample_id = beam_output.predicted_ids
when i call beam _sample_id after training i am not getting correct result.
my guess is that we are supposed to using the same attention wrapper but that is not possible since we have to tile_sequence for beam search to be used.
Any insights / suggestion would be much appreciated.
i have also created an issue for this in their main repository Issue-93
I'm not sure what do you mean by "I am not able to get results" but I'm assuming that your model is not making use of the wieghts learnt while training .
if this is the case , then first of all you need to know that its all about variable sharing , the first thing you need to do is that you get rid of the different variable scopes between the training and inferring and instead you need to use some thing like
remove the
with tf.name_scope("Training"):
and use :
with tf.variable_scope("myScope"):
and then remove the
with tf.name_scope("Inference"):
and use instead
with tf.variable_scope("myScope" , reuse=True):
also at the beginning of your and after with tf.variable_scope("myScope" )
enc_rnn_out = tf.contrib.seq2seq.tile_batch( enc_rnn_out , 1 )
seq_len = tf.contrib.seq2seq.tile_batch( seq_len , 1 )
enc_rnn_state = tf.contrib.seq2seq.tile_batch( enc_rnn_state , 1 )
this will insure that your inference variables and training variables have the same signature and are shared ,
I have tested this when I was following the same tutorial that you have mentioned , my model is still training as I'm writing this post but I can see that the accuracy increasing as we speak , which indicates that the solution should work for you as well .
thank you
You can use tf.cond() to create different pathes between training and inference stage:
def get_tile_batch(enc_output, source_sequence_length, enc_state, useBeamSearch):
enc_output = tf.contrib.seq2seq.tile_batch(enc_output, multiplier=useBeamSearch)
source_sequence_length = tf.contrib.seq2seq.tile_batch(source_sequence_length, multiplier=useBeamSearch)
enc_state = tf.contrib.seq2seq.tile_batch(enc_state, multiplier=useBeamSearch)
return enc_output, source_sequence_length, enc_state
## for beam search: at training stage, use tile_batch multiplier = 1,
## at infer stage, use tile_batch multiplier = useBeamSearch
## tile_batch is just duplicate every sample in a batch,
## so it'll change batch_size to batch_size * useBeamSearch at runtime once batch_size was determined
enc_output, source_sequence_length, enc_state = tf.cond(
self.on_infer, # is inference stage?
lambda: get_tile_batch(enc_output, source_sequence_length, enc_state, useBeamSearch=useBeamSearch),
lambda: get_tile_batch(enc_output, source_sequence_length, enc_state, useBeamSearch=1)
)
# attention mechanism
attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units=rnn_size, memory=enc_output, memory_sequence_length=source_sequence_length)
dec_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, attention_mechanism)
## for beam search: change batch_size to batch_size * useBeamSearch at infer stage
decoder_initial_state = tf.cond(
self.on_infer, # is inference stage?
lambda: dec_cell.zero_state(batch_size=batch_size * useBeamSearch, dtype=tf.float32),
lambda: dec_cell.zero_state(batch_size=batch_size * 1, dtype=tf.float32)
)
enc_state = decoder_initial_state.clone(cell_state=enc_state)

Tensorflow >r1.0 tf.layers.batch_normalization very bad test performance

I'm trying to use the tf.layers.batch_normalization function provided in the latest Tensorflow API to implement a recurrent batch normalized LSTM.
The implementation is as below (I modified the TF source code):
class BNLSTMCell(tf.nn.rnn_cell.RNNCell):
"""
Batch Normalized Long short-term memory unit (LSTM) recurrent network cell.
cf. Recurrent Batch Normalization
https://arxiv.org/abs/1603.09025
cf. A Gentle Guide to Using Batch Normalization in TensorFlow
http://ruishu.io/2016/12/27/batchnorm/
"""
def __init__(self, num_units, forward_only, gamma_c=1.0, gamma_h=1.0,
gamma_x=1.0, beta_c=0.0, beta_h=0.0, beta_x=0.0,
input_size=None, use_peepholes=False, cell_clip=None,
initializer=None, num_proj=None,
num_unit_shards=1, num_proj_shards=1,
forget_bias=1.0, state_is_tuple=False,
activation=tf.tanh):
"""Initialize the parameters for an LSTM cell.
Args:
num_units: int, The number of units in the LSTM cell
forward_only:
If False (training):
1. Normalize layer activations according to mini-batch statistics.
2. During the training step, update population statistics
approximation via moving average of mini-batch statistics.
If True (testing):
1. Normalize layer activations according to estimated population
statistics.
2. No update of population statistics according to mini-batch
statistcs from test data.
gamma_c: Scale of cell state normalization
beta_c: Offset of cell state normalization
gamma_h: Scale of hidden state normalization
beta_h: Offset of hidden state normalization
(set to 0 to avoid redundancy)
gamma_x: Scale of input normalization
beta_x: Offset of input normalization
(set to 0 to avoid redundancy)
input_size: Deprecated and unused.
use_peepholes: bool, Set True to enable diagonal/peephole connections.
cell_clip: (optional) A float value, if provided the cell state is clipped
by this value prior to the cell output activation.
initializer: (optional) The initializer to use for the weight and
projection matrices.
num_proj: (optional) int, The output dimensionality for the projection
matrices. If None, no projection is performed.
num_unit_shards: How to split the weight matrix. If >1, the weight
matrix is stored across num_unit_shards.
num_proj_shards: How to split the projection matrix. If >1, the
projection matrix is stored across num_proj_shards.
forget_bias: Biases of the forget gate are initialized by default to 1
in order to reduce the scale of forgetting at the beginning of
the training.
state_is_tuple: If True, accepted and returned states are 2-tuples of
the `c_state` and `m_state`. By default (False), they are concatenated
along the column axis. This default behavior will soon be deprecated.
activation: Activation function of the inner states.
"""
if not state_is_tuple:
logging.warn(
"%s: Using a concatenated state is slower and will soon be "
"deprecated. Use state_is_tuple=True." % self)
if input_size is not None:
logging.warn("%s: The input_size parameter is deprecated." % self)
self._num_units = num_units
self.forward_only = forward_only
self._gamma_c = gamma_c
self._beta_c = beta_c
self._gamma_h = gamma_h
self._beta_h = beta_h
self._gamma_x = gamma_x
self._beta_x = beta_x
self._use_peepholes = use_peepholes
self._cell_clip = cell_clip
self._initializer = initializer
self._num_proj = num_proj
self._num_unit_shards = num_unit_shards
self._num_proj_shards = num_proj_shards
self._forget_bias = forget_bias
self._state_is_tuple = state_is_tuple
self._activation = activation
if num_proj:
self._state_size = (
tf.nn.rnn_cell.LSTMStateTuple(num_units, num_proj)
if state_is_tuple else num_units + num_proj)
self._output_size = num_proj
else:
self._state_size = (
tf.nn.rnn_cell.LSTMStateTuple(num_units, num_units)
if state_is_tuple else 2 * num_units)
self._output_size = num_units
#property
def state_size(self):
return self._state_size
#property
def output_size(self):
return self._output_size
def __call__(self, inputs, state, scope=None):
"""Run one step of LSTM.
Args:
inputs: input Tensor, 2D, batch x num_units.
state: if `state_is_tuple` is False, this must be a state Tensor,
`2-D, batch x state_size`. If `state_is_tuple` is True, this must be a
tuple of state Tensors, both `2-D`, with column sizes `c_state` and
`m_state`.
scope: VariableScope for the created subgraph; defaults to "LSTMCell".
Returns:
A tuple containing:
- A `2-D, [batch x output_dim]`, Tensor representing the output of the
LSTM after reading `inputs` when previous state was `state`.
Here output_dim is:
num_proj if num_proj was set,
num_units otherwise.
- Tensor(s) representing the new state of LSTM after reading `inputs` when
the previous state was `state`. Same type and shape(s) as `state`.
Raises:
ValueError: If input size cannot be inferred from inputs via
static shape inference.
"""
num_proj = self._num_units if self._num_proj is None else self._num_proj
if self._state_is_tuple:
(c_prev, m_prev) = state
else:
c_prev = tf.slice(state, [0, 0], [-1, self._num_units])
m_prev = tf.slice(state, [0, self._num_units], [-1, num_proj])
dtype = inputs.dtype
input_size = inputs.get_shape().with_rank(2)[1]
if input_size.value is None:
raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
with tf.variable_scope(scope or type(self).__name__,
initializer=self._initializer): # "LSTMCell"
w_h = tf.get_variable("W_h", [num_proj, 4 * self._num_units],
dtype=tf.float32)
w_x = tf.get_variable("W_x", [input_size.value, 4 * self._num_units],
dtype=tf.float32)
b = tf.get_variable(
"B", shape=[4 * self._num_units],
initializer=tf.zeros_initializer, dtype=dtype)
# i = input_gate, j = new_input, f = forget_gate, o = output_gate
hidden_matrix = tf.matmul(m_prev, w_h)
bn_hidden_matrix = tf.layers.batch_normalization(hidden_matrix,
momentum=0.5,
beta_initializer=tf.constant_initializer(self._beta_h),
gamma_initializer=tf.constant_initializer(self._gamma_h),
training=(not self.forward_only),
name='bn_hidden_matrix', reuse=None)
# print(tf.get_collection(tf.GraphKeys.VARIABLES, scope=scope))
input_matrix = tf.matmul(inputs, w_x)
bn_input_matrix = tf.layers.batch_normalization(input_matrix,
momentum=0.5,
beta_initializer=tf.constant_initializer(self._beta_x),
gamma_initializer=tf.constant_initializer(self._gamma_x),
training=(not self.forward_only),
name='bn_input_matrix', reuse=None)
lstm_matrix = tf.nn.bias_add(
tf.add(bn_input_matrix, bn_hidden_matrix), b)
i, j, f, o = tf.split(lstm_matrix, num_or_size_splits=4, axis=1)
# Diagonal connections
if self._use_peepholes:
w_f_diag = tf.get_variable(
"W_F_diag", shape=[self._num_units], dtype=dtype)
w_i_diag = tf.get_variable(
"W_I_diag", shape=[self._num_units], dtype=dtype)
w_o_diag = tf.get_variable(
"W_O_diag", shape=[self._num_units], dtype=dtype)
if self._use_peepholes:
c = (tf.sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
tf.sigmoid(i + w_i_diag * c_prev) * self._activation(j))
else:
c = (tf.sigmoid(f + self._forget_bias) * c_prev + tf.sigmoid(i) *
self._activation(j))
if self._cell_clip is not None:
# pylint: disable=invalid-unary-operand-type
c = tf.clip_by_value(c, -self._cell_clip, self._cell_clip)
# pylint: enable=invalid-unary-operand-type
bn_c = tf.layers.batch_normalization(c,
momentum=0.5,
beta_initializer=tf.constant_initializer(self._beta_c),
gamma_initializer=tf.constant_initializer(self._gamma_c),
training=(not self.forward_only),
name='bn_cell', reuse=None)
if self._use_peepholes:
m = tf.sigmoid(o + w_o_diag * bn_c) * self._activation(bn_c)
else:
m = tf.sigmoid(o) * self._activation(bn_c)
if self._num_proj is not None:
concat_w_proj = tf.nn.rnn_cell._get_concat_variable(
"W_P", [self._num_units, self._num_proj],
dtype, self._num_proj_shards)
m = tf.matmul(m, concat_w_proj)
new_state = (tf.nn.rnn_cell.LSTMStateTuple(c, m) if self._state_is_tuple
else tf.concat(1, [c, m]))
return m, new_state
I built a sequence to sequence model and run the extra updates during training as specified in other posts.
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
if extra_update_ops and not forward_only:
outputs, extra_updates = session.run([output_feed, extra_update_ops], input_feed)
else:
outputs = session.run(output_feed, input_feed)
The training loss looks very reasonable.
However, my test output is garbage. I wonder if anyone has had similar experience and knows how to resolve it.