Tensorflow: Saving/importing checkpoint works without error, but all imported variables have value 'None'
I am training a deep CNN for image augmentation and have run into a very odd issue.
My network architecture is fully convolutional and implements several small "u-shaped" components, wherein feature maps are down/upsampled in order to be processed throughout a "top layer." In the top layer, there are several nodes where the network "guesses" the output image, and then adds the output of the lower layers to the features derived from the guess. The loss function I have penalizes error in the final prediction as well as these guesses.
The network is defined as follows:
def convNet(x, weights, biases):
    #TOP LAYER
    conv0_1 = conv3dWrap(x, weights['wConv0_1'], biases['bConv0_1'], [1,1,1,1,1])
    conv0_2 = conv3dWrap(conv0_1, weights['wConv0_2'], biases['bConv0_2'], [1,1,1,1,1])
    #MID LAYER DOWN SAMPLE
    conv1_1 = conv3dWrap(conv0_2, weights['wConv1_1'], biases['bConv1_1'], [1,2,2,2,1])
    conv1_2 = conv3dWrap(conv1_1, weights['wConv1_2'], biases['bConv1_2'], [1,1,1,1,1])
    #BOTTOM LAYER DOWN SAMPLE
    conv2_1 = conv3dWrap(conv1_2, weights['wConv2_1'], biases['bConv2_1'], [1,2,2,2,1])
    conv2_2 = conv3dWrap(conv2_1, weights['wConv2_2'], biases['bConv2_2'], [1,1,1,1,1])
    conv2_3 = conv3dWrap(conv2_2, weights['wConv2_3'], biases['bConv2_3'], [1,1,1,1,1])
    convTrans2_1 = conv3dTransWrap(conv2_3, weights['wTConv2_1'], biases['bTConv2_1'], [4,2,32,32,64], [1,2,2,2,1])
    #MID LAYER UPSAMPLE
    conv1_3 = conv3dWrap(tf.add(convTrans2_1, conv1_2), weights['wConv1_3'], biases['bConv1_3'], [1,1,1,1,1])
    conv1_4 = conv3dWrap(conv1_3, weights['wConv1_4'], biases['bConv1_4'], [1,1,1,1,1])
    convTrans1_1 = conv3dTransWrap(conv1_4, weights['wTConv1_1'], biases['bTConv1_1'], [4,4,64,64,32], [1,2,2,2,1])
    #TOP LAYER AGAIN
    conv0_3 = conv3dWrap(tf.add(conv0_2, convTrans1_1), weights['wConv0_3'], biases['bConv0_3'], [1,1,1,1,1])
    conv0_4 = conv3dWrap(conv0_3, weights['wConv0_4'], biases['bConv0_4'], [1,1,1,1,1])
    recon0_1 = reconWrap(conv0_3, weights['wReconDS0_1'], biases['bReconDS0_1'], [1,1,1,1,1])
    print(recon0_1.shape)
    catRecon0_1 = tf.add(conv0_4, tf.contrib.keras.backend.repeat_elements(recon0_1, 32, 4))
    conv0_5 = conv3dWrap(catRecon0_1, weights['wConv0_5'], biases['bConv0_5'], [1,1,1,1,1])
    #MID LAYER AGAIN
    conv1_5 = conv3dWrap(conv0_5, weights['wConv1_5'], biases['bConv1_5'], [1,2,2,2,1])
    conv1_6 = conv3dWrap(conv1_5, weights['wConv1_6'], biases['bConv1_6'], [1,1,1,1,1])
    #BOTTOM LAYER
    conv2_4 = conv3dWrap(conv1_6, weights['wConv2_4'], biases['bConv2_4'], [1,2,2,2,1])
    conv2_5 = conv3dWrap(conv2_4, weights['wConv2_5'], biases['bConv2_5'], [1,1,1,1,1])
    conv2_6 = conv3dWrap(conv2_5, weights['wConv2_6'], biases['bConv2_6'], [1,1,1,1,1])
    convTrans2_2 = conv3dTransWrap(conv2_6, weights['wTConv2_2'], biases['bTConv2_2'], [4,2,32,32,64], [1,2,2,2,1])
    #MID LAYER UPSAMPLE
    conv1_7 = conv3dWrap(tf.add(convTrans2_2, conv1_6), weights['wConv1_7'], biases['bConv1_7'], [1,1,1,1,1])
    conv1_8 = conv3dWrap(conv1_7, weights['wConv1_8'], biases['bConv1_8'], [1,1,1,1,1])
    convTrans1_2 = conv3dTransWrap(conv1_8, weights['wTConv1_2'], biases['bTConv1_2'], [4,4,64,64,32], [1,2,2,2,1])
    #TOP LAYER
    conv0_6 = conv3dWrap(tf.add(conv0_5, convTrans1_2), weights['wConv0_6'], biases['bConv0_6'], [1,1,1,1,1])
    recon0_2 = reconWrap(conv0_6, weights['wReconDS0_2'], biases['bReconDS0_2'], [1,1,1,1,1])
    catRecon0_2 = tf.add(conv0_6, tf.contrib.keras.backend.repeat_elements(recon0_2, 32, 4))
    conv0_7 = conv3dWrap(catRecon0_2, weights['wConv0_7'], biases['bConv0_7'], [1,1,1,1,1])
    #MID LAYER
    conv1_9 = conv3dWrap(conv0_7, weights['wConv1_9'], biases['bConv1_9'], [1,2,2,2,1])
    conv1_10 = conv3dWrap(conv1_9, weights['wConv1_10'], biases['bConv1_10'], [1,1,1,1,1])
    #BOTTOM LAYER
    conv2_7 = conv3dWrap(conv1_10, weights['wConv2_7'], biases['bConv2_7'], [1,2,2,2,1])
    conv2_8 = conv3dWrap(conv2_7, weights['wConv2_8'], biases['bConv2_8'], [1,1,1,1,1])
    conv2_9 = conv3dWrap(conv2_8, weights['wConv2_9'], biases['bConv2_9'], [1,1,1,1,1])
    convTrans2_3 = conv3dTransWrap(conv2_9, weights['wTConv2_3'], biases['bTConv2_3'], [4,2,32,32,64], [1,2,2,2,1])
    #MID LAYER UPSAMPLE
    conv1_11 = conv3dWrap(tf.add(convTrans2_3, conv1_10), weights['wConv1_11'], biases['bConv1_11'], [1,1,1,1,1])
    conv1_12 = conv3dWrap(conv1_11, weights['wConv1_12'], biases['bConv1_12'], [1,1,1,1,1])
    convTrans1_3 = conv3dTransWrap(conv1_12, weights['wTConv1_3'], biases['bTConv1_3'], [4,4,64,64,32], [1,2,2,2,1])
    #TOP LAYER
    conv0_8 = conv3dWrap(tf.add(conv0_7, convTrans1_3), weights['wConv0_8'], biases['bConv0_8'], [1,1,1,1,1])
    recon0_3 = reconWrap(conv0_8, weights['wReconDS0_3'], biases['bReconDS0_3'], [1,1,1,1,1])
    catRecon0_3 = tf.add(conv0_8, tf.contrib.keras.backend.repeat_elements(recon0_3, 32, 4))
    conv0_9 = conv3dWrap(catRecon0_3, weights['wConv0_9'], biases['bConv0_9'], [1,1,1,1,1])
    print(recon0_3.shape)
    #MID LAYER
    conv1_13 = conv3dWrap(conv0_9, weights['wConv1_13'], biases['bConv1_13'], [1,2,2,2,1])
    conv1_14 = conv3dWrap(conv1_13, weights['wConv1_14'], biases['bConv1_14'], [1,1,1,1,1])
    #BOTTOM LAYER
    conv2_10 = conv3dWrap(conv1_14, weights['wConv2_10'], biases['bConv2_10'], [1,2,2,2,1])
    conv2_11 = conv3dWrap(conv2_10, weights['wConv2_11'], biases['bConv2_11'], [1,1,1,1,1])
    conv2_12 = conv3dWrap(conv2_11, weights['wConv2_12'], biases['bConv2_12'], [1,1,1,1,1])
    convTrans2_4 = conv3dTransWrap(conv2_12, weights['wTConv2_4'], biases['bTConv2_4'], [4,2,32,32,64], [1,2,2,2,1])
    #MID LAYER UPSAMPLE
    conv1_15 = conv3dWrap(tf.add(convTrans2_4, conv1_14), weights['wConv1_15'], biases['bConv1_15'], [1,1,1,1,1])
    conv1_16 = conv3dWrap(conv1_15, weights['wConv1_16'], biases['bConv1_16'], [1,1,1,1,1])
    convTrans1_4 = conv3dTransWrap(conv1_16, weights['wTConv1_4'], biases['bTConv1_4'], [4,4,64,64,32], [1,2,2,2,1])
    #TOP LAYER
    conv0_10 = conv3dWrap(tf.add(conv0_9, convTrans1_4), weights['wConv0_10'], biases['bConv0_10'], [1,1,1,1,1])
    #OUTPUT
    convOUT = reconWrap(conv0_10, weights['wConvOUT'], biases['bConvOUT'], [1,1,1,1,1])
    print(convOUT.shape)
    return recon0_1, recon0_2, recon0_3, convOUT
Where all of the "wrappers" are as follows:
def conv3dWrap(x, W, b, strides):
    x = tf.nn.conv3d(x, W, strides, padding='SAME')
    x = tf.nn.bias_add(x, b)
    return tf.nn.relu(x)

def reconWrap(x, W, b, strides):
    x = tf.nn.conv3d(x, W, strides, padding='SAME')
    x = tf.nn.bias_add(x, b)
    return x

def conv3dTransWrap(x, W, b, shape, strides):
    x = tf.nn.conv3d_transpose(x, W, shape, strides, padding='SAME')
    x = tf.nn.bias_add(x, b)
    return tf.nn.relu(x)
My weights and biases are stored in dictionaries that are defined before starting the training:
weights = {
    #TOP LAYER
    'wConv0_1': tf.Variable(tf.random_normal([4, 3, 3, 1, 5]), name='wC0_1'),
    'wConv0_2': tf.Variable(tf.random_normal([4, 3, 3, 5, 32]), name='wC0_2'),
    'wConv0_3': tf.Variable(tf.random_normal([4, 3, 3, 32, 32]), name='wC0_3'),
    'wConv0_4': tf.Variable(tf.random_normal([4, 3, 3, 32, 32]), name='wC0_4'),
    'wReconDS0_1': tf.Variable(tf.random_normal([1, 1, 1, 32, 1]), name='wR0_1'),
    ...  #THIS CONTINUES FOR QUITE AWHILE
Then, I begin the training like this:
def train_cnn(x):
    epochLosses = []
    print('Beginning Training!')
    print(NUM_EPOCHS)
    r1, r2, r3, pred = convNet(x, weights, biases)
    cost = (tf.losses.mean_squared_error(y, pred)
            + 0.25*((tf.losses.mean_squared_error(y, r1))
            + (tf.losses.mean_squared_error(y, r2))
            + (tf.losses.mean_squared_error(y, r3))))
    # L2 weight decay, one l2_loss term per kernel
    regularizer = (0.01*tf.nn.l2_loss(weights['wConv0_1'])+
        0.01*tf.nn.l2_loss(weights['wConv0_2'])+
        0.01*tf.nn.l2_loss(weights['wConv0_3'])+
        0.01*tf.nn.l2_loss(weights['wConv0_4'])+
        0.01*tf.nn.l2_loss(weights['wReconDS0_1'])+
        0.01*tf.nn.l2_loss(weights['wConv0_5'])+
        0.01*tf.nn.l2_loss(weights['wConv0_6'])+
        0.01*tf.nn.l2_loss(weights['wReconDS0_2'])+
        0.01*tf.nn.l2_loss(weights['wReconDS0_3'])+
        0.01*tf.nn.l2_loss(weights['wConv0_7'])+
        0.01*tf.nn.l2_loss(weights['wConv0_8'])+
        0.01*tf.nn.l2_loss(weights['wConv0_9'])+
        0.01*tf.nn.l2_loss(weights['wConv0_10'])+
        0.01*tf.nn.l2_loss(weights['wConvOUT'])+
        0.01*tf.nn.l2_loss(weights['wConv1_1'])+
        0.01*tf.nn.l2_loss(weights['wConv1_2'])+
        0.01*tf.nn.l2_loss(weights['wConv1_3'])+
        0.01*tf.nn.l2_loss(weights['wConv1_4'])+
        0.01*tf.nn.l2_loss(weights['wConv1_5'])+
        0.01*tf.nn.l2_loss(weights['wConv1_6'])+
        0.01*tf.nn.l2_loss(weights['wConv1_7'])+
        0.01*tf.nn.l2_loss(weights['wConv1_8'])+
        0.01*tf.nn.l2_loss(weights['wConv1_9'])+
        0.01*tf.nn.l2_loss(weights['wConv1_10'])+
        0.01*tf.nn.l2_loss(weights['wConv1_11'])+
        0.01*tf.nn.l2_loss(weights['wConv1_12'])+
        0.01*tf.nn.l2_loss(weights['wConv1_13'])+
        0.01*tf.nn.l2_loss(weights['wConv1_14'])+
        0.01*tf.nn.l2_loss(weights['wConv1_15'])+
        0.01*tf.nn.l2_loss(weights['wConv1_16'])+
        0.01*tf.nn.l2_loss(weights['wTConv1_1'])+
        0.01*tf.nn.l2_loss(weights['wTConv1_2'])+
        0.01*tf.nn.l2_loss(weights['wTConv1_3'])+
        0.01*tf.nn.l2_loss(weights['wTConv1_4'])+
        0.01*tf.nn.l2_loss(weights['wConv2_1'])+
        0.01*tf.nn.l2_loss(weights['wConv2_2'])+
        0.01*tf.nn.l2_loss(weights['wConv2_3'])+
        0.01*tf.nn.l2_loss(weights['wConv2_4'])+
        0.01*tf.nn.l2_loss(weights['wConv2_5'])+
        0.01*tf.nn.l2_loss(weights['wConv2_6'])+
        0.01*tf.nn.l2_loss(weights['wConv2_7'])+
        0.01*tf.nn.l2_loss(weights['wConv2_8'])+
        0.01*tf.nn.l2_loss(weights['wConv2_9'])+
        0.01*tf.nn.l2_loss(weights['wConv2_10'])+
        0.01*tf.nn.l2_loss(weights['wConv2_11'])+
        0.01*tf.nn.l2_loss(weights['wConv2_12'])+
        0.01*tf.nn.l2_loss(weights['wTConv2_1'])+
        0.01*tf.nn.l2_loss(weights['wTConv2_2'])+
        0.01*tf.nn.l2_loss(weights['wTConv2_3'])+
        0.01*tf.nn.l2_loss(weights['wTConv2_4']))
    cost = cost + regularizer
    optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(cost)
    saver = tf.train.Saver()
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    valLosses = []
    epochLosses = []
    print('Beginning Session!')
    writer = tf.summary.FileWriter('./GRAPH', sess.graph)
    sess.run(tf.global_variables_initializer())
Finally, I do some work to load the batches and, once they're ready, I run the following for each pass (I won't save on every pass once I have the weight importing working):
_, c = sess.run([optimizer, cost], feed_dict = {x: inBatch,y: gsBatch})
epoch_loss += c
save_path = saver.save(sess, "./CHKPT/model.cpkt")
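(For reference, a minimal sketch of the periodic saving mentioned above; the epoch counter is assumed from the surrounding loop, and Saver's global_step argument tags each checkpoint:)

if epoch % 5 == 0:
    save_path = saver.save(sess, "./CHKPT/model.cpkt", global_step=epoch)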
So when I go ahead and import this model:
sess = tf.Session()
x = tf.placeholder(dtype=tf.float32)
new_saver = tf.train.import_meta_graph('./CHKPT/model.cpkt.meta')
sess.run(tf.global_variables_initializer())
a,b,c,pred = convNet(x, weights, biases)
I am met with the following error:
ValueError: Tried to convert 'filter' to a tensor and failed. Error: None values not supported.
When I look at the imported weights and biases, each of them has the value 'None'. Not only is this odd, but the network 'runs' incredibly quickly during training, far more quickly than I'd expect, and I am worried that no legitimate computations are occurring.
Surely that can't be the case, but I am almost positive I am following the saving/loading process I've used for many other networks verbatim. Can anyone shed some light on what might be happening here?
Edit: I'm also very new to TF, and it's likely there are non-idealities in my code. If you see anything outside of the saving/importing that isn't kosher, please let me know.
Running sess.run(tf.global_variables_initializer()) will reinitialize every variable and overwrite the loaded values. Skip calling tf.global_variables_initializer() when you load a model; the initialization is done by the saver.
You are also missing the restore call (import_meta_graph() only loads the saver object).
new_saver = tf.train.import_meta_graph('./CHKPT/model.cpkt.meta')
new_saver.restore(sess, './CHKPT/model.cpkt')
Thereafter when you run:
a,b,c,pred = convNet(x, weights, biases)
you create an entirely new network and never use the loaded one.
Instead, you have to find the tensors you need inside tf.global_variables() after restoring the model, for example by looking them up by name.
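A minimal sketch of that lookup, assuming TF 1.x and the variable names given in the question (e.g. 'wC0_1'):

sess = tf.Session()
new_saver = tf.train.import_meta_graph('./CHKPT/model.cpkt.meta')
new_saver.restore(sess, './CHKPT/model.cpkt')   # note: no global_variables_initializer() here

# Build a name -> variable map from the restored graph
restored = {v.name: v for v in tf.global_variables()}
wConv0_1 = restored['wC0_1:0']   # TensorFlow appends ':0' to variable names
print(sess.run(wConv0_1))        # prints real values instead of None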
Related
How to build a custom question-answering head when using huggingface transformers?
Using the TFBertForQuestionAnswering.from_pretrained() function, we get a predefined head on top of BERT together with a loss function that is suitable for this task. My question is how to create a custom head without relying on TFAutoModelForQuestionAnswering.from_pretrained(). I want to do this because there is no place where the architecture of the head is explained clearly. By reading the code here we can see the architecture they are using, but I can't be sure I understand their code 100%. Starting from How to Fine-tune HuggingFace BERT model for Text Classification is good. However, it covers only the classification task, which is much simpler. 'start_positions' and 'end_positions' are created following this tutorial. So far, I've got the following:

import numpy as np
import tensorflow as tf
from transformers import TFAutoModel, create_optimizer

train_dataset
# Dataset({
#     features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
#     num_rows: 99205
# })

train_dataset.set_format(type='tensorflow', columns=['input_ids', 'token_type_ids', 'attention_mask'])
features = {x: train_dataset[x] for x in ['input_ids', 'token_type_ids', 'attention_mask']}
labels = [train_dataset[x] for x in ['start_positions', 'end_positions']]
labels = np.array(labels).T
tfdataset = tf.data.Dataset.from_tensor_slices((features, labels)).batch(16)

input_ids = tf.keras.layers.Input(shape=(256,), dtype=tf.int32, name='input_ids')
token_type_ids = tf.keras.layers.Input(shape=(256,), dtype=tf.int32, name='token_type_ids')
attention_mask = tf.keras.layers.Input((256,), dtype=tf.int32, name='attention_mask')

bert = TFAutoModel.from_pretrained("bert-base-multilingual-cased")
output = bert([input_ids, token_type_ids, attention_mask]).last_hidden_state
output = tf.keras.layers.Dense(2, name="qa_outputs")(output)
model = tf.keras.models.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=output)

num_train_epochs = 3
num_train_steps = len(tfdataset) * num_train_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01
)

def qa_loss(labels, logits):
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE
    )
    start_loss = loss_fn(labels[0], logits[0])
    end_loss = loss_fn(labels[1], logits[1])
    return (start_loss + end_loss) / 2.0

model.compile(
    loss=qa_loss,
    optimizer=optimizer
)
model.fit(tfdataset, epochs=num_train_epochs)

And I am getting the following error:

ValueError: `labels.shape` must equal `logits.shape` except for the last dimension. Received: labels.shape=(2,) and logits.shape=(256, 2)

It is complaining about the shape of the labels. This should not happen since I am using SparseCategoricalCrossentropy loss.
For future reference, I actually found a solution, which is just editing the TFBertForQuestionAnswering class itself. For example, I added an additional layer in the following code, trained the model as usual, and it worked.

from transformers import TFBertPreTrainedModel
from transformers import TFBertMainLayer
from transformers.modeling_tf_utils import TFQuestionAnsweringLoss, get_initializer, input_processing
from transformers.modeling_tf_outputs import TFQuestionAnsweringModelOutput
from transformers import BertConfig

class MY_TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss):
    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
    _keys_to_ignore_on_load_unexpected = [
        r"pooler",
        r"mlm___cls",
        r"nsp___cls",
        r"cls.predictions",
        r"cls.seq_relationship",
    ]

    def __init__(self, config: BertConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
        self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
        # This is the dense layer I added
        self.my_dense = tf.keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="my_dense",
        )
        self.qa_outputs = tf.keras.layers.Dense(
            units=config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="qa_outputs",
        )

    def call(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        start_positions=None,
        end_positions=None,
        training=False,
        **kwargs,
    ):
        r"""
        start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification
            loss. Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of
            the sequence are not taken into account for computing the loss.
        end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification
            loss. Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of
            the sequence are not taken into account for computing the loss.
        """
        inputs = input_processing(
            func=self.call,
            config=self.config,
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            start_positions=start_positions,
            end_positions=end_positions,
            training=training,
            kwargs_call=kwargs,
        )
        outputs = self.bert(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            token_type_ids=inputs["token_type_ids"],
            position_ids=inputs["position_ids"],
            head_mask=inputs["head_mask"],
            inputs_embeds=inputs["inputs_embeds"],
            output_attentions=inputs["output_attentions"],
            output_hidden_states=inputs["output_hidden_states"],
            return_dict=inputs["return_dict"],
            training=inputs["training"],
        )
        sequence_output = outputs[0]

        # You also have to add it here
        my_logits = self.my_dense(inputs=sequence_output)
        logits = self.qa_outputs(inputs=my_logits)
        start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
        start_logits = tf.squeeze(input=start_logits, axis=-1)
        end_logits = tf.squeeze(input=end_logits, axis=-1)

        loss = None
        if inputs["start_positions"] is not None and inputs["end_positions"] is not None:
            labels = {"start_position": inputs["start_positions"]}
            labels["end_position"] = inputs["end_positions"]
            loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits))

        if not inputs["return_dict"]:
            output = (start_logits, end_logits) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFQuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput:
        hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
        attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
        return TFQuestionAnsweringModelOutput(
            start_logits=output.start_logits,
            end_logits=output.end_logits,
            hidden_states=hs,
            attentions=attns,
        )
FailedPreconditionError: FailedPr...onError()
I have a FailedPreconditionError when running sess. My network has two different parts, a pretrained network and a newly added Recognition network. The pretrained model is used to extract features, and the features are used to train again for recognition. In my code, the pre-trained model is loaded first.

graph = tf.Graph()
with graph.as_default():
    input_data, input_labels, input_boxes = input_train_data.input_fn()
    input_boxes = tf.reshape(input_boxes, [input_boxes.shape[0]*2, -1])   #convert from Nx8 to 2Nx4
    # build model and loss
    net = Net(input_data, is_training=False)
    f_saver = tf.train.Saver(max_to_keep=1000, write_version=tf.train.SaverDef.V2, save_relative_paths=True)

sess_config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
if FLAGS.gpu_memory_fraction < 0:
    sess_config.gpu_options.allow_growth = True
elif FLAGS.gpu_memory_fraction > 0:
    sess_config.gpu_options.per_process_gpu_memory_fraction = FLAGS.gpu_memory_fraction

session = tf.Session(graph=graph, config=sess_config)
tf.logging.info('Initialize from: ' + config.train.init_checkpoint)
f_saver.restore(session, config.train.init_checkpoint)

f_saver restores the pre-trained model. Then the feature conv5_3 is extracted and fed into the Recognition network.

conv5_3 = net.end_points['conv5_3']
with tf.variable_scope("Recognition"):
    global_step_rec = tf.Variable(0, name='global_step_rec', trainable=False)
    # Pass through recognition net
    r_net = regnet.ConstructRecNet(conv5_3)
    conv7_7 = r_net.end_points['pool7']
    # implement ROI Pooling; input boxes must be in x1, y1, x2, y2
    h_fmap = tf.dtypes.cast(tf.shape(conv7_7)[1], tf.float32)
    w_fmap = tf.dtypes.cast(tf.shape(conv7_7)[2], tf.float32)
    # remap boxes at input images to feature maps
    #input_boxes = input_boxes / tf.constant([config.train.input_shape[0], config.train.input_shape[0],
    #                                         config.train.input_shape[0], config.train.input_shape[0]], dtype=tf.float32)   #Normalize with image size first
    remap_boxes = tf.matmul(input_boxes, tf.diag([w_fmap, h_fmap, w_fmap, h_fmap]))
    # put first column with image indexes
    rows = tf.expand_dims(tf.range(remap_boxes.shape[0]), 1) / 2
    add_index = tf.concat([tf.cast(rows, tf.float32), remap_boxes], -1)
    index = tf.not_equal(tf.reduce_sum(add_index[:, 4:], axis=1), 0)
    remap_boxes = tf.gather_nd(add_index, tf.where(index))
    remap_boxes = tf.dtypes.cast(remap_boxes, tf.int32)
    prob = roi_pooling(conv7_7, remap_boxes, pool_height=1, pool_width=28)
    # Get features for CTC training
    prob = tf.transpose(prob, (1, 0, 2))   # prepare for CTC
    data_length = tf.fill([tf.shape(prob)[1]], tf.shape(prob)[0])   # input seq length, batch size
    ctc = tf.py_func(CTCUtils.compute_ctc_from_labels, [input_labels], [tf.int64, tf.int64, tf.int64])
    ctc_labels = tf.to_int32(tf.SparseTensor(ctc[0], ctc[1], ctc[2]))
    predictions = tf.to_int32(tf.nn.ctc_beam_search_decoder(prob, data_length, merge_repeated=False, beam_width=10)[0][0])
    tf.sparse_tensor_to_dense(predictions, default_value=-1, name='d_predictions')
    tf.reduce_mean(tf.edit_distance(predictions, ctc_labels, normalize=False), name='error_rate')
    loss = tf.reduce_mean(tf.compat.v1.nn.ctc_loss(inputs=prob, labels=ctc_labels, sequence_length=data_length, ctc_merge_repeated=True), name='loss')
    learning_rate = tf.train.piecewise_constant(global_step_rec, [150000, 200000],
                                                [config.train.learning_rate, 0.1 * config.train.learning_rate, 0.01 * config.train.learning_rate])
    opt_loss = tf.contrib.layers.optimize_loss(loss, global_step_rec, learning_rate, config.train.opt_type,
                                               config.train.grad_noise_scale, name='train_step')
    tf.global_variables_initializer()

I can run the session up to the feature extraction conv5_3, but I can't run the ops in Recognition; they fail with FailedPreconditionError: FailedPr...onError(). What could be the problem?

graph.finalize()
with tf.variable_scope("Recognition"):
    for i in range(config.train.steps):
        input_data_, input_labels_, input_boxes_ = session.run([input_data, input_labels, input_boxes])
        conv5_3_ = session.run([conv5_3])   # can run this line
        global_step_rec_ = session.run([global_step_rec])   # got FailedPreconditionError: FailedPr...onError() at this line
        conv7_7_ = session.run([conv7_7])
        h_fmap_ = session.run([h_fmap])
Now it works. Since my graph has two parts, I need to initialize them separately. (1) First get all variables belonging to the pre-trained model and initialize them from the checkpoint with tf.train.Saver. (2) Then initialize the rest of the add-in layers using tf.global_variables_initializer(). My code is as follows.

from tensorflow.python import pywrap_tensorflow   # assumed import for NewCheckpointReader

# Initialization
# Initialize the pre-trained model first.
# Since we need to restore the pre-trained model and initialize the respective variables in this current graph:
# (1) make a variable list for the checkpoint
# (2) initialize a saver for the variable list
# (3) then restore

# (1)
def print_tensors_in_checkpoint_file(file_name, tensor_name, all_tensors):
    varlist = []
    reader = pywrap_tensorflow.NewCheckpointReader(file_name)
    if all_tensors:
        var_to_shape_map = reader.get_variable_to_shape_map()
        for key in sorted(var_to_shape_map):
            print(key)
            varlist.append(key)
    return varlist

varlist = print_tensors_in_checkpoint_file(file_name=config.train.init_checkpoint, all_tensors=True, tensor_name=None)

# (2) prepare the list of variables by calling variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
#countcheckpt_vars = 0
#for n in tf.get_default_graph().as_graph_def().node:
#    print(n.name)
#for op in tf.get_default_graph().get_operations():
#    print(str(op.name))
#for var in zip(variables):
#    countcheckpt_vars = countcheckpt_vars + 1

# (3)
loader = tf.train.Saver(variables[:46])   # since I need to initialize only 46 variables from the global variables
tf.logging.info('Initialize from: ' + config.train.init_checkpoint)
sess_config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
if FLAGS.gpu_memory_fraction < 0:
    sess_config.gpu_options.allow_growth = True
elif FLAGS.gpu_memory_fraction > 0:
    sess_config.gpu_options.per_process_gpu_memory_fraction = FLAGS.gpu_memory_fraction
session = tf.Session(graph=graph, config=sess_config)
loader.restore(session, config.train.init_checkpoint)

Then initialize the rest of the variables:

init = tf.global_variables_initializer()
session.run(init)
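If slicing variables[:46] by position ever becomes fragile, a name-based alternative is a small sketch away (assuming varlist holds the checkpoint variable names collected above): initialize only whatever the loader did not restore.

# Initialize only the variables that were not restored from the checkpoint
restored_names = set(varlist)
unrestored = [v for v in tf.global_variables() if v.op.name not in restored_names]
session.run(tf.variables_initializer(unrestored))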
(De-)Convolutional LSTM autoencoder - error jumps
I'm trying to build a convolutional LSTM autoencoder (that also predicts future and past) with Tensorflow, and it works to a certain degree, but the error sometimes jumps back up, so essentially, it never converges. The model is as follows: The encoder starts with a 64x64 frame from a 20-frame bouncing mnist video for each time step of the LSTM. Every stacking layer of LSTM halves it and increases the depth via 2x2 convolutions with a stride of 2 (so -->32x32x3 -->...--> 1x1x96). On the other hand, the LSTM performs 3x3 convolutions with a stride of 1 on its state. Both results are concatenated to form the new state. In the same way, the decoder uses transposed convolutions to go back to the original format. Then the squared error is calculated. The error starts at around 2700 and it takes around 20 hours (geforce1060) to get down to ~1700, at which point the jumping back up (and it sometimes jumps back up to 2300, or even to ridiculous values like 440300) happens often enough that I can't really get any lower. Also at that point, it can usually pinpoint where the number should be, but it's too fuzzy to actually make out the digit... I tried different learning rates and optimizers, so if anybody knows why that jumping happens, that'd make me happy :) Here is a graph of the loss with epochs:

import tensorflow as tf
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

#based on code by loliverhennigh (Github)
class ConvCell(tf.contrib.rnn.RNNCell):
    count = 0   #exists only to remove issues with variable scope

    def __init__(self, shape, num_features, transpose=False):
        self.shape = shape
        self.num_features = num_features
        self._state_is_tuple = True
        self._transpose = transpose
        ConvCell.count += 1
        self.count = ConvCell.count

    @property
    def state_size(self):
        return (tf.contrib.rnn.LSTMStateTuple(self.shape[0:4], self.shape[0:4]))

    @property
    def output_size(self):
        return tf.TensorShape(self.shape[1:4])

    #here comes the actual conv lstm implementation; if transpose = True, it performs a deconvolution on the input
    def __call__(self, inputs, state, scope=None):
        with tf.variable_scope(scope or type(self).__name__ + str(self.count)):
            c, h = state
            state_shape = h.shape
            input_shape = inputs.shape

            #filter variables and convolutions on data coming from the same cell, a time step previous
            h_filters = tf.get_variable("h_filters", [3, 3, state_shape[3], self.num_features])
            h_filters_gates = tf.get_variable("h_filters_gates", [3, 3, state_shape[3], 3])
            h_partial = tf.nn.conv2d(h, h_filters, [1, 1, 1, 1], 'SAME')
            h_partial_gates = tf.nn.conv2d(h, h_filters_gates, [1, 1, 1, 1], 'SAME')

            c_filters = tf.get_variable("c_filters", [3, 3, state_shape[3], 3])
            c_partial = tf.nn.conv2d(c, c_filters, [1, 1, 1, 1], 'SAME')

            #filters and convolutions/deconvolutions on data coming from the cell input
            if self._transpose:
                x_filters = tf.get_variable("x_filters", [2, 2, self.num_features, input_shape[3]])
                x_filters_gates = tf.get_variable("x_filters_gates", [2, 2, 3, input_shape[3]])
                x_partial = tf.nn.conv2d_transpose(inputs, x_filters, [int(state_shape[0]), int(state_shape[1]), int(state_shape[2]), self.num_features], [1, 2, 2, 1], 'VALID')
                x_partial_gates = tf.nn.conv2d_transpose(inputs, x_filters_gates, [int(state_shape[0]), int(state_shape[1]), int(state_shape[2]), 3], [1, 2, 2, 1], 'VALID')
            else:
                x_filters = tf.get_variable("x_filters", [2, 2, input_shape[3], self.num_features])
                x_filters_gates = tf.get_variable("x_filters_gates", [2, 2, input_shape[3], 3])
                x_partial = tf.nn.conv2d(inputs, x_filters, [1, 2, 2, 1], 'VALID')
                x_partial_gates = tf.nn.conv2d(inputs, x_filters_gates, [1, 2, 2, 1], 'VALID')

            #some more lstm gate business
            gate_bias = tf.get_variable("gate_bias", [1, 1, 1, 3])
            h_bias = tf.get_variable("h_bias", [1, 1, 1, self.num_features*2])

            gates = h_partial_gates + x_partial_gates + c_partial + gate_bias
            i, f, o = tf.split(gates, 3, axis=3)

            #concatenate the units coming from the spatial and the temporal dimension to build a unified state
            concat = tf.concat([h_partial, x_partial], 3) + h_bias

            new_c = tf.nn.relu(concat)*tf.sigmoid(i) + c*tf.sigmoid(f)
            new_h = new_c * tf.sigmoid(o)
            new_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h)
            return new_h, new_state   #its redundant, but thats how tensorflow likes it, apparently

#global variables
LEARNING_RATE = 0.005
ITERATIONS_PER_EPOCH = 80
BATCH_SIZE = 75
TEST = False   #manual switch to go from training to testing

if TEST:
    BATCH_SIZE = 1

inputs = tf.placeholder(tf.float32, (20, BATCH_SIZE, 64, 64, 1))
shape0 = [BATCH_SIZE, 64, 64, 2]
shape1 = [BATCH_SIZE, 32, 32, 6]
shape2 = [BATCH_SIZE, 16, 16, 12]
shape3 = [BATCH_SIZE, 8, 8, 24]
shape4 = [BATCH_SIZE, 4, 4, 48]
shape5 = [BATCH_SIZE, 2, 2, 96]
shape6 = [BATCH_SIZE, 1, 1, 192]

#apparently tf.multirnncell has very specific requirements for the initial states oO
initial_state1 = (tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape1), tf.zeros(shape1)),
                  tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape2), tf.zeros(shape2)),
                  tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape3), tf.zeros(shape3)),
                  tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape4), tf.zeros(shape4)),
                  tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape5), tf.zeros(shape5)),
                  tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape6), tf.zeros(shape6)))
initial_state2 = (tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape5), tf.zeros(shape5)),
                  tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape4), tf.zeros(shape4)),
                  tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape3), tf.zeros(shape3)),
                  tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape2), tf.zeros(shape2)),
                  tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape1), tf.zeros(shape1)),
                  tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape0), tf.zeros(shape0)))

#encoding part of the autoencoder graph
cell1 = ConvCell(shape1, 3)
cell2 = ConvCell(shape2, 6)
cell3 = ConvCell(shape3, 12)
cell4 = ConvCell(shape4, 24)
cell5 = ConvCell(shape5, 48)
cell6 = ConvCell(shape6, 96)
mcell = tf.contrib.rnn.MultiRNNCell([cell1, cell2, cell3, cell4, cell5, cell6])

rnn_outputs, rnn_states = tf.nn.dynamic_rnn(mcell, inputs[0:20,:,:,:], initial_state=initial_state1, dtype=tf.float32, time_major=True)

#decoding part of the autoencoder graph, forward block and backwards block
cell9a = ConvCell(shape5, 48, transpose=True)
cell10a = ConvCell(shape4, 24, transpose=True)
cell11a = ConvCell(shape3, 12, transpose=True)
cell12a = ConvCell(shape2, 6, transpose=True)
cell13a = ConvCell(shape1, 3, transpose=True)
cell14a = ConvCell(shape0, 1, transpose=True)
mcella = tf.contrib.rnn.MultiRNNCell([cell9a, cell10a, cell11a, cell12a, cell13a, cell14a])

cell9b = ConvCell(shape5, 48, transpose=True)
cell10b = ConvCell(shape4, 24, transpose=True)
cell11b = ConvCell(shape3, 12, transpose=True)
cell12b = ConvCell(shape2, 6, transpose=True)
cell13b = ConvCell(shape1, 3, transpose=True)
cell14b = ConvCell(shape0, 1, transpose=True)
mcellb = tf.contrib.rnn.MultiRNNCell([cell9b, cell10b, cell11b, cell12b, cell13b, cell14b])

def PredictionLayer(rnn_outputs, viewPoint=11, reverse=False):
    predLength = viewPoint-2 if reverse else 20-viewPoint
    #vision is the input for the decoder
    vision = tf.concat([rnn_outputs[viewPoint-1:viewPoint,:,:,:], tf.zeros([predLength, BATCH_SIZE, 1, 1, 192])], 0)
    if reverse:
        rnn_outputs2, rnn_states = tf.nn.dynamic_rnn(mcellb, vision, initial_state=initial_state2, time_major=True)
    else:
        rnn_outputs2, rnn_states = tf.nn.dynamic_rnn(mcella, vision, initial_state=initial_state2, time_major=True)
    mean = tf.reduce_mean(rnn_outputs2, 4)
    if TEST:
        return mean
    if reverse:
        return tf.reduce_sum(tf.square(mean - inputs[viewPoint-2::-1,:,:,:,0]))
    else:
        return tf.reduce_sum(tf.square(mean - inputs[viewPoint-1:20,:,:,:,0]))

if TEST:
    mean = tf.concat([PredictionLayer(rnn_outputs, 11, True)[::-1,:,:,:], PredictionLayer(rnn_outputs, 11)], 0)
else:
    #training part of the graph
    error = tf.zeros([1])
    for i in range(8, 15):   #range size of 7 or less works, 9 or more does not, no idea why
        error += PredictionLayer(rnn_outputs, i)
        error += PredictionLayer(rnn_outputs, i, True)
    train_fn = tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE).minimize(error)

################################################################################
##                           TRAINING LOOP                                    ##
################################################################################
#code based on siemanko/tf_lstm.py (Github)
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
saver = tf.train.Saver(restore_sequentially=True, allow_empty=True)
session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
session.run(tf.global_variables_initializer())

vids = np.load("mnist_test_seq.npy")   #20/10000/64/64, moving mnist dataset from http://www.cs.toronto.edu/~nitish/unsupervised_video/
vids = vids[:, 0:6000, :, :]   #training set
saver.restore(session, tf.train.latest_checkpoint('./conv_lstm_multiples_v2/'))
#saver.restore(session,'.\conv_lstm_multiples\iteration-74')

for epoch in range(1000):
    if TEST:
        break
    epoch_error = 0
    #randomize batches each epoch
    vids = np.swapaxes(vids, 0, 1)
    np.random.shuffle(vids)
    vids = np.swapaxes(vids, 0, 1)
    for i in range(ITERATIONS_PER_EPOCH):
        #running the graph and feeding data
        err, _ = session.run([error, train_fn], {inputs: np.expand_dims(vids[:, i*BATCH_SIZE:(i+1)*BATCH_SIZE, :, :], axis=4)})
        print(err)
        epoch_error += err
    #training error each epoch and regular saving
    epoch_error /= (ITERATIONS_PER_EPOCH*BATCH_SIZE*4096*20*7)
    if (epoch+1) % 5 == 0:
        saver.save(session, '.\conv_lstm_multiples_v2\iteration', global_step=epoch)
        print("saved")
    print("Epoch %d, train error: %f" % (epoch, epoch_error))

#testing
plt.ion()
f, axarr = plt.subplots(2)
vids = np.load("mnist_test_seq.npy")
for i in range(6000, 10000):
    img = session.run([mean], {inputs: np.expand_dims(vids[:, i:i+1, :, :], axis=4)})
    for j in range(20):
        axarr[0].imshow(img[0][j, 0, :, :])
        axarr[1].imshow(vids[j, i, :, :])
        plt.show()
        plt.pause(0.1)
Usually this happens when the gradients' magnitude is very high at some point, which causes your network parameters to change a lot. To verify that this is indeed the case, you can produce the same plot for gradient magnitudes and see whether they jump right before the loss jumps. Assuming this is the case, the classic approach is to use gradient clipping (or go all the way to natural gradient).
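A minimal gradient-clipping sketch for the training op in the question (the clip norm of 5.0 is an arbitrary assumption to tune, not a value from this answer):

optimizer = tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE)
grads_and_vars = optimizer.compute_gradients(error)
# Clip each gradient's norm before applying it, leaving variables with no gradient alone
clipped = [(tf.clip_by_norm(g, 5.0), v) for g, v in grads_and_vars if g is not None]
train_fn = optimizer.apply_gradients(clipped)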
Tensorflow >r1.0 tf.layers.batch_normalization very bad test performance
I'm trying to use the tf.layers.batch_normalization function provided in the latest Tensorflow API to implement a recurrent batch-normalized LSTM. The implementation is below (I modified the TF source code):

import tensorflow as tf
from tensorflow.python.platform import tf_logging as logging   # assumed import for 'logging', as in the TF source

class BNLSTMCell(tf.nn.rnn_cell.RNNCell):
    """Batch Normalized Long short-term memory unit (LSTM) recurrent network cell.

    cf. Recurrent Batch Normalization
        https://arxiv.org/abs/1603.09025

    cf. A Gentle Guide to Using Batch Normalization in TensorFlow
        http://ruishu.io/2016/12/27/batchnorm/
    """

    def __init__(self, num_units, forward_only, gamma_c=1.0, gamma_h=1.0,
                 gamma_x=1.0, beta_c=0.0, beta_h=0.0, beta_x=0.0,
                 input_size=None, use_peepholes=False, cell_clip=None,
                 initializer=None, num_proj=None, num_unit_shards=1,
                 num_proj_shards=1, forget_bias=1.0, state_is_tuple=False,
                 activation=tf.tanh):
        """Initialize the parameters for an LSTM cell.

        Args:
          num_units: int, The number of units in the LSTM cell
          forward_only:
            If False (training):
              1. Normalize layer activations according to mini-batch statistics.
              2. During the training step, update population statistics
                 approximation via moving average of mini-batch statistics.
            If True (testing):
              1. Normalize layer activations according to estimated population
                 statistics.
              2. No update of population statistics according to mini-batch
                 statistics from test data.
          gamma_c: Scale of cell state normalization
          beta_c: Offset of cell state normalization
          gamma_h: Scale of hidden state normalization
          beta_h: Offset of hidden state normalization (set to 0 to avoid redundancy)
          gamma_x: Scale of input normalization
          beta_x: Offset of input normalization (set to 0 to avoid redundancy)
          input_size: Deprecated and unused.
          use_peepholes: bool, Set True to enable diagonal/peephole connections.
          cell_clip: (optional) A float value, if provided the cell state is
            clipped by this value prior to the cell output activation.
          initializer: (optional) The initializer to use for the weight and
            projection matrices.
          num_proj: (optional) int, The output dimensionality for the projection
            matrices. If None, no projection is performed.
          num_unit_shards: How to split the weight matrix. If >1, the weight
            matrix is stored across num_unit_shards.
          num_proj_shards: How to split the projection matrix. If >1, the
            projection matrix is stored across num_proj_shards.
          forget_bias: Biases of the forget gate are initialized by default to 1
            in order to reduce the scale of forgetting at the beginning of the
            training.
          state_is_tuple: If True, accepted and returned states are 2-tuples of
            the `c_state` and `m_state`. By default (False), they are
            concatenated along the column axis. This default behavior will soon
            be deprecated.
          activation: Activation function of the inner states.
        """
        if not state_is_tuple:
            logging.warn("%s: Using a concatenated state is slower and will soon be "
                         "deprecated. Use state_is_tuple=True." % self)
        if input_size is not None:
            logging.warn("%s: The input_size parameter is deprecated." % self)
        self._num_units = num_units
        self.forward_only = forward_only
        self._gamma_c = gamma_c
        self._beta_c = beta_c
        self._gamma_h = gamma_h
        self._beta_h = beta_h
        self._gamma_x = gamma_x
        self._beta_x = beta_x
        self._use_peepholes = use_peepholes
        self._cell_clip = cell_clip
        self._initializer = initializer
        self._num_proj = num_proj
        self._num_unit_shards = num_unit_shards
        self._num_proj_shards = num_proj_shards
        self._forget_bias = forget_bias
        self._state_is_tuple = state_is_tuple
        self._activation = activation

        if num_proj:
            self._state_size = (
                tf.nn.rnn_cell.LSTMStateTuple(num_units, num_proj)
                if state_is_tuple else num_units + num_proj)
            self._output_size = num_proj
        else:
            self._state_size = (
                tf.nn.rnn_cell.LSTMStateTuple(num_units, num_units)
                if state_is_tuple else 2 * num_units)
            self._output_size = num_units

    @property
    def state_size(self):
        return self._state_size

    @property
    def output_size(self):
        return self._output_size

    def __call__(self, inputs, state, scope=None):
        """Run one step of LSTM.

        Args:
          inputs: input Tensor, 2D, batch x num_units.
          state: if `state_is_tuple` is False, this must be a state Tensor,
            `2-D, batch x state_size`. If `state_is_tuple` is True, this must
            be a tuple of state Tensors, both `2-D`, with column sizes
            `c_state` and `m_state`.
          scope: VariableScope for the created subgraph; defaults to "LSTMCell".

        Returns:
          A tuple containing:
          - A `2-D, [batch x output_dim]`, Tensor representing the output of
            the LSTM after reading `inputs` when previous state was `state`.
            Here output_dim is:
               num_proj if num_proj was set,
               num_units otherwise.
          - Tensor(s) representing the new state of LSTM after reading `inputs`
            when the previous state was `state`. Same type and shape(s) as
            `state`.

        Raises:
          ValueError: If input size cannot be inferred from inputs via static
            shape inference.
        """
        num_proj = self._num_units if self._num_proj is None else self._num_proj

        if self._state_is_tuple:
            (c_prev, m_prev) = state
        else:
            c_prev = tf.slice(state, [0, 0], [-1, self._num_units])
            m_prev = tf.slice(state, [0, self._num_units], [-1, num_proj])

        dtype = inputs.dtype
        input_size = inputs.get_shape().with_rank(2)[1]
        if input_size.value is None:
            raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
        with tf.variable_scope(scope or type(self).__name__,
                               initializer=self._initializer):   # "LSTMCell"
            w_h = tf.get_variable("W_h", [num_proj, 4 * self._num_units], dtype=tf.float32)
            w_x = tf.get_variable("W_x", [input_size.value, 4 * self._num_units], dtype=tf.float32)
            b = tf.get_variable("B", shape=[4 * self._num_units],
                                initializer=tf.zeros_initializer, dtype=dtype)

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            hidden_matrix = tf.matmul(m_prev, w_h)
            bn_hidden_matrix = tf.layers.batch_normalization(
                hidden_matrix, momentum=0.5,
                beta_initializer=tf.constant_initializer(self._beta_h),
                gamma_initializer=tf.constant_initializer(self._gamma_h),
                training=(not self.forward_only),
                name='bn_hidden_matrix', reuse=None)
            # print(tf.get_collection(tf.GraphKeys.VARIABLES, scope=scope))
            input_matrix = tf.matmul(inputs, w_x)
            bn_input_matrix = tf.layers.batch_normalization(
                input_matrix, momentum=0.5,
                beta_initializer=tf.constant_initializer(self._beta_x),
                gamma_initializer=tf.constant_initializer(self._gamma_x),
                training=(not self.forward_only),
                name='bn_input_matrix', reuse=None)

            lstm_matrix = tf.nn.bias_add(tf.add(bn_input_matrix, bn_hidden_matrix), b)
            i, j, f, o = tf.split(lstm_matrix, num_or_size_splits=4, axis=1)

            # Diagonal connections
            if self._use_peepholes:
                w_f_diag = tf.get_variable("W_F_diag", shape=[self._num_units], dtype=dtype)
                w_i_diag = tf.get_variable("W_I_diag", shape=[self._num_units], dtype=dtype)
                w_o_diag = tf.get_variable("W_O_diag", shape=[self._num_units], dtype=dtype)

            if self._use_peepholes:
                c = (tf.sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
                     tf.sigmoid(i + w_i_diag * c_prev) * self._activation(j))
            else:
                c = (tf.sigmoid(f + self._forget_bias) * c_prev +
                     tf.sigmoid(i) * self._activation(j))

            if self._cell_clip is not None:
                # pylint: disable=invalid-unary-operand-type
                c = tf.clip_by_value(c, -self._cell_clip, self._cell_clip)
                # pylint: enable=invalid-unary-operand-type

            bn_c = tf.layers.batch_normalization(
                c, momentum=0.5,
                beta_initializer=tf.constant_initializer(self._beta_c),
                gamma_initializer=tf.constant_initializer(self._gamma_c),
                training=(not self.forward_only),
                name='bn_cell', reuse=None)

            if self._use_peepholes:
                m = tf.sigmoid(o + w_o_diag * bn_c) * self._activation(bn_c)
            else:
                m = tf.sigmoid(o) * self._activation(bn_c)

            if self._num_proj is not None:
                concat_w_proj = tf.nn.rnn_cell._get_concat_variable(
                    "W_P", [self._num_units, self._num_proj], dtype, self._num_proj_shards)
                m = tf.matmul(m, concat_w_proj)

        new_state = (tf.nn.rnn_cell.LSTMStateTuple(c, m)
                     if self._state_is_tuple else tf.concat(1, [c, m]))
        return m, new_state

I built a sequence to sequence model and run the extra updates during training as specified in other posts:

extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
if extra_update_ops and not forward_only:
    outputs, extra_updates = session.run([output_feed, extra_update_ops], input_feed)
else:
    outputs = session.run(output_feed, input_feed)

The training loss looks very reasonable. However, my test output is garbage. I wonder if anyone has had similar experience and knows how to resolve it.
Calling a basic LSTM cell within a custom Tensorflow cell
I'm trying to implement the MATCH LSTM from this paper: https://arxiv.org/pdf/1608.07905.pdf I'm using Tensorflow. One part of the architecture is an RNN that uses the input and the previous state to compute an attention vector, which it applies to a context before concatenating the result with the inputs and sending them into an LSTM. To build the first part of this RNN, I wrote a custom cell for Tensorflow to call. But I'm not sure how to send the results into an LSTM. Is it possible to call the basic LSTM cell within the custom cell I'm writing? I tried this a few different ways but kept getting the error "'module' object has no attribute 'rnn_cell'" at the line where the LSTM cell is called. Any help would be much appreciated!

EDIT to add code:

import numpy as np
import tensorflow as tf

class MatchLSTMCell(tf.contrib.rnn.RNNCell):
    def __init__(self, state_size, question_tensor, encoded_questions, batch_size):
        self._state_size = state_size
        self.question_tensor = question_tensor
        self.encoded_questions = encoded_questions
        self.batch_size = batch_size

    @property
    def state_size(self):
        return self._state_size

    @property
    def output_size(self):
        return self._state_size

    def __call__(self, inputs, state, scope=None):
        scope = scope or type(self).__name__
        with tf.variable_scope(scope):
            W_p = tf.get_variable("W_p", dtype=tf.float64, shape=[self.state_size, self.state_size],
                                  initializer=tf.contrib.layers.xavier_initializer())
            W_r = tf.get_variable("W_r", dtype=tf.float64, shape=[self.state_size, self.state_size],
                                  initializer=tf.contrib.layers.xavier_initializer())
            b_p = tf.get_variable("b_p", dtype=tf.float64, shape=[self.state_size])
            w = tf.get_variable("w", dtype=tf.float64, shape=[1, self.state_size])
            b = tf.get_variable("b", dtype=tf.float64, shape=[])

            #print 'question tensor', np.shape(self.question_tensor)
            #print 'inputs', np.shape(inputs)
            #print 'insides', np.shape(tf.matmul(inputs, W_p) + tf.matmul(state, W_r) + b_p)
            G = tf.nn.tanh(
                tf.transpose(tf.transpose(self.question_tensor, perm=[1, 0, 2]) +
                             (tf.matmul(inputs, W_p) + tf.matmul(state, W_r) + b_p), perm=[1, 0, 2])
            )
            #print 'big G', np.shape(G)

            attention_list = []
            for i in range(self.batch_size):
                attention_matrix = tf.matmul(G[i, :, :], tf.transpose(w))
                attention_list.append(attention_matrix)
            attention_scores = tf.stack(attention_list)
            a = tf.nn.softmax(attention_scores + b)
            a = tf.reshape(a, [self.batch_size, -1])
            #print 'a shape is', np.shape(a)

            weighted_question_list = []
            for i in range(self.batch_size):
                attention_vector = tf.matmul(tf.reshape(a[i], [1, -1]), self.encoded_questions[i])
                weighted_question_list.append(attention_vector)
            weighted_questions = tf.stack(weighted_question_list)
            weighted_questions = tf.reshape(weighted_questions, [32, -1])
            #print 'weighted questions', np.shape(weighted_questions)

            z = tf.concat([inputs, weighted_questions], 1)

            lstm_cell = tf.nn.rnn_cell.LSTMCell(self.state_size)
            output, new_state = lstm_cell.__call__(z, state)
            return output, new_state
I'm also trying to reimplement Match_LSTM for SQuAD as an experiment. I used MurtyShikhar's implementation as a reference. It works! However, he had to customize AttentionWrapper and use the existing BasicLSTM cell. I also tried to create a Match_LSTM cell by putting z and state as the (inputs, state) pair in a basic LSTM:

# These imports mirror the TF internals the snippet relies on; their exact
# module paths vary across TF 1.x releases, so treat them as assumptions.
from tensorflow.python.ops import variable_scope as vs
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops.rnn_cell_impl import _linear

def __call__(self, inputs, state):
    # c is not an output; c somehow is a "memory keeper".
    # Necessary to update and pass new_c through the LSTM.
    c, h = state
    # ...calculate your z
    # ...inputs will be each token in the context (passage) respectively
    # ...calculate alpha_Q
    z = tf.concat([inputs, alpha_Q], axis=1)

    ######## This part is a reimplementation of Basic_LSTM
    with vs.variable_scope("LSTM_core"):
        sigmoid = math_ops.sigmoid
        concat = _linear([z, h], dimension * 4, bias=True)
        i, j, f, o = array_ops.split(concat, num_or_size_splits=4, axis=1)
        new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) * self._activation(j))
        new_h = self._activation(new_c) * sigmoid(o)
        new_state = (new_c, new_h)
    return new_h, new_state
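As for the original "'module' object has no attribute 'rnn_cell'" error: the basic LSTM cell moved between modules across TF 1.x releases, so a version-tolerant lookup (an assumption about the environment, not part of the answer above) looks like:

try:
    lstm_cell = tf.nn.rnn_cell.LSTMCell(state_size)   # most 1.x releases
except AttributeError:
    lstm_cell = tf.contrib.rnn.LSTMCell(state_size)   # e.g. TF 1.0/1.1, where tf.nn.rnn_cell was absent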