I am trying to do a grid search by calling model.fit recursively for different parameters of my model.
I get a resource exhausted error in tensorflow. In spite of doing del model and tf.keras.backend.clear_session() at the end of the loop. This is my code
def kfoldsplit(FRAME_PATH, MASK_PATH,k):
kfold = []
all_frames = os.listdir(FRAME_PATH)
all_masks = os.listdir(MASK_PATH)
all_frames.sort(key=lambda var: [int(x) if x.isdigit() else x
for x in re.findall(r'[^0-9]|[0-9]+', var)])
all_masks.sort(key=lambda var: [int(x) if x.isdigit() else x
for x in re.findall(r'[^0-9]|[0-9]+', var)])
random.seed(230)
random.shuffle(all_frames)
# Generate train, val, and test sets for frames
train_split = int(0.8 * len(all_frames))
#val_split = int(0.9 * len(all_frames))
#test_split = int(0.9 * len(all_frames))
train_frames = all_frames[:train_split]
#val_frames = all_frames[train_split:val_split]
test_frames = all_frames[train_split:]
# Generate corresponding mask lists for masks
train_masks = [f for f in all_masks if 'image_' + f[6:16] + 'dcm' in train_frames]
#val_masks = [f for f in all_masks if 'image_' + f[6:16] + 'dcm' in val_frames]
test_masks = [f for f in all_masks if 'image_' + f[6:16] + 'dcm' in test_frames]
size_of_subset =int(len(train_masks)/k)
for i in range (0,k):
subset = (train_frames[i*size_of_subset:(i+1)*size_of_subset],train_masks[i*size_of_subset:(i+1)*size_of_subset])
kfold.append(subset)
return kfold, (test_frames,test_masks)
def get_model_name(k):
return 'model_'+str(k)+'.hdf5'
def float_range(start, stop, step):
while start < stop:
yield float(start)
start += decimal.Decimal(step)
frames_path = 'C:/Datasets/elderlymen1/2d/images'
masks_path = 'C:/Datasets/elderlymen1/2d/FASCIA_FILLED'
kf = kfoldsplit(frames_path, masks_path, 10)
def crossvalidation(epoch,kf, loops):
VALIDATION_ACCURACY = []
VALIDATION_LOSS = []
Params=[]
save_dir = 'C:/saved_models/'
fold_var = 1
i=0
for i in float_range(0,1,0.1):
for j in float_range(1e-6,1e-3,1e-6):
#while i <= loops:
#_alpha = random.uniform(0, 1)
#lrate = random.uniform(1e-3, 1e-6)
_alpha = i
lrate = j
Params.append([_alpha,lrate])
for subset in kf[0]:
list_IDs = subset[0]
train_data_generator = DataGenerator2(list_IDs, frames_path, masks_path, to_fit=True, batch_size=2,
dim=(512, 512), dimy=(512, 512), n_channels=1, n_classes=2, shuffle=True,
data_gen_args=data_gen_args_dict)
list_IDs = kf[1][0]
valid_data_generator = DataGenerator(list_IDs, frames_path, masks_path, to_fit=True, batch_size=2,
dim=(512, 512), dimy=(512, 512), n_channels=1, n_classes=2, shuffle=True)
# CREATE NEW MODEL
model = unet(pretrained_weights='csa/unet_ThighOuterSurface.hdf5')
# COMPILE NEW MODEL
model.compile(optimizer=Adam(lr=lrate), loss=combo_loss(alpha=_alpha, beta=0.4), metrics=[dice_accuracy])
# CREATE CALLBACKS
checkpoint = tf.keras.callbacks.ModelCheckpoint(save_dir + get_model_name(fold_var),
monitor='val_loss', verbose=1,
save_best_only=True, mode='max')
callbacks_list = [checkpoint]
# There can be other callbacks, but just showing one because it involves the model name
# This saves the best model
# FIT THE MODEL
history = model.fit(train_data_generator, validation_steps=len(valid_data_generator), steps_per_epoch=len(train_data_generator),
epochs=epoch,
callbacks=callbacks_list,
validation_data=valid_data_generator)
# PLOT HISTORY
# :
# :
# LOAD BEST MODEL to evaluate the performance of the model
model.load_weights("C:/saved_models/model_" + str(fold_var) + ".hdf5")
results = model.evaluate(valid_data_generator)
results = dict(zip(model.metrics_names, results))
VALIDATION_ACCURACY.append(results['dice_accuracy'])
VALIDATION_LOSS.append(results['loss'])
tf.keras.backend.clear_session()
fold_var += 1
del model
#i+=1
print(VALIDATION_ACCURACY)
print(Params)
sample = open('metrics.txt', '+r')
print(VALIDATION_ACCURACY, file=sample)
print(Params, file=sample)
print('...',file=sample)
sample.close()
crossvalidation(15,kf, 2)
Why is the memory still exhausted and how can I release it. Or if it is not possible, is there another option for a grid search and cross validation for an image segmentation model?
Thank you
After trying everything I found in order to release memory, then only thing that solved the problem was adding
del model
gc.collect()
at the end of the for loop
I have FailedPreconditionError when running sess.
My network has two different parts, pretrained-network and new add in Recognition network.
Pretrained model is used to extract features and the feature is used to train again for recognition.
In my code, pre-trained model is loaded first.
graph = tf.Graph()
with graph.as_default():
input_data, input_labels, input_boxes = input_train_data.input_fn()
input_boxes = tf.reshape(input_boxes,[input_boxes.shape[0]*2,-1])#convert from Nx8 to 2Nx4
# build model and loss
net = Net(input_data, is_training = False)
f_saver = tf.train.Saver(max_to_keep=1000, write_version=tf.train.SaverDef.V2, save_relative_paths=True)
sess_config = tf.ConfigProto(log_device_placement = False, allow_soft_placement = True)
if FLAGS.gpu_memory_fraction < 0:
sess_config.gpu_options.allow_growth = True
elif FLAGS.gpu_memory_fraction > 0:
sess_config.gpu_options.per_process_gpu_memory_fraction = FLAGS.gpu_memory_fraction;
session = tf.Session(graph=graph, config=sess_config)
tf.logging.info('Initialize from: ' + config.train.init_checkpoint)
f_saver.restore(session, config.train.init_checkpoint)
f_saver restores the pre-trained model.
Then feature conv5_3 is extracted and fed into Recognition network.
conv5_3 = net.end_points['conv5_3']
with tf.variable_scope("Recognition"):
global_step_rec = tf.Variable(0, name='global_step_rec', trainable=False)
#Pass through recognition net
r_net = regnet.ConstructRecNet(conv5_3)
conv7_7 = r_net.end_points['pool7']
#implement ROI Pooling
#input boxes be in x1, y1, x2, y2
h_fmap = tf.dtypes.cast(tf.shape(conv7_7)[1],tf.float32)
w_fmap = tf.dtypes.cast(tf.shape(conv7_7)[2],tf.float32)
#remap boxes at input images to feature mats
#input_boxes = input_boxes / tf.constant([config.train.input_shape[0], config.train.input_shape[0],\
# config.train.input_shape[0], config.train.input_shape[0]], dtype=tf.float32)#Normalize with image size first
remap_boxes=tf.matmul(input_boxes,tf.diag([w_fmap,h_fmap,w_fmap,h_fmap]))
#put first column with image indexes
rows = tf.expand_dims(tf.range(remap_boxes.shape[0]), 1)/2
add_index = tf.concat([tf.cast(rows,tf.float32),remap_boxes],-1)
index = tf.not_equal(tf.reduce_sum(add_index[:,4:],axis=1),0)
remap_boxes = tf.gather_nd(add_index,tf.where(index))
remap_boxes=tf.dtypes.cast(remap_boxes,tf.int32)
prob = roi_pooling(conv7_7, remap_boxes, pool_height=1, pool_width=28)
#Get features for CTC training
prob = tf.transpose(prob, (1, 0, 2)) # prepare for CTC
data_length = tf.fill([tf.shape(prob)[1]], tf.shape(prob)[0]) # input seq length, batch size
ctc = tf.py_func(CTCUtils.compute_ctc_from_labels, [input_labels], [tf.int64, tf.int64, tf.int64])
ctc_labels = tf.to_int32(tf.SparseTensor(ctc[0], ctc[1], ctc[2]))
predictions = tf.to_int32(tf.nn.ctc_beam_search_decoder(prob, data_length, merge_repeated=False, beam_width=10)[0][0])
tf.sparse_tensor_to_dense(predictions, default_value=-1, name='d_predictions')
tf.reduce_mean(tf.edit_distance(predictions, ctc_labels, normalize=False), name='error_rate')
loss = tf.reduce_mean(tf.compat.v1.nn.ctc_loss(inputs=prob, labels=ctc_labels, sequence_length=data_length, ctc_merge_repeated=True), name='loss')
learning_rate = tf.train.piecewise_constant(global_step_rec, [150000, 200000],[config.train.learning_rate, 0.1 * config.train.learning_rate,0.01 * config.train.learning_rate])
opt_loss = tf.contrib.layers.optimize_loss(loss, global_step_rec, learning_rate, config.train.opt_type,config.train.grad_noise_scale, name='train_step')
tf.global_variables_initializer()
I can run sess till feature extraction conv5_3. But can't run those in Recognition and got error as FailedPreconditionError: FailedPr...onError(). What could be the problem?
graph.finalize()
with tf.variable_scope("Recognition"):
for i in range(config.train.steps):
input_data_, input_labels_, input_boxes_ = session.run([input_data, input_labels, input_boxes])
conv5_3_ = session.run([conv5_3]) #can run this line
global_step_rec_ = session.run([global_step_rec]) # got FailedPreconditionError: FailedPr...onError() error at this line
conv7_7_ = session.run([conv7_7])
h_fmap_ = session.run([h_fmap])
Now it works.
Since my graph has two parts, I need to initialize separately.
(1)First get all variables from pre-trained model to initialize with those from checkpoint.
Then initialize with tf.train.Saver.
(2)Then initialize the rest add-in layers using tf.global_variables_initializer()
My code is as follow.
#Initialization
#Initialize pre-trained model first
#Since we need to restore pre-trained model and initialize to respective variables in this current graph
#(1)make a variable list for checkpoint
#(2)initialize a saver for the variable list
#(3)then restore
#(1)
def print_tensors_in_checkpoint_file(file_name, tensor_name, all_tensors):
varlist=[]
reader = pywrap_tensorflow.NewCheckpointReader(file_name)
if all_tensors:
var_to_shape_map = reader.get_variable_to_shape_map()
for key in sorted(var_to_shape_map):
print(key)
varlist.append(key)
return varlist
varlist=print_tensors_in_checkpoint_file(file_name=config.train.init_checkpoint,all_tensors=True,tensor_name=None)
#(2)prepare the list of variables by calling variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
#countcheckpt_vars=0
#for n in tf.get_default_graph().as_graph_def().node:
# print(n.name)
#for op in tf.get_default_graph().get_operations():
# print(str(op.name))
#for var in zip(variables):
# countcheckpt_vars=countcheckpt_vars+1
#(3)
loader = tf.train.Saver(variables[:46])#since I need to initialize only 46 variables from global variables
tf.logging.info('Initialize from: ' + config.train.init_checkpoint)
sess_config = tf.ConfigProto(log_device_placement = False, allow_soft_placement = True)
if FLAGS.gpu_memory_fraction < 0:
sess_config.gpu_options.allow_growth = True
elif FLAGS.gpu_memory_fraction > 0:
sess_config.gpu_options.per_process_gpu_memory_fraction = FLAGS.gpu_memory_fraction;
session = tf.Session(graph=graph, config=sess_config)
loader.restore(session, config.train.init_checkpoint)
Then initialize the rest of variables
init = tf.global_variables_initializer()
session.run(init)
I'm trying to do my own implementation of the Advantage Actor Critic algorithm by using tensorflow. I used the code in https://github.com/BoYanSTKO/Practical_RL-coursera/blob/master/week5_policy_based/practice_a3c.ipynb as a rough template on how I should write the algorithm.
I tried it on the simple CartPole-v0 gym environment but by implementation fails badly. The critics loss just explodes and becomes way to large while the actors loss is rather low.
I'm not sure what I'm doing wrong here. Any help? :)
I've tried separating the actor and critic from each other by having 2 different networks. This did not help either. Have also tried fine tuning some stuff like gamma and learning rate without any success.
!/usr/bin/python
import tensorflow as tf
import numpy as np
import gym
import random
from tensorboardX import SummaryWriter
class ActorCritic():
def __init__(self,state_dim,n_actions,learning_rate,gamma=0.99):
with tf.variable_scope("ActorCritic"):
self.states_ph = tf.placeholder(tf.float32,(None,state_dim),name="states")
self.action_ph = tf.placeholder(tf.int32,(None,),name="actions")
self.n_actions = n_actions
self.reward_ph = tf.placeholder(tf.float32,(None,),name="rewards")
self.next_state_values = tf.placeholder(tf.float32,(None,),name="rewards")
self.is_done_ph = tf.placeholder(tf.float32,(None,),name="rewards")
net = tf.layers.dense(self.states_ph,24,activation=tf.nn.relu)
self.logits = tf.layers.dense(net,n_actions,activation=None)
self.state_values = tf.layers.dense(net,1,activation=None)
self.action_probs = tf.nn.softmax(self.logits)
self.log_prob = tf.nn.log_softmax(self.logits)
self.entropy = -tf.reduce_sum(self.action_probs*self.log_prob,axis=-1,name="entropy")
self.logp_actions = tf.reduce_sum(self.log_prob*tf.one_hot(self.action_ph,depth=n_actions),axis=-1)
self.target_state_values = self.reward_ph + gamma*(1.0-self.is_done_ph)*self.next_state_values
self.advantage = self.target_state_values - self.state_values
self.actor_loss = -tf.reduce_mean(self.logp_actions * tf.stop_gradient(self.advantage)) - 0.01*tf.reduce_mean(self.entropy)
self.critic_loss = tf.reduce_mean(self.advantage**2.0)
self.train_opt = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.actor_loss+self.critic_loss)
def train(self,states,actions,rewards,is_done,nxt_state_values_batch):
sess = tf.get_default_session()
return sess.run([self.critic_loss,self.actor_loss,self.train_opt],feed_dict={
self.next_state_values:nxt_state_values_batch,
self.states_ph:states,
self.action_ph:actions,
self.reward_ph:rewards,
self.is_done_ph:is_done})
def predict_state_values(self,states):
sess = tf.get_default_session()
return sess.run(self.state_values,feed_dict={self.states_ph:states})
def sample_actions(self,states):
sess = tf.get_default_session()
action_probs = sess.run(self.action_probs,{self.states_ph:states})
return [ np.random.choice(range(self.n_actions),p=action_prob) for action_prob in action_probs ]
class EnvBatch():
def __init__(self,env_name,n_envs):
self.envs = [gym.make(env_name) for env in range(n_envs)]
self.n_actions = self.envs[0].action_space.n
self.state_dim = self.envs[0].observation_space.shape[0]
def reset(self):
return [env.reset().tolist() for env in self.envs ]
def step(self,actions):
states_batch, rewards_batch, is_done_batch = [], [], []
for action, env in zip(actions,self.envs):
s, r , d, _ = env.step(action)
if d:
s = env.reset()
states_batch.append(s)
rewards_batch.append(r)
is_done_batch.append(d)
return np.array(states_batch), np.array(rewards_batch), np.array(is_done_batch)
def evaluate_performance(env_name,agent,nr_runs=10):
env = gym.make(env_name)
rewards = []
for _ in range(nr_runs):
state = env.reset()
is_done = False
acc_reward = 0.0
while not is_done:
action = agent.sample_actions([state])
nxt_state, reward, is_done, _ = env.step(action[0])
state = nxt_state
acc_reward += reward
rewards.append(acc_reward)
return np.mean(rewards)
tf.reset_default_graph()
env = EnvBatch("CartPole-v0",10)
agent = ActorCritic(env.state_dim,env.n_actions,learning_rate=0.001)
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
state_batch = env.reset()
writer = SummaryWriter()
for i in range(100000):
actions = agent.sample_actions(state_batch)
nxt_state_batch, rewards_batch, is_done_batch = env.step(actions)
nxt_state_values = agent.predict_state_values(nxt_state_batch).ravel()
critic_loss, actor_loss, _ = agent.train(state_batch,actions,rewards_batch,is_done_batch,nxt_state_values)
writer.add_scalar("actor_loss",actor_loss,i)
writer.add_scalar("critic_loss",critic_loss,i)
if i%50==0:
test_reward = evaluate_performance("CartPole-v0",agent)
writer.add_scalar("test_reward",test_reward,i)
if test_reward > 195:
print "Done!"
states_batch = nxt_state_batch
sess.close()
writer.close()
I'm trying to build a convolutional lstm autoencoder (that also predicts future and past) with Tensorflow, and it works to a certain degree, but the error sometimes jumps back up, so essentially, it never converges.
The model is as follows:
The encoder starts with a 64x64 frame from a 20 frame bouncing mnist video for each time step of the lstm. Every stacking layer of LSTM halfs it and increases the depth via 2x2 convolutions with a stride of 2. (so -->32x32x3 -->...--> 1x1x96)
On the other hand, the lstm performs 3x3 convolutions with a stride of 1 on its state. Both results are concatenated to form the new state. In the same way, the decoder uses transposed convolutions to go back to the original format. Then the squared error is calculated.
The error starts at around 2700 and it takes around 20 hours (geforce1060) to get down to ~1700. At which point the jumping back up (and it sometimes jumps back up to 2300 or even ridiculous values like 440300) happens often enough that I can't really get any lower. Also at that point, it can usually pinpoint where the number should be, but its too fuzzy to actually make out the digit...
I tried different learning rates and optimizers, so if anybody knows why that jumping happens, that'd make me happy :)
Here is a graph of the loss with epochs:
import tensorflow as tf
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
#based on code by loliverhennigh (Github)
class ConvCell(tf.contrib.rnn.RNNCell):
count = 0 #exists only to remove issues with variable scope
def __init__(self, shape, num_features, transpose = False):
self.shape = shape
self.num_features = num_features
self._state_is_tuple = True
self._transpose = transpose
ConvCell.count+=1
self.count = ConvCell.count
#property
def state_size(self):
return (tf.contrib.rnn.LSTMStateTuple(self.shape[0:4],self.shape[0:4]))
#property
def output_size(self):
return tf.TensorShape(self.shape[1:4])
#here comes to the actual conv lstm implementation, if transpose = true, it performs a deconvolution on the input
def __call__(self, inputs, state, scope=None):
with tf.variable_scope(scope or type(self).__name__+str(self.count)):
c, h = state
state_shape = h.shape
input_shape = inputs.shape
#filter variables and convolutions on data coming from the same cell, a time step previous
h_filters = tf.get_variable("h_filters",[3,3,state_shape[3],self.num_features])
h_filters_gates = tf.get_variable("h_filters_gates",[3,3,state_shape[3],3])
h_partial = tf.nn.conv2d(h,h_filters,[1,1,1,1],'SAME')
h_partial_gates = tf.nn.conv2d(h,h_filters_gates,[1,1,1,1],'SAME')
c_filters = tf.get_variable("c_filters",[3,3,state_shape[3],3])
c_partial = tf.nn.conv2d(c,c_filters,[1,1,1,1],'SAME')
#filters and convolutions/deconvolutions on data coming fromthe cell input
if self._transpose:
x_filters = tf.get_variable("x_filters",[2,2,self.num_features,input_shape[3]])
x_filters_gates = tf.get_variable("x_filters_gates",[2,2,3,input_shape[3]])
x_partial = tf.nn.conv2d_transpose(inputs,x_filters,[int(state_shape[0]),int(state_shape[1]),int(state_shape[2]),self.num_features],[1,2,2,1],'VALID')
x_partial_gates = tf.nn.conv2d_transpose(inputs,x_filters_gates,[int(state_shape[0]),int(state_shape[1]),int(state_shape[2]),3],[1,2,2,1],'VALID')
else:
x_filters = tf.get_variable("x_filters",[2,2,input_shape[3],self.num_features])
x_filters_gates = tf.get_variable("x_filters_gates",[2,2,input_shape[3],3])
x_partial = tf.nn.conv2d(inputs,x_filters,[1,2,2,1],'VALID')
x_partial_gates = tf.nn.conv2d(inputs,x_filters_gates,[1,2,2,1],'VALID')
#some more lstm gate business
gate_bias = tf.get_variable("gate_bias",[1,1,1,3])
h_bias = tf.get_variable("h_bias",[1,1,1,self.num_features*2])
gates = h_partial_gates + x_partial_gates + c_partial + gate_bias
i,f,o = tf.split(gates,3,axis=3)
#concatenate the units coming from the spacial and the temporal dimension to build a unified state
concat = tf.concat([h_partial,x_partial],3) + h_bias
new_c = tf.nn.relu(concat)*tf.sigmoid(i)+c*tf.sigmoid(f)
new_h = new_c * tf.sigmoid(o)
new_state = tf.contrib.rnn.LSTMStateTuple(new_c,new_h)
return new_h, new_state #its redundant, but thats how tensorflow likes it, apparently
#global variables
LEARNING_RATE = 0.005
ITERATIONS_PER_EPOCH = 80
BATCH_SIZE = 75
TEST = False #manual switch to go from training to testing
if TEST:
BATCH_SIZE = 1
inputs = tf.placeholder(tf.float32, (20, BATCH_SIZE, 64, 64,1))
shape0 = [BATCH_SIZE,64,64,2]
shape1 = [BATCH_SIZE,32,32,6]
shape2 = [BATCH_SIZE,16,16,12]
shape3 = [BATCH_SIZE,8,8,24]
shape4 = [BATCH_SIZE,4,4,48]
shape5 = [BATCH_SIZE,2,2,96]
shape6 = [BATCH_SIZE,1,1,192]
#apparently tf.multirnncell has very specific requirements for the initial states oO
initial_state1 = (tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape1),tf.zeros(shape1)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape2),tf.zeros(shape2)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape3),tf.zeros(shape3)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape4),tf.zeros(shape4)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape5),tf.zeros(shape5)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape6),tf.zeros(shape6)))
initial_state2 = (tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape5),tf.zeros(shape5)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape4),tf.zeros(shape4)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape3),tf.zeros(shape3)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape2),tf.zeros(shape2)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape1),tf.zeros(shape1)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape0),tf.zeros(shape0)))
#encoding part of the autoencoder graph
cell1 = ConvCell(shape1,3)
cell2 = ConvCell(shape2,6)
cell3 = ConvCell(shape3,12)
cell4 = ConvCell(shape4,24)
cell5 = ConvCell(shape5,48)
cell6 = ConvCell(shape6,96)
mcell = tf.contrib.rnn.MultiRNNCell([cell1,cell2,cell3,cell4,cell5,cell6])
rnn_outputs, rnn_states = tf.nn.dynamic_rnn(mcell, inputs[0:20,:,:,:],initial_state=initial_state1,dtype=tf.float32, time_major=True)
#decoding part of the autoencoder graph, forward block and backwards block
cell9a = ConvCell(shape5,48,transpose = True)
cell10a = ConvCell(shape4,24,transpose = True)
cell11a = ConvCell(shape3,12,transpose = True)
cell12a = ConvCell(shape2,6,transpose = True)
cell13a = ConvCell(shape1,3,transpose = True)
cell14a = ConvCell(shape0,1,transpose = True)
mcella = tf.contrib.rnn.MultiRNNCell([cell9a,cell10a,cell11a,cell12a,cell13a,cell14a])
cell9b = ConvCell(shape5,48,transpose = True)
cell10b = ConvCell(shape4,24,transpose = True)
cell11b= ConvCell(shape3,12,transpose = True)
cell12b = ConvCell(shape2,6,transpose = True)
cell13b = ConvCell(shape1,3,transpose = True)
cell14b = ConvCell(shape0,1,transpose = True)
mcellb = tf.contrib.rnn.MultiRNNCell([cell9b,cell10b,cell11b,cell12b,cell13b,cell14b])
def PredictionLayer(rnn_outputs,viewPoint = 11, reverse = False):
predLength = viewPoint-2 if reverse else 20-viewPoint #vision is the input for the decoder
vision = tf.concat([rnn_outputs[viewPoint-1:viewPoint,:,:,:],tf.zeros([predLength,BATCH_SIZE,1,1,192])],0)
if reverse:
rnn_outputs2, rnn_states = tf.nn.dynamic_rnn(mcellb, vision, initial_state = initial_state2, time_major=True)
else:
rnn_outputs2, rnn_states = tf.nn.dynamic_rnn(mcella, vision, initial_state = initial_state2, time_major=True)
mean = tf.reduce_mean(rnn_outputs2,4)
if TEST:
return mean
if reverse:
return tf.reduce_sum(tf.square(mean-inputs[viewPoint-2::-1,:,:,:,0]))
else:
return tf.reduce_sum(tf.square(mean-inputs[viewPoint-1:20,:,:,:,0]))
if TEST:
mean = tf.concat([PredictionLayer(rnn_outputs,11,True)[::-1,:,:,:],createPredictionLayer(rnn_outputs,11)],0)
else: #training part of the graph
error = tf.zeros([1])
for i in range(8,15): #range size of 7 or less works, 9 or more does not, no idea why
error += PredictionLayer(rnn_outputs, i)
error += PredictionLayer(rnn_outputs, i, True)
train_fn = tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE).minimize(error)
################################################################################
## TRAINING LOOP ##
################################################################################
#code based on siemanko/tf_lstm.py (Github)
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
saver = tf.train.Saver(restore_sequentially=True, allow_empty=True,)
session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
session.run(tf.global_variables_initializer())
vids = np.load("mnist_test_seq.npy") #20/10000/64/64 , moving mnist dataset from http://www.cs.toronto.edu/~nitish/unsupervised_video/
vids = vids[:,0:6000,:,:] #training set
saver.restore(session,tf.train.latest_checkpoint('./conv_lstm_multiples_v2/'))
#saver.restore(session,'.\conv_lstm_multiples\iteration-74')
for epoch in range(1000):
if TEST:
break
epoch_error = 0
#randomize batches each epoch
vids = np.swapaxes(vids,0,1)
np.random.shuffle(vids)
vids = np.swapaxes(vids,0,1)
for i in range(ITERATIONS_PER_EPOCH):
#running the graph and feeding data
err,_ = session.run([error, train_fn], {inputs: np.expand_dims(vids[:,i*BATCH_SIZE:(i+1)*BATCH_SIZE,:,:],axis=4)})
print(err)
epoch_error += err
#training error each epoch and regular saving
epoch_error /= (ITERATIONS_PER_EPOCH*BATCH_SIZE*4096*20*7)
if (epoch+1) % 5 == 0:
saver.save(session,'.\conv_lstm_multiples_v2\iteration',global_step=epoch)
print("saved")
print("Epoch %d, train error: %f" % (epoch, epoch_error))
#testing
plt.ion()
f, axarr = plt.subplots(2)
vids = np.load("mnist_test_seq.npy")
for i in range(6000,10000):
img = session.run([mean], {inputs: np.expand_dims(vids[:,i:i+1,:,:],axis=4)})
for j in range(20):
axarr[0].imshow(img[0][j,0,:,:])
axarr[1].imshow(vids[j,i,:,:])
plt.show()
plt.pause(0.1)
Usually this happens when gradients' magnitude is very high at some point and causes your network parameters to change a lot. To verify that it is indeed the case, you can produce the same plot of gradient magnitudes and see if they jump right before the loss jump. Assuming this is the case, the classic approach is to use gradient clipping (or go all the way to natural gradient).