Numpy load large file extremely slowly when tensorflow is training

Numpy load large file extremely slowly when tensorflow is training - numpy

I try training 10 tasks at the same time.(each GPU one task).
I use the shell script to start 10 python processes at the same time.
When the dataset file is small, the ten tasks almost finish the loading process at the same time, and their training process are fine and fast.
When the dataset file is large(almost 70G), then some tasks may finish the loading process faster and start to train while other tasks are still in loading process. But after some tasks get into training process, the others' loading speed will be very very slowly.And the training speed also become slowly.
I use numpy to load the dataset. The dataset will be completely loaded at first and sent into the model with tf.data.dataset.
I am wandering while the tensorflow training process will conflict with the numpy load process in other tasks and how to fix it.
def load_factor_file(in_file):
try:
key_name = os.path.basename(in_file).split('.')[0]
factor_arr = np.load(in_file)
return {key_name: factor_arr}
except Exception as e:
logger.error(f"{str(e)}")
return {}
def load_factor_files(in_files, store_dict):
for f in in_files:
fo = load_factor_file(f)
store_dict.update(fo)
del fo
def get_tensor_feature_label(version, txt_files, batch_size, run_mode, label_num, feature_np_dict, label_np_dict):
def get_data_from_cache(string_message):
feature_index, label_index, key = string_message.decode().split(',')
feature_index = int(feature_index)
label_index = int(label_index)
slice_arr = feature_np_dict[key][feature_index - FACTOR_WINDOW + 1: feature_index + 1]
label_value = label_np_dict[key][label_index]
x_data = slice_arr.astype(np.float32)
y_data = label_value.astype(np.float32)
return x_data, y_data
def parser(txt_string_in):
feat, label = tf.numpy_function(
get_data_from_cache, [txt_string_in], (tf.float32, tf.float32))
feat = tf.reshape(feat, (FACTOR_WINDOW, FACTOR_NUM_MAP[version]))
label = tf.reshape(label, (label_num,))
return feat, label
ds = tf.data.TextLineDataset.list_files(txt_files)
ds = ds.apply(
tf.data.experimental.parallel_interleave(
tf.data.TextLineDataset,
cycle_length=8,
sloppy=False))
ds = ds.prefetch(buffer_size=batch_size)
ds = ds.cache()
ds = ds.repeat()
if run_mode == 'train':
ds = ds.shuffle(buffer_size=1000)
ds = ds.apply(
tf.data.experimental.map_and_batch(
map_func=parser,
batch_size=batch_size,
num_parallel_batches=10))
ds = ds.prefetch(buffer_size=10)
iterator = tf.data.make_one_shot_iterator(ds)
features, labels = iterator.get_next()
logger.info(f"feature size {features.shape}, label size {labels.shape}")
return features, labels
def main(not_use_args):
if FLAGS.use_hvd != 0:
import horovod.tensorflow as hvd
hvd.init()
txt_path = f"{ARR_DATA_PATH_MAP[FLAGS.version]}/{FLAGS.stock}/{FLAGS.run_mode}/txt"
batch_size = FLAGS.batch_size
# model configeration
configuration = get_config(
txt_path, batch_size,
FLAGS.config_name, FLAGS.use_hvd, FLAGS.stock,
FLAGS.run_mode, FLAGS.max_per_epoch)
configuration['fn_params']["lr"] = FLAGS.lr
configuration['fn_params']["lr_decay_type"] = FLAGS.lr_decay_type
configuration['fn_params']['use_hvd'] = FLAGS.use_hvd
max_steps = FLAGS.epoch * configuration["fn_params"]["steps_per_epoch"]
configuration["fn_params"]["max_steps"] = max_steps
if FLAGS.warm_up_epochs > 0:
configuration["fn_params"]["warm_up_epochs"] = FLAGS.warm_up_epochs
configuration['model_dir'] = get_model_dir(configuration['model_dir'], configuration['name'])
configuration["fn_params"]["optimizer"] = FLAGS.optimizer
if FLAGS.train_scope != "None":
configuration["fn_params"]["train_scope"] = FLAGS.train_scope
t0_regression = create_estimator(configuration, FLAGS.threads)
# load np data
factor_files = glob.glob(f"{ARR_DATA_PATH_MAP[FLAGS.version]}/{FLAGS.stock}/{FLAGS.run_mode}/factor/{txt_round}-*.npy")
label_files = glob.glob(f"{ARR_DATA_PATH_MAP[FLAGS.version]}/{FLAGS.stock}/{FLAGS.run_mode}/label/{txt_round}-*.npy")
txt_files = glob.glob(f"{txt_path}/{txt_round}-*.txt")
logger.info(f"Loading factors = {factor_files}, label = {label_files}")
input_fn = functools.partial(
get_tensor_feature_label,
version=FLAGS.version,
txt_files=txt_files,
batch_size=batch_size,
run_mode=FLAGS.run_mode,
label_num=label_num,
feature_np_dict=feature_np_dict,
label_np_dict=label_np_dict)
log_dir = configuration["model_dir"] + '/tf_logs'
logging_hooks = get_log_hooks(
log_dir=log_dir,
save_steps=configuration["fn_params"]["steps_per_epoch"],
run_mode=FLAGS.run_mode,
test_total_num=FLAGS.test_total_num
)
all_hooks = logging_hooks
t0_regression.train(
input_fn=input_fn,
max_steps=max_steps,
hooks=all_hooks)
if __name__ == '__main__':
tf.app.run()

Related

GoogleNet fails to classify images

I built Keras Google Net from here:
https://www.analyticsvidhya.com/blog/2018/10/understanding-inception-network-from-scratch/
The only difference is that I replaced 1000 classes in output layers with 3. data is prepared this way :
def grey_preprocessor (xarray):
xarray=(xarray/127.5)-1
return xarray
img_resol = (224,224)
train_batches = ImageDataGenerator(horizontal_flip = True, preprocessing_function = grey_preprocessor).flow_from_directory(
directory = train_path, target_size=img_resol, classes = ['bacterial', 'healthy', 'viral'], batch_size = 10)
valid_batches = ImageDataGenerator(horizontal_flip = True, preprocessing_function = grey_preprocessor).flow_from_directory(
directory = valid_path, target_size=img_resol, classes = ['bacterial', 'healthy', 'viral'], batch_size = 10)
test_batches = ImageDataGenerator(horizontal_flip = True, preprocessing_function = grey_preprocessor).flow_from_directory(
directory = test_path, target_size=img_resol, classes = ['bacterial', 'healthy', 'viral'], batch_size = 10, shuffle = False)
assert train_batches.n == 4222
assert valid_batches.n == 300
assert test_batches.n == 150
assert train_batches.num_classes == valid_batches.num_classes == test_batches.num_classes == 3
I train it like this:
history = model.fit(train_batches, validation_data=valid_batches, epochs=epochs, batch_size=256, callbacks=[lr_sc])
However, all the accuracies on every batch are 0.3333, which means it doesn't classify at all. I understand that it can be anything. What is a good way to troubleshoot it?

If you want to normalize your grayscale image use this!
def gray_preprocessor (xarray):
xarray=xarray/255.0
return xarray
or you can also use lambda function:
gray_preprocessor = lambda xarray : xarray / 255.0

Resource exhausted: OOM model.fit in foor loop grid search cross validation

I am trying to do a grid search by calling model.fit recursively for different parameters of my model.
I get a resource exhausted error in tensorflow. In spite of doing del model and tf.keras.backend.clear_session() at the end of the loop. This is my code
def kfoldsplit(FRAME_PATH, MASK_PATH,k):
kfold = []
all_frames = os.listdir(FRAME_PATH)
all_masks = os.listdir(MASK_PATH)
all_frames.sort(key=lambda var: [int(x) if x.isdigit() else x
for x in re.findall(r'[^0-9]|[0-9]+', var)])
all_masks.sort(key=lambda var: [int(x) if x.isdigit() else x
for x in re.findall(r'[^0-9]|[0-9]+', var)])
random.seed(230)
random.shuffle(all_frames)
# Generate train, val, and test sets for frames
train_split = int(0.8 * len(all_frames))
#val_split = int(0.9 * len(all_frames))
#test_split = int(0.9 * len(all_frames))
train_frames = all_frames[:train_split]
#val_frames = all_frames[train_split:val_split]
test_frames = all_frames[train_split:]
# Generate corresponding mask lists for masks
train_masks = [f for f in all_masks if 'image_' + f[6:16] + 'dcm' in train_frames]
#val_masks = [f for f in all_masks if 'image_' + f[6:16] + 'dcm' in val_frames]
test_masks = [f for f in all_masks if 'image_' + f[6:16] + 'dcm' in test_frames]
size_of_subset =int(len(train_masks)/k)
for i in range (0,k):
subset = (train_frames[i*size_of_subset:(i+1)*size_of_subset],train_masks[i*size_of_subset:(i+1)*size_of_subset])
kfold.append(subset)
return kfold, (test_frames,test_masks)
def get_model_name(k):
return 'model_'+str(k)+'.hdf5'
def float_range(start, stop, step):
while start < stop:
yield float(start)
start += decimal.Decimal(step)
frames_path = 'C:/Datasets/elderlymen1/2d/images'
masks_path = 'C:/Datasets/elderlymen1/2d/FASCIA_FILLED'
kf = kfoldsplit(frames_path, masks_path, 10)
def crossvalidation(epoch,kf, loops):
VALIDATION_ACCURACY = []
VALIDATION_LOSS = []
Params=[]
save_dir = 'C:/saved_models/'
fold_var = 1
i=0
for i in float_range(0,1,0.1):
for j in float_range(1e-6,1e-3,1e-6):
#while i <= loops:
#_alpha = random.uniform(0, 1)
#lrate = random.uniform(1e-3, 1e-6)
_alpha = i
lrate = j
Params.append([_alpha,lrate])
for subset in kf[0]:
list_IDs = subset[0]
train_data_generator = DataGenerator2(list_IDs, frames_path, masks_path, to_fit=True, batch_size=2,
dim=(512, 512), dimy=(512, 512), n_channels=1, n_classes=2, shuffle=True,
data_gen_args=data_gen_args_dict)
list_IDs = kf[1][0]
valid_data_generator = DataGenerator(list_IDs, frames_path, masks_path, to_fit=True, batch_size=2,
dim=(512, 512), dimy=(512, 512), n_channels=1, n_classes=2, shuffle=True)
# CREATE NEW MODEL
model = unet(pretrained_weights='csa/unet_ThighOuterSurface.hdf5')
# COMPILE NEW MODEL
model.compile(optimizer=Adam(lr=lrate), loss=combo_loss(alpha=_alpha, beta=0.4), metrics=[dice_accuracy])
# CREATE CALLBACKS
checkpoint = tf.keras.callbacks.ModelCheckpoint(save_dir + get_model_name(fold_var),
monitor='val_loss', verbose=1,
save_best_only=True, mode='max')
callbacks_list = [checkpoint]
# There can be other callbacks, but just showing one because it involves the model name
# This saves the best model
# FIT THE MODEL
history = model.fit(train_data_generator, validation_steps=len(valid_data_generator), steps_per_epoch=len(train_data_generator),
epochs=epoch,
callbacks=callbacks_list,
validation_data=valid_data_generator)
# PLOT HISTORY
# :
# :
# LOAD BEST MODEL to evaluate the performance of the model
model.load_weights("C:/saved_models/model_" + str(fold_var) + ".hdf5")
results = model.evaluate(valid_data_generator)
results = dict(zip(model.metrics_names, results))
VALIDATION_ACCURACY.append(results['dice_accuracy'])
VALIDATION_LOSS.append(results['loss'])
tf.keras.backend.clear_session()
fold_var += 1
del model
#i+=1
print(VALIDATION_ACCURACY)
print(Params)
sample = open('metrics.txt', '+r')
print(VALIDATION_ACCURACY, file=sample)
print(Params, file=sample)
print('...',file=sample)
sample.close()
crossvalidation(15,kf, 2)
Why is the memory still exhausted and how can I release it. Or if it is not possible, is there another option for a grid search and cross validation for an image segmentation model?
Thank you

After trying everything I found in order to release memory, then only thing that solved the problem was adding
del model
gc.collect()
at the end of the for loop

FailedPreconditionError: FailedPr...onError()

I have FailedPreconditionError when running sess.
My network has two different parts, pretrained-network and new add in Recognition network.
Pretrained model is used to extract features and the feature is used to train again for recognition.
In my code, pre-trained model is loaded first.
graph = tf.Graph()
with graph.as_default():
input_data, input_labels, input_boxes = input_train_data.input_fn()
input_boxes = tf.reshape(input_boxes,[input_boxes.shape[0]*2,-1])#convert from Nx8 to 2Nx4
# build model and loss
net = Net(input_data, is_training = False)
f_saver = tf.train.Saver(max_to_keep=1000, write_version=tf.train.SaverDef.V2, save_relative_paths=True)
sess_config = tf.ConfigProto(log_device_placement = False, allow_soft_placement = True)
if FLAGS.gpu_memory_fraction < 0:
sess_config.gpu_options.allow_growth = True
elif FLAGS.gpu_memory_fraction > 0:
sess_config.gpu_options.per_process_gpu_memory_fraction = FLAGS.gpu_memory_fraction;
session = tf.Session(graph=graph, config=sess_config)
tf.logging.info('Initialize from: ' + config.train.init_checkpoint)
f_saver.restore(session, config.train.init_checkpoint)
f_saver restores the pre-trained model.
Then feature conv5_3 is extracted and fed into Recognition network.
conv5_3 = net.end_points['conv5_3']
with tf.variable_scope("Recognition"):
global_step_rec = tf.Variable(0, name='global_step_rec', trainable=False)
#Pass through recognition net
r_net = regnet.ConstructRecNet(conv5_3)
conv7_7 = r_net.end_points['pool7']
#implement ROI Pooling
#input boxes be in x1, y1, x2, y2
h_fmap = tf.dtypes.cast(tf.shape(conv7_7)[1],tf.float32)
w_fmap = tf.dtypes.cast(tf.shape(conv7_7)[2],tf.float32)
#remap boxes at input images to feature mats
#input_boxes = input_boxes / tf.constant([config.train.input_shape[0], config.train.input_shape[0],\
# config.train.input_shape[0], config.train.input_shape[0]], dtype=tf.float32)#Normalize with image size first
remap_boxes=tf.matmul(input_boxes,tf.diag([w_fmap,h_fmap,w_fmap,h_fmap]))
#put first column with image indexes
rows = tf.expand_dims(tf.range(remap_boxes.shape[0]), 1)/2
add_index = tf.concat([tf.cast(rows,tf.float32),remap_boxes],-1)
index = tf.not_equal(tf.reduce_sum(add_index[:,4:],axis=1),0)
remap_boxes = tf.gather_nd(add_index,tf.where(index))
remap_boxes=tf.dtypes.cast(remap_boxes,tf.int32)
prob = roi_pooling(conv7_7, remap_boxes, pool_height=1, pool_width=28)
#Get features for CTC training
prob = tf.transpose(prob, (1, 0, 2)) # prepare for CTC
data_length = tf.fill([tf.shape(prob)[1]], tf.shape(prob)[0]) # input seq length, batch size
ctc = tf.py_func(CTCUtils.compute_ctc_from_labels, [input_labels], [tf.int64, tf.int64, tf.int64])
ctc_labels = tf.to_int32(tf.SparseTensor(ctc[0], ctc[1], ctc[2]))
predictions = tf.to_int32(tf.nn.ctc_beam_search_decoder(prob, data_length, merge_repeated=False, beam_width=10)[0][0])
tf.sparse_tensor_to_dense(predictions, default_value=-1, name='d_predictions')
tf.reduce_mean(tf.edit_distance(predictions, ctc_labels, normalize=False), name='error_rate')
loss = tf.reduce_mean(tf.compat.v1.nn.ctc_loss(inputs=prob, labels=ctc_labels, sequence_length=data_length, ctc_merge_repeated=True), name='loss')
learning_rate = tf.train.piecewise_constant(global_step_rec, [150000, 200000],[config.train.learning_rate, 0.1 * config.train.learning_rate,0.01 * config.train.learning_rate])
opt_loss = tf.contrib.layers.optimize_loss(loss, global_step_rec, learning_rate, config.train.opt_type,config.train.grad_noise_scale, name='train_step')
tf.global_variables_initializer()
I can run sess till feature extraction conv5_3. But can't run those in Recognition and got error as FailedPreconditionError: FailedPr...onError(). What could be the problem?
graph.finalize()
with tf.variable_scope("Recognition"):
for i in range(config.train.steps):
input_data_, input_labels_, input_boxes_ = session.run([input_data, input_labels, input_boxes])
conv5_3_ = session.run([conv5_3]) #can run this line
global_step_rec_ = session.run([global_step_rec]) # got FailedPreconditionError: FailedPr...onError() error at this line
conv7_7_ = session.run([conv7_7])
h_fmap_ = session.run([h_fmap])

Now it works.
Since my graph has two parts, I need to initialize separately.
(1)First get all variables from pre-trained model to initialize with those from checkpoint.
Then initialize with tf.train.Saver.
(2)Then initialize the rest add-in layers using tf.global_variables_initializer()
My code is as follow.
#Initialization
#Initialize pre-trained model first
#Since we need to restore pre-trained model and initialize to respective variables in this current graph
#(1)make a variable list for checkpoint
#(2)initialize a saver for the variable list
#(3)then restore
#(1)
def print_tensors_in_checkpoint_file(file_name, tensor_name, all_tensors):
varlist=[]
reader = pywrap_tensorflow.NewCheckpointReader(file_name)
if all_tensors:
var_to_shape_map = reader.get_variable_to_shape_map()
for key in sorted(var_to_shape_map):
print(key)
varlist.append(key)
return varlist
varlist=print_tensors_in_checkpoint_file(file_name=config.train.init_checkpoint,all_tensors=True,tensor_name=None)
#(2)prepare the list of variables by calling variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
#countcheckpt_vars=0
#for n in tf.get_default_graph().as_graph_def().node:
# print(n.name)
#for op in tf.get_default_graph().get_operations():
# print(str(op.name))
#for var in zip(variables):
# countcheckpt_vars=countcheckpt_vars+1
#(3)
loader = tf.train.Saver(variables[:46])#since I need to initialize only 46 variables from global variables
tf.logging.info('Initialize from: ' + config.train.init_checkpoint)
sess_config = tf.ConfigProto(log_device_placement = False, allow_soft_placement = True)
if FLAGS.gpu_memory_fraction < 0:
sess_config.gpu_options.allow_growth = True
elif FLAGS.gpu_memory_fraction > 0:
sess_config.gpu_options.per_process_gpu_memory_fraction = FLAGS.gpu_memory_fraction;
session = tf.Session(graph=graph, config=sess_config)
loader.restore(session, config.train.init_checkpoint)
Then initialize the rest of variables
init = tf.global_variables_initializer()
session.run(init)

A2C is not working due to critic loss is not converging

I'm trying to do my own implementation of the Advantage Actor Critic algorithm by using tensorflow. I used the code in https://github.com/BoYanSTKO/Practical_RL-coursera/blob/master/week5_policy_based/practice_a3c.ipynb as a rough template on how I should write the algorithm.
I tried it on the simple CartPole-v0 gym environment but by implementation fails badly. The critics loss just explodes and becomes way to large while the actors loss is rather low.
I'm not sure what I'm doing wrong here. Any help? :)
I've tried separating the actor and critic from each other by having 2 different networks. This did not help either. Have also tried fine tuning some stuff like gamma and learning rate without any success.
!/usr/bin/python
import tensorflow as tf
import numpy as np
import gym
import random
from tensorboardX import SummaryWriter
class ActorCritic():
def __init__(self,state_dim,n_actions,learning_rate,gamma=0.99):
with tf.variable_scope("ActorCritic"):
self.states_ph = tf.placeholder(tf.float32,(None,state_dim),name="states")
self.action_ph = tf.placeholder(tf.int32,(None,),name="actions")
self.n_actions = n_actions
self.reward_ph = tf.placeholder(tf.float32,(None,),name="rewards")
self.next_state_values = tf.placeholder(tf.float32,(None,),name="rewards")
self.is_done_ph = tf.placeholder(tf.float32,(None,),name="rewards")
net = tf.layers.dense(self.states_ph,24,activation=tf.nn.relu)
self.logits = tf.layers.dense(net,n_actions,activation=None)
self.state_values = tf.layers.dense(net,1,activation=None)
self.action_probs = tf.nn.softmax(self.logits)
self.log_prob = tf.nn.log_softmax(self.logits)
self.entropy = -tf.reduce_sum(self.action_probs*self.log_prob,axis=-1,name="entropy")
self.logp_actions = tf.reduce_sum(self.log_prob*tf.one_hot(self.action_ph,depth=n_actions),axis=-1)
self.target_state_values = self.reward_ph + gamma*(1.0-self.is_done_ph)*self.next_state_values
self.advantage = self.target_state_values - self.state_values
self.actor_loss = -tf.reduce_mean(self.logp_actions * tf.stop_gradient(self.advantage)) - 0.01*tf.reduce_mean(self.entropy)
self.critic_loss = tf.reduce_mean(self.advantage**2.0)
self.train_opt = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.actor_loss+self.critic_loss)
def train(self,states,actions,rewards,is_done,nxt_state_values_batch):
sess = tf.get_default_session()
return sess.run([self.critic_loss,self.actor_loss,self.train_opt],feed_dict={
self.next_state_values:nxt_state_values_batch,
self.states_ph:states,
self.action_ph:actions,
self.reward_ph:rewards,
self.is_done_ph:is_done})
def predict_state_values(self,states):
sess = tf.get_default_session()
return sess.run(self.state_values,feed_dict={self.states_ph:states})
def sample_actions(self,states):
sess = tf.get_default_session()
action_probs = sess.run(self.action_probs,{self.states_ph:states})
return [ np.random.choice(range(self.n_actions),p=action_prob) for action_prob in action_probs ]
class EnvBatch():
def __init__(self,env_name,n_envs):
self.envs = [gym.make(env_name) for env in range(n_envs)]
self.n_actions = self.envs[0].action_space.n
self.state_dim = self.envs[0].observation_space.shape[0]
def reset(self):
return [env.reset().tolist() for env in self.envs ]
def step(self,actions):
states_batch, rewards_batch, is_done_batch = [], [], []
for action, env in zip(actions,self.envs):
s, r , d, _ = env.step(action)
if d:
s = env.reset()
states_batch.append(s)
rewards_batch.append(r)
is_done_batch.append(d)
return np.array(states_batch), np.array(rewards_batch), np.array(is_done_batch)
def evaluate_performance(env_name,agent,nr_runs=10):
env = gym.make(env_name)
rewards = []
for _ in range(nr_runs):
state = env.reset()
is_done = False
acc_reward = 0.0
while not is_done:
action = agent.sample_actions([state])
nxt_state, reward, is_done, _ = env.step(action[0])
state = nxt_state
acc_reward += reward
rewards.append(acc_reward)
return np.mean(rewards)
tf.reset_default_graph()
env = EnvBatch("CartPole-v0",10)
agent = ActorCritic(env.state_dim,env.n_actions,learning_rate=0.001)
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
state_batch = env.reset()
writer = SummaryWriter()
for i in range(100000):
actions = agent.sample_actions(state_batch)
nxt_state_batch, rewards_batch, is_done_batch = env.step(actions)
nxt_state_values = agent.predict_state_values(nxt_state_batch).ravel()
critic_loss, actor_loss, _ = agent.train(state_batch,actions,rewards_batch,is_done_batch,nxt_state_values)
writer.add_scalar("actor_loss",actor_loss,i)
writer.add_scalar("critic_loss",critic_loss,i)
if i%50==0:
test_reward = evaluate_performance("CartPole-v0",agent)
writer.add_scalar("test_reward",test_reward,i)
if test_reward > 195:
print "Done!"
states_batch = nxt_state_batch
sess.close()
writer.close()

(De-)Convutional lstm autoencoder - error jumps

I'm trying to build a convolutional lstm autoencoder (that also predicts future and past) with Tensorflow, and it works to a certain degree, but the error sometimes jumps back up, so essentially, it never converges.
The model is as follows:
The encoder starts with a 64x64 frame from a 20 frame bouncing mnist video for each time step of the lstm. Every stacking layer of LSTM halfs it and increases the depth via 2x2 convolutions with a stride of 2. (so -->32x32x3 -->...--> 1x1x96)
On the other hand, the lstm performs 3x3 convolutions with a stride of 1 on its state. Both results are concatenated to form the new state. In the same way, the decoder uses transposed convolutions to go back to the original format. Then the squared error is calculated.
The error starts at around 2700 and it takes around 20 hours (geforce1060) to get down to ~1700. At which point the jumping back up (and it sometimes jumps back up to 2300 or even ridiculous values like 440300) happens often enough that I can't really get any lower. Also at that point, it can usually pinpoint where the number should be, but its too fuzzy to actually make out the digit...
I tried different learning rates and optimizers, so if anybody knows why that jumping happens, that'd make me happy :)
Here is a graph of the loss with epochs:
import tensorflow as tf
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
#based on code by loliverhennigh (Github)
class ConvCell(tf.contrib.rnn.RNNCell):
count = 0 #exists only to remove issues with variable scope
def __init__(self, shape, num_features, transpose = False):
self.shape = shape
self.num_features = num_features
self._state_is_tuple = True
self._transpose = transpose
ConvCell.count+=1
self.count = ConvCell.count
#property
def state_size(self):
return (tf.contrib.rnn.LSTMStateTuple(self.shape[0:4],self.shape[0:4]))
#property
def output_size(self):
return tf.TensorShape(self.shape[1:4])
#here comes to the actual conv lstm implementation, if transpose = true, it performs a deconvolution on the input
def __call__(self, inputs, state, scope=None):
with tf.variable_scope(scope or type(self).__name__+str(self.count)):
c, h = state
state_shape = h.shape
input_shape = inputs.shape
#filter variables and convolutions on data coming from the same cell, a time step previous
h_filters = tf.get_variable("h_filters",[3,3,state_shape[3],self.num_features])
h_filters_gates = tf.get_variable("h_filters_gates",[3,3,state_shape[3],3])
h_partial = tf.nn.conv2d(h,h_filters,[1,1,1,1],'SAME')
h_partial_gates = tf.nn.conv2d(h,h_filters_gates,[1,1,1,1],'SAME')
c_filters = tf.get_variable("c_filters",[3,3,state_shape[3],3])
c_partial = tf.nn.conv2d(c,c_filters,[1,1,1,1],'SAME')
#filters and convolutions/deconvolutions on data coming fromthe cell input
if self._transpose:
x_filters = tf.get_variable("x_filters",[2,2,self.num_features,input_shape[3]])
x_filters_gates = tf.get_variable("x_filters_gates",[2,2,3,input_shape[3]])
x_partial = tf.nn.conv2d_transpose(inputs,x_filters,[int(state_shape[0]),int(state_shape[1]),int(state_shape[2]),self.num_features],[1,2,2,1],'VALID')
x_partial_gates = tf.nn.conv2d_transpose(inputs,x_filters_gates,[int(state_shape[0]),int(state_shape[1]),int(state_shape[2]),3],[1,2,2,1],'VALID')
else:
x_filters = tf.get_variable("x_filters",[2,2,input_shape[3],self.num_features])
x_filters_gates = tf.get_variable("x_filters_gates",[2,2,input_shape[3],3])
x_partial = tf.nn.conv2d(inputs,x_filters,[1,2,2,1],'VALID')
x_partial_gates = tf.nn.conv2d(inputs,x_filters_gates,[1,2,2,1],'VALID')
#some more lstm gate business
gate_bias = tf.get_variable("gate_bias",[1,1,1,3])
h_bias = tf.get_variable("h_bias",[1,1,1,self.num_features*2])
gates = h_partial_gates + x_partial_gates + c_partial + gate_bias
i,f,o = tf.split(gates,3,axis=3)
#concatenate the units coming from the spacial and the temporal dimension to build a unified state
concat = tf.concat([h_partial,x_partial],3) + h_bias
new_c = tf.nn.relu(concat)*tf.sigmoid(i)+c*tf.sigmoid(f)
new_h = new_c * tf.sigmoid(o)
new_state = tf.contrib.rnn.LSTMStateTuple(new_c,new_h)
return new_h, new_state #its redundant, but thats how tensorflow likes it, apparently
#global variables
LEARNING_RATE = 0.005
ITERATIONS_PER_EPOCH = 80
BATCH_SIZE = 75
TEST = False #manual switch to go from training to testing
if TEST:
BATCH_SIZE = 1
inputs = tf.placeholder(tf.float32, (20, BATCH_SIZE, 64, 64,1))
shape0 = [BATCH_SIZE,64,64,2]
shape1 = [BATCH_SIZE,32,32,6]
shape2 = [BATCH_SIZE,16,16,12]
shape3 = [BATCH_SIZE,8,8,24]
shape4 = [BATCH_SIZE,4,4,48]
shape5 = [BATCH_SIZE,2,2,96]
shape6 = [BATCH_SIZE,1,1,192]
#apparently tf.multirnncell has very specific requirements for the initial states oO
initial_state1 = (tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape1),tf.zeros(shape1)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape2),tf.zeros(shape2)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape3),tf.zeros(shape3)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape4),tf.zeros(shape4)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape5),tf.zeros(shape5)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape6),tf.zeros(shape6)))
initial_state2 = (tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape5),tf.zeros(shape5)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape4),tf.zeros(shape4)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape3),tf.zeros(shape3)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape2),tf.zeros(shape2)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape1),tf.zeros(shape1)),tf.contrib.rnn.LSTMStateTuple(tf.zeros(shape0),tf.zeros(shape0)))
#encoding part of the autoencoder graph
cell1 = ConvCell(shape1,3)
cell2 = ConvCell(shape2,6)
cell3 = ConvCell(shape3,12)
cell4 = ConvCell(shape4,24)
cell5 = ConvCell(shape5,48)
cell6 = ConvCell(shape6,96)
mcell = tf.contrib.rnn.MultiRNNCell([cell1,cell2,cell3,cell4,cell5,cell6])
rnn_outputs, rnn_states = tf.nn.dynamic_rnn(mcell, inputs[0:20,:,:,:],initial_state=initial_state1,dtype=tf.float32, time_major=True)
#decoding part of the autoencoder graph, forward block and backwards block
cell9a = ConvCell(shape5,48,transpose = True)
cell10a = ConvCell(shape4,24,transpose = True)
cell11a = ConvCell(shape3,12,transpose = True)
cell12a = ConvCell(shape2,6,transpose = True)
cell13a = ConvCell(shape1,3,transpose = True)
cell14a = ConvCell(shape0,1,transpose = True)
mcella = tf.contrib.rnn.MultiRNNCell([cell9a,cell10a,cell11a,cell12a,cell13a,cell14a])
cell9b = ConvCell(shape5,48,transpose = True)
cell10b = ConvCell(shape4,24,transpose = True)
cell11b= ConvCell(shape3,12,transpose = True)
cell12b = ConvCell(shape2,6,transpose = True)
cell13b = ConvCell(shape1,3,transpose = True)
cell14b = ConvCell(shape0,1,transpose = True)
mcellb = tf.contrib.rnn.MultiRNNCell([cell9b,cell10b,cell11b,cell12b,cell13b,cell14b])
def PredictionLayer(rnn_outputs,viewPoint = 11, reverse = False):
predLength = viewPoint-2 if reverse else 20-viewPoint #vision is the input for the decoder
vision = tf.concat([rnn_outputs[viewPoint-1:viewPoint,:,:,:],tf.zeros([predLength,BATCH_SIZE,1,1,192])],0)
if reverse:
rnn_outputs2, rnn_states = tf.nn.dynamic_rnn(mcellb, vision, initial_state = initial_state2, time_major=True)
else:
rnn_outputs2, rnn_states = tf.nn.dynamic_rnn(mcella, vision, initial_state = initial_state2, time_major=True)
mean = tf.reduce_mean(rnn_outputs2,4)
if TEST:
return mean
if reverse:
return tf.reduce_sum(tf.square(mean-inputs[viewPoint-2::-1,:,:,:,0]))
else:
return tf.reduce_sum(tf.square(mean-inputs[viewPoint-1:20,:,:,:,0]))
if TEST:
mean = tf.concat([PredictionLayer(rnn_outputs,11,True)[::-1,:,:,:],createPredictionLayer(rnn_outputs,11)],0)
else: #training part of the graph
error = tf.zeros([1])
for i in range(8,15): #range size of 7 or less works, 9 or more does not, no idea why
error += PredictionLayer(rnn_outputs, i)
error += PredictionLayer(rnn_outputs, i, True)
train_fn = tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE).minimize(error)
################################################################################
## TRAINING LOOP ##
################################################################################
#code based on siemanko/tf_lstm.py (Github)
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
saver = tf.train.Saver(restore_sequentially=True, allow_empty=True,)
session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
session.run(tf.global_variables_initializer())
vids = np.load("mnist_test_seq.npy") #20/10000/64/64 , moving mnist dataset from http://www.cs.toronto.edu/~nitish/unsupervised_video/
vids = vids[:,0:6000,:,:] #training set
saver.restore(session,tf.train.latest_checkpoint('./conv_lstm_multiples_v2/'))
#saver.restore(session,'.\conv_lstm_multiples\iteration-74')
for epoch in range(1000):
if TEST:
break
epoch_error = 0
#randomize batches each epoch
vids = np.swapaxes(vids,0,1)
np.random.shuffle(vids)
vids = np.swapaxes(vids,0,1)
for i in range(ITERATIONS_PER_EPOCH):
#running the graph and feeding data
err,_ = session.run([error, train_fn], {inputs: np.expand_dims(vids[:,i*BATCH_SIZE:(i+1)*BATCH_SIZE,:,:],axis=4)})
print(err)
epoch_error += err
#training error each epoch and regular saving
epoch_error /= (ITERATIONS_PER_EPOCH*BATCH_SIZE*4096*20*7)
if (epoch+1) % 5 == 0:
saver.save(session,'.\conv_lstm_multiples_v2\iteration',global_step=epoch)
print("saved")
print("Epoch %d, train error: %f" % (epoch, epoch_error))
#testing
plt.ion()
f, axarr = plt.subplots(2)
vids = np.load("mnist_test_seq.npy")
for i in range(6000,10000):
img = session.run([mean], {inputs: np.expand_dims(vids[:,i:i+1,:,:],axis=4)})
for j in range(20):
axarr[0].imshow(img[0][j,0,:,:])
axarr[1].imshow(vids[j,i,:,:])
plt.show()
plt.pause(0.1)

Usually this happens when gradients' magnitude is very high at some point and causes your network parameters to change a lot. To verify that it is indeed the case, you can produce the same plot of gradient magnitudes and see if they jump right before the loss jump. Assuming this is the case, the classic approach is to use gradient clipping (or go all the way to natural gradient).

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

Numpy load large file extremely slowly when tensorflow is training - numpy

Related

GoogleNet fails to classify images

Resource exhausted: OOM model.fit in foor loop grid search cross validation

FailedPreconditionError: FailedPr...onError()

A2C is not working due to critic loss is not converging

(De-)Convutional lstm autoencoder - error jumps

Categories

Resources