Colaboratory VM restarts like clockwork every 45 minutes - google-colaboratory

I'm running Python 2 on a GPU-enabled instance. I am training an LSTM and saving it every 10 cycles (epochs). Without fail, the VM restarts every 45 minutes (just before 50 cycles are completed). This has been happening for several days, both on my home Wi-Fi (Comcast) and my work Wi-Fi. I suspect the problem is something native to Google's settings or the notebook settings, but I can't find any setting that controls this.
My question is: has anyone encountered this? How did you resolve it?
I've included my code below, but I don't think this is code-related. It fails in the last if epoch % epoch_saving_period ... block.
pickle.dump((seq_length, save_dir), open('params.p', 'wb'))
batches = get_batches(corpus_int, batch_size, seq_length)
num_batches = len(batches)
start_time = time.time()
print "Process started"

last_checkpoint_prefix = '/tmp/pretrained.ckpt-' + str(last_epoch)
tf.reset_default_graph()

with tf.Session(graph=train_graph, config=config) as sess:
    saver = tf.train.Saver(tf.global_variables())
    #tf.add_to_collection('train_op', train_op)

    # If you're loading in a saved model, use the following
    if (last_epoch > 0):
        #saver = tf.train.import_meta_graph(last_checkpoint_prefix + '.meta')
        saver.restore(sess, tf.train.latest_checkpoint('/tmp/'))
        sess.run(tf.local_variables_initializer())
    else:
        # If you're running a fresh session, use the following
        sess.run(tf.global_variables_initializer())

    input_text = train_graph.get_tensor_by_name('input:0')
    initial_state = train_graph.get_tensor_by_name('initial_state:0')
    final_state = train_graph.get_tensor_by_name('final_state:0')
    probs = train_graph.get_tensor_by_name('probs:0')
    targets = train_graph.get_tensor_by_name('targets:0')
    lr = train_graph.get_tensor_by_name('learning_rate:0')

    #init_from_checkpoint('/tmp/pretrained.ckpt', {'input': 'input',
    #                                              'final_state': 'initial_state',
    #                                              'targets': 'targets',
    #                                              'learning_rate': 'learning_rate'})

    epochList = []
    lossList = []
    epoch_saving_period = 10
    epoch = 0
    for epoch in range(last_epoch, (last_epoch + num_epochs)):
        state = sess.run(initial_state, {input_text: batches[0][0]})

        for batch_index, (x, y) in enumerate(batches):
            feed_dict = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate * math.exp(-1 * epoch / 1000)  # feed the decayed learning rate to the placeholder
            }
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed_dict)
            time_elapsed = time.time() - start_time
            print('Epoch {:>3} Batch {:>4}/{} train_loss = {:.3f} time_elapsed = {:.3f}'.format(
                epoch + 1,
                batch_index + 1,
                len(batches),
                train_loss,
                time_elapsed
                #((num_batches * num_epochs)/((epoch + 1) * (batch_index + 1))) * time_elapsed - time_elapsed),
            ))

        epochList.append(epoch)
        lossList.append(train_loss)

        # save model every 10 epochs
        if epoch % epoch_saving_period == 0:
            last_epoch = epoch - epoch_saving_period
            #saver = tf.train.Saver()
            #saver.save(sess, save_dir)
            savePath = saver.save(sess, "/tmp/pretrained.ckpt", global_step=epoch, write_meta_graph=True)

            # Copy the file to our new bucket.
            # Full reference: https://cloud.google.com/storage/docs/gsutil/commands/cp
            !gsutil cp /tmp/checkpoint gs://{bucket_name}/
            !gsutil cp /tmp/pretrained.ckpt-{epoch}.index gs://{bucket_name}/
            !gsutil cp /tmp/pretrained.ckpt-{epoch}.meta gs://{bucket_name}/
            !gsutil cp /tmp/pretrained.ckpt-{epoch}.data-00000-of-00001 gs://{bucket_name}/
            !gsutil rm gs://{bucket_name}/pretrained.ckpt-{last_epoch}.index
            !gsutil rm gs://{bucket_name}/pretrained.ckpt-{last_epoch}.meta
            !gsutil rm gs://{bucket_name}/pretrained.ckpt-{last_epoch}.data-00000-of-00001
            print('Model Trained and Saved')

        if epoch % 5 == 0:
            plt.plot(epochList, lossList)
            plt.title('Train Loss')
            plt.show()
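For reference, resuming after one of these resets with this setup would normally start by copying the newest checkpoint files from the bucket back into /tmp before saver.restore() runs. The cell below is only a sketch under the question's own naming (bucket_name as above); last_saved_epoch is a hypothetical variable holding the epoch number of the newest checkpoint in the bucket, not something from the original post.

# Hypothetical recovery cell: run after a VM reset, before the training cell above.
last_saved_epoch = 40  # illustrative value only; set to the newest checkpoint's epoch
!gsutil cp gs://{bucket_name}/checkpoint /tmp/
!gsutil cp gs://{bucket_name}/pretrained.ckpt-{last_saved_epoch}.index /tmp/
!gsutil cp gs://{bucket_name}/pretrained.ckpt-{last_saved_epoch}.meta /tmp/
!gsutil cp gs://{bucket_name}/pretrained.ckpt-{last_saved_epoch}.data-00000-of-00001 /tmp/
# tf.train.latest_checkpoint('/tmp/') in the training cell can then find these files.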

Related

How to solve TypeError: can’t convert CUDA tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first

I am modifying the train_model function below so that it plots loss and accuracy graphs at every epoch during training:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    losses = []
    accuracies = []
    y_loss = {}  # loss history
    y_loss['aug1_train'] = []
    y_loss['valid'] = []
    y_acc = {}
    y_acc['aug1_train'] = []
    y_acc['valid'] = []
    x_epoch = []
    fig = plt.figure()
    ax0 = fig.add_subplot(121, title="loss")
    ax1 = fig.add_subplot(122, title="accuracy")

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['aug1_train', 'valid']:
            if phase == 'aug1_train':
                scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels, paths in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'aug1_train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'aug1_train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f} '.format(
                phase, epoch_loss, epoch_acc))
            y_loss[phase].append(epoch_loss)
            y_acc[phase].append(epoch_acc)

            # deep copy the model
            if phase == 'valid' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

            def draw_curve(current_epoch):
                x_epoch.append(current_epoch)
                ax0.plot(x_epoch, y_loss['aug1_train'], 'bo-', label='train')
                ax0.plot(x_epoch, y_loss['valid'], 'ro-', label='val')
                ax1.plot(x_epoch, y_acc['aug1_train'], 'bo-', label='train')
                ax1.plot(x_epoch, y_acc['valid'], 'ro-', label='val')
                if current_epoch == 0:
                    ax0.legend()
                    ax1.legend()
                fig.savefig(os.path.join('/content/drive/My Drive/Stanford40/Graphs', 'train.jpg'))

            draw_curve(epoch)
            if phase == 'aug1_train':
                losses.append(epoch_loss)
                accuracies.append(epoch_acc)
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, losses, accuracies
and I load the DenseNet-161 for training as below:
#Load Pretrained Densenet161 model
model_ft = models.densenet161(pretrained=True)
model_ft.classifier=nn.Linear(2208,11)
model_ft = model_ft.to(device)
criterion = nn.CrossEntropyLoss()
# Observe that all parameters are being optimized
opt = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
# Decay LR by a factor of 0.1 every 7 epochs
sched = lr_scheduler.StepLR(opt, step_size=5, gamma=0.1)
Finally I run the code below to start training:
model_ft,losses,accuracies = train_model(model_ft, criterion,opt ,sched,num_epochs=30)
and got the error from the question title (the full traceback was posted as a screenshot, not reproduced here).
How can I modify the code to avoid this error by using Tensor.cpu()?
What if you try to call .item() here:
running_corrects += torch.sum(preds == labels.data).item()
and remove .double() when dividing?
epoch_acc = running_corrects / dataset_sizes[phase]
It's hard to say without a detailed error backtrace. The message is vague, but what it tells you is that something, somewhere, is receiving a CUDA tensor and trying to convert it to a NumPy array without moving it to the host first. My intuition is that it comes from the matplotlib code in your visualization step; I believe it is trying to convert your loss and accuracy terms.
You should convert them to plain Python numbers after backpropagation has been performed...
y_loss[phase].append(epoch_loss.item())
y_acc[phase].append(epoch_acc.item())
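To make that concrete, here is a minimal sketch of the statistics/plotting portion of the loop with everything kept as plain Python numbers (names follow the question's code; it is an illustration of the suggestions above, not either poster's exact code):

# Accumulate statistics as Python numbers so matplotlib never receives a CUDA tensor.
running_loss += loss.item() * inputs.size(0)                # Python float
running_corrects += torch.sum(preds == labels.data).item()  # Python int

epoch_loss = running_loss / dataset_sizes[phase]            # plain float
epoch_acc = running_corrects / dataset_sizes[phase]         # plain float, already on the host

y_loss[phase].append(epoch_loss)  # safe to plot later
y_acc[phase].append(epoch_acc)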

performance of distributed tensorflow is bad and found high cpu-sys (long time wait on futex)

All, I am trying to run an NN model with distributed TensorFlow (2 PS + 2 workers, each on a standalone machine). The performance is bad, and the workers' CPU usage is between 50% and 800% (each machine has 40 cores).
I am pretty sure the input queue is OK (no starvation problem).
I have also tried preloading tcmalloc (lib/libtcmalloc.so), but it did not help.
Profiling with strace, I found something strange: long waits on futex, sometimes even 2+ seconds per wait.
I also captured the timeline info (screenshot omitted).
However, when I run the same program in non-distributed mode, I get almost 4-5 times the performance of the distributed mode, and the CPU usage is around 2000%~3000%.
My code is:

with tf.device(tf.train.replica_device_setter(worker_device="/job:worker/task:%d" % FLAGS.task_id, cluster=cluster)):
    global_step = tf.contrib.framework.get_or_create_global_step()
    indexes, values, labels = inputs.inputs()
    logits = models.inference([indexes, values])

    with tf.name_scope('loss'):
        diff = labels * tf.log(logits)
        with tf.name_scope('total'):
            loss = -tf.reduce_mean(diff)
        tf.summary.scalar('loss', loss)

    with tf.name_scope('train'):
        print("learning_rate = %f" % FLAGS.learning_rate)
        #sync_opt = tf.train.SyncReplicasOptimizer(tf.train.AdamOptimizer(FLAGS.learning_rate),
        #    replicas_to_aggregate=1, total_num_replicas=1, use_locking=False)
        sync_opt = tf.train.AdamOptimizer(FLAGS.learning_rate)
        train_step = sync_opt.minimize(loss, global_step=global_step)

    with tf.name_scope('accuracy'):
        with tf.name_scope('correct_prediction'):
            correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
        with tf.name_scope('accuracy'):
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        tf.summary.scalar('accuracy', accuracy)

hooks = [tf.train.StopAtStepHook(num_steps=6000)]
sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)

with tf.train.MonitoredTrainingSession(master=server.target,
                                       is_chief=if_chief,
                                       checkpoint_dir=None,
                                       hooks=hooks, config=sess_config, stop_grace_period_secs=20) as session:
    step = 0
    while not session.should_stop():
        start_time = time.time()
        run_metadata = tf.RunMetadata()
        _, global_step_value, loss_value = session.run([train_step, global_step, loss],
                                                       options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
                                                       run_metadata=run_metadata)
        if step > 0 and step % 1 == 0:
            duration = time.time() - start_time
            format_str = ("After %d training steps (%d global steps), "
                          "loss on training batch is %3f. "
                          "(cost %d s)")
            print(format_str % (step, global_step_value, loss_value, duration))
        step += 1
        if step > 0 and step % 10 == 0:
            tl = timeline.Timeline(run_metadata.step_stats)
            ctf = tl.generate_chrome_trace_format()
            with open('timeline.json', 'w') as f:
                f.write(ctf)
Any suggestions? Thanks!

tensorflow error: restore checkpoint file

I built my own convolutional neural network, in which I track the moving averages of all trainable variables (TensorFlow 1.0):
variable_averages = tf.train.ExponentialMovingAverage(
    0.9999, global_step)
variables_averages_op = variable_averages.apply(tf.trainable_variables())
train_op = tf.group(apply_gradient_op, variables_averages_op)
saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
summary_op = tf.summary.merge(summaries)
init = tf.global_variables_initializer()
sess = tf.Session(config=tf.ConfigProto(
    allow_soft_placement=True,
    log_device_placement=False))
sess.run(init)

# start queue runners
tf.train.start_queue_runners(sess=sess)
summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

# training loop
start_time = time.time()
for step in range(FLAGS.max_steps):
    _, loss_value = sess.run([train_op, loss])
    duration = time.time() - start_time
    start_time = time.time()
    assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
    if step % 1 == 0:
        # print current model status
        num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = duration / FLAGS.num_gpus
        format_str = '{} step{}, loss {}, {} examples/sec, {} sec/batch'
        print(format_str.format(datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))
    if step % 50 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
    if step % 10 == 0 or step == FLAGS.max_steps:
        print('save checkpoint')
        # save checkpoint file
        checkpoint_file = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_file, global_step=step)
This works fine, and checkpoint files are saved (saver version V2). Then I try to restore the checkpoints in another script to evaluate the model. There I have this piece of code:
# Restore the moving average version of the learned variables for eval.
variable_averages = tf.train.ExponentialMovingAverage(
    MOVING_AVERAGE_DECAY)
variables_to_restore = variable_averages.variables_to_restore()
saver = tf.train.Saver(variables_to_restore)
where I get the error "NotFoundError (see above for traceback): Key conv1/Variable/ExponentialMovingAverage not found in checkpoint", where conv1/Variable/ is in a variable scope.
This error occurs even before I try to restore the variables. Can you please help me solve it?
Thanks in advance
TheJude
I solved it in this way:
Call tf.reset_default_graph() before creating the second ExponentialMovingAverage(...) in the graph.
# reset the graph before create a new ema
tf.reset_default_graph()
# Restore the moving average version of the learned variables for eval.
variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY)
variables_to_restore = variable_averages.variables_to_restore()
saver = tf.train.Saver(variables_to_restore)
It took me 2 hours...
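An equivalent way to avoid the stale default-graph state, if you would rather not reset the default graph, is to build the evaluation model inside its own tf.Graph. This is only a sketch of that alternative, not part of the original answer:

# Sketch: give the evaluation code its own graph instead of resetting the default one.
eval_graph = tf.Graph()
with eval_graph.as_default():
    # ... rebuild the evaluation model here, so its variables live in eval_graph ...
    variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY)
    variables_to_restore = variable_averages.variables_to_restore()
    saver = tf.train.Saver(variables_to_restore)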

Run distributed tensorflow example with error

I have three nodes to run distributed TensorFlow: two workers (one with a GPU, one without) and one ps (without a GPU). The code is below:
from __future__ import print_function
import tensorflow as tf
import sys
import time
# cluster specification
parameter_servers = ["192.168.1.102:2222"]
workers = [ "192.168.1.103:2223",
"192.168.1.104:2224"]
cluster = tf.train.ClusterSpec({"ps":parameter_servers, "worker":workers})
# input flags
tf.app.flags.DEFINE_string("job_name", "", "Either 'ps' or 'worker'")
tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job")
FLAGS = tf.app.flags.FLAGS
# start a server for a specific task
server = tf.train.Server(cluster,
job_name=FLAGS.job_name,
task_index=FLAGS.task_index)
# config
batch_size = 100
learning_rate = 0.001
training_epochs = 20
logs_path = "/tmp/mnist/1"
# load mnist data set
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
if FLAGS.job_name == "ps":
server.join()
elif FLAGS.job_name == "worker":
# Between-graph replication
with tf.device(tf.train.replica_device_setter(
worker_device="/job:worker/task:%d" % FLAGS.task_index,
cluster=cluster)):
# count the number of updates
global_step = tf.get_variable('global_step', [],
initializer = tf.constant_initializer(0),
trainable = False)
# input images
with tf.name_scope('input'):
# None -> batch size can be any size, 784 -> flattened mnist image
x = tf.placeholder(tf.float32, shape=[None, 784], name="x-input")
# target 10 output classes
y_ = tf.placeholder(tf.float32, shape=[None, 10], name="y-input")
# model parameters will change during training so we use tf.Variable
tf.set_random_seed(1)
with tf.name_scope("weights"):
W1 = tf.Variable(tf.random_normal([784, 100]))
W2 = tf.Variable(tf.random_normal([100, 10]))
# bias
with tf.name_scope("biases"):
b1 = tf.Variable(tf.zeros([100]))
b2 = tf.Variable(tf.zeros([10]))
# implement model
with tf.name_scope("softmax"):
# y is our prediction
z2 = tf.add(tf.matmul(x,W1),b1)
a2 = tf.nn.sigmoid(z2)
z3 = tf.add(tf.matmul(a2,W2),b2)
y = tf.nn.softmax(z3)
# specify cost function
with tf.name_scope('cross_entropy'):
# this is our cost
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
# specify optimizer
with tf.name_scope('train'):
# optimizer is an "operation" which we can execute in a session
grad_op = tf.train.GradientDescentOptimizer(learning_rate)
train_op = grad_op.minimize(cross_entropy, global_step=global_step)
with tf.name_scope('Accuracy'):
# accuracy
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# create a summary for our cost and accuracy
tf.scalar_summary("cost", cross_entropy)
tf.scalar_summary("accuracy", accuracy)
# merge all summaries into a single "operation" which we can execute in a session
summary_op = tf.merge_all_summaries()
init_op = tf.initialize_all_variables()
print("Variables initialized ...")
sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
global_step=global_step,
init_op=init_op)
begin_time = time.time()
frequency = 100
with sv.prepare_or_wait_for_session(server.target) as sess:
# create log writer object (this will log on every machine)
writer = tf.train.SummaryWriter(logs_path, graph=tf.get_default_graph())
# perform training cycles
start_time = time.time()
for epoch in range(training_epochs):
# number of batches in one epoch
batch_count = int(mnist.train.num_examples/batch_size)
count = 0
for i in range(batch_count):
batch_x, batch_y = mnist.train.next_batch(batch_size)
# perform the operations we defined earlier on batch
_, cost, summary, step = sess.run(
[train_op, cross_entropy, summary_op, global_step],
feed_dict={x: batch_x, y_: batch_y})
writer.add_summary(summary, step)
count += 1
if count % frequency == 0 or i+1 == batch_count:
elapsed_time = time.time() - start_time
start_time = time.time()
print("Step: %d," % (step+1),
" Epoch: %2d," % (epoch+1),
" Batch: %3d of %3d," % (i+1, batch_count),
" Cost: %.4f," % cost,
" AvgTime: %3.2fms" % float(elapsed_time*1000/frequency))
count = 0
print("Test-Accuracy: %2.2f" % sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
print("Total Time: %3.2fs" % float(time.time() - begin_time))
print("Final Cost: %.4f" % cost)
sv.stop()
print("done")
I run the above code on my three nodes with the commands below in a terminal:
pc-01$ python example.py --job-name="ps" --task_index=0
pc-02$ python example.py --job-name="worker" --task_index=0
pc-03$ python example.py --job-name="worker" --task_index=1
However, after the variables are initialized, I ran into a problem: the worker terminal always prints:
I tensorflow/core/distributed_runtime/master.cc:193] CreateSession still waiting for response from worker: /job:worker/replica:0/task:0
and the ps terminal doesn't proceed.
The IP of the ps is 192.168.1.102, and the IPs of the workers are 192.168.1.103 and 192.168.1.104, just as in the code above.
Can anyone help me?
I guess filtering devices should help here. Could you please try adding device_filters to your session config?
config = tf.ConfigProto(
    allow_soft_placement=True,
    log_device_placement=False,
    device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.task_index])
with sv.prepare_or_wait_for_session(server.target, config=config) as sess:
This should fix the issue: with the device filters in place, each task only needs to reach the ps tasks and itself, so session creation no longer waits on the other worker.

TensorFlow - Saver.restore not restoring all parameters

I was training a bidirectional LSTM RNN for nearly 24 hours, and due to oscillation in the error I decided to decrease the learning rate before allowing it to continue training. Since the model is saved using Saver.save(sess, file) at every epoch, I terminated the training with the CTC loss having been minimised to approximately 115.
Now, after restoring the model, the initial error rate I am getting is somewhere around 162, which is inconsistent with the values I was getting in the 7th epoch and is close to what I got in the first epoch. So my impression is that either the restore function is not working, or, if it is working, something else is preventing it from taking effect.
Here is my code:
graph = tf.Graph()
with graph.as_default():
# Graph creation
graph_start = time.time()
seq_inputs = tf.placeholder(tf.float32, shape= [None,batch_size,frame_length], name="sequence_inputs")
seq_lens = tf.placeholder(shape=[batch_size],dtype=tf.int32)
seq_inputs = seq_bn(seq_inputs,seq_lens)
initializer = tf.truncated_normal_initializer(mean=0,stddev=0.1)
forward = tf.nn.rnn_cell.LSTMCell(num_units=num_units,
num_proj = hidden_size,
use_peepholes=use_peephole,
initializer=initializer,
state_is_tuple=True)
forward = tf.nn.rnn_cell.MultiRNNCell([forward] * n_layers, state_is_tuple=True)
backward = tf.nn.rnn_cell.LSTMCell(num_units=num_units,
num_proj= hidden_size,
use_peepholes=use_peephole,
initializer=initializer,
state_is_tuple=True)
backward = tf.nn.rnn_cell.MultiRNNCell([backward] * n_layers, state_is_tuple=True)
[fw_out,bw_out], _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=forward, cell_bw=backward, inputs=seq_inputs,time_major=True, dtype=tf.float32, sequence_length=tf.cast(seq_lens,tf.int64))
# Batch normalize forward output
mew,var_ = tf.nn.moments(fw_out,axes=[0])
fw_out = tf.nn.batch_normalization(fw_out,mew,var_,0.1,1,1e-6)
# fw_out = seq_bn(fw_out,seq_lens)
# Batch normalize backward output
mew,var_ = tf.nn.moments(bw_out,axes=[0])
bw_out = tf.nn.batch_normalization(bw_out,mew,var_,0.1,1,1e-6)
# bw_out = seq_bn(bw_out,seq_lens)
# Reshaping forward, and backward outputs for affine transformation
fw_out = tf.reshape(fw_out,[-1,hidden_size])
bw_out = tf.reshape(bw_out,[-1,hidden_size])
# Linear Layer params
W_fw = tf.Variable(tf.truncated_normal(shape=[hidden_size,n_chars],stddev=np.sqrt(2.0 / (hidden_size))))
W_bw = tf.Variable(tf.truncated_normal(shape=[hidden_size,n_chars],stddev=np.sqrt(2.0 / (hidden_size))))
b_out = tf.constant(0.1,shape=[n_chars])
# Perform an affine transformation
logits = tf.add(tf.add(tf.matmul(fw_out,W_fw),tf.matmul(bw_out,W_bw)),b_out)
logits = tf.reshape(logits,[-1,batch_size,n_chars])
# Use CTC Beam Search Decoder to decode pred string from the prob map
decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_lens)
# Target params
indices = tf.placeholder(dtype=tf.int64, shape=[None,2])
values = tf.placeholder(dtype=tf.int32, shape=[None])
shape = tf.placeholder(dtype=tf.int64,shape=[2])
# Make targets
targets = tf.SparseTensor(indices,values,shape)
# Compute Loss
loss = tf.reduce_mean(tf.nn.ctc_loss(logits, targets, seq_lens))
# Compute error rate based on edit distance
predicted = tf.to_int32(decoded[0])
error_rate = tf.reduce_sum(tf.edit_distance(predicted,targets,normalize=False))/ \
tf.to_float(tf.size(targets.values))
tvars = tf.trainable_variables()
grad, _ = tf.clip_by_global_norm(tf.gradients(loss,tvars),max_grad_norm)
optimizer = tf.train.MomentumOptimizer(learning_rate=lr,momentum=momentum)
train_step = optimizer.apply_gradients(zip(grad,tvars))
graph_end = time.time()
print("Time elapsed for creating graph: %.3f"%(round(graph_end-graph_start,3)))
# steps per epoch
start_time = 0
steps = int(np.ceil(len(data_train.files)/batch_size))
loss_tr = []
log_tr = []
loss_vl = []
log_vl = []
err_tr = []
err_vl = []
saver = tf.train.Saver()
with tf.Session(config=config) as sess:
#sess.run(tf.initialize_all_variables())
checkpt_path = tf.train.latest_checkpoint(checkpoint_dir)
print(saver.restore(sess,checkpt_path))
print("Model restore from 7th epoch 188th step")
feed = None
epoch = None
step = None
try:
for epoch in range(7,epochs+1):
if epoch==7:
initial_step = 189
else:
initial_step = 0
transcript = []
loss_val = 0
l_pr = 0
start_time = time.time()
for step in range(initial_step,steps):
train_data, transcript, \
targ_indices, targ_values, \
targ_shape, n_frames = data_train.next_batch()
n_frames = np.reshape(n_frames,[-1])
feed = {seq_inputs: train_data, indices:targ_indices, values:targ_values, shape:targ_shape, seq_lens:n_frames}
del train_data,targ_indices,targ_values,targ_shape,n_frames
# Evaluate loss value, decoded transcript, and log probability
_,loss_val,deco,l_pr,err_rt_tr = sess.run([train_step,loss,decoded,log_prob,error_rate],
feed_dict=feed)
del feed
loss_tr.append(loss_val)
log_tr.append(l_pr)
err_tr.append(err_rt_tr)
# On validation set
val_data, val_transcript, \
targ_indices, targ_values, \
targ_shape, n_frames = data_val.next_batch()
n_frames = np.reshape(n_frames, [-1])
feed = {seq_inputs: val_data, indices: targ_indices,values: targ_values, shape: targ_shape, seq_lens: n_frames}
del val_data, val_transcript,targ_indices,targ_values,targ_shape,n_frames
vl_loss, l_val_pr, err_rt_vl = sess.run([loss, log_prob, error_rate], feed_dict=feed)
del feed
loss_vl.append(vl_loss)
log_vl.append(l_val_pr)
err_vl.append(err_rt_vl)
print("epoch %d, step: %d, tr_loss: %.2f, vl_loss: %.2f, tr_err: %.2f, vl_err: %.2f"
% (epoch, step, np.mean(loss_tr), np.mean(loss_vl), err_rt_tr, err_rt_vl))
end_time = time.time()
elapsed = round(end_time - start_time, 3)
# On training set
# Select a random index within batch_size
sample_index = np.random.randint(0, batch_size)
# Fetch the target transcript
actual_str = [data_train.reverse_map[i] for i in transcript[sample_index]]
# Fetch the decoded path from probability map
pred_sparse = tf.SparseTensor(deco[0].indices, deco[0].values, deco[0].shape)
pred_dense = tf.sparse_tensor_to_dense(pred_sparse)
ans = pred_dense.eval()
#pred = [data_train.reverse_map[i] for i in ans[sample_index, :]]
pred = []
for i in ans[sample_index,:]:
if i == n_chars-1:
pred.append(data_train.reverse_map[0])
else:
pred.append(data_train.reverse_map[i])
print("time_elapsed for 200 steps: %.3f, " % (elapsed))
if epoch%2 == 0:
print("Sample mini-batch results: \n" \
"predicted string: ", np.array(pred))
print("actual string: ", np.array(actual_str))
print("On training set, the loss: %.2f, log_pr: %.3f, error rate %.3f:"% (loss_val, np.mean(l_pr), err_rt_tr))
print("On validation set, the loss: %.2f, log_pr: %.3f, error rate: %.3f" % (vl_loss, np.mean(l_val_pr), err_rt_vl))
# Save the trainable parameters after the end of an epoch
if epoch > 7:
path = saver.save(sess, 'model_%d' % epoch)
print("Session saved at: %s" % path)
np.save(results_fn, np.array([loss_tr, log_tr, loss_vl, log_vl, err_tr, err_vl], dtype=np.object))
except (KeyboardInterrupt, SystemExit, Exception), e:
print("Error/Interruption: %s" % str(e))
exc_type, exc_obj, exc_tb = sys.exc_info()
print("Line no: %d" % exc_tb.tb_lineno)
if epoch > 7:
print("Saving model: %s" % saver.save(sess, 'Last.cpkt'))
print("Current batch: %d" % data_train.b_id)
print("Current epoch: %d" % epoch)
print("Current step: %d"%step)
np.save(results_fn, np.array([loss_tr, log_tr, loss_vl, log_vl, err_tr, err_vl], dtype=np.object))
print("Clossing TF Session...")
sess.close()
print("Terminating Program...")
sys.exit(0)
I think you need to re-initialize your accumulators for each epoch, so these must be placed at the beginning, inside the epoch loop:
loss_tr = []
log_tr = []
loss_vl = []
log_vl = []
err_tr = []
err_vl = []
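A minimal sketch of that rearrangement (only the placement of the lists changes; epochs, initial_step, steps, and the body of the loop stay exactly as in the question):

for epoch in range(7, epochs + 1):
    # re-initialize the per-epoch accumulators at the top of every epoch
    loss_tr = []
    log_tr = []
    loss_vl = []
    log_vl = []
    err_tr = []
    err_vl = []
    for step in range(initial_step, steps):
        pass  # training and validation steps exactly as in the question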