How to load the last checkpoint in TensorFlow?

How to load the last checkpoint in TensorFlow? - tensorflow

I am practising with TensorFlow on this tutorial. The evaluate function depends on the training to load the latest checkpoint:
checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(encoder=encoder,
decoder=decoder,
optimizer = optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)
start_epoch = 0
if ckpt_manager.latest_checkpoint:
start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
ckpt.restore(ckpt_manager.latest_checkpoint)
for epoch in range(start_epoch, EPOCHS):
start = time.time()
total_loss = 0
for (batch, (img_tensor, target)) in enumerate(dataset):
batch_loss, t_loss = train_step(img_tensor, target)
total_loss += t_loss
if batch % 100 == 0:
print ('Epoch {} Batch {} Loss {:.4f}'.format(
epoch + 1, batch, batch_loss.numpy() / int(target.shape[1])))
loss_plot.append(total_loss / num_steps)
ckpt_manager.save()
Without ckpt_manager.save(), the evaluation function does not work.
When we have already trained a model and the checkpoints are available in checkpoint_path. How should we load the model without training?

You can use tf.train.latest_checkpoint to get the latest checkpoint file and then load it manually using ckpt.restore:
checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(encoder=encoder,
decoder=decoder,
ckpt_path = tf.train.latest_checkpoint(checkpoint_path)
ckpt.restore(ckpt_path)

Related

How to solve TypeError: can’t convert CUDA tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first

I am modifying the 'train model' function below so to plot loss and accuracy graphs at every epochs during traning
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
since = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
losses=[]
accuracies=[]
y_loss = {} # loss history
y_loss['aug1_train'] = []
y_loss['valid'] = []
y_acc = {}
y_acc['aug1_train'] = []
y_acc['valid'] = []
x_epoch = []
fig = plt.figure()
ax0 = fig.add_subplot(121, title="loss")
ax1 = fig.add_subplot(122, title="accuracy")
for epoch in range(num_epochs):
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 10)
# Each epoch has a training and validation phase
for phase in ['aug1_train', 'valid']:
if phase == 'aug1_train':
scheduler.step()
model.train() # Set model to training mode
else:
model.eval() # Set model to evaluate mode
running_loss = 0.0
running_corrects = 0
# Iterate over data.
for inputs, labels,paths in dataloaders[phase]:
inputs = inputs.to(device)
labels = labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward
# track history if only in train
with torch.set_grad_enabled(phase == 'aug1_train'):
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels)
# backward + optimize only if in training phase
if phase == 'aug1_train':
loss.backward()
optimizer.step()
# statistics
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)
epoch_loss = running_loss / dataset_sizes[phase]
epoch_acc = running_corrects.double() / dataset_sizes[phase]
print('{} Loss: {:.4f} Acc: {:.4f} '.format(
phase, epoch_loss, epoch_acc))
y_loss[phase].append(epoch_loss)
y_acc[phase].append(epoch_acc)
# deep copy the model
if phase == 'valid' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model_wts = copy.deepcopy(model.state_dict())
def draw_curve(current_epoch):
x_epoch.append(current_epoch)
ax0.plot(x_epoch, y_loss['aug1_train'], 'bo-', label='train')
ax0.plot(x_epoch, y_loss['valid'], 'ro-', label='val')
ax1.plot(x_epoch, y_acc['aug1_train'], 'bo-', label='train')
ax1.plot(x_epoch, y_acc['valid'], 'ro-', label='val')
if current_epoch == 0:
ax0.legend()
ax1.legend()
fig.savefig(os.path.join('/content/drive/My Drive/Stanford40/Graphs', 'train.jpg'))
draw_curve(epoch)
if phase=='aug1_train':
losses.append(epoch_loss)
accuracies.append(epoch_acc)
print()
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))
# load best model weights
model.load_state_dict(best_model_wts)
return model,losses,accuracies
and I load the Densenet161 for traning as below
#Load Pretrained Densenet161 model
model_ft = models.densenet161(pretrained=True)
model_ft.classifier=nn.Linear(2208,11)
model_ft = model_ft.to(device)
criterion = nn.CrossEntropyLoss()
# Observe that all parameters are being optimized
opt = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
# Decay LR by a factor of 0.1 every 7 epochs
sched = lr_scheduler.StepLR(opt, step_size=5, gamma=0.1)
Finally I run the code below to start training:
model_ft,losses,accuracies = train_model(model_ft, criterion,opt ,sched,num_epochs=30)
and got this error as in the picture below:
How can I modify the code to get away from this error by using tensor.cpu() ?

What if try to get item() here
running_corrects += torch.sum(preds == labels.data).item()
and remove double() when dividing?
epoch_acc = running_corrects / dataset_sizes[phase]

It's hard to say without a detailed error backtrace. It is vague but the information it gives is that something somewhere is detecting a tensor and trying to convert it to numpy array not properly. My intuition tells me it comes from the matplotlib code in your visualization step. I believe it is trying to convert your loss terms.
You should convert them to lists after having performed back propagation...
y_loss[phase].append(epoch_loss.item())
y_acc[phase].append(epoch_acc.item())

Are there any tools/libraries that can convert tensorflow lstm model to .mlmodel format to run in iOS app

I have a simple tensorflow model that consists of lstm layers - such as tf.contrib.rnn.LSTMBlockCell or tf.keras.layers.LSTM (I can provide the sample code also, if needed). I want to run the model on an iOS app. However, I have looked at several websites that say that presently there is no way to convert and run tensorflow model that consist LSTM layers on iOS apps.
I have tried these tools/libraries to convert the tensorflow model to .mlmodel format (or .tflite format)
1. Swift for Tensorflow
2. Tensorflow Lite for iOS
3. tfcoreml
However, these tools also does not seem to be able to convert the lstm layers model to .mlmodel format. These tools, however allow to use custom layers to be added. But I don't know how I can add LSTM custom layer.
Am I wrong in saying that there is no support to run tensorflow lstm model in iOS apps? If yes, then please guide me on how I can go ahead to include the model in iOS app. Is there any other tool/library that can be ued to convert it to .mlmodel format. If no, then are there any plans to include tensorflow support for iOS in future?
Model
import tensorflow as tf
from tensorflow.contrib import rnn
from tensorflow.contrib.rnn import *
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
#Summary parameters
logs_path = "logs/"
# Training Parameters
learning_rate = 0.001
training_steps = 1000
batch_size = 128
display_step = 200
# Network Parameters
num_input = 28 # MNIST data input (img shape: 28*28)
timesteps = 28 # timesteps
num_hidden = 128 # hidden layer num of features
num_classes = 10 # MNIST total classes (0-9 digits)
# tf Graph input
X = tf.placeholder("float", [None, timesteps, num_input])
Y = tf.placeholder("float", [None, num_classes])
# Define weights
weights = {
'out': tf.Variable(tf.random_normal([num_hidden, num_classes]))
}
biases = {
'out': tf.Variable(tf.random_normal([num_classes]))
}
def RNN(x, weights, biases):
# Unstack to get a list of 'timesteps' tensors of shape (batch_size, n_input)
x = tf.unstack(x, timesteps, 1)
# Define a lstm cell with tensorflow
lstm_cell = rnn.LSTMBlockCell(num_hidden, forget_bias = 1.0)
#lstm_cell = tf.keras.layers.LSTMCell(num_hidden, unit_forget_bias=True)
# Get lstm cell output
outputs, states = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)
# Linear activation, using rnn inner loop last output
return tf.matmul(outputs[-1], weights['out']) + biases['out']
logits = RNN(X, weights, biases)
with tf.name_scope('Model'):
prediction = tf.nn.softmax(logits, name = "prediction_layer")
with tf.name_scope('Loss'):
# Define loss and optimizer
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y, name = "loss_layer"), name = "reduce_mean_loss")
with tf.name_scope('SGD'):
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate, name = "Gradient_Descent")
train_op = optimizer.minimize(loss_op, name = "minimize_layer")
with tf.name_scope('Accuracy'):
# Evaluate model (with test logits, for dropout to be disabled)
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1), name = "correct_pred_layer")
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name = "reduce_mean_acc_layer")
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()
#Create a summary to monitor cost tensor
tf.summary.scalar("loss", loss_op)
#Create a summary to monitor accuracy tensor
tf.summary.scalar("accuracy", accuracy)
#Merge all summaries into a single op
merged_summary_op = tf.summary.merge_all()
saver = tf.train.Saver()
save_path = ""
model_save = "model.ckpt"
# Start training
with tf.Session() as sess:
# Run the initializer
sess.run(init)
# op to write logs to Tensorboard
summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())
for step in range(1, training_steps+1):
total_batch = int(mnist.train.num_examples/batch_size)
batch_x, batch_y = mnist.train.next_batch(batch_size)
# Reshape data to get 28 seq of 28 elements
batch_x = batch_x.reshape((batch_size, timesteps, num_input))
# Run optimization op (backprop)
sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})
if step % display_step == 0 or step == 1:
# Calculate batch loss and accuracy
loss, acc, summary = sess.run([loss_op, accuracy, merged_summary_op], feed_dict={X: batch_x,
Y: batch_y})
# Write logs at every iteration
summary_writer.add_summary(summary, step * total_batch)
print("Step " + str(step) + ", Minibatch Loss= " + \
"{:.4f}".format(loss) + ", Training Accuracy= " + \
"{:.3f}".format(acc))
saver.save(sess, model_save)
tf.train.write_graph(sess.graph_def, save_path, 'save_graph.pbtxt')
#print(sess.graph.get_operations())
print("Optimization Finished!")
print("Run the command line:\n" \
"--> tensorboard --logdir=logs/ " \
"\nThen open http://0.0.0.0:6006/ into your web browser")
# Calculate accuracy for 128 mnist test images
test_len = 128
test_data = mnist.test.images[:test_len].reshape((-1, timesteps, num_input))
test_label = mnist.test.labels[:test_len]
print("Testing Accuracy:", \
sess.run(accuracy, feed_dict={X: test_data, Y: test_label}))
Code to generate the frozen model
import tensorflow as tf
import numpy as np
from tensorflow.python.tools import freeze_graph
save_path = ""
model_name = "test_model_tf_keras_layers_lstm"
input_graph_path = save_path + "save_graph.pbtxt"
checkpoint_path = save_path + "model.ckpt"
input_saver_def_path = ""
input_binary = False
output_node_names = "Model/prediction_layer" #output node's name. Should match to that mentioned in the code
restore_op_name = 'save/restore_all'
filename_tensor_name = 'save/const:0'
output_frozen_graph_name = save_path + 'frozen_model' + '.pb' # name of .pb file that one would like to give
clear_devices = True
freeze_graph.freeze_graph(input_graph_path, input_saver_def_path, input_binary, checkpoint_path, output_node_names, restore_op_name, filename_tensor_name, output_frozen_graph_name, clear_devices, "")
print("Model Freezed")
Conversion Code to generate .mlmodel format file
import tfcoreml
coreml_model = tfcoreml.convert(tf_model_path = 'frozen_model_test_model_tf_keras_layers_lstm.pb',
mlmodel_path = 'test_model.mlmodel',
output_feature_names = ['Model/prediction_layer:0'],
add_custom_layers = True)
coreml_model.save("test_model.mlmodel")
Error message shown with
lstm_cell = rnn.BasicLSTMCell(num_hidden, name = "lstm_cell")
Value Error: Split op case not handled. Input shape = [1, 512], output shape = [1, 128]
Error message shown with
lstm_cell = rnn.LSTMBlockCell(num_hidden, name = "lstm_cell")
InvalidArgumentError (see above for traceback): No OpKernel was registered to support Op 'LSTMBlockCell' used by node rnn/lstm_cell/LSTMBlockCell (defined at /anaconda2/lib/python2.7/site-packages/tfcoreml/_tf_coreml_converter.py:153) with these attrs: [forget_bias=1, use_peephole=false, cell_clip=-1, T=DT_FLOAT]
Registered devices: [CPU]
Registered kernels:
<no registered kernels>
[[node rnn/lstm_cell/LSTMBlockCell (defined at /anaconda2/lib/python2.7/site-packages/tfcoreml/_tf_coreml_converter.py:153) ]]
I expect that the frozen tensorflow model can be converted to .mlmodel format.

Training/Test data in tensorflow example

I'm new in tensorflow and i follow example at here, but i have one question.
Codes are as follows:
import numpy as np
import tensorflow as tf
from time import time
import math
from include.data import get_data_set
from include.model import model, lr
train_x, train_y = get_data_set("train")
test_x, test_y = get_data_set("test")
x, y, output, y_pred_cls, global_step, learning_rate = model()
global_accuracy = 0
# PARAMS
_BATCH_SIZE = 128
_EPOCH = 60
_SAVE_PATH = "./tensorboard/cifar-10-v1.0.0/"
# LOSS AND OPTIMIZER
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=output, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
beta1=0.9,
beta2=0.999,
epsilon=1e-08).minimize(loss, global_step=global_step)
# PREDICTION AND ACCURACY CALCULATION
correct_prediction = tf.equal(y_pred_cls, tf.argmax(y, axis=1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# SAVER
merged = tf.summary.merge_all()
saver = tf.train.Saver()
sess = tf.Session()
train_writer = tf.summary.FileWriter(_SAVE_PATH, sess.graph)
try:
print("\nTrying to restore last checkpoint ...")
last_chk_path = tf.train.latest_checkpoint(checkpoint_dir=_SAVE_PATH)
saver.restore(sess, save_path=last_chk_path)
print("Restored checkpoint from:", last_chk_path)
except ValueError:
print("\nFailed to restore checkpoint. Initializing variables instead.")
sess.run(tf.global_variables_initializer())
def train(epoch):
batch_size = int(math.ceil(len(train_x) / _BATCH_SIZE))
i_global = 0
for s in range(batch_size):
batch_xs = train_x[s*_BATCH_SIZE: (s+1)*_BATCH_SIZE]
batch_ys = train_y[s*_BATCH_SIZE: (s+1)*_BATCH_SIZE]
start_time = time()
i_global, _, batch_loss, batch_acc = sess.run(
[global_step, optimizer, loss, accuracy],
feed_dict={x: batch_xs, y: batch_ys, learning_rate: lr(epoch)})
duration = time() - start_time
if s % 10 == 0:
percentage = int(round((s/batch_size)*100))
bar_len = 29
filled_len = int((bar_len*int(percentage))/100)
bar = '=' * filled_len + '>' + '-' * (bar_len - filled_len)
msg = "Global step: {:>5} - [{}] {:>3}% - acc: {:.4f} - loss: {:.4f} - {:.1f} sample/sec"
print(msg.format(i_global, bar, percentage, batch_acc, batch_loss, _BATCH_SIZE / duration))
test_and_save(i_global, epoch)
def test_and_save(_global_step, epoch):
global global_accuracy
i = 0
predicted_class = np.zeros(shape=len(test_x), dtype=np.int)
while i < len(test_x):
j = min(i + _BATCH_SIZE, len(test_x))
batch_xs = test_x[i:j, :]
batch_ys = test_y[i:j, :]
predicted_class[i:j] = sess.run(
y_pred_cls,
feed_dict={x: batch_xs, y: batch_ys, learning_rate: lr(epoch)}
)
i = j
correct = (np.argmax(test_y, axis=1) == predicted_class)
acc = correct.mean()*100
correct_numbers = correct.sum()
mes = "\nEpoch {} - accuracy: {:.2f}% ({}/{})"
print(mes.format((epoch+1), acc, correct_numbers, len(test_x)))
if global_accuracy != 0 and global_accuracy < acc:
summary = tf.Summary(value=[
tf.Summary.Value(tag="Accuracy/test", simple_value=acc),
])
train_writer.add_summary(summary, _global_step)
saver.save(sess, save_path=_SAVE_PATH, global_step=_global_step)
mes = "This epoch receive better accuracy: {:.2f} > {:.2f}. Saving session..."
print(mes.format(acc, global_accuracy))
global_accuracy = acc
elif global_accuracy == 0:
global_accuracy = acc
print("###########################################################################################################")
def main():
for i in range(_EPOCH):
print("\nEpoch: {0}/{1}\n".format((i+1), _EPOCH))
train(i)
if __name__ == "__main__":
main()
sess.close()
In this example, i think, both test and traning data feeds networks, normally only train data must feed network. I can not see any difference between train() and test_and_save() functions. Am i wrong? Thanks

Here is an explanation if I understood your question correctly. The train function is called every epoch and iterates through the training data. At the end of the epoch the test_and_save function is called where the model accuracy is evaluated. This iterates through the test data on the learned weights and calculates the accuracy and saves the model. This is repeated _EPOCH times.
Edit: The model is saved in the test_and_save function. However, the weights are only updated (gradients calculated) when optimizer is passed through sess.run() in the train function. In the test_and_save function the test data is fed to the network however only y_pred_cls is evaluated by passing to sess.run().

tensorflow error: restore checkpoint file

I built up my own convolutional neural network, in which I track the moving averages of all trainable variables (tensorflow 1.0):
variable_averages = tf.train.ExponentialMovingAverage(
0.9999, global_step)
variables_averages_op = variable_averages.apply(tf.trainable_variables())
train_op = tf.group(apply_gradient_op, variables_averages_op)
saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
summary_op = tf.summary.merge(summaries)
init = tf.global_variables_initializer()
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=False))
sess.run(init)
# start queue runners
tf.train.start_queue_runners(sess=sess)
summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
# training loop
start_time = time.time()
for step in range(FLAGS.max_steps):
_, loss_value = sess.run([train_op, loss])
duration = time.time() - start_time
start_time = time.time()
assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
if step % 1 == 0:
# print current model status
num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
examples_per_sec = num_examples_per_step/duration
sec_per_batch = duration/FLAGS.num_gpus
format_str = '{} step{}, loss {}, {} examples/sec, {} sec/batch'
print(format_str.format(datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))
if step % 50 == 0:
summary_str = sess.run(summary_op)
summary_writer.add_summary(summary_str, step)
if step % 10 == 0 or step == FLAGS.max_steps:
print('save checkpoint')
# save checkpoint file
checkpoint_file = os.path.join(FLAGS.train_dir, 'model.ckpt')
saver.save(sess, checkpoint_file, global_step=step)
This workes fine and checkpoint files are saved (saver version V2). Then I try to restore the checkpoints in a nother script for evaluating the model. There I have this piece of code
# Restore the moving average version of the learned variables for eval.
variable_averages = tf.train.ExponentialMovingAverage(
MOVING_AVERAGE_DECAY)
variables_to_restore = variable_averages.variables_to_restore()
saver = tf.train.Saver(variables_to_restore)
where I get the error "NotFoundError (see above for traceback): Key conv1/Variable/ExponentialMovingAverage not found in checkpoint" where conv1/variable/ is a variable scope.
This error ocuurs even before I try to restore the variables. Can you please help to solve it?
Thanks in advance
TheJude

I solved it in this way:
Call tf.reset_default_graph() before create second ExponentialMovingAverage(...) in the graph.
# reset the graph before create a new ema
tf.reset_default_graph()
# Restore the moving average version of the learned variables for eval.
variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY)
variables_to_restore = variable_averages.variables_to_restore()
saver = tf.train.Saver(variables_to_restore)
It took me 2 hours...

TensorFlow - Saver.restore not restoring all parameters

I was training Bidirectional LSTM type RNN for nearly 24 hours, and due to oscillation in the error I decided to decrease the learning before allowing it to continue training. Since the model is saved using Saver.save(sess,file) at every epoch, I terminated the training with the CTC Loss having minimised to approximately 115.
Now after restoring the model, the initial error rate I am getting is somewhere around 162, which is inconsistent with the flow of error rate I was getting in 7th epoch, and is also what I got in the first epoch. So it is my impression that either "restore" function is not working or if it is working, then there must be something else that is not allowing it to take effect.
Here is my code:
graph = tf.Graph()
with graph.as_default():
# Graph creation
graph_start = time.time()
seq_inputs = tf.placeholder(tf.float32, shape= [None,batch_size,frame_length], name="sequence_inputs")
seq_lens = tf.placeholder(shape=[batch_size],dtype=tf.int32)
seq_inputs = seq_bn(seq_inputs,seq_lens)
initializer = tf.truncated_normal_initializer(mean=0,stddev=0.1)
forward = tf.nn.rnn_cell.LSTMCell(num_units=num_units,
num_proj = hidden_size,
use_peepholes=use_peephole,
initializer=initializer,
state_is_tuple=True)
forward = tf.nn.rnn_cell.MultiRNNCell([forward] * n_layers, state_is_tuple=True)
backward = tf.nn.rnn_cell.LSTMCell(num_units=num_units,
num_proj= hidden_size,
use_peepholes=use_peephole,
initializer=initializer,
state_is_tuple=True)
backward = tf.nn.rnn_cell.MultiRNNCell([backward] * n_layers, state_is_tuple=True)
[fw_out,bw_out], _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=forward, cell_bw=backward, inputs=seq_inputs,time_major=True, dtype=tf.float32, sequence_length=tf.cast(seq_lens,tf.int64))
# Batch normalize forward output
mew,var_ = tf.nn.moments(fw_out,axes=[0])
fw_out = tf.nn.batch_normalization(fw_out,mew,var_,0.1,1,1e-6)
# fw_out = seq_bn(fw_out,seq_lens)
# Batch normalize backward output
mew,var_ = tf.nn.moments(bw_out,axes=[0])
bw_out = tf.nn.batch_normalization(bw_out,mew,var_,0.1,1,1e-6)
# bw_out = seq_bn(bw_out,seq_lens)
# Reshaping forward, and backward outputs for affine transformation
fw_out = tf.reshape(fw_out,[-1,hidden_size])
bw_out = tf.reshape(bw_out,[-1,hidden_size])
# Linear Layer params
W_fw = tf.Variable(tf.truncated_normal(shape=[hidden_size,n_chars],stddev=np.sqrt(2.0 / (hidden_size))))
W_bw = tf.Variable(tf.truncated_normal(shape=[hidden_size,n_chars],stddev=np.sqrt(2.0 / (hidden_size))))
b_out = tf.constant(0.1,shape=[n_chars])
# Perform an affine transformation
logits = tf.add(tf.add(tf.matmul(fw_out,W_fw),tf.matmul(bw_out,W_bw)),b_out)
logits = tf.reshape(logits,[-1,batch_size,n_chars])
# Use CTC Beam Search Decoder to decode pred string from the prob map
decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_lens)
# Target params
indices = tf.placeholder(dtype=tf.int64, shape=[None,2])
values = tf.placeholder(dtype=tf.int32, shape=[None])
shape = tf.placeholder(dtype=tf.int64,shape=[2])
# Make targets
targets = tf.SparseTensor(indices,values,shape)
# Compute Loss
loss = tf.reduce_mean(tf.nn.ctc_loss(logits, targets, seq_lens))
# Compute error rate based on edit distance
predicted = tf.to_int32(decoded[0])
error_rate = tf.reduce_sum(tf.edit_distance(predicted,targets,normalize=False))/ \
tf.to_float(tf.size(targets.values))
tvars = tf.trainable_variables()
grad, _ = tf.clip_by_global_norm(tf.gradients(loss,tvars),max_grad_norm)
optimizer = tf.train.MomentumOptimizer(learning_rate=lr,momentum=momentum)
train_step = optimizer.apply_gradients(zip(grad,tvars))
graph_end = time.time()
print("Time elapsed for creating graph: %.3f"%(round(graph_end-graph_start,3)))
# steps per epoch
start_time = 0
steps = int(np.ceil(len(data_train.files)/batch_size))
loss_tr = []
log_tr = []
loss_vl = []
log_vl = []
err_tr = []
err_vl = []
saver = tf.train.Saver()
with tf.Session(config=config) as sess:
#sess.run(tf.initialize_all_variables())
checkpt_path = tf.train.latest_checkpoint(checkpoint_dir)
print(saver.restore(sess,checkpt_path))
print("Model restore from 7th epoch 188th step")
feed = None
epoch = None
step = None
try:
for epoch in range(7,epochs+1):
if epoch==7:
initial_step = 189
else:
initial_step = 0
transcript = []
loss_val = 0
l_pr = 0
start_time = time.time()
for step in range(initial_step,steps):
train_data, transcript, \
targ_indices, targ_values, \
targ_shape, n_frames = data_train.next_batch()
n_frames = np.reshape(n_frames,[-1])
feed = {seq_inputs: train_data, indices:targ_indices, values:targ_values, shape:targ_shape, seq_lens:n_frames}
del train_data,targ_indices,targ_values,targ_shape,n_frames
# Evaluate loss value, decoded transcript, and log probability
_,loss_val,deco,l_pr,err_rt_tr = sess.run([train_step,loss,decoded,log_prob,error_rate],
feed_dict=feed)
del feed
loss_tr.append(loss_val)
log_tr.append(l_pr)
err_tr.append(err_rt_tr)
# On validation set
val_data, val_transcript, \
targ_indices, targ_values, \
targ_shape, n_frames = data_val.next_batch()
n_frames = np.reshape(n_frames, [-1])
feed = {seq_inputs: val_data, indices: targ_indices,values: targ_values, shape: targ_shape, seq_lens: n_frames}
del val_data, val_transcript,targ_indices,targ_values,targ_shape,n_frames
vl_loss, l_val_pr, err_rt_vl = sess.run([loss, log_prob, error_rate], feed_dict=feed)
del feed
loss_vl.append(vl_loss)
log_vl.append(l_val_pr)
err_vl.append(err_rt_vl)
print("epoch %d, step: %d, tr_loss: %.2f, vl_loss: %.2f, tr_err: %.2f, vl_err: %.2f"
% (epoch, step, np.mean(loss_tr), np.mean(loss_vl), err_rt_tr, err_rt_vl))
end_time = time.time()
elapsed = round(end_time - start_time, 3)
# On training set
# Select a random index within batch_size
sample_index = np.random.randint(0, batch_size)
# Fetch the target transcript
actual_str = [data_train.reverse_map[i] for i in transcript[sample_index]]
# Fetch the decoded path from probability map
pred_sparse = tf.SparseTensor(deco[0].indices, deco[0].values, deco[0].shape)
pred_dense = tf.sparse_tensor_to_dense(pred_sparse)
ans = pred_dense.eval()
#pred = [data_train.reverse_map[i] for i in ans[sample_index, :]]
pred = []
for i in ans[sample_index,:]:
if i == n_chars-1:
pred.append(data_train.reverse_map[0])
else:
pred.append(data_train.reverse_map[i])
print("time_elapsed for 200 steps: %.3f, " % (elapsed))
if epoch%2 == 0:
print("Sample mini-batch results: \n" \
"predicted string: ", np.array(pred))
print("actual string: ", np.array(actual_str))
print("On training set, the loss: %.2f, log_pr: %.3f, error rate %.3f:"% (loss_val, np.mean(l_pr), err_rt_tr))
print("On validation set, the loss: %.2f, log_pr: %.3f, error rate: %.3f" % (vl_loss, np.mean(l_val_pr), err_rt_vl))
# Save the trainable parameters after the end of an epoch
if epoch > 7:
path = saver.save(sess, 'model_%d' % epoch)
print("Session saved at: %s" % path)
np.save(results_fn, np.array([loss_tr, log_tr, loss_vl, log_vl, err_tr, err_vl], dtype=np.object))
except (KeyboardInterrupt, SystemExit, Exception), e:
print("Error/Interruption: %s" % str(e))
exc_type, exc_obj, exc_tb = sys.exc_info()
print("Line no: %d" % exc_tb.tb_lineno)
if epoch > 7:
print("Saving model: %s" % saver.save(sess, 'Last.cpkt'))
print("Current batch: %d" % data_train.b_id)
print("Current epoch: %d" % epoch)
print("Current step: %d"%step)
np.save(results_fn, np.array([loss_tr, log_tr, loss_vl, log_vl, err_tr, err_vl], dtype=np.object))
print("Clossing TF Session...")
sess.close()
print("Terminating Program...")
sys.exit(0)

I think you need to re-initialize your accumulators for each epoch.
So these ones must be put at the beginning, inside the loop.
loss_tr = []
log_tr = []
loss_vl = []
log_vl = []
err_tr = []
err_vl = []

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

How to load the last checkpoint in TensorFlow? - tensorflow

You can use tf.train.latest_checkpoint to get the latest checkpoint file and then load it manually using ckpt.restore: checkpoint_path = "./checkpoints/train" ckpt = tf.train.Checkpoint(encoder=encoder, decoder=decoder, ckpt_path = tf.train.latest_checkpoint(checkpoint_path) ckpt.restore(ckpt_path)

Related

How to solve TypeError: can’t convert CUDA tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first

Are there any tools/libraries that can convert tensorflow lstm model to .mlmodel format to run in iOS app

Training/Test data in tensorflow example

tensorflow error: restore checkpoint file

TensorFlow - Saver.restore not restoring all parameters

Categories

Resources