TensorFlow - Saver.restore not restoring all parameters - tensorflow

I was training a bidirectional LSTM RNN for nearly 24 hours, and because the error was oscillating I decided to decrease the learning rate before letting it continue training. Since the model is saved using Saver.save(sess,file) at every epoch, I terminated the training once the CTC loss had been minimised to approximately 115.
Now, after restoring the model, the initial error I am getting is somewhere around 162, which is inconsistent with the values I was getting in the 7th epoch and is instead what I got in the first epoch. So my impression is that either the "restore" function is not working or, if it is working, something else is preventing it from taking effect.
Here is my code:
# Build the model in a dedicated graph: a stacked bidirectional LSTM, a linear
# output layer, CTC loss/decoding, and a momentum optimizer with gradient clipping.
# NOTE(review): indentation was lost in this paste; presumably everything below
# is nested under `with graph.as_default():` — confirm against the original file.
graph = tf.Graph()
with graph.as_default():
# Graph creation
graph_start = time.time()
# Inputs are time-major (time_major=True is passed to the RNN below).
seq_inputs = tf.placeholder(tf.float32, shape= [None,batch_size,frame_length], name="sequence_inputs")
seq_lens = tf.placeholder(shape=[batch_size],dtype=tf.int32)
# NOTE(review): this rebinds `seq_inputs` from the placeholder to the output of
# seq_bn (defined elsewhere). The feed dicts built later use `seq_inputs` as
# their key, so they feed this tensor rather than the placeholder — confirm
# that this is intended.
seq_inputs = seq_bn(seq_inputs,seq_lens)
initializer = tf.truncated_normal_initializer(mean=0,stddev=0.1)
forward = tf.nn.rnn_cell.LSTMCell(num_units=num_units,
num_proj = hidden_size,
use_peepholes=use_peephole,
initializer=initializer,
state_is_tuple=True)
# The same cell object is repeated n_layers times to form the stack.
forward = tf.nn.rnn_cell.MultiRNNCell([forward] * n_layers, state_is_tuple=True)
backward = tf.nn.rnn_cell.LSTMCell(num_units=num_units,
num_proj= hidden_size,
use_peepholes=use_peephole,
initializer=initializer,
state_is_tuple=True)
backward = tf.nn.rnn_cell.MultiRNNCell([backward] * n_layers, state_is_tuple=True)
[fw_out,bw_out], _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=forward, cell_bw=backward, inputs=seq_inputs,time_major=True, dtype=tf.float32, sequence_length=tf.cast(seq_lens,tf.int64))
# Batch normalize forward output
# Moments are taken over axis 0; offset (0.1), scale (1) and epsilon (1e-6) are
# passed as constants, so this normalization has no trainable parameters.
mew,var_ = tf.nn.moments(fw_out,axes=[0])
fw_out = tf.nn.batch_normalization(fw_out,mew,var_,0.1,1,1e-6)
# fw_out = seq_bn(fw_out,seq_lens)
# Batch normalize backward output
mew,var_ = tf.nn.moments(bw_out,axes=[0])
bw_out = tf.nn.batch_normalization(bw_out,mew,var_,0.1,1,1e-6)
# bw_out = seq_bn(bw_out,seq_lens)
# Reshaping forward, and backward outputs for affine transformation
fw_out = tf.reshape(fw_out,[-1,hidden_size])
bw_out = tf.reshape(bw_out,[-1,hidden_size])
# Linear Layer params
W_fw = tf.Variable(tf.truncated_normal(shape=[hidden_size,n_chars],stddev=np.sqrt(2.0 / (hidden_size))))
W_bw = tf.Variable(tf.truncated_normal(shape=[hidden_size,n_chars],stddev=np.sqrt(2.0 / (hidden_size))))
# NOTE(review): the bias is a tf.constant, not a tf.Variable, so it is neither
# trained nor written to checkpoints.
b_out = tf.constant(0.1,shape=[n_chars])
# Perform an affine transformation
logits = tf.add(tf.add(tf.matmul(fw_out,W_fw),tf.matmul(bw_out,W_bw)),b_out)
logits = tf.reshape(logits,[-1,batch_size,n_chars])
# Use CTC Beam Search Decoder to decode pred string from the prob map
decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_lens)
# Target params
# The three placeholders are the components of the sparse target tensor.
indices = tf.placeholder(dtype=tf.int64, shape=[None,2])
values = tf.placeholder(dtype=tf.int32, shape=[None])
shape = tf.placeholder(dtype=tf.int64,shape=[2])
# Make targets
targets = tf.SparseTensor(indices,values,shape)
# Compute Loss
loss = tf.reduce_mean(tf.nn.ctc_loss(logits, targets, seq_lens))
# Compute error rate based on edit distance
# Total (unnormalized) edit distance divided by the number of target labels.
predicted = tf.to_int32(decoded[0])
error_rate = tf.reduce_sum(tf.edit_distance(predicted,targets,normalize=False))/ \
tf.to_float(tf.size(targets.values))
# Clip gradients by global norm before applying them with momentum SGD.
tvars = tf.trainable_variables()
grad, _ = tf.clip_by_global_norm(tf.gradients(loss,tvars),max_grad_norm)
optimizer = tf.train.MomentumOptimizer(learning_rate=lr,momentum=momentum)
train_step = optimizer.apply_gradients(zip(grad,tvars))
graph_end = time.time()
print("Time elapsed for creating graph: %.3f"%(round(graph_end-graph_start,3)))
# steps per epoch
start_time = 0
steps = int(np.ceil(len(data_train.files)/batch_size))
# Per-step histories for training (tr) and validation (vl): loss, decoder
# log-probabilities and error rate.
# NOTE(review): these lists are created once here and only appended to in the
# training loop, so any mean taken over them (e.g. np.mean(loss_tr)) covers
# every step since the process started, not just the current epoch.
loss_tr = []
log_tr = []
loss_vl = []
log_vl = []
err_tr = []
err_vl = []
# Saver built with default arguments (no explicit variable list).
saver = tf.train.Saver()
# Resume training from the latest checkpoint and continue from epoch 7, step 189.
# NOTE(review): indentation was lost in this paste, so the nesting of the
# validation, sampling and checkpoint sections inside the epoch/step loops
# cannot be determined from this text — confirm against the original file.
with tf.Session(config=config) as sess:
# Variable initialization is disabled; values are expected to come from restore.
#sess.run(tf.initialize_all_variables())
checkpt_path = tf.train.latest_checkpoint(checkpoint_dir)
# NOTE(review): presumably Saver.restore returns None, so this prints "None"
# rather than a status — confirm.
print(saver.restore(sess,checkpt_path))
print("Model restore from 7th epoch 188th step")
# Pre-declared so the except handler below can reference them.
feed = None
epoch = None
step = None
try:
for epoch in range(7,epochs+1):
# Only the resumed epoch starts mid-way; later epochs start at step 0.
if epoch==7:
initial_step = 189
else:
initial_step = 0
transcript = []
loss_val = 0
l_pr = 0
start_time = time.time()
for step in range(initial_step,steps):
train_data, transcript, \
targ_indices, targ_values, \
targ_shape, n_frames = data_train.next_batch()
n_frames = np.reshape(n_frames,[-1])
feed = {seq_inputs: train_data, indices:targ_indices, values:targ_values, shape:targ_shape, seq_lens:n_frames}
# Drop references to the batch arrays once they are in the feed dict.
del train_data,targ_indices,targ_values,targ_shape,n_frames
# Evaluate loss value, decoded transcript, and log probability
_,loss_val,deco,l_pr,err_rt_tr = sess.run([train_step,loss,decoded,log_prob,error_rate],
feed_dict=feed)
del feed
loss_tr.append(loss_val)
log_tr.append(l_pr)
err_tr.append(err_rt_tr)
# On validation set
# Evaluation only: train_step is not among the fetches here.
val_data, val_transcript, \
targ_indices, targ_values, \
targ_shape, n_frames = data_val.next_batch()
n_frames = np.reshape(n_frames, [-1])
feed = {seq_inputs: val_data, indices: targ_indices,values: targ_values, shape: targ_shape, seq_lens: n_frames}
del val_data, val_transcript,targ_indices,targ_values,targ_shape,n_frames
vl_loss, l_val_pr, err_rt_vl = sess.run([loss, log_prob, error_rate], feed_dict=feed)
del feed
loss_vl.append(vl_loss)
log_vl.append(l_val_pr)
err_vl.append(err_rt_vl)
# NOTE(review): tr_loss / vl_loss printed here are means over the whole
# accumulated history (np.mean of the lists), not the latest step's loss.
print("epoch %d, step: %d, tr_loss: %.2f, vl_loss: %.2f, tr_err: %.2f, vl_err: %.2f"
% (epoch, step, np.mean(loss_tr), np.mean(loss_vl), err_rt_tr, err_rt_vl))
end_time = time.time()
elapsed = round(end_time - start_time, 3)
# On training set
# Select a random index within batch_size
sample_index = np.random.randint(0, batch_size)
# Fetch the target transcript
actual_str = [data_train.reverse_map[i] for i in transcript[sample_index]]
# Fetch the decoded path from probability map
# NOTE(review): these two calls create new graph ops every time this section
# runs, so the graph presumably keeps growing during training — confirm.
pred_sparse = tf.SparseTensor(deco[0].indices, deco[0].values, deco[0].shape)
pred_dense = tf.sparse_tensor_to_dense(pred_sparse)
ans = pred_dense.eval()
#pred = [data_train.reverse_map[i] for i in ans[sample_index, :]]
# Map decoded label ids back to characters; id n_chars-1 is mapped to
# reverse_map[0] instead of its own entry.
pred = []
for i in ans[sample_index,:]:
if i == n_chars-1:
pred.append(data_train.reverse_map[0])
else:
pred.append(data_train.reverse_map[i])
# NOTE(review): the message mentions 200 steps, but no 200-step condition is
# visible in this text.
print("time_elapsed for 200 steps: %.3f, " % (elapsed))
if epoch%2 == 0:
print("Sample mini-batch results: \n" \
"predicted string: ", np.array(pred))
print("actual string: ", np.array(actual_str))
print("On training set, the loss: %.2f, log_pr: %.3f, error rate %.3f:"% (loss_val, np.mean(l_pr), err_rt_tr))
print("On validation set, the loss: %.2f, log_pr: %.3f, error rate: %.3f" % (vl_loss, np.mean(l_val_pr), err_rt_vl))
# Save the trainable parameters after the end of an epoch
# The resumed epoch (7) itself is not saved by this condition.
if epoch > 7:
path = saver.save(sess, 'model_%d' % epoch)
print("Session saved at: %s" % path)
np.save(results_fn, np.array([loss_tr, log_tr, loss_vl, log_vl, err_tr, err_vl], dtype=np.object))
# NOTE(review): `except (...), e` is Python 2 syntax; Python 3 requires `as e`.
except (KeyboardInterrupt, SystemExit, Exception), e:
print("Error/Interruption: %s" % str(e))
exc_type, exc_obj, exc_tb = sys.exc_info()
print("Line no: %d" % exc_tb.tb_lineno)
# On interruption, save an emergency checkpoint (after epoch 7 only) together
# with the position in the data and the accumulated histories.
if epoch > 7:
print("Saving model: %s" % saver.save(sess, 'Last.cpkt'))
print("Current batch: %d" % data_train.b_id)
print("Current epoch: %d" % epoch)
print("Current step: %d"%step)
np.save(results_fn, np.array([loss_tr, log_tr, loss_vl, log_vl, err_tr, err_vl], dtype=np.object))
print("Clossing TF Session...")
sess.close()
print("Terminating Program...")
sys.exit(0)

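I think you need to re-initialize your accumulators for each epoch. The tr_loss and vl_loss you print are np.mean(loss_tr) and np.mean(loss_vl), i.e. averages over every step recorded since the script started, so after a restart they no longer match the running averages you saw before stopping.
So these lists must be created at the beginning of each epoch, inside the loop: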
loss_tr = []
log_tr = []
loss_vl = []
log_vl = []
err_tr = []
err_vl = []

Related

How to solve TypeError: can’t convert CUDA tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first

I am modifying the 'train_model' function below so that it plots loss and accuracy graphs at every epoch during training:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
# Train `model` for `num_epochs` epochs, alternating an 'aug1_train' phase and
# a 'valid' phase, plot loss/accuracy curves to a JPEG, and keep the weights
# with the best validation accuracy.
#
# Reads `dataloaders`, `dataset_sizes` and `device` from the enclosing scope.
# Returns (model with best weights loaded, losses, accuracies).
# NOTE(review): indentation was lost in this paste; the nesting of the
# draw_curve definition/call and of the final `if phase=='aug1_train'` block
# relative to the phase loop cannot be determined here — confirm.
since = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
losses=[]
accuracies=[]
y_loss = {} # loss history
y_loss['aug1_train'] = []
y_loss['valid'] = []
y_acc = {} # accuracy history
y_acc['aug1_train'] = []
y_acc['valid'] = []
x_epoch = []
# One figure with two side-by-side axes: loss (left) and accuracy (right).
fig = plt.figure()
ax0 = fig.add_subplot(121, title="loss")
ax1 = fig.add_subplot(122, title="accuracy")
for epoch in range(num_epochs):
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 10)
# Each epoch has a training and validation phase
for phase in ['aug1_train', 'valid']:
if phase == 'aug1_train':
# NOTE(review): the scheduler is stepped before any optimizer.step() in the
# epoch; recent PyTorch versions expect the opposite order — confirm.
scheduler.step()
model.train() # Set model to training mode
else:
model.eval() # Set model to evaluate mode
running_loss = 0.0
running_corrects = 0
# Iterate over data.
for inputs, labels,paths in dataloaders[phase]:
inputs = inputs.to(device)
labels = labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward
# track history if only in train
with torch.set_grad_enabled(phase == 'aug1_train'):
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels)
# backward + optimize only if in training phase
if phase == 'aug1_train':
loss.backward()
optimizer.step()
# statistics
# loss.item() is a Python float; torch.sum(...) is a tensor, so
# running_corrects becomes a tensor after the first batch.
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)
epoch_loss = running_loss / dataset_sizes[phase]
# epoch_acc is a tensor (result of .double()), not a Python number.
epoch_acc = running_corrects.double() / dataset_sizes[phase]
print('{} Loss: {:.4f} Acc: {:.4f} '.format(
phase, epoch_loss, epoch_acc))
y_loss[phase].append(epoch_loss)
# NOTE(review): this stores tensors in y_acc, which draw_curve later hands to
# matplotlib; on a CUDA device that conversion presumably fails — confirm.
y_acc[phase].append(epoch_acc)
# deep copy the model
if phase == 'valid' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model_wts = copy.deepcopy(model.state_dict())
# Re-plots the full history on both axes and overwrites the saved image.
def draw_curve(current_epoch):
x_epoch.append(current_epoch)
ax0.plot(x_epoch, y_loss['aug1_train'], 'bo-', label='train')
ax0.plot(x_epoch, y_loss['valid'], 'ro-', label='val')
ax1.plot(x_epoch, y_acc['aug1_train'], 'bo-', label='train')
ax1.plot(x_epoch, y_acc['valid'], 'ro-', label='val')
# Legends are added once, on the first epoch only.
if current_epoch == 0:
ax0.legend()
ax1.legend()
fig.savefig(os.path.join('/content/drive/My Drive/Stanford40/Graphs', 'train.jpg'))
draw_curve(epoch)
if phase=='aug1_train':
losses.append(epoch_loss)
accuracies.append(epoch_acc)
print()
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))
# load best model weights
model.load_state_dict(best_model_wts)
return model,losses,accuracies
and I load the DenseNet161 model for training as below:
#Load Pretrained Densenet161 model
model_ft = models.densenet161(pretrained=True)
# Replace the classifier with a new linear layer: 2208 input features, 11 outputs.
model_ft.classifier=nn.Linear(2208,11)
model_ft = model_ft.to(device)
criterion = nn.CrossEntropyLoss()
# Observe that all parameters are being optimized
opt = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
# Decay LR by a factor of 0.1 every 5 epochs (step_size=5)
sched = lr_scheduler.StepLR(opt, step_size=5, gamma=0.1)
Finally I run the code below to start training:
model_ft,losses,accuracies = train_model(model_ft, criterion,opt ,sched,num_epochs=30)
and got this error as in the picture below:
How can I modify the code to get rid of this error by using tensor.cpu()?
What if you try calling .item() here
running_corrects += torch.sum(preds == labels.data).item()
and remove double() when dividing?
epoch_acc = running_corrects / dataset_sizes[phase]
It's hard to say without a detailed error backtrace. The message is vague, but what it does tell us is that something, somewhere, received a tensor and tried to convert it to a NumPy array, which does not work for a tensor that lives on the GPU. My intuition is that it comes from the matplotlib code in your visualization step: I believe it is trying to convert the values stored in your loss/accuracy histories.
You should convert them to plain Python numbers (after back propagation has been performed) before storing them...
y_loss[phase].append(epoch_loss.item())
y_acc[phase].append(epoch_acc.item())

How to fix the fetch argument error when implementing a Bayesian Neural Network with TensorFlow

# Input pipeline: placeholders feed train/validation datasets, which are served
# through a reinitializable iterator; a string-handle ("feedable") iterator
# selects which underlying iterator produces features_final / labels_final.
placeholder_X = tf.placeholder(tf.float32, shape = [None, 19])
placeholder_y = tf.placeholder(tf.float32, shape = [None,1])
#Build an iterator over training batches
#training_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
training_dataset = tf.data.Dataset.from_tensor_slices((placeholder_X, placeholder_y))
#Shuffle the dataset (the shuffle buffer size, 20000, is meant to be much larger than the training set)
# and form batches of size batch_size
training_batches = training_dataset.shuffle(20000, reshuffle_each_iteration =True).repeat().batch(FLAGS.batch_size)
#training_iterator = tf.data.make_one_shot_iterator(training_batches)
#Build the validation dataset from the same placeholders, repeated and
# batched in groups of 500.
val_dataset = tf.data.Dataset.from_tensor_slices((placeholder_X, placeholder_y))
val_batches = val_dataset.repeat().batch(500)
#heldout_iterator = tf.data.make_one_shot_iterator(heldout_batches)
# The test set is built directly from the arrays, not from the placeholders.
test_dataset = tf.data.Dataset.from_tensor_slices((X_test,y_test))
test_dataset = test_dataset.batch(500)
#Combine these into a feedable iterator that can switch between training
# and validation inputs.
# The `handle` string placeholder chooses the active iterator at run time.
handle = tf.placeholder(tf.string, shape = [])
feedable_iterator = tf.data.Iterator.from_string_handle(handle, training_batches.output_types, training_batches.output_shapes)
features_final, labels_final = feedable_iterator.get_next()
#create reinitializable iterator for Train and Validation, one-shot iterator for Test
# NOTE(review): despite their names, training_iterator and val_iterator are the
# initializer ops returned by make_initializer, not iterators.
train_val_iterator = tf.data.Iterator.from_structure(training_batches.output_types, training_batches.output_shapes)
training_iterator = train_val_iterator.make_initializer(training_batches)
val_iterator = train_val_iterator.make_initializer(val_batches)
test_iterator = test_dataset.make_one_shot_iterator()
def main(argv):
# Build a Bayesian neural network from tfp DenseReparameterization layers,
# train it on an annealed negative-ELBO loss, and periodically report
# held-out log-probability, validation accuracy and test metrics.
# NOTE(review): indentation was lost in this paste, so the nesting of the
# evaluation sections inside the training loop cannot be determined from this
# text — confirm against the original file.
# extract the activation function from the hyperopt spec as an attribute from the tf.nn module
#activation = getattr(tf.nn, FLAGS.activation_function)
# define the graph
#with tf.Graph().as_default():
# Building the Bayesian Neural Network
# we use the Gaussian reparameterization trick
# to compute the stochastic gradients as described in the paper
with tf.compat.v1.name_scope("bayesian_neural_net", values =[features_final]):
neural_net = tf.keras.Sequential()
# Hidden layers: FLAGS.num_hidden_layers layers of 10 ReLU units each.
for i in range(FLAGS.num_hidden_layers):
layer = tfp.layers.DenseReparameterization(
units = 10,
activation = tf.nn.relu,
trainable = True,
kernel_prior_fn=tfp.layers.default_multivariate_normal_fn, # NormalDiag
kernel_posterior_fn=tfp.layers.default_mean_field_normal_fn(),
#kernel_posterior_fn=tfp_layers_util.default_mean_field_normal_fn(), # softplus(sigma)
kernel_posterior_tensor_fn=lambda x: x.sample(),
bias_prior_fn=tfp.layers.default_multivariate_normal_fn, # NormalDiag
bias_posterior_fn=tfp.layers.default_mean_field_normal_fn(), # softplus(sigma)
bias_posterior_tensor_fn=lambda x: x.sample()
)
neural_net.add(layer)
neural_net.add(tfp.layers.DenseReparameterization(
units=2, # two output units
activation= tf.nn.softmax, # NOTE(review): softmax output is passed below as `logits` to tfd.Categorical — confirm this is intended
trainable=True, # i.e subject to optimization
kernel_prior_fn=tfp.layers.default_multivariate_normal_fn, # NormalDiag with hyperopt sigma
kernel_posterior_fn=tfp.layers.default_mean_field_normal_fn(), # softplus(sigma)
kernel_posterior_tensor_fn=lambda x: x.sample(),
bias_prior_fn =tfp.layers.default_multivariate_normal_fn, # NormalDiag with hyperopt sigma
bias_posterior_fn=tfp.layers.default_mean_field_normal_fn(), # softplus(sigma)
bias_posterior_tensor_fn=lambda x: x.sample()
))
logits = neural_net(features_final)
#labels_distribution = tfd.Bernoulli(logits=logits)
labels_distribution = tfd.Categorical(logits=logits)
#labels_distribution = tfd.Bernoulli(logits=logits)
# Perform KL annealing. The optimal number of annealing steps
# depends on the dataset and architecture.
# `t` counts training steps; it is incremented by update_step_op below.
t = tf.Variable(0.0)
kl_regularizer = t / (FLAGS.kl_annealing * len(X_train) / FLAGS.batch_size)
#Compute the -ELBO as the loss. The kl term is annealed from 0 to 1
# (t starts at 0.0 and the weight is capped by tf.minimum(1.0, ...)) over
# the epochs specified by the kl_annealing flag.
log_likelihood = labels_distribution.log_prob(labels_final)
#neg_log_likelihood = tf.reduce_mean(tf.squared_difference(logits,labels_final))
neg_log_likelihood = -tf.reduce_mean(input_tensor = log_likelihood)
kl = sum(neural_net.losses)/len(X_train) * tf.minimum(1.0, kl_regularizer)
elbo_loss = neg_log_likelihood + kl
# Build metrics for evaluation. Predictions are formed from a single forward
# pass of the probabilistic layers. They are cheap but noisy predictions
predictions = tf.argmax(input = logits, axis=1)
predictions = tf.cast(predictions, tf.float32)
# TP, TN, FP, FN
# NOTE(review): predictions comes from argmax over axis 1 while the label
# placeholder is declared with shape [None,1]; these products may broadcast
# instead of multiplying element-wise — confirm.
TP = tf.count_nonzero(predictions * labels_final)
TN = tf.count_nonzero((predictions - 1) * (labels_final - 1))
FP = tf.count_nonzero(predictions * (labels_final - 1))
FN = tf.count_nonzero((predictions - 1) * labels_final)
# precision, recall, f1
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * precision * recall / (precision + recall)
tpr = TP/(TP+FN)
# NOTE(review): fpr uses the same denominator as tpr (TP+FN); a false-positive
# rate is conventionally FP/(FP+TN) — confirm.
fpr = FP/(TP+FN)
#create reinitializable iterator for Train and Validation, one-shot iterator for Test
# These four statements repeat the module-level iterator setup as locals.
train_val_iterator = tf.data.Iterator.from_structure(training_batches.output_types, training_batches.output_shapes)
training_iterator = train_val_iterator.make_initializer(training_batches)
val_iterator = train_val_iterator.make_initializer(val_batches)
test_iterator = test_dataset.make_one_shot_iterator()
# Separate streaming-accuracy metrics are created under the "train", "valid"
# and "test" name scopes.
with tf.compat.v1.name_scope("train"):
train_accuracy, train_accuracy_update_op = tf.metrics.accuracy(labels=labels_final,predictions =predictions)
opt = tf.train.AdamOptimizer(FLAGS.learning_rate)
train_op = opt.minimize(elbo_loss)
update_step_op = tf.assign(t, t+1)
with tf.compat.v1.name_scope("valid"):
valid_accuracy, validation_accuracy_update_op = tf.metrics.accuracy(labels= labels_final,predictions = predictions)
with tf.compat.v1.name_scope("test"):
test_accuracy, test_accuracy_update_op = tf.metrics.accuracy(labels = labels_final,predictions = predictions)
init_op = tf.group(tf.global_variables_initializer(),
tf.local_variables_initializer())
saver = tf.train.Saver()
# Local variables whose name contains "valid" are collected so they can be
# re-initialized via reset_valid_op.
stream_vars_valid = [ v for v in tf.local_variables() if "valid" in v.name]
reset_valid_op = tf.variables_initializer(stream_vars_valid)
valid_accuracy_summary = []
stop_early =0
with tf.compat.v1.Session() as sess:
sess.run(init_op)
# Run the training loop
train_val_string, test_string = sess.run([
train_val_iterator.string_handle(),
test_iterator.string_handle()])
training_steps = int(round(FLAGS.epochs * (len(X_train) / FLAGS.batch_size)))
for step in range(training_steps):
#start reinitializable's train iterator
sess.run(training_iterator, feed_dict = {placeholder_X:X_train, placeholder_y:y_train})
#
_ = sess.run([train_op,train_accuracy_update_op, update_step_op],feed_dict={handle: train_val_string})
# Manually print the frequency
if step % 100 == 0:
save_path = saver.save(sess, "/tmp/my_model.ckpt")
loss_value, accuracy_value, kl_value = sess.run([elbo_loss, train_accuracy, kl], feed_dict= {handle: train_val_string})
# NOTE(review): the format string has three fields but four arguments are
# passed, so the value printed after "KL:" is accuracy_value and kl_value is
# not printed.
print("Step:{:>3d} loss : {:.3f} KL: {:.3f}" .format(step , loss_value, accuracy_value, kl_value))
if (step +1) % FLAGS.eval_freq ==0:
# Compute log prob of heldout set by averaging draws from the model:
# p(heldout | train) = int_model p(heldout|model) p(model|train) ~= 1/n * sum_{i=1}^n p(heldout | model_i)
# where model_i is a draw from the posterior
#p(model|train)
probs = np.asarray([sess.run((labels_distribution.probs),
feed_dict ={handle: train_val_string})
for _ in range(FLAGS.num_monte_carlo)])
# NOTE(review): casting the averaged probabilities to int32 truncates any
# value below 1 to 0, so np.log below presumably yields -inf — confirm.
mean_probs = np.mean(probs, axis =0).astype(np.int32)
print(mean_probs.dtype)
_, label_vals = sess.run((features_final, labels_final), feed_dict = {handle: train_val_string})
label_vals = (label_vals).astype(np.int32)
heldout_lp = np.mean(np.log(mean_probs[np.arange(mean_probs.shape[0]), label_vals]))
print(" ...Held_out nats: {:.3f}".format(heldout_lp))
# Calculate validation accuracy
# NOTE(review): this loop reuses the name `step`; if it is nested inside the
# training loop it shadows the training step within that iteration, and the
# `step > 100` test below then sees the validation index — confirm.
for step in range(10):
#start reinitializable's validation iterator
sess.run(val_iterator, feed_dict = {placeholder_X:X_val, placeholder_y:y_val})
sess.run(validation_accuracy_update_op, feed_dict={handle:train_val_string})
valid_value = sess.run(valid_accuracy, feed_dict={handle:train_val_string})
valid_accuracy_summary.append(valid_value)
# Early stopping: count evaluations that fall below the best validation
# accuracy seen so far; the break fires when the counter reaches 40.
if valid_value < max(valid_accuracy_summary) and step > 100:
stop_early += 1
if stop_early == 40:
break
else:
stop_early = 0
print("Validation Accuracy: {:.3f}".format(valid_value))
sess.run(reset_valid_op)
#Feed the test string handle to the feedable iterator
# NOTE(review): the result is assigned back to `f1`, replacing the f1 tensor
# with a NumPy value; a later sess.run([... f1]) then receives a non-tensor.
test_value, precision_value, recall_value, fpr_value, tpr_value,f1 = sess.run([test_accuracy, precision, recall, fpr, tpr,f1],feed_dict={handle: test_string})
print("Step: {:>3d} test Accuracy: {:.3f} Precision: {:.3f} Recall: {:.3f} ".format(step, test_value, precision_value, recall_value))
print("Step: {:>3d} fpr: {:.3f} tpr: {:.3f} f1_1: {:.3f}".format( step, fpr_value, tpr_value,f1))
if __name__ == "__main__":
tf.compat.v1.app.run()
I expected the training to keep running, but after a few evaluation rounds it fails with the error shown at the end of this output:
Step: 0 loss : 0.646 KL: 0.875
Step:100 loss : 0.654 KL: 0.904
Step:200 loss : 0.657 KL: 0.906
Step:300 loss : 0.648 KL: 0.906
int32
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:137: RuntimeWarning: divide by zero encountered in log
...Held_out nats: -inf
Validation Accuracy: 0.914
Step: 9 test Accuracy: 0.000 Precision: 0.910 Recall: 1.000
Step: 9 fpr: 0.099 tpr: 1.000 f1_1: 0.953
Step:400 loss : 0.624 KL: 0.906
Step:500 loss : 0.641 KL: 0.906
Step:600 loss : 0.612 KL: 0.906
Step:700 loss : 0.579 KL: 0.906
int32
...Held_out nats: -inf
Validation Accuracy: 0.914
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in __init__(self, fetches, contraction_fn)
302 self._unique_fetches.append(ops.get_default_graph().as_graph_element(
--> 303 fetch, allow_tensor=True, allow_operation=True))
304 except TypeError as e:
14 frames
TypeError: Can not convert a float64 into a Tensor or Operation.
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in __init__(self, fetches, contraction_fn)
305 raise TypeError('Fetch argument %r has invalid type %r, '
306 'must be a string or Tensor. (%s)' %
--> 307 (fetch, type(fetch), str(e)))
308 except ValueError as e:
309 raise ValueError('Fetch argument %r cannot be interpreted as a '
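The exception arises because the result of sess.run is assigned back to the name f1, which until then referred to the F1 tensor. After the first evaluation f1 is a NumPy float64, so the next sess.run([..., f1]) receives a number instead of a tensor and fails. You need to use a different name on the left-hand side of this line: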
test_value, precision_value, recall_value, fpr_value, tpr_value,f1 = sess.run([test_accuracy, precision, recall, fpr, tpr,f1],feed_dict={handle: test_string})
change the line to
test_value, precision_value, recall_value, fpr_value, tpr_value,f1_value = sess.run([test_accuracy, precision, recall, fpr, tpr,f1],feed_dict={handle: test_string})
Hopefully, this will work.

Training/Test data in tensorflow example

I'm new to TensorFlow and I am following the example linked here, but I have one question.
Codes are as follows:
import numpy as np
import tensorflow as tf
from time import time
import math
from include.data import get_data_set
from include.model import model, lr
# Module-level setup: load the data, build the model graph, define the loss,
# optimizer and accuracy ops, then open a session and restore the latest
# checkpoint if one can be restored.
train_x, train_y = get_data_set("train")
test_x, test_y = get_data_set("test")
x, y, output, y_pred_cls, global_step, learning_rate = model()
# Best test accuracy seen so far; 0 means "no evaluation yet" (see test_and_save).
global_accuracy = 0
# PARAMS
_BATCH_SIZE = 128
_EPOCH = 60
_SAVE_PATH = "./tensorboard/cifar-10-v1.0.0/"
# LOSS AND OPTIMIZER
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=output, labels=y))
# NOTE: `optimizer` is the training op returned by minimize(), not the
# AdamOptimizer object; running it applies one update and advances global_step.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
beta1=0.9,
beta2=0.999,
epsilon=1e-08).minimize(loss, global_step=global_step)
# PREDICTION AND ACCURACY CALCULATION
correct_prediction = tf.equal(y_pred_cls, tf.argmax(y, axis=1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# SAVER
# NOTE(review): `merged` is not referenced anywhere else in the visible code.
merged = tf.summary.merge_all()
saver = tf.train.Saver()
sess = tf.Session()
train_writer = tf.summary.FileWriter(_SAVE_PATH, sess.graph)
# Restore the most recent checkpoint under _SAVE_PATH; on ValueError
# (presumably raised when no checkpoint path is found — confirm) fall back to
# freshly initialized variables.
try:
print("\nTrying to restore last checkpoint ...")
last_chk_path = tf.train.latest_checkpoint(checkpoint_dir=_SAVE_PATH)
saver.restore(sess, save_path=last_chk_path)
print("Restored checkpoint from:", last_chk_path)
except ValueError:
print("\nFailed to restore checkpoint. Initializing variables instead.")
sess.run(tf.global_variables_initializer())
def train(epoch):
    """Run one training epoch, then evaluate on the test set.

    Iterates over ``train_x`` / ``train_y`` in consecutive slices of
    ``_BATCH_SIZE`` samples, running the optimizer op on each slice with the
    learning rate given by ``lr(epoch)``. A progress line is printed every
    tenth batch. After the last batch, ``test_and_save`` is called with the
    final global step value.

    Args:
        epoch: Zero-based epoch index, passed to ``lr`` and ``test_and_save``.
    """
    num_batches = int(math.ceil(len(train_x) / _BATCH_SIZE))
    i_global = 0
    bar_len = 29
    msg = "Global step: {:>5} - [{}] {:>3}% - acc: {:.4f} - loss: {:.4f} - {:.1f} sample/sec"

    for batch_idx in range(num_batches):
        lo = batch_idx * _BATCH_SIZE
        hi = (batch_idx + 1) * _BATCH_SIZE
        batch_xs = train_x[lo:hi]
        batch_ys = train_y[lo:hi]

        # Time only the session call so the throughput figure reflects it.
        tic = time()
        i_global, _, batch_loss, batch_acc = sess.run(
            [global_step, optimizer, loss, accuracy],
            feed_dict={x: batch_xs, y: batch_ys, learning_rate: lr(epoch)})
        duration = time() - tic

        # Progress is reported on every tenth batch only.
        if batch_idx % 10 != 0:
            continue

        percentage = int(round((batch_idx / num_batches) * 100))
        filled_len = int((bar_len * int(percentage)) / 100)
        bar = '=' * filled_len + '>' + '-' * (bar_len - filled_len)
        print(msg.format(i_global, bar, percentage, batch_acc, batch_loss, _BATCH_SIZE / duration))

    test_and_save(i_global, epoch)
def test_and_save(_global_step, epoch):
    """Evaluate on the test set and checkpoint when accuracy improves.

    Runs ``y_pred_cls`` over ``test_x`` in slices of ``_BATCH_SIZE`` samples.
    Only the prediction op is fetched; the optimizer op is not run here.
    Accuracy is computed against ``np.argmax(test_y, axis=1)``.

    The module-level ``global_accuracy`` holds the best accuracy seen so far.
    While it is still 0 the current accuracy is only recorded. Afterwards, a
    strictly better accuracy writes an "Accuracy/test" summary, saves the
    session and updates ``global_accuracy``.

    Args:
        _global_step: Global step value used to tag the summary and checkpoint.
        epoch: Zero-based epoch index (printed as ``epoch + 1``, passed to ``lr``).
    """
    global global_accuracy

    i = 0
    # The builtin ``int`` replaces the ``np.int`` alias, which was removed in
    # NumPy 1.24; the resulting dtype is the same.
    predicted_class = np.zeros(shape=len(test_x), dtype=int)
    while i < len(test_x):
        j = min(i + _BATCH_SIZE, len(test_x))
        batch_xs = test_x[i:j, :]
        batch_ys = test_y[i:j, :]
        predicted_class[i:j] = sess.run(
            y_pred_cls,
            feed_dict={x: batch_xs, y: batch_ys, learning_rate: lr(epoch)}
        )
        i = j

    correct = (np.argmax(test_y, axis=1) == predicted_class)
    acc = correct.mean()*100
    correct_numbers = correct.sum()

    mes = "\nEpoch {} - accuracy: {:.2f}% ({}/{})"
    print(mes.format((epoch+1), acc, correct_numbers, len(test_x)))

    if global_accuracy != 0 and global_accuracy < acc:
        summary = tf.Summary(value=[
            tf.Summary.Value(tag="Accuracy/test", simple_value=acc),
        ])
        train_writer.add_summary(summary, _global_step)

        saver.save(sess, save_path=_SAVE_PATH, global_step=_global_step)

        mes = "This epoch receive better accuracy: {:.2f} > {:.2f}. Saving session..."
        print(mes.format(acc, global_accuracy))
        global_accuracy = acc
    elif global_accuracy == 0:
        # First evaluation: record the baseline without saving.
        global_accuracy = acc

    print("###########################################################################################################")
def main():
    """Train for ``_EPOCH`` epochs, printing a banner before each one."""
    banner = "\nEpoch: {0}/{1}\n"
    for epoch_idx in range(_EPOCH):
        print(banner.format((epoch_idx + 1), _EPOCH))
        train(epoch_idx)
if __name__ == "__main__":
main()
sess.close()
In this example, I think both the test and the training data are fed to the network, whereas normally only the training data should be used for training. I cannot see any difference between the train() and test_and_save() functions. Am I wrong? Thanks
Here is an explanation if I understood your question correctly. The train function is called every epoch and iterates through the training data. At the end of the epoch the test_and_save function is called where the model accuracy is evaluated. This iterates through the test data on the learned weights and calculates the accuracy and saves the model. This is repeated _EPOCH times.
Edit: The model is saved in the test_and_save function. However, the weights are only updated (gradients calculated) when optimizer is passed through sess.run() in the train function. In the test_and_save function the test data is fed to the network however only y_pred_cls is evaluated by passing to sess.run().

ValueError: No gradients provided for any variable, check your graph for ops that do not support gradients, between variables

I am training the "Show and Tell" model using TensorFlow, in which the model automatically generates captions for images. However, I am getting this error.
This is the traceback:
------------------------------------------------------------------------
---
ValueError Traceback (most recent call
last)
<ipython-input-36-b6da0a27b701> in <module>()
1 try:
2 #train(.001,False,False) #train from scratch
----> 3 train(.001,True,True) #continue training from pretrained weights #epoch500
4 #train(.001) #train from previously saved weights
5 except KeyboardInterrupt:
ipython-input-35-39693d0edd0a> in train(learning_rate, continue_training, transfer)
31 learning_rate = tf.train.exponential_decay(learning_rate, global_step,
32 int(len(index)/batch_size), 0.95)
---> 33 train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
34 tf.global_variables_initializer().run()
35
/home/niraj/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/optimizer.pyc in minimize(self, loss, global_step, var_list, gate_gradients, aggregation_method, colocate_gradients_with_ops, name, grad_loss)
320 "No gradients provided for any variable, check your graph for ops"
321 " that do not support gradients, between variables %s
and loss %s." %
--> 322 ([str(v) for _, v in grads_and_vars], loss))
323
324 return self.apply_gradients(grads_and_vars,
global_step=global_step,
ValueError: No gradients provided for any variable, check your graph for ops that do not support gradients, between variables ["tf.Variable 'word_embedding:0' shape=(2943, 256) dtype=float32_ref>", "tf.Variable 'embedding_bias:0' shape=(256,) dtype=float32_ref>", "tf.Variable 'img_embedding:0' shape=(4096, 256) dtype=float32_ref>", "tf.Variable 'img_embedding_bias:0' shape=(256,) dtype=float32_ref>", "tf.Variable 'word_encoding:0' shape=(256, 2943) dtype=float32_ref>", "tf.Variable 'word_encoding_bias:0' shape=(2943,) dtype=float32_ref>"] and loss Tensor("RNN/div:0", shape=(), dtype=float32).
I know that the error means there is no gradient path between the listed variables and the loss, i.e. something in the graph is cutting the connection between them during optimisation, but I am unable to pick it out.
I am using already trained VGG-net 16 model parameters and the FLICKR-30 image dataset having corresponding annotations.
This is the code:
def get_data(annotation_path, feature_path):
    """Load image features and their captions.

    Args:
        annotation_path: Path to a tab-separated file without a header row;
            its two columns are read as ``image`` and ``caption``.
        feature_path: Path to a ``.npy`` file, opened memory-mapped read-only.

    Returns:
        A ``(features, captions)`` tuple: the memory-mapped feature array and
        the values of the ``caption`` column.
    """
    annotation_table = pd.read_table(
        annotation_path,
        sep='\t',
        header=None,
        names=['image', 'caption'],
    )
    captions = annotation_table['caption'].values
    features = np.load(feature_path, mmap_mode='r')
    return features, captions
def preProBuildWordVocab(sentence_iterator, word_count_threshold=30):  # function from Andrej Karpathy's NeuralTalk
    """Build word/index vocabularies and an initial output-bias vector.

    Sentences are lower-cased and split on single spaces. Words occurring at
    least ``word_count_threshold`` times form the vocabulary and receive
    indices starting at 1. Index 0 is ``'#START#'`` in ``wordtoix`` and
    ``'.'`` in ``ixtoword``.

    Args:
        sentence_iterator: Iterable of caption strings.
        word_count_threshold: Minimum occurrence count for a word to be kept.

    Returns:
        ``(wordtoix, ixtoword, bias_init_vector)``. The bias vector is float32
        and holds, per index, the log of the normalized word count shifted so
        that its maximum is 0. ``'.'`` is counted once per sentence.
    """
    print('preprocessing %d word vocab' % (word_count_threshold, ))

    # Tally every token and count the sentences in a single pass.
    word_counts = {}
    nsents = 0
    for sentence in sentence_iterator:
        nsents += 1
        for token in sentence.lower().split(' '):
            word_counts[token] = word_counts.get(token, 0) + 1

    vocab = [word for word, count in word_counts.items() if count >= word_count_threshold]
    print('preprocessed words %d -> %d' % (len(word_counts), len(vocab)))

    # Index 0 is reserved; vocabulary words are numbered from 1.
    ixtoword = {0: '.'}
    wordtoix = {'#START#': 0}
    for ix, word in enumerate(vocab, start=1):
        wordtoix[word] = ix
        ixtoword[ix] = word

    # The end-of-sentence token is given one occurrence per sentence.
    word_counts['.'] = nsents
    frequencies = np.array([1.0 * word_counts[ixtoword[ix]] for ix in ixtoword])
    log_probs = np.log(frequencies / np.sum(frequencies))
    log_probs -= np.max(log_probs)
    return wordtoix, ixtoword, log_probs.astype(np.float32)
class Caption_Generator():
def __init__(self, dim_in, dim_embed, dim_hidden, batch_size, n_lstm_steps, n_words, init_b):
# Store the model dimensions and create the trainable variables: word
# embedding (+ bias), image embedding (+ bias), the LSTM cell, and the
# output projection (+ bias) from the LSTM output to word scores.
# `init_b` is used as the initial value of the output bias.
self.dim_in = dim_in
self.dim_embed = dim_embed
self.dim_hidden = dim_hidden
self.batch_size = batch_size
self.n_lstm_steps = n_lstm_steps
self.n_words = n_words
# declare the variables to be used for our word embeddings
# NOTE(review): indentation was lost in this paste, so it is unclear which of
# the following statements sit inside the `tf.device("/cpu:0")` scope — confirm.
with tf.device("/cpu:0"):
self.word_embedding = tf.Variable(tf.random_uniform([self.n_words, self.dim_embed], -0.1, 0.1), name='word_embedding')
self.embedding_bias = tf.Variable(tf.zeros([dim_embed]), name='embedding_bias')
# declare the LSTM itself
self.lstm = tf.contrib.rnn.BasicLSTMCell(dim_hidden)
# declare the variables to be used to embed the image feature embedding to the word embedding space
# NOTE(review): the image embedding is shaped [dim_in, dim_hidden] while the
# word embedding uses dim_embed; the two presumably need to be equal for both
# to be fed to the same LSTM — confirm.
self.img_embedding = tf.Variable(tf.random_uniform([dim_in, dim_hidden], -0.1, 0.1), name='img_embedding')
self.img_embedding_bias = tf.Variable(tf.zeros([dim_hidden]), name='img_embedding_bias')
# declare the variables to go from an LSTM output to a word encoding output
self.word_encoding = tf.Variable(tf.random_uniform([dim_hidden, n_words], -0.1, 0.1), name='word_encoding')
# initialize this bias variable from the preProBuildWordVocab output
self.word_encoding_bias = tf.Variable(init_b, name='word_encoding_bias')
def build_model(self):
# declaring the placeholders for our extracted image feature vectors, our caption, and our mask
# (describes how long our caption is with an array of 0/1 values of length `maxlen`
img = tf.placeholder(tf.float32, [self.batch_size, self.dim_in])
caption_placeholder = tf.placeholder(tf.int32, [self.batch_size, self.n_lstm_steps])
mask = tf.placeholder(tf.float32, [self.batch_size, self.n_lstm_steps])
# getting an initial LSTM embedding from our image_imbedding
image_embedding = tf.matmul(img, self.img_embedding) + self.img_embedding_bias
# setting initial state of our LSTM
state = self.lstm.zero_state(self.batch_size, dtype=tf.float32)
total_loss = 0.0
with tf.variable_scope("RNN"):
for i in range(self.n_lstm_steps):
if i > 0:
#if this isn’t the first iteration of our LSTM we need to get the word_embedding corresponding
# to the (i-1)th word in our caption
with tf.device("/cpu:0"):
current_embedding = tf.nn.embedding_lookup(self.word_embedding, caption_placeholder[:,i-1]) + self.embedding_bias
else:
#if this is the first iteration of our LSTM we utilize the embedded image as our input
current_embedding = image_embedding
if i > 0:
# allows us to reuse the LSTM tensor variable on each iteration
tf.get_variable_scope().reuse_variables()
out, state = self.lstm(current_embedding, state)
#out, state = self.tf.nn.dynamic_rnn(current_embedding, state)
if i > 0:
#get the one-hot representation of the next word in our caption
labels = tf.expand_dims(caption_placeholder[:, i], 1)
ix_range=tf.range(0, self.batch_size, 1)
ixs = tf.expand_dims(ix_range, 1)
concat = tf.concat([ixs, labels],1)
onehot = tf.sparse_to_dense(
concat, tf.stack([self.batch_size, self.n_words]), 1.0, 0.0)
#perform a softmax classification to generate the next word in the caption
logit = tf.matmul(out, self.word_encoding) + self.word_encoding_bias
xentropy = tf.nn.softmax_cross_entropy_with_logits(logits=logit, labels=onehot)
xentropy = xentropy * mask[:,i]
loss = tf.reduce_sum(xentropy)
total_loss += loss
total_loss = total_loss / tf.reduce_sum(mask[:,1:])
return total_loss, img, caption_placeholder, mask
### Parameters ###
# Module-level hyper-parameters read by train().
# Embedding / hidden sizes handed to Caption_Generator.
dim_embed = 256
dim_hidden = 256
# Width of one image feature vector (the model's `dim_in`).
dim_in = 4096
# Number of examples per training step; placeholders are built with this fixed size.
batch_size = 128
# NOTE(review): not referenced by the code visible in this section - confirm it is used elsewhere.
momentum = 0.9
# Number of passes over the training data made by train().
n_epochs = 150
def train(learning_rate=0.001, continue_training=False, transfer=True):
    """Train the caption generator and write a checkpoint after every epoch.

    Relies on names defined elsewhere in the file: get_data,
    preProBuildWordVocab, annotation_path, feature_path, model_path,
    model_path_transfer, sequence, and the module-level hyper-parameters.

    Args:
        learning_rate: initial learning rate; it is decayed by a factor of
            0.95 every int(len(index) / batch_size) optimizer steps.
        continue_training: when true, restore the latest checkpoint before
            training.
        transfer: selects the checkpoint directory used when restoring:
            model_path_transfer if true, model_path otherwise.
    """
    tf.reset_default_graph()

    feats, captions = get_data(annotation_path, feature_path)
    wordtoix, ixtoword, init_b = preProBuildWordVocab(captions)
    np.save('data/ixtoword', ixtoword)

    index = (np.arange(len(feats)).astype(int))
    np.random.shuffle(index)

    sess = tf.InteractiveSession()
    n_words = len(wordtoix)
    maxlen = np.max([len(caption.split(' ')) for caption in captions])

    # Keyword arguments: the constructor order is (dim_in, dim_embed,
    # dim_hidden, ...), and passing dim_hidden/dim_embed positionally in the
    # opposite order silently swaps them as soon as the two sizes differ.
    caption_generator = Caption_Generator(
        dim_in=dim_in,
        dim_embed=dim_embed,
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        n_lstm_steps=maxlen+2,
        n_words=n_words,
        init_b=init_b)

    loss, image, sentence, mask = caption_generator.build_model()

    # NOTE(review): the Saver is created before global_step and the optimizer
    # exist, so presumably it only tracks the model variables created so far
    # (optimizer state and global_step are not checkpointed) - confirm against
    # the tf.train.Saver documentation. It is left here on purpose: moving it
    # would change the checkpoint contents and break restoring older files.
    saver = tf.train.Saver(max_to_keep=100)

    global_step = tf.Variable(0, trainable=False)
    decayed_learning_rate = tf.train.exponential_decay(
        learning_rate, global_step, int(len(index)/batch_size), 0.95)
    # global_step has to be handed to minimize() so that it is incremented on
    # every step; otherwise the decay schedule above never advances.
    train_op = tf.train.AdamOptimizer(decayed_learning_rate).minimize(
        loss, global_step=global_step)

    tf.global_variables_initializer().run()

    if continue_training:
        checkpoint_dir = model_path_transfer if transfer else model_path
        saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir))

    for epoch in range(n_epochs):
        # Pairing the two ranges yields only full batches; a trailing partial
        # batch is dropped, which the fixed-size placeholders require.
        for start, end in zip(range(0, len(index), batch_size),
                              range(batch_size, len(index), batch_size)):
            current_feats = feats[index[start:end]]
            current_captions = captions[index[start:end]]

            # Map words to ids, dropping the last token of every caption and
            # any word that is not in the vocabulary.
            current_caption_ind = [
                [wordtoix[word] for word in cap.lower().split(' ')[:-1] if word in wordtoix]
                for cap in current_captions]

            current_caption_matrix = sequence.pad_sequences(
                current_caption_ind, padding='post', maxlen=maxlen+1)
            # Prepend a column of zeros (id 0 is '#START#').
            current_caption_matrix = np.hstack(
                [np.full((len(current_caption_matrix), 1), 0), current_caption_matrix])

            # Mask row = ones for the first (number of non-zero ids + 2)
            # positions, zeros afterwards.
            current_mask_matrix = np.zeros(
                (current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array([(row != 0).sum()+2 for row in current_caption_matrix])
            for ind, row in enumerate(current_mask_matrix):
                row[:nonzeros[ind]] = 1

            _, loss_value = sess.run([train_op, loss], feed_dict={
                image: current_feats.astype(np.float32),
                sentence: current_caption_matrix.astype(np.int32),
                mask: current_mask_matrix.astype(np.float32)
                })

            print("Current Cost: ", loss_value, "\t Epoch {}/{}".format(epoch, n_epochs), "\t Iter {}/{}".format(start,len(feats)))
        print("Saving the model from epoch: ", epoch)
        saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
Branching in the loss building routine is invalid.
with tf.variable_scope("RNN"):
for i in range(self.n_lstm_steps):
if i > 0:
[...]
else:
[...]
if i > 0:
[...]
if i > 0:
[...]
Note that the last two `if i > 0` blocks will never run: they are nested inside the `else` clause, where `i <= 0`. Consequently your loss is actually a constant equal to 0, so TensorFlow cannot see how to optimise it with respect to the variables.

Training embeddings with TensorFlow: loss increases at the start of each epoch, then decreases

I wrote some simple code that does something like word2vec, but during training the cross-entropy loss increases at the beginning of each epoch and then decreases. Please help me find out whether there is a mistake in my code; I have already reviewed it many times...
from tensorflow.python import debug as tf_debug
import math
import os
import time
import random
import numpy as np
import tensorflow as tf
def _log_and_shuffle_block(data, epoch, filename, line_cnt, start):
    """Print the load time of a finished block and shuffle it in place."""
    elapsed_time = (time.time() - start) * 1000
    print("load block data: epoch %d, filename %s line_cnt %d, size %d, elapsed time %f ms" % (epoch, filename, line_cnt, len(data), elapsed_time))
    random.shuffle(data)


def gen_next_block(filenames, epochs, block_size=4096000):
    """Yield shuffled blocks of training records, file by file, epoch by epoch.

    Every input line has the form ``a,b,w,n0;n1;...``. It becomes the record
    ``[int(a), int(b), float(w), int(n_epoch)]``, i.e. each epoch picks the
    entry of the ``;``-separated list at index ``epoch``.

    Args:
        filenames: paths of the files to read; all of them are read once per
            epoch, in the given order.
        epochs: number of passes over ``filenames``.
        block_size: maximum number of records per yielded block (the default
            keeps the previously hard-coded value).

    Yields:
        Lists of records, shuffled in place. A block never spans two files;
        the remainder of a file is yielded as a shorter block.

    Raises:
        ValueError: if ``block_size`` is not positive.
        IndexError: if a line has fewer than ``epoch + 1`` entries in its
            ``;``-separated list.
    """
    if block_size <= 0:
        raise ValueError("block_size must be positive, got %r" % (block_size,))
    for epoch in range(epochs):
        for filename in filenames:
            with open(filename) as f:
                start = time.time()
                line_cnt = 0
                data = []
                for line in f:
                    line = line.strip()
                    if not line:
                        # A blank line carries no record; skipping it avoids
                        # a ValueError from int('').
                        continue
                    fields = line.split(',')
                    chosen = fields[3].split(';')[epoch]
                    data.append([int(fields[0]), int(fields[1]), float(fields[2]), int(chosen)])
                    line_cnt += 1
                    if line_cnt % block_size == 0:
                        _log_and_shuffle_block(data, epoch, filename, line_cnt, start)
                        yield data
                        # Start a fresh list so the yielded block is never
                        # mutated afterwards.
                        data = []
                        start = time.time()
                if data:
                    _log_and_shuffle_block(data, epoch, filename, line_cnt, start)
                    yield data
# Module-level state shared between successive generate_batch() calls.
# Block of records currently being served (None until the first block is loaded).
data = None
# Generator returned by gen_next_block(); created lazily by generate_batch().
next_block_generator = None
# Position of the next record to read inside `data`.
data_index = 0
# Value of data_index at the start of the previous batch; generate_batch()
# compares the two to decide when to fetch the next block.
last_time_data_index = 0
def generate_batch(filenames, epochs, batch_size):
    """Return the next training batch, loading a new data block when needed.

    The function keeps its position in the module-level globals ``data``,
    ``data_index``, ``last_time_data_index`` and ``next_block_generator``.
    A new block is requested whenever the read position has not moved past
    the position recorded at the start of the previous batch (which is the
    case on the very first call and after the position wrapped around).
    Because the position wraps with a modulo, a batch that reaches the end
    of a block is completed with records from the start of the same block.

    Args:
        filenames: files handed to gen_next_block on first use.
        epochs: number of epochs handed to gen_next_block on first use.
        batch_size: number of records per batch.

    Returns:
        A tuple (batch, labels, negative_labels, weights, negative_weights)
        of 1-D arrays of length batch_size, filled from record fields 0, 1,
        3 and 2 respectively; negative_weights is all ones.

    Raises:
        Exception: when the block generator is exhausted.
    """
    global data
    global data_index
    global last_time_data_index
    global next_block_generator

    if next_block_generator is None:
        next_block_generator = gen_next_block(filenames,epochs)

    if data_index <= last_time_data_index:
        data = next(next_block_generator,None)
        data_index = last_time_data_index = 0

    # Guard clause: no block left means every file was read `epochs` times.
    if data is None:
        raise Exception("finish load file list [%s] %d times" % (','.join(filenames),epochs))

    last_time_data_index = data_index

    batch = np.empty(batch_size, dtype=np.int32)
    labels = np.empty(batch_size, dtype=np.int32)
    negative_labels = np.empty(batch_size, dtype=np.int32)
    weights = np.empty(batch_size, dtype=np.float32)
    negative_weights = np.ones(batch_size, dtype=np.float32)

    block_len = len(data)
    for slot in range(batch_size):
        record = data[data_index]
        batch[slot] = record[0]
        labels[slot] = record[1]
        weights[slot] = record[2]
        negative_labels[slot] = record[3]
        data_index = (data_index + 1) % block_len

    return batch, labels, negative_labels, weights, negative_weights
# Run configuration for the embedding training script below.
# Common path prefix of the training files; a numeric suffix is appended per file.
filename = 'data/dr_xianyu_item2vec_train_with_meta_20170725_dir/dr_xianyu_item2vec_train_with_meta_20170725_dir_'
# Ten input files: <prefix>0 ... <prefix>9.
filenames = [filename + str(i) for i in range(10)]
# Number of passes over the files requested from generate_batch().
epochs = 5
# Records per training step; also the fixed size of the input placeholders.
batch_size = 2048
embedding_size = 32 # Dimension of the embedding vector.
num_sampled = batch_size # Number of negative examples to sample.
# Number of rows of the embedding and output-weight tables.
vocabulary_size = 7483025 + 1
# Build the training graph: one positive and one negative target per input,
# each scored with a sigmoid cross-entropy loss.
# NOTE(review): the indentation of this listing was lost, so the exact nesting
# of the device / name scopes below cannot be determined from the text.
graph = tf.Graph()
with graph.as_default():
with tf.device('/cpu:0'):
# Placeholders fed on every step: input ids, positive and negative target
# ids, and one float weight per positive / negative example.
with tf.name_scope('input_data'):
train_inputs = tf.placeholder(tf.int32, shape=[batch_size], name = 'context_placeholder')
positive_labels = tf.placeholder(tf.int32, shape=[batch_size], name = 'target_placeholder')
negative_labels = tf.placeholder(tf.int32, shape=[num_sampled], name = 'negative_target_placeholder')
positive_weights = tf.placeholder(tf.float32, shape=([batch_size]), name = 'target_weight')
negative_weights = tf.placeholder(tf.float32, shape=([num_sampled]), name = 'negative_target_weight')
# Input embedding table, initialised uniformly in +/- 0.5/embedding_size.
with tf.name_scope('emb_layer'):
embeddings = tf.Variable(
tf.random_uniform([vocabulary_size, embedding_size], -0.5/embedding_size, 0.5/embedding_size), name = 'emb')
embed = tf.nn.embedding_lookup(embeddings, train_inputs)
# Output-side weights and biases, looked up for the positive and the negative targets.
with tf.name_scope("neg_layer"):
nce_weights = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -0.5/embedding_size, 0.5/embedding_size), name = 'nce_weight')
nce_biases = tf.Variable(tf.zeros([vocabulary_size]), name = 'nce_biase')
positive_embed = tf.nn.embedding_lookup(nce_weights,positive_labels)
positive_bias = tf.nn.embedding_lookup(nce_biases,positive_labels)
negative_embed = tf.nn.embedding_lookup(nce_weights,negative_labels)
negative_bias = tf.nn.embedding_lookup(nce_biases,negative_labels)
# Logit = elementwise product of input and target vectors summed over axis 1, plus the target bias.
positive_logits = tf.reduce_sum(tf.multiply(embed,positive_embed),1) + positive_bias
negative_logits = tf.reduce_sum(tf.multiply(embed,negative_embed),1) + negative_bias
# Positive pairs are trained towards label 1, negative pairs towards label 0.
with tf.name_scope('loss_layer'):
positive_xent = tf.nn.sigmoid_cross_entropy_with_logits(labels = tf.ones_like(positive_logits), logits = positive_logits)
negative_xent = tf.nn.sigmoid_cross_entropy_with_logits(labels = tf.zeros_like(negative_logits), logits = negative_logits)
# NOTE(review): the two weighted_* tensors below are not used by `loss` in the
# code visible here, so the fed positive/negative weights do not influence
# training - confirm whether the cross-entropy terms were meant to be weighted.
weighted_positive_logits = tf.multiply(positive_logits,positive_weights)
weighted_negative_logits = tf.multiply(negative_logits,negative_weights)
# Sum of both cross-entropy terms divided by batch_size*2.
loss = (tf.reduce_sum(positive_xent) + tf.reduce_sum(negative_xent)) /(batch_size*2)
with tf.name_scope('train'):
optimizer = tf.train.RMSPropOptimizer(0.001).minimize(loss)
# Alternative kept by the author: plain SGD with a staircase-decayed learning rate.
# global_step = tf.Variable(0, trainable=False)
# starter_learning_rate = 0.1
# learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, 20000, 0.8, staircase=True)
# optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss,global_step=global_step)
init = tf.global_variables_initializer()
init_local = tf.local_variables_initializer()
# Summaries: the scalar loss plus a histogram of every trainable variable.
tf.summary.scalar('loss_layer/loss', loss)
for var in tf.trainable_variables():
tf.summary.histogram(var.op.name, var)
summary_op = tf.summary.merge_all()
saver = tf.train.Saver()
# Run training until generate_batch() reports that the data is exhausted.
with tf.Session(graph=graph) as sess:
    for v in tf.global_variables():
        print(v.name,v.device,v.shape)
    for v in tf.local_variables():
        print(v.name,v.device,v.shape)

    # Restoring from 'tmp/model.ckpt' is currently disabled; the variables
    # are always initialised from scratch.
    init.run()
    init_local.run()
    print("model init")

    summary_writer = tf.summary.FileWriter('tmp/log', sess.graph)
    average_loss = 0
    start = time.time()
    step = 1
    try:
        while True:
            batch_inputs, batch_labels, batch_negative_labels, positive_weights_np, negative_weights_np = generate_batch(filenames, epochs,batch_size)
            feed_dict = {train_inputs: batch_inputs, positive_labels: batch_labels, negative_labels: batch_negative_labels, positive_weights:positive_weights_np, negative_weights:negative_weights_np}
            # Summaries are only evaluated and written every 1000th step.
            if step%1000 == 0:
                loss_val,summary_str,_ = sess.run([loss, summary_op, optimizer], feed_dict=feed_dict)
                summary_writer.add_summary(summary_str,step)
            else:
                loss_val,_ = sess.run([loss, optimizer], feed_dict=feed_dict)
            average_loss += loss_val
            if step % 1000 == 0:
                average_loss /= 1000
                end = time.time()
                # Seconds for 1000 steps * 1000 / 1000 steps = average
                # milliseconds per step.
                elapsed_time = (end - start)*1000 / 1000
                print('Average loss at step ', step, ': ', average_loss, 'time cost', elapsed_time, 'ms')
                average_loss = 0
                start = time.time()
            if step % 20000 == 0:
                print('save model...')
                save_path = saver.save(sess,'tmp/model.ckpt')
                print("saved model in",save_path)
            step +=1
    except Exception as e:
        # generate_batch() signals the end of the data with a plain
        # Exception, so this is the normal way out of the loop.
        # `except Exception as e` / print(e) is valid on Python 2.6+ and 3,
        # unlike the old `except Exception, e` / `print e` spelling.
        # NOTE(review): any other error raised during training is reported
        # the same way - only its message is printed.
        print(e)
    # `step` is the number of the batch that was about to run, so the number
    # of completed batches is one less.
    print("total batch count %d" % (step - 1))
    summary_writer.flush()
Here are my loss curves:
The first picture was generated with SGD over 5 epochs.
The second picture was generated with RMSProp over 2 epochs (still running).