I tried to test my trained model.
#!/usr/bin/env python
# Copyright (c) 2016 Artsiom Sanakoyeu
from __future__ import division
from chainer import iterators
import cmd_options
import dataset
import os
import time
import regressionnet
import tensorflow as tf
import copy
from tqdm import tqdm
import numpy as np
import math
import pprint
import datetime
from regressionnet import evaluate_pcp, create_sumamry
def evaluate(net, pose_loss_op, test_iterator, summary_writer, tag='test/pose_loss'):
test_it = copy.copy(test_iterator)
total_loss = 0.0
cnt = 0
num_batches = int(math.ceil(len(test_it.dataset) / test_it.batch_size))
print len(test_it.dataset)
for batch in tqdm(test_it, total=num_batches):
feed_dict = regressionnet.fill_joint_feed_dict(net,
regressionnet.batch2feeds(batch)[:3],
conv_lr=0.0,
fc_lr=0.0,
phase='test')
global_step, loss_value = net.sess.run([net.global_iter_counter, pose_loss_op],
feed_dict=feed_dict)
total_loss += loss_value * len(batch)
cnt += len(batch)
avg_loss = total_loss / len(test_it.dataset)
print 'Step {} {} = {:.3f}'.format(global_step, tag, avg_loss)
summary_writer.add_summary(create_sumamry(tag, avg_loss),
global_step=global_step)
assert cnt == 1000, 'cnt = {}'.format(cnt)
def train_loop(net, saver, loss_op, pose_loss_op, train_op, dataset_name, train_iterator, test_iterator,
val_iterator=None,
max_iter=None,
test_step=None,
snapshot_step=None,
log_step=1,
batch_size=None,
conv_lr=None,
fc_lr=None,
fix_conv_iter=None,
output_dir='results',
):
summary_step = 50
with net.graph.as_default():
summary_writer = tf.summary.FileWriter(output_dir, net.sess.graph)
summary_op = tf.summary.merge_all()
fc_train_op = net.graph.get_operation_by_name('fc_train_op')
global_step = None
for step in xrange(max_iter + 1):
# test, snapshot
if step % test_step == 0 or step + 1 == max_iter or step == fix_conv_iter:
global_step = net.sess.run(net.global_iter_counter)
evaluate_pcp(net, pose_loss_op, test_iterator, summary_writer,
dataset_name=dataset_name,
tag_prefix='test')
if val_iterator is not None:
evaluate_pcp(net, pose_loss_op, val_iterator, summary_writer,
dataset_name=dataset_name,
tag_prefix='val')
if step % snapshot_step == 0 and step > 1:
checkpoint_prefix = os.path.join(output_dir, 'checkpoint')
assert global_step is not None
saver.save(net.sess, checkpoint_prefix, global_step=global_step)
if step == max_iter:
break
# training
start_time = time.time()
feed_dict = regressionnet.fill_joint_feed_dict(net,
regressionnet.batch2feeds(train_iterator.next())[:3],
conv_lr=conv_lr,
fc_lr=fc_lr,
phase='train')
if step < fix_conv_iter:
feed_dict['lr/conv_lr:0'] = 0.0
if step < fix_conv_iter:
cur_train_op = fc_train_op
else:
cur_train_op = train_op
if step % summary_step == 0:
global_step, summary_str, _, loss_value = net.sess.run(
[net.global_iter_counter,
summary_op,
cur_train_op,
pose_loss_op],
feed_dict=feed_dict)
summary_writer.add_summary(summary_str, global_step=global_step)
else:
global_step, _, loss_value = net.sess.run(
[net.global_iter_counter, cur_train_op, pose_loss_op],
feed_dict=feed_dict)
duration = time.time() - start_time
if step % log_step == 0 or step + 1 == max_iter:
print('Step %d: train/pose_loss = %.2f (%.3f s, %.2f im/s)'
% (global_step, loss_value, duration,
batch_size // duration))
def main(argv):
"""
Run training of the Deeppose stg-1
"""
args = cmd_options.get_arguments(argv)
if not os.path.exists(args.o_dir):
os.makedirs(args.o_dir)
suffix = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
with open(os.path.join(args.o_dir, 'params.dump_{}.txt'.format(suffix)), 'w') as f:
f.write('{}\n'.format(pprint.pformat(args)))
net, loss_op, pose_loss_op, train_op = regressionnet.create_regression_net(
n_joints=args.n_joints,
init_snapshot_path=args.snapshot,
is_resume=args.resume,
reset_iter_counter=args.reset_iter_counter,
reset_moving_averages=args.reset_moving_averages,
optimizer_type=args.optimizer,
gpu_memory_fraction=0.32, # Set how much GPU memory to reserve for the network
net_type=args.net_type)
with net.graph.as_default():
saver = tf.train.Saver()
print 'args.resume: {}\nargs.snapshot: {}'.format(args.resume, args.snapshot)
bbox_extension_range = (args.bbox_extension_min, args.bbox_extension_max)
if bbox_extension_range[0] is None or bbox_extension_range[1] is None:
bbox_extension_range = None
test_bbox_extension_range = None
else:
test_bbox_extension_range = (bbox_extension_range[1], bbox_extension_range[1])
train_dataset = dataset.PoseDataset(
args.train_csv_fn, args.img_path_prefix, args.im_size,
fliplr=args.fliplr,
rotate=args.rotate,
rotate_range=args.rotate_range,
shift=args.shift,
bbox_extension_range=bbox_extension_range,
min_dim=args.min_dim,
coord_normalize=args.coord_normalize,
gcn=args.gcn,
fname_index=args.fname_index,
joint_index=args.joint_index,
symmetric_joints=args.symmetric_joints,
ignore_label=args.ignore_label,
should_downscale_images=args.should_downscale_images,
downscale_height=args.downscale_height
)
test_dataset = dataset.PoseDataset(
args.test_csv_fn, args.img_path_prefix, args.im_size,
fliplr=False, rotate=False,
shift=None,
bbox_extension_range=test_bbox_extension_range,
coord_normalize=args.coord_normalize,
gcn=args.gcn,
fname_index=args.fname_index,
joint_index=args.joint_index,
symmetric_joints=args.symmetric_joints,
ignore_label=args.ignore_label,
should_return_bbox=True,
should_downscale_images=args.should_downscale_images,
downscale_height=args.downscale_height
)
np.random.seed(args.seed)
train_iterator = iterators.MultiprocessIterator(train_dataset, args.batch_size,
n_processes=args.workers, n_prefetch=3)
test_iterator = iterators.MultiprocessIterator(
test_dataset, args.batch_size,
repeat=False, shuffle=False,
n_processes=1, n_prefetch=1)
val_iterator = None
if args.val_csv_fn is not None and args.val_csv_fn != '':
small_train_dataset = dataset.PoseDataset(
args.val_csv_fn,
args.img_path_prefix, args.im_size,
fliplr=False, rotate=False,
shift=None,
bbox_extension_range=test_bbox_extension_range,
coord_normalize=args.coord_normalize,
gcn=args.gcn,
fname_index=args.fname_index,
joint_index=args.joint_index,
symmetric_joints=args.symmetric_joints,
ignore_label=args.ignore_label,
should_return_bbox=True,
should_downscale_images=args.should_downscale_images,
downscale_height=args.downscale_height
)
val_iterator = iterators.MultiprocessIterator(
small_train_dataset, args.batch_size,
repeat=False, shuffle=False,
n_processes=1, n_prefetch=1)
train_loop(net, saver, loss_op, pose_loss_op, train_op, args.dataset_name,
train_iterator, test_iterator,
val_iterator=val_iterator,
max_iter=args.max_iter,
test_step=args.test_step,
log_step=args.log_step,
snapshot_step=args.snapshot_step,
batch_size=args.batch_size,
conv_lr=args.conv_lr,
fc_lr=args.fc_lr,
fix_conv_iter=args.fix_conv_iter,
output_dir=args.o_dir
)
if __name__ == '__main__':
import sys
main(sys.argv[1:])
This is the code I used.
I trained it for about 370,000 iterations and then tried to test the trained model.
But it shows these messages:
Data loss: not an sstable (bad magic number): perhaps your file is in a different file format and you need to use a different restore operator?
Traceback (most recent call last):
DataLossError (see above for traceback): Unable to open table file out/lsp_alexnet_imagenet_small/checkpoint-370000.data-00000-of-00001: Data loss: not an sstable (bad magic number): perhaps your file is in a different file format and you need to use a different restore operator?
How can I resolve this problem?
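For reference, with the V2 saver each snapshot is split into an .index file and one or more .data-* shards, and tf.train.Saver.restore expects the common prefix rather than the shard name. A minimal restore sketch, assuming the paths from the error message above and that the evaluation graph matches the training graph:

import tensorflow as tf

# ... rebuild the same graph as during training, then:
saver = tf.train.Saver()
# latest_checkpoint returns a prefix such as
# 'out/lsp_alexnet_imagenet_small/checkpoint-370000' (no .data-00000-of-00001 suffix)
ckpt_prefix = tf.train.latest_checkpoint('out/lsp_alexnet_imagenet_small')
with tf.Session() as sess:
    saver.restore(sess, ckpt_prefix)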
I have read many similar questions and just cannot get this to work properly.
My model trains well and checkpoint files are written every epoch. I want the program to continue from epoch x once reloaded, and to print which epoch it is on with every iteration. I could simply save the epoch count outside of the checkpoint file, but I also want to do it this way to gain confidence that everything else is being stored properly.
Unfortunately the value in the epoch/global_step variable is always still 0 when I restart.
import tensorflow as tf
import numpy as np
# more imports
def extract_number(f):  # used to get the latest checkpoint file
    s = re.findall(r"epoch(\d+)\.ckpt", f)
    return (int(s[0]) if s else -1, f)
def restore(init_op, sess, saver): # called to restore or just initialise model
list = glob(os.path.join("./params/e*"))
if list:
file = max(list,key=extract_number)
saver.restore(sess, file[:-5])
sess.run(init_op)
return
with tf.Graph().as_default() as g:
# build models
total_batch = data.train.num_examples / batch_size
epochLimit = 51
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
restore(init_op, sess, saver)
epoch = global_step.eval()
while epoch < epochLimit:
total_batch = data.train.num_examples / batch_size
for i in range(int(total_batch)):
sys.stdout.flush()
voxels = newData.eval()
batch_z = np.random.uniform(-1, 1, [batch_size, z_size]).astype(np.float32)
sess.run(opt_G, feed_dict={z:batch_z, train:True})
sess.run(opt_D, feed_dict={input:voxels, z:batch_z, train:True})
with open("out/loss.csv", 'a') as f:
batch_loss_G = sess.run(loss_G, feed_dict={z:batch_z, train:False})
batch_loss_D = sess.run(loss_D, feed_dict={input:voxels, z:batch_z, train:False})
msgOut = "Epoch: [{0}], i: [{1}], G_Loss[{2:.8f}], D_Loss[{3:.8f}]".format(epoch, i, batch_loss_G, batch_loss_D)
print(msgOut)
epoch=epoch+1
sess.run(global_step.assign(epoch))
saver.save(sess, "params/epoch{0}.ckpt".format(epoch))
batch_z = np.random.uniform(-1, 1, [batch_size, z_size]).astype(np.float32)
voxels = sess.run(x_, feed_dict={z:batch_z})
v = voxels[0].reshape([32, 32, 32]) > 0
util.save_binvox(v, "out/epoch{0}.vox".format(epoch), 32)
I also update the global step variable using assign at the bottom. Any ideas? Any help would be greatly appreciated.
When you call sess.run(init_op) after restoring, this resets all variables to their initial values. Comment that line out and things should work.
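A minimal sketch of that restore-or-initialize pattern — run init_op only when no checkpoint is found — reusing the question's extract_number helper and its ./params layout (both are assumptions here):

import os
from glob import glob

def restore_or_init(sess, saver, init_op):
    # With the V2 saver, every snapshot has an .index file next to its data shards.
    index_files = glob(os.path.join("./params/e*.ckpt.index"))
    if index_files:
        latest = max(index_files, key=extract_number)
        saver.restore(sess, latest[:-len(".index")])  # pass the prefix, e.g. ./params/epoch7.ckpt
    else:
        sess.run(init_op)  # only initialize when starting from scratch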
My original code was wrong for several reasons because I was trying so many things. The first responder, Alexandre Passos, makes a valid point, but I believe what also changed the game was the use of variable scopes (maybe?).
Below is the working updated code if it helps anyone:
import tensorflow as tf
import numpy as np
# more imports
def extract_number(f):  # used to get the latest checkpoint file
    s = re.findall(r"epoch(\d+)\.ckpt", f)
    return (int(s[0]) if s else -1, f)
def restore(sess, saver): # called to restore or just initialise model
list = glob(os.path.join("./params/e*"))
if list:
file = max(list,key=extract_number)
saver.restore(sess, file[:-5])
return saver, True, sess
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
sess.run(init_op)
return saver, False , sess
batch_size = 100
learning_rate = 0.0001
beta1 = 0.5
z_size = 100
save_interval = 1
data = dataset.read()
total_batch = data.train.num_examples / batch_size
def fill_queue():
for i in range(int(total_batch*epochLimit)):
        sess.run(enqueue_op, feed_dict={batch: data.train.next_batch(batch_size)})  # running in a separate thread to feed a FIFOQueue
with tf.variable_scope("glob"):
global_step = tf.get_variable(name='global_step', initializer=0,trainable=False)
# build models
epochLimit = 51
saver = tf.train.Saver()
with tf.Session() as sess:
saver,rstr,sess = restore(sess, saver)
with tf.variable_scope("glob", reuse=True):
epocht = tf.get_variable(name='global_step', trainable=False, dtype=tf.int32)
epoch = epocht.eval()
while epoch < epochLimit:
total_batch = data.train.num_examples / batch_size
for i in range(int(total_batch)):
sys.stdout.flush()
voxels = newData.eval()
batch_z = np.random.uniform(-1, 1, [batch_size, z_size]).astype(np.float32)
sess.run(opt_G, feed_dict={z:batch_z, train:True})
sess.run(opt_D, feed_dict={input:voxels, z:batch_z, train:True})
with open("out/loss.csv", 'a') as f:
batch_loss_G = sess.run(loss_G, feed_dict={z:batch_z, train:False})
batch_loss_D = sess.run(loss_D, feed_dict={input:voxels, z:batch_z, train:False})
msgOut = "Epoch: [{0}], i: [{1}], G_Loss[{2:.8f}], D_Loss[{3:.8f}]".format(epoch, i, batch_loss_G, batch_loss_D)
print(msgOut)
epoch=epoch+1
sess.run(global_step.assign(epoch))
saver.save(sess, "params/epoch{0}.ckpt".format(epoch))
batch_z = np.random.uniform(-1, 1, [batch_size, z_size]).astype(np.float32)
voxels = sess.run(x_, feed_dict={z:batch_z})
v = voxels[0].reshape([32, 32, 32]) > 0
util.save_binvox(v, "out/epoch{0}.vox".format(epoch), 32)
I built my own convolutional neural network, in which I track the moving averages of all trainable variables (TensorFlow 1.0):
variable_averages = tf.train.ExponentialMovingAverage(
0.9999, global_step)
variables_averages_op = variable_averages.apply(tf.trainable_variables())
train_op = tf.group(apply_gradient_op, variables_averages_op)
saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
summary_op = tf.summary.merge(summaries)
init = tf.global_variables_initializer()
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=False))
sess.run(init)
# start queue runners
tf.train.start_queue_runners(sess=sess)
summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
# training loop
start_time = time.time()
for step in range(FLAGS.max_steps):
_, loss_value = sess.run([train_op, loss])
duration = time.time() - start_time
start_time = time.time()
assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
if step % 1 == 0:
# print current model status
num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
examples_per_sec = num_examples_per_step/duration
sec_per_batch = duration/FLAGS.num_gpus
format_str = '{} step{}, loss {}, {} examples/sec, {} sec/batch'
print(format_str.format(datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))
if step % 50 == 0:
summary_str = sess.run(summary_op)
summary_writer.add_summary(summary_str, step)
if step % 10 == 0 or step == FLAGS.max_steps:
print('save checkpoint')
# save checkpoint file
checkpoint_file = os.path.join(FLAGS.train_dir, 'model.ckpt')
saver.save(sess, checkpoint_file, global_step=step)
This works fine and checkpoint files are saved (saver version V2). Then I try to restore the checkpoints in another script for evaluating the model. There I have this piece of code:
# Restore the moving average version of the learned variables for eval.
variable_averages = tf.train.ExponentialMovingAverage(
MOVING_AVERAGE_DECAY)
variables_to_restore = variable_averages.variables_to_restore()
saver = tf.train.Saver(variables_to_restore)
where I get the error "NotFoundError (see above for traceback): Key conv1/Variable/ExponentialMovingAverage not found in checkpoint", where conv1/Variable is a variable scope.
This error occurs even before I try to restore the variables. Can you please help me solve it?
Thanks in advance
TheJude
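One way to narrow such a mismatch down is to list the keys the checkpoint actually contains before building the eval graph; a small inspection sketch (the checkpoint prefix is a placeholder):

import tensorflow as tf

# Placeholder: point this at one of your saved prefixes, e.g. .../model.ckpt-1000
reader = tf.train.NewCheckpointReader('/path/to/model.ckpt-1000')
for name in sorted(reader.get_variable_to_shape_map()):
    print(name)
# If no '.../ExponentialMovingAverage' keys show up, the shadow variables were
# never saved; if they do show up, the eval graph is naming its variables differently.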
I solved it in this way:
Call tf.reset_default_graph() before creating the second ExponentialMovingAverage(...) in the graph.
# reset the graph before create a new ema
tf.reset_default_graph()
# Restore the moving average version of the learned variables for eval.
variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY)
variables_to_restore = variable_averages.variables_to_restore()
saver = tf.train.Saver(variables_to_restore)
It took me 2 hours...
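An equivalent way to avoid the clash, if you would rather not reset the default graph, is to build the evaluation model and its saver inside a separate graph; a sketch under the same TF 1.x API:

eval_graph = tf.Graph()
with eval_graph.as_default():
    # rebuild the eval model here, then:
    variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY)
    variables_to_restore = variable_averages.variables_to_restore()
    saver = tf.train.Saver(variables_to_restore)
with tf.Session(graph=eval_graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint(FLAGS.train_dir))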
I am running TensorFlow 0.12.1 on a GPU. I have a trained deep CNN model whose weights I've saved using a checkpoint file. During inference, I reload the saved checkpoint using restorer.restore(sess, tf.train.latest_checkpoint(FLAGS.train_dir)). The code seems to run without issues, but every time I re-run the script I get screwed-up outputs. AFAIK, I do not shuffle my test set inputs. The inputs are being loaded and fed to the network properly; it is just that different runs of the CNN on the same test set, in the same order, produce very different outputs. I'm perplexed! Also, how do I execute a graph loaded from a saved checkpoint without running an init_op during inference? It seems my code requires all global and local variables to be initialized before execution. (I initialize first, and only then restore the checkpoint!) Here's a snippet of my code:
import tensorflow as tf
import numpy as np
import os
import os.path
from datetime import datetime
import time
import random
import json
from tensorflow.python.framework import ops
from tensorflow.python.framework import dtypes
from modelFCNN3 import model
def read_input(inp_queue,height=224,width=224,channels=3, mask=False):
value = tf.read_file(inp_queue)
image = tf.image.decode_png(value)
image = tf.image.resize_images(image, [height, width],method=2)
image = tf.cast(image, tf.uint8)
image.set_shape([height,width,channels])
image = tf.reshape(image,[height,width,channels])
if mask:
image = tf.to_float(tf.greater_equal(image,128))
image = tf.cast(image,tf.float32)
else:
image = tf.image.per_image_standardization(image)
image = tf.cast(image,tf.float32)
return image
if __name__ == '__main__':
tf.reset_default_graph()
with open('X_test.json', 'r') as infile:
X_test = json.load(infile)
with open('y_test.json', 'r') as infile:
y_test = json.load(infile)
imagelist = ops.convert_to_tensor(X_test, dtype=dtypes.string)
labellist = ops.convert_to_tensor(y_test, dtype=dtypes.string)
input_queue = tf.train.slice_input_producer([imagelist, labellist],
num_epochs=1,
shuffle=False)
image = read_input(input_queue[0],height=224,width=224,channels=3, mask=False)
label = read_input(input_queue[1],height=224,width=224,channels=1, mask=True)
images_batch, labels_batch = tf.train.batch([image, label], batch_size=FLAGS.batch_size,
enqueue_many=False,shapes=None, allow_smaller_final_batch=True)
global_step = tf.Variable(0, trainable=False)
images = tf.placeholder_with_default(images_batch, shape=[None, 224,224,3])
labels = tf.placeholder_with_default(labels_batch, shape=[None, 224,224,1])
restorer = tf.train.Saver()
logits = model(images).logits
labels = tf.cast(labels,tf.int32)
labels.set_shape([FLAGS.batch_size,224,224,1])
valid_prediction = tf.argmax(tf.nn.softmax(logits), dimension=3)
valid_prediction.set_shape([FLAGS.batch_size,224,224])
meanIOU,update_op_mIOU= tf.contrib.metrics.streaming_mean_iou(tf.cast(valid_prediction,tf.int32), tf.squeeze(labels),FLAGS.num_classes)
init = tf.global_variables_initializer()
init_locals = tf.local_variables_initializer()
with tf.Session() as sess:
sess.run([init, init_locals])
restorer.restore(sess, tf.train.latest_checkpoint(FLAGS.train_dir))
print("Model restored.")
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord,sess=sess)
summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
try:
step = 0
avg = []
while not coord.should_stop():
myimg, predimg, mylbl= sess.run([images,valid_prediction,labels])
mIOU,_ = sess.run([meanIOU,update_op_mIOU])
avg.append(mIOU)
step += 1
except tf.errors.OutOfRangeError:
print('Done training -- epoch limit reached')
finally:
coord.request_stop()
coord.join(threads)
sess.close()
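On the init_op part of the question: Saver.restore supplies values for every variable it covers, so a global init is not required as long as the checkpoint contains all global variables (an assumption about your checkpoint); only the local variables behind streaming_mean_iou and the num_epochs counter still need initializing. A sketch of that variant of the session block:

with tf.Session() as sess:
    # Local variables (metric accumulators, epoch counters) are not saved in
    # checkpoints, so they are the only thing initialized here.
    sess.run(tf.local_variables_initializer())
    restorer.restore(sess, tf.train.latest_checkpoint(FLAGS.train_dir))
    # Sanity check: anything printed here was neither restored nor initialized.
    print(sess.run(tf.report_uninitialized_variables()))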
Are you running on the same machine or a different machine?
#saver = tf.train.Saver()
The following note is from the TensorFlow docs:
#NOTE: Restarting training from saved meta_graph only works if the device assignments have not changed.
#saver = tf.train.import_meta_graph(metafile)
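If device assignments are indeed the problem (the graph was saved on a machine with a different device layout), importing the meta graph with clear_devices=True strips them; a sketch, where metafile is a placeholder path to the saved .meta file:

metafile = '/path/to/model.ckpt-1000.meta'  # placeholder
saver = tf.train.import_meta_graph(metafile, clear_devices=True)
with tf.Session() as sess:
    saver.restore(sess, metafile[:-len('.meta')])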
I have built and trained a TensorFlow model, but unfortunately the checkpoint file cannot be opened, as shown by the error below.
Now there isn't an error, but a bunch of warnings that don't really tell you anything.
This happens when I run the evaluation code:
import tensorflow as tf
import main
import Process
import Input
eval_dir = "/Users/Zanhuang/Desktop/NNP"
checkpoint_dir = "/Users/Zanhuang/Desktop/NNP"
ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
def evaluate():
with tf.Graph().as_default() as g:
images, labels = Process.eval_inputs()
forward_propgation_results = Process.forward_propagation(images)
init_op = tf.initialize_all_variables()
saver = tf.train.Saver()
top_k_op = tf.nn.in_top_k(forward_propgation_results, labels, 1)
with tf.Session(graph = g) as sess:
tf.train.start_queue_runners(sess = sess)
sess.run(init_op)
saver.restore(sess, eval_dir)
if ckpt and ckpt.model_checkpoint_path:
saver.restore(sess, ckpt.model_checkpoint_path)
for i in range(100):
print(sess.run(top_k_op))
def main(argv = None):
evaluate()
if __name__ == '__main__':
tf.app.run()
This is how I generated the checkpoint file:
if step % 2 == 0:
checkpoint_path = os.path.join(FLAGS.data_dir, 'model.ckpt')
saver.save(sess, checkpoint_path, global_step = step)
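As a side note, saver.restore expects a checkpoint prefix rather than a directory, so the saver.restore(sess, eval_dir) call above will not find the files written by saver.save. A sketch that reuses the ckpt state already loaded at the top of the script (assuming FLAGS.data_dir and checkpoint_dir point to the same folder):

with tf.Session(graph=g) as sess:
    sess.run(init_op)
    if ckpt and ckpt.model_checkpoint_path:
        # e.g. '/Users/Zanhuang/Desktop/NNP/model.ckpt-42', not the directory itself
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        raise ValueError('No checkpoint found in {}'.format(checkpoint_dir))
    tf.train.start_queue_runners(sess=sess)
    for i in range(100):
        print(sess.run(top_k_op))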