I tried to test my trained model.
#!/usr/bin/env python
# Copyright (c) 2016 Artsiom Sanakoyeu
from __future__ import division
from chainer import iterators
import cmd_options
import dataset
import os
import time
import regressionnet
import tensorflow as tf
import copy
from tqdm import tqdm
import numpy as np
import math
import pprint
import datetime
from regressionnet import evaluate_pcp, create_sumamry
def evaluate(net, pose_loss_op, test_iterator, summary_writer, tag='test/pose_loss'):
test_it = copy.copy(test_iterator)
total_loss = 0.0
cnt = 0
num_batches = int(math.ceil(len(test_it.dataset) / test_it.batch_size))
print len(test_it.dataset)
for batch in tqdm(test_it, total=num_batches):
feed_dict = regressionnet.fill_joint_feed_dict(net,
regressionnet.batch2feeds(batch)[:3],
conv_lr=0.0,
fc_lr=0.0,
phase='test')
global_step, loss_value = net.sess.run([net.global_iter_counter, pose_loss_op],
feed_dict=feed_dict)
total_loss += loss_value * len(batch)
cnt += len(batch)
avg_loss = total_loss / len(test_it.dataset)
print 'Step {} {} = {:.3f}'.format(global_step, tag, avg_loss)
summary_writer.add_summary(create_sumamry(tag, avg_loss),
global_step=global_step)
assert cnt == 1000, 'cnt = {}'.format(cnt)
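# the hard-coded 1000 above assumes the test split contains exactly 1000 samples (e.g. the LSP test set)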
def train_loop(net, saver, loss_op, pose_loss_op, train_op, dataset_name, train_iterator, test_iterator,
val_iterator=None,
max_iter=None,
test_step=None,
snapshot_step=None,
log_step=1,
batch_size=None,
conv_lr=None,
fc_lr=None,
fix_conv_iter=None,
output_dir='results',
):
summary_step = 50
with net.graph.as_default():
summary_writer = tf.summary.FileWriter(output_dir, net.sess.graph)
summary_op = tf.summary.merge_all()
fc_train_op = net.graph.get_operation_by_name('fc_train_op')
global_step = None
for step in xrange(max_iter + 1):
# test, snapshot
if step % test_step == 0 or step + 1 == max_iter or step == fix_conv_iter:
global_step = net.sess.run(net.global_iter_counter)
evaluate_pcp(net, pose_loss_op, test_iterator, summary_writer,
dataset_name=dataset_name,
tag_prefix='test')
if val_iterator is not None:
evaluate_pcp(net, pose_loss_op, val_iterator, summary_writer,
dataset_name=dataset_name,
tag_prefix='val')
if step % snapshot_step == 0 and step > 1:
checkpoint_prefix = os.path.join(output_dir, 'checkpoint')
assert global_step is not None
saver.save(net.sess, checkpoint_prefix, global_step=global_step)
if step == max_iter:
break
# training
start_time = time.time()
feed_dict = regressionnet.fill_joint_feed_dict(net,
regressionnet.batch2feeds(train_iterator.next())[:3],
conv_lr=conv_lr,
fc_lr=fc_lr,
phase='train')
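# for the first fix_conv_iter steps the conv layers are frozen: their learning rate is zeroed and only fc_train_op is run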
if step < fix_conv_iter:
feed_dict['lr/conv_lr:0'] = 0.0
if step < fix_conv_iter:
cur_train_op = fc_train_op
else:
cur_train_op = train_op
if step % summary_step == 0:
global_step, summary_str, _, loss_value = net.sess.run(
[net.global_iter_counter,
summary_op,
cur_train_op,
pose_loss_op],
feed_dict=feed_dict)
summary_writer.add_summary(summary_str, global_step=global_step)
else:
global_step, _, loss_value = net.sess.run(
[net.global_iter_counter, cur_train_op, pose_loss_op],
feed_dict=feed_dict)
duration = time.time() - start_time
if step % log_step == 0 or step + 1 == max_iter:
print('Step %d: train/pose_loss = %.2f (%.3f s, %.2f im/s)'
% (global_step, loss_value, duration,
batch_size // duration))
def main(argv):
"""
Run training of the Deeppose stg-1
"""
args = cmd_options.get_arguments(argv)
if not os.path.exists(args.o_dir):
os.makedirs(args.o_dir)
suffix = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
with open(os.path.join(args.o_dir, 'params.dump_{}.txt'.format(suffix)), 'w') as f:
f.write('{}\n'.format(pprint.pformat(args)))
net, loss_op, pose_loss_op, train_op = regressionnet.create_regression_net(
n_joints=args.n_joints,
init_snapshot_path=args.snapshot,
is_resume=args.resume,
reset_iter_counter=args.reset_iter_counter,
reset_moving_averages=args.reset_moving_averages,
optimizer_type=args.optimizer,
gpu_memory_fraction=0.32, # Set how much GPU memory to reserve for the network
net_type=args.net_type)
with net.graph.as_default():
saver = tf.train.Saver()
print 'args.resume: {}\nargs.snapshot: {}'.format(args.resume, args.snapshot)
bbox_extension_range = (args.bbox_extension_min, args.bbox_extension_max)
if bbox_extension_range[0] is None or bbox_extension_range[1] is None:
bbox_extension_range = None
test_bbox_extension_range = None
else:
test_bbox_extension_range = (bbox_extension_range[1], bbox_extension_range[1])
train_dataset = dataset.PoseDataset(
args.train_csv_fn, args.img_path_prefix, args.im_size,
fliplr=args.fliplr,
rotate=args.rotate,
rotate_range=args.rotate_range,
shift=args.shift,
bbox_extension_range=bbox_extension_range,
min_dim=args.min_dim,
coord_normalize=args.coord_normalize,
gcn=args.gcn,
fname_index=args.fname_index,
joint_index=args.joint_index,
symmetric_joints=args.symmetric_joints,
ignore_label=args.ignore_label,
should_downscale_images=args.should_downscale_images,
downscale_height=args.downscale_height
)
test_dataset = dataset.PoseDataset(
args.test_csv_fn, args.img_path_prefix, args.im_size,
fliplr=False, rotate=False,
shift=None,
bbox_extension_range=test_bbox_extension_range,
coord_normalize=args.coord_normalize,
gcn=args.gcn,
fname_index=args.fname_index,
joint_index=args.joint_index,
symmetric_joints=args.symmetric_joints,
ignore_label=args.ignore_label,
should_return_bbox=True,
should_downscale_images=args.should_downscale_images,
downscale_height=args.downscale_height
)
np.random.seed(args.seed)
train_iterator = iterators.MultiprocessIterator(train_dataset, args.batch_size,
n_processes=args.workers, n_prefetch=3)
test_iterator = iterators.MultiprocessIterator(
test_dataset, args.batch_size,
repeat=False, shuffle=False,
n_processes=1, n_prefetch=1)
val_iterator = None
if args.val_csv_fn is not None and args.val_csv_fn != '':
small_train_dataset = dataset.PoseDataset(
args.val_csv_fn,
args.img_path_prefix, args.im_size,
fliplr=False, rotate=False,
shift=None,
bbox_extension_range=test_bbox_extension_range,
coord_normalize=args.coord_normalize,
gcn=args.gcn,
fname_index=args.fname_index,
joint_index=args.joint_index,
symmetric_joints=args.symmetric_joints,
ignore_label=args.ignore_label,
should_return_bbox=True,
should_downscale_images=args.should_downscale_images,
downscale_height=args.downscale_height
)
val_iterator = iterators.MultiprocessIterator(
small_train_dataset, args.batch_size,
repeat=False, shuffle=False,
n_processes=1, n_prefetch=1)
train_loop(net, saver, loss_op, pose_loss_op, train_op, args.dataset_name,
train_iterator, test_iterator,
val_iterator=val_iterator,
max_iter=args.max_iter,
test_step=args.test_step,
log_step=args.log_step,
snapshot_step=args.snapshot_step,
batch_size=args.batch_size,
conv_lr=args.conv_lr,
fc_lr=args.fc_lr,
fix_conv_iter=args.fix_conv_iter,
output_dir=args.o_dir
)
if __name__ == '__main__':
import sys
main(sys.argv[1:])
This is the code I used.
I trained it for about 370,000 iterations and then tried to test the trained model.
But it shows these messages:
Data loss: not an sstable (bad magic number): perhaps your file is in a different file format and you need to use a different restore operator?
Traceback (most recent call last):
DataLossError (see above for traceback): Unable to open table file out/lsp_alexnet_imagenet_small/checkpoint-370000.data-00000-of-00001: Data loss: not an sstable (bad magic number): perhaps your file is in a different file format and you need to use a different restore operator?
How can I resolve this problem?
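This message typically appears when the restore path points at the .data-00000-of-00001 shard rather than the checkpoint prefix (out/lsp_alexnet_imagenet_small/checkpoint-370000): tf.train.Saver.restore expects the prefix and locates the .index/.data files itself. A minimal diagnostic sketch, assuming the output directory used above:
import tensorflow as tf
# latest_checkpoint() returns the checkpoint *prefix*, e.g.
# out/lsp_alexnet_imagenet_small/checkpoint-370000 -- this is what
# Saver.restore expects, not the .data-00000-of-00001 shard.
ckpt_prefix = tf.train.latest_checkpoint('out/lsp_alexnet_imagenet_small')
print('checkpoint prefix:', ckpt_prefix)
if ckpt_prefix is not None:
    # sanity check: list the variables saved under that prefix
    reader = tf.train.NewCheckpointReader(ckpt_prefix)
    for name, shape in sorted(reader.get_variable_to_shape_map().items()):
        print(name, shape)
# With the same graph built as during training, restoring would then be
# saver.restore(net.sess, ckpt_prefix) rather than passing the shard filename.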
Related
I got an error when running the following code for multi-label text classification. The code is from GitHub; the link is https://github.com/hellonlp/classifier_multi_label_seq2seq_attention.
The error information was posted as a screenshot (not reproduced here), and the code that produces it is as follows:
# -*- coding: utf-8 -*-
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = '-1'
import numpy as np
import tensorflow as tf
from classifier_multi_label_seq2seq_attention.networks import NetworkAlbertSeq2Seq
from classifier_multi_label_seq2seq_attention.classifier_utils import get_features
from classifier_multi_label_seq2seq_attention.hyperparameters import Hyperparamters as hp
from classifier_multi_label_seq2seq_attention.utils import select, shuffle_one, time_now_string
from classifier_multi_label_seq2seq_attention.load import normalization_label
pwd = os.path.dirname(os.path.abspath(__file__))
MODEL = NetworkAlbertSeq2Seq(is_training=True)
# Get data features
input_ids, input_masks, segment_ids, label_ids = get_features()
num_train_samples = len(input_ids)
arr = np.arange(num_train_samples)
num_batchs = int((num_train_samples - 1) / hp.batch_size) + 1
print('Number of batch:', num_batchs)
# Set up the graph
tf.reset_default_graph()
saver = tf.train.Saver(max_to_keep=hp.max_to_keep,defer_build=True)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
# Load model saved before
MODEL_SAVE_PATH = '/home/classifier_multi_label_seq2seq_attention/model/CML_Seq2Seq'
ckpt = tf.train.get_checkpoint_state(MODEL_SAVE_PATH)
if ckpt and ckpt.model_checkpoint_path:
saver.restore(sess, ckpt.model_checkpoint_path)
print('Restored model!')
with sess.as_default():
# Tensorboard writer
writer = tf.summary.FileWriter(hp.logdir, sess.graph)
for i in range(hp.num_train_epochs):
indexs = shuffle_one(arr)
for j in range(num_batchs - 1):
i1 = indexs[j * hp.batch_size:min((j + 1) * hp.batch_size, num_train_samples)]
# Get features
input_id_ = select(input_ids, i1)
input_mask_ = select(input_masks, i1)
segment_id_ = select(segment_ids, i1)
label_id_ = normalization_label(select(label_ids, i1))
# Feed dict
fd = {MODEL.input_ids: input_id_,
MODEL.input_masks: input_mask_,
MODEL.segment_ids: segment_id_,
MODEL.label_ids: label_id_}
# Optimizer
sess.run(MODEL.optimizer, feed_dict=fd)
# Tensorboard
if j % hp.summary_step == 0:
summary, glolal_step = sess.run([MODEL.merged, MODEL.global_step], feed_dict=fd)
writer.add_summary(summary, glolal_step)
# Save Model
if j % (num_batchs // hp.num_saved_per_epoch) == 0:
if not os.path.exists(os.path.join(pwd, hp.file_save_model)):
os.makedirs(os.path.join(pwd, hp.file_save_model))
saver.save(sess, os.path.join(pwd, hp.file_save_model, 'model' + '_%s_%s.ckpt' % (str(i), str(j))))
# Log
if j % hp.print_step == 0:
fd = {MODEL.input_ids: input_id_,
MODEL.input_masks: input_mask_,
MODEL.segment_ids: segment_id_,
MODEL.label_ids: label_id_}
loss = sess.run(MODEL.loss, feed_dict=fd)
print('Time:%s, Epoch:%s, Batch number:%s/%s, Loss:%s' % (
time_now_string(), str(i), str(j), str(num_batchs), str(loss)))
print('Train finished')
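Without the screenshot it is hard to say which call fails, but since the script only restores when get_checkpoint_state finds something, a minimal sketch for checking what the hard-coded model directory actually resolves to may help (the path below is the one from the script and may need adjusting):
import os
import tensorflow as tf
MODEL_SAVE_PATH = '/home/classifier_multi_label_seq2seq_attention/model/CML_Seq2Seq'
print('directory exists:', os.path.isdir(MODEL_SAVE_PATH))
ckpt = tf.train.get_checkpoint_state(MODEL_SAVE_PATH)
print('checkpoint state:', ckpt)
if ckpt and ckpt.model_checkpoint_path:
    # restore expects this prefix (no .index / .data suffix)
    print('model_checkpoint_path:', ckpt.model_checkpoint_path)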
I'm trying to follow this tutorial on transfer learning. I used my own dataset, and I'm trying to use MobileNet instead of Inception. The problem is that the MobileNet models come with 3 checkpoint files:
mobilenet_v1_0.5_128.ckpt.data-00000-of-00001
mobilenet_v1_0.5_128.ckpt.index
mobilenet_v1_0.5_128.ckpt.meta
When I use one of them, I get this error:
NotFoundError (see above for traceback): Unsuccessful TensorSliceReader constructor: Failed to find any matching files for C://Users//hp//PycharmProjects//tfSlim/mobilenet_v1_0.5_128//mobilenet_v1_0.5_128.ckpt.meta
[[Node: save/RestoreV2_139 = RestoreV2[dtypes=[DT_INT32], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_save/Const_0_0, save/RestoreV2_139/tensor_names, save/RestoreV2_139/shape_and_slices)]]
import tensorflow as tf
from tensorflow.contrib.framework.python.ops.variables import get_or_create_global_step
from tensorflow.python.platform import tf_logging as logging
#from inception_resnet_v2 import inception_resnet_v2, inception_resnet_v2_arg_scope
from models.research.slim.nets.mobilenet_v1 import mobilenet_v1, mobilenet_v1_arg_scope
import os
import time
import h5py
import numpy as np
slim = tf.contrib.slim
# ================ DATASET INFORMATION ======================
# State dataset directory where the tfrecord files are located
dataset_dir = 'C://Nassima//lymphoma//subs3'
# State where your log file is at. If it doesn't exist, create it.
log_dir = './log'
# State where your checkpoint file is
checkpoint_file = 'C://Users//hp//PycharmProjects//tfSlim/mobilenet_v1_0.5_128//mobilenet_v1_0.5_128.ckpt.meta'
# State the image size you're resizing your images to. We will use the default inception size of 299.
#image_size = 299
#image_size = 128
# State the number of classes to predict:
num_classes = 3
# State the labels file and read it
labels_file = 'C://Nassima//lymphoma//subs3//labels.txt'
labels = open(labels_file, 'r')
# Create a dictionary to refer each label to their string name
labels_to_name = {}
for line in labels:
label, string_name = line.split(':')
string_name = string_name[:-1] # Remove newline
labels_to_name[int(label)] = string_name
print(labels_to_name)
# Create the file pattern of your TFRecord files so that it could be recognized later on
"""
file_pattern = 'flowers_%s_*.tfrecord'
"""
# Create a dictionary that will help people understand your dataset better. This is required by the Dataset class later.
items_to_descriptions = {
'image': 'A 3-channel RGB coloured lymphoma image that is either CLL, FL, MCL.',
'label': 'A label that is as such -- 0:CLL, 1:FL, 2:MCL'
}
# ================= TRAINING INFORMATION ==================
# State the number of epochs to train
num_epochs = 1
# State your batch size
#batch_size = 8
file_mean = "C://Nassima//lymphoma//subs3//train//mean.hdf5"
TRAINING_SET_SIZE = 41860
BATCH_SIZE = 128
IMAGE_SIZE = 144
IMAGE_RESIZE = 128
# Learning rate information and configuration (Up to you to experiment)
initial_learning_rate = 0.0002
learning_rate_decay_factor = 0.7
num_epochs_before_decay = 2
def _int64_feature(value):
return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
def _bytes_feature(value):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
class _image_object: # image object from protobuf
def __init__(self):
self.image = tf.Variable([], dtype=tf.string)
self.height = tf.Variable([], dtype=tf.int64)
self.width = tf.Variable([], dtype=tf.int64)
self.filename = tf.Variable([], dtype=tf.string)
self.label = tf.Variable([], dtype=tf.int32)
def read_and_decode(filename_queue, mean):
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(serialized_example, features = {
"image/encoded": tf.FixedLenFeature([], tf.string),
"image/height": tf.FixedLenFeature([], tf.int64),
"image/width": tf.FixedLenFeature([], tf.int64),
"image/filename": tf.FixedLenFeature([], tf.string),
"image/class/label": tf.FixedLenFeature([], tf.int64),})
image_encoded = features["image/encoded"]
image_raw = tf.decode_raw(image_encoded, tf.float32)
image_object = _image_object()
#image_object.image = tf.image.resize_image_with_crop_or_pad(image_raw, IMAGE_SIZE, IMAGE_SIZE)
image_r = tf.reshape(image_raw, [IMAGE_SIZE, IMAGE_SIZE, 3])
#added
image_r = image_r - mean
image_r = tf.random_crop(image_r ,[IMAGE_RESIZE ,IMAGE_RESIZE ,3], seed = 0, name = None)
image_object.image = image_r
image_object.height = features["image/height"]
image_object.width = features["image/width"]
image_object.filename = features["image/filename"]
image_object.label = tf.cast(features["image/class/label"], tf.int64)
return image_object
def flower_input(mean, if_random = True, if_training = True):
if(if_training):
filenames = [os.path.join(dataset_dir, "lymphoma_train_0000%d-of-00005.tfrecord" % i) for i in range(0, 5)]
else:
filenames = [os.path.join(dataset_dir, "lymphoma_validation_0000%d-of-00005.tfrecord" % i) for i in range(0, 5)]
for f in filenames:
if not tf.gfile.Exists(f):
raise ValueError("Failed to find file: " + f)
filename_queue = tf.train.string_input_producer(filenames)
image_object = read_and_decode(filename_queue, mean)
image = tf.image.per_image_standardization(image_object.image)
# image = image_object.image
# image = tf.image.adjust_gamma(tf.cast(image_object.image, tf.float32), gamma=1, gain=1) # Scale image to (0, 1)
filename = image_object.filename
label = image_object.label
if(if_random):
min_fraction_of_examples_in_queue = 0.4
min_queue_examples = int(TRAINING_SET_SIZE * min_fraction_of_examples_in_queue)
print("Filling queue with %d images before starting to train. " "This will take a few minutes." % min_queue_examples)
num_preprocess_threads = 1
image_batch, label_batch, filename_batch = tf.train.shuffle_batch(
[image, label, filename],
batch_size=BATCH_SIZE,
num_threads=num_preprocess_threads,
capacity=min_queue_examples + 3 * BATCH_SIZE,
min_after_dequeue=min_queue_examples)
return image_batch, label_batch, filename_batch
else:
image_batch, label_batch, filename_batch = tf.train.batch(
[image, label, filename],
batch_size=BATCH_SIZE,
num_threads=1)
return image_batch, label_batch, filename_batch
"""
# ============== DATASET LOADING ======================
"""
def run():
# Create the log directory here. Must be done here otherwise import will activate this unneededly.
if not os.path.exists(log_dir):
os.mkdir(log_dir)
# ======================= TRAINING PROCESS =========================
# Now we start to construct the graph and build our model
with tf.Graph().as_default() as graph:
tf.logging.set_verbosity(tf.logging.INFO) # Set the verbosity to INFO level
# ajouter le mean de l'image
hdf5_file = h5py.File(file_mean, "r")
# subtract the training mean
mm = hdf5_file["train_mean"][0, ...]
mm = mm[np.newaxis, ...]
# Total number of samples
mean = tf.convert_to_tensor(mm, np.float32)
# First create the dataset and load one batch
images, labels, _ = flower_input(mean, if_random=True, if_training=True)
# Know the number steps to take before decaying the learning rate and batches per epoch
num_batches_per_epoch = int(TRAINING_SET_SIZE / BATCH_SIZE)
num_steps_per_epoch = num_batches_per_epoch # Because one step is one batch processed
decay_steps = int(num_epochs_before_decay * num_steps_per_epoch)
# Create the model inference
with slim.arg_scope(mobilenet_v1_arg_scope()):
logits, end_points = mobilenet_v1(images, num_classes= num_classes, is_training=True)
# Define the scopes that you want to exclude for restoration
#exclude = ['InceptionResnetV2/Logits', 'InceptionResnetV2/AuxLogits']
exclude = ['MobilenetV1/Logits', 'MobilenetV1/AuxLogits']
#exclude = ["MobilenetV1/Logits/Conv2d_1c_1x1"]
#exclude = []
variables_to_restore = slim.get_variables_to_restore(exclude=exclude)
# Perform one-hot-encoding of the labels (Try one-hot-encoding within the load_batch function!)
one_hot_labels = slim.one_hot_encoding(labels, num_classes)
# Performs the equivalent to tf.nn.sparse_softmax_cross_entropy_with_logits but enhanced with checks
loss = tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels, logits=logits)
total_loss = tf.losses.get_total_loss() # obtain the regularization losses as well
# Create the global step for monitoring the learning_rate and training.
global_step = get_or_create_global_step()
# Define your exponentially decaying learning rate
lr = tf.train.exponential_decay(
learning_rate=initial_learning_rate,
global_step=global_step,
decay_steps=decay_steps,
decay_rate=learning_rate_decay_factor,
staircase=True)
# Now we can define the optimizer that takes on the learning rate
optimizer = tf.train.AdamOptimizer(learning_rate=lr)
# Create the train_op.
train_op = slim.learning.create_train_op(total_loss, optimizer)
# State the metrics that you want to predict. We get a predictions that is not one_hot_encoded.
predictions = tf.argmax(end_points['Predictions'], 1)
probabilities = end_points['Predictions']
accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(predictions, labels)
metrics_op = tf.group(accuracy_update, probabilities)
# Now finally create all the summaries you need to monitor and group them into one summary op.
tf.summary.scalar('losses/Total_Loss', total_loss)
tf.summary.scalar('accuracy', accuracy)
tf.summary.scalar('learning_rate', lr)
my_summary_op = tf.summary.merge_all()
# Now we need to create a training step function that runs both the train_op, metrics_op and updates the global_step concurrently.
def train_step(sess, train_op, global_step):
'''
Simply runs a session for the three arguments provided and gives a logging on the time elapsed for each global step
'''
# Check the time for each sess run
start_time = time.time()
total_loss, global_step_count, _ = sess.run([train_op, global_step, metrics_op])
time_elapsed = time.time() - start_time
# Run the logging to print some results
logging.info('global step %s: loss: %.4f (%.2f sec/step)', global_step_count, total_loss, time_elapsed)
return total_loss, global_step_count
# Now we create a saver function that actually restores the variables from a checkpoint file in a sess
saver = tf.train.Saver(variables_to_restore)
saver = tf.train.import_meta_graph(checkpoint_file)
#added
def restore_fn(sess):
return saver.restore(sess, 'C://Users//hp//PycharmProjects//tfSlim/mobilenet_v1_0.5_128//mobilenet_v1_0.5_128.ckpt')
# Define your supervisor for running a managed session. Do not run the summary_op automatically or else it will consume too much memory
sv = tf.train.Supervisor(logdir=log_dir, summary_op=None, init_fn=restore_fn)
# Run the managed session
with sv.managed_session() as sess:
for step in range(num_steps_per_epoch * num_epochs):
# At the start of every epoch, show the vital information:
if step % num_batches_per_epoch == 0:
logging.info('Epoch %s/%s', step / num_batches_per_epoch + 1, num_epochs)
learning_rate_value, accuracy_value = sess.run([lr, accuracy])
logging.info('Current Learning Rate: %s', learning_rate_value)
logging.info('Current Streaming Accuracy: %s', accuracy_value)
# optionally, print your logits and predictions for a sanity check that things are going fine.
logits_value, probabilities_value, predictions_value, labels_value = sess.run(
[logits, probabilities, predictions, labels])
print('logits: \n', logits_value)
print('Probabilities: \n', probabilities_value)
print('predictions: \n', predictions_value)
print('Labels:\n:', labels_value)
# Log the summaries every 10 step.
if step % 10 == 0:
loss, _ = train_step(sess, train_op, sv.global_step)
summaries = sess.run(my_summary_op)
sv.summary_computed(sess, summaries)
# If not, simply run the training step
else:
loss, _ = train_step(sess, train_op, sv.global_step)
# We log the final training loss and accuracy
logging.info('Final Loss: %s', loss)
logging.info('Final Accuracy: %s', sess.run(accuracy))
# Once all the training has been done, save the log files and checkpoint model
logging.info('Finished training! Saving model to disk now.')
# saver.save(sess, "./flowers_model.ckpt")
#sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
if __name__ == '__main__':
run()
and the error is
File "C:/Users/hp/PycharmProjects/tfSlim/lympho_mobileNet/train_lymphoma2.py", line 272, in <module>
run()
File "C:/Users/hp/PycharmProjects/tfSlim/lympho_mobileNet/train_lymphoma2.py", line 230, in run
sv = tf.train.Supervisor(logdir=log_dir, summary_op=None, init_fn=restore_fn)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\supervisor.py", line 300, in __init__
self._init_saver(saver=saver)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\supervisor.py", line 448, in _init_saver
saver = saver_mod.Saver()
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 1218, in __init__
self.build()
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 1227, in build
self._build(self._filename, build_save=True, build_restore=True)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 1263, in _build
build_save=build_save, build_restore=build_restore)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 729, in _build_internal
saveables = self._ValidateAndSliceInputs(names_to_saveables)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 582, in _ValidateAndSliceInputs
names_to_saveables = BaseSaverBuilder.OpListToDict(names_to_saveables)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 554, in OpListToDict
name)
ValueError: At least two variables have the same name: MobilenetV1/Conv2d_7_depthwise/BatchNorm/gamma
I think it is because of the excluded layers or because of the instruction
tf.train.import_meta_graph(checkpoint_file)
You're loading the meta file, while you should be providing just the path to mobilenet_v1_0.5_128.ckpt
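A minimal sketch of that change, keeping the slim setup from the question and dropping tf.train.import_meta_graph (which imports a second copy of every variable into the graph and matches the duplicate-name ValueError above); the depth_multiplier value is an assumption made here so the variables match the 0.5_128 checkpoint:
import tensorflow as tf
from models.research.slim.nets.mobilenet_v1 import mobilenet_v1, mobilenet_v1_arg_scope

slim = tf.contrib.slim

# Checkpoint *prefix* -- no .meta / .index / .data suffix.
checkpoint_file = 'C://Users//hp//PycharmProjects//tfSlim/mobilenet_v1_0.5_128//mobilenet_v1_0.5_128.ckpt'

with tf.Graph().as_default():
    images = tf.placeholder(tf.float32, [None, 128, 128, 3])
    with slim.arg_scope(mobilenet_v1_arg_scope()):
        # depth_multiplier=0.5 assumed to match the mobilenet_v1_0.5_128 weights
        logits, end_points = mobilenet_v1(images, num_classes=3, is_training=True, depth_multiplier=0.5)

    # Restore everything except the classification head, as in the question.
    exclude = ['MobilenetV1/Logits', 'MobilenetV1/AuxLogits']
    variables_to_restore = slim.get_variables_to_restore(exclude=exclude)

    # A single saver over the filtered variables; no tf.train.import_meta_graph.
    saver = tf.train.Saver(variables_to_restore)

    def restore_fn(sess):
        return saver.restore(sess, checkpoint_file)

    sv = tf.train.Supervisor(logdir='./log', summary_op=None, init_fn=restore_fn)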
I wrote some simple code to do something like word2vec, but during training I see the cross-entropy loss increase at the beginning of each epoch and then decrease. Please help me find out if there is any mistake in my code; I have already reviewed it many times...
from tensorflow.python import debug as tf_debug
import math
import os
import time
import random
import numpy as np
import tensorflow as tf
def gen_next_block(filenames,epochs):
for epoch in range(epochs):
for filename in filenames:
with open(filename) as f:
start = time.time()
line_cnt = 0
data = []
for line in f:
record = line.strip().split(',')
record = [int(record[0]), int(record[1]), float(record[2])] + [int(item) for item in record[3].split(';')]
record = record[:3] + [record[3 + epoch],]
data.append(record)
line_cnt += 1
if line_cnt % 4096000 == 0:
end = time.time()
elapsed_time = (end - start) * 1000
print("load block data: epoch %d, filename %s line_cnt %d, size %d, elapsed time %f ms" % (epoch, filename, line_cnt, len(data), elapsed_time))
random.shuffle(data)
yield data
data = []
start = time.time()
if len(data) > 0:
end = time.time()
elapsed_time = (end - start) * 1000
print("load block data: epoch %d, filename %s line_cnt %d, size %d, elapsed time %f ms" % (epoch, filename, line_cnt, len(data), elapsed_time))
random.shuffle(data)
yield data
data = None
next_block_generator = None
data_index = 0
last_time_data_index = 0
def generate_batch(filenames, epochs, batch_size):
global data
global data_index
global last_time_data_index
global next_block_generator
if next_block_generator is None:
next_block_generator = gen_next_block(filenames,epochs)
if data_index <= last_time_data_index:
data = next(next_block_generator,None)
data_index = 0
last_time_data_index = 0
if data is not None:
last_time_data_index = data_index
batch = np.ndarray(shape=(batch_size), dtype=np.int32)
labels = np.ndarray(shape=(batch_size), dtype=np.int32)
negative_labels = np.ndarray(shape=(batch_size), dtype=np.int32)
weights = np.ndarray(shape=(batch_size), dtype=np.float32)
negative_weights = np.ones(shape=(batch_size), dtype=np.float32)
for i in range(batch_size):
batch[i] = data[data_index][0]
labels[i] = data[data_index][1]
weights[i] = data[data_index][2]
negative_labels[i] = data[data_index][3]
data_index = (data_index + 1) % len(data)
return batch, labels, negative_labels, weights, negative_weights
else:
raise Exception("finish load file list [%s] %d times" % (','.join(filenames),epochs))
filename = 'data/dr_xianyu_item2vec_train_with_meta_20170725_dir/dr_xianyu_item2vec_train_with_meta_20170725_dir_'
filenames = [filename + str(i) for i in range(10)]
epochs = 5
batch_size = 2048
embedding_size = 32 # Dimension of the embedding vector.
num_sampled = batch_size # Number of negative examples to sample.
vocabulary_size = 7483025 + 1
graph = tf.Graph()
with graph.as_default():
with tf.device('/cpu:0'):
with tf.name_scope('input_data'):
train_inputs = tf.placeholder(tf.int32, shape=[batch_size], name = 'context_placeholder')
positive_labels = tf.placeholder(tf.int32, shape=[batch_size], name = 'target_placeholder')
negative_labels = tf.placeholder(tf.int32, shape=[num_sampled], name = 'negative_target_placeholder')
positive_weights = tf.placeholder(tf.float32, shape=([batch_size]), name = 'target_weight')
negative_weights = tf.placeholder(tf.float32, shape=([num_sampled]), name = 'negative_target_weight')
with tf.name_scope('emb_layer'):
embeddings = tf.Variable(
tf.random_uniform([vocabulary_size, embedding_size], -0.5/embedding_size, 0.5/embedding_size), name = 'emb')
embed = tf.nn.embedding_lookup(embeddings, train_inputs)
with tf.name_scope("neg_layer"):
nce_weights = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -0.5/embedding_size, 0.5/embedding_size), name = 'nce_weight')
nce_biases = tf.Variable(tf.zeros([vocabulary_size]), name = 'nce_biase')
positive_embed = tf.nn.embedding_lookup(nce_weights,positive_labels)
positive_bias = tf.nn.embedding_lookup(nce_biases,positive_labels)
negative_embed = tf.nn.embedding_lookup(nce_weights,negative_labels)
negative_bias = tf.nn.embedding_lookup(nce_biases,negative_labels)
positive_logits = tf.reduce_sum(tf.multiply(embed,positive_embed),1) + positive_bias
negative_logits = tf.reduce_sum(tf.multiply(embed,negative_embed),1) + negative_bias
with tf.name_scope('loss_layer'):
positive_xent = tf.nn.sigmoid_cross_entropy_with_logits(labels = tf.ones_like(positive_logits), logits = positive_logits)
negative_xent = tf.nn.sigmoid_cross_entropy_with_logits(labels = tf.zeros_like(negative_logits), logits = negative_logits)
weighted_positive_logits = tf.multiply(positive_logits,positive_weights)
weighted_negative_logits = tf.multiply(negative_logits,negative_weights)
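# note: weighted_positive_logits / weighted_negative_logits are computed here but not used in the loss below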
loss = (tf.reduce_sum(positive_xent) + tf.reduce_sum(negative_xent)) /(batch_size*2)
with tf.name_scope('train'):
optimizer = tf.train.RMSPropOptimizer(0.001).minimize(loss)
# global_step = tf.Variable(0, trainable=False)
# starter_learning_rate = 0.1
# learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, 20000, 0.8, staircase=True)
# optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss,global_step=global_step)
init = tf.global_variables_initializer()
init_local = tf.local_variables_initializer()
tf.summary.scalar('loss_layer/loss', loss)
for var in tf.trainable_variables():
tf.summary.histogram(var.op.name, var)
summary_op = tf.summary.merge_all()
saver = tf.train.Saver()
with tf.Session(graph=graph) as sess:
# sess = tf_debug.LocalCLIDebugWrapperSession(sess)
# sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
for v in tf.global_variables():
print(v.name,v.device,v.shape)
for v in tf.local_variables():
print(v.name,v.device,v.shape)
# if os.path.exists('tmp/model.ckpt.meta'):
# saver = tf.train.import_meta_graph('tmp/model.ckpt.meta')
# saver.restore(sess,tf.train.latest_checkpoint('tmp/'))
# print("model restored")
# else:
if True:
init.run()
init_local.run()
print("model init")
summary_writer = tf.summary.FileWriter('tmp/log', sess.graph)
average_loss = 0
start = time.time()
step = 1
try:
while True:
batch_inputs, batch_labels, batch_negative_labels, positive_weights_np, negative_weights_np = generate_batch(filenames, epochs,batch_size)
feed_dict = {train_inputs: batch_inputs, positive_labels: batch_labels, negative_labels: batch_negative_labels, positive_weights:positive_weights_np, negative_weights:negative_weights_np}
if step%1000 == 0:
loss_val,summary_str,_ = sess.run([loss, summary_op, optimizer], feed_dict=feed_dict)
summary_writer.add_summary(summary_str,step)
else:
loss_val,_ = sess.run([loss, optimizer], feed_dict=feed_dict)
average_loss += loss_val
if step % 1000 == 0:
average_loss /= 1000
end = time.time()
elapsed_time = (end - start)*1000 / 1000
print('Average loss at step ', step, ': ', average_loss, 'time cost', elapsed_time, 'ms')
average_loss = 0
start = time.time()
if step % 20000 == 0:
print('save model...')
save_path = saver.save(sess,'tmp/model.ckpt')
print("saved model in",save_path)
step +=1
except Exception,e:
print e
print("total batch count %d" % step)
summary_writer.flush()
Here is my loss (the plots are not reproduced here).
The first plot is from SGD over 5 epochs.
The second plot is from RMSProp over 2 epochs (still running).
I am learning TensorFlow by modifying some examples I've found. To start off, I have taken an RNN example to try against the "Spam" data set from UCI.
My code and the sample data set can be found in full here:
https://trinket.io/python/c7d6b95452
When I run the code I get a 100% error rate. I figure that even if this data set were not well suited to this particular model, I'd get at least something better than that, so I don't think the problem is my choice of sample data set.
Below is my Python code. If anyone can suggest how to modify this to get the model to work properly, I would appreciate it! I'd also appreciate any general TensorFlow advice.
# Example for my blog post at:
# https://danijar.com/introduction-to-recurrent-networks-in-tensorflow/
import functools
import os
import sets
import random
import numpy as np
import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell
def lazy_property(function):
attribute = '_' + function.__name__
@property
@functools.wraps(function)
def wrapper(self):
if not hasattr(self, attribute):
setattr(self, attribute, function(self))
return getattr(self, attribute)
return wrapper
class SequenceClassification:
def __init__(self, data, target, dropout, num_hidden=200, num_layers=3):
self.data = data
self.target = target
self.dropout = dropout
self._num_hidden = num_hidden
self._num_layers = num_layers
self.prediction
self.error
self.optimize
@lazy_property
def prediction(self):
# Recurrent network.
network = rnn_cell.GRUCell(self._num_hidden)
network = rnn_cell.DropoutWrapper(
network, output_keep_prob=self.dropout)
network = rnn_cell.MultiRNNCell([network] * self._num_layers)
output, _ = tf.nn.dynamic_rnn(network, self.data, dtype=tf.float32)
# Select last output.
output = tf.transpose(output, [1, 0, 2])
last = tf.gather(output, int(output.get_shape()[0]) - 1)
# Softmax layer.
weight, bias = self._weight_and_bias(
self._num_hidden, int(self.target.get_shape()[1]))
prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
return prediction
@lazy_property
def cost(self):
cross_entropy = -tf.reduce_sum(self.target *tf.log(self.prediction))
return cross_entropy
@lazy_property
def optimize(self):
learning_rate = 0.003
optimizer = tf.train.RMSPropOptimizer(learning_rate)
return optimizer.minimize(self.cost)
@lazy_property
def error(self):
mistakes = tf.not_equal(
tf.argmax(self.target, 1), tf.argmax(self.prediction, 1))
return tf.reduce_mean(tf.cast(mistakes, tf.float32))
@staticmethod
def _weight_and_bias(in_size, out_size):
weight = tf.truncated_normal([in_size, out_size], stddev=0.01)
bias = tf.constant(0.1, shape=[out_size])
return tf.Variable(weight), tf.Variable(bias)
def main():
sample_size=10
num_classes=2 #spam or ham
##
# import spam data
##
spam_data=[]
spam_data_train=[]
spam_data_test=[]
data_dir="."
data_file="spam.csv"
with open(os.path.join(data_dir, data_file), "r") as file_handle:
for row in file_handle:
spam_data.append(row)
spam_data=[line.rstrip().split(",") for line in spam_data if len(line) >=1]
random.shuffle(spam_data)
spam_data_train=spam_data[0:int(len(spam_data)*.8)]
spam_data_test=spam_data[int(len(spam_data)*.8):int(len(spam_data))]
def next_train_batch(batch_size):
a=random.sample(spam_data_train, batch_size)
return [np.array([line[:-1] for line in a]), np.array([line[len(line)-1] for line in a])]
def train_batch():
return [np.array([line[:-1] for line in spam_data_train]),np.array([line[len(line)-1] for line in spam_data_train])]
def next_test_batch(batch_size):
a=random.sample(spam_data_test, batch_size)
return [np.array([line[:-1] for line in a]), np.array([line[len(line)-1] for line in a])]
def test_batch():
return [np.array([line[:-1] for line in spam_data_test]),np.array([line[len(line)-1] for line in spam_data_test])]
t=train_batch();
train_input=t[0]
train_target=t[1]
test=test_batch()
test_input=t[0]
test_target=t[1]
training_data = tf.placeholder(tf.float32, [None, sample_size, len(train_input[0])], "training_data")
training_target = tf.placeholder(tf.float32, [None, sample_size], "training_target")
testing_data = tf.placeholder(tf.float32, [None, len(test_input), len(test_input[0])], "testing_data")
testing_target = tf.placeholder(tf.float32, [None, len(test_target)], "testing_target")
dropout = tf.placeholder(tf.float32)
training_model = SequenceClassification(training_data, training_target, dropout)
tf.get_variable_scope().reuse_variables()
testing_model = SequenceClassification(testing_data, testing_target, dropout)
sess = tf.Session()
init = tf.initialize_all_variables()
sess.run(init)
for epoch in range(sample_size):
for _ in range(100):
sample=random.sample(range(0,len(train_input)-1),sample_size)
batch_train = [train_input[i] for i in sample]
batch_target = [train_target[i] for i in sample]
sess.run(training_model.optimize, {
training_data: [batch_train], training_target: [batch_target] , dropout: 0.5})
error = sess.run(testing_model.error, {
testing_data: [test_input], testing_target: [test_target], dropout: 1.0})
print('Epoch {:2d} error {:3.1f}%'.format(epoch + 1, 100 * error))
if __name__ == '__main__':
main()
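One thing worth checking besides the network itself: the cost in SequenceClassification is -tf.reduce_sum(self.target * tf.log(self.prediction)), which assumes one-hot (probability) targets of shape [batch, num_classes], while the CSV rows above are read in as strings. A minimal sketch of turning rows into float features and one-hot labels, assuming the label is the last column of each row:
import numpy as np

def to_features_and_onehot(rows, num_classes=2):
    # float feature matrix from everything but the last column
    features = np.array([[float(v) for v in row[:-1]] for row in rows], dtype=np.float32)
    # map whatever raw label values occur (e.g. '0'/'1' or 'ham'/'spam') to indices 0..num_classes-1
    raw = [row[-1] for row in rows]
    index = {c: i for i, c in enumerate(sorted(set(raw)))}
    onehot = np.zeros((len(rows), num_classes), dtype=np.float32)
    for i, r in enumerate(raw):
        onehot[i, index[r]] = 1.0
    return features, onehot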
I have built and successfully trained a convolutional model; however, I cannot manage to restore the model and run my evaluation on it. The program throws a bunch of errors without giving an answer.
My code for the evaluation is here:
import tensorflow as tf
import main
import Process
import Input
eval_dir = "/Users/Zanhuang/Desktop/NNP/model.ckpt-98"
checkpoint_dir = "/Users/Zanhuang/Desktop/NNP/checkpoint"
def evaluate():
with tf.Graph().as_default() as g:
images, labels = Process.eval_inputs()
forward_propgation_results = Process.forward_propagation(images)
init_op = tf.initialize_all_variables()
saver = tf.train.Saver()
top_k_op = tf.nn.in_top_k(forward_propgation_results, labels, 1)
with tf.Session(graph = g) as sess:
sess.run(init_op)
tf.train.start_queue_runners(sess=sess)
saver.restore(sess, eval_dir)
for i in range(100):
print(sess.run(top_k_op))
def main(argv = None):
evaluate()
if __name__ == '__main__':
tf.app.run()
My output from the eval looks like this, but the program does not keep running. It just stays stuck there.
E tensorflow/core/client/tensor_c_api.cc:485] /Users/Zanhuang/Desktop/NNP/Prostate_Cancer_Data1.bin
[[Node: ReaderRead = ReaderRead[_class=["loc:@FixedLengthRecordReader", "loc:@input_producer"], _device="/job:localhost/replica:0/task:0/cpu:0"](FixedLengthRecordReader, input_producer)]]
ERROR:tensorflow:Exception in QueueRunner: /Users/Zanhuang/Desktop/NNP/Prostate_Cancer_Data1.bin
[[Node: ReaderRead = ReaderRead[_class=["loc:@FixedLengthRecordReader", "loc:@input_producer"], _device="/job:localhost/replica:0/task:0/cpu:0"](FixedLengthRecordReader, input_producer)]]
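The error in that output is the reader failing on Prostate_Cancer_Data1.bin; the hang itself likely comes from the main thread waiting on a queue whose runner has already died. A minimal sketch of the same evaluation loop run under a tf.train.Coordinator, which should make the queue-runner exception surface in the main thread instead of leaving the session stuck (it does not fix the unreadable .bin path itself):
import tensorflow as tf
import Process

eval_dir = "/Users/Zanhuang/Desktop/NNP/model.ckpt-98"

def evaluate():
    with tf.Graph().as_default():
        images, labels = Process.eval_inputs()
        forward_propagation_results = Process.forward_propagation(images)
        top_k_op = tf.nn.in_top_k(forward_propagation_results, labels, 1)
        saver = tf.train.Saver()

        with tf.Session() as sess:
            sess.run(tf.initialize_all_variables())
            saver.restore(sess, eval_dir)

            # With a coordinator, an exception in a queue-runner thread is
            # re-raised by coord.join() instead of stalling sess.run() forever.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            try:
                for _ in range(100):
                    print(sess.run(top_k_op))
            finally:
                coord.request_stop()
                coord.join(threads)

if __name__ == '__main__':
    evaluate()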
The following is the main part of my program where I save the model and the checkpoint files.
import Input
import Process
import time
import numpy as np
import tensorflow as tf
from datetime import datetime
FLAGS = tf.app.flags.FLAGS
def train():
with tf.Session() as sess:
images, labels = Process.inputs()
forward_propgation_results = Process.forward_propagation(images)
train_loss, cost = Process.error(forward_propgation_results, labels)
image_summary_t = tf.image_summary(images.name, images, max_images = 2)
summary_op = tf.merge_all_summaries()
init = tf.initialize_all_variables()
saver = tf.train.Saver()
sess.run(init)
saver = tf.train.Saver(tf.all_variables())
tf.train.start_queue_runners(sess = sess)
train_dir = "/Users/Zanhuang/Desktop/NNP/model.ckpt"
summary_writer = tf.train.SummaryWriter(train_dir, sess.graph)
for step in range(100):
start_time = time.time()
print(sess.run([train_loss, cost]))
duration = time.time() - start_time
if step % 1 == 0:
num_examples_per_step = FLAGS.batch_size
examples_per_sec = num_examples_per_step / duration
sec_per_batch = float(duration)
format_str = ('%s: step %d, (%.1f examples/sec; %.3f ''sec/batch)')
print (format_str % (datetime.now(), step, examples_per_sec, sec_per_batch))
summary_str = sess.run(summary_op)
summary_writer.add_summary(summary_str, step)
if step % 2 == 0:
checkpoint_path = train_dir
saver.save(sess, checkpoint_path, global_step = step)
def main(argv = None):
train()
if __name__ == '__main__':
tf.app.run()