import custom_model as CM
import input_pipeline as IP
import tensorflow as tf
def custom_estimator(features, labels, mode):
    logits = CM.model_net(features=features, n_classes=5)
    prediction = tf.keras.layers.Activation('softmax')(logits)
    preds_dict = {'class': tf.argmax(input=prediction, axis=1),
                  'probabilities': prediction,
                  'logits': logits}
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=preds_dict)

    # Compute loss
    labels = tf.reshape(labels, (BATCH_SIZE, 5))
    loss = tf.keras.losses.categorical_crossentropy(y_true=labels,
                                                    y_pred=prediction)

    # Compute evaluation metrics
    accuracy = custom_accuracy(labels, prediction)
    metrics = {'accuracy': accuracy}
    tf.summary.scalar('accuracy', accuracy)
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss,
                                          eval_metric_ops=metrics)

    optimizer = tf.keras.optimizers.Adam()
    train_op = optimizer.minimize(loss)
    return tf.estimator.EstimatorSpec(mode, loss=loss,
                                      train_op=train_op)

# Build tf_estimator
classifier = tf.estimator.Estimator(model_fn=custom_estimator,
                                    model_dir=model_dir)

# Train the estimator
TRAIN_FILES, TRAIN_LABELS = IP.map_file_to_label(data_dir=data_dir)
TRAIN = classifier.train(
    input_fn=lambda: IP.imgs_input_fn(TRAIN_FILES, labels=TRAIN_LABELS,
                                      perform_shuffle=True, repeat_count=EPOCHS,
                                      batch_size=BATCH_SIZE),
    steps=int(len(TRAIN_LABELS) / BATCH_SIZE))
This is the error I am facing with TensorFlow 2.0 (the error image and the code are attached here). Please help.
If I pass var_list=None, the error is: "ValueError: Passed in object of type , not tf.Tensor"
First of all, I don't think this Estimator code example is TensorFlow 2.0 compliant. In any case, if you happen to be using a 1.x version, replace:
train_op = optimizer.minimize(loss)
with this:
train_op = optimizer.minimize(
    loss=loss, global_step=tf.train.get_global_step())
If you are indeed using TensorFlow 2.0, then replace it with:
train_op = optimizer.minimize(
    loss=loss, global_step=tf.compat.v1.train.get_global_step())
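Note that in TensorFlow 2.x the Keras optimizers no longer take a global_step argument, and their minimize() requires an explicit var_list (which is where the var_list error quoted above comes from). A minimal sketch of a train op that avoids this inside an Estimator model_fn, assuming you fall back to the tf.compat.v1 optimizer instead of tf.keras.optimizers.Adam:
# Sketch: swap the Keras optimizer for the v1 optimizer inside model_fn.
# Assumes the surrounding model_fn (loss, mode, EstimatorSpec) is unchanged.
optimizer = tf.compat.v1.train.AdamOptimizer()
train_op = optimizer.minimize(
    loss=loss,
    global_step=tf.compat.v1.train.get_global_step())
return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)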
I have a simple TensorFlow model that consists of LSTM layers, such as tf.contrib.rnn.LSTMBlockCell or tf.keras.layers.LSTM (I can provide the sample code as well, if needed). I want to run the model in an iOS app. However, I have looked at several websites that say that there is presently no way to convert and run a TensorFlow model containing LSTM layers in an iOS app.
I have tried these tools/libraries to convert the TensorFlow model to .mlmodel format (or .tflite format):
1. Swift for Tensorflow
2. Tensorflow Lite for iOS
3. tfcoreml
However, none of these tools seems to be able to convert the LSTM-layer model to .mlmodel format. They do allow custom layers to be added, but I don't know how I can add an LSTM custom layer.
Am I wrong in saying that there is no support for running a TensorFlow LSTM model in an iOS app? If so, please guide me on how I can go ahead and include the model in an iOS app. Is there any other tool/library that can be used to convert it to .mlmodel format? If not, are there any plans to include TensorFlow support for iOS in the future?
Model
import tensorflow as tf
from tensorflow.contrib import rnn
from tensorflow.contrib.rnn import *
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
#Summary parameters
logs_path = "logs/"
# Training Parameters
learning_rate = 0.001
training_steps = 1000
batch_size = 128
display_step = 200
# Network Parameters
num_input = 28 # MNIST data input (img shape: 28*28)
timesteps = 28 # timesteps
num_hidden = 128 # hidden layer num of features
num_classes = 10 # MNIST total classes (0-9 digits)
# tf Graph input
X = tf.placeholder("float", [None, timesteps, num_input])
Y = tf.placeholder("float", [None, num_classes])
# Define weights
weights = {
    'out': tf.Variable(tf.random_normal([num_hidden, num_classes]))
}
biases = {
    'out': tf.Variable(tf.random_normal([num_classes]))
}
def RNN(x, weights, biases):
    # Unstack to get a list of 'timesteps' tensors of shape (batch_size, n_input)
    x = tf.unstack(x, timesteps, 1)
    # Define a lstm cell with tensorflow
    lstm_cell = rnn.LSTMBlockCell(num_hidden, forget_bias=1.0)
    #lstm_cell = tf.keras.layers.LSTMCell(num_hidden, unit_forget_bias=True)
    # Get lstm cell output
    outputs, states = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)
    # Linear activation, using rnn inner loop last output
    return tf.matmul(outputs[-1], weights['out']) + biases['out']
logits = RNN(X, weights, biases)
with tf.name_scope('Model'):
    prediction = tf.nn.softmax(logits, name="prediction_layer")
with tf.name_scope('Loss'):
    # Define loss and optimizer
    loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y, name="loss_layer"), name="reduce_mean_loss")
with tf.name_scope('SGD'):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate, name="Gradient_Descent")
    train_op = optimizer.minimize(loss_op, name="minimize_layer")
with tf.name_scope('Accuracy'):
    # Evaluate model (with test logits, for dropout to be disabled)
    correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1), name="correct_pred_layer")
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name="reduce_mean_acc_layer")
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()
#Create a summary to monitor cost tensor
tf.summary.scalar("loss", loss_op)
#Create a summary to monitor accuracy tensor
tf.summary.scalar("accuracy", accuracy)
#Merge all summaries into a single op
merged_summary_op = tf.summary.merge_all()
saver = tf.train.Saver()
save_path = ""
model_save = "model.ckpt"
# Start training
with tf.Session() as sess:
    # Run the initializer
    sess.run(init)
    # op to write logs to Tensorboard
    summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())
    for step in range(1, training_steps+1):
        total_batch = int(mnist.train.num_examples/batch_size)
        batch_x, batch_y = mnist.train.next_batch(batch_size)
        # Reshape data to get 28 seq of 28 elements
        batch_x = batch_x.reshape((batch_size, timesteps, num_input))
        # Run optimization op (backprop)
        sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})
        if step % display_step == 0 or step == 1:
            # Calculate batch loss and accuracy
            loss, acc, summary = sess.run([loss_op, accuracy, merged_summary_op],
                                          feed_dict={X: batch_x, Y: batch_y})
            # Write logs at every iteration
            summary_writer.add_summary(summary, step * total_batch)
            print("Step " + str(step) + ", Minibatch Loss= " + \
                  "{:.4f}".format(loss) + ", Training Accuracy= " + \
                  "{:.3f}".format(acc))
    saver.save(sess, model_save)
    tf.train.write_graph(sess.graph_def, save_path, 'save_graph.pbtxt')
    #print(sess.graph.get_operations())
    print("Optimization Finished!")
    print("Run the command line:\n" \
          "--> tensorboard --logdir=logs/ " \
          "\nThen open http://0.0.0.0:6006/ into your web browser")
    # Calculate accuracy for 128 mnist test images
    test_len = 128
    test_data = mnist.test.images[:test_len].reshape((-1, timesteps, num_input))
    test_label = mnist.test.labels[:test_len]
    print("Testing Accuracy:", \
          sess.run(accuracy, feed_dict={X: test_data, Y: test_label}))
Code to generate the frozen model
import tensorflow as tf
import numpy as np
from tensorflow.python.tools import freeze_graph
save_path = ""
model_name = "test_model_tf_keras_layers_lstm"
input_graph_path = save_path + "save_graph.pbtxt"
checkpoint_path = save_path + "model.ckpt"
input_saver_def_path = ""
input_binary = False
output_node_names = "Model/prediction_layer" #output node's name. Should match to that mentioned in the code
restore_op_name = 'save/restore_all'
filename_tensor_name = 'save/const:0'
output_frozen_graph_name = save_path + 'frozen_model' + '.pb' # name of .pb file that one would like to give
clear_devices = True
freeze_graph.freeze_graph(input_graph_path, input_saver_def_path, input_binary, checkpoint_path, output_node_names, restore_op_name, filename_tensor_name, output_frozen_graph_name, clear_devices, "")
print("Model Freezed")
Conversion Code to generate .mlmodel format file
import tfcoreml
coreml_model = tfcoreml.convert(tf_model_path = 'frozen_model_test_model_tf_keras_layers_lstm.pb',
mlmodel_path = 'test_model.mlmodel',
output_feature_names = ['Model/prediction_layer:0'],
add_custom_layers = True)
coreml_model.save("test_model.mlmodel")
Error message shown with
lstm_cell = rnn.BasicLSTMCell(num_hidden, name = "lstm_cell")
Value Error: Split op case not handled. Input shape = [1, 512], output shape = [1, 128]
Error message shown with
lstm_cell = rnn.LSTMBlockCell(num_hidden, name = "lstm_cell")
InvalidArgumentError (see above for traceback): No OpKernel was registered to support Op 'LSTMBlockCell' used by node rnn/lstm_cell/LSTMBlockCell (defined at /anaconda2/lib/python2.7/site-packages/tfcoreml/_tf_coreml_converter.py:153) with these attrs: [forget_bias=1, use_peephole=false, cell_clip=-1, T=DT_FLOAT]
Registered devices: [CPU]
Registered kernels:
<no registered kernels>
[[node rnn/lstm_cell/LSTMBlockCell (defined at /anaconda2/lib/python2.7/site-packages/tfcoreml/_tf_coreml_converter.py:153) ]]
I expect that the frozen tensorflow model can be converted to .mlmodel format.
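For reference, newer TensorFlow releases also ship a Keras-model converter in tf.lite; whether it handles a given LSTM graph depends on the release, so the following is only a sketch of that route (the tiny model and file name are illustrative, not the model above), assuming TensorFlow 2.x:
# Sketch: a small tf.keras LSTM classifier converted with the TFLite converter.
# LSTM support in the converter varies between TF versions, so this may still
# require experimental converter flags on some releases.
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.LSTM(128, input_shape=(28, 28)),
    tf.keras.layers.Dense(10, activation='softmax'),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()
with open('model.tflite', 'wb') as f:
    f.write(tflite_model)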
I use tf.estimator.train_and_evaluate to train and evaluate my model. This is my code:
import tensorflow as tf
import numpy as np
from tensorflow.contrib.slim.nets import resnet_v2
import tensorflow.contrib.slim as slim
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data(path='mnist.npz')
x_train = np.expand_dims(x_train, 3).astype(np.float32)[:5000]
y_train = y_train.astype(np.int32)[:5000]
x_test = np.expand_dims(x_test, 3).astype(np.float32)[:1000]
y_test = y_test.astype(np.int32)[:1000]
tf.logging.set_verbosity(tf.logging.INFO)
cls_num = 10
def model_fn(features, labels, mode):
    is_training = False
    if mode == tf.estimator.ModeKeys.TRAIN:
        is_training = True

    with slim.arg_scope(resnet_v2.resnet_arg_scope()):
        logits, endpoints = resnet_v2.resnet_v2_50(features,
                                                   num_classes=cls_num,
                                                   is_training=is_training,
                                                   reuse=None)
    logits = tf.squeeze(logits, [1, 2])
    preds = tf.argmax(logits, 1)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    accuracy = tf.metrics.accuracy(labels=labels, predictions=preds)
    metrics = {'accuracy': accuracy}

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)

    optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

def process_fn(feature, label):
    feature = tf.expand_dims(feature, 3)
    return feature, label

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    #dataset.map(process_fn)
    dataset = dataset.repeat(1).batch(8)
    return dataset

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
    dataset = dataset.repeat(1).batch(8)
    return dataset

estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir='logs')
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn)
eval_specs = tf.estimator.EvalSpec(input_fn=eval_input_fn)

for _ in xrange(10):
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs)
The training step is fine and the loss becomes very small (about 0.001), but the evaluation result is wrong (the following is the evaluation log):
...
INFO:tensorflow:Saving dict for global step 625: accuracy = 0.5, global_step = 625, loss = 1330830600000.0
...
The task is very simple, just a binary classification. I do not think it is overfitting. Is there something wrong with my evaluation code?
I have a custom estimator and am trying to use some custom metrics during evaluation. However, whenever I add these metrics to evaluation via eval_metric_ops, the evaluation becomes really slow (much slower than training, which is actually calculating the same metrics). If I don't add the metrics there, then I can only see metrics in TensorBoard for training and not for evaluation.
What is the right way to add a custom metric to a custom estimator so that it is saved during evaluation?
This is what I have:
def compute_accuracy(preds, labels):
    total = tf.shape(labels.values)[0]
    preds = tf.sparse_to_dense(preds.indices, preds.dense_shape, preds.values, default_value=-1)
    labels = tf.sparse_to_dense(labels.indices, labels.dense_shape, labels.values, default_value=-2)
    r = tf.shape(labels)[0]
    c = tf.minimum(tf.shape(labels)[1], tf.shape(preds)[1])
    preds = tf.slice(preds, [0, 0], [r, c])
    labels = tf.slice(labels, [0, 0], [r, c])
    preds = tf.cast(preds, tf.int32)
    labels = tf.cast(labels, tf.int32)
    correct = tf.reduce_sum(tf.cast(tf.equal(preds, labels), tf.int32))
    accuracy = tf.divide(correct, total)
    return accuracy
In model_fn
edit_dist = tf.reduce_mean(tf.edit_distance(tf.cast(predicted_label[0], tf.int32), labels))
accuracy = compute_accuracy(predicted_label[0], labels)
tf.summary.scalar('edit_dist', edit_dist)
tf.summary.scalar('accuracy', accuracy)
metrics = {
    'accuracy': tf.metrics.mean(accuracy),
    'edit_dist': tf.metrics.mean(edit_dist),
}
if mode == tf.estimator.ModeKeys.EVAL:
    return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
As requested, here is the complete model and TfRecord Writer code:
def crnn_model(features, labels, mode, params):
    inputs = features['image']
    print("INPUTS SHAPE", inputs.shape)

    if mode == tf.estimator.ModeKeys.TRAIN:
        batch_size = params['batch_size']
        lr_initial = params['lr']
        lr = tf.train.exponential_decay(lr_initial, global_step=tf.train.get_global_step(),
                                        decay_steps=params['lr_decay_steps'],
                                        decay_rate=params['lr_decay_rate'],
                                        staircase=True)
        tf.summary.scalar('lr', lr)
    else:
        batch_size = params['test_batch_size']

    with tf.variable_scope('crnn', reuse=False):
        rnn_output, predicted_label, logits = CRNN(inputs, hidden_size=params['hidden_size'],
                                                   batch_size=batch_size)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'predicted_label': predicted_label,
            'logits': logits,
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    loss = tf.reduce_mean(tf.nn.ctc_loss(labels=labels, inputs=rnn_output,
                                         sequence_length=23 * np.ones(batch_size),
                                         ignore_longer_outputs_than_inputs=True))
    edit_dist = tf.reduce_mean(tf.edit_distance(tf.cast(predicted_label[0], tf.int32), labels))
    accuracy = compute_accuracy(predicted_label[0], labels)

    metrics = {
        'accuracy': tf.metrics.mean(accuracy),
        'edit_dist': tf.metrics.mean(edit_dist),
    }
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('edit_dist', edit_dist)
    tf.summary.scalar('accuracy', accuracy)

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)

    assert mode == tf.estimator.ModeKeys.TRAIN
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=lr)
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
Tf Record Writer code
def _write_fn(self, out_file, image_list, label_list, mode):
    writer = tf.python_io.TFRecordWriter(out_file)
    N = len(image_list)
    for i in range(N):
        if (i % 1000) == 0:
            print('%s Data: %d/%d records saved' % (mode, i, N))
            sys.stdout.flush()
        try:
            #print('Try image: ', image_list[i])
            image = load_image(image_list[i])
        except (ValueError, AttributeError):
            print('Ignoring image: ', image_list[i])
            continue
        label = label_list[i]
        feature = {
            'label': _int64_feature(label),
            'image': _byte_feature(tf.compat.as_bytes(image.tostring()))
        }
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(example.SerializeToString())
    writer.close()
In the Estimator framework, everything happens in the model_fn, namely your crnn_model(features, labels, mode, params). This is why this function has such a complex signature.
The mode parameter indicates whether it is called for training, evaluation or prediction. So, if you want to log additional summaries to tensorboard during the evaluation, you would add them under the if mode == tf.estimator.ModeKeys.EVAL section, or outside any if in the model_fn.
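As an illustration, here is a minimal sketch of what such an EVAL branch could look like. The eval_metric_ops route you already use is what shows up under the eval run in TensorBoard; the SummarySaverHook below is only an optional extra for writing merged tf.summary ops during evaluation, and the output directory is an assumption, not something from your code:
if mode == tf.estimator.ModeKeys.EVAL:
    # Optional: also write merged tf.summary ops during evaluation.
    eval_summary_hook = tf.train.SummarySaverHook(
        save_steps=1,
        output_dir='model_dir/eval',  # assumed path
        summary_op=tf.summary.merge_all())
    return tf.estimator.EstimatorSpec(
        mode, loss=loss,
        eval_metric_ops=metrics,
        evaluation_hooks=[eval_summary_hook])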
My first guess was that your eval is much slower because you have different batch sizes for train/eval and the eval batch size could be smaller, but you indicated this is not the case.
After a closer look at your code, and having worked with a similar model, I believe that the evaluation takes longer with metrics because one of the metrics is edit_distance(), which is implemented sequentially on the CPU. During training, this op is not required, so it is not run.
What I suggest is that you run your train() and evaluate() in different programs, with the same model_fn() and model_dir. This way, train does not need to wait for evaluate, and evaluate will run only when necessary, i.e. when there are new checkpoints in the model_dir. If you don't have 2 GPUs for this, you can either split the GPU memory between the two processes (using a custom run config with gpu_memory_fraction=0.75 for train) or hide the GPU from evaluate() with the CUDA_VISIBLE_DEVICES='' environment variable.
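A rough sketch of the evaluation-side script under that suggestion (crnn_model, eval_input_fn and params are assumed to be importable from your training code; the GPU is hidden before TensorFlow is imported, and the training process can be given a RunConfig with session_config=tf.ConfigProto(gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.75))):
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''  # hide the GPU from this process

import time
import tensorflow as tf

estimator = tf.estimator.Estimator(model_fn=crnn_model,
                                   model_dir='model_dir',  # same dir as the training job
                                   params=params)

while True:
    # evaluate() picks up the latest checkpoint in model_dir each time it runs.
    estimator.evaluate(input_fn=eval_input_fn)
    time.sleep(600)  # poll for new checkpoints every 10 minutes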
I have trained a CNN using the tf.estimator API, but am having trouble getting predictions out in a way that is useful to me.
I need to feed images to my CNN in real time as they are received from a camera. In an older net design I made the Controller_tf class, which worked fine for doing that. So I have tried to adapt it to a new CNN trained using tf.estimator (as said earlier).
The estimator.predict interface seems to want to be invoked via a tf.app.run() call (I would be glad to be proved wrong about that), which is why I am trying to run the CNN using tf.Session() (with if statements inside the model function to only run the relevant parts), but I'm currently getting the error:
ValueError: Fetch argument 'infer' cannot be interpreted as a Tensor. ("The name 'infer' refers to an Operation not in the graph.")
I can't quite see where I am going wrong. Is the trained model incompatible with being run in PREDICT mode? Any help will be very much appreciated. Anyway, here is the code:
class Controller_tf:
    set_speed = None

    def __init__(self, model, ckpt_path, set_speed_in):
        self.set_speed = set_speed_in
        self.x = tf.placeholder(tf.float32, shape=(None, 104, 160, 3))
        self.y = model(self.x, None, tf.estimator.ModeKeys.PREDICT)

        # make TF use memory growth method
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        saver = tf.train.Saver()
        saver.restore(self.sess, ckpt_path)

    def update(self, message):
        # The current speed of the car
        image = frame2numpy(message['frame'], (160, 104))
        image_array = np.asarray(image)
        turn_logits = self.sess.run(self.y, {self.x: image_array[None, :, :, :]})
        return turn_logits

model = cnn_model_fn3
ckpt = 'ckpts/stc_model3/model.ckpt-27621'
controller = Controller_tf(model, ckpt, 18)

image_file = 'G:/Datasets/ds072.001/ds072.001-fm-0008465.jpg'
#image_file = 'G:/Datasets/ds072.001/ds072.001-fm-0009156.jpg'

satnavimg = load_image(image_file)
satnavimg = np.asarray([satnavimg])
satnavimg = (satnavimg/127.5) - 1.0
print(np.shape(satnavimg))

msg = {'frame': satnavimg}
turn = controller.update(msg)
print(turn)
The model function is:
def cnn_model_fn3(features, labels, mode):

    if mode == tf.estimator.ModeKeys.PREDICT:
        input_layer = features
    else:
        input_layer = tf.reshape(features["image_data"], [-1, 104, 160, 3])

    conv1 = tf.layers.conv2d(
        inputs=input_layer,
        filters=32,
        kernel_size=[10, 10],
        padding="same",
        activation=tf.nn.relu,
        name='Conv1')

    ... removed layer code for brevity ...

    logits = tf.layers.dense(
        inputs=dropout1,
        units=3,
        name='Dense3')

    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = {
            "accuracy": tf.metrics.accuracy(
                labels=labels, predictions=predictions["classes"])}

    if mode == tf.estimator.ModeKeys.PREDICT:
        return logits

    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
I am using TensorFlow 1.2.1 on Windows 10 with the Estimator API. Everything runs without errors, but whenever parameters have to be restored from a checkpoint, some aspect of it doesn't work. I've checked that the value of every variable in classifier.get_variable_names() does not change after an evaluation; however, the loss spikes back up to near where it started, and this is followed by continued learning, each time learning faster than the last.
This happens within one TensorFlow run, when a validation or evaluation run happens, or when I rerun the python file to continue training.
The following graphs are one example of this problem; they are restoring the variables every 2500 steps:
http://imgur.com/6q9Wuat
http://imgur.com/CQ2hdR8
The following code is a significantly reduced version of my code, which still replicates the error:
import tensorflow as tf
from tensorflow.contrib import learn
from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
tf.logging.set_verbosity(tf.logging.INFO)
sess = tf.InteractiveSession()
def cnn_model_fn(features, labels, mode):
    dense_layer1 = tf.layers.dense(inputs=features, units=512, activation=tf.nn.relu, name="FC_1")
    dense_layer2 = tf.layers.dense(inputs=dense_layer1, units=1024, activation=tf.nn.relu, name="FC_2")
    dense_layer3 = tf.layers.dense(inputs=dense_layer2, units=2048, activation=tf.nn.relu, name="FC_3")
    dense_layer4 = tf.layers.dense(inputs=dense_layer3, units=512, activation=tf.nn.relu, name="FC_4")
    logits = tf.layers.dense(inputs=dense_layer4, units=2, name="logit_layer")

    loss = None
    train_op = None

    if mode != learn.ModeKeys.INFER:
        loss = tf.losses.softmax_cross_entropy(
            onehot_labels=labels, logits=logits)

    if mode == learn.ModeKeys.TRAIN:
        train_op = tf.contrib.layers.optimize_loss(
            loss=loss,
            global_step=tf.contrib.framework.get_global_step(),
            learning_rate=.001,
            optimizer="SGD")

    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(
            logits, name="softmax_tensor")}

    return model_fn_lib.ModelFnOps(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train_op)
def main(unused_param):

    def data_pipeline(filenames, batch_size, num_epochs=None, min_after_dequeue=10000):
        with tf.name_scope("data_pipeline"):
            filename_queue = tf.train.string_input_producer(filenames, num_epochs=num_epochs)
            reader = tf.TextLineReader()
            key, value = reader.read(filename_queue)
            row = tf.decode_csv(value, record_defaults=[[0.0] for _ in range(66)])
            example_op, label_op = tf.stack(row[:len(row)-2]), tf.stack(row[len(row)-2:])

            capacity = min_after_dequeue + 3 * batch_size

            example_batch, label_batch = tf.train.shuffle_batch(
                [example_op, label_op],
                batch_size=batch_size,
                capacity=capacity,
                min_after_dequeue=min_after_dequeue)

            return example_batch, label_batch

    def input_data_fn(data_getter_ops):
        batch, labels = sess.run(data_getter_ops)
        return tf.constant(batch, dtype=tf.float32), tf.constant(labels, dtype=tf.float32)

    NUM_EPOCHS = 6
    BATCHES_IN_TRAINING_EPOCH = 8000

    training_data_pipe_ops = data_pipeline(
        filenames=["train_data.csv"],
        batch_size=500,
        min_after_dequeue=10000)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    classifier = tf.contrib.learn.Estimator(
        model_fn=cnn_model_fn,
        model_dir="/tmp/bug_finder")

    for j in range(NUM_EPOCHS):
        classifier.fit(
            input_fn=lambda: input_data_fn(training_data_pipe_ops),
            steps=BATCHES_IN_TRAINING_EPOCH)
        print("Epoch", str(j+1), "training completed.")

    coord.request_stop()
    coord.join(threads)

if __name__ == "__main__":
    tf.app.run()
I figured out the issue: I was creating the data pipelines with the interactive session I had created, and then having my input function evaluate the examples (like a feed dictionary). The reason this is an issue is that the Estimator class creates its own session (a MonitoredTrainingSession), and since the graph operations weren't being created from within a call from the Estimator class (and thus with its session), they were not being saved. Using an input function to create the graph operations, and returning the final graph operation (the batching), has resulted in everything working smoothly.
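For reference, a minimal sketch of that fix with the code above, assuming data_pipeline() from the question is still in scope: the whole pipeline is built inside the input_fn, so the queue ops live in the graph and session owned by the Estimator.
def train_input_fn():
    # Build the pipeline inside the input_fn so the ops are created in the
    # Estimator's own graph (and run by its MonitoredTrainingSession).
    example_batch, label_batch = data_pipeline(
        filenames=["train_data.csv"],
        batch_size=500,
        min_after_dequeue=10000)
    return example_batch, label_batch

classifier = tf.contrib.learn.Estimator(
    model_fn=cnn_model_fn,
    model_dir="/tmp/bug_finder")

classifier.fit(input_fn=train_input_fn,
               steps=BATCHES_IN_TRAINING_EPOCH)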