How to get predictions from a tf.estimator trained CNN - tensorflow

I have trained a CNN using the tf.estimator API, but am having trouble getting predictions out in a way that is useful to me.
I need to feed images to my CNN in real time as they are received from a camera. For an older net design I made the Controller_tf class, which worked well for that, so I have tried to adapt it to a new CNN trained using tf.estimator (as said earlier).
The estimator.predict interface seems to want to be invoked via a tf.app.run() call (I'd be glad to be proven wrong about that), which is why I am trying to run the CNN with a plain tf.Session() instead (with if statements inside the model function to build only the relevant parts), but I'm currently getting the error:
ValueError: Fetch argument 'infer' cannot be interpreted as a Tensor. ("The name 'infer' refers to an Operation not in the graph.")
I can't quite see where I am going wrong. Is the trained model incompatible with running in PREDICT mode? Any help would be much appreciated. Anyway, here is the code:
import numpy as np
import tensorflow as tf

# frame2numpy and load_image are the asker's own helpers, defined elsewhere.

class Controller_tf:

    set_speed = None

    def __init__(self, model, ckpt_path, set_speed_in):
        self.set_speed = set_speed_in

        self.x = tf.placeholder(tf.float32, shape=(None, 104, 160, 3))
        self.y = model(self.x, None, tf.estimator.ModeKeys.PREDICT)

        # make TF use memory growth method
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        saver = tf.train.Saver()
        saver.restore(self.sess, ckpt_path)

    def update(self, message):
        # Decode the current camera frame from the message
        image = frame2numpy(message['frame'], (160, 104))
        image_array = np.asarray(image)
        turn_logits = self.sess.run(self.y, {self.x: image_array[None, :, :, :]})
        return turn_logits

model = cnn_model_fn3
ckpt = 'ckpts/stc_model3/model.ckpt-27621'
controller = Controller_tf(model, ckpt, 18)

image_file = 'G:/Datasets/ds072.001/ds072.001-fm-0008465.jpg'
#image_file = 'G:/Datasets/ds072.001/ds072.001-fm-0009156.jpg'

satnavimg = load_image(image_file)
satnavimg = np.asarray([satnavimg])
satnavimg = (satnavimg / 127.5) - 1.0
print(np.shape(satnavimg))

msg = {'frame': satnavimg}
turn = controller.update(msg)
print(turn)
The model function is:
def cnn_model_fn3(features, labels, mode):

    if mode == tf.estimator.ModeKeys.PREDICT:
        input_layer = features
    else:
        input_layer = tf.reshape(features["image_data"], [-1, 104, 160, 3])

    conv1 = tf.layers.conv2d(
        inputs=input_layer,
        filters=32,
        kernel_size=[10, 10],
        padding="same",
        activation=tf.nn.relu,
        name='Conv1')

    # ... removed layer code for brevity ...

    logits = tf.layers.dense(
        inputs=dropout1,
        units=3,
        name='Dense3')

    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = {
            "accuracy": tf.metrics.accuracy(
                labels=labels, predictions=predictions["classes"])}

    # (Note: this second PREDICT branch is unreachable, since PREDICT
    # already returned an EstimatorSpec above.)
    if mode == tf.estimator.ModeKeys.PREDICT:
        return logits

    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
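For what it's worth, estimator.predict does not actually need to be launched through tf.app.run(): it is an ordinary Python generator you can call from any script. As far as I can tell, the 'infer' error comes from sess.run being handed the tf.estimator.EstimatorSpec that cnn_model_fn3 returns in PREDICT mode (ModeKeys.PREDICT is the string 'infer', which the session then tries to look up as an op name). A minimal sketch of the generator route, reusing the names from the question (the exact checkpoint layout is an assumption):

import numpy as np
import tensorflow as tf

# Rebuild the estimator around the trained checkpoint directory.
estimator = tf.estimator.Estimator(model_fn=cnn_model_fn3,
                                   model_dir='ckpts/stc_model3')

# The PREDICT branch of cnn_model_fn3 uses `features` directly, so the
# input_fn only has to yield image batches of shape (?, 104, 160, 3).
image_batch = satnavimg.astype(np.float32)

def predict_input_fn():
    return tf.data.Dataset.from_tensor_slices(image_batch).batch(1)

for pred in estimator.predict(input_fn=predict_input_fn):
    print(pred['classes'], pred['probabilities'])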

Related

Facing error in tensorflow2.0 while building tf_estimator

import custom_model as CM
import input_pipeline as IP
import tensorflow as tf

def custom_estimator(features, labels, mode):
    logits = CM.model_net(features=features, n_classes=5)
    prediction = tf.keras.layers.Activation('softmax')(logits)
    preds_dict = {'class': tf.argmax(input=prediction, axis=1),
                  'probabilities': prediction,
                  'logits': logits}
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=preds_dict)

    # Compute loss
    labels = tf.reshape(labels, (BATCH_SIZE, 5))
    loss = tf.keras.losses.categorical_crossentropy(y_true=labels,
                                                    y_pred=prediction)

    # Compute evaluation metrics
    accuracy = custom_accuracy(labels, prediction)
    metrics = {'accuracy': accuracy}
    tf.summary.scalar('accuracy', accuracy)

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss,
                                          eval_metric_ops=metrics)

    optimizer = tf.keras.optimizers.Adam()
    train_op = optimizer.minimize(loss)
    return tf.estimator.EstimatorSpec(mode, loss=loss,
                                      train_op=train_op)

# Build tf_estimator
classifier = tf.estimator.Estimator(model_fn=custom_estimator,
                                    model_dir=model_dir)

# Train the estimator
TRAIN_FILES, TRAIN_LABELS = IP.map_file_to_label(data_dir=data_dir)
TRAIN = classifier.train(
    input_fn=lambda: IP.imgs_input_fn(TRAIN_FILES, labels=TRAIN_LABELS,
                                      perform_shuffle=True, repeat_count=EPOCHS,
                                      batch_size=BATCH_SIZE),
    steps=int(len(TRAIN_LABELS) / BATCH_SIZE))
This is the error I am facing with TensorFlow 2.0; the error screenshot and the code for it are attached above. Please help.
If I pass var_list=None then the error is "ValueError: Passed in object of type , not tf.Tensor".
First of all, I don't think this Estimator code example is TensorFlow 2.0 compliant. In any case, if you are in fact using a 1.x version, replace:
train_op = optimizer.minimize(loss)
with this:
train_op = optimizer.minimize(
    loss=loss, global_step=tf.train.get_global_step())
If indeed, you're using TensorFlow 2.0, then replace with:
train_op = optimizer.minimize(
    loss=loss, global_step=tf.compat.v1.train.get_global_step())
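Note that the snippet in the question uses tf.keras.optimizers.Adam, whose minimize() has a different signature in TF 2.x (it requires a var_list, and outside graph mode a callable loss) and does not accept global_step, which is why passing var_list=None still fails. Inside a graph-mode model_fn, the simplest working combination I know of is the v1-compat optimizer; a minimal sketch under that assumption, dropped into the TRAIN branch of custom_estimator:

# Replace the Keras optimizer with its v1-compat counterpart: these
# optimizers accept a loss *tensor* and wire in the global step that
# tf.estimator uses to count training steps.
optimizer = tf.compat.v1.train.AdamOptimizer()
train_op = optimizer.minimize(
    loss=loss, global_step=tf.compat.v1.train.get_global_step())
return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)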

Sagemaker and Tensorflow model not saved

I am learning Sagemaker and I have this entry point:
import os
import tensorflow as tf
from tensorflow.python.estimator.model_fn import ModeKeys as Modes

INPUT_TENSOR_NAME = 'inputs'
SIGNATURE_NAME = 'predictions'
LEARNING_RATE = 0.001

def model_fn(features, labels, mode, params):
    # Input Layer
    input_layer = tf.reshape(features[INPUT_TENSOR_NAME], [-1, 28, 28, 1])

    # Convolutional Layer #1
    conv1 = tf.layers.conv2d(
        inputs=input_layer,
        filters=32,
        kernel_size=[5, 5],
        padding='same',
        activation=tf.nn.relu)

    # Pooling Layer #1
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)

    # Convolutional Layer #2 and Pooling Layer #2
    conv2 = tf.layers.conv2d(
        inputs=pool1,
        filters=64,
        kernel_size=[5, 5],
        padding='same',
        activation=tf.nn.relu)
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)

    # Dense Layer
    pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
    dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)
    dropout = tf.layers.dropout(
        inputs=dense, rate=0.4, training=(mode == Modes.TRAIN))

    # Logits Layer
    logits = tf.layers.dense(inputs=dropout, units=10)

    # Define operations
    if mode in (Modes.PREDICT, Modes.EVAL):
        predicted_indices = tf.argmax(input=logits, axis=1)
        probabilities = tf.nn.softmax(logits, name='softmax_tensor')
    if mode in (Modes.TRAIN, Modes.EVAL):
        global_step = tf.train.get_or_create_global_step()
        label_indices = tf.cast(labels, tf.int32)
        loss = tf.losses.softmax_cross_entropy(
            onehot_labels=tf.one_hot(label_indices, depth=10), logits=logits)
        tf.summary.scalar('OptimizeLoss', loss)

    if mode == Modes.PREDICT:
        predictions = {
            'classes': predicted_indices,
            'probabilities': probabilities
        }
        export_outputs = {
            SIGNATURE_NAME: tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(
            mode, predictions=predictions, export_outputs=export_outputs)

    if mode == Modes.TRAIN:
        optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(loss, global_step=global_step)
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    if mode == Modes.EVAL:
        eval_metric_ops = {
            'accuracy': tf.metrics.accuracy(label_indices, predicted_indices)
        }
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, eval_metric_ops=eval_metric_ops)

def serving_input_fn(params):
    inputs = {INPUT_TENSOR_NAME: tf.placeholder(tf.float32, [None, 784])}
    return tf.estimator.export.ServingInputReceiver(inputs, inputs)

def read_and_decode(filename_queue):
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)

    features = tf.parse_single_example(
        serialized_example,
        features={
            'image_raw': tf.FixedLenFeature([], tf.string),
            'label': tf.FixedLenFeature([], tf.int64),
        })

    image = tf.decode_raw(features['image_raw'], tf.uint8)
    image.set_shape([784])
    image = tf.cast(image, tf.float32) * (1. / 255)
    label = tf.cast(features['label'], tf.int32)
    return image, label

def train_input_fn(training_dir, params):
    return _input_fn(training_dir, 'train.tfrecords', batch_size=100)

def eval_input_fn(training_dir, params):
    return _input_fn(training_dir, 'test.tfrecords', batch_size=100)

def _input_fn(training_dir, training_filename, batch_size=100):
    test_file = os.path.join(training_dir, training_filename)
    filename_queue = tf.train.string_input_producer([test_file])

    image, label = read_and_decode(filename_queue)
    images, labels = tf.train.batch(
        [image, label], batch_size=batch_size,
        capacity=1000 + 3 * batch_size)
    return {INPUT_TENSOR_NAME: images}, labels

def neo_preprocess(payload, content_type):
    import logging
    import numpy as np
    import io

    logging.info('Invoking user-defined pre-processing function')

    if content_type != 'application/x-image' and content_type != 'application/vnd+python.numpy+binary':
        raise RuntimeError('Content type must be application/x-image or application/vnd+python.numpy+binary')

    f = io.BytesIO(payload)
    image = np.load(f) * 255
    return image

### NOTE: this function cannot use MXNet
def neo_postprocess(result):
    import logging
    import numpy as np
    import json

    logging.info('Invoking user-defined post-processing function')

    # Softmax (assumes batch size 1)
    result = np.squeeze(result)
    result_exp = np.exp(result - np.max(result))
    result = result_exp / np.sum(result_exp)

    response_body = json.dumps(result.tolist())
    content_type = 'application/json'
    return response_body, content_type
And I am training it with:
estimator = TensorFlow(entry_point='cnn_fashion_mnist.py',
                       role=role,
                       input_mode='Pipe',
                       training_steps=1,
                       evaluation_steps=1,
                       train_instance_count=1,
                       output_path=output_path,
                       train_instance_type='ml.c5.2xlarge',
                       base_job_name='mnist')
So far it trains correctly and tells me that everything went well, but when I check the output path there is nothing there, and if I try to deploy the model I get an error saying it couldn't be found because there is nothing in the bucket. Any ideas or extra configurations? Thank you.
It looks like you are using one of the older TensorFlow versions.
We would recommend switching to the newer, more straightforward way of running TensorFlow in SageMaker (script mode) by moving to a more recent TensorFlow version.
You can read more about it in our documentation:
https://sagemaker.readthedocs.io/en/stable/using_tf.html
Here is an example that might help:
https://github.com/awslabs/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/tensorflow_script_mode_training_and_serving/tensorflow_script_mode_training_and_serving.ipynb
Are you sure that your entry point contains code that is actually executed? You need "main" / top-level code outside of the functions; that code runs as soon as the training starts, at least in my working examples.
import os
import tensorflow as tf
from tensorflow.python.estimator.model_fn import ModeKeys as Modes
INPUT_TENSOR_NAME = 'inputs'
SIGNATURE_NAME = 'predictions'
LEARNING_RATE = 0.001
# ADD CODE FOR CREATION OF ESTIMATOR + TRAIN + ...
# ADD CODE THAT SAVES YOUR MODEL (e.g. joblib.dump(xxx, path))
In addition, to actually execute the training, your estimator = TensorFlow(...) should be followed by an estimator.fit(...)-like call.
Have you double-checked, in the logs for your training request in the AWS console, which parts of your code were executed?
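A minimal sketch of that last point (the S3 input path is an assumption, not from the question): the SageMaker Python SDK only launches a training job, and only writes a model to output_path, once fit() is called on the estimator object.

from sagemaker.tensorflow import TensorFlow

estimator = TensorFlow(entry_point='cnn_fashion_mnist.py',
                       role=role,
                       input_mode='Pipe',
                       training_steps=1,
                       evaluation_steps=1,
                       train_instance_count=1,
                       output_path=output_path,
                       train_instance_type='ml.c5.2xlarge',
                       base_job_name='mnist')

# Nothing trains (and nothing lands in output_path) until fit() runs.
estimator.fit('s3://my-bucket/fashion-mnist/tfrecords/')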

tensorflow estimator evaluate much slower than training

I have a custom estimator and am trying to use some custom metrics during evaluation. However, whenever I add these metrics to evaluation via eval_metric_ops, evaluation becomes really slow (much slower than training, which actually computes the same metrics). If I don't add them there, then I only see metrics in TensorBoard for training, not for evaluation.
What is the right way to add a custom metric to a custom estimator so that it is recorded during evaluation?
This is what I have:
def compute_accuracy(preds, labels):
    total = tf.shape(labels.values)[0]
    preds = tf.sparse_to_dense(preds.indices, preds.dense_shape, preds.values, default_value=-1)
    labels = tf.sparse_to_dense(labels.indices, labels.dense_shape, labels.values, default_value=-2)
    r = tf.shape(labels)[0]
    c = tf.minimum(tf.shape(labels)[1], tf.shape(preds)[1])
    preds = tf.slice(preds, [0, 0], [r, c])
    labels = tf.slice(labels, [0, 0], [r, c])
    preds = tf.cast(preds, tf.int32)
    labels = tf.cast(labels, tf.int32)
    correct = tf.reduce_sum(tf.cast(tf.equal(preds, labels), tf.int32))
    accuracy = tf.divide(correct, total)
    return accuracy
In model_fn:
edit_dist = tf.reduce_mean(tf.edit_distance(tf.cast(predicted_label[0], tf.int32), labels))
accuracy = compute_accuracy(predicted_label[0], labels)

tf.summary.scalar('edit_dist', edit_dist)
tf.summary.scalar('accuracy', accuracy)

metrics = {
    'accuracy': tf.metrics.mean(accuracy),
    'edit_dist': tf.metrics.mean(edit_dist),
}

if mode == tf.estimator.ModeKeys.EVAL:
    return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
As requested, here is the complete model and TFRecord writer code:
# CRNN is the asker's own network-building function, defined elsewhere.

def crnn_model(features, labels, mode, params):
    inputs = features['image']
    print("INPUTS SHAPE", inputs.shape)

    if mode == tf.estimator.ModeKeys.TRAIN:
        batch_size = params['batch_size']
        lr_initial = params['lr']
        lr = tf.train.exponential_decay(lr_initial, global_step=tf.train.get_global_step(),
                                        decay_steps=params['lr_decay_steps'], decay_rate=params['lr_decay_rate'],
                                        staircase=True)
        tf.summary.scalar('lr', lr)
    else:
        batch_size = params['test_batch_size']

    with tf.variable_scope('crnn', reuse=False):
        rnn_output, predicted_label, logits = CRNN(inputs, hidden_size=params['hidden_size'], batch_size=batch_size)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'predicted_label': predicted_label,
            'logits': logits,
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    loss = tf.reduce_mean(tf.nn.ctc_loss(labels=labels, inputs=rnn_output,
                                         sequence_length=23 * np.ones(batch_size),
                                         ignore_longer_outputs_than_inputs=True))
    edit_dist = tf.reduce_mean(tf.edit_distance(tf.cast(predicted_label[0], tf.int32), labels))
    accuracy = compute_accuracy(predicted_label[0], labels)

    metrics = {
        'accuracy': tf.metrics.mean(accuracy),
        'edit_dist': tf.metrics.mean(edit_dist),
    }

    tf.summary.scalar('loss', loss)
    tf.summary.scalar('edit_dist', edit_dist)
    tf.summary.scalar('accuracy', accuracy)

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)

    assert mode == tf.estimator.ModeKeys.TRAIN
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=lr)
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
The TFRecord writer code:
def _write_fn(self, out_file, image_list, label_list, mode):
    writer = tf.python_io.TFRecordWriter(out_file)
    N = len(image_list)
    for i in range(N):
        if (i % 1000) == 0:
            print('%s Data: %d/%d records saved' % (mode, i, N))
            sys.stdout.flush()
        try:
            #print('Try image: ', image_list[i])
            image = load_image(image_list[i])
        except (ValueError, AttributeError):
            print('Ignoring image: ', image_list[i])
            continue
        label = label_list[i]
        feature = {
            'label': _int64_feature(label),
            'image': _byte_feature(tf.compat.as_bytes(image.tostring()))
        }
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(example.SerializeToString())
    writer.close()
In the Estimator framework, everything happens in the model_fn, namely your crnn_model(features, labels, mode, params). This is why this function has such a complex signature.
The mode parameter indicates whether it is called for training, evaluation or prediction. So, if you want to log additional summaries to tensorboard during the evaluation, you would add them under the if mode == tf.estimator.ModeKeys.EVAL section, or outside any if in the model_fn.
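One subtlety worth adding here (my note, not part of the original answer): plain tf.summary.* calls are not written to disk during evaluation by default; only eval_metric_ops are. If you want the raw summaries during EVAL as well, you can attach a SummarySaverHook to the EVAL spec. A sketch, with the output directory assumed:

if mode == tf.estimator.ModeKeys.EVAL:
    # Persist tf.summary.* ops during evaluation, alongside the metrics.
    eval_summary_hook = tf.train.SummarySaverHook(
        save_steps=1,
        output_dir='model_dir/eval',  # assumed path
        summary_op=tf.summary.merge_all())
    return tf.estimator.EstimatorSpec(
        mode, loss=loss, eval_metric_ops=metrics,
        evaluation_hooks=[eval_summary_hook])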
My first guess was that your eval is much slower because you have different batch sizes for train/eval and the eval batch size could be smaller, but you indicated this is not the case.
After a closer look at your code, and having worked with a similar model, I believe the evaluation takes longer with metrics because one of them is built on edit_distance(), which is implemented sequentially on the CPU. During training this op is not required, so it is not run.
What I suggest is that you run your train() and evaluate() in different processes, with the same model_fn() and model_dir. This way train does not need to wait for evaluate, and evaluate runs only when necessary, i.e. when there are new checkpoints in the model_dir. If you don't have 2 GPUs for this, you can either split the GPU memory between the two processes (using a custom run config with gpu_memory_fraction=0.75 for train) or hide the GPU from evaluate() with the CUDA_VISIBLE_DEVICES='' environment variable.
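Concretely, the split might look like this (a sketch with my own naming assumptions; train_input_fn, eval_input_fn and params stand in for your existing ones):

import time
import tensorflow as tf

# train.py -- cap GPU memory so the eval process can coexist on the card
session_config = tf.ConfigProto()
session_config.gpu_options.per_process_gpu_memory_fraction = 0.75
run_config = tf.estimator.RunConfig(model_dir='model_dir',
                                    session_config=session_config)
estimator = tf.estimator.Estimator(model_fn=crnn_model, params=params,
                                   config=run_config)
estimator.train(input_fn=train_input_fn)

# eval.py -- launch with CUDA_VISIBLE_DEVICES='' to keep it off the GPU;
# each evaluate() call loads the newest checkpoint in model_dir
estimator = tf.estimator.Estimator(model_fn=crnn_model, params=params,
                                   model_dir='model_dir')
while True:
    estimator.evaluate(input_fn=eval_input_fn)
    time.sleep(60)  # poll for fresh checkpoints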

Tensorflow model outputs different value during inference

During training, I'm logging the values that my regressor model predicts for the training data. When I run that same dataset in prediction mode, the range of values the model outputs is very different:
[TensorBoard screenshot] Here the model has been predicting values in the range (140, 250).
[Predictions against the same dataset] Here the model predicts values between (17, 23). What gives?
My suspicion is that the estimator API doesn't magically save the moving_mean and moving_variance when using tf.layers.batch_normalization.
My model:
def model_fn(features, labels, mode, params):
    training = mode == tf.estimator.ModeKeys.TRAIN
    extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    x = tf.reshape(features, [-1, 32, 32, 3])
    x = tf.layers.batch_normalization(x, training=training, name='norm_128')

    i = 1
    for filters in [32, 64]:
        x = tf.layers.conv2d(x, filters=filters, kernel_size=3, activation=None, name='conv{}'.format(i))
        x = tf.layers.batch_normalization(x, training=training, name='norm{}'.format(i))
        x = tf.nn.relu(x, name='act{}'.format(i))
        i += 1
        x = tf.layers.conv2d(x, filters=filters * 2, kernel_size=3, strides=2, activation=None, name='pool{}'.format(i))
        x = tf.layers.batch_normalization(x, training=training, name='norm{}'.format(i))
        x = tf.nn.relu(x, name='act{}'.format(i))
        i += 1

    flat = tf.contrib.layers.flatten(x, scope='flatten')
    dropout = tf.layers.dropout(flat, rate=params['dropout_rate'], training=training, name='dropout')
    output_layer = tf.layers.dense(dropout, units=1, name='output_layer')
    predictions = tf.reshape(output_layer, [-1])
    predictions_dict = {
        'pred': predictions,
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions_dict)

    loss = tf.losses.mean_squared_error(labels=labels, predictions=predictions)
    tf.summary.scalar('loss', loss)
    tf.summary.histogram('prediction', predictions)
    tf.summary.scalar('prediction', tf.reduce_mean(predictions))

    optimizer = tf.train.AdamOptimizer(learning_rate=params['learning_rate'])
    with tf.control_dependencies(extra_update_ops):
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())

    eval_metric_ops = {
        'rmse_val': tf.metrics.root_mean_squared_error(labels=tf.cast(labels, tf.float32), predictions=predictions)
    }
    tf.summary.scalar('rmse_train', eval_metric_ops['rmse_val'][1])

    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op, eval_metric_ops=eval_metric_ops)
The only source of randomness in your code is dropout. After training, at prediction time, set the dropout keep probability to 1 (i.e. disable dropout). The dropout layer randomly selects a subset of the activations passed through it, and training happens on that subset to prevent overfitting; at inference you want the full, deterministic network.
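The batch-norm suspicion in the question is also easy to check directly (my addition, not part of the answer above): read the checkpoint back and see whether the moving statistics were saved and have moved away from their initial values. A sketch, assuming the checkpoints live in model_dir:

import tensorflow as tf

ckpt = tf.train.latest_checkpoint('model_dir')  # assumed directory
reader = tf.train.NewCheckpointReader(ckpt)
for name in sorted(reader.get_variable_to_shape_map()):
    # moving_mean stuck at 0 and moving_variance stuck at 1 would mean the
    # update ops never ran; anything else means they were saved properly.
    if 'moving_mean' in name or 'moving_variance' in name:
        print(name, reader.get_tensor(name).ravel()[:5])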

TensorFlow Estimator restoring all variables properly, but loss spikes up afterwards

I am using TensorFlow 1.2.1 on Windows 10 with the Estimator API. Everything runs without errors, but whenever the parameters are restored from a checkpoint, something doesn't work. I've checked that the value of every variable in classifier.get_variable_names() is unchanged after an evaluation; however, the loss spikes back up to near where it started, followed by continued learning, each time faster than the last.
This happens within a single TensorFlow run whenever a validation or evaluation pass occurs, and also when I rerun the Python file to continue training.
The following graphs are one example of this problem; in this run the variables are restored every 2500 steps:
http://imgur.com/6q9Wuat
http://imgur.com/CQ2hdR8
The following code is a significantly reduced version of my code, which still replicates the error:
import tensorflow as tf
from tensorflow.contrib import learn
from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib

tf.logging.set_verbosity(tf.logging.INFO)
sess = tf.InteractiveSession()

def cnn_model_fn(features, labels, mode):
    dense_layer1 = tf.layers.dense(inputs=features, units=512, activation=tf.nn.relu, name="FC_1")
    dense_layer2 = tf.layers.dense(inputs=dense_layer1, units=1024, activation=tf.nn.relu, name="FC_2")
    dense_layer3 = tf.layers.dense(inputs=dense_layer2, units=2048, activation=tf.nn.relu, name="FC_3")
    dense_layer4 = tf.layers.dense(inputs=dense_layer3, units=512, activation=tf.nn.relu, name="FC_4")
    logits = tf.layers.dense(inputs=dense_layer4, units=2, name="logit_layer")

    loss = None
    train_op = None

    if mode != learn.ModeKeys.INFER:
        loss = tf.losses.softmax_cross_entropy(
            onehot_labels=labels, logits=logits)
    if mode == learn.ModeKeys.TRAIN:
        train_op = tf.contrib.layers.optimize_loss(
            loss=loss,
            global_step=tf.contrib.framework.get_global_step(),
            learning_rate=.001,
            optimizer="SGD")

    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(
            logits, name="softmax_tensor")}

    return model_fn_lib.ModelFnOps(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train_op)

def main(unused_param):

    def data_pipeline(filenames, batch_size, num_epochs=None, min_after_dequeue=10000):
        with tf.name_scope("data_pipeline"):
            filename_queue = tf.train.string_input_producer(filenames, num_epochs=num_epochs)
            reader = tf.TextLineReader()
            key, value = reader.read(filename_queue)
            row = tf.decode_csv(value, record_defaults=[[0.0] for _ in range(66)])
            example_op, label_op = tf.stack(row[:len(row)-2]), tf.stack(row[len(row)-2:])

            capacity = min_after_dequeue + 3 * batch_size
            example_batch, label_batch = tf.train.shuffle_batch(
                [example_op, label_op],
                batch_size=batch_size,
                capacity=capacity,
                min_after_dequeue=min_after_dequeue)
            return example_batch, label_batch

    def input_data_fn(data_getter_ops):
        batch, labels = sess.run(data_getter_ops)
        return tf.constant(batch, dtype=tf.float32), tf.constant(labels, dtype=tf.float32)

    NUM_EPOCHS = 6
    BATCHES_IN_TRAINING_EPOCH = 8000

    training_data_pipe_ops = data_pipeline(
        filenames=["train_data.csv"],
        batch_size=500,
        min_after_dequeue=10000)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    classifier = tf.contrib.learn.Estimator(
        model_fn=cnn_model_fn,
        model_dir="/tmp/bug_finder")

    for j in range(NUM_EPOCHS):
        classifier.fit(
            input_fn=lambda: input_data_fn(training_data_pipe_ops),
            steps=BATCHES_IN_TRAINING_EPOCH)
        print("Epoch", str(j+1), "training completed.")

    coord.request_stop()
    coord.join(threads)

if __name__ == "__main__":
    tf.app.run()
I figured out the issue: I was creating the data pipeline with the interactive session I had made, and then having my input function evaluate the examples (like a feed dictionary). The reason this is a problem is that the Estimator class creates its own session (a MonitoredTrainingSession), and since the pipeline ops weren't created within a call made by the Estimator class (and hence not with its session and graph), they were not being saved. Using an input function to create the graph operations, and returning the final graph operation (the batching), made everything work smoothly.
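A minimal sketch of that fix, adapted from the code above (the restructuring is mine, not the poster's exact final code): the whole pipeline moves inside the input_fn, and the outer InteractiveSession, coordinator and queue-runner threads disappear, since the Estimator's MonitoredTrainingSession starts and manages them itself.

def input_fn():
    # Build the entire input pipeline inside the Estimator's own graph.
    filename_queue = tf.train.string_input_producer(["train_data.csv"])
    reader = tf.TextLineReader()
    _, value = reader.read(filename_queue)
    row = tf.decode_csv(value, record_defaults=[[0.0] for _ in range(66)])
    example_op, label_op = tf.stack(row[:-2]), tf.stack(row[-2:])
    example_batch, label_batch = tf.train.shuffle_batch(
        [example_op, label_op], batch_size=500,
        capacity=10000 + 3 * 500, min_after_dequeue=10000)
    return example_batch, label_batch

classifier = tf.contrib.learn.Estimator(
    model_fn=cnn_model_fn, model_dir="/tmp/bug_finder")
classifier.fit(input_fn=input_fn, steps=BATCHES_IN_TRAINING_EPOCH)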