tensorflow estimator evaluate much slower than training

I have a custom estimator and am trying to use some custom metrics during evaluation. However, whenever I add these metrics to evaluation via eval_metric_ops, the evaluation becomes really slow (much slower than training, which actually calculates the same metrics). If I don't add the metrics there, then I can only see metrics in TensorBoard for training and not for evaluation.
What is the right way to add a custom metric to a custom estimator so that it is saved during evaluation?
This is what I have:
def compute_accuracy(preds, labels):
    # preds and labels are SparseTensors; total counts the label entries
    total = tf.shape(labels.values)[0]
    preds = tf.sparse_to_dense(preds.indices, preds.dense_shape, preds.values, default_value=-1)
    labels = tf.sparse_to_dense(labels.indices, labels.dense_shape, labels.values, default_value=-2)
    # crop both dense tensors to the overlapping [r, c] region before comparing
    r = tf.shape(labels)[0]
    c = tf.minimum(tf.shape(labels)[1], tf.shape(preds)[1])
    preds = tf.slice(preds, [0, 0], [r, c])
    labels = tf.slice(labels, [0, 0], [r, c])
    preds = tf.cast(preds, tf.int32)
    labels = tf.cast(labels, tf.int32)
    correct = tf.reduce_sum(tf.cast(tf.equal(preds, labels), tf.int32))
    accuracy = tf.divide(correct, total)
    return accuracy
In the model_fn:
edit_dist = tf.reduce_mean(tf.edit_distance(tf.cast(predicted_label[0], tf.int32), labels))
accuracy = compute_accuracy(predicted_label[0], labels)
tf.summary.scalar('edit_dist', edit_dist)
tf.summary.scalar('accuracy', accuracy)
metrics = {
    'accuracy': tf.metrics.mean(accuracy),
    'edit_dist': tf.metrics.mean(edit_dist),
}
if mode == tf.estimator.ModeKeys.EVAL:
    return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
As requested, here is the complete model and TFRecord writer code:
def crnn_model(features, labels, mode, params):
    inputs = features['image']
    print("INPUTS SHAPE", inputs.shape)
    if mode == tf.estimator.ModeKeys.TRAIN:
        batch_size = params['batch_size']
        lr_initial = params['lr']
        lr = tf.train.exponential_decay(lr_initial, global_step=tf.train.get_global_step(),
                                        decay_steps=params['lr_decay_steps'],
                                        decay_rate=params['lr_decay_rate'],
                                        staircase=True)
        tf.summary.scalar('lr', lr)
    else:
        batch_size = params['test_batch_size']
    with tf.variable_scope('crnn', reuse=False):
        rnn_output, predicted_label, logits = CRNN(inputs, hidden_size=params['hidden_size'], batch_size=batch_size)
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'predicted_label': predicted_label,
            'logits': logits,
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    loss = tf.reduce_mean(tf.nn.ctc_loss(labels=labels, inputs=rnn_output,
                                         sequence_length=23 * np.ones(batch_size),
                                         ignore_longer_outputs_than_inputs=True))
    edit_dist = tf.reduce_mean(tf.edit_distance(tf.cast(predicted_label[0], tf.int32), labels))
    accuracy = compute_accuracy(predicted_label[0], labels)
    metrics = {
        'accuracy': tf.metrics.mean(accuracy),
        'edit_dist': tf.metrics.mean(edit_dist),
    }
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('edit_dist', edit_dist)
    tf.summary.scalar('accuracy', accuracy)
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
    assert mode == tf.estimator.ModeKeys.TRAIN
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=lr)
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
TFRecord writer code:
def _write_fn(self, out_file, image_list, label_list, mode):
    writer = tf.python_io.TFRecordWriter(out_file)
    N = len(image_list)
    for i in range(N):
        if (i % 1000) == 0:
            print('%s Data: %d/%d records saved' % (mode, i, N))
            sys.stdout.flush()
        try:
            #print('Try image: ', image_list[i])
            image = load_image(image_list[i])
        except (ValueError, AttributeError):
            print('Ignoring image: ', image_list[i])
            continue
        label = label_list[i]
        feature = {
            'label': _int64_feature(label),
            'image': _byte_feature(tf.compat.as_bytes(image.tostring()))
        }
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(example.SerializeToString())
    writer.close()

In the Estimator framework, everything happens in the model_fn, namely your crnn_model(features, labels, mode, params). That is why this function has such a complex signature.
The mode parameter indicates whether the function is called for training, evaluation, or prediction. So, if you want to log additional summaries to TensorBoard during evaluation, you add them under the if mode == tf.estimator.ModeKeys.EVAL section, or outside any if in the model_fn.
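For example, a minimal sketch (my illustration, not the asker's code; eval_dir is a hypothetical output directory) that writes extra summaries during evaluation by attaching a SummarySaverHook to the EVAL branch:
    # Sketch: extra summaries during evaluation (TF 1.x).
    # `loss` and `metrics` are the names from the question's model_fn.
    if mode == tf.estimator.ModeKeys.EVAL:
        eval_summary_hook = tf.train.SummarySaverHook(
            save_steps=1,  # write summaries on every evaluation step
            output_dir=eval_dir,
            summary_op=tf.summary.merge_all())
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, eval_metric_ops=metrics,
            evaluation_hooks=[eval_summary_hook])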
My first guess was that your eval is much slower because you have different batch sizes for train/eval and the eval batch size could be smaller, but you indicated this is not the case.
After a closer look at your code, and having experimented with a similar model, I believe the evaluation takes longer with metrics because one of them is edit_distance(), which is implemented sequentially on the CPU. During training this op is not required, so it is not run.
What I suggest is that you run your train() and evaluate() in different programs, with the same model_fn() and model_dir. This way, train does not need to wait for evaluate, and evaluate will run only when necessary, i.e. when there are new checkpoints in the model_dir. If you don't have 2 GPUs for this, you can either split the GPU memory between the two processes (using a custom run config with gpu_memory_fraction=0.75 for train) or hide the GPU from evaluate() with the CUDA_VISIBLE_DEVICES='' environment variable.
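A minimal sketch of that two-process setup (my own illustration; train_input_fn, eval_input_fn and params are placeholders):
    # train.py -- give training ~75% of the GPU memory
    session_config = tf.ConfigProto()
    session_config.gpu_options.per_process_gpu_memory_fraction = 0.75
    run_config = tf.estimator.RunConfig(model_dir='model_dir', session_config=session_config)
    estimator = tf.estimator.Estimator(model_fn=crnn_model, config=run_config, params=params)
    estimator.train(input_fn=train_input_fn)

    # eval.py -- run with CUDA_VISIBLE_DEVICES='' python eval.py to hide the GPU;
    # checkpoints_iterator blocks until a new checkpoint appears in model_dir
    estimator = tf.estimator.Estimator(model_fn=crnn_model, model_dir='model_dir', params=params)
    for ckpt in tf.contrib.training.checkpoints_iterator('model_dir'):
        estimator.evaluate(input_fn=eval_input_fn, checkpoint_path=ckpt)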

Related

Facing error in tensorflow2.0 while building tf_estimator

import custom_model as CM
import input_pipeline as IP
import tensorflow as tf

def custom_estimator(features, labels, mode):
    logits = CM.model_net(features=features, n_classes=5)
    prediction = tf.keras.layers.Activation('softmax')(logits)
    preds_dict = {'class': tf.argmax(input=prediction, axis=1),
                  'probabilities': prediction,
                  'logits': logits}
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=preds_dict)
    # Compute loss
    labels = tf.reshape(labels, (BATCH_SIZE, 5))
    loss = tf.keras.losses.categorical_crossentropy(y_true=labels, y_pred=prediction)
    # Compute evaluation metrics
    accuracy = custom_accuracy(labels, prediction)
    metrics = {'accuracy': accuracy}
    tf.summary.scalar('accuracy', accuracy)
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
    optimizer = tf.keras.optimizers.Adam()
    train_op = optimizer.minimize(loss)
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

# Build tf_estimator
classifier = tf.estimator.Estimator(model_fn=custom_estimator, model_dir=model_dir)

# Train the estimator
TRAIN_FILES, TRAIN_LABELS = IP.map_file_to_label(data_dir=data_dir)
TRAIN = classifier.train(input_fn=lambda: IP.imgs_input_fn(TRAIN_FILES, labels=TRAIN_LABELS,
                                                           perform_shuffle=True, repeat_count=EPOCHS,
                                                           batch_size=BATCH_SIZE),
                         steps=int(len(TRAIN_LABELS)/BATCH_SIZE))
This is the error I am facing with TensorFlow 2.0; the error image and code for it are attached here. Please help.
If I pass var_list=None, then the error is "ValueError: Passed in object of type , not tf.Tensor".
First of all, I don't think this Estimator code example is TensorFlow 2.0 compliant. In any case, if you happen to be using a 1.x version, replace:
train_op = optimizer.minimize(loss)
with this:
train_op = optimizer.minimize(
    loss=loss, global_step=tf.train.get_global_step())
If you're indeed using TensorFlow 2.0, then replace it with:
train_op = optimizer.minimize(
    loss=loss, global_step=tf.compat.v1.train.get_global_step())
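Note that in TF 2.x, tf.keras.optimizers.Adam.minimize() takes no global_step argument (its signature is minimize(loss, var_list, ...), with loss as a callable), so with the Estimator API the v1-compat optimizer is usually the simpler route. A hedged sketch:
    # Sketch (assuming TF 2.x with tf.estimator): the v1-compat optimizer
    # accepts a loss tensor and a global_step, as EstimatorSpec expects.
    optimizer = tf.compat.v1.train.AdamOptimizer()
    train_op = optimizer.minimize(
        loss=loss, global_step=tf.compat.v1.train.get_global_step())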

Why does my GraphDef implementation exceed the 2GB limit in TensorFlow?

I am using the tf.estimator API and I have the following model_fn function:
def model_fn(features, labels, mode, params):
    labels = tf.reshape(labels, (-1, 1))
    model = word2vec.create_model(features, params)
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=model)
    loss = sampled_softmax_loss.create_loss(model['softmax_w'],
                                            model['softmax_b'],
                                            model['relu_layer_1'],
                                            labels,
                                            params['softmax_sample'],
                                            params['vocabulary_size'])
    cost = tf.reduce_mean(loss)
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode=mode, loss=cost)
    optimizer = adam_optimizer.create_optimizer(params['learning_rate'])
    train_operation = optimizer.minimize(cost)
    if mode == tf.estimator.ModeKeys.TRAIN:
        return tf.estimator.EstimatorSpec(mode=mode, loss=cost, train_op=train_operation)
    raise RuntimeError('Not a valid Mode value')
The word2vec.create_model function is given below. It returns a Python dictionary with the interesting nodes of the network (e.g. the embeddings matrix, and the softmax weight and bias used in the sampled loss during training).
def create_model(features, hyper_params):
    model = {}
    vocabulary_size = hyper_params['vocabulary_size']
    hidden_size = hyper_params['hidden_size']
    feature_columns = hyper_params['feature_columns']
    with tf.variable_scope('word2vec'):
        # Creating the Embedding layer
        net = tf.feature_column.input_layer(features, feature_columns)
        # Creating the hidden layer
        net = dense_layer.create_layer(model, net, hidden_size)
        # Creating the SoftMax weight and bias variables to use in the sampled loss function
        softmax_w = tf.Variable(tf.truncated_normal((vocabulary_size, hidden_size), stddev=0.1), name='softmax_weights')
        softmax_b = tf.Variable(tf.zeros(vocabulary_size), name='softmax_bias')
        model['softmax_w'] = softmax_w
        model['softmax_b'] = softmax_b
    return model
Last but not least, my main function, which I run with the tf.app.run(main) command:
def main():
    path = os.path.join('data', 'data.csv')
    (train_x, train_y), (test_x, test_y) = prepare_data.load_data(path, path, columns, columns[-1])
    vocabulary_size = len(train_x[columns[0]].unique())
    feature_columns = []
    for key in train_x.keys():
        item_id = tf.feature_column.categorical_column_with_identity(key=key, num_buckets=vocabulary_size)
        feature_columns.append(tf.feature_column.embedding_column(item_id, 512))
    classifier = tf.estimator.Estimator(
        model_fn=model_fn,
        params={
            'feature_columns': feature_columns,
            'vocabulary_size': vocabulary_size,
            'hidden_size': 256,
            'learning_rate': 0.001,
            'softmax_sample': 100,
        })
    print('Training the classifier...')
    classifier.train(input_fn=lambda: prepare_data.train_input_fn(train_x, train_y, 128), steps=2)
    print('Evaluating on test dataset...')
    eval_result = classifier.evaluate(input_fn=lambda: prepare_data.eval_input_fn(test_x, test_y, 128))
    print('Printing results...')
    print(eval_result)
When I run this, I get a ValueError: GraphDef cannot be larger than 2GB. Why is that? What am I doing wrong?
Below is my train_input_fn:
def train_input_fn(features, labels, batch_size):
    def gen():
        for f, l in zip(features, labels):
            yield f, l
    ds = tf.data.Dataset.from_generator(gen, (tf.int64, tf.int64), (tf.TensorShape([None]), tf.TensorShape([None])))
    ds = ds.repeat().batch(batch_size)
    feature, label = ds.make_one_shot_iterator().get_next()
    return {"Input": feature}, label
The dataset is a simple CSV like below:
   Input  Label
0  12600    838
1  12600   4558
2  12600    838
3  12600   4558
4    838  12600
Dataset.from_tensor_slices adds the whole dataset to the computational graph (see details), so it is better to use Dataset.from_generator. I have shown an example of how to do this using MNIST: How to load MNIST via TensorFlow (including download)?
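For illustration, a minimal sketch of the difference (features_array and labels_array are placeholder NumPy arrays):
    # from_tensor_slices embeds the arrays as constants in the GraphDef,
    # which is what can push it past the 2GB limit:
    ds = tf.data.Dataset.from_tensor_slices((features_array, labels_array))

    # from_generator keeps the data in Python and streams it into the graph:
    ds = tf.data.Dataset.from_generator(
        lambda: zip(features_array, labels_array),
        output_types=(tf.int64, tf.int64),
        output_shapes=(tf.TensorShape([None]), tf.TensorShape([None])))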

How to get predictions from a tf.estimator trained CNN

I have trained a CNN using the tf.estimator API, but am having trouble getting predictions out in a way that is useful to me.
I need to feed images to my CNN in real time as they are received from a camera. In an older net design I made the Controller_tf class, which worked fine for doing that, so I have tried to adapt it to a new CNN trained using tf.estimator (as said earlier).
The estimator.predict interface seems to want to be invoked via a tf.app.run() call (I would be glad to be proved wrong about that), which is why I am trying to run the CNN using tf.Session() (with if statements inside the model function to only run the relevant parts), but I'm currently getting the error:
ValueError: Fetch argument 'infer' cannot be interpreted as a Tensor. ("The name 'infer' refers to an Operation not in the graph.")
I can't quite see where I am going wrong. Is the trained model incompatible with running in PREDICT mode? Any help will be very much appreciated. Anyway, here is the code:
class Controller_tf:
    set_speed = None

    def __init__(self, model, ckpt_path, set_speed_in):
        self.set_speed = set_speed_in
        self.x = tf.placeholder(tf.float32, shape=(None, 104, 160, 3))
        self.y = model(self.x, None, tf.estimator.ModeKeys.PREDICT)
        # make TF use memory growth method
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        saver = tf.train.Saver()
        saver.restore(self.sess, ckpt_path)

    def update(self, message):
        # The current speed of the car
        image = frame2numpy(message['frame'], (160, 104))
        image_array = np.asarray(image)
        turn_logits = self.sess.run(self.y, {self.x: image_array[None, :, :, :]})
        return turn_logits

model = cnn_model_fn3
ckpt = 'ckpts/stc_model3/model.ckpt-27621'
controller = Controller_tf(model, ckpt, 18)
image_file = 'G:/Datasets/ds072.001/ds072.001-fm-0008465.jpg'
#image_file = 'G:/Datasets/ds072.001/ds072.001-fm-0009156.jpg'
satnavimg = load_image(image_file)
satnavimg = np.asarray([satnavimg])
satnavimg = (satnavimg/127.5) - 1.0
print(np.shape(satnavimg))
msg = {'frame': satnavimg}
turn = controller.update(msg)
print(turn)
The model function is:
def cnn_model_fn3(features, labels, mode):
    if mode == tf.estimator.ModeKeys.PREDICT:
        input_layer = features
    else:
        input_layer = tf.reshape(features["image_data"], [-1, 104, 160, 3])
    conv1 = tf.layers.conv2d(
        inputs=input_layer,
        filters=32,
        kernel_size=[10, 10],
        padding="same",
        activation=tf.nn.relu,
        name='Conv1')
    ... removed layer code for brevity ...
    logits = tf.layers.dense(
        inputs=dropout1,
        units=3,
        name='Dense3')
    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    # Calculate Loss (for both TRAIN and EVAL modes)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
    # Add evaluation metrics (for EVAL mode)
    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = {
            "accuracy": tf.metrics.accuracy(
                labels=labels, predictions=predictions["classes"])}
    if mode == tf.estimator.ModeKeys.PREDICT:
        return logits
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
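For reference, estimator.predict() does not require a tf.app.run() call; it can be invoked directly with an input_fn. A minimal sketch (my illustration, assuming the cnn_model_fn3 and the preprocessed satnavimg batch above):
    estimator = tf.estimator.Estimator(model_fn=cnn_model_fn3, model_dir='ckpts/stc_model3')

    def predict_input_fn():
        # In PREDICT mode the model_fn above uses `features` directly as the input layer
        return tf.data.Dataset.from_tensors(tf.cast(satnavimg, tf.float32))

    for pred in estimator.predict(input_fn=predict_input_fn):
        print(pred['classes'], pred['probabilities'])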

How to get train loss and evaluate loss every global step in Tensorflow Estimator?

I can get the training loss at every global step, but I also want to add the evaluation loss to the 'lossxx' graph in TensorBoard. How can I do that?
class MyHook(tf.train.SessionRunHook):
    def after_run(self, run_context, run_value):
        _session = run_context.session
        _session.run(_session.graph.get_operation_by_name('acc_op'))

def my_model(features, labels, mode):
    ...
    logits = tf.layers.dense(net, 3, activation=None)
    predicted_classes = tf.argmax(logits, 1)
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'class': predicted_classes,
            'prob': tf.nn.softmax(logits)
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    # Compute loss.
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    acc, acc_op = tf.metrics.accuracy(labels=labels, predictions=predicted_classes)
    tf.identity(acc_op, 'acc_op')
    loss_sum = tf.summary.scalar('lossxx', loss)
    accuracy_sum = tf.summary.scalar('accuracyxx', acc)
    merg = tf.summary.merge_all()
    # Create training op.
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
        train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op,
                                          training_chief_hooks=[
                                              tf.train.SummarySaverHook(save_steps=10, output_dir='./model', summary_op=merg)])
    return tf.estimator.EstimatorSpec(
        mode, loss=loss, eval_metric_ops={'accuracy': (acc, acc_op)}
    )

classifier.train(input_fn=train_input_fn, steps=1000, hooks=[MyHook()])
You actually don't need to create a SummarySaverHook yourself, as one is already included in tf.estimator.Estimator. Just create all the summaries you want with tf.summary.xxx and they will all be evaluated every n steps (see tf.estimator.RunConfig for this).
Also, you don't need to create a summary for your final loss; it will be created for you automatically. If you do it like this, the training and evaluation summaries will be shown in the same graph on TensorBoard. The estimator creates a sub-directory eval in your current model_dir to achieve this.
And a small hint: use acc_op directly in summaries to update the metric and get its value at the same time, as sketched below. However, the tf.metrics functions are quite difficult to handle ;-)
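A one-line sketch of that hint (using the names from the question):
    acc, acc_op = tf.metrics.accuracy(labels=labels, predictions=predicted_classes)
    tf.summary.scalar('accuracyxx', acc_op)  # acc_op updates the running metric and returns its value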
You need to pass the evaluation data to the model alongside the training data by using tf.estimator.train_and_evaluate.
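A minimal sketch of that call (train_input_fn and eval_input_fn are placeholders; classifier is the estimator from the question):
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=1000)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, throttle_secs=60)
    tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)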

TensorFlow Estimator restoring all variables properly, but loss spikes up afterwards

I am using TensorFlow 1.2.1 on Windows 10 with the Estimator API. Everything runs without any errors, but whenever the parameters have to be restored from a checkpoint, some aspect of it doesn't work. I've checked that the value of every variable in classifier.get_variable_names() does not change after an evaluation, yet the loss spikes back up to near where it started, followed by continued learning, each time faster than the last.
This happens within a single TensorFlow run whenever a validation or evaluation run happens, and also when I rerun the Python file to continue training.
The following graphs show one example of this problem; they restore the variables every 2500 steps:
http://imgur.com/6q9Wuat
http://imgur.com/CQ2hdR8
The following code is a significantly reduced version of my code, which still replicates the error:
import tensorflow as tf
from tensorflow.contrib import learn
from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib

tf.logging.set_verbosity(tf.logging.INFO)
sess = tf.InteractiveSession()

def cnn_model_fn(features, labels, mode):
    dense_layer1 = tf.layers.dense(inputs=features, units=512, activation=tf.nn.relu, name="FC_1")
    dense_layer2 = tf.layers.dense(inputs=dense_layer1, units=1024, activation=tf.nn.relu, name="FC_2")
    dense_layer3 = tf.layers.dense(inputs=dense_layer2, units=2048, activation=tf.nn.relu, name="FC_3")
    dense_layer4 = tf.layers.dense(inputs=dense_layer3, units=512, activation=tf.nn.relu, name="FC_4")
    logits = tf.layers.dense(inputs=dense_layer4, units=2, name="logit_layer")
    loss = None
    train_op = None
    if mode != learn.ModeKeys.INFER:
        loss = tf.losses.softmax_cross_entropy(
            onehot_labels=labels, logits=logits)
    if mode == learn.ModeKeys.TRAIN:
        train_op = tf.contrib.layers.optimize_loss(
            loss=loss,
            global_step=tf.contrib.framework.get_global_step(),
            learning_rate=.001,
            optimizer="SGD")
    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(
            logits, name="softmax_tensor")}
    return model_fn_lib.ModelFnOps(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train_op)

def main(unused_param):
    def data_pipeline(filenames, batch_size, num_epochs=None, min_after_dequeue=10000):
        with tf.name_scope("data_pipeline"):
            filename_queue = tf.train.string_input_producer(filenames, num_epochs=num_epochs)
            reader = tf.TextLineReader()
            key, value = reader.read(filename_queue)
            row = tf.decode_csv(value, record_defaults=[[0.0] for _ in range(66)])
            example_op, label_op = tf.stack(row[:len(row)-2]), tf.stack(row[len(row)-2:])
            capacity = min_after_dequeue + 3 * batch_size
            example_batch, label_batch = tf.train.shuffle_batch(
                [example_op, label_op],
                batch_size=batch_size,
                capacity=capacity,
                min_after_dequeue=min_after_dequeue)
            return example_batch, label_batch

    def input_data_fn(data_getter_ops):
        batch, labels = sess.run(data_getter_ops)
        return tf.constant(batch, dtype=tf.float32), tf.constant(labels, dtype=tf.float32)

    NUM_EPOCHS = 6
    BATCHES_IN_TRAINING_EPOCH = 8000

    training_data_pipe_ops = data_pipeline(
        filenames=["train_data.csv"],
        batch_size=500,
        min_after_dequeue=10000)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    classifier = tf.contrib.learn.Estimator(
        model_fn=cnn_model_fn,
        model_dir="/tmp/bug_finder")

    for j in range(NUM_EPOCHS):
        classifier.fit(
            input_fn=lambda: input_data_fn(training_data_pipe_ops),
            steps=BATCHES_IN_TRAINING_EPOCH)
        print("Epoch", str(j+1), "training completed.")

    coord.request_stop()
    coord.join(threads)

if __name__ == "__main__":
    tf.app.run()
I figured out the issue: I was creating the data pipelines with the interactive session I had created, and then having my input function evaluate the examples (like a feed dictionary). The reason this is a problem is that the Estimator class creates its own session (a MonitoredTrainingSession), and since the graph operations weren't being created from within a call made by the Estimator class (and thus with its session), they were not being saved. Using an input function to create the graph operations, ending with the final graph operation (the batching), has resulted in everything working smoothly.
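A minimal sketch of that fix (reusing the data_pipeline() helper from the question): build the pipeline inside the input function, so its operations live in the Estimator's own graph and MonitoredTrainingSession:
    def train_input_fn():
        # Built here, these ops belong to the graph the Estimator creates,
        # so they are saved and restored together with the model.
        example_batch, label_batch = data_pipeline(
            filenames=["train_data.csv"],
            batch_size=500,
            min_after_dequeue=10000)
        return example_batch, label_batch

    classifier.fit(input_fn=train_input_fn, steps=BATCHES_IN_TRAINING_EPOCH)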