TensorBoard scalar summaries are single data points. How to fix? - tensorflow

I am logging using the tf.summary.scalar method AND with tf.train.LoggingTensorHook for some tensors. This is with the tf.estimator.Estimator framework.
The tf.train.LoggingTensorHook stuff is not even showing up AFAIK. The other stuff is showing but apparently without time steps.
Graphs and everything else (weights) look ok in tensorboard.
UPDATE: it looks like calling train multiple times results in a graph. Is there something about steps and every_n_iter that do not interact as expected?
import numpy as np
import tensorflow as tf
m = 10000
n = 5
X = np.random.randn(m, n)
A = np.random.randn(n)
y = X.dot(A) + np.random.randn(m) * 0.1
batch_size = 1024
def input_fn(batch_size):
ds = tf.data.Dataset.from_tensor_slices(dict(X=X, y=y))
ds = ds.repeat(-1)
ds = ds.batch(batch_size)
return ds
def model_fn(features, labels, mode, params):
X = features['X']
y = features['y']
l = X
for i, k in enumerate([32, 16, 16]):
l = tf.layers.dense(inputs=l, units=k, name=f'l_{i}', activation=tf.nn.tanh)
some_thing = tf.reduce_sum(l, axis=1, name='some_thing')
l = tf.layers.dense(inputs=l, units=1, name='l_final')
predictions = tf.squeeze(l, axis=-1)
loss = tf.losses.mean_squared_error(y, predictions, weights=1.0)
metric_ops = {"mse": tf.metrics.mean_squared_error(labels=y, predictions=predictions)}
if mode == tf.estimator.ModeKeys.TRAIN:
optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op, eval_metric_ops=metric_ops)
if mode == tf.estimator.ModeKeys.PREDICT:
predictions = {}
return tf.estimator.EstimatorSpec(mode, predictions=predictions)
if mode == tf.estimator.ModeKeys.EVAL:
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=metric_ops)
raise Exception('should not hit this')
model = tf.estimator.Estimator(
model_fn=model_fn,
model_dir='/tmp/junk',
config=None,
params=dict(),
warm_start_from=None
)
tensors_to_log = dict(some_thing='some_thing')
logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=10)
train_input_fn = lambda: input_fn(batch_size)
test_input_fn = lambda: input_fn(batch_size)
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, hooks=[logging_hook], max_steps=100)
eval_spec = tf.estimator.EvalSpec(input_fn=test_input_fn, hooks=[logging_hook])
out = tf.estimator.train_and_evaluate(model, train_spec, eval_spec)
UPDATE: This one does not show in tensorboard until the end of the run and then it only shows one point too.
import numpy as np
import tensorflow as tf
# tf.enable_eager_execution()
tf.logging.set_verbosity(tf.logging.INFO)
m = 10000
n = 5
X = np.random.randn(m, n)
A = np.random.randn(n)
y = X.dot(A) + np.random.randn(m) * 0.1
steps = 1000
batch_size = 1024
def input_fn(repeat, batch_size):
ds = tf.data.Dataset.from_tensor_slices(dict(X=X, y=y))
ds = ds.repeat(repeat)
ds = ds.batch(batch_size)
return ds
def model_fn(features, labels, mode, params):
X = features['X']
y = features['y']
l = X
for i, k in enumerate([32, 16, 16]):
l = tf.layers.dense(inputs=l, units=k, name=f'l_{i}', activation=tf.nn.tanh)
some_thing = tf.reduce_sum(l, axis=1, name='some_thing')
l = tf.layers.dense(inputs=l, units=1, name='l_final')
predictions = tf.squeeze(l, axis=-1)
loss = tf.losses.mean_squared_error(y, predictions, weights=1.0)
metric_ops = {"mse": tf.metrics.mean_squared_error(labels=y, predictions=predictions)}
tf.summary.scalar('summary_loss', loss) # plot a dist across the batch
if mode == tf.estimator.ModeKeys.TRAIN:
optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op, eval_metric_ops=metric_ops)
if mode == tf.estimator.ModeKeys.PREDICT:
predictions = {}
return tf.estimator.EstimatorSpec(mode, predictions=predictions)
if mode == tf.estimator.ModeKeys.EVAL:
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=metric_ops)
raise Exception('should not hit this')
model = tf.estimator.Estimator(
model_fn=model_fn,
model_dir='/tmp/junk',
config=None,
params=dict(),
warm_start_from=None
)
tensors_to_log = dict(some_thing='some_thing')
logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=10)
train_input_fn = lambda: input_fn(steps, batch_size)
test_input_fn = lambda: input_fn(steps, batch_size)
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, hooks=[logging_hook], max_steps=None)
eval_spec = tf.estimator.EvalSpec(input_fn=test_input_fn, hooks=[logging_hook])
out = tf.estimator.train_and_evaluate(model, train_spec, eval_spec)

I ran into a very similar problem. and I was surprised that the solution is very simple. Close TensorBoard and initiated again, and wait for a few minutes. it takes time to catch up. For some reason, if you initiate TensorBoard during the training it will stuck. I hope this will help.
I was running the code on google cloud
from google.datalab.ml import TensorBoard
TensorBoard().start('gs://{}/directoy_where_my_models_are'.format(BUCKET))

Related

The method to use gradient accumulate in BERT finetune

I was doing a Bert finetune and I had OOM issues. I heard a good method to handle this is to use "gradient accumulate". Below are my optimization.py(include the gradient accumulate)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import tensorflow as tf
from tensorflow.python.training import optimizer
from tensorflow.python.framework import ops
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
"""Creates an optimizer training op."""
global_step = tf.train.get_or_create_global_step()
learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
# Implements linear decay of the learning rate.
learning_rate = tf.train.polynomial_decay(
learning_rate,
global_step,
num_train_steps,
end_learning_rate=0.0,
power=1.0,
cycle=False)
# Implements linear warmup. I.e., if global_step < num_warmup_steps, the
# learning rate will be `global_step/num_warmup_steps * init_lr`.
if num_warmup_steps:
global_steps_int = tf.cast(global_step, tf.int32)
warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
global_steps_float = tf.cast(global_steps_int, tf.float32)
warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
warmup_percent_done = global_steps_float / warmup_steps_float
warmup_learning_rate = init_lr * warmup_percent_done
is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
learning_rate = (
(1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
# It is recommended that you use this optimizer for fine tuning, since this
# is how the model was trained (note that the Adam m/v variables are NOT
# loaded from init_checkpoint.)
optimizer = MultistepAdamWeightDecayOptimizer(
learning_rate=learning_rate,
weight_decay_rate=0.01,
beta_1=0.9,
beta_2=0.999, # 0.98 ONLY USED FOR PRETRAIN. MUST CHANGE AT FINE-TUNING 0.999,
epsilon=1e-6,
exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
if use_tpu:
optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
tvars = tf.trainable_variables()
grads = tf.gradients(loss, tvars)
# This is how the model was pre-trained.
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
train_op = optimizer.apply_gradients(
zip(grads, tvars), global_step=global_step)
# Normally the global step update is done inside of `apply_gradients`.
# However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
# a different optimizer, you should probably take this line out.
new_global_step = global_step + 1
train_op = tf.group(train_op, [global_step.assign(new_global_step)])
return train_op
class MultistepAdamWeightDecayOptimizer(optimizer.Optimizer):
"""A basic Adam optimizer that includes "correct" L2 weight decay."""
def __init__(self,
learning_rate,
weight_decay_rate=0.0,
beta_1=0.9,
beta_2=0.999,
n = 1,
epsilon=1e-6,
exclude_from_weight_decay=None,
name="MultistepAdamWeightDecayOptimizer"):
"""Constructs a AdamWeightDecayOptimizer."""
super(MultistepAdamWeightDecayOptimizer, self).__init__(False, name)
self.learning_rate = learning_rate
self.weight_decay_rate = weight_decay_rate
self.beta_1 = beta_1
self.beta_2 = beta_2
self.epsilon = epsilon
self._n = n
self.exclude_from_weight_decay = exclude_from_weight_decay
self._n_t = None
def _prepare(self):
super(MultistepAdamWeightDecayOptimizer, self)._prepare()
self._n_t=tf.convert_to_tensor(self._n, name="n")
def _create_slots(self,var_list):
super(MultistepAdamWeightDecayOptimizer, self)._create_slots(var_list)
first_var = min(var_list, key=lambda x: x.name)
self._create_non_slot_variable(initial_value=0 if self._n == 1 else 1,
name="iter",
colocate_with=first_var)
for v in var_list:
self._zeros_slot(v,"grad_acc",self._name)
def _get_iter_variable(self):
if tf.contrib.eager.in_eager_mode():
graph = None
else:
graph = tf.get_default_graph()
return self._get_non_slot_variable("iter", graph=graph)
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
"""See base class."""
update_ops = []
var_list = [v for g, v in grads_and_vars if g is not None]
with ops.init_scope():
self._create_slots(var_list)
self._prepare()
for(grad, param) in grads_and_vars:
if grad is None or param is None:
continue
grad_acc = self.get_slot(param, "grad_acc")
param_name = self._get_variable_name(params.name)
m = tf.get_variable(name=param_name + "/adam_m", shape=param.shape.as_list(),
dtype=tf.float32,trainable=False, initializer=tf.zeros_initializer())
v = tf.get_variable(name =param_name + "/adam_v", shape=param.sahpe.as_list(),
dtype=tf.float32, trainable=False, initializer=tf.zeros_initializer())
def _apply_adam(grad_acc, grad, param, m, v):
total_grad = (grad_acc + grad) / tf.cast(self._n_t, grad.dtype)
next_m = (
tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, total_grad))
next_v = (
tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
tf.square(total_grad)))
update = next_m / (tf.sqrt(next_v) + self.epsilon)
if self._do_use_weight_decay(param_name):
update += self.weight_decay_rate * param
update_with_lr =self.learning_rate * update
next_param = param - update_with_lr
adam_op = tf.group(param.assign(next_param), m.assign(next_m),
v.assign(next_v))
with tf.control_dependencies([adam_op]):
grad_acc_to_zero_op = grad_acc.assign(tf.zero_like(grad_acc), use_locking=self._use_locking)
return tf.group(adam_op, grad_acc_to_zero_op)
def _accumulate_gradient(grad_acc, grad):
assign_up = tf.assign_add(grad_acc, grad, use_locking=self._use_locking)
return tf.group(assign_op)
update_op = tf.cond(tf.equal(self._get_iter_variable(),0),
lambda: _apply_adam(grad_acc, grad, param,m, v),
lambda: _accumulate_gradient(grad_acc, grad))
update_ops.append(update_op)
apply_updates = self._finish(update_ops, name_scope=name)
return apply_updates
def _finish(self, update_ops, name_scope):
iter_=self._get_iter_variable()
with tf.control_dependencies(update_ops):
with tf.colocate_with(iter_):
update_iter = iter_.assign(tf.mod(iter_+1, self._n_t),
use_locking=self._use_locking)
return tf.group(
*update_ops + [update_iter], name=name_scope)
def _do_use_weight_decay(self, param_name):
"""Whether to use L2 weight decay for `param_name`."""
if not self.weight_decay_rate:
return False
if self.exclude_from_weight_decay:
for r in self.exclude_from_weight_decay:
if re.search(r, param_name) is not None:
return False
return True
def _get_variable_name(self, param_name):
"""Get the variable name from the tensor name."""
m = re.match("^(.*):\\d+$", param_name)
if m is not None:
param_name = m.group(1)
return param_name
After I used this optimization.py, i could use large batch. But loss did not decrease and after 300 steps(i got 550000 training data, batch size 64, iteration 1000 and epoch 20), it said: train loop marked as finished and stopped.
I am not sure what problem is, could you please help me out? thanks.

When i use estimator with same checkpoints to predict same file for several times, the predicted result varies

I'm using textcnn model in estimator to classify some text. After i train the model, the trained model was stored in the form of checkpoints. But when i try to predict the same test file with same checkpoints,the predicted result(porbalility and logits) varies slightly.
I have set the dropout_keep_prob=1 in dropout layer
checkpoints and test file remain the same one.
I have used the LoggingTensorHook to check the tensor values during the predict, two values begin to vary at the max_pool step(at least the conv values are same but i am not sure)
import tensorflow as tf
def line_parser(line, vocab):
def parse_content(record):
items = record.decode().strip().split()
cat = int(items[-1])
tokens = items[:-1]
token_length = len(tokens)
if token_length > FLAGS.max_sequence_length:
tokens = tokens[:FLAGS.max_sequence_length]
if token_length < FLAGS.max_sequence_length:
tokens += [FLAGS.pad_word]*(FLAGS.max_sequence_length-token_length)
return [tokens, cat]
result = tf.py_func(parse_content, [line], [tf.string, tf.int64])
ids = vocab.lookup(result[0])
ids = tf.cast(ids, tf.int64)
ids = tf.reshape(ids, [FLAGS.max_sequence_length])
label = tf.one_hot(result[1], FLAGS.num_classes, dtype=tf.int32)
return [ids, label]
def predict_line_parser(line, vocab):
def parse_content(record):
feature = record.decode().strip()
tokens = feature.split()
token_length = len(tokens)
if token_length > FLAGS.max_sequence_length:
tokens = tokens[:FLAGS.max_sequence_length]
if token_length < FLAGS.max_sequence_length:
tokens += [FLAGS.pad_word]*(FLAGS.max_sequence_length-token_length)
return [tokens]
result = tf.py_func(parse_content, [line], [tf.string])
ids = vocab.lookup(result[0])
ids = tf.cast(ids, tf.int64)
ids = tf.reshape(ids, [FLAGS.max_sequence_length])
return ids
def train_input_fn(file_paths, batch_size):
vocab = tf.contrib.lookup.index_table_from_file(FLAGS.vocab_path)
dataset = tf.data.TextLineDataset(file_paths)
dataset = dataset.map(lambda line: line_parser(line, vocab))
dataset = dataset.shuffle(1000)
dataset = dataset.batch(batch_size).repeat()
return dataset
def eval_input_fn(file_paths, batch_size):
vocab = tf.contrib.lookup.index_table_from_file(FLAGS.vocab_path)
dataset = tf.data.TextLineDataset(file_paths)
dataset = dataset.map(lambda line: line_parser(line, vocab))
dataset = dataset.batch(batch_size=batch_size)
return dataset
def predict_input_fn(file_paths, batch_size):
vocab = tf.contrib.lookup.index_table_from_file(FLAGS.vocab_path)
dataset = tf.data.TextLineDataset(file_paths)
dataset = dataset.map(lambda line:predict_line_parser(line, vocab))
dataset = dataset.batch(batch_size=batch_size)
return dataset
def create_model(features, params):
# projection from sentence with id to embedding
embedding_inputs = tf.nn.embedding_lookup(params["embedding"], features)
embedding_inputs = tf.expand_dims(embedding_inputs, axis=-1)
l2_loss = tf.constant(0.0, name="l2_loss", dtype="float64")
# convolutional layer and pooling layer
pooled_outputs = list()
for i, filter_size in enumerate(params["filter_sizes"]):
with tf.name_scope("conv_{}".format(filter_size)):
filter_shape = [filter_size, params["embedding_size"], 1, params["num_filters"]]
W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1, dtype="float64"), name="W")
b = tf.Variable(tf.constant(0.1, shape=[params["num_filters"]], dtype="float64"), name="b")
conv = tf.nn.conv2d(embedding_inputs, W, strides=[1, 1, 1, 1], padding="VALID", use_cudnn_on_gpu=True,
name="conv".format(filter_size))
h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu".format(filter_size))
pooled = tf.nn.max_pool(
h,
ksize=[1, params["sequence_length"] - filter_size + 1, 1, 1],
strides=[1, 1, 1, 1],
padding="VALID",
name="pool".format(filter_size)
)
pooled_outputs.append(pooled)
# concatenate all feature vector
number_filters_total = params["num_filters"] * len(params["filter_sizes"])
h_pool = tf.concat(pooled_outputs, 3)
h_pool_flat = tf.reshape(h_pool, [-1, number_filters_total])
# dropout
with tf.name_scope("dropout"):
# h_drop = tf.nn.dropout(h_pool_flat, params["dropout_keep_prob"])
h_drop = tf.nn.dropout(h_pool_flat, 1)
# fully connected layer
with tf.name_scope("output"):
W = tf.Variable(
tf.truncated_normal(shape=[number_filters_total, params["num_classes"]], stddev=0.1, dtype="float64"),
name="W")
b = tf.Variable(tf.constant(0.1, shape=[params["num_classes"]], dtype="float64"), name="b")
l2_loss += tf.nn.l2_loss(W)
l2_loss += tf.nn.l2_loss(b)
logits = tf.nn.xw_plus_b(h_drop, W, b, name="scores")
return logits, l2_loss
def model_fn_builder():
def text_cnn_model_fn(features, labels, mode, params):
logits, l2_loss = create_model(features, params)
# train mode branch
if mode == tf.estimator.ModeKeys.TRAIN:
# loss
with tf.name_scope("loss"):
losses = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits)
loss = tf.reduce_mean(losses) + params["l2_reg_lambda"] * l2_loss
# optimizer function
with tf.name_scope("optimizer"):
optimizer = tf.train.AdamOptimizer(params["learning_rate"])
grads_and_vars = optimizer.compute_gradients(loss)
train_op = optimizer.apply_gradients(grads_and_vars, global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
# eval mode branch
if mode == tf.estimator.ModeKeys.EVAL:
# loss
with tf.name_scope("loss"):
losses = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits)
loss = tf.reduce_mean(losses) + params["l2_reg_lambda"] * l2_loss
# predictions
with tf.name_scope("prediction"):
probability = tf.nn.softmax(logits, axis=1, name="probability")
pred = tf.argmax(probability, axis=1, name="predictions")
# metrics
with tf.name_scope("metrics"):
accuracy = tf.metrics.accuracy(labels=tf.argmax(labels, axis=1), predictions=pred)
precision = tf.metrics.precision(labels=tf.argmax(labels, axis=1), predictions=pred)
recall = tf.metrics.recall(labels=tf.argmax(labels, axis=1), predictions=pred)
tf.summary.scalar("accuracy", accuracy[1])
tf.summary.scalar("precision", precision[1])
tf.summary.scalar("recall", recall[1])
tf.summary.scalar("loss", loss)
metrics = {"accuracy": accuracy, "precision": precision, "recall": recall}
metric_hook = tf.train.LoggingTensorHook(
{"f1-score": 2 * precision[1] * recall[1] / (precision[1] + recall[1]), "precision": precision[1],
"recall": recall[1]}, every_n_iter=100)
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=metrics,
evaluation_hooks=[metric_hook])
# predict mode branch
if mode == tf.estimator.ModeKeys.PREDICT:
# predictions
with tf.name_scope("prediction"):
probability = tf.nn.softmax(logits, axis=1, name="probability")
pred = tf.argmax(probability, axis=1, name="predictions")
predictions = {
"class": pred,
"probability": probability,
}
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
return text_cnn_model_fn
I expect the same output of twice predictions,but it varies like the following:
first time
0\0.02336916147480053
0\0.29461604884471243
0\0.04555523004833724
1\0.5450933830551228
0\0.042727966035733034
0\0.032764190484837884
0\0.11542703615898613
0\0.12662708812885717
0\0.01605587344580832
0\0.006454832043875243
second time
0\0.03389085341620636
0\0.31563690653966603
0\0.06185060165562852
1\0.5891016184323346
0\0.07184752629327144
0\0.04355442431024522
0\0.16290306166502935
0\0.17214872864042816
0\0.02437323886282706
0\0.0109889405648392
Autually,i figured out this problem. The variety is resulted by word embedding vectors which are generated randomly every time.

Op type not registered HashTableV2 in Tensorflow 1.4.1 while deploying in Cloud ML

When we deploying the model to cloud ml we are getting Bad model Op type not registered HashTableV2
Code:
def model_fn(features, labels, mode):
if mode == tf.estimator.ModeKeys.TRAIN:
tf.keras.backend.set_learning_phase(True)
else:
tf.keras.backend.set_learning_phase(False)
input_feature = features['x']
table = lookup.index_table_from_file(vocabulary_file='vocab.txt', num_oov_buckets=1, default_value=-1)
text = tf.squeeze(input_feature, [1])
words = tf.string_split(text)
dense_words = tf.sparse_tensor_to_dense(words, default_value=PADWORD)
numbers = table.lookup(dense_words)
padding = tf.constant([[0, 0], [0, MAX_LEN]])
padded = tf.pad(numbers, padding)
sliced = tf.slice(padded, [0, 0], [-1, MAX_LEN])
print('words_sliced={}'.format(words))
embeds = tf.keras.layers.Embedding(MAX_FEATURES+1, 128, input_length=MAX_LEN)(sliced)
print('words_embed={}'.format(embeds))
f1 = tf.keras.layers.Dropout(0.2)(embeds)
f1 = tf.keras.layers.Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1)(f1)
f1 = tf.keras.layers.GlobalAveragePooling1D()(f1)
f1 = tf.keras.layers.Dense(hidden_dims)(f1)
f1 = tf.keras.layers.Dropout(0.5)(f1)
f1 = tf.keras.layers.Activation('relu')(f1)
logits = tf.keras.layers.Dense(11)(f1)
predictions_dict = {
'class': tf.argmax(logits, 1),
'prob': tf.nn.softmax(logits)
}
'''prediction_output = tf.estimator.export.PredictOutput({"classes": tf.argmax(input=logits, axis=1),
"probabilities": tf.nn.softmax(logits,
name="softmax_tensor")})'''
if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions_dict, export_outputs={
'prediction': tf.estimator.export.PredictOutput(predictions_dict)
})
loss = tf.losses.sparse_softmax_cross_entropy(labels, logits=logits)
if mode == tf.contrib.learn.ModeKeys.TRAIN:
train_op = tf.contrib.layers.optimize_loss(loss, tf.contrib.framework.get_global_step(), optimizer='Adam',
learning_rate=0.001)
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
eval_metrics_ops = {
'accuracy': tf.metrics.accuracy(labels=labels, predictions=predictions_dict['class']),
'precision': tf.metrics.precision(labels=labels, predictions=predictions_dict['class']),
'recall': tf.metrics.recall(labels=labels, predictions=predictions_dict['class'])
}
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metrics_ops)
def get_train_record(record):
vector = tf.decode_csv(record, DEFAULTS, use_quote_delim=True)
return vector[1:], vector[0]
def preprocess(text):
text = text.lower()
result = ' '.join([word for word in text.split() if word not in (stop_words)])
return result
def build_vocab(file_name, vocab_file_name):
df = pd.read_csv(file_name, header=None, sep=',', skiprows=[1], names=['product', 'consumer_complaint_narrative'])
df['consumer_complaint_narrative'] = df['consumer_complaint_narrative'].apply(preprocess)
print(df['consumer_complaint_narrative'][0])
vocab_processor = tflearn.preprocessing.VocabularyProcessor(max_document_length=MAX_FEATURES, min_frequency=10,
tokenizer_fn=tflearn.preprocessing.tokenizer)
vocab_processor.fit(df['consumer_complaint_narrative'])
with gfile.Open(vocab_file_name, 'wb') as f:
f.write("{}\n".format(PADWORD))
for word, index in vocab_processor.vocabulary_._mapping.items():
f.write("{}\n".format(word))
nwords = len(vocab_processor.vocabulary_)
print('{} words into {}'.format(nwords, vocab_file_name))
def input_fn(file_name, batch_size, repeat_count, shuffle=False):
def _input_fn():
data_set = tf.data.TextLineDataset(filenames=file_name)
data_set = data_set.map(get_train_record)
if shuffle:
data_set = data_set.shuffle(shuffle)
data_set = data_set.repeat(repeat_count)
batch = data_set.batch(batch_size)
iterator = batch.make_one_shot_iterator()
features, labels = iterator.get_next()
return {'x': features}, labels
return _input_fn()
def get_train_spec(file_name, batch_size, repeat_count):
return tf.estimator.TrainSpec(input_fn=lambda: input_fn(file_name, batch_size, repeat_count, shuffle=True), max_steps=1000)
def get_test_spec(file_name, batch_size, repeat_count=1):
return tf.estimator.EvalSpec(input_fn=lambda: input_fn(file_name, batch_size, repeat_count, shuffle=True))
def serving_input_fn():
feature_tensor = tf.placeholder(tf.string, [None])
# features = tf.py_func(preprocess, [feature_tensor], tf.string)
features = tf.expand_dims(feature_tensor, -1)
return tf.estimator.export.ServingInputReceiver({'x': features}, {'x': features})
finance_classifier = tf.estimator.Estimator(model_fn=model_fn, model_dir=model_dir)
print('\n Training .....')
finance_classifier.train(input_fn=lambda: input_fn('dataset/train.csv', batch_size, repeat_count=5, shuffle=True))
print('\n Evaluating.....')
eval_results = finance_classifier.evaluate(input_fn=lambda: input_fn('dataset/valid.csv', batch_size, repeat_count=1,
shuffle=False))
for key in eval_results:
print(" {} was {}".format(key, eval_results[key]))
print('\n Exporting')
exported_model_dir = finance_classifier.export_savedmodel(model_dir, serving_input_receiver_fn=serving_input_fn)
decoded_model_dir = exported_model_dir.decode("utf-8")
Screenshot
One important thing to mention here is when I tried with Tensorflow 1.2 with some changes in the code in model_fn. Basically not using tf.keras but using tf.contrib.keras it was working.
The model which was exported in Tensorflow 1.2 works fine. Is it a bug in Tensorflow 1.4 ? How can we fix this error ?
Already created a gihub issue in Tensorflow Repo
The ML Engine supports TensorFlow 1.4 but the default version is TensorFlow 1.2. You can specify that you want 1.4 by adding the following code to your project's setup.py module:
REQUIRED_PACKAGES = ['tensorflow>=1.4']
setup(
...
install_requires=REQUIRED_PACKAGES,
...
)
You can see the full list of supported packages and versions here.
We can solve this problem by setting --runtime-version=1.4.
Use the following command when deploying model in cloud ml.
MODEL_BINARIES=$(gsutil ls gs://${BUCKET}/models/${MODEL_NAME}/export/)
gcloud ml-engine versions create ${MODEL_VERSION} --model=${MODEL_NAME} --origin=${MODEL_BINARIES} --runtime-version=1.4

How to create tf.RunMetadata and add it to writer when using tf.contrib.learn Module

Now I use tf.contrib.learn.Experiment, Estimator, learn_runner to help training the model. When run the learn_runner, it will implicitly create a tf.MoniteredSession and call its run() function, so I cannot add arguments options and run_metadata to run() function.
So how can I add options and run_metadata args to run function and call summary_writer.add_run_metadata()?
I am searching for a long time on net. But no use. Please help or try to give some ideas how to achieve this.
this is the code:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
from tensorflow.contrib import slim, training, learn
tf.logging.set_verbosity(tf.logging.DEBUG)
def variable_summaries(var):
with tf.name_scope(var.name.split(':')[0]):
mean = tf.reduce_mean(var)
with tf.name_scope('stddev'):
stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
tf.add_to_collection('variable_summaries', tf.summary.scalar('mean', mean))
tf.add_to_collection('variable_summaries', tf.summary.scalar('stddev', stddev))
tf.add_to_collection('variable_summaries', tf.summary.scalar('max', tf.reduce_max(var)))
tf.add_to_collection('variable_summaries', tf.summary.scalar('min', tf.reduce_min(var)))
tf.add_to_collection('variable_summaries', tf.summary.histogram('histogram', var))
def model_fn(features, labels, mode, params):
id_ts = tf.get_collection('id_ts')[0]
fc1 = slim.fully_connected(features, 10, tf.nn.relu, scope='fc1')
variable_summaries(fc1)
fc2 = slim.fully_connected(fc1, 2, None, scope='fc2')
variable_summaries(fc2)
for i in tf.trainable_variables():
variable_summaries(i)
logits = fc2
prob = tf.nn.softmax(logits)
predictions = tf.argmax(logits, axis=1)
summay_op = tf.summary.merge_all('variable_summaries')
scaffold = tf.train.Scaffold(summary_op=summay_op)
if mode == learn.ModeKeys.TRAIN or mode == learn.ModeKeys.EVAL:
onehot_labels = slim.one_hot_encoding(labels, 2)
loss = tf.losses.softmax_cross_entropy(logits=logits, onehot_labels=onehot_labels)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
train_op = optimizer.minimize(loss, slim.get_global_step())
eval_metric_ops = {
'accuracy': tf.metrics.accuracy(labels, predictions),
'auc': tf.metrics.auc(labels, predictions),
'precision': tf.metrics.precision(labels, predictions),
'recall': tf.metrics.recall(labels, predictions),
}
return learn.ModelFnOps(mode=mode,
predictions=predictions,
loss=loss,
train_op=train_op,
eval_metric_ops=eval_metric_ops,
scaffold=scaffold)
elif mode == learn.ModeKeys.INFER:
return learn.ModelFnOps(mode=mode, predictions={'prob': prob,
'fc1': fc1,
'fc2': fc2,
'id': id_ts})
def train_input_fn():
fn = tf.train.string_input_producer(['data.csv'])
reader = tf.TextLineReader()
key, value = reader.read(fn)
data_ts = tf.decode_csv(value, [[0.], [0.], [0.], [0.]], field_delim=',')
batch_ts = tf.train.shuffle_batch(data_ts, 10, 1000, 10)
id_ts = batch_ts[0]
tf.add_to_collection('id_ts', id_ts)
features_ts = tf.concat([tf.reshape(batch_ts[1], [-1, 1]), tf.reshape(batch_ts[2], [-1, 1])], axis=1)
labels_ts = tf.cast(batch_ts[3], tf.int32)
return features_ts, labels_ts
def eval_input_fn():
fn = tf.train.string_input_producer(['data.csv'])
reader = tf.TextLineReader()
key, value = reader.read(fn)
data_ts = tf.decode_csv(value, [[0.], [0.], [0.], [0.]], field_delim=',')
batch_ts = tf.train.batch(data_ts, 10, 1000)
id_ts = batch_ts[0]
tf.add_to_collection('id_ts', id_ts)
features_ts = tf.concat([tf.reshape(batch_ts[1], [-1, 1]), tf.reshape(batch_ts[2], [-1, 1])], axis=1)
labels_ts = tf.cast(batch_ts[3], tf.int32)
return features_ts, labels_ts
def run_experiment(_):
session_config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True),
log_device_placement=False)
run_config = learn.RunConfig(save_checkpoints_steps=100,
model_dir='model_dir',
session_config=session_config,
keep_checkpoint_max=2)
hparams = training.HParams(train_steps=1000)
learn.learn_runner.run(experiment_fn=create_experiment_fn,
schedule='train_and_evaluate',
run_config=run_config,
hparams=hparams)
def create_experiment_fn(run_config, hparams):
estimator = get_estimator_fn(config=run_config, params=hparams)
return learn.Experiment(estimator=estimator,
train_input_fn=train_input_fn,
eval_input_fn=eval_input_fn,
train_steps=hparams.train_steps)
def get_estimator_fn(config, params):
return learn.Estimator(model_fn=model_fn,
model_dir=config.model_dir,
config=config,
params=params)
if __name__ == '__main__':
tf.app.run(main=run_experiment)

TensorFlow: loss jumps up after restoring RNN net

Environment info
Operating System: Windows 7 64-bit
Tensorflow installed from pre-built pip (no CUDA): 1.0.1
Python 3.5.2 64-bit
Problem
I have problems with restoring my net (RNN character base language model). Below is a simplified version with the same problem.
When I run it the first time, I get, for example, this.
...
step 160: loss = 1.956 (perplexity = 7.069016620211226)
step 180: loss = 1.837 (perplexity = 6.274748642468816)
step 200: loss = 1.825 (perplexity = 6.202084762557817)
But on the second run, after restoring parameters, I get this.
step 220: loss = 2.346 (perplexity = 10.446611983898903)
step 240: loss = 2.346 (perplexity = 10.446709120339545)
...
All the tf variables seem to be correctly restored, including the state, which will be fed to RNN.
Data position is also restored (from 'step').
I also made a similar program for MNIST recognition model, and this one works fine: the losses before and after the restoring are continuous.
Are there any other parameters or states that should be saved and restored?
import argparse
import os
import tensorflow as tf
import numpy as np
import math
B = 20 # batch size
H = 200 # size of hidden layer of neurons
T = 25 # number of time steps to unroll the RNN for
data_file = 'ptb.train.txt' # any plain text file will do
checkpoint_dir = "tmp"
#----------------
# prepare data
#----------------
data = open(data_file, 'r').read()
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has {0} characters, {1} unique.'.format(data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }
input_index_raw = np.array([char_to_ix[ch] for ch in data])
input_index_raw = input_index_raw[0:len(input_index_raw) // T * T]
input_index_raw_shift = np.append(input_index_raw[1:], input_index_raw[0])
input_all = input_index_raw.reshape([-1, T])
target_all = input_index_raw_shift.reshape([-1, T])
num_packed_data = len(input_all)
#----------------
# build model
#----------------
class Model(object):
def __init__(self):
self.input_ph = tf.placeholder(tf.int32, [None, T], name="input_ph")
self.target_ph = tf.placeholder(tf.int32, [None, T], name="target_ph")
embedding = tf.get_variable("embedding", [vocab_size, H], initializer=tf.random_normal_initializer(), dtype=tf.float32)
# input_ph is B x T.
# input_embedded is B x T x H.
input_embedded = tf.nn.embedding_lookup(embedding, self.input_ph)
cell = tf.contrib.rnn.BasicRNNCell(H)
self.state_ph = tf.placeholder(tf.float32, (None, cell.state_size), name="state_ph")
# Make state variable so that it will be saved by the saver.
self.state = tf.get_variable("state", (B, cell.state_size), initializer=tf.zeros_initializer(), trainable=False, dtype=tf.float32)
# Construct initial_state according to whether restoring or not.
self.isRestore = tf.placeholder(tf.bool, shape=(), name="isRestore")
zero_state = cell.zero_state(B, dtype=tf.float32)
self.initial_state = tf.cond(self.isRestore, lambda: self.state, lambda: zero_state)
# input_embedded : B x T x H
# output: B x T x H
# state : B x cell.state_size
output, state_ = tf.nn.dynamic_rnn(cell, input_embedded, initial_state=self.state_ph)
self.final_state = tf.assign(self.state, state_)
# reshape to (B * T) x H.
output_flat = tf.reshape(output, [-1, H])
# Convert hidden layer's output to vector of logits for each vocabulary.
softmax_w = tf.get_variable("softmax_w", [H, vocab_size], dtype=tf.float32)
softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=tf.float32)
logits = tf.matmul(output_flat, softmax_w) + softmax_b
# cross_entropy is a vector of length B * T
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.reshape(self.target_ph, [-1]), logits=logits)
self.loss = tf.reduce_mean(cross_entropy)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
self.global_step = tf.get_variable("global_step", (), initializer=tf.zeros_initializer(), trainable=False, dtype=tf.int32)
self.training_op = optimizer.minimize(cross_entropy, global_step=self.global_step)
def train_batch(self, sess, input_batch, target_batch, initial_state):
final_state_, _, final_loss = sess.run([self.final_state, self.training_op, self.loss], feed_dict={self.input_ph: input_batch, self.target_ph: target_batch, self.state_ph: initial_state})
return final_state_, final_loss
# main
with tf.Session() as sess:
if not tf.gfile.Exists(checkpoint_dir):
tf.gfile.MakeDirs(checkpoint_dir)
batch_stride = num_packed_data // B
# make model
model = Model()
saver = tf.train.Saver()
# always initialize
init = tf.global_variables_initializer()
init.run()
# restore if necessary
isRestore = False
ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
if ckpt:
isRestore = True
last_model = ckpt.model_checkpoint_path
print("Loading " + last_model)
saver.restore(sess, last_model)
# set initial step
step = tf.train.global_step(sess, model.global_step) + 1
print("start step = {0}".format(step))
# fetch initial state
state = sess.run(model.initial_state, feed_dict={model.isRestore: isRestore})
print("Initial state: {0}".format(state))
while True:
# prepare batch data
idx = [(step + x * batch_stride) % num_packed_data for x in range(0, B)]
input_batch = input_all[idx]
target_batch = target_all[idx]
state, last_loss = model.train_batch(sess, input_batch, target_batch, state)
if step % 20 == 0:
print('step {0}: loss = {1:.3f} (perplexity = {2})'.format(step, last_loss, math.exp(last_loss)))
if step % 200 == 0:
saved_file = saver.save(sess, os.path.join(checkpoint_dir, "model.ckpt"), global_step=step)
print("Saved to " + saved_file)
print("Last state: {0}".format(model.state.eval()))
break;
step = step + 1
The problem is solved. It had nothing to do with RNN nor TensorFlow.
I changed
chars = list(set(data))
to
chars = sorted(set(data))
and now it works.
This is because python uses a random hash function to build the set, and every time python restarted, 'chars' had a different ordering.