Printing train accuracy every 100 steps in TensorFlow

I am following the MNIST tutorial here: https://www.tensorflow.org/tutorials/layers. I want to log the training accuracy every 100 steps. I tried to modify the TRAIN branch in cnn_model_fn, but it doesn't work.
Here is my modification:
if mode == tf.estimator.ModeKeys.TRAIN:
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
    train_op = optimizer.minimize(
        loss=loss,
        global_step=tf.train.get_global_step())
    logging_hook = tf.train.LoggingTensorHook({"accuracy": accuracy}, every_n_iter=100)
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op, training_hooks=[logging_hook])
And I defined an accuracy metric just above the if:
accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
But I got the following error:
Extracting MNIST-data/train-images-idx3-ubyte.gz
Extracting MNIST-data/train-labels-idx1-ubyte.gz
Extracting MNIST-data/t10k-images-idx3-ubyte.gz
Extracting MNIST-data/t10k-labels-idx1-ubyte.gz
2018-05-04 18:54:05.819366: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.1 instructions, but these are available on your machine and could speed up CPU computations.
2018-05-04 18:54:05.819388: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
2018-05-04 18:54:05.819396: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
2018-05-04 18:54:05.819402: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
2018-05-04 18:54:05.819408: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
INFO:tensorflow:Create CheckpointSaverHook.
Traceback (most recent call last):
File "cnn_mnist.py", line 119, in <module>
tf.app.run()
File "/Users/caitlinwen/miniconda2/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "cnn_mnist.py", line 102, in main
steps=2000)
File "/Users/caitlinwen/miniconda2/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 217, in train
loss = self._train_model(input_fn=input_fn, hooks=hooks)
File "/Users/caitlinwen/miniconda2/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 577, in _train_model
config=config_pb2.ConfigProto(allow_soft_placement=True)) as mon_sess:
File "/Users/caitlinwen/miniconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 333, in MonitoredTrainingSession
stop_grace_period_secs=stop_grace_period_secs)
File "/Users/caitlinwen/miniconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 627, in __init__
stop_grace_period_secs=stop_grace_period_secs)
File "/Users/caitlinwen/miniconda2/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 449, in __init__
h.begin()
File "/Users/caitlinwen/miniconda2/lib/python2.7/site-packages/tensorflow/python/training/basic_session_run_hooks.py", line 162, in begin
for (tag, tensor) in self._tensors.items()}
File "/Users/caitlinwen/miniconda2/lib/python2.7/site-packages/tensorflow/python/training/basic_session_run_hooks.py", line 162, in <dictcomp>
for (tag, tensor) in self._tensors.items()}
File "/Users/caitlinwen/miniconda2/lib/python2.7/site-packages/tensorflow/python/training/basic_session_run_hooks.py", line 688, in _as_graph_element
"to current graph %s." % (obj, graph))
ValueError: Passed (<tf.Tensor 'accuracy/value:0' shape=() dtype=float32>, <tf.Tensor 'accuracy/update_op:0' shape=() dtype=float32>) should have graph attribute that is equal to current graph <tensorflow.python.framework.ops.Graph object at 0x18139a8310>.
My full code is:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# Imports
import numpy as np
import tensorflow as tf
# Our application logic will be added here
def cnn_model_fn(features, labels, mode):
    """Model function for CNN."""
    # Input Layer
    input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])
    # Convolutional Layer #1
    conv1 = tf.layers.conv2d(
        inputs=input_layer,
        filters=32,
        kernel_size=[5, 5],
        padding="same",
        activation=tf.nn.relu)
    # Pooling Layer #1
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
    # Convolutional Layer #2 and Pooling Layer #2
    conv2 = tf.layers.conv2d(
        inputs=pool1,
        filters=64,
        kernel_size=[5, 5],
        padding="same",
        activation=tf.nn.relu)
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
    # Dense Layer
    pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
    dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)
    dropout = tf.layers.dropout(
        inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
    # Logits Layer
    logits = tf.layers.dense(inputs=dropout, units=10)
    predictions = {
        # Generate predictions (for PREDICT and EVAL mode)
        "classes": tf.argmax(input=logits, axis=1),
        # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
        # `logging_hook`.
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    # Calculate Loss (for both TRAIN and EVAL modes)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())
        logging_hook = tf.train.LoggingTensorHook({"accuracy": accuracy}, every_n_iter=100)
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op, training_hooks=[logging_hook])
    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])}
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

def main(unused_argv):
    # Load training and eval data
    mnist = tf.contrib.learn.datasets.load_dataset("mnist")
    train_data = mnist.train.images  # Returns np.array
    train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
    eval_data = mnist.test.images  # Returns np.array
    eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)
    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(model_fn=cnn_model_fn, model_dir="/tmp/mnist_convnet_model")
    # Set up logging for predictions
    init = tf.global_variables_initializer()
    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Session() as sess:
        sess.run(init)
    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True)
    mnist_classifier.train(
        input_fn=train_input_fn,
        steps=2000)
    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        num_epochs=1,
        shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)
    # merged = tf.summary.merge_all()
    # train_writer = tf.summary.FileWriter('./train', sess.graph)
    # test_writer = tf.summary.FileWriter('./test')
    # tf.global_variables_initializer().run()

if __name__ == "__main__":
    tf.app.run()

The function tf.metrics.accuracy returns two values, accuracy and update_op, as documented here.
So you need to change your code to:
accuracy, update_op = tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
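With that change, a minimal sketch of the TRAIN branch (assuming the rest of cnn_model_fn from the question stays the same; logging the update_op keeps the running accuracy moving each time the hook fires) might look like:
accuracy, update_op = tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
if mode == tf.estimator.ModeKeys.TRAIN:
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
    train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
    # Log the metric's update op so the running accuracy is refreshed on every log.
    logging_hook = tf.train.LoggingTensorHook({"accuracy": update_op}, every_n_iter=100)
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op,
                                      training_hooks=[logging_hook])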

Try with:
accuracy = tf.compat.v1.metrics.accuracy(
    labels=labels,
    predictions=predictions["classes"])
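Note that tf.compat.v1.metrics.accuracy is the same metric and likewise returns an (accuracy, update_op) pair, so a minimal usage sketch with the question's predictions dict would be:
accuracy, update_op = tf.compat.v1.metrics.accuracy(
    labels=labels,
    predictions=predictions["classes"])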

Related

How to visualize the network graph in tensorflow 1.15 with Eager mode using tensorboard?

Hi~ I want to visualize the NN in eager mode in TF 1.15 (I cannot switch to 2.0.0). The implementation is based on the low-level API of TensorFlow 1.15, and I want to use TensorBoard to visualize it.
I wrote log-tracing code but I get the following error:
WARNING:tensorflow:
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
* https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
* https://github.com/tensorflow/addons
* https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.
Traceback (most recent call last):
File "/home/frank/PycharmProjects/reconstruction_NN/my_test.py", line 78, in <module>
tf.contrib.summary.trace_on(graph=True, profiler=True)
AttributeError: module 'tensorflow.contrib.summary.summary' has no attribute 'trace_on'
Environment information (required)
tensorboard 1.15.0
tensorflow-estimator 1.15.1
tensorflow-gpu 1.15.0
Ubuntu16.04
Issue description
Code:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Conv2D,Dropout
from tensorflow.keras import Model
tf.compat.v1.enable_eager_execution()
print(tf.__version__)
print(tf.executing_eagerly())
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
batch_size = 32
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(10000).batch(batch_size)
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size)
class MyModel(Model):
    def __init__(self):
        super(MyModel, self).__init__()
        self.flatten = Flatten()
        self.d1 = Dense(128, activation='relu')
        self.dropout = Dropout(0.5)
        self.d2 = Dense(10, activation='softmax')

    def call(self, x):
        x = self.flatten(x)
        x = self.d1(x)
        x = self.dropout(x)
        return self.d2(x)

model = MyModel()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')

#tf.function
def train_step(images, labels):
    with tf.GradientTape() as tape:
        predictions = model(images)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
    train_accuracy(labels, predictions)

#tf.function
def test_step(images, labels):
    predictions = model(images)
    t_loss = loss_object(labels, predictions)
    test_loss(t_loss)
    test_accuracy(labels, predictions)

EPOCHS = 5
from datetime import *
stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = 'logs/func/%s' % stamp
writer = tf.contrib.summary.create_file_writer(logdir)
tf.summary.trace_on(graph=True, profiler=True)
for epoch in range(EPOCHS):
    for images, labels in train_ds:
        train_step(images, labels)
    for test_images, test_labels in test_ds:
        test_step(test_images, test_labels)
    template = 'Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, Test Accuracy: {}'
    print(template.format(epoch + 1, train_loss.result(),
                          train_accuracy.result() * 100,
                          test_loss.result(),
                          test_accuracy.result() * 100))
with writer.as_default():
    tf.summary.trace_export(
        name="my_func_trace",
        step=0,
        profiler_outdir=logdir)
The warning appears because you are accessing the tf.contrib namespace, which is deprecated. The documentation specifies that you should create the writer object as:
writer = tf.summary.create_file_writer(logdir)
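As a rough sketch for TF 1.15 in eager mode (assuming the tf.compat.v2 summary endpoints, which should be available in 1.15), the writer and trace calls could look like this; note that graph tracing only records something if the traced step is wrapped in tf.function:
import tensorflow as tf
tf.compat.v1.enable_eager_execution()

logdir = 'logs/func/example'  # hypothetical log directory
writer = tf.compat.v2.summary.create_file_writer(logdir)
tf.compat.v2.summary.trace_on(graph=True, profiler=True)

# ... run the @tf.function-decorated train_step here ...

with writer.as_default():
    tf.compat.v2.summary.trace_export(
        name="my_func_trace",
        step=0,
        profiler_outdir=logdir)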

AttributeError: 'KerasTPUModel' object has no attribute '_ckpt_saved_epoch'

I am trying to train a model on Google Colab, in order to play around with training on TPU. However, I am running into the following error:
AttributeError Traceback (most recent call last)
<ipython-input-82-e74efc36d872> in <module>()
----> 1 tpu_model.fit_generator(training_set, steps_per_epoch = 8000, epochs = 25)
2 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/callbacks.py in configure_callbacks(callbacks, model, do_validation, batch_size, epochs, steps_per_epoch, samples, verbose, count_mode, mode)
118 callback_list.model.stop_training = False
119 # pylint: disable=protected-access
--> 120 if callback_list.model._ckpt_saved_epoch is not None:
121 # The attribute `_ckpt_saved_epoch` is supposed to be None at the start of
122 # training (it should be made None at the end of successful multi-worker
AttributeError: 'KerasTPUModel' object has no attribute '_ckpt_saved_epoch'
While trying to run the following code.
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
import zipfile
print(tf.VERSION)
local_zip = '/home/cats_and_dogs_filtered.zip'
zip_ref = zipfile.ZipFile(local_zip, 'r')
zip_ref.extractall('/home')
zip_ref.close()
def create_model():
    classifier = tf.keras.models.Sequential()
    classifier.add(layers.Conv2D(32, (3, 3), input_shape=(64, 64, 3), activation='relu'))
    classifier.add(layers.MaxPooling2D(pool_size=(2, 2)))
    classifier.add(layers.Conv2D(32, (3, 3), activation='relu'))
    classifier.add(layers.MaxPooling2D(pool_size=(2, 2)))
    classifier.add(layers.Flatten())
    classifier.add(layers.Dense(units=128, activation='relu'))
    classifier.add(layers.Dense(units=1, activation='sigmoid'))
    return classifier
train_datagen = ImageDataGenerator(rescale = 1./255, shear_range = 0.2, zoom_range = 0.2, horizontal_flip = True)
training_set = train_datagen.flow_from_directory('/home/cats_and_dogs_filtered/train', target_size = (64, 64), batch_size = 32, class_mode = 'binary')
model = create_model()
TPU_WORKER = 'grpc://' + os.environ['COLAB_TPU_ADDR']
tpu_model = tf.contrib.tpu.keras_to_tpu_model( model, strategy=tf.contrib.tpu.TPUDistributionStrategy(tf.contrib.cluster_resolver.TPUClusterResolver(TPU_WORKER)))
tpu_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
tpu_model.save_weights('./tpu_model.h5', overwrite=True)
tpu_model.fit_generator(training_set, steps_per_epoch = 8000, epochs = 25)
I am not sure what is going on. I used similar code to train it on CPU (takes a long time to train).

load tfrecord files into keras model by batch

I want to load batched data extracted from TFRecord files into a Keras model.
I used TFRecord files to store my data and labels, extracted them in batches with the tf.data API, and created an iterator, but I don't know how to feed the data into the Keras model.
import tensorflow as tf
import keras as k
import numpy as np
num_epochs = 2
data_dim = 75
timesteps = 300
num_classes = 82
batch_size = 128
training_filename = [filepath]
validation_filename = [filepath]
def parse_function(example1):
    features = tf.io.parse_single_example(example1, features={
        'label': tf.io.FixedLenFeature(shape=(), dtype=tf.int64, default_value=None),
        'skeleton': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
        'skeleton_shape': tf.io.FixedLenFeature(shape=(3,), dtype=tf.int64)})
    features['skeleton'] = tf.decode_raw(features['skeleton'], tf.float64)
    skeleton = tf.reshape(features['skeleton'], (300, 75))
    label = tf.one_hot(features['label'], num_classes, dtype=tf.float64)
    return skeleton, label

def load_dataset(filename):
    data_reading = tf.data.TFRecordDataset(filename)
    dataset = data_reading.map(parse_function)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    skeleton, label = iterator.get_next()
    return skeleton, label

x_train, y_train = load_dataset(training_filename)
x_val, y_val = load_dataset(validation_filename)

model = k.Sequential()
model.add(k.layers.LSTM(128, activation='relu', return_sequences=True))
model.add(k.layers.LSTM(128, activation='relu'))
model.add(k.layers.Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train,
          epochs=5, shuffle=True,
          validation_data=(x_val, y_val))
Traceback (most recent call last):
File "D:/MScProject/extraction/feeder.py", line 60, in <module>
y=run_model(x_train,y_train,x_val,y_val)
File "D:\MScProject\extraction\lstm.py", line 22, in run_model
validation_data=(x_val, y_val))
File "C:\Users\wille\Anaconda3\lib\site-packages\keras\engine\training.py", line 952, in fit
batch_size=batch_size)
File "C:\Users\wille\Anaconda3\lib\site-packages\keras\engine\training.py", line 677, in _standardize_user_data
self._set_inputs(x)
File "C:\Users\wille\Anaconda3\lib\site-packages\keras\engine\training.py", line 589, in _set_inputs
self.build(input_shape=(None,) + inputs.shape[1:])
TypeError: can only concatenate tuple (not "TensorShape") to tuple
Which version of TF are you using?
If you are on TF 2.0 you can feed a tf.data.Dataset directly to model.fit.
Check this entry in the TF documentation.
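For example, a minimal sketch under that assumption (TF 2.x, reusing a parse function like the one in the question, with tf.io.decode_raw in place of tf.decode_raw):
# Build batched datasets and hand them straight to Keras instead of iterators.
train_ds = tf.data.TFRecordDataset(training_filename).map(parse_function).batch(batch_size)
val_ds = tf.data.TFRecordDataset(validation_filename).map(parse_function).batch(batch_size)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_ds, epochs=5, validation_data=val_ds)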

Error Restoring Model in TensorFlow After Changing the Optimizer Parameter

I have trained a model in TensorFlow. During training I was setting a var_list in the optimizer; in other words, I was training a GRU on top of a CNN. Here is the code for the optimizer:
with tf.name_scope('optimizer'):
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.AdamOptimizer(0.0001).minimize(MSE, var_list=gru_output_var_list)
Then, after training and saving the variables in a checkpoint, I tried to remove the var_list from the optimizer in order to fine-tune the whole network, the conv layers together with the GRU. But that raises an error:
Key weight_fc_sig/Adam_1 not found in checkpoint
where weight_fc_sig is the name of one of the variables in the model.
I have read through GitHub issues and found that the state of the optimizer is saved in the checkpoint along with the variables. So I would like to know how to solve this issue; in other words, I need to know how to reset the state of the optimizer.
Any help is much appreciated!!
To begin with, I built a model in TensorFlow and then saved the graph, with its variables, into a checkpoint through:
saver = tf.train.Saver()
saver.save(sess, model_path + "ckpt")
So when I inspect the list of variables stored through:
from tensorflow.python import pywrap_tensorflow
model_path = 'C:/Users/user/PycharmProjects/TensorflowDifferentProjects/MNIStDataset/tensorlogs/ckpt'
reader = pywrap_tensorflow.NewCheckpointReader(model_path)
var_to_shape_map = reader.get_variable_to_shape_map()
for key in sorted(var_to_shape_map):
    print("tensor_name: ", key)
I got the following list of variables:
tensor_name: Adam_optimizer/beta1_power
tensor_name: Adam_optimizer/beta2_power
tensor_name: conv1/biases
tensor_name: conv1/biases/Adam
tensor_name: conv1/biases/Adam_1
tensor_name: conv1/weights
tensor_name: conv1/weights/Adam
tensor_name: conv1/weights/Adam_1
tensor_name: conv2/biases
tensor_name: conv2/biases/Adam
tensor_name: conv2/biases/Adam_1
tensor_name: conv2/weights
tensor_name: conv2/weights/Adam
tensor_name: conv2/weights/Adam_1
tensor_name: fc1/biases
tensor_name: fc1/biases/Adam
tensor_name: fc1/biases/Adam_1
tensor_name: fc1/weights
tensor_name: fc1/weights/Adam
tensor_name: fc1/weights/Adam_1
tensor_name: fc2/biases
tensor_name: fc2/biases/Adam
tensor_name: fc2/biases/Adam_1
tensor_name: fc2/weights
tensor_name: fc2/weights/Adam
tensor_name: fc2/weights/Adam_1
When I trained the same model another time, but this time passed the saver only the list of weights and biases, as
saver = tf.train.Saver(var_list=lst_vars), and then printed out the list of saved weights and biases,
I got the following list:
tensor_name: conv1/biases
tensor_name: conv1/weights
tensor_name: conv2/biases
tensor_name: conv2/weights
tensor_name: fc1/biases
tensor_name: fc1/weights
tensor_name: fc2/biases
tensor_name: fc2/weights
Now, when I tried to run the same model again, but removed the list of variables to restore, so that the saver was simply
saver = tf.train.Saver(),
I ran into the following error:
Key fc2/weights/Adam_1 not found in checkpoint.
Therefore, the solution was to specify explicitly the list of variables I need to restore. In other words, even though I only
saved the weights and biases I needed, when importing them I should name them specifically, so I should
say:
saver = tf.train.Saver(var_list=lst_vars)
where lst_vars is the list of variables I need to restore, which is the same as the one
printed above.
In general, whenever we try to restore the graph without specifying the list of variables to restore,
TensorFlow sees that some variables have not been stored; in other words, whenever there is no list,
TensorFlow assumes we are trying to restore the whole graph, which is not the case here. I am only restoring the part responsible
for the weights and biases. Once this is specified, TensorFlow knows that I am not initializing the whole graph, but only part
of it.
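As an illustrative sketch (hypothetical, based on the tensor names printed above), the restore list could also be built by filtering out the optimizer's slot variables rather than listing indices by hand:
# Keep only the model's weights and biases; drop Adam slots and the beta power accumulators.
lst_vars = [v for v in tf.global_variables()
            if '/Adam' not in v.name
            and 'beta1_power' not in v.name
            and 'beta2_power' not in v.name]
saver = tf.train.Saver(var_list=lst_vars)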
Now, if I do explicitly provide the list of variables I need to restore, as follows:
saver = tf.train.Saver(var_list=lst_vars)
this will not cause any problem.
Also, I can add to or modify the var_list passed to the optimizer; that will not cause any problem either.
At the same time, even if I pass the optimizer a restricted list of variables to work on, as in:
with tf.name_scope('Adam_optimizer'):
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy, var_list=lst_vars[:3])
saver = tf.train.Saver(var_list=lst_vars)
then I can run the same model another time, but without the var_list parameter in the optimizer. That would be the fine-tuning case.
Now, to go the extra mile, I can modify the model, perhaps adding more layers, but I should keep in mind that I only have the
following variables stored in the checkpoint:
tensor_name: conv1/biases
tensor_name: conv1/weights
tensor_name: conv2/biases
tensor_name: conv2/weights
tensor_name: fc1/biases
tensor_name: fc1/weights
tensor_name: fc2/biases
tensor_name: fc2/weights
I should mention to the saver that these are the variables that I am restoring. So I said the following:
saver = tf.train.Saver(var_list=[lst_vars[0], lst_vars[1], lst_vars[2], lst_vars[3],
lst_vars[6], lst_vars[7], lst_vars[8], lst_vars[9]])
In this case there is no problem and the code runs fine, and I can also ask the optimizer to train the new model, perhaps only certain parameters, i.e. weights and biases and so on...
Also, note that I can save the whole model with:
saver = tf.train.Saver()
and then restore only part of the model (by running the model another time and passing
saver = tf.train.Saver(var_list=lst_vars)).
I can also modify the model, maybe adding more conv layers, so I can fine-tune the model as long as I specify exactly which
variables I am restoring. For example:
saver = tf.train.Saver(var_list=[lst_vars[0], lst_vars[1], lst_vars[2], lst_vars[3],
lst_vars[6], lst_vars[7], lst_vars[8], lst_vars[9]])
All this explanation came about because I thought there might be some problem with the optimizer and I needed to know how to reset it. An issue was raised on GitHub about exactly how to reset the optimizer, and that is why I came to all these conclusions.
Here is the code for whoever is interested:
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
import os
def conv2d(x, w):
return tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding='SAME')
def max_pool_2x2(x):
return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
def weight_variable(shape, name):
initial = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial, name=name)
def bias_variable(shape, name):
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial, name=name)
def deepnn(x):
with tf.name_scope('reshape'):
x_image = tf.reshape(x, [-1, 28, 28, 1])
# First convolutional layer, maps one grayscale image to 32 feature maps.
with tf.name_scope('conv1'):
w_conv1 = weight_variable([5, 5, 1, 32], name='weights')
b_conv1 = bias_variable([32], name='biases')
h_conv1 = tf.nn.relu(conv2d(x_image, w_conv1) + b_conv1)
# Pooling layer, downsampling by 2X
with tf.name_scope('pool1'):
h_pool1 = max_pool_2x2(h_conv1)
# Second convolutional layer -- maps 32 feature maps to 64
with tf.name_scope('conv2'):
w_conv2 = weight_variable([5, 5, 32, 64], name='weights')
b_conv2 = bias_variable([64], name='biases')
h_conv2 = tf.nn.relu(conv2d(h_pool1, w_conv2) + b_conv2)
# Second pooling layer
with tf.name_scope('pool2'):
h_pool2 = max_pool_2x2(h_conv2)
# Fully connected layer 1 -- after 2 round of downsampling, our 28 x 28 image is
# down to 7 x 7 x 64 feature maps -- maps this to 1024 features.
with tf.name_scope('fc1'):
w_fc1 = weight_variable([7 * 7 * 64, 1024], name='weights')
b_fc1 = bias_variable([1024], name='biases')
h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w_fc1) + b_fc1)
# Dropout - control the complexity of the model, prevents co-adaptation of features
with tf.name_scope('dropout'):
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
# Map the 1024 features to 10 classes, one for each digit.
with tf.name_scope('fc2'):
w_fc2 = weight_variable([1024, 10], name='weights')
b_fc2 = bias_variable([10], name='biases')
y_conv = tf.matmul(h_fc1_drop, w_fc2) + b_fc2
return y_conv, keep_prob
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
# Create the model
x = tf.placeholder(tf.float32, [None, 784])
# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, 10])
# Build the graph for the deep net
y_conv, keep_prob = deepnn(x)
with tf.name_scope('loss'):
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv)
cross_entropy = tf.reduce_mean(cross_entropy)
# Note that this list of variables only include the weights and biases in the model.
lst_vars = []
for v in tf.global_variables():
lst_vars.append(v)
print(v.name, '....')
with tf.name_scope('Adam_optimizer'):
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
with tf.name_scope('accuracy'):
correct_prediction = tf.equal(tf.arg_max(y_conv, 1), tf.arg_max(y_, 1))
correct_prediction = tf.cast(correct_prediction, tf.float32)
accuracy = tf.reduce_mean(correct_prediction)
model_path = 'C:/Users/user/PycharmProjects/TensorflowDifferentProjects/MNIStDataset/tensorlogs/'
saver = tf.train.Saver(var_list=lst_vars)
train_writer = tf.summary.FileWriter(model_path + "EventsFile/")
train_writer.add_graph(tf.get_default_graph())
for v in tf.global_variables():
print(v.name)
# Note that a session is created within a with block so that it is destroyed
# once the block has been exited.
with tf.Session() as sess:
print('all variables initialized!!')
sess.run(tf.global_variables_initializer())
ckpt = tf.train.get_checkpoint_state(
os.path.dirname(model_path))
if ckpt and ckpt.model_checkpoint_path:
saver.restore(sess, ckpt.model_checkpoint_path)
print('checkpoints are saved!!!')
else:
print('No stored checkpoints')
for i in range(700):
batch = mnist.train.next_batch(50)
if i % 100 == 0:
train_accuracy = accuracy.eval(feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0})
print('step %d, training accuracy %g' % (i, train_accuracy))
train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
print('test accuracy %g' % accuracy.eval(feed_dict={
x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
save_path = saver.save(sess, model_path + "ckpt")
And the modified model (where I added another convolutional layer):
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
import os
def conv2d(x, w):
return tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding='SAME')
def max_pool_2x2(x):
return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
def weight_variable(shape, name):
initial = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial, name=name)
def bias_variable(shape, name):
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial, name=name)
def deepnn(x):
with tf.name_scope('reshape'):
x_image = tf.reshape(x, [-1, 28, 28, 1])
# First convolutional layer, maps one grayscale image to 32 feature maps.
with tf.name_scope('conv1'):
w_conv1 = weight_variable([5, 5, 1, 32], name='weights')
b_conv1 = bias_variable([32], name='biases')
h_conv1 = tf.nn.relu(conv2d(x_image, w_conv1) + b_conv1)
# Pooling layer, downsampling by 2X
with tf.name_scope('pool1'):
h_pool1 = max_pool_2x2(h_conv1)
# Second convolutional layer -- maps 32 feature maps to 64
with tf.name_scope('conv2'):
w_conv2 = weight_variable([5, 5, 32, 64], name='weights')
b_conv2 = bias_variable([64], name='biases')
h_conv2 = tf.nn.relu(conv2d(h_pool1, w_conv2) + b_conv2)
# Second pooling layer
with tf.name_scope('pool2'):
h_pool2 = max_pool_2x2(h_conv2)
with tf.name_scope('conv3'):
w_conv3 = weight_variable([1, 1, 64, 64], name='weights')
b_conv3 = bias_variable([64], name='biases')
h_conv3 = tf.nn.relu(conv2d(h_pool2, w_conv3) + b_conv3)
# Fully connected layer 1 -- after 2 round of downsampling, our 28 x 28 image is
# down to 7 x 7 x 64 feature maps -- maps this to 1024 features.
with tf.name_scope('fc1'):
w_fc1 = weight_variable([7 * 7 * 64, 1024], name='weights')
b_fc1 = bias_variable([1024], name='biases')
h_conv3_flat = tf.reshape(h_conv3, [-1, 7 * 7 * 64])
h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat, w_fc1) + b_fc1)
# Dropout - control the complexity of the model, prevents co-adaptation of features
with tf.name_scope('dropout'):
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
# Map the 1024 features to 10 classes, one for each digit.
with tf.name_scope('fc2'):
w_fc2 = weight_variable([1024, 10], name='weights')
b_fc2 = bias_variable([10], name='biases')
y_conv = tf.matmul(h_fc1_drop, w_fc2) + b_fc2
return y_conv, keep_prob
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
# Create the model
x = tf.placeholder(tf.float32, [None, 784])
# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, 10])
# Build the graph for the deep net
y_conv, keep_prob = deepnn(x)
with tf.name_scope('loss'):
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv)
cross_entropy = tf.reduce_mean(cross_entropy)
# Note that this list of variables only include the weights and biases in the model.
lst_vars = []
for v in tf.global_variables():
lst_vars.append(v)
print(v.name, '....')
with tf.name_scope('Adam_optimizer'):
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
with tf.name_scope('accuracy'):
correct_prediction = tf.equal(tf.arg_max(y_conv, 1), tf.arg_max(y_, 1))
correct_prediction = tf.cast(correct_prediction, tf.float32)
accuracy = tf.reduce_mean(correct_prediction)
model_path = 'C:/Users/user/PycharmProjects/TensorflowDifferentProjects/MNIStDataset/tensorlogs/'
saver = tf.train.Saver(var_list=[lst_vars[0], lst_vars[1], lst_vars[2], lst_vars[3],
lst_vars[6], lst_vars[7], lst_vars[8], lst_vars[9]])
train_writer = tf.summary.FileWriter(model_path + "EventsFile/")
train_writer.add_graph(tf.get_default_graph())
for v in tf.global_variables():
print(v.name)
# Note that a session is created within a with block so that it is destroyed
# once the block has been exited.
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
print('all variables initialized!!')
ckpt = tf.train.get_checkpoint_state(
os.path.dirname(model_path))
if ckpt and ckpt.model_checkpoint_path:
saver.restore(sess, ckpt.model_checkpoint_path)
print('checkpoints are saved!!!')
else:
print('No stored checkpoints')
for i in range(700):
batch = mnist.train.next_batch(50)
if i % 100 == 0:
train_accuracy = accuracy.eval(feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0})
print('step %d, training accuracy %g' % (i, train_accuracy))
train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
print('test accuracy %g' % accuracy.eval(feed_dict={
x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
save_path = saver.save(sess, model_path + "ckpt")

When trying to use Keras to build a model to be trained with distributed tensorflow I get RuntimeError: Graph is finalized and cannot be modified

So I'm trying to adapt the distributed TensorFlow examples to work with Keras. I'm not sure this is entirely possible, but I figured I would give it a try. Here is my script to start the workers:
import tensorflow as tf
import keras
from keras import backend as K
from keras.layers import Dense, Dropout, InputLayer
from keras.models import Sequential
from keras.datasets import mnist
def get_data():
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    num_classes = 10
    x_train = x_train.reshape(60000, 784)
    x_test = x_test.reshape(10000, 784)
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255
    # convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    return x_train, y_train
tf.app.flags.DEFINE_string("ps_hosts", "", "List of parameter server addresses'")
tf.app.flags.DEFINE_string("worker_hosts", "", "List of worker addresses'")
tf.app.flags.DEFINE_integer("task_index", "", "Index of task within the job")
FLAGS = tf.app.flags.FLAGS
task_index = FLAGS.task_index
ps_hosts = FLAGS.ps_hosts.split(",")
worker_hosts = FLAGS.worker_hosts.split(",")
cluster_spec = tf.train.ClusterSpec({'ps' : ps_hosts, 'worker' : worker_hosts})
server = tf.train.Server(cluster_spec, job_name='worker', task_index=task_index)
worker_device = 'job:worker/task:{}'.format(task_index)
replica_device = tf.train.replica_device_setter(
    worker_device = worker_device,
    cluster = cluster_spec)
x_train, y_train = get_data()
with tf.device(replica_device):
    K.manual_variable_initialization(True)
    global_step = tf.get_variable(name='global_step', shape=[],
                                  initializer=tf.constant_initializer(0), trainable=False)
    init_op = tf.global_variables_initializer()
supervisor = tf.train.Supervisor(is_chief=(task_index == 0),
                                 global_step=global_step, init_op=init_op)
sess = supervisor.prepare_or_wait_for_session(server.target)
K.set_session(sess)
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(784,)))
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu'))
model.add(Dense(10, activation='softmax'))
model.compile(optimizer='sgd', loss='categorical_crossentropy')
model.fit(x_train[:1000], y_train[:1000], epochs=1)
Here is my script to start the parameter server
import tensorflow as tf
if __name__ == '__main__':
    tf.app.flags.DEFINE_string("ps_hosts", "", "List of parameter server addresses'")
    tf.app.flags.DEFINE_string("worker_hosts", "", "List of worker addresses'")
    tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job")
    FLAGS = tf.app.flags.FLAGS
    task_index = FLAGS.task_index
    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")
    cluster = { 'ps' : ps_hosts, 'worker' : worker_hosts }
    cluster = {"ps" : ps_hosts, "worker" : worker_hosts}
    server = tf.train.Server(cluster, job_name='ps', task_index=task_index)
    server.join()
Here is my script to run these processes
#!/bin/bash
python3 start_ps.py --job_name=ps --ps_hosts=localhost:2222 --worker_hosts=localhost:2223,localhost:2224 --task_index=0 &
python3 start_worker.py --ps_hosts=localhost:2222 --worker_hosts=localhost:2223,localhost:2224 --task_index=0 &
python3 start_worker.py --ps_hosts=localhost:2222 --worker_hosts=localhost:2223,localhost:2224 --task_index=1
When I run this, I get the following error:
Traceback (most recent call last):
File "start_worker.py", line 62, in <module>
model.add(Dense(512, activation='relu', input_shape=(784,)))
File "/Users/user/projects/keras/keras/models.py", line 421, in add
dtype=layer.dtype, name=layer.name + '_input')
File "/Users/user/projects/keras/keras/engine/topology.py", line 1375, in Input
input_tensor=tensor)
File "/Users/pzx496/projects/keras/keras/engine/topology.py", line 1286, in __init__
name=self.name)
File "/Users/pzx496/projects/keras/keras/backend/tensorflow_backend.py", line 349, in placeholder
x = tf.placeholder(dtype, shape=shape, name=name)
File "/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 1530, in placeholder
return gen_array_ops._placeholder(dtype=dtype, shape=shape, name=name)
File "/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 1954, in _placeholder
name=name)
File "/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
op_def=op_def)
File "/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2458, in create_op
self._check_not_finalized()
File "/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2181, in _check_not_finalized
raise RuntimeError("Graph is finalized and cannot be modified.")
RuntimeError: Graph is finalized and cannot be modified.
I thought that by setting K.manual_variable_initialization(True), Keras would not initialize variables until I told it to. This appears not to be the case. Any idea how I can fix this behavior?
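For what it's worth, one likely cause: tf.train.Supervisor finalizes the graph when prepare_or_wait_for_session creates the session, so the placeholder Keras tries to add afterwards (the Dense layer input in the traceback) hits a finalized graph. A purely hypothetical sketch of the reordering idea, building the model before the session is created (Keras may still create training ops lazily at fit time, so this is a direction to try rather than a verified fix):
with tf.device(replica_device):
    K.manual_variable_initialization(True)
    # Build the Keras model (and therefore its graph ops) before the
    # Supervisor finalizes the graph at session-creation time.
    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(784,)))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='sgd', loss='categorical_crossentropy')
    global_step = tf.get_variable(name='global_step', shape=[],
                                  initializer=tf.constant_initializer(0), trainable=False)
    init_op = tf.global_variables_initializer()

supervisor = tf.train.Supervisor(is_chief=(task_index == 0),
                                 global_step=global_step, init_op=init_op)
sess = supervisor.prepare_or_wait_for_session(server.target)
K.set_session(sess)
model.fit(x_train[:1000], y_train[:1000], epochs=1)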