Tensorflow embedding for categorical feature - tensorflow

In machine learning, it is common to represent a categorical (specifically: nominal) feature with one-hot-encoding. I am trying to learn how to use tensorflow's embedding layer to represent a categorical feature in a classification problem. I have got tensorflow version 1.01 installed and I am using Python 3.6.
I am aware of the tensorflow tutorial for word2vec, but it is not very instructive for my case. While building the tf.Graph, it uses NCE-specific weights and tf.nn.nce_loss.
I just want a simple feed-forward net as below, and the input layer to be an embedding. My attempt is below. It complains when I try to matrix multiply the embedding with the hidden layer due to shape incompatibility. Any ideas how I can fix this?
from __future__ import print_function
import pandas as pd;
import tensorflow as tf
import numpy as np
from sklearn.preprocessing import LabelEncoder
if __name__ == '__main__':
# Minimal reproduction: one label-encoded categorical feature is looked up in
# an embedding table and fed through a single hidden layer to a 2-way softmax.
# 1 categorical input feature and a binary output
df = pd.DataFrame({'cat2': np.array(['o', 'm', 'm', 'c', 'c', 'c', 'o', 'm', 'm', 'm']),
'label': np.array([0, 0, 1, 1, 0, 0, 1, 0, 1, 1])})
# Map each category string to an integer id used as the embedding row index.
encoder = LabelEncoder()
encoder.fit(df.cat2.values)
X = encoder.transform(df.cat2.values)
# One-hot encode the binary label into a (len(df), 2) array.
Y = np.zeros((len(df), 2))
Y[np.arange(len(df)), df.label.values] = 1
# Neural net parameters
training_epochs = 5
learning_rate = 1e-3
cardinality = len(np.unique(X))
embedding_size = 2
input_X_size = 1
# NOTE(review): this counts the distinct values inside the one-hot matrix Y
# (0.0 and 1.0), not the number of classes; it equals 2 here only because the
# labels are binary.
n_labels = len(np.unique(Y))
n_hidden = 10
# Placeholders for input, output
# NOTE(review): the ids placeholder is declared rank 2 ([None, 1]); this is
# what makes the embedding lookup below return a rank-3 tensor.
x = tf.placeholder(tf.int32, [None, 1], name="input_x")
y = tf.placeholder(tf.float32, [None, 2], name="input_y")
# Neural network weights
embeddings = tf.Variable(tf.random_uniform([cardinality, embedding_size], -1.0, 1.0))
h = tf.get_variable(name='h2', shape=[embedding_size, n_hidden],
initializer=tf.contrib.layers.xavier_initializer())
W_out = tf.get_variable(name='out_w', shape=[n_hidden, n_labels],
initializer=tf.contrib.layers.xavier_initializer())
# Neural network operations
# The lookup result has shape [?, 1, 2] (see the reported error), so the
# rank-2 matmul on the next line is rejected at graph-construction time.
embedded_chars = tf.nn.embedding_lookup(embeddings, x)
layer_1 = tf.matmul(embedded_chars,h)
layer_1 = tf.nn.relu(layer_1)
out_layer = tf.matmul(layer_1, W_out)
# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=out_layer, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Initializing the variables
init = tf.global_variables_initializer()
# Launch the graph
with tf.Session() as sess:
sess.run(init)
# Each epoch is a single full-batch update; avg_cost and c are assigned but
# never read in this snippet.
for epoch in range(training_epochs):
avg_cost = 0.
# Run optimization op (backprop) and cost op (to get loss value)
_, c = sess.run([optimizer, cost],
feed_dict={x: X, y: Y})
print("Optimization Finished!")
EDIT:
Please see below the error message:
Traceback (most recent call last):
File "/home/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/common_shapes.py", line 671, in _call_cpp_shape_fn_impl
input_tensors_as_shapes, status)
File "/home/anaconda3/lib/python3.6/contextlib.py", line 89, in __exit__
next(self.gen)
File "/home/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 466, in raise_exception_on_not_ok_status
pywrap_tensorflow.TF_GetCode(status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: Shape must be rank 2 but is rank 3 for 'MatMul' (op: 'MatMul') with input shapes: [?,1,2], [2,10].

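Just make your x placeholder have shape [None] instead of [None, 1]. tf.nn.embedding_lookup appends the embedding dimension to the shape of the ids, so ids of shape [None, 1] give a rank-3 result of shape [None, 1, 2] (the first shape in your error message), whereas ids of shape [None] give [None, 2], which can be multiplied by the [2, 10] weight matrix.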

Related

Tensorflow Embedding using Continuous and Categorical Variables

Based on this post, I tried to create another model in which I use both categorical and continuous variables.
Please find the code below:
from __future__ import print_function
import pandas as pd;
import tensorflow as tf
import numpy as np
from sklearn.preprocessing import LabelEncoder
if __name__ == '__main__':
# Reproduction script: an embedded categorical feature is concatenated with one
# continuous feature and fed through a single hidden layer to a 2-way softmax.
# 1 categorical input feature and a binary output
df = pd.DataFrame({'cat2': np.array(['o', 'm', 'm', 'c', 'c', 'c', 'o', 'm', 'm', 'm']),
'num1': np.random.rand(10),
'label': np.array([0, 0, 1, 1, 0, 0, 1, 0, 1, 1])})
encoder = LabelEncoder()
encoder.fit(df.cat2.values)
# X1: integer category ids as a column; X2: the continuous feature as a column.
X1 = encoder.transform(df.cat2.values).reshape(-1,1)
X2 = np.array(df.num1.values).reshape(-1,1)
# X = np.concatenate((X1,X2), axis=1)
# One-hot encode the binary label into a (len(df), 2) array.
Y = np.zeros((len(df), 2))
Y[np.arange(len(df)), df.label.values] = 1
# Neural net parameters
training_epochs = 5
learning_rate = 1e-3
# NOTE(review): X is only assigned in the commented-out line above, so this
# name is undefined in the snippet as shown; X1 looks like the intended
# source of the category count -- confirm.
cardinality = len(np.unique(X))
embedding_size = 2
input_X_size = 1
# NOTE(review): counts distinct values in the one-hot matrix Y (0.0 and 1.0),
# which matches the class count only because the labels are binary.
n_labels = len(np.unique(Y))
n_hidden = 10
# Placeholders for input, output
cat2 = tf.placeholder(tf.int32, [None], name='cat2')
x = tf.placeholder(tf.float32, [None, 1], name="input_x")
y = tf.placeholder(tf.float32, [None, 2], name="input_y")
embed_matrix = tf.Variable(
tf.random_uniform([cardinality, embedding_size], -1.0, 1.0),
name="embed_matrix"
)
embed = tf.nn.embedding_lookup(embed_matrix, cat2)
# NOTE(review): embedding_aggregated is not defined anywhere in this snippet
# (embed is the lookup result), and axis=2 is requested for inputs that the
# reported error lists as rank 2 ([?,1] and [?,2]).
inputs_with_embed = tf.concat([x, embedding_aggregated], axis=2, name="inputs_with_embed")
# Neural network weights
# NOTE(review): the first entry of shape is the tensor inputs_with_embed
# itself rather than an integer width -- presumably embedding_size + 1 was
# intended; confirm.
h = tf.get_variable(name='h2', shape=[inputs_with_embed, n_hidden],
initializer=tf.contrib.layers.xavier_initializer())
W_out = tf.get_variable(name='out_w', shape=[n_hidden, n_labels],
initializer=tf.contrib.layers.xavier_initializer())
# Neural network operations
#embedded_chars = tf.nn.embedding_lookup(embeddings, x)
layer_1 = tf.matmul(inputs_with_embed,h)
layer_1 = tf.nn.relu(layer_1)
out_layer = tf.matmul(layer_1, W_out)
# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=out_layer, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Initializing the variables
init = tf.global_variables_initializer()
# Launch the graph
with tf.Session() as sess:
sess.run(init)
# Each epoch is a single full-batch update; avg_cost and c are never read.
# NOTE(review): X1 was reshaped to a column (-1, 1) above, while the cat2
# placeholder is declared with shape [None].
for epoch in range(training_epochs):
avg_cost = 0.
# Run optimization op (backprop) and cost op (to get loss value)
_, c = sess.run([optimizer, cost],
feed_dict={x: X2,cat2:X1, y: Y})
print("Optimization Finished!")
But I'm getting the following error. It seems I'm not concatenating the continuous variable and the embedding properly, but I don't understand how to fix it.
Could someone please guide me?
ValueError: Shape must be at least rank 3 but is rank 2 for 'inputs_with_embed_2' (op: 'ConcatV2') with input shapes: [?,1], [?,2], [] and with computed input tensors: input[2] = <2>.
Thanks!
Assuming that by embedding_aggregated you mean embed (probably a typo),
the error is that there is no axis=2 in your case; it should be axis=1:
inputs_with_embed = tf.concat([x, embed], axis=1, name="inputs_with_embed")
embed has shape [None, embedding_dimension] and x has shape [None, 1].
They are both 2D tensors, so the only valid axes are axis=0 and axis=1 (axes are zero-indexed). Therefore, to get inputs_with_embed with shape [None, embedding_dimension+1], you need to concatenate along axis=1.

Tensorflow won't matmul inputs and weights. "Dimensions must be equal"

I've been working on a simple tensor flow neural network. My input placeholder is
x = tf.placeholder(tf.float32, shape=[None, 52000, 3]).
My weight matrix is initialized to all zeros as
W = tf.Variable(tf.zeros([52000, 10])).
I tried different combinations with and without the 3 for color channels, but I guess I'm just not understanding the dimensionality because I got the error:
Traceback (most recent call last): File
"C:\Users\Everybody\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\common_shapes.py",
line 686, in _call_cpp_shape_fn_impl
input_tensors_as_shapes, status) File "C:\Users\Everybody\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\errors_impl.py",
line 473, in exit
c_api.TF_GetCode(self.status.status)) tensorflow.python.framework.errors_impl.InvalidArgumentError: Shape
must be rank 2 but is rank 3 for 'MatMul' (op: 'MatMul') with input
shapes: [?,52000,3], [52000,10].
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "rating.py", line 65, in
y = tf.matmul(x, W) + b # "fake" outputs to train/test File "C:\Users\Everybody\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\ops\math_ops.py",
line 1891, in matmul
a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name) File
"C:\Users\Everybody\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\ops\gen_math_ops.py",
line 2436, in _mat_mul
name=name) File "C:\Users\Everybody\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\op_def_library.py",
line 787, in _apply_op_helper
op_def=op_def) File "C:\Users\Everybody\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\ops.py",
line 2958, in create_op
set_shapes_for_outputs(ret) File "C:\Users\Everybody\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\ops.py",
line 2209, in set_shapes_for_outputs
shapes = shape_func(op) File "C:\Users\Everybody\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\ops.py",
line 2159, in call_with_requiring
return call_cpp_shape_fn(op, require_shape_fn=True) File "C:\Users\Everybody\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\common_shapes.py",
line 627, in call_cpp_shape_fn
require_shape_fn) File "C:\Users\Everybody\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\common_shapes.py",
line 691, in _call_cpp_shape_fn_impl
raise ValueError(err.message) ValueError: Shape must be rank 2 but is rank 3 for 'MatMul' (op: 'MatMul') with input shapes: [?,52000,3],
[52000,10].
At first, I thought my next_batch() function was the culprit, because I had to write my own since I loaded my images "manually" using scipy.misc.imread(). Its definition reads:
q = 0  # module-level cursor: index of the first sample of the next batch


def next_batch(batch_size):
    """Return the next batch of images and labels and advance the cursor.

    Reads the module-level lists ``images`` and ``one_hots`` starting at the
    module-level cursor ``q``, then moves ``q`` forward by ``batch_size``,
    wrapping around modulo ``len(images)``.

    Args:
        batch_size: number of consecutive samples to take.

    Returns:
        A tuple ``(x, y)`` of list slices. The slice is cut off at the end of
        the list, so the batch taken just before a wrap-around may contain
        fewer than ``batch_size`` items.
    """
    # Without this declaration the assignment below makes ``q`` local to the
    # function, and the first read of ``q`` raises UnboundLocalError.
    global q
    start = q
    x = images[start:start + batch_size]
    y = one_hots[start:start + batch_size]
    q = (start + batch_size) % len(images)
    return x, y
However, after looking through, I don't see what's wrong with this, so I imagine that I'm just confused about dimensionality. It is supposed to be a "flattened" 200x260 color image. It just occurred to me now that maybe I have to flatten the color channels as well? I will place my full code below if curious. I'm a bit new to Tensorflow, so thanks, all. (Yes, it is not a CNN yet, I decided to start simple just to make sure I'm importing my dataset right. And, I know it is tiny, I'm starting my dataset small too.)
############# IMPORT DEPENDENCIES ####################################
import tensorflow as tf
sess = tf.InteractiveSession() #start session
import scipy.misc
import numpy as np
######################################################################
#SET UP DATA #########################################################
# Load 60 images and their 10-way label vectors into Python lists, then
# declare a single linear layer with softmax cross-entropy loss.
images = []
one_hots = []
########### IMAGES ##################################################
#put all the images in a list
for i in range(60):
images.append(scipy.misc.imread('./shoes/%s.jpg' % str(i+1)))
print("One image appended...\n")
#normalize them, "divide" by 255
# Every channel value of every pixel is divided in place, one element at a
# time, over a 260 x 200 x 3 index range.
for image in images:
print("One image normalized...\n")
for i in range(260):
for j in range(200):
for c in range(3):
image[i][j][c]/=255
# NOTE(review): the result of tf.reshape is not assigned, so the entries of
# `images` keep their original shape after this loop.
for image in images:
tf.reshape(image, [52000, 3])
########################################################################
################# ONE-HOT VECTORS ######################################
# Reads 600 lines and groups every 10 consecutive values into one label
# vector; [:-1] drops the last character of each line (presumably the
# newline). The file handle is not closed in this snippet.
f = open('rateVectors.txt')
lines = f.readlines()
for i in range(0, 600, 10):
fillerlist = []
for j in range(10):
fillerlist.append(float(lines[i+j][:-1]))
one_hots.append(fillerlist)
print("One one-hot vector added...\n")
########################################################################3
#set placeholders and such for input, output, weights, biases
# NOTE(review): x is declared rank 3 ([None, 52000, 3]) while W is rank 2
# ([52000, 10]); this is the pair of shapes named in the reported MatMul error.
x = tf.placeholder(tf.float32, shape=[None, 52000, 3])
y_ = tf.placeholder(tf.float32, shape=[None, 10])
W = tf.Variable(tf.zeros([52000, 10])) # These are our weights and biases
b = tf.Variable(tf.zeros([10])) # initialized as zeroes.
#########################################################################
sess.run(tf.global_variables_initializer()) #initialize variables in the session
# This is the line the traceback points at (rating.py, "y = tf.matmul(x, W) + b").
y = tf.matmul(x, W) + b # "fake" outputs to train/test
##################### DEFINING OUR MODEL ####################################
#our loss function
# Arguments are passed positionally as (y, y_), i.e. model output first and
# target placeholder second.
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, y_))
#defining our training as gradient descent
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
###################### TRAINING #############################################
#################### OUR CUSTOM BATCH FUNCTION ##############################
q = 0  # module-level cursor: index of the first sample of the next batch


def next_batch(batch_size):
    """Return the next batch of images and labels and advance the cursor.

    Reads the module-level lists ``images`` and ``one_hots`` starting at the
    module-level cursor ``q``, then moves ``q`` forward by ``batch_size``,
    wrapping around modulo ``len(images)``.

    Args:
        batch_size: number of consecutive samples to take.

    Returns:
        A tuple ``(x, y)`` of list slices. The slice is cut off at the end of
        the list, so the batch taken just before a wrap-around may contain
        fewer than ``batch_size`` items.
    """
    # Without this declaration the assignment below makes ``q`` local to the
    # function, and the first read of ``q`` raises UnboundLocalError.
    global q
    start = q
    x = images[start:start + batch_size]
    y = one_hots[start:start + batch_size]
    q = (start + batch_size) % len(images)
    return x, y
#train
# Run six gradient-descent steps, each on the next batch of 10 samples.
for i in range(6):
    batch = next_batch(10)
    train_step.run(feed_dict={x: batch[0], y_: batch[1]})
    # i is an int, so it must be converted before concatenating with str;
    # the bare "+ i" form raises TypeError.
    print("Batch Number: " + str(i) + "\n")
print("Done training...\n")
################ RESULTS #################################################
#calculating accuracy
# A prediction counts as correct when the arg-max of the model output matches
# the arg-max of the target vector.
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
#print accuracy
# Evaluated on the same `images` / `one_hots` lists the batches were drawn from.
print(accuracy.eval(feed_dict={x: images, y_: one_hots}))
Your placeholder should have the dimension [None, 200, 260, 3] where None is the batch size, 200, 260 is the image size, and 3 is the channels.
Your weights should be [filter_height, filter_width, num_channels, num_filters]
Your bias should be [num_filters]
And the dimensions for the labels should be [None, num_classes] where None is the batch size, and num_classes is the number of classes that your images have.
These are just to make sure that math works.
I took these codes from here

Why are some nodes ignored by `print_model_analysis ` when `run_meta` is provided?

I want to compute the number of variables and the number of floating point operations of models. However, it seems that tf.contrib.tfprof.model_analyzer.print_model_analysis ignores the first node when run_meta is provided.
For example, (test with tensorflow 1.0.0)
import numpy as np
import tensorflow as tf
# Build a single 3x3 convolution with 22 output channels on a 7x7x3 input,
# run it once with full tracing, and ask tfprof for the float-op count.
slim = tf.contrib.slim
x = tf.placeholder(tf.float32, [None, 7, 7, 3])
c1 = slim.conv2d(x, 22, [3, 3])
# Populated by the traced sess.run call below and handed to tfprof afterwards.
run_metadata = tf.RunMetadata()
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
# One forward pass on an all-zero batch of size 1, traced at FULL_TRACE level.
_ = sess.run(c1, feed_dict={x: np.zeros([1, 7, 7, 3])},
options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
run_metadata=run_metadata)
analysis = tf.contrib.tfprof.model_analyzer.print_model_analysis(
tf.get_default_graph(), run_meta=run_metadata,
tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)
# Reported output: 1078 (numerically equal to 7 * 7 * 22, i.e. one operation
# per output element -- presumably the bias add only; confirm).
print(analysis.total_float_ops)
It only contains the number of floating point operations for Conv/BiasAdd. How can I analyze the model correctly using tfprof?

Tensorflow tf.reshape() seems to behave differently to numpy.reshape()

I'm trying to train a LSTM network and it trains successfully in one way, but throws an error in the other way. In the first example I reshape the input array X using numpy reshape and in the other way I reshape it using tensorflow reshape.
Works fine:
import numpy as np
import tensorflow as tf
import tensorflow.contrib.learn as learn
# Parameters
learning_rate = 0.1
training_steps = 3000
batch_size = 128
# Network Parameters
n_input = 4
n_steps = 10
n_hidden = 128
n_classes = 6
X = np.ones([1770,4])
y = np.ones([177])
# NUMPY RESHAPE OUTSIDE RNN_MODEL
X = np.reshape(X, (-1, n_steps, n_input))
def rnn_model(X, y):
# Model function passed to learn.TensorFlowEstimator: unrolls an LSTM over
# n_steps time steps and classifies from the value returned as `encoding`.
# In this variant X is already reshaped with numpy to (-1, n_steps, n_input)
# before fit() is called, so the in-graph reshape below stays disabled.
# TENSORFLOW RESHAPE INSIDE RNN_MODEL
#X = tf.reshape(X, [-1, n_steps, n_input]) # (batch_size, n_steps, n_input)
# # permute n_steps and batch_size
X = tf.transpose(X, [1, 0, 2])
# # Reshape to prepare input to hidden activation
X = tf.reshape(X, [-1, n_input]) # (n_steps*batch_size, n_input)
# # Split data because rnn cell needs a list of inputs for the RNN inner loop
X = tf.split(0, n_steps, X) # n_steps * (batch_size, n_input)
# Define an LSTM cell (BasicLSTMCell) with tensorflow
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)
# Get lstm cell output
# The first return value is discarded; only the second one is used below.
_, encoding = tf.nn.rnn(lstm_cell, X, dtype=tf.float32)
return learn.models.logistic_regression(encoding, y)
classifier = learn.TensorFlowEstimator(model_fn=rnn_model, n_classes=n_classes,
batch_size=batch_size,
steps=training_steps,
learning_rate=learning_rate)
classifier.fit(X,y)
Does not work:
import numpy as np
import tensorflow as tf
import tensorflow.contrib.learn as learn
# Parameters
learning_rate = 0.1
training_steps = 3000
batch_size = 128
# Network Parameters
n_input = 4
n_steps = 10
n_hidden = 128
n_classes = 6
X = np.ones([1770,4])
y = np.ones([177])
# NUMPY RESHAPE OUTSIDE RNN_MODEL
#X = np.reshape(X, (-1, n_steps, n_input))
def rnn_model(X, y):
# Model function passed to learn.TensorFlowEstimator: unrolls an LSTM over
# n_steps time steps and classifies from the value returned as `encoding`.
# In this variant the reshape to (-1, n_steps, n_input) happens inside the
# graph, so fit() receives X with 1770 rows while y has 177 entries.
# NOTE(review): the quoted IndexError (index out of bounds for size 177) is
# consistent with the data feeder drawing sample indices from X's 1770 rows
# and using them to index y -- confirm against data_feeder.py.
# TENSORFLOW RESHAPE INSIDE RNN_MODEL
X = tf.reshape(X, [-1, n_steps, n_input]) # (batch_size, n_steps, n_input)
# # permute n_steps and batch_size
X = tf.transpose(X, [1, 0, 2])
# # Reshape to prepare input to hidden activation
X = tf.reshape(X, [-1, n_input]) # (n_steps*batch_size, n_input)
# # Split data because rnn cell needs a list of inputs for the RNN inner loop
X = tf.split(0, n_steps, X) # n_steps * (batch_size, n_input)
# Define an LSTM cell (BasicLSTMCell) with tensorflow
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)
# Get lstm cell output
# The first return value is discarded; only the second one is used below.
_, encoding = tf.nn.rnn(lstm_cell, X, dtype=tf.float32)
return learn.models.logistic_regression(encoding, y)
classifier = learn.TensorFlowEstimator(model_fn=rnn_model, n_classes=n_classes,
batch_size=batch_size,
steps=training_steps,
learning_rate=learning_rate)
classifier.fit(X,y)
The latter throws the following error:
WARNING:tensorflow:<tensorflow.python.ops.rnn_cell.BasicLSTMCell object at 0x7f1c67c6f750>: Using a concatenated state is slower and will soon be deprecated. Use state_is_tuple=True.
Traceback (most recent call last):
File "/home/blabla/test.py", line 47, in <module>
classifier.fit(X,y)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/estimators/base.py", line 160, in fit
monitors=monitors)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 484, in _train_model
monitors=monitors)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/graph_actions.py", line 328, in train
reraise(*excinfo)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/graph_actions.py", line 254, in train
feed_dict = feed_fn() if feed_fn is not None else None
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/io/data_feeder.py", line 366, in _feed_dict_fn
out.itemset((i, self.y[sample]), 1.0)
IndexError: index 974 is out of bounds for axis 0 with size 177
A couple of suggestions:
* use input_fn instead of X, Y to the fit
* use learn.Estimator instead of learn.TensorFlowEstimator
Since your data is small, the following should work; otherwise you will need to batch your data.
```
def _my_inputs():
    """Return constant (features, labels) tensors for use as an input_fn.

    Both tensors are all-ones: features have shape [1770, 4] and labels have
    shape [177].
    """
    features = tf.constant(np.ones([1770, 4]))
    labels = tf.constant(np.ones([177]))
    return features, labels
I was able to get this working with a couple small changes:
# Parameters
learning_rate = 0.1
training_steps = 10
batch_size = 8
# Network Parameters
n_input = 4
n_steps = 10
n_hidden = 128
n_classes = 6
X = np.ones([177, 10, 4]) # <---- Use shape [batch_size, n_steps, n_input] here.
y = np.ones([177])
def rnn_model(X, y):
    """Classify each sequence from the LSTM output at its last time step.

    Args:
        X: input batch laid out as [batch_size, n_steps, n_input].
        y: targets forwarded unchanged to the logistic-regression head.

    Returns:
        Whatever ``tf.contrib.learn.models.logistic_regression`` returns for
        the final-step output and ``y``.
    """
    # Swap the first two axes so that unpacking yields one tensor per step.
    time_major = tf.transpose(X, [1, 0, 2])
    step_inputs = tf.unpack(time_major)
    cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)
    # The per-step outputs (first return value) are kept; the final state is
    # discarded.
    outputs, _ = tf.nn.rnn(cell, step_inputs, dtype=tf.float64)
    last_output = outputs[-1]
    return tf.contrib.learn.models.logistic_regression(last_output, y)
classifier = tf.contrib.learn.TensorFlowEstimator(model_fn=rnn_model,
n_classes=n_classes,
batch_size=batch_size,
steps=training_steps,
learning_rate=learning_rate)
classifier.fit(X,y)
I think the central problem you were having was that X has to be shape [batch,...] when passed to fit(...). When you used numpy to reshape it outside the rnn_model() function, X had this shape so training worked.
I can't speak for the quality of the model this solution will produce, but at least it runs!

Tensorflow: issue with placeholder and summaries

I have modified existing cifar10 example to work as a siamese network.
But I am facing some difficulties in training it.
Changes Made :
placeholder instead of queue
custom loss function
Here is my modified cifar10_train.py :
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from datetime import datetime
import os.path
import time
import input_data
import numpy as np
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
import cifar10
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('train_dir', 'tmp/cifar10_train',
"""Directory where to write event logs """
"""and checkpoint.""")
tf.app.flags.DEFINE_integer('max_steps', 1000000,
"""Number of batches to run.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
def train():
"""Train the two-branch (siamese) network for FLAGS.max_steps steps.

Image pairs and their labels come from input_data and are fed through
placeholders in fixed-size batches of 28.
"""
dataset = input_data.read()
image, image_p, label = dataset.train_dataset
image_size = dataset.image_size
batch_size = 28
with tf.Graph().as_default():
global_step = tf.Variable(0, trainable=False)
# Placeholders for the two images of each pair and the per-pair label.
images = tf.placeholder(tf.float32, shape=(batch_size, image_size[0], image_size[1], image_size[2]))
images2 = tf.placeholder(tf.float32, shape=(batch_size, image_size[0], image_size[1], image_size[2]))
labels = tf.placeholder(tf.float32, shape=(batch_size))
# NOTE(review): the second summary is tagged 'images2' but is also given
# `images`; `images2` looks like the intended tensor -- confirm.
tf.image_summary('images', images)
tf.image_summary('images2', images)
# Build a Graph that computes the logits predictions from the
# inference model.
# Both branches are built inside one variable scope; reuse_variables() makes
# the second inference() call share the variables created by the first.
with tf.variable_scope('inference') as scope:
logits = cifar10.inference(images)
scope.reuse_variables()
logits2 = cifar10.inference(images2)
# Calculate loss.
loss = cifar10.loss(logits, logits2, labels)
# Build a Graph that trains the model with one batch of examples and
# updates the model parameters.
train_op = cifar10.train(loss, global_step)
# Create a saver.
saver = tf.train.Saver(tf.all_variables())
# Build the summary operation based on the TF collection of Summaries.
summary_op = tf.merge_all_summaries()
# Build an initialization operation to run below.
init = tf.initialize_all_variables()
# Start running operations on the Graph.
sess = tf.Session(config=tf.ConfigProto(
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
# Start the queue runners.
# NOTE(review): the inputs visible here are placeholders, not queues; this
# call may be a leftover from the queue-based original -- confirm.
tf.train.start_queue_runners(sess=sess)
summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
graph_def=sess.graph_def)
for step in xrange(FLAGS.max_steps):
start_time = time.time()
# Slide a batch-sized window over the training set, wrapping around so that
# offset + batch_size never exceeds dataset.train_samples.
offset = (step * batch_size) % (dataset.train_samples - batch_size)
_, loss_value = sess.run([train_op, loss], feed_dict={images: image[offset:(offset + batch_size)], images2: image_p[offset:(offset + batch_size)], labels: 1.0*label[offset:(offset + batch_size)]})
duration = time.time() - start_time
print(loss_value)
assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
if step % 10 == 0:
# Throughput is computed from FLAGS.batch_size, not the local batch_size
# used for feeding.
num_examples_per_step = FLAGS.batch_size
examples_per_sec = num_examples_per_step / duration
sec_per_batch = float(duration)
format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
'sec/batch)')
print (format_str % (datetime.now(), step, loss_value,
examples_per_sec, sec_per_batch))
if step % 100 == 0:
# NOTE(review): summary_op is run without a feed_dict although the merged
# summaries include image summaries built on the `images` placeholder; this
# matches the reported "You must feed a value for placeholder tensor" error.
summary_str = sess.run(summary_op)
summary_writer.add_summary(summary_str, step)
# Save the model checkpoint periodically.
if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
saver.save(sess, checkpoint_path, global_step=step)
def main(argv=None):  # pylint: disable=unused-argument
    """Script entry point: ignore ``argv`` and start training."""
    train()
if __name__ == '__main__':
tf.app.run()
Modified cifar10.py
"""Builds the CIFAR-10 network.
Summary of available functions:
# Compute input images and labels for training. If you would like to run
# evaluations, use inputs() instead.
inputs, labels = distorted_inputs()
# Compute inference on the model inputs to make a prediction.
predictions = inference(inputs)
# Compute the total loss of the prediction with respect to the labels.
loss = loss(predictions, labels)
# Create a graph to run one step of training with respect to the loss.
train_op = train(loss, global_step)
"""
# pylint: disable=missing-docstring
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gzip
import os
import re
import sys
import tarfile
from six.moves import urllib
import tensorflow as tf
import input_data
FLAGS = tf.app.flags.FLAGS
# Basic model parameters.
tf.app.flags.DEFINE_integer('batch_size', 28,
"""Number of images to process in a batch.""")
tf.app.flags.DEFINE_string('data_dir_p', '/tmp/cifar10_data',
"""Path to the CIFAR-10 data directory.""")
# Global constants describing the CIFAR-10 data set.
# IMAGE_SIZE = cifar10_input.IMAGE_SIZE
# NUM_CLASSES = cifar10_input.NUM_CLASSES
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = input_data.train_samples
# NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL
# Constants describing the training process.
MOVING_AVERAGE_DECAY = 0.9999 # The decay to use for the moving average.
NUM_EPOCHS_PER_DECAY = 350.0 # Epochs after which learning rate decays.
LEARNING_RATE_DECAY_FACTOR = 0.1 # Learning rate decay factor.
INITIAL_LEARNING_RATE = 0.001 # Initial learning rate.
Q = 360.6244
# If a model is trained with multiple GPU's prefix all Op names with tower_name
# to differentiate the operations. Note that this prefix is removed from the
# names of the summaries when visualizing a model.
TOWER_NAME = 'tower'
DATA_URL = 'http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz'
def _activation_summary(x):
    """Record a histogram and a sparsity scalar for the activation ``x``.

    Args:
        x: tensor whose op name (with any '<TOWER_NAME>_N/' segment removed)
            is used as the summary tag prefix.
    """
    # Drop the per-tower prefix so that summaries from different towers share
    # one tag.
    tower_pattern = '%s_[0-9]*/' % TOWER_NAME
    tag = re.sub(tower_pattern, '', x.op.name)
    tf.histogram_summary(tag + '/activations', x)
    zero_fraction = tf.nn.zero_fraction(x)
    tf.scalar_summary(tag + '/sparsity', zero_fraction)
def _variable_on_cpu(name, shape, initializer):
    """Get or create a variable through tf.get_variable under '/cpu:0'.

    Args:
        name: variable name, resolved inside the current variable scope.
        shape: list of ints giving the variable shape.
        initializer: initializer passed on to tf.get_variable.

    Returns:
        The variable returned by tf.get_variable.
    """
    with tf.device('/cpu:0'):
        return tf.get_variable(name, shape, initializer=initializer)
def _variable_with_weight_decay(name, shape, stddev, wd):
    """Create a truncated-normal variable, optionally with an L2 penalty.

    Args:
        name: variable name.
        shape: list of ints giving the variable shape.
        stddev: standard deviation of the truncated normal initializer.
        wd: L2 weight-decay factor; when falsy (None or 0.0) no penalty term
            is added.

    Returns:
        The created variable.
    """
    initializer = tf.truncated_normal_initializer(stddev=stddev)
    var = _variable_on_cpu(name, shape, initializer)
    if not wd:
        return var
    # Register wd * l2_loss(var) so that it is summed into the total loss.
    penalty = tf.mul(tf.nn.l2_loss(var), wd, name='weight_loss')
    tf.add_to_collection('losses', penalty)
    return var
def inference(data):
    """Build one branch of the network: two conv/pool stages, three dense layers.

    All variables are created with tf.get_variable, so calling this function a
    second time inside a variable scope with reuse enabled shares the weights.

    Args:
        data: image batch given to the first convolution. The 5x5x1 kernel
            implies one input channel, and the first (batch) dimension must be
            statically known because it is used when flattening.

    Returns:
        The output tensor of the last linear layer ('local5'), 10 units wide.
    """
    # conv1: 5x5 kernels, 1 -> 20 channels, VALID padding, bias only.
    with tf.variable_scope('conv1') as scope:
        filters = _variable_with_weight_decay('weights', shape=[5, 5, 1, 20],
                                              stddev=0.1, wd=0.0)
        pre_bias = tf.nn.conv2d(data, filters, [1, 1, 1, 1], padding='VALID')
        offsets = _variable_on_cpu('biases', [20],
                                   tf.constant_initializer(0.0))
        conv1 = tf.nn.bias_add(pre_bias, offsets)
        _activation_summary(conv1)

    # pool1: 2x2 max pooling with stride 2.
    pool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                           padding='VALID', name='pool1')

    # conv2: 5x5 kernels, 20 -> 50 channels, VALID padding, bias only.
    with tf.variable_scope('conv2') as scope:
        filters = _variable_with_weight_decay('weights', shape=[5, 5, 20, 50],
                                              stddev=0.1, wd=0.0)
        pre_bias = tf.nn.conv2d(pool1, filters, [1, 1, 1, 1], padding='VALID')
        offsets = _variable_on_cpu('biases', [50],
                                   tf.constant_initializer(0.0))
        conv2 = tf.nn.bias_add(pre_bias, offsets)
        _activation_summary(conv2)

    # pool2: 2x2 max pooling with stride 2.
    pool2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1],
                           strides=[1, 2, 2, 1], padding='VALID', name='pool2')

    # local3: flatten, then a 500-unit dense layer with ReLU.
    with tf.variable_scope('local3') as scope:
        # Collapse every non-batch dimension into one so that a single matrix
        # multiply can be applied.
        static_shape = pool2.get_shape().as_list()
        dim = 1
        for extent in static_shape[1:]:
            dim *= extent
        flat = tf.reshape(pool2, [static_shape[0], dim])
        weights = _variable_with_weight_decay('weights', shape=[dim, 500],
                                              stddev=0.1, wd=0.0)
        biases = _variable_on_cpu('biases', [500],
                                  tf.constant_initializer(0.10))
        local3 = tf.nn.relu(tf.matmul(flat, weights) + biases,
                            name=scope.name)
        _activation_summary(local3)

    # local4: linear layer, 500 -> 10.
    with tf.variable_scope('local4') as scope:
        weights = _variable_with_weight_decay('weights', shape=[500, 10],
                                              stddev=0.1, wd=0.0)
        biases = _variable_on_cpu('biases', [10],
                                  tf.constant_initializer(0.0))
        local4 = tf.add(tf.matmul(local3, weights), biases, name=scope.name)
        _activation_summary(local4)

    # local5: linear layer, 10 -> 10.
    with tf.variable_scope('local5') as scope:
        weights = _variable_with_weight_decay('weights', shape=[10, 10],
                                              stddev=0.1, wd=0.0)
        biases = _variable_on_cpu('biases', [10],
                                  tf.constant_initializer(0.0))
        local5 = tf.add(tf.matmul(local4, weights), biases, name=scope.name)
        _activation_summary(local5)

    return local5
def loss(features1, features2, labels):
    """Compute the pairwise loss and return it summed with the 'losses' collection.

    With E the squared Euclidean distance between the two feature rows of a
    pair, each pair contributes ``(label - 1)^2 * E + label * max(1 - E, 0)``.
    The contributions are summed over the batch and divided by the static
    batch size and by 2.

    Args:
        features1: outputs of the first branch; the first dimension must be
            statically known.
        features2: outputs of the second branch, same shape as ``features1``.
        labels: one float label per pair.

    Returns:
        A tensor named 'total_loss': the sum of everything in the 'losses'
        collection, including the pairwise loss added here.
    """
    difference = tf.sub(features1, features2)
    energy_square = tf.reduce_sum(tf.pow(difference, 2), 1)
    # Term weighted by (label - 1)^2: active for label 0, grows with distance.
    zero_label_term = tf.mul(tf.pow(tf.sub(labels, 1), 2), energy_square)
    # Term weighted by label: active for label 1, zero once E reaches 1.
    one_label_term = tf.mul(labels,
                            tf.maximum(tf.sub(1.0, energy_square), 0))
    per_pair = tf.add(zero_label_term, one_label_term)
    batch = features1.get_shape().as_list()[0]
    pair_loss = tf.reduce_sum(per_pair) / batch / 2
    tf.add_to_collection('losses', pair_loss)
    # Total loss = pairwise loss + any weight-decay terms already collected.
    return tf.add_n(tf.get_collection('losses'), name='total_loss')
def _add_loss_summaries(total_loss):
    """Add raw and moving-average scalar summaries for every loss.

    Args:
        total_loss: the total loss tensor; it is tracked together with every
            tensor in the 'losses' collection.

    Returns:
        The op that updates the moving averages (decay 0.9) of those losses.
    """
    averager = tf.train.ExponentialMovingAverage(0.9, name='avg')
    tracked = tf.get_collection('losses') + [total_loss]
    update_op = averager.apply(tracked)
    for tensor in tracked:
        # The unsmoothed value gets a ' (raw)' suffix; the smoothed value is
        # logged under the loss's own op name.
        tf.scalar_summary(tensor.op.name + ' (raw)', tensor)
        tf.scalar_summary(tensor.op.name, averager.average(tensor))
    return update_op
def train(total_loss, global_step):
    """Build the training op: decayed-LR gradient descent plus summaries.

    Creates the learning-rate schedule, loss summaries, gradient computation
    and application, histogram summaries for variables and gradients, and
    moving averages of all trainable variables.

    Args:
      total_loss: total loss tensor to minimise.
      global_step: variable passed to the learning-rate decay, incremented by
        apply_gradients, and passed to the variable moving average.

    Returns:
      A no-op named 'train' that depends on the gradient application and on
      the update of the trainable-variable moving averages.
    """
    num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
    decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                    global_step,
                                    decay_steps,
                                    LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)
    tf.scalar_summary('learning_rate', lr)

    # Generate moving averages of all losses and associated summaries.
    # This must be called exactly once: every call adds another set of scalar
    # summaries with the same tags and another moving-average op to the graph.
    loss_averages_op = _add_loss_summaries(total_loss)

    # Compute gradients only after the loss averages have been updated.
    with tf.control_dependencies([loss_averages_op]):
        opt = tf.train.GradientDescentOptimizer(lr)
        grads = opt.compute_gradients(total_loss)

    # Apply gradients; this also increments global_step.
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # Add histograms for trainable variables.
    for var in tf.trainable_variables():
        tf.histogram_summary(var.op.name, var)

    # Add histograms for gradients. compute_gradients reports a variable with
    # no gradient as None, so test for None explicitly instead of relying on
    # the truthiness of a Tensor.
    for grad, var in grads:
        if grad is not None:
            tf.histogram_summary(var.op.name + '/gradients', grad)

    # Track the moving averages of all trainable variables.
    variable_averages = tf.train.ExponentialMovingAverage(
        MOVING_AVERAGE_DECAY, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())

    with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
        train_op = tf.no_op(name='train')

    return train_op
The error I am getting:
2016-03-01 15:56:59.483682: step 0, loss = 0.22 (9.7 examples/sec; 2.896 sec/batch)
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Invalid argument: You must feed a value for placeholder tensor 'Placeholder' with dtype float and shape [28,112,92,1]
[[Node: Placeholder = Placeholder[dtype=DT_FLOAT, shape=[28,112,92,1], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary
[[Node: HistogramSummary = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary/tag, inference/conv1/weights/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_1
[[Node: HistogramSummary_1 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_1/tag, inference/conv1/biases/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Invalid argument: You must feed a value for placeholder tensor 'Placeholder_2' with dtype float and shape [28]
[[Node: Placeholder_2 = Placeholder[dtype=DT_FLOAT, shape=[28], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_3
[[Node: HistogramSummary_3 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_3/tag, inference/conv2/biases/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_2
[[Node: HistogramSummary_2 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_2/tag, inference/conv2/weights/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_4
[[Node: HistogramSummary_4 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_4/tag, inference/local3/weights/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_5
[[Node: HistogramSummary_5 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_5/tag, inference/local3/biases/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_6
[[Node: HistogramSummary_6 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_6/tag, inference/local4/weights/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_7
[[Node: HistogramSummary_7 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_7/tag, inference/local4/biases/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_8
[[Node: HistogramSummary_8 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_8/tag, inference/local5/weights/read)]]
W tensorflow/core/common_runtime/executor.cc:1102] 0x7fd2340e8b60 Compute status: Out of range: Nan in summary histogram for: HistogramSummary_9
[[Node: HistogramSummary_9 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](HistogramSummary_9/tag, inference/local5/biases/read)]]
Traceback (most recent call last):
File "cifar10_train.py", line 110, in <module>
tf.app.run()
File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/platform/default/_app.py", line 30, in run
sys.exit(main(sys.argv))
File "cifar10_train.py", line 106, in main
train()
File "cifar10_train.py", line 95, in train
summary_str = sess.run(summary_op)
File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 315, in run
return self._run(None, fetches, feed_dict)
File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 511, in _run
feed_dict_string)
File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 564, in _do_run
target_list)
File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 586, in _do_call
e.code)
tensorflow.python.framework.errors.InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder' with dtype float and shape [28,112,92,1]
[[Node: Placeholder = Placeholder[dtype=DT_FLOAT, shape=[28,112,92,1], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
Caused by op u'Placeholder', defined at:
File "cifar10_train.py", line 110, in <module>
tf.app.run()
File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/platform/default/_app.py", line 30, in run
sys.exit(main(sys.argv))
File "cifar10_train.py", line 106, in main
train()
File "cifar10_train.py", line 36, in train
images = tf.placeholder(tf.float32, shape=(batch_size, image_size[0], image_size[1], image_size[2]))
File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/ops/array_ops.py", line 742, in placeholder
name=name)
File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 583, in _placeholder
name=name)
File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/ops/op_def_library.py", line 655, in apply_op
op_def=op_def)
File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2040, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/Users/Macbull/Desktop/GITHUB/tensorflow/venv/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1087, in __init__
self._traceback = _extract_stack()
Also, when I comment out merge_all_summaries(), the model diverges with loss = NaN.
The problem here is that some of the summaries in your graph — collected by tf.merge_all_summaries() — depend on your placeholders. For example, the code in cifar10.py creates summaries for various activations at each step, which depend on the training example used.
The solution is to feed the same training batch when you evaluate summary_op:
# Every 100 steps, evaluate summary_op while feeding the current batch slices
# for images, images2 and labels, so the summaries that depend on them can run.
if step % 100 == 0:
summary_str = sess.run(summary_op, feed_dict={
images: image[offset:(offset + batch_size)],
images2: image_p[offset:(offset + batch_size)],
labels: 1.0 * label[offset:(offset + batch_size)]})
While this gives the smallest modification to your original code, it is slightly inefficient, because it will re-execute the training step every 100 steps. The best way to address this (although it will require some restructuring of your training loop) is to fetch the summaries in the same call to sess.run() that performs a training step:
# Every 100 steps, fetch train_op, loss and summary_op in one sess.run() call
# with a single feed of the current batch slices.
if step % 100 == 0:
_, loss_value, summary_str = sess.run([train_op, loss, summary_op], feed_dict={
images: image[offset:(offset + batch_size)],
images2: image_p[offset:(offset + batch_size)],
labels: 1.0 * label[offset:(offset + batch_size)]})