Distributed Tensorflow: non-chief worker blocked - tensorflow

I am trying out distributed TensorFlow, and my code is shown below. The chief worker runs as expected. However, the non-chief workers block at:
sess = sv.prepare_or_wait_for_session(target, config=sess_config)
Could anybody help me solve this problem?
# Copyright 2016 Google Inc. All Rights Reserved.
"""A library to train Inception using multiple replicas with synchronous update.
Please see accompanying README.md for details and instructions.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from datetime import datetime
import os.path
import time
import numpy as np
import tensorflow as tf
from inception.slim.datasets import dataset_factory
from inception.slim.nets import nets_factory
from inception.slim.preprocessing import preprocessing_factory
from inception import inception_model as inception
from inception.slim import slim
#from inception import image_processing
sslim = tf.contrib.slim
FLAGS = tf.app.flags.FLAGS
'dataset_name', 'imagenet', 'The name of the dataset to load.')
'dataset_split_name', 'train', 'The name of the train/test split.')
'train_image_size', None, 'Train image size')
'dataset_dir', None, 'The directory where the dataset files are stored.')
tf.app.flags.DEFINE_string('job_name', '', 'One of "ps", "worker"')
tf.app.flags.DEFINE_string('ps_hosts', '',
"""Comma-separated list of hostname:port for the """
"""parameter server jobs. e.g. """
tf.app.flags.DEFINE_string('worker_hosts', '',
"""Comma-separated list of hostname:port for the """
"""worker jobs. e.g. """
'weight_decay', 0.00004, 'The weight decay on the model weights.')
tf.app.flags.DEFINE_string('train_dir', '/tmp/imagenet_train',
"""Directory where to write event logs """
"""and checkpoint.""")
tf.app.flags.DEFINE_integer('max_steps', 100, 'Number of batches to run.')
tf.app.flags.DEFINE_string('subset', 'train', 'Either "train" or "validation".')
tf.app.flags.DEFINE_boolean('log_device_placement', False,
'Whether to log device placement.')
'model_name', 'inception_v3', 'The name of the architecture to train.')
'batch_size', 32, 'The number of samples in each batch.')
'preprocessing_name', None, 'The name of the preprocessing to use. If left '
'as `None`, then the model_name flag is used.')
# Task ID is used to select the chief and also to access the local_step for
# each replica to check staleness of the gradients in sync_replicas_optimizer.
'task_id', 0, 'Task ID of the worker/replica running the training.')
# More details can be found in the sync_replicas_optimizer class:
# tensorflow/python/training/sync_replicas_optimizer.py
tf.app.flags.DEFINE_integer('num_replicas_to_aggregate', -1,
"""Number of gradients to collect before """
"""updating the parameters.""")
tf.app.flags.DEFINE_integer('save_interval_secs', 10 * 60,
'Save interval seconds.')
tf.app.flags.DEFINE_integer('save_summaries_secs', 10 * 60,
'Save summaries interval seconds.')
# Please note that this learning rate schedule is heavily dependent on the
# hardware architecture, batch size and any changes to the model architecture
# specification. Selecting a finely tuned learning rate schedule is an
# empirical process that requires some experimentation. Please see README.md
# for more guidance and discussion.
# Learning rate decay factor selected from https://arxiv.org/abs/1604.00981
tf.app.flags.DEFINE_float('initial_learning_rate', 0.045,
'Initial learning rate.')
tf.app.flags.DEFINE_float('num_epochs_per_decay', 2.0,
'Epochs after which learning rate decays.')
tf.app.flags.DEFINE_float('learning_rate_decay_factor', 0.94,
'Learning rate decay factor.')
# Constants dictating the learning rate schedule.
RMSPROP_DECAY = 0.9 # Decay term for RMSProp.
RMSPROP_MOMENTUM = 0.9 # Momentum in RMSProp.
RMSPROP_EPSILON = 1.0 # Epsilon term for RMSProp.
def train(target, dataset, cluster_spec):
"""Train Inception on a dataset for a number of steps."""
# Number of workers and parameter servers are inferred from the workers and ps
# hosts string.
num_workers = len(cluster_spec.as_dict()['worker'])
num_parameter_servers = len(cluster_spec.as_dict()['ps'])
# If no value is given, num_replicas_to_aggregate defaults to be the number of
# workers.
if FLAGS.num_replicas_to_aggregate == -1:
num_replicas_to_aggregate = num_workers
num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate
# Both should be greater than 0 in a distributed training.
assert num_workers > 0 and num_parameter_servers > 0, (' num_workers and '
' must be > 0.')
# Choose worker 0 as the chief. Note that any worker could be the chief
# but there should be only one chief.
is_chief = (FLAGS.task_id == 0)
# Ops are assigned to worker by default.
with tf.device('/job:worker/task:%d' % FLAGS.task_id):
# Variables and its related init/assign ops are assigned to ps.
with slim.scopes.arg_scope(
[slim.variables.variable, slim.variables.global_step],
# Create a variable to count the number of train() calls. This equals the
# number of updates applied to the variables.
global_step = slim.variables.global_step()
# Calculate the learning rate schedule.
num_batches_per_epoch = (dataset.num_examples_per_epoch() /
# Decay steps need to be divided by the number of replicas to aggregate.
decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay /
# Decay the learning rate exponentially based on the number of steps.
lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
# Add a summary to track the learning rate.
tf.summary.scalar('learning_rate', lr)
# Create an optimizer that performs gradient descent.
opt = tf.train.RMSPropOptimizer(lr,
dataset = dataset_factory.get_dataset(
FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
network_fn = nets_factory.get_network_fn(
preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
image_preprocessing_fn = preprocessing_factory.get_preprocessing(
provider = sslim.dataset_data_provider.DatasetDataProvider(
common_queue_capacity=20 * FLAGS.batch_size,
common_queue_min=10 * FLAGS.batch_size)
[image, label] = provider.get(['image', 'label'])
train_image_size = FLAGS.train_image_size or network_fn.default_image_size
image = image_preprocessing_fn(image, train_image_size, train_image_size)
images, labels = tf.train.batch(
[image, label],
capacity=5 * FLAGS.batch_size)
# Number of classes in the Dataset label set plus 1.
# Label 0 is reserved for an (unused) background class.
num_classes = 1001
logits, end_points = network_fn(images)
# Add classification loss.
sparse_labels = tf.reshape(labels, [batch_size, 1])
indices = tf.reshape(tf.range(batch_size), [batch_size, 1])
#concated = tf.concat(1, [indices, sparse_labels])
sparse_labels = tf.cast(sparse_labels, tf.int32)
concated = tf.concat([indices, sparse_labels], 1)
dense_labels = tf.sparse_to_dense(concated,
[batch_size, 1001],
1.0, 0.0)
logits, dense_labels, label_smoothing=0.01, weight=1.0)
# Gather all of the losses including regularization losses.
losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
total_loss = tf.add_n(losses, name='total_loss')
if is_chief:
# Compute the moving average of all individual losses and the
# total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss;
# do the same for the averaged version of the losses.
for l in losses + [total_loss]:
loss_name = l.op.name
# Name each loss as '(raw)' and name the moving average version of the
# loss as the original loss name.
tf.summary.scalar(loss_name + '_raw', l)
tf.summary.scalar(loss_name, loss_averages.average(l))
# Add dependency to compute loss_averages.
with tf.control_dependencies([loss_averages_op]):
total_loss = tf.identity(total_loss)
# Track the moving averages of all trainable variables.
# Note that we maintain a 'double-average' of the BatchNormalization
# global statistics.
# This is not needed when the number of replicas are small but important
# for synchronous distributed training with tens of workers/replicas.
exp_moving_averager = tf.train.ExponentialMovingAverage(
inception.MOVING_AVERAGE_DECAY, global_step)
variables_to_average = (
tf.trainable_variables() + tf.moving_average_variables())
# Add histograms for model variables.
for var in variables_to_average:
tf.summary.histogram(var.op.name, var)
# Create synchronous replica optimizer.
opt = tf.train.SyncReplicasOptimizer(
# Compute gradients with respect to the loss.
grads = opt.compute_gradients(total_loss)
# Add histograms for gradients.
for grad, var in grads:
if grad is not None:
tf.summary.histogram(var.op.name + '/gradients', grad)
apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)
with tf.control_dependencies([apply_gradients_op]):
train_op = tf.identity(total_loss, name='train_op')
# Get chief queue_runners, init_tokens and clean_up_op, which is used to
# synchronize replicas.
# More details can be found in sync_replicas_optimizer.
chief_queue_runners = [opt.get_chief_queue_runner()]
init_tokens_op = opt.get_init_tokens_op()
# Build the summary operation based on the TF collection of Summaries.
summary_op = tf.summary.merge_all()
# Build an initialization operation to run below.
#init_op = tf.global_variables_initializer()
# We run the summaries in the same thread as the training operations by
# passing in None for summary_op to avoid a summary_thread being started.
# Running summaries and training operations in parallel could run out of
# GPU memory.
sv = tf.train.Supervisor(is_chief=is_chief,
tf.logging.info('%s Supervisor' % datetime.now())
sess_config = tf.ConfigProto(
# Get a session.
sess = sv.prepare_or_wait_for_session(target, config=sess_config)
# Start the queue runners.
queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
sv.start_queue_runners(sess, queue_runners)
tf.logging.info('Started %d queues for processing input data.',
if is_chief:
sv.start_queue_runners(sess, chief_queue_runners)
# Train, checking for Nans. Concurrently run the summary operation at a
# specified interval. Note that the summary_op and train_op never run
# simultaneously in order to prevent running out of GPU memory.
#sess = sv.managed_session(target)
next_summary_time = time.time() + FLAGS.save_summaries_secs
while not sv.should_stop():
start_time = time.time()
loss_value, step = sess.run([train_op, global_step])
assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
if step > FLAGS.max_steps:
duration = time.time() - start_time
if step % 10 == 0:
examples_per_sec = FLAGS.batch_size / float(duration)
format_str = ('Worker %d: %s: step %d, loss = %.2f'
'(%.1f examples/sec; %.3f sec/batch)')
tf.logging.info(format_str %
(FLAGS.task_id, datetime.now(), step, loss_value,
examples_per_sec, duration))
# Determine if the summary_op should be run on the chief worker.
if is_chief and next_summary_time < time.time():
tf.logging.info('Running Summary operation on the chief.')
summary_str = sess.run(summary_op)
sv.summary_computed(sess, summary_str)
tf.logging.info('Finished running Summary operation.')
# Determine the next time for running the summary.
next_summary_time += FLAGS.save_summaries_secs
if is_chief:
tf.logging.info('About to execute sync_clean_up_op!')
# Stop the supervisor. This also waits for service threads to finish.

SyncReplicasOptimizer creates a local variable (the local step variable). However, VariableDeviceChooser does not distinguish global variables from local ones, so this will not work until we fix the device chooser. Thanks for reporting, though.

I am also concerned about this issue. Can you post your command line here?


pytorch isn't running on gpu while true

I want to train on my local GPU, but training only runs on the CPU even though torch.cuda.is_available() returns True and I can see my GPU. How can I fix this?
my CNN model:
import torch.nn as nn
import torch.nn.functional as F
from PIL import ImageFile
# define the CNN architecture
class Net(nn.Module):
    """Three-block convolutional classifier that outputs 133 class scores.

    Each block is a 3x3 convolution with padding 1 (spatial size preserved),
    a ReLU, and a 2x2 max-pool that halves height and width. ``fc1`` is sized
    for a 64 x 28 x 28 feature map, which corresponds to 3 x 224 x 224 inputs
    (224 / 2 / 2 / 2 = 28).
    """

    def __init__(self):
        super(Net, self).__init__()
        # convolutional layers: 3 -> 16 -> 32 -> 64 channels
        self.conv1 = nn.Conv2d(3, 16, 3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)
        # max pooling layer (halves height and width)
        self.pool = nn.MaxPool2d(2, 2)
        # linear layer (64 * 28 * 28 -> 500)
        self.fc1 = nn.Linear(64 * 28 * 28, 500)
        # linear output layer (500 -> 133 classes)
        self.fc2 = nn.Linear(500, 133)
        # dropout layer (p=0.25)
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        """Return raw class scores of shape (batch, 133) for image batch ``x``."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        # Flatten per sample. Keeping the batch dimension explicit makes a
        # wrongly sized input fail in fc1 instead of being silently folded
        # into a different batch size, as view(-1, 64 * 28 * 28) would do.
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        # hidden layer, with relu activation function
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        # output layer: raw scores, no activation
        x = self.fc2(x)
        return x
#-#-# You do NOT have to modify the code below this line. #-#-#
# instantiate the CNN
model_scratch = Net()
# move tensors to GPU if CUDA is available
if use_cuda:
model_scratch = model_scratch.cuda()
train function :
def train(n_epochs, loaders, model, optimizer, criterion, use_cuda, save_path):
    """Train ``model`` and save its weights whenever validation loss improves.

    Args:
        n_epochs: number of passes over ``loaders['train']``.
        loaders: dict with 'train' and 'valid' data loaders; each must expose
            a ``.dataset`` attribute supporting ``len()``.
        model: the network to optimize (updated in place).
        optimizer: optimizer built over ``model``'s parameters.
        criterion: loss function called as ``criterion(output, target)``.
        use_cuda: when true, every batch is moved to the GPU with ``.cuda()``;
            the model itself must already have been moved by the caller.
        save_path: file that receives ``model.state_dict()`` on improvement.

    Returns:
        The trained model (the same object that was passed in).
    """
    # initialize tracker for minimum validation loss
    valid_loss_min = float("inf")
    # Sizes come from the loaders that were passed in, not from globals.
    n_train = len(loaders['train'].dataset)
    n_valid = len(loaders['valid'].dataset)

    for epoch in range(1, n_epochs + 1):
        # initialize variables to monitor training and validation loss
        train_loss = 0.0
        valid_loss = 0.0

        # train the model (dropout active)
        model.train()
        for data, target in loaders['train']:
            # move to GPU
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            # clear the gradients of all optimized variables
            optimizer.zero_grad()
            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(data)
            # calculate the batch loss
            loss = criterion(output, target)
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            # perform a single optimization step (parameter update)
            optimizer.step()
            # accumulate the summed loss; it is averaged per sample below
            train_loss += loss.item() * data.size(0)

        # validate the model (dropout disabled, no gradient bookkeeping)
        model.eval()
        with torch.no_grad():
            for data, target in loaders['valid']:
                # move to GPU
                if use_cuda:
                    data, target = data.cuda(), target.cuda()
                output = model(data)
                # calculate the batch loss
                loss = criterion(output, target)
                valid_loss += loss.item() * data.size(0)

        # calculate average losses
        train_loss = train_loss / n_train
        valid_loss = valid_loss / n_valid

        # print training/validation statistics
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch, train_loss, valid_loss))

        # save the model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(
                valid_loss_min, valid_loss))
            torch.save(model.state_dict(), save_path)
            valid_loss_min = valid_loss

    # return trained model
    return model
# train the model
loaders_scratch = {'train': train_loader,'valid': valid_loader,'test': test_loader}
model_scratch = train(100, loaders_scratch, model_scratch, optimizer_scratch,
criterion_scratch, use_cuda, 'model_scratch.pt')
# load the model that got the best validation accuracy
Although torch.cuda.is_available() returns True, training still does not run on the GPU.
It only runs on the CPU.
The picture below shows that I am running on the CPU at 62% usage.
To utilize cuda in pytorch you have to specify that you want to run your code on gpu device.
a line of code like:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
will determine whether you have cuda available and if so, you will have it as your device.
later in the code you have to pass your tensors and model to this device:
net = net.to(device)
and do the same for your other tensors that need to go to gpu, like test and training values.
If you are experiencing an issue where your model is only using the CPU during training even though your GPU is available, it's likely due to the data loading and transformation process. When loading images from your local directory and applying transforms on your data, the majority of the time during training is spent on the data loading process, which is performed on the CPU.
To resolve this issue, you can preprocess your data by applying your custom transforms once and then saving the results. This way, when you load the preprocessed data, you can take advantage of the GPU's performance during training. This can help to significantly improve the training time of your model.
In summary, if you are facing a problem with model using CPU instead of GPU during training, it could be due to the data loading process. To fix this, preprocess your data and save the results, then use the preprocessed data while training. This will allow you to take advantage of the GPU's performance and reduce training time.

How to export test values on Tensorflow

I'm using a similar code to this as main train/test database and this to run the model.
I can print the predictions as JSON, but I can't print the test values to see which prediction refers to which test example.
How can I do that?
I would like to export the tested data.
Here is my code for importing the data:
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
"""A dataset loader for imports85.data."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import numpy as np
import tensorflow as tf
import pandas as pd # pylint: disable=g-import-not-at-top
except ImportError:
# Order is important for the csv-readers, so we use an OrderedDict here.
# Column names paired with a sample default value, in csv column order.
# The Python type of each default also fixes the column's type.
_COLUMN_DEFAULTS = (
    ("mes", ""),
    ("marca", ""),
    ("linha", ""),
    ("grupo", ""),
    ("capacidade", 0.0),
    ("grade", 0.0),
    ("custo", 0.0),
    ("benef", ""),
    ("desenvolvimento", ""),
    ("leadtime", 0.0),
)

# Ordered mapping: column name -> one-element list holding its default.
defaults = collections.OrderedDict()
for _column, _default in _COLUMN_DEFAULTS:
    defaults[_column] = [_default]

# Ordered mapping: column name -> Python type of that column's default.
types = collections.OrderedDict()
for _column, _values in defaults.items():
    types[_column] = type(_values[0])
def dataset(file_name="treino.csv", y_name="leadtime", train_fraction=0.7):
"""Load the imports85 data as a (train,test) pair of `Dataset`.
Each dataset generates (features_dict, label) pairs.
y_name: The name of the column to use as the label.
train_fraction: A float, the fraction of data to use for training. The
remainder will be used for evaluation.
A (train,test) pair of `Datasets`
# Download and cache the data
path = file_name
# Define how the lines of the file should be parsed
def decode_line(line):
    """Parse one ';'-separated csv line into a (features_dict, label) pair."""
    # One default per column, in file order; the defaults also determine
    # the type each field is decoded to.
    column_defaults = list(defaults.values())
    fields = tf.decode_csv(line, column_defaults, field_delim=';')
    # Pair every column name with its decoded field.
    features_dict = dict(zip(defaults.keys(), fields))
    # The label column is handed back separately rather than as a feature.
    label = features_dict.pop(y_name)
    return features_dict, label
def has_no_question_marks(line):
    """Return a boolean tensor that is True when `line` contains no "?"."""
    # Splitting on the empty delimiter yields the individual characters.
    characters = tf.string_split(line[tf.newaxis], "").values
    # `~` negates the symbolic "any character equals '?'" result.
    return ~tf.reduce_any(tf.equal(characters, "?"))
def in_training_set(line):
    """Return a boolean tensor: True when `line` falls in the training split."""
    # Hashing the line gives a split that is deterministic per example, so it
    # stays the same across restarts and does not require caching the data
    # (unlike a random split).
    num_buckets = 1000000
    cutoff = int(train_fraction * num_buckets)
    bucket_id = tf.string_to_hash_bucket_fast(line, num_buckets)
    # Buckets below the cutoff make up the training fraction.
    return bucket_id < cutoff
def in_test_set(line):
    """Returns a boolean tensor, true if the line is in the test set."""
    # Items not in the training set are in the test set.
    # This line must use `~` instead of `not` because `not` only works on python
    # booleans but we are dealing with symbolic tensors.
    return ~in_training_set(line)
base_dataset = (tf.contrib.data
# Get the lines from the file.
# drop lines with question marks.
train = (base_dataset
# Take only the training-set lines.
# Decode each line into a (features_dict, label) pair.
# Cache data so you only decode the file once.
# Do the same for the test-set.
test = (base_dataset.filter(in_test_set).cache().map(decode_line))
return train, test
def raw_dataframe(file_name="treino.csv", field_delim=";"):
    """Load the csv data as a pd.DataFrame.

    Args:
      file_name: path of the csv file to read. Defaults to the same file that
        `dataset()` reads.
      field_delim: column separator. Defaults to ';', the delimiter that
        `decode_line` uses for the same file.

    Returns:
      A DataFrame whose columns are named and typed after `types`; "?" cells
      are read as missing values.
    """
    # The file has no header row, so column names come from `types`.
    df = pd.read_csv(file_name, sep=field_delim, names=list(types.keys()),
                     dtype=types, na_values="?")
    return df
def load_data(y_name="leadtime", train_fraction=0.7, seed=None):
    """Load the csv data and split it into train and test sets.

    Args:
      y_name: the column to return as the label.
      train_fraction: the fraction of the dataset to use for training.
      seed: The random seed to use when sampling the training rows. `None`
        generates a different split every run.

    Returns:
      a pair of pairs where the first pair is the training data, and the second
      is the test data:
      `(x_train, y_train), (x_test, y_test) = load_data(...)`
      `x` contains a pandas DataFrame of features, while `y` contains the label
      column.
    """
    # Load the raw data columns.
    data = raw_dataframe()
    # Delete rows with unknowns
    data = data.dropna()
    # Randomly pick the training rows; every remaining row is test data.
    x_train = data.sample(frac=train_fraction, random_state=seed)
    x_test = data.drop(x_train.index)
    # Extract the label from the features dataframe.
    y_train = x_train.pop(y_name)
    y_test = x_test.pop(y_name)
    return (x_train, y_train), (x_test, y_test)
and here is my code to test, evaluate and predict
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
"""Regression using the DNNRegressor Estimator."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import pandas as pd
import importar_dados # pylint: disable=g-bad-import-order
STEPS = 100
def my_dnn_regression_fn(features, labels, mode, params):
"""A model function implementing DNN regression for a custom Estimator."""
# Extract the input into a dense layer, according to the feature_columns.
top = tf.feature_column.input_layer(features, params["feature_columns"])
# Iterate over the "hidden_units" list of layer sizes, default is [100].
for units in params.get("hidden_units", [100]):
# Add a hidden layer, densely connected on top of the previous layer.
top = tf.layers.dense(inputs=top, units=units, activation=tf.nn.relu)
# Connect a linear output layer on top.
output_layer = tf.layers.dense(inputs=top, units=1)
# Reshape the output layer to a 1-dim Tensor to return predictions
predictions = tf.squeeze(output_layer, 1)
if mode == tf.estimator.ModeKeys.PREDICT:
# In `PREDICT` mode we only need to return predictions.
return tf.estimator.EstimatorSpec(
mode=mode, predictions={"leadtime": predictions})
# Calculate loss using mean squared error
average_loss = tf.losses.mean_squared_error(labels, predictions)
# Pre-made estimators use the total_loss instead of the average,
# so report total_loss for compatibility.
batch_size = tf.shape(labels)[0]
total_loss = tf.to_float(batch_size) * average_loss
if mode == tf.estimator.ModeKeys.TRAIN:
optimizer = params.get("optimizer", tf.train.AdamOptimizer)
optimizer = optimizer(params.get("learning_rate", None))
train_op = optimizer.minimize(
loss=average_loss, global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(
mode=mode, loss=total_loss, train_op=train_op)
# In evaluation mode we will calculate evaluation metrics.
assert mode == tf.estimator.ModeKeys.EVAL
# Calculate root mean squared error
rmse = tf.metrics.root_mean_squared_error(labels, predictions)
# Add the rmse to the collection of evaluation metrics.
eval_metrics = {"rmse": rmse}
return tf.estimator.EstimatorSpec(
# Report sum of error for compatibility with pre-made estimators
def main(argv):
"""Builds, trains, and evaluates the model."""
assert len(argv) == 1
(train, test) = importar_dados.dataset()
# Switch the labels to units of thousands for better convergence.
def normalize_lt(features, labels):
return features, labels / LT_NORM_FACTOR
train = train.map(normalize_lt)
test = test.map(normalize_lt)
# Build the training input_fn.
def input_train():
return (
# Shuffling with a buffer larger than the data set ensures
# that the examples are well mixed.
# Repeat forever
# Build the validation input_fn.
def input_test():
return (test.shuffle(1000).batch(128)
# The first way assigns a unique weight to each category. To do this you must
# specify the category's vocabulary (values outside this specification will
# receive a weight of zero). Here we specify the vocabulary using a list of
# options. The vocabulary can also be specified with a vocabulary file (using
# `categorical_column_with_vocabulary_file`). For features covering a
# range of positive integers use `categorical_column_with_identity`.
marca_vocab = ["ANIMALE","FABULA","FARM","A.BRAND","F.Y.I","MAS ANIMALE"]
marca = tf.feature_column.categorical_column_with_vocabulary_list(
key="marca", vocabulary_list=marca_vocab)
mes_vocab = ["1","2","3","4","5","6","7","8","9","10","11","12"]
mes = tf.feature_column.categorical_column_with_vocabulary_list(
key="mes", vocabulary_list=mes_vocab)
linha = tf.feature_column.categorical_column_with_vocabulary_list(
key="linha", vocabulary_list=linha_vocab)
grupo = tf.feature_column.categorical_column_with_vocabulary_list(
key="grupo", vocabulary_list=grupo_vocab)
benef_vocab = ["S","N"]
benef = tf.feature_column.categorical_column_with_vocabulary_list(
key="benef", vocabulary_list=benef_vocab)
desenvolvimento_vocab = ["INT","EX"]
desenvolvimento = tf.feature_column.categorical_column_with_vocabulary_list(
key="desenvolvimento", vocabulary_list=desenvolvimento_vocab)
# make = tf.feature_column.categorical_column_with_hash_bucket(
# key="make", hash_bucket_size=50)
feature_columns = [
# Since this is a DNN model, convert categorical columns from sparse
# to dense.
# Wrap them in an `indicator_column` to create a
# one-hot vector from the input.
# Or use an `embedding_column` to create a trainable vector for each
# index.
# tf.feature_column.embedding_column(make, dimension=3),
# Build a custom Estimator, using the model_fn.
# `params` is passed through to the `model_fn`.
model = tf.estimator.Estimator(
"feature_columns": feature_columns,
"learning_rate": 0.001,
"optimizer": tf.train.AdamOptimizer,
"hidden_units": [100,500,100]
# Train the model.
model.train(input_fn=input_train, steps=STEPS)
# Evaluate how the model performs on data it has not yet seen.
eval_result = model.evaluate(input_fn=input_test)
pred_result = model.predict(input_fn = input_test,
sess = tf.Session()
# Print the Root Mean Square Error (RMSE).
print("\n" + 80 * "*")
print("\nRMS error for the test set: {:.0f} Dias"
.format(LT_NORM_FACTOR * eval_result["rmse"]))
#prediction_df = pd.DataFrame(list(pred_result))
if __name__ == "__main__":
# The Estimator periodically generates "INFO" logs; make these logs visible.

Calling the same batch tensorflow

I have a tensorflow graph that is reading from .tfrecords files, as described in the process here (taken from Tflow docs):
def read_my_file_format(filename_queue):
    """Read one record from `filename_queue`; return (processed_example, label).

    Template code: `tf.SomeReader`, `tf.some_decoder` and `some_processing`
    are placeholders for the reader, decoder and preprocessing of your format.
    """
    record_reader = tf.SomeReader()
    # The record key is not used further.
    _, serialized = record_reader.read(filename_queue)
    raw_example, label = tf.some_decoder(serialized)
    return some_processing(raw_example), label
def input_pipeline(filenames, batch_size, num_epochs=None):
    """Build a shuffled batch pipeline over `filenames`.

    Args:
      filenames: list of input file names.
      batch_size: number of examples per returned batch.
      num_epochs: how many times to cycle through `filenames`; passed
        unchanged to `tf.train.string_input_producer`.

    Returns:
      An (example_batch, label_batch) pair of tensors.
    """
    filename_queue = tf.train.string_input_producer(
        filenames, num_epochs=num_epochs, shuffle=True)
    example, label = read_my_file_format(filename_queue)
    # min_after_dequeue defines how big a buffer we will randomly sample
    #   from -- bigger means better shuffling but slower start up and more
    #   memory used.
    # capacity must be larger than min_after_dequeue and the amount larger
    #   determines the maximum we will prefetch.  Recommendation:
    #   min_after_dequeue + (num_threads + a small safety margin) * batch_size
    min_after_dequeue = 10000
    capacity = min_after_dequeue + 3 * batch_size
    example_batch, label_batch = tf.train.shuffle_batch(
        [example, label], batch_size=batch_size, capacity=capacity,
        min_after_dequeue=min_after_dequeue)
    return example_batch, label_batch
In my code, a single batch (as returned by input_pipeline above) is used as an input to multiple networks (let's call them A,B) in my graph per iteration. So if I call:
#...define graph...
does tensorflow guarantee that it will use the same batch for each sess.run call?
If input of model A and B is example_batch and you evaluate the models simultaneously (as in your example sess.run([A,B])) then I expect to see the same batch. Because both models are fed by the same dequeuing operation. As soon as you break the synchronization (i.e., running separately) then inputs will be different.
The following code snippet looks trivial but shows my point.
import tensorflow as tf
import numpy as np
import time
batch_size = 16
input_shape, target_shape = (128), () # input with dimensionality 128.
num_threads = 4 # for input pipeline
queue_capacity = 10 # for input pipeline
def get_random_data_sample():
    """Return one (input, target) sample: normal-noise input, constant target 1."""
    # Target is always the int32 value 1.
    sample_target = np.int32(1)
    # Input is float32 noise drawn from N(0, 1) with shape `input_shape`.
    sample_input = np.float32(np.random.normal(0, 1, input_shape))
    # NOTE(review): the original comment here mentions sleeping randomly
    # between 1 and 3 seconds, but no sleep call is present in this snippet.
    return sample_input, sample_target
tensorflow_input, tensorflow_target = tf.py_func(get_random_data_sample, [], [tf.float32, tf.int32])
def create_model(inputs, hidden_size, num_hidden_layers):
# Create a dummy dense network.
dense_layer = inputs
for i in range(num_hidden_layers):
dense_layer = tf.layers.dense(
kernel_initializer= tf.zeros_initializer(),
bias_initializer= tf.zeros_initializer(),
return dense_layer, inputs
# input pipeline
batch_inputs, batch_targets = tf.train.batch([tensorflow_input, tensorflow_target],
shapes=[input_shape, target_shape],
# Different models A and B using the same input operation.
modelA, modelA_inputs = create_model(batch_inputs, 32, 1) # 1 hidden layer
modelB, modelB_inputs = create_model(batch_inputs, 64, 2) # 2 hidden layers
sess = tf.InteractiveSession()
sess.run(tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()))
sess = tf.InteractiveSession()
sess.run(tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()))
# (1) Evaluate the models simultaneously.
resultA, resultB, inputsA, inputsB = sess.run([modelA, modelB, modelA_inputs, modelB_inputs])
assert((inputsA == inputsB).all())
# (2) Evaluate the models separately.
resultA2, inputsA2 = sess.run([modelA, modelA_inputs])
resultB2, inputsB2 = sess.run([modelB, modelB_inputs])
assert((inputsA2 == inputsB2).all())
Naturally the second evaluation uses different input batches and fails assertion. I hope this helps.

Distributed Tensorflow: good example for synchronous training on CPUs

I am new to distributed tensorflow and am looking for a good example to perform synchronous training on CPUs.
I have already tried the Distributed Tensorflow Example and it can perform the asynchronous training successfully over 1 parameter server (1 machine with 1 CPU) and 3 workers (each worker = 1 machine with 1 CPU). However, when it comes to the synchronous training, I am not able to run it correctly, although I have followed the tutorial of
SyncReplicasOptimizer(V1.0 and V2.0).
I have inserted the official SyncReplicasOptimizer code into the working asynchronous training example, but the training process is still asynchronous. My detailed code is as follows. Any code related to synchronous training is within the blocks marked with ******.
import tensorflow as tf
import sys
import time
# Module-level setup for the distributed MNIST example: cluster layout,
# command-line flags, the per-task server and the training hyper-parameters.
# cluster specification ----------------------------------------------------------------------
# One parameter-server host and three worker hosts (placeholder host names).
parameter_servers = ["xx1.edu:2222"]
workers = ["xx2.edu:2222", "xx3.edu:2222", "xx4.edu:2222"]
cluster = tf.train.ClusterSpec({"ps":parameter_servers, "worker":workers})
# input flags
# job_name selects the role of this process; task_index selects which entry of
# that job's host list it is.
tf.app.flags.DEFINE_string("job_name", "", "Either 'ps' or 'worker'")
tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job")
FLAGS = tf.app.flags.FLAGS
# start a server for a specific task
# The flags must be defined before this line because it reads them.
server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)
# Parameters ----------------------------------------------------------------------
# N matches the number of entries in `workers` above; keep the two in sync.
N = 3 # number of replicas
learning_rate = 0.001
# 21 epochs in total, divided across the N replicas (7 each with N = 3).
training_epochs = int(21/N)
batch_size = 100
# Network Parameters
n_input = 784 # MNIST data input (img shape: 28*28)
n_hidden_1 = 256 # 1st layer number of features
n_hidden_2 = 256 # 2nd layer number of features
n_classes = 10 # MNIST total classes (0-9 digits)
# Role dispatch: a "ps" process only announces itself; a "worker" process loads
# MNIST and builds the replicated graph that follows.
# NOTE(review): no blocking call (e.g. server.join()) is visible in the "ps"
# branch of this copy — confirm the parameter-server process stays alive.
if FLAGS.job_name == "ps":
print("--- Parameter Server Ready ---")
elif FLAGS.job_name == "worker":
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
# Between-graph replication
# NOTE(review): the replica_device_setter call below is truncated in this copy
# (remaining arguments and the closing "))" + ":" are missing) — restore from
# the original post.
with tf.device(tf.train.replica_device_setter(
worker_device="/job:worker/task:%d" % FLAGS.task_index,
# count the number of updates
# Scalar, non-trainable int32 counter initialised to 0; passed to minimize() later.
global_step = tf.get_variable('global_step', [],
initializer = tf.constant_initializer(0),
trainable = False,
dtype = tf.int32)
# tf Graph input
# x holds flattened images (n_input features), y holds one-hot labels
# (n_classes entries); the leading dimension is left unspecified.
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])
# Create model
def multilayer_perceptron(x, weights, biases):
    """Build a perceptron with two ReLU hidden layers and a linear output.

    Args:
        x: Input tensor fed to the first hidden layer.
        weights: Mapping with keys 'h1', 'h2' and 'out' holding the weight
            matrices of the two hidden layers and the output layer.
        biases: Mapping with keys 'b1', 'b2' and 'out' holding the matching
            bias vectors.

    Returns:
        The output-layer tensor; no activation is applied to it.
    """
    # First hidden layer: affine transform followed by ReLU.
    hidden_1 = tf.nn.relu(tf.add(tf.matmul(x, weights['h1']), biases['b1']))
    # Second hidden layer: same pattern, fed by the first.
    hidden_2 = tf.nn.relu(
        tf.add(tf.matmul(hidden_1, weights['h2']), biases['b2']))
    # Output layer stays linear.
    return tf.matmul(hidden_2, weights['out']) + biases['out']
# Model variables, forward pass and loss for the worker graph.
# NOTE(review): both dict literals below have lost their closing "}" in this
# copy — restore from the original post.
# Store layers weight & bias
# Weight matrices are drawn from a random normal distribution; shapes chain
# n_input -> n_hidden_1 -> n_hidden_2 -> n_classes.
weights = {
'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
biases = {
'b1': tf.Variable(tf.random_normal([n_hidden_1])),
'b2': tf.Variable(tf.random_normal([n_hidden_2])),
'out': tf.Variable(tf.random_normal([n_classes]))
# Construct model
pred = multilayer_perceptron(x, weights, biases)
# Define loss and optimizer
# Mean softmax cross-entropy between the model output and the one-hot labels.
# NOTE(review): the arguments are passed positionally; newer TensorFlow 1.x
# releases presumably require labels=/logits= keywords here — confirm against
# the installed version.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
# Synchronous-training setup: wrap the base optimizer in SyncReplicasOptimizer,
# build the train/accuracy/init ops and create the Supervisor.
# ************************* SyncReplicasOpt Version 1.0 *****************************************************
''' This optimizer collects gradients from all replicas, "summing" them,
then applying them to the variables in one shot, after which replicas can fetch the new variables and continue. '''
# Create the base optimizer that will be wrapped (Adam is used here).
opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
# Wrap the optimizer with sync_replicas_optimizer with N replicas: at each step the optimizer collects N gradients before applying to variables.
opt = tf.train.SyncReplicasOptimizer(opt, replicas_to_aggregate=N,
replica_id=FLAGS.task_index, total_num_replicas=N)
# Now you can call `minimize()` or `compute_gradients()` and `apply_gradients()` normally
train = opt.minimize(cost, global_step=global_step)
# You can now call get_init_tokens_op() and get_chief_queue_runner().
# Note that get_init_tokens_op() must be called before creating session
# because it modifies the graph.
# NOTE(review): init_token_op is created here but no statement in the visible
# code ever runs it — see the chief-only section of the training block.
init_token_op = opt.get_init_tokens_op()
chief_queue_runner = opt.get_chief_queue_runner()
# **************************************************************************************
# Test model
# Fraction of examples whose arg-max prediction equals the arg-max label.
correct = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct, "float"))
# Initializing the variables
# This only builds the init op; nothing is executed at this point despite the
# message printed below.
# NOTE(review): initialize_all_variables() is the older name of
# global_variables_initializer() — confirm which one the installed version expects.
init_op = tf.initialize_all_variables()
print("---Variables initialized---")
# **************************************************************************************
# Task 0 acts as the chief worker.
is_chief = (FLAGS.task_index == 0)
# Create a "supervisor", which oversees the training process.
# NOTE(review): the Supervisor call is truncated in this copy (remaining
# arguments and the closing parenthesis are missing) — restore from the original post.
sv = tf.train.Supervisor(is_chief=is_chief,
# **************************************************************************************
# Training driver: obtain a session from the Supervisor, start the chief's
# queue runner, run the training op over all batches and epochs while timing
# only the sess.run() calls, then print the test accuracy.
with sv.prepare_or_wait_for_session(server.target) as sess:
# **************************************************************************************
# After the session is created by the Supervisor and before the main while loop:
if is_chief:
sv.start_queue_runners(sess, [chief_queue_runner])
# Insert initial tokens to the queue.
# NOTE(review): no statement follows the comment above in this copy;
# presumably a run of init_token_op belongs here — confirm against the
# original post, since it is never run elsewhere in the visible code.
# **************************************************************************************
# Statistics
# Accumulated wall-clock seconds spent inside sess.run(train, ...).
net_train_t = 0
# Training
for epoch in range(training_epochs):
# Number of full batches per epoch (recomputed each epoch, value does not change).
total_batch = int(mnist.train.num_examples/batch_size)
# Loop over all batches
for i in range(total_batch):
batch_x, batch_y = mnist.train.next_batch(batch_size)
# ======== net training time ========
begin_t = time.time()
sess.run(train, feed_dict={x: batch_x, y: batch_y})
end_t = time.time()
net_train_t += (end_t - begin_t)
# ===================================
# Calculate training accuracy
# acc = sess.run(accuracy, feed_dict={x: mnist.train.images, y: mnist.train.labels})
# print("Epoch:", '%04d' % (epoch+1), " Train Accuracy =", acc)
print("Epoch:", '%04d' % (epoch+1))
print("Training Finished!")
print("Net Training Time: ", net_train_t, "second")
# Testing
# NOTE(review): accuracy.eval() is called without an explicit session, so it
# presumably relies on still being inside the `with ... as sess` block —
# indentation is lost in this copy, confirm.
print("Testing Accuracy = ", accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))
Anything wrong with my code? Or can I have a good example to follow?
I think your question is answered by the comments on TensorFlow issue #9596.
The problem is caused by bugs in the new version of tf.train.SyncReplicasOptimizer(). You can use the old version of this API to avoid it.
Another solution comes from the TensorFlow Distributed Benchmarks. If you look at the source code, you will find that they synchronize the workers manually through a TensorFlow queue. In experiments, this benchmark behaves exactly as you expect.
Hope these comments and resources can help you solve your problem. Thanks!
I am not sure whether you would be interested in a user-transparent distributed TensorFlow that uses MPI as its backend. We have recently developed one such version with MaTEx: https://github.com/matex-org/matex.
With it, you do not need to write any SyncReplicasOptimizer code for distributed TensorFlow, since all the changes are hidden from the user.
Hope this helps.
One issue is that you need to specify an aggregation_method in the call to minimize() for it to run synchronously:
# The question's minimize() call with the gradient aggregation method passed explicitly.
# NOTE(review): ADD_N appears to be TensorFlow's default aggregation method, so
# this may not change behaviour — confirm against the tf.gradients documentation.
train = opt.minimize(cost, global_step=global_step, aggregation_method=tf.AggregationMethod.ADD_N)

Exporting cifar10 model from checkpoint file to tensorflow serving

I tried to modify inception_export.py for the CIFAR10 model, but I get the following error:
raise type(e)(node_def, op, message) tensorflow.python.framework.errors.InvalidArgumentError: Assign requires shapes of both tensors to match. lhs shape= [18,384] rhs shape= [2304,384]
[[Node: save/Assign_5 = Assign[T=DT_FLOAT, _class=["loc:#local3/weights"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/cpu:0"](local3/weights, save/restore_slice_5)]] Caused by op u'save/Assign_5', defined at:
I am still very new to TensorFlow, so any help is much appreciated. Thanks!
EDIT1: Here is my code. I haven't installed TensorFlow Serving, so the related block is commented out. I also changed image_size to 24 to fit the CIFAR10 model.
# Copyright 2016 Google Inc. All Rights Reserved.
#!/usr/bin/env python2.7
"""Modified for CIFAR10 model from https://github.com/tensorflow/serving/blob/master/tensorflow_serving/example/inception_export.py
import os.path
import sys
# This is a placeholder for a Google-internal import.
import tensorflow as tf
from tensorflow.models.image.cifar10 import cifar10
#from inception import inception_model
#from tensorflow_serving.session_bundle import exporter
# Command-line flags and file locations used by the CIFAR10 export script.
# Directory that export() searches for the latest training checkpoint.
tf.app.flags.DEFINE_string('checkpoint_dir', '/tmp/cifar10_train',
"""Directory where to read training checkpoints.""")
# Destination directory used by the (currently disabled) export step.
tf.app.flags.DEFINE_string('export_dir', '/tmp/cifar10_export',
"""Directory where to export inference model.""")
# Side length, in pixels, that input images are resized to before inference.
tf.app.flags.DEFINE_integer('image_size', 24,
"""Needs to provide same value as in training.""")
FLAGS = tf.app.flags.FLAGS
# Directory containing this script; the two data files below are looked up next to it.
WORKING_DIR = os.path.dirname(os.path.realpath(__file__))
# NOTE(review): these are ImageNet label files carried over from the Inception
# exporter. export() still opens both, but uses their contents only in
# commented-out lines — confirm whether they are needed for CIFAR10.
SYNSET_FILE = os.path.join(WORKING_DIR, 'imagenet_lsvrc_2015_synsets.txt')
METADATA_FILE = os.path.join(WORKING_DIR, 'imagenet_metadata.txt')
def export():
# Build a single-image CIFAR10 inference graph (JPEG decode, central crop,
# resize to FLAGS.image_size, rescale to [-1, 1], cifar10.inference, top-k),
# restore the moving-average variables from the latest checkpoint in
# FLAGS.checkpoint_dir and print the step it was loaded from. The final
# export-to-serving section is disabled inside a quoted block.
#
# Review notes (all added notes are kept here because the quoted blocks below
# are unterminated in this copy, so the body is left untouched):
# - NOTE(review): several lines were lost in extraction — the closing quotes of
#   both triple-quoted blocks, the tails of the resize_bilinear,
#   index_to_string and ExponentialMovingAverage calls, and presumably an
#   `else:` before the "No checkpoint file found" print. Restore from the
#   original post before running.
# - NOTE(review): the synset/metadata loading reads the ImageNet files, but
#   `synsets` and `texts` are used only in commented-out lines;
#   class_descriptions is hard-coded with the ten CIFAR10 class names.
# - NOTE(review): the "0th index is an unused background class" comment is
#   inherited from the Inception exporter; the hard-coded list has no
#   background entry — confirm the index-to-label mapping.
# - NOTE(review): `shape=(1)` is the integer 1, not a one-element tuple.
# - NOTE(review): NUM_TOP_CLASSES is not defined anywhere in the visible file.
# - NOTE(review): tf.sub / tf.mul are presumably the pre-1.0 names of
#   tf.subtract / tf.multiply — confirm against the installed version.
# - NOTE(review): the reported restore error (graph shape [18, 384] vs
#   checkpoint shape [2304, 384]; 2304 = 18 * 128) suggests cifar10.inference
#   flattens its feature map using its own batch-size flag (presumably 128)
#   rather than the single-image batch built here — confirm in cifar10.py.
"""can be deleted if my simply define the constant string manually below?
# Create index->synset mapping
synsets = []
with open(SYNSET_FILE) as f:
synsets = f.read().splitlines()
# Create synset->metadata mapping
texts = {}
with open(METADATA_FILE) as f:
for line in f.read().splitlines():
parts = line.split('\t')
assert len(parts) == 2
texts[parts[0]] = parts[1]
with tf.Graph().as_default():
# Build inference model.
# Please refer to Tensorflow inception model for details.
# Input transformation.
# TODO(b/27776734): Add batching support.
jpegs = tf.placeholder(tf.string, shape=(1))
image_buffer = tf.squeeze(jpegs, [0])
# Decode the string as an RGB JPEG.
# Note that the resulting image contains an unknown height and width
# that is set dynamically by decode_jpeg. In other words, the height
# and width of image is unknown at compile-time.
image = tf.image.decode_jpeg(image_buffer, channels=3)
# After this point, all image pixels reside in [0,1)
# until the very end, when they're rescaled to (-1, 1). The various
# adjust_* ops all require this range for dtype float.
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
# Crop the central region of the image with an area containing 87.5% of
# the original image.
image = tf.image.central_crop(image, central_fraction=0.875)
# Resize the image to the original height and width.
image = tf.expand_dims(image, 0)
image = tf.image.resize_bilinear(image,
[FLAGS.image_size, FLAGS.image_size],
image = tf.squeeze(image, [0])
# Finally, rescale to [-1,1] instead of [0, 1)
image = tf.sub(image, 0.5)
image = tf.mul(image, 2.0)
images = tf.expand_dims(image, 0)
# Run inference.
logits = cifar10.inference(images)
# Transform output to topK result.
values, indices = tf.nn.top_k(logits, NUM_TOP_CLASSES)
# Create a constant string Tensor where the i'th element is
# the human readable class description for the i'th index.
# Note that the 0th index is an unused background class
# (see inception model definition code).
class_descriptions = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
# for s in synsets:
# class_descriptions.append(texts[s])
class_tensor = tf.constant(class_descriptions)
classes = tf.contrib.lookup.index_to_string(tf.to_int64(indices),
# Restore variables from training checkpoint.
variable_averages = tf.train.ExponentialMovingAverage(
variables_to_restore = variable_averages.variables_to_restore()
saver = tf.train.Saver(variables_to_restore)
with tf.Session() as sess:
# Restore variables from training checkpoints.
ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
if ckpt and ckpt.model_checkpoint_path:
saver.restore(sess, ckpt.model_checkpoint_path)
# Assuming model_checkpoint_path looks something like:
# /my-favorite-path/imagenet_train/model.ckpt-0,
# extract global_step from it.
global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
print('Successfully loaded model from %s at step=%s.' %
(ckpt.model_checkpoint_path, global_step))
print('No checkpoint file found at %s' % FLAGS.checkpoint_dir)
""" Not exporting yet because I haven't installed tensorflow serving
# Export inference model.
init_op = tf.group(tf.initialize_all_tables(), name='init_op')
model_exporter = exporter.Exporter(saver)
signature = exporter.classification_signature(
input_tensor=jpegs, classes_tensor=classes, scores_tensor=values)
model_exporter.init(default_graph_signature=signature, init_op=init_op)
model_exporter.export(FLAGS.export_dir, tf.constant(global_step), sess)
print('Successfully exported model to %s' % FLAGS.export_dir)
def main(unused_argv=None):
if __name__ == '__main__':