tensorflow doesn't work well with thrift - tensorflow-serving

I write a simple tensorflow code as follws:
class Classify():
def __init__(self):
session_conf = tf.ConfigProto(log_device_placement=True,device_count = {'GPU': 0})
self.sess = sess = tf.InteractiveSession(config=session_conf)
x = tf.Variable([1.0, 2.0])
a = tf.constant([3.0, 3.0])
with sess.as_default():
# Initialize 'x' using the run() method of its initializer op.
x.initializer.run()
# Add an op to subtract 'a' from 'x'. Run it and print the result
self.sub = tf.add(x, a)
def clf(self, word):
with self.sess.as_default():
print 'xxxxxxxxxxxxxxxxxxx'
return self.sub.eval()
it works well when i drect create an obejct and call obj.clf.
But when i use thrift, i create obj in server end, and use thrift API to call to make server run obj.clf. it first return error
E tensorflow/stream_executor/cuda/cuda_driver.cc:1296] failed to enqueue
async memcpy from host to device: CUDA_ERROR_NOT_INITIALIZED; GPU dst:
0x2305ba0700; host src: 0x2048a0000; size: 8=0x8
when i reproduce some nums of this error, they after always blocks dead when i call thrift client.
Any one work well with thrift&tensorflow?

Related

Sharing of array list or variable between 2 distributed tensorflow processes

I am presently working on Distributed tensorflow considering 2 worker processes and facing the issue of sharing variable between these two worker process.
I found tf.get_collection/tf.add_collection but still unable to get the variable value shared between the 2 processes.
Adding Few details around how I want to share the data among the worker processes in Distributed Tensorflow :
def create_variable(layer_shape):
with tf.variable_scope("share_lay"):
layers = tf.get_variable("layers", shape=layer_shape, trainable=True)
with tf.variable_scope("share_lay", reuse=tf.AUTO_REUSE):
layers = tf.get_variable("layers", shape=layer_shape, trainable=True)
return layers
def set_layer(layers):
tf.add_to_collection("layers", layers)
def get_layer(name):
return tf.get_collection(name)[0]
taskid == 0:
layers = create_variable(layer_shape)
layers = <some value>
set_layer(layers)
taskid == 1:
layers = create_variable(layer_shape)
layers = get_layer("layers")
I am getting an error when performing get_layer() as :
return tf.get_collection(name)[0]
IndexError: list index out of range
It appears that the data cannot be share between the workers
Request some suggestions regarding the same
Any suggestions / pointers is appreciated,
Thanks,
Kapil
I finally solve the same problem by using tf.train.replica_device_setter() to place the variables on parameter server and add them to a colletion. Later, I can use tf.get_collection() in any worker to return that collection, which is actually a python list. Note that tf.get_collection only return a copy of original collection. If you want to change the variables in the original collection, you should use tf.get_collecion_ref which actually returns the collection list itself.
Here is an example:
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('job_name', '',
"""One of 'ps', 'worker' """)
tf.app.flags.DEFINE_integer('task_index', 0,
"""Index of task within the job""")
cluster = tf.train.ClusterSpec(
{'ps': ['localhost:22222'],
'worker': ['localhost:22223', 'localhost:22227']})
config = tf.ConfigProto(
intra_op_parallelism_threads=1,
inter_op_parallelism_threads=1)
if FLAGS.job_name == 'ps':
server = tf.train.Server(cluster, job_name='ps', task_index=FLAGS.task_index, config=config)
server.join()
else:
server = tf.train.Server(cluster, job_name='worker', task_index=FLAGS.task_index, config=config)
with tf.device(tf.train.replica_device_setter(cluster=cluster)):
#create a colletion 'shared_list' and add two variables to the collection 'shared_list'
#note that these two variables are placed on parameter server
a = tf.Variable(name='a', initial_value=tf.constant(1.0),
collections=[tf.GraphKeys.GLOBAL_VARIABLES, 'shared_list'])
b = tf.Variable(name='b', initial_value=tf.constant(2.0),
collections=[tf.GraphKeys.GLOBAL_VARIABLES, 'shared_list'])
#now let's print out the value of a+2.0 and b+2.0 using the collection 'shared_list' from different worker
#note that tf.get_collection will return a copy of exiting collection which is actually a python list
with tf.device('/job:worker/task:%d' %FLAGS.task_index):
c = tf.get_collection('shared_list')[0] + 2.0 # a+2.0
d = tf.get_collection('shared_list')[1] + 2.0 # b+2.0
with tf.train.MonitoredTrainingSession(master=server.target,
is_chief=(FLAGS.task_index==0),
config=config) as sess:
print('this is worker %d' % FLAGS.task_index)
print(c.eval(session=sess))
print(d.eval(session=sess))
server.join()
worker 0 will print out:
this is worker 0
3.0
4.0
worker 1 will print out:
this is worker 1
3.0
4.0
Edit: work 0 modifies the variable 'a' to 10, and then worker 1 prints out the new value of 'a', which becomes 10 immediately. Actually, variable 'a' is available for both worker 0 and worker 1 because they are in distributed setting. Below is an example. Also refers to this blog in Amid Fish by Matthew Rahtz for how to share variables in distributed tensorflow. Actually, we don't need any parameter server to share variables. Any two workers can share the same variable with each other as long as the two workers create two variables having exactly the same name.
Here is the example
import tensorflow as tf
from time import sleep
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('job_name', '',
"""One of 'ps', 'worker' """)
tf.app.flags.DEFINE_integer('task_index', 0,
"""Index of task within the job""")
cluster = tf.train.ClusterSpec(
{'ps': ['localhost:22222'],
'worker': ['localhost:22223', 'localhost:22227']})
if FLAGS.job_name == 'ps':
server = tf.train.Server(cluster, job_name='ps', task_index=FLAGS.task_index)
server.join()
else:
server = tf.train.Server(cluster, job_name='worker', task_index=FLAGS.task_index)
with tf.device(tf.train.replica_device_setter(cluster=cluster)):
# create a colletion 'shared_list' and add two variables to the collection 'shared_list'
# note that these two variables are placed on parameter server
a = tf.Variable(name='a', initial_value=tf.constant(1.0),
collections=[tf.GraphKeys.GLOBAL_VARIABLES, 'shared_list'])
b = tf.Variable(name='b', initial_value=tf.constant(2.0),
collections=[tf.GraphKeys.GLOBAL_VARIABLES, 'shared_list'])
# change the value of 'a' in worker 0
if FLAGS.task_index == 0:
change_a = a.assign(10)
# print out the new value of a in worker 1 using get_collction. Note that we may need to
# use read_value() method to force the op to read the current value of a
if FLAGS.task_index == 1:
with tf.device('/job:worker/task:1'): # place read_a to worker 1
read_a = tf.get_collection('shared_list')[0].read_value() # a = 10
with tf.train.MonitoredTrainingSession(master=server.target,
is_chief=(FLAGS.task_index == 0))as sess:
if FLAGS.task_index == 0:
sess.run(change_a)
if FLAGS.task_index == 1:
sleep(1) # sleep a little bit to wait until change_a has been executed
print(read_a.eval(session=sess))
server.join()
worker 1 prints out
10

How to use Tensorflow

I've built multiple DNN and conVNN using tensorflow, and I can reach now a good accuracy. Now my question is how can I use this trained networks in real example.
I case of a convNN for computer vision, how can I use the model to classify a new picture ? can I generate something like convNN.exe that get images as input parameter that through the classification result out ?
Once you've trained the model, you should save it somewhere by adding code similar to
builder = saved_model_builder.SavedModelBuilder(export_path)
builder.add_meta_graph_and_variables(
sess, [tag_constants.SERVING],
signature_def_map={
'predict_images':
prediction_signature,
signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
classification_signature,
},
legacy_init_op=legacy_init_op)
builder.save()
Then, you can use Tensorflow serving to serve your model using a high-performance C++ server by running
bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server \
--port=9000 --model_name=mnist \
--model_base_path=/tmp/mnist_model/
Modifying the code for your model, of course. You'll need to implement a client; there's an example for MNIST here. The guts of the client would be something like:
def do_inference(hostport, work_dir, concurrency, num_tests):
"""Tests PredictionService with concurrent requests.
Args:
hostport: Host:port address of the PredictionService.
work_dir: The full path of working directory for test data set.
concurrency: Maximum number of concurrent requests.
num_tests: Number of test images to use.
Returns:
The classification error rate.
Raises:
IOError: An error occurred processing test data set.
"""
test_data_set = mnist_input_data.read_data_sets(work_dir).test
host, port = hostport.split(':')
channel = implementations.insecure_channel(host, int(port))
stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)
result_counter = _ResultCounter(num_tests, concurrency)
for _ in range(num_tests):
request = predict_pb2.PredictRequest()
request.model_spec.name = 'mnist'
request.model_spec.signature_name = 'predict_images'
image, label = test_data_set.next_batch(1)
request.inputs['images'].CopyFrom(
tf.contrib.util.make_tensor_proto(image[0], shape=[1, image[0].size]))
result_counter.throttle()
result_future = stub.Predict.future(request, 5.0) # 5 seconds
result_future.add_done_callback(
_create_rpc_callback(label[0], result_counter))
return result_counter.get_error_rate()
def main(_):
if FLAGS.num_tests > 10000:
print('num_tests should not be greater than 10k')
return
if not FLAGS.server:
print('please specify server host:port')
return
error_rate = do_inference(FLAGS.server, FLAGS.work_dir,
FLAGS.concurrency, FLAGS.num_tests)
print('\nInference error rate: %s%%' % (error_rate * 100))
if __name__ == '__main__':
tf.app.run()
This is in Python, of course, but there's no reason you couldn't use another language (e.g. Go or C++) if you wanted to create a binary executable.

getting runtime statistics with monitoredtrainingsession in tensorflow

I am trying to get my tensorflow code profile (running and memory consumption of each layers in the network) by following the runtime statistics instruction here. As far as I understand, I need to create run options and run metadata like this
run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()
and pass them to sess.run
However, as I am also trying to use tf.train.MonitoredTrainingSession I don't know if I can pass the same thing into this class. A plausible approach could make use of Hooks but I do not know how to do it. I am still very new to them
You can simply create a custom hook and pass it to the MonitoredTrainingSession. There is no need to pass your own tf.RunMetadata() instance to the run call.
Here is an example Hook which stores metadata every N steps to ckptdir:
import tensorflow as tf
class TraceHook(tf.train.SessionRunHook):
"""Hook to perform Traces every N steps."""
def __init__(self, ckptdir, every_step=50, trace_level=tf.RunOptions.FULL_TRACE):
self._trace = every_step == 1
self.writer = tf.summary.FileWriter(ckptdir)
self.trace_level = trace_level
self.every_step = every_step
def begin(self):
self._global_step_tensor = tf.train.get_global_step()
if self._global_step_tensor is None:
raise RuntimeError("Global step should be created to use _TraceHook.")
def before_run(self, run_context):
if self._trace:
options = tf.RunOptions(trace_level=self.trace_level)
else:
options = None
return tf.train.SessionRunArgs(fetches=self._global_step_tensor,
options=options)
def after_run(self, run_context, run_values):
global_step = run_values.results - 1
if self._trace:
self._trace = False
self.writer.add_run_metadata(run_values.run_metadata,
f'{global_step}', global_step)
if not (global_step + 1) % self.every_step:
self._trace = True
It checks in before_run whether it has to trace or not and if so, adds the RunOptions. In after_run it checks if the next run call needs to be traced and if so, it sets _trace to True again. Additionally it stores the metadata when it is available.

Is there a way to get tensorflow tf.Print output to appear in Jupyter Notebook output

I'm using the tf.Print op in a Jupyter notebook. It works as required, but will only print the output to the console, without printing in the notebook. Is there any way to get around this?
An example would be the following (in a notebook):
import tensorflow as tf
a = tf.constant(1.0)
a = tf.Print(a, [a], 'hi')
sess = tf.Session()
a.eval(session=sess)
That code will print 'hi[1]' in the console, but nothing in the notebook.
Update Feb 3, 2017
I've wrapped this into memory_util package. Example usage
# install memory util
import urllib.request
response = urllib.request.urlopen("https://raw.githubusercontent.com/yaroslavvb/memory_util/master/memory_util.py")
open("memory_util.py", "wb").write(response.read())
import memory_util
sess = tf.Session()
a = tf.random_uniform((1000,))
b = tf.random_uniform((1000,))
c = a + b
with memory_util.capture_stderr() as stderr:
sess.run(c.op)
print(stderr.getvalue())
** Old stuff**
You could reuse FD redirector from IPython core. (idea from Mark Sandler)
import os
import sys
STDOUT = 1
STDERR = 2
class FDRedirector(object):
""" Class to redirect output (stdout or stderr) at the OS level using
file descriptors.
"""
def __init__(self, fd=STDOUT):
""" fd is the file descriptor of the outpout you want to capture.
It can be STDOUT or STERR.
"""
self.fd = fd
self.started = False
self.piper = None
self.pipew = None
def start(self):
""" Setup the redirection.
"""
if not self.started:
self.oldhandle = os.dup(self.fd)
self.piper, self.pipew = os.pipe()
os.dup2(self.pipew, self.fd)
os.close(self.pipew)
self.started = True
def flush(self):
""" Flush the captured output, similar to the flush method of any
stream.
"""
if self.fd == STDOUT:
sys.stdout.flush()
elif self.fd == STDERR:
sys.stderr.flush()
def stop(self):
""" Unset the redirection and return the captured output.
"""
if self.started:
self.flush()
os.dup2(self.oldhandle, self.fd)
os.close(self.oldhandle)
f = os.fdopen(self.piper, 'r')
output = f.read()
f.close()
self.started = False
return output
else:
return ''
def getvalue(self):
""" Return the output captured since the last getvalue, or the
start of the redirection.
"""
output = self.stop()
self.start()
return output
import tensorflow as tf
x = tf.constant([1,2,3])
a=tf.Print(x, [x])
redirect=FDRedirector(STDERR)
sess = tf.InteractiveSession()
redirect.start();
a.eval();
print "Result"
print redirect.stop()
I ran into the same problem and got around it by using a function like this in my notebooks:
def tf_print(tensor, transform=None):
# Insert a custom python operation into the graph that does nothing but print a tensors value
def print_tensor(x):
# x is typically a numpy array here so you could do anything you want with it,
# but adding a transformation of some kind usually makes the output more digestible
print(x if transform is None else transform(x))
return x
log_op = tf.py_func(print_tensor, [tensor], [tensor.dtype])[0]
with tf.control_dependencies([log_op]):
res = tf.identity(tensor)
# Return the given tensor
return res
# Now define a tensor and use the tf_print function much like the tf.identity function
tensor = tf_print(tf.random_normal([100, 100]), transform=lambda x: [np.min(x), np.max(x)])
# This will print the transformed version of the tensors actual value
# (which was summarized to just the min and max for brevity)
sess = tf.InteractiveSession()
sess.run([tensor])
sess.close()
FYI, using a logger instead of calling "print" in my custom function worked wonders for me as the stdout is often buffered by jupyter and not shown before "Loss is Nan" kind of errors -- which was the whole point in using that function in the first place in my case.
You can check the terminal where you launched the jupyter notebook to see the message.
import tensorflow as tf
tf.InteractiveSession()
a = tf.constant(1)
b = tf.constant(2)
opt = a + b
opt = tf.Print(opt, [opt], message="1 + 2 = ")
opt.eval()
In the terminal, I can see:
2018-01-02 23:38:07.691808: I tensorflow/core/kernels/logging_ops.cc:79] 1 + 2 = [3]
A simple way, tried it in regular python, but not jupyter yet.
os.dup2(sys.stdout.fileno(), 1)
os.dup2(sys.stdout.fileno(), 2)
Explanation is here: In python, how to capture the stdout from a c++ shared library to a variable
The issue that I faced was that one can't run a session inside a Tensorflow Graph, like in the training or in the evaluation.
That's why the options to use sess.run(opt) or opt.eval() were not a solution for me.
The best thing was to use tf.Print() and redirect the logging to an external file.
I did this using a temporal file, which I transferred to a regular file like this:
STDERR=2
import os
import sys
import tempfile
class captured:
def __init__(self, fd=STDERR):
self.fd = fd
self.prevfd = None
def __enter__(self):
t = tempfile.NamedTemporaryFile()
self.prevfd = os.dup(self.fd)
os.dup2(t.fileno(), self.fd)
return t
def __exit__(self, exc_type, exc_value, traceback):
os.dup2(self.prevfd, self.fd)
with captured(fd=STDERR) as tmp:
...
classifier.evaluate(input_fn=input_fn, steps=100)
with open('log.txt', 'w') as f:
print(open(tmp.name).read(), file=f)
And then in my evaluation I do:
a = tf.constant(1)
a = tf.Print(a, [a], message="a: ")

Two Class instances in Python not different

I'm working on another data acquisition project, which has turned into an object oriented programming question. In “main” at the bottom of my code I make two instances of the Object DAQInput. When I wrote this, I thought my method .getData would refer to the taskHandle of the particular instance, but it does not. When I run, the code does the getData task with the first handle twice, so clearly I don’t really understand object oriented programming in Python. I’m sorry this code will not run without PyDAQmx and a National Instruments board attached.
from PyDAQmx import *
import numpy
class DAQInput:
# Declare variables passed by reference
taskHandle = TaskHandle()
read = int32()
data = numpy.zeros((10000,),dtype=numpy.float64)
sumi = [0,0,0,0,0,0,0,0,0,0]
def __init__(self, num_data, num_chan, channel, high, low):
""" This is init function that opens the channel"""
#Get the passed variables
self.num_data = num_data
self.channel = channel
self.high = high
self.low = low
self.num_chan = num_chan
# Create a task and configure a channel
DAQmxCreateTask(b"",byref(self.taskHandle))
DAQmxCreateAIThrmcplChan(self.taskHandle, self.channel, b"",
self.low, self.high,
DAQmx_Val_DegC,
DAQmx_Val_J_Type_TC,
DAQmx_Val_BuiltIn, 0, None)
# Start the task
DAQmxStartTask(self.taskHandle)
def getData(self):
""" This function gets the data from the board and calculates the average"""
print(self.taskHandle)
DAQmxReadAnalogF64(self.taskHandle, self.num_data, 10,
DAQmx_Val_GroupByChannel, self.data, 10000,
byref(self.read), None)
# Calculate the average of the values in data (could be several channels)
i = self.read.value
for j in range(self.num_chan):
self.sumi[j] = numpy.sum(self.data[j*i:(j+1)*i])/self.read.value
return self.sumi
def killTask(self):
""" This function kills the tasks"""
# If the task is still alive kill it
if self.taskHandle != 0:
DAQmxStopTask(self.taskHandle)
DAQmxClearTask(self.taskHandle)
if __name__ == '__main__':
myDaq1 = DAQInput(1, 4, b"cDAQ1Mod1/ai0:3", 200.0, 10.0)
myDaq2 = DAQInput(1, 4, b"cDAQ1Mod2/ai0:3", 200.0, 10.0)
result = myDaq1.getData()
print (result[0:4])
result2 = myDaq2.getData()
print (result2[0:4])
myDaq1.killTask()
myDaq2.killTask()
These variables:
class DAQInput:
# Declare variables passed by reference
taskHandle = TaskHandle()
read = int32()
data = numpy.zeros((10000,),dtype=numpy.float64)
sumi = [0,0,0,0,0,0,0,0,0,0]
Are class variables. They belong to the class itself and are shared among instances of the class (i.e. if you modify self.data in Instance1, Instace2's self.data is modified as well).
If you want them to be instance variables, define them in __init__.