Is there a version of the inference example of the TensorFlow Object Detection API that can run on batches of images simultaneously?

I have trained a Faster R-CNN model using the TensorFlow Object Detection API and am using this inference script with my frozen graph:
https://github.com/tensorflow/models/blob/master/research/object_detection/object_detection_tutorial.ipynb
I intend to use it for object tracking in videos, but inference with this script is very slow, since it only processes one image at a time instead of a batch of images. Is there any way to run inference on a batch of images at once? The relevant inference function is below; how can I modify it to work with a stack of images?
def run_inference_for_single_image(image, graph):
    with graph.as_default():
        with tf.Session() as sess:
            # Get handles to input and output tensors
            ops = tf.get_default_graph().get_operations()
            all_tensor_names = {output.name for op in ops for output in op.outputs}
            tensor_dict = {}
            for key in ['num_detections', 'detection_boxes', 'detection_scores',
                        'detection_classes', 'detection_masks']:
                tensor_name = key + ':0'
                if tensor_name in all_tensor_names:
                    tensor_dict[key] = tf.get_default_graph().get_tensor_by_name(tensor_name)
            if 'detection_masks' in tensor_dict:
                # The following processing is only for single image
                detection_boxes = tf.squeeze(tensor_dict['detection_boxes'], [0])
                detection_masks = tf.squeeze(tensor_dict['detection_masks'], [0])
                # Reframe is required to translate mask from box coordinates to image coordinates and fit the image size.
                real_num_detection = tf.cast(tensor_dict['num_detections'][0], tf.int32)
                detection_boxes = tf.slice(detection_boxes, [0, 0], [real_num_detection, -1])
                detection_masks = tf.slice(detection_masks, [0, 0, 0], [real_num_detection, -1, -1])
                detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(
                    detection_masks, detection_boxes, image.shape[0], image.shape[1])
                detection_masks_reframed = tf.cast(tf.greater(detection_masks_reframed, 0.5), tf.uint8)
                # Follow the convention by adding back the batch dimension
                tensor_dict['detection_masks'] = tf.expand_dims(detection_masks_reframed, 0)
            image_tensor = tf.get_default_graph().get_tensor_by_name('image_tensor:0')
            # Run inference
            output_dict = sess.run(tensor_dict, feed_dict={image_tensor: np.expand_dims(image, 0)})
            # All outputs are float32 numpy arrays, so convert types as appropriate
            output_dict['num_detections'] = int(output_dict['num_detections'][0])
            output_dict['detection_classes'] = output_dict['detection_classes'][0].astype(np.uint8)
            output_dict['detection_boxes'] = output_dict['detection_boxes'][0]
            output_dict['detection_scores'] = output_dict['detection_scores'][0]
            if 'detection_masks' in output_dict:
                output_dict['detection_masks'] = output_dict['detection_masks'][0]
            return output_dict

Instead of passing just one numpy array of size (1, image_height, image_width, 3), you can pass a numpy array containing your image batch of size (batch_size, image_height, image_width, 3) to the sess.run command:
output_dict = sess.run(tensor_dict, feed_dict={image_tensor: image_batch})
The output_dict will be slightly different than before; I still haven't figured out exactly how. Maybe someone can help further?
Edit
It seems that the output_dict gains another leading index, which corresponds to the image number in your batch. So you'll find the boxes for a given image in:
output_dict['detection_boxes'][image_counter]
Edit2
For some reason this won't work with Mask RCNN...
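Putting the pieces together, here is a minimal sketch of a batched variant (my own assumption-laden rewrite, not tested code from this thread): it skips the per-image mask post-processing, which may be why Mask R-CNN is problematic, and it assumes all frames in the batch share the same height and width:
def run_inference_for_image_batch(images, graph):
    # images: numpy array of shape (batch_size, height, width, 3);
    # all frames must have the same size.
    with graph.as_default():
        with tf.Session() as sess:
            tensor_dict = {}
            for key in ['num_detections', 'detection_boxes',
                        'detection_scores', 'detection_classes']:
                tensor_dict[key] = tf.get_default_graph().get_tensor_by_name(key + ':0')
            image_tensor = tf.get_default_graph().get_tensor_by_name('image_tensor:0')
            # One sess.run call for the whole batch
            batch_output = sess.run(tensor_dict, feed_dict={image_tensor: images})
            # Unpack the extra leading index into per-image dicts
            results = []
            for i in range(images.shape[0]):
                results.append({
                    'num_detections': int(batch_output['num_detections'][i]),
                    'detection_classes': batch_output['detection_classes'][i].astype(np.uint8),
                    'detection_boxes': batch_output['detection_boxes'][i],
                    'detection_scores': batch_output['detection_scores'][i],
                })
            return results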

If you run export_inference_graph.py, you should be able to feed batches of images by default, since it sets the image_tensor shape to [None, None, None, 3].
python object_detection/export_inference_graph.py \
--input_type image_tensor \
--pipeline_config_path ${PIPELINE_CONFIG_PATH} \
--trained_checkpoint_prefix ${TRAIN_PATH} \
--output_directory output_inference_graph.pb
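To sanity-check that the exported frozen graph really accepts batches, you can load it and inspect the input placeholder's shape (a small sketch; frozen_inference_graph.pb is the file the export script writes into the output directory):
import tensorflow as tf

graph = tf.Graph()
with graph.as_default():
    graph_def = tf.GraphDef()
    with tf.gfile.GFile('output_inference_graph.pb/frozen_inference_graph.pb', 'rb') as f:
        graph_def.ParseFromString(f.read())
    tf.import_graph_def(graph_def, name='')
    image_tensor = graph.get_tensor_by_name('image_tensor:0')
    print(image_tensor.shape)  # expect (?, ?, ?, 3): unconstrained batch dimension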

Related

Tensorflow 2: Add operations to input and output of frozen graph

When I load a frozen graph from disk, it already contains a placeholder for the input (input_placeholder_name="main_input:0"):
graph = tf.Graph()
graph.as_default()
sess = tf.compat.v1.Session()
with tf.io.gfile.GFile(model_filepath, "rb") as f:
    graph_def = tf.compat.v1.GraphDef()
    loaded = graph_def.ParseFromString(f.read())
However, I'd like to add my own input normalization to the graph:
img = tf.compat.v1.placeholder(tf.float32, shape=[1, 384, 256, 3], name="img")
normalized_input = tf.divide(img, 255.0)
normalized_input_b = tf.subtract(normalized_input[:, :, :, 0], tf.constant(0.406))
normalized_input_g = tf.subtract(normalized_input[:, :, :, 1], tf.constant(0.456))
normalized_input_r = tf.subtract(normalized_input[:, :, :, 2], tf.constant(0.485))
normalized_input_b = tf.expand_dims(normalized_input_b, axis=-1)
normalized_input_g = tf.expand_dims(normalized_input_g, axis=-1)
normalized_input_r = tf.expand_dims(normalized_input_r, axis=-1)
normalized_input_b = tf.divide(normalized_input_b, tf.constant(0.225))
normalized_input_g = tf.divide(normalized_input_g, tf.constant(0.224))
normalized_input_r = tf.divide(normalized_input_r, tf.constant(0.229))
input_data = tf.concat((normalized_input_r,
                        normalized_input_g,
                        normalized_input_b),
                       axis=3)
I try to combine the above-described input normalization with the existing graph using import_graph_def:
tf.import_graph_def(graph_def, {input_placeholder_name: input_data})
When I try to run the new graph on some example input, I get an error saying there are two placeholders to feed:
sess.run(output_tensor, feed_dict={img: data})
I had something similar working in TF1 already (there is also a Stack Overflow answer for that case). However, I'm forced to use TF2 now, and I'm confused why it does not work by simply adding tf.compat.v1 and disable_eager_execution everywhere.
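For what it's worth, a sketch of the pattern this usually requires (an assumption about the fix, not a confirmed answer): build the normalization ops inside one explicit graph, hand the normalized tensor to import_graph_def via input_map so it replaces the frozen placeholder instead of coexisting with it, and fetch the output via return_elements. The output tensor name "main_output:0" below is hypothetical.
import tensorflow as tf
tf.compat.v1.disable_eager_execution()

graph = tf.Graph()
with graph.as_default():  # note: as_default() must be used as a context manager
    graph_def = tf.compat.v1.GraphDef()
    with tf.io.gfile.GFile(model_filepath, "rb") as f:
        graph_def.ParseFromString(f.read())
    img = tf.compat.v1.placeholder(tf.float32, shape=[1, 384, 256, 3], name="img")
    input_data = tf.divide(img, 255.0)  # ...rest of the normalization as above...
    # Replace the frozen graph's input with the normalized tensor.
    output_tensor, = tf.import_graph_def(
        graph_def,
        input_map={"main_input:0": input_data},
        return_elements=["main_output:0"])  # hypothetical output name

with tf.compat.v1.Session(graph=graph) as sess:
    result = sess.run(output_tensor, feed_dict={img: data})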

How to use the GPU to detect objects using pre-trained models in Tensorflow?

I'm using Tensorflow to detect objects based on this tutorial. The reason it runs so slowly is this line:
output_dict = sess.run(tensor_dict, feed_dict={image_tensor: np.expand_dims(image, 0)})
Below is the whole function:
def run_inference_for_single_image(image, graph):
    with graph.as_default():
        with tf.device('/gpu:0'):
            print('GPU is using')
            with tf.Session() as sess:
                time0 = datetime.datetime.now()
                # Get handles to input and output tensors
                ops = tf.get_default_graph().get_operations()
                all_tensor_names = {output.name for op in ops for output in op.outputs}
                tensor_dict = {}
                for key in [
                        'num_detections', 'detection_boxes', 'detection_scores',
                        'detection_classes', 'detection_masks'
                ]:
                    tensor_name = key + ':0'
                    if tensor_name in all_tensor_names:
                        tensor_dict[key] = tf.get_default_graph().get_tensor_by_name(
                            tensor_name)
                time1 = datetime.datetime.now()
                if 'detection_masks' in tensor_dict:
                    # The following processing is only for single image
                    detection_boxes = tf.squeeze(tensor_dict['detection_boxes'], [0])
                    detection_masks = tf.squeeze(tensor_dict['detection_masks'], [0])
                    # Reframe is required to translate mask from box coordinates to image coordinates and fit the image size.
                    real_num_detection = tf.cast(tensor_dict['num_detections'][0], tf.int32)
                    detection_boxes = tf.slice(detection_boxes, [0, 0], [real_num_detection, -1])
                    detection_masks = tf.slice(detection_masks, [0, 0, 0], [real_num_detection, -1, -1])
                    detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(
                        detection_masks, detection_boxes, image.shape[0], image.shape[1])
                    detection_masks_reframed = tf.cast(
                        tf.greater(detection_masks_reframed, 0.5), tf.uint8)
                    # Follow the convention by adding back the batch dimension
                    tensor_dict['detection_masks'] = tf.expand_dims(
                        detection_masks_reframed, 0)
                image_tensor = tf.get_default_graph().get_tensor_by_name('image_tensor:0')
                time2 = datetime.datetime.now()
                # Run inference
                output_dict = sess.run(tensor_dict,
                                       feed_dict={image_tensor: np.expand_dims(image, 0)})
                time3 = datetime.datetime.now()
                # All outputs are float32 numpy arrays, so convert types as appropriate
                output_dict['num_detections'] = int(output_dict['num_detections'][0])
                output_dict['detection_classes'] = output_dict[
                    'detection_classes'][0].astype(np.uint8)
                output_dict['detection_boxes'] = output_dict['detection_boxes'][0]
                output_dict['detection_scores'] = output_dict['detection_scores'][0]
                if 'detection_masks' in output_dict:
                    output_dict['detection_masks'] = output_dict['detection_masks'][0]
                time4 = datetime.datetime.now()
                print(time1-time0, time2-time1, time3-time2, time4-time3)
                return output_dict
I don't know how to use the GPU with tf.Session.run(). Can anyone teach me how?
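As a starting point (not from this thread): with the tensorflow-gpu package installed, TensorFlow places the heavy ops on the GPU automatically when the graph is built, and wrapping sess.run in tf.device has no effect on an already-frozen graph. A small sketch to verify where ops actually run:
import tensorflow as tf

config = tf.ConfigProto(log_device_placement=True)  # log each op's assigned device
config.gpu_options.allow_growth = True              # allocate GPU memory on demand
with tf.Session(graph=graph, config=config) as sess:
    output_dict = sess.run(tensor_dict,
                           feed_dict={image_tensor: np.expand_dims(image, 0)})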

How can I test the Deep MNIST for Experts code on a single image?

I am just starting out with TensorFlow and I want to test the trained model from TensorFlow's tutorial on my own images. This is the code I used to test the Softmax Regression Model from the beginning of the tutorial:
with open("three.jpeg", "rb") as f:
contents = f.read()
image = tf.image.decode_jpeg(contents, channels=1)
image_float = tf.image.convert_image_dtype(image, tf.float32)
resized_image = tf.image.resize_images(image_float, [28, 28])
resized_image = tf.reshape(resized_image, [784])
img = 1 - resized_image.eval()
classification = sess.run(tf.argmax(y, 1), feed_dict={x: [img]})
plt.imshow(img.reshape(28, 28), cmap=plt.cm.binary)
plt.show()
print ('NN predicted', classification[0])
This worked fine for the softmax model, but not for the Multilayer Convolutional Network. I tried changing y in this line
classification = sess.run(tf.argmax(y, 1), feed_dict={x: [img]})
to y_conv but it gave me this error:
InvalidArgumentError: You must feed a value for placeholder tensor
'Placeholder_2' with dtype float
[[Node: Placeholder_2 = Placeholder[dtype=DT_FLOAT, shape=[],
_device="/job:localhost/replica:0/task:0/cpu:0"]]]
There is a placeholder somewhere in your graph which you are not feeding. Odds are you need another entry in your feed_dict for the other network.
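In the Deep MNIST tutorial specifically, the unfed placeholder is most likely the dropout probability keep_prob, which should be fed as 1.0 at test time (a sketch assuming the tutorial's variable names):
classification = sess.run(tf.argmax(y_conv, 1),
                          feed_dict={x: [img], keep_prob: 1.0})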

Updating the Initial state of a recurrent neural network in tensorflow

Currently I have the following code:
init_state = tf.Variable(tf.zeros([batch_partition_length, state_size])) # -> [16, 1024].
final_state = tf.Variable(tf.zeros([batch_partition_length, state_size]))
And inside my inference method, which is responsible for producing the output, I have the following:
def inference(frames):
    # Note that final_state is declared as a global variable to avoid the
    # shadowing issue, since it is referenced at the dynamic_rnn line.
    global final_state
    # .... Here we have some conv layers and so on...
    # Now the RNN cell
    with tf.variable_scope('local1') as scope:
        # Move everything into depth so we can perform a single matrix multiply.
        shape_d = pool3.get_shape()
        shape = shape_d[1] * shape_d[2] * shape_d[3]
        # tf_shape = tf.stack(shape)
        tf_shape = 1024
        print("shape:", shape, shape_d[1], shape_d[2], shape_d[3])
        # Note that tf_shape = 1024: 1024 features are fed into the network, and
        # the batch size = 1024. The aim is to divide the batch_size into num_steps.
        reshape = tf.reshape(pool3, [-1, tf_shape])
        # Reshape/divide the batch_size into num_steps so that we feed a sequence.
        rnn_inputs = tf.reshape(reshape, [batch_partition_length, step_size, tf_shape])
        print('RNN inputs shape: ', rnn_inputs.get_shape())  # -> (16, 64, 1024).
        cell = tf.contrib.rnn.BasicRNNCell(state_size)
        # Note that rnn_outputs are the outputs, not yet multiplied by W.
        rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, initial_state=init_state)
    # linear Wx + b
    with tf.variable_scope('softmax_linear') as scope:
        weight_softmax = tf.Variable(
            tf.truncated_normal([state_size, n_classes], stddev=1 / state_size,
                                dtype=tf.float32, name='weight_softmax'))
        bias_softmax = tf.constant(0.0, tf.float32, [n_classes], name='bias_softmax')
        softmax_linear = tf.reshape(
            tf.matmul(tf.reshape(rnn_outputs, [-1, state_size]), weight_softmax) + bias_softmax,
            [batch_size, n_classes])
        print('Output shape:', softmax_linear.get_shape())
    return softmax_linear
# Here we define the loss, accuracy and the optimizer.
# Now run the graph:
with tf.Session() as sess:
    _, accuracy_train, loss_train, summary = \
        sess.run([optimizer, accuracy, cost_scalar, merged],
                 feed_dict={x: image_batch,
                            y_valence: valences,
                            confidence_holder: confidences})
    ....
Problem: How can I assign init_state the value stored in final_state? That is, how do I update one Variable's value from another?
I have used the following:
tf.assign(init_state, final_state.eval())
inside the session, after running the sess.run command. But this throws an error:
You must feed a value for placeholder tensor 'inputs' with dtype float
where the 'inputs' placeholder is declared as follows:
x = tf.placeholder(tf.float32, [None, 112, 112, 3], name='inputs')
The feeding is done after reading the images from the TFRecords with the following code:
example = tf.train.Example()
example.ParseFromString(string_record)
height = int(example.features.feature['height']
             .int64_list
             .value[0])
width = int(example.features.feature['width']
            .int64_list
            .value[0])
img_string = (example.features.feature['image_raw']
              .bytes_list
              .value[0])
img_1d = np.fromstring(img_string, dtype=np.uint8)
# reconstructed_img is appended to the image_batch list, which is fed into the placeholder.
reconstructed_img = img_1d.reshape((height, width, -1))
And if I try the following:
img_1d = np.fromstring(img_string, dtype=np.float32)
This will produce the following error:
ValueError: cannot reshape array of size 9408 into shape (112,112,newaxis)
Any help is much appreciated!!
So here are the mistakes I have made so far. After some revision, I figured out the following:
I shouldn't create final_state as a tf.Variable. Since tf.nn.dynamic_rnn returns tensors (which sess.run evaluates to ndarrays), I should not instantiate final_state at the beginning, and I should not use the global final_state inside the function definition.
In order to assign the initial state the final_state, I used:
tf.assign(init_state, final_state)
And things work out.
Note: in TensorFlow, an operation returns the data as a numpy array in Python and as tensorflow::Tensor in C and C++.
Have a look at https://www.tensorflow.org/versions/r0.10/get_started/basic_usage for more information.
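One detail worth spelling out: tf.assign only builds an op, so it has to be executed with sess.run to take effect, and since final_state depends on the 'inputs' placeholder, the assign op needs the same feed_dict (which is exactly what the earlier placeholder error was complaining about). A sketch under these assumptions:
# Build the assign op once, outside the training loop.
update_init_state = tf.assign(init_state, final_state)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for image_batch, valences, confidences in batches:  # hypothetical iterable
        # Group the update with the training step so final_state is
        # evaluated with the same fed inputs.
        sess.run([optimizer, update_init_state],
                 feed_dict={x: image_batch,
                            y_valence: valences,
                            confidence_holder: confidences})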

Tensorflow: Input pipeline with sparse data for the SVM estimator

Introduction:
I am trying to train the tensorflow svm estimator tensorflow.contrib.learn.python.learn.estimators.svm with sparse data. Sample usage with sparse data can be found in the GitHub repo at tensorflow/contrib/learn/python/learn/estimators/svm_test.py#L167 (I am not allowed to post more links, so here is the relative path).
The svm estimator expects the parameters example_id_column and feature_columns, where the feature columns should be derived from the class FeatureColumn, such as tf.contrib.layers.feature_column.sparse_column_with_hash_bucket. See the GitHub repo at tensorflow/contrib/learn/python/learn/estimators/svm.py#L85 and the documentation at tensorflow.org at python/contrib.layers#Feature_columns.
Question:
How do I have to set up my input pipeline to format sparse data in such a way that I can use one of the tf.contrib.layers feature_columns as input for the svm estimator?
What would a dense input function with many features look like?
Background
The data I use is the a1a dataset from the LIBSVM website. The dataset has 123 features (which would correspond to 123 feature_columns if the data were dense). I wrote a user op to read the data, like tf.decode_csv() but for the LIBSVM format. The op returns the labels as a dense tensor and the features as a sparse tensor. My input pipeline:
NUM_FEATURES = 123
batch_size = 200

# My op to parse the libsvm data
decode_libsvm_module = tf.load_op_library('./libsvm.so')

def input_pipeline(filename_queue, batch_size):
    with tf.name_scope('input'):
        reader = tf.TextLineReader(name="TextLineReader_")
        _, libsvm_row = reader.read(filename_queue, name="libsvm_row_")
        min_after_dequeue = 1000
        capacity = min_after_dequeue + 3 * batch_size
        batch = tf.train.shuffle_batch([libsvm_row], batch_size=batch_size,
                                       capacity=capacity,
                                       min_after_dequeue=min_after_dequeue,
                                       name="text_line_batch_")
        labels, sp_indices, sp_values, sp_shape = \
            decode_libsvm_module.decode_libsvm(records=batch,
                                               num_features=123,
                                               OUT_TYPE=tf.int64,
                                               name="Libsvm_decoded_")
        # Return the features as a sparse tensor and the labels as dense
        return tf.SparseTensor(sp_indices, sp_values, sp_shape), labels
Here is an example batch with batch_size = 5.
def input_fn(dataset_name):
    maybe_download()
    filename_queue_train = tf.train.string_input_producer([dataset_name],
                                                          name="queue_t_")
    features, labels = input_pipeline(filename_queue_train, batch_size)
    return {
        'example_id': tf.as_string(tf.range(1, 123, 1, dtype=tf.int64)),
        'features': features
    }, labels
This is what I tried so far:
with tf.Session().as_default() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    feature_column = tf.contrib.layers.sparse_column_with_hash_bucket(
        'features', hash_bucket_size=1000, dtype=tf.int64)
    svm_classifier = svm.SVM(feature_columns=[feature_column],
                             example_id_column='example_id',
                             l1_regularization=0.0,
                             l2_regularization=1.0)
    svm_classifier.fit(input_fn=lambda: input_fn(TRAIN),
                       steps=30)
    accuracy = svm_classifier.evaluate(
        input_fn=lambda: input_fn(features, labels),
        steps=1)['accuracy']
    print(accuracy)
    coord.request_stop()
    coord.join(threads)
    sess.close()
Here's an example, with made-up data, that works for me in TensorFlow 1.1.0-rc2. I think my comment was misleading; you're best off converting ~100 binary features to real-valued features (tf.sparse_tensor_to_dense) and using a real_valued_column, since sparse_column_with_integerized_feature hides most of the useful information from the SVM Estimator.
import tensorflow as tf

batch_size = 10
num_features = 123
num_examples = 100

def input_fn():
    example_ids = tf.random_uniform(
        [batch_size], maxval=num_examples, dtype=tf.int64)
    # Construct a SparseTensor with features
    dense_features = (example_ids[:, None]
                      + tf.range(num_features, dtype=tf.int64)[None, :]) % 2
    non_zeros = tf.where(tf.not_equal(dense_features, 0))
    sparse_features = tf.SparseTensor(
        indices=non_zeros,
        values=tf.gather_nd(dense_features, non_zeros),
        dense_shape=[batch_size, num_features])
    features = {
        'some_sparse_features': tf.sparse_tensor_to_dense(sparse_features),
        'example_id': tf.as_string(example_ids)}
    labels = tf.equal(dense_features[:, 0], 1)
    return features, labels

svm = tf.contrib.learn.SVM(
    example_id_column='example_id',
    feature_columns=[
        tf.contrib.layers.real_valued_column(
            'some_sparse_features')],
    l2_regularization=0.1, l1_regularization=0.5)
svm.fit(input_fn=input_fn, steps=1000)

positive_example = lambda: {
    'some_sparse_features': tf.sparse_tensor_to_dense(
        tf.SparseTensor([[0, 0]], [1], [1, num_features])),
    'example_id': ['a']}
print(svm.evaluate(input_fn=input_fn, steps=20))
print(next(svm.predict(input_fn=positive_example)))

negative_example = lambda: {
    'some_sparse_features': tf.sparse_tensor_to_dense(
        tf.SparseTensor([[0, 0]], [0], [1, num_features])),
    'example_id': ['b']}
print(next(svm.predict(input_fn=negative_example)))
Prints:
{'accuracy': 1.0, 'global_step': 1000, 'loss': 1.0645389e-06}
{'logits': array([ 0.01612902], dtype=float32), 'classes': 1}
{'logits': array([ 0.], dtype=float32), 'classes': 0}
Since TensorFlow 1.5.0 there is a built-in function to read LIBSVM data;
refer to my answer here:
https://stackoverflow.com/a/56354308/3885491
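I haven't verified the exact wrapper signature against the linked answer, but roughly, the built-in op can be used with tf.data like this (a sketch; the assumption here is that tf.contrib.libsvm.decode_libsvm returns the features as a SparseTensor together with the labels — check your version's docs):
import tensorflow as tf

batch_size = 200
# Batch the raw text lines first, then decode the whole batch at once.
dataset = tf.data.TextLineDataset(['a1a']).batch(batch_size)
dataset = dataset.map(lambda lines: tf.contrib.libsvm.decode_libsvm(
    lines, num_features=123, dtype=tf.float32, label_dtype=tf.int64))
features, labels = dataset.make_one_shot_iterator().get_next()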