Real-time counter using Tensorflow object detection API - tensorflow

Im currently working on real-time object detection using tensorflow API.I've gotten that figured out, but right now I would like to add in object counter. So, I'll have real-time object detection + counter.
The source code for object detection was taken from tensorflow ipynb tutorial and I added OpenCV for real-time detection. I've merged the real-time detection source code with the counter source code where initially it was for vehicle counting from this guy's repo.
So, my current output: No error and no output. But my webcam light flickers which shows it's being used so the opencv part is working. Could anyone take a look the code and help me figure what's wrong? It'd be a really great help. Thank you in advance.
import numpy as np
import os
import six.moves.urllib as urllib
import sys
import tarfile
import tensorflow as tf
import zipfile
import csv
import time
from collections import defaultdict
from io import StringIO
from matplotlib import pyplot as plt
from PIL import Image
import cv2
cap = cv2.VideoCapture(0)
# initialize .csv
with open('traffic_measurement.csv', 'w') as f:
writer = csv.writer(f)
csv_line = \
'Person Movement Direction'
writer.writerows([csv_line.split(',')])
# Variables to count persons
total_passed_person = 0
# This is needed since the notebook is stored in the object_detection folder.
sys.path.append("..")
# ## Object detection imports
# Here are the imports from the object detection module.
from utils import label_map_util
from utils import visualization_utils as vis_util
# # Model preparation
# What model to download.
MODEL_NAME = 'ssd_mobilenet_v1_coco_11_06_2017'
MODEL_FILE = MODEL_NAME + '.tar.gz'
DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/'
# Path to frozen detection graph. This is the actual model that is used for the object detection.
PATH_TO_CKPT = MODEL_NAME + '/frozen_inference_graph.pb'
# List of the strings that is used to add correct label for each box.
PATH_TO_LABELS = os.path.join('data', 'mscoco_label_map.pbtxt')
NUM_CLASSES = 90
# ## Download Model
opener = urllib.request.URLopener()
opener.retrieve(DOWNLOAD_BASE + MODEL_FILE, MODEL_FILE)
tar_file = tarfile.open(MODEL_FILE)
for file in tar_file.getmembers():
file_name = os.path.basename(file.name)
if 'frozen_inference_graph.pb' in file_name:
tar_file.extract(file, os.getcwd())
# ## Load a (frozen) Tensorflow model into memory.
detection_graph = tf.Graph()
with detection_graph.as_default():
od_graph_def = tf.GraphDef()
with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
serialized_graph = fid.read()
od_graph_def.ParseFromString(serialized_graph)
tf.import_graph_def(od_graph_def, name='')
# ## Loading label map
# Label maps map indices to category names, so that when our convolution network predicts `5`, we
know
that this corresponds to `airplane`. Here we use internal utility functions, but anything that
returns
a dictionary mapping integers to appropriate string labels would be fine
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES,
use_display_name=True)
category_index = label_map_util.create_category_index(categories)
# ## Helper code
def load_image_into_numpy_array(image):
(im_width, im_height) = image.size
return np.array(image.getdata()).reshape(
(im_height, im_width, 3)).astype(np.uint8)
# Size, in inches, of the output images.
IMAGE_SIZE = (12, 8)
# Detection
def object_detection_function():
total_passed_person = 0
with detection_graph.as_default():
with tf.Session(graph=detection_graph) as sess:
# Expand dimensions since the model expects images to have shape: [1, None, None, 3]
image_np_expanded = np.expand_dims(input_frame, axis=0)
image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
# Each box represents a part of the image where a particular object was detected.
boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
# Each score represent how level of confidence for each of the objects.
# Score is shown on the result image, together with the class label.
scores = detection_graph.get_tensor_by_name('detection_scores:0')
classes = detection_graph.get_tensor_by_name('detection_classes:0')
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
# for all the frames that are extracted from input video
while cap.isOpened():
(ret,frame) = cap.read()
if not ret:
print ('end of the video file...')
break
input_frame = frame
# Actual detection.
(boxes, scores, classes, num_detections) = sess.run(
[boxes, scores, classes, num_detections],
feed_dict={image_tensor: image_np_expanded})
# Visualization of the results of a detection.
(counter, csv_line) = \
vis_util.visualize_boxes_and_labels_on_image_array(
cap.get(1),
input_frame,
np.squeeze(boxes),
np.squeeze(classes).astype(np.int32),
np.squeeze(scores),
category_index,
use_normalized_coordinates=True,
line_thickness=8)
total_passed_person = total_passed_person + counter
# insert information text to video frame
font = cv2.FONT_HERSHEY_SIMPLEX
cv2.putText(
input_frame,
'Detected Persons: ' + str(total_passed_person),
(10, 35),
font,
0.8,
(0, 0xFF, 0xFF),
2,
cv2.FONT_HERSHEY_SIMPLEX,
)
# when the vehicle passed over line and counted, make the color of ROI line green
if counter == 1:
cv2.line(input_frame, (0, 200), (640, 200), (0, 0xFF, 0), 5)
else:
cv2.line(input_frame, (0, 200), (640, 200), (0, 0, 0xFF), 5)
# insert information text to video frame
cv2.rectangle(input_frame, (10, 275), (230, 337), (180, 132, 109), -1)
cv2.putText(
input_frame,
'ROI Line',
(545, 190),
font,
0.6,
(0, 0, 0xFF),
2,
cv2.LINE_AA,
)
cv2.putText(
input_frame,
'-Movement Direction: ' + direction,
(14, 302),
font,
0.4,
(0xFF, 0xFF, 0xFF),
1,
cv2.FONT_HERSHEY_COMPLEX_SMALL,
)
if csv_line != 'not_available':
with open('traffic_measurement.csv', 'a') as f:
writer = csv.writer(f)
(direction) = \
csv_line.split(',')
writer.writerows([csv_line.split(',')])
cv2.imshow('object detection',cv2.resize(input_frame, (800,600)))
if cv2.waitKey(25) & 0xFF == ord('q'):
cv2.destroyAllWindows()
break

Related

Loading and testing a Tensorflow 2 trained model

i already was able to train a custom TF2 model using this tutorial:
https://neptune.ai/blog/how-to-train-your-own-object-detector-using-tensorflow-object-detection-api
Now im getting stuck with testing this model. The script i use for this is also from a turoial and i changed the paths etc but it still doesnt work... I tried and tried and tried for many hours now but at the time i just got demotivated...
I can resolve many errors but the current one not, maybe anyone can help me. Im quite new to object detection..
import numpy as np
import os
import six as urllib # import six.moves.urllib as urllib
import sys
import tarfile
import tensorflow as tf
import zipfile
from collections import defaultdict
from io import StringIO
from matplotlib import pyplot as plt
from PIL import Image
import cv2
cap = cv2.VideoCapture(1)
# This is needed since the notebook is stored in the object_detection folder.
sys.path.append("..")
# ## Object detection imports
# Here are the imports from the object detection module.
# In[3]:
from object_detection.utils import label_map_util # from utils import label_map_util
from object_detection.utils import visualization_utils as vis_util # from utils import visualization_utils as vis_util
# # Model preparation
# ## Variables
#
# Any model exported using the `export_inference_graph.py` tool can be loaded here simply by changing `PATH_TO_CKPT` to point to a new .pb file.
#
# By default we use an "SSD with Mobilenet" model here. See the [detection model zoo](https://github.com/tensorflow/models/blob/master/object_detection/g3doc/detection_model_zoo.md) for a list of other models that can be run out-of-the-box with varying speeds and accuracies.
# In[4]:
# What model to download.
MODEL_NAME = 'D:/VSCode/Machine_Learning_Tests/Tensorflow/workspace/exported_models/first_model/saved_model' # MODEL_NAME = 'inference_graph'
# Path to frozen detection graph. This is the actual model that is used for the object detection.
PATH_TO_CKPT = MODEL_NAME + '/saved_model.pb' # PATH_TO_CKPT = MODEL_NAME + '/frozen_inference_graph.pb'
# List of the strings that is used to add correct label for each box.
PATH_TO_LABELS = 'D:/VSCode/Machine_Learning_Tests/Tensorflow/workspace/data/label_map.pbtxt' # PATH_TO_LABELS = 'training/labelmap.pbtxt'
NUM_CLASSES = 1
# ## Load a (frozen) Tensorflow model into memory.
# In[6]:
detection_graph = tf.Graph()
with detection_graph.as_default():
od_graph_def = tf.compat.v1.GraphDef() # od_graph_def = tf.GraphDef()
with tf.compat.v2.io.gfile.GFile(PATH_TO_CKPT, 'rb') as fid: # with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
serialized_graph = fid.read()
od_graph_def.ParseFromString(serialized_graph)
tf.import_graph_def(od_graph_def, name='')
# ## Loading label map
# Label maps map indices to category names, so that when our convolution network predicts `5`, we know that this corresponds to `airplane`. Here we use internal utility functions, but anything that returns a dictionary mapping integers to appropriate string labels would be fine
# In[7]:
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)
# ## Helper code
# In[8]:
def load_image_into_numpy_array(image):
(im_width, im_height) = image.size
return np.array(image.getdata()).reshape(
(im_height, im_width, 3)).astype(np.uint8)
# # Detection
# In[9]:
# For the sake of simplicity we will use only 2 images:
# image1.jpg
# image2.jpg
# If you want to test the code with your images, just add path to the images to the TEST_IMAGE_PATHS.
PATH_TO_TEST_IMAGES_DIR = 'images/test/'
TEST_IMAGE_PATHS = [ os.path.join(PATH_TO_TEST_IMAGES_DIR, 'image{}.jpg'.format(i)) for i in range(3, 8) ]
# Size, in inches, of the output images.
IMAGE_SIZE = (12,8)
# In[10]:
with detection_graph.as_default():
with tf.Session(graph=detection_graph) as sess:
while True:
image_np = np.array(cv2.imread('Test.jpg'))
cv2.imshow('image',image_np)
cv2.waitKey(1)
# Expand dimensions since the model expects images to have shape: [1, None, None, 3]
image_np_expanded = np.expand_dims(image_np, axis=0)
image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
# Each box represents a part of the image where a particular object was detected.
boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
# Each score represent how level of confidence for each of the objects.
# Score is shown on the result image, together with the class label.
scores = detection_graph.get_tensor_by_name('detection_scores:0')
classes = detection_graph.get_tensor_by_name('detection_classes:0')
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
# Actual detection.
(boxes, scores, classes, num_detections) = sess.run(
[boxes, scores, classes, num_detections],
feed_dict={image_tensor: image_np_expanded})
# Visualization of the results of a detection.
vis_util.visualize_boxes_and_labels_on_image_array(
image_np,
np.squeeze(boxes),
np.squeeze(classes).astype(np.int32),
np.squeeze(scores),
category_index,
use_normalized_coordinates=True,
line_thickness=8)
cv2.imshow('object detection', cv2.resize(image_np, (800,600)))
if cv2.waitKey(25) & 0xFF == ord('q'):
cv2.destroyAllWindows()
break
Thats the code i use to try testing the model
And this is the current error:
Traceback (most recent call last):
File "d:\VSCode\Machine_Learning_Tests\Tensorflow\test\object_detection_tutorial_wwwPythonProgrammingNet__mitBild.py", line 65, in <module>
od_graph_def.ParseFromString(serialized_graph)
google.protobuf.message.DecodeError: Error parsing message with type 'tensorflow.GraphDef'

How to give a unique Id to each bounding box of each object detected

I am using a Tensorflow object detection API on my Windows system for which I've built a custom object detection classifier. It detects the object very well with the webcam feed but I am trying to figure out on how I can detect objects from the webcam with a unique object ID for every detected object.
Say, for example, if the webcam detects two similar objects (say 2 similar chairs) then it draws a bounding box on each chair. I want to track both the chairs with a unique ID and, when I subsequently extract a video frame, get the centroid of both the chairs.
Currently I am using this code:
# Import packages
import requests
import os
os.chdir('C:\\tensorflow1\\models\\research\\object_detection')
from firebase import firebase
import cv2
import numpy as np
import tensorflow as tf
import sys
# This is needed since the notebook is stored in the object_detection folder.
sys.path.append("..")
# Import utilites
from object_detection.utils import label_map_util
from object_detection.utils import visualization_utils as vis_util
# Name of the directory containing the object detection module we're using
MODEL_NAME = 'inference_graph'
# Grab path to current working directory
CWD_PATH = os.getcwd()
# Path to frozen detection graph .pb file, which contains the model that is used
# for object detection.
PATH_TO_CKPT = os.path.join(CWD_PATH, MODEL_NAME, 'frozen_inference_graph.pb')
# Path to label map file
PATH_TO_LABELS = os.path.join(CWD_PATH, 'training', 'labelmap.pbtxt')
# Number of classes the object detector can identify
NUM_CLASSES = 1
# Load the label map.
# Label maps map indices to category names, so that when our convolution
# network predicts `5`, we know that this corresponds to `king`.
# Here we use internal utility functions, but anything that returns a
# dictionary mapping integers to appropriate string labels would be fine
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES,
use_display_name=True)
category_index = label_map_util.create_category_index(categories)
# Load the Tensorflow model into memory.
detection_graph = tf.Graph()
with detection_graph.as_default():
od_graph_def = tf.GraphDef()
with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
serialized_graph = fid.read()
od_graph_def.ParseFromString(serialized_graph)
tf.import_graph_def(od_graph_def, name='')
sess = tf.Session(graph=detection_graph)
# Define input and output tensors (i.e. data) for the object detection classifier
# Input tensor is the image
image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
# Output tensors are the detection boxes, scores, and classes
# Each box represents a part of the image where a particular object was detected
detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
# Each score represents level of confidence for each of the objects.
# The score is shown on the result image, together with the class label.
detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
# Number of objects detected
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
# Initialize webcam feed
video = cv2.VideoCapture(1)
img_counter = 0
while True:
# Acquire frame and expand frame dimensions to have shape: [1, None, None, 3]
# i.e. a single-column array, where each item in the column has the pixel RGB value
ret, frame = video.read()
frame_expanded = np.expand_dims(frame, axis=0)
# Perform the actual detection by running the model with the image as input
(boxes, scores, classes, num) = sess.run(
[detection_boxes, detection_scores, detection_classes, num_detections],
feed_dict={image_tensor: frame_expanded})
# Draw the results of the detection (aka 'visulaize the results')
vis_util.visualize_boxes_and_labels_on_image_array(
frame,
np.squeeze(boxes),
np.squeeze(classes).astype(np.int32),
np.squeeze(scores),
np.squeeze(num),
category_index,
max_boxes_to_draw=3,
use_normalized_coordinates=True,
line_thickness=8,
min_score_thresh=0.90)
# All the results have been drawn on the frame, so it's time to display it.
frame = cv2.circle(frame, (350, 350), 1, (0, 0, 255), 5)
#frame = cv2.circle(frame, (337, 139), 1, (0, 0, 255), 5)
cv2.imshow('Object detector', frame)
if not ret:
break
k = cv2.waitKey(1)
if k % 256 == 27:
# ESC pressed
print("Escape hit, closing...")
break
elif k % 256 == ord('s'):
# SPACE pressed
img_name = "opencv_frame_{}.jpg".format(img_counter)
print("{} written!".format(img_name))
img_counter += 1
# Code to find the centroid of the bounding box on the object detected
height = frame.shape[0]
width = frame.shape[1]
print(height)
print(width)
min_score_thresh = 0.90
true_boxes = boxes[0][scores[0] > min_score_thresh]
for i in range(true_boxes.shape[0]):
ymin = int(true_boxes[i][0] * height)
xmin = int(true_boxes[i][1] * width)
ymax = int(true_boxes[i][2] * height)
xmax = int(true_boxes[i][3] * width)
#print(ymin,xmin,ymax,xmax)
y = int((ymin + ymax) / 2)
x = int((xmin + xmax) / 2)
print(x, y)
frame = cv2.circle(frame, (xmin, ymin), 1, (0, 0, 255), 3)
frame = cv2.circle(frame, (xmax, ymin), 1, (0, 0, 255), 3)
frame = cv2.circle(frame, (xmin, ymax), 1, (0, 0, 255), 3)
frame = cv2.circle(frame, (xmax, ymax), 1, (0, 0, 255), 3)
frame = cv2.circle(frame, (x, y), 1, (0, 255, 255), 5)
cv2.imshow(img_name, frame)
# Clean up
video.release()
cv2.destroyAllWindows()
But this only gives me the centroid of the chair with highest confidence score instead of all the chairs which are detected.
How can I modify my code to keep a track of each object with a unique ID and then, when the frames are extracted, get the centroid of each chair? Ideally the answer should be scaleable, so that 3 objects detected gives 3 unique IDs and 3 centroids.
Finally: does the track_ids parameter in visualize_boxes_and_labels_on_image_array function help to keep a track on the object and its bounding box? If so, how should it be used?
I think what you're looking for is an object tracking algorithm. Try using SORT(Simple Online Realtime Tracking) or maybe other algorithms. You can just pass your detections (bounding box coordinates) to the tracker and it returns the bounding box along with a unique ID for each tracking object..

Predictions in recorded video using object detection tensorflow API

I am trying to read a video file (using opencv), loop over all frames using tensorflow's object-detection API to do the predictions and bounding boxes, and writing the predicted frames (with boxes) to a new video file. I used the object_detection_tutorial.ipynb with some modifications to capture the video frames and process it in faster-rcnn-inception-resnet-v2 loaded from a frozen graph (after trained).
I am using a tesla P100 gpu in a cloud machine with windows 10 and 56GB ram. Also using tensorflow-gpu.
When I run the code, it takes 0,5 second per frame. Is it a normal speed for a tesla P100 or I am doing something wrong in the code to make it slower?
This code is just a test, as later I will have to use it in a real time video prediction task. If 0,5 second per frame is an expected speed using tensorflow API, I think I will cannot use it in my task :(
So, after running it, i get the following running times
processing frame number 1.0
time to capture video frame 0.0
time to predict 0.49225664138793945
time to generate boxes in a frame 0.14833950996398926
time to write a frame in video file 0.04687023162841797
total time in the loop 0.6874663829803467
As you guys can see, the code using the CPU (opencv) goes fast. But when I use the GPU, it takes almost 0,5 seconds just in prediction task (used in sess.run).
Any advices? Thank you in advance. Bellow follows my code
from distutils.version import StrictVersion
import numpy as np
import os
import six.moves.urllib as urllib
import sys
import tarfile
import tensorflow as tf
import zipfile
import time
from collections import defaultdict
from io import StringIO
#from matplotlib import pyplot as plt
from PIL import Image
import cv2
from imutils import paths
import re
#This is needed since the code is stored in the object_detection folder.
sys.path.append("..")
from object_detection.utils import ops as utils_ops
if StrictVersion(tf.__version__) < StrictVersion('1.9.0'):
raise ImportError('Please upgrade your TensorFlow installation to v1.9.* or later!')
from utils import label_map_util
from utils import visualization_utils as vis_util
#Detection using tensorflow inside write_video function
def write_video():
filename = 'output/teste_v2.avi'
codec = cv2.VideoWriter_fourcc('W', 'M', 'V', '2')
cap = cv2.VideoCapture('pneu_trim2.mp4')
framerate = round(cap.get(5),2)
w = int(cap.get(3))
h = int(cap.get(4))
resolution = (w, h)
VideoFileOutput = cv2.VideoWriter(filename, codec, framerate, resolution)
################################
# # Model preparation
# ## Variables
#
# Any model exported using the `export_inference_graph.py` tool can be loaded here simply by changing `PATH_TO_FROZEN_GRAPH` to point to a new .pb file.
#
# What model to download.
MODEL_NAME = 'training/pneu_incep_step_24887'
print("loading model from " + MODEL_NAME)
# Path to frozen detection graph. This is the actual model that is used for the object detection.
PATH_TO_FROZEN_GRAPH = MODEL_NAME + '/frozen_inference_graph.pb'
# List of the strings that is used to add correct label for each box.
PATH_TO_LABELS = os.path.join('data', 'object-detection.pbtxt')
NUM_CLASSES = 5
# ## Load a (frozen) Tensorflow model into memory.
time_graph = time.time()
print('loading graphs')
detection_graph = tf.Graph()
with detection_graph.as_default():
od_graph_def = tf.GraphDef()
with tf.gfile.GFile(PATH_TO_FROZEN_GRAPH, 'rb') as fid:
serialized_graph = fid.read()
od_graph_def.ParseFromString(serialized_graph)
tf.import_graph_def(od_graph_def, name='')
print("tempo build graph = " + str(time.time() - time_graph))
# ## Loading label map
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)
################################
with tf.Session(graph=detection_graph) as sess:
with detection_graph.as_default():
while (cap.isOpened()):
time_loop = time.time()
print('processing frame number: ' + str(cap.get(1)))
time_captureframe = time.time()
ret, image_np = cap.read()
print("time to capture video frame = " + str(time.time() - time_captureframe))
if (ret != True):
break
# the array based representation of the image will be used later in order to prepare the
# result image with boxes and labels on it.
#image_np = load_image_into_numpy_array(image)
# Expand dimensions since the model expects images to have shape: [1, None, None, 3]
image_np_expanded = np.expand_dims(image_np, axis=0)
image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
# Each box represents a part of the image where a particular object was detected.
boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
# Each score represent how level of confidence for each of the objects.
# Score is shown on the result image, together with the class label.
scores = detection_graph.get_tensor_by_name('detection_scores:0')
classes = detection_graph.get_tensor_by_name('detection_classes:0')
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
# Actual detection.
time_prediction = time.time()
(boxes, scores, classes, num_detections) = sess.run(
[boxes, scores, classes, num_detections],
feed_dict={image_tensor: image_np_expanded})
print("time to predict = " + str(time.time() - time_prediction))
# Visualization of the results of a detection.
time_visualizeboxes = time.time()
vis_util.visualize_boxes_and_labels_on_image_array(
image_np,
np.squeeze(boxes),
np.squeeze(classes).astype(np.int32),
np.squeeze(scores),
category_index,
use_normalized_coordinates=True,
line_thickness=8)
print("time to generate boxes in a frame = " + str(time.time() - time_visualizeboxes))
time_writeframe = time.time()
VideoFileOutput.write(image_np)
print("time to write a frame in video file = " + str(time.time() - time_writeframe))
print("total time in the loop = " + str(time.time() - time_loop))
cap.release()
VideoFileOutput.release()
print('done')
Actually the problem is with the model you were using.
https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md
Basically the model Faster-rcnn-inception-resnet-v2 will take more time.
You can refer the link to know the speed for the model

Tensorflow object detection UnicodeEncodeError

I am trying to use the pretrained faster_rcnn_inception_resnet_v2_atrous_oid. The code is modified from the official Quick Start notebook. When I use other models like faster_rcnn_nas_coco_2017_11_08, everything works. However, when I change to faster_rcnn_inception_resnet_v2_atrous_oid, I got the following error:
runfile('D:/python/tf/models-master/research/object_detection/Learn_faster.py', wdir='D:/python/tf/models-master/research/object_detection')
Reloaded modules: utils, utils.label_map_util, utils.visualization_utils
downloaded
Traceback (most recent call last):
File "e:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2898, in run_code
self.showtraceback()
File "e:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 1826, in showtraceback
self._showtraceback(etype, value, stb)
File "e:\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 554, in _showtraceback
dh.parent_header, ident=topic)
File "e:\Anaconda3\lib\site-packages\jupyter_client\session.py", line 712, in send
to_send = self.serialize(msg, ident)
File "e:\Anaconda3\lib\site-packages\jupyter_client\session.py", line 607, in serialize
content = self.pack(content)
File "e:\Anaconda3\lib\site-packages\jupyter_client\session.py", line 103, in <lambda>
ensure_ascii=False, allow_nan=False,
File "e:\Anaconda3\lib\site-packages\zmq\utils\jsonapi.py", line 43, in dumps
s = s.encode('utf8')
UnicodeEncodeError: 'utf-8' codec can't encode character '\udcd5' in position 2098: surrogates not allowed
The code is:
import numpy as np
import os
import six.moves.urllib as urllib
import tarfile
import tensorflow as tf
from matplotlib import pyplot as plt
from PIL import Image
if tf.__version__ != '1.4.0':
raise ImportError('Please upgrade your tensorflow installation to v1.4.0!')
from utils import label_map_util
from utils import visualization_utils as vis_util
# What model to download.
MODEL_NAME = 'faster_rcnn_inception_resnet_v2_atrous_oid_2017_11_08'#'faster_rcnn_nas_coco_2017_11_08'#'faster_rcnn_resnet101_coco_2017_11_08' #'faster_rcnn_nas_coco_2017_11_08' 'rfcn_resnet101_coco_2017_11_08'# , , 'ssd_inception_v2_coco_2017_11_08'
MODEL_FILE = MODEL_NAME + '.tar.gz'
DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/'
# Path to frozen detection graph. This is the actual model that is used for the object detection.
PATH_TO_CKPT = MODEL_NAME + '/frozen_inference_graph.pb'
# List of the strings that is used to add correct label for each box.
PATH_TO_LABELS = os.path.join('data', 'oid_bbox_trainable_label_map')#'mscoco_label_map.pbtxt')
NUM_CLASSES = 545
opener = urllib.request.URLopener()
opener.retrieve(DOWNLOAD_BASE + MODEL_FILE, MODEL_FILE)
print("downloaded")
tar_file = tarfile.open(MODEL_FILE)
for file in tar_file.getmembers():
file_name = os.path.basename(file.name)
if 'frozen_inference_graph.pb' in file_name:
tar_file.extract(file, os.getcwd())
detection_graph = tf.Graph()
with detection_graph.as_default():
od_graph_def = tf.GraphDef()
with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
serialized_graph = fid.read()
od_graph_def.ParseFromString(serialized_graph)
tf.import_graph_def(od_graph_def, name='')
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)
def load_image_into_numpy_array(image):
(im_width, im_height) = image.size
return np.array(image.getdata()).reshape(
(im_height, im_width, 3)).astype(np.uint8)
# For the sake of simplicity we will use only 2 images:
# image1.jpg
# image2.jpg
# If you want to test the code with your images, just add path to the images to the TEST_IMAGE_PATHS.
PATH_TO_TEST_IMAGES_DIR = 'test_images'
TEST_IMAGE_PATHS = [ os.path.join(PATH_TO_TEST_IMAGES_DIR, 'image{}.jpg'.format(i)) for i in range(1, 7) ]
# Size, in inches, of the output images.
IMAGE_SIZE = (12, 8)
with detection_graph.as_default():
with tf.Session(graph=detection_graph) as sess:
# Definite input and output Tensors for detection_graph
image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
# Each box represents a part of the image where a particular object was detected.
detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
# Each score represent how level of confidence for each of the objects.
# Score is shown on the result image, together with the class label.
detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
for image_path in TEST_IMAGE_PATHS:
image = Image.open(image_path)
# the array based representation of the image will be used later in order to prepare the
# result image with boxes and labels on it.
image_np = load_image_into_numpy_array(image)
# Expand dimensions since the model expects images to have shape: [1, None, None, 3]
image_np_expanded = np.expand_dims(image_np, axis=0)
# Actual detection.
(boxes, scores, classes, num) = sess.run(
[detection_boxes, detection_scores, detection_classes, num_detections],
feed_dict={image_tensor: image_np_expanded})
# Visualization of the results of a detection.
vis_util.visualize_boxes_and_labels_on_image_array(
image_np,
np.squeeze(boxes),
np.squeeze(classes).astype(np.int32),
np.squeeze(scores),
category_index,
use_normalized_coordinates=True,
line_thickness=8)
plt.figure(figsize=IMAGE_SIZE)
plt.imshow(image_np)
PATH_TO_LABELS = os.path.join('data', 'oid_bbox_trainable_label_map')
should be
PATH_TO_LABELS = os.path.join('data', 'oid_bbox_trainable_label_map.pbtxt')

More efficient way of loading images for detection

I am using tensorflow object detection api to do some semi real time object detection tasks.
The images will be taken by camera at a speed of 2 images/sec. Each image will be cropped into 4 small images so in total I need to process 8 images/sec.
My detection model has been exported into a frozen graph (.pb file) and loaded in GPU memory. Then I load images to numpy arrays to feed them into my model.
The detection itself only takes about 0.1 sec/image, however, loading each image takes about 0.45 sec.
The script I am using was revised from the code samples provided by object detection api(link), it reads each image and convert them into numpy array and then feed into detection models. The most time consumming part of this process is load_image_into_numpy_array, it takes almost 0.45 seconds.
The script is in below:
import numpy as np
import os
import six.moves.urllib as urllib
import sys
import tarfile
import tensorflow as tf
import zipfile
import timeit
import scipy.misc
from collections import defaultdict
from io import StringIO
from matplotlib import pyplot as plt
from PIL import Image
from utils import label_map_util
from utils import visualization_utils as vis_util
# Path to frozen detection graph. This is the actual model that is used for the
# object detection.
PATH_TO_CKPT = 'animal_detection.pb'
# List of the strings that is used to add correct label for each box.
PATH_TO_LABELS = os.path.join('data', 'animal_label_map.pbtxt')
NUM_CLASSES = 1
detection_graph = tf.Graph()
with detection_graph.as_default():
od_graph_def = tf.GraphDef()
with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
serialized_graph = fid.read()
od_graph_def.ParseFromString(serialized_graph)
tf.import_graph_def(od_graph_def,name='')
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map,
max_num_classes=NUM_CLASSES,
use_display_name=True)
category_index = label_map_util.create_category_index(categories)
def load_image_into_numpy_array(image):
(im_width, im_height) = image.size
return np.array(image.getdata()).reshape(
(im_height, im_width, 3)).astype(np.uint8)
# For the sake of simplicity we will use only 2 images:
# image1.jpg
# image2.jpg
# If you want to test the code with your images, just add path to the
# images to the TEST_IMAGE_PATHS.
PATH_TO_TEST_IMAGES_DIR = 'test'
TEST_IMAGE_PATHS = [
os.path.join(PATH_TO_TEST_IMAGES_DIR,'image{}.png'.format(i)) for i in range(1, 10) ]
# Size, in inches, of the output images.
IMAGE_SIZE = (12, 8)
config = tf.ConfigProto()
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
with detection_graph.as_default():
with tf.Session(graph=detection_graph, config=config) as sess:
for image_path in TEST_IMAGE_PATHS:
start = timeit.default_timer()
image = Image.open(image_path)
# the array based representation of the image will be used later in order to prepare the
# result image with boxes and labels on it.
image_np = load_image_into_numpy_array(image)
# Expand dimensions since the model expects images to have shape: [1, None, None, 3]
image_np_expanded = np.expand_dims(image_np, axis=0)
image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
end = timeit.default_timer()
print(end-start)
start = timeit.default_timer()
# Each box represents a part of the image where a particular object was detected.
boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
# Each score represent how level of confidence for each of the objects.
# Score is shown on the result image, together with the class label.
scores = detection_graph.get_tensor_by_name('detection_scores:0')
classes = detection_graph.get_tensor_by_name('detection_classes:0')
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
# Actual detection.
(boxes, scores, classes, num_detections) = sess.run(
[boxes, scores, classes, num_detections],
feed_dict={image_tensor: image_np_expanded})
stop = timeit.default_timer()
print (stop - start)
# Visualization of the results of a detection.
vis_util.visualize_boxes_and_labels_on_image_array(
image_np,
np.squeeze(boxes),
np.squeeze(classes).astype(np.int32),
np.squeeze(scores),
category_index,
use_normalized_coordinates=True,
line_thickness=2)
I am thinking of a more efficient way to load images that are produced by camera, the first thought is to avoid numpy array and try to use tensorflow native ways to load images, but I have no idea where to get start since I am very new to tensorflow.
If I could find some tensorflow way to load images, maybe I could take 4 images into 1 batch and feed them into my model so that I might get some improvement in speed.
An immature idea is try to save 4 small images cropped from 1 raw image into a tf_record file, and load tf_record file as one batch to feed the model, but I have no idea how to achieve that.
Any help will be appreciated.
I found one solution that can reduce image loading from 0.4 second to 0.01 second. I will post answer here in case if someone also has same problem.
Instead of using PIL.Image and numpy, we could use imread in opencv.
I also managed to batch images so that we can achieve a better speedup.
The script goes as follow:
import numpy as np
import os
import six.moves.urllib as urllib
import sys
import tensorflow as tf
import timeit
import cv2
from collections import defaultdict
from utils import label_map_util
from utils import visualization_utils as vis_util
MODEL_PATH = sys.argv[1]
IMAGE_PATH = sys.argv[2]
BATCH_SIZE = int(sys.argv[3])
# Path to frozen detection graph. This is the actual model that is used for the
# object detection.
PATH_TO_CKPT = os.path.join(MODEL_PATH, 'frozen_inference_graph.pb')
# List of the strings that is used to add correct label for each box.
PATH_TO_LABELS = os.path.join('data', 'animal_label_map.pbtxt')
NUM_CLASSES = 1
detection_graph = tf.Graph()
with detection_graph.as_default():
od_graph_def = tf.GraphDef()
with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
serialized_graph = fid.read()
od_graph_def.ParseFromString(serialized_graph)
tf.import_graph_def(od_graph_def,name='')
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map,
max_num_classes=NUM_CLASSES,
use_display_name=True)
category_index = label_map_util.create_category_index(categories)
PATH_TO_TEST_IMAGES_DIR = IMAGE_PATH
TEST_IMAGE_PATHS = [
os.path.join(PATH_TO_TEST_IMAGES_DIR,'image{}.png'.format(i)) for i in range(1, 129) ]
config = tf.ConfigProto()
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
with detection_graph.as_default():
with tf.Session(graph=detection_graph, config=config) as sess:
for i in range(0, len(TEST_IMAGE_PATHS), BATCH_SIZE):
images = []
start = timeit.default_timer()
for j in range(0, BATCH_SIZE):
image = cv2.imread(TEST_IMAGE_PATHS[i+j])
image = np.expand_dims(image, axis=0)
images.append(image)
image_np_expanded = np.concatenate(images, axis=0)
image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
# Each box represents a part of the image where a particular object was detected.
boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
# Each score represent how level of confidence for each of the objects.
# Score is shown on the result image, together with the class label.
scores = detection_graph.get_tensor_by_name('detection_scores:0')
classes = detection_graph.get_tensor_by_name('detection_classes:0')
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
# Actual detection.
(boxes, scores, classes, num_detections) = sess.run(
[boxes, scores, classes, num_detections],
feed_dict={image_tensor: image_np_expanded})
stop = timeit.default_timer()
print (stop - start)