With ImageDataGenerator, I used the following function to preprocess images by passing it through the `preprocessing_function` keyword and then loading the data with .flow_from_dataframe().
However, I am now trying to use image_dataset_from_directory, which has no argument for passing such a preprocessing function.
I've tried applying the preprocess_image() function after the dataset is created by image_dataset_from_directory, using the dataset's .map() method, but that does not work either.
Please could anyone advise?
train_Gen = dataGen.flow_from_dataframe(
directory=os.path.join(data_dir, 'train_images'),
target_size=(IMG_WIDTH, IMG_HEIGHT),
def crop_image_from_gray(img, tol=7):
    """
    Crop away the dark border of an image.

    A mask of pixels brighter than ``tol`` is built, and only the rows and
    columns that contain at least one such pixel are kept. For a 3-channel
    input the same rows/columns are kept in every channel, so the result
    still has 3 channels.

    :param img: A NumPy array that will be cropped (2-D single channel,
        or 3-D RGB)
    :param tol: The tolerance used for masking
    :return: A NumPy array containing the cropped image
    """
    # Single-channel (2-D) image: mask directly on the pixel values
    if img.ndim == 2:
        mask = img > tol
        return img[np.ix_(mask.any(1), mask.any(0))]
    # Normal RGB image: build the mask on a grayscale copy
    elif img.ndim == 3:
        gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
        mask = gray_img > tol
        if not mask.any():
            # Image is too dark, so we would crop out everything:
            # return the original image instead.
            return img
        # Indexing the first two axes with the open mesh keeps the channel
        # axis intact, which is the same as cropping each channel
        # separately and stacking the three results on the last axis.
        keep = np.ix_(mask.any(1), mask.any(0))
        return img[keep]
def preprocess_image(image, sigmaX=10):
    """
    The whole preprocessing pipeline:
    1. Convert the image from BGR to RGB channel order
    2. Apply masks (crop the dark border with crop_image_from_gray)
    3. Resize image to desired size (IMG_WIDTH x IMG_HEIGHT)
    4. Subtract a Gaussian-blurred copy from the image to emphasise
       local detail

    :param image: A NumPy array that will be preprocessed
    :param sigmaX: Value used for the GaussianBlur applied to the image
    :return: A NumPy array containing the preprocessed image
    """
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = crop_image_from_gray(image)
    image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT))
    # Weighted sum: 4 * image - 4 * blurred + 128
    blurred = cv2.GaussianBlur(image, (0, 0), sigmaX)
    image = cv2.addWeighted(image, 4, blurred, -4, 128)
    return image
I am using the TensorFlow Object Detection API on my Windows system, for which I've built a custom object detection classifier. It detects the object very well with the webcam feed, but I am trying to figure out how I can detect objects from the webcam with a unique object ID for every detected object.
Say, for example, if the webcam detects two similar objects (say 2 similar chairs) then it draws a bounding box on each chair. I want to track both the chairs with a unique ID and, when I subsequently extract a video frame, get the centroid of both the chairs.
Currently I am using this code:
# Import packages
import requests
import os
from firebase import firebase
import cv2
import numpy as np
import tensorflow as tf
import sys
# This is needed since the notebook is stored in the object_detection folder.
# Import utilites
from object_detection.utils import label_map_util
from object_detection.utils import visualization_utils as vis_util
# Name of the directory containing the object detection module we're using
MODEL_NAME = 'inference_graph'
# Grab path to current working directory
CWD_PATH = os.getcwd()
# Path to frozen detection graph .pb file, which contains the model that is used
# for object detection.
PATH_TO_CKPT = os.path.join(CWD_PATH, MODEL_NAME, 'frozen_inference_graph.pb')
# Path to label map file
PATH_TO_LABELS = os.path.join(CWD_PATH, 'training', 'labelmap.pbtxt')
# Number of classes the object detector can identify
# Load the label map.
# Label maps map indices to category names, so that when our convolution
# network predicts `5`, we know that this corresponds to `king`.
# Here we use internal utility functions, but anything that returns a
# dictionary mapping integers to appropriate string labels would be fine
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES,
category_index = label_map_util.create_category_index(categories)
# Load the Tensorflow model into memory.
detection_graph = tf.Graph()
with detection_graph.as_default():
od_graph_def = tf.GraphDef()
with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
serialized_graph =
tf.import_graph_def(od_graph_def, name='')
sess = tf.Session(graph=detection_graph)
# Define input and output tensors (i.e. data) for the object detection classifier
# Input tensor is the image
image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
# Output tensors are the detection boxes, scores, and classes
# Each box represents a part of the image where a particular object was detected
detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
# Each score represents level of confidence for each of the objects.
# The score is shown on the result image, together with the class label.
detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
# Number of objects detected
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
# Initialize webcam feed
video = cv2.VideoCapture(1)
img_counter = 0
while True:
# Acquire frame and expand frame dimensions to have shape: [1, None, None, 3]
# i.e. a single-column array, where each item in the column has the pixel RGB value
ret, frame =
frame_expanded = np.expand_dims(frame, axis=0)
# Perform the actual detection by running the model with the image as input
(boxes, scores, classes, num) =
[detection_boxes, detection_scores, detection_classes, num_detections],
feed_dict={image_tensor: frame_expanded})
# Draw the results of the detection (aka 'visualize the results')
# All the results have been drawn on the frame, so it's time to display it.
frame =, (350, 350), 1, (0, 0, 255), 5)
#frame =, (337, 139), 1, (0, 0, 255), 5)
cv2.imshow('Object detector', frame)
if not ret:
k = cv2.waitKey(1)
if k % 256 == 27:
# ESC pressed
print("Escape hit, closing...")
elif k % 256 == ord('s'):
# 's' pressed
img_name = "opencv_frame_{}.jpg".format(img_counter)
print("{} written!".format(img_name))
img_counter += 1
# Code to find the centroid of the bounding box on the object detected
height = frame.shape[0]
width = frame.shape[1]
min_score_thresh = 0.90
true_boxes = boxes[0][scores[0] > min_score_thresh]
for i in range(true_boxes.shape[0]):
ymin = int(true_boxes[i][0] * height)
xmin = int(true_boxes[i][1] * width)
ymax = int(true_boxes[i][2] * height)
xmax = int(true_boxes[i][3] * width)
y = int((ymin + ymax) / 2)
x = int((xmin + xmax) / 2)
print(x, y)
frame =, (xmin, ymin), 1, (0, 0, 255), 3)
frame =, (xmax, ymin), 1, (0, 0, 255), 3)
frame =, (xmin, ymax), 1, (0, 0, 255), 3)
frame =, (xmax, ymax), 1, (0, 0, 255), 3)
frame =, (x, y), 1, (0, 255, 255), 5)
cv2.imshow(img_name, frame)
# Clean up
But this only gives me the centroid of the chair with highest confidence score instead of all the chairs which are detected.
How can I modify my code to keep track of each object with a unique ID and then, when the frames are extracted, get the centroid of each chair? Ideally the answer should be scalable, so that 3 objects detected gives 3 unique IDs and 3 centroids.
Finally: does the track_ids parameter in the visualize_boxes_and_labels_on_image_array function help to keep track of the object and its bounding box? If so, how should it be used?
I think what you're looking for is an object tracking algorithm. Try using SORT (Simple Online and Realtime Tracking) or another tracking algorithm. You can just pass your detections (bounding box coordinates) to the tracker, and it returns the bounding box along with a unique ID for each tracked object.
I used the TensorFlow Object Detection API to detect multiple objects in my videos. However, I have been struggling to figure out how to write the resulting detections to a text/CSV/XML file (basically the bounding box information, the frame number within the image sequence, and the confidence of each bounding box).
I've seen several answers in stackoverflow and github but most of them were either vague or I just could not get the exact answer I'm looking for.
Shown below is the last part of the detection code. I know that detection_boxes and detection_scores are what I need, but I cannot figure out how to write them to a text file, nor how to write only the final bounding boxes that are drawn on the images rather than ALL detected bounding boxes.
for image_path in TEST_IMAGE_PATHS:
image = # the array based representation of the image will be used later in order to prepare the result image with boxes and labels on it.
image_np = load_image_into_numpy_array(image) # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
image_np_expanded = np.expand_dims(image_np, axis=0) # Actual detection.
output_dict = run_inference_for_single_image(image_np_expanded, detection_graph) # Visualization of the results of a detection.
line_thickness=8) plt.figure(figsize=IMAGE_SIZE)
You can try the following code
image =
# the array based representation of the image will be used later in order to prepare the
# result image with boxes and labels on it.
image_np = load_image_into_numpy_array(image)
# Expand dimensions since the model expects images to have shape: [1, None, None, 3]
image_np_expanded = np.expand_dims(image_np, axis=0)
boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
scores = detection_graph.get_tensor_by_name('detection_scores:0')
classes = detection_graph.get_tensor_by_name('detection_classes:0')
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
(boxes, scores, classes, num_detections) =
[boxes, scores, classes, num_detections],
feed_dict={image_tensor: image_np_expanded})
width = 1024
height = 600
threshold = 0.5
temp = [] # list to store scores greater than 0.5
# iterate through all scores and pick those having a score greater than 0.5
for index,value in enumerate(classes[0]):
if scores[0,index] > threshold:
# Similarly, classes[0,index] will give you the class of the bounding box detection
# Actual detection.
output_dict = run_inference_for_single_image(image_np, detection_graph)
# For printing the bounding box coordinates
for i,j in zip(output_dict['detection_boxes'],output_dict['detection_scores']):
The above code snippet will provide you with the bounding box coordinates and the detection scores. You can use a minimum threshold to filter unnecessary detections. I hope this helps you out. Also, I could not quite understand what you meant by frame number. Could you please elucidate further on what you actually mean by this.
Please let me know if you face any issues
I am using the VGG16 model, which expects a 4D tensor as input. When I call model.fit(xtrain, ytrain, ...), my xtrain is a list of 3D tensors [size, size, features] - so in this case: [224, 224, 3].
What I want is 4D Tensors with [len(images), size, size, features]
How could I modify my code to get there?
I tried tf.expand_dims and tf.concat, but it didn't work.
# Transforming my image to a 3D Tensor
image =
image = tf.image.decode_jpeg(image, channels=3)
image = tf.image.resize(image, [IMG_SIZE, IMG_SIZE])
image = image / 255.0
Error message after calling model.fit():
Error when checking input: expected input_1 to have 4 dimensions, but got array with shape (224, 224, 3)
It looks like you are reading in only a single image and passing that. If that's the case, you can add a dimension of size 1 as the first axis of the image. There are lots of ways to do that.
Using reshape:
image = image.reshape(1, 224, 224, 3)
Using some fancy numpy slicing notation to add an axis (personal favorite):
image = image[None, ...]
Using numpy.expand_dims() as explained in Abhijit's answer.
I imagine you want to be reading a bunch of images in though. Possibly an issue with your input process? Can you wrap your read in a loop and read multiple files? Something like:
images = []
for file in image_files:
image =
# ...
images = np.asarray(images)
numpy.expand_dims(image, axis=0)
Is there any way to use pre-trained models in Object Detection API of Tensorflow, which trained for RGB images, for single channel grayscale images(depth) ?
I tried the following approach to perform object detection on Grayscale (1 Channel images) using a pre-trained model (faster_rcnn_resnet101_coco_11_06_2017) in Tensorflow. It did work for me.
The model was trained on RGB Images, So I just had to modify certain code in object_detection_tutorial.ipynb, available in the Tensorflow Repo.
First Change:
Note that the existing code in the ipynb was written for 3-channel images, so change the load_image_into_numpy_array function from the first version shown below to the second:
# Tutorial version of the loader (the "before" code): the reshape below
# hard-codes 3 channels, so it only works when every pixel has 3 values.
def load_image_into_numpy_array(image):
# image.size is unpacked as (width, height); presumably `image` is a PIL image -- TODO confirm
(im_width, im_height) = image.size
# Reshape the flat pixel data to (height, width, 3) and cast it to uint8
return np.array(image.getdata()).reshape(
(im_height, im_width, 3)).astype(np.uint8)
def load_image_into_numpy_array(image):
    """
    Convert an image object into a (height, width, channels) uint8 array.

    The channel count is looked up from ``image.mode``: 1 for 'L'
    (grayscale) and 3 for 'RGB'. Any other mode raises ``KeyError``.

    :param image: Object exposing ``size`` as (width, height), ``mode``
        and ``getdata()``
    :return: A NumPy uint8 array of shape (height, width, channels)
    """
    im_width, im_height = image.size
    # 'L' for Grayscale, 'RGB' : for 3 channel images
    channel_dict = {'L': 1, 'RGB': 3}
    pixels = np.array(image.getdata())
    n_channels = channel_dict[image.mode]
    target_shape = (im_height, im_width, n_channels)
    return pixels.reshape(target_shape).astype(np.uint8)
Second Change: Grayscale images have only data in 1 channel. To perform object detection we need 3 channels(the inference code was written for 3 channels)
This can be achieved in two ways.
a) Duplicate the single channel data into two more channels
b) Fill the other two channels with Zeros.
Both of them will work, I used the first method
In the ipynb, go to the section where you read the images and convert them into numpy arrays (the for loop at the end of the ipynb).
Change the code From:
for image_path in TEST_IMAGE_PATHS:
image =
# the array based representation of the image will be used later in order to prepare the
# result image with boxes and labels on it.
image_np = load_image_into_numpy_array(image)
# Expand dimensions since the model expects images to have shape: [1, None, None, 3]
image_np_expanded = np.expand_dims(image_np, axis=0)
To this:
for image_path in TEST_IMAGE_PATHS:
image =
# the array based representation of the image will be used later in order to prepare the
# result image with boxes and labels on it.
image_np = load_image_into_numpy_array(image)
if image_np.shape[2] != 3:
image_np = np.broadcast_to(image_np, (image_np.shape[0], image_np.shape[1], 3)).copy() # Duplicating the Content
## adding Zeros to other Channels
## This adds Red Color stuff in background -- not recommended
# z = np.zeros(image_np.shape[:-1] + (2,), dtype=image_np.dtype)
# image_np = np.concatenate((image_np, z), axis=-1)
# Expand dimensions since the model expects images to have shape: [1, None, None, 3]
image_np_expanded = np.expand_dims(image_np, axis=0)
That's it, Run the file and you should see the results.
