My task is to evaluate different Models of the TensorFlow Object detection API for the detection of zebra crossings. I have a dataset of 400 labeled pictures. I already tried the Faster R-CNN based models of the model zoo, and they all perform great. Now I am trying to do the same with any SSD model inside the model zoo, but my results are abysmal.
SSD_resnet50_v1 640x640 results
SSD resnet152 v1 640x640 results
I did not modify the configuration of the SSD besides removing the augmentation, because I do it myself for evaluation purposes. I was following this tutorial to set up my environment: https://tensorflow-object-detection-api-tutorial.readthedocs.io/en/latest/training.html
here is the pipeline.config of the SSD resnet50 v1
model {
ssd {
num_classes: 1
image_resizer {
fixed_shape_resizer {
height: 640
width: 640
}
}
feature_extractor {
type: "ssd_resnet50_v1_fpn_keras"
depth_multiplier: 1.0
min_depth: 16
conv_hyperparams {
regularizer {
l2_regularizer {
weight: 0.00039999998989515007
}
}
initializer {
truncated_normal_initializer {
mean: 0.0
stddev: 0.029999999329447746
}
}
activation: RELU_6
batch_norm {
decay: 0.996999979019165
scale: true
epsilon: 0.0010000000474974513
}
}
override_base_feature_extractor_hyperparams: true
fpn {
min_level: 3
max_level: 7
}
}
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
use_matmul_gather: true
}
}
similarity_calculator {
iou_similarity {
}
}
box_predictor {
weight_shared_convolutional_box_predictor {
conv_hyperparams {
regularizer {
l2_regularizer {
weight: 0.00039999998989515007
}
}
initializer {
random_normal_initializer {
mean: 0.0
stddev: 0.009999999776482582
}
}
activation: RELU_6
batch_norm {
decay: 0.996999979019165
scale: true
epsilon: 0.0010000000474974513
}
}
depth: 256
num_layers_before_predictor: 4
kernel_size: 3
class_prediction_bias_init: -4.599999904632568
}
}
anchor_generator {
multiscale_anchor_generator {
min_level: 3
max_level: 7
anchor_scale: 4.0
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
scales_per_octave: 2
}
}
post_processing {
batch_non_max_suppression {
score_threshold: 9.99999993922529e-09
iou_threshold: 0.6000000238418579
max_detections_per_class: 100
max_total_detections: 100
use_static_shapes: false
}
score_converter: SIGMOID
}
normalize_loss_by_num_matches: true
loss {
localization_loss {
weighted_smooth_l1 {
}
}
classification_loss {
weighted_sigmoid_focal {
gamma: 2.0
alpha: 0.25
}
}
classification_weight: 1.0
localization_weight: 1.0
}
encode_background_as_zeros: true
normalize_loc_loss_by_codesize: true
inplace_batchnorm_update: true
freeze_batchnorm: false
}
}
train_config {
batch_size: 8
sync_replicas: true
optimizer {
momentum_optimizer {
learning_rate {
cosine_decay_learning_rate {
learning_rate_base: 0.03999999910593033
total_steps: 70000
warmup_learning_rate: 0.013333000242710114
warmup_steps: 2000
}
}
momentum_optimizer_value: 0.8999999761581421
}
use_moving_average: false
}
fine_tune_checkpoint: "pre-trained-models/ssd_resnet50_v1_fpn_640x640_coco17_tpu-8/checkpoint/ckpt-0"
num_steps: 70000
startup_delay_steps: 0.0
replicas_to_aggregate: 8
max_number_of_boxes: 100
unpad_groundtruth_tensors: false
fine_tune_checkpoint_type: "detection"
use_bfloat16: false
fine_tune_checkpoint_version: V2
}
train_input_reader {
label_map_path: "annotations/label_map_cw.pbtxt"
tf_record_input_reader {
input_path: "annotations/train.record"
}
}
eval_config {
metrics_set: "coco_detection_metrics"
use_moving_averages: false
}
eval_input_reader {
label_map_path: "annotations/label_map_cw.pbtxt"
shuffle: false
num_epochs: 1
tf_record_input_reader {
input_path: "annotations/test.record"
}
}
Both the Fast-RCNN and the SSD get the same traing data. I know that SSD network are not as accurate as Faster-RCNN networks but i did not expect this results. First I thougt, that ssd's are not good at recognizing zebra crossings but this is not the case. I even trained a new SSD mobilenet v2 in deepdetect (similar to tf) with the same training data and got ~0.43 mAP. This is not great but way better than my results with TF.
Any Ideas what could be the problem?
I am trying to detect the tree species in an images using Faster R-CNN Inception ResNet V2 1024x1024. Problem is that the model cannot detect all the trees in an image. setting the value of first_stage_max_proposals to 1500, only slightly helps. Also adjusting the values of grid_anchor_generator, max_detections_per_class and max_total_detections has not significant effect. Can anyone help me ? Which parameters should I adjust ?
Here is my model config file and one image where model missed most of the trees
enter image description here
# Faster R-CNN with Inception Resnet v2 (no atrous)
# Sync-trained on COCO (with 8 GPUs) with batch size 16 (800x1333 resolution)
# Initialized from Imagenet classification checkpoint
# TF2-Compatible, *Not* TPU-Compatible
#
# Achieves 39.6 mAP on COCO
model {
faster_rcnn {
num_classes: 7
image_resizer {
fixed_shape_resizer {
height: 867
width: 867
}
}
feature_extractor {
type: 'faster_rcnn_inception_resnet_v2_keras'
}
first_stage_anchor_generator {
grid_anchor_generator {
scales:[0.1,0.4,0.45,0.5,0.6,0.7,0.8,0.90,1.0,
1.2,1.25,1.3,1.35,1.4,1.45,1.5,1.6,1.8,1.90,1.95,2.0,
2.2,2.25,2.3,2.35,2.4,2.45,2.5,2.6,2.8,2.90,2.95,3.0,
3.2,3.25,3.3,3.35,3.4,3.45,3.5,3.6,3.8,3.90,3.95,4.0,
4.2,4.25,4.3,4.35,4.4,4.45,4.5,4.6,4.8,4.90,4.95,5.0,
5.2,5.25,5.3,5.35,5.4,5.45,5.5,5.6,5.8,5.90,5.95,6.0,
6.2,6.25,6.3,6.35,6.4,6.45,6.5,6.6,6.8,6.90,6.95,7.0,
7.2,7.25,7.3,7.35,7.4,7.45,7.5,7.6,7.8,7.90,7.95,8.0,
8.2,8.25,8.3,8.35,8.4,8.45,8.5,8.55,8.6,8.8,8.90,8.95,9.0
]
aspect_ratios: [0.2,0.5,0.75,1.0,1.25,1.3,1.35,1.4,1.45,1.5,1.6,1.7,1.75,1.8,1.9,1.95,2.0,2.25,2.5,2.75,3.0,3.5,4.0]
height_stride: 16
width_stride: 16
}
}
first_stage_box_predictor_conv_hyperparams {
op: CONV
regularizer {
l2_regularizer {
weight: 0.0
}
}
initializer {
truncated_normal_initializer {
stddev: 0.01
}
}
}
first_stage_nms_score_threshold: 0.0
first_stage_nms_iou_threshold: 0.4
first_stage_max_proposals: 300
first_stage_localization_loss_weight: 2.0
first_stage_objectness_loss_weight: 1.0
initial_crop_size: 17
maxpool_kernel_size: 1
maxpool_stride: 1
second_stage_box_predictor {
mask_rcnn_box_predictor {
use_dropout: false
dropout_keep_probability: 1.0
fc_hyperparams {
op: FC
regularizer {
l2_regularizer {
weight: 0.0
}
}
initializer {
variance_scaling_initializer {
factor: 1.0
uniform: true
mode: FAN_AVG
}
}
}
}
}
second_stage_post_processing {
batch_non_max_suppression {
score_threshold: 0.0
iou_threshold: 0.25
max_detections_per_class: 100
max_total_detections: 700
}
score_converter: SOFTMAX
}
second_stage_localization_loss_weight: 2.0
second_stage_classification_loss_weight: 1.0
}
}
train_config: {
batch_size: 2
num_steps: 25000
optimizer {
momentum_optimizer: {
learning_rate: {
cosine_decay_learning_rate {
learning_rate_base: 0.0008
total_steps: 25000
warmup_learning_rate: 0.00001
warmup_steps: 2000
}
}
momentum_optimizer_value: 0.9
}
use_moving_average: false
}
gradient_clipping_by_norm: 10.0
fine_tune_checkpoint_version: V2
fine_tune_checkpoint: "faster_rcnn_inception_resnet_v2_1024x1024_coco17_tpu-8/checkpoint/ckpt-0"
fine_tune_checkpoint_type: "detection"
}
train_input_reader: {
label_map_path: "images/labelmap.pbtxt"
tf_record_input_reader {
input_path: "train.record"
}
}
eval_config: {
metrics_set: "coco_detection_metrics"
use_moving_averages: false
batch_size: 1;
}
eval_input_reader: {
label_map_path: "images/labelmap.pbtxt"
shuffle: false
num_epochs: 1
tf_record_input_reader {
input_path: "test.record"
}
}
I am training a model (MobileNet V2 fpn lite) on a dataset of images with tensorflow object detection api. I don't quite understand what type of transfer learning is used as I could't find information in the config file specifically related to it (such as the number of freezed layers).
Here is my config file:
model {
ssd {
num_classes: 1
image_resizer {
fixed_shape_resizer {
height: 320
width: 320
}
}
feature_extractor {
type: "ssd_mobilenet_v2_fpn_keras"
depth_multiplier: 1.0
min_depth: 16
conv_hyperparams {
regularizer {
l2_regularizer {
weight: 3.9999998989515007e-05
}
}
initializer {
random_normal_initializer {
mean: 0.0
stddev: 0.009999999776482582
}
}
activation: RELU_6
batch_norm {
decay: 0.996999979019165
scale: true
epsilon: 0.0010000000474974513
}
}
use_depthwise: true
override_base_feature_extractor_hyperparams: true
fpn {
min_level: 3
max_level: 7
additional_layer_depth: 128
}
}
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
use_matmul_gather: true
}
}
similarity_calculator {
iou_similarity {
}
}
box_predictor {
weight_shared_convolutional_box_predictor {
conv_hyperparams {
regularizer {
l2_regularizer {
weight: 3.9999998989515007e-05
}
}
initializer {
random_normal_initializer {
mean: 0.0
stddev: 0.009999999776482582
}
}
activation: RELU_6
batch_norm {
decay: 0.996999979019165
scale: true
epsilon: 0.0010000000474974513
}
}
depth: 128
num_layers_before_predictor: 4
kernel_size: 3
class_prediction_bias_init: -4.599999904632568
share_prediction_tower: true
use_depthwise: true
}
}
anchor_generator {
multiscale_anchor_generator {
min_level: 3
max_level: 7
anchor_scale: 4.0
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
scales_per_octave: 2
}
}
post_processing {
batch_non_max_suppression {
score_threshold: 9.99999993922529e-09
iou_threshold: 0.6000000238418579
max_detections_per_class: 100
max_total_detections: 100
use_static_shapes: false
}
score_converter: SIGMOID
}
normalize_loss_by_num_matches: true
loss {
localization_loss {
weighted_smooth_l1 {
}
}
classification_loss {
weighted_sigmoid_focal {
gamma: 2.0
alpha: 0.25
}
}
classification_weight: 1.0
localization_weight: 1.0
}
encode_background_as_zeros: true
normalize_loc_loss_by_codesize: true
inplace_batchnorm_update: true
freeze_batchnorm: false
}
}
train_config {
batch_size: 35 #128
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
random_crop_image {
min_object_covered: 0.0
min_aspect_ratio: 0.75
max_aspect_ratio: 3.0
min_area: 0.75
max_area: 1.0
overlap_thresh: 0.0
}
}
sync_replicas: true
optimizer {
momentum_optimizer {
learning_rate {
cosine_decay_learning_rate {
learning_rate_base: 0.03 #0.07999999821186066
total_steps: 50000
warmup_learning_rate: 0.01 #0.026666000485420227
warmup_steps: 1000
}
}
momentum_optimizer_value: 0.8999999761581421
}
use_moving_average: false
}
fine_tune_checkpoint: 'path_to/pre-trained-models/ssd_mobilenet_v2_fpnlite_320x320_coco17_tpu-8/checkpoint/ckpt-0'
num_steps: 50000
startup_delay_steps: 0.0
replicas_to_aggregate: 8
max_number_of_boxes: 100
unpad_groundtruth_tensors: false
fine_tune_checkpoint_type: "detection"
fine_tune_checkpoint_version: V2
}
train_input_reader {
label_map_path: 'path_to/data/object_detection.pbtxt'
tf_record_input_reader {
input_path: 'path_to/data/train.record'
}
}
eval_config {
metrics_set: "coco_detection_metrics"
use_moving_averages: false
}
eval_input_reader {
label_map_path: 'path_to/data/object_detection.pbtxt'
shuffle: false
num_epochs: 1
tf_record_input_reader {
input_path: 'path_to/data/test.record'
}
}
How does transfer learning work with tensorflow object detection api? Can we change some of the parameters to adjust transfer learning?
Here is a related questions: https://stackoverflow.com/questions/62234735/tensorflow-object-detection-api-with-transfer-learning-for-custom-classes-does
If you have checkpoints in the model folder then weights will be initialized according to the weight values saved in the checkpoint and you got a transfer learning! But in this case you use all the layers of pretrained model
I am training tensorflow object detection API model on the custom dataset i.e. License plate dataset. My goal is to deploy this model to the edge device using tensorflow lite so I can't use any RCNN family model. Because, I can't convert any RCNN family object detection model to tensorflow lite model (this is the limitation from tensorflow object detection API). I am using ssd_mobilenet_v2_coco model to train the custom dataset. Following is the code snippet of my config file:
model {
ssd {
num_classes: 1
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
}
}
similarity_calculator {
iou_similarity {
}
}
anchor_generator {
ssd_anchor_generator {
num_layers: 6
min_scale: 0.2
max_scale: 0.95
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
aspect_ratios: 3.0
aspect_ratios: 0.3333
}
}
image_resizer {
fixed_shape_resizer {
height: 300
width: 300
}
}
box_predictor {
convolutional_box_predictor {
min_depth: 0
max_depth: 0
num_layers_before_predictor: 0
use_dropout: false
dropout_keep_probability: 0.8
kernel_size: 1
box_code_size: 4
apply_sigmoid_to_scores: false
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
train: true,
scale: true,
center: true,
decay: 0.9997,
epsilon: 0.001,
}
}
}
}
feature_extractor {
type: 'ssd_mobilenet_v2'
min_depth: 16
depth_multiplier: 1.0
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
train: true,
scale: true,
center: true,
decay: 0.9997,
epsilon: 0.001,
}
}
}
loss {
classification_loss {
weighted_sigmoid {
}
}
localization_loss {
weighted_smooth_l1 {
}
}
hard_example_miner {
num_hard_examples: 3000
iou_threshold: 0.99
loss_type: CLASSIFICATION
max_negatives_per_positive: 3
min_negatives_per_image: 3
}
classification_weight: 1.0
localization_weight: 1.0
}
normalize_loss_by_num_matches: true
post_processing {
batch_non_max_suppression {
score_threshold: 1e-8
iou_threshold: 0.6
max_detections_per_class: 100
max_total_detections: 100
}
score_converter: SIGMOID
}
}
}
train_config: {
batch_size: 24
optimizer {
rms_prop_optimizer: {
learning_rate: {
exponential_decay_learning_rate {
initial_learning_rate: 0.004
decay_steps: 800720
decay_factor: 0.95
}
}
momentum_optimizer_value: 0.9
decay: 0.9
epsilon: 1.0
}
}
fine_tune_checkpoint: "/home/sach/DL/Pycharm_Workspace/TF1.14/License_Plate_F-RCNN/dataset/experiments/training_SSD/ssd_mobilenet_v2_coco_2018_03_29/model.ckpt"
fine_tune_checkpoint_type: "detection"
num_steps: 150000
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
ssd_random_crop {
}
}
}
train_input_reader: {
tf_record_input_reader {
input_path: "/home/sach/DL/Pycharm_Workspace/TF1.14/License_Plate_F-RCNN/dataset/records/training.record"
}
label_map_path: "/home/sach/DL/Pycharm_Workspace/TF1.14/License_Plate_F-RCNN/dataset/records/classes.pbtxt"
}
eval_config: {
num_examples: 488
num_visualizations : 488
}
eval_input_reader: {
tf_record_input_reader {
input_path: "/home/sach/DL/Pycharm_Workspace/TF1.14/License_Plate_F-RCNN/dataset/records/testing.record"
}
label_map_path: "/home/sach/DL/Pycharm_Workspace/TF1.14/License_Plate_F-RCNN/dataset/records/classes.pbtxt"
shuffle: false
num_readers: 1
}
I have total 1932 images (train images: 1444 and val images: 448). I have trained the model for 150000 steps. Following is the output from tensorboard:
DetectionBoxes Precision mAP#0.5 IOU: After 150K steps, the object detection model accuracy (mAP#0.5 IOU) is ~0.97 i.e. 97%. Which seems to be fine at the moment.
Training Loss: After 150K steps, the training loss is ~1.3. This seems to be okay.
Evaluation/Validation Loss: After 150K steps, the evaluation/validation loss is ~3.90 which is pretty high. However, there is huge difference between training and evaluation loss. Is there any overfitting exist? How can I overcome this problem? In my point of view, training and evaluation loss should be close to each other.
How can I reduce validation/evaluation loss?
I am using the default config file so by default use_dropout: false. Should I change it to use_dropout: true in case overfitting exist?
What should be the acceptable range of training and validation loss for object detection model?
Please share your views. Thanking you!
There are several reasons for overfitting problem In Neural networks, by looking at your config file, I would like to suggest a few things to try to avoid overfitting.
use_dropout: true so that it makes the Neurons less sensitive to minor changes in the weights.
Try increasing iou_threshold in batch_non_max_suppression.
Use l1 regularizer or combination of l1 and l2 regularizer.
Change the optimizer to Nadam or Adam Optimizers.
Include more Augmentation techniques.
You can also use Early Stopping to track your accuracy.
Alternatively, you can observe the Tensorboard visualization, take the weights before the step where the validation loss starts increasing.
I hope trying these steps will resolve the overfitting issue of your model.
I'm using the Faster R-CNN Inception Resnet v2 model pre-trained on COCO to train my own object detector with the purpose of detecting objects from 3 classes. The objects are small compared to the size (resolution) of the image. I'm relatively new to ML and OD.
I wonder what changes I should make to the model to make it better fit my purpose. Is it a good idea to decrease the complexity of some parts of the model since I only detect 3 classes? Are there any feature extractors better suited for small objects? Is it generally best to train on a pre-trained model or should I train from scratch?
I'm aware that tuning the network to a specific need is a trial-and-error process, however, since it takes about 3 days to train the network I'm looking for some educated guesses.
Model configuration:
model {
faster_rcnn {
num_classes: 3
image_resizer {
keep_aspect_ratio_resizer {
min_dimension: 600
max_dimension: 4048
}
}
feature_extractor {
type: 'faster_rcnn_inception_resnet_v2'
first_stage_features_stride: 8
}
first_stage_anchor_generator {
# grid_anchor_generator {
# scales: [0.25, 0.5, 1.0, 2.0, 3.0]
# aspect_ratios: [0.25,0.5, 1.0, 2.0]
# height_stride: 8
# width_stride: 8
# }
grid_anchor_generator {
scales: [0.25, 0.5, 1.0, 2.0, 3.0]
aspect_ratios: [1.0, 2.0, 3.0]
height: 64
width: 64
height_stride: 8
width_stride: 8
}
}
first_stage_atrous_rate: 2
first_stage_box_predictor_conv_hyperparams {
op: CONV
regularizer {
l2_regularizer {
weight: 0.01
}
}
initializer {
truncated_normal_initializer {
stddev: 0.01
}
}
}
first_stage_nms_score_threshold: 0.0
first_stage_nms_iou_threshold: 0.4
first_stage_max_proposals: 1000
first_stage_localization_loss_weight: 2.0
first_stage_objectness_loss_weight: 1.0
initial_crop_size: 17
maxpool_kernel_size: 1
maxpool_stride: 1
second_stage_box_predictor {
mask_rcnn_box_predictor {
use_dropout: True
dropout_keep_probability: 0.9
fc_hyperparams {
op: FC
regularizer {
l2_regularizer {
weight: 0.01
}
}
initializer {
variance_scaling_initializer {
factor: 1.0
uniform: true
mode: FAN_AVG
}
}
}
}
}
second_stage_post_processing {
batch_non_max_suppression {
score_threshold: 0.0
iou_threshold: 0.5
max_detections_per_class: 20
max_total_detections: 20
}
score_converter: SOFTMAX
}
second_stage_localization_loss_weight: 2.0
second_stage_classification_loss_weight: 1.0
}
}
train_config: {
batch_size: 1
optimizer {
momentum_optimizer: {
learning_rate: {
manual_step_learning_rate {
initial_learning_rate: 0.00001
schedule {
step: 100000
learning_rate: .000001
}
schedule {
step: 150000
learning_rate: .0000001
}
}
}
momentum_optimizer_value: 0.9
}
use_moving_average: false
}
gradient_clipping_by_norm: 10.0
# PATH_TO_BE_CONFIGURED: Below line needs to match location of model checkpoint: Either use checkpoint from rcnn model, or checkpoint from previously trained model on other dataset.
fine_tune_checkpoint: "/.../model.ckpt"
from_detection_checkpoint: true
# Note: The below line limits the training process to 200K steps, which we
# empirically found to be sufficient enough to train the pets dataset. This
# effectively bypasses the learning rate schedule (the learning rate will
# never decay). Remove the below line to train indefinitely.
# num_steps: 200000
data_augmentation_options {
random_horizontal_flip {}
}
data_augmentation_options {
random_crop_image {
min_object_covered : 1.0
min_aspect_ratio: 0.5
max_aspect_ratio: 2
min_area: 0.2
max_area: 1.
}
}
data_augmentation_options {
random_distort_color {}
}
}
# PATH_TO_BE_CONFIGURED: Need to make sure folder structure below is correct for both train-record and label_map.pbtxt
train_input_reader: {
tf_record_input_reader {
input_path: "/.../train.record"
}
label_map_path: "/..../label_map.pbtxt"
queue_capacity: 500
min_after_dequeue: 250
}
#PATH_TO_BE_CONFIGURED: Make sure folder structure for eval_export, validation.record and label_map.pbtxt below are correct.
eval_config: {
num_examples: 30
# Note: The below line limits the evaluation process to 10 evaluations.
# Remove the below line to evaluate indefinitely.
max_evals: 10
num_visualizations: 30
eval_interval_secs: 600
visualization_export_dir: "/.../eval_export"
}
eval_input_reader: {
tf_record_input_reader {
input_path: "/.../test.record"
}
label_map_path: "/.../label_map.pbtxt"
shuffle: True
num_readers: 1
}