TensorFlow Object Detection API doesn't restore checkpoint for fine-tuning - tensorflow

I am trying to re-train EfficientDet D4 from the TensorFlow Model Zoo (http://download.tensorflow.org/models/object_detection/tf2/20200711/efficientdet_d4_coco17_tpu-32.tar.gz) on my own dataset.
The tutorial says that you might see a log like this when running model_main_tf2 to fine-tune the model:
W0716 05:24:19.108539 1364 util.py:151] A checkpoint was restored (e.g. tf.train.Checkpoint.restore or tf.keras.Model.load_weights) but not all checkpointed values were used. See above for specific issues. Use expect_partial() on the load status object, e.g. tf.train.Checkpoint.restore(...).expect_partial(), to silence these warnings, or use assert_consumed() to make the check explicit. See https://www.tensorflow.org/guide/checkpoint#loading_mechanics for details.
WARNING:tensorflow:num_readers has been reduced to 1 to match input file shards.
INFO:tensorflow:Step 100 per-step time 1.153s loss=0.761
I0716 05:26:55.879558 1364 model_lib_v2.py:632] Step 100 per-step time 1.153s loss=0.761
But I don't see it.
Am I doing something wrong when loading the pre-training checkpoint?
The configuration file I am using is the following:
model {
ssd {
inplace_batchnorm_update: true
freeze_batchnorm: false
num_classes: 8
add_background_class: false
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
use_matmul_gather: true
}
}
similarity_calculator {
iou_similarity {
}
}
encode_background_as_zeros: true
anchor_generator {
multiscale_anchor_generator {
min_level: 3
max_level: 7
anchor_scale: 4.0
aspect_ratios: [1.0, 2.0, 0.5]
scales_per_octave: 3
}
}
image_resizer {
keep_aspect_ratio_resizer {
min_dimension: 1024
max_dimension: 1024
pad_to_max_dimension: true
}
}
box_predictor {
weight_shared_convolutional_box_predictor {
depth: 224
class_prediction_bias_init: -4.6
conv_hyperparams {
force_use_bias: true
activation: SWISH
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
random_normal_initializer {
stddev: 0.01
mean: 0.0
}
}
batch_norm {
scale: true
decay: 0.99
epsilon: 0.001
}
}
num_layers_before_predictor: 4
kernel_size: 3
use_depthwise: true
}
}
feature_extractor {
type: 'ssd_efficientnet-b4_bifpn_keras'
bifpn {
min_level: 3
max_level: 7
num_iterations: 7
num_filters: 224
}
conv_hyperparams {
force_use_bias: true
activation: SWISH
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
scale: true,
decay: 0.99,
epsilon: 0.001,
}
}
}
loss {
classification_loss {
weighted_sigmoid_focal {
alpha: 0.25
gamma: 1.5
}
}
localization_loss {
weighted_smooth_l1 {
}
}
classification_weight: 1.0
localization_weight: 1.0
}
normalize_loss_by_num_matches: true
normalize_loc_loss_by_codesize: true
post_processing {
batch_non_max_suppression {
score_threshold: 1e-8
iou_threshold: 0.5
max_detections_per_class: 100
max_total_detections: 100
}
score_converter: SIGMOID
}
}
}
train_config: {
fine_tune_checkpoint: "/home/models/efd4/checkpoint/ckpt-0"
fine_tune_checkpoint_version: V2
fine_tune_checkpoint_type: "detection"
batch_size: 1
sync_replicas: true
startup_delay_steps: 0
replicas_to_aggregate: 8
use_bfloat16: true
num_steps: 2000
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
random_scale_crop_and_pad_to_square {
output_size: 1024
scale_min: 0.1
scale_max: 2.0
}
}
optimizer {
momentum_optimizer: {
learning_rate: {
cosine_decay_learning_rate {
learning_rate_base: 0.002
total_steps: 2000
warmup_learning_rate: .0001
warmup_steps: 500
}
}
momentum_optimizer_value: 0.9
}
use_moving_average: false
}
max_number_of_boxes: 100
unpad_groundtruth_tensors: false
}
train_input_reader: {
label_map_path: "/home/labels/label_map.txt"
tf_record_input_reader {
input_path: "/home/records/train.tfrecord"
}
}
eval_config: {
metrics_set: "coco_detection_metrics"
use_moving_averages: false
batch_size: 1;
}
eval_input_reader: {
label_map_path: "/home/labels/label_map.txt"
shuffle: false
num_epochs: 1
tf_record_input_reader {
input_path: "/home/records/validation.tfrecord"
}
}
When I make use of model_main_tf2 to start training, no error appears. However, when I check the model accuracy, it does not detect anything.
Average Precision (AP) #[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.50 | area= all | maxDets=100 ] = 0.001
Average Precision (AP) #[ IoU=0.75 | area= all | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.021
Average Precision (AP) #[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.010
Average Recall (AR) #[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.001
Average Recall (AR) #[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.002
I tried modifying parameters such as the learning rate and the number of epochs, but it didn't help.
To fine-tune this model, I followed the steps described in the following guide (https://tensorflow-object-detection-api-tutorial.readthedocs.io/en/latest/training.html).
python /home/drive/MyDrive/VISDRONE/model_main_tf2.py \
--pipeline_config_path={pipeline_file} \
--model_dir={model_dir} \
--alsologtostderr \
--num_train_steps={num_steps} \
--sample_1_of_n_eval_examples=1 \
--num_eval_steps={num_eval_steps}
2022-03-24 14:56:39.530945: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
I0324 14:56:39.539781 140467518502784 mirrored_strategy.py:374] Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
INFO:tensorflow:Maybe overwriting train_steps: 2000
I0324 14:56:39.543960 140467518502784 config_util.py:552] Maybe overwriting train_steps: 2000
INFO:tensorflow:Maybe overwriting use_bfloat16: False
I0324 14:56:39.544119 140467518502784 config_util.py:552] Maybe overwriting use_bfloat16: False
I0324 14:56:39.553249 140467518502784 ssd_efficientnet_bifpn_feature_extractor.py:146] EfficientDet EfficientNet backbone version: efficientnet-b4
I0324 14:56:39.553378 140467518502784 ssd_efficientnet_bifpn_feature_extractor.py:147] EfficientDet BiFPN num filters: 224
I0324 14:56:39.553517 140467518502784 ssd_efficientnet_bifpn_feature_extractor.py:149] EfficientDet BiFPN num iterations: 7
I0324 14:56:39.558310 140467518502784 efficientnet_model.py:144] round_filter input=32 output=48
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0324 14:56:39.580137 140467518502784 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0324 14:56:39.582051 140467518502784 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0324 14:56:39.584519 140467518502784 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0324 14:56:39.585638 140467518502784 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0324 14:56:39.592988 140467518502784 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0324 14:56:39.597373 140467518502784 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0324 14:56:39.603657 140467518502784 efficientnet_model.py:144] round_filter input=32 output=48
I0324 14:56:39.603788 140467518502784 efficientnet_model.py:144] round_filter input=16 output=24
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0324 14:56:39.619617 140467518502784 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0324 14:56:39.620819 140467518502784 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0324 14:56:39.623020 140467518502784 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0324 14:56:39.624058 140467518502784 cross_device_ops.py:618] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0324 14:56:39.829434 140467518502784 efficientnet_model.py:144] round_filter input=16 output=24
I0324 14:56:39.829590 140467518502784 efficientnet_model.py:144] round_filter input=24 output=32
I0324 14:56:40.442389 140467518502784 efficientnet_model.py:144] round_filter input=24 output=32
I0324 14:56:40.442584 140467518502784 efficientnet_model.py:144] round_filter input=40 output=56
I0324 14:56:41.058132 140467518502784 efficientnet_model.py:144] round_filter input=40 output=56
I0324 14:56:41.058324 140467518502784 efficientnet_model.py:144] round_filter input=80 output=112
I0324 14:56:41.971299 140467518502784 efficientnet_model.py:144] round_filter input=80 output=112
I0324 14:56:41.971578 140467518502784 efficientnet_model.py:144] round_filter input=112 output=160
I0324 14:56:42.896141 140467518502784 efficientnet_model.py:144] round_filter input=112 output=160
I0324 14:56:42.896331 140467518502784 efficientnet_model.py:144] round_filter input=192 output=272
I0324 14:56:44.146403 140467518502784 efficientnet_model.py:144] round_filter input=192 output=272
I0324 14:56:44.146590 140467518502784 efficientnet_model.py:144] round_filter input=320 output=448
I0324 14:56:44.446191 140467518502784 efficientnet_model.py:144] round_filter input=1280 output=1792
I0324 14:56:44.504505 140467518502784 efficientnet_model.py:454] Building model efficientnet with params ModelConfig(width_coefficient=1.4, depth_coefficient=1.8, resolution=380, dropout_rate=0.4, blocks=(BlockConfig(input_filters=32, output_filters=16, kernel_size=3, num_repeat=1, expand_ratio=1, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=16, output_filters=24, kernel_size=3, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=24, output_filters=40, kernel_size=5, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=40, output_filters=80, kernel_size=3, num_repeat=3, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=80, output_filters=112, kernel_size=5, num_repeat=3, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=112, output_filters=192, kernel_size=5, num_repeat=4, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=192, output_filters=320, kernel_size=3, num_repeat=1, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise')), stem_base_filters=32, top_base_filters=1280, activation='simple_swish', batch_norm='default', bn_momentum=0.99, bn_epsilon=0.001, weight_decay=5e-06, drop_connect_rate=0.2, depth_divisor=8, min_depth=None, use_se=True, input_channels=3, num_classes=1000, model_name='efficientnet', rescale_input=False, data_format='channels_last', dtype='float32')
WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/object_detection/model_lib_v2.py:564: StrategyBase.experimental_distribute_datasets_from_function (from tensorflow.python.distribute.distribute_lib) is deprecated and will be removed in a future version.
Instructions for updating:
rename to distribute_datasets_from_function
W0324 14:56:44.738715 140467518502784 deprecation.py:343] From /usr/local/lib/python3.7/dist-packages/object_detection/model_lib_v2.py:564: StrategyBase.experimental_distribute_datasets_from_function (from tensorflow.python.distribute.distribute_lib) is deprecated and will be removed in a future version.
Instructions for updating:
rename to distribute_datasets_from_function
INFO:tensorflow:Reading unweighted datasets: ['/content/drive/MyDrive/VISDRONE/train.record']
I0324 14:56:44.751177 140467518502784 dataset_builder.py:163] Reading unweighted datasets: ['/content/drive/MyDrive/VISDRONE/train.record']
INFO:tensorflow:Reading record datasets for input file: ['/content/drive/MyDrive/VISDRONE/train.record']
I0324 14:56:44.751728 140467518502784 dataset_builder.py:80] Reading record datasets for input file: ['/content/drive/MyDrive/VISDRONE/train.record']
INFO:tensorflow:Number of filenames to read: 1
I0324 14:56:44.751873 140467518502784 dataset_builder.py:81] Number of filenames to read: 1
WARNING:tensorflow:num_readers has been reduced to 1 to match input file shards.
W0324 14:56:44.752046 140467518502784 dataset_builder.py:88] num_readers has been reduced to 1 to match input file shards.
WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/object_detection/builders/dataset_builder.py:105: parallel_interleave (from tensorflow.python.data.experimental.ops.interleave_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.AUTOTUNE)` instead. If sloppy execution is desired, use `tf.data.Options.deterministic`.
W0324 14:56:44.754448 140467518502784 deprecation.py:343] From /usr/local/lib/python3.7/dist-packages/object_detection/builders/dataset_builder.py:105: parallel_interleave (from tensorflow.python.data.experimental.ops.interleave_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.AUTOTUNE)` instead. If sloppy execution is desired, use `tf.data.Options.deterministic`.
WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/object_detection/builders/dataset_builder.py:237: DatasetV1.map_with_legacy_function (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.data.Dataset.map()
W0324 14:56:44.776529 140467518502784 deprecation.py:343] From /usr/local/lib/python3.7/dist-packages/object_detection/builders/dataset_builder.py:237: DatasetV1.map_with_legacy_function (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.data.Dataset.map()
WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/util/dispatch.py:1082: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
W0324 14:56:49.483746 140467518502784 deprecation.py:343] From /usr/local/lib/python3.7/dist-packages/tensorflow/python/util/dispatch.py:1082: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/util/dispatch.py:1082: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.
W0324 14:56:52.317593 140467518502784 deprecation.py:343] From /usr/local/lib/python3.7/dist-packages/tensorflow/python/util/dispatch.py:1082: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.
/usr/local/lib/python3.7/dist-packages/keras/backend.py:450: UserWarning: `tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
warnings.warn('`tf.keras.backend.set_learning_phase` is deprecated and '
WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/util/deprecation.py:616: calling map_fn_v2 (from tensorflow.python.ops.map_fn) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Use fn_output_signature instead
W0324 14:57:59.473496 140462682519296 deprecation.py:547] From /usr/local/lib/python3.7/dist-packages/tensorflow/python/util/deprecation.py:616: calling map_fn_v2 (from tensorflow.python.ops.map_fn) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Use fn_output_signature instead
WARNING:tensorflow:Gradients do not exist for variables ['stack_6/block_1/expand_bn/gamma:0', 'stack_6/block_1/expand_bn/beta:0', 'stack_6/block_1/depthwise_conv2d/depthwise_kernel:0', 'stack_6/block_1/depthwise_bn/gamma:0', 'stack_6/block_1/depthwise_bn/beta:0', 'stack_6/block_1/project_bn/gamma:0', 'stack_6/block_1/project_bn/beta:0', 'top_bn/gamma:0', 'top_bn/beta:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss`argument?
W0324 14:58:17.434093 140462682519296 utils.py:80] Gradients do not exist for variables ['stack_6/block_1/expand_bn/gamma:0', 'stack_6/block_1/expand_bn/beta:0', 'stack_6/block_1/depthwise_conv2d/depthwise_kernel:0', 'stack_6/block_1/depthwise_bn/gamma:0', 'stack_6/block_1/depthwise_bn/beta:0', 'stack_6/block_1/project_bn/gamma:0', 'stack_6/block_1/project_bn/beta:0', 'top_bn/gamma:0', 'top_bn/beta:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss`argument?
WARNING:tensorflow:Gradients do not exist for variables ['stack_6/block_1/expand_bn/gamma:0', 'stack_6/block_1/expand_bn/beta:0', 'stack_6/block_1/depthwise_conv2d/depthwise_kernel:0', 'stack_6/block_1/depthwise_bn/gamma:0', 'stack_6/block_1/depthwise_bn/beta:0', 'stack_6/block_1/project_bn/gamma:0', 'stack_6/block_1/project_bn/beta:0', 'top_bn/gamma:0', 'top_bn/beta:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss`argument?
W0324 14:58:42.918556 140462682519296 utils.py:80] Gradients do not exist for variables ['stack_6/block_1/expand_bn/gamma:0', 'stack_6/block_1/expand_bn/beta:0', 'stack_6/block_1/depthwise_conv2d/depthwise_kernel:0', 'stack_6/block_1/depthwise_bn/gamma:0', 'stack_6/block_1/depthwise_bn/beta:0', 'stack_6/block_1/project_bn/gamma:0', 'stack_6/block_1/project_bn/beta:0', 'top_bn/gamma:0', 'top_bn/beta:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss`argument?
WARNING:tensorflow:Gradients do not exist for variables ['stack_6/block_1/expand_bn/gamma:0', 'stack_6/block_1/expand_bn/beta:0', 'stack_6/block_1/depthwise_conv2d/depthwise_kernel:0', 'stack_6/block_1/depthwise_bn/gamma:0', 'stack_6/block_1/depthwise_bn/beta:0', 'stack_6/block_1/project_bn/gamma:0', 'stack_6/block_1/project_bn/beta:0', 'top_bn/gamma:0', 'top_bn/beta:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss`argument?
W0324 14:59:06.517044 140462682519296 utils.py:80] Gradients do not exist for variables ['stack_6/block_1/expand_bn/gamma:0', 'stack_6/block_1/expand_bn/beta:0', 'stack_6/block_1/depthwise_conv2d/depthwise_kernel:0', 'stack_6/block_1/depthwise_bn/gamma:0', 'stack_6/block_1/depthwise_bn/beta:0', 'stack_6/block_1/project_bn/gamma:0', 'stack_6/block_1/project_bn/beta:0', 'top_bn/gamma:0', 'top_bn/beta:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss`argument?
WARNING:tensorflow:Gradients do not exist for variables ['stack_6/block_1/expand_bn/gamma:0', 'stack_6/block_1/expand_bn/beta:0', 'stack_6/block_1/depthwise_conv2d/depthwise_kernel:0', 'stack_6/block_1/depthwise_bn/gamma:0', 'stack_6/block_1/depthwise_bn/beta:0', 'stack_6/block_1/project_bn/gamma:0', 'stack_6/block_1/project_bn/beta:0', 'top_bn/gamma:0', 'top_bn/beta:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss`argument?
W0324 14:59:31.055212 140462682519296 utils.py:80] Gradients do not exist for variables ['stack_6/block_1/expand_bn/gamma:0', 'stack_6/block_1/expand_bn/beta:0', 'stack_6/block_1/depthwise_conv2d/depthwise_kernel:0', 'stack_6/block_1/depthwise_bn/gamma:0', 'stack_6/block_1/depthwise_bn/beta:0', 'stack_6/block_1/project_bn/gamma:0', 'stack_6/block_1/project_bn/beta:0', 'top_bn/gamma:0', 'top_bn/beta:0'] when minimizing the loss. If you're using `model.compile()`, did you forget to provide a `loss`argument?
INFO:tensorflow:Step 100 per-step time 4.057s
I0324 15:04:44.796877 140467518502784 model_lib_v2.py:707] Step 100 per-step time 4.057s
INFO:tensorflow:{'Loss/classification_loss': 1.0777053,
'Loss/localization_loss': 0.71329135,
'Loss/regularization_loss': 0.048915524,
'Loss/total_loss': 1.8399122,
'learning_rate': 0.0002}
I0324 15:04:44.797298 140467518502784 model_lib_v2.py:708] {'Loss/classification_loss': 1.0777053,
'Loss/localization_loss': 0.71329135,
'Loss/regularization_loss': 0.048915524,
'Loss/total_loss': 1.8399122,
'learning_rate': 0.0002}

Try changing this part:
fine_tune_checkpoint_type: "detection"
to:
fine_tune_checkpoint_type: "full"

Related

Error while finding Mean Average Precision of a trained model using Tensorflow Object Detection API

I am trying to find the Mean Average Precision of a model that I trained using the TensorFlow Object Detection API. I am using the EfficientDet model (using Google Colab).
This issue happens only when I am trying to evaluate the model and get the performance metrics.
There is no issue when I use the trained model to actually identify objects in test images. In fact, I get the final output with bounding boxes and acceptable performance.
I am using the below code to get the model evaluation:
!python /content/gdrive/MyDrive/content/models/research/object_detection/model_main_tf2.py \
--pipeline_config_path={pipeline_file} \
--model_dir={model_dir} \
--checkpoint_dir={model_dir}
These are the paths:
pipeline_file = '/content/gdrive/MyDrive/content/models/research/deploy/pipeline_file.config'
model_dir = '/content/gdrive/MyDrive/content/log_files_barca_bayern'
Log while running the evaluation code:
2022-11-13 16:36:37.304526: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-13 16:36:38.053407: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
2022-11-13 16:36:38.053514: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
2022-11-13 16:36:38.053532: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
WARNING:tensorflow:Forced number of epochs for all eval validations to be 1.
W1113 16:36:40.238175 139810192238464 model_lib_v2.py:1090] Forced number of epochs for all eval validations to be 1.
INFO:tensorflow:Maybe overwriting sample_1_of_n_eval_examples: None
I1113 16:36:40.238409 139810192238464 config_util.py:552] Maybe overwriting sample_1_of_n_eval_examples: None
INFO:tensorflow:Maybe overwriting use_bfloat16: False
I1113 16:36:40.238500 139810192238464 config_util.py:552] Maybe overwriting use_bfloat16: False
INFO:tensorflow:Maybe overwriting eval_num_epochs: 1
I1113 16:36:40.238580 139810192238464 config_util.py:552] Maybe overwriting eval_num_epochs: 1
WARNING:tensorflow:Expected number of evaluation epochs is 1, but instead encountered eval_on_train_input_config.num_epochs = 0. Overwriting num_epochs to 1.
W1113 16:36:40.238686 139810192238464 model_lib_v2.py:1110] Expected number of evaluation epochs is 1, but instead encountered eval_on_train_input_config.num_epochs = 0. Overwriting num_epochs to 1.
2022-11-13 16:36:41.088505: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:42] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
I1113 16:36:41.109091 139810192238464 ssd_efficientnet_bifpn_feature_extractor.py:146] EfficientDet EfficientNet backbone version: efficientnet-b0
I1113 16:36:41.109263 139810192238464 ssd_efficientnet_bifpn_feature_extractor.py:147] EfficientDet BiFPN num filters: 64
I1113 16:36:41.109331 139810192238464 ssd_efficientnet_bifpn_feature_extractor.py:149] EfficientDet BiFPN num iterations: 3
I1113 16:36:41.112856 139810192238464 efficientnet_model.py:143] round_filter input=32 output=32
I1113 16:36:41.145924 139810192238464 efficientnet_model.py:143] round_filter input=32 output=32
I1113 16:36:41.146051 139810192238464 efficientnet_model.py:143] round_filter input=16 output=16
I1113 16:36:41.218508 139810192238464 efficientnet_model.py:143] round_filter input=16 output=16
I1113 16:36:41.218694 139810192238464 efficientnet_model.py:143] round_filter input=24 output=24
I1113 16:36:41.404295 139810192238464 efficientnet_model.py:143] round_filter input=24 output=24
I1113 16:36:41.404441 139810192238464 efficientnet_model.py:143] round_filter input=40 output=40
I1113 16:36:41.577770 139810192238464 efficientnet_model.py:143] round_filter input=40 output=40
I1113 16:36:41.577946 139810192238464 efficientnet_model.py:143] round_filter input=80 output=80
I1113 16:36:41.833776 139810192238464 efficientnet_model.py:143] round_filter input=80 output=80
I1113 16:36:41.833942 139810192238464 efficientnet_model.py:143] round_filter input=112 output=112
I1113 16:36:42.104938 139810192238464 efficientnet_model.py:143] round_filter input=112 output=112
I1113 16:36:42.105093 139810192238464 efficientnet_model.py:143] round_filter input=192 output=192
I1113 16:36:42.436462 139810192238464 efficientnet_model.py:143] round_filter input=192 output=192
I1113 16:36:42.436625 139810192238464 efficientnet_model.py:143] round_filter input=320 output=320
I1113 16:36:42.518593 139810192238464 efficientnet_model.py:143] round_filter input=1280 output=1280
I1113 16:36:42.559586 139810192238464 efficientnet_model.py:453] Building model efficientnet with params ModelConfig(width_coefficient=1.0, depth_coefficient=1.0, resolution=224, dropout_rate=0.2, blocks=(BlockConfig(input_filters=32, output_filters=16, kernel_size=3, num_repeat=1, expand_ratio=1, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=16, output_filters=24, kernel_size=3, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=24, output_filters=40, kernel_size=5, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=40, output_filters=80, kernel_size=3, num_repeat=3, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=80, output_filters=112, kernel_size=5, num_repeat=3, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=112, output_filters=192, kernel_size=5, num_repeat=4, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=192, output_filters=320, kernel_size=3, num_repeat=1, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise')), stem_base_filters=32, top_base_filters=1280, activation='simple_swish', batch_norm='default', bn_momentum=0.99, bn_epsilon=0.001, weight_decay=5e-06, drop_connect_rate=0.2, depth_divisor=8, min_depth=None, use_se=True, input_channels=3, num_classes=1000, model_name='efficientnet', rescale_input=False, data_format='channels_last', dtype='float32')
INFO:tensorflow:Reading unweighted datasets: ['/content/gdrive/MyDrive/content/test/teams.tfrecord']
I1113 16:36:42.612493 139810192238464 dataset_builder.py:162] Reading unweighted datasets: ['/content/gdrive/MyDrive/content/test/teams.tfrecord']
INFO:tensorflow:Reading record datasets for input file: ['/content/gdrive/MyDrive/content/test/teams.tfrecord']
I1113 16:36:42.612948 139810192238464 dataset_builder.py:79] Reading record datasets for input file: ['/content/gdrive/MyDrive/content/test/teams.tfrecord']
INFO:tensorflow:Number of filenames to read: 1
I1113 16:36:42.613093 139810192238464 dataset_builder.py:80] Number of filenames to read: 1
WARNING:tensorflow:num_readers has been reduced to 1 to match input file shards.
W1113 16:36:42.613175 139810192238464 dataset_builder.py:87] num_readers has been reduced to 1 to match input file shards.
WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/object_detection/builders/dataset_builder.py:104: parallel_interleave (from tensorflow.python.data.experimental.ops.interleave_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.AUTOTUNE) instead. If sloppy execution is desired, use tf.data.Options.deterministic.
W1113 16:36:42.616040 139810192238464 deprecation.py:356] From /usr/local/lib/python3.7/dist-packages/object_detection/builders/dataset_builder.py:104: parallel_interleave (from tensorflow.python.data.experimental.ops.interleave_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.AUTOTUNE) instead. If sloppy execution is desired, use tf.data.Options.deterministic.
WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/object_detection/builders/dataset_builder.py:236: DatasetV1.map_with_legacy_function (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.map()
W1113 16:36:42.630358 139810192238464 deprecation.py:356] From /usr/local/lib/python3.7/dist-packages/object_detection/builders/dataset_builder.py:236: DatasetV1.map_with_legacy_function (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.map()
WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/util/dispatch.py:1176: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a tf.sparse.SparseTensor and use tf.sparse.to_dense instead.
W1113 16:36:46.364809 139810192238464 deprecation.py:356] From /usr/local/lib/python3.7/dist-packages/tensorflow/python/util/dispatch.py:1176: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a tf.sparse.SparseTensor and use tf.sparse.to_dense instead.
WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/util/dispatch.py:1176: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
W1113 16:36:47.720357 139810192238464 deprecation.py:356] From /usr/local/lib/python3.7/dist-packages/tensorflow/python/util/dispatch.py:1176: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
INFO:tensorflow:Waiting for new checkpoint at /content/gdrive/MyDrive/content/log_files_barca_bayern
I1113 16:36:50.155207 139810192238464 checkpoint_utils.py:142] Waiting for new checkpoint at /content/gdrive/MyDrive/content/log_files_barca_bayern
INFO:tensorflow:Found new checkpoint at /content/gdrive/MyDrive/content/log_files_barca_bayern/ckpt-11
I1113 16:36:51.766162 139810192238464 checkpoint_utils.py:151] Found new checkpoint at /content/gdrive/MyDrive/content/log_files_barca_bayern/ckpt-11
/usr/local/lib/python3.7/dist-packages/keras/backend.py:452: UserWarning: tf.keras.backend.set_learning_phase is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the training argument of the __call__ method of your layer or model.
"tf.keras.backend.set_learning_phase is deprecated and "
Traceback (most recent call last):
File "/content/gdrive/MyDrive/content/models/research/object_detection/model_main_tf2.py", line 114, in <module>
tf.compat.v1.app.run()
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/platform/app.py", line 36, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "/usr/local/lib/python3.7/dist-packages/absl/app.py", line 308, in run
_run_main(main, args)
File "/usr/local/lib/python3.7/dist-packages/absl/app.py", line 254, in _run_main
sys.exit(main(argv))
File "/content/gdrive/MyDrive/content/models/research/object_detection/model_main_tf2.py", line 89, in main
wait_interval=300, timeout=FLAGS.eval_timeout)
File "/usr/local/lib/python3.7/dist-packages/object_detection/model_lib_v2.py", line 1164, in eval_continuously
global_step=global_step,
File "/usr/local/lib/python3.7/dist-packages/object_detection/model_lib_v2.py", line 1009, in eager_eval_loop
for evaluator in evaluators:
TypeError: 'NoneType' object is not iterable
Here is my config file:
#SSD with EfficientNet-b0 + BiFPN feature extractor,
#shared box predictor and focal loss (a.k.a EfficientDet-d0).
#See EfficientDet, Tan et al, https://arxiv.org/abs/1911.09070
#See Lin et al, https://arxiv.org/abs/1708.02002
#Trained on COCO, initialized from an EfficientNet-b0 checkpoint.
#Train on TPU-8
model {
ssd {
inplace_batchnorm_update: true
freeze_batchnorm: false
num_classes: 5
add_background_class: false
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
use_matmul_gather: true
}
}
similarity_calculator {
iou_similarity {
}
}
encode_background_as_zeros: true
anchor_generator {
multiscale_anchor_generator {
min_level: 3
max_level: 7
anchor_scale: 4.0
aspect_ratios: [1.0, 2.0, 0.5]
scales_per_octave: 3
}
}
image_resizer {
keep_aspect_ratio_resizer {
min_dimension: 512
max_dimension: 512
pad_to_max_dimension: true
}
}
box_predictor {
weight_shared_convolutional_box_predictor {
depth: 64
class_prediction_bias_init: -4.6
conv_hyperparams {
force_use_bias: true
activation: SWISH
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
random_normal_initializer {
stddev: 0.01
mean: 0.0
}
}
batch_norm {
scale: true
decay: 0.99
epsilon: 0.001
}
}
num_layers_before_predictor: 3
kernel_size: 3
use_depthwise: true
}
}
feature_extractor {
type: 'ssd_efficientnet-b0_bifpn_keras'
bifpn {
min_level: 3
max_level: 7
num_iterations: 3
num_filters: 64
}
conv_hyperparams {
force_use_bias: true
activation: SWISH
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
scale: true,
decay: 0.99,
epsilon: 0.001,
}
}
}
loss {
classification_loss {
weighted_sigmoid_focal {
alpha: 0.25
gamma: 1.5
}
}
localization_loss {
weighted_smooth_l1 {
}
}
classification_weight: 1.0
localization_weight: 1.0
}
normalize_loss_by_num_matches: true
normalize_loc_loss_by_codesize: true
post_processing {
batch_non_max_suppression {
score_threshold: 1e-8
iou_threshold: 0.5
max_detections_per_class: 100
max_total_detections: 100
}
score_converter: SIGMOID
}
}
}
train_config: {
fine_tune_checkpoint: "/content/gdrive/MyDrive/content/models/research/deploy/efficientdet_d0_coco17_tpu-32/checkpoint/ckpt-0"
fine_tune_checkpoint_version: V2
fine_tune_checkpoint_type: "detection"
batch_size: 16
sync_replicas: true
startup_delay_steps: 0
replicas_to_aggregate: 8
use_bfloat16: true
num_steps: 8000
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
random_scale_crop_and_pad_to_square {
output_size: 512
scale_min: 0.1
scale_max: 2.0
}
}
optimizer {
momentum_optimizer: {
learning_rate: {
cosine_decay_learning_rate {
learning_rate_base: 8e-2
total_steps: 300000
warmup_learning_rate: .001
warmup_steps: 2500
}
}
momentum_optimizer_value: 0.9
}
use_moving_average: false
}
max_number_of_boxes: 100
unpad_groundtruth_tensors: false
}
train_input_reader: {
label_map_path: "/content/gdrive/MyDrive/content/train/teams_label_map.pbtxt"
tf_record_input_reader {
input_path: "/content/gdrive/MyDrive/content/train/teams.tfrecord"
}
}
eval_config: {
metrics_set: "coco_detection_metrics"
use_moving_averages: false
batch_size: 16;
}
eval_input_reader: {
label_map_path: "/content/gdrive/MyDrive/content/train/teams_label_map.pbtxt"
shuffle: false
num_epochs: 1
tf_record_input_reader {
input_path: "/content/gdrive/MyDrive/content/test/teams.tfrecord"
}
}

After some steps getting: ERROR:tensorflow:Model diverged with loss = NaN during training translation model

I'm working with the TensorFlow 1.15-gpu Object Detection API, using SSD_mobilenet_v2.
When I start training all layers on my custom dataset, I get this error after some steps. You can find the final part of the log file here:
620508 139680523892544 quantize.py:299] Skipping quant after FeatureExtractor/MobilenetV2/expanded_conv_13/depthwise/add_fold
INFO:tensorflow:Skipping quant after FeatureExtractor/MobilenetV2/expanded_conv_14/expand/add_fold
I0615 16:46:04.620657 139680523892544 quantize.py:299] Skipping quant after FeatureExtractor/MobilenetV2/expanded_conv_14/expand/add_fold
INFO:tensorflow:Skipping quant after FeatureExtractor/MobilenetV2/expanded_conv_14/depthwise/add_fold
I0615 16:46:04.620755 139680523892544 quantize.py:299] Skipping quant after FeatureExtractor/MobilenetV2/expanded_conv_14/depthwise/add_fold
INFO:tensorflow:Skipping quant after FeatureExtractor/MobilenetV2/expanded_conv_15/expand/add_fold
I0615 16:46:04.620906 139680523892544 quantize.py:299] Skipping quant after FeatureExtractor/MobilenetV2/expanded_conv_15/expand/add_fold
INFO:tensorflow:Skipping quant after FeatureExtractor/MobilenetV2/expanded_conv_15/depthwise/add_fold
I0615 16:46:04.621004 139680523892544 quantize.py:299] Skipping quant after FeatureExtractor/MobilenetV2/expanded_conv_15/depthwise/add_fold
INFO:tensorflow:Skipping quant after FeatureExtractor/MobilenetV2/expanded_conv_16/expand/add_fold
I0615 16:46:04.621154 139680523892544 quantize.py:299] Skipping quant after FeatureExtractor/MobilenetV2/expanded_conv_16/expand/add_fold
INFO:tensorflow:Skipping quant after FeatureExtractor/MobilenetV2/expanded_conv_16/depthwise/add_fold
I0615 16:46:04.621255 139680523892544 quantize.py:299] Skipping quant after FeatureExtractor/MobilenetV2/expanded_conv_16/depthwise/add_fold
INFO:tensorflow:Skipping quant after FeatureExtractor/MobilenetV2/Conv_1/add_fold
I0615 16:46:04.621405 139680523892544 quantize.py:299] Skipping quant after FeatureExtractor/MobilenetV2/Conv_1/add_fold
INFO:tensorflow:Skipping quant after FeatureExtractor/MobilenetV2/layer_19_1_Conv2d_2_1x1_256/add_fold
I0615 16:46:04.621502 139680523892544 quantize.py:299] Skipping quant after FeatureExtractor/MobilenetV2/layer_19_1_Conv2d_2_1x1_256/add_fold
INFO:tensorflow:Skipping quant after FeatureExtractor/MobilenetV2/layer_19_2_Conv2d_2_3x3_s2_512/add_fold
I0615 16:46:04.621596 139680523892544 quantize.py:299] Skipping quant after FeatureExtractor/MobilenetV2/layer_19_2_Conv2d_2_3x3_s2_512/add_fold
INFO:tensorflow:Skipping quant after FeatureExtractor/MobilenetV2/layer_19_1_Conv2d_3_1x1_128/add_fold
I0615 16:46:04.621691 139680523892544 quantize.py:299] Skipping quant after FeatureExtractor/MobilenetV2/layer_19_1_Conv2d_3_1x1_128/add_fold
INFO:tensorflow:Skipping quant after FeatureExtractor/MobilenetV2/layer_19_2_Conv2d_3_3x3_s2_256/add_fold
I0615 16:46:04.621783 139680523892544 quantize.py:299] Skipping quant after FeatureExtractor/MobilenetV2/layer_19_2_Conv2d_3_3x3_s2_256/add_fold
INFO:tensorflow:Skipping quant after FeatureExtractor/MobilenetV2/layer_19_1_Conv2d_4_1x1_128/add_fold
I0615 16:46:04.621879 139680523892544 quantize.py:299] Skipping quant after FeatureExtractor/MobilenetV2/layer_19_1_Conv2d_4_1x1_128/add_fold
INFO:tensorflow:Skipping quant after FeatureExtractor/MobilenetV2/layer_19_2_Conv2d_4_3x3_s2_256/add_fold
I0615 16:46:04.621973 139680523892544 quantize.py:299] Skipping quant after FeatureExtractor/MobilenetV2/layer_19_2_Conv2d_4_3x3_s2_256/add_fold
INFO:tensorflow:Skipping quant after FeatureExtractor/MobilenetV2/layer_19_1_Conv2d_5_1x1_64/add_fold
I0615 16:46:04.622072 139680523892544 quantize.py:299] Skipping quant after FeatureExtractor/MobilenetV2/layer_19_1_Conv2d_5_1x1_64/add_fold
INFO:tensorflow:Skipping quant after FeatureExtractor/MobilenetV2/layer_19_2_Conv2d_5_3x3_s2_128/add_fold
I0615 16:46:04.622165 139680523892544 quantize.py:299] Skipping quant after FeatureExtractor/MobilenetV2/layer_19_2_Conv2d_5_3x3_s2_128/add_fold
INFO:tensorflow:Done calling model_fn.
I0615 16:46:05.160237 139680523892544 estimator.py:1150] Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2022-06-15T16:46:05Z
I0615 16:46:05.170377 139680523892544 evaluation.py:255] Starting evaluation at 2022-06-15T16:46:05Z
INFO:tensorflow:Graph was finalized.
I0615 16:46:05.615129 139680523892544 monitored_session.py:240] Graph was finalized.
2022-06-15 16:46:05.615582: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-15 16:46:05.615788: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Found device 0 with properties:
name: NVIDIA GeForce RTX 2060 SUPER major: 7 minor: 5 memoryClockRate(GHz): 1.665
pciBusID: 0000:01:00.0
2022-06-15 16:46:05.615847: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2022-06-15 16:46:05.615860: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2022-06-15 16:46:05.615871: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10.0
2022-06-15 16:46:05.615882: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10.0
2022-06-15 16:46:05.615892: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10.0
2022-06-15 16:46:05.615902: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10.0
2022-06-15 16:46:05.615913: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2022-06-15 16:46:05.615952: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-15 16:46:05.616056: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-15 16:46:05.616133: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1767] Adding visible gpu devices: 0
2022-06-15 16:46:05.616151: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1180] Device interconnect StreamExecutor with strength 1 edge matrix:
2022-06-15 16:46:05.616156: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1186] 0
2022-06-15 16:46:05.616160: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1199] 0: N
2022-06-15 16:46:05.616206: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-15 16:46:05.616309: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-15 16:46:05.616392: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1325] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 7498 MB memory) -> physical GPU (device: 0, name: NVIDIA GeForce RTX 2060 SUPER, pci bus id: 0000:01:00.0, compute capability: 7.5)
INFO:tensorflow:Restoring parameters from /tensorflow/models/research/learn_coco/out/trainings/fold_0/train_1_714012/train/model.ckpt-46671
I0615 16:46:05.617070 139680523892544 saver.py:1284] Restoring parameters from /tensorflow/models/research/learn_coco/out/trainings/fold_0/train_1_714012/train/model.ckpt-46671
INFO:tensorflow:Running local_init_op.
I0615 16:46:06.635428 139680523892544 session_manager.py:500] Running local_init_op.
INFO:tensorflow:Done running local_init_op.
I0615 16:46:06.747072 139680523892544 session_manager.py:502] Done running local_init_op.
INFO:tensorflow:Loading and preparing annotation results...
I0615 16:48:20.656049 139676933404416 coco_tools.py:109] Loading and preparing annotation results...
INFO:tensorflow:DONE (t=0.13s)
I0615 16:48:20.790490 139676933404416 coco_tools.py:131] DONE (t=0.13s)
INFO:tensorflow:Finished evaluation at 2022-06-15-16:48:28
I0615 16:48:28.939589 139680523892544 evaluation.py:275] Finished evaluation at 2022-06-15-16:48:28
INFO:tensorflow:Saving dict for global step 46671: DetectionBoxes_Precision/mAP = 1.0515627e-05, DetectionBoxes_Precision/mAP (large) = 0.0, DetectionBoxes_Precision/mAP (medium) = 2.1307987e-05, DetectionBoxes_Precision/mAP (small) = 0.0, DetectionBoxes_Precision/mAP#.50IOU = 4.04426e-05, DetectionBoxes_Precision/mAP#.75IOU = 6.5893556e-07, DetectionBoxes_Recall/AR#1 = 0.0, DetectionBoxes_Recall/AR#10 = 0.0020118733, DetectionBoxes_Recall/AR#100 = 0.026451187, DetectionBoxes_Recall/AR#100 (large) = 0.0, DetectionBoxes_Recall/AR#100 (medium) = 0.05332447, DetectionBoxes_Recall/AR#100 (small) = 0.0, Loss/classification_loss = 144.96577, Loss/localization_loss = 1.1134313, Loss/regularization_loss = 2085.086, Loss/total_loss = 2231.236, global_step = 46671, learning_rate = 0.00236426, loss = 2231.236
I0615 16:48:28.939769 139680523892544 estimator.py:2049] Saving dict for global step 46671: DetectionBoxes_Precision/mAP = 1.0515627e-05, DetectionBoxes_Precision/mAP (large) = 0.0, DetectionBoxes_Precision/mAP (medium) = 2.1307987e-05, DetectionBoxes_Precision/mAP (small) = 0.0, DetectionBoxes_Precision/mAP#.50IOU = 4.04426e-05, DetectionBoxes_Precision/mAP#.75IOU = 6.5893556e-07, DetectionBoxes_Recall/AR#1 = 0.0, DetectionBoxes_Recall/AR#10 = 0.0020118733, DetectionBoxes_Recall/AR#100 = 0.026451187, DetectionBoxes_Recall/AR#100 (large) = 0.0, DetectionBoxes_Recall/AR#100 (medium) = 0.05332447, DetectionBoxes_Recall/AR#100 (small) = 0.0, Loss/classification_loss = 144.96577, Loss/localization_loss = 1.1134313, Loss/regularization_loss = 2085.086, Loss/total_loss = 2231.236, global_step = 46671, learning_rate = 0.00236426, loss = 2231.236
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 46671: /tensorflow/models/research/learn_coco/out/trainings/fold_0/train_1_714012/train/model.ckpt-46671
I0615 16:48:28.945906 139680523892544 estimator.py:2109] Saving 'checkpoint_path' summary for global step 46671: /tensorflow/models/research/learn_coco/out/trainings/fold_0/train_1_714012/train/model.ckpt-46671
INFO:tensorflow:global_step/sec: 0.598679
I0615 16:48:34.531725 139680523892544 basic_session_run_hooks.py:692] global_step/sec: 0.598679
INFO:tensorflow:loss = 2104.397, step = 46700 (167.035 sec)
I0615 16:48:34.532555 139680523892544 basic_session_run_hooks.py:260] loss = 2104.397, step = 46700 (167.035 sec)
INFO:tensorflow:global_step/sec: 5.39949
I0615 16:48:53.052003 139680523892544 basic_session_run_hooks.py:692] global_step/sec: 5.39949
INFO:tensorflow:loss = 2089.7336, step = 46800 (18.520 sec)
I0615 16:48:53.052544 139680523892544 basic_session_run_hooks.py:260] loss = 2089.7336, step = 46800 (18.520 sec)
INFO:tensorflow:global_step/sec: 5.39205
I0615 16:49:11.597810 139680523892544 basic_session_run_hooks.py:692] global_step/sec: 5.39205
INFO:tensorflow:loss = 2575.289, step = 46900 (18.546 sec)
I0615 16:49:11.598227 139680523892544 basic_session_run_hooks.py:260] loss = 2575.289, step = 46900 (18.546 sec)
INFO:tensorflow:global_step/sec: 5.3781
I0615 16:49:30.191725 139680523892544 basic_session_run_hooks.py:692] global_step/sec: 5.3781
INFO:tensorflow:loss = 2085.609, step = 47000 (18.594 sec)
I0615 16:49:30.192180 139680523892544 basic_session_run_hooks.py:260] loss = 2085.609, step = 47000 (18.594 sec)
INFO:tensorflow:global_step/sec: 5.3868
I0615 16:49:48.755650 139680523892544 basic_session_run_hooks.py:692] global_step/sec: 5.3868
INFO:tensorflow:loss = 2086.8247, step = 47100 (18.564 sec)
I0615 16:49:48.756281 139680523892544 basic_session_run_hooks.py:260] loss = 2086.8247, step = 47100 (18.564 sec)
INFO:tensorflow:global_step/sec: 5.36392
I0615 16:50:07.398717 139680523892544 basic_session_run_hooks.py:692] global_step/sec: 5.36392
INFO:tensorflow:loss = 2085.7932, step = 47200 (18.643 sec)
I0615 16:50:07.399319 139680523892544 basic_session_run_hooks.py:260] loss = 2085.7932, step = 47200 (18.643 sec)
INFO:tensorflow:global_step/sec: 5.37691
I0615 16:50:25.996762 139680523892544 basic_session_run_hooks.py:692] global_step/sec: 5.37691
INFO:tensorflow:loss = 2084.9392, step = 47300 (18.598 sec)
I0615 16:50:25.997260 139680523892544 basic_session_run_hooks.py:260] loss = 2084.9392, step = 47300 (18.598 sec)
INFO:tensorflow:global_step/sec: 5.36369
I0615 16:50:44.640652 139680523892544 basic_session_run_hooks.py:692] global_step/sec: 5.36369
INFO:tensorflow:loss = 2084.0159, step = 47400 (18.644 sec)
I0615 16:50:44.641378 139680523892544 basic_session_run_hooks.py:260] loss = 2084.0159, step = 47400 (18.644 sec)
INFO:tensorflow:global_step/sec: 5.37861
I0615 16:51:03.232832 139680523892544 basic_session_run_hooks.py:692] global_step/sec: 5.37861
INFO:tensorflow:loss = 2084.3396, step = 47500 (18.592 sec)
I0615 16:51:03.233308 139680523892544 basic_session_run_hooks.py:260] loss = 2084.3396, step = 47500 (18.592 sec)
INFO:tensorflow:global_step/sec: 5.37582
I0615 16:51:21.834661 139680523892544 basic_session_run_hooks.py:692] global_step/sec: 5.37582
INFO:tensorflow:loss = 2083.8713, step = 47600 (18.602 sec)
I0615 16:51:21.835240 139680523892544 basic_session_run_hooks.py:260] loss = 2083.8713, step = 47600 (18.602 sec)
INFO:tensorflow:global_step/sec: 5.36692
I0615 16:51:40.467312 139680523892544 basic_session_run_hooks.py:692] global_step/sec: 5.36692
INFO:tensorflow:loss = 2083.82, step = 47700 (18.633 sec)
I0615 16:51:40.467999 139680523892544 basic_session_run_hooks.py:260] loss = 2083.82, step = 47700 (18.633 sec)
INFO:tensorflow:global_step/sec: 5.37798
I0615 16:51:59.061645 139680523892544 basic_session_run_hooks.py:692] global_step/sec: 5.37798
INFO:tensorflow:loss = 2083.5405, step = 47800 (18.594 sec)
I0615 16:51:59.062341 139680523892544 basic_session_run_hooks.py:260] loss = 2083.5405, step = 47800 (18.594 sec)
INFO:tensorflow:global_step/sec: 5.3654
I0615 16:52:17.699595 139680523892544 basic_session_run_hooks.py:692] global_step/sec: 5.3654
INFO:tensorflow:loss = 2083.3599, step = 47900 (18.638 sec)
I0615 16:52:17.700082 139680523892544 basic_session_run_hooks.py:260] loss = 2083.3599, step = 47900 (18.638 sec)
INFO:tensorflow:global_step/sec: 5.34293
I0615 16:52:36.416148 139680523892544 basic_session_run_hooks.py:692] global_step/sec: 5.34293
INFO:tensorflow:loss = 2104.6877, step = 48000 (18.718 sec)
I0615 16:52:36.418108 139680523892544 basic_session_run_hooks.py:260] loss = 2104.6877, step = 48000 (18.718 sec)
ERROR:tensorflow:Model diverged with loss = NaN.
E0615 16:52:37.836951 139680523892544 basic_session_run_hooks.py:760] Model diverged with loss = NaN.
Average Precision (AP) #[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.50 | area= all | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.75 | area= all | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
creating index...
index created!
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=4.36s).
Accumulating evaluation results...
DONE (t=2.93s).
Average Precision (AP) #[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.50 | area= all | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.75 | area= all | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.001
Average Recall (AR) #[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.001
Average Recall (AR) #[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
creating index...
index created!
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=12.10s).
Accumulating evaluation results...
DONE (t=2.87s).
Average Precision (AP) #[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.50 | area= all | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.75 | area= all | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
creating index...
index created!
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=4.71s).
Accumulating evaluation results...
DONE (t=3.09s).
Average Precision (AP) #[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.50 | area= all | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.75 | area= all | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000
Average Precision (AP) #[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.002
Average Recall (AR) #[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.026
Average Recall (AR) #[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
Average Recall (AR) #[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.053
Average Recall (AR) #[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
Traceback (most recent call last):
File "./object_detection/model_main.py", line 114, in <module>
tf.app.run()
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/platform/app.py", line 40, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 303, in run
_run_main(main, args)
File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 251, in _run_main
sys.exit(main(argv))
File "./object_detection/model_main.py", line 110, in main
tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])
File "/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/training.py", line 473, in train_and_evaluate
return executor.run()
File "/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/training.py", line 613, in run
return self.run_local()
File "/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/training.py", line 714, in run_local
saving_listeners=saving_listeners)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 370, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1161, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1195, in _train_model_default
saving_listeners)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1494, in _train_with_estimator_spec
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 754, in run
run_metadata=run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1259, in run
run_metadata=run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1360, in run
raise six.reraise(*original_exc_info)
File "/usr/lib/python3/dist-packages/six.py", line 693, in reraise
raise value
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1345, in run
return self._sess.run(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1426, in run
run_metadata=run_metadata))
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/basic_session_run_hooks.py", line 761, in after_run
raise NanLossDuringTrainingError
tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError: NaN loss during training.
And here is my pipeline file. What's wrong? If I train only a few layers with the same data, it does not crash:
model {
ssd {
num_classes: 4
image_resizer {
fixed_shape_resizer {
height: 300
width: 300
}
}
feature_extractor {
type: "ssd_mobilenet_v2"
depth_multiplier: 1.0
min_depth: 16
conv_hyperparams {
regularizer {
l2_regularizer {
weight: 3.99999989895e-05
}
}
initializer {
random_normal_initializer {
mean: 0.0
stddev: 0.00999999977648
}
}
activation: RELU_6
batch_norm {
decay: 0.97000002861
center: true
scale: true
epsilon: 0.0010000000475
}
}
override_base_feature_extractor_hyperparams: true
}
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
use_matmul_gather: true
}
}
similarity_calculator {
iou_similarity {
}
}
box_predictor {
convolutional_box_predictor {
conv_hyperparams {
regularizer {
l2_regularizer {
weight: 3.99999989895e-05
}
}
initializer {
random_normal_initializer {
mean: 0.0
stddev: 0.00999999977648
}
}
activation: RELU_6
batch_norm {
decay: 0.97000002861
center: true
scale: true
epsilon: 0.0010000000475
}
}
min_depth: 0
max_depth: 0
num_layers_before_predictor: 0
use_dropout: false
dropout_keep_probability: 0.800000011921
kernel_size: 1
box_code_size: 4
apply_sigmoid_to_scores: false
class_prediction_bias_init: -4.59999990463
}
}
anchor_generator {
ssd_anchor_generator {
num_layers: 6
min_scale: 0.20000000298
max_scale: 0.949999988079
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
aspect_ratios: 3.0
aspect_ratios: 0.333299994469
}
}
post_processing {
batch_non_max_suppression {
score_threshold: 0.300000011921
iou_threshold: 0.600000023842
max_detections_per_class: 100
max_total_detections: 100
}
score_converter: SIGMOID
}
normalize_loss_by_num_matches: true
loss {
localization_loss {
weighted_smooth_l1 {
}
}
classification_loss {
weighted_sigmoid_focal {
gamma: 2.0
alpha: 0.75
}
}
classification_weight: 1.0
localization_weight: 1.0
}
encode_background_as_zeros: true
normalize_loc_loss_by_codesize: true
inplace_batchnorm_update: true
freeze_batchnorm: false
}
}
train_config {
batch_size: 8
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
ssd_random_crop {
}
}
sync_replicas: true
optimizer {
momentum_optimizer {
learning_rate {
cosine_decay_learning_rate {
learning_rate_base: 0.20000000298
total_steps: 50000
warmup_learning_rate: 0.0599999986589
warmup_steps: 2000
}
}
momentum_optimizer_value: 0.899999976158
}
use_moving_average: false
}
fine_tune_checkpoint: "CKPT_DIR_TO_CONFIGURE/model.ckpt"
from_detection_checkpoint: true
load_all_detection_checkpoint_vars: true
num_steps: 50000
startup_delay_steps: 0.0
replicas_to_aggregate: 8
max_number_of_boxes: 100
unpad_groundtruth_tensors: false
}
train_input_reader {
label_map_path: "DATASET_DIR_TO_CONFIGURE/label_map.pbtxt"
tf_record_input_reader {
input_path: "DATASET_DIR_TO_CONFIGURE/train.record"
}
}
eval_config {
num_examples: 8000
metrics_set: "coco_detection_metrics"
use_moving_averages: false
}
eval_input_reader {
label_map_path: "DATASET_DIR_TO_CONFIGURE/label_map.pbtxt"
shuffle: false
num_readers: 1
tf_record_input_reader {
input_path: "DATASET_DIR_TO_CONFIGURE/valid.record"
}
}
graph_rewriter {
quantization {
delay: 48000
weight_bits: 8
activation_bits: 8
}
}

model_main.py faster-rcnn CUDA_ERROR_OUT_OF_MEMORY

Description:
I am able to train a Faster R-CNN model with legacy/train.py, but I run into the problem below when I try to train with model_main.py using the same config settings.
Image resolution: 1920x1080
tensorflow/stream_executor/cuda/cuda_driver.cc:890] failed to alloc 8589934592 bytes on host: CUDA_ERROR_OUT_OF_MEMORY: out of memory
.\tensorflow/core/common_runtime/gpu/gpu_host_allocator.h:44] could not allocate pinned host memory of size: 8589934592
tensorflow/core/common_runtime/bfc_allocator.cc:764] Bin (256): Total Chunks: 4753, Chunks in use: 4753. 1.16MiB allocated for chunks. 1.16MiB in use in bin. 144.3KiB client-requested in use in bin.
tensorflow/core/common_runtime/bfc_allocator.cc:800] InUse at 0000000203800000 next 1 of size 256
What I have tried:
Set batch size to 1
use memory growing
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
or
session_config = tf.ConfigProto()
session_config.gpu_options.allow_growth = True
config = tf.estimator.RunConfig(model_dir=FLAGS.model_dir, session_config=session_config, log_step_count_steps=10, save_summary_steps=20, keep_checkpoint_max=20, save_checkpoints_steps=100)
don't allocate whole of your GPU memory
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.6
session = tf.Session(config=config)
or
session_config = tf.ConfigProto()
session_config.gpu_options.per_process_gpu_memory_fraction = 0.6
config = tf.estimator.RunConfig(model_dir=FLAGS.model_dir,
session_config=session_config, log_step_count_steps=10,
save_summary_steps=20, keep_checkpoint_max=20,
save_checkpoints_steps=100)
TensorFlow CUDA_ERROR_OUT_OF_MEMORY
Setting of queue_capacity, min_after_dequeue, num_readers, batch_queue_capacity, num_batch_queue_threads, prefetch_queue_capacity
Out Of Memory when training on Big Images
reduce min_dimension, max_dimension to 270, 480
None of these work for me.
Environment:
OS Platform and Distribution: Win 10 pro version: 1909
TensorFlow installed from: pip tensorflow-gpu
TensorFlow version 1.14
object-detection: 0.1 CUDA/cuDNN version: Cuda 10.0, Cudnn 10.0
GPU model and memory: NVIDIA GeForce RTX 2070 SUPER, Memory 8 G
system memory: 32G
My config:
# Faster R-CNN with Inception v2, configured for Oxford-IIIT Pets Dataset.
# Users should configure the fine_tune_checkpoint field in the train config as
# well as the label_map_path and input_path fields in the train_input_reader and
# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that
# should be configured.
model {
faster_rcnn {
num_classes: 2
image_resizer {
keep_aspect_ratio_resizer {
min_dimension: 1080
max_dimension: 1920
}
}
feature_extractor {
type: 'faster_rcnn_inception_v2'
first_stage_features_stride: 16
}
first_stage_anchor_generator {
grid_anchor_generator {
scales: [0.25, 0.5, 1.0, 2.0]
aspect_ratios: [0.5, 1.0, 2.0]
height_stride: 16
width_stride: 16
}
}
first_stage_box_predictor_conv_hyperparams {
op: CONV
regularizer {
l2_regularizer {
weight: 0.0
}
}
initializer {
truncated_normal_initializer {
stddev: 0.01
}
}
}
first_stage_nms_score_threshold: 0.0
first_stage_nms_iou_threshold: 0.7
first_stage_max_proposals: 300
first_stage_localization_loss_weight: 2.0
first_stage_objectness_loss_weight: 1.0
initial_crop_size: 14
maxpool_kernel_size: 2
maxpool_stride: 2
second_stage_box_predictor {
mask_rcnn_box_predictor {
use_dropout: false
dropout_keep_probability: 1.0
fc_hyperparams {
op: FC
regularizer {
l2_regularizer {
weight: 0.0
}
}
initializer {
variance_scaling_initializer {
factor: 1.0
uniform: true
mode: FAN_AVG
}
}
}
}
}
second_stage_post_processing {
batch_non_max_suppression {
score_threshold: 0.0
iou_threshold: 0.6
max_detections_per_class: 100
max_total_detections: 300
}
score_converter: SOFTMAX
}
second_stage_localization_loss_weight: 2.0
second_stage_classification_loss_weight: 1.0
}
}
train_config: {
batch_size: 1
optimizer {
momentum_optimizer: {
learning_rate: {
manual_step_learning_rate {
initial_learning_rate: 0.0002
schedule {
step: 900000
learning_rate: .00002
}
schedule {
step: 1200000
learning_rate: .000002
}
}
}
momentum_optimizer_value: 0.9
}
use_moving_average: false
}
gradient_clipping_by_norm: 10.0
fine_tune_checkpoint: ""
from_detection_checkpoint: true
load_all_detection_checkpoint_vars: true
# Note: The below line limits the training process to 200K steps, which we
# empirically found to be sufficient enough to train the pets dataset. This
# effectively bypasses the learning rate schedule (the learning rate will
# never decay). Remove the below line to train indefinitely.
num_steps: 200000
data_augmentation_options {
random_horizontal_flip {
}
}
batch_queue_capacity: 60
num_batch_queue_threads: 30
prefetch_queue_capacity: 40
}
train_input_reader: {
tf_record_input_reader {
input_path: "D:\\object_detection\\train_data\\train.record"
}
label_map_path: "D:\\object_detection\\pascal_label_map.pbtxt"
queue_capacity: 2
min_after_dequeue: 1
num_readers: 1
}
eval_config: {
metrics_set: "coco_detection_metrics"
num_examples: 1101
}
eval_input_reader: {
tf_record_input_reader {
input_path: "D:\\object_detection\\eval_data\\eval.record"
}
label_map_path: "D:\\object_detection\\pascal_label_map.pbtxt"
shuffle: false
num_readers: 1
}
If there are other solutions, I will be very grateful to you.
Object detection models consume a lot of memory. This is because of how they work and the large number of anchors that they generate to find the boxes.
You are doing everything right, but your GPU does not have enough memory to train this kind of model.
Things you can do:
Reduce the image size, to something like 720x512.
Use SGD as the optimizer instead of other optimizers such as Adam. SGD consumes approximately 3 times less memory than Adam.
It is also worth mentioning that you are doing well with small batches of 1 instance. If I am not wrong, FasterRCNN is trained with only 2 images per batch.
I just found that if I set batch_size to 3, training works normally. When I set batch_size back to 1, it runs into the OOM problem again.
It is weird and I still don't know why, since a lower batch size should always use less memory.
If you encounter the same situation, you can try increasing the batch size slightly, but I cannot guarantee it will work.

Why does Tensorflow multiclass-image-prediction not work when model is loaded?

I am currently trying to learn machine learning techniques and wanted to recreate a simple image recognition algorithm with tensorflow. Therefore I made two Python-files: One for training and one for prediction.
Tested on Ubuntu 18.04
Used Python Version: 3.7
Used Numpy Version: 1.18.1
Used Tensorflow Version: 1.14 and 2.1.0 (outputs below are from Version 1.14)
My images are from http://www.cs.columbia.edu/CAVE/databases/pubfig/download/#dev
The set consists of about 3000 images of cropped faces from 60 people.
train_model.py:
#!/usr/bin/env python
import concurrent.futures
import pandas as pd
import urllib
import pathlib
import hashlib
import os
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
people = pd.read_csv("dev_people.txt")
image_generator = ImageDataGenerator(rescale=1./255, validation_split=0.2, rotation_range=45, zoom_range=0.2)
IMG_HEIGHT = 128
IMG_WIDTH = 128
LEARNING_RATE = 0.0001
BATCH_SIZE = 32
NUM_TRAIN = 100
STEPS_PER_EPOCH = round(NUM_TRAIN) // BATCH_SIZE
VAL_STEPS = 20
NUM_EPOCHS = 3
train_data = image_generator.flow_from_directory(batch_size=BATCH_SIZE,
directory="persons-cropped",
shuffle=True,
target_size=(IMG_HEIGHT, IMG_WIDTH),
class_mode="categorical",
subset="training")
labels = train_data.class_indices
labels = {v: k for k, v in labels.items()}
with open("labels.json", "w") as labels_file:
labels_file.write(json.dumps(labels))
validation_data = image_generator.flow_from_directory(batch_size=BATCH_SIZE,
directory="persons-cropped",
shuffle=True,
target_size=(IMG_HEIGHT, IMG_WIDTH),
class_mode="categorical",
subset="validation")
base_model = tf.keras.applications.MobileNetV2(
input_shape=(IMG_WIDTH, IMG_HEIGHT, 3),
include_top=False,
weights="imagenet"
)
base_model.trainable = False
maxpool_layer = tf.keras.layers.GlobalMaxPooling2D()
prediction_layer = tf.keras.layers.Dense(60, activation="sigmoid")
dropout_layer = tf.keras.layers.Dropout(0.2)
model = tf.keras.Sequential([
base_model,
maxpool_layer,
# dropout_layer,
prediction_layer
])
model.compile(optimizer=tf.keras.optimizers.Adam(lr=LEARNING_RATE),
loss="binary_crossentropy",
metrics=["accuracy"]
)
model.summary()
model.fit(
train_data,
epochs=NUM_EPOCHS,
steps_per_epoch=None,
validation_data=validation_data,
validation_steps=None,
use_multiprocessing=False,
workers=6,
verbose=2
)
model.save("model.h5")
Output:
Found 2431 images belonging to 60 classes.
Found 573 images belonging to 60 classes.
WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
2020-01-25 22:23:40.036326: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2599985000 Hz
2020-01-25 22:23:40.036657: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x6b81c60 executing computations on platform Host. Devices:
2020-01-25 22:23:40.036789: I tensorflow/compiler/xla/service/service.cc:175] StreamExecutor device (0): <undefined>, <undefined>
2020-01-25 22:23:40.615771: W tensorflow/compiler/jit/mark_for_compilation_pass.cc:1412] (One-time warning): Not using XLA:CPU for cluster because envvar TF_XLA_FLAGS=--tf_xla_cpu_global_jit was not set. If you want XLA:CPU, either set that envvar, or use experimental_jit_scope to enable XLA:CPU. To confirm that XLA is active, pass --vmodule=xla_compilation_cache=1 (as a proper command-line flag, not via TF_XLA_FLAGS) or set the envvar XLA_FLAGS=--xla_hlo_profile.
WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
mobilenetv2_1.00_224 (Model) (None, 7, 7, 1280) 2257984
_________________________________________________________________
global_max_pooling2d (Global (None, 1280) 0
_________________________________________________________________
dense (Dense) (None, 60) 76860
=================================================================
Total params: 2,334,844
Trainable params: 76,860
Non-trainable params: 2,257,984
_________________________________________________________________
Epoch 1/3
2020-01-25 22:23:55.995833: W tensorflow/core/framework/allocator.cc:107] Allocation of 154140672 exceeds 10% of system memory.
2020-01-25 22:23:56.730363: W tensorflow/core/framework/allocator.cc:107] Allocation of 156905472 exceeds 10% of system memory.
2020-01-25 22:24:02.782372: W tensorflow/core/framework/allocator.cc:107] Allocation of 154140672 exceeds 10% of system memory.
2020-01-25 22:24:03.531172: W tensorflow/core/framework/allocator.cc:107] Allocation of 156905472 exceeds 10% of system memory.
2020-01-25 22:24:09.474692: W tensorflow/core/framework/allocator.cc:107] Allocation of 154140672 exceeds 10% of system memory.
/usr/local/lib/python3.7/dist-packages/PIL/TiffImagePlugin.py:788: UserWarning: Corrupt EXIF data. Expecting to read 4 bytes but only got 0.
warnings.warn(str(msg))
76/76 - 602s - loss: 0.3851 - acc: 0.9097 - val_loss: 0.1812 - val_acc: 0.9495
Epoch 2/3
76/76 - 616s - loss: 0.1480 - acc: 0.9757 - val_loss: 0.1732 - val_acc: 0.9544
Epoch 3/3
76/76 - 627s - loss: 0.1452 - acc: 0.9760 - val_loss: 0.1767 - val_acc: 0.9516
It says the model has an accuracy score of about 95 % which is very good and should be no problem for predicting images now. However, this is the prediction file:
predict_image.py
#!/usr/bin/env python
import concurrent.futures
import pandas as pd
import numpy as np
import urllib
import pathlib
import hashlib
import os
import sys
import cv2
import json
import tensorflow as tf
import PIL
import skimage
from tensorflow.keras.preprocessing.image import ImageDataGenerator
IMG_HEIGHT = 128
IMG_WIDTH = 128
def load_image(filename):
img = tf.keras.preprocessing.image.load_img(filename, target_size=(IMG_WIDTH,IMG_HEIGHT))
img = tf.keras.preprocessing.image.img_to_array(img)
img = np.expand_dims(img, axis=0) / 255
return img
from glob import glob
class_names = glob("persons-cropped/*")
class_names = sorted(class_names)
labels_file = open("labels.json", "r")
labels = json.loads(labels_file.read())
print(labels)
model = tf.keras.models.load_model("model.h5")
model.summary()
img = load_image(sys.argv[1])
predictions = model.predict(img, verbose=1)
prediction = predictions.argmax(axis=-1)
print(predictions)
print(prediction)
map_labels = np.vectorize(lambda i: labels[str(i)])
print(map_labels(prediction))
Output: When using a Zach Braff image:
{'0': 'Abhishek Bachan', '1': 'Alex Rodriguez', '2': 'Ali Landry', '3': 'Alyssa Milano', '4': 'Anderson Cooper', '5': 'Anna Paquin', '6': 'Audrey Tautou', '7': 'Barack Obama', '8': 'Ben Stiller', '9': 'Christina Ricci', '10': 'Clive Owen', '11': 'Cristiano Ronaldo', '12': 'Daniel Craig', '13': 'Danny Devito', '14': 'David Duchovny', '15': 'Denise Richards', '16': 'Diane Sawyer', '17': 'Donald Faison', '18': 'Ehud Olmert', '19': 'Faith Hill', '20': 'Famke Janssen', '21': 'Hugh Jackman', '22': 'Hugh Laurie', '23': 'James Spader', '24': 'Jared Leto', '25': 'Julia Roberts', '26': 'Julia Stiles', '27': 'Karl Rove', '28': 'Katherine Heigl', '29': 'Kevin Bacon', '30': 'Kiefer Sutherland', '31': 'Kim Basinger', '32': 'Mark Ruffalo', '33': 'Meg Ryan', '34': 'Michelle Trachtenberg', '35': 'Michelle Wie', '36': 'Mickey Rourke', '37': 'Miley Cyrus', '38': 'Milla Jovovich', '39': 'Nicole Richie', '40': 'Rachael Ray', '41': 'Robert Gates', '42': 'Ryan Seacrest', '43': 'Sania Mirza', '44': 'Sarah Chalke', '45': 'Sarah Palin', '46': 'Scarlett Johansson', '47': 'Seth Rogen', '48': 'Shahrukh Khan', '49': 'Shakira', '50': 'Stephen Colbert', '51': 'Stephen Fry', '52': 'Steve Carell', '53': 'Steve Martin', '54': 'Tracy Morgan', '55': 'Ty Pennington', '56': 'Viggo Mortensen', '57': 'Wilmer Valderrama', '58': 'Zac Efron', '59': 'Zach Braff'}
2020-01-25 22:58:05.582049: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2599985000 Hz
2020-01-25 22:58:05.582514: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x618f910 executing computations on platform Host. Devices:
2020-01-25 22:58:05.582653: I tensorflow/compiler/xla/service/service.cc:175] StreamExecutor device (0): <undefined>, <undefined>
2020-01-25 22:58:06.454565: W tensorflow/compiler/jit/mark_for_compilation_pass.cc:1412] (One-time warning): Not using XLA:CPU for cluster because envvar TF_XLA_FLAGS=--tf_xla_cpu_global_jit was not set. If you want XLA:CPU, either set that envvar, or use experimental_jit_scope to enable XLA:CPU. To confirm that XLA is active, pass --vmodule=xla_compilation_cache=1 (as a proper command-line flag, not via TF_XLA_FLAGS) or set the envvar XLA_FLAGS=--xla_hlo_profile.
1/1 [==============================] - 1s 999ms/sample
[[3.23683023e-04 6.47217035e-04 3.90201807e-04 2.69789696e-02
2.17908323e-02 1.53781831e-01 4.79090214e-03 8.64863396e-04
1.11432403e-01 8.87395382e-01 3.30170989e-03 2.17252970e-03
1.78458661e-01 1.09243691e-02 1.47551298e-04 2.62927115e-02
3.22049320e-01 2.69562006e-04 9.11523938e-01 2.44581699e-03
7.65213370e-03 2.90286541e-03 1.01376325e-01 6.43432140e-05
4.43832874e-02 3.94093990e-03 6.90050423e-02 7.47233629e-04
1.05589628e-03 8.04662704e-07 3.76045704e-03 4.28827941e-01
1.20029151e-02 1.77664489e-01 5.27173281e-04 2.45797634e-03
5.89579344e-03 9.46103930e-01 2.79089808e-03 2.09265649e-02
2.83238888e-02 4.86207008e-03 8.15459788e-02 1.30202770e-02
1.50602162e-02 1.33922696e-03 1.24056339e-02 5.76970875e-02
2.65627503e-02 5.18084109e-01 4.89562750e-04 3.15269828e-03
4.88847494e-04 2.13665128e-01 1.40489936e-02 2.93705761e-02
5.01989722e-02 1.21492555e-03 1.62564263e-01 2.91267484e-01]]
[37]
['Miley Cyrus']
The prediction algorithm is wrong all the time. If I use other Zach Braff images, the output stays the same for the same picture of course, but in 10 test cases it was never Zach Braff but always a different person. (Not only Miley Cyrus, but also Shakira, Steve Carell, ...)
This pattern appears for any class I use as input here.
I did not find any helpful advice on the internet and tried pretty much every parameter combination I could imagine might work. I also used two versions of Tensorflow and made sure that all libraries are up to date.
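Hey, I believe you are getting strange predictions because your data has 60 classes of people while your model is compiled with binary crossentropy as its loss function.
Binary crossentropy is meant for two-class (or multi-label) problems, where every output unit is scored independently. With that loss, Keras also resolves "accuracy" to per-output binary accuracy, which is why the reported 95 % looks good even though the top-1 prediction is wrong. What you need to do is change the loss function to categorical crossentropy.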
model.compile(optimizer=tf.keras.optimizers.Adam(lr=LEARNING_RATE),
loss="categorical_crossentropy", # here
metrics=["accuracy"]
)

retrain object_detection not trained

Background:
Windows 10
Tensorflow: 1.12
I followed the official documentation here. Because the dataset is generated from an experiment, there are not many images available: about 50 training images and 10 test images. The pre-trained model is ssd_inception_v2_coco. When training using
python train.py --logtostderr --train_dir=training/ --pipeline_config_path=training/ssd_inception_v2_coco.config
I saw the following output, and then the program quit.
(a million lines here...)
W0423 15:59:38.764785 21492 variables_helper.py:144] Variable [FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_5_3x3_s2_128/BatchNorm/beta/RMSProp] is not available in checkpoint
W0423 15:59:38.765782 21492 variables_helper.py:144] Variable [FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_5_3x3_s2_128/BatchNorm/beta/RMSProp_1] is not available in checkpoint
W0423 15:59:38.765782 21492 variables_helper.py:144] Variable [FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_5_3x3_s2_128/BatchNorm/gamma/ExponentialMovingAverage] is not available in checkpoint
W0423 15:59:38.765782 21492 variables_helper.py:144] Variable [FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_5_3x3_s2_128/BatchNorm/gamma/RMSProp] is not available in checkpoint
W0423 15:59:38.765782 21492 variables_helper.py:144] Variable [FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_5_3x3_s2_128/BatchNorm/gamma/RMSProp_1] is not available in checkpoint
W0423 15:59:38.765782 21492 variables_helper.py:144] Variable [FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_5_3x3_s2_128/weights/ExponentialMovingAverage] is not available in checkpoint
W0423 15:59:38.765782 21492 variables_helper.py:144] Variable [FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_5_3x3_s2_128/weights/RMSProp] is not available in checkpoint
W0423 15:59:38.765782 21492 variables_helper.py:144] Variable [FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_5_3x3_s2_128/weights/RMSProp_1] is not available in checkpoint
WARNING:tensorflow:From d:\Anaconda3\lib\site-packages\tensorflow\contrib\slim\python\slim\learning.py:737: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession
W0423 15:59:39.539828 21492 tf_logging.py:125] From d:\Anaconda3\lib\site-packages\tensorflow\contrib\slim\python\slim\learning.py:737: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession
2019-04-23 15:59:41.155297: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
2019-04-23 15:59:41.385078: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties:
name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.7085
pciBusID: 0000:01:00.0
totalMemory: 11.00GiB freeMemory: 9.11GiB
2019-04-23 15:59:41.390824: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0
2019-04-23 15:59:42.311427: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-04-23 15:59:42.322811: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988] 0
2019-04-23 15:59:42.324856: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0: N
2019-04-23 15:59:42.327029: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 8799 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1)
INFO:tensorflow:Restoring parameters from pre-trained-model/model.ckpt
I0423 15:59:46.439763 21492 tf_logging.py:115] Restoring parameters from pre-trained-model/model.ckpt
INFO:tensorflow:Running local_init_op.
I0423 15:59:46.674186 21492 tf_logging.py:115] Running local_init_op.
INFO:tensorflow:Done running local_init_op.
I0423 15:59:47.319484 21492 tf_logging.py:115] Done running local_init_op.
INFO:tensorflow:Starting Session.
I0423 15:59:54.453117 21492 tf_logging.py:115] Starting Session.
INFO:tensorflow:Saving checkpoint to path training/model.ckpt
I0423 15:59:54.647598 15672 tf_logging.py:115] Saving checkpoint to path training/model.ckpt
INFO:tensorflow:Starting Queues.
I0423 15:59:54.651614 21492 tf_logging.py:115] Starting Queues.
INFO:tensorflow:global_step/sec: 0
I0423 16:00:01.125150 4792 tf_logging.py:159] global_step/sec: 0
D:\workspace\demo>
And here is the configure file:
model {
ssd {
num_classes: 1
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
}
}
similarity_calculator {
iou_similarity {
}
}
anchor_generator {
ssd_anchor_generator {
num_layers: 6
min_scale: 0.2
max_scale: 0.95
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
aspect_ratios: 3.0
aspect_ratios: 0.3333
reduce_boxes_in_lowest_layer: true
}
}
image_resizer {
fixed_shape_resizer {
height: 300
width: 300
}
}
box_predictor {
convolutional_box_predictor {
min_depth: 0
max_depth: 0
num_layers_before_predictor: 0
use_dropout: false
dropout_keep_probability: 0.8
kernel_size: 3
box_code_size: 4
apply_sigmoid_to_scores: false
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
}
}
}
feature_extractor {
type: 'ssd_inception_v2'
min_depth: 16
depth_multiplier: 1.0
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
train: true,
scale: true,
center: true,
decay: 0.9997,
epsilon: 0.001,
}
}
override_base_feature_extractor_hyperparams: true
}
loss {
classification_loss {
weighted_sigmoid {
}
}
localization_loss {
weighted_smooth_l1 {
}
}
hard_example_miner {
num_hard_examples: 3000
iou_threshold: 0.99
loss_type: CLASSIFICATION
max_negatives_per_positive: 3
min_negatives_per_image: 0
}
classification_weight: 1.0
localization_weight: 1.0
}
normalize_loss_by_num_matches: true
post_processing {
batch_non_max_suppression {
score_threshold: 1e-8
iou_threshold: 0.6
max_detections_per_class: 100
max_total_detections: 100
}
score_converter: SIGMOID
}
}
}
train_config: {
batch_size: 4
optimizer {
rms_prop_optimizer: {
learning_rate: {
exponential_decay_learning_rate {
initial_learning_rate: 0.0004
decay_steps: 5000
decay_factor: 0.99
}
}
momentum_optimizer_value: 0.9
decay: 0.9
epsilon: 1.0
}
}
fine_tune_checkpoint: "pre-trained-model/model.ckpt"
from_detection_checkpoint: true
# Note: The below line limits the training process to 200K steps, which we
# empirically found to be sufficient enough to train the pets dataset. This
# effectively bypasses the learning rate schedule (the learning rate will
# never decay). Remove the below line to train indefinitely.
num_steps: 200000
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
ssd_random_crop {
}
}
}
train_input_reader: {
tf_record_input_reader {
input_path: "annotations/train.record"
}
label_map_path: "annotations/label_map.pbtxt"
}
eval_config: {
num_examples: 5
# Note: The below line limits the evaluation process to 5 evaluations.
# Remove the below line to evaluate indefinitely.
max_evals: 5
}
eval_input_reader: {
tf_record_input_reader {
input_path: "annotations/test.record"
}
label_map_path: "annotations/label_map.pbtxt"
shuffle: false
num_readers: 1
}
I guess the model is not getting trained, because TensorBoard looks like this:
Well, any idea how to make the training start?
Try adding --num_train_steps=10 to your command.
Well, after resizing the images to 600 * 300, things work.