Tensorflow GPU Custom Object Detection not working - tensorflow

I am relatively new to object detection with TensorFlow and need guidance on the issue below.
I am building a custom model to detect two objects using TensorFlow and the Faster_Rcnn_inception_v2 model. For this I used 600 images, each containing both objects, split 75% into a train folder and 25% into a test folder.
I was able to train the model on a GPU (Linux) machine and reached a loss of only 0.05.
After generating the frozen_inference_graph.pb file and testing it, it did not detect a single object across more than 10 images.
It only works when I lower the min_score_thresh parameter to 0.4, and even then the objects are detected with only around 47% confidence.
However, when I train the same model on a different CPU (Windows) machine, it works absolutely fine and the results are satisfactory, with confidence above 80 percent.
Can someone please shed some light on this issue? Why does the model not work when trained on the GPU, while the same model works when trained on the CPU?
Note: the issue started only recently; two months back, the GPU-trained model was working exceptionally well for a different object.
I can share the contents of the config, labelmap, or any other file if required.
Command for Training:
python train.py --logtostderr --train_dir="TrainingDp" --pipeline_config_path="TrainingDp/faster_rcnn.config"
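For reference, the same models/research checkout also ships an evaluation script next to train.py (eval.py, or legacy/eval.py in newer checkouts; this is an assumption about this copy of the repo), which scores a checkpoint against the test record and makes it easier to compare the GPU-trained and CPU-trained runs numerically:
python eval.py --logtostderr --checkpoint_dir="TrainingDp" --eval_dir="TrainingDp/eval" --pipeline_config_path="TrainingDp/faster_rcnn.config"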
Code:
#!/usr/bin/env python
# coding: utf-8
# In[3]:
import os
import pathlib
if "models" in pathlib.Path.cwd().parts:
while "models" in pathlib.Path.cwd().parts:
os.chdir('..')
elif not pathlib.Path('models').exists():
get_ipython().system('git clone --depth 1 https://github.com/tensorflow/models')
# In[4]:
import numpy as np
import os
import six.moves.urllib as urllib
import sys
import tarfile
import tensorflow as tf
import zipfile
import cv2
from collections import defaultdict
from io import StringIO
from matplotlib import pyplot as plt
from PIL import Image
from IPython.display import display
# In[5]:
from object_detection.utils import ops as utils_ops
from object_detection.utils import label_map_util
from object_detection.utils import visualization_utils as vis_util
# In[6]:
# patch tf1 into `utils.ops`
utils_ops.tf = tf.compat.v1
# Patch the location of gfile
tf.gfile = tf.io.gfile
# In[7]:
# If you want to test the code with your images, just add path to the images to the TEST_IMAGE_PATHS.
PATH_TO_TEST_IMAGES_DIR = pathlib.Path('C:/Users/xxxxxx/Desktop/models-master/models-master/research/object_detection/test_images')
TEST_IMAGE_PATHS = sorted(list(PATH_TO_TEST_IMAGES_DIR.glob("*.jpg")))
len(TEST_IMAGE_PATHS)
num=len(TEST_IMAGE_PATHS)+1
# In[16]:
model_name = r'C:\Users\xxxxxx\Desktop\models-master\models-master\research\object_detection\TrainingDp2'
PATH_TO_FROZEN_GRAPH= r'C:\Users\xxxxxx\Desktop\models-master\models-master\research\object_detection\inference_graph\frozen_inference_graph.pb'
PATH_TO_LABELS= r'C:\Users\xxxxxx\Desktop\models-master\models-master\research\object_detection\TrainingDp2\labelmap.pbtxt'
# In[17]:
detection_graph = tf.Graph()
with detection_graph.as_default():
    od_graph_def = tf.GraphDef()
    with tf.gfile.GFile(PATH_TO_FROZEN_GRAPH, 'rb') as fid:
        serialized_graph = fid.read()
        od_graph_def.ParseFromString(serialized_graph)
        tf.import_graph_def(od_graph_def, name='')
# In[18]:
category_index = label_map_util.create_category_index_from_labelmap(PATH_TO_LABELS, use_display_name=True)
# In[19]:
def load_image_into_numpy_array(image):
    (im_width, im_height) = image.size
    return np.array(image.getdata()).reshape(
        (im_height, im_width, 3)).astype(np.uint8)
# In[20]:
# For the sake of simplicity we will use only 2 images:
# image1.jpg
# image2.jpg
# If you want to test the code with your images, just add path to the images to the TEST_IMAGE_PATHS.
PATH_TO_TEST_IMAGES_DIR = 'test_images'
TEST_IMAGE_PATHS = [ os.path.join(PATH_TO_TEST_IMAGES_DIR, 'image{}.jpg'.format(i)) for i in range(1, 10) ]
# Size, in inches, of the output images.
IMAGE_SIZE = (12, 8)
# In[21]:
def run_inference_for_single_image(image, graph):
    with graph.as_default():
        with tf.Session() as sess:
            # Get handles to input and output tensors
            ops = tf.get_default_graph().get_operations()
            all_tensor_names = {output.name for op in ops for output in op.outputs}
            tensor_dict = {}
            for key in [
                    'num_detections', 'detection_boxes', 'detection_scores',
                    'detection_classes', 'detection_masks'
            ]:
                tensor_name = key + ':0'
                if tensor_name in all_tensor_names:
                    tensor_dict[key] = tf.get_default_graph().get_tensor_by_name(
                        tensor_name)
            if 'detection_masks' in tensor_dict:
                # The following processing is only for a single image
                detection_boxes = tf.squeeze(tensor_dict['detection_boxes'], [0])
                detection_masks = tf.squeeze(tensor_dict['detection_masks'], [0])
                # Reframing is required to translate the masks from box coordinates to image coordinates and fit the image size.
                real_num_detection = tf.cast(tensor_dict['num_detections'][0], tf.int32)
                detection_boxes = tf.slice(detection_boxes, [0, 0], [real_num_detection, -1])
                detection_masks = tf.slice(detection_masks, [0, 0, 0], [real_num_detection, -1, -1])
                detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(
                    detection_masks, detection_boxes, image.shape[1], image.shape[2])
                detection_masks_reframed = tf.cast(
                    tf.greater(detection_masks_reframed, 0.5), tf.uint8)
                # Follow the convention by adding back the batch dimension
                tensor_dict['detection_masks'] = tf.expand_dims(
                    detection_masks_reframed, 0)
            image_tensor = tf.get_default_graph().get_tensor_by_name('image_tensor:0')
            # Run inference
            output_dict = sess.run(tensor_dict,
                                   feed_dict={image_tensor: image})
            # All outputs are float32 numpy arrays, so convert types as appropriate
            output_dict['num_detections'] = int(output_dict['num_detections'][0])
            output_dict['detection_classes'] = output_dict[
                'detection_classes'][0].astype(np.int64)
            output_dict['detection_boxes'] = output_dict['detection_boxes'][0]
            output_dict['detection_scores'] = output_dict['detection_scores'][0]
            if 'detection_masks' in output_dict:
                output_dict['detection_masks'] = output_dict['detection_masks'][0]
    return output_dict
# In[22]:
get_ipython().run_line_magic('matplotlib', 'inline')
# In[23]:
count = 0
for image_path in TEST_IMAGE_PATHS:
    image = Image.open(image_path)
    # The array-based representation of the image will be used later in order to prepare the
    # result image with boxes and labels on it.
    image_np = load_image_into_numpy_array(image)
    # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
    image_np_expanded = np.expand_dims(image_np, axis=0)
    # Actual detection.
    output_dict = run_inference_for_single_image(image_np_expanded, detection_graph)
    # Visualization of the results of a detection.
    vis_util.visualize_boxes_and_labels_on_image_array(
        image_np,
        output_dict['detection_boxes'],
        output_dict['detection_classes'],
        output_dict['detection_scores'],
        category_index,
        instance_masks=output_dict.get('detection_masks'),
        use_normalized_coordinates=True,
        line_thickness=4,
        min_score_thresh=.4
    )
    plt.figure(figsize=IMAGE_SIZE)
    plt.imshow(image_np)
    # cv2.imshow('img', image_np)
    # cv2.imwrite expects BGR channel order, so convert from RGB before saving.
    bgr_image = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
    filename = r'C:\Users\xxxxxx\Desktop\models-master\models-master\research\object_detection\validation\Image' + str(count) + '.jpg'
    cv2.imwrite(filename, bgr_image)
    count += 1
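When chasing this kind of discrepancy it can help to look at the raw scores rather than the rendered boxes. A small sketch that reuses output_dict from the last loop iteration above (not part of the original script):
# Sketch: print the top raw detection scores and classes so the GPU-trained
# and CPU-trained graphs can be compared numerically instead of visually.
print("top scores:", output_dict['detection_scores'][:5])
print("top classes:", output_dict['detection_classes'][:5])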
Content of Config file:
# Faster R-CNN with Inception v2, configured for Oxford-IIIT Pets Dataset.
# Users should configure the fine_tune_checkpoint field in the train config as
# well as the label_map_path and input_path fields in the train_input_reader and
# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that
# should be configured.
model {
faster_rcnn {
num_classes: 2
image_resizer {
keep_aspect_ratio_resizer {
min_dimension: 600
max_dimension: 1024
}
}
feature_extractor {
type: 'faster_rcnn_inception_v2'
first_stage_features_stride: 16
}
first_stage_anchor_generator {
grid_anchor_generator {
scales: [0.25, 0.5, 1.0, 2.0]
aspect_ratios: [0.5, 1.0, 2.0]
height_stride: 16
width_stride: 16
}
}
first_stage_box_predictor_conv_hyperparams {
op: CONV
regularizer {
l2_regularizer {
weight: 0.0
}
}
initializer {
truncated_normal_initializer {
stddev: 0.01
}
}
}
first_stage_nms_score_threshold: 0.0
first_stage_nms_iou_threshold: 0.7
first_stage_max_proposals: 300
first_stage_localization_loss_weight: 2.0
first_stage_objectness_loss_weight: 1.0
initial_crop_size: 14
maxpool_kernel_size: 2
maxpool_stride: 2
second_stage_box_predictor {
mask_rcnn_box_predictor {
use_dropout: false
dropout_keep_probability: 1.0
fc_hyperparams {
op: FC
regularizer {
l2_regularizer {
weight: 0.0
}
}
initializer {
variance_scaling_initializer {
factor: 1.0
uniform: true
mode: FAN_AVG
}
}
}
}
}
second_stage_post_processing {
batch_non_max_suppression {
score_threshold: 0.0
iou_threshold: 0.6
max_detections_per_class: 100
max_total_detections: 300
}
score_converter: SOFTMAX
}
second_stage_localization_loss_weight: 2.0
second_stage_classification_loss_weight: 1.0
}
}
train_config: {
batch_size: 1
optimizer {
momentum_optimizer: {
learning_rate: {
manual_step_learning_rate {
initial_learning_rate: 0.0002
schedule {
step: 900000
learning_rate: .00002
}
schedule {
step: 1200000
learning_rate: .000002
}
}
}
momentum_optimizer_value: 0.9
}
use_moving_average: false
}
gradient_clipping_by_norm: 10.0
fine_tune_checkpoint: "C:/Users/xxxxxx/Desktop/models-master/models-master/research/object_detection/faster_rcnn_inception_v2_coco_2018_01_28/model.ckpt"
from_detection_checkpoint: true
load_all_detection_checkpoint_vars: true
# Note: The below line limits the training process to 200K steps, which we
# empirically found to be sufficient enough to train the pets dataset. This
# effectively bypasses the learning rate schedule (the learning rate will
# never decay). Remove the below line to train indefinitely.
num_steps: 200000
data_augmentation_options {
random_horizontal_flip {
}
}
}
train_input_reader: {
tf_record_input_reader {
input_path: "C:/Users/xxxxxx/Desktop/models-master/models-master/research/object_detection/Train.record"
}
label_map_path: "C:/Users/xxxxxx/Desktop/models-master/models-master/research/object_detection/TrainingDp2/labelmap.pbtxt"
}
eval_config: {
metrics_set: "coco_detection_metrics"
num_examples: 1101
}
eval_input_reader: {
tf_record_input_reader {
input_path: "C:/Users/xxxxxx/Desktop/models-master/models-master/research/object_detection/Test.record"
}
label_map_path: "C:/Users/xxxxxx/Desktop/models-master/models-master/research/object_detection/TrainingDp2/labelmap.pbtxt"
shuffle: false
num_readers: 1
}
Thanks.

In case anyone is having the same issue:
I managed to resolve it by changing batch_size to 2 in the configuration file.
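For reference, that corresponds to this change in the train_config block of the config shown above:
train_config: {
  batch_size: 2  # changed from 1; everything else left as posted above
  ...
}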

Related

The tensorflow object_detection model can only detect 1 face

I'm trying to build a model to detect faces. I used the SSD MobileNet v2 320x320 pipeline.config to train, without loading the fine-tune checkpoint. The labelmap I used is the officially provided face_label_map.pbtxt.
The model detects well when a photo contains only one face, but it still detects only one face when there is more than one face in the photo.
I have been searching the net for a long time, but with no luck. Please help or give some ideas on how to solve this. Thanks in advance.
Screenshots: result of one face, and result of more than one face.
The code of pipeline.config:
model {
ssd {
num_classes: 1
image_resizer {
fixed_shape_resizer {
height: 300
width: 300
}
}
feature_extractor {
type: "ssd_mobilenet_v2_keras"
depth_multiplier: 1.0
min_depth: 16
conv_hyperparams {
regularizer {
l2_regularizer {
weight: 3.9999998989515007e-05
}
}
initializer {
truncated_normal_initializer {
mean: 0.0
stddev: 0.029999999329447746
}
}
activation: RELU_6
batch_norm {
decay: 0.9700000286102295
center: true
scale: true
epsilon: 0.0010000000474974513
train: true
}
}
override_base_feature_extractor_hyperparams: true
}
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
use_matmul_gather: true
}
}
similarity_calculator {
iou_similarity {
}
}
box_predictor {
convolutional_box_predictor {
conv_hyperparams {
regularizer {
l2_regularizer {
weight: 3.9999998989515007e-05
}
}
initializer {
random_normal_initializer {
mean: 0.0
stddev: 0.009999999776482582
}
}
activation: RELU_6
batch_norm {
decay: 0.9700000286102295
center: true
scale: true
epsilon: 0.0010000000474974513
train: true
}
}
min_depth: 0
max_depth: 0
num_layers_before_predictor: 0
use_dropout: false
dropout_keep_probability: 0.800000011920929
kernel_size: 1
box_code_size: 4
apply_sigmoid_to_scores: false
class_prediction_bias_init: -4.599999904632568
}
}
anchor_generator {
ssd_anchor_generator {
num_layers: 6
min_scale: 0.20000000298023224
max_scale: 0.949999988079071
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
aspect_ratios: 3.0
aspect_ratios: 0.33329999446868896
}
}
post_processing {
batch_non_max_suppression {
score_threshold: 9.99999993922529e-09
iou_threshold: 0.6000000238418579
max_detections_per_class: 100
max_total_detections: 100
use_static_shapes: false
}
score_converter: SIGMOID
}
normalize_loss_by_num_matches: true
loss {
localization_loss {
weighted_smooth_l1 {
delta: 1.0
}
}
classification_loss {
weighted_sigmoid_focal {
gamma: 2.0
alpha: 0.75
}
}
classification_weight: 1.0
localization_weight: 1.0
}
encode_background_as_zeros: true
normalize_loc_loss_by_codesize: true
inplace_batchnorm_update: true
freeze_batchnorm: false
}
}
train_config {
batch_size: 32
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
ssd_random_crop {
}
}
sync_replicas: true
optimizer {
momentum_optimizer {
learning_rate {
cosine_decay_learning_rate {
learning_rate_base: 0.800000011920929
total_steps: 50000
warmup_learning_rate: 0.13333000242710114
warmup_steps: 2000
}
}
momentum_optimizer_value: 0.8999999761581421
}
use_moving_average: false
}
num_steps: 50000
}
train_input_reader {
label_map_path: "/Users/aintor/Documents/Deep_Learning/Tensorflow/workspace/data/face_label_map.pbtxt"
tf_record_input_reader {
input_path: "/Users/aintor/Documents/Deep_Learning/Tensorflow/workspace/data/faces_train.tfrecord"
}
}
eval_config {
metrics_set: "coco_detection_metrics"
use_moving_averages: false
}
eval_input_reader {
label_map_path: "/Users/aintor/Documents/Deep_Learning/Tensorflow/workspace/data/face_label_map.pbtxt"
shuffle: false
num_epochs: 1
tf_record_input_reader {
input_path: "/Users/aintor/Documents/Deep_Learning/Tensorflow/workspace/data/faces_test.tfrecord"
}
}
The code of the pbtxt I used:
item {
name: "face"
id: 1
display_name: "face"
}
The code to generate the result photo:
import tensorflow as tf
import os
import numpy as np
from PIL import Image
import matplotlib
from matplotlib import pyplot as plt
import sys
sys.path.insert(0, path2scripts) # making scripts in models/research available for import
from object_detection.utils import label_map_util
from object_detection.utils import config_util
from object_detection.utils import visualization_utils as viz_utils
from object_detection.builders import model_builder
# do not change anything in this cell
configs = config_util.get_configs_from_pipeline_file(path2config) # importing config
model_config = configs['model'] # recreating model config
detection_model = model_builder.build(model_config=model_config, is_training=False) # importing model
ckpt = tf.compat.v2.train.Checkpoint(model=detection_model)
ckpt.restore(os.path.join(path2model, ckpt_num)).expect_partial()
category_index = label_map_util.create_category_index_from_labelmap(path2label_map,use_display_name=True)
def detect_fn(image):
    image, shapes = detection_model.preprocess(image)
    prediction_dict = detection_model.predict(image, shapes)
    detections = detection_model.postprocess(prediction_dict, shapes)
    return detections

def load_image_into_numpy_array(path):
    """Load an image from file into a numpy array.

    Puts image into numpy array to feed into tensorflow graph.
    Note that by convention we put it into a numpy array with shape
    (height, width, channels), where channels=3 for RGB.

    Args:
        path: the file path to the image

    Returns:
        numpy array with shape (img_height, img_width, 3)
    """
    return np.array(Image.open(path))
def inference_with_plot(path2images, box_th=0.25):
    """
    Function that performs inference and plots resulting b-boxes.

    Args:
        path2images: an array with paths to images
        box_th: (float) value that defines threshold for model prediction.

    Returns:
        None
    """
    matplotlib.use('MacOSX')
    for image_path in path2images:
        print('Running inference for {}... '.format(image_path), end='')
        image_np = load_image_into_numpy_array(image_path)
        input_tensor = tf.convert_to_tensor(np.expand_dims(image_np, 0), dtype=tf.float32)
        detections = detect_fn(input_tensor)
        # All outputs are batched tensors.
        # Convert to numpy arrays, and take index [0] to remove the batch dimension.
        # We're only interested in the first num_detections.
        num_detections = int(detections.pop('num_detections'))
        detections = {key: value[0, :num_detections].numpy()
                      for key, value in detections.items()}
        detections['num_detections'] = num_detections
        # detection_classes should be ints.
        detections['detection_classes'] = detections['detection_classes'].astype(np.int64)
        label_id_offset = 1
        image_np_with_detections = image_np.copy()
        viz_utils.visualize_boxes_and_labels_on_image_array(
            image_np_with_detections,
            detections['detection_boxes'],
            detections['detection_classes'] + label_id_offset,
            detections['detection_scores'],
            category_index,
            use_normalized_coordinates=True,
            max_boxes_to_draw=200,
            min_score_thresh=box_th,
            agnostic_mode=False,
            line_thickness=5)
        plt.figure(figsize=(15, 10))
        plt.imshow(image_np_with_detections)
        print('Done')
    plt.show()

inference_with_plot(test_images)
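To narrow down whether the model itself only proposes one box or the visualization is the limiting factor, a small helper built on the functions above can print the scores directly (a sketch; count_confident_boxes is a name introduced here for illustration):
def count_confident_boxes(image_path, box_th=0.25):
    # Sketch: report how many detections score above the plotting threshold
    # for a single image, to check whether the model proposes more than one face.
    image_np = load_image_into_numpy_array(image_path)
    input_tensor = tf.convert_to_tensor(np.expand_dims(image_np, 0), dtype=tf.float32)
    detections = detect_fn(input_tensor)
    num_detections = int(detections.pop('num_detections'))
    scores = detections['detection_scores'][0, :num_detections].numpy()
    print(image_path, 'boxes above threshold:', int((scores > box_th).sum()),
          'top scores:', scores[:5])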

Which accuracy should I trust? Model.Evaluate or Model.Predict?

Question: I am doing multi-class image classification using TensorFlow 2.5 on Google Colab. I received three different values of classification accuracy, and I do not know which one I should trust or why they are different.
Demonstration:
when I was evaluating on the test set, I received accuracy_1
29/29 [==============================] - 5s 147ms/step - loss: 1.1036 - accuracy: 0.3186
when I was predicting on the test set, I received accuracy_2, which is 0.22
precision recall f1-score support
0 0.69 0.12 0.21 1305
1 0.15 0.78 0.26 272
2 0.14 0.13 0.13 231
accuracy 0.22 1808
macro avg 0.33 0.34 0.20 1808
weighted avg 0.54 0.22 0.20 1808
Here's how I got accuracy_3, whose value is 0.2129424778761062:
from sklearn.metrics import accuracy_score

prediction = np.argmax(detector.predict(test_dataset), axis=1)
accuracy_3 = accuracy_score(
    np.concatenate([label.numpy() for image, label in test_dataset.take(-1)]),
    prediction
)
I discovered that if I run the code block that calculates accuracy_3 multiple times, I get a different result each time, but it never differs much from accuracy_2, which is 0.22. Here is the code for the calculation of accuracy_1 and accuracy_2:
from tensorflow.keras.callbacks import Callback

class Peek(Callback):
    def on_epoch_begin(self, epoch, logs=None):
        current_decayed_lr = self.model.optimizer._decayed_lr(tf.float32).numpy()
        print(f"Current learning rate: {current_decayed_lr}")

    def on_epoch_end(self, epoch, logs=None):
        print("Evaluating...")
        self.model.evaluate(test_dataset, verbose=1)  # calculates accuracy_1
        print("Predicting...")
        predictions = np.argmax(self.model.predict(test_dataset), axis=1)
        true_categories = np.array([label.numpy() for image, label in test_dataset.unbatch()])
        print(classification_report(true_categories, predictions))  # calculates accuracy_2
The difference between accuracy_2 and accuracy_3 is most likely due to random chance, but accuracy_1 is much larger than the other two. I searched on Stack Overflow, and some posts say the difference could be due to shuffle=True in ImageDataGenerator when creating the test set. My case is different because, for performance reasons, I was not using ImageDataGenerator; I was loading the data from TFRecords. Here is the full code.
import os
import math
import numpy as np
import tensorflow as tf
from glob import glob
from progressbar import progressbar
from os.path import basename, exists
from tensorflow.sparse import to_dense
from tensorflow.data import Dataset, Options, TFRecordDataset
from tensorflow.image import decode_jpeg, encode_jpeg, resize
from tensorflow.train import Feature, Features, BytesList, Int64List, FloatList, Example
from tensorflow.io import read_file, TFRecordWriter, FixedLenFeature, VarLenFeature, parse_single_example
from tensorflow.data.experimental import AUTOTUNE
class DataLoader:
def __init__(self, subset_name):
self.subset_name = subset_name
self.file_pattern = glob(
f"./dataset/{self.subset_name}/**/*.jpg",
recursive=True
)
self.target_size = (224, 224)
self.classes = [b"Negative", b"Positive", b"Unreadable"]
self.n_images = len(self.file_pattern)
self.n_shards = 32
self.write_shard_size = math.ceil(1.0 * self.n_images / self.n_shards)
self.read_shard_size = 64
self.output_dir = f"tfrecords-jpeg-{subset_name}-{'x'.join(map(lambda x: str(x), self.target_size))}"
def fetch_image_and_label(self, filename):
bits = read_file(filename)
image = decode_jpeg(bits)
image = resize(image, self.target_size)
height = tf.shape(image)[0]
width = tf.shape(image)[1]
image = tf.cast(image, tf.uint8)
image = encode_jpeg(image, optimize_size=True, chroma_downsampling=False)
label = tf.expand_dims(filename, axis=-1)
label = tf.strings.split(label, sep="/")
label = label.values[-2]
return image, label, height, width
@staticmethod
def _bytestring_feature(list_of_bytestrings):
return Feature(bytes_list=BytesList(value=list_of_bytestrings))
@staticmethod
def _int_feature(list_of_ints):
return Feature(int64_list=Int64List(value=list_of_ints))
@staticmethod
def _float_feature(list_of_floats):
return Feature(float_list=FloatList(value=list_of_floats))
def to_tfrecord(self, tfrec_filewriter, img_bytes, label, height, width):
class_num = np.argmax(np.array(self.classes) == label)
one_hot_class = np.eye(len(self.classes))[class_num]
feature = {
"image": self._bytestring_feature([img_bytes]),
"class": self._int_feature([class_num]),
"label": self._bytestring_feature([label]),
"size": self._int_feature([height, width]),
"one_hot_class": self._float_feature(one_hot_class.tolist())
}
return Example(features=Features(feature=feature))
def write_records(self):
print(f"{self.n_images} images, {self.n_shards} shards with {self.write_shard_size} images each.")
filenames = Dataset.list_files(self.file_pattern, seed=35155)
dataset = filenames.map(self.fetch_image_and_label, num_parallel_calls=AUTOTUNE).batch(self.write_shard_size)
if not exists(self.output_dir):
os.mkdir(self.output_dir)
print("Writing TFRecords...")
for shard, (image, label, height, width) in enumerate(dataset):
shard_size = image.numpy().shape[0]
filename = f"{self.output_dir}/{str(shard).zfill(2)}-{shard_size}.tfrec"
with TFRecordWriter(filename) as out_file:
for i in progressbar(range(shard_size)):
example = self.to_tfrecord(
out_file,
image.numpy()[i],
label.numpy()[i],
height.numpy()[i],
width.numpy()[i]
)
out_file.write(example.SerializeToString())
print(f"Wrote file {filename} containing {shard_size} records")
def _read_tfrecord(self, example):
features = {
"image": FixedLenFeature([], tf.string),
"class": FixedLenFeature([], tf.int64),
"label": FixedLenFeature([], tf.string),
"size": FixedLenFeature([2], tf.int64),
"one_hot_class": VarLenFeature(tf.float32)
}
example = parse_single_example(example, features)
image = decode_jpeg(example["image"], channels=3)
image = tf.reshape(image, [*self.target_size, 3])
class_num = example["class"]
label = example["label"]
height = example["size"][0]
width = example["size"][1]
one_hot_class = to_dense(example["one_hot_class"])
# return image, class_num, label, height, width, one_hot_class
# return only image and class_num because we're classifying images
return image, class_num
def read_records(self):
from tensorflow.io.gfile import glob
option_no_order = Options()
option_no_order.experimental_deterministic = False
filenames = glob(f"{self.output_dir}/*.tfrec")
dataset = TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
dataset = dataset.with_options(option_no_order)
dataset = dataset.map(self._read_tfrecord, num_parallel_calls=AUTOTUNE)
dataset = dataset.shuffle(10000)
dataset = dataset.prefetch(buffer_size=AUTOTUNE)
dataset = dataset.batch(self.read_shard_size)
return dataset
train_loader = DataLoader("train")
validation_loader = DataLoader("validation")
test_loader = DataLoader("test")
train_dataset = train_loader.read_records()
validation_dataset = validation_loader.read_records()
test_dataset = test_loader.read_records()
train_dataset = train_dataset.concatenate(validation_dataset)
The difference between accuracy_2 and accuracy_3 still exists, and accuracy_3 still changes every time I run the code block that computes it, even after dataset = dataset.shuffle(10000) is removed from read_records() in the DataLoader class.
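A quick way to check whether the evaluation pipeline even yields a stable order (a sketch using the test_dataset built above, not part of the original notebook):
import numpy as np

# Sketch: if the label order changes between two passes over the dataset,
# predictions collected in one pass cannot be lined up with labels collected
# in another pass, which would depress accuracy_2/accuracy_3 relative to evaluate().
labels_pass_1 = np.concatenate([label.numpy() for image, label in test_dataset])
labels_pass_2 = np.concatenate([label.numpy() for image, label in test_dataset])
print("same order across passes:", np.array_equal(labels_pass_1, labels_pass_2))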
I will also paste the code regarding how the model was instantiated and compiled to provide more background information.
from tensorflow.keras import Input, Model
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.applications.densenet import DenseNet201
from tensorflow.keras.layers import GlobalAveragePooling2D, Dropout, Dense
from tensorflow.keras.applications.densenet import preprocess_input
def create_model():
    feature_extractor = DenseNet201(
        weights="imagenet",
        input_shape=(224, 224, 3),
        include_top=False
    )
    feature_extractor.trainable = True
    inputs = Input([224, 224, 3])
    x = preprocess_input(inputs)
    x = feature_extractor(x)
    x = GlobalAveragePooling2D()(x)
    x = Dense(32, activation="elu")(x)
    x = Dropout(0.8)(x)
    outputs = Dense(3, activation="softmax")(x)
    detector = Model(inputs, outputs)
    detector.compile(
        optimizer=SGD(learning_rate=0.001, momentum=0.9),
        loss=["sparse_categorical_crossentropy"],
        metrics=["sparse_categorical_accuracy"]
    )
    return detector
detector = create_model()
peek = Peek()
detector.fit(
train_dataset,
epochs=1,
validation_data=test_dataset,
class_weight=class_weight,
callbacks=[peek],
)

Target 40 is out of bounds for nn.CrossEntropyLoss()

I created a custom image dataset like this:
from torch.utils.data.dataset import Dataset
from PIL import Image
import torchvision
from torchvision import datasets, models, transforms
import numpy as np
class MyCustomDataset(Dataset):
    def __init__(self, df, transforms=None):
        """
        Args:
            df (pandas.DataFrame): image paths and labels in a dataframe
            transforms: pytorch transforms for transforms and tensor conversion
        """
        # Transforms
        self.transforms = transforms
        # Read the dataframe
        self.data_info = df
        # First column contains the image paths
        self.image_arr = np.asarray(self.data_info.iloc[:, 0])
        # Second column is the labels
        self.label_arr = np.asarray(self.data_info.iloc[:, 1])
        # Calculate len
        self.data_len = len(self.data_info.index)

    def __getitem__(self, index):
        # Get image name from the pandas df
        single_image_name = self.image_arr[index]
        # Open image
        img_as_img = Image.open(single_image_name)
        img_as_tensor = self.transforms(img_as_img)
        # Get label (class) of the image from the second pandas column
        single_image_label = self.label_arr[index]
        return (img_as_tensor, single_image_label)

    def __len__(self):
        return self.data_len
The df input for MyCustomDataset(df, transforms) is a pandas DataFrame storing the image paths and labels; it looks like this:
file_name label
0 M:\RealModels\images\001\001001.png 0
1 M:\RealModels\images\001\002001.png 0
2 M:\RealModels\images\001\003001.png 0
3 M:\RealModels\images\001\004001.png 0
4 M:\RealModels\images\001\006001.png 0
... ... ...
3197 M:\RenderedModels\images_rgb\450\116450.png 45
3198 M:\RenderedModels\images_rgb\450\117450.png 45
3199 M:\RenderedModels\images_rgb\450\118450.png 45
3200 M:\RenderedModels\images_rgb\450\119450.png 45
3201 M:\RenderedModels\images_rgb\450\120450.png 45
3202 rows × 2 columns
There are 16 classes in my dataset. The class labels look like: ['00', '01', '12', '34', '35', ..., '45'].
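For reference, nn.CrossEntropyLoss with a 16-unit output layer expects target indices in the range 0 to 15, so a raw label value such as 40 or 45 is out of bounds. A minimal remapping sketch (assuming the file_name/label columns shown above; the exact mapping is illustrative):
# Sketch: map the 16 distinct raw label values onto contiguous indices 0..15
# before building MyCustomDataset, since CrossEntropyLoss indexes the output layer.
classes = sorted(df['label'].unique())
class_to_idx = {c: i for i, c in enumerate(classes)}
df['label'] = df['label'].map(class_to_idx)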
My whole program is:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import time
import os
import copy
import scipy
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data.dataset import Dataset
import torchvision
from torchvision import datasets, models, transforms
# In[2]:
import sys
sys.path.append(r"M:\program\pytorch\Scripts")
import custom_fun
from custom_fun import custom_dataset
import create_folder
from create_folder import create_tb_folder
import dataset_from_image
from dataset_from_image import MyCustomDataset
# In[3]:
tb_dir = r'../'
path = create_tb_folder(tb_dir)
# In[4]:
mean = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])
# In[5]:
data_transforms = {
'train': transforms.Compose([
transforms.RandomRotation(degrees=15),
transforms.Resize((224,224)), # 299 for Inception v3
transforms.ColorJitter(),
transforms.ToTensor(),
transforms.Normalize(mean, std)
]),
'val': transforms.Compose([
transforms.Resize((224,224)),
transforms.ToTensor(),
transforms.Normalize(mean, std)
]),
}
# In[6]:
data_dir = r'M:\dataset\first_att'
# In[7]:
batch_size = 4
lr = 0.003
# In[8]:
data = custom_dataset(data_dir,0.8)
# In[37]:
df = data['train']
df.head()
# In[9]:
image_datasets = {x: MyCustomDataset(data[x], data_transforms[x]) for x in ['train', 'val']}
# In[24]:
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size,
shuffle=True, num_workers=0)
for x in ['train', 'val']}
# In[25]:
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
device = torch.device("cpu")
# In[26]:
# Get a batch of training data
inputs, classes = next(iter(dataloaders['train']))
print(inputs.shape,classes.shape)
print(classes)
# Make a grid from batch
out = torchvision.utils.make_grid(inputs)
# In[27]:
tb1 = SummaryWriter(path[1])
tb2 = SummaryWriter(path[2])
# In[28]:
def train_model(model, criterion, optimizer, scheduler, num_epochs):
since = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
for epoch in range(num_epochs):
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 10)
# Each epoch has a training and validation phase
for phase in ['train', 'val']:
if phase == 'train':
model.train() # Set model to training mode
else:
model.eval() # Set model to evaluate mode
running_loss = 0.0
running_corrects = 0
# Iterate over data.
for inputs, labels in dataloaders[phase]:
inputs = inputs.to(device)
labels = labels.to(device)
# forward
# track history if only in train
with torch.set_grad_enabled(phase == 'train'):
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels)
# backward + optimize only if in training phase
if phase == 'train':
optimizer.zero_grad()
loss.backward()
optimizer.step()
# statistics
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)
if phase == 'train':
scheduler.step()
epoch_loss = running_loss / dataset_sizes[phase] # images of training data
epoch_acc = running_corrects.double() / dataset_sizes[phase] # images of val data
print('{} Loss: {:.4f} Acc: {:.4f}'.format(
phase, epoch_loss, epoch_acc))
# Tensorboard works here
if phase == 'train':
tb1.add_scalar('Loss', epoch_loss, epoch)
tb1.add_scalar('Accuracy', epoch_acc, epoch)
for name, weight in model.named_parameters():
tb.add_histogram(name, weight, epoch)
tb.add_histogram(f'{name}.grad', weight.grad, epoch)
else:
tb2.add_scalar('Loss', epoch_loss, epoch)
tb2.add_scalar('Accuracy', epoch_acc, epoch)
# deep copy the model
if phase == 'val' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model_wts = copy.deepcopy(model.state_dict())
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))
# load best model weights
model.load_state_dict(best_model_wts)
return model
# In[29]:
model = models.resnet18(pretrained=True)
# The way below is feature extraction.
#for param in model.parameters():
#param.requires_grad = False
# In[30]:
num_ftrs = model.fc.in_features
# In[31]:
model.fc = nn.Linear(num_ftrs, 16)
# In[32]:
model = model.to(device)
criterion = nn.CrossEntropyLoss()
# In[33]:
# Observe that all parameters are being optimized
optimizer = optim.Adam(model.parameters(), lr=lr)
# In[34]:
step_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
# In[35]:
tb = SummaryWriter(path[0])
grid = torchvision.utils.make_grid(inputs)
tb.add_image('images', grid)
tb.add_graph(model, inputs)
tb.close()
# In[36]:
model = train_model(model, criterion, optimizer, step_lr_scheduler, num_epochs=20)
# In[ ]:
torch.save(model.state_dict(), 'first_att_02.pth')
# In[ ]:
But I got the error:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-36-edba91d1cb93> in <module>
----> 1 model = train_model(model, criterion, optimizer, step_lr_scheduler, num_epochs=20)
<ipython-input-28-662f652902cf> in train_model(model, criterion, optimizer, scheduler, num_epochs)
29 outputs = model(inputs)
30 _, preds = torch.max(outputs, 1)
---> 31 loss = criterion(outputs, labels)
32
33 # backward + optimize only if in training phase
M:\program\pytorch\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
530 result = self._slow_forward(*input, **kwargs)
531 else:
--> 532 result = self.forward(*input, **kwargs)
533 for hook in self._forward_hooks.values():
534 hook_result = hook(self, input, result)
M:\program\pytorch\lib\site-packages\torch\nn\modules\loss.py in forward(self, input, target)
914 def forward(self, input, target):
915 return F.cross_entropy(input, target, weight=self.weight,
--> 916 ignore_index=self.ignore_index, reduction=self.reduction)
917
918
M:\program\pytorch\lib\site-packages\torch\nn\functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction)
2019 if size_average is not None or reduce is not None:
2020 reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2021 return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
2022
2023
M:\program\pytorch\lib\site-packages\torch\nn\functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
1836 .format(input.size(0), target.size(0)))
1837 if dim == 2:
-> 1838 ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
1839 elif dim == 4:
1840 ret = torch._C._nn.nll_loss2d(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
IndexError: Target 40 is out of bounds.
Could you let me know where I went wrong? Thank you very much.

Keras GPU memory overflow using with keras.utils.sequence and generator

Dataset.py
import os
import random
from skimage import io
import cv2
from skimage.transform import resize
import numpy as np
import tensorflow as tf
import keras
import Augmentor
def iter_sequence_infinite(seq):
    """Iterate indefinitely over a Sequence.

    # Arguments
        seq: Sequence object

    # Returns
        Generator yielding batches.
    """
    while True:
        for item in seq:
            yield item
# data generator class
class DataGenerator(keras.utils.Sequence):
def __init__(self, ids, imgs_dir, masks_dir, batch_size=10, img_size=128, n_classes=1, n_channels=3, shuffle=True):
self.id_names = ids
self.indexes = np.arange(len(self.id_names))
self.imgs_dir = imgs_dir
self.masks_dir = masks_dir
self.batch_size = batch_size
self.img_size = img_size
self.n_classes = n_classes
self.n_channels = n_channels
self.shuffle = shuffle
self.on_epoch_end()
# for printing the statistics of the function
def on_epoch_end(self):
'Updates indexes after each epoch'
self.indexes = np.arange(len(self.id_names))
if self.shuffle == True:
np.random.shuffle(self.indexes)
def __data_generation__(self, id_name):
'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
# Initialization
img_path = os.path.join(self.imgs_dir, id_name) # polyp segmentation/images/id_name.jpg
mask_path = os.path.join(self.masks_dir, id_name) # polyp segmenatation/masks/id_name.jpg
img = io.imread(img_path)
mask = cv2.imread(mask_path)
p = Augmentor.DataPipeline([[img, mask]])
p.resize(probability=1.0, width=self.img_size, height=self.img_size)
p.rotate_without_crop(probability=0.3, max_left_rotation=10, max_right_rotation=10)
#p.random_distortion(probability=0.3, grid_height=10, grid_width=10, magnitude=1)
p.shear(probability=0.3, max_shear_left=1, max_shear_right=1)
#p.skew_tilt(probability=0.3, magnitude=0.1)
p.flip_random(probability=0.3)
sample_p = p.sample(1)
sample_p = np.array(sample_p).squeeze()
p_img = sample_p[0]
p_mask = sample_p[1]
augmented_mask = (p_mask // 255) * 255 # denoising
q = Augmentor.DataPipeline([[p_img]])
q.random_contrast(probability=0.3, min_factor=0.2, max_factor=1.0) # low to High
q.random_brightness(probability=0.3, min_factor=0.2, max_factor=1.0) # dark to bright
sample_q = q.sample(1)
sample_q = np.array(sample_q).squeeze()
image = sample_q
mask = augmented_mask[::, ::, 0]
"""
# reading the image from dataset
## Reading Image
image = io.imread(img_path) # reading image to image vaiable
image = resize(image, (self.img_size, self.img_size), anti_aliasing=True) # resizing input image to 128 * 128
mask = io.imread(mask_path, as_gray=True) # mask image of same size with all zeros
mask = resize(mask, (self.img_size, self.img_size), anti_aliasing=True) # resizing mask to fit the 128 * 128 image
mask = np.expand_dims(mask, axis=-1)
"""
# image normalization
image = image / 255.0
mask = mask / 255.0
return image, mask
def __len__(self):
"Denotes the number of batches per epoch"
return int(np.floor(len(self.id_names) / self.batch_size))
def __getitem__(self, index): # index : batch no.
# Generate indexes of the batch
# Generate indexes of the batch
indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
batch_ids = [self.id_names[k] for k in indexes]
imgs = list()
masks = list()
for id_name in batch_ids:
img, mask = self.__data_generation__(id_name)
imgs.append(img)
masks.append(np.expand_dims(mask,-1))
imgs = np.array(imgs)
masks = np.array(masks)
return imgs, masks # return batch
train.py
import argparse
import logging
import os
import sys
from tqdm import tqdm # progress bar
import numpy as np
import matplotlib.pyplot as plt
from keras import optimizers
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import segmentation_models as sm
from segmentation_models.utils import set_trainable
from dataset import DataGenerator, iter_sequence_infinite
def train_model(model, train_gen, valid_gen, epochs, save_cp=True):
total_batch_count = 0
train_img_num = len(train_gen.id_names)
train_batch_num = len(train_gen)
train_gen_out = iter_sequence_infinite(train_gen)
valid_batch_num = len(valid_gen)
valid_img_num = len(valid_gen.id_names)
valid_gen_out = iter_sequence_infinite(valid_gen)
for epoch in range(epochs): # interation as many epochs
set_trainable(model)
epoch_loss = 0 # loss in this epoch
epoch_iou = 0
count = 0
with tqdm(total=train_img_num, desc=f'Epoch {epoch + 1}/{epochs}', position=0, leave=True, unit='img') as pbar: # make progress bar
for _ in range(train_batch_num):
batch = next(train_gen_out)
imgs = batch[0]
true_masks = batch[1]
loss, iou = model.train_on_batch(imgs, true_masks) # value of loss of this batch
epoch_loss += loss
epoch_iou += iou
pbar.set_postfix(**{'Batch loss': loss, 'Batch IoU': iou}) # floating the loss at the post in the pbar
pbar.update(imgs.shape[0]) # update progress
count += 1
total_batch_count += 1
train_gen.on_epoch_end()
print( "Epoch : loss: {}, IoU : {}".format(epoch_loss/count, epoch_iou/count))
# Do validation
validation_model(model, valid_gen_out, valid_batch_num, valid_img_num)
valid_gen.on_epoch_end()
if save_cp:
try:
if not os.path.isdir(checkpoint_dir):
os.mkdir(checkpoint_dir)
logging.info('Created checkpoint directory')
else:
pass
except OSError:
pass
model.save_weights(os.path.join(checkpoint_dir , f'CP_epoch{epoch + 1}.h5'))
logging.info(f'Checkpoint {epoch + 1} saved !')
def validation_model(model, valid_gen_out, valid_batch_num, valid_img_num):
epoch_loss = 0 # loss in this epoch
epoch_iou = 0
count = 0
with tqdm(total=valid_img_num, desc='Validation round', position=0, leave=True, unit='img') as pbar: # make progress bar
for _ in range(valid_batch_num):
batch = next(valid_gen_out)
imgs = batch[0]
true_masks = batch[1]
loss, iou = model.test_on_batch(imgs, true_masks) # value of loss of this batch
epoch_loss += loss
epoch_iou += iou
pbar.set_postfix(**{'Batch, loss': loss, 'Batch IoU': iou}) # floating the loss at the post in the pbar
pbar.update(imgs.shape[0]) # update progress
count += 1
print("Validation loss: {}, IoU: {}".format(epoch_loss / count, epoch_iou / count))
pred_mask = model.predict(np.expand_dims(imgs[0],0))
plt.subplot(131)
plt.imshow(imgs[0])
plt.subplot(132)
plt.imshow(true_masks[0].squeeze(), cmap="gray")
plt.subplot(133)
plt.imshow(pred_mask.squeeze(), cmap="gray")
plt.show()
print()
def get_args():
parser = argparse.ArgumentParser(description='Train the UNet on images and target masks',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-e', '--epochs', metavar='E', type=int, default=50,
help='Number of epochs', dest='epochs')
parser.add_argument('-b', '--batch_size', metavar='B', type=int, nargs='?', default=2,
help='Batch size', dest='batch_size')
parser.add_argument('-l', '--learning-rate', metavar='LR', type=float, nargs='?', default=1e-5,
help='Learning rate', dest='lr')
parser.add_argument('-bb', '--backbone', default='resnet50', metavar='FILE',
help="backcone name")
parser.add_argument('-w', '--weight', dest='load', type=str, default=False,
help='Load model from a .h5 file')
parser.add_argument('-s', '--resizing', dest='resizing', type=int, default=384,
help='Downscaling factor of the images')
parser.add_argument('-v', '--validation', dest='val', type=float, default=20.0,
help='Percent of the data that is used as validation (0-100)')
return parser.parse_args()
if __name__ == '__main__':
img_dir = './data/train/imgs/' # ./data/train/imgs/CVC_Original/'
mask_dir = './data/train/masks/' # ./data/train/masks/CVC_Ground Truth/'
checkpoint_dir = './checkpoints'
args = get_args()
# train path
train_ids = os.listdir(img_dir)
# Validation Data Size
n_val = int(len(train_ids) * args.val/100) # size of validation set
valid_ids = train_ids[:n_val] # list of image ids used for validation of result 0 to 9
train_ids = train_ids[n_val:] # list of image ids used for training dataset
# print(valid_ids, "\n\n")
print("training_size: ", len(train_ids), "validation_size: ", len(valid_ids))
train_gen = DataGenerator(train_ids, img_dir, mask_dir, img_size=args.resizing, batch_size=args.batch_size)
valid_gen = DataGenerator(valid_ids, img_dir, mask_dir, img_size=args.resizing, batch_size=args.batch_size)
print("total training batches: ", len(train_gen))
print("total validaton batches: ", len(valid_gen))
train_steps = len(train_ids) // args.batch_size
valid_steps = len(valid_ids) // args.batch_size
# define model
model = sm.Unet(args.backbone, encoder_weights='imagenet')
optimizer = optimizers.Adam(lr=args.lr, decay=1e-4)
model.compile(
optimizer=optimizer,
# "Adam",
loss=sm.losses.bce_dice_loss, # sm.losses.bce_jaccard_loss, # sm.losses.binary_crossentropy,
metrics=[sm.metrics.iou_score],
)
#model.summary()
callbacks = [
EarlyStopping(patience=6, verbose=1),
ReduceLROnPlateau(factor=0.1, patience=3, min_lr=1e-7, verbose=1),
ModelCheckpoint('./weights.Epoch{epoch:02d}-Loss{loss:.3f}-VIou{val_iou_score:.3f}.h5', verbose=1,
monitor='val_accuracy', save_best_only=True, save_weights_only=True)
]
train_model(model=model, train_gen=train_gen, valid_gen=valid_gen, epochs=args.epochs)
When I run this code, some epochs progress well, but around epoch 20 a GPU memory overflow error occurs, like the one below:
(0) Resource exhausted: OOM when allocating tensor with shape[2,64,96,96] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[{{node decoder_stage2b_bn/FusedBatchNorm}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
So I think it is caused by the data generation.
This code generates batches in the following order.
In train.py, initialize the DataGenerator class, a keras.utils.Sequence implemented in Dataset.py:
train_gen = DataGenerator(train_ids, img_dir, mask_dir, img_size=args.resizing, batch_size=args.batch_size)
valid_gen = DataGenerator(valid_ids, img_dir, mask_dir, img_size=args.resizing, batch_size=args.batch_size)
At the start of the train_model function, convert the DataGenerator (Sequence) into a generator using iter_sequence_infinite:
train_gen_out = iter_sequence_infinite(train_gen)
valid_gen_out = iter_sequence_infinite(valid_gen)
Get each batch using the built-in next() function:
batch = next(train_gen_out)
I expected this to cause no memory problem, but it does occur.
What is the problem, and how can I solve it?
Thanks.
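For comparison, the more common way to consume keras.utils.Sequence objects is to hand them to fit_generator directly instead of wrapping them in an infinite generator and calling train_on_batch; a sketch (not the original training loop, and it drops the custom tqdm/plotting logic):
# Sketch: let Keras drive the Sequence objects itself; per-epoch shuffling and
# step counting are then handled by fit_generator.
model.fit_generator(
    train_gen,
    steps_per_epoch=len(train_gen),
    validation_data=valid_gen,
    validation_steps=len(valid_gen),
    epochs=args.epochs,
    callbacks=callbacks,
)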

My tensorboard events appear many charts I did not summary

I only write a summary for my loss as 'xentropy_mean' in training(), but in TensorBoard I cannot find the 'xentropy_mean' chart; instead there are many other charts I did not define. I don't know where I went wrong, or what is actually happening. Is it because I use threads in my code? If I shouldn't use threads, how should I write it?
The TensorBoard screenshot shows 6 charts under "queue"; I don't know what they mean either.
I create the model in the file below:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import tensorflow.python.platform
import tensorflow as tf
# The MNIST dataset has 10 classes, representing the digits 0 through 9.
NUM_CLASSES = 16
# The MNIST images are always 28x28 pixels.
IMAGE_SIZE = 28
IMAGE_PIXELS = 784
def inference(images, hidden1_units, hidden2_units):
"""Build the MNIST model up to where it may be used for inference.
Args:
images: Images placeholder, from inputs().
hidden1_units: Size of the first hidden layer.
hidden2_units: Size of the second hidden layer.
Returns:
softmax_linear: Output tensor with the computed logits.
"""
# Hidden 1
with tf.name_scope('hidden1'):
weights = tf.Variable(
tf.truncated_normal([IMAGE_PIXELS, hidden1_units],
stddev=1.0 / math.sqrt(float(IMAGE_PIXELS))),
name='weights')
biases = tf.Variable(tf.zeros([hidden1_units]),
name='biases')
hidden1 = tf.nn.relu(tf.matmul(images, weights) + biases)
# Hidden 2
with tf.name_scope('hidden2'):
weights = tf.Variable(
tf.truncated_normal([hidden1_units, hidden2_units],
stddev=1.0 / math.sqrt(float(hidden1_units))),
name='weights')
biases = tf.Variable(tf.zeros([hidden2_units]),
name='biases')
hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases)
# Linear
with tf.name_scope('softmax_linear'):
weights = tf.Variable(
tf.truncated_normal([hidden2_units, NUM_CLASSES],
stddev=1.0 / math.sqrt(float(hidden2_units))),
name='weights')
biases = tf.Variable(tf.zeros([NUM_CLASSES]),
name='biases')
logits = tf.matmul(hidden2, weights) + biases
return logits
def loss(logits, labels):
batch_size = tf.size(labels)
#print('batch size %d' %(batch_size))
labels = tf.expand_dims(labels, 1)
indices = tf.expand_dims(tf.range(0, batch_size), 1)
concated = tf.concat(1, [indices, labels])
#print('Done2')
onehot_labels = tf.sparse_to_dense(
concated, tf.pack([batch_size, 16]), 1.0, 0.0)
#print('Done1')
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits,
onehot_labels,
name='xentropy')
loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
tf.summary.scalar(loss.op.name, loss)
return loss
def training(loss, learning_rate):
optimizer=tf.train.GradientDescentOptimizer(learning_rate)
global_step=tf.Variable(0,name='global_step',trainable=False)
train_op = optimizer.minimize(loss, global_step=global_step)
return train_op
def evaluation(logits, labels):
correct = tf.nn.in_top_k(logits, labels, 1)
# Return the number of true entries.
return tf.reduce_sum(tf.cast(correct, tf.int32))
and train the model in this file:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os.path
import sys
import time
import numpy as np
import tensorflow as tf
import mnist
# Basic model parameters as external flags.
#FLAGS = None
# Constants used for dealing with the files, matches convert_to_records.
TRAIN_FILE = 'train.tfrecords'
VALIDATION_FILE = 'validation.tfrecords'
TEST_FILE='test.tfrecords'
flags = tf.app.flags
FLAGS = flags.FLAGS
#FLAGS = None
flags.DEFINE_string('train_dir', '/home/queenie/image2tfrecord/tfrecords-28-gray/', 'Directory to put the training data.')
flags.DEFINE_string('filename', 'train.tfrecords', 'Directory to put the training data.')
flags.DEFINE_integer('batch_size', 100, 'Batch size. '
'Must divide evenly into the dataset sizes.')
flags.DEFINE_integer('num_epochs', None, 'Batch size. '
'Must divide evenly into the dataset sizes.')
flags.DEFINE_integer('hidden1', 128,'balabala')
flags.DEFINE_integer('hidden2', 32,'balabala')
flags.DEFINE_integer('learning_rate', 0.01,'balabala')
flags.DEFINE_integer('max_steps', 50000,'balabala')
def placeholder_inputs(batch_size):
images_placeholder=tf.placeholder(tf.float32,shape=(batch_size,mnist.IMAGE_PIXELS))
labels_placeholder=tf.placeholder(tf.int32,shape=(batch_size))
return images_placeholder,labels_placeholder
def fill_feed_dict(images_feed,labels_feed,images_pl,labels_pl):
feed_dict={
images_pl:images_feed,
labels_pl:labels_feed,
}
return feed_dict
def read_and_decode(filename_queue):
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
serialized_example,
# Defaults are not specified since both keys are required.
features={
'image_raw': tf.FixedLenFeature([], tf.string),
'label': tf.FixedLenFeature([], tf.int64),
})
# Convert from a scalar string tensor (whose single string has
# length mnist.IMAGE_PIXELS) to a uint8 tensor with shape
# [mnist.IMAGE_PIXELS].
image = tf.decode_raw(features['image_raw'], tf.uint8)
image.set_shape([mnist.IMAGE_PIXELS])
# OPTIONAL: Could reshape into a 28x28 image and apply distortions
# here. Since we are not applying any distortions in this
# example, and the next step expects the image to be flattened
# into a vector, we don't bother.
# Convert from [0, 255] -> [-0.5, 0.5] floats.
image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
# Convert label from a scalar uint8 tensor to an int32 scalar.
label = tf.cast(features['label'], tf.int32)
return image, label
def do_eval(sess,eval_correct):
true_count=0
for step in xrange(FLAGS.batch_size):
#print(sess.run(eval_correct))
true_count+=sess.run(eval_correct)
precision=float(true_count)/FLAGS.batch_size/FLAGS.batch_size
print(' Num examples: %d Num correct: %d Precision # 1: %0.04f' %
(FLAGS.batch_size, true_count, precision))
return precision
def inputs(train, batch_size, num_epochs):
if not num_epochs: num_epochs = None
if train=='train':
filename=os.path.join(FLAGS.train_dir,TRAIN_FILE)
elif train=='validation':
filename=os.path.join(FLAGS.train_dir,VALIDATION_FILE)
else:
filename=os.path.join(FLAGS.train_dir,TEST_FILE)
# filename = os.path.join(FLAGS.train_dir,
# TRAIN_FILE if train else VALIDATION_FILE)
with tf.name_scope('input'):
filename_queue = tf.train.string_input_producer(
[filename], num_epochs=None)
# Even when reading in multiple threads, share the filename
# queue.
image, label = read_and_decode(filename_queue)
# Shuffle the examples and collect them into batch_size batches.
# (Internally uses a RandomShuffleQueue.)
# We run this in two threads to avoid being a bottleneck.
images, sparse_labels = tf.train.shuffle_batch(
[image, label], batch_size=batch_size, num_threads=2,
capacity=1000 + 3 * batch_size,
# Ensures a minimum amount of shuffling of examples.
min_after_dequeue=1000)
return images, sparse_labels
def run_training():
with tf.Graph().as_default():
# Build a Graph that computes predictions from the inference model.
images, labels = inputs(train='train', batch_size=FLAGS.batch_size,
num_epochs=FLAGS.num_epochs)
images_valid,labels_valid=inputs(train='validation', batch_size=FLAGS.batch_size,
num_epochs=FLAGS.num_epochs)
images_test,labels_test=inputs(train='test', batch_size=FLAGS.batch_size,
num_epochs=FLAGS.num_epochs)
logits = mnist.inference(images,
FLAGS.hidden1,
FLAGS.hidden2)
# Add to the Graph the loss calculation.
valid_prediction=mnist.inference(images_valid,FLAGS.hidden1,FLAGS.hidden2)
test_prediction=mnist.inference(images_test,FLAGS.hidden1,FLAGS.hidden2)
loss = mnist.loss(logits, labels)
# Add to the Graph operations that train the model.
train_op = mnist.training(loss, FLAGS.learning_rate)
eval_correct=mnist.evaluation(logits,labels)
eval_correct_valid=mnist.evaluation(valid_prediction,labels_valid)
eval_correct_test=mnist.evaluation(test_prediction,labels_test)
summary_op=tf.merge_all_summaries()
# The op for initializing the variables.
init_op = tf.group(tf.initialize_all_variables(),
tf.initialize_local_variables())
saver = tf.train.Saver()
# Create a session for running operations in the Graph.
sess = tf.Session()
# Initialize the variables (the trained variables and the
# epoch counter).
sess.run(init_op)
summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)
# Start input enqueue threads.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
try:
step = 0
train_precision=0
validation_precision=0
test_precision=0
#while not coord.should_stop():
while not coord.should_stop():
start_time = time.time()
_, loss_value,images_see,labels_see = sess.run([train_op, loss,images,labels])
#print('run done')
duration = time.time() - start_time
# Print an overview fairly often.
if step % 100 == 0:
print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
duration))
precision_tr=do_eval(sess,eval_correct)
summary_str=sess.run(summary_op)
summary_writer.add_summary(summary_str,step)
if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
checkpoint_file = os.path.join(FLAGS.train_dir, 'model.ckpt')
saver.save(sess, checkpoint_file, global_step=step)
print('Train:')
do_eval(sess,eval_correct)
print('Validation:')
do_eval(sess,eval_correct_valid)
print('Test:')
do_eval(sess,eval_correct_test)
step += 1
except tf.errors.OutOfRangeError:
print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
finally:
# When done, ask the threads to stop.
coord.request_stop()
# Wait for threads to finish.
coord.join(threads)
sess.close()
run_training()
Then I get a TensorBoard like this, with 6 charts about the queue (see the TensorBoard screenshot).
The queue charts you are seeing are created by default from shuffle_batch and friends, and can be used to monitor the performance of your input pipeline (you'll ideally want all the queues to stay at capacity, as that means your GPU isn't blocking on input reading).
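If the extra charts are unwanted, one option (a sketch reusing the same TF1-era calls already present in the question's code) is to run and write only the summary you care about instead of tf.merge_all_summaries():
# Sketch: keep a handle to just the loss summary; shuffle_batch adds its own
# queue summaries to the graph, and merge_all_summaries() picks those up too.
loss_summary = tf.summary.scalar('train_loss', loss)
summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)
# inside the training loop:
summary_str = sess.run(loss_summary)
summary_writer.add_summary(summary_str, step)
summary_writer.flush()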
I don't understand why your summary isn't showing up in TensorBoard. Can I get more information?