Tensorflow - synchronize readings from tfrecord

I'm a bit new to tensorflow and I'm trying to create an input pipeline based on a tfrecord file. Each entry in the file contains three fields: two strings with the paths to two image files, and one float tensor (the labels for the example). I'm able to write the information and read it back again; unfortunately, I have some trouble keeping images and labels synchronized.
To save the records I'm using this piece of code:
writer = tf.python_io.TFRecordWriter(output_tfrecord)
...
for index in shuffled_indexes:
    example = tf.train.Example(
        features=tf.train.Features(
            feature={
                'label': tf.train.Feature(float_list=tf.train.FloatList(value=target.ravel().tolist())),
                'image_1': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_1.encode()])),
                'image_2': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_2.encode()]))
            }
        )
    )
    writer.write(example.SerializeToString())
writer.close()
And to read it back again, this one (in this example I'm ignoring the field 'image_2' of each record):
def read_and_decode(filename, target_shape):
    # First construct a queue containing a list of filenames.
    # This lets a user split up their dataset in multiple files to keep
    # size down.
    filename_queue = tf.train.string_input_producer(filename, num_epochs=None)
    # Symbolic reader to read one example at a time.
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized_example,
        # Defaults are not specified since both keys are required.
        features={
            'label': tf.FixedLenFeature(target_shape, tf.float32),
            'image_1': tf.FixedLenFeature([], tf.string),
            'image_2': tf.FixedLenFeature([], tf.string)
        }
    )
    img_filename_queue = tf.train.string_input_producer([features['image_1']], shuffle=False)
    image_reader = tf.WholeFileReader()
    _, image_file = image_reader.read(img_filename_queue)
    image = tf.image.decode_jpeg(image_file, channels=3)
    with tf.control_dependencies([image]):
        label = features['label']
    return image, label
Each image/label pair is one example from my training set. If I try to run them in a single session, the results I get are not synchronized; e.g., in a toy example with just two records in the tfrecord file, image and label are exchanged: the first label comes with the second image and vice versa.
Example of my session code:
image, label = read_and_decode([outputfileName], result_shape)
with tf.Session() as sess:
    # Start the queue runners (input threads).
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    for i in range(2):
        img, trg = sess.run([image, label])
        ioUtils.visualizeLabel(img, trg)
    # When done, ask the threads to stop.
    coord.request_stop()
    # Wait for threads to finish.
    coord.join(threads)
Any advice on what I'm doing wrong?

OK, I figured it out. The problem was in
img_filename_queue = tf.train.string_input_producer([features['image_1']], shuffle=False)
The string_input_producer was interfering with the rest of the pipeline. The proper way to write read_and_decode is:
def read_and_decode_tfrecord(filename, target_shape):
    # First construct a queue containing a list of filenames.
    # This lets a user split up their dataset in multiple files to keep
    # size down.
    filename_queue = tf.train.string_input_producer(filename, num_epochs=None)
    # Symbolic reader to read one example at a time.
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized_example,
        # Defaults are not specified since both keys are required.
        features={
            'label': tf.FixedLenFeature(target_shape, tf.float32),
            'image_1': tf.FixedLenFeature([], tf.string),
            'image_2': tf.FixedLenFeature([], tf.string)
        }
    )
    # Read the file directly with tf.read_file instead of a second queue,
    # so the image stays tied to the record it came from.
    image_file = tf.read_file(features['image_1'])
    image = tf.image.decode_jpeg(image_file, channels=3)
    with tf.control_dependencies([image]):
        label = features['label']
    return image, label
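As a side note, the same pairing guarantee falls out of the tf.data API if your TensorFlow version has it. This is a minimal sketch reusing the feature keys above (image_2 is parsed but unused, as in the example):

def make_dataset(filenames, target_shape):
    def _parse(serialized):
        features = tf.parse_single_example(
            serialized,
            features={
                'label': tf.FixedLenFeature(target_shape, tf.float32),
                'image_1': tf.FixedLenFeature([], tf.string),
                'image_2': tf.FixedLenFeature([], tf.string)
            })
        # The file read and the decode happen inside the same map call,
        # so each image is produced from the very record its label came from.
        image = tf.image.decode_jpeg(tf.read_file(features['image_1']), channels=3)
        return image, features['label']
    return tf.data.TFRecordDataset(filenames).map(_parse)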

Related

Got a mismatch result between writing and reading tfrecord files

Here I use this function to write multiple tfrecord files:
writer = tf.python_io.TFRecordWriter(save)
for pth, lb in tqdm(zip(piece_p, piece_l)):
    # Mind that the path should be read into image data first,
    # to convert the BytesList data format into raw bytes.
    data = Image.open(pth)
    if resize is not None:
        data.thumbnail(resize, Image.ANTIALIAS)
    features = tf.train.Features(feature={
        'image': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[data.tobytes()])),
        'label': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[lb]))
    })
    example = tf.train.Example(features=features)
    # Serialize the constructed data before the writing step.
    writer.write(example.SerializeToString())
    sys.stdout.flush()
writer.close()
And I parse the binary file using the code below:
def parse_fn(serialized):
    features = {
        'image': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64)
    }
    parse_exp = tf.parse_single_example(serialized=serialized,
                                        features=features)
    labels = parse_exp['label']
    data = parse_exp['image']
    data = tf.decode_raw(data, tf.uint8)
    data = tf.cast(data, tf.float32)
    del parse_exp
    return data, labels

dataset = tf.data.Dataset.list_files(data_list, shuffle=True)
dataset = dataset.interleave(tf.data.TFRecordDataset,
                             cycle_length=file_num)
# dataset = tf.data.TFRecordDataset(data_list[0])
dataset = dataset.map(parse_fn, num_parallel_calls=4)
But why do the numbers of labels and data items always mismatch?
Every time I add the following code to make batches:
dataset = dataset.batch(12)
dataset = dataset.repeat(1)
iterator = dataset.make_initializable_iterator()
sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(iterator.initializer)
data, labels = iterator.get_next()
the label count always ends up half the data count. Is there something wrong with my argument settings? I'm pretty sure there is nothing wrong with my saving part and my reading part separately... but there is some problem when combining them together.
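One thing to check (a guess based only on the code shown): iterator.get_next() should be called exactly once to build the ops, and both tensors should be fetched in the same sess.run call. If data and labels are fetched in two separate run calls, each call advances the iterator, so the pairs drift apart and one stream appears to "lose" half its elements. A minimal sketch:

data_op, label_op = iterator.get_next()  # build once, outside the loop
with tf.Session() as sess:
    sess.run(iterator.initializer)
    while True:
        try:
            # One run call fetches both tensors from the same batch.
            data_batch, label_batch = sess.run([data_op, label_op])
            print(data_batch.shape, label_batch.shape)
        except tf.errors.OutOfRangeError:
            break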

Tensorflow While Body Not Executing

I have a FIFO queue reading from a tfrecords file in TensorFlow. Each record consists of an image and its annotation, that is, a set of features. I am trying to skip some images, that is, to not feed them into the graph, based on certain features. Therefore, I thought the best approach was a while loop: the loop tests the value of the specified feature and decides whether to proceed or not.
Kindly look at the following code:
import tensorflow as tf
import numpy as np

num_epoch = 100
tfrecords_filename_seq = ["C:/Users/user/PycharmProjects/AffectiveComputing/P16_db.tfrecords"]
filename_queue = tf.train.string_input_producer(tfrecords_filename_seq, num_epochs=num_epoch, shuffle=False, name='queue')
reader = tf.TFRecordReader()

current_image_confidence = tf.constant(0.0, dtype=tf.float32)

def body(i):
    key, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized_example,
        # Defaults are not specified since both keys are required.
        features={
            'height': tf.FixedLenFeature([], tf.int64),
            'width': tf.FixedLenFeature([], tf.int64),
            'image_raw': tf.FixedLenFeature([], tf.string),
            'annotation_raw': tf.FixedLenFeature([], tf.string)
        })
    # This is how we create one example, that is, extract one example from the database.
    image = tf.decode_raw(features['image_raw'], tf.uint8)
    height = tf.cast(features['height'], tf.int32)
    width = tf.cast(features['width'], tf.int32)
    # The image is reshaped since it is flattened when stored in binary format. Therefore, we
    # need the height and the width to restore the original image.
    image = tf.reshape(image, [height, width, 3])
    annotation = tf.cast(features['annotation_raw'], tf.string)
    t1 = tf.string_split([annotation], delimiter=',')
    t2 = tf.reshape(t1.values, [1, -1])
    t3 = tf.string_to_number(t2, out_type=tf.float32)
    t_ = tf.slice(t3, begin=[0, 3], size=[1, 1])
    # Note that t_ holds a value of 1.0 or 0.0. It is the particular feature I'm interested in.
    t_ = tf.Print(t_, data=[tf.shape(t_)], message='....')
    z = tf.cond(t_[0][0] < 1.0, lambda: tf.add(i, 0.0), lambda: tf.add(i, 1.0))
    return z

cond = lambda i: tf.equal(i, tf.constant(0.0, dtype=tf.float32))
loop = tf.while_loop(cond, body, [current_image_confidence])

init_op = tf.group(tf.local_variables_initializer(),
                   tf.global_variables_initializer())
with tf.Session() as sess:
    sess.run(init_op)
    sess.run(loop)
When I run this code, it seems that the body is never executed and the program is stuck in an infinite loop; the tf.Print(...) in the body never prints anything.
Why is this the case?
Any help is much appreciated!!
Your program is getting stuck because you're not starting the queue runners. Run tf.train.start_queue_runners() before running your loop op.
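In other words, a minimal sketch built on the session block from the question:

with tf.Session() as sess:
    sess.run(init_op)
    # Without queue runners nothing ever feeds filename_queue, so
    # reader.read() blocks forever and the while_loop cannot start.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    print(sess.run(loop))
    coord.request_stop()
    coord.join(threads)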

Multi Layer Tiff labelled dataset conversion to format that tensor flow can use for model optimisation

I'm a Python and TensorFlow newbie, and was wondering...
How best to convert a labelled dataset of multi-layer TIFFs into a format that TensorFlow can use for model optimisation / fine-tuning?
I currently have this code that puts each layer of a folder of multi-TIFFs into a 3D array, but I need to preserve the label or filename of the multi-TIFFs. I have seen some TensorFlow scripts that convert to TFRecords; however, I'm not sure whether these preserve the file name. How best would you go about this? It will be quite a big dataset.
Any help much appreciated
import os  # For file handling
from PIL import Image  # Import Pillow image processing library
import numpy

MultiTiffs = "MultiTiffs/"
for filename in os.listdir(MultiTiffs):
    # Imports Multi-Layer TIFF into 3D Numpy Array.
    img = Image.open(MultiTiffs + filename)
    imgArray = numpy.zeros((img.n_frames, img.size[1], img.size[0]), numpy.uint8)
    try:
        # for frames in range, img.n_frames for whole folder.
        for frame in range(2, img.n_frames):
            img.seek(frame)
            imgArray[frame, :, :] = img
    except EOFError:
        # Seek back to the first frame if we run past the end of the file.
        img.seek(0)
    print(imgArray.shape)  # imgArray is now 3D
    print(imgArray.size)
best wishes
TWP
Okay, so I figured it out using the thread from Daniil's blog:
http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/
However, my current implementation creates multiple TFRecords, and I think it needs to be a single TFRecord, so I'm trying to figure out how to make it one. How do I do that?
Then I can validate it using a TFRecord reading script that reads it back and checks it is in the right format for TensorFlow. I currently get errors when using the reading script.
from PIL import Image
import numpy as np
import tensorflow as tf
import os

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

path = 'test/'
output = 'output/'
fileList = [os.path.join(dirpath, f) for dirpath, dirnames, files in os.walk(path) for f in files if f.endswith('.tif')]
print(fileList)

for filename in fileList:
    basename = os.path.basename(filename)
    file_name = basename[:-4]
    print("processing file: ", filename)
    print(file_name)
    if not os.path.exists(output):
        os.mkdir(output)
    writer = tf.python_io.TFRecordWriter(output + file_name + '.tfrecord')
    img = Image.open(filename)
    imgArray = np.zeros((img.n_frames, img.size[1], img.size[0]), np.uint8)
    # Imports Multi-Layer file into 3D Numpy Array.
    try:
        for frame in range(0, img.n_frames):
            img.seek(frame)
            imgArray[frame, :, :] = img
    except EOFError:
        img.seek(0)
    print("print img size:", img.size)
    print("print image shape: ", imgArray.shape)
    print("print image size: ", imgArray.size)
    annotation = np.array(Image.open(filename))
    height = imgArray.shape[0]
    width = imgArray.shape[1]
    depth = imgArray.shape[2]
    img_raw = imgArray.tostring()
    annotation_raw = annotation.tostring()
    example = tf.train.Example(features=tf.train.Features(feature={
        'height': _int64_feature(height),
        'width': _int64_feature(width),
        'depth': _int64_feature(depth),  # for 3rd dimension
        'image_raw': _bytes_feature(img_raw),
        'mask_raw': _bytes_feature(annotation_raw)}))
    writer.write(example.SerializeToString())
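To get a single TFRecord instead of one per TIFF (the question asked above), one minimal change is to create the writer once before the loop and close it once after all files are processed. A sketch, where the combined file name 'combined.tfrecord' is just illustrative:

if not os.path.exists(output):
    os.mkdir(output)
# One writer shared across all files: every Example lands in the same .tfrecord.
writer = tf.python_io.TFRecordWriter(os.path.join(output, 'combined.tfrecord'))
for filename in fileList:
    img = Image.open(filename)
    imgArray = np.zeros((img.n_frames, img.size[1], img.size[0]), np.uint8)
    for frame in range(img.n_frames):
        img.seek(frame)
        imgArray[frame, :, :] = img
    example = tf.train.Example(features=tf.train.Features(feature={
        'height': _int64_feature(imgArray.shape[0]),
        'width': _int64_feature(imgArray.shape[1]),
        'depth': _int64_feature(imgArray.shape[2]),
        'image_raw': _bytes_feature(imgArray.tostring()),
        'mask_raw': _bytes_feature(np.array(Image.open(filename)).tostring())}))
    writer.write(example.SerializeToString())
writer.close()  # close once, after all records are written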
My current TFRecords Reading script
import tensorflow as tf
import os

def read_and_decode(filename_queue):
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized_example,
        # Defaults are not specified since both keys are required.
        features={
            'image_raw': tf.FixedLenFeature([], tf.string),
            'label': tf.FixedLenFeature([], tf.int64),
            'height': tf.FixedLenFeature([], tf.int64),
            'width': tf.FixedLenFeature([], tf.int64),
            'depth': tf.FixedLenFeature([], tf.int64)
        })
    image = tf.decode_raw(features['image_raw'], tf.uint8)
    label = tf.cast(features['label'], tf.int32)
    height = tf.cast(features['height'], tf.int32)
    width = tf.cast(features['width'], tf.int32)
    depth = tf.cast(features['depth'], tf.int32)
    return image, label, height, width, depth

with tf.Session() as sess:
    filename_queue = tf.train.string_input_producer(["output/A.3.1.tfrecord"])
    image, label, height, width, depth = read_and_decode(filename_queue)
    image = tf.reshape(image, tf.stack([height, width, 3]))
    image.set_shape([32, 32, 3])
    init_op = tf.initialize_all_variables()
    sess.run(init_op)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    for i in range(1000):
        example, l = sess.run([image, label])
        print(example, l)
    coord.request_stop()
    coord.join(threads)
I am receiving the error:
InvalidArgumentError (see above for traceback): Name: , Feature: label (data type: int64) is required but could not be found.
The images are grayscale and multi-page.
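The traceback itself points at a key mismatch between the two scripts: the writer stores features named 'height', 'width', 'depth', 'image_raw' and 'mask_raw', but the reader declares a required 'label' feature that was never written, so parsing fails on the first record. The parse dict has to request exactly the keys that exist in the file, e.g.:

features = tf.parse_single_example(
    serialized_example,
    features={
        # These keys must match what the writing script actually stored.
        'image_raw': tf.FixedLenFeature([], tf.string),
        'mask_raw': tf.FixedLenFeature([], tf.string),
        'height': tf.FixedLenFeature([], tf.int64),
        'width': tf.FixedLenFeature([], tf.int64),
        'depth': tf.FixedLenFeature([], tf.int64)
    })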

Using While loop with Queues in Tensorflow

I have a set of images which I'm going to feed into a graph in TensorFlow. Fetching the data is done through a FIFOQueue. The problem is that in some images the face is not detected, that is, the image does not contain a face. Therefore, I want to skip these images before feeding them into the graph. My code is as follows:
import tensorflow as tf
import numpy as np

num_epoch = 100
tfrecords_filename_seq = ["C:/Users/user/PycharmProjects/AffectiveComputing/P16_db.tfrecords"]
filename_queue = tf.train.string_input_producer(tfrecords_filename_seq, num_epochs=num_epoch, shuffle=False, name='queue')
reader = tf.TFRecordReader()

current_image_confidence = tf.Variable(tf.constant(0.0, dtype=tf.float32))
image = tf.Variable(tf.ones([112, 112, 3]), dtype=tf.float32)
annotation = tf.Variable('', dtype=tf.string)

def body():
    key, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized_example,
        # Defaults are not specified since both keys are required.
        features={
            'height': tf.FixedLenFeature([], tf.int64),
            'width': tf.FixedLenFeature([], tf.int64),
            'image_raw': tf.FixedLenFeature([], tf.string),
            'annotation_raw': tf.FixedLenFeature([], tf.string)
        })
    # This is how we create one example, that is, extract one example from the database.
    image_ = tf.decode_raw(features['image_raw'], tf.uint8)
    height = tf.cast(features['height'], tf.int32)
    width = tf.cast(features['width'], tf.int32)
    # The image is reshaped since it is flattened when stored in binary format. Therefore, we
    # need the height and the width to restore the original image.
    image.assign(tf.reshape(image_, [height, width, 3]))
    annotation.assign(tf.cast(features['annotation_raw'], tf.string))
    current_image_confidence.assign(tf.slice(tf.string_to_number(tf.string_split(annotation, delimiter=','),
                                                                 out_type=tf.float32),
                                             begin=[0, 3],
                                             size=[1, 1]))

def cond():
    tf.equal(current_image_confidence, tf.constant(0.0, dtype=tf.float32))

loop = tf.while_loop(cond, body, [current_image_confidence, reader, image, annotation])
Therefore, I need a while loop that runs until I get an image with a face; that is when I need to terminate the loop and send the image to the graph.
Please note that my data is stored in a tfrecord file, so each record contains one image and a set of features called an annotation, saved as a tf.string. The current_image_confidence variable is meant to hold 1.0 or 0.0 depending on whether a face is present.
How can I fix the code?
Any help is much appreciated!!
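No answer is posted for this one, but one workable direction (a sketch of my own, not an accepted solution) is to avoid the Variable.assign side effects: tf.while_loop does not run assigns made inside its body unless they are explicitly wired in. Instead, carry all state as explicit loop variables that both cond and body accept and return, with shape_invariants because the image shape can change between iterations:

def cond(confidence, image, annotation):
    # Keep looping while the current record has no face (confidence == 0).
    return tf.equal(confidence, tf.constant(0.0, dtype=tf.float32))

def body(confidence, image, annotation):
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized_example,
        features={
            'height': tf.FixedLenFeature([], tf.int64),
            'width': tf.FixedLenFeature([], tf.int64),
            'image_raw': tf.FixedLenFeature([], tf.string),
            'annotation_raw': tf.FixedLenFeature([], tf.string)
        })
    height = tf.cast(features['height'], tf.int32)
    width = tf.cast(features['width'], tf.int32)
    new_image = tf.reshape(tf.decode_raw(features['image_raw'], tf.uint8), [height, width, 3])
    new_annotation = features['annotation_raw']
    values = tf.string_to_number(tf.string_split([new_annotation], delimiter=',').values, out_type=tf.float32)
    new_confidence = values[3]  # the face-present flag
    return new_confidence, new_image, new_annotation

confidence0 = tf.constant(0.0, dtype=tf.float32)
image0 = tf.zeros([1, 1, 3], dtype=tf.uint8)
annotation0 = tf.constant('', dtype=tf.string)
_, face_image, face_annotation = tf.while_loop(
    cond, body, [confidence0, image0, annotation0],
    shape_invariants=[tf.TensorShape([]), tf.TensorShape([None, None, 3]), tf.TensorShape([])])

As in the previous question, the queue runners must be started before evaluating the loop, or reader.read will block forever.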

Tensorflow - Correct way to read data from single large txt file

I want to ask: what is the correct pattern for reading large text data in batches using TensorFlow?
Here is one line of the text data; there are billions of such lines in a single txt file:
target context label
Right now I am trying to use tfrecords, as recommended in the official documentation.
Here is my way:
filename_queue = tf.train.string_input_producer([self._train_data], num_epochs=self._num_epochs)
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
    serialized_example,
    # Defaults are not specified since both keys are required.
    features={
        'target': tf.FixedLenFeature([], tf.int64),
        'context': tf.FixedLenFeature([], tf.int64),
        'label': tf.FixedLenFeature([], tf.int64),
    })
target = features['target']
context = features['context']
label = features['label']
min_after_dequeue = 10000
capacity = min_after_dequeue + 3 * self._batch_size
target_batch, context_batch, label_batch = tf.train.shuffle_batch(
    [target, context, label], batch_size=self._batch_size, capacity=capacity,
    min_after_dequeue=min_after_dequeue, num_threads=self._concurrent_steps)
After that I used timeline to do profiling. The result shows that this part takes most of the time.
[profiling diagram: "the profiling result"]
By the way, I am using a batch size of 500.
Any suggestions?
It is often more efficient to apply tf.parse_example() to a batch of elements than to apply tf.parse_single_example() on each individual element, because the former op has an efficient multithreaded implementation that can be used when the input contains multiple examples. The following rewrite of your code should improve the performance:
filename_queue = tf.train.string_input_producer([self._train_data], num_epochs=self._num_epochs)
reader = tf.TFRecordReader()
# Read a batch of up to 128 examples at once.
_, serialized_examples = reader.read_up_to(filename_queue, 128)
features = tf.parse_example(
    serialized_examples,
    # Defaults are not specified since both keys are required.
    features={
        'target': tf.FixedLenFeature([], tf.int64),
        'context': tf.FixedLenFeature([], tf.int64),
        'label': tf.FixedLenFeature([], tf.int64),
    })
target = features['target']
context = features['context']
label = features['label']
min_after_dequeue = 10000
capacity = min_after_dequeue + 3 * self._batch_size
# Pass `enqueue_many=True` because the input is now a batch of parsed examples.
target_batch, context_batch, label_batch = tf.train.shuffle_batch(
    [target, context, label], batch_size=self._batch_size, capacity=capacity,
    min_after_dequeue=min_after_dequeue, num_threads=self._concurrent_steps,
    enqueue_many=True)