Reading multiple feature vectors from one TFRecord example in Tensorflow - tensorflow

I know how to store one feature per example inside a tfrecord file and then read it by using something like this:
import tensorflow as tf
import numpy as np
import os
# This is used to parse an example from tfrecords
def parse(serialized_example):
    """Decode one serialized ``tf.train.Example`` into a (feat, label) pair.

    Both entries are stored as raw byte strings; ``feat`` is decoded as
    float64 values and ``label`` as int64 values.
    """
    schema = {
        "label": tf.FixedLenFeature([], tf.string, default_value=""),
        "feat": tf.FixedLenFeature([], tf.string, default_value=""),
    }
    parsed = tf.parse_single_example(serialized_example, features=schema)
    feat_values = tf.decode_raw(parsed['feat'], tf.float64)
    label_values = tf.decode_raw(parsed['label'], tf.int64)
    return feat_values, label_values
################# Generate data
# Write `numdata` examples to data.tfrecords in the current directory. Each
# example holds one 2-element float64 feature vector and one scalar label,
# both serialized as raw bytes.
cwd = os.getcwd()
numdata = 10
with tf.python_io.TFRecordWriter(os.path.join(cwd, 'data.tfrecords')) as writer:
    for i in range(numdata):
        feat = np.random.randn(2)
        # Pin int64 explicitly: the default integer dtype is platform
        # dependent, and parse() decodes the label bytes as tf.int64.
        label = np.array(np.random.randint(0, 9), dtype=np.int64)
        featb = feat.tobytes()
        labelb = label.tobytes()
        # (The stray `import pudb.b` debugger breakpoint was removed here.)
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'feat': tf.train.Feature(bytes_list=tf.train.BytesList(value=[featb])),
                'label': tf.train.Feature(bytes_list=tf.train.BytesList(value=[labelb])),
            }))
        writer.write(example.SerializeToString())
        print('wrote f {}, l {}'.format(feat, label))
print('Done writing! Start reading and printing data')
################# Read data
# Read the records back, decode them with parse(), and print every batch.
filename = ['data.tfrecords']
dataset = tf.data.TFRecordDataset(filename).map(parse)
dataset = dataset.batch(100)
iterator = dataset.make_initializable_iterator()
feat, label = iterator.get_next()
with tf.Session() as sess:
    sess.run(iterator.initializer)
    try:
        while True:
            example = sess.run((feat, label))
            # Function-call form works on both Python 2 and Python 3.
            print(example)
    except tf.errors.OutOfRangeError:
        # Raised once the dataset is exhausted; this ends the read loop.
        pass
What do I do in the case where each example has multiple feature vectors + labels in it. For example, in the above code, if feat was stored as a 2D array. I still want to do the same thing as before, which is to train a DNN with one feature per label, but each example in the tfrecords file has multiple features and multiple labels. This should be simple but I'm having trouble unpacking multiple features in tensorflow using tfrecords.

First, note that np.ndarray.tobytes() serializes a multi-dimensional array as one flat sequence of bytes, so the original shape is not stored, i.e.
# N (the number of rows) is not defined in this snippet; set it before running.
feat = np.random.randn(N, 2)
reshaped = np.reshape(feat, (N*2,))
# The 2-D array and its 1-D reshape serialize to the same bytes.
feat.tobytes() == reshaped.tobytes() ## True
So, if you have an N×2 array that is saved as bytes in TFRecord format, you have to reshape it back to (N, 2) after parsing.
If you do that, you can unbatch the elements of a tf.data.Dataset so that each iteration gives you one feature and one label. Your code should be as follows:
# This is used to parse an example from tfrecords
def parse(serialized_example):
    """Decode one serialized example into an (N, 2) feature array and N labels.

    ``N`` is read from the enclosing scope; it is not defined in this snippet.
    """
    schema = {
        "label": tf.FixedLenFeature([], tf.string, default_value=""),
        "feat": tf.FixedLenFeature([], tf.string, default_value=""),
    }
    parsed = tf.parse_single_example(serialized_example, features=schema)
    flat_feat = tf.decode_raw(parsed['feat'], tf.float64)  # shape (N*2,)
    feat = tf.reshape(flat_feat, (N, 2))                   # shape (N, 2)
    label = tf.decode_raw(parsed['label'], tf.int64)       # shape (N,)
    return feat, label
################# Generate data
# Write `numdata` examples; each one packs N feature vectors of length 2 and
# N labels as raw bytes. N must be defined before this script runs.
cwd = os.getcwd()
numdata = 10
with tf.python_io.TFRecordWriter(os.path.join(cwd, 'data.tfrecords')) as writer:
    for i in range(numdata):
        feat = np.random.randn(N, 2)
        # Pin int64 explicitly: randint's default integer dtype is platform
        # dependent, and parse() decodes the label bytes as tf.int64.
        label = np.array(np.random.randint(0, 9, N, dtype=np.int64))
        featb = feat.tobytes()
        labelb = label.tobytes()
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'feat': tf.train.Feature(bytes_list=tf.train.BytesList(value=[featb])),
                'label': tf.train.Feature(bytes_list=tf.train.BytesList(value=[labelb])),
            }))
        writer.write(example.SerializeToString())
        print('wrote f {}, l {}'.format(feat, label))
print('Done writing! Start reading and printing data')
################# Read data
filename = ['data.tfrecords']
# map(parse) yields one (N, 2) feature array and N labels per record;
# unbatch() is then applied so each iteration gives one feature and one label.
dataset = tf.data.TFRecordDataset(filename).map(parse).apply(tf.contrib.data.unbatch())
... etc

Related

How to shuffle/preprocess multiple TFRecords file?

I have already split large TFRecord files into multiple smaller ones as it keeps crashing whenever I train it. From what I read, I know that tf.data could help. But I have no idea on how to shuffle the TFRecord in other formats and convert it back to TFRecord.
import tensorflow as tf
# Split test.record into numbered files of ITEMS_PER_FILE records each.
ITEMS_PER_FILE = 10
raw_dataset = tf.data.TFRecordDataset('test.record')
batch_idx = 0
# Use the named constant instead of repeating the literal 10.
for batch in raw_dataset.batch(ITEMS_PER_FILE):
    print(raw_dataset)
    # Turn the batch tensor back into a dataset of individual records.
    batch_ds = tf.data.Dataset.from_tensor_slices(batch)
    filename = f'test-{batch_idx:03d}.record'
    writer = tf.data.experimental.TFRecordWriter(filename)
    writer.write(batch_ds)
    batch_idx += 1
    print(len(batch))
This is how I split the TFRecords.
def read_tfrecord(tfrecord, epochs, batch_size):
    """Build a shuffled, batched dataset of (image, label) pairs.

    Args:
        tfrecord: A TFRecord file path or glob pattern (e.g. ``train-*.record``).
        epochs: Number of times the batched dataset is repeated.
        batch_size: Examples per batch; an incomplete final batch is dropped.

    Returns:
        A ``tf.data.Dataset`` yielding ``(image, label)`` batches.
    """
    # TFRecordDataset takes file names and does not expand wildcards, so the
    # pattern is resolved first. shuffle=False keeps the file order stable.
    files = tf.data.Dataset.list_files(tfrecord, shuffle=False)
    dataset = tf.data.TFRecordDataset(files)

    def parse(record):
        features = {
            "image": tf.io.FixedLenFeature([], tf.string),
            "target": tf.io.FixedLenFeature([], tf.int64)
        }
        example = tf.io.parse_single_example(record, features)
        # NOTE(review): `decode` is not defined anywhere in the visible
        # source; confirm which image decoder was intended here.
        image = decode.image(example["image"])
        # tf.cast requires a target dtype; the original call omitted it.
        # NOTE(review): int32 is assumed as the label dtype - confirm.
        label = tf.cast(example["target"], tf.int32)
        return image, label

    dataset = dataset.map(parse)
    dataset = dataset.shuffle(buffer_size=500)
    dataset = dataset.prefetch(buffer_size=batch_size)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.repeat(epochs)
    return dataset
# The path contains a wildcard (train-*.record); epochs and batch_size are both 1.
dataset = read_tfrecord(tfrecord='/content/gdrive/MyDrive/Tensorflow/workspace/annotations/train-*.record',epochs = 1, batch_size=1)
# Pull the first (image, label) batch from the dataset.
iterator = iter(dataset)
x,y = next(iterator)
I was trying to preprocess the TFRecords.
python Tensorflow/models/research/object_detection/model_main_tf2.py --model_dir=Tensorflow/workspace/models/my_ssd_mobilenet_1 --pipeline_config_path=Tensorflow/workspace/models/my_ssd_mobilenet_1/pipeline.config --num_train_steps=100
Is it possible to obtain the output of the preprocessing in the form of TFRecords? As I intended to use the above as the code to generate the training. Hence, I will just key in the path to TFRecords in the pipeline.config.
Thank you and sorry I'm a beginner

Got a mismatch result between writing and reading tfrecord files

Here I use this func to write multiple tfrecord files:
# Serialize every (image path, label) pair into the TFRecord file `save`.
# The context manager closes the writer even if an image fails to load.
with tf.python_io.TFRecordWriter(save) as writer:
    for pth, lb in tqdm(zip(piece_p, piece_l)):
        # mind that the path should be read into image data first
        # to convert the byteslist data format into raw bytes
        data = Image.open(pth)
        if resize is not None:
            # NOTE(review): thumbnail() presumably keeps the aspect ratio, so
            # the stored byte length may differ between images - confirm.
            data.thumbnail(resize, Image.ANTIALIAS)
        features = tf.train.Features(feature={
            'image': tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[data.tobytes()])),
            'label': tf.train.Feature(
                int64_list=tf.train.Int64List(value=[lb]))
        })
        example = tf.train.Example(features=features)
        # serialize the constructed data format before writing step
        writer.write(example.SerializeToString())
        sys.stdout.flush()
And parse the binary file using code as below:
def parse_fn(serialized):
    """Parse one serialized example into (image values as float32, label).

    The image bytes are decoded as uint8 and then cast to float32; the label
    is returned as parsed (int64).
    """
    spec = {
        'image': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64),
    }
    parsed = tf.parse_single_example(serialized=serialized, features=spec)
    labels = parsed['label']
    pixels = tf.cast(tf.decode_raw(parsed['image'], tf.uint8), tf.float32)
    return pixels, labels
# List the record files in shuffled order.
dataset = tf.data.Dataset.list_files(data_list, shuffle=True)
# Open each listed file as a TFRecordDataset, with cycle_length set to file_num.
dataset = dataset.interleave(tf.data.TFRecordDataset,
cycle_length=file_num)
# dataset = tf.data.TFRecordDataset(data_list[0])
# Decode every record with parse_fn, using 4 parallel calls.
dataset = dataset.map(parse_fn, num_parallel_calls=4)
But why is the number of labels and data always mismatching...?
every time when adding the following code to make multiple batches or sth
# Group elements into batches of 12; repeat(1) makes a single pass.
dataset = dataset.batch(12)
dataset = dataset.repeat(1)
iterator = dataset.make_initializable_iterator()
sess = tf.Session()
sess.run(tf.global_variables_initializer())
# Initialize the iterator, then build the op that yields one (data, labels) batch.
sess.run(iterator.initializer)
data, labels = iterator.get_next()
and the number of labels always ends up being half the number of data items. Is something wrong with my argument settings? I'm pretty sure the saving part and the reading part each work correctly on their own, but something goes wrong when I combine them.

Tensorflow While Body Not Executing

I have a FIFO queue reading from a tfrecords file in TensorFlow. Each record consists of an image and its annotation, that is, a set of features. I am trying to skip some images (i.e. not feed them into the graph, or not view them) depending on the value of a particular feature. I therefore thought the best approach was to use a while loop that tests the value of the specified feature and decides whether to proceed or not.
Kindly look at the following code:
import tensorflow as tf
import numpy as np
# Number of epochs passed to the filename queue below.
num_epoch = 100
tfrecords_filename_seq = ["C:/Users/user/PycharmProjects/AffectiveComputing/P16_db.tfrecords"]
# Queue of input file names, produced without shuffling.
filename_queue = tf.train.string_input_producer(tfrecords_filename_seq, num_epochs=num_epoch, shuffle=False, name='queue')
reader = tf.TFRecordReader()
# Initial value of the while_loop variable.
current_image_confidence = tf.constant(0.0, dtype=tf.float32)
def body(i):
    """Read one record from the queue and update ``i`` from its annotation.

    Returns ``i + 0.0`` when the fourth comma-separated annotation value is
    below 1.0, and ``i + 1.0`` otherwise.
    """
    _, serialized = reader.read(filename_queue)
    # No defaults are given, so every key below is required.
    spec = {
        'height': tf.FixedLenFeature([], tf.int64),
        'width': tf.FixedLenFeature([], tf.int64),
        'image_raw': tf.FixedLenFeature([], tf.string),
        'annotation_raw': tf.FixedLenFeature([], tf.string)
    }
    parsed = tf.parse_single_example(serialized, features=spec)
    pixels = tf.decode_raw(parsed['image_raw'], tf.uint8)
    height = tf.cast(parsed['height'], tf.int32)
    width = tf.cast(parsed['width'], tf.int32)
    # The image was flattened when stored as bytes, so the stored height and
    # width are used to restore its [height, width, 3] shape.
    image = tf.reshape(pixels, [height, width, 3])
    annotation = tf.cast(parsed['annotation_raw'], tf.string)
    # Split the comma-separated annotation and convert it to one float row.
    tokens = tf.string_split([annotation], delimiter=',')
    row = tf.reshape(tokens.values, [1, -1])
    numbers = tf.string_to_number(row, out_type=tf.float32)
    # Column 3 holds the feature of interest (1.0 or 0.0).
    flag = tf.slice(numbers, begin=[0, 3], size=[1, 1])
    flag = tf.Print(flag, data=[tf.shape(flag)], message='....')
    return tf.cond(flag[0][0] < 1.0, lambda: tf.add(i, 0.0), lambda: tf.add(i, 1.0))
# Keep looping while the loop variable is still equal to 0.0.
cond = lambda i: tf.equal(i, tf.constant(0.0, dtype=tf.float32))
loop = tf.while_loop(cond, body, [current_image_confidence])
init_op = tf.group(tf.local_variables_initializer(),
tf.global_variables_initializer())
with tf.Session() as sess:
sess.run(init_op)
# NOTE(review): no queue runners are started before this call, although body() reads from filename_queue.
sess.run(loop)
Finally, when trying to run the following code, it seems that the body is not executing and hence stuck in an infinite loop. And the tf.Print(...) in the body was not executed.
Why this is the case?
Any help is much appreciated!!
Your program is getting stuck because you're not starting the queue runners. Run tf.train.start_queue_runners() before running your loop op.

Multi Layer Tiff labelled dataset conversion to format that tensor flow can use for model optimisation

I'm a Python and Tensor Flow newbie, and was wondering...
How best to convert a labelled dataset of Multi-Layer Tiffs into a format that Tensor Flow can use for model optimisation / fine tuning ?
I currently have this code that puts each layer of a folder of Multi-Tiffs into a 3D Array, but i need to preserve the label or filename of the Multi-Tiffs. I have seen some tensor flow scripts to convert to TFRecords, however, I'm not sure if these preserve the file name ? How best would you go about this ? It will be quite a big dataset.
Any help much appreciated
import os # For file handling
from PIL import Image# Import Pillow image processing library
import numpy
# Load every multi-layer TIFF in the folder into a 3D array, one layer per frame.
CroppedMultiTiffs = "MultiTiffs/"
# The original loop referenced an undefined name `MultiTiffs`; the folder
# variable defined above is used instead.
for filename in os.listdir(CroppedMultiTiffs):
    ## Imports Multi-Layer TIFF into 3D Numpy Array.
    img = Image.open(CroppedMultiTiffs + filename)
    # Array layout is (frame, img.size[1], img.size[0]).
    imgArray = numpy.zeros((img.n_frames, img.size[1], img.size[0]), numpy.uint8)
    try:
        # Starts at frame 2, so frames 0 and 1 stay zero-filled.
        # NOTE(review): use range(img.n_frames) if every frame is wanted.
        for frame in range(2, img.n_frames):
            img.seek(frame)
            imgArray[frame, :, :] = img
    except EOFError:
        # Rewind to the first frame if seeking ran past the end of the file.
        img.seek(0)
    print(imgArray.shape)  # imgArray is now 3D
    print(imgArray.size)
best wishes
TWP
okay, so I figured it out using the thread from Daniils blog
http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/
However, my current implementation creates multiple TFRecord files, and I think it needs to be a single TFRecord file, so I am trying to figure out how to write everything into one. How do I do that?
Then I can validate it using a TFRecord Reading script to read it back and check it is in the right format for Tensor Flow. I currently get errors using the reading script.
from PIL import Image
import numpy as np
import tensorflow as tf
import os
def _bytes_feature(value):
    """Wrap a single bytes value in a ``tf.train.Feature``."""
    wrapped = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=wrapped)
def _int64_feature(value):
    """Wrap a single integer value in a ``tf.train.Feature``."""
    wrapped = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=wrapped)
# Convert every .tif file under `path` into its own .tfrecord file in `output`.
path = 'test/'
output = 'output/'
fileList = [os.path.join(dirpath, f) for dirpath, dirnames, files in os.walk(path) for f in files if f.endswith('.tif')]
print (fileList)
for filename in fileList:
    basename = os.path.basename(filename)
    file_name = basename[:-4]  # strip the 4-character '.tif' suffix
    print ("processing file: " , filename)
    print (file_name)
    if not os.path.exists(output):
        os.mkdir(output)
    img = Image.open(filename)
    # Array layout is (frame, img.size[1], img.size[0]).
    imgArray = np.zeros((img.n_frames, img.size[1], img.size[0]), np.uint8)
    ## Imports Multi-Layer file into 3D Numpy Array.
    try:
        for frame in range(0, img.n_frames):
            img.seek(frame)
            imgArray[frame, :, :] = img
    except EOFError:
        # Rewind to the first frame if seeking ran past the end of the file.
        img.seek(0)
    print ("print img size:" , img.size)
    print ("print image shape: " , imgArray.shape)
    print ("print image size: " , imgArray.size)
    annotation = np.array(Image.open(filename))
    # Axis 0 is the frame count, so it is the depth; the original stored it
    # as 'height' and shifted the other two dimensions accordingly.
    depth, height, width = imgArray.shape
    # tobytes() replaces the deprecated tostring(); the bytes are the same.
    img_raw = imgArray.tobytes()
    annotation_raw = annotation.tobytes()
    example = tf.train.Example(features=tf.train.Features(feature={
        'height': _int64_feature(height),
        'width': _int64_feature(width),
        'depth': _int64_feature(depth),  # for 3rd dimension
        'image_raw': _bytes_feature(img_raw),
        'mask_raw': _bytes_feature(annotation_raw)}))
    # The context manager closes the file; the original never closed the
    # writer, so the record was not guaranteed to be flushed to disk.
    with tf.python_io.TFRecordWriter(output + file_name + '.tfrecord') as writer:
        writer.write(example.SerializeToString())
My current TFRecords Reading script
import tensorflow as tf
import os
def read_and_decode(filename_queue):
    """Read one record from ``filename_queue`` and decode its fields.

    All five keys are required; no defaults are supplied.

    Returns:
        ``(image, label, height, width, depth)``: the image bytes decoded as
        uint8 values, and the four integer fields cast to int32.
    """
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    spec = {
        'image_raw': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64),
        'height': tf.FixedLenFeature([], tf.int64),
        'width': tf.FixedLenFeature([], tf.int64),
        'depth': tf.FixedLenFeature([], tf.int64)
    }
    parsed = tf.parse_single_example(serialized_example, features=spec)
    image = tf.decode_raw(parsed['image_raw'], tf.uint8)
    label, height, width, depth = [
        tf.cast(parsed[key], tf.int32)
        for key in ('label', 'height', 'width', 'depth')
    ]
    return image, label, height, width, depth
with tf.Session() as sess:
# Single-file input queue feeding the reader.
filename_queue = tf.train.string_input_producer(["output/A.3.1.tfrecord"])
image, label, height, width, depth = read_and_decode(filename_queue)
# NOTE(review): the last dimension is hard-coded to 3 and the parsed depth is not used here - confirm this matches the stored data.
image = tf.reshape(image, tf.stack([height, width, 3]))
image.set_shape([32,32,3])
init_op = tf.initialize_all_variables()
sess.run(init_op)
# Start the queue-runner threads under a coordinator.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
# Fetch and print 1000 (image, label) pairs.
for i in range(1000):
example, l = sess.run([image, label])
print (example,l)
# Ask the runner threads to stop and wait for them to finish.
coord.request_stop()
coord.join(threads)
receiving the error:-
InvalidArgumentError (see above for traceback): Name: , Feature: label (data type: int64) is required but could not be found.
Images are grayscale multi-page

How to set a number for epoch in tf.python_io.tf_record_iterator

I was trying to iterate over my data set several times. I used a tf.python_io.tf_record_iterator. But, I used it as follows:
# The iterator is created once, before both loops.
record_iterator = tf.python_io.tf_record_iterator(path=tfrecords_filename)
for z in range(4):
# The first pass consumes record_iterator, so later passes of the outer loop have nothing left to iterate.
for k, string_record in enumerate(record_iterator):
....
Hence, the outer loop has no effect, and iteration finished just after the inner loop was done iterating over the dataset.
Any help is much appreciated!!
Finally, the new tensorflow Dataset api encoded this functionality. The full documentation is found at: https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset.
Long story short, this new api will enable the end user to iterate over his database multiple times using a for loop, or using the repeat() from the Dataset class.
Here is complete code on how I have used this API:
import tensorflow as tf
import numpy as np
import time
import cv2
# Run settings and the nine training TFRecord files (train_1 ... train_9).
num_epoch = 2
batch_size = 8
num_threads = 9
common = "C:/Users/user/PycharmProjects/AffectiveComputingNew/database/"
filenames = [common + "train_" + str(index) + "_db.tfrecords" for index in range(1, 10)]
def _parse_function(example_proto):
    """Parse a scalar string ``example_proto`` into a (features, image) pair.

    ``features`` is the 432-element float32 vector stored in the record, and
    ``image`` is the uint8 image reshaped to [height, width, 3].
    """
    spec = {
        'height': tf.FixedLenFeature([], tf.int64),
        'width': tf.FixedLenFeature([], tf.int64),
        'image_raw': tf.FixedLenFeature([], tf.string),
        'features': tf.FixedLenFeature([432], tf.float32)
    }
    parsed = tf.parse_single_example(example_proto, spec)
    pixels = tf.decode_raw(parsed['image_raw'], tf.uint8)
    height = tf.cast(parsed['height'], tf.int32)
    width = tf.cast(parsed['width'], tf.int32)
    # The image was flattened when stored as bytes, so the stored height and
    # width are needed to restore its original shape.
    image = tf.reshape(pixels, [height, width, 3])
    return parsed['features'], image
# Variables sized for one full batch: 72 feature rows and 72 images.
random_features = tf.Variable(tf.zeros([72, 432], tf.float32))
random_images = tf.Variable(tf.zeros([72, 112, 112, 3], tf.uint8))
# One parsed dataset per TFRecord file.
datasets = [tf.contrib.data.TFRecordDataset(name).map(_parse_function) for name in filenames]
# Zip the first nine datasets so each element holds one record from each file.
dataset_ziped = tf.contrib.data.TFRecordDataset.zip(tuple(datasets[:9]))
dataset = dataset_ziped.batch(batch_size)
iterator = dataset.make_initializable_iterator()
next_batch = iterator.get_next()  # This has shape: [9, 2]
# Concatenate the per-file batches along axis 0: index 0 is the features,
# index 1 is the images.
features = tf.concat(tuple(next_batch[k][0] for k in range(9)), axis=0)
images = tf.concat(tuple(next_batch[k][1] for k in range(9)), axis=0)
def get_features(features, images):
    """Store the batch in the buffer variables and interleave its rows.

    The 72 rows arrive as 9 groups of 8. They are reordered so that
    consecutive rows cycle through the 9 groups (first frame of the first
    video, first frame of the second video, and so on).
    """
    store_ops = [tf.assign(random_features, features), tf.assign(random_images, images)]
    with tf.control_dependencies(store_ops):
        # [72, 432] -> [9, 8, 432] -> [8, 9, 432] -> [72, 432], where 8 * 9 = 72
        grouped_features = tf.reshape(features, shape=[9, 8, 432])
        swapped_features = tf.transpose(grouped_features, perm=[1, 0, 2])
        features = tf.reshape(swapped_features, shape=[72, 432])
        # Same reordering applied to the images.
        grouped_images = tf.reshape(images, shape=[9, 8, 112, 112, 3])
        swapped_images = tf.transpose(grouped_images, perm=[1, 0, 2, 3, 4])
        images = tf.reshape(swapped_images, shape=[72, 112, 112, 3])
        return features, images
# True only when both concatenated tensors hold batch_size * 9 rows.
condition1 = tf.equal(tf.shape(features)[0], batch_size * 9)
condition2 = tf.equal(tf.shape(images)[0], batch_size * 9)
condition = tf.logical_and(condition1, condition2)
# Full batch: reorder the fresh tensors; otherwise reorder the values held in random_features / random_images.
features, images = tf.cond(condition,
lambda: get_features(features, images),
lambda: get_features(random_features, random_images))
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init_op)
    for _ in range(num_epoch):
        # Re-initializing the iterator restarts the dataset for this epoch.
        sess.run(iterator.initializer)
        # Keep fetching batches until the dataset reports it is exhausted.
        while True:
            try:
                features_np, images_np = sess.run([features, images])
            except tf.errors.OutOfRangeError:
                print('errorrrrr')
                break
            # Print the first feature value of every row in the batch.
            lst = [row[0] for row in features_np]
            print(lst)
One more thing: the last retrieved batch could be truncated, which would be a problem (notice the reshape operations I apply to features). Therefore, I use a temporary variable that is set to the current batch whenever the batch size equals batch_size * 9. (This is not important for now.)