How to shuffle/preprocess multiple TFRecord files? - tensorflow

I have already split my large TFRecord files into multiple smaller ones, as training keeps crashing whenever I use them. From what I read, I know that tf.data could help. But I have no idea how to shuffle the TFRecords in some other format and then convert them back to TFRecord.
import tensorflow as tf

ITEMS_PER_FILE = 10
raw_dataset = tf.data.TFRecordDataset('test.record')

batch_idx = 0
for batch in raw_dataset.batch(ITEMS_PER_FILE):
    print(len(batch))
    batch_ds = tf.data.Dataset.from_tensor_slices(batch)
    filename = f'test-{batch_idx:03d}.record'
    writer = tf.data.experimental.TFRecordWriter(filename)
    writer.write(batch_ds)
    batch_idx += 1
This is how I split the TFRecords.
def read_tfrecord(tfrecord, epochs, batch_size):
    # TFRecordDataset does not expand glob patterns; resolve them first
    filenames = tf.io.gfile.glob(tfrecord)
    dataset = tf.data.TFRecordDataset(filenames)

    def parse(record):
        features = {
            "image": tf.io.FixedLenFeature([], tf.string),
            "target": tf.io.FixedLenFeature([], tf.int64)
        }
        example = tf.io.parse_single_example(record, features)
        image = tf.io.decode_image(example["image"])  # assumes encoded image bytes
        label = tf.cast(example["target"], tf.int32)
        return image, label

    dataset = dataset.map(parse)
    dataset = dataset.shuffle(buffer_size=500)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.repeat(epochs)
    dataset = dataset.prefetch(buffer_size=batch_size)  # prefetch last, after batching
    return dataset
dataset = read_tfrecord(
    tfrecord='/content/gdrive/MyDrive/Tensorflow/workspace/annotations/train-*.record',
    epochs=1,
    batch_size=1)
iterator = iter(dataset)
x, y = next(iterator)
I was trying to preprocess the TFRecords.
python Tensorflow/models/research/object_detection/model_main_tf2.py --model_dir=Tensorflow/workspace/models/my_ssd_mobilenet_1 --pipeline_config_path=Tensorflow/workspace/models/my_ssd_mobilenet_1/pipeline.config --num_train_steps=100
Is it possible to obtain the output of the preprocessing in the form of TFRecords? I intend to use the command above to run the training, so I would just point the TFRecord paths in pipeline.config at the preprocessed files.
Thank you, and sorry, I'm a beginner.
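Yes, this is possible: the elements of a TFRecordDataset are still serialized tf.train.Example byte strings, so you can shuffle them and write them straight back out as new shards, then point pipeline.config at those files. A minimal TF 2.x sketch (the glob path comes from the question; the buffer and shard sizes are illustrative):
import tensorflow as tf

ITEMS_PER_FILE = 10
# The records stay serialized end to end; no parsing or re-encoding needed.
filenames = tf.io.gfile.glob(
    '/content/gdrive/MyDrive/Tensorflow/workspace/annotations/train-*.record')
shuffled = tf.data.TFRecordDataset(filenames).shuffle(buffer_size=500)

for shard_idx, shard in enumerate(shuffled.batch(ITEMS_PER_FILE)):
    with tf.io.TFRecordWriter(f'train-shuffled-{shard_idx:03d}.record') as writer:
        for record in shard:
            writer.write(record.numpy())  # write the serialized Example bytes as-is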

Related

Got mismatched results between writing and reading TFRecord files

Here I use this function to write multiple TFRecord files:
writer = tf.python_io.TFRecordWriter(save)
for pth, lb in tqdm(zip(piece_p, piece_l)):
    # mind that the path should be read into image data first
    # to convert the byteslist data format into raw bytes
    data = Image.open(pth)
    if resize is not None:
        data.thumbnail(resize, Image.ANTIALIAS)
    features = tf.train.Features(feature={
        'image': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[data.tobytes()])),
        'label': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[lb]))
    })
    example = tf.train.Example(features=features)
    # serialize the constructed data format before writing step
    writer.write(example.SerializeToString())
    sys.stdout.flush()
writer.close()
And I parse the binary files using the code below:
def parse_fn(serialized):
    features = {
        'image': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64)
    }
    parse_exp = tf.parse_single_example(serialized=serialized,
                                        features=features)
    labels = parse_exp['label']
    data = parse_exp['image']
    data = tf.decode_raw(data, tf.uint8)
    data = tf.cast(data, tf.float32)
    return data, labels

dataset = tf.data.Dataset.list_files(data_list, shuffle=True)
dataset = dataset.interleave(tf.data.TFRecordDataset,
                             cycle_length=file_num)
# dataset = tf.data.TFRecordDataset(data_list[0])
dataset = dataset.map(parse_fn, num_parallel_calls=4)
But why do the number of labels and the amount of data always mismatch every time I add the following code to make batches?
dataset = dataset.batch(12)
dataset = dataset.repeat(1)
iterator = dataset.make_initializable_iterator()
sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(iterator.initializer)
data, labels = iterator.get_next()
and the number of labels always ends up being half the amount of data. Is there something wrong with my argument settings? I'm pretty sure there is nothing wrong with my saving and reading parts separately, but something goes wrong when I combine them.
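One guess, since the snippet does not show how data and labels are actually evaluated: if you run sess.run(data) and sess.run(labels) as two separate calls, each call advances the iterator, so the labels come from a different batch than the data. A TF1-style sanity check that fetches both in a single sess.run and counts what the combined pipeline produces:
# Reuses parse_fn, data_list and file_num from the question above.
dataset = tf.data.Dataset.list_files(data_list, shuffle=True)
dataset = dataset.interleave(tf.data.TFRecordDataset, cycle_length=file_num)
dataset = dataset.map(parse_fn, num_parallel_calls=4)
dataset = dataset.batch(12)
iterator = dataset.make_initializable_iterator()
next_batch = iterator.get_next()

total = 0
with tf.Session() as sess:
    sess.run(iterator.initializer)
    try:
        while True:
            data, labels = sess.run(next_batch)  # one run call, one batch
            assert len(data) == len(labels)      # batch dims agree when fetched together
            total += len(labels)
    except tf.errors.OutOfRangeError:
        pass
print('total examples:', total)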

Reading multiple feature vectors from one TFRecord example in Tensorflow

I know how to store one feature per example inside a tfrecord file and then read it by using something like this:
import tensorflow as tf
import numpy as np
import os

# This is used to parse an example from tfrecords
def parse(serialized_example):
    features = tf.parse_single_example(
        serialized_example,
        features={
            "label": tf.FixedLenFeature([], tf.string, default_value=""),
            "feat": tf.FixedLenFeature([], tf.string, default_value="")
        })
    feat = tf.decode_raw(features['feat'], tf.float64)
    label = tf.decode_raw(features['label'], tf.int64)
    return feat, label

################# Generate data
cwd = os.getcwd()
numdata = 10
with tf.python_io.TFRecordWriter(os.path.join(cwd, 'data.tfrecords')) as writer:
    for i in range(numdata):
        feat = np.random.randn(2)
        label = np.array(np.random.randint(0, 9))
        featb = feat.tobytes()
        labelb = label.tobytes()
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'feat': tf.train.Feature(bytes_list=tf.train.BytesList(value=[featb])),
                'label': tf.train.Feature(bytes_list=tf.train.BytesList(value=[labelb])),
            }))
        writer.write(example.SerializeToString())
        print('wrote f {}, l {}'.format(feat, label))
print('Done writing! Start reading and printing data')

################# Read data
filename = ['data.tfrecords']
dataset = tf.data.TFRecordDataset(filename).map(parse)
dataset = dataset.batch(100)
iterator = dataset.make_initializable_iterator()
feat, label = iterator.get_next()

with tf.Session() as sess:
    sess.run(iterator.initializer)
    try:
        while True:
            example = sess.run((feat, label))
            print(example)
    except tf.errors.OutOfRangeError:
        pass
What do I do in the case where each example has multiple feature vectors and labels in it? For example, in the above code, if feat were stored as a 2D array. I still want to do the same thing as before, which is to train a DNN with one feature per label, but each example in the tfrecords file has multiple features and multiple labels. This should be simple, but I'm having trouble unpacking multiple features in TensorFlow using tfrecords.
Firstly, note that np.ndarray.tobytes() flattens a multi-dimensional array into a flat sequence of bytes, i.e.
feat = np.random.randn(N, 2)
reshaped = np.reshape(feat, (N*2,))
feat.tobytes() == reshaped.tobytes() ## True
So, if you have an N*2 array that's saved as bytes in TFRecord format, you have to reshape it after parsing.
If you do that, you can unbatch the elements of a tf.data.Dataset so that each iteration gives you one feature and one label. Your code should be as follows:
# This is used to parse an example from tfrecords
def parse(serialized_example):
    features = tf.parse_single_example(
        serialized_example,
        features={
            "label": tf.FixedLenFeature([], tf.string, default_value=""),
            "feat": tf.FixedLenFeature([], tf.string, default_value="")
        })
    feat = tf.decode_raw(features['feat'], tf.float64)  # array of shape (N*2, )
    feat = tf.reshape(feat, (N, 2))                     # array of shape (N, 2)
    label = tf.decode_raw(features['label'], tf.int64)  # array of shape (N, )
    return feat, label

################# Generate data
cwd = os.getcwd()
numdata = 10
with tf.python_io.TFRecordWriter(os.path.join(cwd, 'data.tfrecords')) as writer:
    for i in range(numdata):
        feat = np.random.randn(N, 2)
        label = np.array(np.random.randint(0, 9, N))
        featb = feat.tobytes()
        labelb = label.tobytes()
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'feat': tf.train.Feature(bytes_list=tf.train.BytesList(value=[featb])),
                'label': tf.train.Feature(bytes_list=tf.train.BytesList(value=[labelb])),
            }))
        writer.write(example.SerializeToString())
        print('wrote f {}, l {}'.format(feat, label))
print('Done writing! Start reading and printing data')

################# Read data
filename = ['data.tfrecords']
dataset = tf.data.TFRecordDataset(filename).map(parse).apply(tf.contrib.data.unbatch())
... etc
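As a side note, in TF 2.x (where the parsing calls move to their tf.io.* names) tf.contrib is gone, and the same unbatching is a built-in Dataset method:
# TF 2.x equivalent of .apply(tf.contrib.data.unbatch())
dataset = tf.data.TFRecordDataset(filename).map(parse).unbatch()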

Reading images from a TFRecord using the Dataset API and showing them in a Jupyter notebook

I created a tfrecord from a folder of images; now I want to iterate over the entries in the TFRecord file using the Dataset API and show them in a Jupyter notebook. However, I'm facing problems with reading the tfrecord file.
Code I used to create TFRecord
def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def generate_tfr(image_list):
    with tf.python_io.TFRecordWriter(output_path) as writer:
        for image in image_list:
            image_bytes = open(image, 'rb').read()
            image_array = imread(image)
            image_shape = image_array.shape
            image_x, image_y, image_z = image_shape[0], image_shape[1], image_shape[2]
            data = {
                'image/bytes': _bytes_feature(image_bytes),
                'image/x': _int64_feature(image_x),
                'image/y': _int64_feature(image_y),
                'image/z': _int64_feature(image_z)
            }
            features = tf.train.Features(feature=data)
            example = tf.train.Example(features=features)
            serialized = example.SerializeToString()
            writer.write(serialized)
Code to read TFRecord
# This code is incomplete and has many flaws.
# Please give some suggestions for correcting this code if you can.
def parse(serialized):
    features = {
        'image/bytes': tf.FixedLenFeature([], tf.string),
        'image/x': tf.FixedLenFeature([], tf.int64),
        'image/y': tf.FixedLenFeature([], tf.int64),
        'image/z': tf.FixedLenFeature([], tf.int64)
    }
    parsed_example = tf.parse_single_example(serialized=serialized, features=features)
    image = parsed_example['image/bytes']
    image = tf.decode_raw(image, tf.uint8)
    x = parsed_example['image/x']  # breadth
    y = parsed_example['image/y']  # height
    z = parsed_example['image/z']  # depth
    image = tf.cast(image, tf.float32)
    # how can I reshape the image tensor here? tf.reshape throws some weird errors.
    return {'image': image, 'x': x, 'y': y, 'z': z}

dataset = tf.data.TFRecordDataset([output_path])
dataset = dataset.map(parse)  # map returns a new dataset, so assign the result
iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()

epoch = 1
with tf.Session() as sess:
    for _ in range(epoch):
        img = sess.run(next_element)  # next_element is a dict, so use sess.run
        print(img)
        # when I print the image, it shows byte code.
        # How can I convert it to a numpy array and then show the image in my Jupyter notebook?
I've never worked with any of this before and I'm stuck at reading the TFRecords. Please answer how to iterate over the contents of a TFRecord and show them in a Jupyter notebook. Feel free to correct/optimize both pieces of code. That would help me a lot.
Is this what you may be looking for? I think once you convert to a numpy array, you can show it in a Jupyter notebook using PIL.Image.
convert tf records to numpy => How can I convert TFRecords into numpy arrays?
show numpy array as image
https://gist.github.com/kylemcdonald/2f1b9a255993bf9b2629
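Putting those two pieces together, a minimal sketch in the question's TF1 style; it assumes the feature keys and output_path from the question, and uses tf.image.decode_image rather than tf.decode_raw because the writer stored the encoded file bytes, not raw pixels:
import tensorflow as tf
from PIL import Image

def parse(serialized):
    features = {
        'image/bytes': tf.FixedLenFeature([], tf.string),
        'image/x': tf.FixedLenFeature([], tf.int64),
        'image/y': tf.FixedLenFeature([], tf.int64),
        'image/z': tf.FixedLenFeature([], tf.int64)
    }
    parsed = tf.parse_single_example(serialized=serialized, features=features)
    # The writer stored the raw *file* contents (JPEG/PNG), so decode the
    # image format instead of reinterpreting the bytes with decode_raw.
    image = tf.image.decode_image(parsed['image/bytes'])
    return image

dataset = tf.data.TFRecordDataset([output_path]).map(parse)
next_image = dataset.make_one_shot_iterator().get_next()

with tf.Session() as sess:
    arr = sess.run(next_image)  # uint8 numpy array of shape (x, y, z)

Image.fromarray(arr)  # as the last expression in a cell, this renders inline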

Input pipeline using TensorFlow Dataset API and Pandas?

I am trying to create a TensorFlow Dataset that takes a list of CSV file paths and creates batches of training data. First I create a parse function that uses Pandas to read the first n rows. I pass this function as the argument to the Dataset's 'map' method:
def _get_data_for_dataset(file_name, rows=100):
    print(file_name.decode())
    df_input = pd.read_csv(os.path.join(folder_name, file_name.decode()),
                           usecols=['Wind_MWh', 'Actual_Load_MWh'], nrows=rows)
    X_data = df_input.as_matrix()
    X_data.astype('float32', copy=False)  # no-op: astype returns a new array
    return X_data  # still float64, matching the tf.float64 below

dataset = tf.data.Dataset.from_tensor_slices(file_names)
dataset = dataset.map(lambda file_name: tf.py_func(
    _get_data_for_dataset, [file_name, 100], tf.float64))
dataset = dataset.batch(2)  # Create batches
iter = dataset.make_one_shot_iterator()
get_batch = iter.get_next()

with tf.Session() as sess:
    print(sess.run(get_batch).shape)
The above code works, but instead of producing a dataset of shape (200, 2) it produces batches of shape (2, 100, 2). Please help.
I finally got the answer from "Dataset API 'flat_map' method producing error for same code which works with 'map' method". With map, each file becomes a single dataset element of shape (100, 2), so batch(2) stacks whole files into (2, 100, 2); flat_map with Dataset.from_tensor_slices instead flattens each file into individual rows before batching.
I am posting the full code in case it may help others who want to use Pandas and the Dataset API together.
folder_name = './data/power_data/'
file_names = os.listdir(folder_name)

def _get_data_for_dataset(file_name):
    df_input = pd.read_csv(os.path.join(folder_name, file_name.decode()),
                           usecols=['Wind_MWh', 'Actual_Load_MWh'])
    X_data = df_input.as_matrix()
    return X_data.astype('float32', copy=False)

dataset = tf.data.Dataset.from_tensor_slices(file_names)
# Use `Dataset.from_tensor_slices()` to make a `Dataset` from the output of
# the `tf.py_func()` op, so each CSV row becomes its own element.
dataset = dataset.flat_map(lambda file_name: tf.data.Dataset.from_tensor_slices(
    tf.py_func(_get_data_for_dataset, [file_name], tf.float32)))
dataset = dataset.batch(100)
iter = dataset.make_one_shot_iterator()
get_batch = iter.get_next()

with tf.Session() as sess:
    print(sess.run(get_batch))

Do not save a checkpoint for the final step with Estimator

I use Estimator and I train the model in a loop to feed data. Since every call trains for a single step, every step is a final step, and a checkpoint is saved at every final step too. I want to avoid saving a checkpoint on every iteration to improve training speed.
I cannot find any information on how to do this. Do you have any ideas/suggestions/solutions?
classifier = Estimator(
    model_fn=cnn_model_fn,
    model_dir="./temp_model_Adam",
    config=tf.contrib.learn.RunConfig(
        save_checkpoints_secs=None,
        save_checkpoints_steps=100,
        save_summary_steps=None
    )
)

# Train the model
for e in range(0, 10):
    numbers = np.arange(10000)
    np.random.shuffle(numbers)
    for step in range(0, 2000):
        classifier.fit(
            input_fn=lambda: read_images_for_training_as_batch(step, path, 5, numbers),
            steps=1
        )
Nowadays the API has changed a bit, but from what I see you were using the fit (currently train) method incorrectly: you should pass steps=2000 and have your input function return an iterator over your dataset. Today you have tf.estimator.inputs.numpy_input_fn at your disposal, which can help when you have small datasets; otherwise you have to use the tf.data.Dataset API.
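A minimal sketch of the numpy_input_fn route for small datasets (the feature name 'x' and the random arrays are placeholders, not from the original post):
import numpy as np
import tensorflow as tf

# numpy_input_fn builds the repeat/shuffle/batch pipeline for in-memory data.
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={'x': np.random.rand(1000, 28, 28).astype(np.float32)},
    y=np.random.randint(0, 10, size=1000),
    batch_size=64,
    num_epochs=None,  # cycle indefinitely; bound training with steps=
    shuffle=True)

# estimator.train(input_fn=train_input_fn, steps=2000)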
With the tf.data API, something like this (it loads .wav files):
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
from tensorflow.python.ops import io_ops
# ...

def input_fn(num_epochs, batch_size, shuffle=False, mode='training'):
    def input_fn_bound():
        def _read_file(fn, label):
            return io_ops.read_file(fn), label

        def _decode(data, label):
            pcm = contrib_audio.decode_wav(data,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
            return pcm.audio, label

        filenames = get_files(mode)
        classes = get_classes(mode)
        labels = {'class': np.array(classes)}
        dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
        if shuffle:
            dataset = dataset.shuffle(buffer_size=len(filenames))  # len(labels) would be 1: it is a dict
        dataset = dataset.map(_read_file, num_parallel_calls=num_map_threads)
        dataset = dataset.map(_decode, num_parallel_calls=num_map_threads)
        dataset = dataset.map(lambda wav, label: ({'wav': wav}, label))
        dataset = dataset.repeat(num_epochs)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(2)  # Load the next batch while the current one is processed on the GPU
        iter = dataset.make_one_shot_iterator()
        features, labels = iter.get_next()
        return features, labels
    return input_fn_bound
# ....
estimator.train(input_fn=input_fn(num_epochs=None,
                                  batch_size=64,
                                  shuffle=True,
                                  mode='training'),
                steps=10000)
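As for the checkpoint frequency itself, with a single long train call it is governed by the RunConfig rather than by call boundaries. A sketch using the non-contrib tf.estimator.RunConfig, with illustrative values:
# Save a checkpoint every 1000 steps instead of at the end of every
# one-step fit() call; keep_checkpoint_max bounds how many are kept on disk.
run_config = tf.estimator.RunConfig(save_checkpoints_steps=1000,
                                    keep_checkpoint_max=3)
estimator = tf.estimator.Estimator(model_fn=cnn_model_fn,
                                   model_dir='./temp_model_Adam',
                                   config=run_config)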