one read_and_decode function for different training data - tensorflow

I'm new to TensorFlow, and here's what I'm trying to do: save training data from different scenarios and then read them back. The sizes of feature and output may be different for different scenarios.
The issue is that when I try to read the data back, I get an exception like the following:
InvalidArgumentError (see above for traceback): Name: <unknown>, Key: observation, Index: 0. Number of float values != expected. Values size: 17 but output shape: []
The function for saving the data is below:
def save_data(obs, actions, filename):
    writer = tf.python_io.TFRecordWriter(filename)
    for index in range(len(obs)):
        o = obs[index].tolist()
        a = actions[index].tolist()
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'obs': tf.train.Feature(float_list=tf.train.FloatList(value=o)),
                'action': tf.train.Feature(float_list=tf.train.FloatList(value=a)),
                'obs_size': tf.train.Feature(int64_list=tf.train.Int64List(value=[len(o)])),
                'action_size': tf.train.Feature(int64_list=tf.train.Int64List(value=[len(a)]))
            }
        ))
        writer.write(example.SerializeToString())
    writer.close()
The function to read the data back is as follows:
def read_and_decode(filename_queue):
    reader = tf.TFRecordReader()
    _, example = reader.read(filename_queue)
    features = tf.parse_single_example(
        example,
        features={
            'obs': tf.FixedLenFeature([], tf.float32),
            'action': tf.FixedLenFeature([], tf.float32),
            'obs_size': tf.FixedLenFeature([], tf.int64),
            'action_size': tf.FixedLenFeature([], tf.int64)
        }
    )
    obs_size = tf.cast(features['obs_size'], tf.int32)
    action_size = tf.cast(features['action_size'], tf.int32)
    obs_shape = tf.pack([1, obs_size])
    action_shape = tf.pack([1, action_size])
    obs = tf.reshape(features['obs'], obs_shape)
    action = tf.reshape(features['action'], action_shape)
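For context, the error means the parser was told to expect a scalar per feature (shape []) while 17 float values were actually stored. One way to handle features whose length varies between scenarios is tf.VarLenFeature; below is a minimal sketch of that approach (an editorial illustration using the same feature names, not the asker's code):

def read_and_decode_varlen(filename_queue):
    reader = tf.TFRecordReader()
    _, example = reader.read(filename_queue)
    features = tf.parse_single_example(
        example,
        features={
            # VarLenFeature accepts any number of values per example
            'obs': tf.VarLenFeature(tf.float32),
            'action': tf.VarLenFeature(tf.float32),
            'obs_size': tf.FixedLenFeature([], tf.int64),
            'action_size': tf.FixedLenFeature([], tf.int64)
        }
    )
    # VarLenFeature yields SparseTensors; densify them before use
    obs = tf.sparse_tensor_to_dense(features['obs'])
    action = tf.sparse_tensor_to_dense(features['action'])
    return obs, action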

Related

How can I preprocess my MapDataset to fit my model input?

I have a MapDataset composed of a label as text and a vector of floats stored as a string.
Here is how I read the content of my tfrecord:
def extract_data(tfrecord_ds):
    feature_description = {
        'classes_text': tf.io.FixedLenFeature((), tf.string),
        'data': tf.io.FixedLenFeature([], tf.string)
    }
    def _parse_data_function(example_proto):
        return tf.compat.v1.parse_single_example(example_proto, feature_description)
    parsed_dataset = tfrecord_ds.map(_parse_data_function)
    dataset = parsed_dataset.cache().shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)
    return dataset
I want to convert the text label to an int according to a label.txt file, and the data string to a vector of floats.
I want to use this data to train a custom model like this:
my_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(1024,), dtype=tf.float32,
                          name='input_embedding'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(num_classes)
], name='audio_detector')
How can I process my MapDataset from (string, string) to (int, float_array) so that I can train my model?
Edit:
Here is how I encode my data:
features = {}
features['classes_text'] = tf.train.Feature(
    bytes_list=tf.train.BytesList(value=[audio_data_generator.label.encode()]))
embedding_bytes = embedding.numpy().tobytes()
features['data'] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[embedding_bytes]))
tf_example = tf.train.Example(features=tf.train.Features(feature=features))
writer.write(tf_example.SerializeToString())
It is easier to encode the embedding using tf.train.FloatList.
When writing to tfrecords use:
features = {
    'classes_text': tf.train.Feature(bytes_list=tf.train.BytesList(value=[label.encode()])),
    'data': tf.train.Feature(float_list=tf.train.FloatList(value=embedding))
}
tf_example = tf.train.Example(features=tf.train.Features(feature=features))
writer.write(tf_example.SerializeToString())
And when reading give the embedding size to tf.io.FixedLenFeature, for example:
embedding_size = 10
feature_description = {
    'classes_text': tf.io.FixedLenFeature((), tf.string),
    'data': tf.io.FixedLenFeature([embedding_size], tf.float32)
}
To convert label_text to int you can use tf.lookup.StaticVocabularyTable.
# Assuming label.txt contains a single label per line.
with open('label.txt', 'r') as fin:
    categories = [line.strip() for line in fin.readlines()]
init = tf.lookup.KeyValueTensorInitializer(
    keys=tf.constant(categories),
    values=tf.constant(list(range(len(categories))), dtype=tf.int64))
label_table = tf.lookup.StaticVocabularyTable(
    init,
    num_oov_buckets=1)
feature_description = {
    'classes_text': tf.io.FixedLenFeature((), tf.string),
    'data': tf.io.FixedLenFeature([embedding_size], tf.float32)
}
def _parse_data_function(example_proto):
    example = tf.compat.v1.parse_single_example(example_proto, feature_description)
    # Apply the label lookup.
    example['classes_text'] = label_table.lookup(example['classes_text'])
    return example
parsed_dataset = tfrecord_ds.map(_parse_data_function)
dataset = parsed_dataset.cache().shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)
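To actually train on this, the parsed dict still has to be turned into the (features, label) tuple that Keras expects; a minimal sketch, assuming the dataset and my_model defined above:

# Map each parsed dict to an (inputs, labels) tuple for model.fit.
train_ds = dataset.map(
    lambda example: (example['data'], example['classes_text']))
my_model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
my_model.fit(train_ds, epochs=5)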
Edit
If you wish to keep the way you save data, you can use np.frombuffer to convert the binary strings back to numpy vectors. You will have to wrap this code in a tf.function and tf.py_function, though.
def decode_embedding(embedding_bytes):
    # The dtype must match the dtype of the array that tobytes() was called on.
    return np.frombuffer(embedding_bytes.numpy(), dtype=np.float32)

@tf.function
def tf_decode_embedding(embedding_bytes):
    return tf.py_function(decode_embedding, inp=[embedding_bytes], Tout=tf.float32)

feature_description = {
    'classes_text': tf.io.FixedLenFeature((), tf.string),
    'data': tf.io.FixedLenFeature([], tf.string)
}
def _parse_data_function(example_proto):
    example = tf.compat.v1.parse_single_example(example_proto, feature_description)
    example['classes_text'] = label_table.lookup(example['classes_text'])
    example['data'] = tf_decode_embedding(example['data'])
    return example
parsed_dataset = tfrecord_ds.map(_parse_data_function)
dataset = parsed_dataset.cache().shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)
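One caveat worth adding (an editorial note, not part of the original answer): tensors coming out of tf.py_function have an unknown static shape, which Keras models sometimes reject. If that happens, the shape can be restored in the parse function, assuming embedding_size as defined earlier:

def _parse_data_function(example_proto):
    example = tf.compat.v1.parse_single_example(example_proto, feature_description)
    example['classes_text'] = label_table.lookup(example['classes_text'])
    data = tf_decode_embedding(example['data'])
    data.set_shape([embedding_size])  # py_function loses static shape info
    example['data'] = data
    return example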

How to save large float into TFRecord format? float_list/float32 seems to truncate the values

We write processed data into TFRecords, and we are noticing data loss when it is read back. A reproducible example is below. The strange thing is that it doesn't just drop the decimals; it seems to round values up or down at random. Since TFRecords only allow float32, int64 and string, we are not sure what else to try.
We are writing these values
[20191221.1, 20191222.1, 20191223.1, 20191224.1, 20191225.1, 20191226.1, 20191227.1, 20191228.1, 20191229.1, 20191230.1]
But reading from tfrecords returns these values
tf.Tensor(
[20191222. 20191222. 20191224. 20191224. 20191226. 20191226. 20191228.
20191228. 20191230. 20191230.], shape=(10,), dtype=float32)
Reproducible Code
import tensorflow as tf

def write_date_tfrecord():
    # writes 10 dummy values to replicate the issue
    data = [20191221.1 + x for x in range(0, 10)]
    print("Writing data - ", data)
    example = tf.train.Example(
        features=tf.train.Features(
            feature={
                'data': tf.train.Feature(float_list=tf.train.FloatList(value=data))
            }
        ))
    writer = tf.io.TFRecordWriter("data.tf_record")
    writer.write(example.SerializeToString())
    writer.close()  # close to flush the record to disk

def parse_function(serialized_example):
    features = {
        'data': tf.io.FixedLenSequenceFeature([], tf.float32, allow_missing=True)
    }
    features = tf.io.parse_single_example(serialized=serialized_example, features=features)
    data = features['data']
    return data

def dataset_generator():
    trRecordDataset = tf.data.TFRecordDataset("data.tf_record")
    trRecordDataset = trRecordDataset.map(parse_function, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return trRecordDataset

if __name__ == '__main__':
    write_date_tfrecord()
    generator = dataset_generator()
    for data in generator:
        print(data)
This solved my issue. I hit the same problem when writing audio files as a floating-point matrix using FloatList. The root cause is that tf.train.FloatList stores single-precision (float32) values, which carry only about 7 significant decimal digits, so a value like 20191221.1 cannot be represented exactly. When I used BytesList instead, stored the raw bytes in the tfrecords, and decoded them when reading, the issue was resolved. Note that decoding with tf.float32 will not solve it: numpy arrays are float64 by default, so the bytes have to be decoded with tf.float64.
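As a quick sanity check of the precision limit (an editorial sketch, independent of TFRecords):

import numpy as np

# float32 has a 24-bit significand (~7 significant decimal digits),
# so near 2e7 the representable values are 2 apart and the .1 is lost.
print(np.float32(20191221.1))  # 20191222.0
print(np.float64(20191221.1))  # 20191221.1

The BytesList-based workaround looks like this: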
import os

def _bytes_feature2(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def serialize_example(sound):
    feature = {
        'snd': _bytes_feature2(sound.tobytes()),
    }
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

def write_tfrecords(rf, snd):
    nsamples = len(snd)
    with tf.io.TFRecordWriter(rf) as writer:
        for i in range(nsamples):
            SND = snd[i]
            tf_example = serialize_example(SND)
            writer.write(tf_example)

# writing records (train holds the float64 audio arrays)
write_tfrecords(os.getcwd()+'\\tfrec\\'+'train.tfrecords', train)

# loading records
raw_dataset = tf.data.TFRecordDataset(os.getcwd()+'\\tfrec\\'+'train.tfrecords')

def parse_record(record):
    name_to_features = {
        'snd': tf.io.FixedLenFeature([], tf.string),
    }
    return tf.io.parse_single_example(record, name_to_features)

def decode_record(record):
    aud = tf.io.decode_raw(
        record['snd'], out_type=tf.float64
    )
    return aud

for record in raw_dataset:
    parsed_record = parse_record(record)
    decoded_record = decode_record(parsed_record)
    aud = decoded_record
print(aud.numpy()[0:10])
print(train[0][0:10])
output:
[ 417.69951205 -231.58708746 -10.05624011 -146.10342256 -66.60317323
-159.91550792 -3.93602823 29.94517981 106.22196629 65.53008959]
[ 417.69951205 -231.58708746 -10.05624011 -146.10342256 -66.60317323
-159.91550792 -3.93602823 29.94517981 106.22196629 65.53008959]

Writing and Reading lists to TFRecord example

I want to write a list of integers (or any multidimensional numpy matrix) to one TFRecords example. For both a single value and a list of multiple values I can create the TFRecord file without error. I also know how to read the single value back from the TFRecord file, as shown in the code sample below, which I compiled from various sources.
# Making an example TFRecord
my_example = tf.train.Example(features=tf.train.Features(feature={
    'my_ints': tf.train.Feature(int64_list=tf.train.Int64List(value=[5]))
}))
my_example_str = my_example.SerializeToString()
with tf.python_io.TFRecordWriter('my_example.tfrecords') as writer:
    writer.write(my_example_str)

# Reading it back via a Dataset
featuresDict = {'my_ints': tf.FixedLenFeature([], dtype=tf.int64)}

def parse_tfrecord(example):
    features = tf.parse_single_example(example, featuresDict)
    return features

Dataset = tf.data.TFRecordDataset('my_example.tfrecords')
Dataset = Dataset.map(parse_tfrecord)
iterator = Dataset.make_one_shot_iterator()
with tf.Session() as sess:
    print(sess.run(iterator.get_next()))
But how can I read back a list of values (e.g. [5, 6]) from one example? featuresDict defines the feature as a scalar of type int64, and parsing fails when the example holds multiple values, with the error below:
tensorflow.python.framework.errors_impl.InvalidArgumentError: Key: my_ints. Can't parse serialized Example.
You can achieve this by using tf.train.SequenceExample. I've edited your code to return both 1D and 2D data. First, you create a list of features which you place in a tf.train.FeatureList. We convert our 2D data to bytes.
import numpy as np
import tensorflow as tf

vals = [5, 5]
vals_2d = [np.zeros((5,5), dtype=np.uint8), np.ones((5,5), dtype=np.uint8)]
features = [tf.train.Feature(int64_list=tf.train.Int64List(value=[val])) for val in vals]
features_2d = [tf.train.Feature(bytes_list=tf.train.BytesList(value=[val.tostring()])) for val in vals_2d]
featureList = tf.train.FeatureList(feature=features)
featureList_2d = tf.train.FeatureList(feature=features_2d)
In order to get the correct shape of our 2D feature we need to provide context (non-sequential data); this is done with a context dictionary.
context_dict = {'height': tf.train.Feature(int64_list=tf.train.Int64List(value=[vals_2d[0].shape[0]])),
                'width': tf.train.Feature(int64_list=tf.train.Int64List(value=[vals_2d[0].shape[1]])),
                'length': tf.train.Feature(int64_list=tf.train.Int64List(value=[len(vals_2d)]))}
Then you place each FeatureList in a tf.train.FeatureLists dictionary. Finally, this is placed in a tf.train.SequenceExample along with the context dictionary.
my_example = tf.train.SequenceExample(
    feature_lists=tf.train.FeatureLists(feature_list={'1D': featureList,
                                                      '2D': featureList_2d}),
    context=tf.train.Features(feature=context_dict))
my_example_str = my_example.SerializeToString()
with tf.python_io.TFRecordWriter('my_example.tfrecords') as writer:
    writer.write(my_example_str)
To read it back into tensorflow you need to use tf.FixedLenSequenceFeature for the sequential data and tf.FixedLenFeature for the context data. We convert the bytes back to integers and we parse the context data in order to restore the correct shape.
# Reading it back via a Dataset
featuresDict = {'1D': tf.FixedLenSequenceFeature([], dtype=tf.int64),
                '2D': tf.FixedLenSequenceFeature([], dtype=tf.string)}
contextDict = {'height': tf.FixedLenFeature([], dtype=tf.int64),
               'width': tf.FixedLenFeature([], dtype=tf.int64),
               'length': tf.FixedLenFeature([], dtype=tf.int64)}

def parse_tfrecord(example):
    context, features = tf.parse_single_sequence_example(
        example,
        sequence_features=featuresDict,
        context_features=contextDict
    )
    height = context['height']
    width = context['width']
    seq_length = context['length']
    vals = features['1D']
    vals_2d = tf.decode_raw(features['2D'], tf.uint8)
    vals_2d = tf.reshape(vals_2d, [seq_length, height, width])
    return vals, vals_2d

Dataset = tf.data.TFRecordDataset('my_example.tfrecords')
Dataset = Dataset.map(parse_tfrecord)
iterator = Dataset.make_one_shot_iterator()
with tf.Session() as sess:
    print(sess.run(iterator.get_next()))
This will output the sequence [5, 5] and the 2D numpy arrays. This blog post has a more in-depth look at defining sequences with tfrecords: https://dmolony3.github.io/Working%20with%20image%20sequences.html
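As an editorial aside: if every example stores the same, fixed number of integers, a plain tf.train.Example is enough; declaring the length in the feature spec is all that is needed. A minimal sketch under that assumption:

# Write two ints in one feature.
my_example = tf.train.Example(features=tf.train.Features(feature={
    'my_ints': tf.train.Feature(int64_list=tf.train.Int64List(value=[5, 6]))
}))
with tf.python_io.TFRecordWriter('my_example.tfrecords') as writer:
    writer.write(my_example.SerializeToString())

# Declare the fixed length [2] when parsing; tf.VarLenFeature(tf.int64)
# would allow a varying count instead (returning a SparseTensor).
featuresDict = {'my_ints': tf.FixedLenFeature([2], dtype=tf.int64)}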

barriers & prefetch error when using batch_sequences_with_states

I encounter an error related to barriers when using tf.contrib.training.batch_sequences_with_states.
I write handmade data to the tfrecords format; it contains 100 examples, and each example is a variable-length sequence of 50-dimensional features.
test_list = list()
for i in range(100):
    l = [[], []]
    llen = int(random.uniform(100, 500))
    for j in range(llen):
        l[0].append([j*0.1]*50)
        l[1].append(j)
    test_list.append(l)

writer = tf.python_io.TFRecordWriter("my_test.sequence.tfrecords")
try:
    for (idx, i) in enumerate(test_list):
        example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    'id': tf.train.Feature(bytes_list=tf.train.BytesList(value=['list'+str(idx)+'_'])),
                    'length': tf.train.Feature(int64_list=tf.train.Int64List(value=[len(i[0])]))
                }
            ),
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    'feat': tf.train.FeatureList(feature=[tf.train.Feature(float_list=tf.train.FloatList(value=frame)) for frame in i[0]]),
                    'label': tf.train.FeatureList(feature=[tf.train.Feature(int64_list=tf.train.Int64List(value=[frame])) for frame in i[1]])
                }
            )
        )
        writer.write(example.SerializeToString())
finally:
    writer.close()
Then I read and decode the tfrecords dataset, and use tf.contrib.training.batch_sequences_with_states to batch the sequence inputs.
def read_and_decode(filename, num_epochs=None):
    context_features = {
        'id': tf.FixedLenFeature([], dtype=tf.string),
        'length': tf.FixedLenFeature([], dtype=tf.int64)
    }
    sequence_features = {
        "feat": tf.FixedLenSequenceFeature([50], dtype=tf.float32, allow_missing=False),
        "label": tf.FixedLenSequenceFeature([], dtype=tf.int64, allow_missing=False)
    }
    filename_queue = tf.train.string_input_producer([filename], num_epochs)
    reader = tf.TFRecordReader()
    key, serialized_example = reader.read(filename_queue)
    context_parsed, sequence_parsed = tf.parse_single_sequence_example(
        serialized=serialized_example,
        context_features=context_features,
        sequence_features=sequence_features)
    return (key, context_parsed, sequence_parsed)
key, context_parsed, sequence_parsed = read_and_decode("my_test.sequence.tfrecords")

batch_size = 16
num_unroll = 8
num_enqueue_threads = 2
hidden_size = 256
cell = tf.contrib.rnn.GRUCell(hidden_size)
initial_state_values = tf.zeros(cell.state_size, dtype=tf.float32)
initial_states = {"gru": initial_state_values}
batch = tf.contrib.training.batch_sequences_with_states(
    input_key=context_parsed['id'],
    input_sequences=sequence_parsed,
    input_context=context_parsed,
    initial_states=initial_states,
    num_unroll=num_unroll,
    batch_size=batch_size,
    input_length=tf.cast(context_parsed["length"], tf.int32),
    pad=True,
    num_threads=num_enqueue_threads,
    capacity=1000 + batch_size * num_enqueue_threads * 2,
    make_keys_unique=False,
    allow_small_batch=True,
    name='batch_sequence')
k = batch.key
nk = batch.next_key
inputs = batch.sequences['feat']
labels = batch.sequences['label']
wid = batch.context['id']
inputs_by_time = tf.split(inputs, num_unroll, 1)
inputs_by_time = [tf.squeeze(elem, squeeze_dims=1) for elem in inputs_by_time]
assert len(inputs_by_time) == num_unroll
outputs, state = tf.contrib.rnn.static_state_saving_rnn(
    cell,
    inputs_by_time,
    sequence_length=batch.context["length"],
    state_saver=batch,
    state_name='gru')

with tf.device('/cpu:0'):
    sess = tf.Session()
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
    fetches = [k, nk, wid]
    feed_dict = {}
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord, daemon=False)
    try:
        while True:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            k, nk, wid = sess.run(fetches)
            print('key: %s' % k)
            print('next key: %s' % nk)
            print('id: %s' % wid)
    except tf.errors.OutOfRangeError:
        print("done training")
    finally:
        print('request stop ............')
        coord.request_stop()
        coord.join(threads)
        sess.close()
But I get an error (the original post included only a screenshot of the stack trace). The error message shows that the barrier inserted a key that already exists. It seems the graph cannot prefetch the second time step of a sequence. Maybe the error comes from the prefetch op?
If I set make_keys_unique=True, the batch still cannot reach the second time step (again, only a screenshot of the error was posted).

Storing a list of lists as a tf record in tensorflow

What would be the best way of storing/reading a list of lists in a tf record in TensorFlow?
I tried to serialize the data to a one-dimensional list, then reshape it back to its original size when reading. However, the encoding process takes forever.
Writing to tf_record:
Variable in question: word_data (shape=[nb_channels, 1500])
electrodes_coordinates = word_data['electrodes_coordinates']
electrodes_loc3 = word_data['electrodes_loc3']
nb_electrodes = word_data['nb_electrodes']
label = word_data['label']
word_data = word_data['word']
# reshape word_data from a list of lists (nb_channels, nb_timepoints) to a flat list (nb_channels*nb_timepoints)
word_data = np.reshape(word_data, [-1])
context = tf.train.Features(feature={
    "word/word_id": _bytes_feature(word),
    "word/nb_channels": _int64_feature(nb_electrodes),
    "word/label": _int64_feature(int(label))
})
feature_lists = tf.train.FeatureLists(feature_list={
    "word/electrode_x_coordinates": _float_feature_list(electrodes_coordinates[:, 0]),
    "word/electrode_y_coordinates": _float_feature_list(electrodes_coordinates[:, 1]),
    "word/electrode_z_coordinates": _float_feature_list(electrodes_coordinates[:, 2]),
    "word/electrode_location3": _int64_feature_list(loc3_to_id(electrodes_loc3, loc3_dict)),
    "word/data": _float_feature_list(word_data)})
sequence_example = tf.train.SequenceExample(context=context, feature_lists=feature_lists)
return sequence_example
Reading from tf_record:
context, sequence = tf.parse_single_sequence_example(serialized,
    context_features={
        nb_channels: tf.FixedLenFeature([], dtype=tf.int64),
        label: tf.FixedLenFeature([], dtype=tf.int64)
    },
    sequence_features={
        electrode_x_coordinates: tf.FixedLenSequenceFeature([], dtype=tf.float32),
        electrode_y_coordinates: tf.FixedLenSequenceFeature([], dtype=tf.float32),
        electrode_z_coordinates: tf.FixedLenSequenceFeature([], dtype=tf.float32),
        electrode_location3: tf.FixedLenSequenceFeature([], dtype=tf.int64),
        word_data: tf.FixedLenSequenceFeature([], dtype=tf.float32)
    }
)
encoded_nb_channels = context[nb_channels]
encoded_label = context[label]
encoded_electrode_x_coordinates = sequence[electrode_x_coordinates]
encoded_electrode_y_coordinates = sequence[electrode_y_coordinates]
encoded_electrode_z_coordinates = sequence[electrode_z_coordinates]
encoded_electrode_location3 = sequence[electrode_location3]
encoded_word_data = sequence[word_data]
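No answer is included in this excerpt, but the BytesList pattern shown in the answers above suggests a much faster encoding than building one Feature per value: serialize the whole matrix as raw bytes plus its shape, then decode_raw and reshape when reading. A minimal sketch of that route (an editorial illustration; the feature names are made up, not from the original post):

import numpy as np
import tensorflow as tf

def encode_word_data(word_data):
    # word_data: float32 numpy array of shape [nb_channels, 1500]
    word_data = np.asarray(word_data, dtype=np.float32)
    feature = {
        'word/data_raw': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[word_data.tobytes()])),
        'word/nb_channels': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[word_data.shape[0]])),
        'word/nb_timepoints': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[word_data.shape[1]])),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))

def decode_word_data(serialized):
    features = tf.parse_single_example(serialized, {
        'word/data_raw': tf.FixedLenFeature([], tf.string),
        'word/nb_channels': tf.FixedLenFeature([], tf.int64),
        'word/nb_timepoints': tf.FixedLenFeature([], tf.int64),
    })
    data = tf.decode_raw(features['word/data_raw'], tf.float32)
    shape = tf.stack([tf.cast(features['word/nb_channels'], tf.int32),
                      tf.cast(features['word/nb_timepoints'], tf.int32)])
    # restore the original [nb_channels, nb_timepoints] layout
    return tf.reshape(data, shape)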