tensorflow: Reading time series data from TFRecord - tensorflow

I'm using a SequenceExample protobuf to read/write time-series data into a TFRecord file.
I serialized a pair of np arrays as follows:
writer = tf.python_io.TFRecordWriter(file_name)
context = tf.train.Features( ... Feature( ... ) ... )
feature_data = tf.train.FeatureList(feature=[
labels = tf.train.FeatureList(feature=[
##feature_data and labels are of similar, but varying lengths
feature_list = {"feature_data": feature_data,
"labels": labels}
feature_lists = tf.train.FeatureLists(feature_list=feature_list)
example = tf.train.SequenceExample(context=context,
## serialize and close
When trying to read the .tfrecords file, I've gotten quite a few errors, primarily because the SequenceExample protobuf writes the time series data as a series of values (e.g. value: -12.2549, value: -18.1372, .... value:13.1234). My code to read the .tfrecords file is as follows:
dataset = tf.data.TFRecordDataset("data/tf_record.tfrecords")
dataset = dataset.map(decode)
dataset = dataset.make_one_shot_iterator().get_next()
### reshape tensors and feed to estimator###
My decode() function is defined as follows:
def decode(serialized_proto):
context_features = {...}
sequence_features = {"feature_data": tf.FixedLenSequenceFeature((None,),
"labels": tf.FixedLenSequenceFeature(((None,),
context, sequence = tf.parse_single_sequence_example(serialized_proto,
return context, sequence
One of the errors is as follows:
Shape [?] is not fully defined for 'ParseSingleSequenceExample/ParseSingleSequenceExample' (op: 'ParseSingleSequenceExample') with input shapes: [], [0], [], [], [], [], [], [], [].
My primary question is how to think about the structure of Datasets. I'm not sure I really understand the structure of the data returned. I'm having a hard time iterating through this Dataset and returning the variably-sized Tensors. Thanks in advance!

You can only use tf.FixedLenSequenceFeature when the shape of the feature is known. Otherwise, use tf.VarLenFeature instead.


Writing and Reading lists to TFRecord example

I want to write a list of integers (or any multidimensional numpy matrix) to one TFRecords example. For both a single value and a list of multiple values, I can create the TFRecord file without error. I also know how to read a single value back from the TFRecord file, as shown in the code sample below, which I compiled from various sources.
# Making an example TFRecord
my_example = tf.train.Example(features=tf.train.Features(feature={
'my_ints': tf.train.Feature(int64_list=tf.train.Int64List(value=[5]))
my_example_str = my_example.SerializeToString()
with tf.python_io.TFRecordWriter('my_example.tfrecords') as writer:
# Reading it back via a Dataset
featuresDict = {'my_ints': tf.FixedLenFeature([], dtype=tf.int64)}
def parse_tfrecord(example):
    """Parse one serialized ``tf.train.Example`` against the ``featuresDict`` spec.

    Args:
        example: a serialized Example, as handed to ``Dataset.map``.

    Returns:
        Whatever ``tf.parse_single_example`` produces for ``featuresDict``.
    """
    return tf.parse_single_example(example, featuresDict)
Dataset = tf.data.TFRecordDataset('my_example.tfrecords')
Dataset = Dataset.map(parse_tfrecord)
iterator = Dataset.make_one_shot_iterator()
with tf.Session() as sess:
But how can I read back a list of values (e.g. [5,6]) from one example? The featuresDict defines the feature to be of type int64, and parsing fails when the feature holds multiple values; I get the error below:
tensorflow.python.framework.errors_impl.InvalidArgumentError: Key: my_ints. Can't parse serialized Example.
You can achieve this by using tf.train.SequenceExample. I've edited your code to return both 1D and 2D data. First, you create a list of features which you place in a tf.train.FeatureList. We convert our 2D data to bytes.
vals = [5, 5]
vals_2d = [np.zeros((5,5), dtype=np.uint8), np.ones((5,5), dtype=np.uint8)]
features = [tf.train.Feature(int64_list=tf.train.Int64List(value=[val])) for val in vals]
features_2d = [tf.train.Feature(bytes_list=tf.train.BytesList(value=[val.tostring()])) for val in vals_2d]
featureList = tf.train.FeatureList(feature=features)
featureList_2d = tf.train.FeatureList(feature=features_2d)
In order to get the correct shape of our 2D feature we need to provide context (non-sequential data); this is done with a context dictionary.
context_dict = {'height': tf.train.Feature(int64_list=tf.train.Int64List(value=[vals_2d[0].shape[0]])),
'width': tf.train.Feature(int64_list=tf.train.Int64List(value=[vals_2d[0].shape[1]])),
'length': tf.train.Feature(int64_list=tf.train.Int64List(value=[len(vals_2d)]))}
Then you place each FeatureList in a tf.train.FeatureLists dictionary. Finally, this is placed in a tf.train.SequenceExample along with the context dictionary
my_example = tf.train.SequenceExample(feature_lists=tf.train.FeatureLists(feature_list={'1D':featureList,
'2D': featureList_2d}),
context = tf.train.Features(feature=context_dict))
my_example_str = my_example.SerializeToString()
with tf.python_io.TFRecordWriter('my_example.tfrecords') as writer:
To read it back into tensorflow you need to use tf.FixedLenSequenceFeature for the sequential data and tf.FixedLenFeature for the context data. We convert the bytes back to integers and we parse the context data in order to restore the correct shape.
# Reading it back via a Dataset
featuresDict = {'1D': tf.FixedLenSequenceFeature([], dtype=tf.int64),
'2D': tf.FixedLenSequenceFeature([], dtype=tf.string)}
contextDict = {'height': tf.FixedLenFeature([], dtype=tf.int64),
'width': tf.FixedLenFeature([], dtype=tf.int64),
'length':tf.FixedLenFeature([], dtype=tf.int64)}
def parse_tfrecord(example):
context, features = tf.parse_single_sequence_example(
height = context['height']
width = context['width']
seq_length = context['length']
vals = features['1D']
vals_2d = tf.decode_raw(features['2D'], tf.uint8)
vals_2d = tf.reshape(vals_2d, [seq_length, height, width])
return vals, vals_2d
Dataset = tf.data.TFRecordDataset('my_example.tfrecords')
Dataset = Dataset.map(parse_tfrecord)
iterator = Dataset.make_one_shot_iterator()
with tf.Session() as sess:
This will output the sequence of [5, 5] and the 2D numpy arrays. This blog post has a more in depth look at defining sequences with tfrecords https://dmolony3.github.io/Working%20with%20image%20sequences.html

Tensorflow parse_single_example returns all dataset

I'm creating a basic LinearClassifier in Tensorflow, but it seems that my input function returns the whole dataset at the first iteration, instead of just one example & its label.
My TFRecord has the following structure (obtained with print( tf.train.Example.FromString(example.SerializeToString())) )
features {
feature {
key: "attackType"
value {
int64_list {
value: 0
value: 0
feature {
key: "dst_ip_addr"
value {
bytes_list {
value: "EXT_SERVER"
It seems the TFRecord file is well formatted. However, when I try to parse it with the following snippet:
def input_fn_train(repeat=10, batch_size=32):
    """Reads dataset from tfrecord, apply parser with map.

    Args:
        repeat: number of times the dataset is repeated.
        batch_size: maximum number of parsed examples per batch.

    Returns:
        The ``tf.data`` dataset after map, repeat and batch have been applied.
    """
    # Read the serialized examples from the TFRecord file
    dataset = tf.data.TFRecordDataset([processed_bucket+processed_key])
    # Map the parser over dataset, and batch results by up to batch_size
    dataset = dataset.map(_decode)
    dataset = dataset.repeat(repeat)
    # NOTE: batching stacks up to batch_size examples along a new leading
    # axis, so every element pulled from the iterator is a whole batch.
    dataset = dataset.batch(batch_size)
    return dataset
def _decode(serialized_ex):
'src_ip_addr': tf.FixedLenFeature(src_ip_size,tf.string),
'src_pt': tf.FixedLenFeature(src_pt_size,tf.int64),
'dst_ip_addr': tf.FixedLenFeature(dst_ip_size,tf.string),
'dst_pt': tf.FixedLenFeature(dst_pt_size,tf.int64),
'proto': tf.FixedLenFeature(proto_size,tf.string),
'packets': tf.FixedLenFeature(packets_size,tf.int64),
'subnet': tf.FixedLenFeature(subnet_size,tf.int64),
'attackType': tf.FixedLenFeature(attack_type_size,tf.int64)
parsed_features = tf.parse_single_example(serialized_ex, features)
label = parsed_features.pop('attackType')
return parsed_features, label
sess = tf.Session()
it = input_fn_train().make_one_shot_iterator()
It shows that it.get_next() returns
({'dst_ip_addr': array([[b'OPENSTACK_NET', b'EXT_SERVER',...
This is incorrect since it yields an array of array! The result should be
Any thoughts ? I've been trying to change the shape parameter of FixedLenFeature, with no success.
Ok, seems it's the dataset.batch command that created this strange behavior. Removed it, and it works fine now !

How to read (decode) tfrecords with tf.data API

I have a custom dataset, that I then stored as tfrecord, doing
# toy example data
label = np.asarray([[1,2,3],
[4,5,6]]).reshape(2, 3, -1)
sample = np.stack((label + 200).reshape(2, 3, -1))
def bytes_feature(values):
    """Returns a TF-Feature of bytes.

    Args:
      values: A string.

    Returns:
      A TF-Feature.
    """
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values]))
def labeled_image_to_tfexample(sample_binary_string, label_binary_string):
return tf.train.Example(features=tf.train.Features(feature={
'sample/image': bytes_feature(sample_binary_string),
'sample/label': bytes_feature(label_binary_string)
def _write_to_tf_record():
    """Encode the toy ``sample`` and ``label`` arrays as PNG and store them.

    Both arrays are fed through ``tf.image.encode_png`` and the resulting
    binary strings are wrapped by ``labeled_image_to_tfexample`` for the
    writer opened on "./toy.tfrecord".
    """
    with tf.Graph().as_default():
        image_placeholder = tf.placeholder(dtype=tf.uint16)
        encoded_image = tf.image.encode_png(image_placeholder)

        label_placeholder = tf.placeholder(dtype=tf.uint16)
        # Encode the label placeholder itself. Encoding image_placeholder here
        # would store the image twice and silently drop the label.
        encoded_label = tf.image.encode_png(label_placeholder)

        with tf.python_io.TFRecordWriter("./toy.tfrecord") as writer:
            with tf.Session() as sess:
                feed_dict = {image_placeholder: sample,
                             label_placeholder: label}

                # Encode image and label as binary strings to be written to tf_record
                image_string, label_string = sess.run(
                    fetches=(encoded_image, encoded_label),
                    feed_dict=feed_dict)

                # Define structure of what is going to be written
                file_structure = labeled_image_to_tfexample(image_string, label_string)

                # NOTE(review): the snippet ended before anything was written;
                # this write is the presumed missing step -- confirm.
                writer.write(file_structure.SerializeToString())
However I cannot read it. First I tried (based on http://www.machinelearninguru.com/deep_learning/tensorflow/basics/tfrecord/tfrecord.html , https://medium.com/coinmonks/storage-efficient-tfrecord-for-images-6dc322b81db4 and https://medium.com/mostly-ai/tensorflow-records-what-they-are-and-how-to-use-them-c46bc4bbb564)
def read_tfrecord_low_level():
data_path = "./toy.tfrecord"
filename_queue = tf.train.string_input_producer([data_path], num_epochs=1)
reader = tf.TFRecordReader()
_, raw_records = reader.read(filename_queue)
decode_protocol = {
'sample/image': tf.FixedLenFeature((), tf.int64),
'sample/label': tf.FixedLenFeature((), tf.int64)
enc_example = tf.parse_single_example(raw_records, features=decode_protocol)
recovered_image = enc_example["sample/image"]
recovered_label = enc_example["sample/label"]
return recovered_image, recovered_label
I also tried variations casting enc_example and decoding it, such as in "Unable to read from Tensorflow tfrecord file". However, when I try to evaluate them, my Python session just freezes and gives no output or traceback.
Then I tried using eager execution to see what is happening, but apparently it is only compatible with tf.data API. However as far as I understand transformations on tf.data API are made on the whole dataset. https://www.tensorflow.org/api_guides/python/reading_data mentions that a decode function must be written, but doesn't give an example on how to do that. All the tutorials I have found are made for TFRecordReader (which doesn't work for me).
Any help (pinpointing what I am doing wrong/ explaining what is happening/ indications on how to decode tfrecords with tf.data API) is highly appreciated.
According to https://www.youtube.com/watch?v=4oNdaQk0Qv4 and https://www.youtube.com/watch?v=uIcqeP7MFH0 tf.data is the best way to create input pipelines, so I am highly interested in learning that way.
Thanks in advance!
I am not sure why storing the encoded png causes the evaluation to not work, but here is a possible way of working around the problem. Since you mentioned that you would like to use the tf.data way of creating input pipelines, I'll show how to use it with your toy example:
label = np.asarray([[1,2,3],
[4,5,6]]).reshape(2, 3, -1)
sample = np.stack((label + 200).reshape(2, 3, -1))
First, the data has to be saved to the TFRecord file. The difference from what you did is that the image is not encoded to png.
def _bytes_feature(value):
    """Wrap one bytes value in a ``tf.train.Feature`` holding a one-element ``BytesList``."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)
writer = tf.python_io.TFRecordWriter("toy.tfrecord")
example = tf.train.Example(features=tf.train.Features(feature={
'label_raw': _bytes_feature(tf.compat.as_bytes(label.tostring())),
'sample_raw': _bytes_feature(tf.compat.as_bytes(sample.tostring()))}))
What happens in the code above is that the arrays are turned into strings (1d objects) and then stored as bytes features.
Then, to read the data back using the tf.data.TFRecordDataset and tf.data.Iterator class:
filename = 'toy.tfrecord'
# Create a placeholder that will contain the name of the TFRecord file to use
data_path = tf.placeholder(dtype=tf.string, name="tfrecord_file")
# Create the dataset from the TFRecord file
dataset = tf.data.TFRecordDataset(data_path)
# Use the map function to read every sample from the TFRecord file (_read_from_tfrecord is shown below)
dataset = dataset.map(_read_from_tfrecord)
# Create an iterator object that enables you to access all the samples in the dataset
iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes)
label_tf, sample_tf = iterator.get_next()
# Similarly to tf.Variables, the iterators have to be initialised
iterator_init = iterator.make_initializer(dataset, name="dataset_init")
with tf.Session() as sess:
# Initialise the iterator passing the name of the TFRecord file to the placeholder
sess.run(iterator_init, feed_dict={data_path: filename})
# Obtain the images and labels back
read_label, read_sample = sess.run([label_tf, sample_tf])
The function _read_from_tfrecord() is:
def _read_from_tfrecord(example_proto):
    """Parse one serialized Example and rebuild the label and sample arrays.

    Args:
        example_proto: a serialized ``tf.train.Example`` carrying the string
            features 'label_raw' and 'sample_raw'.

    Returns:
        A ``(label_restored, sample_restored)`` pair: both byte strings
        decoded as int64 and reshaped with the hard-coded shape [2, 3, -1].
    """
    feature = {
        'label_raw': tf.FixedLenFeature([], tf.string),
        'sample_raw': tf.FixedLenFeature([], tf.string),
    }
    features = tf.parse_example([example_proto], features=feature)

    # Since the arrays were stored as strings, they are now 1d
    # NOTE(review): tf.int64 assumes the arrays were written as int64 --
    # confirm against the dtype used on the writing side.
    label_1d = tf.decode_raw(features['label_raw'], tf.int64)
    sample_1d = tf.decode_raw(features['sample_raw'], tf.int64)

    # In order to make the arrays in their original shape, they have to be reshaped.
    label_restored = tf.reshape(label_1d, tf.stack([2, 3, -1]))
    sample_restored = tf.reshape(sample_1d, tf.stack([2, 3, -1]))

    return label_restored, sample_restored
Instead of hard-coding the shape [2, 3, -1], you could also store it in the TFRecord file, but for simplicity I didn't do that.
I made a little gist with a working example.
Hope this helps!

TFRecords for embedded text data

For a project at Uni, I'm working on the implementation of a Question Answering (bAbI dataset Task 5 at the moment, see https://research.fb.com/downloads/babi/) system with Neural Nets in TensorFlow, and I want to use TFRecords for my Input Pipeline.
My idea is that one Example in TFRecords terms should consist of the context for the question, the question itself, the answer, and the supporting sentence number (int which points to the most important sentence in the context to be able to answer the question). Here is how I've defined the function:
def make_example(context, question, answer, support):
ex = tf.train.SequenceExample()
fl_context = ex.feature_lists.feature_list["context"]
fl_question = ex.feature_lists.feature_list["question"]
fl_answer = ex.feature_lists.feature_list["answer"]
for token in context:
for qWord in question:
for ansWord in answer:
return ex
However, before passing the context, question, and answer, I want to embed the words and represent them by their GloVe vectors, i.e. by a (m,d) matrix, where m is the number of tokens in the sentence, and d is the number of dimensions each word vector has. This seems not to be handled well by my make_example function as I get:
TypeError: (array([[ -9.58490000e-01, 1.73210000e-01,
-5.61450000e-01, -1.21440000e-01, 1.54350000e+00,
-1.28930000e+00, -9.77790000e-01, -1.35480000e-01,
-6.06930000e-01, -1.37810000e+00, 6.33470000e-01,
1.33160000e-01, 2.46320000e-01, 6.60260000e-01,
-4.46130000e-02, 4.09510000e-01, -7.61670000e-01,
4.67530000e-01, -6.67810000e-01, 2.99850000e-01,
-2.74810000e-01, -5.47990000e-01, -8.56820000e-01,
5.30880000e-02, -2.01700000e+00, 7.48530000e-01,
-1.27830000e-01, 1.32050000e-01, -2.19450000e-01,
2.29830000e+00, -3.17680000e-01, -8.64940000e-01,
-1.08630000e-01, -8.13770000e-02, -7.03420000e-01,
4.60000000e-01, -3.34730000e-01, 4.37030000e-02,
-7.55080000e-01, -6.89710000e-01, 7.14380000e-01,
-8.35950000e-02, 1.58620000e-02, -5.23850000e-01,
1.72520000e-01, -4.98740000e-01, 2.30810000e-01,
-3.64690000e-01, 1.5 has type <class 'tuple'>, but expected one of:
(<class 'int'>,)
Pointing to the fl_context.feature.add().int64_list.value.append(token) above... Could someone point out where I've misunderstood the concept of TFRecords, and give me advice on how to approach the problem?
I've searched a lot for learning materials, but usually the examples on TFRecords are with image data. So far my references are https://medium.com/@TalPerry/getting-text-into-tensorflow-with-the-dataset-api-ffb832c8bec6 and http://web.stanford.edu/class/cs20si/lectures/notes_09.pdf .
Thanks a lot in advance!
The solution to my question can be found here: https://github.com/simonada/q-and-a-tensorflow/blob/master/src/Q%26A%20with%20TF-%20TFRecords%20and%20Eager%20Execution.ipynb
My approach is as follows:
Store the texts into a csv file: per row (context, question, answer)
Define a function to convert sequence to tf_example, in my case
def sequence_to_tf_example(context, question, answer):
context_ids= vectorize(context, False, word_to_index)
question_ids= vectorize(question, False, word_to_index)
answer_ids= vectorize(answer, True, word_to_index)
ex = tf.train.SequenceExample()
context_tokens = ex.feature_lists.feature_list["context"]
question_tokens = ex.feature_lists.feature_list["question"]
answer_tokens = ex.feature_lists.feature_list["answer"]
for token in context_ids:
for token in question_ids:
for token in answer_ids:
return ex
Define write functions
def write_example_to_tfrecord(context, question, answer, tfrecord_file, writer):
example= sequence_to_tf_example(context, question, answer)
def write_data_to_tf_record(filename):
    """Convert ``<filename>.csv`` into ``<filename>.tfrecords``.

    The first CSV row is skipped as a header. For every remaining row, the
    first three columns are passed to ``write_example_to_tfrecord`` as
    (context, question, answer).

    Args:
        filename: path of the CSV file without its '.csv' extension; the
            same stem is used for the '.tfrecords' output.
    """
    file_csv = filename + '.csv'
    file_tfrecords = filename + '.tfrecords'
    # newline='' lets the csv module handle line endings inside quoted fields.
    # The writer is a context manager so it is closed (and its records
    # flushed) even if a row fails to convert.
    with open(file_csv, newline='') as csvfile, \
            tf.python_io.TFRecordWriter(file_tfrecords) as writer:
        readCSV = csv.reader(csvfile, delimiter=',')
        next(readCSV, None)  # skip header; tolerate a completely empty file
        for row in readCSV:
            write_example_to_tfrecord(row[0], row[1], row[2], file_tfrecords, writer)
Define read functions
def read_from_tfrecord(ex):
sequence_features = {
"context": tf.FixedLenSequenceFeature([], dtype=tf.int64),
"question": tf.FixedLenSequenceFeature([], dtype=tf.int64),
"answer": tf.FixedLenSequenceFeature([], dtype=tf.int64)
# Parse the example (returns a dictionary of tensors)
_, sequence_parsed = tf.parse_single_sequence_example(
return {"context": sequence_parsed['context'], "question": sequence_parsed['question'],
"answer": sequence_parsed['answer']}
Create dataset
def make_dataset(path, batch_size=128):
Makes a Tensorflow dataset that is shuffled, batched and parsed.
# Read a tf record file. This makes a dataset of raw TFRecords
dataset = tf.data.TFRecordDataset([path])
# Apply/map the parse function to every record. Now the dataset is a bunch of dictionaries of Tensors
dataset = dataset.map(read_from_tfrecord)
#Shuffle the dataset
dataset = dataset.shuffle(buffer_size=10000)
# specify padding for each tensor seperatly
dataset = dataset.padded_batch(batch_size, padded_shapes={
"context": tf.TensorShape([None]),
"question": tf.TensorShape([None]),
"answer": tf.TensorShape([None])
return dataset

TensorFlow input function for reading sparse data (in libsvm format)

I'm new to TensorFlow and trying to use the Estimator API for some simple classification experiments. I have a sparse dataset in libsvm format. The following input function works for small datasets:
def libsvm_input_function(file):
def input_function():
indexes_raw = []
indicators_raw = []
values_raw = []
labels_raw = []
for line in open(file, "r"):
data = line.split(" ")
label = int(data[0])
for fea in data[1:]:
id, value = fea.split(":")
indexes = tf.SparseTensor(indices=indexes_raw,
dense_shape=[i, num_features])
values = tf.SparseTensor(indices=indexes_raw,
dense_shape=[i, num_features])
labels = tf.constant(labels_raw, dtype=tf.int32)
return {"indexes": indexes, "values": values}, labels
return input_function
However, for a dataset of a few GB size I get the following error:
ValueError: Cannot create a tensor proto whose content is larger than 2GB.
How can I avoid this error? How should I write an input function to read medium-sized sparse datasets (in libsvm format)?
When using an Estimator with libsvm input data, you can create a dense index list and a dense value list, then use feature_column.categorical_column_with_identity and feature_column.weighted_categorical_column to create the feature column, and finally pass the feature columns to the estimator. If the length of your input features is variable, you can use padded_batch to handle it.
Here is some code:
## here is input_fn
def input_fn(data_dir, is_training, batch_size):
def parse_csv(value):
## here some process to create feature_indices list, feature_values list and labels
return {"index": feature_indices, "value": feature_values}, labels
dataset = tf.data.Dataset.from_tensor_slices(your_filenames)
ds = dataset.flat_map(
lambda f: tf.data.TextLineDataset(f).map(parse_csv)
ds = ds.padded_batch(batch_size, ds.output_shapes, padding_values=(
"index": tf.constant(-1, dtype=tf.int32),
"value": tf.constant(0, dtype=tf.float32),
tf.constant(False, dtype=tf.bool)
return ds.repeat().prefetch(batch_size)
## create feature column
def build_model_columns():
    """Build the sparse and dense feature columns for the 'index'/'value' inputs.

    Returns:
        A pair of one-element lists: the weighted categorical column, and the
        embedding column built on top of it.
    """
    identity_column = tf.feature_column.categorical_column_with_identity(
        key='index', num_buckets=your_feature_dim)
    weighted_column = tf.feature_column.weighted_categorical_column(
        categorical_column=identity_column, weight_feature_key='value')
    embedded_column = tf.feature_column.embedding_column(
        weighted_column, your_embedding_dim)
    return [weighted_column], [embedded_column]
## when created feature column, you can put them into estimator, eg. put dense_columns into DNN, and sparse_columns into linear model.
## for export savedmodel
def raw_serving_input_fn():
    """Build a raw serving input receiver fn over 'index' and 'value' placeholders.

    Returns:
        The result of ``build_raw_serving_input_receiver_fn`` for a spec with
        an int32 'index' placeholder and a float32 'value' placeholder, both
        of shape [None, None].
    """
    index_input = tf.placeholder(shape=[None, None], dtype=tf.int32)
    value_input = tf.placeholder(shape=[None, None], dtype=tf.float32)
    return tf.estimator.export.build_raw_serving_input_receiver_fn(
        {"index": index_input, "value": value_input})
Alternatively, you can create your own custom feature column, like this: _SparseArrayCategoricalColumn
I have been using tensorflow.contrib.libsvm. Here's an example (I am using eager execution with generators):
import os
import tensorflow as tf
import tensorflow.contrib.libsvm as libsvm
def all_libsvm_files(folder_path):
    """Yield the joined path of every entry in *folder_path* ending in ".libsvm".

    Entries are produced in the order ``os.listdir`` reports them.
    """
    matching_names = (
        name for name in os.listdir(folder_path) if name.endswith(".libsvm")
    )
    for name in matching_names:
        yield os.path.join(folder_path, name)
def load_libsvm_dataset(path_to_folder):
    """Create a ``tf.data.TextLineDataset`` over the ".libsvm" files in *path_to_folder*."""
    libsvm_paths = list(all_libsvm_files(path_to_folder))
    return tf.data.TextLineDataset(libsvm_paths)
def libsvm_iterator(path_to_folder):
dataset = load_libsvm_dataset(path_to_folder)
iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()
yield libsvm.decode_libsvm(tf.reshape(next_element, (1,)),
libsvm_iterator gives you a feature-label pair back on each iteration, from multiple files inside a folder that you specify.