Tensorflow export strategy input function for bow_encoder

I am trying to implement a serving function to be able to make predictions on a saved text classification model. As I understand it, the goal is to create a function which will do almost exactly the same as train_input_fn/eval_input_fn? I have the following implementation of those functions:
def generate_training_input_fn(filename):
    train_raw = pd.read_csv(filename[0], header=None)
    x_train = train_raw.iloc[:, 1]
    y_train = train_raw.iloc[:, 0]
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    n_words = len(vocab_processor.vocabulary_)
    #print('Total words: %d' % n_words)
    # Save a vocabulary list to file. Needed by the serving_input_fn for exporting the model.
    with open('vocab_processor.pickle', 'wb') as f:
        pickle.dump(vocab_processor, f)
    features = tf.contrib.layers.bow_encoder(
        x_train, vocab_size=n_words, embed_dim=EMBEDDING_SIZE)
    return features, y_train
def generate_eval_input_fn(filename):
    eval_raw = pd.read_csv(filename[0], header=None)
    x_eval = eval_raw.iloc[:, 1]
    y_eval = eval_raw.iloc[:, 0]
    with open('vocab_processor.pickle', 'rb') as f:
        vocab_processor = pickle.load(f)
    n_words = len(vocab_processor.vocabulary_)
    x_eval = np.array(list(vocab_processor.transform(x_eval)))
    features = tf.contrib.layers.bow_encoder(
        x_eval, vocab_size=n_words, embed_dim=EMBEDDING_SIZE)
    #labels = tf.one_hot(y_eval, 15, 1, 0)
    return features, y_eval
There is a comment, "Save a vocabulary list to file. Needed by the serving_input_fn for exporting the model", but no such function is implemented, and the Experiment is created with export_strategies=None (I got this code from another coder). I've tried to implement serving_input_fn like in the census tutorial:
def csv_serving_input_fn():
    csv_row = tf.placeholder(shape=[None], dtype=tf.string)
    features = parse_csv(csv_row)
    return tf.contrib.learn.InputFnOps(features, None, {'csv_row': csv_row})
but I have no idea how to implement parse_csv, since my generate_eval_input_fn takes the whole CSV as a pandas DataFrame.
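For reference, here is a minimal sketch of what such a serving function might look like in this setup (my own assumption, not the original author's code): instead of parsing CSV rows, accept raw text strings, load the pickled vocab_processor at export time, run its transform inside a tf.py_func, and apply the same bow_encoder as in the input functions above. MAX_DOCUMENT_LENGTH, EMBEDDING_SIZE and the other identifiers mirror the code above; the function name text_serving_input_fn is hypothetical.
def text_serving_input_fn():
    with open('vocab_processor.pickle', 'rb') as f:
        vocab_processor = pickle.load(f)
    n_words = len(vocab_processor.vocabulary_)

    # Raw text strings arrive at prediction time.
    text = tf.placeholder(shape=[None], dtype=tf.string)

    def _transform(raw):
        # Map raw strings to word-id vectors using the saved vocabulary.
        return np.array(list(vocab_processor.transform(
            [s.decode('utf-8') for s in raw])), dtype=np.int64)

    word_ids = tf.py_func(_transform, [text], tf.int64)
    word_ids.set_shape([None, MAX_DOCUMENT_LENGTH])

    features = tf.contrib.layers.bow_encoder(
        word_ids, vocab_size=n_words, embed_dim=EMBEDDING_SIZE)
    return tf.contrib.learn.InputFnOps(features, None, {'text': text})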

Related

Multiple inputs of keras model with tf.data.Dataset.from_generator in Tensorflow 2

I am trying to implement a model in keras that will have multiple inputs:
image (200x200)
some numbers (1x50)
three 1d signals (1x50000, 2x100000)
To feed that model, I want to write a generator to use with tf.data.Dataset.from_generator. From the docs of from_generator, it's not clear to me how I should provide its parameters output_types and output_shapes. Can anyone help me with this?
I had a similar issue, and it took me many tries to get the structure right for those inputs. Here's an example of a network with 3 inputs and 2 outputs, complete to the .fit call.
The following works in tensorflow 2.1.0
import tensorflow as tf
import numpy as np

def generator(N=10):
    """
    Returns tuple of (inputs, outputs) where
    inputs  = (inp1, inp2, inp3)
    outputs = (out1, out2)
    """
    dt = np.float32
    for i in range(N):
        inputs = (np.random.rand(N, 3, 3, 1).astype(dt),
                  np.random.rand(N, 3, 3, 1).astype(dt),
                  np.random.rand(N, 3, 3, 1).astype(dt))
        outputs = (np.random.rand(N, 3, 3, 1).astype(dt),
                   np.random.rand(N, 3, 3, 1).astype(dt))
        yield inputs, outputs

# Create dataset from generator
types = ((tf.float32, tf.float32, tf.float32),
         (tf.float32, tf.float32))
shapes = (([None, 3, 3, 1], [None, 3, 3, 1], [None, 3, 3, 1]),
          ([None, 3, 3, 1], [None, 3, 3, 1]))
data = tf.data.Dataset.from_generator(generator,
                                      output_types=types,
                                      output_shapes=shapes)

# Define a model
inp1 = tf.keras.Input(shape=(3, 3, 1), name='inp1')
inp2 = tf.keras.Input(shape=(3, 3, 1), name='inp2')
inp3 = tf.keras.Input(shape=(3, 3, 1), name='inp3')
out1 = tf.keras.layers.Conv2D(1, kernel_size=3, padding='same')(inp1)
out2 = tf.keras.layers.Conv2D(1, kernel_size=3, padding='same')(inp2)
model = tf.keras.Model(inputs=[inp1, inp2, inp3], outputs=[out1, out2])
model.compile(loss=['mse', 'mse'])

# Train
model.fit(data)
So assuming you have a generator that is similar to this mock:
def dummy_generator():
    number_of_records = 100
    for i in range(number_of_records):
        an_image = tf.random.uniform((200, 200, 3))
        some_numbers = tf.random.uniform((50,))
        signal1 = tf.random.uniform((50000,))
        signal2 = tf.random.uniform((100000,))
        signal3 = tf.random.uniform((100000,))
        yield an_image, some_numbers, signal1, signal2, signal3
each record is of datatype float32 so the output types are easy:
out_types = (tf.float32, tf.float32, tf.float32, tf.float32, tf.float32)
for the output shapes we just list the shapes in the same order:
out_shapes = ((200,200,3), (50,), (50000,), (100000,), (100000,))
so now we can just call from_generator:
ds = tf.data.Dataset.from_generator(dummy_generator,
                                    output_types=out_types,
                                    output_shapes=out_shapes)
model.fit([input_1, input_2, input_3], y, epochs=EPOCHS)
You've got to have n (3 in the case above) input layers in your model.
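One detail worth spelling out (my own illustration, not from the answers above): model.fit can consume the dataset directly only if the generator nests all inputs in one tuple and the labels separately, as in the first answer. A rough sketch for the shapes in the question, with a made-up scalar label and a hypothetical paired_generator:
def paired_generator():
    for _ in range(100):
        an_image = tf.random.uniform((200, 200, 3))
        some_numbers = tf.random.uniform((50,))
        signal1 = tf.random.uniform((50000,))
        signal2 = tf.random.uniform((100000,))
        signal3 = tf.random.uniform((100000,))
        label = tf.random.uniform((1,))
        # Inputs nested in one tuple, label kept separate, so model.fit(ds) works directly.
        yield (an_image, some_numbers, signal1, signal2, signal3), label

out_types = ((tf.float32,) * 5, tf.float32)
out_shapes = (((200, 200, 3), (50,), (50000,), (100000,), (100000,)), (1,))
ds = tf.data.Dataset.from_generator(paired_generator,
                                    output_types=out_types,
                                    output_shapes=out_shapes).batch(4)
# model.fit(ds, epochs=EPOCHS)  # the model needs five matching input layers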

TensorFlow - Error when using interleave or parallel_interleave

I'm using the tf.data.Dataset API of TF 1.12, as in this Q&A, to read several .h5 files (one pre-saved batch per file) from a directory.
I first made a generator:
class generator_yield:
    def __init__(self, file):
        self.file = file

    def __call__(self):
        with h5py.File(self.file, 'r') as f:
            yield f['X'][:], f['y'][:]
Then I make a list of filenames and pass them into a Dataset:
def _fnamesmaker(dir, mode='h5'):
    fnames = []
    for dirpath, _, filenames in os.walk(dir):
        for fname in filenames:
            if fname.endswith(mode):
                fnames.append(os.path.abspath(os.path.join(dirpath, fname)))
    return fnames

fnames = _fnamesmaker('./')
len_fnames = len(fnames)
fnames = tf.data.Dataset.from_tensor_slices(fnames)
Apply the interleave method of Dataset:
# handle multiple files
ds = fnames.interleave(
    lambda filename: tf.data.Dataset.from_generator(
        generator_yield(filename), output_types=(tf.float32, tf.float32),
        output_shapes=(tf.TensorShape([100, 100, 1]), tf.TensorShape([100, 100, 1]))),
    cycle_length=len_fnames)
ds = ds.batch(5).shuffle(5).prefetch(5)

# init iterator
it = ds.make_initializable_iterator()
init_op = it.initializer
X_it, y_it = it.get_next()
Model:
# model
with tf.name_scope("Conv1"):
    W = tf.get_variable("W", shape=[3, 3, 1, 1],
                        initializer=tf.contrib.layers.xavier_initializer())
    b = tf.get_variable("b", shape=[1], initializer=tf.contrib.layers.xavier_initializer())
    layer1 = tf.nn.conv2d(X_it, W, strides=[1, 1, 1, 1], padding='SAME') + b
    logits = tf.nn.relu(layer1)

loss = tf.reduce_mean(tf.losses.mean_squared_error(labels=y_it, predictions=logits))
train_op = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(loss)
Start session:
with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), init_op])
    while True:
        try:
            data = sess.run(train_op)
            print(data.shape)
        except tf.errors.OutOfRangeError:
            print('done.')
            break
The Error looks like:
TypeError: expected str, bytes or os.PathLike object, not Tensor
The error occurs at the __init__ method of the generator. Apparently, when interleave is applied, it is a Tensor that gets passed through to the generator.
You cannot run the dataset object directly through sess.run. You have to define an iterator and get the next element. Try doing something like:
next_elem = files.make_one_shot_iterator().get_next()
data = sess.run(next_elem)
You should be able to get your tensors.
According to this post, my case won't benefit in performance from parallel_interleave:
...have a transformation that transforms each element of a source dataset into multiple elements into the destination dataset...
It's more relevant in the typical classification problem with data (dog, cat, ...) saved in separate directories. We have a segmentation problem here, which means that a label has the same dimensions as the input image. All data are stored in one directory, and each .h5 file contains an image and its labels (masks). Here, a simple map with num_parallel_calls is sufficient.
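A minimal sketch of that map-based approach (my own illustration in TF 1.x style, assuming fnames is still the plain Python list of paths returned by _fnamesmaker and each .h5 file holds matching 'X' and 'y' arrays of shape (100, 100, 1)):
def _load_h5(path):
    # path arrives as bytes inside tf.py_func
    with h5py.File(path.decode('utf-8'), 'r') as f:
        return f['X'][:].astype('float32'), f['y'][:].astype('float32')

def _set_shapes(x, y):
    # py_func loses static shape information, so restore it for downstream layers
    x.set_shape([100, 100, 1])
    y.set_shape([100, 100, 1])
    return x, y

ds = tf.data.Dataset.from_tensor_slices(fnames)
ds = ds.map(lambda p: tf.py_func(_load_h5, [p], (tf.float32, tf.float32)),
            num_parallel_calls=4)
ds = ds.map(_set_shapes).batch(5).prefetch(5)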

Reading multiple feature vectors from one TFRecord example in Tensorflow

I know how to store one feature per example inside a tfrecord file and then read it by using something like this:
import tensorflow as tf
import numpy as np
import os

# This is used to parse an example from tfrecords
def parse(serialized_example):
    features = tf.parse_single_example(
        serialized_example,
        features={
            "label": tf.FixedLenFeature([], tf.string, default_value=""),
            "feat": tf.FixedLenFeature([], tf.string, default_value="")
        })
    feat = tf.decode_raw(features['feat'], tf.float64)
    label = tf.decode_raw(features['label'], tf.int64)
    return feat, label
################# Generate data
cwd = os.getcwd()
numdata = 10
with tf.python_io.TFRecordWriter(os.path.join(cwd, 'data.tfrecords')) as writer:
    for i in range(numdata):
        feat = np.random.randn(2)
        label = np.array(np.random.randint(0, 9))
        featb = feat.tobytes()
        labelb = label.tobytes()
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'feat': tf.train.Feature(bytes_list=tf.train.BytesList(value=[featb])),
                'label': tf.train.Feature(bytes_list=tf.train.BytesList(value=[labelb])),
            }))
        writer.write(example.SerializeToString())
        print('wrote f {}, l {}'.format(feat, label))
print('Done writing! Start reading and printing data')
################# Read data
filename = ['data.tfrecords']
dataset = tf.data.TFRecordDataset(filename).map(parse)
dataset = dataset.batch(100)
iterator = dataset.make_initializable_iterator()
feat, label = iterator.get_next()

with tf.Session() as sess:
    sess.run(iterator.initializer)
    try:
        while True:
            example = sess.run((feat, label))
            print(example)
    except tf.errors.OutOfRangeError:
        pass
What do I do in the case where each example has multiple feature vectors and labels in it? For example, in the above code, suppose feat was stored as a 2D array. I still want to do the same thing as before, which is to train a DNN with one feature per label, but each example in the tfrecords file has multiple features and multiple labels. This should be simple, but I'm having trouble unpacking multiple features in TensorFlow using tfrecords.
Firstly, note that np.ndarray.tobytes() flattens out multi-dimensional arrays into a list, i.e.
feat = np.random.randn(N, 2)
reshaped = np.reshape(feat, (N*2,))
feat.tobytes() == reshaped.tobytes()  # True
So, if you have an N*2 array that's saved as bytes in TFRecord format, you have to reshape it after parsing.
If you do that, you can unbatch the elements of a tf.data.Dataset so that each iteration gives you one feature and one label. Your code should be as follows:
# This is used to parse an example from tfrecords
def parse(serialized_example):
    features = tf.parse_single_example(
        serialized_example,
        features={
            "label": tf.FixedLenFeature([], tf.string, default_value=""),
            "feat": tf.FixedLenFeature([], tf.string, default_value="")
        })
    feat = tf.decode_raw(features['feat'], tf.float64)   # array of shape (N*2, )
    feat = tf.reshape(feat, (N, 2))                      # array of shape (N, 2)
    label = tf.decode_raw(features['label'], tf.int64)   # array of shape (N, )
    return feat, label

################# Generate data
cwd = os.getcwd()
numdata = 10
with tf.python_io.TFRecordWriter(os.path.join(cwd, 'data.tfrecords')) as writer:
    for i in range(numdata):
        feat = np.random.randn(N, 2)
        label = np.array(np.random.randint(0, 9, N))
        featb = feat.tobytes()
        labelb = label.tobytes()
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'feat': tf.train.Feature(bytes_list=tf.train.BytesList(value=[featb])),
                'label': tf.train.Feature(bytes_list=tf.train.BytesList(value=[labelb])),
            }))
        writer.write(example.SerializeToString())
        print('wrote f {}, l {}'.format(feat, label))
print('Done writing! Start reading and printing data')

################# Read data
filename = ['data.tfrecords']
dataset = tf.data.TFRecordDataset(filename).map(parse).apply(tf.contrib.data.unbatch())
... etc
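For completeness, a sketch of how the rest of the reading loop could look after the unbatch (my own continuation, not part of the original answer): each unbatched element is one (2,) feature with one scalar label, so batching by 100 again yields feature batches of shape (100, 2) and label batches of shape (100,) (the last batch may be smaller).
dataset = dataset.batch(100)
iterator = dataset.make_initializable_iterator()
feat, label = iterator.get_next()

with tf.Session() as sess:
    sess.run(iterator.initializer)
    try:
        while True:
            f, l = sess.run((feat, label))
            print(f.shape, l.shape)   # e.g. (100, 2) (100,)
    except tf.errors.OutOfRangeError:
        pass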

Making prediction on Iris dataset

I have basic classification code for the Iris dataset.
import tensorflow as tf
import pandas as pd

COLUMN_NAMES = [
    'SepalLength',
    'SepalWidth',
    'PetalLength',
    'PetalWidth',
    'Species'
]

# Import training dataset
training_dataset = pd.read_csv('iris_training.csv', names=COLUMN_NAMES, header=0)
train_x = training_dataset.iloc[:, 0:4]
train_y = training_dataset.iloc[:, 4]

# Import testing dataset
test_dataset = pd.read_csv('iris_test.csv', names=COLUMN_NAMES, header=0)
test_x = test_dataset.iloc[:, 0:4]
test_y = test_dataset.iloc[:, 4]

columns_feat = [
    tf.feature_column.numeric_column(key='SepalLength'),
    tf.feature_column.numeric_column(key='SepalWidth'),
    tf.feature_column.numeric_column(key='PetalLength'),
    tf.feature_column.numeric_column(key='PetalWidth')
]

classifier = tf.estimator.DNNClassifier(
    feature_columns=columns_feat,
    # Two hidden layers of 10 nodes each.
    hidden_units=[10, 10],
    # The model is classifying 3 classes
    n_classes=3)

def train_function(inputs, outputs, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((dict(inputs), outputs))
    dataset = dataset.shuffle(1000).repeat().batch(batch_size)
    return dataset.make_one_shot_iterator().get_next()

# Train the Model.
classifier.train(
    input_fn=lambda: train_function(train_x, train_y, 100),
    steps=1000)

def evaluation_function(attributes, classes, batch_size):
    attributes = dict(attributes)
    if classes is None:
        inputs = attributes
    else:
        inputs = (attributes, classes)
    dataset = tf.data.Dataset.from_tensor_slices(inputs)
    assert batch_size is not None, "batch_size must not be None"
    dataset = dataset.batch(batch_size)
    return dataset.make_one_shot_iterator().get_next()

# Evaluate the model.
eval_result = classifier.evaluate(
    input_fn=lambda: evaluation_function(test_x, test_y, 100))
I evaluate the result, but how can I make a prediction on my data? Right now I only get console info of loss, epochs and accuracy. For example, if I have everything except the species: I want to give my own sepal length etc. so I can get a prediction of the species as another variable. Do I have to create variables like pred_x or pred_y (pandas DataFrames) and then put them into eval_result?
Is that what you mean? For example: new_samples = np.array([[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=np.float32). If you want to make predictions on new data like this, you can refer to this code: TensorFlow-Iris-Classification.
Like all estimator classes, the DNNClassifier class has a predict method that makes real-world predictions. The documentation is here.
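A minimal sketch of what that could look like with the code above (my own illustration; the sample values are made up), reusing evaluation_function with classes=None so the input function yields features only:
import numpy as np

pred_x = {
    'SepalLength': np.array([6.4, 5.8]),
    'SepalWidth': np.array([3.2, 3.1]),
    'PetalLength': np.array([4.5, 5.0]),
    'PetalWidth': np.array([1.5, 1.7]),
}

predictions = classifier.predict(
    input_fn=lambda: evaluation_function(pred_x, None, 100))

for pred in predictions:
    class_id = pred['class_ids'][0]
    print('Predicted class: {}, probability: {:.3f}'.format(
        class_id, pred['probabilities'][class_id]))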

Does tf.data.Dataset support generating a dictionary structure?

The following is a piece of code from [https://www.tensorflow.org/programmers_guide/datasets]. In this example, the map function calls a user-defined function to read the data, and in that map call we need to set the output types to [tf.uint8, label.dtype].
import cv2

# Use a custom OpenCV function to read the image, instead of the standard
# TensorFlow `tf.read_file()` operation.
def _read_py_function(filename, label):
    image_decoded = cv2.imread(filename.decode(), cv2.IMREAD_GRAYSCALE)
    return image_decoded, label

# Use standard TensorFlow operations to resize the image to a fixed shape.
def _resize_function(image_decoded, label):
    image_decoded.set_shape([None, None, None])
    image_resized = tf.image.resize_images(image_decoded, [28, 28])
    return image_resized, label

filenames = ["/var/data/image1.jpg", "/var/data/image2.jpg", ...]
labels = [0, 37, 29, 1, ...]

dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
dataset = dataset.map(
    lambda filename, label: tuple(tf.py_func(
        _read_py_function, [filename, label], [tf.uint8, label.dtype])))
dataset = dataset.map(_resize_function)
My question is, if we want _read_py_function() to output a Python dictionary, how do we set the output types? Is there a built-in data type such as tf.dict? For example:
def _read_py_function(filename):
    image_filename = filename[0]
    label_filename = filename[1]
    image_id = filename[2]
    image_age = filename[3]
    image_decoded = cv2.imread(image_filename, cv2.IMREAD_GRAYSCALE)
    label_decoded = cv2.imread(label_filename, cv2.IMREAD_GRAYSCALE)
    return {'image': image_decoded, 'label': label_decoded, 'id': image_id, 'age': image_age}
Then, how do we design the dataset.map() function?
Returning dicts inside the function called by tf.data.Dataset.map should work as expected.
Here is an example:
dataset = tf.data.Dataset.range(10)
dataset = dataset.map(lambda x: {'a': x, 'b': 2 * x})
dataset = dataset.map(lambda y: y['a'] + y['b'])
res = dataset.make_one_shot_iterator().get_next()

with tf.Session() as sess:
    for i in range(10):
        assert sess.run(res) == 3 * i
To add to the above answer, this also works:
dataset = tf.data.Dataset.range(10)
dataset = dataset.map(lambda x: {'a': x, 'b': 2 * x})
res = dataset.make_one_shot_iterator().get_next()

with tf.Session() as sess:
    for i in range(10):
        curr_res = sess.run(res)
        assert curr_res['a'] == i
        assert curr_res['b'] == 2 * i
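One caveat worth adding (my own note, not from the answers above): tf.py_func takes a flat list of output dtypes, so a function wrapped in it cannot return a dict directly. A common workaround is to return a tuple from the py_func and rebuild the dictionary in a second map; a rough sketch for the simpler image/label case from the question:
def _read_py_function(filename, label):
    image_decoded = cv2.imread(filename.decode(), cv2.IMREAD_GRAYSCALE)
    return image_decoded, label

dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
dataset = dataset.map(
    lambda filename, label: tuple(tf.py_func(
        _read_py_function, [filename, label], [tf.uint8, label.dtype])))
# Rebuild the dictionary structure with plain TensorFlow ops.
dataset = dataset.map(lambda image, label: {'image': image, 'label': label})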