Input pipeline using TensorFlow Dataset API and Pandas? - pandas

I am trying to create a TensorFlow Dataset that takes in a list of path names for CSV files and create batches of training data. First I create a parse function which uses Pandas to read the first n rows. I give this function as argument for the 'map' method in Dataset
def _get_data_for_dataset(file_name,rows=100):
print(file_name.decode())
df_input=pd.read_csv(os.path.join(folder_name, file_name.decode()),
usecols =['Wind_MWh','Actual_Load_MWh'],nrows = rows)
X_data = df_input.as_matrix()
X_data.astype('float32', copy=False)
return X_data
dataset = tf.data.Dataset.from_tensor_slices(file_names)
dataset = dataset2.map(lambda file_name: tf.py_func(_get_data_for_dataset,[file_name,100], tf.float64))
dataset= dataset.batch(2) #Create batches
iter = dataset.make_one_shot_iterator()
get_batch = iter.get_next()
with tf.Session() as sess:
print(sess.run(get_batch).shape)
The above code works but instead of producing a dataset with shape (200,2) it produces a dataset with shape (2, 100, 2). Please help.

I finally got the answer from Dataset API 'flat_map' method producing error for same code which works with 'map' method
I am posting the full code in case it may help others who want to use Pandas and Dataset API together.
folder_name = './data/power_data/'
file_names = os.listdir(folder_name)
def _get_data_for_dataset(file_name):
df_input=pd.read_csv(os.path.join(folder_name, file_name.decode()),
usecols=['Wind_MWh', 'Actual_Load_MWh'])
X_data = df_input.as_matrix()
return X_data.astype('float32', copy=False)
dataset = tf.data.Dataset.from_tensor_slices(file_names)
# Use `Dataset.from_tensor_slices()` to make a `Dataset` from the output of
# the `tf.py_func()` op.
dataset = dataset.flat_map(lambda file_name: tf.data.Dataset.from_tensor_slices(
tf.py_func(_get_data_for_dataset, [file_name], tf.float32)))
dataset = dataset.batch(100)
iter = dataset.make_one_shot_iterator()
get_batch = iter.get_next()
with tf.Session() as sess:
print(sess.run(get_batch))

Related

Generate and assemble model predictions for each stratified kfold test split

I would like to generate multiple test data splits using stratified KFold (skf) and then generate/assemble predictions for each of these test data splits (and hence all of the data) using a sklearn model. I am at a wits end on how to do this programmatically.
I have recaptured my code using a minimal data example below. Briefly, (after data import), I have a function that does the model fit and generates model predicted probabilities. Subsequently, I attempt to pass this function to each skf split of my data so as to generate and subsequently collate predicted probabilities for each row of my data. However, this step fails and generates a valueerror (boolean array expected). My code follows below:
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
#load data, assemble dataframe
iris = datasets.load_iris()
X = pd.DataFrame(iris.data[51:150, :], columns = ["sepal_length", "sepal_width",
"petal_length", "petal_width"])
y = pd.DataFrame(iris.target[51:150,], columns = ["target"])
df = pd.concat([X,y], axis = 1)
#instantiate logistic regression
log = LogisticRegression()
#modelling function
def train_model(train, test, fold):
X = df.drop("target", axis = 1)
y = df["target"]
X_train = train[X]
y_train = train[y]
X_test = test[X]
y_test = test[y]
#generate probability of class 1 predictions from logistic regression model fit
prob = log.fit(X_train, y_train).predict_proba(X_test)[:, 1]
return (prob)
#generate straified k-fold splits (2 used as example here)
skf = StratifiedKFold(n_splits = 2)
#generate and collate all predictions (for each row in df)
fold = 1
outputs = []
for train_index, test_index in skf.split(df, y):
train_df = df.loc[train_index,:]
test_df = df.loc[test_index,:]
output = train_model(train_df,test_df,fold) #generate model probabilities for X_test
in skf split
outputs.append(output) #append all model probabilities
fold = fold + 1
all_preds = pd.concat(outputs)
Can somebody please guide me to the solution that includes row index and its predicted probability?

How to get the labels from tensorflow dataset

ds_test = tf.data.experimental.make_csv_dataset(
file_pattern = "./dfj_test/part-*.csv.gz",
batch_size=batch_size, num_epochs=1,
#column_names=use_cols,
label_name='label_id',
#select_columns= select_cols,
num_parallel_reads=30, compression_type='GZIP',
shuffle_buffer_size=12800)
This is my tesetset during training. After completing the model, I want to zip the columns of predictions and labels for the df_test .
preds = model.predict(df_test)
Getting the predictions is quite simple, and it is of numpy array format. However, I don't know how to get the corresponding labels from the df_test.
I want to zip(preds, labels) for further analysis.
Any hint? Thanks.
(tf version 2.3.1)
You can map each example to return the field you want
# load some exemplary data
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
dataset = tf.data.experimental.make_csv_dataset(train_file_path, batch_size=100, num_epochs=1)
# get field by unbatching
labels_iterator= dataset.unbatch().map(lambda x: x['survived']).as_numpy_iterator()
labels = np.array(list(labels_iterator))
# get field by concatenating batches
labels_iterator= dataset.map(lambda x: x['survived']).as_numpy_iterator()
labels = np.concatenate(list(labels_iterator))

Multiple inputs of keras model with tf.data.Dataset.from_generator in Tensorflow 2

I am trying to implement a model in keras that will have multiple inputs:
image (200x200)
some numbers (1x50)
three 1d signals (1x50000, 2x100000)
To feed that model, I want to write a generator to use with tf.data.Dataset.from_generator. From the docs of from_generator, its not clear to me how I should provide its parameters output_types, output_shapes. Can anyone help me with this?
I had a similar issue, and it took me many tries to get the structure right for those inputs. Here's an example of a network with 3 inputs and 2 outputs, complete to the .fit call.
The following works in tensorflow 2.1.0
import tensorflow as tf
import numpy as np
def generator(N=10):
"""
Returns tuple of (inputs,outputs) where
inputs = (inp1,inp2,inp2)
outputs = (out1,out2)
"""
dt=np.float32
for i in range(N):
inputs = (np.random.rand(N,3,3,1).astype(dt),
np.random.rand(N,3,3,1).astype(dt),
np.random.rand(N,3,3,1).astype(dt))
outputs = (np.random.rand(N,3,3,1).astype(dt),
np.random.rand(N,3,3,1).astype(dt))
yield inputs,outputs
# Create dataset from generator
types = ( (tf.float32,tf.float32,tf.float32),
(tf.float32,tf.float32) )
shapes = (([None,3,3,1],[None,3,3,1],[None,3,3,1]),
([None,3,3,1],[None,3,3,1]))
data = tf.data.Dataset.from_generator(generator,
output_types=types,
output_shapes=shapes
)
# Define a model
inp1 = tf.keras.Input(shape=(3,3,1),name='inp1')
inp2 = tf.keras.Input(shape=(3,3,1),name='inp2')
inp3 = tf.keras.Input(shape=(3,3,1),name='inp3')
out1 = tf.keras.layers.Conv2D(1,kernel_size=3,padding='same')(inp1)
out2 = tf.keras.layers.Conv2D(1,kernel_size=3,padding='same')(inp2)
model = tf.keras.Model(inputs=[inp1,inp2,inp3],outputs=[out1,out2])
model.compile(loss=['mse','mse'])
# Train
model.fit(data)
So assuming you have a generator that is similar to this mock:
def dummy_generator():
number_of_records = 100
for i in range(100):
an_image = tf.random.uniform((200,200,3))
some_numbers = tf.random.uniform((50,))
signal1 = tf.random.uniform((50000,))
signal2 = tf.random.uniform((100000,))
signal3 = tf.random.uniform((100000,))
yield an_image, some_numbers, signal1, signal2, signal3
each record is of datatype float32 so the output types are easy:
out_types = (tf.float32, tf.float32, tf.float32, tf.float32, tf.float32)
for the output shapes we just list the shapes in the same order:
out_shapes = ((200,200,3), (50,), (50000,), (100000,), (100000,))
so now we can just call from_generator:
ds = tf.data.Dataset.from_generator(dummy_generator,
output_types=out_types,
output_shapes=out_shapes)
model.fit([input_1, input_2, input_3], y, epochs=EPOCHS)
You got to have n(3 in the case above) input layers in your model.

TensorFlow - Error when using interleave or parallel_interleave

I'm using tf.data.Datasets of V1.12 API like this Q&A to read several .h5 files pre-saved batch per file in a directory.
I first made a generator:
class generator_yield:
def __init__(self, file):
self.file = file
def __call__(self):
with h5py.File(self.file, 'r') as f:
yield f['X'][:], f['y'][:]
Then make a list of filenames and passe them in Dataset:
def _fnamesmaker(dir, mode='h5'):
fnames = []
for dirpath, _, filenames in os.walk(dir):
for fname in filenames:
if fname.endswith(mode):
fnames.append(os.path.abspath(os.path.join(dirpath, fname)))
return fnames
fnames = _fnamesmaker('./')
len_fnames = len(fnames)
fnames = tf.data.Dataset.from_tensor_slices(fnames)
Apply the interleave method of Dataset:
# handle multiple files
ds = fnames.interleave(lambda filename: tf.data.Dataset.from_generator(
generator_yield(filename), output_types=(tf.float32, tf.float32),
output_shapes=(tf.TensorShape([100, 100, 1]), tf.TensorShape([100, 100, 1]))), cycle_length=len_fnames)
ds = ds.batch(5).shuffle(5).prefetch(5)
# init iterator
it = ds.make_initializable_iterator()
init_op = it.initializer
X_it, y_it = it.get_next()
Model:
# model
with tf.name_scope("Conv1"):
W = tf.get_variable("W", shape=[3, 3, 1, 1],
initializer=tf.contrib.layers.xavier_initializer())
b = tf.get_variable("b", shape=[1], initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.conv2d(X_it, W, strides=[1, 1, 1, 1], padding='SAME') + b
logits = tf.nn.relu(layer1)
loss = tf.reduce_mean(tf.losses.mean_squared_error(labels=y_it, predictions=logits))
train_op = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(loss)
Start session:
with tf.Session() as sess:
sess.run([tf.global_variables_initializer(), init_op])
while True:
try:
data = sess.run(train_op)
print(data.shape)
except tf.errors.OutOfRangeError:
print('done.')
break
The Error looks like:
TypeError: expected str, bytes or os.PathLike object, not Tensor
At the init method of generator. Apparently when one applies interleave the it's a Tensor passes through to the generator
You cannot run the dataset object directly through sess.run. You have to define an iterator, get the next element. Try doing something like:
next_elem = files.make_one_shot_iterator.get_next()
data = sess.run(next_elem)
You should be able to get your tensors.
According to this post, my case won't benefit in performance with the parralel_interleave.
...have a transformation that transforms each element of a source
dataset into multiple elements into the destination dataset...
It's more relevant in the typical classification problem with datas (dog, cat...)saved in separate directories. We have a segmentation problem here which means that a label contains identical dimension of a input image. All datas are stocked in one directory and each .h5 file contains an image and its labels(masks)
Herein, a simple map with num_parallel_calls is sufficient.

do not save check point for a final step for Estimator

I use Estimator and I train model in the loop to feed data. Every step is the final step. The checkpoints are saved for every final step too. I want to avoid saving checkpoint in every iteration to increase the performance (speed) of the training.
I can not find any information how to do this. Do you have any ideas/suggestions/solutions?
classifier = Estimator(
model_fn=cnn_model_fn,
model_dir="./temp_model_Adam",
config=tf.contrib.learn.RunConfig(
save_checkpoints_secs=None,
save_checkpoints_steps=100,
save_summary_steps=None
)
)
# Train the model
for e in range(0, 10):
numbers = np.arange(10000)
np.random.shuffle(numbers)
for step in range(0, 2000):
classifier.fit(
input_fn=lambda: read_images_for_training_as_batch(step, path, 5, numbers),
steps=1
)
Nowadays the api got changed a bit but from what I see you were using the fit (currently train) method incorrectly, you should put steps=2000 and have your input function return an iterator over your dataset. Today you have tf.estimator.inputs.numpy_input_fn at your disposal that can help you when you have small data sets, otherwise you have to use tf.data.DataSet api.
Something like this (it loads .wav files):
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
from tensorflow.python.ops import io_ops
# ...
def input_fn(num_epochs, batch_size, shuffle=False, mode='training')
def input_fn_bound():
def _read_file(fn, label):
return io_ops.read_file(fn), label
def _decode(data, label):
pcm = contrib_audio.decode_wav(data,
desired_channels=1,
desired_samples=desired_samples)
return pcm.audio, label
filenames = get_files(mode)
classes = get_classes(mode)
labels = {'class': np.array(classes)}
dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
if shuffle:
dataset = dataset.shuffle(buffer_size=len(labels))
dataset = dataset.map(_read_file, num_parallel_calls=num_map_threads)
dataset = dataset.map(_decode, num_parallel_calls=num_map_threads)
dataset = dataset.map(lambda wav, label: ({'wav': wav}, label))
dataset = dataset.repeat(num_epochs)
dataset = dataset.batch(batch_size)
dataset = dataset.prefetch(2) # To load next batch while the first one is being processed on GPU
iter = dataset.make_one_shot_iterator()
features, labels = iter.get_next()
return features, labels
return input_fn_bound
# ....
estimator.train(input_fn=input_fn(
num_epoths=None,
batch_size=64,
shuffle=True,
mode='training'),
steps=10000)