I have a peculiar case of slow model training while trying to train using a generator. The reason I need to use a generator is because I have multiple parquet files that cannot be loaded into memory at once. Here is the code snippet without a generator
# Load the whole parquet file into memory and split off the target column.
# (The frame must be bound to ``pd_df``: that is the name every later line uses.)
pd_df = pd.read_parquet("..")
label = pd_df.pop("label")
# Features are passed as a dict of columns keyed by column name.
dataset = tf.data.Dataset.from_tensor_slices((dict(pd_df), label))
# alternate
# dataset = createDataset(bucket,prefix)
def is_test(x, y):
    """Select every fourth enumerated element for the validation split.

    ``x`` is the position produced by ``Dataset.enumerate()``; ``y`` (the
    element itself) is ignored.
    """
    remainder = x % 4
    return remainder == 0


def is_train(x, y):
    """Select exactly the elements that ``is_test`` rejects."""
    held_out = is_test(x, y)
    return not held_out
def recover(x, y):
    """Drop the enumerate index again and keep only the original element."""
    return y


# Every 4th element goes to validation, the remaining ones to training.
val_dataset = (
    dataset.enumerate()
    .filter(is_test)
    .map(recover)
    .batch(batch_size)
)
train_dataset = (
    dataset.enumerate()
    .filter(is_train)
    .map(recover)
    .batch(batch_size)
)
# Build a dense regression network on top of the feature columns.
feature_columns = _create_feature_columns()
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
model = tf.keras.Sequential([
    feature_layer,
    layers.Dense(1280, activation='relu'),
    layers.Dense(512, activation='relu'),
    layers.Dense(1280, activation='relu'),
    layers.Dense(1),  # single linear output, trained with MSE below
])
model.compile(optimizer='adam',
              loss=tf.keras.losses.MeanSquaredError(),
              metrics=['accuracy', 'mean_absolute_error'])
# Fit the model that was just compiled; the snippet previously called an
# undefined ``om_model`` here.
model.fit(train_dataset, epochs=10, validation_data=val_dataset, verbose=1)
This runs at about 295 ms per step. Naturally, since it's not possible to load all my data in one go, I wrote the following generator (P.S. I'm new to TF and my generator may be off, but from what I could find online it looks good to me).
def getSplit(original_list, n):
    """Cut ``original_list`` into consecutive chunks of at most ``n`` items.

    The last chunk is shorter when the length is not a multiple of ``n``.
    """
    chunks = []
    for start in range(0, len(original_list), n):
        chunks.append(original_list[start:start + n])
    return chunks
#
# 200 files -> 48 Mb (1 file)
# 15 files in memory at a time
# 5 generators
# 3 files per generator
#
def pandasGenerator(s3files, n=3):
    """Yield ``(features, label)`` rows from parquet files, ``n`` files at a time.

    Args:
        s3files: Paths of the parquet files to read. Elements may be ``bytes``
            (the ``[2:-1]`` slicing in the earlier version shows that is how
            they arrive through ``from_generator(args=...)``) or ``str``.
        n: Number of files concatenated into one in-memory DataFrame.

    Yields:
        A ``(dict, label)`` pair per row: the dict maps column name to value
        with the ``'label'`` column removed, and ``label`` is that column's
        value.
    """
    print(f"Processing: {s3files} to : {tf.get_static_value(s3files)}")
    s3files = tf.get_static_value(s3files)
    # Decode bytes properly instead of slicing ``str(b'...')[2:-1]``: the repr
    # trick mangles non-ASCII/escaped characters and truncates plain strings.
    s3files = [
        s3file.decode("utf-8") if isinstance(s3file, bytes) else str(s3file)
        for s3file in s3files
    ]
    for batch in getSplit(s3files, n):
        # Wall-clock timer: process_time() ignores time spent waiting on I/O,
        # which is most of what a remote parquet read costs.
        t = time.perf_counter()
        print(f"Processing Batch: {batch}")
        panda_ds = pd.concat([pd.read_parquet(s3file) for s3file in batch], ignore_index=True)
        elapsed_time = time.perf_counter() - t
        print(f"base_read_time: {elapsed_time}")
        # NOTE(review): emitting one Python dict per row is presumably the main
        # reason the per-step time is high; yielding whole column batches would
        # need a matching change to the output_signature in the caller.
        for row in panda_ds.itertuples(index=False):
            pan_row = dict(row._asdict())
            labels = pan_row.pop('label')
            yield pan_row, labels
def createDS(s3bucket, s3prefix):
    """Build a prefetched dataset that interleaves rows from groups of 40 files.

    The file list comes from ``getFileLists``; each group of 40 paths is
    handed to ``pandasGenerator`` (3 files per read) through
    ``Dataset.from_generator``.
    """
    file_groups = getSplit(getFileLists(bucket=s3bucket, prefix=s3prefix), 40)

    def rows_for(files):
        # NOTE(review): the feature spec dict is empty as posted - presumably
        # elided; it has to describe every feature column that is yielded.
        signature = ({}, tf.TensorSpec(shape=(), dtype=tf.float64))
        return tf.data.Dataset.from_generator(
            pandasGenerator,
            output_signature=signature,
            args=(files, 3),
        )

    grouped = tf.data.Dataset.from_tensor_slices(file_groups)
    interleaved = grouped.interleave(rows_for, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = interleaved.prefetch(tf.data.AUTOTUNE)
    return dataset
When using the generator, the time per step jumps to 2 s.
I'd appreciate any help in improving the generator. Thanks.
Related
I am using graph convolutions in Deepchem/Keras for predicting molecular properties. Following the Deepchem tutorials I created a data generator. While there is no error in my code below, I fail to understand why the size of pred changes with epoch and batch_size.
First we create some dummy data.
!pip install --pre deepchem
!pip install --pre rdkit
import deepchem as dc
import numpy as np
import tensorflow as tf
from deepchem.feat.mol_graphs import ConvMol
# 240 copies of the same SMILES string, featurized for graph convolutions.
mol = 240 * ['C-C-O']
ftr = dc.feat.ConvMolFeaturizer(per_atom_fragmentation=False)
X = ftr.featurize(mol)
# Labels, weights and ids are all simply 0..239.
y = np.arange(240)
w = np.arange(240)
ids = np.arange(240)
# Note: ``w`` is not passed on to the dataset here.
ds = dc.data.NumpyDataset(X=X, y=y, ids=ids)
Edit: We use the following function as generator:
def data_generator(dataset, epochs=1, batch_size = 100, pad_batches = True):
    """Yield ``(inputs, labels, weights)`` batches for a graph-conv Keras model.

    Args:
        dataset: Object providing ``iterbatches(batch_size, epochs, ...)``
            that yields ``(X, y, w, ids)`` batches.
        epochs: Number of passes over ``dataset`` forwarded to ``iterbatches``.
        batch_size: Batch size forwarded to ``iterbatches``.
        pad_batches: Forwarded to ``iterbatches``.

    Yields:
        ``(inputs, [y_b], [w_b])`` where ``inputs`` is the atom features, the
        degree slice, the membership array, and then the degree adjacency
        lists from index 1 onward.

    NOTE(review): batches are requested with ``deterministic=False`` and the
    generator runs for ``epochs`` passes, so a consumer such as a predict call
    presumably sees more rows than the dataset holds, in shuffled order.
    """
    print(dataset)
    batches = dataset.iterbatches(batch_size, epochs,
                                  deterministic=False, pad_batches=pad_batches)
    for X_b, y_b, w_b, ids_b in batches:
        multiConvMol = ConvMol.agglomerate_mols(X_b)
        # Fetch the adjacency lists once instead of on every loop iteration.
        deg_adjacency_lists = multiConvMol.get_deg_adjacency_lists()
        inputs = [multiConvMol.get_atom_features(), multiConvMol.deg_slice,
                  np.array(multiConvMol.membership)]
        inputs.extend(deg_adjacency_lists[1:])
        labels = [y_b]
        weights = [w_b]
        yield (inputs, labels, weights)
(end edit)
Then we define the model and fit it to the dataset generated above:
batch_size = 100
n_tasks = 1


class TestModel(tf.keras.Model):
    """Small graph-convolution network: GraphConv -> GraphGather -> Dense(1).

    Only variant ``model == 1`` defines any layers.
    """

    def __init__(self, model = 1):
        super().__init__()
        self.model = model
        #____________Test Model 1___________
        if self.model == 1:
            self.gc1 = GraphConv(128, activation_fn=tf.nn.tanh)
            # The readout layer is built with the module-level batch_size.
            self.readout = GraphGather(batch_size=batch_size,
                                       activation_fn=tf.nn.tanh)
            self.dense2 = layers.Dense(1)

    def call(self, inputs):
        """Run the variant-1 stack on ``inputs`` and return the dense output."""
        #____________Test Model 1___________
        if self.model == 1:
            conv_features = self.gc1(inputs)
            # The readout gets the conv output plus every input after the first.
            pooled = self.readout([conv_features] + inputs[1:])
            dense2_output = self.dense2(pooled)
        return dense2_output
#Fit_generator
print("_________\nFitting:")
# Wrap the Keras network in a DeepChem model with an L2 loss, then train it
# for a single pass over the generator.
keras_net = TestModel(1)
testmodel = dc.models.KerasModel(keras_net, loss=dc.models.losses.L2Loss())
train_batches = data_generator(ds, epochs=1, batch_size = 100)
testmodel.fit_generator(train_batches)
Finally we try to predict the dataset labels setting epochs = 2:
#Predict
print("_________\nPredicting:")
# The generator is asked for two padded passes over the 240-sample dataset;
# the shapes printed below compare the labels with the predictions returned.
predict_batches = data_generator(ds, epochs = 2, batch_size = 100, pad_batches = True)
pred = testmodel.predict_on_generator(predict_batches)
print(ds.y.shape, pred.shape)
Giving:
_________
Predicting:
<NumpyDataset X.shape: (240,), y.shape: (240,), w.shape: (240,), ids: [0 1 2 ... 237 238 239], task_names: [0]>
(240,) (600, 1)
However, if I change epochs to 1, the size of pred changes to (300, 1), i.e. half of what we had before. Similarly, changing the batch_size affects the prediction size too.
Can anyone explain what I'm doing wrong?
I am currently using a tf.keras.utils.Sequence object to generate image batches for a CNN. I am using Tensorflow 2.2 and the Model.fit method for the model. When I fit the model, the following warning is thrown in each epoch when I set use_multiprocessing=True in tf.keras.model.fit(...):
WARNING:tensorflow:multiprocessing can interact badly with TensorFlow,
causing nondeterministic deadlocks. For high performance data pipelines tf.data is recommended
The model is optimizing just fine, as expected from the docs and the fact that I am using a Sequence-based generator. But if use_multiprocessing is going to be a deprecated functionality in lieu of tf.data objects, I would like to be using the most up-to-date input pipeline. I currently use the following tf.keras.utils.Sequence-based generator inspired by this article on good practices for partitioning large datasets:
https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that loads ``.npy`` samples from disk and augments each batch."""

    def __init__(self, list_IDs, labels, data_dir, batch_size=32, dim=(128,128), n_channels=1,
                 n_classes=2, shuffle=True, **augmentation_kwargs):
        """Store the configuration, build the index order and create the augmentor.

        Extra keyword arguments are forwarded to ``ImageDataGenerator``.
        """
        self.list_IDs = list_IDs
        self.labels = labels
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.dim = dim
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()
        self.augmentor = keras.preprocessing.image.ImageDataGenerator(**augmentation_kwargs)

    def __len__(self):
        """Return the number of full batches per epoch (a trailing partial batch is dropped)."""
        return int(len(self.list_IDs) // self.batch_size)

    def __getitem__(self, index):
        """Return the ``(X, y)`` batch number ``index`` of the current epoch."""
        first = index * self.batch_size
        batch_positions = self.indexes[first:first + self.batch_size]
        # Translate positions in the (possibly shuffled) order into sample IDs.
        batch_ids = [self.list_IDs[pos] for pos in batch_positions]
        X, y = self.__data_generation(batch_ids)
        return X, y

    def on_epoch_end(self):
        """Rebuild the sample order, shuffling it when ``shuffle`` is True."""
        order = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(order)
        self.indexes = order

    def __data_generation(self, list_IDs_temp):
        """Load the given sample IDs, attach labels and run them through the augmentor."""
        n_samples = self.batch_size
        X = np.empty((n_samples, *self.dim))
        y = np.empty((n_samples), dtype=int)
        for slot, sample_id in enumerate(list_IDs_temp):
            # Each sample lives in its own <ID>_stars.npy file under data_dir.
            X[slot,] = np.load(self.data_dir +'/{}_stars.npy'.format(sample_id))
            y[slot] = self.labels[sample_id]
        # Add a trailing channel axis of size 1, then take the single augmented batch.
        augmented = self.augmentor.flow(X.reshape(n_samples, *self.dim, 1), y=y,
                                        shuffle=False, batch_size=n_samples)
        X, y = augmented[0]
        return X, y
All data from all classes is in the data_dir directory and are stored as individual .npy files. The IDs come from a list of strings. The class labels are taken from a dictionary whose keys are the IDs -- as in the article.
I really like the intuition of the Sequence generator set-up. I can also easily generate random batches to check that it is behaving as I would expect. But how can I reproduce this set-up with tf.data? How do I reproduce the multiprocessing batch generation of a Sequence generator with the interleave and prefetch methods of tf.data.Dataset? And/or can I simply ingest this Sequence-based generator with the tf.data.Dataset.from_generator() method?
Many thanks in advance.
It may be too late to answer, but this is what I did, and it works fine for me:
1- my class was like that;
class DataGen(Sequence):
    """Keras ``Sequence`` yielding batches of MFCC features and one-hot labels.

    Args:
        df: Table with ``filepath`` and ``label`` columns.
        sr: Sample rate used both for loading audio and for the MFCCs.
        seconds: Clip length; every clip is cut or zero-padded to
            ``sr * seconds`` samples.
        batch_size: Number of clips per batch.
        shuffle: Reshuffle the sample order at the end of every epoch.
    """

    def __init__(self, df, sr=8000, seconds=3, batch_size=16, shuffle=True):
        self.files = np.array(df.filepath)
        self.label = np.array(df.label)
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.sr = sr
        self.seconds = seconds
        self.dim = self.sr*self.seconds
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch."""
        # ``self`` was missing from the signature, so len() raised TypeError.
        return len(self.label)//self.batch_size

    def __getitem__(self, x):
        """Return batch number ``x`` as ``(features, one_hot_labels)``."""
        # ``x`` is a batch index, not a sample index: scale it by the batch
        # size so consecutive batches do not overlap.
        start = x * self.batch_size
        indexs = self.indexs[start:start + self.batch_size]
        return self.__getBatch__(indexs)

    def __getBatch__(self, indexs):
        """Load and featurize the samples at the given positions."""
        X, y = [], []
        for i in indexs:
            wav = self.__loadFile__(self.files[i])
            # Keyword arguments: newer librosa releases reject positional ones.
            X.append(librosa.feature.mfcc(y=wav, sr=self.sr).T)
            y.append(self.label[i])
        return tf.convert_to_tensor(X), to_categorical(y, num_classes=2)

    def __loadFile__(self, file):
        """Load ``file`` as mono audio, cut or zero-padded to ``self.dim`` samples."""
        # Use the configured rate rather than a hard-coded 8000 so that
        # ``self.dim`` (= sr * seconds) stays consistent with the audio.
        y, _ = librosa.load(file, sr=self.sr, mono=True)
        if len(y)>self.dim:
            return y[:self.dim]
        return np.pad(y, (0, self.dim-len(y)), 'constant', constant_values=0)

    def on_epoch_end(self):
        """Reset the sample order and shuffle it when requested."""
        self.indexs = np.arange(len(self.label))
        if self.shuffle:
            np.random.shuffle(self.indexs)
2- then I changed it to a function, as follows;
def gen(sr=8000, seconds=3, batch_size=16, shuffle=True):
    """Endlessly yield ``(mfcc_batch, one_hot_labels)`` from the global ``df``.

    Args:
        sr: Sample rate used for loading audio and computing MFCCs.
        seconds: Clip length; clips are cut or zero-padded to ``sr * seconds``.
        batch_size: Number of clips per yielded batch.
        shuffle: Reshuffle the sample order before every pass over ``df``.

    Yields:
        A tensor of stacked, transposed MFCC matrices and the matching labels
        one-hot encoded over 2 classes. The generator never terminates.
    """
    dim = sr*seconds

    def loadFile(file):
        # Load as mono at ``sr`` and force a fixed length of ``dim`` samples.
        wav, _ = librosa.load(file, sr=sr, mono=True)
        if len(wav)>dim:
            return wav[:dim]
        return np.pad(wav, (0, dim-len(wav)), 'constant', constant_values=0)

    while True:
        indexs = np.arange(len(df))
        if shuffle:
            np.random.shuffle(indexs)
        for x in range(len(df)//batch_size):
            batch = indexs[x*batch_size:(x+1)*batch_size]
            X, y = [], []
            for i in batch:
                # Keyword arguments: newer librosa releases reject positional ones.
                X.append(librosa.feature.mfcc(y=loadFile(df.filepath[i]), sr=sr).T)
                y.append(df.label[i])
            yield tf.convert_to_tensor(X), to_categorical(y, num_classes=2)
3- and works fine:
# Element dtypes for the (features, labels) pairs produced by ``gen``.
output_types = (tf.dtypes.float32, tf.dtypes.int32)
dataset = tf.data.Dataset.from_generator(gen, output_types)
Here's another method that I use with TensorFlow, and it works fine:
class DataGen():
    """Callable that yields one ``(data, label)`` pair per row of a table.

    Calling the instance returns a fresh generator over all rows, in an order
    fixed once at construction time (shuffled when ``shuffle`` is true).
    """

    def __init__(self, df, batch_size=32, shuffle=True):
        self.data = np.array(df)
        n_rows = self.data.shape[0]
        self.indexs = np.arange(n_rows)
        if shuffle:
            np.random.shuffle(self.indexs)
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of full batches of ``batch_size`` rows."""
        n_rows = self.data.shape[0]
        return n_rows//self.batch_size

    def get_item(self, x):
        """Return row ``x`` unpacked into exactly two values."""
        # data preprocessing
        row = self.data[x]
        data, label = row
        return data, label

    def __call__(self):
        """Yield every row once, following ``self.indexs``."""
        yield from map(self.get_item, self.indexs)
train_gen = DataGen(train_df)
types = (tf.float32, tf.int32)
# One shape per yielded element. The label shape must be a tuple:
# ``(n_classes)`` without the trailing comma is just the int ``n_classes``.
shapes = ((1, 500, 201), (n_classes,))
batch_size = 32
# ``train_gen`` is callable and returns a fresh generator on each call.
train_data = Dataset.from_generator(train_gen, output_types=types, output_shapes=shapes)
train_data = train_data.batch(batch_size)
# test
X, y = next(iter(train_data))
print(X.shape, y.shape)
I want to train my timeseries prediction model (LSTM-model, using tensorflow 2.2) with using the timeseries generator in order to generate the data on the fly.
What I observed is that the training loss decreases nicely during the first steps of training, but then increases over the last 12 steps. I then found out that the model.fit method calls the timeseries generator 11 times before training actually starts. As a result, my model is not trained on the first samples of the data and instead starts in the middle of my dataset. When I reach the end of the epoch, there is no data left, so the generator returns an empty array, and I think that this "destroys" the training progress, which is why the training loss starts to increase.
Here are the important parts of my code to understand my problem:
Model:
# Bidirectional LSTM over (seq_length, 1) inputs followed by a single output unit.
recurrent = keras.layers.Bidirectional(
    k.layers.LSTM(20, activation='relu'),
    input_shape=(seq_length, 1),
)
model = keras.Sequential()
model.add(recurrent)
model.add(keras.layers.Dense(1))
Training:
# Windows of 1024 steps, 2048 windows per batch, sequences kept in order.
training_data = DataGenerator(data=train_data, seq_length=1024,
                              batch_size=2048, shuffle=False)
# Train for 8 epochs straight from the generator.
training_process = model.fit(
    x=training_data,
    epochs=8,
    verbose=True,
    validation_data=validation_data,
)
The DataGenerator class contains the data generator; the actual values are given in the comments after each #:
class DataGenerator(k.utils.Sequence):
    """Stateless ``Sequence`` serving sliding-window batches from several series.

    ``data`` is indexed as ``data[sequence][time]``. Batches are numbered
    sequence by sequence: batch ``index`` belongs to sequence
    ``index // n_batches`` and is batch ``index % n_batches`` inside it.

    Args:
        data: Array whose first axis is the sequence and second axis is time.
        seq_length: Number of past steps in every input window.
        batch_size: Number of windows per batch.
        shuffle: Permute the order of the sequences after every epoch.
    """

    def __init__(self, data, seq_length, batch_size, shuffle=True):
        self.data = data
        self.n_sequences = np.shape(self.data)[0]
        self.data_length = np.shape(self.data)[1]
        self.seq_length = seq_length
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Full batches that fit into one sequence.
        self.n_batches = (self.data_length - self.seq_length) // self.batch_size
        self.on_epoch_end()

    def __len__(self):
        """Return the total number of batches over all sequences."""
        return self.n_batches * self.n_sequences

    def __getitem__(self, index):
        """Return batch ``index`` as ``(x, y)``.

        The batch is derived purely from ``index``. Keras may request batches
        ahead of time and in a different order than they are consumed (the
        posted log shows a dozen calls before step 1), so a private call
        counter drifts away from the batch actually being trained on.
        """
        seq, batch = divmod(index, self.n_batches)
        start = batch * self.batch_size
        # TimeseriesGenerator emits targets at positions
        # start + seq_length .. end_index (inclusive); choosing this end gives
        # exactly batch_size windows. (start + batch_size left the batch short.)
        end = start + self.seq_length + self.batch_size - 1
        gen = TimeseriesGenerator(data=self.data[seq],
                                  targets=self.data[seq],
                                  length=self.seq_length,
                                  sampling_rate=1, stride=1, start_index=start, end_index=end,
                                  shuffle=False, reverse=False, batch_size=self.batch_size)
        x, y = gen[0]
        # Append a feature axis - presumably to match the model's
        # (seq_length, 1) input shape; confirm against the model definition.
        x = np.expand_dims(x, axis=2)
        return x, y

    def on_epoch_end(self):
        """Permute the sequences when shuffling is enabled."""
        if self.shuffle is True:
            perm = np.random.permutation(self.data.shape[0])
            self.data = self.data[perm]
When I run this code, I get the following output:
own index=0 own index=1 own index=2 own index=3 own index=4 own
index=5 own index=6 own index=7 own index=8 own index=9 own index=10
own index=11
1/56 [..............................] - ETA: 0s - loss: 0.3705own
index=12
So the training starts with the index of 12, which causes the start-value to be 24576 instead of 0. So all the data before this start index are not used for training.
Can someone help me to find out, what causes the fit method to run the generator 11 times before training starts?
I'm new to tensorflow and I'm trying to adopt transfer learning for feature extraction. I have a large image dataset of 600k images stored in a gzip compressed hdf5 file of 100GB. I'm using a generator to load the images into the vgg16 model. However, it is going to take me 2000+ hours to complete 1 epoch. Is there any way to optimize the code so that I can have a faster training speed?
NAME = "vgg16-CNN"
tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))
# Session limited to 75% of the GPU memory, with device placement logging on.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.75)
session_config = tf.ConfigProto(log_device_placement=True, gpu_options=gpu_options)
sess = tf.Session(config=session_config)

#Model
num_classes = 58
image_input = Input(shape=(224, 224, 3))
model = VGG16(input_tensor=image_input, include_top=True, weights='imagenet')
# Replace the classifier: take the 'fc2' output and add a new softmax layer.
output_vgg16_conv = model.get_layer('fc2').output
new_head = Dense(num_classes, activation='softmax', name='predictions')
x = new_head(output_vgg16_conv)
pretrained_model = Model(inputs=image_input, outputs=x)
# Freeze every layer except the last one.
frozen_layers = pretrained_model.layers[:-1]
for layer in frozen_layers:
    layer.trainable = False
pretrained_model.compile(optimizer='adam',
                         loss='categorical_crossentropy',
                         metrics=['accuracy'])
pretrained_model.summary()
#Generator
def generator():
    """Yield ``(image, one_hot_label)`` pairs, one image at a time.

    Images come from the ``'dataset'`` entry of ``npx_train.hdf5``; labels
    come from the ``'Category'`` column of ``train.csv``, one-hot encoded over
    58 classes and matched to images by position.
    """
    y_train = pd.read_csv('train.csv')['Category']
    len_class = 58
    y_train = to_categorical(np.array(y_train), len_class)
    # Keep the file open only while the generator is alive; it used to be
    # opened and never closed.
    with h5py.File('npx_train.hdf5', 'r') as hdf5_file:
        extendable_hdf5_file = hdf5_file['dataset']
        # NOTE(review): reading one image per iteration from a compressed
        # dataset is presumably the dominant cost; slicing many rows per read
        # would be faster but changes what this generator yields.
        for a, im in enumerate(extendable_hdf5_file):
            yield (im, y_train[a])
#Dataset from generator
ds = tf.data.Dataset.from_generator(
    generator,
    (tf.float32, tf.float32),
    ((224,224,3),(58,)))
# Batch first and prefetch last, so that whole batches (not single images)
# are prepared ahead of the training step.
ds = ds.batch(10)
ds = ds.prefetch(tf.contrib.data.AUTOTUNE)
#Model compile
with sess:
    # NOTE(review): running the global initializer after the model was built
    # may reset the pretrained ImageNet weights - confirm.
    sess.run(tf.global_variables_initializer())
    pretrained_model.fit(ds, epochs=10, steps_per_epoch=66662,
                         verbose=1, callbacks=[tensorboard], workers=0)
UPDATE:
I've managed to cut the training time to 60 hours per epoch by loading the generator directly to model.fit
# Open the training file read-only and keep a handle on its 'dataset' entry.
hdf5_path = "npx_train.hdf5"
hdf5_file = h5py.File(hdf5_path, 'r')
extendable_hdf5_file = hdf5_file['dataset']
def train_loader(files, y_train, batch_size, n_samples=553292):
    """Endlessly yield ``(X, Y)`` batches sliced from ``files`` and ``y_train``.

    Args:
        files: Sliceable array-like of images (for example an HDF5 dataset).
        y_train: Sliceable labels aligned with ``files``.
        batch_size: Number of samples per batch; the last batch of a pass may
            be smaller.
        n_samples: Number of leading samples to cycle through. Defaults to the
            size that used to be hard-coded.

    Yields:
        ``(X, Y)`` where ``X`` is the image slice divided by 255 and cast to
        ``float32``, and ``Y`` is the matching label slice. After the last
        batch the generator starts over from the beginning.

    Raises:
        ValueError: If ``n_samples`` or ``batch_size`` is not positive (an
            empty pass would otherwise spin forever without yielding).
    """
    if n_samples < 1 or batch_size < 1:
        raise ValueError("n_samples and batch_size must be positive")
    while True:
        for batch_start in range(0, n_samples, batch_size):
            limit = min(batch_start + batch_size, n_samples)
            X = files[batch_start:limit]
            X = np.float32(X / 255)
            Y = y_train[batch_start:limit]
            yield (X, Y)
with tf.device('/gpu:0'):
    # Both loaders read from the same HDF5 dataset and label array, in
    # batches of 32.
    train_batches = train_loader(extendable_hdf5_file, y_train, 32)
    val_batches = val_loader(extendable_hdf5_file, y_train, 32)
    pretrained_model.fit_generator(
        generator=train_batches,
        steps_per_epoch=16666,
        epochs=10,
        verbose=1,
        callbacks=[tensorboard],
        validation_data=val_batches,
        validation_steps=4167,
        workers=0,
    )
However, it is still a long time to spend to train a single layer. Would appreciate help to speed up the process.
Graphics card: gtx1070
I am trying to build a CNN. I have 8 classes in the input samples, with 45 samples in each class, so the total number of input samples is 360. In each class, I use the first 20 samples as training samples and the remaining 25 samples as test samples. (My input is a text file containing my preprocessed data, one image per row, so I read the rows of the text file and reshape them into images of size 16x12.)
I am unable to fix the error in the code
My code:
import numpy as np
import random
import tensorflow as tf
# Input files: one flattened image per row, and the class information per row.
folder = 'D:\\Lab_Project_Files\\TF\\Practice Files\\'
Datainfo = 'dataset_300.txt'
ClassInfo = 'classTrain.txt'

# Image geometry.
INPUT_WIDTH = 16
IMAGE_HEIGHT = 12
IMAGE_DEPTH = 1
IMAGE_PIXELS = INPUT_WIDTH * IMAGE_HEIGHT  # 16 * 12 = 192 values per image
NUM_CLASSES = 8

# Training schedule.
STEPS = 500
STEP_VALIDATE = 100
BATCH_SIZE = 5
def load_data(file1, file2, folder, samples_per_class=45, train_per_class=20, num_classes=8):
    """Load samples and labels and split each class into train and test rows.

    Rows are expected to be grouped by class, ``samples_per_class`` rows per
    class. Within every class the first ``train_per_class`` rows go to the
    training set and the remaining rows to the test set.

    Args:
        file1: Name of the text file with one sample per row.
        file2: Name of the text file with the class information per row.
        folder: Prefix prepended verbatim to both file names (it must end
            with a path separator).
        samples_per_class: Rows per class in the files.
        train_per_class: Leading rows of each class used for training.
        num_classes: Number of classes in the files.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` as NumPy arrays.
    """
    # loading the data file
    x_data = np.loadtxt(folder + file1)
    # loading the class information of the data loaded
    y_data = np.loadtxt(folder + file2)

    # Row indices of each class block, split into leading (train) and
    # trailing (test) rows.
    class_starts = np.arange(num_classes) * samples_per_class
    offsets = np.arange(samples_per_class)
    train_idx = (class_starts[:, None] + offsets[None, :train_per_class]).ravel()
    test_idx = (class_starts[:, None] + offsets[None, train_per_class:]).ravel()

    # Index only the first axis, for both splits alike, so that 1-D label
    # files work as well as 2-D (e.g. one-hot) ones.
    x_data_train = x_data[train_idx]
    x_data_test = x_data[test_idx]
    y_data_train = y_data[train_idx]
    y_data_test = y_data[test_idx]
    return x_data_train, x_data_test, y_data_train, y_data_test
def reshapedata(data_train,data_test):
    """Reshape both sample sets to ``(n_samples, INPUT_WIDTH, IMAGE_HEIGHT)``."""
    train_shape = (len(data_train), INPUT_WIDTH, IMAGE_HEIGHT)
    test_shape = (len(data_test), INPUT_WIDTH, IMAGE_HEIGHT)
    reshaped_train = np.reshape(data_train, train_shape)
    reshaped_test = np.reshape(data_test, test_shape)
    return reshaped_train, reshaped_test
def batchdata(data,label, batchsize):
    """Draw a random batch of ``batchsize`` distinct samples with their labels.

    Args:
        data: Indexable collection of samples.
        label: Indexable collection of labels aligned with ``data``.
        batchsize: Number of samples to draw without replacement.

    Returns:
        ``(data_batch, label_batch)`` as two lists in matching order.

    Raises:
        ValueError: If ``batchsize`` exceeds ``len(data)``.
    """
    # Sample from every position, including index 0, which range(1, ...)
    # used to exclude.
    order_num = random.sample(range(len(data)), batchsize)
    data_batch = [data[pos] for pos in order_num]
    label_batch = [label[pos] for pos in order_num]
    return data_batch, label_batch
# CNN trail
def conv_net(x):
    """Apply one randomly initialised linear layer mapping flat pixels to class scores."""
    n_inputs = INPUT_WIDTH * IMAGE_HEIGHT * IMAGE_DEPTH
    weights = tf.Variable(tf.random_normal([n_inputs, NUM_CLASSES]))
    biases = tf.Variable(tf.random_normal([NUM_CLASSES]))
    # x @ weights + biases
    projected = tf.matmul(x, weights)
    return tf.add(projected, biases)
sess = tf.Session()
# get filelist and labels for training and testing
data_train, data_test, label_train, label_test = load_data(Datainfo, ClassInfo, folder)
data_train, data_test = reshapedata(data_train, data_test)

# input output placeholders
x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS])
y_ = tf.placeholder(tf.float32, [None, NUM_CLASSES])
# create the network
y = conv_net(x)
# loss (keyword arguments, so the logits/labels order cannot be mixed up)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
# train step
train_step = tf.train.AdamOptimizer(1e-3).minimize(cost)
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
sess.run(tf.initialize_all_variables())


def _next_feed(data, label):
    """Draw a fresh random batch and shape it for the two placeholders."""
    imgs, lbls = batchdata(data, label, BATCH_SIZE)
    # The batches are plain NumPy data, so they are fed directly: sess.run()
    # only evaluates tensors/operations and cannot "run" a list of arrays.
    # The images are flattened because ``x`` expects IMAGE_PIXELS per row.
    # NOTE(review): assumes each label row has NUM_CLASSES entries (one-hot)
    # - confirm against the class file.
    return {x: np.reshape(imgs, (len(imgs), IMAGE_PIXELS)), y_: np.asarray(lbls)}


################ CNN Program ##############################
for i in range(STEPS):
    # checking the accuracy in between.
    if i % STEP_VALIDATE == 0:
        print(sess.run(accuracy, feed_dict=_next_feed(data_test, label_test)))
    # A new batch every step; a batch drawn once before the loop would be
    # reused for all STEPS iterations.
    sess.run(train_step, feed_dict=_next_feed(data_train, label_train))
print(sess.run(accuracy, feed_dict=_next_feed(data_test, label_test)))
The files can be downloaded: dataset_300.txt and ClassInfo.txt
Session.run accepts only a list of tensors or tensor names.
imgs, lbls = sess.run([image_batch_test, label_batch_test])
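In the previous line, you are passing image_batch_test and label_batch_test, which are lists of NumPy arrays rather than tensors. I am not sure what you are trying to do with imgs, lbls = sess.run([image_batch_test, label_batch_test]); since the batches are already plain data, they can be fed directly through feed_dict.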