keras.utils.Sequence with multiple files - tensorflow

I understand how to use keras.utils.Sequence with one data file. You subclass the keras.utils.Sequence class and implement its interface: __len__ and __getitem__.
For example:
def __len__(self):
"Denotes the number of batches per epoch"
return int(np.ceil(self.no_examples / float(self.batch_size)))
def __getitem__(self, idx):
#build the batch w/ idx and self.batch_size
But what if your data is spread across multiple files? For example:
train_part1.csv
train_part2.csv
train_partn.csv
How can you iterate through all batches with only one pointer idx?

you can set up a mapping of (range, file_path)
def __init__(self, file_paths, batch_size):
self.batch_size = batch_size
self._mapping = dict()
count = 0
for file_path in file_paths:
with open(file_path, 'r') as f:
size = len(f.readlines())
self._mapping[(count, count+size)] = file_path
count += size
self.no_examples = count
def _find_file_path(self, idx):
for range, file_path in self._mapping.items():
start, end = range[0], range[1]
if start <= idx and idx <= end:
in_file_idx = idx - start
return (in_file_idx, file_path)
def __len__(self):
"Denotes the number of batches per epoch"
return int(np.ceil(self.no_examples / float(self.batch_size)))
#functools.lru_cache(maxsize=128) # add memoize for file caching
def _read_file_data(self, file_path):
with open(file_path, 'r') as f:
return list(f.readlines())
def __getitem__(self, idx):
in_file_idx, file_path = self._find_file_path(idx)
lines = self._read_file_data(file_path)
return lines[in_file_idx]
Further optimization:
stats memory consumed, delete memoized file content(if your files are too large to fit your memory) due to your memory size;
implement more efficient _find_file_path if large number of files, the current implementation is O(n);

Related

TensorFlow-Keras generator: Turn off auto-sharding or switch auto_shard_policiy to DATA

While training my model I ran into the issue described in the post Tensorflow - Keras: Consider either turning off auto-sharding or switching the auto_shard_policy to DATA to shard this dataset. My question now is: Does the solution mentioned by #Graham501617 work with generators as well? Here is some dummy code for what I use so far:
class BatchGenerator(Sequence):
def __init__(self, some_args):
...
def __len__(self):
num_batches_in_sequence = ...
def __getitem__(self, _):
data, labels = get_one_batch(self.some_args)
return data, labels
In the main script I do something like:
train_generator = BatchGenerator(some_args)
valid_generator = BatchGenerator(some_args)
cross_device_ops = tf.distribute.HierarchicalCopyAllReduce(num_packs=2)
strategy = tf.distribute.MirroredStrategy(cross_device_ops=cross_device_ops)
with strategy.scope():
model = some_model
model.compile(some_args)
history = model.fit(
x=train_generator,
validation_data=valid_generator,
...
)
I would probably have to modify the __getitem__ function somehow, do I?
I appreciate your support!
You'd have to wrap your generator into a single function...
Example below assumes your data is stored as numpy array (.npy), each file already has the correct amount of mini-batch size, is labeled 0_x.npy, 1_x.npy, 2_x.npy, etc.. and both data and label arrays are float64.
from pathlib import Path
import tensorflow as tf
import numpy as np
# Your new generator as a function rather than an object you need to instantiate
def getNextBatch(stop, data_dir):
i = 0
data_dir = data_dir.decode('ascii')
while True:
while i < stop:
x = np.load(str(Path(data_dir + "/" + str(i) + "_x.npy")))
y = np.load(str(Path(data_dir + "/" + str(i) + "_y.npy")))
yield x, y
i += 1
i = 0
# Make a dataset given the directory and strategy
def makeDataset(generator_func, dir, strategy=None):
# Get amount of files
data_size = int(len([name for name in os.listdir(dir) if os.path.isfile(os.path.join(dir, name))])/2)
ds = tf.data.Dataset.from_generator(generator_func, args=[data_size, dir], output_types=(tf.float64, tf.float64)) # Make a dataset from the generator. MAKE SURE TO SPECIFY THE DATA TYPE!!!
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
ds = ds.with_options(options)
# Optional: Make it a distributed dataset if you're using a strategy
if strategy is not None:
ds = strategy.experimental_distribute_dataset(ds)
return ds
training_ds = makeDataset(getNextBatch, str(Path(data_dir + "/training")), None)
validation_ds = makeDataset(getNextBatch, str(Path(data_dir + "/validation")), None)
model.fit(training_ds,
epochs=epochs,
callbacks=callbacks,
validation_data=validation_ds)
You might need to pass the amount of steps per epoch in your fit() call, in which case you can use the generator you've already made.

Custom Keras Metrics Class -> Metric at a certain recall value

I am trying to build a metric that is comparable to the metrics.PrecisionAtRecall class. Therefore, I've tried to build a custom metric by extending the keras.metrics.Metric class.
The original function is WSS = (TN + FN)/N − 1 + TP/(TP + FN) and this should be calculated at a certain recall value, for say 95%.
What I have until now is the following:
class WorkSavedOverSamplingAtRecall(tf.keras.metrics.Metric):
def __init__(self, recall, name='wss_at_recall', **kwargs):
super(WorkSavedOverSamplingAtRecall, self).__init__(name=name, **kwargs)
self.wss = self.add_weight(name='wss', initializer='zeros')
def update_state(self, y_true, y_pred, sample_weight=None):
y_pred_pos = tf.cast(backend.round(backend.clip(y_pred, 0, 1)), tf.float32)
y_pred_neg = 1 - y_pred_pos
y_pos = tf.cast(backend.round(backend.clip(y_true, 0, 1)), tf.float32)
y_neg = 1 - y_pos
fn = backend.sum(y_neg * y_pred_pos)
tn = backend.sum(y_neg * y_pred_neg)
tp = backend.sum(y_pos * y_pred_pos)
n = len(y_true) # number of studies in batch
r = tp/(tp+fn+backend.epsilon()) # recall
self.wss.assign(((tn+fn)/n)-(1+r))
def result(self):
return self.wss
def reset_states(self):
# The state of the metric will be reset at the start of each epoch.
self.wss.assign(0.)
How can I calculate the WSS at a certain recall? I've seen the following in tensorflow's own git repository:
def __init__(self, recall, num_thresholds=200, name=None, dtype=None):
if recall < 0 or recall > 1:
raise ValueError('`recall` must be in the range [0, 1].')
self.recall = recall
self.num_thresholds = num_thresholds
super(PrecisionAtRecall, self).__init__(
value=recall,
num_thresholds=num_thresholds,
name=name,
dtype=dtype)
But that is't really possible through the keras.metrics.Metric class
If we follow the definition of the WSS#95 given by this paper :Reducing Workload in Systematic Review Preparation Using Automated Citation Classification, then we have
For the present work, we have fixed recall at 0.95 and therefore work saved over sampling at 95% recall (WSS#95%) is:
And you could define your update function by :
class WorkSavedOverSamplingAtRecall(tf.keras.metrics.Metric):
def __init__(self, recall, name='wss_at_recall', **kwargs):
if recall < 0 or recall > 1:
raise ValueError('`recall` must be in the range [0, 1].')
self.recall = recall
super(WorkSavedOverSamplingAtRecall, self).__init__(name=name, **kwargs)
self.wss = self.add_weight(name='wss', initializer='zeros')
def update_state(self, y_true, y_pred, sample_weight=None):
y_pred_pos = tf.cast(backend.round(backend.clip(y_pred, 0, 1)), tf.float32)
y_pred_neg = 1 - y_pred_pos
y_neg = 1 - y_pos
fn = backend.sum(y_neg * y_pred_pos)
tn = backend.sum(y_neg * y_pred_neg)
n = len(y_true) # number of studies in batch
self.wss.assign(((tn+fn)/n)-(1-self.recall))
One other solution would be to extend from the tensorflow class SensitivitySpecificityBase and to implement the WSS as the PresicionAtRecall class is implemented.
By using this class, here's how the WSS is calculated :
Compute the recall at all the thresholds (200 thresholds by default).
Find the index of the threshold where the recall is closest to the requested value. (0.95 in that case).
Compute the WSS at that index.
The number of thresholds is use to match the given recall.
import tensorflow as tf
from tensorflow.python.keras.metrics import SensitivitySpecificityBase
class WorkSavedOverSamplingAtRecall(SensitivitySpecificityBase):
def __init__(self, recall, num_thresholds=200, name="wss_at_recall", dtype=None):
if recall < 0 or recall > 1:
raise ValueError('`recall` must be in the range [0, 1].')
self.recall = recall
self.num_thresholds = num_thresholds
super(WorkSavedOverSamplingAtRecall, self).__init__(
value=recall, num_thresholds=num_thresholds, name=name, dtype=dtype
)
def result(self):
recalls = tf.math.div_no_nan(
self.true_positives, self.true_positives + self.false_negatives
)
n = self.true_negatives + self.true_positives + self.false_negatives + self.false_positives
wss = tf.math.div_no_nan(
self.true_negatives+self.false_negatives, n
)
return self._find_max_under_constraint(
recalls, wss, tf.math.greater_equal
)
def get_config(self):
"""For serialization purposes"""
config = {'num_thresholds': self.num_thresholds, 'recall': self.recall}
base_config = super(WorkSavedOverSamplingAtRecall, self).get_config()
return dict(list(base_config.items()) + list(config.items()))

How to show the class distribution in Dataset object in Tensorflow

I am working on a multi-class classification task using my own images.
filenames = [] # a list of filenames
labels = [] # a list of labels corresponding to the filenames
full_ds = tf.data.Dataset.from_tensor_slices((filenames, labels))
This full dataset will be shuffled and split into train, valid and test dataset
full_ds_size = len(filenames)
full_ds = full_ds.shuffle(buffer_size=full_ds_size*2, seed=128) # seed is used for reproducibility
train_ds_size = int(0.64 * full_ds_size)
valid_ds_size = int(0.16 * full_ds_size)
train_ds = full_ds.take(train_ds_size)
remaining = full_ds.skip(train_ds_size)
valid_ds = remaining.take(valid_ds_size)
test_ds = remaining.skip(valid_ds_size)
Now I am struggling to understand how each class is distributed in train_ds, valid_ds and test_ds. An ugly solution is to iterate all the element in the dataset and count the occurrence of each class. Is there any better way to solve it?
My ugly solution:
def get_class_distribution(dataset):
class_distribution = {}
for element in dataset.as_numpy_iterator():
label = element[1]
if label in class_distribution.keys():
class_distribution[label] += 1
else:
class_distribution[label] = 0
# sort dict by key
class_distribution = collections.OrderedDict(sorted(class_distribution.items()))
return class_distribution
train_ds_class_dist = get_class_distribution(train_ds)
valid_ds_class_dist = get_class_distribution(valid_ds)
test_ds_class_dist = get_class_distribution(test_ds)
print(train_ds_class_dist)
print(valid_ds_class_dist)
print(test_ds_class_dist)
The answer below assumes:
there are five classes.
labels are integers from 0 to 4.
It can be modified to suit your needs.
Define a counter function:
def count_class(counts, batch, num_classes=5):
labels = batch['label']
for i in range(num_classes):
cc = tf.cast(labels == i, tf.int32)
counts[i] += tf.reduce_sum(cc)
return counts
Use the reduce operation:
initial_state = dict((i, 0) for i in range(5))
counts = train_ds.reduce(initial_state=initial_state,
reduce_func=count_class)
print([(k, v.numpy()) for k, v in counts.items()])
A solution inspired by user650654 's answer, only using TensorFlow primitives (with tf.unique_with_counts instead of for loop):
In theory, this should have better performance and scale better to large datasets, batches or class count.
num_classes = 5
#tf.function
def count_class(counts, batch):
y, _, c = tf.unique_with_counts(batch[1])
return tf.tensor_scatter_nd_add(counts, tf.expand_dims(y, axis=1), c)
counts = train_ds.reduce(
initial_state=tf.zeros(num_classes, tf.int32),
reduce_func=count_class)
print(counts.numpy())
Similar and simpler version with numpy that actually had better performances for my simple use-case:
count = np.zeros(num_classes, dtype=np.int32)
for _, labels in train_ds:
y, _, c = tf.unique_with_counts(labels)
count[y.numpy()] += c.numpy()
print(count)

Read randomly elements in .h5 file without loading whole matrix

I have a gigantic training data set that couldn't fit in RAM. I tried to load random batch of images in a stack without loading whole .h5. My approach was to create a list of indexes and shuffle them instead of shuffling the whole .h5 file.
Let's say:
a = np.arange(2000*2000*2000).reshape(2000, 2000, 2000)
idx = np.random.randint(2000, size = 800) #so that I only need to shuffle this idx at the end of epoch
# create this huge data 32GBs > my RAM
with h5py.File('./tmp.h5', 'w') as f:
tmp = f.create_dataset('a', (2000, 2000, 2000))
tmp[:] = a
# read it
with h5py.File('./tmp.h5', 'r') as f:
tensor = f['a'][:][idx] #if I don't do [:] there will be error if I do so it will load whole file which I don't want
Does somebody has a solution?
Thanks to #max9111, here's how I propose to solve it:
batch_size = 100
idx = np.arange(2000)
# shuffle
idx = np.random.shuffle(idx)
Due to the constraint of h5py:
Selection coordinates must be given in increasing order
One should sort before reading:
for step in range(epoch_len // batch_size):
try:
with h5py.File(path, 'r') as f:
return f['img'][np.sort(idx[step * batch_size])], f['label'][np.sort(idx[step * batch_size])]
except:
raise('epoch finished and drop the remainder')

Dequeueing from RandomShuffleQueue does not reduce size

In order to train a model I have encapsulated my model in a class.
I use a tf.RandomShuffleQueue to enqueue a list of filenames to.
However when I dequeue the elements they get dequeued but the size of the queue does not reduce.
Following are more specific questions followed by the code snippet :
If I have only 5 images for example, but steps range upto 100, would this result in the addfilenames called repeatedly automatically ? It does not give me any error on dequeuing so I am thinking that it is getting called automatically.
Why the size of the tf.RandomShuffleQueue is not changing ? It remains constant.
import os
import time
import functools
import tensorflow as tf
from Read_labelclsloc import readlabel
def ReadTrain(traindir):
# Returns a list of training images, their labels and a dictionay.
# The dictionary maps label names to integer numbers.
return trainimgs, trainlbls, classdict
def ReadVal(valdir, classdict):
# Reads the validation image labels.
# Returns a dictionary with filenames as keys and
# corresponding labels as values.
return valdict
def lazy_property(function):
# Just a decorator to make sure that on repeated calls to
# member functions, ops don't get created repeatedly.
# Acknowledgements : https://danijar.com/structuring-your-tensorflow-models/
attribute= '_cache_' + function.__name__
#property
#functools.wraps(function)
def decorator(self):
if not hasattr(self, attribute):
setattr(self, attribute, function(self))
return getattr(self, attribute)
return decorator
class ModelInitial:
def __init__(self, traindir, valdir):
self.graph
self.traindir = traindir
self.valdir = valdir
self.traininginfo()
self.epoch = 0
def traininginfo(self):
self.trainimgs, self.trainlbls, self.classdict = ReadTrain(self.traindir)
self.valdict = ReadVal(self.valdir, self.classdict)
with self.graph.as_default():
self.trainimgs_tensor = tf.constant(self.trainimgs)
self.trainlbls_tensor = tf.constant(self.trainlbls, dtype=tf.uint16)
self.trainimgs_dict = {}
self.trainimgs_dict["ImageFile"] = self.trainimgs_tensor
return None
#lazy_property
def graph(self):
g = tf.Graph()
with g.as_default():
# Layer definitions go here
return g
#lazy_property
def addfilenames (self):
# This is the function where filenames are pushed to a RandomShuffleQueue
filename_queue = tf.RandomShuffleQueue(capacity=len(self.trainimgs), min_after_dequeue=0,\
dtypes=[tf.string], names=["ImageFile"],\
seed=0, name="filename_queue")
sz_op = filename_queue.size()
dq_op = filename_queue.dequeue()
enq_op = filename_queue.enqueue_many(self.trainimgs_dict)
return filename_queue, enq_op, sz_op, dq_op
def Train(self):
# The function for training.
# I have not written the training part yet.
# Still struggling with preprocessing
with self.graph.as_default():
filename_q, filename_enqueue_op, sz_op, dq_op= self.addfilenames
qr = tf.train.QueueRunner(filename_q, [filename_enqueue_op])
filename_dequeue_op = filename_q.dequeue()
init_op = tf.global_variables_initializer()
sess = tf.Session(graph=self.graph)
sess.run(init_op)
coord = tf.train.Coordinator()
enq_threads = qr.create_threads(sess, coord=coord, start=True)
counter = 0
for step in range(100):
print(sess.run(dq_op["ImageFile"]))
print("Epoch = %d "%(self.epoch))
print("size = %d"%(sess.run(sz_op)))
counter+=1
names = [n.name for n in self.graph.as_graph_def().node]
coord.request_stop()
coord.join(enq_threads)
print("Counter = %d"%(counter))
return None
if __name__ == "__main__":
modeltrain = ModelInitial(<Path to training images>,\
<Path to validation images>)
a = modeltrain.graph
print(a)
modeltrain.Train()
print("Success")
The mystery is caused by the tf.train.QueueRunner that you created for the queue, which causes it to be filled in the background.
The following lines cause a background "queue runner" thread to be created:
qr = tf.train.QueueRunner(filename_q, [filename_enqueue_op])
# ...
enq_threads = qr.create_threads(sess, coord=coord, start=True)
This thread calls filename_enqueue_op in a loop, which causes the queue to be filled up as you remove elements from it.
The background thread from step 1 will almost always have a pending enqueue operation (filename_enqueue_op) on the queue. This means that after you dequeue a filename, the pending enqueue will run add fill the queue back up to capacity. (Technically there is a race condition here and you could see a size of capacity - 1, but this is quite unlikely).