memmap arrays to PyTorch and gradient accumulation - TensorFlow

I have a large dataset (> 62 GiB) that, after processing, is saved as two NumPy memmap arrays, one for the data and the other for the labels. The dataset has shapes (7390, 60, 224, 224, 3) and (7390,), and it is NOT shuffled, so I need to shuffle it first.
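For reference, I open the two arrays roughly like this (the file names here are illustrative; the dtypes match what my generator below yields):

import numpy as np

numpy_array = np.memmap('data.dat', dtype=np.uint8, mode='r',
                        shape=(7390, 60, 224, 224, 3))  # video clips
labels = np.memmap('labels.dat', dtype=np.int32, mode='r', shape=(7390,))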
Now I use TensorFlow 2, and this is the code I used with my generator to manage the memmap arrays:
def my_generator():
    for i in range(len(numpy_array)):
        yield numpy_array[i, :, :, :, :], np.array(labels[i]).reshape(1)

full_dataset = tf.data.Dataset.from_generator(
    generator=my_generator,
    output_types=(np.uint8, np.int32),
    output_shapes=((60, 224, 224, 3), (1,))
)
full_dataset = full_dataset.shuffle(SHUFFLE_BUFFER_SIZE, reshuffle_each_iteration=False)
train_dataset = full_dataset.take(train_size)
test_dataset = full_dataset.skip(train_size)
val_dataset = test_dataset.skip(test_size)
test_dataset = test_dataset.take(test_size)
That way I can train with shuffling and batching without loading the entire dataset into memory.
Now, with the current model and dataset, the VRAM is not enough to load more than 2 samples as tensors, and I can't train with a batch size of 2.
I thought of gradient accumulation, but I couldn't get it working with TF2. I found it easy in PyTorch, but I can't find how to handle the memmap arrays with shuffling and splitting the way the TensorFlow generator above does.
So I need to know how to load the dataset from PyTorch with the same shuffling and splitting as in TensorFlow, or whether someone has ready-made code for gradient accumulation on TF2.
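For context, the accumulation pattern I was trying to reproduce in TF2 looks roughly like this (an untested sketch with a custom training loop; model, loss_fn, optimizer, and accum_steps are placeholders, not working code I have):

import tensorflow as tf

accum_steps = 8  # effective batch size = batch_size * accum_steps
accum_grads = [tf.zeros_like(v) for v in model.trainable_variables]
for step, (x, y) in enumerate(train_dataset):
    with tf.GradientTape() as tape:
        # Scale the loss so the accumulated gradient matches a large-batch gradient
        loss = loss_fn(y, model(x, training=True)) / accum_steps
    grads = tape.gradient(loss, model.trainable_variables)
    accum_grads = [a + g for a, g in zip(accum_grads, grads)]
    if (step + 1) % accum_steps == 0:
        optimizer.apply_gradients(zip(accum_grads, model.trainable_variables))
        accum_grads = [tf.zeros_like(v) for v in model.trainable_variables]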

I will just address the shuffle question.
Instead of shuffling with tf.data.Dataset, do it at the generator level. This should work:
class Generator(object):
    def __init__(self, images, labels, batch_size):
        self.images = images
        self.labels = labels
        self.batch_size = batch_size
        self.idxs = np.arange(len(self.images))
        self.on_epoch_end()

    def on_epoch_end(self):
        # Shuffle the indices
        np.random.shuffle(self.idxs)

    def generator(self):
        i = 0
        while i < len(self.idxs):
            idx = self.idxs[i]
            yield (self.images[idx], self.labels[idx])
            i += 1
        self.on_epoch_end()

    def batch_generator(self):
        # Note: raises StopIteration at the end of each epoch
        it = iter(self.generator())
        while True:
            vals = [next(it) for _ in range(self.batch_size)]
            images, labels = zip(*vals)
            yield images, labels
Then you can use it with:
gen = Generator(images, labels, batch_size)
it = gen.batch_generator()
batch = next(it)  # Call this every time you want a new batch
I'm sure PyTorch has built-in methods for this kind of stuff, though.
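For example, wrapping the memmap arrays in a torch.utils.data.Dataset and letting DataLoader handle shuffling and batching should work; a minimal sketch (untested; file names and split sizes are placeholders, shapes/dtypes taken from the question):

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split

class MemmapDataset(Dataset):
    def __init__(self, data_path, labels_path):
        # mode='r' keeps the arrays on disk; only the indexed sample is read
        self.data = np.memmap(data_path, dtype=np.uint8, mode='r',
                              shape=(7390, 60, 224, 224, 3))
        self.labels = np.memmap(labels_path, dtype=np.int32, mode='r',
                                shape=(7390,))

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        x = torch.from_numpy(np.array(self.data[idx]))  # copy one sample into memory
        y = torch.tensor(int(self.labels[idx]))
        return x, y

full = MemmapDataset('data.dat', 'labels.dat')
# train_size + val_size + test_size must equal len(full)
train_ds, val_ds, test_ds = random_split(full, [train_size, val_size, test_size])
train_loader = DataLoader(train_ds, batch_size=2, shuffle=True)  # reshuffles every epoch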

Related

Completely different results using TensorFlow and PyTorch for MobileNetV3 Small

I am using transfer learning from MobileNetV3 Small to predict 5 different points on an image. I am doing this as a regression task.
For both models:
Setting the last 50 layers trainable and adding the same fully connected layers to the end.
Learning rate 3e-2
Batch size 32
Adam optimizer with the same betas
100 epochs
The inputs consist of RGB unscaled images
PyTorch
Model
def _init_weights(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)

def get_mob_v3_small():
    model = torchvision.models.mobilenet_v3_small(pretrained=True)
    children_list = get_children(model)
    for c in children_list[:-50]:
        for p in c.parameters():
            p.requires_grad = False
    return model

class TransferMobileNetV3_v2(nn.Module):
    def __init__(self, num_keypoints: int = 5):
        super(TransferMobileNetV3_v2, self).__init__()
        self.classifier_neurons = num_keypoints * 2
        self.base_model = get_mob_v3_small()
        self.base_model.classifier = nn.Sequential(
            nn.Linear(in_features=1024, out_features=1024),
            nn.ReLU(),
            nn.Linear(in_features=1024, out_features=512),
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=self.classifier_neurons)
        )
        self.base_model.apply(_init_weights)

    def forward(self, x):
        out = self.base_model(x)
        return out
Training Script
def train(net, trainloader, testloader, train_loss_fn, optimizer, scaler, args):
    len_dataloader = len(trainloader)
    for epoch in range(1, args.epochs + 1):
        net.train()
        for batch_idx, sample in enumerate(trainloader):
            inputs, labels = sample
            inputs, labels = inputs.to(args.device), labels.to(args.device)
            optimizer.zero_grad()
            with torch.cuda.amp.autocast(args.use_amp):
                prediction = net(inputs)
                loss = train_loss_fn(prediction, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
def main():
    args = make_args_parser()
    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    seed = args.seed
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    net = TransferMobileNetV3_v2().to(args.device)  # instantiate the model defined above
    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(net.parameters(), lr=3e-2,
                           betas=(0.9, 0.999))
    scaler = torch.cuda.amp.GradScaler(enabled=args.use_amp)
    # train_loader and test_loader are assumed to be defined elsewhere
    train(net, train_loader, test_loader, loss_fn, optimizer, scaler, args)
TensorFlow
Model
base_model = tf.keras.applications.MobileNetV3Small(weights='imagenet',
                                                    input_shape=(224, 224, 3))
x_in = base_model.layers[-6].output
x = Dense(units=1024, activation="relu")(x_in)
x = Dense(units=512, activation="relu")(x)
x = Dense(units=10, activation="linear")(x)
model = Model(inputs=base_model.input, outputs=x)

for layer in model.layers[:-50]:
    layer.trainable = False
Training Script
model.compile(loss="mse",
              optimizer=tf.keras.optimizers.Adam(learning_rate=3e-2))
history = model.fit(input_numpy, output_numpy,
                    verbose=1,
                    batch_size=32, epochs=100, validation_split=0.2)
Results
The PyTorch model predicts one single point around the center for all 5 different points.
The TensorFlow model predicts the points quite well and quite accurately.
The loss in the PyTorch model is much higher than in the TensorFlow model.
Please do let me know what is going wrong, as I am trying my best to shift to PyTorch for this work and I need this model to give me similar/identical results.
Note: I also noticed that the MobileNetV3 Small model seems to be different in PyTorch than in TensorFlow. I may be interpreting it wrong, but I'm putting it here just in case.

Preprocessing for TensorFlow Dataset 'cats_vs_dogs'

I am trying to create a preprocessing function so that training_dataset can be fed directly into a Keras sequential neural network. The preprocess function should return features and labels.
def preprocessing_function(data):
    features = ...
    labels = ...
    return features, labels

dataset, info = tfds.load(name='cats_vs_dogs', split=tfds.Split.TRAIN, with_info=True)
training_dataset = dataset.map(preprocessing_function)
How should I write the preprocessing_function? I spent several hours researching and trying to make it happen, but to no avail. Hoping someone can assist.
Here are two functions for preprocessing. The first one will be applied to both the training and validation data to normalize it and resize it to the size expected by the network. The second function, augmentation, will be applied to the training set only. The type of augmentation you want depends on your dataset and application, but I provide this as an example.
# Fetching, pre-processing & preparing the data pipeline
def preprocess(ds):
    x = tf.image.resize_with_pad(ds['image'], IMG_SIZE_W, IMG_SIZE_H)
    x = tf.cast(x, tf.float32)
    x = (x - MEAN) / VARIANCE
    y = tf.one_hot(ds['label'], NUM_CLASSES)
    return x, y

def augmentation(image, label):
    image = tf.image.random_flip_left_right(image)
    image = tf.image.resize_with_crop_or_pad(image, IMG_W + 4, IMG_H + 4)  # zero pad each side with 4 pixels
    image = tf.image.random_crop(image, size=[BATCH_SIZE, IMG_W, IMG_H, 3])  # random crop back to IMG_W x IMG_H
    return image, label
And to load the training and validation datasets, do something like this:
def get_dataset(dataset_name, shuffle_buff_size=1024, batch_size=BATCH_SIZE, augmented=True):
    train, info_train = tfds.load(dataset_name, split='train[:80%]', with_info=True)
    val, info_val = tfds.load(dataset_name, split='train[80%:]', with_info=True)
    TRAIN_SIZE = info_train.splits['train'].num_examples * 0.8
    VAL_SIZE = info_train.splits['train'].num_examples * 0.2
    train = train.map(preprocess).cache().repeat().shuffle(shuffle_buff_size).batch(batch_size)
    if augmented:
        train = train.map(augmentation)
    train = train.prefetch(tf.data.experimental.AUTOTUNE)
    val = val.map(preprocess).cache().repeat().batch(batch_size)
    val = val.prefetch(tf.data.experimental.AUTOTUNE)
    return train, info_train, val, info_val, TRAIN_SIZE, VAL_SIZE
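A minimal usage sketch (assuming a compiled Keras model; since both datasets repeat() indefinitely, steps_per_epoch and validation_steps have to be set explicitly):

train, info_train, val, info_val, TRAIN_SIZE, VAL_SIZE = get_dataset('cats_vs_dogs')
model.fit(train,
          epochs=10,
          steps_per_epoch=int(TRAIN_SIZE) // BATCH_SIZE,
          validation_data=val,
          validation_steps=int(VAL_SIZE) // BATCH_SIZE)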

How to use TensorFlow's tf.cond() with two different Dataset iterators without iterating both?

I want to feed a CNN with the tensor "images". I want this tensor to contain images from the training set (which have a FIXED size) when the placeholder is_training is True; otherwise I want it to contain images from the test set (which do NOT have a fixed size).
This is needed because in training I take a random fixed crop from the training images, while at test time I want to perform a dense evaluation and feed the entire images into the network (it is fully convolutional, so it will accept them).
The current NOT WORKING way is to create two different iterators and try to select the training/test input with tf.cond at session.run(images, {is_training: True/False}) time.
The problem is that BOTH iterators are evaluated. The training and test datasets are also of different sizes, so I cannot iterate over both of them to the end. Is there a way to make this work, or to rewrite this in a smarter way?
I've seen some questions/answers about this, but they always used tf.assign, which takes a numpy array and assigns it to a tensor. In this case I cannot use tf.assign because I already have tensors coming from the iterators.
The current code that I have is this one. It simply checks the shape of the tensor "images":
train_filenames, train_labels = list_images(args.train_dir)
val_filenames, val_labels = list_images(args.val_dir)

graph = tf.Graph()
with graph.as_default():
    # Preprocessing (for both training and validation):
    def _parse_function(filename, label):
        image_string = tf.read_file(filename)
        image_decoded = tf.image.decode_jpeg(image_string, channels=3)
        image = tf.cast(image_decoded, tf.float32)
        return image, label

    # Preprocessing (for training)
    def training_preprocess(image, label):
        # Random flip and crop
        image = tf.image.random_flip_left_right(image)
        image = tf.random_crop(image, [args.crop, args.crop, 3])
        return image, label

    # Preprocessing (for validation)
    def val_preprocess(image, label):
        flipped_image = tf.image.flip_left_right(image)
        batch = tf.stack([image, flipped_image], axis=0)
        return batch, label

    # Training dataset
    train_filenames = tf.constant(train_filenames)
    train_labels = tf.constant(train_labels)
    train_dataset = tf.contrib.data.Dataset.from_tensor_slices((train_filenames, train_labels))
    train_dataset = train_dataset.map(_parse_function, num_threads=args.num_workers, output_buffer_size=args.batch_size)
    train_dataset = train_dataset.map(training_preprocess, num_threads=args.num_workers, output_buffer_size=args.batch_size)
    train_dataset = train_dataset.shuffle(buffer_size=10000)
    batched_train_dataset = train_dataset.batch(args.batch_size)

    # Validation dataset
    val_filenames = tf.constant(val_filenames)
    val_labels = tf.constant(val_labels)
    val_dataset = tf.contrib.data.Dataset.from_tensor_slices((val_filenames, val_labels))
    val_dataset = val_dataset.map(_parse_function, num_threads=1, output_buffer_size=1)
    val_dataset = val_dataset.map(val_preprocess, num_threads=1, output_buffer_size=1)

    train_iterator = tf.contrib.data.Iterator.from_structure(batched_train_dataset.output_types, batched_train_dataset.output_shapes)
    val_iterator = tf.contrib.data.Iterator.from_structure(val_dataset.output_types, val_dataset.output_shapes)
    train_images, train_labels = train_iterator.get_next()
    val_images, val_labels = val_iterator.get_next()
    train_init_op = train_iterator.make_initializer(batched_train_dataset)
    val_init_op = val_iterator.make_initializer(val_dataset)

    # Indicates whether we are in training or in test mode
    is_training = tf.placeholder(tf.bool)

    def f_true():
        with tf.control_dependencies([tf.identity(train_images)]):
            return tf.identity(train_images)

    def f_false():
        return val_images

    images = tf.cond(is_training, f_true, f_false)
    num_images = images.shape

with tf.Session(graph=graph) as sess:
    sess.run(train_init_op)
    # sess.run(val_init_op)
    img = sess.run(images, {is_training: True})
    print(img.shape)
The problem is that when I want to use only the training iterator and I comment out the line that initializes val_init_op, I get the following error:
FailedPreconditionError (see above for traceback): GetNext() failed because the iterator has not been initialized. Ensure that you have run the initializer operation for this iterator before getting the next element.
[[Node: IteratorGetNext_1 = IteratorGetNext[output_shapes=[[2,?,?,3], []], output_types=[DT_FLOAT, DT_INT32], _device="/job:localhost/replica:0/task:0/cpu:0"](Iterator_1)]]
If I do not comment out that line, everything works as expected: when is_training is True I get training images, and when is_training is False I get validation images. The issue is that both iterators need to be initialized, and when I evaluate one of them, the other is advanced too. Since, as I said, they are of different sizes, this causes a problem.
I hope there is a way to solve it! Thanks in advance.
The trick is to call iterator.get_next() inside the f_true() and f_false() functions:
def f_true():
    train_images, _ = train_iterator.get_next()
    return train_images

def f_false():
    val_images, _ = val_iterator.get_next()
    return val_images

images = tf.cond(is_training, f_true, f_false)
The same advice applies to any TensorFlow op that has a side effect, like assigning to a variable: if you want that side effect to happen conditionally, the op must be created inside the appropriate branch function passed to tf.cond().
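For instance, a conditional variable update follows the same pattern (a sketch; v and pred are illustrative names):

v = tf.Variable(0)
pred = tf.placeholder(tf.bool)

def if_true():
    # The assign op is created inside this branch, so it fires only when pred is True
    with tf.control_dependencies([tf.assign_add(v, 1)]):
        return tf.identity(v)

def if_false():
    return tf.identity(v)

result = tf.cond(pred, if_true, if_false)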

Calling the same batch in TensorFlow

I have a TensorFlow graph that is reading from .tfrecords files, using the process described here (taken from the TensorFlow docs):
def read_my_file_format(filename_queue):
    reader = tf.SomeReader()
    key, record_string = reader.read(filename_queue)
    example, label = tf.some_decoder(record_string)
    processed_example = some_processing(example)
    return processed_example, label

def input_pipeline(filenames, batch_size, num_epochs=None):
    filename_queue = tf.train.string_input_producer(
        filenames, num_epochs=num_epochs, shuffle=True)
    example, label = read_my_file_format(filename_queue)
    # min_after_dequeue defines how big a buffer we will randomly sample
    # from -- bigger means better shuffling but slower start up and more
    # memory used.
    # capacity must be larger than min_after_dequeue and the amount larger
    # determines the maximum we will prefetch. Recommendation:
    # min_after_dequeue + (num_threads + a small safety margin) * batch_size
    min_after_dequeue = 10000
    capacity = min_after_dequeue + 3 * batch_size
    example_batch, label_batch = tf.train.shuffle_batch(
        [example, label], batch_size=batch_size, capacity=capacity,
        min_after_dequeue=min_after_dequeue)
    return example_batch, label_batch
In my code, a single batch (as returned by input_pipeline above) is used as input to multiple networks (let's call them A and B) in my graph on each iteration. So if I call:
#...define graph...
sess.run([A,B])
does TensorFlow guarantee that it will use the same batch for both networks in that sess.run call?
If the input of models A and B is example_batch and you evaluate the models simultaneously (as in your example sess.run([A,B])), then I expect to see the same batch, because both models are fed by the same dequeuing operation. As soon as you break the synchronization (i.e., run them separately), the inputs will be different.
The following code snippet looks trivial but shows my point.
import tensorflow as tf
import numpy as np
import time

batch_size = 16
input_shape, target_shape = (128,), ()  # input with dimensionality 128
num_threads = 4      # for input pipeline
queue_capacity = 10  # for input pipeline

def get_random_data_sample():
    # Random inputs and targets
    np_input = np.float32(np.random.normal(0, 1, input_shape))
    np_target = np.int32(1)
    # Sleep randomly between 1 and 3 seconds.
    # time.sleep(np.random.randint(1,3,1)[0])
    return np_input, np_target

tensorflow_input, tensorflow_target = tf.py_func(get_random_data_sample, [], [tf.float32, tf.int32])

def create_model(inputs, hidden_size, num_hidden_layers):
    # Create a dummy dense network.
    dense_layer = inputs
    for i in range(num_hidden_layers):
        dense_layer = tf.layers.dense(
            inputs=dense_layer,
            units=hidden_size,
            kernel_initializer=tf.zeros_initializer(),
            bias_initializer=tf.zeros_initializer(),
            activation=None,
            use_bias=True,
            reuse=False)
    return dense_layer, inputs

# Input pipeline
batch_inputs, batch_targets = tf.train.batch([tensorflow_input, tensorflow_target],
                                             batch_size=batch_size,
                                             num_threads=num_threads,
                                             shapes=[input_shape, target_shape],
                                             capacity=queue_capacity)

# Different models A and B using the same input operation.
modelA, modelA_inputs = create_model(batch_inputs, 32, 1)  # 1 hidden layer
modelB, modelB_inputs = create_model(batch_inputs, 64, 2)  # 2 hidden layers

sess = tf.InteractiveSession()
tf.train.start_queue_runners()
sess.run(tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()))
# (1) Evaluate the models simultaneously.
resultA, resultB, inputsA, inputsB = sess.run([modelA, modelB, modelA_inputs, modelB_inputs])
assert (inputsA == inputsB).all()

# (2) Evaluate the models separately.
resultA2, inputsA2 = sess.run([modelA, modelA_inputs])
resultB2, inputsB2 = sess.run([modelB, modelB_inputs])
assert (inputsA2 == inputsB2).all()  # fails: the two runs dequeue different batches
Naturally the second evaluation uses different input batches and fails assertion. I hope this helps.

Do not save a checkpoint for the final step with Estimator

I use Estimator and I train the model in a loop to feed data. Every step is a final step, so checkpoints are saved at every final step too. I want to avoid saving a checkpoint on every iteration to increase the performance (speed) of training.
I cannot find any information on how to do this. Do you have any ideas/suggestions/solutions?
classifier = Estimator(
    model_fn=cnn_model_fn,
    model_dir="./temp_model_Adam",
    config=tf.contrib.learn.RunConfig(
        save_checkpoints_secs=None,
        save_checkpoints_steps=100,
        save_summary_steps=None
    )
)

# Train the model
for e in range(0, 10):
    numbers = np.arange(10000)
    np.random.shuffle(numbers)
    for step in range(0, 2000):
        classifier.fit(
            input_fn=lambda: read_images_for_training_as_batch(step, path, 5, numbers),
            steps=1
        )
Nowadays the API has changed a bit, but from what I see you were using the fit (currently train) method incorrectly: you should pass steps=2000 and have your input function return an iterator over your dataset. Today you have tf.estimator.inputs.numpy_input_fn at your disposal, which can help when you have small datasets; otherwise you have to use the tf.data.Dataset API.
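For the small-dataset case, a numpy_input_fn sketch might look like this (x_train and y_train are illustrative in-memory arrays):

train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={'x': x_train},  # dict mapping feature name -> numpy array
    y=y_train,
    batch_size=64,
    num_epochs=None,   # cycle indefinitely and let `steps` bound the training
    shuffle=True)
estimator.train(input_fn=train_input_fn, steps=2000)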
With the tf.data API, something like this (it loads .wav files):
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
from tensorflow.python.ops import io_ops
# ...

def input_fn(num_epochs, batch_size, shuffle=False, mode='training'):
    def input_fn_bound():
        def _read_file(fn, label):
            return io_ops.read_file(fn), label

        def _decode(data, label):
            pcm = contrib_audio.decode_wav(data,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
            return pcm.audio, label

        filenames = get_files(mode)
        classes = get_classes(mode)
        labels = {'class': np.array(classes)}
        dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
        if shuffle:
            dataset = dataset.shuffle(buffer_size=len(filenames))
        dataset = dataset.map(_read_file, num_parallel_calls=num_map_threads)
        dataset = dataset.map(_decode, num_parallel_calls=num_map_threads)
        dataset = dataset.map(lambda wav, label: ({'wav': wav}, label))
        dataset = dataset.repeat(num_epochs)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(2)  # Load the next batch while the current one is processed on the GPU
        iter = dataset.make_one_shot_iterator()
        features, labels = iter.get_next()
        return features, labels
    return input_fn_bound
# ....
estimator.train(input_fn=input_fn(num_epochs=None,
                                  batch_size=64,
                                  shuffle=True,
                                  mode='training'),
                steps=10000)
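As for the checkpoint frequency itself: with a single train() call, checkpointing is governed by RunConfig instead of happening at the end of every fit() call; a sketch with the current API (the values are illustrative):

run_config = tf.estimator.RunConfig(
    save_checkpoints_steps=1000,  # checkpoint every 1000 global steps
    save_summary_steps=500)
estimator = tf.estimator.Estimator(model_fn=cnn_model_fn,
                                   model_dir='./temp_model_Adam',
                                   config=run_config)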