GPU utilization pulsating between 0% and 100% very fast while training a TensorFlow model

While training a TensorFlow convnet model on the GPU, the GPU utilization keeps pulsating between 0% and 100% roughly every 0.5 seconds.
Is this a desirable effect, or is something wrong with my model?
I need to utilize the full GPU power because of my big model and large images.
Please help.
Note: I am using TFRecordDataset for the input pipeline.
Below is the code that builds the input pipeline:
import tensorflow as tf
from os import listdir

# EXAMPLE_PER_FILE, BATCH_SIZE, IMAGE_*/OUTPUT_* constants and f_print are defined
# elsewhere in the notebook.
def _parse_function(example_proto):
    features = {"image": tf.FixedLenFeature((), tf.string, default_value=""),
                "mask": tf.FixedLenFeature((), tf.string, default_value="")}
    parsed_features = tf.parse_single_example(example_proto, features)
    return parsed_features["image"], parsed_features["mask"]

# In[5]:
tfRecord_file = "/home/tarun26mi/carvan_challenge/tfrecords/"
tfFiles = [tfRecord_file + f for f in listdir(tfRecord_file)[:-2] if f[-9:] == "tfrecords"]
totalExamples = len(tfFiles) * EXAMPLE_PER_FILE
itr_for_one_epoch = totalExamples / BATCH_SIZE
f_print(totalExamples, opstring="Total Examples: ")
f_print(itr_for_one_epoch, opstring="Iterations for one epoch ")
f_print(tfFiles)

# In[6]:
dataset = tf.contrib.data.TFRecordDataset(tfFiles)
dataset = dataset.map(_parse_function)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.repeat()
iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()

img, mask = next_element
img = tf.decode_raw(img, tf.float32)
mask = tf.decode_raw(mask, tf.float32)
img = tf.reshape(img, shape=[-1, IMAGE_ROWS, IMAGE_COLS, NUM_CHANNELS])
mask = tf.reshape(mask, shape=[-1, OUTPUT_ROWS, OUTPUT_COLS, 1])
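A pulsation like this usually points to the GPU finishing a step and then idling while the next batch is prepared on the CPU, i.e. an input-pipeline bottleneck rather than a problem with the model itself. A minimal sketch of what I would try first (assuming TF >= 1.4 so the tf.data API is available; the parallelism and prefetch values are guesses to tune, not taken from the question): do the decoding inside the map function, run it in parallel, and prefetch so the next batch is ready while the GPU is busy.

# Sketch, same constants as above.
def _parse_and_decode(example_proto):
    features = {"image": tf.FixedLenFeature((), tf.string, default_value=""),
                "mask": tf.FixedLenFeature((), tf.string, default_value="")}
    parsed = tf.parse_single_example(example_proto, features)
    img = tf.reshape(tf.decode_raw(parsed["image"], tf.float32),
                     [IMAGE_ROWS, IMAGE_COLS, NUM_CHANNELS])
    mask = tf.reshape(tf.decode_raw(parsed["mask"], tf.float32),
                      [OUTPUT_ROWS, OUTPUT_COLS, 1])
    return img, mask

dataset = tf.data.TFRecordDataset(tfFiles)
dataset = dataset.map(_parse_and_decode, num_parallel_calls=4)  # parallel decoding on the CPU
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.repeat()
dataset = dataset.prefetch(2)  # keep a couple of batches ready for the GPU
iterator = dataset.make_one_shot_iterator()
img, mask = iterator.get_next()

If utilization still drops to 0%, profiling a single training step should show whether the remaining time goes to decoding, host-to-device copies, or session overhead.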

Related

HuggingFace PyTorch Trainer giving worse results than TensorFlow

I'm trying to make the switch from TensorFlow to PyTorch, but I'm getting noticeably worse results when running the model in PyTorch using Trainer.
I'm using bert-base-uncased, and as far as I can tell I am using essentially the same settings across both (batch size, epochs, learning rate, etc.). However, I am getting an F1 score of 0.9967 from TensorFlow and 0.944649446494465 from PyTorch. The loss also seems to fluctuate a lot more in PyTorch. I'm still pretty new to machine learning and Python in general, so I feel like it's got to be something obvious, but I've yet to find it. Here are my scripts. Thanks in advance.
TensorFlow
import numpy as np
import sklearn.utils.class_weight
import tensorflow as tf
import evaluate
from datasets import load_dataset
from transformers import (AutoTokenizer, TFAutoModelForSequenceClassification,
                          DataCollatorWithPadding, create_optimizer)
from transformers.keras_callbacks import KerasMetricCallback

SEQ_LEN = 256
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def train():
    def preprocess_function(examples):
        return tokenizer(examples["text"], max_length=SEQ_LEN, truncation=True,
                         padding='max_length', add_special_tokens=True,
                         return_attention_mask=True, return_token_type_ids=False,
                         return_tensors='tf')

    dataset = load_dataset('json', data_files={"train": "full-items.json", "test": "validation-2.json"})
    tokenized = dataset.map(preprocess_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

    batch_size = 8
    num_epochs = 4
    batches_per_epoch = len(tokenized["train"]) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)
    optimizer, schedule = create_optimizer(init_lr=4e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

    id2label = {0: "NEGATIVE", 1: "POSITIVE"}
    label2id = {"NEGATIVE": 0, "POSITIVE": 1}
    model = TFAutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
    )

    tf_train_set = model.prepare_tf_dataset(
        tokenized["train"],
        shuffle=True,
        batch_size=batch_size,
        collate_fn=data_collator,
    )
    tf_validation_set = model.prepare_tf_dataset(
        tokenized["test"],
        shuffle=False,
        batch_size=batch_size,
        collate_fn=data_collator,
    )

    eval_metrics = evaluate.load("f1")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return eval_metrics.compute(predictions=predictions, references=labels)

    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    METRICS = [
        tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
        tf.keras.metrics.SparseCategoricalCrossentropy(from_logits=True, name='sparse_crossentropy'),
    ]
    metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_train_set)
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)

    class_weights = dict(enumerate(sklearn.utils.class_weight.compute_class_weight(
        'balanced',
        classes=np.unique(tokenized["train"]["label"]),
        y=tokenized["train"]["label"])))

    model.compile(optimizer=optimizer, loss=loss, metrics=METRICS)
    model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs,
              class_weight=class_weights, callbacks=[early_stop, metric_callback])
    model.save_pretrained('lease_to_own_model', save_format="tf")
PyTorch
# Assumes the same imports and top-level definitions as the TensorFlow script above
# (tokenizer, SEQ_LEN, np, evaluate, load_dataset, sklearn), plus:
import torch
from transformers import (AutoModelForSequenceClassification, DataCollatorWithPadding,
                          Trainer, TrainingArguments)

def pyTorch():
    def preprocess_function(examples):
        return tokenizer(examples["text"], max_length=SEQ_LEN, truncation=True,
                         padding='max_length', add_special_tokens=True,
                         return_attention_mask=True, return_token_type_ids=False)

    dataset = load_dataset('json', data_files={"train": "full-items.json", "test": "validation-2.json"})
    tokenized = dataset.map(preprocess_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    eval_f1 = evaluate.load("f1")
    eval_accuracy = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        f1 = eval_f1.compute(predictions=predictions, references=labels)
        accuracy = eval_accuracy.compute(predictions=predictions, references=labels)
        return {"accuracy": accuracy["accuracy"], "f1": f1["f1"]}

    id2label = {0: "NEGATIVE", 1: "POSITIVE"}
    label2id = {"NEGATIVE": 0, "POSITIVE": 1}
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
    )
    device = torch.device("cuda")
    model.to(device)

    batch_size = 8
    training_args = TrainingArguments(
        num_train_epochs=4,
        output_dir="pytorch",
        learning_rate=4e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        metric_for_best_model='f1',
        load_best_model_at_end=True,
        logging_strategy="epoch",
        warmup_steps=0,
    )

    class_weights = sklearn.utils.class_weight.compute_class_weight(
        'balanced',
        classes=np.unique(tokenized["train"]["label"]),
        y=tokenized["train"]["label"])
    weights = torch.tensor(class_weights, dtype=torch.float).to(device)

    class CustomTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False):
            labels = inputs.get("labels")
            outputs = model(**inputs)
            logits = outputs.get("logits")
            loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
            return (loss, outputs) if return_outputs else loss

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    trainer.save_model("pytorch")
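Not part of either script above, just a sanity check worth adding before comparing the two frameworks: pin the random seeds in both runs so the gap reflects the setup rather than initialization and shuffling noise. (The seed value 42 and the exact seeding calls are my own assumptions; tf.keras.utils.set_random_seed requires TF 2.7+.)

# Sketch: call these at the top of the script, before train() / pyTorch().
import tensorflow as tf
from transformers import set_seed

tf.keras.utils.set_random_seed(42)  # seeds Python's random, NumPy and TensorFlow
set_seed(42)                        # seeds Python's random, NumPy and PyTorch (CUDA included if present)

Beyond seeding, the two scripts also differ in that the TensorFlow run uses EarlyStopping (patience=2) while the PyTorch run trains all four epochs and keeps the best checkpoint by F1, which could account for part of the gap.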

Difference between valid accuracy from CategoricalAccuracy() and manual prediction

I used CategoricalAccuracy() to calculate the valid_accuracy during training and reached 0.733 at a certain epoch. After training, I load the model weights saved at that epoch and predict on the same validation set, but the accuracy I get is only 0.38. Why is this?
This is my training log.
I only divided the data into two sets: a training set and a validation set.
There are 517 images in my validation set. I use this model to make manual predictions and compare them with the true labels. Only 199 images are predicted correctly, so the accuracy is only 0.38, well below 0.733.
Even if the model is overfitting, I think these two values should be close.
for epoch in range(EPOCHS):
    step = 0
    for features in train_dataset:
        step += 1
        images, labels = process_features(features, data_augmentation=True)
        train_step(images, labels)
        print("Epoch: {}/{}, step: {}/{}, loss: {:.5f}, accuracy: {:.5f}".format(epoch,
              EPOCHS,
              step,
              math.ceil(train_count / BATCH_SIZE),
              train_loss.result().numpy(),
              train_accuracy.result().numpy()))
    for features in valid_dataset:
        valid_images, valid_labels = process_features(features, data_augmentation=False)
        valid_step(valid_images, valid_labels)
    print("Epoch: {}/{}, train loss: {:.5f}, train accuracy: {:.5f}, "
          "valid loss: {:.5f}, valid accuracy: {:.5f}".format(epoch,
          EPOCHS,
          train_loss.result().numpy(),
          train_accuracy.result().numpy(),
          valid_loss.result().numpy(),
          valid_accuracy.result().numpy()))

def process_features(features, data_augmentation):
    image_raw = features['image_raw'].numpy()
    image_tensor_list = []
    for image in image_raw:
        image_tensor = load_and_preprocess_image(image, data_augmentation=data_augmentation)
        image_tensor_list.append(image_tensor)
    images = tf.stack(image_tensor_list, axis=0)
    labels = features['label'].numpy()
    new_labels = tf.one_hot(labels, 7)
    return images, new_labels

def load_and_preprocess_image(image_raw, data_augmentation=False):
    # decode
    image_tensor = tf.io.decode_image(contents=image_raw, channels=CHANNELS, dtype=tf.dtypes.float32)
    if data_augmentation:
        image = tf.image.random_flip_left_right(image=image_tensor)
        image = tf.image.resize_with_crop_or_pad(image=image,
                                                 target_height=int(IMAGE_HEIGHT * 1.2),
                                                 target_width=int(IMAGE_WIDTH * 1.2))
        image = tf.image.random_crop(value=image, size=[IMAGE_HEIGHT, IMAGE_WIDTH, CHANNELS])
        image = tf.image.random_brightness(image=image, max_delta=0.5)
    else:
        image = tf.image.resize(image_tensor, [IMAGE_HEIGHT, IMAGE_WIDTH])
    return image

def valid_step(image_batch, label_batch):
    predictions = model(image_batch, training=False)
    v_loss = loss_object(label_batch, predictions)
    valid_loss.update_state(values=v_loss)
    valid_accuracy.update_state(y_true=label_batch, y_pred=predictions)
The following is my code to predict each picture of the validation set:
import tensorflow as tf
from configuration import save_model_dir, test_image_dir
from prepare_data import load_and_preprocess_image
from train import get_model

def get_single_picture_prediction(model, picture_dir):
    image_tensor = load_and_preprocess_image(tf.io.read_file(filename=picture_dir), data_augmentation=False)
    image = tf.expand_dims(image_tensor, axis=0)
    prediction = model(image, training=False)
    pred_class = tf.math.argmax(prediction, axis=-1)
    return pred_class

if __name__ == '__main__':
    # GPU settings
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    # load the model
    model = get_model()
    model.load_weights(filepath=save_model_dir + "model")
    pred_class = get_single_picture_prediction(model, test_image_dir)
    print(pred_class)
This is my picture
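One way to narrow this down (a diagnostic sketch, not a confirmed fix): reload the saved weights and re-run exactly the batched validation loop that produced the 0.733, using the same valid_dataset, process_features and valid_step as above. If the reloaded model reproduces roughly 0.733, the weights are fine and the discrepancy is in the single-image path (for example, decoding/resizing or label handling differing from process_features); if it also gives roughly 0.38, the checkpoint being loaded is not the one from that epoch.

# Diagnostic sketch: assumes the same model, valid_dataset, process_features,
# valid_step, valid_loss and valid_accuracy objects used during training,
# and save_model_dir from configuration.py.
model.load_weights(filepath=save_model_dir + "model")

valid_loss.reset_states()
valid_accuracy.reset_states()
for features in valid_dataset:
    valid_images, valid_labels = process_features(features, data_augmentation=False)
    valid_step(valid_images, valid_labels)
print("valid accuracy with reloaded weights:", valid_accuracy.result().numpy())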

ways to improve training speed

I'm new to TensorFlow and I'm trying to use transfer learning for feature extraction. I have a large image dataset of 600k images stored in a gzip-compressed HDF5 file of 100 GB. I'm using a generator to load the images into the VGG16 model. However, it is going to take me 2000+ hours to complete one epoch. Is there any way to optimize the code so that I can get a faster training speed?
NAME = "vgg16-CNN"
tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.75)
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True,gpu_options=gpu_options))
#Model
num_classes=58
image_input = Input(shape=(224, 224, 3))
model = VGG16(input_tensor=image_input,include_top=True, weights='imagenet')
output_vgg16_conv = model.get_layer('fc2').output
x = Dense(num_classes, activation='softmax', name='predictions') (output_vgg16_conv)
pretrained_model = Model(inputs=image_input, outputs=x)
for layer in pretrained_model.layers[:-1]:
layer.trainable=False
pretrained_model.compile(loss='categorical_crossentropy',
optimizer='adam', metrics=['accuracy'])
pretrained_model.summary()
#Generator
def generator():
extendable_hdf5_file = h5py.File('npx_train.hdf5','r')['dataset']
y_train=pd.read_csv('train.csv')['Category']
len_class=58
y_train = to_categorical(np.array(y_train),len_class)
for a,im in enumerate(extendable_hdf5_file):
yield (im,y_train[a])
#Dataset from generator
ds = tf.data.Dataset.from_generator(
generator,
(tf.float32, tf.float32),
((224,224,3),(58,)))
ds = ds.prefetch(tf.contrib.data.AUTOTUNE)
ds = ds.batch(10)
#Model compile
with sess:
sess.run(tf.global_variables_initializer())
pretrained_model.fit(ds,epochs=10,steps_per_epoch=66662,
verbose=1,callbacks=[tensorboard],workers=0)
UPDATE:
I've managed to cut the training time to 60 hours per epoch by passing the generator directly to fit_generator:
hdf5_path = "npx_train.hdf5"
extendable_hdf5_file = h5py.File(hdf5_path, 'r')['dataset']

def train_loader(files, y_train, batch_size):
    L = 553292
    while True:
        batch_start = 0
        batch_end = batch_size
        while batch_start < L:
            limit = min(batch_end, L)
            X = files[batch_start:limit]
            X = X / 255
            X = np.float32(X)
            Y = y_train[batch_start:limit]
            yield (X, Y)
            batch_start += batch_size
            batch_end += batch_size

with tf.device('/gpu:0'):
    pretrained_model.fit_generator(generator=train_loader(extendable_hdf5_file, y_train, 32),
                                   steps_per_epoch=16666, epochs=10, verbose=1, callbacks=[tensorboard],
                                   validation_data=val_loader(extendable_hdf5_file, y_train, 32),
                                   validation_steps=4167, workers=0)
However, it still takes a long time to train a single layer. I would appreciate help speeding up the process.
Graphics card: GTX 1070
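Since every layer except the final Dense is frozen, one standard trick worth sketching here (my own suggestion, with assumed chunk sizes and the same file names as above) is to run the frozen VGG16 up to 'fc2' over the dataset once, cache the 4096-dimensional features, and then train only the small softmax classifier on those cached features. That removes the expensive VGG16 forward pass from every epoch; only the first feature-extraction pass still touches the full 100 GB file.

# Sketch: precompute fc2 features once, then train only the classifier.
# Assumes tf.keras (adjust imports for standalone Keras) and the files used above.
import h5py
import numpy as np
import pandas as pd
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import to_categorical

vgg = VGG16(include_top=True, weights='imagenet')
extractor = Model(inputs=vgg.input, outputs=vgg.get_layer('fc2').output)  # frozen feature extractor

images = h5py.File("npx_train.hdf5", "r")["dataset"]
labels = to_categorical(np.array(pd.read_csv("train.csv")["Category"]), 58)

# Single pass over the data in chunks, using the same /255 scaling as the update above.
# 600k x 4096 float32 is roughly 10 GB; write chunks to disk instead of concatenating
# if that does not fit in RAM.
chunk = 2048
features = np.concatenate([
    extractor.predict(np.float32(images[i:i + chunk]) / 255, batch_size=32)
    for i in range(0, len(images), chunk)
])

# Training the single softmax layer on cached features is now cheap per epoch.
clf = Sequential([Dense(58, activation='softmax', input_shape=(4096,))])
clf.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
clf.fit(features, labels, batch_size=256, epochs=10)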

How to use TensorFlow Dataset API in GANs training?

I am training a GAN model. For loading the dataset, I am using the Dataset API of TensorFlow.
# train_dataset has image and label. z_train dataset has noise (z).
train_dataset = tf.data.TFRecordDataset(train_file)
z_train = tf.data.Dataset.from_tensor_slices(tf.random_uniform([total_training_samples, seq_length, z_dim],
                                                               minval=0, maxval=1, dtype=tf.float32))
train_dataset = tf.data.Dataset.zip((train_dataset, z_train))
Creating the iterator:
iter = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
Using the iterator:
(img, label), z = iter.get_next()
train_init_op = iter.make_initializer(train_dataset)
While training the GAN in a session, I train the discriminator first:
_, disc_loss_val = sess.run([disc_optim, disc_loss])
then the generator:
_, gen_loss_val = sess.run([gen_optim, gen_loss])
Here is the catch: since I am using the label as a condition (CGAN) in both the discriminator and the generator graph, the two sess.run calls produce two different batches of labels during the same iteration.
for epoch in range(num_of_epochs):
    sess.run([tf.global_variables_initializer(), train_init_op])
    for batch in range(num_of_batches):
        _, disc_loss_val = sess.run([disc_optim, disc_loss])
        _, gen_loss_val = sess.run([gen_optim, gen_loss])
Since I have to feed the same batch of labels to the generator's session run as to the discriminator's, how can I prevent the Dataset API from producing two different batches within the same iteration?
Note: I am using TensorFlow v1.9
Thanks in advance.
You can create two iterators for the same dataset. If you need to shuffle the dataset, you can even do that by specifying the seed as a tensor. See the example below.
import tensorflow as tf

seed_ts = tf.placeholder(tf.int64)
ds = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5]).shuffle(5, seed=seed_ts, reshuffle_each_iteration=True)
it1 = ds.make_initializable_iterator()
it2 = ds.make_initializable_iterator()
input1 = it1.get_next()
input2 = it2.get_next()
with tf.Session() as sess:
    for ep in range(10):
        sess.run(it1.initializer, feed_dict={seed_ts: ep})
        sess.run(it2.initializer, feed_dict={seed_ts: ep})
        print("Epoch" + str(ep))
        for i in range(5):
            x = sess.run(input1)
            y = sess.run(input2)
            print([x, y])
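An alternative worth mentioning (my own sketch, not part of the answer above): group both training ops into a single sess.run call, so iter.get_next() is evaluated only once per iteration and the discriminator and generator see the same (img, label), z batch. The caveat is that within one call TensorFlow gives no ordering guarantee between the two updates, so this changes the usual alternating D-then-G scheme unless control dependencies are added. It also assumes the variables were initialized before the loop.

# Sketch: one sess.run per batch; both losses are conditioned on the same labels.
for epoch in range(num_of_epochs):
    sess.run(train_init_op)
    for batch in range(num_of_batches):
        _, _, disc_loss_val, gen_loss_val = sess.run(
            [disc_optim, gen_optim, disc_loss, gen_loss])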

Model evaluation from a checkpoint with Multi GPU

I know how to train a network on a single GPU -> save a checkpoint -> later load this checkpoint -> run benchmarks.
I can't figure out how to do this when I train using multiple GPUs and the new Dataset API.
Here is the 'normal' training code:
import tensorflow as tf

images_placeholder = tf.placeholder(tf.float32, shape=(None, image_size,
                                                       image_size, 1), name='input')
labels_placeholder = tf.placeholder(tf.int32, shape=(None))
embeddings = build_graph(images_placeholder)
loss = add_loss(embeddings, labels_placeholder)
embeddings = tf.identity(embeddings, 'embeddings')
Later on, when I want to benchmark:
with tf.Graph().as_default():
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        load_graph_def(model_path)  # for example: d:\model.ckpt-0
        images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
        embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0")
        images = benchmark_utils.load_data(paths_batch, image_size)
        feed_dict = {images_placeholder: images}
        predictions = sess.run(embeddings, feed_dict=feed_dict)
So now I want to train with multiple GPUs like so:
with tf.Graph().as_default(), tf.device('/cpu:0'):
    dataset = tf.data.Dataset.from_tensor_slices((images_list, labels_list))
    dataset = dataset.map(load_images)
    dataset = dataset.shuffle(buffer_size=100)
    dataset = dataset.batch(128)
    dataset = dataset.repeat()
    iterator = dataset.make_one_shot_iterator()
    opt = tf.train.MomentumOptimizer(0.01, momentum=0.9, use_nesterov=True)
    tower_grads = []
    with tf.variable_scope(tf.get_variable_scope()):
        for i in range(num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                    image_batch, label_batch = iterator.get_next()
                    loss = tower_loss(scope, image_batch, label_batch)
What I can't figure out is how to get the 'input' and 'embeddings' tensors when I want to benchmark the checkpoint.
How do I define, for example, the tensor called 'input' that should receive the images to be evaluated?
I'm guessing that somewhere in the multi-GPU code I should define this images_placeholder the way I defined it in the single-GPU training.
Thanks for any advice!
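One pattern that fits this setup (a sketch of a common workaround, not an answer from the original thread): wrap the iterator output in tf.placeholder_with_default, so that during training the tensor is filled by the dataset, while at benchmark time the same named tensor can be fed through feed_dict exactly as in the single-GPU code. The names and shape below mirror the single-GPU snippet; the per-tower split is omitted for brevity.

# Sketch (single tower shown), reusing the iterator from the multi-GPU snippet above.
image_batch, label_batch = iterator.get_next()

# Defaults to the dataset output during training; feeding 'input:0' at benchmark
# time overrides it, and the iterator is then simply not evaluated.
images_placeholder = tf.placeholder_with_default(
    image_batch, shape=(None, image_size, image_size, 1), name='input')

embeddings = build_graph(images_placeholder)
embeddings = tf.identity(embeddings, 'embeddings')

With this, the benchmark code stays exactly as in the single-GPU case: restore the checkpoint, look up "input:0" and "embeddings:0" by name, and feed the evaluation images.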