I have a rather straightforward script for training and validating. I'm using tensorflow-gpu and I can see GPU:0 being used. However, the python process itself appears to be using just a single core with just around 90% utilisation. My GPU isn't getting maxed out during training either. It gets fully utilised during validation, however.
I wonder whether the use of a single core is preventing the GPU from being utilised more. Is there a way to use more CPU cores? I've tried setting config.intra_op_parallelism_threads = 4, but still only a single core is used.
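For reference, here is roughly how I set that option (a minimal sketch of the TF1-era ConfigProto/session wiring; my actual code may have differed slightly):
import tensorflow as tf
from keras import backend as K

# sketch: cap both thread pools at 4 and hand the session to Keras
config = tf.ConfigProto(intra_op_parallelism_threads=4,
                        inter_op_parallelism_threads=4)
K.set_session(tf.Session(config=config))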
Here's my script:
import model
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator
from visual_callbacks import AccLossPlotter
import numpy as np
def main():
    np.random.seed(45)

    nb_class = 2
    width, height = 224, 224

    sn = model.SqueezeNet(nb_classes=nb_class, inputs=(3, height, width))
    print('Build model')

    sgd = SGD(lr=0.001, decay=0.0002, momentum=0.9, nesterov=True)
    sn.compile(
        optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])
    print(sn.summary())

    # Training
    train_data_dir = 'data/train'
    validation_data_dir = 'data/validation'
    nb_train_samples = 2000
    nb_validation_samples = 800
    nb_epoch = 500

    # Generator
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)
    # train_datagen = ImageDataGenerator(rescale=1./255)
    test_datagen = ImageDataGenerator(rescale=1./255)

    train_generator = train_datagen.flow_from_directory(
        train_data_dir,
        target_size=(width, height),
        batch_size=32,
        class_mode='categorical')
    validation_generator = test_datagen.flow_from_directory(
        validation_data_dir,
        target_size=(width, height),
        batch_size=32,
        class_mode='categorical')

    # Instantiate AccLossPlotter to visualise training
    plotter = AccLossPlotter(graphs=['acc', 'loss'], save_graph=True)
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=0)
    checkpoint = ModelCheckpoint(
        'weights.{epoch:02d}-{val_loss:.2f}.h5',
        monitor='val_loss',
        verbose=0,
        save_best_only=True,
        save_weights_only=True,
        mode='min',
        period=1)

    sn.fit_generator(
        train_generator,
        samples_per_epoch=nb_train_samples,
        nb_epoch=nb_epoch,
        validation_data=validation_generator,
        nb_val_samples=nb_validation_samples,
        callbacks=[plotter, checkpoint])
    sn.save_weights('weights.h5')

if __name__ == '__main__':
    main()
    input('Press ENTER to exit...')
You cannot fully utilize both the CPU and the GPU at the same time. When the GPU is doing the computation, the CPU is not doing the actual number crunching; it only does the book-keeping for the GPU kernels (launching ops, feeding data), and for that a single core is enough.
My GPU isn't getting maxed out during training either. It gets fully utilised during validation, however.
That is because during training you are computing gradients and doing back-propagation, which are not as massively parallel as a plain forward pass (the weights must be updated after every batch), so they cannot fully utilize the GPU. During validation you only compute the forward pass, which is why the GPU is fully utilized then.
That said, you may get more GPU utilization if you increase the batch_size.
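For example, a hedged tweak to the generator from your script (the largest batch that fits depends on your GPU memory; 128 here is just an illustration):
train_generator = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=(width, height),
    batch_size=128,  # was 32; raise until GPU memory is nearly full
    class_mode='categorical')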
## MODEL IMPORTING ##
import tensorflow
import pandas as pd
import numpy as np
import os
import keras
import random
import cv2
import math
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Convolution2D, BatchNormalization
from tensorflow.keras.layers import Flatten, MaxPooling2D, Dropout
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.applications.densenet import preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
import warnings
warnings.filterwarnings("ignore")
WIDTH = 299
HEIGHT = 299
CLASSES = 4
base_model = InceptionV3(weights='imagenet', include_top=False)
for layer in base_model.layers:
    layer.trainable = False

x = base_model.output
x = GlobalAveragePooling2D(name='avg_pool')(x)
x = Dropout(0.4)(x)
predictions = Dense(CLASSES, activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=predictions)
model.summary()

model.compile(optimizer='adam',  # also tried other optimisers --> similarly poor accuracy
              loss='categorical_crossentropy',
              metrics=['accuracy'])
## IMAGE DATA GENERATOR ##
from keras.applications.inception_v3 import preprocess_input
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
    validation_split=0.2)

train_generator = train_datagen.flow_from_directory(
    TRAIN_DIR,
    target_size=(HEIGHT, WIDTH),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset="training")

validation_generator = train_datagen.flow_from_directory(
    TRAIN_DIR,
    target_size=(HEIGHT, WIDTH),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset="validation")

test_datagen = ImageDataGenerator(rescale=1./255)
generator_test = test_datagen.flow_from_directory(
    directory=TEST_DIR,
    target_size=(HEIGHT, WIDTH),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    shuffle=False)
## MODEL training ##
EPOCHS = 20
STEPS_PER_EPOCH = 320 #train_generator.n//train_generator.batch_size
VALIDATION_STEPS = 64 #validation_generator.n//validation_generator.batch_size
history = model.fit_generator(
    train_generator,
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_data=validation_generator,
    validation_steps=VALIDATION_STEPS)
Results:
Validation accuracy fluctuates around 0.55-0.67, while training accuracy reaches 0.99.
Questions:
What/where is the problem in the transfer-learning process?
Are the train, validation, and test data generator parameters chosen correctly?
Well, I think you would be better off training the entire model, so remove the code that makes the base model layers non-trainable. If you look at the documentation for InceptionV3 located here, you can set pooling='max', which adds a GlobalMaxPooling2D layer as the output layer; if you do that, you do not need to add your own pooling layer as you did. I also noticed you imported the callbacks ModelCheckpoint and ReduceLROnPlateau but did not use them in model.fit. An adjustable learning rate is beneficial for achieving a lower validation loss, and ModelCheckpoint is useful for saving the best model for use in predictions. See the code below for implementations; save_loc is the directory where you want to store the ModelCheckpoint results. NOTE: in ModelCheckpoint I set save_weights_only=True, because that is far faster than saving the entire model on each epoch for which the validation loss decreases.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=save_loc, monitor='val_loss', verbose=1, save_best_only=True,
    save_weights_only=True, mode='auto', save_freq='epoch', options=None)
lr_adjust = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=1, verbose=1, mode='auto',
    min_delta=0.00001, cooldown=0, min_lr=0)
callbacks = [checkpoint, lr_adjust]

history = model.fit_generator(
    train_generator, epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH, validation_data=validation_generator,
    validation_steps=VALIDATION_STEPS, callbacks=callbacks)
model.load_weights(save_loc)  # load the saved weights
# after this, use the model to evaluate or predict on the test set;
# if you are satisfied with the results you can then save the entire model with
model.save(save_loc)
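A minimal sketch of the pooling='max' suggestion above, reusing the CLASSES constant and the tf.keras imports from the question:
# pooling='max' appends a GlobalMaxPooling2D layer, so no manual pooling layer is needed
base_model = InceptionV3(weights='imagenet', include_top=False, pooling='max')
x = base_model.output  # a 2048-wide feature vector
predictions = Dense(CLASSES, activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=predictions)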
Be careful with the test set generator. Ensure you apply the same preprocessing to the test data as you did to the training data; I noticed you only rescaled the pixels. I am not sure what the preprocess function does, but I would use it on the test data as well.
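For instance, a sketch under the assumption that the preprocess_input already imported in the question is the right transform for this backbone:
# apply the same preprocessing to the test images as to the training images
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
generator_test = test_datagen.flow_from_directory(
    directory=TEST_DIR,
    target_size=(HEIGHT, WIDTH),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    shuffle=False)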
I would also remove the dropout layer initially. Monitor the training loss and validation loss on each epoch and plot the results. If the training loss continues to decrease while the validation loss trends upward, you are overfitting; if so, restore the dropout layer. When you evaluate or predict on the test set, you only want to go through the test set once, so select the test batch size such that (number of test samples) / (test batch size) is an integer, and use that integer as the number of test steps. Here is a handy function that will determine that for you, where length is the number of test samples and b_max is the maximum batch size you will allow based on your memory capacity.
def get_bs(length, b_max):
    # largest divisor of length that does not exceed b_max
    batch_size = sorted([int(length / n) for n in range(1, length + 1)
                         if length % n == 0 and length / n <= b_max],
                        reverse=True)[0]
    return batch_size, int(length / batch_size)

# example of use
batch_size, steps = get_bs(10000, 70)
# result is batch_size=50, steps=200
I'm doing HPO on a small custom CNN. During training the GPU is under-utilised, and I've found the bottleneck is the CPU: the data augmentation process is too slow. Looking online, I found that I could use multiple CPU cores for the generator to speed up the process. Setting workers=n_cores did improve things, but not as much as I'd like.
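Concretely, this is roughly how I wired in the extra workers (a sketch; n_cores is just the detected core count, and model/train_batches/val_batches are as in the minimal example below):
import multiprocessing

n_cores = multiprocessing.cpu_count()
# run the augmentation generator across several processes so it keeps up with the GPU
model.fit(train_batches, epochs=5, validation_data=val_batches,
          workers=n_cores, use_multiprocessing=True)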
So I thought I could train multiple models simultaneously on the GPU and feed the same augmented data to all of them. However, I can't come up with a way to do this, and I couldn't find any similar question.
Here's a minimal example (I'm leaving out imports for brevity):
# load model and set only last layer as trainable
def create_model(learning_rate, alpha, dropout):
    model_path = '/content/drive/My Drive/Progetto Advanced Machine Learning/Model Checkpoints/Custom Model 1 2020-06-01 10:56:21.010759.hdf5'
    model = tf.keras.models.load_model(model_path)
    x = model.layers[-2].output
    x = Dropout(dropout)(x)
    predictions = Dense(120, activation='softmax', name='prediction',
                        kernel_regularizer=tf.keras.regularizers.l2(alpha))(x)
    model = Model(inputs=model.inputs, outputs=predictions)
    for layer in model.layers[:-2]:
        layer.trainable = False
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model
# declare the search space
SEARCH_SPACE = [skopt.space.Real(0.0001, 0.1, name='learning_rate', prior='log-uniform'),
                skopt.space.Real(1e-9, 1, name='alpha', prior='log-uniform'),
                skopt.space.Real(0.0001, 0.95, name='dropout', prior='log-uniform')]

# declare generator
train_datagenerator = ImageDataGenerator(rescale=1. / 255, rotation_range=30, zoom_range=0.2,
                                         horizontal_flip=True, validation_split=0.2,
                                         data_format='channels_last')

# training function to be called by the optimiser
@use_named_args(SEARCH_SPACE)
def fitness(learning_rate, alpha, dropout):
    model = create_model(learning_rate, alpha, dropout)

    # compile generators
    train_batches = train_datagenerator.flow_from_directory(
        train_out_path, target_size=image_size, color_mode="rgb",
        class_mode="categorical", batch_size=32, subset='training', seed=20052020)
    val_batches = train_datagenerator.flow_from_directory(
        directory=train_out_path, target_size=image_size, color_mode="rgb",
        class_mode="categorical", batch_size=32, subset='validation',
        shuffle=False, seed=20052020)

    # train
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    training_results = model.fit(train_batches, epochs=5, verbose=1, shuffle=True,
                                 validation_data=val_batches, workers=2)
    history[hyperpars] = training_results.history
    with open(dict_save_path, 'wb') as f:
        pickle.dump(history, f)
    return training_results.history['val_accuracy'][-1]

# HPO
result = skopt.forest_minimize(fitness, SEARCH_SPACE, n_calls=10, callback=checkpoint_saver)
In TensorFlow, using class_weights in fit_generator causes the training process to consume more and more CPU RAM until it is depleted, with a stepped increase in memory usage after each epoch. See below for a reproducible example. To keep the example small, I decreased the dataset size and batch size, which still shows the trend of increasing memory. While training with my actual data, the full 128 GB of RAM is depleted by epoch 70.
Has anyone run into this problem or have any suggestions? My data is unbalanced, so I have to use class_weights, but I cannot run the training for long like this.
In the code sample below, if you comment out the class weights, the program trains without depleting memory.
The first image shows memory usage with class_weights, while the second shows usage without class_weights.
import tensorflow as tf
tf.enable_eager_execution()
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import CuDNNLSTM, Dense
from tensorflow.keras.optimizers import Adadelta
feature_count = 25
batch_size = 16
look_back = 5
target_groups = 10
def random_data_generator():
    x_data_size = (batch_size, look_back, feature_count)  # batches, lookback, features
    x_data = np.random.uniform(low=-1.0, high=5, size=x_data_size)
    y_data_size = (batch_size, target_groups)
    Y_data = np.random.randint(low=1, high=21, size=y_data_size)
    return x_data, Y_data

def get_simple_Dataset_generator():
    while True:
        yield random_data_generator()

def build_model():
    model = Sequential()
    model.add(CuDNNLSTM(feature_count,
                        batch_input_shape=(batch_size, look_back, feature_count),
                        stateful=False))
    model.add(Dense(target_groups, activation='softmax'))
    optimizer = Adadelta(learning_rate=1.0, epsilon=None)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return model

def run_training():
    model = build_model()
    train_generator = get_simple_Dataset_generator()
    validation_generator = get_simple_Dataset_generator()
    class_weights = {0: 2, 1: 8, 2: 1, 3: 4, 4: 8, 5: 35, 6: 30, 7: 4, 8: 5, 9: 3}
    model.fit_generator(generator=train_generator,
                        steps_per_epoch=1,
                        epochs=1000,
                        verbose=2,
                        validation_data=validation_generator,
                        validation_steps=20,
                        max_queue_size=10,
                        workers=0,
                        use_multiprocessing=False,
                        class_weight=class_weights)

if __name__ == '__main__':
    run_training()
For any future users: there appears to be a bug in that nightly build which was fixed in subsequent nightly builds. More details in the bug report:
https://github.com/tensorflow/tensorflow/issues/31253
I'm trying to train an LSTM network on Google Colab. However, this error occurs:
AlreadyExistsError: Resource __per_step_116/training_4/Adam/gradients/bidirectional_4/while/ReadVariableOp/Enter_grad/ArithmeticOptimizer/AddOpsRewrite_Add/tmp_var/N10tensorflow19TemporaryVariableOp6TmpVarE
[[{{node training_4/Adam/gradients/bidirectional_4/while/ReadVariableOp/Enter_grad/ArithmeticOptimizer/AddOpsRewrite_Add/tmp_var}}]]
I don't know where the issue could be. This is the model of the network:
sl_model = keras.models.Sequential()
sl_model.add(keras.layers.Embedding(max_index + 1, hidden_size, mask_zero=True))
sl_model.add(keras.layers.Bidirectional(
    keras.layers.LSTM(hidden_size, activation='tanh', dropout=0.2,
                      recurrent_dropout=0.2, return_sequences=True)))
sl_model.add(keras.layers.Bidirectional(
    keras.layers.LSTM(hidden_size, activation='tanh', dropout=0.2,
                      recurrent_dropout=0.2, return_sequences=False)))
sl_model.add(keras.layers.Dense(max_length, activation='softsign'))

optimizer = keras.optimizers.Adam()
sl_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['acc'])

batch_size = 128
epochs = 3

cbk = keras.callbacks.TensorBoard("logging/keras_model")
print("\nStarting training...")
sl_model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size,
             shuffle=True, validation_data=(x_dev, y_dev), callbacks=[cbk])
Thank you so much!
You need to restart your runtime; this happens when you have defined multiple graphs in a single Jupyter (Colaboratory) runtime.
Calling tf.reset_default_graph() may also help, but depending on whether you are using eager execution and how you have defined your sessions, this may or may not work.
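A hedged sketch of clearing graph state before redefining the model (keras.backend.clear_session() does the equivalent cleanup on the Keras side; whether this suffices depends on the caveats above):
import tensorflow as tf
import keras
from keras import backend as K

# drop the old Keras session and default graph before rebuilding
K.clear_session()
tf.reset_default_graph()

sl_model = keras.models.Sequential()
# ... re-add the layers and recompile as before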
My friend and I entered a Kaggle competition named "Plant Seedlings Classification". He built a Keras kernel as follows:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
from os import listdir, makedirs
from os.path import join, exists, expanduser
from keras import models
from keras.preprocessing import image
from keras.applications import xception
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import TensorBoard
from keras.callbacks import ModelCheckpoint
from keras import optimizers
from keras import layers
from keras.applications.xception import Xception
train_dir = './Plant_Seedlings_Classification/train/'
valid_dir = './Plant_Seedlings_Classification/validation/'
im_size = 299
batch_size = 10
train_num = 3803
valid_num = 947
conv_base = Xception(weights='imagenet',
                     include_top=False,
                     input_shape=(im_size, im_size, 3))
conv_base.trainable = False
model = models.Sequential()
model.add(conv_base)
model.add(layers.Flatten()) # Flatten
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(12, activation='softmax'))
model.summary()
train_datagen = ImageDataGenerator(rescale=1./255,
                                   rotation_range=30,
                                   width_shift_range=0.2,
                                   height_shift_range=0.2,
                                   zoom_range=0.2,
                                   horizontal_flip=True,
                                   fill_mode='nearest')
valid_datagen = ImageDataGenerator(rescale=1./255)

print("train gen")
train_generator = train_datagen.flow_from_directory(
    train_dir,
    class_mode='categorical',
    target_size=(im_size, im_size),
    color_mode='rgb',
    batch_size=batch_size)
print("validation gen")
validation_generator = valid_datagen.flow_from_directory(
    valid_dir,
    class_mode='categorical',
    target_size=(im_size, im_size),
    color_mode='rgb',
    batch_size=batch_size)
print("train indices",train_generator.class_indices)
print("validation indices", validation_generator.class_indices)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.Nadam(lr=1e-4,
                                         beta_1=0.9,
                                         beta_2=0.999,
                                         epsilon=1e-08,
                                         schedule_decay=0.004),
              metrics=['acc'])
steps_per_epoch = int(train_num/batch_size)+50
validation_step = int(valid_num/batch_size)+1
print("steps_per_epoch", steps_per_epoch)
print("validation_step", validation_step)
model_save_path = 'xception_299_0304_nomask_epoch{epoch:02d}_vacc{val_acc:.4f}.h5'
history = model.fit_generator(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=50,
    validation_data=validation_generator,
    validation_steps=validation_step,
    callbacks=[TensorBoard(log_dir='/tmp/tensorflow/log/1'),
               ModelCheckpoint(filepath=model_save_path,
                               monitor='val_acc',
                               save_best_only=True,
                               mode='max')])
The code runs 50 epochs.
When the kernel ran on my friend's PC (i7-7700, GTX 1060, 8 GB DDR4-2400), it took about 90 seconds per epoch, and about 120 seconds per epoch on my PC (i5-7400, GTX 1070 Ti, 16 GB DDR4-2400).
We both used tensorflow_gpu to run this kernel. My question is: why is the GPU computing speed on my PC so much slower than on my friend's?
We checked the GPU usage using nvidia-smi. While the kernel was running, the Volatile GPU-Util was only 30-60% on my GPU, and >95% on my friend's.
The Volatile GPU-Util can reach ~100% on my GPU when I run other ML frameworks, such as Caffe, or TensorFlow without the Keras API.
Any pointers? Thanks.