Parallel GPU computations - utilization fluctuation - tensorflow

I have a server with two Nvidia GTX 1080 GPUs (driver 384.111). Currently, I am training two larger CNN models in parallel, one on each GPU (TensorFlow backend), by adapting the TF config and passing the session to Keras:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.visible_device_list = "0"
set_session(tf.Session(config=config))
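The second process uses the same snippet with the other device index; presumably one could equivalently set CUDA_VISIBLE_DEVICES before importing TensorFlow. A sketch of the second process:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

# Same setup as above, but pinned to the second GPU.
config = tf.ConfigProto()
config.gpu_options.visible_device_list = "1"
set_session(tf.Session(config=config))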
I noticed that, for one GPU, the utilization stays between 95% and 100% (let's call this one GPU A), which is normal. However, on some days(!), the utilization of the other GPU (B) fluctuates heavily between 0% and 100%, resulting in slow model training (I verified that the GPU is really being used, so I am not unintentionally running on the CPU). Power draw is also rather low and fluctuates along with the utilization. Memory is allocated properly on both GPUs. The temperature is at 80 degrees Celsius for the well-performing GPU and 90 degrees Celsius for the other (they are mounted sub-optimally, so GPU A also heats up GPU B a bit, hence the temperature difference).
What could be causing this? Tensorflow? Drivers? The PSU (the current one is rated > 1150 W)? Temperature? A bad GPU?
--- UPDATE 1 ---
Fluctuating performance does not occur when only one GPU is used.
Furthermore, this phenomenon recently also occurred the other way around: GPU A was performing poorly while GPU B was doing fine.
Also, I figured out that it is not a temperature issue, as it has also happened when both cards were cold (just tested). Therefore, I guess it boils down to (a) Tensorflow, (b) drivers, or (c) the PSU?
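To help narrow this down, a simple per-batch timing callback can show whether the slow GPU is stalling between batches (which would point at the input pipeline) or computing slowly (which would point at hardware or drivers). This is only a diagnostic sketch using the standard Keras callback hooks:
import time
from keras.callbacks import Callback

class BatchTimer(Callback):
    # Logs how long each batch takes; long, irregular gaps suggest input-pipeline stalls.
    def on_batch_begin(self, batch, logs=None):
        self._t0 = time.time()

    def on_batch_end(self, batch, logs=None):
        print('batch %d took %.3f s' % (batch, time.time() - self._t0))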
--- UPDATE 2 ---
Here are the functions that do the model assembly (1) and the batching and image augmentation (2, 3). I am feeding a pretrained InceptionV3 model into the make_pretrained_model function and the model returned from that function into train_model. Essentially, one of these models, with a slightly different setup, is running on each GPU.
# Imports assumed for the snippets below:
from time import time
from keras.models import Model
from keras.layers import Dense, Dropout, GlobalAveragePooling2D
from keras.optimizers import rmsprop
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau

def make_pretrained_model(model, num_classes, frozen_top_layers=0, lr=0.0001, num_size_dense=[]):
    '''
    Building architecture of pretrained model
    '''
    in_inc = model.output
    gap1 = GlobalAveragePooling2D()(in_inc)
    for size, drop in num_size_dense:
        gap1 = Dense(size, activation='relu')(gap1)
        gap1 = Dropout(drop)(gap1)
    softmax = Dense(num_classes, activation='softmax')(gap1)
    model = Model(inputs=model.input, outputs=softmax)
    if frozen_top_layers > 0:
        for layer in model.layers[:-frozen_top_layers]:
            layer.trainable = True
        for layer in model.layers[-frozen_top_layers:]:
            layer.trainable = False
    else:
        for layer in model.layers:
            layer.trainable = True
    opt = rmsprop(lr=lr, decay=1e-6)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return model
def augment_image_batch(imgdatagen, x, y, batch_size):
    return imgdatagen.flow(x, y, batch_size=batch_size).__next__()
def train_model(model, x_train, y_train, x_test, y_test, train_batch_size, test_batch_size,
                epochs, model_dir, model_name, num_classes, int_str, preprocessing_fun, patience=5, monitor='val_acc'):
    '''
    Training function
    '''
    train_img_datagen = ImageDataGenerator(
        featurewise_center=False,  # also use in test gen if activated AND fit test_gen on train data
        samplewise_center=False,
        featurewise_std_normalization=False,  # also use in test gen if activated AND fit test_gen on train data
        samplewise_std_normalization=False,
        zca_whitening=False,
        zca_epsilon=0,
        rotation_range=0.05,
        width_shift_range=0.05,
        height_shift_range=0.05,
        channel_shift_range=0,
        fill_mode='nearest',
        cval=0,
        vertical_flip=False,
        preprocessing_function=preprocessing_fun,
        shear_range=0.,
        zoom_range=0.,
        horizontal_flip=False)
    val_img_datagen = ImageDataGenerator(
        preprocessing_function=preprocessing_fun)
    callbacks = [ModelCheckpoint(MODEL_DIR+model_name+'.h5',
                                 monitor=monitor,
                                 save_best_only=True),
                 EarlyStopping(monitor=monitor, patience=patience),
                 TensorBoard(LOG_DIR+model_name+'_'+str(time())),
                 ReduceLROnPlateau(monitor='val_loss', factor=0.75, patience=2)]

    def train_feat_gen(x_train, y_train, train_batch_size):
        while True:
            for batch in range(len(x_train) // train_batch_size + 1):
                if batch > max(range(len(x_train) // train_batch_size)):
                    x, y = x_train[batch*train_batch_size:].astype(float), y_train[batch*train_batch_size:]
                    yield augment_image_batch(train_img_datagen, x, y, train_batch_size)
                else:
                    x, y = x_train[batch*train_batch_size:(1+batch)*train_batch_size].astype(float), y_train[batch*train_batch_size:(1+batch)*train_batch_size]
                    yield augment_image_batch(train_img_datagen, x, y, train_batch_size)

    def val_feat_gen(x_val, y_val, test_batch_size):
        while True:
            for batch in range(len(x_val) // test_batch_size + 1):
                if batch > max(range(len(x_val) // test_batch_size)):
                    x, y = x_val[batch*test_batch_size:].astype(float), y_val[batch*test_batch_size:]
                    yield augment_image_batch(val_img_datagen, x, y, test_batch_size)
                else:
                    x, y = x_val[batch*test_batch_size:(1+batch)*test_batch_size].astype(float), y_val[batch*test_batch_size:(1+batch)*test_batch_size]
                    yield augment_image_batch(val_img_datagen, x, y, test_batch_size)

    train_gen_new = train_feat_gen(x_train, y_train, train_batch_size)
    val_gen_new = val_feat_gen(x_test, y_test, test_batch_size)
    model.fit_generator(
        train_gen_new,
        steps_per_epoch=len(x_train) // train_batch_size,
        epochs=epochs,
        validation_data=val_gen_new,
        validation_steps=len(y_test) // test_batch_size,
        callbacks=callbacks,
        shuffle=True)
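Since the augmentation above runs in a plain Python generator on the CPU, one thing worth trying (a sketch, not verified on this setup) is letting fit_generator prepare batches with background workers so the GPU is not starved between steps; note that a plain generator must be thread-safe for this, otherwise it should be wrapped in a keras.utils.Sequence:
    # Hedged sketch: same call as above, but with worker threads prefetching batches.
    model.fit_generator(
        train_gen_new,
        steps_per_epoch=len(x_train) // train_batch_size,
        epochs=epochs,
        validation_data=val_gen_new,
        validation_steps=len(y_test) // test_batch_size,
        callbacks=callbacks,
        workers=4,
        max_queue_size=20,
        use_multiprocessing=False)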


400% higher error with PyTorch compared with identical Keras model (with Adam optimizer)

TLDR:
A simple (single hidden-layer) feed-forward PyTorch model trained to predict the function y = sin(X1) + sin(X2) + ... + sin(X10) substantially underperforms an identical model built/trained with Keras. Why is this so, and what can be done to mitigate the difference in performance?
In training a regression model, I noticed that PyTorch drastically underperforms an identical model built with Keras.
This phenomenon has been observed and reported previously:
The same model produces worse results on pytorch than on tensorflow
CNN model in pytorch giving 30% less accuracy to Tensorflow model
PyTorch Adam vs Tensorflow Adam
Suboptimal convergence when compared with TensorFlow model
RNN and Adam: slower convergence than Keras
PyTorch comparable but worse than keras on a simple feed forward network
Why is the PyTorch model doing worse than the same model in Keras even with the same weight initialization?
Why Keras behave better than Pytorch under the same network configuration?
The following explanations and suggestions have been made previously as well:
Using the same decimal precision (32 vs 64): 1, 2
Using a CPU instead of a GPU: 1, 2
Change retain_graph=True to create_graph=True in computing the 2nd derivative with autograd.grad: 1
Check if keras is using a regularizer, constraint, bias, or loss function in a different way from pytorch: 1, 2
Ensure you are computing the validation loss in the same way: 1
Use the same initialization routine: 1, 2
Training the pytorch model for longer epochs: 1
Trying several random seeds: 1
Ensure that model.eval() is called in validation step when training pytorch model: 1
The main issue is with the Adam optimizer, not the initialization: 1
To understand this issue, I trained a simple two-layer neural network (much simpler than my original model) in Keras and PyTorch, using the same hyperparameters and initialization routines, and following all the recommendations listed above. However, the PyTorch model results in a mean squared error (MSE) that is 400% higher than the MSE of the Keras model.
Here is my code:
0. Imports
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.regularizers import L2
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
1. Generate a reproducible dataset
def get_data():
    np.random.seed(0)
    Xtrain = np.random.normal(0, 1, size=(7000, 10))
    Xval = np.random.normal(0, 1, size=(700, 10))
    ytrain = np.sum(np.sin(Xtrain), axis=-1)
    yval = np.sum(np.sin(Xval), axis=-1)
    scaler = MinMaxScaler()
    ytrain = scaler.fit_transform(ytrain.reshape(-1, 1)).reshape(-1)
    yval = scaler.transform(yval.reshape(-1, 1)).reshape(-1)
    return Xtrain, Xval, ytrain, yval

class XYData(Dataset):
    def __init__(self, X, y):
        super(XYData, self).__init__()
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        self.len = len(y)
    def __getitem__(self, index):
        return (self.X[index], self.y[index])
    def __len__(self):
        return self.len
# Data, dataset, and dataloader
Xtrain, Xval, ytrain, yval = get_data()
traindata = XYData(Xtrain, ytrain)
valdata = XYData(Xval, yval)
trainloader = DataLoader(dataset=traindata, shuffle=True, batch_size=32, drop_last=False)
valloader = DataLoader(dataset=valdata, shuffle=True, batch_size=32, drop_last=False)
2. Build Keras and PyTorch models with identical hyperparameters and initialization methods
class TorchLinearModel(nn.Module):
    def __init__(self, input_dim=10, random_seed=0):
        super(TorchLinearModel, self).__init__()
        _ = torch.manual_seed(random_seed)
        self.hidden_layer = nn.Linear(input_dim, 100)
        self.initialize_layer(self.hidden_layer)
        self.output_layer = nn.Linear(100, 1)
        self.initialize_layer(self.output_layer)
    def initialize_layer(self, layer):
        _ = torch.nn.init.xavier_normal_(layer.weight)
        #_ = torch.nn.init.xavier_uniform_(layer.weight)
        _ = torch.nn.init.constant_(layer.bias, 0)
    def forward(self, x):
        x = self.hidden_layer(x)
        x = self.output_layer(x)
        return x

def mean_squared_error(ytrue, ypred):
    return torch.mean((ytrue - ypred) ** 2)

def build_torch_model():
    torch_model = TorchLinearModel()
    optimizer = optim.Adam(torch_model.parameters(),
                           betas=(0.9, 0.9999),
                           eps=1e-7,
                           lr=1e-3,
                           weight_decay=0)
    return torch_model, optimizer

def build_keras_model():
    x = layers.Input(shape=(10,))
    z = layers.Dense(units=100, activation=None, use_bias=True, kernel_regularizer=None,
                     bias_regularizer=None)(x)
    y = layers.Dense(units=1, activation=None, use_bias=True, kernel_regularizer=None,
                     bias_regularizer=None)(z)
    keras_model = Model(x, y, name='linear')
    optimizer = Adam(learning_rate=1e-3, beta_1=0.9, beta_2=0.9999, epsilon=1e-7,
                     amsgrad=False)
    keras_model.compile(optimizer=optimizer, loss='mean_squared_error')
    return keras_model
# Instantiate models
torch_model, optimizer = build_torch_model()
keras_model = build_keras_model()
3. Train PyTorch model for 100 epochs:
torch_trainlosses, torch_vallosses = [], []
for epoch in range(100):
    # Training
    losses = []
    _ = torch_model.train()
    for i, (x, y) in enumerate(trainloader):
        optimizer.zero_grad()
        ypred = torch_model(x)
        loss = mean_squared_error(y, ypred)
        _ = loss.backward()
        _ = optimizer.step()
        losses.append(loss.item())
    torch_trainlosses.append(np.mean(losses))

    # Validation
    losses = []
    _ = torch_model.eval()
    with torch.no_grad():
        for i, (x, y) in enumerate(valloader):
            ypred = torch_model(x)
            loss = mean_squared_error(y, ypred)
            losses.append(loss.item())
    torch_vallosses.append(np.mean(losses))

    print(f"epoch={epoch+1}, train_loss={torch_trainlosses[-1]:.4f}, val_loss={torch_vallosses[-1]:.4f}")
4. Train Keras model for 100 epochs:
history = keras_model.fit(Xtrain, ytrain, sample_weight=None, batch_size=32, epochs=100,
                          validation_data=(Xval, yval))
5. Loss in training history
plt.plot(torch_trainlosses, color='blue', label='PyTorch Train')
plt.plot(torch_vallosses, color='blue', linestyle='--', label='PyTorch Val')
plt.plot(history.history['loss'], color='brown', label='Keras Train')
plt.plot(history.history['val_loss'], color='brown', linestyle='--', label='Keras Val')
plt.legend()
Keras reaches a much lower error during training. Since this may be due to a difference in how Keras computes the loss, I also calculated the prediction error on the validation set with sklearn.metrics.mean_squared_error.
6. Validation error after training
ypred_keras = keras_model.predict(Xval).reshape(-1)
ypred_torch = torch_model(torch.tensor(Xval, dtype=torch.float32))
ypred_torch = ypred_torch.detach().numpy().reshape(-1)
mse_keras = metrics.mean_squared_error(yval, ypred_keras)
mse_torch = metrics.mean_squared_error(yval, ypred_torch)
print('Percent error difference:', (mse_torch / mse_keras - 1) * 100)
r_keras = pearsonr(yval, ypred_keras)[0]
r_pytorch = pearsonr(yval, ypred_torch)[0]
print("r_keras:", r_keras)
print("r_pytorch:", r_pytorch)
plt.scatter(ypred_keras, yval); plt.title('Keras'); plt.show(); plt.close()
plt.scatter(ypred_torch, yval); plt.title('Pytorch'); plt.show(); plt.close()
Percent error difference: 479.1312469426776
r_keras: 0.9115184443702814
r_pytorch: 0.21728812737220082
The correlation of predicted values with ground truth is 0.912 for Keras but 0.217 for Pytorch, and the error for Pytorch is 479% higher!
7. Other trials
I also tried:
Lowering the learning rate for PyTorch (lr=1e-4): R increases from 0.217 to 0.576, but it's still much worse than Keras (r=0.912).
Increasing the learning rate for PyTorch (lr=1e-2): R is worse, at 0.095.
Training numerous times with different random seeds: the performance is roughly the same regardless.
Training for more than 100 epochs: no improvement was observed.
Using torch.nn.init.xavier_uniform_ instead of torch.nn.init.xavier_normal_ to initialize the weights: R improves from 0.217 to 0.639, but it's still worse than Keras (0.912).
What can be done to ensure that the PyTorch model converges to a reasonable error comparable with the Keras model?
The problem here is unintentional broadcasting in the PyTorch training loop.
The result of an nn.Linear operation always has shape [B, D], where B is the batch size and D is the output dimension. Therefore, in your mean_squared_error function, ypred has shape [32, 1] while ytrue has shape [32]. By the broadcasting rules used by NumPy and PyTorch, this means that ytrue - ypred has shape [32, 32]. What you almost certainly meant is for ypred to have shape [32]. This can be accomplished in many ways; probably the most readable is to use Tensor.flatten:
class TorchLinearModel(nn.Module):
    ...
    def forward(self, x):
        x = self.hidden_layer(x)
        x = self.output_layer(x)
        return x.flatten()
which produces the following train/val curves
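For reference, the shape blow-up is easy to reproduce on its own (a minimal sketch):
import torch

ytrue = torch.zeros(32)       # shape [32], as yielded by the DataLoader
ypred = torch.zeros(32, 1)    # shape [32, 1], as returned by nn.Linear
print((ytrue - ypred).shape)            # torch.Size([32, 32]): broadcast, not elementwise
print((ytrue - ypred.flatten()).shape)  # torch.Size([32]): what was intended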

Completely different results using Tensorflow and Pytorch for MobilenetV3 Small

I am using transfer learning from MobileNetV3 Small to predict 5 different points on an image. I am doing this as a regression task.
For both models:
Setting the last 50 layers trainable and adding the same fully connected layers to the end.
Learning rate 3e-2
Batch size 32
Adam optimizer with the same betas
100 epochs
The inputs consist of RGB unscaled images
Pytorch
Model
def _init_weights(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)

def get_mob_v3_small():
    model = torchvision.models.mobilenet_v3_small(pretrained=True)
    children_list = get_children(model)
    for c in children_list[:-50]:
        for p in c.parameters():
            p.requires_grad = False
    return model

class TransferMobileNetV3_v2(nn.Module):
    def __init__(self,
                 num_keypoints: int = 5):
        super(TransferMobileNetV3_v2, self).__init__()
        self.classifier_neurons = num_keypoints*2
        self.base_model = get_mob_v3_small()
        self.base_model.classifier = nn.Sequential(
            nn.Linear(in_features=1024, out_features=1024),
            nn.ReLU(),
            nn.Linear(in_features=1024, out_features=512),
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=self.classifier_neurons)
        )
        self.base_model.apply(_init_weights)
    def forward(self, x):
        out = self.base_model(x)
        return out
Training Script
def train(net, trainloader, testloader, train_loss_fn, optimizer, scaler, args):
    len_dataloader = len(trainloader)
    for epoch in range(1, args.epochs+1):
        net.train()
        for batch_idx, sample in enumerate(trainloader):
            inputs, labels = sample
            inputs, labels = inputs.to(args.device), labels.to(args.device)
            optimizer.zero_grad()
            with torch.cuda.amp.autocast(args.use_amp):
                prediction = net(inputs)
                loss = train_loss_fn(prediction, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
def main():
    args = make_args_parser()
    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    seed = args.seed
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(net.parameters(), lr=3e-2,
                           betas=(0.9, 0.999))
    scaler = torch.cuda.amp.GradScaler(enabled=args.use_amp)
    train(net, train_loader, test_loader, loss_fn, optimizer, scaler, args)
Tensorflow
Model
base_model = tf.keras.applications.MobileNetV3Small(weights='imagenet',
                                                    input_shape=(224, 224, 3))
x_in = base_model.layers[-6].output
x = Dense(units=1024, activation="relu")(x_in)
x = Dense(units=512, activation="relu")(x)
x = Dense(units=10, activation="linear")(x)
model = Model(inputs=base_model.input, outputs=x)
for layer in model.layers[:-50]:
    layer.trainable = False
Training Script
model.compile(loss="mse",
              optimizer=tf.keras.optimizers.Adam(learning_rate=3e-2))
history = model.fit(input_numpy, output_numpy,
                    verbose=1,
                    batch_size=32, epochs=100, validation_split=0.2)
Results
The PyTorch model predicts one single point around the center for all 5 different points.
The Tensorflow model predicts the points quite well and is quite accurate.
The loss in the Pytorch model is much higher than the Tensorflow model.
Please do let me know what is going wrong, as I am trying my best to shift to PyTorch for this work and I need this model to give me similar/identical results.
Note: I also noticed that the MobileNetV3 Small architecture seems to differ between PyTorch and Tensorflow. I do not know if I am interpreting it wrong, but I'm putting it here just in case.
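One difference that may matter here (treat this as an assumption to verify): torchvision's pretrained MobileNetV3 weights expect inputs scaled to [0, 1] and normalized with the ImageNet mean/std, while tf.keras.applications.MobileNetV3Small includes its preprocessing inside the model and accepts raw [0, 255] pixels, so feeding unscaled RGB images to both is not equivalent. A minimal sketch of the usual torchvision preprocessing:
from torchvision import transforms

# Standard preprocessing expected by torchvision's ImageNet-pretrained weights.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),  # PIL image -> float tensor in [0, 1]
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])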

PyTorch isn't running on GPU even though torch.cuda.is_available() is True

I want to train on my local GPU, but training only runs on the CPU even though torch.cuda.is_available() is True and I can see my GPU. How can I fix this?
my CNN model:
import torch.nn as nn
import torch.nn.functional as F
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

# define the CNN architecture
class Net(nn.Module):
    ### TODO: choose an architecture, and complete the class
    def __init__(self):
        super(Net, self).__init__()
        ## Define layers of a CNN
        self.conv1 = nn.Conv2d(3, 16, 3, padding=1)
        # convolutional layer (sees 16x16x16 tensor)
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        # convolutional layer (sees 8x8x32 tensor)
        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)
        # max pooling layer
        self.pool = nn.MaxPool2d(2, 2)
        # linear layer (64 * 28 * 28 -> 500)
        self.fc1 = nn.Linear(64 * 28 * 28, 500)
        # linear layer (500 -> 133)
        self.fc2 = nn.Linear(500, 133)
        # dropout layer (p=0.25)
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        ## Define forward behavior
        x = self.pool(F.relu(self.conv1(x)))
        #print(x.shape)
        x = self.pool(F.relu(self.conv2(x)))
        #print(x.shape)
        x = self.pool(F.relu(self.conv3(x)))
        #print(x.shape)
        # flatten image input
        x = x.view(-1, 64 * 28 * 28)
        # add dropout layer
        x = self.dropout(x)
        # add 1st hidden layer, with relu activation function
        x = F.relu(self.fc1(x))
        # add dropout layer
        x = self.dropout(x)
        # add 2nd hidden layer, with relu activation function
        x = self.fc2(x)
        return x
#-#-# You do NOT have to modify the code below this line. #-#-#

# instantiate the CNN
model_scratch = Net()

# move tensors to GPU if CUDA is available
if use_cuda:
    print("TRUE")
    model_scratch = model_scratch.cuda()
train function:
def train(n_epochs, loaders, model, optimizer, criterion, use_cuda, save_path):
    """returns trained model"""
    # initialize tracker for minimum validation loss
    valid_loss_min = np.Inf
    loaders_scratch = {'train': train_loader, 'valid': valid_loader, 'test': test_loader}
    for epoch in range(1, n_epochs+1):
        # initialize variables to monitor training and validation loss
        train_loss = 0.0
        valid_loss = 0.0

        ###################
        # train the model #
        ###################
        model.train()
        for batch_idx, (data, target) in enumerate(loaders['train']):
            # move to GPU
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            ## find the loss and update the model parameters accordingly
            ## record the average training loss, using something like
            ## train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.data - train_loss))
            # clear the gradients of all optimized variables
            optimizer.zero_grad()
            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(data)
            # calculate the batch loss
            loss = criterion(output, target)
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            # perform a single optimization step (parameter update)
            optimizer.step()
            # update training loss
            train_loss += loss.item()*data.size(0)

        ######################
        # validate the model #
        ######################
        model.eval()
        for batch_idx, (data, target) in enumerate(loaders['valid']):
            # move to GPU
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            ## update the average validation loss
            output = model(data)
            # calculate the batch loss
            loss = criterion(output, target)
            # update average validation loss
            valid_loss += loss.item()*data.size(0)

        # calculate average losses
        train_loss = train_loss/len(train_loader.dataset)
        valid_loss = valid_loss/len(valid_loader.dataset)

        # print training/validation statistics
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
        ))

        ## TODO: save the model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(
                valid_loss_min,
                valid_loss))
            torch.save(model.state_dict(), save_path)
            valid_loss_min = valid_loss
    # return trained model
    return model

# train the model
loaders_scratch = {'train': train_loader, 'valid': valid_loader, 'test': test_loader}
model_scratch = train(100, loaders_scratch, model_scratch, optimizer_scratch,
                      criterion_scratch, use_cuda, 'model_scratch.pt')

# load the model that got the best validation accuracy
model_scratch.load_state_dict(torch.load('model_scratch.pt'))
I am getting True from torch.cuda.is_available(), but training still does not run on the GPU; it only runs on the CPU.
The screenshot below shows the CPU running at 62% while training.
To utilize CUDA in PyTorch, you have to specify that you want to run your code on the GPU device.
A line of code like:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
will determine whether CUDA is available and, if so, set it as your device.
Later in the code you have to move your model and tensors to this device:
net = net.to(device)
and do the same for the other tensors that need to go to the GPU, such as your training and test data.
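For example, inside the training loop the batches would be moved the same way (a sketch; the .cuda() calls already in the train function above are equivalent):
for batch_idx, (data, target) in enumerate(loaders['train']):
    # move the current batch to the same device as the model
    data, target = data.to(device), target.to(device)
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()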
If your model is only using the CPU during training even though your GPU is available, the bottleneck is likely the data loading and transformation process. When images are loaded from a local directory and transforms are applied on the fly, the majority of the training time is spent on data loading, which runs on the CPU.
To resolve this, you can preprocess your data by applying your custom transforms once and saving the results. Loading the already-preprocessed data during training lets the GPU do the actual work and can significantly reduce training time.
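A minimal sketch of that idea (the names raw_dataset and transform are placeholders for your own dataset and transform pipeline):
import torch
from torch.utils.data import DataLoader

# One-off pass: apply the expensive transforms once and save the resulting tensors.
processed = [(transform(img), label) for img, label in raw_dataset]
torch.save(processed, 'preprocessed_train.pt')

# During training, load the already-transformed tensors instead of raw images.
train_data = torch.load('preprocessed_train.pt')
train_loader = DataLoader(train_data, batch_size=64, shuffle=True,
                          num_workers=4, pin_memory=True)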

TPU keras regression very slow compared to GPU/CPU

I'm doing a regression on a column in a dataframe. When I use a CPU, each epoch takes ~95 seconds; with a GPU it's ~45 seconds; but with a TPU it's over 8 minutes per epoch.
I basically initialized the TPU and wrapped my model definition and compile into a TPU distribution strategy scope.
I *think* the problem is in my dataset. I've seen tutorials where the data is put into tensors (for my GPU/CPU runs I was passing the dataframes X_train and y_train, as in my code below). I tried both the dataframes and tensors, and both are an order of magnitude worse than the CPU. I'm sure this is a user error; I just can't see my mistake.
Here's my code:
#setup tpu
import os
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
# This is the TPU initialization code that has to be at the beginning.
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))
tpu_strategy = tf.distribute.TPUStrategy(resolver)

def KerasRegression(FullDF, variableToPredict):
    df_train1 = FullDF[FullDF[variableToPredict].notna()].copy()  # train data must not have NaN for the variable we are trying to predict
    X_train = df_train1.drop(variableToPredict, axis=1)
    y_train = df_train1[variableToPredict].copy()
    x_train_shape = X_train.shape[1]
    dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size=100).prefetch(buffer_size=5000)
    activationLayer = 'relu'
    with tpu_strategy.scope():
        model = Sequential()
        model.add(Dense(x_train_shape, activation=activationLayer, input_dim=x_train_shape))
        model.add(Dense(x_train_shape, activation=activationLayer))
        model.add(Dense(1, activation='linear'))
        optimizer = tf.keras.optimizers.Adam()
        model.compile(loss='mse', optimizer=optimizer, metrics=['mse'],
                      experimental_steps_per_execution=50)
    model.fit(dataset, epochs=100)
    # model.fit(X_train, y_train, epochs=100)
    return model
Also, if it helps, the shape of my testing data is:
(590543, 209)
Any feedback is welcomed!
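For what it's worth, a pattern often suggested for TPUs (an assumption here, not verified on this dataset) is a much larger fixed-shape batch, since the global batch is split across the 8 TPU cores and every step is a round trip to the remote TPU host, together with caching and prefetching. A hedged sketch:
# Hedged sketch: larger fixed-shape batches, cached in memory and prefetched.
dataset = (tf.data.Dataset.from_tensor_slices((X_train.values.astype('float32'),
                                               y_train.values.astype('float32')))
           .cache()
           .shuffle(10000)
           .batch(1024, drop_remainder=True)   # 128 examples per TPU core
           .prefetch(tf.data.experimental.AUTOTUNE))
model.fit(dataset, epochs=100)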

How to obtain second derivatives of a Loss function with respect to the parameters of a neural network using gradient tape in Tensorflow eager mode

I am creating a basic auto-encoder for the MNIST dataset using TensorFlow eager mode. I would like to observe the second-order partial derivatives of my loss function with respect to the parameters of the network as it trains. Currently, calling tape.gradient() on the output of in_tape.gradient() returns None (where in_tape is a GradientTape nested inside the outer GradientTape called tape; I have included my code below).
I have tried calling tape.gradient() directly on the output of in_tape.gradient(), with None being returned. My next approach was to iterate over the output of in_tape.gradient() and apply tape.gradient() to each gradient individually (with respect to my model variables), with None being returned each time.
I receive a single None value for any tape.gradient() call, not a list of None values, which I believe would indicate None for a single partial derivative (expected in some cases).
I am currently only trying to get the second derivatives for the first set of weights (from input to hidden layers), however, I will scale it to include all weights once I have this working.
import numpy as np
import tensorflow as tf
import tensorflow.contrib.eager as tfe  # tfe.GradientTape used below (TF 1.x)
from tensorflow import keras

tf.enable_eager_execution()

mnist = keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
train_images = train_images.reshape((train_images.shape[0], train_images.shape[1]*train_images.shape[2])).astype(np.float32)/255
test_images = test_images.reshape((test_images.shape[0], test_images.shape[1]*test_images.shape[2])).astype(np.float32)/255

num_epochs = 200
batch_size = 100
learning_rate = 0.0003

class MNISTModel(tf.keras.Model):
    def __init__(self, device='/gpu:0'):
        super(MNISTModel, self).__init__()
        self.device = device
        self.initializer = tf.initializers.random_uniform(0.0, 0.5)
        self.hidden = tf.keras.layers.Dense(200, use_bias=False, kernel_initializer=tf.initializers.random_uniform(0.0, 0.5), name="Hidden")
        self.out = tf.keras.layers.Dense(train_images.shape[1], use_bias=False, kernel_initializer=tf.initializers.random_uniform(0.0, 0.5), name="Output")
        self.hidden.build(train_images.shape[1])
        self.out.build(200)

    def call(self, x):
        return self.out(self.hidden(x))

def loss_func(model, x, y_):
    return tf.reduce_mean(tf.losses.mean_squared_error(labels=y_, predictions=model(x)))
    #return tf.reduce_mean((y_ - model(x))**4)

model = MNISTModel()
optimizer = tf.train.GradientDescentOptimizer(learning_rate)

for epochs in range(num_epochs):
    print("Started epoch ", epochs)
    print("Num batches is: ", train_images.shape[0]/batch_size)
    for i in range(0, 1):  # (int(train_images.shape[0]/batch_size)):
        with tfe.GradientTape(persistent=True) as tape:
            tape.watch(model.variables)
            with tfe.GradientTape() as in_tape:
                in_tape.watch(model.variables)
                loss = loss_func(model, train_images[0:batch_size], train_images[0:batch_size])
        grads = tape.gradient(loss, model.variables)
        IH_partial_grads = np.array([])
        for i in range(len(grads[0])):
            collector = np.array([])
            for j in range(len(grads[0][i])):
                collector = np.append(collector, tape.gradient(grads[0][i][j], model.variables[0]))
            IH_partial_grads = np.append(IH_partial_grads, collector)
        optimizer.apply_gradients(zip(grads, model.variables), global_step=tf.train.get_or_create_global_step())
    print("Epoch test loss: ", loss_func(model, test_images, test_images))
My ultimate goal is to form the hessian matrix for the loss function with respect to all parameters of my network.
Thanks for any and all help!
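For reference, with a TF 2.x-style tf.GradientTape the usual pattern for second derivatives is to compute the first-order gradient while the outer tape is still recording and then differentiate that result; a minimal sketch on a toy variable (not the model above):
import tensorflow as tf

w = tf.Variable([[1.0, 2.0], [3.0, 4.0]])

with tf.GradientTape() as outer_tape:
    with tf.GradientTape() as inner_tape:
        loss = tf.reduce_sum(tf.square(w))
    # First derivative, computed inside the outer tape so it gets recorded.
    grads = inner_tape.gradient(loss, w)

# Second derivatives of the loss w.r.t. w (one Hessian block), shape [2, 2, 2, 2].
hessian = outer_tape.jacobian(grads, w)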