WGAN-GP Large Oscillating Loss - tensorflow

I am trying to train a WaveGAN as described here: https://github.com/chrisdonahue/wavegan
In the paper, the WaveGAN is trained using WGAN-GP, so I have tried to implement it myself by adapting code from: https://github.com/LynnHo/DCGAN-LSGAN-WGAN-GP-DRAGAN-Tensorflow-2. However, after even only 2000 steps (~1 epoch), the loss values I am getting for the critic and the generator are large (< 1000) and oscillate between negative and positive. My audio is the same piano recordings that they used, just resampled at 16000Hz and converted to mono from stereo.
My loss graphs are:
I was hoping someone could please validate whether my implementation is correct and if so, what experiments can I run to diagnose this problem?
Note: TIMESTEPS indicates the number of samples I wish to generate for each generator pass. Currently this is set to 1 to replicate WaveGAN, and I wish to experiment with this in the future. For now, I don't think it is relevant to the issue.
My train.py script is:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import numpy as np
import librosa
import random
import os
import sys
import time
import GANModels
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(gpus[0], True)
MODEL_DIMS = 64
NUM_SAMPLES = 16384
TIMESTEPS = 1
D_UPDATES_PER_G_UPDATE = 5
GRADIENT_PENALTY_WEIGHT = 10.0
NOISE_LEN = 100
EPOCHS = 2000
EPOCHS_PER_SAMPLE = 2
BATCH_SIZE = 8
Fs = 16000
class GAN:
def __init__(self, model_dims=MODEL_DIMS, num_samples=NUM_SAMPLES, timesteps=TIMESTEPS, gradient_penalty_weight=GRADIENT_PENALTY_WEIGHT,
noise_len=NOISE_LEN, batch_size=BATCH_SIZE, sr=Fs):
self.model_dims = model_dims
self.num_samples = num_samples
self.timesteps = timesteps
self.noise_dims = (timesteps, noise_len)
self.batch_size = batch_size
self.G = GANModels.Generator(self.model_dims, self.timesteps, num_samples)
self.D = GANModels.Critic(self.model_dims, self.timesteps, num_samples)
self.G_optimizer = Adam(learning_rate=1e-4, beta_1=0.5, beta_2=0.9)
self.D_optimizer = Adam(learning_rate=1e-4, beta_1=0.5, beta_2=0.9)
print(self.G.summary())
print(self.D.summary())
self.gradient_penalty_weight = gradient_penalty_weight
self.sr = sr
def _d_loss_fn(self, r_logit, f_logit):
r_loss = - tf.reduce_mean(r_logit)
f_loss = tf.reduce_mean(f_logit)
return r_loss, f_loss
def _g_loss_fn(self, f_logit):
f_loss = - tf.reduce_mean(f_logit)
return f_loss
def _gradient_penalty(self, real, fake):
def _interpolate(a, b):
shape = [tf.shape(a)[0]] + [1] * (a.shape.ndims - 1)
alpha = tf.random.uniform(shape=shape, minval=0., maxval=1.)
inter = a + alpha * (b - a)
inter.set_shape(a.shape)
return inter
x = _interpolate(real, fake)
with tf.GradientTape() as t:
t.watch(x)
pred = self.D(x, training=True)
grad = t.gradient(pred, x)
norm = tf.norm(tf.reshape(grad, [tf.shape(grad)[0], -1]), axis=1)
gp = tf.reduce_mean((norm - 1.)**2)
return gp
#tf.function
def train_G(self):
with tf.GradientTape() as t:
z = tf.random.normal(shape=(self.batch_size,) + self.noise_dims)
x_fake = self.G(z, training=True)
x_fake_d_logit = self.D(x_fake, training=True)
G_loss = self._g_loss_fn(x_fake_d_logit)
G_grad = t.gradient(G_loss, self.G.trainable_variables)
self.G_optimizer.apply_gradients(zip(G_grad, self.G.trainable_variables))
return {'g_loss': G_loss}
#tf.function
def train_D(self, x_real):
with tf.GradientTape() as t:
z = tf.random.normal(shape=(x_real.shape[0],) + self.noise_dims) #Half fake and half real
x_fake = self.G(z, training=True)
x_real_d_logit = self.D(x_real, training=True)
x_fake_d_logit = self.D(x_fake, training=True)
x_real_d_loss, x_fake_d_loss = self._d_loss_fn(x_real_d_logit, x_fake_d_logit)
gp = self._gradient_penalty(x_real, x_fake)
D_loss = (x_real_d_loss + x_fake_d_loss) + gp * self.gradient_penalty_weight
D_grad = t.gradient(D_loss, self.D.trainable_variables)
self.D_optimizer.apply_gradients(zip(D_grad, self.D.trainable_variables))
return {'d_loss': x_real_d_loss + x_fake_d_loss, 'gp': gp}
def sample(self, epoch, num_samples=10):
z = tf.random.normal(shape=(num_samples,) + self.noise_dims)
result = self.G(z, training=False)
for i in range(num_samples):
audio = np.array(result[i, :, :])
audio.flatten()
librosa.output.write_wav(f"output/piano/{epoch}-{i}.wav", audio, sr=self.sr)
#############################################################################
gan = GAN()
X_train = []
for file in os.listdir(r"D:\ML_Datasets\mancini_piano\piano\train"):
with open(r"D:\ML_Datasets\mancini_piano\piano\train" + fr"\{file}", "rb") as f:
samples, _ = librosa.load(f, Fs)
if len(samples) < TIMESTEPS*NUM_SAMPLES:
audio = np.array([np.array([sample]) for sample in samples])
padding = np.zeros(shape=(TIMESTEPS*NUM_SAMPLES - len(samples), 1), dtype='float32')
X_train.append(np.append(audio, padding, axis=0))
else:
for i in range(len(samples) // (TIMESTEPS*NUM_SAMPLES)):
X_train.append(np.array([np.array([sample]) for sample in samples[:TIMESTEPS*NUM_SAMPLES]]))
samples = np.delete(samples, np.s_[:TIMESTEPS*NUM_SAMPLES])
print(f"X_train shape = {(len(X_train),) + X_train[0].shape}")
librosa.output.write_wav("output/piano/test.wav", X_train[0], sr=Fs)
train_summary_writer = tf.summary.create_file_writer("logs/train")
with train_summary_writer.as_default():
steps_per_epoch = len(X_train) // BATCH_SIZE
for e in range(EPOCHS):
for i in range(steps_per_epoch):
D_loss_sum = 0
for n in range(D_UPDATES_PER_G_UPDATE):
D_loss_dict = gan.train_D(np.array(random.sample(X_train, BATCH_SIZE)))
D_loss_sum += D_loss_dict['d_loss']
D_loss = D_loss_sum / D_UPDATES_PER_G_UPDATE
G_loss_dict = gan.train_G()
G_loss = G_loss_dict['g_loss']
tf.summary.scalar('d_loss', D_loss, step=(e*steps_per_epoch)+i)
tf.summary.scalar('g_loss', G_loss, step=(e*steps_per_epoch)+i)
print(f"step {(e*steps_per_epoch)+i}: d_loss = {D_loss} g_loss = {G_loss}")
if e % EPOCHS_PER_SAMPLE == 0:
gan.sample(e)
My GANModels.py script is:
def Generator(d, a, num_samples, c=16):
# Prelim layers
input_layer = Input(shape=(100,))
dense_layer0 = Dense(256*d, input_shape=(100,))(input_layer)#
reshape_layer0 = Reshape((c, c*d))(dense_layer0)#
relu_layer0 = Activation('relu')(reshape_layer0)#
# WaveCNN layers
c //= 2
expanded_layer0 = Lambda(lambda x: K.expand_dims(x, axis=1))(relu_layer0)#relu_layer1
conv1d_t_layer0 = Conv2DTranspose(c*d, (1, 25), strides=(1, 4), padding='same')(expanded_layer0)
slice_layer0 = Lambda(lambda x: x[:, 0])(conv1d_t_layer0)
relu_layer2 = Activation('relu')(slice_layer0)
c //= 2
expanded_layer1 = Lambda(lambda x: K.expand_dims(x, axis=1))(relu_layer2)
conv1d_t_layer1 = Conv2DTranspose(c*d, (1, 25), strides=(1, 4), padding='same')(expanded_layer1)
slice_layer1 = Lambda(lambda x: x[:, 0])(conv1d_t_layer1)
relu_layer3 = Activation('relu')(slice_layer1)
c //= 2
expanded_layer2 = Lambda(lambda x: K.expand_dims(x, axis=1))(relu_layer3)
conv1d_t_layer2 = Conv2DTranspose(c*d, (1, 25), strides=(1, 4), padding='same')(expanded_layer2)
slice_layer2 = Lambda(lambda x: x[:, 0])(conv1d_t_layer2)
relu_layer4 = Activation('relu')(slice_layer2)
c //= 2
expanded_layer3 = Lambda(lambda x: K.expand_dims(x, axis=1))(relu_layer4)
conv1d_t_layer3 = Conv2DTranspose(c*d, (1, 25), strides=(1, 4), padding='same')(expanded_layer3)
slice_layer3 = Lambda(lambda x: x[:, 0])(conv1d_t_layer3)
relu_layer5 = Activation('relu')(slice_layer3)
expanded_layer4 = Lambda(lambda x: K.expand_dims(x, axis=1))(relu_layer5)
conv1d_t_layer4 = Conv2DTranspose(1, (1, 25), strides=(1, 4), padding='same')(expanded_layer4)#strides=(1,1)
slice_layer4 = Lambda(lambda x: x[:, 0])(conv1d_t_layer4)
tanh_layer0 = Activation('tanh')(slice_layer4)
model = Model(inputs=input_layer, outputs=tanh_layer0)
return model
def _apply_phaseshuffle(x, rad=2, pad_type='reflect'):
b, x_len, nch = x.get_shape().as_list()
phase = tf.random.uniform([], minval=-rad, maxval=rad + 1, dtype=tf.int32)
pad_l = tf.maximum(phase, 0)
pad_r = tf.maximum(-phase, 0)
phase_start = pad_r
x = tf.pad(x, [[0, 0], [pad_l, pad_r], [0, 0]], mode=pad_type)
x = x[:, phase_start:phase_start+x_len]
x.set_shape([b, x_len, nch])
return x
def Critic(d, a, num_samples, c=1):
input_layer = Input(shape=(a*num_samples, 1))#d*d
conv1d_layer0 = Conv1D(c*d, 25, strides=4, padding='same')(input_layer)#//2
LReLU_layer0 = LeakyReLU(alpha=0.2)(conv1d_layer0)
phaseshuffle_layer0 = Lambda(lambda x: _apply_phaseshuffle(x))(LReLU_layer0)
c *= 2
conv1d_layer1 = Conv1D(c*d, 25, strides=4, padding='same')(phaseshuffle_layer0)#d
LReLU_layer1 = LeakyReLU(alpha=0.2)(conv1d_layer1)
phaseshuffle_layer1 = Lambda(lambda x: _apply_phaseshuffle(x))(LReLU_layer1)
c *= 2
conv1d_layer2 = Conv1D(c*d, 25, strides=4, padding='same')(phaseshuffle_layer1)#2*d
LReLU_layer2 = LeakyReLU(alpha=0.2)(conv1d_layer2)
phaseshuffle_layer2 = Lambda(lambda x: _apply_phaseshuffle(x))(LReLU_layer2)
c *= 2
conv1d_layer3 = Conv1D(c*d, 25, strides=4, padding='same')(phaseshuffle_layer2)#4*d
LReLU_layer3 = LeakyReLU(alpha=0.2)(conv1d_layer3)
phaseshuffle_layer3 = Lambda(lambda x: _apply_phaseshuffle(x))(LReLU_layer3)
c *= 2
conv1d_layer4 = Conv1D(c*d, 25, strides=4, padding='same')(phaseshuffle_layer3)#8*d,strides=4
LReLU_layer4 = LeakyReLU(alpha=0.2)(conv1d_layer4)
phaseshuffle_layer4 = Lambda(lambda x: _apply_phaseshuffle(x))(LReLU_layer4)
slice_layer0 = Lambda(lambda x: x[:, 0])(phaseshuffle_layer4)
dense_layer1 = Dense(1, input_shape=(256*d,))(slice_layer0)
model = Model(inputs=input_layer, outputs=dense_layer1)
return model

Related

tensorflow implementation is much slower than pytorch (with slow gpu utility) and occupy more memory

I am trying to converting a pytorch implementation to tensorflow, but I found that the program occupies more memory, has lower gpu utility and is much more slower than pytorch version. I am wondering it is a general case, or I am wrong with the tf model structure?
I put the code on, it can be directly run.
https://colab.research.google.com/drive/1oI6GVnt3sAULvbMMAGTY4B6LsHguJd9U#scrollTo=DNH5pPynm-jm
The pytorch version uses half memory with twice gpu util compared with tf version.
Code:
import math
import os
import tensorflow as tf
import json
import numpy as np
import time
def calc_diffusion_step_embedding(diffusion_steps, diffusion_step_embed_dim_in):
assert diffusion_step_embed_dim_in % 2 == 0
half_dim = diffusion_step_embed_dim_in // 2
_embed = tf.math.log(tf.convert_to_tensor(10000.0)) / (half_dim - 1)
_embed = tf.math.exp(tf.cast(tf.experimental.numpy.arange(start = 0,stop = half_dim),dtype = tf.float32) * -_embed)
_embed = tf.cast(diffusion_steps,dtype=tf.float32) * _embed
diffusion_step_embed = tf.concat((tf.math.sin(_embed),
tf.math.cos(_embed)), 1)
assert diffusion_step_embed.shape[0] == diffusion_steps.shape[0]
assert diffusion_step_embed.shape[1] == diffusion_step_embed_dim_in
return diffusion_step_embed
class Residual_block(tf.keras.layers.Layer):
def __init__(self, res_channels, skip_channels,
diffusion_step_embed_dim_out, in_channels,
s4_lmax,
s4_d_state,
s4_dropout,
s4_bidirectional,
s4_layernorm):
super(Residual_block, self).__init__()
self.res_channels = res_channels
self.fc_t = tf.keras.layers.Dense(self.res_channels)
self.conv_layer = tf.keras.layers.Conv1D(filters=2 * self.res_channels, kernel_size=3, padding = 'SAME',use_bias=False, kernel_initializer='he_normal', data_format='channels_first')
self.cond_conv = tf.keras.layers.Conv1D(filters=2*self.res_channels, kernel_size=1, padding = 'SAME',use_bias=False, kernel_initializer='he_normal', data_format='channels_first')
self.res_conv1 = tf.keras.layers.Conv1D(filters=res_channels, kernel_size=1, padding = 'SAME',use_bias=False, kernel_initializer='he_normal', data_format='channels_first')
self.res_conv2 = tf.keras.layers.Conv1D(filters=skip_channels, kernel_size=1, padding = 'SAME',use_bias=False, kernel_initializer='he_normal', data_format='channels_first')
def call(self, input_data):
x, cond, diffusion_step_embed = input_data
h = x
B, C, L = h.shape
part_t = self.fc_t(diffusion_step_embed)
part_t = tf.reshape(part_t,[B, self.res_channels, 1])
h = h + part_t
h = self.conv_layer(h)
cond = self.cond_conv(cond)
h += cond
out = tf.math.tanh(h[:,:self.res_channels,:]) * tf.math.sigmoid(h[:,self.res_channels:,:])
res = self.res_conv1(out)
skip = self.res_conv2(out)
return (x + res) * tf.math.sqrt(0.5), skip # normalize for training stability
class Residual_group(tf.keras.Model):
def __init__(self, res_channels, skip_channels, num_res_layers,
diffusion_step_embed_dim_in,
diffusion_step_embed_dim_mid,
diffusion_step_embed_dim_out,
in_channels,
s4_lmax,
s4_d_state,
s4_dropout,
s4_bidirectional,
s4_layernorm):
super(Residual_group, self).__init__()
self.num_res_layers = num_res_layers
self.diffusion_step_embed_dim_in = diffusion_step_embed_dim_in
self.fc_t1 = tf.keras.layers.Dense(diffusion_step_embed_dim_mid)
self.fc_t2 = tf.keras.layers.Dense(diffusion_step_embed_dim_out)
self.residual_blocks = []
for n in range(self.num_res_layers):
self.residual_blocks.append(Residual_block(res_channels, skip_channels,
diffusion_step_embed_dim_out=diffusion_step_embed_dim_out,
in_channels=in_channels,
s4_lmax=s4_lmax,
s4_d_state=s4_d_state,
s4_dropout=s4_dropout,
s4_bidirectional=s4_bidirectional,
s4_layernorm=s4_layernorm))
def call(self, input_data):
h, conditional, diffusion_steps = input_data
diffusion_step_embed = calc_diffusion_step_embedding(diffusion_steps, self.diffusion_step_embed_dim_in)
diffusion_step_embed = tf.keras.activations.swish((self.fc_t1(diffusion_step_embed)))
diffusion_step_embed = tf.keras.activations.swish((self.fc_t2(diffusion_step_embed)))
#out = self.residual_blocks((h, tf.zeros((8,256,248)),conditional, diffusion_step_embed))
#skip = out[1]
skip = tf.zeros((8,256,248))
for n in range(self.num_res_layers):
h, skip_n = self.residual_blocks[n]((h, conditional, diffusion_step_embed))
skip += skip_n
return skip * tf.math.sqrt(1.0 / self.num_res_layers)
class SSSDS4Imputer(tf.keras.Model):
def __init__(self, in_channels, res_channels, skip_channels, out_channels,
num_res_layers,
diffusion_step_embed_dim_in,
diffusion_step_embed_dim_mid,
diffusion_step_embed_dim_out,
s4_lmax,
s4_d_state,
s4_dropout,
s4_bidirectional,
s4_layernorm):
super(SSSDS4Imputer, self).__init__()
# convert the dimension of input from (B,in_channels,L) to (B,res_channels,L)
self.init_conv = tf.keras.layers.Conv1D(filters=res_channels, kernel_size=1, padding = 'SAME',use_bias=False, kernel_initializer='he_normal', data_format='channels_first')
self.res_channels = res_channels
self.skip_channels = skip_channels
self.residual_layer = Residual_group(res_channels=res_channels,
skip_channels=skip_channels,
num_res_layers=num_res_layers,
diffusion_step_embed_dim_in=diffusion_step_embed_dim_in,
diffusion_step_embed_dim_mid=diffusion_step_embed_dim_mid,
diffusion_step_embed_dim_out=diffusion_step_embed_dim_out,
in_channels=in_channels,
s4_lmax=s4_lmax,
s4_d_state=s4_d_state,
s4_dropout=s4_dropout,
s4_bidirectional=s4_bidirectional,
s4_layernorm=s4_layernorm)
# convert the dimension from (B,skip_channels,L) to (B,out_channels,L)
self.final_conv1 = tf.keras.layers.Conv1D(filters=skip_channels, kernel_size=1, padding = 'SAME',use_bias=False, kernel_initializer='he_normal', data_format='channels_first')
self.final_conv2 = tf.keras.layers.Conv1D(filters=out_channels, kernel_size=1, padding = 'SAME',use_bias=False, kernel_initializer='zeros', data_format='channels_first')
def call(self, input_data):
x, conditional, mask, diffusion_steps = input_data
conditional = conditional * mask
conditional = tf.concat([conditional, tf.cast(mask,dtype=tf.float32)], axis=1)
x = tf.nn.relu(self.init_conv(x))
x = self.residual_layer((x, conditional, diffusion_steps))
y = tf.nn.relu(self.final_conv1(x))
y = tf.nn.relu(self.final_conv2(y))
return y
def train_step(X, y, net,loss_fn,optimizer):
with tf.GradientTape() as tape:
logits = net(X)
loss_value = loss_fn(y, logits)
grads = tape.gradient(loss_value, net.trainable_variables,unconnected_gradients=tf.UnconnectedGradients.ZERO)
optimizer.apply_gradients(zip(grads, net.trainable_variables))
return loss_value.numpy()
if __name__ == '__main__':
model_config = {'in_channels': 12, 'out_channels': 12, 'num_res_layers': 36, 'res_channels': 256, 'skip_channels': 256,
'diffusion_step_embed_dim_in': 128, 'diffusion_step_embed_dim_mid': 512, 'diffusion_step_embed_dim_out': 512,
's4_lmax': 250, 's4_d_state': 64, 's4_dropout': 0.0, 's4_bidirectional': 1, 's4_layernorm': 1}
net = SSSDS4Imputer(**model_config)
# define optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
#training
n_iter = 0
#iterator = iter(dataset)
while n_iter < 150000 + 1:
#try:
X = (tf.random.normal([8,12,248], 0, 1, tf.float32, seed=1),tf.random.normal([8,12,248], 0, 1, tf.float32, seed=2),
tf.random.normal([8,12,248], 0, 1, tf.float32, seed=2),tf.random.normal([8,1], 0, 1, tf.float32, seed=4))
y = tf.random.normal([8,12,248], 0, 1, tf.float32, seed=1)
t0 = time.time()
loss = train_step(X,y,net,tf.keras.losses.MeanSquaredError(),optimizer)
print(time.time()-t0)
n_iter += 1
The tensorflow version I use is 2.4.1

Error in defining custom layers using subclasses in keras

I am getting an error in defining layers to my model. The input for self.layer[i] is not working when I'm implementing X = self.layers[i](A, A)inside the code. But the layer[i] which is made from GTLayer, works well for when I call it separetly, A = tf.random.uniform(shape=[2,2]) d = GTLayer(2,2,1) d.call(A,A)
..
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
class GTN(keras.Model): # layers.Layer keeps track of everything under the hood!
def __init__(self, num_edge, num_channels, w_in, w_out, num_class,num_layers,norm):
super(GTN, self).__init__()
self.num_layers = 3
self.num_edge = 6
self.num_channels = 3
self.w_in = 2#tf.random.uniform(shape=[2,2])
self.w_out = 2#tf.random.uniform(shape=[2,2])
self.num_class =2
layers = []
for i in tf.range(num_layers):
print (4)
if i == 0:
layers.append(GTLayer(num_edge, num_channels, first=True))
else:
print(i)
layers.append(GTLayer(num_edge, num_channels, first=False))
#print ((self.w_out*self.num_channels, ))
self.linear1 = tf.keras.layers.Dense( self.w_out, input_shape =(self.w_out*self.num_channels, ), activation= None)
self.linear2 = tf.keras.layers.Dense( self.num_class, input_shape=(self.w_out, ), activation= None)
def call(self, A, X, target_x, target):
#A = tf.expand_dims(A, 0)
Ws = []
print('hello')
for i in range(self.num_layers):
print('i:',i)
if i == 0:
print('layers:',layers)
print('layers[i](A):', self.layers[i](A))
H, W = self.layers[i](A) #self.layers = nn.ModuleList(layers)
else:
#H = self.normalization(H)
print(H)
print(W)
print('A', A)
X = self.layers[i](A, A)
print('H_dash',self.layers[i](A))
H, W = self.layers[i](A, H)
Ws.append(W)
for i in range(self.num_channels):
if i==0:
X_ = tf.nn.relu(self.gcn_conv(X,H[i])).numpy()
else:
X_tmp = tf.nn.relu(self.gcn_conv(X,H[i])).numpy()
X_ = tf.concat((X_,X_tmp), dim=1)
X_ = self.linear1(X_)
X_ = tf.nn.relu(X_).numpy()
y = self.linear2(X_[target_x])
loss = self.loss(y, target)
return loss, y, Ws
class GTLayer(keras.layers.Layer):
def __init__(self, in_channels, out_channels, first=True):
super(GTLayer, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.first = first
def call(self, A, H_ = None):
if self.first == True:
a = tf.random.uniform(shape=[2,2])
b = tf.random.uniform(shape=[2,2])
print(a)
print(b)
#H = torch.bmm(np.array(a),np.array(b))
H = tf.matmul( a, b)
else:
a = tf.random.uniform(shape=[2])
H = tf.random.uniform(shape=[2])
return H,W
Input used for check:
d = GTN(2,3,4,2,2,2,1)
A = tf.random.uniform(shape=[2,2])
X = tf.random.uniform(shape=[2,2])
t_x = 5
t = 4
d.call(A,X,t_x,t)
Error:
---> 47 X = self.layers[i](A, A)
TypeError: call() takes 2 positional arguments but 3 were given

Trouble Training Same Tensorflow Model in PyTorch

I have trained a model in Tensorflow and am having trouble replicating it in PyTorch. The Tensorflow model achieves near 100% accuracy (the task is simple), but the PyTorch model performs at random. I've spent a while trying to figure this out, and can't understand what the problem could be.
The model is trained for the task of binary classification. Given an input utterance describing a quadrant and a (x, y, z) coordinate, the model has to predict if the (x, z) portion of the coordinate is in the quadrant described. For example, if the input text was "quadrant 1" and the coordinate was (0.5, -, 0.5), then the prediction should be true, but if the region was "quadrant 2" with the same coordinate, then the prediction should be false.
I generated some data and trained the model in Tensorflow using this code:
x_data_placeholder = tf.placeholder(tf.float32, [FLAGS.batch_size, 1], name="x_data")
y_data_placeholder = tf.placeholder(tf.float32, [FLAGS.batch_size, 1], name="y_data")
z_data_placeholder = tf.placeholder(tf.float32, [FLAGS.batch_size, 1], name="z_data")
# text and labels placeholders
text_data = tf.placeholder(tf.int32, [FLAGS.batch_size, maxtextlength])
text_lengths = tf.placeholder(tf.int32, [FLAGS.batch_size])
y_labels_placeholder = tf.placeholder(tf.int64, [FLAGS.batch_size])
# encode text and coordinate
embeddings = tf.Variable(tf.random_uniform([100, embedding_size], -1, -1))
rnn_inputs = tf.nn.embedding_lookup(embeddings, text_data)
rnn_layers = [tf.compat.v1.nn.rnn_cell.LSTMCell(size, initializer=tf.compat.v1.keras.initializers.glorot_normal) for size in [256]]
multi_rnn_cell = tf.compat.v1.nn.rnn_cell.MultiRNNCell(rnn_layers, state_is_tuple=True)
text_outputs, text_fstate = tf.compat.v1.nn.dynamic_rnn(cell=multi_rnn_cell,
inputs=rnn_inputs,
dtype=tf.float32, sequence_length=text_lengths)
# have fully connected layers to map them the input coordinates into the same dimension as the LSTM output layer from above
x_output_layer = tf.compat.v1.layers.dense(x_data_placeholder, units=FLAGS.fc_column_size, activation=tf.nn.relu, name='x_coordinate')
y_output_layer = tf.compat.v1.layers.dense(y_data_placeholder, units=FLAGS.fc_column_size, activation=tf.nn.relu, name='y_coordinate')
z_output_layer = tf.compat.v1.layers.dense(z_data_placeholder, units=FLAGS.fc_column_size, activation=tf.nn.relu, name='z_coordinate')
# add the representations
total_output_layer = x_output_layer + y_output_layer + z_output_layer + lstm_output_layer
# make the predictions with two fully connected layers
fc_1 = tf.compat.v1.layers.dense(total_output_layer, units=FLAGS.hidden_layer_size, activation=tf.nn.relu, name='fc_1')
logits = tf.compat.v1.layers.dense(fc_1, units=FLAGS.output_dims, activation=None, name='logits')
# train the model
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_labels_placeholder, logits=logits))
optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate, epsilon=1e-7)
gradients, variables = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, FLAGS.gradient_clip_threshold)
optimize = optimizer.apply_gradients(zip(gradients, variables))
# then it'll be trained with sess.run ...
Now for the PyTorch replication:
class BaselineModel(nn.Module):
def __init__(self):
super(BaselineModel, self).__init__()
self.encode_x = nn.Linear(1, embed_size)
self.encode_y = nn.Linear(1, embed_size)
self.encode_z = nn.Linear(1, embed_size)
self._embeddings = nn.Embedding(vocab_size, self.embedding_table_size)
nn.init.uniform_(self._embeddings.weight, -1.0, 1.0)
self.num_layers = 1
self.rnn = nn.LSTM(self.embedding_table_size, self.hidden_size, batch_first=True)
self.fc_after_text_lstm = nn.Linear(self.hidden_size, 100)
self.fc = nn.Linear(100, 256)
self.fc_final = nn.Linear(256, 2)
self.relu_activation = nn.ReLU()
self.softmax = nn.Softmax(dim=1)
def init_hidden(self, batch_size, device='cuda:0'):
# for LSTM, we need # of layers
h_0 = torch.zeros(1, batch_size, self.hidden_size).to(device)
c_0 = torch.zeros(1, batch_size, self.hidden_size).to(device)
return h_0, c_0
def forward(self, input_text, x_coordinate=None, y_coordinate=None, z_coordinate=None):
x_embed = self.relu_activation(self.encode_x(x_coordinate.cuda().to(torch.float32)).cuda())
y_embed = self.relu_activation(self.encode_y(y_coordinate.cuda().to(torch.float32))).cuda()
z_embed = self.relu_activation(self.encode_z(z_coordinate.cuda().to(torch.float32))).cuda()
embeds = self._embeddings(input_text)
embedding, hidden = self.rnn(embeds, self.hidden)
text_fc = self.relu_activation(self.fc_after_text_lstm(embedding[:, -1]))
representations_so_far_added = torch.sum(torch.stack([text_fc, x_embed, y_embed, z_embed]), dim=0)
pre_final_embedding = self.relu_activation(self.fc(representations_so_far_added))
return self.fc_final(pre_final_embedding )
### training code
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, eps=1e-7)
criterion = nn.CrossEntropyLoss()
for input_text, x_coordinate, y_coordinate, z_coordinate, targets, train_data:
optimizer.zero_grad()
pred = model(input_text, x_coordinate=x_coordinate, y_coordinate=y_coordinate, z_coordinate=z_coordinate)
loss = criterion(pred.float(), targets)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
optimizer.step()
scheduler.step()
# accuracy evaluation code, this is evaluated over the entire epoch
pred_idx = F.log_softmax(pred, dim=1)
target_labels = targets.cpu().int()
pred_labels = torch.argmax(pred_idx, dim=-1).cpu().data.int()
curr_acc = skm.accuracy_score(target_labels, pred_labels)
If anyone can spot any issue with the PyTorch implementation or maybe tell me what could be wrong, that would be much appreciated! I also tried to load the weights of the Tensorflow model into all the appropriate layers, and performance still struggles in PyTorch! Thanks in advance!
EDIT:
I have created a minimally reproducible example, because I still cannot figure out what the problem is. Any help would be still appreciated!
import torch
import torch.nn as nn
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lr = 0.0005
n_epochs = 10
input_dim = 4
hidden_dim = 128
layer_dim = 2
output_dim = 2
batch_size = 50
class FeatureDataSet(torch.utils.data.Dataset):
def __init__(self, x_train, y_train, x_coordinates):
self.x_train = torch.tensor(x_train, dtype=torch.long)
self.y_train = torch.tensor(y_train)
self.x_coordinates = torch.tensor(x_coordinates, dtype=torch.float32)
def __len__(self):
return len(self.y_train)
def __getitem__(self, idx):
return self.x_train[idx], self.y_train[idx], self.x_coordinates[idx]
class RNN(nn.Module):
def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, batch_size):
super().__init__()
self.hidden_dim = hidden_dim
self.layer_dim = layer_dim
# linear layer to encode the coordinate
self.encode_x = nn.Linear(1, hidden_dim).cuda()
self._embeddings = nn.Embedding(40, 100).cuda()
# hidden_dim is 128
# layer_dim is 2
self.lstm = nn.LSTM(100, hidden_dim, layer_dim, batch_first=True).cuda()
self.fc = nn.Linear(2 * hidden_dim, output_dim).cuda()
self.batch_size = batch_size
self.hidden = None
def init_hidden(self, x):
h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)
c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)
return [t.cpu() for t in (h0, c0)]
def forward(self, x, x_coordinate):
#initializing the hidden states
h0, c0 = self.init_hidden(x)
embeds = self._embeddings(x)
out, (hn, cn) = self.lstm(embeds.cuda(), (h0.cuda(), c0.cuda()))
x_embed = F.relu(self.encode_x(x_coordinate.cuda().to(torch.float32)).cuda())
representations_so_far_added = torch.cat([out[:, -1, :], x_embed], dim=1)
out = self.fc(representations_so_far_added)
return out
model = RNN(input_dim, hidden_dim, layer_dim, output_dim, batch_size)
criterion = nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.parameters(), lr=0.001)
print('Start model training')
import sklearn.metrics as skm
import torch.nn.functional as F
x_train = []
x_coordinates = []
y_train = []
for i in range(10000):
# create the data. if x_coordinate > 0 and the sentence says that (represented by [1, 5, 6, 8]), then we should predict positive else negative (if the x_coordinate > 0)
# same applies if the x_coordinate < 0, just that the sentence is now [1, 5, 6, 9]
if np.random.randint(0, 2) == 0:
if np.random.randint(0, 2) == 0:
# x coordinate > 0
x_train.append([1, 5, 6, 8])
x_coordinates.append([round(np.random.uniform(0.01, 1.00, 1)[0], 2)])
y_train.append(1.0)
else:
# x coordinate > 0 negative
x_train.append([1, 5, 6, 8])
x_coordinates.append([round(np.random.uniform(-1.00, 0.00, 1)[0], 2)])
y_train.append(0.0)
else:
if np.random.randint(0, 2) == 0:
# x coordinate < 0
x_train.append([1, 5, 6, 9])
x_coordinates.append([round(np.random.uniform(-1.00, 0.00, 1)[0], 2)])
y_train.append(1.0)
else:
# x coordinate < 0 negative
x_train.append([1, 5, 6, 9])
x_coordinates.append([round(np.random.uniform(0.01, 1.00, 1)[0], 2)])
y_train.append(0.0)
# print a sample of data
print(x_train[:10])
print(y_train[:10])
print(x_coordinates[:10])
# create a dataloader
trainingDataset = FeatureDataSet(x_train=x_train, y_train=y_train, x_coordinates=x_coordinates)
train_loader = torch.utils.data.DataLoader(dataset=trainingDataset, batch_size=batch_size, shuffle=True)
# for each epoch
for epoch in range(1, n_epochs + 1):
acc_all = []
# each batch
for i, (x_batch, y_batch, x_coord_batch) in enumerate(train_loader):
x_batch = x_batch.to(device)
y_batch = y_batch.to(device)
x_coord_batch = x_coord_batch.to(device)
opt.zero_grad()
# pass in the text (x_batch) and coordinate (x_coord_batch)
out = model(x_batch, x_coordinate=x_coord_batch)
loss = criterion(out.float(), y_batch.type(torch.LongTensor).cuda())
loss.backward()
opt.step()
pred_idx = F.log_softmax(out, dim=1)
target_labels = y_batch.cpu().int()
pred_labels = torch.argmax(pred_idx, dim=-1).cpu().data.int()
curr_acc = skm.accuracy_score(target_labels, pred_labels)
acc_all.append(curr_acc)
print(np.mean(acc_all))
I suppose perhaps there are some mistakes in your dataset implementation in the PyTorch version.
I tried your pytorch BaselineModel on both the dataset in your "minimally reproducible example" and my own dataset constructed according to your description, and find that it works fine.
The following is my codes for testing on my own dataset. Note that I add several hyperparameters to the code of BaselineModel to make it run. I got accuracy over 99%.
import random
import torch
import torch.nn as nn
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lr = 0.0005
n_epochs = 100
input_dim = 4
hidden_dim = 128
layer_dim = 2
output_dim = 2
batch_size = 50
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
class FeatureDataSet(torch.utils.data.Dataset):
def __init__(self, x_train, y_train, x_coordinates, y_coordinates, z_coordinates):
self.x_train = torch.tensor(x_train, dtype=torch.long)
self.y_train = torch.tensor(y_train)
self.x_coordinates = torch.tensor(x_coordinates, dtype=torch.float32)
self.y_coordinates = torch.tensor(y_coordinates, dtype=torch.float32)
self.z_coordinates = torch.tensor(z_coordinates, dtype=torch.float32)
def __len__(self):
return len(self.y_train)
def __getitem__(self, idx):
return self.x_train[idx], self.y_train[idx], self.x_coordinates[idx], self.y_coordinates[idx], self.z_coordinates[idx]
class BaselineModel(nn.Module):
def __init__(self):
super(BaselineModel, self).__init__()
vocab_size = 40
self.hidden_size = 100
self.embedding_table_size = self.hidden_size
self.encode_x = nn.Linear(1, self.hidden_size)
self.encode_y = nn.Linear(1, self.hidden_size)
self.encode_z = nn.Linear(1, self.hidden_size)
self._embeddings = nn.Embedding(vocab_size, self.embedding_table_size)
nn.init.uniform_(self._embeddings.weight, -1.0, 1.0)
self.num_layers = 1
self.rnn = nn.LSTM(self.embedding_table_size, self.hidden_size, batch_first=True)
self.fc_after_text_lstm = nn.Linear(self.hidden_size, 100)
self.fc = nn.Linear(100, 256)
self.fc_final = nn.Linear(256, 2)
self.relu_activation = nn.ReLU()
self.softmax = nn.Softmax(dim=1)
self.hidden = self.init_hidden(batch_size)
def init_hidden(self, batch_size, device='cuda:0'):
# for LSTM, we need # of layers
h_0 = torch.zeros(1, batch_size, self.hidden_size).to(device)
c_0 = torch.zeros(1, batch_size, self.hidden_size).to(device)
return h_0, c_0
def forward(self, input_text, x_coordinate=None, y_coordinate=None, z_coordinate=None):
x_embed = self.relu_activation(self.encode_x(x_coordinate.cuda().to(torch.float32)).cuda())
y_embed = self.relu_activation(self.encode_y(y_coordinate.cuda().to(torch.float32))).cuda()
z_embed = self.relu_activation(self.encode_z(z_coordinate.cuda().to(torch.float32))).cuda()
embeds = self._embeddings(input_text)
embedding, hidden = self.rnn(embeds, self.hidden)
text_fc = self.relu_activation(self.fc_after_text_lstm(embedding[:, -1]))
representations_so_far_added = torch.sum(torch.stack([text_fc, x_embed, y_embed, z_embed]), dim=0)
pre_final_embedding = self.relu_activation(self.fc(representations_so_far_added))
return self.fc_final(pre_final_embedding)
# model = RNN(input_dim, hidden_dim, layer_dim, output_dim, batch_size)
model = BaselineModel().cuda()
criterion = nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.parameters(), lr=0.001)
print('Start model training')
import sklearn.metrics as skm
import torch.nn.functional as F
x_train = []
x_coordinates = []
y_coordinates = []
z_coordinates = []
y_train = []
for i in range(10000):
x_coordinate = round(np.random.uniform(-1, 1.00, 1)[0], 2)
y_coordinate = round(np.random.uniform(-1, 1.00, 1)[0], 2)
z_coordinate = round(np.random.uniform(-1, 1.00, 1)[0], 2)
x_coordinates.append([x_coordinate])
y_coordinates.append([y_coordinate])
z_coordinates.append([z_coordinate])
if np.random.randint(0, 2) == 0: # positive example
if x_coordinate <= 0 and z_coordinate <= 0:
x_train.append([1, 5, 6, 8])
elif x_coordinate <= 0 and z_coordinate > 0:
x_train.append([1, 5, 6, 9])
elif x_coordinate > 0 and z_coordinate <= 0:
x_train.append([1, 5, 6, 10])
elif x_coordinate > 0 and z_coordinate > 0:
x_train.append([1, 5, 6, 11])
y_train.append(1.0)
else:
if x_coordinate <= 0 and z_coordinate <= 0:
x_train.append(random.choice([[1, 5, 6, 9], [1, 5, 6, 10], [1, 5, 6, 11]]))
elif x_coordinate <= 0 and z_coordinate > 0:
x_train.append(random.choice([[1, 5, 6, 8], [1, 5, 6, 10], [1, 5, 6, 11]]))
elif x_coordinate > 0 and z_coordinate <= 0:
x_train.append(random.choice([[1, 5, 6, 8], [1, 5, 6, 9], [1, 5, 6, 11]]))
elif x_coordinate > 0 and z_coordinate > 0:
x_train.append(random.choice([[1, 5, 6, 8], [1, 5, 6, 9], [1, 5, 6, 10]]))
y_train.append(0.0)
# print a sample of data
print(x_train[:10])
print(y_train[:10])
print(x_coordinates[:10])
print(y_coordinates[:10])
print(z_coordinates[:10])
# create a dataloader
trainingDataset = FeatureDataSet(x_train=x_train, y_train=y_train, x_coordinates=x_coordinates, y_coordinates=y_coordinates, z_coordinates=z_coordinates)
train_loader = torch.utils.data.DataLoader(dataset=trainingDataset, batch_size=batch_size, shuffle=True)
# for each epoch
loss_meter = AverageMeter()
for epoch in range(1, n_epochs + 1):
acc_all = []
# each batch
loss_meter.reset()
for i, (x_batch, y_batch, x_coord_batch, y_coord_batch, z_coord_batch) in enumerate(train_loader):
x_batch = x_batch.to(device)
y_batch = y_batch.to(device)
x_coord_batch = x_coord_batch.to(device)
y_coord_batch = y_coord_batch.to(device)
z_coord_batch = z_coord_batch.to(device)
opt.zero_grad()
# pass in the text (x_batch) and coordinate (x_coord_batch)
out = model(x_batch, x_coordinate=x_coord_batch, y_coordinate=y_coord_batch, z_coordinate=z_coord_batch)
loss = criterion(out.float(), y_batch.type(torch.LongTensor).cuda())
loss.backward()
opt.step()
pred_idx = F.log_softmax(out, dim=1)
target_labels = y_batch.cpu().int()
pred_labels = torch.argmax(pred_idx, dim=-1).cpu().data.int()
curr_acc = skm.accuracy_score(target_labels, pred_labels)
acc_all.append(curr_acc)
loss_meter.update(loss.item())
print(np.mean(acc_all))
print("loss is %f" % loss_meter.val)
As for the "minimally reproducible example", I think the model RNN doesn't work is quite reasonable, as I have stated in the comments. I suppose that tensorflow can not fit as well, although I have not tried it. Your "minimally reproducible example" may be unrelated to your main problem.

Why arbitrarily define a tf.constant (not using it anywhere) in a project which built by tf1 will cause network performance degradation?

Why does pinn's Burgers identification code (which can be understood as a TF1 neural network) define a tf.constant whether in the network class, in the main function or the outermost (not using it in any functions) will cause network performance degradation?
Before defining it, my network has better prediction accuracy and convergence time.
Here is the code:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
from scipy.interpolate import griddata
from plotting import newfig, savefig
from mpl_toolkits.axes_grid1 import make_axes_locatable
import matplotlib.gridspec as gridspec
import time
np.random.seed(1234)
tf.set_random_seed(1234)
class PhysicsInformedNN:
# Initialize the class
def __init__(self, X, u, layers, lb, ub):
self.n=tf.constant([0.0])
self.lb = lb
self.ub = ub
self.x = X[:,0:1]
self.t = X[:,1:2]
self.u = u
self.layers = layers
# Initialize NNs
self.weights, self.biases = self.initialize_NN(layers)
# tf placeholders and graph
self.sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
log_device_placement=True))
# Initialize parameters
self.lambda_1 = tf.Variable([0.0], dtype=tf.float32)
self.lambda_2 = tf.Variable([-6.0], dtype=tf.float32)
self.x_tf = tf.placeholder(tf.float32, shape=[None, self.x.shape[1]])
self.t_tf = tf.placeholder(tf.float32, shape=[None, self.t.shape[1]])
self.u_tf = tf.placeholder(tf.float32, shape=[None, self.u.shape[1]])
self.u_pred = self.net_u(self.x_tf, self.t_tf)
self.f_pred = self.net_f(self.x_tf, self.t_tf)
self.loss = tf.reduce_mean(tf.square(self.u_tf - self.u_pred)) + \
tf.reduce_mean(tf.square(self.f_pred))
self.optimizer = tf.contrib.opt.ScipyOptimizerInterface(self.loss,
method = 'L-BFGS-B',
options = {'maxiter': 50000,
'maxfun': 50000,
'maxcor': 50,
'maxls': 50,
'ftol' : 1.0 * np.finfo(float).eps})
self.optimizer_Adam = tf.train.AdamOptimizer()
self.train_op_Adam = self.optimizer_Adam.minimize(self.loss)
init = tf.global_variables_initializer()
self.sess.run(init)
def initialize_NN(self, layers):
weights = []
biases = []
num_layers = len(layers)
for l in range(0,num_layers-1):
W = self.xavier_init(size=[layers[l], layers[l+1]])
b = tf.Variable(tf.zeros([1,layers[l+1]], dtype=tf.float32), dtype=tf.float32)
weights.append(W)
biases.append(b)
return weights, biases
def xavier_init(self, size):
in_dim = size[0]
out_dim = size[1]
xavier_stddev = np.sqrt(2/(in_dim + out_dim))
return tf.Variable(tf.truncated_normal([in_dim, out_dim], stddev=xavier_stddev), dtype=tf.float32)
def neural_net(self, X, weights, biases):
num_layers = len(weights) + 1
H = 2.0*(X - self.lb)/(self.ub - self.lb) - 1.0
for l in range(0,num_layers-2):
W = weights[l]
b = biases[l]
H = tf.tanh(tf.add(tf.matmul(H, W), b))
W = weights[-1]
b = biases[-1]
Y = tf.add(tf.matmul(H, W), b)
return Y
def net_u(self, x, t):
u = self.neural_net(tf.concat([x,t],1), self.weights, self.biases)
return u
def net_f(self, x, t):
lambda_1 = self.lambda_1
lambda_2 = tf.exp(self.lambda_2)
u = self.net_u(x,t)
u_t = tf.gradients(u, t)[0]
u_x = tf.gradients(u, x)[0]
u_xx = tf.gradients(u_x, x)[0]
f = u_t + lambda_1*u*u_x - lambda_2*u_xx
return f
def callback(self, loss, lambda_1, lambda_2):
print('Loss: %e, l1: %.5f, l2: %.5f' % (loss, lambda_1, np.exp(lambda_2)))
def train(self, nIter):
tf_dict = {self.x_tf: self.x, self.t_tf: self.t, self.u_tf: self.u}
start_time = time.time()
for it in range(nIter):
self.sess.run(self.train_op_Adam, tf_dict)
# Print
if it % 10 == 0:
elapsed = time.time() - start_time
loss_value = self.sess.run(self.loss, tf_dict)
lambda_1_value = self.sess.run(self.lambda_1)#run的参数为返回值fetches
lambda_2_value = np.exp(self.sess.run(self.lambda_2))
print('It: %d, Loss: %.3e, Lambda_1: %.3f, Lambda_2: %.6f, Time: %.2f' %
(it, loss_value, lambda_1_value, lambda_2_value, elapsed))
start_time = time.time()
self.optimizer.minimize(self.sess,
feed_dict = tf_dict,
fetches = [self.loss, self.lambda_1, self.lambda_2],
loss_callback = self.callback)
def predict(self, X_star):
tf_dict = {self.x_tf: X_star[:,0:1], self.t_tf: X_star[:,1:2]}
u_star = self.sess.run(self.u_pred, tf_dict)
f_star = self.sess.run(self.f_pred, tf_dict)
return u_star, f_star
if __name__ == "__main__":
nu = 0.01/np.pi
N_u = 2000
layers = [2, 20, 20, 20, 20, 20, 20, 20, 20, 1]
data = scipy.io.loadmat('../Data/burgers_shock.mat')
t = data['t'].flatten()[:,None]
x = data['x'].flatten()[:,None]
Exact = np.real(data['usol']).T
X, T = np.meshgrid(x,t)
X_star = np.hstack((X.flatten()[:,None], T.flatten()[:,None]))
u_star = Exact.flatten()[:,None]
# Doman bounds
lb = X_star.min(0)
ub = X_star.max(0)
######################################################################
######################## Noiseles Data ###############################
######################################################################
noise = 0.0
idx = np.random.choice(X_star.shape[0], N_u, replace=False)
X_u_train = X_star[idx,:]
u_train = u_star[idx,:]
model = PhysicsInformedNN(X_u_train, u_train, layers, lb, ub)
model.train(0)
u_pred, f_pred = model.predict(X_star)
error_u = np.linalg.norm(u_star-u_pred,2)/np.linalg.norm(u_star,2)
#U_pred = griddata(X_star, u_pred.flatten(), (X, T), method='cubic')
lambda_1_value = model.sess.run(model.lambda_1)
lambda_2_value = model.sess.run(model.lambda_2)
lambda_2_value = np.exp(lambda_2_value)
error_lambda_1 = np.abs(lambda_1_value - 1.0)*100
error_lambda_2 = np.abs(lambda_2_value - nu)/nu * 100
print('Error u: %e' % (error_u))
print('Error l1: %.5f%%' % (error_lambda_1))
print('Error l2: %.5f%%' % (error_lambda_2))

TensorFlow training with large dataset takes too long

Yesterday, I have created a pretrained VGG19 with custom head and tried to train it with 60000 images. After more than 12 hours, the training of first epoch didn't complete.
The batch size has been set to 64 and the number of steps per epoch has been set to training_set_size/batch_size.
Below is the code of DataLoader:
IMAGE_CHANNEL = 3
def crop(image, margin):
return image[margin:-margin, margin:-margin]
def random_rotation(image, angle):
M = cv2.getRotationMatrix2D((0, 0),angle,1)
rows,cols, _ = image.shape
new_img = cv2.warpAffine(image, M, (cols, rows))
return new_img
def get_generator(in_gen, should_augment=True):
weights = None
if should_augment:
image_gen = tf.keras.preprocessing.image.ImageDataGenerator(fill_mode='reflect',
data_format='channels_last',
brightness_range=[0.5, 1.5])
else:
image_gen = tf.keras.preprocessing.image.ImageDataGenerator(fill_mode='reflect',
data_format='channels_last',
brightness_range=[1, 1])
for items in in_gen:
in_x, in_y = items
g_x = image_gen.flow(255 * in_x, in_y, batch_size=in_x.shape[0])
x, y = next(g_x)
yield x / 255.0, y
class DataLoader:
def __init__(self, source_filename, dataset_path, image_size, batch_size, training_set_size=0.8, sample_size=None):
path_dataset = Path(dataset_path)
path_image_folders = path_dataset / 'images'
self.data = pd.read_pickle(source_filename)
if sample_size is not None:
self.data = self.data[:sample_size]
self.image_size = image_size
self.batch_size = batch_size
self.training_set_size = training_set_size
self.steps_per_epoch = int(self.data.shape[0] * training_set_size // batch_size)
if self.steps_per_epoch == 0: self.steps_per_epoch = 1
self.validation_steps = int(self.data.shape[0] * (1 - training_set_size)//batch_size)
if self.validation_steps == 0: self.validation_steps = 1
def draw_idx(self, i):
img_path = self.data.iloc[i].image
img = tf.keras.preprocessing.image.img_to_array(tf.keras.preprocessing.image.load_img(str(img_path)))
# print(img.shape)
height, width, _ = img.shape
fig = plt.figure(figsize=(15, 15), facecolor='w')
# original image
ax = fig.add_subplot(1, 1, 1)
ax.imshow(img / 255.0)
openness = self.data.iloc[i].Openness
conscientiousness = self.data.iloc[i].Conscientiousness
extraversion = self.data.iloc[i].Extraversion
agreeableness = self.data.iloc[i].Agreeableness
neuroticism = self.data.iloc[i].Neuroticism
ax.title.set_text(
f'O: {openness}, C: {conscientiousness}, E: {extraversion}, A: {agreeableness}, N: {neuroticism}')
plt.axis('off')
plt.tight_layout()
plt.show()
def get_image(self, index, data, should_augment):
# Read image and appropiate landmarks
image = cv2.imread(data['image'].values[index])
h, w, _ = image.shape
o, c, e, a, n = data[['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism']].values[
index]
should_flip = random.randint(0, 1)
should_rotate = random.randint(0, 1)
should_crop = random.randint(0, 1)
if should_augment:
if should_flip == 1:
# print("Image {} flipped".format(data['path'].values[index]))
image = cv2.flip(image, 1)
if should_rotate == 1:
angle = random.randint(-5, 5)
image = random_rotation(image, angle)
if should_crop == 1:
margin = random.randint(1, 10)
image = crop(image, margin)
image = cv2.resize(image, (self.image_size, self.image_size))
return [image, o, c, e, a, n]
def generator(self, data, should_augment=True):
while True:
# Randomize the indices to make an array
indices_arr = np.random.permutation(data.count()[0])
for batch in range(0, len(indices_arr), self.batch_size):
# slice out the current batch according to batch-size
current_batch = indices_arr[batch:(batch + self.batch_size)]
# initializing the arrays, x_train and y_train
x_train = np.empty(
[0, self.image_size, self.image_size, IMAGE_CHANNEL], dtype=np.float32)
y_train = np.empty([0, 5], dtype=np.int32)
for i in current_batch:
# get an image and its corresponding color for an traffic light
[image, o, c, e, a, n] = self.get_image(i, data, should_augment)
# Appending them to existing batch
x_train = np.append(x_train, [image], axis=0)
y_train = np.append(y_train, [[o, c, e, a, n]], axis=0)
# replace nan values with zeros
y_train = np.nan_to_num(y_train)
yield (x_train, y_train)
def get_training_and_test_generators(self, should_augment_training=True, should_augment_test=True):
msk = np.random.rand(len(self.data)) < self.training_set_size
train = self.data[msk]
test = self.data[~msk]
train_gen = self.generator(train, should_augment_training)
test_gen = self.generator(test, should_augment_test)
return get_generator(train_gen, should_augment_training), get_generator(test_gen, should_augment_test)
def show_batch_images_sample(self, images, landmarks, n_rows=3, n_cols=3):
assert n_rows * n_cols <= self.batch_size, "Number of expected images to display is larger than batch!"
fig = plt.figure(figsize=(15, 15))
xs, ys = [], []
count = 1
for img, y in zip(images, landmarks):
ax = fig.add_subplot(n_rows, n_cols, count)
ax.imshow(img)
h, w, _ = img.shape
o, c, e, a, n = y
ax.title.set_text(f'{o}, {c}, {e}, {a}, {n}')
ax.axis('off')
if count == n_rows * n_cols:
break
count += 1
class CallbackTensorboardImageOutput(Callback):
def __init__(self, model, generator, log_dir, feed_inputs_display=9):
# assert ((feed_inputs_display & (feed_inputs_display - 1)) == 0) and feed_inputs_display != 0
self.generator = generator
self.model = model
self.log_dir = log_dir
self.writer = tf.summary.create_file_writer(self.log_dir)
self.feed_inputs_display = feed_inputs_display
self.seen = 0
def plot_to_image(figure):
"""Converts the matplotlib plot specified by 'figure' to a PNG image and
returns it. The supplied figure is closed and inaccessible after this call."""
# Save the plot to a PNG in memory.
buf = io.BytesIO()
plt.savefig(buf, format='png')
# Closing the figure prevents it from being displayed directly inside
# the notebook.
plt.close(figure)
buf.seek(0)
# Convert PNG buffer to TF image
image = tf.image.decode_png(buf.getvalue(), channels=4)
# Add the batch dimension
image = tf.expand_dims(image, 0)
return image
#staticmethod
def get_loss(gt, predictions):
return tf.losses.mse(gt, predictions)
def on_epoch_end(self, epoch, logs={}):
self.seen += 1
if self.seen % 1 == 0:
items = next(self.generator)
images_to_display = self.feed_inputs_display
images_per_cell_count = int(math.sqrt(images_to_display))
# in case of regular model training using generator, an array is passed
if not isinstance(items, dict):
frames_arr, ocean_scores = items
# Take just 1st sample from batch
batch_size = frames_arr.shape[0]
if images_to_display > batch_size:
images_to_display = batch_size
frames_arr = frames_arr[0:images_to_display]
ocean_scores = ocean_scores[0:images_to_display]
y_pred = self.model.predict(frames_arr)
# in case of adversarial training, a dictionary is passed
else:
batch_size = items['feature'].shape[0]
if images_to_display > batch_size:
images_to_display = batch_size
# items['feature'] = items['feature'][0:images_to_display]
# landmarks = items['label'][0:images_to_display]
frames_arr = items['feature']
landmarks = items['label']
y_pred = self.model.predict(items)
figure = plt.figure(figsize=(15, 15))
for i in range(images_to_display):
image_current = frames_arr[i]
y_prediction_current = y_pred[i]
y_gt_current = ocean_scores[i]
lbl_prediction = 'plot/img/{}'.format(i)
ax = plt.subplot(images_per_cell_count, images_per_cell_count, i + 1, title=lbl_prediction)
ax.imshow(image_current)
ax.axis('off')
with self.writer.as_default():
tf.summary.image("Training Data", CallbackTensorboardImageOutput.plot_to_image(figure), step=self.seen)
Below is the definition of the network architecture and the call of fit_generator function:
data_loader = dataloader.DataLoader('dataset.pkl', '/home/niko/data/PsychoFlickr', 224, 64)
train_gen, test_gen = data_loader.get_training_and_test_generators()
pre_trained_model = tf.keras.applications.VGG19(input_shape=(data_loader.image_size, data_loader.image_size, dataloader.IMAGE_CHANNEL), weights='imagenet', include_top=False)
x = pre_trained_model.output
x = tf.keras.layers.Flatten()(x)
# Add a fully connected layer with 256 hidden units and ReLU activation
x = tf.keras.layers.Dense(256)(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Activation('relu')(x)
x = tf.keras.layers.Dropout(rate=0.5)(x)
x = tf.keras.layers.Dense(256)(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Activation('relu')(x)
x = tf.keras.layers.Dropout(rate=0.5)(x)
x = tf.keras.layers.Dense(5, name='regresion_output')(x)
x = tf.keras.layers.Activation('linear')(x)
model = tf.keras.Model(pre_trained_model.input, x)
print(model.summary())
log_dir = "logs/{}".format(model_name)
model_filename = "saved-models/{}.h5".format(model_name)
cb_tensorboard = TensorBoard(log_dir=log_dir)
callback_save_images = dataloader.CallbackTensorboardImageOutput(model, test_gen, log_dir)
checkpoint = ModelCheckpoint(model_filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
lr = 1e-3
opt = tf.optimizers.Adam(lr=lr)
model.compile(loss=loss_mse, optimizer=opt, metrics=[loss_mse])
history = model.fit_generator(
train_gen,
validation_data=test_gen,
steps_per_epoch=data_loader.steps_per_epoch,
epochs=20,
validation_steps=data_loader.validation_steps,
verbose=2,
use_multiprocessing=True,
callbacks=[checkpoint, callback_save_images, cb_tensorboard]
)
When I tried to run the same procedure with small sample data (200 records), everything seemed to work fine. On the dataset of 60000 records, however, after more than 12 hours the training of 1st epoch hasn't completed.
The training is performed on NVIDIA RTX2080Ti.
I would be thankful if anyone suggested what has to be modified or in general configured in order to train the network on reasonable time.