Tensorflow Probability is stuck with negative binomial distribution on fit, no error - tensorflow

I have this simple bayesian neural network that gets stuck on .fit(x, y)
def get_model(input_shape, loss, optimizer, metrics, kl_weight, output_shape):
inputs = Input(shape=(input_shape))
x = BatchNormalization()(inputs)
x = tfpl.DenseVariational(units=128, activation='tanh', make_posterior_fn=get_posterior, make_prior_fn=get_prior, kl_weight=kl_weight)(x)
count = Dense(1)(x)
logits = Dense(output_shape, activation = 'sigmoid')(x)
neg_binom = tfp.layers.DistributionLambda(
lambda t: tfd.NegativeBinomial(total_count=t[..., 0:1], logits = t[..., 1:]))
cat = Concatenate(axis=-1)([count, logits])
outputs = neg_binom(cat)
model = Model(inputs, outputs)
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
return model
I do not get an error, it compiles and when I call model.fit(x,y) I just get:
Epoch 1/500
and it's stuck here forever (about 20 minutes I waited for the longest).
When I use a Poisson Layer, which I did before it starts fitting instantly, an epoch runs about 1s.
What could be the cause of this?
Many thanks for your insights and tips of things to try and debug this behaviour.

Related

Completely different results using Tensorflow and Pytorch for MobilenetV3 Small

I am using transfer learning from MobileNetV3 Small to predict 5 different points on an image. I am doing this as a regression task.
For both models:
Setting the last 50 layers trainable and adding the same fully connected layers to the end.
Learning rate 3e-2
Batch size 32
Adam optimizer with the same betas
100 epochs
The inputs consist of RGB unscaled images
Pytorch
Model
def _init_weights(m):
if type(m) == nn.Linear:
nn.init.xavier_uniform_(m.weight)
m.bias.data.fill_(0.01)
def get_mob_v3_small():
model = torchvision.models.mobilenet_v3_small(pretrained=True)
children_list = get_children(model)
for c in children_list[:-50]:
for p in c.parameters():
p.requires_grad = False
return model
class TransferMobileNetV3_v2(nn.Module):
def __init__(self,
num_keypoints: int = 5):
super(TransferMobileNetV3_v2, self).__init__()
self.classifier_neurons = num_keypoints*2
self.base_model = get_mob_v3_small()
self.base_model.classifier = nn.Sequential(
nn.Linear(in_features=1024, out_features=1024),
nn.ReLU(),
nn.Linear(in_features=1024, out_features=512),
nn.ReLU(),
nn.Linear(in_features=512, out_features=self.classifier_neurons)
)
self.base_model.apply(_init_weights)
def forward(self, x):
out = self.base_model(x)
return out
Training Script
def train(net, trainloader, testloader, train_loss_fn, optimizer, scaler, args):
len_dataloader = len(trainloader)
for epoch in range(1, args.epochs+1):
net.train()
for batch_idx, sample in enumerate(trainloader):
inputs, labels = sample
inputs, labels = inputs.to(args.device), labels.to(args.device)
optimizer.zero_grad()
with torch.cuda.amp.autocast(args.use_amp):
prediction = net(inputs)
loss = train_loss_fn(prediction, labels)
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
def main():
args = make_args_parser()
args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
seed = args.seed
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(seed)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=3e-2,
betas=(0.9, 0.999))
scaler = torch.cuda.amp.GradScaler(enabled=args.use_amp)
train(net, train_loader, test_loader, loss_fn, optimizer, scaler, args)
Tensorflow
Model
base_model = tf.keras.applications.MobileNetV3Small(weights='imagenet',
input_shape=(224,224,3))
x_in = base_model.layers[-6].output
x = Dense(units=1024, activation="relu")(x_in)
x = Dense(units=512, activation="relu")(x)
x = Dense(units=10, activation="linear")(x)
model = Model(inputs=base_model.input, outputs=x)
for layer in model.layers[:-50]:
layer.trainable=False
Training Script
model.compile(loss = "mse",
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-2))
history = model.fit(input_numpy, output_numpy,
verbose=1,
batch_size=32, epochs=100,validation_split = 0.2)
Results
The PyTorch model predicts one single point around the center for all 5 different points.
The Tensorflow model predicts the points quite well and are quite accurate.
The loss in the Pytorch model is much higher than the Tensorflow model.
Please do let me know what is going wrong as I am trying my best to shift to PyTorch for this work and I need this model to give me similar/identical results. Please do let me know what is going wrong as I am trying my best to shift to PyTorch for this work and I need this model to give me similar/identical results.
Note: I also noticed that the MobileNetV3 Small model seems to be different in PyTorch and different in Tensorflow. I do not know if am interpreting it wrong, but I'm putting it here just in case.

CNN with imbalanced data stuck with 70% testing accuracy

I'm working on image classification task for diabetic retinopathy with fundus image data. There are 5 classes. The data distribution is 1805 images (class 1), 370 images (class 2), 999 images (class 3), 193 images (class 4), 295 images (class 5).
Here are the steps that I have tried to run:
Preprocessing (resized 224 * 224)
The divide of train and test data is 85% : 15%
x_train, xtest, y_train, ytest = train_test_split(
x_train, y_train,
test_size = 0.15,
random_state=SEED,
stratify = y_train
)
Data agumentation
ImageDataGenerator(
zoom_range=0.15,
fill_mode='constant',
cval=0.,
horizontal_flip=True,
vertical_flip=True,
)
Training with the ResNet-50 model and cross-validation
def getResNet():
modelres = ResNet50(weights=None, include_top=False, input_shape= (IMAGE_HEIGHT,IMAGE_HEIGHT, 3))
x = modelres.output
x = GlobalAveragePooling2D()(x)
x = Dense(5, activation= 'softmax')(x)
model = Model(inputs = modelres.input, outputs = x)
return model
num_folds = 5
skf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=2021)
cvscores = []
fold = 1
for train, val in skf.split(x_train, y_train.argmax(1)):
print('Fold: ', fold)
Xtrain = x_train[train]
Xval = x_train[val]
Ytrain = y_train[train]
Yval = y_train[val]
data_generator = create_datagen().flow(Xtrain, Ytrain, batch_size=32, seed=2021)
model = getResNet()
model.compile(loss='categorical_crossentropy',
optimizer=Adam(lr=0.0001),
metrics=['accuracy'])
with tf.compat.v1.device('/device:GPU:0'):
model_train = model.fit(data_generator,
validation_data=(Xval, Yval),
epochs=30, batch_size = 32, verbose=1)
model_name = 'cnn_keras_aug_Fold_'+str(fold)+'.h5'
model.save(model_name)
scores = model.evaluate(xtest, ytest, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
cvscores.append(scores[1] * 100)
fold = fold +1
The maximum results I got from this method were training accuracy of 81.2%, validation accuracy of 72.2%, and test accuracy of 70.73%.
Can anyone give me an idea to improve the model so that I can get the test accuracy above 90% as possible?
Later, I will use this model as a pre-trained model to train diabetic retinopathy data as well but from other sources.
BTW, I've tried replacing my preprocessing with this method:
def preprocessing(path):
image = cv2.imread(path)
image = crop_image_from_gray(image)
green = image[:,:,1]
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
cl = clahe.apply(green)
image[:,:,0] = image[:,:,0]
image[:,:,2] = image[:,:,2]
image[:,:,1] = cl
image = cv2.resize(image, (224,224))
return image
I've also tried to replace my model with VGG16, EfficientNetB0. However, none of that had much effect on my results. I'm still stucked with about 70% accuracy.
Please help me come up with ideas to improve my modeling results. I hope.
Your training accuracy is 81.2%. It is generally impossible to have testing accuracy higher that training accuracy, i.e. with current setup you will not achieve 90%.
However, your validation (and also testing) accuracy is about 70-72%. I can suggest that on your small dataset your model is overfitting. So if you add model regularization (e.g. dropout), it is possible that the gap between your training and your validation (and test) will decrease. This way you can improve your validation score.
To further increase the score, you need to check your data manually and try to understand which classes contribute the most to the errors and figure out how those errors can be reduced (e.g. updating your preprocessing pipeline).

Good training accuracy but bad evaluation

I trained a DNN model, get good training accuracy but bad evaluation accuracy.
def DNN_Metrix(shape, dropout):
model = tf.keras.Sequential()
print(shape)
model.add(tf.keras.layers.Flatten(input_shape=shape))
model.add(tf.keras.layers.Dense(10,activation=tf.nn.relu))
for i in range(0,2):
model.add(tf.keras.layers.Dense(10,activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(8,activation=tf.nn.tanh))
model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid))
model.compile(loss='binary_crossentropy',
optimizer=tf.keras.optimizers.Adam(),
metrics=['accuracy'])
return model
model_dnn = DNN_Metrix(shape=(28,20,1), dropout=0.1)
model_dnn.fit(
train_dataset,
steps_per_epoch=1000,
epochs=10,
verbose=2
)
Here is my training process, and result:
Epoch 10/10
- 55s - loss: 0.4763 - acc: 0.7807
But when I evaluation with test dataset, I got:
result = model_dnn.evaluate(np.array(X_test), np.array(y_test), batch_size=len(X_test))
loss, accuracy = [0.9485417604446411, 0.3649936616420746]
it's a binary classification, Positive label : Negetive label is about
0.37 : 0.63
I don't think it was result from overfiting, I have 700k instances when training, with shape of 28 * 20, and my DNN model is simple and have few parameters.
Here is my code when generating the test data and training data:
def parse_function(example_proto):
dics = {
'feature': tf.FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
'label': tf.FixedLenFeature(shape=(2), dtype=tf.float32),
'shape': tf.FixedLenFeature(shape=(2), dtype=tf.int64)
}
parsed_example = tf.parse_single_example(example_proto, dics)
parsed_example['feature'] = tf.decode_raw(parsed_example['feature'], tf.float64)
parsed_example['feature'] = tf.reshape(parsed_example['feature'], [28,20,1])
label_t = tf.cast(parsed_example['label'], tf.int32)
parsed_example['label'] = parsed_example['label'][1]
return parsed_example['feature'], parsed_example['label']
def read_tfrecord(train_tfrecord):
dataset = tf.data.TFRecordDataset(train_tfrecord)
dataset = dataset.map(parse_function)
dataset = dataset.shuffle(buffer_size=10000)
dataset = dataset.repeat(100)
dataset = dataset.batch(670)
return dataset
def read_tfrecord_test(test_tfrecord):
dataset = tf.data.TFRecordDataset(test_tfrecord)
dataset = dataset.map(parse_function)
return dataset
# tf_record_target = 'train_csv_temp_norm_vx.tfrecords'
train_files = 'train_baseline.tfrecords'
test_files = 'test_baseline.tfrecords'
train_dataset = read_tfrecord(train_files)
test_dataset = read_tfrecord_test(test_files)
it_test_dts = test_dataset.make_one_shot_iterator()
it_train_dts = train_dataset.make_one_shot_iterator()
X_test = []
y_test = []
el = it_test_dts.get_next()
count = 1
with tf.Session() as sess:
while True:
try:
x_t, y_t = sess.run(el)
X_test.append(x_t)
y_test.append(y_t)
except tf.errors.OutOfRangeError:
break
Judging from the fact that your data distribution in your test set is [37%-63%] and your final accuracy is 0.365, I would first check the labels predicted on the test set.
Most probably, all your predictions are of class 0, provided that class 0 amounts for 37% of your dataset. In this case, it means that your neural network is not able to learn anything on the training set, and you have a massive scenario of overfitting.
I recommend that you always use a validation set, so that at the end of each epoch, you would check to see if your neural network has learnt anything. In such a situation(like yours), you would see very fast the overfitting issue.
Training accuracy doesn't mean much. A NN can fit any random set of inputs and outputs, even if they're unrelated. That's why you want to use validation data.
After training look at your loss curves, this will give you a better idea of where things are going wrong.
NN's default to just guessing the most popular class it's seen in training data for classification problems. This is usually what happens when you haven't setup your experiment correctly.
And since your dealing with binary classification you might want to look at things like StratifiedKFold which will provided you folds of train/test data were the sample % is persevered.

Backpropagation Using Tensorflow and Numpy MSE not Dropping

I am trying to create a Backpropagation but I do not want to use the GradientDescentOptimizer from TF. I just wanted to update my own weights and biases. The problem is that the Mean Square Error or Cost is not approaching to zero. It just stays at some 0.2xxx. Is it because of my inputs which are 520x1600 (yes, each input has 1600 units and yes, there are 520 of them) or my number of neurons in the Hidden Layer is problematic? I have tried implementing this using the GradientDescentOptimizer and minimize(cost) which is working fine (Cost reduces near to zero as training goes on) but maybe I have an issue in my code of updating the weights and biases.
Here's my code:
import tensorflow as tf
import numpy as np
from BPInputs40 import pattern, desired;
#get the inputs and desired outputs, 520 inputs, each has 1600 units
train_in = pattern
train_out = desired
learning_rate=tf.constant(0.5)
num_input_neurons = len(train_in[0])
num_output_neurons = len(train_out[0])
num_hidden_neurons = 20
#weight matrix initialization with random values
w_h = tf.Variable(tf.random_normal([num_input_neurons, num_hidden_neurons]), dtype=tf.float32)
w_o = tf.Variable(tf.random_normal([num_hidden_neurons, num_output_neurons]), dtype=tf.float32)
b_h = tf.Variable(tf.random_normal([1, num_hidden_neurons]), dtype=tf.float32)
b_o = tf.Variable(tf.random_normal([1, num_output_neurons]), dtype=tf.float32)
# Model input and output
x = tf.placeholder("float")
y = tf.placeholder("float")
def sigmoid(v):
return tf.div(tf.constant(1.0),tf.add(tf.constant(1.0),tf.exp(tf.negative(v*0.001))))
def derivative(v):
return tf.multiply(sigmoid(v), tf.subtract(tf.constant(1.0), sigmoid(v)))
output_h = tf.sigmoid(tf.add(tf.matmul(x,w_h),b_h))
output_o = tf.sigmoid(tf.add(tf.matmul(output_h,w_o),b_o))
error = tf.subtract(output_o,y) #(1x35)
mse = tf.reduce_mean(tf.square(error))
delta_o=tf.multiply(error,derivative(output_o))
delta_b_o=delta_o
delta_w_o=tf.matmul(tf.transpose(output_h), delta_o)
delta_backprop=tf.matmul(delta_o,tf.transpose(w_o))
delta_h=tf.multiply(delta_backprop,derivative(output_h))
delta_b_h=delta_h
delta_w_h=tf.matmul(tf.transpose(x),delta_h)
#updating the weights
train = [
tf.assign(w_h, tf.subtract(w_h, tf.multiply(learning_rate, delta_w_h))),
tf.assign(b_h, tf.subtract(b_h, tf.multiply(learning_rate, tf.reduce_mean(delta_b_h, 0)))),
tf.assign(w_o, tf.subtract(w_o, tf.multiply(learning_rate, delta_w_o))),
tf.assign(b_o, tf.subtract(b_o, tf.multiply(learning_rate, tf.reduce_mean(delta_b_o, 0))))
]
sess = tf.Session()
sess.run(tf.global_variables_initializer())
err,target=1, 0.005
epoch, max_epochs = 0, 2000000
while epoch < max_epochs:
epoch += 1
err, _ = sess.run([mse, train],{x:train_in,y:train_out})
if (epoch%1000 == 0):
print('Epoch:', epoch, '\nMSE:', err)
answer = tf.equal(tf.floor(output_o + 0.5), y)
accuracy = tf.reduce_mean(tf.cast(answer, "float"))
print(sess.run([output_o], feed_dict={x: train_in, y: train_out}));
print("Accuracy: ", (1-err) * 100 , "%");
Update: I got it working now. The MSE dropped to almost zero once I increased the number of neurons in the hidden layer. I tried using 5200 and 6400 neurons for the hidden layer and with just 5000 epochs, the accuracy was almost 99%. Also, the largest learning rate I used is 0.1 because when above that, the MSE will not be close to zero.
I'm not an expert in this field, but it looks like your weights are updated correctly. And the fact that your MSE decreases from some higher values to 0.2xxx is the strong indicator of that. I would definitely try to run this problem with way more hidden neurons (e.g. 500)
Btw, are your inputs normalized? If not, that obviously could be the reason

Tensorflow: can not obtain same result mini-batch SGD optimizer compared to Kaldi nnet1

I am trying to build a Tensorflow example with a simple multl-layer
perceptron (MLP) functionality with one hidden layer. However, when I tested it and compared to other software e.g. Kaldi nnet1, the convergence during the training is not efficient, or cannot be comparable to Kaldi nnet1. I tried my best to make all the parameters the same (input, int target, batch size, learning rate, etc.), however, still confused where could be the reasons. Some pieces of codes are as follows:
Initialization:
self.weight = [tf.Variable(tf.truncated_normal([440, 8192],stddev=0.1))]
self.bias = [tf.Variable(tf.constant(0.01, shape=8192))]
self.weight.append( tf.Variable(tf.truncated_normal([8192, 8],stddev=0.1)) )
self.bias.append( tf.Variable(tf.constant(0.01, shape=8)) )
self.act = [tf.nn.sigmoid( tf.matmul(self.input, self.weight[0]) + self.bias[0] )]
self.nn_out = tf.matmul(self.act, self.weight[1]) + self.bias[1])
self.nn_softmax = tf.nn.softmax(self.nn_out)
self.nn_tgt = tf.placeholder("int64", shape=[None,])
self.cost_mean = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(self.nn_out, self.nn_tgt))
self.train_step = tf.train.GradientDescentOptimizer(self.learn_rate).minimize(self.cost_mean)
# saver
self.saver = tf.train.Saver()
self.sess = tf.Session()
self.sess.run(tf.initialize_all_variables())
Training:
for epoch in xrange(20):
feats_tr, tgts_tr = shuffle(feats_tr, tgts_tr, random_state=777)
# restore the exisiting model
ckpt = tf.train.get_checkpoint_state(ckpt_dir)
if ckpt and ckpt.model_checkpoint_path:
self.load(ckpt.model_checkpoint_path)
# mini-batch
tr_loss = []
for idx_begin in range(0,len(feats_tr), 512):
idx_end = idx_begin + batch_size
batch_feats, batch_tgts = feats_tr[idx_begin:idx_end],tgts_tr[idx_begin:idx_end]
_, loss_val = self.sess.run([self.train_step, self.cost_mean], feed_dict = {self.nn_in: batch_feats,
self.nn_tgt: batch_tgts,self.learn_rate: learn_rate})
tr_loss.append(loss_val)
# cross-validation
cv_loss = []
for idx_begin in range(0,len(feats_cv), 512):
idx_end = idx_begin + batch_size
batch_feats, batch_tgts = feats[idx_begin:idx_end],tgts[idx_begin:idx_end]
loss_all.append(self.sess.run(self.cost_mean,
feed_dict = { self.nn_in: batch_feats,
self.nn_tgt: batch_tgts}))
print( "Avg Loss for Training: "+str(np.mean(tr_loss)) + \
" Avg Loss for Validation: "+str(np.mean(cv_loss)) )
# save model per epoch if np.mean(cv_loss) less than previous
if (epoch+1)%1==0:
if loss_new < loss:
loss = loss_new
print( "Model accepted in epoch %d" %(epoch+1) )
# save model to ckpt_dir with mdl_nam
self.saver.save(self.sess, mdl_nam, global_step=epoch+1)
else:
print( "Model rejected in epoch %d" %(epoch+1) )
and I generated a simple annealing learning rate control as: if the average of cross-validation loss is not improved by a certain threshold, then halving the 'learn_late' with initial 0.008.
I checked all the parameters when compared to Kaldi nnet1, and the only difference now is the initialization parameters of weights and biases. I am not sure whether initialization will affect too much. However, the convergence in terms of 'cv_loss' during training in Tensorflow (Avg. CV Loss 1.99) is not good as Kaldi nnet1 (Avg. CV Loss 0.95). Can someone help to point out where I did something wrong or I missed something?
Many thanks in advance !!!
At each epoch, you call self.load(ckpt.model_checkpoint_path) which seems to load previously saved weights.
Your model cannot learn if it is reset to the initial weights at each epoch.