I'm new to deep learning and I'm trying to create a YOLO v1 detection model from scratch with Keras and TensorFlow.
I need to detect only one class, so I've made the network output a 7x7x5 tensor (a 7 by 7 grid with one bounding box per cell: [P(object), x, y, w, h]).
I've created the dataset, resized all the images to 448x448, and assigned the bounding boxes, normalized as in the paper, to the corresponding cells.
My custom loss doesn't use the IoU to pick the best box, since there's only one box per cell to choose from.
Furthermore, it considers only the coordinate loss and the object/no-object losses, as there's only one class.
Here's how I implemented it with Keras:
def custom_loss_tipregovai(y_true, y_pred):
    mse = tf.keras.losses.MeanSquaredError(reduction="sum")
    predictions = tf.reshape(y_pred, (-1, 7, 7, 5))
    exists_box = tf.expand_dims(y_true[..., 0], 3)

    # BOX LOSS
    pred_box = exists_box * predictions[..., 1:5]
    target_box = exists_box * y_true[..., 1:5]
    epsilon = tf.fill(pred_box[..., 2:4].shape, 1e-6)
    wh_pred = tf.math.sign(pred_box[..., 3:5]) * tf.math.sqrt(tf.math.abs(pred_box[..., 3:5] + epsilon))
    wh_targ = tf.math.sqrt(target_box[..., 3:5] + epsilon)
    xy_pred = pred_box[..., 1:3]
    xy_true = target_box[..., 1:3]
    final_pred_box = tf.concat([xy_pred, wh_pred], axis=3)
    final_true_box = tf.concat([xy_true, wh_targ], axis=3)
    box_loss = mse(tf.reshape(final_pred_box, (-1, final_pred_box.shape[-1])),
                   tf.reshape(final_true_box, (-1, final_true_box.shape[-1])))

    # OBJECT LOSS
    pred_obj = predictions[..., 0:1]
    true_obj = y_true[..., 0:1]
    object_loss = mse(tf.reshape(exists_box * pred_obj, (-1,)), tf.reshape(exists_box * true_obj, (-1,)))

    # NO OBJECT LOSS
    non_exists_box = 1 - exists_box
    no_object_loss = mse(tf.reshape(non_exists_box * pred_obj, (-1,)), tf.reshape(non_exists_box * true_obj, (-1,)))

    total_loss = 5 * box_loss + object_loss + 0.5 * no_object_loss
    return total_loss
I really don't understand why :(
Any help is appreciated, thank you!!
I am trying to convert TensorFlow object localization code into PyTorch. In the original code, the author uses model.compile / model.fit to train the model, so I don't see how the classification loss for the MNIST digits and the box-regression loss are combined. Still, I'm trying to implement my own training loop in PyTorch.
The goal here is, after some preprocessing, to paste the MNIST digits at random positions into a black square image and then to classify and localize (with a bounding box) the digit.
I set two losses, nn.CrossEntropyLoss and nn.MSELoss, and I do (loss_1+loss_2).backward() to compute the gradients. I know it's the right way to compute gradients with two losses from here and here.
But still, my loss doesn't decrease, whereas it collapses almost immediately with the TensorFlow code. I checked the model with torchinfo.summary and it seems to behave the same as the TensorFlow implementation.
EDIT:
I looked at the predicted labels of my model and they don't seem to change at all.
This line of code, label_preds, bbox_coords_preds = model(digits), always returns the same values:
label_preds[0] = tensor([[0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156]], device='cuda:0', grad_fn=<SliceBackward0>)
Here are my questions:
Is my custom network set up correctly?
Are my losses set up correctly?
Why don't my label predictions change?
Does my training loop work the same way as the .compile and .fit TensorFlow methods?
Thanks a lot!
PYTORCH CODE
class ConvNetwork(nn.Module):
    def __init__(self):
        super(ConvNetwork, self).__init__()
        self.conv2d_1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3)
        self.conv2d_2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3)
        self.conv2d_3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
        self.avgPooling2D = nn.AvgPool2d((2, 2))
        self.dense_1 = nn.Linear(in_features=3136, out_features=128)
        self.dense_classifier = nn.Linear(in_features=128, out_features=10)
        self.softmax = nn.Softmax(dim=0)
        self.dense_regression = nn.Linear(in_features=128, out_features=4)

    def forward(self, input):
        x = self.avgPooling2D(F.relu(self.conv2d_1(input)))
        x = self.avgPooling2D(F.relu(self.conv2d_2(x)))
        x = self.avgPooling2D(F.relu(self.conv2d_3(x)))
        x = nn.Flatten()(x)
        x = F.relu(self.dense_1(x))
        output_classifier = self.softmax(self.dense_classifier(x))
        output_regression = self.dense_regression(x)
        return [output_classifier, output_regression]
######################################################
learning_rate = 0.1
EPOCHS = 1
BATCH_SIZE = 64
model = ConvNetwork()
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
classification_loss = nn.CrossEntropyLoss()
regression_loss = nn.MSELoss()
######################################################
begin_time = time.time()
for epoch in range(EPOCHS):
    tot_loss = 0
    train_start = time.time()
    training_losses = []
    print("-"*20)
    print(" "*5 + f"EPOCH {epoch+1}/{EPOCHS}")
    print("-"*20)
    model.train()
    for batch, (digits, labels, bbox_coords) in enumerate(training_dataset):
        digits, labels, bbox_coords = digits.to(device), labels.to(device), bbox_coords.to(device)
        optimizer.zero_grad()
        [label_preds, bbox_coords_preds] = model(digits)
        class_loss = classification_loss(label_preds, labels)
        box_loss = regression_loss(bbox_coords_preds, bbox_coords)
        training_loss = class_loss + box_loss
        training_loss.backward()
        optimizer.step()
        ######### print part #######################
        training_losses.append(training_loss.item())
        if batch+1 <= len_training_ds//BATCH_SIZE:
            current_training_sample = (batch+1)*BATCH_SIZE
        else:
            current_training_sample = (batch)*BATCH_SIZE + len_training_ds%BATCH_SIZE
        if (batch+1) == 1 or (batch+1)%100 == 0 or (batch+1) == len_training_ds//BATCH_SIZE + 1:
            print(f"Elapsed time : {(time.time()-train_start)/60:.3f}",
                  f" --- Digit : {current_training_sample}/{len_training_ds}",
                  f" : loss = {training_loss:.5f}")
        if batch+1 == (len_training_ds//BATCH_SIZE)+1:
            print(f"Total elapsed time for training : {(time.time()-begin_time)/60:.3f}")
ORIGINAL TENSORFLOW CODE
def feature_extractor(inputs):
    x = tf.keras.layers.Conv2D(16, activation='relu', kernel_size=3, input_shape=(75, 75, 1))(inputs)
    x = tf.keras.layers.AveragePooling2D((2, 2))(x)
    x = tf.keras.layers.Conv2D(32, kernel_size=3, activation='relu')(x)
    x = tf.keras.layers.AveragePooling2D((2, 2))(x)
    x = tf.keras.layers.Conv2D(64, kernel_size=3, activation='relu')(x)
    x = tf.keras.layers.AveragePooling2D((2, 2))(x)
    return x

def dense_layers(inputs):
    x = tf.keras.layers.Flatten()(inputs)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    return x

def classifier(inputs):
    classification_output = tf.keras.layers.Dense(10, activation='softmax', name='classification')(inputs)
    return classification_output

def bounding_box_regression(inputs):
    bounding_box_regression_output = tf.keras.layers.Dense(units='4', name='bounding_box')(inputs)
    return bounding_box_regression_output

def final_model(inputs):
    feature_cnn = feature_extractor(inputs)
    dense_output = dense_layers(feature_cnn)
    classification_output = classifier(dense_output)
    bounding_box_output = bounding_box_regression(dense_output)
    model = tf.keras.Model(inputs=inputs, outputs=[classification_output, bounding_box_output])
    return model

def define_and_compile_model(inputs):
    model = final_model(inputs)
    model.compile(optimizer='adam',
                  loss={'classification': 'categorical_crossentropy',
                        'bounding_box': 'mse'},
                  metrics={'classification': 'accuracy',
                           'bounding_box': 'mse'})
    return model
inputs = tf.keras.layers.Input(shape=(75, 75, 1,))
model = define_and_compile_model(inputs)
EPOCHS = 10 # 45
steps_per_epoch = 60000//BATCH_SIZE # 60,000 items in this dataset
validation_steps = 1
history = model.fit(training_dataset,
                    steps_per_epoch=steps_per_epoch,
                    validation_data=validation_dataset,
                    validation_steps=validation_steps, epochs=EPOCHS)
loss, classification_loss, bounding_box_loss, classification_accuracy, bounding_box_mse = model.evaluate(validation_dataset, steps=1)
print("Validation accuracy: ", classification_accuracy)
I'm answering my own question about this bug.
What I found:
I figured out that I was using a Softmax layer in my model while also using nn.CrossEntropyLoss() as the loss.
What this problem was causing:
This loss already applies a softmax (doc).
Applying a softmax twice must add some noise to the loss and prevent convergence.
What I did:
One should leave a plain linear layer as the output of the classification head and feed its raw logits to nn.CrossEntropyLoss, as sketched below.
Another way is to use NLLLoss (doc) instead and keep a log-softmax layer in the model class (NLLLoss expects log-probabilities).
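A minimal sketch of the first option, assuming the ConvNetwork class and imports from the question above (the subclass name is just for illustration):

class ConvNetworkLogits(ConvNetwork):
    """Same architecture, but the classification head returns raw logits."""
    def forward(self, input):
        x = self.avgPooling2D(F.relu(self.conv2d_1(input)))
        x = self.avgPooling2D(F.relu(self.conv2d_2(x)))
        x = self.avgPooling2D(F.relu(self.conv2d_3(x)))
        x = nn.Flatten()(x)
        x = F.relu(self.dense_1(x))
        output_classifier = self.dense_classifier(x)   # raw logits, no softmax here
        output_regression = self.dense_regression(x)
        return [output_classifier, output_regression]

# nn.CrossEntropyLoss applies log-softmax internally, so it takes the logits directly;
# apply softmax/argmax to the logits only at inference time to get probabilities/labels.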
Also:
I don't fully understand how the .compile() and .fit() TensorFlow methods work, but I think they optimize the training one way or another (I'm thinking of the learning rate), since I had to decrease the learning rate to 0.001 in PyTorch (which happens to be the Keras Adam default) to "unstick" the loss and make it decrease.
I'm having a terrible problem doing my work with Keras; here it is:
encoder_path = "my own encoder path"
if (os.path.exists(encoder_path)):
    encoder = tf.keras.models.load_model(encoder_path, compile=False)
    # encoder.summary()
    print("Encoder model exist & loaded ...")
else:
    print("There is no file! Check " + encoder_path + ' ...')
## Making latent vector layer code ##
loc_z_mean = len(encoder.layers) - 11
loc_z_log_var = len(encoder.layers) - 10
z_mean = encoder.layers[loc_z_mean]
z_log_var = encoder.layers[loc_z_log_var]
print(z_mean.get_weights())
z_mean_weights = z_mean.get_weights()[0]
z_mean_bias = z_mean.get_weights()[1]
print(np.shape(z_mean_weights))
print(np.shape(z_mean_bias))
z_log_var_weights = z_log_var.get_weights()[0]
z_log_var_bias = z_log_var.get_weights()[1]
z_weights = z_mean_weights + np.exp(0.5 * z_log_var_weights)
z_bias = z_mean_bias + np.exp(0.5 * z_log_var_bias)
# z_weights_init = z_weights.numpy()
# z_bias_init = z_bias.numpy()
z = tf.keras.layers.Dense(16, name="latent_z").set_weights([z_weights, z_bias])
# z = tf.keras.layers.Dense(16, kernel_initializer=z_weights_init, bias_initializer=z_bias_init, name="latent_z")
# z.trainable = False # Freeze layer
print(z)
I'm trying to build a new layer whose weights come from the former model, but it doesn't work when I try
z = tf.keras.layers.Dense(16, name="latent_z").set_weights([z_weights, z_bias])
which fails with this error:
ValueError: You called `set_weights(weights)` on layer "latent_z" with a weight list of length 2, but the layer was expecting 0 weights. Provided weights: [array([[0.85919297, 0.39330506, 1.4273021 , 0.780...
I set the shapes of z_weights and z_bias to (16, 16) and (16,) respectively, since those are exactly the same as the loaded weights, but it still didn't work.
Is there any solution?
Thanks in advance.
You need to invoke build() on the layer with the required input_shape before you can set the weights:
z = tf.keras.layers.Dense(16, name="latent_z")
# set shape
z.build(input_shape=(100,)
# using random weights and bias for now
bias = np.random.randn(16)
weight = np.random.randn(100,16)
z.set_weights([weight, bias])
Without invoking build(), the layer's weights are not defined, since the shape of the weights depends on the shape of the input. In the sample code above, the input shape was 100, and hence the weight shape is [100, 16].
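Applied to the question's latent_z layer, the same idea would look roughly like this (a sketch that assumes the incoming feature dimension is 16, matching the (16, 16) kernel printed in the question):

z = tf.keras.layers.Dense(16, name="latent_z")
z.build(input_shape=(None, 16))     # last dimension must match the (16, 16) kernel
z.set_weights([z_weights, z_bias])  # note: set_weights() returns None, so don't assign its result to z
# Calling the layer once on a correctly-shaped tensor also builds it:
# _ = z(tf.zeros((1, 16)))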
I am new to deep learning with PyTorch. I am more experienced with TensorFlow, so I should say I am not new to deep learning itself.
Currently, I am working on a simple ANN classifier. There are only 2 classes, so quite naturally I am using a Softmax + BCELoss combination.
The dataset is like this:
shape of X_train (891, 7)
Shape of Y_train (891,)
Shape of x_test (418, 7)
I transformed the X_train and others to torch tensors as train_data and so on. The next step is:
train_ds = TensorDataset(train_data, train_label)
# Define data loader
batch_size = 32
train_dl = DataLoader(train_ds, batch_size, shuffle=True)
I made the model class like this:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(7, 32)
        self.bc1 = nn.BatchNorm1d(32)
        self.fc2 = nn.Linear(32, 64)
        self.bc2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, 128)
        self.bc3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, 32)
        self.bc4 = nn.BatchNorm1d(32)
        self.fc5 = nn.Linear(32, 10)
        self.bc5 = nn.BatchNorm1d(10)
        self.fc6 = nn.Linear(10, 1)
        self.bc6 = nn.BatchNorm1d(1)
        self.drop = nn.Dropout2d(p=0.5)

    def forward(self, x):
        torch.nn.init.xavier_uniform(self.fc1.weight)
        x = self.fc1(x)
        x = self.bc1(x)
        x = F.relu(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.bc2(x)
        x = F.relu(x)
        #x = self.drop(x)
        x = self.fc3(x)
        x = self.bc3(x)
        x = F.relu(x)
        x = self.drop(x)
        x = self.fc4(x)
        x = self.bc4(x)
        x = F.relu(x)
        #x = self.drop(x)
        x = self.fc5(x)
        x = self.bc5(x)
        x = F.relu(x)
        x = self.drop(x)
        x = self.fc6(x)
        x = self.bc6(x)
        x = torch.sigmoid(x)
        return x
model = Net()
The loss function and the optimizer are defined:
loss = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
At last, the task is to run the forward pass over the epochs:
num_epochs = 1000
# Repeat for given number of epochs
for epoch in range(num_epochs):
    # Train with batches of data
    for xb, yb in train_dl:
        pred = model(xb)
        yb = torch.unsqueeze(yb, 1)
        #print(pred, yb)
        print('grad', model.fc1.weight.grad)
        l = loss(pred, yb)
        #print('loss',l)
        # 3. Compute gradients
        l.backward()
        # 4. Update parameters using gradients
        optimizer.step()
        # 5. Reset the gradients to zero
        optimizer.zero_grad()
    # Print the progress
    if (epoch+1) % 10 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, l.item()))
I can see in the output that after each iteration over the batches the gradients of the weights are non-zero, and after that zero_grad is applied.
However, the model is pretty bad. I get an F1 score of only around 50%! And the model is bad even when I ask it to predict on train_dl itself!
I am wondering what the reason is. Are the gradients of the weights non-zero but not being applied properly? Is the optimizer not optimizing the weights? Or is it something else?
Can someone please have a look?
I have already tried different loss functions and optimizers. I tried smaller datasets, bigger batches, and different hyperparameters.
Thanks! :)
First of all, you don't use a softmax activation with BCE loss unless you have 2 output nodes, which is not the case here. In PyTorch, BCE loss doesn't apply any activation function before calculating the loss, unlike CCE (nn.CrossEntropyLoss), which has a built-in softmax. So, if you want to use BCE, you have to use a sigmoid (or any function f: R -> [0, 1]) at the output layer, which you don't have.
Moreover, you should ideally call optimizer.zero_grad() for each batch if you want to do SGD (which is the default behaviour). If you don't do that, you will just be doing full-batch gradient descent, which is quite slow and gets stuck in local minima easily.
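As a sketch of what that looks like in practice (my own illustration, not code from the thread): either keep torch.sigmoid at the output and use nn.BCELoss, or return raw logits and use nn.BCEWithLogitsLoss, and call zero_grad() once per batch:

import torch
import torch.nn as nn

# Toy binary head for illustration; 7 input features as in the question's data.
model = nn.Sequential(nn.Linear(7, 32), nn.ReLU(), nn.Linear(32, 1))
criterion = nn.BCEWithLogitsLoss()   # applies the sigmoid internally, numerically stabler
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for xb, yb in train_dl:              # train_dl as defined in the question
    optimizer.zero_grad()            # reset gradients at the start of every batch
    logits = model(xb)
    l = criterion(logits, yb.unsqueeze(1).float())
    l.backward()
    optimizer.step()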
I want to implement tf.nn.sparse_softmax_cross_entropy by myself, but after some batches the loss becomes nan!
Here is my code:
logits_batch_size = tf.shape(logits)[0]
labels = tf.reshape(tgt_seq, [-1])
eps = tf.fill(tf.shape(logits), 1e-8)
logits = logits + eps
labels_1 = tf.expand_dims(labels, 1)
index = tf.expand_dims(tf.range(0, logits_batch_size), 1)
concated = tf.concat([index, labels_1], 1)
onehot_labels = tf.sparse_to_dense(concated, tf.stack([logits_batch_size, tvsize]), 1.0, 0.0)
y_log = tf.log(tf.nn.softmax(logits))
cost = tf.reduce_mean(-tf.reduce_sum(tf.multiply(onehot_labels, y_log), 0))
logits is the same as the logits in tf.nn.sparse_softmax_cross_entropy, a 2-D tensor, and tgt_seq is a 2-D tensor too. My task is a sequence-to-sequence learning task.
Can anyone help me?
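For reference, a numerically safer sketch of the same cost (my own variant, reusing logits and onehot_labels from the snippet above, not something from the thread) keeps the computation in log space with tf.nn.log_softmax and reduces over the class axis, which avoids taking tf.log of a softmax value that has underflowed to zero:

# Sketch: compute log-probabilities directly instead of tf.log(tf.nn.softmax(...))
y_log = tf.nn.log_softmax(logits)
cost = tf.reduce_mean(-tf.reduce_sum(onehot_labels * y_log, axis=1))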
A recent paper (here) introduced a secondary loss function that they called center loss. It is based on the distance between the embeddings in a batch and the running average embedding for each of the respective classes. There has been some discussion in the TF Google groups (here) regarding how such embedding centers can be computed and updated. I've put together some code to generate class-average embeddings in my answer below.
Is this the best way to do this?
The previously posted method is too simple for cases like center loss, where the expected value of the embeddings changes over time as the model becomes more refined. This is because the previous center-finding routine averages all instances since the start and therefore tracks changes in the expected value very slowly. Instead, a moving-window average is preferred. An exponential moving-window variant is as follows:
def get_embed_centers(embed_batch, label_batch):
    ''' Exponential moving window average. Increase decay for longer windows [0.0 1.0]
    '''
    decay = 0.95
    with tf.variable_scope('embed', reuse=True):
        embed_ctrs = tf.get_variable("ctrs")
        label_batch = tf.reshape(label_batch, [-1])
        old_embed_ctrs_batch = tf.gather(embed_ctrs, label_batch)
        dif = (1 - decay) * (old_embed_ctrs_batch - embed_batch)
        embed_ctrs = tf.scatter_sub(embed_ctrs, label_batch, dif)
        embed_ctrs_batch = tf.gather(embed_ctrs, label_batch)
    return embed_ctrs_batch


with tf.Session() as sess:
    with tf.variable_scope('embed'):
        embed_ctrs = tf.get_variable("ctrs", [nclass, ndims], dtype=tf.float32,
                                     initializer=tf.constant_initializer(0), trainable=False)
    label_batch_ph = tf.placeholder(tf.int32)
    embed_batch_ph = tf.placeholder(tf.float32)
    embed_ctrs_batch = get_embed_centers(embed_batch_ph, label_batch_ph)
    sess.run(tf.initialize_all_variables())
    tf.get_default_graph().finalize()
The get_new_centers() routine below takes in labelled embeddings and updates shared variables center/sums and center/cts. These variables are then used to calculate and return the embedding centers using the updated values.
The loop just exercises get_new_centers() and shows that it converges to the expected average embeddings for all classes over time.
Note that the alpha term used in the original paper isn't included here but should be straightforward to add if needed.
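If I recall the paper correctly, alpha simply scales the per-step center update, roughly c_j <- c_j - alpha * delta_c_j, where delta_c_j averages (c_j - x_i) over the batch samples of class j; so adding it would amount to multiplying the per-batch correction by alpha before it is applied to the stored centers (this note is my own recollection, not part of the original answer).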
ndims = 2
nclass = 4
nbatch = 100
with tf.variable_scope('center'):
    center_sums = tf.get_variable("sums", [nclass, ndims], dtype=tf.float32,
                                  initializer=tf.constant_initializer(0), trainable=False)
    center_cts = tf.get_variable("cts", [nclass], dtype=tf.float32,
                                 initializer=tf.constant_initializer(0), trainable=False)
def get_new_centers(embeddings, indices):
    '''
    Update embedding for selected class indices and return the new average embeddings.
    Only the newly-updated average embeddings are returned corresponding to
    the indices (including duplicates).
    '''
    with tf.variable_scope('center', reuse=True):
        center_sums = tf.get_variable("sums")
        center_cts = tf.get_variable("cts")

    # update embedding sums, cts
    if embeddings is not None:
        ones = tf.ones_like(indices, tf.float32)
        center_sums = tf.scatter_add(center_sums, indices, embeddings, name='sa1')
        center_cts = tf.scatter_add(center_cts, indices, ones, name='sa2')

    # return updated centers
    num = tf.gather(center_sums, indices)
    denom = tf.reshape(tf.gather(center_cts, indices), [-1, 1])
    return tf.div(num, denom)
with tf.Session() as sess:
    labels_ph = tf.placeholder(tf.int32)
    embeddings_ph = tf.placeholder(tf.float32)
    unq_labels, ul_idxs = tf.unique(labels_ph)
    indices = tf.gather(unq_labels, ul_idxs)
    new_centers_with_update = get_new_centers(embeddings_ph, indices)
    new_centers = get_new_centers(None, indices)
    sess.run(tf.initialize_all_variables())
    tf.get_default_graph().finalize()

    for i in range(100001):
        embeddings = 100*np.random.randn(nbatch, ndims)
        labels = np.random.randint(0, nclass, nbatch)
        feed_dict = {embeddings_ph: embeddings, labels_ph: labels}
        rval = sess.run([new_centers_with_update], feed_dict)
        if i % 1000 == 0:
            feed_dict = {labels_ph: range(nclass)}
            rval = sess.run(new_centers, feed_dict)
            print('\nFor step ', i)
            for iclass in range(nclass):
                print('Class %d, center: %s' % (iclass, str(rval[iclass])))
A typical result at step 0 is:
For step 0
Class 0, center: [-1.7618252 -0.30574229]
Class 1, center: [ -4.50493908 10.12403965]
Class 2, center: [ 3.6156714 -9.94263649]
Class 3, center: [-4.20281982 -8.28845882]
and the output at step 10,000 demonstrates convergence:
For step 10000
Class 0, center: [ 0.00313433 -0.00757505]
Class 1, center: [-0.03476512 0.04682625]
Class 2, center: [-0.03865958 0.06585111]
Class 3, center: [-0.02502561 -0.03370816]