Object localization MNIST Tensorflow to Pytorch : Losses don't decrease - tensorflow

I am trying to convert a Tensorflow object localization code into Pytorch. In the original code, the author uses model.compile / model.fit to train the model, so I don't understand how the losses for the classification of the MNIST digits and the box regression work. Still, I'm trying to implement my own training loop in Pytorch.
The goal here is, after some preprocessing, to paste the MNIST digits randomly onto a black square image and then classify and localize (bounding box) the digit.
I set two losses: nn.CrossEntropyLoss and nn.MSELoss, and I do (loss_1+loss_2).backward() to compute the gradients. I know it's the right way to compute gradients with two losses from here and here.
But still, my loss doesn't decrease, whereas it collapses almost immediately with the Tensorflow code. I checked the model with torchinfo.summary and it seems to behave the same as the Tensorflow implementation.
EDIT :
I looked at the predicted labels of my model and they don't seem to change at all.
This line of code label_preds, bbox_coords_preds = model(digits) always returns the same values:
label_preds[0] = tensor([[0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156]], device='cuda:0', grad_fn=<SliceBackward0>)
Here are my questions :
Is my custom network set up correctly?
Are my losses set up correctly?
Why don't my label predictions change?
Does my training loop work the same way as the Tensorflow .compile and .fit methods?
Thanks a lot !
PYTORCH CODE
class ConvNetwork(nn.Module):
    def __init__(self):
        super(ConvNetwork, self).__init__()
        self.conv2d_1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3)
        self.conv2d_2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3)
        self.conv2d_3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
        self.avgPooling2D = nn.AvgPool2d((2,2))
        self.dense_1 = nn.Linear(in_features=3136, out_features=128)
        self.dense_classifier = nn.Linear(in_features=128, out_features=10)
        self.softmax = nn.Softmax(dim=0)
        self.dense_regression = nn.Linear(in_features=128, out_features=4)

    def forward(self, input):
        x = self.avgPooling2D(F.relu(self.conv2d_1(input)))
        x = self.avgPooling2D(F.relu(self.conv2d_2(x)))
        x = self.avgPooling2D(F.relu(self.conv2d_3(x)))
        x = nn.Flatten()(x)
        x = F.relu(self.dense_1(x))
        output_classifier = self.softmax(self.dense_classifier(x))
        output_regression = self.dense_regression(x)
        return [output_classifier, output_regression]
######################################################
learning_rate = 0.1
EPOCHS = 1
BATCH_SIZE = 64
model = ConvNetwork()
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
classification_loss = nn.CrossEntropyLoss()
regression_loss = nn.MSELoss()
######################################################
begin_time = time.time()
for epoch in range(EPOCHS):
    tot_loss = 0
    train_start = time.time()
    training_losses = []
    print("-"*20)
    print(" "*5 + f"EPOCH {epoch+1}/{EPOCHS}")
    print("-"*20)
    model.train()
    for batch, (digits, labels, bbox_coords) in enumerate(training_dataset):
        digits, labels, bbox_coords = digits.to(device), labels.to(device), bbox_coords.to(device)
        optimizer.zero_grad()
        [label_preds, bbox_coords_preds] = model(digits)
        class_loss = classification_loss(label_preds, labels)
        box_loss = regression_loss(bbox_coords_preds, bbox_coords)
        training_loss = class_loss + box_loss
        training_loss.backward()
        optimizer.step()
        ######### print part #######################
        training_losses.append(training_loss.item())
        if batch+1 <= len_training_ds//BATCH_SIZE:
            current_training_sample = (batch+1)*BATCH_SIZE
        else:
            current_training_sample = (batch)*BATCH_SIZE + len_training_ds%BATCH_SIZE
        if (batch+1) == 1 or (batch+1)%100 == 0 or (batch+1) == len_training_ds//BATCH_SIZE + 1:
            print(f"Elapsed time : {(time.time()-train_start)/60:.3f}",
                  f" --- Digit : {current_training_sample}/{len_training_ds}",
                  f" : loss = {training_loss:.5f}")
        if batch+1 == (len_training_ds//BATCH_SIZE)+1:
            print(f"Total elapsed time for training : {(time.time()-begin_time)/60:.3f}")
ORIGINAL TENSORFLOW CODE
def feature_extractor(inputs):
    x = tf.keras.layers.Conv2D(16, activation='relu', kernel_size=3, input_shape=(75, 75, 1))(inputs)
    x = tf.keras.layers.AveragePooling2D((2, 2))(x)
    x = tf.keras.layers.Conv2D(32, kernel_size=3, activation='relu')(x)
    x = tf.keras.layers.AveragePooling2D((2, 2))(x)
    x = tf.keras.layers.Conv2D(64, kernel_size=3, activation='relu')(x)
    x = tf.keras.layers.AveragePooling2D((2, 2))(x)
    return x

def dense_layers(inputs):
    x = tf.keras.layers.Flatten()(inputs)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    return x

def classifier(inputs):
    classification_output = tf.keras.layers.Dense(10, activation='softmax', name='classification')(inputs)
    return classification_output

def bounding_box_regression(inputs):
    bounding_box_regression_output = tf.keras.layers.Dense(units='4', name='bounding_box')(inputs)
    return bounding_box_regression_output

def final_model(inputs):
    feature_cnn = feature_extractor(inputs)
    dense_output = dense_layers(feature_cnn)
    classification_output = classifier(dense_output)
    bounding_box_output = bounding_box_regression(dense_output)
    model = tf.keras.Model(inputs=inputs, outputs=[classification_output, bounding_box_output])
    return model

def define_and_compile_model(inputs):
    model = final_model(inputs)
    model.compile(optimizer='adam',
                  loss={'classification': 'categorical_crossentropy',
                        'bounding_box': 'mse'},
                  metrics={'classification': 'accuracy',
                           'bounding_box': 'mse'})
    return model

inputs = tf.keras.layers.Input(shape=(75, 75, 1,))
model = define_and_compile_model(inputs)

EPOCHS = 10 # 45
steps_per_epoch = 60000//BATCH_SIZE  # 60,000 items in this dataset
validation_steps = 1

history = model.fit(training_dataset,
                    steps_per_epoch=steps_per_epoch,
                    validation_data=validation_dataset,
                    validation_steps=validation_steps, epochs=EPOCHS)

loss, classification_loss, bounding_box_loss, classification_accuracy, bounding_box_mse = model.evaluate(validation_dataset, steps=1)
print("Validation accuracy: ", classification_accuracy)

I am answering my own question about this bug:
What I found :
I was using a Softmax layer in my model while also using nn.CrossEntropyLoss() as the loss.
What this problem was causing :
This loss already applies a (log-)softmax internally (doc).
Applying a softmax twice adds noise to the loss and prevents convergence.
What I did :
Leave a plain linear layer as the output of the classification head, so the loss receives raw logits (see the sketch below).
Another way is to use nn.NLLLoss (doc) instead and keep a LogSoftmax layer in the model class.
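A minimal sketch of both options (generic code, not the exact model from the question): nn.CrossEntropyLoss takes raw logits, while nn.NLLLoss expects log-probabilities, for instance from nn.LogSoftmax:
import torch
import torch.nn as nn

classifier_head = nn.Linear(128, 10)        # no Softmax here
features = torch.randn(64, 128)             # dummy batch of features
labels = torch.randint(0, 10, (64,))        # dummy integer class labels

# Option 1: raw logits + CrossEntropyLoss (applies log-softmax internally)
loss_ce = nn.CrossEntropyLoss()(classifier_head(features), labels)

# Option 2: LogSoftmax kept in the model + NLLLoss
log_probs = nn.LogSoftmax(dim=1)(classifier_head(features))
loss_nll = nn.NLLLoss()(log_probs, labels)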
Also :
I don't fully understand how the Tensorflow .compile() and .fit() methods work, but I think they optimize the training one way or another (I suspect the learning rate), since I had to decrease the learning rate to 0.001 in Pytorch to "unstick" the loss and make it decrease.
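For reference, Keras's Adam optimizer defaults to a learning rate of 0.001 when compiled with optimizer='adam', so a hypothetical way to mirror that setting in the Pytorch loop above is simply:
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)  # matches the Keras Adam default of 0.001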

Related

Time-Series Transformer Model trains well but performs worse on Test Data

I have created a transformer model for multivariate time series predictions (many-to-one classification model).
Details about the Dataset
I have hourly data, i.e., 8 different features (hour, month, temperature, humidity, windspeed, solar radiation concentration, etc.), and with them I am trying to predict a time sequence (the energy consumption of a building). So my input has the shape X.shape = (8783, 168, 8), i.e., 8783 time sequences, each sequence containing 168 hourly entries/vectors, and each vector containing 8 features. My output has the shape Y.shape = (8783, 1), i.e., 8783 sequences, each containing 1 output value (the building energy consumption value after every hour).
Model Details
I took as a model an example from the official Keras site. It was created for classification problems; I modified it for my regression problem by changing the activation of the last output layer from sigmoid to relu.
Input shape (train_f) = (8783, 168, 8)
Output shape (train_P) = (8783,1)
When I train the model for 100 epochs, it converges very well in fewer epochs than my reference models (i.e., LSTMs and LSTMs with self-attention). However, after training, when the model is asked to make predictions on the test data, the prediction performance is worse than that of the reference models.
I would be grateful if you could have a look at the code and let me know of potential steps to improve the prediction/test accuracy.
Here is the code;
df_weather = pd.read_excel(r"Downloads\WeatherData.xlsx")
df_energy = pd.read_excel(r"Downloads\Building_energy_consumption_record.xlsx")
visa = pd.concat([df_weather, df_energy], axis = 1)
df_data = visa.loc[:, ~visa.columns.isin(["Time1", "TD", "U", "DR", "FX"])]
msna.bar(df_data)
plt.figure(figsize = (16,6))
sb.heatmap(df_data.corr(), annot = True, linewidths=1, fmt = ".2g", cmap= 'coolwarm')
plt.xticks(rotation = 'horizontal') # orientation of the tick labels
extract_for_normalization = list(df_data)[1:9]
df_data_float = df_data[extract_for_normalization].astype(float)
from sklearn.model_selection import train_test_split
train_X, test_X = train_test_split(df_data_float, train_size = 0.7, shuffle = False)
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_train_X=scaler.fit_transform(train_X)
**Converting train_X into required shape (inputs,sequences, features)**
train_f = [] #features input from training data
train_p = [] # prediction values
#test_q = []
#test_r = []
n_future = 1 #number of days we want to predict into the future
n_past = 168 # no. of time series input features to be considered for training
for val in range(n_past, len(scaled_train_X) - n_future + 1):
    train_f.append(scaled_train_X[val - n_past:val, 0:scaled_train_X.shape[1]])
    train_p.append(scaled_train_X[val + n_future - 1:val + n_future, -1])
train_f, train_p = np.array(train_f), np.array(train_p)
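A quick sanity check on the arrays built above (the expected shapes are the ones stated earlier in the question; this line is not part of the original code):
print(train_f.shape, train_p.shape)  # expected: (8783, 168, 8) and (8783, 1)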
**Transformer Model**
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    # Normalization and Attention
    x = layers.LayerNormalization(epsilon=1e-6)(inputs)
    x = layers.MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads, dropout=dropout
    )(x, x)
    x = layers.Dropout(dropout)(x)
    res = x + inputs

    # Feed Forward Part
    x = layers.LayerNormalization(epsilon=1e-6)(res)
    x = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(x)
    x = layers.Dropout(dropout)(x)
    x = layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(x)
    return x + res

def build_model(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    dropout=0,
    mlp_dropout=0,
):
    inputs = keras.Input(shape=input_shape)
    x = inputs
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)
    x = layers.GlobalAveragePooling1D(data_format="channels_first")(x)
    for dim in mlp_units:
        x = layers.Dense(dim, activation="relu")(x)
        x = layers.Dropout(mlp_dropout)(x)
    outputs = layers.Dense(train_p.shape[1])(x)
    return keras.Model(inputs, outputs)

input_shape = (train_f.shape[1], train_f.shape[2])

model = build_model(
    input_shape,
    head_size=256,
    num_heads=4,
    ff_dim=4,
    num_transformer_blocks=4,
    mlp_units=[128],
    mlp_dropout=0.4,
    dropout=0.25,
)

model.compile(loss=tf.keras.losses.mean_absolute_error,
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              metrics=["mse"])
model.summary()
history = model.fit(train_f, train_p, epochs=100, batch_size = 32, validation_split = 0.15, verbose = 1)
trainYPredict = model.predict(train_f)
**Inverse transform the prediction and keep the last value(output)**
trainYPredict1 = np.repeat(trainYPredict, scaled_train_X.shape[1], axis = -1)
trainYPredict_actual = scaler.inverse_transform(trainYPredict1)[:, -1]
train_p_actual = np.repeat(train_p, scaled_train_X.shape[1], axis = -1)
train_p_actual1 = scaler.inverse_transform(train_p_actual)[:, -1]
Prediction_mse=mean_squared_error(train_p_actual1 ,trainYPredict_actual)
print("Mean Squared Error of prediction is:", str(Prediction_mse))
Prediction_rmse =sqrt(Prediction_mse)
print("Root Mean Squared Error of prediction is:", str(Prediction_rmse))
prediction_r2=r2_score(train_p_actual1 ,trainYPredict_actual)
print("R2 score of predictions is:", str(prediction_r2))
prediction_mae=mean_absolute_error(train_p_actual1 ,trainYPredict_actual)
print("Mean absolute error of prediction is:", prediction_mae)
**Testing of model**
scaled_test_X = scaler.transform(test_X)
test_q = []
test_r = []
for val in range(n_past, len(scaled_test_X) - n_future + 1):
    test_q.append(scaled_test_X[val - n_past:val, 0:scaled_test_X.shape[1]])
    test_r.append(scaled_test_X[val + n_future - 1:val + n_future, -1])
test_q, test_r = np.array(test_q), np.array(test_r)
testPredict = model.predict(test_q )
A plot of the training and validation loss is also attached (Training and validation Loss).

Completely different results using Tensorflow and Pytorch for MobilenetV3 Small

I am using transfer learning from MobileNetV3 Small to predict 5 different points on an image. I am doing this as a regression task.
For both models:
Setting the last 50 layers trainable and adding the same fully connected layers to the end.
Learning rate 3e-2
Batch size 32
Adam optimizer with the same betas
100 epochs
The inputs consist of RGB unscaled images
Pytorch
Model
def _init_weights(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)

def get_mob_v3_small():
    model = torchvision.models.mobilenet_v3_small(pretrained=True)
    children_list = get_children(model)
    for c in children_list[:-50]:
        for p in c.parameters():
            p.requires_grad = False
    return model

class TransferMobileNetV3_v2(nn.Module):
    def __init__(self,
                 num_keypoints: int = 5):
        super(TransferMobileNetV3_v2, self).__init__()
        self.classifier_neurons = num_keypoints*2
        self.base_model = get_mob_v3_small()
        self.base_model.classifier = nn.Sequential(
            nn.Linear(in_features=1024, out_features=1024),
            nn.ReLU(),
            nn.Linear(in_features=1024, out_features=512),
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=self.classifier_neurons)
        )
        self.base_model.apply(_init_weights)

    def forward(self, x):
        out = self.base_model(x)
        return out
Training Script
def train(net, trainloader, testloader, train_loss_fn, optimizer, scaler, args):
    len_dataloader = len(trainloader)
    for epoch in range(1, args.epochs+1):
        net.train()
        for batch_idx, sample in enumerate(trainloader):
            inputs, labels = sample
            inputs, labels = inputs.to(args.device), labels.to(args.device)
            optimizer.zero_grad()
            with torch.cuda.amp.autocast(args.use_amp):
                prediction = net(inputs)
                loss = train_loss_fn(prediction, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

def main():
    args = make_args_parser()
    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    seed = args.seed
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(net.parameters(), lr=3e-2,
                           betas=(0.9, 0.999))
    scaler = torch.cuda.amp.GradScaler(enabled=args.use_amp)
    train(net, train_loader, test_loader, loss_fn, optimizer, scaler, args)
Tensorflow
Model
base_model = tf.keras.applications.MobileNetV3Small(weights='imagenet',
                                                    input_shape=(224,224,3))
x_in = base_model.layers[-6].output
x = Dense(units=1024, activation="relu")(x_in)
x = Dense(units=512, activation="relu")(x)
x = Dense(units=10, activation="linear")(x)
model = Model(inputs=base_model.input, outputs=x)

for layer in model.layers[:-50]:
    layer.trainable = False
Training Script
model.compile(loss = "mse",
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-2))
history = model.fit(input_numpy, output_numpy,
verbose=1,
batch_size=32, epochs=100,validation_split = 0.2)
Results
The PyTorch model predicts one single point around the center for all 5 different points.
The Tensorflow model predicts the points quite well and are quite accurate.
The loss in the Pytorch model is much higher than the Tensorflow model.
Please do let me know what is going wrong, as I am trying my best to shift to PyTorch for this work and I need this model to give me similar/identical results.
Note: I also noticed that the MobileNetV3 Small model seems to differ between PyTorch and Tensorflow. I do not know if I am interpreting it wrong, but I'm putting it here just in case.
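As a side note on that last point, a generic way (not from the original post) to eyeball the two implementations is to print their layer listings:
import torchvision
import tensorflow as tf

print(torchvision.models.mobilenet_v3_small(pretrained=True))  # PyTorch layer structure
tf.keras.applications.MobileNetV3Small(weights='imagenet', input_shape=(224, 224, 3)).summary()  # Keras layer structure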

My Pytorch model is giving very bad results

I am new to Deep Learning with Pytorch. I am more experienced with Tensorflow, so I should say I am not new to Deep Learning itself.
Currently, I am working on a simple ANN classification. There are only 2 classes so quite naturally I am using a Softmax BCELoss combination.
The dataset is like this:
shape of X_train (891, 7)
Shape of Y_train (891,)
Shape of x_test (418, 7)
I transformed the X_train and others to torch tensors as train_data and so on. The next step is:
train_ds = TensorDataset(train_data, train_label)
# Define data loader
batch_size = 32
train_dl = DataLoader(train_ds, batch_size, shuffle=True)
I made the model class like:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(7, 32)
        self.bc1 = nn.BatchNorm1d(32)
        self.fc2 = nn.Linear(32, 64)
        self.bc2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, 128)
        self.bc3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, 32)
        self.bc4 = nn.BatchNorm1d(32)
        self.fc5 = nn.Linear(32, 10)
        self.bc5 = nn.BatchNorm1d(10)
        self.fc6 = nn.Linear(10, 1)
        self.bc6 = nn.BatchNorm1d(1)
        self.drop = nn.Dropout2d(p=0.5)

    def forward(self, x):
        torch.nn.init.xavier_uniform(self.fc1.weight)
        x = self.fc1(x)
        x = self.bc1(x)
        x = F.relu(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.bc2(x)
        x = F.relu(x)
        #x = self.drop(x)
        x = self.fc3(x)
        x = self.bc3(x)
        x = F.relu(x)
        x = self.drop(x)
        x = self.fc4(x)
        x = self.bc4(x)
        x = F.relu(x)
        #x = self.drop(x)
        x = self.fc5(x)
        x = self.bc5(x)
        x = F.relu(x)
        x = self.drop(x)
        x = self.fc6(x)
        x = self.bc6(x)
        x = torch.sigmoid(x)
        return x
model = Net()
The loss function and the optimizer are defined:
loss = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
At last, the task is to run the forward in epochs:
num_epochs = 1000
# Repeat for given number of epochs
for epoch in range(num_epochs):
    # Train with batches of data
    for xb, yb in train_dl:
        pred = model(xb)
        yb = torch.unsqueeze(yb, 1)
        #print(pred, yb)
        print('grad', model.fc1.weight.grad)
        l = loss(pred, yb)
        #print('loss',l)
        # 3. Compute gradients
        l.backward()
        # 4. Update parameters using gradients
        optimizer.step()
        # 5. Reset the gradients to zero
        optimizer.zero_grad()
    # Print the progress
    if (epoch+1) % 10 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, l.item()))
I can see in the output that, for each batch, the gradients of the weights are non-zero before zero_grad is applied.
However, the model is pretty bad. I get an F1 score of only around 50%! And the model is bad even when I ask it to predict on train_dl itself!
I am wondering what the reason is. Are the gradients non-zero but not updating the weights properly? Is the optimizer not optimizing the weights? Or something else?
Can someone please have a look?
I already tried different loss functions and optimizers. I tried with smaller datasets, bigger batches, different hyperparameters.
Thanks! :)
First of all, you don't use a softmax activation for BCE loss unless you have 2 output nodes, which is not the case. In PyTorch, BCE loss doesn't apply any activation function before calculating the loss, unlike CCE, which has a built-in softmax function. So, if you want to use BCE, you have to use a sigmoid (or any function f: R -> [0, 1]) at the output layer, which you don't have.
Moreover, you should ideally call optimizer.zero_grad() for each batch if you want to do SGD (which is the default). If you don't do that, you will just be doing full-batch gradient descent, which is quite slow and gets stuck in local minima easily.
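To illustrate the first point with a generic sketch (not the asker's model): nn.BCELoss expects probabilities in [0, 1], so the network must end in a sigmoid, whereas nn.BCEWithLogitsLoss takes raw logits and applies the sigmoid internally:
import torch
import torch.nn as nn

logits = torch.randn(32, 1)                     # raw outputs of a final nn.Linear(..., 1)
targets = torch.randint(0, 2, (32, 1)).float()  # binary labels

loss_bce = nn.BCELoss()(torch.sigmoid(logits), targets)  # needs an explicit sigmoid
loss_bcewl = nn.BCEWithLogitsLoss()(logits, targets)     # sigmoid applied internally, numerically safer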

How to multiply a layer by a constant vector element wise in Keras?

I want to make a weighted-average ensemble of 3 of my trained models. So, I first want to multiply the softmax output of each model (element-wise) by a weight vector and then average the 3 weighted outputs of the 3 models.
I used the following code to multiply the output of the first model by its weight vector:
from keras.layers import Multiply, Average
resnet_weights = np.asarray([[0.91855, 0.99485, 0.89065, 0.96525, 0.98005,
0.93645, 0.6149, 0.934, 0.92505, 0.785, 0.85]], np.float32)
resnet_weight_tensor=tf.constant(resnet_weights, np.float32)
sess = tf.InteractiveSession()
print(resnet_weight_tensor.eval())
sess.close()
resnet_weighted = Multiply()([finetuned_model.layers[-1].output, resnet_weight_tensor])
print(resnet_weighted)
new_model=Model(model.input, resnet_weighted)
However, I'm stuck with the following error:
What can I do?
Use Lambda instead of Multiply, and K.constant instead of tf.constant (it is backend-neutral):
resnet_weight_tensor=K.constant(resnet_weights, 'float32')
out = finetuned_model.layers[-1].output
resnet_weighted = Lambda(lambda x: x * resnet_weight_tensor)(out)
FULL EXAMPLE:
## BUILD MODELS
batch_size = 32
num_batches = 100
input_shape = (4,)
num_classes = 3
model_1 = make_model(input_shape, 8, num_classes)
model_2 = make_model(input_shape, 10, num_classes)
model_3 = make_model(input_shape, 12, num_classes)
## BUILD ENSEMBLE
models = (model_1, model_2, model_3)
models_ins = [model.input for model in models]
models_outs = [model.output for model in models]
outputs_weights = [np.random.random((batch_size, num_classes)),
np.random.random((batch_size, num_classes)),
np.random.random((batch_size, num_classes))]
outs_avg = model_outputs_average(models, outputs_weights)
final_out = Dense(num_classes, activation='softmax')(outs_avg)
model_ensemble = Model(inputs=models_ins, outputs=final_out)
model_ensemble.compile('adam', loss='categorical_crossentropy')
### TEST ENSEMBLE
x1 = np.random.randn(batch_size, *input_shape) # toy data
x2 = np.random.randn(batch_size, *input_shape)
x3 = np.random.randn(batch_size, *input_shape)
y = np.random.randint(0,2,(batch_size, num_classes)) # toy labels
model_ensemble.fit([x1,x2,x3], y)
Verify averaging:
[print(layer.name) for layer in model_ensemble.layers] # show layer names
preouts1 = get_layer_outputs(model_ensemble, 'lambda_1', [x1,x2,x3])
preouts2 = get_layer_outputs(model_ensemble, 'lambda_2', [x1,x2,x3])
preouts3 = get_layer_outputs(model_ensemble, 'lambda_3', [x1,x2,x3])
preouts_avg = get_layer_outputs(model_ensemble, 'average_1',[x1,x2,x3])
preouts = np.asarray([preouts1, preouts2, preouts3])
sum_of_diff_of_means = np.sum(np.mean(preouts, axis=0) - preouts_avg)
print(np.sum(np.mean([preouts1, preouts2, preouts3],axis=0) - preouts_avg))
# 4.69e-07
Functions used:
def make_model(input_shape, dense_dim, num_classes=3):
    ipt = Input(shape=input_shape)
    x = Dense(dense_dim, activation='relu')(ipt)
    out = Dense(num_classes, activation='softmax')(x)
    model = Model(ipt, out)
    model.compile('adam', loss='categorical_crossentropy')
    return model

def model_outputs_average(models, outputs_weights):
    outs = [model.output for model in models]
    out_shape = K.int_shape(outs[0])[1:]  # ignore batch dim
    assert all([(K.int_shape(out)[1:] == out_shape) for out in outs]), \
        "All model output shapes must match"

    outs_weights = [K.constant(w, 'float32') for w in outputs_weights]
    ow_shape = K.int_shape(outs_weights[0])
    assert all([(K.int_shape(w) == ow_shape) for w in outs_weights]), \
        "All outputs_weights and model.output shapes must match"

    weights_layers = [Lambda(lambda x: x * ow)(out) for ow, out
                      in zip(outs_weights, outs)]
    return Average()(weights_layers)

def get_layer_outputs(model, layer_name, input_data, train_mode=False):
    outputs = [layer.output for layer in model.layers if layer_name in layer.name]
    layers_fn = K.function([model.input, K.learning_phase()], outputs)
    return [layers_fn([input_data, int(train_mode)])][0][0]
The bug is possibly caused by mixing the Keras API and the Tensorflow API, since your resnet_weight_tensor is a tensor from the Tensorflow API, while finetuned_model.layers[-1].output is the output of a Keras layer. Some discussion can be found here: issue 7362.
One workaround is to wrap resnet_weight_tensor in a Keras Input layer.
from keras.layers import Multiply, Average, Input
resnet_weights = np.asarray([[0.91855, 0.99485, 0.89065, 0.96525, 0.98005,
0.93645, 0.6149, 0.934, 0.92505, 0.785, 0.85]], np.float32)
resnet_weight_tensor=tf.constant(resnet_weights, np.float32)
resnet_weight_input = Input(tensor=resnet_weight_tensor)
sess = tf.InteractiveSession()
print(resnet_weight_tensor.eval())
sess.close()
resnet_weighted = Multiply()([finetuned_model.layers[-1].output, resnet_weight_input])
print(resnet_weighted)
new_model=Model([model.input, resnet_weight_input], resnet_weighted)

Finetuning DNN with continuous outputs in the last layer

I would greatly appreciate it if someone could help me out here:
I'm trying to do some fine-tuning on a regression task --- my inputs are 200x200 RGB images and my prediction output/label is a set of real values (let's say, within [0,10], though scaling is not a big deal here...?) --- on top of the InceptionV3 architecture. Here are my functions that take a pretrained Inception model, remove the last layer, add a new layer, and set it up for fine-tuning...
"""
Fine-tuning functions
"""
IM_WIDTH, IM_HEIGHT = 299, 299 #fixed size for InceptionV3
NB_EPOCHS = 3
BAT_SIZE = 32
FC_SIZE = 1024
NB_IV3_LAYERS_TO_FREEZE = 172
def eucl_dist(inputs):
x, y = inputs
return ((x - y)**2).sum(axis=-1)
def add_new_last_continuous_layer(base_model):
"""Add last layer to the convnet
Args:
base_model: keras model excluding top, for instance:
base_model = InceptionV3(weights='imagenet',include_top=False)
Returns:
new keras model with last layer
"""
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(FC_SIZE, activation='relu')(x)
predictions = Lambda(eucl_dist, output_shape=(1,))(x)
model = Model(input=base_model.input, output=predictions)
return model
def setup_to_finetune_continuous(model):
"""Freeze the bottom NB_IV3_LAYERS and retrain the remaining top
layers.
note: NB_IV3_LAYERS corresponds to the top 2 inception blocks in
the inceptionv3 architecture
Args:
model: keras model
"""
for layer in model.layers[:NB_IV3_LAYERS_TO_FREEZE]:
layer.trainable = False
for layer in model.layers[NB_IV3_LAYERS_TO_FREEZE:]:
layer.trainable = True
model.compile(optimizer=SGD(lr=0.0001, momentum=0.9),
loss='eucl_dist')
Here are my implementations:
base_model = InceptionV3(weights = "imagenet",
include_top=False, input_shape=(3,200,200))
model0 = add_new_last_continuous_layer(base_model)
setup_to_finetune_continuous(model0)
history=model0.fit(train_x, train_y, validation_data = (test_x, test_y), nb_epoch=epochs, batch_size=32)
scores = model0.evaluate(test_x, test_y, verbose = 0)
features = model0.predict(X_train)
where train_x is a (168435, 3, 200, 200) numpy array and train_y is a (168435,) numpy array. The same goes for test_x and test_y, except that the number of observations is 42509.
I got the TypeError: Tensor object is not iterable bug, which occurred at predictions = Lambda(eucl_dist, output_shape=(1,))(x) when going through the add_new_last_continuous_layer() function. Could anyone kindly give me some guidance on how to get around this and what the problem is? Greatly appreciated, and happy holidays!
EDIT:
Changed the functions to:
def eucl_dist(inputs):
    x, y = inputs
    return ((x - y)**2).sum(axis=-1)

def add_new_last_continuous_layer(base_model):
    """Add last layer to the convnet
    Args:
        base_model: keras model excluding top, for instance:
            base_model = InceptionV3(weights='imagenet', include_top=False)
    Returns:
        new keras model with last layer
    """
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x1 = Dense(FC_SIZE, activation='relu')(x)
    x2 = Dense(FC_SIZE, activation='relu')(x)
    predictions = Lambda(eucl_dist, output_shape=eucl_dist_shape)([x1, x2])
    model = Model(input=base_model.input, output=predictions)
    return model
Your output shape for the lambda layer is wrong. Define your functions like this:
from keras import backend as K

def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))

def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

predictions = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([input1, input2])
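Plugging these into the edited add_new_last_continuous_layer() from the question would then look roughly like this (a sketch under the same assumptions as the edit, not the answerer's full code):
x = base_model.output
x = GlobalAveragePooling2D()(x)
x1 = Dense(FC_SIZE, activation='relu')(x)
x2 = Dense(FC_SIZE, activation='relu')(x)
predictions = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([x1, x2])
model = Model(input=base_model.input, output=predictions)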