Custom Keras binary_crossentropy loss function not working - tensorflow

I’m trying to re-define keras’s binary_crossentropy loss function so that I can customize it but it’s not giving me the same results as the existing one.
I'm using TF 1.13.1 with Keras 2.2.4.
I went through Keras’s github code. My understanding is that the loss in model.compile(optimizer='adam', loss='binary_crossentropy', metrics =['accuracy']), is defined in losses.py, using binary_crossentropy defined in tensorflow_backend.py.
I ran a dummy data and model to test it. Here are my findings:
The custom loss function outputs the same results as keras’s one
Using the custom loss in a keras model gives different accuracy results
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)
import tensorflow as tf
from keras import losses
import keras.backend as K
import keras.backend.tensorflow_backend as tfb
from keras.layers import Dense
from keras import Sequential
#Dummy check of loss output
def binary_crossentropy_custom(y_true, y_pred):
return K.mean(binary_crossentropy_custom_tf(y_true, y_pred), axis=-1)
def binary_crossentropy_custom_tf(target, output, from_logits=False):
"""Binary crossentropy between an output tensor and a target tensor.
# Arguments
target: A tensor with the same shape as `output`.
output: A tensor.
from_logits: Whether `output` is expected to be a logits tensor.
By default, we consider that `output`
encodes a probability distribution.
# Returns
A tensor.
"""
# Note: tf.nn.sigmoid_cross_entropy_with_logits
# expects logits, Keras expects probabilities.
if not from_logits:
# transform back to logits
_epsilon = tfb._to_tensor(tfb.epsilon(), output.dtype.base_dtype)
output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
output = tf.log(output / (1 - output))
return tf.nn.sigmoid_cross_entropy_with_logits(labels=target,
logits=output)
logits = tf.constant([[-3., -2.11, -1.22],
[-0.33, 0.55, 1.44],
[2.33, 3.22, 4.11]])
labels = tf.constant([[1., 1., 1.],
[1., 1., 0.],
[0., 0., 0.]])
custom_sigmoid_cross_entropy_with_logits = binary_crossentropy_custom(labels, logits)
keras_binary_crossentropy = losses.binary_crossentropy(y_true=labels, y_pred=logits)
with tf.Session() as sess:
print('CUSTOM sigmoid_cross_entropy_with_logits: ', sess.run(custom_sigmoid_cross_entropy_with_logits), '\n')
print('KERAS keras_binary_crossentropy: ', sess.run(keras_binary_crossentropy), '\n')
#CUSTOM sigmoid_cross_entropy_with_logits: [16.118095 10.886106 15.942386]
#KERAS keras_binary_crossentropy: [16.118095 10.886106 15.942386]
#Dummy check of model accuracy
X_train = tf.random.uniform((3, 5), minval=0, maxval=1, dtype=tf.dtypes.float32)
labels = tf.constant([[1., 0., 0.],
[0., 0., 1.],
[1., 0., 0.]])
model = Sequential()
#First Hidden Layer
model.add(Dense(5, activation='relu', kernel_initializer='random_normal', input_dim=5))
#Output Layer
model.add(Dense(3, activation='sigmoid', kernel_initializer='random_normal'))
#I ran model.fit for each model.compile below 10 times using the same X_train and provide the range of accuracy measurement
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics =['accuracy']) #0.748 < acc < 0.779
# model.compile(optimizer='adam', loss=losses.binary_crossentropy, metrics =['accuracy']) #0.761 < acc < 0.778
model.compile(optimizer='adam', loss=binary_crossentropy_custom, metrics =['accuracy']) #0.617 < acc < 0.663
history = model.fit(X_train, labels, steps_per_epoch=100, epochs=1)
I'd expect the custom loss function to give similar model accuracy output but it does not. Any idea? Thanks!

Keras automatically selects which accuracy implementation to use according to the loss, and this won't work if you use a custom loss. But in this case you can just explictly use the right accuracy, which is binary_accuracy:
model.compile(optimizer='adam', loss=binary_crossentropy_custom, metrics =['binary_accuracy'])

Related

How to interpret get_weights for Keras GRU?

I am unable to interpret the results of get_weights from a GRU layer. Here's my code -
#Modified from - https://machinelearningmastery.com/understanding-simple-recurrent-neural-networks-in-keras/
from pandas import read_csv
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, GRU
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import math
import matplotlib.pyplot as plt
model = Sequential()
model.add(GRU(units = 2, input_shape = (3,1), activation = 'linear'))
model.add(Dense(units = 1, activation = 'linear'))
model.compile(loss = 'mean_squared_error', optimizer = 'adam')
initial_weights = model.layers[0].get_weights()
print("Shape = ",initial_weights)
I am familiar with GRU concepts. In addition, I understand how the get_weights work for Keras Simple RNN layer, where the first array represents the input weights, the second the activation weights and the third the bias. However, I am lost with output of GRU, which is given below -
Shape = [array([[-0.64266175, -0.0870676 , -0.25356603, -0.03685969, 0.22260845,
-0.04923642]], dtype=float32), array([[ 0.01929092, -0.4932567 , 0.3723044 , -0.6559699 , -0.33790302,
0.27062896],
[-0.4214194 , 0.46456426, 0.27233726, -0.00461334, -0.6533575 ,
-0.32483965]], dtype=float32), array([[0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0.]], dtype=float32)]
I am assuming it has something to do with GRU gates.
Update:7/4 - This page says that keras GRU has 3 gates, update, reset and output. However, based on this, GRU shouldn't have the output gate.
Best way I know would be to track the add_weight() calls in the build() function of the GRUCell.
Let's take an example model,
model = tf.keras.models.Sequential(
[
tf.keras.layers.GRU(32, input_shape=(5, 10), name='gru'),
tf.keras.layers.Dense(10)
]
)
How we'll print some metadata about what's returned by weights = model.get_layer('gru').get_weights(). Which gives,
Number of arrays in weights: 3
Shape of each array in weights: [(10, 96), (32, 96), (2, 96)]
Let's go back to what weights defined by the GRUCell. We got,
self.kernel = self.add_weight(
shape=(input_dim, self.units * 3),
...
)
self.recurrent_kernel = self.add_weight(
shape=(self.units, self.units * 3),
...
)
...
bias_shape = (2, 3 * self.units)
self.bias = self.add_weight(
shape=bias_shape,
...
)
This is what you're seeing as weights (in that order). Here's why they are shaped like this. GRU computations are outlined here.
The first matrix in weights (of shape [10, 96]) is a concatenation of Wz|Wr|Wh (in that order). Each of these is a [10, 32] sized tensor. Concatenation gives a [10, 32*3=96] sized tensor.
Similarly, the second matrix is a concatenation of Uz|Ur|Uh. Each of these is a [32, 32] sized tensor which becomes [32, 96] after concatenation.
You can see how they break this combined weight matrix to each of z, r and h components here.
Finally the bias. It contains 2 biases i.e. [2, 96] sized tensor; input_bias and recurrent_bias. Again, biases from all gates/weights are combined to a single tensor. Typically, only the input_bias is used. But if you have reset_after (decides how the reset gate is applied) set to True, then the recurrent_bias gets used. It's an implementation detail.

Create a weighted MSE loss function in Tensorflow

I want to train a recurrent neural network using Tensorflow. My model outputs a 1 by 100 vector for each training sample. Assume that y = [y_1, y_2, ..., y_100] is my output for training sample x and the expected output is y'= [y'_1, y'_2, ..., y'_100].
I wish to write a custom loss function that calculates the loss of this specific sample as follows:
Loss = 1/sum(weights) * sqrt(w_1*(y_1-y'_1)^2 + ... + w_100*(y_100-y'_100)^2)
which weights = [w_1,...,w_100] is a given weight array.
Could someone help me with implementing such a custom loss function? (I also use mini-batches while training)
I want to underline that you have 2 possibilities according to your problem:
[1] If the weights are equal for all your samples:
You can build a loss wrapper. Here a dummy example:
n_sample = 200
X = np.random.uniform(0,1, (n_sample,10))
y = np.random.uniform(0,1, (n_sample,100))
W = np.random.uniform(0,1, (100,)).astype('float32')
def custom_loss_wrapper(weights):
def loss(true, pred):
sum_weights = tf.reduce_sum(weights) * tf.cast(tf.shape(pred)[0], tf.float32)
resid = tf.sqrt(tf.reduce_sum(weights * tf.square(true - pred)))
return resid/sum_weights
return loss
inp = Input((10,))
x = Dense(256)(inp)
pred = Dense(100)(x)
model = Model(inp, pred)
model.compile('adam', loss=custom_loss_wrapper(W))
model.fit(X, y, epochs=3)
[2] If the weights are different between samples:
You should build your model usind add_loss in order to dinamically take into account the weights for each sample. Here a dummy example:
n_sample = 200
X = np.random.uniform(0,1, (n_sample,10))
y = np.random.uniform(0,1, (n_sample,100))
W = np.random.uniform(0,1, (n_sample,100))
def custom_loss(true, pred, weights):
sum_weights = tf.reduce_sum(weights)
resid = tf.sqrt(tf.reduce_sum(weights * tf.square(true - pred)))
return resid/sum_weights
inp = Input((10,))
true = Input((100,))
weights = Input((100,))
x = Dense(256)(inp)
pred = Dense(100)(x)
model = Model([inp,true,weights], pred)
model.add_loss(custom_loss(true, pred, weights))
model.compile('adam', loss=None)
model.fit([X,y,W], y=None, epochs=3)
When using add_loss you should pass all the tensors involved in the loss as input layer and pass them inside the loss for the computation.
At inference time you can compute predictions as always, simply removing the true and weights as input:
final_model = Model(model.input[0], model.output)
final_model.predict(X)
You can implement custom weighted mse in the following way
import numpy as np
from tensorflow.keras import backend as K
def custom_mse(class_weights):
def weighted_mse(gt, pred):
# Formula:
# w_1*(y_1-y'_1)^2 + ... + w_100*(y_100-y'_100)^2 / sum(weights)
return K.sum(class_weights * K.square(gt - pred)) / K.sum(class_weights)
return weighted_mse
y_true = np.array([[0., 1., 1, 0.], [0., 0., 1., 1.]])
y_pred = np.array([[0., 1, 0., 1.], [1., 0., 1., 1.]])
weights = np.array([0.25, 0.50, 1., 0.75])
print(y_true.shape, y_pred.shape, weights.shape)
(2, 4) (2, 4) (4,)
loss = custom_mse(class_weights=weights)
loss(y_true, y_pred).numpy()
0.8
Using it with model compilation.
model.compile(loss=custom_mse(weights))
This will compute mse with the provided weighted matrices. However, in your question, you quote sqrt..., from which I presume you meant root mse (rmse). To do that you can use K.sqrt(K.sum(...)) / K.sum(...) in the custom function of custom_mse.
FYI, you may also interest to look at class_weights and sample_weights during Model. fit. From source:
class_weight: Optional dictionary mapping class indices (integers) to a weight (float) value, used for weighting the loss
function (during training only). This can be useful to tell the model
to "pay more attention" to samples from an under-represented class.
sample_weight: Optional Numpy array of weights for the training samples, used for weighting the loss function (during training only).
You can either pass a flat (1D) Numpy array with the same length as
the input samples (1:1 mapping between weights and samples), or in the
case of temporal data, you can pass a 2D array with shape (samples,
sequence_length), to apply a different weight to every timestep of
every sample. This argument is not supported when x is a dataset,
generator, or keras.utils.Sequence instance, instead provides the
sample_weights as the third element of x.
And also loss_weights in Model.compile, from source
loss_weights: Optional list or dictionary specifying scalar coefficients (Python floats) to weight the loss contributions of
different model outputs. The loss value that will be minimized by the
model will then be the weighted sum of all individual losses, weighted
by the loss_weights coefficients. If a list, it is expected to have a
1:1 mapping to the model's outputs. If a dict, it is expected to map
output names (strings) to scalar coefficients.
A class version of the weighted mean squared error loss function.
class WeightedMSE(object):
def __init__(self):
pass
def __call__(self, y_true, y_pred, weights):
sum_weights = tf.reduce_sum(weights)
resid = tf.reduce_sum(weights * tf.square(y_true - y_pred))
return resid / sum_weights

The list of Numpy arrays that you are passing to your model is not the size the model expected

I am trying to obtain three different loss functions in Keras by passing them like so
input_img = Input(shape=(728,))
encoded = Dense(450, activation='relu')(input_img)
encoded = Dense(250, activation='relu')(encoded)
encoded= Dense(20, activation='relu')(encoded)
decoded = Dense(250, activation='relu')(encoded)
decoded = Dense(450, activation='relu')(decoded)
decoded = Dense(728, activation='sigmoid')(decoded)
loss1 = Dense(728, activation='sigmoid', name='p1')(decoded)
loss2 = Dense(728, activation='sigmoid', name='p2')(decoded)
loss3 = Dense(728, activation='sigmoid', name='p3')(decoded)
I defined three different loss functions and compiled succesfully
autoencoder = Model(inputs = [input_img], outputs=[loss1,loss2,loss3])
autoencoder.compile(optimizer='Adam', loss = [w_loss,b_loss, loss], metrics = [w_loss,b_loss], loss_weights=[1., 1., 1.])
I then fit the model
history_modified = autoencoder.fit(X_train, X_train, epochs=200, batch_size= 100, shuffle=True, validation_data=(X_test, X_test))
Where X_train dimensions are (100000, 728) and X_test dimensions are (50000,728)
The error I'm getting is
Error when checking model target: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 3 array(s), but instead got the following list of 1 arrays: [array([[0., 0., 0., ..., 0., 0., 0.],
I don't what exactly is causing the problem, but I think it might have to do with the layers and how I have multiple loss functions.
As the error says, it has the problem with the shape of the target. Ideally, as there are three outputs (loss1,loss2,loss3) from the model, there should be three arrays in the output to compare the three output arrays. You must be passing only one array in the target and hence this error.

Keras TimeDistributed Not Masking CNN Model

For the sake of example, I have an input consisting of 2 images,of total shape (2,299,299,3). I'm trying to apply inceptionv3 on each image, and then subsequently process the output with an LSTM. I'm using a masking layer to exclude a blank image from being processed (specified below).
The code is:
import numpy as np
from keras import backend as K
from keras.models import Sequential,Model
from keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D, BatchNormalization, \
Input, GlobalAveragePooling2D, Masking,TimeDistributed, LSTM,Dense,Flatten,Reshape,Lambda, Concatenate
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.applications import inception_v3
IMG_SIZE=(299,299,3)
def create_base():
base_model = inception_v3.InceptionV3(weights='imagenet', include_top=False)
x = GlobalAveragePooling2D()(base_model.output)
base_model=Model(base_model.input,x)
return base_model
base_model=create_base()
#Image mask to ignore images with pixel values of -1
IMAGE_MASK = -2*np.expand_dims(np.ones(IMG_SIZE),0)
final_input=Input((2,IMG_SIZE[0],IMG_SIZE[1],IMG_SIZE[2]))
final_model = Masking(mask_value = -2.)(final_input)
final_model = TimeDistributed(base_model)(final_model)
final_model = Lambda(lambda x: x, output_shape=lambda s:s)(final_model)
#final_model = Reshape(target_shape=(2, 2048))(final_model)
#final_model = Masking(mask_value = 0.)(final_model)
final_model = LSTM(5,return_sequences=False)(final_model)
final_model = Model(final_input,final_model)
#Create a sample test image
TEST_IMAGE = np.ones(IMG_SIZE)
#Create a test sample input, consisting of a normal image and a masked image
TEST_SAMPLE = np.concatenate((np.expand_dims(TEST_IMAGE,axis=0),IMAGE_MASK))
inp = final_model.input # input placeholder
outputs = [layer.output for layer in final_model.layers] # all layer outputs
functors = [K.function([inp]+ [K.learning_phase()], [out]) for out in outputs]
layer_outs = [func([np.expand_dims(TEST_SAMPLE,0), 1.]) for func in functors]
This does not work correctly. Specifically, the model should mask the IMAGE_MASK part of the input, but it instead processes it with inception (giving a nonzero output). here are the details:
layer_out[-1] , the LSTM output is fine:
[array([[-0.15324114, -0.09620268, -0.01668587, 0.07938149, -0.00757846]], dtype=float32)]
layer_out[-2] and layer_out[-3] , the LSTM input is wrong, it should have all zeros in the second array:
[array([[[ 0.37713543, 0.36381325, 0.36197218, ..., 0.23298527,
0.43247852, 0.34844452],
[ 0.24972123, 0.2378867 , 0.11810347, ..., 0.51930511,
0.33289322, 0.33403745]]], dtype=float32)]
layer_out[-4], the input to the CNN is correctly masked:
[[ 1., 1., 1.],
[ 1., 1., 1.],
[ 1., 1., 1.],
...,
[ 1., 1., 1.],
[ 1., 1., 1.],
[ 1., 1., 1.]]],
[[[-0., -0., -0.],
[-0., -0., -0.],
[-0., -0., -0.],
...,
[-0., -0., -0.],
[-0., -0., -0.],
[-0., -0., -0.]],
Note that the code seems to work correctly with a simpler base_model such as:
def create_base():
input_layer=Input(IMG_SIZE)
base_model=Flatten()(input_layer)
base_model=Dense(2048)(base_model)
base_model=Model(input_layer,base_model)
return base_model
I have exhausted most online resources on this. Permutations of this question have been asked on Keras's github, such as here, here and here, but I can't seem to find any concrete resolution.
The links suggest that the issues seem to be stemming from a combination of TimeDistributed being applied to BatchNormalization, and the hacky fixes of either the Lambda identity layer, or Reshape layers remove errors but don't seem to output the correct model.
I've tried to force the base model to support masking via:
base_model.__setattr__('supports_masking',True)
and I've also tried applying an identity layer via:
TimeDistributed(Lambda(lambda x: base_model(x), output_shape=lambda s:s))(final_model)
but none of these seem to work. Note that I would like the final model to be trainable, in particular the CNN part of it should remain trainable.
Not entirely sure this will work, but based on the comment made here, with a newer version of tensorflow + keras it should work:
final_model = TimeDistributed(Flatten())(final_input)
final_model = Masking(mask_value = -2.)(final_model)
final_model = TimeDistributed(Reshape(IMG_SIZE))(final_model)
final_model = TimeDistributed(base_model)(final_model)
final_model = Model(final_input,final_model)
I took a look at the source code of masking, and I noticed Keras creates a mask tensor that only reduces the last axis. As long as you're dealing with 5D tensors, it will cause no problem, but when you reduce the dimensions for the LSTM, this masking tensor becomes incompatible.
Doing the first flatten step, before masking, will assure that the masking tensor works properly for 3D tensors. Then you expand the image again to its original size.
I'll probably try to install newer versions soon to test it myself, but these installing procedures have caused too much trouble and I'm in the middle of something important here.
On my machine, this code compiles, but that strange error appears in prediction time (see link at the first line of this answer).
Creating a model for predicting the intermediate layers
I'm not sure, by the code I've seen, that the masking function is kept internally in tensors. I don't know exactly how it works, but it seems to be managed separately from the building of the functions inside the layers.
So, try using a keras standard model to make the predictions:
inp = final_model.input # input placeholder
outputs = [layer.output for layer in final_model.layers] # all layer outputs
fullModel = Model(inp,outputs)
layerPredictions = fullModel.predict(np.expand_dims(TEST_SAMPLE,0))
print(layerPredictions[-2])
It seems to be working as intended. Masking in Keras doesn't produce zeros as you would expect, it instead skips the timesteps that are masked in upstream layers such as LSTM and loss calculation. In case of RNNs, Keras (at least tensorflow) is implemented such that the states from the previous step are carried over, tensorflow_backend.py. This is done in part to preserve the shapes of tensors when dynamic input is given.
If you really want zeros you will have to implement your own layer with a similar logic to Masking and return zeros explicitly. To solve your problem, you need a mask before the final LSTM layer using the final_input:
class MyMask(Masking):
"""Layer that adds a mask based on initial input."""
def compute_mask(self, inputs, mask=None):
# Might need to adjust shapes
return K.any(K.not_equal(inputs[0], self.mask_value), axis=-1)
def call(self, inputs):
# We just return input back
return inputs[1]
def compute_output_shape(self, input_shape):
return input_shape[1]
final_model = MyMask(mask_value=-2.)([final_input, final_model])
You probably can attach the mask in a simpler manner but this custom class essentially adds a mask based on your initial inputs and outputs a Keras tensor that now has a mask.
Your LSTM will ignore in your example the second image. To confirm you can return_sequences=Trueand check that the output for 2 images are identical.
I'm trying implement the same thing, I want my LSTM sequences to have variable sizes. However I can't even implement your original model. I obtain the following error: TypeError: Layer input_1 does not support masking, but was passed an input_mask: Tensor("time_distributed_1/Reshape_1:0", shape=(?, 100, 100), dtype=bool) I'm using tensorflow 1.10 and keras 2.2.2
I solved the problem by adding a second input, a mask to specify which timesteps to take into account for the LSTM. That way the image sequence always has the same number of timesteps, the CNN always generates an output, but some of them are ignored for the LSTM input. However, the missing images need to be chosen carefully so that the batch normalization is not affected.
def LSTM_CNN(params):
resnet = ResNet50(include_top=False, weights='imagenet', pooling = 'avg')
input_layer = Input(shape=(params.numFrames, params.height, params.width, 3))
input_mask = Input(shape=(params.numFrames,1))
curr_layer = TimeDistributed(resnet)(input_layer)
resnetOutput = Dropout(0.5)(curr_layer)
curr_layer = multiply([resnetOutput,input_mask])
cnn_output = curr_layer
curr_layer = Masking(mask_value=0.0)(curr_layer)
lstm_out = LSTM(256, dropout=0.5)(curr_layer)
output = Dense(output_dim=params.numClasses, activation='sigmoid')(lstm_out)
model = Model([input_layer, input_mask], output)
return model

tensorflow simple estimator input function problems

I am trying to create a simple input function with the feature data being the numbers 1-10 and the labels being 0 when x < 5; 5 when x = 5 and 10 when x > 5.
example:
# data
nmbrs = [10., 1., 2., 3., 4., 5., 6. , 7., 8., 9.]
labels = [10., 0., 0., 0., 0., 5., 10., 10., 10., 10.]
# input function
input_fn = tf.estimator.inputs.numpy_input_fn(
x={'numbers': np.array(nmbrs)}, y=np.array(labels),
batch_size=batch_size, num_epochs=None, shuffle=True)
The problem i am having is that the nmbrs and labels array doesnt seem to be in the right form, i tried making it into a 2d array but that didnt work either im sure im doing something really easy wrong here...
EDIT: model and neural net functions
def neural_net(x_dict):
# TF Estimator input is a dict, in case of multiple inputs
x = x_dict['numbers']
# Hidden fully connected layer with 128 neurons
layer_1 = tf.layers.dense(x, n_hidden_1)
# Hidden fully connected layer with 128 neurons
layer_2 = tf.layers.dense(layer_1, n_hidden_2)
# Output fully connected layer with a neuron for each class
out_layer = tf.layers.dense(layer_2, num_classes)
return out_layer
# Define the model function (following TF Estimator Template)
def model_fn(features, labels, mode):
# Build the neural network
logits = neural_net(features)
# Predictions
pred_classes = tf.argmax(logits, axis=1)
pred_probas = tf.nn.softmax(logits)
# If prediction mode, early return
if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(mode, predictions=pred_classes)
# Define loss and optimizer
loss_op = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
logits=logits, labels=tf.cast(labels, dtype=tf.int32)))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op, global_step=tf.train.get_global_step())