Compound Poisson Keras custom loss function - tensorflow

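I am trying to implement a custom loss function in TensorFlow as the negative log-likelihood of the following compound Poisson-Gamma density: $p(z) = e^{-\lambda}\,\delta_0(z) + \sum_{i=1}^{\infty} \mathrm{Poisson}(i \mid \lambda)\,\mathrm{Gamma}(z \mid i\alpha, \beta)$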
The first term (represented by the Dirac delta) refers to the case when z == 0, while the sum (which needs to be truncated at some point in the implementation as it goes to infinity) represents the product of the probability from a Gamma and a Poisson distribution.
This is the tentative implementation in Tensorflow:
import tensorflow as tf
import tensorflow_probability as tfp
from functools import partial
import numpy as np
def pois_gamma_compound_loss(y_true, y_pred, n_terms=3):
    """Return the mean negative log-likelihood of a compound Poisson-Gamma.

    The density has a point mass ``exp(-lambda)`` at ``z == 0`` and, for
    ``z > 0``, the sum over ``i >= 1`` of
    ``Poisson(i | lambda) * Gamma(z | i * alpha, beta)``, truncated after
    ``n_terms`` terms.

    Args:
        y_true: Observed values ``z``.  Assumed to hold one target per
            sample; it is flattened to shape ``(batch,)``.
        y_pred: Tensor whose columns 0, 1 and 2 hold ``lambda``, ``alpha``
            and ``beta``.  All three must be strictly positive.
        n_terms: Number of terms kept from the infinite sum (must be >= 1).
            The default of 3 keeps i = 1, 2, 3.

    Returns:
        Scalar tensor with the batch-averaged negative log-likelihood.
    """
    lambda_, alpha, beta = y_pred[:, 0], y_pred[:, 1], y_pred[:, 2]

    # Flatten the targets so they pair element-wise with the per-sample
    # parameters instead of broadcasting (batch, 1) against (batch,).
    z = tf.reshape(tf.cast(y_true, y_pred.dtype), [-1])
    is_zero = tf.equal(z, 0.0)
    # The Gamma density is evaluated at a harmless positive value wherever
    # z == 0; those entries are replaced by the point mass below, and this
    # keeps the discarded branch (and its gradient) finite.
    safe_z = tf.where(is_zero, tf.ones_like(z), z)

    poisson_distr = tfp.distributions.Poisson(rate=lambda_)
    log_terms = []
    for i in range(1, n_terms + 1):
        count = tf.zeros_like(z) + i
        gamma_distr = tfp.distributions.Gamma(
            concentration=alpha * count, rate=beta)
        log_terms.append(
            poisson_distr.log_prob(count) + gamma_distr.log_prob(safe_z))

    # Sum the truncated series in log space for numerical stability.
    log_positive = tf.reduce_logsumexp(tf.stack(log_terms, axis=0), axis=0)
    # z == 0 contributes log(exp(-lambda)) = -lambda.
    log_lik = tf.where(is_zero, -lambda_, log_positive)
    return -tf.reduce_mean(log_lik)
# Network mapping 39 input features to the three distribution parameters
# (lambda, alpha, beta) consumed by the custom loss.
_layers = tf.keras.layers
inputs = _layers.Input(shape=(39,))
# Each hidden layer feeds the next one; applying every layer to `inputs`
# would silently discard all but the last.
x = _layers.Dense(4, activation='relu', kernel_initializer='random_uniform')(inputs)
x = _layers.Dense(4, activation='relu', kernel_initializer='random_uniform')(x)
x = _layers.Dense(6, activation='relu', kernel_initializer='random_uniform')(x)
# softplus keeps each parameter strictly positive.  softmax over a single
# unit always evaluates to exactly 1, so it cannot learn anything.
lambda_ = _layers.Dense(1, activation="softplus", name="lambda", kernel_initializer='random_uniform')(x)
alpha = _layers.Dense(1, activation="softplus", name="alpha", kernel_initializer='random_uniform')(x)
beta = _layers.Dense(1, activation="softplus", name="beta", kernel_initializer='random_uniform')(x)
output_params = _layers.Concatenate(name="pvec", axis=1)([lambda_, alpha, beta])
model = tf.keras.models.Model(inputs, output_params)
model.compile(loss=pois_gamma_compound_loss, optimizer='adam')
# NOTE(review): the head of the fit call was lost in the source; `x_train`
# is the presumed name of the feature matrix -- confirm.
model.fit(x_train, y_train, epochs=60, batch_size=20)


How to create this custom ANN using tensorflow?

I am trying to create this custom ANN using TensorFlow. Here is an image of the toy network and my code.
import tensorflow as tf
import numpy as np
# `in` is a reserved word in Python, so the input vector is named `x`.
x = np.array([1, 2, 3, 4], dtype="float32")
y_true = np.array([10, 11], dtype="float32")
# w is the vector of weights
# y_pred = np.array([x[0]*w[0] + x[1]*w[0], x[2]*w[1] + x[3]*w[1]])
# y_pred1 = 1 / (1 + tf.math.exp(-y_pred))  # sigmoid activation function
def loss_fun(y_true, y_pred1):
    """Return the sum of squared errors between prediction and target.

    Args:
        y_true: Target values.
        y_pred1: Predicted values.

    Returns:
        Scalar tensor ``sum((y_pred1 - y_true) ** 2)``.
    """
    loss1 = tf.reduce_sum(tf.pow(y_pred1 - y_true, 2))
    # Without this return the function yields None and cannot serve as a loss.
    return loss1
# model.compile(loss=loss_fun, optimizer='adam', metrics=['accuracy'])
The output of this network feeds into another ANN to its right, and I know how to handle that part, but I don't know how to create the connections, update w and y_pred, and compile the model. Any help?
Something like this ought to work
import tensorflow as tf
import numpy as np
def y_pred(x, w):
    """Compute the two outputs of the toy network.

    Inputs 0 and 1 are both scaled by ``w[0]``; inputs 2 and 3 are both
    scaled by ``w[1]``.

    Args:
        x: Indexable holding four input values.
        w: Indexable holding two weights.

    Returns:
        A two-element list ``[first_output, second_output]``.
    """
    first_output = x[0] * w[0] + x[1] * w[0]
    second_output = x[2] * w[1] + x[3] * w[1]
    return [first_output, second_output]
def loss_fun(y_true, y_pred):
    """Return the sum of squared differences between ``y_pred`` and ``y_true``."""
    residual = y_pred - y_true
    squared = tf.pow(residual, 2)
    return tf.reduce_sum(squared)
# Training data for the toy network: four inputs, two targets.
x = np.array([1, 2, 3, 4], dtype="float32")
y_true = np.array([10, 11], dtype="float32")

# Two trainable weights, drawn from a standard normal.
w = tf.Variable(
    initial_value=np.random.normal(size=(2)),
    name='weights',
    dtype=tf.float32,
)

xt = tf.convert_to_tensor(x)
yt = tf.convert_to_tensor(y_true)

sgd_opt = tf.optimizers.SGD()
training_steps = 100
display_steps = 10

for step in range(training_steps):
    # Record the forward pass so the loss can be differentiated w.r.t. w.
    with tf.GradientTape() as tape:
        yp = y_pred(xt, w)
        loss = loss_fun(yt, yp)

    dl_dw = tape.gradient(loss, w)
    sgd_opt.apply_gradients(zip([dl_dw], [w]))

    # Report progress every `display_steps` iterations.
    if not step % display_steps:
        print(loss, w)

Batch normalization in Keras, PyTorch and NumPy gives different results

I created the BatchNormalization layer in Keras, PyTorch and calculated the same operation using Numpy but I get three different results. Am I making some error here?
Things I assume below: layer.get_weights() in tf.keras for BN layer returns in order gamma, beta, running_mean, running_var. For the BN operation I am using the following operation: gamma * (x - running_mean) / sqrt(running_var + epsilon) + beta
Code snippet to reproduce the issue:
import torch
import tensorflow
from torch.nn import Module, BatchNorm1d, Conv1d
from torch.nn.functional import pad
import numpy as np
from tensorflow.keras.layers import Conv1D, BatchNormalization, Input
from tensorflow.keras.models import Model
torch.backends.cudnn.deterministic = True

# Keras reference model: Conv1D (no bias) followed by BatchNormalization.
z = Input((1024, 8), dtype=np.float32)
inp = z
z = Conv1D(64, 16, padding='same', use_bias=False)(z)
z = BatchNormalization(epsilon=0.001)(z)
keras_model = Model(inp, z)

# in order: conv-layer weight, gamma, beta, running_mean, running_var
weights = [
    np.random.random((16, 8, 64)),
    np.random.random((64,)),
    np.random.random((64,)),
    np.random.random((64,)),
    # Fifth entry (running_var): the NumPy check below reads weights[4].
    np.random.random((64,)),
]
weights = [np.array(x, dtype=np.float32) for x in weights]
# NOTE(review): the source never loads `weights` into the Keras model; this
# call is presumed to have been lost, since the comparison needs it.
keras_model.set_weights(weights)

# Sub-model exposing the convolution output (before batch normalization).
keras_model_subpart = Model(keras_model.inputs, keras_model.layers[1].output)
class TorchModel(Module):
    """Conv1d (no bias) followed by BatchNorm1d, mirroring the Keras model."""

    def __init__(self):
        super().__init__()
        self.l1 = Conv1d(8, 64, 16, bias=False)
        self.l2 = BatchNorm1d(64, 0.001)

    def forward(self, x):
        """Return ``(conv_output, batchnorm_output)`` for input ``x``.

        The input is padded by 7 on the left and 8 on the right of its last
        dimension before the 16-wide convolution, so the length is preserved.
        """
        padded = pad(x, (7, 8))
        conv_out = self.l1(padded)
        bn_out = self.l2(conv_out)
        return conv_out, bn_out
torch_model = TorchModel().to(torch.device('cpu'))
# NOTE(review): the assignment targets were lost in the source; they are
# reconstructed from the weight order (conv weight, gamma, beta).
# Keras stores the conv kernel as (width, in, out); transposing gives
# PyTorch's (out, in, width).
torch_model.l1.weight.data = torch.from_numpy(weights[0].T).float()
torch_model.l2.weight.data = torch.from_numpy(weights[1].T).float()
torch_model.l2.bias.data = torch.from_numpy(weights[2]).float()
torch_model.l2.running_mean = torch.from_numpy(weights[3]).float()
torch_model.l2.running_var = torch.from_numpy(weights[4]).float()
# Inference mode: BatchNorm1d must use the running statistics set above
# rather than the statistics of the current batch.
torch_model.eval()

input_value = np.array(np.random.random((1024, 8)), dtype=np.float32)
keras_results = [np.array(keras_model_subpart.predict(input_value[np.newaxis, :, :])),
                 np.array(keras_model.predict(input_value[np.newaxis, :, :]))]
with torch.no_grad():
    # PyTorch expects (batch, channels, length), hence the transpose.
    torch_results = [x.detach().numpy() for x in torch_model(torch.from_numpy(input_value.T[np.newaxis, :, :]).float())]
keras_results = [np.squeeze(x) for x in keras_results]
torch_results = [np.squeeze(x) for x in torch_results]
# Manual batch norm: gamma * (x - running_mean) / sqrt(running_var + eps) + beta
numpy_results = weights[1] * (keras_results[0] - weights[3]) / np.sqrt(weights[4] + 0.001) + weights[2]
print(torch.__version__, tensorflow.__version__, np.__version__, sep=",")
print('\tLayer 1 difference:', np.mean(np.abs(keras_results[0] - torch_results[0].T).flatten()))
print('\tLayer 2 difference:', np.mean(np.abs(keras_results[1] - torch_results[1].T).flatten()))
print('\tLayer 2 keras - numpy:', np.mean(np.abs(keras_results[1] - numpy_results).flatten()))
print('\tLayer 2 torch - numpy:', np.mean(np.abs(torch_results[1] - numpy_results.T).flatten()))
The output I get (after all the initialization printing of tensorflow)
Layer 1 difference: 0.0
Layer 2 difference: 6.8671216e-07
Layer 2 keras - numpy: 2.291581e-06
Layer 2 torch - numpy: 1.8929532e-06

Different inference results between Python and C++ (OpenCV Mat)

I'm building a re-identification network trained with a triplet-loss function, and up to that point everything is fine. The network works fine in Python: I implemented it in Keras with TensorFlow as the backend, then converted the .hd5 file to a .pb file to run inference in TensorFlow C++. The problem is that, with the same images, the results differ between Python and C++, and I don't know why. Can anyone help me?
Here is the model in Python:
import keras
import keras.applications
import keras.layers as layer
import tensorflow as tf
from keras import backend as K
from keras.backend.tensorflow_backend import set_session
from keras.models import Model as md
# TensorFlow session options: grow GPU memory on demand and log which
# device each op is placed on.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.log_device_placement = True
sess = tf.Session(config=config)
# Register the session with Keras; otherwise Keras creates its own default
# session and the options above have no effect.
set_session(sess)
class Model:
    """Triplet-loss re-identification network built on a shared CNN embedding.

    Attributes:
        shape: Input image shape passed to the Keras ``Input`` layers.
        params: Hyper-parameters of the embedding network.
        feature_model: CNN mapping one image to an L2-normalised embedding.
        triplet_model: Compiled three-input model that outputs the stacked
            pairwise distances.
    """

    def __init__(self, shape):
        """Build the embedding network and the triplet model for ``shape``."""
        self.shape = shape
        self.params = {
            # NOTE(review): 'optimizer' is not read anywhere in this class;
            # create_model always compiles with Adam.
            'optimizer': 'sgd',
            'first_neuron': 12,
            'first_max_pooling': 2,
            'second_neuron': 12,
            'second_max_pooling': 2,
            'third_neuron': 20,
            'third_max_pooling': 3,
            'dense_neuron': 64,
            'final_neuron': 128,
        }
        self.feature_model = self.create_features_model()
        self.triplet_model = self.create_model()

    def create_features_model(self):
        """Return the CNN that maps an image to an L2-normalised embedding."""
        # Define the vision modules
        img_input = layer.Input(shape=(self.shape))
        x = layer.Conv2D(self.params['first_neuron'], (3, 3), activation='relu')(img_input)
        x = layer.MaxPooling2D((self.params['first_max_pooling'], self.params['first_max_pooling']))(x)
        x = layer.Conv2D(self.params['second_neuron'], (3, 3), activation='relu')(x)
        x = layer.MaxPooling2D((self.params['second_max_pooling'], self.params['second_max_pooling']))(x)
        x = layer.Conv2D(self.params['third_neuron'], (3, 3), activation='relu')(x)
        x = layer.MaxPooling2D((self.params['third_max_pooling'], self.params['third_max_pooling']))(x)
        x = layer.Flatten()(x)
        x = layer.Dense(self.params['dense_neuron'], activation='relu')(x)
        x = layer.Dense(self.params['final_neuron'], activation='relu')(x)
        out = layer.Lambda(lambda x: K.l2_normalize(x, axis=1), name='t_emb_1_lnorm')(x)
        features_model = md(img_input, out)
        return features_model

    def create_model(self):
        """Return the compiled triplet model sharing ``feature_model`` weights.

        The model takes (target, positive, negative) images and outputs the
        three pairwise embedding distances stacked along axis 1.
        """
        base_model = self.feature_model
        # triplet framework, shared weights
        input_shape = (self.shape)
        input_target = layer.Input(shape=input_shape, name='input_target')
        input_positive = layer.Input(shape=input_shape, name='input_pos')
        input_negative = layer.Input(shape=input_shape, name='input_neg')
        net_target = base_model(input_target)
        net_positive = base_model(input_positive)
        net_negative = base_model(input_negative)
        # Each Lambda layer applies the Euclidean distance to a pair of embeddings.
        positive_distance = layer.Lambda(self.euclidean_distance, name='pos_dist')([net_target, net_positive])
        negative_distance = layer.Lambda(self.euclidean_distance, name='neg_dist')([net_target, net_negative])
        diference = layer.Lambda(self.euclidean_distance, name='dif')([net_positive, net_negative])
        # This Lambda layer stacks the outputs so all distances are available to the objective
        distances = layer.Lambda(lambda vects: K.stack(vects, axis=1), name='distance')(
            [positive_distance, negative_distance, diference])
        model = md([input_target, input_positive, input_negative], distances, name='result')
        # Setting up optimizer designed for variable learning rate
        model.compile(optimizer=keras.optimizers.Adam(lr=0.001, decay=0.00002),
                      loss=self.triplet_loss, metrics=[self.accuracy])
        return model

    def triplet_loss(self, _, y_pred):
        """Return the mean hinge loss over the stacked distances.

        ``y_pred[:, 0, 0]`` is the target-positive distance, ``[:, 1, 0]``
        the target-negative distance and ``[:, 2, 0]`` the positive-negative
        distance.  The labels argument is ignored.
        """
        margin = K.constant(0.5)
        return K.mean(K.maximum(K.constant(0), K.square(y_pred[:, 0, 0]) - 0.5 * (
            K.square(y_pred[:, 1, 0]) + K.square(y_pred[:, 2, 0])) + margin))

    def accuracy(self, _, y_pred):
        """Return the fraction of triplets whose positive distance is smaller
        than the negative distance."""
        return K.mean(y_pred[:, 0, 0] < y_pred[:, 1, 0])

    def lnorm(self, x):
        """Return ``x`` L2-normalised along its last axis."""
        return K.l2_normalize(x, axis=-1)

    def euclidean_distance(self, vects):
        """Return the Euclidean distance between the two tensors in ``vects``.

        The squared distance is clamped to ``K.epsilon()`` before the square
        root is taken.
        """
        x, y = vects
        return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))
This is how I run inference in Python:
from model import Model as model
from keras.utils import HDF5Matrix
import numpy as np
import cv2
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf
# TensorFlow session options: grow GPU memory on demand and log which
# device each op is placed on.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.log_device_placement = True
sess = tf.Session(config=config)
# Register the session with Keras; otherwise Keras creates its own default
# session and the options above have no effect.
set_session(sess)
def load_datasets(in_h5_path, partition='train'):
    """Open the triplet datasets stored in an HDF5 file.

    Args:
        in_h5_path: Path to the HDF5 file.
        partition: Which partition to load.  Only ``'train'`` is handled.

    Returns:
        Tuple ``(target, positive, negative)`` of ``HDF5Matrix`` objects
        backed by the "targets", "positives" and "negatives" datasets.

    Raises:
        ValueError: If ``partition`` is not a handled value.
    """
    if partition == 'train':
        target = HDF5Matrix(datapath=in_h5_path, dataset="targets")
        positive = HDF5Matrix(datapath=in_h5_path, dataset="positives")
        negative = HDF5Matrix(datapath=in_h5_path, dataset="negatives")
        return target, positive, negative
    # Fail loudly: printing and returning None would only defer the error to
    # the caller's tuple unpacking.
    # NOTE(review): the message lists 'test' as valid, but no 'test' branch
    # is visible here -- confirm whether one is missing.
    raise ValueError("Invalid 'partition' parameter: Valid values: ['train', 'test']")
# Load the two images to compare.
tar = cv2.imread("/home/amejia/PycharmProjects/triplet_loss/tra1.png")
nega = cv2.imread("/home/amejia/PycharmProjects/triplet_loss/dec1.png")
# Resize both to the 32x32 network input using bicubic interpolation.
tar = cv2.resize(tar, (32, 32), interpolation=cv2.INTER_CUBIC)
nega = cv2.resize(nega, (32, 32), interpolation=cv2.INTER_CUBIC)
# Add a leading batch dimension of 1.
t1 = np.array(tar).reshape((1, 32, 32, 3))
t2 = np.array(nega).reshape((1, 32, 32, 3))
# NOTE(review): these three datasets are not used by the prediction below.
target, positive, negative = load_datasets('/home/amejia/PycharmProjects/lossDatasetGenerator/test/test32.h5')
net = model((32, 32, 3))
# Inputs in order (target, positive, negative): the first image is fed as
# both target and negative, the second image as positive.
enter = [t1, t2, t1]
a = net.triplet_model.predict(x=enter, batch_size=1)
The inference in C++:
In C++, this is how I run inference:
// Allocate float input tensors of shape {1, image_size, image_size, 3}.
// NOTE(review): each constructor call has one more ')' than '(' -- the
// opening of the shape argument appears to have been lost; confirm.
tensorflow::Tensor target(tensorflow::DT_FLOAT,
{1, image_size, image_size, 3}));
tensorflow::Tensor positive(tensorflow::DT_FLOAT,
{1, image_size, image_size, 3}));
// Fill both tensors from the track and detection images.
img_to_float2(tracks, detections, target, positive, frame);
// Feed the graph inputs by name; the `target` tensor is fed to both
// "input_target:0" and "input_neg:0".
std::vector<std::pair<std::string, tensorflow::Tensor>> Input = {{"input_target:0", target},
{"input_pos:0", positive},
{"input_neg:0", target}};
std::vector<tensorflow::Tensor> Outputs;
// NOTE(review): `Status` is not checked in this snippet before Outputs[0]
// is read.
tensorflow::Status Status = session->Run(Input, {"distance/stack:0"}, {}, &Outputs);
auto data = Outputs[0].flat<float>();
std::cout << Outputs[0].DebugString() << std::endl;
And this is the function that fills the input tensors:
// Resizes the detection crop and the track image to
// FEATURES_IMG_SIZE x FEATURES_IMG_SIZE and converts them to float into the
// buffers of detectionsTensor and tracksTensor respectively.
// NOTE(review): both resize(...) calls and the closing brace of the function
// are truncated in this source; the interpolation argument is not visible.
void LossModel::img_to_float2(Track &tracks, Detection &detections, tensorflow::Tensor &tracksTensor,
tensorflow::Tensor &detectionsTensor, cv::Mat &frame) {
// Raw float pointers into the tensors' storage.
auto *tar = tracksTensor.flat<float>().data();
auto *dec = detectionsTensor.flat<float>().data();
// Crop the detection box out of the frame (deep copy).
cv::Mat detectionImg = frame(detections.getBox()).clone();
resize(detectionImg, detectionImg, cv::Size(FEATURES_IMG_SIZE, FEATURES_IMG_SIZE), 0, 0,
// Mat header constructed over the tensor buffer `dec`; presumably convertTo
// then writes the float pixels straight into the tensor -- confirm.
cv::Mat resizedImage(FEATURES_IMG_SIZE, FEATURES_IMG_SIZE, CV_32FC3, dec);
detectionImg.convertTo(resizedImage, CV_32FC3);
// Same procedure for the track image, written through `tar`.
cv::Mat trackImg = tracks.get_img().clone();
resize(trackImg, trackImg, cv::Size(FEATURES_IMG_SIZE, FEATURES_IMG_SIZE), 0, 0,
cv::Mat resizedImage2(FEATURES_IMG_SIZE, FEATURES_IMG_SIZE, CV_32FC3, tar);
trackImg.convertTo(resizedImage2, CV_32FC3);

Keras Model using Tensorflow Distribution for loss fails with batch size > 1

I'm trying to use a distribution from tensorflow_probability to define a custom loss function in Keras. More specifically, I'm trying to build a Mixture Density Network.
My model works on a toy dataset when batch_size = 1 (it learns to predict the correct mixture distribution for y using x). But it "fails" when batch_size > 1 (it predicts the same distribution for all y, ignoring x). This makes me think my problem has to do with batch_shape vs. sample_shape.
To reproduce:
import random
import keras
from keras import backend as K
from keras.layers import Dense, Activation, LSTM, Input, Concatenate, Reshape, concatenate, Flatten, Lambda
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.models import Sequential, Model
import tensorflow
import tensorflow_probability as tfp
tfd = tfp.distributions
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# generate toy dataset
n_obs = 20000
x = np.random.uniform(size=(n_obs, 4))
df = pd.DataFrame(x, columns=['x_{0}'.format(i) for i in np.arange(4)])

# 2 latent classes, with noisy assignment based on x_0, x_1 (x_2 and x_3 are noise)
df['latent_class'] = 0
df.loc[df.x_0 + df.x_1 + np.random.normal(scale=.05, size=n_obs) > 1, 'latent_class'] = 1

# Latent class determines which mixture distribution we draw from
d0 = tfd.MixtureSameFamily(
    mixture_distribution=tfd.Categorical(probs=[0.3, 0.7]),
    components_distribution=tfd.Normal(
        loc=[-1., 1], scale=[0.1, 0.5]))
d0_samples = d0.sample(sample_shape=(df.latent_class == 0).sum()).numpy()
d1 = tfd.MixtureSameFamily(
    mixture_distribution=tfd.Categorical(probs=[0.5, 0.5]),
    components_distribution=tfd.Normal(
        loc=[-2., 2], scale=[0.2, 0.6]))
d1_samples = d1.sample(sample_shape=(df.latent_class == 1).sum()).numpy()
df.loc[df.latent_class == 0, 'y'] = d0_samples
df.loc[df.latent_class == 1, 'y'] = d1_samples

# Overlay the empirical density of y for each latent class.
fig, ax = plt.subplots()
bins = np.linspace(-4, 5, 9*4 + 1)
df.y[df.latent_class == 0].hist(ax=ax, bins=bins, label='Class 0', alpha=.4, density=True)
df.y[df.latent_class == 1].hist(ax=ax, bins=bins, label='Class 1', alpha=.4, density=True)
# The labels above are only displayed once a legend is drawn.
ax.legend()
# mixture density network
N_COMPONENTS = 2  # number of components in the mixture
input_feature_space = 4
# NOTE(review): every layer call below was truncated in the source; the
# wiring (input -> Dense(6) -> Dense(6) -> Dense(N_COMPONENTS*3)) is
# reconstructed and any lost keyword arguments (e.g. layer names) are omitted.
# `shape` alone already implies a batch shape of (None, input_feature_space).
flat_input = Input(shape=(input_feature_space,))
x = Dense(6, activation='relu')(flat_input)
x = Dense(6, activation='relu')(x)
# 3 params per component: weight, loc, scale
output = Dense(N_COMPONENTS*3)(x)
model = Model(inputs=[flat_input],
              outputs=[output])
I suspect the problem is in the next 3 functions:
def get_mixture_coef(output, num_components):
    """Extract mixture params from output.

    Args:
        output: 2-D tensor whose columns are laid out as
            ``[pi logits | log-sigma | mu]``, ``num_components`` columns each.
        num_components: Number of mixture components.

    Returns:
        Tuple ``(out_pi, out_sigma, out_mu)``: mixture weights normalised
        per row, positive scales, and unconstrained locations.
    """
    out_pi = output[:, :num_components]
    out_sigma = output[:, num_components:2*num_components]
    out_mu = output[:, 2*num_components:]
    # use softmax to normalize pi into prob distribution; the row maximum is
    # subtracted first so the exponentials cannot overflow
    max_pi = K.max(out_pi, axis=1, keepdims=True)
    out_pi = out_pi - max_pi
    out_pi = K.exp(out_pi)
    normalize_pi = 1 / K.sum(out_pi, axis=1, keepdims=True)
    out_pi = normalize_pi * out_pi
    # use exp to ensure sigma is pos
    out_sigma = K.exp(out_sigma)
    return out_pi, out_sigma, out_mu
def get_lossfunc(out_pi, out_sigma, out_mu, y):
    """Return the per-sample negative log-likelihood of ``y`` under the mixture.

    Args:
        out_pi: Mixture weights, one row per sample.
        out_sigma: Component scales, one row per sample.
        out_mu: Component locations, one row per sample.
        y: Observed targets.
    """
    d0 = tfd.MixtureSameFamily(
        mixture_distribution=tfd.Categorical(probs=out_pi),
        components_distribution=tfd.Normal(
            loc=out_mu, scale=out_sigma,
        ),
    )
    # I suspect the problem is here
    # NOTE(review): the fit call passes y with a trailing axis of size 1, so
    # presumably log_prob broadcasts (batch, 1) against the distribution's
    # (batch,) batch shape and every target is scored against every row.
    return -1 * d0.log_prob(y)
def mdn_loss(num_components):
    """Build a Keras-compatible loss for a mixture with ``num_components``.

    Returns:
        A function ``loss(y_true, y_pred)`` that splits ``y_pred`` into
        mixture parameters and scores ``y_true`` under them.
    """
    def loss(y_true, y_pred):
        pi, sigma, mu = get_mixture_coef(y_pred, num_components)
        negative_log_lik = get_lossfunc(pi, sigma, mu, y_true)
        return negative_log_lik

    return loss
# NOTE(review): the `model.compile(` and `model.fit(` lines were lost in the
# source; the calls are reconstructed around the surviving keyword arguments,
# and any lost keyword arguments are left at their defaults.
opt = Adam(lr=.001)
model.compile(
    loss=mdn_loss(N_COMPONENTS),
    optimizer=opt,
)
es = EarlyStopping(monitor='val_loss',
                   verbose=1, mode='auto')

# Hold out a random 15% of the rows for validation.
validation = .15
validate_idx = np.random.choice(df.index.values,
                                size=int(validation * df.shape[0]),
                                replace=False)
# Set membership is O(1); testing against the array would be O(n) per row.
validate_set = set(validate_idx)
train_idx = [i for i in df.index.values if i not in validate_set]
x_cols = ['x_0', 'x_1', 'x_2', 'x_3']

model.fit(
    x=df.loc[train_idx, x_cols].values,
    y=df.loc[train_idx, 'y'].values[:, np.newaxis],
    validation_data=(
        df.loc[validate_idx, x_cols].values,
        df.loc[validate_idx, 'y'].values[:, np.newaxis]),
    # model works when batch_size = 1
    # model fails when batch_size > 1
    epochs=2, batch_size=1, verbose=1, callbacks=[es])
def sample(output, n_samples, num_components):
    """Sample from a mixture distribution parameterized by
    model output.

    Args:
        output: Raw network output, one row per mixture to sample from.
        n_samples: Number of draws, passed as ``sample_shape``.
        num_components: Number of mixture components encoded in ``output``.

    Returns:
        NumPy array of samples.
    """
    pi, sigma, mu = get_mixture_coef(output, num_components)
    d0 = tfd.MixtureSameFamily(
        mixture_distribution=tfd.Categorical(probs=pi),
        components_distribution=tfd.Normal(loc=mu, scale=sigma))
    return d0.sample(sample_shape=n_samples).numpy()
# Predict mixture parameters for every training row.
yhat = model.predict(df.loc[train_idx, x_cols].values)
# NOTE(review): these three values are not used by the code below.
out_pi, out_sigma, out_mu = get_mixture_coef(yhat, 2)
# Draw 1000 samples from the mixture predicted for the first training row only.
latent_1_samples = sample(yhat[:1], n_samples=1000, num_components=2)
latent_1_samples = pd.DataFrame({'latent_1_samples': latent_1_samples.ravel()})
# Compare the sampled density against the true per-class densities of y.
fig, ax = plt.subplots()
bins = np.linspace(-4, 5, 9*4 + 1)
latent_1_samples.latent_1_samples.hist(ax=ax, bins=bins, label='Class 1: yHat', alpha=.4, density=True)
df.y[df.latent_class == 0].hist(ax=ax, bins=bins, label='Class 0: True', density=True, histtype='step')
df.y[df.latent_class == 1].hist(ax=ax, bins=bins, label='Class 1: True', density=True, histtype='step')
Thanks in advance!
I found two ways to solve the problem, guided by this answer. Both solutions point to the fact that Keras is awkwardly broadcasting y to match y_pred:
def get_lossfunc(out_pi, out_sigma, out_mu, y):
    """Return the per-sample negative log-likelihood of ``y`` under the mixture.

    ``y`` is indexed as ``y[:, 0]`` so that each target is scored only
    against its own row of mixture parameters.
    """
    d0 = tfd.MixtureSameFamily(
        mixture_distribution=tfd.Categorical(probs=out_pi),
        components_distribution=tfd.Normal(
            loc=out_mu, scale=out_sigma,
        ),
    )
    # this also works:
    # return -1 * d0.log_prob(tensorflow.transpose(y))
    return -1 * d0.log_prob(y[:, 0])
Specifying the workaround here (Answer Section) even though it is specified by Dan in the question, for the benefit of the Community.
The problem of predicting the same distribution for all y, ignoring x can be resolved in two ways.
Code for Solution 1 is mentioned below:
def get_lossfunc(out_pi, out_sigma, out_mu, y):
    """Return the negative log-likelihood of ``y`` under the mixture.

    Solution 1: ``y`` is transposed before it is passed to ``log_prob``.
    """
    d0 = tfd.MixtureSameFamily(
        mixture_distribution=tfd.Categorical(probs=out_pi),
        components_distribution=tfd.Normal(
            loc=out_mu, scale=out_sigma,
        ),
    )
    return -1 * d0.log_prob(tensorflow.transpose(y))
Code for Solution 2 is mentioned below:
def get_lossfunc(out_pi, out_sigma, out_mu, y):
    """Return the negative log-likelihood of ``y`` under the mixture.

    Solution 2: only the first column of ``y`` is passed to ``log_prob``.
    """
    d0 = tfd.MixtureSameFamily(
        mixture_distribution=tfd.Categorical(probs=out_pi),
        components_distribution=tfd.Normal(
            loc=out_mu, scale=out_sigma,
        ),
    )
    return -1 * d0.log_prob(y[:, 0])
Hope this helps. Happy Learning!

"keras.backend.variable" is not behaving correctly in keras as opposed to tensorflow

I want to define a trainable scalar in my models. In TensorFlow, this is done using tf.Variable. In Keras, keras.backend.variable is supposed to behave the same way. However, when I use it, Keras does not change the variable during the optimization process. Does anyone know why?
To test, please uncomment RUN_ON = "tensorflow" or RUN_ON = "keras" to run on either of engines.
import numpy as np
import keras as k
import tensorflow as tf
import matplotlib.pyplot as plt
# Uncomment exactly one of the next two lines to choose the engine.
# RUN_ON = "tensorflow"
# RUN_ON = "keras"

# Ground truth: y = w_true * x + b_true on 1000 points in [0, 1].
b_true = 3.0
w_true = 5.0
x_true = np.linspace(0.0, 1.0, 1000).reshape(-1, 1)
y_true = x_true * w_true + b_true
ids = np.arange(0, x_true.shape[0])

# NOTE(review): the `model.fit(`, `sess.run(`, `else:` and `break` tokens
# were lost in the source and are reconstructed here.
if RUN_ON == "keras":
    x = k.Input((1,), dtype="float32", name="x")
    Fx = k.layers.Dense(1, use_bias=False, name="Fx")(x)
    # NOTE(review): `b` is only captured by the Lambda's closure; presumably
    # Keras does not register it as a trainable weight of the model, which
    # would explain why it is never updated -- confirm.
    b = k.backend.variable(1.0, name="b")
    y = k.layers.Lambda(lambda x: x+b, name="Add")(Fx)
    model = k.Model(inputs=[x], outputs=[y])
    model.compile("adam", loss="mse")
    # model.summary()
    model.fit([x_true], [y_true], epochs=100000, batch_size=1000)
    y_pred = model.predict(x_true)
elif RUN_ON == "tensorflow":
    x = tf.placeholder("float32", shape=[None, 1], name="x")
    Fx = tf.layers.Dense(1, use_bias=False, name="Fx")(x)
    b = tf.Variable(1.0, name="b")
    y = Fx + b
    yp = tf.placeholder("float32", shape=[None, 1], name="y")
    loss = tf.reduce_mean(tf.square(yp - y))
    opt = tf.train.AdamOptimizer(0.001).minimize(loss)
    with tf.Session() as sess:
        # Variables must be initialised before the first training step.
        sess.run(tf.global_variables_initializer())
        for i in range(100000):
            opt_out, loss_val, b_val = sess.run(
                [opt, loss, b], feed_dict={x: x_true[ids], yp: y_true[ids]})
            print("epoch={:d} loss={:e} b_val={:f}".format(i, loss_val, b_val))
            # Stop early once the fit is essentially exact.
            if loss_val < 1.0e-9:
                break
        y_pred = sess.run([y], feed_dict={x: x_true, yp: y_true})[0]
else:
    raise ValueError('`RUN_ON` should be either `keras` or `tensorflow`.')

# Dashed blue: ground truth; red: model prediction.
plt.plot(x_true, y_true, '--b', linewidth=4)
plt.plot(x_true, y_pred, 'r')