Bayesian Non-Parametric Evolutionary by precise Gradients in the Acquisition Function

The graph compiles, but the objective function outputs NaN values (even though the input is randomly generated data).
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
population_size = 40
initial_population = (tf.random.normal([population_size]),
pi = tf.constant(3.14159)
t_init, t0, t1 = 0., 0.5, 1.
def ode_fn(t, x):
    """Evaluate -cos(x) * cos(y) * exp(-(x - pi)**2 - (y - pi)**2).

    The expression has the form of the Easom test function.

    NOTE(review): both arguments are ignored. ``t`` is never read and ``x``
    is immediately overwritten by unpacking the module-level
    ``initial_population``, so the returned value does not depend on the
    solver state. Confirm whether that is intended.
    """
    x, y = initial_population
    return -(tf.math.cos(x) * tf.math.cos(y) *
             tf.math.exp(-(x - pi)**2 - (y - pi)**2))
def gradients(x):
results = tfp.math.ode.BDF().solve(ode_fn, t_init, initial_population[0],
solution_times=[t0, t1])
# The objective function and the gradient.
optim_results = tfp.optimizer.differential_evolution_minimize(
objective_value = optim_results[4]
DirSampleNoise = tfd.Dirichlet([tf.math.reduce_mean(objective_value), tf.math.reduce_std(objective_value)])
# Check that the argmin is close to the actual value.
# Print out the total number of function evaluations it took. Should be 5.
Current, developments - can be found in this link -


Why does the tanh function return different results in TensorFlow and PyTorch?

I find that the tanh results of TensorFlow and PyTorch are different, and I want to know why this happens.
I know that the difference is very small, so is this acceptable?
import numpy as np
import tensorflow as tf
import torch
# Build one random float32 input in [0, 10) and feed the same values to both
# frameworks so any difference comes from the tanh implementations alone.
batch, sentence_length, embedding_dim = 20, 5, 10
value = np.random.random((batch, sentence_length, embedding_dim)).astype("f")
value = value * 10
tf_x = tf.constant(value, dtype=tf.float32)
tf_out = tf.math.tanh(tf_x)
pt_x = torch.from_numpy(value)
pt_out = torch.tanh(pt_x)
print((tf_out.numpy() == pt_out.numpy()).all()) # return False
# Compare the magnitude of the difference: without abs() every negative
# deviation, however large, would satisfy "< 1e-6".
print((np.abs(tf_out.numpy() - pt_out.numpy()) < 1e-6).all()) # return True
tensorflow == 2.5.0
torch == 1.9.0
Running your code with the following line at the end:
print(np.allclose(tf_out.numpy(), pt_out.numpy())) # Returns True
You will receive True. I do not know exactly how TensorFlow and PyTorch compute the tanh operation, but when working with floating-point numbers, results are rarely exactly equal. However, you should be receiving equal results up to a certain tolerance, which is exactly what np.allclose() checks. Read more on allclose here

Error when using tensorflow HMC to marginalise GPR hyperparameters

I would like to use tensorflow (version 2) to use gaussian process regression
to fit some data and I found the google colab example online here [1].
I have turned some of this notebook into a minimal example that is below.
Sometimes the code fails with the following error when using MCMC to marginalize the hyperparameters, and I was wondering if anyone has seen this before or knows how to get around it:
tensorflow.python.framework.errors_impl.InvalidArgumentError: Input matrix is not invertible.
[[{{node mcmc_sample_chain/trace_scan/while/body/_168/smart_for_loop/while/body/_842/dual_averaging_step_size_adaptation___init__/_one_step/transformed_kernel_one_step/mh_one_step/hmc_kernel_one_step/leapfrog_integrate/while/body/_1244/leapfrog_integrate_one_step/maybe_call_fn_and_grads/value_and_gradients/value_and_gradient/gradients/leapfrog_integrate_one_step/maybe_call_fn_and_grads/value_and_gradients/value_and_gradient/PartitionedCall_grad/PartitionedCall/gradients/JointDistributionNamed/log_prob/JointDistributionNamed_log_prob_GaussianProcess/log_prob/JointDistributionNamed_log_prob_GaussianProcess/get_marginal_distribution/Cholesky_grad/MatrixTriangularSolve}}]] [Op:__inference_do_sampling_113645]
Function call stack:
Note that some of the code below is a bit redundant in some sections,
but it should be able to reproduce the error.
import time
import numpy as np
import tensorflow.compat.v2 as tf
import tensorflow_probability as tfp
tfb = tfp.bijectors
tfd = tfp.distributions
tfk = tfp.math.psd_kernels
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
#%pylab inline
# Configure plot defaults
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['grid.color'] = '#666666'
#%config InlineBackend.figure_format = 'png'
def sinusoid(x):
    """Return sin(3 * pi * x[..., 0]).

    Only index 0 of the last axis of ``x`` is used, so the result has one
    fewer dimension than ``x``.
    """
    return np.sin(3 * np.pi * x[..., 0])
def generate_1d_data(num_training_points, observation_noise_variance):
"""Generate noisy sinusoidal observations at a random set of points.
observation_index_points, observations
index_points_ = np.random.uniform(-1., 1., (num_training_points, 1))
index_points_ = index_points_.astype(np.float64)
# y = f(x) + noise
observations_ = (sinusoid(index_points_) +
return index_points_, observations_
# Generate training data with a known noise level (we'll later try to recover
# this value from the data).
observation_index_points_, observations_ = generate_1d_data(
def build_gp(amplitude, length_scale, observation_noise_variance):
"""Defines the conditional dist. of GP outputs, given kernel parameters."""
# Create the covariance kernel, which will be shared between the prior (which we
# use for maximum likelihood training) and the posterior (which we use for
# posterior predictive sampling)
kernel = tfk.ExponentiatedQuadratic(amplitude, length_scale)
# Create the GP prior distribution, which we will use to train the model
# parameters.
return tfd.GaussianProcess(
gp_joint_model = tfd.JointDistributionNamed({
'amplitude': tfd.LogNormal(loc=0., scale=np.float64(1.)),
'length_scale': tfd.LogNormal(loc=0., scale=np.float64(1.)),
'observation_noise_variance': tfd.LogNormal(loc=0., scale=np.float64(1.)),
'observations': build_gp,
x = gp_joint_model.sample()
lp = gp_joint_model.log_prob(x)
print("sampled {}".format(x))
print("log_prob of sample: {}".format(lp))
# Create the trainable model parameters, which we'll subsequently optimize.
# Note that we constrain them to be strictly positive.
constrain_positive = tfb.Shift(np.finfo(np.float64).tiny)(tfb.Exp())
amplitude_var = tfp.util.TransformedVariable(
length_scale_var = tfp.util.TransformedVariable(
observation_noise_variance_var = tfp.util.TransformedVariable(
trainable_variables = [v.trainable_variables[0] for v in
# Use `tf.function` to trace the loss for more efficient evaluation.
#tf.function(autograph=False, experimental_compile=False)
def target_log_prob(amplitude, length_scale, observation_noise_variance):
return gp_joint_model.log_prob({
'amplitude': amplitude,
'length_scale': length_scale,
'observation_noise_variance': observation_noise_variance,
'observations': observations_
# Now we optimize the model parameters.
num_iters = 1000
optimizer = tf.optimizers.Adam(learning_rate=.01)
# Store the likelihood values during training, so we can plot the progress
lls_ = np.zeros(num_iters, np.float64)
for i in range(num_iters):
with tf.GradientTape() as tape:
loss = -target_log_prob(amplitude_var, length_scale_var,
grads = tape.gradient(loss, trainable_variables)
optimizer.apply_gradients(zip(grads, trainable_variables))
lls_[i] = loss
print('Trained parameters:')
print('amplitude: {}'.format(amplitude_var._value().numpy()))
print('length_scale: {}'.format(length_scale_var._value().numpy()))
print('observation_noise_variance: {}'.format(observation_noise_variance_var._value().numpy()))
num_results = 100
num_burnin_steps = 50
sampler = tfp.mcmc.TransformedTransitionKernel(
step_size=tf.cast(0.1, tf.float64),
bijector=[constrain_positive, constrain_positive, constrain_positive])
adaptive_sampler = tfp.mcmc.DualAveragingStepSizeAdaptation(
num_adaptation_steps=int(0.8 * num_burnin_steps),
target_accept_prob=tf.cast(0.75, tf.float64))
initial_state = [tf.cast(x, tf.float64) for x in [1., 1., 1.]]
# Speed up sampling by tracing with `tf.function`.
#tf.function(autograph=False, experimental_compile=False)
def do_sampling():
return tfp.mcmc.sample_chain(
trace_fn=lambda current_state, kernel_results: kernel_results)
t0 = time.time()
samples, kernel_results = do_sampling()
t1 = time.time()
print("Inference ran in {:.2f}s.".format(t1-t0))
This can happen if you have multiple index points that are very close, so you might consider using np.linspace or just doing some post filtering of your random draw. I would also suggest a bit bigger epsilon, maybe 1e-6.

How to batch a transformed (scaled and quantized) Beta distribution in tensorflow probability

I'm trying to fit a beta distribution to the results of a survey with discrete scores (1, 2, 3, 4, 5).
For that to work I need a working log_prob of a Beta in TensorFlow probability. However, there is a problem with how batching is handled in Beta.
Here is a minimal example that gives me an error:
InvalidArgumentError: Shapes of a and x are inconsistent: [3] vs. [1000,1] [Op:Betainc]
The same code seems to work ok with Normal distribution...
What am I doing wrong here?
import numpy as np
import tensorflow_probability as tfp
tfd = tfp.distributions
#Generate fake data
data = np.random.beta(2.,2.,1000)*5.0
data = np.ceil(data)
data = data[:,None]
# Create a batch of three Beta distributions.
alpha = np.array([1., 2., 3.]).astype(np.float32)
beta = np.array([1., 2., 3.]).astype(np.float32)
bt = tfd.Beta(alpha, beta)
#bt = tfd.Normal(loc=alpha, scale=beta)
#Scale beta to 0-5
scbt = tfd.TransformedDistribution(
# quantize beta to (1,2,3,4,5)
qdist = tfd.QuantizedDistribution(distribution=scbt,low=1,high=5)
#calc log_prob for 3 distributions
TensorFlow 2.0.0
tensorflow_probability 0.8.0
As suggested by Chris Suter, here is a solution that broadcasts by hand:
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions
from matplotlib import pyplot as plt
#Generate fake data
numdata = 100
numbeta = 3
data = np.random.beta(2.,2.,numdata)
data *= 5.0
data = np.ceil(data)
data = data[:,None].astype(np.float32)
#alpha and beta [[1., 2., 3.]]
alpha = np.expand_dims(np.arange(1,4),0).astype(np.float32)
beta = np.expand_dims(np.arange(1,4),0).astype(np.float32)
#tile to compensate for betainc
alpha = tf.tile(alpha,[numdata,1])
beta = tf.tile(beta,[numdata,1])
data = tf.tile(data,[1,numbeta])
bt = tfd.Beta(concentration1=alpha, concentration0=beta)
scbt = tfd.TransformedDistribution(
# quantize beta to (1,2,3,4,5)
qdist = tfd.QuantizedDistribution(distribution=scbt,low=1,high=5)
#calc log_prob for numbeta number of distributions
EDIT2: The above solution does not work when I try to apply it in MCMC sampling.
The new code looks like this:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
from time import time
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions
import numpy as np
#Generate fake data
numdata = 100
data = np.random.beta(2.,2.,numdata)
data *= 5.0
data = np.ceil(data)
data = data[:,None].astype(np.float32)
def sample_chain():
#Parameters of MCMC
num_burnin_steps = 300
num_results = 200
num_chains = 50
step_size = 0.01
#data tensor
outcomes = tf.convert_to_tensor(data, dtype=tf.float32)
def modeldist(alpha,beta):
bt = tfd.Beta(concentration1=alpha, concentration0=beta)
scbt = tfd.TransformedDistribution(
# quantize beta to (1,2,3,4,5)
qdist = tfd.QuantizedDistribution(distribution=scbt,low=1,high=5)
return qdist
def joint_log_prob(con1,con0):
#manual broadcast
tcon1 = tf.tile(con1[None,:],[numdata,1])
tcon0 = tf.tile(con0[None,:],[numdata,1])
toutcomes = tf.tile(outcomes,[1,num_chains])
#model distribution with manual broadcast
dist = modeldist(tcon1,tcon0)
#joint log prob
return tf.reduce_sum(dist.log_prob(toutcomes),axis=0)
kernel = tfp.mcmc.HamiltonianMonteCarlo(
kernel = tfp.mcmc.SimpleStepSizeAdaptation(
inner_kernel=kernel, num_adaptation_steps=int(num_burnin_steps * 0.8))
init_state = [tf.identity(tf.random.uniform([num_chains])*10.0,name='init_alpha'),
samples, [step_size, is_accepted] = tfp.mcmc.sample_chain(
trace_fn=lambda _, pkr: [pkr.inner_results.accepted_results.step_size,
return samples
samples = sample_chain()
This ends up with an error message:
ValueError: Encountered None gradient. fn_arg_list: [tf.Tensor 'init_alpha:0' shape=(50,) dtype=float32, tf.Tensor 'init_beta:0' shape=(50,) dtype=float32] grads: [None, None]
Sadly tf.math.betainc doesn't support broadcasting at present, which causes the cdf computation, which QuantizedDistribution calls, to fail. If you must use Beta, the only workaround I can think of is to broadcast "manually" by tiling the data and Beta params.
Alternatively, you might be able to get away with using the Kumaraswamy distribution, which is similar to Beta but has some nicer analytical properties.

Using scipy.odr to fit curve

I'm trying to fit a set of data points via a fit function that depends on two variables; let's call these xdata and sdata. The problem is that my fitted curve is rather flat, whereas I want it to more or less "follow the points".
I've tried using scipy.odr to fit the curve. It works rather well, except that the curve is too flat:
import numpy as np
from math import pi
from math import sqrt
from math import log
from scipy import optimize
import scipy.optimize
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.odr import *
mudr=np.array([ 57.43708609, 46.26119205, 55.60688742, 33.21615894,
28.27072848, 22.54649007, 21.80662252, 11.21483444, 5.80211921])
#xdata points
dme=array([ 128662.54890776, 105265.32915726, 128652.56835434,
77968.67019573, 66273.56542068, 58464.58559543,
54570.66624991, 27286.90038703, 19480.92689266]) #xdata error
dmss22=np.array([ 4.90050000e+17, 4.90050000e+17, 4.90050000e+17,
4.90050000e+17, 4.90050000e+17, 4.90050000e+17,
4.90050000e+17, 4.90050000e+17, 4.90050000e+17]) #sdata points
dmse=np.array([ 1.09777592e+21, 1.11512117e+21, 1.13381702e+21,
1.15033267e+21, 1.14883089e+21, 1.27076265e+21,
1.22637165e+21, 1.19237598e+21, 1.64539205e+21]) # sdata error
F=np.array([ 115.01944248, 110.24354867, 112.77812389, 104.81830088,
104.35746903, 101.32016814, 100.54513274, 96.94226549,
93.00424779]) #ydata points
dF=np.array([ 72710.75386699, 72590.6256987 , 176539.40403673,
130555.27503081, 124299.52080164, 176426.64340597,
143013.52848306, 122117.93022746, 157547.78395513])#ydata error
def Ffitsso(p,X,B=2.58,Fc=92.2,mu=770,Za=0.9468): #fitfunction
temp1 = (2*B*X[0])/(4*pi*Fc)**2
temp2 = temp1*(afij[0]+afij[1]*np.log((2*B*X[0])/mu**2))
temp3 = temp1**2*(afij[2]+afij[3]*np.log((2*B*X[0])/mu**2)+\
temp4 = temp1**3*(afij[5]+afij[6]*np.log((2*B*X[0])/mu**2)+\
return Fc/Za*(1+p[0]*X[1])*(1+temp2+temp3+temp4)+p[1]
#fitting using scipy.odr
xtot=np.row_stack( (mudr, dmss22) )
etot=np.row_stack( (Ze, dmss22e) )
fitting = Model(Ffitsso)
mydata = RealData(xtot, F, sx=etot2, sy=dF)
myodr = ODR(mydata, fitting, beta0=[0, 100])
myoutput =
p[0]*X[1] in the fit function is supposed to be small compared to 1, but with the fit the value of p[0] is on the order of e-18 while the dmss22 values are on the order of e+17, so the product is not small enough.
Even worse, it's negative, meaning the function decreases, which is not supposed to happen; it's supposed to increase like the plotted data points.
Edit: I fixed it. I didn't know that the fit was so sensitive to the initial beta values; I set beta[0]=1.5*10**(-15) and it works!
Here is a graphical fitter with both curve_fit and ODR fitters using scipy's Differential Evolution (DE) genetic algorithm to supply initial parameter estimates for the non-linear solvers. The scipy implementation of DE uses the Latin Hypercube algorithm to ensure a thorough search of parameter space, and this requires parameter bounds within which to search - in this example, these bounds are taken from the data maximum and minimum values. Note that it is much easier to give bounds for the initial parameter estimates rather than individual specific values.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import scipy.odr
from scipy.optimize import differential_evolution
import warnings
xData = numpy.array([1.1, 2.2, 3.3, 4.4, 5.0, 6.6, 7.7, 0.0])
yData = numpy.array([1.1, 20.2, 30.3, 40.4, 50.0, 60.6, 70.7, 0.1])
def func(x, a, b, c, d, offset): # curve fitting function for curve_fit()
    """Evaluate ``a * exp(-(x - b)**2 / (2 * c**2) + d) + offset``.

    ``x`` comes first and the parameters follow, which is the argument order
    ``curve_fit`` expects of its model function.
    """
    return a*numpy.exp(-(x-b)**2/(2*c**2)+d) + offset
def func_wrapper_for_ODR(parameters, x): # parameter order for ODR
    """Adapt ``func`` to the ``(parameters, x)`` call order used by scipy.odr.

    ``parameters`` is unpacked positionally into ``func`` after ``x``.
    """
    return func(x, *parameters)
# function for genetic algorithm to minimize (sum of squared error)
def sumOfSquaredError(parameterTuple):
    """Return the sum of squared residuals of ``func`` on the module data.

    ``parameterTuple`` is unpacked into ``func`` after ``xData``; the result
    is ``sum((yData - func(xData, *parameterTuple)) ** 2)``.

    Side effect: installs a process-wide "ignore" warnings filter on every
    call, so warnings stay suppressed after this function returns.
    """
    warnings.filterwarnings("ignore") # do not print warnings by genetic algorithm
    val = func(xData, *parameterTuple)
    return numpy.sum((yData - val) ** 2.0)
def generate_Initial_Parameters():
    """Estimate starting parameters for ``func`` via differential evolution.

    Search bounds for each parameter are derived from the extrema of the
    module-level ``xData`` and ``yData``. Returns the ``x`` attribute of the
    optimizer result, i.e. the best parameter vector found, in the order
    ``(a, b, c, d, offset)``.
    """
    # min and max used for bounds
    maxX = max(xData)
    minX = min(xData)
    maxY = max(yData)
    minY = min(yData)

    parameterBounds = [
        [minY, maxY],  # search bounds for a
        [minX, maxX],  # search bounds for b
        [minX, maxX],  # search bounds for c
        [minY, maxY],  # search bounds for d
        [0.0, maxY],   # search bounds for Offset
    ]

    # a fixed seed makes the stochastic search repeatable
    result = differential_evolution(sumOfSquaredError, parameterBounds, seed=3)
    return result.x
# Starting estimates come from the differential-evolution search.
geneticParameters = generate_Initial_Parameters()
# curve_fit section
# The genetic estimates are passed as curve_fit's initial guess (4th positional argument).
fittedParameters_curvefit, pcov = curve_fit(func, xData, yData, geneticParameters)
print('Fitted parameters curve_fit:', fittedParameters_curvefit)
modelPredictions_curvefit = func(xData, *fittedParameters_curvefit)
# Signed residuals (prediction minus observation), despite the "absError" name.
absError_curvefit = modelPredictions_curvefit - yData
SE_curvefit = numpy.square(absError_curvefit) # squared errors
MSE_curvefit = numpy.mean(SE_curvefit) # mean squared errors
RMSE_curvefit = numpy.sqrt(MSE_curvefit) # Root Mean Squared Error, RMSE
# R-squared computed as 1 - var(residuals) / var(observations).
Rsquared_curvefit = 1.0 - (numpy.var(absError_curvefit) / numpy.var(yData))
print('RMSE curve_fit:', RMSE_curvefit)
print('R-squared curve_fit:', Rsquared_curvefit)
# ODR section
# Use the public scipy.odr names; the scipy.odr.odrpack namespace is
# deprecated in recent SciPy releases.
data = scipy.odr.Data(xData,yData)
model = scipy.odr.Model(func_wrapper_for_ODR)
odr = scipy.odr.ODR(data, model, beta0=geneticParameters)
# Run the regression.
odr_out = odr.run()
print('Fitted parameters ODR:', odr_out.beta)
modelPredictions_odr = func(xData, *odr_out.beta)
# Signed residuals (prediction minus observation), despite the "absError" name.
absError_odr = modelPredictions_odr - yData
SE_odr = numpy.square(absError_odr) # squared errors
MSE_odr = numpy.mean(SE_odr) # mean squared errors
RMSE_odr = numpy.sqrt(MSE_odr) # Root Mean Squared Error, RMSE
# R-squared computed as 1 - var(residuals) / var(observations).
Rsquared_odr = 1.0 - (numpy.var(absError_odr) / numpy.var(yData))
print('RMSE ODR:', RMSE_odr)
print('R-squared ODR:', Rsquared_odr)
# graphics output section
def ModelsAndScatterPlot(graphWidth, graphHeight):
    """Plot the raw data together with the curve_fit and ODR model curves.

    ``graphWidth`` and ``graphHeight`` are divided by 100 to give the figure
    size in inches at ``dpi=100``, so they are effectively pixel dimensions.
    Reads the module-level ``xData``, ``yData``,
    ``fittedParameters_curvefit`` and ``odr_out``.
    """
    f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
    axes = f.add_subplot(111)

    # first the raw data as a scatter plot
    axes.plot(xData, yData, 'D')

    # create data for the fitted equation plots
    xModel = numpy.linspace(min(xData), max(xData))
    yModel_curvefit = func(xModel, *fittedParameters_curvefit)
    yModel_odr = func(xModel, *odr_out.beta)

    # now the models as line plots
    axes.plot(xModel, yModel_curvefit)
    axes.plot(xModel, yModel_odr)

    axes.set_xlabel('X Data') # X axis data label
    axes.set_ylabel('Y Data') # Y axis data label

    # Display the figure before closing it; otherwise it is discarded
    # without ever being rendered.
    plt.show()
    plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelsAndScatterPlot(graphWidth, graphHeight)

Implementing minimization in SciPy

I am trying to implement the 'Iterative Hessian Sketch' algorithm from page 12. However, I am struggling with step two, which requires minimizing the matrix-vector function.
Imports and basic data generating function
import numpy as np
import scipy as sp
from sklearn.datasets import make_regression
from scipy.optimize import minimize
import matplotlib.pyplot as plt
%matplotlib inline
from numpy.linalg import norm
def generate_data(nsamples, nfeatures, variance=1):
'''Generates a data matrix of size (nsamples, nfeatures)
which defines a linear relationship on the variables.'''
X, y = make_regression(n_samples=nsamples, n_features=nfeatures,\
X[:,0] = np.ones(shape=(nsamples)) # add bias terms
return X, y
To minimize the matrix-vector function, I have tried implementing a function which computes the quantity I would like to minimise:
def f2min(x, data, target, offset):
    """Evaluate the sketched least-squares objective at ``x``.

    With ``A = data``, ``y = target``, ``xt = ravel(offset)`` and sketch
    matrix ``S`` (currently the identity, with ``m`` rows), returns

        1/(2*m) * ||S A (x - xt)||^2  -  (y - A xt)^T A x

    The ``1/(2*m)`` scaling matches the ``1/m`` factor in ``grad_f2min``.
    """
    A = data
    S = np.eye(A.shape[0])
    #S = gaussian_sketch(nrows=A.shape[0]//2, ncols=A.shape[0] )
    y = target
    xt = np.ravel(offset)
    # Parenthesise the denominator: 1/2*m would evaluate to m/2.
    norm_val = (1/(2*S.shape[0]))*norm(S@A@(x-xt))**2
    inner_prod = (y - A@xt).T@A@x
    return norm_val - inner_prod
I would eventually like to replace S with some random matrices which can reduce the dimensionality of the problem, however, first I need to be confident that this optimisation method is working.
def grad_f2min(x, data, target, offset):
    """Return the gradient of the sketched least-squares objective at ``x``.

    With ``A = data``, ``y = target``, ``xt = ravel(offset)`` and sketch
    matrix ``S`` (currently the identity, with ``m`` rows), computes

        (1/m) * (S A)^T (S A) (x - xt)  -  A^T (y - A xt)
    """
    A = data
    y = target
    S = np.eye(A.shape[0])
    xt = np.ravel(offset)
    S_A = S@A
    grad = (1/S.shape[0])*S_A.T@S_A@(x-xt) - A.T@(y-A@xt)
    return grad
x0 = np.zeros((X.shape[0],1))
xt = np.zeros((2,1))
x_new = np.zeros((2,1))
for it in range(1):
    # x_new is passed as the `offset` argument of f2min; the optimizer's
    # starting point is always `xt`, which is never updated in this loop.
    result = minimize(f2min, x0=xt,args=(X,y,x_new),
                      method='CG', jac=False )
    x_new = result.x
I don't think that this loop is correct at all because at the very least there should be some local convergence before moving on to the next step. The output is:
fun: 0.0
jac: array([ 0.00745058, 0.00774882])
message: 'Desired error not necessarily achieved due to precision loss.'
nfev: 416
nit: 0
njev: 101
status: 2
success: False
x: array([ 0., 0.])
Does anyone have an idea:
(1) why I'm not achieving convergence at each step, and
(2) whether I can implement step 2 in a better way?