Error when using tensorflow HMC to marginalise GPR hyperparameters - tensorflow

I would like to use TensorFlow (version 2) to fit some data with Gaussian process regression, starting from the Google Colab example here [1].
I have turned part of that notebook into the minimal example below.
When using MCMC to marginalize the hyperparameters, the code sometimes fails with the following error. Has anyone seen this before, or does anyone know how to get around it?
tensorflow.python.framework.errors_impl.InvalidArgumentError: Input matrix is not invertible.
[[{{node mcmc_sample_chain/trace_scan/while/body/_168/smart_for_loop/while/body/_842/dual_averaging_step_size_adaptation___init__/_one_step/transformed_kernel_one_step/mh_one_step/hmc_kernel_one_step/leapfrog_integrate/while/body/_1244/leapfrog_integrate_one_step/maybe_call_fn_and_grads/value_and_gradients/value_and_gradient/gradients/leapfrog_integrate_one_step/maybe_call_fn_and_grads/value_and_gradients/value_and_gradient/PartitionedCall_grad/PartitionedCall/gradients/JointDistributionNamed/log_prob/JointDistributionNamed_log_prob_GaussianProcess/log_prob/JointDistributionNamed_log_prob_GaussianProcess/get_marginal_distribution/Cholesky_grad/MatrixTriangularSolve}}]] [Op:__inference_do_sampling_113645]
Function call stack:
do_sampling
[1] https://colab.research.google.com/github/tensorflow/probability/blob/master/tensorflow_probability/examples/jupyter_notebooks/Gaussian_Process_Regression_In_TFP.ipynb#scrollTo=jw-_1yC50xaM
Note that some of the code below is a bit redundant in some sections, but it should reproduce the error.
Thanks!
import time
import numpy as np
import tensorflow.compat.v2 as tf
import tensorflow_probability as tfp
tfb = tfp.bijectors
tfd = tfp.distributions
tfk = tfp.math.psd_kernels
tf.enable_v2_behavior()
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
#%pylab inline
# Configure plot defaults
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['grid.color'] = '#666666'
#%config InlineBackend.figure_format = 'png'
def sinusoid(x):
    return np.sin(3 * np.pi * x[..., 0])

def generate_1d_data(num_training_points, observation_noise_variance):
    """Generate noisy sinusoidal observations at a random set of points.

    Returns:
      observation_index_points, observations
    """
    index_points_ = np.random.uniform(-1., 1., (num_training_points, 1))
    index_points_ = index_points_.astype(np.float64)
    # y = f(x) + noise
    observations_ = (sinusoid(index_points_) +
                     np.random.normal(loc=0,
                                      scale=np.sqrt(observation_noise_variance),
                                      size=(num_training_points)))
    return index_points_, observations_
# Generate training data with a known noise level (we'll later try to recover
# this value from the data).
NUM_TRAINING_POINTS = 100
observation_index_points_, observations_ = generate_1d_data(
    num_training_points=NUM_TRAINING_POINTS,
    observation_noise_variance=.1)
def build_gp(amplitude, length_scale, observation_noise_variance):
    """Defines the conditional dist. of GP outputs, given kernel parameters."""
    # Create the covariance kernel, which will be shared between the prior (which
    # we use for maximum likelihood training) and the posterior (which we use for
    # posterior predictive sampling).
    kernel = tfk.ExponentiatedQuadratic(amplitude, length_scale)
    # Create the GP prior distribution, which we will use to train the model
    # parameters.
    return tfd.GaussianProcess(
        kernel=kernel,
        index_points=observation_index_points_,
        observation_noise_variance=observation_noise_variance)
gp_joint_model = tfd.JointDistributionNamed({
    'amplitude': tfd.LogNormal(loc=0., scale=np.float64(1.)),
    'length_scale': tfd.LogNormal(loc=0., scale=np.float64(1.)),
    'observation_noise_variance': tfd.LogNormal(loc=0., scale=np.float64(1.)),
    'observations': build_gp,
})
x = gp_joint_model.sample()
lp = gp_joint_model.log_prob(x)
print("sampled {}".format(x))
print("log_prob of sample: {}".format(lp))
# Create the trainable model parameters, which we'll subsequently optimize.
# Note that we constrain them to be strictly positive.
constrain_positive = tfb.Shift(np.finfo(np.float64).tiny)(tfb.Exp())
amplitude_var = tfp.util.TransformedVariable(
    initial_value=1.,
    bijector=constrain_positive,
    name='amplitude',
    dtype=np.float64)
length_scale_var = tfp.util.TransformedVariable(
    initial_value=1.,
    bijector=constrain_positive,
    name='length_scale',
    dtype=np.float64)
observation_noise_variance_var = tfp.util.TransformedVariable(
    initial_value=1.,
    bijector=constrain_positive,
    name='observation_noise_variance_var',
    dtype=np.float64)
trainable_variables = [v.trainable_variables[0] for v in
                       [amplitude_var,
                        length_scale_var,
                        observation_noise_variance_var]]
# Use `tf.function` to trace the loss for more efficient evaluation.
@tf.function(autograph=False, experimental_compile=False)
def target_log_prob(amplitude, length_scale, observation_noise_variance):
    return gp_joint_model.log_prob({
        'amplitude': amplitude,
        'length_scale': length_scale,
        'observation_noise_variance': observation_noise_variance,
        'observations': observations_
    })
# Now we optimize the model parameters.
num_iters = 1000
optimizer = tf.optimizers.Adam(learning_rate=.01)
# Store the likelihood values during training, so we can plot the progress
lls_ = np.zeros(num_iters, np.float64)
for i in range(num_iters):
    with tf.GradientTape() as tape:
        loss = -target_log_prob(amplitude_var, length_scale_var,
                                observation_noise_variance_var)
    grads = tape.gradient(loss, trainable_variables)
    optimizer.apply_gradients(zip(grads, trainable_variables))
    lls_[i] = loss
print('Trained parameters:')
print('amplitude: {}'.format(amplitude_var._value().numpy()))
print('length_scale: {}'.format(length_scale_var._value().numpy()))
print('observation_noise_variance: {}'.format(observation_noise_variance_var._value().numpy()))
num_results = 100
num_burnin_steps = 50
sampler = tfp.mcmc.TransformedTransitionKernel(
    tfp.mcmc.HamiltonianMonteCarlo(
        target_log_prob_fn=target_log_prob,
        step_size=tf.cast(0.1, tf.float64),
        num_leapfrog_steps=8),
    bijector=[constrain_positive, constrain_positive, constrain_positive])
adaptive_sampler = tfp.mcmc.DualAveragingStepSizeAdaptation(
    inner_kernel=sampler,
    num_adaptation_steps=int(0.8 * num_burnin_steps),
    target_accept_prob=tf.cast(0.75, tf.float64))
initial_state = [tf.cast(x, tf.float64) for x in [1., 1., 1.]]
# Speed up sampling by tracing with `tf.function`.
@tf.function(autograph=False, experimental_compile=False)
def do_sampling():
    return tfp.mcmc.sample_chain(
        kernel=adaptive_sampler,
        current_state=initial_state,
        num_results=num_results,
        num_burnin_steps=num_burnin_steps,
        trace_fn=lambda current_state, kernel_results: kernel_results)
t0 = time.time()
samples, kernel_results = do_sampling()
t1 = time.time()
print("Inference ran in {:.2f}s.".format(t1-t0))

This can happen if you have multiple index points that are very close together, so you might consider using np.linspace instead of a random draw, or doing some post-filtering of your random draw to enforce a minimum spacing. I would also suggest a somewhat bigger epsilon, maybe 1e-6.
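A minimal sketch of both suggestions, assuming the "epsilon" refers to the tiny shift in the question's constrain_positive bijector (the spacing threshold below is an arbitrary choice):
import numpy as np
import tensorflow_probability as tfp
tfb = tfp.bijectors
NUM_TRAINING_POINTS = 100
# Option 1: evenly spaced index points instead of a random draw.
index_points_ = np.linspace(-1., 1., NUM_TRAINING_POINTS)[:, None]
# Option 2: keep the random draw, but drop near-duplicate points.
raw = np.sort(np.random.uniform(-1., 1., (NUM_TRAINING_POINTS, 1)), axis=0)
keep = np.concatenate([[True], np.diff(raw[:, 0]) > 1e-3])  # assumed minimum spacing
index_points_ = raw[keep]
# A larger shift keeps the constrained hyperparameters safely away from zero.
constrain_positive = tfb.Shift(np.float64(1e-6))(tfb.Exp())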

Related

Particle swarm optimization in regression problem

I am trying to optimize my random forest regression model using a particle swarm optimizer to minimize the prediction error, but I am getting this error: UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U33'), dtype('<U33')) -> None
Can anyone please help me with this? I really appreciate any help you can provide.
My dataset contains 24 independent variables (X) and one dependent variable (y).
CODE:
import numpy as np
import pandas as pd
import re
import os
import random
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import animation, rc
%matplotlib inline
from matplotlib.animation import FuncAnimation
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor()
RFR.fit(X_train, y_train)
######################### Particle swarm optimization ##################################
#### error rate
#### The objective is to find minimum error
def fitness_function(xtest, ytest):
    # Prediction
    ypred = RFR.predict(X_test)
    error = mean_squared_error(y_test, ypred)
    #r2 = r2_score(ytest, ypred)
    return f'The error is: {error}'
from matplotlib import animation
import random
####### velocity #######################################
def update_velocity(particle, velocity, pbest, gbest, w_min=0.5, max=1.0, c=0.1):
    # Initialise new velocity array
    num_particle = len(particle)
    new_velocity = np.array([0.2 for i in range(num_particle)])
    # Randomly generate r1, r2 and inertia weight from a uniform distribution
    r1 = random.uniform(0, max)
    r2 = random.uniform(0, max)
    w = random.uniform(w_min, max)
    c1 = c
    c2 = c
    # Calculate new velocity
    for i in range(num_particle):
        new_velocity[i] = w*velocity[i] + c1*r1*(pbest[i]-particle[i]) + c2*r2*(gbest[i]-particle[i])
    return new_velocity
############## update position ##############
def update_position(particle, velocity):
    # Move particles by adding velocity
    new_particle = particle + velocity
    return new_particle
######################################### PSO Main function ###################################
def pso_2d(population, dimension, position_min, position_max, generation, fitness_criterion):
    # Initialization
    # Population
    particles = [[np.random.uniform(position_min[0:24], position_max[0:24]) for j in range(population)] for i in range(dimension)]  # generating random particle positions
    # Particle's best position
    pbest_position = particles  # personal best position
    # Fitness
    pbest_fitness = [fitness_function(p[0:24], p[1]) for p in particles]  # personal best fitness
    # Index of the best particle
    # np.reshape(np.ravel(p[0]), (2, 31))
    gbest_index = np.argmin(pbest_fitness)
    # Global best particle position
    gbest_position = pbest_position[gbest_index]
    # Velocity (starting from 0 speed)
    velocity = [[0.0 for j in range(dimension)] for i in range(population)]
    # Loop over the generations
    for t in range(generation):
        # Stop if the average fitness value reached a predefined success criterion
        if np.average(pbest_fitness) <= fitness_criterion:
            break
        else:
            for n in range(population):
                # Update the velocity of each particle
                velocity[n] = update_velocity(particles[n], velocity[n], pbest_position[n], gbest_position)
                # Move the particles to new positions
                particles[n] = update_position(particles[n], velocity[n])
        # Calculate the fitness value
        pbest_fitness = [fitness_function(p[0:24], p[1]) for p in particles]
        # Find the index of the best particle
        gbest_index = np.argmin(pbest_fitness)
        # Update the position of the best particle
        gbest_position = pbest_position[gbest_index]
    # Print the results
    print('Global Best Position: ', gbest_position)
    print('Best Fitness Value: ', min(pbest_fitness))
    print('Average Particle Best Fitness Value: ', np.average(pbest_fitness))
    print('Number of Generation: ', t)
position_min = [-0.44306155, -0.52971118, -0.10311188, -0.60053201, -0.78198029,
-0.37737778, -0.14371436, -0.01623235, -0.88660182, -0.06182274,
-0.30084403, -0.98080838, -0.11787062, -0.84172055, -0.709991 ,
-0.9841236 , -0.32976052, -0.26586302, -0.87641669, -0.23728611,
-0.08874495, -0.03091284, -0.29987714, -0.96795309]
position_max = [0.44306155, 0.52971118, 0.10311188, 0.60053201, 0.78198029,
0.37737778, 0.14371436, 0.01623235, 0.88660182, 0.06182274,
0.30084403, 0.98080838, 0.11787062, 0.84172055, 0.709991 ,
0.9841236 , 0.32976052, 0.26586302, 0.87641669, 0.23728611,
0.08874495, 0.03091284, 0.29987714, 0.96795309]
pso_2d(100, 24, position_min, position_max, 400, 10e-4)
Output: UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U33'), dtype('<U33')) -> None
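For what it's worth, the '<U33' dtypes in that message are NumPy unicode strings, which suggests the culprit is that fitness_function returns a formatted f-string rather than a number, so np.argmin and np.average end up doing arithmetic on strings. A minimal numeric sketch, keeping the question's variable names:
def fitness_function(xtest, ytest):
    # Return the numeric MSE so np.argmin/np.average operate on floats
    ypred = RFR.predict(X_test)
    return mean_squared_error(y_test, ypred)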

Bayesian Non-Parametric Evolutionary by precise Gradients in the Acquisition Function - TensorFlow Core v2.5.0

The graph compiles, but the objective function produces NaN output values (even though the input is randomly generated data).
# reference https://www.tensorflow.org/probability/api_docs/python/tfp/math/ode/Solver
# reference https://www.tensorflow.org/probability/api_docs/python/tfp/optimizer/differential_evolution_minimize
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
tf.executing_eagerly()
population_size = 40
initial_population = (tf.random.normal([population_size]),
                      tf.random.normal([population_size]))
pi = tf.constant(3.14159)
t_init, t0, t1 = 0., 0.5, 1.
def ode_fn(t, x):
    x, y = initial_population
    return -(tf.math.cos(x) * tf.math.cos(y) *
             tf.math.exp(-(x-pi)**2 - (y-pi)**2))

def gradients(x):
    results = tfp.math.ode.BDF().solve(ode_fn, t_init, initial_population[0],
                                       solution_times=[t0, t1])
    return results.states[0]
# The objective function and the gradient.
optim_results = tfp.optimizer.differential_evolution_minimize(
    gradients,
    initial_population=initial_population[0])
objective_value = optim_results[4]
DirSampleNoise = tfd.Dirichlet([tf.math.reduce_mean(objective_value), tf.math.reduce_std(objective_value)])
print(DirSampleNoise.sample([2,]))
# Check that the argmin is close to the actual value.
# Print out the total number of function evaluations it took. Should be 5.
Current developments can be found at this link: https://gitlab.com/emmanuelnsanga/bayes-distil-model/-/blob/main/optimizer_non-parametric.py

Abysmal tf.GradientTape performance compared to tf.gradients() for computing jacobians

SOLUTION BELOW:
Scenario:
I am trying to compute the Jacobian of a user-defined function many, many times in a loop. I can do this with TF 2's GradientTape as well as the older session-based tf.gradients() method. The problem is that GradientTape is terribly slow (100x slower) than tf.gradients(). It has features I'd like to use (batch_jacobian, Hessian support, etc.), but if it's 100x slower then I can't use it.
The Question:
It's not clear to me whether I'm simply misusing GradientTape, or whether it will always be slower because it has to re-differentiate the provided function every time it's called (my suspicion). I'm asking for tips to fix my use of GradientTape, or for confirmation that it will always be fundamentally slower than tf.gradients by orders of magnitude.
Related Questions:
Repeated use of GradientTape for multiple Jacobian calculations - same scenario, unanswered
Does `GradientTape` need to re-differentiate each evaluation of a derivative? - same scenario, unanswered
using one GradientTape with global context - loosely related, having trouble applying that solution to my scenario
Fully contained minimum example to compare GradientTape and tf.gradients():
import tensorflow as tf
from tensorflow.python.framework.ops import disable_eager_execution
import numpy as np
# from tensorflow.python.ops.parallel_for.gradients import jacobian, batch_jacobian
import timeit
class FunctionCaller(object):
    def __init__(self, func, nX, dtype=tf.float64, useSessions=True):
        if useSessions:
            disable_eager_execution()
        self.func = func
        self.nX = nX
        self.useSessions = useSessions
        self.dtype = dtype
        self.sess = tf.compat.v1.Session() if useSessions else None
        if not useSessions:
            return
        #
        # we are in session mode, so build the graph and take the batch-jacobian of the function's outputs
        #
        xTensor = tf.compat.v1.placeholder(dtype, shape=[None, nX])
        # add function to graph and guarantee its output shape
        func_tensor = tf.reshape(func(xTensor), [-1, nX])
        # take the gradient for each output, one at a time, and stack the results back together
        each_output = tf.unstack(func_tensor, nX, axis=1)
        jac_x = tf.stack([tf.gradients(output, xTensor, unconnected_gradients='zero')[0]
                          for output in each_output], axis=1)
        # record these tensors so we can use them later with session.run()
        self.xTensor = xTensor
        self.func_tensor = func_tensor
        self.jac_func_tensor = jac_x

    def jac(self, x_i):
        if self.useSessions:
            return self.sess.run(self.jac_func_tensor, {self.xTensor: x_i})
        else:
            return self._useGradientTape(x_i)

    # THIS FUNCTION IS SUPER INEFFICIENT.
    def _useGradientTape(self, x_i):
        with tf.GradientTape(persistent=True) as g:
            xTensor = tf.Variable(x_i, dtype=self.dtype)  # is this my problem??? i recreate x every time?
            y = tf.reshape(self.func(xTensor), [-1, self.nX])
        jac_x_at_i = g.batch_jacobian(y, xTensor)
        # del g
        return jac_x_at_i.numpy()

    def __del__(self):
        if self.sess is not None:
            self.sess.close()
def main():
    @tf.function
    def Xdot(x_i):
        x_0, x_1, x_2 = tf.split(x_i, 3, axis=1)
        return tf.concat([x_2 * tf.sin(x_2), x_2 * tf.cos(x_2), x_2], axis=1)
    nT = 20
    nX = 3
    # create some trash data
    x_i = np.arange(nT*nX).reshape([-1, nX])
    nTrials = 100
    # try the eager version first
    caller_eager = FunctionCaller(Xdot, nX, useSessions=False)
    start_time = timeit.default_timer()
    for _ in range(nTrials):
        jac_eager = caller_eager.jac(x_i)
    elapsed = timeit.default_timer() - start_time
    print("eager code took {} sec: {} sec/trial".format(elapsed, elapsed/nTrials))
    # now try the sessions version
    caller_sessions = FunctionCaller(Xdot, nX, useSessions=True)
    start_time = timeit.default_timer()
    caller_sessions.jac(x_i)  # call it once to do its graph-building stuff
    for _ in range(nTrials):
        jac_session = caller_sessions.jac(x_i)
    elapsed = timeit.default_timer() - start_time
    print("session code took {} sec: {} sec/trial".format(elapsed, elapsed/nTrials))
    residual = np.max(np.abs(jac_eager - jac_session))
    print('residual between eager and session trials is {}'.format(residual))

if __name__ == "__main__":
    main()
EDIT - SOLUTION:
xdurch0 pointed out below that I should wrap _useGradientTape() in a @tf.function - something I was unsuccessful with before for other reasons. Once I did that, I had to move xTensor's definition outside the @tf.function wrapper, by making it a member variable and updating it with assign().
With all this done, I find that GradientTape (for this simple example) is now on the same order of magnitude as tf.gradients. When running enough trials (~1E5), it's twice as fast as tf.gradients. Awesome!
import tensorflow as tf
from tensorflow.python.framework.ops import disable_eager_execution
import numpy as np
import timeit
class FunctionCaller(object):
    def __init__(self, func, nT, nX, dtype=tf.float64, useSessions=True):
        if useSessions:
            disable_eager_execution()
        self.func = func
        self.nX = nX
        self.useSessions = useSessions
        self.dtype = dtype
        self.sess = tf.compat.v1.Session() if useSessions else None
        if not useSessions:
            # you should be able to create without an initial value, but tf is demanding one
            # despite what the docs say. bug?
            # tf.Variable(initial_value=None, shape=[None, nX], validate_shape=False, dtype=self.dtype)
            self.xTensor = tf.Variable([[0]*nX]*nT, dtype=self.dtype)  # x needs to be properly sized once
            return
        #
        # we are in session mode, so build the graph and take the batch-jacobian of the function's outputs
        #
        xTensor = tf.compat.v1.placeholder(dtype, shape=[None, nX])
        # add function to graph and guarantee its output shape
        func_tensor = tf.reshape(func(xTensor), [-1, nX])
        # take the gradient for each output, one at a time, and stack the results back together
        each_output = tf.unstack(func_tensor, nX, axis=1)
        jac_x = tf.stack([tf.gradients(output, xTensor, unconnected_gradients='zero')[0]
                          for output in each_output], axis=1)
        # record these tensors so we can use them later with session.run()
        self.xTensor = xTensor
        self.func_tensor = func_tensor
        self.jac_func_tensor = jac_x

    def jac(self, x_i):
        if self.useSessions:
            return self.sess.run(self.jac_func_tensor, {self.xTensor: x_i})
        else:
            return self._useGradientTape(x_i).numpy()

    @tf.function  # THIS IS CRUCIAL
    def _useGradientTape(self, x_i):
        with tf.GradientTape(persistent=True) as g:
            self.xTensor.assign(x_i)  # you need to create the variable once outside the graph
            y = tf.reshape(self.func(self.xTensor), [-1, self.nX])
        jac_x_at_i = g.batch_jacobian(y, self.xTensor)
        # del g
        return jac_x_at_i

    def __del__(self):
        if self.sess is not None:
            self.sess.close()
def main():
    @tf.function
    def Xdot(x_i):
        x_0, x_1, x_2 = tf.split(x_i, 3, axis=1)
        return tf.concat([x_2 * tf.sin(x_2), x_2 * tf.cos(x_2), x_2], axis=1)
    nT = 20
    nX = 3
    # create some trash data
    x_i = np.random.random([nT, nX])
    nTrials = 1000  # I find that eager is slower for nTrials <= 1E3, faster for >= 1E4, and TWICE as fast for >= 1E5
    # try the eager version first
    caller_eager = FunctionCaller(Xdot, nT, nX, useSessions=False)
    start_time = timeit.default_timer()
    for _ in range(nTrials):
        jac_eager = caller_eager.jac(x_i)
    elapsed = timeit.default_timer() - start_time
    print("eager code took {} sec: {} sec/trial".format(elapsed, elapsed/nTrials))
    # now try the sessions version
    caller_sessions = FunctionCaller(Xdot, nT, nX, useSessions=True)
    start_time = timeit.default_timer()
    for _ in range(nTrials):
        jac_session = caller_sessions.jac(x_i)
    elapsed = timeit.default_timer() - start_time
    print("session code took {} sec: {} sec/trial".format(elapsed, elapsed/nTrials))
    residual = np.max(np.abs(jac_eager - jac_session))
    print('residual between eager and session trials is {}'.format(residual))

if __name__ == "__main__":
    main()

How to batch a transformed (scaled and quantized) Beta distribution in tensorflow probability

I'm trying to fit a beta distribution to the results of a survey with discrete scores (1, 2, 3, 4, 5).
For that to work I need a working log_prob of a Beta in TensorFlow Probability. However, there is a problem with how batching is handled in Beta.
Here is a minimal example that gives me an error:
InvalidArgumentError: Shapes of a and x are inconsistent: [3] vs. [1000,1] [Op:Betainc]
The same code seems to work fine with a Normal distribution.
What am I doing wrong here?
import numpy as np
import tensorflow_probability as tfp
tfd = tfp.distributions
#Generate fake data
np.random.seed(2)
data = np.random.beta(2.,2.,1000)*5.0
data = np.ceil(data)
data = data[:,None]
# Create a batch of three Beta distributions.
alpha = np.array([1., 2., 3.]).astype(np.float32)
beta = np.array([1., 2., 3.]).astype(np.float32)
bt = tfd.Beta(alpha, beta)
#bt = tfd.Normal(loc=alpha, scale=beta)
#Scale beta to 0-5
scbt = tfd.TransformedDistribution(
    distribution=bt,
    bijector=tfp.bijectors.AffineScalar(
        shift=0.,
        scale=5.))
# quantize beta to (1,2,3,4,5)
qdist = tfd.QuantizedDistribution(distribution=scbt,low=1,high=5)
#calc log_prob for 3 distributions
print(np.sum(qdist.log_prob(data),axis=0))
print(qdist.log_prob(data).shape)
TensorFlow 2.0.0
tensorflow_probability 0.8.0
EDIT:
As suggested by Chris Suter, here is the broadcast-by-hand solution:
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions
from matplotlib import pyplot as plt
#Generate fake data
numdata = 100
numbeta = 3
np.random.seed(2)
data = np.random.beta(2.,2.,numdata)
data *= 5.0
data = np.ceil(data)
data = data[:,None].astype(np.float32)
#alpha and beta [[1., 2., 3.]]
alpha = np.expand_dims(np.arange(1,4),0).astype(np.float32)
beta = np.expand_dims(np.arange(1,4),0).astype(np.float32)
#tile to compensate for betainc
alpha = tf.tile(alpha,[numdata,1])
beta = tf.tile(beta,[numdata,1])
data = tf.tile(data,[1,numbeta])
bt = tfd.Beta(concentration1=alpha, concentration0=beta)
scbt = tfd.TransformedDistribution(
    distribution=bt,
    bijector=tfp.bijectors.AffineScalar(
        shift=0.,
        scale=5.))
# quantize beta to (1,2,3,4,5)
qdist = tfd.QuantizedDistribution(distribution=scbt,low=1,high=5)
#calc log_prob for numbeta number of distributions
print(np.sum(qdist.log_prob(data),axis=0))
EDIT2: The above solution does not work when I try to apply it in MCMC sampling.
The new code looks like this:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
from time import time
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions
import numpy as np
#Generate fake data
numdata = 100
np.random.seed(2)
data = np.random.beta(2.,2.,numdata)
data *= 5.0
data = np.ceil(data)
data = data[:,None].astype(np.float32)
@tf.function
def sample_chain():
    # Parameters of MCMC
    num_burnin_steps = 300
    num_results = 200
    num_chains = 50
    step_size = 0.01
    # data tensor
    outcomes = tf.convert_to_tensor(data, dtype=tf.float32)

    def modeldist(alpha, beta):
        bt = tfd.Beta(concentration1=alpha, concentration0=beta)
        scbt = tfd.TransformedDistribution(
            distribution=bt,
            bijector=tfp.bijectors.AffineScalar(
                shift=0.,
                scale=5.))
        # quantize beta to (1,2,3,4,5)
        qdist = tfd.QuantizedDistribution(distribution=scbt, low=1, high=5)
        return qdist

    def joint_log_prob(con1, con0):
        # manual broadcast
        tcon1 = tf.tile(con1[None, :], [numdata, 1])
        tcon0 = tf.tile(con0[None, :], [numdata, 1])
        toutcomes = tf.tile(outcomes, [1, num_chains])
        # model distribution with manual broadcast
        dist = modeldist(tcon1, tcon0)
        # joint log prob
        return tf.reduce_sum(dist.log_prob(toutcomes), axis=0)

    kernel = tfp.mcmc.HamiltonianMonteCarlo(
        target_log_prob_fn=joint_log_prob,
        num_leapfrog_steps=5,
        step_size=step_size)
    kernel = tfp.mcmc.SimpleStepSizeAdaptation(
        inner_kernel=kernel, num_adaptation_steps=int(num_burnin_steps * 0.8))
    init_state = [tf.identity(tf.random.uniform([num_chains])*10.0, name='init_alpha'),
                  tf.identity(tf.random.uniform([num_chains])*10.0, name='init_beta')]
    samples, [step_size, is_accepted] = tfp.mcmc.sample_chain(
        num_results=num_results,
        num_burnin_steps=num_burnin_steps,
        current_state=init_state,
        kernel=kernel,
        trace_fn=lambda _, pkr: [pkr.inner_results.accepted_results.step_size,
                                 pkr.inner_results.is_accepted])
    return samples

samples = sample_chain()
This ends up with an error message:
ValueError: Encountered None gradient. fn_arg_list: [tf.Tensor 'init_alpha:0' shape=(50,) dtype=float32, tf.Tensor 'init_beta:0' shape=(50,) dtype=float32] grads: [None, None]
Sadly, tf.math.betainc doesn't support broadcasting at present, which causes the cdf computation that QuantizedDistribution calls to fail. If you must use Beta, the only workaround I can think of is to broadcast "manually" by tiling the data and the Beta parameters.
Alternatively, you might be able to get away with using the Kumaraswamy distribution, which is similar to Beta but has some nicer analytical properties.
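For example, a minimal sketch of the Kumaraswamy variant of the setup in the question (tfd.Kumaraswamy uses the same concentration1/concentration0 parameterization, and its cdf is analytic, so QuantizedDistribution's cdf calls avoid Betainc):
import numpy as np
import tensorflow_probability as tfp
tfd = tfp.distributions
np.random.seed(2)
data = np.ceil(np.random.beta(2., 2., 1000)[:, None] * 5.0).astype(np.float32)
# Batch of three distributions, as in the question.
alpha = np.array([1., 2., 3.]).astype(np.float32)
beta = np.array([1., 2., 3.]).astype(np.float32)
kw = tfd.Kumaraswamy(concentration1=alpha, concentration0=beta)
sckw = tfd.TransformedDistribution(
    distribution=kw,
    bijector=tfp.bijectors.AffineScalar(shift=0., scale=5.))
qdist = tfd.QuantizedDistribution(distribution=sckw, low=1, high=5)
print(np.sum(qdist.log_prob(data), axis=0))  # one summed log-prob per distribution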

Using scipy.odr to fit curve

I'm trying to fit a set of data points via a fit function that depends on two variables; let's call these xdata and sdata. The problem is that my curve is rather flat, and I want it to more or less "follow the points".
I've tried using scipy.odr to fit the curve, and it works rather well except that the curve is too flat:
import numpy as np
from math import pi
from math import sqrt
from math import log
from scipy import optimize
import scipy.optimize
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.odr import *
mudr=np.array([ 57.43708609, 46.26119205, 55.60688742, 33.21615894,
28.27072848, 22.54649007, 21.80662252, 11.21483444, 5.80211921])
#xdata points
dme=np.array([ 128662.54890776, 105265.32915726, 128652.56835434,
               77968.67019573, 66273.56542068, 58464.58559543,
               54570.66624991, 27286.90038703, 19480.92689266]) #xdata error
dmss22=np.array([ 4.90050000e+17, 4.90050000e+17, 4.90050000e+17,
4.90050000e+17, 4.90050000e+17, 4.90050000e+17,
4.90050000e+17, 4.90050000e+17, 4.90050000e+17]) #sdata points
dmse=np.array([ 1.09777592e+21, 1.11512117e+21, 1.13381702e+21,
1.15033267e+21, 1.14883089e+21, 1.27076265e+21,
1.22637165e+21, 1.19237598e+21, 1.64539205e+21]) # sdata error
F=np.array([ 115.01944248, 110.24354867, 112.77812389, 104.81830088,
104.35746903, 101.32016814, 100.54513274, 96.94226549,
93.00424779]) #ydata points
dF=np.array([ 72710.75386699, 72590.6256987 , 176539.40403673,
130555.27503081, 124299.52080164, 176426.64340597,
143013.52848306, 122117.93022746, 157547.78395513])#ydata error
def Ffitsso(p, X, B=2.58, Fc=92.2, mu=770, Za=0.9468): #fit function
    # afij: expansion coefficients, defined elsewhere in the original code
    temp1 = (2*B*X[0])/(4*pi*Fc)**2
    temp2 = temp1*(afij[0]+afij[1]*np.log((2*B*X[0])/mu**2))
    temp3 = temp1**2*(afij[2]+afij[3]*np.log((2*B*X[0])/mu**2)+
                      afij[4]*(np.log((2*B*X[0])/mu**2))**2)
    temp4 = temp1**3*(afij[5]+afij[6]*np.log((2*B*X[0])/mu**2)+
                      afij[7]*(np.log((2*B*X[0])/mu**2))**2+
                      afij[8]*(np.log((2*B*X[0])/mu**2))**3)
    return Fc/Za*(1+p[0]*X[1])*(1+temp2+temp3+temp4)+p[1]
#fitting using scipy.odr
xtot=np.row_stack( (mudr, dmss22) )
etot=np.row_stack( (dme, dmse) )
fitting = Model(Ffitsso)
mydata = RealData(xtot, F, sx=etot, sy=dF)
myodr = ODR(mydata, fitting, beta0=[0, 100])
myoutput = myodr.run()
myoutput.pprint()
bet=myoutput.beta
plt.plot(mudr,F,"b^")
plt.plot(mudr,Ffitsso(bet,[mudr,dmss22]))
p[0]*X[1] in the fit function is supposed to be small compared to 1, but after the fit the value for p[0] is of order 1e-18 while the dmss22 values are of order 1e17, so the product is not small enough.
Even worse, it's negative, meaning the function decreases, which is not supposed to happen; it's supposed to increase like the plotted data points.
Edit: I fixed it. I didn't know the fit was so sensitive to the initial beta values; setting beta0[0] = 1.5e-15 makes it work!
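For reference, a minimal sketch of that fix; only the beta0 argument of the ODR call above changes (the value came from the experimentation described in the edit):
myodr = ODR(mydata, fitting, beta0=[1.5e-15, 100])
myoutput = myodr.run()
myoutput.pprint()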
Here is a graphical fitter with both curve_fit and ODR fitters using scipy's Differential Evolution (DE) genetic algorithm to supply initial parameter estimates for the non-linear solvers. The scipy implementation of DE uses the Latin Hypercube algorithm to ensure a thorough search of parameter space, and this requires parameter bounds within which to search - in this example, these bounds are taken from the data maximum and minimum values. Note that it is much easier to give bounds for the initial parameter estimates rather than individual specific values.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import scipy.odr
from scipy.optimize import differential_evolution
import warnings
xData = numpy.array([1.1, 2.2, 3.3, 4.4, 5.0, 6.6, 7.7, 0.0])
yData = numpy.array([1.1, 20.2, 30.3, 40.4, 50.0, 60.6, 70.7, 0.1])
def func(x, a, b, c, d, offset): # curve fitting function for curve_fit()
    return a*numpy.exp(-(x-b)**2/(2*c**2)+d) + offset

def func_wrapper_for_ODR(parameters, x): # parameter order for ODR
    return func(x, *parameters)

# function for genetic algorithm to minimize (sum of squared error)
def sumOfSquaredError(parameterTuple):
    warnings.filterwarnings("ignore") # do not print warnings by genetic algorithm
    val = func(xData, *parameterTuple)
    return numpy.sum((yData - val) ** 2.0)
def generate_Initial_Parameters():
    # min and max used for bounds
    maxX = max(xData)
    minX = min(xData)
    maxY = max(yData)
    minY = min(yData)
    parameterBounds = []
    parameterBounds.append([minY, maxY]) # search bounds for a
    parameterBounds.append([minX, maxX]) # search bounds for b
    parameterBounds.append([minX, maxX]) # search bounds for c
    parameterBounds.append([minY, maxY]) # search bounds for d
    parameterBounds.append([0.0, maxY]) # search bounds for Offset
    # "seed" the numpy random number generator for repeatable results
    result = differential_evolution(sumOfSquaredError, parameterBounds, seed=3)
    return result.x
geneticParameters = generate_Initial_Parameters()
##########################
# curve_fit section
##########################
fittedParameters_curvefit, pcov = curve_fit(func, xData, yData, geneticParameters)
print('Fitted parameters curve_fit:', fittedParameters_curvefit)
print()
modelPredictions_curvefit = func(xData, *fittedParameters_curvefit)
absError_curvefit = modelPredictions_curvefit - yData
SE_curvefit = numpy.square(absError_curvefit) # squared errors
MSE_curvefit = numpy.mean(SE_curvefit) # mean squared errors
RMSE_curvefit = numpy.sqrt(MSE_curvefit) # Root Mean Squared Error, RMSE
Rsquared_curvefit = 1.0 - (numpy.var(absError_curvefit) / numpy.var(yData))
print()
print('RMSE curve_fit:', RMSE_curvefit)
print('R-squared curve_fit:', Rsquared_curvefit)
print()
##########################
# ODR section
##########################
data = scipy.odr.odrpack.Data(xData,yData)
model = scipy.odr.odrpack.Model(func_wrapper_for_ODR)
odr = scipy.odr.odrpack.ODR(data, model, beta0=geneticParameters)
# Run the regression.
odr_out = odr.run()
print('Fitted parameters ODR:', odr_out.beta)
print()
modelPredictions_odr = func(xData, *odr_out.beta)
absError_odr = modelPredictions_odr - yData
SE_odr = numpy.square(absError_odr) # squared errors
MSE_odr = numpy.mean(SE_odr) # mean squared errors
RMSE_odr = numpy.sqrt(MSE_odr) # Root Mean Squared Error, RMSE
Rsquared_odr = 1.0 - (numpy.var(absError_odr) / numpy.var(yData))
print()
print('RMSE ODR:', RMSE_odr)
print('R-squared ODR:', Rsquared_odr)
print()
##########################################################
# graphics output section
def ModelsAndScatterPlot(graphWidth, graphHeight):
    f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
    axes = f.add_subplot(111)
    # first the raw data as a scatter plot
    axes.plot(xData, yData, 'D')
    # create data for the fitted equation plots
    xModel = numpy.linspace(min(xData), max(xData))
    yModel_curvefit = func(xModel, *fittedParameters_curvefit)
    yModel_odr = func(xModel, *odr_out.beta)
    # now the models as line plots
    axes.plot(xModel, yModel_curvefit)
    axes.plot(xModel, yModel_odr)
    axes.set_xlabel('X Data') # X axis data label
    axes.set_ylabel('Y Data') # Y axis data label
    plt.show()
    plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelsAndScatterPlot(graphWidth, graphHeight)