I read the following paper(http://www3.stat.sinica.edu.tw/statistica/oldpdf/A10n416.pdf) where they model the variance-covariance matrix Σ as:
Σ = diag(S)*R*diag(S) (Equation 1 in the paper)
S is the k×1 vector of standard deviations, diag(S) is the diagonal matrix with diagonal elements S, and R is the k×k correlation matrix.
How can I implement this using PyMC ?
Here is some initial code I wrote:
import numpy as np
import pandas as pd
import pymc as pm
k=3
prior_mu=np.ones(k)
prior_var=np.eye(k)
prior_corr=np.eye(k)
prior_cov=prior_var*prior_corr*prior_var
post_mu = pm.Normal("returns",prior_mu,1,size=k)
post_var=pm.Lognormal("variance",np.diag(prior_var),1,size=k)
post_corr_inv=pm.Wishart("inv_corr",n_obs,np.linalg.inv(prior_corr))
post_cov_matrix_inv = ???
muVector=[10,5,-2]
varMatrix=np.diag([10,20,10])
corrMatrix=np.matrix([[1,.2,0],[.2,1,0],[0,0,1]])
cov_matrix=varMatrix*corrMatrix*varMatrix
n_obs=10000
x=np.random.multivariate_normal(muVector,cov_matrix,n_obs)
obs = pm.MvNormal( "observed returns", post_mu, post_cov_matrix_inv, observed = True, value = x )
model = pm.Model( [obs, post_mu, post_cov_matrix_inv] )
mcmc = pm.MCMC()
mcmc.sample( 5000, 2000, 3 )
Thanks
[edit]
I think that can be done using the following:
#pm.deterministic
def post_cov_matrix_inv(post_sdev=post_sdev,post_corr_inv=post_corr_inv):
return np.diag(post_sdev)*post_corr_inv*np.diag(post_sdev)
Here is the solution for the benefit of someone who stumbles onto this post:
p=3
prior_mu=np.ones(p)
prior_sdev=np.ones(p)
prior_corr_inv=np.eye(p)
muVector=[10,5,1]
sdevVector=[3,5,10]
corrMatrix=np.matrix([[1,0,-.1],[0,1,.5],[-.1,.5,1]])
cov_matrix=np.diag(sdevVector)*corrMatrix*np.diag(sdevVector)
n_obs=2000
x=np.random.multivariate_normal(muVector,cov_matrix,n_obs)
prior_cov=np.diag(prior_sdev)*np.linalg.inv(prior_corr_inv)*np.diag(prior_sdev)
post_mu = pm.Normal("returns",prior_mu,1,size=p)
post_sdev=pm.Lognormal("sdev",prior_sdev,1,size=p)
post_corr_inv=pm.Wishart("inv_corr",n_obs,prior_corr_inv)
#post_cov_matrix_inv = pm.Wishart("inv_cov_matrix",n_obs,np.linalg.inv(prior_cov))
#pm.deterministic
def post_cov_matrix_inv(post_sdev=post_sdev,post_corr_inv=post_corr_inv,nobs=n_obs):
post_sdev_inv=(post_sdev)**-1
return np.diag(post_sdev_inv)*cov2corr(post_corr_inv/nobs)*np.diag(post_sdev_inv)
obs = pm.MvNormal( "observed returns", post_mu, post_cov_matrix_inv, observed = True, value = x )
model = pm.Model( [obs, post_mu, post_sdev ,post_corr_inv])
mcmc = pm.MCMC(model)
mcmc.sample( 25000, 15000, 1,progress_bar=False )
Related
I have the following code, which basically tries to fit a simple regression model using tensorflow probability. The code runs without error, but the MCMC sampler doesn't seem to be doing anything in that it returns a trace of the initial states.
import tensorflow.compat.v2 as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfdimport warnings
tf.enable_v2_behavior()
plt.style.use("ggplot")
warnings.filterwarnings('ignore')
ru=4
N=102
N = 102 # number of data points
t = np.linspace(0, 4*np.pi, N)
data = 3+np.sin(t+0.001) + 0.5 + np.random.randn(N)
media_1 = ((data-min(data))/(max(data)-min(data)) ) #+ np.random.normal(0,.05, N)
y = np.repeat(ru, N) + np.random.normal(.3,.01,N) * media_1 + np.random.normal(0, .005, N)
# model
model = tfd.JointDistributionNamed(dict(
beta1 = tfd.Normal(0,1) ,
intercept = tfd.Normal(0,5 ) ,
var = tfd.Normal(0.05, 0.0005) ,
y = lambda intercept,beta1,var:
tfd.Independent(tfd.Normal(loc=intercept + beta1 * media_1, scale=var),
reinterpreted_batch_ndims=1
)
))
def target_log_prob_fn(intercept, beta1, var):
return model.log_prob({'intercept':intercept, 'beta1':beta1, 'var':var, 'y': y })
s = model.sample()
init_states = [ tf.fill([1], s['intercept'].numpy(), name='init_intercept'),
tf.fill([1], s['beta1'].numpy(), name='init_beta1'),
tf.fill([1], s['var'].numpy(), name='init_var'),]
num_results = 5000
num_burnin_steps = 3000
# Improve performance by tracing the sampler using `tf.function`
# and compiling it using XLA.
#tf.function(autograph=False, experimental_compile=True)
def do_sampling():
return tfp.mcmc.sample_chain(
num_results=num_results,
num_burnin_steps=num_burnin_steps,
current_state=init_states,
kernel=tfp.mcmc.HamiltonianMonteCarlo(
target_log_prob_fn=target_log_prob_fn,
step_size=0.1,
num_leapfrog_steps=3)
)
states, kernel_results = do_sampling()
The trace that is returned in states is exactly the same as the initial values in initial_states... Any ideas?
I can confirm that this MCMC sampler is not mixing by printing the acceptance rate with a snippet I found here
print("Acceptance rate:", kernel_results.is_accepted.numpy().mean())
That same page provides some hints about how to make your HMC kernel adaptive, which means it will automatically reduce the step size if too many proposed steps are rejected (and increase it if too many are accepted, too):
# Apply a simple step size adaptation during burnin
#tf.function
def do_sampling():
adaptive_kernel = tfp.mcmc.SimpleStepSizeAdaptation(
tfp.mcmc.HamiltonianMonteCarlo(
target_log_prob_fn=target_log_prob_fn,
step_size=0.1,
num_leapfrog_steps=3),
num_adaptation_steps=int(.8 * num_burnin_steps),
target_accept_prob=np.float64(.65))
return tfp.mcmc.sample_chain(
num_results=num_results,
num_burnin_steps=num_burnin_steps,
current_state=init_states,
kernel=adaptive_kernel,
trace_fn=lambda cs, kr: kr)
samples, kernel_results = do_sampling()
print("Acceptance rate:", kernel_results.inner_results.is_accepted.numpy().mean())
This produces a non-zero acceptance rate for me.
I created a differential equation solver (Runge-Kutta 4th order method) in Python. Than I decided to check its results by setting the parameter mu to 0 and looking at the numeric solution that was returned by it.
The problem is, I know that this solution should give an stable oscillation as a result, but instead I get a diverging solution.
The code is presented below. I tried solving this problem (rounding errors from floating point precision) by using numpy float128 data type. But the solver keeps giving me the wrong answer.
The code is:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
def f(t,x,v):
f = -k/m*x-mu/m*v
return(f)
def g(t,x,v):
g = v
return(g)
def srunge4(t,x,v,dt):
k1 = f(t,x,v)
l1 = g(t,x,v)
k2 = f(t+dt/2, x+k1*dt/2, v+l1*dt/2)
l2 = g(t+dt/2, x+k1*dt/2, v+l1*dt/2)
k3 = f(t+dt/2, x+k2*dt/2, v+l2*dt/2)
l3 = g(t+dt/2, x+k2*dt/2, v+l2*dt/2)
k4 = f(t+dt/2, x+k3*dt, v+l3*dt)
l4 = g(t+dt/2, x+k3*dt, v+l3*dt)
v = v + dt/6*(k1+2*k2+2*k3+k4)
x = x + dt/6*(l1+2*l2+2*l3+l4)
t = t + dt
return([t,x,v])
mu = np.float128(0.00); k = np.float128(0.1); m = np.float128(6)
x0 = np.float128(5); v0 = np.float128(-10)
t0 = np.float128(0); tf = np.float128(1000); dt = np.float128(0.05)
def sedol(t, x, v, tf, dt):
sol = np.array([[t,x,v]], dtype='float128')
while sol[-1][0]<=tf:
t,x,v = srunge4(t,x,v,dt)
sol=np.append(sol,np.float128([[t,x,v]]),axis=0)
sol = pd.DataFrame(data=sol, columns=['t','x','v'])
return(sol)
ft_runge = sedol(t0, x0, v0, tf, dt=0.1)
plt.close("all")
graf1 = plt.plot(ft_runge.iloc[:,0],ft_runge.iloc[:,1],'b')
plt.show()
Am I using numpy float128 in a wrong way?
You are mixing in srunge4 the association of k and l to x and v. Per the function association and the final summation, the association should be (v,f,k) and (x,g,l). This has to be obeyed in the updates of the stages of the method.
In stage 4, it should be t+dt in the first argument. However, as t is not used in the derivative computation, this error has no consequence here.
Also, you are destroying the float128 computation if you set one parameter to a float in the default float64 type in dt=0.1.
The code with these corrections and some further simplifications is
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
mu = np.float128(0.00); k = np.float128(0.1); m = np.float128(6)
x0 = np.float128(5); v0 = np.float128(-10)
t0 = np.float128(0); tf = np.float128(1000); dt = np.float128(0.05)
def f(t,x,v): return -(k*x+mu*v)/m
def g(t,x,v): return v
def srunge4(t,x,v,dt): # should be skutta4, Wilhelm Kutta gave this method in 1901
k1, l1 = (fg(t,x,v) for fg in (f,g))
# here is the essential corrections, x->l, v->k
k2, l2 = (fg(t+dt/2, x+l1*dt/2, v+k1*dt/2) for fg in (f,g))
k3, l3 = (fg(t+dt/2, x+l2*dt/2, v+k2*dt/2) for fg in (f,g))
k4, l4 = (fg(t+dt , x+l3*dt , v+k3*dt ) for fg in (f,g))
v = v + dt/6*(k1+2*k2+2*k3+k4)
x = x + dt/6*(l1+2*l2+2*l3+l4)
t = t + dt
return([t,x,v])
def sedol(t, x, v, tf, dt):
sol = [[t,x,v]]
while t<=tf:
t,x,v = srunge4(t,x,v,dt)
sol.append([t,x,v])
sol = pd.DataFrame(data=np.asarray(sol) , columns=['t','x','v'])
return(sol)
ft_runge = sedol(t0, x0, v0, tf, dt=2*dt)
plt.close("all")
fig,ax = plt.subplots(1,3)
ft_runge.plot(x='t', y='x', ax=ax[0])
ft_runge.plot(x='t', y='v', ax=ax[1])
ft_runge.plot.scatter(x='x', y='v', s=1, ax=ax[2])
plt.show()
It produces the expected ellipse without visually recognizable changes in the amplitudes.
I am building machine learning models for a certain data set. Then, based on the constraints and bounds for the outputs and inputs, I am trying to find the input parameters for the most minimized answer.
The problem which I am facing is that, when the model is a linear regression model or something like lasso, the minimization works perfectly fine.
However, when the model is "Decision Tree", it constantly returns the very initial value that is given to it. So basically, it does not enforce the constraints.
import numpy as np
import pandas as pd
from scipy.optimize import minimize
I am using the very first sample from the input data set for the optimization. As it is only one sample, I need to reshape it to (1,-1) as well.
x = df_in.iloc[0,:]
x = np.array(x)
x = x.reshape(1,-1)
This is my Objective function:
def objective(x):
x = np.array(x)
x = x.reshape(1,-1)
y = 0
for n in range(df_out.shape[1]):
y = Model[n].predict(x)
Y = y[0]
return Y
Here I am defining the bounds of inputs:
range_max = pd.DataFrame(range_max)
range_min = pd.DataFrame(range_min)
B_max=[]
B_min =[]
for i in range(range_max.shape[0]):
b_max = range_max.iloc[i]
b_min = range_min.iloc[i]
B_max.append(b_max)
B_min.append(b_min)
B_max = pd.DataFrame(B_max)
B_min = pd.DataFrame(B_min)
bnds = pd.concat([B_min, B_max], axis=1)
These are my constraints:
con_min = pd.DataFrame(c_min)
con_max = pd.DataFrame(c_max)
Here I am defining the constraint function:
def const(x):
x = np.array(x)
x = x.reshape(1,-1)
Y = []
for n in range(df_out.shape[1]):
y = Model[n].predict(x)[0]
Y.append(y)
Y = pd.DataFrame(Y)
a4 =[]
for k in range(Y.shape[0]):
a1 = Y.iloc[k,0] - con_min.iloc[k,0]
a2 = con_max.iloc[k, 0] - Y.iloc[k,0]
a3 = [a2,a1]
a4 = np.concatenate([a4, a3])
return a4
c = const(x)
con = {'type': 'ineq', 'fun': const}
This is where I try to minimize. I do not pick a method as the automatically picked model has worked so far.
sol = minimize(fun = objective, x0=x,constraints=con, bounds=bnds)
So the actual constraints are:
c_min = [0.20,1000]
c_max = [0.3,1600]
and the max and min range for the boundaries are:
range_max = [285,200,8,85,0.04,1.6,10,3.5,20,-5]
range_min = [215,170,-1,60,0,1,6,2.5,16,-18]
I think you should check the output of 'sol'. At times, the algorithm is not able to perform line search completely. To check for this, you should check message associated with 'sol'. In such a case, the optimizer returns initial parameters itself. There may be various reasons of this behavior. In a nutshell, please check the output of sol and act accordingly.
Arad,
If you have not yet resolved your issue, try using scipy.optimize.differential_evolution instead of scipy.optimize.minimize. I ran into similar issues, particularly with decision trees because of their step-like behavior resulting in infinite gradients.
I have the following code. It is taking forever in Python. There must be a way to translate this calculation into a broadcast...
def euclidean_square(a,b):
squares = np.zeros((a.shape[0],b.shape[0]))
for i in range(squares.shape[0]):
for j in range(squares.shape[1]):
diff = a[i,:] - b[j,:]
sqr = diff**2.0
squares[i,j] = np.sum(sqr)
return squares
You can use np.einsum after calculating the differences in a broadcasted way, like so -
ab = a[:,None,:] - b
out = np.einsum('ijk,ijk->ij',ab,ab)
Or use scipy's cdist with its optional metric argument set as 'sqeuclidean' to give us the squared euclidean distances as needed for our problem, like so -
from scipy.spatial.distance import cdist
out = cdist(a,b,'sqeuclidean')
I collected the different methods proposed here, and in two other questions, and measured the speed of the different methods:
import numpy as np
import scipy.spatial
import sklearn.metrics
def dist_direct(x, y):
d = np.expand_dims(x, -2) - y
return np.sum(np.square(d), axis=-1)
def dist_einsum(x, y):
d = np.expand_dims(x, -2) - y
return np.einsum('ijk,ijk->ij', d, d)
def dist_scipy(x, y):
return scipy.spatial.distance.cdist(x, y, "sqeuclidean")
def dist_sklearn(x, y):
return sklearn.metrics.pairwise.pairwise_distances(x, y, "sqeuclidean")
def dist_layers(x, y):
res = np.zeros((x.shape[0], y.shape[0]))
for i in range(x.shape[1]):
res += np.subtract.outer(x[:, i], y[:, i])**2
return res
# inspired by the excellent https://github.com/droyed/eucl_dist
def dist_ext1(x, y):
nx, p = x.shape
x_ext = np.empty((nx, 3*p))
x_ext[:, :p] = 1
x_ext[:, p:2*p] = x
x_ext[:, 2*p:] = np.square(x)
ny = y.shape[0]
y_ext = np.empty((3*p, ny))
y_ext[:p] = np.square(y).T
y_ext[p:2*p] = -2*y.T
y_ext[2*p:] = 1
return x_ext.dot(y_ext)
# https://stackoverflow.com/a/47877630/648741
def dist_ext2(x, y):
return np.einsum('ij,ij->i', x, x)[:,None] + np.einsum('ij,ij->i', y, y) - 2 * x.dot(y.T)
I use timeit to compare the speed of the different methods. For the comparison, I use vectors of length 10, with 100 vectors in the first group, and 1000 vectors in the second group.
import timeit
p = 10
x = np.random.standard_normal((100, p))
y = np.random.standard_normal((1000, p))
for method in dir():
if not method.startswith("dist_"):
continue
t = timeit.timeit(f"{method}(x, y)", number=1000, globals=globals())
print(f"{method:12} {t:5.2f}ms")
On my laptop, the results are as follows:
dist_direct 5.07ms
dist_einsum 3.43ms
dist_ext1 0.20ms <-- fastest
dist_ext2 0.35ms
dist_layers 2.82ms
dist_scipy 0.60ms
dist_sklearn 0.67ms
While the two methods dist_ext1 and dist_ext2, both based on the idea of writing (x-y)**2 as x**2 - 2*x*y + y**2, are very fast, there is a downside: When the distance between x and y is very small, due to cancellation error the numerical result can sometimes be (very slightly) negative.
Another solution besides using cdist is the following
difference_squared = np.zeros((a.shape[0], b.shape[0]))
for dimension_iterator in range(a.shape[1]):
difference_squared = difference_squared + np.subtract.outer(a[:, dimension_iterator], b[:, dimension_iterator])**2.
I am getting familiar with scikit and its pandas integration using the Titanic tutorial on Kaggle. I have cleaned my data and would like to make some prediction. I can do it calling a pipeline fit and transform - unfortunately I get an error trying to do the same with cross_val_score.
I am using the sklearn-pandas cross_val_score
The code is as follows:
mapping = [
('Age', None),
('Embarked',LabelBinarizer()),
('Fare',None),
('Pclass',LabelBinarizer()),
('Sex',LabelBinarizer()),
('Group',LabelBinarizer()),
('familySize',None),
('familyType',LabelBinarizer()),
('Title',LabelBinarizer())
]
pipe = Pipeline([
('featurize', DataFrameMapper(mapping)),
('logReg', LogisticRegression())
])
X = df_train[df_train.columns.drop('Survived')]
y = df_train['Survived']
#model = pipe.fit(X = X, y = y)
#prediction = model.predict(df_train)
score = cross_val_score(pipe, X = X, y = y, scoring = 'accuracy')
df_train is a Pandas dataframe containing all my training set, including outcomes. The two commented lines:
model = pipe.fit(X = X, y = y)
prediction = model.predict(df_train)
Work fine and prediction returns me an array with predicted outcomes. Using the same with cross_val_score, I get the following error:
X has 20 features per sample; expecting 19
Full code below, can be run with the Titanic CSV files on Kaggle (https://www.kaggle.com/c/titanic/data)
#%% Libraries import
import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
#%% Read the data
path = 'E:/Kaggle/Titanic/Data/'
file_training = 'train.csv'
file_test = 'test.csv'
#Import the training and test dataset and concatenate them
df_training = pd.read_csv(path + file_training, header = 0, index_col = 'PassengerId')
df_test = pd.read_csv(path + file_test, header = 0, index_col = 'PassengerId')
# Work on the concatenated training and test data for feature engineering and clean-up
df = pd.concat([df_training, df_test], keys = ['train','test'])
#%% Initial data exploration and cleaning
df.describe(include = 'all')
pd.isnull(df).sum() > 0
#%% Preprocesing and Cleanup
#Create new columns with the name (to identify individuals part of a family)
df['LName'] = df['Name'].apply(lambda x:x.split(',')[0].strip())
df['FName'] = df['Name'].apply(lambda x:x.split(',')[1].split('.')[1].strip())
#Get the title
df['Title'] = df['Name'].apply(lambda x:x.split(',')[1].split('.')[0].strip())
titleDic = {
'Master' : 'kid',
'Mlle' : 'unmarriedWoman',
'Miss' : 'unmarriedWoman',
'Ms' : 'unmarriedWoman',
'Jonkheer' : 'noble',
'Don' : 'noble',
'Dona' : 'noble',
'Sir' : 'noble',
'Lady' : 'noble',
'the Countess' : 'noble',
'Capt' : 'ranked',
'Major' : 'ranked',
'Col' : 'ranked',
'Mr' : 'standard',
'Mme' : 'standard',
'Mrs' : 'standard',
'Dr' : 'academic',
'Rev' : 'academic'
}
df['Group'] = df['Title'].map(titleDic)
#%% Working with the family size
#Get the family size
df['familySize'] = df['Parch'] + df['SibSp'] + 1
#Add a family tag (single, couple, small, large)
df['familyType'] = pd.cut(df['familySize'],
[1,2,3,5,np.inf],
labels = ['single','couple','sFamily','bFamily'],
right = False)
#%% Filling empty values
#Fill empty values with the mean or mode for the column
#Fill the missing values with mean for age per title, class and gender. Store value in AgeFull variable
agePivot = pd.DataFrame(df.groupby(['Group', 'Sex'])['Age'].median())
agePivot.columns = ['AgeFull']
df = pd.merge(df, agePivot, left_on = ['Group', 'Sex'], right_index = True)
df.loc[df['Age'].isnull(),['Age']] = df['AgeFull']
#Embark location missing values
embarkPivot = pd.DataFrame(df.groupby(['Group'])['Embarked'].agg(lambda x:x.value_counts().index[0]))
embarkPivot.columns = ['embarkFull']
df = pd.merge(df, embarkPivot, left_on = ['Group'], right_index = True)
df.loc[df['Embarked'].isnull(),['Embarked']] = df['embarkFull']
#Fill the missing fare value
df.loc[df['Fare'].isnull(), 'Fare'] = df['Fare'].mean()
#%% Final clean-up (drop temporary columns)
df = df.drop(['AgeFull', 'embarkFull'], 1)
#%% Preparation for training
df_train = df.loc['train']
df_test = df.loc['test']
#Creation of dummy variables
mapping = [
('Age', None),
('Embarked',LabelBinarizer()),
('Fare',None),
('Pclass',LabelBinarizer()),
('Sex',LabelBinarizer()),
('Group',LabelBinarizer()),
('familySize',None),
('familyType',LabelBinarizer()),
('Title',LabelBinarizer())
]
pipe = Pipeline(steps = [
('featurize', DataFrameMapper(mapping)),
('logReg', LogisticRegression())
])
#Uncommenting the line below fixes the code - why?
#df_train = df_train.sort_index()
X = df_train[df_train.columns.drop(['Survived'])]
y = df_train.Survived
score = cross_val_score(pipe, X = df_train, y = df_train.Survived, scoring = 'accuracy')
This is very interesting. I have solved the issue just by sorting using the index the DataFrame before passing it to the cross_val_score in the pipeline.
df_train = df_train.sort_index()
Could anyone explain me why this would have an impact on how Scikit is working?