difference between np.linalg.lstsq and linear regression in scikit learn - pandas

comb 1 is a pandas data frame with following values.
yearID teamID salary W
408 ANA 51464167 82
409 ARI 81027833 85
When I use np.linalg.lstsq I am able to print dfg data frame.
dfg = pd.DataFrame()
comb1 = combined[combined['yearID'] == 2000]
x1 = comb1['salary'].values /1000000
y1 =comb1['W'].values
A1 = np.array([x1, np.ones(len(x1))])
w1 = np.linalg.lstsq(A1.T,y1)[0]
yq = (w1[0]*x1+w1[1])
dfg['New val'] = y1 - yq
When I use scikit learn libary for the linear regression and do the same operation I am getting a value error
from sklearn.linear_model import LinearRegression
fg = pd.DataFrame()
x2 = comb1['salary'].values /1000000
y2 =comb1['W'].values
x2_reshape = x2.reshape(-1,1)
y2_reshape = y2.reshape(-1,1)
clf1 = LinearRegression()
clf1.fit(x2_reshape, y2_reshape)
predicted_train = clf1.predict(x2_reshape)
x_pre = y2 - predicted_train
fg['New val'] = x_pre
What is the difference between these two ?? kindly help me!!

They should be the same:
Notes
From the implementation point of view, this is just plain Ordinary Least Squares (scipy.linalg.lstsq) wrapped as a predictor object.
If you getting an error, it's probably because of they way you set up your data.

Related

Matplotlib output too small to read

In adjusting the domain of a function to find certain parameters in a matplotlib plot, I found that when I try to isolate the part I need, the output becomes so small that details are impossible to see. I've tried refreshing the kernel with no change and plt.rcParams['figure.figsize'] hasn't been effective either.
This is my current code, with unused options in the function removed.
import numpy as np
import matplotlib.pyplot as plt
def P_cubic(V,T,Tc,Pc,ParamSet,omega=0):
R = 8.31446261815324 #J mol^-1 K^-1
Tr = T/Tc
if ParamSet == 'vdW':
elif ParamSet == 'RK':
elif ParamSet == 'SRK':
elif ParamSet == 'PR':
alpha = (1+(0.37464+1.54226*omega-0.26992*omega**2)*
(1-Tr**(1/2)))**2
sigma = 1+np.sqrt(2)
epsilon = 1-np.sqrt(2)
Omega = 0.07780
Psi = 0.45724
Zc = 0.30740
a = Psi*alpha*R**2*Tc**2/Pc
b = Omega*T*Tc/Pc #m3 mol-1
P = R*T/(V-b)-a/((V+epsilon*b)*(V+sigma*b))
return P
Tc = 512.5 #K
Pc = 8.0840E6 #Pa
omega = 0.565831
T = 473 #K
b = 0.07780*T*Tc/Pc #m3 mol-1
V = np.arange(0,1,0.001)
Vrange = b*V #m3 mol-1
PPa = np.empty(len(Vrange))
for i in range(len(Vrange)):
PPa[i]=P_cubic(Vrange[i],T,Tc,Pc,'PR',omega) #Pa
Pbar = PPa*1.0E-5 #bar
plt.rcParams['figure.figsize']=(1,0.8)
plt.plot(V,Pbar)
plt.xlabel('V/b')
plt.ylabel('P /bar')
plt.xlim(0,np.max(V))
plt.ylim(np.min(Pbar),np.max(Pbar))
plt.title('Variance of Pressure with Volume of Pure Methanol at 473 K')
plt.text(15,-6,f'b = {b:.2E} m^3/mol');
Below are screenshots with the output at varying figsize parameters to show that plt.rcParams['figure.figsize'] is not helping.
How do I fix this so that I can see the details of the plot?
There are two reasons for this. First, the size unit of the graph is inches, so the specified number itself is small, resulting in a smaller graph. Secondly, the default coordinates of the annotations are based on the data, so the x-value is 15, which is far from the graph, so the figure is automatically smaller. So, I think you need to set the graph size and fix the x-value of the annotations.
fig, ax = plt.subplots()
plt.rcParams['figure.figsize']=(8,4)
ax.plot(V,Pbar)
plt.xlabel('V/b')
plt.ylabel('P /bar')
plt.xlim(0,np.max(V))
plt.ylim(np.min(Pbar),np.max(Pbar))
plt.title('Variance of Pressure with Volume of Pure Methanol at 473 K')
plt.text(1.1,-6,f'b = {b:.2E} m^3/mol')
#plt.text(1.1,-6,f'b = {b:.2E} m^3/mol', transform=ax.transData)
plt.show()

A Simple Bayesian Network with a Coin-Flipping Problem

I am trying to implement a Bayesian network and solve a regression problem using PYMC3. In my model, I have a fair coin as the parent node. If the parent node is H, the child node selects the normal distribution N(5,0.2); if T, the child selects N(0,0.5). Here is an illustration of my network.
To simulate this network, I generated a sample dataset and tried doing Bayesian regression using the code below. Currently, the model does regression only for the child node as if the parent node does not exist. I would greatly appreciate it if anyone can let me know how to implement the conditional probability P(D|C). Ultimately, I am interested in finding the probability distribution for mu1 and mu2. Thank you!
# Generate data for coin flip P(C) and store in c1
theta_real = 0.5 # unkown value in a real experiment
n_sample = 10
c1 = bernoulli.rvs(p=theta_real, size=n_sample)
# Generate data for normal distribution P(D|C) and store in d1
np.random.seed(123)
mu1 = 0
sigma1 = 0.5
mu2 = 5
sigma2 = 0.2
d1 = []
for index, item in enumerate(c1):
if item == 0:
d1.extend(normal(mu1, sigma1, 1))
else:
d1.extend(normal(mu2, sigma2, 1))
# I start building PYMC3 model here
c1_tensor = theano.shared(np.array(c1))
d1_tensor = theano.shared(np.array(d1))
with pm.Model() as model:
# define prior for c1. I am not sure how to do this.
#c1_present = pm.Categorical('c1',observed=c1_tensor)
# how do I incorporate P(D | C)
mu_prior = pm.Normal('mu', mu=2, sd=2, shape=1)
sigma_prior = pm.HalfNormal('sigma', sd=2, shape=1)
y_likelihood = pm.Normal('y', mu=mu_prior, sd=sigma_prior, observed=d1_tensor)
You could use the Dirichlet distribution as a prior for the coin toss and NormalMixture as the prior of the two Gaussians. In the following snippet I changed the fairness of the coin and increased the number of coin tosses, but you could adjust these in any way want:
import numpy as np
import pymc3 as pm
from scipy.stats import bernoulli
# Generate data for coin flip P(C) and store in c1
theta_real = 0.2 # unkown value in a real experiment
n_sample = 2000
c1 = bernoulli.rvs(p=theta_real, size=n_sample)
# Generate data for normal distribution P(D|C) and store in d1
np.random.seed(123)
mu1 = 0
sigma1 = 0.5
mu2 = 5
sigma2 = 0.2
d1 = []
for index, item in enumerate(c1):
if item == 0:
d1.extend(np.random.normal(mu1, sigma1, 1))
else:
d1.extend(np.random.normal(mu2, sigma2, 1))
with pm.Model() as model:
w = pm.Dirichlet('p', a=np.ones(2))
mu = pm.Normal('mu', 0, 20, shape=2)
sigma = np.array([0.5,0.2])
pm.NormalMixture('like',w=w,mu=mu,sigma=sigma,observed=np.array(d1))
trace = pm.sample()
pm.summary(trace)
This will give you the following:
mean sd mc_error hpd_2.5 hpd_97.5 n_eff Rhat
mu__0 4.981222 0.023900 0.000491 4.935044 5.027420 2643.052184 0.999637
mu__1 -0.007660 0.004946 0.000095 -0.017388 0.001576 2481.146286 1.000312
p__0 0.213976 0.009393 0.000167 0.195602 0.231803 2245.905021 0.999302
p__1 0.786024 0.009393 0.000167 0.768197 0.804398 2245.905021 0.999302
The parameters are recovered nicely as you can also see from the traceplots:
The above implementation will give you the posterior of theta_real, mu1 and mu2 but I could not get convergence when I added sigma1 and sigma2 as parameters to be estimated by the data (even though the prior was quite narrow):
with pm.Model() as model:
w = pm.Dirichlet('p', a=np.ones(2))
mu = pm.Normal('mu', 0, 20, shape=2)
sigma = pm.HalfNormal('sigma', sd=2, shape=2)
pm.NormalMixture('like',w=w,mu=mu,sigma=sigma,observed=np.array(d1))
trace = pm.sample()
print(pm.summary(trace))
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma, mu, p]
Sampling 4 chains: 100%|██████████| 4000/4000 [00:10<00:00, 395.57draws/s]
The acceptance probability does not match the target. It is 0.883057127209148, but should be close to 0.8. Try to increase the number of tuning steps.
The gelman-rubin statistic is larger than 1.4 for some parameters. The sampler did not converge.
The estimated number of effective samples is smaller than 200 for some parameters.
mean sd mc_error ... hpd_97.5 n_eff Rhat
mu__0 1.244021 2.165433 0.216540 ... 5.005507 2.002049 212.596596
mu__1 3.743879 2.165122 0.216510 ... 5.012067 2.002040 235.750129
p__0 0.643069 0.248630 0.024846 ... 0.803369 2.004185 30.966189
p__1 0.356931 0.248630 0.024846 ... 0.798632 2.004185 30.966189
sigma__0 0.416207 0.125435 0.012517 ... 0.504110 2.009031 17.333177
sigma__1 0.271763 0.125539 0.012533 ... 0.497208 2.007779 19.217223
[6 rows x 7 columns]
Based on that you most likely will need to reparametrize if you also wanted to estimate the two standard deviations from this data.
This answer is to supplement #balleveryday's answer, which suggests the Gaussian Mixture Model, but had some trouble getting the symmetry breaking to work. Admittedly, the symmetry breaking in the official example is done in the context of Metropolis-Hastings sampling, whereas I think NUTS might be a little more sensitive to encountering impossible values (not sure). Here's what worked for me:
import numpy as np
import pymc3 as pm
from scipy.stats import bernoulli
import theano.tensor as tt
# everything should reproduce
np.random.seed(123)
n_sample = 2000
# Generate data for coin flip P(C) and store in c1
theta_real = 0.2 # unknown value in a real experiment
c1 = bernoulli.rvs(p=theta_real, size=n_sample)
# Generate data for normal distribution P(D|C) and store in d1
mu1, mu2 = 0, 5
sigma1, sigma2 = 0.5, 0.2
d1 = np.empty_like(c1, dtype=np.float64)
d1[c1 == 0] = np.random.normal(mu1, sigma1, np.sum(c1 == 0))
d1[c1 == 1] = np.random.normal(mu2, sigma2, np.sum(c1 == 1))
with pm.Model() as gmm_asym:
# mixture vector
w = pm.Dirichlet('p', a=np.ones(2))
# Gaussian parameters (testval helps start off ordered)
mu = pm.Normal('mu', 0, 20, shape=2, testval=[-10, 10])
sigma = pm.HalfNormal('sigma', sd=2, shape=2)
# break symmetry, forcing mu[0] < mu[1]
order_means_potential = pm.Potential('order_means_potential',
tt.switch(mu[1] - mu[0] < 0, -np.inf, 0))
# observed
pm.NormalMixture('like', w=w, mu=mu, sigma=sigma, observed=d1)
# reproducible sampling
tr_gmm_asym = pm.sample(tune=2000, target_accept=0.9, random_seed=20191121)
This produces samples with the statistics
mean sd mc_error hpd_2.5 hpd_97.5 n_eff Rhat
mu__0 0.004549 0.011975 0.000226 -0.017398 0.029375 2425.487301 0.999916
mu__1 5.007663 0.008993 0.000166 4.989247 5.024692 2181.134002 0.999563
p__0 0.789983 0.009091 0.000188 0.773059 0.808062 2417.356539 0.999788
p__1 0.210017 0.009091 0.000188 0.191938 0.226941 2417.356539 0.999788
sigma__0 0.497322 0.009103 0.000186 0.480394 0.515867 2227.397854 0.999358
sigma__1 0.191310 0.006633 0.000141 0.178924 0.204859 2286.817037 0.999614
and the traces

Numpy float128 is not giving a correct answer

I created a differential equation solver (Runge-Kutta 4th order method) in Python. Than I decided to check its results by setting the parameter mu to 0 and looking at the numeric solution that was returned by it.
The problem is, I know that this solution should give an stable oscillation as a result, but instead I get a diverging solution.
The code is presented below. I tried solving this problem (rounding errors from floating point precision) by using numpy float128 data type. But the solver keeps giving me the wrong answer.
The code is:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
def f(t,x,v):
f = -k/m*x-mu/m*v
return(f)
def g(t,x,v):
g = v
return(g)
def srunge4(t,x,v,dt):
k1 = f(t,x,v)
l1 = g(t,x,v)
k2 = f(t+dt/2, x+k1*dt/2, v+l1*dt/2)
l2 = g(t+dt/2, x+k1*dt/2, v+l1*dt/2)
k3 = f(t+dt/2, x+k2*dt/2, v+l2*dt/2)
l3 = g(t+dt/2, x+k2*dt/2, v+l2*dt/2)
k4 = f(t+dt/2, x+k3*dt, v+l3*dt)
l4 = g(t+dt/2, x+k3*dt, v+l3*dt)
v = v + dt/6*(k1+2*k2+2*k3+k4)
x = x + dt/6*(l1+2*l2+2*l3+l4)
t = t + dt
return([t,x,v])
mu = np.float128(0.00); k = np.float128(0.1); m = np.float128(6)
x0 = np.float128(5); v0 = np.float128(-10)
t0 = np.float128(0); tf = np.float128(1000); dt = np.float128(0.05)
def sedol(t, x, v, tf, dt):
sol = np.array([[t,x,v]], dtype='float128')
while sol[-1][0]<=tf:
t,x,v = srunge4(t,x,v,dt)
sol=np.append(sol,np.float128([[t,x,v]]),axis=0)
sol = pd.DataFrame(data=sol, columns=['t','x','v'])
return(sol)
ft_runge = sedol(t0, x0, v0, tf, dt=0.1)
plt.close("all")
graf1 = plt.plot(ft_runge.iloc[:,0],ft_runge.iloc[:,1],'b')
plt.show()
Am I using numpy float128 in a wrong way?
You are mixing in srunge4 the association of k and l to x and v. Per the function association and the final summation, the association should be (v,f,k) and (x,g,l). This has to be obeyed in the updates of the stages of the method.
In stage 4, it should be t+dt in the first argument. However, as t is not used in the derivative computation, this error has no consequence here.
Also, you are destroying the float128 computation if you set one parameter to a float in the default float64 type in dt=0.1.
The code with these corrections and some further simplifications is
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
mu = np.float128(0.00); k = np.float128(0.1); m = np.float128(6)
x0 = np.float128(5); v0 = np.float128(-10)
t0 = np.float128(0); tf = np.float128(1000); dt = np.float128(0.05)
def f(t,x,v): return -(k*x+mu*v)/m
def g(t,x,v): return v
def srunge4(t,x,v,dt): # should be skutta4, Wilhelm Kutta gave this method in 1901
k1, l1 = (fg(t,x,v) for fg in (f,g))
# here is the essential corrections, x->l, v->k
k2, l2 = (fg(t+dt/2, x+l1*dt/2, v+k1*dt/2) for fg in (f,g))
k3, l3 = (fg(t+dt/2, x+l2*dt/2, v+k2*dt/2) for fg in (f,g))
k4, l4 = (fg(t+dt , x+l3*dt , v+k3*dt ) for fg in (f,g))
v = v + dt/6*(k1+2*k2+2*k3+k4)
x = x + dt/6*(l1+2*l2+2*l3+l4)
t = t + dt
return([t,x,v])
def sedol(t, x, v, tf, dt):
sol = [[t,x,v]]
while t<=tf:
t,x,v = srunge4(t,x,v,dt)
sol.append([t,x,v])
sol = pd.DataFrame(data=np.asarray(sol) , columns=['t','x','v'])
return(sol)
ft_runge = sedol(t0, x0, v0, tf, dt=2*dt)
plt.close("all")
fig,ax = plt.subplots(1,3)
ft_runge.plot(x='t', y='x', ax=ax[0])
ft_runge.plot(x='t', y='v', ax=ax[1])
ft_runge.plot.scatter(x='x', y='v', s=1, ax=ax[2])
plt.show()
It produces the expected ellipse without visually recognizable changes in the amplitudes.

Scipy Optimize minimize returns the initial value

I am building machine learning models for a certain data set. Then, based on the constraints and bounds for the outputs and inputs, I am trying to find the input parameters for the most minimized answer.
The problem which I am facing is that, when the model is a linear regression model or something like lasso, the minimization works perfectly fine.
However, when the model is "Decision Tree", it constantly returns the very initial value that is given to it. So basically, it does not enforce the constraints.
import numpy as np
import pandas as pd
from scipy.optimize import minimize
I am using the very first sample from the input data set for the optimization. As it is only one sample, I need to reshape it to (1,-1) as well.
x = df_in.iloc[0,:]
x = np.array(x)
x = x.reshape(1,-1)
This is my Objective function:
def objective(x):
x = np.array(x)
x = x.reshape(1,-1)
y = 0
for n in range(df_out.shape[1]):
y = Model[n].predict(x)
Y = y[0]
return Y
Here I am defining the bounds of inputs:
range_max = pd.DataFrame(range_max)
range_min = pd.DataFrame(range_min)
B_max=[]
B_min =[]
for i in range(range_max.shape[0]):
b_max = range_max.iloc[i]
b_min = range_min.iloc[i]
B_max.append(b_max)
B_min.append(b_min)
B_max = pd.DataFrame(B_max)
B_min = pd.DataFrame(B_min)
bnds = pd.concat([B_min, B_max], axis=1)
These are my constraints:
con_min = pd.DataFrame(c_min)
con_max = pd.DataFrame(c_max)
Here I am defining the constraint function:
def const(x):
x = np.array(x)
x = x.reshape(1,-1)
Y = []
for n in range(df_out.shape[1]):
y = Model[n].predict(x)[0]
Y.append(y)
Y = pd.DataFrame(Y)
a4 =[]
for k in range(Y.shape[0]):
a1 = Y.iloc[k,0] - con_min.iloc[k,0]
a2 = con_max.iloc[k, 0] - Y.iloc[k,0]
a3 = [a2,a1]
a4 = np.concatenate([a4, a3])
return a4
c = const(x)
con = {'type': 'ineq', 'fun': const}
This is where I try to minimize. I do not pick a method as the automatically picked model has worked so far.
sol = minimize(fun = objective, x0=x,constraints=con, bounds=bnds)
So the actual constraints are:
c_min = [0.20,1000]
c_max = [0.3,1600]
and the max and min range for the boundaries are:
range_max = [285,200,8,85,0.04,1.6,10,3.5,20,-5]
range_min = [215,170,-1,60,0,1,6,2.5,16,-18]
I think you should check the output of 'sol'. At times, the algorithm is not able to perform line search completely. To check for this, you should check message associated with 'sol'. In such a case, the optimizer returns initial parameters itself. There may be various reasons of this behavior. In a nutshell, please check the output of sol and act accordingly.
Arad,
If you have not yet resolved your issue, try using scipy.optimize.differential_evolution instead of scipy.optimize.minimize. I ran into similar issues, particularly with decision trees because of their step-like behavior resulting in infinite gradients.

Evaluating the squared term of a gaussian kernel for having a covariance matrix for multi-dimensional inputs [duplicate]

I have the following code. It is taking forever in Python. There must be a way to translate this calculation into a broadcast...
def euclidean_square(a,b):
squares = np.zeros((a.shape[0],b.shape[0]))
for i in range(squares.shape[0]):
for j in range(squares.shape[1]):
diff = a[i,:] - b[j,:]
sqr = diff**2.0
squares[i,j] = np.sum(sqr)
return squares
You can use np.einsum after calculating the differences in a broadcasted way, like so -
ab = a[:,None,:] - b
out = np.einsum('ijk,ijk->ij',ab,ab)
Or use scipy's cdist with its optional metric argument set as 'sqeuclidean' to give us the squared euclidean distances as needed for our problem, like so -
from scipy.spatial.distance import cdist
out = cdist(a,b,'sqeuclidean')
I collected the different methods proposed here, and in two other questions, and measured the speed of the different methods:
import numpy as np
import scipy.spatial
import sklearn.metrics
def dist_direct(x, y):
d = np.expand_dims(x, -2) - y
return np.sum(np.square(d), axis=-1)
def dist_einsum(x, y):
d = np.expand_dims(x, -2) - y
return np.einsum('ijk,ijk->ij', d, d)
def dist_scipy(x, y):
return scipy.spatial.distance.cdist(x, y, "sqeuclidean")
def dist_sklearn(x, y):
return sklearn.metrics.pairwise.pairwise_distances(x, y, "sqeuclidean")
def dist_layers(x, y):
res = np.zeros((x.shape[0], y.shape[0]))
for i in range(x.shape[1]):
res += np.subtract.outer(x[:, i], y[:, i])**2
return res
# inspired by the excellent https://github.com/droyed/eucl_dist
def dist_ext1(x, y):
nx, p = x.shape
x_ext = np.empty((nx, 3*p))
x_ext[:, :p] = 1
x_ext[:, p:2*p] = x
x_ext[:, 2*p:] = np.square(x)
ny = y.shape[0]
y_ext = np.empty((3*p, ny))
y_ext[:p] = np.square(y).T
y_ext[p:2*p] = -2*y.T
y_ext[2*p:] = 1
return x_ext.dot(y_ext)
# https://stackoverflow.com/a/47877630/648741
def dist_ext2(x, y):
return np.einsum('ij,ij->i', x, x)[:,None] + np.einsum('ij,ij->i', y, y) - 2 * x.dot(y.T)
I use timeit to compare the speed of the different methods. For the comparison, I use vectors of length 10, with 100 vectors in the first group, and 1000 vectors in the second group.
import timeit
p = 10
x = np.random.standard_normal((100, p))
y = np.random.standard_normal((1000, p))
for method in dir():
if not method.startswith("dist_"):
continue
t = timeit.timeit(f"{method}(x, y)", number=1000, globals=globals())
print(f"{method:12} {t:5.2f}ms")
On my laptop, the results are as follows:
dist_direct 5.07ms
dist_einsum 3.43ms
dist_ext1 0.20ms <-- fastest
dist_ext2 0.35ms
dist_layers 2.82ms
dist_scipy 0.60ms
dist_sklearn 0.67ms
While the two methods dist_ext1 and dist_ext2, both based on the idea of writing (x-y)**2 as x**2 - 2*x*y + y**2, are very fast, there is a downside: When the distance between x and y is very small, due to cancellation error the numerical result can sometimes be (very slightly) negative.
Another solution besides using cdist is the following
difference_squared = np.zeros((a.shape[0], b.shape[0]))
for dimension_iterator in range(a.shape[1]):
difference_squared = difference_squared + np.subtract.outer(a[:, dimension_iterator], b[:, dimension_iterator])**2.