Gaussian rolling weights in pandas

Suppose that I have a pandas series of data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
n = 1000
srs = pd.Series(np.random.random(n))
I now wish to roll a Gaussian filter through this data such that the weights look like:
window = 100
x = np.arange(window)
mu = 60
sigma = 0.2
y = np.exp(-(x-mu)**2 / (2*sigma**2)) / np.sqrt(2*np.pi*sigma**2)
plt.plot(x,y)
That is to say, for each window of length 100, the 60th entry has the maximum weight and the other entries decay according to the Gaussian formula.
Is this possible with .rolling()?

You can use numpy.average to take a weighted mean:
import numpy as np
import pandas as pd
n = 1000
window_size = 100
srs = pd.Series(np.random.random(n))
mu = 60
sigma = 0.2
x = np.arange(window_size)
weights = np.exp(-(x-mu)**2 / (2*sigma**2)) / np.sqrt(2*np.pi*sigma**2)
srs.rolling(window_size).apply(lambda wndw: np.average(wndw, weights=weights))
This is the same as:
srs.rolling(window_size).apply(lambda wndw: (wndw*weights).sum()/weights.sum())
Note that this will break if you pass a min_periods smaller than window_size, since np.average requires a and weights to have the same length.
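If a Gaussian centered in the middle of the window is acceptable (unlike the off-center mu = 60 above), pandas can also generate the weights itself via win_type. This requires scipy to be installed, and the std value below is just an assumption (it is measured in samples):
import numpy as np
import pandas as pd
n = 1000
srs = pd.Series(np.random.random(n))
# pandas builds symmetric Gaussian weights via scipy.signal.get_window;
# std is the standard deviation of the window, in samples
smoothed = srs.rolling(window=100, win_type='gaussian').mean(std=10)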

Related

How to interpolate between the two values?

How do I interpolate Pressure_alt for every 100 lbs of the GW (gross weight) column, for each value of the grouped OAT (temperature)? I am new to Python; please help me find a solution.
import matplotlib.pyplot as plt
from scipy import interpolate
import numpy as np
x = np.arange(17300, 17403)   # note: np.arange(17402, 17330) is empty because start > stop
y = x.astype(float) ** 2      # placeholder relationship; the original x**100 overflows integer dtype
temp = interpolate.interp1d(x, y)
xnew = np.arange(17300, 17400, 100)   # query points must lie within the range of x
ynew = temp(xnew)
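As a minimal sketch of what per-group interpolation could look like, assuming a hypothetical DataFrame with the OAT, GW, and Pressure_alt columns named in the question (the data values here are invented):
import numpy as np
import pandas as pd
from scipy import interpolate
# hypothetical data frame using the column names from the question
df = pd.DataFrame({
    'OAT': [0, 0, 0, 10, 10, 10],
    'GW': [17000, 17500, 18000, 17000, 17500, 18000],
    'Pressure_alt': [2000, 2300, 2600, 2100, 2450, 2800],
})
# interpolate Pressure_alt at every 100 lbs of GW, separately for each OAT group
gw_new = np.arange(17000, 18001, 100)
result = {
    oat: interpolate.interp1d(grp['GW'], grp['Pressure_alt'])(gw_new)
    for oat, grp in df.groupby('OAT')
}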

np.dot - weights are not being applied to inputs

I am trying to use the np.dot function to multiply annual returns by portfolio weights to compute the portfolio's performance.
import numpy as np
import pandas as pd
from pandas_datareader import data as pdr
import matplotlib.pyplot as plt
import yfinance as yf
yf.pdr_override()
y_symbols = ['PG','MSFT', 'F', 'GE']
from datetime import datetime
startdate = datetime(1995,1,3)
enddate = datetime(2017,3,24)
data = pdr.get_data_yahoo(y_symbols, start =startdate, end =enddate)['Adj Close']
returns = (data/data.shift(1)) - 1
annual_returns = returns.mean() * 252
annual_returns
F 0.118506
GE 0.127551
MSFT 0.197452
PG 0.129486
dtype: float64
weights = np.array([0.4, 0.4, 0.15, 0.05])
np.dot = (annual_returns, weights)
(F 0.118506
GE 0.127551
MSFT 0.197452
PG 0.129486
dtype: float64,
array([0.4, 0.4, 0.15, 0.05]))
I would expect to see a single number: the average annual return of each stock times its weighting, summed.
Any idea why I am not seeing one average here?
You assigned a tuple to np.dot instead of calling the function. Make a function call instead: np.dot(annual_returns, weights).
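For illustration, here is the intended computation as a self-contained call, using the annual returns printed above rather than re-downloading the data:
import numpy as np
import pandas as pd
# annual returns as printed in the question
annual_returns = pd.Series({'F': 0.118506, 'GE': 0.127551, 'MSFT': 0.197452, 'PG': 0.129486})
weights = np.array([0.4, 0.4, 0.15, 0.05])
# calling np.dot (instead of assigning to it) yields the single portfolio return
portfolio_return = np.dot(annual_returns, weights)
print(portfolio_return)  # roughly 0.1345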

Unexplained "drops" in Savgol smoothing with higher polynomial for trends, stock, energy data (all kinds of time series basically!)

I have been trying to smooth curves with savgol_filter (from scipy, not scikit) and, in several of my attempts, raising the polynomial degree resulted in "drops" like the one I show below. This example is from Google Trends data, but I had similar problems with stock data and electricity consumption data. Any lead as to why it behaves this way, or how to solve it (and be able to raise the polynomial degree), would be highly appreciated.
Image below: "Sample output".
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from pytrends.request import TrendReq
pytrends = TrendReq(hl='en-US', tz=360)
from scipy.signal import savgol_filter
kw_list = ["Carbon footprint"]
pytrends.build_payload(kw_list, timeframe='2004-12-14 2019-12-25', geo='', gprop='')
da1 = pytrends.interest_over_time()
# (drop the last record for Savgol, which needs an odd window length; there were originally 196 records)
Y3 = da1["Carbon footprint"]
fig = plt.figure(figsize=(18,9))
l = Y3.shape[0]
l = l if l%2 == 1 else l-1
# window = odd number closest to size of data
ax1 = plt.subplot(2,1,1)
ax1 = sns.lineplot(data=Y3, color="navy")
#Savgol with polynomial order = 7 is fine (but misses the initial plateau)
Y3_smooth = savgol_filter(Y3,l, 7)
ax1 = sns.lineplot(x=da1.index.to_pydatetime(),y=Y3_smooth, color="red")
plt.title(f"red = with Savgol, polynomial order = 7, window = {l}", fontsize=18)
ax2 = plt.subplot(2,1,2)
ax2 = sns.lineplot(data=Y3, color="navy")
#Savgol with polynomial order = 9 or more has a weird drop
Y3_smooth = savgol_filter(Y3,l, 10)
ax2 = sns.lineplot(x=da1.index.to_pydatetime(),y=Y3_smooth, color="red")
plt.title(f"red = with Savgol, polynomial order = 10, window = {l}", fontsize=18)
Sample output
If anyone is interested, I found this workaround using a different way to smooth. It works well, including at the beginning and end of the series, and allows fine-tuning of the degree of smoothing.
from scipy.ndimage import gaussian_filter1d  # scipy.ndimage.filters is the deprecated path

def smooth(y, sigma=2):
    y_smooth = gaussian_filter1d(y, sigma)
    return y_smooth
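For example, applied to the trends series from the question (the sigma value here is an assumption; larger values smooth more strongly):
Y3_smooth = smooth(Y3, sigma=4)
ax2 = sns.lineplot(x=da1.index.to_pydatetime(), y=Y3_smooth, color="red")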

Use matplotlib to plot scikit learn linear regression results

How can you plot the linear regression results from scikit-learn after the analysis to see the "testing" data (real values vs. predicted values) at the end of the program? The code below is close, but I believe it is missing a scaling factor.
input:
import pandas as pd
import numpy as np
import datetime
pd.core.common.is_list_like = pd.api.types.is_list_like # temp fix
import fix_yahoo_finance as yf
from pandas_datareader import data, wb
from datetime import date
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split  # sklearn.cross_validation has been removed
import matplotlib.pyplot as plt
df = yf.download('MMM', start = date (2012, 1, 1), end = date (2018, 1, 1) , progress = False)
df_low = df[['Low']] # create a new df with only the low column
forecast_out = int(5) # predicting some days into future
df_low['low_prediction'] = df_low[['Low']].shift(-forecast_out) # create a new column based on the existing col but shifted some days
X_low = np.array(df_low.drop(['low_prediction'], axis=1))
X_low = preprocessing.scale(X_low) # scaling the input values
X_low_forecast = X_low[-forecast_out:] # set X_forecast equal to last 5 days
X_low = X_low[:-forecast_out] # remove last 5 days from X
y_low = np.array(df_low['low_prediction'])
y_low = y_low[:-forecast_out]
X_low_train, X_low_test, y_low_train, y_low_test = train_test_split(X_low, y_low, test_size=0.2)
clf_low = LinearRegression() # classifier
clf_low.fit(X_low_train, y_low_train) # training
confidence_low = clf_low.score(X_low_test, y_low_test) # testing
print("confidence for lows: ", confidence_low)
forecast_prediction_low = clf_low.predict(X_low_forecast)
print(forecast_prediction_low)
plt.figure(figsize = (17,9))
plt.grid(True)
plt.plot(X_low_test, color = "red")
plt.plot(y_low_test, color = "green")
plt.show()
image:
You plot y_low_test and X_low_test, but to compare target and predicted values you should plot y_low_test against clf_low.predict(X_low_test) instead.
BTW, clf_low in your code is not a classifier, it is a regressor. It's better to use the alias model instead of clf.
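A minimal sketch of the suggested comparison plot, assuming the variables from the question's code are in scope:
y_low_pred = clf_low.predict(X_low_test)  # predictions for the held-out test set
plt.figure(figsize=(17, 9))
plt.grid(True)
plt.plot(y_low_test, color="green", label="actual")
plt.plot(y_low_pred, color="red", label="predicted")
plt.legend()
plt.show()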

Fitting to Poisson histogram

I am trying to fit a curve over the histogram of a Poisson distribution that looks like this
I have modified the fit function so that it resembles a Poisson distribution, with the parameter t as a variable. But the curve_fit function can not be plotted and I am not sure why.
def histo(bsize):
    N = bsize
    # bin width
    bw = (dt.max() - dt.min()) / (N - 1.)
    bin1 = dt.min() + bw * np.arange(N)
    # define the array to hold the occurrence count
    bincount = np.array([])
    for bin in bin1:
        count = np.where((dt >= bin) & (dt < bin + bw))[0].size
        bincount = np.append(bincount, count)
    # bin center
    binc = bin1 + 0.5 * bw
    plt.figure()
    plt.plot(binc, bincount, drawstyle='steps-mid')
    plt.xlabel("Interval [ticks]")
    plt.ylabel("Frequency")
histo(30)
plt.xlim(0,.5e8)
plt.ylim(0,25000)
import numpy as np
from scipy.optimize import curve_fit
delta_t = 1.42e7
def func(x, t):
    return t * np.exp(-delta_t / t)
popt, pcov = curve_fit(func, np.arange(0,.5e8),histo(30))
plt.plot(popt)
The problem with your code is that you do not know what the return values of curve_fit are. They are the optimal parameters for the fit function and their covariance matrix - not something you can plot directly.
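If you did want to overlay your own fitted curve, you would evaluate the fit function at the desired x values and plot that instead, e.g.:
x = np.linspace(0, .5e8, 1000)
plt.plot(x, func(x, *popt))  # plot the fitted curve, not the parameters themselves
But see the cleaner, complete example below.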
Binned Least-Squares Fit
In general you can get everything much, much more easily:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.special import factorial
from scipy.stats import poisson
# get poisson deviated random numbers
data = np.random.poisson(2, 1000)
# the bins should be of integer width, because poisson is an integer distribution
bins = np.arange(11) - 0.5
entries, bin_edges, patches = plt.hist(data, bins=bins, density=True, label='Data')
# calculate bin centers
bin_centers = 0.5 * (bin_edges[1:] + bin_edges[:-1])
def fit_function(k, lamb):
    """poisson function, parameter lamb is the fit parameter"""
    return poisson.pmf(k, lamb)
# fit with curve_fit
parameters, cov_matrix = curve_fit(fit_function, bin_centers, entries)
# plot poisson-deviation with fitted parameter
x_plot = np.arange(0, 15)
plt.plot(
    x_plot,
    fit_function(x_plot, *parameters),
    marker='o', linestyle='',
    label='Fit result',
)
plt.legend()
plt.show()
This is the result:
Unbinned Maximum-Likelihood fit
An even better possibility would be to not use a histogram at all
and instead to carry out a maximum-likelihood fit.
But on closer examination even this is unnecessary, because the maximum-likelihood estimator for the parameter of the Poisson distribution is the arithmetic mean.
However, if you have other, more complicated PDFs, you can use this as example:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import minimize
from scipy.special import factorial
from scipy import stats
def poisson(k, lamb):
    """poisson pdf, parameter lamb is the fit parameter"""
    return (lamb**k / factorial(k)) * np.exp(-lamb)

def negative_log_likelihood(params, data):
    """the negative log-likelihood function"""
    lnl = -np.sum(np.log(poisson(data, params[0])))
    return lnl

def negative_log_likelihood(params, data):
    """better alternative using scipy; this overrides the definition above"""
    return -stats.poisson.logpmf(data, params[0]).sum()
# get poisson deviated random numbers
data = np.random.poisson(2, 1000)
# minimize the negative log-Likelihood
result = minimize(negative_log_likelihood,  # function to minimize
                  x0=np.ones(1),            # start value
                  args=(data,),             # additional arguments for function
                  method='Powell',          # minimization method, see docs
                  )
# result is a scipy optimize result object, the fit parameters
# are stored in result.x
print(result)
# plot poisson-distribution with fitted parameter
x_plot = np.arange(0, 15)
plt.plot(
    x_plot,
    stats.poisson.pmf(x_plot, result.x),
    marker='o', linestyle='',
    label='Fit result',
)
plt.legend()
plt.show()
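As a quick sanity check of the remark above that the maximum-likelihood estimator for the Poisson parameter is the arithmetic mean, the fitted value should essentially reproduce the sample mean:
print(result.x, data.mean())  # the two values should agree closely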
Thank you for the wonderful discussion!
You might want to consider the following:
1) Instead of computing "poisson", compute "log poisson", for better numerical behavior
2) Instead of using "lamb", use the logarithm (let me call it "log_mu"), to avoid the fit "wandering" into negative values of "mu".
So
def log_poisson(k, log_mu): return k*log_mu - loggamma(k+1) - math.exp(log_mu)
Where "loggamma" is the scipy.special.loggamma function.
Actually, in the above fit, the "loggamma" term only adds a constant offset to the functions being minimized, so one can just do:
def log_poisson_(k, log_mu): return k*log_mu - math.exp(log_mu)
NOTE: log_poisson_() is not the same as log_poisson(), but when used for minimization in the manner above it will give the same fitted minimum (the same value of mu, up to numerical issues). The value of the function being minimized will be offset, but one doesn't usually care about that anyway.
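Putting these suggestions together, a minimal sketch of the reparameterized fit (the names log_mu and negative_log_likelihood_log are illustrative, not from any library):
import numpy as np
from scipy.optimize import minimize
from scipy.special import loggamma

def log_poisson(k, log_mu):
    """log of the Poisson pmf, parameterized by log_mu = log(mu)"""
    return k * log_mu - loggamma(k + 1) - np.exp(log_mu)

def negative_log_likelihood_log(params, data):
    """negative log-likelihood in the log_mu parameterization"""
    return -np.sum(log_poisson(data, params[0]))

data = np.random.poisson(2, 1000)
result = minimize(negative_log_likelihood_log, x0=np.zeros(1), args=(data,),
                  method='Powell')
mu_fit = np.exp(result.x).item()  # transform back; mu stays positive by construction
print(mu_fit, data.mean())        # should agree closely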