np.dot - weights are not being applied to inputs - numpy

Trying to use np.dot function to multiply annual returns with weights in a portfolio to return portfolio performance
import numpy as np
import pandas as pd
from pandas_datareader import data as pdr
import matplotlib.pyplot as plt
import yfinance as yf
yf.pdr_override()
y_symbols = ['PG','MSFT', 'F', 'GE']
from datetime import datetime
startdate = datetime(1995,1,3)
enddate = datetime(2017,3,24)
data = pdr.get_data_yahoo(y_symbols, start =startdate, end =enddate)['Adj Close']
returns = (data/data.shift(1)) - 1
annual_returns = returns.mean() * 252
annual_returns
F 0.118506
GE 0.127551
MSFT 0.197452
PG 0.129486
dtype: float64
weights = np.array([0.4, 0.4, 0.15, 0.05])
np.dot = (annual_returns, weights)
(F 0.118506
GE 0.127551
MSFT 0.197452
PG 0.129486
dtype: float64,
array([0.4, 0.4, 0.15, 0.05])
I would expect to see a single number: the sum of each stock's average annual return multiplied by its weight.
Any idea why I am not seeing a single weighted average here?

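You set the np.dot value to a tuple: the line np.dot = (annual_returns, weights) is an assignment, so it rebinds the name np.dot to the tuple (annual_returns, weights), which is exactly what was echoed back. You should be making a function call instead by calling np.dot(annual_returns, weights). Because the assignment overwrote the function, restart the kernel/interpreter first; otherwise the call fails with "TypeError: 'tuple' object is not callable".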

Related

Gaussian rolling weights pandas

Suppose that I have a pandas series of data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
n = 1000
srs = pd.Series(np.random.random(n))
I wish to now roll a Gaussian filter through this data such that the weights look like:
window = 100
x = np.arange(window)
mu = 60
sigma = 0.2
y = np.exp(-(x-mu)**2 / 2*sigma**2) / np.sqrt(2*np.pi*sigma**2)
plt.plot(x,y)
That is to say, for each window of length 100 the 60th entry has the maximum weight and the other entries decay as per the Gaussian formulation.
Is this possible with .rolling()?
You can use numpy.average to take a weighted mean:
import numpy as np
import pandas as pd
n = 1000
window_size = 100
srs = pd.Series(np.random.random(n))
mu = 60
sigma = 0.2
x = np.arange(window_size)
# Gaussian weights centred on index `mu` of each window.
# The denominator must be parenthesised: `/ 2*sigma**2` would divide by 2 and
# then MULTIPLY by sigma**2, which is not the Gaussian exponent.
# With sigma = 0.2 (in samples) almost all weight sits on index `mu`;
# increase sigma for a wider kernel.
weights = np.exp(-(x - mu)**2 / (2 * sigma**2)) / np.sqrt(2 * np.pi * sigma**2)
# np.average normalises by weights.sum(), so the constant prefactor above does
# not change the result. The first window_size - 1 entries are NaN because the
# window is not yet full.
smoothed = srs.rolling(window_size).apply(lambda wndw: np.average(wndw, weights=weights))
smoothed
This is the same as:
# Same weighted mean written out by hand; uses window_size, the name defined in the snippet above.
srs.rolling(window_size).apply(lambda wndw: (wndw*weights).sum()/weights.sum())
Note that this will break if you pass a min_periods smaller than the window size, since np.average requires a and weights to have the same length.

Simple logistic regression with Statsmodels: Adding an intercept and visualizing the logistic regression equation

Using Statsmodels, I am trying to generate a simple logistic regression model to predict whether a person smokes or not (Smoke) based on their height (Hgt).
I have a feeling that an intercept needs to be included into the logistic regression model but I am not sure how to implement one using the add_constant() function. Also, I am unsure why the error below is generated.
This is the dataset, Pulse.CSV: https://drive.google.com/file/d/1FdUK9p4Dub4NXsc-zHrYI-AGEEBkX98V/view?usp=sharing
The full code and output are in this PDF file: https://drive.google.com/file/d/1kHlrAjiU7QvFXF2a7tlTSFPgfpq9bOXJ/view?usp=sharing
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
raw_data = pd.read_csv('Pulse.csv')
raw_data
x1 = raw_data['Hgt']
y = raw_data['Smoke']
reg_log = sm.Logit(y,x1,missing='Drop')
results_log = reg_log.fit()
def f(x, b0, b1):
    """Return the logistic function 1 / (1 + exp(-(b0 + x*b1))) as a NumPy array.

    x  : scalar or array-like of predictor values
    b0 : intercept
    b1 : slope
    """
    z = np.asarray(b0 + x * b1, dtype=float)
    # exp(-|z|) never overflows, so both branches stay finite; the naive
    # exp(z) / (1 + exp(z)) evaluates to inf/inf = nan for large positive z.
    e = np.exp(-np.abs(z))
    return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
f_sorted = np.sort(f(x1,results_log.params[0],results_log.params[1]))
x_sorted = np.sort(np.array(x1))
plt.scatter(x1,y,color='C0')
plt.xlabel('Hgt', fontsize = 20)
plt.ylabel('Smoked', fontsize = 20)
plt.plot(x_sorted,f_sorted,color='C8')
plt.show()
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
4729 try:
-> 4730 return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
4731 except KeyError as e1:
((( Truncated for brevity )))
IndexError: index out of bounds
Statsmodels does not add an intercept by default, but if you need one you can include it manually with sm.add_constant().
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
raw_data = pd.read_csv('Pulse.csv')
raw_data
x1 = raw_data['Hgt']
y = raw_data['Smoke']
x1 = sm.add_constant(x1)
reg_log = sm.Logit(y,x1,missing='Drop')
results_log = reg_log.fit()
results_log.summary()
def f(x, b0, b1):
    """Return the logistic function 1 / (1 + exp(-(b0 + x*b1))) as a NumPy array.

    x  : scalar or array-like of predictor values
    b0 : intercept
    b1 : slope
    """
    z = np.asarray(b0 + x * b1, dtype=float)
    # exp(-|z|) never overflows, so both branches stay finite; the naive
    # exp(z) / (1 + exp(z)) evaluates to inf/inf = nan for large positive z.
    e = np.exp(-np.abs(z))
    return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
# After sm.add_constant, x1 is a DataFrame with columns 'const' and 'Hgt';
# only the 'Hgt' column belongs on the x-axis.
hgt = x1['Hgt']
x_sorted = np.sort(np.array(hgt))
# Evaluate the curve ON the sorted heights rather than sorting x and f(x)
# independently: independent sorting mispairs the points when the slope is negative.
# Coefficients are selected by label; params is indexed by the exog column names.
f_sorted = f(x_sorted, results_log.params['const'], results_log.params['Hgt'])
plt.scatter(hgt,y,color='C0')
plt.xlabel('Hgt', fontsize = 20)
plt.ylabel('Smoked', fontsize = 20)
plt.plot(x_sorted,f_sorted,color='C8')
plt.show()
This will also resolve the error: there was no intercept in your initial code, so results_log.params held a single coefficient and results_log.params[1] was out of bounds. Source

'tuple' object has no attribute 'reshape'

I used a dataset "ex1data1.txt", but when I am running it to convert, it is showing the following error:
AttributeError Traceback (most recent call last)
<ipython-input-52-7c523f7ba9e1> in <module>()
1 # Converting loaded dataset into numpy array
2
----> 3 X = np.concatenate((np.ones(len(population)).reshape(len(population), 1), population.reshape(len(population),1)), axis=1)
4
5
AttributeError: 'tuple' object has no attribute 'reshape'
The code is given below:
import csv
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd
import numpy as np
# Loading Dataset
with open('ex1data1.txt') as csvfile:
population, profit = zip(*[(float(row['Population']), float(row['Profit'])) for row in csv.DictReader(csvfile)])
# Creating DataFrame
df = pd.DataFrame()
df['Population'] = population
df['Profit'] = profit
# Plotting using Seaborn
sns.lmplot(x="Population", y="Profit", data=df, fit_reg=False, scatter_kws={'s':45})
# Converting loaded dataset into numpy array
X = np.concatenate((np.ones(len(population)).reshape(len(population), 1), population.reshape(len(population),1)), axis=1)
y = np.array(profit).reshape(len(profit), 1)
# Creating theta matrix , theta = [[0], [0]]
theta = np.zeros((2, 1))
# Learning rate
alpha = 0.1
# Iterations to be taken
iterations = 1500
# Updated theta and calculated cost
theta, cost = gradientDescent(X, y, theta, alpha, iterations)
I don't know how to solve this reshape problem. Can anyone tell how can I solve this problem?
From your definition, population is a tuple (that is what zip(*...) returns), and tuples have no reshape method. I'd suggest two options. The first is converting it to an array, i.e.
population = np.asarray(population)
Alternatively, you can use the DataFrame column .values attribute, which is essentially a numpy array:
X = np.concatenate((np.ones(len(population)).reshape(len(population), 1), df['Population'].values.reshape(len(population),1)), axis=1)

convert pandas datetime64[ns] to matplotlib date-float for date x-axis in seaborn tsplot

Ok I'm trying to do something that should be trivial but instead I've spent more time than I'd like to admit searching google and stack overflow only to become more frustrated.
What I'm trying to do: I'd like to format my x-axis on a seaborn tsplot.
What my Stack Overflow searching has told me: matplotlib has a set_major_formatter function, but I can't seem to use it without tripping an overflow error.
What I'm looking for: a simple way to convert datetime64[ns] to a float that can be used with matplotlib's set_major_formatter.
Where I think I'm stuck:
df.date_action = df.date_action.values.astype('float')
# converts the field to a float, but matplotlib expects days (as a float) since its own epoch (0001-01-01 in older releases), not nanoseconds since the Unix epoch
is there a simple way to do this that I'm missing?
the most helpful post I reviewed so far was
31255815 which got me 95% of the way there but not quite
here is some sample code to illustrate the issue
# standard imports
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
## generate fake data
from datetime import timedelta, date
import random
def daterange(start_date, end_date):
    """Yield each date from start_date up to, but not including, end_date.

    Yields nothing when end_date is on or before start_date.
    """
    # timedelta.days is already an int, so no conversion is needed.
    for n in range((end_date - start_date).days):
        yield start_date + timedelta(n)
start_date = date(2013, 1, 1)
end_date = date(2018, 6, 2)
date_list = []
number_list = []
for single_date in daterange(start_date, end_date):
date_list.append(single_date)
if len(number_list) > 0:
number_list.append(random.random() + number_list[-1])
else:
number_list.append(random.random())
df = pd.DataFrame(data={'date_action': date_list, 'values': number_list})
# note my actual data comes in as a datetime64[ns]
df['date_action'] = df['date_action'].astype('datetime64[ns]')
# the following looked promising but is still offset an incorrect amount
#df.date_action = df.date_action.values.astype('float')
#df.date_action = df.date_action.to_datetime
## chart stuff
plt.clf()
import matplotlib.dates as mdates
df['dummy_01'] = 0
rows = 1
cols = 1
fig, axs = plt.subplots(nrows=rows, ncols=cols, figsize=(10, 8))
ax1 = plt.subplot2grid((rows, cols), (0, 0))
for i in [ax1]: # trying to format x-axis
pass
i.xaxis_date()
i.xaxis.set_major_locator(mdates.AutoDateLocator())
i.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
sns.tsplot(df, time='date_action', unit='dummy_01',
value='values', ax=ax1) #
plt.plot()
plt.show()

Python - Finance Matplotlib related

I'm new to Python and I'm testing the matplotlib finance module.
I need to get the price and date values when the ma20 = ma50
Give me a clue on how to do this.
Here is my code:
# Modules
import datetime
import numpy as np
import matplotlib.finance as finance
import matplotlib.mlab as mlab
import matplotlib.pyplot as plot
# Define quote
startdate = datetime.date(2005,1,1)
today = enddate = datetime.date.today()
ticker = 'nvda'
# Catch CSV
fh = finance.fetch_historical_yahoo(ticker, startdate, enddate)
# From CSV to RECARRAY
r = mlab.csv2rec(fh); fh.close()
# Order by Desc
r.sort()
### Methods Begin
def moving_average(x, n, type='simple'):
    """
    compute an n period moving average.

    type is 'simple' | 'exponential'

    Returns a float array of the same length as x. The first n entries are
    back-filled with the value at index n. If x has n or fewer samples there
    is no value at index n, so an all-NaN array of len(x) is returned instead
    of raising.
    """
    x = np.asarray(x)
    if len(x) <= n:
        # Also covers empty input, which np.convolve would reject.
        return np.full(len(x), np.nan)
    if type == 'simple':
        weights = np.ones(n)
    else:
        # NOTE(review): np.convolve reverses the kernel, so weights[0] (the
        # smallest) is applied to the newest sample -- confirm this is intended.
        weights = np.exp(np.linspace(-1., 0., n))
    weights /= weights.sum()
    # mode='full' output is longer than x; keep only the first len(x) values.
    a = np.convolve(x, weights, mode='full')[:len(x)]
    # Overwrite the warm-up region, where the window is not yet full.
    a[:n] = a[n]
    return a
### Methods End
prices = r.adj_close
dates = r.date
ma20 = moving_average(prices, 20, type='simple')
ma50 = moving_average(prices, 50, type='simple')
plot.plot(prices)
plot.plot(ma20)
plot.plot(ma50)
plot.show()
Since you are using numpy, you can use numpy's boolean indexing for arrays:
equal = ma20==ma50
print(dates[equal])
print(prices[equal])
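'equal' is a boolean array of the same length as dates and prices. Numpy then picks from dates and prices only those entries where equal==True, or, equivalently, ma20==ma50. Note that the averages are floating-point values, so exact equality is rare in practice; to find where the two averages cross, look for sign changes of their difference instead, e.g. cross = np.flatnonzero(np.diff(np.sign(ma20 - ma50))) + 1, and then index dates[cross] and prices[cross].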