How can I apply a couple of functions to multiple tickers in a list? (Code improvement) - pandas

I'm currently learning how to analyse financial data in Python using numpy, pandas, etc., and I'm starting off with a small script that will hopefully rank some chosen equities by their price change between two chosen dates.
My first script was:
import numpy as np
import pandas as pd
from pandas_datareader import data as web
from pandas import Series, DataFrame
import datetime
from operator import itemgetter
#Edit below for 2 dates you wish to calculate:
start = datetime.datetime(2014, 7, 15)
end = datetime.datetime(2017, 7, 25)
stocks = ('AAPL', 'GOOGL', 'YHOO', 'MSFT', 'AMZN', 'DAI')
#Getting the data:
AAPL = web.DataReader('AAPL', 'google', start, end)
GOOGL = web.DataReader('GOOGL', 'google', start, end)
YHOO = web.DataReader('YHOO', 'google', start, end)
MSFT = web.DataReader('MSFT', 'google', start, end)
AMZN = web.DataReader('AMZN', 'google', start, end)
DAI = web.DataReader('DAI', 'google', start, end)
#Calculating the change:
AAPLkey = (AAPL.ix[start]['Close'])/(AAPL.ix[end]['Close'])
GOOGLkey = (GOOGL.ix[start]['Close'])/(GOOGL.ix[end]['Close'])
YHOOkey = (YHOO.ix[start]['Close'])/(YHOO.ix[end]['Close'])
MSFTkey = (MSFT.ix[start]['Close'])/(MSFT.ix[end]['Close'])
AMZNkey = (AMZN.ix[start]['Close'])/(AMZN.ix[end]['Close'])
DAIkey = (DAI.ix[start]['Close'])/(DAI.ix[end]['Close'])
#Formatting the output in a sorted order:
dict1 = {"AAPL" : AAPLkey, "GOOGL" : GOOGLkey, "YHOO" : YHOOkey, "MSFT" : MSFTkey, "AMZN" : AMZNkey, "DAI" : DAIkey}
out = sorted(dict1.items(), key=itemgetter(1), reverse = True)
for tick , change in out:
print (tick,"\t", change)
I now obviously want to make this far shorter and this is what I've got so far:
import numpy as np
import pandas as pd
from pandas_datareader import data as web
from pandas import Series, DataFrame
import datetime
from operator import itemgetter
#Edit below for 2 dates you wish to calculate:
start = datetime.datetime(2014, 7, 15)
end = datetime.datetime(2017, 7, 25)
stocks = ('AAPL', 'GOOGL', 'YHOO', 'MSFT', 'AMZN', 'DAI')
for eq in stocks:
eq = web.DataReader(eq, 'google', start, end)
for legend in eq:
legend = (eq.ix[start]['Close'])/(eq.ix[end]['Close'])
print (legend)
The calculation works, BUT the problem is that this only outputs the value for the last item in the list (DAI).
So what's next in order to get the same result as my first code?

You can just move the print statement into the loop.
Like:
# Keep the print inside the loop body so a value is emitted on every pass.
# NOTE: iterating a DataFrame yields its column labels, and `legend` is
# overwritten immediately, so the same ratio is printed once per column.
for legend in eq:
    legend = (eq.loc[start]['Close'])/(eq.loc[end]['Close'])
    print(legend)
Improved answer:
Get rid of the inner label loop and print the values from the outer loop instead:
for eq in stocks:
    # Download this ticker's data for the start..end window.
    df = web.DataReader(eq, 'google', start, end)
    # Ratio of the close on the start date to the close on the end date.
    print((df.loc[start]['Close'])/(df.loc[end]['Close']))

When you loop over stocks with the line `for eq in stocks`, you are saving the results into eq, so it gets overwritten at each iteration. You should store the results in a list, like I have done using data.
Then loop over the data list which contains dataframes, and then use proper selection.
import numpy as np
import pandas as pd
from pandas_datareader import data as web
from pandas import Series, DataFrame
import datetime
from operator import itemgetter
# edit below for 2 dates you wish to calculate:
start = datetime.datetime(2014, 7, 15)
end = datetime.datetime(2017, 7, 25)
stocks = ('AAPL', 'GOOGL', 'YHOO', 'MSFT', 'AMZN', 'DAI')
# store all the dataframes in a list
data = []
for eq in stocks:
    data.append(web.DataReader(eq, 'google', start, end))
# print required fields from each dataframe
for df in data:
    # The division must happen inside the print call: `print (a)/(b)` only
    # works as a Python 2 print statement; on Python 3 it prints `a` and then
    # fails dividing the None returned by print().
    # `.loc` replaces `.ix`, which has been removed from pandas.
    print(df.loc[start]['Close'] / df.loc[end]['Close'])
Output:
0.624067042032
0.612014075932
0.613225417599
0.572179539021
0.340850298595
1.28323537643

Thanks to the other answers, they both helped a lot. This is my final improved script thanks to that help:
import numpy as np
import pandas as pd
from pandas_datareader import data as web
from pandas import Series, DataFrame
import datetime
from operator import itemgetter
# edit below for 2 dates you wish to calculate:
start = datetime.datetime(2014, 7, 15)
end = datetime.datetime(2017, 7, 25)
stocks = ('AAPL', 'GOOGL', 'YHOO', 'MSFT', 'AMZN', 'DAI')
# Map each ticker to the ratio of its close on `start` to its close on `end`.
dict1 = {}
for eq in stocks:
    df = web.DataReader(eq, 'google', start, end)
    k = ((df.loc[start]['Close'])/(df.loc[end]['Close']))
    dict1[eq] = k
# Rank the tickers from the largest ratio to the smallest.
out = sorted(dict1.items(), key=itemgetter(1), reverse=True)
for tick, change in out:
    print(tick, "\t", change)

Related

How to add avwap to pandas_ta?

I am trying to get anchored vwap from specific date using pandas_ta. How to set anchor to specific date?
import pandas as pd
import yfinance as yf
import pandas_ta as ta
from datetime import datetime, timedelta, date
import warnings
import plac
data = yf.download("aapl", start="2021-07-01", end="2022-08-01")
df = pd.DataFrame(data)
df1 = df.ta.vwap(anchor = "D")
df14 = pd.concat([df, df1],axis=1)
print(df14)
The anchor argument of pandas_ta's vwap depends on the index values, as the pandas-ta documentation says (reference):
anchor (str): How to anchor VWAP. Depending on the index values, it will
implement various Timeseries Offset Aliases as listed here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases
Default: "D".
In other words, you can't specify a particular anchor date the way TradingView does.
To anchor to a date ourselves:
import pandas as pd
import numpy as np
import yfinance as yf
import pandas_ta as ta
# set anchor date
anchored_date = pd.to_datetime('2022-01-30')
data = yf.download("aapl", start="2022-01-01", end="2022-08-01")
df = pd.DataFrame(data)
df1 = df.ta.vwap(anchor = "D")
df14 = pd.concat([df, df1],axis=1)
# 'typical_price' is the mean of High, Low and Close; the author expects it to
# be identical to the 'VWAP_D' column.
hlc_sum = df14['High'] + df14['Low'] + df14['Close']
df14['typical_price'] = hlc_sum/3
# Volume-weighted typical price for each row.
tpp_d = (hlc_sum*df14['Volume'])/3
# True for rows on or after the anchor date. `where` blanks the earlier rows
# to NaN, and grouping by the same mask keeps the two cumulative sums separate.
on_or_after = df14.index >= anchored_date
cum_price_volume = tpp_d.where(on_or_after).groupby(on_or_after).cumsum()
cum_volume = df14['Volume'].where(on_or_after).groupby(on_or_after).cumsum()
df14['anchored_VWAP'] = cum_price_volume/cum_volume
df14
Plot

How to Iterate with python dataframe

I'm trying to iterate in my data frame and create a column with the result
import pandas as pd
data = {'Name': ['Mary', 'Jose', 'John', 'Marc', 'Ruth','Rachel'],
'Grades': [10, 8, 8, 5, 7,4],
'Gender':['Female','Male','Male','Male','Female','Female']
}
df = pd.DataFrame(data)
values = []
for x in df.iteritems():
values.append('Passed' if x.Grades < 7 else 'Failed')
df['Final_result'] = values
df
I'm getting the error "'tuple' object has no attribute 'Grades'". Can you guys help me?
If you want to be good at pandas, avoid loops and start thinking "vectorized operations":
import numpy as np
df["Final_result"] = np.where(df["Grades"] < 7, "Passed", "Failed")

Filling pandas Data Frame

I have a list of ticker values
ticker = ["AAPL","MSFT","GOOG"]
and I want to create a DF with "high" values of prices for all the stocks in the ticker list.
Creating an empty DF:
high_df = pd.DataFrame(columns = ticker)
Filling the DF:
import pandas_datareader as web
import datetime
start = datetime.datetime(2010,1,1)
end = datetime.datetime(2010,2,1)
for each_column in high_df.columns:
high_df[each_column] = web.DataReader(each_column, "yahoo",start,end)["High"]
This works but takes a long time if the ticker list is huge. Are there any suggestions for speeding up the way the DF is filled?
It seems like you just need parallel computing.
from joblib import Parallel, delayed
def yourfunc(tic):
    """Return the "High" column for one ticker.

    Reads the ticker's data from the "yahoo" source through
    ``web.DataReader`` for 2010-01-01 through 2010-02-01.
    """
    start = datetime.datetime(2010, 1, 1)
    end = datetime.datetime(2010, 2, 1)
    result = web.DataReader(tic, "yahoo", start, end)["High"]
    return result
# joblib progress verbosity; 0 is silent, larger values print more.
verbosity_level = 0
# Run one yourfunc call per ticker on a pool of threads.
results = Parallel(n_jobs=-1, verbose=verbosity_level, backend="threading")(
    map(delayed(yourfunc), ticker))
As for the conversion, you can use pd.DataFrame(results,columns=ticker)

Dask Dataframe: Defining meta for date diff in groupby

I'm trying to find inter-purchase times (i.e., days between orders) for customers. Although my code is working correctly without defining meta, I would like to get it working properly and no longer see the warning asking me to provide meta.
Also, I would appreciate any suggestions on how to use map or map_partitions instead of apply.
So far I've tried:
meta={'days_since_last_order': 'datetime64[ns]'}
meta={'days_since_last_order': 'f8'}
meta={'ORDER_DATE_DT':'datetime64[ns]','days_since_last_order': 'datetime64[ns]'}
meta={'ORDER_DATE_DT':'f8','days_since_last_order': 'f8'}
meta=('days_since_last_order', 'f8')
meta=('days_since_last_order', 'datetime64[ns]')
Here is my code:
import numpy as np
import pandas as pd
import datetime as dt
import dask.dataframe as dd
from dask.distributed import wait, Client
client = Client(processes=True)
start = pd.to_datetime('2015-01-01')
end = pd.to_datetime('2018-01-01')
d = (end - start).days + 1
np.random.seed(0)
df = pd.DataFrame()
df['CUSTOMER_ID'] = np.random.randint(1, 4, 10)
df['ORDER_DATE_DT'] = start + pd.to_timedelta(np.random.randint(1, d, 10), unit='d')
print(df.sort_values(['CUSTOMER_ID','ORDER_DATE_DT']))
print(df)
ddf = dd.from_pandas(df, npartitions=2)
# setting ORDER_DATE_DT as index to sort by date
ddf = ddf.set_index('ORDER_DATE_DT')
ddf = client.persist(ddf)
wait(ddf)
ddf = ddf.reset_index()
grp = ddf.groupby('CUSTOMER_ID')[['ORDER_DATE_DT']].apply(
lambda df: df.assign(days_since_last_order=df.ORDER_DATE_DT.diff(1))
# meta=????
)
# for some reason, I'm unable to print grp unless I reset_index()
grp = grp.reset_index()
print(grp.compute())
Here is the printout of df.sort_values(['CUSTOMER_ID','ORDER_DATE_DT'])
Here is the printout of grp.compute()

Make a vbar with stdev from dataframes for mean and stdeviation

I have 2 data frames, one for the mean and one for the standard deviation, and I am trying to turn them into a bar chart in bokeh with error bars, but I am stuck on how to groupby 'Design' and 'Treatment'.
Basically, I am trying to get 3 bars per x-value (T0 to T2). The legend should show something like: 'mouse-yes', 'mouse-no', and 'cat-no'. How do I restructure the dict or dataframe to convert to something for vbar? And then how do I couple that with the stdev dataframe?
Also, is there a way to make the x_range in the figure automatically take all the original values from the 'Time' column? I'd like to be able to interchange the 'Time', 'Design', and 'Treatment' columns for the x-axis. I'm guessing this is where a pivot table comes in handy.
from bokeh.core.properties import value
from bokeh.io import show, output_file
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
from bokeh.transform import dodge
output_file("dodged_bars.html")
import pandas as pd
dat_mean=[['T0','mouse','yes',25],['T0','mouse','no',24],['T0','cat','no',23],['T1','mouse','yes',15],['T1','mouse','no',14],['T1','cat','no',13],['T2','mouse','yes',5],['T2','mouse','no',4],['T2','cat','no',3]]
df_mean= pd.DataFrame(dat_mean,columns = ["Time", "Design", "Treatment", "Mean for Cmpd1"])
dat_std=[['T0','mouse','yes',5],['T0','mouse','no',5],['T0','cat','no',5],['T1','mouse','yes',2.5],['T1','mouse','no',2.5],['T1','cat','no',2.5],['T2','mouse','yes',1],['T2','mouse','no',1],['T2','cat','no',1]]
df_std= pd.DataFrame(dat_std,columns = ["Time", "Design", "Treatment", "Mean for Cmpd1"])
# Column-oriented dict: each column name maps to the list of its values.
data = df_mean.to_dict(orient='list')
dates = df_mean['Time'].tolist()
source = ColumnDataSource(data=data)
p = figure(x_range=['T0', 'T1', 'T2'], y_range=(0, 30), plot_height=250, title="Bokeh plot",
toolbar_location=None, tools="")
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.legend.location = "top_left"
p.legend.orientation = "horizontal"
show(p)
from bokeh.core.properties import value
from bokeh.io import show, output_file
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
from bokeh.transform import dodge
output_file("dodged_bars.html")
import pandas as pd
dat_mean=[['T0','mouse','yes',25],['T0','mouse','no',24],['T0','cat','no',23],['T1','mouse','yes',15],['T1','mouse','no',14],['T1','cat','no',13],['T2','mouse','yes',5],['T2','mouse','no',4],['T2','cat','no',3]]
df_mean= pd.DataFrame(dat_mean,columns = ["Time", "Design", "Treatment", "Mean for Cmpd1"])
dat_std=[['T0','mouse','yes',5],['T0','mouse','no',5],['T0','cat','no',5],['T1','mouse','yes',2.5],['T1','mouse','no',2.5],['T1','cat','no',2.5],['T2','mouse','yes',1],['T2','mouse','no',1],['T2','cat','no',1]]
df_std= pd.DataFrame(dat_std,columns = ["Time", "Design", "Treatment", "Mean for Cmpd1"])
# Distinct time points in first-appearance order; they label the x-axis.
time_seq = df_mean['Time'].drop_duplicates()
time_vals = time_seq.tolist()
# One list of means per (Design, Treatment) pair, e.g. ('mouse', 'yes').
MEANs = df_mean.groupby(["Design", "Treatment"])["Mean for Cmpd1"].apply(list).to_dict()
# Column / legend names such as 'mouse_yes', one per group.
keys = [design + '_' + treatment for design, treatment in MEANs]
# Data source: the time axis plus one column of means per group.
results = {'time_vals': time_vals}
results.update(zip(keys, MEANs.values()))
source = ColumnDataSource(data=results)
# Take the x-axis categories from the data instead of hard-coding them.
p = figure(x_range=time_vals, y_range=(0, 30), plot_height=250, title="Bokeh plot",
           toolbar_location=None, tools="")
# Bar fill colours, cycled if there are more groups than colours.
color = ["#c9d9d3", "#718dbf", "#e84d60"]
for hh, key in enumerate(keys):
    # Offset each group's bars so they sit side by side at every time point.
    p.vbar(x=dodge('time_vals', -0.25+.2*hh, range=p.x_range), top=key, width=0.2,
           source=source, color=color[hh % len(color)], legend=value(key))
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.legend.location = "top_left"
p.legend.orientation = "horizontal"
show(p)
This code works. Error bars can be added with add_layout and the Whisker annotation.