CODING Q based on Dataframes and Series and dictionaries - pandas

It would be interesting to see if there is any evidence of a link between vaccine effectiveness and sex of the child. Calculate the ratio of the number of children who contracted chickenpox but were vaccinated against it (at least one varicella dose) versus those who were vaccinated but did not contract chicken pox. Return results by sex.
This function should return a dictionary in the form of (use the correct numbers):
{"male":0.2,
"female":0.4}
Note: To aid in verification, the chickenpox_by_sex()['female'] value the autograder is looking for starts with the digits 0.0077.
PLEASE WRITE A FUNCTIONING CODE FOR THE SAME.

Try the following code:
Read the given dataset using the following code
import pandas as pd
df=pd.read_csv('assets/NISPUF17.csv',index_col=0)
df
Main code
def chickenpox_by_sex():
    """Return the chickenpox ratio among vaccinated children, split by sex.

    Reads the module-level ``df`` (the NISPUF17 table loaded above). A child
    counts as vaccinated when ``P_NUMVRC >= 1``. For each sex the ratio is

        (vaccinated and contracted chickenpox) /
        (vaccinated and did not contract chickenpox)

    Returns:
        dict: ``{"male": ratio, "female": ratio}``.
    """
    # Codes used by this exercise: SEX 1 = male, 2 = female;
    # HAD_CPOX 1 = contracted chickenpox, 2 = did not.
    vaccinated = df[df['P_NUMVRC'] >= 1]

    ratio_dict = {}
    for label, sex_code in (("male", 1), ("female", 2)):
        group = vaccinated[vaccinated['SEX'] == sex_code]
        counts_cp = (group['HAD_CPOX'] == 1).sum()
        counts_ncp = (group['HAD_CPOX'] == 2).sum()
        ratio_dict[label] = counts_cp / counts_ncp
    return ratio_dict
Check using the following code
chickenpox_by_sex()['female']
Final code to complete this
assert len(chickenpox_by_sex())==2, "Return a dictionary with two items, the first for males and the second for females."
=> [SEX] -> sex=1 (male); sex=2 (female)
=> [HAD_CPOX] -> contracted chicken pox = 1; did not contract chicken pox = 2
=> [P_NUMVRC]>=1 -> given one or more doses
*ratio(male) = (vaccinated and contracted chicken pox)/(vaccinated and not contracted chicken pox)
*ratio(female) = (vaccinated and contracted chicken pox)/(vaccinated and not contracted chicken pox)
Variable names:
male_df - male data frame
vac_m - vaccinated male
cp_m - vaccinated and contracted chickenpox (male)
counts_cp_m - counts of vaccinated and contracted chickenpox
ncp_m - vaccinated and not contracted chickenpox (male)
counts_ncp_m - vaccinated and not contracted chickenpox
Similarly for females.

CORRECT SOLUTION.
def chickenpox_by_sex():
    """Return the chickenpox ratio among vaccinated children, split by sex.

    Loads ``NISPUF17.csv`` from the current working directory, keeps children
    with at least one varicella dose (``P_NUMVRC >= 1``) and, for each sex,
    divides the number who contracted chickenpox (``HAD_CPOX == 1``) by the
    number who did not (``HAD_CPOX == 2``).

    Returns:
        dict: ``{"male": ratio, "female": ratio}``.
    """
    import pandas as pd

    df = pd.read_csv("NISPUF17.csv")
    vaccinated = df[df["P_NUMVRC"] >= 1]

    # Named `ratios` rather than `dict` so the builtin is not shadowed.
    ratios = {}
    for label, sex_code in (("male", 1), ("female", 2)):
        group = vaccinated[vaccinated["SEX"] == sex_code]
        had_cpox = (group["HAD_CPOX"] == 1).sum()
        no_cpox = (group["HAD_CPOX"] == 2).sum()
        ratios[label] = had_cpox / no_cpox
    return ratios

The following code works as well:
import pandas as pd
import numpy as np
import math
def chickenpox_by_sex():
    """Return the chickenpox ratio among vaccinated children, split by sex.

    Loads ``assets/NISPUF17.csv``, keeps rows with ``P_NUMVRC > 0`` and, for
    each sex, divides the number of rows with ``HAD_CPOX == 1`` by the number
    with ``HAD_CPOX == 2``.

    Returns:
        dict: ``{'male': ratio, 'female': ratio}``.

    Raises:
        ZeroDivisionError: if a sex has no vaccinated row with
            ``HAD_CPOX == 2`` (plain integer division is used).
    """
    df = pd.read_csv('assets/NISPUF17.csv')
    c_vaccinated = df[df['P_NUMVRC'] > 0]

    ratios = {}
    for label, sex_code in (('male', 1), ('female', 2)):
        stats = c_vaccinated[c_vaccinated['SEX'] == sex_code]
        had_cpox = len(stats[stats['HAD_CPOX'] == 1])
        no_cpox = len(stats[stats['HAD_CPOX'] == 2])
        ratios[label] = had_cpox / no_cpox
    return ratios
chickenpox_by_sex()

import pandas as pd
def chickenpox_by_sex():
    """Return the chickenpox ratio among vaccinated children, split by sex.

    Loads ``assets/NISPUF17.csv`` and, for each sex, divides the number of
    vaccinated children who contracted chickenpox (``HAD_CPOX == 1``) by the
    number of vaccinated children who did not (``HAD_CPOX == 2``).

    The result is a plain ratio, not a percentage: the exercise states that
    the expected female value starts with 0.0077, so the value must not be
    multiplied by 100.

    Returns:
        dict: ``{'male': ratio, 'female': ratio}``.

    Raises:
        ZeroDivisionError: if a sex has no vaccinated row with
            ``HAD_CPOX == 2``.
    """
    df = pd.read_csv('assets/NISPUF17.csv')

    # Rows without a dose count cannot be classified as vaccinated.
    df = df.dropna(subset=['P_NUMVRC'])
    # "At least one dose" covers any positive count, not only 1, 2 or 3.
    vaccinated = df[df['P_NUMVRC'] >= 1]

    # Only HAD_CPOX codes 1 and 2 are counted, so other codes (such as 77
    # and 99) drop out without mutating the frame.
    result = {}
    for label, sex_code in (('male', 1), ('female', 2)):
        group = vaccinated[vaccinated['SEX'] == sex_code]
        pox_yes = len(group[group['HAD_CPOX'] == 1])
        pox_no = len(group[group['HAD_CPOX'] == 2])
        result[label] = pox_yes / float(pox_no)
    return result

import pandas as pd
import numpy as np
df = pd.read_csv('assets/NISPUF17.csv', usecols = ['HAD_CPOX', 'SEX', 'P_NUMVRC']).dropna().reset_index()
def chickenpox_by_sex():
    """Return the chickenpox ratio among vaccinated children, split by sex.

    Reads the module-level ``df`` (columns ``HAD_CPOX``, ``SEX`` and
    ``P_NUMVRC``). For each sex, the number of rows with ``HAD_CPOX == 1``
    and ``P_NUMVRC > 0`` is divided by the number of rows with
    ``HAD_CPOX == 2`` and ``P_NUMVRC > 0``.

    Returns:
        dict: ``{"male": ratio, "female": ratio}``, each rounded to four
        decimal places.

    Raises:
        ZeroDivisionError: if a sex has no vaccinated row with
            ``HAD_CPOX == 2``.
    """
    vaccinated = df[df.P_NUMVRC > 0.0]

    result = {}
    for label, sex_code in (("male", 1), ("female", 2)):
        group = vaccinated[vaccinated.SEX == sex_code]
        had = len(group[group.HAD_CPOX == 1])
        not_had = len(group[group.HAD_CPOX == 2])
        result[label] = round(had / not_had, ndigits=4)
    return result
chickenpox_by_sex()

Related

Error code 'could not convert string to float: 'PG-13'. How to fix it?

I am building a recommendation engine from a database from Kaggle.
# Load the Netflix table, reduce it to a handful of columns and fit a
# brute-force cosine nearest-neighbour model on everything except 'title'.
df = pd.read_csv("netflix.csv")
df = df.drop(["ratingdescription"], axis=1)
# NOTE(review): the KeyError quoted in the question means the file has no
# 'rating_level' column; the column selected below is called 'rating'.
df = pd.get_dummies(df, columns=["rating_level"])
df = df.dropna()
# Selecting exactly these five columns discards any dummy columns created above.
df = df[['title', 'rating', 'release_year', 'user_rating_score', 'user_rating_size']]
# Replace each title with its integer category code.
df['title'] = df['title'].astype('category')
df['title'] = df['title'].cat.codes
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
# NOTE(review): 'rating' is still among the fitted features; if it holds
# strings such as 'PG-13' (as the reported ValueError suggests) it presumably
# needs to be encoded numerically before fit() - confirm against the data.
model_knn.fit(df.drop(['title'], axis=1))
# Print the titles of the nearest neighbours of `title` according to model_knn.
#   title     - value looked up in df['title']
#   df        - frame the model was fitted on (must contain a 'title' column)
#   model_knn - fitted object exposing kneighbors()
# NOTE(review): the backslashes before '[' and ']' look like markdown escaping
# left over from the post; they are not valid Python and have to be removed
# before this can run.
def recommend(title, df, model_knn):
# Index label of the first row whose title matches.
query_index = df.loc\[df\['title'\] == title\].index.values\[0\]
# Ask for 6 neighbours of the query row's features.
distances, indices = model_knn.kneighbors(df.loc\[df\['title'\] == title\].drop(\['title'\], axis=1), n_neighbors = 6)
for i in range(0, len(indices.flatten())):
# Skip the neighbour whose index equals the query's own index label.
# NOTE(review): presumably kneighbors returns row positions while query_index
# is an index label; these differ once rows were dropped - verify.
if indices.flatten()\[i\] == query_index:
continue
else:
recommended_title = df.loc\[df.index == indices.flatten()\[i\], 'title'\].values\[0\]
# NOTE(review): `.cat` only exists on categorical columns; the surrounding
# script overwrites 'title' with integer codes, so this lookup presumably
# fails there - confirm.
recommended_title = df.loc\[df\['title'\] == recommended_title\]\['title'\].cat.categories\[recommended_title\]
print('Recommendation:', recommended_title)
# Compare the neighbours recommended for `title` against the titles that share
# its 'rating', and print macro-averaged precision and recall.
#   title     - value looked up in df['title']
#   df        - frame the model was fitted on ('title' and 'rating' columns are read)
#   model_knn - fitted object exposing kneighbors()
# NOTE(review): the backslash-escaped brackets in the first half and the stray
# backtick before `else:` look like markdown residue from the post; they are
# not valid Python and have to be removed before this can run.
def evaluate(title, df, model_knn):
# Index label of the first row whose title matches.
query_index = df.loc\[df\['title'\] == title\].index.values\[0\]
distances, indices = model_knn.kneighbors(df.loc\[df\['title'\] == title\].drop(\['title'\], axis=1), n_neighbors = 6)
recommended_titles = \[\]
for i in range(0, len(indices.flatten())):
# Skip the neighbour whose index equals the query's own index label.
if indices.flatten()\[i\] == query_index:
continue
`else:
recommended_title = df.loc[df.index == indices.flatten()[i], 'title'].values[0]
recommended_titles.append(recommended_title)
# "Actual" titles: every row with the same 'rating' as the query, minus the query row.
actual_titles = df.loc[df['rating'] == df.loc[df['title'] == title]['rating'].values[0], 'title']
actual_titles = actual_titles.drop(query_index)
# NOTE(review): the loop variable `title` shadows the function parameter here,
# and `.cat` requires a categorical 'title' column - confirm both are intended.
actual_titles = [df.loc[df['title'] == title]['title'].cat.categories[title] for title in actual_titles]
recommended_titles = [df.loc[df['title'] == title]['title'].cat.categories[title] for title in recommended_titles]
# NOTE(review): the two lists are passed as if they were aligned label vectors;
# they can have different lengths - verify this is a valid use of the metric.
precision, recall, _, _ = precision_recall_fscore_support(actual_titles, recommended_titles, average = 'macro')
print('Precision:', precision)
print('Recall:', recall)
recommend("The Shawshank Redemption", df, model_knn)
evaluate("The Shawshank Redemption", df, model_knn)
I have tried altering the code many times but it's either this or the error message
"KeyError: 'rating_level'" indicates that the column "rating_level" is not found in the dataframe **df**.
error received is this :
`ValueError: could not convert string to float: 'PG-13'

return pandas dataframe from function

I want to return a dataframe from this function, which can be used elsewhere (for plotly graph to be exact).
My idea is to use the dataframe I can create with points_sum(), save it as the team name, and then use that dataframe in my px.line(dataframe = team_name).
In essence, I want to use the men_points_df variable after I created it.
def points_sum(team):
    """Build the running points total of ``team`` over all its matches.

    Walks the module-level ``menscore_df`` row by row. For every match in
    which ``team`` is the home or the away side, the running total grows by
    2 for a win and by 1 for a draw (a loss adds nothing), and one
    ``[date, points]`` entry is recorded.

    Args:
        team: Value compared against the ``hometeam`` and ``awayteam`` columns.

    Returns:
        pandas.DataFrame: columns ``"Date"`` (the row's date as a string) and
        ``"Points"`` (cumulative points after that match), one row per match
        played by ``team``; empty if the team never appears.
    """
    points = 0
    men_points = []
    for _, row in menscore_df.iterrows():
        home_goals = row['homegoals']
        away_goals = row['awaygoals']

        if row['hometeam'] == team:
            if home_goals > away_goals:
                points += 2
            elif home_goals == away_goals:
                points += 1
            # A loss leaves the total unchanged but is still recorded.
            men_points.append([str(row['date']), points])

        if row['awayteam'] == team:
            if home_goals < away_goals:
                points += 2
            elif home_goals == away_goals:
                points += 1
            men_points.append([str(row['date']), points])

    return pd.DataFrame(men_points, columns=["Date", 'Points'])
In plotly, I am trying to use my new dataframe (men_points_df), like below, but I get the error undefined name, even though I can print it (for example: test = points_sum("FIF") (FIF is one of the team names) and it shows the correct dataframe in the console (when I type test):
elif pathname == "/page-3":
return [html.H1('Seasonal performance',
style={'textAlign':'center'}),
html.Div(
children=[
html.H2('Select team',style={'textAlign':'center'}),
html.Br(),
html.Br(),
dcc.Dropdown(
id='team_dd',
options=[{'label': v, 'value': k} for k,v in teams_all.items()],
)]),
dcc.Graph(id="performance_graph")
]
Output(component_id="performance_graph", component_property="figure"),
Input(component_id="team_dd", component_property="value")
# Callback body: draw the Date/Points line chart, titled with the dropdown
# selection (or "none selected" when nothing is chosen).
def update_graph(option_selected):
title = "none selected"
if option_selected:
title = option_selected
line_fig = px.line(
# NOTE(review): `test` is never assigned inside this function, so the name
# must already exist at module level when the callback runs.
test, # <------------ THIS IS THE ISSUE
title = f"{title}",
x = "Date", y = "Points")
return line_fig
Just call points_sum in the update_graph function, before you use test:
def update_graph(option_selected):
    """Return the Date/Points line chart for the performance graph.

    The chart title is the dropdown selection, or "none selected" when the
    selection is empty.
    """
    title = option_selected if option_selected else "none selected"

    # The frame is built here, inside the callback, so the name passed to
    # px.line is always defined when the figure is drawn.
    # NOTE(review): the team is hard-coded to "FIF" regardless of
    # option_selected - confirm whether the selection should be passed instead.
    test = points_sum("FIF")

    return px.line(test, title = f"{title}", x = "Date", y = "Points")

Oanda API - Issue Price - Instruments

I'm using Oanda API to automate Trading strategies, I have a 'price' error that only occurs when selecting some instruments such as XAG (silver), my guess is that there is a classification difference but Oanda is yet to answer on the matter.
The error does not occur when selecting Forex pairs.
If anyone has had such issues in the past and managed to solve them, I'll be happy to hear from them.
PS: I'm UK based and have access to most products including CFDs
class SMABollTrader(tpqoa.tpqoa):
    """Streaming trader combining a Bollinger-band signal with an SMA crossover.

    Incoming ticks are collected in ``tick_data``, resampled into bars of
    ``bar_length`` and appended to ``raw_data``. After each completed bar the
    strategy is recomputed into ``data`` and orders are sent so that
    ``position`` (1 = long, -1 = short, 0 = neutral) follows the latest signal.

    History download, order creation, streaming and the ``ticks`` counter are
    used from the ``tpqoa.tpqoa`` base class, which is not shown here.
    """

    def __init__(self, conf_file, instrument, bar_length, SMA, dev, SMA_S, SMA_L, units):
        """Store the trading parameters and initialise empty state.

        Args:
            conf_file: Configuration file handed to the base class.
            instrument: Instrument name; also used as the price column name.
            bar_length: Bar size, converted with ``pd.to_timedelta``.
            SMA: Rolling window of the Bollinger mid line and its deviation.
            dev: Multiplier applied to the rolling standard deviation.
            SMA_S: Rolling window of the short moving average.
            SMA_L: Rolling window of the long moving average.
            units: Order size for opening or closing one position.
        """
        super().__init__(conf_file)
        self.instrument = instrument
        self.bar_length = pd.to_timedelta(bar_length)
        self.tick_data = pd.DataFrame()
        self.raw_data = None
        self.data = None
        self.last_bar = None
        self.units = units
        self.position = 0
        self.profits = []
        self.price = []

        #*****************add strategy-specific attributes here******************
        self.SMA = SMA
        self.dev = dev
        self.SMA_S = SMA_S
        self.SMA_L = SMA_L
        #************************************************************************

    def get_most_recent(self, days = 5):
        """Download recent history into ``raw_data`` until it is up to date.

        Repeats the download every two seconds until the last stored bar is
        less than one ``bar_length`` older than the current UTC time.

        Args:
            days: How many days of history to request.
        """
        while True:
            time.sleep(2)
            now = datetime.utcnow()
            now = now - timedelta(microseconds = now.microsecond)
            past = now - timedelta(days = days)
            # Only the "c" column of the returned history is kept.
            df = self.get_history(instrument = self.instrument, start = past, end = now,
                                  granularity = "S5", price = "M", localize = False).c.dropna().to_frame()
            df.rename(columns = {"c": self.instrument}, inplace = True)
            # The final resampled bar is dropped (presumably still incomplete).
            df = df.resample(self.bar_length, label = "right").last().dropna().iloc[:-1]
            self.raw_data = df.copy()
            self.last_bar = self.raw_data.index[-1]
            if pd.to_datetime(datetime.utcnow()).tz_localize("UTC") - self.last_bar < self.bar_length:
                break

    def on_success(self, time, bid, ask):
        """Store one tick and, once a new bar is complete, trade on it.

        Args:
            time: Tick timestamp (parsed with ``pd.to_datetime``).
            bid: Bid price of the tick.
            ask: Ask price of the tick.
        """
        print(self.ticks, end = " ")

        recent_tick = pd.to_datetime(time)
        # The tick is stored as the bid/ask mid price.
        df = pd.DataFrame({self.instrument: (ask + bid) / 2},
                          index = [recent_tick])
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        self.tick_data = pd.concat([self.tick_data, df])

        if recent_tick - self.last_bar > self.bar_length:
            self.resample_and_join()
            self.define_strategy()
            self.execute_trades()

    def resample_and_join(self):
        """Resample collected ticks into bars and append them to ``raw_data``."""
        new_bars = self.tick_data.resample(self.bar_length,
                                           label="right").last().ffill().iloc[:-1]
        self.raw_data = pd.concat([self.raw_data, new_bars])
        # Keep only the newest tick as the seed of the next bar.
        self.tick_data = self.tick_data.iloc[-1:]
        self.last_bar = self.raw_data.index[-1]

    def define_strategy(self): # "strategy-specific"
        """Compute the indicators and the target position into ``data``.

        Long (1) when the price is below the lower band AND the short SMA is
        above the long SMA; short (-1) when the price is above the upper band
        AND the short SMA is below the long SMA; neutral (0) when the price
        crosses the band's mid line. Other bars carry the previous position
        forward, starting from 0.
        """
        df = self.raw_data.copy()

        #******************** define your strategy here ************************
        price = df[self.instrument]
        band = price.rolling(self.SMA).std() * self.dev
        df["SMA"] = price.rolling(self.SMA).mean()
        df["Lower"] = df["SMA"] - band
        df["Upper"] = df["SMA"] + band
        df["distance"] = price - df.SMA
        df["SMA_S"] = price.rolling(self.SMA_S).mean()
        df["SMA_L"] = price.rolling(self.SMA_L).mean()

        # Both conditions are combined element-wise with `&`. Joining two
        # np.where calls with `and` evaluates to the second call only, because
        # one-argument np.where returns a non-empty (truthy) tuple, so the
        # band condition would be ignored.
        go_long = (price < df.Lower) & (df["SMA_S"] > df["SMA_L"])
        go_short = (price > df.Upper) & (df["SMA_S"] < df["SMA_L"])
        df["position"] = np.where(go_long, 1, np.nan)
        df["position"] = np.where(go_short, -1, df["position"])
        # A sign change of the distance means the price crossed the mid line.
        df["position"] = np.where(df.distance * df.distance.shift(1) < 0, 0, df["position"])
        df["position"] = df.position.ffill().fillna(0)

        self.data = df.copy()
        #***********************************************************************

    def execute_trades(self):
        """Send the order that moves ``position`` to the latest signal.

        Opening from neutral or closing to neutral trades ``units``; flipping
        from the opposite side trades ``units * 2``.
        """
        target = self.data["position"].iloc[-1]
        if target == 1:
            if self.position == 0:
                order = self.create_order(self.instrument, self.units, suppress = True, ret = True)
                self.report_trade(order, "GOING LONG")
            elif self.position == -1:
                order = self.create_order(self.instrument, self.units * 2, suppress = True, ret = True)
                self.report_trade(order, "GOING LONG")
            self.position = 1
        elif target == -1:
            if self.position == 0:
                order = self.create_order(self.instrument, -self.units, suppress = True, ret = True)
                self.report_trade(order, "GOING SHORT")
            elif self.position == 1:
                order = self.create_order(self.instrument, -self.units * 2, suppress = True, ret = True)
                self.report_trade(order, "GOING SHORT")
            self.position = -1
        elif target == 0:
            if self.position == -1:
                order = self.create_order(self.instrument, self.units, suppress = True, ret = True)
                self.report_trade(order, "GOING NEUTRAL")
            elif self.position == 1:
                order = self.create_order(self.instrument, -self.units, suppress = True, ret = True)
                self.report_trade(order, "GOING NEUTRAL")
            self.position = 0

    def report_trade(self, order, going):
        """Print one executed order and the cumulative profit and loss.

        Args:
            order: Mapping returned by ``create_order``; the keys ``"time"``,
                ``"units"``, ``"price"`` and ``"pl"`` are read.
            going: Label printed with the trade, e.g. ``"GOING LONG"``.
        """
        # NOTE(review): these lookups fail when a key is missing from `order`;
        # presumably an order that was not filled carries no "price" - confirm
        # against what create_order returns for the failing instruments.
        time = order["time"]
        units = order["units"]
        price = order["price"]
        pl = float(order["pl"])
        self.profits.append(pl)
        cumpl = sum(self.profits)
        print("\n" + 100* "-")
        print("{} | {}".format(time, going))
        print("{} | units = {} | price = {} | P&L = {} | Cum P&L = {}".format(time, units, price, pl, cumpl))
        print(100 * "-" + "\n")
# Driver: create the trader, preload recent history, stream ticks, and close
# any position that is still open once streaming returns.
trader = SMABollTrader("oanda.cfg", "EUR_GBP", "15m", SMA = 82, dev = 4, SMA_S = 38, SMA_L = 135, units = 100000)
trader.get_most_recent()
trader.stream_data(trader.instrument, stop = None )
if trader.position != 0: # if we have a final open position
# The closing order has the opposite sign of the open position.
close_order = trader.create_order(trader.instrument, units = -trader.position * trader.units,
suppress = True, ret = True)
trader.report_trade(close_order, "GOING NEUTRAL")
# NOTE(review): the class shown above tracks its state in `position`, not
# `signal`; this assignment leaves trader.position unchanged - confirm intent.
trader.signal = 0
I have done Hagmann course as well and I have recognised your code immediately.
Firstly the way you define your positions is not the best. Look at the section of combining two strategies. There are two ways.
Now, regarding your price problem: I had a similar situation with BTC. You can download its historical data, but when I plugged it into the strategy code and started to stream, I had exactly the same error, indicating that tick data was never streamed.
I am guessing that simply not all instruments are tradeable via api or in your case maybe you tried to stream beyond trading hours?

I am having a problem with a for loop that includes dataframes

I have a dataframe with 8 columns. If two of those columns satisfy a condition, I have to fill two columns with the product of two others. After running the algorithm, it is not working.
I have tried to use Series, and I have tried to use import warnings
warnings.filterwarnings("ignore") but it is not working
# For every label in `seq`, fill CloseAdj1/CloseAdj2 from Close1/Close2:
# multiplied by the transaction-cost columns when trade == 1 (which close
# feeds which adjusted column depends on the sign of z), copied unchanged
# otherwise.
# NOTE(review): `frame[col][i] = value` is chained indexing; pandas may write
# into a temporary copy, so the assignment can be lost.
# `frame.loc[i, col] = value` is the documented form.
# NOTE(review): the products are split after `*` without parentheses or a
# backslash, which is a syntax error as written.
for i in seq:
if dataframefinal['trade'][i] == 1 and dataframefinal['z'][i] > 0:
dataframefinal['CloseAdj2'][i]= dataframefinal['Close2'][i] *
dataframefinal['trancosshort'][i]
dataframefinal['CloseAdj1'][i]= dataframefinal['Close1'][i] *
dataframefinal['trancostlong'][i]
elif dataframefinal['trade'][i] == 1 and dataframefinal['z'][i] < 0:
dataframefinal['CloseAdj2'][i]= dataframefinal['Close1'][i] *
dataframefinal['trancosshort'][i]
dataframefinal['CloseAdj1'][i]= dataframefinal['Close2'][i] *
dataframefinal['trancostlong'][i]
else:
# Neither trade branch applies: copy the closes through unchanged.
dataframefinal['CloseAdj1'][i]= dataframefinal['Close1'][i]
dataframefinal['CloseAdj2'][i]= dataframefinal['Close2'][i]
You can use vectorized condition function numpy.select() to do this quickly:
import numpy as np
import pandas as pd
from numpy.random import randn, randint

# Random demo data with the same columns as the question's frame.
n = 10
df_data = pd.DataFrame(dict(trade=randint(0, 2, n),
                            z=randn(n),
                            Close1=randn(n),
                            Close2=randn(n),
                            trancosshort=randn(n),
                            trancostlong=randn(n)))
# Float placeholders, so writing the float results back keeps the dtype.
df_data["CloseAdj1"] = 0.0
df_data["CloseAdj2"] = 0.0

seq = [1, 3, 5, 7, 9]
# Explicit copy: the columns assigned below belong to `df` alone and are
# written back to df_data in the last step.
df = df_data.loc[seq].copy()

# Same two branches as the loop in the question; both require trade == 1
# and differ only in the sign of z.
cond1 = df.eval("trade==1 and z > 0")
cond2 = df.eval("trade==1 and z < 0")

# np.select takes the first matching condition per row, otherwise the default
# (the unadjusted close).
df["CloseAdj2"] = np.select([cond1, cond2],
                            [df.eval("Close2 * trancosshort"),
                             df.eval("Close1 * trancosshort")], df.Close2)
df["CloseAdj1"] = np.select([cond1, cond2],
                            [df.eval("Close1 * trancostlong"),
                             df.eval("Close2 * trancostlong")], df.Close1)

# Write the adjusted values back to the selected rows only.
df_data.loc[seq, ["CloseAdj1", "CloseAdj2"]] = df[["CloseAdj1", "CloseAdj2"]]

Time Difference between Time Period and Instant

I have some time periods (df_A) and some time instants (df_B):
import pandas as pd
import numpy as np
import datetime as dt
from datetime import timedelta
# Data
df_A = pd.DataFrame({'A1': [dt.datetime(2017,1,5,9,8), dt.datetime(2017,1,5,9,9), dt.datetime(2017,1,7,9,19), dt.datetime(2017,1,7,9,19), dt.datetime(2017,1,7,9,19), dt.datetime(2017,2,7,9,19), dt.datetime(2017,2,7,9,19)],
'A2': [dt.datetime(2017,1,5,9,9), dt.datetime(2017,1,5,9,12), dt.datetime(2017,1,7,9,26), dt.datetime(2017,1,7,9,20), dt.datetime(2017,1,7,9,21), dt.datetime(2017,2,7,9,23), dt.datetime(2017,2,7,9,25)]})
df_B = pd.DataFrame({ 'B': [dt.datetime(2017,1,6,14,45), dt.datetime(2017,1,4,3,31), dt.datetime(2017,1,7,3,31), dt.datetime(2017,1,7,14,57), dt.datetime(2017,1,9,14,57)]})
I can match these together:
# Define an extra margin M and derive the shifted interval bounds A1X / A2X.
# NOTE(review): M is added to the interval start and subtracted from the end,
# which narrows every interval by 2*M; widening it would be A1 - M and A2 + M.
M = dt.timedelta(days = 10)
df_A["A1X"] = df_A["A1"] + M
df_A["A2X"] = df_A["A2"] - M
# Match every instant B against every shifted interval [A1X, A2X].
Bv = df_B .B .values
A1 = df_A .A1X.values
A2 = df_A .A2X.values
# Bv[:, None] turns B into a column, so each comparison yields a
# (len(df_B), len(df_A)) grid; i holds the df_B row and j the df_A row of
# every matching pair.
i, j = np.where((Bv[:, None] >= A1) & (Bv[:, None] <= A2))
# One output row per matching pair: the df_B columns followed by the df_A columns.
df_C = pd.DataFrame(np.column_stack([df_B .values[i], df_A .values[j]]),
columns = df_B .columns .append (df_A.columns))
I would like to find the time difference between each time period and the time instant matched to it. I mean that
if B is between A1 and A2
then dT = 0
I've tried doing it like this:
# Calculate dt: the gap between instant B and the interval [A1, A2]
# (intended to be 0 when B lies inside the interval).
# NOTE(review): the parameters A1, A2 and B are never used; the body compares
# whole df_C columns, so each `if` asks for the truth value of a Series.
def time(A1,A2,B):
if df_C["B"] < df_C["A1"]:
return df_C["A1"].subtract(df_C["B"])
elif df_C["B"] > df_C["A2"]:
return df_C["B"].subtract(df_C["A2"])
else:
return 0
# NOTE(review): without axis=1, apply passes one column at a time and supplies
# a single argument, while the function is declared with three.
df_C['dt'] = df_C.apply(time)
I'm getting "ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series"
So, I found two fixes:
You are adding M to the lower value and subtracting from the higher one. Change it to:
df_A['A1X'] = df_A['A1'] - M
df_A['A2X'] = df_A['A2'] + M
You are only passing one row of your dataframe at a time to your time function, so it should be something like:
def time(row):
    """Return how far instant ``B`` lies outside the interval ``[A1, A2]``.

    Meant to be used row-wise, e.g. ``df_C.apply(time, axis=1)``.

    Args:
        row: Mapping or Series with the keys ``'A1'``, ``'A2'`` and ``'B'``.

    Returns:
        ``A1 - B`` when B precedes the interval, ``B - A2`` when B follows
        it, and a zero ``pd.Timedelta`` when B lies inside it (bounds
        included).
    """
    if row['B'] < row['A1']:
        return row['A1'] - row['B']
    if row['B'] > row['A2']:
        return row['B'] - row['A2']
    # A zero Timedelta (instead of the int 0) keeps every result the same
    # type, so the resulting column gets a uniform timedelta dtype.
    return pd.Timedelta(0)
And then you can call it like this:
df_C['dt'] = df_C.apply(time, axis=1) :)