return pandas dataframe from function - pandas

I want to return a dataframe from this function, which can be used elsewhere (for plotly graph to be exact).
My idea is to use the dataframe I can create with points_sum(), save it as the team name, and then use that dataframe in my px.line(dataframe = team_name).
In essence, I want to use the men_points_df variable after I created it.
def points_sum(team):
    """Build the running points total for one team.

    Walks ``menscore_df`` row by row. For every match in which ``team`` is the
    ``hometeam`` or the ``awayteam`` it adds 2 points for a win, 1 for a draw
    and nothing for a loss, then records the cumulative total.

    Args:
        team: Team name as stored in the ``hometeam`` / ``awayteam`` columns.

    Returns:
        A ``pandas.DataFrame`` with columns ``"Date"`` (the row's ``date``
        converted with ``str``) and ``"Points"`` (cumulative points after that
        match), one row per match the team played, in ``menscore_df`` order.
    """
    def match_points(goals_for, goals_against):
        # Win = 2, draw = 1, anything else (a loss, or incomparable values
        # such as NaN) leaves the total unchanged.
        if goals_for > goals_against:
            return 2
        if goals_for == goals_against:
            return 1
        return 0

    points = 0
    men_points = []
    for _, row in menscore_df.iterrows():
        if row['hometeam'] == team:
            points += match_points(row['homegoals'], row['awaygoals'])
            men_points.append([str(row['date']), points])
        if row['awayteam'] == team:
            points += match_points(row['awaygoals'], row['homegoals'])
            men_points.append([str(row['date']), points])
    return pd.DataFrame(men_points, columns=["Date", "Points"])
In plotly, I am trying to use my new dataframe (men_points_df) as shown below, but I get an "undefined name" error, even though I can print it. For example, after running test = points_sum("FIF") (FIF is one of the team names), typing test in the console shows the correct dataframe:
elif pathname == "/page-3":
return [html.H1('Seasonal performance',
style={'textAlign':'center'}),
html.Div(
children=[
html.H2('Select team',style={'textAlign':'center'}),
html.Br(),
html.Br(),
dcc.Dropdown(
id='team_dd',
options=[{'label': v, 'value': k} for k,v in teams_all.items()],
)]),
dcc.Graph(id="performance_graph")
]
Output(component_id="performance_graph", component_property="figure"),
Input(component_id="team_dd", component_property="value")
def update_graph(option_selected):
title = "none selected"
if option_selected:
title = option_selected
line_fig = px.line(
test, # <------------ THIS IS THE ISSUE
title = f"{title}",
x = "Date", y = "Points")
return line_fig

Just call points_sum in the update_graph function, before you use test:
def update_graph(option_selected, default_team="FIF"):
    """Return the cumulative-points line figure for the selected team.

    Args:
        option_selected: Value chosen in the team dropdown; falsy when nothing
            has been selected yet.
        default_team: Team plotted while nothing is selected. Defaults to
            ``"FIF"``, the team this snippet previously always plotted.

    Returns:
        The figure produced by ``px.line`` with ``Date`` on the x axis and
        ``Points`` on the y axis.
    """
    title = option_selected if option_selected else "none selected"
    # Build the frame inside the callback so the name is defined where it is
    # used, and so the plot follows the dropdown instead of a fixed team.
    # NOTE(review): assumes the dropdown value matches the team names stored
    # in the score table - confirm against how the options are built.
    team = option_selected if option_selected else default_team
    points_df = points_sum(team)
    line_fig = px.line(
        points_df,
        title=f"{title}",
        x="Date", y="Points")
    return line_fig

Related

Error code 'could not convert string to float: 'PG-13'. How to fix it?

I am building a recommendation engine from a database from Kaggle.
df = pd.read_csv("netflix.csv")
df = df.drop(["ratingdescription"], axis=1)
df = pd.get_dummies(df, columns=["rating_level"])
df = df.dropna()
df = df[['title', 'rating', 'release_year', 'user_rating_score', 'user_rating_size']]
df['title'] = df['title'].astype('category')
df['title'] = df['title'].cat.codes
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(df.drop(['title'], axis=1))
def recommend(title, df, model_knn):
query_index = df.loc\[df\['title'\] == title\].index.values\[0\]
distances, indices = model_knn.kneighbors(df.loc\[df\['title'\] == title\].drop(\['title'\], axis=1), n_neighbors = 6)
for i in range(0, len(indices.flatten())):
if indices.flatten()\[i\] == query_index:
continue
else:
recommended_title = df.loc\[df.index == indices.flatten()\[i\], 'title'\].values\[0\]
recommended_title = df.loc\[df\['title'\] == recommended_title\]\['title'\].cat.categories\[recommended_title\]
print('Recommendation:', recommended_title)
def evaluate(title, df, model_knn):
query_index = df.loc\[df\['title'\] == title\].index.values\[0\]
distances, indices = model_knn.kneighbors(df.loc\[df\['title'\] == title\].drop(\['title'\], axis=1), n_neighbors = 6)
recommended_titles = \[\]
for i in range(0, len(indices.flatten())):
if indices.flatten()\[i\] == query_index:
continue
`else:
recommended_title = df.loc[df.index == indices.flatten()[i], 'title'].values[0]
recommended_titles.append(recommended_title)
actual_titles = df.loc[df['rating'] == df.loc[df['title'] == title]['rating'].values[0], 'title']
actual_titles = actual_titles.drop(query_index)
actual_titles = [df.loc[df['title'] == title]['title'].cat.categories[title] for title in actual_titles]
recommended_titles = [df.loc[df['title'] == title]['title'].cat.categories[title] for title in recommended_titles]
precision, recall, _, _ = precision_recall_fscore_support(actual_titles, recommended_titles, average = 'macro')
print('Precision:', precision)
print('Recall:', recall)
recommend("The Shawshank Redemption", df, model_knn)
evaluate("The Shawshank Redemption", df, model_knn)
I have tried altering the code many times, but I always end up with one of two errors. The first is
"KeyError: 'rating_level'", which indicates that the column "rating_level" is not found in the dataframe **df**.
The second error is this:
ValueError: could not convert string to float: 'PG-13'

Get 2 DataFrames into 1

Hi everyone so I have a DataFrame about Pokemon data
data = pd.read_csv('pokemon.csv')
And I'm only interested in 2 columns, 'type1' and 'type2' (type2 can be null, just as in the original videogame). What I need is to get a DataFrame that looks like this:
data.type1 looks like this:
data.type2:
So basically I need to take a single DataFrames using those 2 columns.
I've code this stuff trying to get 2 DataFrame that I can turn into the final one I am asked to reach:
tabla = {}
def contar(tipo):
buscando=tipo
if tipo == np.NaN:
pass
else:
if tipo in tabla:
tabla[tipo] += 1
else:
tabla[tipo] = 1
tabla2 = {}
def contar2(tipo):
buscando=tipo
if tipo == np.NaN:
pass
else:
if tipo in tabla2:
tabla2[tipo] += 1
else:
tabla2[tipo] = 1
def reset_tabla():
tabla = {}
tabla2 = {}
data['type1'].apply(contar)
df_type1 = pd.DataFrame.from_dict(tabla, orient='index')
reset_tabla()
data['type2'].apply(contar2)
df_type2 = pd.DataFrame.from_dict(tabla2, orient='index')
df_types = pd.concat([df_type1, df_type2])
df_type1
So with the above code I get the data I want, but not in the shape I need.
I expected:
Instead, this was the output:
img continues and data appears 2 times due to 2 types columns
I think what I am doing wrong is the concat because type1 and 2 look like this separately:
and
Finally, if you know how to combine these 2 DataFrames or you think you can solve this problem better let me know.
Thanks you all :).
I've solved this issue, so if it's useful for somebody the solution is here:
# Occurrence counters: type name -> number of Pokemon with that type.
tabla = {}
tabla2 = {}


def _contar_en(destino, tipo):
    """Add one occurrence of ``tipo`` to ``destino``, ignoring missing values."""
    # NaN never compares equal to anything (not even itself), so a check
    # written as ``tipo == np.NaN`` can never be true; pd.isna is the test
    # that actually detects a missing type.
    if pd.isna(tipo):
        return
    destino[tipo] = destino.get(tipo, 0) + 1


def contar(tipo):
    """Count one ``type1`` value into ``tabla``."""
    _contar_en(tabla, tipo)


def contar2(tipo):
    """Count one ``type2`` value into ``tabla2`` (missing values are skipped)."""
    _contar_en(tabla2, tipo)


def reset_tabla():
    """Empty both counters in place."""
    # Clearing the existing dicts (instead of assigning new ones to local
    # names) is what makes the reset visible to contar / contar2.
    tabla.clear()
    tabla2.clear()


reset_tabla()
data['type1'].apply(contar)
data['type2'].apply(contar2)
# Missing values were never counted, so there is no NaN key to delete here.
types = {"type1": tabla,
         "type2": tabla2}
# A dict of dicts becomes one column per counter, indexed by type name.
df_types = pd.DataFrame(types)
df_types
So I get

Oanda API - Issue Price - Instruments

I'm using Oanda API to automate Trading strategies, I have a 'price' error that only occurs when selecting some instruments such as XAG (silver), my guess is that there is a classification difference but Oanda is yet to answer on the matter.
The error does not occur when selecting Forex pairs.
If anyone has had such issues in the past and managed to solve them, I'll be happy to hear from them.
PS: I'm UK based and have access to most products including CFDs
class SMABollTrader(tpqoa.tpqoa):
    """Streaming trader combining Bollinger bands with an SMA crossover filter.

    Builds on ``tpqoa.tpqoa``: ``get_most_recent`` loads historical bars,
    ``on_success`` receives each streamed tick, and once a full bar has
    elapsed the bars are rebuilt, the strategy is re-evaluated and orders are
    sent so that ``self.position`` follows the newest strategy position.
    """

    def __init__(self, conf_file, instrument, bar_length, SMA, dev, SMA_S, SMA_L, units):
        """Store the trading set-up and the strategy parameters.

        Args:
            conf_file: Configuration file passed on to ``tpqoa.tpqoa``.
            instrument: Instrument name; also used as the price column name.
            bar_length: Bar size, converted with ``pd.to_timedelta``.
            SMA: Rolling window of the Bollinger mid line and its std.
            dev: Number of standard deviations between mid line and bands.
            SMA_S: Rolling window of the short moving average.
            SMA_L: Rolling window of the long moving average.
            units: Order size used for a one-step position change.
        """
        super().__init__(conf_file)
        self.instrument = instrument
        self.bar_length = pd.to_timedelta(bar_length)
        self.tick_data = pd.DataFrame()
        self.raw_data = None
        self.data = None
        self.last_bar = None
        self.units = units
        self.position = 0
        self.profits = []
        self.price = []

        # ***************** strategy-specific attributes ******************
        self.SMA = SMA
        self.dev = dev
        self.SMA_S = SMA_S
        self.SMA_L = SMA_L
        # *****************************************************************

    def get_most_recent(self, days=5):
        """Load the last ``days`` days of history as bars into ``raw_data``.

        Retries (after a 2 second pause each time) until the newest complete
        bar is less than one ``bar_length`` older than the current UTC time.
        """
        while True:
            time.sleep(2)
            now = datetime.utcnow()
            now = now - timedelta(microseconds=now.microsecond)
            past = now - timedelta(days=days)
            df = self.get_history(instrument=self.instrument, start=past, end=now,
                                  granularity="S5", price="M", localize=False).c.dropna().to_frame()
            df.rename(columns={"c": self.instrument}, inplace=True)
            # Drop the last resampled bar: it is still being formed.
            df = df.resample(self.bar_length, label="right").last().dropna().iloc[:-1]
            self.raw_data = df.copy()
            self.last_bar = self.raw_data.index[-1]
            if pd.to_datetime(datetime.utcnow()).tz_localize("UTC") - self.last_bar < self.bar_length:
                break

    def on_success(self, time, bid, ask):
        """Handle one streamed tick: store the mid price and act on a new bar.

        Args:
            time: Tick timestamp, parsed with ``pd.to_datetime``.
            bid: Bid price of the tick.
            ask: Ask price of the tick.
        """
        print(self.ticks, end=" ")

        recent_tick = pd.to_datetime(time)
        df = pd.DataFrame({self.instrument: (ask + bid) / 2},
                          index=[recent_tick])
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent that works on old and new versions alike.
        self.tick_data = pd.concat([self.tick_data, df])

        # More than one bar length since the last bar: a new bar is complete.
        if recent_tick - self.last_bar > self.bar_length:
            self.resample_and_join()
            self.define_strategy()
            self.execute_trades()

    def resample_and_join(self):
        """Append the completed bars built from the ticks to ``raw_data``."""
        new_bars = self.tick_data.resample(self.bar_length,
                                           label="right").last().ffill().iloc[:-1]
        self.raw_data = pd.concat([self.raw_data, new_bars])
        # Keep only the newest tick; it belongs to the bar still in progress.
        self.tick_data = self.tick_data.iloc[-1:]
        self.last_bar = self.raw_data.index[-1]

    def define_strategy(self):  # "strategy-specific"
        """Recompute indicators and positions on ``raw_data`` into ``data``.

        ``position`` is 1 where the price is below the lower band while
        ``SMA_S`` is above ``SMA_L``, -1 where the price is above the upper
        band while ``SMA_S`` is below ``SMA_L``, and 0 where the price crosses
        the mid line. Other rows carry the previous value forward, and rows
        before the first signal are 0.
        """
        df = self.raw_data.copy()

        # ******************** strategy definition ************************
        price = df[self.instrument]
        band_width = price.rolling(self.SMA).std() * self.dev
        df["SMA"] = price.rolling(self.SMA).mean()
        df["Lower"] = df["SMA"] - band_width
        df["Upper"] = df["SMA"] + band_width
        df["distance"] = price - df.SMA
        df["SMA_S"] = price.rolling(self.SMA_S).mean()
        df["SMA_L"] = price.rolling(self.SMA_L).mean()

        # Both conditions must be combined element-wise with ``&``. Joining
        # two np.where calls with Python's ``and`` drops the first one: the
        # one-argument np.where returns a (truthy) tuple, so only the second
        # call was ever used.
        go_long = (price < df.Lower) & (df["SMA_S"] > df["SMA_L"])
        go_short = (price > df.Upper) & (df["SMA_S"] < df["SMA_L"])
        df["position"] = np.where(go_long, 1, np.nan)
        df["position"] = np.where(go_short, -1, df["position"])
        # A sign change of the distance means the price crossed the mid line.
        df["position"] = np.where(df.distance * df.distance.shift(1) < 0, 0, df["position"])
        df["position"] = df.position.ffill().fillna(0)

        self.data = df.copy()
        # *****************************************************************

    def execute_trades(self):
        """Send the order that moves ``self.position`` to the newest signal.

        Going from flat to long/short trades ``units``; reversing trades
        ``2 * units``; going neutral trades ``units`` in the closing direction.
        """
        target = self.data["position"].iloc[-1]
        if target == 1:
            if self.position == 0:
                order = self.create_order(self.instrument, self.units, suppress=True, ret=True)
                self.report_trade(order, "GOING LONG")
            elif self.position == -1:
                order = self.create_order(self.instrument, self.units * 2, suppress=True, ret=True)
                self.report_trade(order, "GOING LONG")
            self.position = 1
        elif target == -1:
            if self.position == 0:
                order = self.create_order(self.instrument, -self.units, suppress=True, ret=True)
                self.report_trade(order, "GOING SHORT")
            elif self.position == 1:
                order = self.create_order(self.instrument, -self.units * 2, suppress=True, ret=True)
                self.report_trade(order, "GOING SHORT")
            self.position = -1
        elif target == 0:
            if self.position == -1:
                order = self.create_order(self.instrument, self.units, suppress=True, ret=True)
                self.report_trade(order, "GOING NEUTRAL")
            elif self.position == 1:
                order = self.create_order(self.instrument, -self.units, suppress=True, ret=True)
                self.report_trade(order, "GOING NEUTRAL")
            self.position = 0

    def report_trade(self, order, going):
        """Print a summary of ``order`` and add its P&L to ``self.profits``.

        Args:
            order: Mapping with the keys ``"time"``, ``"units"``, ``"price"``
                and ``"pl"``.
            going: Label printed next to the order time, e.g. "GOING LONG".

        Raises:
            KeyError: If ``order`` lacks one of the keys above.
                NOTE(review): presumably the case when the order was not
                filled - confirm against what ``create_order`` returns.
        """
        time = order["time"]
        units = order["units"]
        price = order["price"]
        pl = float(order["pl"])
        self.profits.append(pl)
        cumpl = sum(self.profits)
        print("\n" + 100 * "-")
        print("{} | {}".format(time, going))
        print("{} | units = {} | price = {} | P&L = {} | Cum P&L = {}".format(time, units, price, pl, cumpl))
        print(100 * "-" + "\n")
# Run the strategy live on EUR/GBP with 15-minute bars.
trader = SMABollTrader("oanda.cfg", "EUR_GBP", "15m", SMA=82, dev=4, SMA_S=38, SMA_L=135, units=100000)
trader.get_most_recent()
trader.stream_data(trader.instrument, stop=None)
if trader.position != 0:  # if we have a final open position
    close_order = trader.create_order(trader.instrument, units=-trader.position * trader.units,
                                      suppress=True, ret=True)
    trader.report_trade(close_order, "GOING NEUTRAL")
    # SMABollTrader tracks its exposure in ``position``; reset it so the
    # object reflects the flat state after the closing order.
    trader.position = 0
    trader.signal = 0
I have done Hagmann course as well and I have recognised your code immediately.
Firstly the way you define your positions is not the best. Look at the section of combining two strategies. There are two ways.
Now, regarding your price problem, I had a similar situation with BTC. You can download its historical data, but when I plugged it into the strategy code and started to stream I got exactly the same error, indicating that tick data was never streamed.
I am guessing that simply not all instruments are tradeable via api or in your case maybe you tried to stream beyond trading hours?

CODING Q based on Dataframes and Series and dictionaries

It would be interesting to see if there is any evidence of a link between vaccine effectiveness and sex of the child. Calculate the ratio of the number of children who contracted chickenpox but were vaccinated against it (at least one varicella dose) versus those who were vaccinated but did not contract chicken pox. Return results by sex.
This function should return a dictionary in the form of (use the correct numbers):
{"male":0.2,
"female":0.4}
Note: To aid in verification, the chickenpox_by_sex()['female'] value the autograder is looking for starts with the digits 0.0077.
PLEASE WRITE A FUNCTIONING CODE FOR THE SAME.
Try the following code:
Read the given dataset using the following code
import pandas as pd
df=pd.read_csv('assets/NISPUF17.csv',index_col=0)
df
Main code
def chickenpox_by_sex():
    """Return, per sex, the chickenpox ratio among vaccinated children.

    Uses the module-level ``df``. A child counts as vaccinated when
    ``P_NUMVRC >= 1``. For each sex (``SEX`` 1 = male, 2 = female) the ratio
    is the number of vaccinated children with ``HAD_CPOX == 1`` divided by
    the number with ``HAD_CPOX == 2``.

    Returns:
        ``{"male": <ratio>, "female": <ratio>}``.
    """
    vaccinated = df[df['P_NUMVRC'] >= 1]

    def ratio_for(sex_code):
        # One implementation for both sexes keeps the two results consistent.
        outcomes = vaccinated.loc[vaccinated['SEX'] == sex_code, 'HAD_CPOX']
        return (outcomes == 1).sum() / (outcomes == 2).sum()

    return {"male": ratio_for(1), "female": ratio_for(2)}
Check using the following code
chickenpox_by_sex()['female']
Final code to complete this
assert len(chickenpox_by_sex())==2, "Return a dictionary with two items, the first for males and the second for females."
=> [SEX] -> sex=1 (male); sex=2 (female)
=> [HAD_CPOX] -> contracted chicken pox = 1; not contracted chicken pox = 2
=> [P_NUMVRC]>=1 -> given one or more doses
*ratio(male) = (vaccinated and contracted chicken pox)/(vaccinated and not contracted chicken pox)
*ratio(female) = (vaccinated and contracted chicken pox)/(vaccinated and not contracted chicken pox)
Variable names:
male - male data frame
vac_m - vaccinated male
cp_m - vaccinated and contracted chickenpox (male)
counts_cp_m - counts of vaccinated and contracted chickenpox
ncp_m - vaccinated and not contracted chickenpox (male)
counts_ncp_m - vaccinated and not contracted chickenpox
Similarly for females.
CORRECT SOLUTION.
def chickenpox_by_sex():
    """Return, per sex, the chickenpox ratio among vaccinated children.

    Reads ``NISPUF17.csv`` from the working directory. A child counts as
    vaccinated when ``P_NUMVRC >= 1``. For each sex (``SEX`` 1 = male,
    2 = female) the ratio is the number of vaccinated children with
    ``HAD_CPOX == 1`` divided by the number with ``HAD_CPOX == 2``.

    Returns:
        ``{"male": <ratio>, "female": <ratio>}``.
    """
    import pandas as pd

    df = pd.read_csv("NISPUF17.csv")
    vaccinated = df[df["P_NUMVRC"] >= 1]

    # The result is built under its own name rather than ``dict``, which
    # would hide the builtin for the rest of the function.
    ratios = {}
    for label, sex_code in (("male", 1), ("female", 2)):
        outcomes = vaccinated.loc[vaccinated["SEX"] == sex_code, "HAD_CPOX"]
        ratios[label] = (outcomes == 1).sum() / (outcomes == 2).sum()
    return ratios
The following code works as well:
import pandas as pd
import numpy as np
import math
def chickenpox_by_sex():
    """Return the had-chickenpox / did-not ratio of vaccinated children by sex.

    Reads ``assets/NISPUF17.csv``. Children with ``P_NUMVRC > 0`` count as
    vaccinated; ``SEX`` 1 is reported as ``'male'`` and 2 as ``'female'``.
    """
    survey = pd.read_csv('assets/NISPUF17.csv')
    dosed = survey[survey['P_NUMVRC'] > 0]

    def had_over_not(sex_code):
        outcomes = dosed[dosed['SEX'] == sex_code]['HAD_CPOX']
        # The denominator is counted first, then the ratio is formed.
        not_infected = len(outcomes[outcomes == 2])
        return len(outcomes[outcomes == 1]) / not_infected

    male_ratio = had_over_not(1)
    female_ratio = had_over_not(2)
    return {'male': male_ratio, 'female': female_ratio}
chickenpox_by_sex()
import pandas as pd
def chickenpox_by_sex():
    """Return, per sex, the chickenpox ratio among vaccinated children.

    Reads ``assets/NISPUF17.csv``. A child counts as vaccinated when
    ``P_NUMVRC >= 1``, i.e. any number of doses from one upwards. For each
    sex (``SEX`` 1 = male, 2 = female) the ratio is the number of vaccinated
    children with ``HAD_CPOX == 1`` divided by the number with
    ``HAD_CPOX == 2``.

    Returns:
        ``{'male': <ratio>, 'female': <ratio>}`` as plain ratios, not
        percentages, which is the form the task statement asks for.

    Raises:
        ZeroDivisionError: If a sex has no vaccinated child with
            ``HAD_CPOX == 2``.
    """
    df = pd.read_csv('assets/NISPUF17.csv')
    # Rows with a missing dose count fail the comparison and drop out here;
    # other HAD_CPOX codes (e.g. 77, 99) simply match neither count below,
    # so no recoding of the columns is needed.
    vaccinated = df[df['P_NUMVRC'] >= 1]

    result = {}
    for label, sex_code in (('male', 1), ('female', 2)):
        outcomes = vaccinated.loc[vaccinated['SEX'] == sex_code, 'HAD_CPOX']
        had = int((outcomes == 1).sum())
        had_not = int((outcomes == 2).sum())
        result[label] = had / float(had_not)
    return result
import pandas as pd
import numpy as np
df = pd.read_csv('assets/NISPUF17.csv', usecols = ['HAD_CPOX', 'SEX', 'P_NUMVRC']).dropna().reset_index()
def chickenpox_by_sex():
    """Return the rounded chickenpox ratio of vaccinated children by sex.

    Works on the module-level ``df``. Children with ``P_NUMVRC > 0.0`` count
    as vaccinated; for each sex the number with ``HAD_CPOX == 1`` is divided
    by the number with ``HAD_CPOX == 2`` and rounded to four decimals.
    """
    vaccinated = df[df.P_NUMVRC > 0.0]

    def rounded_ratio(sex_code):
        outcomes = vaccinated[vaccinated.SEX == sex_code].HAD_CPOX
        infected = int((outcomes == 1).sum())
        spared = int((outcomes == 2).sum())
        return round(infected / spared, ndigits=4)

    # Females (SEX == 2) are evaluated before males (SEX == 1).
    female_ratio = rounded_ratio(2)
    male_ratio = rounded_ratio(1)
    return {"male": male_ratio, "female": female_ratio}
chickenpox_by_sex()

More efficient fillna(numpy)

I need an array version of a function similar to Pandas.fillna. I collected a lot of answers from the forum to create the following function, but it is still 3 times slower than Pandas.fillna. I want to know if there is a better way to optimize it, thank you.
def fillna(self, axis=None, mask=None, value=None, method='pad'):
    """Fill the invalid entries of a 1-D or 2-D array.

    Parameters
    ----------
    self : array_like, 1-D or 2-D
        Data to fill; converted with ``np.asarray``.
    axis : {0, 1}, optional
        Axis along which 2-D data is filled or averaged. ``None`` is treated
        like 0. Ignored for 1-D data.
    mask : array_like of bool, optional
        True where an entry is valid. Defaults to ``np.isfinite(x)``.
    value : scalar, optional
        Constant written into every invalid entry. Used whenever it is given,
        unless ``method == 'mean'``.
    method : {'pad', 'back', 'mean'}
        'pad' copies the last valid entry forward, 'back' copies the next
        valid entry backward, 'mean' writes the NaN-ignoring mean (of the
        whole array for 1-D data, of each row/column for 2-D data).

    Returns
    -------
    numpy.ndarray
        The filled array. With 'pad'/'back', invalid entries that have no
        valid entry before/after them receive the first/last element of their
        lane unchanged. A 1-D input with an unrecognised ``method`` is
        returned as is.
    """
    x = np.asarray(self)
    if mask is None:
        mask = np.isfinite(x)
    else:
        # ``~`` is a logical NOT only on boolean arrays; on an integer mask
        # it would be a bitwise NOT and select the wrong entries.
        mask = np.asarray(mask, dtype=bool)

    if (value is not None) or (method == 'mean'):
        missing = ~mask
        out = x.copy()
        if method != 'mean':
            # Constant fill, for 1-D and 2-D input alike.
            out[missing] = value
        elif x.ndim == 1:
            out[missing] = np.nanmean(x)
        else:
            means = np.nanmean(x, 1)[:, None] if axis == 1 else np.nanmean(x, 0)
            out[missing] = np.broadcast_to(means, x.shape)[missing]
        return out

    if axis is None:
        axis = 0
    back = method == 'back'

    if x.ndim == 1:
        size = mask.shape[0]
        if method == 'pad':
            # Position of each valid entry (0 where invalid); the running
            # maximum is then the position of the last valid entry so far.
            idx = np.where(mask, np.arange(size), 0)
            np.maximum.accumulate(idx, axis=0, out=idx)
            return x[idx]
        if back:
            # Same trick on the reversed mask, mapped back to forward order.
            idx = np.where(mask[::-1], np.arange(size), 0)
            np.maximum.accumulate(idx, axis=0, out=idx)
            return x[size - idx[::-1] - 1]
        return x

    if axis == 1:
        if back:
            mask = mask[:, ::-1]
        idx = np.where(mask, np.arange(mask.shape[1]), 0)
    else:
        if back:
            mask = mask[::-1, :]
        idx = np.where(mask, np.arange(mask.shape[0])[:, None], 0)
    np.maximum.accumulate(idx, axis=axis, out=idx)

    if axis == 1:
        if back:
            idx = idx.shape[1] - idx[:, ::-1] - 1
        out = x[np.arange(idx.shape[0])[:, None], idx]
    else:
        if back:
            idx = idx.shape[0] - idx[::-1, :] - 1
        out = x[idx, np.arange(idx.shape[1])]
    return out