Error "could not convert string to float: 'PG-13'" - how to fix it? - pandas

I am building a recommendation engine from a database from Kaggle.
# Load the Kaggle Netflix table and keep only the columns used below.
# The earlier drop of 'ratingdescription' and the one-hot encoding of
# 'rating_level' are omitted: neither result survived this column selection.
df = pd.read_csv("netflix.csv")
df = df[['title', 'rating', 'release_year', 'user_rating_score', 'user_rating_size']]
# Drop incomplete rows and renumber them so row labels equal row positions;
# the neighbour indices returned by kneighbors() are row positions.
df = df.dropna().reset_index(drop=True)
# 'rating' holds text labels such as 'PG-13'. NearestNeighbors needs numeric
# input, so fitting on the raw column raised
# "ValueError: could not convert string to float: 'PG-13'".
# Integer category codes keep a single 'rating' column that can still be
# compared for equality; one-hot encoding (pd.get_dummies) is the alternative
# when the arbitrary ordering of the codes should not influence distances.
df['rating'] = df['rating'].astype('category').cat.codes
# 'title' is left as text on purpose: it is only an identifier, it is dropped
# before fitting, and recommend()/evaluate() are called with the title string.
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(df.drop(['title'], axis=1))
def recommend(title, df, model_knn):
    """Print the titles of the nearest neighbours of ``title``.

    Parameters
    ----------
    title : value looked up in ``df['title']`` (the first match is used).
    df : DataFrame with a 'title' column; its remaining columns are passed to
        ``model_knn.kneighbors`` and therefore must be the columns, in the
        same row order, that the model was fitted on.
    model_knn : fitted neighbour model exposing ``kneighbors``.

    Raises
    ------
    IndexError
        If ``title`` does not occur in ``df['title']``.
    """
    # Work with row positions throughout: kneighbors() reports positions in
    # the fitted matrix, which differ from index labels once rows were dropped.
    match_positions = (df['title'] == title).to_numpy().nonzero()[0]
    if match_positions.size == 0:
        raise IndexError(f"title {title!r} not found in df['title']")
    query_pos = int(match_positions[0])

    query_features = df.iloc[[query_pos]].drop(['title'], axis=1)
    # Six neighbours are requested so that five remain after skipping the
    # query row itself.
    _, indices = model_knn.kneighbors(query_features, n_neighbors=6)

    for neighbour_pos in indices.flatten():
        if neighbour_pos == query_pos:
            continue
        # The 'title' column already holds the value to show; no decoding
        # through category codes is needed.
        print('Recommendation:', df['title'].iloc[neighbour_pos])
def evaluate(title, df, model_knn):
    """Print precision and recall of the recommendations for ``title``.

    A recommended row counts as relevant when its 'rating' equals the rating
    of the query row. Precision is the share of recommended rows that are
    relevant; recall is the share of relevant rows that were recommended.
    The two lists compared previously had different lengths, which
    ``precision_recall_fscore_support`` cannot score, so the ratios are
    computed directly from the two sets of row positions.

    Parameters
    ----------
    title : value looked up in ``df['title']`` (the first match is used).
    df : DataFrame with 'title' and 'rating' columns; every column except
        'title' is passed to ``model_knn.kneighbors``.
    model_knn : fitted neighbour model exposing ``kneighbors``.

    Raises
    ------
    IndexError
        If ``title`` does not occur in ``df['title']``.
    """
    match_positions = (df['title'] == title).to_numpy().nonzero()[0]
    if match_positions.size == 0:
        raise IndexError(f"title {title!r} not found in df['title']")
    query_pos = int(match_positions[0])

    query_features = df.iloc[[query_pos]].drop(['title'], axis=1)
    _, indices = model_knn.kneighbors(query_features, n_neighbors=6)

    # Row positions are used as identifiers so duplicate titles cannot
    # collapse into one entry.
    recommended = {int(pos) for pos in indices.flatten()} - {query_pos}
    same_rating = (df['rating'] == df['rating'].iloc[query_pos]).to_numpy().nonzero()[0]
    relevant = {int(pos) for pos in same_rating} - {query_pos}

    hits = len(recommended & relevant)
    precision = hits / len(recommended) if recommended else 0.0
    recall = hits / len(relevant) if relevant else 0.0
    print('Precision:', precision)
    print('Recall:', recall)
recommend("The Shawshank Redemption", df, model_knn)
evaluate("The Shawshank Redemption", df, model_knn)
I have tried altering the code many times, but I get either the error below or
"KeyError: 'rating_level'", which indicates that the column "rating_level" is not found in the dataframe **df**.
The error I currently receive is:
`ValueError: could not convert string to float: 'PG-13'`

Related

return pandas dataframe from function

I want to return a dataframe from this function, which can be used elsewhere (for plotly graph to be exact).
My idea is to use the dataframe I can create with points_sum(), save it as the team name, and then use that dataframe in my px.line(dataframe = team_name).
In essence, I want to use the men_points_df variable after I created it.
def points_sum(team):
    """Return the running points total of ``team`` after each of its matches.

    Reads the module-level ``menscore_df`` row by row. A win adds 2 points,
    a draw adds 1 and a loss adds nothing.

    Parameters
    ----------
    team : value compared against the 'hometeam' and 'awayteam' columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'Date' (``str(row['date'])``) and 'Points' (cumulative total),
        one row per match the team played; empty when the team never appears.
    """
    points = 0
    men_points = []
    # (team column, goals scored by that side, goals scored by the opponent)
    sides = (
        ('hometeam', 'homegoals', 'awaygoals'),
        ('awayteam', 'awaygoals', 'homegoals'),
    )
    for _, row in menscore_df.iterrows():
        for team_col, own_goals, opp_goals in sides:
            if row[team_col] != team:
                continue
            if row[own_goals] > row[opp_goals]:
                points += 2
            elif row[own_goals] == row[opp_goals]:
                points += 1
            # A loss leaves the total unchanged but is still recorded.
            men_points.append([str(row['date']), points])
    men_points_df = pd.DataFrame(men_points, columns=["Date", 'Points'])
    return men_points_df
In plotly, I am trying to use my new dataframe (men_points_df) as shown below, but I get an "undefined name" error, even though I can build and print it in the console: for example, `test = points_sum("FIF")` (FIF is one of the team names) shows the correct dataframe when I type `test`:
elif pathname == "/page-3":
return [html.H1('Seasonal performance',
style={'textAlign':'center'}),
html.Div(
children=[
html.H2('Select team',style={'textAlign':'center'}),
html.Br(),
html.Br(),
dcc.Dropdown(
id='team_dd',
options=[{'label': v, 'value': k} for k,v in teams_all.items()],
)]),
dcc.Graph(id="performance_graph")
]
Output(component_id="performance_graph", component_property="figure"),
Input(component_id="team_dd", component_property="value")
# Callback body from the question: returns a line chart of "Points" over
# "Date", titled with the selected dropdown value (or "none selected").
# The listing has lost its indentation.
def update_graph(option_selected):
title = "none selected"
if option_selected:
title = option_selected
line_fig = px.line(
# NOTE(review): `test` is neither a parameter nor assigned in this function,
# so it only resolves if a module-level `test` exists; the question reports
# an "undefined name" error for it.
test, # <------------ THIS IS THE ISSUE
title = f"{title}",
x = "Date", y = "Points")
return line_fig
Just call points_sum in the update_graph function, before you use test:
def update_graph(option_selected):
    """Return the points-over-time line chart for the team picked in the dropdown.

    Parameters
    ----------
    option_selected : current value of the 'team_dd' dropdown; falsy when
        nothing is selected.

    Returns
    -------
    The figure built by ``px.line`` with "Date" on x and "Points" on y.
    """
    title = "none selected"
    if option_selected:
        title = option_selected
    # vvv Here vvv
    # Build the frame inside the callback, for the selected team rather than
    # a hard-coded one.
    # NOTE(review): assumes the dropdown values are the team names stored in
    # menscore_df -- confirm. With nothing selected points_sum() finds no
    # matches and returns an empty frame; presumably px.line then draws an
    # empty chart -- verify.
    test = points_sum(option_selected)
    line_fig = px.line(
        test,
        title = f"{title}",
        x = "Date", y = "Points")
    return line_fig

Oanda API - Issue Price - Instruments

I'm using Oanda API to automate Trading strategies, I have a 'price' error that only occurs when selecting some instruments such as XAG (silver), my guess is that there is a classification difference but Oanda is yet to answer on the matter.
The error does not occur when selecting Forex pairs.
If anyone has had such issues in the past and managed to solve them, I'll be happy to hear from them.
PS: I'm UK based and have access to most products including CFDs
class SMABollTrader(tpqoa.tpqoa):
    """Streaming trader driven by Bollinger bands combined with an SMA comparison.

    History and ticks are resampled into bars of ``bar_length``. After every
    completed bar the target position is recomputed in ``define_strategy``
    and ``execute_trades`` sends the orders needed to reach it through the
    base class's ``create_order``.
    """

    def __init__(self, conf_file, instrument, bar_length, SMA, dev, SMA_S, SMA_L, units):
        """Store the configuration and initialise empty state.

        Parameters
        ----------
        conf_file : passed to the tpqoa base class.
        instrument : instrument name, also used as the price column name.
        bar_length : anything ``pd.to_timedelta`` accepts (e.g. "15m").
        SMA : window of the Bollinger mid line and of its standard deviation.
        dev : number of standard deviations between mid line and each band.
        SMA_S, SMA_L : windows of the short and long moving averages.
        units : order size for opening a position from neutral.
        """
        super().__init__(conf_file)
        self.instrument = instrument
        self.bar_length = pd.to_timedelta(bar_length)
        self.tick_data = pd.DataFrame()
        self.raw_data = None
        self.data = None
        self.last_bar = None
        self.units = units
        self.position = 0
        self.profits = []
        self.price = []

        #*****************add strategy-specific attributes here******************
        self.SMA = SMA
        self.dev = dev
        self.SMA_S = SMA_S
        self.SMA_L = SMA_L
        #************************************************************************

    def get_most_recent(self, days = 5):
        """Load the last ``days`` days of history into ``raw_data`` as complete bars.

        The download is repeated until the newest complete bar is less than
        one ``bar_length`` old at the time of the check.
        """
        while True:
            time.sleep(2)
            now = datetime.utcnow()
            now = now - timedelta(microseconds = now.microsecond)
            past = now - timedelta(days = days)
            df = self.get_history(instrument = self.instrument, start = past, end = now,
                                  granularity = "S5", price = "M", localize = False).c.dropna().to_frame()
            df.rename(columns = {"c":self.instrument}, inplace = True)
            # iloc[:-1] drops the last bar, which is still being formed
            df = df.resample(self.bar_length, label = "right").last().dropna().iloc[:-1]
            self.raw_data = df.copy()
            self.last_bar = self.raw_data.index[-1]
            if pd.to_datetime(datetime.utcnow()).tz_localize("UTC") - self.last_bar < self.bar_length:
                break

    def on_success(self, time, bid, ask):
        """Record one tick (mid price) and trade once a new bar has completed."""
        print(self.ticks, end = " ")

        recent_tick = pd.to_datetime(time)
        df = pd.DataFrame({self.instrument:(ask + bid)/2},
                          index = [recent_tick])
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
        self.tick_data = pd.concat([self.tick_data, df])

        if recent_tick - self.last_bar > self.bar_length:
            self.resample_and_join()
            self.define_strategy()
            self.execute_trades()

    def resample_and_join(self):
        """Append the completed bars built from ``tick_data`` to ``raw_data``.

        Only the newest tick is kept afterwards, and ``last_bar`` is moved to
        the newest bar.
        """
        new_bars = self.tick_data.resample(self.bar_length,
                                           label="right").last().ffill().iloc[:-1]
        self.raw_data = pd.concat([self.raw_data, new_bars])
        self.tick_data = self.tick_data.iloc[-1:]
        self.last_bar = self.raw_data.index[-1]

    def define_strategy(self): # "strategy-specific"
        """Compute indicators and the target position for every bar into ``data``."""
        df = self.raw_data.copy()

        #******************** define your strategy here ************************
        df["SMA"] = df[self.instrument].rolling(self.SMA).mean()
        df["Lower"] = df["SMA"] - df[self.instrument].rolling(self.SMA).std() * self.dev
        df["Upper"] = df["SMA"] + df[self.instrument].rolling(self.SMA).std() * self.dev
        df["distance"] = df[self.instrument] - df.SMA
        df["SMA_S"] = df[self.instrument].rolling(self.SMA_S).mean()
        df["SMA_L"] = df[self.instrument].rolling(self.SMA_L).mean()

        # Both conditions must hold element-wise, so they are combined with &.
        # `np.where(a) and np.where(b, ...)` evaluated to the second call
        # only, which silently dropped the Bollinger-band condition.
        long_entry = (df[self.instrument] < df.Lower) & (df["SMA_S"] > df["SMA_L"])
        short_entry = (df[self.instrument] > df.Upper) & (df["SMA_S"] < df["SMA_L"])
        df["position"] = np.where(long_entry, 1, np.nan)
        df["position"] = np.where(short_entry, -1, df["position"])
        # go neutral when the price crosses the mid line (distance changes sign)
        df["position"] = np.where(df.distance * df.distance.shift(1) < 0, 0, df["position"])
        # hold the last signal; bars before the first signal are neutral
        df["position"] = df.position.ffill().fillna(0)

        self.data = df.copy()
        #***********************************************************************

    def execute_trades(self):
        """Send the order that moves ``self.position`` to the latest target position.

        Switching directly between long and short trades twice ``units``.
        """
        if self.data["position"].iloc[-1] == 1:
            # was `== 0 or None`; the `or None` part never affected the result
            if self.position == 0:
                order = self.create_order(self.instrument, self.units, suppress = True, ret = True)
                self.report_trade(order, "GOING LONG")
            elif self.position == -1:
                order = self.create_order(self.instrument, self.units * 2, suppress = True, ret = True)
                self.report_trade(order, "GOING LONG")
            self.position = 1
        elif self.data["position"].iloc[-1] == -1:
            if self.position == 0:
                order = self.create_order(self.instrument, -self.units, suppress = True, ret = True)
                self.report_trade(order, "GOING SHORT")
            elif self.position == 1:
                order = self.create_order(self.instrument, -self.units * 2, suppress = True, ret = True)
                self.report_trade(order, "GOING SHORT")
            self.position = -1
        elif self.data["position"].iloc[-1] == 0:
            if self.position == -1:
                order = self.create_order(self.instrument, self.units, suppress = True, ret = True)
                self.report_trade(order, "GOING NEUTRAL")
            elif self.position == 1:
                order = self.create_order(self.instrument, -self.units, suppress = True, ret = True)
                self.report_trade(order, "GOING NEUTRAL")
            self.position = 0

    def report_trade(self, order, going):
        """Print one trade and the cumulative P&L, and record its P&L in ``profits``."""
        # NOTE(review): these lookups raise KeyError when the returned
        # transaction has no "price"/"pl" entry -- presumably the case when
        # an order is not filled, and a plausible source of the 'price'
        # error described for some instruments; confirm what create_order
        # returns in that case.
        timestamp = order["time"]
        units = order["units"]
        price = order["price"]
        pl = float(order["pl"])
        self.profits.append(pl)
        cumpl = sum(self.profits)
        print("\n" + 100* "-")
        print("{} | {}".format(timestamp, going))
        print("{} | units = {} | price = {} | P&L = {} | Cum P&L = {}".format(timestamp, units, price, pl, cumpl))
        print(100 * "-" + "\n")
# Run the strategy: configure the trader, load recent history, then start
# streaming ticks for the instrument.
trader = SMABollTrader("oanda.cfg", "EUR_GBP", "15m", SMA = 82, dev = 4, SMA_S = 38, SMA_L = 135, units = 100000)
trader.get_most_recent()
trader.stream_data(trader.instrument, stop = None )
# Once streaming returns, offset any remaining long/short position with an
# order of the opposite sign.
if trader.position != 0: # if we have a final open position
close_order = trader.create_order(trader.instrument, units = -trader.position * trader.units,
suppress = True, ret = True)
trader.report_trade(close_order, "GOING NEUTRAL")
# NOTE(review): `signal` is not read anywhere in the class shown above;
# presumably `trader.position = 0` was intended -- confirm.
trader.signal = 0
I have done Hagmann course as well and I have recognised your code immediately.
Firstly the way you define your positions is not the best. Look at the section of combining two strategies. There are two ways.
Now, regarding your price problem: I had a similar situation with BTC. You can download its historical data, but when I plugged it into the strategy code and started to stream, I got exactly the same error, indicating that tick data was never streamed.
I am guessing that simply not all instruments are tradeable via api or in your case maybe you tried to stream beyond trading hours?

CODING Q based on Dataframes and Series and dictionaries

It would be interesting to see if there is any evidence of a link between vaccine effectiveness and sex of the child. Calculate the ratio of the number of children who contracted chickenpox but were vaccinated against it (at least one varicella dose) versus those who were vaccinated but did not contract chicken pox. Return results by sex.
This function should return a dictionary in the form of (use the correct numbers):
{"male":0.2,
"female":0.4}
Note: To aid in verification, the chickenpox_by_sex()['female'] value the autograder is looking for starts with the digits 0.0077.
PLEASE WRITE A FUNCTIONING CODE FOR THE SAME.
Try the following code:
Read the given dataset using the following code
import pandas as pd
df=pd.read_csv('assets/NISPUF17.csv',index_col=0)
df
Main code
def chickenpox_by_sex():
    """Return, per sex, the ratio of vaccinated children who had chickenpox
    to vaccinated children who did not.

    Reads the module-level ``df``. A child counts as vaccinated when
    P_NUMVRC >= 1; HAD_CPOX == 1 means the child had chickenpox and 2 means
    it did not; SEX is 1 for male and 2 for female.

    Returns
    -------
    dict
        ``{"male": ratio, "female": ratio}``.
    """
    def _ratio(sex_code):
        # vaccinated children of one sex, split by chickenpox outcome
        vaccinated = df[(df['SEX'] == sex_code) & (df['P_NUMVRC'] >= 1)]
        had_cpox = (vaccinated['HAD_CPOX'] == 1).sum()
        no_cpox = (vaccinated['HAD_CPOX'] == 2).sum()
        return had_cpox / no_cpox

    # The `raise NotImplementedError()` placeholder that followed the return
    # statement could never run and has been removed.
    ratio_dict = {"male": _ratio(1), "female": _ratio(2)}
    return ratio_dict
Check using the following code
chickenpox_by_sex()['female']
Final code to complete this
assert len(chickenpox_by_sex())==2, "Return a dictionary with two items, the first for males and the second for females."
=> [SEX] -> sex=1 (male); sex=2 (female)
=> [HAD_CPOX] -> contracted chicken pox = 1; not contracted chicken pox = 2
=> [P_NUMVRC]>=1 -> given one or more doses
*ratio(male) = (vaccinated and contracted chicken pox)/(vaccinated and not contracted chicken pox)
*ratio(female) = (vaccinated and contracted chicken pox)/(vaccinated and not contracted chicken pox)
Variable names:
male - male data frame
vac_m - vaccinated male
cp_m - vaccinated and contracted chickenpox (male)
counts_cp_m - counts of vaccinated and contracted chickenpox
ncp_m - vaccinated and not contracted chickenpox (male)
counts_ncp_m - vaccinated and not contracted chickenpox
Similarly for females.
CORRECT SOLUTION.
def chickenpox_by_sex():
    """Return, per sex, the ratio of vaccinated children who had chickenpox
    to vaccinated children who did not.

    Reads "NISPUF17.csv" from the working directory. Vaccinated means
    P_NUMVRC >= 1; HAD_CPOX is 1 for "had chickenpox" and 2 for "did not";
    SEX is 1 for male and 2 for female.

    Returns
    -------
    dict
        ``{"male": ratio, "female": ratio}``.
    """
    import pandas as pd

    df = pd.read_csv("NISPUF17.csv")
    vaccinated = df[df["P_NUMVRC"] >= 1]

    # Named `ratios` rather than `dict`, which would shadow the builtin.
    ratios = {}
    for label, sex_code in (("male", 1), ("female", 2)):
        outcomes = vaccinated.loc[vaccinated["SEX"] == sex_code, "HAD_CPOX"]
        ratios[label] = (outcomes == 1).sum() / (outcomes == 2).sum()
    return ratios
The following code works as well:
import pandas as pd
import numpy as np
import math
def chickenpox_by_sex():
    """Compute the had-chickenpox / did-not ratio among vaccinated children.

    Loads 'assets/NISPUF17.csv', keeps rows with P_NUMVRC > 0 and, for SEX 1
    (male) and SEX 2 (female), divides the number of rows with HAD_CPOX == 1
    by the number with HAD_CPOX == 2.

    Returns
    -------
    dict
        ``{'male': ratio, 'female': ratio}``.
    """
    table = pd.read_csv('assets/NISPUF17.csv')
    vaccinated = table.loc[table['P_NUMVRC'] > 0]

    ratios = {}
    for label, sex_code in (('male', 1), ('female', 2)):
        group = vaccinated.loc[vaccinated['SEX'] == sex_code]
        without_pox = len(group.loc[group['HAD_CPOX'] == 2])
        with_pox = len(group.loc[group['HAD_CPOX'] == 1])
        ratios[label] = with_pox / without_pox
    return ratios
chickenpox_by_sex()
import pandas as pd
def chickenpox_by_sex():
    """Return, per sex, the ratio of vaccinated children who had chickenpox
    to vaccinated children who did not.

    Loads 'assets/NISPUF17.csv'. Vaccinated means at least one varicella dose
    (P_NUMVRC >= 1), which also leaves out rows without a dose count. Only
    HAD_CPOX answers 1 (had it) and 2 (did not) are counted, so any other
    code such as 77 or 99 drops out of both counts.

    The plain ratio is returned rather than a percentage, matching the
    ``{"male": 0.2, "female": 0.4}`` form the task asks for.

    Returns
    -------
    dict
        ``{'male': ratio, 'female': ratio}``.

    Raises
    ------
    ZeroDivisionError
        If a sex has no vaccinated child recorded as not having had chickenpox.
    """
    df = pd.read_csv('assets/NISPUF17.csv')
    # Filter on the numeric codes directly instead of overwriting them with
    # text labels, which mixed strings into numeric columns.
    vaccinated = df[df['P_NUMVRC'] >= 1]

    result = {}
    for label, sex_code in (('male', 1), ('female', 2)):
        outcomes = vaccinated.loc[vaccinated['SEX'] == sex_code, 'HAD_CPOX']
        pox_yes = int((outcomes == 1).sum())
        pox_no = int((outcomes == 2).sum())
        result[label] = pox_yes / float(pox_no)
    return result
import pandas as pd
import numpy as np
df = pd.read_csv('assets/NISPUF17.csv', usecols = ['HAD_CPOX', 'SEX', 'P_NUMVRC']).dropna().reset_index()
def chickenpox_by_sex():
    """Ratio of had-chickenpox to did-not among vaccinated children, by sex.

    Uses the module-level ``df``. For SEX 2 (female) and SEX 1 (male) it
    counts rows with P_NUMVRC > 0.0 whose HAD_CPOX is 1 and divides by the
    count of those whose HAD_CPOX is 2.

    Returns
    -------
    dict
        ``{"male": ratio, "female": ratio}``, each rounded to 4 digits.
    """
    def _ratio(sex_code):
        vaccinated = df[(df.SEX == sex_code) & (df.P_NUMVRC > 0.0)]
        had = vaccinated[vaccinated.HAD_CPOX == 1]
        not_had = vaccinated[vaccinated.HAD_CPOX == 2]
        return len(had) / len(not_had)

    # females are evaluated first, as before
    female_ratio = _ratio(2)
    male_ratio = _ratio(1)
    return {
        "male": round(male_ratio, ndigits=4),
        "female": round(female_ratio, ndigits=4),
    }
chickenpox_by_sex()

Finding top 3 dominant topics for LDA topic model

I am creating a datatable via this LDA modeling tutorial, (https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/) and instead of just finding the single most dominant topic, I want to expand to find the top 3 most dominant topics, along with each of their percent contributions and topic keywords.
To do that, is it best to create 2 additional functions to create 3 separate dataframes, and append each of the results? Or is there a simpler way to modify the format_topics_sentence function to pull the top 3 topics from the enumerated bag of words corpus?
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """Return one row per document with its dominant topic and the text.

    Parameters
    ----------
    ldamodel : model that yields per-document (topic, proportion) pairs when
        indexed with ``corpus`` and provides ``per_word_topics`` and
        ``show_topic``.
    corpus : the documents to score.
    texts : original texts, appended as the last column.

    Returns
    -------
    pandas.DataFrame
        Columns 'Dominant_Topic', 'Perc_Contribution' (rounded to 4 digits)
        and 'Topic_Keywords', followed by the column holding ``texts``.
    """
    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for row_list in ldamodel[corpus]:
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # Keep a placeholder so rows stay aligned with `texts`.
            rows.append([None, None, None])
            continue
        # max() picks the same entry as the head of a stable descending sort
        topic_num, prop_topic = max(row, key=lambda x: x[1])
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join([word for word, prop in wp])
        rows.append([int(topic_num), round(prop_topic, 4), topic_keywords])

    sent_topics_df = pd.DataFrame(
        rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)
# Build the per-document dominant-topic table for the fitted model.
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_ready)
# Format
# reset_index() turns the row index into the first column, which is then
# labelled 'Document_No' together with friendlier names for the rest.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
# Notebook-style preview of the first ten rows.
df_dominant_topic.head(10)
table output
I had a similar requirement in a recent project, hopefully this helps you out, you will need to add topic keywords to below code:
# Collect the three strongest topics of every document.
# One list per rank (1st, 2nd, 3rd); each receives exactly one
# [topic, contribution] entry per document, so the three frames always have
# the same number of rows. Building the frames once from lists replaces the
# per-row DataFrame.append calls (removed in pandas 2.0).
ranked_rows = ([], [], [])
for row_list in lda_model[corpus]:
    row = row_list[0] if lda_model.per_word_topics else row_list
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    for rank, bucket in enumerate(ranked_rows):
        if rank < len(row):
            topic_num, prop_topic = row[rank]
            bucket.append([int(topic_num), prop_topic])
        else:
            # fewer topics than ranks: pad with placeholders
            bucket.append(['-', '-'])

topics_df1 = pd.DataFrame(ranked_rows[0], columns=['1st Topic', '1st Topic Contribution'])
topics_df2 = pd.DataFrame(ranked_rows[1], columns=['2nd Topic', '2nd Topic Contribution'])
topics_df3 = pd.DataFrame(ranked_rows[2], columns=['3rd Topic', '3rd Topic Contribution'])

topics_comb = pd.concat([topics_df1, topics_df2, topics_df3], axis=1, sort=False)

#Join topics dataframe to original data
new_df = pd.concat([data_ready, topics_comb], axis=1, sort=False)

More efficient fillna(numpy)

I need an array version of a function similar to pandas' fillna. From answers on the forum I put together the following function, but it is still about 3 times slower than pandas' fillna. I would like to know if there is a better way to optimize it. Thank you.
def fillna(self, axis=None, mask=None, value=None, method='pad'):
    """Fill the invalid entries of a 1-D or 2-D array.

    Parameters
    ----------
    self : array_like, 1-D or 2-D
        Input data; it is not modified.
    axis : {0, 1}, optional
        Axis to fill along for 2-D input. ``None`` means 0 (fill down the
        columns; for 'mean', use column means).
    mask : boolean ndarray, optional
        True where an entry is valid. Defaults to ``np.isfinite(x)``.
    value : scalar, optional
        Constant written into every invalid entry. Ignored when
        ``method='mean'``.
    method : {'pad', 'back', 'mean'}
        'pad' copies the previous valid entry forward, 'back' copies the next
        valid entry backward, 'mean' writes the NaN-ignoring mean (of the
        whole array for 1-D input, per column or row for 2-D input).
        Only used for 'mean' when ``value`` is given.

    Returns
    -------
    ndarray
        Filled array. Entries with no valid neighbour in the fill direction
        keep their original content. A 1-D input with an unrecognised method
        is returned unchanged; for 2-D input it is treated like 'pad'.
    """
    x = np.asarray(self)
    if mask is None:
        mask = np.isfinite(x)

    if value is not None or method == 'mean':
        missing = ~mask
        out = x.copy()
        if method != 'mean':
            # Explicit constant: applies to 1-D and 2-D alike (2-D input used
            # to be filled with the mean even when a value was supplied).
            out[missing] = value
        elif x.ndim == 1:
            out[missing] = np.nanmean(x)
        else:
            means = np.nanmean(x, 1)[:, None] if axis == 1 else np.nanmean(x, 0)
            out[missing] = np.broadcast_to(means, x.shape)[missing]
        return out

    if axis is None:
        axis = 0

    # Forward/backward fill: for each position find the index of the most
    # recent valid entry with a running maximum, then gather from x.
    if x.ndim == 1:
        n = mask.shape[0]
        if method == 'pad':
            idx = np.where(mask, np.arange(n), 0)
            np.maximum.accumulate(idx, axis=0, out=idx)
            return x[idx]
        if method == 'back':
            # same trick on the reversed mask, mapped back to original order
            idx = np.where(mask[::-1], np.arange(n), 0)
            np.maximum.accumulate(idx, axis=0, out=idx)
            return x[n - idx[::-1] - 1]
        return x

    if axis == 1:
        if method == 'back':
            mask = mask[:, ::-1]
        idx = np.where(mask, np.arange(mask.shape[1]), 0)
    else:
        if method == 'back':
            mask = mask[::-1, :]
        idx = np.where(mask, np.arange(mask.shape[0])[:, None], 0)
    np.maximum.accumulate(idx, axis=axis, out=idx)

    if axis == 1:
        if method == 'back':
            idx = idx.shape[1] - idx[:, ::-1] - 1
        out = x[np.arange(idx.shape[0])[:, None], idx]
    else:
        if method == 'back':
            idx = idx.shape[0] - idx[::-1, :] - 1
        out = x[idx, np.arange(idx.shape[1])]
    return out