Using 'not is in' in PySpark and getting an empty dataframe back - dataframe

I'm trying to use filter to find those 'title' that are not in list_A.
A = B.groupBy("title").count()
A = A.filter(A['count'] > 1)
A_df = A.toPandas()
list_A = A_df['title'].values.tolist()
B.filter(~B.title.isin(list_A)).count()
However, I get an empty dataframe back (count is 0)
It works well when I use 'is in':
Why this happened and how can I solve this?
I tried:
B=B.na.drop(subset=["title"])
B.filter(~B.title.isin(list_A)).count()
print(B.filter(~B.title.isin(list_A) | B.title.isNull()).count())
It still returns 0.

It may be because other "title" values are null.
B = spark.createDataFrame([('x',), ('x',), (None,)], ['title'])
A = B.groupBy("title").count()
A = A.filter(A['count'] > 1)
A_df = A.toPandas()
list_A = A_df['title'].values.tolist()
print(B.filter(~B.title.isin(list_A)).count())
# 0
print(B.filter(B.title.isin(list_A)).count())
# 2
If you really need list_A, you shouldn't go to Pandas for it.
You can either use collect
A = B.groupBy("title").count().filter(F.col('count') > 1)
list_A = [x.title for x in A.collect()]
print(list_A)
# ['x', None]
or collect_set
list_A = (B
.groupBy("title").count()
.groupBy((F.col('count') > 1).alias('_c')).agg(
F.collect_set('title').alias('_t')
).filter('_c')
.head()[1]
)
print(list_A)
# ['x']
Finally, to translate your current query to PySpark, you should use window functions.
Input:
from pyspark.sql import functions as F, Window as W
B = spark.createDataFrame(
[('x', 'Example'),
('x', 'Example'),
('x', 'not_example'),
('y', 'not_example'),
(None, 'not_example'),
(None, 'Example')],
['title', 'journal'])
Your current script:
A = B.groupBy("title").count()
A = A.filter(A['count'] > 1)
A_df = A.toPandas()
list_A = A_df['title'].values.tolist()
B.filter(((B.title.isin(list_A))&(B.journal!="Example"))|(~B.title.isin(list_A)))
Suggestion:
B_filtered = (B
.withColumn('A_cnt', F.count('title').over(W.partitionBy('title')))
.filter("(A_cnt > 1 and journal != 'Example') or A_cnt <= 1")
.drop('A_cnt')
)
B_filtered.show()
# +-----+-----------+
# |title| journal|
# +-----+-----------+
# | null|not_example|
# | null| Example|
# | x|not_example|
# | y|not_example|
# +-----+-----------+

Related

return pandas dataframe from function

I want to return a dataframe from this function, which can be used elsewhere (for plotly graph to be exact).
My idea is to use the dataframe I can create with points_sum(), save it as the team name, and then use that dataframe in my px.line(dataframe = team_name).
In essence, I want to use the men_points_df variable after I created it.
def points_sum(team):
points = 0
men_points = []
for index, row in menscore_df.iterrows():
if row['hometeam'] == team:
if row['homegoals'] > row['awaygoals']:
points += 2
elif row['homegoals'] == row['awaygoals']:
points += 1
elif row['homegoals'] < row['awaygoals']:
points == points
date = str(row['date'])
men_points.append([date, points])
if row['awayteam'] == team:
if row['homegoals'] < row['awaygoals']:
points += 2
elif row['homegoals'] == row['awaygoals']:
points += 1
elif row['homegoals'] > row['awaygoals']:
points == points
date = str(row['date'])
men_points.append([date, points])
men_points_df = pd.DataFrame(men_points, columns = ["Date", 'Points'])
return men_points_df
In plotly, I am trying to use my new dataframe (men_points_df), like below, but I get the error undefined name, even though I can print it (for example: test = points_sum("FIF") (FIF is one of the team names) and it shows the correct dataframe in the console (when I type test):
elif pathname == "/page-3":
return [html.H1('Seasonal performance',
style={'textAlign':'center'}),
html.Div(
children=[
html.H2('Select team',style={'textAlign':'center'}),
html.Br(),
html.Br(),
dcc.Dropdown(
id='team_dd',
options=[{'label': v, 'value': k} for k,v in teams_all.items()],
)]),
dcc.Graph(id="performance_graph")
]
Output(component_id="performance_graph", component_property="figure"),
Input(component_id="team_dd", component_property="value")
def update_graph(option_selected):
title = "none selected"
if option_selected:
title = option_selected
line_fig = px.line(
test, # <------------ THIS IS THE ISSUE
title = f"{title}",
x = "Date", y = "Points")
return line_fig
Just call points_sum in the update_graph function, before you use test:
def update_graph(option_selected):
title = "none selected"
if option_selected:
title = option_selected
# vvv Here vvv
test = points_sum("FIF")
line_fig = px.line(
test, #THIS IS THE ISSUE
title = f"{title}",
x = "Date", y = "Points")
return line_fig

Recursively update the dataframe

I have a dataframe called datafe from which I want to combine the hyphenated words.
for example input dataframe looks like this:
,author_ex
0,Marios
1,Christodoulou
2,Intro-
3,duction
4,Simone
5,Speziale
6,Exper-
7,iment
And the output dataframe should be like:
,author_ex
0,Marios
1,Christodoulou
2,Introduction
3,Simone
4,Speziale
5,Experiment
I have written a sample code to achieve this but I am not able to get out of the recursion safely.
def rm_actual(datafe, index):
stem1 = datafe.iloc[index]['author_ex']
stem2 = datafe.iloc[index + 1]['author_ex']
fixed_token = stem1[:-1] + stem2
datafe.drop(index=index + 1, inplace=True, axis=0)
newdf=datafe.reset_index(drop=True)
newdf.iloc[index]['author_ex'] = fixed_token
return newdf
def remove_hyphens(datafe):
for index, row in datafe.iterrows():
flag = False
token=row['author_ex']
if token[-1:] == '-':
datafe=rm_actual(datafe, index)
flag=True
break
if flag==True:
datafe=remove_hyphens(datafe)
if flag==False:
return datafe
datafe=remove_hyphens(datafe)
print(datafe)
Is there any possibilities I can get out of this recursion with expected output?
Another option:
Given/Input:
author_ex
0 Marios
1 Christodoulou
2 Intro-
3 duction
4 Simone
5 Speziale
6 Exper-
7 iment
Code:
import pandas as pd
# read/open file or create dataframe
df = pd.DataFrame({'author_ex':['Marios', 'Christodoulou', 'Intro-', \
'duction', 'Simone', 'Speziale', 'Exper-', 'iment']})
# check input format
print(df)
# create new column 'Ending' for True/False if column 'author_ex' ends with '-'
df['Ending'] = df['author_ex'].shift(1).str.contains('-$', na=False, regex=True)
# remove the trailing '-' from the 'author_ex' column
df['author_ex'] = df['author_ex'].str.replace('-$', '', regex=True)
# create new column with values of 'author_ex' and shifted 'author_ex' concatenated together
df['author_ex_combined'] = df['author_ex'] + df.shift(-1)['author_ex']
# create a series true/false but shifted up
index = (df['Ending'] == True).shift(-1)
# set the last row to 'False' after it was shifted
index.iloc[-1] = False
# replace 'author_ex' with 'author_ex_combined' based on true/false of index series
df.loc[index,'author_ex'] = df['author_ex_combined']
# remove rows that have the 2nd part of the 'author_ex' string and are no longer required
df = df[~df.Ending]
# remove the extra columns
df.drop(['Ending', 'author_ex_combined'], axis = 1, inplace=True)
# output final dataframe
print('\n\n')
print(df)
# notice index 3 and 6 are missing
Outputs:
author_ex
0 Marios
1 Christodoulou
2 Introduction
4 Simone
5 Speziale
6 Experiment

CODING Q based on Dataframes and Series and dictionaries

It would be interesting to see if there is any evidence of a link between vaccine effectiveness and sex of the child. Calculate the ratio of the number of children who contracted chickenpox but were vaccinated against it (at least one varicella dose) versus those who were vaccinated but did not contract chicken pox. Return results by sex.
This function should return a dictionary in the form of (use the correct numbers):
{"male":0.2,
"female":0.4}
Note: To aid in verification, the chickenpox_by_sex()['female'] value the autograder is looking for starts with the digits 0.0077.
PLEASE WRITE A FUNCTIONING CODE FOR THE SAME.
Try the following code:
Read the given dataset using the following code
import pandas as pd
df=pd.read_csv('assets/NISPUF17.csv',index_col=0)
df
Main code
def chickenpox_by_sex():
# YOUR CODE HERE
male_df=df[df['SEX']==1]
vac_m=male_df[male_df['P_NUMVRC']>=1]
cp_m=vac_m[vac_m['HAD_CPOX']==1]
counts_cp_m=cp_m['SEX'].count()
ncp_m=vac_m[vac_m['HAD_CPOX']==2]
counts_ncp_m=ncp_m['SEX'].count()
male=counts_cp_m/counts_ncp_m
female_df=df[df['SEX']==2]
vac_f=female_df[female_df['P_NUMVRC']>=1]
cp_f=vac_f[vac_f['HAD_CPOX']==1]
counts_cp_f=cp_f['SEX'].count()
ncp_f=vac_f[vac_f['HAD_CPOX']==2]
counts_ncp_f=ncp_f['SEX'].count()
female=counts_cp_f/counts_ncp_f
ratio_dict={"male":male,"female":female}
return ratio_dict
raise NotImplementedError()
Check using the following code
chickenpox_by_sex()['female']
Final code to complete this
assert len(chickenpox_by_sex())==2, "Return a dictionary with two items, the first for males and the second for females."
=> [SEX] -> sex=1 (male); sex=2 (female)
=> [HAD_COP] -> contracted chicken pox = 1; not contracted chicken pox = 2
=> [P_NUMVRC]>=1 -> given one or more doses
*ratio(male) = (vaccinated and contracted chicken pox)/(vaccinated and not contracted chicken pox)
*ratio(female) = (vaccinated and contracted chicken pox)/(vaccinated and not contracted chicken pox)
Variable names:
male - male data frame
vac_m - vaccinated male
cp_m - vaccinated and contracted chickenpox (male)
counts_cp_m - counts of vaccinated and contracted chickenpox
ncp_m - vaccinated and not contracted chickenpox (male)
counts_ncp_m - vaccinated and not contracted chickenpox
Similarly for females.
CORRECT SOLUTION.
def chickenpox_by_sex():
import pandas as pd
df = pd.read_csv("NISPUF17.csv")
maleDf = df[df["SEX"] ==1]
doses1 = maleDf[maleDf["P_NUMVRC"] >= 1]
chichkenPox1_1 = doses1[doses1["HAD_CPOX"] == 1]
count1_1 = chichkenPox1_1["SEX"].count()
chichkenPox1_2 = doses1[doses1["HAD_CPOX"] == 2]
count1_2 = chichkenPox1_2["SEX"].count()
resultMale = count1_1/count1_2
femaleDf = df[df["SEX"] == 2]
doses2 = femaleDf[femaleDf["P_NUMVRC"] >= 1]
chichkenPox2_1 = doses2[doses2["HAD_CPOX"] == 1]
count2_1 = chichkenPox2_1["SEX"].count()
chichkenPox2_2 = doses2[doses2["HAD_CPOX"] == 2]
count2_2 = chichkenPox2_2["SEX"].count()
resultFemale = count2_1/count2_2
dict = {"male":resultMale,
"female":resultFemale
}
return dict
The following code works as well:
import pandas as pd
import numpy as np
import math
def chickenpox_by_sex():
df=pd.read_csv('assets/NISPUF17.csv')
c_vaccinated=df[df['P_NUMVRC']>0]
menstats=c_vaccinated[c_vaccinated['SEX']==1]
mnocpox=len(menstats[menstats['HAD_CPOX']==2])
menratio=len(menstats[menstats['HAD_CPOX']==1])/mnocpox
wstats=c_vaccinated[c_vaccinated['SEX']==2]
wnocpox=len(wstats[wstats['HAD_CPOX']==2])
wratio=len(wstats[wstats['HAD_CPOX']==1])/wnocpox
ratios={'male':menratio,'female':wratio}
return ratios
chickenpox_by_sex()
import pandas as pd
def chickenpox_by_sex():
df = pd.read_csv('assets/NISPUF17.csv')
df = df.drop(df[df.HAD_CPOX == 77].index)
df = df.drop(df[df.HAD_CPOX == 99].index)
df = df.dropna(subset=['P_NUMVRC'])
df.loc[df['HAD_CPOX'] == 1, 'HAD_CPOX'] = 'YES'
df.loc[df['HAD_CPOX'] == 2, 'HAD_CPOX'] = 'NO'
df.loc[df['SEX'] == 1, 'SEX'] = 'male'
df.loc[df['SEX'] == 2, 'SEX'] = 'female'
df.loc[df['P_NUMVRC'] == 2.0, 'P_NUMVRC'] = 1
df.loc[df['P_NUMVRC'] == 3.0, 'P_NUMVRC'] = 1
df = df[['SEX', 'P_NUMVRC', 'HAD_CPOX']].round(decimals=0)
dfm = df[df['SEX'] == 'male']
dfmVac = dfm[dfm['P_NUMVRC'] == 1.0]
mPoxVacYes = len(dfmVac[dfmVac['HAD_CPOX'] == 'YES'])
mPoxVacNo = len(dfmVac[dfmVac['HAD_CPOX'] == 'NO'])
dff = df[df['SEX'] == 'female']
dffVac = dff[dff['P_NUMVRC'] == 1.0]
fPoxVacYes = len(dffVac[dffVac['HAD_CPOX'] == 'YES'])
fPoxVacNo = len(dffVac[dffVac['HAD_CPOX'] == 'NO'])
ratioM = mPoxVacYes/float(mPoxVacNo)
ratioF = fPoxVacYes/float(fPoxVacNo)
result = {'male': ratioM * 100, 'female': ratioF * 100}
return result
import pandas as pd
import numpy as np
df = pd.read_csv('assets/NISPUF17.csv', usecols = ['HAD_CPOX', 'SEX', 'P_NUMVRC']).dropna().reset_index()
def chickenpox_by_sex():
girls = df[df.SEX == 2]
girls_had = girls[(girls.HAD_CPOX == 1) & (girls.P_NUMVRC > 0.0)]
girls_not_had = girls[(girls.HAD_CPOX == 2) &(girls.P_NUMVRC > 0.0)]
girls_ratio = len(girls_had)/len(girls_not_had)
boys = df[df.SEX == 1]
boys_had = boys[(boys.HAD_CPOX == 1) & (boys.P_NUMVRC > 0.0)]
boys_not_had = boys[(boys.HAD_CPOX == 2) &(boys.P_NUMVRC > 0.0)]
boys_ratio = len(boys_had)/len(boys_not_had)
result = {"male": round(boys_ratio, ndigits=4),
"female":round(girls_ratio, ndigits = 4)}
return result
chickenpox_by_sex()

I am having a problem with a foor loop that includes dataframes

I have a dataframe with 8 columnds. If two of those columns satisfy a condition, I have to fill two columns with the product of other two. And after running the algorithm it is not working.
I have tryed to use series, I have tryed to use import warnings
warnings.filterwarnings("ignore") but it is not working
for i in seq:
if dataframefinal['trade'][i] == 1 and dataframefinal['z'][i] > 0:
dataframefinal['CloseAdj2'][i]= dataframefinal['Close2'][i] *
dataframefinal['trancosshort'][i]
dataframefinal['CloseAdj1'][i]= dataframefinal['Close1'][i] *
dataframefinal['trancostlong'][i]
elif dataframefinal['trade'][i] == 1 and dataframefinal['z'][i] < 0:
dataframefinal['CloseAdj2'][i]= dataframefinal['Close1'][i] *
dataframefinal['trancosshort'][i]
dataframefinal['CloseAdj1'][i]= dataframefinal['Close2'][i] *
dataframefinal['trancostlong'][i]
else:
dataframefinal['CloseAdj1'][i]= dataframefinal['Close1'][i]
dataframefinal['CloseAdj2'][i]= dataframefinal['Close2'][i]
You can use vectorized condition function numpy.select() to do this quickly:
import pandas as pd
from numpy.random import randn, randint
n = 10
df_data = pd.DataFrame(dict(trade=randint(0, 2, n),
z=randn(n),
Close1=randn(n),
Close2=randn(n),
trancosshort=randn(n),
trancostlong=randn(n)))
df_data["CloseAdj1"] = 0
df_data["CloseAdj2"] = 0
seq = [1, 3, 5, 7, 9]
df = df_data.loc[seq]
cond1 = df.eval("trade==1 and z > 0")
cond2 = df.eval("trade==2 and z < 0")
df["CloseAdj2"] = np.select([cond1, cond2],
[df.eval("Close2 * trancosshort"),
df.eval("Close1 * trancosshort")], df.Close2)
df["CloseAdj1"] = np.select([cond1, cond2],
[df.eval("Close1 * trancostlong"),
df.eval("Close2 * trancostlong")], df.Close1)
df_data.loc[seq, ["CloseAdj1", "CloseAdj2"]] = df[["CloseAdj1", "CloseAdj2"]]

Time Difference between Time Period and Instant

I have some time periods (df_A) and some time instants (df_B):
import pandas as pd
import numpy as np
import datetime as dt
from datetime import timedelta
# Data
df_A = pd.DataFrame({'A1': [dt.datetime(2017,1,5,9,8), dt.datetime(2017,1,5,9,9), dt.datetime(2017,1,7,9,19), dt.datetime(2017,1,7,9,19), dt.datetime(2017,1,7,9,19), dt.datetime(2017,2,7,9,19), dt.datetime(2017,2,7,9,19)],
'A2': [dt.datetime(2017,1,5,9,9), dt.datetime(2017,1,5,9,12), dt.datetime(2017,1,7,9,26), dt.datetime(2017,1,7,9,20), dt.datetime(2017,1,7,9,21), dt.datetime(2017,2,7,9,23), dt.datetime(2017,2,7,9,25)]})
df_B = pd.DataFrame({ 'B': [dt.datetime(2017,1,6,14,45), dt.datetime(2017,1,4,3,31), dt.datetime(2017,1,7,3,31), dt.datetime(2017,1,7,14,57), dt.datetime(2017,1,9,14,57)]})
I can match these together:
# Define an Extra Margin
M = dt.timedelta(days = 10)
df_A["A1X"] = df_A["A1"] + M
df_A["A2X"] = df_A["A2"] - M
# Match
Bv = df_B .B .values
A1 = df_A .A1X.values
A2 = df_A .A2X.values
i, j = np.where((Bv[:, None] >= A1) & (Bv[:, None] <= A2))
df_C = pd.DataFrame(np.column_stack([df_B .values[i], df_A .values[j]]),
columns = df_B .columns .append (df_A.columns))
I would like to find the time difference between each time period and the time instant matched to it. I mean that
if B is between A1 and A2
then dT = 0
I've tried doing it like this:
# Calculate dt
def time(A1,A2,B):
if df_C["B"] < df_C["A1"]:
return df_C["A1"].subtract(df_C["B"])
elif df_C["B"] > df_C["A2"]:
return df_C["B"].subtract(df_C["A2"])
else:
return 0
df_C['dt'] = df_C.apply(time)
I'm getting "ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series"
So, I found two fixes:
You are adding M to the lower value and subtracting from the higher one. Change it to:
df_A['A1X'] = df_A['A1'] - M
df_A['A2X'] = df_A['A2'] + M
You are only passing one row of your dataframe at a time to your time function, so it should be something like:
def time(row):
if row['B'] < row['A1']:
return row['A1'] - row['B']
elif row['B'] > row['A2']:
return row['B'] - row['A2']
else:
return 0
And then you can call it like this:
df_C['dt'] = df_C.apply(time, axis=1) :)