How can I increase the efficiency of my code (for loop)? - pandas

def assignGroup(row):
    """Translate a row's "E114" answer into its numeric group code.

    Returns 1 for "Very good", 2 for "Fairly good", 3 for "Bad" and
    4 for "Very bad"; any other answer yields ``np.nan``.
    """
    answer = row["E114"]
    for label, code in (
        ("Very good", 1),
        ("Fairly good", 2),
        ("Bad", 3),
        ("Very bad", 4),
    ):
        if answer == label:
            return code
    # No known label matched.
    return np.nan
# Call assignGroup once per row (axis=1) and store the results in the "leader" column.
outcome["leader"]=outcome.apply(assignGroup,axis=1)

# Lookup table from survey answer to numeric group. Series.map leaves
# answers that are not keys of the table as NaN.
leader_codes = {"Very good": 1, "Fairly good": 2, "Bad": 3, "Very bad": 4}
outcome["leader"] = outcome["E114"].map(leader_codes)

Use numpy's where:
import numpy as np
# Overwrite "leader" one answer at a time. Rows matching none of the labels
# keep whatever value the column already holds, so "leader" must exist
# before this runs.
outcome["leader"] = np.where(outcome["E114"] == "Very good", 1, outcome["leader"])
# Label fixed: the original compared against "Fairly goo", which never matched.
outcome["leader"] = np.where(outcome["E114"] == "Fairly good", 2, outcome["leader"])
outcome["leader"] = np.where(outcome["E114"] == "Bad", 3, outcome["leader"])
outcome["leader"] = np.where(outcome["E114"] == "Very bad", 4, outcome["leader"])
With pandas, explicit Python loops (including row-wise apply) should be a last resort.

Related

CODING Q based on Dataframes and Series and dictionaries

It would be interesting to see if there is any evidence of a link between vaccine effectiveness and sex of the child. Calculate the ratio of the number of children who contracted chickenpox but were vaccinated against it (at least one varicella dose) versus those who were vaccinated but did not contract chicken pox. Return results by sex.
This function should return a dictionary in the form of (use the correct numbers):
{"male":0.2,
"female":0.4}
Note: To aid in verification, the chickenpox_by_sex()['female'] value the autograder is looking for starts with the digits 0.0077.
PLEASE WRITE A FUNCTIONING CODE FOR THE SAME.
Try the following code:
Read the given dataset using the following code
import pandas as pd
# Load the survey file once at module level; the first CSV column becomes the index.
df=pd.read_csv('assets/NISPUF17.csv',index_col=0)
# Bare expression: displays the frame in a notebook cell, has no effect in a script.
df
Main code
def chickenpox_by_sex():
    """Return the chickenpox ratio among vaccinated children, split by sex.

    Reads the module-level ``df``. A child counts as vaccinated when
    ``P_NUMVRC >= 1``. For each sex the ratio is

        (vaccinated, HAD_CPOX == 1) / (vaccinated, HAD_CPOX == 2)

    Returns:
        dict with keys "male" (SEX == 1) and "female" (SEX == 2).
    """
    vaccinated = df[df['P_NUMVRC'] >= 1]
    ratio_dict = {}
    # One pass per sex instead of two copies of the same filtering code.
    for label, sex_code in (("male", 1), ("female", 2)):
        cohort = vaccinated[vaccinated['SEX'] == sex_code]
        contracted = (cohort['HAD_CPOX'] == 1).sum()
        not_contracted = (cohort['HAD_CPOX'] == 2).sum()
        ratio_dict[label] = contracted / not_contracted
    return ratio_dict
Check using the following code
chickenpox_by_sex()['female']
Final code to complete this
assert len(chickenpox_by_sex())==2, "Return a dictionary with two items, the first for males and the second for females."
=> [SEX] -> sex=1 (male); sex=2 (female)
=> [HAD_CPOX] -> contracted chicken pox = 1; not contracted chicken pox = 2
=> [P_NUMVRC]>=1 -> given one or more doses
*ratio(male) = (vaccinated and contracted chicken pox)/(vaccinated and not contracted chicken pox)
*ratio(female) = (vaccinated and contracted chicken pox)/(vaccinated and not contracted chicken pox)
Variable names:
male_df - male data frame
vac_m - vaccinated male
cp_m - vaccinated and contracted chickenpox (male)
counts_cp_m - counts of vaccinated and contracted chickenpox
ncp_m - vaccinated and not contracted chickenpox (male)
counts_ncp_m - vaccinated and not contracted chickenpox
Similarly for females.
CORRECT SOLUTION.
def chickenpox_by_sex():
    """Return the chickenpox ratio among vaccinated children, split by sex.

    Loads "NISPUF17.csv" from the current directory. A child counts as
    vaccinated when ``P_NUMVRC >= 1``. For each sex the ratio is

        (vaccinated, HAD_CPOX == 1) / (vaccinated, HAD_CPOX == 2)

    Returns:
        dict with keys "male" (SEX == 1) and "female" (SEX == 2).
    """
    import pandas as pd

    df = pd.read_csv("NISPUF17.csv")
    vaccinated = df[df["P_NUMVRC"] >= 1]
    # Named `ratios` rather than `dict` so the builtin is not shadowed.
    ratios = {}
    for label, sex_code in (("male", 1), ("female", 2)):
        cohort = vaccinated[vaccinated["SEX"] == sex_code]
        contracted = (cohort["HAD_CPOX"] == 1).sum()
        not_contracted = (cohort["HAD_CPOX"] == 2).sum()
        ratios[label] = contracted / not_contracted
    return ratios
The following code works as well:
import pandas as pd
import numpy as np
import math
def chickenpox_by_sex():
    """Compute, per sex, cases / non-cases of chickenpox among vaccinated children.

    Reads 'assets/NISPUF17.csv'; vaccinated means ``P_NUMVRC > 0``. Returns a
    dict with keys 'male' (SEX == 1) and 'female' (SEX == 2).
    """
    survey = pd.read_csv('assets/NISPUF17.csv')
    vaccinated = survey[survey['P_NUMVRC'] > 0]
    ratios = {}
    for label, sex_code in (('male', 1), ('female', 2)):
        cohort = vaccinated[vaccinated['SEX'] == sex_code]
        spared = len(cohort[cohort['HAD_CPOX'] == 2])
        infected = len(cohort[cohort['HAD_CPOX'] == 1])
        ratios[label] = infected / spared
    return ratios
chickenpox_by_sex()
import pandas as pd
def chickenpox_by_sex():
    """Return the chickenpox ratio among vaccinated children, split by sex.

    Reads 'assets/NISPUF17.csv'. A child counts as vaccinated when
    ``P_NUMVRC >= 1``; rows with a missing dose count never satisfy that
    comparison. Only HAD_CPOX == 1 (yes) and HAD_CPOX == 2 (no) are counted,
    so other codes such as 77 and 99 are ignored without dropping rows.

    Returns:
        dict with keys 'male' (SEX == 1) and 'female' (SEX == 2), each the
        plain ratio yes / no. The values are not scaled to percentages,
        matching the required output format (female starts with 0.0077).

    Raises:
        ZeroDivisionError: if a sex has no vaccinated child with HAD_CPOX == 2.
    """
    df = pd.read_csv('assets/NISPUF17.csv')
    # ">= 1" covers every dose count; the earlier recoding of only 2 and 3
    # silently discarded children with more doses.
    vaccinated = df[df['P_NUMVRC'] >= 1]
    result = {}
    for label, sex_code in (('male', 1), ('female', 2)):
        cohort = vaccinated[vaccinated['SEX'] == sex_code]
        pox_yes = len(cohort[cohort['HAD_CPOX'] == 1])
        pox_no = len(cohort[cohort['HAD_CPOX'] == 2])
        result[label] = pox_yes / float(pox_no)
    return result
import pandas as pd
import numpy as np
df = pd.read_csv('assets/NISPUF17.csv', usecols = ['HAD_CPOX', 'SEX', 'P_NUMVRC']).dropna().reset_index()
def chickenpox_by_sex():
    """Ratio of chickenpox cases to non-cases among vaccinated children, by sex.

    Uses the module-level ``df``. Vaccinated means ``P_NUMVRC > 0.0``; each
    ratio is rounded to four decimal places. Keys are "male" (SEX == 1) and
    "female" (SEX == 2).
    """
    def rounded_ratio(sex_code):
        # Cases over non-cases for one sex, restricted to vaccinated children.
        cohort = df[df.SEX == sex_code]
        vaccinated = cohort.P_NUMVRC > 0.0
        infected = cohort[(cohort.HAD_CPOX == 1) & vaccinated]
        spared = cohort[(cohort.HAD_CPOX == 2) & vaccinated]
        return round(len(infected) / len(spared), ndigits=4)

    female_ratio = rounded_ratio(2)
    male_ratio = rounded_ratio(1)
    return {"male": male_ratio, "female": female_ratio}
chickenpox_by_sex()

Update categories in two Series / Columns for comparison

If I try to compare two Series with different categories I get an error:
a = pd.Categorical([1, 2, 3])
b = pd.Categorical([4, 5, 3])
# Build the frame column-wise. Passing [a, b] as a list would treat each
# array as a row (2 rows x 3 columns) and fail against the two column names.
df = pd.DataFrame({'a': a, 'b': b})
a b
0 1 4
1 2 5
2 3 3
df.a == df.b
# TypeError: Categoricals can only be compared if 'categories' are the same.
What is the best way to update categories in both Series? Thank you!
My solution:
# Give column b every category that only a has ...
only_in_a = df.a.cat.categories.difference(df.b.cat.categories)
df['b'] = df.b.cat.add_categories(only_in_a)
# ... then give column a every category it is still missing from b.
only_in_b = df.b.cat.categories.difference(df.a.cat.categories)
df['a'] = df.a.cat.add_categories(only_in_b)
df.a == df.b
Output:
0 False
1 False
2 True
dtype: bool
One idea with union_categoricals:
from pandas.api.types import union_categoricals
# Combined category set of both columns, applied to each so they compare cleanly.
union = union_categoricals([df.a, df.b]).categories
for column in ('a', 'b'):
    df[column] = df[column].cat.set_categories(union)
print (df.a == df.b)
0 False
1 False
2 True
dtype: bool

Count how many times numbers repeat in list number by numbers

Consider the first number, say m. See how many times this number is repeated consecutively. If it is repeated k times in a row, it gives rise to two entries in the output list: first
the number k, then the number m. (This is similar to how we say “four 2s” when we see
[2,2,2,2].) Then we move on to the next number after this run of m. Repeat the process
until every number in the list is considered
Example:The process is perhaps best understood by looking at a few examples:
• readAloud([]) should return []
• readAloud([1,1,1]) should return [3,1]
• readAloud([-1,2,7]) should return [1,-1,1,2,1,7]
• readAloud([3,3,8,-10,-10,-10]) should return [2,3,1,8,3,-10]
• readAloud([3,3,1,1,3,1,1]) should return [2,3,2,1,1,3,2,1]
I have the following code:
def readAloud(lst: List[int]) -> List[int]:
    """Run-length "read aloud" encoding of a list.

    Every run of k consecutive equal values m contributes the pair k, m to
    the output, in order of appearance.

    Args:
        lst: the numbers to read; may be empty.

    Returns:
        A flat list [k1, m1, k2, m2, ...]; [] for empty input.
    """
    answer: List[int] = []
    for value in lst:
        # answer[-1] is the value of the current run, answer[-2] its length.
        if answer and answer[-1] == value:
            answer[-2] += 1
        else:
            answer.extend((1, value))
    return answer
Grouping adjacent elements is exactly what itertools.groupby is for.
from itertools import chain, groupby
def read_aloud(numbers):
    """Return [count, value, count, value, ...] for each run of equal neighbours."""
    spoken = []
    for value, run in groupby(numbers):
        # A groupby run is a one-shot iterator, so materialise it to measure it.
        spoken.append(len(list(run)))
        spoken.append(value)
    return spoken
Examples:
>>> read_aloud([])
[]
>>> read_aloud([1, 1, 1])
[3, 1]
>>> read_aloud([3, 3, 8, -10, -10, -10])
[2, 3, 1, 8, 3, -10]
>>> read_aloud([3, 3, 1, 1, 3, 1, 1])
[2, 3, 2, 1, 1, 3, 2, 1]
Here is a solution (but it is not the only one :) )
def readAloud(lst):
    """Run-length "read aloud" encoding of a sequence.

    Every run of k consecutive equal values m contributes the pair k, m to
    the output, in order of appearance.

    Args:
        lst: an indexable sequence; may be empty.

    Returns:
        A flat list [k1, m1, k2, m2, ...]; [] for empty input.
    """
    if not lst:
        # Nothing to read; also avoids indexing lst[0] on an empty sequence.
        return []
    answer = []
    count = 1
    prev_elt = lst[0]
    for m in lst[1:]:
        if prev_elt == m:
            count += 1
        else:
            answer.extend([count, prev_elt])
            prev_elt = m
            count = 1
    # Flush the final run explicitly instead of appending a None sentinel,
    # which would merge with a trailing run of None values.
    answer.extend([count, prev_elt])
    return answer
print(readAloud([3,3,1,1,3,1,1]))

pandas: how to assign a single column conditionally on multiple other columns?

I'm confused about conditional assignment in Pandas.
I have this dataframe:
# Same four sample subscriptions, written column by column.
df = pd.DataFrame({
    'stripe_subscription_id': [1, 2, None, None],
    'status': ['past_due', 'active', 'active', 'active'],
})
I'm trying to add a new column, conditionally based on the others:
def get_cancellation_type(row):
    """Classify one subscription row for use with ``df.apply(..., axis=1)``.

    Args:
        row: object exposing ``stripe_subscription_id`` and ``status``.

    Returns:
        'cancelled_by_user' when there is no subscription id (None, NaN or
        another falsy value); otherwise 'failed_to_pay' for status
        'past_due', 'cancelled_by_us' for status 'active', and None for any
        other status.
    """
    subscription_id = row.stripe_subscription_id
    # A missing id in a numeric column is NaN, and NaN is truthy, so a plain
    # truthiness test would treat those rows as having a subscription.
    has_subscription = pd.notna(subscription_id) and bool(subscription_id)
    if not has_subscription:
        return 'cancelled_by_user'
    if row.status == 'past_due':
        return 'failed_to_pay'
    if row.status == 'active':
        return 'cancelled_by_us'
    return None
df['cancellation_type'] = df.apply(get_cancellation_type, axis=1)
This is fairly readable, but is it the standard way to do things?
I've been looking at pd.assign, and am not sure if I should be using that instead.
This should work, you can change or add the conditions however you want.
# NaN never compares equal to anything, itself included, so "== np.nan" is
# always False and "!= np.nan" always True. Test for missing ids explicitly.
has_subscription = df['stripe_subscription_id'].notna()
df.loc[has_subscription & (df['status'] == 'past_due'), 'cancellation_type'] = 'failed_to_pay'
df.loc[has_subscription & (df['status'] == 'active'), 'cancellation_type'] = 'cancelled_by_us'
df.loc[~has_subscription, 'cancellation_type'] = 'cancelled_by_user'
You might consider using np.select
import pandas as pd
import numpy as np
# Conditions are checked in order and the first match wins. A missing
# subscription id means the user cancelled; otherwise the status decides.
has_subscription = df["stripe_subscription_id"].notna()
condList = [~has_subscription,
            has_subscription & (df["status"] == "past_due"),
            has_subscription & (df["status"] == "active")]
choiceList = ["cancelled_by_user", "failed_to_pay", "cancelled_by_us"]
# default=None leaves rows matching no condition empty, rather than using
# the integer default 0, which does not mix with string choices.
df['cancellation_type'] = np.select(condList, choiceList, default=None)

Converting recursive solution to dynamic programming

Problem statement: find the number of "vowel only" strings that can be made from a given sequence of Morse code (the entire string must be used)
I have this current recursive solution. I want to speed up this algorithm to run in O(n) time. I know that I can define my array as S[j] = the maximum number of unique strings that can be created with access from 1 ... j. But I don't know where to go from there.
# Morse code for each vowel.
morsedict = {'A': '.-',
             'E': '.',
             'I': '..',
             'O': '---',
             'U': '..-'}
# Running total of complete decodings found by countCombinations; callers
# reset it to 0 before counting a new code string.
maxcombinations = 0


def countCombinations(codelist):
    """Count the ways ``codelist`` splits into vowel Morse codes.

    Recursively strips a 1-, 2- or 3-symbol vowel code off the front and
    adds 1 to the global ``maxcombinations`` each time the whole string has
    been consumed. The function itself returns None.

    Args:
        codelist: string of '.' and '-' symbols still to be decoded.
    """
    global maxcombinations
    if not codelist:
        # Everything was consumed: one complete decoding found.
        maxcombinations += 1
        return
    codes = morsedict.values()
    for width in (1, 2, 3):
        if len(codelist) >= width and codelist[:width] in codes:
            countCombinations(codelist[width:])
    return
For future researchers here is the solution for conversion to a DP problem:
# Morse code for each vowel.
morsedict = {'A': '.-',
             'E': '.',
             'I': '..',
             'O': '---',
             'U': '..-'}


def countcombinations(codelist):
    """Count the vowel-only strings that spell ``codelist`` exactly.

    Bottom-up dynamic programme: ``ways[j]`` is the number of ways to decode
    the first j symbols. A prefix of length j extends a prefix of length
    j - w whenever the w symbols in between form a vowel code.

    Args:
        codelist: string of '.' and '-' symbols; may be empty.

    Returns:
        The number of decodings of the whole string. The value is also
        printed, as before.
    """
    codes = set(morsedict.values())
    longest = max(len(code) for code in codes)
    # One extra slot so ways[0] can stand for the empty prefix, which has
    # exactly one (empty) decoding; no negative indices are ever needed.
    ways = [0] * (len(codelist) + 1)
    ways[0] = 1
    for end in range(1, len(codelist) + 1):
        for width in range(1, min(longest, end) + 1):
            if codelist[end - width:end] in codes:
                ways[end] += ways[end - width]
    print(ways[-1])
    return ways[-1]


if __name__ == "__main__":
    # The first input line is read and discarded.
    # NOTE(review): presumably it holds the length of the code; confirm.
    input()
    codelist = input()
    countcombinations(codelist)