How to do the formulas without splitting the dataframe which had different conditions - pandas

I have the following dataframe
import pandas as pd
d = {
'ID':[1,2,3,4,5],
'Price1':[5,9,4,3,9],
'Price2':[9,10,13,14,18],
'Type':['A','A','B','C','D'],
}
df = pd.DataFrame(data = d)
df
For applying the formula without condition I use the following code
df = df.eval(
'Price = (Price1*Price1)/2'
)
df
How can I apply formulas that differ by condition without splitting the dataframe?
I need a new column called Price_on_type.
The formula differs for each type:
For type A the formula for Price_on_type = Price1+Price2
For type B the formula for Price_on_type = (Price1+Price2)/2
For type C the formula for Price_on_type = Price1
For type D the formula for Price_on_type = Price2
Expected Output:
import pandas as pd
d = {
'ID':[1,2,3,4,5],
'Price1':[5,9,4,3,9],
'Price2':[9,10,13,14,18],
'Price':[12.5,40.5, 8.0, 4.5, 40.5],
'Price_on_type':[14,19,8.0,3,18],
}
df = pd.DataFrame(data = d)
df

You can use numpy.select:
masks = [df['Type'] == 'A',
df['Type'] == 'B',
df['Type'] == 'C',
df['Type'] == 'D']
# One candidate result per mask, in the same order as `masks`.
# Types A and B combine both price columns (sum, and mean of the two),
# which is what the printed output below (14.0, 19.0, 8.5) reflects.
vals = [df.eval('Price1 + Price2'),
        df.eval('(Price1 + Price2) / 2'),
        df['Price1'],
        df['Price2']]
Or:
vals = [df['Price1'] + df['Price2'],
(df['Price1'] + df['Price2']) / 2,
df['Price1'],
df['Price2']]
df['Price_on_type'] = np.select(masks, vals)
print (df)
ID Price1 Price2 Type Price_on_type
0 1 5 9 A 14.0
1 2 9 10 A 19.0
2 3 4 13 B 8.5
3 4 3 14 C 3.0
4 5 9 18 D 18.0

If your data is not too big, you can use apply with a custom function on axis=1:
def Prices(x):
    """Return the type-specific price for a single row.

    Parameters
    ----------
    x : row-like object exposing ``Price1``, ``Price2`` and ``Type``
        attributes (e.g. the Series passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    The value of the formula registered for ``x.Type``.

    Raises
    ------
    KeyError
        If ``x.Type`` is not one of 'A', 'B', 'C' or 'D'.
    """
    # Each formula is wrapped in a callable so that only the one matching
    # the row's type is actually evaluated.
    dict_sw = {
        'A': lambda: x.Price1 + x.Price2,
        'B': lambda: (x.Price1 + x.Price2) / 2,
        'C': lambda: x.Price1,
        'D': lambda: x.Price2,
    }
    return dict_sw[x.Type]()
In [239]: df['Price_on_type'] = df.apply(Prices, axis=1)
In [240]: df
Out[240]:
ID Price1 Price2 Type Price_on_type
0 1 5 9 A 14.0
1 2 9 10 A 19.0
2 3 4 13 B 8.5
3 4 3 14 C 3.0
4 5 9 18 D 18.0
Or use the trick that, in arithmetic, True is treated as 1 and False as 0:
df['Price_on_type'] = \
(df.Type == 'A') * (df.Price1 + df.Price2) + \
(df.Type == 'B') * (df.Price1 + df.Price2)/2 + \
(df.Type == 'C') * df.Price1 + \
(df.Type == 'D') * df.Price2
Out[308]:
ID Price1 Price2 Type Price_on_type
0 1 5 9 A 14.0
1 2 9 10 A 19.0
2 3 4 13 B 8.5
3 4 3 14 C 3.0
4 5 9 18 D 18.0

Related

How can I aggregate strings from many cells into one cell?

Say I have two classes with a handful of students each, and I want to think of the possible pairings in each class. In my original data, I have one line per student.
What's the easiest way in Pandas to turn this dataset
Class Students
0 1 A
1 1 B
2 1 C
3 1 D
4 1 E
5 2 F
6 2 G
7 2 H
Into this new stuff?
Class Students
0 1 A,B
1 1 A,C
2 1 A,D
3 1 A,E
4 1 B,C
5 1 B,D
6 1 B,E
7 1 C,D
8 1 C,E
9 1 D,E
10 2 F,G
11 2 F,H
12 2 G,H
Try This:
import itertools
import pandas as pd
cla = [1, 1, 1, 1, 1, 2, 2, 2]
s = ["A", "B", "C", "D" , "E", "F", "G", "H"]
df = pd.DataFrame(cla, columns=["Class"])
df['Student'] = s
def create_combos(list_students):
    """Return every unordered pair of students as a "first,second" string.

    Pairs are produced by ``itertools.combinations``, so they follow the
    order of ``list_students`` and no student is paired with itself.
    Fewer than two students yields an empty list.
    """
    return [
        str(first) + "," + str(second)
        for first, second in itertools.combinations(list_students, 2)
    ]
def iterate_df(class_id):
    """Build the student pairings for one class of the module-level ``df``.

    Returns a tuple ``(list_id, list_combos)`` of equal length:
    ``list_combos`` holds the "first,second" pair strings produced by
    ``create_combos`` and ``list_id`` repeats ``class_id`` once per pair.
    """
    # Select the students of this class in a single .loc call.
    list_student = list(df.loc[df['Class'] == class_id, 'Student'])
    list_combos = create_combos(list_student)
    list_id = [class_id] * len(list_combos)
    return list_id, list_combos
# unique() keeps the classes in order of first appearance; iterating a set
# gives no ordering guarantee for the resulting rows.
list_classes = df['Class'].unique()
new_id = []
new_combos = []
for idx in list_classes:
    tmp_id, tmp_combo = iterate_df(idx)
    new_id.extend(tmp_id)
    new_combos.extend(tmp_combo)
# One output row per pairing: the class id next to the "first,second" string.
new_df = pd.DataFrame(new_id, columns=["Class"])
new_df["Student"] = new_combos
print(new_df)

Apply function on a two dataframe rows

Given a pandas dataframe like this:
df = pd.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]})
col1 col2
0 1 4
1 2 5
2 3 6
I would like to do something equivalent to this using a function but without passing "by value" or as a global variable the whole dataframe (it could be huge and then it would give me a memory error):
# Add to each row (from the second onwards) the already-updated previous row,
# for the first two columns only. `i` holds the previous row's label; None
# marks "no previous row yet", which also works for non-numeric labels.
first_col, second_col = df.columns[0], df.columns[1]
i = None
for index, row in df.iterrows():
    if i is None:
        i = index
        continue
    c1 = df.loc[i, first_col] + df.loc[index, first_col]
    c2 = df.loc[i, second_col] + df.loc[index, second_col]
    # .ix no longer exists in pandas; .loc with explicit labels replaces it.
    df.loc[index, first_col] = c1
    df.loc[index, second_col] = c2
    i = index
col1 col2
0 1 4
1 3 9
2 6 15
i.e., I would like to have a function which will give me the previous output:
def my_function(two_rows):
    """Add the first row of a pair onto the second and return the second.

    ``two_rows`` is indexed with 0 and 1 to obtain the two rows, and each
    row is indexed with 0 and 1 to obtain its two values. The second row
    is modified in place and is also the return value.
    """
    row1 = two_rows[0]
    row2 = two_rows[1]
    # Compute both sums before writing, so neither write affects the other.
    c1 = row1[0] + row2[0]
    c2 = row1[1] + row2[1]
    row2[0] = c1
    row2[1] = c2
    return row2
df.apply(my_function, axis=1)
df
col1 col2
0 1 4
1 3 9
2 6 15
Is there a way of doing this?
What you've demonstrated is a cumsum
df.cumsum()
col1 col2
0 1 4
1 3 9
2 6 15
def f(df):
    """Replace every column of ``df`` with its running total, cell by cell.

    The frame is modified in place and returned for convenience. Each cell
    (from the second row on) gets the value of the cell above added to it,
    so no extra copy of the data is made.
    """
    n_rows, n_cols = df.shape
    for col in range(n_cols):
        for row in range(1, n_rows):
            # Write through df.iat: assigning into `df[col].values` is not
            # guaranteed to reach the frame (and fails under Copy-on-Write).
            df.iat[row, col] += df.iat[row - 1, col]
    return df
f(df)
To define a function as a loop that does this in place
Slow cell by cell
def f(df):
    """Replace every column of ``df`` with its running total, cell by cell.

    The frame is modified in place and returned for convenience. Each cell
    (from the second row on) gets the value of the cell above added to it,
    so no extra copy of the data is made.
    """
    n_rows, n_cols = df.shape
    for col in range(n_cols):
        for row in range(1, n_rows):
            # Write through df.iat: assigning into `df[col].values` is not
            # guaranteed to reach the frame (and fails under Copy-on-Write).
            df.iat[row, col] += df.iat[row - 1, col]
    return df
f(df)
col1 col2
0 1 4
1 3 9
2 6 15
Compromise between memory and efficiency
def f(df):
    """Replace every column of ``df`` with its cumulative sum, one column at a time.

    The frame is modified in place and returned for convenience. Only one
    column's worth of temporary data exists at any moment.
    """
    for j in df.columns:
        # Assign the column back rather than writing into `df[j].values`,
        # which is a read-only array under Copy-on-Write.
        df[j] = df[j].to_numpy().cumsum()
    return df
# Call once only: f works in place, so a second call would accumulate again.
f(df)
col1 col2
0 1 4
1 3 9
2 6 15
Note that you don't need to return df. I chose to for convenience.

How to build column by column dataframe pandas

I have a dataframe looking like this example
A | B | C
__|___|___
s s nan
nan x x
I would like to create a table of intersections between columns like this
| A | B | C
__|______|____|______
A | True |True| False
__|______|____|______
B | True |True|True
__|______|____|______
C | False|True|True
__|______|____|______
Is there an elegant cycle-free way to do it?
Thank you!
Setup
df = pd.DataFrame(dict(A=['s', np.nan], B=['s', 'x'], C=[np.nan, 'x']))
Option 1
You can use numpy broadcasting to evaluate each column by each other column. Then determine if any of the comparisons are True
v = df.values
pd.DataFrame(
(v[:, :, None] == v[:, None]).any(0),
df.columns, df.columns
)
A B C
A True True False
B True True True
C False True True
By replacing any with sum you can get a count of how many intersections.
v = df.values
pd.DataFrame(
(v[:, :, None] == v[:, None]).sum(0),
df.columns, df.columns
)
A B C
A 1 1 0
B 1 2 1
C 0 1 1
Or use np.count_nonzero instead of sum
v = df.values
pd.DataFrame(
np.count_nonzero(v[:, :, None] == v[:, None], 0),
df.columns, df.columns
)
A B C
A 1 1 0
B 1 2 1
C 0 1 1
Option 2
Fun & Creative way
# Integer dummies keep unstack(fill_value=0) and the dot product numeric.
d = pd.get_dummies(df.stack(), dtype=int).unstack(fill_value=0)
d = d.T.dot(d)
# Collapse the (value, column) pairs down to column x column counts.
# Grouping the transposed frame replaces the deprecated groupby(axis=1).
d.groupby(level=1).sum().T.groupby(level=1).sum().T
A B C
A 1 1 0
B 1 2 1
C 0 1 1

Replace columns values by other values conditionally in Pandas

I would like to remove the description in a data frame if the value is identical to the caption:
m[m.Description == m.Caption].Description = \
m[m.Description == m.Caption].Description.map(lambda x: '')
I feel this writing is quite boilerplate:
df[condition][columns] = df[condition][columns].map(lambda x: value)
Is there a better syntax to do the same? I imagine something like:
df[condition][columns].map(lambda x: value, inplace=True)
You need loc with boolean indexing:
m.loc[m.Description == m.Caption, 'Description'] = ' '
Sample:
m = pd.DataFrame({'Description':['a','b','f'],
'Caption':['a','c',''],
'C':[7,8,9]})
print (m)
C Caption Description
0 7 a a
1 8 c b
2 9 f
m.loc[m.Description == m.Caption, 'Description'] = ' '
print (m)
C Caption Description
0 7 a
1 8 c b
2 9 f
Alternatively use mask:
m['Description'] = m['Description'].mask(m.Description == m.Caption, ' ')
print (m)
C Caption Description
0 7 a
1 8 c b
2 9 f

How can I increment a level in Pandas MultiIndex?

How can I increment all values in a specific level of a pandas multiindex?
You can create new MultiIndex.from_tuples and assign:
df = pd.DataFrame({'A':[1,2,3],
'B':[4,5,6],
'C':[7,8,9],
'D':[1,3,5],
'E':[5,3,6],
'F':[7,4,3]})
df = df.set_index(['A','B'])
print (df)
C D E F
A B
1 4 7 1 5 7
2 5 8 3 3 4
3 6 9 5 6 3
#change multiindex
new_index = list(zip(df.index.get_level_values('A'), df.index.get_level_values('B') + 1))
df.index = pd.MultiIndex.from_tuples(new_index, names = df.index.names)
print (df)
C D E F
A B
1 5 7 1 5 7
2 6 8 3 3 4
3 7 9 5 6 3
Another possible solution with reset_index and set_index:
df = df.reset_index()
df.B = df.B + 1
df = df.set_index(['A','B'])
print (df)
C D E F
A B
1 5 7 1 5 7
2 6 8 3 3 4
3 7 9 5 6 3
Solution with DataFrame.assign:
print (df.reset_index().assign(B=lambda x: x.B+1).set_index(['A','B']))
Timings:
In [26]: %timeit (reset_set(df1))
1 loop, best of 3: 144 ms per loop
In [27]: %timeit (assign_method(df3))
10 loops, best of 3: 161 ms per loop
In [28]: %timeit (jul(df2))
1 loop, best of 3: 543 ms per loop
In [29]: %timeit (tuples_method(df))
1 loop, best of 3: 581 ms per loop
Code for timings:
np.random.seed(100)
N = 1000000
df = pd.DataFrame(np.random.randint(10, size=(N,5)), columns=list('ABCDE'))
print (df)
df = df.set_index(['A','B'])
print (df)
df1 = df.copy()
df2 = df.copy()
df3 = df.copy()
def reset_set(df):
    """Return a copy of ``df`` whose index level 'B' is incremented by one.

    The ('A', 'B') MultiIndex is flattened into columns, 'B' is bumped, and
    the index is rebuilt. The caller's frame is left untouched.
    """
    flat = df.reset_index()
    # Item assignment is unambiguous; attribute assignment is not.
    flat['B'] = flat['B'] + 1
    return flat.set_index(['A', 'B'])
def assign_method(df):
    """Return a copy of ``df`` whose index level 'B' is incremented by one.

    Same result as the reset/set approach, written as a single method chain
    with ``DataFrame.assign``. The caller's frame is left untouched.
    """
    return (
        df.reset_index()
        .assign(B=lambda frame: frame.B + 1)
        .set_index(['A', 'B'])
    )
def tuples_method(df):
    """Increment index level 'B' by rebuilding the MultiIndex from tuples.

    Note that this assigns to ``df.index``, so the caller's frame is
    modified; the same frame is returned.
    """
    level_a = df.index.get_level_values('A')
    level_b = df.index.get_level_values('B') + 1
    new_index = list(zip(level_a, level_b))
    df.index = pd.MultiIndex.from_tuples(new_index, names=df.index.names)
    return df
def jul(df):
    """Increment the second level of a two-level MultiIndex via a comprehension.

    Note that this assigns to ``df.index``, so the caller's frame is
    modified; the same frame is returned.
    """
    bumped = [(first, second + 1) for first, second in df.index]
    df.index = pd.MultiIndex.from_tuples(bumped, names=df.index.names)
    return df
Thank you Jeff for another solution:
# set_levels returns a new MultiIndex (it has no `inplace` option), so assign it back.
df.index = df.index.set_levels(df.index.levels[1] + 1, level=1)
print (df)
C D E F
A B
1 5 7 1 5 7
2 6 8 3 3 4
3 7 9 5 6 3
Here's a slightly different way:
df.index = pd.MultiIndex.from_tuples([(x[0], x[1]+1) for x in df.index], names=df.index.names)
1000 loops, best of 3: 840 µs per loop
For comparison:
new_index = list(zip(df.index.get_level_values('A'),
df.index.get_level_values('B') + 1))
df.index = pd.MultiIndex.from_tuples(new_index, names = df.index.names)
1000 loops, best of 3: 984 µs per loop
The reset_index method is 10 times slower.
It can be as simple as
# set_levels returns a new MultiIndex (it has no `inplace` option), so assign it back.
df.index = df.index.set_levels(df.index.levels[0] + 1, level=0)
demo
# Four rows indexed by the product of [1, 2] and [3, 4].
df = pd.DataFrame(
    dict(A=[2, 3, 4, 5]),
    pd.MultiIndex.from_product([[1, 2], [3, 4]])
)
df
# Shift every value of the first index level up by one. set_levels returns
# a new MultiIndex (it has no `inplace` option), so assign it back.
df.index = df.index.set_levels(df.index.levels[0] + 1, level=0)
df