Update categories in two Series / Columns for comparison - pandas

If I try to compare two Series with different categories I get an error:
a = pd.Categorical([1, 2, 3])
b = pd.Categorical([4, 5, 3])
# Build the frame column-wise: passing [a, b] as a list would treat each
# Categorical as a row (2 rows x 3 values), which cannot take two column labels.
df = pd.DataFrame({'a': a, 'b': b})
a b
0 1 4
1 2 5
2 3 3
df.a == df.b
# TypeError: Categoricals can only be compared if 'categories' are the same.
What is the best way to update categories in both Series? Thank you!
My solution:
df['b'] = df.b.cat.add_categories(df.a.cat.categories.difference(df.b.cat.categories))
df['a'] = df.a.cat.add_categories(df.b.cat.categories.difference(df.a.cat.categories))
df.a == df.b
Output:
0 False
1 False
2 True
dtype: bool

One idea with union_categoricals:
from pandas.api.types import union_categoricals
union = union_categoricals([df.a, df.b]).categories
df['a'] = df.a.cat.set_categories(union)
df['b'] = df.b.cat.set_categories(union)
print (df.a == df.b)
0 False
1 False
2 True
dtype: bool

Related

Pandas create new column base on groupby and apply lambda if statement

I have the issue with groupby and apply
df = pd.DataFrame({'A': ['a', 'a', 'a', 'b', 'b', 'b', 'b'], 'B': np.r_[1:8]})
I want to create a column C that, within each group, takes the value 1 if the absolute z-score of B is greater than 2 and 0 otherwise. The code:
from scipy import stats
df['C'] = df.groupby('A').apply(lambda x: 1 if np.abs(stats.zscore(x['B'], nan_policy='omit')) > 2 else 0, axis=1)
However, I am unsuccessful with code and cannot figure out the issue
Use GroupBy.transform with a lambda function, then compare against the threshold and convert the resulting True/False values to 1/0 by casting to integers:
from scipy import stats
s = df.groupby('A')['B'].transform(lambda x: np.abs(stats.zscore(x, nan_policy='omit')))
df['C'] = (s > 2).astype(int)
Or use numpy.where:
df['C'] = np.where(s > 2, 1, 0)
The error in your solution occurs when the function is applied per group:
from scipy import stats
df = df.groupby('A')['B'].apply(lambda x: 1 if np.abs(stats.zscore(x, nan_policy='omit')) > 2 else 0)
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
See the gotchas section of the pandas docs:
pandas follows the NumPy convention of raising an error when you try to convert something to a bool. This happens in an if-statement or when using the boolean operations: and, or, and not.
So use one of the solutions above instead of if-else:
from scipy import stats
df = df.groupby('A')['B'].apply(lambda x: (np.abs(stats.zscore(x, nan_policy='omit')) > 2).astype(int))
print (df)
A
a [0, 0, 0]
b [0, 0, 0, 0]
Name: B, dtype: object
But then the result still needs to be converted to a column; groupby.transform is used to avoid this problem.
You can use groupby + apply a function that finds the z-scores of each item in each group; explode the resulting list; use gt to create a boolean series and convert it to dtype int
df['C'] = df.groupby('A')['B'].apply(lambda x: stats.zscore(x, nan_policy='omit')).explode(ignore_index=True).abs().gt(2).astype(int)
Output:
A B C
0 a 1 0
1 a 2 0
2 a 3 0
3 b 4 0
4 b 5 0
5 b 6 0
6 b 7 0

How can I flatten the output dataframe of pandas crosstab from two series x and y into a series?

I have the following series x and y:
x = pd.Series(['a', 'b', 'a', 'c', 'c'], name='x')
y = pd.Series([1, 0, 1, 0, 0], name='y')
I call pd.crosstab to get the following dataframe as output:
pd.crosstab(x, y)
Output:
y 0 1
x
a 0 2
b 1 0
c 2 0
I want to transform this into a single series as follows:
x_a_y_0 0
x_a_y_1 2
x_b_y_0 1
x_b_y_1 0
x_c_y_0 2
x_c_y_1 0
For a specific dataframe like this one, I can construct this by visual inspection:
pd.Series(
dict(
x_a_y_0=0,
x_a_y_1=2,
x_b_y_0=1,
x_b_y_1=0,
x_c_y_0=2,
x_c_y_1=0
)
)
But given arbitrary series x and y, how do I generate the corresponding final output?
Use DataFrame.stack, then flatten the resulting MultiIndex with map:
s = pd.crosstab(x, y).stack()
s.index = s.index.map(lambda x: f'x_{x[0]}_y_{x[1]}')
print (s)
x_a_y_0 0
x_a_y_1 2
x_b_y_0 1
x_b_y_1 0
x_c_y_0 2
x_c_y_1 0
dtype: int64
It is also possible to use s.index.names instead of hard-coding the level names, thank you @SeaBean:
s.index = s.index.map(lambda x: f'{s.index.names[0]}_{x[0]}_{s.index.names[1]}_{x[1]}')

How to show rows with data which are not equal?

I have two tables
import pandas as pd
import numpy as np
df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
columns=['a', 'b', 'c'])
df1 = pd.DataFrame(np.array([[1, 2, 4], [4, 5, 6], [7, 8, 9]]),
columns=['a', 'b', 'c'])
print(df1.equals(df2))
I want to compare them. I want the same result as if I used the function df.compare(df1), or at least something close to it. I can't use the above function because my compiler states that 'DataFrame' object has no attribute 'compare'.
First approach:
Let's compare value by value:
In [1183]: eq_df = df1.eq(df2)
In [1196]: eq_df
Out[1200]:
a b c
0 True True False
1 True True True
2 True True True
Then let's reduce it down to see which rows are equal for all columns
from functools import reduce
In [1285]: eq_ser = reduce(np.logical_and, (eq_df[c] for c in eq_df.columns))
In [1288]: eq_ser
Out[1293]:
0 False
1 True
2 True
dtype: bool
Now we can print out the rows which are not equal
In [1310]: df1[~eq_ser]
Out[1315]:
a b c
0 1 2 4
In [1316]: df2[~eq_ser]
Out[1316]:
a b c
0 1 2 3
Second approach:
from collections import namedtuple
from typing import Tuple

# Result container; built once instead of on every call. Unpacks like a plain
# 3-tuple and also exposes the parts as .common / .left / .right.
_DfDiff = namedtuple("df_diff", ["common", "left", "right"])


def diff_dataframes(
    df1, df2, compare_cols=None
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Given two dataframes and column(s) to compare, return three dataframes with rows:
    - common between the two dataframes
    - found only in the left dataframe
    - found only in the right dataframe

    Parameters
    ----------
    df1, df2 : pd.DataFrame
        Left and right frames to compare.
    compare_cols : label or list of labels, optional
        Passed straight to ``DataFrame.merge(on=...)``. With ``None`` pandas
        merges on the columns the two frames have in common.

    Returns
    -------
    df_diff
        Named tuple ``(common, left, right)`` of dataframes.
    """
    # Missing values are replaced with pd.NA on both sides before merging --
    # presumably so that they are represented the same way in both frames.
    merged = df1.fillna(pd.NA).merge(
        df2.fillna(pd.NA), how="outer", on=compare_cols, indicator=True
    )

    def _rows_from(origin):
        # indicator=True adds a "_merge" column naming the origin of each row;
        # it is dropped again so callers only see the data columns.
        return merged.loc[merged["_merge"] == origin].drop(columns="_merge")

    return _DfDiff(
        _rows_from("both"), _rows_from("left_only"), _rows_from("right_only")
    )
Usage:
In [1366]: b, l, r = diff_dataframes(df1, df2)
In [1371]: l
Out[1371]:
a b c
0 1 2 4
In [1372]: r
Out[1372]:
a b c
3 1 2 3
Third approach:
In [1440]: eq_ser = df1.eq(df2).sum(axis=1).eq(len(df1.columns))

Pandas apply function on multiple columns

I am trying to apply a function to every column in a dataframe, when I try to do it on just a single fixed column name it works. I tried doing it on every column, but when I try passing the column name as an argument in the function I get an error.
How do you properly pass arguments to apply a function on a data frame?
def result(row,c):
if row[c] >=0 and row[c] <=1:
return 'c'
elif row[c] >1 and row[c] <=2:
return 'b'
else:
return 'a'
cols = list(df.columns.values)
for c in cols
df[c] = df.apply(result, args = (c), axis=1)
TypeError: ('result() takes exactly 2 arguments (21 given)', u'occurred at index 0')
Input data frame format:
d = {'c1': [1, 2, 1, 0], 'c2': [3, 0, 1, 2]}
df = pd.DataFrame(data=d)
df
c1 c2
0 1 3
1 2 0
2 1 1
3 0 2
You don't need to pass the column name to apply, as you only want to check whether the values of the columns are in a certain range and return a, b or c accordingly. You can make the following changes.
def result(val):
    """Classify a single value into one of three labels.

    Returns 'c' for 0 <= val <= 1, 'b' for 1 < val <= 2, and 'a' for
    everything else (including negative values).
    """
    if 0 <= val <= 1:
        return 'c'
    if 1 < val <= 2:
        return 'b'
    return 'a'
# Snapshot the column labels, then overwrite each column with its labels.
cols = list(df.columns)
for c in cols:
    # Series.apply calls result() once per element of the column.
    df[c] = df[c].apply(result)
Note that this will replace your column values.
A faster way is np.select:
import numpy as np

values = ['c', 'b']
for col in df.columns:
    # A chained comparison such as 0 <= s <= 1 calls bool() on a Series and
    # raises ValueError, so each range is built as an element-wise mask.
    conditions = [
        df[col].between(0, 1),           # 0 <= value <= 1 (inclusive on both ends)
        (df[col] > 1) & (df[col] <= 2),  # 1 < value <= 2
    ]
    # Entries matching neither condition receive the default label 'a'.
    df[col] = np.select(conditions, values, default='a')

Weighted mean pandas

Im calculating weighted mean for many columns using pandas. In some cases weight can sum to zero so i use np.ma.average:
import pandas as pd
import numpy as np
df = pd.DataFrame.from_dict(dict([('ID', [1, 1, 1]),('HeightA', [1, 2, 3]), ('WeightA', [0, 0, 0]),('HeightB', [2, 4, 6]), ('WeightB', [1, 2, 4])]))
>>df
ID HeightA WeightA HeightB WeightB
0 1 1 0 2 1
1 1 2 0 4 2
2 1 3 0 6 4
wmA = lambda x: np.ma.average(x, weights=df.loc[x.index, "WeightA"])
wmB = lambda x: np.ma.average(x, weights=df.loc[x.index, "WeightB"])
f = {'HeightA':wmA,'HeightB':wmB}
df2 = df.groupby(['ID'])['HeightA','HeightB'].agg(f)
This works but i have many columns of height and weights so i dont want to have to write a lambda function for each one so i try:
def givewm(data,weightcolumn):
return np.ma.average(data, weights=data.loc[data.index, weightcolumn])
f = {'HeightA':givewm(df,'WeightA'),'HeightB':givewm(df,'WeightB')}
df2 = df.groupby(['ID'])['HeightA','HeightB'].agg(f)
Which give error: builtins.TypeError: Axis must be specified when shapes of a and weights differ.
How can i write a function which returns weighted mean with weight column name as input?
Use double nested functions, solution from github:
df = (pd.DataFrame.from_dict(dict([('ID', [1, 1, 1]),
('HeightA', [1, 2, 3]),
('WeightA', [10, 20, 30]),
('HeightB', [2, 4, 6]),
('WeightB', [1, 2, 4])])))
print (df)
ID HeightA WeightA HeightB WeightB
0 1 1 10 2 1
1 1 2 20 4 2
2 1 3 30 6 4
def givewm(weightcolumn, frame=None):
    """Build an aggregation function computing a weighted mean.

    Parameters
    ----------
    weightcolumn : str
        Name of the column holding the weights.
    frame : pd.DataFrame, optional
        Frame from which the weights are read. When omitted, the
        module-level ``df`` is used, as in the original one-argument form.

    Returns
    -------
    callable
        Function taking a Series ``x`` (one group of one column) and returning
        ``np.ma.average`` of ``x`` weighted by the rows of ``weightcolumn``
        that share ``x``'s index.
    """
    def f1(x):
        # Resolve the frame lazily so the global ``df`` is only needed when
        # no explicit frame was supplied.
        source = df if frame is None else frame
        # x.index identifies the rows of the current group, so the weights
        # are aligned with the values being averaged.
        return np.ma.average(x, weights=source.loc[x.index, weightcolumn])
    return f1
f = {'HeightA':givewm('WeightA'),'HeightB':givewm('WeightB')}
df2 = df.groupby('ID').agg(f)
print (df2)
HeightA HeightB
ID
1 2.333333 4.857143
Verify solution:
# Reference implementation with one hand-written lambda per column, used to
# cross-check the result of the generic closure-based version.
wmA = lambda x: np.ma.average(x, weights=df.loc[x.index, "WeightA"])
wmB = lambda x: np.ma.average(x, weights=df.loc[x.index, "WeightB"])
f = {'HeightA': wmA, 'HeightB': wmB}
# Select the columns with a list: tuple-style selection on a groupby
# (['HeightA', 'HeightB'] without the inner brackets) is rejected by pandas 2.0+.
df2 = df.groupby(['ID'])[['HeightA', 'HeightB']].agg(f)
print (df2)
HeightA HeightB
ID
1 2.333333 4.857143