Hi, I generated the table above using Counter from collections to count the combinations of 3 values from a dataframe: Jessica, Mike, and Dog. I got the combinations and their counts.
Any help to make that table a bit prettier? I would like to rename the index entries as grp1, grp2, etc., and give the column a name other than 0.
Also, what would be the best plot for comparing the different groups?
Thanks for your help!
I used this code to produce the table:
import numpy as np
import pandas as pd
from collections import Counter

df = np.random.choice(["Mike", "Jessica", "Dog"], size=(20, 3))
Z = pd.DataFrame(df, columns=['a', 'b', 'c'])
LL = Z.apply(Counter, axis="columns").value_counts()
H = pd.DataFrame(LL)
print(H)
Quite an unusual technique... You can convert the dict-valued index to a MultiIndex; then plot(kind="barh") gives labels that make sense.
import numpy as np
import pandas as pd
from collections import Counter

df = np.random.choice(["Mike", "Jessica", "Dog"], size=(20, 3))
Z = pd.DataFrame(df, columns=['a', 'b', 'c'])
LL = Z.apply(Counter, axis="columns").value_counts()
H = pd.DataFrame(LL)

# expand the Counter dicts in the index into one column per name
I = pd.Series(H.index).apply(pd.Series)
H = H.set_index(pd.MultiIndex.from_arrays(I.T.values, names=I.columns))
H.plot(kind="barh")
H after setting the MultiIndex:
                     0
Mike  Dog  Jessica
2.0   1.0  NaN       5
      NaN  1.0       4
NaN   1.0  2.0       3
1.0   NaN  2.0       3
      1.0  1.0       2
NaN   NaN  3.0       1
      2.0  1.0       1
3.0   NaN  NaN       1
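If you still want the flat grp1, grp2, ... labels and a column name other than 0 (as asked in the question), a minimal sketch working on the H built above could be the following; note that replacing the MultiIndex trades the informative name labels for generic group names:
H.columns = ["count"]                             # name the single count column
H.index = [f"grp{i + 1}" for i in range(len(H))]  # flat grp1, grp2, ... labels
H.plot(kind="barh")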
Instead of using Counter, you can apply value_counts directly to each row:
import pandas as pd
from matplotlib import pyplot as plt
# Hard Coded For Reproducibility
df = pd.DataFrame({'a': {0: 'Dog', 1: 'Jessica', 2: 'Mike',
3: 'Dog', 4: 'Dog', 5: 'Dog',
6: 'Jessica', 7: 'Jessica',
8: 'Dog', 9: 'Dog', 10: 'Jessica',
11: 'Mike', 12: 'Dog',
13: 'Jessica', 14: 'Mike',
15: 'Mike',
16: 'Mike', 17: 'Dog',
18: 'Jessica', 19: 'Mike'},
'b': {0: 'Mike', 1: 'Mike', 2: 'Jessica',
3: 'Jessica', 4: 'Dog', 5: 'Jessica',
6: 'Mike', 7: 'Dog', 8: 'Mike',
9: 'Dog', 10: 'Dog', 11: 'Dog',
12: 'Dog', 13: 'Jessica',
14: 'Jessica', 15: 'Dog',
16: 'Dog', 17: 'Dog', 18: 'Jessica', 19: 'Jessica'},
'c': {0: 'Mike', 1: 'Dog', 2: 'Jessica',
3: 'Dog', 4: 'Dog', 5: 'Dog', 6: 'Dog',
7: 'Jessica', 8: 'Mike', 9: 'Dog',
10: 'Dog', 11: 'Mike', 12: 'Jessica',
13: 'Jessica', 14: 'Jessica',
15: 'Jessica', 16: 'Jessica',
17: 'Dog', 18: 'Mike', 19: 'Dog'}})
# Apply value_counts across each row
df = df.apply(pd.value_counts, axis=1) \
.fillna(0)
# Group By All Columns and
# Get Duplicate Count From Group Size
df = pd.DataFrame(df
.groupby(df.columns.values.tolist())
.size()
.sort_values())
# Plot
plt.figure()
df.plot(kind="barh")
plt.tight_layout()
plt.show()
df after groupby, size, and sort:
                   0
Dog  Jessica  Mike
0.0  3.0      0.0   1
1.0  2.0      0.0   1
0.0  2.0      1.0   3
1.0  0.0      2.0   3
3.0  0.0      0.0   3
2.0  1.0      0.0   4
1.0  1.0      1.0   5
The shape of the original dataset is 82580×30, with multiple string columns. Example dataset:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
df = pd.DataFrame({'Nationality': {0: 'DEU', 1: 'PRT', 2: 'PRT', 3: 'PRT', 4: 'FRA', 5: 'DEU', 6: 'CHE', 7: 'DEU', 8: 'GBR', 9: 'AUT', 10: 'PRT', 11: 'FRA', 12: 'OTR', 13: 'GBR', 14: 'ESP', 15: 'PRT', 16: 'OTR', 17: 'PRT', 18: 'ESP', 19: 'AUT'},
'Age': {0: 27.0, 1: 45.46, 2: 45.46, 3: 58.0, 4: 57.0, 5: 27.0, 6: 49.0, 7: 62.0, 8: 44.0, 9: 61.0, 10: 54.0, 11: 53.0, 12: 50.0, 13: 30.0, 14: 51.0, 15: 45.46, 16: 40.0, 17: 49.0, 18: 49.0, 19: 14.0},
'DaysSinceCreation': {0: 370, 1: 213, 2: 206, 3: 1018, 4: 835, 5: 52, 6: 597, 7: 217, 8: 999, 9: 1004, 10: 402, 11: 879, 12: 393, 13: 923, 14: 249, 15: 52, 16: 159, 17: 929, 18: 49, 19: 131},
'BookingsCheckedIn': {0: 1, 1: 0, 2: 0, 3: 1, 4: 1, 5: 1, 6: 1, 7: 2, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 0, 16: 0, 17: 1, 18: 1, 19: 0}})
# Encoding Variables
transformer = make_column_transformer((OneHotEncoder(sparse=False), ['Nationality']), remainder='passthrough')
transformed = transformer.fit_transform(df)
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
# Concat the two tables
transformed_df.reset_index(drop=True, inplace=True)
df.reset_index(drop=True, inplace=True)
df = pd.concat([transformed_df, df], axis=1)
# Remove old columns
df.drop(['Nationality'], axis = 1, inplace = True)
print('The shape after encoding: {}'.format(df.shape))
print(df.columns.unique())
The shape after encoding: (20, 14)
Index(['onehotencoder__Nationality_AUT', 'onehotencoder__Nationality_CHE',
'onehotencoder__Nationality_DEU', 'onehotencoder__Nationality_ESP',
'onehotencoder__Nationality_FRA', 'onehotencoder__Nationality_GBR',
'onehotencoder__Nationality_OTR', 'onehotencoder__Nationality_PRT',
'remainder__Age', 'remainder__DaysSinceCreation',
'remainder__BookingsCheckedIn', 'Age', 'DaysSinceCreation',
'BookingsCheckedIn'],
dtype='object')
After modeling, trying to test on a completely different test set:
df = pd.DataFrame({'Nationality': {0: 'CAN', 1: 'DEU', 2: 'PRT', 3: 'PRT', 4: 'FRA'},
'Age': {0: 27.0, 1: 29.0, 2: 24.0, 3: 24.0, 4: 46.0},
'DaysSinceCreation': {0: 222, 1: 988, 2: 212, 3: 685, 4: 1052},
'BookingsCheckedIn': {0: 0, 1: 1, 2: 1, 3: 1, 4: 0}})
# Encoding Variables
transformer = make_column_transformer(
(OneHotEncoder(sparse=False), ['Nationality']),
remainder='passthrough')
transformed = transformer.fit_transform(df)
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
# Concat the two tables
transformed_df.reset_index(drop=True, inplace=True)
df.reset_index(drop=True, inplace=True)
df = pd.concat([transformed_df, df], axis=1)
# Remove old columns
df.drop(['Nationality'], axis = 1, inplace = True)
print('The shape after encoding: {}'.format(df.shape))
print(df.columns.unique())
The shape after encoding: (5, 10)
Index(['onehotencoder__Nationality_CAN', 'onehotencoder__Nationality_DEU',
'onehotencoder__Nationality_FRA', 'onehotencoder__Nationality_PRT',
'remainder__Age', 'remainder__DaysSinceCreation',
'remainder__BookingsCheckedIn', 'Age', 'DaysSinceCreation',
'BookingsCheckedIn'],
dtype='object')
As can be seen, the test dataset has some features that were not present in the original training set, and many features of the training set are not present in the test set. If I only use the .values of X_train, y_train, X_test, y_test, I can run everything from logistic regression to a neural net with >99% accuracy, but that feels like cheating and it does not work with decision trees. How do we deal with this?
I would like to contribute 2 inputs:
(1) The categories in the test set should be a subset of those seen in training, so the unknown Nationality 'CAN' is not allowed. Either include the new 'CAN' in the training data, or replace it with 'GBR' in the test data (see the sketch at the end of this answer).
(2) You should not call fit_transform() separately on the training set and the test set. The right way is to fit on the training set, then transform the training set and transform the test set. To illustrate:
# Encoding Variables
transformer = make_column_transformer((OneHotEncoder(sparse=False), ['Nationality']), remainder='passthrough')
####transformed = transformer.fit_transform(df) #delete this
transformer.fit(df) #use this instead
transformed = transformer.transform(df) #use this instead
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
# Concat the two tables
<truncated>
print('The shape after encoding: {}'.format(df.shape))
The shape after encoding: (20, 14)
In the second part, note that I have replaced 'CAN' with 'GBR', and only the previously fitted transformer is used to transform the test set:
df = pd.DataFrame({'Nationality': {0: 'GBR', 1: 'DEU', 2: 'PRT', 3: 'PRT', 4: 'FRA'},
'Age': {0: 27.0, 1: 29.0, 2: 24.0, 3: 24.0, 4: 46.0},
'DaysSinceCreation': {0: 222, 1: 988, 2: 212, 3: 685, 4: 1052},
'BookingsCheckedIn': {0: 0, 1: 1, 2: 1, 3: 1, 4: 0}})
# Encoding Variables
####transformer = make_column_transformer((OneHotEncoder(sparse=False), ['Nationality']), remainder='passthrough') #do not repeat, use the previous fitted model
####transformed = transformer.fit_transform(df) #delete this, NO fitting on test set
transformed = transformer.transform(df) #only do transform on test set
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
# Concat the two tables
<truncated>
print('The shape after encoding: {}'.format(df.shape))
The shape after encoding: (5, 14)
So the number of columns (14) is the same for both the training set and the test set.
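If you prefer to do the replacement from point (1) in code rather than by editing the dictionary by hand, a minimal sketch could look like this (the frame and the substitute value 'GBR' are just the ones used above, not a general rule):
import pandas as pd

# Hypothetical test frame containing a category ('CAN') the encoder never saw
test_df = pd.DataFrame({'Nationality': ['CAN', 'DEU', 'PRT', 'PRT', 'FRA']})

# Map the unseen category to a known one before calling transformer.transform()
test_df['Nationality'] = test_df['Nationality'].replace({'CAN': 'GBR'})
print(test_df['Nationality'].tolist())  # ['GBR', 'DEU', 'PRT', 'PRT', 'FRA']
Alternatively, OneHotEncoder(handle_unknown='ignore') encodes unseen categories as all zeros at transform time, if silently dropping that information is acceptable for your model.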
Here is my pandas dataframe:
df = pd.DataFrame({
    'Date': {0: '2016-10-11', 1: '2016-10-11', 2: '2016-10-11', 3: '2016-10-11', 4: '2016-10-11',
             5: '2016-10-12', 6: '2016-10-12', 7: '2016-10-12', 8: '2016-10-12', 9: '2016-10-12'},
    'Stock': {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H', 8: 'I', 9: 'J'},
    'Sector': {0: 0, 1: 0, 2: 1, 3: 1, 4: 1, 5: 0, 6: 0, 7: 0, 8: 1, 9: 1},
    'Segment': {0: 0, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 2, 7: 2, 8: 3, 9: 3},
    'Range': {0: 5, 1: 0, 2: 1, 3: 0, 4: 2, 5: 6, 6: 0, 7: 23, 8: 5, 9: 5}})
I want to add the following columns:
'Date_Range_Avg': average of 'Range' grouped by Date
'Date_Sector_Range_Avg': average of 'Range' grouped by Date and Sector
'Date_Segment_Range_Avg': average of 'Range' grouped by Date and Segment
This would be the output:
res = pd.DataFrame({
    'Date': {0: '2016-10-11', 1: '2016-10-11', 2: '2016-10-11', 3: '2016-10-11', 4: '2016-10-11',
             5: '2016-10-12', 6: '2016-10-12', 7: '2016-10-12', 8: '2016-10-12', 9: '2016-10-12'},
    'Stock': {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H', 8: 'I', 9: 'J'},
    'Sector': {0: 0, 1: 0, 2: 1, 3: 1, 4: 1, 5: 0, 6: 0, 7: 0, 8: 1, 9: 1},
    'Segment': {0: 0, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 2, 7: 2, 8: 3, 9: 3},
    'Range': {0: 5, 1: 0, 2: 1, 3: 0, 4: 2, 5: 6, 6: 0, 7: 23, 8: 5, 9: 5},
    'Date_Range_Avg': {0: 1.6, 1: 1.6, 2: 1.6, 3: 1.6, 4: 1.6, 5: 7.8, 6: 7.8, 7: 7.8, 8: 7.8, 9: 7.8},
    'Date_Sector_Range_Avg': {0: 2.5, 1: 2.5, 2: 1, 3: 1, 4: 1, 5: 9.67, 6: 9.67, 7: 9.67, 8: 9.67, 9: 9.67},
    'Date_Segment_Range_Avg': {0: 5, 1: 0.75, 2: 0.75, 3: 0.75, 4: 0.75, 5: 6, 6: 11.5, 7: 11.5, 8: 5, 9: 5}})
Note that I have rounded some of the values, but the rounding is not essential to the question (feel free not to round).
I'm aware that I can do each of these groupings separately, but it strikes me as inefficient (my dataset contains millions of rows).
Essentially, I would like to first do a grouping by Date and then re-use it to do the two more fine-grained groupings by Date and Segment and by Date and Sector.
How to do this?
My initial hunch is to go like this:
day_groups = df.groupby("Date")
df['Date_Range_Avg'] = day_groups['Range'].transform('mean')
and then to re-use day_groups to do the 2 more fine-grained groupbys like this:
df['Date_Sector_Range_Avg'] = day_groups.groupby('Segment')['Range'].transform('mean')
This doesn't work, as you get:
AttributeError: 'DataFrameGroupBy' object has no attribute 'groupby'
groupby runs really fast when the aggregate function is vectorized. If you are worried about performance, try it out first to see whether it is really the bottleneck in your program (a rough timing sketch is given at the end of this answer).
You can create temporary data frames holding the result of each groupby, then successively merge them with df:
group_bys = {
"Date_Range_Avg": ["Date"],
"Date_Sector_Range_Avg": ["Date", "Sector"],
"Date_Segment_Range_Avg": ["Date", "Segment"]
}
tmp = [
df.groupby(columns)["Range"].mean().to_frame(key)
for key, columns in group_bys.items()
]
result = df
for t in tmp:
result = result.merge(t, left_on=t.index.names, right_index=True)
Result:
         Date Stock  Sector  Segment  Range  Date_Range_Avg  Date_Sector_Range_Avg  Date_Segment_Range_Avg
0  2016-10-11     A       0        0      5             1.6               2.500000                    5.00
1  2016-10-11     B       0        1      0             1.6               2.500000                    0.75
2  2016-10-11     C       1        1      1             1.6               1.000000                    0.75
3  2016-10-11     D       1        1      0             1.6               1.000000                    0.75
4  2016-10-11     E       1        1      2             1.6               1.000000                    0.75
5  2016-10-12     F       0        1      6             7.8               9.666667                    6.00
6  2016-10-12     G       0        2      0             7.8               9.666667                   11.50
7  2016-10-12     H       0        2     23             7.8               9.666667                   11.50
8  2016-10-12     I       1        3      5             7.8               5.000000                    5.00
9  2016-10-12     J       1        3      5             7.8               5.000000                    5.00
Another option is to use transform, and avoid the multiple merges:
# reusing your code
group_bys = {
"Date_Range_Avg": ["Date"],
"Date_Sector_Range_Avg": ["Date", "Sector"],
"Date_Segment_Range_Avg": ["Date", "Segment"]
}
tmp = {key: df.groupby(columns)["Range"].transform('mean')
       for key, columns in group_bys.items()}
df.assign(**tmp)  # assign returns a new DataFrame with the three new columns
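As a rough check of the performance remark at the top of this answer, a timing sketch on synthetic data could look like this; the sizes and column contents are arbitrary assumptions, not the asker's real data:
import time
import numpy as np
import pandas as pd

# Synthetic frame in the same shape as the question (sizes are assumptions)
n = 2_000_000
rng = np.random.default_rng(0)
big = pd.DataFrame({
    "Date": rng.integers(0, 500, n),
    "Sector": rng.integers(0, 10, n),
    "Segment": rng.integers(0, 20, n),
    "Range": rng.random(n),
})

start = time.perf_counter()
big["Date_Range_Avg"] = big.groupby("Date")["Range"].transform("mean")
big["Date_Sector_Range_Avg"] = big.groupby(["Date", "Sector"])["Range"].transform("mean")
big["Date_Segment_Range_Avg"] = big.groupby(["Date", "Segment"])["Range"].transform("mean")
print(f"three grouped transforms took {time.perf_counter() - start:.2f}s")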
I have time series data on the prices of items in different periods:
import pandas as pd
d = {'ItemID': {0: '1',
1: '1',
2: '1',
3: '1',
4: '1',
5: '1',
6: '1',
7: '1',
8: '1',
9: '1',
10: '1',
11: '2',
12: '2',
13: '2',
14: '2',
15: '2',
16: '2',
17: '2',
18: '2',
19: '2',
20: '2',
21: '2'},
'Period': {0: '1',
1: '1',
2: '1',
3: '1',
4: '1',
5: '1',
6: '2',
7: '2',
8: '2',
9: '2',
10: '2',
11: '1',
12: '1',
13: '1',
14: '1',
15: '1',
16: '2',
17: '2',
18: '2',
19: '2',
20: '2',
21: '2'},
'Price': {0: 1,
1: 2,
2: 1,
3: 2,
4: 2,
5: 3,
6: 6,
7: 6,
8: 7,
9: 7,
10: 8,
11: 50,
12: 49,
13: 50,
14: 49,
15: 48,
16: 61,
17: 62,
18: 63,
19: 64,
20: 64,
21: 65}}
df = pd.DataFrame(d)
I would like to compute the following statistics about the price changes per item and period:
number of streaks
max streak length
avg streak length
A streak is, essentially, a list of either non-decreasing or non-increasing values. In the following list [0,5,4,3,3] there are 2 streaks: [0,5] and [4,3,3].
For the above dataframe the correct output would be:
s = {'ItemID': {0: '1',1: '1', 2: '2', 3: '2'}, 'Period' : {0: '1',1: '2', 2: '1', 3: '2'},
'MaxStreakLength': {0: 4,1: 5, 2: 3, 3: 6},
'AvgStreakLength': {0: 3,1: 3, 2: 2.5, 3: 6},
'NumStreaks':{0: 2,1: 1, 2: 2, 3: 1}}
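For example, for ItemID 1 in Period 1 the prices are [1, 2, 1, 2, 2, 3], which splits into the non-decreasing streaks [1, 2] and [1, 2, 2, 3]: 2 streaks, a maximum length of 4 and an average length of 3.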
How can I do this efficiently? The actual dataframe is quite large (millions of records).
I assume there is no direct method to achieve this sequence splitting, so here I have added a conditional sequence split:
import numpy as np

def sequential_split(p):
    # p is a series of price differences; a marks the non-negative steps
    a = p >= 0
    b = a.cumsum()
    # arr: running length of the current run of non-negative steps (0 at each break)
    arr = b - b.mask(a).ffill().fillna(0).astype(int)
    # positional indices just before each break (where a is False)
    streak_ends = (np.where(a == 0)[0] - 1)
    return arr, streak_ends

def get_data(p):
    # build runs in both directions and keep the direction with fewer breaks
    arr, s_e = sequential_split(p.diff())      # runs of non-decreasing prices
    arr1, s_e1 = sequential_split(p.diff(-1))  # runs of non-increasing prices
    if len(s_e) > len(s_e1):
        s_e, arr = s_e1, arr1
        streak_peaks = arr.iloc[s_e].add(1).tolist()
    else:
        streak_peaks = arr.loc[s_e[1:]].add(1).tolist() + [arr.iloc[-1] + 1]
    # [max streak length, avg streak length, number of streaks]
    return [arr.max() + 1, sum(streak_peaks) / len(streak_peaks), arr[arr == 0].shape[0]]

columns = ['MaxStreakLength', 'AvgStreakLength', 'NumStreaks']
a = df.groupby(['ItemID', 'Period'])['Price'].apply(get_data)
a.apply(lambda x: pd.Series(x, index=columns)).reset_index()
Out:
  ItemID Period  MaxStreakLength  AvgStreakLength  NumStreaks
0      1      1              4.0              3.0         2.0
1      1      2              5.0              5.0         1.0
2      2      1              3.0              2.5         2.0
3      2      2              6.0              6.0         1.0
I have this minimal sample data:
import numpy as np
import pandas as pd
from pandas import Timestamp
data = pd.DataFrame({'Client': {0: "Client_1", 1: "Client_2", 2: "Client_2", 3: "Client_3", 4: "Client_3", 5: "Client_3", 6: "Client_4", 7: "Client_4"},
'Id_Card': {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8},
'Type': {0: 'A', 1: 'B', 2: 'C', 3: np.nan, 4: 'A', 5: 'B', 6: np.nan, 7: 'B'},
'Loc': {0: 'ADW', 1: 'ZCW', 2: 'EWC', 3: "VWQ", 4: "OKS", 5: 'EQW', 6: "PKA", 7: 'CSA'},
'Amount': {0: 10.0, 1: 15.0, 2: 17.0, 3: 32.0, 4: np.nan, 5: 51.0, 6: 38.0, 7: -20.0},
'Net': {0: 30.0, 1: 42.0, 2: -10.0, 3: 15.0, 4: 98, 5: np.nan, 6: 23.0, 7: -10.0},
'Date': {0: Timestamp('2018-09-29 00:00:00'), 1: Timestamp('1996-08-02 00:00:00'), 2: np.nan, 3: Timestamp('2020-11-02 00:00:00'), 4: Timestamp('2008-12-27 00:00:00'), 5: Timestamp('2004-12-21 00:00:00'), 6: np.nan, 7: Timestamp('2010-08-25 00:00:00')}})
data
I'm trying to aggregate this data grouping by the Client column: counting the Id_Card per client, concatenating Type and Loc separated by ; (e.g. A;B and ZCW;EWC for Client_2, NOT A;ZCW B;EWC), summing Amount and Net per client, and getting the minimum Date per client. However, I'm facing some problems:
(1) These functions work perfectly individually, but I can't find a way to mix the aggregate function and the apply function:
Code example:
data.groupby("Client").agg({"Id_Card": "count", "Amount":"sum", "Date": "min"})
data.groupby('Client')['Loc'].apply(';'.join).reset_index()
(2) The apply function doesn't work for columns with missing values:
Code example:
data.groupby('Client')['Type'].apply(';'.join).reset_index()
TypeError: sequence item 0: expected str instance, float found
(3) The aggregate and apply functions don't let me pass multiple columns for one transformation:
Code example:
cols_to_sum = ["Amount", "Net"]
data.groupby("Client").agg({"Id_Card": "count", cols_to_sum:"sum", "Date": "min"})
cols_to_join = ["Type", "Loc"]
data.groupby('Client')[cols_to_join].apply(';'.join).reset_index()
In (3) I only put Amount and Net, and I could list them separately in the aggregate function, but I'm looking for a more efficient way since I'm working with plenty of columns.
The expected output is the same dataframe, but aggregated with the conditions outlined at the beginning.
To do the join, you have to filter out the NaN values. Since the join is applied in two places, I have created a separate function:
def join_non_nan_values(elements):
    # NaN != NaN, so "elem == elem" filters out the missing values
    return ";".join([elem for elem in elements if elem == elem])

data.groupby("Client").agg({"Id_Card": "count", "Type": join_non_nan_values,
                            "Loc": join_non_nan_values, "Amount": "sum",
                            "Net": "sum", "Date": "min"})
Go step by step, and prepare three different data frames to merge them later.
The first dataframe is for the simple functions like count, sum and min:
df1 = data.groupby("Client").agg({"Id_Card": "count", "Amount": "sum", "Net": "sum", "Date": "min"}).reset_index()
Next, deal with the Type and Loc joins; fillna('') takes care of the NaN values:
df2 = data[['Client', 'Type']].fillna('').groupby("Client")['Type'].apply(';'.join).reset_index()
df3 = data[['Client', 'Loc']].fillna('').groupby("Client")['Loc'].apply(';'.join).reset_index()
And finally you merge the results together:
data_new = df1.merge(df2, on='Client').merge(df3, on='Client')
data_new then holds one aggregated row per Client.
I am trying to calculate weighted average prices using pandas pivot table.
I have tried passing in a dictionary using aggfunc.
The entry below does not work when passed into aggfunc, although it looks like it should calculate the correct weighted average:
'Price': lambda x: np.average(x, weights=df['Balance'])
I have also tried using a manual groupby:
df.groupby('Product').agg({
'Balance': sum,
'Price': lambda x : np.average(x, weights='Balance'),
'Value': sum
})
This also yields the error:
TypeError: Axis must be specified when shapes of a and weights differ.
Here is sample data
import pandas as pd
import numpy as np
price_dict = {'Product': {0: 'A',
1: 'A',
2: 'A',
3: 'A',
4: 'A',
5: 'B',
6: 'B',
7: 'B',
8: 'B',
9: 'B',
10: 'C',
11: 'C',
12: 'C',
13: 'C',
14: 'C'},
'Balance': {0: 10,
1: 20,
2: 30,
3: 40,
4: 50,
5: 60,
6: 70,
7: 80,
8: 90,
9: 100,
10: 110,
11: 120,
12: 130,
13: 140,
14: 150},
'Price': {0: 1,
1: 2,
2: 3,
3: 4,
4: 5,
5: 6,
6: 7,
7: 8,
8: 9,
9: 10,
10: 11,
11: 12,
12: 13,
13: 14,
14: 15},
'Value': {0: 10,
1: 40,
2: 90,
3: 160,
4: 250,
5: 360,
6: 490,
7: 640,
8: 810,
9: 1000,
10: 1210,
11: 1440,
12: 1690,
13: 1960,
14: 2250}}
Trying to calculate the weighted average by passing a dict into aggfunc:
df = pd.DataFrame(price_dict)
df.pivot_table(
index='Product',
aggfunc = {
'Balance': sum,
'Price': np.mean,
'Value': sum
}
)
Output:
         Balance  Price  Value
Product
A            150      3    550
B            400      8   3300
C            650     13   8550
The expected outcome should be:
         Balance  Price  Value
Product
A            150   3.66    550
B            400   8.25   3300
C            650  13.15   8550
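(For product A, for example, the weighted price is (10·1 + 20·2 + 30·3 + 40·4 + 50·5) / (10 + 20 + 30 + 40 + 50) = 550 / 150 ≈ 3.67.)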
Here is one way using apply:
df.groupby('Product').apply(lambda x : pd.Series(
{'Balance': x['Balance'].sum(),
'Price': np.average(x['Price'], weights=x['Balance']),
'Value': x['Value'].sum()}))
Out[57]:
         Balance      Price   Value
Product
A          150.0   3.666667   550.0
B          400.0   8.250000  3300.0
C          650.0  13.153846  8550.0
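If apply turns out to be slow on a large frame, a vectorized sketch of the same weighted average could look like this (an alternative to the answer above, assuming the price_dict from the question):
import pandas as pd

df = pd.DataFrame(price_dict)  # price_dict as defined in the question

# Weighted average price = sum(Price * Balance) / sum(Balance), computed per Product
tmp = df.assign(PxB=df["Price"] * df["Balance"])
out = tmp.groupby("Product").agg(
    Balance=("Balance", "sum"),
    PxB=("PxB", "sum"),
    Value=("Value", "sum"),
)
out["Price"] = out["PxB"] / out["Balance"]
out = out[["Balance", "Price", "Value"]]
print(out)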