Quickly replace values in a Pandas DataFrame

I have the following dataframe:
import pandas as pd

df = pd.DataFrame(
    {
        'A': [1, 2],
        'B': [3, 4]
    }, index=['1', '2'])
df.loc[:, 'Sum'] = df.sum(axis=1)
df.loc['Sum'] = df.sum(axis=0)
print(df)
#      A  B  Sum
# 1    1  3    4
# 2    2  4    6
# Sum  3  7   10
I want to:
replace 1 by 3*4/10
replace 2 by 3*6/10
replace 3 by 4*7/10
replace 4 by 7*6/10
(i.e. replace each cell by its column sum times its row sum, divided by the grand total).
What is the easiest way to do this? I want the solution to extend to n rows and columns. Been cracking my head over this. TIA!

If I understood you correctly:
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        'A': [1, 2],
        'B': [3, 4]
    }, index=['1', '2'])
df.loc[:, 'Sum'] = df.sum(axis=1)
df.loc['Sum'] = df.sum(axis=0)
print(df)
# map each original value to its replacement; everything else is left as-is
conditions = [(df == 1), (df == 2), (df == 3), (df == 4)]
values = [(3*4)/10, (3*6)/10, (4*7)/10, (7*6)/10]
df[df.columns] = np.select(conditions, values, df)
Output:
       A    B   Sum
1    1.2  2.8   4.2
2    1.8  4.2   6.0
Sum  2.8  7.0  10.0

Let us try creating it from the original df, before you compute the sums and assign them:
import numpy as np

# outer product of row sums and column sums, scaled by the grand total
v = np.multiply.outer(df.sum(1).values, df.sum().values) / df.sum().sum()
out = pd.DataFrame(v, index=df.index, columns=df.columns)
out
Out[20]:
     A    B
1  1.2  2.8
2  1.8  4.2
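For completeness, a minimal self-contained sketch (assuming df is the original frame before the sums were assigned) that applies the same outer product and then recomputes the Sum row and column from the new values:
import numpy as np
import pandas as pd

df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}, index=['1', '2'])
# row-sum x column-sum outer product, scaled by the grand total;
# this works for any number of rows and columns
v = np.multiply.outer(df.sum(1).values, df.sum().values) / df.sum().sum()
out = pd.DataFrame(v, index=df.index, columns=df.columns)
out.loc[:, 'Sum'] = out.sum(axis=1)
out.loc['Sum'] = out.sum(axis=0)
print(out)
#        A    B   Sum
# 1    1.2  2.8   4.0
# 2    1.8  4.2   6.0
# Sum  3.0  7.0  10.0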

Pandas pivot table with prefix to columns

I have a dataframe:
df =
  C1  A1  A2  A3  Type
   A   1   5   2    AG
   A   7   3   8    SC
And I want to create:
df =
  C1  A1_AG  A1_SC  A2_AG  A2_SC
   A      1      7      5      3
How can it be done?
You can rather use a melt and transpose:
(df.melt('Type')
   .assign(col=lambda d: d['Type'] + '_' + d['variable'])
   .set_index('col')[['value']].T
)
Output:
col    AG_A1  SC_A1  AG_A2  SC_A2  AG_A3  SC_A3
value      1      7      5      3      2      8
With additional column(s):
(df.melt(['C1', 'Type'])
   .assign(col=lambda d: d['Type'] + '_' + d['variable'])
   .pivot(index=['C1'], columns='col', values='value')
   .reset_index()
)
Output:
col C1  AG_A1  AG_A2  AG_A3  SC_A1  SC_A2  SC_A3
0    A      1      5      2      7      3      8
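If you prefer the A1_AG naming from the question over AG_A1, swapping the concatenation order in assign should do it (a minimal variant of the snippet above):
(df.melt('Type')
   .assign(col=lambda d: d['variable'] + '_' + d['Type'])
   .set_index('col')[['value']].T
)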
Use DataFrame.set_index with DataFrame.unstack:
df = df.set_index(['C1', 'Type']).unstack()
df.columns = df.columns.map(lambda x: f'{x[0]}_{x[1]}')
df = df.reset_index()
print(df)
  C1  A1_AG  A1_SC  A2_AG  A2_SC  A3_AG  A3_SC
0  A    1.0    7.0    5.0    3.0    2.0    8.0
One convenience option is pivot_wider from pyjanitor:
# pip install pyjanitor
import pandas as pd
import janitor
df.pivot_wider(index='C1', names_from='Type')
  C1  A1_AG  A1_SC  A2_AG  A2_SC  A3_AG  A3_SC
0  A    1.0    7.0    5.0    3.0    2.0    8.0
Of course, you can skip the convenience function and use pivot directly:
result = df.pivot(index='C1', columns='Type')
result.columns = result.columns.map('_'.join)
result.reset_index()
  C1  A1_AG  A1_SC  A2_AG  A2_SC  A3_AG  A3_SC
0  A    1.0    7.0    5.0    3.0    2.0    8.0

Create Dataframe from Matrix Search Calculation Pandas

I have the following dataframe:
df = pd.DataFrame({'Idenitiy': ['Haus1', 'Haus2', 'Haus1', 'Haus2'],
                   'kind': ['Gas', 'Gas', 'Strom', 'Strom'],
                   '2005': [2, 3, 5, 6],
                   '2006': [2, 3.5, 5.5, 7]})
Now I would like to have the following dataframe as output, the product of the entities per kind and year:
Year  Product(Gas)  Product(Strom)
2005  6             30
2006  7             38.5
Thank you.
Here's a way to do it:
# multiply the column values per group
from functools import reduce

def mult(f):
    v = [reduce(lambda a, b: a * b, f['2005']),
         reduce(lambda a, b: a * b, f['2006'])]
    return pd.Series(v, index=['2005', '2006'])

# groupby and multiply column values
df1 = df.groupby('kind')[['2005', '2006']].apply(mult).unstack().reset_index()
df1.columns = ['Year', 'Kind', 'vals']
print(df1)
   Year   Kind  vals
0  2005    Gas   6.0
1  2005  Strom  30.0
2  2006    Gas   7.0
3  2006  Strom  38.5
# reshape the table
df1 = df1.pivot_table(index='Year', columns='Kind', values='vals')
# fix the column names
df1 = df1.add_prefix('Product_')
df1.columns.name = None
df1 = df1.reset_index()
   Year  Product_Gas  Product_Strom
0  2005          6.0           30.0
1  2006          7.0           38.5
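As a side note, a shorter sketch of the same result is possible with GroupBy.prod instead of reduce (same df assumed):
# product per kind, then transpose so the years become rows
out = df.groupby('kind')[['2005', '2006']].prod().T
out.index.name = 'Year'
out.columns.name = None
out = out.add_prefix('Product_').reset_index()
print(out)
#    Year  Product_Gas  Product_Strom
# 0  2005          6.0           30.0
# 1  2006          7.0           38.5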

Removing Rows in a Pandas Dataframe with Identical (and adjacent) entries in a specific column

I have a dataframe where I have some duplicates in the "Item" column.
I want to remove the rows where there are duplicates (adjacent) but retain the last one, i.e. get rid of the red but keep the green.
I then want to create a new column, where apples is assumed to be a start, and each following row is a time delta from it, i.e.:
IIUC, try:
df_out = df.assign(Item_cnt=(df['Item'] != df['Item'].shift()).cumsum())\
           .drop_duplicates(['Item', 'Item_cnt'], keep='last')
df_out['delta T'] = df_out['datetime'] - df_out.groupby((df_out['Item'] == 'apples').cumsum())['datetime'].transform('first')
Output:
      Item  datetime  Item_cnt  delta T
2   apples       1.2         1      0.0
3  oranges       2.3         2      1.1
4   apples       2.5         3      0.0
5  bananas       2.7         4      0.2
Details:
Create a grouping with cumsum by checking whether the next line differs, then use drop_duplicates, keeping the last record in each group.
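For illustration, here is what the intermediate Item_cnt column looks like (a sketch, assuming the sample frame defined in the answers below):
import pandas as pd

df = pd.DataFrame({'Item': ['apples', 'apples', 'apples', 'oranges', 'apples', 'bananas'],
                   'datetime': [1, 1.1, 1.2, 2.3, 2.5, 2.7]})
# a new group starts whenever Item differs from the previous row
print(df.assign(Item_cnt=(df['Item'] != df['Item'].shift()).cumsum()))
#       Item  datetime  Item_cnt
# 0   apples       1.0         1
# 1   apples       1.1         1
# 2   apples       1.2         1
# 3  oranges       2.3         2
# 4   apples       2.5         3
# 5  bananas       2.7         4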
IIUC:
df = pd.DataFrame({'Item': ['apples', 'apples', 'apples', 'orange', 'apples', 'bananas'],
                   'dateTime': [1, 1.1, 1.2, 2.3, 2.5, 2.7]})
s = df.copy()
# round the times so near-identical adjacent entries collapse together
s['dateTime'] = s['dateTime'].round()
idx = s.drop_duplicates(subset=['Item', 'dateTime'], keep='last').index.tolist()
df = df.loc[idx]
df.loc[df['Item'].ne('apples'), 'delta'] = abs(df['dateTime'].shift() - df['dateTime'])
print(df.fillna(0))
      Item  dateTime  delta
2   apples       1.2    0.0
3   orange       2.3    1.1
4   apples       2.5    0.0
5  bananas       2.7    0.2
Here is the df:
df = pd.DataFrame.from_dict({'Item': ['apples', 'apples', 'apples', 'oranges', 'apples', 'bananas'],
                             'dateTime': [1, 1.1, 1.2, 2.3, 2.5, 2.7]})
You can't use duplicated because you need to keep multiple copies of the same item, so try this:
df['Item_lag'] = df['Item'].shift(-1)
df = df[df['Item'] != df['Item_lag']]  # get rid of repeated Items
df['deltaT'] = df['dateTime'] - df['dateTime'].shift(1).fillna(0)  # calculate time diff
df.drop(['dateTime', 'Item_lag'], axis=1, inplace=True)  # drop extra columns
df  # display df
out:
      Item  deltaT
2   apples     1.2
3  oranges     1.1
4   apples     0.2
5  bananas     0.2

pandas groupby and agg operation of selected columns and row

I have a dataframe as below:
I am not sure if it is possible to use pandas to produce an output as below:
difference = Response[df.Time=="pre"] - Response.min for each group
If pre is always first per group and the values in the output should be repeated:
df['diff'] = df.groupby('IDs')['Response'].transform(lambda x: x.iat[0] - x.min())
To keep the value only in the first row of each group, you can replace the repeated values with empty strings, but that mixes numeric and string values in one column, which can be a problem for further processing:
df['diff'] = df['diff'].mask(df['diff'].duplicated(), '')
EDIT:
df = pd.DataFrame({
    'Response': [2, 5, 0.4, 2, 1, 4],
    'Time': [7, 'pre', 9, 4, 2, 'pre'],
    'IDs': list('aaabbb')
})
# print(df)
d = df[df.Time == "pre"].set_index('IDs')['Response'].to_dict()
print(d)
{'a': 5.0, 'b': 4.0}
df['diff'] = df.groupby('IDs')['Response'].transform(lambda x: d[x.name] - x.min())
print(df)
   Response Time IDs  diff
0       2.0    7   a   4.6
1       5.0  pre   a   4.6
2       0.4    9   a   4.6
3       2.0    4   b   3.0
4       1.0    2   b   3.0
5       4.0  pre   b   3.0
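An equivalent sketch without the intermediate dict, using two transforms (same df assumed):
# broadcast the 'pre' Response per group, then subtract the group minimum
pre = df['Response'].where(df['Time'] == 'pre')
df['diff'] = (pre.groupby(df['IDs']).transform('first')
              - df.groupby('IDs')['Response'].transform('min'))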

Iterating over rows and columns in Pandas

I am trying to fill each column's NaN values with that column's mean.
import numpy as np
import pandas as pd
table = pd.DataFrame({'A': [1, 2, np.nan],
                      'B': [3, np.nan, np.nan],
                      'C': [4, 5, 6]})

def impute_missing_values(table):
    for column in table:
        for value in column:
            if value == 'NaN':
                value = column.mean(skipna=True)
            else:
                value = value

impute_missing_values(table)
table
Why am I getting an error for this code?
IIUC:
table.fillna(table.mean())
Output:
     A    B  C
0  1.0  3.0  4
1  2.0  3.0  5
2  1.5  3.0  6
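Note that fillna returns a new DataFrame by default, so assign it back if you want to keep the result:
table = table.fillna(table.mean())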
Okay, I am adding this as another answer because this isn't something I recommend at all. Using pandas methods vectorizes operations for better performance, and loops are best avoided when possible.
However, here is a quick fix to your code:
import pandas as pd
import numpy as np
import math

table = pd.DataFrame({'A': [1, 2, np.nan],
                      'B': [3, np.nan, np.nan],
                      'C': [4, 5, 6]})

def impute_missing_values(df):
    for column in df:
        # iterate over the (index, value) pairs of each column
        for idx, value in df[column].items():
            if math.isnan(value):
                df.loc[idx, column] = df[column].mean(skipna=True)
    return df

impute_missing_values(table)
table
Output:
     A    B  C
0  1.0  3.0  4
1  2.0  3.0  5
2  1.5  3.0  6
You can try the SimpleImputer from scikit-learn (https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer) with the mean strategy.
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

table = pd.DataFrame({'A': [1, 2, np.nan],
                      'B': [3, np.nan, np.nan],
                      'C': [4, 5, 6]})
print(table, '\n')
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform returns a plain array, so restore the original column labels
table_means = pd.DataFrame(imp.fit_transform(table), columns=table.columns)
print(table_means)
The print commands result in:
     A    B  C
0  1.0  3.0  4
1  2.0  NaN  5
2  NaN  NaN  6

     A    B    C
0  1.0  3.0  4.0
1  2.0  3.0  5.0
2  1.5  3.0  6.0
To correct your code (as per my comment below):
def impute_missing_values(table):
    for column in table:
        # replace NaNs in each column with that column's mean
        table.loc[:, column] = np.where(table[column].isna(), table[column].mean(), table[column])
    return table
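A quick check of the corrected function on the sample frame:
print(impute_missing_values(table))
#      A    B    C
# 0  1.0  3.0  4.0
# 1  2.0  3.0  5.0
# 2  1.5  3.0  6.0
# (C may remain integer on some pandas versions, since nothing was imputed there)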