Creating a monotonic list with groupby with missing dates - pandas

I am finding the rolling average of all counties over a period of time. However, the first county, county "A", is missing the time point for January 3rd.
from datetime import datetime
import pandas as pd

def crt_data():
    data = [[datetime(2020, 1, 1), 'A', 1],
            [datetime(2020, 1, 2), 'A', 2],
            # [datetime(2020, 1, 3), 'A', 3],
            [datetime(2020, 1, 4), 'A', 4],
            [datetime(2020, 1, 1), 'B', 10],
            [datetime(2020, 1, 2), 'B', 11],
            [datetime(2020, 1, 3), 'B', 12],
            [datetime(2020, 1, 4), 'B', 13],
            [datetime(2020, 1, 1), 'C', 4],
            [datetime(2020, 1, 2), 'C', 3],
            [datetime(2020, 1, 3), 'C', 2],
            [datetime(2020, 1, 4), 'C', 1]]
    df = pd.DataFrame(data, columns=['my_date', 'County', 'cmi'])
    return df
df = crt_data()
print('\n \n roll over by timepoint')
df['my_mean'] = df.groupby('my_date')['cmi'].mean().reset_index(0, drop=True)
df = df.sort_values(by=['County', 'my_date'])
df['rolling_cmi2'] = df.my_mean.rolling(2).mean()
print(df)
my_date County cmi my_mean rolling_cmi2
0 2020-01-01 A 1 5.000000 NaN
1 2020-01-02 A 2 5.333333 5.166667
2 2020-01-04 A 4 7.000000 6.166667
3 2020-01-01 B 10 6.000000 6.500000
4 2020-01-02 B 11 NaN NaN
5 2020-01-03 B 12 NaN NaN
6 2020-01-04 B 13 NaN NaN
7 2020-01-01 C 4 NaN NaN
8 2020-01-02 C 3 NaN NaN
9 2020-01-03 C 2 NaN NaN
10 2020-01-04 C 1 NaN NaN
EDIT:
What I expect to see is something like this:
my_date County cmi my_mean rolling_cmi2
0 2020-01-01 A 1 5.000000 NaN
1 2020-01-02 A 2 5.333333 5.166667
3 2020-01-03 B 12 7.000000 6.166667
2 2020-01-04 A 4 7.000000 7.000000
When I group, I get no mean for January 3rd and two means landing on January 1st rows, which makes the rolling values incorrect.
How do I reduce this to a single average per date, with each average attached to the correct date? I know County C has all the time points; can I move County C to the top of the list to get a complete list of dates? How would you do this?
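For what it's worth, the NaN rows come from index alignment: groupby('my_date')['cmi'].mean() returns only four values, and after reset_index(0, drop=True) they carry a 0-3 RangeIndex, so the assignment fills only the first four rows of the eleven-row frame. A minimal sketch of one possible fix (the daily frame and column names are illustrative, not from the post): compute one cross-county mean per date, roll on that four-row frame, then merge back onto the county rows by date:

daily = (df.groupby('my_date', as_index=False)['cmi'].mean()
           .rename(columns={'cmi': 'my_mean'}))
# groupby sorts the dates, so a window of 2 here always spans consecutive dates
daily['rolling_cmi2'] = daily['my_mean'].rolling(2).mean()
df = df.merge(daily, on='my_date', how='left')

Because the per-date mean is taken across whichever counties report that day, the missing County A point on January 3rd no longer produces a hole.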

Related

Delete rows in data frame based on condition of ranked values

If I have the below dataframe:
import numpy as np
import pandas as pd

raw_data = {
    'code': [1, 1, 1, 1, 2, 2, 2, 2],
    'Date': ['2022-01-04', '2022-01-01', '2022-01-03', '2022-01-02',
             '2022-01-08', '2022-01-07', '2022-01-06', '2022-01-05'],
    'flag_check': [np.nan, np.nan, '11-33-24-33333', np.nan,
                   np.nan, '11-55-24-33443', np.nan, np.nan],
    'rank': [np.nan] * 8,
}
df = pd.DataFrame(raw_data, columns=['code', 'Date', 'flag_check', 'rank'])
I need to do the following:
1. Rank the entries based on code, then date.
2. Within the entries for the same code, fill the rank column with sequence numbers 1, 2, 3, ... based on code and date.
3. Check the value of flag_check; if it is not null, delete all rows after it.
Expected result
Here's a way to do it:
df['rank'] = df.groupby('code')['Date'].rank(method='dense').astype(int)
df = df.sort_values(['code', 'Date'])
x = df.groupby('code')['flag_check'].apply(lambda s: s.shift().notna().cumsum())
df = df.loc[x[x == 0].index, :].reset_index(drop=True)
Input:
code Date flag_check rank
0 1 2022-01-04 NaN NaN
1 1 2022-01-01 NaN NaN
2 1 2022-01-03 11-33-24-33333 NaN
3 1 2022-01-02 NaN NaN
4 2 2022-01-08 NaN NaN
5 2 2022-01-07 11-55-24-33443 NaN
6 2 2022-01-06 NaN NaN
7 2 2022-01-05 NaN NaN
Output:
code Date flag_check rank
0 1 2022-01-01 NaN 1
1 1 2022-01-02 NaN 2
2 1 2022-01-03 11-33-24-33333 3
3 2 2022-01-05 NaN 1
4 2 2022-01-06 NaN 2
5 2 2022-01-07 11-55-24-33443 3
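The idea behind the last two lines of the code: within each code group, shift() pushes the notna marker down one row, so the cumulative sum stays at 0 for every row up to and including the first flagged row; keeping only the rows where the running count is still 0 therefore drops everything after the flag.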
Annotated code
# Order by Date
s = df.sort_values('Date')
# rank the date column per code group
s['rank'] = s.groupby('code')['Date'].rank(method='dense')
# create a boolean mask marking, per code group, the rows up to and including the first non-null flag_check
mask = s['flag_check'].notna()[::-1].groupby(df['code']).cummax()
Result
s[mask]
code Date flag_check rank
1 1 2022-01-01 NaN 1.0
3 1 2022-01-02 NaN 2.0
2 1 2022-01-03 11-33-24-33333 3.0
7 2 2022-01-05 NaN 1.0
6 2 2022-01-06 NaN 2.0
5 2 2022-01-07 11-55-24-33443 3.0
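Because boolean indexing aligns on index labels, it does not matter that mask was computed from the reversed series: s[mask] keeps, within each code group, exactly the rows up to and including the flagged one.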

Subtract values from different groups

I have the following DataFrame:
A X
Time
1 a 10
2 b 17
3 b 20
4 c 21
5 c 36
6 d 40
given by pd.DataFrame({'Time': [1, 2, 3, 4, 5, 6], 'A': ['a', 'b', 'b', 'c', 'c', 'd'], 'X': [10, 17, 20, 21, 36, 40]}).set_index('Time')
The desired output is:
Time Difference
0 2 7
1 4 1
2 6 4
The difference of 1 comes from subtracting 20 from 21 (first "c" value minus last "b" value).
I'm open to NumPy transformations as well.
Aggregate with GroupBy.agg, taking the 'first' and 'last' value of X per group, then subtract the shifted 'last' column; finally drop the first row by position (fill_value=0 only matters for that first row, which is removed anyway):
df = df.reset_index()
df1 = df.groupby('A', as_index=False, sort=False).agg(first=('X', 'first'),
                                                      last=('X', 'last'),
                                                      Time=('Time', 'first'))
df1['Difference'] = df1['first'].sub(df1['last'].shift(fill_value=0))
df1 = df1[['Time', 'Difference']].iloc[1:].reset_index(drop=True)
print(df1)
Time Difference
0 2 7
1 4 1
2 6 4
IIUC, you can pivot, ffill the columns, and compute the difference:
g = df.reset_index().groupby('A')

(df.assign(col=g.cumcount().values)
   .pivot(index='A', columns='col', values='X')
   .ffill(axis=1)
   .assign(Time=g['Time'].first(),
           diff=lambda d: d[0] - d[1].shift())
   [['Time', 'diff']].iloc[1:]
   .rename_axis(index=None, columns=None)
)
output:
Time diff
b 2 7.0
c 4 1.0
d 6 4.0
Intermediate, pivoted/ffilled dataframe:
col 0 1 Time diff
A
a 10.0 10.0 1 NaN
b 17.0 20.0 2 7.0
c 21.0 36.0 4 1.0
d 40.0 40.0 6 4.0
Another possible solution:
(df.assign(Y=df['X'].shift())
   .iloc[df.index % 2 == 0]
   .assign(Difference=lambda z: z['X'] - z['Y'])
   .reset_index()
   .loc[:, ['Time', 'Difference']]
)
Output:
Time Difference
0 2 7.0
1 4 1.0
2 6 4.0
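A shorter sketch is possible under the assumption, true for this sample, that equal A labels are always contiguous (the boundary/diff/out names are illustrative): mark the first row of each run, where the label differs from the previous row, and subtract the previous row's X, which is the last value of the preceding group:

# True on the first row of each run of equal labels
boundary = df['A'].ne(df['A'].shift())
# first value of each run minus last value of the previous run
diff = df.loc[boundary, 'X'] - df['X'].shift()[boundary]
# the first run has no predecessor, so drop it
out = diff.iloc[1:].rename('Difference').reset_index()

This yields the same Time/Difference pairs: (2, 7.0), (4, 1.0), (6, 4.0).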

Python - count and Difference data frames

I have two data frames about occupation and industry in 2005 and 2006. I would like to create a df with a column holding the change between these years, i.e. whether each occupation/industry pair grew or decreased. Here is a sample:
import pandas as pd
d = {'OCC2005': [1234, 1234, 1234, 1234, 2357, 2357, 2357, 2357, 4321, 4321, 4321, 4321, 3333],
     'IND2005': [4, 5, 6, 7, 5, 6, 7, 4, 6, 7, 5, 4, 5],
     'Result': [7, 8, 12, 1, 11, 15, 20, 1, 5, 12, 8, 4, 3]}
df = pd.DataFrame(data=d)
print(df)

d2 = {'OCC2006': [1234, 1234, 1234, 1234, 2357, 2357, 2357, 2357, 4321, 4321, 4361, 4321, 3333, 4444],
      'IND2006': [4, 5, 6, 7, 5, 6, 7, 4, 6, 7, 5, 4, 5, 8],
      'Result': [17, 18, 12, 1, 1, 5, 20, 1, 5, 2, 18, 4, 0, 15]}
df2 = pd.DataFrame(data=d2)
print(df2)

Final_Result = df2['Result'] - df['Result']
print(Final_Result)
I would like to create a df with the columns occ, ind, and final_result.
Rename columns of df to match column names of df2:
MAP = dict(zip(df.columns, df2.columns))
out = (df2.set_index(['OCC2006', 'IND2006'])
          .sub(df.rename(columns=MAP).set_index(['OCC2006', 'IND2006']))
          .reset_index())
print(out)
# Output
OCC2006 IND2006 Result
0 1234 4 10.0
1 1234 5 10.0
2 1234 6 0.0
3 1234 7 0.0
4 2357 4 0.0
5 2357 5 -10.0
6 2357 6 -10.0
7 2357 7 0.0
8 3333 5 -3.0
9 4321 4 0.0
10 4321 5 NaN
11 4321 6 0.0
12 4321 7 -10.0
13 4361 5 NaN
14 4444 8 NaN
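The NaN rows are keys present in only one year: (4321, 5) appears only in 2005, while (4361, 5) and (4444, 8) appear only in 2006; sub aligns on the union of the two indexes, so unmatched keys yield NaN. If a missing key should count as zero instead (an option, not something the asker specified), DataFrame.sub accepts fill_value:

out = (df2.set_index(['OCC2006', 'IND2006'])
          .sub(df.rename(columns=MAP).set_index(['OCC2006', 'IND2006']), fill_value=0)
          .reset_index())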

Adding new column to pandas dataframe after groupby and rolling on a column

I am trying to add a new column to a pandas dataframe after a groupby and rolling average, but the newly generated column changes order after reset_index().
Original dataframe:
Name Values
0 A 1
1 A 2
2 A 3
3 B 1
4 B 2
5 C 3
6 A 2
7 A 6
8 B 8
9 B 3
10 D 0
After groupby and rolling, it looks something like:
Name
A 0 NaN
1 NaN
2 2.000000
6 2.333333
7 3.666667
B 3 NaN
4 NaN
8 3.666667
9 4.333333
C 5 NaN
D 10 NaN
Name: Values, dtype: float64
Now, can someone help me add this result as a new column in the original dataframe? When I try reset_index(), the order changes to the groupby order.
Use apply to apply a rolling mean to each group:
df['rolling_mean'] = df.groupby('Name').Values.apply(lambda x: x.rolling(3).mean())
df
Name Values rolling_mean
0 A 1 NaN
1 A 2 NaN
2 A 3 2.000000
3 B 1 NaN
4 B 2 NaN
5 C 3 NaN
6 A 2 2.333333
7 A 6 3.666667
8 B 8 3.666667
9 B 3 4.333333
10 D 0 NaN
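On recent pandas versions, groupby(...)['Values'].apply may return its result under a (Name, original index) MultiIndex, which would break the direct column assignment above. transform is an equivalent spelling of the same computation that avoids this, since it always returns values aligned to the original index:

df['rolling_mean'] = df.groupby('Name')['Values'].transform(lambda x: x.rolling(3).mean())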
Here is an example:
import numpy as np
import pandas as pd

df = pd.DataFrame({'Name': {0: 'A', 1: 'A', 2: 'A', 3: 'B', 4: 'B', 5: 'C',
                            6: 'A', 7: 'A', 8: 'B', 9: 'B', 10: 'D'},
                   'Values': {0: 1, 1: 2, 2: 3, 3: 1, 4: 2, 5: 3, 6: 2,
                              7: 6, 8: 8, 9: 3, 10: 0}})
df2 = pd.DataFrame({2: {('A', 0): np.nan,
                        ('A', 1): np.nan,
                        ('A', 2): 2.0,
                        ('A', 6): 2.333333,
                        ('A', 7): 3.666667,
                        ('B', 3): np.nan,
                        ('B', 4): np.nan,
                        ('B', 8): 3.666667,
                        ('B', 9): 4.333333,
                        ('C', 5): np.nan,
                        ('D', 10): np.nan}})
df.merge(df2.reset_index(level=0), left_index=True, right_index=True)
Name Values 0 2
0 A 1 A NaN
1 A 2 A NaN
2 A 3 A 2.000000
3 B 1 B NaN
4 B 2 B NaN
5 C 3 C NaN
6 A 2 A 2.333333
7 A 6 A 3.666667
8 B 8 B 3.666667
9 B 3 B 4.333333
10 D 0 D NaN
or join:
df.join(df2.reset_index(level=0))
Name Values 0 2
0 A 1 A NaN
1 A 2 A NaN
2 A 3 A 2.000000
3 B 1 B NaN
4 B 2 B NaN
5 C 3 C NaN
6 A 2 A 2.333333
7 A 6 A 3.666667
8 B 8 B 3.666667
9 B 3 B 4.333333
10 D 0 D NaN

expanding exponential weighting on Pandas multi-index DataFrame where each day is a matrix

I have a multi-index dataframe where the first index is date, and each day is a 3x3 matrix:
from datetime import datetime
import pandas as pd

multi_index = pd.MultiIndex.from_product([[datetime(2017, 1, 1), datetime(2017, 1, 2), datetime(2017, 1, 3)],
                                          ['A', 'B', 'C']])
df = pd.DataFrame(index=multi_index,
                  data={"A": [1, 2, 3, 4, 1, 2, 3, 4, 2],
                        "B": [1, 2, 3, 4, 1, 2, 3, 4, 2],
                        "C": [1, 2, 3, 4, 1, 2, 3, 4, 2]})
I would like to create a new dataframe with the same structure as df, but the values are exponentially weighted averages of the expanding window of the matrices.
So for 2017-01-01, new df is the same as old df. On 2017-01-02, new df is the exponentially weighted average of the 2 matrices on 2017-01-01 and 2017-01-02 from df. On 2017-01-03, it is the exponentially weighted average of 3 matrices.
I was trying combinations of groupby/expanding/apply/ewm but did not find a solution.
Would the below work for you? Group by the second index level and then apply pandas.DataFrame.ewm:
print(df)
A B C
2017-01-01 A 1 1 1
B 2 2 2
C 3 3 3
2017-01-02 A 4 4 4
B 1 1 1
C 2 2 2
2017-01-03 A 3 3 3
B 4 4 4
C 2 2 2
result = df.groupby(level=1).apply(lambda x: x.ewm(1).mean())
print(result)
A B C
2017-01-01 A 1.000000 1.000000 1.000000
B 2.000000 2.000000 2.000000
C 3.000000 3.000000 3.000000
2017-01-02 A 3.000000 3.000000 3.000000
B 1.333333 1.333333 1.333333
C 2.333333 2.333333 2.333333
2017-01-03 A 3.000000 3.000000 3.000000
B 2.857143 2.857143 2.857143
C 2.142857 2.142857 2.142857
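As a sanity check: ewm(1) sets com=1, i.e. alpha = 0.5, and with the default adjust=True the newest-to-oldest weights after three observations are 1, 0.5, 0.25. For row A on 2017-01-03 this gives (3 + 0.5*4 + 0.25*1) / (1 + 0.5 + 0.25) = 5.25 / 1.75 = 3.0, matching the output above; the same expanding exponential weighting is applied element-wise to every cell of the daily matrix.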