Sorting pandas dataframe by groups - pandas

I would like to sort a dataframe by certain priority rules.
I've achieved this in the code below but I think this is a very hacky solution.
Is there a more proper Pandas way of doing this?
import pandas as pd
import numpy as np
# Demo data: every row carries a primary metric, a 0/1 flag, a secondary value
# that only matters when the flag is 1, and a final tie-breaking metric.
df = pd.DataFrame({
    "Primary Metric": [80, 100, 90, 100, 80, 100, 80, 90, 90, 100, 90, 90, 80, 90, 90, 80, 80, 80, 90, 90, 100, 80, 80, 100, 80],
    "Secondary Metric Flag": [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
    "Secondary Value": [15, 59, 70, 56, 73, 88, 83, 64, 12, 90, 64, 18, 100, 79, 7, 71, 83, 3, 26, 73, 44, 46, 99, 24, 20],
    "Final Metric": [222, 883, 830, 907, 589, 93, 479, 498, 636, 761, 851, 349, 25, 405, 132, 491, 253, 318, 183, 635, 419, 885, 305, 258, 924],
})

# Distinct primary values, highest first: this is the outermost sort level.
Primary_List = sorted(np.unique(df["Primary Metric"]), reverse=True)

# DataFrame.append no longer exists in pandas 2.x, so the sorted pieces are
# collected in a list and concatenated once at the end.
pieces = []
for p in Primary_List:
    group = df[df["Primary Metric"] == p]
    # Flag == 1 rows come first, ranked by 'Secondary Value' then 'Final Metric'.
    pt1 = group[group["Secondary Metric Flag"] == 1].sort_values(
        by=["Secondary Value", "Final Metric"], ascending=[False, False]
    )
    # Flag == 0 rows ignore 'Secondary Value' and go straight to 'Final Metric'.
    pt0 = group[group["Secondary Metric Flag"] == 0].sort_values(
        ["Final Metric"], ascending=False
    )
    pieces.extend([pt1, pt0])

# pd.concat rejects an empty list, so fall back to an empty frame in that case.
df_sorted = pd.concat(pieces) if pieces else pd.DataFrame()
df_sorted
The priority rules are:
First sort by the 'Primary Metric', then by the 'Secondary Metric
Flag'.
If the 'Secondary Metric Flag' ==1, sort by 'Secondary Value' then
the 'Final Metric'
If ==0, go right for the 'Final Metric'.
Appreciate any feedback.

You do not need a for loop or groupby here; just split the frame by the flag, sort each part with sort_values, and concatenate the parts:
# Rows with the flag set are ranked by 'Secondary Value', then 'Final Metric'.
df1 = df.loc[df['Secondary Metric Flag'] == 1].sort_values(
    by=['Primary Metric', 'Secondary Value', 'Final Metric'], ascending=[True, False, False])
# Rows without the flag skip 'Secondary Value' and are ranked by 'Final Metric'.
df0 = df.loc[df['Secondary Metric Flag'] == 0].sort_values(
    ["Primary Metric", "Final Metric"], ascending=[True, False])
# Flagged rows are stacked first. The last sort must be descending (highest
# primary value on top) and stable: the default quicksort may reorder rows that
# tie on 'Primary Metric', which would destroy the ordering built above.
df = pd.concat([df1, df0]).sort_values('Primary Metric', ascending=False, kind='mergesort')

sorted with loc
def k(t):
    """Build the sort key for the row of ``df`` labelled ``t``.

    Every component is negated so that an ascending ``sorted`` produces a
    descending order. The secondary value is multiplied by the (negated) flag,
    so it only influences the order of rows whose flag is 1.
    """
    primary, flag, value, final = df.loc[t]
    neg_flag = -flag
    return (-primary, neg_flag, neg_flag * value, -final)


# Order the index labels by their key, then select the rows in that order.
ordered_labels = sorted(df.index, key=k)
df.loc[ordered_labels]
Primary Metric Secondary Metric Flag Secondary Value Final Metric
9 100 1 90 761
5 100 1 88 93
1 100 1 59 883
3 100 1 56 907
23 100 1 24 258
20 100 0 44 419
13 90 1 79 405
19 90 1 73 635
7 90 1 64 498
11 90 1 18 349
10 90 0 64 851
2 90 0 70 830
8 90 0 12 636
18 90 0 26 183
14 90 0 7 132
15 80 1 71 491
21 80 1 46 885
17 80 1 3 318
24 80 0 20 924
4 80 0 73 589
6 80 0 83 479
22 80 0 99 305
16 80 0 83 253
0 80 0 15 222
12 80 0 100 25
sorted with itertuples
def k(t):
    """Return the sort key for one ``itertuples`` row.

    ``t`` is ``(index, primary, flag, value, final)``. The index label is
    ignored; the remaining fields are negated so an ascending sort ranks the
    rows in descending order, and the secondary value is weighted by the flag
    so it is neutral (zero) for rows whose flag is 0.
    """
    _label, primary, flag, value, final = t
    neg_flag = -flag
    return (-primary, neg_flag, neg_flag * value, -final)
# Sort the row tuples with k, then transpose them with zip(*...): the first
# transposed tuple holds the index labels, the remaining ones hold one column each.
idx, *tups = zip(*sorted(df.itertuples(), key=k))
# Iterating a DataFrame yields its column names, so zip(df, tups) pairs every
# column name with its sorted values; idx becomes the index of the new frame.
pd.DataFrame(dict(zip(df, tups)), idx)
lexsort
# Pull each column out once so the key list below stays readable.
p = df['Primary Metric']
s = df['Secondary Metric Flag']
v = df['Secondary Value']
f = df['Final Metric']
# np.lexsort uses the LAST key as the primary one, so the priority-ordered key
# list is reversed with [::-1]. Negating the keys turns lexsort's ascending
# order into a descending one; -s * v is 0 wherever the flag is 0, which takes
# 'Secondary Value' out of the comparison for those rows.
a = np.lexsort([
-p, -s, -s * v, -f
][::-1])
# `a` contains positions, not labels, hence iloc.
df.iloc[a]
Construct New DataFrame
# Build a temporary frame whose plain ascending sort matches the priority rules:
#  * mul([-1, -1, 1, -1]) negates 'Primary Metric', 'Secondary Metric Flag' and
#    'Final Metric' (column order as in df) and leaves 'Secondary Value' as is;
#  * assign then replaces 'Secondary Value' with flag * value, where the flag is
#    already negated, i.e. -value for flagged rows and 0 for the others;
#  * sort_values([*d]) sorts by every column in order, and the resulting index
#    is used to pull the untouched rows from the original df.
df.mul([-1, -1, 1, -1]).assign(
**{'Secondary Value': lambda d: d['Secondary Metric Flag'] * d['Secondary Value']}
).pipe(
lambda d: df.loc[d.sort_values([*d]).index]
)

Related

I am trying to generate random values for each column of a numpy array using a specific range per column, but the result does not respect those ranges

I need each column to contain random integers within its own range: random.randint(1, 50) for column 1, random.randint(51, 100) for column 2, and so on.
import numpy
import random
import pandas
from random import randint
# Row labels 1..5 for the printed table.
wsn = numpy.arange(1, 6)
taskn = 3
# NOTE: each call passes ONE positional argument (a tuple of five random upper
# bounds) to numpy.random.randint. That argument is `low`, and because `high`
# is omitted numpy draws from [0, low) for every column. The columns are
# therefore only capped by a random bound (and may contain 0); they are not
# confined to 1-50, 51-100, ... as intended.
t1 = numpy.random.randint((random.randint(2, 50),random.randint(51, 100),
random.randint(101, 150),random.randint(151, 200),random.randint(201, 250)),size=(5,5))
t2 = numpy.random.randint((random.randint(2, 50),random.randint(51, 100),
random.randint(101, 150),random.randint(151, 200),random.randint(201, 250)),size=(5,5))
t3= numpy.random.randint((random.randint(2, 50),random.randint(51, 100),
random.randint(101, 150),random.randint(151, 200),random.randint(201, 250)),size=(5,5))
# Print a header, then one line per row showing that row of each task matrix.
print('\nGenerated Data:\t\n\nNumber \t\t\t Task 1 \t\t\t Task 2 \t\t\t Task 3\n')
ni = len(t1)
for i in range(ni):
print('\t {0} \t {1} \t {2} \t {3}\n'.format(wsn[i], t1[i],t2[i],t3[i]))
print('\n\n')
It prints the following
Generated Data:
Number Task 1 Task 2 Task 3
1 [ 1 13 16 121 18] [ 5 22 34 65 194] [ 10 68 60 134 130]
2 [ 0 2 117 176 46] [ 1 15 111 116 180] [22 41 70 24 85]
3 [ 0 12 121 19 193] [ 0 5 37 109 205] [ 5 53 5 106 15]
4 [ 0 5 97 99 235] [ 0 22 142 11 150] [ 6 79 129 64 87]
5 [ 2 46 71 101 186] [ 3 57 141 37 71] [ 15 32 9 117 77]
Sometimes it even generates 0, although I never specified 0 in any of the ranges.
np.random.randint(low, high, size=None) allows for low and high being arrays of length num_intervals.
In that case, when size is not specified, it will generate as many integers as there are intervals defined by the low and high bounds.
If you want to generate multiple integers per interval, you just need to specify the corresponding size argument, whose last dimension must be num_intervals.
Here it is size=(num_tasks, num_samples, num_intervals).
import numpy as np
# Edges of the consecutive sampling intervals: interval j is
# [bounds[j], bounds[j + 1]) because randint's upper bound is exclusive.
bounds = np.array([1, 50, 100, 150, 200, 250])
num_tasks, num_samples = 3, 7
# Pair every lower edge with the next edge as its (exclusive) upper limit.
bounds_low, bounds_high = bounds[:-1], bounds[1:]
num_intervals = bounds_low.shape[0]
# The trailing axis has length num_intervals so that the per-interval
# low/high arrays broadcast against it.
arr = np.random.randint(
    low=bounds_low,
    high=bounds_high,
    size=(num_tasks, num_samples, num_intervals),
)
Checking the properties:
# The sample array must have one trailing slot per interval ...
expected_shape = (num_tasks, num_samples, num_intervals)
assert arr.shape == expected_shape
# ... and every value in slot j must lie in [bounds_low[j], bounds_high[j]).
for itvl_idx in range(num_intervals):
    samples = arr[..., itvl_idx]
    assert (samples >= bounds_low[itvl_idx]).all()
    assert (samples < bounds_high[itvl_idx]).all()
An example of output:
array([[[ 45, 61, 100, 185, 216],
[ 36, 78, 117, 152, 222],
[ 18, 77, 112, 153, 221],
[ 9, 70, 123, 178, 223],
[ 16, 84, 118, 157, 233],
[ 42, 78, 108, 179, 240],
[ 40, 52, 116, 152, 225]],
[[ 3, 92, 102, 151, 236],
[ 45, 89, 138, 179, 218],
[ 45, 73, 120, 183, 231],
[ 35, 80, 130, 167, 212],
[ 14, 86, 118, 195, 212],
[ 20, 66, 117, 151, 248],
[ 49, 94, 138, 175, 212]],
[[ 13, 75, 116, 169, 206],
[ 13, 75, 127, 179, 213],
[ 29, 64, 136, 151, 213],
[ 1, 81, 140, 197, 200],
[ 17, 77, 112, 171, 215],
[ 18, 75, 103, 180, 209],
[ 47, 57, 132, 194, 234]]])

SpecificationError: Function names must be unique if there is no new column names assigned

I want to create a new column in the clin dataframe based on the following conditions:
1 if vals>=2*365 or is NAN
otherwise 0
I then assign the new column name as SURV.
import numpy as np
# Survival time in days as floats, so the missing entry becomes NaN.
vals = clin['days_to_death'].astype(np.float32)
# non-LTS is 0, LTS is 1
# One 0/1 label per row: 1 when the value is at least two years or is missing.
surv = [1 if ( v>=2*365 or np.isnan(v) ) else 0 for v in vals ]
# NOTE: `surv` is already the finished list of labels, not a function. Passing
# it to DataFrame.apply is what raises the SpecificationError shown below.
clin['SURV'] = clin.apply(surv, axis=1)
Traceback:
SpecificationError: Function names must be unique if there is no new column names assigned
---------------------------------------------------------------------------
SpecificationError Traceback (most recent call last)
<ipython-input-31-603dee8413ce> in <module>
5 # non-LTS is 0, LTS is 1
6 surv = [1 if ( v>=2*365 or np.isnan(v) ) else 0 for v in vals ]
----> 7 clin['SURV'] = clin.apply(surv, axis=1)
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7766 kwds=kwds,
7767 )
-> 7768 return op.get_result()
7769
7770 def applymap(self, func, na_action: Optional[str] = None) -> DataFrame:
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/apply.py in get_result(self)
146 # multiple values for keyword argument "axis"
147 return self.obj.aggregate( # type: ignore[misc]
--> 148 self.f, axis=self.axis, *self.args, **self.kwds
149 )
150
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/frame.py in aggregate(self, func, axis, *args, **kwargs)
7572 axis = self._get_axis_number(axis)
7573
-> 7574 relabeling, func, columns, order = reconstruct_func(func, **kwargs)
7575
7576 result = None
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/aggregation.py in reconstruct_func(func, **kwargs)
93 # there is no reassigned name
94 raise SpecificationError(
---> 95 "Function names must be unique if there is no new column names "
96 "assigned"
97 )
SpecificationError: Function names must be unique if there is no new column names assigned
clin
# Sample clinical table indexed by patient barcode. Numeric-looking fields are
# kept as strings, as in the source data; the one missing 'days_to_death' is
# spelled np.nan because a bare `nan` is not a defined name.
clin = pd.DataFrame(
    [
        [1, '466', '47', 0, '90'],
        [1, '357', '54', 1, '80'],
        [1, '108', '72', 1, '60'],
        [1, '254', '51', 0, '80'],
        [1, '138', '78', 1, '80'],
        [0, np.nan, '67', 0, '60'],
    ],
    columns=[
        'vital_status',
        'days_to_death',
        'age_at_initial_pathologic_diagnosis',
        'gender',
        'karnofsky_performance_score',
    ],
    index=[
        'TCGA-06-1806',
        'TCGA-06-5408',
        'TCGA-06-5410',
        'TCGA-06-5411',
        'TCGA-06-5412',
        'TCGA-06-5413',
    ],
)
Expected output:
vital_status
days_to_death
age_at_initial_pathologic_diagnosis
gender
karnofsky_performance_score
SURV
TCGA-06-1806
1
466
47
0
90
0
TCGA-06-5408
1
357
54
1
80
0
TCGA-06-5410
1
108
72
1
60
0
TCGA-06-5411
1
254
51
0
80
0
TCGA-06-5412
1
138
78
1
80
0
TCGA-06-5413
0
nan
67
0
60
1
Make a new column of all 0's and then update the column with your desired parameters.
# Rows counted as long-term survivors: at least two years of survival, or a
# missing 'days_to_death' value.
days = pd.to_numeric(clin.days_to_death)
is_long_term = days.ge(2*365) | clin.days_to_death.isna()
# Start from all zeros, then flag the selected rows.
clin['SURV'] = 0
clin.loc[is_long_term, 'SURV'] = 1
print(clin)
Output:
vital_status days_to_death age_at_initial_pathologic_diagnosis gender karnofsky_performance_score SURV
TCGA-06-1806 1 466 47 0 90 0
TCGA-06-5408 1 357 54 1 80 0
TCGA-06-5410 1 108 72 1 60 0
TCGA-06-5411 1 254 51 0 80 0
TCGA-06-5412 1 138 78 1 80 0
TCGA-06-5413 0 NaN 67 0 60 1

Groupby and sum based on column name

I have a dataframe:
df = pd.DataFrame({
'BU': ['AA', 'AA', 'AA', 'BB', 'BB', 'BB'],
'Line_Item': ['Revenues','EBT', 'Expenses', 'Revenues', 'EBT', 'Expenses'],
'201901': [100, 120, 130, 200, 190, 210],
'201902': [100, 120, 130, 200, 190, 210],
'201903': [200, 250, 450, 120, 180, 190],
'202001': [200, 250, 450, 120, 180, 190],
'202002': [200, 250, 450, 120, 180, 190],
'202003': [200, 250, 450, 120, 180, 190]
})
The columns represent years and months respectively. I would like to sum the month columns into a new column for each year. The result should look like the following:
df = pd.DataFrame({
'BU': ['AA', 'AA', 'AA', 'BB', 'BB', 'BB'],
'Line_Item': ['Revenues','EBT', 'Expenses', 'Revenues', 'EBT', 'Expenses'],
'201901': [100, 120, 130, 200, 190, 210],
'201902': [100, 120, 130, 200, 190, 210],
'201903': [200, 250, 450, 120, 180, 190],
'202001': [200, 250, 450, 120, 180, 190],
'202002': [200, 250, 450, 120, 180, 190],
'202003': [200, 250, 450, 120, 180, 190],
'2019': [400, 490, 710, 520, 560, 610],
'2020': [600, 750, 1350, 360, 540, 570]
})
My actual dataset has a number of years and has 12 months for each year. Hoping not to have to add the columns manually.
Try creating a DataFrame that contains the year columns and convert the column names to_datetime :
data_df = df.iloc[:, 2:]
data_df.columns = pd.to_datetime(data_df.columns, format='%Y%m')
2019-01-01 2019-02-01 2019-03-01 2020-01-01 2020-02-01 2020-03-01
0 100 100 200 200 200 200
1 120 120 250 250 250 250
2 130 130 450 450 450 450
3 200 200 120 120 120 120
4 190 190 180 180 180 180
5 210 210 190 190 190 190
resample sum the columns by Year and rename columns to just the year values:
# Resample along the rows of the transposed frame instead of passing axis=1,
# which recent pandas versions deprecate (as they do the 'Y' alias). 'YS' bins
# by year start; the bin labels are then reduced to the plain year, so the
# result is the same table with integer columns such as 2019 and 2020.
data_df = (
    data_df.T.resample('YS').sum().T.rename(columns=lambda c: c.year)
)
2019 2020
0 400 600
1 490 750
2 710 1350
3 520 360
4 560 540
5 610 570
Then join back to the original DataFrame:
new_df = df.join(data_df)
new_df:
BU Line_Item 201901 201902 201903 202001 202002 202003 2019 2020
0 AA Revenues 100 100 200 200 200 200 400 600
1 AA EBT 120 120 250 250 250 250 490 750
2 AA Expenses 130 130 450 450 450 450 710 1350
3 BB Revenues 200 200 120 120 120 120 520 360
4 BB EBT 190 190 180 180 180 180 560 540
5 BB Expenses 210 210 190 190 190 190 610 570
Are the columns you are summing always the same? That is, are there always three 2019 columns with those same names, and three 2020 columns with those names? If so, you can just hardcode the new columns.
df['2019'] = df['201901'] + df['201902'] + df['201903']
df['2020'] = df['202001'] + df['202002'] + df['202003']

Extract information to work with with pandas

I have this dataframe:
Column Non-Null Dtype
0 nombre 74 non-null object
1 fabricante - 74 non-null - object
2 calorias -74 non-null -int64
3 proteina -74 non-null -int64
4 grasa -74 non-null -int64
5 sodio -74 non-null -int64
6 fibra dietaria -74 non-null -float64
7 carbohidratos -74 non-null -float64
8 azúcar -74 non-null -int64
9 potasio -74 non-null -int64
10 vitaminas y minerales -74 non-null -int64
I am trying to extract information like this:
cereal_df.loc[cereal_df['fabricante'] == 'Kelloggs', 'sodio']
The output is (good, that is what I want to extract in this case right?)
2 260
3 140
6 125
16 290
17 90
19 140
21 220
24 125
25 200
26 0
27 240
37 170
38 170
43 150
45 190
46 220
47 170
50 320
55 210
57 0
59 290
63 70
64 230
Name: sodio, dtype: int64
That is what I need so far, but when I try to write a function like this (in order to get the confidence):
# Intended to return the z confidence interval (inicio, final) for the mean of
# column `variable` among the rows whose 'fabricante' equals `fabricante`, at
# confidence level `confianza`.
def valor_medio_intervalo(fabricante, variable, confianza):
# NOTE: the column indexer below is cereal_df[variable] (the column's VALUES),
# not the label `variable`; .loc then looks for columns named after those
# values, which is the KeyError reported further down.
subconjunto = cereal_df.loc[cereal_df['fabricante'] == fabricante, cereal_df[variable]]
inicio, final = sm.stats.DescrStatsW(subconjunto[variable]).zconfint_mean(alpha = 1 - confianza)
return inicio, final
Then I run the function:
valor_medio_intervalo('Kelloggs', 'azúcar', 0.95)
And the output is:
KeyError Traceback (most recent call last)
<ipython-input-57-11420ac4d15f> in <module>()
1 #TEST_CELL
----> 2 valor_medio_intervalo('Kelloggs', 'azúcar', 0.95)
7 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1296 if missing == len(indexer):
1297 axis_name = self.obj._get_axis_name(axis)
-> 1298 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
1299
1300 # We (temporarily) allow for some missing keys with .loc, except in
KeyError: "None of [Int64Index([ 6, 8, 5, 0, 8, 10, 14, 8, 6, 5, 12, 1, 9, 7, 13, 3, 2,\n 12, 13, 7, 0, 3, 10, 5, 13, 11, 7, 12, 12, 15, 9, 5, 3, 4,\n 11, 10, 11, 6, 9, 3, 6, 12, 3, 13, 6, 9, 7, 2, 10, 14, 3,\n 0, 0, 6, -1, 12, 8, 6, 2, 3, 0, 0, 0, 15, 3, 5, 3, 14,\n 3, 3, 12, 3, 3, 8],\n dtype='int64')] are in the [columns]"
I do not understand what is going on.
I appreciate your help or any hint.
Thanks in advance
Just got the answer examining the code:
def valor_medio_intervalo(fabricante, variable, confianza):
    """Return the z confidence interval for the mean of one column of one manufacturer.

    Args:
        fabricante: value matched against the 'fabricante' column of cereal_df.
        variable: label of the numeric column to analyse.
        confianza: confidence level, e.g. 0.95.

    Returns:
        Tuple (inicio, final) with the lower and upper limits of the interval.
    """
    # Select the column by its LABEL. Passing cereal_df[variable] here would
    # use the column's values as column labels and raise a KeyError.
    subconjunto = cereal_df.loc[cereal_df['fabricante'] == fabricante, variable]
    # subconjunto is already the Series of interest, so it is passed as is
    # (no further [variable] lookup).
    inicio, final = sm.stats.DescrStatsW(subconjunto).zconfint_mean(alpha=1 - confianza)
    return inicio, final
in the line
inicio, final = sm.stats.DescrStatsW(subconjunto[variable]).zconfint_mean(alpha = 1 -
confianza)
the
(subconjunto[variable])
is just
(subconjunto)

How can the name of the index column of a pandas DataFrame be changed?

I've got a DataFrame that gets set up such that a column of country names is set as the index column. I want to change the title of that index column. This seems like a simple thing to do, but I can't find how to actually do it. How can it be done? How can the index "foods" column here be changed to "countries"?
import pandas as pd
df = pd.DataFrame(
[
["alcoholic drinks" , 375, 135, 458, 475],
["beverages" , 57, 47, 53, 73],
["carcase meat" , 245, 267, 242, 227],
["cereals" , 1472, 1494, 1462, 1582],
["cheese" , 105, 66, 103, 103],
["confectionery" , 54, 41, 62, 64],
["fats and oils" , 193, 209, 184, 235],
["fish" , 147, 93, 122, 160],
["fresh fruit" , 1102, 674, 957, 1137],
["fresh potatoes" , 720, 1033, 566, 874],
["fresh Veg" , 253, 143, 171, 265],
["other meat" , 685, 586, 750, 803],
["other veg." , 488, 355, 418, 570],
["processed potatoes", 198, 187, 220, 203],
["processed veg." , 360, 334, 337, 365],
["soft drinks" , 1374, 1506, 1572, 1256],
["sugars" , 156, 139, 147, 175]
],
columns = [
"foods",
"England",
"Northern Ireland",
"Scotland",
"Wales"
]
)
df = df.set_index("foods")
df = df.transpose()
df = df.rename({"foods": "countries"})
df
Try this:
df = df.rename_axis("countries", axis=0).rename_axis(None, axis=1)
Demo:
In [10]: df
Out[10]:
alcoholic drinks beverages carcase meat ...
countries
England 375 57 245
Northern Ireland 135 47 267
Scotland 458 53 242
Wales 475 73 227
"foods" is the name of your column index, not the name of your row index.
You can set it explicitly like this:
df.index.name = 'countries'
Output:
foods alcoholic drinks beverages carcase meat cereals cheese \
countries
England 375 57 245 1472 105
Northern Ireland 135 47 267 1494 66
Scotland 458 53 242 1462 103
Wales 475 73 227 1582 103
And, to remove "foods" from the column index name:
df.columns.name = None
Output:
alcoholic drinks beverages carcase meat cereals cheese \
countries
England 375 57 245 1472 105
Northern Ireland 135 47 267 1494 66
Scotland 458 53 242 1462 103
Wales 475 73 227 1582 103
Pandas has an Index.rename() method. Works like this:
import pandas as pd
df = pd.DataFrame(
[
["alcoholic drinks", 375, 135, 458, 475],
["beverages", 57, 47, 53, 73],
["carcase meat", 245, 267, 242, 227],
["cereals", 1472, 1494, 1462, 1582],
["cheese", 105, 66, 103, 103],
["confectionery", 54, 41, 62, 64],
["fats and oils", 193, 209, 184, 235],
["fish", 147, 93, 122, 160],
["fresh fruit", 1102, 674, 957, 1137],
["fresh potatoes", 720, 1033, 566, 874],
["fresh Veg", 253, 143, 171, 265],
["other meat", 685, 586, 750, 803],
["other veg.", 488, 355, 418, 570],
["processed potatoes", 198, 187, 220, 203],
["processed veg.", 360, 334, 337, 365],
["soft drinks", 1374, 1506, 1572, 1256],
["sugars", 156, 139, 147, 175]
],
columns=[
"foods",
"England",
"Northern Ireland",
"Scotland",
"Wales"
]
)
df.set_index('foods', inplace=True)
df = df.transpose()
print(df.head())
foods confectionery fats and oils fish fresh fruit ...
England 54 193 147 1102
Northern Ireland 41 209 93 674
Scotland 62 184 122 957
Wales 64 235 160 1137
Renaming the index of the DataFrame:
df.index.rename('Countries', inplace=True)
print(df.head())
foods confectionery fats and oils fish fresh fruit ...
Countries
England 54 193 147 1102
Northern Ireland 41 209 93 674
Scotland 62 184 122 957
Wales 64 235 160 1137
The Index that holds the column labels now carries the name "foods": set_index() gave that name to the row index, and transpose() moved it to the columns. All we need to do is rename it to an empty string:
df.columns.rename('', inplace=True)
print(df.head())
confectionery fats and oils fish fresh fruit ...
Countries
England 54 193 147 1102
Northern Ireland 41 209 93 674
Scotland 62 184 122 957
Wales 64 235 160 1137
I don't prefer this over @MaxU's answer because it's slower, but it is shorter code, for whatever that's worth.
df.stack().rename_axis(['countries', None]).unstack()
alcoholic drinks beverages carcase meat cereals
countries
England 375 57 245 1472
Northern Ireland 135 47 267 1494
Scotland 458 53 242 1462
Wales 475 73 227 1582