What is the fastest way for adding the vector elements horizontally in odd order? - optimization

Following this question, I implemented the horizontal addition, this time 5 by 5 and 7 by 7. It does the job correctly, but it is not fast enough.
Can it be made faster? I tried using hadd and other instructions, but the improvement is limited. For example, when I use _mm256_bsrli_epi128 it is slightly better, but it needs some extra permutations that ruin the benefit because of the lanes. So the question is how it should be implemented to gain more performance. The same applies to 9 elements, etc.
This adds 5 elements horizontally and puts the results in places 0, 5, and 10:
/*
 * Horizontal sums of 5 consecutive 16-bit elements of a.
 *
 * Columns 0, 5 and 10 of the result hold the (wrapping) sums of elements
 * 0..4, 5..9 and 10..14 of a.  Only these three columns are meant to be
 * consumed: the last two shifts work per 128-bit lane, so columns 6 and 7
 * miss the elements that live in the upper lane.
 *
 * Declared static inline: a plain C99/C11 `inline` definition provides no
 * external definition, so a call the compiler decides not to inline (for
 * example at -O0) would fail to link.
 */
static inline __m256i _mm256_hadd5x5_epi16(__m256i a)
{
    /* Upper 128-bit lane of a moved to the lower lane, zeros in the upper lane. */
    const __m256i upper = _mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31);

    /* alignr against `upper` gives a zero-filled shift across the full 256 bits. */
    const __m256i shift1 = _mm256_alignr_epi8(upper, a, 1 * 2);
    const __m256i shift2 = _mm256_alignr_epi8(upper, a, 2 * 2);

    /* Cheaper in-lane byte shifts; sufficient for columns 0, 5 and 10. */
    const __m256i shift3 = _mm256_bsrli_epi128(shift2, 2);
    const __m256i shift4 = _mm256_bsrli_epi128(shift3, 2);

    const __m256i sum12 = _mm256_add_epi16(shift1, shift2);
    const __m256i sum34 = _mm256_add_epi16(shift3, shift4);
    return _mm256_add_epi16(_mm256_add_epi16(sum12, sum34), a);
}
And this adds 7 elements horizontally and puts the results in places 0 and 7:
/*
 * Horizontal sums of 7 consecutive 16-bit elements of a.
 *
 * Column k of the result holds the (wrapping) sum of elements k..k+6 of a,
 * with elements past the end of the vector counted as zero.  The columns of
 * interest are 0 (elements 0..6) and 7 (elements 7..13).
 *
 * Declared static inline: a plain C99/C11 `inline` definition provides no
 * external definition, so a call the compiler decides not to inline (for
 * example at -O0) would fail to link.
 */
static inline __m256i _mm256_hadd7x7_epi16(__m256i a)
{
    /* Upper 128-bit lane of a moved to the lower lane, zeros in the upper lane. */
    const __m256i upper = _mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31);

    /* shiftN = a shifted right by N 16-bit elements across the full 256 bits. */
    const __m256i shift1 = _mm256_alignr_epi8(upper, a, 1 * 2);
    const __m256i shift2 = _mm256_alignr_epi8(upper, a, 2 * 2);
    const __m256i shift3 = _mm256_alignr_epi8(upper, a, 3 * 2);
    const __m256i shift4 = _mm256_alignr_epi8(upper, a, 4 * 2);
    const __m256i shift5 = _mm256_alignr_epi8(upper, a, 5 * 2);
    const __m256i shift6 = _mm256_alignr_epi8(upper, a, 6 * 2);

    const __m256i sum12 = _mm256_add_epi16(shift1, shift2);
    const __m256i sum34 = _mm256_add_epi16(shift3, shift4);
    const __m256i sum56 = _mm256_add_epi16(shift5, shift6);
    return _mm256_add_epi16(_mm256_add_epi16(sum12, sum34),
                            _mm256_add_epi16(sum56, a));
}

Indeed, it is possible to calculate these sums with fewer instructions. The idea is to accumulate
the partial sums not only in columns 10, 5 and 0, but also in other columns. This reduces the number of
vpaddw instructions and the number of 'shuffles' compared to your solution.
#include <stdio.h>
#include <x86intrin.h>
/* gcc -O3 -Wall -m64 -march=haswell hor_sum5x5.c */
int print_vec_short(__m256i x);
int print_10_5_0_short(__m256i x);
__m256i _mm256_hadd5x5_epi16(__m256i a );
__m256i _mm256_hadd7x7_epi16(__m256i a );
/*
 * Demonstration driver.  Fills a 16-element short array with 1..16, computes
 * the 5-element sums (wanted in columns 10, 5 and 0) and the 7-element sums
 * (wanted in columns 7 and 0) step by step while printing every intermediate
 * vector, and prints the results of _mm256_hadd5x5_epi16 and
 * _mm256_hadd7x7_epi16 on the same input for comparison.  Always returns 0.
 */
int main() {
short x[16];
for(int i=0; i<16; i++) x[i] = i+1; /* arbitrary initial values */
__m256i t0 = _mm256_loadu_si256((__m256i*)x);
/* t2[k] = t0[k+2]: rotate the whole register right by one 32-bit element (two shorts), crossing the 128-bit lanes */
__m256i t2 = _mm256_permutevar8x32_epi32(t0,_mm256_set_epi32(0,7,6,5,4,3,2,1));
/* t02[k] = t0[k] + t0[k+2] */
__m256i t02 = _mm256_add_epi16(t0,t2);
/* shift by 4 bytes (two shorts) inside each 128-bit lane: t3[k] = t0[k+4] at columns 10, 5 and 0 */
__m256i t3 = _mm256_bsrli_epi128(t2,4); /* byte shift right */
/* t023[k] = t0[k] + t0[k+2] + t0[k+4] at columns 10, 5 and 0 */
__m256i t023 = _mm256_add_epi16(t02,t3);
/* shift by 16 bits (one short) inside each 64-bit element: t13[k] = t0[k+1] + t0[k+3] at columns 10, 5 and 0 */
__m256i t13 = _mm256_srli_epi64(t02,16); /* bit shift right */
/* sum[k] = t0[k] + ... + t0[k+4] at columns 10, 5 and 0 */
__m256i sum = _mm256_add_epi16(t023,t13);
printf("t0 = ");print_vec_short(t0 );
printf("t2 = ");print_vec_short(t2 );
printf("t02 = ");print_vec_short(t02 );
printf("t3 = ");print_vec_short(t3 );
printf("t023= ");print_vec_short(t023);
printf("t13 = ");print_vec_short(t13 );
printf("sum = ");print_vec_short(sum );
printf("\nVector elements of interest: columns 10, 5, 0:\n");
printf("t0 [10, 5, 0] = ");print_10_5_0_short(t0 );
printf("t2 [10, 5, 0] = ");print_10_5_0_short(t2 );
printf("t02 [10, 5, 0] = ");print_10_5_0_short(t02 );
printf("t3 [10, 5, 0] = ");print_10_5_0_short(t3 );
printf("t023[10, 5, 0] = ");print_10_5_0_short(t023);
printf("t13 [10, 5, 0] = ");print_10_5_0_short(t13 );
printf("sum [10, 5, 0] = ");print_10_5_0_short(sum );
/* reference result from the shift-and-add implementation, for comparison */
printf("\nSum with _mm256_hadd5x5_epi16(t0)\n");
sum = _mm256_hadd5x5_epi16(t0);
printf("sum [10, 5, 0] = ");print_10_5_0_short(sum );
/* now the sum of 7 elements: */
printf("\n\nSum of short ints 13...7 and short ints 6...0:\n");
__m256i t = _mm256_loadu_si256((__m256i*)x);
/* copy 32-bit element 3 (shorts 6 and 7) into element 7, elements 0..6 stay in place; the permute reads t0, which still holds the same data as t */
t0 = _mm256_permutevar8x32_epi32(t0,_mm256_set_epi32(3,6,5,4,3,2,1,0));
/* clear columns 14 and 7: the low lane now holds shorts 0..6 and the high lane holds shorts 7..13 (short 7 sits in column 15) */
t0 = _mm256_and_si256(t0,_mm256_set_epi16(0xFFFF,0,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF, 0,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF));
/* alignr of a vector with itself rotates each 128-bit lane: here by 2 bytes (one short) */
__m256i t1 = _mm256_alignr_epi8(t0,t0,2);
__m256i t01 = _mm256_add_epi16(t0,t1);
/* rotate each lane by 4 bytes (two shorts) */
__m256i t23 = _mm256_alignr_epi8(t01,t01,4);
__m256i t0123 = _mm256_add_epi16(t01,t23);
/* rotate each lane by 8 bytes (four shorts); after this add every column holds the total of its own lane */
__m256i t4567 = _mm256_alignr_epi8(t0123,t0123,8);
__m256i sum08 = _mm256_add_epi16(t0123,t4567); /* all elements are summed, but another permutation is needed to get the answer at position 7 */
/* 32-bit element 0 goes to elements 0..2 and element 4 to elements 3..7: the high-lane total lands in column 7, the low-lane total stays in column 0 */
sum = _mm256_permutevar8x32_epi32(sum08,_mm256_set_epi32(4,4,4,4,4,0,0,0));
printf("t = ");print_vec_short(t );
printf("t0 = ");print_vec_short(t0 );
printf("t1 = ");print_vec_short(t1 );
printf("t01 = ");print_vec_short(t01 );
printf("t23 = ");print_vec_short(t23 );
printf("t0123 = ");print_vec_short(t0123 );
printf("t4567 = ");print_vec_short(t4567 );
printf("sum08 = ");print_vec_short(sum08 );
printf("sum = ");print_vec_short(sum );
/* reference result from the shift-and-add implementation, for comparison */
printf("\nSum with _mm256_hadd7x7_epi16(t) (the answer is in column 0 and in column 7)\n");
sum = _mm256_hadd7x7_epi16(t);
printf("sum = ");print_vec_short(sum );
return 0;
}
/*
 * Horizontal sums of 5 consecutive 16-bit elements of a.
 *
 * Columns 0, 5 and 10 of the result hold the (wrapping) sums of elements
 * 0..4, 5..9 and 10..14 of a.  Only these three columns are meant to be
 * consumed: the last two shifts work per 128-bit lane, so columns 6 and 7
 * miss the elements that live in the upper lane.
 *
 * Defined without `inline`: a bare C99/C11 `inline` definition is external
 * only while a non-inline file-scope declaration is also visible; without
 * one, a call that is not inlined (e.g. at -O0) fails to link.
 */
__m256i _mm256_hadd5x5_epi16(__m256i a)
{
    /* Upper 128-bit lane of a moved to the lower lane, zeros in the upper lane. */
    const __m256i upper = _mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31);

    /* alignr against `upper` gives a zero-filled shift across the full 256 bits. */
    const __m256i shift1 = _mm256_alignr_epi8(upper, a, 1 * 2);
    const __m256i shift2 = _mm256_alignr_epi8(upper, a, 2 * 2);

    /* Cheaper in-lane byte shifts; sufficient for columns 0, 5 and 10. */
    const __m256i shift3 = _mm256_bsrli_epi128(shift2, 2);
    const __m256i shift4 = _mm256_bsrli_epi128(shift3, 2);

    const __m256i sum12 = _mm256_add_epi16(shift1, shift2);
    const __m256i sum34 = _mm256_add_epi16(shift3, shift4);
    return _mm256_add_epi16(_mm256_add_epi16(sum12, sum34), a);
}
/*
 * Horizontal sums of 7 consecutive 16-bit elements of a.
 *
 * Column k of the result holds the (wrapping) sum of elements k..k+6 of a,
 * with elements past the end of the vector counted as zero.  The columns of
 * interest are 0 (elements 0..6) and 7 (elements 7..13).
 *
 * Defined without `inline`: a bare C99/C11 `inline` definition is external
 * only while a non-inline file-scope declaration is also visible; without
 * one, a call that is not inlined (e.g. at -O0) fails to link.
 */
__m256i _mm256_hadd7x7_epi16(__m256i a)
{
    /* Upper 128-bit lane of a moved to the lower lane, zeros in the upper lane. */
    const __m256i upper = _mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31);

    /* shiftN = a shifted right by N 16-bit elements across the full 256 bits. */
    const __m256i shift1 = _mm256_alignr_epi8(upper, a, 1 * 2);
    const __m256i shift2 = _mm256_alignr_epi8(upper, a, 2 * 2);
    const __m256i shift3 = _mm256_alignr_epi8(upper, a, 3 * 2);
    const __m256i shift4 = _mm256_alignr_epi8(upper, a, 4 * 2);
    const __m256i shift5 = _mm256_alignr_epi8(upper, a, 5 * 2);
    const __m256i shift6 = _mm256_alignr_epi8(upper, a, 6 * 2);

    const __m256i sum12 = _mm256_add_epi16(shift1, shift2);
    const __m256i sum34 = _mm256_add_epi16(shift3, shift4);
    const __m256i sum56 = _mm256_add_epi16(shift5, shift6);
    return _mm256_add_epi16(_mm256_add_epi16(sum12, sum34),
                            _mm256_add_epi16(sum56, a));
}
/*
 * Print the sixteen 16-bit elements of x on one line, highest column first,
 * with a '|' separator between groups of four.  Always returns 0.
 */
int print_vec_short(__m256i x)
{
    short lanes[16];
    _mm256_storeu_si256((__m256i *)lanes, x);

    for (int col = 15; col >= 0; col--) {
        printf("%4hi ", lanes[col]);
        /* Group separator after columns 12, 8 and 4, but not after column 0. */
        if (col != 0 && col % 4 == 0) {
            printf("| ");
        }
    }
    printf("\n");
    return 0;
}
/*
 * Print only columns 10, 5 and 0 of x (in that order) on one line.
 * Always returns 0.
 */
int print_10_5_0_short(__m256i x)
{
    short lanes[16];
    _mm256_storeu_si256((__m256i *)lanes, x);

    const short col10 = lanes[10];
    const short col5 = lanes[5];
    const short col0 = lanes[0];
    printf("%4hi %4hi %4hi \n", col10, col5, col0);
    return 0;
}
The output is:
$ ./a.out
t0 = 16 15 14 13 | 12 11 10 9 | 8 7 6 5 | 4 3 2 1
t2 = 2 1 16 15 | 14 13 12 11 | 10 9 8 7 | 6 5 4 3
t02 = 18 16 30 28 | 26 24 22 20 | 18 16 14 12 | 10 8 6 4
t3 = 0 0 2 1 | 16 15 14 13 | 0 0 10 9 | 8 7 6 5
t023= 18 16 32 29 | 42 39 36 33 | 18 16 24 21 | 18 15 12 9
t13 = 0 18 16 30 | 0 26 24 22 | 0 18 16 14 | 0 10 8 6
sum = 18 34 48 59 | 42 65 60 55 | 18 34 40 35 | 18 25 20 15
Vector elements of interest: columns 10, 5, 0:
t0 [10, 5, 0] = 11 6 1
t2 [10, 5, 0] = 13 8 3
t02 [10, 5, 0] = 24 14 4
t3 [10, 5, 0] = 15 10 5
t023[10, 5, 0] = 39 24 9
t13 [10, 5, 0] = 26 16 6
sum [10, 5, 0] = 65 40 15
Sum with _mm256_hadd5x5_epi16(t0)
sum [10, 5, 0] = 65 40 15
Sum of short ints 13...7 and short ints 6...0:
t = 16 15 14 13 | 12 11 10 9 | 8 7 6 5 | 4 3 2 1
t0 = 8 0 14 13 | 12 11 10 9 | 0 7 6 5 | 4 3 2 1
t1 = 9 8 0 14 | 13 12 11 10 | 1 0 7 6 | 5 4 3 2
t01 = 17 8 14 27 | 25 23 21 19 | 1 7 13 11 | 9 7 5 3
t23 = 21 19 17 8 | 14 27 25 23 | 5 3 1 7 | 13 11 9 7
t0123 = 38 27 31 35 | 39 50 46 42 | 6 10 14 18 | 22 18 14 10
t4567 = 39 50 46 42 | 38 27 31 35 | 22 18 14 10 | 6 10 14 18
sum08 = 77 77 77 77 | 77 77 77 77 | 28 28 28 28 | 28 28 28 28
sum = 77 77 77 77 | 77 77 77 77 | 77 77 28 28 | 28 28 28 28
Sum with _mm256_hadd7x7_epi16(t) (the answer is in column 0 and in column 7)
sum = 16 31 45 58 | 70 81 91 84 | 77 70 63 56 | 49 42 35 28

Related

Obtain corresponding column based on another column that matches another dataframe

I want to find matching values from two data frames and return a third value.
For example, if cpg_symbol["Gene_Symbol"] corresponds with diff_meth_kirp_symbol.index, I want to assign cpg_symbol.loc["Composite_Element_REF"] as index.
My code returned an empty dataframe.
diff_meth_kirp.index = diff_meth_kirp.merge(cpg_symbol, left_on=diff_meth_kirp.index, right_on="Gene_Symbol")[["Composite_Element_REF"]]
Example:
diff_meth_kirp
Hello
My
name
is
First
0
1
2
3
Second
4
5
6
7
Third
8
9
10
11
Fourth
12
13
14
15
Fifth
16
17
18
19
Sixth
20
21
22
23
cpg_symbol
Composite_Element_REF
Gene_Symbol
cg1
First
cg2
Third
cg3
Fifth
cg4
Seventh
cg5
Ninth
cg6
First
Expected output:
Hello
My
name
is
cg1
0
1
2
3
cg2
8
9
10
11
cg3
16
17
18
19
cg6
0
1
2
3
Your code works well for me but you can try this version:
out = (diff_meth_kirp.merge(cpg_symbol.set_index('Gene_Symbol'),
left_index=True, right_index=True)
.set_index('Composite_Element_REF')
.rename_axis(None).sort_index())
print(out)
# Output
Hello My name is
cg1 0 1 2 3
cg2 8 9 10 11
cg3 16 17 18 19
cg6 0 1 2 3
Input dataframes:
data1 = {'Hello': {'First': 0, 'Second': 4, 'Third': 8, 'Fourth': 12, 'Fifth': 16, 'Sixth': 20},
'My': {'First': 1, 'Second': 5, 'Third': 9, 'Fourth': 13, 'Fifth': 17, 'Sixth': 21},
'name': {'First': 2, 'Second': 6, 'Third': 10, 'Fourth': 14, 'Fifth': 18, 'Sixth': 22},
'is': {'First': 3, 'Second': 7, 'Third': 11, 'Fourth': 15, 'Fifth': 19, 'Sixth': 23}}
diff_meth_kirp = pd.DataFrame(data1)
data2 = {'Composite_Element_REF': {0: 'cg1', 1: 'cg2', 2: 'cg3', 3: 'cg4', 4: 'cg5', 5: 'cg6'},
'Gene_Symbol': {0: 'First', 1: 'Third', 2: 'Fifth', 3: 'Seventh', 4: 'Ninth', 5: 'First'}}
cpg_symbol = pd.DataFrame(data2)

How do I get a time delta that is closest to 0 days?

I have the following dataframe:
gp_columns = {
'name': ['companyA', 'companyB'],
'firm_ID' : [1, 2],
'timestamp_one' : ['2016-04-01', '2017-09-01']
}
fund_columns = {
'firm_ID': [1, 1, 2, 2, 2],
'department_ID' : [10, 11, 20, 21, 22],
'timestamp_mult' : ['2015-01-01', '2016-03-01', '2016-10-01', '2017-02-01', '2018-11-01'],
'number' : [400, 500, 1000, 3000, 4000]
}
gp_df = pd.DataFrame(gp_columns)
fund_df = pd.DataFrame(fund_columns)
gp_df['timestamp_one'] = pd.to_datetime(gp_df['timestamp_one'])
fund_df['timestamp_mult'] = pd.to_datetime(fund_df['timestamp_mult'])
merged_df = gp_df.merge(fund_df)
merged_df
merged_df_v1 = merged_df.copy()
merged_df_v1['incidence_num'] = merged_df.groupby('firm_ID')['department_ID']\
.transform('cumcount')
merged_df_v1['incidence_num'] = merged_df_v1['incidence_num'] + 1
merged_df_v1['time_delta'] = merged_df_v1['timestamp_mult'] - merged_df_v1['timestamp_one']
merged_wide = pd.pivot(merged_df_v1, index = ['name','firm_ID', 'timestamp_one'], \
columns = 'incidence_num', \
values = ['department_ID', 'time_delta', 'timestamp_mult', 'number'])
merged_wide.reset_index()
that looks as follows:
My question is how I get a column that holds the minimum time delta for each row (that is, the one closest to 0). Note that the time delta can be negative or positive and the sign must be kept, so simply applying .abs() does not work for me here.
I want a dataframe with this particular output:
You can stack (which removes NaTs) and groupby.first after sorting the rows by absolute value (with the key parameter of sort_values):
df = merged_wide.reset_index()
df['time_delta_min'] = (df['time_delta'].stack()
.sort_values(key=abs)
.groupby(level=0).first()
)
output:
name firm_ID timestamp_one department_ID \
incidence_num 1 2 3
0 companyA 1 2016-04-01 10 11 NaN
1 companyB 2 2017-09-01 20 21 22
time_delta timestamp_mult \
incidence_num 1 2 3 1 2
0 -456 days -31 days NaT 2015-01-01 2016-03-01
1 -335 days -212 days 426 days 2016-10-01 2017-02-01
number time_delta_min
incidence_num 3 1 2 3
0 NaT 400 500 NaN -31 days
1 2018-11-01 1000 3000 4000 -212 days
Use lookup with indices of absolute values by DataFrame.idxmin:
idx, cols = pd.factorize(df['time_delta'].abs().idxmin(axis=1))
df['time_delta_min'] = (df['time_delta'].reindex(cols, axis=1).to_numpy()
[np.arange(len(df)), idx])
print (df)

SpecificationError: Function names must be unique if there is no new column names assigned

I want to create a new column in the clin dataframe based on the following conditions:
1 if vals>=2*365 or is NAN
otherwise 0
I then assign the new column name as SURV.
import numpy as np
vals = clin['days_to_death'].astype(np.float32)
# non-LTS is 0, LTS is 1
surv = [1 if ( v>=2*365 or np.isnan(v) ) else 0 for v in vals ]
clin['SURV'] = clin.apply(surv, axis=1)
Traceback:
SpecificationError: Function names must be unique if there is no new column names assigned
---------------------------------------------------------------------------
SpecificationError Traceback (most recent call last)
<ipython-input-31-603dee8413ce> in <module>
5 # non-LTS is 0, LTS is 1
6 surv = [1 if ( v>=2*365 or np.isnan(v) ) else 0 for v in vals ]
----> 7 clin['SURV'] = clin.apply(surv, axis=1)
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7766 kwds=kwds,
7767 )
-> 7768 return op.get_result()
7769
7770 def applymap(self, func, na_action: Optional[str] = None) -> DataFrame:
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/apply.py in get_result(self)
146 # multiple values for keyword argument "axis"
147 return self.obj.aggregate( # type: ignore[misc]
--> 148 self.f, axis=self.axis, *self.args, **self.kwds
149 )
150
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/frame.py in aggregate(self, func, axis, *args, **kwargs)
7572 axis = self._get_axis_number(axis)
7573
-> 7574 relabeling, func, columns, order = reconstruct_func(func, **kwargs)
7575
7576 result = None
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/aggregation.py in reconstruct_func(func, **kwargs)
93 # there is no reassigned name
94 raise SpecificationError(
---> 95 "Function names must be unique if there is no new column names "
96 "assigned"
97 )
SpecificationError: Function names must be unique if there is no new column names assigned
clin
clin = pd.DataFrame([[1, '466', '47', 0, '90'],
[1, '357', '54', 1, '80'],
[1, '108', '72', 1, '60'],
[1, '254', '51', 0, '80'],
[1, '138', '78', 1, '80'],
[0, nan, '67', 0, '60']], columns=['vital_status', 'days_to_death', 'age_at_initial_pathologic_diagnosis',
'gender', 'karnofsky_performance_score'], index=['TCGA-06-1806', 'TCGA-06-5408', 'TCGA-06-5410', 'TCGA-06-5411',
'TCGA-06-5412', 'TCGA-06-5413'])
Expected output:
vital_status
days_to_death
age_at_initial_pathologic_diagnosis
gender
karnofsky_performance_score
SURV
TCGA-06-1806
1
466
47
0
90
0
TCGA-06-5408
1
357
54
1
80
0
TCGA-06-5410
1
108
72
1
60
0
TCGA-06-5411
1
254
51
0
80
0
TCGA-06-5412
1
138
78
1
80
0
TCGA-06-5413
0
nan
67
0
60
1
Make a new column of all 0's and then update the column with your desired parameters.
clin['SURV'] = 0
clin.loc[pd.to_numeric(clin.days_to_death).ge(2*365) | clin.days_to_death.isna(), 'SURV'] = 1
print(clin)
Output:
vital_status days_to_death age_at_initial_pathologic_diagnosis gender karnofsky_performance_score SURV
TCGA-06-1806 1 466 47 0 90 0
TCGA-06-5408 1 357 54 1 80 0
TCGA-06-5410 1 108 72 1 60 0
TCGA-06-5411 1 254 51 0 80 0
TCGA-06-5412 1 138 78 1 80 0
TCGA-06-5413 0 NaN 67 0 60 1

check if the group id or element is found the column list in pandas data frame

data = {
'org_id' :[4,73,6,77,21,36,40,22,21,30,31],
'flag': [['4', '73'],['73'],['6', '77'],['77'],['21'],['36', '36'],['40'],['22', '41'],['21'],['22', '30'],['31', '31']],
'r_id' : [4,4,6,6,20,20,20,22,28,28,28]
}
df = pd.DataFrame.from_dict(data)
df
required data frame to be like below,
data = {
'org_id' :[4,73,6,77,21,36,40,22,21,30,31],
'flag': [['4', '73'],['73'],['6', '77'],['77'],['21'],['36', '36'],['40'],['22', '41'],['21'],['22', '30'],['31', '31']],
'r_id' : [4,4,6,6,20,20,20,22,28,28,28],
'is_foundin_org_id': ['yes','yes','yes','yes','NO','NO','NO','yes','NO','NO','NO']
}
df2 = pd.DataFrame.from_dict(data)
df2
output data frame
Out[115]:
org_id flag r_id is_foundin_org_id
0 4 [4, 73] 4 yes
1 73 [73] 4 yes
2 6 [6, 77] 6 yes
3 77 [77] 6 yes
4 21 [21] 20 NO
5 36 [36, 36] 20 NO
6 40 [40] 20 NO
7 22 [22, 41] 22 yes
8 21 [21] 28 NO
9 30 [22, 30] 28 NO
10 31 [31, 31] 28 NO
I need to identify, after grouping by r_id, whether the r_id value is present among the org_id values of the rows in its group. E.g. for group 4, the value 4 is found in one of the org_id rows, hence I mark yes for the whole group 4; similarly, 20 is not found in the org_id column of its group, hence I mark No for all rows of group 20. Thank you.
Try this
d = {True: 'Yes', False: 'No'}
df['is_foundin_org_id'] = (df.org_id.eq(df.r_id).groupby(df.r_id)
.transform('max').map(d))
Out[1549]:
org_id flag r_id is_foundin_org_id
0 4 [4, 73] 4 Yes
1 73 [73] 4 Yes
2 6 [6, 77] 6 Yes
3 77 [77] 6 Yes
4 21 [21] 20 No
5 36 [36, 36] 20 No
6 40 [40] 20 No
7 22 [22, 41] 22 Yes
8 21 [21] 28 No
9 30 [22, 30] 28 No
10 31 [31, 31] 28 No
IIUC,
df['is_found'] = np.where(df['org_id'].eq(df['r_id']) # check if the ids are equal
.groupby(df['r_id']) # group by r_id
.transform('any'), # if True occurs within the groups
'yes', 'no')
Output:
org_id flag r_id is_found
0 4 [4, 73] 4 yes
1 73 [73] 4 yes
2 6 [6, 77] 6 yes
3 77 [77] 6 yes
4 21 [21] 20 no
5 36 [36, 36] 20 no
6 40 [40] 20 no
7 22 [22, 41] 22 yes
8 21 [21] 28 no
9 30 [22, 30] 28 no
10 31 [31, 31] 28 no
Numpy and pandas.factorize
This may seem convoluted. But I'm using Numpy and keeping everything O(n)
Get arrays because I'll use them more than once
a = df.r_id.to_numpy()
b = df.org_id.to_numpy()
Factorizing something identifies each unique value with an integer starting with zero. pandas.factorize will return a tuple of (factorized_integer_representation, unique_values). What's great about the factorization is that I can use those integers as positions within the unique values array to reproduce the original array. Namely r[i] using the r and i below.
I could've also used numpy.unique with the argument return_inverse to get the same arrays, BUT pandas.factorize doesn't sort the unique values, and skipping that sort saves its O(n log(n)) cost. For larger data, pandas.factorize is the winner.
I'll create a holding array that will house the boolean values on whether our any condition is satisfied for each unique value. numpy.logical_or.at is the function we use to see if any values from a == b is True within the specified indices in i.
I'll demonstrate after the code below.
i, r = pd.factorize(a)
o = np.zeros(len(r), bool)
np.logical_or.at(o, i, a == b)
df.assign(is_found=np.where(o, 'Yes', 'No')[i])
org_id flag r_id is_found
0 4 [4, 73] 4 Yes
1 73 [73] 4 Yes
2 6 [6, 77] 6 Yes
3 77 [77] 6 Yes
4 21 [21] 20 No
5 36 [36, 36] 20 No
6 40 [40] 20 No
7 22 [22, 41] 22 Yes
8 21 [21] 28 No
9 30 [22, 30] 28 No
10 31 [31, 31] 28 No
Details
a == b
array([True, False, True, False, False, False, False, True, False, False, False])
r are the unique values
r
array([ 4, 6, 20, 22, 28])
i are the indices
i
array([0, 0, 1, 1, 2, 2, 2, 3, 4, 4, 4])
So r[i] reproduces a
r[i]
array([ 4, 4, 6, 6, 20, 20, 20, 22, 28, 28, 28])
Now we start with a base array o of all False, one for each unique value
array([False, False, False, False, False])
And for each position in i, we check if the corresponding value in a == b is True.
# i, a == b -> 0, True <4 == 4>
# 0, False <4 != 73>
# ↓ 1, True <6 == 6>
# ↓ 1, False <6 != 77>
# ↓ ↓ 2, False <20 != 21>
# ↓ ↓ 2, False <20 != 36>
# ↓ ↓ 2, False <20 != 40>
# ↓ ↓ ↓ 3, True <22 == 22>
# ↓ ↓ ↓ ↓ 4, False <28 != 21>
# ↓ ↓ ↓ ↓ 4, False <28 != 30>
# ↓ ↓ ↓ ↓ 4, False <28 != 31>
# At least 1 True ↓ ↓ ↓ ↓ ↓
# o -> [ True, True, False, True, False]
Swap Yes|No instead of True|False
# o -> [ True, True, False, True, False]
# np.where(o, 'Yes', 'No') -> [ 'Yes', 'Yes', 'No', 'Yes', 'No']
And slice it with i to produce an array of the same length as original with the appropriate value for each corresponding value in the unique values array.
np.where(o, 'Yes', 'No')[i]
['Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No']

Sorting pandas dataframe by groups

I would like to sort a dataframe by certain priority rules.
I've achieved this in the code below but I think this is a very hacky solution.
Is there a more proper Pandas way of doing this?
import pandas as pd
import numpy as np
df=pd.DataFrame({"Primary Metric":[80,100,90,100,80,100,80,90,90,100,90,90,80,90,90,80,80,80,90,90,100,80,80,100,80],
"Secondary Metric Flag":[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0],
"Secondary Value":[15, 59, 70, 56, 73, 88, 83, 64, 12, 90, 64, 18, 100, 79, 7, 71, 83, 3, 26, 73, 44, 46, 99,24, 20],
"Final Metric":[222, 883, 830, 907, 589, 93, 479, 498, 636, 761, 851, 349, 25, 405, 132, 491, 253, 318, 183, 635, 419, 885, 305, 258, 924]})
Primary_List=list(np.unique(df['Primary Metric']))
Primary_List.sort(reverse=True)
df_sorted=pd.DataFrame()
for p in Primary_List:
lol=df[df["Primary Metric"]==p]
lol.sort_values(["Secondary Metric Flag"],ascending = False)
pt1=lol[lol["Secondary Metric Flag"]==1].sort_values(by=['Secondary Value', 'Final Metric'], ascending=[False, False])
pt0=lol[lol["Secondary Metric Flag"]==0].sort_values(["Final Metric"],ascending = False)
df_sorted=df_sorted.append(pt1)
df_sorted=df_sorted.append(pt0)
df_sorted
The priority rules are:
First sort by the 'Primary Metric', then by the 'Secondary Metric
Flag'.
If the 'Secondary Metric Flag' ==1, sort by 'Secondary Value' then
the 'Final Metric'
If ==0, go right for the 'Final Metric'.
Appreciate any feedback.
You do not need a for loop or groupby here; just split the rows on the flag and use sort_values:
# Split the rows on the flag and order each part by its own rule
# (flag == 1: Secondary Value, then Final Metric; flag == 0: Final Metric only),
# always with the highest Primary Metric first.
df1 = df.loc[df['Secondary Metric Flag'] == 1].sort_values(
    by=['Primary Metric', 'Secondary Value', 'Final Metric'],
    ascending=[False, False, False])
df0 = df.loc[df['Secondary Metric Flag'] == 0].sort_values(
    by=['Primary Metric', 'Final Metric'], ascending=[False, False])
# Interleave the two parts by Primary Metric. The sort has to be stable
# (mergesort) so that, inside each Primary Metric group, the flagged rows stay
# ahead of the unflagged ones and keep the order established above; the default
# quicksort gives no such guarantee.
df = pd.concat([df1, df0]).sort_values('Primary Metric', ascending=False, kind='mergesort')
sorted with loc
def k(t):
    """Sort key for the row of the module-level ``df`` labelled ``t``.

    The row is unpacked as (Primary Metric, Secondary Metric Flag,
    Secondary Value, Final Metric).  Every field is negated so that an
    ascending sort puts the largest values first; Secondary Value is
    multiplied by the flag, so it only matters for rows whose flag is 1.
    """
    p, s, v, f = df.loc[t]
    return (-p, -s, -s * v, -f)
df.loc[sorted(df.index, key=k)]
Primary Metric Secondary Metric Flag Secondary Value Final Metric
9 100 1 90 761
5 100 1 88 93
1 100 1 59 883
3 100 1 56 907
23 100 1 24 258
20 100 0 44 419
13 90 1 79 405
19 90 1 73 635
7 90 1 64 498
11 90 1 18 349
10 90 0 64 851
2 90 0 70 830
8 90 0 12 636
18 90 0 26 183
14 90 0 7 132
15 80 1 71 491
21 80 1 46 885
17 80 1 3 318
24 80 0 20 924
4 80 0 73 589
6 80 0 83 479
22 80 0 99 305
16 80 0 83 253
0 80 0 15 222
12 80 0 100 25
sorted with itertuples
def k(t):
    """Sort key for one row tuple ``(index, p, s, v, f)``.

    The index is ignored.  The remaining fields (Primary Metric, Secondary
    Metric Flag, Secondary Value, Final Metric) are negated so that an
    ascending sort puts the largest values first; Secondary Value is
    multiplied by the flag, so it only matters for rows whose flag is 1.
    """
    _, p, s, v, f = t
    return (-p, -s, -s * v, -f)
idx, *tups = zip(*sorted(df.itertuples(), key=k))
pd.DataFrame(dict(zip(df, tups)), idx)
lexsort
p = df['Primary Metric']
s = df['Secondary Metric Flag']
v = df['Secondary Value']
f = df['Final Metric']
a = np.lexsort([
-p, -s, -s * v, -f
][::-1])
df.iloc[a]
Construct New DataFrame
df.mul([-1, -1, 1, -1]).assign(
**{'Secondary Value': lambda d: d['Secondary Metric Flag'] * d['Secondary Value']}
).pipe(
lambda d: df.loc[d.sort_values([*d]).index]
)