How do I select rows from a pandas df without returning False values? - pandas

I have a df and I need to select rows based on some conditions in multiple columns.
Here is what I have
import pandas as pd
dat = [('p','q', 5), ('k','j', 2), ('p','-', 5), ('-','p', 4), ('q','pkjq', 3), ('pkjq','q', 2)]
df = pd.DataFrame(dat, columns = ['a', 'b', 'c'])
df_dat = df[(df[['a','b']].isin(['k','p','q','j']) & df['c'] > 3)] | df[(~df[['a','b']].isin(['k','p','q','j']) & df['c'] > 2 )]
Expected result = [('p','q', 5), ('p','-', 5), ('-','p', 4), ('q','pkjq', 3)]
The result I am getting is an all-False dataframe.

When you have a complicated condition, I recommend building the condition outside the slice:
cond1 = df[['a','b']].isin(['k','p','q','j']).any(axis=1) & df['c'].gt(3)
cond2 = (~df[['a','b']].isin(['k','p','q','j'])).any(axis=1) & df['c'].gt(2)
out = df.loc[cond1 | cond2]
Out[305]:
a b c
0 p q 5
2 p - 5
3 - p 4
4 q pkjq 3
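Why the original attempt comes back all False (an explanatory note, not part of the answer above): in Python, & binds more tightly than comparison operators, so df[['a','b']].isin([...]) & df['c'] > 3 is evaluated as (frame & df['c']) > 3, and the two-column result of isin() is never reduced to one boolean per row. Wrapping each comparison in parentheses and reducing with .any(axis=1), as above, fixes both issues. A minimal check of the first condition alone, using the same frame:
mask1 = df[['a','b']].isin(['k','p','q','j']).any(axis=1) & (df['c'] > 3)
print(mask1.tolist())  # should give [True, False, True, True, False, False]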

Related

How to subset a dataframe without assigning it to a variable?

I would like to subset a dataframe without assigning it first to a variable.
Example with assigning:
df = pd.DataFrame({'A': range(10), 'B': range(5, 15)})
df[(df['A'] > 3) & (df['B'] < 12)]
Result:
A B
4 4 9
5 5 10
6 6 11
How to do this without creating df first?
Something like...
pd.DataFrame({'A': range(10), 'B': range(5, 15)}).loc[..., ...]
Or maybe using .pipe()?
Use selection by callable:
df = (pd.DataFrame({'A': range(10), 'B': range(5, 15)})
.loc[lambda x: (x['A'] > 3) & (x['B'] < 12)])
print (df)
A B
4 4 9
5 5 10
6 6 11
Another idea with query, thank you @sammywemmy:
df = pd.DataFrame({'A': range(10), 'B': range(5, 15)}).query("A > 3 and B < 12")
# works the same
df = pd.DataFrame({'A': range(10), 'B': range(5, 15)}).query("A > 3 & B < 12")
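Since the question also asks about .pipe(), an equivalent sketch (not from the answer above) passes the freshly built frame to a callable that returns the subset:
df = (pd.DataFrame({'A': range(10), 'B': range(5, 15)})
        .pipe(lambda x: x[(x['A'] > 3) & (x['B'] < 12)]))
print (df)
This prints the same three rows as the .loc and query versions.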

How do I apply multiple filter criteria based on conditions to copy values from other columns into new columns in a pandas dataframe

Suppose I have columns A, B, C, D, E, F.
i.e. if col A == '', make new col G = col C, new col H = col D, new col I = col E;
if col A != '' & col B == 'some-value', make col G = 0, col H = 0, col I = 0.
I tried using np.where but it supports only two conditions; any ideas?
def change(dfr):
    if (dfr['A'] == ''):
        dfr['G'] = dfr['A']
        dfr['H'] = dfr['B']
        dfr['I'] = dfr['C']
    if ((dfr['A'] != '') & (dfr['B'] == 'some-value')):
        dfr['G'] = dfr['A']
        dfr['H'] = dfr['B']
        dfr['I'] = dfr['C']
    if ((dfr['A'] != '') & (dfr['B'] == 'value')):
        dfr['G'] = 0
        dfr['H'] = 0
        dfr['I'] = 0
I'm not sure you need the if statements. You can use .loc to accomplish this. Here is a toy dataframe:
data = pd.DataFrame({"A": ['a', '', 'f', '4', '', 'z'],
                     "B": ['f', 'y', 't', 'u', 'o', '1'],
                     "C": ['a', 'b', 'c', 'd', 'e', 'f'],
                     "G": [1, 1, 1, 1, 1, 1],
                     "H": [6, 6, 6, 6, 6, 6],
                     "I": ['q', 'q', 'q', 'q', 'q', 'q']})
data
   A  B  C  G  H  I
0  a  f  a  1  6  q
1     y  b  1  6  q
2  f  t  c  1  6  q
3  4  u  d  1  6  q
4     o  e  1  6  q
5  z  1  f  1  6  q
It probably makes sense to build in a couple of arguments for the values you'd like to check for in columns B:
def change(dfr, b_firstvalue, b_secondvalue):
    new_df = dfr.copy()
    new_df.loc[new_df['A'] == '', 'G'] = new_df['A']
    new_df.loc[new_df['A'] == '', 'H'] = new_df['B']
    new_df.loc[new_df['A'] == '', 'I'] = new_df['C']
    new_df.loc[((new_df['A'] != '') & (new_df['B'] == b_firstvalue)), 'G'] = new_df['A']
    new_df.loc[((new_df['A'] != '') & (new_df['B'] == b_firstvalue)), 'H'] = new_df['B']
    new_df.loc[((new_df['A'] != '') & (new_df['B'] == b_firstvalue)), 'I'] = new_df['C']
    new_df.loc[((new_df['A'] != '') & (new_df['B'] == b_secondvalue)), 'G'] = 0
    new_df.loc[((new_df['A'] != '') & (new_df['B'] == b_secondvalue)), 'H'] = 0
    new_df.loc[((new_df['A'] != '') & (new_df['B'] == b_secondvalue)), 'I'] = 0
    return new_df
data2 = change(data, '1', 'f')
data2
   A  B  C  G  H  I
0  a  f  a  0  0  0
1     y  b     y  b
2  f  t  c  1  6  q
3  4  u  d  1  6  q
4     o  e     o  e
5  z  1  f  z  1  f
Obviously, the function will depend on exactly how many columns you want to deal with. This was just a solution for the example problem. If you have many more columns you'd like to replace values with, there may be more efficient ways of handling that.
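Since the question notes that np.where handles only a single condition, here is a sketch with np.select, which accepts any number of condition/choice pairs; it assumes the same toy data frame and the example values '1' and 'f' used in the call above:
import numpy as np
conditions = [data['A'] == '',
              (data['A'] != '') & (data['B'] == '1'),
              (data['A'] != '') & (data['B'] == 'f')]
# keep the existing column value when no condition matches
data['G'] = np.select(conditions, [data['A'], data['A'], 0], default=data['G'])
data['H'] = np.select(conditions, [data['B'], data['B'], 0], default=data['H'])
data['I'] = np.select(conditions, [data['C'], data['C'], 0], default=data['I'])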

Getting the count of rows from breakpoints of a different column

Consider two columns A and B in a dataframe. How can I decile column A and use those decile breakpoints of column A to calculate the count of rows of column B in each bin?
import pandas as pd
import numpy as np
df=pd.read_excel(r"E:\Sai\Development\UCG\qcut.xlsx")
df['Range']=pd.qcut(df['a'],10)
df_gb=df.groupby('Range',as_index=False).agg({'a':[min,max,np.size]})
df_gb.columns = df_gb.columns.droplevel()
df_gb=df_gb.rename(columns={'':'Range','size':'count_A'})
df['Range_B']=0
df['Range_B'].loc[df['b']<=df_gb['max'][0]]=1
df['Range_B'].loc[(df['b']>df_gb['max'][0]) & (df['b']<=df_gb['max'][1])]=2
df['Range_B'].loc[(df['b']>df_gb['max'][1]) & (df['b']<=df_gb['max'][2])]=3
df['Range_B'].loc[(df['b']>df_gb['max'][2]) & (df['b']<=df_gb['max'][3])]=4
df['Range_B'].loc[(df['b']>df_gb['max'][3]) & (df['b']<=df_gb['max'][4])]=5
df['Range_B'].loc[(df['b']>df_gb['max'][4]) & (df['b']<=df_gb['max'][5])]=6
df['Range_B'].loc[(df['b']>df_gb['max'][5]) & (df['b']<=df_gb['max'][6])]=7
df['Range_B'].loc[(df['b']>df_gb['max'][6]) & (df['b']<=df_gb['max'][7])]=8
df['Range_B'].loc[(df['b']>df_gb['max'][7]) & (df['b']<=df_gb['max'][8])]=9
df['Range_B'].loc[df['b']>df_gb['max'][8]]=10
df_gb_b=df.groupby('Range_B',as_index=False).agg({'b':np.size})
df_gb_b=df_gb_b.rename(columns={'b':'count_B'})
df_final = pd.concat([df_gb, df_gb_b], axis=1)
df_final=df_final[['Range','count_A','count_B']]
Is there any simpler solution, as I intend to do this for many columns?
I hope this helps:
df['Range'] = pd.qcut(df['a'], 10)
df2 = df.groupby(['Range'])['a'].count().reset_index().rename(columns = {'a':'count_A'})
for item in df2['Range'].values:
    df2.loc[df2['Range'] == item, 'count_B'] = df['b'].apply(lambda x: x in item).sum()
df2 = df2.sort_values('Range', ascending = True)
If you additionally want to count values of b that fall outside the range of a:
min_border = df2['Range'].values[0].left
max_border = df2['Range'].values[-1].right
df2.loc[0, 'count_B'] += df.loc[df['b'] <= min_border, 'b'].count()
df2.iloc[-1, 2] += df.loc[df['b'] > max_border, 'b'].count()
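A shorter route to the same counts (a sketch, assuming df has the numeric columns 'a' and 'b' from the question) is to take the decile edges of column a once with retbins=True and reuse them to bin column b with pd.cut:
_, edges = pd.qcut(df['a'], 10, retbins=True)
counts = pd.DataFrame({
    'count_A': pd.cut(df['a'], edges, include_lowest=True).value_counts(sort=False),
    'count_B': pd.cut(df['b'], edges, include_lowest=True).value_counts(sort=False)})
Values of b that fall outside the range of a end up in no bin and are not counted here.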
One way -
df = pd.DataFrame({'A': np.random.randint(0, 100, 20), 'B': np.random.randint(0, 10, 20)})
bins = [0, 1, 4, 8, 16, 32, 60, 100, 200, 500, 5999]
labels = ["{0} - {1}".format(i, j) for i, j in zip(bins, bins[1:])]
df['group_A'] = pd.cut(df['A'], bins, right=False, labels=labels)
df['group_B'] = pd.cut(df.B, bins, right=False, labels=labels)
df1 = df.groupby(['group_A'])['A'].count().reset_index()
df2 = df.groupby(['group_B'])['B'].count().reset_index()
df_final = pd.merge(df1, df2, left_on =['group_A'], right_on =['group_B']).drop(['group_B'], axis=1).rename(columns={'group_A': 'group'})
print(df_final)
Output
group A B
0 0 - 1 0 1
1 1 - 4 1 3
2 4 - 8 1 9
3 8 - 16 2 7
4 16 - 32 3 0
5 32 - 60 7 0
6 60 - 100 6 0
7 100 - 200 0 0
8 200 - 500 0 0
9 500 - 5999 0 0

Pandas: how to find the position of a cell that contains a sub-string

Example:
Price  | Rate p/lot | Total Comm
947.2  | 1.25       | BAM 1.25
129.3  | 2.1        | $ 1.25
161.69 | 0.8 $      | CAD 2.00
If I search for ['$','CAD']:
Expected output:
[(1, 2), (2, 1),(2,2)]
Sorry, I found a solution like this; it may help someone:
import pandas as pd
df = pd.DataFrame([[947.2, 1.25, 'BAM 1.25'],
                   [129.3, 2.1, '$ 1.25'],
                   [161.69, '0.8 $', 'CAD 2.00']],
                  columns=['Price', 'Rate p/lot', 'Total Comm'])
row, column = (df.applymap(lambda x: x if any(s in str(x) for s in ['$', 'CAD']) else None)).values.nonzero()
t = list(zip(row, column))
You can use in with applymap:
i, j = (df.applymap(lambda x: '$' in str(x))).values.nonzero()
t = list(zip(i, j))
print (t)
[(1, 2), (2, 1)]
L = ['$', 'CAD']
i, j = (df.applymap(lambda x: any(y for y in L if y in str(x)))).values.nonzero()
# another solution
# i, j = (df.applymap(lambda x: any(s in str(x) for s in L))).values.nonzero()
t = list(zip(i, j))
print (t)
[(1, 2), (2, 1), (2, 2)]
Use str.contains:
from functools import reduce
from itertools import product

df = df.astype(str)
# note: product([i], rows) yields (column, row) pairs; for this data they happen to match the expected positions
result = reduce(lambda x, y: x + y,
                [list(product([i], list(df.iloc[:, i][df.iloc[:, i].str.contains(r'\$|CAD')].index)))
                 for i in range(len(df.columns))])
Output
[(1, 2), (2, 1), (2, 2)]
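An equivalent position lookup (a sketch, not one of the answers above) builds a boolean mask column by column with str.contains and reads the (row, column) pairs off with np.argwhere:
import numpy as np
mask = df.astype(str).apply(lambda col: col.str.contains(r'\$|CAD'))
t = [tuple(p) for p in np.argwhere(mask.values).tolist()]
print (t)  # should give [(1, 2), (2, 1), (2, 2)] for the example frame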

Fast way to combine multiple columns of type float into one column of type array(float)

I have a dataset like this:
df = pd.DataFrame({
    "333-0": [123, 123, 123],
    "5985-0.0": [1, 2, 3],
    "5985-0.1": [1, 2, 3],
    "5985-0.2": [1, 2, 3]
}, index=[0, 1, 2])
Here, we have three columns ["5985-0.0", "5985-0.1", "5985-0.2"] that represent the first, second and third float readings of thing 5985-0 -- i.e. .x represents an array index.
I'd like to take multiple columns and collapse them into a single column 5985-0 containing some kind of list of float, which I can do like this:
srccols = ["5985-0.0", "5985-0.1", "5985-0.2"]
df["5985-0"] = df[srccols].apply(tuple, axis=1)
df.drop(srccols, axis=1)
333-0 5985-0
0 123 (1, 1, 1)
1 123 (2, 2, 2)
2 123 (3, 3, 3)
which I can then store as an SQL table with an array column.
However, apply(tuple) is very slow. Is there a faster, more idiomatic pandas way to combine multiple columns into one?
(First person to say "normalized" gets a downvote).
My Choice
Assuming I know the columns
thing = '5985-0'
cols = ['5985-0.0', '5985-0.1', '5985-0.2']
k = len(cols)
v = df.values
l = [v[:, df.columns.get_loc(c)].tolist() for c in cols]
s = pd.Series(list(zip(*l)), name=thing)
df.drop(cols, 1).join(s)
333-0 5985-0
0 123 (1, 1, 1)
1 123 (2, 2, 2)
2 123 (3, 3, 3)
Base Case
Using filter, join, and apply(tuple, 1)
thing = '5985-0'
d = df.filter(like=thing)
s = d.apply(tuple, 1).rename(thing)
cols = d.columns
df.drop(cols, 1).join(s)
333-0 5985-0
0 123 (1, 1, 1)
1 123 (2, 2, 2)
2 123 (3, 3, 3)
Option 2
Using filter, join, pd.Series
thing = '5985-0'
d = df.filter(like=thing)
s = pd.Series(d.values.tolist(), name=thing)
cols = d.columns
df.drop(cols, 1).join(s)
333-0 5985-0
0 123 [1, 1, 1]
1 123 [2, 2, 2]
2 123 [3, 3, 3]
Option 3
Using filter, join, pd.Series, and zip
thing = '5985-0'
d = df.filter(like=thing)
s = pd.Series(list(zip(*d.values.T)), name=thing)
cols = d.columns
print(df.drop(cols, 1).join(s))
333-0 5985-0
0 123 (1, 1, 1)
1 123 (2, 2, 2)
2 123 (3, 3, 3)
Timing
Large Data Set
df = pd.concat([df] * 10000, ignore_index=True)
%%timeit
thing = '5985-0'
d = df.filter(like=thing)
s = d.apply(tuple, 1).rename(thing)
cols = d.columns
df.drop(cols, 1).join(s)
1 loop, best of 3: 350 ms per loop
%%timeit
thing = '5985-0'
cols = ['5985-0.0', '5985-0.1', '5985-0.2']
k = len(cols)
v = df.values
l = [v[:, df.columns.get_loc(c)].tolist() for c in cols]
s = pd.Series(list(zip(*l)), name=thing)
df.drop(cols, 1).join(s)
100 loops, best of 3: 4.06 ms per loop
%%timeit
thing = '5985-0'
d = df.filter(like=thing)
s = pd.Series(d.values.tolist(), name=thing)
cols = d.columns
df.drop(cols, 1).join(s)
100 loops, best of 3: 4.56 ms per loop
%%timeit
thing = '5985-0'
d = df.filter(like=thing)
s = pd.Series(list(zip(*d.values.T)), name=thing)
cols = d.columns
df.drop(cols, 1).join(s)
100 loops, best of 3: 6.89 ms per loop
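On newer pandas versions the positional axis argument used in drop(cols, 1) above is no longer accepted; a minimal modernized sketch of Option 2 (same frame, same column prefix) would be:
thing = '5985-0'
d = df.filter(like=thing)
s = pd.Series(d.to_numpy().tolist(), index=df.index, name=thing)
out = df.drop(columns=d.columns).join(s)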