Pandas matching column isin another (list) column (broadcasting `.isin`) - pandas

In a workflow matching up a spec against some allowed values, I wish to find which rows (index) are matching a spec.
This is different from "Pandas, isin, column of lists", as I am not matching each row against a (static) list.
I can do it with .explode and .groupby or with .apply, but the first seems very complicated and the second has performance issues. There must be a better way, but which? I can't make .isin broadcast, which otherwise would seem like the best solution.
(Example code also at https://colab.research.google.com/drive/1d8v6n99NPBaSufOsaWe3eRRrMgoSG_rr?usp=sharing)
import pandas as pd
import numpy as np
df = pd.DataFrame(data = {
'name': ['a', 'b', 'c'],
'lst': [[0,2,4], [1,2], []],
'spec': [2,4,0]
})
expect = pd.DataFrame(data= {
'name': ['a', 'b', 'c'],
'match_spec': [True, False, False]
})
def check(f):
    """Run candidate ``f`` and print whether it reproduces the expected mask.

    ``f`` is called with no arguments. Its return value is compared
    element-wise against the module-level ``expect['match_spec']`` column.
    One of three lines is printed:

    * ``OK <f>``    -- every element compared equal,
    * ``FAIL <f>``  -- at least one element differed (the result is printed),
    * ``ERROR <f>`` -- calling ``f`` or comparing its result raised.

    Nothing is returned.
    """
    # The broad ``except`` is deliberate: this is a demo harness whose job is
    # to show *how* each candidate fails rather than to stop at the first one.
    try:
        got = f()
        result = (expect['match_spec'] == got)
        ok = result.all()
        if ok:
            print(f'OK {f}')
        else:
            print(f'FAIL {f}\n{got}')
    except Exception as ex:
        print(f'ERROR {f}\n{ex}')
def naive_broadcast(): return df.spec in df.lst
check(naive_broadcast)
def result_apply(): return df.apply(lambda x: x.spec in x.lst, axis=1)
check(result_apply)
def naive_isin(): return df.spec.isin(df.lst)
check(naive_isin)
def vectorization(): np.vectorize(df.spec.isin)(df.lst.values)
check(vectorization)
# Another ugly way, exploding and grouping
def explode_groupby():
    """Match ``spec`` against ``lst`` per row via explode + groupby.

    Each element of ``df['lst']`` gets its own row, is compared with that
    row's ``spec``, and the comparisons are folded back to one boolean per
    ``name`` with a group-wise maximum (i.e. "any element matched").

    Returns the ``match_spec`` column of the aggregated frame, with a fresh
    0..n-1 index from ``reset_index``.
    """
    exp = df.explode('lst')
    # sort=False keeps groups in first-appearance (row) order instead of
    # re-ordering them alphabetically by name; the string 'max' replaces the
    # builtin ``max`` callable, whose use in ``agg`` is deprecated in pandas.
    return (exp.assign(m=exp['lst'].eq(exp['spec']))
            .groupby('name', sort=False)
            .agg(match_spec=('m', 'max'))
            .reset_index()['match_spec'])
check(explode_groupby)
The above code produces:
ERROR <function naive_broadcast at 0x7fe0a4f6c9d0>
unhashable type: 'Series'
OK <function result_apply at 0x7fe0a4f6ca60>
FAIL <function naive_isin at 0x7fe0a4f6c310>
0 False
1 False
2 False
Name: spec, dtype: bool
ERROR <function vectorization at 0x7fe0a4f6c940>
setting an array element with a sequence.
OK <function explode_groupby at 0x7fe0a4f6ce50>

If you are concerned about performance here's something new
pd.DataFrame([*df['lst']]).eq(df['spec'], axis=0).any(axis=1)
Result
0 True
1 False
2 False
dtype: bool

After some experiments, I found a way to do vectorization "right"™ :)
def vectorize_in():
    """Return a boolean array: is ``df['spec']`` contained in ``df['lst']``, row by row.

    ``np.vectorize`` broadcasts a plain Python ``in`` test over the two
    columns, pairing each ``spec`` value with the container on the same row.
    """
    def isin(spec, lst):
        # ``lst`` rather than ``list`` so the builtin is not shadowed.
        return spec in lst

    # An explicit output type lets the call succeed on an empty frame;
    # without ``otypes`` np.vectorize must probe the first element and
    # raises on size-0 inputs.
    vec_in = np.vectorize(isin, otypes=[bool])
    return vec_in(df['spec'], df['lst'])
check(vectorize_in)
OK <function vectorize_in at 0x7fe2de337550>
This approach scales very well, both in len(df), max(len(df.lst)) and len(np.unique(np.concatenate(df['lst']))).
As seen by:
import numpy as np
import pandas as pd
rowcount = 10000
df = pd.DataFrame(data = {"spec": np.arange(0,rowcount)})
rand = np.random.default_rng(seed=42)
lst_mean = len(df)/2
lst_ln = 100
def mklist(x):
    """Build one random integer range for the benchmark's ``lst`` column.

    ``x`` (the row handed over by ``df.apply``) is not used. Two draws are
    taken from the module-level generator ``rand``: the first, scaled by
    ``lst_mean``, gives the anchor ``start``; the second, scaled by
    ``lst_ln``, gives ``length``.

    Returns ``np.arange(start - length // 2, start + length)``.
    """
    # Draw order matters for reproducibility with the seeded generator:
    # start first, then length.
    start = int(np.abs(rand.normal() * lst_mean))
    length = int(np.abs(rand.normal() * lst_ln))
    return np.arange(start - int(length / 2), start + length)
df['lst']=df.apply(mklist, axis=1)
def explode_eq():
    """Return a boolean Series: does any element of ``df['lst']`` equal ``df['spec']``?

    The lists are spread into a rectangular frame (one column per list
    position), compared row-wise against ``spec``, and reduced with ``any``.
    """
    # The original computed this expression but discarded it (no ``return``).
    return pd.DataFrame([*df['lst']]).eq(df['spec'], axis=0).any(axis=1)
%timeit result_apply()
234 ms ± 4.08 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit explode_eq()
1.16 s ± 222 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit vectorize_in()
32.4 ms ± 743 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

Related

How to write Pandas DataFrame filter condition from parameters in a function

Let's say I have a pandas DataFrame like this:
NO_DOC | NO_SEQ | DESC
A1 | 1 | A
B1 | 2 | B
C1 | 3 | C
I want to search row index from data above on a certain filter:
I am able to find it using statement:
idx = data.index[data["NO_DOC"] == "A1" && data["NO_SEQ"] == 1]
but what I want to ask is how to make the "condition" variable, because I want to build it inside a function.
Example:
def find(self, data, columns, value):
return data.index[data[...] == value[...] && ....].tolist()
columns parameters is ["NO_DOC","NO_SEQ"]
value is ["A1", 1]
the filter condition above is: data["NO_DOC"] == "A1" && data["NO_SEQ"] == 1
How do I make a function to find row index on pandas DataFrame with a certain filter condition?
We can use zip with a list comprehension to build a DataFrame from each Series of boolean values:
mask_df = pd.DataFrame([
data[col] == val for col, val in zip(columns, values)
])
0 1 2
NO_DOC True False False
NO_SEQ True False False
Then we can reduce this DataFrame of booleans with all to check which columns (indexes) have only True values (logical AND):
mask = mask_df.all()
0 True
1 False
2 False
dtype: bool
Note: logical OR can be achieved with any instead of all
Now we can use the mask to filter the indexes:
data.index[mask].tolist()
[0]
Together find can look something like:
def find(data, columns, values):
    """Return the index labels of rows matching every column/value pair.

    Parameters
    ----------
    data : pandas.DataFrame
    columns : sequence of column labels
    values : sequence of values, paired positionally with ``columns``

    Returns
    -------
    list
        Labels from ``data.index`` where ``data[col] == val`` holds for all
        pairs. With no pairs at all, every label is returned (an empty AND
        is true).
    """
    conditions = [data[col] == val for col, val in zip(columns, values)]
    if not conditions:
        # Nothing to filter on; an empty mask would not line up with the index.
        return data.index.tolist()
    # Create a DataFrame of boolean conditions (one row per condition)
    # Take logical AND using DataFrame.all
    mask = pd.DataFrame(conditions).all()
    # Filter and convert to list
    return data.index[mask].tolist()
Or with np.logical_and and .reduce, which follows the same logic as the DataFrame approach but can be much faster, as it does not need to maintain pandas objects:
def find(data, columns, values):
    """Return the index labels of rows matching every column/value pair.

    The per-column comparisons are AND-ed together with
    ``np.logical_and.reduce``. ``columns`` and ``values`` are paired
    positionally. With no pairs at all, every label of ``data.index`` is
    returned (an empty AND is true).
    """
    conditions = [data[col] == val for col, val in zip(columns, values)]
    if not conditions:
        # Reducing an empty list yields a scalar, which cannot index the rows.
        return data.index.tolist()
    mask = np.logical_and.reduce(conditions)
    return data.index[mask].tolist()
Some timing via timeit.
Setup:
import numpy as np
import pandas as pd
def find_with_df_mask(data, columns, values):
    """Benchmark variant: AND the conditions through a boolean DataFrame.

    One boolean Series per ``(column, value)`` pair becomes a row of a
    DataFrame; ``DataFrame.all`` then reduces over the conditions. Returns
    the matching labels of ``data.index`` as a list. With no pairs, every
    label is returned.
    """
    conditions = [data[col] == val for col, val in zip(columns, values)]
    if not conditions:
        # An empty mask would not line up with the index.
        return data.index.tolist()
    mask = pd.DataFrame(conditions).all()
    return data.index[mask].tolist()
def find_with_np_mask(data, columns, values):
    """Benchmark variant: AND the conditions with ``np.logical_and.reduce``.

    Returns the labels of ``data.index`` where ``data[col] == val`` holds for
    every positional ``(column, value)`` pair. With no pairs, every label is
    returned.
    """
    conditions = [data[col] == val for col, val in zip(columns, values)]
    if not conditions:
        # Reducing an empty list yields a scalar, which cannot index the rows.
        return data.index.tolist()
    mask = np.logical_and.reduce(conditions)
    return data.index[mask].tolist()
df = pd.concat([pd.DataFrame({
'NO_DOC': ['A1', 'B1', 'C1'],
'NO_SEQ': [1, 2, 3],
'DESC': ['A', 'B', 'C']
})] * 1000, ignore_index=True)
cols = ["NO_DOC", "NO_SEQ"]
vals = ["A1", 1]
Timings:
%timeit find_with_df_mask(df, cols, vals)
21.9 ms ± 65.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit find_with_np_mask(df, cols, vals)
319 µs ± 253 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
To write Pandas dataframe filter condition from parameters in a function:
import pandas as pd
raw_data = {'NO_DOC':['A1','B1','C1'],'NO_SEQ':[1,2,3],'DESC':['A','B','C']}
data = pd.DataFrame(raw_data)
columns = ["NO_DOC", "NO_SEQ"]
value = ["A1", 1]
Create masks and send them to functions instead of columns and values:
mask1 = data[columns[0]] == value[0]
mask2 = data[columns[1]] == value[1]
def find(data, mask1, mask2):
    """Return the labels of ``data.index`` where both boolean masks are true.

    ``mask1`` and ``mask2`` are combined element-wise with ``&`` and the
    result is used to select from ``data.index``.
    """
    return data.index[mask1 & mask2].tolist()
Call your function as follows:
find(data, mask1, mask2)

Add to items, with multiple occurrences [duplicate]

I have unsorted array of indexes:
i = np.array([1,5,2,6,4,3,6,7,4,3,2])
I also have an array of values of the same length:
v = np.array([2,5,2,3,4,1,2,1,6,4,2])
I have array with zeros of desired values:
d = np.zeros(10)
Now I want to add the values of v to the elements of d, based on their indexes in i.
If I do it in plain python I would do it like this:
for index,value in enumerate(v):
idx = i[index]
d[idx] += v[index]
It is ugly and inefficient. How can I change it?
np.add.at(d, i, v)
You'd think d[i] += v would work, but if you try to do multiple additions to the same cell that way, one of them overrides the others. The ufunc.at method avoids those problems.
We can use np.bincount which is supposedly pretty efficient for such accumulative weighted counting, so here's one with that -
counts = np.bincount(i,v)
d[:counts.size] = counts
Alternatively, using minlength input argument and for a generic case when d could be any array and we want to add into it -
d += np.bincount(i,v,minlength=d.size).astype(d.dtype, copy=False)
Runtime tests
This section compares np.add.at based approach listed in the other post with the np.bincount based one listed earlier in this post.
In [61]: def bincount_based(d,i,v):
...: counts = np.bincount(i,v)
...: d[:counts.size] = counts
...:
...: def add_at_based(d,i,v):
...: np.add.at(d, i, v)
...:
In [62]: # Inputs (random numbers)
...: N = 10000
...: i = np.random.randint(0,1000,(N))
...: v = np.random.randint(0,1000,(N))
...:
...: # Setup output arrays for two approaches
...: M = 12000
...: d1 = np.zeros(M)
...: d2 = np.zeros(M)
...:
In [63]: bincount_based(d1,i,v) # Run approaches
...: add_at_based(d2,i,v)
...:
In [64]: np.allclose(d1,d2) # Verify outputs
Out[64]: True
In [67]: # Setup output arrays for two approaches again for timing
...: M = 12000
...: d1 = np.zeros(M)
...: d2 = np.zeros(M)
...:
In [68]: %timeit add_at_based(d2,i,v)
1000 loops, best of 3: 1.83 ms per loop
In [69]: %timeit bincount_based(d1,i,v)
10000 loops, best of 3: 52.7 µs per loop

Efficiently find indexes that would make array equal to a permutation of itself

I'm looking for some function that find the indexes that would make an array equal to a permutation of itself.
Assume that p1 is a 1d Numpy array that contains no duplicates. Assume that p2 is a permutation (a reordering) of p1.
I want a function find_position_in_permutation such that p2[find_position_in_permutation(p1, p2)] is identical to p1.
For example:
p1 = np.array(['a', 'e', 'c', 'f'])
p2 = np.array(['e', 'f', 'a', 'c'])
in which find_position_in_permutation(p1, p2) should return:
[2, 0, 3, 1]
because p2[[2, 0, 3, 1]] is identical to p1.
You can do this in a brute-force manner using lists:
def find_position_in_permutation(original, permutation):
    """Return, for each item of ``original``, its position in ``permutation``.

    Both arguments are first copied into plain lists. The result ``r``
    satisfies ``permutation[r[k]] == original[k]``.

    This is the brute-force reference: every lookup is a linear
    ``list.index`` scan, so the whole call is quadratic. ``list.index``
    raises ``ValueError`` for an item missing from ``permutation``.
    """
    original = list(original)
    permutation = list(permutation)
    return list(map(permutation.index, original))
but I am wondering if there is something more algorithmically efficient. This one appears to be O(N^2).
Benchmarks of current answers:
import numpy as np
from string import ascii_lowercase
n = 100
letters = np.array([*ascii_lowercase])
p1 = np.random.choice(letters, size=n)
p2 = np.random.permutation(p1)
p1l = p1.tolist()
p2l = p2.tolist()
def find_pos_in_perm_1(original, permutation):
    """My original solution: one linear ``permutation.index`` scan per item.

    ``permutation`` must provide an ``index`` method (e.g. a list).
    Returns a list ``r`` with ``permutation[r[k]] == original[k]``.
    """
    return list(map(permutation.index, original))
def find_pos_in_perm_2(original, permutation):
    """Eric Postpischil's solution, using a dict as a lookup table.

    A value -> position table is built from ``permutation`` in one pass and
    then consulted once per item of ``original``. If ``permutation`` repeats
    a value, the last position wins. A value missing from the table raises
    ``KeyError``.
    """
    tbl = {val: ix for ix, val in enumerate(permutation)}
    return [tbl[val] for val in original]
def find_pos_in_perm_3(original, permutation):
    """Paul Panzer's solution, using an array as a lookup table.

    Both inputs are argsorted; the k-th smallest item of ``original`` is
    paired with the k-th smallest item of ``permutation``, and the pairing
    is scattered back into ``original``'s order.

    Returns an integer array ``tbl`` with ``permutation[tbl] == original``.
    NOTE(review): that identity relies on both inputs holding the same
    distinct values -- nothing here checks it.
    """
    original_argsort = np.argsort(original)
    permutation_argsort = np.argsort(permutation)
    tbl = np.empty_like(original_argsort)
    # Scatter: the slot of original's k-th smallest item receives the
    # position of permutation's k-th smallest item.
    tbl[original_argsort] = permutation_argsort
    return tbl
%timeit find_pos_in_perm_1(p1l, p2l)
# 40.5 µs ± 1.13 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit find_pos_in_perm_2(p1l, p2l)
# 10 µs ± 171 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%timeit find_pos_in_perm_3(p1, p2)
# 6.38 µs ± 157 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
You can do O(N log N) using argsort:
>>> import numpy as np
>>> from string import ascii_lowercase
>>>
>>> letters = np.array([*ascii_lowercase])
>>> p1, p2 = map(np.random.permutation, 2*(letters,))
>>>
>>> o1, o2 = map(np.argsort, (p1, p2))
>>> o12, o21 = map(np.empty_like, (o1, o2))
>>> o12[o1], o21[o2] = o2, o1
>>>
>>> print(np.all(p1[o21] == p2))
True
>>> print(np.all(p2[o12] == p1))
True
O(N) solution using Python dictionary:
>>> import operator as op
>>>
>>> l1, l2 = map(op.methodcaller('tolist'), (p1, p2))
>>>
>>> s12 = op.itemgetter(*l1)({k: v for v, k in enumerate(l2)})
>>> print(np.all(s12 == o12))
True
Some timings:
26 elements
argsort 0.004 ms
dict 0.003 ms
676 elements
argsort 0.096 ms
dict 0.075 ms
17576 elements
argsort 4.366 ms
dict 2.915 ms
456976 elements
argsort 191.376 ms
dict 230.459 ms
Benchmark code:
import numpy as np
from string import ascii_lowercase
import operator as op
from timeit import timeit
L1 = np.array([*ascii_lowercase], object)
L2 = np.add.outer(L1, L1).ravel()
L3 = np.add.outer(L2, L1).ravel()
L4 = np.add.outer(L2, L2).ravel()
letters = (*map(op.methodcaller('astype', str), (L1, L2, L3, L4)),)
def use_argsort(p1, p2):
    """Return an index array ``o12`` with ``p2[o12] == p1``, via two argsorts.

    NOTE(review): the identity relies on ``p1`` and ``p2`` being
    permutations of the same distinct values -- not checked here.
    """
    o1, o2 = map(np.argsort, (p1, p2))
    o12 = np.empty_like(o1)
    # The slot of p1's k-th smallest item receives the position of p2's
    # k-th smallest item.
    o12[o1] = o2
    return o12
def use_dict(l1, l2):
    """Look up every item of ``l1`` in a value -> position table built from ``l2``.

    Returns a tuple of positions when ``l1`` has two or more items.
    ``operator.itemgetter`` returns the bare position (not a 1-tuple) for a
    single item and raises ``TypeError`` when ``l1`` is empty.
    """
    return op.itemgetter(*l1)({k: v for v, k in enumerate(l2)})
for L, N in zip(letters, (1000, 1000, 200, 4)):
print(f'{len(L)} elements')
p1, p2 = map(np.random.permutation, (L, L))
l1, l2 = map(op.methodcaller('tolist'), (p1, p2))
T = (timeit(lambda: f(i1, i2), number=N)*1000/N for f, i1, i2 in (
(use_argsort, p1, p2), (use_dict, l1, l2)))
for m, t in zip(('argsort', 'dict '), T):
print(m, f'{t:10.3f} ms')

Selecting rows from ndarray via bytearray

I have a bytearray that is pulled from redis.
r.set('a', '')
r.setbit('a', 0, 1)
r.setbit('a', 1, 1)
r.setbit('a', 12, 1)
a_raw = db.get('a')
# b'\xc0\x08'
a_bin = bin(int.from_bytes(a, byteorder="big"))
# 0b1100000000001000
I want to use that bytearray to select rows from an ndarray.
arr = np.arange(12)
arr[a_raw]
# array([0, 1, 12])
Edit: Both solutions work, but I found @paul-panzer's to be faster
import timeit
setup = '''import numpy as np; a = b'\\xc0\\x08'; '''
t1 = timeit.timeit('idx = np.unpackbits(np.frombuffer(a, np.uint8)); np.where(idx)',
setup = setup, number=10000)
t2 = timeit.timeit('idx = np.array(list(bin(int.from_bytes(a, byteorder="big"))[2:])) == "1"; np.where(idx)',
setup = setup, number=10000)
print(t1, t2)
#0.019560601096600294 0.054518797900527716
Edit 2 Actually, the from_bytes method doesn't return what I'm looking for:
redis_db.delete('timeit_test')
redis_db.setbit('timeit_test', 12666, 1)
redis_db.setbit('timeit_test', 14379, 1)
by = redis_db.get('timeit_test')
idx = np.unpackbits(np.frombuffer(by, np.uint8))
indices = np.where(idx)
idx = np.array(list(bin(int.from_bytes(by, byteorder="big"))[2:])) == "1"
indices_2 = np.where(idx)
print(indices, indices_2)
#(array([12666, 14379]),) (array([ 1, 1714]),)
Here is a way using unpackbits:
>>> a = b'\xc0\x08'
>>> b = np.arange(32).reshape(16, 2)
>>> c = np.arange(40).reshape(20, 2)
>>>
>>> idx = np.unpackbits(np.frombuffer(a, np.uint8))
>>>
# if the sizes match boolean indexing can be used
>>> b[idx.view(bool)]
array([[ 0, 1],
[ 2, 3],
[24, 25]])
>>>
# non matching sizes can be worked around using where
>>> c[np.where(idx)]
array([[ 0, 1],
[ 2, 3],
[24, 25]])
>>>
Here's one way:
In [57]: b = 0b1100000000001000
In [58]: mask = np.array(list(bin(b)[2:])) == '1'
In [59]: arr = np.arange(13)
In [60]: arr[mask[:len(arr)]]
Out[60]: array([ 0, 1, 12])
Additionally it's a simple check to demonstrate that the __getitem__ implementation for ndarray does not support indexing directly on a bytes object:
In [61]: by = b'\xc0\x08'
In [62]: arr[by]
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-111-6cd68003b176> in <module>()
----> 1 arr[by]
IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`)
and integer or boolean arrays are valid indices
So unless you subclass ndarray or create an extension module with customized __getitem__ behavior, there is no way to do it directly from the bytes, and you must convert the bytes into a boolean mask based on bitwise conditions.
Here's an example comparing the timing for a few different approaches that work directly from the original bytes object:
In [171]: %timeit np.array(list(bin(int.from_bytes(by, byteorder='big'))[2:])) == '1'
3.51 µs ± 38 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
In [172]: %timeit np.unpackbits(np.frombuffer(by, np.uint8))
2.05 µs ± 29.59 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
In [173]: %timeit np.array(list(bin(struct.unpack('>H', by)[0])[2:])) == '1'
2.65 µs ± 6.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

what is the fastest way to get the mode of a numpy array

I have to find the mode of a NumPy array that I read from an hdf5 file. The NumPy array is 1d and contains floating point values.
my_array=f1[ds_name].value
mod_value=scipy.stats.mode(my_array)
My array is 1d and contains around 1M values. It takes about 15 min for my script to return the mode value. Is there any way to make this faster?
Another question is why scipy.stats.median(my_array) does not work while mode works?
AttributeError: module 'scipy.stats' has no attribute 'median'
The implementation of scipy.stats.mode has a Python loop for handling the axis argument with multidimensional arrays. The following simple implementation, for one-dimensional arrays only, is faster:
def mode1(x):
    """Return ``(mode, count)`` for the values in ``x`` using ``np.unique``.

    ``np.unique`` yields the distinct values in sorted order together with
    their counts; ``argmax`` picks the first maximum, so ties go to the
    smallest value.
    """
    values, counts = np.unique(x, return_counts=True)
    m = counts.argmax()
    return values[m], counts[m]
Here's an example. First, make an array of integers with length 1000000.
In [40]: x = np.random.randint(0, 1000, size=(2, 1000000)).sum(axis=0)
In [41]: x.shape
Out[41]: (1000000,)
Check that scipy.stats.mode and mode1 give the same result.
In [42]: from scipy.stats import mode
In [43]: mode(x)
Out[43]: ModeResult(mode=array([1009]), count=array([1066]))
In [44]: mode1(x)
Out[44]: (1009, 1066)
Now check the performance.
In [45]: %timeit mode(x)
2.91 s ± 18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [46]: %timeit mode1(x)
39.6 ms ± 83.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
2.91 seconds for mode(x) and only 39.6 milliseconds for mode1(x).
Here's one approach based on sorting -
def mode1d(ar_sorted):
    """Return ``(mode, count)`` of a 1-D array by sorting and measuring runs.

    The array is sorted IN PLACE (pass a copy to keep the caller's data
    untouched). Run lengths are then derived from the positions where
    neighbouring elements differ. Ties go to the smallest value because
    ``argmax`` returns the first maximum.

    An empty array raises ``IndexError``.
    """
    ar_sorted.sort()
    # idx[k] is the last position of the k-th run (every run but the final one).
    idx = np.flatnonzero(ar_sorted[1:] != ar_sorted[:-1])
    if idx.size == 0:
        # Only one run: a constant or single-element array. Without this
        # guard, idx[0] below would raise IndexError.
        return ar_sorted[0], ar_sorted.size
    count = np.empty(idx.size+1,dtype=int)
    count[1:-1] = idx[1:] - idx[:-1]              # interior runs
    count[0] = idx[0] + 1                         # first run
    count[-1] = ar_sorted.size - idx[-1] - 1      # final run
    argmax_idx = count.argmax()
    if argmax_idx==len(idx):
        # The final run has no boundary entry in idx.
        modeval = ar_sorted[-1]
    else:
        modeval = ar_sorted[idx[argmax_idx]]
    modecount = count[argmax_idx]
    return modeval, modecount
Note that this mutates/changes the input array as it sorts it. So, if you want to keep the input array un-mutated or do mind the input array being sorted, pass a copy.
Sample run on 1M elements -
In [65]: x = np.random.randint(0, 1000, size=(1000000)).astype(float)
In [66]: from scipy.stats import mode
In [67]: mode(x)
Out[67]: ModeResult(mode=array([ 295.]), count=array([1098]))
In [68]: mode1d(x)
Out[68]: (295.0, 1098)
Runtime test
In [75]: x = np.random.randint(0, 1000, size=(1000000)).astype(float)
# Scipy's mode
In [76]: %timeit mode(x)
1 loop, best of 3: 1.64 s per loop
# @Warren Weckesser's soln
In [77]: %timeit mode1(x)
10 loops, best of 3: 52.7 ms per loop
# Proposed in this post
In [78]: %timeit mode1d(x)
100 loops, best of 3: 12.8 ms per loop
With a copy, the timings for mode1d would be comparable to mode1.
I added the two functions mode1 and mode1d from replies above to my script and tried to compare with the scipy.stats.mode.
dir_name="C:/Users/test_mode"
file_name="myfile2.h5"
ds_name="myds"
f_in=os.path.join(dir_name,file_name)
def mode1(x):
    """Return ``(mode, count)`` for the values in ``x`` using ``np.unique``.

    The distinct values come back sorted together with their counts;
    ``argmax`` picks the first maximum, so ties go to the smallest value.
    """
    values, counts = np.unique(x, return_counts=True)
    m = counts.argmax()
    return values[m], counts[m]
def mode1d(ar_sorted):
    """Return ``(mode, count)`` of a 1-D array by sorting and measuring runs.

    The array is sorted IN PLACE. Run lengths are derived from the positions
    where neighbouring elements differ; ties go to the smallest value
    because ``argmax`` returns the first maximum.

    An empty array raises ``IndexError``.
    """
    ar_sorted.sort()
    # idx[k] is the last position of the k-th run (every run but the final one).
    idx = np.flatnonzero(ar_sorted[1:] != ar_sorted[:-1])
    if idx.size == 0:
        # Only one run: a constant or single-element array. Without this
        # guard, idx[0] below would raise IndexError.
        return ar_sorted[0], ar_sorted.size
    count = np.empty(idx.size+1,dtype=int)
    count[1:-1] = idx[1:] - idx[:-1]              # interior runs
    count[0] = idx[0] + 1                         # first run
    count[-1] = ar_sorted.size - idx[-1] - 1      # final run
    argmax_idx = count.argmax()
    if argmax_idx==len(idx):
        # The final run has no boundary entry in idx.
        modeval = ar_sorted[-1]
    else:
        modeval = ar_sorted[idx[argmax_idx]]
    modecount = count[argmax_idx]
    return modeval, modecount
# Time the file read, then each mode implementation in turn; every stage
# prints its elapsed time in seconds and in minutes.
startTime=time.time()
# Read-only is enough here: the script never writes to the file.
with h5py.File(f_in, "r") as f1:
    # Dataset.value was removed in h5py 3.0; indexing with () reads the
    # whole dataset into memory.
    myds=f1[ds_name][()]
time1=time.time()
file_read_time=time1-startTime
print(str(file_read_time)+"\t"+"s"+"\t"+str((file_read_time)/60)+"\t"+"min")
print("mode_scipy=")
mode_scipy=scipy.stats.mode(myds)
print(mode_scipy)
time2=time.time()
mode_scipy_time=time2-time1
print(str(mode_scipy_time)+"\t"+"s"+"\t"+str((mode_scipy_time)/60)+"\t"+"min")
print("mode1=")
# Results get their own names so the mode1/mode1d functions are not rebound
# and stay callable afterwards.
mode1_result=mode1(myds)
print(mode1_result)
time3=time.time()
mode1_time=time3-time2
print(str(mode1_time)+"\t"+"s"+"\t"+str((mode1_time)/60)+"\t"+"min")
print("mode1d=")
# mode1d sorts myds in place, so it runs last.
mode1d_result=mode1d(myds)
print(mode1d_result)
time4=time.time()
mode1d_time=time4-time3
print(str(mode1d_time)+"\t"+"s"+"\t"+str((mode1d_time)/60)+"\t"+"min")
The result from running the script for a numpy array of around 1M is :
mode_scipy=
ModeResult(mode=array([ 1.11903353e-06], dtype=float32), count=array([304909]))
938.8368742465973 s
15.647281237443288 min
mode1=(1.1190335e-06, 304909)
0.06500649452209473 s
0.0010834415753682455 min
mode1d=(1.1190335e-06, 304909)
0.06200599670410156 s
0.0010334332784016928 min