Combine 2 different sized arrays element-wise based on index pairing array - numpy

Say, we had 2 arrays of unique values:
a = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) # any values are possible,
b = np.array([0, 11, 12, 13, 14, 15, 16, 17, 18, 19]) # sorted values are for demonstration
, where a[0] corresponds to b[0], a[1] to b[11], a[2]-b[12], etc.
Then, due to some circumstances we randomly lost some of it and received noise elements from/to both a & b. Now 'useful data' in a and b are kind of 'eroded' like this:
a = np.array([0, 1, 313, 2, 3, 4, 5, 934, 6, 8, 9, 730, 241, 521])
b = np.array([112, 514, 11, 13, 16, 955, 17, 18, 112])
The noise elements have negligible probability to coincide with any of 'useful data'. So, if to search them, we could find the left ones and to define the 'index pairing array':
cor_tab = np.array([[1,2], [4,3], [8,4], [9,7]])
which, if applied, provides pairs of 'useful data' left:
np.column_stack((a[cor_tab[:,0]], b[cor_tab[:,1]]))
array([[1, 11],
[3, 13],
[6, 16],
[8, 18]])
The question: Given the 'eroded' a and b, how to combine them into numpy array such that:
values indexed in cor_tab are paired in the same column/row,
lost values are treated as -1,
noise as 'don't care', and
array looks like this:
[[ -1 112],
[ 0 514],
[ 1 11],
[313 -1],
[ 2 -1],
[ 3 13],
[ 4 -1],
[ 5 -1],
[934 -1],
[ 6 16],
[ -1 955],
[ -1 17],
[ 8 18],
[ 9 -1],
[730 -1],
[241 -1],
[521 112]]
, where 'useful data' is at indices: 2, 5, 9, 12?
Initially I solved this, in dubious way:
import numpy as np
def combine(aa, bb, t):
c0 = np.empty((0), int)
c1 = np.empty((0), int)
# add -1 & 'noise' at the left side:
if t[0][0] > t[0][1]:
c0 = np.append(c0, aa[: t[0][0]])
c1 = np.append(c1, [np.append([-1] * (t[0][0] - t[0][1]), bb[: t[0][1]])])
else:
c0 = np.append(c0, [np.append([-1] * (t[0][1] - t[0][0]), aa[: t[0][0]])])
c1 = np.append(c1, bb[: t[0][1]])
ind_compenstr = t[0][0] - t[0][1] # 'index compensator'
for i, ii in enumerate(t):
x = ii[0] - ii[1] - ind_compenstr
# add -1 & 'noise' in the middle:
if x > 0:
c0 = np.append(c0, [aa[ii[0]-x:ii[0]]])
c1 = np.append(c1, [[-1] * x])
elif x == 0:
c0 = np.append(c0, [aa[ii[0]-x:ii[0]]])
c1 = np.append(c1, [bb[ii[1]-x:ii[1]]])
else:
x = abs(x)
c0 = np.append(c0, [[-1] * x])
c1 = np.append(c1, [bb[ii[1]-x:ii[1]]])
# add useful elements:
c0 = np.append(c0, aa[ii[0]])
c1 = np.append(c1, bb[ii[1]])
ind_compenstr += x
# add -1 & 'noise' at the right side:
l0 = len(aa) - t[-1][0]
l1 = len(bb) - t[-1][1]
if l0 > l1:
c0 = np.append(c0, aa[t[-1][0] + 1:])
c1 = np.append(c1, [np.append(bb[t[-1][1] + 1:], [-1] * (l0 - l1))])
else:
c0 = np.append(c0, [np.append(aa[t[-1][0] + 1:], [-1] * (l1 - l0))])
c1 = np.append(c1, bb[t[-1][1] + 1:])
return np.array([c0,c1])
But bellow I suggest another solution.

It is difficult to understand what the question want, but IIUC, at first, we need to find the column size of the expected array that contains combined uncommon values between the two arrays (np.union1d), and then create an array based on that size full filled by -1 (np.full). Now, using np.searchsorted, the indices of values of an array in another array will be achieved. Values that are not contained in the other array can be given by np.in1d in invert mode. So we can achieve the goal by indexing as:
union_ = np.union1d(a, b)
# [0 1 2 3 4 5 6 7 8 9]
res = np.full((2, union_.size), -1)
# [[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
# [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1]]
arange_row_ids = np.arange(union_.size)
# [0 1 2 3 4 5 6 7 8 9]
col_inds = np.searchsorted(a, b)[np.in1d(b, a, invert=True)]
# np.searchsorted(a, b) ---> [1 3 6 7 7]
# np.in1d(b, a, invert=True) ---> [False False False True False]
# [7]
res[0, np.delete(arange_row_ids, col_inds + np.arange(col_inds.size))] = a
# np.delete(arange_row_ids, col_inds + np.arange(col_inds.size)) ---> [0 1 2 3 4 5 6 8 9]
# [[ 0 1 2 3 4 5 6 -1 8 9]
# [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1]]
col_inds = np.searchsorted(b, a)[np.in1d(a, b, invert=True)]
# np.searchsorted(b, a) ---> [0 0 1 1 2 2 2 4 5]
# np.in1d(a, b, invert=True) ---> [ True False True False True True False False True]
# [0 1 2 2 5]
res[1, np.delete(arange_row_ids, col_inds + np.arange(col_inds.size))] = b
# np.delete(arange_row_ids, col_inds + np.arange(col_inds.size)) ---> [1 3 6 7 8]
# [[ 0 1 2 3 4 5 6 -1 8 9]
# [-1 1 -1 3 -1 -1 6 7 8 -1]]
The question is not clear enough to see if the answer is the expected one, but I think it is helpful that could help for further modifications based on the need.

Here's a partially vectorized solution:
import numpy as np
# this function if from Divakar's answer at #https://stackoverflow.com/questions/38619143/convert-python-#sequence-to-numpy-array-filling-missing-values that I used as #function:
def boolean_indexing(v):
lens = np.array([len(item) for item in v])
mask = lens[:,None] > np.arange(lens.max())[::-1]
out = np.full(mask.shape, -1, dtype=int)
out[mask] = np.concatenate(v)
return out
# 2 arrays with eroded useful data and the index pairing array:
a = np.array([0, 1, 313, 2, 3, 4, 5, 934, 6, 8, 9, 730, 241, 521])
b = np.array([112, 514, 11, 13, 16, 955, 17, 18, 112])
cor_tab = np.array([[1,2], [4,3], [8,4], [9,7]])
# split every array by correspondent indices in `cor_tab`:
aa = np.split(a, cor_tab[:,0]+1)
bb = np.split(b, cor_tab[:,1]+1)
#initiate 2 flat empty arrays:
aaa = np.empty((0), int)
bbb = np.empty((0), int)
# loop over the splitted arrays:
for i, j in zip(aa,bb):
c = boolean_indexing([i, j])
aaa = np.append(aaa, c[0])
bbb = np.append(bbb, c[1])
ccc = np.array([aaa,bbb]).T
In case of other types of data, here is another example. Lets take two arrays of letters:
a = np.array(['y', 'w', 'a', 'e', 'i', 'o', 'u', 'y', 'w', 'a', 'e', 'i', 'o', 'u'])
b = np.array(['t', 'h', 'b', 't', 'c', 'n', 's', 'j', 'p', 'z', 'n', 'h', 't', 's', 'm', 'p'])
, and index pairing array:
cor_tab = np.array([[2,0], [3,2], [4,3], [5,5], [6,6], [9,10], [11,12], [13,13]])
np.column_stack((a[cor_tab[:,0]], b[cor_tab[:,1]]))
array([['a', 't'], # useful data
['e', 'b'],
['i', 't'],
['o', 'n'],
['u', 's'],
['a', 'n'],
['i', 't'],
['u', 's']], dtype='<U1')
The only correction required is dtype='<U1' in boolean_indexing(). Result is:
[['y' '-'],
['w' '-'],
['a' 't'],
['-' 'h'],
['e' 'b'],
['i' 't'],
['-' 'c'],
['o' 'n'],
['u' 's'],
['-' 'j'],
['y' 'p'],
['w' 'z'],
['a' 'n'],
['e' 'h'],
['i' 't'],
['o' '-'],
['u' 's'],
['-' 'm'],
['-' 'p']]
It works for floats as well if change dtype in boolean_indexing() to float.

Related

Find duplicated sequences in numpy.array or pandas column

For example, I have got an array like this:
([ 1, 5, 7, 9, 4, 6, 3, 3, 7, 9, 4, 0, 3, 3, 7, 8, 1, 5 ])
I need to find all duplicated sequences , not values, but sequences of at least two values one by one.
The result should be like this:
of length 2: [1, 5] with indexes (0, 16);
of length 3: [3, 3, 7] with indexes (6, 12); [7, 9, 4] with indexes (2, 8)
The long sequences should be excluded, if they are not duplicated. ([5, 5, 5, 5]) should NOT be taken as [5, 5] on indexes (0, 1, 2)! It's not a duplicate sequence, it's one long sequence.
I can do it with pandas.apply function, but it calculates too slow, swifter did not help me.
And in real life I need to find all of them, with length from 10 up to 100 values one by one on database with 1500 columns with 700 000 values each. So i really do need a vectorized decision.
Is there a vectorized decision for finding all at once? Or at least for finding only 10-values sequences? Or only 4-values sequences? Anything, that will be fully vectorized?
One possible implementation (although not fully vectorized) that finds all sequences of size n that appear more than once is the following:
import numpy as np
def repeated_sequences(arr, n):
Na = arr.size
r_seq = np.arange(n)
n_seqs = arr[np.arange(Na - n + 1)[:, None] + r_seq]
unique_seqs = np.unique(n_seqs, axis=0)
comp = n_seqs == unique_seqs[:, None]
M = np.all(comp, axis=-1)
if M.any():
matches = np.array(
[np.convolve(M[i], np.ones((n), dtype=int)) for i in range(M.shape[0])]
)
repeated_inds = np.count_nonzero(matches, axis=-1) > n
repeated_matches = matches[repeated_inds]
idxs = np.argwhere(repeated_matches > 0)[::n]
grouped_idxs = np.split(
idxs[:, 1], np.unique(idxs[:, 0], return_index=True)[1][1:]
)
else:
return [], []
return unique_seqs[repeated_inds], grouped_idxs
In theory, you could replace
matches = np.array(
[np.convolve(M[i], np.ones((n), dtype=int)) for i in range(M.shape[0])]
)
with
matches = scipy.signal.convolve(
M, np.ones((1, n), dtype=int), mode="full"
).astype(int)
which would make the whole thing "fully vectorized", but my tests showed that this was 3 to 4 times slower than the for-loop. So I'd stick with that. Or simply,
matches = np.apply_along_axis(np.convolve, -1, M, np.ones((n), dtype=int))
which does not have any significant speed-up, since it's basically a hidden loop (see this).
This is based off #Divakar's answer here that dealt with a very similar problem, in which the sequence to look for was provided. I simply made it so that it could follow this procedure for all possible sequences of size n, which are found inside the function with n_seqs = arr[np.arange(Na - n + 1)[:, None] + r_seq]; unique_seqs = np.unique(n_seqs, axis=0).
For example,
>>> a = np.array([1, 5, 7, 9, 4, 6, 3, 3, 7, 9, 4, 0, 3, 3, 7, 8, 1, 5])
>>> repeated_seqs, inds = repeated_sequences(a, n)
>>> for i, seq in enumerate(repeated_seqs[:10]):
...: print(f"{seq} with indexes {inds[i]}")
...:
[3 3 7] with indexes [ 6 12]
[7 9 4] with indexes [2 8]
Disclaimer
The long sequences should be excluded, if they are not duplicated. ([5, 5, 5, 5]) should NOT be taken as [5, 5] on indexes (0, 1, 2)! It's not a duplicate sequence, it's one long sequence.
This is not directly taken into account and the sequence [5, 5] would appear more than once according to this algorithm. You could do something like this, based off #Paul's answer here, but it involves a loop:
import numpy as np
repeated_matches = np.array([[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])
idxs = np.argwhere(repeated_matches > 0)
grouped_idxs = np.split(
idxs[:, 1], np.unique(idxs[:, 0], return_index=True)[1][1:]
)
>>> print(grouped_idxs)
[array([ 6, 7, 8, 12, 13, 14], dtype=int64),
array([ 7, 8, 9, 10], dtype=int64)]
# If there are consecutive numbers in grouped_idxs, that means that there is a long
# sequence that should be excluded. So, you'd have to check for consecutive numbers
filtered_idxs = []
for idx in grouped_idxs:
if not all((idx[1:] - idx[:-1]) == 1):
filtered_idxs.append(idx)
>>> print(filtered_idxs)
[array([ 6, 7, 8, 12, 13, 14], dtype=int64)]
Some tests:
>>> n = 3
>>> a = np.array([1, 5, 7, 9, 4, 6, 3, 3, 7, 9, 4, 0, 3, 3, 7, 8, 1, 5])
>>> %timeit repeated_sequences(a, n)
414 µs ± 5.88 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
>>> n = 4
>>> a = np.random.randint(0, 10, (10000,))
>>> %timeit repeated_sequences(a, n)
3.88 s ± 54 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
>>> result, _ = repeated_sequences(a, n)
>>> result.shape
(2637, 4)
This is not the most efficient implementation by far, but it works as a 2D approach. Plus, if there aren't any repeated sequences, it returns empty lists.
EDIT: Full implementation
I vectorized the routine I added in the Disclaimer section as a possible solution to the long sequence problem and ended up with the following:
import numpy as np
# Taken from:
# https://stackoverflow.com/questions/53051560/stacking-numpy-arrays-of-different-length-using-padding
def stack_padding(it):
def resize(row, size):
new = np.array(row)
new.resize(size)
return new
row_length = max(it, key=len).__len__()
mat = np.array([resize(row, row_length) for row in it])
return mat
def repeated_sequences(arr, n):
Na = arr.size
r_seq = np.arange(n)
n_seqs = arr[np.arange(Na - n + 1)[:, None] + r_seq]
unique_seqs = np.unique(n_seqs, axis=0)
comp = n_seqs == unique_seqs[:, None]
M = np.all(comp, axis=-1)
repeated_seqs = []
idxs_repeated_seqs = []
if M.any():
matches = np.apply_along_axis(np.convolve, -1, M, np.ones((n), dtype=int))
repeated_inds = np.count_nonzero(matches, axis=-1) > n
if repeated_inds.any():
repeated_matches = matches[repeated_inds]
idxs = np.argwhere(repeated_matches > 0)
grouped_idxs = np.split(
idxs[:, 1], np.unique(idxs[:, 0], return_index=True)[1][1:]
)
# Additional routine
# Pad this uneven array with zeros so that we can use it normally
grouped_idxs = np.array(grouped_idxs, dtype=object)
padded_idxs = stack_padding(grouped_idxs)
# Find the indices where there are padded zeros
pad_positions = padded_idxs == 0
# Perform the "consecutive-numbers check" (this will take one
# item off the original array, so we have to correct for its shape).
idxs_to_remove= np.pad(
(padded_idxs[:, 1:] - padded_idxs[:, :-1]) == 1,
[(0, 0), (0, 1)],
constant_values=True,
)
pad_positions = np.argwhere(pad_positions)
i = pad_positions[:, 0]
j = pad_positions[:, 1] - 1 # Shift by one (shape correction)
idxs_to_remove[i, j] = True # Masking, since we don't want pad indices
# Obtain a final mask (boolean opposite of indices to remove)
final_mask = ~idxs_to_remove.all(axis=-1)
grouped_idxs = grouped_idxs[final_mask] # Filter the long sequences
repeated_seqs = unique_seqs[repeated_inds][final_mask]
# In order to get the correct indices, we must first limit the
# search to a shape (on axis=1) of the closest multiple of n.
# This will avoid taking more indices than we should to show where
# each repeated sequence begins
to = padded_idxs.shape[1] & (-n)
# Build the final list of indices (that goes from 0 - to with
# a step of n
idxs_repeated_seqs = [
grouped_idxs[i][:to:n] for i in range(grouped_idxs.shape[0])
]
return repeated_seqs, idxs_repeated_seqs
For example,
n = 2
examples = [
# First example is your original example array.
np.array([1, 5, 7, 9, 4, 6, 3, 3, 7, 9, 4, 0, 3, 3, 7, 8, 1, 5]),
# Second example has a long sequence of 5's, and since there aren't
# any [5, 5] anywhere else, it's not taken into account and therefore
# should not come out.
np.array([1, 5, 5, 5, 5, 6, 3, 3, 7, 9, 4, 0, 3, 3, 7, 8, 1, 5]),
# Third example has the same long sequence but since there is a [5, 5]
# later, then it should take it into account and this sequence should
# be found.
np.array([1, 5, 5, 5, 5, 6, 5, 5, 7, 9, 4, 0, 3, 3, 7, 8, 1, 5]),
# Fourth example has a [5, 5] first and later it has a long sequence of
# 5's which are uneven and the previous implementation got confused with
# the indices to show as the starting indices. In this case, it should be
# 1, 13 and 15 for [5, 5].
np.array([1, 5, 5, 9, 4, 6, 3, 3, 7, 9, 4, 0, 3, 5, 5, 5, 5, 5]),
]
for a in examples:
print(f"\nExample: {a}")
repeated_seqs, inds = repeated_sequences(a, n)
for i, seq in enumerate(repeated_seqs):
print(f"\t{seq} with indexes {inds[i]}")
Output (as expected):
Example: [1 5 7 9 4 6 3 3 7 9 4 0 3 3 7 8 1 5]
[1 5] with indexes [0 16]
[3 3] with indexes [6 12]
[3 7] with indexes [7 13]
[7 9] with indexes [2 8]
[9 4] with indexes [3 9]
Example: [1 5 5 5 5 6 3 3 7 9 4 0 3 3 7 8 1 5]
[1 5] with indexes [0 16]
[3 3] with indexes [6 12]
[3 7] with indexes [7 13]
Example: [1 5 5 5 5 6 5 5 7 9 4 0 3 3 7 8 1 5]
[1 5] with indexes [ 0 16]
[5 5] with indexes [1 3 6]
Example: [1 5 5 9 4 6 3 3 7 9 4 0 3 5 5 5 5 5]
[5 5] with indexes [ 1 13 15]
[9 4] with indexes [3 9]
You can test it out yourself with more examples and more cases. Keep in mind this is what I understood from your disclaimer. If you want to count the long sequences as one, even if multiple sequences are in there (for example, [5, 5] appears twice in [5, 5, 5, 5]), this won't work for you and you'd have to come up with something else.

Generating boolean dataframe based on contents in series and dataframe

I have:
df = pd.DataFrame(
[
[22, 33, 44],
[55, 11, 22],
[33, 55, 11],
],
index=["abc", "def", "ghi"],
columns=list("abc")
) # size(3,3)
and:
unique = pd.Series([11, 22, 33, 44, 55]) # size(1,5)
then I create a new df based on unique and df, so that:
df_new = pd.DataFrame(index=unique, columns=df.columns) # size(5,3)
From this newly created df, I'd like to create a new boolean df based on unique and df, so that the end result is:
df_new = pd.DataFrame(
[
[0, 1, 1],
[1, 0, 1],
[1, 1, 0],
[0, 0, 1],
[1, 1, 0],
],
index=unique,
columns=df.columns
)
This new df is either true or false depending on whether the value is present in the original dataframe or not. For example, the first column has three values: [22, 55, 33]. In a df with dimensions (5,3), this first column would be: [0, 1, 1, 0, 1] i.e. [0, 22, 33, 0 , 55]
I tried filter2 = unique.isin(df) but this doesn't work, also notnull. I tried applying a filter but the dimensions returned were incorrect. How can I do this?
Use DataFrame.stack with DataFrame.reset_index, DataFrame.pivot, then check if not missing values by DataFrame.notna, cast to integers for True->1 and False->0 mapping and last remove index and columns names by DataFrame.rename_axis:
df_new = (df.stack()
.reset_index(name='v')
.pivot('v','level_1','level_0')
.notna()
.astype(int)
.rename_axis(index=None, columns=None))
print (df_new)
a b c
11 0 1 1
22 1 0 1
33 1 1 0
44 0 0 1
55 1 1 0
Helper Series is not necessary, but if there is more values or is necessary change order by helper Series use add DataFrame.reindex:
#added 66
unique = pd.Series([11, 22, 33, 44, 55,66])
df_new = (df.stack()
.reset_index(name='v')
.pivot('v','level_1','level_0')
.reindex(unique)
.notna()
.astype(int)
.rename_axis(index=None, columns=None))
print (df_new)
a b c
11 0 1 1
22 1 0 1
33 1 1 0
44 0 0 1
55 1 1 0
66 0 0 0

Group Pandas dataframe Age column by Age groups [duplicate]

I have a data frame column with numeric values:
df['percentage'].head()
46.5
44.2
100.0
42.12
I want to see the column as bin counts:
bins = [0, 1, 5, 10, 25, 50, 100]
How can I get the result as bins with their value counts?
[0, 1] bin amount
[1, 5] etc
[5, 10] etc
...
You can use pandas.cut:
bins = [0, 1, 5, 10, 25, 50, 100]
df['binned'] = pd.cut(df['percentage'], bins)
print (df)
percentage binned
0 46.50 (25, 50]
1 44.20 (25, 50]
2 100.00 (50, 100]
3 42.12 (25, 50]
bins = [0, 1, 5, 10, 25, 50, 100]
labels = [1,2,3,4,5,6]
df['binned'] = pd.cut(df['percentage'], bins=bins, labels=labels)
print (df)
percentage binned
0 46.50 5
1 44.20 5
2 100.00 6
3 42.12 5
Or numpy.searchsorted:
bins = [0, 1, 5, 10, 25, 50, 100]
df['binned'] = np.searchsorted(bins, df['percentage'].values)
print (df)
percentage binned
0 46.50 5
1 44.20 5
2 100.00 6
3 42.12 5
...and then value_counts or groupby and aggregate size:
s = pd.cut(df['percentage'], bins=bins).value_counts()
print (s)
(25, 50] 3
(50, 100] 1
(10, 25] 0
(5, 10] 0
(1, 5] 0
(0, 1] 0
Name: percentage, dtype: int64
s = df.groupby(pd.cut(df['percentage'], bins=bins)).size()
print (s)
percentage
(0, 1] 0
(1, 5] 0
(5, 10] 0
(10, 25] 0
(25, 50] 3
(50, 100] 1
dtype: int64
By default cut returns categorical.
Series methods like Series.value_counts() will use all categories, even if some categories are not present in the data, operations in categorical.
Using the Numba module for speed up.
On big datasets (more than 500k), pd.cut can be quite slow for binning data.
I wrote my own function in Numba with just-in-time compilation, which is roughly six times faster:
from numba import njit
#njit
def cut(arr):
bins = np.empty(arr.shape[0])
for idx, x in enumerate(arr):
if (x >= 0) & (x < 1):
bins[idx] = 1
elif (x >= 1) & (x < 5):
bins[idx] = 2
elif (x >= 5) & (x < 10):
bins[idx] = 3
elif (x >= 10) & (x < 25):
bins[idx] = 4
elif (x >= 25) & (x < 50):
bins[idx] = 5
elif (x >= 50) & (x < 100):
bins[idx] = 6
else:
bins[idx] = 7
return bins
cut(df['percentage'].to_numpy())
# array([5., 5., 7., 5.])
Optional: you can also map it to bins as strings:
a = cut(df['percentage'].to_numpy())
conversion_dict = {1: 'bin1',
2: 'bin2',
3: 'bin3',
4: 'bin4',
5: 'bin5',
6: 'bin6',
7: 'bin7'}
bins = list(map(conversion_dict.get, a))
# ['bin5', 'bin5', 'bin7', 'bin5']
Speed comparison:
# Create a dataframe of 8 million rows for testing
dfbig = pd.concat([df]*2000000, ignore_index=True)
dfbig.shape
# (8000000, 1)
%%timeit
cut(dfbig['percentage'].to_numpy())
# 38 ms ± 616 µs per loop (mean ± standard deviation of 7 runs, 10 loops each)
%%timeit
bins = [0, 1, 5, 10, 25, 50, 100]
labels = [1,2,3,4,5,6]
pd.cut(dfbig['percentage'], bins=bins, labels=labels)
# 215 ms ± 9.76 ms per loop (mean ± standard deviation of 7 runs, 10 loops each)
We could also use np.select:
bins = [0, 1, 5, 10, 25, 50, 100]
df['groups'] = (np.select([df['percentage'].between(i, j, inclusive='right')
for i,j in zip(bins, bins[1:])],
[1, 2, 3, 4, 5, 6]))
Output:
percentage groups
0 46.50 5
1 44.20 5
2 100.00 6
3 42.12 5
Convenient and fast version using Numpy
np.digitize is a convenient and fast option:
import pandas as pd
import numpy as np
df = pd.DataFrame({'x': [1,2,3,4,5]})
df['y'] = np.digitize(a['x'], bins=[3,5])
print(df)
returns
x y
0 1 0
1 2 0
2 3 1
3 4 1
4 5 2

How to merge columns using mask

I am trying to merge two columns (Phone 1 and 2)
Here is my fake data:
import pandas as pd
employee = {'EmployeeID' : [0, 1, 2, 3, 4, 5, 6, 7],
'LastName' : ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'],
'Name' : ['w', 'x', 'y', 'z', None, None, None, None],
'phone1' : [1, 1, 2, 2, 4, 5, 6, 6],
'phone2' : [None, None, 3, 3, None, None, 7, 7],
'level_15' : [0, 1, 0, 1, 0, 0, 0, 1]}
df2 = pd.DataFrame(employee)
and I want the 'phone' column to be
'phone' : [1, 2, 3, 4, 5, 7, 9, 10]
In the beginning of my code, i split the names based on '/' and this code below creates a column with 0s and 1s which I used as mask to do other tasks through out my code.
df2 = (df2.set_index(cols)['name'].str.split('/',expand=True).stack().reset_index(name='Name'))
m = df2['level_15'].eq(0)
print (m)
#remove column level_15
df2 = df2.drop(['level_15'], axis=1)
#add last name for select first letter by condition, replace NaNs by forward fill
df2['last_name'] = df2['name'].str[:2].where(m).ffill()
df2['name'] = df2['name'].mask(m, df2['name'].str[2:])
I feel like there is a way to merge phone1 and phone2 using the 0s and 1s, but I can't figure out. Thank you.
First, start by filling in NaNs;
df2['phone2'] = df2.phone2.fillna(df2.phone1)
# Alternatively, based on your latest update
# df2['phone2'] = df2.phone2.mask(df2.phone2.eq(0)).fillna(df2.phone1)
You can just use np.where to merge columns on odd/even indices:
df2['phone'] = np.where(np.arange(len(df2)) % 2 == 0, df2.phone1, df2.phone2)
df2 = df2.drop(['phone1', 'phone2'], 1)
df2
EmployeeID LastName Name phone
0 0 a w 1
1 1 b x 2
2 2 c y 3
3 3 d z 4
4 4 e None 5
5 5 f None 6
6 6 g None 7
7 7 h None 8
Or, with Series.where/mask:
df2['phone'] = df2.pop('phone1').where(
np.arange(len(df2)) % 2 == 0, df2.pop('phone2')
)
Or,
df2['phone'] = df2.pop('phone1').mask(
np.arange(len(df2)) % 2 != 0, df2.pop('phone2)
)
df2
EmployeeID LastName Name phone
0 0 a w 1
1 1 b x 2
2 2 c y 3
3 3 d z 4
4 4 e None 5
5 5 f None 6
6 6 g None 7
7 7 h None 8

Get elements of a permutation

I have an 1D-array whose elements are a permutation of 0:N, and I need to take the first K elements of this permutation
For example, in the case the permutation is
0 [[9]
1 [0]
2 [1]
3 [2]
4 [3]
5 [4]
6 [5]
7 [6]
8 [7]
9 [8]]
The first 3 elements are 9 , 8 , 7
The code is
n = start
r = zeros (nodeCount, dtype = int)
i = 0
while (self.nodes[n][direction] != stop):
r[i] = n
n = self.nodes[n][direction]
i+=1
I need a faster way to extract the elements from permutation.
This works, but I don't think it is going to be especially fast:
>>> a
array([9, 0, 1, 2, 3, 4, 5, 6, 7, 8])
>>> n = 3
>>> b = np.empty((n,), dtype=a.dtype)
>>> b[0] = a[0]
>>> for k in xrange(1, n):
... b[k] = a[b[k-1]]
...
>>> b
array([9, 8, 7])
Is numpy.roll what you're after?
>>> a = np.arange(10)
>>> b = np.roll(a,1)
>>> b
array([9, 0, 1, 2, 3, 4, 5, 6, 7, 8])
>>> np.roll(b[::-1],1)[:3]
array([9, 8, 7])
That last line of code is pretty cryptic, but b[::-1] reverses the array, np.roll shifts it, and the [:3] only takes the first three elements.