How to get the indices of x smallest elements in a large numpy matrix/multi-dimensional array (works for any number of dimensions)? - numpy

Given a large numpy matrix/multi-dimensional array, what is the best and fastest way to get the indices of the x smallest elements?

from typing import Tuple

import numpy as np

def get_indices_of_k_smallest_as_array(arr: np.ndarray, k: int) -> np.ndarray:
    idx = np.argpartition(arr.ravel(), k)
    return np.array(np.unravel_index(idx, arr.shape))[:, range(k)].transpose().tolist()

def get_indices_of_k_smallest_as_tuple(arr: np.ndarray, k: int) -> Tuple:
    idx = np.argpartition(arr.ravel(), k)
    return tuple(np.array(np.unravel_index(idx, arr.shape))[:, range(min(k, 0), max(k, 0))])
This answer gives the correct indices, but those indices aren't sorted by the size of the elements. That's just how the introselect algorithm works, which np.argpartition uses under the hood (https://en.wikipedia.org/wiki/Introselect).
It would be nice if the return value were also sorted by the size of the elements, e.g. index 0 of the return points to the smallest element, index 1 points to the 2nd smallest element, etc.
Here's how to do it with sorting. Keep in mind that sorting the results after np.argpartition is going to be much faster than sorting the entire multi-dimensional array.
def get_indices_of_k_smallest_as_array(arr: np.ndarray, k: int) -> np.ndarray:
    ravel_array = arr.ravel()
    indices_on_ravel = np.argpartition(ravel_array, k)
    sorted_indices_on_ravel = sorted(indices_on_ravel, key=lambda x: ravel_array[x])
    sorted_indices_on_original = np.array(np.unravel_index(sorted_indices_on_ravel, arr.shape))[:, range(k)].transpose().tolist()
    # for the fun of numpy indexing, you can do it this way too
    # indices_on_original = np.array(np.unravel_index(indices_on_ravel, arr.shape))[:, range(k)].transpose().tolist()
    # sorted_indices_on_original = sorted(indices_on_original, key=lambda x: arr[tuple(np.array(x).T)])
    return sorted_indices_on_original

def get_indices_of_k_smallest_as_tuple(arr: np.ndarray, k: int) -> Tuple:
    ravel_array = arr.ravel()
    indices_on_ravel = np.argpartition(ravel_array, k)
    sorted_indices_on_ravel = sorted(indices_on_ravel, key=lambda x: ravel_array[x])
    sorted_indices_on_original = tuple(
        np.array(np.unravel_index(sorted_indices_on_ravel, arr.shape))[:, range(min(k, 0), max(k, 0))]
    )
    return sorted_indices_on_original
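For illustration, here is a minimal usage sketch of the sorted variant above (the small input array and the hand-checked expected output are mine, not part of the original answer):

import numpy as np

arr = np.array([[7, 2, 9],
                [4, 1, 8]])

# the three smallest values are 1, 2 and 4, located at (1, 1), (0, 1) and (1, 0)
print(get_indices_of_k_smallest_as_array(arr, 3))
# [[1, 1], [0, 1], [1, 0]]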

Related

How to aggregate two rows with the same value in another column?

I have a dataframe like this:
name        cluster
'sock'      1
'graceful'  2
'disgrace'  2
'fixture'   3
'winnow'    4
'window'    4
and a function common_substring that takes in two strings and returns the longest common substring.
I want to return
name       cluster
'sock'     1
'grace'    2
'fixture'  3
'win'      4
I know I'm supposed to group by cluster, but I'm struggling with how to aggregate and apply a function that takes two values.
Based on @jezrael's answer and the linked question:
from functools import partial, reduce
from itertools import chain
from typing import Iterator

def ngram(seq: str, n: int) -> Iterator[str]:
    return (seq[i: i+n] for i in range(0, len(seq)-n+1))

def allngram(seq: str) -> set:
    lengths = range(len(seq))
    ngrams = map(partial(ngram, seq), lengths)
    return set(chain.from_iterable(ngrams))

def func(a, b):
    seqs_ngrams = map(allngram, [a, b])
    intersection = reduce(set.intersection, seqs_ngrams)
    longest = max(intersection, key=len)
    return longest

def f(x):
    try:
        return func(x.iat[0], x.iat[1])
    except:
        return 'Unique value'

df = df.groupby('cluster')['name'].agg(f).reset_index()
print(df)
Demonstration:
df = pd.DataFrame({'name':['winnow', 'window'], 'cluster':[4,4]})
df = df.groupby('cluster')['name'].agg(f).reset_index()
Output:
   cluster name
0        4  win
For input of any length:
from functools import partial, reduce
from itertools import chain
from typing import Iterator

import pandas as pd

def ngram(seq: str, n: int) -> Iterator[str]:
    return (seq[i: i+n] for i in range(0, len(seq)-n+1))

def allngram(seq: str) -> set:
    lengths = range(len(seq)+1)
    ngrams = map(partial(ngram, seq), lengths)
    return set(chain.from_iterable(ngrams))

def func(seq):
    seq = seq.values
    seqs_ngrams = map(allngram, seq)
    intersection = reduce(set.intersection, seqs_ngrams)
    longest = max(intersection, key=len)
    return longest

def f(x):
    return func(x)

df = pd.DataFrame({'name':['winnow', 'window', 'win', 'sock', 'graceful', 'disgrace', 'fixture'], 'cluster':[4,4,4,1,2,2,3]})
df = df.groupby('cluster')['name'].agg(f).reset_index()
Output:
   cluster     name
0        1     sock
1        2    grace
2        3  fixture
3        4      win
Use GroupBy.agg with a custom aggregation function: each group's values are passed to the function as a whole, which then computes their longest common substring:
# https://stackoverflow.com/questions/40556491
from functools import partial, reduce
from itertools import chain
from typing import Iterator

def ngram(seq: str, n: int) -> Iterator[str]:
    return (seq[i: i+n] for i in range(0, len(seq)-n+1))

def allngram(seq: str) -> set:
    lengths = range(len(seq)+1)
    ngrams = map(partial(ngram, seq), lengths)
    return set(chain.from_iterable(ngrams))

def func(x):
    seqs_ngrams = map(allngram, x.tolist())
    intersection = reduce(set.intersection, seqs_ngrams)
    longest = max(intersection, key=len)
    return longest

df['name'] = df['name'].str.strip("'")
df = df.groupby('cluster')['name'].agg(func).reset_index()
print(df)
   cluster     name
0        1     sock
1        2    grace
2        3  fixture
3        4      win
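As a quick sanity check, the aggregation function can also be called on a single group directly. A small illustration of mine, assuming func and its helpers from the last snippet are already defined:

import pandas as pd

print(func(pd.Series(['graceful', 'disgrace'])))
# grace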

Get index in matrix using coordinates from linspace

I have a matrix m defined by two linspaces x and y:
x = np.linspace(-1, 1, 100).reshape(1, 100)
y = np.linspace(-1, 1, 100).reshape(100, 1)
m = x + 1j * y
# > m.shape
# (100, 100)
What if I want to set a value to a point at coordinates defined in the given linspace, not the array index?
For example, I would want to write something like this:
m[-0.5, 0.123] = val
Is there an elegant way in numpy to convert from a coordinate system to the closest index in the array?
Here are two possible takes. You could use np.searchsorted to find an index based on the value, provided that x and y are increasing. If x and y are evenly spaced (as in your linspace example), then you can approximate the index of a value val by (val - (-1)) / (1 - (-1)) * 100 in O(1) time.
import numpy as np
z = np.linspace(-1, 1, 100)
m = z[None,:] + 1j * z[:,None]
# using searchsorted
f = lambda v: np.searchsorted(z, v)
# approximate the index using the fact that numbers in z are evenly spaced
g = lambda v: int((v - z[0]) / (z[-1] - z[0]) * z.shape[0])
print(m[f(0.5), f(0.123)])
# (0.1313131313131315+0.5151515151515154j)
print(m[g(0.5), g(0.123)])
# (0.1313131313131315+0.5151515151515154j)
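Either helper answers the original question of assigning at linspace coordinates rather than array indices. A small sketch reusing f and g from above (val is a placeholder value of my choosing):

val = 42 + 0j
# the intent of m[-0.5, 0.123] = val, expressed with grid indices
m[f(-0.5), f(0.123)] = val
# or, using the O(1) approximation for an evenly spaced grid
m[g(-0.5), g(0.123)] = val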

Binary-search without an explicit array

I want to perform a binary-search using e.g. np.searchsorted, however, I do not want to create an explicit array containing values. Instead, I want to define a function giving the value to be expected at the desired position of the array, e.g. p(i) = i, where i denotes the position within the array.
Generating an array of values regarding the function would, in my case, be neither efficient nor elegant. Is there any way to achieve this?
What about something like:
import collections.abc

class GeneratorSequence(collections.abc.Sequence):
    def __init__(self, func, size):
        self._func = func
        self._len = size

    def __len__(self):
        return self._len

    def __getitem__(self, i):
        if 0 <= i < self._len:
            return self._func(i)
        else:
            raise IndexError

    def __iter__(self):
        for i in range(self._len):
            yield self[i]
This would work with np.searchsorted(), e.g.:
import numpy as np
gen_seq = GeneratorSequence(lambda x: x ** 2, 100)
np.searchsorted(gen_seq, 9)
# 3
You could also write your own binary search function; you do not really need NumPy in this case, and doing so can actually be beneficial:
def bin_search(seq, item):
    first = 0
    last = len(seq) - 1
    found = False
    while first <= last and not found:
        midpoint = (first + last) // 2
        if seq[midpoint] == item:
            first = midpoint
            found = True
        else:
            if item < seq[midpoint]:
                last = midpoint - 1
            else:
                first = midpoint + 1
    return first
Which gives identical results:
all(bin_search(gen_seq, i) == np.searchsorted(gen_seq, i) for i in range(100))
# True
Incidentally, this is also WAY faster, because np.searchsorted first converts the whole sequence into an array (evaluating every element), while bin_search only evaluates the O(log n) elements it actually visits:
gen_seq = GeneratorSequence(lambda x: x ** 2, 1000000)
%timeit np.searchsorted(gen_seq, 10000)
# 1 loop, best of 3: 1.23 s per loop
%timeit bin_search(gen_seq, 10000)
# 100000 loops, best of 3: 16.1 µs per loop
Inspired by @norok2's comment, I think you can use something like this:
from collections.abc import Sequence

def f(i):
    return i * 2  # Just an example

class MySeq(Sequence):
    def __init__(self, f, maxi):
        self.maxi = maxi
        self.f = f

    def __getitem__(self, x):
        if x < 0 or x > self.maxi:
            raise IndexError()
        return self.f(x)

    def __len__(self):
        return self.maxi + 1
In this case f is your function and maxi is the maximum index. This of course only works if the function f returns values in sorted order.
At this point you can use an object of type MySeq inside np.searchsorted.
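For instance, a brief usage sketch (the size and searched value are arbitrary; MySeq and f are as defined above):

import numpy as np

seq = MySeq(f, 1000)            # conceptually the sorted values 0, 2, 4, ..., 2000
print(np.searchsorted(seq, 10))
# 5, since f(5) == 10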

Cython Typing List of Strings

I'm trying to use cython to improve the performance of a loop, but I'm running
into some issues declaring the types of the inputs.
How do I include a field in my typed struct which is a string that can be either 'front' or 'back'?
I have a np.recarray that looks like the following (note that the length of the recarray is unknown at compile time):
import numpy as np
weights = np.recarray(4, dtype=[('a', np.int64), ('b', np.str_, 5), ('c', np.float64)])
weights[0] = (0, "front", 0.5)
weights[1] = (0, "back", 0.5)
weights[2] = (1, "front", 1.0)
weights[3] = (1, "back", 0.0)
as well as inputs of a list of strings and a pandas.Timestamp
import pandas as pd
ts = pd.Timestamp("2015-01-01")
contracts = ["CLX16", "CLZ16"]
I am trying to cythonize the following loop
def ploop(weights, contracts, timestamp):
    cwts = []
    for gen_num, position, weighting in weights:
        if weighting != 0:
            if position == "front":
                cntrct_idx = gen_num
            elif position == "back":
                cntrct_idx = gen_num + 1
            else:
                raise ValueError("transition.columns must contain "
                                 "'front' or 'back'")
            cwts.append((gen_num, contracts[cntrct_idx], weighting, timestamp))
    return cwts
My attempt involved typing the weights input as a struct in Cython, in a file struct_test.pyx, as follows:
import numpy as np
cimport numpy as np

cdef packed struct tstruct:
    np.int64_t gen_num
    char[5] position
    np.float64_t weighting

def cloop(tstruct[:] weights_array, contracts, timestamp):
    cdef tstruct weights
    cdef int i
    cdef int cntrct_idx
    cwts = []
    for k in xrange(len(weights_array)):
        w = weights_array[k]
        if w.weighting != 0:
            if w.position == "front":
                cntrct_idx = w.gen_num
            elif w.position == "back":
                cntrct_idx = w.gen_num + 1
            else:
                raise ValueError("transition.columns must contain "
                                 "'front' or 'back'")
            cwts.append((w.gen_num, contracts[cntrct_idx], w.weighting,
                         timestamp))
    return cwts
But I am receiving runtime errors, which I believe are related to the
char[5] position.
import pyximport
pyximport.install()
import struct_test
struct_test.cloop(weights, contracts, ts)
ValueError: Does not understand character buffer dtype format string ('w')
In addition, I am a bit unclear about how I would go about typing contracts as well as timestamp.
Your ploop (without the timestamp variable) produces:
In [226]: ploop(weights, contracts)
Out[226]: [(0, 'CLX16', 0.5), (0, 'CLZ16', 0.5), (1, 'CLZ16', 1.0)]
Equivalent function without a loop:
def ploopless(weights, contracts):
    arr_contracts = np.array(contracts)  # to allow array indexing
    wgts1 = weights[weights['c'] != 0]
    mask = wgts1['b'] == 'front'
    wgts1['b'][mask] = arr_contracts[wgts1['a'][mask]]
    mask = wgts1['b'] == 'back'
    wgts1['b'][mask] = arr_contracts[wgts1['a'][mask] + 1]
    return wgts1.tolist()
In [250]: ploopless(weights, contracts)
Out[250]: [(0, 'CLX16', 0.5), (0, 'CLZ16', 0.5), (1, 'CLZ16', 1.0)]
I'm taking advantage of the fact that the returned list of tuples has the same (int, str, float) layout as the input weights array. So I'm just making a copy of weights and replacing selected values of the b field.
Note that I apply the field-selection index before the boolean mask. A boolean mask produces a copy, so we have to be careful about the indexing order.
I'm guessing that loop-less array version will be competitive in time with the cloop (on realistic arrays). The string and list operations in cloop probably limit its speedup.
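To illustrate the indexing-order point above, here is a small self-contained sketch (the toy structured array is mine):

import numpy as np

a = np.array([(1, 'x'), (2, 'y')], dtype=[('n', np.int64), ('s', 'U1')])
mask = a['n'] == 1
a['s'][mask] = 'z'    # field view first, then boolean mask: writes into a
# a[mask]['s'] = 'q'  # boolean mask first returns a copy, so this assignment would be lost
print(a)              # [(1, 'z') (2, 'y')]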

Row-wise Histogram

Given a 2-dimensional tensor t, what's the fastest way to compute a tensor h where
h[i, :] = tf.histogram_fixed_width(t[i, :], vals, nbins)
I.e. where tf.histogram_fixed_width is called per row of the input tensor t?
It seems that tf.histogram_fixed_width is missing an axis parameter that works like, e.g., tf.reduce_sum's axis parameter.
tf.histogram_fixed_width does indeed operate on the entire tensor. You have to loop through the rows explicitly to compute the per-row histograms. Here is a complete working example using TensorFlow's tf.while_loop construct:
import tensorflow as tf

t = tf.random_uniform([2, 2])

i = 0
hist = tf.constant(0, shape=[0, 5], dtype=tf.int32)

def loop_body(i, hist):
    h = tf.histogram_fixed_width(t[i, :], [0.0, 1.0], nbins=5)
    return i + 1, tf.concat_v2([hist, tf.expand_dims(h, 0)], axis=0)

i, hist = tf.while_loop(
    lambda i, _: i < 2, loop_body, [i, hist],
    shape_invariants=[tf.TensorShape([]), tf.TensorShape([None, 5])])

sess = tf.InteractiveSession()
print(hist.eval())
Inspired by keveman's answer, and because the number of rows of t is fixed and rather small, I chose to use a combination of tf.gather to split rows and tf.pack to join rows. It looks simple and works; we will see whether it is efficient...
t_histo_rows = [
    tf.histogram_fixed_width(
        tf.gather(t, [row]),
        vals, nbins)
    for row in range(t_num_rows)]

t_histo = tf.pack(t_histo_rows, axis=0)
I would like to propose another implementation.
This implementation can also handle multiple axes and unknown dimensions (batching).
import numpy as np
import tensorflow as tf

def histogram(tensor, nbins=10, axis=None):
    value_range = [tf.reduce_min(tensor), tf.reduce_max(tensor)]

    if axis is None:
        return tf.histogram_fixed_width(tensor, value_range, nbins=nbins)
    else:
        if not hasattr(axis, "__len__"):
            axis = [axis]

        other_axis = [x for x in range(0, len(tensor.shape)) if x not in axis]
        swap = tf.transpose(tensor, [*other_axis, *axis])
        flat = tf.reshape(swap, [-1, *np.take(tensor.shape.as_list(), axis)])

        count = tf.map_fn(lambda x: tf.histogram_fixed_width(x, value_range, nbins=nbins),
                          flat, dtype=(tf.int32))

        return tf.reshape(count,
                          [*np.take([-1 if a is None else a for a in tensor.shape.as_list()], other_axis),
                           nbins])
The only slow part here is tf.map_fn but it is still faster than the other solutions mentioned.
If someone knows an even faster implementation, please comment, since this operation is still very expensive.
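As a hedged usage sketch of the histogram function above (the input shape and bin count are arbitrary, and a TF 2.x eager environment is assumed):

import tensorflow as tf

t = tf.random.uniform([4, 128])     # 4 rows, 128 samples each
h = histogram(t, nbins=10, axis=1)  # one 10-bin histogram per row
print(h.shape)                      # (4, 10)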
The answers above are still slow when running on a GPU. Here I give another option, which is faster (at least in my running environment), but it is limited to values in the range 0~1 (you can normalize the values first). The train_equal_mask_nbin can be defined once in advance.
def histogram_v3_nomask(tensor, nbins, row_num, col_num):
    # init mask
    equal_mask_list = []
    for i in range(nbins):
        equal_mask_list.append(tf.ones([row_num, col_num], dtype=tf.int32) * i)
    # [nbins, row, col]
    # [0, row, col] is a tensor of shape [row, col] with all values 0
    # [1, row, col] is a tensor of shape [row, col] with all values 1
    # ...
    train_equal_mask_nbin = tf.stack(equal_mask_list, axis=0)

    # [inst, doc_len] float to int (equally segment the floats into bins)
    int_input = tf.cast(tensor * nbins, dtype=tf.int32)
    # input [row, col] -> copied nbins times, [nbins, row_num, col_num]
    int_input_nbin_copy = tf.reshape(tf.tile(int_input, [nbins, 1]), [nbins, row_num, col_num])
    # calculate the histogram by counting matches along the column axis
    histogram = tf.transpose(tf.count_nonzero(tf.equal(train_equal_mask_nbin, int_input_nbin_copy), axis=2))
    return histogram
With the advent of tf.math.bincount, I believe the problem has become much simpler.
Something like this should work:
def hist_fixed_width(x, st, en, nbins):
    x = (x - st) / (en - st)
    x = tf.cast(x * nbins, dtype=tf.int32)
    x = tf.clip_by_value(x, 0, nbins - 1)
    return tf.math.bincount(x, minlength=nbins, axis=-1)
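A short usage sketch (shapes and bin count are arbitrary; this assumes a TensorFlow version in which tf.math.bincount accepts the axis argument):

import tensorflow as tf

t = tf.random.uniform([3, 1000])             # 3 rows of samples in [0, 1)
h = hist_fixed_width(t, 0.0, 1.0, nbins=10)  # one 10-bin histogram per row
print(h.shape)                               # (3, 10)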