numpy / linear algebra - fast 16-bit histogram

If I have an image made of uint16s and want to compute a histogram over every possible 16-bit value, i.e. a vector x of 0..65535 containing the intensity values and a vector y giving the number of samples that have each value, is there a vectorized numpy / linear algebra way to compute this?

I did it the obvious way with Numpy, and using your image dimensions on my Mac, it takes 300ms. I then did the same thing with OpenCV and it is 33x faster at 9ms!
#!/usr/bin/env python3

import cv2
import numpy as np

# Dimensions - height, width
h, w = 2160, 2560

# Known image, channel0=1, channel1=3, channel2=5, channel3=65535
R = np.zeros((h, w, 4), dtype=np.uint16)
R[..., 0] = 1
R[..., 1] = 3
R[..., 2] = 5
R[..., 3] = 65535

def npHistogram(R):
    """Generate histogram using Numpy"""
    H, _ = np.histogram(R, 65536)
    return H

def OpenCVHistogram(R):
    """Generate histogram using OpenCV"""
    H = cv2.calcHist([R.ravel()], [0], None, [65536], [0, 65536])
    return H

A = npHistogram(R)
B = OpenCVHistogram(R)

# %timeit npHistogram(R)
# %timeit OpenCVHistogram(R)
Results
Using IPython, I got these timings
%timeit npHistogram(R)
300 ms ± 11.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit OpenCVHistogram(R)
9.02 ms ± 226 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Keywords: Python, histogram, slow, Numpy, np.histogram, speedup, OpenCV, image processing.
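As an aside (not part of the original answers): if you want to stay in pure NumPy, np.bincount on the raveled uint16 data is usually much faster than np.histogram here, because the pixel values can be used directly as bin indices instead of being searched against bin edges. A minimal sketch; timings will of course vary by machine:

import numpy as np

def npBincountHistogram(R):
    """Histogram of a uint16 image using np.bincount"""
    # uint16 values already are integer bin indices, so no bin-edge search is needed
    return np.bincount(R.ravel(), minlength=65536)

y = npBincountHistogram(R)   # counts for every value 0..65535
x = np.arange(65536)         # the corresponding intensity values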

Ok, if OpenCV is too big a dependency for you to get 9ms processing time instead of 300ms, how about Numba? This runs in 10ms.
#!/usr/bin/env python3

import numpy as np
from numba import jit

# Dimensions - height, width
h, w = 2160, 2560

# Known image, channel0=1, channel1=3, channel2=5, channel3=65535
R = np.zeros((h, w, 4), dtype=np.uint16)
R[..., 0] = 1
R[..., 1] = 3
R[..., 2] = 5
R[..., 3] = 65535

@jit(nopython=True, nogil=True)
def NumbaHistogram(pixels):
    """Histogram of uint16 image"""
    H = np.zeros(65536, dtype=np.int32)
    for i in range(len(pixels)):
        H[pixels[i]] += 1
    return H

# %timeit q = NumbaHistogram(R.ravel())
Results
%timeit NumbaHistogram(R.ravel())
10.4 ms ± 54.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
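If you need more throughput than that (my addition, not part of the original answer), a common pattern is to give each thread a private histogram and merge them at the end, e.g. with numba.prange. A sketch, assuming a thread count of 4 is appropriate for your machine:

import numpy as np
from numba import njit, prange

@njit(nogil=True, parallel=True)
def NumbaHistogramParallel(pixels, nthreads=4):
    """Per-thread uint16 histograms, merged at the end to avoid write races"""
    H = np.zeros((nthreads, 65536), dtype=np.int64)
    chunk = (pixels.size + nthreads - 1) // nthreads
    for t in prange(nthreads):
        lo = t * chunk
        hi = min(lo + chunk, pixels.size)
        for i in range(lo, hi):
            H[t, pixels[i]] += 1
    # merge the per-thread histograms
    out = np.zeros(65536, dtype=np.int64)
    for t in range(nthreads):
        out += H[t]
    return out

# %timeit NumbaHistogramParallel(R.ravel())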

Related

Removing items in one 2d numpy array from another

I have a function, foo, which returns an np array containing every possible combination of np.arange(n) when k numbers are removed.
import numpy as np
from itertools import combinations

def foo(n, k):
    return np.array([np.delete(np.arange(n), i) for i in combinations(range(n), k)])
The output of this function is correct, but the list comprehension it uses means a longer processing time when larger numbers are involved. Is there a more efficient solution to this using pure numpy?
I have tried using np.delete with idx as the indices to remove (a 2d array that contains the values to remove in each row), along with a broadcast np.arange, without success:
import numpy as np
from itertools import combinations
k = 2
n = 15
idx = np.array([i for i in combinations(range(n),k)])
arr = np.broadcast_to(np.arange(n), (idx.shape[0],n))
res = np.delete(arr, idx, axis=1)
This code produces an empty array.
"Is there a more efficient solution to this using pure numpy?" . No.
itertools is efficient. Instead of deleting k elements, choose (n-k) elements.
import numpy as np
from itertools import combinations

def foo_new(n, k):
    return list(combinations(np.arange(n), n - k))

def foo_old(n, k):  # your function
    return np.array([np.delete(np.arange(n), i) for i in combinations(range(n), k)])
# In [5]: %timeit foo_new(25,5)
# 3.77 ms ± 62.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# In [6]: %timeit foo_old(25,5)
# 151 ms ± 676 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
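If you do want a pure-NumPy construction of the same array (my addition; not necessarily faster than the itertools version above), one option is to turn idx into a boolean mask and keep the surviving columns. A sketch, assuming np.put_along_axis is available (NumPy >= 1.15):

import numpy as np
from itertools import combinations

def foo_mask(n, k):
    # each row of idx holds the k positions to drop for one combination
    idx = np.array(list(combinations(range(n), k)))       # shape (C(n,k), k)
    mask = np.ones((idx.shape[0], n), dtype=bool)
    np.put_along_axis(mask, idx, False, axis=1)           # mark dropped positions
    base = np.broadcast_to(np.arange(n), mask.shape)
    return base[mask].reshape(-1, n - k)                  # the remaining n-k values per row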

Understanding Numba Performance Differences

I'm trying to understand the performance differences I am seeing when using various numba implementations of an algorithm. In particular, I would expect func1d below to be the fastest implementation since it is the only algorithm that does not copy data; however, from my timings func1b appears to be the fastest.
import numpy
import numba

def func1a(data, a, b, c):
    # pure numpy
    return a * (1 + numpy.tanh((data / b) - c))

@numba.njit(fastmath=True)
def func1b(data, a, b, c):
    new_data = a * (1 + numpy.tanh((data / b) - c))
    return new_data

@numba.njit(fastmath=True)
def func1c(data, a, b, c):
    new_data = numpy.empty(data.shape)
    for i in range(new_data.shape[0]):
        for j in range(new_data.shape[1]):
            new_data[i, j] = a * (1 + numpy.tanh((data[i, j] / b) - c))
    return new_data

@numba.njit(fastmath=True)
def func1d(data, a, b, c):
    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            data[i, j] = a * (1 + numpy.tanh((data[i, j] / b) - c))
    return data
Helper functions for testing memory copying
def get_data_base(arr):
    """For a given NumPy array, find the base array
    that owns the actual data.
    https://ipython-books.github.io/45-understanding-the-internals-of-numpy-to-avoid-unnecessary-array-copying/
    """
    base = arr
    while isinstance(base.base, numpy.ndarray):
        base = base.base
    return base

def arrays_share_data(x, y):
    return get_data_base(x) is get_data_base(y)

def test_share(func):
    data = numpy.random.randn(100, 3)
    print(arrays_share_data(data, func(data, 0.5, 2.5, 2.5)))
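To illustrate what the helper reports (a quick example of my own, assuming numpy and arrays_share_data from above are in scope): a basic slice is a view and shares its buffer with the parent array, while .copy() allocates a new one.

a = numpy.arange(10)
v = a[2:5]           # slicing returns a view that shares a's buffer
c = a[2:5].copy()    # .copy() allocates a new buffer
print(arrays_share_data(a, v))  # True
print(arrays_share_data(a, c))  # False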
Timings
# force compiling
data = numpy.random.randn(10_000, 300)
_ = func1a(data, 0.5, 2.5, 2.5)
_ = func1b(data, 0.5, 2.5, 2.5)
_ = func1c(data, 0.5, 2.5, 2.5)
_ = func1d(data, 0.5, 2.5, 2.5)
data = numpy.random.randn(10_000, 300)
%timeit func1a(data, 0.5, 2.5, 2.5)
%timeit func1b(data, 0.5, 2.5, 2.5)
%timeit func1c(data, 0.5, 2.5, 2.5)
%timeit func1d(data, 0.5, 2.5, 2.5)
67.2 ms ± 230 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
13 ms ± 10.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
69.8 ms ± 60.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
69.8 ms ± 105 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Test which implementations copy memory
test_share(func1a)
test_share(func1b)
test_share(func1c)
test_share(func1d)
False
False
False
True
Here, copying of data doesn't play a big role: the bottleneck is how fast the tanh function is evaluated. There are many tanh algorithms: some are faster, some are slower, some are more precise, some less so.
Different numpy distributions use different implementations of the tanh function, e.g. it could be the one from mkl/vml or the one from the gnu-math-library.
Depending on the numba version, either the mkl/svml implementation or the gnu-math-library one is used.
The easiest way to look inside is to use a profiler, for example perf.
For the numpy-version on my machine I get:
>>> perf record python run.py
>>> perf report
Overhead Command Shared Object Symbol
46,73% python libm-2.23.so [.] __expm1
24,24% python libm-2.23.so [.] __tanh
4,89% python _multiarray_umath.cpython-37m-x86_64-linux-gnu.so [.] sse2_binary_scalar2_divide_DOUBLE
3,59% python [unknown] [k] 0xffffffff8140290c
As one can see, numpy uses the slow gnu-math-library (libm) functionality.
For the numba-function I get:
53,98% python libsvml.so [.] __svml_tanh4_e9
3,60% python [unknown] [k] 0xffffffff81831c57
2,79% python python3.7 [.] _PyEval_EvalFrameDefault
which means that fast mkl/svml functionality is used.
That is (almost) all there is to it.
As @user2640045 has rightly pointed out, the numpy performance is hurt by additional cache misses due to the creation of temporary arrays.
However, cache misses don't play such a big role as the calculation of tanh:
%timeit func1a(data, 0.5, 2.5, 2.5) # 91.5 ms ± 2.88 ms per loop
%timeit numpy.tanh(data) # 76.1 ms ± 539 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
i.e. creation of temporary objects is responsible for around 20% of the running time.
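As an aside (my addition, not part of this answer), those temporaries can be reduced in plain NumPy by routing the chained ufuncs through a single out buffer; a minimal sketch:

import numpy

def func1a_inplace(data, a, b, c):
    # one allocation instead of four: every subsequent step reuses the same buffer
    out = numpy.divide(data, b)          # out = data / b
    numpy.subtract(out, c, out=out)      # out -= c
    numpy.tanh(out, out=out)             # out = tanh(out)
    numpy.add(out, 1.0, out=out)         # out += 1
    numpy.multiply(out, a, out=out)      # out *= a
    return out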
FWIW, for the version with the handwritten loops, my numba version (0.50.1) is also able to vectorize and call the mkl/svml functionality. If that does not happen for some other version, numba falls back to the gnu-math-library functionality, which seems to be what is happening on your machine.
Listing of run.py:
import numpy

# TODO: define func1b for checking numba

def func1a(data, a, b, c):
    # pure numpy
    return a * (1 + numpy.tanh((data / b) - c))

data = numpy.random.randn(10_000, 300)
for _ in range(100):
    func1a(data, 0.5, 2.5, 2.5)
The performance difference is NOT in the evaluation of the tanh-function
I must disagree with @ead. Let's assume for the moment that
the main performance difference is in the evaluation of the tanh-function
Then one would expect that running just tanh from numpy and numba with fast math would show that speed difference.
import numpy as np
import numba as nb

def func_a(data):
    return np.tanh(data)

@nb.njit(fastmath=True)
def func_b(data):
    new_data = np.tanh(data)
    return new_data

data = np.random.randn(10_000, 300)
%timeit func_a(data)
%timeit func_b(data)
Yet on my machine the above code shows almost no difference in performance.
15.7 ms ± 129 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
15.8 ms ± 82 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Short detour on NumExpr
I tried a NumExpr version of your code. But before being amazed that it runs almost 7 times faster, you should keep in mind that it uses all 10 cores available on my machine. After allowing numba to run in parallel too and optimising that a little bit, the performance benefit is small but still there: 2.56 ms vs 3.87 ms. See the code below.
import numpy as np
import numba as nb
import numexpr as ne

# parameter values as used in the timings above
a, b, c = 0.5, 2.5, 2.5

@nb.njit(fastmath=True)
def func_a(data):
    new_data = a * (1 + np.tanh((data / b) - c))
    return new_data

@nb.njit(fastmath=True, parallel=True)
def func_b(data):
    new_data = a * (1 + np.tanh((data / b) - c))
    return new_data

@nb.njit(fastmath=True, parallel=True)
def func_c(data):
    for i in nb.prange(data.shape[0]):
        for j in range(data.shape[1]):
            data[i, j] = a * (1 + np.tanh((data[i, j] / b) - c))
    return data

def func_d(data):
    return ne.evaluate('a * (1 + tanh((data / b) - c))')

data = np.random.randn(10_000, 300)
%timeit func_a(data)
%timeit func_b(data)
%timeit func_c(data)
%timeit func_d(data)
17.4 ms ± 146 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
4.31 ms ± 193 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
3.87 ms ± 152 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.56 ms ± 104 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
The actual explanation
The ~34% of time that NumExpr saves compared to numba is nice, but even nicer is that they have a concise explanation of why they are faster than numpy. I am pretty sure that this applies to numba too.
From the NumExpr github page:
The main reason why NumExpr achieves better performance than NumPy is
that it avoids allocating memory for intermediate results. This
results in better cache utilization and reduces memory access in
general.
So
a * (1 + numpy.tanh((data / b) - c))
is slower because it does a lot of steps producing intermediate results.
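To make that concrete, the single expression expands into roughly this sequence of temporary arrays (an illustrative sketch of my own; each step allocates and fills a full (10_000, 300) array, so the data makes several round trips through memory):

t1 = data / b            # temporary 1
t2 = t1 - c              # temporary 2
t3 = numpy.tanh(t2)      # temporary 3
t4 = 1 + t3              # temporary 4
result = a * t4          # final array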

Matrix multiplication in Numpy takes too much time

I am trying to simply implement a loss function (MSE) in Python using numpy and this is my code:
import numpy as np

def loss(X, y, w):
    N = X.shape[0]
    X_new = np.concatenate((np.ones((N, 1)), X), axis=1)
    E = y - np.matmul(X_new, w)
    E_t = np.transpose(E)
    loss_value = (1 / N) * np.matmul(E_t, E)
    return loss_value
The dimension of E is (15000, 1) and E_t is obviously (1,15000). However, when debugging, I realized that np.matmul(E_t,E) takes too much time. I have a laptop with 16GB of RAM and Core i7, so it's weird for me that np.matmul is failing here. Is this normal if the matrices I am dealing with have these dimensions?
On a rather basic 4GB machine:
In [477]: E=np.ones((15000, 1))
In [478]: E.T#E
Out[478]: array([[15000.]])
In [479]: timeit E.T#E
10.5 µs ± 241 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
You don't tell us anything about X, but assuming the worst case:
In [480]: E=np.ones((15000, 1),object)
In [481]: E.T#E
Out[481]: array([[15000]], dtype=object)
In [482]: timeit E.T#E
577 µs ± 492 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
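In other words, if E ends up with dtype=object (for example because X came from a pandas DataFrame with mixed column types), np.matmul falls back to slow Python-level arithmetic instead of BLAS. A hedged suggestion, assuming that is what happens in your case: check the dtype and cast back to a native float.

import numpy as np

E = np.ones((15000, 1), dtype=object)   # pathological case: object dtype
print(E.dtype)                           # object -> element-wise Python loop, slow
E = E.astype(np.float64)                 # native float dtype -> fast BLAS path
print((E.T @ E).item())                  # 15000.0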

Why is random.choices faster than NumPy's random.choice?

I am trying to do random sampling in the most efficient way in Python; however, I am puzzled because numpy's np.random.choice() turned out to be slower than the standard library's random.choices().
import numpy as np
import random
np.random.seed(12345)
# use gamma distribution
shape, scale = 2.0, 2.0
s = np.random.gamma(shape, scale, 1000000)
meansample = []
samplesize = 500
%timeit meansample = [ np.mean( np.random.choice( s, samplesize, replace=False)) for _ in range(500)]
23.3 s ± 229 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit meansample = [np.mean(random.choices(s, k=samplesize)) for x in range(0,500)]
152 ms ± 324 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
23 seconds vs 152 ms is a lot of time.
What am I doing wrong?
Two issues here. First, for the pure-Python random library, you probably mean to use sample instead of choices in order to sample without replacement; that alters the benchmark somewhat. Second, np.random.choice has better-performing alternatives for sampling without replacement. This is a known issue related to the legacy random generator API; you can use np.random.Generator to get better performance. My timings:
%timeit meansample = [ np.mean( np.random.choice( s, samplesize, replace=False)) for _ in range(500)]
# 1 loop, best of 3: 12.4 s per loop
%timeit meansample = [np.mean(random.choices(s, k=samplesize)) for x in range(0,500)]
# 10 loops, best of 3: 118 ms per loop
sl = s.tolist()
%timeit meansample = [np.mean(random.sample(sl, k=samplesize)) for x in range(0,500)]
# 1 loop, best of 3: 219 ms per loop
g = np.random.Generator(np.random.PCG64())
%timeit meansample = [ np.mean( g.choice( s, samplesize, replace=False)) for _ in range(500)]
# 10 loops, best of 3: 25 ms per loop
So, without replacement, random.sample outperforms np.random.choice but is slower than np.random.Generator.choice.
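One more knob worth knowing (my addition): Generator.choice accepts shuffle=False, which skips the final shuffle of the drawn sample. That is safe here because the mean does not depend on sample order, and the NumPy docs note it provides a further speedup.

import numpy as np

rng = np.random.default_rng(12345)
s = rng.gamma(2.0, 2.0, 1_000_000)
samplesize = 500

# shuffle=False skips shuffling the selected sample, which is fine when only
# an order-independent statistic such as the mean is needed
meansample = [np.mean(rng.choice(s, samplesize, replace=False, shuffle=False))
              for _ in range(500)]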

Calculating distances between vectors of two numpy arrays

I have two numpy arrays: R with dimensions S x F and W with dimensions N x M x F. To get concrete, let's assign the following values: N = 5, M = 7, F = 3, S = 4.
The array R contains a collection of S = 4 samples with F = 3 features each. Each row represents a sample and each column a feature, so R[0] is the first sample, R[1] the second, and so on. Each R[i] entry contains F elements, for example R[0] = np.array([1, 4, -2]).
Here is a small snippet to initialize all those values, with an MWE in mind:
import numpy as np
# Size of Map (rows, columns)
N, M = 5, 7
# Number of features
F = 3
# Sample size
S = 4
np.random.seed(13)
R = np.random.randint(0, 10, size=(S, F))
W = np.random.randint(-4, 5, size=(N, M, F))
We can also see a given "depth line" of the numpy array W as a vector with the same dimension as each row of array R (this is easy to see from the size of the last dimension of both arrays). With that, I can access W[2, 3] and obtain np.array([2, 2, -1]) (the values here are just examples).
I created a simple function to calculate the distance of a given vector r to each "depth line" of matrix W and then return the position of the W depth line nearest to r:
def nearest_vector_matrix_naive(r, W):
    delta = np.zeros((N, M), dtype=int)
    for i in range(N):
        for j in range(M):
            norm = 0
            for k in range(F):
                norm += (r[k] - W[i, j, k])**2
            delta[i, j] = norm
            norm = 0
    win_idx = np.unravel_index(np.argmin(delta, axis=None), delta.shape)
    return win_idx
Of course this is a very naive approach, which I could further optimize into the code below, obtaining a HUGE performance boost.
def nearest_vector_matrix(r, W):
    delta = np.sum((W[:, :] - r)**2, axis=2)
    return np.unravel_index(np.argmin(delta, axis=None), delta.shape)
I can use this function simply as
nearest_idx = nearest_vector_matrix(R[0], W)
# Returns the nearest vector in W to R[0]
W[nearest_idx]
Since I have the array R with a bunch of samples, I use the following snippet to calculate the nearest vectors for an array of samples:
def nearest_samples_matrix(R, W):
    DELTA = np.zeros((R.shape[0], 2))
    for idx, r in enumerate(R):
        delta = np.sum((W[:, :] - r)**2, axis=2)
        DELTA[idx] = np.unravel_index(np.argmin(delta, axis=None), delta.shape)
    return DELTA
This function returns an array with S rows (S being the number of samples) of 2d indexes; that is, DELTA always has shape (S, 2).
I would like to know how I can replace the for loop inside nearest_samples_matrix (for example with broadcasting, as sketched right below) to improve the execution performance even further.
I could not figure out how to do it, even though I managed it in the first case.
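For reference, a directly broadcast version might look roughly like the sketch below (my addition, not from the question or the answers). It removes the Python loop but materialises an (S, N, M) distance array, so memory is the trade-off:

import numpy as np

def nearest_samples_matrix_broadcast(R, W):
    # (S, 1, 1, F) - (N, M, F) broadcasts to (S, N, M, F)
    delta = np.sum((R[:, None, None, :] - W) ** 2, axis=-1)       # (S, N, M)
    flat = delta.reshape(R.shape[0], -1).argmin(axis=1)           # flat winner per sample
    return np.stack(np.unravel_index(flat, W.shape[:2]), axis=1)  # (S, 2) indices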
The best solution depends on the input size of the arrays
For low-dimensional problems (dim < 20 or so), a kd-tree approach is usually the way to go. There are quite a lot of answers on this topic, e.g. one I wrote a few weeks ago.
If the dimension of the problem is too high you can switch to brute-force algorithms. Both of the following algorithms are much faster than your optimized approach, but on larger input sizes and low-dimensional problems they are much slower than a kd-tree approach (O(log n) instead of O(n^2)).
Brute force 1
The following example uses an algorithm described here. It is very fast on high-dimensional problems because most of the calculation is done in a highly optimized matrix-matrix multiplication routine.
The disadvantages are high memory usage (all distances are calculated in one function call) and precision problems, because of the more error-prone calculation method.
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def nearest_samples_matrix_2(R, W):
    R_Temp = R
    W_Temp = W.reshape(-1, W.shape[2])
    dist = euclidean_distances(R_Temp, W_Temp)
    ind_1, ind_2 = np.unravel_index(np.argmin(dist, axis=1), shape=(W.shape[0], W.shape[1]))
    return np.vstack((ind_1, ind_2)).T
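The matrix-multiplication trick referred to above is the expansion ||r - w||^2 = ||r||^2 - 2 r.w + ||w||^2, where the cross term becomes one large GEMM; that is also where the precision problems come from, since two large terms can nearly cancel. A sketch of the same idea without sklearn (my addition):

import numpy as np

def nearest_samples_matrix_dot(R, W):
    W2 = W.reshape(-1, W.shape[2]).astype(np.float64)   # (N*M, F)
    Rf = R.astype(np.float64)
    # squared distances via ||r||^2 - 2 r.w + ||w||^2; the middle term is one GEMM
    d2 = (Rf**2).sum(1)[:, None] - 2.0 * (Rf @ W2.T) + (W2**2).sum(1)[None, :]
    flat = d2.argmin(axis=1)
    return np.stack(np.unravel_index(flat, W.shape[:2]), axis=1)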
Brute force 2
This is quite similar to your naive approach, but uses a JIT-Compiler (Numba) to get good performance. Temporary arrays are not necessary and the precision should be good (as long as no overflow occurs). There is room for further optimization (loop tiling) on larger input sizes.
import numpy as np
import numba as nb

# parallelization is only beneficial on larger input data
@nb.njit(fastmath=True, parallel=True, cache=True)
def nearest_samples_matrix_3(r, W):
    out = np.empty((r.shape[0], 2), dtype=np.int64)
    for x in nb.prange(r.shape[0]):
        # start with the (0, 0) depth line as the current best for this sample
        ind_i = 0
        ind_j = 0
        delta = 0
        for k in range(W.shape[2]):
            delta += (r[x, k] - W[0, 0, k])**2
        for i in range(W.shape[0]):
            for j in range(W.shape[1]):
                norm = 0
                for k in range(W.shape[2]):
                    norm += (r[x, k] - W[i, j, k])**2
                if norm < delta:
                    delta = norm
                    ind_i = i
                    ind_j = j
        out[x, 0] = ind_i
        out[x, 1] = ind_j
    return out
Timings
#small Arrays
N, M = 100, 200
F = 30
S = 50
R = np.random.randint(0, 10, size=(S, F))
W = np.random.randint(-4, 5, size=(N, M, F))
#your function
%timeit nearest_samples_matrix(R,W)
#268 ms ± 2.94 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit nearest_samples_matrix_2(R,W)
#5.62 ms ± 22.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit nearest_samples_matrix_3(R,W)
#3.68 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
#larger arrays
N, M = 1_000, 2_000
F = 50
S = 100
R = np.random.randint(0, 10, size=(S, F))
W = np.random.randint(-4, 5, size=(N, M, F))
# %timeit nearest_samples_matrix(R, W)
#too slow
%timeit nearest_samples_matrix_2(R,W)
#2.76 s ± 17.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit nearest_samples_matrix_3(R,W)
#1.42 s ± 402 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)