Efficient use of JAX for conditional function evaluation based on an array of integers - numpy

I want to efficiently perform conditional function evaluation based on an array of integers and other arrays with real numbers serving as input for those functions. I hope to find a JAX-based solution that provides significant performance improvements over a for-loop approach that I describe below:
import jax
from jax import vmap;
import jax.numpy as jnp
import jax.random as random
def g_0(x, y, z, u):
    """Return the left-to-right sum ``x + y + z + u``."""
    acc = x + y
    acc = acc + z
    return acc + u
def g_1(x, y, z, u):
    """Return the left-to-right product ``x * y * z * u``."""
    acc = x * y
    acc = acc * z
    return acc * u
def g_2(x, y, z, u):
    """Return the alternating combination ``x - y + z - u``."""
    acc = x - y
    acc = acc + z
    return acc - u
def g_3(x, y, z, u):
    """Return the left-to-right quotient ``x / y / z / u``."""
    acc = x / y
    acc = acc / z
    return acc / u
# Dispatch table: position k holds the function selected by index k.
g_i = [g_0, g_1, g_2, g_3]
# Same table with every entry wrapped in jax.jit.
g_i_jit = list(map(jax.jit, g_i))
def g_git(i, x, y, z, u):
    """Evaluate the jitted function at position ``i`` of ``g_i_jit``.

    ``i`` is used to index a Python list, so it must be a concrete integer
    (or integer-like scalar), not a traced value.
    """
    selected = g_i_jit[i]
    return selected(x=x, y=y, z=z, u=u)
def g(i, x, y, z, u):
    """Evaluate the plain (non-jitted) function at position ``i`` of ``g_i``."""
    selected = g_i[i]
    return selected(x=x, y=y, z=z, u=u)
# Three real-valued input vectors of length len_xyz, seeded with keys 0, 1, 2.
len_xyz = 3000
x_ar, y_ar, z_ar = (
    random.uniform(random.PRNGKey(seed), shape=(len_xyz,)) for seed in range(3)
)

# u_ar has shape (len_u, len_xyz); row k is filled with the constant u_0[k].
len_u = 1000
u_0 = random.uniform(random.PRNGKey(3), shape=(len_u,))
u_1 = jnp.repeat(u_0, len_xyz)
u_ar = u_1.reshape(len_u, len_xyz)

# Function selectors: maxval is exclusive, so every entry is a valid index into g_i.
len_i = 50
i_ar = random.randint(random.PRNGKey(5), shape=(len_i,), minval=0, maxval=len(g_i))

# Reference implementation: one dispatched call per selector, accumulated in Python.
total = jnp.zeros((len_u, len_xyz))
for step in range(len_i):
    total = total + g_git(i_ar[step], x_ar, y_ar, z_ar, u_ar)
The role of "i_ar" is to act as an index that selects one of the four functions from the list g_i. "i_ar" is an array of integers, with each integer representing an index into the g_i list. On the other hand, x_ar, y_ar, z_ar, and u_ar are arrays of real numbers that are inputs to the functions selected by i_ar.
I suspect that this difference in nature between i_ar and x_ar, y_ar, z_ar, and u_ar is what makes it difficult to find a JAX approach that would be a more efficient replacement for the for loop above. Any ideas how to use JAX (or something else) to replace the for loop and obtain 'total' more efficiently?
I have tried naively using vmap:
# Naive attempt (kept as posted; it does not work).
# With the default in_axes=0, vmap maps over the leading axis of EVERY argument,
# but the leading sizes differ here: i_ar has len_i entries, x_ar/y_ar/z_ar have
# len_xyz, and u_ar has len_u rows.
# In addition, inside vmap the index `i` is a traced value, and g_git uses it to
# index a plain Python list (g_i_jit[i]), which requires a concrete integer.
g_git_vmap = jax.vmap(g_git)
total = jnp.zeros((len_u, len_xyz))
total = jnp.sum(g_git_vmap(i_ar, x_ar, y_ar, z_ar, u_ar), axis=0)
but this resulted in error messages and led to nowhere.

Probably the best way to do this is with lax.switch, which allows dynamically switching between multiple functions based on an index array.
Here's a comparison of your original function with an approach based on lax.switch, with timings on a Colab GPU runtime:
def f_original(i, x, y, z, u):
    """Baseline: sum ``g_git(i[k], x, y, z, u)`` over all ``k`` with a Python loop.

    The accumulator has shape ``(len(u), len(x))``.
    """
    total = jnp.zeros((len(u), len(x)))
    for position in range(len(i)):
        total = total + g_git(i[position], x, y, z, u)
    return total
@jax.jit
def f_switch(i, x, y, z, u):
    """Sum ``g_i[i[k]](x, y, z, u)`` over all ``k`` without a Python loop.

    The decorator had been turned into the comment ``#jax.jit``; it is
    restored here so the whole computation is compiled as one function.

    Args:
        i: 1-D integer array of indices into ``g_i``.
        x, y, z, u: operands forwarded unchanged to the selected function.

    Returns:
        The per-index results stacked along a new leading axis by ``vmap``
        and reduced with ``sum(0)``.
    """
    def apply_selected(index):
        # lax.switch picks the branch from a traced index, which plain
        # Python list indexing cannot do under vmap/jit.
        return jax.lax.switch(index, g_i, x, y, z, u)

    return jax.vmap(apply_selected)(i).sum(0)
out1 = f_original(i_ar, x_ar, y_ar, z_ar, u_ar)
out2 = f_switch(i_ar, x_ar, y_ar, z_ar, u_ar)
np.testing.assert_allclose(out1, out2, rtol=5E-3)
%timeit f_original(i_ar, x_ar, y_ar, z_ar, u_ar).block_until_ready()
# 71 ms ± 23.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit f_switch(i_ar, x_ar, y_ar, z_ar, u_ar).block_until_ready()
# 4.69 ms ± 37.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Related

Make an LSTM prediction written in NumPy faster

I have written a bidirectional-LSTM prediction function using NumPy (and not Tensorflow nor PyTorch), and I need to make it faster. The network has three layers, but for the sake of simplicity, I will just show (and time) the first layer. This bi-LSTM layer is called by calling the subfunctions LSTMf() and LSTMb() to process the input data (array of 500 points) forward and backwards. The LSTMf() and LSTMb() have loops which I suspect take the most time. Here is the prediction function:
import numpy as np
def predict(xt, ht, c, u, t, whff, wxff, bff, whif, wxif, bif, whlf, wxlf, blf, whof, wxof, bof, whfb,
            wxfb, bfb, whib, wxib, bib, whlb, wxlb, blb, whob, wxob, bob):
    """Run one bidirectional LSTM layer over ``xt`` with plain NumPy.

    The matrix products had been written with ``#`` instead of ``@``, which
    commented out the rest of each gate expression; ``@`` is restored.

    Args:
        xt: input sequence; rows ``0 .. t-1`` are consumed one at a time.
        ht: array providing the initial hidden state and the output shape.
            It is copied for each direction, so the caller's array is not
            modified.
        c: initial cell state, shared by both directions.
        u: accepted for interface compatibility; not used in this function.
        t: number of time steps to process.
        wh*, wx*, b*: recurrent weights, input weights and biases. The
            trailing ``f``/``b`` selects the forward/backward direction; the
            letter before it (``f``, ``i``, ``l``, ``o``) selects the gate
            (presumably forget, input, candidate and output -- confirm).

    Returns:
        The forward and backward hidden states concatenated along axis 1.
    """
    def tanh(a):
        return np.tanh(a)

    def sig(a):
        # Logistic sigmoid.
        return 1 / (1 + np.exp(-a))

    def cell(x, h, c, wh1, wx1, b1, wh2, wx2, b2, wh3, wx3, b3, wh4, wx4, b4):
        # One LSTM step: gate 1 scales the old cell state, gates 2 and 3 form
        # the update, gate 4 scales the new hidden state.
        new_c = c * sig(h @ wh1 + x @ wx1 + b1) + sig(h @ wh2 + x @ wx2 + b2) * tanh(h @ wh3 + x @ wx3 + b3)
        new_h = tanh(new_c) * sig(h @ wh4 + x @ wx4 + b4)
        return new_c, new_h

    def LSTMf(xt, ht, c, t, whf, wxf, bf, whi, wxi, bi, whl, wxl, bl, who, wxo, bo):
        # Forward pass: starts from the last row of ht and walks i = 0 .. t-1.
        h = ht[t - 1:t]
        for i in range(t):
            c, h = cell(xt[i:i + 1], h, c, whf, wxf, bf, whi, wxi, bi, whl, wxl, bl, who, wxo, bo)
            ht[i] = h
        return ht

    def LSTMb(xt, ht, c, t, whf, wxf, bf, whi, wxi, bi, whl, wxl, bl, who, wxo, bo):
        # Backward pass: starts from the first row of ht and walks i = t-1 .. 0.
        h = ht[0:1]
        for i in range(t - 1, -1, -1):
            c, h = cell(xt[i:i + 1], h, c, whf, wxf, bf, whi, wxi, bi, whl, wxl, bl, who, wxo, bo)
            ht[i] = h
        return ht

    # LSTM-bi 1
    hf = LSTMf(xt, ht.copy(), c, t, whff, wxff, bff, whif, wxif, bif, whlf, wxlf, blf, whof, wxof, bof)
    hb = LSTMb(xt, ht.copy(), c, t, whfb, wxfb, bfb, whib, wxib, bib, whlb, wxlb, blb, whob, wxob, bob)
    xt = np.concatenate((hf, hb), axis=1)
    return xt
The input data and the rest of parameters can be artificially generated with the following code:
t = 500  # input's number of points
u = 64  # layer's number of units
xt = np.zeros((t, 1), dtype=np.float32)  # input
ht = np.zeros((t, u), dtype=np.float32)
ou = np.zeros((1, u), dtype=np.float32)
uu = np.zeros((u, u), dtype=np.float32)
# Placeholder parameters: `ou` for input weights and biases, `uu` for recurrent weights.
weights = {
    'wxif': ou, 'wxff': ou, 'wxlf': ou, 'wxof': ou,
    'whif': uu, 'whff': uu, 'whlf': uu, 'whof': uu,
    'bif': ou, 'bff': ou, 'blf': ou, 'bof': ou,
    'wxib': ou, 'wxfb': ou, 'wxlb': ou, 'wxob': ou,
    'whib': uu, 'whfb': uu, 'whlb': uu, 'whob': uu,
    'bib': ou, 'bfb': ou, 'blb': ou, 'bob': ou,
}
# predict() takes `u` and `t` positionally after the initial cell state; the
# original call omitted them and raised TypeError.
yt = predict(xt, ht, ou, u, t, **weights)  # Call example
I have timed it (1) like this, (2) with Numba, and (3) with Cython:
import numpy as np
from predict import predict
from predict_numba import predict_numba
from predict_cython import predict_cython
import timeit

n = 100  # calls per measurement


def _mean_seconds(fn):
    """Return the average wall time of one ``fn(xt, ht, ou, u, t, **weights)`` call."""
    timer = timeit.Timer(lambda: fn(xt, ht, ou, u, t, **weights))
    return timer.timeit(n) / n


print(_mean_seconds(predict))  # 0.05198 s
# Untimed first call so Numba's compilation cost is not part of the measurement.
predict_numba(xt, ht, ou, u, t, **weights)  # Dummy slow numba call
print(_mean_seconds(predict_numba))  # 0.01149 s
print(_mean_seconds(predict_cython))  # 0.13345 s
I would like to make this prediction faster than 0.03 s.
Numba is fast enough but I cannot have a very slow first call (more than 30 s for the three layers)
Cython is very slow; I'm not sure if this is the reason, but following the advice here (Cython: matrix multiplication) I did not type most parameters since the operation '@' does not support memory views.
Originally I was using Keras with CPU or GPU, but NumPy is faster than either. I have also heard of TorchScript which might be applicable. What can I do to make the prediction faster?
__
Context: This function predicts the R-peaks in an ECG window, and is meant to be called as frequently as possible, to predict the R-peaks of an ECG being acquired in real-time.
PS. In case you want to make sense of the calculations, this description of how an LSTM cell works might be of use: https://imgur.com/UFrd9oa

Efficiently find indexes that would make array equal to a permutation of itself

I'm looking for some function that find the indexes that would make an array equal to a permutation of itself.
Assume that p1 is a 1d Numpy array that contains no duplicates. Assume that p2 is a permutation (a reordering) of p1.
I want a function find_position_in_permutation such that p2[find_position_in_permutation(p1, p2)] is identical to p1.
For example:
p1 = np.array(['a', 'e', 'c', 'f'])
p2 = np.array(['e', 'f', 'a', 'c'])
in which find_position_in_permutation(p1, p2) should return:
[2, 0, 3, 1]
because p2[[2, 0, 3, 1]] is identical to p1.
You can do this in a brute-force manner using lists:
def find_position_in_permutation(original, permutation):
    """Return, for each item of ``original``, its index in ``permutation``.

    Brute force: every lookup is a linear ``list.index`` scan, so the total
    cost is quadratic. Raises ``ValueError`` if an item is missing.
    """
    needles = list(original)
    haystack = list(permutation)
    return [haystack.index(item) for item in needles]
but I am wondering if there is something more algorithmically efficient. This one appears to be O(N^2).
Benchmarks of current answers:
import numpy as np
from string import ascii_lowercase
# Benchmark inputs: a length-n array of letters and a shuffled copy of it.
n = 100
letters = np.array([*ascii_lowercase])
# NOTE(review): np.random.choice samples with replacement by default, and n
# exceeds the 26 available letters, so p1 necessarily contains duplicates.
# The problem statement assumes no duplicates; with duplicates the three
# solutions below can return different (still valid) index lists.
p1 = np.random.choice(letters, size=n)
p2 = np.random.permutation(p1)
# Plain-list copies for the list/dict based solutions.
p1l = p1.tolist()
p2l = p2.tolist()
def find_pos_in_perm_1(original, permutation):
    """Brute force: one linear ``permutation.index`` scan per item of ``original``."""
    return [permutation.index(item) for item in original]
def find_pos_in_perm_2(original, permutation):
    """Dict lookup: map each value of ``permutation`` to its index, then look up ``original``.

    If a value occurs more than once in ``permutation``, its last index wins.
    Raises ``KeyError`` for a value of ``original`` that is absent.
    """
    position_of = {}
    for position, value in enumerate(permutation):
        position_of[value] = position
    return list(map(position_of.__getitem__, original))
def find_pos_in_perm_3(original, permutation):
    """Argsort lookup: pair up the sort orders of both arrays.

    The k-th smallest element sits at ``order_orig[k]`` in ``original`` and at
    ``order_perm[k]`` in ``permutation``, so scattering ``order_perm`` to the
    positions ``order_orig`` yields the index into ``permutation`` for every
    position of ``original``.
    """
    order_orig = np.argsort(original)
    order_perm = np.argsort(permutation)
    lookup = np.empty_like(order_orig)
    lookup[order_orig] = order_perm
    return lookup
%timeit find_pos_in_perm_1(p1l, p2l)
# 40.5 µs ± 1.13 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit find_pos_in_perm_2(p1l, p2l)
# 10 µs ± 171 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%timeit find_pos_in_perm_3(p1, p2)
# 6.38 µs ± 157 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
You can do O(N log N) using argsort:
>>> import numpy as np
>>> from string import ascii_lowercase
>>>
>>> letters = np.array([*ascii_lowercase])
>>> p1, p2 = map(np.random.permutation, 2*(letters,))
>>>
>>> o1, o2 = map(np.argsort, (p1, p2))
>>> o12, o21 = map(np.empty_like, (o1, o2))
>>> o12[o1], o21[o2] = o2, o1
>>>
>>> print(np.all(p1[o21] == p2))
True
>>> print(np.all(p2[o12] == p1))
True
O(N) solution using Python dictionary:
>>> import operator as op
>>>
>>> l1, l2 = map(op.methodcaller('tolist'), (p1, p2))
>>>
>>> s12 = op.itemgetter(*l1)({k: v for v, k in enumerate(l2)})
>>> print(np.all(s12 == o12))
True
Some timings:
26 elements
argsort 0.004 ms
dict 0.003 ms
676 elements
argsort 0.096 ms
dict 0.075 ms
17576 elements
argsort 4.366 ms
dict 2.915 ms
456976 elements
argsort 191.376 ms
dict 230.459 ms
Benchmark code:
import numpy as np
from string import ascii_lowercase
import operator as op
from timeit import timeit
# Object dtype makes `+` concatenate the Python strings element-wise.
L1 = np.array([*ascii_lowercase], object)
# Outer "sums" enumerate all 2-, 3- and 4-letter strings
# (26**2 = 676, 26**3 = 17576, 26**4 = 456976 entries).
L2 = np.add.outer(L1, L1).ravel()
L3 = np.add.outer(L2, L1).ravel()
L4 = np.add.outer(L2, L2).ravel()
# Convert each table from object dtype to a NumPy string dtype.
letters = (*map(op.methodcaller('astype', str), (L1, L2, L3, L4)),)
def use_argsort(p1, p2):
    """Return indices ``o12`` with ``p2[o12] == p1`` for two permutations of the same values.

    Uses two argsorts and one scatter.
    """
    order1 = np.argsort(p1)
    order2 = np.argsort(p2)
    o12 = np.empty_like(order1)
    o12[order1] = order2
    return o12
def use_dict(l1, l2):
    """Look up the position in ``l2`` of every item of ``l1`` through a dict.

    Returns whatever ``operator.itemgetter(*l1)`` returns: a tuple for two or
    more items, a bare value for exactly one item.
    """
    pick = op.itemgetter(*l1)
    position_of = {}
    for position, value in enumerate(l2):
        position_of[value] = position
    return pick(position_of)
# For each table size: draw two independent permutations, then time both
# approaches and report the mean time per call in milliseconds.
for L, N in zip(letters, (1000, 1000, 200, 4)):
    print(f'{len(L)} elements')
    p1, p2 = (np.random.permutation(L) for _ in range(2))
    l1, l2 = p1.tolist(), p2.tolist()
    cases = (
        ('argsort', use_argsort, p1, p2),
        ('dict ', use_dict, l1, l2),
    )
    for m, f, i1, i2 in cases:
        t = timeit(lambda: f(i1, i2), number=N) * 1000 / N
        print(m, f'{t:10.3f} ms')

Evaluating the squared term of a gaussian kernel for having a covariance matrix for multi-dimensional inputs [duplicate]

I have the following code. It is taking forever in Python. There must be a way to translate this calculation into a broadcast...
def euclidean_square(a, b):
    """Return the matrix of squared Euclidean distances between rows of ``a`` and ``b``.

    Entry ``[i, j]`` is ``sum((a[i, :] - b[j, :]) ** 2)``. Deliberately written
    as an explicit double loop (the slow baseline).
    """
    n_a = a.shape[0]
    n_b = b.shape[0]
    squares = np.zeros((n_a, n_b))
    for i in range(n_a):
        row_a = a[i, :]
        for j in range(n_b):
            delta = row_a - b[j, :]
            squares[i, j] = np.sum(delta ** 2.0)
    return squares
You can use np.einsum after calculating the differences in a broadcasted way, like so -
# Assuming a is (m, k) and b is (n, k): a[:, None, :] is (m, 1, k), so the
# subtraction broadcasts to all pairwise differences with shape (m, n, k).
ab = a[:,None,:] - b
# Multiply element-wise and sum over the last axis k -> (m, n) squared distances.
out = np.einsum('ijk,ijk->ij',ab,ab)
Or use scipy's cdist with its optional metric argument set as 'sqeuclidean' to give us the squared euclidean distances as needed for our problem, like so -
from scipy.spatial.distance import cdist
out = cdist(a,b,'sqeuclidean')
I collected the different methods proposed here, and in two other questions, and measured the speed of the different methods:
import numpy as np
import scipy.spatial
import sklearn.metrics
def dist_direct(x, y):
    """Squared distances via an explicit broadcast difference and ``np.sum``."""
    pairwise_diff = np.expand_dims(x, -2) - y
    squared = np.square(pairwise_diff)
    return np.sum(squared, axis=-1)
def dist_einsum(x, y):
    """Squared distances via a broadcast difference reduced with ``np.einsum``."""
    pairwise_diff = np.expand_dims(x, -2) - y
    return np.einsum('ijk,ijk->ij', pairwise_diff, pairwise_diff)
def dist_scipy(x, y):
    """Squared distances via SciPy's ``cdist`` with the ``sqeuclidean`` metric."""
    cdist = scipy.spatial.distance.cdist
    return cdist(x, y, metric="sqeuclidean")
def dist_sklearn(x, y):
    """Squared distances via scikit-learn's ``pairwise_distances``."""
    pairwise_distances = sklearn.metrics.pairwise.pairwise_distances
    return pairwise_distances(x, y, metric="sqeuclidean")
def dist_layers(x, y):
    """Squared distances accumulated one coordinate at a time.

    For every column, the outer difference between the two column vectors is
    squared and added to the running total.
    """
    n_x, n_y = x.shape[0], y.shape[0]
    res = np.zeros((n_x, n_y))
    for col in range(x.shape[1]):
        layer = np.subtract.outer(x[:, col], y[:, col])
        res += layer ** 2
    return res
# inspired by the excellent https://github.com/droyed/eucl_dist
def dist_ext1(x, y):
    """Squared distances as a single matrix product of extended matrices.

    Rows of ``x_ext`` are ``[1, x, x**2]`` and columns of ``y_ext`` are
    ``[y**2, -2*y, 1]``, so their product sums ``y**2 - 2*x*y + x**2`` over
    all coordinates.
    """
    nx, p = x.shape
    ny = y.shape[0]

    x_ext = np.empty((nx, 3 * p))
    x_ext[:, 0:p] = 1
    x_ext[:, p:2 * p] = x
    x_ext[:, 2 * p:3 * p] = np.square(x)

    y_ext = np.empty((3 * p, ny))
    y_ext[0:p] = np.square(y).T
    y_ext[p:2 * p] = -2 * y.T
    y_ext[2 * p:3 * p] = 1

    return np.dot(x_ext, y_ext)
# https://stackoverflow.com/a/47877630/648741
def dist_ext2(x, y):
    """Squared distances via ``|x|**2 + |y|**2 - 2 * x . y``."""
    x_sq = np.einsum('ij,ij->i', x, x)
    y_sq = np.einsum('ij,ij->i', y, y)
    cross = x.dot(y.T)
    return x_sq[:, None] + y_sq - 2 * cross
I use timeit to compare the speed of the different methods. For the comparison, I use vectors of length 10, with 100 vectors in the first group, and 1000 vectors in the second group.
import timeit
# Benchmark data: 100 and 1000 random vectors of length p.
p = 10
x = np.random.standard_normal((100, p))
y = np.random.standard_normal((1000, p))
# Discover the candidates by name: every name in the current scope that starts with "dist_".
for method in dir():
    if not method.startswith("dist_"):
        continue
    # timeit returns the TOTAL seconds for `number` calls; with number=1000 that
    # value is numerically equal to the mean time per call in milliseconds,
    # which is why it is printed with an "ms" suffix without any conversion.
    t = timeit.timeit(f"{method}(x, y)", number=1000, globals=globals())
    print(f"{method:12} {t:5.2f}ms")
On my laptop, the results are as follows:
dist_direct 5.07ms
dist_einsum 3.43ms
dist_ext1 0.20ms <-- fastest
dist_ext2 0.35ms
dist_layers 2.82ms
dist_scipy 0.60ms
dist_sklearn 0.67ms
While the two methods dist_ext1 and dist_ext2, both based on the idea of writing (x-y)**2 as x**2 - 2*x*y + y**2, are very fast, there is a downside: When the distance between x and y is very small, due to cancellation error the numerical result can sometimes be (very slightly) negative.
Another solution besides using cdist is the following
# Accumulate the squared differences one input dimension (column) at a time:
# each pass adds the squared outer difference of one column of a and b.
difference_squared = np.zeros((a.shape[0], b.shape[0]))
for dimension_iterator in range(a.shape[1]):
    difference_squared = difference_squared + np.subtract.outer(a[:, dimension_iterator], b[:, dimension_iterator])**2.

what is the fastest way to get the mode of a numpy array

I have to find the mode of a NumPy array that I read from an hdf5 file. The NumPy array is 1d and contains floating point values.
# Read the whole dataset into a NumPy array. `Dataset.value` was deprecated and
# then removed in h5py 3.0; indexing with an empty tuple is the replacement.
my_array = f1[ds_name][()]
mod_value = scipy.stats.mode(my_array)
My array is 1d and contains around 1M values. It takes about 15 min for my script to return the mode value. Is there any way to make this faster?
Another question is why scipy.stats.median(my_array) does not work while mode works?
AttributeError: module 'scipy.stats' has no attribute 'median'
The implementation of scipy.stats.mode has a Python loop for handling the axis argument with multidimensional arrays. The following simple implementation, for one-dimensional arrays only, is faster:
def mode1(x):
    """Return ``(mode, count)`` of ``x`` using ``np.unique``.

    Ties go to the smallest value, because ``np.unique`` returns sorted values
    and ``argmax`` picks the first maximum.
    """
    uniques, freqs = np.unique(x, return_counts=True)
    top = freqs.argmax()
    return uniques[top], freqs[top]
Here's an example. First, make an array of integers with length 1000000.
In [40]: x = np.random.randint(0, 1000, size=(2, 1000000)).sum(axis=0)
In [41]: x.shape
Out[41]: (1000000,)
Check that scipy.stats.mode and mode1 give the same result.
In [42]: from scipy.stats import mode
In [43]: mode(x)
Out[43]: ModeResult(mode=array([1009]), count=array([1066]))
In [44]: mode1(x)
Out[44]: (1009, 1066)
Now check the performance.
In [45]: %timeit mode(x)
2.91 s ± 18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [46]: %timeit mode1(x)
39.6 ms ± 83.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
2.91 seconds for mode(x) and only 39.6 milliseconds for mode1(x).
Here's one approach based on sorting -
def mode1d(ar_sorted):
    """Return ``(mode, count)`` of a 1-D array by sorting and measuring runs.

    The previous version read ``idx[0]`` unconditionally and raised
    ``IndexError`` when every element was equal (including a single-element
    array). Run boundaries are now built so that case is handled too.

    Args:
        ar_sorted: 1-D NumPy array. It is sorted IN PLACE; pass a copy if the
            caller needs the original order.

    Returns:
        Tuple of the most frequent value and its number of occurrences. On
        ties the smallest value wins (first maximum in sorted order).

    Raises:
        IndexError: if the array is empty.
    """
    ar_sorted.sort()
    # Index of the last element of every run of equal values except the final run.
    run_ends = np.flatnonzero(ar_sorted[1:] != ar_sorted[:-1])
    # Half-open [start, stop) bounds of every run, including the final one.
    run_starts = np.concatenate(([0], run_ends + 1))
    run_stops = np.concatenate((run_ends + 1, [ar_sorted.size]))
    count = run_stops - run_starts
    argmax_idx = count.argmax()
    modeval = ar_sorted[run_starts[argmax_idx]]
    modecount = count[argmax_idx]
    return modeval, modecount
Note that this mutates/changes the input array as it sorts it. So, if you want to keep the input array un-mutated or do mind the input array being sorted, pass a copy.
Sample run on 1M elements -
In [65]: x = np.random.randint(0, 1000, size=(1000000)).astype(float)
In [66]: from scipy.stats import mode
In [67]: mode(x)
Out[67]: ModeResult(mode=array([ 295.]), count=array([1098]))
In [68]: mode1d(x)
Out[68]: (295.0, 1098)
Runtime test
In [75]: x = np.random.randint(0, 1000, size=(1000000)).astype(float)
# Scipy's mode
In [76]: %timeit mode(x)
1 loop, best of 3: 1.64 s per loop
# #Warren Weckesser's soln
In [77]: %timeit mode1(x)
10 loops, best of 3: 52.7 ms per loop
# Proposed in this post
In [78]: %timeit mode1d(x)
100 loops, best of 3: 12.8 ms per loop
With a copy, the timings for mode1d would be comparable to mode1.
I added the two functions mode1 and mode1d from replies above to my script and tried to compare with the scipy.stats.mode.
dir_name="C:/Users/test_mode"
file_name="myfile2.h5"
ds_name="myds"
f_in=os.path.join(dir_name,file_name)
def mode1(x):
    """Return the most frequent value of ``x`` and how often it occurs.

    Built on ``np.unique(..., return_counts=True)``; on ties the smallest
    value is returned.
    """
    distinct, occurrences = np.unique(x, return_counts=True)
    best = occurrences.argmax()
    return distinct[best], occurrences[best]
def mode1d(ar_sorted):
    """Return ``(mode, count)`` of a 1-D array by sorting and measuring runs.

    The previous version read ``idx[0]`` unconditionally and raised
    ``IndexError`` when every element was equal (including a single-element
    array). Run boundaries are now built so that case is handled too.

    Args:
        ar_sorted: 1-D NumPy array. It is sorted IN PLACE; pass a copy if the
            caller needs the original order.

    Returns:
        Tuple of the most frequent value and its number of occurrences. On
        ties the smallest value wins (first maximum in sorted order).

    Raises:
        IndexError: if the array is empty.
    """
    ar_sorted.sort()
    # Index of the last element of every run of equal values except the final run.
    run_ends = np.flatnonzero(ar_sorted[1:] != ar_sorted[:-1])
    # Half-open [start, stop) bounds of every run, including the final one.
    run_starts = np.concatenate(([0], run_ends + 1))
    run_stops = np.concatenate((run_ends + 1, [ar_sorted.size]))
    count = run_stops - run_starts
    argmax_idx = count.argmax()
    modeval = ar_sorted[run_starts[argmax_idx]]
    modecount = count[argmax_idx]
    return modeval, modecount
startTime = time.time()
# The script only reads the dataset, so open the file read-only ("a" would
# create a missing file and request write access). `Dataset.value` was removed
# in h5py 3.0; `[()]` reads the full dataset into a NumPy array.
with h5py.File(f_in, "r") as f1:
    myds = f1[ds_name][()]
time1 = time.time()
file_read_time = time1 - startTime
print(str(file_read_time)+"\t"+"s"+"\t"+str((file_read_time)/60)+"\t"+"min")

print("mode_scipy=")
mode_scipy = scipy.stats.mode(myds)
print(mode_scipy)
time2 = time.time()
mode_scipy_time = time2 - time1
print(str(mode_scipy_time)+"\t"+"s"+"\t"+str((mode_scipy_time)/60)+"\t"+"min")

print("mode1=")
# Results get their own names so the functions mode1/mode1d are not overwritten.
mode1_result = mode1(myds)
print(mode1_result)
time3 = time.time()
mode1_time = time3 - time2
print(str(mode1_time)+"\t"+"s"+"\t"+str((mode1_time)/60)+"\t"+"min")

print("mode1d=")
# mode1d sorts myds in place; it runs last so the other measurements see the
# data in file order.
mode1d_result = mode1d(myds)
print(mode1d_result)
time4 = time.time()
mode1d_time = time4 - time3
print(str(mode1d_time)+"\t"+"s"+"\t"+str((mode1d_time)/60)+"\t"+"min")
The result from running the script for a numpy array of around 1M is :
mode_scipy=
ModeResult(mode=array([ 1.11903353e-06], dtype=float32), count=array([304909]))
938.8368742465973 s
15.647281237443288 min
mode1=(1.1190335e-06, 304909)
0.06500649452209473 s
0.0010834415753682455 min
mode1d=(1.1190335e-06, 304909)
0.06200599670410156 s
0.0010334332784016928 min

hessian of a variable returned by tf.concat() is None

Let x and y be vectors of length N, and let z be a function z = f(x,y). In Tensorflow v1.0.0, tf.hessians(z,x) and tf.hessians(z,y) both return an N by N matrix, which is what I expected.
However, when I concatenate x and y into a vector p of size 2*N using tf.concat and run tf.hessians(z, p), it raises the error "ValueError: None values not supported."
I understand this is because in the computation graph x,y ->z and x,y -> p, so there is no gradient between p and z. To circumvent the problem, I can create p first, slice it into x and y, but I will have to change a ton of my code. Is there a more elegant way?
related question: Slice of a variable returns gradient None
import tensorflow as tf
import numpy as np
# Minimal reproduction (TensorFlow 1.x API).
N = 2
A = tf.Variable(np.random.rand(N,N).astype(np.float32))
B = tf.Variable(np.random.rand(N,N).astype(np.float32))
x = tf.Variable(tf.random_normal([N]) )
y = tf.Variable(tf.random_normal([N]) )
# Reshape x and y to column vectors of shape (N, 1) for the matrix products below.
x_1 = tf.reshape(x,[N,1])
y_1 = tf.reshape(y,[N,1])
# Concatenate x and y into a vector of length 2*N.
# Note: p is computed FROM x and y; z below is not computed from p.
p = tf.concat([x,y],axis = 0)
# Define the function z = 0.5 * x^T A x + 0.5 * y^T B y + 100 (built from x_1 and y_1 only).
z = 0.5*tf.matmul(tf.matmul(tf.transpose(x_1), A), x_1) + 0.5*tf.matmul(tf.matmul(tf.transpose(y_1), B), y_1) + 100
# Works: hx and hy are both N by N matrices.
hx = tf.hessians(z,x)
hy = tf.hessians(z,y)
# Raises "ValueError: None values not supported." because there is no path
# from p to z in the graph, so the gradient of z with respect to p is None.
# Expected a matrix of size 2*N by 2*N.
hp = tf.hessians(z,p)
Compute the hessian by its definition.
# First derivatives of z with respect to x and y, joined into one vector of length 2*N.
gxy = tf.gradients(z, [x, y])
gp = tf.concat([gxy[0], gxy[1]], axis=0)
# Differentiate every scalar component of the gradient separately.
# hp[i] is the list returned by tf.gradients for [x, y], i.e. row i of the
# Hessian split into its x-part and y-part.
# NOTE(review): an entry is presumably None when gp[i] does not depend on that
# variable (e.g. no cross terms between x and y) -- confirm before stacking
# the rows into a single 2*N by 2*N tensor.
hp = []
for i in range(2*N):
    hp.append(tf.gradients(gp[i], [x, y]))
Because tf.gradients computes the sum of (dy/dx) over all elements of y, when computing the second partial derivatives one should slice the gradient vector into scalars and then compute the gradient of each scalar separately. Tested on tf1.0 and python2.