numpy large matrix multiplication optimize - numpy

I have to do a iterative calculation with large matrix:
R(t) = M # R(t-1), where M is n x n, and R is n x 1
if I write this:
for _ in range(iter_num):
R = M # R
I suppose it will be very slow, because it has to copy and create new array each time. Is that any way to optimize this? (maybe do it inplace?)

A few timings to show that OP's approach is actually quite competitive:
>>> import functools as ft
>>> kwds = dict(globals=globals(), number=1000)
>>> R = np.random.random((200,))
>>> M = np.random.random((200, 200))
>>> def f_op(M, R):
... for i in range(k):
... R = M#R
... return R
...
>>> def f_pp(M, R):
... return ft.reduce(np.matmul, (R,) + k * (M.T,))
...
>>> def f_ag(M, R):
... return np.linalg.matrix_power(M, k)#R
...
>>> def f_tai(M, R):
... return np.linalg.multi_dot([M]*k+[R])
...
>>> k = 20
>>> repeat('f_op(M, R)', **kwds)
[0.14156094897771254, 0.1264056910004001, 0.12611976702464744]
>>> repeat('f_pp(M, R)', **kwds)
[0.12594187198556028, 0.1227772050187923, 0.12045996301458217]
>>> repeat('f_ag(M, R)', **kwds)
[2.065609384997515, 2.041590739012463, 2.038702343008481]
>>> repeat('f_tai(M, R)', **kwds)
[3.426795684004901, 3.4321794749994297, 3.4208814119920135]
>>>
>>> k = 500
>>> repeat('f_op(M, R)', **kwds)
[3.066054102004273, 3.0294102499901783, 3.020273027010262]
>>> repeat('f_pp(M, R)', **kwds)
[2.891954762977548, 2.8680382019956596, 2.8558325179910753]
>>> repeat('f_ag(M, R)', **kwds)
[5.216210452985251, 5.1636185249954, 5.157578871003352]

Using numpy.linalg.multi_dot
np.linalg.multi_dot([M] * iter_num + [R])
([M] * iter_num creates a list of references to M.)
Some thoughts mentioned in the documentation,
(multi_dot) Compute the dot product of two or more arrays in a single function call, while automatically selecting the fastest evaluation order.
and
Think of multi_dot as:
def multi_dot(arrays): return functools.reduce(np.dot, arrays)
Note OP's method is actually quite fast. See Paul Panzer's answer for more timing results.
Thanks for Paul Panzer's suggestion for using reference rather than view.

Would this work for you?
R_final = np.linalg.matrix_power(M, iter_num) # R
It seems like you are doing M # M # M # ... # M # R, which can be cast to M ** iter_num # R

Using explicit spectral decomposition my be useful if iter_num is large compared to n (assuming np.lialg.matrix_power doesn't do this already) and M is invertible:
def mat_pow(a, p):
vals, vecs = np.linalg.eig(a)
return vecs # np.diag(vals**p) # vecs.T
mat_pow(M, iter_num) # R
If M is symmetric, you could use the even faster np.linalg.eigh

Related

Binary-search without an explicit array

I want to perform a binary-search using e.g. np.searchsorted, however, I do not want to create an explicit array containing values. Instead, I want to define a function giving the value to be expected at the desired position of the array, e.g. p(i) = i, where i denotes the position within the array.
Generating an array of values regarding the function would, in my case, be neither efficient nor elegant. Is there any way to achieve this?
What about something like:
import collections
class GeneratorSequence(collections.Sequence):
def __init__(self, func, size):
self._func = func
self._len = size
def __len__(self):
return self._len
def __getitem__(self, i):
if 0 <= i < self._len:
return self._func(i)
else:
raise IndexError
def __iter__(self):
for i in range(self._len):
yield self[i]
This would work with np.searchsorted(), e.g.:
import numpy as np
gen_seq = GeneratorSequence(lambda x: x ** 2, 100)
np.searchsorted(gen_seq, 9)
# 3
You could also write your own binary search function, you do not really need NumPy in this case, and it can actually be beneficial:
def bin_search(seq, item):
first = 0
last = len(seq) - 1
found = False
while first <= last and not found:
midpoint = (first + last) // 2
if seq[midpoint] == item:
first = midpoint
found = True
else:
if item < seq[midpoint]:
last = midpoint - 1
else:
first = midpoint + 1
return first
Which gives identical results:
all(bin_search(gen_seq, i) == np.searchsorted(gen_seq, i) for i in range(100))
# True
Incidentally, this is also WAY faster:
gen_seq = GeneratorSequence(lambda x: x ** 2, 1000000)
%timeit np.searchsorted(gen_seq, 10000)
# 1 loop, best of 3: 1.23 s per loop
%timeit bin_search(gen_seq, 10000)
# 100000 loops, best of 3: 16.1 µs per loop
Inspired by #norok2 comment, I think you can use something like this:
def f(i):
return i*2 # Just an example
class MySeq(Sequence):
def __init__(self, f, maxi):
self.maxi = maxi
self.f = f
def __getitem__(self, x):
if x < 0 or x > self.maxi:
raise IndexError()
return self.f(x)
def __len__(self):
return self.maxi + 1
In this case f is your function while maxi is the maximum index. This of course only works if the function f return values in sorted order.
At this point you can use an object of type MySeq inside np.searchsorted.

Efficient implementation of factorization machine with matrix operations?

Link is here : https://www.csie.ntu.edu.tw/~r01922136/slides/ffm.pdf (slides 5-6)
Given the following matrices:
X : n * d
W : d * k
Is there an efficient way to calculate the n x 1 matrix using only matrix operations (eg. numpy, tensorflow), where the jth element is :
EDIT:
Current attempt is this, but obviously it's not very space efficient, as it requires storing matrices of size n*d*d :
n = 1000
d = 256
k = 32
x = np.random.normal(size=[n,d])
w = np.random.normal(size=[d,k])
xxt = np.matmul(x.reshape([n,d,1]),x.reshape([n,1,d]))
wwt = np.matmul(w.reshape([1,d,k]),w.reshape([1,k,d]))
output = xxt*wwt
output = np.sum(output,(1,2))
Avoid large temporary arrays
Not all types of algorithms are that easily or obviously to vectorize. The np.sum(xxt*wwt) can be rewritten using np.einsum. This should be faster than your solution, but has some other limitations (eg. no multithreading).
I would therefor suggest using a compiler like Numba.
Example
import numpy as np
import numba as nb
import time
#nb.njit(fastmath=True,parallel=True)
def factorization_nb(w,x):
n = x.shape[0]
d = x.shape[1]
k = w.shape[1]
output=np.empty(n,dtype=w.dtype)
wwt=np.dot(w.reshape((d,k)),w.reshape((k,d)))
for i in nb.prange(n):
sum=0.
for j in range(d):
for jj in range(d):
sum+=x[i,j]*x[i,jj]*wwt[j,jj]
output[i]=sum
return output
def factorization_orig(w,x):
n = x.shape[0]
d = x.shape[1]
k = w.shape[1]
xxt = np.matmul(x.reshape([n,d,1]),x.reshape([n,1,d]))
wwt = np.matmul(w.reshape([1,d,k]),w.reshape([1,k,d]))
output = xxt*wwt
output = np.sum(output,(1,2))
return output
Mesuring Performance
n = 1000
d = 256
k = 32
x = np.random.normal(size=[n,d])
w = np.random.normal(size=[d,k])
#first call has some compilation overhead
res_1=factorization_nb(w,x)
t1=time.time()
for i in range(100):
res_1=factorization_nb(w,x)
#res_2=factorization_orig(w,x)
print(time.time()-t1)
Timings
factorization_nb: 4.2 ms per iteration
factorization_orig: 460 ms per iteration (110x speedup)
For an einsum implemtnation in pytorch, it would be something like
V = torch.randn([50, 10])
x = torch.randn([50])
result = (torch.einsum('ik,jk,i,j->', V, V, x, x)-torch.einsum('ik,ik,i,i->', V, V, x, x))/2
where we subtract the contribution from the feature weight being dotted with itself.

Evaluating the squared term of a gaussian kernel for having a covariance matrix for multi-dimensional inputs [duplicate]

I have the following code. It is taking forever in Python. There must be a way to translate this calculation into a broadcast...
def euclidean_square(a,b):
squares = np.zeros((a.shape[0],b.shape[0]))
for i in range(squares.shape[0]):
for j in range(squares.shape[1]):
diff = a[i,:] - b[j,:]
sqr = diff**2.0
squares[i,j] = np.sum(sqr)
return squares
You can use np.einsum after calculating the differences in a broadcasted way, like so -
ab = a[:,None,:] - b
out = np.einsum('ijk,ijk->ij',ab,ab)
Or use scipy's cdist with its optional metric argument set as 'sqeuclidean' to give us the squared euclidean distances as needed for our problem, like so -
from scipy.spatial.distance import cdist
out = cdist(a,b,'sqeuclidean')
I collected the different methods proposed here, and in two other questions, and measured the speed of the different methods:
import numpy as np
import scipy.spatial
import sklearn.metrics
def dist_direct(x, y):
d = np.expand_dims(x, -2) - y
return np.sum(np.square(d), axis=-1)
def dist_einsum(x, y):
d = np.expand_dims(x, -2) - y
return np.einsum('ijk,ijk->ij', d, d)
def dist_scipy(x, y):
return scipy.spatial.distance.cdist(x, y, "sqeuclidean")
def dist_sklearn(x, y):
return sklearn.metrics.pairwise.pairwise_distances(x, y, "sqeuclidean")
def dist_layers(x, y):
res = np.zeros((x.shape[0], y.shape[0]))
for i in range(x.shape[1]):
res += np.subtract.outer(x[:, i], y[:, i])**2
return res
# inspired by the excellent https://github.com/droyed/eucl_dist
def dist_ext1(x, y):
nx, p = x.shape
x_ext = np.empty((nx, 3*p))
x_ext[:, :p] = 1
x_ext[:, p:2*p] = x
x_ext[:, 2*p:] = np.square(x)
ny = y.shape[0]
y_ext = np.empty((3*p, ny))
y_ext[:p] = np.square(y).T
y_ext[p:2*p] = -2*y.T
y_ext[2*p:] = 1
return x_ext.dot(y_ext)
# https://stackoverflow.com/a/47877630/648741
def dist_ext2(x, y):
return np.einsum('ij,ij->i', x, x)[:,None] + np.einsum('ij,ij->i', y, y) - 2 * x.dot(y.T)
I use timeit to compare the speed of the different methods. For the comparison, I use vectors of length 10, with 100 vectors in the first group, and 1000 vectors in the second group.
import timeit
p = 10
x = np.random.standard_normal((100, p))
y = np.random.standard_normal((1000, p))
for method in dir():
if not method.startswith("dist_"):
continue
t = timeit.timeit(f"{method}(x, y)", number=1000, globals=globals())
print(f"{method:12} {t:5.2f}ms")
On my laptop, the results are as follows:
dist_direct 5.07ms
dist_einsum 3.43ms
dist_ext1 0.20ms <-- fastest
dist_ext2 0.35ms
dist_layers 2.82ms
dist_scipy 0.60ms
dist_sklearn 0.67ms
While the two methods dist_ext1 and dist_ext2, both based on the idea of writing (x-y)**2 as x**2 - 2*x*y + y**2, are very fast, there is a downside: When the distance between x and y is very small, due to cancellation error the numerical result can sometimes be (very slightly) negative.
Another solution besides using cdist is the following
difference_squared = np.zeros((a.shape[0], b.shape[0]))
for dimension_iterator in range(a.shape[1]):
difference_squared = difference_squared + np.subtract.outer(a[:, dimension_iterator], b[:, dimension_iterator])**2.

`scipy.optimize` functions hang even with `maxiter=0`

I am trying to train the MNIST data (which I downloaded from Kaggle) with simple multi-class logistic regression, but the scipy.optimize functions hang.
Here's the code:
import csv
from math import exp
from numpy import *
from scipy.optimize import fmin, fmin_cg, fmin_powell, fmin_bfgs
# Prepare the data
def getIiter(ifname):
"""
Get the iterator from a csv file with filename ifname
"""
ifile = open(ifname, 'r')
iiter = csv.reader(ifile)
iiter.__next__()
return iiter
def parseRow(s):
y = [int(x) for x in s]
lab = y[0]
z = y[1:]
return (lab, z)
def getAllRows(ifname):
iiter = getIiter(ifname)
x = []
l = []
for row in iiter:
lab, z = parseRow(row)
x.append(z)
l.append(lab)
return x, l
def cutData(x, y):
"""
70% training
30% testing
"""
m = len(x)
t = int(m * .7)
return [(x[:t], y[:t]), (x[t:], y[t:])]
def num2IndMat(l):
t = array(l)
tt = [vectorize(int)((t == i)) for i in range(10)]
return array(tt).T
def readData(ifname):
x, l = getAllRows(ifname)
t = [[1] + y for y in x]
return array(t), num2IndMat(l)
#Calculate the cost function
def sigmoid(x):
return 1 / (1 + exp(-x))
vSigmoid = vectorize(sigmoid)
vLog = vectorize(log)
def costFunction(theta, x, y):
sigxt = vSigmoid(dot(x, theta))
cm = (- y * vLog(sigxt) - (1 - y) * vLog(1 - sigxt)) / m / N
return sum(cm)
def unflatten(flatTheta):
return [flatTheta[i * N : (i + 1) * N] for i in range(n + 1)]
def costFunctionFlatTheta(flatTheta):
return costFunction(unflatten(flatTheta), trainX, trainY)
def costFunctionFlatTheta1(flatTheta):
return costFunction(flatTheta.reshape(785, 10), trainX, trainY)
x, y = readData('train.csv')
[(trainX, trainY), (testX, testY)] = cutData(x, y)
m = len(trainX)
n = len(trainX[0]) - 1
N = len(trainY[0])
initTheta = zeros(((n + 1), N))
flatInitTheta = ndarray.flatten(initTheta)
flatInitTheta1 = initTheta.reshape(1, -1)
In the last two lines we flatten initTheta because the fmin{,_cg,_bfgs,_powell} functions seem to only take vectors as the initial value argument x0. I also flatten initTheta using reshape in hope this answer can be of help.
There is no problem computing the cost function which takes up less than 2 seconds on my computer:
print(costFunctionFlatTheta(flatInitTheta), costFunctionFlatTheta1(flatInitTheta1))
# 0.69314718056 0.69314718056
But all the fmin functions hang, even if I set maxiter=0.
e.g.
newFlatTheta = fmin(costFunctionFlatTheta, flatInitTheta, maxiter=0)
or
newFlatTheta1 = fmin(costFunctionFlatTheta1, flatInitTheta1, maxiter=0)
When I interrupt the program, it seems to me it all hangs at lines in optimize.py calling the cost functions, lines like this:
return function(*(wrapper_args + args))
For example, if I use fmin_cg, this would be line 292 in optimize.py (Version 0.5).
How do I solve this problem?
OK I found a way to stop fmin_cg from hanging.
Basically I just need to write a function that computes the gradient of the cost function, and pass it to the fprime parameter of fmin_cg.
def gradient(theta, x, y):
return dot(x.T, vSigmoid(dot(x, theta)) - y) / m / N
def gradientFlatTheta(flatTheta):
return ndarray.flatten(gradient(flatTheta.reshape(785, 10), trainX, trainY))
Then
newFlatTheta = fmin_cg(costFunctionFlatTheta, flatInitTheta, fprime=gradientFlatTheta, maxiter=0)
terminates within seconds, and setting maxiter to a higher number (say 100) one can train the model within reasonable amount of time.
The documentation of fmin_cg says the gradient would be numerically computed if no fprime is given, which is what I suspect caused the hanging.
Thanks to this notebook by zgo2016#Kaggle which helped me find the solution.

Itertools for containers

Considder the following interactive example
>>> l=imap(str,xrange(1,4))
>>> list(l)
['1', '2', '3']
>>> list(l)
[]
Does anyone know if there is already an implementation somewhere out there with a version of imap (and the other itertools functions) such that the second time list(l) is executed you get the same as the first. And I don't want the regular map because building the entire output in memory can be a waste of memory if you use larger ranges.
I want something that basically does something like
class cmap:
def __init__(self, function, *iterators):
self._function = function
self._iterators = iterators
def __iter__(self):
return itertools.imap(self._function, *self._iterators)
def __len__(self):
return min( map(len, self._iterators) )
But it would be a waste of time to do this manually for all itertools if someone already did this.
ps.
Do you think containers are more zen then iterators since for an iterator something like
for i in iterator:
do something
implicitly empties the iterator while a container you explicitly need to remove elements.
You do not have to build such an object for each type of container. Basically, you have the following:
mkimap = lambda: imap(str,xrange(1,4))
list(mkimap())
list(mkimap())
Now you onlky need a nice wrapping object to prevent the "ugly" function calls. This could work this way:
class MultiIter(object):
def __init__(self, f, *a, **k):
if a or k:
self.create = lambda: f(*a, **k)
else: # optimize
self.create = f
def __iter__(self):
return self.create()
l = MultiIter(lambda: imap(str, xrange(1,4)))
# or
l = MultiIter(imap, str, xrange(1,4))
# or even
#MultiIter
def l():
return imap(str, xrange(1,4))
# and then
print list(l)
print list(l)
(untested, hope it works, but you should get the idea)
For your 2nd question: Iterators and containers both have their uses. You should take whatever best fits your needs.
You may be looking for itertools.tee()
Iterators are my favorite topic ;)
from itertools import imap
class imap2(object):
def __init__(self, f, *args):
self.g = imap(f,*args)
self.lst = []
self.done = False
def __iter__(self):
while True:
try: # try to get something from g
x = next(self.g)
except StopIteration:
if self.done:
# give the old values
for x in self.lst:
yield x
else:
# g was consumed for the first time
self.done = True
return
else:
self.lst.append(x)
yield x
l=imap2(str,xrange(1,4))
print list(l)
print list(l)