Memory leak (Cython + NumPy)

I'm struggling to find where the leak is in this code.

kullback.pyx:
import numpy as np
cimport numpy as np
from libcpp.vector cimport vector
import scipy.stats as st
import matplotlib.pyplot as plt

cdef vector[double] minmax(double i, dict a):
    cdef double minmax
    cdef vector[double] out
    try:
        minmax = min(list(filter(lambda x: x > i, a.keys())))
    except ValueError:
        minmax = min(a.keys())
    cdef double maxmin
    try:
        maxmin = max(list(filter(lambda x: x < i, a.keys())))
    except ValueError:
        maxmin = max(a.keys())
    out.push_back(minmax)
    out.push_back(maxmin)
    return out

def KullbackLeibler(args):
    cdef np.ndarray[np.double_t, ndim = 1] psample = args[0]
    cdef np.ndarray[np.double_t, ndim = 1] qsample = args[1]
    cdef int n = args[2]

    a = plt.hist(psample, bins = n)
    cdef np.ndarray[np.double_t, ndim = 1] ax = a[1]
    cdef np.ndarray[np.double_t, ndim = 1] ay = a[0]
    b = plt.hist(qsample, bins = ax)
    adict = dict(zip(ax, ay))
    ax = ax[:-1]

    cdef np.ndarray[np.double_t, ndim = 1] bx = b[1]
    cdef np.ndarray[np.double_t, ndim = 1] by = b[0]
    bdict = dict(zip(bx, by))
    bx = bx[:-1]

    cdef vector[double] kl
    cdef int N = np.sum(ay)
    cdef int i
    cdef double p_minmax, p_maxmin, q_minmax, q_maxmin
    cdef double KL

    for i in range(len(psample)):
        ptmp = minmax(psample[i], adict)
        p_minmax = ptmp[0]
        p_maxmin = ptmp[1]
        qtmp = minmax(psample[i], bdict)
        q_minmax = qtmp[0]
        q_maxmin = qtmp[1]
        pdensity = adict[p_maxmin] / N
        qdensity = np.max([bdict[q_maxmin] / N, 10e-20])
        KL = pdensity * np.log(pdensity / qdensity)
        kl.push_back(KL)

    cdef double res = np.sum(kl)
    del args, psample, qsample, ax, ay, bx, by, adict, bdict
    return res
Here is the main script from which I launch it.

main.py:
import kullback as klcy  # unresolved import
import datetime
import numpy as np
import pathos.pools as pp
import objgraph

np.random.seed(10)
ncore = 4
pool = pp.ProcessPool(ncore)
KL = []

for i in range(2500):
    time1 = datetime.datetime.now()
    n = 500
    x = [np.random.normal(size = n, scale = 1) for j in range(ncore)]
    y = [np.random.normal(size = n, scale = 1) for j in range(ncore)]
    data = np.array(list(zip(x, y, [n/10]*ncore)))
    kl = pool.map(klcy.KullbackLeibler, data)
    time2 = datetime.datetime.now()
    print(i, time2 - time1, sep = " ")
    print(objgraph.show_growth())
    KL.append(kl)
The function KullbackLeibler takes two arrays and an integer as input.

What I've already tried:

- Using objgraph to identify growing objects. Unfortunately it seems it doesn't work with C-defined arrays (it identifies only the list to which I'm appending the results as growing). Why can't objgraph capture the growth of np.array()?
- Deleting all the arrays at the end of the pyx function.
- Placing a gc.collect() call both in the pyx file and in the main file, but nothing changed.

Memory consumption grows linearly with the number of iterations, along with the time required for each iteration (from 0.6 s to over 4 s). It's my first attempt with Cython; any suggestion would be useful.

The problem had nothing to do with the arrays. I wasn't closing the matplotlib plots:
a = plt.hist(psample, bins = n)
b = plt.hist(qsample, bins = ax)
Even though I wasn't displaying them, they were drawn nonetheless, consuming memory that was never freed afterwards. Thanks to @DavidW in the comments for making me notice.
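A sketch of the fix (my rewrite of the two calls above, not taken verbatim from the post): compute the histograms with np.histogram, which returns the same counts and bin edges as plt.hist but never creates a figure, so there is nothing to close. The variables psample, qsample and n are the ones from the question's function:

import numpy as np

# np.histogram returns (counts, bin_edges): the same data plt.hist would
# draw, with no figure allocated behind the scenes
ay, ax = np.histogram(psample, bins=n)
by, bx = np.histogram(qsample, bins=ax)

Alternatively, keeping plt.hist and calling plt.close('all') at the end of each call would also release the figures.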

Related

Lambdify a function in two variables and plot a surface

I have a function f(x,y) where t is a parameter. I'm trying to plot the function where t = 1 for x and y values ranging from -5 to 5. The plot doesn't render.
import sympy as sp
import sympy.vector as sv  # needed for sv.CoordSys3D below
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook
C = sv.CoordSys3D("")
x, y, z = C.base_scalars()
t = sp.symbols("t")
f = sp.sin(2*sp.pi*t)*sp.exp(-(x-3*sp.sin(sp.pi*t))**2 -(y-3*sp.cos(sp.pi*t))**2)
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(projection='3d')
X = np.linspace(-5,5,100)
Y = np.linspace(-5,5,100)
xvals, yvals = np.meshgrid(X,Y)
zvals = sp.lambdify((x,y),f.subs(t,1),"numpy")(xvals,yvals)
ax.plot_surface(xvals,yvals,zvals)
plt.show()
I get the error 'int' object has no attribute 'ndim', which I don't know how to solve.
The problem is that when you execute f.subs(t,1) it returns a number (zero in this case). So, f=0 is the expression that you are going to lambdify. Let's see the function generated by lambdify:
import inspect
print(inspect.getsource(sp.lambdify((x,y),f.subs(t,1),"numpy")))
# def _lambdifygenerated(Dummy_25, Dummy_24):
# return 0
So, no matter the values and shapes of xvals and yvals, that numerical function will always return 0, which is an integer.
However, ax.plot_surface requires zvals to have the same shape as xvals and yvals. Luckily, we can easily fix that with a simple if statement:
import sympy as sp
import sympy.vector as sv
import numpy as np
import matplotlib.pyplot as plt
C = sv.CoordSys3D("")
x, y, z = C.base_scalars()
t = sp.symbols("t")
f = sp.sin(2*sp.pi*t)*sp.exp(-(x-3*sp.sin(sp.pi*t))**2 -(y-3*sp.cos(sp.pi*t))**2)
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(projection='3d')
X = np.linspace(-5,5,100)
Y = np.linspace(-5,5,100)
xvals, yvals = np.meshgrid(X,Y)
zvals = sp.lambdify((x,y),f.subs(t,1),"numpy")(xvals,yvals)
# if zvals is just a number, create a proper matrix
if not isinstance(zvals, np.ndarray):
    zvals = zvals * np.ones_like(xvals)
ax.plot_surface(xvals,yvals,zvals)
plt.show()
The fact that this doesn't render is a bug in lambdify: it doesn't work well for constant expressions.
Your real problem, though, is that the expression you are trying to plot is just zero:
In [5]: f
Out[5]: exp(-(x_ - 3*sin(pi*t))**2 - (y_ - 3*cos(pi*t))**2)*sin(2*pi*t)

In [6]: f.subs(t, 1)
Out[6]: 0
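Since sin(2*pi*t) = 0 at every integer t, substituting any non-integer value (an illustrative choice on my part, not from the original post) yields a non-trivial surface:

# sin(2*pi/4) = 1, so the Gaussian bump survives at t = 1/4
zvals = sp.lambdify((x, y), f.subs(t, sp.Rational(1, 4)), "numpy")(xvals, yvals)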

Count of Kernel Density Estimation (KDE)

I have some data (A,B) and have used seaborn to make a contour plot of it.
import pandas as pd
import seaborn as sns
# Dataframe 1
df_1 = pd.DataFrame({'A':[1,2,1,2,3,4,2,1,4], 'B': [2,1,2,1,2,3,4,2,1]})
# Plot A v B
ax = sns.kdeplot(df_1["A"], df_1["B"])
I would like to get the cumulative count, C. I'd like to make a new plot with C on the Y axis, A on the X axis, and contours of B. I think that if I could start off by making a new dataframe of A, B, H, where H is the count (the height of the volcano), then that might be a start. The resulting plot might look a bit like this:
I think I've worked it out but this solution is messy:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  # needed for the plt calls at the end
from scipy import stats
from itertools import chain
Fruit = 9 # How many were there?
# Dataframe 1
df_1 = pd.DataFrame({'A':[1,2,1,2,3,4,2,1,4], 'B': [2,1,2,1,2,3,4,2,1]})
m1 = df_1["A"]
m2 = df_1["B"]
xmin = 0
xmax = 5
ymin = 0
ymax = 5
# Kernel density estimate:
X, Y = np.mgrid[xmin:xmax:5j, ymin:ymax:5j]
positions = np.vstack([X.ravel(), Y.ravel()])
values = np.vstack([m1, m2])
kernel = stats.gaussian_kde(values)
H = np.reshape(kernel(positions).T, X.shape)
# Re-jig it
X = X.reshape((25, 1))
Y = Y.reshape((25, 1))
H = H.reshape((25, 1))
X_L = list(chain.from_iterable(X))
Y_L = list(chain.from_iterable(Y))
H_L = list(chain.from_iterable(H))
df_2 = pd.DataFrame({'A': X_L, 'B': Y_L, 'H': H_L})
# Find the cumulative count C
df_2 = df_2.sort_values('B')
C = np.cumsum(H)
C = C.reshape((25, 1))
C_L = list(chain.from_iterable(C))
df_2['C'] = pd.DataFrame(C_L, index=df_2.index)
# Scale C
Max_C = np.amax(C)
df_2.loc[:,'C'] *= Fruit / Max_C
# Break it down to constant B
df_2_B_0 = df_2[df_2['B'] == 0]
df_2_B_1 = df_2[df_2['B'] == 1]
df_2_B_2 = df_2[df_2['B'] == 2]
df_2_B_3 = df_2[df_2['B'] == 3]
df_2_B_4 = df_2[df_2['B'] == 4]
# Plot A v C
ax = df_2_B_0.plot('A','C', label='0')
df_2_B_1.plot('A','C',ax=ax, label='1')
df_2_B_2.plot('A','C',ax=ax, label='2')
df_2_B_3.plot('A','C',ax=ax, label='3')
df_2_B_4.plot('A','C',ax=ax, label='4')
plt.ylabel('C')
plt.legend(title='B')
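As a side note (a possible cleanup I'm sketching, not part of the original answer), the reshape((25, 1)) and chain.from_iterable round-trips can be replaced by ravel, which flattens each grid in the same row-major order:

# one line instead of six: ravel flattens X, Y and H in row-major order
df_2 = pd.DataFrame({'A': X.ravel(), 'B': Y.ravel(), 'H': H.ravel()})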

cython memoryview not faster than ndarray

I have one function written with regular numpy ndarrays and another with typed memoryviews. However, I couldn't get the memoryview version to run faster than the regular version (unlike many of the blogs, such as memoryview benchmarks).
Any pointers or suggestions to increase the speed of the memoryview code over the numpy alternative would be greatly appreciated, or, failing that, any glaring reason why the memoryview version is not much faster.
In the code below there are two functions, both of which take two vectors bi and xi and return a matrix. The first function, shrink_correl, is the regular numpy version and the second, shrink_correl2, is the memoryview alternative (let the file be sh_cor.pyx).
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
cimport cython
cimport numpy as np
import numpy as np
from numpy cimport ndarray as ar

# -- ***this is the Regular Cython version*** --
cpdef ar[double, ndim=2, mode='c'] shrink_correl(ar[double, ndim=1, mode='c'] bi, ar[double, ndim=1, mode='c'] xi):
    cdef:
        int n_ = xi.shape[0]
        int n__ = int(n_*(n_-1)/2)
        ar[double, ndim=2, mode='c'] f = np.zeros([n__, n_+1])
        int x__ = 0
        ar[double, ndim=2, mode='c'] f1 = np.zeros([n_, n_+1])
        ar[double, ndim=2, mode='c'] f2 = np.zeros([n__, n_+1])
        ar[double, ndim=1, mode='c'] g = np.zeros(n_+1)
        ar[double, ndim=1, mode='c'] s = np.zeros(n__)
        ar[double, ndim=2, mode='c'] cori_ = np.zeros([n_, n_])
        Py_ssize_t j, k

    with nogil:
        for j in range(0, n_-1):
            for k in range(j+1, n_):
                x__ += 1
                f[x__-1, j] = bi[k]*xi[k]*1000
                f[x__-1, k] = bi[j]*xi[j]*1000
    f1 = np.dot(np.transpose(f), f)

    with nogil:
        for j in range(0, n_):
            f1[n_, j] = xi[j]*1000
            f1[j, n_] = f1[n_, j]
    f2 = np.dot(f, np.linalg.inv(f1))

    with nogil:
        for j in range(0, n_):
            g[j] = -bi[j]*xi[j]*1000
    s = np.dot(f2, g)

    with nogil:
        for j in range(0, n_):
            cori_[j, j] = 1.0
    x__ = 0

    with nogil:
        for j in range(0, n_-1):
            for k in range(j+1, n_):
                x__ += 1
                cori_[j, k] = s[x__-1]
                cori_[k, j] = cori_[j, k]
    return cori_

# -- ***this is the MemoryView Cython version*** --
cpdef ar[double, ndim=2, mode='c'] shrink_correl2(double[:] bi, double[:] xi):
    cdef:
        int n_ = xi.shape[0]
        int n__ = int(n_*(n_-1)/2)
        double[:, ::1] f = np.zeros([n__, n_+1])
        int x__ = 0
        double[:, ::1] f1 = np.zeros([n_, n_+1])
        double[:, ::1] f2 = np.zeros([n__, n_+1])
        double[:] g = np.zeros(n_+1)
        double[:] s = np.zeros(n__)
        double[:, ::1] cori_ = np.zeros([n_, n_])
        ar[double, ndim=2, mode='c'] cori__ = np.zeros([n_, n_])
        Py_ssize_t j, k

    with nogil:
        for j in range(0, n_-1):
            for k in range(j+1, n_):
                x__ += 1
                f[x__-1, j] = bi[k]*xi[k]*1000
                f[x__-1, k] = bi[j]*xi[j]*1000
    f1 = np.dot(np.transpose(f), f)

    with nogil:
        for j in range(0, n_):
            f1[n_, j] = xi[j]*1000
            f1[j, n_] = f1[n_, j]
    f2 = np.dot(f, np.linalg.inv(f1))

    with nogil:
        for j in range(0, n_):
            g[j] = -bi[j]*xi[j]*1000
    s = np.dot(f2, g)

    with nogil:
        for j in range(0, n_):
            cori_[j, j] = 1.0
    x__ = 0

    with nogil:
        for j in range(0, n_-1):
            for k in range(j+1, n_):
                x__ += 1
                cori_[j, k] = s[x__-1]
                cori_[k, j] = cori_[j, k]
    cori__[:, :] = cori_
    return cori__
This is compiled using the following setup.py:

from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
import numpy as np
import os

ext_modules = [Extension('sh_cor', ['sh_cor.pyx'],
                         include_dirs=[np.get_include(),
                                       os.path.join(np.get_include(), 'numpy')],
                         define_macros=[('NPY_NO_DEPRECATED_API', None)],
                         extra_compile_args=['-O3', '-march=native', '-ffast-math', '-flto'],
                         libraries=['m'])]

setup(name="Sh Cor",
      cmdclass={'build_ext': build_ext},
      ext_modules=ext_modules)
The code used to test the speeds is
import numpy as np
import sh_cor  # the library created by the setup.py file
import time

b = np.random.random(400)
b = b/np.sum(b)
x = np.random.random(400) - 0.5
n = 10

t0 = time.time()
for i in range(n):
    v1 = sh_cor.shrink_correl(b, x)
t1 = time.time()
print((t1-t0)/n)

t0 = time.time()
for i in range(n):
    v2 = sh_cor.shrink_correl2(b, x)
t1 = time.time()
print((t1-t0)/n)
The output on my PC is:
0.7070999860763549 # regular numpy
0.6726999998092651 # memoryview
Using memoryviews (in the code above) gives me only a ~5% speed boost, unlike the huge speedups reported in the blogs.
@uday Give me about a week as I'm computer-less, but here's where to speed things up to get you started:

1) Instead of re-acquiring the GIL to call np.transpose, create a memoryview of the transpose BEFORE any loops: declare a second view on f, e.g. cdef double[:, ::1] f_T = np.transpose(f) (or just = f.T), so the loops that use it never need the GIL.

2) This step is a little more tricky, as you need a C/C++-style wrapper for np.dot so the dgemm call can sit inside a with nogil: block: https://gist.github.com/pv/5437087. That example looks like it works (you'll have to save the included f2pyptr.h file to wherever your project is being built; I also suspect you should add cimport numpy as np). If it needs modifications, see what I've done in another post: Calling BLAS / LAPACK directly using the SciPy interface and Cython (pointer issue?), which also covers adding MKL. Then add from cython.parallel cimport prange at the top, change all the loops from range to prange, make sure all prange sections are nogil, and cdef-declare every variable before it is operated on. You'll also need to add -fopenmp to the compiler arguments in setup.py and link against its libraries. Ask more questions if you need clarification; this isn't as easy as it should be, but with a little guidance it becomes quite simple, and once your setup.py is modified to include everything it will work going forward.

3) Probably the easiest fix: get rid of that list. Make it a numpy array or a pandas DataFrame if you need text and data. Whenever I've used lists for data, the slowdown has been incredible.
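For point 2, here is a minimal sketch of what a GIL-free dot wrapper could look like (an assumption on my part: SciPy >= 0.16, whose scipy.linalg.cython_blas bindings supersede the f2pyptr.h gist; the function name dot_nogil is illustrative, not from the post):

from scipy.linalg.cython_blas cimport dgemm

cdef void dot_nogil(double[:, ::1] A, double[:, ::1] B, double[:, ::1] C) nogil:
    # computes C = A @ B for C-contiguous arrays; BLAS is column-major,
    # so we compute C^T = B^T @ A^T by swapping the operands
    cdef int m = A.shape[0], k = A.shape[1], n = B.shape[1]
    cdef double alpha = 1.0, beta = 0.0
    cdef char trans = b'N'
    dgemm(&trans, &trans, &n, &m, &k, &alpha,
          &B[0, 0], &n, &A[0, 0], &k, &beta, &C[0, 0], &n)

The output buffer C must be allocated (e.g. with np.zeros) while the GIL is still held; the call itself can then sit inside a with nogil: block or a prange loop.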

Cython error: Undeclared name not built in:array

I am compiling this Cython code in the Sage Cell Server and I get the following error:

undeclared name not builtin: array

It displays the same error in the Sage Notebook. I think it is not recognizing the numpy array, which is strange because I have already imported numpy.
cython('''
cimport numpy as np
ctypedef np.int DTYPE

def computeDetCy(np.ndarray[DTYPE, ndim=2] matrix):
    return determ(matrix, len(matrix))

cdef inline int determ(np.ndarray[DTYPE, ndim=2] matrix, int n):
    cdef int det = 0
    cdef int p = 0
    cdef int h
    cdef int k
    cdef int i = 0
    cdef int j = 0
    cdef np.ndarray[DTYPE, ndim=2] temp = np.zeros(4, 4)
    if n == 1:
        return matrix[0][0]
    elif n == 2:
        return matrix[0][0]*matrix[1][1] - matrix[0][1]*matrix[1][0]
    else:
        for p in range(0, n):
            h = 0
            k = 0
            for i in range(1, n):
                for j in range(0, n):
                    if j == p:
                        continue
                    temp[h][k] = matrix[i][j]
                    k += 1
                    if k == (n-1):
                        h += 1
                        k = 0
            det = det + matrix[0][p] * (-1)**p * determ(temp, n-1)
        return det

computeDetCy(array([[13,42,43,22],[12,67,45,98],[23,91,18,54],[34,56,82,76]]))
''')
Yeah, but you imported it as np, not with import * (which would be a bad idea anyway), and you didn't do a regular Python import. (Sometimes you have to do both a cimport and an import; see this SO question for an example.)
However, even after
import numpy as np
and using np.array, I still get some errors
ValueError: Buffer dtype mismatch, expected 'DTYPE' but got 'long'
So this solves your question, but isn't the whole story, and the things I tried didn't work to fix this new issue.
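For what it's worth, a sketch of one plausible fix for the new error (my guess; I haven't verified it in Sage): ctypedef np.int DTYPE names the Python-level np.int rather than a C integer type, and np.zeros needs a shape tuple plus a dtype that matches the declaration:

cimport numpy as np
import numpy as np

ctypedef np.int64_t DTYPE  # C-level integer type; 'long' is 64-bit on most 64-bit Unix platforms

# inside determ:
# cdef np.ndarray[DTYPE, ndim=2] temp = np.zeros((4, 4), dtype=np.int64)
# and at the call site:
# computeDetCy(np.array([[13, 42, 43, 22], ...], dtype=np.int64))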

Cython additional typing and cimport for numpy array slow down the performance?

Below are two simple Cython methods I wrote. In the g_cython() method I added typing for the numpy arrays a and b, but surprisingly g_cython() is twice as slow as g_less_cython(). I wonder why this is happening? I thought adding types would make indexing on a and b much faster.
PS. I understand both functions can be vectorized in numpy -- I am just exploring cython optimization tricks.
import numpy as np
cimport numpy as np

def g_cython(np.ndarray[np.int_t, ndim = 1] a, percentile):
    cdef int i
    cdef int n = len(a)
    cdef np.ndarray[np.int_t, ndim = 1] b = np.zeros(n, dtype = 'int')
    for i in xrange(n):
        b[i] = np.searchsorted(percentile, a[i])
    return b

def g_less_cython(a, percentile):
    cdef int i
    b = np.zeros_like(a)
    for i in xrange(len(a)):
        b[i] = np.searchsorted(percentile, a[i])
    return b
My test case has len(a) == 1000000 and len(percentile) == 100:
def main3():
    n = 100000
    a = np.random.random_integers(0, 10000000, n)
    per = np.linspace(0, 10000000, 101)

    q = time.time()
    b = g_cython(a, per)
    q = time.time() - q
    print q

    q = time.time()
    bb = g_less_cython(a, per)
    q = time.time() - q
    print q
I tested your code; g_cython is slightly faster than g_less_cython.
Here is the test code:
import pyximport; pyximport.install()
import search_sorted
import numpy as np
import time
x = np.arange(100000, dtype=np.int32)
y = np.random.randint(0, 100000, 100000)
start = time.clock()
search_sorted.g_cython(y, x)
print time.clock() - start
start = time.clock()
search_sorted.g_less_cython(y, x)
print time.clock() - start
the output is:
0.215430514708
0.259622599945
I turned off the boundscheck and wraparound flags (note these are decorators, which need cimport cython and the @ syntax):

cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
def g_cython(np.ndarray[np.int_t, ndim = 1] a, percentile):
    ....
The difference is not notable because the call to np.searchsorted(percentile, a[i]) is the critical part, and it uses most of the CPU time.
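As a footnote (my suggestion, not part of the original answer): np.searchsorted accepts an array of values, so the whole loop collapses into one vectorized call, removing the per-element Python overhead entirely:

import numpy as np

def g_vectorized(a, percentile):
    # a single C-level call instead of len(a) Python-level calls
    return np.searchsorted(percentile, a)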