Binary-search without an explicit array - numpy

I want to perform a binary-search using e.g. np.searchsorted, however, I do not want to create an explicit array containing values. Instead, I want to define a function giving the value to be expected at the desired position of the array, e.g. p(i) = i, where i denotes the position within the array.
Generating an array of the function's values would, in my case, be neither efficient nor elegant. Is there any way to achieve this?

What about something like:
import collections.abc

class GeneratorSequence(collections.abc.Sequence):
    def __init__(self, func, size):
        self._func = func
        self._len = size

    def __len__(self):
        return self._len

    def __getitem__(self, i):
        if 0 <= i < self._len:
            return self._func(i)
        else:
            raise IndexError

    def __iter__(self):
        for i in range(self._len):
            yield self[i]
This would work with np.searchsorted(), e.g.:
import numpy as np
gen_seq = GeneratorSequence(lambda x: x ** 2, 100)
np.searchsorted(gen_seq, 9)
# 3
You could also write your own binary search function, you do not really need NumPy in this case, and it can actually be beneficial:
def bin_search(seq, item):
    first = 0
    last = len(seq) - 1
    found = False
    while first <= last and not found:
        midpoint = (first + last) // 2
        if seq[midpoint] == item:
            first = midpoint
            found = True
        elif item < seq[midpoint]:
            last = midpoint - 1
        else:
            first = midpoint + 1
    return first
Which gives identical results:
all(bin_search(gen_seq, i) == np.searchsorted(gen_seq, i) for i in range(100))
# True
Incidentally, this is also WAY faster, because np.searchsorted() first converts the sequence into a NumPy array (visiting every element), while bin_search() only ever evaluates O(log n) of them:
gen_seq = GeneratorSequence(lambda x: x ** 2, 1000000)
%timeit np.searchsorted(gen_seq, 10000)
# 1 loop, best of 3: 1.23 s per loop
%timeit bin_search(gen_seq, 10000)
# 100000 loops, best of 3: 16.1 µs per loop

Inspired by @norok2's answer, I think you can use something like this:
from collections.abc import Sequence

def f(i):
    return i * 2  # Just an example

class MySeq(Sequence):
    def __init__(self, f, maxi):
        self.maxi = maxi
        self.f = f

    def __getitem__(self, x):
        if x < 0 or x > self.maxi:
            raise IndexError()
        return self.f(x)

    def __len__(self):
        return self.maxi + 1
In this case f is your function and maxi is the maximum index. This of course only works if the function f returns values in sorted order.
At this point you can use an object of type MySeq inside np.searchsorted.
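For example, a quick check with the f above:

import numpy as np

seq = MySeq(f, 100)       # behaves like the array [0, 2, 4, ..., 200]
np.searchsorted(seq, 10)  # 5, since f(5) == 10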

Related

Unable to iterate a list to a thread

I am trying to pass a JSON return value between functions, but I get errors, so I convert the JSON to a list. However, I cannot iterate over the list in a while loop unless I hard-code the number of items.
Full code is:
import sys
import time

# imports omitted in the original; assuming PyQt5
from PyQt5.QtCore import QThread, pyqtSignal
from PyQt5.QtWidgets import QApplication, QDialog, QTextEdit

class BackendThread(QThread):
    update_date = pyqtSignal(list)

    def run(self):
        device_mode = []
        while True:
            # do stuff and get json_return
            for xx in json_return["result"]["devices"]:
                for y in xx["nodes"]:
                    if y['type'] == "FRAME_BUFFER":
                        data = xx["device_id"] + "\n" + y['configuration']['display_mode']
                        device_mode.append(data)
            self.update_date.emit(str(device_mode))
            device_mode = []
            time.sleep(1)

class Window(QDialog):
    def __init__(self):
        QDialog.__init__(self)
        self.resize(400, 400)
        self.input = QTextEdit(self)
        self.input.resize(400, 400)
        self.initUI()

    def initUI(self):
        self.backend = BackendThread()
        self.backend.update_date.connect(self.handleDisplay)
        self.backend.start()

    def handleDisplay(self, data):
        count = 0
        while count < 11:
            self.input.setText(data[count])
            count += 1

if __name__ == '__main__':
    app = QApplication(sys.argv)
    win = Window()
    win.show()
    sys.exit(app.exec_())
So this part does not work; I only get the last item in the list:
count = 0
while count < 11:
    self.input.setText(data[count])
    count += 1
When I do this, it works, but I cannot hard-code the item number because the list will never have the same number of items:
self.input.setText(data[0])
self.input.setText(data[1])
self.input.setText(data[2])
etc
Any ideas as to how to get that while loop working?
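For what it's worth, a minimal sketch of one likely fix (assuming the thread emits the list itself, i.e. self.update_date.emit(device_mode), so that data really is a list of strings): setText() overwrites the QTextEdit's entire contents on every call, which is why only the last item survives the loop. Joining the items once sidesteps the hard-coded count entirely:

def handleDisplay(self, data):
    # build one string from however many items arrived,
    # instead of overwriting the widget once per item
    self.input.setText("\n".join(data))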

Can you solve maximum gap for a chain of elements in SQL?

I have a difficult query to write in SQL (PostgreSQL). I have tried to explain the problem below.
I have a chain of elements, each having a max gap to the next, and I want to calculate the "distance" matrix. Take the following 4 elements:
example_id,id,max_gap
0,0,2
0,1,5
0,2,
0,3,4
then the max_gap between each pair of elements should be the following for this example:
example_id,id_from,id_to,max_gap
0,0,0,0
0,0,1,2
0,0,2,7
0,0,3,
0,1,0,-2
0,1,1,0
0,1,2,5
0,1,3,
0,2,0,-7
0,2,1,-5
0,2,2,0
0,2,3,
0,3,0,
0,3,1,
0,3,2,
0,3,3,0
If any of the gaps between two elements is unbounded (empty in the CSV), then the max_gap between those two elements is unbounded as well.
The challenge is to solve this problem in SQL (since I need to have this in a SQL trigger).
The following Python code can be used to create test cases:
from random import randint, random
from itertools import groupby

n_examples = 100

def generate_examples(n):
    out = []
    for i in range(n):
        for j in range(randint(1, 10)):
            max_dist = randint(0, 10)
            if random() > 0.75:
                max_dist = None
            out.append([i, j, max_dist])
    return out

def max_dist_between_all(example):
    example_id = example[0][0]
    n = len(example)
    return [(example_id, i, j, calc_dist(i, j, example))
            for i in range(n) for j in range(n)]

def calculate_max_dist_between_all_examples(examples):
    return [result
            for _, example in groupby(examples, lambda x: x[0])
            for result in max_dist_between_all(list(example))]

def calc_dist(i, j, example):
    if j < i:
        i, j = j, i
        sign = -1
    else:
        sign = 1
    max_dist = 0
    for k in range(i, j):
        max_dist_between_step = example[k][2]
        if max_dist_between_step is None:
            return None
        max_dist += max_dist_between_step
    return sign * max_dist

examples = generate_examples(n_examples)

def print_in_csv(input_, headers):
    print(",".join(headers))
    print("\n".join([",".join(str(e) if e is not None else "" for e in l)
                     for l in input_]))

print_in_csv(examples, ["example_id", "id", "max_gap"])
print()
print_in_csv(calculate_max_dist_between_all_examples(examples),
             ["example_id", "id_from", "id_to", "max_gap"])
Do you just want a self join?
select e1.example_id, e1.id, e2.id, e1.max_gap - e2.max_gap
from elements e1
join elements e2
  on e1.example_id = e2.example_id
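That subtraction only looks at the two rows themselves, though. To reproduce the semantics of calc_dist() above (summing the gaps strictly between the two positions and propagating the unbounded NULL gaps), something along these lines with window functions might work; a sketch, assuming the data lives in a table elements(example_id, id, max_gap):

with cum as (
    select example_id, id,
           -- running sum of the gaps before this element (NULLs skipped)...
           coalesce(sum(max_gap) over w, 0) as cum_gap,
           -- ...plus a running count of the NULL (unbounded) gaps
           coalesce(sum(case when max_gap is null then 1 else 0 end) over w, 0) as cum_nulls
    from elements
    window w as (partition by example_id order by id
                 rows between unbounded preceding and 1 preceding)
)
select e1.example_id, e1.id as id_from, e2.id as id_to,
       -- differing NULL counts mean an unbounded gap lies between the two ids
       case when e1.cum_nulls = e2.cum_nulls
            then e2.cum_gap - e1.cum_gap
       end as max_gap
from cum e1
join cum e2 on e1.example_id = e2.example_id;

On the 4-row example above this reproduces the expected matrix, including the signed values and the empty cells.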

Scipy Spatial Distance Sub-module rejects Numpy Array

I have a dataframe named "df" with 4 columns. Three of the columns are independent variables: x1, x2, and x3; the other, y, is the dependent variable.
I would like to calculate the distance ("pdist") between the dependent variable and each of the independent variables, so I first converted each column to a NumPy array as follows:
y = df[["y"]].values
x1 = df[["x1"]].values
x2 = df[["x2"]].values
x3 = df[["x3"]].values
When I feed these arrays through this coding pipeline I got from GitHub:
import copy

import numpy as np
from scipy.spatial.distance import pdist, squareform

def distance_correlation(Xval, Yval, pval=True, nruns=500):
    X, Y = np.atleast_1d(Xval), np.atleast_1d(Yval)
    if np.prod(X.shape) == len(X):
        X = X[:, None]
    if np.prod(Y.shape) == len(Y):
        Y = Y[:, None]
    X, Y = np.atleast_2d(X), np.atleast_2d(Y)
    n = X.shape[0]
    if Y.shape[0] != X.shape[0]:
        raise ValueError('Number of samples must match')
    a, b = squareform(pdist(X)), squareform(pdist(Y))
    A = a - a.mean(axis=0)[None, :] - a.mean(axis=1)[:, None] + a.mean()
    B = b - b.mean(axis=0)[None, :] - b.mean(axis=1)[:, None] + b.mean()
    dcov2_xy = (A * B).sum() / float(n * n)
    dcov2_xx = (A * A).sum() / float(n * n)
    dcov2_yy = (B * B).sum() / float(n * n)
    dcor = np.sqrt(dcov2_xy) / np.sqrt(np.sqrt(dcov2_xx) * np.sqrt(dcov2_yy))
    if pval:
        greater = 0
        for i in range(nruns):
            Y_r = copy.copy(Yval)
            np.random.shuffle(Y_r)
            if distance_correlation(Xval, Y_r, pval=False) > dcor:
                greater += 1
        return (dcor, greater / float(nruns))
    else:
        return dcor
distance_correlation(x1, y, pval=True, nruns=500)
I get this error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-32-c720c9df4e97> in <module>
----> 1 distance_correlation(bop_sp500, price, pval=True, nruns=500)
<ipython-input-17-e0b3aea12c32> in distance_correlation(Xval, Yval, pval, nruns)
9 n = X.shape[0]
10 if Y.shape[0] != X.shape[0]:raise ValueError('Number of samples must match')
---> 11 a, b = squareform(pdist(X)),squareform(pdist(Y))
12 A = a - a.mean(axis=0)[None, :] - a.mean(axis=1)[:, None] + a.mean()
13 B = b - b.mean(axis=0)[None, :] - b.mean(axis=1)[:, None] + b.mean()
~\Anaconda3\lib\site-packages\scipy\spatial\distance.py in pdist(X, metric, *args, **kwargs)
1997 s = X.shape
1998 if len(s) != 2:
-> 1999 raise ValueError('A 2-dimensional array must be passed.')
2000
2001 m, n = s
ValueError: A 2-dimensional array must be passed.
Could anyone identify where I am going wrong? I know the error originates from the way I created my NumPy arrays, but I have no clue how to fix it.
Please explain it with examples that use my variable definitions; I am new to Python.
OK, so I finally managed to figure out the cause of the problem I faced: the NumPy array being fed into the helper function was a 2d array, while the helper function requires a "NumPy vector", i.e. a 1d NumPy array.
The easiest way to create one is with numpy.ravel(). Hence, for my datasets, the code would be as follows (I have broken the steps down for simplicity):
# Create Arrays
y = df[["y"]].values
x1 = df[["x1"]].values
x2 = df[["x2"]].values
x3 = df[["x3"]].values
# Ravel Them
y = y.ravel()
x1 = x1.ravel()
x2 = x2.ravel()
x3 = x3.ravel()
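To see why the original arrays were 2d, a minimal illustration (hypothetical data; the double brackets in df[["y"]] select a one-column DataFrame, so .values is a column matrix):

import pandas as pd

df = pd.DataFrame({"y": [1.0, 2.0, 3.0]})

df[["y"]].values.shape          # (3, 1) -- 2d column matrix, which trips up pdist here
df[["y"]].values.ravel().shape  # (3,)   -- flat 1d vector
df["y"].values.shape            # (3,)   -- single brackets give a 1d array directly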

`scipy.optimize` functions hang even with `maxiter=0`

I am trying to train on the MNIST data (which I downloaded from Kaggle) with simple multi-class logistic regression, but the scipy.optimize functions hang.
Here's the code:
import csv
from math import exp
from numpy import *
from scipy.optimize import fmin, fmin_cg, fmin_powell, fmin_bfgs

# Prepare the data

def getIiter(ifname):
    """
    Get the iterator from a csv file with filename ifname
    """
    ifile = open(ifname, 'r')
    iiter = csv.reader(ifile)
    iiter.__next__()
    return iiter

def parseRow(s):
    y = [int(x) for x in s]
    lab = y[0]
    z = y[1:]
    return (lab, z)

def getAllRows(ifname):
    iiter = getIiter(ifname)
    x = []
    l = []
    for row in iiter:
        lab, z = parseRow(row)
        x.append(z)
        l.append(lab)
    return x, l

def cutData(x, y):
    """
    70% training
    30% testing
    """
    m = len(x)
    t = int(m * .7)
    return [(x[:t], y[:t]), (x[t:], y[t:])]

def num2IndMat(l):
    t = array(l)
    tt = [vectorize(int)((t == i)) for i in range(10)]
    return array(tt).T

def readData(ifname):
    x, l = getAllRows(ifname)
    t = [[1] + y for y in x]
    return array(t), num2IndMat(l)

# Calculate the cost function

def sigmoid(x):
    return 1 / (1 + exp(-x))

vSigmoid = vectorize(sigmoid)
vLog = vectorize(log)

def costFunction(theta, x, y):
    sigxt = vSigmoid(dot(x, theta))
    cm = (- y * vLog(sigxt) - (1 - y) * vLog(1 - sigxt)) / m / N
    return sum(cm)

def unflatten(flatTheta):
    return [flatTheta[i * N : (i + 1) * N] for i in range(n + 1)]

def costFunctionFlatTheta(flatTheta):
    return costFunction(unflatten(flatTheta), trainX, trainY)

def costFunctionFlatTheta1(flatTheta):
    return costFunction(flatTheta.reshape(785, 10), trainX, trainY)

x, y = readData('train.csv')
[(trainX, trainY), (testX, testY)] = cutData(x, y)

m = len(trainX)
n = len(trainX[0]) - 1
N = len(trainY[0])

initTheta = zeros(((n + 1), N))
flatInitTheta = ndarray.flatten(initTheta)
flatInitTheta1 = initTheta.reshape(1, -1)
In the last two lines we flatten initTheta because the fmin{,_cg,_bfgs,_powell} functions seem to take only vectors as the initial-value argument x0. I also flattened initTheta using reshape, in the hope that that variant would help.
There is no problem computing the cost function, which takes less than 2 seconds on my computer:
print(costFunctionFlatTheta(flatInitTheta), costFunctionFlatTheta1(flatInitTheta1))
# 0.69314718056 0.69314718056
But all the fmin functions hang, even if I set maxiter=0.
e.g.
newFlatTheta = fmin(costFunctionFlatTheta, flatInitTheta, maxiter=0)
or
newFlatTheta1 = fmin(costFunctionFlatTheta1, flatInitTheta1, maxiter=0)
When I interrupt the program, it seems to hang at the lines in optimize.py that call the cost functions, lines like this:
return function(*(wrapper_args + args))
For example, if I use fmin_cg, this would be line 292 in optimize.py (Version 0.5).
How do I solve this problem?
OK I found a way to stop fmin_cg from hanging.
Basically I just need to write a function that computes the gradient of the cost function, and pass it to the fprime parameter of fmin_cg.
def gradient(theta, x, y):
    return dot(x.T, vSigmoid(dot(x, theta)) - y) / m / N

def gradientFlatTheta(flatTheta):
    return ndarray.flatten(gradient(flatTheta.reshape(785, 10), trainX, trainY))
Then
newFlatTheta = fmin_cg(costFunctionFlatTheta, flatInitTheta, fprime=gradientFlatTheta, maxiter=0)
terminates within seconds, and setting maxiter to a higher number (say 100) trains the model in a reasonable amount of time.
The documentation of fmin_cg says the gradient will be numerically approximated if no fprime is given, which is what I suspect caused the apparent hang: with 7850 parameters, each numerical gradient estimate requires thousands of cost-function evaluations, so even the initial gradient computed at maxiter=0 takes a very long time.
Thanks to this notebook by zgo2016 on Kaggle, which helped me find the solution.

how to use Apache Commons Math Optimization in Jython?

I want to port MATLAB code to Jython, and I found that fminsearch in MATLAB might be replaced by the Apache Commons Math optimization package.
I'm coding in the Mango medical-image script manager, which uses Jython 2.5.3 as its scripting language, and the Commons Math version is 3.6.1.
Here is my code:
def f(x, y):
    return x^2 + y^2

sys.path.append('/home/shujian/APPs/Mango/lib/commons-math3-3.6.1.jar')

sys.add_package('org.apache.commons.math3.analysis')
from org.apache.commons.math3.analysis import MultivariateFunction
sys.add_package('org.apache.commons.math3.optim.nonlinear.scalar.noderiv')
from org.apache.commons.math3.optim.nonlinear.scalar.noderiv import NelderMeadSimplex, SimplexOptimizer
sys.add_package('org.apache.commons.math3.optim.nonlinear.scalar')
from org.apache.commons.math3.optim.nonlinear.scalar import ObjectiveFunction
sys.add_package('org.apache.commons.math3.optim')
from org.apache.commons.math3.optim import MaxEval, InitialGuess
sys.add_package('org.apache.commons.math3.optimization')
from org.apache.commons.math3.optimization import GoalType

initialSolution = [2.0, 2.0]
simplex = NelderMeadSimplex([2.0, 2.0])
opt = SimplexOptimizer(2**(-6), 2**(-10))
solution = opt.optimize(MaxEval(300), ObjectiveFunction(f), simplex,
                        GoalType.MINIMIZE, InitialGuess([2.0, 2.0]))
skewParameters2 = solution.getPointRef()
print skewParameters2
And I got the error below:
TypeError: optimize(): 1st arg can't be coerced to
I'm quite confused about how to use the optimizer from Jython, and the examples I can find are all in Java.
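For reference, a sketch of how the call might have been made to work (an untested guess, not part of the original post): a plain Python function is not a MultivariateFunction, so f would need to be wrapped in a class implementing the interface's value(double[]) method. Note also that ^ is XOR in Python (** is power), and that GoalType should come from org.apache.commons.math3.optim.nonlinear.scalar rather than the deprecated org.apache.commons.math3.optimization package:

from org.apache.commons.math3.analysis import MultivariateFunction
from org.apache.commons.math3.optim.nonlinear.scalar import GoalType, ObjectiveFunction

class Objective(MultivariateFunction):
    def value(self, point):
        # point arrives as a Java double[]; use ** for powers (^ would be XOR)
        return point[0] ** 2 + point[1] ** 2

solution = opt.optimize(MaxEval(300), ObjectiveFunction(Objective()),
                        GoalType.MINIMIZE, InitialGuess([2.0, 2.0]), simplex)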
I've given up on this plan and found another way to perform the fminsearch in Jython. Below is the Jython version of the code:
import sys
sys.path.append('.../jnumeric-2.5.1_ra0.1.jar')  # add the JNumeric path
import Numeric as np

def nelder_mead(f, x_start,
                step=0.1, no_improve_thr=10e-6,
                no_improv_break=10, max_iter=0,
                alpha=1., gamma=2., rho=-0.5, sigma=0.5):
    '''
    @param f (function): function to optimize, must return a scalar score
        and operate over a numpy array of the same dimensions as x_start
    @param x_start (float list): initial position
    @param step (float): look-around radius in initial step
    @param no_improv_thr, no_improv_break (float, int): break after
        no_improv_break iterations with an improvement lower than no_improv_thr
    @param max_iter (int): always break after this number of iterations.
        Set it to 0 to loop indefinitely.
    @param alpha, gamma, rho, sigma (floats): parameters of the algorithm
        (see Wikipedia page for reference)
    return: tuple (best parameter array, best score)
    '''
    # init
    dim = len(x_start)
    prev_best = f(x_start)
    no_improv = 0
    res = [[np.array(x_start), prev_best]]
    for i in range(dim):
        x = np.array(x_start)
        x[i] = x[i] + step
        score = f(x)
        res.append([x, score])

    # simplex iter
    iters = 0
    while 1:
        # order
        res.sort(key=lambda x: x[1])
        best = res[0][1]

        # break after max_iter
        if max_iter and iters >= max_iter:
            return res[0]
        iters += 1

        # break after no_improv_break iterations with no improvement
        print '...best so far:', best
        if best < prev_best - no_improve_thr:
            no_improv = 0
            prev_best = best
        else:
            no_improv += 1
        if no_improv >= no_improv_break:
            return res[0]

        # centroid
        x0 = [0.] * dim
        for tup in res[:-1]:
            for i, c in enumerate(tup[0]):
                x0[i] += c / (len(res)-1)

        # reflection
        xr = x0 + alpha*(x0 - res[-1][0])
        rscore = f(xr)
        if res[0][1] <= rscore < res[-2][1]:
            del res[-1]
            res.append([xr, rscore])
            continue

        # expansion
        if rscore < res[0][1]:
            xe = x0 + gamma*(x0 - res[-1][0])
            escore = f(xe)
            if escore < rscore:
                del res[-1]
                res.append([xe, escore])
                continue
            else:
                del res[-1]
                res.append([xr, rscore])
                continue

        # contraction
        xc = x0 + rho*(x0 - res[-1][0])
        cscore = f(xc)
        if cscore < res[-1][1]:
            del res[-1]
            res.append([xc, cscore])
            continue

        # reduction
        x1 = res[0][0]
        nres = []
        for tup in res:
            redx = x1 + sigma*(tup[0] - x1)
            score = f(redx)
            nres.append([redx, score])
        res = nres
And the test example is as below:
def f(x):
    return x[0]**2 + x[1]**2 + x[2]**2

print nelder_mead(f, [3.4, 2.3, 2.2])
Actually, the original version is written for CPython with NumPy, and the link below is the source:
https://github.com/fchollet/nelder-mead