Populating a numpy array with the best performance - numpy

I have a few arrays a, b, c and d as shown below and would like to populate a matrix by evaluating a function f(...) which consumes a, b, c and d.
This is obviously possible with a nested for loop, but I'm looking for a more pythonic and faster way to do it.
So far I have tried np.fromfunction with no luck.
Thanks
PS: This function f has a conditional. I can still consider approaches that do not support conditionals, but if the solution supports conditionals that would be fantastic.
Example function in case it is helpful:
def fun(a,b,c,d): return a+b+c+d if a==b else a*b*c*d
Why fromfunction failed is also shown below:
>>> a = np.array([1,2,3,4,5])
>>> b = np.array([10,20,30])
>>> def fun(i,j): return a[i] * b[j]
>>> np.fromfunction(fun, (3,5))
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Users\Anaconda3\lib\site-packages\numpy\core\numeric.py", line 1853, in fromfunction
return function(*args, **kwargs)
File "<stdin>", line 1, in fun
IndexError: arrays used as indices must be of integer (or boolean) type

The reason the function fails is that np.fromfunction passes floating-point values, which are not valid as indices. You can modify your function like this to make it work:
def fun(i, j):
    return a[j.astype(int)] * b[i.astype(int)]

print(np.fromfunction(fun, (3, 5)))
[[ 10  20  30  40  50]
 [ 20  40  60  80 100]
 [ 30  60  90 120 150]]
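An alternative worth noting: np.fromfunction also accepts a dtype argument, so you can ask it to build integer index arrays in the first place and drop the astype calls. A small sketch of that variant:

def fun(i, j):
    # i and j arrive as integer index arrays because of dtype=int
    return a[j] * b[i]

print(np.fromfunction(fun, (3, 5), dtype=int))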

Jake has explained why your fromfunction approach fails. However, you don't need fromfunction for your example. You could simply add an axis to b and have numpy broadcast the shapes:
a = np.array([1,2,3,4,5])
b = np.array([10,20,30])
def fun(i,j): return a[j.astype(int)] * b[i.astype(int)]
f1 = np.fromfunction(fun, (3, 5))
f2 = b[:, None] * a
(f1 == f2).all() # True
Extending this to the function you showed that contains an if condition, you can split the conditional into two sequential operations: first build the full array from the else expression, then overwrite the entries where the condition (here i == j) holds with the if expression.
a = np.array([1, 2, 3, 4, 5])
b = np.array([5, 4, 3, 2, 1])
c = np.array([100, 200, 300, 400, 500])
d = np.array([0, 1, 2, 3])
# Calculate the values at all indices as the product
result = d[:, None] * (a * b * c)
# array([[ 0, 0, 0, 0, 0],
# [ 500, 1600, 2700, 3200, 2500],
# [1000, 3200, 5400, 6400, 5000],
# [1500, 4800, 8100, 9600, 7500]])
# Calculate sum
sum_arr = d[:, None] + (a + b + c)
# array([[106, 206, 306, 406, 506],
# [107, 207, 307, 407, 507],
# [108, 208, 308, 408, 508],
# [109, 209, 309, 409, 509]])
# Set diagonal elements (i==j) to sum:
np.fill_diagonal(result, np.diag(sum_arr))
which gives the following result:
array([[ 106, 0, 0, 0, 0],
[ 500, 207, 2700, 3200, 2500],
[1000, 3200, 308, 6400, 5000],
[1500, 4800, 8100, 409, 7500]])
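If the condition is something more general than i == j, a hedged alternative is to evaluate both branches over the full broadcast shape and pick per element with np.where; here is a small sketch that reproduces the diagonal case above using index grids (i and j are just illustrative names):

i, j = np.indices(result.shape)      # row and column index grids
prod = d[:, None] * (a * b * c)      # else branch evaluated everywhere
summ = d[:, None] + (a + b + c)      # if branch evaluated everywhere
alt = np.where(i == j, summ, prod)   # element-wise selection
(alt == result).all()                # True, given result as built above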

Related

Minimizing the peak difference of elements of two lists given some constraints

I want to minimize the peak difference of list1[i] - list2[i] using the scipy.optimize.minimize method.
The elements in list1 and list2 are floats.
For example:
list1 = [50, 50.5, 52, 53, 55, 55.5, 56, 57, 60, 61]
How do I minimize list1[i] - list2[i] given that I have two constraints:
1. list2[0] = list1[0]
2. list2[i+1] - list2[i] <= 1.5
Basically, two consecutive elements in list2 can not be separated by more than 1.5 and the first element of list2 is the first element of list1.
Maybe there is another way other than scipy.optimize.minimize but I don't know how to do this.
I think the optimum values for list2 are maybe:
list2 = [50, 50.5, 52, 53, 54.5, 55.5, 56, 57, 58.5, 60]
In this case the peak difference is 1.5.
But maybe the algorithm finds a more optimum solution where there is less difference between the elements of list1 and list2.
Here is what I have tried but failed:
import numpy as np
from scipy.optimize import minimize
list1 = [50, 50.5, 52, 53, 55, 55.5, 56, 57,60, 61]
list2 = [list1[0]]
#Define objective
def peakDifference(*args):
    global list2
    peak_error = []
    for list1_i, list2_i in zip(list1, list2):
        peak_error.append(list1_i - list2_i)
    return max(peak_error)

peak_error = peakDifference()

#Define constraints
def constraint1(*args):
    for x in range(len(list2) - 1):
        return list2[x+1] - list2[x] - 1.5

con1 = {'type': 'ineq', 'fun': constraint1}

#Optimize
sol = minimize(peakDifference, list2, constraints=con1)
Traceback (most recent call last):
  File "C:/Users/JumpStart/PycharmProjects/anglesimulation/venv/asdfgh.py", line 27, in <module>
    sol = minimize(peakDifference,list2, constraints=con1)
  File "C:\Users\JumpStart\anaconda3\lib\site-packages\scipy\optimize\_minimize.py", line 625, in minimize
    return _minimize_slsqp(fun, x0, args, jac, bounds,
  File "C:\Users\JumpStart\anaconda3\lib\site-packages\scipy\optimize\slsqp.py", line 412, in _minimize_slsqp
    a = _eval_con_normals(x, cons, la, n, m, meq, mieq)
  File "C:\Users\JumpStart\anaconda3\lib\site-packages\scipy\optimize\slsqp.py", line 486, in _eval_con_normals
    a_ieq = vstack([con['jac'](x, *con['args'])
  File "C:\Users\JumpStart\anaconda3\lib\site-packages\scipy\optimize\slsqp.py", line 486, in <listcomp>
    a_ieq = vstack([con['jac'](x, *con['args'])
  File "C:\Users\JumpStart\anaconda3\lib\site-packages\scipy\optimize\slsqp.py", line 284, in cjac
    return approx_derivative(fun, x, method='2-point',
  File "C:\Users\JumpStart\anaconda3\lib\site-packages\scipy\optimize\_numdiff.py", line 426, in approx_derivative
    return _dense_difference(fun_wrapped, x0, f0, h,
  File "C:\Users\JumpStart\anaconda3\lib\site-packages\scipy\optimize\_numdiff.py", line 497, in _dense_difference
    df = fun(x) - f0
TypeError: unsupported operand type(s) for -: 'NoneType' and 'NoneType'
Process finished with exit code 1
NLP model
Here is a "working" version of the NLP model. The model cannot be solved reliably this way, as it is non-differentiable.
import numpy as np
from scipy.optimize import minimize
list1 = np.array([50, 50.5, 52, 53, 55, 55.5, 56, 57,60, 61])
list1_0 = list1[0]
n = len(list1)
# our x variable will have one element less (first element is fixed)
list1x = np.delete(list1,0) # version with 1st element dropped
nx = len(list1x)
# objective function
# minimize the maximum difference
# Notes:
# - x excludes the first element (they are the same by definition)
# - this is non-differentiable so likely to be non-optimal
def maxDifference(x):
    return np.max(np.abs(list1x - x))

# n-1 constraints
def constraint1(x):
    return 1.5 - np.diff(np.insert(x, 0, list1_0), 1)

con1 = {'type': 'ineq', 'fun': constraint1}
#Optimize
x = 55*np.ones(nx) # initial value
sol = minimize(maxDifference, x, constraints=con1)
sol
# optimal is: x = [51.25,51.25,52.75,54.25,54.75,56.25,57.75,59.25,60.25]
# obj = 0.75
The result is:
fun: 5.0
jac: array([0., 0., 0., 0., 0., 0., 0., 0., 0.])
message: 'Optimization terminated successfully'
nfev: 20
nit: 2
njev: 2
status: 0
success: True
x: array([51.5, 53. , 54.5, 55. , 55. , 55. , 55. , 55. , 56. ])
This is non-optimal: the objective is 5 (instead of 0.75).
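One way to keep a smooth scipy model, sketched here with the same data and helpers as above (list1x, list1_0, nx, minimize), is the usual epigraph reformulation: add an auxiliary variable t, minimize t, and enforce t >= |list1x - x| through two sets of inequalities, so the objective and constraints stay differentiable:

# v = [x_1, ..., x_{n-1}, t]; minimize the auxiliary variable t
def objective(v):
    return v[-1]

def abs_cons(v):
    x, t = v[:-1], v[-1]
    # t >= |list1x - x| written as two inequality blocks
    return np.concatenate([t - (list1x - x), t + (list1x - x)])

def step_cons(v):
    # consecutive elements may not increase by more than 1.5
    return 1.5 - np.diff(np.insert(v[:-1], 0, list1_0), 1)

v0 = np.append(55 * np.ones(nx), 10.0)
sol2 = minimize(objective, v0,
                constraints=[{'type': 'ineq', 'fun': abs_cons},
                             {'type': 'ineq', 'fun': step_cons}])

The LP below remains the more robust route, since it comes with a proof of optimality.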
LP model
An LP model will find a proven optimal solution. That is much more reliable. E.g.:
import pulp as lp
list1 = [50, 50.5, 52, 53, 55, 55.5, 56, 57, 60, 61]
n = len(list1)
model = lp.LpProblem("Min_difference", lp.LpMinimize)
x = lp.LpVariable.dicts("x", (i for i in range(n)))
z = lp.LpVariable("z")
# objective
model += z
# constraints
for i in range(n):
    model += z >= x[i] - list1[i]
    model += z >= list1[i] - x[i]
for i in range(n-1):
    model += x[i+1] - x[i] <= 1.5
model += x[0] == list1[0]
model.solve()
print(lp.LpStatus[model.status])
print("obj:", z.varValue)
print([x[i].varValue for i in range(n)])
This shows:
Optimal
obj: 0.75
[50.0, 51.25, 52.75, 53.75, 55.25, 56.25, 56.75, 57.75, 59.25, 60.25]
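For reference, roughly the same LP can also be written with scipy.optimize.linprog instead of pulp; this is a sketch of the identical formulation, with the decision vector laid out as [x_0, ..., x_{n-1}, z]:

import numpy as np
from scipy.optimize import linprog

list1 = np.array([50, 50.5, 52, 53, 55, 55.5, 56, 57, 60, 61])
n = len(list1)

c = np.zeros(n + 1)
c[-1] = 1.0                                    # minimize z (the last variable)

A_ub, b_ub = [], []
for i in range(n):                             # z >= |x_i - list1[i]| as two rows
    row = np.zeros(n + 1); row[i] = 1.0; row[-1] = -1.0
    A_ub.append(row); b_ub.append(list1[i])    #  x_i - z <= list1[i]
    row = np.zeros(n + 1); row[i] = -1.0; row[-1] = -1.0
    A_ub.append(row); b_ub.append(-list1[i])   # -x_i - z <= -list1[i]
for i in range(n - 1):                         # x_{i+1} - x_i <= 1.5
    row = np.zeros(n + 1); row[i] = -1.0; row[i + 1] = 1.0
    A_ub.append(row); b_ub.append(1.5)

A_eq = np.zeros((1, n + 1)); A_eq[0, 0] = 1.0  # x_0 == list1[0]
b_eq = [list1[0]]

res = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq,
              bounds=[(None, None)] * n + [(0, None)])
print(res.fun)      # peak difference
print(res.x[:-1])   # the x values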

model.predict(sample) returning TypeError: cannot perform reduce with flexible type

I keep running into this error when I try to predict based on a fitted model.
training, testing = train_test_split(gesture, test_size = 0.2, random_state = 0)
x = training.drop('CLASS', axis = 1) # remove the Class column from Training dataframe
y = testing.drop('CLASS', axis = 1) # remove the Class column from Testing dataframe
f_train = x.values.tolist()
l_train = training['CLASS'].values.tolist() # make a list of class identifiers from Training dataframe
f_test = y.values.tolist()
knn = KNeighborsRegressor(n_neighbors = 5)
knn.fit(f_train, l_train)
predictions = knn.predict(f_test)
The error occurs in the last line of the above code and the error message is given below:
Traceback (most recent call last):
File "C:\Users\Umair Khan\Dropbox\`Shift betweeen PCs\Work\EMG Hand Gesture\Codes\ML_on_CSV.py", line 39, in <module>
predictions = knn.predict(f_test)
File "C:\Users\Umair Khan\AppData\Local\Programs\Python\Python37-32\lib\site-packages\sklearn\neighbors\_regression.py", line 185, in predict
y_pred = np.mean(_y[neigh_ind], axis=1)
File "<__array_function__ internals>", line 6, in mean
File "C:\Users\Umair Khan\AppData\Local\Programs\Python\Python37-32\lib\site-packages\numpy\core\fromnumeric.py", line 3335, in mean
out=out, **kwargs)
File "C:\Users\Umair Khan\AppData\Local\Programs\Python\Python37-32\lib\site-packages\numpy\core\_methods.py", line 151, in _mean
ret = umr_sum(arr, axis, dtype, out, keepdims)
TypeError: cannot perform reduce with flexible type
f_test is a list of lists like such [[16, 30, 35, 250, -1, 0.5, 35, 0.03, 0.02], [16, 30, 35, 250, -1, 0.5, 35, 0.03, 0.02]]
I have also tried passing an array in predict(sample) but the issue still remains.
predictions = knn.predict(np.array(f_test).astype(np.float))
We need to see more of the error traceback. And info on the function inputs, particularly shape and dtype.
I've seen this error message when working with structured arrays. But it's not obvious where those might arise in your code.
In [15]: np.ones((2,), dtype='i,i')
Out[15]: array([(1, 1), (1, 1)], dtype=[('f0', '<i4'), ('f1', '<i4')])
In [16]: np.sum(np.ones((2,), dtype='i,i'))
....
---> 87 return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
88
89
TypeError: cannot perform reduce with flexible type
Solved:
I changed the dtype of l_train from string to float and the error disappeared. f_train and f_test were already of type float.
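For anyone hitting the same error, a minimal sketch of that fix, reusing the names from the question and assuming the CLASS column was read in as strings, is to cast the labels to float before fitting:

l_train = training['CLASS'].astype(float).values.tolist()  # numeric labels instead of strings
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(f_train, l_train)
predictions = knn.predict(f_test)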

Extracting the indices of outliers in Linear Regression

The following script computes the R-squared value between two numpy arrays (x and y).
The R-squared value is very low due to outliers in the data. How can I extract the indices of those outliers?
import numpy as np, matplotlib.pyplot as plt, scipy.stats as stats
x = np.random.random_integers(1,50,50)
y = np.random.random_integers(1,50,50)
r2 = stats.linregress(x, y)[3]**2
print r2
plt.scatter(x, y)
plt.show()
An outlier is defined as: value-mean > 2*standard deviation.
You can do this with the line
[i for i in range(len(x)) if (abs(x[i] - np.mean(x)) > 2*np.std(x))]
What it does:
A list is constructed from the indices of x, where the element at that index satisfies the condition described above.
A quick test:
x = np.random.random_integers(1,50,50)
this gives me the array:
array([16, 6, 13, 18, 21, 37, 31, 8, 1, 48, 4, 40, 9, 14, 6, 45, 20,
15, 14, 32, 30, 8, 19, 8, 34, 22, 49, 5, 22, 23, 39, 29, 37, 24,
45, 47, 21, 5, 4, 27, 48, 2, 22, 8, 12, 8, 49, 12, 15, 18])
Now I add some outliers manually as there are none initially:
x[4] = 200
x[15] = 178
lets test:
[i for i in range(len(x)) if (abs(x[i] - np.mean(x)) > 2*np.std(x))]
result:
[4, 15]
Is this what you were looking for?
EDIT:
I added the abs() function in the line above, because when you are working with negative numbers this might end badly. The abs() function takes the absolute value.
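As a side note, the same indices can be found without a Python loop by operating on the whole array at once; a small sketch using the same definition of an outlier:

outlier_idx = np.where(np.abs(x - x.mean()) > 2 * x.std())[0]
# for the example above this gives array([ 4, 15])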
I think Sander's approach is the correct one, but if you must see R2 without those outliers before making a decision, here is a way to do it.
Setup data and introduce outlier:
In [1]:
import numpy as np, scipy.stats as stats
np.random.seed(123)
x = np.random.random_integers(1,50,50)
y = np.random.random_integers(1,50,50)
y[5] = 100
Calculate R2 taking out one y value at a time (along with matching x value):
m = np.eye(y.shape[0])
r2 = np.apply_along_axis(lambda a: stats.linregress(np.delete(x, a.argmax()), np.delete(y, a.argmax()))[3]**2, 0, m)
Get index of the biggest outlier:
r2.argmax()
Out[1]:
5
Get R2 when this outlier is taken out:
In [2]:
r2[r2.argmax()]
Out[2]:
0.85892084723588935
Get the value of the outlier:
In [3]:
y[r2.argmax()]
Out[3]:
100
To get the top n outliers:
In [4]:
n = 5
sorted_index = r2.argsort()[::-1]
sorted_index[:n]
Out[4]:
array([ 5, 27, 34, 0, 17], dtype=int64)
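If you then want the R2 with those n points dropped entirely, a short follow-up sketch (reusing x, y, n and sorted_index from above) is to mask them out and rerun linregress:

mask = np.ones(len(y), dtype=bool)
mask[sorted_index[:n]] = False          # drop the n flagged points
res = stats.linregress(x[mask], y[mask])
r2_clean = res.rvalue ** 2              # R-squared without them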

implementation of Hierarchical Agglomerative clustering

I am a newbie and just want to implement hierarchical agglomerative clustering for RGB images. For this I extract all the RGB values from an image and process the image. Next I compute the pairwise distances and then build the linkage. Now, from the linkage, I want to extract my original data (i.e. the RGB values) at the specified indices, together with their index ids. Here is the code I have done so far.
from PIL import Image
import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, inconsistent, fclusterdata, dendrogram

image = Image.open('image.jpg')
image = image.convert('RGB')
rgb = list(image.getdata())
im = np.array(image).reshape((-1,3))
X = pdist(im)
Y = linkage(X)
I = inconsistent(Y)
Based on the 4th column of the inconsistency matrix, I pick a minimum value of the cutoff in order to get the maximum number of clusters.
cutoff = 0.7
cluster_assignments = fclusterdata(Y, cutoff)
# Print the indices of the data points in each cluster.
num_clusters = cluster_assignments.max()
print "%d clusters" % num_clusters
indices = cluster_indices(cluster_assignments)
ind = np.array(enumerate(rgb))
for k, ind in enumerate(indices):
    print "cluster", k + 1, "is", ind
dendrogram(Y)
I got results like this
cluster 6 is [ 6 11]
cluster 7 is [ 9 12]
cluster 8 is [15]
This means cluster 6 contains the leaves with indices 6 and 11. At this point I am stuck on how to map these indices back to the original data (i.e. the RGB values), and from the RGB values to the pixels in the image. After that I have to generate a codebook to implement agglomerative clustering. I have no idea how to approach this task. I have read a lot of material but nothing gave me a clue.
Here is my solution:
import numpy as np
from scipy.cluster import hierarchy
im = np.array([[54,101,9],[ 67,89,27],[ 67,85,25],[ 55,106,1],[ 52,108,0],
[ 55,78,24],[ 19,57,8],[ 19,46,0],[ 95,110,15],[112,159,57],
[ 67,118,26],[ 76,127,35],[ 74,128,30],[ 25,62,0],[100,120,9],
[127,145,61],[ 48,112,25],[198,25,21],[203,11,10],[127,171,60],
[124,173,45],[120,133,19],[109,137,18],[ 60,85,0],[ 37,0,0],
[187,47,20],[127,170,52],[ 30,56,0]])
groups = hierarchy.fclusterdata(im, 0.7)
idx_sorted = np.argsort(groups)
group_sorted = groups[idx_sorted]
im_sorted = im[idx_sorted]
split_idx = np.where(np.diff(group_sorted) != 0)[0] + 1
np.split(im_sorted, split_idx)
output:
[array([[203, 11, 10],
[198, 25, 21]]),
array([[187, 47, 20]]),
array([[127, 171, 60],
[127, 170, 52]]),
array([[124, 173, 45]]),
array([[112, 159, 57]]),
array([[127, 145, 61]]),
array([[25, 62, 0],
[30, 56, 0]]),
array([[19, 57, 8]]),
array([[19, 46, 0]]),
array([[109, 137, 18],
[120, 133, 19]]),
array([[100, 120, 9],
[ 95, 110, 15]]),
array([[67, 89, 27],
[67, 85, 25]]),
array([[55, 78, 24]]),
array([[ 52, 108, 0],
[ 55, 106, 1]]),
array([[ 54, 101, 9]]),
array([[60, 85, 0]]),
array([[ 74, 128, 30],
[ 76, 127, 35]]),
array([[ 67, 118, 26]]),
array([[ 48, 112, 25]]),
array([[37, 0, 0]])]
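If you also need to know which original rows of im (and hence which pixels) landed in each cluster, applying the same split to the sorted index array gives that mapping; a one-line sketch reusing idx_sorted and split_idx from above:

np.split(idx_sorted, split_idx)  # original row indices, grouped per cluster in the same order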

how to vectorize an operation on a 1 dimensional array to produce a 2 dimensional matrix in numpy

I have a 1d array of values
i = np.arange(0,7,1)
and a function
# Returns a column matrix
def fn(i):
    return np.matrix([[i*2, i*3]]).T
fnv = np.vectorize(fn)
then writing
fnv(i)
gives me an error
File "<stdin>", line 1, in <module>
File "c:\Python33\lib\site-packages\numpy\lib\function_base.py",
line 1872, in __call__
return self._vectorize_call(func=func, args=vargs)
File "c:\Python33\lib\site-packages\numpy\lib\function_base.py",
line 1942, in _vectorize_call
copy=False, subok=True, dtype=otypes[0])
ValueError: setting an array element with a sequence.
The result I am looking for is a matrix with two rows and as many columns as in the input array. What is the best notation in numpy to achieve this?
For example i would equal
[1,2,3,4,5,6]
and the output would equal
[[2,4,6,8,10,12],
[3,6,9,12,15,18]]
EDIT
You should try to avoid using vectorize, because it gives the illusion of numpy efficiency, but inside it's all python loops.
If you really have to deal with user-supplied functions that take ints and return a matrix of shape (2, 1), then there probably isn't much you can do. But that seems like a really weird use case. If you can replace it with a list of functions that take an int and return an int, and that use ufuncs where needed (e.g. np.sin instead of math.sin), you can do the following:
def vectorize2(funcs):
    def fnv(arr):
        return np.vstack([f(arr) for f in funcs])
    return fnv
f2 = vectorize2((lambda x : 2 * x, lambda x : 3 * x))
>>> f2(np.arange(10))
array([[ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18],
[ 0, 3, 6, 9, 12, 15, 18, 21, 24, 27]])
Just for your reference, I have timed this vectorization against your proposed one:
f = vectorize(fn)
>>> timeit.timeit('f(np.arange(10))', 'from __main__ import np, f', number=1000)
0.28073329263679625
>>> timeit.timeit('f2(np.arange(10))', 'from __main__ import np, f2', number=1000)
0.023139129945661807
>>> timeit.timeit('f(np.arange(10000))', 'from __main__ import np, f', number=10)
2.3620706288432984
>>> timeit.timeit('f2(np.arange(10000))', 'from __main__ import np, f2', number=10)
0.002757072593169596
So there is an order of magnitude speed-up even for small arrays, growing to about a x1000 speed-up, available almost for free, for larger arrays.
ORIGINAL ANSWER
Don't use vectorize unless there is no way around it; it's slow. See the following examples:
>>> a = np.array(range(7))
>>> a
array([0, 1, 2, 3, 4, 5, 6])
>>> np.vstack((a, a+1))
array([[0, 1, 2, 3, 4, 5, 6],
[1, 2, 3, 4, 5, 6, 7]])
>>> np.vstack((a, a**2))
array([[ 0, 1, 2, 3, 4, 5, 6],
[ 0, 1, 4, 9, 16, 25, 36]])
Whatever your function is, if it can be constructed with numpy's ufuncs, you can do something like np.vstack((a, f(a))) and get what you want
A simple reimplementation of vectorize gives me what I want
def vectorize(fn):
    def do_it(array):
        return np.column_stack([fn(p) for p in array])
    return do_it
If this is not performant or there is a better way then let me know.
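For the specific example in the question (every element times 2 in the first row and times 3 in the second), np.outer builds the two-row result directly from a coefficient vector and the input; a small sketch:

i = np.arange(1, 7)
np.outer([2, 3], i)
# array([[ 2,  4,  6,  8, 10, 12],
#        [ 3,  6,  9, 12, 15, 18]])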