About the numpy.where statement - numpy

I would like to use the numpy.where to check the value of a previous row but don't know how to code
for n1 in range(len(image1)):
print('input image ',input_folder+'\\' + image1[n1])
print('\n')
print('image1[n1] ',image1[n1])
print('\n')
im = Image.open(input_folder+'\\'+image1[n1])
a = np.array(im, dtype='uint8')
width, height = im.size
print('width ',width)
print('height ',height)
a = np.where(a==[0,0,0],[255,255,255],a)
!-- Change the looping statement to np.where --!
for h in range(height):
for w in range(width):
if h <= (height - 2) and w <= (width - 2):
if a[h,w,0] != 255 and a[h,w,1] != 255 and a[h,w,2] != 255:
if (a[h-1,w,0] == 255 and a[h-1,w,1] == 255 and a[h-1,w,2] == 255 and a[h+1,w,0] == 255 and a[h+1,w,1] == 255 and a[h+1,w,2] == 255) or (a[h,w-1,0] == 255 and a[h,w-1,1] == 255 and a[h,w-1,2] == 255 and a[h,w+1,0] == 255 and a[h,w+1,1] == 255 and a[h,w+1,2] == 255):***
Change the above looping statement to np.where(a[-??] = [255,255,255] or a[+??] = [255,255,255]) so it can run faster than the for-loop version. -->
a[h,w,0] = 255
a[h,w,1] = 255
a[h,w,2] = 255

I'm afraid, you can not use np.where here.
The reason is that:
the condition passed to np.where should indicate each element of the
source array,
whereas the criterion in your code actually relates only to first 2
dimensions of the source array.
So I came up with another, quite elegant and concise solution.
Part 1: How to get first two indices of elements, where all elements
in the third dimension are != 255:
To do it on the whole array, you could run:
np.not_equal(a, 255).all(axis=2)
Part 2: How to limit the "range of operation" to elements having both
previous and next row and column.
You can do it passing to the above code a "subrange" of the original array:
np.not_equal(a[1:-1, 1:-1], 255).all(axis=2)
You should eliminate both the first and the last column and row (in
your code you failed to eliminate the first row / column).
But note that this time the resulting indices are by one less than before,
so at the later step you will have to add 1 to them.
Part 3: A function to check whether all elements along the third dimension
== 255, for some row (r) and column (c):
def all_eq(arr, r, c):
    """Tell whether every element of arr[r, c] equals 255."""
    pixel = arr[r, c]
    return (pixel == 255).all()
(will be used soon).
Part 4: How to get the result:
res = a.copy()
for r, c in zip(*np.where(np.not_equal(a[1:-1, 1:-1], 255).all(axis=2))):
h = r + 1
w = c + 1
if all_eq(a, h-1, w) and all_eq(a, h+1, w) or\
all_eq(a, h, w-1) and all_eq(a, h, w+1):
res[h, w] = 255
Note that this code starts from making a copy of the original array
(it will hold the result).
Then, for r, c in zip(…) iterates over the indices found.
First 2 lines in the loop add 1 to the indices found, in the subrange
of the original array, so now h and w indicate row / column in the whole
original array.
Then if checks whether the respective adjacent pixels have 255 in all elements.
If they do, then put 255 in all elements of the "current" pixel, in the result.
You can't operate on the original array, since changed values in some pixels
would "falsify" the evaluation of conditions for subsequent pixels.
Edit
After some research I found, that it is possible to use np.where,
although the solution is a bit complicated and involving quite a big
number of Numpy methods:
# Mask 1: Pixels with all elements != 255
m1 = np.not_equal(a, 255).all(axis=2)
# Pixels with all elements == 255 (a single vectorized reduction over the
# last axis, instead of calling a Python lambda once per pixel)
m2 = np.equal(a, 255).all(axis=2)
# Both adjacent pixels (left / right) == 255; the inserted False column
# stands in for the neighbour that does not exist at the image border
m2a = np.logical_and(np.insert(m2, 0, False, axis=1)[:, :-1],
                     np.insert(m2, width, False, axis=1)[:, 1:])
# Both adjacent pixels (up / down) == 255; same padding trick along rows
m2b = np.logical_and(np.insert(m2, 0, False, axis=0)[:-1, :],
                     np.insert(m2, height, False, axis=0)[1:, :])
# Mask 2: Both adjacent pixels (either vertically or horizontally) == 255
m2 = np.logical_or(m2a, m2b)
# The "final" mask
msk = np.logical_and(m1, m2)
# Generate the result: expand the 2-D mask so it broadcasts over the last axis
result = np.where(np.expand_dims(msk, 2), 255, a)
This solution should be substantially faster than my first concept.

Related

Find pairs of array such as array_1 = -array_2

I am looking for a way to find all the vectors from a np.meshgrid(xrange, xrange, xrange) that are related by k = -k.
For the moment I do that :
#numba.njit
def find_pairs(array):
    """Greedily pair each row with the first remaining row equal to its negation.

    Returns a tuple ``(boolean, pairs)``. ``pairs`` is a list of ``[i, j]``
    index pairs with ``array[i] == -array[j]`` element-wise (a row that is
    its own negation is paired with itself). ``boolean`` starts all True and
    is set to False at the second index ``j`` of every pair.

    NOTE: the head of the work list is only removed once a partner has been
    found. If the head has no partner, the work list never shrinks and the
    outer loop does not terminate.
    """
    keep = np.ones(len(array), dtype=np.bool_)
    pairs = []
    remaining = list(range(len(array)))
    while len(remaining) > 1:
        head = remaining[0]
        for candidate in remaining:
            if not (array[head] == -array[candidate]).all():
                continue
            keep[candidate] = False
            pairs.append([head, candidate])
            # Mutating the list is safe here: we leave the for loop right after.
            remaining.remove(head)
            if candidate != head:
                remaining.remove(candidate)
            break
    return keep, pairs
# Give array of 3D vectors
krange = np.fft.fftfreq(N)
comb_array = np.array(np.meshgrid(krange, krange, krange)).T.reshape(-1, 3)
# Take idx of the pairs k, -k vector and boolean selection that give position of -k vectors
# (pass the array built above; no variable named `array` exists at this level)
boolean, pairs = find_pairs(comb_array)
It works, but the execution time grows rapidly with N...
Maybe someone has already dealt with that?
The main problem is that comb_array has a shape of (R, 3) where R = N**3, and the nested loop in find_pairs runs in at least quadratic time, since idx.remove runs in linear time and is called inside the for loop. Moreover, there are cases where the for loop does not change the size of idx, so the loop appears to run forever (e.g. with N=4).
One solution to solve this problem in O(R log R) is to sort the array and then check for opposite values in linear time:
import numpy as np
import numba as nb
# Give array of 3D vectors
krange = np.fft.fftfreq(N)
comb_array = np.array(np.meshgrid(krange, krange, krange)).T.reshape(-1, 3)
# Sorting
packed = comb_array.view([('x', 'f8'), ('y', 'f8'), ('z', 'f8')])
idx = np.argsort(packed, axis=0).ravel()
sorted_comb = comb_array[idx]
# Find pairs
#nb.njit
def findPairs(sorted_comb, idx):
    """Pair each row of a lexicographically sorted array with its negation.

    Parameters
    ----------
    sorted_comb : 2-D array whose rows are sorted in ascending
        lexicographic order.
    idx : 1-D integer array mapping positions in ``sorted_comb`` back to
        positions in the unsorted array (``sorted_comb == original[idx]``).

    Returns
    -------
    (boolean, pairs) : ``pairs`` is a list of ``[idx[i], idx[j]]`` with
        ``sorted_comb[i] == -sorted_comb[j]`` (an all-zero row pairs with
        itself); ``boolean`` is True at the first original index of each pair.

    Each row is used in at most one pair, so duplicated rows are matched
    two by two. Rows without a partner are skipped without discarding the
    candidates that later rows still need.
    """
    n = idx.size
    boolean = np.zeros(n, dtype=np.bool_)
    pairs = []
    cur = n - 1
    for i in range(n):
        # As i grows the rows grow, so the wanted partner -row[i] shrinks:
        # `cur` only ever needs to move downwards.
        target = (-sorted_comb[i]).tolist()
        # Rows lexicographically above `target` cannot match row i, nor any
        # later row (whose target is even smaller), so drop them for good.
        while cur >= i and sorted_comb[cur].tolist() > target:
            cur -= 1
        if cur < i:
            # The two pointers crossed: every remaining row was handled.
            break
        if sorted_comb[cur].tolist() == target:
            boolean[idx[i]] = True
            pairs.append([idx[i], idx[cur]])
            cur -= 1
        # Otherwise row[cur] < target: row i has no partner; keep `cur`
        # where it is so the next rows can still be matched.
    return boolean, pairs
findPairs(sorted_comb, idx)
Note that the algorithm assumes that each row has at most one valid matching partner. If there are several equal rows, they are paired two by two. If your goal is to extract all the combinations of equal rows in this case, then please note that the output will grow exponentially (which is not reasonable IMHO).
This solution is pretty fast even for N = 100. Most of the time is spent in the sort that is not very efficient (unfortunately Numpy does not provide a way to do a lexicographic argsort of the row efficiently yet though this operation is fundamentally expensive).

Map elements of multiple columns in Pandas

I'm trying to label some values in a DataFrame in Pandas based on the value itself, in-place.
df = pd.read_csv('data/extrusion.csv')
# get list of columns that contain thickness
columns = [c for c in data.columns if 'SDickeIst'.lower() in c.lower()]
# create a function that returns the class based on value
def get_label(ser):
ser.map(lambda x : x if x == 0 else 1)
df[columns].apply(get_label)
I would expect that the apply function takes each column in particular and applies get_label on it. In turn, get_label gets the ser argument as a Series and uses map to map each element != 0 with 1.
get_label doesn't return anything.
You want to return ser.map(lambda x : x if x == 0 else 1).
def get_label(ser):
return ser.map(lambda x : x if x == 0 else 1)
Besides that, apply doesn't act in-place, it always returns a new object. Therefore you need
df[columns] = df[columns].apply(get_label)
But in this simple case, using DataFrame.where should be much faster if you are dealing with large DataFrames.
df[columns] = df[columns].where(lambda x: x == 0, 1)

Crop sides of a numpy array by w elements (where w may be zero)

I'd like to generalize the following to allow the number w of cropped elements to possibly be zero:
a = np.arange(42).reshape(6, 7)
w = 1 # Number of elements to crop on each side.
print(a[w:-w, w:-w])
And also in this generalization to arbitrary dimensions:
def crop(array, width):
width = np.broadcast_to(width, array.ndim)
return array[tuple(slice(w, -w) for w in width)]
The problem is that a slice with stop=0 returns no elements.
What is the most elegant solution? Is there any existing library function?
It would be wonderful if numpy.pad would allow negative values for cropping, but apparently it does not.
It may be necessary to write a custom function:
def crop_array(array, width) -> np.ndarray:
    """Crop `width` elements from both ends of every axis of `array`.

    `width` is broadcast to one value per axis, so it may be a scalar or a
    sequence with one entry per dimension. A width of 0 leaves that axis
    untouched.

    Raises:
        ValueError: if any width is negative.
    """
    array = np.asarray(array)
    width = np.broadcast_to(width, array.ndim)
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if not np.all(width >= 0):
        raise ValueError("crop width must be non-negative")
    # slice(w, -w) with w == 0 would be slice(0, 0), i.e. empty, so a zero
    # width needs the full slice instead.
    return array[tuple((slice(None) if w == 0 else slice(w, -w)) for w in width)]

Is the numpy sum method superfluous in this code?

I am reading a book, and found an error as below:
def relu(x):
return (x>0)*x
def relu2dev(x):
return (x>0)
street_lights = np.array([[1,0,1],[0,1,1],[0,0,1],[1,1,1]])
walk_stop = np.array([[1,1,0,0]]).T
alpha = 0.2
hidden_size = 4
weights_0_1 = 2*np.random.random((3,hidden_size))-1
weights_1_2 = 2*np.random.random((hidden_size,1))-1
for it in range(60):
layer_2_error = 0;
for i in range(len(street_lights)):
layer_0 = street_lights[i:i+1]
layer_1 = relu(np.dot(layer_0,weights_0_1))
layer_2 = np.dot(layer_1,weights_1_2)
layer_2_delta = (layer_2-walk_stop[i:i+1])
# -> layer_2_delta's shape is (1,1), so why np.sum?
layer_2_error += np.sum((layer_2_delta)**2)
layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2dev(layer_1)
weights_1_2 -= alpha * layer_1.T.dot(layer_2_delta)
weights_0_1 -= alpha * layer_0.T.dot(layer_1_delta)
if(it % 10 == 9):
print("Error: " + str(layer_2_error))
The error place is commented with # ->:
layer_2_delta's shape is (1,1), so why would one use np.sum? I think np.sum can be removed, but not quite sure, since it comes from a book.
As you say, layer_2_delta has a shape of (1,1). This means it is a 2 dimensional array with one element: layer_2_delta = np.array([[X]]). However, layer_2_error is a scalar. So you can get the scalar from the array by either selecting the value at the first index (layer_2_delta[0,0]) or by summing all the elements (which in this case is just the one). As the book seems to use "sum of square errors", it seems natural to keep the notation which is square each element in array and then add all of these up (for instruction purposes): this would be more general (e.g., to cases where the layer has more than one element) than the index approach. But you're right, there could be other ways to do this :).

Retrieve indices for rows of a PyTables table matching a condition using `Table.where()`

I need the indices (as numpy array) of the rows matching a given condition in a table (with billions of rows) and this is the line I currently use in my code, which works, but is quite ugly:
indices = np.array([row.nrow for row in the_table.where("foo == 42")])
It also takes half a minute, and I'm sure that the list creation is one of the reasons why.
I could not find an elegant solution yet and I'm still struggling with the pytables docs, so does anybody know any magical way to do this more beautifully and maybe also a bit faster? Maybe there is special query keyword I am missing, since I have the feeling that pytables should be able to return the matched rows indices as numpy array.
tables.Table.get_where_list() gives indices of the rows matching a given condition
I read the source of pytables: where() is implemented in Cython, but it does not seem to be fast enough. Here is a more complex method that can speed it up:
Create some data first:
from tables import *
import numpy as np
class Particle(IsDescription):
name = StringCol(16) # 16-character String
idnumber = Int64Col() # Signed 64-bit integer
ADCcount = UInt16Col() # Unsigned short integer
TDCcount = UInt8Col() # unsigned byte
grid_i = Int32Col() # 32-bit integer
grid_j = Int32Col() # 32-bit integer
pressure = Float32Col() # float (single-precision)
energy = Float64Col() # double (double-precision)
h5file = open_file("tutorial1.h5", mode = "w", title = "Test file")
group = h5file.create_group("/", 'detector', 'Detector information')
table = h5file.create_table(group, 'readout', Particle, "Readout example")
particle = table.row
for i in range(1001000):
particle['name'] = 'Particle: %6d' % (i)
particle['TDCcount'] = i % 256
particle['ADCcount'] = (i * 256) % (1 << 16)
particle['grid_i'] = i
particle['grid_j'] = 10 - i
particle['pressure'] = float(i*i)
particle['energy'] = float(particle['pressure'] ** 4)
particle['idnumber'] = i * (2 ** 34)
# Insert a new particle record
particle.append()
table.flush()
h5file.close()
Read the column in chunks and append the indices into a list and concatenate the list to array finally. You can change the chunk size according to your memory size:
h5file = open_file("tutorial1.h5")
table = h5file.get_node("/detector/readout")
size = 10000  # rows read per chunk; adjust to the available memory
col = "energy"
# One reusable buffer of `size` rows, so reading a chunk allocates nothing new.
buf = np.zeros(size, dtype=table.coldtypes[col])
res = []
for start in range(0, table.nrows, size):
    # The last chunk may be shorter than `size`.
    length = min(size, table.nrows - start)
    # Read exactly `length` rows so the stop index matches the output slice.
    data = table.read(start, start + length, field=col, out=buf[:length])
    tmp = np.where(data > 10000)[0]
    # Turn chunk-local positions into absolute row numbers.
    tmp += start
    res.append(tmp)
res = np.concatenate(res)