draw a line in tensorflow - tensorflow

I want to create a human pose skeleton estimation network and for this, I have a two-part network, first part generates 16 heatmaps as output(each heatmap for different joint and hence a key point can be extracted), using these 16 key points I wish to create a human skeleton and feed it to second half of my network. My problem is, how do I draw lines between the key points to create the skeleton? I couldn't find a way to do it on a tensor object using tensorflow or keras.

I know i'm a bit late but here is some code that I think does what you're after (in TFv2.3). Hopefully it will save someone time in the future!
It uses solely tensorflow ops, so you can use it in data loaders etc. The real pain here is that Tensorflow doesn't allow Eager Assignment, so you can't just update tensors by index. This works around that by creating two sparse tensors, one for the mask (where to apply the line) and another for the new_values (what value to apply at the line). The code for simply designing the line might not be applicable in your case (based on https://stackoverflow.com/a/47381058) but ported away from numpy.
import tensorflow as tf
def trapez(y, y0, w):
return tf.clip_by_value(tf.minimum(y + 1 + w/2 - y0, -y + 1 + w/2 + y0), 0, 1)
def apply_output(img, yy, xx, val):
stack = tf.stack([yy, xx], axis=1)
stack = tf.cast(stack, tf.int64)
values = tf.ones(stack.shape[0], tf.float32)
mask = tf.sparse.SparseTensor(indices=stack, values=values, dense_shape=img.shape)
mask = tf.sparse.reorder(mask)
mask = tf.sparse.to_dense(mask)
mask = tf.cast(mask, tf.float32)
new_values = tf.sparse.SparseTensor(indices=stack, values=val, dense_shape=img.shape)
new_values = tf.sparse.reorder(new_values)
new_values = tf.sparse.to_dense(new_values)
img = img * (1 - mask) + new_values * mask
img = tf.cast(tf.expand_dims(img * 255, axis=-1), tf.uint8)
return img
def weighted_line(img, r0, c0, r1, c1, w):
output = img
x = tf.range(c0, c1 + 1, dtype=tf.float32)
slope = (r1-r0) / (c1-c0)
w *= tf.sqrt(1 + tf.abs(slope)) / 2
y = x * slope + (c1*r0-c0*r1) / (c1-c0)
thickness = tf.math.ceil(w/2)
yy = (tf.reshape(tf.math.floor(y), [-1, 1]) + tf.reshape(tf.range(-thickness-1, thickness+2), [1, -1]))
xx = tf.repeat(x, yy.shape[1])
values = tf.reshape(trapez(yy, tf.reshape(y, [-1, 1]), w), [-1])
yy = tf.reshape(yy, [-1])
limits_y = tf.math.logical_and(yy >= 0, yy < img.shape[0])
limits_x = tf.math.logical_and(xx >= 0, xx < img.shape[1])
limits = tf.math.logical_and(limits_y, limits_x)
limits = tf.math.logical_and(limits, values > 0)
yy = tf.cast(yy[limits], tf.float32)
xx = tf.cast(xx[limits], tf.float32)
return yy, xx, values[limits], apply_output(output, yy, xx, values[limits])
Just for a sanity check, you can call it with the following, and display it using opencv
if __name__ == "__main__":
IMG = tf.zeros((500, 500), tf.float32)
yy, xx, vals, FINAL_IMG = weighted_line(IMG, 10, 20, 100, 200, 5)
jpeg_string = tf.io.encode_jpeg(FINAL_IMG)
tf.io.write_file("output.jpg", jpeg_string)
import cv2
img = cv2.imread("output.jpg")
cv2.imshow("Output", img)
cv2.waitKey(0)

Related

vectorized way to multiply and add specific axes in numpy array (convolutional layer backprop)

I was wondering how it would be possible to vectorize the following quadruple for-loops (this is t do with backprop in a convolutional layer).
W = np.ones((2, 2, 3, 8)) # just a toy example
dW = np.zeros(W.shape)
dZ = np.ones((10, 4, 4, 8))*2
# get the shapes: m = samples/images; H_dim = Height of image; W_dim = Width of image; 8 = Channels/filters
(m, H_dim, W_dim, C) = dZ.shape
dA_prev = np.zeros((10, 4, 4, 3))
# add symmetric padding of 2 around height and width borders with 0-values; shape is now: (10, 8, 8, 3)
dA_prev = np.pad(dA_prev,((0,0),(2,2),(2,2),(0,0)), mode='constant', constant_values = (0,0))
# loop over images
for i in range(m):
# loop over height
for h in range(H_dim):
# loop over width
for w in range(W_dim):
# loop over channels/filters
for c in range(C):
vert_start = 1 * h # 1 = stride, just as an example
vert_end = vert_start + 2 # 2 = vertical filter size, just as an example
horiz_start = 1 * w # 1 = stride
horiz_end = horiz_start + 2 # 2 = horizontal filter size, just as an example
dW[:,:,:,c] += dA_prev[i, vert_start:vert_end,horiz_start:horiz_end,:] * dZ[i, h, w, c]
dA_prev[i, vert_start:vert_end, horiz_start:horiz_end, :] += W[:,:,:,c] * dZ[i, h, w, c] # dZ[i, h, w, c] is a scalar
doing backprop on the bias is easy enough (db = np.sum(dZ, axis=(0,1,2), keepdims=True)), and the weights can be dealt with using stride tricks and by reshaping the dZ and then using the dot product the rescaled input (or tensordot on the axes or einsum).
def _striding(array, stride_size, filter_shapes, Wout=None, Hout=None):
strides = (array.strides[0], array.strides[1] * stride_size, array.strides[2] * stride_size, array.strides[1], array.strides[2], array.strides[3])
strided = as_strided(array, shape=(array.shape[0], Hout, Wout, filter_shapes[0], filter_shapes[1], array.shape[3]), strides=strides, writeable=False)
return strided
Hout = (A_prev.shape[1] - 2) // 1 + 1
Wout = (A_prev.shape[2] - 2) // 1 + 1
x_flat = _striding(array=A_prev, stride_size=2, filter_shapes=(2,2),
Wout=Wout, Hout=Hout).reshape(-1, 2 * 2 * A_prev.shape[3])
dout_descendant_flat = dout_descendant.reshape(-1, n_C)
dW = x_flat.T # dout_descendant_flat # shape (fh * fw * n_prev_C, C)
dW = dW.reshape(fh, fw, n_prev_C, C)
this gives identical results as dW in the slow version. but doing something similar to get the derivative wrt to the input that should yield the same result, doesn't. here's what i've done:
dZ_pad = np.pad(dZ,((0,0),(2,2),(2,2),(0,0)), mode='constant', constant_values = (0,0)) # padding to get the same shape as A_prev
dZ_pad_reshaped = _striding(array=dZ_pad, stride_size=1, filter_shapes=(2,2),
Wout=4, Hout=4) # the Hout and Wout dims are from the unpadded dims of A_prev
Wrot180 = np.rot90(W, 2, axes=(0,1)) # the filter height and width are in the first two axes, which we want to rotate
dA_prev = np.tensordot(dZ_pad_reshaped, Wrot180, axes=([3,4,5],[0,1,3]))
the shapes of dA_prev are right, but for some reason the results aren't identical as the slow version
OK, turns out the error was to do with several things:
dZ needed to be dilated relative to the stride in the forward propagation
the window function used for windowing dZ (done after dilation of dZ) needed to be called with stride 1 (no matter the stride choice in the forward propagation) with the output heights and widths of the padded input (not the original, unpadded input -- this was the main mistake that took me days to debug)
the relevant code is below with comments explaining shapes and operations as well as some further sources for reading. i've also included the forward propagation.
i should note that after days of debugging, writing various functions, reading etc. the variable names changed after a while, so for the ease of reading, here are the names of the variables as defined in my question and then their equivalent in the code below:
A_prev is x
dZ is dout_descendant
Hout is the height of dout_descendant
Wout is the width of dout_descendant
(as one would expect all references to self are to the class these functions are part of)
def _pad(self, array, pad_size, pad_val):
'''
only symmetric padding is implemented
'''
return np.pad(array, ((0, 0), (pad_size, pad_size), (pad_size, pad_size), (0, 0)), 'constant', constant_values=(pad_val, pad_val))
def _dilate(self, array, stride_size, pad_size, symmetric_filter_shape, output_image_size):
# on dilation for backprop with stride>1,
# see: https://medium.com/#mayank.utexas/backpropagation-for-convolution-with-strides-8137e4fc2710
# see also: https://leimao.github.io/blog/Transposed-Convolution-As-Convolution/
pad_bottom_and_right = (output_image_size + 2 * pad_size - symmetric_filter_shape) % stride_size
for m in range(stride_size - 1):
array = np.insert(array, range(1, array.shape[1], m + 1), 0, axis=1)
array = np.insert(array, range(1, array.shape[2], m + 1), 0, axis=2)
for _ in range(pad_bottom_and_right):
array = np.insert(array, array.shape[1], 0, axis=1)
array = np.insert(array, array.shape[2], 0, axis=2)
return array
def _windows(self, array, stride_size, filter_shapes, out_height, out_width):
'''
inputs:
array to create windows of
stride_size: int
filter_shapes: tuple(int): tuple of filter height and width
out_height and out_width: int, respectively: output sizes for the windows
returns:
windows of array with shape (excl. dilation):
array.shape[0], out_height, out_width, filter_shapes[0], filter_shapes[1], array.shape[3]
'''
strides = (array.strides[0], array.strides[1] * stride_size, array.strides[2] * stride_size, array.strides[1], array.strides[2], array.strides[3])
return np.lib.stride_tricks.as_strided(array, shape=(array.shape[0], out_height, out_width, filter_shapes[0], filter_shapes[1], array.shape[3]), strides=strides, writeable=False)
def forward(self, x):
'''
expects inputs to be of shape: [batchsize, height, width, channel in]
after init, filter_shapes are: [fh, fw, channel in, channel out]
'''
self.input_shape = x.shape
x_pad = self._pad(x, self.pad_size, self.pad_val)
self.input_pad_shape = x_pad.shape
# get the shapes
batch_size, h, w, Cin = self.input_shape
# calculate output sizes; only symmetric padding is possible
self.Hout = (h + 2*self.pad_size - self.fh) // self.stride + 1
self.Wout = (w + 2*self.pad_size - self.fw) // self.stride + 1
x_windows = self._windows(array=x_pad, stride_size=self.stride, filter_shapes=(self.fh, self.fw),
out_width=self.Wout, out_height=self.Hout) # 2D matrix with shape (batch_size, Hout, Wout, fh, fw, Cin)
self.out = np.tensordot(x_windows, self.w, axes=([3,4,5], [0,1,2])) + self.b
self.inputs = x_windows
## alternative 1: einsum approach, slower than other alternatives
# self.out = np.einsum('noufvc,fvck->nouk', x_windows, self.w) + self.b
## alternative 2: column approach with simple dot product
# z = x_windows.reshape(-1, self.fh * self.fw * Cin) # self.W.reshape(self.fh*self.fw*Cin, Cout) + self.b # 2D matrix with shape (batch_size * Hout * Wout, Cout)
# self.dout = z.reshape(batch_size, Hout, Wout, Cout)
def backward(self,dout_descendant):
'''
dout_descendant has shape (batch_size, Hout, Wout, Cout)
'''
# get shapes
batch_size, h, w, Cin = self.input_shape
# we want to sum everything but the filters for b
self.db = np.sum(dout_descendant, axis=(0,1,2), keepdims=True) # shape (1,1,1, Cout)
# for dW we'll use the column approach with ordinary dot product for variety ;) tensordot does the same without all the reshaping
dout_descendant_flat = dout_descendant.reshape(-1, self.Cout) # new shape (batch_size * Hout * Wout, Cout)
x_flat = self.inputs.reshape(-1, self.fh * self.fw * Cin) # shape (batch_size * Hout * Wout, fh * fw * Cin)
dw = x_flat.T # dout_descendant_flat # shape (fh * fw * Cin, Cout)
self.dw = dw.reshape(self.fh, self.fw, Cin, self.Cout)
del dout_descendant_flat # free memory
# for dinputs: we'll get padded and dilated windows of dout_descendant and perform the tensordot with 180 rotated W
# for details, see https://medium.com/#mayank.utexas/backpropagation-for-convolution-with-strides-8137e4fc2710 ; also: https://pavisj.medium.com/convolutions-and-backpropagations-46026a8f5d2c ; also: https://youtu.be/Lakz2MoHy6o?t=835
Wrot180 = np.rot90(self.w, 2, axes=(0,1)) # or also self.w[::-1, ::-1, :, :]
# backprop for forward with stride > 1 is done on windowed dout that's padded and dilated with stride 1
dout_descendant = self._dilate(dout_descendant, stride_size=self.stride, pad_size=self.pad_size, symmetric_filter_shape=self.fh, output_image_size=h)
dout_descendant = self._pad(dout_descendant, pad_size=self.fw-1, pad_val=self.pad_val) # pad dout_descendant to dim: fh-1 (or fw-1); only symmetrical filters are supported
dout_descendant = self._windows(array=dout_descendant, stride_size=1, filter_shapes=(self.fh, self.fw),
out_height=h + 2 * self.pad_size, out_width=w + 2 * self.pad_size) # shape: (batch_size * h_padded * w_padded, fh * fw * Cout)
self.dout = np.tensordot(dout_descendant, Wrot180, axes=([3,4,5],[0,1,3]))
self.dout = self.dout[:,self.pad_size:-self.pad_size, self.pad_size:-self.pad_size, :]
## einsum alternative, but slower:
# dinput = np.einsum('nhwfvk,fvck->nhwc', dout_windows, self.W)
i've left this answer here, because all the other sources on stackoverflow or github i could find that used numpy stride tricks were implemented for convolutions of stride 1 (which doesn't require dilation of dZ) or they used very complex fancy indexing operations that were extremely hard to follow (e.g. https://sgugger.github.io/convolution-in-depth.html#convolution-in-depth or https://github.com/parasdahal/deepnet/blob/51a9e61c351138b7dc637f4b748a0e6ca2e15595/deepnet/im2col.py)

How to implement custom Keras ordinal loss function with tensor evaluation without disturbing TF>2.0 Model Graph?

I am trying to implement a custom loss function in Tensorflow 2.4 using the Keras backend.
The loss function is a ranking loss; I found the following paper with a somewhat log-likelihood loss: Chen et al. Single-Image Depth Perception in the Wild.
Similarly, I wanted to sample some (in this case 50) points from an image to compare the relative order between ground-truth and predicted depth maps using the NYU-Depth dataset. Being a fan of Numpy, I started working with that but came to the following exception:
ValueError: No gradients provided for any variable: [...]
I have learned that this is caused by the arguments not being filled when calling the loss function but instead, a C function is compiled which is then used later. So while I know the dimensions of my tensors (4, 480, 640, 1), I cannot work with the data as wanted and have to use the keras.backend functions on top so that in the end (if I understood correctly), there is supposed to be a path between the input tensors from the TF graph and the output tensor, which has to provide a gradient.
So my question now is: Is this a feasible loss function within keras?
I have already tried a few ideas and different approaches with different variations of my original code, which was something like:
def ranking_loss_function(y_true, y_pred):
# Chen et al. loss
y_true_np = K.eval(y_true)
y_pred_np = K.eval(y_pred)
if y_true_np.shape[0] != None:
num_sample_points = 50
total_samples = num_sample_points ** 2
err_list = [0 for x in range(y_true_np.shape[0])]
for i in range(y_true_np.shape[0]):
sample_points = create_random_samples(y_true, y_pred, num_sample_points)
for x1, y1 in sample_points:
for x2, y2 in sample_points:
if y_true[i][x1][y1] > y_true[i][x2][y2]:
#image_relation_true = 1
err_list[i] += np.log(1 + np.exp(-1 * y_pred[i][x1][y1] + y_pred[i][x2][y2]))
elif y_true[i][x1][y1] < y_true[i][x2][y2]:
#image_relation_true = -1
err_list[i] += np.log(1 + np.exp(y_pred[i][x1][y1] - y_pred[i][x2][y2]))
else:
#image_relation_true = 0
err_list[i] += np.square(y_pred[i][x1][y1] - y_pred[i][x2][y2])
err_list = np.divide(err_list, total_samples)
return K.constant(err_list)
As you can probably tell, the main idea was to first create the sample points and then based on the existing relation between them in y_true/y_pred continue with the corresponding computation from the cited paper.
Can anyone help me and provide some more helpful information or tips on how to correctly implement this loss using keras.backend functions? Trying to include the ordinal relation information really confused me compared to standard regression losses.
EDIT: Just in case this causes confusion: create_random_samples() just creates 50 random sample points (x, y) coordinate pairs based on the shape[1] and shape[2] of y_true (image width and height)
EDIT(2): After finding this variation on GitHub, I have tried out a variation using only TF functions to retrieve data from the tensors and compute the output. The adjusted and probably more correct version still throws the same exception though:
def ranking_loss_function(y_true, y_pred):
#In the Wild ranking loss
y_true_np = K.eval(y_true)
y_pred_np = K.eval(y_pred)
if y_true_np.shape[0] != None:
num_sample_points = 50
total_samples = num_sample_points ** 2
bs = y_true_np.shape[0]
w = y_true_np.shape[1]
h = y_true_np.shape[2]
total_samples = total_samples * bs
num_pairs = tf.constant([total_samples], dtype=tf.float32)
output = tf.Variable(0.0)
for i in range(bs):
sample_points = create_random_samples(y_true, y_pred, num_sample_points)
for x1, y1 in sample_points:
for x2, y2 in sample_points:
y_true_sq = tf.squeeze(y_true)
y_pred_sq = tf.squeeze(y_pred)
d1_t = tf.slice(y_true_sq, [i, x1, y1], [1, 1, 1])
d2_t = tf.slice(y_true_sq, [i, x2, y2], [1, 1, 1])
d1_p = tf.slice(y_pred_sq, [i, x1, y1], [1, 1, 1])
d2_p = tf.slice(y_pred_sq, [i, x2, y2], [1, 1, 1])
d1_t_sq = tf.squeeze(d1_t)
d2_t_sq = tf.squeeze(d2_t)
d1_p_sq = tf.squeeze(d1_p)
d2_p_sq = tf.squeeze(d2_p)
if d1_t_sq > d2_t_sq:
# --> Image relation = 1
output.assign_add(tf.math.log(1 + tf.math.exp(-1 * d1_p_sq + d2_p_sq)))
elif d1_t_sq < d2_t_sq:
# --> Image relation = -1
output.assign_add(tf.math.log(1 + tf.math.exp(d1_p_sq - d2_p_sq)))
else:
output.assign_add(tf.math.square(d1_p_sq - d2_p_sq))
return output/num_pairs
EDIT(3): This is the code for create_random_samples():
(FYI: Because it was weird to get the shape from y_true in this case, I first proceeded to hard-code it here as I know it for the dataset which I am currently using.)
def create_random_samples(y_true, y_pred, num_points=50):
y_true_shape = (4, 480, 640, 1)
y_pred_shape = (4, 480, 640, 1)
if y_true_shape[0] != None:
num_samples = num_points
population = [(x, y) for x in range(y_true_shape[1]) for y in range(y_true_shape[2])]
sample_points = random.sample(population, num_samples)
return sample_points

Faster way to patchify a picture to overlapping blocks

I'm looking for a FAST (and if possible memory afficiant) way to rewrite a function I crerated as part of Visual bag of words algorithm:
def get_pic_patches(pic, l, s): # "s" stands for stride
r, c = pic.shape
i, j = [0, 0]
x_range = list(range(0, r, s ) )
y_range = list(range(0, c , s ) )
patches = []
patches_location = []
for x in x_range: # without last two since it will exceed dimensions
for y in y_range: # without last two since it will exceed dimensions
if x+ l<= r and y+l <= c:
patch = pic[x:x + l , y:y + l ]
patches_location.append([x, y]) # patch location is the upper left pixel
patches.append( patch )
return patches, patches_location
it takes a grayscale image (NOT RGB!), desired patch length and stride value,
and gives back all patches as a list of numpy array.
On other qestions, I found this:
def patchify(img, patch_shape):
img = np.ascontiguousarray(img) # won't make a copy if not needed
X, Y = img.shape
x, y = patch_shape
shape = ((X-x+1), (Y-y+1), x, y) # number of patches, patch_shape
strides = img.itemsize*np.array([Y, 1, Y, 1])
return np.lib.stride_tricks.as_strided(img, shape=shape, strides=strides)
in order to get to return a list, I used it like this:
def patchify(img, patch_shape):
img = np.ascontiguousarray(img) # won't make a copy if not needed
X, Y = img.shape
x, y = patch_shape
shape = ((X-x+1), (Y-y+1), x, y) # number of patches, patch_shape
strides = img.itemsize*np.array([Y, 1, Y, 1])
patches = np.lib.stride_tricks.as_strided(img, shape=shape, strides=strides)
a,b,c,d = patches.shape
patches = patches.reshape(((a*b),c,d))
patches = patches.tolist()
return
but this was actually much slower than my original function! another problem is that is only works with stride = 1, and I want to be able to use all sorts of stride values.

Why does the cost in my implementation of a deep neural network increase after a few iterations?

I am a beginner in machine learning and neural networks. Recently, after watching Andrew Ng's lectures on deep learning, I tried to implement a binary classifier using deep neural networks on my own.
However, the cost of the function is expected to decrease after each iteration.
In my program, it decreases slightly in the beginning, but rapidly increases later. I tried to make changes in learning rate and number of iterations, but to no avail. I am very confused.
Here is my code
1. Neural network classifier class
class NeuralNetwork:
def __init__(self, X, Y, dimensions, alpha=1.2, iter=3000):
self.X = X
self.Y = Y
self.dimensions = dimensions # Including input layer and output layer. Let example be dimensions=4
self.alpha = alpha # Learning rate
self.iter = iter # Number of iterations
self.length = len(self.dimensions)-1
self.params = {} # To store parameters W and b for each layer
self.cache = {} # To store cache Z and A for each layer
self.grads = {} # To store dA, dZ, dW, db
self.cost = 1 # Initial value does not matter
def initialize(self):
np.random.seed(3)
# If dimensions is 4, then layer 0 and 3 are input and output layers
# So we only need to initialize w1, w2 and w3
# There is no need of w0 for input layer
for l in range(1, len(self.dimensions)):
self.params['W'+str(l)] = np.random.randn(self.dimensions[l], self.dimensions[l-1])*0.01
self.params['b'+str(l)] = np.zeros((self.dimensions[l], 1))
def forward_propagation(self):
self.cache['A0'] = self.X
# For last layer, ie, the output layer 3, we need to activate using sigmoid
# For layer 1 and 2, we need to use relu
for l in range(1, len(self.dimensions)-1):
self.cache['Z'+str(l)] = np.dot(self.params['W'+str(l)], self.cache['A'+str(l-1)]) + self.params['b'+str(l)]
self.cache['A'+str(l)] = relu(self.cache['Z'+str(l)])
l = len(self.dimensions)-1
self.cache['Z'+str(l)] = np.dot(self.params['W'+str(l)], self.cache['A'+str(l-1)]) + self.params['b'+str(l)]
self.cache['A'+str(l)] = sigmoid(self.cache['Z'+str(l)])
def compute_cost(self):
m = self.Y.shape[1]
A = self.cache['A'+str(len(self.dimensions)-1)]
self.cost = -1/m*np.sum(np.multiply(self.Y, np.log(A)) + np.multiply(1-self.Y, np.log(1-A)))
self.cost = np.squeeze(self.cost)
def backward_propagation(self):
A = self.cache['A' + str(len(self.dimensions) - 1)]
m = self.X.shape[1]
self.grads['dA'+str(len(self.dimensions)-1)] = -(np.divide(self.Y, A) - np.divide(1-self.Y, 1-A))
# Sigmoid derivative for final layer
l = len(self.dimensions)-1
self.grads['dZ' + str(l)] = self.grads['dA' + str(l)] * sigmoid_prime(self.cache['Z' + str(l)])
self.grads['dW' + str(l)] = 1 / m * np.dot(self.grads['dZ' + str(l)], self.cache['A' + str(l - 1)].T)
self.grads['db' + str(l)] = 1 / m * np.sum(self.grads['dZ' + str(l)], axis=1, keepdims=True)
self.grads['dA' + str(l - 1)] = np.dot(self.params['W' + str(l)].T, self.grads['dZ' + str(l)])
# Relu derivative for previous layers
for l in range(len(self.dimensions)-2, 0, -1):
self.grads['dZ'+str(l)] = self.grads['dA'+str(l)] * relu_prime(self.cache['Z'+str(l)])
self.grads['dW'+str(l)] = 1/m*np.dot(self.grads['dZ'+str(l)], self.cache['A'+str(l-1)].T)
self.grads['db'+str(l)] = 1/m*np.sum(self.grads['dZ'+str(l)], axis=1, keepdims=True)
self.grads['dA'+str(l-1)] = np.dot(self.params['W'+str(l)].T, self.grads['dZ'+str(l)])
def update_parameters(self):
for l in range(1, len(self.dimensions)):
self.params['W'+str(l)] = self.params['W'+str(l)] - self.alpha*self.grads['dW'+str(l)]
self.params['b'+str(l)] = self.params['b'+str(l)] - self.alpha*self.grads['db'+str(l)]
def train(self):
np.random.seed(1)
self.initialize()
for i in range(self.iter):
#print(self.params)
self.forward_propagation()
self.compute_cost()
self.backward_propagation()
self.update_parameters()
if i % 100 == 0:
print('Cost after {} iterations is {}'.format(i, self.cost))
2. Testing code for odd or even number classifier
import numpy as np
from main import NeuralNetwork
X = np.array([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
Y = np.array([[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]])
clf = NeuralNetwork(X, Y, [1, 1, 1], alpha=0.003, iter=7000)
clf.train()
3. Helper Code
import math
import numpy as np
def sigmoid_scalar(x):
return 1/(1+math.exp(-x))
def sigmoid_prime_scalar(x):
return sigmoid_scalar(x)*(1-sigmoid_scalar(x))
def relu_scalar(x):
if x > 0:
return x
else:
return 0
def relu_prime_scalar(x):
if x > 0:
return 1
else:
return 0
sigmoid = np.vectorize(sigmoid_scalar)
sigmoid_prime = np.vectorize(sigmoid_prime_scalar)
relu = np.vectorize(relu_scalar)
relu_prime = np.vectorize(relu_prime_scalar)
Output
I believe your cross-entropy derivative is wrong. Instead of this:
# WRONG!
self.grads['dA'+str(len(self.dimensions)-1)] = -(np.divide(self.Y, A) - np.divide(1-self.Y, A))
... do this:
# CORRECT
self.grads['dA'+str(len(self.dimensions)-1)] = np.divide(A - self.Y, (1 - A) * A)
See these lecture notes for the details. I think you meant the formula (5), but forgot 1-A. Anyway, use formula (6).

tensorflow giving nans when calculating gradient with sparse tensors

The following snippet is from a fairly large piece of code but hopefully I can give all the information necessary:
y2 = tf.matmul(y1,ymask)
dist = tf.norm(ystar-y2,axis=0)
y1 and y2 are 128x30 and ymask is 30x30. ystar is 128x30. dist is 1x30. When ymask is the identity matrix, everything works fine. But when I set it to be all zeros, apart from a single 1 along the diagonal (so as to set all columns but one in y2 to be zero), I get nans for the gradient of dist with respect to y2, using tf.gradients(dist, [y2]). The specific value of dist is [0,0,7.9,0,...], with all the ystar-y2 values being around the range (-1,1) in the third column and zero elsewhere.
I'm pretty confused as to why a numerical issue would occur here, given there are no logs or divisions, is this underflow? Am I missing something in the maths?
For context, I'm doing this to try to train individual dimensions of y, one at a time, using the whole network.
longer version to reproduce:
import tensorflow as tf
import numpy as np
import pandas as pd
batchSize = 128
eta = 0.8
tasks = 30
imageSize = 32**2
groups = 3
tasksPerGroup = 10
trainDatapoints = 10000
w = np.zeros([imageSize, groups * tasksPerGroup])
toyIndex = 0
for toyLoop in range(groups):
m = np.ones([imageSize]) * np.random.randn(imageSize)
for taskLoop in range(tasksPerGroup):
w[:, toyIndex] = m * 0.1 * np.random.randn(1)
toyIndex += 1
xRand = np.random.normal(0, 0.5, (trainDatapoints, imageSize))
taskLabels = np.matmul(xRand, w) + np.random.normal(0,0.5,(trainDatapoints, groups * tasksPerGroup))
DF = np.concatenate((xRand, taskLabels), axis=1)
trainDF = pd.DataFrame(DF[:trainDatapoints, ])
# define graph variables
x = tf.placeholder(tf.float32, [None, imageSize])
W = tf.Variable(tf.zeros([imageSize, tasks]))
b = tf.Variable(tf.zeros([tasks]))
ystar = tf.placeholder(tf.float32, [None, tasks])
ymask = tf.placeholder(tf.float32, [tasks, tasks])
dataLength = tf.cast(tf.shape(ystar)[0],dtype=tf.float32)
y1 = tf.matmul(x, W) + b
y2 = tf.matmul(y1,ymask)
dist = tf.norm(ystar-y2,axis=0)
mse = tf.reciprocal(dataLength) * tf.reduce_mean(tf.square(dist))
grads = tf.gradients(dist, [y2])
trainStep = tf.train.GradientDescentOptimizer(eta).minimize(mse)
# build graph
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
randTask = np.random.randint(0, 9)
ymaskIn = np.zeros([tasks, tasks])
ymaskIn[randTask, randTask] = 1
batch = trainDF.sample(batchSize)
batch_xs = batch.iloc[:, :imageSize]
batch_ys = np.zeros([batchSize, tasks])
batch_ys[:, randTask] = batch.iloc[:, imageSize + randTask]
gradOut = sess.run(grads, feed_dict={x: batch_xs, ystar: batch_ys, ymask: ymaskIn})
sess.run(trainStep, feed_dict={x: batch_xs, ystar: batch_ys, ymask:ymaskIn})
Here's a very simple reproduction:
import tensorflow as tf
with tf.Graph().as_default():
y = tf.zeros(shape=[1], dtype=tf.float32)
dist = tf.norm(y,axis=0)
(grad,) = tf.gradients(dist, [y])
with tf.Session():
print(grad.eval())
Prints:
[ nan]
The issue is that tf.norm computes sum(x**2)**0.5. The gradient is x / sum(x**2) ** 0.5 (see e.g. https://math.stackexchange.com/a/84333), so when sum(x**2) is zero we're dividing by zero.
There's not much to be done in terms of a special case: the gradient as x approaches all zeros depends on which direction it's approaching from. For example if x is a single-element vector, the limit as x approaches 0 could either be 1 or -1 depending on which side of zero it's approaching from.
So in terms of solutions, you could just add a small epsilon:
import tensorflow as tf
def safe_norm(x, epsilon=1e-12, axis=None):
return tf.sqrt(tf.reduce_sum(x ** 2, axis=axis) + epsilon)
with tf.Graph().as_default():
y = tf.constant([0.])
dist = safe_norm(y,axis=0)
(grad,) = tf.gradients(dist, [y])
with tf.Session():
print(grad.eval())
Prints:
[ 0.]
Note that this is not actually the Euclidean norm. It's a good approximation as long as the input is much larger than epsilon.