I'm implementing PyTorch's conv2d with NumPy, but the PyTorch and NumPy results differ for the same input and convolution weight. How can I fix this? Thanks for any help.
Code sample below:
Note:
The code contains four parts:
1. Fix the random seed to generate a fixed input and convolution weight.
2. Implement conv2d with PyTorch.
3. Implement conv2d with NumPy.
4. Run both conv2d implementations and compare the results (they differ).
import random
import numpy as np
import torch
import torch.nn as nn
fixed_seed = 5179
np.random.seed(fixed_seed)
random.seed(fixed_seed)
torch.manual_seed(fixed_seed)
np.set_printoptions(precision=8, floatmode="fixed")
torch.set_printoptions(precision=8)
def conv_forward_torch(input_image_tensor, weight, stride, pad):
# input_image_tensor B Hi Wi Ci
# weight Hk Wk Ci Co
input_image_tensor = input_image_tensor.permute(0, 3, 1, 2) # B Ci Hi Wi
weight = weight.permute(3, 2, 0, 1) # Co Ci Hk Wk
output = torch.nn.functional.conv2d(input_image_tensor, weight, stride=stride, padding=pad) # B Co Ho Wo
output = output.permute(0, 2, 3, 1).cpu().detach().numpy() # B Ho Wo Co
return output
def conv_forward_naive(x, w, stride, pad, bias = None):
# x B Hi Wi Ci
# w Hk Wk Ci Co
x = np.transpose(x, [0, 3, 1, 2]) # B Ci Hi Wi
w = np.transpose(w, [3, 2, 0, 1]) # Co Ci Kh Kw
if pad != 0:
x = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), 'constant')
b, ci, hi, wi = x.shape
co, ci, hk, wk = w.shape
ho = np.floor(1 + (hi - hk) / stride).astype(int)
wo = np.floor(1 + (wi - wk) / stride).astype(int)
out = np.zeros((b, co, ho, wo), dtype=np.float32) # B Co Ho Wo
x = np.expand_dims(x, axis=1) # B 1 Ci Hi Wi
w = np.expand_dims(w, axis=0) # 1 Co Ci Hk Wk
for i in range(ho):
for j in range(wo):
x_windows = x[:, :, :, i * stride:i * stride + hk, j * stride: j * stride + wk] # B 1 Ci Hk Wk
out[:, :, i, j] = np.sum(x_windows * w, axis=(2, 3, 4)) # B Co
out = np.transpose(out, [0, 2, 3, 1]) # B Ho Wo Co
return out
B = 1 # Batch size
Hi = 2 # Input height
Wi = 2 # Input width
Ci = 1 # Input channel
Co = 1 # Output channel
P = 0 # Padding size
Hk = 2 # Kernel height
Wk = 2 # Kernel width
S = 1 # Stride
input_image_tensor = torch.randn(B, Hi, Wi, Ci)
conv_weight_tensor = torch.randn(Hk, Wk, Ci, Co)
input_image = input_image_tensor.detach().numpy()
conv_weight = conv_weight_tensor.detach().numpy()
y_torch = conv_forward_torch(input_image_tensor, conv_weight_tensor, S, P)
y_np = conv_forward_naive(input_image, conv_weight, S, P)
is_same = y_torch == y_np
print(is_same, y_torch, y_np)
Expected output:
is_same should be True.
Actual output for is_same, y_torch, y_np:
[[[[False]]]] [[[[-3.62229419]]]] [[[[-3.62229395]]]]
torch.__version__ = '1.11.0'
np.__version__ = '1.20.1'
Your code works as expected. The difference you observed is just floating-point rounding: both results are float32, but PyTorch and NumPy accumulate the products in a different order (different kernels), so the last bits can differ. Floating-point values should not be compared with a direct equality check; use a tolerance-based comparison such as np.allclose instead.
In this case your snippet indeed returns True:
>>> np.allclose(y_torch, y_np)
True
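As a usage note (not part of the original answer), np.allclose accepts relative and absolute tolerances, and np.testing.assert_allclose raises with a readable report when the arrays disagree. A minimal sketch, reusing y_torch and y_np from the snippet above:
import numpy as np
# Tolerance-based comparison; rtol/atol can be tightened or loosened as needed.
print(np.allclose(y_torch, y_np, rtol=1e-5, atol=1e-8))  # True
np.testing.assert_allclose(y_torch, y_np, rtol=1e-5)  # raises AssertionError on mismatch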
Related
I have a network written with TensorFlow Keras. In part of my code I need to use scipy.spatial.cKDTree, so I decorated my function with @tf.function. When I try to build the tree I receive the following error. (Let me know if more details are required.)
The error happens when it tries to build the cKDTree. The size of pc2e is shape=(46080, 3).
In similar questions I found that it could be caused by the Pillow version; I changed the version, but that did not solve the error.
Also, is there a better way to have a KDTree in TensorFlow?
TypeError: in user code:
/home/***/My_Models.py:731 var_layer *
tree2 = cKDTree(pc2e, leafsize=500, balanced_tree=False)
ckdtree.pyx:522 scipy.spatial.ckdtree.cKDTree.__init__ **
TypeError: __array__() takes 1 positional argument but 2 were given
Process finished with exit code 1
The function:
@tf.function
def var_layer(self, inputs, output): # output: x y z i j k w
inputs_v = tf.Variable(inputs)
pc1_raw, pc2_raw = tf.split(inputs_v, num_or_size_splits=2, axis=4)
# B x T x W x H x Channels
s0, s1, s2, s3, s4 = pc1_raw.shape[0], pc1_raw.shape[1], pc1_raw.shape[2], pc1_raw.shape[3], pc1_raw.shape[4]
pc1 = tf.reshape(pc1_raw[:, -1, :, :, 0:3], shape=[-1, s2 * s3, 3])
pc2 = tf.reshape(pc2_raw[:, -1, :, :, 0:3], shape=[-1, s2 * s3, 3])
# normal2 = tf.reshape(pc2_raw[:, -1, :, :, 3:6], [-1, s2 * s3, 3])
# normal1 = tf.reshape(pc1_raw[:, -1, :, :, 3:6], [-1, s2 * s3, 3])
Rq, Tr3 = tfg.dual_quaternion.to_rotation_translation(output)
R33 = tfg.rotation_matrix_3d.from_quaternion(Rq)
RT = tf.concat([R33, tf.expand_dims(Tr3, axis=2)], -1)
RT = tf.pad(RT, [[0, 0], [0, 1], [0, 0]], constant_values=[0.0, 0.0, 0.0, 1.0])
pc1 = tf.pad(pc1, [[0, 0], [0, 0], [0, 1]], constant_values=1)
pc1 = tf.transpose(pc1, perm=[0, 2, 1])
pc1_tr = tf.linalg.matmul(RT, pc1)
pc1_tr = pc1_tr[:, 0:3]
pc1_tr = tf.transpose(pc1_tr, perm=[0, 2, 1]) # B x WH x 3
# remove zero values
for epoch in range(self.Epochs):
pc2e = pc2[epoch]
print(pc2e)
tree2 = cKDTree(pc2e, leafsize=500, balanced_tree=False)
dist_in, ind = tree2.query(pc1_tr[epoch], k=1)
nonempty = np.count_nonzero(dist_in)
dist_in = np.sum(np.abs(dist_in))
if nonempty != 0:
dist_in = np.divide(dist_in, nonempty)
dist_p2p = dist_in
print(dist_p2p)
return dist_p2p
Versions:
TensorFlow 2.3.0
SciPy 1.4.1
pillow==8.2.0
Input of the function is a point cloud with this shape: Batch x Time x W x H x Channels
and the size of pc2e is shape=(46080, 3)
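A minimal sketch of one possible workaround (not from the original post; the helper names are hypothetical): inside a @tf.function the tensors are symbolic, so SciPy cannot convert them to NumPy arrays, which is what the __array__() error is about. Wrapping the SciPy part in tf.py_function makes it run eagerly on concrete values; note that no gradient flows through the SciPy call.
import numpy as np
import tensorflow as tf
from scipy.spatial import cKDTree

def mean_nn_distance_np(pc2e, pc1_tr_e):
    # Runs eagerly: the inputs are EagerTensors, so .numpy() is available.
    tree = cKDTree(pc2e.numpy(), leafsize=500, balanced_tree=False)
    dist, _ = tree.query(pc1_tr_e.numpy(), k=1)
    nonempty = np.count_nonzero(dist)
    return np.float32(np.sum(np.abs(dist)) / nonempty) if nonempty else np.float32(0.0)

@tf.function
def point_to_point_distance(pc2e, pc1_tr_e):
    # tf.py_function executes the NumPy/SciPy code outside the graph.
    return tf.py_function(mean_nn_distance_np, [pc2e, pc1_tr_e], tf.float32)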
I need to make the following loop faster. My only two requirements are:
it has to be called from a Python script;
it cannot use Numba;
I am sorry for such a generic title, but I was not sure how to post a specific performance problem. Thank you for the input.
import numpy as np
def sig(a):
return 1 / (1 + np.exp(-a))
N = 1000
xt = np.random.rand(N, 1).astype(np.float32) # input
w1 = np.random.rand(64, 64).astype(np.float32) # weight 1
w2 = np.random.rand(1, 64).astype(np.float32) # weight 2
ht = np.zeros((N, 64), dtype=np.float32) # output
h = ht[N - 1:N]
for i in range(N):
h = sig(h @ w1 + xt[i:i + 1] @ w2)  # 1x64 = sig(1x64 @ 64x64 + 1x1 @ 1x64)
ht[i] = h
I have a system given by this recursive relationship: x_t = A_t x_{t-1} + b_t. I wish to compute x_t for all t, with A_t, b_t and x_0 given. Is there a built-in function for that? If I use a loop it would be extremely slow. Thanks!
There is sort of a way. Let's say you have your A matrices in a 3D tensor with shape (T, N, N), where T is the total number of time steps and N is the size of your vector. Similarly, B values are in a 2D tensor (T, N). The first step in the computation would be:
x1 = A[0] @ x0 + B[0]
Where @ represents the matrix product. But you can convert this into a single matrix product. Suppose we add a value 1 at the end of x0, and we call that x0p (for prime):
x0p = tf.concat([x, [1]], axis=0)
And now we build a new 3D tensor Ap with shape (T, N+1, N+1), such that for each A[i] we concatenate B[i] as a new column, and then we add a row with N zeros and a single one at the end:
AwithB = tf.concat([A, tf.expand_dims(B, 2)], axis=2)
AnewRow = tf.concat([tf.zeros((T, 1, N), A.dtype), tf.ones((T, 1, 1), A.dtype)], axis=2)
Ap = tf.concat([AwithB, AnewRow], axis=1)
As it turns out, you can now say:
x1p = Ap[0] @ x0p
And therefore:
x2p = Ap[1] @ x1p = Ap[1] @ Ap[0] @ x0p
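As a quick sanity check of the augmentation (a worked scalar example, not in the original answer): for N = 1 with A_t = [[a]] and B_t = [b], the augmented matrix is Ap_t = [[a, b], [0, 1]], and Ap_t @ [x, 1]^T = [a*x + b, 1]^T, which is exactly one step of the recursion with the trailing 1 preserved.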
So we just need the product of all the matrices in Ap along the first dimension, multiplied in the right order (Ap[T-1] @ ... @ Ap[0]). Unfortunately, there does not seem to be a direct operation to compute that with TensorFlow, but you can do it relatively fast with tf.scan:
Ap_prod = tf.scan(lambda acc, a: tf.matmul(a, acc), Ap)[-1]
And with that you just have to do:
xtp = Ap_prod @ x0p
Here is a proof of concept (the code is tweaked to support single examples and batches, either in the A and B values or in the x)
import tensorflow as tf
def compute_state(a, b, x):
s = tf.shape(a)
t = s[-3]
n = s[-1]
# Add final 1 to x
xp = tf.concat([x, tf.ones_like(x[..., :1])], axis=-1)
# Add B column to A
a_b = tf.concat([a, tf.expand_dims(b, axis=-1)], axis=-1)
# Make new final row for A
a_row = tf.concat([tf.zeros_like(a[..., :1, :]),
tf.ones_like(a[..., :1, :1])], axis=-1)
# Add new row to A
ap = tf.concat([a_b, a_row], axis=-2)
# Compute matrix product reduction
ap_prod = tf.scan(lambda acc, m: tf.matmul(m, acc), ap)[..., -1, :, :]  # multiply as Ap[t] @ ... @ Ap[0]
# Compute final result
outp = tf.linalg.matvec(ap_prod, xp)
return outp[..., :-1]
#Test
tf.random.set_seed(0)
a = tf.random.uniform((10, 5, 5), -1, 1)
b = tf.random.uniform((10, 5), -1, 1)
x = tf.random.uniform((5,), -1, 1)
y = compute_state(a, b, x)
# Also works with batches of (a, b) or x
a = tf.random.uniform((100, 10, 5, 5), -1, 1)
b = tf.random.uniform((100, 10, 5), -1, 1)
x = tf.random.uniform((100, 5), -1, 1)
y = compute_state(a, b, x)
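A small verification sketch (not part of the original answer): it compares compute_state against a plain Python loop over the recursion x_t = A_t @ x_{t-1} + b_t for the single-example case; the two results should agree up to float32 rounding.
import tensorflow as tf

def compute_state_loop(a, b, x):
    # Reference implementation: apply the recursion step by step.
    for t in range(a.shape[0]):
        x = tf.linalg.matvec(a[t], x) + b[t]
    return x

tf.random.set_seed(0)
a = tf.random.uniform((10, 5, 5), -1, 1)
b = tf.random.uniform((10, 5), -1, 1)
x = tf.random.uniform((5,), -1, 1)
print(tf.reduce_max(tf.abs(compute_state(a, b, x) - compute_state_loop(a, b, x))))  # ~0 (float32 rounding)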
Given a tensor of rank 1, e.g. p = [x y z w], how can I "min-max clamp" it within the provided boundaries max = [1 10 5 3] and min = [-1 -10 -5 -3], such that the i-th element in p is always within the boundaries defined by min_i and max_i?
Extra: Would it be possible to do this for ranks > 1?
I found the following solution adequate. See the documentation for tf.minimum and tf.maximum. Solution:
import tensorflow as tf
p = tf.Variable([-1, 1, 3, 7])
clamp_min = tf.Variable([1, 1, 1, 1])
clamp_max = tf.Variable([5, 5, 5, 5])
p = tf.minimum(p, clamp_max)
p = tf.maximum(p, clamp_min)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
print(sess.run(p))
Produces:
[1 1 3 5]
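A minimal TF2-style sketch (not part of the original answer, which uses a TF1 session): tf.clip_by_value combines the minimum and maximum steps, and since the operations broadcast element-wise this also covers tensors of rank > 1.
import tensorflow as tf

p = tf.constant([-1, 1, 3, 7])
clamp_min = tf.constant([1, 1, 1, 1])
clamp_max = tf.constant([5, 5, 5, 5])

# Element-wise clamp; equivalent to tf.maximum(tf.minimum(p, clamp_max), clamp_min)
print(tf.clip_by_value(p, clamp_min, clamp_max).numpy())  # [1 1 3 5]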
I am a beginner in machine learning and neural networks. Recently, after watching Andrew Ng's lectures on deep learning, I tried to implement a binary classifier using deep neural networks on my own.
However, the cost function is expected to decrease after each iteration.
In my program, it decreases slightly in the beginning, but rapidly increases later. I tried to make changes in learning rate and number of iterations, but to no avail. I am very confused.
Here is my code
1. Neural network classifier class
class NeuralNetwork:
def __init__(self, X, Y, dimensions, alpha=1.2, iter=3000):
self.X = X
self.Y = Y
self.dimensions = dimensions # Including input layer and output layer. Let example be dimensions=4
self.alpha = alpha # Learning rate
self.iter = iter # Number of iterations
self.length = len(self.dimensions)-1
self.params = {} # To store parameters W and b for each layer
self.cache = {} # To store cache Z and A for each layer
self.grads = {} # To store dA, dZ, dW, db
self.cost = 1 # Initial value does not matter
def initialize(self):
np.random.seed(3)
# If dimensions is 4, then layer 0 and 3 are input and output layers
# So we only need to initialize w1, w2 and w3
# There is no need of w0 for input layer
for l in range(1, len(self.dimensions)):
self.params['W'+str(l)] = np.random.randn(self.dimensions[l], self.dimensions[l-1])*0.01
self.params['b'+str(l)] = np.zeros((self.dimensions[l], 1))
def forward_propagation(self):
self.cache['A0'] = self.X
# For last layer, ie, the output layer 3, we need to activate using sigmoid
# For layer 1 and 2, we need to use relu
for l in range(1, len(self.dimensions)-1):
self.cache['Z'+str(l)] = np.dot(self.params['W'+str(l)], self.cache['A'+str(l-1)]) + self.params['b'+str(l)]
self.cache['A'+str(l)] = relu(self.cache['Z'+str(l)])
l = len(self.dimensions)-1
self.cache['Z'+str(l)] = np.dot(self.params['W'+str(l)], self.cache['A'+str(l-1)]) + self.params['b'+str(l)]
self.cache['A'+str(l)] = sigmoid(self.cache['Z'+str(l)])
def compute_cost(self):
m = self.Y.shape[1]
A = self.cache['A'+str(len(self.dimensions)-1)]
self.cost = -1/m*np.sum(np.multiply(self.Y, np.log(A)) + np.multiply(1-self.Y, np.log(1-A)))
self.cost = np.squeeze(self.cost)
def backward_propagation(self):
A = self.cache['A' + str(len(self.dimensions) - 1)]
m = self.X.shape[1]
self.grads['dA'+str(len(self.dimensions)-1)] = -(np.divide(self.Y, A) - np.divide(1-self.Y, 1-A))
# Sigmoid derivative for final layer
l = len(self.dimensions)-1
self.grads['dZ' + str(l)] = self.grads['dA' + str(l)] * sigmoid_prime(self.cache['Z' + str(l)])
self.grads['dW' + str(l)] = 1 / m * np.dot(self.grads['dZ' + str(l)], self.cache['A' + str(l - 1)].T)
self.grads['db' + str(l)] = 1 / m * np.sum(self.grads['dZ' + str(l)], axis=1, keepdims=True)
self.grads['dA' + str(l - 1)] = np.dot(self.params['W' + str(l)].T, self.grads['dZ' + str(l)])
# Relu derivative for previous layers
for l in range(len(self.dimensions)-2, 0, -1):
self.grads['dZ'+str(l)] = self.grads['dA'+str(l)] * relu_prime(self.cache['Z'+str(l)])
self.grads['dW'+str(l)] = 1/m*np.dot(self.grads['dZ'+str(l)], self.cache['A'+str(l-1)].T)
self.grads['db'+str(l)] = 1/m*np.sum(self.grads['dZ'+str(l)], axis=1, keepdims=True)
self.grads['dA'+str(l-1)] = np.dot(self.params['W'+str(l)].T, self.grads['dZ'+str(l)])
def update_parameters(self):
for l in range(1, len(self.dimensions)):
self.params['W'+str(l)] = self.params['W'+str(l)] - self.alpha*self.grads['dW'+str(l)]
self.params['b'+str(l)] = self.params['b'+str(l)] - self.alpha*self.grads['db'+str(l)]
def train(self):
np.random.seed(1)
self.initialize()
for i in range(self.iter):
#print(self.params)
self.forward_propagation()
self.compute_cost()
self.backward_propagation()
self.update_parameters()
if i % 100 == 0:
print('Cost after {} iterations is {}'.format(i, self.cost))
2. Testing code for odd or even number classifier
import numpy as np
from main import NeuralNetwork
X = np.array([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
Y = np.array([[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]])
clf = NeuralNetwork(X, Y, [1, 1, 1], alpha=0.003, iter=7000)
clf.train()
3. Helper Code
import math
import numpy as np
def sigmoid_scalar(x):
return 1/(1+math.exp(-x))
def sigmoid_prime_scalar(x):
return sigmoid_scalar(x)*(1-sigmoid_scalar(x))
def relu_scalar(x):
if x > 0:
return x
else:
return 0
def relu_prime_scalar(x):
if x > 0:
return 1
else:
return 0
sigmoid = np.vectorize(sigmoid_scalar)
sigmoid_prime = np.vectorize(sigmoid_prime_scalar)
relu = np.vectorize(relu_scalar)
relu_prime = np.vectorize(relu_prime_scalar)
Output (plot omitted): the printed cost decreases slightly at first, then increases rapidly.
I believe your cross-entropy derivative is wrong. Instead of this:
# WRONG!
self.grads['dA'+str(len(self.dimensions)-1)] = -(np.divide(self.Y, A) - np.divide(1-self.Y, A))
... do this:
# CORRECT
self.grads['dA'+str(len(self.dimensions)-1)] = np.divide(A - self.Y, (1 - A) * A)
See these lecture notes for the details. I think you meant formula (5) but forgot the 1 - A term. In any case, use formula (6).
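A small numerical sketch (not part of the original answer): it checks the corrected derivative dL/dA = (A - Y) / (A * (1 - A)) of the binary cross-entropy against a central finite difference at an arbitrary point.
import numpy as np

Y, A, eps = 1.0, 0.7, 1e-6
loss = lambda a: -(Y * np.log(a) + (1 - Y) * np.log(1 - a))
numeric = (loss(A + eps) - loss(A - eps)) / (2 * eps)  # central finite difference
analytic = (A - Y) / (A * (1 - A))  # corrected formula
print(numeric, analytic)  # both approx -1.4285714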