Problem about getting None from the GradientTape.gradient in TensorFlow - tensorflow

I tried the following code:
from d2l import tensorflow as d2l
import tensorflow as tf
#tf.function
def corr2d(X, k, Y): ##save
"""Compute 2D cross-correlation."""
with tf.GradientTape() as tape:
for i in range(Y.shape[0]):
for j in range(Y.shape[1]):
Y[i, j].assign(tf.reduce_sum(tf.multiply(X[i: i + h, j: j + w], k)))
print('Gradients = ', tape.gradient(Y, k)) # show the gradient
print('Watched Variables = ', tape.watched_variables()) # show the watched varaibles
print(tf.__version__)
Xin= tf.constant([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]])
kernel = tf.Variable([[0.0, 1.0], [2.0, 3.0]])
h, w = kernel.shape
Y_hat = tf.Variable(tf.zeros((Xin.shape[0] - h + 1, Xin.shape[1] - w + 1))) # prepare the output tensor
corr2d(X, kernel, Y_hat)
print(Y_hat)
I got the following results:
2.4.1
Gradients = None
Watched Variables = (<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32>, <tf.Variable 'Variable:0' shape=(2, 2) dtype=float32>)
<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[19., 25.],
[37., 43.]], dtype=float32)>
Can anyone explain why the returned gradient is None even though the source variable kernel is included in the list of watched variables?

I'm not sure I really understood what you were trying to do. You were passing your variable as the target for the gradient.
It is always easier to think in terms of cost function and variables.
Let's say your cost function is y = x ** 2. In this case, it is possible to calculate the gradient of y with respect to x.
Basically, you did not have a function to calculate any gradient with respect to k.
I have done a small change. Check for the variable cost.
import tensorflow as tf
def corr2d(X, k, Y): ##save
"""Compute 2D cross-correlation."""
with tf.GradientTape() as tape:
cost = 0
for i in range(Y.shape[0]):
for j in range(Y.shape[1]):
Y[i, j].assign(tf.reduce_sum(tf.multiply(X[i: i + h, j: j + w], k)))
cost = cost + tf.reduce_sum(tf.multiply(X[i: i + h, j: j + w], k))
print('\nGradients = ', tape.gradient(cost, k)) # show the gradient
print('Watched Variables = ', tape.watched_variables()) # show the watched varaibles
print(tf.__version__)
Xin= tf.constant([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]])
kernel = tf.Variable([[0.0, 1.0], [2.0, 3.0]])
h, w = kernel.shape
Y_hat = tf.Variable(tf.zeros((Xin.shape[0] - h + 1, Xin.shape[1] - w + 1))) # prepare the output tensor
corr2d(Xin, kernel, Y_hat)
print(Y_hat)
And now, you will get
Gradients = tf.Tensor(
[[ 8. 12.]
[20. 24.]], shape=(2, 2), dtype=float32)
Watched Variables = (<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[0., 1.],
[2., 3.]], dtype=float32)>, <tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[19., 25.],
[37., 43.]], dtype=float32)>)
<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[19., 25.],
[37., 43.]], dtype=float32)>

Related

Autodiff implementation for gradient calculation

I have worked through some papers about the autodiff algorithm to implement it for myself (for learning purposes). I compared my algorithm in test cases to the output of tensorflow and their outputs did not match in most cases. Therefor i worked through the tutorial from this side and implemented it with tensorflow operations just for the matrix multiplication operation since that was one of the operations that did not work:
gradient of matmul and unbroadcast method:
def gradient_matmul(node, dx, adj):
# dx is needed to know which of both parents should be derived
a = node.parents[0]
b = node.parents[1]
# the operation was node.tensor = tf.matmul(a.tensor, b,tensor)
if a == dx or b == dx:
# result depends on which of the parents is the derivative
mm = tf.matmul(adj, tf.transpose(b.tensor)) if a == dx else \
tf.matmul(tf.transpose(a.tensor), adj)
return mm
else:
return None
def unbroadcast(adjoint, node):
dim_a = len(adjoint.shape)
dim_b = len(node.shape)
if dim_a > dim_b:
sum = tuple(range(dim_a - dim_b))
res = tf.math.reduce_sum(adjoint, axis = sum)
return res
return adjoint
And finally the gradient calculation autodiff algorithm:
def gradient(y, dx):
working = [y]
adjoints = defaultdict(float)
adjoints[y] = tf.ones(y.tensor.shape)
while len(working) != 0:
curr = working.pop(0)
if curr == dx:
return adjoints[curr]
if curr.is_store:
continue
adj = adjoints[curr]
for p in curr.parents:
# for testing with matrix multiplication as only operation
local_grad = gradient_matmul(curr, p, adj)
adjoints[p] = unbroadcast(tf.add(adjoints[p], local_grad), p.tensor)
if not p in working:
working.append(p)
Yet it produces the same output as my initial implementation.
I constructed a matrix multiplication test case:
x = tf.constant([[[1.0, 1.0], [2.0, 3.0]], [[4.0, 5.0], [6.0, 7.0]]])
y = tf.constant([[3.0, -7.0], [-1.0, 5.0]])
z = tf.constant([[[1, 1], [2.0, 2]], [[3, 3], [-1, -1]]])
w = tf.matmul(tf.matmul(x, y), z)
Where w should be derived for each of the variables.
Tensorflow calculates the gradient:
[<tf.Tensor: shape=(2, 2, 2), dtype=float32, numpy=
array([[[-22., 18.],
[-22., 18.]],
[[ 32., -16.],
[ 32., -16.]]], dtype=float32)>, <tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[66., -8.],
[80., -8.]], dtype=float32)>, <tf.Tensor: shape=(2, 2, 2), dtype=float32, numpy=
array([[[ 5., 5.],
[ -1., -1.]],
[[ 18., 18.],
[-10., -10.]]], dtype=float32)>]
My implementation calculates:
[[[-5. 7.]
[-5. 7.]]
[[-5. 7.]
[-5. 7.]]]
[[33. 22.]
[54. 36.]]
[[[ 9. 9.]
[14. 14.]]
[[-5. -5.]
[-6. -6.]]]
Maybe the problem is the difference between numpys dot and tensorflows matmul?
But then i don't know to fix the gradient or unbroadcast for the tensorflow method...
Thanks for taking the time to look over my code! :)
I found the error, the gradient matmul should have been:
def gradient_matmul(node, dx, adj):
a = node.parents[0]
b = node.parents[1]
if a == dx:
return tf.matmul(adj, b.tensor, transpose_b=True)
elif b == dx:
return tf.matmul(a.tensor, adj, transpose_a=True)
else:
return None
Since i only want to transpose the last 2 dimensions

Jacobian of a vector in Tensorflow

I think this question has never been properly answered 8see How to calculate the Jacobian of a vector function with tensorflow or Computing Jacobian in TensorFlow 2.0), so I will try again:
I want to compute the jacobian of the vector valued function z = [x**2 + 2*y, y**2], that is, I want to obtain the matrix of the partial derivatives
[[2x, 0],
[2, 2y]]
(being automatic differentiation, this matrix will be for an specific point).
with tf.GradientTape() as g:
x = tf.Variable(1.0)
y = tf.Variable(4.0)
z = tf.convert_to_tensor([x**2 + 2*y, y**2])
jacobian = g.jacobian(z, [x, y])
print(jacobian)
Obtaining
[<tf.Tensor: shape=(2,), dtype=float32, numpy=array([2., 0.], dtype=float32)>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([2., 8.], dtype=float32)>]
I want to obtain naturally the tensor
[[2., 0.],
[2., 8.]]
not that intermediate result. Can it be done?
Try some thing like this
import numpy as np
import tensorflow as tf
with tf.GradientTape() as g:
x = tf.Variable(1.0)
y = tf.Variable(4.0)
z = tf.convert_to_tensor([x**2 + 2*y, y**2])
jacobian = g.jacobian(z, [x, y])
print(np.array([jacob.numpy() for jacob in jacobian]))
Result
[[2. 0.]
[2. 8.]]

Converting a tensor such that maximum value is 1 and rest is 0

I am struggling with a problem in which I need to convert my tensor such that out of total values, the maximum value gets 1 as a value and rest as 0.
tf.Tensor(
[[0.05]
[0.1]
[0.5]
[0.35]],shape=(4,1),dtype = float32)
I tried
out = tf.sparse_to_dense(tf.argmax(a),tf.cast(tf.shape(a), dtype=tf.int64), tf.reduce_max(a))
but unfortunately, getting an error as
Input must be a SparseTensor.
I wanted the output as
tf.Tensor(
[[0]
[0]
[1]
[0]],shape=(4,1),dtype = float32)
please help me to solve the problem.
Thanks a lot
try in this way
x = tf.constant(
[[0.05],
[0.1],
[0.5],
[0.35]])
top_values, top_indices = tf.nn.top_k(tf.reshape(x, (-1,)), 1)
tf.cast(tf.greater_equal(x, top_values), tf.float64)
output
<tf.Tensor: shape=(4, 1), dtype=float64, numpy=
array([[0.],
[0.],
[1.],
[0.]])>
Using keras backend K.cast(K.equal(a, K.max(a)), dtype='int8')
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
a = np.arange(0, 1, 0.1)
a = a[:, np.newaxis]
a
array([[0. ],
[0.1],
[0.2],
[0.3],
[0.4],
[0.5],
[0.6],
[0.7],
[0.8],
[0.9]], dtype=float32)
tensor = K.cast(a, dtype='float32')
tensor
<tf.Tensor: shape=(10, 1), dtype=float32, numpy=
array([[0. ],
[0.1],
[0.2],
[0.3],
[0.4],
[0.5],
[0.6],
[0.7],
[0.8],
[0.9]], dtype=float32)>
K.cast(K.equal(a, K.max(a)), dtype='int8')
<tf.Tensor: shape=(10, 1), dtype=int8, numpy=
array([[0],
[0],
[0],
[0],
[0],
[0],
[0],
[0],
[0],
[1]], dtype=int8)>

ValueError: Shapes must be equal rank in assign_add()

I am reading tf.Variable in Tensorflow r2.0 in TF2:
import tensorflow as tf
# Create a variable.
w = tf.constant([1, 2, 3, 4], tf.float32, shape=[2, 2])
# Use the variable in the graph like any Tensor.
y = tf.matmul(w,tf.constant([7, 8, 9, 10], tf.float32, shape=[2, 2]))
v= tf.Variable(w)
# The overloaded operators are available too.
z = tf.sigmoid(w + y)
tf.shape(z)
# Assign a new value to the variable with `assign()` or a related method.
v.assign(w + 1)
v.assign_add(tf.constant([1.0, 21]))
ValueError: Shapes must be equal rank, but are 2 and 1 for
'AssignAddVariableOp_4' (op: 'AssignAddVariableOp') with input shapes:
[], 2.
And also how come the following returns false?
tf.shape(v) == tf.shape(tf.constant([1.0, 21],tf.float32))
My other question is that when we are in TF 2, we should not use tf.Session() anymore, correct? It seems we should never run session.run(), but the API document keys doing it with tf.compat.v1, etc. So why they are using it in TF2 docs?
Any help would be appreciated.
CS
As it clearly says in the error, it is expecting shape [2,2] for assign_add on v which is having the shape [2,2].
If you try to give any shape other than the initial shape of the Tensor which you are trying to do assign_add the error will be given.
Below is the modified code with the expected shape for the operation.
import tensorflow as tf
# Create a variable.
w = tf.constant([1, 2, 3, 4], tf.float32, shape=[2, 2])
# Use the variable in the graph like any Tensor.
y = tf.matmul(w,tf.constant([7, 8, 9, 10], tf.float32, shape=[2, 2]))
v= tf.Variable(w)
# The overloaded operators are available too.
z = tf.sigmoid(w + y)
tf.shape(z)
# Assign a new value to the variable with `assign()` or a related method.
v.assign(w + 1)
print(v)
v.assign_add(tf.constant([1, 2, 3, 4], tf.float32, shape=[2, 2]))
Output for v:
<tf.Variable 'UnreadVariable' shape=(2, 2) dtype=float32, numpy=
array([[3., 5.],
[7., 9.]], dtype=float32)>
Now the following Tensor comparison is returning True.
tf.shape(v) == tf.shape(tf.constant([1.0, 21],tf.float32))
<tf.Tensor: shape=(2,), dtype=bool, numpy=array([ True, True])>
Coming to your tf.Session() question, in TensorFlow 2.0 eager execution is enabled by default, still, if you need to disable eager execution and can use tf.Session like below.
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
hello = tf.constant('Hello, TensorFlow!')
sess = tf.compat.v1.Session()
print(sess.run(hello))

How to get the Jacobian matrix form derivative of vector by vector in TensorFlow Eager Execution API?

In the MLP model the input of layer l can be computed by this formula:
z = Wa + b
W is the weight matrix between layer l-1 and layer l, a is the output signal of layer l-1 neuron, b is the bias of layer l.
For example:
I want to use TensorFlow Eager Execution API to get the derivatives:
I define a function to calculate the value of z:
def f002(W, a, b):
return tf.matmul(W, a) + b
My main program:
def test001(args={}):
tf.enable_eager_execution()
tfe = tf.contrib.eager
a = tf.reshape(tf.constant([1.0, 2.0, 3.0]), [3, 1])
W = tf.constant([[4.0, 5.0, 6.0],[7.0, 8.0, 9.0]])
b = tf.reshape(tf.constant([1001.0, 1002.0]), [2, 1])
z = f002(W, a, b)
print(z)
grad_f1 = tfe.gradients_function(f002)
dv = grad_f1(W, a, b)
print(dv)
I can get the correct value of z in forward mode. But when print the derivative results it displayed something like these:
[<tf.Tensor: id=17, shape=(2, 3), dtype=float32, numpy=
array([[1., 2., 3.],
[1., 2., 3.]], dtype=float32)>, <tf.Tensor: id=18, shape=(3, 1),
dtype=float32, numpy=
array([[11.],
[13.],
[15.]], dtype=float32)>, <tf.Tensor: id=16, shape=(2, 1),
dtype=float32, numpy=
array([[1.],
[1.]], dtype=float32)>]
This is not what I want. How to get the Jacobian matrix derivative result of vector by vector?