Weird results with if-else in a tf.data.map - tensorflow2.0

Why does the else block run when the script below is executed in TF 2.3.1?
# Sentinel string that add_na_cols compares each column value against.
DEFAULT_STR = "*"


def add_na_cols(example: Dict, col: str):
    """Store a ``"{col}_na"`` string flag in ``example`` and return it.

    The ``if`` branch prints the example and writes ``"True"``; the ``else``
    branch prints ``"Came to false"`` and writes ``"False"``.  The branch
    is chosen by comparing ``example[col]`` with ``DEFAULT_STR``.

    The Python ``if``/``else`` with in-place dict writes is kept on purpose:
    the post reports that, when mapped over a ``tf.data`` dataset, the
    stored flag is ``"False"`` even though the ``if`` branch's print runs.

    Args:
        example: mapping from column name to value; mutated in place.
        col: name of the column to test.

    Returns:
        The same ``example`` mapping with the extra flag entry.
    """
    flag_key = f'{col}_na'
    if example[col] == DEFAULT_STR:
        tf.print(example)
        example[flag_key] = "True"
    else:
        tf.print("Came to false")
        example[flag_key] = "False"
    return example
# One-element dataset whose only "a" value is the sentinel string; map
# add_na_cols over it (with col bound to "a") and print every element.
t = tf.data.Dataset.from_tensor_slices({"a": [DEFAULT_STR]})
mapped = t.map(partial(add_na_cols, col="a"))
for r in mapped:
    print(r)
Expect to print
{'a': "*"}
{'a': <tf.Tensor: shape=(), dtype=string, numpy=b'*'>, 'a_na': <tf.Tensor: shape=(), dtype=string, numpy=b'True'>}
but see
{'a': "*"}
{'a': <tf.Tensor: shape=(), dtype=string, numpy=b'*'>, 'a_na': <tf.Tensor: shape=(), dtype=string, numpy=b'False'>}
I created a Colab notebook that reproduces this:
https://colab.research.google.com/drive/1ZgLF0ytiRJ4_VwfMMpVRP1TVkV-cFdDc?usp=sharing
def add_na_cols(example: Dict, col: str):
    """Store a ``"{col}_na"`` string flag in ``example`` using ``tf.cond``.

    The flag is selected by ``tf.cond`` from the comparison of
    ``example[col]`` with ``DEFAULT_STR``: ``"True"`` when the predicate
    holds, ``"False"`` otherwise.  The dict is written exactly once, after
    the conditional has produced its value.

    Args:
        example: mapping from column name to value; mutated in place.
        col: name of the column to test.

    Returns:
        The same ``example`` mapping with the extra flag entry.
    """
    is_default = example[col] == DEFAULT_STR
    flag = tf.cond(is_default, lambda: "True", lambda: "False")
    example[f'{col}_na'] = flag
    return example
works, but I would like to understand why the if-else approach doesn't work as expected in graph mode.

Related

Autodiff implementation for gradient calculation

I have worked through some papers about the autodiff algorithm in order to implement it myself (for learning purposes). I compared my algorithm against TensorFlow in several test cases, and the outputs did not match in most of them. Therefore I worked through the tutorial from this site and implemented it with TensorFlow operations, for the matrix multiplication operation only, since that was one of the operations that did not work:
gradient of matmul and unbroadcast method:
def gradient_matmul(node, dx, adj):
    """Local gradient of ``node.tensor = tf.matmul(a.tensor, b.tensor)``.

    Args:
        node: graph node whose two parents are the matmul operands.
        dx: the parent to differentiate with respect to; it decides which
            of the two formulas is used.
        adj: adjoint (upstream gradient) of ``node``.

    Returns:
        ``adj @ transpose(b)`` when ``dx`` is the first parent,
        ``transpose(a) @ adj`` when it is the second, otherwise ``None``.

    Note:
        ``tf.transpose`` is called without ``perm``, so it reverses every
        axis of the operand, not only the last two.  This is the version
        from the question; the post later replaces it with the
        ``transpose_a`` / ``transpose_b`` arguments of ``tf.matmul``.
    """
    a = node.parents[0]
    b = node.parents[1]
    # The forward operation was node.tensor = tf.matmul(a.tensor, b.tensor).
    if a == dx:
        return tf.matmul(adj, tf.transpose(b.tensor))
    if b == dx:
        return tf.matmul(tf.transpose(a.tensor), adj)
    # dx is not a parent of this node: no local gradient.
    return None
def unbroadcast(adjoint, node):
    """Sum ``adjoint`` over broadcast axes so it matches ``node``'s shape.

    Two kinds of broadcast axes are reduced:

    * leading axes that exist on ``adjoint`` but not on ``node``;
    * axes where ``node`` has extent 1 while ``adjoint`` is larger
      (reduced with ``keepdims=True`` so the rank of ``node`` is kept).

    Args:
        adjoint: accumulated gradient; only its ``shape`` is inspected
            before any reduction.
        node: tensor whose ``shape`` the result should match.

    Returns:
        ``adjoint`` itself when nothing was broadcast (or when it has fewer
        axes than ``node``), otherwise the reduced tensor.
    """
    extra_rank = len(adjoint.shape) - len(node.shape)
    if extra_rank < 0:
        # Fewer axes than the node: nothing to undo, hand it back untouched.
        return adjoint
    if extra_rank > 0:
        # Leading axes present only on the adjoint were added by broadcasting.
        leading_axes = tuple(range(extra_rank))
        adjoint = tf.math.reduce_sum(adjoint, axis=leading_axes)
    # Axes the node holds with extent 1 were stretched during broadcasting.
    stretched_axes = tuple(
        axis
        for axis, (node_dim, adj_dim) in enumerate(zip(node.shape, adjoint.shape))
        if node_dim == 1 and adj_dim != 1
    )
    if stretched_axes:
        adjoint = tf.math.reduce_sum(adjoint, axis=stretched_axes, keepdims=True)
    return adjoint
And finally the gradient calculation autodiff algorithm:
def gradient(y, dx):
    """Back-propagate from ``y`` and return the adjoint accumulated for ``dx``.

    Nodes are processed first-in-first-out starting at ``y``, whose adjoint
    is seeded with ones of ``y.tensor``'s shape.  For every parent of the
    current node the local matmul gradient is added to that parent's
    adjoint, and the sum is reduced to the parent's shape by ``unbroadcast``.

    Args:
        y: output node; must expose ``tensor``, ``parents`` and ``is_store``.
        dx: node to differentiate with respect to.

    Returns:
        The adjoint of ``dx`` as soon as ``dx`` is taken from the queue, or
        ``None`` if the queue empties without reaching it.

    NOTE(review): ``dx`` is returned the first time it is dequeued, so any
    contribution that would only arrive through nodes still waiting in the
    queue is not included - confirm this is acceptable for the graphs used.
    """
    from collections import deque

    working = deque([y])  # FIFO work queue; popleft() is O(1)
    adjoints = defaultdict(float)  # unseen nodes start from 0.0
    adjoints[y] = tf.ones(y.tensor.shape)
    while working:
        curr = working.popleft()
        if curr == dx:
            return adjoints[curr]
        if curr.is_store:
            # Stored nodes are not propagated through.
            continue
        adj = adjoints[curr]
        for p in curr.parents:
            # For testing with matrix multiplication as the only operation.
            local_grad = gradient_matmul(curr, p, adj)
            if local_grad is None:
                # No gradient flows to this parent; tf.add would fail on None.
                continue
            adjoints[p] = unbroadcast(tf.add(adjoints[p], local_grad), p.tensor)
            if p not in working:
                working.append(p)
    return None
Yet it produces the same output as my initial implementation.
I constructed a matrix multiplication test case:
# Test case: x and z are 2x2x2 tensors, y is a plain 2x2 matrix, and w
# chains two matrix multiplications over them.
x = tf.constant([
    [[1.0, 1.0], [2.0, 3.0]],
    [[4.0, 5.0], [6.0, 7.0]],
])
y = tf.constant([
    [3.0, -7.0],
    [-1.0, 5.0],
])
z = tf.constant([
    [[1.0, 1.0], [2.0, 2.0]],
    [[3.0, 3.0], [-1.0, -1.0]],
])
w = tf.matmul(tf.matmul(x, y), z)
Here w should be differentiated with respect to each of the variables x, y and z.
Tensorflow calculates the gradient:
[<tf.Tensor: shape=(2, 2, 2), dtype=float32, numpy=
array([[[-22., 18.],
[-22., 18.]],
[[ 32., -16.],
[ 32., -16.]]], dtype=float32)>, <tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[66., -8.],
[80., -8.]], dtype=float32)>, <tf.Tensor: shape=(2, 2, 2), dtype=float32, numpy=
array([[[ 5., 5.],
[ -1., -1.]],
[[ 18., 18.],
[-10., -10.]]], dtype=float32)>]
My implementation calculates:
[[[-5. 7.]
[-5. 7.]]
[[-5. 7.]
[-5. 7.]]]
[[33. 22.]
[54. 36.]]
[[[ 9. 9.]
[14. 14.]]
[[-5. -5.]
[-6. -6.]]]
Maybe the problem is the difference between NumPy's dot and TensorFlow's matmul?
But then I don't know how to fix the gradient or the unbroadcast for the TensorFlow method...
Thanks for taking the time to look over my code! :)
I found the error; gradient_matmul should have been:
def gradient_matmul(node, dx, adj):
    """Local gradient of a matmul node with respect to one of its parents.

    The transposition is delegated to ``tf.matmul`` through its
    ``transpose_a`` / ``transpose_b`` flags rather than a separate
    ``tf.transpose`` call.

    Args:
        node: graph node whose two parents are the matmul operands.
        dx: the parent to differentiate with respect to.
        adj: adjoint (upstream gradient) of ``node``.

    Returns:
        ``matmul(adj, b, transpose_b=True)`` when ``dx`` is the first
        parent, ``matmul(a, adj, transpose_a=True)`` when it is the second,
        otherwise ``None``.
    """
    a, b = node.parents[0], node.parents[1]
    if a == dx:
        return tf.matmul(adj, b.tensor, transpose_b=True)
    if b == dx:
        return tf.matmul(a.tensor, adj, transpose_a=True)
    return None
since I only want to transpose the last two dimensions (tf.transpose without a perm argument reverses all of them).

keras.models.load_model does not work as expected within MirroredStrategy

I am trying to load a model within MirroredStrategy. I find that the model loaded within MirroredStrategy does not work correctly: only one replica is found, even though 4 visible devices are specified. This does not happen for a model that is constructed directly within MirroredStrategy.
It is worth mentioning that subclasses of tf.keras.models.Model and tf.keras.layers.Layer are used here, which I think may be the cause of this wrong behavior. I have confirmed that loading a saved tf.keras.Sequential model works well within MirroredStrategy.
Reproducible code:
class Demo(tf.keras.models.Model):
    """Subclassed Keras model: a ``TestLayer`` followed by a one-unit Dense layer."""

    def __init__(self, **kwargs):
        """Create the two sub-layers; ``kwargs`` are forwarded to the Keras base class."""
        super().__init__(**kwargs)
        self.test_layer = TestLayer()
        # Dense layer with no activation, kernel initialised to ones and
        # bias initialised to zeros.
        self.dense_layer = tf.keras.layers.Dense(
            units=1,
            activation=None,
            kernel_initializer="ones",
            bias_initializer="zeros",
        )

    def call(self, inputs):
        """Return ``(logit, vector)``: the Dense output and the ``TestLayer`` output."""
        vector = self.test_layer(inputs)
        logit = self.dense_layer(vector)
        return logit, vector

    def summary(self):
        """Summarise a functional model that wraps ``call`` on a ``(10,)`` int64 input."""
        inputs = tf.keras.Input(shape=(10,), dtype=tf.int64)
        model = tf.keras.models.Model(inputs=inputs, outputs=self.call(inputs))
        return model.summary()
​
# Restored from the garbled "#tf.function" line: the quoted log shows symbolic
# per-replica tensors, which only appear when the step is traced as a graph.
@tf.function
def _step(inputs, labels, model):
    """Run one forward pass of ``model`` on ``inputs``.

    Args:
        inputs: batch passed to the model.
        labels: accepted so the signature matches the ``(keys, labels)``
            dataset elements; not used.
        model: callable returning a ``(logit, vector)`` pair.

    Returns:
        The ``(logit, vector)`` pair produced by ``model``.
    """
    logit, vector = model(inputs)
    return logit, vector
​
def tf_dataset(keys, labels, batchsize, repeat):
    """Build a repeated, batched dataset of ``(keys, labels)`` pairs.

    Args:
        keys: feature array, sliced along its first axis.
        labels: label array, sliced along its first axis.
        batchsize: number of elements per batch; incomplete final batches
            are dropped.
        repeat: repetition count passed to ``Dataset.repeat``.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    return (
        tf.data.Dataset.from_tensor_slices((keys, labels))
        .repeat(repeat)
        .batch(batchsize, drop_remainder=True)
    )
​
def _dataset_fn(input_context):
    """Create the per-pipeline dataset for a distribution strategy.

    Generates 16384 rows of all-ones keys (10 columns) with random integer
    labels drawn by ``np.random.randint(low=0, high=2)``, batches them with
    the per-replica batch size derived from ``input_context``, and shards
    the batches by input pipeline.

    Args:
        input_context: object providing ``get_per_replica_batch_size``,
            ``num_input_pipelines`` and ``input_pipeline_id``.

    Returns:
        The sharded dataset for this input pipeline.
    """
    global_batch_size = 16384
    sample_keys = np.ones((global_batch_size, 10))
    sample_labels = np.random.randint(low=0, high=2, size=(global_batch_size, 1))
    per_replica = input_context.get_per_replica_batch_size(global_batch_size)
    batched = tf_dataset(sample_keys, sample_labels, batchsize=per_replica, repeat=1)
    return batched.shard(
        input_context.num_input_pipelines,
        input_context.input_pipeline_id,
    )
​
# Save model within MirroredStrategy scope
# NOTE(review): indentation was lost in extraction; placing every statement
# below inside the scope follows the comment above - confirm.
strategy = tf.distribute.MirroredStrategy(
    ["GPU:0", "GPU:1", "GPU:2", "GPU:3"]
)
with strategy.scope():
    # Build the model under the strategy.
    model = Demo()
    model.compile()
    model.summary()

    # Run one forward step per distributed batch.
    dataset = strategy.distribute_datasets_from_function(_dataset_fn)
    for i, (key_tensors, replica_labels) in enumerate(dataset):
        print("-" * 30, "step ", str(i), "-" * 30)
        logit, vector = strategy.run(
            _step, args=(key_tensors, replica_labels, model)
        )

    # model(tf.keras.Input(shape=(10,), dtype=tf.int64))
    model.save("demo")
​
# Load model within MirroredStrategy scope
# NOTE(review): indentation was lost in extraction; placing every statement
# below inside the scope follows the comment above - confirm.
with strategy.scope():
    # Restore the model that was saved under "demo".
    model2 = tf.keras.models.load_model("demo")

    # Run the same forward step with the loaded model.
    dataset = strategy.distribute_datasets_from_function(_dataset_fn)
    for i, (key_tensors, replica_labels) in enumerate(dataset):
        print("-" * 30, "step ", str(i), "-" * 30)
        logit, vector = strategy.run(
            _step, args=(key_tensors, replica_labels, model2)
        )
Actual log:
------------------------------ step 0 ------------------------------
global_replica_id: Tensor("demo/test_layer/replica_id_in_sync_group:0", shape=(), dtype=int32, device=/job:localhost/replica:0/task:0/device:gpu:0)
global_replica_id: Tensor("replica_1/demo/test_layer/replica_id_in_sync_group:0", shape=(), dtype=int32, device=/job:localhost/replica:0/task:0/device:gpu:1)
global_replica_id: Tensor("replica_2/demo/test_layer/replica_id_in_sync_group:0", shape=(), dtype=int32, device=/job:localhost/replica:0/task:0/device:gpu:2)
global_replica_id: Tensor("replica_3/demo/test_layer/replica_id_in_sync_group:0", shape=(), dtype=int32, device=/job:localhost/replica:0/task:0/device:gpu:3)
2022-07-13 06:20:56.820402: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
------------------------------ step 0 ------------------------------
global_replica_id: 0
Expected log:
------------------------------ step 0 ------------------------------
global_replica_id: Tensor("demo/test_layer/replica_id_in_sync_group:0", shape=(), dtype=int32, device=/job:localhost/replica:0/task:0/device:gpu:0)
global_replica_id: Tensor("replica_1/demo/test_layer/replica_id_in_sync_group:0", shape=(), dtype=int32, device=/job:localhost/replica:0/task:0/device:gpu:1)
global_replica_id: Tensor("replica_2/demo/test_layer/replica_id_in_sync_group:0", shape=(), dtype=int32, device=/job:localhost/replica:0/task:0/device:gpu:2)
global_replica_id: Tensor("replica_3/demo/test_layer/replica_id_in_sync_group:0", shape=(), dtype=int32, device=/job:localhost/replica:0/task:0/device:gpu:3)
2022-07-13 06:20:56.820402: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
------------------------------ step 0 ------------------------------
global_replica_id: Tensor("demo/test_layer/replica_id_in_sync_group:0", shape=(), dtype=int32, device=/job:localhost/replica:0/task:0/device:gpu:0)
global_replica_id: Tensor("replica_1/demo/test_layer/replica_id_in_sync_group:0", shape=(), dtype=int32, device=/job:localhost/replica:0/task:0/device:gpu:1)
global_replica_id: Tensor("replica_2/demo/test_layer/replica_id_in_sync_group:0", shape=(), dtype=int32, device=/job:localhost/replica:0/task:0/device:gpu:2)
global_replica_id: Tensor("replica_3/demo/test_layer/replica_id_in_sync_group:0", shape=(), dtype=int32, device=/job:localhost/replica:0/task:0/device:gpu:3)
The log is from the line tf.print("global_replica_id: {}".format(global_replica_id)) within TestLayer.call.

Problem about getting None from the GradientTape.gradient in TensorFlow

I tried the following code:
from d2l import tensorflow as d2l
import tensorflow as tf
# Restored from the garbled "#tf.function" line: the quoted output shows the
# watched variables without numpy values, i.e. printed while tracing a graph.
@tf.function
def corr2d(X, k, Y):  ##save
    """Compute 2D cross-correlation of ``X`` with ``k``, writing into ``Y``.

    Every ``Y[i, j]`` is assigned the sum of the element-wise product of
    ``k`` with the window of ``X`` starting at ``(i, j)``.  The window size
    ``h`` x ``w`` is read from module-level names.  Afterwards the gradient
    of ``Y`` with respect to ``k`` and the tape's watched variables are
    printed; this is the call that prints ``None`` in the question.

    Args:
        X: input matrix.
        k: kernel variable.
        Y: output variable, updated in place through ``assign``.
    """
    with tf.GradientTape() as tape:
        for i in range(Y.shape[0]):
            for j in range(Y.shape[1]):
                Y[i, j].assign(tf.reduce_sum(tf.multiply(X[i: i + h, j: j + w], k)))
    print('Gradients = ', tape.gradient(Y, k))  # show the gradient
    print('Watched Variables = ', tape.watched_variables())  # show the watched variables
print(tf.__version__)
# 3x3 input and 2x2 kernel; the output therefore has shape (3-2+1, 3-2+1).
Xin = tf.constant([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]])
kernel = tf.Variable([[0.0, 1.0], [2.0, 3.0]])
h, w = kernel.shape
Y_hat = tf.Variable(tf.zeros((Xin.shape[0] - h + 1, Xin.shape[1] - w + 1)))  # prepare the output tensor
# The input tensor is named Xin; the original call passed the undefined name X.
corr2d(Xin, kernel, Y_hat)
print(Y_hat)
I got the following results:
2.4.1
Gradients = None
Watched Variables = (<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32>, <tf.Variable 'Variable:0' shape=(2, 2) dtype=float32>)
<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[19., 25.],
[37., 43.]], dtype=float32)>
Can anyone explain why the returned gradient is None even though the source variable kernel is included in the list of watched variables?
I'm not sure I really understood what you were trying to do. You were passing your variable as the target for the gradient.
It is always easier to think in terms of cost function and variables.
Let's say your cost function is y = x ** 2. In this case, it is possible to calculate the gradient of y with respect to x.
Basically, you did not have a function to calculate any gradient with respect to k.
I have done a small change. Check for the variable cost.
import tensorflow as tf
def corr2d(X, k, Y):  ##save
    """Compute 2D cross-correlation and print the gradient of its total.

    Every ``Y[i, j]`` is assigned the sum of the element-wise product of
    ``k`` with the window of ``X`` starting at ``(i, j)``.  The same window
    sums are accumulated into a scalar ``cost``, so that there is a tensor
    computed from ``k`` to differentiate.  The gradient of ``cost`` with
    respect to ``k`` and the tape's watched variables are printed.

    Args:
        X: input matrix.
        k: two-dimensional kernel variable; its shape gives the window size.
        Y: output variable, updated in place through ``assign``.
    """
    # Take the window size from the kernel itself rather than from
    # module-level names.
    h, w = k.shape
    with tf.GradientTape() as tape:
        cost = 0
        for i in range(Y.shape[0]):
            for j in range(Y.shape[1]):
                # Compute each window sum once and use it for both the
                # output entry and the running cost.
                window_sum = tf.reduce_sum(tf.multiply(X[i: i + h, j: j + w], k))
                Y[i, j].assign(window_sum)
                cost = cost + window_sum
    print('\nGradients = ', tape.gradient(cost, k))  # show the gradient
    print('Watched Variables = ', tape.watched_variables())  # show the watched variables
print(tf.__version__)

# 3x3 input and 2x2 kernel variable.
Xin = tf.constant([
    [0.0, 1.0, 2.0],
    [3.0, 4.0, 5.0],
    [6.0, 7.0, 8.0],
])
kernel = tf.Variable([
    [0.0, 1.0],
    [2.0, 3.0],
])
h, w = kernel.shape

# Prepare the output tensor: one entry per window position.
Y_hat = tf.Variable(tf.zeros((Xin.shape[0] - h + 1, Xin.shape[1] - w + 1)))

corr2d(Xin, kernel, Y_hat)
print(Y_hat)
And now, you will get
Gradients = tf.Tensor(
[[ 8. 12.]
[20. 24.]], shape=(2, 2), dtype=float32)
Watched Variables = (<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[0., 1.],
[2., 3.]], dtype=float32)>, <tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[19., 25.],
[37., 43.]], dtype=float32)>)
<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[19., 25.],
[37., 43.]], dtype=float32)>

How to generate encoded text directly from tf.data.Dataset.from_generator method?

The "Better performance with the tf.data API" TensorFlow tutorial shows a simple and efficient Dataset implementation. When working with text datasets, this implementation would look something like:
class TextDataset(tf.data.Dataset):
    """Dataset of ``{"idx", "text"}`` records read line by line from a text file."""

    @staticmethod
    def _generator(dataset_dir, num_samples):
        """Yield ``num_samples`` records, one per ``readline()`` call.

        Args:
            dataset_dir: path of the text file to read.
            num_samples: number of records to produce.

        Yields:
            ``{"idx": sample_idx, "text": sample}`` where ``sample`` is the
            string returned by ``readline()``.
        """
        # Opening the dataset file; the context manager closes it once the
        # generator is exhausted or discarded.
        with open(dataset_dir, "r") as dataset_file:
            for sample_idx in range(num_samples):
                # Reading data (line, record) from the file
                sample = dataset_file.readline()
                yield {"idx": sample_idx, "text": sample}

    def __new__(cls, dataset_dir, num_samples=3):
        """Return a generator-backed dataset of scalar int64 ``idx`` / string ``text``."""
        return tf.data.Dataset.from_generator(
            cls._generator,
            output_types={"idx": tf.dtypes.int64, "text": tf.dtypes.string},
            output_shapes={"idx": (), "text": ()},
            args=(dataset_dir, num_samples,)
        )
which generates the following dataset:
{'idx': <tf.Tensor: shape=(), dtype=int64, numpy=0>,
'text': <tf.Tensor: shape=(), dtype=string, numpy=b'sample one'>},
{'idx': <tf.Tensor: shape=(), dtype=int64, numpy=1>,
'text': <tf.Tensor: shape=(), dtype=string, numpy=b'sample two'>},
{'idx': <tf.Tensor: shape=(), dtype=int64, numpy=2>,
'text': <tf.Tensor: shape=(), dtype=string, numpy=b'sample three'>}
...
Now, instead of yielding the text as a string in the _generator method, it would be interesting to yield only the identifiers of the string's tokens (i.e. the encoded text). This can be done with a tokenizer.
So, how can the text be encoded as a list of integers before it is yielded in the _generator method?
Note: a working example is available in Google Colab.

How to get the Jacobian matrix form derivative of vector by vector in TensorFlow Eager Execution API?

In the MLP model the input of layer l can be computed by this formula:
z = Wa + b
W is the weight matrix between layer l-1 and layer l, a is the output signal of layer l-1 neuron, b is the bias of layer l.
For example:
I want to use TensorFlow Eager Execution API to get the derivatives:
I define a function to calculate the value of z:
def f002(W, a, b):
    """Return ``z = W a + b``: the matrix product of ``W`` and ``a`` plus ``b``."""
    weighted = tf.matmul(W, a)
    return weighted + b
My main program:
def test001(args=None):
    """Evaluate ``f002`` on fixed inputs and print its value and gradients.

    Enables eager execution, builds a 3x1 vector ``a``, a 2x3 matrix ``W``
    and a 2x1 vector ``b``, prints ``z = f002(W, a, b)``, then prints the
    result of ``tfe.gradients_function(f002)`` applied to the same inputs.

    Args:
        args: unused.  Defaults to ``None`` instead of a shared mutable
            ``{}``.
    """
    tf.enable_eager_execution()
    tfe = tf.contrib.eager
    a = tf.reshape(tf.constant([1.0, 2.0, 3.0]), [3, 1])
    W = tf.constant([[4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    b = tf.reshape(tf.constant([1001.0, 1002.0]), [2, 1])
    z = f002(W, a, b)
    print(z)
    # Gradients of f002 with respect to each of its three arguments.
    grad_f1 = tfe.gradients_function(f002)
    dv = grad_f1(W, a, b)
    print(dv)
I can get the correct value of z in the forward pass. But when I print the derivative results, it displays something like this:
[<tf.Tensor: id=17, shape=(2, 3), dtype=float32, numpy=
array([[1., 2., 3.],
[1., 2., 3.]], dtype=float32)>, <tf.Tensor: id=18, shape=(3, 1),
dtype=float32, numpy=
array([[11.],
[13.],
[15.]], dtype=float32)>, <tf.Tensor: id=16, shape=(2, 1),
dtype=float32, numpy=
array([[1.],
[1.]], dtype=float32)>]
This is not what I want. How to get the Jacobian matrix derivative result of vector by vector?