Cannot get GradientTape to give non null results - tensorflow2.0

I am trying to manually implement a very simple RNN using TensorFlow 2. I modeled my code on the TensorFlow website's example for building models manually. The code, stripped to the bare essentials for this purpose, is
import numpy as np
import tensorflow as tf

class ModelSimple(object):
    def __init__(self):
        # Initialize the weight and bias to random values
        self.W = tf.Variable(tf.random.normal([]))
        self.b = tf.Variable(tf.random.normal([]))

    def __call__(self, x):
        return self.W * x + self.b

def loss(predicted_y, target_y):
    return tf.reduce_mean(tf.square(predicted_y - target_y))

NUM_EXAMPLES = 1000
inputs = tf.random.normal(shape=[NUM_EXAMPLES])
outputs = tf.zeros(NUM_EXAMPLES)

model = ModelSimple()
with tf.GradientTape() as t:
    t.watch([model.W, model.b])
    current_loss = loss(model(inputs), outputs)
dW, db = t.gradient(current_loss, [model.W, model.b])
print(dW, db)
This gives nice tensors for dW and db. Then I try to do what I described above
class ModelRNN(object):
    def __init__(self, n_inputs, n_neurons):
        self.n_inputs = n_inputs
        self.n_neurons = n_neurons
        # weights for new input
        self.Wx = tf.Variable(tf.random.normal(shape=[self.n_inputs, self.n_neurons], dtype=tf.float32))
        # weights for previous output
        self.Wy = tf.Variable(tf.random.normal(shape=[self.n_neurons, self.n_neurons], dtype=tf.float32))
        # bias weights
        self.b = tf.Variable(tf.zeros([1, self.n_neurons], dtype=tf.float32))

    def __call__(self, X_batch):
        # get shape of input
        batch_size, num_time_steps, _ = X_batch.get_shape()
        # we will loop through the time steps and the output of the previous computation feeds into
        # the next one.
        # this variable keeps track of it and is initialized to zero
        y_last = tf.Variable(tf.zeros([batch_size, self.n_neurons], dtype=tf.float32))
        # the outputs will be stored in this tensor
        Ys = tf.Variable(tf.zeros([batch_size, num_time_steps, self.n_neurons], dtype=tf.float32))
        for t in range(num_time_steps):
            Xt = X_batch[:, t, :]
            yt = tf.tanh(tf.matmul(y_last, self.Wy) +
                         tf.matmul(Xt, self.Wx) +
                         self.b)
            y_last.assign(yt)
            Ys[:, t, :].assign(yt)
        return Ys
inputs = tf.convert_to_tensor(np.array([
    # t = 0      t = 1
    [[0, 1, 2], [9, 8, 7]],  # instance 1
    [[3, 4, 5], [0, 0, 0]],  # instance 2
    [[6, 7, 8], [6, 5, 4]],  # instance 3
    [[9, 0, 1], [3, 2, 1]],  # instance 4
], dtype=np.float32))
outputs = tf.Variable(tf.zeros((4, 2, 5), dtype=np.float32))

model = ModelRNN(3, 5)
with tf.GradientTape() as t:
    t.watch([model.Wx, model.Wy, model.b])
    current_loss = loss(model(inputs), outputs)
dWx, dWy, db = t.gradient(current_loss, [model.Wx, model.Wy, model.b])
print(dWx, dWy, db)
and it turns out dWx, dWy, and db are all None. I have tried several things (including explicitly watching the variables with the GradientTape, even though they are Variables and should be watched automatically), and yet I keep getting None. What am I doing wrong?

It looks like this is related to this issue:
Tensorflow cannot get gradient wrt a Variable, but can wrt a Tensor
Replacing the assign calls with a Python list and tf.stack results in a gradient being returned:
Ys = []
for t in range(num_time_steps):
    Xt = X_batch[:, t, :]
    yt = tf.tanh(tf.matmul(y_last, self.Wy) +
                 tf.matmul(Xt, self.Wx) +
                 self.b)
    y_last.assign(yt)
    Ys.append(yt)
return tf.stack(Ys, axis=1)
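With the modified __call__, a quick check (a sketch that reuses the loss, inputs, and outputs defined in the question) should now print actual tensors instead of None for all three gradients:
model = ModelRNN(3, 5)
with tf.GradientTape() as t:
    current_loss = loss(model(inputs), outputs)
dWx, dWy, db = t.gradient(current_loss, [model.Wx, model.Wy, model.b])
print(dWx is None, dWy is None, db is None)  # expected: False False False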

Related

Custom layer with tf.extract_image_patches extremely slow

I'm new to TensorFlow. I'm trying to implement a custom pooling layer using OWA operators (https://github.com/jiforcen/ordered-weighted-pooling). For that I'm using tf.extract_image_patches, but this operation is extremely slow when the input dimensions are large, as raised in Issue #13017.
I believe the pooling layer I implemented behaves very similarly to the tf.keras.layers.MaxPooling2D layer. Inspecting the code of MaxPooling2D, I saw that it calls the gen_nn_ops.max_pool method.
I tried to take a look at what's inside gen_nn_ops.max_pool, but I can't find it in the repository. From what I've googled, the source code can't be found because it's automatically generated by Bazel. If I build from source, I'll see this file inside bazel-genfiles, right? It contains automatically generated Python wrappers to the underlying C++ implementations.
Is it possible to create a custom pooling operation in C++ and use it in my Python code? Below is the custom layer code that I implemented.
import tensorflow as tf
from keras import backend as K
from tensorflow.keras.constraints import Constraint
from tensorflow.python.util.tf_export import keras_export
from tensorflow.python.keras.utils import conv_utils
# import skimage.measure

# keras_export('keras.constraints.UnitSumNonNeg', 'keras.constraints.unit_sum_non_neg')
class UnitSumNonNeg(Constraint):
    """Limits weights to be non-negative and with sum equal to one.

    Also available via the shortcut function `keras.constraints.unit_sum_non_neg`.
    """
    def __call__(self, w):
        aux = w * tf.cast(tf.math.greater_equal(w, 0.), w.dtype)
        return aux / (K.epsilon() + tf.reduce_sum(aux, axis=[0], keepdims=True))

class OWAPoolingNew(tf.keras.layers.Layer):
    def __init__(self,
                 pool_size=(2, 2),
                 strides=None,
                 padding='valid',
                 data_format=None,
                 name=None,
                 sort=True,
                 train=True,
                 seed=None,
                 all_channels=False,
                 **kwargs):
        super(OWAPoolingNew, self).__init__(name=name, **kwargs)
        self.pool_size = pool_size
        self.strides = pool_size if strides is None else strides
        self.padding = padding
        self.data_format = conv_utils.normalize_data_format('channels_last')
        self.sort = sort
        self.train = train
        self.seed = seed if seed is not None else 10
        self.all_channels = all_channels

    def build(self, input_shape):
        if self.all_channels:
            weights_shape = (self.pool_size[0] * self.pool_size[1], input_shape[-1])
        else:
            weights_shape = (self.pool_size[0] * self.pool_size[1], 1)
        tf.random.set_seed(self.seed)
        kernel = tf.random.uniform(shape=weights_shape)
        kernel /= tf.reduce_sum(kernel, axis=[0], keepdims=True)
        self.kernel = tf.Variable(initial_value=kernel, trainable=self.train,
                                  dtype='float32', constraint=UnitSumNonNeg())

    # def owapool(self, a, axis=[]):
    #     a = tf.reshape(a, shape=a.shape[0:4] + (-1,))
    #     a = tf.sort(a, direction='DESCENDING', axis=-1)
    #     return tf.reduce_sum(tf.math.multiply(self.kernel, a), axis=-1)

    def call(self, inputs):
        _, height, width, channels = inputs.get_shape().as_list()
        if self.padding.upper() == 'SAME':  # Complete size to pad 'SAME'
            pad_bottom = self.pool_size[0] * height % self.pool_size[0]
            pad_right = self.pool_size[1] * width % self.pool_size[1]
            paddings = tf.constant([[0, 0], [0, pad_bottom], [0, pad_right], [0, 0]])
            inputs = tf.pad(inputs, paddings, "CONSTANT")
        # Extract pooling regions
        stride = [1, self.strides[0], self.strides[1], 1]
        ksize = [1, self.pool_size[0], self.pool_size[1], 1]
        x_tensor_p = tf.image.extract_patches(inputs, sizes=ksize, strides=stride,
                                              rates=[1, 1, 1, 1], padding='SAME')
        _, pool_height, pool_width, elems = x_tensor_p.get_shape().as_list()
        # Extract pooling regions for each channel
        elems = int(elems / channels)
        inputs = tf.reshape(inputs, [-1, pool_height, pool_width, elems, channels])  # Reshape tensor
        # Sort values for pooling
        if self.sort:
            inputs = tf.sort(inputs, axis=-2, direction='DESCENDING', name=None)
        outputs = tf.reduce_sum(tf.math.multiply(self.kernel, inputs), axis=-2)
        return outputs
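For reference, a quick shape check of the layer as posted (a sketch; the input shape here is arbitrary and not from the original post):
x = tf.random.uniform((1, 8, 8, 3))
pool = OWAPoolingNew(pool_size=(2, 2))
y = pool(x)
print(y.shape)  # with a 2x2 pool and stride 2, height and width should halve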

Two sets of shared embeddings from one tensorflow feature?

How can I create two sets of shared embeddings from the same tensorflow feature columns?
This small example
import tensorflow as tf

data = {"A": [0, 1, 2, 3], "B": [2, 1, 0, 3]}

def add_label(example):
    return example, 1

def input_fn():
    dset = tf.data.Dataset.from_tensor_slices(data).map(add_label).batch(2)
    return dset

def model_fn(features, labels, mode, params):
    colA = tf.feature_column.categorical_column_with_vocabulary_list("A", [0, 1, 2, 3])
    colB = tf.feature_column.categorical_column_with_vocabulary_list("B", [0, 1, 2, 3])

    model1_embedddings = tf.feature_column.shared_embeddings(categorical_columns=[colA, colB], dimension=2)
    X1 = tf.keras.layers.DenseFeatures(model1_embedddings)(features)
    output1_output = tf.reduce_sum(X1, axis=1)

    with tf.compat.v1.variable_scope("other", reuse=False):
        model2_embedddings = tf.feature_column.shared_embeddings(categorical_columns=[colA, colB], dimension=2)
        X2 = tf.keras.layers.DenseFeatures(model2_embedddings)(features)
        output2_output = tf.reduce_sum(X2, axis=1)

    loss = tf.losses.mean_squared_error(labels, output1_output + output2_output)
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=0.01)
    train_op = optimizer.minimize(loss=loss)
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

estimator = tf.estimator.Estimator(model_fn=model_fn)
estimator.train(input_fn=input_fn)
crashes with
ValueError: Variable A_B_shared_embedding already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope?
It seems one should be able to use variable_scope or name_scope to get it to work, but so far no luck.
shared_embeddings has an option to set a new embedding collection name:
model2_embedddings = tf.feature_column.shared_embeddings(
    categorical_columns=[colA, colB], dimension=2,
    shared_embedding_collection_name="other")
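The default variable name is derived from the column keys (hence A_B_shared_embedding in the error), so giving the second set of embeddings its own collection name should avoid the collision without any variable_scope tricks.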

Tensorflow Executor failed to create kernel on GPU

I am trying to train a network using TensorFlow on a GPU, but this error is thrown during the training process.
I checked the free memory size of the GPU; it seems fine.
E tensorflow/core/common_runtime/executor.cc:623] Executor failed to create kernel. Invalid argument: Default AvgPoolingOp only supports NHWC on device type CPU
[[{{node vgg_src/pool1}} = AvgPool[T=DT_FLOAT, data_format="NCHW", ksize=[1, 1, 2, 2], padding="SAME", strides=[1, 1, 2, 2], _device="/job:localhost/replica:0/task:0/device:GPU:0"](vgg_src/conv1_2/Relu)]]
Although I can still train and run the network, I want to know what causes this problem and how I can fix it.

Update: here is the code of my model, which is a modified VGG-16 network.
import os
import tensorflow as tf
import numpy as np
import pdb

vgg_mean = [0.485, 0.456, 0.406]
vgg_std = [0.229, 0.224, 0.225]

data = None
dir_path = os.path.dirname(os.path.realpath(__file__))
# dir_path = os.path.normpath(os.path.join(dir_path, os.pardir))
weights_path = os.path.join(dir_path, 'models', 'vgg16_onnx.npy')

class Model():
    def __init__(self, vgg16_npy_path=None):
        global data
        if vgg16_npy_path is None:
            path = weights_path
            print(path)
            if os.path.exists(path):
                vgg16_npy_path = path
            else:
                print("VGG16 weights were not found in the project directory!")
                exit(0)
        if data is None:
            data = np.load(vgg16_npy_path, encoding='latin1')
            self.data_dict = data.item()
            print("VGG16 weights loaded")
        else:
            self.data_dict = data.item()

    def build(self, bgr_input):
        '''Note that OpenCV loads images in BGR order, but the pretrained model expects RGB.'''
        blue, green, red = tf.split(axis=3, num_or_size_splits=3, value=bgr_input)
        rgb = tf.concat(axis=3, values=[
            (red - vgg_mean[0]) / vgg_std[0],
            (green - vgg_mean[1]) / vgg_std[1],
            (blue - vgg_mean[2]) / vgg_std[2],
        ])

        self.conv1_1 = self.conv_layer(rgb, "conv1_1")
        self.conv1_2 = self.conv_layer(self.conv1_1, "conv1_2")
        self.pool1 = self.avg_pool(self.conv1_2, 'pool1')

        self.conv2_1 = self.conv_layer(self.pool1, "conv2_1")
        self.conv2_2 = self.conv_layer(self.conv2_1, "conv2_2")
        self.pool2 = self.avg_pool(self.conv2_2, 'pool2')

        self.conv3_1 = self.conv_layer(self.pool2, "conv3_1")
        self.conv3_2 = self.conv_layer(self.conv3_1, "conv3_2")
        self.conv3_3 = self.conv_layer(self.conv3_2, "conv3_3")
        self.pool3 = self.avg_pool(self.conv3_3, 'pool3')

        self.conv4_1 = self.conv_layer(self.pool3, "conv4_1")
        self.conv4_2 = self.conv_layer(self.conv4_1, "conv4_2")
        self.conv4_3 = self.conv_layer(self.conv4_2, "conv4_3")
        self.pool4 = self.avg_pool(self.conv4_3, 'pool4')

        self.conv5_1 = self.conv_layer(self.pool4, "conv5_1")
        self.conv5_2 = self.conv_layer(self.conv5_1, "conv5_2")
        self.conv5_3 = self.conv_layer(self.conv5_2, "conv5_3")
        self.pool5 = self.avg_pool(self.conv5_3, 'pool5')

        self.fc6 = self.fc_layer(self.pool5, 'fc6')
        self.fc7 = self.fc_layer(self.fc6, 'fc7')
        self.fc8 = self.fc_layer(self.fc7, 'fc8')

        self.data_dict = None

    def avg_pool(self, bottom, name):
        return tf.nn.avg_pool(bottom,
                              ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name=name)

    def max_pool(self, bottom, name):
        return tf.nn.max_pool(bottom,
                              ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name=name)

    def conv_layer(self, bottom, name, stride=1):
        with tf.variable_scope(name):
            filt = self.get_conv_filter(name)
            conv = tf.nn.conv2d(bottom, filt, [1, stride, stride, 1], padding='SAME')
            conv_biases = self.get_bias(name)
            bias = tf.nn.bias_add(conv, conv_biases)

            mean = self.get_mean(name)
            variance = self.get_variance(name)
            offset = self.get_beta(name)
            scale = self.get_gamma(name)
            norm = tf.nn.batch_normalization(bias, mean, variance, offset, scale, 1e-20)

            relu = tf.nn.relu(norm)
            return relu

    def fc_layer(self, bottom, name):
        with tf.variable_scope(name):
            shape = bottom.get_shape().as_list()
            dim = 1
            for d in shape[1:]:
                dim *= d
            x = tf.reshape(bottom, [-1, dim])

            weights = self.get_fc_weight(name)
            biases = self.get_bias(name)

            # Fully connected layer. Note that the '+' operation automatically
            # broadcasts the biases.
            fc = tf.nn.bias_add(tf.matmul(x, weights), biases)
            return fc

    def get_mean(self, name):
        return tf.constant(self.data_dict[name][4], name="mean")

    def get_variance(self, name):
        return tf.constant(self.data_dict[name][5], name="variance")

    def get_gamma(self, name):
        return tf.constant(self.data_dict[name][2], name="gamma")

    def get_beta(self, name):
        return tf.constant(self.data_dict[name][3], name="beta")

    def get_conv_filter(self, name):
        return tf.constant(np.rollaxis(np.rollaxis(np.rollaxis(self.data_dict[name][0], 1), 2), 3), name="filter")

    def get_bias(self, name):
        return tf.constant(self.data_dict[name][1], name="biases")

    def get_fc_weight(self, name):
        return tf.constant(np.rollaxis(self.data_dict[name][0], 1), name="weights")
This is not a GPU-memory size issue. Read this thread:
https://github.com/tensorpack/tensorpack/issues/263
TensorFlow only supports NHWC on the CPU, so you may need to change your code according to the thread. Check out these two lines from it; the parameter you may need to change is data_format:
- with argscope([Conv2D, AvgPooling, BatchNorm, GlobalAvgPooling], data_format='NCHW'), \
+ with argscope([Conv2D, AvgPooling, BatchNorm, GlobalAvgPooling], data_format='NHWC')
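If the same change is needed in the model above (rather than in tensorpack), a minimal sketch is to pass data_format explicitly to the pooling call; note that the ksize/strides in the question are already ordered for NHWC ([batch, height, width, channels]):
def avg_pool(self, bottom, name):
    # ksize/strides are in [batch, height, width, channels] order, i.e. NHWC
    return tf.nn.avg_pool(bottom,
                          ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                          padding='SAME', data_format='NHWC', name=name)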

Casting object returned by tf.trainable_variables() as Tensor

tf.trainable_variables() returns a list of all trainable Variable objects. When an object from the list is passed to an op such as tf.nn.l2_loss, TensorFlow is able to cast the object to a Tensor and perform the necessary calculations. However, passing the same object to a user-defined function throws an error.
Consider the following two-layer network:
import numpy as np
import tensorflow as tf

# Generate random data
x_train = np.random.rand(64, 16, 16, 8)
y_train = np.random.randint(0, 5, 64)
one_hot = np.zeros((len(y_train), 5))
one_hot[list(np.indices((len(y_train),))) + [y_train]] = 1
y_train = one_hot

# Model definition
class FeedForward(object):
    def __init__(self, l2_lambda=0.01):
        self.x = tf.placeholder(tf.float32, shape=[None, 16, 16, 4], name="input_x")
        self.y = tf.placeholder(tf.float32, [None, 5], name="input_y")
        l2_loss = tf.constant(0.0)

        with tf.name_scope("conv1"):
            kernel_shape = [1, 1, 4, 4]
            w = tf.Variable(tf.truncated_normal(kernel_shape, stddev=0.1), name="weight")
            conv1 = tf.nn.conv2d(self.x, w, strides=[1, 1, 1, 1], padding="SAME", name="conv")

        with tf.name_scope("conv2"):
            kernel_shape = [1, 1, 4, 2]
            w = tf.Variable(tf.truncated_normal(kernel_shape, stddev=0.1), name="weight")
            conv2 = tf.nn.conv2d(conv1, w, strides=[1, 1, 1, 1], padding="SAME", name="conv")

        out = tf.contrib.layers.flatten(conv2)

        with tf.name_scope("output"):
            kernel_shape = [out.get_shape()[1].value, 5]
            w = tf.Variable(tf.truncated_normal(kernel_shape, stddev=0.1), name="weight")
            self.scores = tf.matmul(out, w, name="scores")
            predictions = tf.argmax(self.scores, axis=1, name="predictions")

        # L2 Regularizer
        if l2_lambda > 0.:
            l2_loss = tf.add_n([self.some_norm(var) for var in tf.trainable_variables() if ("weight" in var.name)])

        losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.y)
        self.loss = tf.reduce_mean(losses) + (l2_lambda * l2_loss)

        correct_predictions = tf.equal(predictions, tf.argmax(self.y, axis=1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

    def some_norm(w):
        # operate on w and return a scalar
        # (only) for example
        return (1 / tf.nn.l2_loss(w))

with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        ffn = FeedForward()
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=1e-2)
        grads_and_vars = optimizer.compute_gradients(ffn.loss)
        sess.run(tf.global_variables_initializer())

        def train_step(x_batch, y_batch):
            feed_dict = {
                ffn.x: x_batch,
                ffn.y: y_batch,
            }
            _, step, loss, accuracy = sess.run([train_op, global_step, ffn.loss, ffn.accuracy], feed_dict)
            print("step {}, loss {:g}, acc {:g}".format(step, loss, accuracy))

        batch_size = 32
        n_epochs = 4
        s_idx = -batch_size
        for batch_index in range(n_epochs):
            s_idx += batch_size
            e_idx = s_idx + batch_size
            x_batch = x_train[s_idx:e_idx]
            y_batch = y_train[s_idx:e_idx]
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
The problem here is that when the trainable variable is passed to some_norm(), it arrives as an object that cannot be operated on. The related error message, raised at the first line inside some_norm(), is:
Failed to convert object of type <class '__main__.FeedForward'> to Tensor.
Contents: <__main__.FeedForward object at 0x7fefde7e97b8>.
Consider casting elements to a supported type.
Is there a way to cast the objects returned by tf.trainable_variables() to tensors, or is there a possible workaround, such as passing a reference?
How is the above different from using l2_loss = tf.add_n([tf.nn.l2_loss(var) for var in tf.trainable_variables()...]), which works just fine?
You forgot the self argument in your some_norm implementation (def some_norm(w):), so it tries to convert your instance of the class (self) to a tensor.
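A minimal sketch of the fix, keeping the example norm from the question:
def some_norm(self, w):
    # `w` now receives the variable from the list comprehension,
    # not the FeedForward instance
    return 1 / tf.nn.l2_loss(w)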

Implementing LSTM regression model with tensor flow

I am trying to implement a TensorFlow LSTM regression model for a list of input numbers.
example:
input_data = [1, 2, 3, 4, 5]
time_steps = 2
-> X == [[1, 2], [2, 3], [3, 4]]
-> y == [3, 4, 5]
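For concreteness, windows of this form can be built along these lines (a sketch; make_windows is an illustrative helper, not part of the code below):
import numpy as np

def make_windows(series, time_steps):
    # pair each window of length `time_steps` with the value that follows it
    X = np.array([series[i:i + time_steps] for i in range(len(series) - time_steps)])
    y = np.array(series[time_steps:])
    return X, y

Xw, yw = make_windows([1, 2, 3, 4, 5], 2)
# Xw -> [[1 2], [2 3], [3 4]], yw -> [3 4 5]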
The code is below:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

TIMESTEPS = 20
num_hidden = 20

Xd, yd = load_data()
train_input = Xd['train']
train_input = train_input.reshape(-1, 20, 1)
train_output = yd['train']
# train_input  = [[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], ...
# train_output = [[21],[22],[23], ...
test_input = Xd['test']
test_output = yd['test']

X = tf.placeholder(tf.float32, [None, 20, 1])
y = tf.placeholder(tf.float32, [None, 1])

cell = tf.nn.rnn_cell.LSTMCell(num_hidden, state_is_tuple=True)
val, state = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)
val = tf.Print(val, [tf.argmax(val, 1)], 'argmax(val)=', summarize=20, first_n=7)
val = tf.transpose(val, [1, 0, 2])
val = tf.Print(val, [tf.argmax(val, 1)], 'argmax(val2)=', summarize=20, first_n=7)

# Take only the last output after 20 time steps
last = tf.gather(val, int(val.get_shape()[0]) - 1)
last = tf.Print(last, [tf.argmax(last, 1)], 'argmax(val3)=', summarize=20, first_n=7)

# define variables for weights and bias
weight = tf.Variable(tf.truncated_normal([num_hidden, int(y.get_shape()[1])]))
bias = tf.Variable(tf.constant(0.1, shape=[y.get_shape()[1]]))

# Prediction is matmul of the last value with the weight, plus the bias
prediction = tf.matmul(last, weight) + bias

# Cost function using softmax
# y is the true distribution and prediction is the predicted one
cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(prediction), reduction_indices=[1]))
# cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))

optimizer = tf.train.AdamOptimizer()
minimize = optimizer.minimize(cost)

from tensorflow.python import debug as tf_debug

inita = tf.initialize_all_variables()
sess = tf.Session()
sess.run(inita)

batch_size = 100
no_of_batches = int(len(train_input) / batch_size)
epoch = 10
test_size = 100
for i in range(epoch):
    for start, end in zip(range(0, len(train_input), batch_size),
                          range(batch_size, len(train_input) + 1, batch_size)):
        sess.run(minimize, feed_dict={X: train_input[start:end], y: train_output[start:end]})
    test_indices = np.arange(len(test_input))  # Get a test batch
    np.random.shuffle(test_indices)
    test_indices = test_indices[0:test_size]
    print(i, mean_squared_error(np.argmax(test_output[test_indices], axis=1),
                                sess.run(prediction, feed_dict={X: test_input[test_indices]})))

print("predictions", prediction.eval(feed_dict={X: train_input}, session=sess))
y_pred = prediction.eval(feed_dict={X: test_input}, session=sess)
sess.close()

test_size = test_output.shape[0]
ax = np.arange(0, test_size, 1)
plt.plot(ax, test_output, 'r', ax, y_pred, 'b')
plt.show()
But I am not able to minimize the cost; the calculated MSE increases at each step instead of decreasing. I suspect there is a problem with the cost function that I am using.
Any thoughts or suggestions as to what I am doing wrong?
Thanks
As mentioned in the comments, you need to change your loss function to MSE and reduce your learning rate. Is your error converging to zero?
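A minimal sketch of that change (the learning rate value is illustrative):
# mean squared error instead of the cross-entropy-style cost
cost = tf.reduce_mean(tf.square(prediction - y))
# Adam with a learning rate below its 0.001 default
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
minimize = optimizer.minimize(cost)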