gradient calculation for bias term using GradientTape() - tensorflow2.0

I want to calculate gradient tensors with respect to the weight variables and the bias term separately. The gradient for the weight variables is calculated correctly, but the gradient for the bias is NOT computed. Please let me know what the problem is, or how to fix my code.
import numpy as np
import tensorflow as tf

X = tf.constant([[1.0, 0.1, -1.0], [2.0, 0.2, -2.0], [3.0, 0.3, -3.0],
                 [4.0, 0.4, -4.0], [5.0, 0.5, -5.0]])
b1 = tf.Variable(-0.5)
Bb = tf.constant([[1.0], [1.0], [1.0], [1.0], [1.0]])
Bb = b1 * Bb
Y0 = tf.constant([[-10.0], [-5.0], [0.0], [5.0], [10.0]])
W = tf.Variable([[1.0], [1.0], [1.0]])

with tf.GradientTape() as tape:
    Y = tf.matmul(X, W) + Bb
    print("Y : ", Y.numpy())
    loss_val = tf.reduce_sum(tf.square(Y - Y0))
    print("loss : ", loss_val.numpy())

gw = tape.gradient(loss_val, W)   # gradient calculation works well
gb = tape.gradient(loss_val, b1)  # does NOT work
print("gradient W : ", gw.numpy())
print("gradient b : ", gb.numpy())

Two things. Firstly, if you look at the docs here:
https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/GradientTape#args
you'll see that you can only make a single call to gradient unless you pass persistent=True.
Secondly, you're computing Bb = b1 * Bb outside of the tape's context manager, so this op is not being recorded.
import numpy as np
import tensorflow as tf

X = tf.constant([[1.0, 0.1, -1.0], [2.0, 0.2, -2.0], [3.0, 0.3, -3.0],
                 [4.0, 0.4, -4.0], [5.0, 0.5, -5.0]])
b1 = tf.Variable(-0.5)
Bb = tf.constant([[1.0], [1.0], [1.0], [1.0], [1.0]])
Y0 = tf.constant([[-10.0], [-5.0], [0.0], [5.0], [10.0]])
W = tf.Variable([[1.0], [1.0], [1.0]])

with tf.GradientTape(persistent=True) as tape:
    Bb = b1 * Bb
    Y = tf.matmul(X, W) + Bb
    print("Y : ", Y.numpy())
    loss_val = tf.reduce_sum(tf.square(Y - Y0))
    print("loss : ", loss_val.numpy())

gw = tape.gradient(loss_val, W)   # gradient w.r.t. W
gb = tape.gradient(loss_val, b1)  # gradient w.r.t. b1 (now computed correctly)
print("gradient W : ", gw.numpy())
print("gradient b : ", gb.numpy())


Numpy to pyTorch: are there different data types?

Question: Can somebody help me align these two approaches to data generation so that both of them can be used by the nn-model below? When using approach (2) with numpy and torch.from_numpy(x), a runtime error occurs ("expected scalar type Float but found Double").
For data generation I have these two approaches:
import torch
import torch.nn as nn
import numpy as np

def get_training_data_1():
    x = torch.randn(batch_size, n_in)
    y = torch.tensor([[1.0], [0.0], [0.0], [1.0], [1.0], [1.0], [0.0], [0.0], [1.0], [1.0]])
    return x, y

def get_training_data_2():
    x = np.random.rand(batch_size, n_in)
    y = np.array([[1.0], [0.0], [0.0], [1.0], [1.0], [1.0], [0.0], [0.0], [1.0], [1.0]])
    x = torch.from_numpy(x)
    y = torch.from_numpy(y)
    return x, y

n_in, n_h, n_out, batch_size = 2, 5, 1, 10
x, y = get_training_data_2()
With this model I run into problems when using approach (2) with numpy and torch.from_numpy(x), while it is OK when using approach (1):
#---- Create a NN-model
model = nn.Sequential(nn.Linear(n_in, n_h),   # hidden layer
                      nn.ReLU(),              # activation layer
                      nn.Linear(n_h, n_out),  # output layer
                      nn.Sigmoid())           # final 0, 1 rounding
#---- Construct the loss function
criterion = torch.nn.MSELoss()
#---- Construct the optimizer (Stochastic Gradient Descent in this case)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
#---- Gradient Descent
for epoch in range(1501):
    y_pred = model(x)            # Forward pass: compute predicted y by passing x to the model
    loss = criterion(y_pred, y)  # Compute and print loss
    if epoch % 50 == 0:
        print(epoch, loss.item())
    optimizer.zero_grad()        # Zero gradients, perform a backward pass, and update the weights.
    loss.backward()              # perform a backward pass (backpropagation)
    optimizer.step()             # Update the parameters
The default floating point type in torch is float32 (i.e. single precision). In NumPy the default is float64 (double precision). Try changing get_training_data_2 so that it explicitly sets the data type of the numpy arrays to numpy.float32 before converting them to torch tensors:
def get_training_data_2():
    x = np.random.rand(batch_size, n_in).astype(np.float32)
    y = np.array([[1.0], [0.0], [0.0], [1.0], [1.0], [1.0], [0.0], [0.0], [1.0], [1.0]],
                 dtype=np.float32)
    x = torch.from_numpy(x)
    y = torch.from_numpy(y)
    return x, y
Note. With the newer NumPy random API, you can generate float32 samples directly instead of casting float64 values to float32.
def get_training_data_2(rng):
    x = rng.random(size=(batch_size, n_in), dtype=np.float32)
    y = np.array([[1.0], [0.0], [0.0], [1.0], [1.0], [1.0], [0.0], [0.0], [1.0], [1.0]],
                 dtype=np.float32)
    x = torch.from_numpy(x)
    y = torch.from_numpy(y)
    return x, y

rng = np.random.default_rng()
x, y = get_training_data_2(rng)
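Alternatively (a sketch based on standard PyTorch casting, not part of the original answer), you can leave the NumPy arrays at float64 and cast on the torch side, since .float() converts a tensor to torch.float32:
def get_training_data_2():
    x = np.random.rand(batch_size, n_in)
    y = np.array([[1.0], [0.0], [0.0], [1.0], [1.0], [1.0], [0.0], [0.0], [1.0], [1.0]])
    # torch.from_numpy keeps float64 here, so cast the tensors to float32 afterwards
    x = torch.from_numpy(x).float()
    y = torch.from_numpy(y).float()
    return x, y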

problem to calculate gradient using GradientTape() of tensorflow 2.0

Using tensorflow 2.0 and the GradientTape() function, the first tape.gradient() call gives the correct gradient tensor, but the second tape.gradient() call gives 'None'.
Why is the second value 'None'? I expect the gradient to be computed for the second tape as well.
import tensorflow as tf
import numpy as np

x = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
y0 = tf.constant([[4.0], [8.0], [12.0]])
w = tf.Variable([[1.0], [1.0]])

with tf.GradientTape() as tape:
    y = tf.matmul(x, w)
    print("y : ", y.numpy())
    loss = tf.reduce_sum(y - y0)
    print("loss : ", loss.numpy())

grad = tape.gradient(loss, w)  # gradient calculation is correct
print("gradient : ", grad.numpy())

mu = 0.01
w = w - mu * grad

with tf.GradientTape() as tape:
    y = tf.matmul(x, w)
    print("y : ", y.numpy())
    loss = tf.reduce_sum(y - y0)
    print("loss : ", loss.numpy())

grad = tape.gradient(loss, w)  # gradient comes back as 'None'
print("gradient : ", grad)
You are overwriting w with a Tensor (which is not a Variable) when you assign w = w - mu*grad. By default, GradientTape only tracks variables. You have two options.
Recommended: replace w = w - mu*grad with w.assign(w - mu*grad). This keeps w as a Variable and is the standard way to update a variable's value (see the sketch after these options).
Alternatively, you can track non-variables explicitly in a GradientTape: in the second tape context, add tape.watch(w) at the very beginning (before the matmul).
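A minimal sketch of the recommended option applied to the example from the question (after w.assign, the second tape still sees w as the same Variable, so no tape.watch is needed):
import tensorflow as tf

x = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
y0 = tf.constant([[4.0], [8.0], [12.0]])
w = tf.Variable([[1.0], [1.0]])
mu = 0.01

with tf.GradientTape() as tape:
    loss = tf.reduce_sum(tf.matmul(x, w) - y0)
grad = tape.gradient(loss, w)

w.assign(w - mu * grad)        # in-place update; w stays a tf.Variable

with tf.GradientTape() as tape:
    loss = tf.reduce_sum(tf.matmul(x, w) - y0)
grad = tape.gradient(loss, w)  # no longer None
print("gradient : ", grad.numpy())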

"keras.backend.variable" is not behaving correctly in keras as opposed to tensorflow

I want to define a trainable scalar in my models. In TensorFlow, this is done using tf.Variable. In Keras, keras.backend.variable is supposed to behave the same way. However, when I use model.fit, Keras does not change the variable during the optimization process. Does anyone know why?
To test, please uncomment either RUN_ON = "tensorflow" or RUN_ON = "keras" to run the code on the corresponding engine.
import numpy as np
import keras as k
import tensorflow as tf
import matplotlib.pyplot as plt

# RUN_ON = "tensorflow"
# RUN_ON = "keras"

b_true = 3.0
w_true = 5.0
x_true = np.linspace(0.0, 1.0, 1000).reshape(-1, 1)
y_true = x_true * w_true + b_true
ids = np.arange(0, x_true.shape[0])

if RUN_ON == "keras":
    x = k.Input((1,), dtype="float32", name="x")
    Fx = k.layers.Dense(1, use_bias=False, name="Fx")(x)
    b = k.backend.variable(1.0, name="b")
    y = k.layers.Lambda(lambda x: x + b, name="Add")(Fx)
    model = k.Model(inputs=[x], outputs=[y])
    model.compile("adam", loss="mse")
    # model.summary()
    model.fit(x_true, [y_true], epochs=100000, batch_size=1000)
    y_pred = model.predict(x_true)
elif RUN_ON == "tensorflow":
    x = tf.placeholder("float32", shape=[None, 1], name="x")
    Fx = tf.layers.Dense(1, use_bias=False, name="Fx")(x)
    b = tf.Variable(1.0, name="b")
    y = Fx + b
    yp = tf.placeholder("float32", shape=[None, 1], name="y")
    loss = tf.reduce_mean(tf.square(yp - y))
    opt = tf.train.AdamOptimizer(0.001).minimize(loss)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(100000):
            np.random.shuffle(ids)
            opt_out, loss_val, b_val = sess.run([opt, loss, b], feed_dict={x: x_true[ids], yp: y_true[ids]})
            print("epoch={:d} loss={:e} b_val={:f}".format(i, loss_val, b_val))
            if loss_val < 1.0e-9:
                break
        y_pred = sess.run([y], feed_dict={x: x_true, yp: y_true})[0]
else:
    raise ValueError('`RUN_ON` should be either `keras` or `tensorflow`.')

plt.plot(x_true, y_true, '--b', linewidth=4)
plt.plot(x_true, y_pred, 'r')
plt.show()
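For what it's worth (this is my own reading, not part of the original post): a variable created with keras.backend.variable and captured in a Lambda closure is not registered as a layer weight, so it never appears in model.trainable_weights and model.fit has nothing to update. The usual workaround is a small custom layer that registers the scalar via add_weight; a minimal sketch with illustrative names:
import keras as k

class AddBias(k.layers.Layer):
    # Adds a single trainable scalar to its input.
    def build(self, input_shape):
        # add_weight registers the scalar as a trainable weight of the model
        self.b = self.add_weight(name="b", shape=(1,), initializer="ones", trainable=True)
        super(AddBias, self).build(input_shape)

    def call(self, inputs):
        return inputs + self.b

x = k.Input((1,), dtype="float32", name="x")
Fx = k.layers.Dense(1, use_bias=False, name="Fx")(x)
y = AddBias(name="Add")(Fx)
model = k.Model(inputs=[x], outputs=[y])
model.compile("adam", loss="mse")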

tf.layers.batch_normalization freezes during sess.run() (1.5.0-dev20171031)

The graph building phase passes without error, but the program freezes (no hard drive reads, no memory change, no ...) during sess.run() on the first mini-batch of the first epoch. If I remove this layer or replace it with tf.contrib.layers.layer_norm, the program runs without issues.
The tensor (x) I pass into tf.layers.batch_normalization has the shape [#batches, 200]. I use mostly default values, but turned off center and scale.
x_BN = tf.layers.batch_normalization(
    x,
    axis=-1,
    momentum=0.99,
    epsilon=1e-10,  # 0.001,
    center=False,   # True,
    scale=False,    # True,
    beta_initializer=tf.zeros_initializer(),
    gamma_initializer=tf.ones_initializer(),
    moving_mean_initializer=tf.zeros_initializer(),
    moving_variance_initializer=tf.ones_initializer(),
    beta_regularizer=None,
    gamma_regularizer=None,
    beta_constraint=None,
    gamma_constraint=None,
    training=Flg_training,  # False,
    trainable=True,
    name=None,
    reuse=None,
    renorm=False,
    renorm_clipping=None,
    renorm_momentum=0.99,
    fused=False,
    virtual_batch_size=None,
    adjustment=None
)
The tensorflow version I'm using is tf-nightly-gpu (1.5.0-dev20171031 or 1.5.0-dev20171023). Has anyone encountered a similar problem?
Update
This happens when the input of tf.layers.batch_normalization comes from tf.nn.bidirectional_dynamic_rnn; please see the simplified code below, which reproduces the issue:
import tensorflow as tf
import numpy as np

starter_learning_rate = 0.001
decay_steps = 100
decay_rate = 0.96
num_RNN_layers = 3
LSTM_CELL_SIZE = 100
keep_prob = 0.95

with tf.name_scope('Inputs'):
    x = tf.placeholder(dtype=tf.float32, shape=[None, 200])
    y = tf.placeholder(dtype=tf.float32, shape=[None, 200])
    length = tf.placeholder(dtype=tf.int32, shape=[None])
    Flg_training = tf.placeholder(dtype=tf.bool, shape=[])
    x_1 = tf.expand_dims(x, -1)

with tf.name_scope('BiLSTM'):
    dropcells = []
    for iiLyr in list(range(num_RNN_layers)):
        cell_iiLyr = tf.nn.rnn_cell.LSTMCell(num_units=LSTM_CELL_SIZE, state_is_tuple=True)
        dropcells.append(tf.nn.rnn_cell.DropoutWrapper(cell=cell_iiLyr, output_keep_prob=keep_prob))  #,, input_keep_prob=self.keep_prob input_keep_prob=1.0, seed=None
    MultiLyr_cell = tf.nn.rnn_cell.MultiRNNCell(cells=dropcells, state_is_tuple=True)
    outputs, states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=MultiLyr_cell,
        cell_bw=MultiLyr_cell,
        dtype=tf.float32,
        sequence_length=length,  # tf_b_lens
        inputs=x_1,              # stacked_RefPts_desc, #tf_b_VCCs_AMs_BN1
        scope="BiLSTM"
    )
    #output_fw, output_bw = outputs
    states_fw, states_bw = states
    c_fw_lstLyr, h_fw_lstLyr = states_fw[-1]
    c_bw_lstLyr, h_bw_lstLyr = states_bw[-1]
    states_concat1 = tf.concat([h_fw_lstLyr, h_bw_lstLyr], axis=1, name='states_concat')

with tf.name_scope("cs_BN1"):
    x_BN = tf.layers.batch_normalization(
        states_concat1,
        axis=-1,  # axis that should be normalized (typically the features axis, in this case the concated states or hidden vectors)
        momentum=0.99,
        epsilon=1e-10,  # 0.001,
        center=False,   # True,
        scale=False,    # True,
        beta_initializer=tf.zeros_initializer(),
        gamma_initializer=tf.ones_initializer(),
        moving_mean_initializer=tf.zeros_initializer(),
        moving_variance_initializer=tf.ones_initializer(),
        beta_regularizer=None,
        gamma_regularizer=None,
        beta_constraint=None,
        gamma_constraint=None,
        training=Flg_training,  # False,
        trainable=True,
        name="test_BN",  # None,
        reuse=None,
        renorm=False,
        renorm_clipping=None,
        renorm_momentum=0.99,
        fused=False,
        virtual_batch_size=None,
        adjustment=None
    )

with tf.name_scope("Regression"):
    a = tf.get_variable("a", shape=[1], dtype=tf.float32, initializer=tf.constant_initializer(1.0))
    b = tf.get_variable("b", shape=[1], dtype=tf.float32, initializer=tf.constant_initializer(0.0))

with tf.name_scope("Prediction"):
    y_pred = tf.multiply(x_BN, a) + b

with tf.name_scope('Loss'):
    losses = tf.losses.mean_squared_error(y, y_pred, reduction=tf.losses.Reduction.NONE)
    mean_loss = tf.reduce_mean(losses)

with tf.name_scope('Training'):
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                               decay_steps, decay_rate, staircase=True)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(losses, global_step=global_step)

#x_mean = tf.reduce_mean(x_BN, axis=0)
sess = tf.InteractiveSession()
train_writer = tf.summary.FileWriter("G:\\Surface_Ozone\\Temp\\", sess.graph)
sess.run(tf.global_variables_initializer())

for ii in list(range(2000)):
    x_in = (np.random.rand(20, 200))
    y_in = x_in * 1.5 + 3.0
    length_in = np.full([20], 200, dtype=np.int32)
    _, mean_loss_val, a_val, b_val = sess.run([train_step, mean_loss, a, b], feed_dict={
        x: x_in,
        Flg_training: True,
        y: y_in,
        length: length_in
    })
    if (ii < 50):
        print("step {}: {} | a: {} | b: {}".format(ii, mean_loss_val, a_val, b_val))
    else:
        if (ii % 100 == 0):
            print("step {}: {} | a: {} | b: {}".format(ii, mean_loss_val, a_val, b_val))

print("Normal End.")

Creating a highly customizable RNN in Tensorflow

I am trying to implement an RNN without using the RNN functions provided by tensorflow. Here is the code I tried, which eventually gave me an error:
import tensorflow as tf

tf.InteractiveSession()
x = tf.placeholder(tf.float32, shape=(5, 5))
InitialState = tf.zeros((5, 1))
h = InitialState
W1 = tf.Variable(tf.random_normal([5, 5], stddev=0.35), name="W1")
W2 = tf.Variable(tf.random_normal([5, 5], stddev=0.35), name="W2")

for k in range(5):
    h = tf.matmul(W1, h) + tf.matmul(W2, x[:, k:(k + 1)])
    h = tf.sigmoid(h)

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    a = sess.run([h], feed_dict={x: tf.ones((5, 5))})
How can I implement an RNN from scratch? Is there an example online?
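A note on the snippet above (my observation, not part of the original post): sess.run's feed_dict expects NumPy arrays or Python values, not tensors, so feeding tf.ones((5, 5)) is itself an error; something like
a = sess.run([h], feed_dict={x: np.ones((5, 5), dtype=np.float32)})
avoids that particular problem. Beyond that, a hand-rolled RNN is just a loop that reuses the same weights at every time step, as in the code below.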
import tensorflow as tf
import numpy as np

hidden_size = 2  # hidden layer of two neurons
input_size = 5

# Weight of x will be (hidden_layer_size x input_size)
Wx = tf.Variable(tf.random_normal([hidden_size, input_size], stddev=0.35), name="Wx")
# Weight of y will be (input_size x hidden_layer_size)
Wy = tf.Variable(tf.random_normal([input_size, hidden_size], stddev=0.35), name="Wy")
# Weight of h will be (hidden_size, hidden_size)
Wh = tf.Variable(tf.random_normal([hidden_size, hidden_size], stddev=0.35), name="Wh")

h = tf.zeros((hidden_size, input_size))

x = tf.placeholder(dtype=tf.float32, shape=(input_size, input_size))
y = tf.placeholder(dtype=tf.float32, shape=(input_size, input_size))

feed_dict = {
    x: np.ones((5, 5), dtype=np.float32),
    y: np.ones((5, 5), dtype=np.float32)
}

# RNN step
for _ in range(input_size):
    h = tf.tanh(tf.matmul(Wh, h) + tf.matmul(Wx, x))
    o = tf.nn.softmax(h)

init_op = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init_op, feed_dict=feed_dict)

h_new, y_hat = sess.run([h, o], feed_dict=feed_dict)
print(h_new)
print(y_hat)