consistent forward / backward pass with tensorflow dropout - tensorflow

In reinforcement learning one usually runs a forward pass of the neural network at each step of the episode in order to calculate the policy. Afterwards one can calculate the parameter gradients using backpropagation. A simplified implementation of my network looks like this:
class AC_Network(object):
    """Policy network: a ReLU hidden layer with dropout feeding a softmax policy head.

    Constructing an instance adds the placeholders, the forward graph, the
    policy loss and its gradients to the graph under ``scope``.

    Attributes:
        is_training: scalar bool placeholder passed to the dropout layer.
        inputs: float32 placeholder of shape ``[None, s_size]``.
        policy: softmax output of width ``a_size``.
        actions: int32 placeholder of shape ``[None]``.
        advantages: float32 placeholder of shape ``[None]``.
        policy_loss: scalar loss built from ``policy``, ``actions`` and
            ``advantages``.
        gradients: gradients of ``policy_loss`` with respect to the trainable
            variables collected under ``scope``.
    """

    def __init__(self, s_size, a_size, scope, trainer, parameters_net):
        """Build the graph.

        Args:
            s_size: width of one observation vector.
            a_size: number of discrete actions.
            scope: variable-scope name; also used to collect the trainable
                variables for the gradient computation.
            trainer: not used in the visible part of the snippet.
            parameters_net: dict; ``"dropout_keep_prob"`` is read here.
        """
        with tf.variable_scope(scope):
            self.is_training = tf.placeholder(shape=[], dtype=tf.bool)
            self.inputs = tf.placeholder(shape=[None, s_size], dtype=tf.float32)
            # (...)
            # NOTE(review): `layer_size` is not defined in the visible snippet;
            # presumably it comes from the elided part above -- confirm.
            layer = slim.fully_connected(self.inputs,
                                         layer_size,
                                         activation_fn=tf.nn.relu,
                                         biases_initializer=None)
            layer = tf.contrib.layers.dropout(inputs=layer, keep_prob=parameters_net["dropout_keep_prob"],
                                              is_training=self.is_training)
            self.policy = slim.fully_connected(layer, a_size,
                                               activation_fn=tf.nn.softmax,
                                               biases_initializer=None)
            self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
            self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)
            actions_onehot = tf.one_hot(self.actions, a_size, dtype=tf.float32)
            # Probability the policy assigned to the action that was actually taken.
            responsible_outputs = tf.reduce_sum(self.policy * actions_onehot, [1])
            # NOTE(review): `policy_loss_multiplier` is not defined in the
            # visible snippet either -- confirm where it comes from.
            self.policy_loss = - policy_loss_multiplier * tf.reduce_mean(tf.log(responsible_outputs) * self.advantages)
            local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
            self.gradients = tf.gradients(self.policy_loss, local_vars)
Now during training I will first roll out the episode with consecutive forward passes (again, a simplified version):
# Roll out one episode: one forward pass (one sess.run call) per environment step.
s = self.local_env.reset()  # list of input variables for the first step
# `done` is read by the loop condition before any visible assignment, so it is
# initialised explicitly here.
done = False
while not done:
    # Fetching a one-element list returns a one-element list holding the policy array.
    a_dist = sess.run([self.policy],
                      feed_dict={self.local_AC.inputs: [s],
                                 self.is_training: True})
    # Greedy choice: index of the largest entry of the fetched distribution.
    a = np.argmax(a_dist)
    s, r, done, extra_stat = self.local_env.step(a)
    # (...)
and at the end I calculate the gradients with a backward pass:
# Backward pass over the whole episode: the collected observations and actions
# are stacked into one batch and the loss and its gradients are fetched in a
# single sess.run call.
p_l, grad = sess.run([self.policy_loss,
self.gradients],
feed_dict={self.inputs: np.vstack(comb_observations),
self.is_training: True,
# NOTE(review): policy_loss is built from self.advantages, but that placeholder
# is not fed here -- presumably dropped while simplifying the code; confirm.
self.actions: np.hstack(comb_actions),})
(please note that I could have made a mistake somewhere above trying to remove as much as possible of the original code irrelevant to the issue in question)
So finally the question: Is there a way of ensuring that all the consecutive calls to the sess.run() will generate the same dropout structure? Ideally I would like to have exactly the same dropout structure within each episode and only change it between episodes. Things seem to work well as they are but I continue to wonder.

Related

GradientTape for variable weighted sum of two Sequential models in TensorFlow

Suppose we want to minimize the following equation using gradient descent:
min f(alpha * v + (1-alpha)*w) with v and w the model weights and alpha the weight, between 0 and 1, for the sum resulting in the combined model v_bar or ū (here referred to as m).
# Mixing coefficient; the constraint clips it to [0, 1].
alpha = tf.Variable(0.01, name='Alpha', constraint=lambda t: tf.clip_by_value(t, 0, 1))
# Weight containers for the two source models (w, v) and the combined model (m).
w_weights = tff.learning.ModelWeights.from_model(w)
v_weights = tff.learning.ModelWeights.from_model(v)
m_weights = tff.learning.ModelWeights.from_model(m)
# Element-wise alpha * v + (1 - alpha) * w over the trainable weights.
# The lambda's parameters v and w shadow the model objects of the same name
# only inside the lambda.
m_weights_trainable = tf.nest.map_structure(lambda v, w: alpha*v + (tf.constant(1.0) - alpha)*w, v_weights.trainable, w_weights.trainable)
# Copies the mixed values into m's variables via assign. This runs before the
# GradientTape in the next snippet is opened.
tf.nest.map_structure(lambda v, t: v.assign(t), m_weights.trainable, m_weights_trainable)
In the paper Adaptive Personalized Federated Learning, the formula for the alpha update step suggests updating alpha based on the gradients of model m evaluated on a minibatch. I tried it with and without tape.watch, but it always leads to the error "No gradients provided for any variable":
# Attempted alpha update: record m's forward pass on the tape, then
# differentiate the loss with respect to alpha.
with tf.GradientTape(watch_accessed_variables=False) as tape:
    # Automatic watching is disabled above, so alpha is watched explicitly.
    tape.watch([alpha])
    # alpha is not used inside this context: the weighted sum was computed and
    # assigned into m before the tape was opened.
    outputs_m = m.forward_pass(batch)
# This is the call that returns None in the question.
grad = tape.gradient(outputs_m.loss, alpha)
optimizer.apply_gradients(zip([grad], [alpha]))
Some more information about the initialization of the models:
The m.forward_pass(batch) is the default implementation from tff.learning.Model (found here) by creating a model with tff.learning.from_keras_model and a tf.keras.Sequential model.
def model_fn():
    """Build a fresh Keras model and wrap it as a TFF learning model.

    Returns:
        The result of ``tff.learning.from_keras_model`` for a newly created
        Keras model, using mean squared error as the loss and mean squared
        error plus mean absolute error as metrics.
    """
    keras_model = create_keras_model()
    return tff.learning.from_keras_model(
        keras_model,
        input_spec=element_spec,
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.MeanSquaredError(),
                 tf.keras.metrics.MeanAbsoluteError()],
    )
# Three separately built models: each model_fn() call creates its own Keras
# model. w and v are the models being mixed; m receives the weighted sum.
w = model_fn()
v = model_fn()
m = model_fn()
Some more experimenting as suggested below by Zachary Garrett:
It seems that whenever this weighted sum is calculated, and the new weights for the model are assigned, then it loses track of the previous trainable variables of both summed models. Again, it leads to the No gradients provided for any variable whenever optimizer.apply_gradients(zip([grad], [alpha])) is called. All gradients seem to be None.
# Second attempt: compute the weighted sum inside the tape, copy it into m
# with assign, then run m's forward pass.
with tf.GradientTape() as tape:
    alpha = tf.Variable(0.01, name='Alpha', constraint=lambda t: tf.clip_by_value(t, 0, 1))
    # alpha * v + (1 - alpha) * w for every pair of trainable weights.
    m_weights_t = tf.nest.map_structure(lambda w, v: tf.math.scalar_mul(alpha, v, name=None) + tf.math.scalar_mul(tf.constant(1.0) - alpha, w, name=None),
                                        w.trainable,
                                        v.trainable)
    m_weights = tff.learning.ModelWeights.from_model(m)
    # Use the structure computed just above. The original passed
    # `m_weights_trainable`, a name this snippet never defines.
    # The question reports that gradients are still None after the assign.
    tf.nest.map_structure(lambda v, t: v.assign(t), m_weights.trainable,
                          m_weights_t)
    outputs_m = m.forward_pass(batch)
grad = tape.gradient(outputs_m.loss, alpha)
optimizer.apply_gradients(zip([grad], [alpha]))
Another edit:
I think I have a strategy to get it working, but it is bad practice as manually setting trainable_weights or _trainable_weights does not work. Any tips on improving this?
def do_weighted_combination():
    """Overwrite each layer of ``m`` with the alpha-weighted mix of ``v`` and ``w``.

    For every layer triple taken from ``m.layers``, ``v.layers`` and
    ``w.layers``, the target layer's ``kernel`` and ``bias`` attributes are
    rebound to ``v * alpha + w * (1 - alpha)``. The attributes are replaced by
    the computed tensors rather than assigned into the existing variables.
    """
    def _mapper(target_layer, v_layer, w_layer):
        # Rebind the attributes to expressions that depend on alpha.
        target_layer.kernel = v_layer.kernel * alpha + w_layer.kernel * (1-alpha)
        target_layer.bias = v_layer.bias * alpha + w_layer.bias * (1-alpha)

    tf.nest.map_structure(_mapper, m.layers, v.layers, w.layers)
# The tape is created with persistent=True and is queried twice below.
with tf.GradientTape(persistent=True) as tape:
    # The mixing happens inside the tape context, so the use of alpha is recorded.
    do_weighted_combination()
    predictions = m(x_data)
    loss = m.compiled_loss(y_data, predictions)
g1 = tape.gradient(loss, v.trainable_weights)  # Not None
g2 = tape.gradient(loss, alpha)  # Not None
For TensorFlow auto-differentiation using tf.GradientTape, operations must occur within the tf.GradientTape Python context manager so that TensorFlow can "see" them.
Possibly what is happening here is that alpha is used outside/before the tape context, when setting the model variables. Then, when m.forward_pass is called, TensorFlow doesn't see any access to alpha and thus can't compute a gradient for it (instead returning None).
Moving the
alpha*v + (tf.constant(1.0) - alpha)*w, v_weights.trainable, w_weights.trainable
logic inside the tf.GradientTape context manager (possibly inside m.forward_pass) may be a solution.

How to implement Gaussian Mixture for VAE?

I feel like I don't really know what I'm doing so I will describe what I think I'm doing and what I want to do and where that fails.
Given a normal variational autoencoder:
...
# One dense layer emits 2 * code_size values per example: the first half is
# used as the mean, the second half as the (pre-activation) scale input.
net = tf.layers.dense(net, units=code_size * 2, activation=None)
mean = net[:, :code_size]
std = net[:, code_size:]
posterior = tfd.MultivariateNormalDiagWithSoftplusScale(mean, std)
# The decoder input is a sample drawn from the posterior.
net = posterior.sample()
net = tf.layers.dense(net, units=input_size, ...)
...
What I think I'm doing: Let the neural network find a "mean" and "std" value and use it to create a Normal distribution (Gaussian).
Sample from that distribution and use that for the decoder.
In other words: learn a Gaussian distribution of the encoding
Now I would like to do the same for a mixture of Gaussians.
...
# The dense layer emits 2 * code_size * code_size values per example.
net = tf.layers.dense(net, units=code_size * 2 * code_size, activation=None)
# First split: two equal halves along the last axis (means and scale inputs).
means, stds = tf.split(net, 2, axis=-1)
# Second split: code_size pieces per half, so there are code_size components.
means = tf.split(means, code_size, axis=-1)
stds = tf.split(stds, code_size, axis=-1)
components = [tfd.MultivariateNormalDiagWithSoftplusScale(means[i], stds[i]) for i in range(code_size)]
# Uniform mixing weights as a plain Python list of length code_size; it
# carries no batch dimension.
probs = [1.0 / code_size] * code_size
gauss_mix = tfd.Mixture(cat=tfd.Categorical(probs=probs), components=components)
net = gauss_mix.sample()
net = tf.layers.dense(net, units=input_size, ...)
...
That seemed relatively straightforward to me, except that it fails with the following error:
Shapes () and (?,) are not compatible
This seems to come from probs, which doesn't have the batch dimension (I didn't think it would need one).
I thought that probs defines the probability between the components.
If I define a probs that also has the batch dimension, I get the following cryptic error, and I don't know what it means:
Dimension -1796453376 must be >= 0
Do I generally misunderstand some concepts?
Or what do I need to do differently?

Soft attention from scratch for video sequences

I am trying to implement soft attention for video sequence classification. There are a lot of implementations and examples for NLP, so I tried following this schema, but for video [1]. Basically, it is an LSTM with an attention model in between.
1 https://blog.heuritech.com/2016/01/20/attention-mechanism/
My code for my attention layer is the following; I am not sure whether it is implemented correctly.
def attention_layer(self, input, context):
    """Gate every time step of ``input`` with a softmax and sum the gated steps.

    Args:
        input: iterable of per-step feature tensors. The original comments
            describe the layout as (Seq_length, batch_size, lstm_units), with
            each step being [batch_size, lstm_units].
        context: LSTMStateTuple; only its first element is used.

    Returns:
        A pair ``(out, s)``: ``out`` is the sum over all steps of
        ``feat * softmax(tanh(Wc c + Wy y))``, and ``s`` is the softmax gate of
        the last step only.
    """
    hidden_state, _ = context
    weights_y = tf.get_variable("att_weights_Y", [self.lstm_units, self.lstm_units], initializer=tf.contrib.layers.xavier_initializer())
    weights_c = tf.get_variable("att_weights_c", [self.lstm_units, self.lstm_units], initializer=tf.contrib.layers.xavier_initializer())
    # Wc c does not depend on the time step, so it is built once instead of
    # adding an identical matmul to the graph for every step.
    Wcc = tf.matmul(hidden_state, weights_c)
    z_ = []
    for feat in input:
        # Equation => M = tanh(Wc c + Wy y)
        Wyy = tf.matmul(feat, weights_y)
        m = tf.tanh(tf.add(Wcc, Wyy), name='M_matrix')
        # Equation => s = softmax(m)
        # NOTE(review): no axis is given, so the softmax presumably normalises
        # over the last (lstm_units) axis of this step rather than across
        # time steps -- confirm that this is intended.
        s = tf.nn.softmax(m, name='softmax_att')
        z_.append(tf.multiply(feat, s))
    # Stack the gated steps along a new axis 1, then sum that axis away.
    out = tf.stack(z_, axis=1)
    out = tf.reduce_sum(out, 1)
    return out, s
So, adding this layer in between my LSTMs (or at the beginning of my 2 LSTMs) makes the training very slow. More specifically, it takes a lot of time when I declare my optimizer:
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
My questions are:
Is the implementation correct? If it is, is there a way to optimize it in order to make it train properly?
I was not able to make it work with the seq2seq APIs. Is there any API in TensorFlow that allows me to tackle this specific issue?
Does it actually make sense to use this for sequence classification?

How to use previous output and hidden states from LSTM for the attention mechanism?

I am currently trying to code the attention mechanism from this paper: "Effective Approaches to Attention-based Neural Machine Translation", Luong, Pham, Manning (2015). (I use global attention with the dot score).
However, I am unsure how to feed in the hidden and output states from the LSTM decoder. The issue is that the input of the LSTM decoder at time t depends on quantities that I need to compute using the output and hidden states from t-1.
Here is the relevant part of the code:
# Graph construction for an encoder/decoder with dot-score attention.
# The snippet's indentation was lost; the statements are kept exactly as given.
with tf.variable_scope('data'):
prob = tf.placeholder_with_default(1.0, shape=())
X_or = tf.placeholder(shape = [batch_size, timesteps_1, num_input], dtype = tf.float32, name = "input")
# Split the input along axis 1 into a list of timesteps_1 per-step tensors.
X = tf.unstack(X_or, timesteps_1, 1)
y = tf.placeholder(shape = [window_size,1], dtype = tf.float32, name = "label_annotation")
# Seed row for the logits accumulator; it is sliced off again on the last line.
logits = tf.zeros((1,1), tf.float32)
with tf.variable_scope('lstm_cell_encoder'):
rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [hidden_size, hidden_size]]
multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
lstm_outputs, lstm_state = tf.contrib.rnn.static_rnn(cell=multi_rnn_cell,inputs=X,dtype=tf.float32)
concat_lstm_outputs = tf.stack(tf.squeeze(lstm_outputs))
# State of the last layer of the stacked encoder cell.
last_encoder_state = lstm_state[-1]
with tf.variable_scope('lstm_cell_decoder'):
initial_input = tf.unstack(tf.zeros(shape=(1,1,hidden_size2)))
rnn_decoder_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, state_is_tuple = True)
# Compute the hidden and output of h_1
for index in range(window_size):
# Each iteration adds a one-step static_rnn to the graph, fed with the
# tensors currently bound to initial_input / last_encoder_state.
output_decoder, state_decoder = tf.nn.static_rnn(rnn_decoder_cell, initial_input, initial_state=last_encoder_state, dtype=tf.float32)
# Compute the score for source output vector
scores = tf.matmul(concat_lstm_outputs, tf.reshape(output_decoder[-1],(hidden_size,1)))
attention_coef = tf.nn.softmax(scores)
context_vector = tf.reduce_sum(tf.multiply(concat_lstm_outputs, tf.reshape(attention_coef, (window_size, 1))),0)
context_vector = tf.reshape(context_vector, (1,hidden_size))
# compute the tilda hidden state \tilde{h}_t=tanh(W[c_t, h_t]+b_t)
concat_context = tf.concat([context_vector, output_decoder[-1]], axis = 1)
# NOTE(review): tf.Variable is called inside the loop, so W_tilde, b_tilde and
# W_target are presumably created anew for every index instead of being shared
# across decoder steps -- confirm whether that is intended.
W_tilde = tf.Variable(tf.random_normal(shape = [hidden_size*2, hidden_size2], stddev = 0.1), name = "weights_tilde", trainable = True)
b_tilde = tf.Variable(tf.zeros([1, hidden_size2]), name="bias_tilde", trainable = True)
hidden_tilde = tf.nn.tanh(tf.matmul(concat_context, W_tilde)+b_tilde) # hidden_tilde is [1*64]
# update for next time step
# Rebinding these Python names only changes what the NEXT iteration feeds in.
initial_input = tf.unstack(tf.reshape(hidden_tilde, (1,1,hidden_size2)))
last_encoder_state = state_decoder
# predict the target
W_target = tf.Variable(tf.random_normal(shape = [hidden_size2, 1], stddev = 0.1), name = "weights_target", trainable = True)
logit = tf.matmul(hidden_tilde, W_target)
logits = tf.concat([logits, logit], axis = 0)
# Drop the zero seed row added at the top.
logits = logits[1:]
The part inside the loop is what I am unsure of. Does tensorflow remember the computational graph when I overwrite the variable "initial_input" and "last_encoder_state"?
I think your model will be much simpler if you use tf.contrib.seq2seq.AttentionWrapper with one of its attention implementations: BahdanauAttention or LuongAttention.
This way it'll be possible to wire the attention vector on a cell level, so that cell output is already after attention applied. Example from the seq2seq tutorial:
cell = LSTMCell(512)
attention_mechanism = tf.contrib.seq2seq.LuongAttention(512, encoder_outputs)
# AttentionWrapper's keyword for the size of the attention output layer is
# `attention_layer_size`; `attention_size` is not a parameter of AttentionWrapper.
attn_cell = tf.contrib.seq2seq.AttentionWrapper(cell, attention_mechanism, attention_layer_size=256)
Note that this way you won't need a loop of window_size, because tf.nn.static_rnn or tf.nn.dynamic_rnn will instantiate the cells wrapped with attention.
Regarding your question: you should distinguish python variables and tensorflow graph nodes: you can assign last_encoder_state to a different tensor, the original graph node won't change because of this. This is flexible, but can be also misleading in the result network - you might think that you connect an LSTM to one tensor, but it's actually the other. In general, you shouldn't do that.

RNN Slow-down phenomenon of Tensorflow

I found a peculiar property of the LSTM cell in TensorFlow (probably not limited to LSTM, but that is the only cell I examined) which, as far as I know, has not been reported.
I am not sure whether it really is unreported, so I am posting it on SO. Below is toy code that reproduces the problem:
import tensorflow as tf
import numpy as np
import time
def network(input_list):
    """Build a single 256-unit BasicLSTMCell unrolled with ``tf.nn.dynamic_rnn``.

    Args:
        input_list: sequence ``(input, init_hidden_c, init_hidden_m)`` holding
            the input tensor and the two parts of the initial LSTM state.

    Returns:
        ``(states, hidden_cm, net)``: the per-step outputs, the final state,
        and a list of the trainable variables that exist after construction.
    """
    input, init_hidden_c, init_hidden_m = input_list
    cell = tf.nn.rnn_cell.BasicLSTMCell(256, state_is_tuple=True)
    init_hidden = tf.nn.rnn_cell.LSTMStateTuple(init_hidden_c, init_hidden_m)
    states, hidden_cm = tf.nn.dynamic_rnn(cell, input, dtype=tf.float32, initial_state=init_hidden)
    # Snapshot copy of the trainable-variable collection.
    net = list(tf.trainable_variables())
    return states, hidden_cm, net
def action(x, h_c, h_m):
    """Run one step of the network and time the ``sess.run`` call.

    This is the variant the question is about: the last-step slice
    ``rnn_states[:,-1:,:]`` is written inside the fetch list, so the slicing
    expression is evaluated on the graph tensor each time this function is
    called.

    Args:
        x: value fed to ``rnn_input``.
        h_c: value fed to ``rnn_init_hidden_c``.
        h_m: value fed to ``rnn_init_hidden_m``.

    Returns:
        ``(outputs, output_h, dt)``: last-step output, final state, and the
        elapsed wall-clock time in seconds.
    """
    t0 = time.time()
    outputs, output_h = sess.run([rnn_states[:, -1:, :], rnn_hidden_cm], feed_dict={
        rnn_input: x,
        rnn_init_hidden_c: h_c,
        rnn_init_hidden_m: h_m
    })
    dt = time.time() - t0
    return outputs, output_h, dt
# Placeholders: input with a 512-wide last axis and the two 256-wide state parts.
rnn_input = tf.placeholder("float", [None, None, 512])
rnn_init_hidden_c = tf.placeholder("float", [None, 256])
rnn_init_hidden_m = tf.placeholder("float", [None, 256])
rnn_input_list = [rnn_input, rnn_init_hidden_c, rnn_init_hidden_m]
rnn_states, rnn_hidden_cm, rnn_net = network(rnn_input_list)

# Initial feed values: one random input step and an all-zero state.
feed_input = np.random.uniform(low=-1., high=1., size=(1, 1, 512))
feed_init_hidden_c = np.zeros(shape=(1, 256))
feed_init_hidden_m = np.zeros(shape=(1, 256))

sess = tf.Session()
sess.run(tf.global_variables_initializer())

for i in range(10000):
    _, output_hidden_cm, deltat = action(feed_input, feed_init_hidden_c, feed_init_hidden_m)
    if i % 10 == 0:
        # Call form of print: valid on Python 3 and, with a single argument,
        # prints the same text on Python 2.
        print('Running time: ' + str(deltat))
    # Feed the fetched final state back in as the next initial state.
    (feed_init_hidden_c, feed_init_hidden_m) = output_hidden_cm
    feed_input = np.random.uniform(low=-1., high=1., size=(1, 1, 512))
[Not important] This code generates an output from the 'network()' function, which contains an LSTM. The input's temporal dimension is 1, so the output's is also 1, and the state is fetched and fed back in as the initial state at every step.
[Important] Look at the 'sess.run()' part. For reasons specific to my real code, I happened to put [:,-1:,:] on 'rnn_states'. As a result, the time spent in each 'sess.run()' call increases. After some investigation of my own, I found that this slowdown stems from that [:,-1:,:]. I just wanted to get the output at the last time step. If you do 'outputs, output_h = sess.run([rnn_states, rnn_hidden_cm], feed_dict{~' without [:,-1:,:] and take 'last_output = outputs[:,-1:,:]' after the 'sess.run()', then the slowdown does not occur.
I do not know why this exponential increase in running time happens when [:,-1:,:] is used. Is this an undocumented property of TensorFlow that causes the slowdown (maybe it adds more nodes to the graph on its own)?
Thank you; I hope this post keeps other users from making the same mistake.
I encountered the same problem, with TensorFlow slowing down for each iteration I ran it, and found this question while trying to debug it. Here's a short description of my situation and how I solved it for future reference. Hopefully it can point someone in the right direction and save them some time.
In my case the problem was mainly that I didn't make use of feed_dict to supply the network state when executing sess.run(). Instead I redeclared outputs, final_state and prediction every iteration. The answer at https://github.com/tensorflow/tensorflow/issues/1439#issuecomment-194405649 made me realize how stupid that was... I was constantly creating new graph nodes in every iteration, making it all slower and slower. The problematic code looked something like this:
# Problematic version, kept on purpose as the "before" example: the RNN and
# the prediction op are built again on every loop iteration.
# defining the network
lstm_layer = rnn.BasicLSTMCell(num_units, forget_bias=1)
outputs, final_state = rnn.static_rnn(lstm_layer, input, initial_state=rnn_state, dtype='float32')
prediction = tf.nn.softmax(tf.matmul(outputs[-1], out_weights)+out_bias)
for input_data in data_seq:
    # redeclaring, stupid stupid...
    # These two statements add new nodes to the graph in every iteration.
    outputs, final_state = rnn.static_rnn(lstm_layer, input, initial_state=rnn_state, dtype='float32')
    prediction = tf.nn.softmax(tf.matmul(outputs[-1], out_weights)+out_bias)
    p, rnn_state = sess.run((prediction, final_state), feed_dict={x: input_data})
The solution was of course to only declare the nodes once in the beginning, and supply the new data with feed_dict. The code went from being half slow (> 15 ms in the beginning) and becoming slower for every iteration, to execute every iteration in around 1 ms. My new code looks something like this:
# Fixed version: every graph node is declared once, and only data (input and
# the previous state) moves through feed_dict inside the loop.
out_weights = tf.Variable(tf.random_normal([num_units, n_classes]), name="out_weights")
out_bias = tf.Variable(tf.random_normal([n_classes]), name="out_bias")
# placeholder for the network state
state_placeholder = tf.placeholder(tf.float32, [2, 1, num_units])
rnn_state = tf.nn.rnn_cell.LSTMStateTuple(state_placeholder[0], state_placeholder[1])
x = tf.placeholder('float', [None, 1, n_input])
input = tf.unstack(x, 1, 1)
# defining the network
lstm_layer = rnn.BasicLSTMCell(num_units, forget_bias=1)
outputs, final_state = rnn.static_rnn(lstm_layer, input, initial_state=rnn_state, dtype='float32')
prediction = tf.nn.softmax(tf.matmul(outputs[-1], out_weights)+out_bias)
# actual network state, which we input with feed_dict
_rnn_state = tf.nn.rnn_cell.LSTMStateTuple(np.zeros((1, num_units), dtype='float32'), np.zeros((1, num_units), dtype='float32'))
# enumerate replaces the hand-maintained counter; the printed values are the same.
for it, input_data in enumerate(data_seq):
    encl_input = [[input_data]]
    # The fetched final state becomes the state fed on the next iteration.
    p, _rnn_state = sess.run((prediction, final_state), feed_dict={x: encl_input, rnn_state: _rnn_state})
    print("{} - {}".format(it, p))
Moving the declaration out from the for loop also got rid of the problem which the OP sdr2002 had, doing a slice outputs[-1] in sess.run() inside the for loop.
As mentioned above, the better approach in this case is to fetch the unsliced output in 'sess.run()' and slice it afterwards:
def action(x, h_c, h_m):
    """Run one step of the network, slicing the last time step after the run.

    The full ``rnn_states`` tensor is fetched, and the last-step slice is taken
    on the returned array, so no slicing expression is applied to a graph
    tensor inside this function.

    Args:
        x: value fed to ``rnn_input``.
        h_c: value fed to ``rnn_init_hidden_c``.
        h_m: value fed to ``rnn_init_hidden_m``.

    Returns:
        ``(outputs, output_h, dt)``: last-step output, final state, and the
        elapsed wall-clock time in seconds (the slice is included in the
        timed span).
    """
    t0 = time.time()
    outputs, output_h = sess.run([rnn_states, rnn_hidden_cm], feed_dict={
        rnn_input: x,
        rnn_init_hidden_c: h_c,
        rnn_init_hidden_m: h_m
    })
    # Slice on the fetched result rather than on the graph tensor.
    outputs = outputs[:, -1:, :]
    dt = time.time() - t0
    return outputs, output_h, dt