TensorFlow: Variables in bijectors cannot be reused - tensorflow

Describe the problem
I am trying to reuse the weights and biases of the neural network inside the MaskedAutoregressiveFlow bijector by creating it within a tf.variable_scope with reuse=tf.AUTO_REUSE. However, I found that the weights and biases are not actually reused.
Reproduce
import tensorflow as tf
from tensorflow.contrib.distributions.python.ops import bijectors as tfb
def get_bijector(name='my_bijector', reuse=None):
    """Build a MaskedAutoregressiveFlow bijector inside a variable scope.

    Args:
      name: Name of the `tf.variable_scope` the bijector is created in.
      reuse: Passed through as the `reuse` argument of `tf.variable_scope`.
    Returns:
      A `tfb.MaskedAutoregressiveFlow` whose shift-and-log-scale function is
      the default masked autoregressive template with hidden layers `[128]`.
    """
    with tf.variable_scope(name, reuse=reuse):
        template = tfb.masked_autoregressive_default_template([128])
        return tfb.MaskedAutoregressiveFlow(template)
x = tf.placeholder(shape=[None, 64], dtype='float32', name='x')
bijector_0 = get_bijector(reuse=tf.AUTO_REUSE)
y_0 = bijector_0.forward(x)
bijector_1 = get_bijector(reuse=tf.AUTO_REUSE)
y_1 = bijector_1.forward(x)
# We were expecting that the `y_0` and `y_1` share the same dependent variables,
# since we used `tf.AUTO_REUSE` within the `tf.variable_scope`. However, the following
# will return a `False`.
print(get_dependent_variables(y_0) == get_dependent_variables(y_1))
Here we have used the following helper function, which collects all the variables that a tensor depends on:
import collections
def get_dependent_variables(tensor):
    """Collect the trainable variables that `tensor` depends on.

    Walks the graph breadth-first, starting at `tensor.op` and following op
    inputs. An op that belongs to a trainable variable is recorded and is not
    expanded any further.

    Forked from: https://stackoverflow.com/a/42861919/1218716

    Args:
      tensor: Tensor.
    Returns:
      List of variables, in the order in which they are reached.
    """
    root_op = tensor.op
    var_by_op = {variable.op: variable for variable in tf.trainable_variables()}

    found = []
    seen = {root_op}
    pending = collections.deque([root_op])

    while pending:
        current = pending.popleft()
        if current in var_by_op:
            found.append(var_by_op[current])
            continue
        # `current` is not a variable, so queue the ops producing its inputs.
        for input_tensor in current.inputs:
            producer = input_tensor.op
            if producer not in seen:
                seen.add(producer)
                pending.append(producer)
    return found

Related

Can't apply gradients on tf.Variable

I am trying to learn a similarity matrix (M) between two image embeddings. A single training instance is a pair of images (anchor, positive), so ideally the model will return a distance of 0 for the embeddings of similar images.
The problem is that when I declare the distance matrix (M) as a tf.Variable, I get an error
on this line
self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
TypeError: 'Variable' object is not iterable.
I think I should use a tensorflow datatype for M, that is iterable
Please tell me how I can fix this issue
import tensorflow as tf
from tensorflow import keras
# metric learning model
class MetricLearningModel:
    """Learns a matrix M that scores the similarity of two embeddings."""

    def __init__(self, lr):
        """Create the optimizer, the loss and the trainable matrix M.

        Args:
          lr: Learning rate handed to the Adam optimizer.
        """
        self.optimizer = keras.optimizers.Adam(lr=lr)
        self.lr = lr
        self.loss_object = keras.losses.MeanSquaredError()
        # M is a single 2048x2048 variable initialised to ones.
        self.trainable_variables = tf.Variable(
            (tf.ones((2048, 2048), dtype=tf.float32)),
            trainable=True
        )

    def similarity_function(self, anchor_embeddings, positive_embeddings):
        """Return `positive_embeddings @ M @ transpose(anchor_embeddings)`."""
        M = self.trainable_variables
        X_i = anchor_embeddings
        X_j = positive_embeddings
        similarity_value = tf.matmul(X_j, M, name='Tensor')
        similarity_value = tf.matmul(similarity_value, tf.transpose(X_i), name='Tensor')
        # distance(x, y) = sqrt((x - y) @ M @ (x - y).T)
        return similarity_value

    def train_step(self, anchor, positive):
        """Run one optimisation step on an (anchor, positive) pair."""
        anchor_embeddings, positive_embeddings = anchor, positive
        # Calculate gradients
        with tf.GradientTape() as tape:
            # Calculate similarity between anchors and positives.
            similarities = self.similarity_function(anchor_embeddings, positive_embeddings)
            y_pred = similarities
            y_true = tf.zeros(1)
            print(y_true, y_pred)
            # Pass targets and predictions under their matching keywords
            # (they were swapped before).
            loss_value = self.loss_object(
                y_true=y_true,
                y_pred=y_pred,
            )
        gradients = tape.gradient(loss_value, self.trainable_variables)
        # Apply gradients via optimizer
        # NOTE: `self.trainable_variables` is a single tf.Variable, so zip()
        # over it raises the TypeError this question is about.
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
metric_model = MetricLearningModel(lr=1e-3)
anchor, positive = tf.ones((1, 2048), dtype=tf.float32), tf.ones((1, 2048), dtype=tf.float32)
metric_model.train_step(anchor, positive)
The python zip function expects iterable objects, like for example a list or a tuple.
In your calls to tape.gradient, or optimizer.apply_gradients, you can put your Variable in a list to solve the issue :
with tf.GradientTape() as tape:
    ...  # forward pass and loss computation, as in the question
gradients = tape.gradient(loss_value, [self.trainable_variables])
# Apply gradients via optimizer
self.optimizer.apply_gradients(zip(gradients, [self.trainable_variables]))
tape.gradient preserves the structure of the sources argument with respect to which the gradients are computed, so if you pass it a list, you get a list back. This is stated in the documentation:
Returns
a list or nested structure of Tensors (or IndexedSlices, or None), one for each element in sources. Returned structure is the same as the structure of sources.

TPU training in colab, custom model, data from my own GCP account: Cell just seems to hang, no progress or error message

I am trying to train on a colab TPU using data from my GCP account.
When I run the cell that starts the training, the cell just seems to hang, with no progress. I put a very low number of steps, so that the training should complete pretty quickly, about a minute on GPU, but it never finishes on TPU.
I am using a custom model, and I am reading files stored on GCP using the solution given in the Stack Overflow answer "How to connect to private storage bucket using the Google Colab TPU".
The model trains/runs just fine on GPU/CPU.
The full code is in this colab notebook here
https://colab.research.google.com/drive/13HgRJru0glOzn7m0b7tmVCO_VrRpa1XS?usp=sharing
And here's a google drive link to the sample data file
https://drive.google.com/file/d/10EFyxau97jLfeGaKugMevIyX-bobsFe5/view?usp=sharing
And below is the code from the colab notebook
!pip install transformers --q
%tensorflow_version 2.x
!gcloud auth login
'''NEED TO RUN THIS CELL TWICE TO AVOID ERROR'''
from google.colab import auth
auth.authenticate_user()
project_id = 'machinelearning-264918'
!gcloud config set project {project_id}
!pip install tfa-nightly
import tensorflow_addons as tfa
from transformers import TFBertModel, AutoModel
import tensorflow as tf
from tensorflow.keras.layers import (Dense,
Dropout)
import os
import tensorflow_addons as tfa
logger = tf.get_logger()
logger.info(tf.__version__)
autotune = tf.data.experimental.AUTOTUNE
try:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.experimental.TPUStrategy(tpu)
logger.info('Running with TPUStrategy on TPU {} with {} cores '
.format(tpu.cluster_spec().as_dict()['worker'],
strategy.num_replicas_in_sync))
batch_size = 3 * strategy.num_replicas_in_sync
except Exception:
# raise ValueError
strategy = tf.distribute.OneDeviceStrategy(device='/gpu:0')
logger.warning('Failed initializing TPU! Running on GPU')
batch_size = 3
from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer as lso
from tensorflow.python.distribute import parameter_server_strategy
def _minimize(strategy, tape, optimizer, loss, trainable_variables):
    """Compute the gradients of `loss` and apply them with `optimizer`.

    Args:
      strategy: Distribution strategy; only `strategy.extended` is inspected.
      tape: Gradient tape that recorded the forward pass.
      optimizer: Optimizer used to apply the gradients. If it is a
        `LossScaleOptimizer`, the loss is scaled and the gradients unscaled.
      loss: Loss tensor to differentiate.
      trainable_variables: Variables to differentiate with respect to and to
        update. Nothing is applied when this is empty.
    """
    uses_loss_scaling = isinstance(optimizer, lso.LossScaleOptimizer)

    with tape:
        if uses_loss_scaling:
            loss = optimizer.get_scaled_loss(loss)

    grads = tape.gradient(loss, trainable_variables)

    # Whether to aggregate gradients outside of optimizer. This requires
    # support of the optimizer and doesn't work with ParameterServerStrategy
    # and CentralStorageStrategy.
    aggregate_outside = (
        optimizer._HAS_AGGREGATE_GRAD and  # pylint: disable=protected-access
        not isinstance(strategy.extended,
                       parameter_server_strategy.ParameterServerStrategyExtended))

    if aggregate_outside:
        # We aggregate gradients before unscaling them, in case a subclass of
        # LossScaleOptimizer all-reduces in fp16. All-reducing in fp16 can only
        # be done on scaled gradients, not unscaled gradients, for numeric
        # stability.
        grads = optimizer._aggregate_gradients(  # pylint: disable=protected-access
            zip(grads, trainable_variables))

    if uses_loss_scaling:
        grads = optimizer.get_unscaled_gradients(grads)
    grads = optimizer._clip_gradients(grads)  # pylint: disable=protected-access

    if not trainable_variables:
        return

    grads_and_vars = zip(grads, trainable_variables)
    if aggregate_outside:
        optimizer.apply_gradients(
            grads_and_vars, experimental_aggregate_gradients=False)
    else:
        optimizer.apply_gradients(grads_and_vars)
class CustomModel(tf.keras.Model):
    """Keras model whose `train_step` flattens grouped inputs and labels."""

    def train_step(self, data):
        """Run one training step on a batch of grouped samples.

        Args:
          data: An `(x, y)` pair. Its structure depends on the model and on
            what is passed to `fit()`.
        Returns:
          Dict mapping each metric name to its current value.
        """
        x, y = data

        # Flatten the labels into rows of two. `-1` lets reshape infer the row
        # count; `tf.size(y) / 2` is true division and yields a float, which is
        # not a valid shape dimension.
        batch_label = tf.reshape(y, (-1, 2))

        # Stack the inputs, densify them, then merge the first two axes.
        rs = tf.ragged.stack(x, axis=0)
        reg = rs.to_tensor()
        reg_shape = tf.shape(reg)
        batch_input = tf.reshape(reg, (reg_shape[0] * reg_shape[1], reg_shape[2]))

        with tf.GradientTape() as tape:
            y_pred = self(batch_input, training=True)  # Forward pass
            # Compute the loss value
            # (the loss function is configured in `compile()`)
            loss = self.compiled_loss(batch_label, y_pred, regularization_losses=self.losses)

        # Compute gradients and update weights
        _minimize(self.distribute_strategy, tape, self.optimizer, loss,
                  self.trainable_variables)

        # Update metrics (includes the metric that tracks the loss). Use the
        # same flattened labels the loss was computed against.
        self.compiled_metrics.update_state(batch_label, y_pred)
        # Return a dict mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}
def get_model(drop_out):
    """Build the classifier: a BERT encoder followed by a feed-forward head.

    Args:
      drop_out: Dropout rate applied after the layer normalisation.
    Returns:
      A `CustomModel` mapping int32 token ids to two scores per sequence.
    """
    encoder = TFBertModel.from_pretrained('bert-base-uncased', from_pt=True)
    token_ids = tf.keras.Input(shape=(None,), dtype=tf.int32, name='inputN')

    # Open question from the original author: should posFinal and negFinal be
    # concatenated, so there's only one call to sciBert?
    encoder_outputs = encoder(token_ids, training=True)

    # Concatenate the position-0 vector of the first encoder output with the
    # mean over positions 1..-2 of that same output.
    first_position = encoder_outputs[0][:, 0]
    inner_mean = tf.reduce_mean(encoder_outputs[0][:, 1:-1], axis=1)
    pooled = tf.concat([first_position, inner_mean], axis=1)

    dense_1 = tf.keras.layers.Dense(768, activation='swish', name='postff')
    layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNormO")
    dense_2 = tf.keras.layers.Dense(768, activation='swish', name='2postff')
    score_layer = tf.keras.layers.Dense(2, name='classifierff')

    hidden = layer_norm(dense_1(pooled))
    hidden = Dropout(drop_out)(hidden)
    scores = score_layer(dense_2(hidden))

    return CustomModel(inputs=token_ids, outputs=scores)
#tf.function
def _parse_example(example_proto):
    """Parse one serialized example into a (query, rest-of-passage) pair.

    A sentence is picked at random as the query; the token ids before and
    after it are concatenated to form the rest of the passage.

    Args:
      example_proto: Serialized example with the variable-length int64
        features 'sciBert_SentenceIndex' and 'SciBert_IDs'.
    Returns:
      Tuple `(queryPiece, restPassagePiece)` of int32 tensors, clipped to at
      most 256 and 510 elements respectively.
    """
    features = {
        'sciBert_SentenceIndex': tf.io.VarLenFeature(dtype=tf.int64),
        'SciBert_IDs': tf.io.VarLenFeature(dtype=tf.int64),
    }
    parsed_example_dict = tf.io.parse_single_example(example_proto, features)

    sentencePositions = tf.sparse.to_dense(parsed_example_dict['sciBert_SentenceIndex'])
    sentencePositions = tf.cast(sentencePositions, dtype=tf.int32)
    # The ids are cast to int32 once, on return; the former extra cast of the
    # sparse tensor was never used.
    bertIds = tf.sparse.to_dense(parsed_example_dict['SciBert_IDs'])

    length = tf.shape(
        sentencePositions, out_type=tf.dtypes.int32, name='shape'
    )
    lengthMinusOne = tf.math.subtract(
        length, 1, name='SubtractOne'
    )

    # Create a random sentence index up to the 2nd to last index;
    # the last index is just the last position of the non-padded bertID.
    # NOTE(review): with a single sentence index maxval equals minval here;
    # confirm how tf.random.uniform behaves for an empty integer range.
    startRandSentIndex = tf.random.uniform(
        shape=[1], minval=0, maxval=lengthMinusOne[0], dtype=tf.dtypes.int32)
    # Get the end point for that sentence
    endRandSentIndex = tf.math.add(startRandSentIndex, 1)
    # Last position of the non-padded bertID (same value as lengthMinusOne)
    lastPosition = lengthMinusOne

    # Extract BertID positions for sentence start/end and bertID end
    startSentencePosit = tf.gather_nd(sentencePositions, [startRandSentIndex], batch_dims=0)
    endSentencePosit = tf.gather_nd(sentencePositions, [endRandSentIndex], batch_dims=0)
    lastPassagePosit = tf.gather_nd(sentencePositions, [lastPosition], batch_dims=0)

    # Get slices of BertIDs for the query, and the rest
    firstPiece = tf.slice(bertIds, [0], [startSentencePosit[0]])
    queryPiece = tf.slice(bertIds, [startSentencePosit[0]], [endSentencePosit[0] - startSentencePosit[0]])
    lastPiece = tf.slice(bertIds, [endSentencePosit[0]], [lastPassagePosit[0] - endSentencePosit[0]])

    # Concat rest of passage
    restPassagePiece = tf.concat([firstPiece, lastPiece], axis=0)

    # Clip
    queryPiece = queryPiece[0:256]
    restPassagePiece = restPassagePiece[0:510]

    # Special tokens are added by the caller.
    return tf.cast(queryPiece, dtype=tf.int32), tf.cast(restPassagePiece, dtype=tf.int32)
#tf.function
def clip_seq_to_len(seq, num_tokens=512):
    """Truncate `seq` so that one more token still fits within `num_tokens`.

    Args:
      seq: Sequence (or 1-D tensor) to truncate along its first axis.
      num_tokens: Total token budget; the result keeps at most
        `num_tokens - 1` elements. Defaults to 512, i.e. 511 elements.
    Returns:
      The first `num_tokens - 1` elements of `seq`; a shorter `seq` is
      returned in full.
    """
    # Slicing already leaves shorter sequences untouched, so no length check
    # is needed. Clamp at 0 so a tiny budget cannot become a negative index.
    return seq[:max(num_tokens - 1, 0)]
#tf.function
def make_samples(query_a, passage_a, query_b, passage_b):
    """Build two positive and two negative samples from two examples.

    Each sample is `[102] + query + [103] + passage`, clipped by
    `clip_seq_to_len`, with a final `[103]` appended. Positives pair a query
    with its own passage; negatives pair it with the other passage.

    Returns:
      Tuple `(positive_a, positive_b, negative_a, negative_b)`.
    """
    cls_id = tf.constant([102])
    sep_id = tf.constant([103])

    def build(query, passage):
        """Join query and passage with special tokens, clip, then close."""
        joined = tf.concat([cls_id, query, sep_id, passage], axis=-1)
        return tf.concat([clip_seq_to_len(joined), sep_id], axis=-1)

    return (
        build(query_a, passage_a),
        build(query_b, passage_b),
        build(query_a, passage_b),
        build(query_b, passage_a),
    )
#tf.function
def get_samples(example_a, example_b):
    """Parse two serialized examples and turn them into training samples."""
    query_a, passage_a = _parse_example(example_a)
    query_b, passage_b = _parse_example(example_b)
    return make_samples(query_a, passage_a, query_b, passage_b)
config = {
'drop_out':0.1
}
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
with strategy.scope():
model = get_model(**config)
model.compile(loss=loss_fn,
optimizer=tfa.optimizers.AdamW(weight_decay=1e-5, learning_rate=3e-4, epsilon=1e-07), run_eagerly=False)
config_name = 'model_b'
base_dir = 'gs://bdora-semanticscholar'
model_dir = os.path.join(base_dir, config_name)
# tensorboard_dir = os.path.join(model_dir, 'logs_' + str(time()))
tfrecords_pattern_train = os.path.join(base_dir, 'VersionB_00022*')
tfrecords_pattern_train2 = os.path.join(base_dir, 'VersionB_00022*')
#tf.function
def gen():
    """Yield the same label tuple forever: two `[1, 0]`, then two `[0, 1]`."""
    while True:
        first, second = [1, 0], [1, 0]
        third, fourth = [0, 1], [0, 1]
        yield (first, second, third, fourth)
batchNumber = batch_size
run_eagerly = False
with strategy.scope():
filenames = tf.io.gfile.glob(tfrecords_pattern_train)
train_dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=autotune)
filenames = tf.io.gfile.glob(tfrecords_pattern_train)
neg_dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=autotune)
train_dataset = train_dataset.shuffle(150_000, seed=1000, reshuffle_each_iteration=True)
neg_dataset = neg_dataset.shuffle(150_000, seed=2000, reshuffle_each_iteration=True)
train_datasetC = tf.data.Dataset.zip((train_dataset, neg_dataset))
train_datasetC = train_datasetC.map(get_samples, num_parallel_calls=autotune)
train_datasetC = train_datasetC.shuffle(1024, seed=1000, reshuffle_each_iteration=True)
train_datasetC = train_datasetC.padded_batch(batchNumber, padding_values=(0, 0, 0, 0))
datasetLabels = tf.data.Dataset.from_generator(
gen,
(tf.int32, tf.int32, tf.int32, tf.int32),
(tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([None])))
datasetLabels = datasetLabels.batch(batchNumber)
train_datasetFinal = tf.data.Dataset.zip((train_datasetC, datasetLabels))
train_datasetFinal = train_datasetFinal.prefetch(autotune)
train_datasetFinal = train_datasetFinal.repeat()
train_datasetFinal = train_datasetFinal.apply(tf.data.experimental.ignore_errors())
model.fit(train_datasetFinal, steps_per_epoch=100, epochs=3)
And this is the only output I get
Epoch 1/3
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model/bert/pooler/dense/kernel:0', 'tf_bert_model/bert/pooler/dense/bias:0'] when minimizing the loss.
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model/bert/pooler/dense/kernel:0', 'tf_bert_model/bert/pooler/dense/bias:0'] when minimizing the loss.
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model/bert/pooler/dense/kernel:0', 'tf_bert_model/bert/pooler/dense/bias:0'] when minimizing the loss.
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model/bert/pooler/dense/kernel:0', 'tf_bert_model/bert/pooler/dense/bias:0'] when minimizing the loss.
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model/bert/pooler/dense/kernel:0', 'tf_bert_model/bert/pooler/dense/bias:0'] when minimizing the loss.
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model/bert/pooler/dense/kernel:0', 'tf_bert_model/bert/pooler/dense/bias:0'] when minimizing the loss.
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model/bert/pooler/dense/kernel:0', 'tf_bert_model/bert/pooler/dense/bias:0'] when minimizing the loss.
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model/bert/pooler/dense/kernel:0', 'tf_bert_model/bert/pooler/dense/bias:0'] when minimizing the loss.
I found this GitHub issue discussion [1] that you can refer to.
It's not an error, it just means it's not updating those variables. Those variables (pooler) are not used when doing sequence classification.
[1] https://github.com/tensorflow/tensorflow/issues/37501

Tensorflow: FailedPreconditionError: Error while reading resource variable from Container: localhost. When running sess.run() on custom loss function

I have a code running Keras with TensorFlow 1. The code modifies the loss function in order to do deep reinforcement learning:
import os
import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
env = gym.make("CartPole-v0").env
env.reset()
n_actions = env.action_space.n
state_dim = env.observation_space.shape
from tensorflow import keras
import random
from tensorflow.keras import layers as L
import tensorflow as tf
from tensorflow.python.keras.backend import set_session
sess = tf.compat.v1.Session()
graph = tf.compat.v1.get_default_graph()
init = tf.global_variables_initializer()
sess.run(init)
network = keras.models.Sequential()
network.add(L.InputLayer(state_dim))
# let's create a network for approximate q-learning following guidelines above
network.add(L.Dense(5, activation='elu'))
network.add(L.Dense(5, activation='relu'))
network.add(L.Dense(n_actions, activation='linear'))
s = env.reset()
# Create placeholders for the <s, a, r, s'> tuple and a special indicator for game end (is_done = True)
states_ph = keras.backend.placeholder(dtype='float32', shape=(None,) + state_dim)
actions_ph = keras.backend.placeholder(dtype='int32', shape=[None])
rewards_ph = keras.backend.placeholder(dtype='float32', shape=[None])
next_states_ph = keras.backend.placeholder(dtype='float32', shape=(None,) + state_dim)
is_done_ph = keras.backend.placeholder(dtype='bool', shape=[None])
#get q-values for all actions in current states
predicted_qvalues = network(states_ph)
#select q-values for chosen actions
predicted_qvalues_for_actions = tf.reduce_sum(predicted_qvalues * tf.one_hot(actions_ph, n_actions),
axis=1)
gamma = 0.99
# compute q-values for all actions in next states
predicted_next_qvalues = network(next_states_ph)
# compute V*(next_states) using predicted next q-values
next_state_values = tf.math.reduce_max(predicted_next_qvalues, axis=1)
# compute "target q-values" for loss - it's what's inside square parentheses in the above formula.
target_qvalues_for_actions = rewards_ph + tf.constant(gamma) * next_state_values
# at the last state we shall use simplified formula: Q(s,a) = r(s,a) since s' doesn't exist
target_qvalues_for_actions = tf.where(is_done_ph, rewards_ph, target_qvalues_for_actions)
#mean squared error loss to minimize
loss = (predicted_qvalues_for_actions - tf.stop_gradient(target_qvalues_for_actions)) ** 2
loss = tf.reduce_mean(loss)
# training function that resembles agent.update(state, action, reward, next_state) from tabular agent
train_step = tf.compat.v1.train.AdamOptimizer(1e-4).minimize(loss)
a = 0
next_s, r, done, _ = env.step(a)
sess.run(train_step, {
states_ph: [s], actions_ph: [a], rewards_ph: [r],
next_states_ph: [next_s], is_done_ph: [done]
})
When I run a sess.run() training step, I get the following error:
tensorflow.python.framework.errors_impl.FailedPreconditionError: Error while reading resource variable beta1_power from Container: localhost. This could mean that the variable was uninitialized. Not found: Container localhost does not exist. (Could not find resource: localhost/beta1_power)
Any ideas on what might be the problem?
The initialization operation should be fetched and run (only one time) after the variables (i.e. model) have been created or the computation graph has been defined. Therefore, they should be put right before running the training step:
# Define and create the computation graph/model
# ...
# Initialize variables in the graph/model
init = tf.global_variables_initializer()
sess.run(init)
# Start training
sess.run(train_step, ...)

Create keras callback to save model predictions and targets for each batch during training

I am building a simple Sequential model in Keras (tensorflow backend). During training I want to inspect the individual training batches and model predictions. Therefore, I am trying to create a custom Callback that saves the model predictions and targets for each training batch. However, the model is not using the current batch for prediction, but the entire training data.
How can I hand over only the current training batch to the Callback?
And how can I access the batches and targets that the Callback saves in self.predhis and self.targets?
My current version looks as follows:
callback_list = [prediction_history((self.x_train, self.y_train))]
self.model.fit(self.x_train, self.y_train, batch_size=self.batch_size, epochs=self.n_epochs, validation_data=(self.x_val, self.y_val), callbacks=callback_list)
class prediction_history(keras.callbacks.Callback):
    """Callback that records predictions and targets after every batch."""

    def __init__(self, train_data):
        """Store the data to predict on.

        Args:
          train_data: Tuple `(x_train, y_train)`.
        """
        super().__init__()
        self.train_data = train_data
        self.predhis = []  # one prediction array per finished batch
        self.targets = []  # one target array per finished batch

    def on_batch_end(self, epoch, logs=None):
        """Predict on the stored data and append the results.

        Note that this predicts on the whole of `self.train_data`, not on the
        batch that just finished.

        Args:
          epoch: First positional argument supplied by Keras for this hook;
            unused. The name is kept for backward compatibility.
          logs: Optional dict of logs; unused. `None` replaces the former
            shared mutable default `{}`.
        """
        x_train, y_train = self.train_data
        self.targets.append(y_train)
        prediction = self.model.predict(x_train)
        self.predhis.append(prediction)
        tf.logging.info("Prediction shape: {}".format(prediction.shape))
        tf.logging.info("Targets shape: {}".format(y_train.shape))
NOTE: this answer is outdated and only works with TF1. Check @bers's answer for a solution tested on TF2.
After model compilation, the placeholder tensor for y_true is in model.targets and y_pred is in model.outputs.
To save the values of these placeholders at each batch, you can:
First copy the values of these tensors into variables.
Evaluate these variables in on_batch_end, and store the resulting arrays.
Now step 1 is a bit involved because you'll have to add a tf.assign op to the training function model.train_function. Using the current Keras API, this can be done by providing a fetches argument to K.function() when the training function is constructed.
In model._make_train_function(), there's a line:
self.train_function = K.function(inputs,
[self.total_loss] + self.metrics_tensors,
updates=updates,
name='train_function',
**self._function_kwargs)
The fetches argument containing the tf.assign ops can be provided via model._function_kwargs (only works after Keras 2.1.0).
As an example:
from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import Callback
from keras import backend as K
import tensorflow as tf
import numpy as np
class CollectOutputAndTarget(Callback):
    """Collects the y_true and y_pred arrays of every training batch."""

    def __init__(self):
        """Create the holder variables and the result lists."""
        super().__init__()
        # The shape of these two variables changes with the batch shape; to
        # handle the "last batch", `validate_shape=False` is required.
        self.var_y_true = tf.Variable(0., validate_shape=False)
        self.var_y_pred = tf.Variable(0., validate_shape=False)
        self.targets = []  # collected y_true batches
        self.outputs = []  # collected y_pred batches

    def on_batch_end(self, batch, logs=None):
        """Evaluate both holder variables and store the resulting arrays."""
        y_true_batch = K.eval(self.var_y_true)
        y_pred_batch = K.eval(self.var_y_pred)
        self.targets.append(y_true_batch)
        self.outputs.append(y_pred_batch)
# build a simple model
# have to compile first for model.targets and model.outputs to be prepared
model = Sequential([Dense(5, input_shape=(10,))])
model.compile(loss='mse', optimizer='adam')
# initialize the variables and the `tf.assign` ops
cbk = CollectOutputAndTarget()
fetches = [tf.assign(cbk.var_y_true, model.targets[0], validate_shape=False),
tf.assign(cbk.var_y_pred, model.outputs[0], validate_shape=False)]
model._function_kwargs = {'fetches': fetches} # use `model._function_kwargs` if using `Model` instead of `Sequential`
# fit the model and check results
X = np.random.rand(10, 10)
Y = np.random.rand(10, 5)
model.fit(X, Y, batch_size=8, callbacks=[cbk])
Unless the number of samples can be divided by the batch size, the final batch will have a different size than other batches. So K.variable() and K.update() can't be used in this case. You'll have to use tf.Variable(..., validate_shape=False) and tf.assign(..., validate_shape=False) instead.
To verify the correctness of the saved arrays, you can add one line in training.py to print out the shuffled index array:
if shuffle == 'batch':
index_array = _batch_shuffle(index_array, batch_size)
elif shuffle:
np.random.shuffle(index_array)
print('Index array:', repr(index_array)) # Add this line
batches = _make_batches(num_train_samples, batch_size)
The shuffled index array should be printed out during fitting:
Epoch 1/1
Index array: array([8, 9, 3, 5, 4, 7, 1, 0, 6, 2])
10/10 [==============================] - 0s 23ms/step - loss: 0.5670
And you can check if cbk.targets is the same as Y[index_array]:
index_array = np.array([8, 9, 3, 5, 4, 7, 1, 0, 6, 2])
print(Y[index_array])
[[ 0.75325592 0.64857277 0.1926653 0.7642865 0.38901153]
[ 0.77567689 0.13573623 0.4902501 0.42897559 0.55825652]
[ 0.33760938 0.68195038 0.12303088 0.83509441 0.20991668]
[ 0.98367778 0.61325065 0.28973401 0.28734073 0.93399794]
[ 0.26097574 0.88219054 0.87951941 0.64887846 0.41996446]
[ 0.97794604 0.91307569 0.93816428 0.2125808 0.94381495]
[ 0.74813435 0.08036688 0.38094272 0.83178364 0.16713736]
[ 0.52609421 0.39218962 0.21022047 0.58569125 0.08012982]
[ 0.61276627 0.20679494 0.24124858 0.01262245 0.0994412 ]
[ 0.6026137 0.25620512 0.7398164 0.52558182 0.09955769]]
print(cbk.targets)
[array([[ 0.7532559 , 0.64857274, 0.19266529, 0.76428652, 0.38901153],
[ 0.77567691, 0.13573623, 0.49025011, 0.42897558, 0.55825651],
[ 0.33760938, 0.68195039, 0.12303089, 0.83509439, 0.20991668],
[ 0.9836778 , 0.61325067, 0.28973401, 0.28734073, 0.93399793],
[ 0.26097575, 0.88219053, 0.8795194 , 0.64887846, 0.41996446],
[ 0.97794604, 0.91307569, 0.93816429, 0.2125808 , 0.94381493],
[ 0.74813437, 0.08036689, 0.38094273, 0.83178365, 0.16713737],
[ 0.5260942 , 0.39218962, 0.21022047, 0.58569127, 0.08012982]], dtype=float32),
array([[ 0.61276627, 0.20679495, 0.24124858, 0.01262245, 0.0994412 ],
[ 0.60261369, 0.25620511, 0.73981643, 0.52558184, 0.09955769]], dtype=float32)]
As you can see, there are two batches in cbk.targets (one "full batch" of size 8 and the final batch of size 2), and the row order is the same as Y[index_array].
Long edit (almost a new answer) for the following reasons:
Yu-Yang's 2017 answer relies on the private _make_train_function and _function_kwargs APIs, which work only in TF1 (and maybe in TF1 compatibility, so-called non-eager mode).
Similarly, Binyan Hu's 2020 answer relies on _make_test_function and does not work in TF2 by default (requiring non-eager mode as well).
My own Jan 2020 answer, which was already subject to several required configuration settings, seems to have stopped working with (or before) TF 2.5, and I was not able to make model.inputs or model.outputs work any longer.
Finally, the earlier version of this answer requires potentially expensive model evaluation to obtain the predictions for each batch. A similar solution to obtain activation histograms even led to OOM issues with repeated training of different models.
So I set out to find a way to obtain all possible quantities (inputs, targets, predictions, activations), batch-wise, without using any private APIs. The aim was to be able to call .numpy() on the intended quantities, so Keras callbacks can run ordinary Python code to ease debugging (I suppose that is what this question is mainly about - for maximum performance, one would probably try to integrate as many computations as possible into TensorFlow's graph operations anyway).
This is the common base model for all solutions:
"""Demonstrate batch data access."""
import tensorflow as tf
from tensorflow import keras
class DataCallback(keras.callbacks.Callback):
"""This class is where all implementations differ."""
def tf_nan(dtype):
    """Return a NaN-initialised variable of `dtype` with unspecified shape.

    The unspecified shape is what allows a later `assign()` of any shape.
    """
    nan_value = float("nan")
    any_shape = tf.TensorShape(None)
    return tf.Variable(nan_value, dtype=dtype, shape=any_shape)
def main():
    """Train a one-unit dense model with the callback attached, then save it."""
    model = keras.Sequential([keras.layers.Dense(1, input_shape=(2,))])
    callback = DataCallback()
    model.compile(loss="mse", optimizer="adam")

    # Seven training samples and eleven validation samples.
    train_x = tf.transpose(tf.range(7.0) + [[0.2], [0.4]])
    train_y = tf.transpose(tf.range(7.0) + 10 + [[0.5]])
    val_x = tf.transpose(tf.range(11.0) + 30 + [[0.6], [0.7]])
    val_y = tf.transpose(tf.range(11.0) + 40 + [[0.9]])

    model.fit(
        x=train_x,
        y=train_y,
        validation_data=(val_x, val_y),
        shuffle=False,
        batch_size=3,
        epochs=2,
        verbose=0,
        callbacks=[callback],
    )
    model.save("tmp.tf")
if __name__ == "__main__":
main()
The following three snippets show one possible solution each, each with their own pros and cons. The core trick is always the same: allocate a tf.Variable and use tf.Variable.assign to export the intended quantity, from some Keras code run in graph mode, into the callback. The methods differ slightly in callback initialization and (in one case) model compilation, and most importantly, in the quantities they can access, which is why I summarize them above each snippet.
Custom metric
Using a custom (fake) metric (similar to my Jan 2020 answer), while we cannot seem to access model.inputs nor model.outputs any more (and model.(_)targets does not even exist any longer), we can access y_true and y_pred, which represent the model targets and outputs:
[ ] Inputs/Samples (x)
[ ] Weights (w)
[+] Targets/Labels (y_true)
[+] Outputs/Predictions (y_pred)
[ ] All layers (or only final input/output layers)
"""Demonstrate batch data access using a custom metric."""
import tensorflow as tf
from tensorflow import keras
class DataCallback(keras.callbacks.Callback):  # diff
    """Callback to operate on batch data from metric."""

    def __init__(self):
        """Offer a metric to access batch data."""
        super().__init__()
        # Filled in by set_model(); updated by metric() on every batch.
        self.y_true = None
        self.y_pred = None

    def set_model(self, model):
        """Initialize variables when model is set."""
        # Keep the base-class bookkeeping, which this override used to skip.
        super().set_model(model)
        self.y_true = tf_nan(model.output.dtype)
        self.y_pred = tf_nan(model.output.dtype)

    def metric(self, y_true, y_pred):
        """Fake metric: copy the batch targets and outputs, always return 0."""
        self.y_true.assign(y_true)
        self.y_pred.assign(y_pred)
        return 0

    def on_train_batch_end(self, _batch, _logs=None):
        """See keras.callbacks.Callback.on_train_batch_end."""
        print("y_true =", self.y_true.numpy())
        print("y_pred =", self.y_pred.numpy())

    def on_train_end(self, _logs=None):
        """Clean up."""
        del self.y_true, self.y_pred
def tf_nan(dtype):
    """Return a NaN-initialised variable of `dtype` with unspecified shape.

    The unspecified shape is what allows a later `assign()` of any shape.
    """
    nan_value = float("nan")
    any_shape = tf.TensorShape(None)
    return tf.Variable(nan_value, dtype=dtype, shape=any_shape)
def main():
    """Train a one-unit Dense model with the metric-based callback attached."""
    model = keras.Sequential([keras.layers.Dense(1, input_shape=(2,))])
    callback = DataCallback()
    model.compile(loss="mse", optimizer="adam", metrics=[callback.metric])  # diff

    # Small synthetic data sets: 7 training and 11 validation samples
    train_x = tf.transpose(tf.range(7.0) + [[0.2], [0.4]])
    train_y = tf.transpose(tf.range(7.0) + 10 + [[0.5]])
    val_x = tf.transpose(tf.range(11.0) + 30 + [[0.6], [0.7]])
    val_y = tf.transpose(tf.range(11.0) + 40 + [[0.9]])

    fit_options = {
        "shuffle": False,
        "batch_size": 3,
        "epochs": 2,
        "verbose": 0,
        "callbacks": [callback],
    }
    model.fit(x=train_x, y=train_y, validation_data=(val_x, val_y), **fit_options)
    model.save("tmp.tf")


if __name__ == "__main__":
    main()
Custom training step
A custom training step is what I used in an earlier version of this answer. The idea still works in principle, but y_pred can be expensive and it might make sense to use a custom metric (see above) if that is required.
[+] Inputs/Samples (x)
[+] Weights (w)
[+] Targets/Labels (y_true)
[~] Outputs/Predictions (y_pred) [expensive!]
[ ] All layers (or only final input/output layers)
"""Demonstrate batch data access using a custom training step."""
import tensorflow as tf
from tensorflow import keras
class DataCallback(keras.callbacks.Callback):  # diff
    """Callback to operate on batch data from training step.

    ``set_model`` replaces ``model.train_step`` with a wrapper that copies the
    batch inputs, sample weights, targets and predictions into tf.Variables,
    which are printed at the end of every training batch.
    """

    def __init__(self):
        """Initialize tf.Variables."""
        super().__init__()
        # Created in set_model(), once the model's dtypes are known.
        self.x = None
        self.w = None
        self.y_true = None
        self.y_pred = None

    def set_model(self, model):
        """Wrap the model.train_step function to access training batch data.

        Args:
            model: The Keras model this callback is attached to. Its
                ``train_step`` attribute is replaced by a wrapper that calls
                the original step.
        """
        # Keep the base-class bookkeeping so that ``self.model`` is available.
        super().set_model(model)
        self.x = tf_nan(model.input.dtype)
        # pylint:disable=protected-access (replace by proper dtype if you know it)
        if model.compiled_loss._user_loss_weights is not None:
            self.w = tf_nan(model.compiled_loss._user_loss_weights.dtype)
        self.y_true = tf_nan(model.output.dtype)
        self.y_pred = tf_nan(model.output.dtype)
        model_train_step = model.train_step

        def outer_train_step(data):
            """Export the batch tensors around the original training step."""
            # https://github.com/keras-team/keras/blob/v2.7.0/keras/engine/training.py
            x, y_true, w = keras.utils.unpack_x_y_sample_weight(data)
            self.x.assign(x)
            # self.w is only allocated above when a dtype could be determined;
            # skip the export instead of failing on None.assign(...).
            if w is not None and self.w is not None:
                self.w.assign(w)
            self.y_true.assign(y_true)
            result = model_train_step(data)
            # Extra forward pass, run after the original step has finished.
            y_pred = model(x)
            self.y_pred.assign(y_pred)
            return result

        model.train_step = outer_train_step

    def on_train_batch_end(self, _batch, _logs=None):
        """See keras.callbacks.Callback.on_train_batch_end."""
        print("x =", self.x.numpy())
        if self.w is not None:
            print("w =", self.w.numpy())
        print("y_true =", self.y_true.numpy())
        print("y_pred =", self.y_pred.numpy())

    def on_train_end(self, _logs=None):
        """Clean up."""
        del self.x, self.w, self.y_true, self.y_pred
def tf_nan(dtype):
    """Return a NaN-initialised tf.Variable of ``dtype`` whose shape is left open.

    The unconstrained shape is what allows later ``assign()`` calls on it.
    """
    open_shape = tf.TensorShape(None)
    nan_value = float("nan")
    return tf.Variable(nan_value, dtype=dtype, shape=open_shape)
def main():
    """Train a one-unit Dense model with the train-step callback attached."""
    model = keras.Sequential([keras.layers.Dense(1, input_shape=(2,))])
    callback = DataCallback()
    model.compile(loss="mse", optimizer="adam")

    # Small synthetic data sets: 7 training and 11 validation samples
    train_x = tf.transpose(tf.range(7.0) + [[0.2], [0.4]])
    train_y = tf.transpose(tf.range(7.0) + 10 + [[0.5]])
    val_x = tf.transpose(tf.range(11.0) + 30 + [[0.6], [0.7]])
    val_y = tf.transpose(tf.range(11.0) + 40 + [[0.9]])

    fit_options = {
        "shuffle": False,
        "batch_size": 3,
        "epochs": 2,
        "verbose": 0,
        "callbacks": [callback],
    }
    model.fit(x=train_x, y=train_y, validation_data=(val_x, val_y), **fit_options)
    model.save("tmp.tf")


if __name__ == "__main__":
    main()
Custom layer call
A custom layer call is a super-flexible way of accessing each layer's inputs and outputs. The callback handles patching of the call functions for a list of layers. While we cannot access weights and targets (as these quantities do not make sense at the level of individual layers), it allows us to access individual layer activations, which can be handy for questions such as How does one log activations using `tf.keras.callbacks.TensorBoard`?.
[+] Inputs/Samples (x)
[ ] Weights (w)
[ ] Targets/Labels (y_true)
[+] Outputs/Predictions (y_pred)
[+] All layers (or only final input/output layers)
"""Demonstrate batch data access using custom layer calls."""
import tensorflow as tf
from tensorflow import keras
class DataCallback(keras.callbacks.Callback):  # diff
    """Callback that captures batch inputs/outputs of selected (wrapped) layers."""

    def __init__(self, layers):
        """Prepare a capturing replacement for ``call`` on every layer in ``layers``."""
        super().__init__()
        self.data = {}
        self.inner_calls = {}
        self.outer_calls = {}
        for layer in layers:
            captured = {}
            captured["inputs"] = tf_nan(layer.input.dtype)
            captured["outputs"] = tf_nan(layer.output.dtype)
            self.data[layer] = captured
            # Remember the untouched call so it can be restored after each batch
            self.inner_calls[layer] = layer.call

            # Defaults bind the current layer and its original call to this wrapper
            def outer_call(inputs, layer=layer, layer_call=layer.call):
                store = self.data[layer]
                store["inputs"].assign(inputs)
                outputs = layer_call(inputs)
                store["outputs"].assign(outputs)
                return outputs

            self.outer_calls[layer] = outer_call

    def on_train_batch_begin(self, _epoch, _logs=None):
        """Install the capturing calls on the layers before each batch."""
        for layer, wrapped in self.outer_calls.items():
            setattr(layer, "call", wrapped)

    def on_train_batch_end(self, _epoch, _logs=None):
        """Restore original layer calls for ModelCheckpoint, model.save, ..."""
        for layer, original in self.inner_calls.items():
            setattr(layer, "call", original)
        # Report what was captured during the batch that just finished
        for layer in self.data:
            captured = self.data[layer]
            print("Layer =", layer)
            print("Inputs =", captured["inputs"].numpy())
            print("Outputs =", captured["outputs"].numpy())
def tf_nan(dtype):
    """Return a NaN-initialised tf.Variable of ``dtype`` whose shape is left open.

    The unconstrained shape is what allows later ``assign()`` calls on it.
    """
    open_shape = tf.TensorShape(None)
    nan_value = float("nan")
    return tf.Variable(nan_value, dtype=dtype, shape=open_shape)
def main():
    """Train a one-unit Dense model with the layer-call callback attached."""
    model = keras.Sequential([keras.layers.Dense(1, input_shape=(2,))])
    callback = DataCallback(model.layers)  # diff
    model.compile(loss="mse", optimizer="adam")

    # Small synthetic data sets: 7 training and 11 validation samples
    train_x = tf.transpose(tf.range(7.0) + [[0.2], [0.4]])
    train_y = tf.transpose(tf.range(7.0) + 10 + [[0.5]])
    val_x = tf.transpose(tf.range(11.0) + 30 + [[0.6], [0.7]])
    val_y = tf.transpose(tf.range(11.0) + 40 + [[0.9]])

    fit_options = {
        "shuffle": False,
        "batch_size": 3,
        "epochs": 2,
        "verbose": 0,
        "callbacks": [callback],
    }
    model.fit(x=train_x, y=train_y, validation_data=(val_x, val_y), **fit_options)
    model.save("tmp.tf")


if __name__ == "__main__":
    main()
When to use which and open to-dos
I think the snippets above each solution nicely summarize what each approach is capable of. Generally,
a custom training step will be ideal to access the model input, such as batched dataset generators, effects of shuffling, etc;
a custom layer call is ideal to access the in-betweens of the model; and
a custom metric is ideal to access the outputs of the model.
I am fairly certain (but have not tried) that one can combine all approaches to be able to access all batch quantities simultaneously. I have not tested anything but training mode - each method can have further pros and cons relating to their usefulness in testing or prediction mode. Finally, I assume, but have not tested either, that there should be only minor differences between tf.keras and keras. Having tested this code on TF2.8.rc1 and Keras 2.8.0, which has moved the tf.keras code back into the keras pip package, and not using any private APIs, I believe this assumption is justified.
It would be great if this approach could be extended to access model.inputs and model.outputs again. Currently, I am getting errors such as this one:
TypeError: You are passing KerasTensor(...), an intermediate Keras symbolic input/output, to a TF API that does not allow registering custom dispatchers, such as tf.cond, tf.function, gradient tapes, or tf.map_fn. Keras Functional model construction only supports TF API calls that do support dispatching, such as tf.math.add or tf.reshape. Other APIs cannot be called directly on symbolic Keras inputs/outputs. You can work around this limitation by putting the operation in a custom Keras layer call and calling that layer on this symbolic input/output.
Previous answer
From TF 2.2 on, you can use custom training steps rather than callbacks to achieve what you want. Here's a demo that works with tensorflow==2.2.0rc1, using inheritance to improve the keras.Sequential model. Performance-wise, this is not ideal as predictions are made twice, once in self(x, training=True) and once in super().train_step(data). But you get the idea.
This works in eager mode and does not use private APIs, so it should be pretty stable. One caveat is that you have to use tf.keras (standalone keras does not support Model.train_step), but I feel standalone keras is becoming more and more deprecated anyway. (In fact, tf.keras migrates to keras in TF2.8.)
"""Demonstrate access to Keras batch tensors in a tf.keras custom training step."""
import numpy as np
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.python.keras.engine import data_adapter
# Shape of one input sample; passed as the Dense layer's input_shape below
in_shape = (2,)
# Shape of one target sample; its first entry is the Dense layer's unit count
out_shape = (1,)
# Batch size passed to model.fit
batch_size = 3
# Number of random samples generated for X and Y
n_samples = 7
class SequentialWithPrint(keras.Sequential):
    """Sequential model whose training step prints the tensors of every batch."""

    def train_step(self, original_data):
        """Print weights, inputs, targets and predictions, then run the stock step."""
        # Basically copied one-to-one from https://git.io/JvDTv
        expanded = data_adapter.expand_1d(original_data)
        inputs, targets, sample_weight = data_adapter.unpack_x_y_sample_weight(expanded)
        predictions = self(inputs, training=True)
        # this is pretty much like on_train_batch_begin
        labelled_tensors = (
            (sample_weight, "Sample weight (w) ="),
            (inputs, "Batch input (x) ="),
            (targets, "Batch output (y_true) ="),
            (predictions, "Prediction (y_pred) ="),
        )
        for tensor, label in labelled_tensors:
            K.print_tensor(tensor, label)
        # add anything here for on_train_batch_end-like behavior
        return super().train_step(original_data)
# Model: a single Dense layer wrapped in the printing Sequential subclass
model = SequentialWithPrint([keras.layers.Dense(out_shape[0], input_shape=in_shape)])
model.compile(loss="mse", optimizer="adam")
# Example data: uniform random inputs and targets, n_samples rows each
X = np.random.rand(n_samples, *in_shape)
Y = np.random.rand(n_samples, *out_shape)
# Training triggers the per-batch printing inside train_step
model.fit(X, Y, batch_size=batch_size)
# Print the full arrays for comparison with the per-batch output
print("X: ", X)
print("Y: ", Y)
Finally, here is a simpler example without inheritance:
"""Demonstrate access to Keras batch tensors in a tf.keras custom training step."""
import tensorflow as tf
# Shape of one input sample; passed as the Dense layer's input_shape below
IN_SHAPE = (2,)
# Shape of one target sample; its first entry is the Dense layer's unit count
OUT_SHAPE = (1,)
# Batch size passed to model.fit
BATCH_SIZE = 3
# Number of random samples generated for X and Y
N_SAMPLES = 7
def make_print_data_and_train_step(keras_model):
    """Build a train_step replacement for ``keras_model`` that prints data batches.

    The returned function prints the batch tensors and then delegates to the
    ``train_step`` the model had when this factory was called.
    """
    wrapped_step = keras_model.train_step

    def print_data_and_train_step(data):
        """Print the batch tensors, then run the wrapped training step."""
        # Adapted from https://git.io/JvDTv, skipping data_adapter.expand_1d
        inputs, targets, sample_weight = tf.keras.utils.unpack_x_y_sample_weight(data)
        predictions = keras_model(inputs, training=True)
        # this is pretty much like on_train_batch_begin
        labelled_tensors = (
            (sample_weight, "Sample weight (w) ="),
            (inputs, "Batch input (x) ="),
            (targets, "Batch output (y_true) ="),
            (predictions, "Prediction (y_pred) ="),
        )
        for tensor, label in labelled_tensors:
            tf.keras.backend.print_tensor(tensor, label)
        # add anything here for on_train_batch_end-like behavior
        return wrapped_step(data)

    return print_data_and_train_step
# Model: a single Dense layer whose train_step is replaced by the printing wrapper
model = tf.keras.Sequential([tf.keras.layers.Dense(OUT_SHAPE[0], input_shape=IN_SHAPE)])
model.train_step = make_print_data_and_train_step(model)
model.compile(loss="mse", optimizer="adam")
# Example data: normally distributed inputs and targets, N_SAMPLES rows each
X = tf.random.normal((N_SAMPLES, *IN_SHAPE))
Y = tf.random.normal((N_SAMPLES, *OUT_SHAPE))
# Training triggers the per-batch printing inside the wrapped train_step
model.fit(X, Y, batch_size=BATCH_SIZE)
# Print the full tensors for comparison with the per-batch output
print("X: ", X)
print("Y: ", Y)
Update: This approach has stopped working. See my other answer for a number of solutions compatible with TF2.8 (and hopefully beyond).
One problem with @Yu-Yang's solution is that it relies on model._function_kwargs, which is not guaranteed to work as it is not part of the API. In particular, in TF2 with eager execution, session kwargs seem to be either not accepted at all or run preemptively due to eager mode.
Therefore, here is my solution tested on tensorflow==2.1.0. The trick is to replace fetches by a Keras metric, in which the assignment operations from fetches are made during training.
This even enables a Keras-only solution if the batch size divides the number of samples; otherwise, another trick has to be applied when initializing TensorFlow variables with a None shape, similar to validate_shape=False in earlier solutions (compare https://github.com/tensorflow/tensorflow/issues/35667).
Importantly, tf.keras behaves differently from keras (sometimes just ignoring assignments, or seeing variables as Keras symbolic tensors), so this updated solution takes care of both implementations (Keras==2.3.1 and tensorflow==2.1.0).
"""Demonstrate access to Keras symbolic tensors in a (tf.)keras.Callback."""
import numpy as np
import tensorflow as tf
# Toggle between the tf.keras and the standalone Keras implementation
use_tf_keras = True
if not use_tf_keras:
    import keras
    from keras import backend as K

    # Standalone Keras gets no extra compile arguments
    compile_kwargs = {}
else:
    from tensorflow import keras
    from tensorflow.keras import backend as K

    # Ask tf.keras not to run functions eagerly
    tf.config.experimental_run_functions_eagerly(False)
    compile_kwargs = dict(run_eagerly=False, experimental_run_tf_function=False)
# Per-sample input/target shapes, batch size and sample count of the demo
in_shape, out_shape = (2,), (1,)
batch_size, n_samples = 3, 7
class CollectKerasSymbolicTensorsCallback(keras.callbacks.Callback):
    """Record the input, target and output tensors of every batch."""

    def __init__(self):
        """Set up the per-batch history lists and the exchange variables."""
        super().__init__()
        # Evaluated batches are appended to these lists
        self.inputs, self.targets, self.outputs = [], [], []
        # # For a pure Keras solution, we need to know the shapes beforehand;
        # # in particular, batch_size must divide n_samples:
        # self.input = K.variable(np.empty((batch_size, *in_shape)))
        # self.target = K.variable(np.empty((batch_size, *out_shape)))
        # self.output = K.variable(np.empty((batch_size, *out_shape)))
        # If the shape of these variables will change (e.g., last batch), initialize
        # arbitrarily and specify `shape=tf.TensorShape(None)`:
        for attribute in ("input", "target", "output"):
            setattr(self, attribute, tf.Variable(0.0, shape=tf.TensorShape(None)))

    def on_batch_end(self, batch, logs=None):
        """Evaluate the exchange variables and append the values to the histories."""
        pairs = (
            (self.inputs, self.input),
            (self.targets, self.target),
            (self.outputs, self.output),
        )
        for history, variable in pairs:
            history.append(K.eval(variable))

    def on_train_end(self, logs=None):
        """Print everything that was collected."""
        collected = (
            ("Inputs: ", self.inputs),
            ("Targets: ", self.targets),
            ("Outputs: ", self.outputs),
        )
        for label, history in collected:
            print(label, *history)
@tf.function
def assign_keras_symbolic_tensors_metric(_foo, _bar):
    """
    Return the assignment operations as a metric to have them evaluated by Keras.

    This replaces `fetches` from the TF1/non-eager-execution solution.

    Both arguments are ignored; the function reads the module-level `model`
    and `callback` instead and always returns 0.
    """
    # Collect assignments as list of (dest, src)
    assignments = (
        (callback.input, model.inputs[0]),
        # tf.keras keeps the targets in a private attribute, Keras in a public one
        (callback.target, model._targets[0] if use_tf_keras else model.targets[0]),
        (callback.output, model.outputs[0]),
    )
    for dest, src in assignments:
        dest.assign(src)
    return 0
# The metric function above reads `callback` and `model` as module-level names
callback = CollectKerasSymbolicTensorsCallback()
metrics = [assign_keras_symbolic_tensors_metric]
# Example model
model = keras.Sequential([keras.layers.Dense(out_shape[0], input_shape=in_shape)])
model.compile(loss="mse", optimizer="adam", metrics=metrics, **compile_kwargs)
# Example data
X = np.random.rand(n_samples, *in_shape)
Y = np.random.rand(n_samples, *out_shape)
# The callback prints the collected batches when training ends
model.fit(X, Y, batch_size=batch_size, callbacks=[callback])
# Print the full arrays for comparison with the collected batches
print("X: ", X)
print("Y: ", Y)
Inspired by the way tf.keras.callbacks.TensorBoard saves v1 (graph) summaries.
No variable assignments and no redundant metrics.
For use with tensorflow>=2.0.0 in graph mode (eager execution disabled) during evaluation.
Extensive operations on the numpy predictions can be implemented by overriding SavePrediction._pred_callback.
import numpy as np
import tensorflow as tf
from tensorflow import keras
# Run in graph (non-eager) mode, which this snippet is written for
tf.compat.v1.disable_eager_execution()
# Shape of one input sample; passed as the Dense layer's input_shape below
in_shape = (2,)
# Shape of one target sample; its first entry is the Dense layer's unit count
out_shape = (1,)
# Batch size passed to model.evaluate
batch_size = 2
# Number of random samples generated for X and Y
n_samples = 32
class SavePrediction(keras.callbacks.Callback):
    """Collect the model's first output for every evaluated batch.

    The output tensor is added to the fetches of the model's test function
    while evaluation runs; each fetched value is appended to ``self.preds``.
    """

    def __init__(self):
        """Start with no output tensor selected and an empty prediction list."""
        super().__init__()
        self._get_pred = None
        self.preds = []

    def _pred_callback(self, preds):
        """Store one fetched prediction value (override for custom handling)."""
        self.preds.append(preds)

    def set_model(self, model):
        """Remember the model and select its first output as the tensor to fetch."""
        super().set_model(model)
        if self._get_pred is None:
            self._get_pred = self.model.outputs[0]

    def on_test_begin(self, logs=None):
        """Register the output tensor and its callback on the test function."""
        # pylint: disable=protected-access
        self.model._make_test_function()
        # pylint: enable=protected-access
        test_function = self.model.test_function
        if self._get_pred not in test_function.fetches:
            test_function.fetches.append(self._get_pred)
            test_function.fetch_callbacks[self._get_pred] = self._pred_callback

    def on_test_end(self, logs=None):
        """Unregister the output tensor again and print the collected predictions."""
        test_function = self.model.test_function
        if self._get_pred in test_function.fetches:
            test_function.fetches.remove(self._get_pred)
        if self._get_pred in test_function.fetch_callbacks:
            test_function.fetch_callbacks.pop(self._get_pred)
        print(self.preds)
# Example model: a single Dense layer
model = keras.Sequential([
keras.layers.Dense(out_shape[0], input_shape=in_shape)
])
model.compile(loss="mse", optimizer="adam")
# Example data: uniform random inputs and targets, n_samples rows each
X = np.random.rand(n_samples, *in_shape)
Y = np.random.rand(n_samples, *out_shape)
# Evaluate with the callback attached; it prints the predictions when evaluation ends
model.evaluate(X, Y,
batch_size=batch_size,
callbacks=[SavePrediction()])

How to use tf.contrib.seq2seq.BahdanauAttention

I am trying to write simple code for a seq2seq model with attention in TF 1.1. I am not sure what the parameter "depth of query mechanism" is. I am getting an error when creating the attention mechanism:
TypeError: int() argument must be a string, a bytes-like object or a number, not 'TensorShape'
Here is my code. Am I on the right track? I could not find any detailed documentation.
import tensorflow as tf
from tensorflow.contrib.rnn import LSTMCell, LSTMStateTuple, BasicLSTMCell, DropoutWrapper, MultiRNNCell, EmbeddingWrapper, static_rnn
import tensorflow.contrib.seq2seq as seq2seq
import attention_wrapper as wrapper
# Start from an empty default graph
tf.reset_default_graph()
# Close a session left over from a previous run, if one exists
try:
    sess.close()
except NameError:
    # `sess` has not been defined yet, so there is nothing to close
    pass
sess = tf.InteractiveSession()
## Place holders
# One int32 placeholder of shape (None,) per time step, collected in Python lists.
# NOTE(review): input_seq_length, output_seq_length, embedding_dim and
# input_vocab_size are not defined in the visible snippet.
encode_input = [tf.placeholder(tf.int32,
shape=(None,),
name = "ei_%i" %i)
for i in range(input_seq_length)]
labels = [tf.placeholder(tf.int32,
shape=(None,),
name = "l_%i" %i)
for i in range(output_seq_length)]
# Decoder inputs: an all-zero "GO" step followed by the labels without their last step.
# NOTE(review): `np` is used here but not among the imports shown above.
decode_input = [tf.zeros_like(encode_input[0], dtype=np.int32, name="GO")] + labels[:-1]
############ Encoder
# An embedding-wrapped LSTM cell is run over the list of per-step placeholders
lstm_cell = BasicLSTMCell(embedding_dim)
encoder_cell = EmbeddingWrapper(lstm_cell, embedding_classes=input_vocab_size, embedding_size=embedding_dim)
encoder_outputs, encoder_state = static_rnn(encoder_cell, encode_input, dtype=tf.float32)
############ Decoder
# Attention Mechanisms. Bahdanau is additive style attention
# NOTE(review): encoder_outputs is passed as `memory` exactly as returned by
# static_rnn; presumably that is a list of per-step tensors rather than one
# stacked tensor, which may be related to the TypeError quoted above - confirm
# against the BahdanauAttention documentation.
attn_mech = tf.contrib.seq2seq.BahdanauAttention(
num_units = input_seq_length, # depth of query mechanism
memory = encoder_outputs, # hidden states to attend (output of RNN)
normalize=False, # normalize energy term
name='BahdanauAttention')
lstm_cell_decoder = BasicLSTMCell(embedding_dim)
# Attention Wrapper: adds the attention mechanism to the cell
attn_cell = wrapper.AttentionWrapper(
cell = lstm_cell_decoder,# Instance of RNNCell
attention_mechanism = attn_mech, # Instance of AttentionMechanism
attention_size = embedding_dim, # Int, depth of attention (output) tensor
attention_history=False, # whether to store history in final output
name="attention_wrapper")
# Decoder setup
# NOTE(review): the decoder is built from lstm_cell_decoder, not from attn_cell,
# so the attention wrapper created above is not used by it as written - confirm intent.
# NOTE(review): `helper` is not defined in the visible snippet.
decoder = tf.contrib.seq2seq.BasicDecoder(
cell = lstm_cell_decoder,
helper = helper, # A Helper instance
initial_state = encoder_state, # initial state of decoder
output_layer = None) # instance of tf.layers.Layer, like Dense
# Perform dynamic decoding with decoder object
outputs, final_state = tf.contrib.seq2seq.dynamic_decode(decoder)