How can I create two sets of shared embeddings from the same tensorflow feature columns?
This small example
import tensorflow as tf
data = {"A": [0, 1, 2, 3], "B": [2, 1, 0, 3]}
def add_label(example):
    """Pair a feature example with a constant label.

    Args:
        example: The feature structure to label; it is passed through
            unchanged.

    Returns:
        The tuple ``(example, 1)``.
    """
    return example, 1
def input_fn():
dset =
return dset
def model_fn(features, labels, mode, params):
    """Estimator model function that requests two shared-embedding sets.

    Builds two categorical columns ("A" and "B"), asks for a shared embedding
    over both columns twice (the second time inside the "other" variable
    scope), sums each embedded representation along axis 1 and trains the sum
    of both outputs against the labels with mean squared error.

    Args:
        features: Feature structure handed to ``DenseFeatures``.
        labels: Target values for the mean-squared-error loss.
        mode: Estimator mode, forwarded to the returned ``EstimatorSpec``.
        params: Unused.

    Returns:
        A ``tf.estimator.EstimatorSpec`` carrying the loss and train op.
    """
    colA = tf.feature_column.categorical_column_with_vocabulary_list("A", [0, 1, 2, 3])
    colB = tf.feature_column.categorical_column_with_vocabulary_list("B", [0, 1, 2, 3])

    # First set of embeddings shared between columns A and B.
    model1_embedddings = tf.feature_column.shared_embeddings(categorical_columns=[colA, colB], dimension=2)
    X1 = tf.keras.layers.DenseFeatures(model1_embedddings)(features)
    output1_output = tf.reduce_sum(X1, axis=1)

    # Second set, requested under a different variable scope. This is the
    # part reported to fail with "Variable A_B_shared_embedding already
    # exists" in the surrounding question.
    with tf.compat.v1.variable_scope("other", reuse=False):
        model2_embedddings = tf.feature_column.shared_embeddings(categorical_columns=[colA, colB], dimension=2)
        X2 = tf.keras.layers.DenseFeatures(model2_embedddings)(features)
        output2_output = tf.reduce_sum(X2, axis=1)

    loss = tf.losses.mean_squared_error(labels, output1_output + output2_output)
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=0.01)
    train_op = optimizer.minimize(loss=loss)
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
estimator = tf.estimator.Estimator(model_fn=model_fn)
crashes with
ValueError: Variable A_B_shared_embedding already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope?
It seems one should be able to use variable_scope or name_scope to get it to work, but so far no luck.

`shared_embeddings` accepts a `shared_embedding_collection_name` argument; passing a different collection name for the second call creates a separate set of embedding variables:
model2_embedddings = tf.feature_column.shared_embeddings(
categorical_columns=[colA, colB], dimension=2,


Cannot get GradientTape to give non null results

I am trying to manually implement a very simple RNN using TensorFlow 2. I modeled my code on the TensorFlow website's example of building models by hand. The code, stripped to the bare essentials for this purpose, is
class ModelSimple(object):
    """Minimal linear model computing ``W * x + b`` with scalar parameters."""

    def __init__(self):
        # Both the weight and the bias are scalar variables initialized to
        # random values with `tf.random.normal`.
        self.W = tf.Variable(tf.random.normal([]))
        self.b = tf.Variable(tf.random.normal([]))

    def __call__(self, x):
        """Return the model output ``W * x + b`` for input ``x``."""
        return self.W * x + self.b
def loss(predicted_y, target_y):
    """Return the mean squared error between predictions and targets.

    Args:
        predicted_y: Model outputs.
        target_y: Desired outputs.

    Returns:
        The mean of the element-wise squared differences.
    """
    return tf.reduce_mean(tf.square(predicted_y - target_y))
inputs = tf.random.normal(shape=[NUM_EXAMPLES])
outputs = tf.zeros(NUM_EXAMPLES)
model = ModelSimple()
with tf.GradientTape() as t:[model.W,model.b])
current_loss = loss(model(inputs), outputs)
dW, db = t.gradient(current_loss, [model.W, model.b])
This gives nice tensors for dW and db. Then I try to do what I described above
class ModelRNN(object):
    """Hand-written recurrent layer that loops over the time steps itself.

    Holds input weights ``Wx``, recurrent weights ``Wy`` and a bias ``b``.
    """

    def __init__(self, n_inputs, n_neurons):
        """Create the weight variables.

        Args:
            n_inputs: Number of input features per time step.
            n_neurons: Number of recurrent units.
        """
        self.n_inputs = n_inputs
        self.n_neurons = n_neurons
        # weights for new input
        self.Wx = tf.Variable(tf.random.normal(shape=[self.n_inputs, self.n_neurons], dtype=tf.float32))
        # weights for previous output
        self.Wy = tf.Variable(tf.random.normal(shape=[self.n_neurons, self.n_neurons], dtype=tf.float32))
        # bias weights
        self.b = tf.Variable(tf.zeros([1, self.n_neurons], dtype=tf.float32))

    def __call__(self, X_batch):
        """Run the layer over every time step of ``X_batch``.

        Args:
            X_batch: Rank-3 input indexed as ``[batch, time step, feature]``.

        Returns:
            The variable ``Ys`` holding the output of every time step,
            shaped ``[batch_size, num_time_steps, n_neurons]``.
        """
        # get shape of input
        batch_size, num_time_steps, _ = X_batch.get_shape()
        # we will loop through the time steps and the output of the previous computation feeds into
        # the next one.
        # this variable keeps track of it and is initialized to zero
        y_last = tf.Variable(tf.zeros([batch_size, self.n_neurons], dtype=tf.float32))
        # the outputs will be stored in this tensor
        Ys = tf.Variable(tf.zeros([batch_size, num_time_steps, self.n_neurons], dtype=tf.float32))
        for t in range(num_time_steps):
            Xt = X_batch[:, t, :]
            # NOTE(review): the closing `self.b)` term and the `y_last`
            # update below were missing from the extracted snippet and are
            # restored from context -- confirm against the original post.
            yt = tf.tanh(tf.matmul(y_last, self.Wy) +
                         tf.matmul(Xt, self.Wx) +
                         self.b)
            y_last.assign(yt)
            # Results are written into the `Ys` variable with `assign`.
            Ys[:, t, :].assign(yt)
        return Ys
inputs = tf.convert_to_tensor(np.array([
# t = 0 t = 1
[[0, 1, 2], [9, 8, 7]], # instance 1
[[3, 4, 5], [0, 0, 0]], # instance 2
[[6, 7, 8], [6, 5, 4]], # instance 3
[[9, 0, 1], [3, 2, 1]], # instance 4
model = ModelRNN(3, 5)
with tf.GradientTape() as t:[model.Wx,model.Wy,model.b])
current_loss = loss(model(inputs), outputs)
dWx,dWy,db = t.gradient(current_loss, [model.Wx, model.Wy,model.b])
and it turns out that dWx, dWy and db are all None. I have tried several things (including explicitly watching the variables with the GradientTape, even though they are already variables), and yet I keep getting None. What am I doing wrong?
It looks like this is related to this issue:
Tensorflow cannot get gradient wrt a Variable, but can wrt a Tensor
Replacing assign with a python list and tf.stack results in a gradient being returned
Ys = []
for t in range(num_time_steps):
Xt = X_batch[:, t, :]
yt = tf.tanh(tf.matmul(y_last, self.Wy) +
tf.matmul(Xt, self.Wx) +
return tf.stack(Ys,axis=1)

How to use Dataset to feed an array of data to inference with TensorFlow?

I am new to the TensorFlow Dataset API and do not yet fully understand its design, so I need some help.
Here is a simple example
import tensorflow as tf
# Feed a single scalar through a placeholder and print its square.
x = tf.placeholder(tf.int32, shape=[])
y = tf.square(x)
with tf.Session() as sess:
    print(sess.run(y, {x: 2}))
# result is 4, simple
If I have an integer array arr_x=[2, 3, 5, 8, 10], how can I use the Dataset API to iterate over the array?
I am trying
p = tf.placeholder(tf.int32, shape=[None])
d =
d = x: x)
iter = d.make_initializable_iterator()
next_element = iter.get_next()
with tf.Session() as sess:, feed_dict={p: [2, 3, 4]})
while True:
print, next_element)
except tf.errors.OutOfRangeError:
But no luck, any idea?
What about:
arr_x = np.array([2, 3, 5, 8, 10])
arr_y = np.array([[0,1],[1,0],[1,0],[0,1],[1,0]])
dataset =, arr_y))
iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()
sess = tf.Session()
while True:
except tf.errors.OutOfRangeError:

Masking zero-padding embedding (and return zero gradients) in Tensorflow as in Pytorch

I'm trying to recreate the PoolNet from Spotlight with the BPR loss in Tensorflow but I can't get the same results. Below is the model I'm using (it's an estimator model_fn).
def _pooling_model_fn(features, labels, mode, params):
    """Estimator model function for a pooling sequence model with BPR loss.

    Embeds the item sequence, builds a running average of the embeddings
    along the sequence axis (with one leading pad step), and scores items by
    a dot product between that running user representation and the item
    embedding plus an item bias.

    NOTE(review): the nesting below (in particular how far each
    ``tf.device('/cpu:0')`` scope extends, and which branch the training
    ``return`` belongs to) was reconstructed from a snippet whose indentation
    was lost -- confirm against the original code.

    Args:
        features: Dict with key ``'item_seqs'`` and, for training/evaluation,
            ``'neg_samp'``.
        labels: Unused.
        mode: A ``tf.estimator.ModeKeys`` value.
        params: Dict with keys ``"num_items"``, ``"item_emb_size"`` and
            ``"lr"``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the TRAIN and PREDICT modes.
    """
    with tf.name_scope('inputs'):
        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            users_prev_items_inputs_train = features['item_seqs']
        elif mode == tf.estimator.ModeKeys.PREDICT:
            # At prediction time the sequence is reshaped to a batch of one.
            users_prev_items_inputs_train = tf.reshape(features['item_seqs'], [1, -1])

    with tf.device('/cpu:0'):
        prod_embeddings = tf.keras.layers.Embedding(params["num_items"], params["item_emb_size"], mask_zero=True)
        item_biases = tf.keras.layers.Embedding(params["num_items"], 1, mask_zero=True, embeddings_initializer=tf.keras.initializers.Zeros())
        prod_embed = prod_embeddings(users_prev_items_inputs_train)

    # Move the embedding axis before the sequence axis and add a trailing
    # singleton axis.
    targets = tf.transpose(prod_embed, [0, 2, 1])
    sequence_embeddings = tf.expand_dims(targets, axis=3)
    # Pad one step at the front of the sequence axis.
    sequence_embeddings = tf.pad(sequence_embeddings, paddings=tf.constant([[0, 0], [0, 0], [1, 0], [0, 0]]))
    # Running sum of embeddings and running count of non-zero entries give
    # the running average used as the user representation.
    sequence_embedding_sum = tf.cumsum(sequence_embeddings, 2)
    non_padding_entries = tf.cumsum(tf.cast(tf.not_equal(sequence_embeddings, tf.constant(0.0)), tf.float32), 2)  # .expand_as(sequence_embedding_sum)
    user_representations = tf.squeeze((sequence_embedding_sum / (non_padding_entries + 1)), [3])
    # All but the last step are used for training; the last step is the
    # representation used for prediction.
    user_representations_so_far = user_representations[:, :, :-1]
    user_representations_new = user_representations[:, :, -1]

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        global_step = tf.contrib.framework.get_or_create_global_step()
        with tf.name_scope('loss'):
            negative_samples = features['neg_samp']
            with tf.device('/cpu:0'):
                prod_embed_pos = prod_embeddings(users_prev_items_inputs_train)
                target_embedding_positive = tf.squeeze(tf.transpose(prod_embed_pos, [0, 2, 1]))
                prod_bias_pos = item_biases(users_prev_items_inputs_train)
                target_bias_positive = tf.squeeze(prod_bias_pos)
            dot_positive = tf.reduce_sum(user_representations_so_far * target_embedding_positive, 1) + target_bias_positive
            with tf.device('/cpu:0'):
                prod_embed_neg = prod_embeddings(negative_samples)
                target_embedding_negative = tf.squeeze(tf.transpose(prod_embed_neg, [0, 2, 1]))
                prod_bias_neg = item_biases(negative_samples)
                target_bias_negative = tf.squeeze(prod_bias_neg)
            dot_negative = tf.reduce_sum(user_representations_so_far * target_embedding_negative, 1) + target_bias_negative
            # Positions whose item id is 0 are excluded from the loss.
            mask = tf.not_equal(users_prev_items_inputs_train, 0)
            loss = bpr_loss(dot_positive, dot_negative, mask)
        if mode == tf.estimator.ModeKeys.TRAIN:
            with tf.name_scope('optimizer'):
                optimizer = tf.train.AdamOptimizer(learning_rate=params["lr"])
                train_op = optimizer.minimize(loss, global_step=global_step)
            return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Score every item id against the latest user representation.
        item_ids = np.arange(params['num_items']).reshape(-1, 1)
        item_ids_tensor = tf.convert_to_tensor(item_ids, dtype=tf.int64)
        with tf.device('/cpu:0'):
            prod_embed_pos = prod_embeddings(item_ids_tensor)  # tf.nn.embedding_lookup(prod_embeddings, item_ids_tensor)
            target_embedding_positive = tf.squeeze(tf.transpose(prod_embed_pos, [0, 2, 1]))
            prod_bias_pos = item_biases(item_ids_tensor)  # tf.nn.embedding_lookup(item_biases, item_ids_tensor)
            target_bias_positive = tf.squeeze(prod_bias_pos)
        dot_positive = tf.reduce_sum(user_representations_new * target_embedding_positive, 1) + target_bias_positive
        predictions = {
            'products': tf.reshape(dot_positive, [1, -1])
        }
        export_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs=export_outputs)
and the loss function
def bpr_loss(positive_predictions, negative_predictions, mask):
    """Compute the Bayesian Personalised Ranking (BPR) pairwise loss.

    Args:
        positive_predictions: Scores for the observed items.
        negative_predictions: Scores for the sampled negative items.
        mask: Optional mask; when given, the loss is multiplied by it and
            normalised by its sum. Pass ``None`` to average over all entries.

    Returns:
        The masked average of ``1 - sigmoid(positive - negative)`` when a
        mask is given, otherwise the plain mean.
    """
    loss1 = 1.0 - tf.nn.sigmoid(positive_predictions - negative_predictions)
    if mask is not None:
        mask = tf.cast(mask, loss1.dtype)
        final_loss = loss1 * mask
        # Normalise by the mask total rather than the total number of entries.
        return tf.reduce_sum(final_loss) / tf.reduce_sum(mask)
    return tf.reduce_mean(loss1)
With the above model, I can't get the same predictions on the exact same dataset (and same random seed) as I do with Spotlight. I eventually found that the problem is with the zero-padding. The data is generated as follows:
the sequences have leading zero-padding so that every input sample has the same length.
Based on my code, I believed I had done everything needed to mask these zeros out of the loss, the embedding layer (using the mask_zero parameter from Keras) and the averaging of the embeddings that I'm computing (using the cumsum). Still, during training the zero-indexed embedding keeps changing, which means that instead of being excluded it is taken into account, influencing the other gradients and adding noise to my results.
Pytorch seems to have a nice feature in their implementation of the Embedding layer where you can set the padding_idx with the id of the pad and this will be initialized with zeros. Also, it keeps the gradient of this index always zero. So basically, I'm trying to do the same thing with Tensorflow.
Any help would be appreciated.
I solved it using the following solution posted on Tensorflow's Github. It seems to work now.
mask_padding_zero_op = tf.scatter_update(lookup_table,
tf.zeros([EMBEDDING_DIM,], dtype=DTYPE))
with tf.control_dependencies([mask_padding_zero_op]):
# do embedding lookup...

Why, if we use "tf.make_template()" in the training stage, must we use tf.make_template() again in the testing stage?

I defined a model function named "drrn_model". While training my model, I used it like this:
shared_model = tf.make_template('shared_model', drrn_model)
train_output = shared_model(train_input, is_training=True)
Training proceeds step by step, and I can restore a .ckpt file into the model when I want to continue training from an earlier checkpoint.
But there is a problem when I test my trained model.
I use the code below directly without using tf.make_template:
train_output = drrn_model(train_input, is_training=False)
Then the terminal gave me a lot of NotFoundErrors like "Key LastLayer/Variable_2 not found in checkpoint".
But when I use
shared_model = tf.make_template('shared_model', drrn_model)
output_tensor = shared_model(input_tensor,is_training=False)
It can test normally.
So why must we use tf.make_template() again in the testing stage? What is the difference between calling drrn_model directly and calling it through make_template when we construct the model?
I also have another question, about the BN layer in TensorFlow.
I have tried many approaches, but the output is always wrong (always worse than the version without the BN layer).
Here is the newest version of my model with the BN layer:
tensor = None
def drrn_model(input_tensor, is_training):
    """Build the DRRN graph: first conv, 10 recursive blocks, last conv.

    Args:
        input_tensor: Input image batch; the first convolution expects one
            input channel.
        is_training: Python bool forwarded to every ``batchnorm`` call.

    Returns:
        ``input_tensor`` plus the output of the last convolution (a global
        residual connection).
    """
    with tf.device("/gpu:0"):
        with tf.variable_scope("FirstLayer"):
            conv_0_w = tf.get_variable("conv_w", [3, 3, 1, 128], initializer=tf.random_normal_initializer(stddev=np.sqrt(2.0 / 9)))
            tensor = tf.nn.conv2d(tf.nn.relu(batchnorm(input_tensor, is_training= is_training)), conv_0_w, strides=[1,1,1,1], padding="SAME")
            first_layer = tensor
        ### recursion ###
        # The first block creates the "recycle" variables ...
        with tf.variable_scope("recycle", reuse=False):
            tensor = drrnblock(first_layer, tensor, is_training)
        # ... and the remaining nine blocks re-enter the scope with reuse=True.
        for i in range(1,10):
            with tf.variable_scope("recycle", reuse=True):
                tensor = drrnblock(first_layer, tensor, is_training)
        ### end layer ###
        with tf.variable_scope("LastLayer"):
            conv_end_w = tf.get_variable("conv_w", [3, 3, 128, 1], initializer=tf.random_normal_initializer(stddev=np.sqrt(2.0 / 9)))
            conv_end_layer = tf.nn.conv2d(tf.nn.relu(batchnorm(tensor, is_training= is_training)), conv_end_w, strides=[1, 1, 1, 1], padding='SAME')
            tensor = tf.add(input_tensor,conv_end_layer)
        return tensor
def drrnblock(first_layer, input_layer, is_training):
    """Apply one recursive residual block (BN-ReLU-conv twice, then add).

    Args:
        first_layer: Output of the first convolution; added back to the
            block output.
        input_layer: Output of the previous block (or of the first layer).
        is_training: Python bool forwarded to ``batchnorm``.

    Returns:
        ``first_layer`` plus the output of the second convolution.
    """
    conv1_w = tf.get_variable("conv1__w", [3, 3, 128, 128], initializer=tf.random_normal_initializer(stddev=np.sqrt(2.0 / 9)))
    conv1_layer = tf.nn.conv2d(tf.nn.relu(batchnorm(input_layer, is_training= is_training)), conv1_w, strides=[1,1,1,1], padding= "SAME")
    conv2_w = tf.get_variable("conv2__w", [3, 3, 128, 128], initializer=tf.random_normal_initializer(stddev=np.sqrt(2.0 / 9)))
    conv2_layer = tf.nn.conv2d(tf.nn.relu(batchnorm(conv1_layer, is_training=is_training)), conv2_w, strides=[1, 1, 1, 1], padding="SAME")
    tensor = tf.add(first_layer, conv2_layer)
    return tensor
def batchnorm(inputs, is_training, decay = 0.999):# there is my BN layer
    """Batch-normalise ``inputs`` over axes 0, 1 and 2.

    Args:
        inputs: Tensor to normalise; statistics are per last-axis channel.
        is_training: Python bool. When true, batch statistics are used and
            the population statistics are updated; otherwise the stored
            population statistics are used.
        decay: Decay factor of the moving averages of mean and variance.

    Returns:
        The normalised tensor.
    """
    # NOTE(review): these are created with tf.Variable rather than
    # tf.get_variable, so each call appears to allocate fresh variables
    # regardless of any enclosing variable_scope(reuse=True) -- confirm this
    # is intended.
    scale = tf.Variable(tf.ones([inputs.get_shape()[-1]]))
    beta = tf.Variable(tf.zeros([inputs.get_shape()[-1]]))
    pop_mean = tf.Variable(tf.zeros([inputs.get_shape()[-1]]), trainable=False)
    pop_var = tf.Variable(tf.ones([inputs.get_shape()[-1]]), trainable=False)
    if is_training:
        batch_mean, batch_var = tf.nn.moments(inputs,[0,1,2])
        print("batch_mean.shape: ", batch_mean.shape)
        # Exponential moving averages of the batch statistics.
        train_mean = tf.assign(pop_mean, pop_mean*decay+batch_mean*(1-decay))
        train_var = tf.assign(pop_var, pop_var*decay+batch_var*(1-decay))
        # Make the normalisation depend on the moving-average updates so
        # they run whenever the output is evaluated.
        with tf.control_dependencies([train_mean, train_var]):
            return tf.nn.batch_normalization(inputs,batch_mean,batch_var,beta,scale,variance_epsilon=1e-3)
    return tf.nn.batch_normalization(inputs,pop_mean,pop_var,beta,scale,variance_epsilon=1e-3)
Please tell me what is wrong in my code.
Thanks a lot!!

Difference between SparseTensor and SparseTensorValue

What is the difference between SparseTensor and SparseTensorValue? Is there anything I should keep in mind if I want to build the sparse tensor based on fed indices and values? I could only find a few toy examples.
It depends on where you define your Sparse Tensor.
If you would like to define the tensor outside the graph, e.g. define the sparse tensor for later data feed, use SparseTensorValue. In contrast, if the sparse tensor is defined in graph, use SparseTensor
Sample code for tf.SparseTensorValue:
# The sparse input is declared as a placeholder in the graph ...
x_sp = tf.sparse_placeholder(dtype=tf.float32)
W = tf.Variable(tf.random_normal([6, 6]))
y = tf.sparse_tensor_dense_matmul(sp_a=x_sp, b=W)
init = tf.global_variables_initializer()
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
sess.run(init)
# ... and the concrete value is built outside the graph as a
# SparseTensorValue and supplied through feed_dict.
stv = tf.SparseTensorValue(indices=[[0, 0], [1, 2]], values=[1.1, 1.2],
                           dense_shape=[2, 6])
result = sess.run(y,feed_dict={x_sp:stv})
Sample code for tf.SparseTensor:
# The three components of the sparse tensor are fed as dense placeholders ...
indices_i = tf.placeholder(dtype=tf.int64, shape=[2, 2])
values_i = tf.placeholder(dtype=tf.float32, shape=[2])
dense_shape_i = tf.placeholder(dtype=tf.int64, shape=[2])
# ... and the SparseTensor itself is assembled inside the graph.
st = tf.SparseTensor(indices=indices_i, values=values_i, dense_shape=dense_shape_i)
W = tf.Variable(tf.random_normal([6, 6]))
y = tf.sparse_tensor_dense_matmul(sp_a=st, b=W)
init = tf.global_variables_initializer()
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
sess.run(init)
result = sess.run(y,feed_dict={indices_i:[[0, 0], [1, 2]], values_i:[1.1, 1.2], dense_shape_i:[2,6]})
Hope this helps.