Given a tensor whose shape is Nx2, how is it possible to select k elements from this tensor akin to np.random.choice (with equal probability) ? Another point to note is that the value of N dynamically changes during execution. Meaning to say that I'm dealing with a dynamically-sized tensor.

You can just wrap np.random.choice as a tf.py_func. See for example this answer. In your case, you need to flatten your tensor so it is an array of length 2*N:
import numpy as np
import tensorflow as tf
# Graph inputs: a float tensor with a dynamic number of rows and two columns,
# and the number of elements to draw from it.
a = tf.placeholder(tf.float32, shape=[None, 2])
size = tf.placeholder(tf.int32)


def _choose(values, count):
    """Draw `count` entries from the flattened array with np.random.choice."""
    return np.random.choice(values.reshape(-1), count)


# tf.py_func evaluates the NumPy sampler on the concrete arrays at run time.
y = tf.py_func(_choose, [a, size], tf.float32)

with tf.Session() as sess:
    sample = sess.run(y, {a: np.random.rand(4,2), size:5})
    print(sample)

I had a similar problem, where I wanted to subsample points from a pointcloud for an implementation of PointNet. My input dimension was [None, 2048, 3], and I was subsampling down to [None, 1024, 3] using the following custom layer:
class SubSample(Layer):
    """Randomly keep `num_samples` entries along axis 1 of the input.

    A fresh random selection is drawn on every call. The same selection is
    applied to every element of the batch, because a single 1-D mask is used.

    Args:
        num_samples: number of entries to keep along axis 1.
        **kwargs: forwarded to the base `Layer` constructor.
    """

    def __init__(self, num_samples, **kwargs):
        super(SubSample, self).__init__(**kwargs)
        # `call` reads this attribute, so it has to be stored on the instance.
        self.num_samples = num_samples

    def build(self, input_shape):
        self.shape = input_shape  # e.g. [None, 2048, 3]

    def call(self, inputs, training=None):
        # One random key per position along axis 1, e.g. shape [2048].
        k = tf.random.uniform([self.shape[1],])
        # argsort of the keys is a random permutation of 0..n-1, so exactly
        # `num_samples` of its entries are below `num_samples`.
        bl = tf.argsort(k) < self.num_samples
        res = tf.boolean_mask(inputs, bl, axis=1)  # e.g. [None, 1024, 3]
        # Reshape needed so that channel shape is passed when
        # `run_eagerly=False`, otherwise it returns `None`.
        return tf.reshape(res, (-1, self.num_samples, self.shape[-1]))
>>> TensorShape([64, 1024, 3])
As far as I can tell, this works for TensorFlow 2.5.0
Note that this isn't directly an answer to the question at hand, but the answer that I was looking for when I stumbled across this question.


Converting from PyTorch to Tensorflow for Self-Attention Pooling Layer

I have found a PyTorch implementation of the layer from the paper "Self-Attention Encoding and Pooling for Speaker Recognition", available here. However, due to CUDA compatibility issues, I can't use that code. Also, all of my code so far has been implemented in TensorFlow, so I want to do a one-to-one translation from PyTorch to TensorFlow.
First of all, this is the code in PyTorch:
class SelfAttentionPooling(nn.Module):
    """Pool a sequence of hidden states into one vector using learned weights.

    A single linear unit scores every time step; the scores are normalised
    with a softmax over time and used for a weighted sum of the inputs.
    """

    def __init__(self, input_dim):
        super(SelfAttentionPooling, self).__init__()
        self.W = nn.Linear(input_dim, 1)

    def forward(self, batch_rep):
        """
        batch_rep : size (N, T, H), N: batch size, T: sequence length, H: Hidden dimension
        att_w : size (N, T, 1)
        utter_rep: size (N, H)
        """
        scores = self.W(batch_rep).squeeze(-1)  # (N, T)
        # Name the axis explicitly: calling softmax without `dim` relies on a
        # deprecated implicit choice and emits a warning.
        att_w = nn.functional.softmax(scores, dim=-1).unsqueeze(-1)
        utter_rep = torch.sum(batch_rep * att_w, dim=1)
        return utter_rep
And this is my translation of the snippet code to Tensorflow:
# Attempted Keras port of the PyTorch SelfAttentionPooling module.
# NOTE(review): the trailing `?` on the class line is not valid Python - presumably a paste artifact; confirm.
class Self_Attention_Pooling(keras.layers.Layer): ?
def __init__(self, input_dim):
super(Self_Attention_Pooling, self).__init__()
# NOTE(review): the PyTorch version uses nn.Linear(input_dim, 1), i.e. one output unit; Dense(input_dim) creates input_dim units.
self.W = Dense(input_dim)
# NOTE(review): the method is named `forward` as in PyTorch; confirm whether Keras will ever invoke it.
def forward(self, batch_rep):
softmax = Softmax()
att_w = self.W(batch_rep)
att_w = softmax(att_w)
# Not so sure about these two lines though.
#x = np.expand(batch_rep)
#att_w = softmax(self.W(x))
# NOTE(review): this reduction uses NumPy (np.sum) rather than a TensorFlow op.
utter_rep = np.sum(batch_rep * att_w, axis=1)
return utter_rep
Is my implementation/translation/conversion from PyTorch to Tensorflow correct? If not, please edit and help me.
Thank you very much.
2 remarks regarding your implementation:
For custom layers in TF, you should implement the call method instead of the forward method cf Implementing custom layers.
For the operations you should replace the numpy functions by tensorflow functions to enable GPU support.
Here is the code I am using in TF for the SelfAttentionPooling:
import tensorflow as tf
class SelfAttentionPooling(tf.keras.layers.Layer):
    """Pool a sequence of embeddings into one vector with learned attention weights."""

    def __init__(self, **kwargs) -> None:
        # The base initializer must run before sub-layers are assigned as
        # attributes, otherwise Keras cannot track them.
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(units=1, use_bias=False)

    def call(self, x: tf.Tensor) -> tf.Tensor:
        """Apply the self attention pooling on input tensor.

        Args:
            x: input tensor (?, seq_len, emb_dim)

        Returns:
            pooled tensor (?, emb_dim)
        """
        # (?, seq_len) - squeeze only the trailing unit axis so that a batch
        # or sequence of length 1 is not collapsed as well.
        scores = tf.squeeze(self.dense(x), axis=-1)
        attention_weights = tf.nn.softmax(scores, axis=-1)
        # (?, emb_dim)
        pooled = tf.reduce_sum(tf.expand_dims(attention_weights, axis=-1) * x, axis=1)
        return pooled
You can quickly check it gives the expected output:
# Smoke test: pool a random batch and check that the sequence axis is gone.
batch_size, seq_len, emb_dim = 4, 9, 128
# (?, seq_len, emb_dim)
input_shape = batch_size, seq_len, emb_dim
self_attn_pooling = SelfAttentionPooling()
x = tf.random.normal(input_shape)
pooled = self_attn_pooling(x)
# (?, emb_dim)
assert pooled.shape == (batch_size, emb_dim)

Custom Attention Layer using in Keras

I want to create a custom attention layer that, for the input at each time step, returns the weighted mean of the inputs over all time steps.
For Example, I want that input tensor with shape [32,100,2048] goes to layer and I get the tensor with the shape [32,100,2048]. I wrote the Layer as follow:
import tensorflow as tf
from keras.layers import Layer, Dense
from tensorflow.keras.layers import Layer, Dense
# Attention layer that scores every time step with V(tanh(W(values))) and
# writes a softmax-weighted sum of `values` into a TensorArray once per step.
class Attention(Layer):
# NOTE(review): no call to the base Layer initializer is visible here - confirm whether one is required.
def __init__(self, units_att):
self.units_att = units_att
self.W = Dense(units_att)
self.V = Dense(1)
# NOTE(review): this overrides `__call__` itself rather than `call`; possibly related to the '_inbound_nodes' error reported below - confirm.
def __call__(self, values):
# Loop counter for tf.while_loop.
t = tf.constant(0, dtype= tf.int32)
# Dynamic size of axis 1; this is a tensor, not a Python int.
time_steps = tf.shape(values)[1]
initial_outputs = tf.TensorArray(dtype=tf.float32, size=time_steps)
initial_att = tf.TensorArray(dtype=tf.float32, size=time_steps)
# Loop condition: keep going until t reaches time_steps.
def should_continue(t, *args):
return t < time_steps
# Loop body. Apart from the write index, nothing here depends on `t`, so
# every iteration computes the same score, weights and context vector.
def iteration(t, values, outputs, atts):
score = self.V(tf.nn.tanh(self.W(values)))
# attention_weights shape == (batch_size, time_step, 1)
attention_weights = tf.nn.softmax(score, axis=1)
# context_vector shape after sum == (batch_size, hidden_size)
context_vector = attention_weights * values
context_vector = tf.reduce_sum(context_vector, axis=1)
outputs = outputs.write(t, context_vector)
atts = atts.write(t, attention_weights)
return t + 1, values, outputs, atts
t, values, outputs, atts = tf.while_loop(should_continue, iteration,
[t, values, initial_outputs, initial_att])
# Stack the per-step results, then swap the first two axes.
outputs = outputs.stack()
outputs = tf.transpose(outputs, [1,0,2])
atts = atts.stack()
# Drop the trailing unit axis of the stacked attention weights.
atts = tf.squeeze(atts, -1)
atts = tf.transpose(atts, [1,0,2])
# Returns the final counter, the unchanged input, and both stacked results.
return t, values, outputs, atts
For input= tf.constant(2, shape= [32, 100, 2048], dtype= tf.float32) I get the
output with shape = [32,100,2048] in tf2 and [32,None, 2048] in tf1.
For Input input= Input(shape= (None, 2048)) I get the output with shape = [None, None, 2048] in tf1 and I get error
TypeError: 'Tensor' object cannot be interpreted as an integer
in tf2.
Finally, in both cases, I can't use this layer in my model because my model input is Input(shape= (None, 2048)) and I get the error
AttributeError: 'NoneType' object has no attribute '_inbound_nodes'
in tf1 and in tf2 I get the same error as said in above, I create my model with Keras functional method.
From the code you have shared, looks like you want to implement Bahdanau's attention layer in your code. You want to attend to all the 'values' (prev layer output - all its hidden states) and your 'query' would be the last hidden state of the decoder. Your code should actually be very simple and should look like:
class Bahdanau(tf.keras.layers.Layer):
    """Additive attention: score = v(tanh(w(query) + u(values))).

    Args:
        n: number of units in the `w` and `u` projections.
    """

    def __init__(self, n):
        super().__init__()
        self.w = tf.keras.layers.Dense(n)
        self.u = tf.keras.layers.Dense(n)
        self.v = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Return `(weights, context)` for `query` attending over `values`."""
        # Insert an axis at position 1 so the projected query broadcasts
        # against the projected values.
        expanded_query = tf.expand_dims(query, 1)
        hidden = tf.nn.tanh(self.w(expanded_query) + self.u(values))
        scores = self.v(hidden)
        # Normalise the scores over axis 1.
        weights = tf.nn.softmax(scores, axis=1)
        # Weighted sum of the values over axis 1.
        context = tf.reduce_sum(weights * values, axis=1)
        return weights, context
# Say we want 10 units in the single-layer MLP that determines w and u.
attentionlayer = Bahdanau(10)
# Call with the decoder state at step t-1 (the query) and all encoder hidden
# states (the values). `stminus1` and `hj` are not defined in this snippet and
# must be supplied by the surrounding model code.
a, c = attentionlayer(stminus1, hj)
We are not specifying the tensor shape anywhere in the code. This code will return a context tensor of the same size as 'stminus1', which is the 'query'. It does this after attending to all the 'values' (all hidden states of the encoder, 'hj') using Bahdanau's attention mechanism.
So assuming your batch size is 32, timesteps=100 and embedding dimension=2048, the shape of stminus1 should be (32,2048) and the shape of the hj should be (32,100,2048). The shape of the output context would be (32,2048). We also returned the 100 attention weights just in case you want to route them to a nice display.
This is the simplest version of 'Attention'. If you have any other intent, please let me know and I will reformat my answer. For more specific details, please refer https://towardsdatascience.com/create-your-own-custom-attention-layer-understand-all-flavours-2201b5e8be9e

Backpropagating gradients through nested tf.map_fn

I would like to map a TensorFlow function on each vector corresponding to the depth channel of every pixel in a matrix with dimension [batch_size, H, W, n_channels].
In other words, for every image of size H x W that I have in the batch:
I extract some features maps F_k (whose number is n_channels) with the same size H x W (hence, the features maps all together are a tensor of shape [H, W, n_channels];
then, I wish to apply a custom function to the vector v_ij that is associated with the i-th row and j-th column of each feature map F_k, but explores the depth channel in its entirety (e.g. v has dimension [1 x 1 x n_channels]). Ideally, all of this would happen in parallel.
A picture to explain the process can be found below. The only difference with the picture is that both input and output "receptive fields" have size 1x1 (apply the function to each pixel independently).
This would be similar to applying a 1x1 convolution to the matrix; however, I need to apply a more general function over the depth channel, rather than a simple sum operation.
I think tf.map_fn() could be an option and I tried the following solution, where I recursively use tf.map_fn() to access the features associated with each pixel. However, this kind of seems sub-optimal, and most importantly it raises an error when trying to backpropagate the gradients.
Do you have any idea of the reason why this happens and how I should structure my code to avoid the error?
This is my current implementation of the function:
import tensorflow as tf
from tensorflow import layers
def apply_function_on_pixel_features(incoming):
    """Recurse over the leading axes with tf.map_fn and apply my_custom_fun to each rank-1 slice.

    At first the input is [None, W, H, n_channels]; each level of recursion
    maps over the leading axis until only a [n_channels] vector is left.
    """
    # Base case: a single feature vector of shape [n_channels].
    if len(incoming.get_shape()) <= 1:
        # Apply a transformation that returns a vector of the same size;
        # my_custom_fun() doesn't change the shape.
        return my_custom_fun(incoming)
    # Recursive case: map this same function over the leading axis.
    return tf.map_fn(apply_function_on_pixel_features, incoming)
and the body of my code:
# Build the graph: one conv layer, the per-pixel function, softmax, and a training op.
H = 128
W = 132
n_channels = 8
# Single-channel input images with a dynamic batch size.
x1 = tf.placeholder(tf.float32, [None, H, W, 1])
x2 = layers.conv2d(x1, filters=n_channels, kernel_size=3, padding='same')
# now apply a function to the feature vector associated with each pixel
x3 = apply_function_on_pixel_features(x2)
x4 = tf.nn.softmax(x3)
# NOTE(review): `cross_entropy`, `labels` and `lr` are not defined in this snippet - presumably defined elsewhere.
loss = cross_entropy(x4, labels)
optimizer = tf.train.AdamOptimizer(lr)
train_op = optimizer.minimize(loss) # <--- ERROR HERE!
Particularly, the error is the following:
File "/home/venvs/tensorflowGPU/lib/python3.6/site-packages/tensorflow/python/ops/control_flow_ops.py", line 2481, in AddOp
File "/home/venvs/tensorflowGPU/lib/python3.6/site-packages/tensorflow/python/ops/control_flow_ops.py", line 2509, in _AddOpInternal
File "/home/venvs/tensorflowGPU/lib/python3.6/site-packages/tensorflow/python/ops/control_flow_ops.py", line 2547, in _MaybeAddControlDependency
AttributeError: 'NoneType' object has no attribute 'op'
The whole error stack and the code can be found here.
Thanks for the help,
Following #thushv89 suggestion, I added a possible solution to the problem. I still don't know why my previous code didn't work. Any insight on this would still be very appreciated.
#gabriele regarding having to depend on batch_size, have you tried doing it the following way? This function does not depend on batch_size. You can replace the map_fn with anything you like.
def apply_function_on_pixel_features(incoming):
    """Apply a function to every per-pixel channel vector without nesting tf.map_fn.

    The 4-D input is flattened to rows of length C, mapped row by row, and
    reshaped back; the batch dimension is left as -1 so it may be dynamic.
    """
    # Static shape of the input; the leading (batch) entry is ignored.
    static_shape = incoming.get_shape().as_list()
    _, dim_1, dim_2, channels = static_shape
    # One row per pixel, each of shape [C].
    pixel_vectors = tf.reshape(incoming, shape=[-1, channels])
    # The mapped function keeps the row shape unchanged.
    mapped = tf.map_fn(lambda x: x+1, pixel_vectors)
    # Restore the original 4-D layout.
    return tf.reshape(mapped, shape=[-1, dim_1, dim_2, channels])
The full code of what I tested is as below.
import numpy as np
import tensorflow as tf
from tensorflow.keras.losses import categorical_crossentropy
def apply_function_on_pixel_features(incoming):
    """Apply a function to every per-pixel channel vector of a 4-D tensor.

    The input is flattened to rows of length C, the function is mapped over
    those rows, and the result is reshaped back to the input layout. The
    batch dimension is expressed as -1, so it may be dynamic.
    """
    # get input shape:
    _, W, H, C = incoming.get_shape().as_list()
    # Keep the channel axis: flattening to [-1] would make map_fn iterate over
    # individual scalars instead of per-pixel vectors of shape [C].
    incoming_flat = tf.reshape(incoming, shape=[-1, C])
    # apply function on every vector of shape [C]
    out_matrix = tf.map_fn(lambda x: x+1, incoming_flat)  # dimension remains unchanged
    # go back to the input shape [None, W, H, C]
    out_matrix = tf.reshape(out_matrix, shape=[-1, W, H, C])
    return out_matrix
# End-to-end check that gradients flow through apply_function_on_pixel_features.
H = 32
W = 32
x1 = tf.placeholder(tf.float32, [None, H, W, 1])
labels = tf.placeholder(tf.float32, [None, 10])
x2 = tf.layers.conv2d(x1, filters=1, kernel_size=3, padding='same')
# now apply a function to the feature vector associated with each pixel
x3 = apply_function_on_pixel_features(x2)
x4 = tf.layers.flatten(x3)
x4 = tf.layers.dense(x4, units=10, activation='softmax')
loss = categorical_crossentropy(labels, x4)
optimizer = tf.train.AdamOptimizer(0.001)
train_op = optimizer.minimize(loss)
# Created after minimize() so that it also covers the optimizer's variables.
init_op = tf.global_variables_initializer()

# Dummy batch of 10 blank images with random 0/1 targets.
x = np.zeros(shape=(10, H, W, 1))
y = np.random.choice([0,1], size=(10, 10))
with tf.Session() as sess:
    # Graph-mode variables must be initialised before the first training step.
    sess.run(init_op)
    sess.run(train_op, feed_dict={x1: x, labels:y})
Following #thushv89's suggestion, I reshaped the array, applied the function and then reshaped it back (so as to avoid the tf.map_fn recursion). I still don't know exactly why the previous code didn't work, but the current implementation allows the gradients to propagate back to the previous layers. I'll leave it below for anyone who might be interested:
def apply_function_on_pixel_features(incoming, batch_size):
    """Apply my_custom_fun to all per-pixel channel vectors in one call.

    Args:
        incoming: 4-D tensor whose last three static dimensions are known.
        batch_size: size of the leading dimension, used to build both the
            flattened and the restored shape explicitly.

    Returns:
        The output of my_custom_fun reshaped to [batch_size, W, H, C].
    """
    # Static shape of the input; the leading entry is replaced by batch_size.
    _, dim_1, dim_2, channels = incoming.get_shape().as_list()
    # One row per pixel across the whole batch.
    flat_shape = [batch_size * dim_1 * dim_2, channels]
    # my_custom_fun leaves the dimensions unchanged.
    transformed = my_custom_fun(tf.reshape(incoming, shape=flat_shape))
    # Go back to the 4-D input layout.
    restored_shape = tf.convert_to_tensor([batch_size, dim_1, dim_2, channels])
    return tf.reshape(transformed, shape=restored_shape)
Notice that now I needed to give the batch size to correctly reshape the tensor because TensorFlow would complain if I gave None or -1 as a dimension.
Any comments and insight on the above code would still be very appreciated.

How can I create a `tf.data.Dataset` for a `tf.keras` model that accepts multiple inputs of various shapes?

I have a tf.keras model that needs to accept multiple inputs of multiple shapes. My goal is to build it in such a way that I can train and evaluate it easily using its fit and evaluate API.
So far, the model is built as follows:
class MultipleLSTM(Model):
    """Model that runs one encoder per input and classifies the concatenated result.

    Args:
        lstm_dims: size passed to every encoder factory call.
        name: model name.
        **kwargs: one encoder configuration per input; each value must at
            least provide a 'name' entry. Encoders are created in the order
            the keyword arguments are given.
    """

    def __init__(self, lstm_dims=128, name='multi_lstm', **kwargs):
        super(MultipleLSTM, self).__init__(name=name)
        # initialize encoders for every attribute (only the values are needed)
        self.encoders = [
            self._create_encoder(lstm_dims, conf) for conf in kwargs.values()
        ]
        # initialize the rest of the network layers
        self.concat = Concatenate(axis=0)
        self.conv_1 = Conv2D(6, 4, activation='relu')
        self.flatten = Flatten()
        self.dense = Dense(128, activation='relu')
        self.out = Dense(1, activation='sigmoid')

    def call(self, inputs):
        # Pair the i-th encoder with the i-th input, for however many encoders
        # were configured, instead of hard-coding exactly four.
        encoded = [
            encoder(inputs[index])
            for index, encoder in enumerate(self.encoders)
        ]
        x = self.concat(encoded)
        # fix the shape for the convolutions
        x = tf.expand_dims(x, axis=0)
        x = tf.expand_dims(x, axis=3)
        x = self.conv_1(x)
        x = self.flatten(x)
        x = self.dense(x)
        x = self.out(x)
        return x

    def _create_encoder(self, lstm_dims, conf):
        # NOTE(review): no layers are added to the Sequential in the visible
        # code and `lstm_dims` is unused here - presumably the Embedding/LSTM
        # layers were elided from this snippet; confirm.
        with tf.name_scope(conf['name']) as scope:
            encoder = tf.keras.Sequential(name=scope)
        return encoder
There are four different inputs, text sentences of different lengths, that are fed to four different Embedding and LSTM layers (encoders). Then the outputs of those layers are concatenated to create a single tensor that is forwarded to the subsequent layers.
To train this network, I'm passing as input a list of lists, for the different tokenized sentences. The label is just number, 0 or 1 (binary classification). For example, an input could be:
# One training example: four tokenized sentences of differing lengths.
x = [[1, 2, 3, 4],
[2, 3, 5],
[3, 5, 6, 7],
[1, 5, 7]]
# Binary class label for this example.
y = 0
For now, I have implemented a custom loop that takes such input and trains the network:
def train(data, model, loss_fn, optimizer, metric, epochs=10, print_every=50):
    """Run a custom training loop and periodically print the running metric.

    Args:
        data: iterable yielding `(x_batch, y_batch)` pairs.
        model: callable model exposing `trainable_weights`.
        loss_fn: callable `loss_fn(y_true, y_pred)` returning the loss.
        optimizer: optimizer exposing `apply_gradients`.
        metric: running metric exposing `update_state` and `result`
            (assumes a mean-style metric such as tf.keras.metrics.Mean - TODO confirm).
        epochs: number of passes over `data`.
        print_every: print the metric every this many steps.
    """
    for epoch in range(epochs):
        print(f'Start of epoch {epoch+1}')
        for step, (x_batch, y_batch) in enumerate(data):
            # Record the forward pass so gradients can be taken afterwards.
            with GradientTape() as tape:
                output = model(x_batch)
                loss = loss_fn(y_batch, output)
            grads = tape.gradient(loss, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))
            # Feed the metric with this step's loss; otherwise the value
            # printed below never reflects training.
            metric.update_state(loss)
            if step % print_every == 0:
                print(f'step {step}: mean loss = {metric.result()}')
But this prevents me from exploiting the easy to use tf.keras API, to fit and evaluate the model or even split the dataset into train and test sets. Thus, the question is: How can I create a tf.data.Dataset from such x's and y's and pass it to the fit function of tf.keras?
You can use the functional api of keras to do so. Here is the link of the keras documentation on multi input, output if you want : Multi-input and multi-output models
You can directly pass the different inputs as a list and fit and evaluate methods.
# Pass one array per model input as a list; the second argument is the target data.
model.fit([X_train[:,0], X_train[:,1]], y_train, ...)

TensorFlow: How to embed float sequences to fixed size vectors?

I am looking for methods to embed variable-length sequences of float values into fixed-size vectors. The input format is as follows:
[f1,f2,f3,f4]->[f1,f2,f3,f4]->[f1,f2,f3,f4]-> ... -> [f1,f2,f3,f4]
[f1,f2,f3,f4]->[f1,f2,f3,f4]->[f1,f2,f3,f4]->[f1,f2,f3,f4]-> ... -> [f1,f2,f3,f4]
[f1,f2,f3,f4]-> ... -> ->[f1,f2,f3,f4]
Each line is a variable-length sequence with a maximum length of 60. Each unit in a sequence is a tuple of 4 float values. I have already padded all sequences with zeros so that they have the same length.
The following architecture seems solve my problem if I use the output as the same as input, I need the thought vector in the center as the embedding for the sequences.
In TensorFlow, I have found two candidate methods: tf.contrib.legacy_seq2seq.basic_rnn_seq2seq and tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq.
However, these two methods seem to be intended for NLP problems, where the input must be discrete values representing words.
So, is there another functions to solve my problems?
All you need is an RNN, not a seq2seq model, since seq2seq comes with an additional decoder which is unnecessary in your case.
An example code:
import numpy as np
import tensorflow as tf
from tensorflow.contrib import rnn
input_size = 4
max_length = 60
hidden_size = 64  # example size of the LSTM state, i.e. of the fixed-size embedding
output_size = 4

# Zero-padded batch of sequences plus the true length of each sequence.
x = tf.placeholder(tf.float32, shape=[None, max_length, input_size], name='x')
seqlen = tf.placeholder(tf.int64, shape=[None], name='seqlen')

lstm_cell = rnn.BasicLSTMCell(hidden_size, forget_bias=1.0)
outputs, states = tf.nn.dynamic_rnn(cell=lstm_cell, inputs=x, sequence_length=seqlen, dtype=tf.float32)
# Last element of the final state tuple returned by dynamic_rnn.
encoded_states = states[-1]

# Linear projection from the encoded state to the output size.
W = tf.get_variable(
    name='W',
    shape=[hidden_size, output_size],
    dtype=tf.float32)
b = tf.get_variable(
    name='b',
    shape=[output_size],
    dtype=tf.float32)
z = tf.matmul(encoded_states, W) + b
results = tf.sigmoid(z)

## cost computing and training components goes here
# e.g.
# targets = tf.placeholder(tf.float32, shape=[None, input_size], name='targets')
# cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=z))
# optimizer = tf.train.AdamOptimizer(learning_rate=0.1).minimize(cost)

init = tf.global_variables_initializer()

# Four zero-padded example sequences of lengths 4, 6, 20 and 60.
batch_size = 4
data_in = np.zeros((batch_size, max_length, input_size), dtype='float32')
data_in[0, :4, :] = np.random.rand(4, input_size)
data_in[1, :6, :] = np.random.rand(6, input_size)
data_in[2, :20, :] = np.random.rand(20, input_size)
data_in[3, :, :] = np.random.rand(60, input_size)
data_len = np.asarray([4, 6, 20, 60], dtype='int64')

with tf.Session() as sess:
    # Variables must be initialised before anything that reads them is run.
    sess.run(init)
    # training process goes here
    res = sess.run(results,
                   feed_dict={
                       x: data_in,
                       seqlen: data_len})
To encode sequence to a fixed length vector you typically use recurrent neural networks (RNNs) or convolutional neural networks (CNNs).
If you use a recurrent neural network you can use the output at the last time step (the last element in your sequence). This corresponds to the thought vector in your question. Have a look at tf.nn.dynamic_rnn. dynamic_rnn requires you to specify the type of RNN cell you want to use. tf.contrib.rnn.LSTMCell and tf.contrib.rnn.GRUCell are the most common.
If you want to use CNNs you need to use 1 dimensional convolutions. To build CNNs you need tf.layers.conv1d and tf.layers.max_pooling1d
I have found a solution to my problem, using the following architecture,
The LSTMs layer below encode the series x1,x2,...,xn. The last output, the green one, is duplicated to the same count as the input for the decoding LSTM layers above. The tensorflow code is as following
# LSTM autoencoder: encode the series, repeat the last encoder output once per
# time step, decode it, and compare the reconstruction with the input.
# `conf` is a configuration object that is not defined in this snippet.
series_input = tf.placeholder(tf.float32, [None, conf.max_series, conf.series_feature_num])
print("Encode input Shape", series_input.get_shape())

# encoding layer
encode_cell = tf.contrib.rnn.MultiRNNCell(
    [tf.contrib.rnn.BasicLSTMCell(conf.rnn_hidden_num, reuse=False) for _ in range(conf.rnn_layer_num)]
)
encode_output, _ = tf.nn.dynamic_rnn(encode_cell, series_input, dtype=tf.float32, scope='encode')
print("Encode output Shape", encode_output.get_shape())

# last output: swap the first two axes so the time axis comes first, then
# pick the final time step
encode_output = tf.transpose(encode_output, [1, 0, 2])
last = tf.gather(encode_output, int(encode_output.get_shape()[0]) - 1)

# duplicate the last output of the encoding layer, once per time step
decoder_input = tf.stack([last for _ in range(conf.max_series)], axis=1)
print("Decoder input shape", decoder_input.get_shape())

# decoding layer
decode_cell = tf.contrib.rnn.MultiRNNCell(
    [tf.contrib.rnn.BasicLSTMCell(conf.series_feature_num, reuse=False) for _ in range(conf.rnn_layer_num)]
)
decode_output, _ = tf.nn.dynamic_rnn(decode_cell, decoder_input, dtype=tf.float32, scope='decode')
print("Decode output", decode_output.get_shape())

# Loss Function: reconstruction error between the input and the decoder output
loss = tf.losses.mean_squared_error(labels=series_input, predictions=decode_output)
print("Loss", loss)