How to make a Keras Dense Layer deal with 3D tensor as input for this Softmax Fully Connected Layer? - tensorflow

I am working on a custom problem, and i have to change the fully connected layer (Dense with softmax), My model code is something like this (with Keras Framework):
.......
batch_size = 8
inputs = tf.random.uniform(shape=[batch_size,1024,256],dtype=tf.dtypes.float32)
preds = Dense(num_classes,activation='softmax')(x) #final layer with softmax activation
....
model = Model(inputs=base_model.input,outputs=preds)
So, i have to change the Code of Dense Layer to output a Tensor of probabilities with the shape of [batch_size, 1024, num_classes], without using a for loop, i need it to be optimized and not a consuming time function
The Dense code version that i want to change:
class Dense(Layer):
"""Just your regular densely-connected NN layer.
`Dense` implements the operation:
`output = activation(dot(input, kernel) + bias)`
where `activation` is the element-wise activation function
passed as the `activation` argument, `kernel` is a weights matrix
created by the layer, and `bias` is a bias vector created by the layer
(only applicable if `use_bias` is `True`).
Note: if the input to the layer has a rank greater than 2, then
it is flattened prior to the initial dot product with `kernel`.
# Example
```python
# as first layer in a sequential model:
model = Sequential()
model.add(Dense(32, input_shape=(16,)))
# now the model will take as input arrays of shape (*, 16)
# and output arrays of shape (*, 32)
# after the first layer, you don't need to specify
# the size of the input anymore:
model.add(Dense(32))
```
# Arguments
units: Positive integer, dimensionality of the output space.
activation: Activation function to use
(see [activations](../activations.md)).
If you don't specify anything, no activation is applied
(ie. "linear" activation: `a(x) = x`).
use_bias: Boolean, whether the layer uses a bias vector.
kernel_initializer: Initializer for the `kernel` weights matrix
(see [initializers](../initializers.md)).
bias_initializer: Initializer for the bias vector
(see [initializers](../initializers.md)).
kernel_regularizer: Regularizer function applied to
the `kernel` weights matrix
(see [regularizer](../regularizers.md)).
bias_regularizer: Regularizer function applied to the bias vector
(see [regularizer](../regularizers.md)).
activity_regularizer: Regularizer function applied to
the output of the layer (its "activation").
(see [regularizer](../regularizers.md)).
kernel_constraint: Constraint function applied to
the `kernel` weights matrix
(see [constraints](../constraints.md)).
bias_constraint: Constraint function applied to the bias vector
(see [constraints](../constraints.md)).
# Input shape
nD tensor with shape: `(batch_size, ..., input_dim)`.
The most common situation would be
a 2D input with shape `(batch_size, input_dim)`.
# Output shape
nD tensor with shape: `(batch_size, ..., units)`.
For instance, for a 2D input with shape `(batch_size, input_dim)`,
the output would have shape `(batch_size, units)`.
"""
def __init__(self, units,
activation=None,
use_bias=True,
kernel_initializer='glorot_uniform',
bias_initializer='zeros',
kernel_regularizer=None,
bias_regularizer=None,
activity_regularizer=None,
kernel_constraint=None,
bias_constraint=None,
**kwargs):
if 'input_shape' not in kwargs and 'input_dim' in kwargs:
kwargs['input_shape'] = (kwargs.pop('input_dim'),)
super(Dense, self).__init__(**kwargs)
self.units = units
self.activation = activations.get(activation)
self.use_bias = use_bias
self.kernel_initializer = initializers.get(kernel_initializer)
self.bias_initializer = initializers.get(bias_initializer)
self.kernel_regularizer = regularizers.get(kernel_regularizer)
self.bias_regularizer = regularizers.get(bias_regularizer)
self.activity_regularizer = regularizers.get(activity_regularizer)
self.kernel_constraint = constraints.get(kernel_constraint)
self.bias_constraint = constraints.get(bias_constraint)
self.input_spec = InputSpec(min_ndim=2)
self.supports_masking = True
def build(self, input_shape):
assert len(input_shape) >= 2
input_dim = input_shape[-1]
self.kernel = self.add_weight(shape=(input_dim, self.units),
initializer=self.kernel_initializer,
name='kernel',
regularizer=self.kernel_regularizer,
constraint=self.kernel_constraint)
if self.use_bias:
self.bias = self.add_weight(shape=(self.units,),
initializer=self.bias_initializer,
name='bias',
regularizer=self.bias_regularizer,
constraint=self.bias_constraint)
else:
self.bias = None
self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim})
self.built = True
def call(self, inputs):
output = K.dot(inputs, self.kernel)
if self.use_bias:
output = K.bias_add(output, self.bias)
if self.activation is not None:
output = self.activation(output)
return output
def compute_output_shape(self, input_shape):
assert input_shape and len(input_shape) >= 2
assert input_shape[-1]
output_shape = list(input_shape)
output_shape[-1] = self.units
return tuple(output_shape)
def get_config(self):
config = {
'units': self.units,
'activation': activations.serialize(self.activation),
'use_bias': self.use_bias,
'kernel_initializer': initializers.serialize(self.kernel_initializer),
'bias_initializer': initializers.serialize(self.bias_initializer),
'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
'bias_regularizer': regularizers.serialize(self.bias_regularizer),
'activity_regularizer': regularizers.serialize(self.activity_regularizer),
'kernel_constraint': constraints.serialize(self.kernel_constraint),
'bias_constraint': constraints.serialize(self.bias_constraint)
}
base_config = super(Dense, self).get_config()
return dict(list(base_config.items()) + list(config.items()))

There are three different ways in which this can be done (that I can think of). If you want to have a single dense layer, that maps a vector of 256 elements to a vector of num_classes elements, and apply it all across your batch of data (that is, use the same 256 x num_classes matrix of weights for every sample), then you don't need to do anything special, just use a regular Dense layer:
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense
batch_size = 8
num_classes = 10
inp = Input(shape=(1024, 256))
layer = Dense(num_classes, activation='softmax')
out = layer(inp)
print(out.shape)
# (None, 1024, 10)
print(layer.count_params())
# 2570
Another way would be to have a single huge Dense layer that takes all 1024 * 256 values in at the same time and produces all 1024 * num_classes values at the output, that is, a layer with a matrix of weights with shape (1024 * 256) x (1024 * num_classes) (in the order if gigabytes of memory!). This is easy to do too, although it seems unlikely to be what you need:
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.layers import Flatten, Dense, Reshape, Softmax
batch_size = 8
num_classes = 10
inp = Input(shape=(1024, 256))
res = Flatten()(inp)
# This takes _a lot_ of memory!
layer = Dense(1024 * num_classes, activation=None)
out_res = layer(res)
# Apply softmax after reshaping
out_preact = Reshape((-1, num_classes))(out_res)
out = Softmax()(out_preact)
print(out.shape)
# (None, 1024, 10)
print(layer.count_params())
# 2684364800
Finally, you may want to have a set of 1024 weight matrices, each one applied to the corresponding sample in the input, which would imply an array of weights with shape (1024, 256, num_classes). I don't think this can be done with one of the standard Keras layers (or don't know how to)1, but it's easy enough to write a custom layer based on Dense to do that:
import tensorflow as tf
from tensorflow.keras.layers import Dense, InputSpec
class Dense2D(Dense):
def __init__(self, *args, **kwargs):
super(Dense2D, self).__init__(*args, **kwargs)
def build(self, input_shape):
assert len(input_shape) >= 3
input_dim1 = input_shape[-2]
input_dim2 = input_shape[-1]
self.kernel = self.add_weight(shape=(input_dim1, input_dim2, self.units),
initializer=self.kernel_initializer,
name='kernel',
regularizer=self.kernel_regularizer,
constraint=self.kernel_constraint)
if self.use_bias:
self.bias = self.add_weight(shape=(input_dim1, self.units),
initializer=self.bias_initializer,
name='bias',
regularizer=self.bias_regularizer,
constraint=self.bias_constraint)
else:
self.bias = None
self.input_spec = InputSpec(min_ndim=3, axes={-2: input_dim1, -1: input_dim2})
self.built = True
def call(self, inputs):
# Multiply each set of weights with each input element
output = tf.einsum('...ij,ijk->...ik', inputs, self.kernel)
if self.use_bias:
output += self.bias
if self.activation is not None:
output = self.activation(output)
return output
def compute_output_shape(self, input_shape):
assert input_shape and len(input_shape) >= 3
assert input_shape[-1]
output_shape = list(input_shape)
output_shape[-1] = self.units
return tuple(output_shape)
You would then use it like this:
import tensorflow as tf
from tensorflow.keras import Input
batch_size = 8
num_classes = 10
inp = Input(shape=(1024, 256))
layer = Dense2D(num_classes, activation='softmax')
out = layer(inp)
print(out.shape)
# (None, 1024, 10)
print(layer.count_params())
# 2631680
1: As today points out in the comments, you can actually use a LocallyConnected1D layer to do the same that I tried to do with my Dense2D layer. It is as simple as this:
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.layers import LocallyConnected1D
batch_size = 8
num_classes = 10
inp = Input(shape=(1024, 256))
layer = LocallyConnected1D(num_classes, 1, activation='softmax')
out = layer(inp)
print(out.shape)
# (None, 1024, 10)
print(layer.count_params())
# 2631680

Related

Custom Attention Layer using in Keras

I want to create a custom attention layer that for input at any time this layer returns the weighted mean of inputs at all time inputs.
For Example, I want that input tensor with shape [32,100,2048] goes to layer and I get the tensor with the shape [32,100,2048]. I wrote the Layer as follow:
import tensorflow as tf
from keras.layers import Layer, Dense
#or
from tensorflow.keras.layers import Layer, Dense
class Attention(Layer):
def __init__(self, units_att):
self.units_att = units_att
self.W = Dense(units_att)
self.V = Dense(1)
super().__init__()
def __call__(self, values):
t = tf.constant(0, dtype= tf.int32)
time_steps = tf.shape(values)[1]
initial_outputs = tf.TensorArray(dtype=tf.float32, size=time_steps)
initial_att = tf.TensorArray(dtype=tf.float32, size=time_steps)
def should_continue(t, *args):
return t < time_steps
def iteration(t, values, outputs, atts):
score = self.V(tf.nn.tanh(self.W(values)))
# attention_weights shape == (batch_size, time_step, 1)
attention_weights = tf.nn.softmax(score, axis=1)
# context_vector shape after sum == (batch_size, hidden_size)
context_vector = attention_weights * values
context_vector = tf.reduce_sum(context_vector, axis=1)
outputs = outputs.write(t, context_vector)
atts = atts.write(t, attention_weights)
return t + 1, values, outputs, atts
t, values, outputs, atts = tf.while_loop(should_continue, iteration,
[t, values, initial_outputs, initial_att])
outputs = outputs.stack()
outputs = tf.transpose(outputs, [1,0,2])
atts = atts.stack()
atts = tf.squeeze(atts, -1)
atts = tf.transpose(atts, [1,0,2])
return t, values, outputs, atts
For input= tf.constant(2, shape= [32, 100, 2048], dtype= tf.float32) I get the
output with shape = [32,100,2048] in tf2 and [32,None, 2048] in tf1.
For Input input= Input(shape= (None, 2048)) I get the output with shape = [None, None, 2048] in tf1 and I get error
TypeError: 'Tensor' object cannot be interpreted as an integer
in tf2.
Finally, in both cases, I can't use this layer in my model because my model input is Input(shape= (None, 2048)) and I get the error
AttributeError: 'NoneType' object has no attribute '_inbound_nodes'
in tf1 and in tf2 I get the same error as said in above, I create my model with Keras functional method.
From the code you have shared, looks like you want to implement Bahdanau's attention layer in your code. You want to attend to all the 'values' (prev layer output - all its hidden states) and your 'query' would be the last hidden state of the decoder. Your code should actually be very simple and should look like:
class Bahdanau(tf.keras.layers.Layer):
def __init__(self, n):
super(Bahdanau, self).__init__()
self.w = tf.keras.layers.Dense(n)
self.u = tf.keras.layers.Dense(n)
self.v = tf.keras.layers.Dense(1)
def call(self, query, values):
query = tf.expand_dims(query, 1)
e = self.v(tf.nn.tanh(self.w(query) + self.u(values)))
a = tf.nn.softmax(e, axis=1)
c = a * values
c = tf.reduce_sum(c, axis=1)
return a,c
##Say we want 10 units in the single layer MLP determining w,u
attentionlayer = Bahdanau(10)
##Call with i/p: decoderstate # t-1 and all encoder hidden states
a, c = attentionlayer(stminus1, hj)
We are not specifying the tensor shape anywhere in the code. This code will return you a context tensor of same size as 'stminus1' which is the 'query'. It does this after attending to all the 'values' (all output states of decoder) using Bahdanau's attention mechanism.
So assuming your batch size is 32, timesteps=100 and embedding dimension=2048, the shape of stminus1 should be (32,2048) and the shape of the hj should be (32,100,2048). The shape of the output context would be (32,2048). We also returned the 100 attention weights just in case you want to route them to a nice display.
This is the simplest version of 'Attention'. If you have any other intent, please let me know and I will reformat my answer. For more specific details, please refer https://towardsdatascience.com/create-your-own-custom-attention-layer-understand-all-flavours-2201b5e8be9e

Converting keras functional model to keras class in tensorflow 2

I am trying to convert a Keras functional model into class derived from tensorflow.keras.models.Model and I'm facing 2 issues.
1. I need to multiply 2 layers using tensorflow.keras.layers.multiply, but it returns a ValueError: A merge layer should be called on a list of inputs.
2. If I remove this layern thus working with a classical CNN, it returns a tensorflow.python.eager.core._SymbolicException:Inputs to eager execution function cannot be Keras symbolic tensors, but found [<tf.Tensor 'patch:0' shape=(None, 64, 64, 3) dtype=float32>].
I would appreciate some guidance to convert my code. I'm using Python 3.7, TensorFlow 2.0rc2 and Keras 2.3.0. The class I have defined is the following:
class TestCNN(Model):
"""
conv1 > conv2 > fc1 > fc2 > alpha * fc2 > Sigmoid > output
"""
def __init__(self, input_dimension, n_category,**kwargs):
"""
Instanciator
:param input_dimension: tuple of int, theoretically (patch_size x patch_size x channels)
:param n_category: int, the number of categories to classify,
:param weight_decay: float, weight decay parameter for all the kernel regularizers
:return: the Keras model
"""
super(TestCNN, self).__init__(name='testcnn', **kwargs)
self.input_dimension = input_dimension
self.n_category = n_category
self.conv1 = Conv2D(36, activation='relu', name='conv1/relu')
self.conv1_maxpooling = MaxPooling2D((2, 2), name='conv1/maxpooling')
self.conv2 = Conv2D(48, activation='relu', name='conv2/relu')
self.conv2_maxpooling = MaxPooling2D((2, 2), name='conv2/maxpooling')
self.flatten1 = Flatten(name='flatten1')
self.fc1 = Dense(512, activation='relu', name='fc1/relu')
self.fc2 = Dense(512, activation='relu', name='fc2/relu')
self.alpha = TestLayer(layer_dim=128, name='alpha')
self.output1 = TestSigmoid(output_dimension=n_category, name='output_layer')
#tensorflow.function
def call(self, x):
x = self.conv1(x)
x = self.conv1_maxpooling(x)
x = self.conv2(x)
x = self.conv2_maxpooling(x)
x = self.flatten1(x)
x = self.fc1(x)
x = self.fc2(x)
alpha_times_fc2 = multiply([alpha_output, fc2_output], name='alpha_times_fc2')
return self.output1(alpha_times_fc2)
def build(self, **kwargs):
inputs = Input(shape=self.input_dimension, dtype='float32', name='patch')
outputs = self.call(inputs)
super(TestCNN, self).__init__(name="TestCNN", inputs=inputs, outputs=outputs, **kwargs)
Then, in my main loop, I'm creating the instance as following:
testcnn = TestCNN(input_dimension=input_dimension, n_category=training_set.category_count)
optimizer = tensorflow.keras.optimizers.Adam(
lr=parameter['training']['adam']['learning_rate'],
beta_1=parameter['training']['adam']['beta1'],
beta_2=parameter['training']['adam']['beta2'])
metrics_list = [tensorflow.keras.metrics.TruePositives]
loss_function = tensorflow.keras.losses.categorical_crossentropy
loss_metrics = tensorflow.keras.metrics.Mean()
testcnn.build()
testcnn.summary()
This code is raising the tensorflow.python.eager.core._SymbolicException. If I comment out some lines and return directly the results of the fc2 layer, I've got the ValueError.
I have commenter the build() function in my model and call it in my main script as following:
testcnn.build(input_dimension)
testcnn.compile(optimizer=adam_optimizer, loss=loss_function, metrics=metrics_list)
testcnn.summary()
Input dimension is a list formatted as following:
input_dimension = (batch_size, image_size, image_size, channels)

How to use a pre-trained embedding matrix in tensorflow 2.0 RNN as initial weights in an embedding layer?

I'd like to use a pretrained GloVe embedding as the initial weights for an embedding layer in an RNN encoder/decoder. The code is in Tensorflow 2.0. Simply adding the embedding matrix as a weights = [embedding_matrix] parameter to the tf.keras.layers.Embedding layer won't do it because the encoder is an object and I'm not sure now to effectively pass the embedding_matrix to this object at training time.
My code closely follows the neural machine translation example in the Tensorflow 2.0 documentation. How would I add a pre-trained embedding matrix to the encoder in this example? The encoder is an object. When I get to training, the GloVe embedding matrix is unavailable to the Tensorflow graph. I get the error message:
RuntimeError: Cannot get value inside Tensorflow graph function.
The code uses the GradientTape method and teacher forcing in the training process.
I've tried modifying the encoder object to include the embedding_matrix at various points, including in the encoder's init, call and initialize_hidden_state. All of these fail. The other questions on stackoverflow and elsewhere are for Keras or older versions of Tensorflow, not Tensorflow 2.0.
class Encoder(tf.keras.Model):
def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
super(Encoder, self).__init__()
self.batch_sz = batch_sz
self.enc_units = enc_units
self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix])
self.gru = tf.keras.layers.GRU(self.enc_units,
return_sequences=True,
return_state=True,
recurrent_initializer='glorot_uniform')
def call(self, x, hidden):
x = self.embedding(x)
output, state = self.gru(x, initial_state = hidden)
return output, state
def initialize_hidden_state(self):
return tf.zeros((self.batch_sz, self.enc_units))
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))
# ... Bahdanau Attention, Decoder layers, and train_step defined, see link to full tensorflow code above ...
# Relevant training code
EPOCHS = 10
training_record = pd.DataFrame(columns = ['epoch', 'training_loss', 'validation_loss', 'epoch_time'])
for epoch in range(EPOCHS):
template = 'Epoch {}/{}'
print(template.format(epoch +1,
EPOCHS))
start = time.time()
enc_hidden = encoder.initialize_hidden_state()
total_loss = 0
total_val_loss = 0
for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
batch_loss = train_step(inp, targ, enc_hidden)
total_loss += batch_loss
if batch % 100 == 0:
template = 'batch {} ============== train_loss: {}'
print(template.format(batch +1,
round(batch_loss.numpy(),4)))
I was trying to do the same thing and getting the exact same error. The problem was that weights in the Embedding layer is currently deprecated. Changing weights= to embeddings_initializer= worked for me.
self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,
embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
trainable=False)
firslty : load pretrained embedding matrix using
def pretrained_embeddings(file_path, EMBEDDING_DIM, VOCAB_SIZE, word2idx):
# 1.load in pre-trained word vectors #feature vector for each word
print("graph in function",tf.get_default_graph())
print('Loading word vectors...')
word2vec = {}
with open(os.path.join(file_path+'.%sd.txt' % EMBEDDING_DIM), errors='ignore', encoding='utf8') as f:
# is just a space-separated text file in the format:
# word vec[0] vec[1] vec[2] ...
for line in f:
values = line.split()
word = values[0]
vec = np.asarray(values[1:], dtype='float32')
word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))
# 2.prepare embedding matrix
print('Filling pre-trained embeddings...')
num_words = VOCAB_SIZE
# initialization by zeros
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx.items():
if i < VOCAB_SIZE:
embedding_vector = word2vec.get(word)
if embedding_vector is not None:
# words not found in embedding index will be all zeros.
embedding_matrix[i] = embedding_vector
return embedding_matrix
2-then update Encoder class as following:
class Encoder(tf.keras.Model):
def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz,embedding_matrix):
super(Encoder, self).__init__()
self.batch_sz = batch_sz
self.enc_units = enc_units
self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix])
self.gru = tf.keras.layers.GRU(self.enc_units,
return_sequences=True,
return_state=True,
recurrent_initializer='glorot_uniform')
def call(self, x, hidden):
x = self.embedding(x)
output, state = self.gru(x, initial_state = hidden)
return output, state
def initialize_hidden_state(self):
return tf.zeros((self.batch_sz, self.enc_units))
3-calling function that loads pre-trained embedding to get embedding matrix
embedding_matrix = pretrained_embeddings(file_path, EMBEDDING_DIM,vocab_size, word2idx)
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE,embedding_matrix)
# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))
Note : this works on tensorflow 1.13.1 well

Tensorflow multi-label with NCE or sampled softmax

Are there any code examples for using Tensorflow's sampled_softmax_loss or nce_loss functions with multi-label problems? That is, where num_true is more than one?
What follows is my attempt to create a wrapper for nce_loss() and sampled_softmax_loss() based Jeff Chao's work (https://github.com/joelthchao/keras). In the following code, if you change num_true to 1, both samplers work. But with num_true > 1, both samplers throw slightly different exceptions involving tensor shape.
The main program is a simple autoencoder that replicates the class of problem I'm trying to solve: multi-label testing with a huge number of output classes, with a Zipfian distribution. Comments and stack trace at the end.
import tensorflow as tf
import numpy as np
import keras.layers as layers
from keras.models import Model
from keras import backend as K
from keras import initializers,regularizers,constraints
from keras.models import Model
from keras.layers import Dense
from keras.engine.base_layer import InputSpec
from keras.engine.topology import Layer
from keras.engine.input_layer import Input
from tensorflow.keras.optimizers import Nadam, Adam
np.random.seed(10)
import random
def nce_loss_function(weights, biases, labels, inputs, num_sampled, num_classes, num_true):
if K.learning_phase() == 1:
loss = tf.nn.nce_loss(weights, biases, labels, inputs, num_sampled, num_classes, num_true,
partition_strategy="div")
else:
logits = tf.matmul(inputs, tf.transpose(weights))
logits = tf.nn.bias_add(logits, biases)
labels_one_hot = tf.one_hot(labels, num_classes)
loss = tf.nn.sigmoid_cross_entropy_with_logits(
labels=labels_one_hot[:][0][:],
logits=logits)
loss = tf.reduce_sum(loss, axis=1)
return loss
def sampled_softmax_loss_function(weights, biases, labels, inputs, num_sampled, num_classes, num_true):
if K.learning_phase() == 1:
return tf.nn.sampled_softmax_loss(weights, biases, labels, inputs, num_sampled, num_classes, num_true,
partition_strategy="div")
else:
logits = tf.matmul(inputs, tf.transpose(weights))
logits = tf.nn.bias_add(logits, biases)
labels_one_hot = tf.one_hot(labels, num_classes)
loss = tf.nn.softmax_cross_entropy_with_logits_v2(
labels=labels_one_hot,
logits=logits)
return loss
class Sampling(Layer):
"""Regular densely-connected NN layer with various sampling Loss.
`Sampling` implements the operation:
`output = dot(input, kernel) + bias`
`kernel` is a weights matrix created by the layer, and `bias` is a bias vector
created by the layer. Also, it adds a sampling Loss to the model.
See [reference](http://proceedings.mlr.press/v9/gutmann10a/gutmann10a.pdf).
# Example
```python
inputs = Input(shape=(4,))
target = Input(shape=(1,)) # sparse format, e.g. [1, 3, 2, 6, ...]
net = Dense(8)(inputs)
net = Sampling(units=128, num_sampled=32)([net, target])
model = Model(inputs=[inputs, target], outputs=net)
model.compile(optimizer='adam', loss=None)
x = np.random.rand(1000, 4)
y = np.random.randint(128, size=1000)
model.fit([x, y], None)
```
# Arguments
units: Positive integer, dimensionality of the output space (num classes).
num_sampled: Positive integer, number of classes to sample in Sampling Loss.
type: 'sampled_softmax', 'nce'
num_true: Max # of positive classes, pad to this for variable inputs
kernel_initializer: Initializer for the `kernel` weights matrix
(see [initializers](../initializers.md)).
bias_initializer: Initializer for the bias vector
(see [initializers](../initializers.md)).
kernel_regularizer: Regularizer function applied to
the `kernel` weights matrix
(see [regularizer](../regularizers.md)).
bias_regularizer: Regularizer function applied to the bias vector
(see [regularizer](../regularizers.md)).
activity_regularizer: Regularizer function applied to
the output of the layer (its "activation").
(see [regularizer](../regularizers.md)).
kernel_constraint: Constraint function applied to
the `kernel` weights matrix
(see [constraints](../constraints.md)).
bias_constraint: Constraint function applied to the bias vector
(see [constraints](../constraints.md)).
# Input shape
Two tensors. First one is 2D tensor with shape: `(batch_size, input_dim)`.
Second one is 1D tensor with length `batch_size`
# Output shape
2D tensor with shape: `(batch_size, units)`.
For instance, for a 2D input with shape `(batch_size, input_dim)`,
the output would have shape `(batch_size, units)`.
"""
def __init__(self,
units,
num_sampled,
type='sampled_softmax',
num_true=1,
kernel_initializer='glorot_uniform',
bias_initializer='zeros',
kernel_regularizer=None,
bias_regularizer=None,
activity_regularizer=None,
kernel_constraint=None,
bias_constraint=None,
**kwargs):
if 'input_shape' not in kwargs and 'input_dim' in kwargs:
kwargs['input_shape'] = (kwargs.pop('input_dim'),)
super(Sampling, self).__init__(**kwargs)
self.units = units
self.num_sampled = num_sampled
if self.num_sampled > self.units:
raise Exception('num_sample: {} cannot be greater than units: {}'.format(
num_sampled, units))
self.type = type
if not (self.type == 'nce' or self.type == 'sampled_softmax'):
raise Exception('type {} is not a valid sampling loss type'.format(type))
self.num_true = num_true
self.kernel_initializer = initializers.get(kernel_initializer)
self.bias_initializer = initializers.get(bias_initializer)
self.kernel_regularizer = regularizers.get(kernel_regularizer)
self.bias_regularizer = regularizers.get(bias_regularizer)
self.activity_regularizer = regularizers.get(activity_regularizer)
self.kernel_constraint = constraints.get(kernel_constraint)
self.bias_constraint = constraints.get(bias_constraint)
self.input_spec = [InputSpec(min_ndim=2), InputSpec(min_ndim=1)]
self.supports_masking = True
def build(self, input_shape):
assert len(input_shape) == 2
input_dim = input_shape[0][-1]
self.kernel = self.add_weight(shape=(input_dim, self.units),
initializer=self.kernel_initializer,
name='kernel',
regularizer=self.kernel_regularizer,
constraint=self.kernel_constraint)
self.bias = self.add_weight(shape=(self.units,),
initializer=self.bias_initializer,
name='bias',
regularizer=self.bias_regularizer,
constraint=self.bias_constraint)
self.input_spec[0] = InputSpec(min_ndim=2, axes={-1: input_dim})
self.built = True
def call(self, inputs):
pred, target = inputs
output = K.dot(pred, self.kernel)
output = K.bias_add(output, self.bias, data_format='channels_last')
# TODO : check train or test mode
if self.type == 'nce':
nce_loss = nce_loss_function(
K.transpose(self.kernel), self.bias, target, pred, self.num_sampled, self.units, self.num_true)
self.add_loss(K.mean(nce_loss))
else:
sampled_softmax_loss = sampled_softmax_loss_function(
K.transpose(self.kernel), self.bias, target, pred, self.num_sampled, self.units, self.num_true)
self.add_loss(K.mean(sampled_softmax_loss))
return output
def compute_output_shape(self, input_shape):
assert input_shape and len(input_shape) == 2
assert input_shape[0][-1]
output_shape = list(input_shape[0])
output_shape[-1] = self.units
return tuple(output_shape)
def get_config(self):
config = {
'units': self.units,
'num_sampled': self.num_sampled,
'kernel_initializer': initializers.serialize(self.kernel_initializer),
'bias_initializer': initializers.serialize(self.bias_initializer),
'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
'bias_regularizer': regularizers.serialize(self.bias_regularizer),
'activity_regularizer': regularizers.serialize(self.activity_regularizer),
'kernel_constraint': constraints.serialize(self.kernel_constraint),
'bias_constraint': constraints.serialize(self.bias_constraint)
}
base_config = super(Sampling, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def fill_zipf(length, num_classes, num_true=1):
data_onehot = np.zeros((length, num_classes), dtype='float32')
data_labels = np.zeros((length, num_true), dtype='int32')
# all indexes outside of num_classes scattered in existing space
rand = np.random.zipf(1.3, length * num_true) % num_classes
for i in range(length):
for j in range(num_true):
k = rand[i]
data_onehot[i][k] = 1.0
data_labels[i][j] = k
return data_onehot, data_labels
# number of test samples
num_train = 32*500
num_test = 32*500
num_valid = 100
num_epochs = 5
num_hidden = 10
# number of classes
num_classes = 2000
# number of samples for NCE
num_sampled = 24
# number of labels
num_true = 1
# type of negative sampler
sampler_type='sampled_softmax'
inputs = Input(shape=(num_classes,))
target = Input(shape=(num_true,), dtype=tf.int32) # sparse format, e.g. [1, 3, 2, 6, ...]
net = Dense(num_classes)(inputs)
net = Dense(num_hidden, activation='relu')(net)
net = Sampling(units=num_classes, num_sampled=num_sampled, type=sampler_type)([net, target])
model = Model(inputs=[inputs, target], outputs=net)
model.compile(optimizer='adam', loss=None, metrics=['binary_crossentropy'])
model.summary()
train_input, train_output = fill_zipf(num_train, num_classes, num_true)
valid_input, valid_output = fill_zipf(num_valid, num_classes, num_true)
history = model.fit([train_input, train_output], None,
validation_data=([valid_input, valid_output], None),
epochs=num_epochs, verbose=2)
test_input, test_output = fill_zipf(num_test, num_classes, num_true)
predicts = model.predict([test_input, test_output], batch_size=32)
count = 0
for test in range(num_test):
pred = predicts[test]
imax = np.argmax(pred)
if imax == test_output[test]:
count += 1
print("Found {0} out of {1}".format(count/num_true, num_test))
This test works for the single-label case, both 'nce' and 'sampled_softmax'. But, when I set num_true to greater than one, both NCE and Sampled Softmax throw a tensor mismatch exception.
num_true=3
width=2000
sampler_type='sampled_softmax'
With these parameters, for Sampled Softmax, the code throws this exception trace:
File "postable_sampling_tests.py", line 220, in <module>
epochs=num_epochs, verbose=2)
File "/opt/ds/lib/python3.6/site-packages/keras/engine/training.py", line 1039, in fit
validation_steps=validation_steps)
File "/opt/ds/lib/python3.6/site-packages/keras/engine/training_arrays.py", line 199, in fit_loop
outs = f(ins_batch)
File "/opt/ds/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2715, in __call__
return self._call(inputs)
File "/opt/ds/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2675, in _call
fetched = self._callable_fn(*array_vals)
File "/opt/ds/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1399, in __call__
run_metadata_ptr)
File "/opt/ds/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 526, in __exit__
c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: logits and labels must be broadcastable: logits_size=[32,2000] labels_size=[96,2000]
[[{{node sampling_1/softmax_cross_entropy_with_logits}} = SoftmaxCrossEntropyWithLogits[T=DT_FLOAT, _class=["loc:#train...s_grad/mul"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](sampling_1/BiasAdd_1, sampling_1/softmax_cross_entropy_with_logits/Reshape_1)]]
32 is the batch_size. Clearly, something is num_true * batch_size but I don't know how to fix this.
If we change the sampler to NCE:
num_true=3
width=2000
sampler_type='nce'
The final two lines of the exception stack:
tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [32,2000] vs. [3,2000]
[[{{node sampling_1/logistic_loss/mul}} = Mul[T=DT_FLOAT, _class=["loc:#training/Adam/gradients/sampling_1/logistic_loss/mul_grad/Reshape"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](sampling_1/BiasAdd_1, sampling_1/strided_slice_2)]]
In this case, the labels have not been multiplied by batch_size.
What am I doing wrong? How can I get this wrapper system working for multi-label cases?
You can also use samples softmax with multiple labels, you just have to take the mean of each samples softmax
embeddings = tf.get_variable( 'embeddings',
initializer= tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
softmax_weights = tf.get_variable( 'softmax_weights',
initializer= tf.truncated_normal([vocabulary_size, embedding_size],
stddev=1.0 / math.sqrt(embedding_size)))
softmax_biases = tf.get_variable('softmax_biases',
initializer= tf.zeros([vocabulary_size]), trainable=False )
embed = tf.nn.embedding_lookup(embeddings, train_dataset) #train data set is
embed_reshaped = tf.reshape( embed, [batch_size*num_inputs, embedding_size] )
segments= np.arange(batch_size).repeat(num_inputs)
averaged_embeds = tf.segment_mean(embed_reshaped, segments, name=None)
loss = tf.reduce_mean(
tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=averaged_embeds,
labels=train_labels, num_sampled=num_sampled, num_classes=vocabulary_size))
optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss) #Original learning rate was 1.0
from
https://github.com/Santosh-Gupta/Research2Vec/blob/master/Research2VecTraining2.ipynb

implement one RNN layer in deep DAE seems worse performance

I was trying to implement one RNN layer in deep DAE which is shown in the figure:
DRDAE:
My code is modified based on the DAE tutorial, I change one layer to basic LSTM RNN layer. It somehow can works. The noise in output among different pictures seems lies in same places.
However, compared to both only one layer of RNN and the DAE tutorial, the performance of the structure is much worse. And it requires much more iteration to reach a lower cost.
Can someone help why does the structure got worse result? Below is my code for DRDAE.
# -*- coding: utf-8 -*-
from __future__ import division, print_function, absolute_import
import tensorflow as tf
from tensorflow.contrib import rnn
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data", one_hot=True)
# Parameters
learning_rate = 0.0001
training_epochs = 50001
batch_size = 256
display_step = 500
examples_to_show = 10
total_batch = int(mnist.train.num_examples/batch_size)
# Network Parameters
n_input = 784 # data input
n_hidden_1 = 392 # 1st layer num features
n_hidden_2 = 196 # 2nd layer num features
n_steps = 14
# tf Graph input
X = tf.placeholder("float", [None, n_input])
Y = tf.placeholder("float", [None, n_input])
weights = {
'encoder_h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
'encoder_h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
'decoder_h1': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_1])),
'decoder_h2': tf.Variable(tf.random_normal([n_hidden_1, n_input])),
}
biases = {
'encoder_b1': tf.Variable(tf.random_normal([n_hidden_1])),
'encoder_b2': tf.Variable(tf.random_normal([n_hidden_2])),
'decoder_b1': tf.Variable(tf.random_normal([n_hidden_1])),
'decoder_b2': tf.Variable(tf.random_normal([n_input])),
}
def RNN(x, size, weights, biases):
# Prepare data shape to match `rnn` function requirements
# Current data input shape: (batch_size, n_steps, n_input)
# Required shape: 'n_steps' tensors list of shape (batch_size, n_input)
# Unstack to get a list of 'n_steps' tensors of shape (batch_size, n_input)
x = tf.split(x,n_steps,1)
# Define a lstm cell with tensorflow
lstm_cell = rnn.BasicLSTMCell(size, forget_bias=1.0)
# Get lstm cell output
outputs, states = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)
# Linear activation, using rnn inner loop last output
return tf.matmul(outputs[-1], weights) + biases
# Building the encoder
def encoder(x):
# Encoder Hidden layer with sigmoid activation #1
layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']), biases['encoder_b1']))
# Decoder Hidden layer with sigmoid activation #2
layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']), biases['encoder_b2']))
return layer_2
# Building the decoder
def decoder(x):
# Encoder Hidden layer with sigmoid activation #1
layer_1 = RNN(x, n_hidden_2, weights['decoder_h1'],biases['decoder_b1'])
# Decoder Hidden layer with sigmoid activation #2
layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']), biases['decoder_b2']))
return layer_2
# Construct model
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)
# Prediction
y_pred = decoder_op
# Targets (Labels) are the original data.
y_true = Y
# Define loss and optimizer, minimize the squared error
cost = tf.reduce_mean(tf.pow(y_true - y_pred, 2))
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(cost)
# Evaluate model
correct_pred = tf.equal(tf.argmax(y_pred,1), tf.argmax(y_true,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
# Initializing the variables
init = tf.global_variables_initializer()
# Launch the graph
with tf.Session() as sess:
#with tf.device("/cpu:0"):
sess.run(init)
# Training cycle
for epoch in range(training_epochs):
# Loop over all batches
for i in range(total_batch):
batch, _ = mnist.train.next_batch(batch_size)
origin = batch
# Run optimization op (backprop) and cost op (to get loss value)
sess.run(optimizer, feed_dict={X: batch, Y: origin})
# Display logs per epoch step
if epoch % display_step == 0:
c, acy = sess.run([cost, accuracy], feed_dict={X: batch, Y: origin})
print("Epoch:", '%05d' % (epoch+1), "cost =", "{:.9f}".format(c), "accuracy =", "{:.3f}".format(acy))
print("Optimization Finished!")
# Applying encode and decode over test set
encode_decode = sess.run(
y_pred, feed_dict={X: mnist.test.images[:examples_to_show]})
# Compare original images with their reconstructions
f, a = plt.subplots(2, 10, figsize=(10, 2))
for i in range(examples_to_show):
a[0][i].imshow(np.reshape(mnist.test.images[i], (28, 28)))
a[1][i].imshow(np.reshape(encode_decode[i], (28, 28)))