I am trying to modify data flow of a tensorflow model at runtime. Consider a 3 layers FC neural network. Let's say I want to define 2 different layers for the middle position.
Let's say,
1st option: 64 neuron layer
2nd option: 128 neuron layer.
Then during predict function, I want to give an input alongside the input data like;
model.predict([x_test, decider])
Then if decider is 0, I want my model to execute 64 neuron layer as middle layer. Otherwise, I want my model to execute 128 neuron layer as middle layer.
If I choose one of them, I don't want the other option to be executed for performance reasons.
Note: I do not care for training.
Is there a way to do that? So far, I have been trying to use tf.cond() but could not make it work.
I think you could achieve the same thing by recombining independent models:
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
# First model: shared input stage (16 features -> 256 features).
input_shape = (16, )
inputs_0 = layers.Input(shape=input_shape)
outputs_0 = layers.Dense(256, 'relu')(inputs_0)
fc_0 = models.Model(inputs_0, outputs_0)

# Middle model 0: the 64-neuron option, followed by a 128-unit layer so that
# both middle options hand 128 features to the last model.
inputs_1_0 = layers.Input(shape=(256, ))
outputs_1_0 = layers.Dense(64, 'relu')(inputs_1_0)
outputs_1_0 = layers.Dense(128, 'relu')(outputs_1_0)
fc_1_0 = models.Model(inputs_1_0, outputs_1_0)

# Middle model 1: the 128-neuron option.
inputs_1_1 = layers.Input(shape=(256, ))
outputs_1_1 = layers.Dense(128, 'relu')(inputs_1_1)
outputs_1_1 = layers.Dense(128, 'relu')(outputs_1_1)
fc_1_1 = models.Model(inputs_1_1, outputs_1_1)

# Last model: shared output stage (128 features -> 1 sigmoid unit).
inputs_2 = layers.Input(shape=(128, ))
outputs_2 = layers.Dense(1, 'sigmoid')(inputs_2)
fc_2 = models.Model(inputs_2, outputs_2)


def custom_model(x, d):
    """Run x through the shared stages and exactly one middle model.

    Args:
        x: input batch with ``input_shape[0]`` features per row.
        d: decider; 0 selects middle model 0 (``fc_1_0``), 1 selects middle
            model 1 (``fc_1_1``).

    Returns:
        The output of ``fc_2`` applied to the selected middle model's output.

    Raises:
        ValueError: if ``d`` is neither 0 nor 1.
    """
    h = fc_0(x)
    # Plain Python branching: the middle model that is not selected is never
    # called. Both middle models take the 256-feature output of fc_0, so they
    # are alternatives and must not be chained.
    if d == 0:
        h = fc_1_0(h)
    elif d == 1:
        h = fc_1_1(h)
    else:
        raise ValueError("decider must be 0 or 1, got %r" % (d,))
    return fc_2(h)


x = np.random.rand(1, input_shape[0])
decider = 0  # Middle model 0 or 1
y = custom_model(x, decider)
Keras (tensorflow 2.6 backend) masks are supposed to propagate through the network, as mentioned in the docs:
When using the Functional API or the Sequential API, a mask generated by an Embedding or Masking layer will be propagated through the network for any layer that is capable of using them.
Both Conv1D and GlobalMaxPool1D support masks but mask is not propagated, as demonstrated by the following example,
model1 applies mask and GlobalMaxPool1D
model2 applies mask and Conv1D
model3 applies mask and Conv1D followed by GlobalMaxPool1D
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import initializers
import numpy as np
# Timesteps whose features all equal mask_val are flagged as masked.
mask_val = 10.
inp = layers.Input(shape = (4,3))
masked = layers.Masking(mask_value = mask_val)(inp)

# model1: mask -> GlobalMaxPool1D
max_pool = layers.GlobalMaxPool1D()(masked)
# Use the imported `models` module: the bare name `keras` is not imported by
# this snippet, so `keras.models.Model` would raise NameError.
model1 = models.Model(inputs = inp, outputs = max_pool)

# model2: mask -> Conv1D
#--- initialize the conv kernel to -1. so it's easy to interpret output
conv = layers.Conv1D(1, 2, padding = 'valid', kernel_initializer = initializers.Constant(-1.))(masked)
model2 = models.Model(inputs = inp, outputs = conv)

# model3: mask -> Conv1D -> GlobalMaxPool1D
out = layers.GlobalMaxPool1D()(conv)
model3 = models.Model(inputs = inp, outputs = out)
Now test these 3 models on a simple input, 2 samples of dimension 3, followed by 2 masked samples:
x0 = np.concatenate((np.ones((1,2,3)), mask_val * np.ones((1,2,3))), axis = 1)
model1(x0) # outputs [1,1,1], as expected
model2(x0) # outputs [-6, -3, 0] as expected
model3(x0) # outputs [0], but should output [-3] as the 0 value should be masked
Am I missing something?
So, I was wrong thinking that Conv1D and GlobalMaxPool1D support masks. Turns out that:
layers.GlobalMaxPooling1D().supports_masking #--- this property is False
layers.Conv1D(1,1).supports_masking #--- this property is False
It seemed to support mask since the masking layer replaces the masked values by 0s, and then the outputs of both the Conv1D layer and the GlobalMaxPooling1D layer were not affected by these 0.
A different input (with -1 instead of 1 as the un-masked values) shows it:
x0 = np.concatenate((-np.ones((1,2,3)), mask_val * np.ones((1,2,3))), axis = 1)
l_masked = layers.Masking(mask_value = mask_val)
l_max_pool = layers.GlobalMaxPool1D()
l_max_pool(l_masked(x0)) #--- returns [0,0,0] - ignoring the mask :(
I am trying to use tensor variables as the weights of a keras layer.
I know that I can use numpy arrays instead but the reason I want to feed tensors is that I want my weight matrices to be of the type SparseTensor.
This is a small example that I have coded so far:
# Build a keras Sequential model made of Dense layers whose kernels/biases are
# meant to be initialised from user-created tf.Variable tensors.
#   seed                 -- not read anywhere in the lines visible here.
#   new_hidden_size_list -- optional replacement for the default list of
#                           hidden-layer widths ([512] * number_of_layers).
# Returns the assembled Sequential model.
# NOTE(review): indentation was lost in this copy, and the two Dense(...) calls
# inside the loop are cut off mid-argument list, so their remaining arguments
# (and whether an `else:` separated them) cannot be seen here.
def model_keras(seed, new_hidden_size_list=None):
number_of_layers = 1
hidden_size = 512
hidden_size_list = [hidden_size] * number_of_layers
input_size = 784
output_size = 10
# A caller-supplied list replaces the default widths; number_of_layers itself
# is not updated to match its length.
if new_hidden_size_list is not None:
hidden_size_list = new_hidden_size_list
# Candidate weights as tf.Variables. Their shapes are written as literals
# (784, 512, 10) rather than taken from input_size / hidden_size / output_size.
weight_input = tf.Variable(tf.random.normal([784, 512], mean=0.0, stddev=1.0))
bias_input = tf.Variable(tf.random.normal([512], mean=0.0, stddev=1.0))
weight_output = tf.Variable(tf.random.normal([512, 10], mean=0.0, stddev=1.0))
# This gives me an error when trying to use in kernel_initializer and bias_initializer in the keras model
# NOTE(review): tf.initializers.variables looks like the "initialise these
# variables" op builder rather than a Keras weight initializer -- confirm
# against the TF 1.15 API docs.
weight_initializer_input = tf.initializers.variables([weight_input])
bias_initializer_input = tf.initializers.variables([bias_input])
weight_initializer_output = tf.initializers.variables([weight_output])
# This works fine
#weight_initializer_input = tf.initializers.lecun_uniform(seed=None)
#bias_initializer_input = tf.initializers.lecun_uniform(seed=None)
#weight_initializer_output = tf.initializers.lecun_uniform(seed=None)
print(weight_initializer_input, bias_initializer_input, weight_initializer_output)
model = keras.models.Sequential()
for index in range(number_of_layers):
if index == 0:
# input layer
model.add(keras.layers.Dense(hidden_size_list[index], activation=nn.selu, use_bias=True,
model.add(keras.layers.Dense(hidden_size_list[index], activation=nn.selu, use_bias=True,
# output layer
# Bias-free output layer; only its kernel gets the custom initializer.
model.add(keras.layers.Dense(output_size, use_bias=False, kernel_initializer=weight_initializer_output))
return model
I am using tensorflow 1.15.
Any idea how one can use custom (user defined) Tensor Variables as initializer instead of pre-set schemes (e.g. Glorot, Truncated Normal etc). Another approach that I could take is to explicitly define the computations instead of using the keras.Layer.
Many thanks
Your code works after enabling eager execution.
import tensorflow as tf
tf.enable_eager_execution()
Add this at the top of your file.
See this for working code.
I am trying to create a Keras model with three inputs. Only one of them goes through the first few layers and the other two are concatenated at a dense layer. How would I achieve this without disconnecting the graph? The code is shown below
import keras
# NOTE(review): Input, Dense, Dropout, Activation and Model are used unqualified
# below; only `import keras` is visible in this snippet -- presumably they are
# imported elsewhere.
# One 784-feature input plus two single-value side inputs.
input_img = Input(shape=(784,))
input_1 = Input(shape=(1,))
input_2 = Input(shape=(1,))
# Only input_img passes through the Dense/Dropout stack, which ends in 1 unit.
x = (Dense(48,kernel_initializer='normal',activation="relu"))(input_img)
x = (Dropout(0.2))(x)
x = (Dense(24,activation="tanh"))(x)
x = (Dropout(0.3))(x)
x = (Dense(1))(x)
# Join the 1-unit branch output with the two side inputs.
x = keras.layers.concatenate([x, input_1, input_2])
x = (Activation("sigmoid"))(x)
# NOTE: only input_img is declared as a model input here, although input_1 and
# input_2 also feed the concatenate above.
x = Model(input_img, x)
x.compile(loss="binary_crossentropy", optimizer='adam')
To give a more general overview of what I'm attempting, I am essentially trying to create a Convolutional Neural Network with additional features added to the dense layer for classification.
Since your model has three inputs, i.e. input_img, input_1 and input_2 you need to pass a list of these three inputs while defining your model as follows:
x = Model([input_img, input_1, input_2], x)
Hope this helps.
Trying to implement this article.
Edit1: found one error, my output size is 10 and not 1. (one number per each sentence, there are 10 sentences per document)
Edit2: I got another error that involves the batch size. when I make it 10 the model trains (!!!!). but I think it's not the right way... the error I had given batch size 3 is
Edit 3: Solved! The problems were the sizes, plus the fact that Bidirectional returns different things from a plain LSTM, so I need to concatenate the states myself. I will put the right code in an answer.
InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: Incompatible shapes: [10] vs. [3]
[[{{node training_5/Adam/gradients/loss_8/dense_61_loss/mul_grad/BroadcastGradientArgs}}]]
(1) Invalid argument: Incompatible shapes: [10] vs. [3]
[[{{node training_5/Adam/gradients/loss_8/dense_61_loss/mul_grad/BroadcastGradientArgs}}]]
0 successful operations.
0 derived errors ignored.
The target is extractive document summarization.
Link to colab with code
What they do is (you can see in the picture at page 3)
100 units of BI_LSTM + Attention for each sentence of the document.
Concat those and insert them to 1 BI_LSTM + Attention to get document embeddings.
Use document embeddings + hidden states from the LSTM to get some features.
Classify according to features
After a lot of struggle with keras low level api, I got a simple version to work.
What I did was to get the already sentence embedding and just do the last LSTM.
Or get words embedding of a sentence and make the small unit of sentence LSTM to work.
Now I am trying to put every thing together but can not get the sizes to fit.
My input size is
number_of_document * sentences_in_document * words_in_sentence * word_embedding
In the code I set those to be 20 * 10 * 50 * 100
(10 sentence in document is for everything to run faster for now..).
My output is
10 * 1 meaning for each sentence I get 1/0 if it's part of the document summary.
(I have not yet done the feature-extraction part; for now I just use another dense layer to give me probabilities instead.)
I think the problem is with this part of the code
X_doc = Lambda(lambda x: x[:,t, :, :])(X)
The code with sample data
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from keras.layers import RepeatVector, Dense, Activation, Lambda
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.models import load_model, Model
import keras.backend as K
import numpy as np
import keras
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import tensorflow as tf
from keras import backend as K
num_of_training_examples = 20
words_in_sentence = 50 # max words per sentence
sentences_in_doc = 10
model_output_size = 10
word_embeddings_size = 100
lstm_hidden_size = 200
densor1_output_size = 400
densor2_output_size = 400
x_train = np.random.rand(num_of_training_examples, sentences_in_doc, words_in_sentence, word_embeddings_size)
y_train= np.random.randint(2, size=(num_of_training_examples, sentences_in_doc))
# Initialize arrays
inputs = []
bi_lstms = []
densors_1 =[]
densors_2 = []
for i in range(sentences_in_doc):
bi_lstms.append(Bidirectional(LSTM(units = lstm_hidden_size, input_shape=(words_in_sentence, word_embeddings_size),
return_sequences=False, name='bidirectional_' + str(i)), merge_mode='concat'))
densors_1.append(Dense(densor1_output_size, activation = "tanh"))
densors_2.append(Dense(densor2_output_size, activation = "softmax"))
def invoke_sentence(sentence_matrix, index):
    """Encode one sentence with the Bi-LSTM and dense layers stored at `index`.

    Args:
        sentence_matrix: tensor holding one sentence slice of the model input.
        index: position of the sentence; selects bi_lstms[index],
            densors_1[index] and densors_2[index].

    Returns:
        The output tensor of densors_2[index].
    """
    # The encoder must run for every sentence; only the shape printout below is
    # limited to the first one.
    Ys = bi_lstms[index](sentence_matrix)
    attention_middle = densors_1[index](Ys)
    output = densors_2[index](attention_middle)
    if index == 0:
        print(f'Ys shape is {Ys.shape}')
        print(f'attention_middle shape is {attention_middle.shape}')
        print(f'output shape is {output.shape}')
    return output
def model(words_in_sentence, sentences_in_doc, lstm_hidden_size, word_embeddings_size, model_output_size):
    """Build the document model: per-sentence encoders feeding a document Bi-LSTM.

    Arguments:
    words_in_sentence -- Tx -- length of the input sequence - max words per sentence
    sentences_in_doc -- Ty -- length of the output sequence - number of sentences per document
    lstm_hidden_size -- hidden state size of the Bi-LSTM
    word_embeddings_size -- size of the word embeddings
    model_output_size -- number of units in the final Dense layer

    Returns:
    model -- Keras model instance
    """
    sentences_embeddings = []
    X = Input(shape=(sentences_in_doc, words_in_sentence, word_embeddings_size), name='X')
    # Iterate over the sentence count passed in; the name `Ty` only appears in
    # the argument description and is not defined in this snippet.
    for t in range(sentences_in_doc):
        # Bind t as a default argument so each Lambda keeps its own sentence
        # index even if the function is invoked again after the loop ends.
        X_doc = Lambda(lambda x, t=t: x[:, t, :, :])(X)
        sentences_embeddings.append(invoke_sentence(X_doc, t))
    # NOTE(review): stacking on axis 0 puts the sentence dimension ahead of the
    # batch dimension; the "Incompatible shapes: [10] vs. [3]" error quoted with
    # this snippet is consistent with that. The revised version of this
    # function stacks on axis=1 instead.
    sentences_embeddings_stacked = Lambda(lambda x: tf.stack(x, axis=0))(sentences_embeddings)
    # The wrapper has to be applied to the stacked sentence embeddings to
    # yield a tensor for the Dense layers below.
    Ys = Bidirectional(
        LSTM(units=lstm_hidden_size, input_shape=(sentences_in_doc, lstm_hidden_size*2),
             return_sequences=False, name='bidirectional_document'),
        merge_mode='concat')(sentences_embeddings_stacked)
    attention_middle = Dense(densor1_output_size, activation="tanh")(Ys)
    document_embedding = Dense(densor2_output_size, activation="softmax")(attention_middle)
    outputs = Dense(model_output_size, activation="softmax")(document_embedding)
    # compute_features(document_embeddings, sentences_embeddings, ys)
    model = Model(inputs=X, outputs=outputs)
    return model
model = model(words_in_sentence, sentences_in_doc, lstm_hidden_size, word_embeddings_size, model_output_size)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(x = x_train, y = y_train, batch_size=2, epochs=1)
# Sizes
num_of_training_examples = 20
words_in_sentence = 50 # max words per sentence
sentences_in_doc = 10
model_output_size = 10
word_embeddings_size = 100
lstm_hidden_size = 200
densor1_output_size = 400
densor2_output_size = 400
# Data
x_train = np.random.rand(num_of_training_examples, sentences_in_doc, words_in_sentence, word_embeddings_size)
y_train= np.random.randint(2, size=(num_of_training_examples, sentences_in_doc))
# Initialize arrays
inputs = []
bi_lstms = []
densors_1 =[]
densors_2 = []
for i in range(sentences_in_doc):
bi_lstms.append(Bidirectional(LSTM(units = lstm_hidden_size, input_shape=(words_in_sentence, word_embeddings_size),
return_sequences=True, return_state=True, name='bidirectional_' + str(i))))
densors_1.append(Dense(densor1_output_size, activation = "tanh",name='senteence_dense_tanh' + str(i)))
densors_2.append(Dense(densor2_output_size, activation = "softmax",name='senteence_dense_softmax' + str(i)))
def invoke_sentence(sentence_matrix, index):
    """Encode one sentence from the final hidden states of its Bi-LSTM.

    Args:
        sentence_matrix: tensor holding one sentence slice of the model input.
        index: position of the sentence; selects bi_lstms[index],
            densors_1[index] and densors_2[index].

    Returns:
        The output tensor of densors_2[index], computed from the concatenated
        forward and backward hidden states.
    """
    # The encoder must run for every sentence; only the shape printout below is
    # limited to the first one.
    lstm, forward_h, forward_c, backward_h, backward_c = bi_lstms[index](sentence_matrix)
    # The wrapper returns the two directions' states separately, so they are
    # joined by hand here.
    state_h = Concatenate()([forward_h, backward_h])
    state_c = Concatenate()([forward_c, backward_c])
    attention_middle = densors_1[index](state_h)
    output = densors_2[index](attention_middle)
    if index == 0:
        print(f'lstm shape is {lstm.shape}')
        print(f'state_h shape is {state_h.shape}')
        print(f'state_c shape is {state_c.shape}')
        print(f'attention_middle shape is {attention_middle.shape}')
        print(f'output shape is {output.shape}')
    return output
def model(words_in_sentence, sentences_in_doc, lstm_hidden_size, word_embeddings_size, model_output_size):
    """Build the document model: per-sentence encoders feeding a document Bi-LSTM.

    Arguments:
    words_in_sentence -- Tx -- length of the input sequence - max words per sentence
    sentences_in_doc -- Ty -- length of the output sequence - number of sentences per document
    lstm_hidden_size -- hidden state size of the Bi-LSTM
    word_embeddings_size -- size of the word embeddings
    model_output_size -- number of units in the final Dense layer

    Returns:
    model -- Keras model instance
    """
    sentences_embeddings = []
    X = Input(shape=(sentences_in_doc, words_in_sentence, word_embeddings_size), name='X')
    for t in range(sentences_in_doc):
        # Bind t as a default argument so each Lambda keeps its own sentence
        # index even if the function is invoked again after the loop ends.
        X_doc = Lambda(lambda x, t=t: x[:, t, :, :])(X)
        sentence_embedding = invoke_sentence(X_doc, t)
        # Collect every sentence embedding; the stack below would otherwise
        # receive an empty list.
        sentences_embeddings.append(sentence_embedding)
    # Stack on axis 1 so the batch dimension stays first and sentences become
    # the sequence axis of the document-level LSTM.
    sentences_embeddings_stacked = Lambda(lambda x: tf.stack(x, axis=1))(sentences_embeddings)
    # The wrapper has to be applied to the stacked sentence embeddings; with
    # return_state=True it yields the sequence output plus the h/c states of
    # both directions.
    doc_lstm, doc_forward_h, doc_forward_c, doc_backward_h, doc_backward_c = Bidirectional(
        LSTM(units=lstm_hidden_size, input_shape=(sentences_in_doc, lstm_hidden_size*2),
             return_sequences=True, return_state=True, name='bidirectional_document'),
        merge_mode='concat')(sentences_embeddings_stacked)
    doc_state_h = Concatenate()([doc_forward_h, doc_backward_h])
    doc_state_c = Concatenate()([doc_forward_c, doc_backward_c])
    print(f'doc_lstm shape is {doc_lstm.shape}')
    print(f'doc_state_h shape is {doc_state_h.shape}')
    print(f'doc_state_c shape is {doc_state_c.shape}')
    attention_middle = Dense(densor1_output_size, activation="tanh")(doc_state_h)
    document_embedding = Dense(densor2_output_size, activation="softmax")(attention_middle)
    print(f'document_embedding shape is {document_embedding.shape}')
    # my_layer = MyLayer(input_shape=((400), (10,400), (10,400)), output_dim=2)
    # custom_output = my_layer([document_embedding, sentences_embeddings_stacked, doc_state_h])
    # print(f'custom_output shape is {custom_output.shape}')
    # NOTE(review): softmax makes the outputs sum to 1 across sentences, while
    # the sample labels are independent 0/1 flags per sentence trained with
    # binary_crossentropy -- a sigmoid may be what is intended; confirm.
    outputs = Dense(model_output_size, activation="softmax")(document_embedding)
    model = Model(inputs=X, outputs=outputs)
    return model
model = model(words_in_sentence, sentences_in_doc, lstm_hidden_size, word_embeddings_size, model_output_size)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(x = x_train, y = y_train, batch_size=5, epochs=1)
How do I create a custom loss function in MXNET? For example, instead of computing cross-entropy loss for one label (using standard mx.sym.SoftmaxOutput layer which computes cross-entropy loss and returns a symbol that can be passed as a loss symbol to the fit function), I want to compute weighted cross-entropy loss for each possible label. The MXNET tutorials mention using
mx.symbol.MakeLoss(scalar_loss_symbol, normalization='batch')
However, when I use MakeLoss function, the standard eval_metric - "acc" does not work (obviously as the model doesn't know what is my predicted probability vector). Therefore I need to write my own eval_metric.
Further, at the time of prediction, I need to predict the probability vector as well, which cannot be accessed unless I group the final probability vector with the loss symbol and block_grad on it.
The code below is a modification of the MXNET tutorial http://mxnet.io/tutorials/python/mnist.html where the standard SoftmaxOutput loss function is rewritten for a custom weighted loss function and required custom eval_metric is written.
import logging
import mxnet as mx
import numpy as np
mnist = mx.test_utils.get_mnist()
batch_size = 100

# One-hot encode the integer training labels: one row per sample, one column
# per class. These rows act as the per-label weights of the custom loss.
weighted_train_labels = np.zeros(
    (mnist['train_label'].shape[0], np.max(mnist['train_label']) + 1))
weighted_train_labels[np.arange(mnist['train_label'].shape[0]), mnist['train_label']] = 1
train_iter = mx.io.NDArrayIter(mnist['train_data'], {'label': weighted_train_labels}, batch_size, shuffle=True)

# Same encoding for the test labels.
weighted_test_labels = np.zeros(
    (mnist['test_label'].shape[0], np.max(mnist['test_label']) + 1))
weighted_test_labels[np.arange(mnist['test_label'].shape[0]), mnist['test_label']] = 1
val_iter = mx.io.NDArrayIter(mnist['test_data'], {'label': weighted_test_labels}, batch_size)

data = mx.sym.var('data')
# first conv layer
conv1 = mx.sym.Convolution(data=data, kernel=(5,5), num_filter=20)
tanh1 = mx.sym.Activation(data=conv1, act_type="tanh")
pool1 = mx.sym.Pooling(data=tanh1, pool_type="max", kernel=(2,2), stride=(2,2))
# second conv layer
conv2 = mx.sym.Convolution(data=pool1, kernel=(5,5), num_filter=50)
tanh2 = mx.sym.Activation(data=conv2, act_type="tanh")
pool2 = mx.sym.Pooling(data=tanh2, pool_type="max", kernel=(2,2), stride=(2,2))
# first fullc layer
flatten = mx.sym.flatten(data=pool2)
fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=500)
tanh3 = mx.sym.Activation(data=fc1, act_type="tanh")
# second fullc
fc2 = mx.sym.FullyConnected(data=tanh3, num_hidden=10)

# Custom weighted loss, replacing the tutorial's
# mx.sym.SoftmaxOutput(data=fc2, name='softmax').
label = mx.sym.var('label')
# Despite the variable name, this holds log-probabilities (log_softmax).
softmax = mx.sym.log_softmax(data=fc2)
# Gradient-blocked copy of the predictions, grouped in below so that metrics
# and prediction code can read them next to the loss.
softmax_output = mx.sym.BlockGrad(data = softmax,name = 'softmax')
# Cross-entropy: negative sum of label-weighted log-probabilities.
ce = -mx.sym.sum(mx.sym.sum(mx.sym.broadcast_mul(softmax,label),1))
lenet = mx.symbol.MakeLoss(ce, normalization='batch')
sym = mx.sym.Group([softmax_output,lenet])
# Call the method (and use the function form of print) so the output names are
# shown rather than the bound-method object.
print(sym.list_outputs())
def custom_metric(label, softmax):
    """Return the fraction of rows whose highest-scoring class matches the label.

    Args:
        label: 2-D array with one row per sample; the column holding the
            largest value is taken as the true class.
        softmax: 2-D array of per-class scores with the same number of rows;
            only the position of each row's maximum is used.

    Returns:
        Accuracy as a float: matching rows divided by label.shape[0].
    """
    hits = np.argmax(softmax, 1) == np.argmax(label, 1)
    # float() keeps this a true division under Python 2 as well.
    return np.count_nonzero(hits) / float(label.shape[0])
# Wrap custom_metric as an evaluation metric named 'custom-accuracy' that reads
# the 'softmax_output' output and the 'label' input.
# NOTE(review): 'softmax_output' presumably refers to the BlockGrad symbol
# named 'softmax' -- confirm against the names printed by sym.list_outputs().
eval_metrics = mx.metric.CustomMetric(custom_metric,name='custom-accuracy', output_names=['softmax_output'],label_names=['label'])
# Module over the grouped symbol (predictions + loss), created with a GPU context.
lenet_model = mx.mod.Module(symbol=sym, context=mx.gpu(),data_names=['data'], label_names=['label'])
#batch_end_callback = mx.callback.Speedometer(batch_size, 100),