How to assign one of the outputs of a layer to be 0 - tensorflow

I have a model built in Keras (TensorFlow 2.0), and I want to force the first output of its last layer to be zero. How can I do that?
Here is the model I built and my attempt:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda

X = Input(shape=(32,))
Y = Dense(256, activation='relu', kernel_initializer=ini)(X)  # `ini` is defined elsewhere
Here is what I tried: create another tensor that is all zeros except for its first element, which is the negative of the first element of Y, so that adding it to Y cancels the first output:
def zeros_assign(Y):
    a = tf.zeros(256,)
    a = a[0].assign(-Y[0])
    out = a + Y
    return out

Y_out = Lambda(lambda x: zeros_assign(x))(Y)
But when I run that layer, it gives the following error:
TypeError: 'Functional' object does not support item assignment

Following is a solution where a BlockFirstNode layer passes all outputs from the previous layer except the first node, which is blocked (set to 0). This is done by multiplying by zero, as suggested by Dr. Snoopy above.
Code example:
import tensorflow as tf
import numpy as np

class BlockFirstNode(tf.keras.layers.Layer):
    def __init__(self) -> None:
        super().__init__()

    def build(self, input_shape):
        output_len = input_shape[1]
        # Mask vector [0, 1, 1, ...]: zero for the first node, one for the rest
        block_vec_np = np.array([0] + [1] * (output_len - 1)).reshape(1, output_len)
        self.block_vec = tf.convert_to_tensor(block_vec_np, dtype=tf.float32)

    def call(self, inputs):
        return inputs * self.block_vec

X = tf.keras.layers.Input(shape=(32,))
Y = tf.keras.layers.Dense(5, activation='sigmoid')(X)
block_output = BlockFirstNode()(Y)
model = tf.keras.models.Model(inputs=X, outputs=[Y, block_output])

pred_y, pred_block = model.predict(np.ones((1, 32)))
print(f'Y: {np.round(pred_y, 2)}')
# Y: [[0.68 0.59 0.88 0.08 0.48]]
print(f'block_output: {np.round(pred_block, 2)}')
# block_output: [[0. 0.59 0.88 0.08 0.48]]
Here we can see that all outputs are the same as in the layer before, except the first node, which is forced to zero.
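Equivalently, the same multiply-by-zero idea can be written inline with a Lambda layer (a minimal sketch, assuming the Dense layer above with 5 units, so the mask length is 5):
mask = tf.constant([0.] + [1.] * 4, dtype=tf.float32)  # zero the first unit, keep the rest
block_output = tf.keras.layers.Lambda(lambda t: t * mask)(Y)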
EDIT (2023-02-04):
Now works for any number n of blocked nodes, as requested by Sajjad.
import tensorflow as tf
import numpy as np

class BlockNFirstNode(tf.keras.layers.Layer):
    def __init__(self, n_blocking) -> None:
        super().__init__()
        self.n_blocking = n_blocking

    def build(self, input_shape):
        output_len = input_shape[1]
        if self.n_blocking > output_len:
            raise ValueError("n_blocking cannot be larger than layer size")
        # Mask vector: zeros for the first n_blocking nodes, ones for the rest
        block_vec_list = [0] * self.n_blocking + [1] * (output_len - self.n_blocking)
        block_vec_np = np.array(block_vec_list).reshape(1, output_len)
        self.block_vec = tf.convert_to_tensor(block_vec_np, dtype=tf.float32)

    def call(self, inputs):
        return inputs * self.block_vec

X = tf.keras.layers.Input(shape=(32,))
Y = tf.keras.layers.Dense(5, activation='sigmoid')(X)
block_output = BlockNFirstNode(n_blocking=2)(Y)
model = tf.keras.models.Model(inputs=X, outputs=[Y, block_output])

pred_y, pred_block = model.predict(np.ones((1, 32)))
print(f'Y: {np.round(pred_y, 2)}')
# Y: [[0.68 0.59 0.88 0.08 0.48]]
print(f'block_output: {np.round(pred_block, 2)}')
# block_output: [[0. 0. 0.88 0.08 0.48]]
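As a quick sanity check (a small sketch reusing the class and imports above), calling the layer eagerly on a tensor of ones shows exactly which positions get blocked:
layer = BlockNFirstNode(n_blocking=2)
print(layer(tf.ones((1, 5))).numpy())
# [[0. 0. 1. 1. 1.]]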

Related

Keras/Deepchem: epochs in data generator for prediction in graph convolutions affects prediction size

I am using graph convolutions in Deepchem/Keras for predicting molecular properties. Following the Deepchem tutorials I created a data generator. While there is no error in my code below, I fail to understand why the size of pred changes with epoch and batch_size.
First we create some dummy data.
!pip install --pre deepchem
!pip install --pre rdkit
import deepchem as dc
import numpy as np
import tensorflow as tf
from deepchem.feat.mol_graphs import ConvMol
mol = ['C-C-O']*240
ftr = dc.feat.ConvMolFeaturizer(per_atom_fragmentation=False)
X=ftr.featurize(mol)
y = np.arange(0,240,1)
w = np.arange(0,240,1)
ids = np.arange(0,240,1)
ds = dc.data.NumpyDataset(X=X, y=y, ids=ids)
Edit: We use the following function as generator:
def data_generator(dataset, epochs=1, batch_size=100, pad_batches=True):
    print(dataset)
    for ind, (X_b, y_b, w_b, ids_b) in enumerate(dataset.iterbatches(batch_size, epochs,
                                                                     deterministic=False,
                                                                     pad_batches=pad_batches)):
        multiConvMol = ConvMol.agglomerate_mols(X_b)
        inputs = [multiConvMol.get_atom_features(), multiConvMol.deg_slice,
                  np.array(multiConvMol.membership)]
        for i in range(1, len(multiConvMol.get_deg_adjacency_lists())):
            inputs.append(multiConvMol.get_deg_adjacency_lists()[i])
        labels = [y_b]
        weights = [w_b]
        yield (inputs, labels, weights)
(end edit)
Then we define the model and fit it to the dataset generated above:
from deepchem.models.layers import GraphConv, GraphGather  # layers used below, as in the DeepChem tutorials
from tensorflow.keras import layers

batch_size = 100
n_tasks = 1

class TestModel(tf.keras.Model):
    def __init__(self, model=1):
        super(TestModel, self).__init__()
        self.model = model
        # ____________Test Model 1___________
        if self.model == 1:
            self.gc1 = GraphConv(128, activation_fn=tf.nn.tanh)
            self.readout = GraphGather(batch_size=batch_size,
                                       activation_fn=tf.nn.tanh)
            self.dense2 = layers.Dense(1)

    def call(self, inputs):
        # ____________Test Model 1___________
        if self.model == 1:
            gc1_output = self.gc1(inputs)
            readout_output = self.readout([gc1_output] + inputs[1:])
            dense2_output = self.dense2(readout_output)
        return dense2_output

# Fit_generator
print("_________\nFitting:")
testmodel = dc.models.KerasModel(TestModel(1), loss=dc.models.losses.L2Loss())
testmodel.fit_generator(data_generator(ds, epochs=1, batch_size=100))
Finally we try to predict the dataset labels setting epochs = 2:
#Predict
print("_________\nPredicting:")
pred = testmodel.predict_on_generator(data_generator(ds, epochs = 2, batch_size = 100, pad_batches = True))
print(ds.y.shape, pred.shape)
Giving:
_________
Predicting:
<NumpyDataset X.shape: (240,), y.shape: (240,), w.shape: (240,), ids: [0 1 2 ... 237 238 239], task_names: [0]>
(240,) (600, 1)
However, if I change epochs to 1, the size of pred changes to (300, 1), i.e. half of what we had before. Similarly, changing the batch_size affects the prediction size too.
Can anyone explain what I'm doing wrong?
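For what it's worth, a back-of-the-envelope check (assuming predict_on_generator simply stacks one prediction row per sample in every batch the generator yields) reproduces the shapes observed above:
import math

n_samples, batch_size = 240, 100
batches_per_epoch = math.ceil(n_samples / batch_size)  # 3 batches, each padded to 100
for epochs in (1, 2):
    print(epochs, batches_per_epoch * batch_size * epochs)
# 1 300   -> pred.shape == (300, 1)
# 2 600   -> pred.shape == (600, 1)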

How does one use keras add_weight() vars with tensorflow probability distributions?

I am creating a new Keras layer which accepts a vector of input data and is parameterized by 2 scalars, a mean and a standard deviation. I model the input data as a normal distribution and estimate its mean and variance through gradient descent. However, when I initialize tfp.distributions.Normal(mu, sigma), where mu and sigma come from add_weight() during build(), the gradients do not propagate through mu and sigma.
The TensorFlow Probability documentation states that you can pass in trainable variables for distribution parameters and backprop through them. How do I get this to work inside of Keras?
Below is a minimal working example.
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
tfk = tf.keras
tfkl = tf.keras.layers
tfd = tfp.distributions
tfpl = tfp.layers
EPS = 1e-5
batch_size = 4
N = 100
x = np.random.randn(batch_size, N)
class NormalLikelihood(tf.keras.layers.Layer):
    def __init__(self):
        super(NormalLikelihood, self).__init__()

    def build(self, input_shape):
        self.mu = self.add_weight("mean", shape=[1],
                                  initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=1),
                                  dtype=tf.float32)
        self.sigma = self.add_weight("std", shape=[1],
                                     initializer=tf.keras.initializers.RandomUniform(minval=EPS, maxval=5.0, seed=None),
                                     constraint=tf.keras.constraints.non_neg(),
                                     dtype=tf.float32)
        self.distribution = tfp.distributions.Normal(self.mu[0], self.sigma[0])

    def call(self, input):
        r = self.distribution.prob(input)
        r = tf.clip_by_value(r, 1e-3, 1 - 1e-3)
        return r
input_layer = tf.keras.layers.Input(shape=(100,))
r = NormalLikelihood()(input_layer)
r = -tf.reduce_sum(tf.math.log(r))
model = tf.keras.models.Model(input_layer, r)
model.add_loss(r)
model.compile(optimizer='rmsprop', loss=None)
model.fit(x, y=None)
This code results in builtins.ValueError: No gradients provided for any variable: ['normal_likelihood/mean:0', 'normal_likelihood/std:0'] which is not expected. Desired behavior would be that ['normal_likelihood/mean:0', 'normal_likelihood/std:0'] have gradients provided for them.
See the code in google colab: https://colab.research.google.com/drive/1_u4XTCIH-2qwNSgv9zkZiCG_zeCIEZGp?usp=sharing
Change tfp.distributions.Normal(self.mu[0], self.sigma[0]) to tfp.distributions.Normal(self.mu, self.sigma).
The reason this works is that under the hood, Keras's .fit() computes gradients with respect to the model's trainable variables. When you index into a weight with [0] inside build(), the slice is evaluated once and stored as a plain tensor, so the distribution holds a constant rather than the trainable variable, and the chain-rule connection back to the weights is broken.
Example:
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
EPS = 1e-5
class NormalLikelihoodYours(tf.keras.layers.Layer):
    def __init__(self):
        super(NormalLikelihoodYours, self).__init__()

    def build(self, input_shape):
        self.mu = self.add_weight(
            "mean", shape=[1],
            initializer=tf.keras.initializers.RandomNormal(
                mean=0.0, stddev=1), dtype=tf.float32)
        self.sigma = self.add_weight(
            "std", shape=[1],
            initializer=tf.keras.initializers.RandomUniform(
                minval=EPS, maxval=5.0, seed=None),
            constraint=tf.keras.constraints.non_neg(),
            dtype=tf.float32)
        # Indexing the weights stores constant tensors, not the variables
        self.distribution = tfp.distributions.Normal(self.mu[0], self.sigma[0])

    def call(self, input):
        r = self.distribution.prob(input)
        r = tf.clip_by_value(r, 1e-3, 1 - 1e-3)
        return r
class NormalLikelihoodMine(tf.keras.layers.Layer):
    def __init__(self):
        super(NormalLikelihoodMine, self).__init__()

    def build(self, input_shape):
        self.mu = self.add_weight(
            "mean", shape=[1],
            initializer=tf.keras.initializers.RandomNormal(
                mean=0.0, stddev=1), dtype=tf.float32)
        self.sigma = self.add_weight(
            "std", shape=[1],
            initializer=tf.keras.initializers.RandomUniform(
                minval=EPS, maxval=5.0, seed=None),
            constraint=tf.keras.constraints.non_neg(),
            dtype=tf.float32)
        # Passing the variables themselves keeps them trainable
        self.distribution = tfp.distributions.Normal(self.mu, self.sigma)

    def call(self, input):
        r = self.distribution.prob(input)
        r = tf.clip_by_value(r, 1e-3, 1 - 1e-3)
        return r
# loss function
def calc_loss(logits):
    return -tf.math.reduce_sum(tf.math.log(logits))

# model input
input_layer = tf.keras.layers.Input(shape=(100,))
x_in = tf.random.normal([4, 100])

# your model
your_output = NormalLikelihoodYours()(input_layer)
your_model = tf.keras.models.Model(input_layer, your_output)

# my model
my_output = NormalLikelihoodMine()(input_layer)
my_model = tf.keras.models.Model(input_layer, my_output)
# yours has no gradients because the network weights are not
# included anywhere in the loss calculation. When you index them
# with `[0]` they go from being trainable variables in the network,
# to just constants.
with tf.GradientTape() as tape:
    y_hat = your_model(x_in)
    loss = calc_loss(y_hat)

print(tape.gradient(loss, your_model.trainable_variables))
# [None, None]
# my model has gradients because `loss` and the weights in
# `trainable_variables` are connected
with tf.GradientTape() as tape:
    y_hat = my_model(x_in)
    loss = calc_loss(y_hat)

print(tape.gradient(loss, my_model.trainable_variables))
# [<tf.Tensor: shape=(1,), numpy=array([43.83749], dtype=float32)>,
# <tf.Tensor: shape=(1,), numpy=array([-37.348656], dtype=float32)>]
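For completeness, a minimal end-to-end sketch (reusing the question's add_loss setup and the classes above) showing that the corrected layer actually trains:
x = np.random.randn(4, 100)
input_layer = tf.keras.layers.Input(shape=(100,))
r = NormalLikelihoodMine()(input_layer)
nll = -tf.reduce_sum(tf.math.log(r))
model = tf.keras.models.Model(input_layer, nll)
model.add_loss(nll)
model.compile(optimizer='rmsprop', loss=None)
model.fit(x, y=None, epochs=1)  # mu and sigma now receive gradients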

Tensorflow 2 custom loss return nan

I have a model that I compile using binary_crossentropy; the training process goes well and the loss is printed.
model = MyModel()
model.compile(optimizer="adadelta", loss="binary_crossentropy")
data1, data2 = get_random_data(4, 3) # this method return data1:(1000,4),data2:(1000,3)
model.fit([data1, data2], y, batch_size=4)
Then I write a custom loss function, and the loss becomes nan:
import tensorflow.keras.backend as K

class MyModel():
    ...
    def batch_loss(self, y_true, y_pred_batch):
        bottom = K.sum(K.exp(y_pred_batch))
        batch_softmax = K.exp(y_pred_batch) / bottom
        batch_log_likelihood = K.log(batch_softmax)
        loss = K.sum(batch_log_likelihood)
        return loss

model.compile(optimizer="adadelta", loss=model.batch_loss)  # change the compile call above to this
I used batch_loss(tf.ones((1,))) to test my loss function, and it seems to return the correct result. But when it runs during training, the loss becomes nan. Where should I start debugging?
model and data code (for those who need it to reproduce):
class MyModel(tf.keras.models.Model):
    def __init__(self):
        super().__init__()
        self.t1A = tf.keras.layers.Dense(300, activation='relu', input_dim=1)
        self.t1B = tf.keras.layers.Dense(300, activation='relu', input_dim=1)
        self.t1v = tf.keras.layers.Dense(128, activation='relu')
        self.t2A = tf.keras.layers.Dense(300, activation='relu')
        self.t2B = tf.keras.layers.Dense(300, activation='relu')
        self.t2v = tf.keras.layers.Dense(128, activation='relu')
        self.out = tf.keras.layers.Dot(axes=1)

    def call(self, inputs, training=None, mask=None):
        u, i = inputs[0], inputs[1]
        u = self.t1A(u)
        u = self.t1B(u)
        u = self.t1v(u)
        i = self.t2A(i)
        i = self.t2B(i)
        i = self.t2v(i)
        out = self.out([u, i])
        return out

def get_random_data(user_feature_num, item_feature_num):
    def get_random_ndarray(data_size, dis_list, feature_num):
        data_list = []
        for i in range(feature_num):
            arr = np.random.randint(dis_list[i], size=data_size)
            data_list.append(arr)
        data = np.array(data_list)
        return np.transpose(data, axes=(1, 0))

    uf_dis, if_dis, data_size = [1000, 2, 10, 20], [10000, 50, 60], 1000
    y = np.zeros(data_size)
    for i in range(int(data_size / 10)):
        y[i] = 1
    return get_random_ndarray(data_size, uf_dis, feature_num=user_feature_num), \
        get_random_ndarray(data_size, if_dis, feature_num=item_feature_num), y
The values output by your model are quite big. Combined with the call to exp in your loss function, the values quickly grow to nan. You might consider applying an activation function such as a sigmoid to keep the values between 0 and 1.
I think your error is caused by calling exp(), which grows very quickly and overflows to nan.
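One numerically safer option (a sketch of the standard log-sum-exp trick, computing the same quantity as the original batch_loss) is to work in log space and shift by the maximum before exponentiating:
import tensorflow.keras.backend as K

def batch_loss(self, y_true, y_pred_batch):
    # Shifting by the max leaves the softmax unchanged but keeps exp() finite
    shifted = y_pred_batch - K.max(y_pred_batch)
    batch_log_softmax = shifted - K.log(K.sum(K.exp(shifted)))
    return K.sum(batch_log_softmax)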

Keras Model using Tensorflow Distribution for loss fails with batch size > 1

I'm trying to use a distribution from tensorflow_probability to define a custom loss function in Keras. More specifically, I'm trying to build a Mixture Density Network.
My model works on a toy dataset when batch_size = 1 (it learns to predict the correct mixture distribution for y using x). But it "fails" when batch_size > 1 (it predicts the same distribution for all y, ignoring x). This makes me think my problem has to do with batch_shape vs. sample_shape.
To reproduce:
import random
import keras
from keras import backend as K
from keras.layers import Dense, Activation, LSTM, Input, Concatenate, Reshape, concatenate, Flatten, Lambda
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.models import Sequential, Model
import tensorflow
import tensorflow_probability as tfp
tfd = tfp.distributions
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# generate toy dataset
random.seed(12902)
n_obs = 20000
x = np.random.uniform(size=(n_obs, 4))
df = pd.DataFrame(x, columns = ['x_{0}'.format(i) for i in np.arange(4)])
# 2 latent classes, with noisy assignment based on x_0, x_1, (x_2 and x_3 are noise)
df['latent_class'] = 0
df.loc[df.x_0 + df.x_1 + np.random.normal(scale=.05, size=n_obs) > 1, 'latent_class'] = 1
df.latent_class.value_counts()
# Latent class determines which mixture distribution we draw from
d0 = tfd.MixtureSameFamily(
mixture_distribution=tfd.Categorical(probs=[0.3, 0.7]),
components_distribution=tfd.Normal(
loc=[-1., 1], scale=[0.1, 0.5]))
d0_samples = d0.sample(sample_shape=(df.latent_class == 0).sum()).numpy()
d1 = tfd.MixtureSameFamily(
mixture_distribution=tfd.Categorical(probs=[0.5, 0.5]),
components_distribution=tfd.Normal(
loc=[-2., 2], scale=[0.2, 0.6]))
d1_samples = d1.sample(sample_shape=(df.latent_class == 1).sum()).numpy()
df.loc[df.latent_class == 0, 'y'] = d0_samples
df.loc[df.latent_class == 1, 'y'] = d1_samples
fig, ax = plt.subplots()
bins = np.linspace(-4, 5, 9*4 + 1)
df.y[df.latent_class == 0].hist(ax=ax, bins=bins, label='Class 0', alpha=.4, density=True)
df.y[df.latent_class == 1].hist(ax=ax, bins=bins, label='Class 1', alpha=.4, density=True)
ax.legend();
# mixture density network
N_COMPONENTS = 2 # number of components in the mixture
input_feature_space = 4
flat_input = Input(shape=(input_feature_space,),
batch_shape=(None, input_feature_space),
name='inputs')
x = Dense(6, activation='relu',
kernel_initializer='glorot_uniform',
bias_initializer='ones')(flat_input)
x = Dense(6, activation='relu',
kernel_initializer='glorot_uniform',
bias_initializer='ones')(x)
# 3 params per component: weight, loc, scale
output = Dense(N_COMPONENTS*3,
kernel_initializer='glorot_uniform',
bias_initializer='ones')(x)
model = Model(inputs=[flat_input],
outputs=[output])
I suspect the problem is in the next 3 functions:
def get_mixture_coef(output, num_components):
    """
    Extract mixture params from output
    """
    out_pi = output[:, :num_components]
    out_sigma = output[:, num_components:2*num_components]
    out_mu = output[:, 2*num_components:]
    # use softmax to normalize pi into prob distribution
    max_pi = K.max(out_pi, axis=1, keepdims=True)
    out_pi = out_pi - max_pi
    out_pi = K.exp(out_pi)
    normalize_pi = 1 / K.sum(out_pi, axis=1, keepdims=True)
    out_pi = normalize_pi * out_pi
    # use exp to ensure sigma is pos
    out_sigma = K.exp(out_sigma)
    return out_pi, out_sigma, out_mu

def get_lossfunc(out_pi, out_sigma, out_mu, y):
    d0 = tfd.MixtureSameFamily(
        mixture_distribution=tfd.Categorical(
            probs=out_pi),
        components_distribution=tfd.Normal(
            loc=out_mu, scale=out_sigma,
        ),
    )
    # I suspect the problem is here
    return -1 * d0.log_prob(y)

def mdn_loss(num_components):
    def loss(y_true, y_pred):
        out_pi, out_sigma, out_mu = get_mixture_coef(y_pred, num_components)
        return get_lossfunc(out_pi, out_sigma, out_mu, y_true)
    return loss
opt = Adam(lr=.001)
model.compile(
optimizer=opt,
loss = mdn_loss(N_COMPONENTS),
)
es = EarlyStopping(monitor='val_loss',
min_delta=1e-5,
patience=5,
verbose=1, mode='auto')
validation = .15
validate_idx = np.random.choice(df.index.values,
size=int(validation * df.shape[0]),
replace=False)
train_idx = [i for i in df.index.values if i not in validate_idx]
x_cols = ['x_0', 'x_1', 'x_2', 'x_3']
model.fit(x=df.loc[train_idx, x_cols].values,
y=df.loc[train_idx, 'y'].values[:, np.newaxis],
validation_data=(
df.loc[validate_idx, x_cols].values,
df.loc[validate_idx, 'y'].values[:, np.newaxis]),
# model works when batch_size = 1
# model fails when batch_size > 1
epochs=2, batch_size=1, verbose=1, callbacks=[es])
def sample(output, n_samples, num_components):
    """Sample from a mixture distribution parameterized by
    model output."""
    pi, sigma, mu = get_mixture_coef(output, num_components)
    d0 = tfd.MixtureSameFamily(
        mixture_distribution=tfd.Categorical(
            probs=pi),
        components_distribution=tfd.Normal(
            loc=mu,
            scale=sigma))
    return d0.sample(sample_shape=n_samples).numpy()
yhat = model.predict(df.loc[train_idx, x_cols].values)
out_pi, out_sigma, out_mu = get_mixture_coef(yhat, 2)
latent_1_samples = sample(yhat[:1], n_samples=1000, num_components=2)
latent_1_samples = pd.DataFrame({'latent_1_samples': latent_1_samples.ravel()})
fig, ax = plt.subplots()
bins = np.linspace(-4, 5, 9*4 + 1)
latent_1_samples.latent_1_samples.hist(ax=ax, bins=bins, label='Class 1: yHat', alpha=.4, density=True)
df.y[df.latent_class == 0].hist(ax=ax, bins=bins, label='Class 0: True', density=True, histtype='step')
df.y[df.latent_class == 1].hist(ax=ax, bins=bins, label='Class 1: True', density=True, histtype='step')
ax.legend();
Thanks in advance!
Update
I found two ways to solve the problem, guided by this answer. Both solutions point to the fact that Keras is awkwardly broadcasting y to match y_pred:
def get_lossfunc(out_pi, out_sigma, out_mu, y):
    d0 = tfd.MixtureSameFamily(
        mixture_distribution=tfd.Categorical(
            probs=out_pi),
        components_distribution=tfd.Normal(
            loc=out_mu, scale=out_sigma,
        ),
    )
    # this also works:
    # return -1 * d0.log_prob(tensorflow.transpose(y))
    return -1 * d0.log_prob(y[:, 0])
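A quick shape check (a self-contained sketch with the same kind of tfd setup) makes the broadcasting problem visible:
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

batch = 4
pi = tf.ones((batch, 2)) / 2.
mu, sigma = tf.zeros((batch, 2)), tf.ones((batch, 2))
d0 = tfd.MixtureSameFamily(
    mixture_distribution=tfd.Categorical(probs=pi),
    components_distribution=tfd.Normal(loc=mu, scale=sigma))
y = tf.zeros((batch, 1))            # the shape Keras feeds to the loss
print(d0.log_prob(y).shape)         # (4, 4): y is broadcast against every row's mixture
print(d0.log_prob(y[:, 0]).shape)   # (4,): one log-prob per sample, as intended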
Specifying the workaround here (Answer section), even though it is specified by Dan in the question, for the benefit of the community.
The problem of predicting the same distribution for all y (ignoring x) can be resolved in two ways.
Code for Solution 1 is mentioned below:
def get_lossfunc(out_pi, out_sigma, out_mu, y):
    d0 = tfd.MixtureSameFamily(
        mixture_distribution=tfd.Categorical(
            probs=out_pi),
        components_distribution=tfd.Normal(
            loc=out_mu, scale=out_sigma,
        ),
    )
    return -1 * d0.log_prob(tensorflow.transpose(y))
Code for Solution 2 is mentioned below:
def get_lossfunc(out_pi, out_sigma, out_mu, y):
    d0 = tfd.MixtureSameFamily(
        mixture_distribution=tfd.Categorical(
            probs=out_pi),
        components_distribution=tfd.Normal(
            loc=out_mu, scale=out_sigma,
        ),
    )
    return -1 * d0.log_prob(y[:, 0])
Hope this helps. Happy Learning!

How to multiply a layer by a constant vector element wise in Keras?

I want to make a weighted average ensemble of 3 of my trained models. So, I want first to multiply the softmax output of a model (element-wise) by a vector and then average the 3 weighted outputs of the 3 models.
I used the following code to multiply the output of the first model by its weight vector:
from keras.layers import Multiply, Average
resnet_weights = np.asarray([[0.91855, 0.99485, 0.89065, 0.96525, 0.98005,
0.93645, 0.6149, 0.934, 0.92505, 0.785, 0.85]], np.float32)
resnet_weight_tensor=tf.constant(resnet_weights, np.float32)
sess = tf.InteractiveSession()
print(resnet_weight_tensor.eval())
sess.close()
resnet_weighted = Multiply()([finetuned_model.layers[-1].output, resnet_weight_tensor])
print(resnet_weighted)
new_model=Model(model.input, resnet_weighted)
However, I'm stuck with the following error:
What can I do?
Use Lambda instead of Multiply, and K.constant instead of tf.constant (it is backend-neutral):
resnet_weight_tensor=K.constant(resnet_weights, 'float32')
out = finetuned_model.layers[-1].output
resnet_weighted = Lambda(lambda x: x * resnet_weight_tensor)(out)
FULL EXAMPLE:
## BUILD MODELS
batch_size = 32
num_batches = 100
input_shape = (4,)
num_classes = 3
model_1 = make_model(input_shape, 8, num_classes)
model_2 = make_model(input_shape, 10, num_classes)
model_3 = make_model(input_shape, 12, num_classes)
## BUILD ENSEMBLE
models = (model_1, model_2, model_3)
models_ins = [model.input for model in models]
models_outs = [model.output for model in models]
outputs_weights = [np.random.random((batch_size, num_classes)),
np.random.random((batch_size, num_classes)),
np.random.random((batch_size, num_classes))]
outs_avg = model_outputs_average(models, outputs_weights)
final_out = Dense(num_classes, activation='softmax')(outs_avg)
model_ensemble = Model(inputs=models_ins, outputs=final_out)
model_ensemble.compile('adam', loss='categorical_crossentropy')
### TEST ENSEMBLE
x1 = np.random.randn(batch_size, *input_shape) # toy data
x2 = np.random.randn(batch_size, *input_shape)
x3 = np.random.randn(batch_size, *input_shape)
y = np.random.randint(0,2,(batch_size, num_classes)) # toy labels
model_ensemble.fit([x1,x2,x3], y)
Verify averaging:
[print(layer.name) for layer in model_ensemble.layers] # show layer names
preouts1 = get_layer_outputs(model_ensemble, 'lambda_1', [x1,x2,x3])
preouts2 = get_layer_outputs(model_ensemble, 'lambda_2', [x1,x2,x3])
preouts3 = get_layer_outputs(model_ensemble, 'lambda_3', [x1,x2,x3])
preouts_avg = get_layer_outputs(model_ensemble, 'average_1',[x1,x2,x3])
preouts = np.asarray([preouts1, preouts2, preouts3])
sum_of_diff_of_means = np.sum(np.mean(preouts, axis=0) - preouts_avg)
print(np.sum(np.mean([preouts1, preouts2, preouts3],axis=0) - preouts_avg))
# 4.69e-07
Functions used:
def make_model(input_shape, dense_dim, num_classes=3):
    ipt = Input(shape=input_shape)
    x = Dense(dense_dim, activation='relu')(ipt)
    out = Dense(num_classes, activation='softmax')(x)
    model = Model(ipt, out)
    model.compile('adam', loss='categorical_crossentropy')
    return model

def model_outputs_average(models, outputs_weights):
    outs = [model.output for model in models]
    out_shape = K.int_shape(outs[0])[1:]  # ignore batch dim
    assert all([(K.int_shape(out)[1:] == out_shape) for out in outs]), \
        "All model output shapes must match"

    outs_weights = [K.constant(w, 'float32') for w in outputs_weights]
    ow_shape = K.int_shape(outs_weights[0])
    assert all([(K.int_shape(w) == ow_shape) for w in outs_weights]), \
        "All outputs_weights and model.output shapes must match"

    weights_layers = [Lambda(lambda x: x * ow)(out) for ow, out
                      in zip(outs_weights, outs)]
    return Average()(weights_layers)

def get_layer_outputs(model, layer_name, input_data, train_mode=False):
    outputs = [layer.output for layer in model.layers if layer_name in layer.name]
    layers_fn = K.function([model.input, K.learning_phase()], outputs)
    return [layers_fn([input_data, int(train_mode)])][0][0]
The bug is possibly caused by mixing the Keras API and the TensorFlow API, since your resnet_weight_tensor is a tensor from the TensorFlow API, while finetuned_model.layers[-1].output is the output of a Keras layer. Some discussion can be found in issue 7362.
One workaround is to wrap resnet_weight_tensor in a Keras Input layer.
from keras.layers import Multiply, Average, Input
resnet_weights = np.asarray([[0.91855, 0.99485, 0.89065, 0.96525, 0.98005,
0.93645, 0.6149, 0.934, 0.92505, 0.785, 0.85]], np.float32)
resnet_weight_tensor=tf.constant(resnet_weights, np.float32)
resnet_weight_input = Input(tensor=resnet_weight_tensor)
sess = tf.InteractiveSession()
print(resnet_weight_tensor.eval())
sess.close()
resnet_weighted = Multiply()([finetuned_model.layers[-1].output, resnet_weight_input])
print(resnet_weighted)
new_model=Model([model.input, resnet_weight_input], resnet_weighted)
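As a side note, on TF 2.x with tf.keras the element-wise scaling by a constant can usually be written directly on the layer output (a sketch only; not verified against the TF1-style session code above, and assuming finetuned_model is the model being wrapped):
resnet_weight_tensor = tf.constant(resnet_weights, dtype=tf.float32)
resnet_weighted = finetuned_model.layers[-1].output * resnet_weight_tensor
new_model = tf.keras.models.Model(finetuned_model.input, resnet_weighted)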