Tensorflow with custom loss containing multiple inputs - Graph disconnected error

I have a CNN that outputs a scalar; this output is concatenated with the output of an MLP and then fed to another dense layer. I get a Graph Disconnected error.
Please advise how to fix this. Thanks in advance.
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D, Dense, Flatten, concatenate, Input
import tensorflow as tf
tf.keras.backend.clear_session()
#----custom function
def custom_loss(ytrue, ypred):
    loss = tf.math.log(1. + ytrue) - tf.math.log(1. + ypred)
    loss = tf.math.square(loss)
    loss = tf.math.reduce_mean(loss)
    return loss
#------------------
cnnin = Input(shape=(10, 10, 1))
x = Conv2D(8, 4)(cnnin)
x = Conv2D(16, 4)(x)
x = Conv2D(32, 2)(x)
x = Conv2D(64, 2)(x)
x = Flatten()(x)
x = Dense(4)(x)
x = Dense(4, activation="relu")(x)
cnnout = Dense(1, activation="linear")(x)
cnnmodel= Model(cnnin, cnnout, name="cnn_model")
yt = Input(shape=(2, )) #---dummy input
#---mlp start
mlpin = Input(shape=(2, ), name="mlp_input")
z = Dense(4, activation="sigmoid")(mlpin)
z = Dense(4, activation = "softmax")(z)
mlpout = Dense(1, activation="linear")(z)
mlpmodel = Model(mlpin, mlpout, name="mlp_model")
#----concatenate
combinedout = concatenate([mlpmodel.output, cnnmodel.output ])
x = Dense(4, activation="sigmoid")(combinedout)
finalout = Dense(2, activation="linear")(x)
model = Model( [mlpin, cnnin], finalout)
model.add_loss(custom_loss(yt, finalout))
model.compile(optimizer='adam', learning_rate=1e-3, initialization="glorotnorm",
              loss=None)
Graph disconnected: cannot obtain value for tensor Tensor("input_8:0", shape=(None, 2), dtype=float32) at layer "input_8". The following previous layers were accessed without issue: ['input_7', 'conv2d_12', 'conv2d_13', 'conv2d_14', 'conv2d_15', 'flatten_3', 'mlp_input', 'dense_24', 'dense_27', 'dense_25', 'dense_28', 'dense_29', 'dense_26', 'concatenate_3', 'dense_30', 'dense_31']

The graph is disconnected because yt is an Input tensor that is not one of the model's inputs, so the loss added with add_loss cannot be traced back through the model graph. Instead, you can customize what happens in Model.fit (see https://www.tensorflow.org/guide/keras/customizing_what_happens_in_fit): create a new class that subclasses keras.Model, override the method train_step(self, data), and return a dictionary mapping metric names (including the loss) to their current values.
For example, with your models:
loss_tracker = tf.keras.metrics.Mean(name = "custom_loss")
class TestModel(tf.keras.Model):
    def __init__(self, model1):
        super(TestModel, self).__init__()
        self.model1 = model1

    def compile(self, optimizer):
        super(TestModel, self).compile()
        self.optimizer = optimizer

    def train_step(self, data):
        x, y = data
        with tf.GradientTape() as tape:
            ypred = self.model1([x], training=True)
            loss_value = custom_loss(y, ypred)
        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss_value, trainable_vars)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        loss_tracker.update_state(loss_value)
        return {"loss": loss_tracker.result()}
import numpy as np
x = np.random.rand(6, 10,10,1)
x2 = np.random.rand(6,2)
y = tf.ones((6,2))
model = Model( [mlpin, cnnin], finalout)
trainable_model = TestModel(model)
trainable_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = 0.0001))
trainable_model.fit(x=(x2, x), y = y, epochs=5)
Gives the following output:
Epoch 1/5
1/1 [==============================] - 0s 382ms/step - loss: 0.2641
Epoch 2/5
1/1 [==============================] - 0s 4ms/step - loss: 0.2640
Epoch 3/5
1/1 [==============================] - 0s 6ms/step - loss: 0.2638
Epoch 4/5
1/1 [==============================] - 0s 7ms/step - loss: 0.2635
Epoch 5/5
1/1 [==============================] - 0s 6ms/step - loss: 0.2632
<tensorflow.python.keras.callbacks.History at 0x14c69572688>
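If you also want evaluate() or validation_data to work with this wrapper, you can override test_step in the same way and expose the tracker through the metrics property so that Keras resets it between epochs. A minimal sketch (assuming the same custom_loss and loss_tracker as above; these methods go inside the TestModel class):
    def test_step(self, data):
        x, y = data
        ypred = self.model1(x, training=False)
        loss_value = custom_loss(y, ypred)
        loss_tracker.update_state(loss_value)
        return {"loss": loss_tracker.result()}

    @property
    def metrics(self):
        # Anything listed here is reset by Keras at the start of each epoch,
        # so the reported loss is a per-epoch mean rather than a running mean.
        return [loss_tracker]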

Related

eager mode and keras.fit have different results

I am trying to convert model.fit() in Keras to eager-mode training. The model is an autoencoder with one encoder and two decoders, and the decoders have different loss functions. The loss functions for the decoders are the same in the eager-mode version and in the model.fit() version; I tried to set everything up exactly as in model.fit(), but the losses come out different. I would really appreciate help with this.
The link for google colab: https://colab.research.google.com/drive/1XNOwJ9oVgs1z9qqXIs_ldnKuSm3Dn2Ud?usp=sharing
Below, the definition and training of the model are shown; I use model.fit() for training. The output at the end shows the loss values.
def fit_ae(x_unlab, p_m, alpha, parameters):
    # Parameters
    _, dim = x_unlab.shape
    epochs = parameters['epochs']
    batch_size = parameters['batch_size']
    # Build model
    inputs = contrib_layers.Input(shape=(dim,))
    # Encoder
    h = contrib_layers.Dense(int(256), activation='relu', name='encoder1')(inputs)
    h = contrib_layers.Dense(int(128), activation='relu', name='encoder2')(h)
    h = contrib_layers.Dense(int(26), activation='relu', name='encoder3')(h)
    # Mask estimator
    output_1 = contrib_layers.Dense(dim, activation='sigmoid', name='mask')(h)
    # Feature estimator
    output_2 = contrib_layers.Dense(dim, activation='sigmoid', name='feature')(h)
    # Projection network
    model = Model(inputs=inputs, outputs=[output_1, output_2])
    model.compile(optimizer='rmsprop',
                  loss={'mask': 'binary_crossentropy',
                        'feature': 'mean_squared_error'},
                  loss_weights={'mask': 1, 'feature': alpha})
    m_unlab = mask_generator(p_m, x_unlab)
    m_label, x_tilde = pretext_generator(m_unlab, x_unlab)
    # Fit model on unlabeled data
    model.fit(x_tilde, {'mask': m_label, 'feature': x_unlab}, epochs=epochs, batch_size=batch_size)
########### OUTPUT
Epoch 1/15
4/4 [==============================] - 1s 32ms/step - loss: 1.0894 - mask_loss: 0.6560 - feature_loss: 0.2167
Epoch 2/15
4/4 [==============================] - 0s 23ms/step - loss: 0.6923 - mask_loss: 0.4336 - feature_loss: 0.1293
Epoch 3/15
4/4 [==============================] - 0s 26ms/step - loss: 0.4720 - mask_loss: 0.3022 - feature_loss: 0.0849
Epoch 4/15
4/4 [==============================] - 0s 23ms/step - loss: 0.4054 - mask_loss: 0.2581 - feature_loss: 0.0736
In the following code, I implemented the same model in eager mode. I set the optimizer and loss functions to be the same as in the code above, and the training data are the same for both models.
###################################################### MODEL AUTOENCODER ============================================
def eager_ae(x_unlab, p_m, alpha, parameters):
    # import pdb; pdb.set_trace()
    _, dim = x_unlab.shape
    epochs = parameters['epochs']
    batch_size = parameters['batch_size']
    E = keras.Sequential([
        Input(shape=[dim, ]),
        Dense(256, activation='relu'),
        Dense(128, activation='relu'),
        Dense(26, activation='relu'),
    ])
    # Mask estimator
    output_1 = keras.Sequential([
        Dense(dim, activation='sigmoid'),
    ])
    # Feature estimator
    output_2 = keras.Sequential([
        Dense(dim, activation='sigmoid'),
    ])
    optimizer = tf.keras.optimizers.RMSprop()
    loss_mask = tf.keras.losses.BinaryCrossentropy()
    loss_feature = tf.keras.losses.MeanSquaredError()
    # Generate corrupted samples
    m_unlab = mask_generator(p_m, x_unlab)
    m_label, x_tilde = pretext_generator(m_unlab, x_unlab)
    for epoch in range(epochs):
        loss_metric = tf.keras.metrics.Mean(name='train_loss')
        len_batch = range(int(x_unlab.shape[0] / batch_size))
        for i in len_batch:
            samples = x_tilde[i * batch_size:(i + 1) * batch_size]
            mask = m_label[i * batch_size:(i + 1) * batch_size]
            # train_step(samples, tgt)
            with tf.GradientTape() as tape:
                latent = E(samples, training=True)
                out_mask = output_1(latent)
                out_feat = output_2(latent)
                # import pdb; pdb.set_trace()
                lm = loss_mask(out_mask, tf.Variable(mask, dtype=tf.float32))
                lf = loss_feature(out_feat, tf.Variable(samples, dtype=tf.float32))
                pred_loss = lm + alpha * lf
            trainable_vars = E.trainable_weights + output_1.trainable_weights + output_2.trainable_weights
            grads = tape.gradient(pred_loss, trainable_vars)
            optimizer.apply_gradients(zip(grads, trainable_vars))
            loss_metric.update_state(pred_loss)
        print(f'Epoch {epoch}, Loss {loss_metric.result()}')
    return E
############# OUTPUT
Epoch 0, Loss 7.902271747589111
Epoch 1, Loss 5.336598873138428
Epoch 2, Loss 2.880791664123535
Epoch 3, Loss 1.9296690225601196
Epoch 4, Loss 1.6377944946289062
Epoch 5, Loss 1.5342860221862793
Epoch 6, Loss 1.5015968084335327
Epoch 7, Loss 1.4912563562393188
The total loss in the first code ends up small (≈0.25), while the total loss in the second code stays above 1 (≈1.3). I cannot find the issue in my second implementation (the second code).

Stateful LSTM VAE: Invalid argument: You must feed a value for placeholder tensor 'decoder_input' with dtype float and shape [batch_size, latent_dim]

I am solving a time-series problem using an LSTM VAE (variational autoencoder). I have built my VAE model as below.
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
# Imports assumed by the code below (not shown in the original post)
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Dense, Lambda, LSTM, LeakyReLU,
                                     Dropout, RepeatVector, TimeDistributed)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K

class VAE:
    def __init__(self,
                 hidden_layer_units,
                 hidden_layer_leakyrelu_alphas,
                 hidden_layer_dropout_rates,
                 batch_size,
                 time_steps,
                 num_features,
                 is_stateful_learning):
        self.hidden_layer_units = hidden_layer_units
        self.hidden_layer_leakyrelu_alphas = hidden_layer_leakyrelu_alphas
        self.hidden_layer_dropout_rates = hidden_layer_dropout_rates
        self.encoder_num_layers = 0
        self.latent_space_dim = 0
        vae_total_layers = len(hidden_layer_units)
        if 0 < vae_total_layers:
            self.encoder_num_layers = int((vae_total_layers - 1) / 2)
            self.latent_space_dim = self.hidden_layer_units[self.encoder_num_layers]
        self.batch_size = batch_size
        self.time_steps = time_steps
        self.num_features = num_features
        self.is_stateful_learning = is_stateful_learning
        self.encoder = None
        self.decoder = None
        self.model = None
        self.model_input = None
        self.model_output = None
        self.mu = None
        self.log_variance = None
        self.kulback_coef = 0.0001
        self._build()

    def summary(self):
        self.encoder.summary()
        self.decoder.summary()
        self.model.summary()

    def compile(self, learning_rate=0.001):
        optimizer = Adam(learning_rate=learning_rate)
        self.model.compile(optimizer=optimizer,
                           loss=self._calculate_combined_loss,
                           metrics=[self._calculate_reconstruction_loss, self._calculate_kl_loss])

    def _build(self):
        self._build_encoder()
        self._build_decoder()
        self._build_autoencoder()

    def _build_encoder(self):
        encoder_input = self._add_encoder_input()
        lstm_layers = self._add_encoder_lstm_layers(encoder_input)
        bottleneck = self._add_bottleneck(lstm_layers)
        self.model_input = encoder_input
        self.encoder = Model(encoder_input, bottleneck, name="encoder")

    def _build_decoder(self):
        decoder_input = self._add_decoder_input()
        repeater_layer = self._add_repeater_layer(decoder_input)
        lstm_layer = self._add_decoder_lstm_layer(repeater_layer)
        decoder_output = self._add_decoder_output(lstm_layer)
        self.decoder = Model(decoder_input, decoder_output, name="decoder")

    def _build_autoencoder(self):
        model_input = self.model_input
        encoder_output = self.encoder(model_input)
        model_output = self.decoder(encoder_output)
        self.model_output = model_output
        self.model = Model(model_input, model_output, name="autoencoder")

    def _add_encoder_input(self):
        if self.is_stateful_learning:
            x = Input(batch_shape=(self.batch_size, self.time_steps, self.num_features), name="encoder_input")
        else:
            x = Input(shape=(self.time_steps, self.num_features), name="encoder_input")
        return x

    def _add_encoder_lstm_layers(self, encoder_input):
        """Create all LSTM layers in the encoder."""
        x = encoder_input
        for layer_index, units in enumerate(self.hidden_layer_units[:self.encoder_num_layers]):
            lstm_params = {}
            if layer_index < self.encoder_num_layers - 1:
                lstm_params["return_sequences"] = True
            if self.is_stateful_learning:
                lstm_params["stateful"] = True
            x = LSTM(units=units, **lstm_params)(x)
            x = LeakyReLU(alpha=self.hidden_layer_leakyrelu_alphas[layer_index])(x)
            x = Dropout(rate=self.hidden_layer_dropout_rates[layer_index])(x)
        return x

    def _add_bottleneck(self, x):
        """Add bottleneck with Gaussian sampling (Dense layer)."""
        self.mu = Dense(self.latent_space_dim, name="mu")(x)
        self.log_variance = Dense(self.latent_space_dim, name="log_variance")(x)
        x = Lambda(self.sample_point_from_normal_distribution, name="encoder_output")([self.mu, self.log_variance])
        return x

    def sample_point_from_normal_distribution(self, args):
        mu, log_variance = args
        epsilon = K.random_normal(shape=K.shape(mu), mean=0., stddev=1.)
        sampled_point = mu + K.exp(log_variance / 2) * epsilon
        return sampled_point

    def _add_decoder_input(self):
        if self.is_stateful_learning:
            x = Input(batch_shape=(self.batch_size, self.latent_space_dim), name="decoder_input")
        else:
            x = Input(shape=(self.latent_space_dim), name="decoder_input")
        return x

    def _add_repeater_layer(self, decoder_input):
        return RepeatVector(self.time_steps)(decoder_input)

    def _add_decoder_lstm_layer(self, repeater_layer):
        x = repeater_layer
        for layer_index, units in enumerate(self.hidden_layer_units[self.encoder_num_layers + 1:]):
            lstm_params = {}
            if self.is_stateful_learning:
                # stateful build
                lstm_params = {'stateful': True, 'return_sequences': True}
            else:
                lstm_params["return_sequences"] = True
            layer_no = layer_index + self.encoder_num_layers + 1
            x = LSTM(units=units, **lstm_params)(x)
            x = LeakyReLU(alpha=self.hidden_layer_leakyrelu_alphas[layer_no])(x)
            x = Dropout(rate=self.hidden_layer_dropout_rates[layer_no])(x)
        return x

    def _add_decoder_output(self, lstm_layer):
        return TimeDistributed(Dense(1))(lstm_layer)

    def _calculate_combined_loss(self, y_target, y_predicted):
        reconstruction_loss = self._calculate_reconstruction_loss(y_target, y_predicted)
        kl_loss = self._calculate_kl_loss(y_target, y_predicted)
        combined_loss = reconstruction_loss + (self.kulback_coef * kl_loss)
        return combined_loss

    def _calculate_reconstruction_loss(self, y_target, y_predicted):
        error = y_target - y_predicted
        reconstruction_loss = K.mean(K.square(error), axis=1)
        return reconstruction_loss

    def _calculate_kl_loss(self, y_target, y_predicted):
        kl_loss = -0.5 * K.sum(1 + self.log_variance - K.square(self.mu) - K.exp(self.log_variance), axis=1)
        return kl_loss

# Build Variational AutoEncoder (VAE) LSTM model:
def build_lstm_neural_network(lstm_layer_units=[], leakyrelu_layer_alphas=[], dropout_layer_rates=[],
                              number_of_sequences=32, time_steps=32, data_dim=1, is_stateful_learning=False):
    vae = VAE(
        hidden_layer_units=lstm_layer_units,
        hidden_layer_leakyrelu_alphas=leakyrelu_layer_alphas,
        hidden_layer_dropout_rates=dropout_layer_rates,
        batch_size=number_of_sequences,
        time_steps=time_steps,
        num_features=data_dim,
        is_stateful_learning=is_stateful_learning
    )
    vae.compile(learning_rate)
    vae.summary()
    return vae.model
The model training block looks like this:
# configuration
nn_lstm_layer_units = [160, 3, 160]
nn_leakyrelu_layer_alphas = [0.0, 0.0, 0.0]
nn_dropout_layer_rates = [0.3, 0.0, 0.3]
batch_size = 96
win_length = 64
num_features = 6 # You can use single variate Timeseries data as well, num_features = 1
epochs = 782
learning_rate = 0.0001
want_stateful_learning = True
# Build LSTM VAE model
model = build_lstm_neural_network(nn_lstm_layer_units, nn_leakyrelu_layer_alphas, nn_dropout_layer_rates, batch_size,
win_length, num_features, want_stateful_learning)
TIME_STEPS = win_length
# Generated training sequences for use in the model.
def create_sequences(values, time_steps=TIME_STEPS):
    output = []
    for i in range(len(values) - time_steps + 1):
        output.append(values[i: (i + time_steps)])
    return np.stack(output)
x_train = create_sequences(x_train)
x_val = create_sequences(x_val)
callbacks = []
unfit_train_record_count = 0
unfit_val_record_count = 0
if want_stateful_learning:
    # Stateful learning:
    # adjust train data size (should be in multiples of batch size)
    unfit_train_record_count = len(x_train) % batch_size
    unfit_val_record_count = len(x_val) % batch_size
    # Reset states of the stateful model on epoch end
    stateful_model_reset_states = LambdaCallback(on_epoch_end=lambda batch, logs: model.reset_states())
    callbacks.append(stateful_model_reset_states)
early_stopping = EarlyStopping(monitor=monitor, patience=patience)
callbacks.append(early_stopping)
# Model training
history = model.fit(x=x_train[unfit_train_record_count:], y=x_train[unfit_train_record_count:, :, [0]],
                    validation_data=(x_val[unfit_val_record_count:], x_val[unfit_val_record_count:, :, [0]]),
                    batch_size=batch_size, epochs=epochs, shuffle=False, callbacks=callbacks)
The stateless mode of the model works as expected, but the stateful mode throws the error below:
1632/1632 [==============================] - ETA: 0s - loss: 0.2447 - _calculate_reconstruction_loss: 0.2447 - _calculate_kl_loss: 0.0326
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: You must feed a value for placeholder tensor 'decoder_input' with dtype float and shape [96,3]
[[{{node decoder_input}}]]
[[metrics/_calculate_reconstruction_loss/Identity/_229]]
(1) Invalid argument: You must feed a value for placeholder tensor 'decoder_input' with dtype float and shape [96,3]
[[{{node decoder_input}}]]
The environment used: Python 3.8.12, tensorflow-gpu 2.5, cuDNN 8.2.1.32.
I am not clear why the stateful model runs one epoch over the training data, but throws the error as soon as it starts to process the validation data.
I have had similar experiences when the dataset and loss function are not suitable. When I tried to reproduce this, the possible outcomes were the loss value not changing, the loss becoming NaN, or an error during validation.
That can happen when there is no value, when something does not match, or when the neurons are not being updated; using TensorFlow 2.x makes this a lot easier.
One possibility is a validation mismatch: it works during training but results in errors during validation:
Epoch 1/100
2022-01-23 21:04:59.846791: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8100
1/1 [==============================] - ETA: 0s - loss: 3.1866 - accuracy: 0.0000e+00Traceback (most recent call last):
Another possibility is that the loss function does not match, in which case the neurons may not be updated:
Epoch 1/100
2022-01-23 21:08:23.330068: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8100
1/1 [==============================] - 3s 3s/step - loss: 13.7138 - accuracy: 0.2000 - val_loss: 8.2133 - val_accuracy: 0.0000e+00
Epoch 2/100
1/1 [==============================] - 0s 65ms/step - loss: 7.7745 - accuracy: 0.0000e+00 - val_loss: 8.0456 - val_accuracy: 0.0000e+00
I solved the problem by changing the loss calculation logic: instead of defining the functions that calculate the reconstruction and KL losses inside the VAE class, I moved the loss calculation outside the VAE class, as below.
# Build Variational AutoEncoder(VAE) LSTM Model:
def build_lstm_neural_network(lstm_layer_units=[], leakyrelu_layer_alphas=[], dropout_layer_rates=[],
                              number_of_sequences=32, time_steps=32, data_dim=1, is_stateful_learning=False):
    vae = VAE(
        hidden_layer_units=lstm_layer_units,
        hidden_layer_leakyrelu_alphas=leakyrelu_layer_alphas,
        hidden_layer_dropout_rates=dropout_layer_rates,
        batch_size=number_of_sequences,
        time_steps=time_steps,
        num_features=data_dim,
        is_stateful_learning=is_stateful_learning
    )
    # Add reconstruction loss
    error = vae.model_input - vae.model_output
    reconstruction_loss = K.mean(K.square(error))
    vae.model.add_loss(reconstruction_loss)
    vae.model.add_metric(reconstruction_loss, name='mse_loss', aggregation='mean')
    # Add KL loss
    kl_loss = kl_beta * K.mean(-0.5 * K.sum(1 + vae.log_variance - K.square(vae.mu) - K.exp(vae.log_variance), axis=1), axis=0)
    vae.model.add_loss(kl_loss)
    vae.model.add_metric(kl_loss, name='kl_loss', aggregation='mean')
    optimizer = Adam(learning_rate=vae.learning_rate, clipvalue=vae.clipvalue)
    vae.model.compile(loss=None, optimizer=optimizer)
    vae.summary()
    return vae.model
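Since all losses are now attached with add_loss and the model is compiled with loss=None, fit() no longer needs an explicit target tensor. A minimal usage sketch (assuming kl_beta, the configuration values, x_train, and the callbacks are defined as elsewhere in this post):
model = build_lstm_neural_network(nn_lstm_layer_units, nn_leakyrelu_layer_alphas,
                                  nn_dropout_layer_rates, batch_size,
                                  win_length, num_features, want_stateful_learning)
# Keras derives the reconstruction and KL terms from the model's own
# input/output tensors, so only x is passed to fit().
history = model.fit(x=x_train[unfit_train_record_count:],
                    batch_size=batch_size, epochs=epochs,
                    shuffle=False, callbacks=callbacks)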

What does it mean when the loss starts going up again?

I am running the code from https://www.tensorflow.org/tutorials/text/text_generation. I will copy it at the bottom of the question. If I change the EPOCHS line to
EPOCHS = 100
something odd happens to the loss. It starts by going down, as in:
Epoch 1/100
172/172 [==============================] - 301s 2s/step - loss: 2.7219
Epoch 2/100
172/172 [==============================] - 328s 2s/step - loss: 1.9963
Epoch 3/100
172/172 [==============================] - 344s 2s/step - loss: 1.7313
Epoch 4/100
172/172 [==============================] - 321s 2s/step - loss: 1.5778
Epoch 5/100
172/172 [==============================] - 325s 2s/step - loss: 1.4840
reaching its lowest level at epoch 46/100, when the loss is 0.6233. It then goes back up again, finishing with:
Epoch 96/100
172/172 [==============================] - 292s 2s/step - loss: 0.8749
Epoch 97/100
172/172 [==============================] - 292s 2s/step - loss: 0.8933
Epoch 98/100
172/172 [==============================] - 292s 2s/step - loss: 0.9073
Epoch 99/100
172/172 [==============================] - 292s 2s/step - loss: 0.9181
Epoch 100/100
172/172 [==============================] - 292s 2s/step - loss: 0.9298
Why is it doing this and what does it mean?
import tensorflow as tf
import numpy as np
import os
import time
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
# Read, then decode for py2 compat.
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
# length of text is the number of characters in it
print('Length of text: {} characters'.format(len(text)))
# Take a look at the first 250 characters in text
print(text[:250])
# The unique characters in the file
vocab = sorted(set(text))
print('{} unique characters'.format(len(vocab)))
# Creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)
text_as_int = np.array([char2idx[c] for c in text])
print('{')
for char,_ in zip(char2idx, range(20)):
    print(' {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print(' ...\n}')
# Show how the first 13 characters from the text are mapped to integers
print('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13]))
# The maximum length sentence you want for a single input in characters
seq_length = 100
examples_per_epoch = len(text)//(seq_length+1)
# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
for i in char_dataset.take(5):
    print(idx2char[i.numpy()])
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)
for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text
dataset = sequences.map(split_input_target)
for input_example, target_example in dataset.take(1):
    print('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
    print('Target data:', repr(''.join(idx2char[target_example.numpy()])))
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print(" input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print(" expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))
# Batch size
BATCH_SIZE = 64
# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset
# Length of the vocabulary in chars
vocab_size = len(vocab)
# The embedding dimension
embedding_dim = 256
# Number of RNN units
rnn_units = 1024
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                  batch_input_shape=[batch_size, None]),
        tf.keras.layers.GRU(rnn_units,
                            return_sequences=True,
                            stateful=True,
                            recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model
model = build_model(
    vocab_size=len(vocab),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")
model.summary()
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
sampled_indices
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)
example_batch_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss: ", example_batch_loss.numpy().mean())
model.compile(optimizer='adam', loss=loss)
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)
EPOCHS = 100
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])
tf.train.latest_checkpoint(checkpoint_dir)
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))
model.summary()
def generate_text(model, start_string):
    # Evaluation step (generating text using the learned model)
    # Number of characters to generate
    num_generate = 1000
    # Converting our start string to numbers (vectorizing)
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)
    # Empty string to store our results
    text_generated = []
    # Low temperature results in more predictable text.
    # Higher temperature results in more surprising text.
    # Experiment to find the best setting.
    temperature = 1.0
    # Here batch size == 1
    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)
        # using a categorical distribution to predict the character returned by the model
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        # Pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])
    return (start_string + ''.join(text_generated))
print(generate_text(model, start_string=u"ROMEO: "))
This particular model can't fit any better than this, since it is limited by its architecture and generates only one symbol per step.
A loss that steadily goes up after some epochs is a usual sign that your model is overtraining, and there is no point in training any further.
You could tune hyperparameters to (possibly) make some minor improvements.
Edit:
To tune the embedding dimension, RNN units, and sequence length, change these values:
seq_length = 100
embedding_dim = 256
rnn_units = 1024
To tune the learning rate, replace this line:
model.compile(optimizer='adam', loss=loss)
with this one:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.005), loss=loss)
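If you want the learning rate itself to decay during training, a schedule can be passed instead of a fixed value (a sketch that is not part of the original answer; the numbers are arbitrary):
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.005,   # arbitrary starting point
    decay_steps=1000,              # decay every 1000 optimizer steps
    decay_rate=0.9)                # multiply the rate by 0.9 each time
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule), loss=loss)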
Also, you can add arbitrary layers to the build_model function.
Here is an example with an extra GRU layer:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                  batch_input_shape=[batch_size, None]),
        tf.keras.layers.GRU(rnn_units,
                            return_sequences=True,
                            stateful=True,
                            recurrent_initializer='glorot_uniform'),
        tf.keras.layers.GRU(rnn_units,
                            return_sequences=True,
                            stateful=True,
                            recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model
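Since the loss bottoms out around epoch 46 and then climbs, you can also stop training automatically instead of running all 100 epochs. A minimal sketch (monitoring the training loss, because this tutorial has no validation split):
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='loss',               # no validation data here, so watch the training loss
    patience=5,                   # stop after 5 epochs without improvement
    restore_best_weights=True)    # roll back to the best weights seen
history = model.fit(dataset, epochs=EPOCHS,
                    callbacks=[checkpoint_callback, early_stop])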

TensorFlow ValueError: Rank mismatch error

All, I am completely stuck due to an error in my code to classify cats vs. dogs using a convolutional network. I could use the high-level libraries available these days, but for learning I want to get this lower-level version working. The output is a binary classification of an image containing either a cat or a dog. I have scanned a number of rank-related threads, but I am unable to work out how to solve this error when using sparse_softmax_cross_entropy_with_logits specifically.
If I change two lines (use softmax_cross_entropy_with_logits_v2() and uncomment labels = tf.argmax(y, 1)), then it runs, but the accuracy, even on the training set, degrades rapidly (the net diverges).
Any help would be much appreciated. Thanks.
The two lines I am not 100% sure about are as follows.
Should the 1 here be n_outputs (which is 2)? The task is binary, but that does not seem right:
y = tf.placeholder(dtype=tf.int64, shape=[100, 1], name="y")
This is the line that throws the error: ValueError: Rank mismatch: Rank of labels (received 2) should equal rank of logits minus 1 (received 2).
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(\
labels=y, logits=logits)
The full code (from the point of having the data in hand) is below; it is long-ish but simple. I have commented out the end, since the error is thrown before execution gets there. The error itself is at the end.
#---Split data into training & test sets---
# Work the data for cats and dogs numpy arrays
# These numpy arrays were generated in previous data prep work
# Stack the numpy arrays for the inputs
X_cat_dog = np.concatenate((cats_1000_64_64_1, dogs_1000_64_64_1),
axis = 0)
X_cat_dog = X_cat_dog.reshape(-1, width*height) #Flatten
# Scikit Learn for min-max scaling of the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(np.array([0., 255.]).reshape(-1,1))
X_cat_dog_scaled = scaler.transform(X_cat_dog)
# Define the labels to be used: cats = 0, dogs = 1
y_cat_dog = np.concatenate((np.zeros((1000), dtype = np.int32),
np.ones((1000), dtype = np.int32)),
axis = 0)
# Scikit Learn for random splitting of the data
from sklearn.model_selection import train_test_split
# Random split of data into training (80%) and test (20%)
X_train, X_test, y_train, y_test = \
train_test_split(X_cat_dog_scaled, y_cat_dog, test_size=0.20,
random_state = RANDOM_SEED)
print('Train orig. shape:', X_train.shape, y_train.shape)
print('Test orig. shape:', X_test.shape, y_test.shape)
#Reshape into 4D
X_train = np.reshape(X_train, newshape=[X_train.shape[0], height, width, channels])
y_train = np.reshape(y_train, newshape=[y_train.shape[0], 1])
X_test = np.reshape(X_test, newshape=[X_test.shape[0], height, width, channels])
y_test = np.reshape(y_test, newshape=[y_test.shape[0], 1])
print('Train 4D shape:', X_train.shape, y_train.shape, type(X_train), type(y_train))
print('Test 4D shape:', X_test.shape, y_test.shape, type(X_test), type(y_test))
#---Define and run convolution net---
#Init
results = [] #Summary results
reset_graph() #Else upon rerun, error occurs
n_outputs = 2 #Binary; cat or dog
n_strides = [1,2,2] #Symmetric XY + same across conv & pool
n_conv_blocks = 1 #Number of convolution blocks
n_filters = [5, 10, 20] #Number of filters applied per layer
#Placeholders for batch training
X = tf.placeholder(dtype=tf.float64,
shape=[100, height, width, channels], name="X")
y = tf.placeholder(dtype=tf.int64, shape=[100, 1], name="y")
print('X.shape =', X.shape, tf.rank(X))
print('y.shape =', y.shape, tf.rank(y))
#Define hidden layers
with tf.name_scope("cnn"):
#Create number of convolution blocks required
for block in range(n_conv_blocks):
#Convolution layer
inputLayer = X
if (block>0):
inputLayer = pool
print('\nStride:', n_strides[block])
conv = tf.layers.conv2d(inputLayer,
filters = n_filters[block],
kernel_size = 1,
strides = n_strides[block],
activation = tf.nn.leaky_relu,
padding = "SAME")
print('Conv '+str(block)+'.shape =',
conv.get_shape().as_list())
#Pooling layer
pool = tf.nn.avg_pool(conv,
ksize = [1,2,2,1],
strides = [1,n_strides[block],n_strides[block],1],
padding = "SAME")
print('Pool '+str(block)+'.shape =', pool.shape)
pool_shape = pool.get_shape().as_list()
next_width = pool_shape[1]
next_height = pool_shape[2]
next_depth = pool_shape[3]
#Fully connected
flattened = tf.reshape(pool, [-1,
next_width * next_height * next_depth])
print('\nFlattened.shape =', flattened.shape)
hidden = tf.layers.dense(flattened,
next_width * next_height * next_depth,
name="hidden1",
activation=tf.nn.leaky_relu)
print('\nHidden.shape =', hidden.shape, tf.rank(hidden))
#Output
logits = tf.layers.dense(hidden, n_outputs, name="outputs")
print('\nLogits.shape =', logits.shape, tf.rank(logits))
#Define loss function
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(\
labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
#Define optimizer used for reducing the loss; MomentumOptimizer
learning_rate = 0.01
momentum = 0.01
with tf.name_scope("train"):
optimizer = tf.train.MomentumOptimizer(learning_rate,
momentum)
training_op = optimizer.minimize(loss)
#Define performance measure; accuracy in this case
with tf.name_scope("eval"):
#labels = tf.argmax(y, 1)
labels = y
correct = tf.nn.in_top_k(logits, labels, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
#Define instantiator for TensorFlow variables
init = tf.global_variables_initializer()
#Carry out training in mini-batches
n_epochs = 1
batch_size = 100
with tf.Session() as sess:
    #Instantiate variables
    init.run()
    #Loop over n_epochs
    for epoch in range(n_epochs):
        #Loop over batches
        for iteration in range(y_train.shape[0] // batch_size):
            X_batch = X_train[\
                iteration*batch_size:(iteration + 1)*batch_size,:]
            y_batch = y_train[\
                iteration*batch_size:(iteration + 1)*batch_size]
            print(y_batch.shape, type(y_batch))
            # sess.run(training_op, feed_dict={X: X_batch,
            #                                  y: y_batch})
            # #Measure performance
            # acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
            # acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
            # if (epoch % 1 == 0):
            #     print(epoch,
            #           "Train Accuracy:",
            #           '{:0.1%}'.format(acc_train),
            #           "\tTest Accuracy:",
            #           '{:0.1%}'.format(acc_test))
            # results.append([epoch, acc_train, acc_test])
Error is as follows.
X.shape = (100, 64, 64, 1) Tensor("Rank:0", shape=(), dtype=int32)
y.shape = (100, 1) Tensor("Rank_1:0", shape=(), dtype=int32)
Stride: 1
Conv 0.shape = [100, 64, 64, 5]
Pool 0.shape = (100, 64, 64, 5)
Flattened.shape = (100, 20480)
Hidden.shape = (100, 20480) Tensor("cnn/Rank:0", shape=(), dtype=int32)
Logits.shape = (100, 2) Tensor("cnn/Rank_1:0", shape=(), dtype=int32)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-25-7961eb9a772c> in <module>()
58 #Define loss function
59 with tf.name_scope("loss"):
---> 60 xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits( labels=y, logits=logits)
61 loss = tf.reduce_mean(xentropy, name="loss")
62
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/nn_ops.py in sparse_softmax_cross_entropy_with_logits(_sentinel, labels, logits, name)
2645 raise ValueError("Rank mismatch: Rank of labels (received %s) should "
2646 "equal rank of logits minus 1 (received %s)." %
-> 2647 (labels_static_shape.ndims, logits.get_shape().ndims))
2648 if (static_shapes_fully_defined and
2649 labels_static_shape != logits.get_shape()[:-1]):
ValueError: Rank mismatch: Rank of labels (received 2) should equal rank of logits minus 1 (received 2).
OK, I figured it out. I adjusted two lines as follows.
First, I dropped the extra dimension from the shape of y:
y = tf.placeholder(dtype=tf.int64, shape=[None], name="y")
Second, all references to y_batch after its definition were replaced with y_batch.reshape(-1); this was needed to reduce the rank of y_batch and get rid of the error.
The rest remained unchanged. Now I have a new problem: the accuracy remains low, but at least it is behaving itself now (and not going to zero). Playtime!
0 Train Accuracy: 61.0% Test Accuracy: 48.5%
1 Train Accuracy: 60.0% Test Accuracy: 48.8%
2 Train Accuracy: 61.0% Test Accuracy: 49.5%
3 Train Accuracy: 65.0% Test Accuracy: 50.2%
4 Train Accuracy: 65.0% Test Accuracy: 51.0%
5 Train Accuracy: 64.0% Test Accuracy: 51.0%
6 Train Accuracy: 65.0% Test Accuracy: 51.5%
7 Train Accuracy: 66.0% Test Accuracy: 51.0%
8 Train Accuracy: 64.0% Test Accuracy: 51.2%
9 Train Accuracy: 63.0% Test Accuracy: 52.5%
10 Train Accuracy: 62.0% Test Accuracy: 52.0%
11 Train Accuracy: 62.0% Test Accuracy: 52.0%
12 Train Accuracy: 63.0% Test Accuracy: 53.5%
13 Train Accuracy: 63.0% Test Accuracy: 53.5%
14 Train Accuracy: 63.0% Test Accuracy: 54.0%
15 Train Accuracy: 63.0% Test Accuracy: 53.5%
16 Train Accuracy: 64.0% Test Accuracy: 53.5%
17 Train Accuracy: 64.0% Test Accuracy: 53.8%
18 Train Accuracy: 65.0% Test Accuracy: 53.8%
19 Train Accuracy: 65.0% Test Accuracy: 53.8%
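For reference, a minimal sketch of the adjusted pieces in the question's TF1 style (the surrounding graph and session code stay exactly as in the question):
# Rank-1 integer labels: sparse_softmax_cross_entropy_with_logits expects labels of
# shape [batch] when logits have shape [batch, n_classes].
y = tf.placeholder(dtype=tf.int64, shape=[None], name="y")

xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")

# When feeding, flatten y_batch so its rank matches the placeholder.
sess.run(training_op, feed_dict={X: X_batch, y: y_batch.reshape(-1)})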

Tensorflow estimator fails to converge on model converted from Keras (when using binary_crossentropy)

I've been stuck for quite a while using the model_to_estimator functionality in TensorFlow Estimators. The problem seems to be that Keras allows a binary_crossentropy loss on a single-neuron Dense output.
In my case, I am feeding an RNN sequential data and I want to figure out whether the sequence leads to a conversion or not. The code (also to be found at https://colab.research.google.com/drive/194Puigi-LdzxZup6LNREk47l9uP0_Dx9) for that would be:
import numpy as np
import pandas as pd
import tensorflow as tf
np.random.seed(2)
data = np.random.randint(1,500,size=(10000, 50)) # create something like 50 words out of a vocab of 500
#split
train = data[:7999]
val = data[8000:]
def _input_fn2(arr, batch_size=500, shuffle=False):
    arr_copy = arr.copy()
    def _parse_func(features):
        sum = tf.math.reduce_sum(features)
        label = tf.cond(sum >= 15000, lambda: np.array([1]), lambda: np.array([0]))  # label=true if sum is larger than 15000, gives about 1% true
        return (features, label)
    dataset = tf.data.Dataset.from_tensor_slices(arr_copy)
    dataset = dataset.map(_parse_func)
    dataset = dataset.shuffle(200)
    dataset = dataset.batch(batch_size)
    dataset = dataset.repeat()
    return dataset
from tensorflow.keras.layers import Dense, Input, CuDNNGRU, Embedding
import tensorflow.keras.backend as K
inputs = Input(shape=(50,))
embedding = Embedding(
    output_dim=5,
    input_dim=500,
    input_length=50)(inputs)
lstm = CuDNNGRU(
    units=5,
    input_shape=((5,1)),
    return_sequences=False,
)(embedding)
outputs = Dense(1, activation='sigmoid',name='final')(lstm)
model = tf.keras.Model(inputs, outputs)
def true_positives(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    return true_positives

def false_positives(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip((1 - y_true) * y_pred, 0, 1)))
    return true_positives

def true_negatives(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip((1 - y_true) * (1 - y_pred), 0, 1)))
    return true_positives

def false_negatives(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * (1 - y_pred), 0, 1)))
    return true_positives

def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=[
        'acc',
        true_positives,
        true_negatives,
        false_positives,
        false_negatives,
        recall,
        precision
    ]
)
print(model.summary())
train_ds = _input_fn2(train, shuffle=True)
val_ds = _input_fn2(val)
model.fit(
    train_ds,
    steps_per_epoch=50,
    epochs=100,
    validation_data=val_ds,
    validation_steps=10,
    verbose=2
)
This works alright, the model converges and starts to learn.
Epoch 100/100
- 2s - loss: 3.5754e-04 - acc: 1.0000 - true_positives: 3.2000 - true_negatives: 496.7400 - false_positives: 0.0000e+00 -
false_negatives: 0.0000e+00 - recall: 0.9400 - precision: 0.9400 -
val_loss: 0.1281 - val_acc: 0.9806 - val_true_positives: 0.0000e+00 -
val_true_negatives: 490.3000 - val_false_positives: 4.5000 -
val_false_negatives: 5.2000 - val_recall: 0.0000e+00 - val_precision:
0.0000e+00
You can see that it mostly guesses a negative outcome; this is due to the imbalance in the dataset and is probably the right thing to do.
Now, converting this into an Estimator model like this:
from tensorflow.keras.estimator import model_to_estimator
from tensorflow.estimator import train_and_evaluate, RunConfig
from tensorflow.estimator import TrainSpec, EvalSpec
from tensorflow import metrics
from tensorflow.contrib.estimator import add_metrics
run_config = RunConfig(
    save_checkpoints_secs=5,
    keep_checkpoint_max=10
)
def eval_metrics(features, labels, predictions):
    return {
        'precision_streaming': metrics.precision(labels=labels, predictions=predictions['final']),
        'recall_streaming': metrics.recall(labels=labels, predictions=predictions['final']),
        'true_positives_streaming': metrics.true_positives(labels=labels, predictions=predictions['final']),
        'true_negatives_streaming': metrics.true_negatives(labels=labels, predictions=predictions['final']),
        'false_positives_streaming': metrics.false_positives(labels=labels, predictions=predictions['final']),
        'false_negatives_streaming': metrics.false_negatives(labels=labels, predictions=predictions['final'])
    }
estimator = model_to_estimator(keras_model=model, config=run_config)
estimator = add_metrics(estimator, eval_metrics) #took out these metrics for showcase
train_spec = TrainSpec(
    input_fn=lambda: _input_fn2(train, shuffle=True), max_steps=2000
)
eval_spec = EvalSpec(input_fn=lambda: _input_fn2(val), steps=4)
score = train_and_evaluate(estimator, train_spec, eval_spec)
print(score)
After resetting the model and training the Estimator-based version, the model doesn't converge but now only seems to predict trues:
({'binary_accuracy': 0.9865, 'false_negatives_streaming': 0.0,
'false_positives_streaming': 1979.0, 'precision_streaming': 0.0105,
'recall_streaming': 1.0, 'true_negatives_streaming': 0.0,
'true_positives_streaming': 21.0, 'global_step': 2000}, []
Now I managed to get this to work by using a Dense(2) final layer, one-hot encoding the label and switching the loss function to sparse_categorical_crossentropy ... but I'd really rather keep the single output class as it makes my downstream f1-score and whatnot calculations easier.
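For reference, a minimal sketch of that Dense(2) workaround (hypothetical code reusing the lstm tensor and input pipeline from above; note that sparse_categorical_crossentropy expects integer class indices such as 0/1 rather than one-hot vectors, so the labels produced by _parse_func can stay as they are):
# Two-unit softmax output instead of a single sigmoid neuron
outputs = Dense(2, activation='softmax', name='final')(lstm)
model = tf.keras.Model(inputs, outputs)
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='sparse_categorical_crossentropy',
    metrics=['acc'])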
An educated guess would be that the Estimator fails to distribute the loss to the single-neuron dense output layer, while Keras somehow manages to do this.
Any help would be greatly appreciated
Bests wirtsi