Eager tf.GradientTape() returns only Nones - tensorflow

I am trying to calculate gradients with TensorFlow in eager mode, but
tf.GradientTape() returns only None values, and I cannot understand why.
The gradients are calculated in the update_policy() function.
The output of the line:
grads = tape.gradient(loss, self.model.trainable_variables)
is
{list}<class 'list'>:[None, None, ... ,None]
Here is the code.
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
import numpy as np
tf.enable_eager_execution()
print(tf.executing_eagerly())
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
set_session(sess)
class PGEagerAtariNetwork:
    """Policy-gradient network for 84x84x4 inputs, trained in eager mode.

    The model is three convolutional layers followed by three dense layers
    with dropout and a final linear layer that emits one logit per action.

    Args:
        state_space: Stored on the instance; not used to build the model.
        action_space: Number of units in the final logits layer.
        lr: Learning rate passed to the Adam optimizer.
        gamma: Stored on the instance; not used by the methods of this class.
    """

    def __init__(self, state_space, action_space, lr, gamma):
        self.state_space = state_space
        self.action_space = action_space
        self.gamma = gamma
        self.model = tf.keras.Sequential()
        # Conv
        self.model.add(
            tf.keras.layers.Conv2D(filters=32, kernel_size=[8, 8], strides=[4, 4], activation='relu',
                                   input_shape=(84, 84, 4,),
                                   name='conv1'))
        self.model.add(
            tf.keras.layers.Conv2D(filters=64, kernel_size=[4, 4], strides=[2, 2], activation='relu', name='conv2'))
        self.model.add(
            tf.keras.layers.Conv2D(filters=128, kernel_size=[4, 4], strides=[2, 2], activation='relu', name='conv3'))
        self.model.add(tf.keras.layers.Flatten(name='flatten'))
        # Fully connected
        self.model.add(tf.keras.layers.Dense(units=512, activation='relu', name='fc1'))
        self.model.add(tf.keras.layers.Dropout(rate=0.4, name='dr1'))
        self.model.add(tf.keras.layers.Dense(units=256, activation='relu', name='fc2'))
        self.model.add(tf.keras.layers.Dropout(rate=0.3, name='dr2'))
        self.model.add(tf.keras.layers.Dense(units=128, activation='relu', name='fc3'))
        self.model.add(tf.keras.layers.Dropout(rate=0.1, name='dr3'))
        # Logits
        self.model.add(tf.keras.layers.Dense(units=self.action_space, activation=None, name='logits'))
        self.model.summary()
        # Optimizer
        self.optimizer = tf.train.AdamOptimizer(learning_rate=lr)

    def get_probs(self, s):
        """Return softmax action probabilities for a single state.

        Args:
            s: One state without a batch axis; a leading axis is added here.

        Returns:
            NumPy array of probabilities for the one-element batch.
        """
        s = s[np.newaxis, :]
        # predict() is fine here: this is pure inference, no gradients needed.
        logits = self.model.predict(s)
        probs = tf.nn.softmax(logits).numpy()
        return probs

    def update_policy(self, s, r, a):
        """Apply one policy-gradient step.

        Args:
            s: Batch of states fed to the model.
            r: Per-sample weights for the loss (treated as constants).
            a: Labels for the softmax cross-entropy, same shape as the logits.
        """
        with tf.GradientTape() as tape:
            # The forward pass must go through model.__call__ so the tape can
            # record it. Model.predict() returns a NumPy array, which detaches
            # the loss from the variables and makes every gradient None.
            # Pass training=True here if dropout should be active in updates.
            logits = self.model(tf.cast(s, tf.float32))
            policy_loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=a, logits=logits)
            policy_loss = policy_loss * tf.stop_gradient(r)
            loss = tf.reduce_mean(policy_loss)
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

The tape never sees a forward pass through your model. Model.predict() returns a NumPy array, so the forward computation is not recorded on the tape. Take a look at this example:
Given a following data and model:
import tensorflow as tf
import numpy as np
x_train = tf.convert_to_tensor(np.ones((1, 2), np.float32), dtype=tf.float32)
y_train = tf.convert_to_tensor([[0, 1]])
model = tf.keras.models.Sequential([tf.keras.layers.Dense(2, input_shape=(2, ))])
First we use predict():
with tf.GradientTape() as tape:
logits = model.predict(x_train)
print('`logits` has type {0}'.format(type(logits)))
# `logits` has type <class 'numpy.ndarray'>
xentropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_train, logits=logits)
reduced = tf.reduce_mean(xentropy)
grads = tape.gradient(reduced, model.trainable_variables)
print('grads are: {0}'.format(grads))
# grads are: [None, None]
Now we use model's input:
with tf.GradientTape() as tape:
logits = model(x_train)
print('`logits` has type {0}'.format(type(logits)))
# `logits` has type <class 'tensorflow.python.framework.ops.EagerTensor'>
xentropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_train, logits=logits)
reduced = tf.reduce_mean(xentropy)
grads = tape.gradient(reduced, model.trainable_variables)
print('grads are: {0}'.format(grads))
# grads are: [<tf.Tensor: id=2044, shape=(2, 2), dtype=float32, numpy=
# array([[ 0.77717704, -0.777177 ],
# [ 0.77717704, -0.777177 ]], dtype=float32)>, <tf.Tensor: id=2042,
# shape=(2,), dtype=float32, numpy=array([ 0.77717704, -0.777177 ], dtype=float32)>]
So use model's __call__() (i.e. model(x)) for forward pass and not predict().

Related

Custom pooling layer, WARNING:tensorflow:Gradients do not exist for variables

Below is the code with which I am trying to customize global average pooling. My goal is to change the line "return np.mean(inputs, axis=(1, 2))" and write my own custom pooling method. However, although the code runs, I'm having problems with gradients and I can't get the same result as with the built-in global average pooling. I am getting the warning below. Can you help me, please?
WARNING:tensorflow:Gradients do not exist for variables ['conv2d_2/kernel:0', 'conv2d_2/bias:0', 'conv2d_3/kernel:0', 'conv2d_3/bias:0'] when minimizing the loss. If you're using model.compile(), did you forget to provide a lossargument?
WARNING:tensorflow:Gradients do not exist for variables ['conv2d_2/kernel:0', 'conv2d_2/bias:0', 'conv2d_3/kernel:0', 'conv2d_3/bias:0'] when minimizing the loss. If you're using model.compile(), did you forget to provide a lossargument?
import numpy as np
import tensorflow as tf
import math
import os
import random
import itertools
from tensorflow.python.ops import gen_nn_ops
from tensorflow.python.ops import array_ops
import numba as nb
import matplotlib.pyplot as plt
import skimage
from keras import backend as K
random.seed(91)
class LGPooling2D(tf.keras.layers.Layer):
    """Keras layer that pools a 4-D input down to (batch, channels).

    The forward pass runs a NumPy function (mean over axes 1 and 2) through
    tf.compat.v1.py_func, and registers a hand-written gradient for it via
    gradient_override_map.

    NOTE(review): the registered gradient calls max_pool_grad_v2, while the
    forward pass is a mean over axes (1, 2); the two do not obviously match.
    The override also relies on the v1 default graph. Presumably this is
    related to the "Gradients do not exist" warning -- TODO confirm.
    """

    def __init__(self, pool_size=(3, 3), strides=(2, 2), padding='SAME', data_format='channels_last', **kwargs):
        """Store the pooling configuration.

        pool_size, strides and padding are only read by the custom gradient
        and by compute_output_shape; the forward function does not use them.
        """
        super(LGPooling2D, self).__init__(**kwargs)
        self.pool_size = pool_size
        self.strides = strides
        self.padding = padding
        # Translate the Keras-style name into the TF op layout string.
        self.data_format = 'NHWC' if data_format == 'channels_last' else 'NCHW'
        # Not read anywhere else in this class.
        self.output_dim = 64

    def build(self, input_shape):
        """No weights are created; just defer to the base class."""
        super(LGPooling2D, self).build(input_shape)

    def _pooling_function(self, x, name=None):
        """Apply the NumPy pooling function to x and return a (b, channel) tensor."""
        #b = K.shape(x)[0]
        # Static shape; b may be None when the batch size is unknown.
        input_shape = tf.keras.backend.int_shape(x)
        b, r,c,channel = input_shape[0],input_shape[1],input_shape[2],input_shape[3]

        def _mid_pool(inputs, is_train):
            # Forward computation executed in Python/NumPy; is_train is unused.
            return np.mean(inputs, axis=(1, 2)) # we change this part.

        def custom_grad(op, grad):
            # Gradient registered for the py_func op. Returns one value per
            # py_func input: a max-pool gradient for x and a constant 0.0
            # for the learning-phase input.
            if self.data_format == 'NHWC':
                ksizes = [1, self.pool_size[0], self.pool_size[1], 1]
                strides = [1, self.strides[0], self.strides[1], 1]
            else:
                ksizes = [1, 1, self.pool_size[0], self.pool_size[1]]
                strides = [1, 1, self.strides[0], self.strides[1]]
            return gen_nn_ops.max_pool_grad_v2(
                op.inputs[0],
                op.outputs[0],
                grad,
                ksizes,
                strides,
                self.padding,
                data_format=self.data_format
            ), tf.constant(0.0)

        def py_func(func, inp, Tout, stateful=True, name=None, grad=None, rnd_name=None):
            # Need to generate a unique name to avoid duplicates:
            tf.compat.v1.RegisterGradient(rnd_name)(grad)
            g = tf.compat.v1.get_default_graph()
            # Route the gradient of "PyFunc" ops created in this scope to `grad`.
            with g.gradient_override_map({"PyFunc": rnd_name}):
                return tf.compat.v1.py_func(func, inp, Tout, stateful=stateful, name=name)

        def _mid_range_pool(x, name=None):
            # Random suffix so each call registers its gradient under a new name.
            rnd_name = 'LGPooling2D' + str(np.random.randint(0, 1E+8))
            with tf.compat.v1.name_scope(name, "mod", [x]) as name:
                z = py_func(_mid_pool,
                            [x, tf.keras.backend.learning_phase()],
                            [tf.float32],
                            name=name,
                            grad=custom_grad, rnd_name=rnd_name)[0]
                # py_func loses static shape information; restore it.
                z.set_shape((b, channel))
                return z

        return _mid_range_pool(x, name)

    def compute_output_shape(self, input_shape):
        """Return (batch, channels); num_r and num_c are computed but unused."""
        r, c = input_shape[1], input_shape[2]
        sr, sc = self.strides
        num_r = math.ceil(r/sr) if self.padding == 'SAME' else r//sr
        num_c = math.ceil(c/sc) if self.padding == 'SAME' else c//sc
        return (input_shape[0], input_shape[3])

    def call(self, inputs):
        """Run the pooling function on inputs and return its output."""
        # K.in_train_phase(self._tf_pooling_function(inputs), self._pooling_function_test(inputs))
        # input_shape is computed here but not used below.
        input_shape = tf.shape(inputs)
        output = self._pooling_function(inputs)
        # output = tf.reshape(output, self.compute_output_shape(input_shape))
        return output

    def get_config(self):
        """Return the base config plus pool_size and strides.

        padding and data_format are not included in the returned config.
        """
        config = {
            'pool_size': self.pool_size,
            'strides': self.strides
        }
        base_config = super(LGPooling2D, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
def get_model():
    """Build and compile the small CNN that ends in the custom pooling layer.

    Reads the module-level names `input_shape` and `num_classes`.

    Returns:
        A compiled tf.keras.Sequential model (categorical cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    # Layers are created in the same order in which they are stacked.
    stack = [
        tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation="relu", input_shape=input_shape),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu'),
        # Custom pooling used in place of GlobalAveragePooling2D.
        LGPooling2D(pool_size=(3, 3), strides=(2, 2)),
        tf.keras.layers.Dense(num_classes, activation='softmax'),
    ]
    network = tf.keras.Sequential(stack)
    network.compile(
        loss="categorical_crossentropy",
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=1e-6),
        metrics=['accuracy'],
    )
    return network
batch_size = 32
num_classes = 100
epochs = 50
# input image dimensions
img_rows, img_cols = 32, 32
# the data, split between train and test sets
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar100.load_data()
x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 3)
x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 3)
input_shape = (img_rows, img_cols, 3)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255.0
x_test /= 255.0
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')
# convert class vectors to binary class matrices
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes)
reduceLROnPlat = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.4, patience=6, verbose=1, mode='min', min_delta=0.0001, cooldown=5, min_lr=0.00001)
callbacks_list = [reduceLROnPlat]
print('custom pooling')
model = get_model()
# model = get_model(IMG_SHAPE = (32, 32, 3))
tf.keras.utils.plot_model(model, to_file='LbpModel.png', show_shapes=True)
modelcustom = model.fit(x_train, y_train,
batch_size=batch_size,
epochs=epochs,
verbose=1,
validation_split=0.10,
callbacks=callbacks_list)

TF2 code 10 times slower than equivalent PyTorch code for a Conv1D network

I've been trying to translate some PyTorch code to TensorFlow 2, but the TF2 code is around 10 times slower. I've tried looking at where this might come from, and as far as I can tell it comes from the tape.gradient call (performance was the same with keras' .fit function). I've tried to use different data loaders, ways of declaring the model, installations, etc... and the results have been consistent.
Any explanation / solution as to why this is happening would be much appreciated.
Here is a minimalist version of the TF2 code:
import time
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
# Generate some fake data
train_labels = np.random.randint(10, size=1000)
train_data = np.random.rand(1000, 120, 18, 1)
train_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_labels))
train_dataset = train_dataset.batch(256)
# Create a small model
model = tf.keras.Sequential([
layers.Conv1D(64, kernel_size=7, strides=3, padding="same", activation="relu"),
layers.Conv1D(64, kernel_size=5, strides=2, padding="same", activation="relu"),
layers.Conv1D(128, kernel_size=5, strides=2, padding="same", activation="relu"),
layers.Conv1D(128, kernel_size=3, strides=1, padding="same", activation="relu"),
layers.Conv1D(128, kernel_size=3, strides=1, padding="same", activation="relu"),
layers.Conv1D(256, kernel_size=1, strides=1, padding="same", activation="relu"),
layers.GlobalAveragePooling2D(),
layers.Flatten(),
layers.Dense(128, use_bias=True, activation="relu"),
layers.Dense(32, use_bias=True, activation="relu"),
layers.Dense(1, activation='sigmoid', use_bias=True),
])
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3, decay=5e-4)
#tf.function
def train_step(data_batch, label_batch):
    """Run one optimization step on a single batch.

    Uses the module-level `model` and `optimizer`.

    Args:
        data_batch: Input batch passed to the model.
        label_batch: Targets for the MSE loss.
    """
    with tf.GradientTape() as tape:
        y_pred = model(data_batch, training=True)
        # Use the parameter, not the global `labels_batch` from the training
        # loop; the original only worked because that global happened to exist.
        loss = tf.keras.losses.MSE(label_batch, y_pred)
    gradients = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(gradients, model.trainable_weights))
step_times = []
for epoch in range(20):
for data_batch, labels_batch in train_dataset:
step_start_time = time.perf_counter()
train_step(data_batch, labels_batch)
if epoch != 0:
step_times.append(time.perf_counter()-step_start_time)
print(f"Average training step time: {np.mean(step_times):.3f}s.")
And the PyTorch equivalent:
import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.backends.cudnn.benchmark = True
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Generate some fake data
train_labels = np.random.randint(10, size=1000)
train_data = np.random.rand(1000, 18, 120)
# Create a small model
class Model(torch.nn.Module):
    """Six Conv1d layers, a mean over the length axis, and three linear layers.

    Expects input with 18 channels on axis 1; returns one sigmoid output per
    sample.
    """

    def __init__(self):
        super().__init__()
        # Convolutional trunk (18 input channels -> 256 feature channels).
        self.conv1 = nn.Conv1d(in_channels=18, out_channels=64, kernel_size=7, stride=3, padding=3)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=64, kernel_size=5, stride=2, padding=2)
        self.conv3 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=5, stride=2, padding=2)
        self.conv4 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv5 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv6 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)
        # Fully connected head.
        self.fc1 = nn.Linear(in_features=256, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=32)
        self.fc3 = nn.Linear(in_features=32, out_features=1)

    def forward(self, inputs):
        """Return sigmoid outputs of shape (batch, 1) for `inputs`."""
        features = inputs
        conv_layers = (self.conv1, self.conv2, self.conv3,
                       self.conv4, self.conv5, self.conv6)
        for conv in conv_layers:
            features = torch.relu(conv(features))
        # Average over the length axis (axis 2).
        pooled = torch.mean(features, dim=2)
        for dense in (self.fc1, self.fc2):
            pooled = torch.relu(dense(pooled))
        return torch.sigmoid(self.fc3(pooled))
model = Model()
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4)
loss_fn = torch.nn.MSELoss()
batch_size = 256
train_steps_per_epoch = train_data.shape[0] // batch_size
step_times = []
for epoch in range(20):
for step in range(train_steps_per_epoch):
batch_start, batch_end = step * batch_size, (step+1) * batch_size
data_batch = torch.FloatTensor(train_data[batch_start:batch_end]).to(device)
labels_batch = torch.FloatTensor(train_labels[batch_start:batch_end]).to(device)
step_start_time = time.perf_counter()
optimizer.zero_grad()
y_pred = model(data_batch)
loss = loss_fn(labels_batch, torch.squeeze(y_pred))
loss.backward()
optimizer.step()
if epoch != 0:
step_times.append(time.perf_counter()-step_start_time)
print(f"Average training step time: {np.mean(step_times):.3f}s.")
You're using tf.GradientTape correctly, but both your models and data are different in the snippets you provided.
Here is the TF code that uses the same data and model architecture as your Pytorch model.
import time
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
# Generate some fake data
train_labels = np.random.randint(10, size=1000)
train_data = np.random.rand(1000, 120, 18)
train_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_labels))
train_dataset = train_dataset.batch(256)
model = tf.keras.Sequential([
layers.Conv1D(64, kernel_size=7, strides=3, padding="same", activation="relu"),
layers.Conv1D(64, kernel_size=5, strides=2, padding="same", activation="relu"),
layers.Conv1D(128, kernel_size=5, strides=2, padding="same", activation="relu"),
layers.Conv1D(128, kernel_size=3, strides=1, padding="same", activation="relu"),
layers.Conv1D(128, kernel_size=3, strides=1, padding="same", activation="relu"),
layers.Conv1D(256, kernel_size=3, strides=1, padding="same", activation="relu"),
layers.GlobalAveragePooling1D(),
layers.Dense(128, use_bias=True, activation="relu"),
layers.Dense(32, use_bias=True, activation="relu"),
layers.Dense(1, activation='sigmoid', use_bias=True),
])
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3, decay=5e-4)
#tf.function
def train_step(data_batch, label_batch, model):
    """Run one optimization step of `model` on a single batch.

    Uses the module-level `optimizer`.

    Args:
        data_batch: Input batch passed to the model.
        label_batch: Targets for the MSE loss.
        model: The Keras model to update.
    """
    with tf.GradientTape() as tape:
        y_pred = model(data_batch, training=True)
        # Use the parameter, not the global `labels_batch` from the training
        # loop; the original only worked because that global happened to exist.
        loss = tf.keras.losses.MSE(label_batch, y_pred)
    gradients = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(gradients, model.trainable_weights))
step_times = []
for epoch in range(20):
for data_batch, labels_batch in train_dataset:
step_start_time = time.perf_counter()
train_step(data_batch, labels_batch, model)
if epoch != 0:
step_times.append(time.perf_counter()-step_start_time)
print(f"Average training step time: {np.mean(step_times):.3f}s.")
So, in reality, TF is 3 times faster than Pytorch: 0.035s vs 0.112s

tf.keras model.predict each time provides different values

Each time I run:
y_true = np.argmax(tf.concat([y for x, y in train_ds], axis=0), axis=1)
y_pred = np.argmax(model.predict(train_ds), axis=1)
confusion_matrix(y_true, y_pred)
The result is different each time. To my understanding, this is because the line:
y_pred = np.argmax(model.predict(train_ds), axis=1) yields a different result each time.
Clarification: I run cell 1 (training) once. And cell 2 (inference) few times.
Why?
THE CODE:
Cell 1 (jupyter)
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, experimental
from tensorflow.keras.layers import MaxPool2D, Flatten, Dense
from tensorflow.keras import Model
from tensorflow.keras.losses import categorical_crossentropy
from sklearn.metrics import accuracy_score
image_size = (100, 100)
batch_size = 32
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
directory,
label_mode='categorical',
validation_split=0.2,
subset="training",
seed=1337,
color_mode="grayscale",
image_size=image_size,
batch_size=batch_size,
)
inputs = Input(shape =(100,100,1))
x = experimental.preprocessing.Rescaling(1./255)(inputs)
x = Conv2D (filters =4, kernel_size =3, padding ='same', activation='relu')(x)
x = Conv2D (filters =4, kernel_size =3, padding ='same', activation='relu')(x)
x = MaxPool2D(pool_size =2, strides =2, padding ='same')(x)
x = Conv2D (filters =8, kernel_size =3, padding ='same', activation='relu')(x)
x = Conv2D (filters =8, kernel_size =3, padding ='same', activation='relu')(x)
x = MaxPool2D(pool_size =2, strides =2, padding ='same')(x)
x = Flatten()(x)
x = Dense(units = 4, activation ='relu')(x)
x = Dense(units = 4, activation ='relu')(x)
output = Dense(units = 5, activation ='softmax')(x)
model = Model (inputs=inputs, outputs =output)
model.compile(
optimizer=tf.keras.optimizers.Adam(1e-3),
loss=categorical_crossentropy,
metrics=["accuracy"])
model.fit(train_ds, epochs=5)
Cell 2:
# Cell 2: score the trained model on train_ds twice.
# NOTE(review): y_true comes from a separate pass over train_ds; if the
# dataset yields batches in a different order on each pass, y_true and
# y_pred will not line up -- confirm the dataset is not shuffled.
print("Accuracy:")  # the label must be a string literal
y_pred = np.argmax(model.predict(train_ds), axis=1)
print(accuracy_score(y_true, y_pred))
y_pred = np.argmax(model.predict(train_ds), axis=1)
print(accuracy_score(y_true, y_pred))
OUTPUT:
118/118 [==============================] - 7s 57ms/step - loss: 0.1888 - accuracy: 0.9398
Accuracy:
0.593
0.586
Are you sure you do not train the model again every time you run the code? If the parameters of the model are the same the predicted result for the same input should be the same every time.
To my current understanding, the cause of the above is:
tf.keras.preprocessing.image_dataset_from_directory
While instance of it is:
type(train_ds)
tensorflow.python.data.ops.dataset_ops.BatchDataset
Reproduction:
First run:
[x for x, y in train_ds]
Output:
[<tf.Tensor: shape=(32, 100, 100, 1), dtype=float32, numpy= array([[[[157.],
[155.],
[159.],
Second run:
[x for x, y in train_ds]
Output:
[<tf.Tensor: shape=(32, 100, 100, 1), dtype=float32, numpy= array([[[[ 34.],
[ 36.],
[ 39.],
...,
The possible solution
imgs, y_true = [], []
for img, label in train_ds:
imgs.append(img)
y_true.append(label)
imgs = tf.concat(imgs, axis=0)
y_true = np.argmax(tf.concat(y_true, axis=0), axis=1)
y_pred = np.argmax(model.predict(imgs), axis=1)
print (accuracy_score(y_true, y_pred))
y_pred = np.argmax(model.predict(imgs), axis=1)
print (accuracy_score(y_true, y_pred))
OUTPUT
0.944044764
0.944044764
Is there any better solution?
UPDATE 2:
Maybe a more appropriate approach in the case of a validation dataset (train_ds is used here just as an example) is to add the argument shuffle=False:
# Load the training split in a fixed order so that repeated passes over the
# dataset yield the same batches.
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    directory,
    label_mode='categorical',
    validation_split=0.2,
    subset="training",
    seed=1337,
    color_mode="grayscale",
    image_size=image_size,
    batch_size=batch_size,
    shuffle=False,  # keyword is lowercase; `Shuffle` is not an accepted argument
)
UPDATE 3:
Here it's probably the best option in case if your test images are in a separate folder.
path = 'your path to test folder'
test_generator = ImageDataGenerator().flow_from_directory(
directory=path,
class_mode='categorical',
shuffle=False,
batch_size=32,
target_size=(512, 512)
)
test_generator.reset()
This is better than OPTION 1, since it also works on a dataset that doesn't fit into memory (RAM).

TensorFlow Keras(v2.2) model fit with multiple outputs and losses failed

I want to use TensorFlow Keras(v2.2) model fit in mnist with multiple outputs and losses, but it failed.
My custom model returns a list [logits, embedding]. logits is a 2D tensor [batch, 10] and embedding is also a 2D tensor [batch, 64].
class MyModel(tf.keras.Model):
    """Small CNN that returns both class logits and an L2-normalized embedding.

    call() returns the list [logits, embedding]: `embedding` is the
    L2-normalized output of a 64-unit dense layer, and `logits` is a 10-unit
    dense layer applied to that embedding.
    """

    def __init__(self):
        super(MyModel, self).__init__()
        self.reshape = tf.keras.layers.Reshape((28, 28, 1))
        self.conv2D1 = tf.keras.layers.Conv2D(filters=8, kernel_size=(3,3), strides=(1, 1), padding='same', activation='relu')
        self.maxPool1 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding="same")
        self.conv2D2 = tf.keras.layers.Conv2D(filters=8, kernel_size=(3,3), strides=(1, 1), padding='same', activation='relu')
        self.maxPool2 = tf.keras.layers.MaxPooling2D(pool_size=2)
        self.flatten = tf.keras.layers.Flatten(data_format="channels_last")
        # Plain float rate: a v1 placeholder is a graph-mode construct and
        # nothing ever fed it a value other than its 0.25 default.
        self.dropout = tf.keras.layers.Dropout(0.25)
        self.dense1 = tf.keras.layers.Dense(64, activation=None)
        self.dense2 = tf.keras.layers.Dense(10, activation=None)

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Batch that can be reshaped to (28, 28, 1) per sample.
            training: Forwarded to the dropout layer, which is only active
                in training mode.

        Returns:
            A list [logits, embedding].
        """
        x = self.reshape(inputs)
        x = self.conv2D1(x)
        x = self.maxPool1(x)
        # Let the layer decide based on `training` instead of a Python `if`,
        # which cannot branch on a symbolic tensor.
        x = self.dropout(x, training=training)
        x = self.conv2D2(x)
        x = self.maxPool2(x)
        x = self.dropout(x, training=training)
        x = self.flatten(x)
        x = self.dense1(x)
        embedding = tf.math.l2_normalize(x, axis=1)
        logits = self.dense2(embedding)
        return [logits, embedding]
loss_0 is normal cross_entropy
def loss_0(y_true, y_pred):
    """Return the mean sparse softmax cross-entropy of y_pred[0] against y_true."""
    # NOTE(review): this indexes y_pred[0]; if the framework already passes
    # only the logits output to this loss, the index selects the first row
    # instead -- confirm what y_pred contains at the call site.
    # The original computed the loss but never returned it.
    return tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred[0]))
loss_1 is triplet_semihard_loss
def loss_1(y_true, y_pred):
    """Return the semi-hard triplet loss (L2 distance) of y_pred[1] against y_true."""
    return tfa.losses.triplet_semihard_loss(
        y_true=y_true,
        y_pred=y_pred[1],
        distance_metric="L2",
    )
When I use model.fit, I only get the logits tensor in each loss; I can't get the embedding tensor. Indexing with y_pred[0] and y_pred[1] does not work. Any suggestions?
model = MyModel()
model.compile(optimizer=tf.keras.optimizers.Adam(lr=1e-3), loss=[loss_0, loss_1], loss_weights=[0.1, 0.1])
history = model.fit(train_dataset, epochs=5)

Why is the reduce_mean applied to the output of sparse_softmax_cross_entropy_with_logits?

There are several tutorials that applied reduce_mean to the output of sparse_softmax_cross_entropy_with_logits. For example
cross_entropy = -tf.reduce_sum(y_ * tf.log(y_conv))
or
cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=tf.cast(y_, dtype=tf.int32), logits=y_conv))
Why is the reduce_mean applied to the output of sparse_softmax_cross_entropy_with_logits? Is it because we are using mini-batches, and so we want to calculate (using reduce_mean) the average loss over all samples of the mini-batch?
The reason is to get the average loss over the batch.
Generally you will train a neural network with input batches of size > 1, each element in the batch will produce a loss value so the easiest way to merge these into one value is to average.
I found something interesting.
First, let us define sparse_vector as
sparse_vector = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=tf.cast(y_, dtype=tf.int32), logits=y_conv)
sparse_vector is a vector of per-example losses, and we need a single summary value from it; that is why we use reduce_mean.
import numpy as np
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=False)
print(mnist.test.labels.shape)
print(mnist.train.labels.shape)
with tf.name_scope('inputs'):
X_ = tf.placeholder(tf.float32, [None, 784])
y_ = tf.placeholder(tf.int64, [None])
X = tf.reshape(X_, [-1, 28, 28, 1])
h_conv1 = tf.layers.conv2d(X, filters=32, kernel_size=5, strides=1,
padding='same', activation=tf.nn.relu, name='conv1')
h_pool1 = tf.layers.max_pooling2d(h_conv1, pool_size=2, strides=2,
padding='same', name='pool1')
h_conv2 = tf.layers.conv2d(h_pool1, filters=64, kernel_size=5, strides=1,
padding='same',activation=tf.nn.relu, name='conv2')
h_pool2 = tf.layers.max_pooling2d(h_conv2, pool_size=2, strides=2,
padding='same', name='pool2')
# flatten
h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
h_fc1 = tf.layers.dense(h_pool2_flat, 1024, name='fc1', activation=tf.nn.relu)
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, 0.5)
h_fc2 = tf.layers.dense(h_fc1_drop, units=10, name='fc2')
# y_conv = tf.nn.softmax(h_fc2)
y_conv = h_fc2
# print('Finished building network.')
# cross_entropy = -tf.reduce_sum(y_*tf.log(y_conv))
sparse_vector = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=tf.cast(y_, dtype=tf.int32), logits=y_conv)
cross_entropy = tf.reduce_mean(sparse_vector)
sess.run(tf.global_variables_initializer())
# print(sparse_vector)
# print(cross_entropy)
# Tensor("SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits:0", shape=(?,), dtype=float32)
# Tensor("Mean:0", shape=(), dtype=float32)
batch = mnist.train.next_batch(10)
sparse_vector,cross_entropy = sess.run(
[sparse_vector,cross_entropy],
feed_dict={X_: batch[0], y_: batch[1]})
print(sparse_vector)
print(cross_entropy)
the output is
[2.2213464 2.2676413 2.3555744 2.3196406 2.0794516 2.394274 2.266591
2.3139718 2.345526 2.3952296]
2.2959247