Verifying the implementation of Multihead Attention in Transformer - tensorflow

I have implemented the MultiAttention head in Transformers. There are so many implementations around so it's confusing. Can someone please verify if my implementation is correct:
DotProductAttention referred from: https://www.tensorflow.org/tutorials/text/transformer#setup
import tensorflow as tf
def scaled_dot_product(q,k,v):
#calculates Q . K(transpose)
qkt = tf.matmul(q,k,transpose_b=True)
#caculates scaling factor
dk = tf.math.sqrt(tf.cast(q.shape[-1],dtype=tf.float32))
scaled_qkt = qkt/dk
softmax = tf.nn.softmax(scaled_qkt,axis=-1)
z = tf.matmul(softmax,v)
#shape: (m,Tx,depth), same shape as q,k,v
return z
class MultiAttention(tf.keras.layers.Layer):
def __init__(self,d_model,num_of_heads):
super(MultiAttention,self).__init__()
self.d_model = d_model
self.num_of_heads = num_of_heads
self.depth = d_model//num_of_heads
self.wq = [tf.keras.layers.Dense(self.depth) for i in range(num_of_heads)]
self.wk = [tf.keras.layers.Dense(self.depth) for i in range(num_of_heads)]
self.wv = [tf.keras.layers.Dense(self.depth) for i in range(num_of_heads)]
self.wo = tf.keras.layers.Dense(d_model)
def call(self,x):
multi_attn = []
for i in range(self.num_of_heads):
Q = self.wq[i](x)
K = self.wk[i](x)
V = self.wv[i](x)
multi_attn.append(scaled_dot_product(Q,K,V))
multi_head = tf.concat(multi_attn,axis=-1)
multi_head_attention = self.wo(multi_head)
return multi_head_attention
#Calling the attention
multi = MultiAttention(d_model=512,num_of_heads=8)
m = 5; sequence_length = 4; word_embedding_dim = 512
sample_ip = tf.constant(tf.random.normal(shape=(m,sequence_length,word_embedding_dim)))
attn =multi(sample_ip)
#shape of op (attn): (5,4,512)

In your implementation, in scaled_dot_product you scaled with query but according to the original paper, they used key to normalize. Apart from that, this implementation seems Ok but not general.
class MultiAttention(tf.keras.layers.Layer):
def __init__(self, num_of_heads, out_dim):
super(MultiAttention,self).__init__()
self.out_dim = out_dim
self.num_of_heads = num_of_heads
self.depth = self.out_dim // self.num_of_heads
self.wq = [tf.keras.layers.Dense(self.depth) for i in range(num_of_heads)]
self.wk = [tf.keras.layers.Dense(self.depth) for i in range(num_of_heads)]
self.wv = [tf.keras.layers.Dense(self.depth) for i in range(num_of_heads)]
self.wo = tf.keras.layers.Dense(self.out_dim)
def call(self,x):
multi_attn = []
for i in range(self.num_of_heads):
Q = self.wq[i](x)
K = self.wk[i](x)
V = self.wv[i](x)
multi_attn.append(self.scaled_dot_product(Q,K,V))
multi_head = tf.concat(multi_attn, axis=-1)
multi_head_attention = self.wo(multi_head)
return multi_head_attention
def scaled_dot_product(self, q,k,v):
qkt = tf.matmul(q, k, transpose_b=True)
dk = tf.math.sqrt( tf.cast(k.shape[-1], dtype=tf.float32) )
scaled_qkt = qkt/dk
softmax = tf.nn.softmax(scaled_qkt, axis=-1)
z = tf.matmul(softmax, v)
return z
multi = MultiAttention(num_of_heads=3, out_dim=32)
sample_ip = tf.random.normal(shape=(2, 2, 32)); print(sample_ip.shape)
multi(sample_ip).shape
The general transformer architecture can be demonstrated as follows where the first two linear layers represent query and key and responsible to produce attention weights maps and followed by weighted the value in matrix multiplication fashion.
Image Source.
I understand you're trying to minimize the original TF tutorial code but I think you should add reference first to your original question. In the original implementation, they also returned weighted probabilities or scores along with the weighted feature maps. I think you shouldn't skip that.
The original code that you're following is more general and efficient optimized.
class MultiHeadAttention(tf.keras.layers.Layer):
def __init__(self, d_model, num_heads):
super(MultiHeadAttention, self).__init__()
self.num_heads = num_heads
self.d_model = d_model
assert d_model % self.num_heads == 0
self.depth = d_model // self.num_heads
self.wq = tf.keras.layers.Dense(d_model)
self.wk = tf.keras.layers.Dense(d_model)
self.wv = tf.keras.layers.Dense(d_model)
self.dense = tf.keras.layers.Dense(d_model)
def scaled_dot_product_attention(self, q, k, v, mask=None):
matmul_qk = tf.matmul(q, k, transpose_b=True) # (..., seq_len_q, seq_len_k)
# scale matmul_qk
dk = tf.cast(tf.shape(k)[-1], tf.float32)
scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
# add the mask to the scaled tensor.
if mask is not None: scaled_attention_logits += (mask * -1e9)
# softmax is normalized on the last axis (seq_len_k) so that the scores
# add up to 1.
attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) # (..., seq_len_q, seq_len_k)
output = tf.matmul(attention_weights, v) # (..., seq_len_q, depth_v)
return output, attention_weights
def split_heads(self, x, batch_size):
"""Split the last dimension into (num_heads, depth).
Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
"""
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
return tf.transpose(x, perm=[0, 2, 1, 3])
def call(self, v, k, q, mask=None):
batch_size = tf.shape(q)[0]
q = self.wq(q) # (batch_size, seq_len, d_model)
k = self.wk(k) # (batch_size, seq_len, d_model)
v = self.wv(v) # (batch_size, seq_len, d_model)
q = self.split_heads(q, batch_size) # (batch_size, num_heads, seq_len_q, depth)
k = self.split_heads(k, batch_size) # (batch_size, num_heads, seq_len_k, depth)
v = self.split_heads(v, batch_size) # (batch_size, num_heads, seq_len_v, depth)
# scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
# attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
scaled_attention, attention_weights = self.scaled_dot_product_attention(q, k, v, mask)
scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3]) # (batch_size, seq_len_q, num_heads, depth)
concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model)) # (batch_size, seq_len_q, d_model)
output = self.dense(concat_attention) # (batch_size, seq_len_q, d_model)
return output, attention_weights
FYI, in TF 2.4, the tf.keras.layers.MultiHeadAttention layer is officially added.
layer = tf.keras.layers.MultiHeadAttention(num_heads=2, key_dim=2)
input_tensor = tf.keras.Input(shape=[2, 2, 32]); print(input_tensor.shape)
print(layer(input_tensor, input_tensor).shape)
You can test these two as follows:
# custom layer MHA
multi = MultiHeadAttention(d_model=512, num_heads=2)
y = tf.random.uniform((1, 60, 512))
out, attn = multi(y, k=y, q=y, mask=None)
out.shape, attn.shape
(TensorShape([1, 60, 512]), TensorShape([1, 2, 60, 60]))
# built-in layer
layer = tf.keras.layers.MultiHeadAttention(num_heads=2, key_dim=2)
y = tf.random.uniform((1, 60, 512))
out, attn = layer(y, y, return_attention_scores=True)
out.shape, attn.shape
(TensorShape([1, 60, 512]), TensorShape([1, 2, 60, 60]))

Related

the param of attention layer is 0

when I build multi_head_self_attention ,I found the param of this layer is 0,what is wrong with this attention layer?what should i do to modify this layer?
I initialize query, key, value in init,and by attention function ,I can get the result of query\key\value
class MultiHeadSelfAttention(Layer):
def __init__(self, embed_dim, num_heads): **num_heads represent the num of heads**
super(MultiHeadSelfAttention, self).__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
if embed_dim % num_heads != 0:
raise ValueError(
f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
)
self.projection_dim = embed_dim // num_heads
self.query_dense = Dense(embed_dim,use_bias=False)
self.key_dense = Dense(embed_dim,use_bias=False)
self.value_dense = Dense(embed_dim,use_bias=False)
self.combine_heads = Dense(embed_dim,use_bias=False)
def attention(self, query, key, value):
score = tf.matmul(query, key, transpose_b=True)
dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
scaled_score = score / tf.math.sqrt(dim_key)
weights = tf.nn.softmax(scaled_score, axis=-1)
output = tf.matmul(weights, value)
return output, weights
def separate_heads(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
return tf.transpose(x, perm=[0, 2, 1, 3])
def call(self, inputs):
**x.shape = [batch_size, seq_len, embedding_dim]**
batch_size = tf.shape(inputs)[0]
query = self.query_dense(inputs)
key = self.key_dense(inputs)
value = self.value_dense(inputs)
query = self.separate_heads(
query, batch_size
)
key = self.separate_heads(
key, batch_size
)
value = self.separate_heads(
value, batch_size
)
attention, weights = self.attention(query, key, value)
attention = tf.transpose(
attention, perm=[0, 2, 1, 3]
) * (batch_size, seq_len, num_heads, projection_dim)*
concat_attention = tf.reshape(
attention, (batch_size, -1, self.embed_dim)
) * (batch_size, seq_len, embed_dim)*
output = self.combine_heads(
concat_attention
) * (batch_size, seq_len, embed_dim)*
return output
x = MultiHeadSelfAttention(embed_dim, num_heads)(embed_input)

Mobilevit Binary classification ValueError: `logits` and `labels` must have the same shape, received ((None, 2) vs (None, 1))

I am using the colab notebook(https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/vision/ipynb/mobilevit.ipynb) for mobilevit to train on a dataset I have of 25k pictures for 2 classes. Since it's a binary classification, I have used keras.losses.BinaryCrossentropy and Sigmoid as activation function at the last layer:-
def create_mobilevit(num_classes=2):
inputs = keras.Input((image_size, image_size, 3))
x = layers.Rescaling(scale=1.0 / 255)(inputs)
# Initial conv-stem -> MV2 block.
x = conv_block(x, filters=16)
x = inverted_residual_block(
x, expanded_channels=16 * expansion_factor, output_channels=16
)
# Downsampling with MV2 block.
x = inverted_residual_block(
x, expanded_channels=16 * expansion_factor, output_channels=24, strides=2
)
x = inverted_residual_block(
x, expanded_channels=24 * expansion_factor, output_channels=24
)
x = inverted_residual_block(
x, expanded_channels=24 * expansion_factor, output_channels=24
)
# First MV2 -> MobileViT block.
x = inverted_residual_block(
x, expanded_channels=24 * expansion_factor, output_channels=48, strides=2
)
x = mobilevit_block(x, num_blocks=2, projection_dim=64)
# Second MV2 -> MobileViT block.
x = inverted_residual_block(
x, expanded_channels=64 * expansion_factor, output_channels=64, strides=2
)
x = mobilevit_block(x, num_blocks=4, projection_dim=80)
# Third MV2 -> MobileViT block.
x = inverted_residual_block(
x, expanded_channels=80 * expansion_factor, output_channels=80, strides=2
)
x = mobilevit_block(x, num_blocks=3, projection_dim=96)
x = conv_block(x, filters=320, kernel_size=1, strides=1)
# Classification head.
x = layers.GlobalAvgPool2D()(x)
outputs = layers.Dense(num_classes, activation="sigmoid")(x)
return keras.Model(inputs, outputs)
And here's my dataset preparation cell:-
batch_size = 64
auto = tf.data.AUTOTUNE
resize_bigger = 512
num_classes = 2
def preprocess_dataset(is_training=True):
def _pp(image, label):
if is_training:
# Resize to a bigger spatial resolution and take the random
# crops.
image = tf.image.resize(image, (resize_bigger, resize_bigger))
image = tf.image.random_crop(image, (image_size, image_size, 3))
image = tf.image.random_flip_left_right(image)
else:
image = tf.image.resize(image, (image_size, image_size))
label = tf.one_hot(label, depth=num_classes)
return image, label
return _pp
def prepare_dataset(dataset, is_training=True):
if is_training:
dataset = dataset.shuffle(batch_size * 10)
dataset = dataset.map(preprocess_dataset(is_training), num_parallel_calls=auto)
return dataset.batch(batch_size).prefetch(auto)
And this is the cell for training the model:-
learning_rate = 0.002
label_smoothing_factor = 0.1
epochs = 30
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
loss_fn = keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing_factor)
def run_experiment(epochs=epochs):
mobilevit_xxs = create_mobilevit(num_classes=num_classes)
mobilevit_xxs.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"])
checkpoint_filepath = "/tmp/checkpoint"
checkpoint_callback = keras.callbacks.ModelCheckpoint(
checkpoint_filepath,
monitor="val_accuracy",
save_best_only=True,
save_weights_only=True,
)
mobilevit_xxs.fit(
train_ds,
validation_data=val_ds,
epochs=epochs,
callbacks=[checkpoint_callback],
)
mobilevit_xxs.load_weights(checkpoint_filepath)
_, accuracy = mobilevit_xxs.evaluate(val_ds)
print(f"Validation accuracy: {round(accuracy * 100, 2)}%")
return mobilevit_xxs
mobilevit_xxs = run_experiment()
Basically the code is identical to https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/vision/ipynb/mobilevit.ipynb except for the change in BinaryCrossEntropy loss and Sigmoid as actv. func. I don't understand why I am getting this even though I am explicitly ont-hot-coded my class labels -
ValueError: `logits` and `labels` must have the same shape, received ((None, 2) vs (None, 1)).
You need to change the num_classes = 1 instead of num_classes = 2 as you have used Sigmoid activation function which returns the values between 0 to 1 for binary classification(0,1).
The values <0.5 will be considered as class 0 and values >0.5 will be as class 1 in between two binary classes (0,1).
Please refer to the replicated gist for your reference.

How to modify the Keras CycleGAN example code to run parallelly on GPUs using tf.strategy

Here is the example of CycleGAN from the Keras
CycleGAN Example Using Keras.
Here is my modified implementation to use multiple GPUs. To implement the custom training I have used a reference Custom training with tf.distribute.Strategy
I want an example of CycleGAN from the Keras to run fast using GPUs. As further I need to process and train a huge amount of data. As well as CycleGAN uses multiple loss functions train_step will return 4 types of losses, currently, I am just returning one for easier understanding. Still, the training on GPUs is dead slow. I am not able to find the reason behind this.
Am I using tf.distribute.Strategy wrongly?
"""
Title: CycleGAN
Author: [A_K_Nain](https://twitter.com/A_K_Nain)
Date created: 2020/08/12
Last modified: 2020/08/12
Description: Implementation of CycleGAN.
"""
"""
## CycleGAN
CycleGAN is a model that aims to solve the image-to-image translation
problem. The goal of the image-to-image translation problem is to learn the
mapping between an input image and an output image using a training set of
aligned image pairs. However, obtaining paired examples isn't always feasible.
CycleGAN tries to learn this mapping without requiring paired input-output images,
using cycle-consistent adversarial networks.
- [Paper](https://arxiv.org/pdf/1703.10593.pdf)
- [Original implementation](https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix)
"""
"""
## Setup
"""
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
tfds.disable_progress_bar()
autotune = tf.data.experimental.AUTOTUNE
# Create a MirroredStrategy.
strategy = tf.distribute.MirroredStrategy()
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
"""
## Prepare the dataset
In this example, we will be using the
[horse to zebra](https://www.tensorflow.org/datasets/catalog/cycle_gan#cycle_ganhorse2zebra)
dataset.
"""
# Load the horse-zebra dataset using tensorflow-datasets.
dataset, _ = tfds.load("cycle_gan/horse2zebra", with_info=True, as_supervised=True)
train_horses, train_zebras = dataset["trainA"], dataset["trainB"]
test_horses, test_zebras = dataset["testA"], dataset["testB"]
# Define the standard image size.
orig_img_size = (286, 286)
# Size of the random crops to be used during training.
input_img_size = (256, 256, 3)
# Weights initializer for the layers.
kernel_init = keras.initializers.RandomNormal(mean=0.0, stddev=0.02)
# Gamma initializer for instance normalization.
gamma_init = keras.initializers.RandomNormal(mean=0.0, stddev=0.02)
buffer_size = 256
batch_size = 1
def normalize_img(img):
img = tf.cast(img, dtype=tf.float32)
# Map values in the range [-1, 1]
return (img / 127.5) - 1.0
def preprocess_train_image(img, label):
# Random flip
img = tf.image.random_flip_left_right(img)
# Resize to the original size first
img = tf.image.resize(img, [*orig_img_size])
# Random crop to 256X256
img = tf.image.random_crop(img, size=[*input_img_size])
# Normalize the pixel values in the range [-1, 1]
img = normalize_img(img)
return img
def preprocess_test_image(img, label):
# Only resizing and normalization for the test images.
img = tf.image.resize(img, [input_img_size[0], input_img_size[1]])
img = normalize_img(img)
return img
"""
## Create `Dataset` objects
"""
BATCH_SIZE_PER_REPLICA = batch_size
GLOBAL_BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
# Apply the preprocessing operations to the training data
train_horses = (
train_horses.map(preprocess_train_image, num_parallel_calls=autotune)
.cache()
.shuffle(buffer_size)
.batch(GLOBAL_BATCH_SIZE)
)
train_zebras = (
train_zebras.map(preprocess_train_image, num_parallel_calls=autotune)
.cache()
.shuffle(buffer_size)
.batch(GLOBAL_BATCH_SIZE)
)
# Apply the preprocessing operations to the test data
test_horses = (
test_horses.map(preprocess_test_image, num_parallel_calls=autotune)
.cache()
.shuffle(buffer_size)
.batch(GLOBAL_BATCH_SIZE)
)
test_zebras = (
test_zebras.map(preprocess_test_image, num_parallel_calls=autotune)
.cache()
.shuffle(buffer_size)
.batch(GLOBAL_BATCH_SIZE)
)
# Visualize some samples
_, ax = plt.subplots(4, 2, figsize=(10, 15))
for i, samples in enumerate(zip(train_horses.take(4), train_zebras.take(4))):
horse = (((samples[0][0] * 127.5) + 127.5).numpy()).astype(np.uint8)
zebra = (((samples[1][0] * 127.5) + 127.5).numpy()).astype(np.uint8)
ax[i, 0].imshow(horse)
ax[i, 1].imshow(zebra)
plt.show()
plt.savefig('Visualize_Some_Samples')
plt.close()
# Building blocks used in the CycleGAN generators and discriminators
class ReflectionPadding2D(layers.Layer):
"""Implements Reflection Padding as a layer.
Args:
padding(tuple): Amount of padding for the
spatial dimensions.
Returns:
A padded tensor with the same type as the input tensor.
"""
def __init__(self, padding=(1, 1), **kwargs):
self.padding = tuple(padding)
super(ReflectionPadding2D, self).__init__(**kwargs)
def call(self, input_tensor, mask=None):
padding_width, padding_height = self.padding
padding_tensor = [
[0, 0],
[padding_height, padding_height],
[padding_width, padding_width],
[0, 0],
]
return tf.pad(input_tensor, padding_tensor, mode="REFLECT")
def residual_block(
x,
activation,
kernel_initializer=kernel_init,
kernel_size=(3, 3),
strides=(1, 1),
padding="valid",
gamma_initializer=gamma_init,
use_bias=False,
):
dim = x.shape[-1]
input_tensor = x
x = ReflectionPadding2D()(input_tensor)
x = layers.Conv2D(
dim,
kernel_size,
strides=strides,
kernel_initializer=kernel_initializer,
padding=padding,
use_bias=use_bias,
)(x)
x = tfa.layers.InstanceNormalization(gamma_initializer=gamma_initializer)(x)
x = activation(x)
x = ReflectionPadding2D()(x)
x = layers.Conv2D(
dim,
kernel_size,
strides=strides,
kernel_initializer=kernel_initializer,
padding=padding,
use_bias=use_bias,
)(x)
x = tfa.layers.InstanceNormalization(gamma_initializer=gamma_initializer)(x)
x = layers.add([input_tensor, x])
return x
def downsample(
x,
filters,
activation,
kernel_initializer=kernel_init,
kernel_size=(3, 3),
strides=(2, 2),
padding="same",
gamma_initializer=gamma_init,
use_bias=False,
):
x = layers.Conv2D(
filters,
kernel_size,
strides=strides,
kernel_initializer=kernel_initializer,
padding=padding,
use_bias=use_bias,
)(x)
x = tfa.layers.InstanceNormalization(gamma_initializer=gamma_initializer)(x)
if activation:
x = activation(x)
return x
def upsample(
x,
filters,
activation,
kernel_size=(3, 3),
strides=(2, 2),
padding="same",
kernel_initializer=kernel_init,
gamma_initializer=gamma_init,
use_bias=False,
):
x = layers.Conv2DTranspose(
filters,
kernel_size,
strides=strides,
padding=padding,
kernel_initializer=kernel_initializer,
use_bias=use_bias,
)(x)
x = tfa.layers.InstanceNormalization(gamma_initializer=gamma_initializer)(x)
if activation:
x = activation(x)
return x
def get_resnet_generator(
filters=64,
num_downsampling_blocks=2,
num_residual_blocks=9,
num_upsample_blocks=2,
gamma_initializer=gamma_init,
name=None,
):
img_input = layers.Input(shape=input_img_size, name=name + "_img_input")
x = ReflectionPadding2D(padding=(3, 3))(img_input)
x = layers.Conv2D(filters, (7, 7), kernel_initializer=kernel_init, use_bias=False)(
x
)
x = tfa.layers.InstanceNormalization(gamma_initializer=gamma_initializer)(x)
x = layers.Activation("relu")(x)
# Downsampling
for _ in range(num_downsampling_blocks):
filters *= 2
x = downsample(x, filters=filters, activation=layers.Activation("relu"))
# Residual blocks
for _ in range(num_residual_blocks):
x = residual_block(x, activation=layers.Activation("relu"))
# Upsampling
for _ in range(num_upsample_blocks):
filters //= 2
x = upsample(x, filters, activation=layers.Activation("relu"))
# Final block
x = ReflectionPadding2D(padding=(3, 3))(x)
x = layers.Conv2D(3, (7, 7), padding="valid")(x)
x = layers.Activation("tanh")(x)
model = keras.models.Model(img_input, x, name=name)
return model
"""
## Build the discriminators
The discriminators implement the following architecture:
`C64->C128->C256->C512`
"""
def get_discriminator(
filters=64, kernel_initializer=kernel_init, num_downsampling=3, name=None
):
img_input = layers.Input(shape=input_img_size, name=name + "_img_input")
x = layers.Conv2D(
filters,
(4, 4),
strides=(2, 2),
padding="same",
kernel_initializer=kernel_initializer,
)(img_input)
x = layers.LeakyReLU(0.2)(x)
num_filters = filters
for num_downsample_block in range(3):
num_filters *= 2
if num_downsample_block < 2:
x = downsample(
x,
filters=num_filters,
activation=layers.LeakyReLU(0.2),
kernel_size=(4, 4),
strides=(2, 2),
)
else:
x = downsample(
x,
filters=num_filters,
activation=layers.LeakyReLU(0.2),
kernel_size=(4, 4),
strides=(1, 1),
)
x = layers.Conv2D(
1, (4, 4), strides=(1, 1), padding="same", kernel_initializer=kernel_initializer
)(x)
model = keras.models.Model(inputs=img_input, outputs=x, name=name)
return model
"""
## Build the CycleGAN model
"""
class CycleGan(keras.Model):
def __init__(
self,
generator_G,
generator_F,
discriminator_X,
discriminator_Y,
lambda_cycle=10.0,
lambda_identity=0.5,
):
super(CycleGan, self).__init__()
self.gen_G = generator_G
self.gen_F = generator_F
self.disc_X = discriminator_X
self.disc_Y = discriminator_Y
self.lambda_cycle = lambda_cycle
self.lambda_identity = lambda_identity
def compile(
self,
gen_G_optimizer,
gen_F_optimizer,
disc_X_optimizer,
disc_Y_optimizer,
gen_loss_fn,
disc_loss_fn,
cycle_loss_fn,
identity_loss_fn
):
super(CycleGan, self).compile()
self.gen_G_optimizer = gen_G_optimizer
self.gen_F_optimizer = gen_F_optimizer
self.disc_X_optimizer = disc_X_optimizer
self.disc_Y_optimizer = disc_Y_optimizer
self.generator_loss_fn = gen_loss_fn
self.discriminator_loss_fn = disc_loss_fn
#self.cycle_loss_fn = keras.losses.MeanAbsoluteError()
#self.identity_loss_fn = keras.losses.MeanAbsoluteError()
self.cycle_loss_fn = cycle_loss_fn
self.identity_loss_fn = identity_loss_fn
def train_step(self, batch_data):
# x is Horse and y is zebra
real_x, real_y = batch_data
with tf.GradientTape(persistent=True) as tape:
# Horse to fake zebra
fake_y = self.gen_G(real_x, training=True)
# Zebra to fake horse -> y2x
fake_x = self.gen_F(real_y, training=True)
# Cycle (Horse to fake zebra to fake horse): x -> y -> x
cycled_x = self.gen_F(fake_y, training=True)
# Cycle (Zebra to fake horse to fake zebra) y -> x -> y
cycled_y = self.gen_G(fake_x, training=True)
# Identity mapping
same_x = self.gen_F(real_x, training=True)
same_y = self.gen_G(real_y, training=True)
# Discriminator output
disc_real_x = self.disc_X(real_x, training=True)
disc_fake_x = self.disc_X(fake_x, training=True)
disc_real_y = self.disc_Y(real_y, training=True)
disc_fake_y = self.disc_Y(fake_y, training=True)
# Generator adverserial loss
gen_G_loss = self.generator_loss_fn(disc_fake_y)
gen_F_loss = self.generator_loss_fn(disc_fake_x)
# Generator cycle loss
cycle_loss_G = self.cycle_loss_fn(real_y, cycled_y) * self.lambda_cycle
cycle_loss_F = self.cycle_loss_fn(real_x, cycled_x) * self.lambda_cycle
# Generator identity loss
id_loss_G = (
self.identity_loss_fn(real_y, same_y)
* self.lambda_cycle
* self.lambda_identity
)
id_loss_F = (
self.identity_loss_fn(real_x, same_x)
* self.lambda_cycle
* self.lambda_identity
)
# Total generator loss
total_loss_G = gen_G_loss + cycle_loss_G + id_loss_G
total_loss_F = gen_F_loss + cycle_loss_F + id_loss_F
# Discriminator loss
disc_X_loss = self.discriminator_loss_fn(disc_real_x, disc_fake_x)
disc_Y_loss = self.discriminator_loss_fn(disc_real_y, disc_fake_y)
# Get the gradients for the generators
grads_G = tape.gradient(total_loss_G, self.gen_G.trainable_variables)
grads_F = tape.gradient(total_loss_F, self.gen_F.trainable_variables)
# Get the gradients for the discriminators
disc_X_grads = tape.gradient(disc_X_loss, self.disc_X.trainable_variables)
disc_Y_grads = tape.gradient(disc_Y_loss, self.disc_Y.trainable_variables)
# Update the weights of the generators
self.gen_G_optimizer.apply_gradients(
zip(grads_G, self.gen_G.trainable_variables)
)
self.gen_F_optimizer.apply_gradients(
zip(grads_F, self.gen_F.trainable_variables)
)
# Update the weights of the discriminators
self.disc_X_optimizer.apply_gradients(
zip(disc_X_grads, self.disc_X.trainable_variables)
)
self.disc_Y_optimizer.apply_gradients(
zip(disc_Y_grads, self.disc_Y.trainable_variables)
)
return total_loss_G
# return [total_loss_G, total_loss_F, disc_X_loss, disc_Y_loss]
# Open a strategy scope.
with strategy.scope():
mae_loss_fn = keras.losses.MeanAbsoluteError(reduction=tf.keras.losses.Reduction.NONE)
# Loss function for evaluating cycle consistency loss
def cycle_loss_fn(real, cycled):
cycle_loss = mae_loss_fn(real, cycled)
cycle_loss = tf.nn.compute_average_loss(cycle_loss, global_batch_size=GLOBAL_BATCH_SIZE)
return cycle_loss
# Loss function for evaluating identity mapping loss
def identity_loss_fn(real, same):
identity_loss = mae_loss_fn(real, same)
identity_loss = tf.nn.compute_average_loss(identity_loss, global_batch_size=GLOBAL_BATCH_SIZE)
return identity_loss
# Loss function for evaluating adversarial loss
adv_loss_fn = keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)
# Define the loss function for the generators
def generator_loss_fn(fake):
fake_loss = adv_loss_fn(tf.ones_like(fake), fake)
fake_loss = tf.nn.compute_average_loss(fake_loss, global_batch_size=GLOBAL_BATCH_SIZE)
return fake_loss
# Define the loss function for the discriminators
def discriminator_loss_fn(real, fake):
real_loss = adv_loss_fn(tf.ones_like(real), real)
fake_loss = adv_loss_fn(tf.zeros_like(fake), fake)
real_loss = tf.nn.compute_average_loss(real_loss, global_batch_size=GLOBAL_BATCH_SIZE)
fake_loss = tf.nn.compute_average_loss(fake_loss, global_batch_size=GLOBAL_BATCH_SIZE)
return (real_loss + fake_loss) * 0.5
# Get the generators
gen_G = get_resnet_generator(name="generator_G")
gen_F = get_resnet_generator(name="generator_F")
# Get the discriminators
disc_X = get_discriminator(name="discriminator_X")
disc_Y = get_discriminator(name="discriminator_Y")
# Create cycle gan model
cycle_gan_model = CycleGan(
generator_G=gen_G, generator_F=gen_F, discriminator_X=disc_X, discriminator_Y=disc_Y
)
optimizer = keras.optimizers.Adam(learning_rate=2e-4, beta_1=0.5)
# Compile the model
cycle_gan_model.compile(
gen_G_optimizer=optimizer,
gen_F_optimizer=optimizer,
disc_X_optimizer=optimizer,
disc_Y_optimizer=optimizer,
gen_loss_fn=generator_loss_fn,
disc_loss_fn=discriminator_loss_fn,
cycle_loss_fn=cycle_loss_fn,
identity_loss_fn=identity_loss_fn
)
train_dist_dataset = strategy.experimental_distribute_dataset(
tf.data.Dataset.zip((train_horses,
train_zebras)))
# `run` replicates the provided computation and runs it
# with the distributed input.
#tf.function
def distributed_train_step(dataset_inputs):
per_replica_losses = strategy.run(cycle_gan_model.train_step, args=(dataset_inputs,))
return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses,
axis=None)
"""
## Train the end-to-end model
"""
for epoch in range(1):
# TRAIN LOOP
all_loss = 0.0
num_batches = 0.0
for one_batch in train_dist_dataset:
all_loss += distributed_train_step(one_batch)
num_batches += 1
train_loss = all_loss/num_batches
print(train_loss)

Problem running Tensorflow Transformer Tutorial

I was checking the TensorFlow tutorial "Transformer model for language understanding," and I copied the code exactly as it is into my Spyder 4 environment. However, the code shows the following error when running:
AttributeError: 'RepeatedCompositeFieldContainer' object has no attribute 'append'
I checked the code and realized that the error comes from the call function of the MultiHeadAttention class. However, I do not understand what the problem is since the code runs just fine in the Colab notebook.
class MultiHeadAttention(tf.keras.layers.Layer):
def __init__(self, d_model, num_heads):
super(MultiHeadAttention, self).__init__()
self.num_heads = num_heads
self.d_model = d_model
assert d_model % self.num_heads == 0
self.depth = d_model // self.num_heads
self.wq = tf.keras.layers.Dense(d_model)
self.wk = tf.keras.layers.Dense(d_model)
self.wv = tf.keras.layers.Dense(d_model)
self.dense = tf.keras.layers.Dense(d_model)
def split_heads(self, x, batch_size):
"""Split the last dimension into (num_heads, depth).
Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
"""
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
return tf.transpose(x, perm=[0, 2, 1, 3])
def call(self, v, k, q, mask):
batch_size = tf.shape(q)[0]
q = self.wq(q) # (batch_size, seq_len, d_model)
k = self.wk(k) # (batch_size, seq_len, d_model)
v = self.wv(v) # (batch_size, seq_len, d_model)
q = self.split_heads(q, batch_size) # (batch_size, num_heads, seq_len_q, depth)
k = self.split_heads(k, batch_size) # (batch_size, num_heads, seq_len_k, depth)
v = self.split_heads(v, batch_size) # (batch_size, num_heads, seq_len_v, depth)
# scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
# attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
scaled_attention, attention_weights = scaled_dot_product_attention(
q, k, v, mask)
scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3]) # (batch_size, seq_len_q,
num_heads, depth)
concat_attention = tf.reshape(scaled_attention,
(batch_size, -1, self.d_model)) # (batch_size, seq_len_q, d_model)
output = self.dense(concat_attention) # (batch_size, seq_len_q, d_model)
return output, attention_weights
The error shows when executing the line of q = self.wq(q) in the call function. Any help will be appreciated.
Thanks in advance.
I suspect that the problem is that your protobuf Python package version is too old. It should be >=3.8.0. See the troubleshooting here.
I was seeing the same error message and upgrading protobuf proved to be the solution. In my case there was extra anguish in making sure Python found the upgraded package in the labyrinth of remnants of old Python installations, virtual environments and the PYTHONPATH pointing to an old installation with an old protobuf.

TensorFlow training with large dataset takes too long

Yesterday, I have created a pretrained VGG19 with custom head and tried to train it with 60000 images. After more than 12 hours, the training of first epoch didn't complete.
The batch size has been set to 64 and the number of steps per epoch has been set to training_set_size/batch_size.
Below is the code of DataLoader:
IMAGE_CHANNEL = 3
def crop(image, margin):
return image[margin:-margin, margin:-margin]
def random_rotation(image, angle):
M = cv2.getRotationMatrix2D((0, 0),angle,1)
rows,cols, _ = image.shape
new_img = cv2.warpAffine(image, M, (cols, rows))
return new_img
def get_generator(in_gen, should_augment=True):
weights = None
if should_augment:
image_gen = tf.keras.preprocessing.image.ImageDataGenerator(fill_mode='reflect',
data_format='channels_last',
brightness_range=[0.5, 1.5])
else:
image_gen = tf.keras.preprocessing.image.ImageDataGenerator(fill_mode='reflect',
data_format='channels_last',
brightness_range=[1, 1])
for items in in_gen:
in_x, in_y = items
g_x = image_gen.flow(255 * in_x, in_y, batch_size=in_x.shape[0])
x, y = next(g_x)
yield x / 255.0, y
class DataLoader:
def __init__(self, source_filename, dataset_path, image_size, batch_size, training_set_size=0.8, sample_size=None):
path_dataset = Path(dataset_path)
path_image_folders = path_dataset / 'images'
self.data = pd.read_pickle(source_filename)
if sample_size is not None:
self.data = self.data[:sample_size]
self.image_size = image_size
self.batch_size = batch_size
self.training_set_size = training_set_size
self.steps_per_epoch = int(self.data.shape[0] * training_set_size // batch_size)
if self.steps_per_epoch == 0: self.steps_per_epoch = 1
self.validation_steps = int(self.data.shape[0] * (1 - training_set_size)//batch_size)
if self.validation_steps == 0: self.validation_steps = 1
def draw_idx(self, i):
img_path = self.data.iloc[i].image
img = tf.keras.preprocessing.image.img_to_array(tf.keras.preprocessing.image.load_img(str(img_path)))
# print(img.shape)
height, width, _ = img.shape
fig = plt.figure(figsize=(15, 15), facecolor='w')
# original image
ax = fig.add_subplot(1, 1, 1)
ax.imshow(img / 255.0)
openness = self.data.iloc[i].Openness
conscientiousness = self.data.iloc[i].Conscientiousness
extraversion = self.data.iloc[i].Extraversion
agreeableness = self.data.iloc[i].Agreeableness
neuroticism = self.data.iloc[i].Neuroticism
ax.title.set_text(
f'O: {openness}, C: {conscientiousness}, E: {extraversion}, A: {agreeableness}, N: {neuroticism}')
plt.axis('off')
plt.tight_layout()
plt.show()
def get_image(self, index, data, should_augment):
# Read image and appropiate landmarks
image = cv2.imread(data['image'].values[index])
h, w, _ = image.shape
o, c, e, a, n = data[['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism']].values[
index]
should_flip = random.randint(0, 1)
should_rotate = random.randint(0, 1)
should_crop = random.randint(0, 1)
if should_augment:
if should_flip == 1:
# print("Image {} flipped".format(data['path'].values[index]))
image = cv2.flip(image, 1)
if should_rotate == 1:
angle = random.randint(-5, 5)
image = random_rotation(image, angle)
if should_crop == 1:
margin = random.randint(1, 10)
image = crop(image, margin)
image = cv2.resize(image, (self.image_size, self.image_size))
return [image, o, c, e, a, n]
def generator(self, data, should_augment=True):
while True:
# Randomize the indices to make an array
indices_arr = np.random.permutation(data.count()[0])
for batch in range(0, len(indices_arr), self.batch_size):
# slice out the current batch according to batch-size
current_batch = indices_arr[batch:(batch + self.batch_size)]
# initializing the arrays, x_train and y_train
x_train = np.empty(
[0, self.image_size, self.image_size, IMAGE_CHANNEL], dtype=np.float32)
y_train = np.empty([0, 5], dtype=np.int32)
for i in current_batch:
# get an image and its corresponding color for an traffic light
[image, o, c, e, a, n] = self.get_image(i, data, should_augment)
# Appending them to existing batch
x_train = np.append(x_train, [image], axis=0)
y_train = np.append(y_train, [[o, c, e, a, n]], axis=0)
# replace nan values with zeros
y_train = np.nan_to_num(y_train)
yield (x_train, y_train)
def get_training_and_test_generators(self, should_augment_training=True, should_augment_test=True):
msk = np.random.rand(len(self.data)) < self.training_set_size
train = self.data[msk]
test = self.data[~msk]
train_gen = self.generator(train, should_augment_training)
test_gen = self.generator(test, should_augment_test)
return get_generator(train_gen, should_augment_training), get_generator(test_gen, should_augment_test)
def show_batch_images_sample(self, images, landmarks, n_rows=3, n_cols=3):
assert n_rows * n_cols <= self.batch_size, "Number of expected images to display is larger than batch!"
fig = plt.figure(figsize=(15, 15))
xs, ys = [], []
count = 1
for img, y in zip(images, landmarks):
ax = fig.add_subplot(n_rows, n_cols, count)
ax.imshow(img)
h, w, _ = img.shape
o, c, e, a, n = y
ax.title.set_text(f'{o}, {c}, {e}, {a}, {n}')
ax.axis('off')
if count == n_rows * n_cols:
break
count += 1
class CallbackTensorboardImageOutput(Callback):
def __init__(self, model, generator, log_dir, feed_inputs_display=9):
# assert ((feed_inputs_display & (feed_inputs_display - 1)) == 0) and feed_inputs_display != 0
self.generator = generator
self.model = model
self.log_dir = log_dir
self.writer = tf.summary.create_file_writer(self.log_dir)
self.feed_inputs_display = feed_inputs_display
self.seen = 0
def plot_to_image(figure):
"""Converts the matplotlib plot specified by 'figure' to a PNG image and
returns it. The supplied figure is closed and inaccessible after this call."""
# Save the plot to a PNG in memory.
buf = io.BytesIO()
plt.savefig(buf, format='png')
# Closing the figure prevents it from being displayed directly inside
# the notebook.
plt.close(figure)
buf.seek(0)
# Convert PNG buffer to TF image
image = tf.image.decode_png(buf.getvalue(), channels=4)
# Add the batch dimension
image = tf.expand_dims(image, 0)
return image
#staticmethod
def get_loss(gt, predictions):
return tf.losses.mse(gt, predictions)
def on_epoch_end(self, epoch, logs={}):
self.seen += 1
if self.seen % 1 == 0:
items = next(self.generator)
images_to_display = self.feed_inputs_display
images_per_cell_count = int(math.sqrt(images_to_display))
# in case of regular model training using generator, an array is passed
if not isinstance(items, dict):
frames_arr, ocean_scores = items
# Take just 1st sample from batch
batch_size = frames_arr.shape[0]
if images_to_display > batch_size:
images_to_display = batch_size
frames_arr = frames_arr[0:images_to_display]
ocean_scores = ocean_scores[0:images_to_display]
y_pred = self.model.predict(frames_arr)
# in case of adversarial training, a dictionary is passed
else:
batch_size = items['feature'].shape[0]
if images_to_display > batch_size:
images_to_display = batch_size
# items['feature'] = items['feature'][0:images_to_display]
# landmarks = items['label'][0:images_to_display]
frames_arr = items['feature']
landmarks = items['label']
y_pred = self.model.predict(items)
figure = plt.figure(figsize=(15, 15))
for i in range(images_to_display):
image_current = frames_arr[i]
y_prediction_current = y_pred[i]
y_gt_current = ocean_scores[i]
lbl_prediction = 'plot/img/{}'.format(i)
ax = plt.subplot(images_per_cell_count, images_per_cell_count, i + 1, title=lbl_prediction)
ax.imshow(image_current)
ax.axis('off')
with self.writer.as_default():
tf.summary.image("Training Data", CallbackTensorboardImageOutput.plot_to_image(figure), step=self.seen)
Below is the definition of the network architecture and the call of fit_generator function:
data_loader = dataloader.DataLoader('dataset.pkl', '/home/niko/data/PsychoFlickr', 224, 64)
train_gen, test_gen = data_loader.get_training_and_test_generators()
pre_trained_model = tf.keras.applications.VGG19(input_shape=(data_loader.image_size, data_loader.image_size, dataloader.IMAGE_CHANNEL), weights='imagenet', include_top=False)
x = pre_trained_model.output
x = tf.keras.layers.Flatten()(x)
# Add a fully connected layer with 256 hidden units and ReLU activation
x = tf.keras.layers.Dense(256)(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Activation('relu')(x)
x = tf.keras.layers.Dropout(rate=0.5)(x)
x = tf.keras.layers.Dense(256)(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Activation('relu')(x)
x = tf.keras.layers.Dropout(rate=0.5)(x)
x = tf.keras.layers.Dense(5, name='regresion_output')(x)
x = tf.keras.layers.Activation('linear')(x)
model = tf.keras.Model(pre_trained_model.input, x)
print(model.summary())
log_dir = "logs/{}".format(model_name)
model_filename = "saved-models/{}.h5".format(model_name)
cb_tensorboard = TensorBoard(log_dir=log_dir)
callback_save_images = dataloader.CallbackTensorboardImageOutput(model, test_gen, log_dir)
checkpoint = ModelCheckpoint(model_filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
lr = 1e-3
opt = tf.optimizers.Adam(lr=lr)
model.compile(loss=loss_mse, optimizer=opt, metrics=[loss_mse])
history = model.fit_generator(
train_gen,
validation_data=test_gen,
steps_per_epoch=data_loader.steps_per_epoch,
epochs=20,
validation_steps=data_loader.validation_steps,
verbose=2,
use_multiprocessing=True,
callbacks=[checkpoint, callback_save_images, cb_tensorboard]
)
When I tried to run the same procedure with small sample data (200 records), everything seemed to work fine. On the dataset of 60000 records, however, after more than 12 hours the training of 1st epoch hasn't completed.
The training is performed on NVIDIA RTX2080Ti.
I would be thankful if anyone suggested what has to be modified or in general configured in order to train the network on reasonable time.