GCN model is not learning - tensorflow

I am trying to implement a GCN layer using tensorflow, but it is not learning. Can someone check what potential issue could be?
I have tried normalizing the adjacency matrix and even replaced it with identity so that the GCN layer becomes a simple MLP. But there is no change. I think, I have made some fundamental/silly mistake in my implementation which I am not able to find. Can someone let me know what the issue could be?
!pip install numpy
!pip install tensorflow
!pip install spektral
#!pip install tqdm
import numpy as np
import tensorflow as
import spektral
def masked_cross_entropy_loss( labels,logits, mask ):
loss = tf.nn.softmax_cross_entropy_with_logits(labels,logits )
mask = tf.cast(mask, dtype=tf.float32)
# the step below is important, because we need to find mean of only masked nodes
# dividing the mask by its mean = mask * total_nodes/total_masked nodes, comes in handy when we try to take the mean of the loss in the final step
# the total number of nodes that are input to the function are cancelled out between two means of mask and loss
# What remains is only the total number of masked nodes in denominator.
mask /= tf.reduce_mean(mask)
loss *= mask
return tf.reduce_mean(loss)
def masked_accuracy( labels, logits, mask ):
accuracy_array = tf.equal(tf.argmax(logits, axis=1), tf.argmax(labels, axis=1))
accuracy_array = tf.cast(accuracy_array, dtype =tf.float32)
mask = tf.cast(mask, dtype = tf.float32)
mask/= tf.reduce_mean(mask)
accuracy_array *= mask
return tf.reduce_mean(accuracy_array)
class GCNLayer:
def __init__( self, A ):
self.A = A
def _transform( self, units, features, trans_func ):
if trans_func == 'dense':
features = tf.keras.layers.Dense(units)( features )
features = tf.cast(features, dtype=tf.float32)
return features
else:
raise Exception('Transformation function not implemented')
def _aggregate( self, features, agg_func ):
if agg_func == 'adj_matmul':
return self.A # features
else:
raise Exception('Aggregation function not implemented')
def _activate( self, features, activation ):
features = tf.keras.layers.Activation(activation)( features)
return features
def __call__( self, units, features, trans_func='dense', agg_func = 'adj_matmul', activation='relu' ):
features = self._transform(units, features, trans_func )
features = self._aggregate(features, agg_func)
if activation is not None:
features = self._activate(features, activation)
return features
class MyModel:
def __init__( self, A, node_features, node_labels, train_mask, val_mask, test_mask ):
self.A = A
self.node_features = node_features
self.node_labels = node_labels
self.train_mask = train_mask
self.val_mask = val_mask
self.test_mask = test_mask
self.gcn_layer1 = GCNLayer(self.A)
self.gcn_layer2 = GCNLayer(self.A)
def __call__( self ):
hidden_out = self.gcn_layer1(32, self.node_features, activation='relu' )
output = self.gcn_layer2(7, hidden_out, activation=None)
return output
def train( self, num_epochs=1, lr =0.01 ):
optimizer = tf.keras.optimizers.Adam(lr)
best_val_acc = 0.0
for e in range(num_epochs):
with tf.GradientTape() as t:
logits = self()
train_loss = masked_cross_entropy_loss( self.node_labels, logits, self.train_mask )
variables = t.watched_variables()
grads = t.gradient(train_loss, variables)
optimizer.apply_gradients(zip(grads, variables))
logits = self()
train_acc = masked_accuracy( self.node_labels, logits, self.train_mask )
val_acc = masked_accuracy( self.node_labels, logits, self.val_mask )
if val_acc > best_val_acc:
best_val_acc = val_acc
print(f'epoch={e},Training Loss:{train_loss.numpy()},Training Accuracy:{train_acc.numpy()}, Validation Accuracy:{val_acc.numpy()}')
model = MyModel(A, node_features, node_labels, train_mask, val_mask, test_mask)
model.train(num_epochs=200, lr=0.01)
Output
epoch=0,Training Loss:4.099794864654541,Training Accuracy:0.1428571492433548, Validation Accuracy:0.09000000357627869
epoch=1,Training Loss:6.438627243041992,Training Accuracy:0.20714285969734192, Validation Accuracy:0.16599997878074646
epoch=5,Training Loss:5.980966091156006,Training Accuracy:0.17142857611179352, Validation Accuracy:0.17399999499320984
epoch=13,Training Loss:3.9486303329467773,Training Accuracy:0.15000000596046448, Validation Accuracy:0.2800000011920929
epoch=40,Training Loss:5.182331562042236,Training Accuracy:0.23571430146694183, Validation Accuracy:0.29600000381469727
epoch=158,Training Loss:6.245728969573975,Training Accuracy:0.2142857164144516, Validation Accuracy:0.3160000145435333

Your model is learning but doesn't converge. Consider checking/adding data ,use simpler model, or tuning parameters while training (e.g: learning rate, batches size).

I found the problem in my code. I had instantiated the tf.keras.Dense layer in the call function which was causing the weights to be initialized on every epoch confusing the GradientTape.

Related

How to create a keras layer with a custom gradient *and learnable parameters* in TF2.0?

this is a similar question to: How to create a keras layer with a custom gradient in TF2.0?
Only, I would like to introduce a learnable parameter into the custom layer that I am training.
Here's a toy example of my current approach here:
# Method for calculation custom gradient
#tf.custom_gradient
def scaler(x, s):
def grad(upstream):
dy_dx = s
dy_ds = x
return dy_dx, dy_ds
return x * s, grad
# Keras Layer with trainable parameter
class TestLayer(tf.keras.layers.Layer):
def build(self, input_shape):
self.scale = self.add_weight("scale",
shape=[1,],
initializer=tf.keras.initializers.Constant(value=2.0),
trainable=True)
def call(self, inputs):
return scaler(inputs, self.scale)
# Creates Keras Model that uses the layer
def Model():
x_in = tf.keras.layers.Input(shape=(1,))
x_out = TestLayer()(x_in)
return tf.keras.Model(inputs=x_in, outputs=x_out, name="fp8_test")
# Create toy dataset, want to learn `scale` such to satisfy 5 = 2 * scale (i.e, `scale` should learn ~2.5)
def Dataset():
inps = tf.ones(shape=(10**5,)) * 2 # inputs
expected = tf.ones(shape=(10**5,)) * 5 # targets
data_in = tf.data.Dataset.from_tensors(inps)
data_exp = tf.data.Dataset.from_tensors(expected)
dataset = tf.data.Dataset.zip((data_in, data_exp))
return dataset
model = Model()
model.summary()
dataset = Dataset()
# Use `MSE` loss and `SGD` optimizer
model.compile(
loss=tf.keras.losses.MSE,
optimizer=tf.keras.optimizers.SGD(),
)
model.fit(dataset, epochs=100)
This is failing with the following shape related error in the optimizer:
ValueError: Shapes must be equal rank, but are 1 and 2 for '{{node SGD/SGD/update/ResourceApplyGradientDescent}} = ResourceApplyGradientDescent[T=DT_FLOAT, use_locking=true](fp8_test/test_layer_1/ReadVariableOp/resource, SGD/Identity, SGD/IdentityN)' with input shapes: [], [], [100000,1].
I've been staring at the docs for a while, I'm a bit stumped as to why this isn't working, I would really appreciate any input on how to fix this toy example.
Thanks in advance.

How to handle target decoder inputs for self attention transformer model during predict()

My question is essentially a duplicate of this one, where I'm confused as to what to pass into the decoder during the predict() (i.e., call()) phase. I've modified tutorials found here and here in order to create this script. This is being used for the purposes of self-attention on a time series dataset for regression (not NLP).
There's too much boilerplate to provide the full model so I'll write in the pertinent script:
Transformer.py
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
# The following imports are my custom Layers/Functions
from Utilities.MachineLearning.Keras.Layers.Encoder import Encoder
from Utilities.MachineLearning.Keras.Layers.Decoder import Decoder
from Utilities.MachineLearning.Keras.Functions.etc import create_padding_mask, create_look_ahead_mask
def create_masks(input, target):
# Encoder padding mask
encoder_mask = create_padding_mask(input)
# Used in the 2nd attention block in the decoder.
# This padding mask is used to mask the encoder outputs.
decoder_mask = create_padding_mask(input)
# Used in the 1st attention block in the decoder.
# It is used to pad and mask future tokens in the input received by
# the decoder.
look_ahead_mask = create_look_ahead_mask(tf.shape(target)[1])
target_mask = create_padding_mask(target)
encoder_decoder_mask = tf.maximum(target_mask, look_ahead_mask)
return encoder_mask, encoder_decoder_mask, decoder_mask
class Transformer(Model):
def __init__(
self,
num_inputs,
num_outputs=1,
num_heads=1,
num_layers=1,
num_embedding_inputs=None,
num_ff_inputs=None,
dropout=0,
):
super().__init__()
self.encoder = Encoder(
num_inputs,
num_heads,
num_layers,
num_embedding_inputs,
num_ff_inputs,
dropout,
)
self.decoder = Decoder(
num_inputs,
num_heads,
num_layers,
num_embedding_inputs,
num_ff_inputs,
dropout,
)
self.output_layer = Dense(num_outputs, name="Output")
def call(
self,
inputs,
targets,
training=None,
):
encoder_mask, encoder_decoder_mask, decoder_mask = create_masks(inputs, targets)
encoder_output = self.encoder(inputs, encoder_mask, training)
decoder_output, attention_weights = self.decoder(
targets, encoder_output, encoder_decoder_mask, decoder_mask, training
)
output = self.output_layer(decoder_output)
return output, attention_weights
train_step_signature = [
tf.TensorSpec(shape=(None, None), dtype=tf.int64),
tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]
#tf.function(input_signature=train_step_signature)
def train_step(self, data):
x, y = data
with tf.GradientTape() as tape:
y_pred = self(x, y, training=True)
loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)
# Compute gradients
trainable_vars = self.trainable_variables
gradients = tape.gradient(loss, trainable_vars)
# Update weights
self.optimizer.apply_gradients(zip(gradients, trainable_vars))
# Update metrics (includes the metric that tracks the loss)
self.compiled_metrics.update_state(y, y_pred)
# Return a dict mapping metric names to current value
return {m.name: m.result() for m in self.metrics}
SelfAttention.py
# Don't worry about what Custom is, it's basically a modified Keras Model
from Utilities.MachineLearning.Keras.Models.Custom import Custom
from Utilities.MachineLearning.Keras.Models.Transformer import Transformer
class SelfAttention(Custom):
def initialize(self):
self.transformer = Transformer(
self.batch_input_shape[-1],
num_heads=self.attention_units,
dropout=self.attention_dropout,
name="Transformer",
)
def call(self, inputs, training=False):
# TODO: What about `targets`?
return self.transformer(inputs, training=training)
There was no point in using a decoder as all the relevant information for time series data is used by the encoder block.

Perceptual loss function is not giving any gradient tensorflow

I am trying to implement perceptual loss function in tensorflow and here is
loss_model = tf.keras.models.Sequential()
for eachLayer in base_model.layers[:12]:
eachLayer.trainable=False
loss_model.add(eachLayer)
def meanSquaredLoss(y_true,y_pred):
return tf.reduce_mean(tf.keras.losses.MSE(y_true,y_pred))
def featureLoss(image):
predicted_image = model(image,training=False)
activatedModelVal = loss_model(predicted_image,training=False)
actualModelVal = loss_model(image,training=False)
return meanSquaredLoss(actualModelVal,activatedModelVal)
Here is the style loss function given by:
def gram_matrix(input_tensor):
result = tf.linalg.einsum('bijc,bijd->bcd', input_tensor, input_tensor)
input_shape = tf.shape(input_tensor)
num_locations = tf.cast(input_shape[1]*input_shape[2], tf.float32)
return result/(num_locations)
def styleLoss(image):
predicted_image = model(image,training=False)
activatedModelVal = loss_model(predicted_image,training=False)
actualModelVal = loss_model(image,training=False)
return meanSquaredLoss(gram_matrix(actualModelVal),gram_matrix(activatedModelVal))
So now I have both losses, and here is what I have done for optimization and stuffs!
opt = tf.keras.optimizers.Adam(0.02)
def each_train_step(image,showImage=False):
predicted_image = model(image,training=False)
loss = tf.reduce_sum(featureLoss(predicted_image,image)+styleLoss(predicted_image,image))
with tf.GradientTape() as tape:
grad = tape.gradient(loss, model.trainable_variables)
print(grad)
# opt.apply_gradients(zip(grad, model.trainable_variables))
if showImage:
plt.imshow(predicted_image)
The problem is the grad object is getting list of None and I don't know WHY! Why is the gradient returning list of None? Any solution to get the actual gradients ?

How to apply a computed loss to a graph?

I am new to tensorflow and trying to code a toy discriminator problem. The way I have it set up, the loss is calculated from the expert_actions and the novice_actions. However, I am running an error when I am trying to optimize using the computed loss. The error is ValueError: No variables to optimize. I do understand that I am getting the error because there is no feed_dict. However, I do not know the solution to this.
class discriminator:
def __init__(self,n_actions, learning_rate):
self.n_actions = n_actions
self.learning_rate_dist = learning_rate
self.graph = tf.Graph()
with self.graph.as_default():
self.dis_input = tf.placeholder(tf.float32, [None, self.n_actions])
self.discriminator_function()
init = tf.global_variables_initializer()
self.sess = tf.Session(graph=self.graph)
self.sess.run(init)
def discriminator_function(self, hidden = None):
if hidden == None:
hidden = 16
x = tf.layers.dense(self.dis_input,hidden,tf.nn.relu)
x = tf.layers.dense(x,hidden,tf.nn.relu)
self.dis_output = tf.layers.dense(x,1)
def discriminator(self,expert_actions,novice_actions):
expert_out = self.sess.run(self.dis_output,feed_dict={self.dis_input : expert_actions})
novice_out = self.sess.run(self.dis_output,feed_dict={self.dis_input : novice_actions})
loss = tf.reduce_mean(tf.log(expert_out) + tf.log(1.-novice_out))
# update discriminator loss
optimize = tf.train.AdamOptimizer(self.learning_rate_dis).minimize(-loss)
self.sess.run(optimize) #error over here
return loss
if __name__ == '__main__':
d = discriminator(2,0.001)
expert_actions = np.random.randint(2, size=10)
novice_actions = np.random.randint(2, size=10)
d.discriminator(expert_actions,novice_actions)
You are trying to optimize loss = tf.reduce_mean(tf.log(expert_out) + tf.log(1.-novice_out)) with expert_out and novice_out being numpy arrays. There are no variables between the input and loss, to compute gradients.
Your discriminator function should be something like this:
def discriminator(self,expert_actions,novice_actions):
#Make sure you add the new ops and variables to the graph defined.
with self.graph.as_default():
loss = tf.reduce_mean(tf.log('Should be a tensor that is part of the graph and not a numpy array))
optimize = tf.train.AdamOptimizer(0.01).minimize(-loss)
self.sess.run(tf.global_variables_initializer())
#pass the inputs here
loss = self.sess.run([loss, optimize], feed_dict={self.dis_input : expert_actions})
return loss

How do I load a checkpoint using tensorflow in eager execution mode?

I am using tensorflow 1.7.0 in eager execution mode. I have the model working, but none of the examples that I have found for saving the model work.
This is the code that I am using:
checkpoint_directory ='./JokeWords/'
checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
checkpoint = tfe.Checkpoint(model=model,optimizer=optimizer) # save as "x"
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory))
evaluate(model,jokes,2,32)
....
checkpoint.save(file_prefix=checkpoint_prefix)
I have trained the model and use evaluate to check the results when loading from a restart. Each time I get a random result from evaluate, meaning that the model is not loading from the data, but instead only having random weights.
How do I save the model? It can take days to train one of these.
Edit. Here is the model:
class EagerRNN(tfe.Network):
def __init__(self,embedding, hidden_dim, num_layers, keep_ratio):
super(EagerRNN, self).__init__()
self.keep_ratio = keep_ratio
self.cells = self._add_cells([
tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_dim)
for _ in range(num_layers)
])
self.backcells = self._add_cells([
tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_dim)
for _ in range(num_layers)
])
self.linear = layers.Dense(embedding. vocab_size, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
self.backlinear = layers.Dense(embedding. vocab_size, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
self.attension = layers.Dense(hidden_dim, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
def call(self, input_seq,seq_lengths, training):
lengths=[i[0] for i in seq_lengths]
nRotations=max(lengths)
batchSize=input_seq.shape[0]
input_seq2 = tf.unstack(input_seq, num=int(input_seq.shape[1]), axis=1)
atten = None
state = self.cells[0].zero_state(batchSize, tf.float32)
for i in range(0,nRotations):
for j in range(0,len(self.cells)):
c=self.cells[j]
inp=input_seq2[i]
output, state = c(inp, state)
#input_seq2[i]=(output)
if atten==None:
atten =self.linear(output)
else:
atten=atten+self.linear(output)
for i in range(nRotations-1,-1,-1):
for j in range(0,len(self.backcells)):
c=self.backcells[j]
inp=input_seq2[i]
output, state = c(inp, state)
#input_seq2[i]=(output)
atten=atten+self.backlinear(output)
#input_seq = tf.stack(input_seq2[0:nRotations], axis=1)
atten=self.attension(atten)
if training:
input_seq = tf.nn.dropout(input_seq, self.keep_ratio)
# Returning a list instead of a single tensor so that the line:
# y = self.rnn(y, ...)[0]
# in PTBModel.call works for both this RNN and CudnnLSTM (which returns a
# tuple (output, output_states).
return input_seq,state,atten
def _add_cells(self, cells):
# "Magic" required for keras.Model classes to track all the variables in
# a list of Layer objects.
# TODO(ashankar): Figure out API so user code doesn't have to do this.
for i, c in enumerate(cells):
setattr(self, "cell-%d" % i, c)
return cells
class EagerLSTM_Model(tfe.Network):
"""LSTM for word language modeling.
Model described in:
(Zaremba, et. al.) Recurrent Neural Network Regularization
http://arxiv.org/abs/1409.2329
See also:
https://github.com/tensorflow/models/tree/master/tutorials/rnn/ptb
"""
def __init__(self,
embedding,
hidden_dim,
num_layers,
dropout_ratio,
use_cudnn_rnn=True):
super(EagerLSTM_Model, self).__init__()
self.keep_ratio = 1 - dropout_ratio
self.use_cudnn_rnn = use_cudnn_rnn
self.embedding = embedding
if self.use_cudnn_rnn:
self.rnn = cudnn_rnn.CudnnLSTM(
num_layers, hidden_dim, dropout=dropout_ratio)
else:
self.rnn = EagerRNN(embedding,hidden_dim, num_layers, self.keep_ratio)
self.unrnn = EagerUnRNN(embedding,hidden_dim, num_layers, self.keep_ratio)
def callRNN(self, input_seq,seq_lengths, training):
y = self.embedding.callbatchword(input_seq)
if training:
y = tf.nn.dropout(y, self.keep_ratio)
y,state,atten = self.rnn.call(y,seq_lengths, training=training)
return state,atten
def callUnRNN (self,state,atten,seq_lengths, training ):
x,state = self.unrnn(state,atten,seq_lengths,training=training)
#b=tf.reshape(y, self._output_shape)
#c=self.linear(b)
return x
tfe.Network is not (easily) Checkpointable and it will soon be deprecated. Prefer to subclass tf.Keras.Model instead. So if you change class EagerRNN(tfe.Network) to class EagerRNN(tf.keras.Model) and class EagerLSTM_Model(tfe.Network) to class EagerLSTM_Model(tf.keras.Model), checkpoint.save(file_prefix=checkpoint_prefix) should actually save all your variables and checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory)) should restore them.