Updating keras model using tf gradients - tensorflow

I'm trying to build an a3c implementation in keras. I have experience working with keras, but absolutely no experience working with tensorflow. So I would really apreciate if someone could make it as simple as possible, since I want to finish it as fast as possible without diving too deep into tensorflow.
self.session = tf.Session()
K.set_session(self.session)
K.manual_variable_initialization(True)
self.stop_signal = False
self.model = self._build_model()
self.graph = self._build_graph(self.model)
self.session.run(tf.global_variables_initializer())
self.default_graph = tf.get_default_graph()
self.default_graph.finalize() # avoid modifications
def _build_model(self):
l_input = Input(batch_shape=(None, NUM_STATE))
input_layer = Reshape((1, -1))(l_input)
lstm = LSTM(64, activation='relu', return_sequences=True)(input_layer)
lstm = LSTM(128, activation='relu', return_sequences=True)(lstm)
lstm = LSTM(128, activation='relu')(lstm)
out_actions = Dense(NUM_ACTIONS, activation='softmax')(lstm)
out_value = Dense(1, activation='linear')(lstm)
model = Model(inputs=[l_input], outputs=[out_actions, out_value])
model._make_predict_function() # have to initialize before threading
return model
def _build_graph(self, model):
s_t = tf.placeholder(tf.float32, shape=(None, NUM_STATE))
a_t = tf.placeholder(tf.float32, shape=(None, NUM_ACTIONS))
r_t = tf.placeholder(tf.float32, shape=(None, 1))
p, v = model(s_t)
log_prob = tf.log(tf.reduce_sum(p * a_t, axis=1, keepdims=True) + 1e-10)
advantage = r_t - v
loss_policy = - log_prob * tf.stop_gradient(advantage)
loss_value = LOSS_V * tf.square(advantage)
entropy = LOSS_ENTROPY * tf.reduce_sum(p * tf.log(p + 1e-10), axis=1, keepdims=True)
loss_total = tf.reduce_mean(loss_policy + loss_value + entropy)
optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
minimize = optimizer.minimize(loss_total)
return s_t, a_t, r_t, minimize
Then it is beeing trained:
s_t, a_t, r_t, minimize = self.graph
self.session.run(minimize, feed_dict={s_t: s, a_t: a, r_t: r})
Predictions are done this way:
with self.default_graph.as_default():
p, v = self.model.predict(s)
So I want to update my keras model weights using these gradients after I finish training in order to save it using model.save('path.h5'). Peudo code:
model_weights = model.trainable_weights
model_weights = apply_gradients(grades, model_weights)
model = model.set_weights(model_weights)
model.save('path.h5')
The code was taken from here with little changes: https://github.com/jaara/AI-blog/blob/master/CartPole-A3C.py
I found something on this topic but can't really figure out how to actually use it.
https://github.com/keras-team/keras/issues/3062
https://github.com/keras-team/keras/issues/3069

Turns out the problem has to do with the algorithm not converging properly. If someone knows what can I do to make it converge? I'm using custom environment and I trained a DQN on this environment in the past and it successfully converged. I aslo implemented target model which I update every 300 steps (or 1 episode in my case).

Related

Problem with Deep Sarsa algorithm which work with pytorch (Adam optimizer) but not with keras/Tensorflow (Adam optimizer)

I have a deep sarsa algorithm which work great on Pytorch on lunar-lander-v2 and I would use with Keras/Tensorflow. It use mini-batch of size 64 which are used 128 time to train at each episode.
There are the results I get. As you can see, it work great with Pytorch but not with Keras / Tensorflow... So I think I do not correctly implement the training function is Keras/Tensorflow (code is below).
It seems that loss is oscillating in Keras because epsilon go to early to slow value but it work very great in Pytorch...
Do you see something that could explain why it do not work in Keras/Tensorflow please? Thanks a lot for your help and any idea that could help me...
Network information:
It use Adam optimizer, and a network with two layers : 256 and 128, with relu on each:
class Q_Network(nn.Module):
def __init__(self, state_dim , action_dim):
super(Q_Network, self).__init__()
self.x_layer = nn.Linear(state_dim, 256)
self.h_layer = nn.Linear(256, 128)
self.y_layer = nn.Linear(128, action_dim)
print(self.x_layer)
def forward(self, state):
xh = F.relu(self.x_layer(state))
hh = F.relu(self.h_layer(xh))
state_action_values = self.y_layer(hh)
return state_action_values
For keras/Tensorflwo I use this one:
def CreationModele(dimension):
entree_etat = keras.layers.Input(shape=(dimension))
sortie = keras.layers.Dense(units=256, activation='relu')(entree_etat)
sortie = keras.layers.Dense(units=128, activation='relu')(sortie)
sortie = keras.layers.Dense(units=4)(sortie)
modele = keras.Model(inputs=entree_etat,outputs=sortie)
return modele
Training code
In Pytorch, the training is done by:
def update_Sarsa_Network(self, state, next_state, action, next_action, reward, ends):
actions_values = torch.gather(self.qnet(state), dim=1, index=action.long())
next_actions_values = torch.gather(self.qnet(next_state), dim=1, index=next_action.long())
next_actions_values = reward + (1.0 - ends) * (self.discount_factor * next_actions_values)
q_network_loss = self.MSELoss_function(actions_values, next_actions_values.detach())
self.qnet_optim.zero_grad()
q_network_loss.backward()
self.qnet_optim.step()
return q_network_loss
And in Keras/Tensorflow by:
mse = keras.losses.MeanSquaredError(
reduction=keras.losses.Reduction.SUM)
#tf.function
def train(model, batch_next_states_tensor, batch_next_actions_tensor, batch_reward_tensor, batch_end_tensor, batch_states_tensor, batch_actions_tensor, optimizer, gamma):
with tf.GradientTape() as tape:
# EStimation des valeurs des actions courantes
actions_values = model(batch_states_tensor) # (mini_batch_size,4)
actions_values = tf.linalg.diag_part(tf.gather(actions_values,batch_actions_tensor,axis=1)) # (mini_batch_size,)
actions_values = tf.expand_dims(actions_values,-1) # (mini_batch_size,1)
# EStimation des valeurs des actions suivantes
next_actions_values = model(batch_next_states_tensor) # (mini_batch_size,4)
next_actions_values = tf.linalg.diag_part(tf.gather(next_actions_values,batch_next_actions_tensor,axis=1)) # (mini_batch_size,)
cibles = batch_reward_tensor + (1.0 - batch_end_tensor)*gamma*tf.expand_dims(next_actions_values,-1) # (mini_batch_size,1)
error = mse(cibles, actions_values)
grads = tape.gradient(error, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
return error
Error function and Optimizer code
The optimizer is Adam in Pytorch and Tensorflow with lr=0.001. In Pytorch:
def __init__(self, state_dim, action_dim):
self.qnet = Q_Network(state_dim, action_dim)
self.qnet_optim = torch.optim.Adam(self.qnet.parameters(), lr=0.001)
self.discount_factor = 0.99
self.MSELoss_function = nn.MSELoss(reduction='sum')
self.replay_buffer = ReplayBuffer()
pass
In Keras / Tensorflow:
alpha = 1e-3
# Initialise le modèle
modele_Keras = CreationModele(8)
optimiseur_Keras = keras.optimizers.Adam(learning_rate=alpha)
Ok I finnaly foud a solution by de-correlate target and action value using two model, one being updated periodically for target values calculation.
I use a model for estimating the epsilon-greedy actions and computing the Q(s,a) values and a fixed model (but periodically uptated with the weight of the previous model) for calculate the targer r+gamma*Q(s',a').
Here is my result :

Object localization MNIST Tensorflow to Pytorch : Losses doesn't decrease

I am trying to convert a Tensorflow object localization code into Pytorch. In the original code, the author use model.compile / model.fit to train the model so I don't understand how the losses of classification of the MNIST digits and box regressions work. Still, I'm trying to implement my own training loop in Pytorch.
The goal here is, after some preprocessing, past the MNIST digits randomly into a black square image and then, classify and localize (bounding boxes) the digit.
I set two losses : nn.CrossEntropyLoss and nn.MSELoss and I do (loss_1+loss_2).backward() to compute the gradients. I know it's the right way to compute gradients with two losses from here and here.
But still, my loss doesn't decrease whereas it collapses quasi-imediately with the Tensorflow code. I checked the model with torchinfo.summary and it seems behaving as well as the Tensorflow implementation.
EDIT :
I looked for the predicted labels of my model and it doesn't seem to change at all.
This line of code label_preds, bbox_coords_preds = model(digits) always returns the same values
label_preds[0] = tensor([[0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156, 0.0156]], device='cuda:0', grad_fn=<SliceBackward0>)
Here are my questions :
Is my custom network set correctly ?
Are my losses set correctly ?
Why my label predictions don't change ?
Do my training loop work as well as the .compile and .fit Tensorflow methods ?
Thanks a lot !
PYTORCH CODE
class ConvNetwork(nn.Module):
def __init__(self):
super(ConvNetwork, self).__init__()
self.conv2d_1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3)
self.conv2d_2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3)
self.conv2d_3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
self.avgPooling2D = nn.AvgPool2d((2,2))
self.dense_1 = nn.Linear(in_features=3136, out_features=128)
self.dense_classifier = nn.Linear(in_features=128, out_features=10)
self.softmax = nn.Softmax(dim=0)
self.dense_regression = nn.Linear(in_features=128, out_features=4)
def forward(self, input):
x = self.avgPooling2D(F.relu(self.conv2d_1(input)))
x = self.avgPooling2D(F.relu(self.conv2d_2(x)))
x = self.avgPooling2D(F.relu(self.conv2d_3(x)))
x = nn.Flatten()(x)
x = F.relu(self.dense_1(x))
output_classifier = self.softmax(self.dense_classifier(x))
output_regression = self.dense_regression(x)
return [output_classifier, output_regression]
######################################################
learning_rate = 0.1
EPOCHS = 1
BATCH_SIZE = 64
model = ConvNetwork()
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
classification_loss = nn.CrossEntropyLoss()
regression_loss = nn.MSELoss()
######################################################
begin_time = time.time()
for epoch in range(EPOCHS) :
tot_loss = 0
train_start = time.time()
training_losses = []
print("-"*20)
print(" "*5 + f"EPOCH {epoch+1}/{EPOCHS}")
print("-"*20)
model.train()
for batch, (digits, labels, bbox_coords) in enumerate(training_dataset):
digits, labels, bbox_coords = digits.to(device), labels.to(device), bbox_coords.to(device)
optimizer.zero_grad()
[label_preds, bbox_coords_preds] = model(digits)
class_loss = classification_loss(label_preds, labels)
box_loss = regression_loss(bbox_coords_preds, bbox_coords)
training_loss = class_loss + box_loss
training_loss.backward()
optimizer.step()
######### print part #######################
training_losses.append(training_loss.item())
if batch+1 <= len_training_ds//BATCH_SIZE:
current_training_sample = (batch+1)*BATCH_SIZE
else:
current_training_sample = (batch)*BATCH_SIZE + len_training_ds%BATCH_SIZE
if (batch+1) == 1 or (batch+1)%100 == 0 or (batch+1) == len_training_ds//BATCH_SIZE +1:
print(f"Elapsed time : {(time.time()-train_start)/60:.3f}",\
f" --- Digit : {current_training_sample}/{len_training_ds}",\
f" : loss = {training_loss:.5f}")
if batch+1 == (len_training_ds//BATCH_SIZE)+1:
print(f"Total elapsed time for training : {(time.time()-begin_time)/60:.3f}")
ORIGINAL TENSORFLOW CODE
def feature_extractor(inputs):
x = tf.keras.layers.Conv2D(16, activation='relu', kernel_size=3, input_shape=(75, 75, 1))(inputs)
x = tf.keras.layers.AveragePooling2D((2, 2))(x)
x = tf.keras.layers.Conv2D(32,kernel_size=3,activation='relu')(x)
x = tf.keras.layers.AveragePooling2D((2, 2))(x)
x = tf.keras.layers.Conv2D(64,kernel_size=3,activation='relu')(x)
x = tf.keras.layers.AveragePooling2D((2, 2))(x)
return x
def dense_layers(inputs):
x = tf.keras.layers.Flatten()(inputs)
x = tf.keras.layers.Dense(128, activation='relu')(x)
return x
def classifier(inputs):
classification_output = tf.keras.layers.Dense(10, activation='softmax', name = 'classification')(inputs)
return classification_output
def bounding_box_regression(inputs):
bounding_box_regression_output = tf.keras.layers.Dense(units = '4', name = 'bounding_box')(inputs)
return bounding_box_regression_output
def final_model(inputs):
feature_cnn = feature_extractor(inputs)
dense_output = dense_layers(feature_cnn)
classification_output = classifier(dense_output)
bounding_box_output = bounding_box_regression(dense_output)
model = tf.keras.Model(inputs = inputs, outputs = [classification_output,bounding_box_output])
return model
def define_and_compile_model(inputs):
model = final_model(inputs)
model.compile(optimizer='adam',
loss = {'classification' : 'categorical_crossentropy',
'bounding_box' : 'mse'
},
metrics = {'classification' : 'accuracy',
'bounding_box' : 'mse'
})
return model
inputs = tf.keras.layers.Input(shape=(75, 75, 1,))
model = define_and_compile_model(inputs)
EPOCHS = 10 # 45
steps_per_epoch = 60000//BATCH_SIZE # 60,000 items in this dataset
validation_steps = 1
history = model.fit(training_dataset,
steps_per_epoch=steps_per_epoch,
validation_data=validation_dataset,
validation_steps=validation_steps, epochs=EPOCHS)
loss, classification_loss, bounding_box_loss, classification_accuracy, bounding_box_mse = model.evaluate(validation_dataset, steps=1)
print("Validation accuracy: ", classification_accuracy)
I answering to myself about this bug :
What I found :
I figured that I use a Softmax layer in my code while I'm using the nn.CrossEntropyLoss() as a loss.
What this problem was causing :
This loss already apply a softmax (doc)
Apply a softmax twice must add some noise to the loss and preventing convergence
What I did :
One should let a linear layer as an output for the classification layer.
An other way is to use the NLLLoss (doc) instead and let the softmax layer in the model class.
Also :
I don't fully understand how the .compile() and .fit() Tensorflow methods work but I think it should optimize the training one way or another (I think about the learning rate) since I had to decrease the learning rate to 0.001 in Pytorch to "unstick" the loss and makes it decrease.

Completely different results using Tensorflow and Pytorch for MobilenetV3 Small

I am using transfer learning from MobileNetV3 Small to predict 5 different points on an image. I am doing this as a regression task.
For both models:
Setting the last 50 layers trainable and adding the same fully connected layers to the end.
Learning rate 3e-2
Batch size 32
Adam optimizer with the same betas
100 epochs
The inputs consist of RGB unscaled images
Pytorch
Model
def _init_weights(m):
if type(m) == nn.Linear:
nn.init.xavier_uniform_(m.weight)
m.bias.data.fill_(0.01)
def get_mob_v3_small():
model = torchvision.models.mobilenet_v3_small(pretrained=True)
children_list = get_children(model)
for c in children_list[:-50]:
for p in c.parameters():
p.requires_grad = False
return model
class TransferMobileNetV3_v2(nn.Module):
def __init__(self,
num_keypoints: int = 5):
super(TransferMobileNetV3_v2, self).__init__()
self.classifier_neurons = num_keypoints*2
self.base_model = get_mob_v3_small()
self.base_model.classifier = nn.Sequential(
nn.Linear(in_features=1024, out_features=1024),
nn.ReLU(),
nn.Linear(in_features=1024, out_features=512),
nn.ReLU(),
nn.Linear(in_features=512, out_features=self.classifier_neurons)
)
self.base_model.apply(_init_weights)
def forward(self, x):
out = self.base_model(x)
return out
Training Script
def train(net, trainloader, testloader, train_loss_fn, optimizer, scaler, args):
len_dataloader = len(trainloader)
for epoch in range(1, args.epochs+1):
net.train()
for batch_idx, sample in enumerate(trainloader):
inputs, labels = sample
inputs, labels = inputs.to(args.device), labels.to(args.device)
optimizer.zero_grad()
with torch.cuda.amp.autocast(args.use_amp):
prediction = net(inputs)
loss = train_loss_fn(prediction, labels)
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
def main():
args = make_args_parser()
args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
seed = args.seed
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(seed)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=3e-2,
betas=(0.9, 0.999))
scaler = torch.cuda.amp.GradScaler(enabled=args.use_amp)
train(net, train_loader, test_loader, loss_fn, optimizer, scaler, args)
Tensorflow
Model
base_model = tf.keras.applications.MobileNetV3Small(weights='imagenet',
input_shape=(224,224,3))
x_in = base_model.layers[-6].output
x = Dense(units=1024, activation="relu")(x_in)
x = Dense(units=512, activation="relu")(x)
x = Dense(units=10, activation="linear")(x)
model = Model(inputs=base_model.input, outputs=x)
for layer in model.layers[:-50]:
layer.trainable=False
Training Script
model.compile(loss = "mse",
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-2))
history = model.fit(input_numpy, output_numpy,
verbose=1,
batch_size=32, epochs=100,validation_split = 0.2)
Results
The PyTorch model predicts one single point around the center for all 5 different points.
The Tensorflow model predicts the points quite well and are quite accurate.
The loss in the Pytorch model is much higher than the Tensorflow model.
Please do let me know what is going wrong as I am trying my best to shift to PyTorch for this work and I need this model to give me similar/identical results. Please do let me know what is going wrong as I am trying my best to shift to PyTorch for this work and I need this model to give me similar/identical results.
Note: I also noticed that the MobileNetV3 Small model seems to be different in PyTorch and different in Tensorflow. I do not know if am interpreting it wrong, but I'm putting it here just in case.

My Pytorch model is giving very bad results

I am new with Deep Learning with Pytorch. I am more experienced with Tensorflow, and thus I should say I am not new to Deep Learning itself.
Currently, I am working on a simple ANN classification. There are only 2 classes so quite naturally I am using a Softmax BCELoss combination.
The dataset is like this:
shape of X_train (891, 7)
Shape of Y_train (891,)
Shape of x_test (418, 7)
I transformed the X_train and others to torch tensors as train_data and so on. The next step is:
train_ds = TensorDataset(train_data, train_label)
# Define data loader
batch_size = 32
train_dl = DataLoader(train_ds, batch_size, shuffle=True)
I made the model class like:
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
# an affine operation: y = Wx + b
self.fc1 = nn.Linear(7, 32)
self.bc1 = nn.BatchNorm1d(32)
self.fc2 = nn.Linear(32, 64)
self.bc2 = nn.BatchNorm1d(64)
self.fc3 = nn.Linear(64, 128)
self.bc3 = nn.BatchNorm1d(128)
self.fc4 = nn.Linear(128, 32)
self.bc4 = nn.BatchNorm1d(32)
self.fc5 = nn.Linear(32, 10)
self.bc5 = nn.BatchNorm1d(10)
self.fc6 = nn.Linear(10, 1)
self.bc6 = nn.BatchNorm1d(1)
self.drop = nn.Dropout2d(p=0.5)
def forward(self, x):
torch.nn.init.xavier_uniform(self.fc1.weight)
x = self.fc1(x)
x = self.bc1(x)
x = F.relu(x)
x = self.drop(x)
x = self.fc2(x)
x = self.bc2(x)
x = F.relu(x)
#x = self.drop(x)
x = self.fc3(x)
x = self.bc3(x)
x = F.relu(x)
x = self.drop(x)
x = self.fc4(x)
x = self.bc4(x)
x = F.relu(x)
#x = self.drop(x)
x = self.fc5(x)
x = self.bc5(x)
x = F.relu(x)
x = self.drop(x)
x = self.fc6(x)
x = self.bc6(x)
x = torch.sigmoid(x)
return x
model = Net()
The loss function and the optimizer are defined:
loss = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
At last, the task is to run the forward in epochs:
num_epochs = 1000
# Repeat for given number of epochs
for epoch in range(num_epochs):
# Train with batches of data
for xb,yb in train_dl:
pred = model(xb)
yb = torch.unsqueeze(yb, 1)
#print(pred, yb)
print('grad', model.fc1.weight.grad)
l = loss(pred, yb)
#print('loss',l)
# 3. Compute gradients
l.backward()
# 4. Update parameters using gradients
optimizer.step()
# 5. Reset the gradients to zero
optimizer.zero_grad()
# Print the progress
if (epoch+1) % 10 == 0:
print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, l.item()))
I can see in the output that after each iteration with all the batches, the hard weights are non-zero, after this zero_grad is applied.
However, the model is pretty bad. I get an F1 score of around 50% only! And the model is bad when I call it to predict the train_dl itself!!!
I am wondering what the reason is. The grad of weights not zero but not updating properly? The optimizer not optimizing the weights? Or what else?
Can someone please have a look?
I already tried different loss functions and optimizers. I tried with smaller datasets, bigger batches, different hyperparameters.
Thanks! :)
First of all, you don't use softmax activation for BCE loss, unless you have 2 output nodes, which is not the case. In PyTorch, BCE loss doesn't apply any activation function before calculating the loss, unlike the CCE which has a built-in softmax function. So, if you want to use BCE, you have to use sigmoid (or any function f: R -> [0, 1]) at the output layer, which you don't have.
Moreover, you should ideally do optimizer.zero_grad() for each batch if you want to do SGD (which is the default). If you don't do that, you will be just doing full-batch gradient descent, which is quite slow and gets stuck in local minima easily.

character level bidirectional language model in tensorflow

Inspired from Andrej Karpathy Char-RNN, There is a Tensorflow implementation of char-rnn sherjilozair/char-rnn-tensorflow: Multi-layer Recurrent Neural Networks (LSTM, RNN) for character-level language models in Python using Tensorflow. I want to implement bidirectional character level language model from this code. I change the model.py and wrote a simple code:
class Model:
def __init__(self, input_data, targets, seq_length=Config.max_seq_length, training=True):
if Config.model == 'rnn':
cell_fn = rnn.BasicRNNCell
elif Config.model == 'gru':
cell_fn = rnn.GRUCell
elif Config.model == 'lstm':
cell_fn = rnn.BasicLSTMCell
elif Config.model == 'nas':
cell_fn = rnn.NASCell
else:
raise Exception("model type not supported: {}".format(Config.model))
fw_cells = []
bw_cells = []
for _ in range(Config.num_layers):
fw_cell = cell_fn(Config.rnn_size)
bw_cell = cell_fn(Config.rnn_size)
fw_cells.append(fw_cell)
bw_cells.append(bw_cell)
self.fw_cell = rnn.MultiRNNCell(fw_cells, state_is_tuple=True)
self.bw_cell = rnn.MultiRNNCell(bw_cells, state_is_tuple=True)
self.input_data, self.targets = input_data, targets
with tf.variable_scope('rnnlm'):
softmax_w = tf.get_variable("softmax_w", [Config.rnn_size*2, Config.vocab_size])
softmax_b = tf.get_variable("softmax_b", [Config.vocab_size])
embedding = tf.get_variable("embedding", [Config.vocab_size, Config.rnn_size])
inputs = tf.nn.embedding_lookup(embedding, self.input_data)
inputs = tf.unstack(inputs, num=seq_length, axis=1)
outputs, _, _ = tf.nn.static_bidirectional_rnn(self.fw_cell, self.bw_cell, inputs,
dtype=tf.float32, scope='rnnlm')
output = tf.reshape(tf.concat(outputs, 1), [-1, Config.rnn_size*2])
self.logits = tf.matmul(output, softmax_w) + softmax_b
self.probs = tf.nn.softmax(self.logits)
self.lr = tf.Variable(0.0, trainable=False)
if training:
loss = legacy_seq2seq.sequence_loss_by_example(
[self.logits],
[tf.reshape(self.targets, [-1])],
[tf.sign(tf.cast(tf.reshape(self.targets, [-1]), dtype=tf.float32))])
with tf.name_scope('cost'):
self.cost = tf.reduce_mean(loss)
tvars = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), Config.grad_clip)
with tf.name_scope('optimizer'):
optimizer = tf.train.AdamOptimizer(self.lr)
self.train_op = optimizer.apply_gradients(zip(grads, tvars))
In training phase, I see a fast converge. After near 3000 iteration, the loss reach 0.003. In test phase, the probability of all character is 1.0. I think there is a mistake.
I will so glad to get some help to find my mistake.
use the preceding and following output to predict the prob of current word. In your case, you used current rnn output to predict the prob of current word.
Looks like you set self.lr = tf.Variable(0.0, trainable=False). Try changing this to a nonzero value. If you are reading probabilities from self.probs during the testing phase this should be normalized appropriately,