Related
I have made a ranking model using tensorflow_ranking losses and metrics, but the ListMLELoss() is always 0. The model will train and complete, but I imagine no learning is actually happening since the loss is not getting calculated. I tried to follow this guide, https://www.tensorflow.org/recommenders/examples/listwise_ranking, as well as I could, but there are some differences in use cases so it is a bit different. I am not sure why model.fit() runs and I get an NDCG value, but clearly the model cannot be learning as a loss value is not getting computed.
Here is my ranking model class:
class RankingModel(tf.keras.Model):
def __init__(self, embeddings, vocab_size_dict, dim_dict, loss, activation='sigmoid'):
super().__init__()
self.embeddings = embeddings
self.embedding_layers = {}
self.vocab_size_dict = vocab_size_dict
self.dim_dict = dim_dict
self.activation = activation
self.loss = loss
self.embedding_layers['feature_one'] = tf.keras.layers.Embedding(
self.vocab_size_dict['feature_one']+1,
self.dim_dict['feature_one'],
name='embedded_feature_one')
self.embedding_layers['feature_two'] = tf.keras.layers.Embedding(
self.vocab_size_dict['feature_two']+1,
self.dim_dict['feature_two'],
name='embedded_feature_two')
self.embedding_layers['feature_three'] = tf.keras.layers.Embedding(
self.vocab_size_dict['feature_three']+1,
self.dim_dict['feature_three'],
name='embedded_feature_three')
self.embedding_layers['feature_four'] = tf.keras.layers.Embedding(
self.vocab_size_dict['feature_four']+1,
self.dim_dict['feature_four'],
name='embedded_feature_four')
self.embedding_layers['feature_five'] = tf.keras.layers.Embedding(
self.vocab_size_dict['feature_five']+1,
self.dim_dict['feature_five'],
name='embedded_feature_five')
self.flatten = tf.keras.layers.Flatten()
self.concatenate = tf.keras.layers.Concatenate(axis=1, name='Input_Concatenation')
self.batchnorm = tf.keras.layers.BatchNormalization(name='batchnorm')
self.score_model = tf.keras.Sequential([
tf.keras.layers.Dense(64, activation='leaky_relu'),
tf.keras.layers.Dense(12, activation=activation)
])
self.task = tfrs.tasks.Ranking(
loss=self.loss,
metrics=[
tfr.keras.metrics.NDCGMetric(name="ndcg_metric")
]
)
def __call__(self, features, training=False):
feats = []
for feat, tens in features[0].items():
if feat in self.embeddings:
embedding = self.embedding_layers[feat](tens)
flatten = self.flatten(embedding)
feats.append(flatten)
if feat == 'continuous':
flatten = self.flatten(tens)
feats.append(flatten)
deep_concatenated = self.concatenate(feats)
batchnorm = self.batchnorm(deep_concatenated)
scores = self.score_model(batchnorm)
print("scores: ", scores)
print("mask: ", features[0]['mask'])
masked_scores = tf.boolean_mask(scores, features[0]['mask'])
# pred = tf.expand_dims(masked_scores, axis=1)
# return pred
return tf.expand_dims(masked_scores, axis=1)
def compute_loss(self, features, training=False):
labels = features[1]
# print("labels: ", labels)
# print("mask: ", features[0]['mask'])
masked_labels = tf.boolean_mask(labels, features[0]['mask'])
# print("masked labels:", masked_labels)
masked_labels = tf.expand_dims(masked_labels, axis=1)
print("masked_labels: ", masked_labels)
scores = self(features)
print("scores: ", scores)
print("loss: ", self.task(labels=masked_labels, predictions=scores))
return self.task(
labels=masked_labels,
predictions=scores
)
def train_step(self, inputs):
"""Custom train step using the `compute_loss` method."""
with tf.GradientTape() as tape:
loss = self.compute_loss(inputs)
# Handle regularization losses as well.
regularization_loss = sum(self.losses)
total_loss = loss + regularization_loss
gradients = tape.gradient(total_loss, self.trainable_variables)
self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
metrics = {metric.name: metric.result() for metric in self.metrics}
metrics["loss"] = loss
metrics["regularization_loss"] = regularization_loss
metrics["total_loss"] = total_loss
return metrics
def test_step(self, inputs):
"""Custom test step using the `compute_loss` method."""
loss = self.compute_loss(inputs)
# Handle regularization losses as well.
regularization_loss = sum(self.losses)
total_loss = loss + regularization_loss
metrics = {metric.name: metric.result() for metric in self.metrics}
metrics["loss"] = loss
metrics["regularization_loss"] = regularization_loss
metrics["total_loss"] = total_loss
return metrics
Can anybody see why I am not getting a loss value? Thanks a lot. Please let me know if you need additional info. Maybe I can create some synthetic data so you can run it all yourself. Been pulling my hair out for a few days trying to get this to work so any advice is MUCH appreicated.
I am trying to predict uncertainty in a regression problem using Dropout during testing as per Yarin Gal's article. I created a class using Keras's backend function as provided by this stack overflow question's answer. The class takes a NN model as input and randomly drops neurons during testing to give a stochastic estimate rather than deterministic output for a time-series forecasting.
I create a simple encoder-decoder model as shown below for the forecasting with 0.1 dropout during training:
input_sequence = Input(shape=(lookback, train_x.shape[2]))
encoder = LSTM(128, return_sequences=False)(input_sequence)
r_vec = RepeatVector(forward_pred)(encoder)
decoder = LSTM(128, return_sequences=True, dropout=0.1)(r_vec) #maybe use dropout=0.1
output = TimeDistributed(Dense(train_y.shape[2], activation='linear'))(decoder)
# optimiser = optimizers.Adam(clipnorm=1)
enc_dec_model = Model(input_sequence, output)
enc_dec_model.compile(loss="mean_squared_error",
optimizer="adam",
metrics=['mean_squared_error'])
enc_dec_model.summary()
After that, I define and call the DropoutPrediction class.
# Define the class:
class KerasDropoutPrediction(object):
def __init__(self ,model):
self.f = K.function(
[model.layers[0].input,
K.learning_phase()],
[model.layers[-1].output])
def predict(self ,x, n_iter=10):
result = []
for _ in range(n_iter):
result.append(self.f([x , 1]))
result = np.array(result).reshape(n_iter ,x.shape[0] ,x.shape[1]).T
return result
# Call the object:
kdp = KerasDropoutPrediction(enc_dec_model)
y_pred_do = kdp.predict(x_test,n_iter=100)
y_pred_do_mean = y_pred_do.mean(axis=1)
However, in the line
kdp = KerasDropoutPrediction(enc_dec_model), when I call the LSTM model,
I got the following error message which says the input has to be a Keras Tensor. Can anyone help me with this error?
Error Message:
ValueError: Found unexpected instance while processing input tensors for keras functional model. Expecting KerasTensor which is from tf.keras.Input() or output from keras layer call(). Got: 0
To activate Dropout at inference time, you simply have to specify training=True (TF>2.0) in the layer of interest (in the last LSTM layer in your case)
with training=False
inp = Input(shape=(10, 1))
x = LSTM(1, dropout=0.3)(inp, training=False)
m = Model(inp,x)
# m.compile(...)
# m.fit(...)
X = np.random.uniform(0,1, (1,10,1))
output = []
for i in range(0,100):
output.append(m.predict(X)) # always the same
with training=True
inp = Input(shape=(10, 1))
x = LSTM(1, dropout=0.3)(inp, training=True)
m = Model(inp,x)
# m.compile(...)
# m.fit(...)
X = np.random.uniform(0,1, (1,10,1))
output = []
for i in range(0,100):
output.append(m.predict(X)) # always different
In your example, this becomes:
input_sequence = Input(shape=(lookback, train_x.shape[2]))
encoder = LSTM(128, return_sequences=False)(input_sequence)
r_vec = RepeatVector(forward_pred)(encoder)
decoder = LSTM(128, return_sequences=True, dropout=0.1)(r_vec, training=True)
output = TimeDistributed(Dense(train_y.shape[2], activation='linear'))(decoder)
enc_dec_model = Model(input_sequence, output)
enc_dec_model.compile(
loss="mean_squared_error",
optimizer="adam",
metrics=['mean_squared_error']
)
enc_dec_model.fit(train_x, train_y, epochs=10, batch_size=32)
and the KerasDropoutPrediction:
class KerasDropoutPrediction(object):
def __init__(self, model):
self.model = model
def predict(self, X, n_iter=10):
result = []
for _ in range(n_iter):
result.append(self.model.predict(X))
result = np.array(result)
return result
kdp = KerasDropoutPrediction(enc_dec_model)
y_pred_do = kdp.predict(test_x, n_iter=100)
y_pred_do_mean = y_pred_do.mean(axis=0)
I am trying to train on a colab TPU using data from my GCP account.
When I run the cell that starts the training, the cell just seems to hang, with no progress. I put a very low number of steps, so that the training should complete pretty quickly, about a minute on GPU, but it never finishes on TPU.
I am using a custom model, and I am using files saved on GCP using the solution given in this stackoverflow answer How to connect to private storage bucket using the Google Colab TPU
The model trains/runs just fine on GPU/CPU.
The full code is in this colab notebook here
https://colab.research.google.com/drive/13HgRJru0glOzn7m0b7tmVCO_VrRpa1XS?usp=sharing
And here's a google drive link to the sample data file
https://drive.google.com/file/d/10EFyxau97jLfeGaKugMevIyX-bobsFe5/view?usp=sharing
And below is the code from the colab notebook
!pip install transformers --q
%tensorflow_version 2.x
!gcloud auth login
'''NEED TO RUN THIS CELL TWICE TO AVOID ERROR'''
from google.colab import auth
auth.authenticate_user()
project_id = 'machinelearning-264918'
!gcloud config set project {project_id}
!pip install tfa-nightly
import tensorflow_addons as tfa
from transformers import TFBertModel, AutoModel
import tensorflow as tf
from tensorflow.keras.layers import (Dense,
Dropout)
import os
import tensorflow_addons as tfa
logger = tf.get_logger()
logger.info(tf.__version__)
autotune = tf.data.experimental.AUTOTUNE
try:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.experimental.TPUStrategy(tpu)
logger.info('Running with TPUStrategy on TPU {} with {} cores '
.format(tpu.cluster_spec().as_dict()['worker'],
strategy.num_replicas_in_sync))
batch_size = 3 * strategy.num_replicas_in_sync
except Exception:
# raise ValueError
strategy = tf.distribute.OneDeviceStrategy(device='/gpu:0')
logger.warning('Failed initializing TPU! Running on GPU')
batch_size = 3
from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer as lso
from tensorflow.python.distribute import parameter_server_strategy
def _minimize(strategy, tape, optimizer, loss, trainable_variables):
with tape:
if isinstance(optimizer, lso.LossScaleOptimizer):
loss = optimizer.get_scaled_loss(loss)
gradients = tape.gradient(loss, trainable_variables)
# Whether to aggregate gradients outside of optimizer. This requires support
# of the optimizer and doesn't work with ParameterServerStrategy and
# CentralStroageStrategy.
aggregate_grads_outside_optimizer = (
optimizer._HAS_AGGREGATE_GRAD and # pylint: disable=protected-access
not isinstance(strategy.extended,
parameter_server_strategy.ParameterServerStrategyExtended))
if aggregate_grads_outside_optimizer:
# We aggregate gradients before unscaling them, in case a subclass of
# LossScaleOptimizer all-reduces in fp16. All-reducing in fp16 can only be
# done on scaled gradients, not unscaled gradients, for numeric stability.
gradients = optimizer._aggregate_gradients(zip(gradients, # pylint: disable=protected-access
trainable_variables))
if isinstance(optimizer, lso.LossScaleOptimizer):
gradients = optimizer.get_unscaled_gradients(gradients)
gradients = optimizer._clip_gradients(gradients) # pylint: disable=protected-access
if trainable_variables:
if aggregate_grads_outside_optimizer:
optimizer.apply_gradients(
zip(gradients, trainable_variables),
experimental_aggregate_gradients=False)
else:
optimizer.apply_gradients(zip(gradients, trainable_variables))
class CustomModel(tf.keras.Model):
def train_step(self, data):
# Unpack the data. Its structure depends on your model and
# on what you pass to `fit()`.
x, y = data
batch_label = tf.reshape(y, (tf.size(y)/2, 2), name=None)
rs = tf.ragged.stack(x, axis=0)
reg = rs.to_tensor()
batch_input = tf.reshape(reg, (tf.shape(reg)[0]*tf.shape(reg)[1], tf.shape(reg)[2]))
with tf.GradientTape() as tape:
y_pred = self(batch_input, training=True) # Forward pass
# Compute the loss value
# (the loss function is configured in `compile()`)
loss = self.compiled_loss(batch_label, y_pred, regularization_losses=self.losses)
# Compute gradients
_minimize(self.distribute_strategy, tape, self.optimizer, loss,
self.trainable_variables)
# Update weights
# self.optimizer.apply_gradients(zip(gradients, trainable_vars))
# Update metrics (includes the metric that tracks the loss)
self.compiled_metrics.update_state(y, y_pred)
# Return a dict mapping metric names to current value
return {m.name: m.result() for m in self.metrics}
def get_model(drop_out):
sciBert = TFBertModel.from_pretrained('bert-base-uncased', from_pt=True)
allFinal = tf.keras.Input(shape=(None,), dtype=tf.int32, name='inputN')
'''Should posFinal and negFinal be concatenated, so there's only one call to sciBert'''
allBertOut = sciBert(allFinal, training=True)
allPoolConcat = tf.concat([
allBertOut[0][:, 0], #output of ff layer after last hidden state since it seems to be untrained in roberta
tf.reduce_mean(allBertOut[0][:, 1:-1], axis=1)
],axis=1)
postLayer = tf.keras.layers.Dense(768, activation='swish', name='postff')
LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNormO")
postLayer2 = tf.keras.layers.Dense(768, activation='swish', name='2postff')
classifier = tf.keras.layers.Dense(2, name='classifierff')
postWeights = postLayer(allPoolConcat)
postWeights = LayerNorm(postWeights)
postWeights = Dropout(drop_out)(postWeights)
postWeights2 = postLayer2(postWeights)
allScores = classifier(postWeights2)
model = CustomModel(inputs=allFinal, outputs=allScores)
return model
#tf.function
def _parse_example(example_proto):
features = {
'sciBert_SentenceIndex': tf.io.VarLenFeature( dtype=tf.int64),
'SciBert_IDs': tf.io.VarLenFeature(dtype=tf.int64),
}
parsed_example_dict = tf.io.parse_single_example(example_proto, features)
sentencePositions = parsed_example_dict['sciBert_SentenceIndex']
passageIds = parsed_example_dict['SciBert_IDs']
sentencePositions = tf.sparse.to_dense(sentencePositions)
bertIds = tf.sparse.to_dense(passageIds)
sentencePositions = tf.cast(sentencePositions, dtype=tf.int32)
passageIds = tf.cast(passageIds, dtype=tf.int32)
length = tf.shape(
sentencePositions, out_type=tf.dtypes.int32, name='shape'
)
lengthMinusOne = tf.math.subtract(
length, 1, name='SubtractOne'
)
# creage random numbers for a sentence index up to 2nd to last index
# the last index is just the last position of the non-padded bertID
startRandSentIndex = tf.random.uniform(
shape=[1], minval=0, maxval=lengthMinusOne[0], dtype=tf.dtypes.int32, seed=None, name=None)
# Get the end point for that sentence
endRandSentIndex = tf.math.add(startRandSentIndex, 1, name=None)
# last position of the non-padded bertID
lastPosition = length-1
# extract BertID positions for sentence start/end and bertID end
startSentencePosit = tf.gather_nd(sentencePositions, [startRandSentIndex], batch_dims=0)
endSentencePosit = tf.gather_nd(sentencePositions, [endRandSentIndex], batch_dims=0)
lastPassagePosit = tf.gather_nd(sentencePositions, [lastPosition], batch_dims=0)
# Get slices of BertIDs for the query, and the rest
firstPiece = tf.slice(bertIds, [0], [startSentencePosit[0]] )
queryPiece = tf.slice(bertIds, [startSentencePosit[0]], [endSentencePosit[0]-startSentencePosit[0]] )
lastPiece = tf.slice(bertIds, [endSentencePosit[0]], [lastPassagePosit[0]-endSentencePosit[0]] )
# concat rest of passage
restPassagePiece = tf.concat( [firstPiece,lastPiece], axis=0 )
# Clip
queryPiece = queryPiece[0:256]
restPassagePiece = restPassagePiece[0:510]
# add special tokens for proper input into the model
return tf.cast(queryPiece, dtype=tf.int32), tf.cast(restPassagePiece, dtype=tf.int32)
#tf.function
def clip_seq_to_len(seq, num_tokens=512):
seq_len = tf.shape(seq)[0]
if seq_len > 511:
return seq[:511]
return seq[:]
#tf.function
def make_samples(query_a, passage_a, query_b, passage_b):
CLS_inputID = tf.constant([102])
SEP_inputID = tf.constant([103])
positive_sample_a = clip_seq_to_len(tf.concat([CLS_inputID, query_a, SEP_inputID, passage_a], axis=-1))
positive_sample_b = clip_seq_to_len(tf.concat([CLS_inputID, query_b, SEP_inputID, passage_b], axis=-1))
negative_sample_a = clip_seq_to_len(tf.concat([CLS_inputID, query_a, SEP_inputID, passage_b], axis=-1))
negative_sample_b = clip_seq_to_len(tf.concat([CLS_inputID, query_b, SEP_inputID, passage_a], axis=-1))
positive_sample_a = tf.concat([positive_sample_a, SEP_inputID], axis=-1)
positive_sample_b = tf.concat([positive_sample_b, SEP_inputID], axis=-1)
negative_sample_a = tf.concat([negative_sample_a, SEP_inputID], axis=-1)
negative_sample_b = tf.concat([negative_sample_b, SEP_inputID], axis=-1)
return positive_sample_a, positive_sample_b, negative_sample_a, negative_sample_b
#tf.function
def get_samples(example_a, example_b):
samples = make_samples(*_parse_example(example_a), *_parse_example(example_b))
return samples
config = {
'drop_out':0.1
}
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
with strategy.scope():
model = get_model(**config)
model.compile(loss=loss_fn,
optimizer=tfa.optimizers.AdamW(weight_decay=1e-5, learning_rate=3e-4, epsilon=1e-07), run_eagerly=False)
config_name = 'model_b'
base_dir = 'gs://bdora-semanticscholar'
model_dir = os.path.join(base_dir, config_name)
# tensorboard_dir = os.path.join(model_dir, 'logs_' + str(time()))
tfrecords_pattern_train = os.path.join(base_dir, 'VersionB_00022*')
tfrecords_pattern_train2 = os.path.join(base_dir, 'VersionB_00022*')
#tf.function
def gen():
while True:
yield ([1, 0], [1, 0], [0, 1], [0, 1] )
batchNumber = batch_size
run_eagerly = False
with strategy.scope():
filenames = tf.io.gfile.glob(tfrecords_pattern_train)
train_dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=autotune)
filenames = tf.io.gfile.glob(tfrecords_pattern_train)
neg_dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=autotune)
train_dataset = train_dataset.shuffle(150_000, seed=1000, reshuffle_each_iteration=True)
neg_dataset = neg_dataset.shuffle(150_000, seed=2000, reshuffle_each_iteration=True)
train_datasetC = tf.data.Dataset.zip((train_dataset, neg_dataset))
train_datasetC = train_datasetC.map(get_samples, num_parallel_calls=autotune)
train_datasetC = train_datasetC.shuffle(1024, seed=1000, reshuffle_each_iteration=True)
train_datasetC = train_datasetC.padded_batch(batchNumber, padding_values=(0, 0, 0, 0))
datasetLabels = tf.data.Dataset.from_generator(
gen,
(tf.int32, tf.int32, tf.int32, tf.int32),
(tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([None])))
datasetLabels = datasetLabels.batch(batchNumber)
train_datasetFinal = tf.data.Dataset.zip((train_datasetC, datasetLabels))
train_datasetFinal = train_datasetFinal.prefetch(autotune)
train_datasetFinal = train_datasetFinal.repeat()
train_datasetFinal = train_datasetFinal.apply(tf.data.experimental.ignore_errors())
model.fit(train_datasetFinal, steps_per_epoch=100, epochs=3)
And this is the only output I get
Epoch 1/3
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model/bert/pooler/dense/kernel:0', 'tf_bert_model/bert/pooler/dense/bias:0'] when minimizing the loss.
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model/bert/pooler/dense/kernel:0', 'tf_bert_model/bert/pooler/dense/bias:0'] when minimizing the loss.
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model/bert/pooler/dense/kernel:0', 'tf_bert_model/bert/pooler/dense/bias:0'] when minimizing the loss.
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model/bert/pooler/dense/kernel:0', 'tf_bert_model/bert/pooler/dense/bias:0'] when minimizing the loss.
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model/bert/pooler/dense/kernel:0', 'tf_bert_model/bert/pooler/dense/bias:0'] when minimizing the loss.
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model/bert/pooler/dense/kernel:0', 'tf_bert_model/bert/pooler/dense/bias:0'] when minimizing the loss.
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model/bert/pooler/dense/kernel:0', 'tf_bert_model/bert/pooler/dense/bias:0'] when minimizing the loss.
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model/bert/pooler/dense/kernel:0', 'tf_bert_model/bert/pooler/dense/bias:0'] when minimizing the loss.
I found this GitHub issue discussion [1] that you can refer to.
It's not an error, it just means it's not updating those variables. Those variables (pooler) are not used when doing sequence classification.
[1] https://github.com/tensorflow/tensorflow/issues/37501
I am new to tensorflow and trying to code a toy discriminator problem. The way I have it set up, the loss is calculated from the expert_actions and the novice_actions. However, I am running an error when I am trying to optimize using the computed loss. The error is ValueError: No variables to optimize. I do understand that I am getting the error because there is no feed_dict. However, I do not know the solution to this.
class discriminator:
def __init__(self,n_actions, learning_rate):
self.n_actions = n_actions
self.learning_rate_dist = learning_rate
self.graph = tf.Graph()
with self.graph.as_default():
self.dis_input = tf.placeholder(tf.float32, [None, self.n_actions])
self.discriminator_function()
init = tf.global_variables_initializer()
self.sess = tf.Session(graph=self.graph)
self.sess.run(init)
def discriminator_function(self, hidden = None):
if hidden == None:
hidden = 16
x = tf.layers.dense(self.dis_input,hidden,tf.nn.relu)
x = tf.layers.dense(x,hidden,tf.nn.relu)
self.dis_output = tf.layers.dense(x,1)
def discriminator(self,expert_actions,novice_actions):
expert_out = self.sess.run(self.dis_output,feed_dict={self.dis_input : expert_actions})
novice_out = self.sess.run(self.dis_output,feed_dict={self.dis_input : novice_actions})
loss = tf.reduce_mean(tf.log(expert_out) + tf.log(1.-novice_out))
# update discriminator loss
optimize = tf.train.AdamOptimizer(self.learning_rate_dis).minimize(-loss)
self.sess.run(optimize) #error over here
return loss
if __name__ == '__main__':
d = discriminator(2,0.001)
expert_actions = np.random.randint(2, size=10)
novice_actions = np.random.randint(2, size=10)
d.discriminator(expert_actions,novice_actions)
You are trying to optimize loss = tf.reduce_mean(tf.log(expert_out) + tf.log(1.-novice_out)) with expert_out and novice_out being numpy arrays. There are no variables between the input and loss, to compute gradients.
Your discriminator function should be something like this:
def discriminator(self,expert_actions,novice_actions):
#Make sure you add the new ops and variables to the graph defined.
with self.graph.as_default():
loss = tf.reduce_mean(tf.log('Should be a tensor that is part of the graph and not a numpy array))
optimize = tf.train.AdamOptimizer(0.01).minimize(-loss)
self.sess.run(tf.global_variables_initializer())
#pass the inputs here
loss = self.sess.run([loss, optimize], feed_dict={self.dis_input : expert_actions})
return loss
Inspired from Andrej Karpathy Char-RNN, There is a Tensorflow implementation of char-rnn sherjilozair/char-rnn-tensorflow: Multi-layer Recurrent Neural Networks (LSTM, RNN) for character-level language models in Python using Tensorflow. I want to implement bidirectional character level language model from this code. I change the model.py and wrote a simple code:
class Model:
def __init__(self, input_data, targets, seq_length=Config.max_seq_length, training=True):
if Config.model == 'rnn':
cell_fn = rnn.BasicRNNCell
elif Config.model == 'gru':
cell_fn = rnn.GRUCell
elif Config.model == 'lstm':
cell_fn = rnn.BasicLSTMCell
elif Config.model == 'nas':
cell_fn = rnn.NASCell
else:
raise Exception("model type not supported: {}".format(Config.model))
fw_cells = []
bw_cells = []
for _ in range(Config.num_layers):
fw_cell = cell_fn(Config.rnn_size)
bw_cell = cell_fn(Config.rnn_size)
fw_cells.append(fw_cell)
bw_cells.append(bw_cell)
self.fw_cell = rnn.MultiRNNCell(fw_cells, state_is_tuple=True)
self.bw_cell = rnn.MultiRNNCell(bw_cells, state_is_tuple=True)
self.input_data, self.targets = input_data, targets
with tf.variable_scope('rnnlm'):
softmax_w = tf.get_variable("softmax_w", [Config.rnn_size*2, Config.vocab_size])
softmax_b = tf.get_variable("softmax_b", [Config.vocab_size])
embedding = tf.get_variable("embedding", [Config.vocab_size, Config.rnn_size])
inputs = tf.nn.embedding_lookup(embedding, self.input_data)
inputs = tf.unstack(inputs, num=seq_length, axis=1)
outputs, _, _ = tf.nn.static_bidirectional_rnn(self.fw_cell, self.bw_cell, inputs,
dtype=tf.float32, scope='rnnlm')
output = tf.reshape(tf.concat(outputs, 1), [-1, Config.rnn_size*2])
self.logits = tf.matmul(output, softmax_w) + softmax_b
self.probs = tf.nn.softmax(self.logits)
self.lr = tf.Variable(0.0, trainable=False)
if training:
loss = legacy_seq2seq.sequence_loss_by_example(
[self.logits],
[tf.reshape(self.targets, [-1])],
[tf.sign(tf.cast(tf.reshape(self.targets, [-1]), dtype=tf.float32))])
with tf.name_scope('cost'):
self.cost = tf.reduce_mean(loss)
tvars = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), Config.grad_clip)
with tf.name_scope('optimizer'):
optimizer = tf.train.AdamOptimizer(self.lr)
self.train_op = optimizer.apply_gradients(zip(grads, tvars))
In training phase, I see a fast converge. After near 3000 iteration, the loss reach 0.003. In test phase, the probability of all character is 1.0. I think there is a mistake.
I will so glad to get some help to find my mistake.
use the preceding and following output to predict the prob of current word. In your case, you used current rnn output to predict the prob of current word.
Looks like you set self.lr = tf.Variable(0.0, trainable=False). Try changing this to a nonzero value. If you are reading probabilities from self.probs during the testing phase this should be normalized appropriately,