Inspired by Andrej Karpathy's char-rnn, there is a TensorFlow implementation at sherjilozair/char-rnn-tensorflow: multi-layer recurrent neural networks (LSTM, RNN) for character-level language models in Python using TensorFlow. I want to implement a bidirectional character-level language model based on this code. I changed model.py and wrote a simplified version:
import tensorflow as tf
from tensorflow.contrib import rnn, legacy_seq2seq

class Model:
    def __init__(self, input_data, targets, seq_length=Config.max_seq_length, training=True):
        if Config.model == 'rnn':
            cell_fn = rnn.BasicRNNCell
        elif Config.model == 'gru':
            cell_fn = rnn.GRUCell
        elif Config.model == 'lstm':
            cell_fn = rnn.BasicLSTMCell
        elif Config.model == 'nas':
            cell_fn = rnn.NASCell
        else:
            raise Exception("model type not supported: {}".format(Config.model))

        fw_cells = []
        bw_cells = []
        for _ in range(Config.num_layers):
            fw_cells.append(cell_fn(Config.rnn_size))
            bw_cells.append(cell_fn(Config.rnn_size))
        self.fw_cell = rnn.MultiRNNCell(fw_cells, state_is_tuple=True)
        self.bw_cell = rnn.MultiRNNCell(bw_cells, state_is_tuple=True)

        self.input_data, self.targets = input_data, targets

        with tf.variable_scope('rnnlm'):
            softmax_w = tf.get_variable("softmax_w", [Config.rnn_size * 2, Config.vocab_size])
            softmax_b = tf.get_variable("softmax_b", [Config.vocab_size])
            embedding = tf.get_variable("embedding", [Config.vocab_size, Config.rnn_size])
            inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        inputs = tf.unstack(inputs, num=seq_length, axis=1)
        outputs, _, _ = tf.nn.static_bidirectional_rnn(self.fw_cell, self.bw_cell, inputs,
                                                       dtype=tf.float32, scope='rnnlm')
        output = tf.reshape(tf.concat(outputs, 1), [-1, Config.rnn_size * 2])

        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)
        self.lr = tf.Variable(0.0, trainable=False)

        if training:
            loss = legacy_seq2seq.sequence_loss_by_example(
                [self.logits],
                [tf.reshape(self.targets, [-1])],
                [tf.sign(tf.cast(tf.reshape(self.targets, [-1]), dtype=tf.float32))])
            with tf.name_scope('cost'):
                self.cost = tf.reduce_mean(loss)
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), Config.grad_clip)
            with tf.name_scope('optimizer'):
                optimizer = tf.train.AdamOptimizer(self.lr)
            self.train_op = optimizer.apply_gradients(zip(grads, tvars))
During training I see very fast convergence: after about 3000 iterations the loss reaches 0.003. At test time, the predicted probability of every character is 1.0, so I think there is a mistake somewhere.
I would be glad to get some help finding my mistake.
You have to use the outputs of the preceding and following positions to predict the probability of the current character. In your case, you used the RNN output at the current position itself; since the backward direction has already read the current character, the network only has to copy it, which is why the loss collapses and every probability saturates at 1.0.
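A minimal sketch of that idea (not the original repo's code), assuming batch-major embedded inputs and tf.nn.bidirectional_dynamic_rnn instead of the unstacked static variant above: predict character t from the forward output at t-1 and the backward output at t+1, so neither direction has seen the target character.

# inputs: embedded tensor of shape [batch, seq_length, rnn_size] (before tf.unstack)
(fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(
    self.fw_cell, self.bw_cell, inputs, dtype=tf.float32, scope='rnnlm')
pad = tf.zeros_like(fw_out[:, :1, :])
fw_context = tf.concat([pad, fw_out[:, :-1, :]], axis=1)  # forward output of step t-1
bw_context = tf.concat([bw_out[:, 1:, :], pad], axis=1)   # backward output of step t+1
context = tf.concat([fw_context, bw_context], axis=2)     # [batch, seq_length, 2*rnn_size]
output = tf.reshape(context, [-1, Config.rnn_size * 2])
self.logits = tf.matmul(output, softmax_w) + softmax_b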
Looks like you set self.lr = tf.Variable(0.0, trainable=False). Try changing this to a nonzero value. If you are reading probabilities from self.probs during the testing phase, they should then be normalized appropriately.
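In the original char-rnn-tensorflow repo the learning rate is assigned from train.py rather than left at 0.0; a minimal sketch of assigning it explicitly (the 0.002 value and the sess handle are only illustrative):

# assign a nonzero learning rate to the non-trainable variable before training
sess.run(tf.assign(model.lr, 0.002))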
I'm implementing a physics-informed neural network (PINN) model to solve the Navier-Stokes equations, as in PINN. This type of model works better with L-BFGS-B, and the best optimizer for my case is fmin_l_bfgs_b from SciPy.
The problem with this optimizer is that it does not work directly with the TensorFlow library. To make it work with TensorFlow, I implemented a class L_BFGS_B with the following methods:
set_weights: set the weights of the model.
evaluate: evaluate loss and gradients as ndarrays.
tf_evaluate: evaluate loss and gradients as tf.Tensors.
fit: train the model.
Everything works fine. The optimizer trains all the weights of the model, but the problem is that I only want to train two out of the 18 trainable variables.
**Optimizer class**
import numpy as np
import tensorflow as tf
from scipy.optimize import fmin_l_bfgs_b

class L_BFGS_B:
    def __init__(self, model, x_train, y_train, factr=1, m=50, maxls=50, maxfun=50000, maxiter=50000):
        self.model = model
        # x_train = xyt, y_train = uv
        self.x_train = x_train  # tf.constant(x_train, dtype=tf.float32)
        self.y_train = y_train  # tf.constant(y_train, dtype=tf.float32)
        # tolerance that decides when the iterations stop
        self.factr = factr
        # the maximum number of variable metric corrections used
        self.m = m
        # max number of line search steps per iteration
        # in this case 50 per iteration
        self.maxls = maxls
        # max number of iterations
        self.maxiter = maxiter
        self.maxfun = maxfun

    # tf.function
    def tf_evaluate(self, x, y):
        """
        Evaluate loss and gradients for weights as tf.Tensor.
        Args:
            x: input data.
            y: target data.
        Returns:
            loss and gradients for weights as tf.Tensor.
        """
        # where x = xyt, y = uv
        with tf.GradientTape() as g:
            uv_fuv = self.model([x, y])
            loss = self.model.losses[0]
        grads = g.gradient(loss, self.model.trainable_variables,
                           unconnected_gradients=tf.UnconnectedGradients.ZERO)
        return loss, grads

    def set_weights(self, flat_weights):
        """
        Set weights to the model.
        Args:
            flat_weights: flattened weights.
        """
        weights_shapes = [w.shape for w in self.model.get_weights()]
        n = [0] + [np.prod(shape) for shape in weights_shapes]
        partition = np.cumsum(n)
        weights = [flat_weights[from_part:to_part].reshape(shape)
                   for from_part, to_part, shape
                   in zip(partition[:-1], partition[1:], weights_shapes)]
        self.model.set_weights(weights)

    def evaluate(self, flat_weights):
        """
        Evaluate loss and gradients for weights as ndarray.
        Args:
            flat_weights: flattened weights.
        Returns:
            loss and gradients for weights as ndarray.
        """
        self.set_weights(flat_weights)
        loss, grads = self.tf_evaluate(self.x_train, self.y_train)
        loss = loss.numpy().astype('float64')
        grads = np.concatenate([g.numpy().flatten() for g in grads]).astype('float64')
        # printest('loss', loss)
        return loss, grads

    def fit(self):
        """
        Train the model using the L-BFGS-B algorithm.
        """
        # flatten the initial weights
        initial_weights = np.concatenate([w.flatten() for w in self.model.get_weights()])
        # optimizer
        fmin_l_bfgs_b(func=self.evaluate, x0=initial_weights,
                      factr=self.factr, m=self.m,
                      maxls=self.maxls, maxiter=self.maxiter,
                      maxfun=self.maxfun)
if __name__ == "__main__":
    ...
    # load data
    ...
    indices = np.random.choice(N * T, n_train, replace=False)
    xyt_train = tf.concat((x_1d[indices], y_1d[indices], t_1d[indices]), axis=1)
    uv_train = tf.concat((u_1d[indices], v_1d[indices]), axis=1)

    # model
    nn_model = NeuralNet().build()
    pinn_model = PhysicsInformedNN(model=nn_model).build()

    # optimizer
    lbfgs = L_BFGS_B(model=pinn_model, x_train=xyt_train, y_train=uv_train)
    lbfgs.fit()
Attempt
Use args in fmin_l_bfgs_b, where args is given the trainable variables that I want to keep fixed and **x0** the initial two variables to be minimized. The following code is only a sanity test to see whether passing the weights in this way works.
def evaluate(self, weights_var, *args):
    weights = np.append(weights_var, args)
    self.set_weights(weights)
    loss, grads = self.tf_evaluate(self.x_train, self.y_train)
    loss = loss.numpy().astype('float64')
    grads = np.concatenate([g.numpy().flatten() for g in grads]).astype('float64')
    # printest('loss', loss)
    return loss, grads

def fit(self):
    """
    Train the model using the L-BFGS-B algorithm.
    """
    # flatten the initial weights
    weights_fixed = np.concatenate([w.flatten() for w in self.model.get_weights()[2:]])
    weights_var = np.concatenate([w.flatten() for w in self.model.get_weights()[0:2]])
    # optimizer
    fmin_l_bfgs_b(func=self.evaluate, x0=weights_var, args=(weights_fixed,),
                  factr=self.factr, m=self.m,
                  maxls=self.maxls, maxiter=self.maxiter,
                  maxfun=self.maxfun)
Unfortunately, the following error is raised: 0-th dimension must be fixed to 2 but got 2644.
Question: Is there a way to keep the trainable variables that I do not want to minimize fixed, optimize only the ones that are free, and at the end set them back into the neural network model, while using this type of optimizer?
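One way to sketch this (not tested against the model above, and assuming the two tensors to optimize are model.get_weights()[0:2] and that they correspond to the first two entries of model.trainable_variables): keep the fixed tensors untouched inside evaluate, hand L-BFGS-B only the flattened free part, and return only the gradients of that part.

def evaluate(self, flat_var_weights):
    all_weights = self.model.get_weights()
    shapes = [w.shape for w in all_weights[0:2]]
    sizes = [int(np.prod(s)) for s in shapes]
    split = np.cumsum([0] + sizes)
    new_vars = [flat_var_weights[a:b].reshape(s)
                for a, b, s in zip(split[:-1], split[1:], shapes)]
    # only the first two tensors are replaced; the rest stay fixed
    self.model.set_weights(new_vars + all_weights[2:])
    loss, grads = self.tf_evaluate(self.x_train, self.y_train)
    # return gradients only for the free tensors (assumes trainable_variables
    # and get_weights are ordered the same way for the first two entries)
    flat_grads = np.concatenate([g.numpy().flatten() for g in grads[0:2]]).astype('float64')
    return loss.numpy().astype('float64'), flat_grads

def fit(self):
    x0 = np.concatenate([w.flatten() for w in self.model.get_weights()[0:2]])
    fmin_l_bfgs_b(func=self.evaluate, x0=x0, factr=self.factr, m=self.m,
                  maxls=self.maxls, maxiter=self.maxiter, maxfun=self.maxfun)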
I have made a ranking model using tensorflow_ranking losses and metrics, but ListMLELoss() is always 0. The model trains and completes, but I imagine no learning is actually happening since the loss is never anything but zero. I tried to follow this guide, https://www.tensorflow.org/recommenders/examples/listwise_ranking, as well as I could, but there are some differences in use cases, so my code is a bit different. I am not sure why model.fit() runs and I get an NDCG value, but clearly the model cannot be learning if no loss value is being computed.
Here is my ranking model class:
import tensorflow as tf
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs

class RankingModel(tf.keras.Model):
    def __init__(self, embeddings, vocab_size_dict, dim_dict, loss, activation='sigmoid'):
        super().__init__()
        self.embeddings = embeddings
        self.embedding_layers = {}
        self.vocab_size_dict = vocab_size_dict
        self.dim_dict = dim_dict
        self.activation = activation
        self.loss = loss

        self.embedding_layers['feature_one'] = tf.keras.layers.Embedding(
            self.vocab_size_dict['feature_one'] + 1,
            self.dim_dict['feature_one'],
            name='embedded_feature_one')
        self.embedding_layers['feature_two'] = tf.keras.layers.Embedding(
            self.vocab_size_dict['feature_two'] + 1,
            self.dim_dict['feature_two'],
            name='embedded_feature_two')
        self.embedding_layers['feature_three'] = tf.keras.layers.Embedding(
            self.vocab_size_dict['feature_three'] + 1,
            self.dim_dict['feature_three'],
            name='embedded_feature_three')
        self.embedding_layers['feature_four'] = tf.keras.layers.Embedding(
            self.vocab_size_dict['feature_four'] + 1,
            self.dim_dict['feature_four'],
            name='embedded_feature_four')
        self.embedding_layers['feature_five'] = tf.keras.layers.Embedding(
            self.vocab_size_dict['feature_five'] + 1,
            self.dim_dict['feature_five'],
            name='embedded_feature_five')

        self.flatten = tf.keras.layers.Flatten()
        self.concatenate = tf.keras.layers.Concatenate(axis=1, name='Input_Concatenation')
        self.batchnorm = tf.keras.layers.BatchNormalization(name='batchnorm')
        self.score_model = tf.keras.Sequential([
            tf.keras.layers.Dense(64, activation='leaky_relu'),
            tf.keras.layers.Dense(12, activation=activation)
        ])
        self.task = tfrs.tasks.Ranking(
            loss=self.loss,
            metrics=[
                tfr.keras.metrics.NDCGMetric(name="ndcg_metric")
            ]
        )

    def __call__(self, features, training=False):
        feats = []
        for feat, tens in features[0].items():
            if feat in self.embeddings:
                embedding = self.embedding_layers[feat](tens)
                flatten = self.flatten(embedding)
                feats.append(flatten)
            if feat == 'continuous':
                flatten = self.flatten(tens)
                feats.append(flatten)
        deep_concatenated = self.concatenate(feats)
        batchnorm = self.batchnorm(deep_concatenated)
        scores = self.score_model(batchnorm)
        print("scores: ", scores)
        print("mask: ", features[0]['mask'])
        masked_scores = tf.boolean_mask(scores, features[0]['mask'])
        # pred = tf.expand_dims(masked_scores, axis=1)
        # return pred
        return tf.expand_dims(masked_scores, axis=1)

    def compute_loss(self, features, training=False):
        labels = features[1]
        # print("labels: ", labels)
        # print("mask: ", features[0]['mask'])
        masked_labels = tf.boolean_mask(labels, features[0]['mask'])
        # print("masked labels:", masked_labels)
        masked_labels = tf.expand_dims(masked_labels, axis=1)
        print("masked_labels: ", masked_labels)
        scores = self(features)
        print("scores: ", scores)
        print("loss: ", self.task(labels=masked_labels, predictions=scores))
        return self.task(
            labels=masked_labels,
            predictions=scores
        )

    def train_step(self, inputs):
        """Custom train step using the `compute_loss` method."""
        with tf.GradientTape() as tape:
            loss = self.compute_loss(inputs)
            # Handle regularization losses as well.
            regularization_loss = sum(self.losses)
            total_loss = loss + regularization_loss
        gradients = tape.gradient(total_loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        metrics = {metric.name: metric.result() for metric in self.metrics}
        metrics["loss"] = loss
        metrics["regularization_loss"] = regularization_loss
        metrics["total_loss"] = total_loss
        return metrics

    def test_step(self, inputs):
        """Custom test step using the `compute_loss` method."""
        loss = self.compute_loss(inputs)
        # Handle regularization losses as well.
        regularization_loss = sum(self.losses)
        total_loss = loss + regularization_loss
        metrics = {metric.name: metric.result() for metric in self.metrics}
        metrics["loss"] = loss
        metrics["regularization_loss"] = regularization_loss
        metrics["total_loss"] = total_loss
        return metrics
Can anybody see why I am not getting a loss value? Thanks a lot. Please let me know if you need additional info; maybe I can create some synthetic data so you can run it all yourself. I've been pulling my hair out for a few days trying to get this to work, so any advice is MUCH appreciated.
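One thing that may be worth checking (I cannot run your data, so this is only a guess): tf.boolean_mask followed by tf.expand_dims(..., axis=1) gives scores and labels of shape [n, 1], which a listwise loss interprets as n separate lists of length one, and ListMLE is identically zero on single-item lists. A small self-contained sketch that reproduces this behaviour:

import tensorflow as tf
import tensorflow_ranking as tfr

loss_fn = tfr.keras.losses.ListMLELoss()

labels = tf.constant([[3.0], [1.0], [2.0]])   # shape [3, 1]: three lists of length 1
scores = tf.constant([[0.2], [0.9], [0.5]])
print(loss_fn(labels, scores).numpy())        # 0.0, regardless of the scores

# the same items arranged as one list of length 3 (shape [1, 3]) give a nonzero loss
print(loss_fn(tf.transpose(labels), tf.transpose(scores)).numpy())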
I am trying to predict uncertainty in a regression problem using Dropout during testing, as per Yarin Gal's article. I created a class using Keras's backend function as provided by this Stack Overflow question's answer. The class takes a NN model as input and randomly drops neurons during testing to give a stochastic estimate rather than a deterministic output for time-series forecasting.
I create a simple encoder-decoder model as shown below for the forecasting with 0.1 dropout during training:
import numpy as np
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense
from tensorflow.keras.models import Model

input_sequence = Input(shape=(lookback, train_x.shape[2]))
encoder = LSTM(128, return_sequences=False)(input_sequence)
r_vec = RepeatVector(forward_pred)(encoder)
decoder = LSTM(128, return_sequences=True, dropout=0.1)(r_vec)  # maybe use dropout=0.1
output = TimeDistributed(Dense(train_y.shape[2], activation='linear'))(decoder)

# optimiser = optimizers.Adam(clipnorm=1)
enc_dec_model = Model(input_sequence, output)
enc_dec_model.compile(loss="mean_squared_error",
                      optimizer="adam",
                      metrics=['mean_squared_error'])
enc_dec_model.summary()
After that, I define and call the DropoutPrediction class.
# Define the class:
class KerasDropoutPrediction(object):
    def __init__(self, model):
        self.f = K.function(
            [model.layers[0].input,
             K.learning_phase()],
            [model.layers[-1].output])

    def predict(self, x, n_iter=10):
        result = []
        for _ in range(n_iter):
            result.append(self.f([x, 1]))
        result = np.array(result).reshape(n_iter, x.shape[0], x.shape[1]).T
        return result

# Call the object:
kdp = KerasDropoutPrediction(enc_dec_model)
y_pred_do = kdp.predict(x_test, n_iter=100)
y_pred_do_mean = y_pred_do.mean(axis=1)
However, on the line kdp = KerasDropoutPrediction(enc_dec_model), when I call the LSTM model, I get the following error message, which says the input has to be a Keras tensor. Can anyone help me with this error?
Error Message:
ValueError: Found unexpected instance while processing input tensors for keras functional model. Expecting KerasTensor which is from tf.keras.Input() or output from keras layer call(). Got: 0
To activate Dropout at inference time, you simply have to specify training=True (TF > 2.0) in the layer of interest (the last LSTM layer in your case).
With training=False:
inp = Input(shape=(10, 1))
x = LSTM(1, dropout=0.3)(inp, training=False)
m = Model(inp, x)
# m.compile(...)
# m.fit(...)

X = np.random.uniform(0, 1, (1, 10, 1))
output = []
for i in range(0, 100):
    output.append(m.predict(X))  # always the same
With training=True:
inp = Input(shape=(10, 1))
x = LSTM(1, dropout=0.3)(inp, training=True)
m = Model(inp, x)
# m.compile(...)
# m.fit(...)

X = np.random.uniform(0, 1, (1, 10, 1))
output = []
for i in range(0, 100):
    output.append(m.predict(X))  # always different
In your example, this becomes:
input_sequence = Input(shape=(lookback, train_x.shape[2]))
encoder = LSTM(128, return_sequences=False)(input_sequence)
r_vec = RepeatVector(forward_pred)(encoder)
decoder = LSTM(128, return_sequences=True, dropout=0.1)(r_vec, training=True)
output = TimeDistributed(Dense(train_y.shape[2], activation='linear'))(decoder)

enc_dec_model = Model(input_sequence, output)
enc_dec_model.compile(
    loss="mean_squared_error",
    optimizer="adam",
    metrics=['mean_squared_error']
)
enc_dec_model.fit(train_x, train_y, epochs=10, batch_size=32)
and the KerasDropoutPrediction:
class KerasDropoutPrediction(object):
    def __init__(self, model):
        self.model = model

    def predict(self, X, n_iter=10):
        result = []
        for _ in range(n_iter):
            result.append(self.model.predict(X))
        result = np.array(result)
        return result

kdp = KerasDropoutPrediction(enc_dec_model)
y_pred_do = kdp.predict(test_x, n_iter=100)
y_pred_do_mean = y_pred_do.mean(axis=0)
I am trying to implement a perceptual loss function in TensorFlow, and here is the feature (content) loss part:
loss_model = tf.keras.models.Sequential()
for eachLayer in base_model.layers[:12]:
    eachLayer.trainable = False
    loss_model.add(eachLayer)

def meanSquaredLoss(y_true, y_pred):
    return tf.reduce_mean(tf.keras.losses.MSE(y_true, y_pred))

def featureLoss(image):
    predicted_image = model(image, training=False)
    activatedModelVal = loss_model(predicted_image, training=False)
    actualModelVal = loss_model(image, training=False)
    return meanSquaredLoss(actualModelVal, activatedModelVal)
Here is the style loss function:
def gram_matrix(input_tensor):
    result = tf.linalg.einsum('bijc,bijd->bcd', input_tensor, input_tensor)
    input_shape = tf.shape(input_tensor)
    num_locations = tf.cast(input_shape[1] * input_shape[2], tf.float32)
    return result / num_locations

def styleLoss(image):
    predicted_image = model(image, training=False)
    activatedModelVal = loss_model(predicted_image, training=False)
    actualModelVal = loss_model(image, training=False)
    return meanSquaredLoss(gram_matrix(actualModelVal), gram_matrix(activatedModelVal))
So now I have both losses, and here is what I have done for the optimization:
opt = tf.keras.optimizers.Adam(0.02)

def each_train_step(image, showImage=False):
    predicted_image = model(image, training=False)
    loss = tf.reduce_sum(featureLoss(predicted_image, image) + styleLoss(predicted_image, image))
    with tf.GradientTape() as tape:
        grad = tape.gradient(loss, model.trainable_variables)
        print(grad)
        # opt.apply_gradients(zip(grad, model.trainable_variables))
    if showImage:
        plt.imshow(predicted_image)
The problem is that grad ends up as a list of None values and I don't know WHY! Why is the gradient returning a list of None? Any solution to get the actual gradients?
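For reference, a minimal sketch of a fix (under the assumption that model, featureLoss and styleLoss are defined as above, each loss taking a single image argument): the forward pass and the loss have to be computed inside the GradientTape context, otherwise nothing is recorded and tape.gradient returns None for every variable.

opt = tf.keras.optimizers.Adam(0.02)

def each_train_step(image):
    with tf.GradientTape() as tape:
        # the forward passes inside featureLoss/styleLoss are recorded here
        loss = featureLoss(image) + styleLoss(image)
    grads = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))
    return loss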
I'm trying to build an A3C implementation in Keras. I have experience working with Keras, but absolutely no experience working with TensorFlow, so I would really appreciate it if someone could keep it as simple as possible, since I want to finish quickly without diving too deep into TensorFlow.
self.session = tf.Session()
K.set_session(self.session)
K.manual_variable_initialization(True)
self.stop_signal = False
self.model = self._build_model()
self.graph = self._build_graph(self.model)
self.session.run(tf.global_variables_initializer())
self.default_graph = tf.get_default_graph()
self.default_graph.finalize() # avoid modifications
def _build_model(self):
    l_input = Input(batch_shape=(None, NUM_STATE))
    input_layer = Reshape((1, -1))(l_input)
    lstm = LSTM(64, activation='relu', return_sequences=True)(input_layer)
    lstm = LSTM(128, activation='relu', return_sequences=True)(lstm)
    lstm = LSTM(128, activation='relu')(lstm)
    out_actions = Dense(NUM_ACTIONS, activation='softmax')(lstm)
    out_value = Dense(1, activation='linear')(lstm)

    model = Model(inputs=[l_input], outputs=[out_actions, out_value])
    model._make_predict_function()  # have to initialize before threading
    return model

def _build_graph(self, model):
    s_t = tf.placeholder(tf.float32, shape=(None, NUM_STATE))
    a_t = tf.placeholder(tf.float32, shape=(None, NUM_ACTIONS))
    r_t = tf.placeholder(tf.float32, shape=(None, 1))

    p, v = model(s_t)

    log_prob = tf.log(tf.reduce_sum(p * a_t, axis=1, keepdims=True) + 1e-10)
    advantage = r_t - v

    loss_policy = -log_prob * tf.stop_gradient(advantage)
    loss_value = LOSS_V * tf.square(advantage)
    entropy = LOSS_ENTROPY * tf.reduce_sum(p * tf.log(p + 1e-10), axis=1, keepdims=True)
    loss_total = tf.reduce_mean(loss_policy + loss_value + entropy)

    optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
    minimize = optimizer.minimize(loss_total)

    return s_t, a_t, r_t, minimize
Then it is being trained:
s_t, a_t, r_t, minimize = self.graph
self.session.run(minimize, feed_dict={s_t: s, a_t: a, r_t: r})
Predictions are done this way:
with self.default_graph.as_default():
    p, v = self.model.predict(s)
So I want to update my Keras model weights using these gradients after I finish training, in order to save the model with model.save('path.h5'). Pseudo code:
model_weights = model.trainable_weights
model_weights = apply_gradients(grads, model_weights)
model = model.set_weights(model_weights)
model.save('path.h5')
The code was taken from here with little changes: https://github.com/jaara/AI-blog/blob/master/CartPole-A3C.py
I found something on this topic but can't really figure out how to actually use it.
https://github.com/keras-team/keras/issues/3062
https://github.com/keras-team/keras/issues/3069
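A sketch of a possible answer (not tested on this exact code): because the Keras layers are called on the placeholder via model(s_t) and share the session set with K.set_session, the minimize op already updates the variables that back the Keras model, so in principle no separate apply-gradients step is needed; saving after the training loop should pick up the trained weights.

# after the training loop, the Keras model's variables already hold the
# values updated by the minimize op, so the model can be saved directly
with self.default_graph.as_default():
    self.model.save('path.h5')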
It turns out the problem has to do with the algorithm not converging properly. Does anyone know what I can do to make it converge? I'm using a custom environment; I trained a DQN on this environment in the past and it successfully converged. I also implemented a target model, which I update every 300 steps (or 1 episode in my case).