Related
I am trying to create an end-to-end trainable offline English Handwriting Recognition Model (without segmenting individual character). I am using the word dataset from IAM Handwriting Database for training.
I tried decreasing the learning rate, increasing batch size, etc. but the loss keeps on fluctuating with no/significant overall decrease - TensorBoard visualization for cost at each step
I am new to TensorFlow so could have made some naive error. The code used:
class CRNN(object):
def __init__(self, config):
self.config = config
tf.reset_default_graph()
def read_and_decode(self, filename_queue):
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
# Define how to parse the example
context_features = {
'length': tf.FixedLenFeature([], dtype=tf.int64),
'out_length': tf.FixedLenFeature([], dtype=tf.int64)
}
sequence_features = {
'token': tf.FixedLenSequenceFeature([], dtype=tf.float32),
'labels': tf.FixedLenSequenceFeature([], dtype=tf.int64)
}
context_parsed, sequence_parsed = tf.parse_single_sequence_example(
serialized=serialized_example,
context_features=context_features,
sequence_features=sequence_features)
image = sequence_parsed['token']
label = tf.cast(sequence_parsed['labels'], tf.int32)
length = tf.cast(context_parsed['length'], tf.int32)
lab_length = tf.cast(context_parsed['out_length'], tf.int32)
image_shape = tf.cast(tf.stack([self.config.im_height,
length/self.config.im_height]), tf.int32)
image = tf.reshape(image, image_shape)
# Updating length to represent image width
length = tf.shape(image)[1]
# Batch the variable length tensor with dynamic padding
self.images, self.labels, self.lengths, self.lab_lengths = tf.train.batch(
tensors=[image, label, length, lab_length],
batch_size=self.config.batch_size, dynamic_pad=True)
def net(self):
batch_lab_length = tf.reduce_max(self.lab_lengths)
batch_im_length = tf.reduce_max(self.lengths)
# Reshape to time major
sequences = tf.reshape(self.images, [batch_im_length, self.config.batch_size,
self.config.im_height])
# Feed sequences into RNN
with tf.name_scope('RNN'):
self.cell_fw = tf.nn.rnn_cell.LSTMCell(num_units=self.config.rnn_num_hidden,
state_is_tuple=True)
self.cell_bw = tf.nn.rnn_cell.LSTMCell(num_units=self.config.rnn_num_hidden,
state_is_tuple=True)
self.output, self.state = tf.nn.bidirectional_dynamic_rnn(
cell_fw=self.cell_fw,
cell_bw=self.cell_bw,
inputs=sequences,
dtype=tf.float32,
sequence_length=self.lengths,
time_major=True,
scope='RNN'
)
# Reshaping to apply the same weights over the timesteps
self.output = tf.reshape(self.output, [-1, self.config.rnn_num_hidden])
self.out_W = tf.Variable(tf.truncated_normal([self.config.rnn_num_hidden,
self.config.num_classes],
stddev=0.1), name='out_W')
self.out_b = tf.Variable(tf.constant(0., shape=[self.config.num_classes]), name='out_b')
# Doing the affine projection
logits = tf.matmul(self.output, self.out_W) + self.out_b
# Reshaping back to the original shape
logits = tf.reshape(logits, [self.config.batch_size, -1, self.config.num_classes])
# Time major
logits = tf.transpose(logits, (1, 0, 2))
# Training computation
# Prepare sparse tensor for CTC loss
labs = tf.reshape(self.labels, (self.config.batch_size, batch_lab_length))
sparse_tensor_indices = tf.where(tf.less(tf.cast(0, tf.int32), labs))
labels_vals = tf.reshape(self.labels, [batch_lab_length*self.config.batch_size])
mask = tf.cast(tf.sign(labels_vals), dtype=tf.bool)
labels_vals = tf.boolean_mask(labels_vals,mask)
labels_sparse = tf.SparseTensor(indices=sparse_tensor_indices, values=labels_vals,
dense_shape=[self.config.batch_size,
tf.cast(batch_lab_length, tf.int64)])
self.loss = tf.nn.ctc_loss(labels_sparse, logits, sequence_length=self.lab_lengths,
preprocess_collapse_repeated=False, ctc_merge_repeated=False,
time_major=True)
self.cost = tf.reduce_mean(self.loss)
# Optimizer
self.optimizer = tf.train.MomentumOptimizer(learning_rate=0.01,
momentum=0.9, use_nesterov=True).minimize(self.cost)
# Predictions for the training, validation, and test data.
self.train_prediction = tf.nn.ctc_beam_search_decoder(logits,
sequence_length=self.lab_lengths)
def train(self):
num_steps = int((self.config.num_epochs*self.config.sample_size)/self.config.batch_size)
tf.reset_default_graph()
filename_queue = tf.train.string_input_producer(
[self.config.tfrecord_filename], num_epochs=self.config.num_epochs)
self.read_and_decode(filename_queue)
self.net()
# The op for initializing the variables.
init_op = tf.group(tf.global_variables_initializer(),
tf.local_variables_initializer())
saver = tf.train.Saver()
with tf.Session() as sess:
training_summary = tf.summary.scalar("training_cost", self.cost)
writer = tf.summary.FileWriter("./TensorBoard/graph", sess.graph)
sess.run(init_op)
print('Initialized')
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
start = time.time()
steps_time = start
epoch = 1
for step in range(num_steps):
_, c, predictions, actual_labels, train_summ = sess.run([self.optimizer, self.cost,
self.train_prediction,
self.labels, training_summary])
writer.add_summary(train_summ, step)
if (step % 10000 == 0):
preds = np.zeros((predictions[0][0].dense_shape))
i = 0
for idx in predictions[0][0].indices:
preds[idx[0]][idx[1]] = predictions[0][0].values[i]
i+=1
print(time.time() - steps_time)
steps_time = time.time()
print('Minibatch cost at step %d: %f' % (step, c))
print('Label =', [''.join([char_map_inv[j] for j in i]) for i in actual_labels],
'Prediction =', [''.join([char_map_inv[j] for j in i]) for i in preds])
if (step!=0 and step % int(self.config.sample_size/self.config.batch_size) == 0):
print('Epoch', epoch, 'Completed')
epoch+=1
last_step = step
saver.save(sess, "model_BLSTM", global_step=last_step)
writer.close()
print(time.time() - start)
After trying a lot of things unsuccessfully, I found that an incorrect argument was provided to the sequence_length argument of tf.nn.ctc_loss. It should be set to 'length of input sequence' but I had set it to 'length of output sequence(labels - number of character)'
More details can be found in comments under the selected answer to this question - CTC Loss InvalidArgumentError: sequence_length(b) <= time
Also, if one has a GPU it would be better to use Baidu's CTC GPU implementation (https://github.com/baidu-research/warp-ctc) as it can speed up the training a lot.
The problem is that you are feeding raw images in the LSTM, so it is very difficult for it to extract any useful information. The CRNN paper first uses a series of convolutional layers to extract features from the images, and then these are fed into the LSTM.
I created a simple TensorFlow program that tries to predict the next character using the previous 3 characters in a body of text.
A single input could look like this:
np.array(['t','h','i'])
with the target about being
np.array(['s'])
I'm trying to expand this to output the next say 4 character rather than just the next character. To do this I tried feeding in a longer array to y
np.array(['s','','i'])
In addition to changing the y to
y = tf.placeholder(dtype=tf.int32, shape=[None, n_steps])
however, this yields the error:
Rank mismatch: Rank of labels (received 2) should equal rank of logits
minus 1 (received 2).
Here's the full code
embedding_size=40
n_neurons = 200
n_output = vocab_size
learning_rate = 0.001
with tf.Graph().as_default():
x = tf.placeholder(dtype=tf.int32, shape=[None, n_steps])
y = tf.placeholder(dtype=tf.int32, shape=[None])
seq_length = tf.placeholder(tf.int32, [None])
# Let's set up the embedding converting words to vectors
embeddings = tf.Variable(tf.random_uniform(shape=[vocab_size, embedding_size], minval=-1, maxval=1))
train_input = tf.nn.embedding_lookup(embeddings, x)
basic_cell = tf.nn.rnn_cell.GRUCell(num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(basic_cell, train_input, sequence_length=seq_length, dtype=tf.float32)
logits = tf.layers.dense(states, units=vocab_size, activation=None)
predictions = tf.nn.softmax(logits)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=y,
logits=logits)
loss = tf.reduce_mean(xentropy)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for r in range(1000):
x_batch, y_batch, seq_length_batch = input_fn()
feed_dict = {x: x_batch, y: y_batch, seq_length: seq_length_batch}
_, loss_out = sess.run([training_op, loss], feed_dict=feed_dict)
if r % 1000 == 0:
print("loss_out", loss_out)
sample_text = "for th"
sample_text_ids = np.expand_dims(np.array([w_to_id[c] for c in sample_text]+[0, 0], dtype=np.int32), 0)
prediction_out = sess.run(predictions, feed_dict={x: sample_text_ids, seq_length: np.array([len(sample_text)])})
print("Result:", id_to_w[np.argmax(prediction_out)])
In case of many-to-many RNN, you should use tf.contrib.seq2seq.sequence_loss to calculate per time step loss. Your code should look like this:
...
logits = tf.layers.dense(states, units=vocab_size, activation=None)
weights = tf.sequence_mask(seq_length, n_steps)
xentropy = tf.contrib.seq2seq.sequence_loss(logits, y, weights)
...
See here for more details on tf.contrib.seq2seq.sequence_loss.
I'm having a rough time trying to figure out what's wrong with my LSTM model. I have 11 inputs, and 2 output classes (one-hot encoded) and very quickly, like within 1 batch or so, the error just goes to the % of one of the output classes and stays there.
I tried printing weights and biases, but they seem to all be full of NaN.
If i decrease the learning rate, or mess around with layers/units, I can get it to arrive at the % of one class error slowly, but it seems to always get to that point.
Here's the code:
num_units = 30
num_layers = 50
dropout_rate = 0.80
learning_rate=0.0001
batch_size = 180
epoch = 1
input_classes = len(train_input[0])
output_classes = len(train_output[0])
data = tf.placeholder(tf.float32, [None, input_classes, 1]) #Number of examples, number of input, dimension of each input
target = tf.placeholder(tf.float32, [None, output_classes]) #one-hot encoded: [1,0] = bad, [0,1] = good
dropout = tf.placeholder(tf.float32)
cell = tf.contrib.rnn.LSTMCell(num_units, state_is_tuple=True)
cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=dropout)
cell = tf.contrib.rnn.MultiRNNCell([cell] * num_layers, state_is_tuple=True)
#Input shape [batch_size, max_time, depth], output shape: [batch_size, max_time, cell.output_size]
val, _ = tf.nn.dynamic_rnn(cell, data, dtype=tf.float32)
val = tf.transpose(val, [1, 0, 2]) #reshapes it to [sequence_size, batch_size, depth]
#get last entry as it includes previous results
last = tf.gather(val, int(val.get_shape()[0]) - 1)
weight = tf.get_variable("W", shape=[num_units, output_classes], initializer=tf.contrib.layers.xavier_initializer())
bias = tf.get_variable("B", shape=[output_classes], initializer=tf.contrib.layers.xavier_initializer())
logits = tf.matmul(last, weight) + bias
prediction = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=target)
prediction = tf.clip_by_value(prediction, 1e-10,100.0)
cost = tf.reduce_mean(prediction)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
minimize = optimizer.minimize(cost)
mistakes = tf.not_equal(tf.argmax(target, 1), tf.argmax(logits, 1))
error = tf.reduce_mean(tf.cast(mistakes, tf.float32))
init_op = tf.global_variables_initializer()
saver = tf.train.Saver()
sess = tf.Session()
sess.run(init_op)
no_of_batches = int((len(train_input)) / batch_size)
for i in range(epoch):
ptr = 0
for j in range(no_of_batches):
inp, out = train_input[ptr:ptr+batch_size], train_output[ptr:ptr+batch_size]
ptr+=batch_size
sess.run(minimize,{data: inp, target: out, dropout: dropout_rate })
sess.close()
Since you have one hot encoding use sparse_softmax_cross_entropy_with_logits instead of tf.nn.softmax_cross_entropy_with_logits.
Refer to this stackoverflow answer to understand the difference of two functions.
1
I have created ANN with two RELU hidden layers + linear activation layer and trying to approximate simple ln(x) function. And I am can't do this good. I am confused because lx(x) in x:[0.0-1.0] range should be approximated without problems (I am using learning rate 0.01 and basic grad descent optimization).
import tensorflow as tf
import numpy as np
def GetTargetResult(x):
curY = np.log(x)
return curY
# Create model
def multilayer_perceptron(x, weights, biases):
# Hidden layer with RELU activation
layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
layer_1 = tf.nn.relu(layer_1)
# # Hidden layer with RELU activation
layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
layer_2 = tf.nn.relu(layer_2)
# Output layer with linear activation
out_layer = tf.matmul(layer_2, weights['out']) + biases['out']
return out_layer
# Parameters
learning_rate = 0.01
training_epochs = 10000
batch_size = 50
display_step = 500
# Network Parameters
n_hidden_1 = 50 # 1st layer number of features
n_hidden_2 = 10 # 2nd layer number of features
n_input = 1
# Store layers weight & bias
weights = {
'h1': tf.Variable(tf.random_uniform([n_input, n_hidden_1])),
'h2': tf.Variable(tf.random_uniform([n_hidden_1, n_hidden_2])),
'out': tf.Variable(tf.random_uniform([n_hidden_2, 1]))
}
biases = {
'b1': tf.Variable(tf.random_uniform([n_hidden_1])),
'b2': tf.Variable(tf.random_uniform([n_hidden_2])),
'out': tf.Variable(tf.random_uniform([1]))
}
x_data = tf.placeholder(tf.float32, [None, 1])
y_data = tf.placeholder(tf.float32, [None, 1])
# Construct model
pred = multilayer_perceptron(x_data, weights, biases)
# Minimize the mean squared errors.
loss = tf.reduce_mean(tf.square(pred - y_data))
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train = optimizer.minimize(loss)
# Before starting, initialize the variables. We will 'run' this first.
init = tf.initialize_all_variables ()
# Launch the graph.
sess = tf.Session()
sess.run(init)
for step in range(training_epochs):
x_in = np.random.rand(batch_size, 1).astype(np.float32)
y_in = GetTargetResult(x_in)
sess.run(train, feed_dict = {x_data: x_in, y_data: y_in})
if(step % display_step == 0):
curX = np.random.rand(1, 1).astype(np.float32)
curY = GetTargetResult(curX)
curPrediction = sess.run(pred, feed_dict={x_data: curX})
curLoss = sess.run(loss, feed_dict={x_data: curX, y_data: curY})
print("For x = {0} and target y = {1} prediction was y = {2} and squared loss was = {3}".format(curX, curY,curPrediction, curLoss))
For the configuration above NN is just learning to guess y = -1.00. I have tried different learning rates, couple optimizers and different configurations with no success - learning does not converge in any case. I did something like that with logarithm in past in other deep learning framework without problem.. Can be the TF specific issue? What am I doing wrong?
What your network has to predict
Source: WolframAlpha
What your architecture is
ReLU(ReLU(x * W_1 + b_1) * W_2 + b_2)*W_out + b_out
Thoughts
My first thought was that ReLU is the problem. However, you don't apply relu to the output, so that should not cause the problem.
Changing the initialization (from uniform to normal) and the Optimizer (from SGD to ADAM) seems to fix the problem:
#!/usr/bin/env python
import tensorflow as tf
import numpy as np
def get_target_result(x):
return np.log(x)
def multilayer_perceptron(x, weights, biases):
"""Create model."""
# Hidden layer with RELU activation
layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
layer_1 = tf.nn.relu(layer_1)
# # Hidden layer with RELU activation
layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
layer_2 = tf.nn.relu(layer_2)
# Output layer with linear activation
out_layer = tf.matmul(layer_2, weights['out']) + biases['out']
return out_layer
# Parameters
learning_rate = 0.01
training_epochs = 10**6
batch_size = 500
display_step = 500
# Network Parameters
n_hidden_1 = 50 # 1st layer number of features
n_hidden_2 = 10 # 2nd layer number of features
n_input = 1
# Store layers weight & bias
weights = {
'h1': tf.Variable(tf.truncated_normal([n_input, n_hidden_1], stddev=0.1)),
'h2': tf.Variable(tf.truncated_normal([n_hidden_1, n_hidden_2], stddev=0.1)),
'out': tf.Variable(tf.truncated_normal([n_hidden_2, 1], stddev=0.1))
}
biases = {
'b1': tf.Variable(tf.constant(0.1, shape=[n_hidden_1])),
'b2': tf.Variable(tf.constant(0.1, shape=[n_hidden_2])),
'out': tf.Variable(tf.constant(0.1, shape=[1]))
}
x_data = tf.placeholder(tf.float32, [None, 1])
y_data = tf.placeholder(tf.float32, [None, 1])
# Construct model
pred = multilayer_perceptron(x_data, weights, biases)
# Minimize the mean squared errors.
loss = tf.reduce_mean(tf.square(pred - y_data))
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
# train = optimizer.minimize(loss)
train = tf.train.AdamOptimizer(1e-4).minimize(loss)
# Before starting, initialize the variables. We will 'run' this first.
init = tf.initialize_all_variables()
# Launch the graph.
sess = tf.Session()
sess.run(init)
for step in range(training_epochs):
x_in = np.random.rand(batch_size, 1).astype(np.float32)
y_in = get_target_result(x_in)
sess.run(train, feed_dict={x_data: x_in, y_data: y_in})
if(step % display_step == 0):
curX = np.random.rand(1, 1).astype(np.float32)
curY = get_target_result(curX)
curPrediction = sess.run(pred, feed_dict={x_data: curX})
curLoss = sess.run(loss, feed_dict={x_data: curX, y_data: curY})
print(("For x = {0} and target y = {1} prediction was y = {2} and "
"squared loss was = {3}").format(curX, curY,
curPrediction, curLoss))
Training this for 1 minute gave me:
For x = [[ 0.19118255]] and target y = [[-1.65452647]] prediction was y = [[-1.65021849]] and squared loss was = 1.85587377928e-05
For x = [[ 0.17362741]] and target y = [[-1.75084364]] prediction was y = [[-1.74087048]] and squared loss was = 9.94640868157e-05
For x = [[ 0.60853624]] and target y = [[-0.4966988]] prediction was y = [[-0.49964082]] and squared loss was = 8.65551464813e-06
For x = [[ 0.33864763]] and target y = [[-1.08279514]] prediction was y = [[-1.08586168]] and squared loss was = 9.4036658993e-06
For x = [[ 0.79126364]] and target y = [[-0.23412406]] prediction was y = [[-0.24541236]] and squared loss was = 0.000127425722894
For x = [[ 0.09994856]] and target y = [[-2.30309963]] prediction was y = [[-2.29796076]] and squared loss was = 2.6408026315e-05
For x = [[ 0.31053194]] and target y = [[-1.16946852]] prediction was y = [[-1.17038012]] and squared loss was = 8.31002580526e-07
For x = [[ 0.0512077]] and target y = [[-2.97186542]] prediction was y = [[-2.96796203]] and squared loss was = 1.52364455062e-05
For x = [[ 0.120253]] and target y = [[-2.11815739]] prediction was y = [[-2.12729549]] and squared loss was = 8.35050013848e-05
So the answer might be that your optimizer is not good / the optimization problem starts at a bad point. See
Xavier Glorot, Yoshua Bengio: Understanding the difficulty of training deep feedforward neural networks
Visualizing Optimization Algos
The following image is from Alec Radfords nice gifs. It does not contain ADAM, but you get a feeling for how much better one can do than SGD:
Two idea how this might be improved
try dropout
try not to use x values close to 0. I would rather sample values in [0.01, 1].
However, my experience with regression problems is quite limited.
First of all, your input data is in range [0, 1), which is not a good input to a neural network. Subtract mean from x after computing y to make it normalized (also ideally divide by standard deviation).
However, in your particular case it was not enough to make it work.
I played with it and found two ways to make it work (both require data normalization as described above):
Either completely remove the second layer
or
Make the number of neurons in the second layer 50.
My guess would be that 10 neurons do not have sufficient representation power to pass enough information to the last layer (obviously, a perfectly smart NN would learn to ignore the second layer in this case passing the answer in one of the neurons, but the theoretical possibility doesn't mean that gradient descent will learn to do so).
I have not look at the code but this is the theory. If you use an activation function like "tanh", then for small weights the activation function is in the linear region and for large weights the activation function is either -1 or +1. If you are in the linear region across all layers then you can not approximate complex functions (i.e. you have a sandwich of linear layers hence the best you can do is linear aproximations) but if you have bigger weights then the nonlinearly allow you to approximate a wide range of functions. There are no free lunches, the weights need to be at the right values to avoid over-fitting and under-fitting. This process is called regularization.
I'm working with the LSTM model in Tensorflow.
I already trained and saved the LSTM model. Now I'm coming up to the last task to generate the sentences.
Here is my pseudo code:
# We have already the run_epoch(session, m, data, eval_op, verbose=False) function with fee_dict like this:
feed_dict = {m.input_data: x,
m.targets: y,
m.initial_state: state}
...
# train and save model
...
# load saved model for generating task
new_sentence = [START_TOKEN]
# Here I want to generate a sentence until END_TOKEN is generated.
while new_sentence[-1] != END_TOKEN:
logits = get_logits(model, new_sentence)
# get argmax(logits) or sample(logits)
next_word = argmax(logits)
new_sentence.append(next_word)
print(new_sentence)
My question is:
When training, validating, or testing model I have to feed both of the inputs and their labels (by shifted inputs one) into model via feed_dict dictionary. But in the generating task, I have only one input which is the generating sentence new_sentence.
How can I build the right get_logits function or full generate function also?
when you train you have an output of the neural network, based on that output you calculate the error, based on error you create the optimizer to minimize the error.
In order to generate a new sentence you need to get just the output of the neural network(rnn).
Edited:
"""
Placeholders
"""
x = tf.placeholder(tf.int32, [batch_size, num_steps], name='input_placeholder')
y = tf.placeholder(tf.int32, [batch_size, num_steps], name='labels_placeholder')
init_state = tf.zeros([batch_size, state_size])
"""
RNN Inputs
"""
# Turn our x placeholder into a list of one-hot tensors:
# rnn_inputs is a list of num_steps tensors with shape [batch_size, num_classes]
x_one_hot = tf.one_hot(x, num_classes)
rnn_inputs = tf.unpack(x_one_hot, axis=1)
"""
Definition of rnn_cell
This is very similar to the __call__ method on Tensorflow's BasicRNNCell. See:
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/rnn_cell.py
"""
with tf.variable_scope('rnn_cell'):
W = tf.get_variable('W', [num_classes + state_size, state_size])
b = tf.get_variable('b', [state_size], initializer=tf.constant_initializer(0.0))
def rnn_cell(rnn_input, state):
with tf.variable_scope('rnn_cell', reuse=True):
W = tf.get_variable('W', [num_classes + state_size, state_size])
b = tf.get_variable('b', [state_size], initializer=tf.constant_initializer(0.0))
return tf.tanh(tf.matmul(tf.concat(1, [rnn_input, state]), W) + b)
state = init_state
rnn_outputs = []
for rnn_input in rnn_inputs:
state = rnn_cell(rnn_input, state)
rnn_outputs.append(state)
final_state = rnn_outputs[-1]
#logits and predictions
with tf.variable_scope('softmax'):
W = tf.get_variable('W', [state_size, num_classes])
b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))
logits = [tf.matmul(rnn_output, W) + b for rnn_output in rnn_outputs]
predictions = [tf.nn.softmax(logit) for logit in logits]
# Turn our y placeholder into a list labels
y_as_list = [tf.squeeze(i, squeeze_dims=[1]) for i in tf.split(1, num_steps, y)]
#losses and train_step
losses = [tf.nn.sparse_softmax_cross_entropy_with_logits(logit,label) for \
logit, label in zip(logits, y_as_list)]
total_loss = tf.reduce_mean(losses)
train_step = tf.train.AdagradOptimizer(learning_rate).minimize(total_loss)
def train():
with tf.Session() as sess:
#load the model
training_losses = []
for idx, epoch in enumerate(gen_epochs(num_epochs, num_steps)):
training_loss = 0
training_state = np.zeros((batch_size, state_size))
if verbose:
print("\nEPOCH", idx)
for step, (X, Y) in enumerate(epoch):
tr_losses, training_loss_, training_state, _ = \
sess.run([losses,
total_loss,
final_state,
train_step],
feed_dict={x:X, y:Y, init_state:training_state})
training_loss += training_loss_
if step % 100 == 0 and step > 0:
if verbose:
print("Average loss at step", step,
"for last 250 steps:", training_loss/100)
training_losses.append(training_loss/100)
training_loss = 0
#save the model
def generate_seq():
with tf.Session() as sess:
#load the model
# load saved model for generating task
new_sentence = [START_TOKEN]
# Here I want to generate a sentence until END_TOKEN is generated.
while new_sentence[-1] != END_TOKEN:
logits = sess.run(final_state,{x:np.asarray([new_sentence])})
# get argmax(logits) or sample(logits)
next_word = argmax(logits[0])
new_sentence.append(next_word)
print(new_sentence)