I'm having a rough time trying to figure out what's wrong with my LSTM model. I have 11 inputs, and 2 output classes (one-hot encoded) and very quickly, like within 1 batch or so, the error just goes to the % of one of the output classes and stays there.
I tried printing weights and biases, but they seem to all be full of NaN.
If i decrease the learning rate, or mess around with layers/units, I can get it to arrive at the % of one class error slowly, but it seems to always get to that point.
Here's the code:
num_units = 30
num_layers = 50
dropout_rate = 0.80
learning_rate=0.0001
batch_size = 180
epoch = 1
input_classes = len(train_input[0])
output_classes = len(train_output[0])
data = tf.placeholder(tf.float32, [None, input_classes, 1]) #Number of examples, number of input, dimension of each input
target = tf.placeholder(tf.float32, [None, output_classes]) #one-hot encoded: [1,0] = bad, [0,1] = good
dropout = tf.placeholder(tf.float32)
cell = tf.contrib.rnn.LSTMCell(num_units, state_is_tuple=True)
cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=dropout)
cell = tf.contrib.rnn.MultiRNNCell([cell] * num_layers, state_is_tuple=True)
#Input shape [batch_size, max_time, depth], output shape: [batch_size, max_time, cell.output_size]
val, _ = tf.nn.dynamic_rnn(cell, data, dtype=tf.float32)
val = tf.transpose(val, [1, 0, 2]) #reshapes it to [sequence_size, batch_size, depth]
#get last entry as it includes previous results
last = tf.gather(val, int(val.get_shape()[0]) - 1)
weight = tf.get_variable("W", shape=[num_units, output_classes], initializer=tf.contrib.layers.xavier_initializer())
bias = tf.get_variable("B", shape=[output_classes], initializer=tf.contrib.layers.xavier_initializer())
logits = tf.matmul(last, weight) + bias
prediction = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=target)
prediction = tf.clip_by_value(prediction, 1e-10,100.0)
cost = tf.reduce_mean(prediction)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
minimize = optimizer.minimize(cost)
mistakes = tf.not_equal(tf.argmax(target, 1), tf.argmax(logits, 1))
error = tf.reduce_mean(tf.cast(mistakes, tf.float32))
init_op = tf.global_variables_initializer()
saver = tf.train.Saver()
sess = tf.Session()
sess.run(init_op)
no_of_batches = int((len(train_input)) / batch_size)
for i in range(epoch):
ptr = 0
for j in range(no_of_batches):
inp, out = train_input[ptr:ptr+batch_size], train_output[ptr:ptr+batch_size]
ptr+=batch_size
sess.run(minimize,{data: inp, target: out, dropout: dropout_rate })
sess.close()
Since you have one hot encoding use sparse_softmax_cross_entropy_with_logits instead of tf.nn.softmax_cross_entropy_with_logits.
Refer to this stackoverflow answer to understand the difference of two functions.
1
Related
I am using TensorFlow V1.10.0 and developing a Multi-Object Tracker based on MDNet. I need to assign a separate weight matrix for each detected object for the fully connected layers in order to get different embedding for each object during online training. I am using this tf.map_fn in order to generate a higher-order weight tensor (n_objects, flattened layer, hidden_units),
'''
def dense_fc4(n_objects):
initializer = lambda: tf.contrib.layers.xavier_initializer()(shape=(1024, 512))
return tf.Variable(initial_value=initializer, name='fc4/kernel',
shape=(n_objects.shape[0], 1024, 512))
W4 = tf.map_fn(dense_fc4, samples_flat)
b4 = tf.get_variable('fc4/bias', shape=512, initializer=tf.zeros_initializer())
fc4 = tf.add(tf.matmul(samples_flat, W4), b4)
fc4 = tf.nn.relu(fc4)
'''
However during execution when I run the session for W4 I get a weight matrix but all having the same values. Any help?
TIA
Here is a workaround, I was able to generate the multiple kernels outside the graph in a for loop and then giving it to the graph:
w6 = []
for n_obj in range(pos_data.shape[0]):
w6.append(tf.get_variable("fc6/kernel-" + str(n_obj), shape=(512, 2),
initializer=tf.contrib.layers.xavier_initializer()))
print("modeling fc6 branches...")
prob, train_op, accuracy, loss, pred, initialize_vars, y, fc6 = build_branches(fc5, w6)
def build_branches(fc5, w6):
y = tf.placeholder(tf.int64, [None, None])
b6 = tf.get_variable('fc6/bias', shape=2, initializer=tf.zeros_initializer())
fc6 = tf.add(tf.matmul(fc5, w6), b6)
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
logits=fc6))
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="fc6")
with tf.variable_scope("", reuse=tf.AUTO_REUSE):
optimizer = tf.train.AdamOptimizer(learning_rate=0.001, name='adam')
train_op = optimizer.minimize(loss, var_list=train_vars)
initialize_vars = train_vars
initialize_vars += [optimizer.get_slot(var, name)
for name in optimizer.get_slot_names()
for var in train_vars]
if isinstance(optimizer, tf.train.AdamOptimizer):
initialize_vars += optimizer._get_beta_accumulators()
prob = tf.nn.softmax(fc6)
pred = tf.argmax(prob, 2)
correct_pred = tf.equal(pred, y)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
return prob, train_op, accuracy, loss, pred, initialize_vars, y, fc6
I created a simple TensorFlow program that tries to predict the next character using the previous 3 characters in a body of text.
A single input could look like this:
np.array(['t','h','i'])
with the target about being
np.array(['s'])
I'm trying to expand this to output the next say 4 character rather than just the next character. To do this I tried feeding in a longer array to y
np.array(['s','','i'])
In addition to changing the y to
y = tf.placeholder(dtype=tf.int32, shape=[None, n_steps])
however, this yields the error:
Rank mismatch: Rank of labels (received 2) should equal rank of logits
minus 1 (received 2).
Here's the full code
embedding_size=40
n_neurons = 200
n_output = vocab_size
learning_rate = 0.001
with tf.Graph().as_default():
x = tf.placeholder(dtype=tf.int32, shape=[None, n_steps])
y = tf.placeholder(dtype=tf.int32, shape=[None])
seq_length = tf.placeholder(tf.int32, [None])
# Let's set up the embedding converting words to vectors
embeddings = tf.Variable(tf.random_uniform(shape=[vocab_size, embedding_size], minval=-1, maxval=1))
train_input = tf.nn.embedding_lookup(embeddings, x)
basic_cell = tf.nn.rnn_cell.GRUCell(num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(basic_cell, train_input, sequence_length=seq_length, dtype=tf.float32)
logits = tf.layers.dense(states, units=vocab_size, activation=None)
predictions = tf.nn.softmax(logits)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=y,
logits=logits)
loss = tf.reduce_mean(xentropy)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for r in range(1000):
x_batch, y_batch, seq_length_batch = input_fn()
feed_dict = {x: x_batch, y: y_batch, seq_length: seq_length_batch}
_, loss_out = sess.run([training_op, loss], feed_dict=feed_dict)
if r % 1000 == 0:
print("loss_out", loss_out)
sample_text = "for th"
sample_text_ids = np.expand_dims(np.array([w_to_id[c] for c in sample_text]+[0, 0], dtype=np.int32), 0)
prediction_out = sess.run(predictions, feed_dict={x: sample_text_ids, seq_length: np.array([len(sample_text)])})
print("Result:", id_to_w[np.argmax(prediction_out)])
In case of many-to-many RNN, you should use tf.contrib.seq2seq.sequence_loss to calculate per time step loss. Your code should look like this:
...
logits = tf.layers.dense(states, units=vocab_size, activation=None)
weights = tf.sequence_mask(seq_length, n_steps)
xentropy = tf.contrib.seq2seq.sequence_loss(logits, y, weights)
...
See here for more details on tf.contrib.seq2seq.sequence_loss.
Dataset Description
The dataset contains a set of question pairs and a label which tells if the questions are same. e.g.
"How do I read and find my YouTube comments?" , "How can I see all my
Youtube comments?" , "1"
The goal of the model is to identify if the given question pair is same or different.
Approach
I have created a Siamese network to identify if two questions are same. Following is the model:
graph = tf.Graph()
with graph.as_default():
embedding_placeholder = tf.placeholder(tf.float32, shape=embedding_matrix.shape, name='embedding_placeholder')
with tf.variable_scope('siamese_network') as scope:
labels = tf.placeholder(tf.int32, [batch_size, None], name='labels')
keep_prob = tf.placeholder(tf.float32, name='question1_keep_prob')
with tf.name_scope('question1') as question1_scope:
question1_inputs = tf.placeholder(tf.int32, [batch_size, seq_len], name='question1_inputs')
question1_embedding = tf.get_variable(name='embedding', initializer=embedding_placeholder, trainable=False)
question1_embed = tf.nn.embedding_lookup(question1_embedding, question1_inputs)
question1_lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
question1_drop = tf.contrib.rnn.DropoutWrapper(question1_lstm, output_keep_prob=keep_prob)
question1_multi_lstm = tf.contrib.rnn.MultiRNNCell([question1_drop] * lstm_layers)
q1_initial_state = question1_multi_lstm.zero_state(batch_size, tf.float32)
question1_outputs, question1_final_state = tf.nn.dynamic_rnn(question1_multi_lstm, question1_embed, initial_state=q1_initial_state)
scope.reuse_variables()
with tf.name_scope('question2') as question2_scope:
question2_inputs = tf.placeholder(tf.int32, [batch_size, seq_len], name='question2_inputs')
question2_embedding = question1_embedding
question2_embed = tf.nn.embedding_lookup(question2_embedding, question2_inputs)
question2_lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
question2_drop = tf.contrib.rnn.DropoutWrapper(question2_lstm, output_keep_prob=keep_prob)
question2_multi_lstm = tf.contrib.rnn.MultiRNNCell([question2_drop] * lstm_layers)
q2_initial_state = question2_multi_lstm.zero_state(batch_size, tf.float32)
question2_outputs, question2_final_state = tf.nn.dynamic_rnn(question2_multi_lstm, question2_embed, initial_state=q2_initial_state)
Calculate the cosine distance using the RNN outputs:
with graph.as_default():
diff = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(question1_outputs[:, -1, :], question2_outputs[:, -1, :])), reduction_indices=1))
margin = tf.constant(1.)
labels = tf.to_float(labels)
match_loss = tf.expand_dims(tf.square(diff, 'match_term'), 0)
mismatch_loss = tf.expand_dims(tf.maximum(0., tf.subtract(margin, tf.square(diff)), 'mismatch_term'), 0)
loss = tf.add(tf.matmul(labels, match_loss), tf.matmul((1 - labels), mismatch_loss), 'loss_add')
distance = tf.reduce_mean(loss)
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(distance)
Following is the code to train the model:
with graph.as_default():
saver = tf.train.Saver()
with tf.Session(graph=graph) as sess:
sess.run(tf.global_variables_initializer(), feed_dict={embedding_placeholder: embedding_matrix})
iteration = 1
for e in range(epochs):
summary_writer = tf.summary.FileWriter('/Users/mithun/projects/kaggle/quora_question_pairs/logs', sess.graph)
summary_writer.add_graph(sess.graph)
for ii, (x1, x2, y) in enumerate(get_batches(question1_train, question2_train, label_train, batch_size), 1):
feed = {question1_inputs: x1,
question2_inputs: x2,
labels: y[:, None],
keep_prob: 0.9
}
loss1 = sess.run([distance], feed_dict=feed)
if iteration%5==0:
print("Epoch: {}/{}".format(e, epochs),
"Iteration: {}".format(iteration),
"Train loss: {:.3f}".format(loss1))
if iteration%50==0:
val_acc = []
for x1, x2, y in get_batches(question1_val, question2_val, label_val, batch_size):
feed = {question1_inputs: x1,
question2_inputs: x2,
labels: y[:, None],
keep_prob: 1
}
batch_acc = sess.run([accuracy], feed_dict=feed)
val_acc.append(batch_acc)
print("Val acc: {:.3f}".format(np.mean(val_acc)))
iteration +=1
saver.save(sess, "checkpoints/quora_pairs.ckpt")
I have trained the above model with about 10,000 labeled data. But, the accuracy is stagnant at around 0.630 and strangely the validation accuracy is same across all the iterations.
lstm_size = 64
lstm_layers = 1
batch_size = 128
learning_rate = 0.001
Is there anything wrong with the way I have created the model?
This is a common problem with imbalanced datasets like the recently released Quora dataset which you are using. Since the Quora dataset is imbalanced (~63% negative and ~37% positive examples) you need proper initialization of weights. Without weight initialization your solution will be stuck in a local minima and it will train to predict only the negative class. Hence the 63% accuracy, because that is the percentage of 'not similar' questions in your validation data. If you check the results obtained on your validation set you will notice that it predicts all zeros. A truncated normal distribution proposed in He et al., http://arxiv.org/abs/1502.01852 is a good alternate for initializing the weights.
I've a problem for resuming training after saving my model.
The problem is that my loss decrease form 6 to 3 for example. At this time I save the model.
When I restore it and continue training, the loss restart from 6.
It seems that the restoration doesn't really work.
I don't understand because printing the weights, it seems that they are loaded properly.
I use an ADAM optimizer. Thanks in advance.
Here:
batch_size = self.batch_size
num_classes = self.num_classes
n_hidden = 50 #700
n_layers = 1 #3
truncated_backprop = self.seq_len
dropout = 0.3
learning_rate = 0.001
epochs = 200
with tf.name_scope('input'):
x = tf.placeholder(tf.float32, [batch_size, truncated_backprop], name='x')
y = tf.placeholder(tf.int32, [batch_size, truncated_backprop], name='y')
with tf.name_scope('weights'):
W = tf.Variable(np.random.rand(n_hidden, num_classes), dtype=tf.float32)
b = tf.Variable(np.random.rand(1, num_classes), dtype=tf.float32)
inputs_series = tf.split(x, truncated_backprop, 1)
labels_series = tf.unstack(y, axis=1)
with tf.name_scope('LSTM'):
cell = tf.contrib.rnn.BasicLSTMCell(n_hidden, state_is_tuple=True)
cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=dropout)
cell = tf.contrib.rnn.MultiRNNCell([cell] * n_layers)
states_series, current_state = tf.contrib.rnn.static_rnn(cell, inputs_series, \
dtype=tf.float32)
logits_series = [tf.matmul(state, W) + b for state in states_series]
prediction_series = [tf.nn.softmax(logits) for logits in logits_series]
losses = [tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels) \
for logits, labels, in zip(logits_series, labels_series)]
total_loss = tf.reduce_mean(losses)
train_step = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)
tf.summary.scalar('total_loss', total_loss)
summary_op = tf.summary.merge_all()
loss_list = []
writer = tf.summary.FileWriter('tf_logs', graph=tf.get_default_graph())
all_saver = tf.train.Saver()
with tf.Session() as sess:
#sess.run(tf.global_variables_initializer())
tf.reset_default_graph()
saver = tf.train.import_meta_graph('./models/tf_models/rnn_model.meta')
saver.restore(sess, './models/tf_models/rnn_model')
for epoch_idx in range(epochs):
xx, yy = next(self.get_batch)
batch_count = len(self.D.chars) // batch_size // truncated_backprop
for batch_idx in range(batch_count):
batchX, batchY = next(self.get_batch)
summ, _total_loss, _train_step, _current_state, _prediction_series = sess.run(\
[summary_op, total_loss, train_step, current_state, prediction_series],
feed_dict = {
x : batchX,
y : batchY
})
loss_list.append(_total_loss)
writer.add_summary(summ, epoch_idx * batch_count + batch_idx)
if batch_idx % 5 == 0:
print('Step', batch_idx, 'Batch_loss', _total_loss)
if batch_idx % 50 == 0:
all_saver.save(sess, 'models/tf_models/rnn_model')
if epoch_idx % 5 == 0:
print('Epoch', epoch_idx, 'Last_loss', loss_list[-1])
I had the same problem, in my case, the model was being correctly restored but the loss was starting really high again and again, the problem was that my batch retreival was not random. I had three classes, A, B and C. My data was being fed in this manner A, then B, then C. I don't know if that is your problem but you must ensure that every batch you give to your model has all of your classes in it, so in your case, the batch must have batch_size/num_classes input per class. I changed it and everything worked perfectly :)
Check out if you are correctly feeding your model.
My problem was a code error in labels, they were changing between two run.
So it works now.
Thank you for the help
I am trying to write a simple program using TensorFlow to predict the next number in a sequence.
I am not experienced in TensorFlow so instead of starting from scratch I started with this guide: http://monik.in/a-noobs-guide-to-implementing-rnn-lstm-using-tensorflow/
However, in contrast to the implementation in the link above I do not want to treat the problem as a classification problem - where I only have n possible outcomes - but instead just calculate a single value for a sequence.
I tried modifying the code to fit my problem:
import numpy as np
import random
from random import shuffle
import tensorflow as tf
NUM_EXAMPLES = 10000
train_input = ['{0:020b}'.format(i) for i in range(2**20)]
shuffle(train_input)
train_input = [map(int,i) for i in train_input]
ti = []
for i in train_input:
temp_list = []
for j in i:
temp_list.append([j])
ti.append(np.array(temp_list))
train_input = ti
train_output = []
for i in train_input:
count = 0
for j in i:
if j[0] == 1:
count+=1
#temp_list = ([0]*21)
#temp_list[count]=1
#train_output.append(temp_list)
train_output.append(count)
test_input = train_input[NUM_EXAMPLES:]
test_output = train_output[NUM_EXAMPLES:]
train_input = train_input[:NUM_EXAMPLES]
train_output = train_output[:NUM_EXAMPLES]
print "test and training data loaded"
target = tf.placeholder(tf.float32, [None, 1])
data = tf.placeholder(tf.float32, [None, 20,1]) #Number of examples, number of input, dimension of each input
#target = tf.placeholder(tf.float32, [None, 1])
#print('target shape: ', target.get_shape())
#print('shape[0]', target.get_shape()[1])
#print('int(shape) ', int(target.get_shape()[1]))
num_hidden = 24
cell = tf.nn.rnn_cell.LSTMCell(num_hidden)
val, _ = tf.nn.dynamic_rnn(cell, data, dtype=tf.float32)
val = tf.transpose(val, [1, 0, 2])
print('val shape, ', val.get_shape())
last = tf.gather(val, int(val.get_shape()[0]) - 1)
weight = tf.Variable(tf.truncated_normal([num_hidden, int(target.get_shape()[1])]))
bias = tf.Variable(tf.constant(0.1, shape=[target.get_shape()[1]]))
#prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
prediction = tf.matmul(last, weight) + bias
cross_entropy = -tf.reduce_sum(target - prediction)
optimizer = tf.train.AdamOptimizer()
minimize = optimizer.minimize(cross_entropy)
mistakes = tf.not_equal(tf.argmax(target, 1), tf.argmax(prediction, 1))
error = tf.reduce_mean(tf.cast(mistakes, tf.float32))
init_op = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init_op)
batch_size = 100
no_of_batches = int(len(train_input)) / batch_size
epoch = 500
for i in range(epoch):
ptr = 0
for j in range(no_of_batches):
inp, out = train_input[ptr:ptr+batch_size], train_output[ptr:ptr+batch_size]
ptr+=batch_size
sess.run(minimize,{data: inp, target: out})
print "Epoch ",str(i)
incorrect = sess.run(error,{data: test_input, target: test_output})
#print sess.run(prediction,{data: [[[1],[0],[0],[1],[1],[0],[1],[1],[1],[0],[1],[0],[0],[1],[1],[0],[1],[1],[1],[0]]]})
#print('Epoch {:2d} error {:3.1f}%'.format(i + 1, 100 * incorrect))
sess.close()
It is still work in progress, since the input is bogus as well as the cross entropy calculation.
However, my main problem is that the code doesn't compile at all.
I get this error:
ValueError: Cannot feed value of shape (100,) for Tensor
u'Placeholder:0', which has shape '(?, 1)'
The number 100 comes from the "batch_size" and the (?, 1) comes from the fact that my prediction is a one dimensional number. However, I do not have any idea where the problem is in my code?
Can anyone help me get the dimensions to match?
This error means your targets placeholder is being fed something with the wrong shape. To fix it, I think you should reshape something like test_output.reshape([-1, 1])
To fix the placeholders shape, change your code to
for i in range(epoch):
ptr = 0
for j in range(no_of_batches):
inp = train_input[ptr:ptr+batch_size]
out = train_output[ptr:ptr+batch_size]
ptr+=batch_size
out = np.reshape(out, (100,1)) #reshape
sess.run(minimize,{data: inp, target: out})
print ("Epoch ",str(i))
test_output = np.reshape(test_output, (1038576,1)) #reshape
incorrect = sess.run(error,{data: test_input, target: test_output})