Tensorflow: Logistic regression cost=NaN - tensorflow

I am trying to implement a logistic regression problem having 9 different targets. When debugging I get
Epoch: 0025 cost= nan
This is how one batch looks like
batch_xs
[[ 3.40000000e+01 3.34000000e+01 9.00000000e-02 3.40000000e+01
4.06858908e+00 0.00000000e+00 3.30000000e+01 4.04000000e+01
1.00000000e-02 3.30000000e+01 4.06858908e+00 1.00000000e+00
3.20000000e+01 4.22000000e+01 7.00000000e-01 3.20000000e+01
4.06858908e+00 2.00000000e+00 3.10000000e+01 4.18000000e+01
5.00000000e-01 3.10000000e+01 4.06858908e+00 3.00000000e+00
3.00000000e+01 3.70000000e+01 0.00000000e+00 3.00000000e+01
4.06858908e+00 4.00000000e+00 2.90000000e+01 3.78000000e+01
2.00000000e-02 2.90000000e+01 4.06858908e+00 5.00000000e+00
2.80000000e+01 3.78000000e+01 2.00000000e-02 2.90000000e+01
4.06858908e+00 6.00000000e+00 4.90000000e+01 4.00000000e+00
1.00000000e+00]]
batch_ys:
[[0 0 0 1 0 0 0 0 0]]
While the originnal y was. I converted it into a (_,9) matrix using convert_y
[[3]]
This is some of the code I use
learning_rate = 0.01
training_epochs = 25
batch_size = 1
display_step = 1
x = tf.placeholder(tf.float32, [None,feature_column_count])
y = tf.placeholder(tf.float32, [None,n_target_classes])
W = tf.Variable(tf.zeros([feature_column_count,n_target_classes]))
b = tf.Variable(tf.zeros([n_target_classes]))
pred = tf.nn.softmax(tf.matmul(x,W)+b)
cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred), reduction_indices=1))
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
init = tf.global_variables_initializer()
def next_batch(num, data, labels):
idx = np.arange(0, len(data))
idx = idx[:num]
data_s = data[idx]
labels_s = labels[idx]
return np.asarray(data_s), np.asarray(labels_s)
def convert_y(size,n_classes,y):
yk = np.zeros((size,n_classes), dtype=np.int)
for a in range(len(y)):
yk[a,y[a]] = 1
return yk
with tf.Session() as sess:
sess.run(init)
for epoch in range(training_epochs):
avg_cost = 0.
total_batch = int(np.shape(TRAINING_SET.data)[0]/batch_size)
for i in range(total_batch):
features = TRAINING_SET.data
target = TRAINING_SET.target
batch_xs,batch_ys = next_batch(batch_size, features, target)
batch_ys = convert_y(len(batch_ys),n_target_classes,batch_ys)
print(batch_xs)
print(batch_ys)
_, c = sess.run([optimizer, cost], feed_dict={x: batch_xs, y: batch_ys})
avg_cost += c / total_batch
if (epoch+1) % display_step == 0:
print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))
print("Optimization Finished!")
correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print("Test Accuracy:",accuracy.eval({x: TRAINING_SET.data, y: convert_y(len(TRAINING_SET.target),n_target_classes,TRAINING_SET.target)}))
print("Validation Accuracy:",accuracy.eval({x: VALIDATION_SET.data, y: convert_y(len(VALIDATION_SET.target),n_target_classes,VALIDATION_SET.target)}))
Anyone knows where the code issue is?

For regression it's better to use mean_square loss. you may try the following.
Also gradient clipping would help
logits = tf.nn.relu(tf.matmul(x,W)+b)
cost = tf.reduce_mean(tf.square(tf.subtract(y, logits)))
learning_rate = 0.01
momentum = 0.9
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=True)
grads_vars = optimizer.compute_gradients(cost)
cliped_grads_vars = []
for gard, var in grads_vars:
grad = tf.clip_by_norm(grad, max_norm=10.0)
clipped_grads_vars.append((gard, var))
train_op = optimizer.apply_gradients(clipped_gards_vars)
....
_, c = sess.run([train_op, cost], feed_dict={x: batch_xs, y: batch_ys})

Related

Tensorflow Summary Scalar is not being displayed

I am using the following code to produce scalar graphs for my accuracy and cost, but the scalar summaries are not being displayed at the tensorboard. It gives me an error saying No scalar data was found. Can someone have a look please? code for the model:
def train_neural_network(x):
prediction = convolutional_neural_network(x)
merged_summary_op = tf.summary.merge_all()
with tf.name_scope("cost"):
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(cost)
tf.summary.scalar("cost", cost)
hm_epochs = 10
with tf.Session() as sess:
writer = tf.summary.FileWriter('C:/Thesis/Conv3d/69', sess.graph)
sess.run(tf.initialize_all_variables())
successful_runs = 0
total_runs = 0
for epoch in range(hm_epochs):
epoch_loss = 0
for data in train_data:
total_runs += 1
try:
X = data[0]
Y = data[1]
_, c = sess.run([optimizer, cost], feed_dict={x: X, y: Y})
writer.add_summary(summary, global_step=epoch)
epoch_loss += c
successful_runs += 1
except Exception as e:
pass
print('Epoch', epoch + 1, 'completed out of', hm_epochs, 'loss:', epoch_loss)
with tf.name_scope("accuracy"):
correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
tf.summary.scalar("accuracy", accuracy)
print('Accuracy:', accuracy.eval({x: [i[0] for i in validation_data], y: [i[1] for i in validation_data]}))
print('Done. Finishing accuracy:')
print('Accuracy:', accuracy.eval({x: [i[0] for i in validation_data], y: [i[1] for i in validation_data]}))
print('fitment percent:', successful_runs / total_runs)
You need to call merge_all after defining the summary ops. What happens right now is that the op simply doesn't have anything to summarize (since it's called first) and unfortunately isn't "smart" enough to add summary ops that are defined later.
Note that I also "fixed" the code in that you generally shouldn't run TF ops in a loop; I moved all the accuracy stuff before the training loop.
def train_neural_network(x):
prediction = convolutional_neural_network(x)
with tf.name_scope("cost"):
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(cost)
tf.summary.scalar("cost", cost)
with tf.name_scope("accuracy"):
correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
tf.summary.scalar("accuracy", accuracy)
merged_summary_op = tf.summary.merge_all()
hm_epochs = 10
with tf.Session() as sess:
writer = tf.summary.FileWriter('C:/Thesis/Conv3d/69', sess.graph)
sess.run(tf.initialize_all_variables())
successful_runs = 0
total_runs = 0
for epoch in range(hm_epochs):
epoch_loss = 0
for data in train_data:
total_runs += 1
try:
X = data[0]
Y = data[1]
_, c = sess.run([optimizer, cost], feed_dict={x: X, y: Y})
writer.add_summary(summary, global_step=epoch)
epoch_loss += c
successful_runs += 1
except Exception as e:
pass
print('Epoch', epoch + 1, 'completed out of', hm_epochs, 'loss:', epoch_loss)
print('Accuracy:', accuracy.eval({x: [i[0] for i in validation_data], y: [i[1] for i in validation_data]}))
print('Done. Finishing accuracy:')
print('Accuracy:', accuracy.eval({x: [i[0] for i in validation_data], y: [i[1] for i in validation_data]}))
print('fitment percent:', successful_runs / total_runs)

TensorFlow: why use fp result y rather than ExponentialMovingAverage fp result average_y as cross_entropy's parameter?

The code is as below using python 3,Anaconda Spyder3.6,Tensorflow 1.0.0
"""
Created on Sat Oct 14 11:00:54 2017
#author: Han.H
"""
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
INPUT_NODE = 784
OUTPUT_NODE = 10
LAYER1_NODE = 500
BATCH_SIZE = 100
LEARNING_RATE_BASE = 0.8
LEARNING_RATE_DECAY = 0.99
REGULARAZTION_RATE = 0.0001 #lambda
TRAINING_STEPS = 20000
MOVING_AVERAGE_DECAY = 0.99
# when not use ExponentialMovingAverage,just nomal fp
def inference(input_tensor, avg_class, weights1, biases1, weights2, biases2):
if avg_class == None:
layer1 = tf.nn.relu(tf.matmul(input_tensor, weights1) + biases1)
return tf.matmul(layer1, weights2) + biases2
else:
layer1 = tf.nn.relu(tf.matmul(input_tensor, avg_class.average(weights1)) + avg_class.average(biases1))
return tf.matmul(layer1, avg_class.average(weights2)) + avg_class.average(biases2)
# build a 3-layer full connected NN
def train(mnist):
x = tf.placeholder(tf.float32, [None, INPUT_NODE], name='x-input')
y_ = tf.placeholder(tf.float32, [None, OUTPUT_NODE], name='y-input')
weights1 = tf.Variable(tf.truncated_normal([INPUT_NODE, LAYER1_NODE], stddev=0.1))
biases1 = tf.Variable(tf.constant(0.1, shape=[LAYER1_NODE]))
weights2 = tf.Variable(tf.truncated_normal([LAYER1_NODE, OUTPUT_NODE], stddev=0.1))
biases2 = tf.Variable(tf.constant(0.1, shape=[OUTPUT_NODE]))
# normal fp
y = inference(x, None, weights1, biases1, weights2, biases2)
global_step = tf.Variable(0, trainable=False)
variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
variables_averages_op = variable_averages.apply(tf.trainable_variables())
average_y = inference(x, variable_averages, weights1, biases1, weights2, biases2)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=tf.argmax(y_, 1))
cross_entropy_mean = tf.reduce_mean(cross_entropy)
# L2
regularizer = tf.contrib.layers.l2_regularizer(REGULARAZTION_RATE)
regularaztion = regularizer(weights1) + regularizer(weights2)
loss = cross_entropy_mean + regularaztion
# Set learning rate
learning_rate = tf.train.exponential_decay(
LEARNING_RATE_BASE,
global_step,
mnist.train.num_examples / BATCH_SIZE,
LEARNING_RATE_DECAY,
staircase=True)
# Gradient descent
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
with tf.control_dependencies([train_step, variables_averages_op]):
train_op = tf.no_op(name='train')
correct_prediction = tf.equal(tf.argmax(average_y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
with tf.Session() as sess:
tf.global_variables_initializer().run()
validate_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
test_feed = {x: mnist.test.images, y_: mnist.test.labels}
for i in range(TRAINING_STEPS):
if i % 1000 == 0:
validate_acc = sess.run(accuracy, feed_dict=validate_feed)
print("After %d training step(s), validation accuracy using average model is %g " % (i, validate_acc))
xs,ys=mnist.train.next_batch(BATCH_SIZE)
sess.run(train_op,feed_dict={x:xs,y_:ys})
test_acc=sess.run(accuracy,feed_dict=test_feed)
print(("After %d training step(s), test accuracy using average model is %g" %(TRAINING_STEPS, test_acc)))
def main(argv=None):
# Main programme here
mnist = input_data.read_data_sets("F:/python/MNIST_data/", one_hot=True)
train(mnist)
if __name__=='__main__':
main()
This code has no problem and run well. I just want to know that why can't use average_y as logits to calculate cross entropy.I tried to do so.It turned out terrible results.The accuracy was as random-initialized firstly as 0.009.

Shapes must be equal rank, but are 2 and 1

I'm following the example of Sentdex on youtube and here is the code I have
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot = True)
n_nodes_hl1 = 500
n_nodes_hl2 = 500
n_nodes_hl3 = 500
n_classes = 10
batch_size = 100
x = tf.placeholder('float', [None, 784])
y = tf.placeholder('float')
def neural_network_model(data):
hidden_1_layer = {'weights':tf.Variable(tf.random_normal([784, n_nodes_hl1])),
'biases':tf.Variable(tf.random_normal([n_nodes_hl1]))}
hidden_2_layer = {'weights':tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2])),
'biases':tf.Variable(tf.random_normal([n_nodes_hl2]))}
hidden_3_layer = {'weights':tf.Variable(tf.random_normal([n_nodes_hl2, n_nodes_hl3])),
'biases':tf.Variable(tf.random_normal([n_nodes_hl3]))}
output_layer = {'weights':tf.Variable(tf.random_normal([n_nodes_hl3, n_classes])),
'biases':tf.Variable(tf.random_normal([n_classes])),}
l1 = tf.add(tf.matmul(data,hidden_1_layer['weights']), hidden_1_layer['biases'])
l1 = tf.nn.relu(l1)
l2 = tf.add(tf.matmul(l1,hidden_2_layer['weights']), hidden_2_layer['biases'])
l2 = tf.nn.relu(l2)
l3 = tf.add(tf.matmul(l2,hidden_3_layer['weights']), hidden_3_layer['biases'])
l3 = tf.nn.relu(l3)
output = tf.matmul(l3,output_layer['weights']) + output_layer['biases']
return output
def train_neural_network(x):
prediction = neural_network_model(x)
# OLD VERSION:
#cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(prediction,y) )
# NEW:
cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y) )
optimizer = tf.train.AdamOptimizer().minimize(cost)
hm_epochs = 10
with tf.Session() as sess:
# OLD:
#sess.run(tf.initialize_all_variables())
# NEW:
sess.run(tf.global_variables_initializer())
for epoch in range(hm_epochs):
epoch_loss = 0
for _ in range(int(mnist.train.num_examples/batch_size)):
epoch_x, epoch_y = mnist.train.next_batch(batch_size)
_, c = sess.run([optimizer, cost], feed_dict={x: epoch_x, y: epoch_y})
epoch_loss += c
print('Epoch', epoch, 'completed out of',hm_epochs,'loss:',epoch_loss)
correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
print('Accuracy:',accuracy.eval({x:mnist.test.images, y:mnist.test.labels}))
train_neural_network(x)
It raises this error :
ValueError: Shapes must be equal rank, but are 2 and 1
From merging shape 0 with other shapes. for 'SparseSoftmaxCrossEntropyWithLogits/packed' (op: 'Pack') with input shapes: [?,10], [10].
on this line:
cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y) )
I think it is about the size of y that induced the error, I tried using
cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
prediction, tf.squeeze(y)))
I'm pretty sure this means that the cost function induces the error (shown above) prediction and y are not the same shape, but I don't understand TensorFlow well enough to know how to fix it. I don't even really understand where y is being set, I got most of this code from a tutorial and fiddled with it to apply it to a different dataset. How can I fix this error?
ps I tried to print out prediction, it gives me two outputs and i guess thats where the error comes from:
prediction
(<tf.Tensor 'MatMul_39:0' shape=(?, 10) dtype=float32>,
<tf.Variable 'Variable_79:0' shape=(10,) dtype=float32_ref>)
Since you are using one_hot=True while reading the input data, just define correct shape for the y placeholder
# redefine the label and input with exact data type and shape
x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, shape=[None, n_classes])
You have a comma between the closing parenthesis and the dict bracket in this statement:
output_layer = {'weights':tf.Variable(tf.random_normal([n_nodes_hl3, n_classes])),'biases':tf.Variable(tf.random_normal([n_classes])),}
just before the close bracket:
...([n_classes])),}
#WORKING CODE
#I had the same problem as you, (not counting the comma) and i´m sorry i don´t remember the things i changed, but hopefully this will work
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
mnist= input_data.read_data_sets("/tmp/data/", one_hot=True)
#10 clasees, 0-9
n_nodes_hl1=500
n_nodes_hl2=500
n_nodes_hl3=500
n_classes=10
batch_size=100
x=tf.placeholder('float',[None,784])
y=tf.placeholder('float')
def neural(data):
hidden_1_layer={'weights':tf.Variable(tf.random_normal([784, n_nodes_hl1])),
'biases':tf.Variable(tf.random_normal([n_nodes_hl1]))}
hidden_2_layer={'weights':tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2])),
'biases':tf.Variable(tf.random_normal([n_nodes_hl2]))}
hidden_3_layer={'weights':tf.Variable(tf.random_normal([n_nodes_hl2, n_nodes_hl3])),
'biases':tf.Variable(tf.random_normal([n_nodes_hl3]))}
output_layer={'weights':tf.Variable(tf.random_normal([n_nodes_hl3, n_classes])),
'biases':tf.Variable(tf.random_normal([n_classes]))}
l1=tf.add(tf.matmul(data, hidden_1_layer['weights']), hidden_1_layer['biases'])
li= tf.nn.relu(l1)
l2=tf.add(tf.matmul(l1, hidden_2_layer['weights']), hidden_2_layer['biases'])
l2= tf.nn.relu(l2)
l3=tf.add(tf.matmul(l2, hidden_3_layer['weights']), hidden_3_layer['biases'])
l3= tf.nn.relu(l3)
output= tf.matmul(l3, output_layer['weights'])+ output_layer['biases']
return output
def train(x):
prediction=neural(x)
cost= tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction,labels=y))
optimizer=tf.train.AdamOptimizer().minimize(cost)
hm_epochs=20
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for epoch in range(hm_epochs):
epoch_loss=0
for _ in range(int(mnist.train.num_examples/batch_size)):
epoch_x,epoch_y = mnist.train.next_batch(batch_size)
_,c=sess.run([optimizer,cost],feed_dict={x: epoch_x, y: epoch_y})
epoch_loss += c
print('Epoch', epoch, 'completed out of', hm_epochs, 'loss:',epoch_loss)
correct= tf.equal(tf.argmax(prediction,1), tf.argmax(y,1))
accuracy= tf.reduce_mean(tf.cast(correct,'float'))
print('Accuracy:',accuracy.eval({x:mnist.test.images, y:mnist.test.labels}))
train(x)

Weights and costs unchanged when training in tensorflow

I'm a total rookie and tried to use tensorflow for solving multi-input and multi-output problem. However, during the process of training, the weights and the cost of the network are unchanged. Here's some main code, any suggestion would be appreciated!
learning_rate = 0.01
training_epoch = 2000
batch_size = 100
display_step = 1
# place holder for graph input
x = tf.placeholder("float64", [None, 14])
y = tf.placeholder("float64", [None, 8])
# model weights
w_1 = tf.Variable(tf.zeros([14, 11], dtype = tf.float64))
w_2 = tf.Variable(tf.zeros([11, 8], dtype = tf.float64))
# construct a model
h_in = tf.matmul(x, w_1)
h_out = tf.nn.relu(h_in)
o_in = tf.matmul(h_out, w_2)
o_out = tf.nn.relu(o_in)
# cost: mean square error
cost = tf.reduce_sum(tf.pow((o_out - y), 2))
# optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
# initializer
init = tf.global_variables_initializer()
# launch the graph
with tf.Session() as sess:
sess.run(init)
for epoch in range(training_epoch):
pos = 0;
# loop over all batches
if pos < train_input_array.shape[0]:
# get the next batch
batch_i = []
batch_o = []
for i in range(pos, pos + batch_size):
batch_i.append(train_input_array[i].tolist())
batch_o.append(train_output_array[i].tolist())
np.array(batch_i)
np.array(batch_o)
pos += batch_size;
sess.run(optimizer, feed_dict = {x: batch_i, y: batch_o})
print sess.run(w_2[0])
if (epoch + 1) % display_step == 0:
c = sess.run(cost, feed_dict = {x: batch_i, y: batch_o})
print("Epoch: ", "%04d" % (epoch + 1), "cost: ", "{:.9f}".format(c))
I think you need to change your cost function, to reduce_mean
# reduce sum doesn't work
cost = tf.reduce_sum(tf.pow((o_out - y), 2))
# you need to use mean
cost = tf.reduce_mean(tf.pow((o_out - y), 2))

Outputting sequence in TensorFlow RNN

I created a simple TensorFlow program that tries to predict the next character using the previous 3 characters in a body of text.
A single input could look like this:
np.array(['t','h','i'])
with the target about being
np.array(['s'])
I'm trying to expand this to output the next say 4 character rather than just the next character. To do this I tried feeding in a longer array to y
np.array(['s','','i'])
In addition to changing the y to
y = tf.placeholder(dtype=tf.int32, shape=[None, n_steps])
however, this yields the error:
Rank mismatch: Rank of labels (received 2) should equal rank of logits
minus 1 (received 2).
Here's the full code
embedding_size=40
n_neurons = 200
n_output = vocab_size
learning_rate = 0.001
with tf.Graph().as_default():
x = tf.placeholder(dtype=tf.int32, shape=[None, n_steps])
y = tf.placeholder(dtype=tf.int32, shape=[None])
seq_length = tf.placeholder(tf.int32, [None])
# Let's set up the embedding converting words to vectors
embeddings = tf.Variable(tf.random_uniform(shape=[vocab_size, embedding_size], minval=-1, maxval=1))
train_input = tf.nn.embedding_lookup(embeddings, x)
basic_cell = tf.nn.rnn_cell.GRUCell(num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(basic_cell, train_input, sequence_length=seq_length, dtype=tf.float32)
logits = tf.layers.dense(states, units=vocab_size, activation=None)
predictions = tf.nn.softmax(logits)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=y,
logits=logits)
loss = tf.reduce_mean(xentropy)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for r in range(1000):
x_batch, y_batch, seq_length_batch = input_fn()
feed_dict = {x: x_batch, y: y_batch, seq_length: seq_length_batch}
_, loss_out = sess.run([training_op, loss], feed_dict=feed_dict)
if r % 1000 == 0:
print("loss_out", loss_out)
sample_text = "for th"
sample_text_ids = np.expand_dims(np.array([w_to_id[c] for c in sample_text]+[0, 0], dtype=np.int32), 0)
prediction_out = sess.run(predictions, feed_dict={x: sample_text_ids, seq_length: np.array([len(sample_text)])})
print("Result:", id_to_w[np.argmax(prediction_out)])
In case of many-to-many RNN, you should use tf.contrib.seq2seq.sequence_loss to calculate per time step loss. Your code should look like this:
...
logits = tf.layers.dense(states, units=vocab_size, activation=None)
weights = tf.sequence_mask(seq_length, n_steps)
xentropy = tf.contrib.seq2seq.sequence_loss(logits, y, weights)
...
See here for more details on tf.contrib.seq2seq.sequence_loss.