Tensorflow GraphDef cannot be larger than 2GB - tensorflow

I'm trying to implement a deep autoencoder with tensorflow. The RBM pretraining works just fine, but when it comes to fine tuning, it raises the error: 'ValueError: GraphDef cannot be larger than 2GB'. My input is an array in the shape of [12396, 8192], and here is my layers: [8192 16384 8192 4096 2048 1024 512 256 512 1024 2048 4096 8192 16384 8192].
I know where the problem is, but I have no idea how to fix it. I have thought about using multiple graph, but what if my input is too big to even store one layer? Besides I don't know how many graph I should use. Set up a graph for every layer? That would be too slow and unnecessary.
Thank you!
def __init__(self, input_num, layers, rbm_learning_rate, deepnn_learning_rate, rbm_num_epoch,
deepnn_num_epoch, momentum=0, batch_size=128, data_type='float32'):
self.input_num = input_num
self.layers = layers
self.n_layers = len(self.layers)
self.rbm_learning_rate = rbm_learning_rate
self.deepnn_learning_rate = deepnn_learning_rate
if momentum == 0:
self.momentum = []
for _ in range(self.n_layers):
self.momentum.append(1)
self.rbm_num_epoch = rbm_num_epoch
self.deepnn_num_epoch = deepnn_num_epoch
self.batch_size = batch_size
self.data_type = data_type
self.rbm_list = []
self.rbm_list.append(RBM(self.input_num, self.layers[0], self.rbm_num_epoch,
self.momentum[0], self.rbm_learning_rate[0], self.batch_size, self.data_type))
for i in range(self.n_layers-1):
self.rbm_list.append(RBM(self.layers[i], self.layers[i+1], self.rbm_num_epoch,
self.momentum[i], self.rbm_learning_rate[i], self.batch_size, self.data_type))
def pretrain(self, train_set):
self.W_list = []
self.b_list = []
self.a_list = []
if not cmp(train_set.dtype, self.data_type):
train_set.dtype = self.data_type
next_train = train_set
for i, rboltz in enumerate(self.rbm_list):
next_train = self._pretrain_and_get_para(rboltz, next_train)
def _pretrain_and_get_para(self, rboltz, next_train):
output, W_out, a_out, b_out = rboltz.fit(next_train)
self.W_list.append(W_out)
self.a_list.append(a_out)
self.b_list.append(b_out)
return output
def fine_tune(self, train_set):
m, _ = train_set.shape
self.num_per_epoch = m / self.batch_size
train_batch = tf.placeholder(self.data_type, [None, self.input_num])
logits = self._build_model(train_batch)
loss = self._loss(logits, train_batch)
train_op = self._training(loss)
init = tf.initialize_all_variables()
with tf.Session() as sess:
sess.run(init)
for _ in range(self.deepnn_num_epoch):
for i in range(self.num_per_epoch):
_, cost = sess.run([train_op, loss], feed_dict = self._feed_build(train_batch, train_set, i))
print cost
def _feed_build(self, train_batch, train_set, i):
batch = prepare_data.next_batch(train_set, i, self.batch_size)
feed_dict = {train_batch: batch}
return feed_dict
def _build_model(self, train_batch):
middle_layer = self._make_encoder(train_batch)
last_layer = self._make_decoder(middle_layer)
return last_layer
def _make_encoder(self, train_batch):
encoder = []
encoder.append(train_batch)
for i, layer in enumerate(self.layers):
with tf.name_scope('encoder'+str(i)):
W = tf.Variable(self.W_list[i], name = 'weights')
b = tf.Variable(self.b_list[i], name = 'biases')
encoder.append(tf.sigmoid(b + tf.matmul(encoder[i], W)))
return encoder[self.n_layers]
def _make_decoder(self, middle_layer):
decoder = []
decoder.append(middle_layer)
for i, layer in enumerate(self.layers):
with tf.name_scope('decoder'+str(i)):
W = tf.Variable(self.W_list[self.n_layers-i-1], name = 'weights')
a = tf.Variable(self.a_list[self.n_layers-i-1], name = 'biases')
decoder.append(tf.sigmoid(a + tf.matmul(decoder[i], W, transpose_b = True)))
return decoder[self.n_layers]
def _loss(self, logits, labels):
loss = tf.nn.l2_loss(logits-labels)
return loss
def _training(self, loss):
optimizer = tf.train.GradientDescentOptimizer(self.deepnn_learning_rate)
train_op = optimizer.minimize(loss)
return train_op

Related

Pytorch LSTM not using GPU

I'm trying to train a pytorch LSTM model connected with couple of MLP layers. The model is coded as follows:
class RNNBlock(nn.Module):
def __init__(self, in_dim, hidden_dim, num_layer=1, dropout=0):
super(RNNBlock, self).__init__()
self.hidden_dim = hidden_dim
self.num_layer = num_layer
self.lstm = nn.LSTM(in_dim, hidden_dim, num_layer, dropout)
def forward(self, onehot, length):
batch_size = onehot.shape[0]
h_in = nn.Parameter(torch.randn(self.num_layer, batch_size, self.hidden_dim))
c_in = nn.Parameter(torch.randn(self.num_layer, batch_size, self.hidden_dim))
packed = nn.utils.rnn.pack_padded_sequence(onehot, length, batch_first=True)
output, (h_out, c_out) = self.lstm(packed, (h_in, c_in))
unpacked, unpacked_length = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
vectors = list()
for i, vector in enumerate(unpacked):
vectors.append(unpacked[i, unpacked_length[i]-1, :].view(1, -1))
out = torch.cat(vectors, 0)
return out
class Predictor(nn.Module):
def __init__(self, in_dim, out_dim, act=None):
super(Predictor, self).__init__()
self.linear = nn.Linear(in_dim, out_dim)
nn.init.xavier_normal_(self.linear.weight)
self.activation = act
def forward(self, x):
out = self.linear(x)
if self.activation != None:
out = self.activation(out)
return out
class RNNNet(nn.Module):
def __init__(self, args):
super(RNNNet, self).__init__()
self.rnnBlock = RNNBlock(args.in_dim, args.hidden_dim, args.num_layer, args.dropout)
self.pred1 = Predictor(args.hidden_dim, args.pred_dim1, act=nn.ReLU())
self.pred2 = Predictor(args.pred_dim1, args.pred_dim2, act=nn.ReLU())
self.pred3 = Predictor(args.pred_dim2, args.out_dim)
def forward(self, onehot, length):
out = self.rnnBlock(onehot, length)
out = self.pred1(out)
out = self.pred2(out)
out = self.pred3(out)
return out
and this is my train and experiment functions
def train(model, device, optimizer, criterion, data_train, bar, args):
epoch_train_loss = 0
epoch_train_mae = 0
for i, batch in enumerate(data_train):
list_onehot = torch.tensor(batch[0]).cuda().float()
list_length = torch.tensor(batch[1]).cuda()
list_logP = torch.tensor(batch[2]).cuda().float()
# Sort onehot tensor with respect to the sequence length.
list_length, list_index = torch.sort(list_length, descending=True)
list_length.cuda()
list_index.cuda()
list_onehot = torch.Tensor([list_onehot.tolist()[i] for i in list_index]).cuda().float()
model.train()
optimizer.zero_grad()
list_pred_logP = model(list_onehot, list_length).squeeze().cuda()
list_pred_logP.require_grad = False
train_loss = criterion(list_pred_logP, list_logP)
train_mae = mean_absolute_error(list_pred_logP.tolist(), list_logP.tolist())
epoch_train_loss += train_loss.item()
epoch_train_mae += train_mae
train_loss.backward()
optimizer.step()
bar.update(len(list_onehot))
epoch_train_loss /= len(data_train)
epoch_train_mae /= len(data_train)
return model, epoch_train_loss, epoch_train_mae
def experiment(dict_partition, device, bar, args):
time_start = time.time()
model = RNNNet(args)
model.cuda()
if args.optim == 'Adam':
optimizer = optim.Adam(model.parameters(),
lr=args.lr,
weight_decay=args.l2_coef)
elif args.optim == 'RMSprop':
optimizer = optim.RMSprop(model.parameters(),
lr=args.lr,
weight_decay=args.l2_coef)
elif args.optim == 'SGD':
optimizer = optim.SGD(model.parameters(),
lr=args.lr,
weight_decay=args.l2_coef)
else:
assert False, 'Undefined Optimizer Type'
criterion = nn.MSELoss()
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=args.step_size, gamma=args.gamma)
list_train_loss = list()
list_val_loss = list()
list_train_mae = list()
list_val_mae = list()
data_train = DataLoader(dict_partition['train'], batch_size=args.batch_size, shuffle=args.shuffle)
data_val = DataLoader(dict_partition['val'], batch_size=args.batch_size, shuffle=args.shuffle)
for epoch in range(args.epoch):
scheduler.step()
model, train_loss, train_mae = train(model, device, optimizer, criterion, data_train, bar, args)
list_train_loss.append(train_loss)
list_train_mae.append(train_mae)
mode, val_loss, val_mae = validate(model, device, criterion, data_val, bar, args)
list_val_loss.append(val_loss)
list_val_mae.append(val_mae)
data_test = DataLoader(dict_partition['test'], batch_size=args.batch_size, shuffle=args.shuffle)
mae, std, logP_total, pred_logP_total = test(model, device, data_test, args)
time_end = time.time()
time_required = time_end - time_start
args.list_train_loss = list_train_loss
args.list_val_loss = list_val_loss
args.list_train_mae = list_train_mae
args.list_val_mae = list_val_mae
args.logP_total = logP_total
args.pred_logP_total = pred_logP_total
args.mae = mae
args.std = std
args.time_required = time_required
return args
The list_onehot and list_length tensors are loaded from the DataLoader and uploaded to GPU. Then, to use packed sequence as input, I’ve sorted the both list_onehot and list_length and uploaded to GPU. The model was uploaded to GPU and h_in, c_in tensors and packed sequence object were also uploaded to the GPU. However, when I try to run this code, it does not use GPU but only use CPU. What should I do to use GPU to train this model?

Tensorflow: Can't overfit training data with batch size > 1

I coded a small RNN network with Tensorflow to return the total energy consumption given some parameters. There seem to be a problem in my code. It can't overfit the training data when I use a batch size > 1 (even with only 4 samples!). In the code below, the loss value reaches 0 when I set BatchSize to 1. However, by setting BatchSize to 2, the network fails to overfit and the loss value goes toward 12.500000 and gets stuck there forever.
I suspect this has something to do with LSTM states. I get the same problem if I don't update the state with each iteration. Or maybe the cost function? A help is appreciated. Thanks.
import tensorflow as tf
import numpy as np
import os
from utils import loadData
Epochs = 10000
LearningRate = 0.0001
MaxGradNorm = 5
SeqLen = 1
NChannels = 28
NClasses = 1
NLayers = 2
NUnits = 256
BatchSize = 1
NumSamples = 4
#################################################################
trainingFile = "./training.dat"
X_values, Y_values = loadData(trainingFile, SeqLen, NumSamples)
X = tf.placeholder(tf.float32, [BatchSize, SeqLen, NChannels], name='inputs')
Y = tf.placeholder(tf.float32, [BatchSize, SeqLen, NClasses], name='labels')
keep_prob = tf.placeholder(tf.float32, name='keep')
initializer = tf.contrib.layers.xavier_initializer()
Xin = tf.unstack(tf.transpose(X, perm=[1, 0, 2]))
lstm_layers = []
for i in range(NLayers):
lstm_layer = tf.nn.rnn_cell.LSTMCell(num_units=NUnits, initializer=initializer, use_peepholes=True, state_is_tuple=True)
dropout_layer = tf.contrib.rnn.DropoutWrapper(lstm_layer, output_keep_prob=keep_prob)
#[LSTM ---> DROPOUT] ---> [LSTM ---> DROPOUT] ---> etc...
lstm_layers.append(dropout_layer)
rnn = tf.nn.rnn_cell.MultiRNNCell(lstm_layers, state_is_tuple=True)
initial_state = rnn.zero_state(BatchSize, tf.float32)
outputs, final_state = tf.nn.static_rnn(rnn, Xin, dtype=tf.float32, initial_state=initial_state)
outputs = tf.transpose(outputs, [1,0,2])
outputs = tf.reshape(outputs, [-1, NUnits])
weight = tf.Variable(tf.truncated_normal([NUnits, NClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[NClasses]))
prediction = tf.matmul(outputs, weight) + bias
prediction = tf.reshape(prediction, [BatchSize, SeqLen, NClasses])
cost = tf.reduce_sum(tf.pow(tf.subtract(prediction, Y), 2)) / (2 * BatchSize)
tvars = tf.trainable_variables()
grad, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), MaxGradNorm)
optimizer = tf.train.AdamOptimizer(learning_rate = LearningRate)
train_step = optimizer.apply_gradients(zip(grad, tvars))
sess = tf.Session()
sess.run(tf.global_variables_initializer())
iteration = 1
for e in range(0, Epochs):
train_loss = []
state = sess.run(initial_state)
for i in xrange(0, len(X_values), BatchSize):
x = X_values[i:i + BatchSize]
y = Y_values[i:i + BatchSize]
y = np.expand_dims(y, 2)
feed = {X : x, Y : y, keep_prob : 1.0, initial_state : state}
_ , loss, state, pred = sess.run([train_step, cost, final_state, prediction], feed_dict = feed)
train_loss.append(loss)
iteration += 1
print("Epoch: {}/{}".format(e, Epochs), "Iteration: {:d}".format(iteration), "Train average rmse: {:6f}".format(np.mean(train_loss)))
Normalizing the input data solved the problem.

Cost-sensitive loss function in Tensorflow

I'm doing research for cost-sensitive neural network based on Tensorflow. But because of the static graph structure of Tensorflow. Some NN structure couldn't be realized by myself.
My loss function(cost) ,cost matrix and the computational progress is described as follow and my target is to compute the total cost and then optimize the NN :
Approximately computational progress:
the y_ is the last full-connect output of a CNN which has shape (1024,5)
the y is a Tensor which has shape(1024) and indicates the ground truth of x[i]
the y_soft[i] [j] indicates the probability of x[i] to be class j
How can I realize this in Tensorflow?
cost_matrix:
[[0,1,100],
[1,0,1],
[1,20,0]]
label:
[1,2]
y*:
[[0,1,0],
[0,0,1]]
y(prediction):
[[0.2,0.3,0.5],
[0.1,0.2,0.7]]
label,cost_matrix-->cost_embedding:
[[1,0,1],
[1,20,0]]
It obvious 0.3 in [0.2,0.3,0.5] refers to right lable probility of [0,1,0], so it should not contibute to loss.
0.7 in [0.1,0.2,0.7] is the same. In other words, the pos with value 1 in y* not contibute to loss.
So I have (1-y*):
[[1,0,1],
[1,1,0]]
Then the entropy is target*log(predict) + (1-target) * log(1-predict),and value 0 in y*,should use (1-target)*log(1-predict), so I use (1-predict) said (1-y)
1-y:
[[0.8,*0.7*,0.5],
[0.9,0.8,*0.3*]]
(italic num is useless)
the custom loss is
[[1,0,1], [1,20,0]] * log([[0.8,0.7,0.5],[0.9,0.8,0.3]]) *
[[1,0,1],[1,1,0]]
and you can see the (1-y*) can be drop here
so the loss is -tf.reduce_mean(cost_embedding*log(1-y))
,to make it applicable , should be:
-tf.reduce_mean(cost_embedding*log(tf.clip((1-y),1e-10)))
the demo is below
import tensorflow as tf
import numpy as np
hidden_units = 50
num_class = 3
class Model():
def __init__(self,name_scope,is_custom):
self.name_scope = name_scope
self.is_custom = is_custom
self.input_x = tf.placeholder(tf.float32,[None,hidden_units])
self.input_y = tf.placeholder(tf.int32,[None])
self.instantiate_weights()
self.logits = self.inference()
self.predictions = tf.argmax(self.logits,axis=1)
self.losses,self.train_op = self.opitmizer()
def instantiate_weights(self):
with tf.variable_scope(self.name_scope + 'FC'):
self.W = tf.get_variable('W',[hidden_units,num_class])
self.b = tf.get_variable('b',[num_class])
self.cost_matrix = tf.constant(
np.array([[0,1,100],[1,0,100],[20,5,0]]),
dtype = tf.float32
)
def inference(self):
return tf.matmul(self.input_x,self.W) + self.b
def opitmizer(self):
if not self.is_custom:
loss = tf.nn.sparse_softmax_cross_entropy_with_logits\
(labels=self.input_y,logits=self.logits)
else:
batch_cost_matrix = tf.nn.embedding_lookup(
self.cost_matrix,self.input_y
)
loss = - tf.log(1 - tf.nn.softmax(self.logits))\
* batch_cost_matrix
train_op = tf.train.AdamOptimizer().minimize(loss)
return loss,train_op
import random
batch_size = 128
norm_model = Model('norm',False)
custom_model = Model('cost',True)
split_point = int(0.9 * dataset_size)
train_set = datasets[:split_point]
test_set = datasets[split_point:]
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(100):
batch_index = random.sample(range(split_point),batch_size)
train_batch = train_set[batch_index]
train_labels = lables[batch_index]
_,eval_predict,eval_loss = sess.run([norm_model.train_op,
norm_model.predictions,norm_model.losses],
feed_dict={
norm_model.input_x:train_batch,
norm_model.input_y:train_labels
})
_,eval_predict1,eval_loss1 = sess.run([custom_model.train_op,
custom_model.predictions,custom_model.losses],
feed_dict={
custom_model.input_x:train_batch,
custom_model.input_y:train_labels
})
# print 'norm',eval_predict,'\ncustom',eval_predict1
print np.sum(((eval_predict == train_labels)==True).astype(np.int)),\
np.sum(((eval_predict1 == train_labels)==True).astype(np.int))
if i%10 == 0:
print 'norm_test',sess.run(norm_model.predictions,
feed_dict={
norm_model.input_x:test_set,
norm_model.input_y:lables[split_point:]
})
print 'custom_test',sess.run(custom_model.predictions,
feed_dict={
custom_model.input_x:test_set,
custom_model.input_y:lables[split_point:]
})

TensorFlow: loss jumps up after restoring RNN net

Environment info
Operating System: Windows 7 64-bit
Tensorflow installed from pre-built pip (no CUDA): 1.0.1
Python 3.5.2 64-bit
Problem
I have problems with restoring my net (RNN character base language model). Below is a simplified version with the same problem.
When I run it the first time, I get, for example, this.
...
step 160: loss = 1.956 (perplexity = 7.069016620211226)
step 180: loss = 1.837 (perplexity = 6.274748642468816)
step 200: loss = 1.825 (perplexity = 6.202084762557817)
But on the second run, after restoring parameters, I get this.
step 220: loss = 2.346 (perplexity = 10.446611983898903)
step 240: loss = 2.346 (perplexity = 10.446709120339545)
...
All the tf variables seem to be correctly restored, including the state, which will be fed to RNN.
Data position is also restored (from 'step').
I also made a similar program for MNIST recognition model, and this one works fine: the losses before and after the restoring are continuous.
Are there any other parameters or states that should be saved and restored?
import argparse
import os
import tensorflow as tf
import numpy as np
import math
B = 20 # batch size
H = 200 # size of hidden layer of neurons
T = 25 # number of time steps to unroll the RNN for
data_file = 'ptb.train.txt' # any plain text file will do
checkpoint_dir = "tmp"
#----------------
# prepare data
#----------------
data = open(data_file, 'r').read()
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has {0} characters, {1} unique.'.format(data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }
input_index_raw = np.array([char_to_ix[ch] for ch in data])
input_index_raw = input_index_raw[0:len(input_index_raw) // T * T]
input_index_raw_shift = np.append(input_index_raw[1:], input_index_raw[0])
input_all = input_index_raw.reshape([-1, T])
target_all = input_index_raw_shift.reshape([-1, T])
num_packed_data = len(input_all)
#----------------
# build model
#----------------
class Model(object):
def __init__(self):
self.input_ph = tf.placeholder(tf.int32, [None, T], name="input_ph")
self.target_ph = tf.placeholder(tf.int32, [None, T], name="target_ph")
embedding = tf.get_variable("embedding", [vocab_size, H], initializer=tf.random_normal_initializer(), dtype=tf.float32)
# input_ph is B x T.
# input_embedded is B x T x H.
input_embedded = tf.nn.embedding_lookup(embedding, self.input_ph)
cell = tf.contrib.rnn.BasicRNNCell(H)
self.state_ph = tf.placeholder(tf.float32, (None, cell.state_size), name="state_ph")
# Make state variable so that it will be saved by the saver.
self.state = tf.get_variable("state", (B, cell.state_size), initializer=tf.zeros_initializer(), trainable=False, dtype=tf.float32)
# Construct initial_state according to whether restoring or not.
self.isRestore = tf.placeholder(tf.bool, shape=(), name="isRestore")
zero_state = cell.zero_state(B, dtype=tf.float32)
self.initial_state = tf.cond(self.isRestore, lambda: self.state, lambda: zero_state)
# input_embedded : B x T x H
# output: B x T x H
# state : B x cell.state_size
output, state_ = tf.nn.dynamic_rnn(cell, input_embedded, initial_state=self.state_ph)
self.final_state = tf.assign(self.state, state_)
# reshape to (B * T) x H.
output_flat = tf.reshape(output, [-1, H])
# Convert hidden layer's output to vector of logits for each vocabulary.
softmax_w = tf.get_variable("softmax_w", [H, vocab_size], dtype=tf.float32)
softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=tf.float32)
logits = tf.matmul(output_flat, softmax_w) + softmax_b
# cross_entropy is a vector of length B * T
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.reshape(self.target_ph, [-1]), logits=logits)
self.loss = tf.reduce_mean(cross_entropy)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
self.global_step = tf.get_variable("global_step", (), initializer=tf.zeros_initializer(), trainable=False, dtype=tf.int32)
self.training_op = optimizer.minimize(cross_entropy, global_step=self.global_step)
def train_batch(self, sess, input_batch, target_batch, initial_state):
final_state_, _, final_loss = sess.run([self.final_state, self.training_op, self.loss], feed_dict={self.input_ph: input_batch, self.target_ph: target_batch, self.state_ph: initial_state})
return final_state_, final_loss
# main
with tf.Session() as sess:
if not tf.gfile.Exists(checkpoint_dir):
tf.gfile.MakeDirs(checkpoint_dir)
batch_stride = num_packed_data // B
# make model
model = Model()
saver = tf.train.Saver()
# always initialize
init = tf.global_variables_initializer()
init.run()
# restore if necessary
isRestore = False
ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
if ckpt:
isRestore = True
last_model = ckpt.model_checkpoint_path
print("Loading " + last_model)
saver.restore(sess, last_model)
# set initial step
step = tf.train.global_step(sess, model.global_step) + 1
print("start step = {0}".format(step))
# fetch initial state
state = sess.run(model.initial_state, feed_dict={model.isRestore: isRestore})
print("Initial state: {0}".format(state))
while True:
# prepare batch data
idx = [(step + x * batch_stride) % num_packed_data for x in range(0, B)]
input_batch = input_all[idx]
target_batch = target_all[idx]
state, last_loss = model.train_batch(sess, input_batch, target_batch, state)
if step % 20 == 0:
print('step {0}: loss = {1:.3f} (perplexity = {2})'.format(step, last_loss, math.exp(last_loss)))
if step % 200 == 0:
saved_file = saver.save(sess, os.path.join(checkpoint_dir, "model.ckpt"), global_step=step)
print("Saved to " + saved_file)
print("Last state: {0}".format(model.state.eval()))
break;
step = step + 1
The problem is solved. It had nothing to do with RNN nor TensorFlow.
I changed
chars = list(set(data))
to
chars = sorted(set(data))
and now it works.
This is because python uses a random hash function to build the set, and every time python restarted, 'chars' had a different ordering.

Tensorflow RNN: Perplexity per Epoch remains constant

I am training an RNN-based language-model using Tensorflow. The model is very similar to the PTB model example in the TF tutorials section. However, when I attempt to train the model on my own data, the perplexity of the model does not go down; it remains constant throughout multiple epochs. Could anyone let me know what I might be doing wrong.
I have a feeling that I am not handling the targets properly, but the gist of my code for the targets is:
def batcher(batch_size,unroll_steps,data,pad):
print(len(data))
batches = len(data) / batch_size
inp = []
target = []
for i in range(batches):
#print(len(data[i*batch_size:(i+1)*batch_size]))
x = data[i*batch_size:(i+1)*batch_size]
y = [ line[1:]+[pad] for line in x ]
yield (x,y)
That is, I just shift the data by 1 and use that as the target for the next word in a sentence.
The training script and model (class) are seen below
Training script (excerpt):
def train(session, model, folder,batch_size,unroll_steps,epoch):
word_to_id, id_to_word, train, val = build_inputs(folder,unroll_steps)
pad = word_to_id['<pad>']
costs = 0
iters = 0
train_size = len(train)
batch_size = model.batch_size
batches = train_size / batch_size
state = session.run(model._initial_state)
print("Running epoch %d" % epoch)
for i in range(batches):
fetches = [model.cost, model._final_state, model.logits]
feed_dict = {}
x = train[i*batch_size:(i+1)*batch_size]
y = [ line[1:] +[pad] for line in x ]
feed_dict[model.input] = x
feed_dict[model.targets] = y
feed_dict[model._initial_state] = state
#print("Cell-state complete - Running")
cost, state, logits = session.run(fetches, feed_dict)
#print("Single Run complete")
costs += cost
iters += model.unroll_steps
print("\tEpoch %d: Perplexity is %f" % (epoch, np.exp(costs/iters)))
return np.exp(costs/iters)
Model:
import tensorflow as tf
class LM(object):
def __init__(self, train, max_gradient, batch_size, unroll_steps, vocab, size, layers, learning_rate, init, prob):
self.batch_size = batch_size
self.max_gradient = max_gradient
self.layers = layers
self.learning_rate = learning_rate
self.unroll_steps = unroll_steps
self.init = init
#with tf. name_scope("Paramters"):
with tf.device('/gpu:0'), tf.name_scope("Input"):
self.input = tf.placeholder(tf.int64, shape=[batch_size, unroll_steps], name="input")
self.targets = tf.placeholder(tf.int64, shape=[batch_size, unroll_steps], name="targets")
#self.init = tf.placeholder(tf.float32, shape=[], name="init")
with tf.device('/gpu:0'), tf.name_scope("Embedding"):
embedding = tf.Variable(tf.random_uniform([vocab, size], -self.init, self.init), dtype=tf.float32, name="embedding")
embedded_input = tf.nn.embedding_lookup(embedding, self.input, name="embedded_input")
with tf.device('/gpu:0'), tf.name_scope("RNN"), tf.variable_scope(tf.get_variable_scope(), reuse = False) as scope:
lstm_cell = tf.contrib.rnn.BasicLSTMCell(size, forget_bias=0.0, state_is_tuple=True)
if train and prob < 1.0:
lstm_cell = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=prob)
cell = tf.contrib.rnn.MultiRNNCell([lstm_cell for _ in range(layers)], state_is_tuple=True)
self._initial_state = cell.zero_state(batch_size, tf.float32)
outputs = []
state = self._initial_state
for step in range(unroll_steps):
if step > 0: tf.get_variable_scope().reuse_variables()
(cell_output, state) = cell(embedded_input[:, step, :], state)
outputs.append(cell_output)
with tf.device('/gpu:0'), tf.name_scope("Cost"), tf.variable_scope(tf.get_variable_scope(), reuse = False) as scope:
output = tf.reshape(tf.concat(outputs,1), [-1,size])
softmax_w = tf.get_variable("softmax_w", [size, vocab], dtype=tf.float32)
softmax_b = tf.get_variable("softmax_b", [vocab], dtype=tf.float32)
logits = tf.matmul(output, softmax_w) + softmax_b
losses = []
for logit, target in zip([logits], [tf.reshape(self.targets,[-1])]):
target = tf.reshape(target, [-1])
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logit,labels=target)
losses.append(loss)
self.cost = tf.reduce_sum(losses) / batch_size
self._final_state = state
self.logits = logits
scope.reuse_variables()
if not train:
return
with tf.device('/gpu:0'), tf.name_scope("Train"), tf.variable_scope(tf.get_variable_scope(), reuse=False):
train_variables = tf.trainable_variables()
gradients, _ = tf.clip_by_global_norm(tf.gradients(self.cost, train_variables),self.max_gradient)
optimizer = tf.train.AdamOptimizer(self.learning_rate)
self.training = optimizer.apply_gradients(zip(gradients, train_variables))
tf.get_variable_scope().reuse_variables()