Tensorflow model restoration (resume training seems starting from scratch) - tensorflow

I've a problem for resuming training after saving my model.
The problem is that my loss decrease form 6 to 3 for example. At this time I save the model.
When I restore it and continue training, the loss restart from 6.
It seems that the restoration doesn't really work.
I don't understand because printing the weights, it seems that they are loaded properly.
I use an ADAM optimizer. Thanks in advance.
Here:
batch_size = self.batch_size
num_classes = self.num_classes
n_hidden = 50 #700
n_layers = 1 #3
truncated_backprop = self.seq_len
dropout = 0.3
learning_rate = 0.001
epochs = 200
with tf.name_scope('input'):
x = tf.placeholder(tf.float32, [batch_size, truncated_backprop], name='x')
y = tf.placeholder(tf.int32, [batch_size, truncated_backprop], name='y')
with tf.name_scope('weights'):
W = tf.Variable(np.random.rand(n_hidden, num_classes), dtype=tf.float32)
b = tf.Variable(np.random.rand(1, num_classes), dtype=tf.float32)
inputs_series = tf.split(x, truncated_backprop, 1)
labels_series = tf.unstack(y, axis=1)
with tf.name_scope('LSTM'):
cell = tf.contrib.rnn.BasicLSTMCell(n_hidden, state_is_tuple=True)
cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=dropout)
cell = tf.contrib.rnn.MultiRNNCell([cell] * n_layers)
states_series, current_state = tf.contrib.rnn.static_rnn(cell, inputs_series, \
dtype=tf.float32)
logits_series = [tf.matmul(state, W) + b for state in states_series]
prediction_series = [tf.nn.softmax(logits) for logits in logits_series]
losses = [tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels) \
for logits, labels, in zip(logits_series, labels_series)]
total_loss = tf.reduce_mean(losses)
train_step = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)
tf.summary.scalar('total_loss', total_loss)
summary_op = tf.summary.merge_all()
loss_list = []
writer = tf.summary.FileWriter('tf_logs', graph=tf.get_default_graph())
all_saver = tf.train.Saver()
with tf.Session() as sess:
#sess.run(tf.global_variables_initializer())
tf.reset_default_graph()
saver = tf.train.import_meta_graph('./models/tf_models/rnn_model.meta')
saver.restore(sess, './models/tf_models/rnn_model')
for epoch_idx in range(epochs):
xx, yy = next(self.get_batch)
batch_count = len(self.D.chars) // batch_size // truncated_backprop
for batch_idx in range(batch_count):
batchX, batchY = next(self.get_batch)
summ, _total_loss, _train_step, _current_state, _prediction_series = sess.run(\
[summary_op, total_loss, train_step, current_state, prediction_series],
feed_dict = {
x : batchX,
y : batchY
})
loss_list.append(_total_loss)
writer.add_summary(summ, epoch_idx * batch_count + batch_idx)
if batch_idx % 5 == 0:
print('Step', batch_idx, 'Batch_loss', _total_loss)
if batch_idx % 50 == 0:
all_saver.save(sess, 'models/tf_models/rnn_model')
if epoch_idx % 5 == 0:
print('Epoch', epoch_idx, 'Last_loss', loss_list[-1])

I had the same problem, in my case, the model was being correctly restored but the loss was starting really high again and again, the problem was that my batch retreival was not random. I had three classes, A, B and C. My data was being fed in this manner A, then B, then C. I don't know if that is your problem but you must ensure that every batch you give to your model has all of your classes in it, so in your case, the batch must have batch_size/num_classes input per class. I changed it and everything worked perfectly :)
Check out if you are correctly feeding your model.

My problem was a code error in labels, they were changing between two run.
So it works now.
Thank you for the help

Related

My Pytorch model is giving very bad results

I am new with Deep Learning with Pytorch. I am more experienced with Tensorflow, and thus I should say I am not new to Deep Learning itself.
Currently, I am working on a simple ANN classification. There are only 2 classes so quite naturally I am using a Softmax BCELoss combination.
The dataset is like this:
shape of X_train (891, 7)
Shape of Y_train (891,)
Shape of x_test (418, 7)
I transformed the X_train and others to torch tensors as train_data and so on. The next step is:
train_ds = TensorDataset(train_data, train_label)
# Define data loader
batch_size = 32
train_dl = DataLoader(train_ds, batch_size, shuffle=True)
I made the model class like:
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
# an affine operation: y = Wx + b
self.fc1 = nn.Linear(7, 32)
self.bc1 = nn.BatchNorm1d(32)
self.fc2 = nn.Linear(32, 64)
self.bc2 = nn.BatchNorm1d(64)
self.fc3 = nn.Linear(64, 128)
self.bc3 = nn.BatchNorm1d(128)
self.fc4 = nn.Linear(128, 32)
self.bc4 = nn.BatchNorm1d(32)
self.fc5 = nn.Linear(32, 10)
self.bc5 = nn.BatchNorm1d(10)
self.fc6 = nn.Linear(10, 1)
self.bc6 = nn.BatchNorm1d(1)
self.drop = nn.Dropout2d(p=0.5)
def forward(self, x):
torch.nn.init.xavier_uniform(self.fc1.weight)
x = self.fc1(x)
x = self.bc1(x)
x = F.relu(x)
x = self.drop(x)
x = self.fc2(x)
x = self.bc2(x)
x = F.relu(x)
#x = self.drop(x)
x = self.fc3(x)
x = self.bc3(x)
x = F.relu(x)
x = self.drop(x)
x = self.fc4(x)
x = self.bc4(x)
x = F.relu(x)
#x = self.drop(x)
x = self.fc5(x)
x = self.bc5(x)
x = F.relu(x)
x = self.drop(x)
x = self.fc6(x)
x = self.bc6(x)
x = torch.sigmoid(x)
return x
model = Net()
The loss function and the optimizer are defined:
loss = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
At last, the task is to run the forward in epochs:
num_epochs = 1000
# Repeat for given number of epochs
for epoch in range(num_epochs):
# Train with batches of data
for xb,yb in train_dl:
pred = model(xb)
yb = torch.unsqueeze(yb, 1)
#print(pred, yb)
print('grad', model.fc1.weight.grad)
l = loss(pred, yb)
#print('loss',l)
# 3. Compute gradients
l.backward()
# 4. Update parameters using gradients
optimizer.step()
# 5. Reset the gradients to zero
optimizer.zero_grad()
# Print the progress
if (epoch+1) % 10 == 0:
print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, l.item()))
I can see in the output that after each iteration with all the batches, the hard weights are non-zero, after this zero_grad is applied.
However, the model is pretty bad. I get an F1 score of around 50% only! And the model is bad when I call it to predict the train_dl itself!!!
I am wondering what the reason is. The grad of weights not zero but not updating properly? The optimizer not optimizing the weights? Or what else?
Can someone please have a look?
I already tried different loss functions and optimizers. I tried with smaller datasets, bigger batches, different hyperparameters.
Thanks! :)
First of all, you don't use softmax activation for BCE loss, unless you have 2 output nodes, which is not the case. In PyTorch, BCE loss doesn't apply any activation function before calculating the loss, unlike the CCE which has a built-in softmax function. So, if you want to use BCE, you have to use sigmoid (or any function f: R -> [0, 1]) at the output layer, which you don't have.
Moreover, you should ideally do optimizer.zero_grad() for each batch if you want to do SGD (which is the default). If you don't do that, you will be just doing full-batch gradient descent, which is quite slow and gets stuck in local minima easily.

Problem with tensorflow initialization when it gets encapsulated

I am encapsulating an autoencoder cost calculation, in order to allow to be used with an swarm algorithms. The goal is to get a cost summary of the autoencoder sending a few parameters, so the method creates a model, train it and returns its cost tensor
def getAECost(dfnormalized, adamParam, iterations):
N_VISIBLE = 31
N_HIDDEN = 20
DEVICE = '/gpu:0' #Or '/cpu:0'
ITERATIONS = 1 + iterations
with tf.device(DEVICE):
# create node for input data(entiendo none columns and N_VISIBLE rows)
X = tf.placeholder("float", [None, N_VISIBLE], name='X')
# create nodes for hidden variables
W_init_max = 4 * np.sqrt(6. / (N_VISIBLE + N_HIDDEN))
W_init = tf.random_uniform(shape=[N_VISIBLE, N_HIDDEN])#,
# minval=-W_init_max,
# maxval=W_init_max)
#Inicialite our weight and bias
#W [784,500]
W = tf.Variable(W_init, name='W')
#Inicializate only bias of hidden layer
b = tf.Variable(tf.zeros([N_HIDDEN]), name='b')
#W_prime[500,784]
W_prime = tf.transpose(W) # tied weights between encoder and decoder
b_prime = tf.Variable(tf.zeros([N_VISIBLE]), name='b_prime')
#model that take our variables parameters
#Comportamiento de la red neuronal
def model(X, W, b, W_prime, b_prime):
tilde_X = X
#To decode ?
Y = tf.nn.sigmoid(tf.matmul(tilde_X, W) + b) # hidden state
#to reconstructed the input
Z = tf.nn.sigmoid(tf.matmul(Y, W_prime) + b_prime) # reconstructed input
return Z
# build model graph
pred = model(X, W, b, W_prime, b_prime)
# create cost function
#Sum of squared error
cost = tf.reduce_sum(tf.pow(X - pred, 2)) # minimize squared error
#Tensor to parameter learning rate
learning = tf.placeholder("float", name='learning')
train_op = tf.train.AdamOptimizer(learning).minimize(cost) # construct an optimizer
with tf.Session() as sess:
# you need to initialize all variables
tf.global_variables_initializer()
RATIO = adamParam
for i in range(ITERATIONS):
#Prepare input(minibach) from feed autoencoder
input_ = dfnormalized
# train autoencoder
sess.run(train_op, feed_dict={X: input_, learning: RATIO})
#Save last epoch and test
if(i == ITERATIONS-1):
#Get output as dataframe after training(Z is a array, we cast to list to append with a dataframe)
costAE = sess.run(cost, feed_dict={X: input_})
return costAE
It worked a few days ago (maybe I had another session on background), returning the method a float number, but nowadays is not working, getting the inizialization error
FailedPreconditionError: Attempting to use uninitialized value W
[[{{node W/read}}]]
in the training step
sess.run(train_op, feed_dict={X: input_, learning: RATIO})
Any advice about how this initialization problem can be solved, or how can I encapsulate a tensorflow model and session?
Thanks
You have to actually run the variables initializer, tf.global_variables_initializer() returns an op to be executed, it does not run the initialization for you. So the solution to your problem should be replacing the line
tf.global_variables_initializer()
with
sess.run(tf.global_variables_initializer())
I have tried what #Addy said, and reestructured the code to see more legible, and now works perfectly
class Model:
N_VISIBLE = 31
N_HIDDEN = 20
DEVICE = '/gpu:0' #Or '/cpu:0'
with tf.device(DEVICE):
# create node for input data(entiendo none columns and N_VISIBLE rows)
X = tf.placeholder("float", [None, N_VISIBLE], name='X')
# create nodes for hidden variables
W_init_max = 4 * np.sqrt(6. / (N_VISIBLE + N_HIDDEN))
W_init = tf.random_uniform(shape=[N_VISIBLE, N_HIDDEN])#,
# minval=-W_init_max,
# maxval=W_init_max)
#Inicialite our weight and bias
#W [784,500]
W = tf.Variable(W_init, name='W')
#Inicializate only bias of hidden layer
b = tf.Variable(tf.zeros([N_HIDDEN]), name='b')
#W_prime[500,784]
W_prime = tf.transpose(W) # tied weights between encoder and decoder
b_prime = tf.Variable(tf.zeros([N_VISIBLE]), name='b_prime')
#model that take our variables parameters
#Comportamiento de la red neuronal
def model(X, W, b, W_prime, b_prime):
tilde_X = X
#To decode ?
Y = tf.nn.sigmoid(tf.matmul(tilde_X, W) + b) # hidden state
#to reconstructed the input
Z = tf.nn.sigmoid(tf.matmul(Y, W_prime) + b_prime) # reconstructed input
return Z
# build model graph
pred = model(X, W, b, W_prime, b_prime)
# create cost function
#Sum of squared error
cost = tf.reduce_sum(tf.pow(X - pred, 2)) # minimize squared error
#Tensor to parameter learning rate
learning = tf.placeholder("float", name='learning')
train_op = tf.train.AdamOptimizer(learning).minimize(cost) # construct an optimizer
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
def train (self, data, adamParam, iterations):
input_ = data
RATIO = adamParam
for i in range(iterations):
# train autoencoder
_= self.sess.run(self.train_op, feed_dict={self.X: input_, self.learning: RATIO})
#print ("Model trained")
def getAECost(self, data):
input_ = data
return self.sess.run(self.cost, {self.X: data})
def trainAndGetCost (self, dataTrain, dataCost, adamParam, iterations):
self.train(dataTrain, adamParam, iterations)
return self.getAECost(dataCost)

tensorflow - linear regression does not give intended computational graph

I am trying to train a very simple linear regression with tensorflow but the loss doesn't decrease and the tensorboard also doesn't look right
### Generate data
w_true = np.array([1.0,2.0])
b_true = 0.5
x_train = np.random.multivariate_normal(mean=[0,0], cov=[[1,0],[0,1]], size=100)
x_test = np.random.multivariate_normal(mean=[0,0], cov=[[3,0],[0,3]], size=100)
y_train = np.dot(x_train,w_true) + b_true
y_test = np.dot(x_test,w_true) + b_true
### Placeholders for data input
x = tf.placeholder(dtype=tf.float32, shape=[None,2], name="x")
y = tf.placeholder(dtype=tf.float32, shape=[None], name="labels")
### Trainable parameters
w = tf.Variable(initial_value=np.random.multivariate_normal([0,0],[[1,0],[0,1]]), dtype=tf.float32,
name="W")
b = tf.Variable(initial_value=np.random.normal(1), dtype=tf.float32,name="B")
### Computational graph
y_pred = tf.tensordot(x,w,1)+b
tf.summary.histogram("weights",w)
tf.summary.histogram("bias",b)
loss = tf.reduce_sum(tf.squared_difference(y,y_pred), name="loss")
tf.summary.scalar("loss", loss)
with tf.name_scope("train"):
train_step = tf.train.GradientDescentOptimizer(0.00001).minimize(loss)
### Training
sess = tf.Session()
sess.run(tf.global_variables_initializer())
# For TensorBoard
writer = tf.summary.FileWriter("path_to_some_folder")
writer.add_graph(sess.graph)
for t in range(1000):
x_batch = x_train[np.random.choice(100, 20)]
y_batch = y_train[np.random.choice(100, 20)]
sess.run(train_step, {x:x_batch,y:y_batch})
print(sess.run(loss, {x:x_train,y:y_train}))
print(sess.run(loss, {x:x_test,y:y_test}))
I have tried different step sizes but the error always stays above 400 on the training and 1000 on the test set. I have tested that tf.tensordot() behaves like I expect. I you would like to see the tensorboard just replace the path_to_some_folder and after training run tensorboard --logdir path_to_some_folder
Thanks very much for the help
Your problem is because of the following two lines,
x_batch = x_train[np.random.choice(100, 20)]
y_batch = y_train[np.random.choice(100, 20)]
In each iteration, np.random.choice(100, 20) returns two different index lists for x_batch and y_batch. Therefore, your x_batch and y_batch will never match. Instead, replace that part with the following code.
BATCH_SIZE= 10
N_COUNT = len(x_train)
for t in range(1000):
for start, end in zip(range(0, N_COUNT, BATCH_SIZE),
range(BATCH_SIZE, N_COUNT + 1,BATCH_SIZE)):
x_batch = x_train[start:end]
y_batch = y_train[start:end]
sess.run(train_step, {x:x_batch,y:y_batch})
Hope this helps.

Siamese Model with LSTM network fails to train using tensorflow

Dataset Description
The dataset contains a set of question pairs and a label which tells if the questions are same. e.g.
"How do I read and find my YouTube comments?" , "How can I see all my
Youtube comments?" , "1"
The goal of the model is to identify if the given question pair is same or different.
Approach
I have created a Siamese network to identify if two questions are same. Following is the model:
graph = tf.Graph()
with graph.as_default():
embedding_placeholder = tf.placeholder(tf.float32, shape=embedding_matrix.shape, name='embedding_placeholder')
with tf.variable_scope('siamese_network') as scope:
labels = tf.placeholder(tf.int32, [batch_size, None], name='labels')
keep_prob = tf.placeholder(tf.float32, name='question1_keep_prob')
with tf.name_scope('question1') as question1_scope:
question1_inputs = tf.placeholder(tf.int32, [batch_size, seq_len], name='question1_inputs')
question1_embedding = tf.get_variable(name='embedding', initializer=embedding_placeholder, trainable=False)
question1_embed = tf.nn.embedding_lookup(question1_embedding, question1_inputs)
question1_lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
question1_drop = tf.contrib.rnn.DropoutWrapper(question1_lstm, output_keep_prob=keep_prob)
question1_multi_lstm = tf.contrib.rnn.MultiRNNCell([question1_drop] * lstm_layers)
q1_initial_state = question1_multi_lstm.zero_state(batch_size, tf.float32)
question1_outputs, question1_final_state = tf.nn.dynamic_rnn(question1_multi_lstm, question1_embed, initial_state=q1_initial_state)
scope.reuse_variables()
with tf.name_scope('question2') as question2_scope:
question2_inputs = tf.placeholder(tf.int32, [batch_size, seq_len], name='question2_inputs')
question2_embedding = question1_embedding
question2_embed = tf.nn.embedding_lookup(question2_embedding, question2_inputs)
question2_lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
question2_drop = tf.contrib.rnn.DropoutWrapper(question2_lstm, output_keep_prob=keep_prob)
question2_multi_lstm = tf.contrib.rnn.MultiRNNCell([question2_drop] * lstm_layers)
q2_initial_state = question2_multi_lstm.zero_state(batch_size, tf.float32)
question2_outputs, question2_final_state = tf.nn.dynamic_rnn(question2_multi_lstm, question2_embed, initial_state=q2_initial_state)
Calculate the cosine distance using the RNN outputs:
with graph.as_default():
diff = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(question1_outputs[:, -1, :], question2_outputs[:, -1, :])), reduction_indices=1))
margin = tf.constant(1.)
labels = tf.to_float(labels)
match_loss = tf.expand_dims(tf.square(diff, 'match_term'), 0)
mismatch_loss = tf.expand_dims(tf.maximum(0., tf.subtract(margin, tf.square(diff)), 'mismatch_term'), 0)
loss = tf.add(tf.matmul(labels, match_loss), tf.matmul((1 - labels), mismatch_loss), 'loss_add')
distance = tf.reduce_mean(loss)
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(distance)
Following is the code to train the model:
with graph.as_default():
saver = tf.train.Saver()
with tf.Session(graph=graph) as sess:
sess.run(tf.global_variables_initializer(), feed_dict={embedding_placeholder: embedding_matrix})
iteration = 1
for e in range(epochs):
summary_writer = tf.summary.FileWriter('/Users/mithun/projects/kaggle/quora_question_pairs/logs', sess.graph)
summary_writer.add_graph(sess.graph)
for ii, (x1, x2, y) in enumerate(get_batches(question1_train, question2_train, label_train, batch_size), 1):
feed = {question1_inputs: x1,
question2_inputs: x2,
labels: y[:, None],
keep_prob: 0.9
}
loss1 = sess.run([distance], feed_dict=feed)
if iteration%5==0:
print("Epoch: {}/{}".format(e, epochs),
"Iteration: {}".format(iteration),
"Train loss: {:.3f}".format(loss1))
if iteration%50==0:
val_acc = []
for x1, x2, y in get_batches(question1_val, question2_val, label_val, batch_size):
feed = {question1_inputs: x1,
question2_inputs: x2,
labels: y[:, None],
keep_prob: 1
}
batch_acc = sess.run([accuracy], feed_dict=feed)
val_acc.append(batch_acc)
print("Val acc: {:.3f}".format(np.mean(val_acc)))
iteration +=1
saver.save(sess, "checkpoints/quora_pairs.ckpt")
I have trained the above model with about 10,000 labeled data. But, the accuracy is stagnant at around 0.630 and strangely the validation accuracy is same across all the iterations.
lstm_size = 64
lstm_layers = 1
batch_size = 128
learning_rate = 0.001
Is there anything wrong with the way I have created the model?
This is a common problem with imbalanced datasets like the recently released Quora dataset which you are using. Since the Quora dataset is imbalanced (~63% negative and ~37% positive examples) you need proper initialization of weights. Without weight initialization your solution will be stuck in a local minima and it will train to predict only the negative class. Hence the 63% accuracy, because that is the percentage of 'not similar' questions in your validation data. If you check the results obtained on your validation set you will notice that it predicts all zeros. A truncated normal distribution proposed in He et al., http://arxiv.org/abs/1502.01852 is a good alternate for initializing the weights.

LSTM model error is percent of one output class

I'm having a rough time trying to figure out what's wrong with my LSTM model. I have 11 inputs, and 2 output classes (one-hot encoded) and very quickly, like within 1 batch or so, the error just goes to the % of one of the output classes and stays there.
I tried printing weights and biases, but they seem to all be full of NaN.
If i decrease the learning rate, or mess around with layers/units, I can get it to arrive at the % of one class error slowly, but it seems to always get to that point.
Here's the code:
num_units = 30
num_layers = 50
dropout_rate = 0.80
learning_rate=0.0001
batch_size = 180
epoch = 1
input_classes = len(train_input[0])
output_classes = len(train_output[0])
data = tf.placeholder(tf.float32, [None, input_classes, 1]) #Number of examples, number of input, dimension of each input
target = tf.placeholder(tf.float32, [None, output_classes]) #one-hot encoded: [1,0] = bad, [0,1] = good
dropout = tf.placeholder(tf.float32)
cell = tf.contrib.rnn.LSTMCell(num_units, state_is_tuple=True)
cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=dropout)
cell = tf.contrib.rnn.MultiRNNCell([cell] * num_layers, state_is_tuple=True)
#Input shape [batch_size, max_time, depth], output shape: [batch_size, max_time, cell.output_size]
val, _ = tf.nn.dynamic_rnn(cell, data, dtype=tf.float32)
val = tf.transpose(val, [1, 0, 2]) #reshapes it to [sequence_size, batch_size, depth]
#get last entry as it includes previous results
last = tf.gather(val, int(val.get_shape()[0]) - 1)
weight = tf.get_variable("W", shape=[num_units, output_classes], initializer=tf.contrib.layers.xavier_initializer())
bias = tf.get_variable("B", shape=[output_classes], initializer=tf.contrib.layers.xavier_initializer())
logits = tf.matmul(last, weight) + bias
prediction = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=target)
prediction = tf.clip_by_value(prediction, 1e-10,100.0)
cost = tf.reduce_mean(prediction)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
minimize = optimizer.minimize(cost)
mistakes = tf.not_equal(tf.argmax(target, 1), tf.argmax(logits, 1))
error = tf.reduce_mean(tf.cast(mistakes, tf.float32))
init_op = tf.global_variables_initializer()
saver = tf.train.Saver()
sess = tf.Session()
sess.run(init_op)
no_of_batches = int((len(train_input)) / batch_size)
for i in range(epoch):
ptr = 0
for j in range(no_of_batches):
inp, out = train_input[ptr:ptr+batch_size], train_output[ptr:ptr+batch_size]
ptr+=batch_size
sess.run(minimize,{data: inp, target: out, dropout: dropout_rate })
sess.close()
Since you have one hot encoding use sparse_softmax_cross_entropy_with_logits instead of tf.nn.softmax_cross_entropy_with_logits.
Refer to this stackoverflow answer to understand the difference of two functions.
1