I am modifying the train_model function below so that it plots loss and accuracy graphs at every epoch during training:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    losses = []
    accuracies = []
    y_loss = {}  # loss history
    y_loss['aug1_train'] = []
    y_loss['valid'] = []
    y_acc = {}
    y_acc['aug1_train'] = []
    y_acc['valid'] = []
    x_epoch = []
    fig = plt.figure()
    ax0 = fig.add_subplot(121, title="loss")
    ax1 = fig.add_subplot(122, title="accuracy")
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in ['aug1_train', 'valid']:
            if phase == 'aug1_train':
                scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode
            running_loss = 0.0
            running_corrects = 0
            # Iterate over data.
            for inputs, labels, paths in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                # track history only if in train
                with torch.set_grad_enabled(phase == 'aug1_train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    # backward + optimize only if in training phase
                    if phase == 'aug1_train':
                        loss.backward()
                        optimizer.step()
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            y_loss[phase].append(epoch_loss)
            y_acc[phase].append(epoch_acc)
            # deep copy the model
            if phase == 'valid' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            def draw_curve(current_epoch):
                x_epoch.append(current_epoch)
                ax0.plot(x_epoch, y_loss['aug1_train'], 'bo-', label='train')
                ax0.plot(x_epoch, y_loss['valid'], 'ro-', label='val')
                ax1.plot(x_epoch, y_acc['aug1_train'], 'bo-', label='train')
                ax1.plot(x_epoch, y_acc['valid'], 'ro-', label='val')
                if current_epoch == 0:
                    ax0.legend()
                    ax1.legend()
                fig.savefig(os.path.join('/content/drive/My Drive/Stanford40/Graphs', 'train.jpg'))
            draw_curve(epoch)
            if phase == 'aug1_train':
                losses.append(epoch_loss)
                accuracies.append(epoch_acc)
        print()
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, losses, accuracies
and I load the DenseNet-161 for training as below:
# Load pretrained DenseNet-161 model
model_ft = models.densenet161(pretrained=True)
model_ft.classifier = nn.Linear(2208, 11)
model_ft = model_ft.to(device)
criterion = nn.CrossEntropyLoss()
# Observe that all parameters are being optimized
opt = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
# Decay LR by a factor of 0.1 every 5 epochs
sched = lr_scheduler.StepLR(opt, step_size=5, gamma=0.1)
Finally I run the code below to start training:
model_ft, losses, accuracies = train_model(model_ft, criterion, opt, sched, num_epochs=30)
and got this error, as in the picture below:
How can I modify the code to avoid this error by using tensor.cpu()?
What if I call .item() here:
running_corrects += torch.sum(preds == labels.data).item()
and remove .double() when dividing?
epoch_acc = running_corrects / dataset_sizes[phase]
It's hard to say without a detailed error backtrace. The message is vague, but the information it gives is that something, somewhere, received a tensor and failed to convert it to a NumPy array properly. My intuition tells me it comes from the matplotlib code in your visualization step: I believe it is trying to convert your loss and accuracy terms.
You should convert them to plain Python numbers after having performed backpropagation:
y_loss[phase].append(epoch_loss)       # already a Python float, since running_loss accumulates loss.item()
y_acc[phase].append(epoch_acc.item())  # epoch_acc is a 0-dim (possibly CUDA) tensor, so .item() is needed
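As a general rule, anything that leaves the training loop for plotting or logging should be detached from the GPU first. A minimal sketch of that pattern (the tensor values here are made up for illustration):

import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
preds = torch.tensor([1, 0, 2], device=device)
labels = torch.tensor([1, 0, 1], device=device)

corrects = torch.sum(preds == labels)   # 0-dim tensor, possibly on the GPU
acc = corrects.item() / labels.size(0)  # .item() copies to the CPU and returns a Python number
print(acc)                              # plain float, safe to hand to matplotlib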
I am practising with TensorFlow on this tutorial. The evaluation function depends on training having saved the latest checkpoint:
checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)
start_epoch = 0
if ckpt_manager.latest_checkpoint:
    start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
    ckpt.restore(ckpt_manager.latest_checkpoint)

for epoch in range(start_epoch, EPOCHS):
    start = time.time()
    total_loss = 0
    for (batch, (img_tensor, target)) in enumerate(dataset):
        batch_loss, t_loss = train_step(img_tensor, target)
        total_loss += t_loss
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(
                epoch + 1, batch, batch_loss.numpy() / int(target.shape[1])))
    loss_plot.append(total_loss / num_steps)
    ckpt_manager.save()
Without ckpt_manager.save(), the evaluation function does not work.
When we have already trained a model and the checkpoints are available in checkpoint_path, how should we load the model without training?
You can use tf.train.latest_checkpoint to get the latest checkpoint file and then load it manually using ckpt.restore:
checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer=optimizer)
ckpt_path = tf.train.latest_checkpoint(checkpoint_path)
ckpt.restore(ckpt_path)
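If you restore purely for evaluation, so the optimizer state is irrelevant, the TF2 checkpoint API also lets you silence warnings about unmatched optimizer variables via expect_partial(). A minimal sketch, assuming the same encoder and decoder objects as above:

eval_ckpt = tf.train.Checkpoint(encoder=encoder, decoder=decoder)
eval_ckpt.restore(tf.train.latest_checkpoint(checkpoint_path)).expect_partial()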
I am trying to train a dataset of 10,000+ images using TensorFlow GPU (GTX 1060 Max-Q, 6 GB). Because the images in my dataset are large (512 x 424 pixels), I get a MemoryError.
Traceback (most recent call last):
  File "train.py", line 33, in <module>
    data = dataset.read_train_sets(train_path, img_size, classes, validation_size=validation_size)
  File "/home/nabeel/tf-realsense-gesture/dataset.py", line 103, in read_train_sets
    images, labels, img_names, cls = shuffle(images, labels, img_names, cls)
  File "/home/nabeel/anaconda3/envs/tensorflow/lib/python2.7/site-packages/sklearn/utils/__init__.py", line 403, in shuffle
    return resample(*arrays, **options)
  File "/home/nabeel/anaconda3/envs/tensorflow/lib/python2.7/site-packages/sklearn/utils/__init__.py", line 327, in resample
    resampled_arrays = [safe_indexing(a, indices) for a in arrays]
  File "/home/nabeel/anaconda3/envs/tensorflow/lib/python2.7/site-packages/sklearn/utils/__init__.py", line 216, in safe_indexing
    return X.take(indices, axis=0)
MemoryError
The problem with my code is that I am training all seven classes at the same time, which is why I get a memory error. I want to process a single class at a time.
I have tried to implement a while/for loop, but every time a loop finishes, the .meta file is overwritten and it only works on one class. Is there any way to train multiple classes at a time, or one by one?
train.py
batch_size = 1
# 7 classes for recognition
#classes = ['up']
classes = ['up','down','left','right','forward','backward','none']
num_classes = len(classes)
# 20% of the data will automatically be used for validation
validation_size = 0.2
img_size = 200
num_channels = 3
train_path = 'training_data'
# load all the training and validation images and labels into memory
data = dataset.read_train_sets(train_path, img_size, classes, validation_size=validation_size)
print("Complete reading input data. Will now print a snippet of it")
print("Number of files in Training-set:\t\t{}".format(len(data.train.labels)))
print("Number of files in Validation-set:\t{}".format(len(data.valid.labels)))
session = tf.Session()
x = tf.placeholder(tf.float32, shape=[batch_size, img_size, img_size, num_channels], name='x')
# labels
y_true = tf.placeholder(tf.float32, shape=[None, num_classes], name='y_true')
y_true_cls = tf.argmax(y_true, dimension=1)
# Network graph params
filter_size_conv1 = 3
num_filters_conv1 = 32
filter_size_conv2 = 3
num_filters_conv2 = 32
filter_size_conv3 = 3
num_filters_conv3 = 64
filter_size_conv4 = 3
num_filters_conv4 = 128
filter_size_conv5 = 3
num_filters_conv5 = 256
filter_size_conv6 = 3
num_filters_conv6 = 512
filter_size_conv7 = 3
num_filters_conv7 = 1024
fc_layer_size = 2048

def create_weights(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.05))

def create_biases(size):
    return tf.Variable(tf.constant(0.05, shape=[size]))

def create_convolutional_layer(input, num_input_channels, conv_filter_size, num_filters):
    # define the weights that will be trained
    weights = create_weights(shape=[conv_filter_size, conv_filter_size, num_input_channels, num_filters])
    # create biases
    biases = create_biases(num_filters)
    # create convolutional layer
    layer = tf.nn.conv2d(input=input, filter=weights, strides=[1, 1, 1, 1], padding='SAME')
    layer += biases
    # max-pooling
    layer = tf.nn.max_pool(value=layer,
                           ksize=[1, 2, 2, 1],
                           strides=[1, 2, 2, 1],
                           padding='SAME')
    # ReLU is the activation function
    layer = tf.nn.relu(layer)
    return layer

def create_flatten_layer(layer):
    layer_shape = layer.get_shape()
    num_features = layer_shape[1:4].num_elements()
    # Flatten the layer, i.e. reshape to num_features
    layer = tf.reshape(layer, [-1, num_features])
    return layer

def create_fc_layer(input,
                    num_inputs,
                    num_outputs,
                    use_relu=True):
    # define trainable weights and biases
    weights = create_weights(shape=[num_inputs, num_outputs])
    biases = create_biases(num_outputs)
    # fully connected layer
    layer = tf.matmul(input, weights) + biases
    if use_relu:
        layer = tf.nn.relu(layer)
    return layer

layer_conv1 = create_convolutional_layer(input=x,
                                         num_input_channels=num_channels,
                                         conv_filter_size=filter_size_conv1,
                                         num_filters=num_filters_conv1)
layer_conv2 = create_convolutional_layer(input=layer_conv1,
                                         num_input_channels=num_filters_conv1,
                                         conv_filter_size=filter_size_conv2,
                                         num_filters=num_filters_conv2)
layer_conv3 = create_convolutional_layer(input=layer_conv2,
                                         num_input_channels=num_filters_conv2,
                                         conv_filter_size=filter_size_conv3,
                                         num_filters=num_filters_conv3)
layer_conv4 = create_convolutional_layer(input=layer_conv3,
                                         num_input_channels=num_filters_conv3,
                                         conv_filter_size=filter_size_conv4,
                                         num_filters=num_filters_conv4)
layer_conv5 = create_convolutional_layer(input=layer_conv4,
                                         num_input_channels=num_filters_conv4,
                                         conv_filter_size=filter_size_conv5,
                                         num_filters=num_filters_conv5)
layer_conv6 = create_convolutional_layer(input=layer_conv5,
                                         num_input_channels=num_filters_conv5,
                                         conv_filter_size=filter_size_conv6,
                                         num_filters=num_filters_conv6)
layer_conv7 = create_convolutional_layer(input=layer_conv6,
                                         num_input_channels=num_filters_conv6,
                                         conv_filter_size=filter_size_conv7,
                                         num_filters=num_filters_conv7)
layer_flat = create_flatten_layer(layer_conv7)
layer_fc1 = create_fc_layer(input=layer_flat,
                            num_inputs=layer_flat.get_shape()[1:4].num_elements(),
                            num_outputs=fc_layer_size,
                            use_relu=True)
layer_fc2 = create_fc_layer(input=layer_fc1,
                            num_inputs=fc_layer_size,
                            num_outputs=num_classes,
                            use_relu=False)
y_pred = tf.nn.softmax(layer_fc2, name='y_pred')
y_pred_cls = tf.argmax(y_pred, dimension=1)
session.run(tf.global_variables_initializer())
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=layer_fc2, labels=y_true)
cost = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(cost)
correct_prediction = tf.equal(y_pred_cls, y_true_cls)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
session.run(tf.global_variables_initializer())

def show_progress(epoch, feed_dict_train, feed_dict_validate, val_loss):
    acc = session.run(accuracy, feed_dict=feed_dict_train)
    val_acc = session.run(accuracy, feed_dict=feed_dict_validate)
    msg = "Training Epoch {0} --- Training Accuracy: {1:>6.1%}, Validation Accuracy: {2:>6.1%}, Validation Loss: {3:.3f}"
    print(msg.format(epoch + 1, acc, val_acc, val_loss))

total_iterations = 0
saver = tf.train.Saver()

def train(num_iteration):
    global total_iterations
    for i in range(total_iterations, total_iterations + num_iteration):
        x_batch, y_true_batch, _, cls_batch = data.train.next_batch(batch_size)
        x_valid_batch, y_valid_batch, _, valid_cls_batch = data.valid.next_batch(batch_size)
        feed_dict_tr = {x: x_batch, y_true: y_true_batch}
        feed_dict_val = {x: x_valid_batch, y_true: y_valid_batch}
        session.run(optimizer, feed_dict=feed_dict_tr)
        if i % int(data.train.num_examples / batch_size) == 0:
            val_loss = session.run(cost, feed_dict=feed_dict_val)
            epoch = int(i / int(data.train.num_examples / batch_size))
            show_progress(epoch, feed_dict_tr, feed_dict_val, val_loss)
            saver.save(session, '/home/nabeel/tf-realsense-gesture/')
    total_iterations += num_iteration

train(num_iteration=6000)
Since you are facing an out-of-memory issue with CNNs, you can try the steps below:
1. Increase the strides of the convolutional layers, i.e. instead of using Sh = 1 and Sw = 1, use Sh = 2 and Sw = 2. This reduces the spatial dimensions of the feature maps and hence the RAM consumption. Code for the same is shown below:
layer = tf.nn.conv2d(input=input, filter=weights, strides=[1, 2, 2, 1], padding='SAME')
2. Verify whether you really require 7 convolutional layers. You can try fewer convolutional layers (4, 5, or 6) and check the performance, since every convolutional layer with its set of filters increases memory usage.
3. Replace tf.float32 with tf.float16 and check whether it works without any error.
4. Use an Inception module instead of a plain convolutional layer (a minimal sketch follows this list).
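For point 4, a minimal sketch of what an Inception-style block could look like in the same TF1 style, reusing the create_weights and create_biases helpers from your train.py; the branch filter counts are illustrative, not tuned:

def create_inception_module(input, num_input_channels, num_filters):
    # Parallel 1x1, 3x3 and 5x5 convolutions, concatenated along the channel axis
    branches = []
    for fsize in (1, 3, 5):
        weights = create_weights(shape=[fsize, fsize, num_input_channels, num_filters])
        biases = create_biases(num_filters)
        branch = tf.nn.conv2d(input=input, filter=weights, strides=[1, 1, 1, 1], padding='SAME')
        branches.append(tf.nn.relu(branch + biases))
    # Output has 3 * num_filters channels
    return tf.concat(branches, axis=3)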
I have written tensorflow code based on:
http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/
but using precomputed word embeddings from the GoogleNews word2vec 300-dimensional model.
I created my own data from the UCML News Aggregator Dataset, in which I parsed the content of the news articles and created my own labels.
Due to the size of the articles, I use TF-IDF to select the top 120 words per article and embed those in 300 dimensions.
When I run the CNN I created, it converges to a low overall accuracy of around 38%, regardless of the hyperparameters.
Hyperparameters changed:
Filter sizes:
I've tried a single filter of 1, 2, and 3, and combinations of filters [3,4,5] and [1,3,4].
Learning rate:
I've varied this from very low to very high; very low doesn't even converge to 38%, but anything between 0.0001 and 0.4 does.
Batch size:
Tried many values between 5 and 100.
Weight and bias initialization:
Set the stddev of the weights between 0.4 and 0.01.
Set bias initial values between 0 and 0.1.
Tried using the Xavier initializer for the conv2d weights.
Dataset size:
I have only tried two partial data sets, one with 15,000 training examples and the other with the 5,000 test examples. In total I have 263,000 examples to train on. There is no accuracy difference whether I train and evaluate on the 15,000 training examples or use the 5,000 test examples as the training data (to save testing time).
I've run successful classifications on the 15,000 / 5,000 split using a feed-forward network with a BoW input (93% accurate), TF-IDF with SVM (92%), and TF-IDF with Naive Bayes (91.5%). So I don't think it is the data.
What does this imply? Is the model just a poor model for this task? Is there an error in my work?
I feel like my do_eval function may be incorrect for evaluating the accuracy/loss over an epoch of the data:
def do_eval(data_set,
            label_set,
            batch_size):
    """
    Runs one evaluation against the full epoch of data.
    data_set: the set of embeddings to eval
    label_set: the set of labels to eval
    """
    # Run one epoch of eval.
    true_count = 0  # Counts the number of correct predictions.
    steps_per_epoch = len(label_set) // batch_size
    num_examples = steps_per_epoch * batch_size
    totalLoss = 0
    # Need to compute eval accuracy
    for evalStep in xrange(steps_per_epoch):
        input_batch, label_batch = nextBatch(data_set, labels_set, batchSize)
        evalAcc, evalLoss = eval_step(input_batch, label_batch)
        true_count += evalAcc * batchSize
        totalLoss += evalLoss
    precision = float(true_count) / num_examples
    print('  Num examples: %d  Num correct: %d  Precision @ 1: %0.04f' % (num_examples, true_count, precision))
    print("Eval Loss: " + str(totalLoss))
The entire model is as follows:
class TextCNN(object):
    """
    A CNN for text classification.
    Uses a convolutional, max-pooling and softmax layer.
    """
    def __init__(
            self, batchSize, numWords, num_classes,
            embedding_size, filter_sizes, num_filters):
        # Set placeholders
        self.input_placeholder = tf.placeholder(tf.float32, [batchSize, numWords, embedding_size, 1])
        self.labels = tf.placeholder(tf.int32, [batchSize, num_classes])
        self.pKeep = tf.placeholder(tf.float32)

        # Inference
        '''
        Ready to build conv layers followed by max pooling layers.
        Each conv layer produces a different shaped output, so we need to loop
        over them, create a layer for each, and then merge the results.
        '''
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                # W: filter matrix
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.01), name='W')
                b = tf.Variable(tf.constant(0.0, shape=[num_filters]), name="b")
                # Valid padding: narrow convolution (no edge padding, so the filter slides over everything)
                # Output size = (input_size (numWords in this case) + 2 * padding (0 in this case) - filter_size) + 1
                conv = tf.nn.conv2d(
                    self.input_placeholder,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity, i.e. add the bias to get Wx + b
                # (where Wx is the conv layer above),
                # then run it through the activation function
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
                # Max-pooling over the outputs.
                # Max-pool to control the output size
                # by taking only the best features determined by the filter.
                # ksize is the size of the window over the input tensor
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, numWords - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                # Each pool outputs a tensor of size
                # [batchSize, 1, 1, num_filters], where num_filters represents
                # the number of features we wanted pooled
                pooled_outputs.append(pooled)

        # Combine all pooled features
        num_filters_total = num_filters * len(filter_sizes)
        # Concat the pool outputs along the 3rd (num_filters / feature size) dimension
        self.h_pool = tf.concat(pooled_outputs, 3)
        # Flatten
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Add dropout to regularize the learning curve / accuracy
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.pKeep)

        # Fully connected output layer
        with tf.name_scope("output"):
            W = tf.Variable(tf.truncated_normal([num_filters_total, num_classes], stddev=0.01), name="W")
            b = tf.Variable(tf.constant(0.0, shape=[num_classes]), name='b')
            self.logits = tf.nn.xw_plus_b(self.h_drop, W, b, name='logits')
            self.predictions = tf.argmax(self.logits, 1, name='predictions')

        # Loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(labels=self.labels, logits=self.logits, name="xentropy")
            self.loss = tf.reduce_mean(losses)

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.labels, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

##################################################################################################################
# Running the training
# Define various parameters for the network
batchSize = 100
numWords = 120
embedding_size = 300
num_classes = 4
filter_sizes = [3, 4, 5]  # slide over a number of words, i.e. 3 words, 4 words etc...
num_filters = 126
maxSteps = 5000
initial_learning_rate = 0.001
dropoutRate = 1

data_set = np.load("/home/kevin/Documents/NSERC_2017/articles/classifyDataSet/TestSmaller_CNN_inputMat_0.npy")
labels_set = np.load("Test_NN_target_smaller.npy")

with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        cnn = TextCNN(batchSize=batchSize,
                      numWords=numWords,
                      num_classes=num_classes,
                      num_filters=num_filters,
                      embedding_size=embedding_size,
                      filter_sizes=filter_sizes)

        # Define the training operation:
        # pick an optimizer, set its learning rate, and tell it what to minimize
        global_step = tf.Variable(0, name='global_step', trainable=False)
        optimizer = tf.train.AdamOptimizer(initial_learning_rate)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # Summaries to save for TensorBoard
        # Set directory
        out_dir = "/home/kevin/Documents/NSERC_2017/articles/classifyDataSet/tf_logs/CNN_Embedding/"
        # Loss and accuracy summaries
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)
        # Train summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary])
        train_summary_dir = out_dir + "train/"
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
        # Test summaries
        test_summary_op = tf.summary.merge([loss_summary, acc_summary])
        test_summary_dir = out_dir + "test/"
        test_summary_write = tf.summary.FileWriter(test_summary_dir, sess.graph)

        # Init all variables
        init = tf.global_variables_initializer()
        sess.run(init)

        ############################################################################################
        def train_step(input_data, labels_data):
            '''
            Single training step
            :param input_data: input
            :param labels_data: labels to train to
            '''
            feed_dict = {
                cnn.input_placeholder: input_data,
                cnn.labels: labels_data,
                cnn.pKeep: dropoutRate
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict=feed_dict)
            train_summary_writer.add_summary(summaries, step)

        ###############################################################################################
        def eval_step(input_data, labels_data, writer=None):
            """
            Evaluates the model on a test set.
            Single step.
            """
            feed_dict = {
                cnn.input_placeholder: input_data,
                cnn.labels: labels_data,
                cnn.pKeep: 1.0
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, test_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            if writer:
                writer.add_summary(summaries, step)
            return accuracy, loss

        ###############################################################################
        def nextBatch(data_set, labels_set, batchSize):
            '''
            Get the next batch of data
            :param data_set: entire training or test data set
            :param labels_set: entire training or test label set
            :param batchSize: batch size
            :return: a batch of the data and its corresponding labels
            '''
            # Generate random row indices for the documents
            rand_index = np.random.choice(data_set.shape[0], size=batchSize)
            # Grab the data to give to the feed dicts
            data_batch, labels_batch = data_set[rand_index, :, :], labels_set[rand_index, :]
            # Resize for tensorflow
            data_batch = data_batch.reshape([data_batch.shape[0], data_batch.shape[1], data_batch.shape[2], 1])
            return data_batch, labels_batch

        ################################################################################
        def do_eval(data_set,
                    label_set,
                    batch_size):
            """
            Runs one evaluation against the full epoch of data.
            data_set: the set of embeddings to eval
            label_set: the set of labels to eval
            """
            # Run one epoch of eval.
            true_count = 0  # Counts the number of correct predictions.
            steps_per_epoch = len(label_set) // batch_size
            num_examples = steps_per_epoch * batch_size
            totalLoss = 0
            # Need to compute eval accuracy
            for evalStep in xrange(steps_per_epoch):
                input_batch, label_batch = nextBatch(data_set, labels_set, batchSize)
                evalAcc, evalLoss = eval_step(input_batch, label_batch)
                true_count += evalAcc * batchSize
                totalLoss += evalLoss
            precision = float(true_count) / num_examples
            print('  Num examples: %d  Num correct: %d  Precision @ 1: %0.04f' % (num_examples, true_count, precision))
            print("Eval Loss: " + str(totalLoss))

        ######################################################################################################
        # Training loop
        for step in range(maxSteps):
            input_batch, label_batch = nextBatch(data_set, labels_set, batchSize)
            train_step(input_batch, label_batch)
            # Evaluate over the entire data set on last eval
            if step % 100 == 0:
                print "On Step : " + str(step) + " of " + str(maxSteps)
                do_eval(data_set, labels_set, batchSize)
The embedding is done before the model:
def createInputEmbeddedMatrix(corpusPath, maxWords, svName):
    # Create a [docNum, words per article, embedding size] matrix to fill
    genDocsPath = "gen_docs_classifyData_smallerTest_TFIDF.npy"
    # corpus = "newsCorpus_word2vec_All_Corpus.mm"
    dictPath = 'news_word2vec_smallerDict.dict'
    tf_idf_path = "news_tfIdf_word2vec_All.tfidf_model"
    gen_docs = np.load(genDocsPath)
    dictionary = gensim.corpora.dictionary.Dictionary.load(dictPath)
    tf_idf = gensim.models.tfidfmodel.TfidfModel.load(tf_idf_path)
    corpus = corpora.MmCorpus(corpusPath)
    numOfDocs = len(corpus)
    embedding_size = 300
    id2embedding = np.load("smallerID2embedding.npy").item()
    # Need to process in batches as it takes up a ton of memory
    step = 5000
    totalSteps = int(np.ceil(numOfDocs / step))
    for i in range(totalSteps):
        # inputMatrix = scipy.sparse.csr_matrix([step,maxWords,embedding_size])
        inputMatrix = np.zeros([step, maxWords, embedding_size])
        start = i * step
        end = start + step
        for docNum in range(start, end):
            print "On docNum " + str(docNum) + " of " + str(numOfDocs)
            # Extract the top N words
            topWords, wordVal = tf_idfTopWords(docNum, gen_docs, dictionary, tf_idf, maxWords)
            # doc = corpus[docNum]
            # Need to track the word index and doc index separately
            # (doc index because of the batch processing)
            wordDex = 0
            docDex = 0
            for wordID in wordVal:
                inputMatrix[docDex, wordDex, :] = id2embedding[wordID]
                wordDex += 1
            docDex += 1
        # Save the batch of input data
        # scipy.sparse.save_npz(svName + "_%d" % i, inputMatrix)
        np.save(svName + "_%d.npy" % i, inputMatrix)
#####################################################################################
Turns out my error was in the creation of the input matrix.
for i in range(totalSteps):
    # inputMatrix = scipy.sparse.csr_matrix([step,maxWords,embedding_size])
    inputMatrix = np.zeros([step, maxWords, embedding_size])
    start = i * step
    end = start + step
    for docNum in range(start, end):
        print "On docNum " + str(docNum) + " of " + str(numOfDocs)
        # Extract the top N words
        topWords, wordVal = tf_idfTopWords(docNum, gen_docs, dictionary, tf_idf, maxWords)
        # doc = corpus[docNum]
        # Need to track the word index and doc index separately
        # (doc index because of the batch processing)
        wordDex = 0
        docDex = 0
        for wordID in wordVal:
            inputMatrix[docDex, wordDex, :] = id2embedding[wordID]
            wordDex += 1
        docDex += 1
docDex should not have been reset to 0 on each iteration of the inner loop; I was effectively overwriting the first row of my input matrix, so all the other rows stayed zeros.
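For reference, a minimal sketch of the corrected batch loop, reusing the names from the function above, with docDex initialized once per batch instead of once per document:

for i in range(totalSteps):
    inputMatrix = np.zeros([step, maxWords, embedding_size])
    start = i * step
    end = start + step
    docDex = 0  # initialize once per batch, not once per document
    for docNum in range(start, end):
        topWords, wordVal = tf_idfTopWords(docNum, gen_docs, dictionary, tf_idf, maxWords)
        for wordDex, wordID in enumerate(wordVal):
            inputMatrix[docDex, wordDex, :] = id2embedding[wordID]
        docDex += 1  # advance to the next row for the next document
    np.save(svName + "_%d.npy" % i, inputMatrix)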
I have stopped training at some point and saved the checkpoint, meta files, etc.
Now when I resume training, I want to start with the optimizer's last running learning rate. Can you provide an example of doing so?
For those coming here (like me) wondering whether the last learning rate is automatically restored: tf.train.exponential_decay doesn't add any Variables to the graph, it only adds the operations necessary to derive the correct current learning rate value given a certain global_step value. This way, you only need to checkpoint the global_step value (which is done by default normally) and, assuming you keep the same initial learning rate, decay steps and decay factor, you'll automatically pick up training where you left it, with the correct learning rate value.
Inspecting the checkpoint won't show any learning_rate variable (or similar), simply because there is no need for any.
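To see that the decayed learning rate is a pure function of global_step, here is a minimal sketch (the step value 42 is arbitrary, standing in for a restored global_step):

import tensorflow as tf

global_step = tf.Variable(42, trainable=False)                # pretend this was just restored
lr = tf.train.exponential_decay(0.15, global_step, 10, 0.96)  # same schedule as at save time

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(lr))  # 0.15 * 0.96 ** (42 / 10); no learning-rate variable involved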
This example code learns to add two numbers:
import tensorflow as tf
import numpy as np
import os

save_ckpt_dir = './add_ckpt'
ckpt_filename = 'add.ckpt'
save_ckpt_path = os.path.join(save_ckpt_dir, ckpt_filename)
if not os.path.isdir(save_ckpt_dir):
    os.mkdir(save_ckpt_dir)

if any(fname.startswith("add.ckpt") for fname in os.listdir(save_ckpt_dir)):  # prefer to load pre-trained net
    load_ckpt_path = save_ckpt_path
else:
    load_ckpt_path = None  # train from scratch

def add_layer(inputs, in_size, out_size, activation_fn=None):
    Weights = tf.Variable(tf.ones([in_size, out_size]), name='Weights')
    biases = tf.Variable(tf.zeros([1, out_size]), name='biases')
    Wx_plus_b = tf.add(tf.matmul(inputs, Weights), biases)
    if activation_fn is None:
        layer_output = Wx_plus_b
    else:
        layer_output = activation_fn(Wx_plus_b)
    return layer_output

def produce_batch(batch_size=256):
    """Loads a single batch of data.

    Args:
        batch_size: The number of exercises in the batch.

    Returns:
        x : column vector of numbers
        y : another column of numbers
        xy_sum : the sum of the columns
    """
    x = np.random.random(size=[batch_size, 1]) * 10
    y = np.random.random(size=[batch_size, 1]) * 10
    xy_sum = x + y
    return x, y, xy_sum

with tf.name_scope("inputs"):
    xs = tf.placeholder(tf.float32, [None, 1])
    ys = tf.placeholder(tf.float32, [None, 1])

with tf.name_scope("correct_labels"):
    xysums = tf.placeholder(tf.float32, [None, 1])

with tf.name_scope("step_and_learning_rate"):
    global_step = tf.Variable(0, trainable=False)
    lr = tf.train.exponential_decay(0.15, global_step, 10, 0.96)  # start lr=0.15, decay every 10 steps with a base of 0.96

with tf.name_scope("graph_body"):
    prediction = add_layer(tf.concat([xs, ys], 1), 2, 1, activation_fn=None)

with tf.name_scope("loss_and_train"):
    # the error between prediction and real data
    loss = tf.reduce_mean(tf.reduce_sum(tf.square(xysums - prediction), reduction_indices=[1]))
    # Passing global_step to minimize() will increment it at each step.
    train_step = tf.train.AdamOptimizer(lr).minimize(loss, global_step=global_step)

with tf.name_scope("init_load_save"):
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(init)
    if load_ckpt_path:
        saver.restore(sess, load_ckpt_path)
    for i in range(1000):
        x, y, xy_sum = produce_batch(256)
        _, global_step_np, loss_np, lr_np = sess.run([train_step, global_step, loss, lr],
                                                     feed_dict={xs: x, ys: y, xysums: xy_sum})
        if global_step_np % 100 == 0:
            print("global step: {}, loss: {}, learning rate: {}".format(global_step_np, loss_np, lr_np))
    saver.save(sess, save_ckpt_path)
If you run it a few times, you will see the learning rate decrease. It also saves the global step. The trick is here:
with tf.name_scope("step_and_learning_rate"):
    global_step = tf.Variable(0, trainable=False)
    lr = tf.train.exponential_decay(0.15, global_step, 10, 0.96)  # start lr=0.15, decay every 10 steps with a base of 0.96
...
train_step = tf.train.AdamOptimizer(lr).minimize(loss, global_step=global_step)
By default, saver.save will save all savable objects (including learning rate and global step). However, if tf.train.Saver is provided with var_list, saver.save will only save the vars included in var_list:
saver = tf.train.Saver(var_list = ..list of vars to save..)
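So if you do pass a var_list, make sure global_step is in it, otherwise the restored model will restart the decay schedule from step 0. A sketch, assuming the global_step variable from the example above:

saver = tf.train.Saver(var_list=tf.trainable_variables() + [global_step])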
sources:
https://www.tensorflow.org/api_docs/python/tf/train/exponential_decay
https://stats.stackexchange.com/questions/200063/tensorflow-adam-optimizer-with-exponential-decay
https://www.tensorflow.org/api_docs/python/tf/train/Saver (see "saveable objects")