Related
I have trained a model in Tensorflow and am having trouble replicating it in PyTorch. The Tensorflow model achieves near 100% accuracy (the task is simple), but the PyTorch model performs at random. I've spent a while trying to figure this out, and can't understand what the problem could be.
The model is trained for the task of binary classification. Given an input utterance describing a quadrant and a (x, y, z) coordinate, the model has to predict if the (x, z) portion of the coordinate is in the quadrant described. For example, if the input text was "quadrant 1" and the coordinate was (0.5, -, 0.5), then the prediction should be true, but if the region was "quadrant 2" with the same coordinate, then the prediction should be false.
I generated some data and trained the model in Tensorflow using this code:
x_data_placeholder = tf.placeholder(tf.float32, [FLAGS.batch_size, 1], name="x_data")
y_data_placeholder = tf.placeholder(tf.float32, [FLAGS.batch_size, 1], name="y_data")
z_data_placeholder = tf.placeholder(tf.float32, [FLAGS.batch_size, 1], name="z_data")
# text and labels placeholders
text_data = tf.placeholder(tf.int32, [FLAGS.batch_size, maxtextlength])
text_lengths = tf.placeholder(tf.int32, [FLAGS.batch_size])
y_labels_placeholder = tf.placeholder(tf.int64, [FLAGS.batch_size])
# encode text and coordinate
embeddings = tf.Variable(tf.random_uniform([100, embedding_size], -1, -1))
rnn_inputs = tf.nn.embedding_lookup(embeddings, text_data)
rnn_layers = [tf.compat.v1.nn.rnn_cell.LSTMCell(size, initializer=tf.compat.v1.keras.initializers.glorot_normal) for size in [256]]
multi_rnn_cell = tf.compat.v1.nn.rnn_cell.MultiRNNCell(rnn_layers, state_is_tuple=True)
text_outputs, text_fstate = tf.compat.v1.nn.dynamic_rnn(cell=multi_rnn_cell,
inputs=rnn_inputs,
dtype=tf.float32, sequence_length=text_lengths)
# have fully connected layers to map them the input coordinates into the same dimension as the LSTM output layer from above
x_output_layer = tf.compat.v1.layers.dense(x_data_placeholder, units=FLAGS.fc_column_size, activation=tf.nn.relu, name='x_coordinate')
y_output_layer = tf.compat.v1.layers.dense(y_data_placeholder, units=FLAGS.fc_column_size, activation=tf.nn.relu, name='y_coordinate')
z_output_layer = tf.compat.v1.layers.dense(z_data_placeholder, units=FLAGS.fc_column_size, activation=tf.nn.relu, name='z_coordinate')
# add the representations
total_output_layer = x_output_layer + y_output_layer + z_output_layer + lstm_output_layer
# make the predictions with two fully connected layers
fc_1 = tf.compat.v1.layers.dense(total_output_layer, units=FLAGS.hidden_layer_size, activation=tf.nn.relu, name='fc_1')
logits = tf.compat.v1.layers.dense(fc_1, units=FLAGS.output_dims, activation=None, name='logits')
# train the model
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_labels_placeholder, logits=logits))
optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate, epsilon=1e-7)
gradients, variables = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, FLAGS.gradient_clip_threshold)
optimize = optimizer.apply_gradients(zip(gradients, variables))
# then it'll be trained with sess.run ...
Now for the PyTorch replication:
class BaselineModel(nn.Module):
def __init__(self):
super(BaselineModel, self).__init__()
self.encode_x = nn.Linear(1, embed_size)
self.encode_y = nn.Linear(1, embed_size)
self.encode_z = nn.Linear(1, embed_size)
self._embeddings = nn.Embedding(vocab_size, self.embedding_table_size)
nn.init.uniform_(self._embeddings.weight, -1.0, 1.0)
self.num_layers = 1
self.rnn = nn.LSTM(self.embedding_table_size, self.hidden_size, batch_first=True)
self.fc_after_text_lstm = nn.Linear(self.hidden_size, 100)
self.fc = nn.Linear(100, 256)
self.fc_final = nn.Linear(256, 2)
self.relu_activation = nn.ReLU()
self.softmax = nn.Softmax(dim=1)
def init_hidden(self, batch_size, device='cuda:0'):
# for LSTM, we need # of layers
h_0 = torch.zeros(1, batch_size, self.hidden_size).to(device)
c_0 = torch.zeros(1, batch_size, self.hidden_size).to(device)
return h_0, c_0
def forward(self, input_text, x_coordinate=None, y_coordinate=None, z_coordinate=None):
x_embed = self.relu_activation(self.encode_x(x_coordinate.cuda().to(torch.float32)).cuda())
y_embed = self.relu_activation(self.encode_y(y_coordinate.cuda().to(torch.float32))).cuda()
z_embed = self.relu_activation(self.encode_z(z_coordinate.cuda().to(torch.float32))).cuda()
embeds = self._embeddings(input_text)
embedding, hidden = self.rnn(embeds, self.hidden)
text_fc = self.relu_activation(self.fc_after_text_lstm(embedding[:, -1]))
representations_so_far_added = torch.sum(torch.stack([text_fc, x_embed, y_embed, z_embed]), dim=0)
pre_final_embedding = self.relu_activation(self.fc(representations_so_far_added))
return self.fc_final(pre_final_embedding )
### training code
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, eps=1e-7)
criterion = nn.CrossEntropyLoss()
for input_text, x_coordinate, y_coordinate, z_coordinate, targets, train_data:
optimizer.zero_grad()
pred = model(input_text, x_coordinate=x_coordinate, y_coordinate=y_coordinate, z_coordinate=z_coordinate)
loss = criterion(pred.float(), targets)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
optimizer.step()
scheduler.step()
# accuracy evaluation code, this is evaluated over the entire epoch
pred_idx = F.log_softmax(pred, dim=1)
target_labels = targets.cpu().int()
pred_labels = torch.argmax(pred_idx, dim=-1).cpu().data.int()
curr_acc = skm.accuracy_score(target_labels, pred_labels)
If anyone can spot any issue with the PyTorch implementation or maybe tell me what could be wrong, that would be much appreciated! I also tried to load the weights of the Tensorflow model into all the appropriate layers, and performance still struggles in PyTorch! Thanks in advance!
EDIT:
I have created a minimally reproducible example, because I still cannot figure out what the problem is. Any help would be still appreciated!
import torch
import torch.nn as nn
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lr = 0.0005
n_epochs = 10
input_dim = 4
hidden_dim = 128
layer_dim = 2
output_dim = 2
batch_size = 50
class FeatureDataSet(torch.utils.data.Dataset):
def __init__(self, x_train, y_train, x_coordinates):
self.x_train = torch.tensor(x_train, dtype=torch.long)
self.y_train = torch.tensor(y_train)
self.x_coordinates = torch.tensor(x_coordinates, dtype=torch.float32)
def __len__(self):
return len(self.y_train)
def __getitem__(self, idx):
return self.x_train[idx], self.y_train[idx], self.x_coordinates[idx]
class RNN(nn.Module):
def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, batch_size):
super().__init__()
self.hidden_dim = hidden_dim
self.layer_dim = layer_dim
# linear layer to encode the coordinate
self.encode_x = nn.Linear(1, hidden_dim).cuda()
self._embeddings = nn.Embedding(40, 100).cuda()
# hidden_dim is 128
# layer_dim is 2
self.lstm = nn.LSTM(100, hidden_dim, layer_dim, batch_first=True).cuda()
self.fc = nn.Linear(2 * hidden_dim, output_dim).cuda()
self.batch_size = batch_size
self.hidden = None
def init_hidden(self, x):
h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)
c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)
return [t.cpu() for t in (h0, c0)]
def forward(self, x, x_coordinate):
#initializing the hidden states
h0, c0 = self.init_hidden(x)
embeds = self._embeddings(x)
out, (hn, cn) = self.lstm(embeds.cuda(), (h0.cuda(), c0.cuda()))
x_embed = F.relu(self.encode_x(x_coordinate.cuda().to(torch.float32)).cuda())
representations_so_far_added = torch.cat([out[:, -1, :], x_embed], dim=1)
out = self.fc(representations_so_far_added)
return out
model = RNN(input_dim, hidden_dim, layer_dim, output_dim, batch_size)
criterion = nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.parameters(), lr=0.001)
print('Start model training')
import sklearn.metrics as skm
import torch.nn.functional as F
x_train = []
x_coordinates = []
y_train = []
for i in range(10000):
# create the data. if x_coordinate > 0 and the sentence says that (represented by [1, 5, 6, 8]), then we should predict positive else negative (if the x_coordinate > 0)
# same applies if the x_coordinate < 0, just that the sentence is now [1, 5, 6, 9]
if np.random.randint(0, 2) == 0:
if np.random.randint(0, 2) == 0:
# x coordinate > 0
x_train.append([1, 5, 6, 8])
x_coordinates.append([round(np.random.uniform(0.01, 1.00, 1)[0], 2)])
y_train.append(1.0)
else:
# x coordinate > 0 negative
x_train.append([1, 5, 6, 8])
x_coordinates.append([round(np.random.uniform(-1.00, 0.00, 1)[0], 2)])
y_train.append(0.0)
else:
if np.random.randint(0, 2) == 0:
# x coordinate < 0
x_train.append([1, 5, 6, 9])
x_coordinates.append([round(np.random.uniform(-1.00, 0.00, 1)[0], 2)])
y_train.append(1.0)
else:
# x coordinate < 0 negative
x_train.append([1, 5, 6, 9])
x_coordinates.append([round(np.random.uniform(0.01, 1.00, 1)[0], 2)])
y_train.append(0.0)
# print a sample of data
print(x_train[:10])
print(y_train[:10])
print(x_coordinates[:10])
# create a dataloader
trainingDataset = FeatureDataSet(x_train=x_train, y_train=y_train, x_coordinates=x_coordinates)
train_loader = torch.utils.data.DataLoader(dataset=trainingDataset, batch_size=batch_size, shuffle=True)
# for each epoch
for epoch in range(1, n_epochs + 1):
acc_all = []
# each batch
for i, (x_batch, y_batch, x_coord_batch) in enumerate(train_loader):
x_batch = x_batch.to(device)
y_batch = y_batch.to(device)
x_coord_batch = x_coord_batch.to(device)
opt.zero_grad()
# pass in the text (x_batch) and coordinate (x_coord_batch)
out = model(x_batch, x_coordinate=x_coord_batch)
loss = criterion(out.float(), y_batch.type(torch.LongTensor).cuda())
loss.backward()
opt.step()
pred_idx = F.log_softmax(out, dim=1)
target_labels = y_batch.cpu().int()
pred_labels = torch.argmax(pred_idx, dim=-1).cpu().data.int()
curr_acc = skm.accuracy_score(target_labels, pred_labels)
acc_all.append(curr_acc)
print(np.mean(acc_all))
I suppose perhaps there are some mistakes in your dataset implementation in the PyTorch version.
I tried your pytorch BaselineModel on both the dataset in your "minimally reproducible example" and my own dataset constructed according to your description, and find that it works fine.
The following is my codes for testing on my own dataset. Note that I add several hyperparameters to the code of BaselineModel to make it run. I got accuracy over 99%.
import random
import torch
import torch.nn as nn
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lr = 0.0005
n_epochs = 100
input_dim = 4
hidden_dim = 128
layer_dim = 2
output_dim = 2
batch_size = 50
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
class FeatureDataSet(torch.utils.data.Dataset):
def __init__(self, x_train, y_train, x_coordinates, y_coordinates, z_coordinates):
self.x_train = torch.tensor(x_train, dtype=torch.long)
self.y_train = torch.tensor(y_train)
self.x_coordinates = torch.tensor(x_coordinates, dtype=torch.float32)
self.y_coordinates = torch.tensor(y_coordinates, dtype=torch.float32)
self.z_coordinates = torch.tensor(z_coordinates, dtype=torch.float32)
def __len__(self):
return len(self.y_train)
def __getitem__(self, idx):
return self.x_train[idx], self.y_train[idx], self.x_coordinates[idx], self.y_coordinates[idx], self.z_coordinates[idx]
class BaselineModel(nn.Module):
def __init__(self):
super(BaselineModel, self).__init__()
vocab_size = 40
self.hidden_size = 100
self.embedding_table_size = self.hidden_size
self.encode_x = nn.Linear(1, self.hidden_size)
self.encode_y = nn.Linear(1, self.hidden_size)
self.encode_z = nn.Linear(1, self.hidden_size)
self._embeddings = nn.Embedding(vocab_size, self.embedding_table_size)
nn.init.uniform_(self._embeddings.weight, -1.0, 1.0)
self.num_layers = 1
self.rnn = nn.LSTM(self.embedding_table_size, self.hidden_size, batch_first=True)
self.fc_after_text_lstm = nn.Linear(self.hidden_size, 100)
self.fc = nn.Linear(100, 256)
self.fc_final = nn.Linear(256, 2)
self.relu_activation = nn.ReLU()
self.softmax = nn.Softmax(dim=1)
self.hidden = self.init_hidden(batch_size)
def init_hidden(self, batch_size, device='cuda:0'):
# for LSTM, we need # of layers
h_0 = torch.zeros(1, batch_size, self.hidden_size).to(device)
c_0 = torch.zeros(1, batch_size, self.hidden_size).to(device)
return h_0, c_0
def forward(self, input_text, x_coordinate=None, y_coordinate=None, z_coordinate=None):
x_embed = self.relu_activation(self.encode_x(x_coordinate.cuda().to(torch.float32)).cuda())
y_embed = self.relu_activation(self.encode_y(y_coordinate.cuda().to(torch.float32))).cuda()
z_embed = self.relu_activation(self.encode_z(z_coordinate.cuda().to(torch.float32))).cuda()
embeds = self._embeddings(input_text)
embedding, hidden = self.rnn(embeds, self.hidden)
text_fc = self.relu_activation(self.fc_after_text_lstm(embedding[:, -1]))
representations_so_far_added = torch.sum(torch.stack([text_fc, x_embed, y_embed, z_embed]), dim=0)
pre_final_embedding = self.relu_activation(self.fc(representations_so_far_added))
return self.fc_final(pre_final_embedding)
# model = RNN(input_dim, hidden_dim, layer_dim, output_dim, batch_size)
model = BaselineModel().cuda()
criterion = nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.parameters(), lr=0.001)
print('Start model training')
import sklearn.metrics as skm
import torch.nn.functional as F
x_train = []
x_coordinates = []
y_coordinates = []
z_coordinates = []
y_train = []
for i in range(10000):
x_coordinate = round(np.random.uniform(-1, 1.00, 1)[0], 2)
y_coordinate = round(np.random.uniform(-1, 1.00, 1)[0], 2)
z_coordinate = round(np.random.uniform(-1, 1.00, 1)[0], 2)
x_coordinates.append([x_coordinate])
y_coordinates.append([y_coordinate])
z_coordinates.append([z_coordinate])
if np.random.randint(0, 2) == 0: # positive example
if x_coordinate <= 0 and z_coordinate <= 0:
x_train.append([1, 5, 6, 8])
elif x_coordinate <= 0 and z_coordinate > 0:
x_train.append([1, 5, 6, 9])
elif x_coordinate > 0 and z_coordinate <= 0:
x_train.append([1, 5, 6, 10])
elif x_coordinate > 0 and z_coordinate > 0:
x_train.append([1, 5, 6, 11])
y_train.append(1.0)
else:
if x_coordinate <= 0 and z_coordinate <= 0:
x_train.append(random.choice([[1, 5, 6, 9], [1, 5, 6, 10], [1, 5, 6, 11]]))
elif x_coordinate <= 0 and z_coordinate > 0:
x_train.append(random.choice([[1, 5, 6, 8], [1, 5, 6, 10], [1, 5, 6, 11]]))
elif x_coordinate > 0 and z_coordinate <= 0:
x_train.append(random.choice([[1, 5, 6, 8], [1, 5, 6, 9], [1, 5, 6, 11]]))
elif x_coordinate > 0 and z_coordinate > 0:
x_train.append(random.choice([[1, 5, 6, 8], [1, 5, 6, 9], [1, 5, 6, 10]]))
y_train.append(0.0)
# print a sample of data
print(x_train[:10])
print(y_train[:10])
print(x_coordinates[:10])
print(y_coordinates[:10])
print(z_coordinates[:10])
# create a dataloader
trainingDataset = FeatureDataSet(x_train=x_train, y_train=y_train, x_coordinates=x_coordinates, y_coordinates=y_coordinates, z_coordinates=z_coordinates)
train_loader = torch.utils.data.DataLoader(dataset=trainingDataset, batch_size=batch_size, shuffle=True)
# for each epoch
loss_meter = AverageMeter()
for epoch in range(1, n_epochs + 1):
acc_all = []
# each batch
loss_meter.reset()
for i, (x_batch, y_batch, x_coord_batch, y_coord_batch, z_coord_batch) in enumerate(train_loader):
x_batch = x_batch.to(device)
y_batch = y_batch.to(device)
x_coord_batch = x_coord_batch.to(device)
y_coord_batch = y_coord_batch.to(device)
z_coord_batch = z_coord_batch.to(device)
opt.zero_grad()
# pass in the text (x_batch) and coordinate (x_coord_batch)
out = model(x_batch, x_coordinate=x_coord_batch, y_coordinate=y_coord_batch, z_coordinate=z_coord_batch)
loss = criterion(out.float(), y_batch.type(torch.LongTensor).cuda())
loss.backward()
opt.step()
pred_idx = F.log_softmax(out, dim=1)
target_labels = y_batch.cpu().int()
pred_labels = torch.argmax(pred_idx, dim=-1).cpu().data.int()
curr_acc = skm.accuracy_score(target_labels, pred_labels)
acc_all.append(curr_acc)
loss_meter.update(loss.item())
print(np.mean(acc_all))
print("loss is %f" % loss_meter.val)
As for the "minimally reproducible example", I think the model RNN doesn't work is quite reasonable, as I have stated in the comments. I suppose that tensorflow can not fit as well, although I have not tried it. Your "minimally reproducible example" may be unrelated to your main problem.
here is the output of AlexNet, I mean it's the output of the third full-connection
I don't know why all the output is zero. And I try to reduce the layer, while cutting the last two layer, the output is all the same in each dimension, after training several steps. Any idea will be appreciated.
Here is the main Inference code and initial value:
import tensorflow as tf
import numpy as np
import os
import csv
import cifar10
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
IMAGE_SIZES = 32
IMAGE_CHANELS = 3
NUM_CLASSES = 10
FIRST_CONV_NUM = 64
SECOND_CONV_NUM = 192
THIRD_CONV_NUM = 384
FOURTH_CONV_NUM = 256
FIFTH_CONV_NUM = 256
MAX_POOL_SIZE = 3
BATCH_SIZE = 100
FIRST_FC_UNIT_NUM = 4096
SECOND_FC_UNIT_NUM = 1000
DROP_OUT_PRO = 0.5
THIRD_FC_UNIT_NUM = NUM_CLASSES
TRAIN_EPOCH = 10
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 10000
NUM_EPOCHS_PER_DECAY = 350.0
LEARNING_RATE_DECAY_FACTOR = 0.1
INITIAL_LEARNING_RATE = 0.01
DISPLAY_STEPS = 5
def leaky_relu(x, alpha=0.0):
return tf.nn.relu(x) - alpha * tf.nn.relu(-x)
def activation(x,alpha=0.0):
if alpha > 0:
return leaky_relu(x,alpha)
else:
return tf.nn.relu(x)
def Alex_Weight(weight_name, weight_shape, weight_stddev, weight_type):
Weight = tf.truncated_normal(shape=weight_shape, stddev=weight_stddev,type=weight_type)
return tf.Variable(initial_value=Weight, trainable=True, name=weight_name)
def Alex_Bias(bias_name, bias_shape, bias_type, bias_init=0.1):
initial = tf.constant(bias_init, shape=bias_shape)
return tf.Variable(initial, trainable=True, dtype=bias_type,name=bias_name)
def Alex_AddActivationSummary(out):
tf.summary.histogram('/activations',out)
tf.summary.scalar('/sparsity',tf.nn.zero_fraction(out))
def Alex_Conv(conv_name, input, weight, bias, strides, alpha=0.1,padding="SAME", activation=activation, act_name="ReLU"):
with tf.name_scope(conv_name):
conv = tf.nn.conv2d(input, weight, [1, strides, strides, 1], padding)
pre_activation = tf.nn.bias_add(conv, bias)
with tf.name_scope(act_name):
conv = activation(pre_activation,alpha=alpha)
return conv
def Alex_Pool(conv, ksize, strides, pool_fuction=tf.nn.max_pool, padding="SAME"):
return pool_fuction(conv, [1, ksize, ksize, 1], [1, strides,strides, 1], padding)
def Alex_Fully_Connect( input, weight, bias, activation=tf.nn.relu, act_name="ReLU"):
with tf.name_scope("Wx_b"):
y = tf.add(tf.matmul(input, weight), bias)
with tf.name_scope(act_name):
fc = activation(y, act_name)
return fc
def Alex_Norm(norm_name, pool):
with tf.name_scope(norm_name):
norm = tf.nn.lrn(pool, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
name=norm_name)
return norm
def Alex_Inference(images):
with tf.name_scope("First_Conv"):
W1 = Alex_Weight("Fist_Conv_Weight", [5, 5, IMAGE_CHANELS, FIRST_CONV_NUM], weight_stddev=0.01,
weight_type=tf.float32)
bias1 = Alex_Bias("First_Conv_Bias", [FIRST_CONV_NUM], tf.float32,bias_init=0.0)
first_conv = Alex_Conv("First_Conv", images, W1, bias1, strides=1, padding="SAME")
Alex_AddActivationSummary(first_conv)
with tf.name_scope('lrn1') as scope:
lrn1 = tf.nn.local_response_normalization(first_conv,
alpha=1e-4,
beta=0.75,
depth_radius=2,
bias=2.0)
with tf.name_scope("First_Pool"):
first_pool = Alex_Pool(lrn1, MAX_POOL_SIZE, strides=2, padding="VALID")
with tf.name_scope("Second_Conv"):
W2 = Alex_Weight("Second_Conv_Weight", [5, 5, FIRST_CONV_NUM, SECOND_CONV_NUM], weight_stddev=0.01,
weight_type=tf.float32)
bias2 = Alex_Bias("Second_Conv_Bias", [SECOND_CONV_NUM], tf.float32,bias_init=1.0)
second_conv = Alex_Conv("Second_Conv", first_pool, W2, bias2, strides=1, padding="SAME")
Alex_AddActivationSummary(second_conv)
with tf.name_scope('lrn2') as scope:
lrn2 = tf.nn.local_response_normalization(second_conv,
alpha=1e-4,
beta=0.75,
depth_radius=2,
bias=2.0)
with tf.name_scope("Second_Pool"):
second_pool = Alex_Pool(lrn2, MAX_POOL_SIZE, strides=2, padding="VALID")
with tf.name_scope("Third_Conv"):
W3 = Alex_Weight("Third_Conv_Weight", [3, 3, SECOND_CONV_NUM, THIRD_CONV_NUM], weight_stddev=0.01,
weight_type=tf.float32)
bias3 = Alex_Bias("Third_Conv_Bias", [THIRD_CONV_NUM], tf.float32,bias_init=0.0)
third_conv = Alex_Conv("Third_Conv", second_pool, W3, bias3, strides=1, padding="SAME")
Alex_AddActivationSummary(third_conv)
with tf.name_scope("Fourth_Conv"):
W4 = Alex_Weight("Fourth_Conv_Weight", [3, 3, THIRD_CONV_NUM, FOURTH_CONV_NUM], weight_stddev=0.01,
weight_type=tf.float32)
bias4 = Alex_Bias("Fourth_Conv_Bias", [FOURTH_CONV_NUM], tf.float32,bias_init=1.0)
fourth_conv = Alex_Conv("Fourth_Conv", third_conv, W4, bias4, strides=1, padding="SAME")
Alex_AddActivationSummary(fourth_conv)
with tf.name_scope("Fifth_Conv"):
W5 = Alex_Weight("Fifth_Conv_Weight", [3, 3, FOURTH_CONV_NUM, FIFTH_CONV_NUM], weight_stddev=0.01,
weight_type=tf.float32)
bias5 = Alex_Bias("Fifth_Conv_Bias", [FIFTH_CONV_NUM], tf.float32,bias_init=1.0)
fifth_conv = Alex_Conv("Fifth_Conv", fourth_conv, W5, bias5, strides=1, padding="SAME")
Alex_AddActivationSummary(fifth_conv)
with tf.name_scope("Third_Pool"):
third_pool = Alex_Pool(fifth_conv, MAX_POOL_SIZE, strides=2, padding="VALID")
with tf.name_scope("Flatten"):
flatten = tf.reshape(third_pool, [BATCH_SIZE, -1])
flatten_dim = flatten.get_shape()[1].value
with tf.name_scope("First_Fully_Connection"):
W = Alex_Weight("Fist_FC_Weight", [flatten_dim, FIRST_FC_UNIT_NUM], weight_stddev=4e-2, weight_type=tf.float32)
bias = Alex_Bias("First_FC_Bias", [FIRST_FC_UNIT_NUM], tf.float32, bias_init=1.0)
fc1 = Alex_Fully_Connect(flatten, W, bias, activation=tf.nn.relu, act_name="ReLU")
Alex_AddActivationSummary(fc1)
with tf.name_scope("Drop_Out_1"):
drop_out_1 = tf.nn.dropout(fc1, DROP_OUT_PRO)
with tf.name_scope("Second_Fully_Connection"):
W = Alex_Weight("Second_FC_Weight", [FIRST_FC_UNIT_NUM, SECOND_FC_UNIT_NUM], weight_stddev=4e-2,
weight_type=tf.float32)
bias = Alex_Bias("Second_FC_Bias", [SECOND_FC_UNIT_NUM], tf.float32, bias_init=1.0)
fc2 = Alex_Fully_Connect(drop_out_1, W, bias, activation=tf.nn.relu, act_name="ReLU")
Alex_AddActivationSummary(fc2)
with tf.name_scope("Drop_Out_2"):
drop_out_2 = tf.nn.dropout(fc2, DROP_OUT_PRO)
with tf.name_scope("Third_Fully_Connection"):
W = Alex_Weight("Third_FC_Weight", [SECOND_FC_UNIT_NUM, THIRD_FC_UNIT_NUM], weight_stddev=1/SECOND_FC_UNIT_NUM,
weight_type=tf.float32)
bias = Alex_Bias("Third_FC_Bias", [THIRD_FC_UNIT_NUM], tf.float32,bias_init=1.0)
fc3 = Alex_Fully_Connect(drop_out_2, W, bias, activation=tf.nn.relu, act_name="ReLU")
Alex_AddActivationSummary(fc3)
return fc3
some value error in my code: ValueError: Variable d_conv2d1/weights/Adam/ does not exist, or was not created with tf.get_variable(). Did you mean to set reuse=None in VarScope?
My tensorflow version is 1.2. The data set is mnist .The code is as follow:
# -*- coding: utf-8 -*-
import os
import numpy as np
import scipy.misc
import tensorflow as tf
from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm
from tensorflow.examples.tutorials.mnist import input_data
import string
BATCH_SIZE = 64
def read_data():
#
mnist = input_data.read_data_sets("MNIST_data/", one_hot = True)
(x_train, y_train), (x_test, y_test) = tf.contrib.keras.datasets.mnist.load_data()
x_train = x_train.reshape((60000,784))
x_test = x_test.reshape((10000,784))
y_train_vec = np.zeros((len(y_train), 10), dtype=np.float)
for i, label in enumerate(y_train):
y_train_vec[i, int(y_train[i])] = 1.0
y_test_vec = np.zeros((len(y_test), 10), dtype=np.float)
for i, label in enumerate(y_test):
y_test_vec[i, int(y_test[i])] = 1.0
#
#
X = np.concatenate((x_train, x_test), axis=0)
y = np.concatenate((y_train_vec, y_test_vec), axis=0)
#
seed = 547
np.random.seed(seed)
np.random.shuffle(X)
np.random.seed(seed)
np.random.shuffle(y)
return X/255., y
# 常数偏置
def bias(name, shape, bias_start = 0.0, trainable = True):
dtype = tf.float32
var = tf.get_variable(name, shape, tf.float32, trainable = trainable,
initializer = tf.constant_initializer(
bias_start, dtype = dtype))
return var
# 随机权重
def weight(name, shape, stddev = 0.02, trainable = True):
dtype = tf.float32
var = tf.get_variable(name, shape, tf.float32, trainable = trainable,
initializer = tf.random_normal_initializer(
stddev = stddev, dtype = dtype))
return var
# 全连接层
def fully_connected(value, output_shape, name = 'fully_connected', with_w = False):
shape = value.get_shape().as_list()
with tf.variable_scope(name):
weights = weight('weights', [shape[1], output_shape], 0.02)
biases = bias('biases', [output_shape], 0.0)
if with_w:
return tf.matmul(value, weights) + biases, weights, biases
else:
return tf.matmul(value, weights) + biases
# Leaky-ReLu 层
def lrelu(x, leak=0.2, name = 'lrelu'):
with tf.variable_scope(name):
return tf.maximum(x, leak*x, name = name)
# ReLu 层
def relu(value, name = 'relu'):
with tf.variable_scope(name):
return tf.nn.relu(value)
# 解卷积层
def deconv2d(value, output_shape, k_h = 5, k_w = 5, strides =[1, 2, 2, 1],
name = 'deconv2d', with_w = False):
with tf.variable_scope(name):
weights = weight('weights',
[k_h, k_w, output_shape[-1], value.get_shape()[-1]])
deconv = tf.nn.conv2d_transpose(value, weights,
output_shape, strides = strides)
biases = bias('biases', [output_shape[-1]])
deconv = tf.reshape(tf.nn.bias_add(deconv, biases), deconv.get_shape())
if with_w:
return deconv, weights, biases
else:
return deconv
# 卷积层
def conv2d(value, output_dim, k_h = 5, k_w = 5,
strides =[1, 2, 2, 1], name = 'conv2d'):
with tf.variable_scope(name):
weights = weight('weights',
[k_h, k_w, value.get_shape()[-1], output_dim])
conv = tf.nn.conv2d(value, weights, strides = strides, padding = 'SAME')
biases = bias('biases', [output_dim])
conv = tf.reshape(tf.nn.bias_add(conv, biases), conv.get_shape())
return conv
# 把约束条件串联到 feature map
def conv_cond_concat(value, cond, name = 'concat'):
# 把张量的维度形状转化成 Python 的 list
value_shapes = value.get_shape().as_list()
cond_shapes = cond.get_shape().as_list()
# 在第三个维度上(feature map 维度上)把条件和输入串联起来,
# 条件会被预先设为四维张量的形式,假设输入为 [64, 32, 32, 32] 维的张量,
# 条件为 [64, 32, 32, 10] 维的张量,那么输出就是一个 [64, 32, 32, 42] 维张量
with tf.variable_scope(name):
return tf.concat( [value, cond * tf.ones(value_shapes[0:3] + cond_shapes[3:])],3)
# Batch Normalization 层
def batch_norm_layer(value, is_train = True, name = 'batch_norm'):
with tf.variable_scope(name) as scope:
if is_train:
return batch_norm(value, decay = 0.9, epsilon = 1e-5, scale = True,
is_training = is_train,
updates_collections = None, scope = scope)
else:
return batch_norm(value, decay = 0.9, epsilon = 1e-5, scale = True,
is_training = is_train, reuse = True,
updates_collections = None, scope = scope)
# 保存图片函数
def save_images(images, size, path):
"""
Save the samples images
The best size number is
int(max(sqrt(image.shape[0]),sqrt(image.shape[1]))) + 1
example:
The batch_size is 64, then the size is recommended [8, 8]
The batch_size is 32, then the size is recommended [6, 6]
"""
# 图片归一化,主要用于生成器输出是 tanh 形式的归一化
img = (images + 1.0) / 2.0
h, w = img.shape[1], img.shape[2]
# 产生一个大画布,用来保存生成的 batch_size 个图像
merge_img = np.zeros((h * size[0], w * size[1], 3))
# 循环使得画布特定地方值为某一幅图像的值
for idx, image in enumerate(images):
i = idx % size[1]
j = idx // size[1]
merge_img[j*h:j*h+h, i*w:i*w+w, :] = image
# 保存画布
return scipy.misc.imsave(path, merge_img)
# 定义生成器
def generator(z, y, train = True):
# y 是一个 [BATCH_SIZE, 10] 维的向量,把 y 转成四维张量
yb = tf.reshape(y, [BATCH_SIZE, 1, 1, 10], name = 'yb')
# 把 y 作为约束条件和 z 拼接起来
z = tf.concat([z, y], 1, name = 'z_concat_y')
# 经过一个全连接,BN 和激活层 ReLu
h1 = tf.nn.relu(batch_norm_layer(fully_connected(z, 1024, 'g_fully_connected1'),
is_train = train, name = 'g_bn1'))
# 把约束条件和上一层拼接起来
h1 = tf.concat([h1, y], 1, name = 'active1_concat_y')
h2 = tf.nn.relu(batch_norm_layer(fully_connected(h1, 128 * 49, 'g_fully_connected2'),
is_train = train, name = 'g_bn2'))
h2 = tf.reshape(h2, [64, 7, 7, 128], name = 'h2_reshape')
# 把约束条件和上一层拼接起来
h2 = conv_cond_concat(h2, yb, name = 'active2_concat_y')
h3 = tf.nn.relu(batch_norm_layer(deconv2d(h2, [64,14,14,128],
name = 'g_deconv2d3'),
is_train = train, name = 'g_bn3'))
h3 = conv_cond_concat(h3, yb, name = 'active3_concat_y')
# 经过一个 sigmoid 函数把值归一化为 0~1 之间,
h4 = tf.nn.sigmoid(deconv2d(h3, [64, 28, 28, 1],
name = 'g_deconv2d4'), name = 'generate_image')
return h4
# 定义判别器
def discriminator(image, y, reuse = False):
# 因为真实数据和生成数据都要经过判别器,所以需要指定 reuse 是否可用
if reuse:
tf.get_variable_scope().reuse_variables()
# 同生成器一样,判别器也需要把约束条件串联进来
yb = tf.reshape(y, [BATCH_SIZE, 1, 1, 10], name = 'yb')
x = conv_cond_concat(image, yb, name = 'image_concat_y')
# 卷积,激活,串联条件。
h1 = lrelu(conv2d(x, 11, name = 'd_conv2d1'), name = 'lrelu1')
h1 = conv_cond_concat(h1, yb, name = 'h1_concat_yb')
h2 = lrelu(batch_norm_layer(conv2d(h1, 74, name = 'd_conv2d2'),
name = 'd_bn2'), name = 'lrelu2')
h2 = tf.reshape(h2, [BATCH_SIZE, -1], name = 'reshape_lrelu2_to_2d')
h2 = tf.concat( [h2, y],1, name = 'lrelu2_concat_y')
h3 = lrelu(batch_norm_layer(fully_connected(h2, 1024, name = 'd_fully_connected3'),
name = 'd_bn3'), name = 'lrelu3')
h3 = tf.concat([h3, y], 1,name = 'lrelu3_concat_y')
# 全连接层,输出以为 loss 值
h4 = fully_connected(h3, 1, name = 'd_result_withouts_sigmoid')
return tf.nn.sigmoid(h4, name = 'discriminator_result_with_sigmoid'), h4
# 定义训练过程中的采样函数
def sampler(z, y, train = True):
tf.get_variable_scope().reuse_variables()
return generator(z, y, train = train)
def train():
# 设置 global_step ,用来记录训练过程中的 step
global_step = tf.Variable(0, name = 'global_step', trainable = False)
# 训练过程中的日志保存文件
train_dir = '/home/Zhoupinyi/DCGAN/logs'
# 放置三个 placeholder,y 表示约束条件,images 表示送入判别器的图片,
# z 表示随机噪声
y= tf.placeholder(tf.float32, [BATCH_SIZE, 10], name='y')
images = tf.placeholder(tf.float32, [64, 28, 28, 1], name='real_images')
z = tf.placeholder(tf.float32, [None, 100], name='z')
# 由生成器生成图像 G
G = generator(z, y)
# 真实图像送入判别器
D, D_logits = discriminator(images, y)
# 采样器采样图像
samples = sampler(z, y)
# 生成图像送入判别器
D_, D_logits_ = discriminator(G, y, reuse = True)
# 损失计算
d_loss_real = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( labels = tf.ones_like(D),logits = D_logits))
d_loss_fake = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels = tf.zeros_like(D_), logits = D_logits_))
d_loss = d_loss_real + d_loss_fake
g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels = tf.ones_like(D_), logits = D_logits_))
# 总结操作
z_sum = tf.summary.histogram("z", z)
d_sum = tf.summary.histogram("d", D)
d__sum = tf.summary.histogram("d_", D_)
G_sum = tf.summary.image("G", G)
d_loss_real_sum = tf.summary.scalar("d_loss_real", d_loss_real)
d_loss_fake_sum = tf.summary.scalar("d_loss_fake", d_loss_fake)
d_loss_sum = tf.summary.scalar("d_loss", d_loss)
g_loss_sum = tf.summary.scalar("g_loss", g_loss)
# 合并各自的总结
g_sum = tf.summary.merge([z_sum, d__sum, G_sum, d_loss_fake_sum, g_loss_sum])
d_sum = tf.summary.merge([z_sum, d_sum, d_loss_real_sum, d_loss_sum])
# 生成器和判别器要更新的变量,用于 tf.train.Optimizer 的 var_list
t_vars = tf.trainable_variables()
d_vars = [var for var in t_vars if 'd_' in var.name]
g_vars = [var for var in t_vars if 'g_' in var.name]
saver = tf.train.Saver()
# 优化算法采用 Adam
optimizer = tf.train.AdamOptimizer(learning_rate = 0.0002, beta1 = 0.5)
d_optim = optimizer.minimize(d_loss, global_step = global_step, var_list = d_vars)
g_optim = optimizer.minimize(g_loss, global_step = global_step, var_list = g_vars)
os.environ['CUDA_VISIBLE_DEVICES'] = str(0)
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.2
sess = tf.InteractiveSession(config=config)
init = tf.initialize_all_variables()
writer = tf.train.SummaryWriter(train_dir, sess.graph)
# 这个自己理解吧
data_x, data_y = read_data()
sample_z = np.random.uniform(-1, 1, size=(BATCH_SIZE, 100))
# sample_images = data_x[0: 64]
sample_labels = data_y[0: 64]
sess.run(init)
# 循环 25 个 epoch 训练网络
for epoch in range(25):
batch_idxs = 1093
for idx in range(batch_idxs):
batch_images = data_x[idx*64: (idx+1)*64]
batch_labels = data_y[idx*64: (idx+1)*64]
batch_z = np.random.uniform(-1, 1, size=(BATCH_SIZE, 100))
# 更新 D 的参数
_, summary_str = sess.run([d_optim, d_sum],
feed_dict = {images: batch_images,
z: batch_z,
y: batch_labels})
writer.add_summary(summary_str, idx+1)
# 更新 G 的参数
_, summary_str = sess.run([g_optim, g_sum],
feed_dict = {z: batch_z,
y: batch_labels})
writer.add_summary(summary_str, idx+1)
# 更新两次 G 的参数确保网络的稳定
_, summary_str = sess.run([g_optim, g_sum],
feed_dict = {z: batch_z,
y: batch_labels})
writer.add_summary(summary_str, idx+1)
# 计算训练过程中的损失,打印出来
errD_fake = d_loss_fake.eval({z: batch_z, y: batch_labels})
errD_real = d_loss_real.eval({images: batch_images, y: batch_labels})
errG = g_loss.eval({z: batch_z, y: batch_labels})
if idx % 20 == 0:
print("Epoch: [%2d] [%4d/%4d] d_loss: %.8f, g_loss: %.8f" \
% (epoch, idx, batch_idxs, errD_fake+errD_real, errG))
# 训练过程中,用采样器采样,并且保存采样的图片到
# /home/your_name/TensorFlow/DCGAN/samples/
if idx % 100 == 1:
sample = sess.run(samples, feed_dict = {z: sample_z, y: sample_labels})
samples_path = '/home/your_name/TensorFlow/DCGAN/samples/'
save_images(sample, [8, 8],
samples_path + 'test_%d_epoch_%d.png' % (epoch, idx))
print 'save down'
# 每过 500 次迭代,保存一次模型
if idx % 500 == 2:
checkpoint_path = os.path.join(train_dir, 'DCGAN_model.ckpt')
saver.save(sess, checkpoint_path, global_step = idx+1)
sess.close()
if __name__ == '__main__':
train()
The error information is as follow:
Traceback (most recent call last):
File "/home/zhoupinbyi/DCGAN/test_gan.py", line 370, in <module>
train()
File "/home/zhoupinbyi/DCGAN/test_gan.py", line 297, in train
d_optim = optimizer.minimize(d_loss, global_step = global_step, var_list = d_vars)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 325, in minimize
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 446, in apply_gradients
self._create_slots([_get_variable_for(v) for v in var_list])
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/adam.py", line 128, in _create_slots
self._zeros_slot(v, "m", self._name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 766, in _zeros_slot
named_slots[_var_key(var)] = slot_creator.create_zeros_slot(var, op_name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/slot_creator.py", line 174, in create_zeros_slot
colocate_with_primary=colocate_with_primary)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/slot_creator.py", line 146, in create_slot_with_initializer
dtype)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/slot_creator.py", line 66, in _create_slot_var
validate_shape=validate_shape)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/variable_scope.py", line 1065, in get_variable
use_resource=use_resource, custom_getter=custom_getter)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/variable_scope.py", line 962, in get_variable
use_resource=use_resource, custom_getter=custom_getter)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/variable_scope.py", line 367, in get_variable
validate_shape=validate_shape, use_resource=use_resource)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/variable_scope.py", line 352, in _true_getter
use_resource=use_resource)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/variable_scope.py", line 682, in _get_single_variable
"VarScope?" % name)
ValueError: Variable d_conv2d1/weights/Adam/ does not exist, or was not created with tf.get_variable(). Did you mean to set reuse=None in VarScope?
the code can run in tensorflow 0.11, but in 1.2, you must add a code :
with tf.variable_scope(tf.get_variable_scope(), reuse=None):
above what you use your generator and discrimator function
I am building a binary classifier with my own image pipeline. The images are 178x65 (wxh) greyscale jpgs. It loads the images and their labels correctly, and fits all of the layer sizes correctly, but for some reason, it always outputs the same prediction for every image ([ 1. 0.]). Can anyone point me in the right direction?
# Example on how to use the tensorflow input pipelines. The explanation can be found here ischlag.github.io.
import tensorflow as tf
import random
from tensorflow.python.framework import ops
from tensorflow.python.framework import dtypes
dataset_path = "./v5-tensorflow/"
test_labels_file = "test-labels.csv"
train_labels_file = "train-labels.csv"
test_set_size = 39
IMAGE_HEIGHT = 65
IMAGE_WIDTH = 178
NUM_CHANNELS = 1
BATCH_SIZE = 39
def encode_label(label):
if label == "kalli\n":
return [0, 1]
elif label == "francisco\n":
return [1, 0]
else:
print("SOMETHING WENT WRONG")
return [0, 0]
def read_label_file(file):
f = open(file, "r")
filepaths = []
labels = []
for line in f:
filepath, label = line.split(",")
filepaths.append(filepath)
labels.append(encode_label(label))
return filepaths, labels
# reading labels and file path
train_filepaths, train_labels = read_label_file(dataset_path + train_labels_file)
test_filepaths, test_labels = read_label_file(dataset_path + test_labels_file)
# transform relative path into full path
train_filepaths = [ dataset_path + fp for fp in train_filepaths]
test_filepaths = [ dataset_path + fp for fp in test_filepaths]
# convert string into tensors
train_images = ops.convert_to_tensor(train_filepaths, dtype=dtypes.string)
test_images = ops.convert_to_tensor(test_filepaths, dtype=dtypes.string)
train_labels = ops.convert_to_tensor(train_labels, dtype=dtypes.int32)
test_labels = ops.convert_to_tensor(test_labels, dtype=dtypes.int32)
# create input queues
train_input_queue = tf.train.slice_input_producer(
[train_images, train_labels],
shuffle=True)
test_input_queue = tf.train.slice_input_producer(
[test_images, test_labels],
shuffle=True)
# process path and string tensor into an image and a label
file_content = tf.read_file(train_input_queue[0])
train_image = tf.image.decode_jpeg(file_content, channels=NUM_CHANNELS)
train_label = train_input_queue[1]
file_content = tf.read_file(test_input_queue[0])
test_image = tf.image.decode_jpeg(file_content, channels=NUM_CHANNELS)
test_label = test_input_queue[1]
# define tensor shape
train_image.set_shape([IMAGE_HEIGHT, IMAGE_WIDTH, NUM_CHANNELS])
test_image.set_shape([IMAGE_HEIGHT, IMAGE_WIDTH, NUM_CHANNELS])
# collect batches of images before processing
# train_image_batch, train_label_batch
train_batch = tf.train.batch(
[train_image, train_label],
batch_size=BATCH_SIZE
#,num_threads=1
)
# test_image_batch, test_label_batch
test_batch = tf.train.batch(
[test_image, test_label],
batch_size=BATCH_SIZE
#,num_threads=1
)
x = tf.placeholder(tf.float32, shape=[None, 65, 178, 1], name="x")
y_ = tf.placeholder(tf.float32, shape=[None, 2], name="y_")
#
def weight_variable(shape, n):
initial = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial, name=n)
def bias_variable(shape, n):
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial, name=n)
def conv2d(x, W):
return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
def max_pool_2x2(x):
return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1], padding='SAME')
#
input_reshape = tf.reshape(x, [-1,65,178,1])
W_conv1 = weight_variable([5, 5, 1, 32], "w1")
b_conv1 = bias_variable([32], "b1")
h_conv1 = tf.nn.relu(conv2d(input_reshape, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1) # 33x89x32
#
W_conv2 = weight_variable([5, 5, 32, 64], "w2")
b_conv2 = bias_variable([64], "b2")
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2) # now 17x45x64
W_conv3 = weight_variable([5, 5, 64, 128], "w3")
b_conv3 = bias_variable([128], "b3")
h_conv3 = tf.nn.relu(conv2d(h_pool2, W_conv3) + b_conv3)
h_pool3 = max_pool_2x2(h_conv3)# 9x23x128
W_conv4 = weight_variable([5, 5, 128, 256], "w4")
b_conv4 = bias_variable([256], "b4")
h_conv4 = tf.nn.relu(conv2d(h_pool3, W_conv4) + b_conv4)
h_pool4 = max_pool_2x2(h_conv4) # 5x12x256
W_fc1 = weight_variable([5 * 12 * 256, 5120], "w5")
b_fc1 = bias_variable([5120], "b5")
h_pool4_flat = tf.reshape(h_pool4, [-1, 5*12*256])
h_fc1 = tf.nn.relu(tf.matmul(h_pool4_flat, W_fc1) + b_fc1) # 1x5120
W_fc2 = weight_variable([5120, 2], "w6")
b_fc2 = bias_variable([2], "b6") # 1x2
y_conv = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2, name="softmax")
y_clipped = tf.clip_by_value(y_conv, 1e-10, 0.9999999)
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_clipped), reduction_indices=[1]))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_clipped, 1), tf.argmax(y_, 1), name="correct")
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
saver = tf.train.Saver()
print("input pipeline ready")
with tf.Session() as sess:
# initialize the variables
sess.run(tf.global_variables_initializer())
# initialize the queue threads to start to shovel data
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
print("from the train set:")
for i in range(62):
batch = sess.run(train_batch)
if i % 3 == 0:
train_accuracy = accuracy.eval(feed_dict={
x:batch[0], y_: batch[1]
})
print("step %d, training accuracy %.2f" % (i, train_accuracy))
train_step.run(feed_dict={x: batch[0], y_: batch[1]})
saver.save(sess, "model.ckpt")
tf.train.write_graph(sess.graph_def, '', 'graph.pb')
t_batch = sess.run(test_batch)
print("test accuracy %g" % accuracy.eval(feed_dict={
x: t_batch[0], y_: t_batch[1]
}))
# stop our queue threads and properly close the session
coord.request_stop()
coord.join(threads)
sess.close()
I have a rather large network, and my GPU is running out of memory. It isn't a bug in any of the code, the network itself is simply too large to fit into memory. I've even tried the GPU config recommendations here.
For example, I've tried both of the gpu_options below...
gpu_options = tf.GPUOptions()
config = tf.ConfigProto(gpu_options=gpu_options)
config.gpu_options.allow_growth = True
# config.optimizer_options.opt_level = 2
# config.graph_options.enable_recv_scheduling = True
# config.graph_options.build_cost_model = 1
config.gpu_options.per_process_gpu_memory_fraction = 0.1
But I'm still running out of memory. I have been informed by GitHub user #girving here that Tensorflow doesn't handle Memory overflow (which makes no sense to me why they wouldn't implement this).
However, he also claimed there are workarounds. I can't find any support of anyone who has had to implement a workaround. Can anyone point me in the right direction? Can I implement Queuing somehow?
For reference, here is some code... the program runs out of memory at the time of sess(init)
#Kendall Weihe
#This is a CNN that handles 3D data
#Adjust network parameters below, also adjust data directory
import tensorflow as tf
import pdb
import numpy as np
from numpy import genfromtxt
from PIL import Image
from tensorflow.python.ops import rnn, rnn_cell
from tensorflow.contrib.grid_rnn.python.ops import grid_rnn_cell
from tensorflow.tensorflow.scroll import scroll_data
# Parameters
learning_rate = 0.001
training_iters = 1000000
batch_size = 1
display_step = 1
# Network Parameters
n_images = 100
n_input_x = 396 # Input image x-dimension
n_input_y = 396 # Input image y-dimension
n_input_z = 5
n_hidden = 128
n_classes = 2 # Binary classification -- on a surface or not
n_output = n_input_x * n_classes
dropout = 0.75 # Dropout, probability to keep units
# tf Graph input
x = tf.placeholder(tf.float32, [None, n_input_z, n_input_x, n_input_y])
y = tf.placeholder(tf.float32, [None, n_input_z, n_input_x, n_input_y, n_classes], name="ground_truth")
keep_prob = tf.placeholder(tf.float32) #dropout (keep probability)
def input_data():
data = np.empty((n_images, n_input_x, n_input_y))
temp = []
for i in range(n_images):
filename = "/home/volcart/Documents/Data/input_crops/cropped00" + str(i) + ".tif"
im = Image.open(path)
imarray = np.array(im)
temp.append(imarray)
for i in range(n_images):
for j in range(n_input_x):
for k in range(n_input_y):
data[i][j][k] = temp[i][j][k]
return data
# Create some wrappers for simplicity
def conv3d(x, W, b, strides=1):
# Conv2D wrapper, with bias and relu activation
x = tf.nn.conv3d(x, W, strides=[1, strides, strides, strides, 1], padding='SAME')
x = tf.nn.bias_add(x, b)
return tf.nn.relu(x)
def maxpool3d(x, k=2):
# MaxPool2D wrapper
return tf.nn.max_pool3d(x, ksize=[1, k, k, k, 1], strides=[1, k, k, k, 1],
padding='SAME')
def deconv3d(prev_layer, w, b, output_shape, strides):
# Deconv layer
deconv = tf.nn.conv3d_transpose(prev_layer, w, output_shape=output_shape, strides=strides, padding="VALID")
deconv = tf.nn.bias_add(deconv, b)
deconv = tf.nn.relu(deconv)
return deconv
# Create model
def conv_net(x, weights, biases, dropout):
# Reshape input picture
x = tf.reshape(x, shape=[-1, n_input_z, n_input_x, n_input_y, 1])
with tf.name_scope("conv1") as scope:
# Convolution Layer
conv1 = conv3d(x, weights['wc1'], biases['bc1'])
# Max Pooling (down-sampling)
#conv1 = tf.nn.local_response_normalization(conv1)
conv1 = maxpool3d(conv1, k=2)
# Convolution Layer
with tf.name_scope("conv2") as scope:
conv2 = conv3d(conv1, weights['wc2'], biases['bc2'])
# Max Pooling (down-sampling)
# conv2 = tf.nn.local_response_normalization(conv2)
conv2 = maxpool3d(conv2, k=2)
# Convolution Layer
with tf.name_scope("conv3") as scope:
conv3 = conv3d(conv2, weights['wc3'], biases['bc3'])
# Max Pooling (down-sampling)
# conv3 = tf.nn.local_response_normalization(conv3)
conv3 = maxpool3d(conv3, k=2)
# pdb.set_trace()
temp_batch_size = tf.shape(x)[0] #batch_size shape
with tf.name_scope("deconv1") as scope:
output_shape = [temp_batch_size, 2, n_input_x / 4, n_input_y / 4, 16]
strides = [1,2,2,2,1]
#conv4 = deconv3d(conv3, weights['wdc1'], biases['bdc1'], output_shape, strides)
# conv4 = tf.nn.local_response_normalization(conv4)
conv4 = tf.nn.conv3d_transpose(conv3, weights['wdc1'], output_shape=output_shape, strides=strides, padding="SAME")
conv4 = tf.nn.bias_add(conv4, biases['bdc1'])
conv4 = tf.nn.relu(conv4)
with tf.name_scope("deconv2") as scope:
output_shape = [temp_batch_size, 3, n_input_x / 2, n_input_y / 2, 8]
strides = [1,1,2,2,1]
conv5 = deconv3d(conv4, weights['wdc2'], biases['bdc2'], output_shape, strides)
# conv5 = tf.nn.local_response_normalization(conv5)
with tf.name_scope("deconv3") as scope:
output_shape = [temp_batch_size, n_input_z, n_input_x, n_input_y, 1]
#this time don't use ReLu -- since output layer
conv6 = tf.nn.conv3d_transpose(conv5, weights['wdc3'], output_shape=output_shape, strides=[1,1,2,2,1], padding="VALID")
conv6 = tf.nn.bias_add(conv6, biases['bdc3'])
conv6 = tf.nn.dropout(conv6, dropout)
# conv6 = tf.nn.relu(conv6)
# pdb.set_trace()
x = tf.reshape(conv6, [-1, n_input_x])
x = tf.split(0, n_input_y * n_input_z, x)
lstm_cell = rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True, activation=tf.nn.relu)
# lstm_cell = rnn_cell.MultiRNNCell([lstm_cell] * n_hidden, state_is_tuple=True)
lstm_cell = rnn_cell.DropoutWrapper(lstm_cell, output_keep_prob=0.75)
outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)
output = []
for i in xrange(n_input_y * n_input_z):
output.append(tf.matmul(outputs[i], lstm_weights[i]) + lstm_biases[i])
return output
weights = {
# 5x5 conv, 1 input, 32 outputs
'wc1' : tf.Variable(tf.random_normal([2, 2, 2, 1, 8])),
# 5x5 conv, 32 inputs, 64 outputs
'wc2' : tf.Variable(tf.random_normal([2, 2, 2, 8, 16])),
# 5x5 conv, 32 inputs, 64 outputs
'wc3' : tf.Variable(tf.random_normal([2, 2, 2, 16, 32])),
'wdc1' : tf.Variable(tf.random_normal([2, 2, 2, 16, 32])),
'wdc2' : tf.Variable(tf.random_normal([2, 2, 2, 8, 16])),
'wdc3' : tf.Variable(tf.random_normal([3, 2, 2, 1, 8])),
}
biases = {
'bc1': tf.Variable(tf.random_normal([8])),
'bc2': tf.Variable(tf.random_normal([16])),
'bc3': tf.Variable(tf.random_normal([32])),
'bdc1': tf.Variable(tf.random_normal([16])),
'bdc2': tf.Variable(tf.random_normal([8])),
'bdc3': tf.Variable(tf.random_normal([1])),
}
lstm_weights = {}
lstm_biases = {}
for i in xrange(n_input_y * n_input_z):
lstm_weights[i] = tf.Variable(tf.random_normal([n_hidden, n_output]))
lstm_biases[i] = tf.Variable(tf.random_normal([n_output]))
# Construct model
with tf.name_scope("net") as scope:
print "Building network..."
pred = conv_net(x, weights, biases, keep_prob)
print "Network built!"
# pdb.set_trace()
pred = tf.transpose(tf.pack(pred),[1,0,2])
pred = tf.reshape(pred, [-1, n_input_z, n_input_x, n_input_y, n_classes])
# Reshape for cost function
temp_pred = tf.reshape(pred, [-1, n_classes])
temp_y = tf.reshape(y, [-1, n_classes])
with tf.name_scope("loss") as scope:
# cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(pred, y))
cost = (tf.nn.sigmoid_cross_entropy_with_logits(temp_pred, temp_y))
with tf.name_scope("opt") as scope:
print "Initializing optimizer..."
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
print "optimizer initialized!"
# pdb.set_trace()
# Evaluate model
with tf.name_scope("acc") as scope:
# accuracy is the difference between prediction and ground truth matrices
correct_pred = tf.equal(0,tf.cast(tf.sub(tf.nn.sigmoid(temp_pred),temp_y), tf.int32))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
with tf.name_scope("prediction-node") as scope:
prediction_node = tf.nn.sigmoid(temp_pred)
# Initializing the variables
with tf.name_scope("initialize-and-config") as scope:
print "Initializing variables & configuring..."
init = tf.initialize_all_variables()
saver = tf.train.Saver()
gpu_options = tf.GPUOptions()
config = tf.ConfigProto(gpu_options=gpu_options)
config.gpu_options.allow_growth = True
# config.optimizer_options.opt_level = 2
# config.graph_options.enable_recv_scheduling = True
# config.graph_options.build_cost_model = 1
config.gpu_options.per_process_gpu_memory_fraction = 0.1
print "Variables and configurations initialized!"
# Launch the graph
with tf.Session(config=config) as sess:
print "Initializing session..."
sess.run(init)
print "Session initialized!"
print "Restoring session..."
saver.restore(sess, "/home/volcart/Documents/3D-CNN-2D-LSTM-reg-model/model.ckpt")
print "Session restored!"
tf.get_default_graph().finalize()
# Import data
print "Importing data..."
data = input_data()
print "Data imported!"
# Keep training until reach max iterations
for i in range(n_images):
print "Prediction image number -- " + str(i)
temp = []
for j in range(n_input_z):
temp.append(data[j,:,:])
temp = np.asarray(temp)
temp = temp.reshape((1, n_input_z, n_input_x, n_input_y))
prediction = sess.run(prediction_node, feed_dict={x: temp, keep_prob: 1.0})
prediction = prediction.reshape((n_input_x, n_input_y, n_classes))
temp_arr1 = np.empty((n_input_x, n_input_y))
for i in xrange(n_input_x):
for j in xrange(n_input_y):
if l == 0:
temp_arr1[i][j] = prediction[i][j][0]
csv_file = "/home/volcart/Documents/3D-CNN-2D-LSTM-pred/3D-CNN-2D-LSTM-step-" + str(i) + ".csv"
np.savetxt(csv_file, temp_arr1, delimiter=",")