GAN with batch norm acting very weird, both discriminator and generator get zero loss - tensorflow

I am training a DCGAN model with tensorflow.keras, and I added BatchNormalization layers to both the generator and the discriminator.
I train the GAN with the following steps:
1. Train the discriminator on real images and on images from the generator (using generator.predict).
2. Train the adversarial network (compiled with discriminator.trainable=False).
I found that after a few rounds the training loss returned by train_on_batch() goes to zero for both the generator and the discriminator. But when I use test_on_batch() the loss is still huge for the generator, and the generated images are a mess.
At first I thought it was because in step 2 above, when training the adversarial network, the discriminator's input contains only fake images, so the batch normalization layers see a different distribution than in step 1, where both fake and real images are fed.
But even after I removed all batch normalization layers from the discriminator, the same problem persisted. The problem only disappears when all batch normalization layers are removed from both networks. I also found that the presence of Dropout layers makes no difference. I am wondering why batch normalization can cause such a problem, even in the generator, which is always fed noise drawn from the same distribution.
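To make the train/test discrepancy concrete, here is a minimal sketch (separate from my model code below) of how a BatchNormalization layer behaves differently in training and inference mode: in training mode it normalizes with the current batch statistics, while in inference mode it uses the moving averages it has accumulated so far, which is why train_on_batch() and test_on_batch() can report very different losses.
import numpy as np
import tensorflow as tf

bn = tf.keras.layers.BatchNormalization()
x = np.random.normal(loc=5.0, scale=2.0, size=(32, 8)).astype("float32")

y_train_mode = bn(x, training=True)   # uses this batch's mean/variance
y_test_mode = bn(x, training=False)   # uses the moving averages (still close to 0/1 here)

print(float(tf.math.reduce_std(y_train_mode)))  # ~1.0: batch statistics are applied
print(float(tf.math.reduce_std(y_test_mode)))   # ~2.0: the moving averages have barely moved yet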
# Model definition
class DCGAN_128:
    def __init__(self, hidden_dim):
        generator = M.Sequential()
        generator.add(L.Dense(128 * 8 * 8, input_shape=[hidden_dim]))
        generator.add(L.Reshape([8, 8, 128]))
        generator.add(L.UpSampling2D())  # [8, 8, 128]
        generator.add(L.Conv2D(128, kernel_size=3, padding="same"))  # [16, 16, 128]
        generator.add(L.LayerNormalization())  # 4
        generator.add(L.ReLU())
        generator.add(L.UpSampling2D())  # [32, 32, 128]
        generator.add(L.Conv2D(64, kernel_size=5, padding="same"))  # [32, 32, 64]
        generator.add(L.LayerNormalization())  # 8
        generator.add(L.ReLU())
        generator.add(L.UpSampling2D())  # [64, 64, 128]
        generator.add(L.Conv2D(32, kernel_size=7, padding="same"))  # [64, 64, 32]
        generator.add(L.LayerNormalization())  # 12
        generator.add(L.ReLU())
        generator.add(L.UpSampling2D())  # [128, 128, 32]
        generator.add(L.Conv2D(3, kernel_size=3, padding="same", activation=A.sigmoid))  # [128, 128, 3]

        discriminator = M.Sequential()
        discriminator.add(L.Conv2D(32, kernel_size=5, strides=2, padding="same", input_shape=[128, 128, 3]))
        discriminator.add(L.LeakyReLU())
        # discriminator.add(L.Dropout(0.25))  # [64, 64, 32]
        discriminator.add(L.Conv2D(64, kernel_size=3, strides=2, padding="same"))
        # discriminator.add(L.BatchNormalization(epsilon=1e-5))  # 4
        discriminator.add(L.LeakyReLU())
        # discriminator.add(L.Dropout(0.25))  # [32, 32, 64]
        discriminator.add(L.Conv2D(128, kernel_size=3, strides=2, padding="same"))
        discriminator.add(L.LayerNormalization())  # 8
        discriminator.add(L.LeakyReLU())  # [16, 16, 128]
        discriminator.add(L.Dropout(0.25))
        discriminator.add(L.Conv2D(256, kernel_size=3, strides=2, padding="same"))
        discriminator.add(L.LayerNormalization())  # 12
        discriminator.add(L.LeakyReLU())  # [8, 8, 256]
        discriminator.add(L.Dropout(0.25))
        discriminator.add(L.Conv2D(512, kernel_size=3, strides=2, padding="same"))
        discriminator.add(L.LeakyReLU())  # [4, 4, 512]
        discriminator.add(L.Flatten())
        discriminator.add(L.Dense(1, activation=A.sigmoid))

        self.model_gen = generator
        self.model_dis = discriminator
        self.adv_input = L.Input([hidden_dim])
        self.adv_output = discriminator(generator(self.adv_input))
        self.model_adversarial = M.Model(self.adv_input, self.adv_output)
# Training
hidden_dim = 100
dcgan = DCGAN_128(hidden_dim)
data_loader = AnimeFacesLoader([128, 128])
batch_size = 32
n_rounds = 40000
dis_model = dcgan.model_dis
gen_model = dcgan.model_gen
adv_model = dcgan.model_adversarial
gen_model.summary()
adv_model.summary()
dis_model.compile(Opt.Adam(0.0002), Lo.binary_crossentropy)
dis_model.trainable = False
adv_model.compile(Opt.Adam(0.0002), Lo.binary_crossentropy)
layer_outputs = [layer.output for layer in dis_model.layers]
visual_model = tf.keras.Model(dis_model.input, layer_outputs)
for rounds in range(n_rounds):
    # Get output images
    if rounds % 100 == 0 and rounds > 0:
        noise = np.random.uniform(-1, 1, [16, hidden_dim])
        tiled_images = np.zeros([4*128, 4*128, 3]).astype(np.uint8)
        generated_imgs = gen_model.predict(noise)
        generated_imgs *= 256
        generated_imgs = generated_imgs.astype(np.uint8)
        for i in range(16):
            tiled_images[int(i / 4)*128: int(i / 4)*128 + 128,
                         int(i % 4)*128: int(i % 4)*128 + 128, :] = generated_imgs[i, :, :, :]
        Image.fromarray(tiled_images).save("Output/DCGAN/" + "rounds_{0}.jpg".format(rounds))
        '''
        layer_visualization = visual_model.predict(generated_imgs[:1])
        for i in range(len(layer_visualization)):
            plt.imshow(layer_visualization[i][0, :, :, 0])
            plt.show()
        '''

    # train discriminator on real & fake images
    real_imgs = data_loader.get_batch(batch_size)
    real_ys = np.ones([batch_size, 1])
    noise = np.random.uniform(-1, 1, [batch_size, hidden_dim])
    fake_ys = np.zeros([batch_size, 1])
    fake_imgs = gen_model.predict(noise)
    imgs = np.concatenate([real_imgs, fake_imgs], axis=0)
    ys = np.concatenate([real_ys, fake_ys], axis=0)
    loss_dis = dis_model.train_on_batch(imgs, ys)
    print("Round {}, Loss dis:{:.4f}".format(rounds, loss_dis))
    loss_dis_test = dis_model.test_on_batch(imgs, ys)
    print(loss_dis_test)

    noise = np.random.uniform(-1, 1, [batch_size, hidden_dim])
    fake_ys = np.ones([batch_size, 1])
    loss_gen = adv_model.train_on_batch(noise, fake_ys)
    print("Round {}, Loss gen:{:.4f}".format(rounds, loss_gen))
    loss_gen_test = adv_model.test_on_batch(noise, fake_ys)
    print(loss_gen_test)
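One additional check I could run (a hypothetical sketch, assuming I re-enable a BatchNormalization layer in the discriminator and keep a reference to it as bn_layer): watch how its moving statistics drift between the two training steps, since get_weights() on a BatchNormalization layer returns [gamma, beta, moving_mean, moving_variance] when scaling and centering are enabled.
# Hypothetical check; bn_layer is assumed to reference a BatchNormalization
# layer inside dis_model (not present in the code above).
before = [w.copy() for w in bn_layer.get_weights()]
dis_model.train_on_batch(imgs, ys)         # step 1: mixed real + fake batch
adv_model.train_on_batch(noise, fake_ys)   # step 2: fake-only batch through the frozen discriminator
after = bn_layer.get_weights()
# Indices 2 and 3 are moving_mean and moving_variance; large jumps here mean
# the layer's inference statistics are being pulled around between the two steps.
print(np.abs(after[2] - before[2]).max(), np.abs(after[3] - before[3]).max())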

Related

What is the meaning of the parameter of max_pool2d?

In the slim.max_pool2d calls in the code shown below, I don't know the meaning of the second [2, 2].
Is it the stride size? (The API of max_pool2d is also shown below.) Also, where does the number 1024 in the fully_connected call come from?
def build_graph(top_k):
    # with tf.device('/cpu:0'):
    keep_prob = tf.placeholder(dtype=tf.float32, shape=[], name='keep_prob')
    images = tf.placeholder(dtype=tf.float32, shape=[None, 64, 64, 1], name='image_batch')
    labels = tf.placeholder(dtype=tf.int64, shape=[None], name='label_batch')
    conv_1 = slim.conv2d(images, 64, [3, 3], 1, padding='SAME', scope='conv1')
    max_pool_1 = slim.max_pool2d(conv_1, [2, 2], [2, 2], padding='SAME')
    conv_2 = slim.conv2d(max_pool_1, 128, [3, 3], padding='SAME', scope='conv2')
    max_pool_2 = slim.max_pool2d(conv_2, [2, 2], [2, 2], padding='SAME')
    conv_3 = slim.conv2d(max_pool_2, 256, [3, 3], padding='SAME', scope='conv3')
    max_pool_3 = slim.max_pool2d(conv_3, [2, 2], [2, 2], padding='SAME')
    flatten = slim.flatten(max_pool_3)
    fc1 = slim.fully_connected(slim.dropout(flatten, keep_prob), 1024, activation_fn=tf.nn.tanh, scope='fc1')
    logits = slim.fully_connected(slim.dropout(fc1, keep_prob), FLAGS.charset_size, activation_fn=None, scope='fc2')
    # logits = slim.fully_connected(flatten, FLAGS.charset_size, activation_fn=None, reuse=reuse, scope='fc')
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels))
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(logits, 1), labels), tf.float32))
Here is the API of max_pool2d:
@add_arg_scope
def max_pool2d(inputs,
               kernel_size,
               stride=2,
               padding='VALID',
               data_format=DATA_FORMAT_NHWC,
               outputs_collections=None,
               scope=None):
The second [2, 2] in max_pool2d specifies the strides of the layer along the x-axis and the y-axis respectively.
Also, the number 1024 in the fully_connected call specifies the number of neurons in that layer.
To be clear, the number of neurons in the fully_connected layer is not related to the strides of the max_pool2d layer. The size of a fully_connected layer can be chosen freely, without considering its input size. Typically, it is the output layer size, i.e. the number of classes in the problem, that should be considered when deciding the size of the last fully_connected layers in the network, not the layer's input size.
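To illustrate (a small sketch in tf.keras rather than slim, not from the code above): pooling with a 2x2 window and a [2, 2] stride halves the spatial dimensions at each step, while the width of a dense layer such as 1024 is a free choice.
import tensorflow as tf

x = tf.zeros([1, 64, 64, 1])  # same input size as the graph above
pool = tf.keras.layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2), padding='same')
print(pool(x).shape)              # (1, 32, 32, 1): a stride of 2 halves height and width
print(pool(pool(pool(x))).shape)  # (1, 8, 8, 1): three poolings, 64 -> 32 -> 16 -> 8

dense = tf.keras.layers.Dense(1024)  # 1024 is simply the chosen number of output neurons
print(dense(tf.zeros([1, 8 * 8 * 256])).shape)  # (1, 1024), regardless of the 16384 inputs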

Image regression with CNN

My immediate problem is that all of the various CNN regression models I've tried always return the same (or very similar) values and I'm trying to figure out why. But I would be open to a wide range of suggestions.
My dataset looks like this:
x: 64x64 greyscale images arranged into a 64 x 64 x n ndarray
y: Values between 0 and 1, each corresponding to an image (think of this as some sort of proportion)
weather: 4 weather readings from the time each image was taken (ambient temperature, humidity, dewpoint, air pressure)
The goal is to use the images and weather data to predict y. Since I'm working with images, I thought a CNN would be appropriate (please let me know if there are other strategies here).
From what I understand, CNNs are most often used for classification tasks--it's rather unusual to use them for regression. But in theory, it shouldn't be too different--I just need to change the loss function to MSE/RMSE and the last activation function to linear (although maybe a sigmoid is more appropriate here since y is between 0 and 1).
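To make that concrete, here is a minimal sketch of the change I mean (written in tf.keras rather than tflearn purely for brevity; the layer sizes are placeholders): the only differences from a classifier are the single linear/sigmoid output unit and the MSE loss.
import tensorflow as tf

# Minimal regression-head sketch: everything up to the last layer can stay the
# same as in a classifier; only the output layer and the loss change.
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(64, 64, 1)),
    tf.keras.layers.MaxPool2D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),  # or 'linear'; y is in [0, 1]
])
model.compile(optimizer='adam', loss='mse')  # MSE instead of cross-entropy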
The first hurdle I ran into was figuring out how to incorporate the weather data; the natural choice was to feed it into the first fully connected layer. I found an example here: How to train mix of image and data in CNN using ImageAugmentation in TFlearn
The second hurdle I ran into was determining an architecture. Normally I would just pick a paper and copy its architecture, but I couldn't find anything on CNN image regression. So I tried a (fairly simple) network with 3 convolutional layers and 2 fully connected layers, then I tried VGGNet and AlexNet architectures from https://github.com/tflearn/tflearn/tree/master/examples
Now the problem I'm having is that all of the models I'm trying output the same value, namely the mean y of the training set. Looking at tensorboard, the loss function flattens out fairly quickly (after around 25 epochs). Do you know what's going on here? While I do understand the basics of what each layer is doing, I have no intuition on what makes a good architecture for a particular dataset or task.
Here is an example. I am using VGGNet from the tflearn examples page:
tf.reset_default_graph()
img_aug = ImageAugmentation()
img_aug.add_random_flip_leftright()
img_aug.add_random_flip_updown()
img_aug.add_random_90degrees_rotation(rotations=[0, 1, 2, 3])
convnet = input_data(shape=[None, size, size, 1],
                     data_augmentation=img_aug,
                     name='hive')
weathernet = input_data(shape=[None, 4], name='weather')
convnet = conv_2d(convnet, 64, 3, activation='relu', scope='conv1_1')
convnet = conv_2d(convnet, 64, 3, activation='relu', scope='conv1_2')
convnet = max_pool_2d(convnet, 2, strides=2, name='maxpool1')
convnet = conv_2d(convnet, 128, 3, activation='relu', scope='conv2_1')
convnet = conv_2d(convnet, 128, 3, activation='relu', scope='conv2_2')
convnet = max_pool_2d(convnet, 2, strides=2, name='maxpool2')
convnet = conv_2d(convnet, 256, 3, activation='relu', scope='conv3_1')
convnet = conv_2d(convnet, 256, 3, activation='relu', scope='conv3_2')
convnet = conv_2d(convnet, 256, 3, activation='relu', scope='conv3_3')
convnet = max_pool_2d(convnet, 2, strides=2, name='maxpool3')
convnet = conv_2d(convnet, 512, 3, activation='relu', scope='conv4_1')
convnet = conv_2d(convnet, 512, 3, activation='relu', scope='conv4_2')
convnet = conv_2d(convnet, 512, 3, activation='relu', scope='conv4_3')
convnet = max_pool_2d(convnet, 2, strides=2, name='maxpool4')
convnet = conv_2d(convnet, 512, 3, activation='relu', scope='conv5_1')
convnet = conv_2d(convnet, 512, 3, activation='relu', scope='conv5_2')
convnet = conv_2d(convnet, 512, 3, activation='relu', scope='conv5_3')
convnet = max_pool_2d(convnet, 2, strides=2, name='maxpool5')
convnet = fully_connected(convnet, 4096, activation='relu', scope='fc6')
convnet = merge([convnet, weathernet], 'concat')
convnet = dropout(convnet, .75, name='dropout1')
convnet = fully_connected(convnet, 4096, activation='relu', scope='fc7')
convnet = dropout(convnet, .75, name='dropout2')
convnet = fully_connected(convnet, 1, activation='sigmoid', scope='fc8')
convnet = regression(convnet,
                     optimizer='adam',
                     learning_rate=learning_rate,
                     loss='mean_square',
                     name='targets')
model = tflearn.DNN(convnet,
                    tensorboard_dir='log',
                    tensorboard_verbose=0)
model.fit({'hive': x_train, 'weather': weather_train},
          {'targets': y_train},
          n_epoch=1000,
          batch_size=batch_size,
          validation_set=({'hive': x_val, 'weather': weather_val},
                          {'targets': y_val}),
          show_metric=False,
          shuffle=True,
          run_id='poop')
To clarify what my objects are:
x_train is an ndarray of shape (n, 64, 64, 1)
weather_train is an ndarray of shape (n, 4)
y_train is an ndarray of shape (n, 1)
Overfitting is another concern, but given that the models perform poorly on the training set, I think I can worry about that later.
To address your concern regarding the same predicted value for all instances in your test set: you have a couple of options here that don't involve changing the structure of your conv net:
You can rescale your target variable using sklearn's StandardScaler(), which standardizes features by removing the mean and scaling to unit variance (a short sketch of this and the pixel scaling follows this list).
Scale the pixel data; performance generally improves with scaled pixel data, and as a rule of thumb you can always divide pixel data by 255.0 (shown at the end of this post).
You can play around with the learning rate and the error function (the reason the CNN outputs the same value for all predictions is that this is what it has determined to be the point of minimum error).
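A small sketch of the first two points (placeholder variable names, assuming scikit-learn and NumPy are available):
import numpy as np
from sklearn.preprocessing import StandardScaler

# Pixel scaling: bring image values from [0, 255] into [0, 1].
x_train = x_train.astype(np.float32) / 255.0
x_val = x_val.astype(np.float32) / 255.0

# Target scaling: StandardScaler removes the mean and scales to unit variance.
# Fit on the training targets only, then reuse the fitted scaler.
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train.reshape(-1, 1))
y_val = scaler.transform(y_val.reshape(-1, 1))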
Next, if you are trying to perform regression, ensure that your final fully connected layer uses a linear activation function instead of a sigmoid. A linear activation simply passes through the weighted sum of the neuron's inputs, producing an output proportional to them.
convnet = fully_connected(convnet, 1, activation='linear', scope='fc8')
Lastly, I have recently implemented ResNet50 for regression tasks in Keras. Here is the construction of that network; this version does not permit loading pretrained weights, and it must receive images of shape (224, 224, 3).
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Conv2D, MaxPooling2D, DepthwiseConv2D
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, Input, Add, ZeroPadding2D, GlobalAveragePooling2D, GlobalMaxPooling2D
from keras.models import Model
from keras import backend


def block1(x, filters, kernel_size=3, stride=1, conv_shortcut=True, name=None):
    """
    A residual block
    :param x: input tensor
    :param filters: integer, filters of the bottleneck layer
    :param kernel_size: kernel size of bottleneck
    :param stride: stride of first layer
    :param conv_shortcut: use convolution shortcut if true, otherwise identity shortcut
    :param name: string, block label
    :return: output tensor of the residual block
    """
    # bn_axis = 3 if backend.image_data_format() == 'channels_last' else 1
    bn_axis = -1

    if conv_shortcut is True:
        shortcut = Conv2D(4 * filters, 1, strides=stride, name=name+'_0_conv')(x)
        shortcut = BatchNormalization(axis=bn_axis, epsilon=1.001e-5, name=name+'_0_bn')(shortcut)
    else:
        shortcut = x

    x = Conv2D(filters, 1, strides=stride, name=name+'_1_conv')(x)
    x = BatchNormalization(axis=bn_axis, epsilon=1.001e-5, name=name+'_1_bn')(x)
    x = Activation('relu', name=name+'_1_relu')(x)
    x = Conv2D(filters, kernel_size, padding='SAME', name=name+'_2_conv')(x)
    x = BatchNormalization(axis=bn_axis, epsilon=1.001e-5, name=name+'_2_bn')(x)
    x = Activation('relu', name=name+'_2_relu')(x)
    x = Conv2D(4 * filters, 1, name=name+'_3_conv')(x)
    x = BatchNormalization(axis=bn_axis, epsilon=1.001e-5, name=name+'_3_bn')(x)
    x = Add(name=name+'_add')([shortcut, x])
    x = Activation('relu', name=name+'_out')(x)
    return x


def stack1(x, filters, blocks, stride1=2, name=None):
    """
    A set of stacked residual blocks
    :param x: input tensor
    :param filters: int, filters of the bottleneck layer in the block
    :param blocks: int, number of blocks in the stack
    :param stride1: stride of the first layer in the first block
    :param name: stack label
    :return: output tensor for the stacked blocks
    """
    x = block1(x, filters, stride=stride1, name=name+'_block1')
    for i in range(2, blocks+1):
        x = block1(x, filters, conv_shortcut=False, name=name+'_block'+str(i))
    return x


def resnet(height, width, depth, stack_fn, use_bias=False, nodes=256):
    """
    :param height: height of image, int
    :param width: image width, int
    :param depth: number of image channels, int
    :param stack_fn: function that stacks residual blocks
    :param nodes: width of nodes included in top layer of CNN, int
    :return: a Keras model instance
    """
    input_shape = (height, width, depth)
    img_input = Input(shape=input_shape)

    x = ZeroPadding2D(padding=((3, 3), (3, 3)), name='conv1_pad')(img_input)
    x = Conv2D(64, 7, strides=2, use_bias=use_bias, name='conv1_conv')(x)
    x = ZeroPadding2D(padding=((1, 1), (1, 1)), name='pool1_pad')(x)
    x = MaxPooling2D(3, strides=2, name='pool1_pool')(x)
    x = stack_fn(x)

    # top layer
    x = GlobalAveragePooling2D(name='avg_pool')(x)
    x = Dense(nodes, activation='relu')(x)
    # perform regression
    x = Dense(1, activation='linear')(x)

    model = Model(img_input, x)
    return model


def resnet50(height, width, depth, nodes):
    def stack_fn(x):
        x = stack1(x, 64, 3, stride1=1, name='conv2')
        x = stack1(x, 128, 4, name='conv3')
        x = stack1(x, 256, 6, name='conv4')
        x = stack1(x, 512, 3, name='conv5')
        return x
    return resnet(height, width, depth, stack_fn, nodes=nodes)
This can be used with some x_train, x_test, y_train, y_test data (where x_train/x_test are image data and y_train/y_test are numeric values on the interval [0, 1]).
scaler = MinMaxScaler()
images = load_images(df=target, path=PATH_features, resize_shape=(224, 224), quadruple=True)
images = images / 255.0 # scale pixel data to [0, 1]
images = images.astype(np.float32)
imshape = images.shape
target = target[Target]
target = quadruple_target(target, target=Target)
x_train, x_test, y_train, y_test = train_test_split(images, target, test_size=0.3, random_state=101)
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)
model = resnet50(imshape[1], imshape[2], imshape[3], nodes=256)
opt = Adam(lr=1e-5, decay=1e-5 / 200)
model.compile(loss=lossFN, optimizer=opt)
history = model.fit(x_train, y_train, validation_data=(x_test, y_test), verbose=1, epochs=200)
pred = model.predict(x_test)
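Since the targets were scaled above, the predictions come back on the scaled axis; the fitted scaler can map them back (a one-line sketch, assuming the same scaler object):
pred = scaler.inverse_transform(pred)  # undo the MinMax scaling on the predictions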

Tensorflow logits and labels error, but are same shape

This question has been asked several times already, but I don't seem to be able to adapt previous solutions to my code, so I would appreciate any advice on how to solve this. I have tried using pdb and setting a trace point right before the problem, which didn't give me much information.
I am adapting this tutorial to my problem:
https://www.oreilly.com/ideas/visualizing-convolutional-neural-networks
Data Shape:
x_train.shape: (1161, 68, 68, 1)
x_test.shape: (216, 68, 68, 1)
y_test.shape: (216,)
y_train.shape: (1161,)
Where the error occurs:
#Train the Model
steps = int(x_train.shape[0]/batchSize)
for i in range(numEpochs):
    print(i)
    accHist = []
    accHist2 = []
    #x_train, y_train = imf.shuffle(x_train, y_train)
    for j in range(steps):
        print(j)
        #Calculate our current step
        step = i * steps + j
        #Feed forward batch of train images into graph and log accuracy
        acc = sess.run([accuracy], feed_dict={X: x_train[(j*batchSize):((j+1)*batchSize),:,:,:], Y_: np.array(y_train[(j*batchSize):((j+1)*batchSize)]).reshape(1,30), keepRate1: 1, keepRate2: 1})
        print(accHist)
        accHist.append(acc)
        #Back propagate using adam optimizer to update weights and biases.
        sess.run(train_step, feed_dict={X: x_train[(j*batchSize):((j+1)*batchSize),:,:,:], Y_: np.array(y_train[(j*batchSize):((j+1)*batchSize)]).reshape(1,30), keepRate1: 0.2, keepRate2: 0.5})
        print("success")
    print('Epoch number {} Training Accuracy: {}'.format(i+1, np.mean(accHist)))
    #Feed forward all test images into graph and log accuracy
    for k in range(int(x_test.shape[0]/batchSize)):
        acc = sess.run(accuracy, feed_dict={X: x_test[(k*batchSize):((k+1)*batchSize),:,:,:], Y_: np.array(y_test[(k*batchSize):((k+1)*batchSize)]).reshape(1,30), keepRate1: 1, keepRate2: 1})
        accHist2.append(acc)
    print("Test Set Accuracy: {}".format(np.mean(accHist2)))
I am getting the following error message:
InvalidArgumentError: logits and labels must be same size: logits_size=[30,30] labels_size=[1,30]
[[Node: cross_entropy_7/SoftmaxCrossEntropyWithLogits = SoftmaxCrossEntropyWithLogits[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](cross_entropy_7/Reshape, cross_entropy_7/Reshape_1)]]
Following the tutorial, I thought the logits were set here:
#FULLY CONNECTED 3 & SOFTMAX OUTPUT
with tf.name_scope('softmax') as scope:
    fc2w = tf.Variable(tf.truncated_normal([512, classes], dtype=tf.float32,
                                           stddev=1e-1), name='weights3_2')
    fc2b = tf.Variable(tf.constant(1.0, shape=[classes], dtype=tf.float32),
                       trainable=True, name='biases3_2')
    Ylogits = tf.nn.bias_add(tf.matmul(fc1_drop, fc2w), fc2b)
    Y = tf.nn.softmax(Ylogits)
print(Ylogits.shape) here gives me (?, 30). classes is set to 30, so this seems to make sense.
This seems to be the function that doesn't work, so I printed the shapes:
with tf.name_scope('cross_entropy'):
    print(Ylogits.shape)
    print(Y.shape)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=Y_)
    loss = tf.reduce_mean(cross_entropy)
Which gave me:
(?, 30)
(?, 30)
When executing the back-propagation line above, though, this does not seem to work. Can anyone help?
In response to a comment (this is basically the tutorial code from the link mentioned above):
Placeholders:
classes = 30
X = tf.placeholder(tf.float32, name="X-placeholder", shape=(None, 68, 68, 1))
Y_ = tf.placeholder(tf.float32, [None, classes], name="Y_-placeholder")
keepRate1 = tf.placeholder(tf.float32, name="keepRate1-placeholder")
keepRate2 = tf.placeholder(tf.float32, name="keepRate2-placeholder")
Model:
# CONVOLUTION 1 - 1
with tf.name_scope('conv1_1'):
    filter1_1 = tf.Variable(tf.truncated_normal([3, 3, 1, 32], dtype=tf.float32,
                                                stddev=1e-1), name='weights1_1')
    stride = [1,1,1,1]
    conv = tf.nn.conv2d(X, filter1_1, stride, padding='SAME')
    biases = tf.Variable(tf.constant(0.0, shape=[32], dtype=tf.float32),
                         trainable=True, name='biases1_1')
    out = tf.nn.bias_add(conv, biases)
    conv1_1 = tf.nn.relu(out)

# CONVOLUTION 1 - 2
with tf.name_scope('conv1_2'):
    filter1_2 = tf.Variable(tf.truncated_normal([3, 3, 32, 32], dtype=tf.float32,
                                                stddev=1e-1), name='weights1_2')
    conv = tf.nn.conv2d(conv1_1, filter1_2, [1,1,1,1], padding='SAME')
    biases = tf.Variable(tf.constant(0.0, shape=[32], dtype=tf.float32),
                         trainable=True, name='biases1_2')
    out = tf.nn.bias_add(conv, biases)
    conv1_2 = tf.nn.relu(out)

# POOL 1
with tf.name_scope('pool1'):
    pool1_1 = tf.nn.max_pool(conv1_2,
                             ksize=[1, 2, 2, 1],
                             strides=[1, 2, 2, 1],
                             padding='SAME',
                             name='pool1_1')
    pool1_1_drop = tf.nn.dropout(pool1_1, keepRate1)

# CONVOLUTION 2 - 1
with tf.name_scope('conv2_1'):
    filter2_1 = tf.Variable(tf.truncated_normal([3, 3, 32, 64], dtype=tf.float32,
                                                stddev=1e-1), name='weights2_1')
    conv = tf.nn.conv2d(pool1_1_drop, filter2_1, [1, 1, 1, 1], padding='SAME')
    biases = tf.Variable(tf.constant(0.0, shape=[64], dtype=tf.float32),
                         trainable=True, name='biases2_1')
    out = tf.nn.bias_add(conv, biases)
    conv2_1 = tf.nn.relu(out)

# CONVOLUTION 2 - 2
with tf.name_scope('conv2_2'):
    filter2_2 = tf.Variable(tf.truncated_normal([3, 3, 64, 64], dtype=tf.float32,
                                                stddev=1e-1), name='weights2_2')
    conv = tf.nn.conv2d(conv2_1, filter2_2, [1, 1, 1, 1], padding='SAME')
    biases = tf.Variable(tf.constant(0.0, shape=[64], dtype=tf.float32),
                         trainable=True, name='biases2_2')
    out = tf.nn.bias_add(conv, biases)
    conv2_2 = tf.nn.relu(out)

# POOL 2
with tf.name_scope('pool2'):
    pool2_1 = tf.nn.max_pool(conv2_2,
                             ksize=[1, 2, 2, 1],
                             strides=[1, 2, 2, 1],
                             padding='SAME',
                             name='pool2_1')
    pool2_1_drop = tf.nn.dropout(pool2_1, keepRate1)

#FULLY CONNECTED 1
with tf.name_scope('fc1') as scope:
    shape = int(np.prod(pool2_1_drop.get_shape()[1:]))
    fc1w = tf.Variable(tf.truncated_normal([shape, 512], dtype=tf.float32,
                                           stddev=1e-1), name='weights3_1')
    fc1b = tf.Variable(tf.constant(1.0, shape=[512], dtype=tf.float32),
                       trainable=True, name='biases3_1')
    pool2_flat = tf.reshape(pool2_1_drop, [-1, shape])
    out = tf.nn.bias_add(tf.matmul(pool2_flat, fc1w), fc1b)
    fc1 = tf.nn.relu(out)
    fc1_drop = tf.nn.dropout(fc1, keepRate2)

#FULLY CONNECTED 3 & SOFTMAX OUTPUT
with tf.name_scope('softmax') as scope:
    fc2w = tf.Variable(tf.truncated_normal([512, classes], dtype=tf.float32,
                                           stddev=1e-1), name='weights3_2')
    fc2b = tf.Variable(tf.constant(1.0, shape=[classes], dtype=tf.float32),
                       trainable=True, name='biases3_2')
    Ylogits = tf.nn.bias_add(tf.matmul(fc1_drop, fc2w), fc2b)
    Y = tf.nn.softmax(Ylogits)
numEpochs = 400
batchSize = 30
alpha = 1e-5
with tf.name_scope('cross_entropy'):
    print(Ylogits.shape)
    print(Y.shape)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=Y_)
    loss = tf.reduce_mean(cross_entropy)
with tf.name_scope('accuracy'):
    correct_prediction = tf.equal(tf.argmax(Y, 1), tf.argmax(Y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

with tf.name_scope('train'):
    train_step = tf.train.AdamOptimizer(learning_rate=alpha).minimize(loss)
#Create Session and insert variables
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
The tensor shape (?, 30) means that the batch size is not fixed, so you can feed batches of any size to your graph. The downside is that you can run into exactly these kinds of problems and have to keep track of the tensor shapes in your head.
Here, you have 30 images in one batch but only 1 label row in that batch, and you cannot compute a loss for 30 images with only one label. You either need to decrease the number of images to 1 or increase the label batch size to 30; it could also be that somewhere you are reshaping the tensors incorrectly.
I would look at where you read your data in and then batch it; that is most likely where the problem is, or at the places where you reshape the tensors.
Posting your entire code would be more helpful.
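A sketch of what that label fix could look like (my guess from the shapes posted, not code from the question): y_train holds integer class indices of shape (1161,), while the Y_ placeholder expects one-hot rows of shape [None, 30], and .reshape(1,30) turns 30 indices into a single row instead of 30 one-hot rows.
import numpy as np

classes = 30
batch_labels = y_train[(j*batchSize):((j+1)*batchSize)]      # 30 integer labels, shape (30,)
batch_one_hot = np.eye(classes)[batch_labels.astype(int)]    # one-hot encode, shape (30, 30)
# feed_dict={..., Y_: batch_one_hot, ...} now matches logits of shape [30, 30]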

TF error: Shapes of both tensors to match

I am trying to implement a CNN model based on the MNIST tutorial on the TF website.
Here is my code:
import tensorflow as tf
import numpy as np
from tensorflow.contrib import learn
from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib


def cnn_model_fn(features, labels, mode):
    """Model function for CNN."""
    # Input Layer
    # Reshape X to 4-D tensor: [batch_size, width, height, channels]
    # breaKHis images are 32x32 pixels, and have three color channels
    input_layer = tf.reshape(features, [-1, 32, 32, 3])

    # Convolutional Layer #1
    # Computes 32 features using a 5x5 filter with ReLU activation.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 32, 32, 3]
    # Output Tensor Shape: [batch_size, 32, 32, 32]
    conv1 = tf.layers.conv2d(
        inputs=input_layer,
        filters=32,
        kernel_size=[5, 5],
        padding="same",
        activation=tf.nn.relu)
    #print conv1.get_shape().as_list()

    # Pooling Layer #1
    # First max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 32, 32, 32]
    # Output Tensor Shape: [batch_size, 16, 16, 32]
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
    #print pool1.get_shape().as_list()

    # Convolutional Layer #2
    # Computes 32 features using a 5x5 filter.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 16, 16, 32]
    # Output Tensor Shape: [batch_size, 16, 16, 32]
    conv2 = tf.layers.conv2d(
        inputs=pool1,
        filters=32,
        kernel_size=[5, 5],
        padding="same",
        activation=tf.nn.relu)
    #print conv2.get_shape().as_list()

    # Pooling Layer #2
    # Second max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 16, 16, 32]
    # Output Tensor Shape: [batch_size, 8, 8, 32]
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
    #print pool2.get_shape().as_list()

    # Convolutional Layer #3
    # Computes 64 features using a 5x5 filter.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 8, 8, 32]
    # Output Tensor Shape: [batch_size, 8, 8, 64]
    conv3 = tf.layers.conv2d(
        inputs=pool2,
        filters=64,
        kernel_size=[5, 5],
        padding="same",
        activation=tf.nn.relu)
    #print conv3.get_shape().as_list()

    # Pooling Layer #3
    # Third max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 8, 8, 64]
    # Output Tensor Shape: [batch_size, 4, 4, 64]
    pool3 = tf.layers.max_pooling2d(inputs=conv3, pool_size=[2, 2], strides=2)

    # Flatten tensor into a batch of vectors
    # Input Tensor Shape: [batch_size, 4, 4, 64]
    # Output Tensor Shape: [batch_size, 4 * 4 * 64]
    pool3Shape = pool3.get_shape().as_list()
    #print pool3Shape
    pool2_flat = tf.reshape(pool2, [-1, 4*4*64])

    # Dense Layer
    # Densely connected layer with 64 neurons
    # Input Tensor Shape: [batch_size, 4 * 4 * 64]
    # Output Tensor Shape: [batch_size, 64]
    dense = tf.layers.dense(inputs=pool2_flat, units=64, activation=tf.nn.relu)

    # Add dropout operation; 0.6 probability that element will be kept
    dropout = tf.layers.dropout(
        inputs=dense, rate=0.4, training=mode == learn.ModeKeys.TRAIN)

    # Logits layer
    # Input Tensor Shape: [batch_size, 64]
    # Output Tensor Shape: [batch_size, 2]
    logits = tf.layers.dense(inputs=dropout, units=2)

    loss = None
    train_op = None

    # Calculate Loss (for both TRAIN and EVAL modes)
    if mode != learn.ModeKeys.INFER:
        onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=2)
        loss = tf.losses.softmax_cross_entropy(
            onehot_labels=onehot_labels, logits=logits)

    # Configure the Training Op (for TRAIN mode)
    if mode == learn.ModeKeys.TRAIN:
        train_op = tf.contrib.layers.optimize_loss(
            loss=loss,
            global_step=tf.contrib.framework.get_global_step(),
            learning_rate=0.001,
            optimizer="SGD")

    # Generate Predictions
    predictions = {
        "classes": tf.argmax(
            input=logits, axis=1),
        "probabilities": tf.nn.softmax(
            logits, name="softmax_tensor")
    }

    # Return a ModelFnOps object
    return model_fn_lib.ModelFnOps(
        mode=mode, predictions=predictions, loss=loss, train_op=train_op)
And it throws the error: InvalidArgumentError (see above for traceback): Assign requires shapes of both tensors to match. lhs shape= [1024,64] rhs shape= [2048,64]
I think there is something wrong in the last FC layer, but I don't know where.
You can check this answer,
Tensorflow Assign requires shapes of both tensors to match. lhs shape= [20] rhs shape= [48]
Maybe you can install the previous version and try it again.
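As a side note (not part of the answer above, and not necessarily the root cause here): one way to keep the reshape and the dense kernel consistent is to derive the flatten width from the tensor that is actually being flattened instead of hard-coding it, e.g.:
# Hypothetical sketch: derive the flatten width from pool3's static shape
# instead of hard-coding 4*4*64, so the reshape and the dense kernel agree.
pool3_shape = pool3.get_shape().as_list()   # [None, 4, 4, 64]
flat_dim = pool3_shape[1] * pool3_shape[2] * pool3_shape[3]
pool3_flat = tf.reshape(pool3, [-1, flat_dim])
dense = tf.layers.dense(inputs=pool3_flat, units=64, activation=tf.nn.relu)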

Obtain probabilities from logits - logits and labels not the same size

I am trying to use Tensorflow to classify some object representations. I used the same architecture as in the Tensorflow Cifar-10 example, with the last layer defined as:
with tf.variable_scope('sigmoid_linear') as scope:
    weights = _variable_with_weight_decay('weights', [192, num_classes],
                                          stddev=1 / 192.0, wd=0.0)
    biases = _variable_on_cpu('biases', [num_classes], initializer)
    sigmoid_linear = tf.add(tf.matmul(local4, weights), biases, name=scope.name)
    _activation_summary(sigmoid_linear)
return sigmoid_linear
In my case, num_classes is 2, and the number of channels in the representation fed to the neural network is 8. Furthermore, I'm currently debugging with only 5 examples. The output of the last layer has a shape of [40, 2]. I expect the first dimension is due to 5 examples * 8 channels and the second is due to the number of classes.
In order to compare the logits and the labels using e.g. tensorflow.nn.SparseSoftmaxCrossEntropyWithLogits, I need them to have a common shape. How can I interpret the current content of the logits in the current shape, and how can I reduce the first dimension of the logits to be the same as num_classes?
Edit: the input to the inference function has a shape of [5, 101, 1008, 8]. The inference function is defined as:
def inference(representations):
    """Build the model.
    Args:
        STFT spectra: spectra returned from distorted_inputs() or inputs().
    Returns:
        Logits.
    """
    # conv1
    with tf.variable_scope('conv1') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, nChannels, 64],
                                             stddev=5e-2,
                                             wd=0.0)
        conv = tf.nn.conv2d(representations, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], initializer)
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv1)

    # pool1
    pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool1')
    # norm1
    norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm1')

    # conv2
    with tf.variable_scope('conv2') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 64, 64],
                                             stddev=5e-2,
                                             wd=0.0)
        conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], initializer)
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv2)

    # norm2
    norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm2')
    # pool2
    pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1],
                           strides=[1, 2, 2, 1], padding='SAME', name='pool2')

    # local3
    with tf.variable_scope('local3') as scope:
        # Move everything into depth so we can perform a single matrix multiply.
        reshape = tf.reshape(pool2, [batch_size, -1])
        dim = reshape.get_shape()[1].value
        weights = _variable_with_weight_decay('weights', shape=[dim, 384],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [384], initializer)
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name)
        _activation_summary(local3)

    # local4
    with tf.variable_scope('local4') as scope:
        weights = _variable_with_weight_decay('weights', shape=[384, 192],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [192], initializer)
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases, name=scope.name)
        _activation_summary(local4)

    with tf.variable_scope('sigmoid_linear') as scope:
        weights = _variable_with_weight_decay('weights', [192, num_classes],
                                              stddev=1 / 192.0, wd=0.0)
        biases = _variable_on_cpu('biases', [num_classes], initializer)
        sigmoid_linear = tf.add(tf.matmul(local4, weights), biases, name=scope.name)
        _activation_summary(sigmoid_linear)

    return sigmoid_linear
After more debugging I found the problem. The posted code for the layers, originally from the Tensorflow tutorial, works well (of course it does). I printed the shape after each layer and found that the number 40 was not due to 5 examples * 8 channels, but because I had previously set batch_size = 40, which is also larger than the number of training examples. The mismatch began after the reshaping in local layer 3. The question can now be closed.
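For anyone hitting the same thing, a small sketch of the effect with hypothetical shapes: reshaping with a hard-coded batch_size silently redistributes elements when the real batch is smaller, whereas flattening with -1 in the batch dimension keeps one row per example.
import numpy as np

pool2_out = np.zeros((5, 2, 4, 8))             # 5 examples, some spatial grid, 8 channels
flat_dim = int(np.prod(pool2_out.shape[1:]))   # 2 * 4 * 8 = 64 features per example

good = pool2_out.reshape(-1, flat_dim)         # shape (5, 64): one row per example
bad = pool2_out.reshape(40, -1)                # shape (40, 8): 5*64 elements spread over 40 rows
print(good.shape, bad.shape)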