Running a Multi Scale network with Tensor Flow - tensorflow

I want to try to build a multi-scale CNN using tensorflow from the cifar10 code.
From what I understood, I should take the output of the first conv layer and merge it with the output of the second conv layer to feed the first fully connected layer. Is that right? If yes, how do I actually do this?
I have almost the same first layers as for the cifar10 except for the norm1 and the pool1 layers that are switched
# conv1
with tf.variable_scope('conv1') as scope:
kernel = _variable_with_weight_decay('weights', shape=[5, 5, 3, 64],
stddev=1e-4, wd=0.0)
conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME')
biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
bias = tf.nn.bias_add(conv, biases)
conv1 = tf.nn.relu(bias,
# norm1
norm1 = tf.nn.lrn(conv1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
# pool1
pool1 = tf.nn.max_pool(norm1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
padding='SAME', name='pool1')
# conv2
with tf.variable_scope('conv2') as scope:
kernel = _variable_with_weight_decay('weights', shape=[5, 5, 64, 64],
stddev=1e-4, wd=0.0)
conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME')
biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
bias = tf.nn.bias_add(conv, biases)
conv2 = tf.nn.relu(bias,
# norm2
norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
# pool2
pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1],
strides=[1, 2, 2, 1], padding='SAME', name='pool2')
Then I try to merge the pool1 layer with the pool2 layer using concat.
Here's how I do this
# local3
with tf.variable_scope('local3') as scope:
#concatenate tensors
concat = tf.concat(2,[pool1,pool2])
# Move everything into depth so we can perform a single matrix multiply.
for d in concat.get_shape()[1:].as_list():
dim *= d
reshape = tf.reshape(concat, [FLAGS.batch_size, dim])
weights = _variable_with_weight_decay('weights', shape=[dim, 384],
stddev=0.04, wd=0.004)
biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1))
local3 = tf.nn.relu_layer(reshape, weights, biases,
I'm not even sure that this is the right procedure, because the initial loss is now around 17, whereas in the single-scale case the initial loss was around 3.
Is this common?
Thanks in advance.


How to build a neural network in tensorflow with custom activation functions?

I'm new to tensorflow. I'm building a 3-layer neural network (just one hidden layer) using tensorflow and I want to apply a custom activation function to its hidden layer.
I implemented it using np library:
def my_network(input_layer, centers, beta, weights):
    """Evaluate a one-hidden-layer network with an exponential radial activation.

    For every center ``c`` and every input row ``x`` the hidden unit computes
    ``exp(beta_c * ||x - c||^2)``; the hidden activations are then multiplied
    by ``weights['w']``.

    Args:
        input_layer: 2-D array-like with one sample per row.
        centers: iterable of center vectors, each broadcastable against a row
            of ``input_layer``.
        beta: 1-D NumPy array holding one coefficient per center.
        weights: mapping whose ``'w'`` entry is the output weight matrix.

    Returns:
        The result of ``tf.matmul`` applied to the float32 hidden activations
        (one row per sample, one column per center) and ``weights['w']``.
    """
    layer_1 = np.asarray(input_layer)
    # Squared Euclidean distance of every sample to every center,
    # shape (num_centers, num_samples).
    sq_dist = np.array([np.sum(np.square(layer_1 - center), axis=1)
                        for center in centers])
    scaled = beta.reshape(len(beta), 1) * sq_dist
    layer_2 = np.exp(scaled)
    # NOTE: everything above is plain NumPy, so TensorFlow receives the hidden
    # layer as a constant; only weights['w'] takes part in the graph.
    output = tf.matmul(np.transpose(layer_2).astype(np.float32), weights['w'])
    return output
I want to convert it to code that is compatible with tensorflow and its gradient computation. How should I do this?
Try this small snippet for multiple convolution layers:
# placeholders
X = tf.placeholder(tf.float32, [None, 28, 28, 1], name="input_X")
y = tf.placeholder(tf.float32, [None, 14, 14, 1], name="Output_y")
# C1
with tf.name_scope("layer1"):
W1 = tf.get_variable("W1", shape=[3, 3, 1, 32],
b1 = tf.get_variable("b1", shape=[32], initializer=tf.contrib.layers.xavier_initializer())
layer1 = tf.nn.conv2d(X, W1, strides=[1, 1, 1, 1], padding='SAME') + b1
layer1_act = tf.nn.relu(layer1) # here you can change to other activation function
# C2
with tf.name_scope("layer2"):
W2 = tf.get_variable("W2", shape=[3, 3, 32, 64],
b2 = tf.get_variable("b2", shape=[64], initializer=tf.contrib.layers.xavier_initializer())
layer2 = tf.nn.conv2d(layer1_act, W2, strides=[1, 1, 1, 1], padding='SAME') + b2
layer2_act = tf.nn.relu(layer2) # here you can change to other activation function
# max pool
with tf.name_scope("maxpool"):
maxpool = tf.nn.max_pool(layer2_act, [1, 2, 2, 1], [1, 2, 2, 1], 'SAME') #just to show how to use maxpool
# C3
with tf.name_scope("layer3"):
W3 = tf.get_variable("W3", shape=[3, 3, 64, 32],
b3 = tf.get_variable("b3", shape=[32], initializer=tf.contrib.layers.xavier_initializer())
layer3 = tf.nn.conv2d(maxpool, W3, strides=[1, 1, 1, 1], padding='SAME') + b3
layer3_act = tf.nn.relu(layer3) # here you can change to other activation function
#draw graph of train operation
with tf.name_scope('loss and train operation'):
loss = tf.reduce_mean(tf.losses.mean_squared_error(
labels=tf.cast(y, tf.int32),
optimizer = tf.train.AdamOptimizer(learning_rate=0.00001)
train_op = optimizer.minimize(loss)

CNN performs worse than fully connected net - how to spot mistakes?

I'm experimenting with CNNs and I'm baffled, because the model I've built actually learns slower and performs worse than a fully connected NN. Here are the two models:
fully connected:
hidden1 = tf.layers.dense(X, 2000, name="hidden1",
hidden2 = tf.layers.dense(hidden1, 1000, name="hidden2",
hidden3 = tf.layers.dense(hidden2, 1000, name="hidden3",
hidden4 = tf.layers.dense(hidden3, 1000, name="hidden4",
hidden5 = tf.layers.dense(hidden4, 700, name="hidden5",
hidden6 = tf.layers.dense(hidden5, 500, name="hidden6",
logits = tf.layers.dense(hidden6, 2, name="outputs")
f = tf.get_variable('conv1-fil', [5,5,1,10])
conv1 = tf.nn.conv2d(X, filter=f, strides=[1, 1, 1, 1], padding="SAME")
pool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
f2 = tf.get_variable('conv2-fil', [3,3,10,7])
conv2 = tf.nn.conv2d(pool1, filter=f2, strides=[1, 1, 1, 1], padding="SAME")
pool2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
fc1 = tf.contrib.layers.flatten(pool2)
hidden1 = tf.layers.dense(fc1, 3630, name="hidden1",
hidden2 = tf.layers.dense(hidden1, 2000, name="hidden2",
hidden3 = tf.layers.dense(hidden2, 1000, name="hidden3",
hidden5 = tf.layers.dense(hidden3, 700, name="hidden5",
hidden6 = tf.layers.dense(hidden5, 500, name="hidden6",
logits = tf.layers.dense(hidden6, 2, name="outputs")
Basically, the CNN has a slightly shallower fully connected part, but adds conv layers in front of it. The CNN reaches ~88% accuracy vs. 92% for the deep fully connected net after the same number of epochs on the same dataset. How to debug issues like that? What are good practices in designing conv layers?

Obtain probabilities from logits - logits and labels not the same size

I am trying to use Tensorflow to classify some object representations. I used the same architecture as in the Tensorflow Cifar-10 example, with the last layer defined as:
with tf.variable_scope('sigmoid_linear') as scope:
weights = _variable_with_weight_decay('weights', [192, num_classes],
stddev=1 / 192.0, wd=0.0)
biases = _variable_on_cpu('biases', [num_classes],
sigmoid_linear = tf.add(tf.matmul(local4, weights), biases,
return sigmoid_linear
In my case, num_classes is 2, and the number of channels in the representation fed to the neural network is 8. Furthermore, I'm currently debugging with only 5 examples. The output of the last layer has a shape of [40,2]. I expect the first dimension is due to 5 examples * 8 channels and the second due to the number of classes.
In order to compare the logits and the labels using e.g. tensorflow.nn.SparseSoftmaxCrossEntropyWithLogits, I need them to have a common shape. How can I interpret the current content of the logits in the current shape, and how can I reduce the first dimension of the logits to be the same as num_classes?
Edit: the input to the inference function has a shape of [5,101,1008,8]. The inference function is defined as:
def inference(representations):
"""Build the model.
STFT spectra: spectra returned from distorted_inputs() or inputs().
# conv1
with tf.variable_scope('conv1') as scope:
kernel = _variable_with_weight_decay('weights',
shape=[5, 5, nChannels, 64],
conv = tf.nn.conv2d(representations, kernel, [1, 1, 1, 1], padding='SAME')
biases = _variable_on_cpu('biases', [64], initializer,
pre_activation = tf.nn.bias_add(conv, biases)
conv1 = tf.nn.relu(pre_activation,
# pool1
pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
padding='SAME', name='pool1')
# norm1
norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
# conv2
with tf.variable_scope('conv2') as scope:
kernel = _variable_with_weight_decay('weights',
shape=[5, 5, 64, 64],
conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME')
biases = _variable_on_cpu('biases', [64], initializer)
pre_activation = tf.nn.bias_add(conv, biases)
conv2 = tf.nn.relu(pre_activation,
# norm2
norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
# pool2
pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1],
strides=[1, 2, 2, 1], padding='SAME', name='pool2')
# local3
with tf.variable_scope('local3') as scope:
# Move everything into depth so we can perform a single matrix multiply.
reshape = tf.reshape(pool2, [batch_size, -1])
dim = reshape.get_shape()[1].value
weights = _variable_with_weight_decay('weights', shape=[dim, 384],
stddev=0.04, wd=0.004)
biases = _variable_on_cpu('biases', [384], initializer)
local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases,
# local4
with tf.variable_scope('local4') as scope:
weights = _variable_with_weight_decay('weights', shape=[384, 192],
stddev=0.04, wd=0.004)
biases = _variable_on_cpu('biases', [192], initializer)
local4 = tf.nn.relu(tf.matmul(local3, weights) + biases,
with tf.variable_scope('sigmoid_linear') as scope:
weights = _variable_with_weight_decay('weights', [192, num_classes],
stddev=1 / 192.0, wd=0.0)
biases = _variable_on_cpu('biases', [num_classes],
sigmoid_linear = tf.add(tf.matmul(local4, weights), biases,
return sigmoid_linear
After more debugging I could find the problem. The posted code with the layers, originally from the Tensorflow tutorial, works well (of course it does). I printed all shapes, after each layer, and found out that the number 40 was not due to 5 examples * 8 channels, but that I had previously set batch_size = 40, and thus also higher than the amount of training examples. The mismatch began after the reshaping in the local layer 3. The question can now be closed.

generating unrecognizable image to fool vggnet

I am trying to generate an unrecognizable image which can fool Vggnet. I used the following vgg model for tensorflow, with some modifications for calculating the gradient. In the final part you can see my modification, which computes the gradient with respect to the given image (is it correct? I am trying to generate an image to which the vggnet assigns a high probability for class 1). With this gradient, I update the random image to fool the vggnet. But this is not very successful: I can't generate an image with a high probability. The maximum probability I got is around 0.001. How can I make it keep increasing?
Vggnet model
# Davi Frossard, 2016 #
# VGG16 implementation in TensorFlow #
# Details: #
# #
# #
# Model from #
# Weights from Caffe converted using #########################################################################################
import tensorflow as tf
import numpy as np
from scipy.misc import imread, imresize
from imagenet_classes import class_names
class vgg16:
    """VGG16 graph built on an image tensor, plus an input-gradient op.

    After construction the instance exposes every intermediate activation
    (``conv1_1`` ... ``conv5_3``, ``pool1`` ... ``pool5``, ``fc1``, ``fc2``),
    the logits ``fc3l``, the softmax output ``probs``, the gradient ``grad``
    of a squared-error loss with respect to ``imgs``, and ``parameters``, the
    list of variables in creation order (kernel, biases, kernel, biases, ...).
    """

    # (pool name, ((conv name, input channels, output channels), ...)) for
    # each stage, in the order the layers are stacked.
    _CONV_STAGES = (
        ('pool1', (('conv1_1', 3, 64), ('conv1_2', 64, 64))),
        ('pool2', (('conv2_1', 64, 128), ('conv2_2', 128, 128))),
        ('pool3', (('conv3_1', 128, 256), ('conv3_2', 256, 256),
                   ('conv3_3', 256, 256))),
        ('pool4', (('conv4_1', 256, 512), ('conv4_2', 512, 512),
                   ('conv4_3', 512, 512))),
        ('pool5', (('conv5_1', 512, 512), ('conv5_2', 512, 512),
                   ('conv5_3', 512, 512))),
    )

    def __init__(self, imgs, weights=None, sess=None):
        """Build the graph and optionally load weights.

        Args:
            imgs: image tensor the network is applied to; the mean constant
                has shape [1, 1, 1, 3], so the last axis must hold 3 channels.
            weights: path passed to ``np.load``; loaded only when ``sess`` is
                also given.
            sess: session used to run the assignment ops.
        """
        self.imgs = imgs
        self.convlayers()
        self.fc_layers()
        self.probs = tf.nn.softmax(self.fc3l, name='prob')
        if weights is not None and sess is not None:
            self.load_weights(weights, sess)

    def _conv_relu(self, name, inputs, in_channels, out_channels):
        """Add one 3x3, stride-1, SAME-padded conv + bias + ReLU layer.

        The activation is stored as ``self.<name>`` and the kernel and biases
        are appended to ``self.parameters`` in that order.
        """
        with tf.name_scope(name) as scope:
            kernel = tf.Variable(
                tf.truncated_normal([3, 3, in_channels, out_channels],
                                    dtype=tf.float32, stddev=1e-1),
                name='weights')
            conv = tf.nn.conv2d(inputs, kernel, [1, 1, 1, 1], padding='SAME')
            biases = tf.Variable(
                tf.constant(0.0, shape=[out_channels], dtype=tf.float32),
                trainable=True, name='biases')
            out = tf.nn.bias_add(conv, biases)
            activation = tf.nn.relu(out, name=scope)
            self.parameters += [kernel, biases]
        setattr(self, name, activation)
        return activation

    def convlayers(self):
        """Create the preprocessing step and the five conv/pool stages."""
        self.parameters = []

        # zero-mean input
        with tf.name_scope('preprocess') as scope:
            mean = tf.constant([123.68, 116.779, 103.939], dtype=tf.float32,
                               shape=[1, 1, 1, 3], name='img_mean')
            net = self.imgs - mean

        for pool_name, conv_specs in self._CONV_STAGES:
            for conv_name, in_channels, out_channels in conv_specs:
                net = self._conv_relu(conv_name, net, in_channels, out_channels)
            # 2x2 max pooling with stride 2 closes every stage.
            net = tf.nn.max_pool(net,
                                 ksize=[1, 2, 2, 1],
                                 strides=[1, 2, 2, 1],
                                 padding='SAME',
                                 name=pool_name)
            setattr(self, pool_name, net)

    def fc_layers(self):
        """Create the three fully connected layers and the gradient op."""
        # fc1
        with tf.name_scope('fc1') as scope:
            # Number of features per example once pool5 is flattened.
            shape = int(np.prod(self.pool5.get_shape()[1:]))
            fc1w = tf.Variable(tf.truncated_normal([shape, 4096],
                                                   dtype=tf.float32,
                                                   stddev=1e-1), name='weights')
            fc1b = tf.Variable(tf.constant(1.0, shape=[4096], dtype=tf.float32),
                               trainable=True, name='biases')
            pool5_flat = tf.reshape(self.pool5, [-1, shape])
            fc1l = tf.nn.bias_add(tf.matmul(pool5_flat, fc1w), fc1b)
            self.fc1 = tf.nn.relu(fc1l)
            self.parameters += [fc1w, fc1b]

        # fc2
        with tf.name_scope('fc2') as scope:
            fc2w = tf.Variable(tf.truncated_normal([4096, 4096],
                                                   dtype=tf.float32,
                                                   stddev=1e-1), name='weights')
            fc2b = tf.Variable(tf.constant(1.0, shape=[4096], dtype=tf.float32),
                               trainable=True, name='biases')
            fc2l = tf.nn.bias_add(tf.matmul(self.fc1, fc2w), fc2b)
            self.fc2 = tf.nn.relu(fc2l)
            self.parameters += [fc2w, fc2b]

        # fc3 (logits, no activation)
        with tf.name_scope('fc3') as scope:
            fc3w = tf.Variable(tf.truncated_normal([4096, 1000],
                                                   dtype=tf.float32,
                                                   stddev=1e-1), name='weights')
            fc3b = tf.Variable(tf.constant(1.0, shape=[1000], dtype=tf.float32),
                               trainable=True, name='biases')
            self.fc3l = tf.nn.bias_add(tf.matmul(self.fc2, fc3w), fc3b)
            self.parameters += [fc3w, fc3b]

        ###################### Modified part######################
        with tf.name_scope('grad') as scope:
            # One-hot target that puts all probability mass on class index 0.
            temp = np.zeros(1000)
            temp[0] = 1
            vec = tf.constant(temp, dtype='float32', name='goal')
            # Mean squared error between the softmax output and the target;
            # the `-` operator is used because it exists in every TF release.
            loss = tf.reduce_mean(tf.square(tf.nn.softmax(self.fc3l) - vec))
            # Gradient with respect to the input images, not the parameters.
            self.grad = tf.gradients(loss, self.imgs)[-1]

    def load_weights(self, weight_file, sess):
        """Assign the arrays stored in ``weight_file`` to ``self.parameters``.

        Keys are visited in sorted order and matched to the parameters by
        position, so the sorted key order must agree with the order in which
        the variables were created.
        """
        weights = np.load(weight_file)
        keys = sorted(weights.keys())
        for i, k in enumerate(keys):
            # Same text on Python 2 and Python 3.
            print('%d %s %s' % (i, k, np.shape(weights[k])))
            sess.run(self.parameters[i].assign(weights[k]))
Create session
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
sess = tf.Session()
imgs = tf.placeholder(tf.float32, [None, 224, 224, 3])
vgg = vgg16(imgs, 'vgg16_weights.npz', sess)
Generate new image for fooling
# Start from uniform random noise scaled to [0, 255).
imarray = np.random.rand(224, 224, 3) * 255
imarray = imarray.astype('float32')
feed_dict = {vgg.imgs: [imarray]}

# prob_list tracks the probability of class index 0 for the single image in
# the batch, before the first update and after every step.
prob_list = []
prob_list.append(sess.run(vgg.probs, feed_dict={vgg.imgs: [imarray]})[0][0])

lamda = 0.1
#mean = np.array([123.68, 116.779, 103.939])
print('start')
for i in range(1000):
    # vgg.grad is the gradient of the loss with respect to the input batch;
    # rst[0] is the gradient for the only image in it.
    rst = sess.run(vgg.grad, feed_dict)
    # Gradient-descent step on the image itself.
    imarray -= lamda * (rst[0] * 255)
    feed_dict = {vgg.imgs: [imarray]}
    prob_list.append(sess.run(vgg.probs, feed_dict={vgg.imgs: [imarray]})[0][0])
I'm surprised that the shapes of the gradient and the image match.
You are taking the derivative of the loss with respect to the parameters; it should be with respect to the image placeholder. Excuse me if I'm missing something obvious, I can't run the code right now.
The computation of the loss is based on fc3l, the final output is probs. I don't see where probs is computed in the VGG code. Maybe there are layers in between. You could plot the first component of fc3l instead, see if that goes up.
You should probably base the loss on probs.

Why no weight decay on the convolutional layers in the cifar10 example of tensorflow?

There seems to be no weight decay on convolutional layers in the cifar10 example on tensorflow. Actually there is no weight decay on any layers except for the two fully connected layers. Is this a common practice? I thought weight decay was applied to all weights (except biases).
For reference, here's the relevant code (wd is the weight decay factor):
# conv1
with tf.variable_scope('conv1') as scope:
kernel = _variable_with_weight_decay('weights', shape=[5, 5, 3, 64],
stddev=1e-4, wd=0.0)
conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME')
biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
bias = tf.nn.bias_add(conv, biases)
conv1 = tf.nn.relu(bias,
# pool1
pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
padding='SAME', name='pool1')
# norm1
norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
# conv2
with tf.variable_scope('conv2') as scope:
kernel = _variable_with_weight_decay('weights', shape=[5, 5, 64, 64],
stddev=1e-4, wd=0.0)
conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME')
biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
bias = tf.nn.bias_add(conv, biases)
conv2 = tf.nn.relu(bias,
# norm2
norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
# pool2
pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1],
strides=[1, 2, 2, 1], padding='SAME', name='pool2')
# local3
with tf.variable_scope('local3') as scope:
# Move everything into depth so we can perform a single matrix multiply.
dim = 1
for d in pool2.get_shape()[1:].as_list():
dim *= d
reshape = tf.reshape(pool2, [FLAGS.batch_size, dim])
weights = _variable_with_weight_decay('weights', shape=[dim, 384],
stddev=0.04, wd=0.004)
biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1))
local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases,
# local4
with tf.variable_scope('local4') as scope:
weights = _variable_with_weight_decay('weights', shape=[384, 192],
stddev=0.04, wd=0.004)
biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1))
local4 = tf.nn.relu(tf.matmul(local3, weights) + biases,
# softmax, i.e. softmax(WX + b)
with tf.variable_scope('softmax_linear') as scope:
weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES],
stddev=1/192.0, wd=0.0)
biases = _variable_on_cpu('biases', [NUM_CLASSES],
softmax_linear = tf.add(tf.matmul(local4, weights), biases,
return softmax_linear
Weight decay doesn't necessarily improve performance. In my own experience, I've found reasonably often that my models perform worse (as measured by some metric on a held-out set) with any significant amount of weight decay. It is a useful form of regularization to be aware of, but you don't need to add it to every model without considering if it seems needed or comparing the performance with and without.
As for whether weight decay on only part of a model can be good compared with weight decay on the entire model, it does seem less common to only regularize some of the weights this way. I don't know that there's a theoretical reason for this, however. In general, neural networks already have too many hyperparameters to configure. Whether to use weight decay or not is already a question, and how strongly to regularize the weights if you do. If you also wonder, which layers should I regularize this way, you'll quickly run out of time to test the performance of all of the different ways you could turn it on and off for each layer.
I imagine that there are models that would benefit from weight decay on only part of the model; I don't think it's done often because it's difficult to test all of the possibilities and find out which one works best.