I want to understand how L2 regularization is implemented here. In L2 regularization we add the sum of the squared weights to the loss function, but this code also adds the bias terms to the penalty. Why is that?
# Two-layer sigmoid network whose loss adds an L2 penalty over the trainable
# parameters. nPixels, nNodes1, nLabels and beta are defined elsewhere.
x = tf.placeholder(tf.float32, [None, nPixels])

# Hidden layer: small random weights, zero biases.
W1 = tf.Variable(tf.random_normal([nPixels, nNodes1], stddev=0.01))
b1 = tf.Variable(tf.zeros([nNodes1]))
y1 = tf.nn.sigmoid(tf.matmul(x, W1) + b1)

# Output layer: y holds the logits; the sigmoid is applied inside the loss.
W2 = tf.Variable(tf.random_normal([nNodes1, nLabels], stddev=0.01))
b2 = tf.Variable(tf.zeros([nLabels]))
y = tf.matmul(y1, W2) + b2
y_ = tf.placeholder(tf.float32, [None, nLabels])

# tf.nn.l2_loss(t) is sum(t ** 2) / 2. Here the penalty covers the bias
# vectors as well as the weight matrices.
# NOTE(review): the original line was cut off after the W2 term; the b2 term
# below is the presumed continuation -- confirm against the original post.
l2_loss = (
    tf.nn.l2_loss(W1) + tf.nn.l2_loss(b1)
    + tf.nn.l2_loss(W2) + tf.nn.l2_loss(b2)
)
cross_entropy = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=y_, logits=y))
# beta scales the penalty relative to the data term.
regularized_cross_entropy = cross_entropy + beta * l2_loss
The bias in this code is not a concept that comes from L2 regularization itself.
It is the bias (offset) term added in each layer of the neural network, which lets a unit produce a non-zero value even when its weighted input is zero.
My objective is to train an autoencoder with SGD. Using TensorFlow 1.x, I added L1 regularization to my loss function like this:
# Single-hidden-layer autoencoder trained on reconstruction error plus an L1
# penalty on the two weight matrices (biases are not penalised).
# X_Train, n_hidden, activation and learning_rate are defined elsewhere.
beta = 10e-3  # L1 strength; note that 10e-3 == 0.01
n_inputs = X_Train.shape[1]
n_outputs = n_inputs  # the network reconstructs its own input
X = tf.placeholder(tf.float32, shape=[None, n_inputs])

weights1 = tf.get_variable("weights1", shape=[n_inputs, n_hidden], dtype=tf.float32, initializer=tf.contrib.layers.variance_scaling_initializer())
weights2 = tf.get_variable("weights2", shape=[n_hidden, n_outputs], dtype=tf.float32, initializer=tf.contrib.layers.variance_scaling_initializer())
biases1 = tf.get_variable("biases1", shape=[n_hidden], initializer=tf.zeros_initializer())
biases2 = tf.get_variable("biases2", shape=[n_outputs], initializer=tf.zeros_initializer())

hidden = activation(tf.matmul(X, weights1) + biases1)
outputs = tf.matmul(hidden, weights2) + biases2

# Mean squared reconstruction error.
reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))
# L1 penalty: beta * (sum |weights1| + sum |weights2|).
reg_loss = beta * (tf.reduce_sum(tf.abs(weights1)) + tf.reduce_sum(tf.abs(weights2)))
loss = reconstruction_loss + reg_loss

# minimize() is a method of the optimizer itself; the original
# `.optimizer.minimize(...)` fails because AdamOptimizer has no `optimizer`
# attribute.
training_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
init = tf.global_variables_initializer()
After training, I counted the number of zeros in the weights1 matrix and found that all weights1[i][j] ≠ 0. What is the problem?
L1 regularization makes the network push useless weights toward 0, but it does not assign 0 to them by brute force. They will end up near 0, but not exactly at 0.
See: "Is the L1 regularization in Keras/Tensorflow *really* L1-regularization?"
for detailed answers.
I tried to train an ANN model in two ways: using explicit matrix multiplication and using tf.layers.dense(). I got different results: the model built with matrix multiplication cannot optimize the loss function (the loss increases). What is the difference between the two methods?
ANN model using the matrix multiplication
# Hand-built network: three 64-unit ReLU layers on a 4-feature input `x`
# (defined elsewhere), followed by a 3-way softmax output.
# NOTE: every weight matrix and bias starts at zero, so each relu(...) below
# evaluates to zero at initialisation.

# Layer 1: 4 -> 64.
W1 = tf.Variable(tf.zeros([4,64]))
b1 = tf.Variable(tf.zeros([64]))
y1 = tf.nn.relu(tf.matmul(x, W1) + b1)
# Layer 2: 64 -> 64.
W2 = tf.Variable(tf.zeros([64,64]))
b2 = tf.Variable(tf.zeros([64]))
y2 = tf.nn.relu(tf.matmul(y1, W2) + b2)
# Layer 3: 64 -> 64.
W3 = tf.Variable(tf.zeros([64,64]))
b3 = tf.Variable(tf.zeros([64]))
y3 = tf.nn.relu(tf.matmul(y2, W3) + b3)
# Output layer: 64 -> 3 class probabilities.
W4 = tf.Variable(tf.zeros([64,3]))
b4 = tf.Variable(tf.zeros([3]))
y_out = tf.nn.softmax(tf.matmul(y3, W4) + b4)
ANN model using tf.layers.dense()
# Stack six identical 64-unit ReLU layers on top of the input `x`, then map
# the result to 3 softmax outputs. tf.layers.dense creates and initialises
# each layer's kernel and bias itself.
hidden = x
for _ in range(6):
    hidden = tf.layers.dense(hidden, 64, activation=tf.nn.relu)
y_out = tf.layers.dense(hidden, 3, activation=tf.nn.softmax)
You are initializing the weights with zeros, which effectively prevents the network from learning anything: the hidden layers always output zero, so the gradient with respect to every weight matrix is always zero.
Initialize your weights with random values instead, for example drawn from a uniform or Gaussian distribution with a small range (less than 0.1).
I use a pre-trained network from Tensorflow-Hub and pass the outcoming vector through 2 fully connected layers. I initialize the weight matrices with He-initialization and the biases with 0.
The loss function is behaving strangely. Training does update the weight matrices somewhat, but it mainly updates the biases.
Does anybody know how to improve the learning?
Thanks in advance!
# Binary classifier head on top of a pre-trained TF-Hub feature extractor:
# features -> ReLU hidden layer -> ReLU hidden layer -> sigmoid output.
# X, hub, Constants and config are defined elsewhere.
with tf.name_scope('tf_hub'):
    module = hub.Module("https://tfhub.dev/google/imagenet/pnasnet_large/feature_vector/2")
    tf_hub_features = module(X)  # Features with shape [batch_size, num_features].

# He initialisation: variance scaled by 2 / fan_in, drawn from a normal
# distribution.
he_initializer = tf.contrib.layers.variance_scaling_initializer(factor=2.0, mode='FAN_IN', uniform=False)

with tf.name_scope('Hidden1'):
    # tf.get_variable needs an explicit variable name; the original calls
    # were cut off before their closing arguments.
    W1 = tf.get_variable("W1", shape=[Constants.PNAS_NET2_NB_FEATURES, config["h1_nb_units"]],
                         initializer=he_initializer)
    # W1 = tf.Variable(tf.random_normal([Constants.PNAS_NET2_NB_FEATURES, config["h1_nb_units"]]), name="W1")
    tf.summary.histogram("W1", W1)
    b1 = tf.Variable(tf.zeros([config["h1_nb_units"]]), name="b1")
    tf.summary.histogram("b1", b1)
    o1 = tf.nn.relu(tf.matmul(tf_hub_features, W1) + b1, name="o1")
    # NOTE(review): tf.layers.dropout's `rate` is the fraction of units to
    # drop, not the keep probability -- confirm before re-enabling.
    # dropout1 = tf.layers.dropout(inputs=o1, rate=config["keep_probability"], name="dropout1")

with tf.name_scope('Hidden2'):
    W2 = tf.get_variable("W2", shape=[config["h1_nb_units"], config["h2_nb_units"]],
                         initializer=he_initializer)
    tf.summary.histogram("W2", W2)
    b2 = tf.Variable(tf.zeros([config["h2_nb_units"]]), name="b2")
    tf.summary.histogram("b2", b2)
    o2 = tf.nn.relu(tf.matmul(o1, W2) + b2, name="o2")

with tf.name_scope('Y'):
    WY = tf.get_variable("WY", shape=[config["h2_nb_units"], config["output_dim"]],
                         initializer=he_initializer)
    tf.summary.histogram("WY", WY)
    bY = tf.Variable(tf.zeros([config["output_dim"]]), name="bY")
    tf.summary.histogram("bY", bY)
    Y_star = tf.add(tf.matmul(o2, WY), bY, name="Y_star")  # logits
    Y = tf.nn.sigmoid(Y_star, name="Y")  # predicted probability

with tf.name_scope('loss'):
    Y_ = tf.placeholder(tf.float32, shape=(None, 1), name="Y_")
    # The prediction tensor built above is `Y`; the original referenced an
    # undefined `Y_hat`.
    loss = tf.losses.log_loss(Y_, Y)

optimizer = tf.train.AdamOptimizer(config["learning_rate"])
train_step = optimizer.minimize(loss)
The answer is quite simple: I had an error in how I fed the input. The values being fed were all zeros and some ones, so there were only minor changes in the weights. I suppose the bias still adjusted because it learns something like the "mean" in linear regression.
I am new to tensorflow and have tried to implement a simple one-layer linear network similar to https://www.tensorflow.org/get_started/mnist/beginners
# Single-layer linear classifier: logits = x . W1 + b1, trained with softmax
# cross-entropy. IN_SIZE and OUT_SIZE are defined elsewhere.
x = tf.placeholder(tf.float32, shape=[None, IN_SIZE], name="input")
y_ = tf.placeholder(tf.float32, shape=[None, OUT_SIZE], name="target")

# Parameters start at zero.
W1 = tf.Variable(tf.zeros([IN_SIZE, OUT_SIZE]), name="Weight1")
b1 = tf.Variable(tf.zeros([OUT_SIZE]), name="bias1")

# Logits; the softmax is applied inside the loss op.
y = tf.add(tf.matmul(x, W1), b1)

# Average the per-example cross-entropy over the batch.
per_example_loss = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)
cross_entropy = tf.reduce_mean(per_example_loss)
train_step = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(cross_entropy)
The program works as expected and I have no problem with it. However, when I tried to add another layer, I found that the learnt W1, b1 and W2 are all zero matrices, and only the bias b2 contains nonzero values. Below is my modified network:
# Two-layer variant: linear layer -> ReLU -> linear layer, trained with
# softmax cross-entropy. IN_SIZE, L1_SIZE and OUT_SIZE are defined elsewhere.
# NOTE: both weight matrices and both biases start at zero, so the ReLU
# output below is zero at initialisation.
x = tf.placeholder(tf.float32, [None, IN_SIZE], name="input")
# First layer parameters (IN_SIZE -> L1_SIZE).
W1 = tf.Variable(tf.zeros([IN_SIZE, L1_SIZE]), name="Weight1")
b1 = tf.Variable(tf.zeros([L1_SIZE]), name="bias1")
y = tf.matmul(x, W1) + b1
# Second layer parameters (L1_SIZE -> OUT_SIZE).
W2 = tf.Variable(tf.zeros([L1_SIZE, OUT_SIZE]), name="Weight2")
b2 = tf.Variable(tf.zeros([OUT_SIZE]), name="bias2")
# `y` is rebound twice: first to the ReLU activation, then to the logits.
y = tf.nn.relu(y)
y = tf.matmul(y, W2) + b2
# Define loss and optimizer: batch-mean softmax cross-entropy, minimised
# with Adam at a learning rate of 1e-3.
y_ = tf.placeholder(tf.float32, [None, OUT_SIZE], name="target")
cross_entropy = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
train_step = tf.train.AdamOptimizer(1e-3).minimize(cross_entropy)
The problem is that if you initialize the weight matrices before a ReLU with zeros, the gradients with respect to those weights will always be zero and no learning will happen. You need to use random initialization.
I am trying to learn TensorFlow. I am working through a basic example: model the equation y = x + 0.1, train it using a neural net, and then make predictions. I am actually taking a sigmoid approach (not ideal), so I am not using the standard softmax/relu way (which didn't work for me). The code runs, but the answer is wrong: all predictions in a batch are nearly identical, like y_ = [[0.356], [0.356], [0.356], [0.356]] for input = [[0.1, 0.2, 0.3, 0.4]]. What am I doing wrong? Code is below:
import tensorflow as tf
import numpy as np
epochs = 1000

# For equation y = b + 0.1, sample data below
myImportedDatax_np = np.array([[.1], [.2], [.3], [.4]], dtype=float)
myImportedDatay_np = np.array([[.2], [.3], [.4], [.5]], dtype=float)

c = tf.constant(0.1, name='c')
b = tf.placeholder(tf.float32, [None, 1], name='b')  # network input
y = tf.add(b, c, name='y')  # the target relation; not used by the training graph
y_true = tf.placeholder(tf.float32, [None, 1], name='y_true')

# One hidden layer of 3 sigmoid units, then a single sigmoid output.
W1 = tf.Variable(tf.random_normal([1, 3], stddev=0.03), name='W1')
b1 = tf.Variable(tf.random_normal([3]), name='b1')
W2 = tf.Variable(tf.random_normal([3, 1], stddev=0.03), name='W2')
b2 = tf.Variable(tf.random_normal([1]), name='b2')

hidden_out = tf.add(tf.matmul(b, W1), b1)
hidden_out = tf.sigmoid(hidden_out)
y_ = tf.sigmoid(tf.add(tf.matmul(hidden_out, W2), b2))

# Mean squared error between prediction and target.
cost = tf.reduce_mean(tf.square(y_ - y_true))
optimiser = tf.train.GradientDescentOptimizer(0.005).minimize(cost)

# tf.initialize_all_variables is deprecated in favour of
# tf.global_variables_initializer.
init_op = tf.global_variables_initializer()

with tf.Session() as sess:
    # initialise the variables -- without this run the first training step
    # fails on uninitialised variables
    sess.run(init_op)
    for epoch in range(epochs):
        _, cost_now = sess.run([optimiser, cost], {b: myImportedDatax_np, y_true: myImportedDatay_np})
    print("Predicted values are:")
    print(sess.run(y_, {b: myImportedDatax_np}))
There are a few things wrong with your code:
1. Yours is a regression problem, y = x + c, so remove the sigmoid on the output:
y_ = tf.add(tf.matmul(hidden_out, W2), b2)
2. You would be better served by a simpler hidden layer: using multiple hidden units for such a simple task means the network needs to train for longer.
3. To handle point 2, increase the number of epochs to a higher value, say 10000, and raise the learning rate as well, say to 0.1.
Adding the code:
# increased the number of epochs
epochs = 10000

# For equation y = b + 0.1, sample data below
myImportedDatax_np = np.array([[.1], [.2], [.3], [.4]], dtype=float)
myImportedDatay_np = np.array([[.2], [.3], [.4], [.5]], dtype=float)

c = tf.constant(0.1, name='c')
b = tf.placeholder(tf.float32, [None, 1], name='b')  # network input
y = tf.add(b, c, name='y')  # the target relation; not used by the training graph
y_true = tf.placeholder(tf.float32, [None, 1], name='y_true')

W1 = tf.Variable(tf.random_normal([1, 3], stddev=0.03), name='W1')
b1 = tf.Variable(tf.random_normal([3]), name='b1')
W2 = tf.Variable(tf.random_normal([3, 1], stddev=0.03), name='W2')
b2 = tf.Variable(tf.random_normal([1]), name='b2')

hidden_out = tf.add(tf.matmul(b, W1), b1)
hidden_out = tf.sigmoid(hidden_out)

# Removed the activation: a regression output should be linear
y_ = tf.add(tf.matmul(hidden_out, W2), b2)
cost = tf.reduce_mean(tf.square(y_ - y_true))

# changed the learning rate
optimiser = tf.train.GradientDescentOptimizer(0.1).minimize(cost)
init_op = tf.global_variables_initializer()

# Training and prediction, following the same session pattern as the
# question's code.
with tf.Session() as sess:
    sess.run(init_op)
    for epoch in range(epochs):
        sess.run(optimiser, {b: myImportedDatax_np, y_true: myImportedDatay_np})
    print("Predicted values are:")
    print(sess.run(y_, {b: myImportedDatax_np}))

# Predicted values are (as reported for the original run; the weights are
# randomly initialised, so exact numbers vary):
# [[ 0.19917184]
# [ 0.30153054]
# [ 0.40164429]
# [ 0.4976812 ]]