I am trying to learn Tensorflow. I am doing a basic example- model the equation y = x + 0.1, train it using a neural net, and then make predictions. I am actually taking a sigmoid approach (not ideal), so not using the standard softmax/relu way (which didn't work for me). The code runs, but the answer is wrong: all predictions in a batch give nearly identical answers, like y_true = [[0.356], [0.356], [0.356],[0.356]], for input= [[0.1, 0.2, 0.3, 0.4]]. What am I doing wrong? Code is below:
import tensorflow as tf
import numpy as np
# Number of full-batch gradient-descent steps.
epochs = 1000

# Four (x, y) samples of the relation y = x + 0.1, one sample per row.
myImportedDatax_np = np.array([[.1], [.2], [.3], [.4]], dtype=float)
myImportedDatay_np = np.array([[.2], [.3], [.4], [.5]], dtype=float)

# Graph form of the target relation. `y` is built here but is not used by the
# cost below; training is driven by the labels fed through `y_true`.
c = tf.constant(0.1, name='c')
b = tf.placeholder(tf.float32, [None, 1], name='b')
y = tf.add(b, c, name='y')
y_true = tf.placeholder(tf.float32, [None, 1], name='y_true')

# Parameters of a 1 -> 3 -> 1 network.
W1 = tf.Variable(tf.random_normal([1, 3], stddev=0.03), name='W1')
b1 = tf.Variable(tf.random_normal([3]), name='b1')
W2 = tf.Variable(tf.random_normal([3, 1], stddev=0.03), name='W2')
b2 = tf.Variable(tf.random_normal([1]), name='b2')

# Forward pass: sigmoid hidden layer followed by a sigmoid output unit.
hidden_out = tf.sigmoid(tf.add(tf.matmul(b, W1), b1))
y_ = tf.sigmoid(tf.add(tf.matmul(hidden_out, W2), b2))

# Mean squared error, minimised with plain gradient descent.
cost = tf.reduce_mean(tf.square(y_ - y_true))
optimiser = tf.train.GradientDescentOptimizer(0.005).minimize(cost)

init_op = tf.initialize_all_variables()

# The same four samples are fed on every step.
training_feed = {b: myImportedDatax_np, y_true: myImportedDatay_np}

with tf.Session() as sess:
    # Initialise the variables before the first training step.
    sess.run(init_op)
    for epoch in range(epochs):
        _, cost_now = sess.run([optimiser, cost], feed_dict=training_feed)
    print("Predicted values are:")
    print(sess.run(y_, feed_dict={b: myImportedDatax_np}))
There are a few things wrong with your code:
Yours is a regression problem, y = x + c, so remove the sigmoid output:
# Linear output layer: no activation is applied to the final matmul + bias.
y_ = tf.add(tf.matmul(hidden_out, W2), b2)
You will be better served by a single hidden layer, your multiple hidden unit for such a simple task will require it to train it longer.
To handle point 2, increase the number of epochs to a higher value, say 10000, and raise the learning rate as well, say to 0.1.
EDIT:
Adding the code:
# Corrected graph for fitting y = x + 0.1: linear output layer, more training
# steps and a larger learning rate than the version in the question.

# Increased the number of epochs (was 1000).
epochs = 10000

# Sample data for the relation y = x + 0.1, one sample per row.
myImportedDatax_np = np.array([[.1],[.2],[.3],[.4]],dtype=float)
myImportedDatay_np = np.array([[.2],[.3],[.4],[.5]],dtype=float)

# Graph form of the target relation; `y` is not used by the cost below.
c = tf.constant(0.1, name='c')
b = tf.placeholder(tf.float32, [None, 1], name='b')
y = tf.add(b, c, name='y')
y_true = tf.placeholder(tf.float32, [None, 1], name='y_true')

# Parameters of the 1 -> 3 -> 1 network.
W1 = tf.Variable(tf.random_normal([1, 3], stddev=0.03), name='W1')
b1 = tf.Variable(tf.random_normal([3]), name='b1')
W2 = tf.Variable(tf.random_normal([3, 1], stddev=0.03), name='W2')
b2 = tf.Variable(tf.random_normal([1]), name='b2')

# Sigmoid hidden layer.
hidden_out = tf.add(tf.matmul(b, W1), b1)
hidden_out = tf.sigmoid(hidden_out)

# Removed the output activation: this is a regression problem.
y_ = tf.add(tf.matmul(hidden_out, W2), b2)

# Mean squared error (the closing parenthesis was missing in the original).
cost = tf.reduce_mean(tf.square(y_ - y_true))

# Changed the learning rate (was 0.005).
optimiser = tf.train.GradientDescentOptimizer(0.1).minimize(cost)
init_op = tf.global_variables_initializer()

#Predicted values are:
#[[ 0.19917184]
#[ 0.30153054]
#[ 0.40164429]
#[ 0.4976812 ]]
Related
Hi guys, I'm a newbie in deep learning and I tried to build my own object detection model, MobileNetV3 + SSDLite, trained on the Pascal VOC 2007 dataset from TensorFlow Datasets (tfds).
However, its performance is poor (mAP 0.3~0.4), and I wonder why the mAP is so low.
The reasons I thought of it:
Training data problem
# Load Pascal VOC 2007 through TensorFlow Datasets (tfds).
# Training set: the "train" and "validation" splits, concatenated into one dataset.
(train_dataset, train_dataset2), ds_info = tfds.load(name="voc/2007", split=["train", "validation"], with_info=True)
train_dataset = train_dataset.concatenate(train_dataset2)
# Evaluation set: the "test" split.
val_dataset = tfds.load(name="voc/2007", split="test", with_info=False)
Is there any difference between tfds voc2007 and official voc2007 dataset?
Detection model problem
I use imagenet pretrained mobilenetv3 large as a backbone and extract feature from ("multiply_11", "multiply_17") layers which resolution is 19x19 and 10x10
# 300x300 RGB input tensor shared with the backbone.
input_tensor = Input((300, 300, 3))
# MobileNetV3-Large without its classification head (include_top=False),
# using width multiplier alpha=0.75.
backbone = tf.keras.applications.MobileNetV3Large(include_top=False, alpha=0.75, input_tensor = input_tensor, input_shape = (300, 300, 3))
And extract extra feature map as:
def InvertedResidualBlock(filters, kernel_size, strides, padding):
    """Build a callable that chains six freshly created Keras layers.

    The chain is: 1x1 Conv2D with ``filters // 2`` channels (L2 kernel
    regulariser 4e-5) -> BatchNormalization -> ReLU(6) -> SeparableConv2D
    with ``filters`` channels, ``kernel_size`` and ``strides`` ->
    BatchNormalization -> ReLU(6).

    Args:
        filters: output channels of the separable convolution; the 1x1
            convolution uses half of this.
        kernel_size: kernel size of the separable convolution.
        strides: strides of the separable convolution.
        padding: padding mode passed to both convolutions.

    Returns:
        A callable that forwards its arguments to the first layer and feeds
        each layer's output to the next one.
    """
    # No depthwise/pointwise regulariser is set on the separable convolution.
    stages = (
        Conv2D(filters=filters // 2, kernel_size=1, strides=1,
               padding=padding, kernel_regularizer=l2(4e-5)),
        BatchNormalization(),
        ReLU(6.),
        SeparableConv2D(filters=filters, kernel_size=kernel_size,
                        strides=strides, padding=padding),
        BatchNormalization(),
        ReLU(6.),
    )

    def apply(*args, **kwargs):
        # Only the first layer receives the caller's arguments verbatim.
        outputs = stages[0](*args, **kwargs)
        for stage in stages[1:]:
            outputs = stage(outputs)
        return outputs

    return apply
class HFPNeckBuilder():
    """Extend two backbone feature maps with four extra stride-2 feature maps."""

    def __init__(self, config) -> None:
        """Read the neck settings from ``config["model_config"]["neck"]``.

        ``isLite`` selects the separable variants of the base convolution
        helpers; otherwise the regular ones are used.
        """
        lite = config["model_config"]["neck"]["isLite"]
        self.isLite = lite
        self.baseConvBlock = SeparableConvBlock if lite else ConvBlock
        self.baseConv = SeparableConv if lite else Conv

    def __call__(self, ex_stage_output):
        """Return six feature maps built from the backbone outputs.

        Args:
            ex_stage_output: indexable collection of backbone outputs; only
                its first and last entries are used.

        Returns:
            A list holding the first entry, the last entry, and four maps
            obtained by chaining InvertedResidualBlock (stride 2, kernel 3,
            "same" padding) with 512, 256, 256 and 128 filters.
        """
        feature_maps = [ex_stage_output[0], ex_stage_output[-1]]
        for filters in (512, 256, 256, 128):
            block = InvertedResidualBlock(filters=filters, strides=2, kernel_size=3, padding="same")
            # Each extra map is computed from the previous one.
            feature_maps.append(block(feature_maps[-1]))
        return feature_maps
Is there any problem in backbone and neck(feature extractor which was mentioned in MobileNetV2 paper)
3.optimizer and lr schedule problem
I had experimented all combination of below setting:
Batch Size:32
Optimizer: SGD(momentum 0.9), Adam, RAdam, RAdam+LookAhead, RAdam+LookAhead+GradientCentralize
Lr Schedule: Cosine decay, Cosine restart, Cosine Warmup and decay with initial learning rate 0.1 ~1e-4
epochs: 150epochs ~ 5000epochs
But all experiments shows poor mAP(0.3~0.4).
Loss Function Problem
I tried two loss functions: Hard Negative Mining + Smooth L1 loss, and Focal loss + Smooth L1 loss. I referred to the official Keras example code for the focal loss and to pierluigiferrari's GitHub repository for Hard Negative Mining.
Here is my focal loss code
class SSDLoss(tf.losses.Loss):
    '''
    SSD training loss: focal classification loss plus Smooth L1 box loss.

    Focal loss is used instead of hard negative mining; adapted from the
    Keras "Object Detection with RetinaNet" example.

    gamma is the focusing exponent: it down-weights entries that are already
    well classified. With gamma == 0 the focal term reduces to alpha-weighted
    sigmoid cross-entropy.
    alpha is the weight applied where the one-hot label is 1, and (1 - alpha)
    is applied everywhere else. With alpha = 0.25: foreground entries 0.25,
    background entries 0.75.
    '''

    def __init__(self, num_classes=80, alpha=0.25, gamma=2.0, config = None):
        """Store the loss hyper-parameters.

        Args:
            num_classes: depth of the one-hot class encoding.
            alpha: focal-loss weight for entries whose label is 1.
            gamma: focal-loss focusing exponent.
            config: passed to AnchorBox to generate the anchors.
        """
        super(SSDLoss, self).__init__(reduction="auto", name="SSDLoss")
        self.alpha = alpha
        self.gamma = gamma
        self._num_classes = num_classes
        # Anchors with a leading axis added; not read by the methods below.
        self.anchor_boxes = AnchorBox(config).get_anchors()[None, ...]
        self._box_variance = [0.1, 0.1, 0.2, 0.2]

    def call(self, y_true, y_pred):
        """Compute the per-sample loss.

        Args:
            y_true: rank-3 tensor; the last axis holds four box targets
                followed by the class id. Ids greater than -1 mark positive
                anchors and -2 marks ignored anchors (presumably -1 is
                background; confirm against the label encoder).
            y_pred: rank-3 tensor; the last axis holds four box predictions
                followed by the class logits.

        Returns:
            Classification loss plus box loss, each summed over anchors and
            divided by the number of positive anchors (0 where there are none).
        """
        y_pred = tf.cast(y_pred, dtype=tf.float32)
        box_labels = y_true[:, :, :4]
        box_predictions = y_pred[:, :, :4]
        cls_labels = tf.one_hot(
            tf.cast(y_true[:, :, 4], dtype=tf.int32),
            depth=self._num_classes,
            dtype=tf.float32,
        )
        cls_predictions = y_pred[:, :, 4:]
        positive_mask = tf.cast(tf.greater(y_true[:, :, 4], -1.0), dtype=tf.float32)
        ignore_mask = tf.cast(tf.equal(y_true[:, :, 4], -2.0), dtype=tf.float32)
        clf_loss = self.Focal_ClassificationLoss(cls_labels, cls_predictions)
        box_loss = self.SmoothL1_BoxLoss(box_labels, box_predictions)
        # Ignored anchors contribute no classification loss; only positive
        # anchors contribute box loss.
        clf_loss = tf.where(tf.equal(ignore_mask, 1.0), 0.0, clf_loss)
        box_loss = tf.where(tf.equal(positive_mask, 1.0), box_loss, 0.0)
        normalizer = tf.reduce_sum(positive_mask, axis=-1)
        clf_loss = tf.math.divide_no_nan(tf.reduce_sum(clf_loss, axis=-1), normalizer)
        box_loss = tf.math.divide_no_nan(tf.reduce_sum(box_loss, axis=-1), normalizer)
        loss = clf_loss + box_loss
        return loss

    def SmoothL1_BoxLoss(self, y_true_Box, y_pred_box):
        """Smooth L1 loss with threshold 1, summed over the last axis.

        Per coordinate: 0.5 * d**2 where |d| < 1, otherwise |d| - 0.5.
        """
        difference = y_true_Box - y_pred_box
        # Compare the raw |d| with the threshold. Subtracting 0.5 before the
        # comparison moved the switch point to 1.5 and made the loss
        # discontinuous there.
        absolute_difference = tf.abs(difference)
        squared_difference = 0.5 * difference ** 2
        loss = tf.where(
            tf.less(absolute_difference, 1.0),
            squared_difference,
            absolute_difference - 0.5)
        return tf.reduce_sum(loss, axis=-1)

    def Focal_ClassificationLoss(self, y_true_Cls, y_pred_Cls):
        """Sigmoid focal loss, summed over the last (class) axis.

        Args:
            y_true_Cls: one-hot class labels.
            y_pred_Cls: class logits.
        """
        cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_true_Cls, logits=y_pred_Cls)
        probs = tf.nn.sigmoid(y_pred_Cls)
        # alpha where the label is 1, (1 - alpha) elsewhere.
        alpha = tf.where(tf.equal(y_true_Cls, 1.0), self.alpha, (1.0 - self.alpha))
        # pt is the probability assigned to the labelled outcome of each entry.
        pt = tf.where(tf.equal(y_true_Cls, 1.0), probs, 1 - probs)
        loss = alpha * tf.pow(1.0 - pt, self.gamma) * cross_entropy
        return tf.reduce_sum(loss, axis=-1)
I changed it a little bit from the keras official website example code in "Object Detection with RetinaNet".
Question
Is there any advice for improving mAP up to 0.6~0.7? thank you for reading my question.
Background
I am a newbie to TensorFlow and I am trying to understand the basics of deep learning. I started from writing a two-layer neural network from scratch and it achieved 89% accuracy on MNIST dataset and now I am trying to implement the same network in TensorFlow and compare their performance.
Problem
I am not sure if I miss something basic in the code, but the following implementation seems to be unable to update weights and therefore could not output anything meaningful.
# Two-layer MNIST classifier: 784 -> num_hidden (ReLU) -> 10 (softmax).
num_hidden = 100

# Flattened input images, shape (batch_size, 784).
x = tf.placeholder(tf.float32, [None, 784])

# All parameters start at zero.
W1 = tf.Variable(tf.zeros((784, num_hidden)))
b1 = tf.Variable(tf.zeros((1, num_hidden)))
W2 = tf.Variable(tf.zeros((num_hidden, 10)))
b2 = tf.Variable(tf.zeros((1, 10)))

# Hidden activations, shape (batch_size, num_hidden).
z = tf.nn.relu(tf.matmul(x, W1) + b1)
# Class probabilities, shape (batch_size, 10).
y = tf.nn.softmax(tf.matmul(z, W2) + b2)
# One-hot labels, shape (batch_size, 10).
y_ = tf.placeholder(tf.float32, [None, 10])

# Cross-entropy summed over the batch; 1e-10 keeps log() away from zero.
cross_entropy = -tf.reduce_sum(y_ * tf.log(y+1e-10))
train_step = tf.train.GradientDescentOptimizer(0.05).minimize(cross_entropy)

sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

# Fraction of rows whose arg-max prediction equals the arg-max label.
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

for epoch in range(1000):
    # batch_xs -> (100, 784); batch_ys -> (100, 10), one-hot encoded
    batch_xs, batch_ys = mnist.train.next_batch(100)
    sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

# NOTE(review): the original indentation was lost; the evaluation is placed
# after the training loop here - confirm it was not meant to run every step.
print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))

# Pull the learned parameters out as numpy arrays before closing the session.
W1_e, b1_e, W2_e, b2_e = sess.run([W1, b1, W2, b2])
sess.close()
What I Have Done
I checked the official docs and many other implementations, but I am totally confused, since they use different versions and the API varies greatly.
So could someone help me, thank you in advance.
There are two problems with what you have done so far. First, you have initialised all of the weights to zero, which prevents the network from learning. Second, the learning rate was too high. The code below got me 0.9665 accuracy. For why you should not set all the weights to zero, see here.
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
# MNIST with one-hot labels, read from (or downloaded to) "MNIST_data/".
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

num_hidden = 100

# Flattened images, shape (batch_size, 784), and one-hot labels, (batch_size, 10).
x = tf.placeholder(tf.float32, [None, 784])
label_place = tf.placeholder(tf.float32, [None, 10])

# All-zero initialisation does NOT work (accuracy stays at chance, ~0.1):
# W1 = tf.Variable(tf.zeros((784, num_hidden)))
# b1 = tf.Variable(tf.zeros((1, num_hidden)))
# W2 = tf.Variable(tf.zeros((num_hidden, 10)))
# b2 = tf.Variable(tf.zeros((1, 10)))

# Random weights with zero biases do work, although more than 1000 training
# steps are needed.
W1 = tf.Variable(tf.random_normal((784, num_hidden), mean=0., stddev=0.1))
b1 = tf.Variable(tf.zeros((1, num_hidden)))
W2 = tf.Variable(tf.random_normal((num_hidden, 10), mean=0, stddev=0.1))
b2 = tf.Variable(tf.zeros((1, 10)))

# The network stops at the linear output (logits). The softmax is folded into
# tf.nn.softmax_cross_entropy_with_logits, which is more numerically stable.
z = tf.nn.relu(tf.matmul(x, W1) + b1)
logits = tf.matmul(z, W2) + b2

# Per-example cross-entropy; the learning rate is lower than in the question
# because training did not really work with the higher one.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=label_place, logits=logits)
train_step = tf.train.GradientDescentOptimizer(.001).minimize(cross_entropy)

sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

# Fraction of rows whose arg-max prediction equals the arg-max label.
predicted_class = tf.argmax(tf.nn.softmax(logits), 1)
correct_prediction = tf.equal(predicted_class, tf.argmax(label_place, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

for epoch in range(5000):
    batch_xs, batch_ys = mnist.train.next_batch(100)
    sess.run(train_step, feed_dict={x: batch_xs, label_place: batch_ys})

# NOTE(review): the original indentation was lost; the evaluation is placed
# after the training loop here - confirm it was not meant to run every step.
print(sess.run(accuracy, feed_dict={x: mnist.test.images, label_place: mnist.test.labels}))

# Pull the learned parameters out as numpy arrays before closing the session.
W1_e, b1_e, W2_e, b2_e = sess.run([W1, b1, W2, b2])
sess.close()
I am trying to implement a tensor flow LSTM regression model for a list of inputs number.
example:
input_data = [1, 2, 3, 4, 5]
time_steps = 2
-> X == [[1, 2], [2, 3], [3, 4]]
-> y == [3, 4, 5]
The code is below:
# Window length and LSTM hidden-state size.
TIMESTEPS = 20
num_hidden = 20

# load_data() is defined elsewhere; its two results are indexed by 'train'
# and 'test' below.
Xd, yd = load_data()
# Training windows reshaped to (samples, 20 time steps, 1 feature).
train_input = Xd['train'].reshape(-1, 20, 1)
train_output = yd['train']
# train_input = [[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],..
# train_output = [[21],[22],[23]....
test_input = Xd['test']
test_output = yd['test']

X = tf.placeholder(tf.float32, [None, 20, 1])
y = tf.placeholder(tf.float32, [None, 1])

cell = tf.nn.rnn_cell.LSTMCell(num_hidden, state_is_tuple=True)
val, state = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)
# The tf.Print wrappers log debugging values the first 7 times they run.
val = tf.Print(val, [tf.argmax(val, 1)], 'argmax(val)=', summarize=20, first_n=7)
# Swap the first two axes so that the time axis comes first.
val = tf.transpose(val, [1, 0, 2])
val = tf.Print(val, [tf.argmax(val, 1)], 'argmax(val2)=', summarize=20, first_n=7)
# Keep only the output of the last time step.
last = tf.gather(val, int(val.get_shape()[0]) - 1)
last = tf.Print(last, [tf.argmax(last, 1)], 'argmax(val3)=', summarize=20, first_n=7)

# Output projection: num_hidden -> number of target columns.
weight = tf.Variable(tf.truncated_normal([num_hidden, int(y.get_shape()[1])]))
bias = tf.Variable(tf.constant(0.1, shape=[y.get_shape()[1]]))
prediction = tf.matmul(last, weight) + bias

# Cost: batch mean of -sum(y * log(prediction)) over the target axis.
cost = tf.reduce_mean(-tf.reduce_sum(y * tf.log(prediction), reduction_indices=[1]))
#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
optimizer = tf.train.AdamOptimizer()
minimize = optimizer.minimize(cost)

from tensorflow.python import debug as tf_debug

inita = tf.initialize_all_variables()
sess = tf.Session()
sess.run(inita)

batch_size = 100
no_of_batches = int(len(train_input)/batch_size)
epoch = 10
test_size = 100

for i in range(epoch):
    # Full batches only; a trailing partial batch is skipped.
    for start in range(0, len(train_input) - batch_size + 1, batch_size):
        end = start + batch_size
        sess.run(minimize, feed_dict={X: train_input[start:end], y: train_output[start:end]})

    # Evaluate on a random subset of at most test_size test samples.
    test_indices = np.arange(len(test_input))
    np.random.shuffle(test_indices)
    test_indices = test_indices[0:test_size]
    # NOTE(review): if test_output has a single column like the train_output
    # example shown earlier in this script, argmax over axis 1 is always 0 -
    # confirm this is the intended reference value.
    print (i, mean_squared_error(np.argmax(test_output[test_indices], axis=1), sess.run(prediction, feed_dict={X: test_input[test_indices]})))

print ("predictions", prediction.eval(feed_dict={X: train_input}, session=sess))
y_pred = prediction.eval(feed_dict={X: test_input}, session=sess)
sess.close()

# Plot the targets (red) against the predictions (blue) over the test set.
test_size = test_output.shape[0]
ax = np.arange(0, test_size, 1)
plt.plot(ax, test_output, 'r', ax, y_pred, 'b')
plt.show()
But I am not able to minimize the cost: the calculated MSE increases at each step instead of decreasing.
I suspect there is a problem with the cost function that I am using.
Any thoughts or suggestions as to what I am doing wrong?
Thanks
As mentioned in the comment, you had to change your loss function to the MSE function and reduce your learning rate. Is your error converging to zero ?
The graph building phase passes without error, but the program freezes (no reading hard drive, no memory change, no ...) during sess.run() in the first mini-batch in the first epoch. If I remove this layer or replace it with tf.contrib.layers.layer_norm, the program runs without issues.
The tensor (x) I pass into tf.layers.batch_normalization has the shape [#batches, 200]. I use most default values, but turned off the center and scale.
# Batch normalisation over the last axis of x, with the learned offset
# (center) and scale switched off; the trailing comments on epsilon, center,
# scale and training record alternative values.
x_BN = tf.layers.batch_normalization(
x,
axis=-1,
momentum=0.99,
epsilon=1e-10, #0.001,
center=False, #True,
scale=False, #True,
beta_initializer=tf.zeros_initializer(),
gamma_initializer=tf.ones_initializer(),
moving_mean_initializer=tf.zeros_initializer(),
moving_variance_initializer=tf.ones_initializer(),
beta_regularizer=None,
gamma_regularizer=None,
beta_constraint=None,
gamma_constraint=None,
training=Flg_training, #False,
trainable=True,
name=None,
reuse=None,
renorm=False,
renorm_clipping=None,
renorm_momentum=0.99,
fused=False,
virtual_batch_size=None,
adjustment=None
)
The tensorflow version I'm using is tf-nightly-gpu (1.5.0-dev20171031 or 1.5.0-dev20171023). Has anyone encountered a similar problem?
Update
This happens when the input of tf.layers.batch_normalization is from tf.nn.bidirectional_dynamic_rnn, please see a simplified code to reproduce this issue:
import tensorflow as tf
import numpy as np
# Minimal reproduction: multi-layer bidirectional LSTM whose final hidden
# states feed tf.layers.batch_normalization and then a scalar affine fit.

# Learning-rate schedule and model hyper-parameters.
starter_learning_rate = 0.001
decay_steps = 100
decay_rate = 0.96
num_RNN_layers = 3
LSTM_CELL_SIZE = 100
keep_prob = 0.95

with tf.name_scope('Inputs'):
    x = tf.placeholder(dtype=tf.float32, shape=[None, 200])
    y = tf.placeholder(dtype=tf.float32, shape=[None, 200])
    length = tf.placeholder(dtype=tf.int32, shape=[None])
    Flg_training = tf.placeholder(dtype=tf.bool, shape=[])
    # Add a trailing feature axis: (batch, 200) -> (batch, 200, 1).
    x_1 = tf.expand_dims(x, -1)

with tf.name_scope('BiLSTM'):
    # One dropout-wrapped LSTM cell per layer.
    dropcells = []
    for _ in range(num_RNN_layers):
        cell_iiLyr = tf.nn.rnn_cell.LSTMCell(num_units=LSTM_CELL_SIZE, state_is_tuple=True)
        dropcells.append(
            tf.nn.rnn_cell.DropoutWrapper(cell=cell_iiLyr, output_keep_prob=keep_prob)
        )
    MultiLyr_cell = tf.nn.rnn_cell.MultiRNNCell(cells=dropcells, state_is_tuple=True)

    # The same MultiRNNCell object is passed for both directions.
    outputs, states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=MultiLyr_cell,
        cell_bw=MultiLyr_cell,
        dtype=tf.float32,
        sequence_length=length,
        inputs=x_1,
        scope="BiLSTM",
    )
    states_fw, states_bw = states
    # Final (c, h) state of the last layer in each direction.
    c_fw_lstLyr, h_fw_lstLyr = states_fw[-1]
    c_bw_lstLyr, h_bw_lstLyr = states_bw[-1]
    states_concat1 = tf.concat([h_fw_lstLyr, h_bw_lstLyr], axis=1, name='states_concat')

with tf.name_scope("cs_BN1"):
    # Normalise along the last (feature) axis of the concatenated hidden
    # states; the learned offset and scale are switched off.
    x_BN = tf.layers.batch_normalization(
        states_concat1,
        axis=-1,
        momentum=0.99,
        epsilon=1e-10,
        center=False,
        scale=False,
        beta_initializer=tf.zeros_initializer(),
        gamma_initializer=tf.ones_initializer(),
        moving_mean_initializer=tf.zeros_initializer(),
        moving_variance_initializer=tf.ones_initializer(),
        beta_regularizer=None,
        gamma_regularizer=None,
        beta_constraint=None,
        gamma_constraint=None,
        training=Flg_training,
        trainable=True,
        name="test_BN",
        reuse=None,
        renorm=False,
        renorm_clipping=None,
        renorm_momentum=0.99,
        fused=False,
        virtual_batch_size=None,
        adjustment=None,
    )

with tf.name_scope("Regression"):
    # Scalar slope and intercept, initialised to 1 and 0.
    a = tf.get_variable("a", shape=[1], dtype=tf.float32, initializer=tf.constant_initializer(1.0))
    b = tf.get_variable("b", shape=[1], dtype=tf.float32, initializer=tf.constant_initializer(0.0))

with tf.name_scope("Prediction"):
    y_pred = tf.multiply(x_BN, a) + b

with tf.name_scope('Loss'):
    # Unreduced squared error, plus its mean for reporting.
    losses = tf.losses.mean_squared_error(y, y_pred, reduction=tf.losses.Reduction.NONE)
    mean_loss = tf.reduce_mean(losses)

with tf.name_scope('Training'):
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(
        starter_learning_rate, global_step, decay_steps, decay_rate, staircase=True
    )
    # Run the collected UPDATE_OPS together with each training step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        # The optimiser is given the unreduced `losses` tensor, not `mean_loss`.
        train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(losses, global_step=global_step)

sess = tf.InteractiveSession()
train_writer = tf.summary.FileWriter("G:\\Surface_Ozone\\Temp\\", sess.graph)
sess.run(tf.global_variables_initializer())

for ii in range(2000):
    # Random batch of 20 sequences of length 200 with targets 1.5 * x + 3.
    x_in = np.random.rand(20, 200)
    y_in = x_in * 1.5 + 3.0
    length_in = np.full([20], 200, dtype=np.int32)
    step_feed = {x: x_in, Flg_training: True, y: y_in, length: length_in}
    _, mean_loss_val, a_val, b_val = sess.run([train_step, mean_loss, a, b], feed_dict=step_feed)
    # Report each of the first 50 steps, then every 100th step.
    if ii < 50 or ii % 100 == 0:
        print("step {}: {} | a: {} | b: {}".format(ii, mean_loss_val, a_val, b_val))

print("Normal End.")
I am new to tensorflow and have tried to implement a simple one-layer linear network similar to https://www.tensorflow.org/get_started/mnist/beginners
# Single linear layer trained with softmax cross-entropy.
# Input features, one row per example with IN_SIZE columns.
x = tf.placeholder(tf.float32, [None, IN_SIZE], name="input")
# Weights and bias, both initialised to zero.
W1 = tf.Variable(tf.zeros([IN_SIZE, OUT_SIZE]), name="Weight1")
b1 = tf.Variable(tf.zeros([OUT_SIZE]), name="bias1")
# Logits; the softmax is applied inside the loss below.
y = tf.matmul(x, W1) + b1
# Targets, one row per example with OUT_SIZE columns.
y_ = tf.placeholder(tf.float32, [None, OUT_SIZE], name="target")
# Mean softmax cross-entropy over the batch.
cross_entropy = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
# Adam optimiser with learning rate 1e-3.
train_step = tf.train.AdamOptimizer(1e-3).minimize(cross_entropy)
The program works as expected and I have no problem with that. However, when I tried to add another layer, I found that the learned W1, b1 and W2 are all zero matrices, and only the bias b2 contains nonzero values. Below is my modified network:
# Two-layer variant: linear -> ReLU -> linear, trained with softmax cross-entropy.
# Input features, one row per example with IN_SIZE columns.
x = tf.placeholder(tf.float32, [None, IN_SIZE], name="input")
# First layer; weights and bias are initialised to zero.
W1 = tf.Variable(tf.zeros([IN_SIZE, L1_SIZE]), name="Weight1")
b1 = tf.Variable(tf.zeros([L1_SIZE]), name="bias1")
y = tf.matmul(x, W1) + b1
# Second layer; weights and bias are also initialised to zero.
W2 = tf.Variable(tf.zeros([L1_SIZE, OUT_SIZE]), name="Weight2")
b2 = tf.Variable(tf.zeros([OUT_SIZE]), name="bias2")
# `y` is reassigned: first to the ReLU of the hidden pre-activation, then to
# the output logits.
y = tf.nn.relu(y)
y = tf.matmul(y, W2) + b2
# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, OUT_SIZE], name="target")
# Mean softmax cross-entropy over the batch.
cross_entropy = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
# Adam optimiser with learning rate 1e-3.
train_step = tf.train.AdamOptimizer(1e-3).minimize(cross_entropy)
The problem is that if you initialize the weight matrices before a relu with zeroes the gradients will always be zero and no learning will happen. You need to do random initialization.