Loss not converging in Polynomial regression in Tensorflow - tensorflow

import numpy as np
import tensorflow as tf
#input data:
x_input=np.linspace(0,10,1000)
y_input=x_input+np.power(x_input,2)
#model parameters
W = tf.Variable(tf.random_normal([2,1]), name='weight')
#bias
b = tf.Variable(tf.random_normal([1]), name='bias')
#placeholders
#X=tf.placeholder(tf.float32,shape=(None,2))
X=tf.placeholder(tf.float32,shape=[None,2])
Y=tf.placeholder(tf.float32)
x_modified=np.zeros([1000,2])
x_modified[:,0]=x_input
x_modified[:,1]=np.power(x_input,2)
#model
#x_new=tf.constant([x_input,np.power(x_input,2)])
Y_pred=tf.add(tf.matmul(X,W),b)
#algortihm
loss = tf.reduce_mean(tf.square(Y_pred -Y ))
#training algorithm
optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
#initializing the variables
init = tf.initialize_all_variables()
#starting the session session
sess = tf.Session()
sess.run(init)
epoch=100
for step in xrange(epoch):
# temp=x_input.reshape((1000,1))
#y_input=temp
_, c=sess.run([optimizer, loss], feed_dict={X: x_modified, Y: y_input})
if step%50==0 :
print c
print "Model paramters:"
print sess.run(W)
print "bias:%f" %sess.run(b)
I'm trying to implement Polynomial regression(quadratic) in Tensorflow. The loss isn't converging. Could anyone please help me out with this. The similar logic is working for linear regression though!

First there is a problem in your shapes, for Y_pred and Y:
Y has unknown shape, and is fed with an array of shape (1000,)
Y_pred has shape (1000, 1)
Y - Y_pred will then have shape (1000, 1000)
This small code will prove my point:
a = tf.zeros([1000]) # shape (1000,)
b = tf.zeros([1000, 1]) # shape (1000, 1)
print (a-b).get_shape() # prints (1000, 1000)
You should use consistent types:
y_input = y_input.reshape((1000, 1))
Y = tf.placeholder(tf.float32, shape=[None, 1])
Anyway, the loss is exploding because you have very high values (input between 0 and 100, you should normalize it) and thus very high loss (around 2000 at the beginning of training).
The gradient is very high and the parameters explode, and the loss gets to infinite.
The quickest fix is to lower your learning rate (1e-5 converges for me, albeit very slowly at the end). You can make it higher after the loss converges to around 1.

Related

Backpropagation Using Tensorflow and Numpy MSE not Dropping

I am trying to create a Backpropagation but I do not want to use the GradientDescentOptimizer from TF. I just wanted to update my own weights and biases. The problem is that the Mean Square Error or Cost is not approaching to zero. It just stays at some 0.2xxx. Is it because of my inputs which are 520x1600 (yes, each input has 1600 units and yes, there are 520 of them) or my number of neurons in the Hidden Layer is problematic? I have tried implementing this using the GradientDescentOptimizer and minimize(cost) which is working fine (Cost reduces near to zero as training goes on) but maybe I have an issue in my code of updating the weights and biases.
Here's my code:
import tensorflow as tf
import numpy as np
from BPInputs40 import pattern, desired;
#get the inputs and desired outputs, 520 inputs, each has 1600 units
train_in = pattern
train_out = desired
learning_rate=tf.constant(0.5)
num_input_neurons = len(train_in[0])
num_output_neurons = len(train_out[0])
num_hidden_neurons = 20
#weight matrix initialization with random values
w_h = tf.Variable(tf.random_normal([num_input_neurons, num_hidden_neurons]), dtype=tf.float32)
w_o = tf.Variable(tf.random_normal([num_hidden_neurons, num_output_neurons]), dtype=tf.float32)
b_h = tf.Variable(tf.random_normal([1, num_hidden_neurons]), dtype=tf.float32)
b_o = tf.Variable(tf.random_normal([1, num_output_neurons]), dtype=tf.float32)
# Model input and output
x = tf.placeholder("float")
y = tf.placeholder("float")
def sigmoid(v):
return tf.div(tf.constant(1.0),tf.add(tf.constant(1.0),tf.exp(tf.negative(v*0.001))))
def derivative(v):
return tf.multiply(sigmoid(v), tf.subtract(tf.constant(1.0), sigmoid(v)))
output_h = tf.sigmoid(tf.add(tf.matmul(x,w_h),b_h))
output_o = tf.sigmoid(tf.add(tf.matmul(output_h,w_o),b_o))
error = tf.subtract(output_o,y) #(1x35)
mse = tf.reduce_mean(tf.square(error))
delta_o=tf.multiply(error,derivative(output_o))
delta_b_o=delta_o
delta_w_o=tf.matmul(tf.transpose(output_h), delta_o)
delta_backprop=tf.matmul(delta_o,tf.transpose(w_o))
delta_h=tf.multiply(delta_backprop,derivative(output_h))
delta_b_h=delta_h
delta_w_h=tf.matmul(tf.transpose(x),delta_h)
#updating the weights
train = [
tf.assign(w_h, tf.subtract(w_h, tf.multiply(learning_rate, delta_w_h))),
tf.assign(b_h, tf.subtract(b_h, tf.multiply(learning_rate, tf.reduce_mean(delta_b_h, 0)))),
tf.assign(w_o, tf.subtract(w_o, tf.multiply(learning_rate, delta_w_o))),
tf.assign(b_o, tf.subtract(b_o, tf.multiply(learning_rate, tf.reduce_mean(delta_b_o, 0))))
]
sess = tf.Session()
sess.run(tf.global_variables_initializer())
err,target=1, 0.005
epoch, max_epochs = 0, 2000000
while epoch < max_epochs:
epoch += 1
err, _ = sess.run([mse, train],{x:train_in,y:train_out})
if (epoch%1000 == 0):
print('Epoch:', epoch, '\nMSE:', err)
answer = tf.equal(tf.floor(output_o + 0.5), y)
accuracy = tf.reduce_mean(tf.cast(answer, "float"))
print(sess.run([output_o], feed_dict={x: train_in, y: train_out}));
print("Accuracy: ", (1-err) * 100 , "%");
Update: I got it working now. The MSE dropped to almost zero once I increased the number of neurons in the hidden layer. I tried using 5200 and 6400 neurons for the hidden layer and with just 5000 epochs, the accuracy was almost 99%. Also, the largest learning rate I used is 0.1 because when above that, the MSE will not be close to zero.
I'm not an expert in this field, but it looks like your weights are updated correctly. And the fact that your MSE decreases from some higher values to 0.2xxx is the strong indicator of that. I would definitely try to run this problem with way more hidden neurons (e.g. 500)
Btw, are your inputs normalized? If not, that obviously could be the reason

Why batch_normalization in tensorflow does not give expected results?

I would like to see the output of batch_normalization layer in a small example, but apparently I am doing something wrong so I get the same output as the input.
import tensorflow as tf
import keras.backend as K
K.set_image_data_format('channels_last')
X = tf.placeholder(tf.float32, shape=(None, 2, 2, 3)) # samples are 2X2 images with 3 channels
outp = tf.layers.batch_normalization(inputs=X, axis=3)
x = np.random.rand(4, 2, 2, 3) # sample set: 4 images
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init_op)
K.set_session(sess)
a = sess.run(outp, feed_dict={X:x, K.learning_phase(): 0})
print(a-x) # print the difference between input and normalized output
The input and output of the above code are almost identical. Can anyone point out the problem to me?
Remember that batch_normalization behaves differently at train and test time. Here, you have never "trained" your batch normalization, so the moving average it has learned is random but close to 0, and the moving variance factor close to 1, so the output is almost the same as the input. If you use K.learning_phase(): 1 you'll already see some differences (because it will normalize using the batch's average and standard deviation); if you first learn on a lot of examples and then test on some other ones you'll also see the normalization occuring, because the learnt mean and standard deviation will not be 0 and 1.
To see better the effects of batch norm, I'd also suggest you to multiply your input by a big number (say 100), so that you have a clear difference between unnormalized and normalized vectors, that will help you test what's going on.
EDIT: In your code as is, it seems that the update of the moving mean and moving variance is never done. You need to make sure the update ops are run, as indicated in batch_normalization's doc. The following lines should make it work:
outp = tf.layers.batch_normalization(inputs=X, axis=3, training=is_training, center=False, scale=False)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
outp = tf.identity(outp)
Below is my full working code (I got rid of Keras because I don't know it well, but you should be able to re-add it).
import tensorflow as tf
import numpy as np
X = tf.placeholder(tf.float32, shape=(None, 2, 2, 3)) # samples are 2X2 images with 3 channels
is_training = tf.placeholder(tf.bool, shape=()) # samples are 2X2 images with 3 channels
outp = tf.layers.batch_normalization(inputs=X, axis=3, training=is_training, center=False, scale=False)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
outp = tf.identity(outp)
x = np.random.rand(4, 2, 2, 3) * 100 # sample set: 4 images
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init_op)
initial = sess.run(outp, feed_dict={X:x, is_training: False})
for i in range(10000):
a = sess.run(outp, feed_dict={X:x, is_training: True})
if (i % 1000 == 0):
print("Step %i: " %i, a-x) # print the difference between input and normalized output
final = sess.run(outp, feed_dict={X: x, is_training: False})
print("initial: ", initial)
print("final: ", final)
assert not np.array_equal(initial, final)

Simple model gets 0.0 accuracy

I am training a simple model on a dataset containing labels always equal to 0, and am getting a 0.0 accuracy.
The model is the following:
import csv
import numpy as np
import pandas as pd
import tensorflow as tf
labelsReader = pd.read_csv('data.csv',usecols = [12],header=None)
dataReader = pd.read_csv('data.csv',usecols = [1,2,3,4,5,6,7,8,9,10,11],header=None)
labels_ = labelsReader.values
data_ = dataReader.values
labels = np.float32(labels_)
data = np.float32(data_)
x = tf.placeholder(tf.float32, [None, 11])
W = tf.Variable(tf.truncated_normal([11, 1], stddev=1./11.))
b = tf.Variable(tf.zeros([1]))
y = tf.matmul(x, W) + b
# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, 1])
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()
for i in range(0, 1000):
train_step.run(feed_dict={x: data, y_: labels})
correct_prediction = tf.equal(y, y_)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print(sess.run(accuracy, feed_dict={x: data, y_: labels}))
And here is the dataset:
444444,0,0,0.9993089149965446,0,0,0.000691085003455425,0,0,0,0,0,0
As the model trains, y of the data shown above decreases, and reaches -1000 after 1000 iterations.
What could be the cause of the failure to train the model ?
Your accuracy checks if the predicted float is exactly equal to the value you expect. With the network you made this is a very difficult task (although you might have a chance as you are also overfitting your data).
To get better results:
- Define accuracy to be higher/lower than a value (closer to 1 or closer to 0).
- Normalise your input data, I don't know the range of your input, but 444444 is a rediculous value to use as input, and it is difficult to train weights that can handle these values.
Also: try to add some sanity checks. For example: what is the output your model is predicting? (y.eval) And what is the cross entropy you have during training your network? (sess.run([accuracy,cross_entropy], feed_dict={x: data, y_: labels})
Good luck!

Tensorflow model always produces mean

I am having trouble with fitting a very simple model in tensorflow. If I have a column of input data which is constant, my output always converges to produce the same value for all rows, which is the mean of my output data, y_, even when there is another column in x_ which has enough information to reproduce y_ exactly. Here is a small example.
import tensorflow as tf
def weight_variable(shape):
"""Initialize the weights with random weights"""
initial = tf.truncated_normal(shape, stddev=0.1, dtype=tf.float64)
return tf.Variable(initial)
#Initialize my data
x = tf.constant([[1.0,1.0],[1.0,2.0],[1.0,3.0]], dtype=tf.float64)
y_ = tf.constant([1.0,2.0,3.0], dtype=tf.float64)
w = weight_variable((2,1))
y = tf.matmul(x,w)
error = tf.reduce_mean(tf.square(y_ - y))
train_step = tf.train.AdamOptimizer(1e-5).minimize(error)
with tf.Session() as sess:
sess.run(tf.initialize_all_variables())
#Train the model and output every 1000 iterations
for i in range(1000000):
sess.run(train_step)
err = sess.run(error)
if i % 1000 == 0:
print "\nerr:", err
print "x: ", sess.run(x)
print "w: ", sess.run(w)
print "y_: ", sess.run(y_)
print "y: ", sess.run(y)
This example always converges to w=[2,0], and y = [2,2,2]. This is a smooth function with a minimum at w=[0,1] and y = [1,2,3], where the error function is zero. Why does it not converge to this? I have also tried using gradient descent and I have tried varying the training rate.
Your target is y_ = tf.constant([1.0,2.0,3.0], dtype=tf.float64) has the shape (1, 3). The output of tf.matmul(x, w) has the shape (3, 1). Thus y_ - y has the shape (3, 3) according to numpy broadcasting rules. So you are really not optimizing the function that you thought you were optimizing. Change your y_ to the following and give it a shot :
y_ = tf.constant([[1.0],[2.0],[3.0]], dtype=tf.float64)
This should converge pretty quickly to your expected answer, even with a large learning rate.

Tensorflow - semantic segmentation

Posting here to check if there's anything wrong with my implementation of a simple semantic segmentation model in TensorFlow. This code represents a sanity check I'm doing with just a single image from the database, for which I'm trying to overfit the model.
It is a binary classification problem with each image pixel mapped to [0,1] in the ground truth label.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
img = plt.imread('image.png') #Image of size [750,750,3]
img = plt.imread('map.png') # Ground Truth of size [750, 750]
img = np.expand_dims(img, 0)
lab = np.expand_dims(lab, 0)
w1 = tf.Variable(tf.constant(0.001, shape=[3,3,3,32]))
b1 = tf.Variable(tf.constant(0.0, shape=[32]))
w2 = tf.Variable(tf.constant(0.001, shape=[3,3,32,2]))
b2 = tf.Variable(tf.constant(0.0, shape=[2]))
mul = tf.nn.conv2d(img, w1, strides=[1,1,1,1], padding='SAME')
bias_add = tf.add(mul, b1)
conv1 = tf.nn.relu(bias_add)
mul2 = tf.nn.conv2d(conv1, w2, strides=[1,1,1,1], padding='SAME')
bias_add2 = tf.add(mul2, b2)
conv2 = tf.nn.relu(bias_add2)
sess = tf.InteractiveSession()
lab = lab.astype('int32')
conv2_out = tf.reshape(conv2, [-1, 2])
lab = np.reshape(lab, [-1])
prediction = tf.nn.softmax(pred) # I use this to visualize prediction of the model, and calculate accuracy
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(conv2_out, lab))
optimizer = tf.train.AdamOptimizer(0.001).minimize(loss)
correct_pred = tf.equal(tf.argmax(prediction, 1), lab)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
x = tf.placeholder(tf.float32)
y = tf.placeholder(tf.int32)
tf.initialize_all_variables().run()
step = 1
iter = 5
while step < iter:
sess.run(optimizer, feed_dict={x: img, y: lab})
loss_val,acc = sess.run([loss,accuracy], feed_dict={x: img, y: lab})
print ("Iter:"+ str(step) +" Loss : " + "{:.6f}".format(loss_val)#+ " Accuracy : " + "{:.6f}".format(acc))
step += 1
print ("optimization finished!")
prediction_logits = prediction.eval()
weights = w1.eval() # first layer learned weights
prediction_logits = np.reshape(prediction_logits, [750,750,2])
plt.figure() # Plotting original image with predicted labels
plt.imshow(img[0,:,:,:])
plt.imshow(prediction_logits[:,:,0], cmap=plt.cm.binary)
plt.show()
plt.figure() # Plotting first layer weights
for i in range(32):
plt.subplot(8,4,i+1)
plt.imshow(weights[:,:,:,i])
plt.show()
When I run this (as an interactive session), just to train the model to overfit on this single image, the loss minimizes, but my accuracy doesn't seem to change. I'm not quite sure I understand how the tf.argmax function works or if I've implemented it correctly - and the accuracy sticks to a single value no matter how many iterations.
Thoughts? Also, am I going about plotting the figure and predicted label correctly, or are there any errors here? (any other errors as well - or best practices I'm not following, do point them out)
Additionally, what is the recommended way to implement a regularization over the weights? I found tf.contrib.layers.l2_regularizer to be a feasible option - how do I include it in this scenario, though? A simple sum with the loss function?