Network diverges with NaN in simple TensorFlow example - tensorflow

I am trying to follow an example from the Stanford series on TensorFlow by implementing a quadratic regression:
Y = W*X*X + u*X + b
The dataset can be found among the Cengage datasets, and the code is the following:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import xlrd
DATA = 'data\\slr05.xls'

# Read data: load the first worksheet; row 0 is skipped and every remaining
# row becomes one sample.
data = xlrd.open_workbook(DATA, encoding_override='utf-8')
sheet = data.sheet_by_index(0)
n_samples = sheet.nrows - 1
dataset = np.asarray([sheet.row_values(row) for row in range(1, sheet.nrows)])

# Placeholders that are fed one (x, y) sample at a time.
X = tf.placeholder('float', name='X')
Y = tf.placeholder('float', name='Y')

# Trainable coefficients of the quadratic model, all starting at zero.
W = tf.Variable(0.0, name='weights')
b = tf.Variable(0.0, name='bias')
u = tf.Variable(0.0, name='u_weight')

# Prediction Y_ = W*X^2 + u*X + b and the squared error of a single sample.
Y_ = X*X*W + X*u + b
loss = tf.square(Y - Y_, name='loss')

optimizer = tf.train.GradientDescentOptimizer(0.0001).minimize(loss)
init = tf.global_variables_initializer()
loss_average = []

# Start the Session
fetches = [optimizer, Y_, W, b, u, X, Y]
with tf.Session() as sess:
    sess.run(init)
    for _ in range(10):  # ten passes over the whole dataset
        for x, y in dataset:
            sample = {X: x, Y: y}
            # One gradient step on this sample; print the fetched values.
            print(sess.run(fetches, feed_dict=sample))
            # Record the loss of the same sample after the update.
            loss_average.append(sess.run(loss, feed_dict=sample))
The final W, b, and u values that I get are nan. I tried to check step by step why this is happening, so in the output below I have printed [optimizer, Y_, W, b, u, X, Y],
and after iterating over a few rows I get:
[None, 3.9304674e+33, -1.0271335e+33, -7.7725354e+29, -2.8294217e+31, 36.2, 41.]
[None, -1.619979e+36, inf, 3.2321854e+32, 1.2834338e+34, 39.7, 147]
Apparently, during optimization W ends up as 'inf', which breaks the regression output.
Any idea what I have done wrong?

You have an exploding gradient problem here. That's because your X and Y values, and consequently their differences, are on the order of $10^{1}$, so the squared differences (your loss) are on the order of $10^{2}$. When you introduce $X^{2}$ into the regression, the difference values are on the order of $10^{2}$ and their squares on the order of $10^{4}$. Therefore the gradients are much larger and the network diverges violently.
To correct for this, you can reduce the learning rate by a factor of $10^{-3}$ (from 0.0001 to 0.0000001) to put the gradients roughly back where they were, and lo and behold, this code (tested):
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import xlrd
DATA = 'slr05.xls'

# Read data: load the first worksheet; row 0 is skipped and every remaining
# row becomes one sample.
data = xlrd.open_workbook(DATA, encoding_override='utf-8')
sheet = data.sheet_by_index(0)
n_samples = sheet.nrows - 1
dataset = np.asarray([sheet.row_values(row) for row in range(1, sheet.nrows)])

# Placeholders that are fed one (x, y) sample at a time.
X = tf.placeholder('float', name='X')
Y = tf.placeholder('float', name='Y')

# Trainable coefficients of the quadratic model, all starting at zero.
W = tf.Variable(0.0, name='weights')
b = tf.Variable(0.0, name='bias')
u = tf.Variable(0.0, name='u_weight')

# Quadratic prediction; the linear-only variant would be: Y_ = X * u + b
Y_ = X*X*W + X*u + b
loss = tf.square(Y - Y_, name='loss')

# Learning rate is 1e-7 here, a thousand times smaller than 0.0001.
optimizer = tf.train.GradientDescentOptimizer(0.0000001).minimize(loss)
init = tf.global_variables_initializer()
loss_average = []

# Start the Session
fetches = [optimizer, loss, Y_, W, b, u, X, Y]
with tf.Session() as sess:
    sess.run(init)
    for _ in range(10):  # ten passes over the whole dataset
        for x, y in dataset:
            sample = {X: x, Y: y}
            # One gradient step on this sample; print the fetched values.
            print(sess.run(fetches, feed_dict=sample))
            # Record the loss of the same sample after the update.
            loss_average.append(sess.run(loss, feed_dict=sample))
will obediently and orderly converge, as nice networks do, outputting (last 5 lines only):
[None, 1313.2705, 9.760924, 0.06911032, 0.0014081484, 0.010015297, array(11.9, dtype=float32), array(46., dtype=float32)]
[None, 1174.7083, 7.7259817, 0.06986606, 0.0014150032, 0.010087272, array(10.5, dtype=float32), array(42., dtype=float32)]
[None, 1217.4297, 8.1083145, 0.07066501, 0.0014219815, 0.01016194, array(10.7, dtype=float32), array(43., dtype=float32)]
[None, 657.74097, 8.353538, 0.07126329, 0.0014271108, 0.010217336, array(10.8, dtype=float32), array(34., dtype=float32)]
[None, 299.5538, 1.6923765, 0.07134304, 0.0014305722, 0.010233952, array(4.8, dtype=float32), array(19., dtype=float32)]

Related

"keras.backend.variable" is not behaving correctly in keras as opposed to tensorflow

I want to define a trainable scalar in my models. In TensorFlow, this is done using tf.Variable. In Keras, keras.backend.variable is supposed to behave the same way. However, when I use model.fit, Keras does not change the variable during the optimization process. Does anyone know why?
To test, please uncomment either RUN_ON = "tensorflow" or RUN_ON = "keras" to run on the corresponding engine.
import numpy as np
import keras as k
import tensorflow as tf
import matplotlib.pyplot as plt
# Uncomment exactly one of the following lines to choose the engine.
# RUN_ON = "tensorflow"
# RUN_ON = "keras"

# Synthetic data on [0, 1] for the line y = w_true * x + b_true.
b_true = 3.0
w_true = 5.0
x_true = np.linspace(0.0, 1.0, 1000).reshape(-1, 1)
y_true = x_true * w_true + b_true
ids = np.arange(x_true.shape[0])

if RUN_ON == "keras":
    # Bias-free Dense layer followed by a Lambda that adds the backend variable b.
    x = k.Input((1,), dtype="float32", name="x")
    Fx = k.layers.Dense(1, use_bias=False, name="Fx")(x)
    b = k.backend.variable(1.0, name="b")
    y = k.layers.Lambda(lambda t: t + b, name="Add")(Fx)
    model = k.Model(inputs=[x], outputs=[y])
    model.compile("adam", loss="mse")
    # model.summary()
    model.fit(x_true, [y_true], epochs=100000, batch_size=1000)
    y_pred = model.predict(x_true)
elif RUN_ON == "tensorflow":
    # Same model built directly as a graph, with b as a tf.Variable.
    x = tf.placeholder("float32", shape=[None, 1], name="x")
    Fx = tf.layers.Dense(1, use_bias=False, name="Fx")(x)
    b = tf.Variable(1.0, name="b")
    y = Fx + b
    yp = tf.placeholder("float32", shape=[None, 1], name="y")
    loss = tf.reduce_mean(tf.square(yp - y))
    opt = tf.train.AdamOptimizer(0.001).minimize(loss)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(100000):
            # Full-batch step on a freshly shuffled ordering of the samples.
            np.random.shuffle(ids)
            batch = {x: x_true[ids], yp: y_true[ids]}
            _, loss_val, b_val = sess.run([opt, loss, b], feed_dict=batch)
            print("epoch={:d} loss={:e} b_val={:f}".format(i, loss_val, b_val))
            if loss_val < 1.0e-9:
                break
        y_pred = sess.run(y, feed_dict={x: x_true, yp: y_true})
else:
    raise ValueError('`RUN_ON` should be either `keras` or `tensorflow`.')

# Target line (dashed blue) against the fitted predictions (red).
plt.plot(x_true, y_true, '--b', linewidth=4)
plt.plot(x_true, y_pred, 'r')
plt.show()
#

Tensorflow Embedding using Continuous and Categorical Variables

Based on this post, I tried to create another model in which I add both categorical and continuous variables.
Please find the code below:
from __future__ import print_function
import pandas as pd;
import tensorflow as tf
import numpy as np
from sklearn.preprocessing import LabelEncoder
if __name__ == '__main__':
    # One categorical feature (cat2), one numeric feature (num1), binary label.
    df = pd.DataFrame({'cat2': np.array(['o', 'm', 'm', 'c', 'c', 'c', 'o', 'm', 'm', 'm']),
                       'num1': np.random.rand(10),
                       'label': np.array([0, 0, 1, 1, 0, 0, 1, 0, 1, 1])})

    # Integer-encode the categorical column; keep the numeric column as a
    # separate (n, 1) array.
    categories = df.cat2.values
    encoder = LabelEncoder()
    encoder.fit(categories)
    X1 = encoder.transform(categories).reshape(-1, 1)
    X2 = np.array(df.num1.values).reshape(-1, 1)
    # X = np.concatenate((X1,X2), axis=1)

    # One-hot encode the labels into an (n, 2) matrix.
    n_rows = len(df)
    Y = np.zeros((n_rows, 2))
    Y[np.arange(n_rows), df.label.values] = 1

    # Neural net parameters
    training_epochs = 5
    learning_rate = 1e-3
    # NOTE(review): X is only defined in the commented-out line above.
    cardinality = len(np.unique(X))
    embedding_size = 2
    input_X_size = 1
    n_labels = len(np.unique(Y))
    n_hidden = 10

    # Placeholders for input, output
    cat2 = tf.placeholder(tf.int32, [None], name='cat2')
    x = tf.placeholder(tf.float32, [None, 1], name="input_x")
    y = tf.placeholder(tf.float32, [None, 2], name="input_y")

    # Embedding table with one row of size embedding_size per category.
    initial_embeddings = tf.random_uniform([cardinality, embedding_size], -1.0, 1.0)
    embed_matrix = tf.Variable(initial_embeddings, name="embed_matrix")
    embed = tf.nn.embedding_lookup(embed_matrix, cat2)
    # NOTE(review): embedding_aggregated is not defined anywhere in this
    # snippet, and axis=2 is requested here.
    inputs_with_embed = tf.concat([x, embedding_aggregated], axis=2, name="inputs_with_embed")

    # Neural network weights
    xavier = tf.contrib.layers.xavier_initializer
    # NOTE(review): shape receives the tensor inputs_with_embed itself.
    h = tf.get_variable(name='h2', shape=[inputs_with_embed, n_hidden],
                        initializer=xavier())
    W_out = tf.get_variable(name='out_w', shape=[n_hidden, n_labels],
                            initializer=xavier())

    # Neural network operations: hidden ReLU layer, then linear output logits.
    #embedded_chars = tf.nn.embedding_lookup(embeddings, x)
    layer_1 = tf.nn.relu(tf.matmul(inputs_with_embed, h))
    out_layer = tf.matmul(layer_1, W_out)

    # Define loss and optimizer
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits(logits=out_layer, labels=y)
    cost = tf.reduce_mean(per_example_loss)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

    # Initializing the variables
    init = tf.global_variables_initializer()

    # Launch the graph
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(training_epochs):
            avg_cost = 0.
            # Run optimization op (backprop) and cost op (to get loss value)
            batch = {x: X2, cat2: X1, y: Y}
            _, c = sess.run([optimizer, cost], feed_dict=batch)
        print("Optimization Finished!")
But I'm getting the following error. It seems I'm not concatenating the continuous variable and the embedding properly, but I don't understand how to fix it.
Could someone please guide me?
ValueError: Shape must be at least rank 3 but is rank 2 for 'inputs_with_embed_2' (op: 'ConcatV2') with input shapes: [?,1], [?,2], [] and with computed input tensors: input[2] = <2>.
Thanks!
If by embedding_aggregated you mean embed (probably a typo), then
the error is that there is no axis=2 in your case; it should be axis=1:
inputs_with_embed = tf.concat([x, embed], axis=1, name="inputs_with_embed")
embed has shape [None, embedding_dimension] and x has shape [None, 1].
They are both 2D tensors, so the only valid axes are axis=0 and axis=1 (axes are indexed from 0, not 1). Therefore, to get inputs_with_embed of shape [None, embedding_dimension+1], you need to concatenate along axis=1.

Understanding model loss/accuracy and how not to leak information

This question is related to the original one posted here.
The problem is to classify rows so that the classification of row number i can rely on the data of all the previous rows, including their class membership. The linked post contains an answer, which is reproduced below.
For the sake of experimentation, I've used a set of randomly generated data, where the classifying property is a uniform random variable taking the values 0 and 1.
What strikes me is that the loss of the model in the example below is really low and the accuracy is 99%, whereas I would expect something in the 50% range.
So I am assuming that the way the model tests the classification is leaking information somehow.
Does anybody see what the issue is? What would be the proper way to evaluate the accuracy in such a scenario?
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from random import randint
SIZE = 100

# Random toy data: three feature columns plus a random 0/1 label column.
df = pd.DataFrame({'Temperature': list(range(SIZE)),
                   'Weight': [randint(1, 100) for _ in range(SIZE)],
                   'Size': [randint(1, 10000) for _ in range(SIZE)],
                   'Property': [randint(0, 1) for _ in range(SIZE)]})
# Pair the features of row i with the Property of row i+1; the last row
# gets NaN and is dropped below.
df.Property = df.Property.shift(-1)
print(df.head())

# parameters
time_steps = 1
inputs = 3
outputs = 2

df = df.iloc[:-1, :]
df = df.values
train_X = df[:, :-1]
train_y = df[:, -1]

# Scale every feature to [0, 1] and insert a time axis of length 1:
# (rows, inputs) -> (rows, 1, inputs).
scaler = MinMaxScaler(feature_range=(0, 1))
train_X = scaler.fit_transform(train_X)
train_X = train_X[:, None, :]

# One-hot encode the labels.
onehot_encoder = OneHotEncoder()
encode_categorical = train_y.reshape(len(train_y), 1)
train_y = onehot_encoder.fit_transform(encode_categorical).toarray()

learning_rate = 0.001
epochs = 50000
batch_size = int(train_X.shape[0] / 2)
length = train_X.shape[0]
display = 100
neurons = 100

tf.reset_default_graph()
X = tf.placeholder(tf.float32, [None, time_steps, inputs])
y = tf.placeholder(tf.float32, [None, outputs])

# Single LSTM layer followed by a dense layer that produces the class logits.
cell = tf.contrib.rnn.BasicLSTMCell(num_units=neurons, activation=tf.nn.relu)
cell_outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)
stacked_outputs = tf.reshape(cell_outputs, [-1, neurons])
out = tf.layers.dense(inputs=stacked_outputs, units=outputs)

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
    labels=y, logits=out))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

# Each tf.metrics.* call returns a (value, update_op) pair; index 1 is the
# update op, which is what gets evaluated and printed below.
true_class = tf.argmax(y, 1)
predicted_class = tf.argmax(out, 1)
accuracy = tf.metrics.accuracy(labels=true_class,
                               predictions=predicted_class,
                               name="accuracy")
precision = tf.metrics.precision(labels=true_class,
                                 predictions=predicted_class,
                                 name="precision")
recall = tf.metrics.recall(labels=true_class,
                           predictions=predicted_class,
                           name="recall")
# F1 is the harmonic mean of precision and recall. The earlier version used
# accuracy in the numerator, which is not the F1 score.
f1 = 2 * precision[1] * recall[1] / (precision[1] + recall[1])

with tf.Session() as sess:
    tf.global_variables_initializer().run()
    # tf.metrics.* keep their counters in local variables.
    tf.local_variables_initializer().run()

    for steps in range(epochs):
        # Consecutive [start, end) windows of batch_size rows.
        mini_batch = zip(range(0, length, batch_size),
                         range(batch_size, length + 1, batch_size))
        for (start, end) in mini_batch:
            sess.run(training_op, feed_dict={X: train_X[start:end, :, :],
                                             y: train_y[start:end, :]})
        if (steps + 1) % display == 0:
            loss_fn = loss.eval(feed_dict={X: train_X, y: train_y})
            print('Step: {} \tTraining loss: {}'.format((steps + 1), loss_fn))

    # NOTE: these metrics are computed on the same rows the model was trained
    # on, so they measure fit to the training data, not generalization.
    # The results go into new names so the graph tensors are not overwritten.
    acc_val, prec_val, rec_val, f1_val = sess.run(
        [accuracy, precision, recall, f1],
        feed_dict={X: train_X, y: train_y})
    print('\nEvaluation on training set')
    print('Accuracy:', acc_val[1])
    print('Precision:', prec_val[1])
    print('Recall:', rec_val[1])
    print('F1 score:', f1_val)

How create linear regression model for multi dimensional data in Tensorflow?

I have read a getting-started guide for TensorFlow (here's a link).
The guide uses a one-dimensional example for the model y = W*x + b.
After that, I tried to create a two-dimensional version for x. The following is my code:
import tensorflow as tf
import numpy as np
import random as rd
# Seed Python's `random` module so the noise drawn with rd.randint in
# create_data_train() is reproducible.
rd.seed(2)
# Ground-truth model: y = 2*x1 + x2 - 3
def create_data_train():
    """Build the toy training set for y = 2*x1 + x2 - 3 with integer noise.

    Returns:
        A tuple ``(x_train, y_train)``: the fixed (11, 2) input array and the
        matching (11, 1) target array, where every target has been shifted by
        ``1 - rd.randint(0, 2)``, i.e. by -1, 0 or +1.
    """
    features = np.asarray([[2, 3], [6, 7], [1, 5], [4, 6], [10, -1], [0, 0], [5, 6],
                           [8, 9], [4.5, 6.2], [1, 1], [0.3, 0.2]])
    weights = np.asarray([[2, 1]])
    offset = np.asarray([[-3]])
    targets = features.dot(weights.T) + offset
    # Visit the entries in row-major order so the random draws are consumed
    # in the same sequence as a nested row/column loop.
    for position in np.ndindex(*targets.shape):
        targets[position] += 1 - rd.randint(0, 2)
    return features, targets
# step 1
# Step 1: linear model y = x @ W + b over two input features.
x = tf.placeholder(tf.float32, [None, 2])
W = tf.Variable(tf.zeros([2, 1]))
b = tf.Variable(tf.zeros([1]))
y = tf.matmul(x, W) + b
y_ = tf.placeholder(tf.float32, [None, 1])

# Step 2: sum of squared errors, minimized with plain gradient descent.
residual = tf.subtract(y, y_)
loss = tf.reduce_sum(tf.pow(residual, 2))
optimizer = tf.train.GradientDescentOptimizer(0.05)
train = optimizer.minimize(loss)

# Step 3: build the data and run 1000 training steps.
x1, y1 = create_data_train()
# NOTE(review): these two lines turn the NumPy arrays into tf.Tensor objects,
# which are then passed as feed values below.
x_train = tf.convert_to_tensor(x1)
y_train = tf.convert_to_tensor(y1)
print(x_train)
print(y_train)

init = tf.global_variables_initializer()
sess = tf.Session()
print(sess.run(x_train))
sess.run(init)
for _ in range(1000):
    sess.run(train, feed_dict={x: x_train, y_: y_train})

# Fetch and show the learned parameters.
endW, endb = sess.run([W, b])
print(endW)
print(endb)
But when I run it, I encounter this error:
TypeError: The value of a feed cannot be a tf.Tensor object. Acceptable feed values include Python scalars, strings, lists, numpy ndarrays, or TensorHandles.
The error occurs because you cannot feed a tensor through feed_dict.
With the following two lines,
x_train = tf.convert_to_tensor(x1)
y_train = tf.convert_to_tensor(y1)
you are converting x1 and y1 (i.e., your inputs and outputs) to tensors. Feed x1 and y1 directly, without converting them to tensors.
Hope this helps.

Creating a highly customizable RNN in Tensorflow

I am trying to implement an RNN without using the RNN functions provided by tensorflow. Here is the code I tried that eventually gave me an error
import tensorflow as tf
tf.InteractiveSession()

x = tf.placeholder(tf.float32, shape=(5, 5))

# The recurrent state starts as a 5x1 column of zeros.
InitialState = tf.zeros((5, 1))
h = InitialState

# W1 multiplies the previous state, W2 multiplies the current input column.
W1 = tf.Variable(tf.random_normal([5, 5], stddev=0.35), name="W1")
W2 = tf.Variable(tf.random_normal([5, 5], stddev=0.35), name="W2")

# Unroll five steps; step k consumes column k of x.
for k in range(5):
    column = x[:, k:(k + 1)]
    h = tf.sigmoid(tf.matmul(W1, h) + tf.matmul(W2, column))

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    # NOTE(review): the value fed for x here is a tf.Tensor (tf.ones), not a
    # NumPy array.
    a = sess.run([h], feed_dict={x: tf.ones((5, 5))})
How can I implement an RNN from scratch? Is there an example online?
import tensorflow as tf
import numpy as np
hidden_size = 2  # hidden layer of two neurons
input_size = 5

# Weight applied to x: (hidden_size x input_size)
Wx = tf.Variable(tf.random_normal([hidden_size, input_size], stddev=0.35),
                 name="Wx")
# Weight for y: (input_size x hidden_size); defined but not used in the graph below
Wy = tf.Variable(tf.random_normal([input_size, hidden_size], stddev=0.35),
                 name="Wy")
# Weight applied to h: (hidden_size x hidden_size)
Wh = tf.Variable(tf.random_normal([hidden_size, hidden_size], stddev=0.35),
                 name="Wh")

# Initial hidden state.
h = tf.zeros((hidden_size, input_size))

x = tf.placeholder(dtype=tf.float32,
                   shape=(input_size, input_size))
y = tf.placeholder(dtype=tf.float32,
                   shape=(input_size, input_size))

# Feed plain NumPy arrays (not tensors), sized to match the placeholders.
feed_dict = {
    x: np.ones((input_size, input_size), dtype=np.float32),
    y: np.ones((input_size, input_size), dtype=np.float32),
}

# RNN step, unrolled input_size times: h <- tanh(Wh.h + Wx.x)
for _ in range(input_size):
    h = tf.tanh(tf.matmul(Wh, h) + tf.matmul(Wx, x))
o = tf.nn.softmax(h)

# tf.global_variables_initializer() replaces the deprecated
# tf.initialize_all_variables(); initialization needs no feed values.
init_op = tf.global_variables_initializer()

# The context manager closes the session once the values are fetched.
with tf.Session() as sess:
    sess.run(init_op)
    h_new, y_hat = sess.run([h, o], feed_dict=feed_dict)

# print() as a function, so the snippet also runs on Python 3.
print(h_new)
print(y_hat)