Emulating boolean masks in Theano - numpy

I'm porting a numpy expression to theano. The expression finds the number of true positive predictions for each class, given a one-hot matrix Y of ground truth classes and a one-hot matrix Y_hat of predicted classes. The numpy code is:
import numpy as np
y = np.array([1, 0, 1, 2, 2])
y_hat = np.array([2, 0, 1, 1, 0])
Y = np.zeros(shape=(len(y), len(np.unique(y))))
Y_hat = np.zeros_like(Y)
rows = np.arange(len(y))
Y[rows, y] = 1
Y_hat[rows, y_hat] = 1
((Y_hat == Y) & (Y == 1)).sum(axis=0)
The last expression yields array([1, 1, 0]). I've tried using theano's nonzero:
from theano import shared
Yt = shared(Y)
Yt_hat = shared(Y_hat)
Yt_hat[Yt.nonzero()].eval()
The eval results in array([ 0., 1., 1., 0., 0.]), which is a 0-1 mask of the rows of Yt_hat where the prediction is correct. Any suggestions for how to make this work? For different ways of doing it? Thanks.

Here are three variants demonstrating how to re-implement parts of your numpy code in Theano.
Note that Theano's Unique operation does not support running on the GPU and does not appear to support gradients either. As a result, version 3 may not be of much use. Version 2 provides a workaround: compute the unique values outside Theano and pass them in. Version 1 is a Theano implementation of the final line of your numpy code only.
To address your specific issue: there is no need to use nonzero; in this case the indexing works in Theano just like it works in numpy. Maybe you were getting confused between y and Y? (Common Python style is to stick to lower case for all variable and parameter names.)
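For instance, using the shared variable from your snippet, plain numpy-style integer-array indexing already evaluates fine (a quick check; rows and y as defined in your numpy code):
Yt[rows, y].eval()  # array of ones: picks out exactly the positions set to 1 in Y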
import numpy as np
import theano
import theano.tensor as tt
import theano.tensor.extra_ops


def numpy_ver(y, y_hat):
    Y = np.zeros(shape=(len(y), len(np.unique(y))), dtype=np.int64)
    Y_hat = np.zeros_like(Y, dtype=np.int64)
    rows = np.arange(len(y), dtype=np.int64)
    Y[rows, y] = 1
    Y_hat[rows, y_hat] = 1
    return ((Y_hat == Y) & (Y == 1)).sum(axis=0), Y, Y_hat


def compile_theano_ver1():
    Y = tt.matrix(dtype='int64')
    Y_hat = tt.matrix(dtype='int64')
    z = (tt.eq(Y_hat, Y) & tt.eq(Y, 1)).sum(axis=0)
    return theano.function([Y, Y_hat], outputs=z)


def compile_theano_ver2():
    y = tt.vector(dtype='int64')
    y_hat = tt.vector(dtype='int64')
    y_uniq = tt.vector(dtype='int64')
    Y = tt.zeros(shape=(y.shape[0], y_uniq.shape[0]), dtype='int64')
    Y_hat = tt.zeros_like(Y, dtype='int64')
    rows = tt.arange(y.shape[0], dtype='int64')
    Y = tt.set_subtensor(Y[rows, y], 1)
    Y_hat = tt.set_subtensor(Y_hat[rows, y_hat], 1)
    z = (tt.eq(Y_hat, Y) & tt.eq(Y, 1)).sum(axis=0)
    return theano.function([y, y_hat, y_uniq], outputs=z)


def compile_theano_ver3():
    y = tt.vector(dtype='int64')
    y_hat = tt.vector(dtype='int64')
    y_uniq = tt.extra_ops.Unique()(y)
    Y = tt.zeros(shape=(y.shape[0], y_uniq.shape[0]), dtype='int64')
    Y_hat = tt.zeros_like(Y, dtype='int64')
    rows = tt.arange(y.shape[0], dtype='int64')
    Y = tt.set_subtensor(Y[rows, y], 1)
    Y_hat = tt.set_subtensor(Y_hat[rows, y_hat], 1)
    z = (tt.eq(Y_hat, Y) & tt.eq(Y, 1)).sum(axis=0)
    return theano.function([y, y_hat], outputs=z)


def main():
    y = np.array([1, 0, 1, 2, 2], dtype=np.int64)
    y_hat = np.array([2, 0, 1, 1, 0], dtype=np.int64)
    y_uniq = np.unique(y)
    result, Y, Y_hat = numpy_ver(y, y_hat)
    print(result)
    theano_ver1 = compile_theano_ver1()
    print(theano_ver1(Y, Y_hat))
    theano_ver2 = compile_theano_ver2()
    print(theano_ver2(y, y_hat, y_uniq))
    theano_ver3 = compile_theano_ver3()
    print(theano_ver3(y, y_hat))


main()
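If everything is wired up correctly, all four printed results should agree with the numpy expression from the question, i.e. [1 1 0] for these inputs.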

Related

Statsmodels API OLS regression: ValueError -> shapes (95, 3) and (4,) are not aligned

def fit_linear_regression(X, y):
    X = sm.add_constant(X)
    est = sm.OLS(y, X)
    est = est.fit()
    return est

print(X_train.shape)  # outputs (604, 41)
print(X_test.shape)  # outputs (95, 41)
model = fit_linear_regression(X_train.iloc[:, [0, 1, 2]], y_train)
model.predict(X_test.iloc[:, [0, 1, 2]])
When I run this script, I get the following error
ValueError: shapes (95,3) and (4,) not aligned: 3 (dim 1) != 4 (dim 0)
When I do not select any columns but pass in the whole dataframes, it fails the same way, with shapes (95, 41) and (42,) not aligned. What is going on here?
X_train, y_train and y_test are pandas DataFrames.
As pointed out in the comments by @AlexK, you need to add the intercept (or constant) to your test data as well. In your function, you had this step:
X = sm.add_constant(X)
And this is used in fitting the model, so the model expects 4 columns instead of 3.
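You can see the extra column directly; a quick check with random data (the shapes here are just for illustration):
import numpy as np
import statsmodels.api as sm
X = np.random.rand(5, 3)
print(sm.add_constant(X).shape)  # (5, 4): a constant column is prepended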
Using an example:
import pandas as pd
import numpy as np
import statsmodels.api as sm
X_train = pd.DataFrame(
    np.random.normal(0, 1, (604, 41)),
    columns=["v" + str(i) for i in range(41)]
)
X_test = pd.DataFrame(
    np.random.normal(0, 1, (95, 41)),
    columns=["v" + str(i) for i in range(41)]
)
y_train = np.random.normal(0, 1, (604,))
y_test = np.random.normal(0, 1, (95,))
Fit and predict:
def fit_linear_regression(X, y):
    X = sm.add_constant(X)
    est = sm.OLS(y, X)
    est = est.fit()
    return est

model = fit_linear_regression(X_train.iloc[:, [0, 1, 2]], y_train)
model.predict(sm.add_constant(X_test.iloc[:, [0, 1, 2]]))
Since you are using a dataframe, I hope there are proper column names, so you can also consider the formula interface (see the help page), with a small tweak to build the formula from all the columns in your input (see this post too):
import statsmodels.formula.api as smf

def formula_linear_regression(X, y):
    formula = "y ~ " + " + ".join(X.columns)
    df = X.copy()
    df['y'] = y
    est = smf.ols(formula=formula, data=df)  # fit on df, which contains y
    est = est.fit()
    return est

model2 = formula_linear_regression(X_train.iloc[:, [0, 1, 2]], y_train)
model2.predict(X_test.iloc[:, [0, 1, 2]])
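Note that with the formula interface, predict builds the design matrix (intercept included) from the dataframe's columns by name, so no add_constant call is needed on the test data.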

How to implement tf.gather_nd in Pytorch with the argument batch_dims?

I have been doing a project on image matching, so I need to find correspondences between 2 images. To get descriptors, I need an interpolate function. However, even after reading an equivalent function written in Tensorflow, I still don't get how to implement tf.gather_nd(params, indices, batch_dims) in Pytorch, especially the batch_dims argument. I have gone through stackoverflow and there is no perfect equivalent yet.
The referred interpolate function in Tensorflow is below, and I have been trying to implement it in Pytorch. Information about the arguments:
inputs is a dense feature map[i] taken from a for loop over the batch, so it is 3D with shape [H, W, C] (in pytorch it would be [C, H, W])
pos is a set of random point coordinates shaped like [[i, j], [i, j], ..., [i, j]], so it is 2D when it goes into the interpolate function (in pytorch it would be [[i, i, ..., i], [j, j, ..., j]])
both get an extra dimension prepended when they enter this function
I just want a correct implementation of tf.gather_nd with the batch_dims argument. Thank you!
And here's a simple example of using it:
pos = tf.ones((12, 2))  # a set of coordinates, one [i, j] pair per row
inputs = tf.ones((4, 4, 128))  # stands for the [H, W, C] dense feature map
outputs = interpolate(pos, inputs, nd=True)
print(outputs.get_shape())  # we get (12, 128) here
interpolate function (tf version):
def interpolate(pos, inputs, nd=True):
    pos = tf.expand_dims(pos, 0)
    inputs = tf.expand_dims(inputs, 0)

    h = tf.shape(inputs)[1]
    w = tf.shape(inputs)[2]

    i = pos[:, :, 0]
    j = pos[:, :, 1]

    i_top_left = tf.clip_by_value(tf.cast(tf.math.floor(i), tf.int32), 0, h - 1)
    j_top_left = tf.clip_by_value(tf.cast(tf.math.floor(j), tf.int32), 0, w - 1)

    i_top_right = tf.clip_by_value(tf.cast(tf.math.floor(i), tf.int32), 0, h - 1)
    j_top_right = tf.clip_by_value(tf.cast(tf.math.ceil(j), tf.int32), 0, w - 1)

    i_bottom_left = tf.clip_by_value(tf.cast(tf.math.ceil(i), tf.int32), 0, h - 1)
    j_bottom_left = tf.clip_by_value(tf.cast(tf.math.floor(j), tf.int32), 0, w - 1)

    i_bottom_right = tf.clip_by_value(tf.cast(tf.math.ceil(i), tf.int32), 0, h - 1)
    j_bottom_right = tf.clip_by_value(tf.cast(tf.math.ceil(j), tf.int32), 0, w - 1)

    dist_i_top_left = i - tf.cast(i_top_left, tf.float32)
    dist_j_top_left = j - tf.cast(j_top_left, tf.float32)
    w_top_left = (1 - dist_i_top_left) * (1 - dist_j_top_left)
    w_top_right = (1 - dist_i_top_left) * dist_j_top_left
    w_bottom_left = dist_i_top_left * (1 - dist_j_top_left)
    w_bottom_right = dist_i_top_left * dist_j_top_left

    if nd:
        w_top_left = w_top_left[..., None]
        w_top_right = w_top_right[..., None]
        w_bottom_left = w_bottom_left[..., None]
        w_bottom_right = w_bottom_right[..., None]

    interpolated_val = (
        w_top_left * tf.gather_nd(inputs, tf.stack([i_top_left, j_top_left], axis=-1), batch_dims=1) +
        w_top_right * tf.gather_nd(inputs, tf.stack([i_top_right, j_top_right], axis=-1), batch_dims=1) +
        w_bottom_left * tf.gather_nd(inputs, tf.stack([i_bottom_left, j_bottom_left], axis=-1), batch_dims=1) +
        w_bottom_right * tf.gather_nd(inputs, tf.stack([i_bottom_right, j_bottom_right], axis=-1), batch_dims=1)
    )
    interpolated_val = tf.squeeze(interpolated_val, axis=0)
    return interpolated_val
As far as I'm aware, there is no direct equivalent of tf.gather_nd in PyTorch, and implementing a generic version with batch_dims is not that simple. However, you likely don't need a generic version, and given the context of your interpolate function, a version for [C, H, W] would suffice.
At the beginning of interpolate you add a singular batch dimension to the front. Setting batch_dims=1 in tf.gather_nd means there is one batch dimension at the beginning, so it is applied per batch, i.e. it indexes inputs[0] with pos[0] and so on. There is no benefit to adding a singular batch dimension, because you could have just used the direct computation.
# Adding singular batch dimension
# Shape: [1, num_pos, 2]
pos = tf.expand_dims(pos, 0)
# Shape: [1, H, W, C]
inputs = tf.expand_dims(inputs, 0)
batched_result = tf.gather_nd(inputs, pos, batch_dims=1)
single_result = tf.gather_nd(inputs[0], pos[0])
# The first element in the batched result is the same as the single result
# Hence there is no benefit to adding a singular batch dimension.
tf.reduce_all(batched_result[0] == single_result) # => True
Single version
In PyTorch the implementation for [H, W, C] can be done with Python's indexing. While PyTorch usually uses [C, H, W] for images, it's only a matter of what dimension to index, but let's keep them the same as in TensorFlow for the sake of comparison. If you were to index them manually, you would do it as such: inputs[pos_h[0], pos_w[0]], inputs[pos_h[1], pos_w[1]] and so on. PyTorch allows you to do that automatically by providing the indices as lists: inputs[pos_h, pos_w], where pos_h and pos_w have the same length. All you need to do is split your pos into two separate tensors, one for the indices along the height dimension and the other along the width dimension, which you also did in the TensorFlow version.
import torch
import tensorflow as tf

inputs = torch.randn(4, 4, 128)
# Random positions 0-3, shape: [12, 2]
pos = torch.randint(4, (12, 2))
# Positions split by dimension
pos_h = pos[:, 0]
pos_w = pos[:, 1]
# Index the inputs with the indices per dimension
gathered = inputs[pos_h, pos_w]
# Verify that it's identical to TensorFlow's output
inputs_tf = tf.convert_to_tensor(inputs.numpy())
pos_tf = tf.convert_to_tensor(pos.numpy())
gathered_tf = tf.gather_nd(inputs_tf, pos_tf)
gathered_tf = torch.from_numpy(gathered_tf.numpy())
torch.equal(gathered_tf, gathered) # => True
If you want to apply it to a tensor of size [C, H, W] instead, you only need to change the dimensions you want to index:
# For [H, W, C]
gathered = inputs[pos_h, pos_w]
# For [C, H, W]
gathered = inputs[:, pos_h, pos_w]
Batched version
Making it a batched version (for [N, H, W, C] or [N, C, H, W]) is not that difficult, and using that is more appropriate, since you're dealing with batches anyway. The only tricky part is that each element in the batch should only be applied to the corresponding batch. For this the batch dimension needs to be enumerated, which can be done with torch.arange. The batch enumeration is just a list of the batch indices, which will be combined with the pos_h and pos_w indices, resulting in inputs[0, pos_h[0, 0], pos_w[0, 0]], inputs[0, pos_h[0, 1], pos_w[0, 1]] ... inputs[1, pos_h[1, 0], pos_w[1, 0]] and so on.
batch_size = 3
inputs = torch.randn(batch_size, 4, 4, 128)
# Random positions 0-3, different for each batch, shape: [3, 12, 2]
pos = torch.randint(4, (batch_size, 12, 2))
# Positions split by dimension
pos_h = pos[:, :, 0]
pos_w = pos[:, :, 1]
batch_enumeration = torch.arange(batch_size) # => [0, 1, 2]
# pos_h and pos_w have shape [3, 12], so the batch enumeration needs to be
# repeated 12 times per batch.
# Unsqueeze to get shape [3, 1], now the 1 could be repeated to 12, but
# broadcasting will do that automatically.
batch_enumeration = batch_enumeration.unsqueeze(1)
# Index the inputs with the indices per dimension
gathered = inputs[batch_enumeration, pos_h, pos_w]
# Again, verify that it's identical to TensorFlow's output
inputs_tf = tf.convert_to_tensor(inputs.numpy())
pos_tf = tf.convert_to_tensor(pos.numpy())
# This time with batch_dims=1
gathered_tf = tf.gather_nd(inputs_tf, pos_tf, batch_dims=1)
gathered_tf = torch.from_numpy(gathered_tf.numpy())
torch.equal(gathered_tf, gathered) # => True
Again, for [N, C, H, W], only the dimensions that are indexed need to be changed:
# For [N, H, W, C]
gathered = inputs[batch_enumeration, pos_h, pos_w]
# For [N, C, H, W]
gathered = inputs[batch_enumeration, :, pos_h, pos_w]
Just a little side note on the interpolate implementation, rounding the positions (floor and ceil respectively) doesn't make sense, because indices must be integers, so it has no effect, as long as your positions are actual indices. That also results in i_top_left and i_bottom_left being the same value, but even if they are to be rounded differently, they are always 1 position apart. Furthermore, i_top_left and i_top_right are literally the same. I don't think that this function produces a meaningful output. I don't know what you're trying to achieve, but if you're looking for image interpolation you could have a look at torch.nn.functional.interpolate.
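That said, if the bilinear weighting is what you are after (with float positions), here is a minimal sketch of how the interpolate function could look in PyTorch for a single [H, W, C] feature map, using the direct indexing shown above (the function name is mine, and this is untested against your data):

import torch

def interpolate_torch(pos, inputs):
    # pos: float positions of shape [num_pos, 2]; inputs: [H, W, C]
    h, w = inputs.shape[0], inputs.shape[1]
    i, j = pos[:, 0], pos[:, 1]
    i_floor = i.floor().long().clamp(0, h - 1)
    j_floor = j.floor().long().clamp(0, w - 1)
    i_ceil = i.ceil().long().clamp(0, h - 1)
    j_ceil = j.ceil().long().clamp(0, w - 1)
    # Bilinear weights from the fractional parts, broadcast over channels
    di = (i - i_floor.float()).unsqueeze(-1)
    dj = (j - j_floor.float()).unsqueeze(-1)
    return ((1 - di) * (1 - dj) * inputs[i_floor, j_floor]
            + (1 - di) * dj * inputs[i_floor, j_ceil]
            + di * (1 - dj) * inputs[i_ceil, j_floor]
            + di * dj * inputs[i_ceil, j_ceil])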
This is just an extension of Michael Jungo's batched-version answer for when pos is a 2D array instead of a 1D array (excluding the batch dimension).
bs = 2
H = 4
W = 6
C = 3
inputs = torch.randn(bs, H, W, C)
pos_h = torch.randint(H, (bs, H, W))
pos_w = torch.randint(W, (bs, H, W))
batch_enumeration = torch.arange(bs)
batch_enumeration = batch_enumeration.unsqueeze(1).unsqueeze(2)
inputs.shape
Out[34]: torch.Size([2, 4, 6, 3])
pos_h.shape
Out[35]: torch.Size([2, 4, 6])
pos_w.shape
Out[36]: torch.Size([2, 4, 6])
batch_enumeration.shape
Out[37]: torch.Size([2, 1, 1])
gathered = inputs[batch_enumeration, pos_h, pos_w]
For a channels-first layout ([N, C, H, W]), we also need to enumerate the channels:
inputs = torch.randn(bs, C, H, W)
pos_h = torch.randint(H, (bs, 1, H, W))
pos_w = torch.randint(W, (bs, 1, H, W))
batch_enumeration = torch.arange(bs)
batch_enumeration = batch_enumeration.unsqueeze(1).unsqueeze(2).unsqueeze(3)
channel_enumeration = torch.arange(C)
channel_enumeration = channel_enumeration.unsqueeze(0).unsqueeze(2).unsqueeze(3)
inputs.shape
Out[49]: torch.Size([2, 3, 4, 6])
pos_h.shape
Out[50]: torch.Size([2, 1, 4, 6])
pos_w.shape
Out[51]: torch.Size([2, 1, 4, 6])
batch_enumeration.shape
Out[52]: torch.Size([2, 1, 1, 1])
channel_enumeration.shape
Out[57]: torch.Size([1, 3, 1, 1])
gathered = inputs[batch_enumeration, channel_enumeration, pos_h, pos_w]
gathered.shape
Out[59]: torch.Size([2, 3, 4, 6])
Let's verify
inputs_np = inputs.numpy()
pos_h_np = pos_h.numpy()
pos_w_np = pos_w.numpy()
gathered_np = gathered.numpy()
pos_h_np[0,0,0,0]
Out[68]: 0
pos_w_np[0,0,0,0]
Out[69]: 3
inputs_np[0,:,0,3]
Out[71]: array([ 0.79122806, -2.190181 , -0.16741803], dtype=float32)
gathered_np[0,:,0,0]
Out[72]: array([ 0.79122806, -2.190181 , -0.16741803], dtype=float32)
pos_h_np[1,0,3,4]
Out[73]: 1
pos_w_np[1,0,3,4]
Out[74]: 2
inputs_np[1,:,1,2]
Out[75]: array([ 0.9282498 , -0.34945545, 0.9136222 ], dtype=float32)
gathered_np[1,:,3,4]
Out[77]: array([ 0.9282498 , -0.34945545, 0.9136222 ], dtype=float32)
I improved the answer from Michael Jungo's implementation. Now it supports arbitrary leading batch dimensions.

import numpy as np
import torch


def gather_nd_torch(params, indices, batch_dim=1):
    """A PyTorch port of tensorflow.gather_nd.

    This implementation can handle leading batch dimensions in params; see
    below for a detailed explanation.

    The majority of this implementation is from Michael Jungo
    @ https://stackoverflow.com/a/61810047/6670143
    I just made it compatible with leading batch dimensions.

    Args:
        params: a tensor of dimension [b1, ..., bn, g1, ..., gm, c].
        indices: a tensor of dimension [b1, ..., bn, x, m].
        batch_dim: indicates how many batch dimensions you have; in the
            above example, batch_dim = n.

    Returns:
        gathered: a tensor of dimension [b1, ..., bn, x, c].

    Example:
    >>> batch_size = 5
    >>> inputs = torch.randn(batch_size, batch_size, batch_size, 4, 4, 4, 32)
    >>> pos = torch.randint(4, (batch_size, batch_size, batch_size, 12, 3))
    >>> gathered = gather_nd_torch(inputs, pos, batch_dim=3)
    >>> gathered.shape
    torch.Size([5, 5, 5, 12, 32])
    >>> inputs_tf = tf.convert_to_tensor(inputs.numpy())
    >>> pos_tf = tf.convert_to_tensor(pos.numpy())
    >>> gathered_tf = tf.gather_nd(inputs_tf, pos_tf, batch_dims=3)
    >>> gathered_tf.shape
    TensorShape([5, 5, 5, 12, 32])
    >>> gathered_tf = torch.from_numpy(gathered_tf.numpy())
    >>> torch.equal(gathered_tf, gathered)
    True
    """
    batch_dims = params.size()[:batch_dim]  # [b1, ..., bn]
    batch_size = np.cumprod(list(batch_dims))[-1]  # b1 * ... * bn
    c_dim = params.size()[-1]  # c
    grid_dims = params.size()[batch_dim:-1]  # [g1, ..., gm]
    n_indices = indices.size(-2)  # x
    n_pos = indices.size(-1)  # m

    # Reshape the leading batch dims into a single batch dim.
    params = params.reshape(batch_size, *grid_dims, c_dim)
    indices = indices.reshape(batch_size, n_indices, n_pos)

    # Build the gather indices: gather for each of the data points in this
    # flattened "batch".
    batch_enumeration = torch.arange(batch_size).unsqueeze(1)
    gather_dims = [indices[:, :, i] for i in range(len(grid_dims))]
    gather_dims.insert(0, batch_enumeration)
    gathered = params[tuple(gather_dims)]

    # Reshape back to the shape with the leading batch dims.
    gathered = gathered.reshape(*batch_dims, n_indices, c_dim)
    return gathered
I have also made a demo Colab notebook, you can check it here. This implementation is way faster than TF's original implementation according to my poor speed test on Colab server with a GPU instance.

Implementing the Cosine similarity in tensor flow

My question is about the equation below (the cosine similarity between two vectors after a linear transformation A):
CosSim(X, Y) = <XA, YA> / (||XA|| * ||YA||)
The equation above is for a single pair of vectors. But if I have batches of vectors, like my X and Y having dimension (None, 32), then there is an issue.
Also remember that in a coding environment, one example inside the batch is already in transposed shape. My problem is that when we need a transpose on [None, 32], the code will not accept a transpose over the None dimension. So I solved it in the following way:
def Cosine_similarity(X, Y, feature_dim):
    L = tf.compat.v1.initializers.glorot_normal()(shape=[feature_dim, feature_dim])
    out1 = tf.matmul(X, L)
    out2 = tf.matmul(Y, L)
    out_numerator = tf.reduce_sum(tf.multiply(out1, out2), axis=1)
    out3 = tf.reduce_sum(tf.multiply(out1, out1), axis=1)
    out3 = tf.sqrt(out3)
    out4 = tf.reduce_sum(tf.multiply(out2, out2), axis=1)
    out4 = tf.sqrt(out4)
    out_denominator = tf.multiply(out3, out4)
    final_out = tf.divide(out_numerator, out_denominator)
    return final_out
And this is coming from the following:
<XA, YA> = (XA)^T (YA)
         = tf.reduce_sum(tf.multiply((X A), (Y A)), axis=1)
So I just want to know if this implementation is right, or you can correct me if I am missing something.
Not sure I understand your concern about the (None) dimension.
If I understand correctly, the cosine similarity between two identically shaped matrices X and Y ([batch, target_dim]) is just the matrix multiplication X * Y^T with some L2 normalization. Note X would be your out1 and Y would be your out2.
def Cosine_similarity(x, y, A):
    """Pair-wise Cosine similarity.

    First `x` and `y` are transformed by A.
    `X = xA^T` with shape [batch, target_dim],
    `Y = yA^T` with shape [batch, target_dim].

    Args:
      x: shaped [batch, feature_dim].
      y: shaped [batch, feature_dim].
      A: shaped [target_dim, feature_dim]. Transformation matrix to project
        from `feature_dim` to `target_dim`.

    Returns:
      A cosine similarity matrix shaped [batch, batch]. The entry
      at (i, j) is the cosine similarity value between vector `X[i, :]` and
      `Y[j, :]` where `X`, `Y` are the transformed `x` and `y` by `A`
      respectively. In other words, the entry at (i, j) is the pair-wise
      cosine similarity value between the i-th example of `x` and the j-th
      example of `y`.
    """
    x = tf.matmul(x, A, transpose_b=True)
    y = tf.matmul(y, A, transpose_b=True)
    x_norm = tf.nn.l2_normalize(x, axis=-1)
    y_norm = tf.nn.l2_normalize(y, axis=-1)
    y_norm_trans = tf.transpose(y_norm, [1, 0])
    sim = tf.matmul(x_norm, y_norm_trans)
    return sim
import numpy as np

feature_dim = 8
target_dim = 4
batch_size = 2

x = tf.placeholder(tf.float32, shape=(None, feature_dim))
y = tf.placeholder(tf.float32, shape=(None, feature_dim))
A = tf.placeholder(tf.float32, shape=(target_dim, feature_dim))
sim = Cosine_similarity(x, y, A)

with tf.Session() as sess:
    x, y, sim = sess.run([x, y, sim], feed_dict={
        x: np.ones((batch_size, feature_dim)),
        y: np.random.rand(batch_size, feature_dim),
        A: np.random.rand(target_dim, feature_dim)})
    print('x=\n', x)
    print('y=\n', y)
    print('sim=\n', sim)
Result:
x=
[[ 1. 1. 1. 1. 1. 1. 1. 1.]
[ 1. 1. 1. 1. 1. 1. 1. 1.]]
y=
[[ 0.01471654 0.76577073 0.97747731 0.06429122 0.91344446 0.47987637
0.09899797 0.773938 ]
[ 0.8555786 0.43403915 0.92445409 0.03393625 0.30154493 0.60895061
0.1233703 0.58597666]]
sim=
[[ 0.95917791 0.98181278]
[ 0.95917791 0.98181278]]
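Note the difference from your version: the function above returns the full [batch, batch] matrix of pair-wise similarities, whereas your code returns one value per row (x[i] against y[i]). If that per-row vector is what you want, it is the diagonal of sim; a minimal sketch:

# Per-example similarity between x[i] and y[i]: the diagonal of the pair-wise matrix.
paired_sim = tf.diag_part(sim)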

Network diverges with NaN in simple TensorFlow example

I am trying to follow the example from the Stanford series on TF by implementing a quadratic regression:
Y = W*X*X + u*X + b
The dataset can be found in the Cengage dataset, and the code is the following:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import xlrd

DATA = 'data\\slr05.xls'

# Read data
data = xlrd.open_workbook(DATA, encoding_override='utf-8')
sheet = data.sheet_by_index(0)
dataset = np.asarray([sheet.row_values(i) for i in range(1, sheet.nrows)])
n_samples = sheet.nrows - 1

X = tf.placeholder('float', name='X')
Y = tf.placeholder('float', name='Y')
W = tf.Variable(0.0, name='weights')
b = tf.Variable(0.0, name='bias')
u = tf.Variable(0.0, name='u_weight')

Y_ = X*X*W + X*u + b
loss = tf.square(Y - Y_, name='loss')
optimizer = tf.train.GradientDescentOptimizer(0.0001).minimize(loss)
init = tf.global_variables_initializer()
loss_average = []

# Start the Session
with tf.Session() as sess:
    sess.run(init)
    for i in range(10):
        for x, y in dataset:
            print(sess.run([optimizer, Y_, W, b, u, X, Y], feed_dict={X: x, Y: y}))
            loss_average.append(sess.run(loss, feed_dict={X: x, Y: y}))
The final W, b, and u values that I get are nan. I tried to check step by step why this is happening. So, in the output below I have included [optimizer, Y_, W, b, u, X, Y], and after a few row iterations I get:
[None, 3.9304674e+33, -1.0271335e+33, -7.7725354e+29, -2.8294217e+31, 36.2, 41.]
[None, -1.619979e+36, inf, 3.2321854e+32, 1.2834338e+34, 39.7, 147]
Apparently, during optimization W ends up at inf, which breaks the regression output.
Any idea what I have done wrong?
You have an exploding gradient problem here. That's because your X and Y, and consequently the difference values, are of magnitude 10^1, so the squared differences (your loss) are of magnitude 10^2. When you introduce X^2 into the regression, your difference values will be of magnitude 10^2, and their squares of magnitude 10^4. Therefore the gradients are much larger and the network diverges violently.
To correct for this, you can reduce the learning rate by a factor of 10^3, to put the gradients roughly back where they were, and lo and behold, this code (tested):
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import xlrd

DATA = 'slr05.xls'

# Read data
data = xlrd.open_workbook(DATA, encoding_override='utf-8')
sheet = data.sheet_by_index(0)
dataset = np.asarray([sheet.row_values(i) for i in range(1, sheet.nrows)])
n_samples = sheet.nrows - 1

X = tf.placeholder('float', name='X')
Y = tf.placeholder('float', name='Y')
W = tf.Variable(0.0, name='weights')
b = tf.Variable(0.0, name='bias')
u = tf.Variable(0.0, name='u_weight')

Y_ = X*X*W + X*u + b
# Y_ = X * u + b
loss = tf.square(Y - Y_, name='loss')
optimizer = tf.train.GradientDescentOptimizer(0.0000001).minimize(loss)
init = tf.global_variables_initializer()
loss_average = []

# Start the Session
with tf.Session() as sess:
    sess.run(init)
    for i in range(10):
        for x, y in dataset:
            print(sess.run([optimizer, loss, Y_, W, b, u, X, Y], feed_dict={X: x, Y: y}))
            loss_average.append(sess.run(loss, feed_dict={X: x, Y: y}))
will obediently and orderly converge, as nice networks do, outputting (last 5 lines only):
[None, 1313.2705, 9.760924, 0.06911032, 0.0014081484, 0.010015297, array(11.9, dtype=float32), array(46., dtype=float32)]
[None, 1174.7083, 7.7259817, 0.06986606, 0.0014150032, 0.010087272, array(10.5, dtype=float32), array(42., dtype=float32)]
[None, 1217.4297, 8.1083145, 0.07066501, 0.0014219815, 0.01016194, array(10.7, dtype=float32), array(43., dtype=float32)]
[None, 657.74097, 8.353538, 0.07126329, 0.0014271108, 0.010217336, array(10.8, dtype=float32), array(34., dtype=float32)]
[None, 299.5538, 1.6923765, 0.07134304, 0.0014305722, 0.010233952, array(4.8, dtype=float32), array(19., dtype=float32)]
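For what it's worth, another common remedy for exploding gradients is gradient clipping, which can let you keep a larger learning rate. A minimal sketch under the same TF1 setup (my addition, not part of the tested code above):

opt = tf.train.GradientDescentOptimizer(0.0001)
grads_and_vars = opt.compute_gradients(loss)
# Clip each gradient into [-1, 1] before applying it
clipped = [(tf.clip_by_value(g, -1.0, 1.0), v) for g, v in grads_and_vars]
optimizer = opt.apply_gradients(clipped)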

TensorFlow: Plotting a graph that describes data points and decision boundary

I would like to achieve something similar:
https://rootpy.github.io/root_numpy/_images/plot_multiclass_1.png
What would be the most elegant solution? Get the weights, bias, function and data and plot them with some other tool, or does TensorFlow have support for that?
As far as I know, Tensorflow does not directly support plotting decision boundaries.
It is certainly not the most elegant solution, but you can create a grid, classify each point of the grid, and then plot it. For example:
#!/usr/bin/env python

"""
Solve the XOR problem with Tensorflow.

The XOR problem is a two-class classification problem. You only have four
datapoints, all of which are given during training time. Each datapoint has
two features:

  x o
  o x

As you can see, the classifier has to learn a non-linear transformation of
the features to find a proper decision boundary.
"""

__author__ = "Martin Thoma"

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

# The training data
XOR_X = [[0, 0], [0, 1], [1, 0], [1, 1]]  # Features
# XOR_Y = [[0], [1], [1], [0]]  # Class labels (replaced by the one-hot targets below)
XOR_Y = [[1, 0], [0, 1], [0, 1], [1, 0]]  # Target values
assert len(XOR_X) == len(XOR_Y)  # sanity check

# The network
nb_classes = 2
input_ = tf.placeholder(tf.float32,
                        shape=[None, len(XOR_X[0])],
                        name="input")
target = tf.placeholder(tf.float32,
                        shape=[None, nb_classes],
                        name="output")
nb_hidden_nodes = 2
# enc = tf.one_hot([0, 1], 2)
w1 = tf.Variable(tf.random_uniform([2, nb_hidden_nodes], -1, 1),
                 name="Weights1")
w2 = tf.Variable(tf.random_uniform([nb_hidden_nodes, nb_classes], -1, 1),
                 name="Weights2")
b1 = tf.Variable(tf.zeros([nb_hidden_nodes]), name="Biases1")
b2 = tf.Variable(tf.zeros([nb_classes]), name="Biases2")
activation2 = tf.sigmoid(tf.matmul(input_, w1) + b1)
hypothesis = tf.nn.softmax(tf.matmul(activation2, w2) + b2)
cross_entropy = -tf.reduce_sum(target * tf.log(hypothesis))
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy)

# Start training
init = tf.initialize_all_variables()
with tf.Session() as sess:
    sess.run(init)
    for i in range(100000):
        sess.run(train_step, feed_dict={input_: XOR_X, target: XOR_Y})
        if i % 10000 == 0:
            print('Epoch ', i)
            print('Hypothesis ', sess.run(hypothesis,
                                          feed_dict={input_: XOR_X,
                                                     target: XOR_Y}))
            print('w1 ', sess.run(w1))
            print('b1 ', sess.run(b1))
            print('w2 ', sess.run(w2))
            print('b2 ', sess.run(b2))
            print('cost (ce)', sess.run(cross_entropy,
                                        feed_dict={input_: XOR_X,
                                                   target: XOR_Y}))

    # Visualize classification boundary
    xs = np.linspace(-5, 5)
    ys = np.linspace(-5, 5)
    pred_classes = []
    for x in xs:
        for y in ys:
            pred_class = sess.run(hypothesis,
                                  feed_dict={input_: [[x, y]]})
            pred_classes.append((x, y, pred_class.argmax()))
    xs_p, ys_p = [], []
    xs_n, ys_n = [], []
    for x, y, c in pred_classes:
        if c == 0:
            xs_n.append(x)
            ys_n.append(y)
        else:
            xs_p.append(x)
            ys_p.append(y)
    plt.plot(xs_p, ys_p, 'ro', xs_n, ys_n, 'bo')
    plt.show()
which gives a scatter plot of the grid points colored by predicted class.
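A slightly more compact variant, assuming sess, hypothesis and input_ from the code above are still in scope, classifies the whole grid in a single run and draws filled decision regions instead of individual points:

# Classify the full grid in one sess.run and plot the decision regions.
xx, yy = np.meshgrid(np.linspace(-5, 5, 100), np.linspace(-5, 5, 100))
grid = np.c_[xx.ravel(), yy.ravel()]
preds = sess.run(hypothesis, feed_dict={input_: grid}).argmax(axis=1)
plt.contourf(xx, yy, preds.reshape(xx.shape), alpha=0.5)
plt.show()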