I want to feed PyTorch gradients manually. In my real problem, I have my own adjoint function that does not use tensors. Is there any way I can define my own gradient function for PyTorch to use during optimization?
import numpy as np
import torch
# define rosenbrock function and gradient
x0 = np.array([0.1, 0.1])
a = 1
b = 5
def f(x):
return (a - x[0]) ** 2 + b * (x[1] - x[0] ** 2) ** 2
def jac(x):
dx1 = -2 * a + 4 * b * x[0] ** 3 - 4 * b * x[0] * x[1] + 2 * x[0]
dx2 = 2 * b * (x[1] - x[0] ** 2)
return np.array([dx1, dx2])
# create stochastic rosenbrock function and gradient
# (the crude analogy is that I have predefined stochastic
# forward and backward functions)
def f_rand(x):
return f(x) * np.random.uniform(0.5, 1.5)
def jac_rand(x): return jac(x) * np.random.uniform(0.5, 1.5)
x_tensor = torch.tensor(x0, requires_grad=True)
optimizer = torch.optim.Adam([x_tensor], lr=0.1)
# here, closure is fed f_rand to compute the gradient.
# I need to feed closure the gradient directly from jac_rand
def closure():
optimizer.zero_grad()
loss = f_rand(x_tensor)
loss.backward() # jac_rand(x)
return loss
for ii in range(200):
optimizer.step(closure)
print(x_tensor, f(x_tensor))
# tensor([1.0000, 1.0000], dtype=torch.float64, requires_grad=True) tensor(4.5799e-09, dtype=torch.float64, grad_fn=<AddBackward0>)
# ( this is the right answer, E[f(1, 1)] = 0 )
I've tried defining a custom function, but I can't get it to work. This is my best attempt so far:
import numpy as np
import torch
# define rosenbrock function and gradient
x0 = np.array([0.1, 0.1])
a = 1
b = 5
def f(x):
return (a - x[0]) ** 2 + b * (x[1] - x[0] ** 2) ** 2
def jac(x):
dx1 = -2 * a + 4 * b * x[0] ** 3 - 4 * b * x[0] * x[1] + 2 * x[0]
dx2 = 2 * b * (x[1] - x[0] ** 2)
return np.array([dx1, dx2])
# create stochastic rosenbrock function and gradient
def f_rand(x):
return f(x) * np.random.uniform(0.5, 1.5)
def jac_rand(x): return jac(x) * np.random.uniform(0.5, 1.5)
class custom_function(torch.autograd.Function):
#staticmethod
def forward(ctx, input):
ctx.save_for_backward(input)
return f_rand(input)
#staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
return grad_output * g_rand(input)
x_tensor = torch.tensor(x0, requires_grad=False)
optimizer = torch.optim.Adam([x_tensor], lr=0.1)
for ii in range(200):
print('x_tensor ', x_tensor)
optimizer.step(custom_function())
print(x_tensor, f(x_tensor))
It says:
RuntimeError: Legacy autograd function with non-static forward method is deprecated. Please use new-style autograd function with static forward method. (Example: https://pytorch.org/docs/stable/autograd.html#torch.autograd.Function)
Not quite sure if this is exactly what you want, but loss.backward() computes gradients via PyTorch's computational graph and stores them in the parameter tensors themselves (in your case, in x_tensor), where they can be accessed via x_tensor.grad. If you don't want PyTorch to compute the gradients with loss.backward(), you can feed your own gradients into the tensor's .grad attribute instead, as follows:
with torch.no_grad():
def closure():
optimizer.zero_grad()
loss = f_rand(x_tensor)
x_tensor.grad = torch.from_numpy(jac_rand(x_tensor))
return loss
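For reference, here is a minimal end-to-end sketch of that idea, reusing x0, f, f_rand and jac_rand from the question (the .detach().numpy() conversion is my own addition to keep the NumPy functions away from autograd). The closure never calls backward(); it writes the externally computed gradient into x_tensor.grad and lets the optimizer consume it:
import numpy as np
import torch
x_tensor = torch.tensor(x0, requires_grad=False)
optimizer = torch.optim.Adam([x_tensor], lr=0.1)
def closure():
    optimizer.zero_grad()
    x_np = x_tensor.detach().numpy()                  # current point as a plain NumPy array
    loss = f_rand(x_np)                               # forward pass, entirely outside autograd
    x_tensor.grad = torch.from_numpy(jac_rand(x_np))  # feed the externally computed gradient
    return loss
for _ in range(200):
    optimizer.step(closure)
print(x_tensor, f(x_tensor))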
I made some modifications, mainly the learning rate and the number of iterations. You will see the loss goes to zero as the tensor approaches (a, a²).
import numpy as np
import torch
# define rosenbrock function and gradient
np.random.seed(0)
x0 = np.array([0.1, 0.1])
a = 6
b = 100
def f(x):
return (a - x[0]) ** 2 + b * (x[1] - x[0] ** 2) ** 2
def jac(x):
dx1 = -2 * a + 4 * b * x[0] ** 3 - 4 * b * x[0] * x[1] + 2 * x[0]
dx2 = 2 * b * (x[1] - x[0] ** 2)
return np.array([dx1, dx2])
# create stochastic rosenbrock function and gradient
def f_rand(x):
#return f(x)
return f(x) * np.random.uniform(0.5, 1.5)
def jac_rand(x):
#return jac(x)
return jac(x) * np.random.uniform(0.5, 1.5)
class CustomFunction(torch.autograd.Function):
@staticmethod
def forward(ctx, input):
ctx.save_for_backward(input)
return f_rand(input)
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
return grad_output * jac_rand(input)
custom_function = CustomFunction.apply
x_tensor = torch.tensor(x0, requires_grad=True)
optimizer = torch.optim.Adam([x_tensor], lr=0.0001)
print('x_tensor ', x_tensor)
for ii in range(1000000):
optimizer.zero_grad()
output=custom_function(x_tensor)
loss = round(output.item(),8)
if loss < 0.0000001:
print('loss: ',loss)
break
print('loss: ',loss)
output.backward()
optimizer.step()
print(x_tensor, f(x_tensor))
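As an aside, the RuntimeError in the question comes from the legacy calling style: with the new-style API, forward and backward are @staticmethods and the function is invoked through CustomFunction.apply(x_tensor) (aliased to custom_function above) inside the training loop, rather than by instantiating the class and passing it to optimizer.step.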
I am trying to code a neural network using only numpy and pandas. I am having issues with the dimensions of my data: I am getting the error "ValueError: operands could not be broadcast together with shapes (150,) (150,3)". Not sure what the alternative is here, as we are trying to predict one of the three types of flower based on 4 numerical values. Here is my code:
import numpy as np
import pandas as pd
class NeuralNet():
def __init__(self, i_dim, h_dim, o_dim, lr):
self.i_dim = i_dim
self.h_dim = h_dim
self.o_dim = o_dim
self.lr = lr
self.weights1 = np.random.randn(self.i_dim, self.h_dim) / np.sqrt(self.i_dim)
self.bias1 = np.zeros((1, self.h_dim))
self.weights2 = np.random.randn(self.h_dim, self.o_dim) / np.sqrt(self.h_dim)
self.bias2 = np.zeros((1, self.o_dim))
def sigmoid(self, x):
return 1 / (1 + np.exp(-x))
def softmax(self, x):
exps = np.exp(x - np.max(x, axis=1, keepdims=True))
return exps / np.sum(exps, axis=1, keepdims=True)
def forward(self, X):
self.layer1 = self.sigmoid(np.dot(X, self.weights1) + self.bias1)
self.layer2 = self.softmax(np.dot(self.layer1, self.weights2) + self.bias2)
return self.layer2
def sigmoid_derivative(self, x):
return x * (1 - x)
def softmax_derivative(self, x):
s = x.reshape(-1, 1)
return np.diagflat(s) - np.dot(s, s.T)
def backward(self, X, y, y_hat):
d_softmax = self.softmax_derivative(y_hat)
d_sigmoid = self.sigmoid_derivative(self.layer1)
d_weights2 = np.dot(self.layer1.T, (2 * (y - y_hat) * d_softmax))
d_bias2 = np.sum(2 * (y - y_hat) * d_softmax, axis=0, keepdims=True)
d_weights1 = np.dot(X.T, (np.dot(2 * (y - y_hat) * d_softmax, self.weights2.T) * d_sigmoid))
d_bias1 = np.sum(np.dot(2 * (y - y_hat) * d_softmax, self.weights2.T) * d_sigmoid, axis=0)
self.weights1 -= self.lr * d_weights1
self.bias1 -= self.lr * d_bias1
self.weights2 -= self.lr * d_weights2
self.bias2 -= self.lr * d_bias2
def cross_ent_loss(self):
sample_losses = - self.y * np.log(self.y_hat) - (1 - self.y) * np.log(1 - self.y_hat)
loss = np.mean(sample_losses)
return loss
def train(self, X, y, epochs):
for epoch in range(epochs):
y_hat = self.forward(X)
self.backward(X, y, y_hat)
loss = self.cross_ent_loss()
print(f"Epoch {epoch}: Loss = {loss}")
if epoch % 10 == 0:
print(f"Epoch {epoch}: Loss = {loss}")
def predict(self, X):
return self.forward(X)
df = pd.read_csv('/Users/brasilgu/PycharmProjects/NNfs/venv/lib/iris.data.txt', header=None)
X_train = df.iloc[:, :4].values
y_train = df.iloc[:, -1].values
nn = NeuralNet(4, 5, 3, 0.1)
nn.train(X_train, y_train, 1000)
y_pred = nn.predict(X_train)
y_pred_labels = np.argmax(y_pred, axis=1)
print(y_pred)
The stack trace of the error:
Traceback (most recent call last):
File "/Users/brasilgu/PycharmProjects/NNfs/venv/lib/neural_net.py", line 72, in <module>
nn.train(X_train, y_train, 1000)
File "/Users/brasilgu/PycharmProjects/NNfs/venv/lib/neural_net.py", line 57, in train
self.backward(X, y, y_hat)
File "/Users/brasilgu/PycharmProjects/NNfs/venv/lib/neural_net.py", line 39, in backward
d_weights2 = np.dot(self.layer1.T, (2 * (y - y_hat) * d_softmax))
ValueError: operands could not be broadcast together with shapes (150,) (150,3)
I looked at the publicly available iris dataset, and according to your code the y appears to be a rank-one array with shape (150,).
So modify your y_train with y_train = y_train.reshape(-1, 1) to make it a proper matrix before creating the NeuralNet.
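As a quick illustration of why the reshape matters (a standalone sketch, not taken from your script): a (150,) array does not broadcast against the (150, 3) network output, whereas a (150, 1) column does:
import numpy as np
y = np.zeros(150)            # rank-one array, shape (150,)
y_hat = np.zeros((150, 3))   # network output, one row per sample
# y - y_hat                  # ValueError: operands could not be broadcast together
y2 = y.reshape(-1, 1)        # shape (150, 1)
print((y2 - y_hat).shape)    # (150, 3): the column broadcasts across the 3 outputs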
I've been trying to experiment with region-based Dice loss, but there are so many variations on the internet that I could not find two identical implementations, and all of them produce different results. Below are the implementations that I found. Some use a smoothing factor, which the authors of this paper call epsilon, some use it in both the numerator and the denominator, one implementation uses gamma, etc.
Could someone please help me with the correct implementation?
import tensorflow as tf
import tensorflow.keras.backend as K
import numpy as np
def dice_loss1(y_true, y_pred, smooth=1e-6):
'''
https://www.kaggle.com/code/bigironsphere/loss-function-library-keras-pytorch/notebook
'''
y_pred = tf.convert_to_tensor(y_pred)
y_true = tf.cast(y_true, y_pred.dtype)
smooth = tf.cast(smooth, y_pred.dtype)
y_pred = K.flatten(y_pred)
y_true = K.flatten(y_true)
intersection = K.sum(K.dot(y_true, y_pred))
dice_coef = (2*intersection + smooth) / (K.sum(y_true) + K.sum(y_pred) + smooth)
dice_loss = 1-dice_coef
return dice_loss
def dice_loss2(y_true, y_pred, smooth=1e-6): # Only Smooth
"""
https://gist.github.com/wassname/7793e2058c5c9dacb5212c0ac0b18a8a
"""
y_pred = tf.convert_to_tensor(y_pred)
y_true = tf.cast(y_true, y_pred.dtype)
smooth = tf.cast(smooth, y_pred.dtype)
intersection = K.sum(K.abs(y_true * y_pred), axis=-1)
dice_coef = (2. * intersection + smooth) / (K.sum(K.square(y_true),-1) + K.sum(K.square(y_pred),-1) + smooth)
return 1- dice_coef
def dice_loss3(y_true, y_pred): # No gamma, no smooth
'''
https://lars76.github.io/2018/09/27/loss-functions-for-segmentation.html
'''
y_pred = tf.convert_to_tensor(y_pred)
y_true = tf.cast(y_true, y_pred.dtype)
y_pred = tf.math.sigmoid(y_pred)
numerator = 2 * tf.reduce_sum(y_true * y_pred)
denominator = tf.reduce_sum(y_true + y_pred)
return 1 - numerator / denominator
def dice_loss4(y_true, y_pred, smooth=1e-6, gama=1): # Gama + Smooth is used
'''
https://dev.to/_aadidev/3-common-loss-functions-for-image-segmentation-545o
'''
y_pred = tf.convert_to_tensor(y_pred)
y_true = tf.cast(y_true, y_pred.dtype)
smooth = tf.cast(smooth, y_pred.dtype)
gama = tf.cast(gama, y_pred.dtype)
nominator = 2 * tf.reduce_sum(tf.multiply(y_pred, y_true)) + smooth
denominator = tf.reduce_sum(y_pred ** gama) + tf.reduce_sum(y_true ** gama) + smooth
result = 1 - tf.divide(nominator, denominator)
return result
y_true = np.array([[0,0,1,0],
[0,0,1,0],
[0,0,1.,0.]])
y_pred = np.array([[0,0,0.9,0],
[0,0,0.1,0],
[1,1,0.1,1.]])
# print(dice_loss1(y_true, y_pred)) # Gives you error in K.dot()
print(dice_loss2(y_true, y_pred))
print(dice_loss3(y_true, y_pred)) # provides array of values
print(dice_loss4(y_true, y_pred))
I used a variation of the dice loss for brain tumor segmentation. The implementation of the dice coefficient I used was:
def dice_coef(y_true, y_pred, smooth=100):
y_true_f = K.flatten(y_true)
y_pred_f = K.flatten(y_pred)
intersection = K.sum(y_true_f * y_pred_f)
dice = (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth)
return dice
In order to make it a loss, it needs to be made into a function we want to minimize. This can be accomplished by making it negative:
def dice_coef_loss(y_true, y_pred):
return -dice_coef(y_true, y_pred)
or subtracting it from 1:
def dice_coef_loss(y_true, y_pred):
return 1 - dice_coef(y_true, y_pred)
or applying some other function then negating - for example, taking the negative logarithm (which could smooth the gradients):
def dice_coef_loss(y_true, y_pred):
return -K.log(dice_coef(y_true, y_pred))
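As a quick numerical aside (my own illustration, not part of the original answer): 1 - dice has a constant slope of -1 with respect to the dice coefficient, whereas -log(dice) has slope -1/dice, so it pushes back much harder when the overlap is poor:
import numpy as np
for d in [0.99, 0.5, 0.1, 0.01]:
    print(f"dice={d:5.2f}   1-d={1 - d:5.2f} (slope -1)   -log(d)={-np.log(d):6.2f} (slope {-1 / d:8.1f})")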
The variable smooth covers what you observed in the other implementations under various names (smoothing, epsilon, etc.). For clarity, this smoothing variable exists to handle the case where the ground truth has very few (or no) white pixels (assuming white pixels belong to a class or to the boundary of an object, depending on your implementation).
If smooth is set too low, then when the ground truth has few or no white pixels and the predicted image has some non-zero number of white pixels, the model is penalized more heavily. Setting smooth higher means that if the predicted image has a small number of white pixels while the ground truth has none, the loss value will be lower. Depending on how aggressive the model needs to be, though, a lower value may be preferable.
Here's an illustrative example:
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
def dice_coef(y_true, y_pred, smooth):
y_true_f = K.flatten(y_true)
y_pred_f = K.flatten(y_pred)
intersection = K.sum(y_true_f * y_pred_f)
dice = (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth)
return dice
def dice_coef_loss(y_true, y_pred, smooth):
return 1 - dice_coef(y_true, y_pred, smooth)
if __name__ == '__main__':
smooth = 10e-6
y_pred = np.zeros((1, 128, 128))
# one pixel is set to 1
y_pred[0, 0, 0] = 1
y_pred = tf.convert_to_tensor(y_pred, dtype=tf.float32)
y_true = tf.zeros((1, 128, 128), dtype=tf.float32)
print(dice_coef(y_true, y_pred, smooth=smooth))
print(dice_coef_loss(y_true, y_pred, smooth=smooth))
will print out:
tf.Tensor(9.9999e-06, shape=(), dtype=float32)
tf.Tensor(0.99999, shape=(), dtype=float32)
But if smooth is set to 100:
tf.Tensor(0.990099, shape=(), dtype=float32)
tf.Tensor(0.009900987, shape=(), dtype=float32)
This shows the loss dropping to about 0.0099 instead of 0.99999.
For completeness, if you have multiple segmentation channels (B x W x H x K, where B is the batch size, W and H are the dimensions of your image, and K is the number of segmentation channels), the same concepts apply, but it can be implemented as follows:
def dice_coef_multilabel(y_true, y_pred, M, smooth):
dice = 0
for index in range(M):
dice += dice_coef(y_true[:,:,:,index], y_pred[:,:,:,index], smooth)
return dice
And it can be converted into a loss function through negation or subtraction, in the same way as dice_coef. smooth could also be tuned per channel if you supply a list or some other sequence (e.g. smooth_list):
def dice_coef_multilabel(y_true, y_pred, M, smooth_list):
dice = 0
for index in range(M):
dice += dice_coef(y_true[:,:,:,index], y_pred[:,:,:,index], smooth_list[index])
return dice
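A quick usage sketch (my own, with made-up shapes, reusing dice_coef and dice_coef_multilabel from above), for a batch of 2 images with K = 3 channels and a heavier smooth on the last channel:
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
y_true = tf.constant(np.random.randint(0, 2, size=(2, 16, 16, 3)), dtype=tf.float32)
y_pred = tf.constant(np.random.rand(2, 16, 16, 3), dtype=tf.float32)
# sums the per-channel dice coefficients; subtract from M (or negate) to use it as a loss
print(dice_coef_multilabel(y_true, y_pred, M=3, smooth_list=[1e-6, 1e-6, 100.0]))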
This is with tf 2.1.0.
The following works up until you try to fit the compiled model. Is there something I can do to make the .compile and .fit methods work with multiple tensor inputs?
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
tf.keras.backend.set_floatx('float64')
m = 250 # samples
n_x = 1 # dim of x
n_tau = 11
x = (2 * np.random.rand(m, n_x).astype(np.float64) - 1) * 2
i = np.argsort(x[:, 0])
x = x[i] # to make plotting nicer
A = np.random.randn(n_x, 1)
y = x ** 2 + 0.3 * x + 0.4 * np.random.randn(m, 1).astype(np.float64)
y = y.dot(A) # y is 1d
y = y[:, :, None]
tau = np.linspace(1.0 / n_tau, 1 - 1.0 / n_tau, n_tau).astype(np.float64)
tau = tau[None, :, None]
def loss(tau_y, u):
tau = tau_y[0]
y = tau_y[1]
u = y - u
res = u ** 2 * (tau - tf.where(u <= np.float64(0.0), np.float64(1.0), np.float64(0.0)))
return tf.reduce_sum(tf.reduce_mean(res, axis=[1, 2]), axis=0)
tf.keras.backend.set_floatx('float64')
class My(tf.keras.models.Model):
def __init__(self):
super().__init__()
self._my_layer = tf.keras.layers.Dense(1, dtype=tf.float64)
def call(self, inputs):
tau = inputs[0]
y = inputs[1]
tf.print(tau.shape, y.shape)
return self._my_layer(tau)
model = My()
u = model((tau, y)) # calling model works
l = loss((tau, y), model((tau, y))) # call loss works
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt, loss=loss)
# this fails with the error below
model.fit((tau, y), (tau, y))
# ValueError: Error when checking model target: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 array(s), for inputs ['output_1'] but instead got the following list of 2 arrays: [array([[[0.09090909],
# [0.17272727],
# [0.25454545],
# [0.33636364],
# [0.41818182],
# [0.5 ],
# [0.58181818],
# [0.66363636],
# [0.74545455],
# ...
I stumbled across a strange phenomenon while playing around with variational autoencoders. The problem is quite simple to describe:
When defining the loss function for the VAE, you have to use some kind of reconstruction error. I decided to use my own implementation of cross-entropy, as I wasn't able to get reasonable results with any function provided by tensorflow. It looks like this:
x_hat = tf.contrib.layers.fully_connected(fc2,
input_dim,
activation_fn=tf.sigmoid)
## Define the loss
reconstruction_loss = -tf.reduce_sum(
x * tf.log(epsilon + x_hat) +
(1 - x) * tf.log(epsilon + 1 - x_hat),
axis=1)
It uses the output of the reconstruction layer, which applies the sigmoid function to squash it into the [0, 1] range. Now I wanted to apply the sigmoid within the loss function instead, and changed it to
x_hat = tf.contrib.layers.fully_connected(fc2,
input_dim,
activation_fn=None)
## Define the loss
reconstruction_loss = -tf.reduce_sum(
x * tf.log(epsilon + tf.sigmoid(x_hat)) +
(1 - x) * tf.log(epsilon + 1 - tf.sigmoid(x_hat)),
axis=1)
I'm convinced that this should provide nearly identical results. In practice, though, this second attempt results in weird grey pictures. The originals seem blurry and much brighter, too. First the okay version, then the alternative "wrong" version.
Can someone explain to me what causes this weird behavior?
If you want to test it yourself, below is my source code. You have to comment the respective blocks in or out to get the results. Thanks!
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import matplotlib.pyplot as plt
import numpy as np
mnist = input_data.read_data_sets('MNIST_data/', one_hot=True)
n_samples = mnist.train.num_examples
input_dim = mnist.train.images[0].shape[0]
inter_dim = 256
encoding_dim = 5
epsilon = 1e-10
learning_rate = 1e-4
n_epochs = 20
batch_size = 100
width = 28
## Define the variational autoencoder model
x = tf.placeholder(dtype=tf.float32,
shape=[None, input_dim],
name='x')
fc1 = tf.contrib.layers.fully_connected(x,
inter_dim,
activation_fn=tf.nn.relu)
z_mean = tf.contrib.layers.fully_connected(fc1,
encoding_dim,
activation_fn=None)
z_log_var = tf.contrib.layers.fully_connected(fc1,
encoding_dim,
activation_fn=None)
eps = tf.random_normal(shape=tf.shape(z_log_var),
mean=0,
stddev=1,
dtype=tf.float32)
z = z_mean + tf.exp(z_log_var / 2) * eps
fc2 = tf.contrib.layers.fully_connected(z,
inter_dim,
activation_fn=tf.nn.relu)
x_hat = tf.contrib.layers.fully_connected(fc2,
input_dim,
activation_fn=tf.sigmoid)
#activation_fn=None)
## Define the loss
reconstruction_loss = -tf.reduce_sum(
x * tf.log(epsilon + x_hat) +
(1 - x) * tf.log(epsilon + 1 - x_hat),
axis=1)
# ALTERNATIVE LOSS W/ APPLYING SIGMOID, REMOVED ACTIVATION FROM OUTPUT LAYER
'''
reconstruction_loss = -tf.reduce_sum(
x * tf.log(epsilon + tf.sigmoid(x_hat)) +
(1 - x) * tf.log(epsilon + 1 - tf.sigmoid(x_hat)),
axis=1)
'''
KL_div = -.5 * tf.reduce_sum(
1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var),
axis=1)
total_loss = tf.reduce_mean(reconstruction_loss + KL_div)
## Define the training operator
train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(total_loss)
## Run it
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for epoch in range(n_epochs):
for _ in range(n_samples // batch_size):
batch = mnist.train.next_batch(batch_size)
_, loss, recon_loss, KL_loss = sess.run([train_op,
total_loss,
reconstruction_loss,
KL_div],
feed_dict={x:batch[0]})
print('[Epoch {}] loss: {}'.format(epoch, loss))
print('Training Done')
## Reconstruct a few samples to validate the training
batch = mnist.train.next_batch(100)
x_reconstructed = sess.run(x_hat, feed_dict={x:batch[0]})
n = np.sqrt(batch_size).astype(np.int32)
I_reconstructed = np.empty((width*n, 2*width*n))
for i in range(n):
for j in range(n):
x = np.concatenate(
(x_reconstructed[i*n+j, :].reshape(width, width),
batch[0][i*n+j, :].reshape(width, width)),
axis=1
)
I_reconstructed[i*width:(i+1)*width, j*2*width:(j+1)*2*width] = x
fig = plt.figure()
plt.imshow(I_reconstructed, cmap='gray')
EDIT1: SOLUTION
Thanks to @xdurch0, I realized that the reconstructed output is no longer rescaled via the sigmoid function, which means the sigmoid has to be applied to the image before plotting it. Just modify the output:
x_reconstructed = sess.run(tf.sigmoid(x_hat), feed_dict={x:batch[0]})
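As a side note (not part of the original fix): when the network outputs logits, the hand-written x * tf.log(epsilon + tf.sigmoid(x_hat)) + ... expression can also be replaced by TensorFlow's built-in, numerically stable helper, e.g.:
reconstruction_loss = tf.reduce_sum(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=x, logits=x_hat),
    axis=1)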
I expected the gradient for tf.sign() in TensorFlow to be equal to 0 or None. However, when I examined the gradients, I found that they were equal to very small numbers (e.g. 1.86264515e-09). Why is that?
(If you are curious as to why I even want to know this, it is because I want to implement the "straight-through estimator" described here, and before overriding the gradient for tf.sign(), I wanted to check that the default behavior was in fact what I was expecting.)
EDIT: Here is some code which reproduces the error. The model is just the linear regression model from the introduction to TensorFlow, except that I use y=sign(W)x + b instead of y=Wx + b.
import tensorflow as tf
import numpy as np
def gvdebug(g, v):
g2 = tf.zeros_like(g, dtype=tf.float32)
v2 = tf.zeros_like(v, dtype=tf.float32)
g2 = g
v2 = v
return g2,v2
# Create 100 phony x, y data points in NumPy, y = x * 0.1 + 0.3
x_data = np.random.rand(100).astype(np.float32)
y_data = x_data * 0.1 + 0.3
# Try to find values for W and b that compute y_data = W * x_data + b
# (We know that W should be 0.1 and b 0.3, but TensorFlow will
# figure that out for us.)
W = tf.Variable(tf.random_uniform([1], -1.0, 1.0))
b = tf.Variable(tf.zeros([1]))
y = tf.sign(W) * x_data + b
# Minimize the mean squared errors.
loss = tf.reduce_mean(tf.square(y - y_data))
optimizer = tf.train.GradientDescentOptimizer(0.5)
grads_and_vars = optimizer.compute_gradients(loss)
gv2 = [gvdebug(gv[0], gv[1]) for gv in grads_and_vars]
apply_grads = optimizer.apply_gradients(gv2)
# Before starting, initialize the variables. We will 'run' this first.
init = tf.initialize_all_variables()
# Launch the graph.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.01)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
sess.run(init)
# Fit the line.
for step in range(201):
sess.run(apply_grads)
if (step % 20 == 0) or ((step-1) % 20 == 0):
print("")
print(sess.run(gv2[0][1])) #the variable
print(sess.run(gv2[0][0])) #the gradient
print("")
print(step, sess.run(W), sess.run(b))
# Learns best fit is W: [0.1], b: [0.3]
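For context, the straight-through override mentioned in the question is usually implemented in graph-mode TensorFlow by borrowing the Identity op's registered gradient for the Sign op. A rough sketch under that assumption (my own, reusing W, x_data and b from above, not part of the question itself):
g = tf.get_default_graph()
with g.gradient_override_map({"Sign": "Identity"}):
    y = tf.sign(W) * x_data + b  # the backward pass now treats sign() as a pass-through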