Is the argsort function differentiable in Tensorflow? - tensorflow

By this I mean, can I include it in a loss function and have autodiff function properly?
The raw_ops docs (https://www.tensorflow.org/api_docs/python/tf/raw_ops) has no listing for sort or argsort.

I ran the following experiment in Colab:
import tensorflow as tf
x = tf.constant([[4.0, 2.1, 1.0]])
w = tf.Variable([[1.0, 1.0, 1.0]], name='w')
y_true = tf.constant([[1.0, 2.0, 3.0]])
#tf.function
def loss_fn(y_true, y_pred):
    """Sum of squared errors between ``y_true`` and ascending-sorted ``y_pred``.

    ``y_pred`` is sorted along its last axis by gathering it with the indices
    returned by ``tf.argsort``; the sorted values are then compared
    element-wise with ``y_true``.

    Args:
        y_true: Target tensor, broadcastable against ``y_pred``.
        y_pred: Prediction tensor; sorted along its last axis.

    Returns:
        Scalar tensor holding the summed squared differences.
    """
    # argsort only produces integer indices; y_pred's values enter the loss
    # through the gather below.
    indices = tf.argsort(y_pred)
    # batch_dims=-1 makes every row be gathered with its own indices. Without
    # it, a (B, N) input is gathered with every row's indices, giving a
    # (B, B, N) result that is only correct for B == 1.
    sorted_pred = tf.gather(y_pred, indices, axis=-1, batch_dims=-1)
    return tf.reduce_sum(tf.square(y_true - sorted_pred))
# Record the forward pass so the loss can be differentiated w.r.t. w.
with tf.GradientTape() as tape:
    y = x * w
    loss = loss_fn(y_true, y)
# Gradient of the loss with respect to w (shown as the cell output in Colab).
tape.gradient(loss, [w])
The computed loss is 1.01 and the gradients for w seem to make sense to me.
So I would say the answer is yes, if you are using argsort() for indexing purposes. If you have something else in mind maybe you can tweak the example above and figure out if the gradients behave as you expect.

Related

How to get the Weighted Average Mean in Tensorflow

From the example data below I need sum_product(x, y) / sum(y), with x and y as inputs. This part of the model could probably even be trainable=False, but is there a simpler way to do this calculation (either from a single tensor, or at least from separate tensors for the values and the weights)?
Is there a cleaner graph for this task than the one I've created?
So far I have only managed to write this rather long code for such a simple thing:
import numpy as np
import tensorflow as tf
from keras import backend as K
# Example data: one sample with four values and their four weights.
x= np.array([[1100, 1200, 1300, 1400]] ) # vals
y= np.array([[10, 50, 30, 5]] ) # weights
# Two model inputs with a fixed batch size of 1: the values and the weights.
inpS= tf.keras.layers.Input(shape=(4,), batch_size=1, name='inp1', dtype='float32')
inpW= tf.keras.layers.Input(shape=(4,), batch_size=1, name='inp2', dtype='float32')
# Numerator: dot product of values and weights along axis 1.
dot_product = tf.keras.layers.Dot(axes=1, normalize=False, trainable=False)([inpS, inpW])
# Denominator: sum of the weights along axis 1, keeping that axis.
wsum = tf.keras.layers.Lambda( lambda z: K.sum(z, axis=1, keepdims=True))(inpW)
# Concatenate numerator and denominator so one Lambda layer can read both.
con= tf.keras.layers.Concatenate(axis=-1)([dot_product, wsum]) #for Multiple input into Lambda layer
# Divide entry [0][0] by entry [0][1]; only the first row of the batch is read.
wa = tf.keras.layers.Lambda(lambda x: x[0][0]/x[0][1])(con)
model = tf.keras.Model([inpS, inpW], wa)
model.predict([x,y])
The result should be:
117000/95=1231.5789794921875
It is really very easy to simplify:
Weighted_Av_Mean = tf.reduce_sum(weights * x) / tf.reduce_sum(weights)
tf.print(Weighted_Av_Mean)
(Solved thanks to a suggestion in the comments.)
Or, alternatively:
# cast x & weights numpy_arrays first
x = tf.dtypes.cast(x,tf.float32)
weights = tf.dtypes.cast(weights,tf.float32)
# gives mean & variance
WAMean2= tf.nn.weighted_moments(
x, axes=[1], frequency_weights= weights, keepdims=False, name=None
)
tf.print('mean: ',WAMean2[0])

Exploding LOSS in TensorFlow 2.0 Linear Regression Example using GradientTape

I'm trying to construct a small educational example of multivariate linear regression, but the loss keeps increasing until it explodes instead of getting smaller. Any idea why?
import tensorflow as tf
tf.__version__
import numpy as np
data = np.array(
[
[100,35,35,12,0.32],
[101,46,35,21,0.34],
[130,56,46,3412,12.42],
[131,58,48,3542,13.43]
]
)
x = data[:,1:-1]
y_target = data[:,-1]
def loss_function(y, pred):
    """Return the mean squared error between targets ``y`` and ``pred``."""
    return tf.reduce_mean(tf.square(y - pred))
def train(b, w, x, y, lr=0.012):
    """Apply one plain gradient-descent update to ``w`` and ``b`` in place.

    Args:
        b: Bias variable, updated via ``assign_sub``.
        w: Weight variable, updated via ``assign_sub``.
        x: Input features passed to ``linear_model``.
        y: Target values.
        lr: Step size multiplied with each gradient.
    """
    # linear_model is defined elsewhere in the script; it is called inside
    # the tape so the loss can be differentiated w.r.t. w and b.
    with tf.GradientTape() as t:
        current_loss = loss_function(y, linear_model(x))
    grad_w, grad_b = t.gradient(current_loss, [w, b])
    w.assign_sub(lr * grad_w)
    b.assign_sub(lr * grad_b)
epochs = 80
for epoch_count in range(epochs):
    # The loss is measured before the update, so each printed value
    # describes the parameters as they were at the start of the epoch.
    real_loss = loss_function(y_target, linear_model(x))
    # Note: lr=0.12 overrides train()'s default of 0.012.
    train(b, w, x, y_target, lr=0.12)
    print(f"Epoch count {epoch_count}: Loss value: {real_loss.numpy()}")
This even happens if I initialize the weights with the "correct" values (found out via a scikit-learn regressor)
w = tf.Variable([-1.76770250e-04,3.46688912e-01,2.43827475e-03],dtype=tf.float64)
b = tf.Variable(-11.837184241807234,dtype=tf.float64)
Here's how you might use a TF2 optimizer for a toy example (as per the comment). I know this is not the answer but I didn't want to post this in the comments section, as it will mess up the indentation and all that.
tf_x = tf.Variable(tf.constant(2.0,dtype=tf.float32),name='x')
optimizer = tf.optimizers.SGD(learning_rate=0.1)
# Optimizing tf_x using gradient tape
x_series, y_series = [],[]
for step in range(5):
    # Log the current value of tf_x before it is updated.
    x_series.append(tf_x.numpy().item())
    with tf.GradientTape() as tape:
        tf_y = tf_x**2
    gradients = tape.gradient(tf_y, tf_x)
    # apply_gradients expects (gradient, variable) pairs.
    optimizer.apply_gradients(zip([gradients], [tf_x]))
Based on @thushv89's input, I'm providing an intermediate solution here using a TF2 optimizer. It works, although it does not fully answer my question.
import tensorflow as tf
tf.__version__
import numpy as np
data = np.array(
[
[100,35,35,12,0.32],
[101,46,35,21,0.34],
[130,56,46,3412,12.42],
[131,58,48,3542,13.43]
]
)
x = data[:,1:-1]
y_target = data[:,-1]
w = tf.Variable([1,1,1],dtype=tf.float64)
b = tf.Variable(1,dtype=tf.float64)
def linear_model(x):
    """Return ``b + x . w`` using the module-level variables ``w`` and ``b``."""
    return b + tf.tensordot(x, w, axes=1)
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.MeanSquaredLogarithmicError()
def train_step(x, y):
    """Run one optimizer update of ``b`` and ``w`` and print the loss.

    Args:
        x: Input features passed to ``linear_model``.
        y: Target values passed to ``loss_object``.
    """
    # Only the forward pass needs to be recorded on the tape.
    with tf.GradientTape() as tape:
        predicted = linear_model(x)
        loss_value = loss_object(y, predicted)
    print(f"Loss Value:{loss_value}")
    grads = tape.gradient(loss_value, [b, w])
    # Gradients and variables are paired in the same [b, w] order.
    optimizer.apply_gradients(zip(grads, [b, w]))
def train(epochs):
    """Call ``train_step`` on the full data set ``epochs`` times."""
    for epoch in range(epochs):
        train_step(x, y_target)
        print('Epoch {} finished'.format(epoch))
train(epochs = 1000)

how does TensorFlow handle the differentials for L1 regularization?

it seems that you can just declare a cost function by tf.abs() and then pass it down to auto-gradient generation (see https://github.com/nfmcclure/tensorflow_cookbook/blob/master/03_Linear_Regression/04_Loss_Functions_in_Linear_Regressions/04_lin_reg_l1_vs_l2.py)
. but we know abs() is not differentiable.
how is this done in Tensorflow? does it just randomly throw a number in [-1,1] ?
if someone could please point me to the implementation that would be great. Thanks!
(I looked for tensorflow.py in the git, but it does not even exist)
f(x) = abs(x) is differentiable everywhere except at x = 0. Its derivative equals $f'(x) = \operatorname{sign}(x)$, i.e. $f'(x) = -1$ for $x < 0$ and $f'(x) = 1$ for $x > 0$.
So the only question is how tensorflow implements derivative at x=0. You can check this manually:
import tensorflow as tf
# Evaluate the gradient of abs(x) exactly at the non-differentiable point x = 0.
x = tf.Variable(0.0)
y = tf.abs(x)
grad = tf.gradients(y, [x])[0]
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(grad))
It prints 0.0.
A modified version, based on @standy's answer.
Here you can define the piecewise function yourself:
import tensorflow as tf
x = tf.Variable(0.0)
# Piecewise function: y = x + 2 for x > 0, y = 2 for x <= 0.
# tf.greater is a strict comparison, so x == 0 takes the constant branch.
y = tf.where(tf.greater(x, 0), x+2, 2)
grad = tf.gradients(y, [x])[0]
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(grad))
I would also recommend a post of mine that visualizes L1 & L2 regularization with ECharts:
https://simzhou.com/en/posts/2021/cross-entropy-loss-visualized/

SGD converges but batch learning does not, simple regression in tensorflow

I have run into an issue where batch learning in tensorflow fails to converge to the correct solution for a simple convex optimization problem, whereas SGD converges. A small example is found below, in the Julia and python programming languages, I have verified that the same exact behaviour results from using tensorflow from both Julia and python.
I'm trying to fit the linear model y = s*W + B with parameters W and B
The cost function is quadratic, so the problem is convex and should be easily solved using a small enough step size. If I feed all data at once, the end result is just a prediction of the mean of y. If, however, I feed one datapoint at the time (commented code in julia version), the optimization converges to the correct parameters very fast.
I have also verified that the gradients computed by tensorflow differs between the batch example and summing up the gradients for each datapoint individually.
Any ideas on where I have failed?
using TensorFlow
s = linspace(1,10,10)
s = [s reverse(s)]
y = s*[1,4] + 2
session = Session(Graph())
s_ = placeholder(Float32, shape=[-1,2])
y_ = placeholder(Float32, shape=[-1,1])
W = Variable(0.01randn(Float32, 2,1), name="weights1")
B = Variable(Float32(1), name="bias3")
q = s_*W + B
loss = reduce_mean((y_ - q).^2)
train_step = train.minimize(train.AdamOptimizer(0.01), loss)
# Run 1000 full-batch training steps on (s, targets), printing the loss after
# each one, then plot the model output against the correct values.
# NOTE(review): per the resolution given in this thread, `targets` must be an
# N×1 matrix; passing a plain vector makes `y_ - q` broadcast to N×N.
function train_critic(s,targets)
for i = 1:1000
# Alternative kept for reference: feed one data point at a time.
# for i = 1:length(y)
# run(session, train_step, Dict(s_ => s[i,:]', y_ => targets[i]))
# end
# Fetch the loss and run the train step together; [1] selects the loss.
ts = run(session, [loss,train_step], Dict(s_ => s, y_ => targets))[1]
println(ts)
end
# Evaluate the trained model on the inputs.
v = run(session, q, Dict(s_ => s, y_ => targets))
plot(s[:,1],v, lab="v (Predicted value)")
# Note: this plots the global `y`, not the `targets` argument.
plot!(s[:,1],y, lab="y (Correct value)")
gui();
end
run(session, initialize_all_variables())
train_critic(s,y)
Same code in python (I'm not a python user so this might be ugly)
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets
import tensorflow as tf
from tensorflow.python.framework.ops import reset_default_graph
s = np.linspace(1,10,50).reshape((50,1))
s = np.concatenate((s,s[::-1]),axis=1).astype('float32')
y = np.add(np.matmul(s,[1,4]), 2).astype('float32')
reset_default_graph()
rng = np.random
s_ = tf.placeholder(tf.float32, [None, 2])
y_ = tf.placeholder(tf.float32, [None])
weight_initializer = tf.truncated_normal_initializer(stddev=0.1)
# Create the model parameters inside the 'model' variable scope.
with tf.variable_scope('model'):
    # Weights: one column of two entries, drawn from weight_initializer.
    W = tf.get_variable('W', [2, 1],
                        initializer=weight_initializer)
    # Bias: a single entry initialised to zero.
    B = tf.get_variable('B', [1],
                        initializer=tf.constant_initializer(0.0))
q = tf.matmul(s_, W) + B
loss = tf.reduce_mean(tf.square(tf.sub(y_ , q)))
optimizer = tf.train.AdamOptimizer(learning_rate=0.1)
train_op = optimizer.minimize(loss)
num_epochs = 200
train_cost= []
with tf.Session() as sess:
    init = tf.initialize_all_variables()
    sess.run(init)
    for e in range(num_epochs):
        # Full-batch step: the whole data set is fed at once.
        feed_dict_train = {s_: s, y_: y}
        fetches_train = [train_op, loss]
        res = sess.run(fetches=fetches_train, feed_dict=feed_dict_train)
        # res[1] is the loss; train_cost is overwritten each epoch, so it
        # only ever holds the latest value.
        train_cost = [res[1]]
        # Function-call form works under both Python 2 and Python 3.
        print(train_cost)
The answer turned out to be that when I fed in the targets, I fed a vector and not an Nx1 matrix. The operation y_-q then turned into a broadcast operation and instead of returning the elementwise difference, it returned an NxN matrix with the desired difference along the diagonal. In Julia, I solved this by modifying the line
train_critic(s,y)
to
train_critic(s,reshape(y, length(y),1))
to ensure y being a matrix.
A subtle error that took me a very long time to find! Part of the confusion was that TensorFlow seems to treat vectors as row vectors and not as column vectors like Julia, hence the broadcast operation in y_-q

Loss not converging in Polynomial regression in Tensorflow

import numpy as np
import tensorflow as tf
#input data:
x_input=np.linspace(0,10,1000)
y_input=x_input+np.power(x_input,2)
#model parameters
W = tf.Variable(tf.random_normal([2,1]), name='weight')
#bias
b = tf.Variable(tf.random_normal([1]), name='bias')
#placeholders
#X=tf.placeholder(tf.float32,shape=(None,2))
X=tf.placeholder(tf.float32,shape=[None,2])
Y=tf.placeholder(tf.float32)
x_modified=np.zeros([1000,2])
x_modified[:,0]=x_input
x_modified[:,1]=np.power(x_input,2)
#model
#x_new=tf.constant([x_input,np.power(x_input,2)])
Y_pred=tf.add(tf.matmul(X,W),b)
#algortihm
loss = tf.reduce_mean(tf.square(Y_pred -Y ))
#training algorithm
optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
#initializing the variables
init = tf.initialize_all_variables()
#starting the session session
sess = tf.Session()
sess.run(init)
epoch=100
# range/print() are used so the snippet runs under both Python 2 and 3.
for step in range(epoch):
    # temp=x_input.reshape((1000,1))
    #y_input=temp
    # Run one optimizer step and fetch the current loss.
    _, c = sess.run([optimizer, loss], feed_dict={X: x_modified, Y: y_input})
    if step % 50 == 0:
        print(c)
# Report the fitted parameters after training.
print("Model paramters:")
print(sess.run(W))
print("bias:%f" % sess.run(b))
I'm trying to implement Polynomial regression(quadratic) in Tensorflow. The loss isn't converging. Could anyone please help me out with this. The similar logic is working for linear regression though!
First there is a problem in your shapes, for Y_pred and Y:
Y has unknown shape, and is fed with an array of shape (1000,)
Y_pred has shape (1000, 1)
Y - Y_pred will then have shape (1000, 1000)
This small code will prove my point:
# Subtracting a rank-1 tensor from a column tensor broadcasts to a full matrix.
a = tf.zeros([1000])  # shape (1000,)
b = tf.zeros([1000, 1])  # shape (1000, 1)
print((a - b).get_shape())  # prints (1000, 1000)
You should use consistent shapes:
y_input = y_input.reshape((1000, 1))
Y = tf.placeholder(tf.float32, shape=[None, 1])
Anyway, the loss is exploding because you have very high values (input between 0 and 100, you should normalize it) and thus very high loss (around 2000 at the beginning of training).
The gradient is very large, so the parameters explode and the loss goes to infinity.
The quickest fix is to lower your learning rate (1e-5 converges for me, albeit very slowly at the end). You can make it higher after the loss converges to around 1.