My TensorFlow Gradient Descent diverges - tensorflow

import tensorflow as tf
import pandas as pd
import numpy as np
def normalize(data):
return data - np.min(data) / np.max(data) - np.min(data)
df = pd.read_csv('sat.csv', skipinitialspace=True)
x_reading = df['reading_score']
x_math = df['math_score']
x_reading, x_math = np.array(x_reading[df.reading_score != 's']), np.array(x_math[df.math_score != 's'])
x_data = normalize(np.float32(np.array([x_reading, x_math])))
y_writing = df[['writing_score']]
y_data = normalize(np.float32(np.array(y_writing[df.writing_score != 's'])))
W = tf.Variable(tf.random_uniform([1, 2], -.5, .5)) #float32
b = tf.Variable(tf.ones([1]))
y = tf.matmul(W, x_data) + b
loss = tf.reduce_mean(tf.square(y - y_data.T))
optimizer = tf.train.GradientDescentOptimizer(0.005)
train = optimizer.minimize(loss)
init = tf.initialize_all_variables()
with tf.Session() as sess:
sess.run(init)
for step in range(1000):
sess.run(train)
print step, sess.run(W), sess.run(b), sess.run(loss)
Here's my code. My sat.csv contains a data of reading, writing and math scores at SAT. As you can guess, the difference between the features is not that big.
This is a part of sat.csv.
DBN,SCHOOL NAME,Num of Test Takers,reading_score,math_score,writing_score
01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363
01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366
01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370
01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359
01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384
01M515,LOWER EAST SIDE PREPARATORY HIGH SCHOOL,112,332,557,316
01M539,"NEW EXPLORATIONS INTO SCIENCE, TECHNOLOGY AND MATH HIGH SCHOOL",159,522,574,525
01M650,CASCADES HIGH SCHOOL,18,417,418,411
01M696,BARD HIGH SCHOOL EARLY COLLEGE,130,624,604,628
02M047,47 THE AMERICAN SIGN LANGUAGE AND ENGLISH SECONDARY SCHOOL,16,395,400,387
I've only used math, writing and reading scores. My goal for the code above is to predict the writing score if I give math and reading scores.
I've never seen Tensorflow's gradient descent model diverges with this such simple data. What'd be wrong?

Here are a few options you could try:
Normalise you input and output data
Set smaller initial values for your weights
Use a lower learning rate
Divide your loss by the amount of samples you have (not putting your data in a placeholder is already uncommon).
Let me know what (if any) of these options helped and good luck!

Related

Tensorflow Quantum: PQC not optimizing

I have followed the tutorial available at: https://www.tensorflow.org/quantum/tutorials/mnist. I have modified this tutorial to the simplest example I could think of: an input set in which x increases linearly from 0 to 1 and y = x < 0.3. I then use a PQC with a single Rx gate with a symbol, and a readout using a Z gate.
When retrieving the optimized symbol and adjusting it manually, I can easily find a value that provides 100% accuracy, but when I let the Adam optimizer run, it converges to either always predict 1 or always predict -1. Does anybody spot what I do wrong? (and I apologize for not being able to break down the code to a smaller example)
import tensorflow as tf
import tensorflow_quantum as tfq
import cirq
import sympy
import numpy as np
# used to embed classical data in quantum circuits
def convert_to_circuit_cont(image):
"""Encode truncated classical image into quantum datapoint."""
values = np.ndarray.flatten(image)
qubits = cirq.GridQubit.rect(4, 1)
circuit = cirq.Circuit()
for i, value in enumerate(values):
if value:
circuit.append(cirq.rx(value).on(qubits[i]))
return circuit
# define classical dataset
length = 1000
np.random.seed(42)
# create a linearly increasing set for x from 0 to 1 in 1/length steps
x_train_sorted = np.asarray([[x/length] for x in range(0,length)], dtype=np.float32)
# p is used to shuffle x and y similarly
p = np.random.permutation(len(x_train_sorted))
x_train = x_train_sorted[p]
# y = x < 0.3 in {-1, 1} for Hinge loss
y_train_sorted = np.asarray([1 if (x/length)<0.30 else -1 for x in range(0,length)])
y_train = y_train_sorted[p]
# test == train for this example
x_test = x_train_sorted[:]
y_test = y_train_sorted[:]
# convert classical data into quantum circuits
x_train_circ = [convert_to_circuit_cont(x) for x in x_train]
x_test_circ = [convert_to_circuit_cont(x) for x in x_test]
x_train_tfcirc = tfq.convert_to_tensor(x_train_circ)
x_test_tfcirc = tfq.convert_to_tensor(x_test_circ)
# define the PQC circuit, consisting out of 1 qubit with 1 gate (Rx) and 1 parameter
def create_quantum_model():
data_qubits = cirq.GridQubit.rect(1, 1)
circuit = cirq.Circuit()
a = sympy.Symbol("a")
circuit.append(cirq.rx(a).on(data_qubits[0])),
return circuit, cirq.Z(data_qubits[0])
model_circuit, model_readout = create_quantum_model()
# Build the Keras model.
model = tf.keras.Sequential([
# The input is the data-circuit, encoded as a tf.string
tf.keras.layers.Input(shape=(), dtype=tf.string),
# The PQC layer returns the expected value of the readout gate, range [-1,1].
tfq.layers.PQC(model_circuit, model_readout),
])
# used for logging progress during optimization
def hinge_accuracy(y_true, y_pred):
y_true = tf.squeeze(y_true) > 0.0
y_pred = tf.squeeze(y_pred) > 0.0
result = tf.cast(y_true == y_pred, tf.float32)
return tf.reduce_mean(result)
# compile the model with Hinge loss and Adam, as done in the example. Have tried with various learning_rates
model.compile(
loss = tf.keras.losses.Hinge(),
optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
metrics=[hinge_accuracy])
EPOCHS = 20
BATCH_SIZE = 32
NUM_EXAMPLES = 1000
# fit the model
qnn_history = model.fit(
x_train_tfcirc, y_train,
batch_size=32,
epochs=EPOCHS,
verbose=1,
validation_data=(x_test_tfcirc, y_test),
use_multiprocessing=False)
results = model.predict(x_test_tfcirc)
results_mapped = [-1 if x<=0 else 1 for x in results[:,0]]
print(np.sum(np.equal(results_mapped, y_test)))
After 20 epochs of optimization, I get the following:
1000/1000 [==============================] - 0s 410us/sample - loss: 0.5589 - hinge_accuracy: 0.6982 - val_loss: 0.5530 - val_hinge_accuracy: 0.7070
This results in 700 samples out of 1000 predicted correctly. When looking at the mapped results, this is because all results are predicted as -1. When looking at the raw results, they linearly increase from -0.5484014 to -0.99996257.
When retrieving the weight with w = model.layers[0].get_weights(), subtracting 0.8, and setting it again with model.layers[0].set_weights(w), I get 920/1000 correct. Fine-tuning this process allows me to achieve 1000/1000.
Update 1:
I have also printed the update of the weight over the various epochs:
4.916246, 4.242602, 3.3765688, 2.6855211, 2.3405066, 2.206207, 2.1734586, 2.1656137, 2.1510274, 2.1634471, 2.1683235, 2.188944, 2.1510284, 2.1591303, 2.1632445, 2.1542525, 2.1677444, 2.1702878, 2.163104, 2.1635907
I set the weight to 1.36, a value which gives 908/1000 (as opposed to 700/100). The optimizer moves away from it:
1.7992111, 2.0727847, 2.1370323, 2.15711, 2.1686404, 2.1603785, 2.183334, 2.1563332, 2.156857, 2.169908, 2.1658351, 2.170673, 2.1575692, 2.1505954, 2.1561477, 2.1754034, 2.1545155, 2.1635509, 2.1464484, 2.1707492
One thing that I noticed is that the value for the hinge accuracy was 0.75 with the weight 1.36, which is higher than the 0.7 for 2.17. If this is the case, I am either in an unlucky part of the optimization landscape where the global minimum does not correspond to the minimum of the loss landscape, or the loss value is determined incorrectly. This is what I will be investigating next.
The minima of the Hinge loss function for this examples does not correspond with the maxima of number of correctly classified examples. Please see plot of these w.r.t. the value of the parameter. Given that the optimizer works towards the minima of the loss, not the maxima of the number of classified examples, the code (and framework/optimizer) do what they are supposed to do. Alternatively, one could use a different loss function to try to find a better fit. For example binarized l1 loss. This function would have the same global optimum, but would likely have a very flat landscape.

word2vec implementation in tensorflow 2.0

I want to implement word2vec using tensorflow 2.0
I have prepared dataset according to the skip-gramm model and I have got approx. 18 million observations(target and context words).
I have used the followng dataset for my goal:
https://www.kaggle.com/c/quora-question-pairs/notebooks
I have created a new dataset for n-gramm model. I have used windows_size 2 and number of skips equal to 2 as well in order to create for each target word(as our input) create context word(that is what I have to predict). It looks like this:
target context
1 3
1 1
2 1
2 1222
Here is my code:
dataset_train = tf.data.Dataset.from_tensor_slices((target, context))
dataset_train = dataset_train.shuffle(buffer_size=1024).batch(64)
#Parameters:
num_words = len(word_index)#approximately 100000
embed_size = 300
num_sampled = 64
initializer_softmax = tf.keras.initializers.GlorotUniform()
#Variables:
embeddings_weight = tf.Variable(tf.random.uniform([num_words,embed_size],-1.0,1.0))
softmax_weight = tf.Variable(initializer_softmax([num_words,embed_size]))
softmax_bias = tf.Variable(initializer_softmax([num_words]))
optimizer = tf.keras.optimizers.Adam()
#As before, we are supplying a list of integers (that correspond to our validation vocabulary words) to the embedding_lookup() function, which looks up these rows in the normalized_embeddings tensor, and returns the subset of validation normalized embeddings.
#Now that we have the normalized validation tensor, valid_embeddings, we can multiply this by the full normalized vocabulary (normalized_embedding) to finalize our similarity calculation:
#tf.function
def training(X,y):
with tf.GradientTape() as tape:
embed = tf.nn.embedding_lookup(embeddings_weight,X)
loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights = softmax_weight, biases = softmax_bias, inputs = embed,
labels = y, num_sampled = num_sampled, num_classes = num_words))
variables = [embeddings_weight,softmax_weight,softmax_bias]
gradients = tape.gradient(loss,variables)
optimizer.apply_gradients(zip(gradients,variables))
EPOCHS = 30
for epoch in range(EPOCHS):
print('Epoch:',epoch)
for X,y in dataset_train:
training(X,y)
#compute similarity of words:
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings_weight), 1, keepdims=True))
norm_embed = embeddings_weight/ norm
temp_emb = tf.nn.embedding_lookup(norm_embed,X)
similarity = tf.matmul(temp_emb,tf.transpose(norm_embed))
But the computation of even 1 epoch lasts too long. Is it possible somehow to improve the perfomance of my code?(I am using google colab for the code execution)
EDIT: this is a shape of my train dataset
dataset_train
<BatchDataset shapes: ((None,), (None, 1)), types: (tf.int64, tf.int64)>
I was following the instructions from this guide: https://adventuresinmachinelearning.com/word2vec-tutorial-tensorflow/
This is because softmax function is computationally quite expensive while dealing with possibilities of millions of points in Word2Vec algorithm as explained here. A faster training would be possible with negative sampling.

Accessing elements of a placeholder in tensorflow [duplicate]

This question already has answers here:
Weighted cost function in tensorflow
(2 answers)
Closed 4 years ago.
I have a neural network with MSE loss function being implemented something like this:
# input x_ph is of size Nx1 and output should also be of size Nx1
def train_neural_network_batch(x_ph, predict=False):
prediction = neural_network_model(x_ph)
# MSE loss function
cost = tf.reduce_mean(tf.square(prediction - y_ph))
optimizer = tf.train.AdamOptimizer(learn_rate).minimize(cost)
# mini-batch optimization here
I'm fairly new to neural networks and Python, but I understand that each iteration, a sample of training points will be fed into the neural network and the loss function evaluated at the points in this sample. However, I would like to be able to modify the loss function so that it weights certain data more heavily. Some pseudocode of what I mean
# manually compute the MSE of the data without the first sampled element
cost = 0.0
for ii in range(1,len(y_ph)):
cost += tf.square(prediction[ii] - y_ph[ii])
cost = cost/(len(y_ph)-1.0)
# weight the first sampled data point more heavily according to some parameter W
cost += W*(prediction[0] - y_ph[0])
I might have more points I wish to weight differently as well, but for now, I'm just wondering how I can implement something like this in tensorflow. I know len(y_ph) is invalid as y_ph is just a placeholder, and I can't just do something like y_ph[i] or prediction[i].
You can do this in multiple ways:
1) If some of your data instances weighting are simply 2 times or 3 times more than normal instance, you may just copy those instance multiple times in your data set. Thus they would occupy more weight in loss, hence satisfy your intention. This is the simplest way.
2) If your weighting is more complex, say a float weighting. You can define a placeholder for weighting, multiply it to loss, and use feed_dict to feed the weighting in session together with x batch and y batch. Just make sure instance_weight is the same size with batch_size
E.g.
import tensorflow as tf
import numpy as np
with tf.variable_scope("test", reuse=tf.AUTO_REUSE):
x = tf.placeholder(tf.float32, [None,1])
y = tf.placeholder(tf.float32, [None,1])
instance_weight = tf.placeholder(tf.float32, [None,1])
w1 = tf.get_variable("w1", shape=[1, 1])
prediction = tf.matmul(x, w1)
cost = tf.square(prediction - y)
loss = tf.reduce_mean(instance_weight * cost)
opt = tf.train.AdamOptimizer(0.5).minimize(loss)
with tf.Session() as sess:
x1 = [[1.],[2.],[3.]]
y1 = [[2.],[4.],[3.]]
instance_weight1 = [[10.0], [10.0], [0.1]]
sess.run(tf.global_variables_initializer())
print (x1)
print (y1)
print (instance_weight1)
for i in range(1000):
_, loss1, prediction1 = sess.run([opt, loss, prediction], feed_dict={instance_weight : instance_weight1, x : x1, y : y1 })
if (i % 100) == 0:
print(loss1)
print(prediction1)
NOTE instance_weight1, you may change instance_weight1 to see the difference (here batch_size is set to 3)
Where x1,y1 and x2,y2 follow the rule y=2*x
Whereas x3,y3 follow the rule y=x
But with different weight as [10,10,0.1], the prediction1 coverage to y1,y2 rule and almost ignored y3, the output are as:
[[1.9823183]
[3.9646366]
[5.9469547]]
PS: in tensorflow graph, it's highly recommended not to use for loops, but use matrix operator instead to parallel the calculation.

Linear Regression overfitting

I'm pursuing course 2 on this coursera course on linear regression (https://www.coursera.org/specializations/machine-learning)
I've solved the training using graphlab but wanted to try out sklearn for the experience and learning. I'm using sklearn and pandas for this.
The model overfits on the data. How can I fix this? This is the code.
These are the coefficients i'm getting.
[ -3.33628603e-13 1.00000000e+00]
poly1_data = polynomial_dataframe(sales["sqft_living"], 1)
poly1_data["price"] = sales["price"]
model1 = LinearRegression()
model1.fit(poly1_data, sales["price"])
print(model1.coef_)
plt.plot(poly1_data['power_1'], poly1_data['price'], '.',poly1_data['power_1'], model1.predict(poly1_data),'-')
plt.show()
The plotted line is like this. As you see it connects every data point.
and this is the plot of the input data
I wouldn't even call this overfit. I'd say you aren't doing what you think you should be doing. In particular, you forgot to add a column of 1's to your design matrix, X. For example:
# generate some univariate data
x = np.arange(100)
y = 2*x + x*np.random.normal(0,1,100)
df = pd.DataFrame([x,y]).T
df.columns = ['x','y']
You're doing the following:
model1 = LinearRegression()
X = df["x"].values.reshape(1,-1)[0] # reshaping data
y = df["y"].values.reshape(1,-1)[0]
model1.fit(X,y)
Which leads to:
plt.plot(df['x'].values, df['y'].values,'.')
plt.plot(X[0], model1.predict(X)[0],'-')
plt.show()
Instead, you want to add a column of 1's to your design matrix (X):
X = np.column_stack([np.ones(len(df['x'])),df["x"].values.reshape(1,-1)[0]])
y = df["y"].values.reshape(1,-1)
model1.fit(X,y)
And (after some reshaping) you get:
plt.plot(df['x'].values, df['y'].values,'.')
plt.plot(df['x'].values, model1.predict(X),'-')
plt.show()

Calculating euclidian distance in batches with TensorFlow

import tensorflow as tf
import numpy as np
dim = 1000
x1 = tf.placeholder('float32', shape=(None, dim))
x2 = tf.placeholder('float32', shape=(None, dim))
l2diff = tf.sqrt( tf.reduce_sum(tf.square(tf.sub(x1, x2)),reduction_indices=1))
vector1 = np.random.rand(1,1000)
all_vectors = np.random.rand(500,1000)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
distances = sess.run(l2diff, feed_dict = {x1: vector1, x2: all_vectors})
The above code works well. But iterating for each vector takes too much time.
So is there any way to calculate the same with multiple vectors at a time. Like lets say vector1 = np.random.rand(10,1000)
I am preferring this than sklearn's euclidian distance because I want to calculate similarity for 100k vectors and want to run it on GPU.
And also don't want to replicate the all_vectors because all_vactors already fills 70% my machine's RAM.
Is there any way to calculate distances by passing batch of vectors?