Comparison of GradientDescent algorithm in tensorflow with the implementation of Michael Nielsen - tensorflow

First I will give an overview of my problem. I have two setups:
1) A net which is based on tensorflow
2) A net which is based on code from Michael Nielsen's Book http://neuralnetworksanddeeplearning.com/index.html
Both nets are completely equal. They both have
3 hidden layers with 30 neurons each
2 inputs neurons, one output neuron
All activations are sigmoid
Stochastic Gradient descent algorithm as learning algorithm with eta=3.0
quadratic cost function : cost_function = tf.scalar_mul(1.0/(N_training_set*2.0),tf.reduce_sum(tf.squared_difference(y,y_)))
batch_size of 10
weight initialization: The weights which connect the lth and l+1th layer are initialized with sigma=1/sqrt(N_l), where N_l is the number of neurons in the lth layer.
My problem is that the tensorflow results are very bad (a factor of 10 worse than the results I obtain with the Nielsen code).
So before I post my complete code: does anybody know of a bug in the tensorflow stochastic gradient descent algorithm? (Or does anybody have a reference for how the learning rate of stochastic gradient descent is defined in tensorflow? I cannot find anything about it in the API documentation.)
Here is my code for the tensorflow net:
regression.py
import readData
import matplotlib.pyplot as plt
import numpy as np
from random import randint
import random
from root_numpy import fill_hist
from ROOT import TCanvas, TH2F, TText, TF1 ,TH1D
import ROOT
import tensorflow as tf
import math
# ------------------------------------------------------------------ #
# Read in data
# ------------------------------------------------------------------ #
# The two flags are handed to readData.read_data_set. According to the
# original note they select whether an invertible function (so far just a
# normalisation) is applied to the outputs / inputs before training.
function_outputs = True
function_inputs = True

_root_path = "./TH2D_A00_TB10.root"
_hist_name = "LHCChi2_CMSSM_nObs1061_A00_TB10"

full_set = readData.read_data_set(
    _root_path, _hist_name, "full_set", function_inputs, function_outputs)
N_full_set = full_set.get_N()
N_validation_set = 10000
N_training_set = N_full_set - N_validation_set

# Shuffle all sample indices once, then cut the shuffled list into a training
# part and a validation part so that the two sets are disjoint.
full = list(range(N_full_set))
random.shuffle(full)
training_subset = full[:N_training_set]
validation_subset = full[N_training_set:N_training_set + N_validation_set]

training_set = readData.read_data_set(
    _root_path, _hist_name, "training_set",
    function_inputs, function_outputs,
    full_set=full_set, subset=training_subset)
validation_set = readData.read_data_set(
    _root_path, _hist_name, "validation_set",
    function_inputs, function_outputs,
    full_set=full_set, subset=validation_subset)

# Overview of the full, training and validation sets. The modified
# (normalised) members are reached through get_x_mod() / get_y_mod(); those
# are the values the net is trained on.
for _title, _data in (("full_data_set:", full_set),
                      ("training_data_set:", training_set),
                      ("evaluation_data_set:", validation_set)):
    print(_title)
    print("x (inputs)")
    print(_data.get_x())
    print("y (outputs)")
    print(_data.get_y())
    print("x_mod")
    print(_data.get_x_mod())
    print("y_mod")
    print(_data.get_y_mod())
    print("------------------")
# ------------------------------------------------------------------ #
# Setting up the network
# ------------------------------------------------------------------ #
N_epochs = 20
learning_rate = 3.0
batch_size = 10
N1 = 2  # equals the number of inputs
N2 = 30
N3 = 30
N4 = 30
N5 = 1
N_in = N1
N_out = N5

# Everything is computed for all elements of one batch at once.
#
# Example: N_in=2, N_out=3, mini_batch_size=5, linear activation. The output
# matrix has 5 rows, one per element of the mini batch, and each row has
# 3 columns, one per output neuron.
#
#   W2
#   [[-0.31917086 -0.03908769  0.5792625 ]
#    [ 1.34563279  0.03904691  0.39674851]]
#   b2
#   [ 0.40960133 -0.5495823  -0.97048181]
#   x_in
#   [[  23.2       12.2    ]
#    [   0.         1.1    ]
#    [   2.3        3.3    ]
#    [  23.22222   24.44444]
#    [ 333.       444.     ]]
#   y = x_in*W2 + b2
#   [[   9.42155647  -0.98004436   17.30874062]
#    [   1.88979745  -0.50663072   -0.53405845]
#    [   4.1160965   -0.51062918    1.67109203]
#    [  25.8909874   -0.50280523   22.17957497]
#    [ 491.5866394    3.77104688  368.08026123]]
#
# This shows that b2 is added to every row of the matrix x_in*W2.
# W2 is the transpose of the matrix defined in the book.


def _sigmoid_layer(inputs, n_in, n_out):
    """Create one fully connected sigmoid layer.

    Weights are drawn with standard deviation 1/sqrt(n_in), i.e. scaled by
    the number of neurons in the preceding layer; biases are drawn from a
    unit normal. Returns ``(weights, biases, activations)``.
    """
    weights = tf.Variable(
        tf.random_normal([n_in, n_out], mean=0.0,
                         stddev=1.0 / math.sqrt(n_in * 1.0)))
    biases = tf.Variable(tf.random_normal([n_out]))
    activations = tf.sigmoid(tf.matmul(inputs, weights) + biases)
    return weights, biases, activations


# No fixed batch dimension: the same graph is fed with differently sized
# batches (mini batches for training, whole sets for evaluation).
x = tf.placeholder(tf.float32, [None, N1])
W2, b2, a2 = _sigmoid_layer(x, N1, N2)  # x plays the role of a1
W3, b3, a3 = _sigmoid_layer(a2, N2, N3)
W4, b4, a4 = _sigmoid_layer(a3, N3, N4)
W5, b5, y = _sigmoid_layer(a4, N4, N5)
y_ = tf.placeholder(tf.float32, [None, N_out])  # desired outputs
# ------------------------------------------------------------------ #
# Initialising and training
# ------------------------------------------------------------------ #
# Quadratic cost averaged over the examples that are actually fed in.
#
# The previous version divided by N_training_set although a training step only
# feeds batch_size examples. The gradient of every SGD step was therefore
# batch_size / N_training_set times the mean gradient of the mini batch, i.e.
# the effective learning rate was smaller than `learning_rate` by that factor.
# Averaging over the fed examples gives the usual mini-batch update
# (learning_rate / batch_size) * sum of per-example gradients. When the whole
# training set is fed for monitoring, the value equals the old definition.
cost_function = 0.5 * tf.reduce_mean(
    tf.reduce_sum(tf.squared_difference(y, y_), 1))
error_to_desired_output = y - y_
abs_error_to_desired_output = tf.abs(y - y_)
sum_abs_error_to_desired_output = tf.reduce_sum(tf.abs(y - y_))
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_function)
init = tf.initialize_all_variables()

# Launch the graph.
sess = tf.Session()
sess.run(init)

# Number of complete mini batches per epoch (floor division, made explicit so
# the result is an integer on Python 2 and Python 3 alike).
N_training_batch = training_set.get_N() // batch_size

# Per-epoch records, all evaluated on the modified (normalised) data.
out_mod_validation = [0] * N_epochs         # net output for the validation inputs
error_mod_validation_data = [0] * N_epochs  # summed absolute validation error
diff_mod_validation = [0] * N_epochs        # validation error vector y - y_
cost_training_data = [0] * N_epochs         # cost on the whole training set

for i in range(N_epochs):
    for j in range(N_training_batch):
        # next_batch always returns the modified x's and y's.
        batch_xs, batch_ys, epochs_completed = training_set.next_batch(batch_size)
        sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

    # The feeds are rebuilt every epoch because next_batch rebinds the
    # training arrays whenever it reshuffles.
    cost_training_data[i] = sess.run(
        cost_function,
        feed_dict={x: training_set.get_x_mod(), y_: training_set.get_y_mod()})

    # One forward pass yields all three validation quantities.
    (out_mod_validation[i],
     diff_mod_validation[i],
     error_mod_validation_data[i]) = sess.run(
        [y, error_to_desired_output, sum_abs_error_to_desired_output],
        feed_dict={x: validation_set.get_x_mod(), y_: validation_set.get_y_mod()})
    print("epochs completed: " + str(i))
# Repeat the evaluation for the unmodified / un-normalised outputs.
out_validation = [0] * N_epochs           # de-normalised net output per epoch
error_validation_data = [0.0] * N_epochs  # summed absolute error per epoch
diff_validation = [0.0] * N_epochs        # error vector per epoch

# Undo the output transformation epoch by epoch.
for epoch in range(N_epochs):
    net_output = out_mod_validation[epoch]
    # Work on a separate array so the recorded net output is not aliased.
    restored = np.ndarray(shape=(validation_set.get_N(), 1))
    restored[:len(net_output)] = net_output
    # The second argument is modified in place.
    readData.apply_inverse_function_to_outputs(
        net_output, restored, full_set.get_y_max())
    out_validation[epoch] = restored
    diff_validation[epoch] = np.subtract(restored, validation_set.get_y())
    error_validation_data[epoch] = np.sum(
        np.absolute(np.subtract(restored, validation_set.get_y())))

# Show for 10 examples how well the output matches the desired output.
for sample in range(10):
    print("desired output")
    print(validation_set.get_y()[sample][0])
    print("actual output after last training epoch")
    print(out_validation[-1][sample][0])
    print("-------")
print("total error on validation_data set after last training")
print(error_validation_data[-1])
# ------------------------------------------------------------------ #
# Plotting
# ------------------------------------------------------------------ #
# Epoch numbers 1..N_epochs for the horizontal axis. Built without a list
# comprehension over `x`: on Python 2 the comprehension variable leaks into
# the enclosing scope and would overwrite the input placeholder `x`.
x_range = list(range(1, N_epochs + 1))

plt.figure(1)
plt.title("Costfunction of (modified) Training-data")
plt.xlabel("epochs")
plt.ylabel("cost function")
plt.plot(x_range, cost_training_data)
plt.savefig("cost_on_training_data.png")

plt.figure(2)
plt.title("f data")
plt.xlabel("epochs")
plt.ylabel("total error on validation data")
plt.plot(x_range, error_validation_data)
plt.savefig("error_on_val_data.png")

# Histogram of the per-sample validation errors after the last epoch;
# fill_hist receives them as a flat array.
error_on_validation_data_after_training = diff_validation[-1].reshape(
    (1, validation_set.get_N()))
hist = TH1D('hist', "Errors on val data after last training epoch", 200, -10000, 10000)
fill_hist(hist, error_on_validation_data_after_training[0])
canvas = TCanvas()
hist.GetXaxis().SetTitle("desired Chi^2- outputted Chi^2")
hist.Draw()
canvas.SaveAs('error_on_val_data_hist.png')
readData.py
import numpy as np
import root_numpy
from ROOT import TFile, TH2D, TCanvas
import itertools
def apply_function_to_inputs(x, x_mod, x_max):
    """Normalise the inputs column by column.

    Every entry ``x[i][j]`` is divided by ``x_max[j]`` and stored in
    ``x_mod[i][j]``. ``x_mod`` is modified in place (the caller's object is
    changed); nothing is returned.
    """
    for row_idx, row in enumerate(x):
        target_row = x_mod[row_idx]
        for col_idx in range(len(row)):
            target_row[col_idx] = row[col_idx] / x_max[col_idx]
def apply_inverse_function_to_inputs(x, x_mod, x_max):
    """Undo the input normalisation.

    Every entry ``x[i][j]`` is multiplied by ``x_max[j]`` and stored in
    ``x_mod[i][j]``. ``x_mod`` is modified in place; nothing is returned.
    """
    for row_idx, row in enumerate(x):
        for col_idx, _ in enumerate(row):
            x_mod[row_idx][col_idx] = x[row_idx][col_idx] * x_max[col_idx]
def apply_function_to_outputs(y, y_mod, y_max):
    """Normalise the outputs column by column.

    Every entry ``y[i][j]`` is divided by ``y_max[j]`` and stored in
    ``y_mod[i][j]``. ``y_mod`` is modified in place; nothing is returned.
    """
    row_idx = 0
    while row_idx < len(y):
        n_columns = len(y[row_idx])
        for col_idx in range(n_columns):
            y_mod[row_idx][col_idx] = y[row_idx][col_idx] / y_max[col_idx]
        row_idx += 1
def apply_inverse_function_to_outputs(y, y_mod, y_max):
    """Undo the output normalisation.

    Every entry ``y[i][j]`` is multiplied by ``y_max[j]`` and stored in
    ``y_mod[i][j]``. ``y_mod`` is modified in place; nothing is returned.
    """
    for row_idx in range(len(y)):
        source_row = y[row_idx]
        target_row = y_mod[row_idx]
        for col_idx in range(len(source_row)):
            target_row[col_idx] = source_row[col_idx] * y_max[col_idx]
class Dataset(object):
    """(x, y) samples taken from a ROOT 2D histogram, plus normalised copies.

    ``kind_of_set`` selects how an instance is filled:

    * ``"full_set"``: every regular bin of the histogram ``hist_name`` in the
      ROOT file ``path`` becomes one sample. ``x`` holds the two bin centres,
      ``y`` the bin content. The per-column maxima are computed here and used
      to normalise ``x`` and ``y`` into ``x_mod`` and ``y_mod``.
    * ``"training_set"``, ``"validation_set"``, ``"test_set"``: the rows listed
      in ``subset`` are copied from ``full_set``; the maxima of ``full_set``
      are reused.

    Example layout::

        self._x   np.ndarray(shape=(N_points, 2))
        [[   10.    95.]
         [   10.   100.]
         [   10.   105.]
         ...,
         [ 2490.  1185.]
         [ 2490.  1190.]
         [ 2490.  1195.]]
        self._y   np.ndarray(shape=(N_points, 1))
        [[  0.00000000e+00]
         [  0.00000000e+00]
         [  0.00000000e+00]
         ...,
         [  6.34848448e-06]
         [  6.34845946e-06]
         [  6.34848448e-06]]
    """

    def __init__(self, path, hist_name, kind_of_set, function_inputs, function_outputs, full_set, subset):
        """Fill the data set.

        Args:
            path: ROOT file to read (only used for ``"full_set"``).
            hist_name: name of the 2D histogram inside the file.
            kind_of_set: ``"full_set"``, ``"training_set"``,
                ``"validation_set"`` or ``"test_set"``.
            function_inputs, function_outputs: accepted for interface
                compatibility; this class does not read them and always
                applies the normalisation.
            full_set: the ``"full_set"`` instance to copy from (subsets only).
            subset: row indices into ``full_set`` (subsets only).

        Raises:
            ValueError: if ``kind_of_set`` is not one of the known kinds.
        """
        self._kind_of_set = kind_of_set
        if kind_of_set == "full_set":
            self._load_full_set(path, hist_name)
        elif kind_of_set in ("training_set", "validation_set", "test_set"):
            self._copy_subset(full_set, subset)
        else:
            # Previously this fell through and failed later with an
            # AttributeError on the missing _x.
            raise ValueError("unknown kind_of_set: %r" % (kind_of_set,))

        # -1 marks "unknown" when the set has no entries.
        self._N_input = len(self._x[0]) if len(self._x) != 0 else -1
        self._N_output = len(self._y[0]) if len(self._y) != 0 else -1
        # Position inside the current epoch: after two served mini batches
        # this equals 2 * batch_size.
        self._index_in_epoch = 0
        self._epochs_completed = 0

    def _load_full_set(self, path, hist_name):
        """Read every regular histogram bin into x / y and normalise them."""
        rfile = TFile(path)
        try:
            histogram = rfile.Get(hist_name)
            x_axis = histogram.GetXaxis()
            y_axis = histogram.GetYaxis()
            n_x_bins = x_axis.GetNbins()
            n_y_bins = y_axis.GetNbins()
            n_points = n_x_bins * n_y_bins

            self._N = n_points
            self._y = np.ndarray(shape=(n_points, 1))
            self._x = np.ndarray(shape=(n_points, 2))
            self._y_mod = np.ndarray(shape=(n_points, 1))  # function applied to outputs
            self._x_mod = np.ndarray(shape=(n_points, 2))  # function applied to inputs
            # The maxima start at 0.0, so columns without positive values keep 0.0.
            self._y_max = np.zeros(1)
            self._x_max = np.zeros(2)

            i = 0
            # ROOT numbers regular bins 1..Nbins; 0 is the underflow bin and
            # Nbins+1 the overflow bin. Looping over 0..Nbins-1 read the
            # underflow bins and skipped the last row and column.
            for x_bin in range(1, n_x_bins + 1):
                x_center = x_axis.GetBinCenter(x_bin)
                for y_bin in range(1, n_y_bins + 1):
                    self._x[i][0] = x_center
                    self._x[i][1] = y_axis.GetBinCenter(y_bin)
                    self._y[i][0] = histogram.GetBinContent(x_bin, y_bin)
                    # Maxima are only computed for the full set.
                    for j in range(len(self._x[i])):
                        if self._x[i][j] > self._x_max[j]:
                            self._x_max[j] = self._x[i][j]
                    for j in range(len(self._y[i])):
                        if self._y[i][j] > self._y_max[j]:
                            self._y_max[j] = self._y[i][j]
                    i = i + 1
        finally:
            rfile.Close()

        # Apply the (normalising) function to inputs and outputs.
        apply_function_to_inputs(self._x, self._x_mod, self._x_max)
        apply_function_to_outputs(self._y, self._y_mod, self._y_max)

    def _copy_subset(self, full_set, subset):
        """Copy the rows listed in ``subset`` out of ``full_set``."""
        rows = np.asarray(subset, dtype=int)
        self._N = len(rows)  # number of elements of the data set
        # Indexing with an integer array copies the rows, so later shuffling
        # of this set does not touch full_set.
        self._y = full_set.get_y()[rows]
        self._x = full_set.get_x()[rows]
        self._y_mod = full_set.get_y_mod()[rows]
        self._x_mod = full_set.get_x_mod()[rows]
        self._y_max = full_set.get_y_max()
        self._x_max = full_set.get_x_max()

    def get_N_input_nodes(self):
        """Return the number of input columns, or -1 for an empty set."""
        return self._N_input

    def get_N_output_nodes(self):
        """Return the number of output columns, or -1 for an empty set."""
        return self._N_output

    def get_N(self):
        """Return the number of samples."""
        return self._N

    def get_x(self):
        """Return the unmodified inputs."""
        return self._x

    def get_y(self):
        """Return the unmodified outputs."""
        return self._y

    def get_x_max(self):
        """Return the per-column input maxima used for normalisation."""
        return self._x_max

    def get_y_max(self):
        """Return the per-column output maxima used for normalisation."""
        return self._y_max

    def get_x_mod(self):
        """Return the modified (normalised) inputs."""
        return self._x_mod

    def get_y_mod(self):
        """Return the modified (normalised) outputs."""
        return self._y_mod

    def next_batch(self, batch_size, fake_x=False):
        """Return the next mini batch of modified inputs and outputs.

        Samples are served in their current order. Once fewer than
        ``batch_size`` samples are left, the epoch counter is increased, all
        four arrays are shuffled with one common permutation and the batch is
        taken from the start of the new order.

        Args:
            batch_size: number of samples to return; must not exceed the size
                of the set.
            fake_x: unused.

        Returns:
            ``(x_mod_batch, y_mod_batch, epochs_completed)``.

        Raises:
            AssertionError: if ``batch_size`` is larger than the set.
        """
        assert batch_size <= self._N  # a batch can never exceed the whole set
        start = self._index_in_epoch
        self._index_in_epoch += batch_size
        # Strictly greater: with ">=" the last batch of an epoch was thrown
        # away whenever the set size was a multiple of batch_size.
        if self._index_in_epoch > self._N:
            # Finished epoch
            self._epochs_completed += 1
            # Only x_mod and y_mod are served, but the raw arrays are shuffled
            # with the same permutation so the rows stay aligned.
            perm = np.arange(self._N)
            np.random.shuffle(perm)
            self._x = self._x[perm]
            self._y = self._y[perm]
            self._x_mod = self._x_mod[perm]
            self._y_mod = self._y_mod[perm]
            # Start next epoch
            start = 0
            self._index_in_epoch = batch_size
        end = self._index_in_epoch
        return self._x_mod[start:end], self._y_mod[start:end], self._epochs_completed
def read_data_set(path, hist_name, kind_of_set, function_inputs, function_outputs, full_set=None, subset=None):
    """Build and return a ``Dataset`` from the given arguments.

    All arguments are forwarded unchanged; ``full_set`` and ``subset``
    default to ``None``.
    """
    dataset = Dataset(
        path, hist_name, kind_of_set,
        function_inputs, function_outputs,
        full_set, subset,
    )
    return dataset
I have uploaded the corresponding data input file to
https://github.com/kanban1992/GradientDescent_Comparison

Related

Convert Tensorflow 1.x code with custom loss into 2.x

Suppose I have the following code written in Tensorflow 1.x where I define custom loss function. I wish to remove .compat.v1., Session, placeholder etc. and convert it into Tensorflow 2.x.
How to do so?
import DGM
import tensorflow as tf
import numpy as np
import scipy.stats as spstats
import matplotlib.pyplot as plt
from tqdm.notebook import trange
# Option parameters
# NOTE(review): despite the heading, these are only used as coefficients of
# the PDE residual in loss() (phi, n) and as the upper time bound in
# sampler() (T) - confirm the intended naming.
# phi enters the residual as phi**2.
phi = 10
# n is the exponent applied to (relu(u) + 1e-10) in the residual.
n = 0.01
# T is the upper end of the sampled time interval.
T = 4
# Solution parameters (domain on which to solve PDE)
# The bounds are nudged by 1e-10 away from exactly 0.
t_low = 0.0 - 1e-10
x_low = 0.0 + 1e-10
x_high = 1.0
# neural network parameters
# Both are passed to DGM.DGMNet.
num_layers = 3
nodes_per_layer = 50
# Training parameters
sampling_stages = 2500 # number of times to resample new time-space domain points
steps_per_sample = 20 # number of SGD steps to take before re-sampling
# Sampling parameters
# Number of points drawn per region on every call to sampler().
nsim_interior = 100
nsim_boundary_1 = 50
nsim_boundary_2 = 50
nsim_initial = 50
x_multiplier = 1.1 # multiplier for oversampling i.e. draw x from [x_low, x_high * x_multiplier]
def sampler(nsim_interior, nsim_boundary_1, nsim_boundary_2, nsim_initial):
    """Draw a fresh set of time-space training points.

    Points are drawn uniformly in the interior of the domain, at x = 1 and
    x = 0 for random times, and at t = 0 for random positions. Every array
    has shape ``(count, 1)``.

    Args:
        nsim_interior: number of points in the interior.
        nsim_boundary_1: number of points on the boundary x = 1.
        nsim_boundary_2: number of points on the boundary x = 0.
        nsim_initial: number of points at the initial time t = 0.

    Returns:
        ``(t_interior, x_interior, t_boundary_1, x_boundary_1,
        t_boundary_2, x_boundary_2, t_initial, x_initial)``.
    """
    def _uniform_column(low, high, count):
        # One column of `count` uniform draws.
        return np.random.uniform(low=low, high=high, size=[count, 1])

    # Space is oversampled beyond x_high by the factor x_multiplier.
    x_upper = x_high * x_multiplier

    # The random draws are made in a fixed order: interior t, interior x,
    # boundary-1 t, boundary-2 t, initial x.
    # Sampler #1: domain interior
    interior_t = _uniform_column(t_low, T, nsim_interior)
    interior_x = _uniform_column(x_low, x_upper, nsim_interior)

    # Sampler #2: spatial boundary 1 (x fixed to 1)
    boundary_1_t = _uniform_column(t_low, T, nsim_boundary_1)
    boundary_1_x = np.ones((nsim_boundary_1, 1))

    # Sampler #3: spatial boundary 2 (x fixed to 0)
    boundary_2_t = _uniform_column(t_low, T, nsim_boundary_2)
    boundary_2_x = np.zeros((nsim_boundary_2, 1))

    # Sampler #4: initial condition (t fixed to 0)
    initial_t = np.zeros((nsim_initial, 1))
    initial_x = _uniform_column(x_low, x_upper, nsim_initial)

    return (
        interior_t, interior_x,
        boundary_1_t, boundary_1_x,
        boundary_2_t, boundary_2_x,
        initial_t, initial_x,
    )
def loss(
    model,
    t_interior, x_interior,
    t_boundary_1, x_boundary_1,
    t_boundary_2, x_boundary_2,
    t_initial, x_initial
):
    """Compute the two loss terms used for training.

    Derivatives are taken with ``tf.gradients``, so the inputs must be
    symbolic graph tensors (e.g. placeholders).

    Args:
        model: DGM model object, called as ``model(t, x)``.
        t_interior, x_interior: sampled time / space points in the interior.
        t_boundary_1, x_boundary_1: sampled points for the condition u = 1.
        t_boundary_2, x_boundary_2: sampled points for the condition u_x = 0.
        t_initial, x_initial: sampled points for the initial condition u = 1.

    Returns:
        ``(L1, L2)``: the mean squared PDE residual, and the sum of the mean
        squared errors of the two boundary conditions and the initial
        condition.
    """
    # Loss term #1: PDE
    # Function value and derivatives at the sampled interior points.
    u = model(t_interior, x_interior)
    u_t = tf.gradients(ys=u, xs=t_interior)[0]
    u_x = tf.gradients(ys=u, xs=x_interior)[0]
    u_xx = tf.gradients(ys=u_x, xs=x_interior)[0]
    # relu plus a small offset keeps the base of the power positive.
    diff_u = u_t - u_xx + phi**2 * (tf.nn.relu(u) + 1e-10)**n
    L1 = tf.reduce_mean(input_tensor=tf.square(diff_u))

    # Loss term #2: first boundary condition, u = 1
    bc1_error = model(t_boundary_1, x_boundary_1) - 1

    # Loss term #3: second boundary condition, u_x = 0
    u_boundary_2 = model(t_boundary_2, x_boundary_2)
    bc2_error = tf.gradients(ys=u_boundary_2, xs=x_boundary_2)[0] - 0

    # Loss term #4: initial condition, u = 1
    init_error = model(t_initial, x_initial) - 1

    # Each condition gets its own mean squared error. The errors belong to
    # different sample points, so adding them before squaring let them cancel
    # each other and required all three batches to have the same size.
    L2 = (
        tf.reduce_mean(input_tensor=tf.square(bc1_error))
        + tf.reduce_mean(input_tensor=tf.square(bc2_error))
        + tf.reduce_mean(input_tensor=tf.square(init_error))
    )
    return L1, L2
# initialize DGM model (last input: space dimension = 1)
model = DGM.DGMNet(nodes_per_layer, num_layers, 1)

# tensor placeholders (_tnsr suffix indicates tensors)
# inputs (time, space domain interior, space domain at initial time)
t_interior_tnsr = tf.compat.v1.placeholder(tf.float32, [None, 1])
x_interior_tnsr = tf.compat.v1.placeholder(tf.float32, [None, 1])
t_boundary_1_tnsr = tf.compat.v1.placeholder(tf.float32, [None, 1])
x_boundary_1_tnsr = tf.compat.v1.placeholder(tf.float32, [None, 1])
t_boundary_2_tnsr = tf.compat.v1.placeholder(tf.float32, [None, 1])
x_boundary_2_tnsr = tf.compat.v1.placeholder(tf.float32, [None, 1])
t_initial_tnsr = tf.compat.v1.placeholder(tf.float32, [None, 1])
x_initial_tnsr = tf.compat.v1.placeholder(tf.float32, [None, 1])

# loss
L1_tnsr, L2_tnsr = loss(
    model,
    t_interior_tnsr, x_interior_tnsr,
    t_boundary_1_tnsr, x_boundary_1_tnsr,
    t_boundary_2_tnsr, x_boundary_2_tnsr,
    t_initial_tnsr, x_initial_tnsr
)
loss_tnsr = L1_tnsr + L2_tnsr

# set optimizer
starting_learning_rate = 3e-4
global_step = tf.Variable(0, trainable=False)
lr = tf.compat.v1.train.exponential_decay(
    learning_rate=starting_learning_rate,
    global_step=global_step,
    decay_steps=1e5,
    decay_rate=0.96,
    staircase=True,
)
# global_step must be handed to minimize(); otherwise nothing increments it
# and the decayed learning rate stays at its starting value forever.
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr).minimize(
    loss_tnsr, global_step=global_step
)

# initialize variables
init_op = tf.compat.v1.global_variables_initializer()

# open session
sess = tf.compat.v1.Session()
sess.run(init_op)

# Resume from a checkpoint when one can be loaded. Only ordinary errors are
# treated as "no checkpoint"; KeyboardInterrupt / SystemExit still propagate.
try:
    model.load_weights("checkpoint/")
    print("Loading from checkpoint.")
except Exception:
    print("Checkpoint not found.")

# for each sampling stage
for i in trange(sampling_stages):
    # sample uniformly from the required regions
    (
        t_interior, x_interior,
        t_boundary_1, x_boundary_1,
        t_boundary_2, x_boundary_2,
        t_initial, x_initial,
    ) = sampler(nsim_interior, nsim_boundary_1, nsim_boundary_2, nsim_initial)

    feed = {
        t_interior_tnsr: t_interior,
        x_interior_tnsr: x_interior,
        t_boundary_1_tnsr: t_boundary_1,
        x_boundary_1_tnsr: x_boundary_1,
        t_boundary_2_tnsr: t_boundary_2,
        x_boundary_2_tnsr: x_boundary_2,
        t_initial_tnsr: t_initial,
        x_initial_tnsr: x_initial,
    }

    # for a given sample, take the required number of SGD steps
    for _ in range(steps_per_sample):
        # The fetched loss is stored as loss_value so that the function
        # `loss` is not overwritten by a number.
        loss_value, L1, L2, _ = sess.run(
            [loss_tnsr, L1_tnsr, L2_tnsr, optimizer],
            feed_dict=feed,
        )

    if i % 10 == 0:
        print(f"Loss: {loss_value:.5f},\t L1: {L1:.5f},\t L2: {L2:.5f},\t iteration: {i}")
        # NOTE(review): the extracted source has lost its indentation; saving
        # is assumed to belong to this periodic report - confirm.
        model.save_weights("checkpoint/")
I tried searching how to implement custom loss functions with model as an argument, but couldn't implement it.
For model.compile there is a loss argument for which you can pass the Loss function. May be a string (name of loss function), or a tf.keras.losses.Loss instance. For example
# Built-in loss: a tf.keras.losses.Loss instance is passed through the
# ``loss`` argument of compile().
Model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
loss=tf.keras.losses.BinaryCrossentropy())
If you have created your custom loss function you can also pass that loss function to the loss argument by providing the name of that loss function. For example
def my_loss_fn(y_true, y_pred):
    """Return the squared difference of the inputs, averaged over the last axis."""
    return tf.reduce_mean(tf.square(y_true - y_pred), axis=-1)


# A custom loss is passed as the function object itself.
model.compile(optimizer='adam', loss=my_loss_fn)
Thank You.

PPO: NaN Policy return in Tensorflow Keras

I am trying to implement the PPO algorithm with a clipped loss in addition to KL penalties and run training on MuJoCo Gym environments. After ~15000 gradient steps, the policy collapses and returns NaN.
These are the policy training info before the policy collapses:
A: tf.Tensor(-0.10426917, shape=(), dtype=float32)
LOG_A: tf.Tensor(37.021107, shape=(), dtype=float32)
LOSS: tf.Tensor(0.16812761, shape=(), dtype=float32)
GRAD: tf.Tensor(
[[-3.4624012e-04 -1.2807851e-04 -1.9778654e-01 ... -2.7586846e+00
-1.2552655e-01 -1.7212760e-03]
[ 4.6312678e-05 -2.2251482e-04 5.5088173e-03 ... 9.5249921e-02
2.2186586e-03 2.0080474e-04]
[ 2.0314787e-05 -1.6381161e-04 7.1509695e-03 ... 1.1740552e-01
3.4010289e-03 1.2105847e-04]
...
[ 1.7827883e-04 -1.1712313e-05 5.8873045e-01 ... 9.2354174e+00
2.9186043e-01 -2.2818900e-03]
[-9.0385452e-05 3.0951984e-03 -3.6487404e-02 ... -2.6829168e-01
-3.9602429e-02 2.0654879e-03]
[ 2.2925157e-04 4.6892464e-03 5.9946489e-01 ... 9.3497839e+00
3.0514282e-01 -1.3834883e-03]], shape=(11, 256), dtype=float32)
A: tf.Tensor(nan, shape=(), dtype=float32)
LOG_A: tf.Tensor(nan, shape=(), dtype=float32)
Note: The gradient info captures only the gradients of the first layer, as I have found capturing all gradient info to be messy and seemingly redundant.
What I have tried:
Tuning hyperparameters: I have tried multiple sets of hyperparameters including the one documented in the original paper. The same error occurs(the hyperparams setup provided in the example below are chosen for higher sampling efficiency for faster debugging).
Gradient clipping: Gradient norm has been clipped to be unitary, and as shown above, it does not appear to have the exploding gradient issue.
Guaranteed numerical stability of tanh squashing of policy log probability: A small epsilon was used to clip the sum of squares so that action log probability does not return inf after tanh squashing.
Unitized code example:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gym
import scipy.signal
import time
from tensorflow.keras import Model
import matplotlib.pyplot as plt
import random
import tensorflow_probability as tfp
tf.keras.backend.set_floatx('float32')
# Lower clipping bound used inside the log of the tanh correction.
EPSILON = 1e-10
################## GLOBAL SETUP P1 ##################
problem = "Hopper-v2"
env = gym.make(problem)
eval_env = gym.make(problem)

# Dimensions of the observation and action vectors.
observation_space = env.observation_space
num_states = observation_space.shape[0]
print("Size of State Space -> {}".format(num_states), flush=True)

action_space = env.action_space
num_actions = action_space.shape[0]
print("Size of Action Space -> {}".format(num_actions), flush=True)

# Only the first component of the action bounds is read.
upper_bound = action_space.high[0]
lower_bound = action_space.low[0]
print("Max Value of Action -> {}".format(upper_bound), flush=True)
print("Min Value of Action -> {}".format(lower_bound), flush=True)

minibatch_size = 256
##########*****####################*****##########
#################### Auxiliaries ####################
def discounted_cumulative_sums(x, discount):
    """Return the discounted cumulative sums of ``x`` along axis 0.

    Element ``i`` of the result is ``x[i] + discount * x[i+1] +
    discount**2 * x[i+2] + ...``. Used for rewards-to-go and advantage
    estimates.
    """
    # Running the recursive filter y[k] = v[k] + discount * y[k-1] over the
    # reversed input accumulates from the end of the sequence.
    numerator = [1]
    denominator = [1, float(-discount)]
    backwards = scipy.signal.lfilter(numerator, denominator, x[::-1], axis=0)
    return backwards[::-1]
##########*****####################*****##########
#################### Replay Buffer ####################
class Buffer:
    """Fixed-size store for rollout data with advantage / return computation."""

    def __init__(self, observation_dimensions, action_dimensions, size, gamma=0.99, lam=0.95):
        """Allocate the buffers.

        Args:
            observation_dimensions: length of one observation vector.
            action_dimensions: length of one action vector.
            size: maximum number of stored steps.
            gamma: discount factor.
            lam: additional discount applied in the advantage estimate.
        """
        self.observation_buffer = np.zeros(
            (size, observation_dimensions), dtype=np.float32
        )
        # float32, not int32: an integer buffer truncates continuous actions
        # on assignment.
        self.action_buffer = np.zeros((size, action_dimensions), dtype=np.float32)
        self.advantage_buffer = np.zeros(size, dtype=np.float32)
        self.reward_buffer = np.zeros(size, dtype=np.float32)
        self.return_buffer = np.zeros(size, dtype=np.float32)
        self.value_buffer = np.zeros(size, dtype=np.float32)
        self.logprobability_buffer = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        # pointer: next free slot; trajectory_start_index: first slot of the
        # trajectory that is currently being collected.
        self.pointer, self.trajectory_start_index = 0, 0

    def store(self, observation, action, reward, value, logprobability):
        """Append one step of agent-environment interaction."""
        self.observation_buffer[self.pointer] = observation
        self.action_buffer[self.pointer] = action
        self.reward_buffer[self.pointer] = reward
        self.value_buffer[self.pointer] = value
        self.logprobability_buffer[self.pointer] = logprobability
        self.pointer += 1

    def finish_trajectory(self, last_value=0):
        """Compute advantages and returns for the trajectory collected so far.

        Args:
            last_value: value appended after the last stored step to
                bootstrap the tail of the trajectory.
        """
        path_slice = slice(self.trajectory_start_index, self.pointer)
        rewards = np.append(self.reward_buffer[path_slice], last_value)
        values = np.append(self.value_buffer[path_slice], last_value)
        # One-step temporal-difference residuals.
        deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1]
        self.advantage_buffer[path_slice] = discounted_cumulative_sums(
            deltas, self.gamma * self.lam
        )
        self.return_buffer[path_slice] = discounted_cumulative_sums(
            rewards, self.gamma
        )[:-1]
        self.trajectory_start_index = self.pointer

    def get(self, batch_size=None):
        """Sample a minibatch and normalise its advantages.

        Indices are drawn with replacement from the filled part of the buffer.

        Args:
            batch_size: number of samples; defaults to the module-level
                ``minibatch_size``.

        Returns:
            ``(observations, actions, normalised_advantages, returns,
            logprobabilities)`` for the sampled indices.
        """
        if batch_size is None:
            batch_size = minibatch_size
        rindex = np.random.choice(self.pointer, batch_size)
        advantages = self.advantage_buffer[rindex]
        advantage_mean = np.mean(advantages)
        advantage_std = np.std(advantages)
        # The small constant avoids 0/0 = NaN when all sampled advantages
        # are equal.
        normalized_advantages = (advantages - advantage_mean) / (advantage_std + 1e-8)
        return (
            self.observation_buffer[rindex],
            self.action_buffer[rindex],
            normalized_advantages,
            self.return_buffer[rindex],
            self.logprobability_buffer[rindex],
        )

    def clear(self):
        """Mark the buffer as empty; stored values are overwritten later."""
        self.pointer, self.trajectory_start_index = 0, 0
##########*****####################*****##########
#################### Models ####################
class Actor(Model):
    """Gaussian policy with tanh-squashed actions.

    Two hidden ReLU layers feed two linear heads, one for the mean and one
    for the log standard deviation of a diagonal normal distribution.
    """

    # Bounds for the predicted log standard deviation. Without them
    # exp(log_sigma) can underflow to 0 or overflow to inf, which turns the
    # log-probability into inf / NaN.
    LOG_SIGMA_MIN = -20.0
    LOG_SIGMA_MAX = 2.0

    def __init__(self):
        super().__init__()
        self.action_dim = num_actions
        self.dense1_layer = layers.Dense(256, activation="relu")
        self.dense2_layer = layers.Dense(256, activation="relu")
        self.mean_layer = layers.Dense(self.action_dim)
        self.stdev_layer = layers.Dense(self.action_dim)

    def call(self, state, eval_mode=False):
        """Return a scaled action and its log-probability for each state.

        Args:
            state: batch of observations.
            eval_mode: if True the mean is used as the action, otherwise an
                action is sampled.

        Returns:
            ``(action * upper_bound, log_pi)`` where ``log_pi`` is the
            log-probability of the squashed action.
        """
        hidden = self.dense2_layer(self.dense1_layer(state))
        mu = self.mean_layer(hidden)
        log_sigma = tf.clip_by_value(
            self.stdev_layer(hidden), self.LOG_SIGMA_MIN, self.LOG_SIGMA_MAX
        )
        sigma = tf.exp(log_sigma)
        # Same distribution as a full-covariance normal with covariance
        # diag(sigma**2), but without squaring sigma and running a Cholesky
        # factorisation, which yields NaN once sigma**2 under- or overflows.
        dist = tfp.distributions.MultivariateNormalDiag(loc=mu, scale_diag=sigma)
        if eval_mode:
            action_ = mu
        else:
            action_ = dist.sample()
        action = tf.tanh(action_)
        log_pi_ = dist.log_prob(action_)
        # Change-of-variables correction for the tanh squashing; the clip
        # keeps the argument of the log away from 0.
        log_pi = log_pi_ - tf.reduce_sum(
            tf.math.log(tf.clip_by_value(1 - action**2, EPSILON, 1.0)), axis=1
        )
        return action*upper_bound, log_pi
def get_critic():
    """Build the value network: two hidden ReLU layers and one linear output.

    Returns:
        A ``tf.keras.Model`` mapping a batch of observations to one value per
        observation (output shape ``(batch, 1)``).
    """
    # The shape must be a tuple; ``(num_states)`` without the trailing comma
    # is just the integer itself.
    state_input = layers.Input(shape=(num_states,))
    state_out = layers.Dense(256, activation="relu")(state_input)
    out = layers.Dense(256, activation="relu")(state_out)
    outputs = layers.Dense(1, dtype='float32')(out)
    model = tf.keras.Model(state_input, outputs)
    return model
##########*****####################*****##########
#################### GLOBAL SETUP P2 ####################
# Hyperparameters of the PPO algorithm
horizon = 2048     # environment steps collected per iteration
iterations = 2000  # number of collect-then-train iterations
gamma = 0.99       # discount factor
clip_ratio = 0.2   # clipping range of the probability ratio
epochs = 500       # gradient steps per iteration
lam = 0.97         # extra discount of the advantage estimate
target_kl = 0.01   # KL target used to adapt beta
beta = 1.0         # coefficient of the KL penalty (adapted during training)
render = False

actor_model = Actor()
critic_model = get_critic()

lr = 0.0003
# Gradients are clipped to unit norm in both optimizers.
policy_optimizer = tf.keras.optimizers.Adam(learning_rate=lr, clipnorm=1.0)
value_optimizer = tf.keras.optimizers.Adam(learning_rate=lr, clipnorm=1.0)

# gamma and lam are passed on explicitly; otherwise the buffer silently uses
# its own defaults and the value of lam set here has no effect.
buffer = Buffer(num_states, num_actions, horizon, gamma=gamma, lam=lam)
##########*****####################*****##########
#################### Training ####################
# Start the first episode and keep a batched copy of the observation
# (leading axis of size 1) for the networks.
observation = env.reset()
episode_return = 0
episode_length = 0
tf_observation = tf.expand_dims(observation, 0)
def train_policy(
    observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
):
    """Take one gradient step on the policy and adapt the KL coefficient.

    Args:
        observation_buffer: batch of observations.
        action_buffer: accepted but not read by this function.
        logprobability_buffer: log-probabilities stored at collection time.
        advantage_buffer: advantages for the batch.

    Returns:
        Mean of ``logprobability_buffer - log_a`` after the update, where
        ``log_a`` comes from a fresh forward pass of the actor.

    NOTE(review): ``actor_model`` samples new actions on every call, so
    ``log_a`` is the log-probability of those fresh samples and not of the
    actions stored in ``action_buffer`` - confirm that this is intended.
    """
    global beta
    # Bounds for the adaptive coefficient. It is halved or doubled on every
    # call; without bounds it can reach 0 or overflow to inf, and an inf
    # coefficient turns the loss into inf / NaN.
    beta_min, beta_max = 1e-4, 1e4

    with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
        action, log_a = actor_model(observation_buffer)
        ratio = tf.exp(log_a - logprobability_buffer)
        cd_ratio = tf.clip_by_value(ratio, (1 - clip_ratio), (1 + clip_ratio))
        min_advantage = cd_ratio * advantage_buffer
        # Penalty on the largest log-probability gap in the batch.
        _kl = -beta*tf.math.reduce_max(logprobability_buffer - log_a)
        policy_loss = -tf.reduce_mean(
            tf.minimum(ratio * advantage_buffer, min_advantage) + _kl
        )
    policy_grads = tape.gradient(policy_loss, actor_model.trainable_variables)
    policy_optimizer.apply_gradients(zip(policy_grads, actor_model.trainable_variables))

    action_opt, log_a_opt = actor_model(observation_buffer)
    kl = tf.reduce_mean(logprobability_buffer - log_a_opt)
    if kl < target_kl/1.5:
        beta = max(beta/2, beta_min)
    if kl > target_kl*1.5:
        beta = min(beta*2, beta_max)
    return kl
def train_value_function(observation_buffer, return_buffer):
    """Take one gradient step on the mean squared error of the critic.

    Args:
        observation_buffer: batch of observations.
        return_buffer: target returns, one per observation, shape ``(batch,)``.
    """
    with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
        # The critic returns shape (batch, 1). Without the squeeze the
        # subtraction broadcasts (batch,) against (batch, 1) into a
        # (batch, batch) matrix and every prediction is compared with every
        # target.
        values = tf.squeeze(critic_model(observation_buffer), axis=-1)
        value_loss = tf.reduce_mean((return_buffer - values) ** 2)
    value_grads = tape.gradient(value_loss, critic_model.trainable_variables)
    value_optimizer.apply_gradients(zip(value_grads, critic_model.trainable_variables))
for ite in range(iterations):
    # --- Collect `horizon` environment steps with the current policy. ---
    for t in range(horizon):
        if render:
            env.render()

        # Act on the batched observation and strip the batch axis again.
        action, log_pi_a = actor_model(tf_observation)
        action = action[0]
        observation_new, reward, done, _ = env.step(action)
        episode_return += reward
        episode_length += 1

        # Store the step together with the critic's value of the state the
        # action was taken in.
        value_t = critic_model(tf_observation)
        buffer.store(observation, action, reward, value_t, log_pi_a)

        observation = observation_new
        tf_observation = tf.expand_dims(observation, 0)

        terminal = done
        last_step = (t == horizon - 1)
        if terminal or last_step:
            # A trajectory cut off by the horizon is bootstrapped with the
            # critic's value; a finished episode ends with value 0.
            if done:
                last_value = 0
            else:
                last_value = critic_model(tf_observation)
            buffer.finish_trajectory(last_value)
            observation, episode_return, episode_length = env.reset(), 0, 0
            tf_observation = tf.expand_dims(observation, 0)

    # --- Update policy and value function on sampled minibatches. ---
    for _ in range(epochs):
        minibatch = buffer.get()
        (
            observation_buffer,
            action_buffer,
            advantage_buffer,
            return_buffer,
            logprobability_buffer,
        ) = minibatch
        kl = train_policy(
            observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
        )
        train_value_function(observation_buffer, return_buffer)

    buffer.clear()
##########*****####################*****##########
Note:
The code base is constructed by a combination of a modified version of the official keras PPO tutorial(https://keras.io/examples/rl/ppo_cartpole/) and Modules(Mainly the policy network) that have been tested in other implementations.
I refrained from using tf_function declaration as I am very new to tensorflow, thus not understanding its impact, and I have read from various github issues that sometimes such declaration causes numerical instability due to caching. However, it could be a source of my issues.
Any help is appreciated, and apologies if something is missing or unclear.

Not able to generate correct English to SQL translations using LSTM for machine translation

I'm using recurrent neural networks to train a model that translates sample English sentences such as "fetch all employee data" into SQL such as "SELECT * FROM EMPLOYEE". Right now my program takes 100 epochs of training time but produces the same translation for every input. The required libraries are tensorflow and keras. Could someone take a look at my program and help me generate the correct translations?
Here is my code in python:
https://github.com/Kashdog/engsqlnmt
here's my code:
from __future__ import print_function
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np
import h5py
# --- Training hyper-parameters ---
batch_size = 64 # Batch size for training.
epochs = 200 # Number of epochs to train for.
latent_dim = 256 # Latent dimensionality of the encoding space.
num_samples = 10000 # Number of samples to train on.
# Path to the data txt file on disk.
data_path = 'eng-sql/sql.txt'
# Vectorize the data.
# input_texts/target_texts keep the raw sentence pairs in file order;
# the two sets collect every distinct character seen on each side.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
with open(data_path, 'r', encoding='utf-8') as f:
lines = f.read().split('\n')
# At most num_samples lines are used, and never the last element of the split
# (presumably the empty string left after a trailing newline -- confirm).
for line in lines[: min(num_samples, len(lines) - 1)]:
# Debug print of the raw split.
print(line.split('^'))
# Each line must contain exactly one '^' between the English text and the SQL;
# any other count makes this two-name unpacking raise ValueError.
input_text, target_text = line.split('^')
# We use "tab" as the "start sequence" character
# for the targets, and "\n" as "end sequence" character.
target_text = '\t' + target_text + '\n'
input_texts.append(input_text)
target_texts.append(target_text)
for char in input_text:
if char not in input_characters:
input_characters.add(char)
for char in target_text:
if char not in target_characters:
target_characters.add(char)
# Sorted character lists give every token a stable integer index.
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
# The longest sentence on each side fixes the time dimension of the tensors below.
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])
print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)
# character -> index lookup tables used for one-hot encoding
input_token_index = dict(
[(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict(
[(char, i) for i, char in enumerate(target_characters)])
# One-hot tensors of shape (samples, max sequence length, vocabulary size),
# zero-initialised so positions past the end of a sentence stay all-zero.
encoder_input_data = np.zeros(
(len(input_texts), max_encoder_seq_length, num_encoder_tokens),
dtype='float32')
decoder_input_data = np.zeros(
(len(input_texts), max_decoder_seq_length, num_decoder_tokens),
dtype='float32')
decoder_target_data = np.zeros(
(len(input_texts), max_decoder_seq_length, num_decoder_tokens),
dtype='float32')
# Row i of every tensor encodes input_texts[i]/target_texts[i]; the rows follow
# file order because nothing is shuffled here.
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
for t, char in enumerate(input_text):
encoder_input_data[i, t, input_token_index[char]] = 1.
for t, char in enumerate(target_text):
# decoder_target_data is ahead of decoder_input_data by one timestep
decoder_input_data[i, t, target_token_index[char]] = 1.
if t > 0:
# decoder_target_data will be ahead by one timestep
# and will not include the start character.
decoder_target_data[i, t - 1, target_token_index[char]] = 1.
# Define an input sequence and process it.
# The time dimension is None, so sequences of any length are accepted.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
initial_state=encoder_states)
# Softmax over the target vocabulary at every decoder time step.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
# NOTE(review): per the Keras `fit` documentation, `validation_split` takes the
# validation samples from the END of the arrays, before any shuffling. The
# arrays are in file order here, so the held-out 20% is the tail of sql.txt.
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
batch_size=batch_size,
epochs=epochs,
validation_split=0.2)
# Save model
model.save('s2s.h5')
# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states
# Define sampling models
# The encoder model maps an input sequence to its final [h, c] states.
encoder_model = Model(encoder_inputs, encoder_states)
# At inference time the decoder state is fed in explicitly, one step at a time.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# The same `decoder_lstm` and `decoder_dense` layer objects used for training
# are called again here, so the sampling model uses the trained weights.
decoder_outputs, state_h, state_c = decoder_lstm(
decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
# Inputs: [previous token, h, c]; outputs: [token probabilities, new h, new c].
decoder_model = Model(
[decoder_inputs] + decoder_states_inputs,
[decoder_outputs] + decoder_states)
# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
(i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
(i, char) for char, i in target_token_index.items())
def decode_sequence(input_seq):
    """Greedily decode one encoded input sequence into a target string.

    The encoder turns `input_seq` into the initial decoder state. The decoder
    is then run one character at a time, always feeding back the most
    probable character, until it emits the end character '\\n' or the output
    grows longer than `max_decoder_seq_length`.

    Args:
        input_seq: one-hot encoded input with a leading batch dimension of 1.

    Returns:
        The decoded string, including the terminating '\\n' when one was
        produced.
    """
    # The encoder's final [h, c] states seed the decoder.
    states_value = encoder_model.predict(input_seq)

    # One-hot "previous character" fed to the decoder; it starts as the
    # start-of-sequence marker (tab).
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.

    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Greedy choice: the most probable character at the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Stop on the end character or once the maximum length is exceeded.
        too_long = len(decoded_sentence) > max_decoder_seq_length
        if sampled_char == '\n' or too_long:
            break

        # Feed the sampled character and the new states into the next step.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]

    return decoded_sentence
# Decode the first 39 training samples and print each next to its input.
for seq_index in range(39):
# Take one sequence (part of the training set)
# for trying out decoding.
# The slice keeps the leading batch dimension (size 1).
input_seq = encoder_input_data[seq_index: seq_index + 1]
decoded_sentence = decode_sequence(input_seq)
print('-')
print(seq_index)
print('Input sentence:', input_texts[seq_index])
print('Decoded sentence:', decoded_sentence)
print('testing')
# Two rows are allocated, but only row 1 is filled and decoded below.
encoder_test_data = np.zeros(
(2,max_encoder_seq_length, num_encoder_tokens),
dtype='float32')
test_seq = "fetch total employee data"
print(test_seq)
#encoder_test_data
# One-hot encode the test sentence with the training vocabulary. A character
# that is not in input_token_index raises KeyError, and a sentence longer than
# max_encoder_seq_length raises IndexError.
for t, char in enumerate(test_seq):
encoder_test_data[1,t, input_token_index[char]] = 1.
#input_seq = 'fetch all customer data'
decoded_sentence = decode_sequence(encoder_test_data[1:2])
print('Decoded test sentence:', decoded_sentence)
and my data file(sql.txt) is:
fetch all customer data^SELECT * FROM CUSTOMER
find all customer data^SELECT * FROM CUSTOMER
retrieve all customer data^SELECT * FROM CUSTOMER
get all customer data^SELECT * FROM CUSTOMER
download all customer data^SELECT * FROM CUSTOMER
select all customer data^SELECT * FROM CUSTOMER
obtain all employee info^SELECT * FROM EMPLOYEE
show all employee info^SELECT * FROM EMPLOYEE
display all employee info^SELECT * FROM EMPLOYEE
TL;DR: Your dataset is very small, biased, and lacks the variety RNNs need, so you need some 'tricks' to make your code work.
The problem is you didn't shuffle your input data. (The fully working source code is here)
If you look at your sql.txt file, you'll notice the dataset is sorted into customer and employee examples, which makes it harder for your network to learn. Furthermore, your dataset is biased [30 customer samples versus 70 employee samples].
Also, your hidden size (latent_dim) was a little big for this small dataset (~100 samples),
so I made some changes:
batch_size = 32 # Batch size for training.
epochs = 300 # Number of epochs to train for.
latent_dim = 32 # Latent dimensionality of the encoding space.
Here's the shuffle code:
import random

# Shuffle the (input, target) pairs together so that every sentence keeps its
# own translation.
all_data = list(zip(input_texts, target_texts))
random.shuffle(all_data)
# Write the shuffled order back to the text lists. The decoding loop prints
# `input_texts[seq_index]` next to the result for
# `encoder_input_data[seq_index]`, so both must follow the same order;
# otherwise the printed input sentence does not belong to the decoded output.
input_texts = [pair[0] for pair in all_data]
target_texts = [pair[1] for pair in all_data]
# One-hot encode the pairs in shuffled order. The three arrays are assumed to
# be freshly zero-filled, exactly as when they were created with np.zeros.
for i, (input_text, target_text) in enumerate(all_data):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.
so here's the result (I think you'll need more data and a not-biased dataset):
-
34
Input sentence: show all client information
Decoded sentence: SELECT * FROM CUSTOMER
-
35
Input sentence: display all client information
Decoded sentence: SELECT * FROM CUSTOMER
-
36
Input sentence: fetch me all client information
Decoded sentence: SELECT * FROM CUSTOMER
-
37
Input sentence: get me all client information
Decoded sentence: SELECT * FROM CUSTOMER
-
38
Input sentence: get me all employee information
Decoded sentence: SELECT * FROM EMPLOYEE
testing
fetch total employee data
Decoded test sentence: SELECT * FROM EMPLOYEE

Linear Regression with Neural Networks in Tensorflow and normalization

I've been following this tutorial:
https://blog.altoros.com/using-linear-regression-in-tensorflow.html
I'm aware there are better ways to do linear regression, but I'm using this as a base for multivariate regression and multivariate non-linear regression to try to understand TensorFlow.
Without normalizing my data at all, I get 'nan' with GradientDescentOptimizer. I'm curious about why this is. Why is normalization so important that the model won't run at all? And what about subtracting mean and dividing by standard deviation suddenly makes it work so well?
After normalizing data, I'd like to recover the original value.
Each set of data seems to be normalized separately with its own stddev and mean parameters: the training data X, training data Y, test data X, and test data Y.
However, when I run the network on new data, I'm assuming when I predict new values, I have to normalize the input again. In that case, how do I make sense of the predicted Y? Am I supposed to use the training data's standard deviation and mean to unnormalize, or the new data's standard deviation and mean? I am confused what the model is actually fitting to when I give it normalized training data, and how to interpret W and b. I originally wanted to fit to Y = mx + b, and want to know what m and b really are.
Because I trained on training data, I assumed that I would need to store the training_data's pre-normalization standard deviation and mean and unnormalize any results from the network using this value. But in fact, when I use the new data's standard deviation and mean to unnormalize I get more reasonable values. I don't think it's worth posting that code because I just have a fundamental misunderstanding of what I need to do, but this is the basic code I'm using anyway.
import tensorflow as tf
import numpy
import matplotlib.pyplot as plt
# Train a data set
# X: size data
size_data = [ 2104, 1600, 2400, 1416, 3000, 1985, 1534, 1427,
1380, 1494, 1940, 2000, 1890, 4478, 1268, 2300,
1320, 1236, 2609, 3031, 1767, 1888, 1604, 1962,
3890, 1100, 1458, 2526, 2200, 2637, 1839, 1000,
2040, 3137, 1811, 1437, 1239, 2132, 4215, 2162,
1664, 2238, 2567, 1200, 852, 1852, 1203 ]
# Y: price data (set to 5x + 30)
price_data = [5*c + 30 for c in size_data]
size_data = numpy.asarray(size_data)
price_data = numpy.asarray(price_data)
# Test a data set
size_data_test = [ 1600, 1494, 1236, 1100, 3137, 2238 ]
price_data_test = [5*c + 30 for c in size_data_test]
size_data_test = numpy.asarray(size_data_test)
price_data_test = numpy.asarray(price_data_test)
def normalize(array):
    """Standardize *array* to zero mean and unit standard deviation.

    Args:
        array: NumPy array, or any array-like of numbers.

    Returns:
        A tuple ``(normalized, std, mean)``. ``std`` and ``mean`` are the
        statistics of the input, so the original values are recovered with
        ``normalized * std + mean``.
    """
    array = numpy.asarray(array)
    std = array.std()
    mean = array.mean()
    # A constant input has std == 0 and dividing by it fills the result with
    # NaN. Every centred value is already 0 in that case, so divide by 1; the
    # true std (0) is still returned, which keeps the inverse formula exact.
    divisor = std if std != 0 else 1.0
    return (array - mean) / divisor, std, mean
# Normalize a data set
# NOTE(review): each of the four arrays is standardized with ITS OWN mean and
# std. The model below is fitted on size_data_n/price_data_n, i.e. in units of
# the training statistics, so test inputs standardized with different
# statistics are on a different scale than the one the model was fitted on.
size_data_n, size_data_n_std, size_data_n_mean = normalize(size_data)
price_data_n, price_data_n_std, price_data_n_mean = normalize(price_data)
size_data_test_n, size_data_test_n_std, size_data_test_n_mean = normalize(size_data_test)
price_data_test_n, price_data_test_n_std, price_data_test_n_mean = normalize(price_data_test)
# Display a plot
#plt.plot(size_data, price_data, 'ro', label='Samples data')
#plt.legend()
#plt.draw()
# Number of training samples; used as the divisor of the cost below.
samples_number = price_data_n.size
# TF graph input
# No shape is given, so the same placeholders are fed both single values
# (training loop) and whole arrays (cost evaluation).
X = tf.placeholder("float")
Y = tf.placeholder("float")
# Create a model
# Set model weights
W = tf.Variable(numpy.random.randn(), name="weight")
b = tf.Variable(numpy.random.randn(), name="bias")
# Set parameters
learning_rate = 0.05
training_iteration = 200
# Construct a linear model
model = tf.add(tf.mul(X, W), b)
# Minimize squared errors
# The sum is always divided by 2 * (training-set size), whatever is fed in:
# a single sample during training, or the 6 test samples during validation.
cost_function = tf.reduce_sum(tf.pow(model - Y, 2))/(2 * samples_number) #L2 loss
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_function) #Gradient descent
#optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(cost_function)
# Initialize variables
init = tf.initialize_all_variables()
# Launch a graph
with tf.Session() as sess:
sess.run(init)
display_step = 20
# Fit all training data
# One optimizer step per (x, y) sample, repeated training_iteration times.
for iteration in range(training_iteration):
for (x, y) in zip(size_data_n, price_data_n):
sess.run(optimizer, feed_dict={X: x, Y: y})
# Display logs per iteration step
if iteration % display_step == 0:
print("Iteration:", '%04d' % (iteration + 1), "cost=", "{:.9f}".format(sess.run(cost_function, feed_dict={X:size_data_n, Y:price_data_n})),\
"W=", sess.run(W), "b=", sess.run(b))
# Cost over the whole (normalized) training set after training.
tuning_cost = sess.run(cost_function, feed_dict={X: size_data_n, Y: price_data_n})
print("Tuning completed:", "cost=", "{:.9f}".format(tuning_cost), "W=", sess.run(W), "b=", sess.run(b))
# Validate a tuning model
# The test arrays fed here were standardized with the test set's own statistics.
testing_cost = sess.run(cost_function, feed_dict={X: size_data_test_n, Y: price_data_test_n})
print("Testing data cost:" , testing_cost)
Y_predicted = sess.run(model, feed_dict={X: size_data_test_n, Y: price_data_test_n})
# Columns printed below: test X, model prediction, true value, relative error.
print("%-20s%-20s%-20s%-20s" % ("Test X", "Actual", "Target", "Error(%)"))
print('Normalized')
for i in range(len(size_data_test_n)):
err = 100.0 * abs(Y_predicted[i] - price_data_test_n[i]) / abs(price_data_test_n[i])
print("%-20f%-20f%-20f%-20f" % (size_data_test_n[i], Y_predicted[i], price_data_test_n[i], err))
print('Unnormalized')
for i in range(len(size_data_test_n)):
# Undo the standardization of the test inputs/targets with the test statistics
# they were standardized with.
orig_size_data_test_i = size_data_test_n[i] * size_data_test_n_std + size_data_test_n_mean
orig_price_data_test_i = price_data_test_n[i] * price_data_test_n_std + price_data_test_n_mean
# ??? which one is correct for getting unnormalized predicted Y?
# NOTE(review): the prediction is expressed in the units the model was trained
# in (training-set price statistics). The line in use below converts it with
# the TEST statistics; this only lines up because the test inputs were also
# standardized with test statistics -- confirm which convention is intended.
#orig_Y_predicted_i = Y_predicted[i] * price_data_n_std + price_data_n_mean
orig_Y_predicted_i = Y_predicted[i] * price_data_test_n_std + price_data_test_n_mean
orig_err = 100.0 * abs(orig_Y_predicted_i - orig_price_data_test_i) / abs(orig_price_data_test_i)
print("%-20f%-20f%-20f%-20f" % (orig_size_data_test_i, orig_Y_predicted_i, orig_price_data_test_i, orig_err))
# Display a plot
plt.figure()
plt.plot(size_data, price_data, 'ro', label='Samples')
plt.plot(size_data_test, price_data_test, 'go', label='Testing samples')
# NOTE(review): this line mixes statistics: X is standardized with the test
# statistics (size_data_test_n) while the result is converted back with the
# training price statistics (price_data_n_std / price_data_n_mean).
plt.plot(size_data_test, (sess.run(W) * size_data_test_n + sess.run(b))*price_data_n_std + price_data_n_mean , label='Fitted test line')
plt.legend()
plt.show()

Tensorflow: custom data load + asynchronous computation [closed]

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 4 years ago.
Improve this question
This is a how-to that I believe is missing from the TF examples.
Task:
1. samples for each class are given in a separate dir, so the labels are indirect (i.e. implied by the dir)
2. decoupled loading and computation in TF
Each separate bit can be found elsewhere, but I think having them all together in one place will save a lot of time for TF beginners (like myself).
Let's tackle 1. In my case it is two sets of images:
# all filenames for .jpg in dir
# - list of fnames
# - list of labels
def path_fnames(f_path, label, ext=('.jpg', '.jpeg')):
    """List the files in a directory that have a given extension, with labels.

    Args:
        f_path: directory to scan (not recursive).
        label: label attached to every file found in `f_path`.
        ext: extension, or iterable of extensions, to accept, each including
            the leading dot. Matching ignores case on both sides.

    Returns:
        A tuple ``(f_n, f_l)``: the matching paths, sorted by file name, and a
        list of the same length holding `label` once per file.

    Raises:
        OSError: if `f_path` cannot be listed.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(ext, str):
        ext = (ext,)
    # The file extension is lower-cased before the comparison, so the wanted
    # extensions must be lower-cased too or '.JPG' could never match.
    wanted = {e.lower() for e in ext}
    f_n = [os.path.join(f_path, f)
           for f in sorted(os.listdir(f_path))
           if os.path.splitext(f)[1].lower() in wanted]
    f_l = [label] * len(f_n)
    return f_n, f_l
#
def dense_to_one_hot(labels_dense, num_classes=10, dtype=np.float32):
    """Convert class labels from scalars to one-hot vectors.

    Args:
        labels_dense: integer class labels (array or array-like); the input
            is flattened before encoding.
        num_classes: number of columns in the result; every label must lie in
            ``[0, num_classes)``.
        dtype: dtype of the returned array.

    Returns:
        Array of shape ``(num_labels, num_classes)`` with a single 1 per row,
        in the column given by that row's label.

    Raises:
        IndexError: if a label is negative or not smaller than `num_classes`.
    """
    labels = np.asarray(labels_dense).ravel()
    num_labels = labels.shape[0]
    labels_one_hot = np.zeros((num_labels, num_classes), dtype=dtype)
    if num_labels == 0:
        # Nothing to encode; avoids indexing with an empty non-integer array.
        return labels_one_hot
    # With flat indexing an out-of-range label lands in another row's cells
    # instead of failing, so the range is checked explicitly.
    if labels.min() < 0 or labels.max() >= num_classes:
        raise IndexError(
            "labels must be in [0, %d), got range [%s, %s]"
            % (num_classes, labels.min(), labels.max()))
    labels_one_hot[np.arange(num_labels), labels] = 1
    return labels_one_hot
# Dataset layout: one sub-directory per class under data_dir.
# Note: data_dir ends with '/' and the class dirs start with '/', so the
# concatenated paths below contain a double slash ('/mnt/dataset//class_1').
data_dir = '/mnt/dataset/'
dir_1 = '/class_1'
dir_2 = '/class_2'
# --- get filenames for data ---
dpath = [data_dir+dir_1, data_dir+dir_2]
# Files in the first directory get label 0, files in the second get label 1.
f_n1, f_l1 = path_fnames(dpath[0], 0)
f_n2, f_l2 = path_fnames(dpath[1], 1)
# --- create one-hot labels ---
# Labels are concatenated in the same order as the file names below, so row i
# of `ohl` belongs to fnames[i].
ohl = dense_to_one_hot(np.asarray(f_l1+f_l2), num_classes=2, dtype = np.float32)
fnames = f_n1+f_n2; # one-hot labels created in this sequence
Now we have all file-names and one-hot labels preloaded.
Lets move to the 2.
It is based on How to prefetch data using a custom python function in tensorflow. In short it has:
a custom image reader (replace it with your own)
a queue fnl_q holding [filename, label] pairs, which the reader consumes
a queue proc_q holding [sample, label] pairs, which feeds the processing op some_op
a thread that runs read_op to get a [sample, label] pair and enqueue_op to put the pair into proc_q; the thread is controlled by a tf.train.Coordinator
some_op, which first gets data from proc_q via dequeue_many() and then does the rest of the computation (this could also be put in a separate thread).
Notes:
feature_read_op and label_read_op are two separate ops.
I use sleep() to slow down and control op - only for test purposes
i have separated "feeding" and "calculation" parts - in real case just run them in parallel
print 'TF version:', tf.__version__
# --- params ----
im_s = [30, 30, 1] # target image size
BATCH_SIZE = 16
# image reader
# Notes
# - to resize: image_tensor = tf.image.resize_image_with_crop_or_pad(image_tensor, HEIGHT, WIDTH)
# - how about image preprocessing?
def img_reader_jpg(fnl_queue, ch=3, keep=False):
    """Build the ops that read one JPEG and its label from a filename queue.

    Args:
        fnl_queue: queue of [filename, label] pairs; one pair is dequeued
            whenever the returned tensors are evaluated.
        ch: number of colour channels requested from the JPEG decoder.
        keep: when true, the dequeued pair is put back on `fnl_queue`.

    Returns:
        A tuple ``(img_f32, label)``: the decoded image cast to float32 and
        divided by 256.0, and the label dequeued together with the file name.
    """
    fn, label = fnl_queue.dequeue()
    if keep:
        # Creating an enqueue op does not execute it: in graph mode an op only
        # runs when something that is fetched depends on it. Routing both
        # outputs through the control dependency makes the re-enqueue happen
        # each time the returned tensors are evaluated.
        requeue_op = fnl_queue.enqueue([fn, label])
        with tf.control_dependencies([requeue_op]):
            fn = tf.identity(fn)
            label = tf.identity(label)
    img_bytes = tf.read_file(fn)
    img_u8 = tf.image.decode_jpeg(img_bytes, channels=ch)
    img_f32 = tf.cast(img_u8, tf.float32) / 256.0
    return img_f32, label
# load [feature, label] and enqueue to processing queue
def load_and_enqueue(sess, coord, feature_read_op, label_read_op, enqueue_op):
    """Read samples one at a time and push them onto the processing queue.

    Runs until the coordinator asks for a stop. The sample is fed to
    `enqueue_op` through the module-level placeholders `feature_input` and
    `label_input`.

    Args:
        sess: tf session used to run the ops.
        coord: tf Coordinator; the loop ends once `coord.should_stop()` is
            true.
        feature_read_op: op producing one feature (image).
        label_read_op: op producing the label that belongs to the feature.
        enqueue_op: op that enqueues the fed [feature, label] pair.
    """
    while not coord.should_stop():
        # for testing purpose only: throttles the loader
        time.sleep(0.1)
        # Both ops are fetched in ONE run call so that the feature and the
        # label come from the same dequeued pair.
        feature, label = sess.run([feature_read_op, label_read_op])
        feed_dict = {feature_input: feature,
                     label_input: label}
        sess.run(enqueue_op, feed_dict=feed_dict)
# --- TF part ---
# filenames and labels are pre-loaded
fv = tf.constant(fnames)
lv = tf.constant(ohl)
#fnl_q = tf.FIFOQueue(len(fnames), [tf.string, tf.float32])
# File-name/label queue. Positional arguments: capacity (one slot per file),
# min_after_dequeue (0) and the component dtypes.
fnl_q = tf.RandomShuffleQueue(len(fnames), 0, [tf.string, tf.float32])
# One run of do_enq puts every (filename, label) pair on the queue.
do_enq = fnl_q.enqueue_many([fv, lv])
# reading_op: feature_read_op label_read_op
feature_read_op, label_read_op = img_reader_jpg(fnl_q, ch = im_s[2])
# samples queue
f_s = im_s
l_s = 2
# Placeholders through which the loader thread feeds one decoded sample and
# its one-hot label into the processing queue.
feature_input = tf.placeholder(tf.float32, shape=f_s, name='feature_input')
label_input = tf.placeholder(tf.float32, shape=l_s, name='label_input')
#proc_q = tf.RandomShuffleQueue(len(fnames), 0, [tf.float32, tf.float32], shapes=[f_s, l_s])
# Fixed element shapes are declared here; dequeue_many below batches the elements.
proc_q = tf.FIFOQueue(len(fnames), [tf.float32, tf.float32], shapes=[f_s, l_s])
enqueue_op = proc_q.enqueue([feature_input, label_input])
# test:
# - some op
# Stand-in for the real computation: just fetch one batch of BATCH_SIZE samples.
img_batch, lab_batch = proc_q.dequeue_many(BATCH_SIZE)
some_op = [img_batch, lab_batch]
# service ops
init_op = tf.initialize_all_variables()
# let run stuff
with tf.Session() as sess:
sess.run(init_op)
# Fill the file-name queue once with all (filename, label) pairs.
sess.run(do_enq)
print "fnl_q.size:", fnl_q.size().eval()
print "proc_q.size:", proc_q.size().eval()
# --- test thread stuff ---
# - fill proc_q
coord = tf.train.Coordinator()
t = threading.Thread(target=load_and_enqueue, args = (sess, coord, feature_read_op, label_read_op , enqueue_op))
t.start()
# Let the loader run for 2.1 s, then ask it to stop and wait for it to finish;
# the loader sleeps 0.1 s per sample, which matches the 21 samples in the
# typical output.
time.sleep(2.1)
coord.request_stop()
coord.join([t])
print "fnl_q.size:", fnl_q.size().eval()
print "proc_q.size:", proc_q.size().eval()
# - process a bit
# Takes one batch of BATCH_SIZE samples out of proc_q.
ss = sess.run(some_op)
print 'ss[0].shape', ss[0].shape
print ' ss[1]:\n', ss[1]
print "fnl_q.size:", fnl_q.size().eval()
print "proc_q.size:", proc_q.size().eval()
print 'ok'
Typical output:
TF version: 0.6.0
fnl_q.size: 1225
proc_q.size: 0
fnl_q.size: 1204
proc_q.size: 21
ss[0].shape (16, 30, 30, 1)
ss[1]:
[[ 0. 1.]
[ 1. 0.]
[ 1. 0.]
[ 0. 1.]
[ 0. 1.]
[ 1. 0.]
[ 1. 0.]
[ 0. 1.]
[ 1. 0.]
[ 0. 1.]
[ 0. 1.]
[ 1. 0.]
[ 1. 0.]
[ 0. 1.]
[ 1. 0.]
[ 0. 1.]]
fnl_q.size: 1204
proc_q.size: 5
ok
All as expected
batch of pairs [sample label] are created
pairs are shuffled
Only thing left is to apply TF as it is intended to be used by replacing some_op :)
And a question:
One observed problem: if I use tf.FIFOQueue for the file names and tf.RandomShuffleQueue for the samples, shuffling doesn't happen. However, the other way around (as in the code above), it shuffles perfectly.
Any problem with shuffling for
tf.RandomShuffleQueue(len(fnames), 0, [tf.float32, tf.float32], shapes=[f_s, l_s]) ?
ADD:
The version with two threads:
one to refill/update/change the file-name queue
a second to fill the processing queue with samples.
I also added a correct way to stop the threads.
def load_and_enqueue(sess, coord, feature_read_op, label_read_op, enqueue_op):
    """Read samples one at a time and push them onto the processing queue.

    Runs until the coordinator asks for a stop or an op raises. The sample is
    fed to `enqueue_op` through the module-level placeholders `feature_input`
    and `label_input`.

    Args:
        sess: tf session used to run the ops.
        coord: tf Coordinator controlling this thread.
        feature_read_op: op producing one feature (image).
        label_read_op: op producing the label that belongs to the feature.
        enqueue_op: op that enqueues the fed [feature, label] pair.
    """
    try:
        while not coord.should_stop():
            feature, label = sess.run([feature_read_op, label_read_op])
            feed_dict = {feature_input: feature,
                         label_input: label}
            sess.run(enqueue_op, feed_dict=feed_dict)
    except Exception as e:
        # Hand the failure to the coordinator instead of dropping it. The
        # coordinator records an exception only if no stop was requested
        # before, so errors caused by closing the queues during a normal
        # shutdown are still ignored, while an unexpected loader failure
        # stops the other threads and is re-raised by coord.join().
        coord.request_stop(e)
# periodically check the state of fnl queue and if needed refill it
def enqueue_fnl(sess, coord, fnl_q, enqueue_op):
    """Keep the file-name/label queue topped up until a stop is requested.

    Every 0.5 s the queue size is read; when fewer than ``9 * BATCH_SIZE``
    elements are left, `enqueue_op` is run once.

    Args:
        sess: tf session used to run the ops.
        coord: tf Coordinator controlling this thread.
        fnl_q: the file-name/label queue to watch.
        enqueue_op: 'refill' op for the file-name/label queue.
    """
    try:
        # Build the size op once: calling fnl_q.size() inside the loop would
        # add a new op to the graph on every iteration.
        size_op = fnl_q.size()
        while not coord.should_stop():
            time.sleep(0.5)
            s = sess.run(size_op)
            if s < (9 * BATCH_SIZE):
                sess.run(enqueue_op)
    except Exception as e:
        # Hand the failure to the coordinator instead of dropping it. The
        # coordinator records an exception only if no stop was requested
        # before, so errors caused by closing the queue during a normal
        # shutdown are still ignored, while an unexpected failure stops the
        # other threads and is re-raised by coord.join().
        coord.request_stop(e)
# -- ops for feed part --
# filenames and labels are pre-loaded
fv = tf.constant(fnames)
lv = tf.constant(ohl)
# read op
# Capacity is twice the number of files so that a refill fits while part of
# the previous fill is still queued.
fnl_q = tf.RandomShuffleQueue(len(fnames)*2, 0, [tf.string, tf.float32], name = 'fnl_q') # add some margin for re-fill to fit
do_fnl_enq = fnl_q.enqueue_many([fv, lv])
feature_read_op, label_read_op = img_reader_jpg(fnl_q, ch = IMG_SIZE[2])
# samples queue
# Placeholders through which the loader thread feeds one sample and its label.
feature_input = tf.placeholder(tf.float32, shape=IMG_SIZE, name='feature_input')
label_input = tf.placeholder(tf.float32, shape=LAB_SIZE, name='label_input')
proc_q = tf.FIFOQueue(len(fnames)*3, [tf.float32, tf.float32], shapes=[IMG_SIZE, LAB_SIZE], name = 'fe_la_q')
enqueue_op = proc_q.enqueue([feature_input, label_input])
# -- ops for training and eval
img_batch, lab_batch = proc_q.dequeue_many(BATCH_SIZE)
... here is your model
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, lab_ph))
optimizer = tf.train.AdamOptimizer(1e-4).minimize(loss)
# NOTE(review): no variable initialisation and no initial fill of fnl_q are
# visible in this snippet; the first fill appears to come from the enqueue_fnl
# thread once it sees the queue below its threshold -- confirm.
with tf.Session() as sess:
coord = tf.train.Coordinator()
# t_le moves samples from the file-name queue into the processing queue.
t_le = threading.Thread(target=load_and_enqueue, args = (sess, coord, feature_read_op, label_read_op , enqueue_op) , name = 'load_and_enqueue')
t_re = threading.Thread(target=enqueue_fnl, args = (sess, coord, fnl_q, do_fnl_enq), name = 'enqueue_fnl') # re-enq thread i.e. refilling filename queue
t_le.start()
t_re.start()
try:
# training
for step in xrange(823):
# some proc
# Pull one batch out of the processing queue, then feed it to the model
# through its own placeholders.
img_v, lab_v = sess.run([img_batch, lab_batch])
feed_dict = { img_ph : img_v,
lab_ph : lab_v,
keep_prob: 0.7}
_, loss_v = sess.run([optimizer, loss], feed_dict = feed_dict)
except Exception as e:
# A training failure is only printed; the shutdown below still runs.
print 'Training: Exception:', e
# stop threads
coord.request_stop() # ask to stop
# Closing both queues cancels pending enqueues, so a thread blocked on a full
# queue does not keep waiting.
sess.run(fnl_q.close(cancel_pending_enqueues=True)) # tell fnl_q not to wait for enqueues anymore
sess.run(proc_q.close(cancel_pending_enqueues=True)) # tell proc_q not to wait for enqueues anymore
# Wait up to 8 s (stop_grace_period_secs) for both threads to finish.
coord.join([t_le, t_re], stop_grace_period_secs=8)