Gaussian Process Regression in Tensorflow 2.0 leads to no gradients? - tensorflow2.0

The following code is basically from the documentation, slightly converted to run in tensorflow 2.0. The gradients are all None. I'm not sure if this is a bug or just something I am missing:
(corrected code)
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions
psd_kernels = tfp.positive_semidefinite_kernels
tf.keras.backend.set_floatx('float64')
f = lambda x: np.sin(10*x[..., 0]) * np.exp(-x[..., 0]**2)
observation_index_points = np.random.uniform(-1., 1., 50)[..., np.newaxis]
observations = f(observation_index_points) + np.random.normal(0., .05, 50)
class Model(tf.keras.models.Model):
    """Keras model exposing a GP prior and a GP regression posterior.

    Three unconstrained scalar variables (``amplitude_``, ``length_scale_``
    and ``observation_noise_variance_``) are passed through ``tf.exp`` to
    obtain the values handed to the kernel and to the distributions.
    """

    def __init__(self):
        super(Model, self).__init__()

        # Unconstrained parameters.
        self.amplitude_ = tf.Variable(np.float64(0), trainable=True)
        self.length_scale_ = tf.Variable(np.float64(0), trainable=True)
        self.observation_noise_variance_ = tf.Variable(
            np.float64(-5), trainable=True)

        # NOTE(review): the exp transforms and the kernel are evaluated once,
        # here in the constructor, i.e. before any GradientTape is opened by
        # the caller -- presumably why the reported gradients are None.
        self.amplitude = tf.exp(self.amplitude_, name='amplitude')
        self.length_scale = tf.exp(self.length_scale_, name='length_scale')
        self.observation_noise_variance = tf.exp(
            self.observation_noise_variance_,
            name='observation_noise_variance')
        self.kernel = psd_kernels.ExponentiatedQuadratic(
            self.amplitude, self.length_scale)

    def gp(self, observation_index_points):
        """Return a ``GaussianProcess`` over ``observation_index_points``."""
        prior = tfd.GaussianProcess(
            kernel=self.kernel,
            index_points=observation_index_points,
            observation_noise_variance=self.observation_noise_variance)
        return prior

    def call(self, observation_index_points, observations, index_points):
        """Return the regression model at ``index_points`` given the data."""
        posterior = tfd.GaussianProcessRegressionModel(
            kernel=self.kernel,
            index_points=index_points,
            observation_index_points=observation_index_points,
            observations=observations,
            observation_noise_variance=self.observation_noise_variance)
        return posterior
optimizer = tf.keras.optimizers.Adam(learning_rate=.05)
# We can construct the posterior at a new set of `index_points` using the same
# kernel (with the same parameters, which we'll optimize below).
index_points = np.linspace(-1., 1., 100)[..., np.newaxis]
model = Model()
gprm = model(observation_index_points, observations, index_points)
gp = model.gp(observation_index_points)
gp.log_prob(observations)
samples = gprm.sample(10)
trainable_variables = [model.amplitude_, model.length_scale_, model.observation_noise_variance_]
with tf.GradientTape() as tape:
loss = -gp.log_prob(observations)
print(loss)
g = tape.gradient(loss, trainable_variables)
print(g)
UPDATE:
The following example now works. I am wondering whether there is a better pattern for organizing this flow in TF 2.0.
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
tfb = tfp.bijectors
tfd = tfp.distributions
psd_kernels = tfp.positive_semidefinite_kernels
m = 1000
n = 3
x = np.random.randn(m, n).astype(np.float32)
y = np.random.randn(m).astype(np.float32)
x_ = np.random.randn(100, n).astype(np.float32)
class GPRMatern(tf.keras.models.Model):
    """GP regression with a Matern-5/2 kernel and a noise-variance variable."""

    def __init__(self, feature_ndims=1):
        # NOTE(review): `feature_ndims` is accepted but not used anywhere in
        # this class; the kernel is built with its default arguments.
        super(GPRMatern, self).__init__()
        self.kernel = psd_kernels.MaternFiveHalves()
        self.observation_noise_variance = tf.Variable(
            np.float32(.01), name='obs_noise_variance')

    def gprm(self, x_obs, y_obs, x):
        """Return the regression model at ``x`` given ``(x_obs, y_obs)``."""
        posterior = tfd.GaussianProcessRegressionModel(
            kernel=self.kernel,
            index_points=x,
            observation_index_points=x_obs,
            observations=y_obs,
            observation_noise_variance=self.observation_noise_variance)
        return posterior

    def nll_for_train(self, x_obs, y_obs):
        """Return the negated mean log-probability of ``y_obs`` at ``x_obs``."""
        prior = tfd.GaussianProcess(
            kernel=self.kernel,
            index_points=x_obs,
            observation_noise_variance=self.observation_noise_variance)
        log_likelihood = prior.log_prob(y_obs)
        return -tf.reduce_mean(log_likelihood)
class GPRExpQuad(tf.keras.models.Model):
    """GP regression with an exponentiated-quadratic kernel.

    ``amplitude``, ``length_scale`` and ``observation_noise_variance`` are
    stored unconstrained and passed through ``tf.exp`` every time they are
    used, so the transform runs inside whatever ``tf.GradientTape`` the
    caller has open.
    """

    def __init__(self):
        super().__init__()
        self.amplitude = tf.Variable(np.float32(0.0), name='amplitude')
        self.length_scale = tf.Variable(np.float32(0.0), name='length_scale')
        self.observation_noise_variance = tf.Variable(
            np.float32(-5.0), name='obs_noise_variance')

    # The decorator is required: the methods below pass `self.kernel`
    # (no call parentheses) as the `kernel=` argument, so it must evaluate
    # to a kernel object rather than to a bound method.
    @property
    def kernel(self):
        """Build a fresh kernel from the current variable values."""
        return psd_kernels.ExponentiatedQuadratic(
            tf.exp(self.amplitude), tf.exp(self.length_scale))

    def nll_for_train(self, x_obs, y_obs):
        """Return the negated mean log-probability of ``y_obs`` at ``x_obs``."""
        gp = tfd.GaussianProcess(
            kernel=self.kernel,
            index_points=x_obs,
            observation_noise_variance=tf.exp(self.observation_noise_variance))
        return -tf.reduce_mean(gp.log_prob(y_obs))

    def gprm(self, x_obs, y_obs, x):
        """Return the regression model at ``x`` given ``(x_obs, y_obs)``."""
        return tfd.GaussianProcessRegressionModel(
            kernel=self.kernel,
            index_points=x,
            observation_index_points=x_obs,
            observations=y_obs,
            observation_noise_variance=tf.exp(self.observation_noise_variance))
def test_model(model=GPRMatern):
    """Instantiate ``model`` and run ten Adam steps on its ``nll_for_train``.

    Uses the module-level ``x`` and ``y`` arrays as training data and prints
    the trainable variables by name after every step.
    """
    instance = model()
    adam = tf.keras.optimizers.Adam(learning_rate=0.01)
    for _ in range(10):
        with tf.GradientTape() as tape:
            nll = instance.nll_for_train(x, y)
        grads = tape.gradient(nll, instance.trainable_variables)
        adam.apply_gradients(zip(grads, instance.trainable_variables))
        # NOTE(review): the original indentation was lost; the print is
        # assumed to be inside the training loop -- confirm.
        print({var.name: var.numpy() for var in instance.trainable_variables})
matern = GPRMatern()
expquad = GPRExpQuad()
test_matern = lambda : test_model(model=GPRMatern)
test_expquad = lambda : test_model(model=GPRExpQuad)

Related

Tensor shapes for FFJORD bijector

I want to fit an FFJORD bijector to transform a two-dimensional dataset. The code is below (it is a simplified version of my original code, but it has the same problem).
import tensorflow as tf
import tensorflow_probability as tfp
tfb = tfp.bijectors
tfd = tfp.distributions
class ODE(tf.keras.layers.Layer):
    """Two-layer dense network used as the FFJORD state-derivative function."""

    def __init__(self):
        super().__init__()
        self.dense_layer1 = tf.keras.layers.Dense(4, activation='tanh')
        self.dense_layer2 = tf.keras.layers.Dense(2)

    def call(self, t, inputs):
        """Map ``inputs`` through both dense layers; ``t`` is not used."""
        hidden = self.dense_layer1(inputs)
        return self.dense_layer2(hidden)
ode = ODE()
ffjord = tfb.FFJORD(state_time_derivative_fn = ode)
base_distr = tfd.MultivariateNormalDiag(loc = tf.zeros(2), scale_diag = tf.ones(2))
td = tfd.TransformedDistribution(distribution = base_distr, bijector = ffjord)
x = tf.keras.Input(shape = (2,), dtype = tf.float32)
log_prob = td.log_prob(x)
model = tf.keras.Model(x, log_prob)
def NLL(y, log_prob):
    """Negative log-likelihood loss: negate ``log_prob``; ``y`` is ignored."""
    negative_log_likelihood = -log_prob
    return negative_log_likelihood
model.compile(optimizer = tf.optimizers.Adam(1.0e-2), loss = NLL)
history = model.fit(x = X_train, y = np.zeros(X_train.shape[0]), epochs = 100, verbose = 0, batch_size = 128)
I get an error at the line log_prob = td.log_prob(x): ValueError: Cannot convert a partially known TensorShape to a Tensor: (None, 2)
If I try to draw a sample from the transformed distribution with td.sample(), it produces another error, but td.sample(1) works, as do some other calls, for example:
x = tf.constant([[2.0, 3.0]])
ode(-1.0, x)
ffjord.inverse(x)
ffjord.forward(x)
td.log_prob(td.sample(5))
I guess that there is some problem with shapes, but can't understand where it is.

How does one use keras add_weight() vars with tensorflow probability distributions?

I am creating a new Keras layer which accepts a vector of input data and is parameterized by two scalars, a mean and a standard deviation. I model the input data as a normal distribution and estimate its mean and variance through gradient descent. However, when I initialize tfp.Normal(mu, sigma), where mu and sigma come from add_weight() during build(), the gradients do not propagate through mu and sigma.
The tensorflow probability documentation states that you can pass in training variables for distribution parameters and backprop through them. How do I get this to work inside of keras?
Below is a minimal working example.
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
tfk = tf.keras
tfkl = tf.keras.layers
tfd = tfp.distributions
tfpl = tfp.layers
EPS = 1e-5
batch_size = 4
N = 100
x = np.random.randn(batch_size, N)
class NormalLikelihood(tf.keras.layers.Layer):
    """Layer returning the clipped Normal density of its input.

    The distribution is parameterized by two scalar weights created in
    ``build``: ``mu`` (name "mean") and ``sigma`` (name "std").
    """

    def __init__(self):
        super().__init__()

    def build(self, input_shape):
        mean_init = tf.keras.initializers.RandomNormal(mean=0.0, stddev=1)
        self.mu = self.add_weight(
            "mean", shape=[1], initializer=mean_init, dtype=tf.float32)

        std_init = tf.keras.initializers.RandomUniform(
            minval=EPS, maxval=5.0, seed=None)
        self.sigma = self.add_weight(
            "std", shape=[1], initializer=std_init,
            constraint=tf.keras.constraints.non_neg(), dtype=tf.float32)

        # The distribution is built once, here, from the indexed weights.
        self.distribution = tfp.distributions.Normal(
            self.mu[0], self.sigma[0])

    def call(self, input):
        density = self.distribution.prob(input)
        # Keep the density strictly inside (0, 1).
        return tf.clip_by_value(density, 1e-3, 1-1e-3)
input_layer = tf.keras.layers.Input(shape=(100,))
r = NormalLikelihood()(input_layer)
r = -tf.reduce_sum(tf.math.log(r))
model = tf.keras.models.Model(input_layer, r)
model.add_loss(r)
model.compile(optimizer='rmsprop', loss=None)
model.fit(x, y=None)
This code results in builtins.ValueError: No gradients provided for any variable: ['normal_likelihood/mean:0', 'normal_likelihood/std:0'] which is not expected. Desired behavior would be that ['normal_likelihood/mean:0', 'normal_likelihood/std:0'] have gradients provided for them.
See the code in google colab: https://colab.research.google.com/drive/1_u4XTCIH-2qwNSgv9zkZiCG_zeCIEZGp?usp=sharing
Change tfp.distributions.Normal(self.mu[0], self.sigma[0]) to tfp.distributions.Normal(self.mu, self.sigma).
This works because, under the hood, the Keras .fit() method computes gradients with respect to the trainable variables. When you index into a weight of the model (self.mu[0]), you get a plain tensor instead of the variable, so the loss is computed from a constant and the chain-rule connection back to the variable is lost.
Example:
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
EPS = 1e-5
class NormalLikelihoodYours(tf.keras.layers.Layer):
    """Variant that builds the Normal from indexed weights (``mu[0]``)."""

    def __init__(self):
        super().__init__()

    def build(self, input_shape):
        mean_init = tf.keras.initializers.RandomNormal(mean=0.0, stddev=1)
        self.mu = self.add_weight(
            "mean", shape=[1], initializer=mean_init, dtype=tf.float32)

        std_init = tf.keras.initializers.RandomUniform(
            minval=EPS, maxval=5.0, seed=None)
        self.sigma = self.add_weight(
            "std", shape=[1], initializer=std_init,
            constraint=tf.keras.constraints.non_neg(), dtype=tf.float32)

        # Indexing with [0] is intentional: this class reproduces the
        # original behaviour for comparison.
        self.distribution = tfp.distributions.Normal(
            self.mu[0], self.sigma[0])

    def call(self, input):
        density = self.distribution.prob(input)
        return tf.clip_by_value(density, 1e-3, 1-1e-3)
class NormalLikelihoodMine(tf.keras.layers.Layer):
    """Variant that builds the Normal directly from the weight variables."""

    def __init__(self):
        super().__init__()

    def build(self, input_shape):
        mean_init = tf.keras.initializers.RandomNormal(mean=0.0, stddev=1)
        self.mu = self.add_weight(
            "mean", shape=[1], initializer=mean_init, dtype=tf.float32)

        std_init = tf.keras.initializers.RandomUniform(
            minval=EPS, maxval=5.0, seed=None)
        self.sigma = self.add_weight(
            "std", shape=[1], initializer=std_init,
            constraint=tf.keras.constraints.non_neg(), dtype=tf.float32)

        # The weights are passed un-indexed.
        self.distribution = tfp.distributions.Normal(self.mu, self.sigma)

    def call(self, input):
        density = self.distribution.prob(input)
        return tf.clip_by_value(density, 1e-3, 1-1e-3)
# loss function
def calc_loss(logits):
    """Return the negated sum of the element-wise log of ``logits``."""
    log_values = tf.math.log(logits)
    return -tf.math.reduce_sum(log_values)
# model input
input_layer = tf.keras.layers.Input(shape=(100,))
x_in = tf.random.normal([4, 100])
# your model
your_output = NormalLikelihoodYours()(input_layer)
your_model = tf.keras.models.Model(input_layer, your_output)\
# my model
my_output = NormalLikelihoodMine()(input_layer)
my_model = tf.keras.models.Model(input_layer, my_output)
# yours has no gradients because the network weights are not
# included anywhere in the loss calculation. When you index them
# with `[0]` they go from being trainable variables in the network,
# to just constants.
with tf.GradientTape() as tape:
y_hat = your_model(x_in)
loss = calc_loss(y_hat)
print(tape.gradient(loss, your_model.trainable_variables))
# [None, None]
# my model has gradients because `loss` and the weights in
# `trainable_variables` are connected
with tf.GradientTape() as tape:
y_hat = my_model(x_in)
loss = calc_loss(y_hat)
print(tape.gradient(loss, my_model.trainable_variables))
# [<tf.Tensor: shape=(1,), numpy=array([43.83749], dtype=float32)>,
# <tf.Tensor: shape=(1,), numpy=array([-37.348656], dtype=float32)>]

ValueError: Dimensions must be equal, but are 2 and 1 in time2vec example

I have 2 inputs and 4 outputs. I want to use the time2vec to predict the outputs. I have used the code in https://towardsdatascience.com/time2vec-for-time-series-features-encoding-a03a4f3f937e, it works for one input and one output. But when I want to use for (2 inputs and four outputs) it gives me the following error:
import numpy as np
import tensorflow as tf
from keras.layers import Dense, Dropout, Activation, Flatten, LSTM, Embedding, Input, concatenate,
Lambda
from sklearn.preprocessing import MinMaxScaler
from keras.callbacks import EarlyStopping
import keras
import random
import os
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.optimizers import *
from tensorflow.keras import backend as K
from kerashypetune import KerasGridSearch
import matplotlib.pyplot as plt
w = 5
ts = 10
nt = 10
ntest = nt + int(percent*nt)
X_train = np.random.rand(90,5,2)
X_test = np.random.rand(5,5,2)
y_train = np.random.rand(90,4)
y_test = np.random.rand(5,4)
### DEFINE T2V LAYER ###
class T2V(Layer):
    """Time2Vec layer: concatenates a sine projection with a linear term."""

    def __init__(self, output_dim=None, **kwargs):
        self.output_dim = output_dim
        super(T2V, self).__init__(**kwargs)

    def build(self, input_shape):
        weight_kwargs = dict(initializer='uniform', trainable=True)
        # Weights of the sine (periodic) part.
        self.W = self.add_weight(
            name='W', shape=(1, self.output_dim), **weight_kwargs)
        self.P = self.add_weight(
            name='P', shape=(1, self.output_dim), **weight_kwargs)
        # Weights of the linear part.
        self.w = self.add_weight(name='w', shape=(1, 1), **weight_kwargs)
        self.p = self.add_weight(name='p', shape=(1, 1), **weight_kwargs)
        super(T2V, self).build(input_shape)

    def call(self, x):
        linear_part = self.w * x + self.p
        periodic_part = K.sin(K.dot(x, self.W) + self.P)
        return K.concatenate([periodic_part, linear_part], -1)
### CREATE GENERATOR FOR LSTM AND T2V ###
sequence_length = w
def gen_sequence(id_df, seq_length, seq_cols):
    """Yield consecutive windows of ``seq_length`` rows from ``id_df``.

    ``seq_cols`` selects the columns; each yielded item is the 2-D slice of
    rows ``[k, k + seq_length)``. The window that would end on the last row
    is not produced, so ``n_rows - seq_length`` windows are yielded.
    """
    values = id_df[seq_cols].values
    window_count = values.shape[0] - seq_length
    for offset in range(window_count):
        yield values[offset:offset + seq_length, :]
def gen_labels(id_df, seq_length, label):
    """Return the rows of ``id_df[label]`` from index ``seq_length`` onward.

    The result is sliced with two indices, so ``label`` must select a 2-D
    block (e.g. a list of column names).
    """
    return id_df[label].values[seq_length:, :]
### DEFINE MODEL STRUCTURES ###
def set_seed_TF2(seed):
    """Seed Python hashing, ``random``, NumPy and TensorFlow with ``seed``."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
def T2V_NN(param, dim):
    """Build and compile the T2V -> LSTM -> Dense(2) model.

    ``param`` supplies 't2v_dim', 'unit', 'act' and 'lr'; the input has
    shape ``(dim, 2)``.
    """
    inputs = Input(shape=(dim,2))
    encoded = T2V(param['t2v_dim'])(inputs)
    recurrent = LSTM(param['unit'], activation=param['act'])(encoded)
    outputs = Dense(2)(recurrent)
    network = Model(inputs, outputs)
    network.compile(loss='mse', optimizer=Adam(lr=param['lr']))
    return network
def NN(param, dim):
    """Build and compile the baseline LSTM -> Dense(2) model.

    ``param`` supplies 'unit', 'act' and 'lr'; the input has shape
    ``(dim, 2)``.
    """
    inputs = Input(shape=(dim,2))
    recurrent = LSTM(param['unit'], activation=param['act'])(inputs)
    outputs = Dense(2)(recurrent)
    network = Model(inputs, outputs)
    network.compile(loss='mse', optimizer=Adam(lr=param['lr']))
    return network
### PARAM GRID ###
param_grid = {'unit': [64,32],'t2v_dim': [128,64],'lr': [1e-2,1e-3], 'act': ['elu','relu'], 'epochs': 1,'batch_size': [512,1024]}
### FIT T2V + LSTM ###
es = EarlyStopping(patience=5, verbose=0, min_delta=0.001, monitor='val_loss', mode='auto',
restore_best_weights=True)
hypermodel = lambda x: T2V_NN(param=x, dim=sequence_length)
kgs_t2v = KerasGridSearch(hypermodel, param_grid, monitor='val_loss', greater_is_better=False,
tuner_verbose=1)
kgs_t2v.set_seed(set_seed_TF2, seed=33)
kgs_t2v.search(X_train, y_train, validation_split=0.2, callbacks=[es], shuffle=False)
But when I run the model, I've got this error :
ValueError: Dimensions must be equal, but are 2 and 1 for '{{node t2v_2/MatMul}} = MatMul[T=DT_FLOAT,
transpose_a=false, transpose_b=false](t2v_2/Reshape, t2v_2/Reshape_1)' with input shapes: [?,2], [1,128].
Could you help me to solve this?
You have to change the weight shapes inside the T2V layer, and the output size of your network, so that the shapes match:
class T2V(Layer):
    """Time2Vec layer whose weight shapes are derived from the input shape."""

    def __init__(self, output_dim=None, **kwargs):
        self.output_dim = output_dim
        super(T2V, self).__init__(**kwargs)

    def build(self, input_shape):
        axis1_len = input_shape[1]
        last_dim = input_shape[-1]
        weight_kwargs = dict(initializer='uniform', trainable=True)
        # Weights of the sine (periodic) part.
        self.W = self.add_weight(
            name='W', shape=(last_dim, self.output_dim), **weight_kwargs)
        self.P = self.add_weight(
            name='P', shape=(axis1_len, self.output_dim), **weight_kwargs)
        # Weights of the linear part.
        self.w = self.add_weight(
            name='w', shape=(axis1_len, 1), **weight_kwargs)
        self.p = self.add_weight(
            name='p', shape=(axis1_len, 1), **weight_kwargs)
        super(T2V, self).build(input_shape)

    def call(self, x):
        linear_part = self.w * x + self.p
        periodic_part = K.sin(K.dot(x, self.W) + self.P)
        return K.concatenate([periodic_part, linear_part], -1)
create a dummy example
n_sample = 90
timesteps = 5
feat_inp = 2
feat_out = 4
X = np.random.uniform(0,1, (n_sample, timesteps, feat_inp))
y = np.random.uniform(0,1, (n_sample, feat_out))
def T2V_NN():
    """Build and compile T2V(32) -> LSTM(8) -> Dense(feat_out).

    Reads the module-level ``timesteps``, ``feat_inp`` and ``feat_out``.
    """
    inputs = Input(shape=(timesteps,feat_inp))
    encoded = T2V(32)(inputs)
    recurrent = LSTM(8)(encoded)
    outputs = Dense(feat_out)(recurrent)
    network = Model(inputs, outputs)
    network.compile(loss='mse', optimizer='adam')
    return network
model = T2V_NN()
model.fit(X,y, epochs=3)

The method to use gradient accumulate in BERT finetune

I was fine-tuning BERT and ran into OOM issues. I heard that a good way to handle this is gradient accumulation. Below is my optimization.py (including the gradient accumulation):
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import tensorflow as tf
from tensorflow.python.training import optimizer
from tensorflow.python.framework import ops
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu,
                     accumulation_steps=1):
    """Creates an optimizer training op.

    Args:
      loss: scalar loss tensor to minimize.
      init_lr: initial (peak) learning rate.
      num_train_steps: number of steps over which the rate decays to 0.
      num_warmup_steps: number of linear warmup steps; falsy disables warmup.
      use_tpu: if true, wrap the optimizer in a `CrossShardOptimizer`.
      accumulation_steps: forwarded to the optimizer as `n`. The default of 1
        keeps the previous behaviour.

    Returns:
      An op that applies one optimizer call and increments the global step.
    """
    global_step = tf.train.get_or_create_global_step()

    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

    # Implements linear decay of the learning rate.
    learning_rate = tf.train.polynomial_decay(
        learning_rate,
        global_step,
        num_train_steps,
        end_learning_rate=0.0,
        power=1.0,
        cycle=False)

    # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
    # learning rate will be `global_step/num_warmup_steps * init_lr`.
    if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = init_lr * warmup_percent_done

        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        learning_rate = (
            (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)

    # It is recommended that you use this optimizer for fine tuning, since this
    # is how the model was trained (note that the Adam m/v variables are NOT
    # loaded from init_checkpoint.)
    # The local is named `opt` so it does not shadow the imported `optimizer`
    # module.
    opt = MultistepAdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,  # 0.98 ONLY USED FOR PRETRAIN. MUST CHANGE AT FINE-TUNING 0.999,
        n=accumulation_steps,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    if use_tpu:
        opt = tf.contrib.tpu.CrossShardOptimizer(opt)

    tvars = tf.trainable_variables()
    grads = tf.gradients(loss, tvars)

    # This is how the model was pre-trained.
    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

    # Materialize the pairs: the optimizer walks `grads_and_vars` more than
    # once, and on Python 3 a bare `zip` is exhausted after the first pass.
    grads_and_vars = list(zip(grads, tvars))
    train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)

    # Normally the global step update is done inside of `apply_gradients`.
    # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
    # a different optimizer, you should probably take this line out.
    # NOTE(review): the step is incremented on every call, including calls
    # that only accumulate gradients, so the schedule above advances per call.
    new_global_step = global_step + 1
    train_op = tf.group(train_op, [global_step.assign(new_global_step)])
    return train_op
class MultistepAdamWeightDecayOptimizer(optimizer.Optimizer):
    """A basic Adam optimizer with "correct" L2 weight decay and accumulation.

    Each call to `apply_gradients` either adds the gradient to a per-variable
    `grad_acc` slot, or -- when the internal `iter` counter equals 0 -- applies
    an Adam update using `(grad_acc + grad) / n` and resets the accumulator.
    The counter advances modulo `n` after every call.
    """

    def __init__(self,
                 learning_rate,
                 weight_decay_rate=0.0,
                 beta_1=0.9,
                 beta_2=0.999,
                 n=1,
                 epsilon=1e-6,
                 exclude_from_weight_decay=None,
                 name="MultistepAdamWeightDecayOptimizer"):
        """Constructs a MultistepAdamWeightDecayOptimizer.

        Args:
          learning_rate: scalar or tensor learning rate.
          weight_decay_rate: coefficient of the decay term added to the update.
          beta_1: decay rate of the first-moment estimate.
          beta_2: decay rate of the second-moment estimate.
          n: modulus of the `iter` counter, and the divisor applied to the
            accumulated gradient.
          epsilon: constant added to the denominator of the update.
          exclude_from_weight_decay: regex patterns; variables whose name
            matches any of them get no weight decay.
          name: optimizer name.
        """
        super(MultistepAdamWeightDecayOptimizer, self).__init__(False, name)

        self.learning_rate = learning_rate
        self.weight_decay_rate = weight_decay_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self._n = n
        self.exclude_from_weight_decay = exclude_from_weight_decay
        self._n_t = None  # tensor version of `_n`, set in `_prepare`

    def _prepare(self):
        """Converts `n` to a tensor."""
        super(MultistepAdamWeightDecayOptimizer, self)._prepare()
        self._n_t = tf.convert_to_tensor(self._n, name="n")

    def _create_slots(self, var_list):
        """Creates the `iter` counter and one `grad_acc` slot per variable."""
        super(MultistepAdamWeightDecayOptimizer, self)._create_slots(var_list)
        first_var = min(var_list, key=lambda x: x.name)
        # With n == 1 the counter starts at 0, so the first call already
        # applies an update; otherwise it starts at 1.
        self._create_non_slot_variable(initial_value=0 if self._n == 1 else 1,
                                       name="iter",
                                       colocate_with=first_var)
        for v in var_list:
            self._zeros_slot(v, "grad_acc", self._name)

    def _get_iter_variable(self):
        """Returns the `iter` counter variable."""
        if tf.contrib.eager.in_eager_mode():
            graph = None
        else:
            graph = tf.get_default_graph()
        return self._get_non_slot_variable("iter", graph=graph)

    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """See base class."""
        # `grads_and_vars` is traversed twice below; materialize it so that a
        # one-shot iterator such as `zip(...)` is not exhausted by the first
        # pass.
        grads_and_vars = list(grads_and_vars)

        update_ops = []
        var_list = [v for g, v in grads_and_vars if g is not None]

        with ops.init_scope():
            self._create_slots(var_list)
        self._prepare()

        for grad, param in grads_and_vars:
            if grad is None or param is None:
                continue

            grad_acc = self.get_slot(param, "grad_acc")
            param_name = self._get_variable_name(param.name)

            m = tf.get_variable(
                name=param_name + "/adam_m",
                shape=param.shape.as_list(),
                dtype=tf.float32,
                trainable=False,
                initializer=tf.zeros_initializer())
            v = tf.get_variable(
                name=param_name + "/adam_v",
                shape=param.shape.as_list(),
                dtype=tf.float32,
                trainable=False,
                initializer=tf.zeros_initializer())

            def _apply_adam(grad_acc, grad, param, m, v):
                """Applies one Adam step, then zeroes the accumulator."""
                total_grad = (grad_acc + grad) / tf.cast(self._n_t, grad.dtype)

                # Standard Adam update.
                next_m = (
                    tf.multiply(self.beta_1, m) +
                    tf.multiply(1.0 - self.beta_1, total_grad))
                next_v = (
                    tf.multiply(self.beta_2, v) +
                    tf.multiply(1.0 - self.beta_2, tf.square(total_grad)))

                update = next_m / (tf.sqrt(next_v) + self.epsilon)

                # The decay term is added to the update itself rather than to
                # the loss, so it does not pass through the m/v statistics.
                if self._do_use_weight_decay(param_name):
                    update += self.weight_decay_rate * param

                update_with_lr = self.learning_rate * update
                next_param = param - update_with_lr

                adam_op = tf.group(param.assign(next_param),
                                   m.assign(next_m),
                                   v.assign(next_v))
                # Reset the accumulator only after the update has used it.
                with tf.control_dependencies([adam_op]):
                    grad_acc_to_zero_op = grad_acc.assign(
                        tf.zeros_like(grad_acc),
                        use_locking=self._use_locking)
                return tf.group(adam_op, grad_acc_to_zero_op)

            def _accumulate_gradient(grad_acc, grad):
                """Adds `grad` to the accumulator without touching `param`."""
                assign_op = tf.assign_add(
                    grad_acc, grad, use_locking=self._use_locking)
                return tf.group(assign_op)

            update_op = tf.cond(
                tf.equal(self._get_iter_variable(), 0),
                lambda: _apply_adam(grad_acc, grad, param, m, v),
                lambda: _accumulate_gradient(grad_acc, grad))
            update_ops.append(update_op)

        apply_updates = self._finish(update_ops, name_scope=name)
        return apply_updates

    def _finish(self, update_ops, name_scope):
        """Advances `iter` modulo `n` after all updates, and groups the ops."""
        iter_ = self._get_iter_variable()
        with tf.control_dependencies(update_ops):
            with tf.colocate_with(iter_):
                update_iter = iter_.assign(tf.mod(iter_ + 1, self._n_t),
                                           use_locking=self._use_locking)
        return tf.group(
            *update_ops + [update_iter], name=name_scope)

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if not self.weight_decay_rate:
            return False
        if self.exclude_from_weight_decay:
            for r in self.exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True

    def _get_variable_name(self, param_name):
        """Get the variable name from the tensor name."""
        m = re.match("^(.*):\\d+$", param_name)
        if m is not None:
            param_name = m.group(1)
        return param_name
After I switched to this optimization.py, I could use a large batch. But the loss did not decrease, and after 300 steps (I have 550,000 training examples, batch size 64, 1000 iterations and 20 epochs) it reported "train loop marked as finished" and stopped.
I am not sure what the problem is. Could you please help me out? Thanks.

How to input csv data in an autoencoder

I am using the code below that implements an autoencoder. How can I feed the autoencoder with data for training and testing?
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
class Autoencoder(object):
    """Single-hidden-layer autoencoder built on a TF1 graph and session.

    The constructor builds the graph (placeholder, hidden layer,
    reconstruction, cost, training op), opens a `tf.Session` and runs the
    global variable initializer.
    """

    def __init__(self, n_input, n_hidden, transfer_function=tf.nn.softplus,
                 optimizer=None):
        """Build the model.

        Args:
          n_input: width of one input row.
          n_hidden: width of the hidden layer.
          transfer_function: activation applied to the hidden layer.
          optimizer: optimizer exposing `minimize`; when None a fresh
            `tf.train.AdamOptimizer()` is created for this instance.
        """
        # A default written in the signature would be built once, when the
        # class is defined, and shared by every instance; create it per
        # instance instead.
        if optimizer is None:
            optimizer = tf.train.AdamOptimizer()

        self.n_input = n_input
        self.n_hidden = n_hidden
        self.transfer = transfer_function

        network_weights = self._initialize_weights()
        self.weights = network_weights

        # model
        self.x = tf.placeholder(tf.float32, [None, self.n_input])
        self.hidden = self.transfer(
            tf.add(tf.matmul(self.x, self.weights['w1']), self.weights['b1']))
        self.reconstruction = tf.add(
            tf.matmul(self.hidden, self.weights['w2']), self.weights['b2'])

        # cost: half the summed squared reconstruction error
        self.cost = 0.5 * tf.reduce_sum(
            tf.pow(tf.subtract(self.reconstruction, self.x), 2.0))
        self.optimizer = optimizer.minimize(self.cost)

        init = tf.global_variables_initializer()
        self.sess = tf.Session()
        self.sess.run(init)

    def _initialize_weights(self):
        """Create the weight/bias variables and return them keyed by name."""
        all_weights = dict()
        all_weights['w1'] = tf.get_variable(
            "w1", shape=[self.n_input, self.n_hidden],
            initializer=tf.contrib.layers.xavier_initializer())
        all_weights['b1'] = tf.Variable(
            tf.zeros([self.n_hidden], dtype=tf.float32))
        all_weights['w2'] = tf.Variable(
            tf.zeros([self.n_hidden, self.n_input], dtype=tf.float32))
        all_weights['b2'] = tf.Variable(
            tf.zeros([self.n_input], dtype=tf.float32))
        return all_weights

    def partial_fit(self, X):
        """Run one training step on batch `X` and return its cost."""
        cost, opt = self.sess.run((self.cost, self.optimizer),
                                  feed_dict={self.x: X})
        return cost

    def calc_total_cost(self, X):
        """Return the cost on `X` without running the training op."""
        return self.sess.run(self.cost, feed_dict={self.x: X})

    def transform(self, X):
        """Return the hidden-layer activations for `X`."""
        return self.sess.run(self.hidden, feed_dict={self.x: X})

    def generate(self, hidden=None):
        """Decode `hidden`; when None, a random normal row is used instead."""
        if hidden is None:
            hidden = self.sess.run(tf.random_normal([1, self.n_hidden]))
        return self.sess.run(self.reconstruction,
                             feed_dict={self.hidden: hidden})

    def reconstruct(self, X):
        """Return the reconstruction of `X`."""
        return self.sess.run(self.reconstruction, feed_dict={self.x: X})

    def getWeights(self):
        """Return the current value of `w1`."""
        return self.sess.run(self.weights['w1'])

    def getBiases(self):
        """Return the current value of `b1`."""
        return self.sess.run(self.weights['b1'])
# Instantiate the Autoencoder class: 5 is the dimension of a raw input,
# 2 is the dimension of the hidden layer
autoencoder = Autoencoder(5, 2, transfer_function=tf.nn.softplus, optimizer
= tf.train.AdamOptimizer())
# I prepare my data**
IRIS_TRAINING = "C:\\Users\\Desktop\\iris_training.csv"
# Feeding data to the Autoencoder ???
# Train and test ??
How can I train this model with data from a CSV file? I think I need to run something like _, c = sess.run([optimizer, cost], feed_dict={self.x: batch_of_data}) inside a loop over epochs, but I am struggling with it.
Check out Stanford CS20SI's tutorial.
https://github.com/chiphuyen/tf-stanford-tutorials/blob/master/examples/05_csv_reader.py