Related
I am working on an artifical neural network which I have created via subclassing.
The subclassing looks like this:
import time
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import scipy.stats as si
import sympy as sy
from sympy.stats import Normal, cdf
from sympy import init_printing
class DGMNet(tf.keras.Model):
def __init__(self, n_layers, n_nodes, dimensions=1):
"""
Parameters:
- n_layers: number of layers
- n_nodes: number of nodes in (inner) layers
- dimensions: number of spacial dimensions
"""
super().__init__()
self.n_layers = n_layers
self.initial_layer = DenseLayer(dimensions + 1, n_nodes, activation="relu")
self.lstmlikelist = []
for _ in range(self.n_layers):
self.lstmlikelist.append(LSTMLikeLayer(dimensions + 1, n_nodes, activation="relu"))
self.final_layer = DenseLayer(n_nodes, 1, activation=None)
def call(self, t, x):
X = tf.concat([t,x], 1)
S = self.initial_layer.call(X)
for i in range(self.n_layers):
S = self.lstmlikelist[i].call({'S': S, 'X': X})
result = self.final_layer.call(S)
return result
class DenseLayer(tf.keras.layers.Layer):
def __init__(self, n_inputs, n_outputs, activation):
"""
Parameters:
- n_inputs: number of inputs
- n_outputs: number of outputs
- activation: activation function
"""
super(DenseLayer, self).__init__()
self.n_inputs = n_inputs
self.n_outputs = n_outputs
self.W = self.add_weight(shape=(self.n_inputs, self.n_outputs),
initializer='random_normal',
trainable=True)
self.b = self.add_weight(shape=(1, self.n_outputs),
initializer='random_normal',
trainable=True)
self.activation = _get_function(activation)
def call(self, inputs):
S = tf.add(tf.matmul(inputs, self.W), self.b)
S = self.activation(S)
return S
class LSTMLikeLayer(tf.keras.layers.Layer):
def __init__(self, n_inputs, n_outputs, activation):
"""
Parameters:
- n_inputs: number of inputs
- n_outputs: number of outputs
- activation: activation function
"""
super(LSTMLikeLayer, self).__init__()
self.n_outputs = n_outputs
self.n_inputs = n_inputs
self.Uz = self.add_variable("Uz", shape=[self.n_inputs, self.n_outputs])
self.Ug = self.add_variable("Ug", shape=[self.n_inputs, self.n_outputs])
self.Ur = self.add_variable("Ur", shape=[self.n_inputs, self.n_outputs])
self.Uh = self.add_variable("Uh", shape=[self.n_inputs, self.n_outputs])
self.Wz = self.add_variable("Wz", shape=[self.n_outputs, self.n_outputs])
self.Wg = self.add_variable("Wg", shape=[self.n_outputs, self.n_outputs])
self.Wr = self.add_variable("Wr", shape=[self.n_outputs, self.n_outputs])
self.Wh = self.add_variable("Wh", shape=[self.n_outputs, self.n_outputs])
self.bz = self.add_variable("bz", shape=[1, self.n_outputs])
self.bg = self.add_variable("bg", shape=[1, self.n_outputs])
self.br = self.add_variable("br", shape=[1, self.n_outputs])
self.bh = self.add_variable("bh", shape=[1, self.n_outputs])
self.activation = _get_function(activation)
def call(self, inputs):
S = inputs['S']
X = inputs['X']
Z = self.activation(tf.add(tf.add(tf.matmul(X, self.Uz), tf.matmul(S, self.Wz)), self.bz))
G = self.activation(tf.add(tf.add(tf.matmul(X, self.Ug), tf.matmul(S, self.Wg)), self.bg))
R = self.activation(tf.add(tf.add(tf.matmul(X, self.Ur), tf.matmul(S, self.Wr)), self.br))
H = self.activation(tf.add(tf.add(tf.matmul(X, self.Uh), tf.matmul(tf.multiply(S, R), self.Wh)), self.bh))
Snew = tf.add(tf.multiply(tf.subtract(tf.ones_like(G), G), H), tf.multiply(Z, S))
return Snew
def _get_function(name):
f = None
if name == "tanh":
f = tf.nn.tanh
elif name == "sigmoid":
f = tf.nn.sigmoid
elif name == "relu":
f = tf.nn.relu
elif not name:
f = tf.identity
assert f is not None
return f
# Sampling
def sampler(N1, N2, N3):
np.random.seed(42)
# Sampler #1: PDE domain
t1 = np.random.uniform(low=T0,
high=T,
size=[N1,1])
s1 = np.random.uniform(low=S1,
high=S2,
size=[N1,1])
# Sampler #2: boundary condition
t2 = np.zeros(shape=(1, 1))
s2 = np.zeros(shape=(1, 1))
# Sampler #3: initial/terminal condition
t3 = T * np.ones((N3,1)) #Terminal condition
s3 = np.random.uniform(low=S1,
high=S2,
size=[N3,1])
return (t1, s1, t2, s2, t3, s3)
# Loss function
def loss(model, t1, x1, t2, x2, t3, x3):
# Loss term #1: PDE
V = model(t1, x1)
V_t = tf.gradients(V, t1)[0]
V_x = tf.gradients(V, x1)[0]
V_xx = tf.gradients(V_x, x1)[0]
f = V_t + r*x1*V_x + 0.5*sigma**2*x1**2*V_xx - r*V
L1 = tf.reduce_mean(tf.square(f))
# Loss term #2: boundary condition
#L2 = tf.reduce_mean(tf.square(V))
# Loss term #3: initial/terminal condition
L3 = tf.reduce_mean(tf.square(model(t3, x3) - tf.math.maximum(x3-K,0)))
return (L1, L3)
# B-S's analytical known solution
def analytical_solution(t, x):
#C = SN(d1) - Xe- rt N(d2)
#S: spot price
#K: strike price
#T: time to maturity
#r: interest rate
#sigma: volatility of underlying asset
d1 = (np.log(x / K) + (r + 0.5 * sigma ** 2) * T) / (sigma * np.sqrt(T))
d2 = (np.log(x / K) + (r - 0.5 * sigma ** 2) * T) / (sigma * np.sqrt(T))
call = (x * si.norm.cdf(d1, 0.0, 1.0) - K * np.exp(-r * T) * si.norm.cdf(d2, 0.0, 1.0))
return call
# Set random seeds
np.random.seed(42)
tf.random.set_seed(42)
# Strike price
K = 0.5
# PDE parameters
r = 0.05 # Interest rate
sigma = 0.25 # Volatility
# Time limits
T0 = 0.0 + 1e-10 # Initial time
T = 1.0 # Terminal time
# Space limits
S1 = 0.0 + 1e-10 # Low boundary
S2 = 1.0 # High boundary
# Number of samples
NS_1 = 1000
NS_2 = 0
NS_3 = 100
t1, s1, t2, s2, t3, s3 = sampler(NS_1, NS_2, NS_3)
Now what I want to do is to iterate over different parameters and create a new ann for each iteration.
My plan was to do it in this way:
tf.compat.v1.disable_eager_execution()
t1_t = tf.compat.v1.placeholder(tf.float32, [None,1])
x1_t = tf.compat.v1.placeholder(tf.float32, [None,1])
t2_t = tf.compat.v1.placeholder(tf.float32, [None,1])
x2_t = tf.compat.v1.placeholder(tf.float32, [None,1])
t3_t = tf.compat.v1.placeholder(tf.float32, [None,1])
x3_t = tf.compat.v1.placeholder(tf.float32, [None,1])
volatility_list = [0.08]#[0.08, 0.16, 0.18, 0.2, 0.28]
stages_list = [10]#, 50, 100]
layers_list = [3]#, 5, 7]
npl_list = [3]#, 6, 9, 12, 15]
for sigma in volatility_list:
for st in stages_list:
for lay in layers_list:
for npl in npl_list:
# Neural Network definition
num_layers = lay
nodes_per_layer = npl
ann = DGMNet(num_layers, nodes_per_layer)
L1_t, L3_t = loss(ann, t1_t, x1_t, t2_t, x2_t, t3_t, x3_t)
loss_t = L1_t + L3_t
# Optimizer parameters
global_step = tf.Variable(1, trainable=False)
starter_learning_rate = 0.001
learning_rate = tf.compat.v1.train.exponential_decay(starter_learning_rate, global_step,
100000, 0.96, staircase=True)
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss_t)
# Training parameters
steps_per_sample = st
sampling_stages = 100#2000
# Plot tensors
tplot_t = tf.compat.v1.placeholder(tf.float32, [None,1], name="tplot_t") # We name to recover it later
xplot_t = tf.compat.v1.placeholder(tf.float32, [None,1], name="xplot_t")
vplot_t = tf.identity(ann(tplot_t, xplot_t), name="vplot_t") # Trick for naming the trained model
# Training data holders
sampling_stages_list = []
elapsed_time_list = []
loss_list = []
L1_list = []
L3_list = []
# Train network!!
init_op = tf.compat.v1.global_variables_initializer()
sess = tf.compat.v1.Session()
sess.run(init_op)
for i in range(sampling_stages):
t1, x1, t2, x2, t3, x3 = sampler(NS_1, NS_2, NS_3)
start_time = time.clock()
for _ in range(steps_per_sample):
loss, L1, L3, _ = sess.run([loss_t, L1_t, L3_t, optimizer],
feed_dict = {t1_t:t1, x1_t:x1, t2_t:t2, x2_t:x2, t3_t:t3, x3_t:x3})
end_time = time.clock()
elapsed_time = end_time - start_time
sampling_stages_list.append(i)
elapsed_time_list.append(elapsed_time)
loss_list.append(loss)
L1_list.append(L1)
L3_list.append(L3)
text = "Stage: {:04d}, Loss: {:e}, L1: {:e}, L3: {:e}, {:f} seconds".format(i, loss, L1, L3, elapsed_time)
print(text)
#goodness of fit
time_0 = 0
listofzeros = [time_0] * 100
prices_for_goodness = np.linspace(S1,S2, 100)
goodness_list = []
solution_goodness = analytical_solution(listofzeros, prices_for_goodness)
ttt = time_0*np.ones_like(prices_for_goodness.reshape(-1,1))
nn_goodness, = sess.run([vplot_t],
feed_dict={tplot_t:ttt, xplot_t:prices_for_goodness.reshape(-1,1)})
deviation_list = np.abs(solution_goodness - nn_goodness)/(T-T0)
print("{0:.2f}%".format(np.average(deviation_list)*100))
Unfortunately as soon as it ends the first iteration I get a TypeError that 'numpy.float32' object is not callable
Error Traceback:
TypeError Traceback (most recent call last)
<ipython-input-14-bb14643d0c42> in <module>()
10
11
---> 12 L1_t, L3_t = loss(ann, t1_t, x1_t, t2_t, x2_t, t3_t, x3_t)
13 loss_t = L1_t + L3_t
14
TypeError: 'numpy.float32' object is not callable
I guess that the problem is with the creation of the placeholders, however I am not sure how to solve it. Maybe one of you can help me
Thanks in advance!
Chris
Did you create a variable called 'loss'? It seems that the loss function is redefined by a variable with the same name, so then python tries to call that variable as a function.
I am trying to use the Triple-Loss technique to fine-tune an EfficientNet network for human Re-ID using Keras. Here is the code I am using:
This is the generator:
class SampleGen(object):
def __init__(self, file_class_mapping):
self.file_class_mapping = file_class_mapping
self.class_to_list_files = defaultdict(list)
self.list_all_files = list(file_class_mapping.keys())
self.range_all_files = list(range(len(self.list_all_files)))
for file, class_ in file_class_mapping.items():
self.class_to_list_files[class_].append(file)
self.list_classes = list(set(self.file_class_mapping.values()))
self.range_list_classes = range(len(self.list_classes))
self.class_weight = np.array([len(self.class_to_list_files[class_]) for class_ in self.list_classes])
self.class_weight = self.class_weight / np.sum(self.class_weight)
def get_sample(self):
class_idx = np.random.choice(self.range_list_classes, 1, p=self.class_weight)[0]
examples_class_idx = np.random.choice(range(len(self.class_to_list_files[self.list_classes[class_idx]])), 2)
positive_example_1, positive_example_2 = \
self.class_to_list_files[self.list_classes[class_idx]][examples_class_idx[0]], \
self.class_to_list_files[self.list_classes[class_idx]][examples_class_idx[1]]
negative_example = None
while negative_example is None or self.file_class_mapping[negative_example] == \
self.file_class_mapping[positive_example_1]:
negative_example_idx = np.random.choice(self.range_all_files, 1)[0]
negative_example = self.list_all_files[negative_example_idx]
return positive_example_1, negative_example, positive_example_2
def read_and_resize(filepath):
im = Image.open((filepath)).convert('RGB')
im = im.resize((image_size, image_size))
return np.array(im, dtype="float32")
def augment(im_array):
if np.random.uniform(0, 1) > 0.9:
im_array = np.fliplr(im_array)
return im_array
def gen(triplet_gen):
while True:
list_positive_examples_1 = []
list_negative_examples = []
list_positive_examples_2 = []
for i in range(batch_size):
positive_example_1, negative_example, positive_example_2 = triplet_gen.get_sample()
path_pos1 = join(path_train, positive_example_1)
path_neg = join(path_train, negative_example)
path_pos2 = join(path_train, positive_example_2)
positive_example_1_img = read_and_resize(path_pos1)
negative_example_img = read_and_resize(path_neg)
positive_example_2_img = read_and_resize(path_pos2)
positive_example_1_img = augment(positive_example_1_img)
negative_example_img = augment(negative_example_img)
positive_example_2_img = augment(positive_example_2_img)
list_positive_examples_1.append(positive_example_1_img)
list_negative_examples.append(negative_example_img)
list_positive_examples_2.append(positive_example_2_img)
A = preprocess_input(np.array(list_positive_examples_1))
B = preprocess_input(np.array(list_positive_examples_2))
C = preprocess_input(np.array(list_negative_examples))
label = None
yield {'anchor_input': A, 'positive_input': B, 'negative_input': C}, label
This is how I create the model:
def get_model():
base_model = efn.EfficientNetB3(weights='imagenet', include_top=False)
for layer in base_model.layers:
layer.trainable = False
x = base_model.output
x = Dropout(0.6)(x)
x = Dense(embedding_dim)(x)
x = Lambda(lambda x: K.l2_normalize(x, axis=1), name="enc_out")(x)
embedding_model = Model(base_model.input, x, name="embedding")
input_shape = (image_size, image_size, 3)
anchor_input = Input(input_shape, name='anchor_input')
positive_input = Input(input_shape, name='positive_input')
negative_input = Input(input_shape, name='negative_input')
anchor_embedding = embedding_model(anchor_input)
positive_embedding = embedding_model(positive_input)
negative_embedding = embedding_model(negative_input)
inputs = [anchor_input, positive_input, negative_input]
outputs = [anchor_embedding, positive_embedding, negative_embedding]
triplet_model = Model(inputs, outputs)
triplet_model.add_loss(K.mean(triplet_loss(outputs)))
return embedding_model, triplet_model
And this is how I'm trying to run the training:
if __name__ == '__main__':
data = pd.read_csv(path_csv)
train, test = train_test_split(data, train_size=0.7, random_state=1337)
file_id_mapping_train = {k: v for k, v in zip(train.Image.values, train.Id.values)}
file_id_mapping_test = {k: v for k, v in zip(test.Image.values, test.Id.values)}
gen_tr = gen(SampleGen(file_id_mapping_train))
gen_te = gen(SampleGen(file_id_mapping_test))
embedding_model, triplet_model = get_model()
for i, layer in enumerate(embedding_model.layers):
print(i, layer.name, layer.trainable)
for layer in embedding_model.layers[379:]:
layer.trainable = True
for layer in embedding_model.layers[:379]:
layer.trainable = False
triplet_model.compile(loss=None, optimizer=Adam(0.0001))
history = triplet_model.fit(x=gen_tr,
validation_data=gen_te,
epochs=10,
verbose=1,
steps_per_epoch=200,
validation_steps=20,
callbacks=create_callbacks())
The csv contains two columns (Image, Id) and I am generating triplets on the go using a generator. The layer 379 is the last layer of the network so I just leave that as trainable. I let it run for some epochs and it seems like it doesn't converge, it stays around 2.30. On epochs like 20, the loss is even higher than what I've started with. Here you can see what I mean: train example Is there anything wrong with the way I think about the problem?
Thank you!
I am using tensorflow 2.0 and trying to make a actor critic algorithm to play the game of cartpole. I have done everything right but getting the following error: ValueError: No gradients provided for any variable: ['dense/kernel:0', 'dense/bias:0', 'dense_1/kernel:0', 'dense_1/bias:0'].
Please help me out
Here is my code:
import gym
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
MAX_EPISODES = 2000
GAMMA = 0.9
LR_A = 0.001
LR_C = 0.01
env = gym.make("CartPole-v0")
N_ACTIONS = env.action_space.n
N_FEATURES = 4
def make_actor(n_features, n_actions):
inputs = tf.keras.Input(shape=[n_features])
hidden = tf.keras.layers.Dense(20, activation=tf.nn.relu)(inputs)
dist = tf.keras.layers.Dense(n_actions, activation=tf.nn.softmax)(hidden)
model = tf.keras.Model(inputs=inputs, outputs=dist)
return model
def make_critic(n_features):
inputs = tf.keras.Input(shape=[n_features])
hidden = tf.keras.layers.Dense(20, activation=tf.nn.relu)(inputs)
value = tf.keras.layers.Dense(1)(hidden)
model = tf.keras.Model(inputs=inputs, outputs=value)
return model
actor = make_actor(N_FEATURES, N_ACTIONS)
critic = make_critic(N_FEATURES)
actor.summary()
critic.summary()
actor_optimizer = tf.keras.optimizers.Adam(LR_A)
critic_optimizer = tf.keras.optimizers.Adam(LR_C)
def loss_actor(s, a, td_error):
dist = actor(s.reshape(1, 4)).numpy()
log_prob = np.log(dist[0, a])
exp_v = np.mean(log_prob * td_error)
return tf.multiply(exp_v, -1)
def loss_critic(s, s_, r, gamma):
s, s_ = s[np.newaxis, :], s_[np.newaxis, :]
v = critic(s)
v_ = critic(s_)
td_error = r + gamma * v_ - v
return tf.multiply(td_error, 1)
def train(max_episodes):
for episode in range(max_episodes):
s = env.reset().astype(np.float32)
t = 0
track_r = []
while True:
dist = actor(s.reshape(1, 4)).numpy()
a = np.random.choice(range(N_ACTIONS), p=dist.ravel())
s_, r, done, info = env.step(a)
s_ = s_.astype(np.float32)
if done: r=-20
track_r.append(r)
with tf.GradientTape() as cri_tape, tf.GradientTape() as act_tape:
td_error = loss_critic(s, s_, r, GAMMA)
gradient = cri_tape.gradient(td_error, critic.trainable_variables)
critic_optimizer.apply_gradients(zip(gradient,critic.trainable_variables))
with tf.GradientTape() as act_tape:
neg_exp_v = loss_actor(s, a, td_error.numpy())
gradient = act_tape.gradient(neg_exp_v, critic.trainable_variables)
actor_optimizer.apply_gradients(zip(gradient, actor.trainable_variables))
s = s_
t += 1
if done:
print("Episode:{} Steps:{}".format(episode+1, t))
train(MAX_EPISODES)
The error is on line 69:actor_optimizer.apply_gradients(zip(gradient, actor.trainable_variables))
When I tried to print out the gradients for the actor the result was None.
I am really not getting where the problem is.
Is there a way in Keras to retrieve the cell state (i.e., c vector) of a LSTM layer at every timestep of a given input?
It seems the return_state argument returns the last cell state after the computation is done, but I need also the intermediate ones. Also, I don't want to pass these cell states to the next layer, I only want to be able to access them.
Preferably using TensorFlow as backend.
Thanks
I was looking for a solution to this issue and after reading the guidance for creating your own custom RNN Cell in tf.keras (https://www.tensorflow.org/api_docs/python/tf/keras/layers/AbstractRNNCell), I believe the following is the most concise and easy to read way of doing this for Tensorflow 2:
import tensorflow as tf
from tensorflow.keras.layers import LSTMCell
class LSTMCellReturnCellState(LSTMCell):
def call(self, inputs, states, training=None):
real_inputs = inputs[:,:self.units] # decouple [h, c]
outputs, [h,c] = super().call(real_inputs, states, training=training)
return tf.concat([h, c], axis=1), [h,c]
num_units = 512
test_input = tf.random.uniform([5,100,num_units])
rnn = tf.keras.layers.RNN(LSTMCellReturnCellState(num_units),
return_sequences=True, return_state=True)
whole_seq_output, final_memory_state, final_carry_state = rnn(test_input)
print(whole_seq_output.shape)
>>> (5,100,1024)
# Hidden state sequence
h_seq = whole_seq_output[:,:,:num_units] # (5,100,512)
# Cell state sequence
c_seq = whole_seq_output[:,:,num_units:] # (5,100,512)
As mentioned in an above solution, you can see the advantage of this is that it can be easily wrapped into tf.keras.layers.RNN as a drop-in for the normal LSTMCell.
Here is a Colab Notebook with the code running as expected for tensorflow==2.6.0
I know it's pretty late, I hope this can help.
what you are asking, technically, is possible by modifying the LSTM-cell in call method. I modify it and make it return 4 dimension instead of 3 when you give return_sequences=True.
Code
from keras.layers.recurrent import _generate_dropout_mask
class Mod_LSTMCELL(LSTMCell):
def call(self, inputs, states, training=None):
if 0 < self.dropout < 1 and self._dropout_mask is None:
self._dropout_mask = _generate_dropout_mask(
K.ones_like(inputs),
self.dropout,
training=training,
count=4)
if (0 < self.recurrent_dropout < 1 and
self._recurrent_dropout_mask is None):
self._recurrent_dropout_mask = _generate_dropout_mask(
K.ones_like(states[0]),
self.recurrent_dropout,
training=training,
count=4)
# dropout matrices for input units
dp_mask = self._dropout_mask
# dropout matrices for recurrent units
rec_dp_mask = self._recurrent_dropout_mask
h_tm1 = states[0] # previous memory state
c_tm1 = states[1] # previous carry state
if self.implementation == 1:
if 0 < self.dropout < 1.:
inputs_i = inputs * dp_mask[0]
inputs_f = inputs * dp_mask[1]
inputs_c = inputs * dp_mask[2]
inputs_o = inputs * dp_mask[3]
else:
inputs_i = inputs
inputs_f = inputs
inputs_c = inputs
inputs_o = inputs
x_i = K.dot(inputs_i, self.kernel_i)
x_f = K.dot(inputs_f, self.kernel_f)
x_c = K.dot(inputs_c, self.kernel_c)
x_o = K.dot(inputs_o, self.kernel_o)
if self.use_bias:
x_i = K.bias_add(x_i, self.bias_i)
x_f = K.bias_add(x_f, self.bias_f)
x_c = K.bias_add(x_c, self.bias_c)
x_o = K.bias_add(x_o, self.bias_o)
if 0 < self.recurrent_dropout < 1.:
h_tm1_i = h_tm1 * rec_dp_mask[0]
h_tm1_f = h_tm1 * rec_dp_mask[1]
h_tm1_c = h_tm1 * rec_dp_mask[2]
h_tm1_o = h_tm1 * rec_dp_mask[3]
else:
h_tm1_i = h_tm1
h_tm1_f = h_tm1
h_tm1_c = h_tm1
h_tm1_o = h_tm1
i = self.recurrent_activation(x_i + K.dot(h_tm1_i,
self.recurrent_kernel_i))
f = self.recurrent_activation(x_f + K.dot(h_tm1_f,
self.recurrent_kernel_f))
c = f * c_tm1 + i * self.activation(x_c + K.dot(h_tm1_c,
self.recurrent_kernel_c))
o = self.recurrent_activation(x_o + K.dot(h_tm1_o,
self.recurrent_kernel_o))
else:
if 0. < self.dropout < 1.:
inputs *= dp_mask[0]
z = K.dot(inputs, self.kernel)
if 0. < self.recurrent_dropout < 1.:
h_tm1 *= rec_dp_mask[0]
z += K.dot(h_tm1, self.recurrent_kernel)
if self.use_bias:
z = K.bias_add(z, self.bias)
z0 = z[:, :self.units]
z1 = z[:, self.units: 2 * self.units]
z2 = z[:, 2 * self.units: 3 * self.units]
z3 = z[:, 3 * self.units:]
i = self.recurrent_activation(z0)
f = self.recurrent_activation(z1)
c = f * c_tm1 + i * self.activation(z2)
o = self.recurrent_activation(z3)
h = o * self.activation(c)
if 0 < self.dropout + self.recurrent_dropout:
if training is None:
h._uses_learning_phase = True
return tf.expand_dims(tf.concat([h,c],axis=0),0), [h, c]
Sample code
# create a cell
test = Mod_LSTMCELL(100)
# Input timesteps=10, features=7
in1 = Input(shape=(10,7))
out1 = RNN(test, return_sequences=True)(in1)
M = Model(inputs=[in1],outputs=[out1])
M.compile(keras.optimizers.Adam(),loss='mse')
ans = M.predict(np.arange(7*10,dtype=np.float32).reshape(1, 10, 7))
print(ans.shape)
# state_h
print(ans[0,0,0,:])
# state_c
print(ans[0,0,1,:])
First, this is not possible do with the tf.keras.layers.LSTM. You have to use LSTMCell instead or subclass LSTM. Second, there is no need to subclass LSTMCell to get the sequence of cell states. LSTMCell already returns a list of the hidden state (h) and cell state (c) everytime you call it.
For those not familiar with LSTMCell, it takes in the current [h, c] tensors, and the input at the current timestep (it cannot take in a sequence of times) and returns the activations, and the updated [h,c].
Here is an example of showing how to use LSTMCell to process a sequence of timesteps and to return the accumulated cell states.
# example inputs
inputs = tf.convert_to_tensor(np.random.rand(3, 4), dtype='float32') # 3 timesteps, 4 features
h_c = [tf.zeros((1,2)), tf.zeros((1,2))] # must initialize hidden/cell state for lstm cell
h_c = tf.convert_to_tensor(h_c, dtype='float32')
lstm = tf.keras.layers.LSTMCell(2)
# example of how you accumulate cell state over repeated calls to LSTMCell
inputs = tf.unstack(inputs, axis=0)
c_states = []
for cur_inputs in inputs:
out, h_c = lstm(tf.expand_dims(cur_inputs, axis=0), h_c)
h, c = h_c
c_states.append(c)
You can access the states of any RNN by setting return_sequences = True in the initializer. You can find more information about this parameter here.
I am a beginner in machine learning and neural networks. Recently, after watching Andrew Ng's lectures on deep learning, I tried to implement a binary classifier using deep neural networks on my own.
However, the cost of the function is expected to decrease after each iteration.
In my program, it decreases slightly in the beginning, but rapidly increases later. I tried to make changes in learning rate and number of iterations, but to no avail. I am very confused.
Here is my code
1. Neural network classifier class
class NeuralNetwork:
def __init__(self, X, Y, dimensions, alpha=1.2, iter=3000):
self.X = X
self.Y = Y
self.dimensions = dimensions # Including input layer and output layer. Let example be dimensions=4
self.alpha = alpha # Learning rate
self.iter = iter # Number of iterations
self.length = len(self.dimensions)-1
self.params = {} # To store parameters W and b for each layer
self.cache = {} # To store cache Z and A for each layer
self.grads = {} # To store dA, dZ, dW, db
self.cost = 1 # Initial value does not matter
def initialize(self):
np.random.seed(3)
# If dimensions is 4, then layer 0 and 3 are input and output layers
# So we only need to initialize w1, w2 and w3
# There is no need of w0 for input layer
for l in range(1, len(self.dimensions)):
self.params['W'+str(l)] = np.random.randn(self.dimensions[l], self.dimensions[l-1])*0.01
self.params['b'+str(l)] = np.zeros((self.dimensions[l], 1))
def forward_propagation(self):
self.cache['A0'] = self.X
# For last layer, ie, the output layer 3, we need to activate using sigmoid
# For layer 1 and 2, we need to use relu
for l in range(1, len(self.dimensions)-1):
self.cache['Z'+str(l)] = np.dot(self.params['W'+str(l)], self.cache['A'+str(l-1)]) + self.params['b'+str(l)]
self.cache['A'+str(l)] = relu(self.cache['Z'+str(l)])
l = len(self.dimensions)-1
self.cache['Z'+str(l)] = np.dot(self.params['W'+str(l)], self.cache['A'+str(l-1)]) + self.params['b'+str(l)]
self.cache['A'+str(l)] = sigmoid(self.cache['Z'+str(l)])
def compute_cost(self):
m = self.Y.shape[1]
A = self.cache['A'+str(len(self.dimensions)-1)]
self.cost = -1/m*np.sum(np.multiply(self.Y, np.log(A)) + np.multiply(1-self.Y, np.log(1-A)))
self.cost = np.squeeze(self.cost)
def backward_propagation(self):
A = self.cache['A' + str(len(self.dimensions) - 1)]
m = self.X.shape[1]
self.grads['dA'+str(len(self.dimensions)-1)] = -(np.divide(self.Y, A) - np.divide(1-self.Y, 1-A))
# Sigmoid derivative for final layer
l = len(self.dimensions)-1
self.grads['dZ' + str(l)] = self.grads['dA' + str(l)] * sigmoid_prime(self.cache['Z' + str(l)])
self.grads['dW' + str(l)] = 1 / m * np.dot(self.grads['dZ' + str(l)], self.cache['A' + str(l - 1)].T)
self.grads['db' + str(l)] = 1 / m * np.sum(self.grads['dZ' + str(l)], axis=1, keepdims=True)
self.grads['dA' + str(l - 1)] = np.dot(self.params['W' + str(l)].T, self.grads['dZ' + str(l)])
# Relu derivative for previous layers
for l in range(len(self.dimensions)-2, 0, -1):
self.grads['dZ'+str(l)] = self.grads['dA'+str(l)] * relu_prime(self.cache['Z'+str(l)])
self.grads['dW'+str(l)] = 1/m*np.dot(self.grads['dZ'+str(l)], self.cache['A'+str(l-1)].T)
self.grads['db'+str(l)] = 1/m*np.sum(self.grads['dZ'+str(l)], axis=1, keepdims=True)
self.grads['dA'+str(l-1)] = np.dot(self.params['W'+str(l)].T, self.grads['dZ'+str(l)])
def update_parameters(self):
for l in range(1, len(self.dimensions)):
self.params['W'+str(l)] = self.params['W'+str(l)] - self.alpha*self.grads['dW'+str(l)]
self.params['b'+str(l)] = self.params['b'+str(l)] - self.alpha*self.grads['db'+str(l)]
def train(self):
np.random.seed(1)
self.initialize()
for i in range(self.iter):
#print(self.params)
self.forward_propagation()
self.compute_cost()
self.backward_propagation()
self.update_parameters()
if i % 100 == 0:
print('Cost after {} iterations is {}'.format(i, self.cost))
2. Testing code for odd or even number classifier
import numpy as np
from main import NeuralNetwork
X = np.array([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
Y = np.array([[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]])
clf = NeuralNetwork(X, Y, [1, 1, 1], alpha=0.003, iter=7000)
clf.train()
3. Helper Code
import math
import numpy as np
def sigmoid_scalar(x):
return 1/(1+math.exp(-x))
def sigmoid_prime_scalar(x):
return sigmoid_scalar(x)*(1-sigmoid_scalar(x))
def relu_scalar(x):
if x > 0:
return x
else:
return 0
def relu_prime_scalar(x):
if x > 0:
return 1
else:
return 0
sigmoid = np.vectorize(sigmoid_scalar)
sigmoid_prime = np.vectorize(sigmoid_prime_scalar)
relu = np.vectorize(relu_scalar)
relu_prime = np.vectorize(relu_prime_scalar)
Output
I believe your cross-entropy derivative is wrong. Instead of this:
# WRONG!
self.grads['dA'+str(len(self.dimensions)-1)] = -(np.divide(self.Y, A) - np.divide(1-self.Y, A))
... do this:
# CORRECT
self.grads['dA'+str(len(self.dimensions)-1)] = np.divide(A - self.Y, (1 - A) * A)
See these lecture notes for the details. I think you meant the formula (5), but forgot 1-A. Anyway, use formula (6).