tf.keras.layers.Cropping2D for pytorch - tensorflow

I'm currently working on a gan that generates time series and I want to tailor my generated time series to the desired time steps. In Tensorflow I always used "tf.keras.layers.Cropping2D" for this. Is there an equivalent for this in Pytorch?
class cont_cond_cnn_generator(nn.Module):
def __init__(self, nz=32, dim_embed=DIM_EMBED):
dim_embed=DIM_EMBED
super(cont_cond_cnn_generator, self).__init__()
self.z_dim = nz
self.dim_embed = dim_embed
self.dense = nn.Linear(self.z_dim, 3 * GEN_SIZE*16, bias=True)
self.dense = nn.Linear(self.z_dim, 4 * 4 * GEN_SIZE*16, bias=True)
self.final = nn.Conv1d(GEN_SIZE, channels, 3, stride=1, padding=1, bias=bias)
nn.init.xavier_uniform_(self.dense.weight.data, 1.)
nn.init.xavier_uniform_(self.final.weight.data, 1.)
self.genblock0 = ResBlockGenerator(in_channels=GEN_SIZE*16, out_channels = GEN_SIZE*8 , dim_embed=dim_embed) #4--->8 # 2. parameter original :GEN_SIZE*8
self.genblock1 = ResBlockGenerator(GEN_SIZE*8, GEN_SIZE*4, dim_embed=dim_embed) #8--->16 #2. parameter original :GEN_SIZE*4
self.genblock2 = ResBlockGenerator(GEN_SIZE*4,GEN_SIZE*2, dim_embed=dim_embed) #16--->32 # 2. parameter original :GEN_SIZE*2
self.genblock3 = ResBlockGenerator(GEN_SIZE*2, GEN_SIZE, dim_embed=dim_embed) #32--->64 2. parameter original :GEN_SIZE
self.final = nn.Sequential(
nn.BatchNorm1d(GEN_SIZE),
nn.ReLU(),
self.final,
nn.Sigmoid()
)
def forward(self, z, y):
z = z.view(z.size(0), z.size(1))
out = self.dense(z)
out = out.view(-1, GEN_SIZE*16, 3) #out = out.view(-1, GEN_SIZE*16, 4, 4)
out = self.genblock0(out, y)
out = self.genblock1(out, y)
out = self.genblock2(out, y)
out = self.genblock3(out, y)
out = self.final(out)
out = torchvision.transforms.functional.crop(out, 0,0,3,2201)
return out
I used the torchvision.transforms.functional.crop() and it works. But is it a good methode or would you recommend my another way to do that.

Related

How can I define the activation function as a hyperparameter in PyTorch through RAY.Tune?

This is the link to the main page of the topic I want.
https://docs.ray.io/en/latest/tune/examples/tune-pytorch-cifar.html#tune-pytorch-cifar-ref
But unfortunately, there is no good documentation to answer all the questions.
Also, if you know how I can define nested cross validation in this environment, please tell me.
If you look at
class Net(nn.Module):
def __init__(self, l1=120, l2=84):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(3, 6, 5)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.fc1 = nn.Linear(16 * 5 * 5, l1)
self.fc2 = nn.Linear(l1, l2)
self.fc3 = nn.Linear(l2, 10)
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = x.view(-1, 16 * 5 * 5)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
and
config = {
"l1": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
"l2": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
"lr": tune.loguniform(1e-4, 1e-1),
"batch_size": tune.choice([2, 4, 8, 16]),
}
then you can see that l1 and l2 in the init function are config parameters to be tuned for the Net architecyure. You can do the same for your use and add an arbitrary activation function as a parameter in the init of the Net class and add use a tune.choice for different activation in the config dictionary.
I solved this problem as follows.
n_samples,n_features=X_train.shape
class NeuralNetwork (nn.Module):
def __init__(self,n_input_features,l1, l2,l3,config):
super (NeuralNetwork, self).__init__()
self.config = config
self.linear1=nn.Linear(n_input_features,4*math.floor(n_input_features/2)+l1)
self.linear2=nn.Linear(l1+4*math.floor(n_input_features/2),math.floor(n_input_features/2)+l2)
self.linear3=nn.Linear(math.floor(n_input_features/2)+l2,math.floor(n_input_features/3)+l3)
self.linear4=nn.Linear(math.floor(n_input_features/3)+l3,math.floor(n_input_features/6))
self.linear5=nn.Linear(math.floor(n_input_features/6),1)
self.a1 = self.config.get("a1")
self.a2 = self.config.get("a2")
self.a3 = self.config.get("a3")
self.a4 = self.config.get("a4")
#staticmethod
def activation_func(act_str):
if act_str=="tanh" or act_str=="sigmoid":
return eval("torch."+act_str)
elif act_str=="silu" or act_str=="relu" or act_str=="leaky_relu" or act_str=="gelu":
return eval("torch.nn.functional."+act_str)
def forward(self,x):
out=self.linear1(x)
out=self.activation_func(self.a1)(out.float())
out=self.linear2(out)
out=self.activation_func(self.a2)(out.float())
out=self.linear3(out)
out=self.activation_func(self.a3)(out.float())
out=self.linear4(out)
out=self.activation_func(self.a3)(out.float())
out=torch.sigmoid(self.linear5(out))
y_predicted=out
return y_predicted

TypeError when trying to make a loop creating artificial neural networks

I am working on an artifical neural network which I have created via subclassing.
The subclassing looks like this:
import time
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import scipy.stats as si
import sympy as sy
from sympy.stats import Normal, cdf
from sympy import init_printing
class DGMNet(tf.keras.Model):
def __init__(self, n_layers, n_nodes, dimensions=1):
"""
Parameters:
- n_layers: number of layers
- n_nodes: number of nodes in (inner) layers
- dimensions: number of spacial dimensions
"""
super().__init__()
self.n_layers = n_layers
self.initial_layer = DenseLayer(dimensions + 1, n_nodes, activation="relu")
self.lstmlikelist = []
for _ in range(self.n_layers):
self.lstmlikelist.append(LSTMLikeLayer(dimensions + 1, n_nodes, activation="relu"))
self.final_layer = DenseLayer(n_nodes, 1, activation=None)
def call(self, t, x):
X = tf.concat([t,x], 1)
S = self.initial_layer.call(X)
for i in range(self.n_layers):
S = self.lstmlikelist[i].call({'S': S, 'X': X})
result = self.final_layer.call(S)
return result
class DenseLayer(tf.keras.layers.Layer):
def __init__(self, n_inputs, n_outputs, activation):
"""
Parameters:
- n_inputs: number of inputs
- n_outputs: number of outputs
- activation: activation function
"""
super(DenseLayer, self).__init__()
self.n_inputs = n_inputs
self.n_outputs = n_outputs
self.W = self.add_weight(shape=(self.n_inputs, self.n_outputs),
initializer='random_normal',
trainable=True)
self.b = self.add_weight(shape=(1, self.n_outputs),
initializer='random_normal',
trainable=True)
self.activation = _get_function(activation)
def call(self, inputs):
S = tf.add(tf.matmul(inputs, self.W), self.b)
S = self.activation(S)
return S
class LSTMLikeLayer(tf.keras.layers.Layer):
def __init__(self, n_inputs, n_outputs, activation):
"""
Parameters:
- n_inputs: number of inputs
- n_outputs: number of outputs
- activation: activation function
"""
super(LSTMLikeLayer, self).__init__()
self.n_outputs = n_outputs
self.n_inputs = n_inputs
self.Uz = self.add_variable("Uz", shape=[self.n_inputs, self.n_outputs])
self.Ug = self.add_variable("Ug", shape=[self.n_inputs, self.n_outputs])
self.Ur = self.add_variable("Ur", shape=[self.n_inputs, self.n_outputs])
self.Uh = self.add_variable("Uh", shape=[self.n_inputs, self.n_outputs])
self.Wz = self.add_variable("Wz", shape=[self.n_outputs, self.n_outputs])
self.Wg = self.add_variable("Wg", shape=[self.n_outputs, self.n_outputs])
self.Wr = self.add_variable("Wr", shape=[self.n_outputs, self.n_outputs])
self.Wh = self.add_variable("Wh", shape=[self.n_outputs, self.n_outputs])
self.bz = self.add_variable("bz", shape=[1, self.n_outputs])
self.bg = self.add_variable("bg", shape=[1, self.n_outputs])
self.br = self.add_variable("br", shape=[1, self.n_outputs])
self.bh = self.add_variable("bh", shape=[1, self.n_outputs])
self.activation = _get_function(activation)
def call(self, inputs):
S = inputs['S']
X = inputs['X']
Z = self.activation(tf.add(tf.add(tf.matmul(X, self.Uz), tf.matmul(S, self.Wz)), self.bz))
G = self.activation(tf.add(tf.add(tf.matmul(X, self.Ug), tf.matmul(S, self.Wg)), self.bg))
R = self.activation(tf.add(tf.add(tf.matmul(X, self.Ur), tf.matmul(S, self.Wr)), self.br))
H = self.activation(tf.add(tf.add(tf.matmul(X, self.Uh), tf.matmul(tf.multiply(S, R), self.Wh)), self.bh))
Snew = tf.add(tf.multiply(tf.subtract(tf.ones_like(G), G), H), tf.multiply(Z, S))
return Snew
def _get_function(name):
f = None
if name == "tanh":
f = tf.nn.tanh
elif name == "sigmoid":
f = tf.nn.sigmoid
elif name == "relu":
f = tf.nn.relu
elif not name:
f = tf.identity
assert f is not None
return f
# Sampling
def sampler(N1, N2, N3):
np.random.seed(42)
# Sampler #1: PDE domain
t1 = np.random.uniform(low=T0,
high=T,
size=[N1,1])
s1 = np.random.uniform(low=S1,
high=S2,
size=[N1,1])
# Sampler #2: boundary condition
t2 = np.zeros(shape=(1, 1))
s2 = np.zeros(shape=(1, 1))
# Sampler #3: initial/terminal condition
t3 = T * np.ones((N3,1)) #Terminal condition
s3 = np.random.uniform(low=S1,
high=S2,
size=[N3,1])
return (t1, s1, t2, s2, t3, s3)
# Loss function
def loss(model, t1, x1, t2, x2, t3, x3):
# Loss term #1: PDE
V = model(t1, x1)
V_t = tf.gradients(V, t1)[0]
V_x = tf.gradients(V, x1)[0]
V_xx = tf.gradients(V_x, x1)[0]
f = V_t + r*x1*V_x + 0.5*sigma**2*x1**2*V_xx - r*V
L1 = tf.reduce_mean(tf.square(f))
# Loss term #2: boundary condition
#L2 = tf.reduce_mean(tf.square(V))
# Loss term #3: initial/terminal condition
L3 = tf.reduce_mean(tf.square(model(t3, x3) - tf.math.maximum(x3-K,0)))
return (L1, L3)
# B-S's analytical known solution
def analytical_solution(t, x):
#C = SN(d1) - Xe- rt N(d2)
#S: spot price
#K: strike price
#T: time to maturity
#r: interest rate
#sigma: volatility of underlying asset
d1 = (np.log(x / K) + (r + 0.5 * sigma ** 2) * T) / (sigma * np.sqrt(T))
d2 = (np.log(x / K) + (r - 0.5 * sigma ** 2) * T) / (sigma * np.sqrt(T))
call = (x * si.norm.cdf(d1, 0.0, 1.0) - K * np.exp(-r * T) * si.norm.cdf(d2, 0.0, 1.0))
return call
# Set random seeds
np.random.seed(42)
tf.random.set_seed(42)
# Strike price
K = 0.5
# PDE parameters
r = 0.05 # Interest rate
sigma = 0.25 # Volatility
# Time limits
T0 = 0.0 + 1e-10 # Initial time
T = 1.0 # Terminal time
# Space limits
S1 = 0.0 + 1e-10 # Low boundary
S2 = 1.0 # High boundary
# Number of samples
NS_1 = 1000
NS_2 = 0
NS_3 = 100
t1, s1, t2, s2, t3, s3 = sampler(NS_1, NS_2, NS_3)
Now what I want to do is to iterate over different parameters and create a new ann for each iteration.
My plan was to do it in this way:
tf.compat.v1.disable_eager_execution()
t1_t = tf.compat.v1.placeholder(tf.float32, [None,1])
x1_t = tf.compat.v1.placeholder(tf.float32, [None,1])
t2_t = tf.compat.v1.placeholder(tf.float32, [None,1])
x2_t = tf.compat.v1.placeholder(tf.float32, [None,1])
t3_t = tf.compat.v1.placeholder(tf.float32, [None,1])
x3_t = tf.compat.v1.placeholder(tf.float32, [None,1])
volatility_list = [0.08]#[0.08, 0.16, 0.18, 0.2, 0.28]
stages_list = [10]#, 50, 100]
layers_list = [3]#, 5, 7]
npl_list = [3]#, 6, 9, 12, 15]
for sigma in volatility_list:
for st in stages_list:
for lay in layers_list:
for npl in npl_list:
# Neural Network definition
num_layers = lay
nodes_per_layer = npl
ann = DGMNet(num_layers, nodes_per_layer)
L1_t, L3_t = loss(ann, t1_t, x1_t, t2_t, x2_t, t3_t, x3_t)
loss_t = L1_t + L3_t
# Optimizer parameters
global_step = tf.Variable(1, trainable=False)
starter_learning_rate = 0.001
learning_rate = tf.compat.v1.train.exponential_decay(starter_learning_rate, global_step,
100000, 0.96, staircase=True)
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss_t)
# Training parameters
steps_per_sample = st
sampling_stages = 100#2000
# Plot tensors
tplot_t = tf.compat.v1.placeholder(tf.float32, [None,1], name="tplot_t") # We name to recover it later
xplot_t = tf.compat.v1.placeholder(tf.float32, [None,1], name="xplot_t")
vplot_t = tf.identity(ann(tplot_t, xplot_t), name="vplot_t") # Trick for naming the trained model
# Training data holders
sampling_stages_list = []
elapsed_time_list = []
loss_list = []
L1_list = []
L3_list = []
# Train network!!
init_op = tf.compat.v1.global_variables_initializer()
sess = tf.compat.v1.Session()
sess.run(init_op)
for i in range(sampling_stages):
t1, x1, t2, x2, t3, x3 = sampler(NS_1, NS_2, NS_3)
start_time = time.clock()
for _ in range(steps_per_sample):
loss, L1, L3, _ = sess.run([loss_t, L1_t, L3_t, optimizer],
feed_dict = {t1_t:t1, x1_t:x1, t2_t:t2, x2_t:x2, t3_t:t3, x3_t:x3})
end_time = time.clock()
elapsed_time = end_time - start_time
sampling_stages_list.append(i)
elapsed_time_list.append(elapsed_time)
loss_list.append(loss)
L1_list.append(L1)
L3_list.append(L3)
text = "Stage: {:04d}, Loss: {:e}, L1: {:e}, L3: {:e}, {:f} seconds".format(i, loss, L1, L3, elapsed_time)
print(text)
#goodness of fit
time_0 = 0
listofzeros = [time_0] * 100
prices_for_goodness = np.linspace(S1,S2, 100)
goodness_list = []
solution_goodness = analytical_solution(listofzeros, prices_for_goodness)
ttt = time_0*np.ones_like(prices_for_goodness.reshape(-1,1))
nn_goodness, = sess.run([vplot_t],
feed_dict={tplot_t:ttt, xplot_t:prices_for_goodness.reshape(-1,1)})
deviation_list = np.abs(solution_goodness - nn_goodness)/(T-T0)
print("{0:.2f}%".format(np.average(deviation_list)*100))
Unfortunately as soon as it ends the first iteration I get a TypeError that 'numpy.float32' object is not callable
Error Traceback:
TypeError Traceback (most recent call last)
<ipython-input-14-bb14643d0c42> in <module>()
10
11
---> 12 L1_t, L3_t = loss(ann, t1_t, x1_t, t2_t, x2_t, t3_t, x3_t)
13 loss_t = L1_t + L3_t
14
TypeError: 'numpy.float32' object is not callable
I guess that the problem is with the creation of the placeholders, however I am not sure how to solve it. Maybe one of you can help me
Thanks in advance!
Chris
Did you create a variable called 'loss'? It seems that the loss function is redefined by a variable with the same name, so then python tries to call that variable as a function.

Stratify batch in Tensorflow 2

I have minibatches that I get from an sqlite database with data of integer and float type, x, and a binary label in 0 and 1, y. I am looking for something like X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(y, x, test_size=0.1, random_state=1, stratify=True) from scikit-learn, where a keyword could stratify the data (i.e. the same number of class-0 and class-1 instances).
In Tensorflow 2, stratification seems not straightforwardly possible. My very complicated solution works for me, but takes a lot of time because of all the reshaping and transposing:
def stratify(x, y):
# number of positive instances (the smaller class)
pos = np.sum(y).item() # how many positive bonds there are
x = np.transpose(x)
# number of features
f = np.shape(x)[1]
# filter only class 1
y = tf.transpose(y)
x_pos = tf.boolean_mask(x,
y_pos = tf.boolean_mask(y, y)
# filter only class 1
x_neg = tf.boolean_mask(x, tf.bitwise.invert(y)-254)
x_neg = tf.reshape(x_neg, [f,-1])
y_neg = tf.boolean_mask(y, tf.bitwise.invert(y)-254)
# just take randomy as many class-0 as there are class-1
x_neg = tf.transpose(tf.random.shuffle(tf.transpose(x_neg)))
x_neg = x_neg[:,0:pos]
y_neg = y_neg[0:pos]
# concat the class-1 and class-0 together, then shuffle, and concat back together
x = tf.concat([x_pos,tf.transpose(x_neg)],0)
y = tf.concat([y_pos, tf.transpose(y_neg)],0)
xy = tf.concat([tf.transpose(x), tf.cast(np.reshape(y,[1, -1]), tf.float64)],0)
xy = tf.transpose((tf.random.shuffle(tf.transpose(xy)))) # because there is no axis arg in shuffle
x = xy[0:f,:]
x = tf.transpose(x)
y = xy[f,:]
return x, y
I am happy to see some feedback/improvement on my own function or novel, easier ideas.
Data division is best if it is done in raw format only or before you transform it into tensors. If there is a strong requirement to do it in TensorFlow only, then I will suggest you to make use of tf.data.Dataset class. I have added the demo code with relevant comments explaining the steps.
import tensorflow as tf
import numpy as np
TEST_SIZE = 0.1
DATA_SIZE = 1000
# Create data
X_data = np.random.rand(DATA_SIZE, 28, 28, 1)
y_data = np.random.randint(0, 2, [DATA_SIZE])
samples1 = np.sum(y_data)
print('Percentage of 1 = ', samples1 / len(y_data))
# Create TensorFlow dataset
dataset = tf.data.Dataset.from_tensor_slices((X_data, y_data))
# Gather data with 0 and 1 labels separately
class0_dataset = dataset.filter(lambda x, y: y == 0)
class1_dataset = dataset.filter(lambda x, y: y == 1)
# Shuffle them
class0_dataset = class0_dataset.shuffle(DATA_SIZE)
class1_dataset = class1_dataset.shuffle(DATA_SIZE)
# Split them
class0_test_samples_len = int((DATA_SIZE - samples1) * TEST_SIZE)
class0_test = class0_dataset.take(class0_test_samples_len)
class0_train = class0_dataset.skip(class0_test_samples_len)
class1_test_samples_len = int(samples1 * TEST_SIZE)
class1_test = class1_dataset.take(class1_test_samples_len)
class1_train = class1_dataset.skip(class1_test_samples_len)
print('Train Class 0 = ', len(list(class0_train)), ' Class 1 = ', len(list(class1_train)))
print('Test Class 0 = ', len(list(class0_test)), ' Class 1 = ', len(list(class1_test)))
# Gather datasets
train_dataset = class0_train.concatenate(class1_train).shuffle(DATA_SIZE)
test_dataset = class0_test.concatenate(class1_test).shuffle(DATA_SIZE)
print('Train dataset size = ', len(list(train_dataset)))
print('Test dataset size = ', len(list(test_dataset)))
Sample output:
Percentage of 1 = 0.474
Train Class 0 = 474 Class 1 = 427
Test Class 0 = 52 Class 1 = 47
Train dataset size = 901
Test dataset size = 99

How to extract cell state from a LSTM at each timestep in Keras?

Is there a way in Keras to retrieve the cell state (i.e., c vector) of a LSTM layer at every timestep of a given input?
It seems the return_state argument returns the last cell state after the computation is done, but I need also the intermediate ones. Also, I don't want to pass these cell states to the next layer, I only want to be able to access them.
Preferably using TensorFlow as backend.
Thanks
I was looking for a solution to this issue and after reading the guidance for creating your own custom RNN Cell in tf.keras (https://www.tensorflow.org/api_docs/python/tf/keras/layers/AbstractRNNCell), I believe the following is the most concise and easy to read way of doing this for Tensorflow 2:
import tensorflow as tf
from tensorflow.keras.layers import LSTMCell
class LSTMCellReturnCellState(LSTMCell):
def call(self, inputs, states, training=None):
real_inputs = inputs[:,:self.units] # decouple [h, c]
outputs, [h,c] = super().call(real_inputs, states, training=training)
return tf.concat([h, c], axis=1), [h,c]
num_units = 512
test_input = tf.random.uniform([5,100,num_units])
rnn = tf.keras.layers.RNN(LSTMCellReturnCellState(num_units),
return_sequences=True, return_state=True)
whole_seq_output, final_memory_state, final_carry_state = rnn(test_input)
print(whole_seq_output.shape)
>>> (5,100,1024)
# Hidden state sequence
h_seq = whole_seq_output[:,:,:num_units] # (5,100,512)
# Cell state sequence
c_seq = whole_seq_output[:,:,num_units:] # (5,100,512)
As mentioned in an above solution, you can see the advantage of this is that it can be easily wrapped into tf.keras.layers.RNN as a drop-in for the normal LSTMCell.
Here is a Colab Notebook with the code running as expected for tensorflow==2.6.0
I know it's pretty late, I hope this can help.
what you are asking, technically, is possible by modifying the LSTM-cell in call method. I modify it and make it return 4 dimension instead of 3 when you give return_sequences=True.
Code
from keras.layers.recurrent import _generate_dropout_mask
class Mod_LSTMCELL(LSTMCell):
def call(self, inputs, states, training=None):
if 0 < self.dropout < 1 and self._dropout_mask is None:
self._dropout_mask = _generate_dropout_mask(
K.ones_like(inputs),
self.dropout,
training=training,
count=4)
if (0 < self.recurrent_dropout < 1 and
self._recurrent_dropout_mask is None):
self._recurrent_dropout_mask = _generate_dropout_mask(
K.ones_like(states[0]),
self.recurrent_dropout,
training=training,
count=4)
# dropout matrices for input units
dp_mask = self._dropout_mask
# dropout matrices for recurrent units
rec_dp_mask = self._recurrent_dropout_mask
h_tm1 = states[0] # previous memory state
c_tm1 = states[1] # previous carry state
if self.implementation == 1:
if 0 < self.dropout < 1.:
inputs_i = inputs * dp_mask[0]
inputs_f = inputs * dp_mask[1]
inputs_c = inputs * dp_mask[2]
inputs_o = inputs * dp_mask[3]
else:
inputs_i = inputs
inputs_f = inputs
inputs_c = inputs
inputs_o = inputs
x_i = K.dot(inputs_i, self.kernel_i)
x_f = K.dot(inputs_f, self.kernel_f)
x_c = K.dot(inputs_c, self.kernel_c)
x_o = K.dot(inputs_o, self.kernel_o)
if self.use_bias:
x_i = K.bias_add(x_i, self.bias_i)
x_f = K.bias_add(x_f, self.bias_f)
x_c = K.bias_add(x_c, self.bias_c)
x_o = K.bias_add(x_o, self.bias_o)
if 0 < self.recurrent_dropout < 1.:
h_tm1_i = h_tm1 * rec_dp_mask[0]
h_tm1_f = h_tm1 * rec_dp_mask[1]
h_tm1_c = h_tm1 * rec_dp_mask[2]
h_tm1_o = h_tm1 * rec_dp_mask[3]
else:
h_tm1_i = h_tm1
h_tm1_f = h_tm1
h_tm1_c = h_tm1
h_tm1_o = h_tm1
i = self.recurrent_activation(x_i + K.dot(h_tm1_i,
self.recurrent_kernel_i))
f = self.recurrent_activation(x_f + K.dot(h_tm1_f,
self.recurrent_kernel_f))
c = f * c_tm1 + i * self.activation(x_c + K.dot(h_tm1_c,
self.recurrent_kernel_c))
o = self.recurrent_activation(x_o + K.dot(h_tm1_o,
self.recurrent_kernel_o))
else:
if 0. < self.dropout < 1.:
inputs *= dp_mask[0]
z = K.dot(inputs, self.kernel)
if 0. < self.recurrent_dropout < 1.:
h_tm1 *= rec_dp_mask[0]
z += K.dot(h_tm1, self.recurrent_kernel)
if self.use_bias:
z = K.bias_add(z, self.bias)
z0 = z[:, :self.units]
z1 = z[:, self.units: 2 * self.units]
z2 = z[:, 2 * self.units: 3 * self.units]
z3 = z[:, 3 * self.units:]
i = self.recurrent_activation(z0)
f = self.recurrent_activation(z1)
c = f * c_tm1 + i * self.activation(z2)
o = self.recurrent_activation(z3)
h = o * self.activation(c)
if 0 < self.dropout + self.recurrent_dropout:
if training is None:
h._uses_learning_phase = True
return tf.expand_dims(tf.concat([h,c],axis=0),0), [h, c]
Sample code
# create a cell
test = Mod_LSTMCELL(100)
# Input timesteps=10, features=7
in1 = Input(shape=(10,7))
out1 = RNN(test, return_sequences=True)(in1)
M = Model(inputs=[in1],outputs=[out1])
M.compile(keras.optimizers.Adam(),loss='mse')
ans = M.predict(np.arange(7*10,dtype=np.float32).reshape(1, 10, 7))
print(ans.shape)
# state_h
print(ans[0,0,0,:])
# state_c
print(ans[0,0,1,:])
First, this is not possible do with the tf.keras.layers.LSTM. You have to use LSTMCell instead or subclass LSTM. Second, there is no need to subclass LSTMCell to get the sequence of cell states. LSTMCell already returns a list of the hidden state (h) and cell state (c) everytime you call it.
For those not familiar with LSTMCell, it takes in the current [h, c] tensors, and the input at the current timestep (it cannot take in a sequence of times) and returns the activations, and the updated [h,c].
Here is an example of showing how to use LSTMCell to process a sequence of timesteps and to return the accumulated cell states.
# example inputs
inputs = tf.convert_to_tensor(np.random.rand(3, 4), dtype='float32') # 3 timesteps, 4 features
h_c = [tf.zeros((1,2)), tf.zeros((1,2))] # must initialize hidden/cell state for lstm cell
h_c = tf.convert_to_tensor(h_c, dtype='float32')
lstm = tf.keras.layers.LSTMCell(2)
# example of how you accumulate cell state over repeated calls to LSTMCell
inputs = tf.unstack(inputs, axis=0)
c_states = []
for cur_inputs in inputs:
out, h_c = lstm(tf.expand_dims(cur_inputs, axis=0), h_c)
h, c = h_c
c_states.append(c)
You can access the states of any RNN by setting return_sequences = True in the initializer. You can find more information about this parameter here.

Why does the cost in my implementation of a deep neural network increase after a few iterations?

I am a beginner in machine learning and neural networks. Recently, after watching Andrew Ng's lectures on deep learning, I tried to implement a binary classifier using deep neural networks on my own.
However, the cost of the function is expected to decrease after each iteration.
In my program, it decreases slightly in the beginning, but rapidly increases later. I tried to make changes in learning rate and number of iterations, but to no avail. I am very confused.
Here is my code
1. Neural network classifier class
class NeuralNetwork:
def __init__(self, X, Y, dimensions, alpha=1.2, iter=3000):
self.X = X
self.Y = Y
self.dimensions = dimensions # Including input layer and output layer. Let example be dimensions=4
self.alpha = alpha # Learning rate
self.iter = iter # Number of iterations
self.length = len(self.dimensions)-1
self.params = {} # To store parameters W and b for each layer
self.cache = {} # To store cache Z and A for each layer
self.grads = {} # To store dA, dZ, dW, db
self.cost = 1 # Initial value does not matter
def initialize(self):
np.random.seed(3)
# If dimensions is 4, then layer 0 and 3 are input and output layers
# So we only need to initialize w1, w2 and w3
# There is no need of w0 for input layer
for l in range(1, len(self.dimensions)):
self.params['W'+str(l)] = np.random.randn(self.dimensions[l], self.dimensions[l-1])*0.01
self.params['b'+str(l)] = np.zeros((self.dimensions[l], 1))
def forward_propagation(self):
self.cache['A0'] = self.X
# For last layer, ie, the output layer 3, we need to activate using sigmoid
# For layer 1 and 2, we need to use relu
for l in range(1, len(self.dimensions)-1):
self.cache['Z'+str(l)] = np.dot(self.params['W'+str(l)], self.cache['A'+str(l-1)]) + self.params['b'+str(l)]
self.cache['A'+str(l)] = relu(self.cache['Z'+str(l)])
l = len(self.dimensions)-1
self.cache['Z'+str(l)] = np.dot(self.params['W'+str(l)], self.cache['A'+str(l-1)]) + self.params['b'+str(l)]
self.cache['A'+str(l)] = sigmoid(self.cache['Z'+str(l)])
def compute_cost(self):
m = self.Y.shape[1]
A = self.cache['A'+str(len(self.dimensions)-1)]
self.cost = -1/m*np.sum(np.multiply(self.Y, np.log(A)) + np.multiply(1-self.Y, np.log(1-A)))
self.cost = np.squeeze(self.cost)
def backward_propagation(self):
A = self.cache['A' + str(len(self.dimensions) - 1)]
m = self.X.shape[1]
self.grads['dA'+str(len(self.dimensions)-1)] = -(np.divide(self.Y, A) - np.divide(1-self.Y, 1-A))
# Sigmoid derivative for final layer
l = len(self.dimensions)-1
self.grads['dZ' + str(l)] = self.grads['dA' + str(l)] * sigmoid_prime(self.cache['Z' + str(l)])
self.grads['dW' + str(l)] = 1 / m * np.dot(self.grads['dZ' + str(l)], self.cache['A' + str(l - 1)].T)
self.grads['db' + str(l)] = 1 / m * np.sum(self.grads['dZ' + str(l)], axis=1, keepdims=True)
self.grads['dA' + str(l - 1)] = np.dot(self.params['W' + str(l)].T, self.grads['dZ' + str(l)])
# Relu derivative for previous layers
for l in range(len(self.dimensions)-2, 0, -1):
self.grads['dZ'+str(l)] = self.grads['dA'+str(l)] * relu_prime(self.cache['Z'+str(l)])
self.grads['dW'+str(l)] = 1/m*np.dot(self.grads['dZ'+str(l)], self.cache['A'+str(l-1)].T)
self.grads['db'+str(l)] = 1/m*np.sum(self.grads['dZ'+str(l)], axis=1, keepdims=True)
self.grads['dA'+str(l-1)] = np.dot(self.params['W'+str(l)].T, self.grads['dZ'+str(l)])
def update_parameters(self):
for l in range(1, len(self.dimensions)):
self.params['W'+str(l)] = self.params['W'+str(l)] - self.alpha*self.grads['dW'+str(l)]
self.params['b'+str(l)] = self.params['b'+str(l)] - self.alpha*self.grads['db'+str(l)]
def train(self):
np.random.seed(1)
self.initialize()
for i in range(self.iter):
#print(self.params)
self.forward_propagation()
self.compute_cost()
self.backward_propagation()
self.update_parameters()
if i % 100 == 0:
print('Cost after {} iterations is {}'.format(i, self.cost))
2. Testing code for odd or even number classifier
import numpy as np
from main import NeuralNetwork
X = np.array([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
Y = np.array([[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]])
clf = NeuralNetwork(X, Y, [1, 1, 1], alpha=0.003, iter=7000)
clf.train()
3. Helper Code
import math
import numpy as np
def sigmoid_scalar(x):
return 1/(1+math.exp(-x))
def sigmoid_prime_scalar(x):
return sigmoid_scalar(x)*(1-sigmoid_scalar(x))
def relu_scalar(x):
if x > 0:
return x
else:
return 0
def relu_prime_scalar(x):
if x > 0:
return 1
else:
return 0
sigmoid = np.vectorize(sigmoid_scalar)
sigmoid_prime = np.vectorize(sigmoid_prime_scalar)
relu = np.vectorize(relu_scalar)
relu_prime = np.vectorize(relu_prime_scalar)
Output
I believe your cross-entropy derivative is wrong. Instead of this:
# WRONG!
self.grads['dA'+str(len(self.dimensions)-1)] = -(np.divide(self.Y, A) - np.divide(1-self.Y, A))
... do this:
# CORRECT
self.grads['dA'+str(len(self.dimensions)-1)] = np.divide(A - self.Y, (1 - A) * A)
See these lecture notes for the details. I think you meant the formula (5), but forgot 1-A. Anyway, use formula (6).