Related
Suppose I have the following code written in Tensorflow 1.x where I define custom loss function. I wish to remove .compat.v1., Session, placeholder etc. and convert it into Tensorflow 2.x.
How to do so?
import DGM
import tensorflow as tf
import numpy as np
import scipy.stats as spstats
import matplotlib.pyplot as plt
from tqdm.notebook import trange
# Option parameters
phi = 10
n = 0.01
T = 4
# Solution parameters (domain on which to solve PDE)
t_low = 0.0 - 1e-10
x_low = 0.0 + 1e-10
x_high = 1.0
# neural network parameters
num_layers = 3
nodes_per_layer = 50
# Training parameters
sampling_stages = 2500 # number of times to resample new time-space domain points
steps_per_sample = 20 # number of SGD steps to take before re-sampling
# Sampling parameters
nsim_interior = 100
nsim_boundary_1 = 50
nsim_boundary_2 = 50
nsim_initial = 50
x_multiplier = 1.1 # multiplier for oversampling i.e. draw x from [x_low, x_high * x_multiplier]
def sampler(nsim_interior, nsim_boundary_1, nsim_boundary_2, nsim_initial):
''' Sample time-space points from the function's domain; points are sampled
uniformly on the interior of the domain, at the initial/terminal time points
and along the spatial boundary at different time points.
Args:
nsim_interior: number of space points in the interior of U
nsim_boundary_1: number of space points in the boundary of U
nsim_boundary_2: number of space points in the boundary of U_x
nsim_initial: number of space points at the initial time
'''
# Sampler #1: domain interior
t_interior = np.random.uniform(low=t_low, high=T, size=[nsim_interior, 1])
x_interior = np.random.uniform(low=x_low, high=x_high*x_multiplier, size=[nsim_interior, 1])
# Sampler #2: spatial boundary 1
t_boundary_1 = np.random.uniform(low=t_low, high=T, size=[nsim_boundary_1, 1])
x_boundary_1 = np.ones((nsim_boundary_1, 1))
# Sampler #3: spatial boundary 2
t_boundary_2 = np.random.uniform(low=t_low, high=T, size=[nsim_boundary_2, 1])
x_boundary_2 = np.zeros((nsim_boundary_2, 1))
# Sampler #4: initial condition
t_initial = np.zeros((nsim_initial, 1))
x_initial = np.random.uniform(low=x_low, high=x_high*x_multiplier, size=[nsim_initial, 1])
return (
t_interior, x_interior,
t_boundary_1, x_boundary_1,
t_boundary_2, x_boundary_2,
t_initial, x_initial
)
def loss(
model,
t_interior, x_interior,
t_boundary_1, x_boundary_1,
t_boundary_2, x_boundary_2,
t_initial, x_initial
):
''' Compute total loss for training.
Args:
model: DGM model object
t_interior, x_interior: sampled time / space points in the interior of U
t_boundary_1, x_boundary_1: sampled time / space points in the boundary of U
t_boundary_2, x_boundary_2: sampled time / space points in the boundary of U_x
t_initial, x_initial: sampled time / space points at the initial time
'''
# Loss term #1: PDE
# compute function value and derivatives at current sampled points
u = model(t_interior, x_interior)
u_t = tf.gradients(ys=u, xs=t_interior)[0]
u_x = tf.gradients(ys=u, xs=x_interior)[0]
u_xx = tf.gradients(ys=u_x, xs=x_interior)[0]
diff_u = u_t - u_xx + phi**2 * (tf.nn.relu(u) + 1e-10)**n
# compute average L2-norm for the PDE
L1 = tf.reduce_mean(input_tensor=tf.square(diff_u))
# Loss term #2: First b. c.
u = model(t_boundary_1, x_boundary_1)
bc1_error = u - 1
# Loss term #3: Second b. c.
u = model(t_boundary_2, x_boundary_2)
u_x = tf.gradients(ys=u, xs=x_boundary_2)[0]
bc2_error = u_x - 0
# Loss term #3: Initial condition
u = model(t_initial, x_initial)
init_error = u - 1
# compute average L2-norm for the initial/boundary conditions
L2 = tf.reduce_mean(input_tensor=tf.square(bc1_error + bc2_error + init_error))
return L1, L2
# initialize DGM model (last input: space dimension = 1)
model = DGM.DGMNet(nodes_per_layer, num_layers, 1)
# tensor placeholders (_tnsr suffix indicates tensors)
# inputs (time, space domain interior, space domain at initial time)
t_interior_tnsr = tf.compat.v1.placeholder(tf.float32, [None,1])
x_interior_tnsr = tf.compat.v1.placeholder(tf.float32, [None,1])
t_boundary_1_tnsr = tf.compat.v1.placeholder(tf.float32, [None,1])
x_boundary_1_tnsr = tf.compat.v1.placeholder(tf.float32, [None,1])
t_boundary_2_tnsr = tf.compat.v1.placeholder(tf.float32, [None,1])
x_boundary_2_tnsr = tf.compat.v1.placeholder(tf.float32, [None,1])
t_initial_tnsr = tf.compat.v1.placeholder(tf.float32, [None,1])
x_initial_tnsr = tf.compat.v1.placeholder(tf.float32, [None,1])
# loss
L1_tnsr, L2_tnsr = loss(
model,
t_interior_tnsr, x_interior_tnsr,
t_boundary_1_tnsr, x_boundary_1_tnsr,
t_boundary_2_tnsr, x_boundary_2_tnsr,
t_initial_tnsr, x_initial_tnsr
)
loss_tnsr = L1_tnsr + L2_tnsr
# set optimizer
starting_learning_rate = 3e-4
global_step = tf.Variable(0, trainable=False)
lr = tf.compat.v1.train.exponential_decay(
learning_rate=starting_learning_rate,
global_step=global_step,
decay_steps=1e5,
decay_rate=0.96,
staircase=True,
)
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr).minimize(loss_tnsr)
# initialize variables
init_op = tf.compat.v1.global_variables_initializer()
# open session
sess = tf.compat.v1.Session()
sess.run(init_op)
try:
model.load_weights("checkpoint/")
print("Loading from checkpoint.")
except:
print("Checkpoint not found.")
# for each sampling stage
for i in trange(sampling_stages):
# sample uniformly from the required regions
t_interior, x_interior, \
t_boundary_1, x_boundary_1, \
t_boundary_2, x_boundary_2, \
t_initial, x_initial = sampler(
nsim_interior, nsim_boundary_1, nsim_boundary_2, nsim_initial
)
# for a given sample, take the required number of SGD steps
for _ in range(steps_per_sample):
loss, L1, L2, _ = sess.run(
[loss_tnsr, L1_tnsr, L2_tnsr, optimizer],
feed_dict = {
t_interior_tnsr: t_interior,
x_interior_tnsr: x_interior,
t_boundary_1_tnsr: t_boundary_1,
x_boundary_1_tnsr: x_boundary_1,
t_boundary_2_tnsr: t_boundary_2,
x_boundary_2_tnsr: x_boundary_2,
t_initial_tnsr: t_initial,
x_initial_tnsr: x_initial,
}
)
if i % 10 == 0:
print(f"Loss: {loss:.5f},\t L1: {L1:.5f},\t L2: {L2:.5f},\t iteration: {i}")
model.save_weights("checkpoint/")
I tried searching how to implement custom loss functions with model as an argument, but couldn't implement it.
For model.compile there is a loss argument for which you can pass the Loss function. May be a string (name of loss function), or a tf.keras.losses.Loss instance. For example
Model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
loss=tf.keras.losses.BinaryCrossentropy())
If you have created your custom loss function you can also pass that loss function to the loss argument by providing the name of that loss function. For example
def my_loss_fn(y_true, y_pred):
squared_difference = tf.square(y_true - y_pred)
return tf.reduce_mean(squared_difference, axis=-1)
model.compile(optimizer='adam', loss=my_loss_fn)
Thank You.
I'm working on a seq2sql project and I successfully build a model but when training I get an error. I'm not using any Keras embedding layer.
M=13 #Question Length
d=40 #Dimention of the LSTM
C=12 #number of table Columns
batch_size=9
inputs1=Input(shape=(M,100),name='question_token')
Hq=Bidirectional(LSTM(d,return_sequences=True),name='QuestionENC')(inputs1) #this is HQ shape is (num_samples,13,80)
inputs2=Input(shape=(C,3,100),name='col_token')
col_lstm_layer=Bidirectional(LSTM(d,return_sequences=False),name='ColENC')
def hidd(te):
t=tf.Variable(initial_value=1,dtype=tf.int32)
for i in range(batch_size):
t=tf.assign(t,i)
Z = tf.nn.embedding_lookup(te, t)
print(col_lstm_layer(Z))
h=tf.reshape(col_lstm_layer(Z),[1,C,d*2])
if i==0:
# cols_last_hidden=tf.Variable(initial_value=h)
cols_last_hidden=tf.stack(h)#this is because it gives an error if we use tf.Variable here
else:
cols_last_hidden=tf.concat([cols_last_hidden,h],0)#shape of this one is (num_samples,num_col,80) 80 is last encoding of each column
return cols_last_hidden
cols_last_hidden=Lambda(hidd)(inputs2)
Hq=Dense(d*2,name='QuestionLastEncode')(Hq)
I=tf.Variable(initial_value=1,dtype=tf.int32)
J=tf.Variable(initial_value=1,dtype=tf.int32)
K=1
def get_col_att(tensors):
global K,all_col_attention
if K:
t=tf.Variable(initial_value=1,dtype=tf.int32)
for i in range(batch_size):
t=tf.assign(t,i)
x = tf.nn.embedding_lookup(tensors[0], t)
# print("tensors[1]:",tensors[1])
y = tf.nn.embedding_lookup(tensors[1], t)
# print("x shape",x.shape,"y shape",y.shape)
y=tf.transpose(y)
# print("x shape",x.shape,"y",y.shape)
Ecol=tf.reshape(tf.transpose(tf.tensordot(x,y,axes=1)),[1,C,M])
if i==0:
# all_col_attention=tf.Variable(initial_value=Ecol,name=""+i)
all_col_attention=tf.stack(Ecol)
else:
all_col_attention=tf.concat([all_col_attention,Ecol],0)
K=0
print("all_col_attention",all_col_attention)
return all_col_attention
total_alpha_sel_lambda=Lambda(get_col_att,name="Alpha")([Hq,cols_last_hidden])
total_alpha_sel=Dense(13,activation="softmax")(total_alpha_sel_lambda)
# print("Hq",Hq," total_alpha_sel_lambda shape",total_alpha_sel_lambda," total_alpha_sel shape",total_alpha_sel.shape)
def get_EQcol(tensors):
global K
if K:
t=tf.Variable(initial_value=1,dtype=tf.int32)
global all_Eqcol
for i in range(batch_size):
t=tf.assign(t,i)
x = tf.nn.embedding_lookup(tensors[0], t)
y = tf.nn.embedding_lookup(tensors[1], t)
Eqcol=tf.reshape(tf.tensordot(x,y,axes=1),[1,C,d*2])
if i==0:
# all_Eqcol=tf.Variable(initial_value=Eqcol,name=""+i)
all_Eqcol=tf.stack(Eqcol)
else:
all_Eqcol=tf.concat([all_Eqcol,Eqcol],0)
K=0
print("all_Eqcol",all_Eqcol)
return all_Eqcol
K=1
EQcol=Lambda(get_EQcol,name='EQcol')([total_alpha_sel,Hq])#total_alpha_sel(12x13) Hq(13xd*2)
EQcol=Dropout(.2)(EQcol)
L1=Dense(d*2,name='L1')(cols_last_hidden)
L2=Dense(d*2,name='L2')(EQcol)
L1_plus_L2=Add()([L1,L2])
pre=Flatten()(L1_plus_L2)
Psel=Dense(12,activation="softmax")(pre)
model=Model(inputs=[inputs1,inputs2],outputs=Psel)
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
model.summary()
earlyStopping=EarlyStopping(monitor='val_loss', patience=7, verbose=0, mode='auto')
history=model.fit([Equestion,Col_Embeddings],y_train,epochs=50,validation_split=.1,shuffle=False,callbacks=[earlyStopping],batch_size=batch_size)
The shapes of the Equestion, Col_Embeddings, and y_train are (10, 12, 3, 100) ,(10, 13, 100) and (10, 12).
I searched about this error but in all cases they have used an embedding layer incorrectly. Here I get this error even though I'm not using one.
indices = 2 is not in [0, 1)
[[{{node lambda_3/embedding_lookup_2}} = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:#col_token_2"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_col_token_2_0_1, lambda_3/Assign_2, lambda_3/embedding_lookup_2/axis)]]
The problem here was the batch size is defined at the graph level.here i have used batch_size =9 for the graph and yes i get batch size of 9 for training by the validation split .1 for the full batch size of 10 but for the validation i left only one sample because 10*.1 is one.
So the batch size of 1 cannot be passed to the graph because it needs batch size of 9.that's why this error comes
As for the solution i put the batch_size=1 and then it works fine also got a good accuracy by using batch_size=1.
Hope this will help someone.
Cheers ..
For me this error was due to bad form of my input data. You have to double check your input data to the model and it depends on your model input.
Is there a way in Keras to retrieve the cell state (i.e., c vector) of a LSTM layer at every timestep of a given input?
It seems the return_state argument returns the last cell state after the computation is done, but I need also the intermediate ones. Also, I don't want to pass these cell states to the next layer, I only want to be able to access them.
Preferably using TensorFlow as backend.
Thanks
I was looking for a solution to this issue and after reading the guidance for creating your own custom RNN Cell in tf.keras (https://www.tensorflow.org/api_docs/python/tf/keras/layers/AbstractRNNCell), I believe the following is the most concise and easy to read way of doing this for Tensorflow 2:
import tensorflow as tf
from tensorflow.keras.layers import LSTMCell
class LSTMCellReturnCellState(LSTMCell):
def call(self, inputs, states, training=None):
real_inputs = inputs[:,:self.units] # decouple [h, c]
outputs, [h,c] = super().call(real_inputs, states, training=training)
return tf.concat([h, c], axis=1), [h,c]
num_units = 512
test_input = tf.random.uniform([5,100,num_units])
rnn = tf.keras.layers.RNN(LSTMCellReturnCellState(num_units),
return_sequences=True, return_state=True)
whole_seq_output, final_memory_state, final_carry_state = rnn(test_input)
print(whole_seq_output.shape)
>>> (5,100,1024)
# Hidden state sequence
h_seq = whole_seq_output[:,:,:num_units] # (5,100,512)
# Cell state sequence
c_seq = whole_seq_output[:,:,num_units:] # (5,100,512)
As mentioned in an above solution, you can see the advantage of this is that it can be easily wrapped into tf.keras.layers.RNN as a drop-in for the normal LSTMCell.
Here is a Colab Notebook with the code running as expected for tensorflow==2.6.0
I know it's pretty late, I hope this can help.
what you are asking, technically, is possible by modifying the LSTM-cell in call method. I modify it and make it return 4 dimension instead of 3 when you give return_sequences=True.
Code
from keras.layers.recurrent import _generate_dropout_mask
class Mod_LSTMCELL(LSTMCell):
def call(self, inputs, states, training=None):
if 0 < self.dropout < 1 and self._dropout_mask is None:
self._dropout_mask = _generate_dropout_mask(
K.ones_like(inputs),
self.dropout,
training=training,
count=4)
if (0 < self.recurrent_dropout < 1 and
self._recurrent_dropout_mask is None):
self._recurrent_dropout_mask = _generate_dropout_mask(
K.ones_like(states[0]),
self.recurrent_dropout,
training=training,
count=4)
# dropout matrices for input units
dp_mask = self._dropout_mask
# dropout matrices for recurrent units
rec_dp_mask = self._recurrent_dropout_mask
h_tm1 = states[0] # previous memory state
c_tm1 = states[1] # previous carry state
if self.implementation == 1:
if 0 < self.dropout < 1.:
inputs_i = inputs * dp_mask[0]
inputs_f = inputs * dp_mask[1]
inputs_c = inputs * dp_mask[2]
inputs_o = inputs * dp_mask[3]
else:
inputs_i = inputs
inputs_f = inputs
inputs_c = inputs
inputs_o = inputs
x_i = K.dot(inputs_i, self.kernel_i)
x_f = K.dot(inputs_f, self.kernel_f)
x_c = K.dot(inputs_c, self.kernel_c)
x_o = K.dot(inputs_o, self.kernel_o)
if self.use_bias:
x_i = K.bias_add(x_i, self.bias_i)
x_f = K.bias_add(x_f, self.bias_f)
x_c = K.bias_add(x_c, self.bias_c)
x_o = K.bias_add(x_o, self.bias_o)
if 0 < self.recurrent_dropout < 1.:
h_tm1_i = h_tm1 * rec_dp_mask[0]
h_tm1_f = h_tm1 * rec_dp_mask[1]
h_tm1_c = h_tm1 * rec_dp_mask[2]
h_tm1_o = h_tm1 * rec_dp_mask[3]
else:
h_tm1_i = h_tm1
h_tm1_f = h_tm1
h_tm1_c = h_tm1
h_tm1_o = h_tm1
i = self.recurrent_activation(x_i + K.dot(h_tm1_i,
self.recurrent_kernel_i))
f = self.recurrent_activation(x_f + K.dot(h_tm1_f,
self.recurrent_kernel_f))
c = f * c_tm1 + i * self.activation(x_c + K.dot(h_tm1_c,
self.recurrent_kernel_c))
o = self.recurrent_activation(x_o + K.dot(h_tm1_o,
self.recurrent_kernel_o))
else:
if 0. < self.dropout < 1.:
inputs *= dp_mask[0]
z = K.dot(inputs, self.kernel)
if 0. < self.recurrent_dropout < 1.:
h_tm1 *= rec_dp_mask[0]
z += K.dot(h_tm1, self.recurrent_kernel)
if self.use_bias:
z = K.bias_add(z, self.bias)
z0 = z[:, :self.units]
z1 = z[:, self.units: 2 * self.units]
z2 = z[:, 2 * self.units: 3 * self.units]
z3 = z[:, 3 * self.units:]
i = self.recurrent_activation(z0)
f = self.recurrent_activation(z1)
c = f * c_tm1 + i * self.activation(z2)
o = self.recurrent_activation(z3)
h = o * self.activation(c)
if 0 < self.dropout + self.recurrent_dropout:
if training is None:
h._uses_learning_phase = True
return tf.expand_dims(tf.concat([h,c],axis=0),0), [h, c]
Sample code
# create a cell
test = Mod_LSTMCELL(100)
# Input timesteps=10, features=7
in1 = Input(shape=(10,7))
out1 = RNN(test, return_sequences=True)(in1)
M = Model(inputs=[in1],outputs=[out1])
M.compile(keras.optimizers.Adam(),loss='mse')
ans = M.predict(np.arange(7*10,dtype=np.float32).reshape(1, 10, 7))
print(ans.shape)
# state_h
print(ans[0,0,0,:])
# state_c
print(ans[0,0,1,:])
First, this is not possible do with the tf.keras.layers.LSTM. You have to use LSTMCell instead or subclass LSTM. Second, there is no need to subclass LSTMCell to get the sequence of cell states. LSTMCell already returns a list of the hidden state (h) and cell state (c) everytime you call it.
For those not familiar with LSTMCell, it takes in the current [h, c] tensors, and the input at the current timestep (it cannot take in a sequence of times) and returns the activations, and the updated [h,c].
Here is an example of showing how to use LSTMCell to process a sequence of timesteps and to return the accumulated cell states.
# example inputs
inputs = tf.convert_to_tensor(np.random.rand(3, 4), dtype='float32') # 3 timesteps, 4 features
h_c = [tf.zeros((1,2)), tf.zeros((1,2))] # must initialize hidden/cell state for lstm cell
h_c = tf.convert_to_tensor(h_c, dtype='float32')
lstm = tf.keras.layers.LSTMCell(2)
# example of how you accumulate cell state over repeated calls to LSTMCell
inputs = tf.unstack(inputs, axis=0)
c_states = []
for cur_inputs in inputs:
out, h_c = lstm(tf.expand_dims(cur_inputs, axis=0), h_c)
h, c = h_c
c_states.append(c)
You can access the states of any RNN by setting return_sequences = True in the initializer. You can find more information about this parameter here.
I'm trying to use the tf.layers.batch_normalization function provided in the latest Tensorflow API to implement a recurrent batch normalized LSTM.
The implementation is as below (I modified the TF source code):
class BNLSTMCell(tf.nn.rnn_cell.RNNCell):
"""
Batch Normalized Long short-term memory unit (LSTM) recurrent network cell.
cf. Recurrent Batch Normalization
https://arxiv.org/abs/1603.09025
cf. A Gentle Guide to Using Batch Normalization in TensorFlow
http://ruishu.io/2016/12/27/batchnorm/
"""
def __init__(self, num_units, forward_only, gamma_c=1.0, gamma_h=1.0,
gamma_x=1.0, beta_c=0.0, beta_h=0.0, beta_x=0.0,
input_size=None, use_peepholes=False, cell_clip=None,
initializer=None, num_proj=None,
num_unit_shards=1, num_proj_shards=1,
forget_bias=1.0, state_is_tuple=False,
activation=tf.tanh):
"""Initialize the parameters for an LSTM cell.
Args:
num_units: int, The number of units in the LSTM cell
forward_only:
If False (training):
1. Normalize layer activations according to mini-batch statistics.
2. During the training step, update population statistics
approximation via moving average of mini-batch statistics.
If True (testing):
1. Normalize layer activations according to estimated population
statistics.
2. No update of population statistics according to mini-batch
statistcs from test data.
gamma_c: Scale of cell state normalization
beta_c: Offset of cell state normalization
gamma_h: Scale of hidden state normalization
beta_h: Offset of hidden state normalization
(set to 0 to avoid redundancy)
gamma_x: Scale of input normalization
beta_x: Offset of input normalization
(set to 0 to avoid redundancy)
input_size: Deprecated and unused.
use_peepholes: bool, Set True to enable diagonal/peephole connections.
cell_clip: (optional) A float value, if provided the cell state is clipped
by this value prior to the cell output activation.
initializer: (optional) The initializer to use for the weight and
projection matrices.
num_proj: (optional) int, The output dimensionality for the projection
matrices. If None, no projection is performed.
num_unit_shards: How to split the weight matrix. If >1, the weight
matrix is stored across num_unit_shards.
num_proj_shards: How to split the projection matrix. If >1, the
projection matrix is stored across num_proj_shards.
forget_bias: Biases of the forget gate are initialized by default to 1
in order to reduce the scale of forgetting at the beginning of
the training.
state_is_tuple: If True, accepted and returned states are 2-tuples of
the `c_state` and `m_state`. By default (False), they are concatenated
along the column axis. This default behavior will soon be deprecated.
activation: Activation function of the inner states.
"""
if not state_is_tuple:
logging.warn(
"%s: Using a concatenated state is slower and will soon be "
"deprecated. Use state_is_tuple=True." % self)
if input_size is not None:
logging.warn("%s: The input_size parameter is deprecated." % self)
self._num_units = num_units
self.forward_only = forward_only
self._gamma_c = gamma_c
self._beta_c = beta_c
self._gamma_h = gamma_h
self._beta_h = beta_h
self._gamma_x = gamma_x
self._beta_x = beta_x
self._use_peepholes = use_peepholes
self._cell_clip = cell_clip
self._initializer = initializer
self._num_proj = num_proj
self._num_unit_shards = num_unit_shards
self._num_proj_shards = num_proj_shards
self._forget_bias = forget_bias
self._state_is_tuple = state_is_tuple
self._activation = activation
if num_proj:
self._state_size = (
tf.nn.rnn_cell.LSTMStateTuple(num_units, num_proj)
if state_is_tuple else num_units + num_proj)
self._output_size = num_proj
else:
self._state_size = (
tf.nn.rnn_cell.LSTMStateTuple(num_units, num_units)
if state_is_tuple else 2 * num_units)
self._output_size = num_units
#property
def state_size(self):
return self._state_size
#property
def output_size(self):
return self._output_size
def __call__(self, inputs, state, scope=None):
"""Run one step of LSTM.
Args:
inputs: input Tensor, 2D, batch x num_units.
state: if `state_is_tuple` is False, this must be a state Tensor,
`2-D, batch x state_size`. If `state_is_tuple` is True, this must be a
tuple of state Tensors, both `2-D`, with column sizes `c_state` and
`m_state`.
scope: VariableScope for the created subgraph; defaults to "LSTMCell".
Returns:
A tuple containing:
- A `2-D, [batch x output_dim]`, Tensor representing the output of the
LSTM after reading `inputs` when previous state was `state`.
Here output_dim is:
num_proj if num_proj was set,
num_units otherwise.
- Tensor(s) representing the new state of LSTM after reading `inputs` when
the previous state was `state`. Same type and shape(s) as `state`.
Raises:
ValueError: If input size cannot be inferred from inputs via
static shape inference.
"""
num_proj = self._num_units if self._num_proj is None else self._num_proj
if self._state_is_tuple:
(c_prev, m_prev) = state
else:
c_prev = tf.slice(state, [0, 0], [-1, self._num_units])
m_prev = tf.slice(state, [0, self._num_units], [-1, num_proj])
dtype = inputs.dtype
input_size = inputs.get_shape().with_rank(2)[1]
if input_size.value is None:
raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
with tf.variable_scope(scope or type(self).__name__,
initializer=self._initializer): # "LSTMCell"
w_h = tf.get_variable("W_h", [num_proj, 4 * self._num_units],
dtype=tf.float32)
w_x = tf.get_variable("W_x", [input_size.value, 4 * self._num_units],
dtype=tf.float32)
b = tf.get_variable(
"B", shape=[4 * self._num_units],
initializer=tf.zeros_initializer, dtype=dtype)
# i = input_gate, j = new_input, f = forget_gate, o = output_gate
hidden_matrix = tf.matmul(m_prev, w_h)
bn_hidden_matrix = tf.layers.batch_normalization(hidden_matrix,
momentum=0.5,
beta_initializer=tf.constant_initializer(self._beta_h),
gamma_initializer=tf.constant_initializer(self._gamma_h),
training=(not self.forward_only),
name='bn_hidden_matrix', reuse=None)
# print(tf.get_collection(tf.GraphKeys.VARIABLES, scope=scope))
input_matrix = tf.matmul(inputs, w_x)
bn_input_matrix = tf.layers.batch_normalization(input_matrix,
momentum=0.5,
beta_initializer=tf.constant_initializer(self._beta_x),
gamma_initializer=tf.constant_initializer(self._gamma_x),
training=(not self.forward_only),
name='bn_input_matrix', reuse=None)
lstm_matrix = tf.nn.bias_add(
tf.add(bn_input_matrix, bn_hidden_matrix), b)
i, j, f, o = tf.split(lstm_matrix, num_or_size_splits=4, axis=1)
# Diagonal connections
if self._use_peepholes:
w_f_diag = tf.get_variable(
"W_F_diag", shape=[self._num_units], dtype=dtype)
w_i_diag = tf.get_variable(
"W_I_diag", shape=[self._num_units], dtype=dtype)
w_o_diag = tf.get_variable(
"W_O_diag", shape=[self._num_units], dtype=dtype)
if self._use_peepholes:
c = (tf.sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
tf.sigmoid(i + w_i_diag * c_prev) * self._activation(j))
else:
c = (tf.sigmoid(f + self._forget_bias) * c_prev + tf.sigmoid(i) *
self._activation(j))
if self._cell_clip is not None:
# pylint: disable=invalid-unary-operand-type
c = tf.clip_by_value(c, -self._cell_clip, self._cell_clip)
# pylint: enable=invalid-unary-operand-type
bn_c = tf.layers.batch_normalization(c,
momentum=0.5,
beta_initializer=tf.constant_initializer(self._beta_c),
gamma_initializer=tf.constant_initializer(self._gamma_c),
training=(not self.forward_only),
name='bn_cell', reuse=None)
if self._use_peepholes:
m = tf.sigmoid(o + w_o_diag * bn_c) * self._activation(bn_c)
else:
m = tf.sigmoid(o) * self._activation(bn_c)
if self._num_proj is not None:
concat_w_proj = tf.nn.rnn_cell._get_concat_variable(
"W_P", [self._num_units, self._num_proj],
dtype, self._num_proj_shards)
m = tf.matmul(m, concat_w_proj)
new_state = (tf.nn.rnn_cell.LSTMStateTuple(c, m) if self._state_is_tuple
else tf.concat(1, [c, m]))
return m, new_state
I built a sequence to sequence model and run the extra updates during training as specified in other posts.
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
if extra_update_ops and not forward_only:
outputs, extra_updates = session.run([output_feed, extra_update_ops], input_feed)
else:
outputs = session.run(output_feed, input_feed)
The training loss looks very reasonable.
However, my test output is garbage. I wonder if anyone has had similar experience and knows how to resolve it.
I'm trying to visualize the output of a convolutional layer in tensorflow using the function tf.image_summary. I'm already using it successfully in other instances (e. g. visualizing the input image), but have some difficulties reshaping the output here correctly. I have the following conv layer:
img_size = 256
x_image = tf.reshape(x, [-1,img_size, img_size,1], "sketch_image")
W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
So the output of h_conv1 would have the shape [-1, img_size, img_size, 32]. Just using tf.image_summary("first_conv", tf.reshape(h_conv1, [-1, img_size, img_size, 1])) Doesn't account for the 32 different kernels, so I'm basically slicing through different feature maps here.
How can I reshape them correctly? Or is there another helper function I could use for including this output in the summary?
I don't know of a helper function but if you want to see all the filters you can pack them into one image with some fancy uses of tf.transpose.
So if you have a tensor that's images x ix x iy x channels
>>> V = tf.Variable()
>>> print V.get_shape()
TensorShape([Dimension(-1), Dimension(256), Dimension(256), Dimension(32)])
So in this example ix = 256, iy=256, channels=32
first slice off 1 image, and remove the image dimension
V = tf.slice(V,(0,0,0,0),(1,-1,-1,-1)) #V[0,...]
V = tf.reshape(V,(iy,ix,channels))
Next add a couple of pixels of zero padding around the image
ix += 4
iy += 4
V = tf.image.resize_image_with_crop_or_pad(image, iy, ix)
Then reshape so that instead of 32 channels you have 4x8 channels, lets call them cy=4 and cx=8.
V = tf.reshape(V,(iy,ix,cy,cx))
Now the tricky part. tf seems to return results in C-order, numpy's default.
The current order, if flattened, would list all the channels for the first pixel (iterating over cx and cy), before listing the channels of the second pixel (incrementing ix). Going across the rows of pixels (ix) before incrementing to the next row (iy).
We want the order that would lay out the images in a grid.
So you go across a row of an image (ix), before stepping along the row of channels (cx), when you hit the end of the row of channels you step to the next row in the image (iy) and when you run out or rows in the image you increment to the next row of channels (cy). so:
V = tf.transpose(V,(2,0,3,1)) #cy,iy,cx,ix
Personally I prefer np.einsum for fancy transposes, for readability, but it's not in tf yet.
newtensor = np.einsum('yxYX->YyXx',oldtensor)
anyway, now that the pixels are in the right order, we can safely flatten it into a 2d tensor:
# image_summary needs 4d input
V = tf.reshape(V,(1,cy*iy,cx*ix,1))
try tf.image_summary on that, you should get a grid of little images.
Below is an image of what one gets after following all the steps here.
In case someone would like to "jump" to numpy and visualize "there" here is an example how to display both Weights and processing result. All transformations are based on prev answer by mdaoust.
# to visualize 1st conv layer Weights
vv1 = sess.run(W_conv1)
# to visualize 1st conv layer output
vv2 = sess.run(h_conv1,feed_dict = {img_ph:x, keep_prob: 1.0})
vv2 = vv2[0,:,:,:] # in case of bunch out - slice first img
def vis_conv(v,ix,iy,ch,cy,cx, p = 0) :
v = np.reshape(v,(iy,ix,ch))
ix += 2
iy += 2
npad = ((1,1), (1,1), (0,0))
v = np.pad(v, pad_width=npad, mode='constant', constant_values=p)
v = np.reshape(v,(iy,ix,cy,cx))
v = np.transpose(v,(2,0,3,1)) #cy,iy,cx,ix
v = np.reshape(v,(cy*iy,cx*ix))
return v
# W_conv1 - weights
ix = 5 # data size
iy = 5
ch = 32
cy = 4 # grid from channels: 32 = 4x8
cx = 8
v = vis_conv(vv1,ix,iy,ch,cy,cx)
plt.figure(figsize = (8,8))
plt.imshow(v,cmap="Greys_r",interpolation='nearest')
# h_conv1 - processed image
ix = 30 # data size
iy = 30
v = vis_conv(vv2,ix,iy,ch,cy,cx)
plt.figure(figsize = (8,8))
plt.imshow(v,cmap="Greys_r",interpolation='nearest')
you may try to get convolution layer activation image this way:
h_conv1_features = tf.unpack(h_conv1, axis=3)
h_conv1_imgs = tf.expand_dims(tf.concat(1, h_conv1_features_padded), -1)
this gets one vertical stripe with all images concatenated vertically.
if you want them padded (in my case of relu activations to pad with white line):
h_conv1_features = tf.unpack(h_conv1, axis=3)
h_conv1_max = tf.reduce_max(h_conv1)
h_conv1_features_padded = map(lambda t: tf.pad(t-h_conv1_max, [[0,0],[0,1],[0,0]])+h_conv1_max, h_conv1_features)
h_conv1_imgs = tf.expand_dims(tf.concat(1, h_conv1_features_padded), -1)
I personally try to tile every 2d-filter in a single image.
For doing this -if i'm not terribly mistaken since I'm quite new to DL- I found out that it could be helpful to exploit the depth_to_space function, since it takes a 4d tensor
[batch, height, width, depth]
and produces an output of shape
[batch, height*block_size, width*block_size, depth/(block_size*block_size)]
Where block_size is the number of "tiles" in the output image. The only limitation to this is that the depth should be the square of block_size, which is an integer, otherwise it cannot "fill" the resulting image correctly.
A possible solution could be of padding the depth of the input tensor up to a depth that is accepted by the method, but I sill havn't tried this.
Another way, which I think very easy, is using the get_operation_by_name function. I had hard time visualizing the layers with other methods but this helped me.
#first, find out the operations, many of those are micro-operations such as add etc.
graph = tf.get_default_graph()
graph.get_operations()
#choose relevant operations
op_name = '...'
op = graph.get_operation_by_name(op_name)
out = sess.run([op.outputs[0]], feed_dict={x: img_batch, is_training: False})
#img_batch is a single image whose dimensions are (1,n,n,1).
# out is the output of the layer, do whatever you want with the output
#in my case, I wanted to see the output of a convolution layer
out2 = np.array(out)
print(out2.shape)
# determine, row, col, and fig size etc.
for each_depth in range(out2.shape[4]):
fig.add_subplot(rows, cols, each_depth+1)
plt.imshow(out2[0,0,:,:,each_depth], cmap='gray')
For example below is the input(colored cat) and output of the second conv layer in my model.
Note that I am aware this question is old and there are easier methods with Keras but for people who use an old model from other people (such as me), this may be useful.