Forward pass in LSTM network learned by Keras - TensorFlow

I have the following code, with which I am hoping to reproduce the forward pass of a 2-layer LSTM:
"""
this is a simple numerical example of LSTM forward pass to allow deep understanding
the LSTM is trying to learn the sin function by learning to predict the next value after a sequence of 3 inputs
example 1: {0.583, 0.633, 0.681} --> {0.725}, these values correspond to
{sin(35.66), sin(39.27), sin(42.92)} --> {sin(46.47)}
example 2: {0.725, 0.767, 0.801} --> {0.849}, these values correspond to
{sin(46.47), sin(50.09), sin(53.23)} --> {sin(58.10)}
example tested: [[['0.725323664']
['0.7671179']
['0.805884672']]]
predicted_instance: [ 0.83467698]
training example pair: [['0.680666907']
['0.725323664']
['0.7671179']] 0.805884672
"""
import numpy as np
# linear activation matrix-wise (works also element-wise)
def linear(x):
    return x

# sigmoid function matrix-wise (works also element-wise)
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# hard sigmoid function element-wise
def hard_sig(x):
    # as in Keras for both the TensorFlow and Theano backends
    return np.max(np.array([0.0, np.min(np.array([1.0, x * 0.2 + 0.5]))]))
    # Courbariaux et al. 2016 (Binarized Neural Networks):
    # return np.max(np.array([0.0, np.min(np.array([1.0, (x + 1.0) / 2.0]))]))

# hard sigmoid function matrix-wise
def hard_sigmoid(x, fun=hard_sig):
    return np.vectorize(fun)(x)

# hyperbolic tangent function matrix-wise (works also element-wise)
def hyperbolic_tangent(x):
    return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))

print(sigmoid(np.array([-100, 0, 100])))
print(hard_sigmoid(np.array([-100, 0, 0.1, 100])))
print(hyperbolic_tangent(np.array([-100, 0, 100])))
parameter_names = ['lstm_1_kernel_0.npy',
                   'lstm_1_recurrent_kernel_0.npy',
                   'lstm_1_bias_0.npy',
                   'lstm_2_kernel_0.npy',
                   'lstm_2_recurrent_kernel_0.npy',
                   'lstm_2_bias_0.npy',
                   'dense_1_kernel_0.npy',
                   'dense_1_bias_0.npy']
# LSTM 1 Weights
lstm_1_kernel_0 = np.load('lstm_1_kernel_0.npy')
print('lstm_1_kernel_0: ', lstm_1_kernel_0.shape)
lstm_1_recurrent_kernel_0 = np.load('lstm_1_recurrent_kernel_0.npy')
print('lstm_1_recurrent_kernel_0: ', lstm_1_recurrent_kernel_0.shape)
lstm_1_bias_0 = np.load('lstm_1_bias_0.npy')
print('lstm_1_bias_0: ', lstm_1_bias_0.shape)
# LSTM 2 Weights
lstm_2_kernel_0 = np.load('lstm_2_kernel_0.npy')
print('lstm_2_kernel_0: ', lstm_2_kernel_0.shape)
lstm_2_recurrent_kernel_0 = np.load('lstm_2_recurrent_kernel_0.npy')
print('lstm_2_recurrent_kernel_0: ', lstm_2_recurrent_kernel_0.shape)
lstm_2_bias_0 = np.load('lstm_2_bias_0.npy')
print('lstm_2_bias_0: ', lstm_2_bias_0.shape)
# Dense layer
dense_1_kernel_0 = np.load('dense_1_kernel_0.npy')
print('dense_1_kernel_0: ', dense_1_kernel_0.shape)
dense_1_bias_0 = np.load('dense_1_bias_0.npy')
print('dense_1_bias_0: ', dense_1_bias_0.shape)
time_seq = [0, 1, 2]
"""
input_seq = np.array([[[0.725323664],
[0.7671179],
[0.805884672]]])
"""
input_seq = np.array([[[0.680666907],
[0.725323664],
[0.7671179]]])
print('input_seq: ', input_seq.shape)
for time in time_seq:
print('input t', time, ':', input_seq[0, time, 0])
"""
# z0 = z[:, :self.units]
# z1 = z[:, self.units: 2 * self.units]
# z2 = z[:, 2 * self.units: 3 * self.units]
# z3 = z[:, 3 * self.units:]
# i = self.recurrent_activation(z0)
# f = self.recurrent_activation(z1)
# c = f * c_tm1 + i * self.activation(z2)
# o = self.recurrent_activation(z3)
# activation = 'tanh'
# recurrent_activation = 'hard_sigmoid'
"""
# LSTM 1
x_1_lstm_1 = input_seq[0, 0, 0]
print('x_1: ', x_1_lstm_1)
x_2_lstm_1 = input_seq[0, 1, 0]
print('x_2: ', x_2_lstm_1)
x_3_lstm_1 = input_seq[0, 2, 0]
print('x_3: ', x_3_lstm_1)
c_0_lstm_1 = np.zeros((1, 3))
h_0_lstm_1 = np.zeros((1, 3))
z_1_lstm_1 = np.dot(x_1_lstm_1, lstm_1_kernel_0) + np.dot(h_0_lstm_1, lstm_1_recurrent_kernel_0) + lstm_1_bias_0
print(z_1_lstm_1.shape)
i_1_lstm_1 = sigmoid(z_1_lstm_1[:, 0:3])
f_1_lstm_1 = sigmoid(z_1_lstm_1[:, 3:6])
input_to_c_1_lstm_1 = z_1_lstm_1[:, 6:9]
o_1_lstm_1 = sigmoid(z_1_lstm_1[:, 9:12])
c_1_lstm_1 = np.multiply(f_1_lstm_1, c_0_lstm_1) + np.multiply(i_1_lstm_1, hyperbolic_tangent(input_to_c_1_lstm_1))
h_1_lstm_1 = np.multiply(o_1_lstm_1, hyperbolic_tangent(c_1_lstm_1))
print('h_1_lstm_1: ', h_1_lstm_1.shape, h_1_lstm_1)
z_2_lstm_1 = np.dot(x_2_lstm_1, lstm_1_kernel_0) + np.dot(h_1_lstm_1, lstm_1_recurrent_kernel_0) + lstm_1_bias_0
print(z_2_lstm_1.shape)
i_2_lstm_1 = sigmoid(z_2_lstm_1[:, 0:3])
f_2_lstm_1 = sigmoid(z_2_lstm_1[:, 3:6])
input_to_c_2_lstm_1 = z_2_lstm_1[:, 6:9]
o_2_lstm_1 = sigmoid(z_2_lstm_1[:, 9:12])
c_2_lstm_1 = np.multiply(f_2_lstm_1, c_1_lstm_1) + np.multiply(i_2_lstm_1, hyperbolic_tangent(input_to_c_2_lstm_1))
h_2_lstm_1 = np.multiply(o_2_lstm_1, hyperbolic_tangent(c_2_lstm_1))
print('h_2_lstm_1: ', h_2_lstm_1.shape, h_2_lstm_1)
z_3_lstm_1 = np.dot(x_3_lstm_1, lstm_1_kernel_0) + np.dot(h_2_lstm_1, lstm_1_recurrent_kernel_0) + lstm_1_bias_0
print(z_3_lstm_1.shape)
i_3_lstm_1 = sigmoid(z_3_lstm_1[:, 0:3])
f_3_lstm_1 = sigmoid(z_3_lstm_1[:, 3:6])
input_to_c_3_lstm_1 = z_3_lstm_1[:, 6:9]
o_3_lstm_1 = sigmoid(z_3_lstm_1[:, 9:12])
c_3_lstm_1 = np.multiply(f_3_lstm_1, c_2_lstm_1) + np.multiply(i_3_lstm_1, hyperbolic_tangent(input_to_c_3_lstm_1))
h_3_lstm_1 = np.multiply(o_3_lstm_1, hyperbolic_tangent(c_3_lstm_1))
print('h_3_lstm_1: ', h_3_lstm_1.shape, h_3_lstm_1)
# LSTM 2
x_1_lstm_2 = h_1_lstm_1
x_2_lstm_2 = h_2_lstm_1
x_3_lstm_2 = h_3_lstm_1
c_0_lstm_2 = np.zeros((1, 1))
h_0_lstm_2 = np.zeros((1, 1))
z_1_lstm_2 = np.dot(x_1_lstm_2, lstm_2_kernel_0) + np.dot(h_0_lstm_2, lstm_2_recurrent_kernel_0) + lstm_2_bias_0
print(z_1_lstm_2.shape)
i_1_lstm_2 = sigmoid(z_1_lstm_2[:, 0])
f_1_lstm_2 = sigmoid(z_1_lstm_2[:, 1])
input_to_c_1_lstm_2 = z_1_lstm_2[:, 2]
o_1_lstm_2 = sigmoid(z_1_lstm_2[:, 3])
c_1_lstm_2 = np.multiply(f_1_lstm_2, c_0_lstm_2) + np.multiply(i_1_lstm_2, hyperbolic_tangent(input_to_c_1_lstm_2))
h_1_lstm_2 = np.multiply(o_1_lstm_2, hyperbolic_tangent(c_1_lstm_2))
print('h_1_lstm_2: ', h_1_lstm_2.shape, h_1_lstm_2)
z_2_lstm_2 = np.dot(x_2_lstm_2, lstm_2_kernel_0) + np.dot(h_1_lstm_2, lstm_2_recurrent_kernel_0) + lstm_2_bias_0
print(z_2_lstm_2.shape)
i_2_lstm_2 = sigmoid(z_2_lstm_2[:, 0])
f_2_lstm_2 = sigmoid(z_2_lstm_2[:, 1])
input_to_c_2_lstm_2 = z_2_lstm_2[:, 2]
o_2_lstm_2 = sigmoid(z_2_lstm_2[:, 3])
c_2_lstm_2 = np.multiply(f_2_lstm_2, c_1_lstm_2) + np.multiply(i_2_lstm_2, hyperbolic_tangent(input_to_c_2_lstm_2))
h_2_lstm_2 = np.multiply(o_2_lstm_2, hyperbolic_tangent(c_2_lstm_2))
print('h_2_lstm_2: ', h_2_lstm_2.shape, h_2_lstm_2)
z_3_lstm_2 = np.dot(x_3_lstm_2, lstm_2_kernel_0) + np.dot(h_2_lstm_2, lstm_2_recurrent_kernel_0) + lstm_2_bias_0
print(z_3_lstm_2.shape)
i_3_lstm_2 = sigmoid(z_3_lstm_2[:, 0])
f_3_lstm_2 = sigmoid(z_3_lstm_2[:, 1])
input_to_c_3_lstm_2 = z_3_lstm_2[:, 2]
o_3_lstm_2 = sigmoid(z_3_lstm_2[:, 3])
c_3_lstm_2 = np.multiply(f_3_lstm_2, c_2_lstm_2) + np.multiply(i_3_lstm_2, hyperbolic_tangent(input_to_c_3_lstm_2))
h_3_lstm_2 = np.multiply(o_3_lstm_2, hyperbolic_tangent(c_3_lstm_2))
print('h_3_lstm_2: ', h_3_lstm_2.shape, h_3_lstm_2)
output = np.dot(h_3_lstm_2, dense_1_kernel_0) + dense_1_bias_0
print('output: ', output)
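# Illustrative sanity check: per the docstring above, the training pair maps
# this input window to 0.805884672, so correctly trained weights should give
# an output close to that value.
print('expected ~0.805884672, got:', output[0, 0])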
The weights were saved to file at training time and can be retrieved from the following location:
LSTM weights
To create the LSTM that fits a sine-wave signal, I used the following code in Keras:
import time
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation
from keras.utils import plot_model

def build_simple_model(layers):
    model = Sequential()
    model.add(LSTM(input_shape=(layers[1], layers[0]),
                   output_dim=layers[1],
                   return_sequences=True,
                   activation='tanh',
                   recurrent_activation='sigmoid'))  # 'hard_sigmoid'
    # model.add(Dropout(0.2))
    model.add(LSTM(layers[2],
                   return_sequences=False,
                   activation='tanh',
                   recurrent_activation='sigmoid'))  # 'hard_sigmoid'
    # model.add(Dropout(0.2))
    model.add(Dense(output_dim=layers[3]))
    model.add(Activation("linear"))
    start = time.time()
    model.compile(loss="mse", optimizer="rmsprop")
    print("> Compilation Time : ", time.time() - start)
    plot_model(model, to_file='lstm_model.png', show_shapes=True, show_layer_names=True)
    print(model.summary())
    return model
This resulted in the model plotted to lstm_model.png (image not reproduced here).
I used the following training procedure:
seq_len = 3
model = lstm.build_simple_model([1, seq_len, 1, 1])
model.fit(X_train,
          y_train,
          batch_size=512,
          nb_epoch=epochs,
          validation_split=0.05)
Could someone help me understand why my forward pass does not produce the desired output, i.e. the next sin() value predicted from the three previous consecutive ones?
The original example on which I am trying to base my forward pass exercise originates here. The weights uploaded in .npy format are from a network that is able to perfectly predict the next sin() value in a series.

I realised what the problem was. I was extracting my model weights through a TensorFlow session (after model fitting) rather than via Keras methods directly. This produced weight matrices that made perfect sense dimension-wise, but contained the values from the initialization step.
model.fit(X_train,
          y_train,
          batch_size=batch_size,
          nb_epoch=epochs,
          validation_split=0.05,
          callbacks=callbacks_list)
print('n_parameters: ', len(model.weights))
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
parameter_names = ['lstm_1_kernel_0',
                   'lstm_1_recurrent_kernel_0',
                   'lstm_1_bias_0',
                   'lstm_2_kernel_0',
                   'lstm_2_recurrent_kernel_0',
                   'lstm_2_bias_0',
                   'dense_1_kernel_0',
                   'dense_1_bias_0']
weights = model.get_weights()
trainable_weights = model.trainable_weights
for parameter in range(len(model.weights)):
    print('')
    # using Keras methods is the correct way
    print('parameter: ', trainable_weights[parameter])
    print('parameter Keras: ', weights[parameter])
    # reading variables through a fresh TF session is the wrong way:
    # global_variables_initializer() has reset them to their initial values
    print('parameter TF: ', model.weights[parameter].eval(session=sess))
    # np.save(parameter_names[parameter], model.weights[parameter].eval(session=sess))
    # np.save(parameter_names[parameter], weights[parameter])
This prints the following to screen:
parameter: <tf.Variable 'lstm_1/kernel:0' shape=(1, 12) dtype=float32_ref>
parameter Keras: [[ 0.02005039 0.59627813 -0.77670902 -0.17643917 0.64905447 -0.49418128
0.01204901 0.79791737 -1.58887422 -0.3566488 0.67758918 0.77245694]]
parameter TF: [[-0.20346385 -0.07166874 -0.58842945 0.03744811 0.46911311 -0.0469712
-0.07291448 0.27316415 -0.53298378 0.08367682 0.10194337 0.20933461]]
parameter: <tf.Variable 'lstm_1/recurrent_kernel:0' shape=(3, 12) dtype=float32_ref>
parameter Keras: [[ 0.01916649 -0.30881727 -0.07018201 0.28770521 -0.45713434 -0.33738521
0.53091544 -0.78456688 0.50647908 0.12326431 -0.18517831 -0.28752103]
[ 0.44490865 -0.09020164 1.00983524 0.43070397 -0.14646551 -0.53908533
1.33833826 0.76106179 -1.28808987 0.71029669 -0.19338571 -0.30499896]
[ 0.76727188 -0.10291406 0.53285897 0.31021088 0.46876401 0.04961515
0.0573149 1.17765784 -0.45716232 0.26181531 0.60458028 -0.6042906 ]]
parameter TF: [[-0.044281 -0.42013288 -0.06702472 0.16710882 0.07229936 0.20263752
0.01935999 -0.65925431 0.21676332 0.02481769 0.50321299 -0.08369029]
[-0.17725646 -0.14031938 -0.07758044 -0.39292315 0.36675838 -0.20198873
0.59491426 -0.12469263 0.14705807 0.39603388 -0.25511321 -0.01221756]
[ 0.51603764 0.34401873 0.36002275 0.05344227 -0.00293417 -0.36086732
0.1636388 -0.24916036 0.09064917 -0.04246153 0.05563453 -0.5006755 ]]
parameter: <tf.Variable 'lstm_1/bias:0' shape=(12,) dtype=float32_ref>
parameter Keras: [ 3.91339064e-01 -2.09703773e-01 -4.88098420e-04 1.15376031e+00
6.24452651e-01 2.24053934e-01 4.06851530e-01 4.78419960e-01
1.77846551e-01 3.19107175e-01 5.16630232e-01 -2.22970009e-01]
parameter TF: [ 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
parameter: <tf.Variable 'lstm_2/kernel:0' shape=(3, 4) dtype=float32_ref>
parameter Keras: [[ 2.01334882 1.9168334 1.77633524 -0.90856379]
[ 1.17618477 1.02978265 -0.06435115 0.66180402]
[-1.33014703 -0.71629387 -0.87376142 1.35648465]]
parameter TF: [[ 0.83115911 0.72150767 0.51600969 -0.52725452]
[ 0.53043616 0.59162521 -0.59219611 0.0951736 ]
[-0.8030411 -0.00424314 -0.06715947 0.67533839]]
parameter: <tf.Variable 'lstm_2/recurrent_kernel:0' shape=(1, 4) dtype=float32_ref>
parameter Keras: [[-0.09348518 -0.7667768 0.24031806 -0.39155772]]
parameter TF: [[-0.085137 -0.59010917 0.61000961 -0.52193022]]
parameter: <tf.Variable 'lstm_2/bias:0' shape=(4,) dtype=float32_ref>
parameter Keras: [ 1.21466994 2.22224903 1.34946632 0.19186479]
parameter TF: [ 0. 1. 0. 0.]
parameter: <tf.Variable 'dense_1/kernel:0' shape=(1, 1) dtype=float32_ref>
parameter Keras: [[ 2.69569159]]
parameter TF: [[ 1.5422312]]
parameter: <tf.Variable 'dense_1/bias:0' shape=(1,) dtype=float32_ref>
parameter Keras: [ 0.20767514]
parameter TF: [ 0.]
The forward pass code was therefore correct; the weights were wrong. The correct weight .npy files have also been updated at the link mentioned in the question. This forward pass can be used to illustrate sequence generation with an LSTM by recycling the output.
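For instance, here is a minimal sketch of that recycling idea (illustrative, not from the original post; it reuses the lstm_step helper sketched earlier together with the .npy weights loaded above):
def forward_pass(window):
    # run the stacked 2-layer LSTM and the dense layer over a 3-value window
    h1, c1 = np.zeros((1, 3)), np.zeros((1, 3))
    h2, c2 = np.zeros((1, 1)), np.zeros((1, 1))
    for x_t in window:
        h1, c1 = lstm_step(x_t, h1, c1, lstm_1_kernel_0,
                           lstm_1_recurrent_kernel_0, lstm_1_bias_0, units=3)
        h2, c2 = lstm_step(h1, h2, c2, lstm_2_kernel_0,
                           lstm_2_recurrent_kernel_0, lstm_2_bias_0, units=1)
    return (np.dot(h2, dense_1_kernel_0) + dense_1_bias_0)[0, 0]

window = [0.680666907, 0.725323664, 0.7671179]
generated = []
for _ in range(10):
    nxt = forward_pass(window)
    generated.append(nxt)
    window = window[1:] + [nxt]  # recycle the prediction as the newest input
print(generated)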

Related

How to build a custom question-answering head when using huggingface transformers?

Using the TFBertForQuestionAnswering.from_pretrained() function, we get a predefined head on top of BERT, together with a loss function, suitable for this task.
My question is how to create a custom head without relying on TFAutoModelForQuestionAnswering.from_pretrained().
I want to do this because there is no place where the architecture of the head is explained clearly. By reading the code here we can see the architecture they are using, but I can't be sure I understand their code 100%.
Starting from How to Fine-tune HuggingFace BERT model for Text Classification is good. However, it covers only the classification task, which is much simpler.
'start_positions' and 'end_positions' are created following this tutorial.
So far, I've got the following:
import numpy as np
import tensorflow as tf
from transformers import TFAutoModel, create_optimizer

train_dataset
# Dataset({
#     features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
#     num_rows: 99205
# })
train_dataset.set_format(type='tensorflow', columns=['input_ids', 'token_type_ids', 'attention_mask'])
features = {x: train_dataset[x] for x in ['input_ids', 'token_type_ids', 'attention_mask']}
labels = [train_dataset[x] for x in ['start_positions', 'end_positions']]
labels = np.array(labels).T
tfdataset = tf.data.Dataset.from_tensor_slices((features, labels)).batch(16)

input_ids = tf.keras.layers.Input(shape=(256,), dtype=tf.int32, name='input_ids')
token_type_ids = tf.keras.layers.Input(shape=(256,), dtype=tf.int32, name='token_type_ids')
attention_mask = tf.keras.layers.Input((256,), dtype=tf.int32, name='attention_mask')
bert = TFAutoModel.from_pretrained("bert-base-multilingual-cased")
output = bert([input_ids, token_type_ids, attention_mask]).last_hidden_state
output = tf.keras.layers.Dense(2, name="qa_outputs")(output)
model = tf.keras.models.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=output)

num_train_epochs = 3
num_train_steps = len(tfdataset) * num_train_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01
)

def qa_loss(labels, logits):
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE
    )
    start_loss = loss_fn(labels[0], logits[0])
    end_loss = loss_fn(labels[1], logits[1])
    return (start_loss + end_loss) / 2.0

model.compile(
    loss=qa_loss,
    optimizer=optimizer
)
model.fit(tfdataset, epochs=num_train_epochs)
And I am getting the following error:
ValueError: `labels.shape` must equal `logits.shape` except for the last dimension. Received: labels.shape=(2,) and logits.shape=(256, 2)
It is complaining about the shape of the labels. This should not happen since I am using SparseCategoricalCrossentropy loss.
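For reference, the clash can be reproduced in isolation (a sketch of my understanding): SparseCategoricalCrossentropy expects labels shaped like the logits without the final class axis, so a per-example label pair of shape (2,) cannot line up with per-token logits of shape (256, 2).
import tensorflow as tf

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
logits = tf.random.normal((256, 2))  # per-token start/end logits for one example
labels = tf.constant([3, 17])        # one start index and one end index
# loss_fn(labels, logits)  # raises the ValueError quoted above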
For future reference, I actually found a solution, which is simply to edit the TFBertForQuestionAnswering class itself. For example, I added an additional layer in the following code, trained the model as usual, and it worked.
import tensorflow as tf
from transformers import TFBertPreTrainedModel
from transformers import TFBertMainLayer
from transformers.modeling_tf_utils import TFQuestionAnsweringLoss, get_initializer, input_processing
from transformers.modeling_tf_outputs import TFQuestionAnsweringModelOutput
from transformers import BertConfig

class MY_TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss):
    # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
    _keys_to_ignore_on_load_unexpected = [
        r"pooler",
        r"mlm___cls",
        r"nsp___cls",
        r"cls.predictions",
        r"cls.seq_relationship",
    ]

    def __init__(self, config: BertConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
        self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
        # This is the dense layer I added
        self.my_dense = tf.keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="my_dense",
        )
        self.qa_outputs = tf.keras.layers.Dense(
            units=config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="qa_outputs",
        )

    def call(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        start_positions=None,
        end_positions=None,
        training=False,
        **kwargs,
    ):
        r"""
        start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for the position (index) of the start of the labelled span, for computing the token
            classification loss. Positions are clamped to the length of the sequence (`sequence_length`);
            positions outside of the sequence are not taken into account for computing the loss.
        end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for the position (index) of the end of the labelled span, for computing the token
            classification loss. Positions are clamped to the length of the sequence (`sequence_length`);
            positions outside of the sequence are not taken into account for computing the loss.
        """
        inputs = input_processing(
            func=self.call,
            config=self.config,
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            start_positions=start_positions,
            end_positions=end_positions,
            training=training,
            kwargs_call=kwargs,
        )
        outputs = self.bert(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            token_type_ids=inputs["token_type_ids"],
            position_ids=inputs["position_ids"],
            head_mask=inputs["head_mask"],
            inputs_embeds=inputs["inputs_embeds"],
            output_attentions=inputs["output_attentions"],
            output_hidden_states=inputs["output_hidden_states"],
            return_dict=inputs["return_dict"],
            training=inputs["training"],
        )
        sequence_output = outputs[0]
        # You also have to apply the added layer here
        my_logits = self.my_dense(inputs=sequence_output)
        logits = self.qa_outputs(inputs=my_logits)
        start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1)
        start_logits = tf.squeeze(input=start_logits, axis=-1)
        end_logits = tf.squeeze(input=end_logits, axis=-1)

        loss = None
        if inputs["start_positions"] is not None and inputs["end_positions"] is not None:
            labels = {"start_position": inputs["start_positions"]}
            labels["end_position"] = inputs["end_positions"]
            loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits))

        if not inputs["return_dict"]:
            output = (start_logits, end_logits) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFQuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput:
        hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
        attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
        return TFQuestionAnsweringModelOutput(
            start_logits=output.start_logits, end_logits=output.end_logits, hidden_states=hs, attentions=attns
        )
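A usage sketch (my assumption, not shown in the original answer; the checkpoint name follows the question above):
# instantiate the custom head from the same pretrained checkpoint
model = MY_TFBertForQuestionAnswering.from_pretrained("bert-base-multilingual-cased")
# train as usual; passing start_positions/end_positions in the input dict lets
# the class compute the QA loss itself via hf_compute_loss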

ValueError: Error when checking target: expected dense_1 to have 2 dimensions, but got array with shape (1188, 12, 2)

When I try to build an LSTM network, it keeps telling me "ValueError: Error when checking target: expected dense_1 to have 2 dimensions, but got array with shape (1188, 12, 2)".
My dataset has more than 1000 samples and 2 features, and I set the time step to 12.
I have already reshaped my dataset to 3 dimensions; however, the error says that my last layer, the Dense output layer, expects a two-dimensional array. What should I do?
My code is as follows:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, BatchNormalization

# read train set
readColsPro = (7, 20)
filename = 'train_set.txt'
xProTrain_1 = readCsv.csvMat(filename, 1, cols=readColsPro, rows=[0, 1200])
yProTrain_1 = readCsv.csvMat(filename, 1, cols=readColsPro, rows=[1, 1201])
xProTrain_1 = xProTrain_1.reshape(xProTrain_1.shape[0], 2)
yProTrain_1 = yProTrain_1.reshape(yProTrain_1.shape[0], 2)

# replace NaN values with 0
for i in xProTrain_1:
    if np.isnan(i[1]):
        i[1] = 0
for i in yProTrain_1:
    if np.isnan(i[1]):
        i[1] = 0

# read test set
xProTest_1 = readCsv.csvMat(filename, 1, cols=readColsPro, rows=[1, 1201])
yProTest_1 = readCsv.csvMat(filename, 1, cols=readColsPro, rows=[2, 1202])
xProTest_1 = np.reshape(xProTest_1, (xProTest_1.shape[0], xProTest_1.shape[1]))
yProTest_1 = np.reshape(yProTest_1, (yProTest_1.shape[0], yProTest_1.shape[1]))
for i in xProTest_1:
    if np.isnan(i[1]):
        i[1] = 0
for i in yProTest_1:
    if np.isnan(i[1]):
        i[1] = 0

# parameters
timeStepPro = 12
epoch = 24
batch_size = 24
trainNumPro = xProTrain_1.shape[0]
testNumPro = yProTrain_1.shape[0]

# reshape train data to 3D windows
xProTrain_2 = []
for i in range(timeStepPro, trainNumPro):
    xProTrain_2.append(xProTrain_1[i - timeStepPro:i])
xProTrain_2 = np.array(xProTrain_2)
yProTrain_2 = []
for i in range(timeStepPro, trainNumPro):
    yProTrain_2.append(yProTrain_1[i - timeStepPro:i])
yProTrain_2 = np.array(yProTrain_2)
print(xProTrain_2.shape)
print(yProTrain_2.shape)

# reshape test data to 3D windows
xProTest_2 = []
for i in range(timeStepPro, trainNumPro):
    xProTest_2.append(xProTest_1[i - timeStepPro:i])
xProTest_2 = np.array(xProTest_2)
yProTest_2 = []
for i in range(timeStepPro, trainNumPro):
    yProTest_2.append(yProTest_1[i - timeStepPro:i])
yProTest_2 = np.array(yProTest_2)

# define network
modelA = Sequential()
modelA.add(LSTM(units=64, return_sequences=True,
                input_shape=[xProTrain_2.shape[1], 2]))
modelA.add(BatchNormalization())
modelA.add(LSTM(units=128, return_sequences=True))
modelA.add(LSTM(units=128, return_sequences=True))
modelA.add(LSTM(units=256, return_sequences=True))
modelA.add(LSTM(units=64, return_sequences=False))
modelA.add(Dense(units=2, activation='relu'))
modelA.compile(optimizer='adam',
               loss='mean_squared_error',
               metrics=['accuracy'])
modelA.fit(x=xProTrain_2, y=yProTrain_2, epochs=epoch, batch_size=batch_size)
The error message is as follows:
ValueError: Error when checking target: expected dense_1 to have 2 dimensions, but got array with shape (1188, 12, 2)
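For what it's worth, a sketch of where the shapes disagree (my reading of the code above, not a verified fix): the final LSTM has return_sequences=False, so the model emits one 2-feature vector per sample, while yProTrain_2 stacks 12-step windows into shape (1188, 12, 2). A two-dimensional target, e.g. one 2-feature vector per window, would match:
# build one 2-feature target per window instead of a 12-step target window
yProTrain_2 = []
for i in range(timeStepPro, trainNumPro):
    yProTrain_2.append(yProTrain_1[i])  # the value right after each input window
yProTrain_2 = np.array(yProTrain_2)     # shape (1188, 2) matches Dense(units=2)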

Error on tensorflow: Shape must be rank 2 but is rank 1 for 'MatMul_25'

I'm trying to create a conditional GAN. However, I'm stuck as to why, no matter what I do, the same error appears over and over again.
Here's the code:
import tensorflow as tf

image_dim = 784  # 28 * 28
Y_dimension = 10
gen_hidd_dim = 256
disc_hidd_dim = 256
z_noise_dim = 100  # dimensionality of the input noise

def xavier_init(shape):
    return tf.random_normal(shape=shape, stddev=1/tf.sqrt(shape[0]/2.0))

weights = {
    'disc_H': tf.Variable(xavier_init([image_dim + Y_dimension, disc_hidd_dim])),
    'disc_final': tf.Variable(xavier_init([disc_hidd_dim, 1])),
    'gen_H': tf.Variable([z_noise_dim + Y_dimension, gen_hidd_dim]),
    'gen_final': tf.Variable(xavier_init([gen_hidd_dim, image_dim]))
}
bias = {
    'disc_H': tf.Variable(xavier_init([disc_hidd_dim])),
    'disc_final': tf.Variable(xavier_init([1])),
    'gen_H': tf.Variable(xavier_init([gen_hidd_dim])),
    'gen_final': tf.Variable(xavier_init([image_dim]))
}

Z_input = tf.placeholder(tf.float32, shape=[None, z_noise_dim], name='input_noise')
Y_input = tf.placeholder(tf.float32, shape=[None, Y_dimension], name='Labels')
X_input = tf.placeholder(tf.float32, shape=[None, image_dim], name='real_input')

def Discriminator(x, y):
    inputs = tf.concat(axis=1, values=[x, y])
    hidden_layer = tf.nn.relu(tf.add(tf.matmul(inputs, weights['disc_H']), bias['disc_H']))
    final_layer = tf.add(tf.matmul(hidden_layer, weights['disc_final']), bias['disc_final'])
    disc_output = tf.nn.sigmoid(final_layer)
    return final_layer, disc_output

def Generator(x, y):
    inputs = tf.concat(axis=1, values=[x, y])
    hidden_layer = tf.nn.relu(tf.add(tf.matmul(tf.cast(inputs, tf.float32), tf.cast(weights['gen_H'], tf.float32)),
                                     tf.cast(bias['gen_H'], tf.float32)))
    final_layer = tf.add(tf.matmul(hidden_layer, weights['gen_final']), bias['gen_final'])
    gen_output = tf.nn.sigmoid(final_layer)
    return gen_output

output_Gen = Generator(Z_input, Y_input)
Right after executing the Generator, I get the following error:
ValueError: Shape must be rank 2 but is rank 1 for 'MatMul_25' (op: 'MatMul') with input shapes: [?,110], [2].
What to do?
I think you just missed one call to xavier_init() when initialising your weights.
You have this:
weights = {
    'disc_H': tf.Variable(xavier_init([image_dim + Y_dimension, disc_hidd_dim])),
    'disc_final': tf.Variable(xavier_init([disc_hidd_dim, 1])),
    'gen_H': tf.Variable([z_noise_dim + Y_dimension, gen_hidd_dim]),
    'gen_final': tf.Variable(xavier_init([gen_hidd_dim, image_dim]))
}
but I think you want this:
weights = {
    'disc_H': tf.Variable(xavier_init([image_dim + Y_dimension, disc_hidd_dim])),
    'disc_final': tf.Variable(xavier_init([disc_hidd_dim, 1])),
    'gen_H': tf.Variable(xavier_init([z_noise_dim + Y_dimension, gen_hidd_dim])),
    'gen_final': tf.Variable(xavier_init([gen_hidd_dim, image_dim]))
}
The error occurred because weights['gen_H'] had shape [2], whereas you expected it to have shape [110, 256]. The call to tf.matmul() therefore failed: it is impossible to matrix-multiply a matrix of shape [m, 110] by a tensor of shape [2].
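To see the difference directly (a small sketch, assuming TF 1.x as in the question): passing a plain Python list to tf.Variable stores the list itself as a rank-1 tensor, whereas drawing from xavier_init produces a rank-2 matrix of the requested shape.
import tensorflow as tf

bad = tf.Variable([110, 256])                      # rank 1, shape [2]: literally the values 110 and 256
good = tf.Variable(tf.random_normal([110, 256]))   # rank 2, shape [110, 256]
print(bad.shape, good.shape)                       # (2,) vs (110, 256)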

tensorflow nested map_fn concat two tensors

Say I have two tensors:
a=Tensor("zeros_3:0", shape=(2, 4, 5), dtype=float32)
b=Tensor("ones_3:0", shape=(2, 3, 5), dtype=float32)
How can I concatenate each pair of slices along the last axis to get a new tensor of shape (2, 3, 4, 10), using nested map_fn or other tf functions?
Here is my for-loop version:
concat_list = []
for i in range(a.get_shape()[1]):
    for j in range(b.get_shape()[1]):
        concat_list.append(tf.concat([a[:, i, :], b[:, j, :]], axis=1))
There is a similar question that uses a "new unit dimension", but I don't know how to use tf.concat with that "new unit dimension".
You can use tf.tile and tf.expand_dims with tf.concat. An example:
import tensorflow as tf

a = tf.random_normal(shape=(2, 4, 5), dtype=tf.float32)
b = tf.random_normal(shape=(2, 3, 5), dtype=tf.float32)

# your code
concat_list = []
for i in range(a.get_shape()[1]):
    for j in range(b.get_shape()[1]):
        concat_list.append(tf.concat([a[:, i, :], b[:, j, :]], axis=1))

# Application method
A = tf.tile(tf.expand_dims(a, axis=1), [1, b.shape[1], 1, 1])
B = tf.tile(tf.expand_dims(b, axis=2), [1, 1, a.shape[1], 1])
result = tf.concat([A, B], axis=-1)

with tf.Session() as sess:
    concat_list_val, result_val = sess.run([concat_list, result])
    print(concat_list_val[-1])
    print(result_val.shape)
    print(result_val[:, -1, -1, :])
# your result
[[ 1.0459949 1.5562199 -0.04387079 0.17898582 -1.9795663 0.988437
-0.40415847 0.8865694 -1.4764767 -0.8417388 ]
[-0.3542176 -0.3281141 0.01491702 0.91899025 -1.0651684 0.12315683
0.6555444 -0.80451876 -1.3260773 0.33680603]]
# Application result shape
(2, 3, 4, 10)
# Application result
[[ 1.0459949 1.5562199 -0.04387079 0.17898582 -1.9795663 0.988437
-0.40415847 0.8865694 -1.4764767 -0.8417388 ]
[-0.3542176 -0.3281141 0.01491702 0.91899025 -1.0651684 0.12315683
0.6555444 -0.80451876 -1.3260773 0.33680603]]
Performance
You can use the following code to compare speeds.
import datetime
...
with tf.Session() as sess:
    start = datetime.datetime.now()
    print('#' * 60)
    for i in range(10000):
        result_val = sess.run(result)
    end = datetime.datetime.now()
    print('cost time(seconds) : %.2f' % ((end - start).total_seconds()))

    start = datetime.datetime.now()
    print('#' * 60)
    for i in range(10000):
        concat_list_val = sess.run(concat_list)
    end = datetime.datetime.now()
    print('cost time(seconds) : %.2f' % ((end - start).total_seconds()))
On my GPU (8 GB of memory), 10,000 iterations of the vectorized method take 1.48 s versus 5.76 s for the loop when a.shape=(2,4,5) and b.shape=(2,3,5). When a.shape=(20,40,5) and b.shape=(20,40,5), the vectorized method takes 3.28 s while the loop takes 317.23 s. The vectorized method is significantly faster than tf.map_fn() or a Python loop.
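For completeness, a nested tf.map_fn version of the same concatenation is sketched below (illustrative; as noted, it is much slower than the tiling approach):
# tf.map_fn maps over axis 0, so move the iterated axes to the front
a_t = tf.transpose(a, [1, 0, 2])  # (4, 2, 5)
b_t = tf.transpose(b, [1, 0, 2])  # (3, 2, 5)
result_map = tf.map_fn(
    lambda bj: tf.map_fn(lambda ai: tf.concat([ai, bj], axis=-1), a_t),
    b_t)                                             # (3, 4, 2, 10)
result_map = tf.transpose(result_map, [2, 0, 1, 3])  # (2, 3, 4, 10)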

Tensorflow, ValueError: The two structures don't have the same nested structure

import tensorflow as tf

vocab_num = 4000
word_dim = 300
question_encode = None
answer_num = 1000
common_dim = 256
memory_dim = 256

question_encode = tf.placeholder(
    tf.int64, [None, None], 'question_encode')
with tf.variable_scope('embedding'):
    embedding_matrix = tf.get_variable(
        'embedding_matrix',
        [vocab_num, word_dim], regularizer=tf.nn.l2_loss)
    question_embedding = tf.nn.embedding_lookup(
        embedding_matrix, question_encode, name='word_embedding')
    print('question_embedding', question_embedding)

shape = tf.shape(question_encode)
batch_size = shape[0]
question_length = tf.constant(15)
time = tf.constant(0, name='time')
max_length = tf.constant(20)

q_cell = tf.nn.rnn_cell.LSTMCell(word_dim)
q_state = q_cell.zero_state(batch_size, tf.float32)
word_embed_W = tf.get_variable('word_embed_W', [word_dim, common_dim], regularizer=tf.nn.l2_loss)
word_embed_b = tf.get_variable('word_embed_b', [common_dim])
word_embedding = question_embedding[:, time]
out_ = tf.ones((1, 256))
time = tf.constant(0)
out = tf.zeros((max_length - question_length, 256))

def _one_step(time, q_state, word_list):
    """One time step of the model."""
    word_embedding = question_embedding[:, time]
    with tf.variable_scope('lstm_q'):
        q_output, q_state = q_cell(word_embedding, q_state)
    with tf.name_scope('transform_w'):
        word = tf.nn.xw_plus_b(
            word_embedding, word_embed_W, word_embed_b)
        word = tf.nn.tanh(word)
    word_list = tf.concat([word_list, word], axis=0)
    return time + 1, q_state, word_list

# main loop
time, q_state, out_ = tf.while_loop(
    cond=lambda time, *_: time < question_length,
    body=_one_step,
    loop_vars=[time, q_state, out_],
    shape_invariants=[time.get_shape(), tf.TensorShape([None, 256])]
)
word_list = tf.concat([out_, out], axis=0)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
res = sess.run(out)
This raises the following error:
ValueError: The two structures don't have the same nested structure.
First structure: type=list str=[<tf.Tensor 'Const_2:0' shape=() dtype=int32>, LSTMStateTuple(c=<tf.Tensor 'LSTMCellZeroState/zeros:0' shape=(?, 300) dtype=float32>, h=<tf.Tensor 'LSTMCellZeroState/zeros_1:0' shape=(?, 300) dtype=float32>), <tf.Tensor 'ones:0' shape=(1, 256) dtype=float32>]
Second structure: type=list str=[TensorShape([]), TensorShape([Dimension(None), Dimension(256)])]
What I was trying to achieve was a matrix with each word's transformed embedding concatenated together, but with q_state changing through the loop it turned out to be wrong. I have tried many approaches, all unsuccessfully, so I hope to get your help.
You pass three variables in loop_vars, but only two entries in shape_invariants, which is why the error says the two structures don't have the same nested structure. You just need to add the invariant for q_state.
# main loop
time, q_state, out_ = tf.while_loop(
    cond=lambda time, *_: time < question_length,
    body=_one_step,
    loop_vars=[time, q_state, out_],
    shape_invariants=[time.get_shape(),
                      tf.nn.rnn_cell.LSTMStateTuple(tf.TensorShape([None, 300]),
                                                    tf.TensorShape([None, 300])),
                      tf.TensorShape([None, 256])]
)