Multivariate LSTM cross feature dependencies - tensorflow

I am working my way through handson-ml2, chapter 15 in particular.
I want to generalize the multiple-steps-ahead approach to multiple features and one target. To test my understanding, I create some series that follow either a sine wave or a cosine wave with some frequency.
`
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
# --- Synthetic data: one target series plus three helper features ---
start = -10
stop = 10
n_steps = 2 * (stop - start)  # 40 observed time steps
forecast_horizon = 10  # number of future values predicted at every time step
num_features = 4
n_samples = 10000
x_axis = np.linspace(start, stop, n_steps + forecast_horizon)

# One sine and one cosine wave per sample, each with its own random
# frequency drawn from [0, 1).
series_sin = np.stack([np.sin(np.random.rand() * x_axis) for _ in range(n_samples)])
series_cos = np.stack([np.cos(np.random.rand() * x_axis) for _ in range(n_samples)])

# Per-time-step 0/1 switch: 1 picks the sine value, 0 picks the cosine value.
rand = np.random.rand(n_samples, n_steps + forecast_horizon).round()
target = np.where(rand, series_sin, series_cos)

# Feature order on the last axis: target, switch, sine, cosine.
series = np.stack([target, rand, series_sin, series_cos], axis=2)
X_train = series[:7000, :n_steps]
X_valid = series[7000:9000, :n_steps]
X_test = series[9000:, :n_steps]

# Y[:, t, :] holds the `forecast_horizon` target values that follow time
# step t, so every input step has its own multi-step-ahead label.
Y = np.empty([n_samples, n_steps, forecast_horizon])
for step_ahead in range(1, n_steps + 1):
    Y[:, step_ahead - 1, :] = target[:, step_ahead:step_ahead + forecast_horizon]
y_train = Y[:7000]
y_valid = Y[7000:9000]
y_test = Y[9000:]
`
When plotting the target, sin and cos waves for some sample one sees that according to the rand array the target is either the sin wave or cos wave.
time series plot
Now I want to train a neural network forecasting the time series.
# Sequence-to-sequence forecaster: every one of the n_steps input steps emits
# a vector of `forecast_horizon` predictions (all recurrent layers use
# return_sequences=True).
model = tf.keras.models.Sequential([
    # Declare the input shape once, on the model's first element. Passing
    # input_shape to the LSTMs further down has no effect because they are
    # not the first layer.
    tf.keras.Input(shape=(n_steps, num_features)),
    tf.keras.layers.Dense(512, activation="linear"),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(
            128,
            return_sequences=True,
            activation="sigmoid")),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(
            64,
            return_sequences=True,
            activation="sigmoid")),
    tf.keras.layers.Dense(256, activation="linear"),
    tf.keras.layers.Dense(
        forecast_horizon,
        activation="linear"),
])
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss=tf.keras.losses.MeanSquaredError(),
)
model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=2,
)
My assumption was that the task would be very easy for the model to learn, since features 2, 3 and 4 are perfect predictors of the target. But as the plot below shows, the model does not learn any cross-feature dependencies.
time series plot with forecast
Any ideas?
cheers,
Felix

Related

Difference equation in LSTM network on Tensorflow

I'd like to use a LSTM network on Tensorflow to implement a difference equation. I searched on internet but I didn't find anything about this topic.
The equation is:
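$$y(k) = K\,\bigl[b_0\,x(k) + b_1\,x(k-1) + b_2\,x(k-2)\bigr] - a_1\,y(k-1) - a_2\,y(k-2)$$
(with $a_0 = 1$; $K$ is the gain applied in the code below)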
in which b=[1, 2, 1] and a=[1, -1.6641, 0.8387].
My aim is to use a neural network to find the relationship between input and output. Because computing the output at instant k also requires the previous inputs and outputs, my idea is to implement an LSTM network (many-to-one structure).
Suppose we have an input vector of 500 samples and use a window size of 5; the input of the LSTM network then has shape (500,5,1), while the output has shape (500,1,1).
The input and output of the first iteration are:
[0; x(k-4), x(k-3), x(k-2), x(k-1), x(k); 1] -> [1; y(k); 1]
formula
in the second iteration:
[0; x(k-3), x(k-2), x(k-1), x(k), x(k+1); 1] -> [1; y(k+1); 1]
formula
So I used an LSTM network with stateful set to True so that the network remembers past states, but it doesn't converge.
It seems to me that the idea is correct but I cannot see where I am going wrong. Could someone help me find the problem? I copy and paste the code below and the network is developed on Tensorflow.
# Difference equation: gain K applied to an IIR filter with numerator b and
# denominator a.
K = 0.0436
b = np.array([1, 2, 1])
a = np.array([1, -1.6641, 0.8387])


def _filter_response(u):
    """Return the scaled filter output K * lfilter(b, a, u) for input u."""
    return K * (signal.lfilter(b, a, u))


x = np.random.uniform(0, 1, 100)
y = _filter_response(x)

# Generate Dataset
# Training input: uniform noise in [0, 1).
X_train = np.random.uniform(0, 1, 100)
y_train = _filter_response(X_train)
# Validation input: constant ones.
X_val = np.ones(100)
y_val = _filter_response(X_val)
# Test input: uniform noise in [0.5, 0.8).
X_test = np.random.uniform(0.5, 0.8, 100)
y_test = _filter_response(X_test)
def get_x_split(data, windows_size):
    """Return a left-zero-padded sliding-window view of a 1-D sequence.

    Row ``i`` of the result holds the ``windows_size`` most recent samples up
    to and including ``data[i]``; positions before the start of ``data`` are
    filled with zeros.

    Args:
        data: 1-D sequence of numbers.
        windows_size: Number of samples per window; must be at least 1.

    Returns:
        Array of shape ``(len(data), windows_size)``. An empty ``data`` gives
        an array of shape ``(0, windows_size)``.

    Raises:
        ValueError: If ``windows_size`` is smaller than 1.
    """
    if windows_size < 1:
        raise ValueError("windows_size must be at least 1")
    values = np.asarray(data)
    # Prepend the zeros once instead of growing arrays with np.append on
    # every iteration (which is quadratic and breaks for windows_size == 1).
    padded = np.concatenate([np.zeros(windows_size - 1), values])
    # offsets[i, j] is the index into `padded` of element j of window i.
    offsets = np.arange(len(values))[:, np.newaxis] + np.arange(windows_size)
    return padded[offsets]
windows_size = 10
# Window each raw series, then append a trailing axis of length 1 so every
# array ends up with shape (rows, windows_size, 1).
X_train, X_val, X_test = (
    get_x_split(raw_series, windows_size)[..., np.newaxis]
    for raw_series in (X_train, X_val, X_test)
)
# Model Definition
activation_function = 'tanh'


def build_model():
    """Build the two-layer stateful LSTM regressor.

    The input has shape (X_train.shape[1], 1) with a fixed batch size of 1.
    Two single-unit stateful LSTM layers feed a one-unit ReLU Dense layer
    named 'Output'.

    Returns:
        The uncompiled Keras ``Model``.
    """
    inputs = Input(shape=(X_train.shape[1], 1), batch_size=1)
    # First LSTM returns the full sequence so the second one can consume it.
    hidden = LSTM(1, activation=activation_function,
                  return_sequences=True, stateful=True)(inputs)
    # Second LSTM keeps only its last output.
    hidden = LSTM(1, activation=activation_function,
                  return_sequences=False, stateful=True)(hidden)
    prediction = Dense(1, activation='relu', name='Output')(hidden)
    return Model(inputs=inputs, outputs=prediction)
model = build_model()
# Loss and metric are keyed by the name of the output layer ('Output').
rmse_metric = tf.keras.metrics.RootMeanSquaredError()
model.compile(
    optimizer=RMSprop(),
    loss={'Output': 'mse'},
    metrics={'Output': rmse_metric},
)

# Training
# One sample per batch, presented in their original order (shuffle=False).
fit_options = dict(
    batch_size=1,
    validation_data=(X_val, y_val),
    epochs=5000,
    verbose=1,
    shuffle=False,
)
history = model.fit(x=X_train, y=y_train, **fit_options)
# Test
# The model is built with a fixed batch size of 1, so prediction has to use
# the same batch size.
y_pred = model.predict(X_test, batch_size=1)
pred_samples = 400  # upper bound on the number of samples drawn
plt.figure(dpi=1200)
# y_test is 1-D (one value per sample) and y_pred has shape (samples, 1), so
# neither can be indexed with three indices.
plt.plot(y_test[:pred_samples], label='true', linewidth=0.8, alpha=0.5)
plt.plot(y_pred[:pred_samples, 0], label='pred')
plt.legend()
plt.grid()
plt.title("Test")
plt.show()

How can I compile batched training of a gpflow GPR into a tf.function?

I need to train a GPR model in multiple batches per epoch using a custom loss function. I would like to do this using GPflow and I would like to compile my training using tf.function to increase the efficiency. However, gpflow.GPR must be re-instantiated each time you supply new data, so tf.function will have to re-trace each time. This makes the code slower rather than faster.
This is the initial setup:
import numpy as np
from itertools import islice
import tensorflow as tf
import tensorflow_probability as tfp
tfb = tfp.bijectors
from sklearn.model_selection import train_test_split
import gpflow
from gpflow.kernels import SquaredExponential
import time
data_size = 1000
train_fract = 0.8
batch_size = 250
n_epochs = 3
iterations_per_epoch = int(train_fract * data_size/batch_size)
tf.random.set_seed(3)

# Generate dummy data: a ramp plus uniform noise in [0, 1)
x = np.arange(data_size)
y = x + np.random.rand(data_size)

# Slice into train and validate sets
x_train, x_validate, y_train, y_validate = train_test_split(
    x, y, random_state=1, test_size=1 - train_fract)


def _as_column_constant(values):
    """Return values as a float64 constant with a trailing axis of length 1."""
    return tf.constant(values[:, np.newaxis], dtype=np.float64)


# Convert data into tensorflow constants
x_train = _as_column_constant(x_train)
x_validate = _as_column_constant(x_validate)
y_train = _as_column_constant(y_train)
y_validate = _as_column_constant(y_validate)

# Batch data: shuffle, repeat indefinitely, then cut into batches
batched_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
batched_dataset = batched_dataset.shuffle(buffer_size=len(x_train), seed=1)
batched_dataset = batched_dataset.repeat(count=None)
batched_dataset = batched_dataset.batch(batch_size)

# Create kernel
# Exp followed by a tiny positive shift keeps the constrained values > 0.
constrain_positive = tfb.Shift(np.finfo(np.float64).tiny)(tfb.Exp())


def _positive_variable(value, name):
    """Create a float64 TransformedVariable constrained by constrain_positive."""
    return tfp.util.TransformedVariable(
        initial_value=value, bijector=constrain_positive,
        dtype=np.float64, name=name)


amplitude = _positive_variable(1, "amplitude")
len_scale = _positive_variable(10, "len_scale")
kernel = SquaredExponential(
    variance=amplitude, lengthscales=len_scale,
    name="squared_exponential_kernel")
obs_noise = _positive_variable(1e-3, "observation_noise")


# Define custom loss function
#tf.function(autograph=False, experimental_compile=False)
def my_custom_loss(y_predict, y_true):
    """Return the mean of the squared differences between the two tensors."""
    squared_error = tf.math.squared_difference(y_predict, y_true)
    return tf.math.reduce_mean(squared_error)


#optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
This is how I train without a tf.function:
# Initial model; its likelihood variance is handed on to the per-batch models.
gpr_model_j_i = gpflow.models.GPR(
    data=(x_train, y_train), kernel=kernel, noise_variance=obs_noise)

# Start training loop
for j in range(n_epochs):
    epoch_batches = islice(batched_dataset, iterations_per_epoch)
    for i, (x_train_j_i, y_train_j_i) in enumerate(epoch_batches):
        with tf.GradientTape() as tape:
            # A new GPR is built for every batch, reusing the shared kernel
            # and the previous model's likelihood variance.
            gpr_model_j_i = gpflow.models.GPR(
                data=(x_train_j_i, y_train_j_i),
                kernel=kernel,
                noise_variance=gpr_model_j_i.likelihood.variance)
            # predict_f returns several values; element 0 is what is scored.
            y_predict_j_i = gpr_model_j_i.predict_f(x_validate)[0]
            loss_j_i = my_custom_loss(y_predict_j_i, y_validate)
        grads_j_i = tape.gradient(loss_j_i, gpr_model_j_i.trainable_variables)
        optimizer.apply_gradients(
            zip(grads_j_i, gpr_model_j_i.trainable_variables))
This is how I train with a tf.function:
# The decorator must be active for this to be the tf.function variant the
# surrounding text describes. experimental_compile=False is omitted because
# it is the default.
@tf.function(autograph=False)
def tf_function_attempt_3(model): #, optimizer):
    """Run one gradient step of `model` against the validation data.

    Args:
        model: Object exposing `predict_f` and `trainable_variables`.
    """
    with tf.GradientTape() as tape:
        y_predict_j_i = model.predict_f(x_validate)[0]
        loss_j_i = my_custom_loss(y_predict_j_i, y_validate)
    grads_j_i = tape.gradient(loss_j_i, model.trainable_variables)
    optimizer.apply_gradients(zip(grads_j_i, model.trainable_variables))
    # A Python print only executes while the function is being traced, so
    # each occurrence in the output marks one (re)trace.
    print("TRACING...", end="")


for j in range(n_epochs):
    for i, (x_train_j_i, y_train_j_i) in enumerate(islice(batched_dataset, iterations_per_epoch)):
        # A new model object per batch is a new Python argument to the
        # tf.function, which is what triggers retracing.
        gpr_model_j_i = gpflow.models.GPR(
            data=(x_train_j_i, y_train_j_i),
            kernel=kernel,
            noise_variance=gpr_model_j_i.likelihood.variance)
        tf_function_attempt_3(gpr_model_j_i)#, optimizer)
The tf.function retraces for each batch and is significantly slower than the normal training.
Is there a way to speed up the batched training of my GPR model with tf.function while using a custom loss function and GPflow? If not, I am open to suggestions for an alternative approach.
You don't have to re-instantiate GPR each time. You can construct tf.Variable holders with unconstrained shape and then .assign to them:
import gpflow
import numpy as np
import tensorflow as tf
input_dim = 1
# Empty placeholders (or use your first batch)
initial_x = np.zeros((0, input_dim))
initial_y = np.zeros((0, 1))
# shape=(None, ...) leaves the first dimension unconstrained, so batches of
# different sizes can be assigned later.
x_var = tf.Variable(initial_x, shape=(None, input_dim), dtype=tf.float64)
y_var = tf.Variable(initial_y, shape=(None, 1), dtype=tf.float64)
# in principle you could also set shape=(None, None)...
m = gpflow.models.GPR((x_var, y_var), gpflow.kernels.SquaredExponential())
loss = m.training_loss_closure()  # compile=True default wraps in tf.function()


def _assign_batch(inputs, targets):
    """Overwrite the model's data variables in place with a new batch."""
    m.data[0].assign(inputs)
    m.data[1].assign(targets)


N1 = 3
x1, y1 = np.random.randn(N1, input_dim), np.random.randn(N1, 1)
_assign_batch(x1, y1)
loss()  # traces the first time

N2 = 7
x2, y2 = np.random.randn(N2, input_dim), np.random.randn(N2, 1)
_assign_batch(x2, y2)
loss()  # does not trace again

Tensorflow Custom Dataset - Add metadata as additional input to an image input processed by a CNN

I've got a working CNN model that classifies images from a custom dataset that is loaded with a csv file. The dataset is split up into training, validation and test dataset after being shuffled. Now I want to expand the image input by four extra input classes containing info / metadata about the images.
I've already learnt that I should split up my cnn model into two branches, one for the images and one for the extra input. My question is, how must I modify my data input so that the model can correctly process both images and additional input?
I'm very new to creating neural networks in tensorflow. My entire code is basically from this website. However, none of the topics could solve the problem for my code.
This is my code: (additional metadata are called usages, completions, heights, constructions)
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from keras.callbacks import History
import matplotlib.pyplot as plt
import sklearn.metrics
from sklearn.metrics import confusion_matrix
import seaborn as sns
import io
# READ IMAGES, METADATA AND LABELS
# sample(frac=1) returns all rows in random order, i.e. shuffles the table.
df = pd.read_csv('dataset.csv').sample(frac=1)
file_paths, labels, usages, completions, heights, constructions = (
    df[column].values
    for column in ('file_name', 'label', 'usage',
                   'completion', 'height', 'construction')
)

# SPLITTING THE DATASET INTO 80 % TRAINING DATA, 10 % VALIDATION DATA, 10 % TEST DATA
dataset_size = len(df.index)
train_size, val_size, test_size = (
    int(fraction * dataset_size) for fraction in (0.8, 0.1, 0.1)
)

img_height = 350
img_width = 350
batch_size = 16
autotune = tf.data.experimental.AUTOTUNE
# FUNCTION TO READ AND NORMALIZE THE IMAGES
def read_image(image_file, label, usg, com, hei, con):
    """Load one JPEG, resize and rescale it, and cast the metadata to float32.

    Args:
        image_file: Path of the JPEG file to read.
        label: Class label, passed through unchanged.
        usg, com, hei, con: Metadata values (usage, completion, height,
            construction).

    Returns:
        A 6-tuple ``(image, label, usg, com, hei, con)`` where ``image`` is
        float32 of size (img_height, img_width) with 3 channels, divided by
        255, and the four metadata values are float32.
    """
    raw = tf.io.read_file(image_file)
    image = tf.image.decode_jpeg(raw, channels=3)
    # tf.image.resize takes its size argument as (height, width).
    image = tf.image.resize(image, (img_height, img_width))
    image = tf.cast(image, tf.float32) / 255.0
    metadata = tuple(tf.cast(value, tf.float32) for value in (usg, com, hei, con))
    return (image, label, *metadata)
# FUNCTION FOR DATA AUGMENTATION
def augment(image, label, usg, com, hei, con):
    """Apply random photometric and flip augmentation to the image only.

    The second parameter was previously misspelled ``labeL`` while the body
    returned ``label``, which raised a NameError; it is passed positionally
    by ``Dataset.map``.

    Args:
        image: Image tensor with 3 channels.
        label, usg, com, hei, con: Label and metadata, returned unchanged.

    Returns:
        ``(image, label, usg, com, hei, con)`` with the augmented image.
    """
    # With probability 0.1, convert to grayscale; the single gray channel is
    # tiled back to 3 channels so the shape stays the same.
    if tf.random.uniform((), minval=0, maxval=1) < 0.1:
        image = tf.tile(tf.image.rgb_to_grayscale(image), [1, 1, 3])
    image = tf.image.random_brightness(image, max_delta=0.25)
    image = tf.image.random_contrast(image, lower=0.75, upper=1.25)
    image = tf.image.random_saturation(image, lower=0.75, upper=1.25)
    image = tf.image.random_flip_left_right(image)
    return image, label, usg, com, hei, con
# SETUP FOR TRAINING, VALIDATION & TEST DATASET
def pack_inputs(image, label, usg, com, hei, con):
    """Regroup one sample as ``((image, metadata), label)``.

    The four metadata scalars are stacked into one vector of length 4, so
    each element provides the model's two inputs followed by its target.
    """
    metadata = tf.stack([usg, com, hei, con], axis=-1)
    return (image, metadata), label


def slice_dataset(start, stop):
    """Return a dataset over rows ``[start:stop]`` of the CSV columns."""
    columns = (file_paths, labels, usages, completions, heights, constructions)
    return tf.data.Dataset.from_tensor_slices(
        tuple(column[start:stop] for column in columns))


# The datasets have to be created before they can be mapped.
ds_train = slice_dataset(0, train_size)
ds_val = slice_dataset(train_size, train_size + val_size)
ds_test = slice_dataset(train_size + val_size,
                        train_size + val_size + test_size)

# Training: cache the decoded images, then augment on every pass.
ds_train = ds_train.map(read_image, num_parallel_calls=autotune)
ds_train = ds_train.cache()
ds_train = ds_train.map(augment, num_parallel_calls=autotune)
ds_train = ds_train.map(pack_inputs, num_parallel_calls=autotune)
ds_train = ds_train.batch(batch_size)
ds_train = ds_train.prefetch(autotune)

ds_val = ds_val.map(read_image, num_parallel_calls=autotune)
ds_val = ds_val.map(pack_inputs, num_parallel_calls=autotune)
ds_val = ds_val.batch(batch_size)
ds_val = ds_val.prefetch(autotune)

ds_test = ds_test.map(read_image, num_parallel_calls=autotune)
ds_test = ds_test.map(pack_inputs, num_parallel_calls=autotune)
ds_test = ds_test.batch(batch_size)
ds_test = ds_test.prefetch(autotune)
## HOW TO SPLIT UP THE DATASET FOR THE MODEL FROM HERE? ##
# DEFINING FUNCTIONAL MODEL
# Image tensors are laid out as (height, width, channels).
input_img = keras.Input(shape=(img_height, img_width, 3))
# One value each for usage, completion, height and construction.
input_dat = keras.Input(shape=(4,))

# Image branch: four Conv -> BatchNorm -> MaxPool stages with growing width.
x = input_img
for filters in (16, 32, 64, 128):
    x = layers.Conv2D(filters, (3, 3), activation='relu',
                      kernel_regularizer=regularizers.l2(0.02),
                      padding='same')(x)
    x = layers.BatchNormalization(momentum=0.9)(x)
    x = layers.MaxPooling2D()(x)
out1 = layers.Flatten()(x)

# Metadata branch.
out2 = layers.Dense(128, activation='relu')(input_dat)

# Joint head on the concatenated branch outputs.
merge = layers.concatenate([out1, out2])
x = layers.Dense(256, activation='relu')(merge)
x = layers.Dropout(0.35)(x)
# Softmax gives a probability distribution over the 8 classes, which is what
# SparseCategoricalCrossentropy (from_logits=False) expects; independent
# sigmoids do not sum to 1.
output = layers.Dense(8, activation='softmax')(x)
model = keras.Model(inputs=[input_img, input_dat], outputs=output)

history = History()
no_overfit = keras.callbacks.EarlyStopping(
    monitor='val_loss',  # stop training when overfitting occurs
    min_delta=0.015, patience=1,
    verbose=2, mode='auto')

# TRAINING STEP
model.compile(
    optimizer=keras.optimizers.Adam(3e-5),
    loss=[keras.losses.SparseCategoricalCrossentropy()],
    metrics=["accuracy"])
model.fit(ds_train, epochs=30, callbacks=[no_overfit, history],
          verbose=1, validation_data=ds_val)
So far I've only added the extra inputs to the dataset tensor and changed the model structure. How exactly do I split my dataset into input_img and input_dat so that each model branch will receive their proper input?
Also, I have a custom test step that plots a confusion matrix. How should it be modified? Here is the working code for the image input only:
# Collect true and predicted labels batch by batch over the test set.
y_true = []
y_pred = []
for x, y in ds_test:
    y_true.append(y)
    predicts = model.predict(x)  # compute model predictions for test step
    y_pred.append(np.argmax(predicts, axis=-1))  # most probable class
true = tf.concat(y_true, axis=0)
pred = tf.concat(y_pred, axis=0)

cm = confusion_matrix(true, pred)  # confusion_matrix comes from sklearn.metrics
n_classes = cm.shape[0]
testacc = np.trace(cm) / float(np.sum(cm))  # calculating test accuracy
# Normalise each row by its number of true samples. A row without samples
# is divided by 1 so it stays all-zero instead of becoming NaN.
row_totals = cm.sum(axis=1)[:, np.newaxis]
cm = cm.astype('float') / np.maximum(row_totals, 1)

fig, ax = plt.subplots(figsize=(10, 10))
color = sns.light_palette("seagreen", as_cmap=False)
sns.heatmap(cm, annot=True, square=True, cmap=color, fmt=".3f",
            linewidths=0.6, linecolor='k', cbar_kws={"shrink": 0.8})
plt.yticks(rotation=0)
plt.xlabel('\nPredicted Labels', fontsize=18)
plt.ylabel('True Labels\n', fontsize=18)
plt.title('Multiclass Model - Confusion Matrix (Test Step)\n', fontsize=24)
plt.text(10, 1.1, 'Accuracy = {:0.4f}'.format(testacc), fontsize=20)
# Closing border lines are placed at the matrix size rather than a fixed 8.
ax.axhline(y=n_classes, color='k', linewidth=1.5)
ax.axvline(x=n_classes, color='k', linewidth=1.5)
plt.show()
print('\naccuracy: {:0.4f}'.format(testacc))
Any help is greatly appreciated!!

How to multiply a layer by a constant vector element wise in Keras?

I want to make a weighted average ensemble of 3 of my trained models. So, I want first to multiply the softmax output of a model (element-wise) by a vector and then average the 3 weighted outputs of the 3 models.
I used the following code to multiply the output of the first model by its weight vector:
from keras.layers import Multiply, Average
# Per-class weights for the model's outputs, as a (1, 11) float32 row.
resnet_weights = np.asarray(
    [[0.91855, 0.99485, 0.89065, 0.96525, 0.98005, 0.93645,
      0.6149, 0.934, 0.92505, 0.785, 0.85]],
    np.float32)
resnet_weight_tensor = tf.constant(resnet_weights, np.float32)

# Print the constant's value inside a temporary interactive session.
sess = tf.InteractiveSession()
print(resnet_weight_tensor.eval())
sess.close()

# Element-wise product of the last layer's output with the raw TF constant.
resnet_output = finetuned_model.layers[-1].output
resnet_weighted = Multiply()([resnet_output, resnet_weight_tensor])
print(resnet_weighted)
new_model = Model(model.input, resnet_weighted)
However, I'm stuck with the following error:
What can I do?
Use Lambda instead of Multiply, and K.constant instead of tf.constant (it is backend-neutral):
resnet_weight_tensor = K.constant(resnet_weights, 'float32')
out = finetuned_model.layers[-1].output


def _apply_resnet_weights(x):
    """Multiply the incoming tensor element-wise by the constant weights."""
    return x * resnet_weight_tensor


resnet_weighted = Lambda(_apply_resnet_weights)(out)
FULL EXAMPLE:
## BUILD MODELS
batch_size = 32
num_batches = 100
input_shape = (4,)
num_classes = 3
# Three classifiers that differ only in the width of their hidden layer.
model_1 = make_model(input_shape, 8, num_classes)
model_2 = make_model(input_shape, 10, num_classes)
model_3 = make_model(input_shape, 12, num_classes)

## BUILD ENSEMBLE
models = (model_1, model_2, model_3)
models_ins = [model.input for model in models]
# Outputs, not inputs (this list previously collected model.input again).
models_outs = [model.output for model in models]
# One random weight array per model, shaped (batch_size, num_classes).
outputs_weights = [np.random.random((batch_size, num_classes)),
                   np.random.random((batch_size, num_classes)),
                   np.random.random((batch_size, num_classes))]
outs_avg = model_outputs_average(models, outputs_weights)
final_out = Dense(num_classes, activation='softmax')(outs_avg)
model_ensemble = Model(inputs=models_ins, outputs=final_out)
model_ensemble.compile('adam', loss='categorical_crossentropy')

### TEST ENSEMBLE
x1 = np.random.randn(batch_size, *input_shape)  # toy data
x2 = np.random.randn(batch_size, *input_shape)
x3 = np.random.randn(batch_size, *input_shape)
y = np.random.randint(0, 2, (batch_size, num_classes))  # toy labels
model_ensemble.fit([x1, x2, x3], y)
Verify averaging:
# show layer names (a plain loop; a comprehension would build a throwaway
# list of None values)
for layer in model_ensemble.layers:
    print(layer.name)

ensemble_inputs = [x1, x2, x3]
preouts1 = get_layer_outputs(model_ensemble, 'lambda_1', ensemble_inputs)
preouts2 = get_layer_outputs(model_ensemble, 'lambda_2', ensemble_inputs)
preouts3 = get_layer_outputs(model_ensemble, 'lambda_3', ensemble_inputs)
preouts_avg = get_layer_outputs(model_ensemble, 'average_1', ensemble_inputs)

# The mean of the three weighted outputs should equal the Average layer's
# output, so the summed difference should be close to zero.
preouts = np.asarray([preouts1, preouts2, preouts3])
sum_of_diff_of_means = np.sum(np.mean(preouts, axis=0) - preouts_avg)
print(sum_of_diff_of_means)
# 4.69e-07
Functions used:
def make_model(input_shape, dense_dim, num_classes=3):
    """Build and compile a small dense softmax classifier.

    Args:
        input_shape: Shape of one input sample (without the batch dimension).
        dense_dim: Number of units in the hidden ReLU layer.
        num_classes: Number of softmax output units.

    Returns:
        The model, compiled with 'adam' and categorical cross-entropy.
    """
    inputs = Input(shape=input_shape)
    hidden = Dense(dense_dim, activation='relu')(inputs)
    probabilities = Dense(num_classes, activation='softmax')(hidden)
    classifier = Model(inputs, probabilities)
    classifier.compile('adam', loss='categorical_crossentropy')
    return classifier
def model_outputs_average(models, outputs_weights):
    """Weight every model's output element-wise and average the results.

    Args:
        models: Sequence of models whose outputs share one shape (ignoring
            the batch dimension).
        outputs_weights: One weight array per model; all of the same shape.

    Returns:
        Output tensor of an ``Average`` layer over the weighted outputs.

    Raises:
        AssertionError: If the output shapes, or the weight shapes, differ
            among themselves.
    """
    outs = [model.output for model in models]
    out_shape = K.int_shape(outs[0])[1:]  # ignore batch dim
    assert all(K.int_shape(out)[1:] == out_shape for out in outs), \
        "All model output shapes must match"

    outs_weights = [K.constant(w, 'float32') for w in outputs_weights]
    ow_shape = K.int_shape(outs_weights[0])
    assert all(K.int_shape(w) == ow_shape for w in outs_weights), \
        "All outputs_weights and model.output shapes must match"

    # Bind each weight as a default argument. A bare `lambda x: x * ow`
    # looks `ow` up when it is called, so any call made after this
    # comprehension has finished would use the last weight for every model.
    weights_layers = [Lambda(lambda x, ow=ow: x * ow)(out)
                      for ow, out in zip(outs_weights, outs)]
    return Average()(weights_layers)
def get_layer_outputs(model, layer_name, input_data, train_mode=False):
    """Evaluate the first layer whose name contains ``layer_name``.

    Args:
        model: Model whose layers are searched.
        layer_name: Substring matched against each layer's name.
        input_data: Data fed to the model's input(s).
        train_mode: Value passed as the learning phase (converted to int).

    Returns:
        The evaluated output of the first matching layer.
    """
    matching_outputs = [layer.output for layer in model.layers
                        if layer_name in layer.name]
    fetch = K.function([model.input, K.learning_phase()], matching_outputs)
    results = fetch([input_data, int(train_mode)])
    return results[0]
The bug is possibly caused by mixing the Keras API with the TensorFlow API: your resnet_weight_tensor is a tensor from the TensorFlow API, while finetuned_model.layers[-1].output is the output of a Keras layer. Some discussion can be found in issue 7362.
One workaround is to wrap resnet_weight_tensor in a Keras Input layer.
from keras.layers import Multiply, Average, Input
# Per-class weights for the model's outputs, as a (1, 11) float32 row.
resnet_weights = np.asarray(
    [[0.91855, 0.99485, 0.89065, 0.96525, 0.98005, 0.93645,
      0.6149, 0.934, 0.92505, 0.785, 0.85]],
    np.float32)
resnet_weight_tensor = tf.constant(resnet_weights, np.float32)
# Wrap the TF constant in a Keras Input so Multiply receives a Keras tensor.
resnet_weight_input = Input(tensor=resnet_weight_tensor)

# Print the constant's value inside a temporary interactive session.
sess = tf.InteractiveSession()
print(resnet_weight_tensor.eval())
sess.close()

resnet_output = finetuned_model.layers[-1].output
resnet_weighted = Multiply()([resnet_output, resnet_weight_input])
print(resnet_weighted)
# The wrapped constant is listed as an additional model input.
new_model = Model([model.input, resnet_weight_input], resnet_weighted)

LSTM to predict sine wave

Here I would like to create a tutorial on using LSTM in MXNet, based on the TensorFlow example located at https://github.com/mouradmourafiq/tensorflow-lstm-regression/blob/master/lstm_sin.ipynb.
Here is my major code
import mxnet as mx
import numpy as np
import pandas as pd
import argparse
import os
import sys
from data_processing import generate_data
import logging
head = '%(asctime)-15s %(message)s'
logging.basicConfig(level=logging.DEBUG, format=head)

TIMESTEPS = 3
BATCH_SIZE = 100
X, y = generate_data(np.sin, np.linspace(0, 100, 10000), TIMESTEPS, seperate=False)

# Training and validation iterators expose their labels under 'lro_label',
# the label name the module below is created with.
train_iter = mx.io.NDArrayIter(X['train'], y['train'], batch_size=BATCH_SIZE,
                               shuffle=True, label_name='lro_label')
eval_iter = mx.io.NDArrayIter(X['val'], y['val'], batch_size=BATCH_SIZE,
                              shuffle=False, label_name='lro_label')
test_iter = mx.io.NDArrayIter(X['test'], batch_size=BATCH_SIZE, shuffle=False)

num_layers = 3
num_hidden = 50
data = mx.sym.Variable('data')
label = mx.sym.Variable('lro_label')

# Stack of LSTM cells, each with its own parameter prefix.
stack = mx.rnn.SequentialRNNCell()
for i in range(num_layers):
    stack.add(mx.rnn.LSTMCell(num_hidden=num_hidden, prefix='lstm_l%d_' % i))
#stack.reset()
outputs, states = stack.unroll(length=TIMESTEPS,
                               inputs=data,
                               layout='NTC',
                               merge_outputs=True)
# Flatten the unrolled outputs to one row per sample.
outputs = mx.sym.reshape(outputs, shape=(BATCH_SIZE, -1))
# purpose of fc1 was to make shape change to (batch_size, *), or label shape won't match LSTM unrolled output shape.
outputs = mx.sym.FullyConnected(data=outputs, num_hidden=1, name='fc1')
label = mx.sym.reshape(label, shape=(-1,))
outputs = mx.sym.LinearRegressionOutput(data=outputs,
                                        label=label,
                                        name='lro')

contexts = mx.cpu(0)
model = mx.mod.Module(symbol=outputs,
                      data_names=['data'],
                      label_names=['lro_label'],
                      context=contexts)  # previously defined but never used
model.fit(train_iter, eval_iter,
          # This is a regression task, so report mean squared error rather
          # than the default classification accuracy.
          eval_metric='mse',
          optimizer_params={'learning_rate': 0.005},
          num_epoch=4,
          # auto_reset=False stops Speedometer from clearing the metric
          # after each log line, which otherwise leaves the reported
          # training metric as NaN.
          batch_end_callback=mx.callback.Speedometer(BATCH_SIZE, 2, auto_reset=False))
This code runs, but the train accuracy is NaN.
How can I fix this?
Also, since the unrolled output shape includes the sequence length, how can it match the label shape? Does my fc1 layer make sense?
Pass auto_reset=False to the Speedometer callback, i.e. batch_end_callback=mx.callback.Speedometer(BATCH_SIZE, 2, auto_reset=False); this should fix the NaN train accuracy.