Keras Model using Tensorflow Distribution for loss fails with batch size > 1 - tensorflow

I'm trying to use a distribution from tensorflow_probability to define a custom loss function in Keras. More specifically, I'm trying to build a Mixture Density Network.
My model works on a toy dataset when batch_size = 1 (it learns to predict the correct mixture distribution for y using x). But it "fails" when batch_size > 1 (it predicts the same distribution for all y, ignoring x). This makes me think my problem has to do with batch_shape vs. sample_shape.
To reproduce:
import random
import keras
from keras import backend as K
from keras.layers import Dense, Activation, LSTM, Input, Concatenate, Reshape, concatenate, Flatten, Lambda
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.models import Sequential, Model
import tensorflow
import tensorflow_probability as tfp
tfd = tfp.distributions
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# generate toy dataset
random.seed(12902)
n_obs = 20000
x = np.random.uniform(size=(n_obs, 4))
df = pd.DataFrame(x, columns = ['x_{0}'.format(i) for i in np.arange(4)])
# 2 latent classes, with noisy assignment based on x_0, x_1, (x_2 and x_3 are noise)
df['latent_class'] = 0
df.loc[df.x_0 + df.x_1 + np.random.normal(scale=.05, size=n_obs) > 1, 'latent_class'] = 1
df.latent_class.value_counts()
# Latent class will determines which mixture distribution we draw from
d0 = tfd.MixtureSameFamily(
mixture_distribution=tfd.Categorical(probs=[0.3, 0.7]),
components_distribution=tfd.Normal(
loc=[-1., 1], scale=[0.1, 0.5]))
d0_samples = d0.sample(sample_shape=(df.latent_class == 0).sum()).numpy()
d1 = tfd.MixtureSameFamily(
mixture_distribution=tfd.Categorical(probs=[0.5, 0.5]),
components_distribution=tfd.Normal(
loc=[-2., 2], scale=[0.2, 0.6]))
d1_samples = d1.sample(sample_shape=(df.latent_class == 1).sum()).numpy()
df.loc[df.latent_class == 0, 'y'] = d0_samples
df.loc[df.latent_class == 1, 'y'] = d1_samples
fig, ax = plt.subplots()
bins = np.linspace(-4, 5, 9*4 + 1)
df.y[df.latent_class == 0].hist(ax=ax, bins=bins, label='Class 0', alpha=.4, density=True)
df.y[df.latent_class == 1].hist(ax=ax, bins=bins, label='Class 1', alpha=.4, density=True)
ax.legend();
# mixture density network
N_COMPONENTS = 2 # number of components in the mixture
input_feature_space = 4
flat_input = Input(shape=(input_feature_space,),
batch_shape=(None, input_feature_space),
name='inputs')
x = Dense(6, activation='relu',
kernel_initializer='glorot_uniform',
bias_initializer='ones')(flat_input)
x = Dense(6, activation='relu',
kernel_initializer='glorot_uniform',
bias_initializer='ones')(x)
# 3 params per component: weight, loc, scale
output = Dense(N_COMPONENTS*3,
kernel_initializer='glorot_uniform',
bias_initializer='ones')(x)
model = Model(inputs=[flat_input],
outputs=[output])
I suspect the problem is in the next 3 functions:
def get_mixture_coef(output, num_components):
"""
Extract mixture params from output
"""
out_pi = output[:, :num_components]
out_sigma = output[:, num_components:2*num_components]
out_mu = output[:, 2*num_components:]
# use softmax to normalize pi into prob distribution
max_pi = K.max(out_pi, axis=1, keepdims=True)
out_pi = out_pi - max_pi
out_pi = K.exp(out_pi)
normalize_pi = 1 / K.sum(out_pi, axis=1, keepdims=True)
out_pi = normalize_pi * out_pi
# use exp to ensure sigma is pos
out_sigma = K.exp(out_sigma)
return out_pi, out_sigma, out_mu
def get_lossfunc(out_pi, out_sigma, out_mu, y):
d0 = tfd.MixtureSameFamily(
mixture_distribution=tfd.Categorical(
probs=out_pi),
components_distribution=tfd.Normal(
loc=out_mu, scale=out_sigma,
),
)
# I suspect the problem is here
return -1 * d0.log_prob(y)
def mdn_loss(num_components):
def loss(y_true, y_pred):
out_pi, out_sigma, out_mu = get_mixture_coef(y_pred, num_components)
return get_lossfunc(out_pi, out_sigma, out_mu, y_true)
return loss
opt = Adam(lr=.001)
model.compile(
optimizer=opt,
loss = mdn_loss(N_COMPONENTS),
)
es = EarlyStopping(monitor='val_loss',
min_delta=1e-5,
patience=5,
verbose=1, mode='auto')
validation = .15
validate_idx = np.random.choice(df.index.values,
size=int(validation * df.shape[0]),
replace=False)
train_idx = [i for i in df.index.values if i not in validate_idx]
x_cols = ['x_0', 'x_1', 'x_2', 'x_3']
model.fit(x=df.loc[train_idx, x_cols].values,
y=df.loc[train_idx, 'y'].values[:, np.newaxis],
validation_data=(
df.loc[validate_idx, x_cols].values,
df.loc[validate_idx, 'y'].values[:, np.newaxis]),
# model works when batch_size = 1
# model fails when batch_size > 1
epochs=2, batch_size=1, verbose=1, callbacks=[es])
def sample(output, n_samples, num_components):
"""Sample from a mixture distribution parameterized by
model output."""
pi, sigma, mu = get_mixture_coef(output, num_components)
d0 = tfd.MixtureSameFamily(
mixture_distribution=tfd.Categorical(
probs=pi),
components_distribution=tfd.Normal(
loc=mu,
scale=sigma))
return d0.sample(sample_shape=n_samples).numpy()
yhat = model.predict(df.loc[train_idx, x_cols].values)
out_pi, out_sigma, out_mu = get_mixture_coef(yhat, 2)
latent_1_samples = sample(yhat[:1], n_samples=1000, num_components=2)
latent_1_samples = pd.DataFrame({'latent_1_samples': latent_1_samples.ravel()})
fig, ax = plt.subplots()
bins = np.linspace(-4, 5, 9*4 + 1)
latent_1_samples.latent_1_samples.hist(ax=ax, bins=bins, label='Class 1: yHat', alpha=.4, density=True)
df.y[df.latent_class == 0].hist(ax=ax, bins=bins, label='Class 0: True', density=True, histtype='step')
df.y[df.latent_class == 1].hist(ax=ax, bins=bins, label='Class 1: True', density=True, histtype='step')
ax.legend();
Thanks in advance!
Update
I found two ways to solve the problem, guided by this answer. Both solutions point to the fact that Keras is awkwardly broadcasting y to match y_pred:
def get_lossfunc(out_pi, out_sigma, out_mu, y):
d0 = tfd.MixtureSameFamily(
mixture_distribution=tfd.Categorical(
probs=out_pi),
components_distribution=tfd.Normal(
loc=out_mu, scale=out_sigma,
),
)
# this also works:
# return -1 * d0.log_prob(tensorflow.transpose(y))
return -1 * d0.log_prob(y[:, 0])

Specifying the workaround here (Answer Section) even though it is specified by Dan in the question, for the benefit of the Community.
The problem of predicting the same distribution for all y, ignoring x can be resolved in two ways.
Code for Solution 1 is mentioned below:
def get_lossfunc(out_pi, out_sigma, out_mu, y):
d0 = tfd.MixtureSameFamily(
mixture_distribution=tfd.Categorical(
probs=out_pi),
components_distribution=tfd.Normal(
loc=out_mu, scale=out_sigma,
),
)
return -1 * d0.log_prob(tensorflow.transpose(y))
Code for Solution 2 is mentioned below:
def get_lossfunc(out_pi, out_sigma, out_mu, y):
d0 = tfd.MixtureSameFamily(
mixture_distribution=tfd.Categorical(
probs=out_pi),
components_distribution=tfd.Normal(
loc=out_mu, scale=out_sigma,
),
)
return -1 * d0.log_prob(y[:, 0])
Hope this helps. Happy Learning!

Related

How to use this tutorial on time series forecasting (for beginners)

I am following TensorFlow’s tutorial on time series forecasting. I created and saved the model like in this tutorial. There are many examples in the manual for learning, but few uses of it.
How can I use the saved model in another script? How can I predict temperature, e.g., “01.01.2017 00:10:00”? How can I get the temperature value in a normal format?
Example code from the tutorial:
from datetime import datetime
import time
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.optimizers import Adam, RMSprop
import os
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
zip_path = tf.keras.utils.get_file(
origin='https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip',
fname='jena_climate_2009_2016.csv.zip',
extract=True)
csv_path, _ = os.path.splitext(zip_path)
df = pd.read_csv(csv_path)
# Slice [start:stop:step], starting from index 5 take every 6th record.
df = df[5::6]
date_time = pd.to_datetime(df.pop('Date Time'), format='%d.%m.%Y %H:%M:%S')
wv = df['wv (m/s)']
bad_wv = wv == -9999.0
wv[bad_wv] = 0.0
max_wv = df['max. wv (m/s)']
bad_max_wv = max_wv == -9999.0
max_wv[bad_max_wv] = 0.0
# The above inplace edits are reflected in the DataFrame.
df['wv (m/s)'].min()
wv = df.pop('wv (m/s)')
max_wv = df.pop('max. wv (m/s)')
# Convert to radians.
wd_rad = df.pop('wd (deg)')*np.pi / 180
# Calculate the wind x and y components.
df['Wx'] = wv*np.cos(wd_rad)
df['Wy'] = wv*np.sin(wd_rad)
# Calculate the max wind x and y components.
df['max Wx'] = max_wv*np.cos(wd_rad)
df['max Wy'] = max_wv*np.sin(wd_rad)
timestamp_s = date_time.map(pd.Timestamp.timestamp)
day = 24*60*60
year = (365.2425)*day
df['Day sin'] = np.sin(timestamp_s * (2 * np.pi / day))
df['Day cos'] = np.cos(timestamp_s * (2 * np.pi / day))
df['Year sin'] = np.sin(timestamp_s * (2 * np.pi / year))
df['Year cos'] = np.cos(timestamp_s * (2 * np.pi / year))
fft = tf.signal.rfft(df['T (degC)'])
f_per_dataset = np.arange(0, len(fft))
n_samples_h = len(df['T (degC)'])
hours_per_year = 24*365.2524
years_per_dataset = n_samples_h/(hours_per_year)
# f_per_year = f_per_dataset/years_per_dataset
# plt.step(f_per_year, np.abs(fft))
# plt.xscale('log')
# plt.ylim(0, 400000)
# plt.xlim([0.1, max(plt.xlim())])
# plt.xticks([1, 365.2524], labels=['1/Year', '1/day'])
# _ = plt.xlabel('Frequency (log scale)')
column_indices = {name: i for i, name in enumerate(df.columns)}
n = len(df)
train_df = df[0:int(n*0.7)]
val_df = df[int(n*0.7):int(n*0.9)]
test_df = df[int(n*0.9):]
num_features = df.shape[1]
train_mean = train_df.mean()
train_std = train_df.std()
train_df = (train_df - train_mean) / train_std
val_df = (val_df - train_mean) / train_std
test_df = (test_df - train_mean) / train_std
df_std = (df - train_mean) / train_std
# df_std = df_std.melt(var_name='Column', value_name='Normalized')
# plt.figure(figsize=(12, 6))
# ax = sns.violinplot(x='Column', y='Normalized', data=df_std)
# _ = ax.set_xticklabels(df.keys(), rotation=90)
# //////////////////////////////////////////////////////////////////////////////////////////////
class WindowGenerator():
def __init__(self, input_width, label_width, shift,
train_df=train_df, val_df=val_df, test_df=test_df,
label_columns=None):
# Store the raw data.
self.train_df = train_df
self.val_df = val_df
self.test_df = test_df
# Work out the label column indices.
self.label_columns = label_columns
if label_columns is not None:
self.label_columns_indices = {name: i for i, name in
enumerate(label_columns)}
self.column_indices = {name: i for i, name in
enumerate(train_df.columns)}
# Work out the window parameters.
self.input_width = input_width
self.label_width = label_width
self.shift = shift
self.total_window_size = input_width + shift
self.input_slice = slice(0, input_width)
self.input_indices = np.arange(self.total_window_size)[self.input_slice]
self.label_start = self.total_window_size - self.label_width
self.labels_slice = slice(self.label_start, None)
self.label_indices = np.arange(self.total_window_size)[self.labels_slice]
def __repr__(self):
return '\n'.join([
f'Total window size: {self.total_window_size}',
f'Input indices: {self.input_indices}',
f'Label indices: {self.label_indices}',
f'Label column name(s): {self.label_columns}'])
# //////////////////////////////////////////////////////////////////////////////////////////////
def split_window(self, features):
inputs = features[:, self.input_slice, :]
labels = features[:, self.labels_slice, :]
if self.label_columns is not None:
labels = tf.stack(
[labels[:, :, self.column_indices[name]] for name in self.label_columns],
axis=-1)
# Slicing doesn't preserve static shape information, so set the shapes
# manually. This way the `tf.data.Datasets` are easier to inspect.
inputs.set_shape([None, self.input_width, None])
labels.set_shape([None, self.label_width, None])
return inputs, labels
WindowGenerator.split_window = split_window
def plot(self, model=None, plot_col='T (degC)', max_subplots=3, num=None):
inputs, labels = self.example
plt.figure(figsize=(12, 8), num=num)
plot_col_index = self.column_indices[plot_col]
max_n = min(max_subplots, len(inputs))
for n in range(max_n):
plt.subplot(max_n, 1, n+1)
plt.ylabel(f'{plot_col} [normed]')
plt.plot(self.input_indices, inputs[n, :, plot_col_index],
label='Inputs', marker='.', zorder=-10)
if self.label_columns:
label_col_index = self.label_columns_indices.get(plot_col, None)
else:
label_col_index = plot_col_index
if label_col_index is None:
continue
plt.scatter(self.label_indices, labels[n, :, label_col_index],
edgecolors='k', label='Labels', c='#2ca02c', s=64)
if model is not None:
predictions = model(inputs)
plt.scatter(self.label_indices, predictions[n, :, label_col_index],
marker='X', edgecolors='k', label='Predictions',
c='#ff7f0e', s=64)
if n == 0:
plt.legend()
plt.xlabel('Time [h]')
WindowGenerator.plot = plot
def make_dataset(self, data):
data = np.array(data, dtype=np.float32)
ds = tf.keras.utils.timeseries_dataset_from_array(
data=data,
targets=None,
sequence_length=self.total_window_size,
sequence_stride=1,
shuffle=True,
batch_size=32,)
ds = ds.map(self.split_window)
return ds
WindowGenerator.make_dataset = make_dataset
#property
def train(self):
return self.make_dataset(self.train_df)
#property
def val(self):
return self.make_dataset(self.val_df)
#property
def test(self):
return self.make_dataset(self.test_df)
#property
def example(self):
"""Get and cache an example batch of `inputs, labels` for plotting."""
result = getattr(self, '_example', None)
if result is None:
# No example batch was found, so get one from the `.train` dataset
result = next(iter(self.train))
# And cache it for next time
self._example = result
return result
WindowGenerator.train = train
WindowGenerator.val = val
WindowGenerator.test = test
WindowGenerator.example = example
single_step_window = WindowGenerator(
input_width=1, label_width=1, shift=1,
label_columns=['T (degC)'])
MAX_EPOCHS = 20
def compile_and_fit(model, window, patience=2, save=False):
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
patience=patience,
mode='min')
model.compile(loss=tf.keras.losses.MeanSquaredError(),
optimizer=tf.keras.optimizers.Adam(),
metrics=[tf.keras.metrics.MeanAbsoluteError()])
history = model.fit(window.train, epochs=MAX_EPOCHS,
validation_data=window.val,
callbacks=[early_stopping])
if save == True:
model.save('./saved_model')
return history
dense = tf.keras.Sequential([
tf.keras.layers.Dense(units=64, activation='relu'),
tf.keras.layers.Dense(units=64, activation='relu'),
tf.keras.layers.Dense(units=1),
])
history = compile_and_fit(dense, single_step_window, save=True)
val_performance = {}
performance = {}
val_performance['Dense'] = dense.evaluate(single_step_window.val)
performance['Dense'] = dense.evaluate(single_step_window.test, verbose=0)
dense.summary()
Next I need to load the model and use:
from datetime import datetime
import time
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.optimizers import Adam, RMSprop
import os
import matplotlib.pyplot as plt
import seaborn as sns
loaded_model = tf.keras.models.load_model('./saved_model')
loaded_model.summary()
What's next?
There are many ways you may consider.
model.save() and model.load()
model.save_weights() and model.load_wights()
Export parameters as arrays. We discussed here no need for target API install. You can use any random function from the target machine with the parameter.
Transform and copy the working directory
etc.
Time conversation. You know that the database has unique time units, Oracle, Informix, and SQL. They can display as unix_time or binary sequences as designed in many applications. The unix_time in the communications response with a clock or response time to convert per security method if any. You need to synchronize the clock and response to the correct binary format or time format for extracting data when request or response information extraction, number as binary, and then string exports.
TAI (17 September 2004) UTC (16 to 17 September 2004) [Unix time][1]
2004-09-17T00:00:30.75 2004-09-16T23:59:58.75 1095379198.75
Sample: Export trained parameters from the model. It is easy to print out at any of the steps, but working coefficients respond to the target environment with significant feedback.
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Callback
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
class custom_callback(tf.keras.callbacks.Callback):
def on_epoch_end(self, epoch, logs={}):
if( logs['accuracy'] >= 0.95 ):
self.model.stop_training = True
def on_epoch_end(self, epoch, logs=None):
print( model.get_weights()[0].shape ) # (3, 3, 4, 32)
print( model.get_weights()[1].shape ) # (32,)
custom_callback = custom_callback()
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Training
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
history = model.fit( dataset, batch_size=100, epochs=10000, callbacks=[custom_callback] )

Learning a simple pattern with RNN

I am trying to make RNN in tensorflow capture a basic pattern in a simple time series in hours. I am trying to solve a bigger problem involving count time series of customer demand.
The simple time series is as follows:
Every 24 hours (1 day) there will be a small integer number either 1 or 2 from a random uniform distirbution.
In between these 24 hours will be zero values.
Every 168 hours (7 days) there will be a high integer number (5 or 6 or 7 or 8 or 9) from a random uniform distirbution.
I tried following the code at https://r2rt.com/recurrent-neural-networks-in-tensorflow-i.html using dynamic_rnn.
Is my test data correct? How can I feed the batches of output from previous times step as input to the next time step? I have 5 hyperparamters to play with
batch_size = 8 num_steps = 192 state_size = 5 learning_rate = 0.00001
num_epochs=1
However, after training each time with the same hyperparameters I am getting different results. Each time the training error is very small. The different results seem quite random (local minima probably??). orange is actual, blue is predicted.
Can my test batch start at any point in the sequence? Does the RNN learn the number of zeros inbetween non-zero values? if the test batch starts with a small non-zero number then the RNN should know that it should output 23 zero value steps after this and then after 167 steps output a high non-zero value. if I start my test sequence at 0 then it should wait 23 more zero value steps before outputing a small non-zero value and after 167 steps output a high non-zero value?
or does it learn another pattern? I am not sure if my method of testing is correct?
Is it better to just pass one time step integer value and let the network generate the remaining time steps integer values by passing the current time step output as input to the next time step?
Currently, I just take a random sequence of X generated by the same method for training and check if my output Y is the shifted version of X by 1 time step. Could you please explain?
My code is given below. you can just copy and paste and it should run. Basically, I just generate the data, build the model, train the network and test it.
from data_generator import gen_data
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
import numpy as np
import time
import matplotlib.pyplot as plt
num_classes = 11
batch_size = 8
num_steps = 192
state_size = 5
learning_rate = 0.00001
num_epochs=1
dem = gen_data(len=1576)
def gen_batch(dem, batch_size, num_steps):
raw_x = dem[:-1]
raw_y = dem[1:]
data_length = len(raw_x)
num_of_win = data_length - num_steps - 1 # 1382 windows
batch_partition_length = num_of_win // batch_size # 172 batches
data_x = []
data_y = []
j=0
for i in range(batch_partition_length):
windows_x = []
windows_y = []
k=0
while(k<batch_size):
windows_x.append( raw_x[ j:num_steps + j] )
windows_y.append( raw_y[ j:num_steps + j] )
j+=1
k+=1
data_x.append(np.array(windows_x)) # each batch is stacked horizontally.
data_y.append(np.array(windows_y))
for windows_x, windows_y in zip(data_x,data_x):
x = windows_x
y = windows_y
z = x.shape
z = y.shape
yield (x, y)
def gen_epoch(num_epochs,batch_size, num_steps):
for n in range(num_epochs):
yield gen_batch(dem, batch_size, num_steps)
def reset_graph():
# if 'sess' in globals() and sess:
# sess.close()
tf.compat.v1.reset_default_graph()
def build_RNN_model(batch_size, num_classes,state_size,num_steps,learning_rate):
reset_graph()
x = tf.compat.v1.placeholder(dtype=tf.int32, shape=(batch_size,num_steps))
y = tf.compat.v1.placeholder(dtype=tf.int32, shape=(batch_size,num_steps))
init_state = tf.zeros([batch_size, state_size])
# with tf.compat.v1.variable_scope('rnn_cell'):
# W = tf.compat.v1.get_variable('inp_state_w', shape=(num_classes+state_size,state_size),initializer=tf.compat.v1.initializers.glorot_uniform(10) )
# b = tf.compat.v1.get_variable('inp_state_b', shape=(state_size),initializer=tf.compat.v1.initializers.constant(0.0) )
# def rnn_cell(rnn_input,state):
# with tf.compat.v1.variable_scope('rnn_cell', reuse=True):
# W = tf.compat.v1.get_variable('inp_state_w', shape=(num_classes+state_size,state_size),initializer=tf.compat.v1.initializers.glorot_uniform(10) )
# b = tf.compat.v1.get_variable('inp_state_b', shape=(state_size),initializer=tf.compat.v1.initializers.constant(0.0) )
# return tf.tanh( tf.matmul( tf.concat([rnn_input,state], axis=1),W) + b )
#cell = tf.compat.v1.nn.rnn_cell.BasicRNNCell(state_size, reuse=True, name='rnn_cell' )
rnn_inputs = tf.one_hot(x, num_classes)
cell = tf.compat.v1.nn.rnn_cell.BasicRNNCell(state_size)
rnn_outputs, final_state = tf.compat.v1.nn.dynamic_rnn(cell, rnn_inputs, initial_state=init_state)
with tf.compat.v1.variable_scope('output'):
W = tf.compat.v1.get_variable('out_state_w', shape=(state_size,num_classes),initializer=tf.compat.v1.initializers.glorot_uniform(10) )
b = tf.compat.v1.get_variable('out_state_b', shape=(num_classes),initializer=tf.compat.v1.initializers.constant(0.0) )
logits = tf.reshape( tf.compat.v1.matmul(tf.reshape(rnn_outputs, [-1, state_size]), W) + b, [batch_size, num_steps, num_classes])
predictions = tf.compat.v1.nn.softmax(logits)
tru_labels = y
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
total_loss = tf.reduce_mean(losses)
train_step = tf.compat.v1.train.AdagradOptimizer(learning_rate).minimize(total_loss)
return dict(
x=x,
y=y,
final_state = final_state,
total_loss = total_loss,
train_step = train_step,
init_state = init_state,
predictions = predictions,
tru_labels = tru_labels,
saver = tf.compat.v1.train.Saver()
)
def train_network(g,num_epochs, batch_size,num_steps, dem,save=' '):
tf.compat.v1.set_random_seed(2345)
with tf.compat.v1.Session() as sess:
sess.run(tf.compat.v1.initialize_all_variables())
training_losses = []
for idx, epoch in enumerate(gen_epoch(num_epochs,batch_size, num_steps)):
training_loss = 0
steps=0 # number of batches
training_state = None
for X,Y in epoch:
steps+=1
feed_dict = {g['x'] : X, g['y'] : Y}
if training_state is not None:
feed_dict[g['init_state']] = training_state
training_loss_, training_state, train_step = \
sess.run([g['total_loss'], g['final_state'], g['train_step']], feed_dict)
training_loss+=training_loss_
print("Average training loss for Epoch", idx, ":", training_loss/steps)
print('steps',steps)
training_losses.append(training_loss/steps)
if isinstance(save, str):
g['saver'].save(sess, save)
e = gen_batch(dem, batch_size, num_steps)
e = gen_batch(dem, batch_size, num_steps)
for X,Y in e:
tru_labels, predictions = \
sess.run([g['tru_labels'], g['predictions']], feed_dict={g['x'] : X, g['y'] : Y, g['init_state'] : training_state})
pred = np.argmax(predictions, axis=2)
print(pred.shape)
pred = pred[0]
print('predictions',pred)
tru_labels = tru_labels[0]
print('tru_labels',tru_labels )
plt.plot(pred)
plt.plot(tru_labels)
plt.show()
return training_loss
g = build_RNN_model(batch_size, num_classes,state_size,num_steps,learning_rate)
t = time.time()
train_network(g, num_epochs,batch_size,num_steps, dem,save='saver' )
print("It took", time.time() - t, "seconds to train for 3 epochs.")
I have written some keras code with a single RNN cell and a dense layer to capture the following two patterns which is similar to the two patterns above. However, the distribution of magnitudes of high vehicles and low vehicles that are drawn from a categorical distribution below are not being represented in the test output.
Categorical Random Variable, x = {0,1,2} and p(x) = {0.6,0.3,0.1}
low vehicles = 1 + x , every 4 hours
high vehicles = 6 + x , every 8 hours
I managed to get the results like the following
with this code
from copyreg import pickle
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow.keras as keras
import sys
#### for reproduclvle resutls
from numpy.random import seed
seed(1)
import tensorflow
tensorflow.random.set_seed(2)
n_steps = 12
batch_size = 32
lay1_state_size = 64
lay2_state_size = 0
dense_state_size = 1
num_epochs = 25
horizon = 24
loss_function_type = 'sparse_categorical_crossentropy or mse or rmse'
num_layers = 1
optimizer_type = 'Adam'
metrics = 'rmse'
# spikes at regrular interval
dem = np.load('const_dem_2_freq_stoch.npy')
dem_len = len(dem)
def gen_batch(dem, batch_size, n_steps):
n = n_steps + 1
raw_x = dem[:-1]
data_length = len(raw_x)
num_of_win = data_length - n - 1 # 1382 windows
batch_partition_length = num_of_win // batch_size # 172 batches
#print('batch_partition_length',batch_partition_length)
data_x = []
j=0
for i in range(batch_partition_length):
windows_x = []
k=0
while(k<batch_size):
windows_x.append( raw_x[ j:n + j] )
j+=1
k+=1
data_x.append(np.array(windows_x)) # each batch is stacked horizontally.
data_x = np.array(data_x)
data_x = np.reshape(data_x,(-1,n)) # 224 x 13
#print(data_x.shape)
return data_x,batch_partition_length
data_x,batch_partition_length = gen_batch(dem, batch_size, n_steps)
data_x = np.expand_dims(data_x,axis=-1)
tr = int(0.7*dem_len)
val = int(0.2*dem_len)
x_train, y_train = data_x[:tr,:n_steps], data_x[:tr,-1]
x_valid, y_valid = data_x[tr:tr+val,:n_steps], data_x[tr:tr+val,-1]
print('\n\n')
print('tr+val',tr+val)
print('\n\n')
x_test, y_test = data_x[tr+val:,:n_steps], data_x[tr+val:,-1]
#model
model = keras.models.Sequential([keras.layers.SimpleRNN(lay1_state_size,input_shape=[None,1]), keras.layers.Dense(dense_state_size)])
# model = keras.models.Sequential([keras.layers.SimpleRNN(lay1_state_size,return_sequences=True,input_shape=[None,1]),keras.layers.SimpleRNN(lay2_state_size),
# keras.layers.Dense(dense_state_size)])
model.compile(optimizer='Adam',loss=keras.losses.mean_absolute_error ,metrics=[tf.keras.metrics.RootMeanSquaredError()] )
model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,validation_data=(x_valid,y_valid))
print('\n')
print('Model Evaluation on test set:\n')
model.evaluate(x_test, y_test,batch_size=batch_size)
print('\n')
#model.summary()
y_tru = np.array([])
for step_ahead in range(horizon):
# tru label
y = np.append(data_x[step_ahead+1:,n_steps ], np.array([[0]*(step_ahead+1)]))
y_tru = np.append(y_tru,y)
# prediction
y_pred_one = model.predict(data_x[:,step_ahead:])[:,np.newaxis,:]
data_x = np.concatenate([data_x,y_pred_one ],axis=1)
y_tru = np.reshape(y_tru,(batch_partition_length*batch_size,horizon),order='F')
y_pred_horizon = data_x[:,n_steps+1:]
y_pred_horizon = np.squeeze(y_pred_horizon)
print('print(y_pred_horizon.shape)',y_pred_horizon.shape)
print(' RNN prediction on all data MSE',np.mean(keras.losses.mean_squared_error(y_tru,y_pred_horizon )) )
print(' RNN prediction on all data MAE',np.mean(keras.losses.mean_absolute_error(y_tru,y_pred_horizon )) )
print('\n')
for i in range(10):
plt.figure(i)
plt.plot(y_tru[i])
plt.plot(np.squeeze(y_pred_horizon[i]))
plt.show()
The data generation code is given below
from copyreg import pickle
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow.keras as keras
dem_len = 1240
def categorical(p):
return (p.cumsum(-1) >= np.random.uniform(size=p.shape[:-1])[..., None]).argmax(-1)
p = np.array([0.6, 0.3, 0.1])
def dem_hr(hr, lo_veh, hi_veh,len):
dem_hrs = np.array([])
for i in range(10000):
#d = np.random.randint(lo_veh,hi_veh)
d = lo_veh + categorical(p)
z = np.array([0]*(hr-1))
dem_hrs = np.append(dem_hrs, d)
dem_hrs = np.append(dem_hrs, z)
dem_hrs = dem_hrs[:len]
return dem_hrs
def gen_data(len):
dzero = np.zeros(len)
# for hr,lo_veh, hi_veh in zip([4, 8],[1, 6],[3,9]):
# d = dem_hr(hr, lo_veh, hi_veh,len)
# dem = dem + d
# dem = np.array(dem,dtype=np.float32)
d4 = dem_hr(4, 1, 3,len)
d8 = dem_hr(8, 6, 9,len)
dall = dzero + d8
dsub = dall - d4
dem = np.where(dsub>=0,d8,d4)
# plt.plot(dem)
# plt.plot(d4)
# plt.plot(d8)
# plt.show()
return dem
dem = gen_data(len=dem_len)
np.save('const_dem_2_freq_stoch_cat',dem)
plt.plot(dem)
plt.show()
I think incresing the number of steps may help to capture the distribution of magnitudes at different periods. Does increasing the layers also help to capture the magnitude distribution?

PairNorm of gnn in tensorflow

I try to realize <PAIRNORM: TACKLING OVERSMOOTHING IN GNNS> in tensorflow using spektral, here is my code:
import numpy as np
from tensorflow.keras.layers import Dropout, Input, Dense, LayerNormalization, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
import tensorflow as tf
from tensorflow.keras.layers import Layer
import tensorflow.keras.backend as K
from spektral.layers import GATConv
class GraphNorm_PairNorm(Layer):
def __init__(self, **kwargs):
super(GraphNorm_PairNorm, self).__init__(**kwargs)
def call(self, inputs):
'''
# example:
np.random.seed(1)
x = np.random.randint(1, 20, [4, 3])
y = x[:3, :3]
y
(y - y.mean(axis=0))/y.std(axis=0)
y.mean(axis=0)
y.std(axis=0)
x = x.astype(np.float)
# array([[13, 3, 14],
# [ 7, 10, 5],
# [15, 2, 17],
# [15, 9, 8]]
a = np.eye(4)
a[:3,:3]=1
x = tf.constant(x)
a = tf.constant(a)
inputs = [x, a]
'''
x, a = inputs
input_shape = x.shape
ndims = len(input_shape)
n_nodes = input_shape[-2]
n_feas = input_shape[-1]
x = K.expand_dims(x, -1)
shape_x_tile = [1] * ndims + [n_nodes]
x = K.tile(x, shape_x_tile) # (n_graph, nodes, feas, nodes)
a = K.expand_dims(a, -2)
shape_a_tile = np.ones_like(a.shape)
shape_a_tile[-2] = n_feas
a = K.tile(a, shape_a_tile) # (n_graph, nodes, feas, nodes)
x_mask = x * a # (n_graph, nodes, feas, nodes)
x_len = tf.reduce_sum(a, -3, keepdims=True) #
x_mean = tf.reduce_sum(x_mask, -3, keepdims=True) / x_len
x2 = tf.square(x_mask - x_mean)
x2 = tf.where(tf.equal(a, 0), 0, x2)
x_std = tf.sqrt(tf.reduce_sum(x2, -3, keepdims=True) / x_len)
x_mean = tf.einsum("...ijk->...kj", x_mean)
x_std = tf.einsum("...ijk->...kj", x_std)
x_std = tf.where(tf.equal(x_std, 0), 1, x_std)
opt = (inputs[0] - x_mean) / x_std
return opt
# Parameters
a = np.ones([100, 100])
x = np.random.random([100, 100])
channels = 8 # Number of channels in each head of the first GAT layer
n_attn_heads = 8 # Number of attention heads in first GAT layer
dropout = 0.6 # Dropout rate for the features and adjacency matrix
l2_reg = 2.5e-4 # L2 regularization rate
learning_rate = 5e-3 # Learning rate
epochs = 20000 # Number of training epochs
patience = 100 # Patience for early stopping
N = x.shape[-2] # Number of nodes in the graph
F = x.shape[-1] # Original size of node features
n_out = 1 # 如果Number of classes
# Model definition
x_in = Input(shape=(F,)) # <tf.Tensor 'input_1:0' shape=(None, 1433) dtype=float32>
a_in = Input((None,), sparse=False) #
do_1 = Dropout(dropout)(x_in)
gc_1 = GATConv(
channels, #
attn_heads=n_attn_heads,
concat_heads=True,
dropout_rate=dropout, #
activation="elu",
kernel_regularizer=l2(l2_reg),
attn_kernel_regularizer=l2(l2_reg),
bias_regularizer=l2(l2_reg),
)([do_1, a_in])
gc_1 = GraphNorm_PairNorm()((gc_1, a_in))
out1 = Dense(1)(gc_1)
# Build model
model1 = Model(inputs=[x_in, a_in], outputs=out1)
model1((x, a)) #
I do want my model can deal with different graph with different number of nodes!!!
and the error is TypeError: Failed to convert elements of [1, 1, None] to Tensor. Consider casting elements to a supported type, how to fix this?

Batchnormalization in Keras vs PyTorch vs Numpy are different

I created the BatchNormalization layer in Keras, PyTorch and calculated the same operation using Numpy but I get three different results. Am I making some error here?
Things I assume below: layer.get_weights() in tf.keras for BN layer returns in order gamma, beta, running_mean, running_var. For the BN operation I am using the following operation: gamma * (x - running_mean) / sqrt(running_var + epsilon) + beta
Code snippet to reproduce the issue:
import torch
import tensorflow
from torch.nn import Module, BatchNorm1d, Conv1d
from torch.nn.functional import pad
import numpy as np
from tensorflow.keras.layers import Conv1D, BatchNormalization, Input
from tensorflow.keras.models import Model
torch.backends.cudnn.deterministic = True
np.random.seed(12345)
z = Input((1024, 8), dtype=np.float32)
inp = z
z = Conv1D(64, 16, padding='same', use_bias=False)(z)
z = BatchNormalization(epsilon=0.001)(z)
keras_model = Model(inp, z)
# in order: conv-layer weight, gamma, beta, running_mean, running_var
weights = [np.random.random((16, 8, 64)), np.random.random((64,)), np.random.random((64,)), np.random.random((64,)),
np.random.random((64,))]
weights = [np.array(x, dtype=np.float32) for x in weights]
keras_model.layers[1].set_weights([weights[0]])
keras_model.layers[2].set_weights(weights[1:])
keras_model_subpart = Model(keras_model.inputs, keras_model.layers[1].output)
class TorchModel(Module):
def __init__(self):
super(TorchModel, self).__init__()
self.l1 = Conv1d(8, 64, 16, bias=False)
self.l2 = BatchNorm1d(64, 0.001)
def forward(self, x):
x = pad(x, (7, 8))
x = self.l1(x)
y = x
x = self.l2(x)
return y, x
torch_model = TorchModel().to(torch.device('cpu'))
torch_model.l1.weight.data = torch.from_numpy(weights[0].T).float()
torch_model.l2.weight.data = torch.from_numpy(weights[1].T).float()
torch_model.l2.bias.data = torch.from_numpy(weights[2]).float()
torch_model.l2.running_mean = torch.from_numpy(weights[3]).float()
torch_model.l2.running_var = torch.from_numpy(weights[4]).float()
torch_model.eval()
input_value = np.array(np.random.random((1024, 8)), dtype=np.float32)
keras_results = [np.array(keras_model_subpart.predict(input_value[np.newaxis, :, :])),
np.array(keras_model.predict(input_value[np.newaxis, :, :]))]
with torch.no_grad():
torch_results = [x.detach().numpy() for x in torch_model(torch.from_numpy(input_value.T[np.newaxis, :, :]).float())]
keras_results = [np.squeeze(x) for x in keras_results]
torch_results = [np.squeeze(x) for x in torch_results]
numpy_results = weights[1] * (keras_results[0] - weights[3]) / np.sqrt(weights[4] + 0.001) + weights[2]
print(torch.__version__, tensorflow.__version__, np.__version__, sep=",")
print('\nRESULTS:')
print('\tLayer 1 difference:', np.mean(np.abs(keras_results[0] - torch_results[0].T).flatten()))
print('\tLayer 2 difference:', np.mean(np.abs(keras_results[1] - torch_results[1].T).flatten()))
print('\tLayer 2 keras - numpy:', np.mean(np.abs(keras_results[1] - numpy_results).flatten()))
print('\tLayer 2 torch - numpy:', np.mean(np.abs(torch_results[1] - numpy_results.T).flatten()))
The output I get (after all the initialization printing of tensorflow)
1.7.1+cu110,2.4.1,1.19.5
RESULTS:
Layer 1 difference: 0.0
Layer 2 difference: 6.8671216e-07
Layer 2 keras - numpy: 2.291581e-06
Layer 2 torch - numpy: 1.8929532e-06

Compound Poisson Keras custom loss function

I am trying to implement a custom loss function using Tensorflow as the negative loglikelihood of this expression (which is a compound Poisson-Gamma):
The first term (represented by the Dirac delta) refers to the case when z == 0, while the sum (which needs to be truncated at some point in the implementation as it goes to infinity) represents the product of the probability from a Gamma and a Poisson distribution.
This is the tentative implementation in Tensorflow:
import tensorflow as tf
import tensorflow_probability as tfp
from functools import partial
tf.enable_eager_execution()
import numpy as np
def pois_gamma_compound_loss(y_true, y_pred):
lambda_, alpha, beta = y_pred[:, 0], y_pred[:, 1], y_pred[:, 2]
poisson_distr = tfp.distributions.Poisson(rate=lambda_)
ijk_0 = (1.0, tf.zeros_like(y_true))
c = lambda i, p: i < 4
b = lambda i, p: (tf.add(i, 1),
p + tf.math.multiply(x = poisson_distr.prob(tf.zeros_like(y_true) + i),
y = tfp.distributions.Gamma(concentration=tf.math.multiply(x = alpha,
y = tf.zeros_like(y_true) + i),
rate=beta).prob(y_tru)))
ijk_final = tf.while_loop(c, b, ijk_0)
batch_lik = tf.add(ijk_final[1], tf.math.exp(tf.multiply(lambda_, -1.0)))
return -tf.reduce_mean(batch_lik)
inputs = Input(shape=(39,))
x = Dense(4, activation='relu', kernel_initializer='random_uniform')(inputs)
x = Dense(4, activation='relu', kernel_initializer='random_uniform')(inputs)
x = Dense(6, activation='relu', kernel_initializer='random_uniform')(inputs)
lambda_ = Dense(1, activation="softmax", name="lambda", kernel_initializer='random_uniform')(x)
alpha = Dense(1, activation="softmax", name="alpha", kernel_initializer='random_uniform')(x)
beta = Dense(1, activation="softmax", name="beta", kernel_initializer='random_uniform')(x)
output_params = Concatenate(name="pvec", axis=1)([lambda_, alpha, beta])
model = Model(inputs, output_params)
model.compile(loss=pois_gamma_compound_loss, optimizer='adam')
model.fit(X_train, y_train, epochs=60, batch_size=20)