Keras/TensorFlow MNIST DCGAN: why does the generator have almost zero loss from the start?

I have constructed a DCGAN (deep convolutional generative adversarial network) inspired by this GitHub repository. It is written in fairly low-level TensorFlow code, which I tried translating into Keras syntax instead.
Now, the network is quite heavy, I think (around 4 million parameters), and during training I run into the problem that the generator network beats the discriminator network by a lot. I have not found any similar posts about this, since most of the time it is the discriminator that beats the generator (while in fact being fooled), or there is mode collapse. So I am thinking there might be something wrong in the code (maybe the discriminator network is training when it shouldn't, or the loss function is wrong, etc.). I have tried to spot the mistake but failed.
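For reference, the generic Keras GAN recipe I have been trying to follow looks roughly like this (a minimal sketch that assumes D and G already exist as Sequential models; it is not my actual code):
from keras.models import Sequential
from keras.optimizers import Adam

# assume D and G are the discriminator and generator Sequential models
# compile D on its own so it learns when trained directly via D.train_on_batch(...)
D.compile(loss='binary_crossentropy', optimizer=Adam(0.0002, 0.5))
# freeze D *before* compiling the stacked model, so that training the
# stacked model only updates G's weights
D.trainable = False
GAN = Sequential([G, D])
GAN.compile(loss='binary_crossentropy', optimizer=Adam(0.0002, 0.5))
# per batch: first D.train_on_batch(real_plus_fake_images, real_plus_fake_labels),
# then GAN.train_on_batch(noise, ones) to push G through the frozen D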
My code follows below:
from keras.models import Sequential
from keras.layers import Dense, Reshape, ReLU, LeakyReLU, BatchNormalization as BN#, tanh, sigmoid
from keras.layers.core import Activation, Flatten
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import UpSampling2D, Conv2D, MaxPooling2D, Conv2DTranspose
from keras.optimizers import SGD, Adam
from keras.datasets import mnist
import time
import numpy as np
import math
from utils import load_mnist, load_lines, load_celebA
class dcgan(object):
    def __init__(self, config):
        """
        Args:
            batch_size: The size of batch. Should be specified before training.
            y_dim: (optional) Dimension of dim for y. [None]
            z_dim: (optional) Dimension of dim for Z. [100]
            gf_dim: (optional) Dimension of G filters in first conv layer. [64]
            df_dim: (optional) Dimension of D filters in first conv layer. [64]
            gfc_dim: (optional) Dimension of G units for fully connected layer. [1024]
            dfc_dim: (optional) Dimension of D units for fully connected layer. [1024]
            c_dim: (optional) Dimension of image color. For grayscale input, set to 1. [3]
        """
        self.build_model(config)
    def build_model(self, config):
        self.D = self.discriminator(config)
        self.G = self.generator(config)
        self.GAN = Sequential()
        self.GAN.add(self.G)
        self.D.trainable = False
        self.GAN.add(self.D)

    def discriminator(self, config):
        input_shape = (config.x_h, config.x_w, config.x_d)
        D = Sequential()
        D.add(Conv2D(filters=config.df_dim, strides=2, padding='same', kernel_size=5, input_shape=input_shape))
        D.add(LeakyReLU(alpha=0.2))
        D.add(Conv2D(filters=config.df_dim*2, strides=2, padding='same', kernel_size=5))
        D.add(BN(momentum=0.9, epsilon=1e-5))
        D.add(LeakyReLU(alpha=0.2))
        D.add(Conv2D(filters=config.df_dim*4, strides=2, padding='same', kernel_size=5))
        D.add(BN(momentum=0.9, epsilon=1e-5))
        D.add(LeakyReLU(alpha=0.2))
        D.add(Conv2D(filters=config.df_dim*8, strides=2, padding='same', kernel_size=5))
        D.add(BN(momentum=0.9, epsilon=1e-5))
        D.add(LeakyReLU(alpha=0.2))
        D.add(Flatten())
        D.add(Dense(1))
        D.add(Activation('sigmoid'))
        print('D:')
        D.summary()
        return D

    def generator(self, config):
        G = Sequential()
        G.add(Dense(input_dim=config.z_dim, units=config.gf_dim*8*4*4))
        G.add(Reshape((4, 4, config.gf_dim*8)))
        G.add(BN(momentum=0.9, epsilon=1e-5))
        G.add(ReLU())
        G.add(Conv2DTranspose(filters=config.gf_dim*4, strides=2, padding='same', kernel_size=5))
        G.add(BN(momentum=0.9, epsilon=1e-5))
        G.add(ReLU())
        G.add(Conv2DTranspose(filters=config.gf_dim*2, strides=2, padding='same', kernel_size=5))
        G.add(BN(momentum=0.9, epsilon=1e-5))
        G.add(ReLU())
        if config.dataset not in ['mnist', 'lines']:
            # more layers could (and should) be added in order to get correct output size of G
            G.add(Conv2DTranspose(filters=config.gf_dim, strides=2, padding='same', kernel_size=5))
            G.add(BN(momentum=0.9, epsilon=1e-5))
            G.add(ReLU())
        G.add(Conv2DTranspose(filters=config.c_dim, strides=2, padding='same', kernel_size=5))
        G.add(Activation('tanh'))
        print('G:')
        G.summary()
        return G
    def train(self, config):
        if config.dataset == 'mnist':
            (X_train, y_train), (X_test, y_test) = load_mnist()
            X_train = (X_train.astype(np.float32) - 127.5)/127.5
        elif config.dataset == 'lines':
            (X_train, y_train), (X_test, y_test) = load_lines()
        elif config.dataset == 'celebA':
            (X_train, y_train), (X_test, y_test) = load_celebA()
        D_optim = Adam(learning_rate=config.learning_rate, beta_1=config.beta_1)
        G_optim = Adam(learning_rate=config.learning_rate, beta_1=config.beta_1)
        loss_f = 'binary_crossentropy'
        # Compile models
        self.D.compile(loss=loss_f, optimizer=D_optim)
        self.D.trainable = True
        self.G.compile(loss=loss_f, optimizer=G_optim)
        self.GAN.compile(loss=loss_f, optimizer=G_optim)
        batches = int(len(X_train)/config.batch_size)  # int always rounds down --> no problem with running out of data
        counter = 1
        print('\n' * 1)
        print('='*42)
        print('-'*10, 'Training initialized.', '-'*10)
        print('='*42)
        print('\n' * 2)
        start_time = time.time()
        for epoch in range(config.epochs):
            for batch in range(batches):
                batch_X_real = X_train[int(batch*config.batch_size/2):int((batch+1)*config.batch_size/2)][np.newaxis].transpose(1, 2, 3, 0)
                batch_z = np.random.normal(0, 1, size=(config.batch_size, config.z_dim))
                batch_X_fake = self.G.predict(batch_z[0:int(config.batch_size/2)])
                batch_X = np.concatenate((batch_X_real, batch_X_fake), axis=0)
                batch_yd = np.concatenate((np.ones(int(config.batch_size/2)), np.zeros((int(config.batch_size/2)))))
                batch_yg = np.ones((config.batch_size))
                # maybe normalize values in X?
                # Update D network
                self.D.trainable = True
                D_loss = self.D.train_on_batch(batch_X, batch_yd)
                # Update G network
                self.D.trainable = False
                G_loss = self.GAN.train_on_batch(batch_z, batch_yg)
                # Update G network again according to https://github.com/carpedm20/DCGAN-tensorflow.git
                # G_loss = self.GAN.train_on_batch(batch_z, batch_yg)
                # Time the run
                # print("[%8d Epoch:[%2d/%2d] [%4d/%4d] time: %4.4f, d_loss: %.8f, g_loss: %.8f" \
                #     % (counter, epoch, config.epoch, idx, batch_idxs,
                #     time.time() - start_time, errD_fake+errD_real, errG))
                # Save losses to vectors in order to plot
                # Print status and save images for each config.sample_freq iterations
                if np.mod(counter, config.sample_freq) == 0:
                    print('Epoch: {}/{} | Batch: {}/{} | D-loss {} | G-loss {} | Time: {}'.format(epoch+1, config.epochs, batch+1, batches, D_loss, G_loss, time.time() - start_time))
                counter += 1
        print('\n' * 2)
        print('='*38)
        print('-'*10, 'Training complete.', '-'*10)
        print('='*38)
The program runs slowly, but if you run it using this chunk of code:
# import model
from setup import model_config
# create configuration object
config = model_config(dataset='mnist', loadmodel=False, interpolation=False, epochs=20, batch_size=64,
                      z_dim=100, gf_dim=64, df_dim=64, gfc_dim=1024, dfc_dim=1024,
                      c_dim=1, sample_freq=10)  # >> model=None << new parameter!
if config.loadmodel:
    # Pass the model to the model parameter in config, not sure how to do this yet
    # model1 = LoadModel('Generator')
    # model2 = LoadModel('Discriminator')
    # model3 = LoadModel('DG')
    # load existing model
    pass
else:
    dcgan = dcgan(config)
    dcgan.train(config)
if config.interpolation:
    # do interpolation
    pass
it will start printing out progress and losses. I am certain there is some obvious error somewhere! If I have missed something, let me know what I can add in order to make this a better post!

Related

Time-Series LSTM Model wrong prediction

I am practicing how to create an LSTM model on a univariate series using this dataset from Kaggle: https://www.kaggle.com/sumanthvrao/daily-climate-time-series-data
My issue is that I am unable to get an accurate prediction of the temperature, and my loss seems to be going all over the place. I have tried multiple methods, including:
Ensuring that time series data is stationary
Changing the time steps
Changing the hyperparameters
Using a stacked LSTM model
I am really curious as to what is wrong with my code, although I do have a few hypotheses:
I made an error when preprocessing the data
I introduced stationarity wrongly (see the small check after this list)
This dataset requires a multivariate approach
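To make the second hypothesis concrete, this is the round trip I expect the log-differencing and the reconstruction at the end of the script to perform (a small sanity check on a toy series, not part of the original pipeline):
import numpy as np

y = np.array([10.0, 12.0, 11.5, 13.0])   # toy temperature series
s = np.diff(np.log(y))                    # stationary series: s_t = log(y_t) - log(y_{t-1})
# reconstruction: y_t = exp(log(y_0) + cumulative sum of the differences)
y_back = np.exp(np.cumsum(np.concatenate(([np.log(y[0])], s))))
print(np.allclose(y, y_back))             # True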
%tensorflow_version 2.x # this line is not required unless you are in a notebook
import tensorflow as tf
from numpy import array
from numpy import argmax
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
# preparing independent and dependent features
def prepare_data(timeseries_data, n_features):
    X, y = [], []
    for i in range(len(timeseries_data)):
        # find the end of this pattern
        end_ix = i + n_features
        # check if we are beyond the sequence
        if end_ix > len(timeseries_data)-1:
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = timeseries_data[i:end_ix], timeseries_data[end_ix]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)
# preparing independent and dependent features
def prepare_x_input(timeseries_data, n_features):
    x = []
    for i in range(len(timeseries_data)):
        # find the end of this pattern
        end_ix = i + n_features
        # check if we are beyond the sequence
        if end_ix > len(timeseries_data):
            break
        # gather input and output parts of the pattern
        seq_x = timeseries_data[i:end_ix]
        x.append(seq_x)
    x = x[-1:]
    # remove non-stationarity
    #x = np.log(x)
    return np.array(x)
#read data and filter temperature column
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Weather Parameter/DailyDelhiClimateTrain.csv')
df.head()
temp_df = df.pop('meantemp')
plt.plot(temp_df)
#make data stationary
sta_temp_df = np.log(temp_df).diff()
plt.figure(figsize=(15,5))
plt.plot(sta_temp_df)
print(sta_temp_df)
time_step = 7
x, y = prepare_data(sta_temp_df, time_step)
n_features = 1
x = x.reshape((x.shape[0], x.shape[1], n_features))
model = Sequential()
model.add(LSTM(10, return_sequences=True, input_shape=(time_step, n_features)))
model.add(LSTM(10))
model.add(Dense(16, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
model.summary()
result = model.fit(x, y, epochs=800)
n_days = 113
pred_temp_df = list(temp_df)
test = sta_temp_df.copy()
sta_temp_df = list(sta_temp_df)
i = 0
while(i<n_days):
    x_input = prepare_x_input(sta_temp_df, time_step)
    print(x_input)
    x_input = x_input.reshape((1, time_step, n_features))
    #pass data into model
    yhat = model.predict(x_input, verbose=0)
    yhat.flatten
    print(yhat[0][0])
    sta_temp_df.append(yhat[0][0])
    i = i+1
sta_temp_df[0] = np.log(temp_df[0])
cum_temp_df = np.exp(np.cumsum(sta_temp_df))
print(cum_temp_df)
My code is shown above. I would really appreciate it if someone could identify what I did wrong here!

Loss function with derivative in TensorFlow 2

I am using a TF2 (2.3.0) NN to approximate the function y which solves the ODE y' + 3y = 0.
I have defined a custom loss class and function in which I try to differentiate the single output with respect to the single input so that the equation holds, given that y_true is zero:
from tensorflow.keras.losses import Loss
import tensorflow as tf
class CustomLossOde(Loss):
    def __init__(self, x, model, name='ode_loss'):
        super().__init__(name=name)
        self.x = x
        self.model = model

    def call(self, y_true, y_pred):
        with tf.GradientTape() as tape:
            tape.watch(self.x)
            y_p = self.model(self.x)
        dy_dx = tape.gradient(y_p, self.x)
        loss = tf.math.reduce_mean(tf.square(dy_dx + 3 * y_pred - y_true))
        return loss
but running the following NN:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras import Input
from custom_loss_ode import CustomLossOde
num_samples = 1024
x_train = 4 * (tf.random.uniform((num_samples, )) - 0.5)
y_train = tf.zeros((num_samples, ))
inputs = Input(shape=(1,))
x = Dense(16, 'tanh')(inputs)
x = Dense(8, 'tanh')(x)
x = Dense(4)(x)
y = Dense(1)(x)
model = Model(inputs=inputs, outputs=y)
loss = CustomLossOde(model.input, model)
model.compile(optimizer=Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.99),loss=loss)
model.run_eagerly = True
model.fit(x_train, y_train, batch_size=16, epochs=30)
For now I am getting 0 loss from the first epoch, which doesn't make any sense.
I have printed both y_true and y_pred from within the function and they seem OK, so I suspect the problem is in the gradient, which I did not manage to print.
Appreciate any help.
Defining a custom loss with the high-level Keras API is a bit difficult in this case. I would instead write the training loop from scratch, as it allows finer-grained control over what you can do.
I took inspiration from these two guides:
Advanced Automatic Differentiation
Writing a training loop from scratch
Basically, I used the fact that multiple tapes can interact seamlessly. I use one to compute the loss function and the other to calculate the gradients to be propagated by the optimizer.
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras import Input
num_samples = 1024
x_train = 4 * (tf.random.uniform((num_samples, )) - 0.5)
y_train = tf.zeros((num_samples, ))
inputs = Input(shape=(1,))
x = Dense(16, 'tanh')(inputs)
x = Dense(8, 'tanh')(x)
x = Dense(4)(x)
y = Dense(1)(x)
model = Model(inputs=inputs, outputs=y)
# using the high level tf.data API for data handling
x_train = tf.reshape(x_train,(-1,1))
dataset = tf.data.Dataset.from_tensor_slices((x_train,y_train)).batch(1)
opt = Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.99)
for step, (x,y_true) in enumerate(dataset):
    # we need to convert x to a variable if we want the tape to be
    # able to compute the gradient according to x
    x_variable = tf.Variable(x)
    with tf.GradientTape() as model_tape:
        with tf.GradientTape() as loss_tape:
            loss_tape.watch(x_variable)
            y_pred = model(x_variable)
        dy_dx = loss_tape.gradient(y_pred, x_variable)
        loss = tf.math.reduce_mean(tf.square(dy_dx + 3 * y_pred - y_true))
    grad = model_tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(grad, model.trainable_variables))
    if step % 20 == 0:
        print(f"Step {step}: loss={loss.numpy()}")

Why am I getting "ValueError: No gradients provided for any variable: ['Variable:0']." error?

I'm extremely new to TensorFlow, and I'm trying to build a style transfer model. I understand the concept of the model but am having difficulty actually implementing it, since I don't fully understand what is going on in TensorFlow yet. When I try to run the optimization for the generated image I get the "No gradients provided" error, which I don't understand, since my code has:
loss = total_loss(content_feats, style_feats, output_feats)
grad = tape.gradient(loss, output_processado)
optimizer.apply_gradients(zip([grad],[output_processado]))
ValueError                                Traceback (most recent call last)
in ()
      8
      9 grad = tape.gradient(loss, output_processado)
---> 10 optimizer.apply_gradients(zip([grad],[output_processado]))
     11
     12 clip = tf.clip_by_value(output_processado, min_value, max_value)

1 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py in _filter_grads(grads_and_vars)
   1217   if not filtered:
   1218     raise ValueError("No gradients provided for any variable: %s." %
-> 1219                      ([v.name for _, v in grads_and_vars],))
   1220   if vars_with_empty_grads:
   1221     logging.warning(

ValueError: No gradients provided for any variable: ['Variable:0'].
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))
import numpy as np
from PIL import Image
import requests
from io import BytesIO
from keras.applications.vgg19 import VGG19
from keras.applications.vgg19 import preprocess_input
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.models import Model
import keras.backend as K
from matplotlib import pyplot as plt
from numpy import expand_dims
from tensorflow import GradientTape
ITERATIONS = 10
CHANNELS = 3
IMAGE_SIZE = 500
IMAGE_WIDTH = IMAGE_SIZE
IMAGE_HEIGHT = IMAGE_SIZE
CONTENT_WEIGHT = 0.02
STYLE_WEIGHT = 4.5
MEAN = np.array([103.939, 116.779, 123.68])
CONTENT_LAYERS = ['block4_conv2']
STYLE_LAYERS = ['block1_conv1', 'block2_conv1', 'block3_conv1', 'block4_conv1', 'block5_conv1']
input_image_path = "input.png"
style_image_path = "style.png"
output_image_path = "output.png"
combined_image_path = "combined.png"
san_francisco_image_path = "https://www.economist.com/sites/default/files/images/print-edition/20180602_USP001_0.jpg"
tytus_image_path = "http://meetingbenches.com/wp-content/flagallery/tytus-brzozowski-polish-architect-and-watercolorist-a-fairy-tale-in-warsaw/tytus_brzozowski_13.jpg"
input_image = Image.open(BytesIO(requests.get(san_francisco_image_path).content))
input_image = input_image.resize((IMAGE_WIDTH, IMAGE_HEIGHT))
input_image.save(input_image_path)
#input_image
# Style visualization
style_image = Image.open(BytesIO(requests.get(tytus_image_path).content))
style_image = style_image.resize((IMAGE_WIDTH, IMAGE_HEIGHT))
style_image.save(style_image_path)
#style_image
def obter_modelo():
    modelo = VGG19(include_top = False, weights = 'imagenet', input_tensor = None)
    c_layer = CONTENT_LAYERS
    s_layers = STYLE_LAYERS
    output_layers = [modelo.get_layer(layer).output for layer in (c_layer + s_layers)]
    return Model(modelo.inputs, output_layers)

def processar_imagem(img):
    imagem = img.resize((IMAGE_HEIGHT, IMAGE_WIDTH))
    imagem = img_to_array(imagem)
    imagem = preprocess_input(imagem)
    imagem = expand_dims(imagem, axis=0)
    return imagem

def desprocessar_imagem(img):
    imagem = img
    mean = MEAN
    imagem[..., 0] += mean[0]
    imagem[..., 1] += mean[1]
    imagem[..., 2] += mean[2]
    imagem = imagem[..., ::-1]
    return imagem.astype(int)

def content_loss(c_mat, out_mat):
    return 0.5 * K.sum(K.square(out_mat - c_mat))

def matriz_gram(mat):
    return K.dot(mat, K.transpose(mat))

def style_loss(s_mat, out_mat):
    style_feat = K.batch_flatten(K.permute_dimensions(s_mat, (2,0,1)))
    output_feat = K.batch_flatten(K.permute_dimensions(out_mat, (2,0,1)))
    style_gram = matriz_gram(style_feat)
    output_gram = matriz_gram(output_feat)
    return K.sum(K.square(style_gram - output_gram)) / (4.0 * (CHANNELS ** 2) * (IMAGE_SIZE ** 2))

def total_loss(c_layer, s_layers, out_layers):
    content_layer = c_layer[0]
    out_content = out_layers[0]
    style_layers = s_layers[1:]
    out_style = out_layers[1:]
    c_loss = content_loss(content_layer[0], out_content[0])
    s_loss = None
    for i in range(len(style_layers)):
        if s_loss is None:
            s_loss = style_loss(style_layers[i][0], out_style[i][0])
        else:
            s_loss += style_loss(style_layers[i][0], out_style[i][0])
    return CONTENT_WEIGHT * c_loss + (STYLE_WEIGHT * s_loss)/len(style_layers)
modelo = obter_modelo()
#content image
content_processado = processar_imagem(input_image)
content_feats = modelo(K.variable(content_processado))
#style image
style_processado = processar_imagem(style_image)
style_feats = modelo(K.variable(style_processado))
#output image
output_processado = preprocess_input(np.random.uniform(0,250,(IMAGE_HEIGHT, IMAGE_WIDTH,CHANNELS)))
output_processado = expand_dims(output_processado, axis=0)
output_processado = K.variable(output_processado)
optimizer = tf.optimizers.Adam(5,beta_1=.99,epsilon=1e-3)
epochs=200
melhor_loss = K.variable(2000000.0)
melhor_imagem = None
min_value = MEAN
max_value = 255 + MEAN
loss = K.variable(0.0)
for e in range(epochs):
    with tf.GradientTape() as tape:
        tape.watch(output_processado)
        output_feats = modelo(output_processado)
        loss = total_loss(content_feats, style_feats, output_feats)
    grad = tape.gradient(loss, output_processado)
    optimizer.apply_gradients(zip([grad],[output_processado]))
    clip = tf.clip_by_value(output_processado, min_value, max_value)
    output_processado.assign(clip)
    print("Epoch: " + str(e))
For tape.gradient, you have to pass (loss, model.trainable_weights), but you are passing tape.gradient(loss, output_processado). Also, for optimizer.apply_gradients, you have to pass (grad, model.trainable_variables), but you are passing zip([grad], [output_processado]).
Calling a model inside a GradientTape scope enables you to retrieve the gradients of the trainable weights of the layer with respect to a loss value. Using an optimizer instance, you can use these gradients to update these variables (which you can retrieve using model.trainable_weights).
TensorFlow provides the tf.GradientTape API for automatic differentiation - computing the gradient of a computation with respect to its input variables. Tensorflow "records" all operations executed inside the context of a tf.GradientTape onto a "tape". Tensorflow then uses that tape and the gradients associated with each recorded operation to compute the gradients of a "recorded" computation using reverse mode differentiation.
If you want to process the gradients before applying them you can instead use the optimizer in three steps:
Compute the gradients with tf.GradientTape.
Process the gradients as you wish.
Apply the processed gradients with apply_gradients().
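For example, the "process" step could be gradient clipping. A rough sketch (assuming model, loss_fn, optimizer, and a batch (x, y) already exist; this is not tied to the style-transfer code above):
import tensorflow as tf

with tf.GradientTape() as tape:
    logits = model(x, training=True)
    loss_value = loss_fn(y, logits)
# 1. compute the gradients
grads = tape.gradient(loss_value, model.trainable_weights)
# 2. process them, here by clipping each gradient to a maximum norm
grads = [tf.clip_by_norm(g, 1.0) for g in grads]
# 3. apply the processed gradients
optimizer.apply_gradients(zip(grads, model.trainable_weights))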
Here is a simple example for mnist data. The comments in the code explain the steps in more detail.
Code:
import tensorflow as tf
print(tf.__version__)
from tensorflow import keras
from tensorflow.keras import layers
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
# Preprocess the data (these are Numpy arrays)
x_train = x_train.reshape(60000, 784).astype('float32') / 255
x_test = x_test.reshape(10000, 784).astype('float32') / 255
y_train = y_train.astype('float32')
y_test = y_test.astype('float32')
# Reserve 10,000 samples for validation
x_val = x_train[-10000:]
y_val = y_train[-10000:]
x_train = x_train[:-10000]
y_train = y_train[:-10000]
# Get the model.
inputs = keras.Input(shape=(784,), name='digits')
x = layers.Dense(64, activation='relu', name='dense_1')(inputs)
x = layers.Dense(64, activation='relu', name='dense_2')(x)
outputs = layers.Dense(10, name='predictions')(x)
model = keras.Model(inputs=inputs, outputs=outputs)
# Instantiate an optimizer.
optimizer = keras.optimizers.SGD(learning_rate=1e-3)
# Instantiate a loss function.
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Prepare the training dataset.
batch_size = 64
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)
epochs = 3
for epoch in range(epochs):
    print('Start of epoch %d' % (epoch,))
    # Iterate over the batches of the dataset.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        # Open a GradientTape to record the operations run
        # during the forward pass, which enables autodifferentiation.
        with tf.GradientTape() as tape:
            # Run the forward pass of the layer.
            # The operations that the layer applies
            # to its inputs are going to be recorded
            # on the GradientTape.
            logits = model(x_batch_train, training=True)  # Logits for this minibatch
            # Compute the loss value for this minibatch.
            loss_value = loss_fn(y_batch_train, logits)
        # Use the gradient tape to automatically retrieve
        # the gradients of the trainable variables with respect to the loss.
        grads = tape.gradient(loss_value, model.trainable_weights)
        # Run one step of gradient descent by updating
        # the value of the variables to minimize the loss.
        optimizer.apply_gradients(zip(grads, model.trainable_weights))
        # Log every 200 batches.
        if step % 200 == 0:
            print('Training loss (for one batch) at step %s: %s' % (step, float(loss_value)))
            print('Seen so far: %s samples' % ((step + 1) * 64))
Output -
2.2.0
Start of epoch 0
Training loss (for one batch) at step 0: 2.323657512664795
Seen so far: 64 samples
Training loss (for one batch) at step 200: 2.3156163692474365
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 2.2302279472351074
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 2.131979465484619
Seen so far: 38464 samples
Start of epoch 1
Training loss (for one batch) at step 0: 2.00234317779541
Seen so far: 64 samples
Training loss (for one batch) at step 200: 1.7992427349090576
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 1.8583933115005493
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 1.6005337238311768
Seen so far: 38464 samples
Start of epoch 2
Training loss (for one batch) at step 0: 1.6701987981796265
Seen so far: 64 samples
Training loss (for one batch) at step 200: 1.6237502098083496
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 1.3603084087371826
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 1.246948480606079
Seen so far: 38464 samples
You can find more about tf.GradientTape here. The example used here is taken from here.
Hope this answers your question. Happy Learning.

Replicating results using Keras

I am trying to replicate the results of my experiments using TensorFlow and Keras (with the TF backend). When I use TF, I set random seeds for numpy and the tensorflow graph right at the top of the script. I am not using any dropout layers or other methods that could introduce randomness (that I can think of).
Running such models, regardless of the network size, always yields the same results.
TF Experiment 1:
('Epoch ', 99, ' completed out of ', 100, ' loss: ', 289.8982433080673, 'accuracy: ', 0.6875)
TF Experiment 2:
('Epoch ', 99, ' completed out of ', 100, ' loss: ', 289.8982433080673, 'accuracy: ', 0.6875)
When I tried to replicate these results using Keras with the same configuration, I failed. On top of that, each separate run yields different performance.
My TF code, which can replicate results, looks like this:
Snippet Reference: https://www.youtube.com/watch?v=BhpvH5DuVu8&list=PLQVvvaa0QuDfKTOs3Keq_kaG2P55YRn5v&index=46
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)
## import system modules
#
import os
import sys
## import ML modules
#
import tensorflow as tf
import numpy as np
from keras.utils import to_categorical
from sklearn import preprocessing
logs_path = '../logs/'
## Default constants
#
NO_OF_CLASSES = 2
BATCH_SIZE = 32
FEAT_DIM = 26
N_nodes_hl1 = 300
N_nodes_hl2 = 30
N_nodes_hl3 = 30
## define the network architecture
#
## This model is a simple multilayer perceptron network with 3 hidden layers.
## Input to the layer has the dimensions equal to feature dimensions.
## We create a complete graph in this method with input placeholder as an input argument and
## output placeholder as an returning argument
#
def neural_network_model(data):
    ## defining dictionaries specifying the specification of each layer.
    #
    hidden_1_layer = {'weights': tf.Variable(tf.random_normal([FEAT_DIM, N_nodes_hl1]), name='w1'),\
                      'biases': tf.Variable(tf.random_normal([N_nodes_hl1]), name='b1')}
    hidden_2_layer = {'weights': tf.Variable(tf.random_normal([N_nodes_hl1, N_nodes_hl2]), name='w2'), \
                      'biases': tf.Variable(tf.random_normal([N_nodes_hl2]), name='b2')}
    hidden_3_layer = {'weights': tf.Variable(tf.random_normal([N_nodes_hl2, N_nodes_hl3]), name='w3'),\
                      'biases': tf.Variable(tf.random_normal([N_nodes_hl3]), name='b3')}
    output_layer = {'weights': tf.Variable(tf.random_normal([N_nodes_hl3, NO_OF_CLASSES]), name='w4'), \
                    'biases': tf.Variable(tf.random_normal([NO_OF_CLASSES]), name='b4')}
    l1 = tf.add(tf.matmul(data, hidden_1_layer['weights']), hidden_1_layer['biases'])
    l1 = tf.nn.relu(l1)
    l2 = tf.add(tf.matmul(l1, hidden_2_layer['weights']), hidden_2_layer['biases'])
    l2 = tf.nn.relu(l2)
    l3 = tf.add(tf.matmul(l2, hidden_3_layer['weights']), hidden_3_layer['biases'])
    l3 = tf.nn.relu(l3)
    output = tf.add(tf.matmul(l3, output_layer['weights']), output_layer['biases'], name="last_layer")
    ## return the final layer's output gracefully
    #
    return output
## end of method
#
## This method trains a neural network along with collecting statistics related to
## the graphs.
#
def train_neural_network(xtrain, ytrain, odir):
    learning_rate = 0.0008
    epoch_iter = 100
    ## input/output placeholders where data would be plugged in...
    #
    x = tf.placeholder('float', [None, FEAT_DIM], name="input")
    y_ = tf.placeholder('float', name="output")
    ## define the network
    #
    logits = neural_network_model(x)
    prediction = tf.nn.softmax(logits, name="op_to_restore")  ## softmax normalizes the output results
    loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits = logits, labels = y_) )
    ## Major OP for the training procedure. The "train" op defined here tries to minimize loss
    #
    with tf.name_scope('ADAM'):
        # Gradient Descent
        optimizer = tf.train.AdamOptimizer(learning_rate)
        train = optimizer.minimize(loss)
    with tf.name_scope('Accuracy'):
        ## Accuracy calculation by comparing the predicted and detected labels
        #
        acc = tf.equal(tf.argmax(logits, 1), tf.argmax(y_, 1))
        acc = tf.reduce_mean(tf.cast(acc, tf.float32))
    ## summary and display variables
    #
    loss_sum = tf.summary.scalar("loss", loss)
    acc_sum = tf.summary.scalar("accuracy", acc)
    ## Merge all summaries into a single variable. These summaries will be displayed using Tensorboard
    #
    merged_summary_op = tf.summary.merge([loss_sum, acc_sum])
    ## create a session for the graph (graph initialization)
    #
    with tf.Session() as sess:
        ## initialize all the variables. Note that before this point, all the variables were empty buckets !!
        #
        sess.run(tf.global_variables_initializer())
        ## initialize the summary writer (For tensorboard)
        #
        summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())
        ## iterate over epochs (complete forward-backward for the entire training set)
        #
        for epoch in range(epoch_iter):
            ## initialize some variables to keep track of progress during training
            #
            epoch_loss = 0
            epoch_accuracy = 0
            ## minibatch training. Splitting input data into smaller chunks is better
            #
            for i in range( int(len(xtrain)/ BATCH_SIZE) ):
                epoch_x = xtrain[ i * BATCH_SIZE : i * BATCH_SIZE + BATCH_SIZE]
                epoch_y = ytrain[ i * BATCH_SIZE : i * BATCH_SIZE + BATCH_SIZE]
                ## run the session and collect the intermediate stats. The feed_dict kwarg takes input/output placeholder names as
                ## keys and features/labels as values
                #
                _, ac, ls, summary = sess.run([train, acc, loss, merged_summary_op], feed_dict = {x: epoch_x, y_: epoch_y})
                ## write the summary in logs to visualize it later
                #
                summary_writer.add_summary(summary, epoch * int(len(xtrain)/BATCH_SIZE)+i)
                ## update stats
                #
                epoch_loss += ls
                epoch_accuracy += ac
            print ("Epoch ", epoch, " completed out of ", epoch_iter, " loss: ", epoch_loss, "accuracy: ", ac)
    ## saver module to save tf graph variables.. etc....
My Keras script to replicate results looks as follows:
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)
## import system modules
#
import os
import sys
## import ML and datatype modules
#
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ModelCheckpoint
from keras.optimizers import SGD
from keras.utils import to_categorical
from sklearn import preprocessing
## Default constants
#
NO_OF_CLASSES = 2
BATCH_SIZE = 32
FEAT_DIM = 26
N_nodes_hl1 = 300
N_nodes_hl2 = 30
N_nodes_hl3 = 30
## This method defines the NN architecture as well as performs training and saves the model info
#
def train_neural_network(xtrain, ytrain, odir):
    learning_rate = 0.009
    ## Define the network (MLP)
    #
    model = Sequential()
    model.add(Dense(N_nodes_hl1, input_dim=FEAT_DIM, activation="relu"))
    model.add(Dense(N_nodes_hl2, activation="relu"))
    model.add(Dense(N_nodes_hl3, activation="relu"))
    model.add(Dense(NO_OF_CLASSES, activation="softmax"))
    ## optimizer
    #
    sgd = SGD(lr=learning_rate)
    model.compile(loss="categorical_crossentropy", optimizer=sgd, metrics=['accuracy'])
    print model.summary()
    ## train the model
    model.fit(x=xtrain, y=ytrain, epochs=100)
Keras experiment1:
loss: 0.5964 - acc: 0.6725
Keras experiment2:
loss: 0.5974 - acc: 0.6712
The only difference between the two scripts is the optimizer. I don't think that would introduce any randomness during training. Also, I believe the NN architecture should yield the same results with precision up to float64 on CPUs (and float32 on GPUs due to hardware capabilities).
What am I missing in my Keras script? Also, correct me if my understanding is wrong somewhere in this query.
Along with that, additional references (other than the following) on how to replicate NN results would be highly appreciated.
https://machinelearningmastery.com/reproducible-results-neural-networks-keras/
How to get stable results with TensorFlow, setting random seed
Getting reproducible results using tensorflow-gpu
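For context, the fuller seeding recipe those references describe goes beyond what my scripts currently do (a sketch for the TF 1.x API used above; I have not verified that it makes this particular Keras script deterministic):
import os
os.environ['PYTHONHASHSEED'] = '0'

import random
import numpy as np
import tensorflow as tf
from keras import backend as K

random.seed(3)
np.random.seed(1)
tf.set_random_seed(2)
# force single-threaded execution to avoid non-deterministic thread scheduling
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                              inter_op_parallelism_threads=1)
K.set_session(tf.Session(graph=tf.get_default_graph(), config=session_conf))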

Keras model predict fails

I'm a bit of a noob in ML. I've been trying to get a model retrained on the flowers dataset to work on keras-js without success. Whenever I try to run predict on the model I get "Error: predict() must take an object where the keys are the named inputs of the model: input_1." Please help. Here is my code:
test.py
import sys
import json
import numpy as np
from collections import defaultdict
# It's very important to put this import before keras,
# as explained here: Loading tensorflow before scipy.misc seems to cause imread to fail #1541
# https://github.com/tensorflow/tensorflow/issues/1541
import scipy.misc
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import SGD
from keras import backend as K
from keras.utils import np_utils
import dataset
import net
np.random.seed(1337)
spe = 500
n = 299
batch_size = 128
nb_epoch = 1
nb_phase_two_epoch = 1
data_directory, test_directory, model_file_prefix = sys.argv[1:]
print "loading dataset"
X, y, tags = dataset.dataset(data_directory, n)
nb_classes = len(tags)
sample_count = len(y)
train_size = sample_count * 4 // 5
X_train = X[:train_size]
y_train = y[:train_size]
Y_train = np_utils.to_categorical(y_train, nb_classes)
X_test = X[train_size:]
y_test = y[train_size:]
Y_test = np_utils.to_categorical(y_test, nb_classes)
datagen = ImageDataGenerator(
    featurewise_center=False,
    samplewise_center=False,
    featurewise_std_normalization=False,
    samplewise_std_normalization=False,
    zca_whitening=False,
    rotation_range=0,
    width_shift_range=0.125,
    height_shift_range=0.125,
    horizontal_flip=True,
    vertical_flip=False,
    fill_mode='nearest')
#datagen.fit(X_train)
train_datagen = ImageDataGenerator(rescale=1./255)
train_generator = train_datagen.flow_from_directory(
    data_directory,
    target_size=(299,299),
    batch_size=16,
    class_mode='categorical'
)
validation_generator = train_datagen.flow_from_directory(
    data_directory,
    target_size=(299,299),
    batch_size=16,
    class_mode='categorical'
)
def evaluate(model, vis_filename=None):
    Y_pred = model.predict(X_test, batch_size=batch_size)
    y_pred = np.argmax(Y_pred, axis=1)
    accuracy = float(np.sum(y_test==y_pred)) / len(y_test)
    print "accuracy:", accuracy
    confusion = np.zeros((nb_classes, nb_classes), dtype=np.int32)
    for (predicted_index, actual_index, image) in zip(y_pred, y_test, X_test):
        confusion[predicted_index, actual_index] += 1
    print "rows are predicted classes, columns are actual classes"
    for predicted_index, predicted_tag in enumerate(tags):
        print predicted_tag[:7],
        for actual_index, actual_tag in enumerate(tags):
            print "\t%d" % confusion[predicted_index, actual_index],
        print
    if vis_filename is not None:
        bucket_size = 10
        image_size = n // 4  # right now that's 56
        vis_image_size = nb_classes * image_size * bucket_size
        vis_image = 255 * np.ones((vis_image_size, vis_image_size, 3), dtype='uint8')
        example_counts = defaultdict(int)
        for (predicted_tag, actual_tag, normalized_image) in zip(y_pred, y_test, X_test):
            example_count = example_counts[(predicted_tag, actual_tag)]
            if example_count >= bucket_size**2:
                continue
            image = dataset.reverse_preprocess_input(normalized_image)
            image = image.transpose((1, 2, 0))
            image = scipy.misc.imresize(image, (image_size, image_size)).astype(np.uint8)
            tilepos_x = bucket_size * predicted_tag
            tilepos_y = bucket_size * actual_tag
            tilepos_x += example_count % bucket_size
            tilepos_y += example_count // bucket_size
            pos_x, pos_y = tilepos_x * image_size, tilepos_y * image_size
            vis_image[pos_y:pos_y+image_size, pos_x:pos_x+image_size, :] = image
            example_counts[(predicted_tag, actual_tag)] += 1
        vis_image[::image_size * bucket_size, :] = 0
        vis_image[:, ::image_size * bucket_size] = 0
        scipy.misc.imsave(vis_filename, vis_image)
print "loading original inception model"
model = net.build_model(nb_classes)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=["accuracy"])
# train the model on the new data for a few epochs
print "training the newly added dense layers"
print "samples per eph ",spe#X_train.shape[0]
model.fit_generator(train_generator,
    samples_per_epoch=spe,
    nb_epoch=nb_epoch,
    validation_data=validation_generator,
    nb_val_samples=spe,
)
net.save(model, tags, model_file_prefix)
# at this point, the top layers are well trained and we can start fine-tuning
# convolutional layers from inception V3. We will freeze the bottom N layers
# and train the remaining top layers.
# we chose to train the top 2 inception blocks, i.e. we will freeze
# the first 172 layers and unfreeze the rest:
for layer in model.layers[:172]:
    layer.trainable = False
for layer in model.layers[172:]:
    layer.trainable = True
# we need to recompile the model for these modifications to take effect
# we use SGD with a low learning rate
model.compile(optimizer=SGD(lr=0.0001, momentum=0.9), loss='categorical_crossentropy', metrics=["accuracy"])
# we train our model again (this time fine-tuning the top 2 inception blocks
# alongside the top Dense layers
print "fine-tuning top 2 inception blocks alongside the top dense layers"
for i in range(1,11):
print "mega-epoch %d/10" % i
model.fit_generator(train_generator,
samples_per_epoch=spe,
nb_epoch=nb_phase_two_epoch,
validation_data=validation_generator,
nb_val_samples=spe,
)
#evaluate(model, str(i).zfill(3)+".png")
# evaluate(model, "000.jpg")
net.save(model, tags, model_file_prefix)
When run with keras-js I get the error:
Error: predict() must take an object where the keys are the named inputs of the model: input_1.
Please help.
Wasn't easy to read your code - indentation was off and I don't really know what's in dataset and the other imports.
That said, the problem is probably the format of X_test. During training, you use the output from ImageDataGenerator, which rescales the images to (299, 299) together with the other manipulations. During evaluation, you use the raw data in X_test directly.
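If you want to keep calling model.predict on X_test directly, one option (a rough sketch, assuming X_test already holds images resized to 299x299 with raw 0-255 pixel values) is to apply the same rescaling as the training generator before predicting:
import numpy as np

# mirror the training generator's rescale=1./255 preprocessing
X_test_scaled = X_test.astype('float32') / 255.0
Y_pred = model.predict(X_test_scaled, batch_size=batch_size)
y_pred = np.argmax(Y_pred, axis=1)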
Hope this helps /p