A neural network outputting vectors with all components NaN - tensorflow

My neural network code ends up outputting vectors with NaNs most of the time. The code is given below:
from __future__ import division, print_function
from six.moves import xrange
import time
import os
from glob import glob
from zipfile import ZipFile, ZIP_DEFLATED
import numpy as np
import tensorflow as tf
## Defining variables which have to be provided by the user
## Defining the number of units in the RNN. This is also the size of the word
## and document embeddings
num_units = 100
## The number of data elements in a batch
batch_size = 1
## The folder where the npz files with the numpy arrays are stored.
npz_files_folder = "npz_files"
## Name of the file to which we want the model to be saved
model_file = "rnn_trial"
## Number of labels sampled from the noise for NCE
num_sampled = 50
## The dropout probability for the NN
dropout = 0.2
## The learning rate for the optimizer
lr = 0.1
## The number of epochs
epochs = 10
## Reading in the list of npz files with vectors for each document
doc_files = sorted(glob(os.path.join(npz_files_folder, "*.npz")))
num_classes = num_docs = len(doc_files)
## The tensor for storing a batch of sentences where each sentence is a
## sequence of word embeddings. This is an input to the NN
sentences = tf.placeholder(tf.float32, [batch_size, None, num_units],
name='sentences')
## The tensor for storing a batch of documents where each document is a
## sequence of sentence embeddings. This is an input to the NN
documents = tf.placeholder(tf.float32, [batch_size, None, num_units])
## The tensor for storing the labels for each batch of documents
labels = tf.placeholder(tf.float32, [batch_size])
## Here we define the LSTM used in the first layer
sent_lstm = tf.contrib.rnn.BasicLSTMCell(num_units)
sent_lstm = tf.contrib.rnn.DropoutWrapper(sent_lstm,
output_keep_prob=1.0-dropout)
## We define the initial_state of the LSTM in first layer here
initial_state_sent_lstm = sent_lstm.zero_state(batch_size, tf.float32)
## Here we get the outputs and states from the first layer
outputs_lstm, states_lstm = tf.nn.dynamic_rnn(sent_lstm,
inputs=sentences, initial_state=initial_state_sent_lstm)
## Here we define the forward GRU used in the second layer
doc_gru_fw = tf.contrib.rnn.GRUCell(num_units//2)
initial_state_doc_gru_fw = doc_gru_fw.zero_state(batch_size, tf.float32)
## Here we define the reverse GRU used in the second layer.
doc_gru_bw = tf.contrib.rnn.GRUCell(num_units-num_units//2)
initial_state_doc_gru_bw = doc_gru_bw.zero_state(batch_size, tf.float32)
## Here we get the outputs and states from the second layer
outputs_gru, states_gru = tf.nn.bidirectional_dynamic_rnn(cell_fw=doc_gru_fw,
cell_bw=doc_gru_bw, initial_state_fw=initial_state_doc_gru_fw,
initial_state_bw=initial_state_doc_gru_bw,
inputs=documents)
# outputs_gru, states_gru = tf.nn.bidirectional_dynamic_rnn(cell_fw=doc_gru_fw,
# cell_bw=doc_gru_bw,
# inputs=documents, dtype=tf.float32)
## The final document embeddings
final_output = tf.reduce_mean(tf.concat(outputs_gru, 2), axis=1)
sigmoid_W = tf.Variable(
tf.truncated_normal([num_units, 1],
stddev=1.0/np.sqrt(num_units)))
sigmoid_b = tf.Variable(tf.zeros([1], dtype=tf.float32))
logits = tf.matmul(final_output, sigmoid_W) + sigmoid_b
y_ = (num_docs - 1) * tf.sigmoid(tf.reshape(logits, [-1]))
loss = tf.reduce_sum(tf.square(y_ - labels))
## Defining the training step
train = tf.train.AdamOptimizer(lr).minimize(loss)
## Initializing the session
sess = tf.Session()
## Initializing the variables
sess.run(tf.global_variables_initializer())
t = time.time()
for n in xrange(epochs):
result = False
for j, doc in enumerate(doc_files):
# if j==100:
# break
try:
npz_file = np.load(doc, allow_pickle=False)
except ValueError:
continue
train_label = np.array([j])
sent_files = sorted(npz_file.files)
temp_doc = np.array([])
temp_doc = np.reshape(temp_doc, (0, num_units))
for i, sent_file in enumerate(sent_files):
sent_input = np.reshape(npz_file[sent_file], (1, -1, num_units))
if 0 in sent_input.shape:
continue
output_1 = sess.run(outputs_lstm,
feed_dict={sentences: sent_input})
sent_embed = output_1[:, -1:]
temp_doc = np.concatenate([temp_doc] + list(sent_embed), 0)
## Training the model
temp_doc = np.array([temp_doc])
_, doc_vector = sess.run([train, final_output], feed_dict={
documents: temp_doc, labels: train_label})
if np.isnan(np.sum(doc_vector)):
result = True
print(result)
print("Finished with epoch ", n)
print()
doc_vecs_file_name = model_file + "_doc_vecs.zip"
with ZipFile(doc_vecs_file_name, 'w', ZIP_DEFLATED, True) as myzip:
for doc in doc_files:
# if doc_files.index(doc)==100:
# break
try:
npz_file = np.load(doc, allow_pickle=False)
except ValueError:
continue
sent_files = sorted(npz_file.files)
temp_doc = np.array([])
temp_doc = np.reshape(temp_doc, (0, num_units))
for i, sent_file in enumerate(sent_files):
sent_input = np.reshape(npz_file[sent_file], (1, -1, num_units))
if 0 in sent_input.shape:
continue
output_1 = sess.run(outputs_lstm,
feed_dict={sentences: sent_input})
sent_embed = output_1[:, -1:]
temp_doc = np.concatenate([temp_doc] + list(sent_embed), 0)
## Training the model
temp_doc = np.array([temp_doc])
doc_vec = sess.run(final_output, feed_dict={documents: temp_doc})
temp_file = doc.split(os.sep)[-1][:-4] + ".csv"
np.savetxt(temp_file, doc_vec, delimiter=',')
myzip.write(temp_file)
os.remove(temp_file)
saver = tf.train.Saver()
saver.save(sess, model_file)
print("Time taken = ", (time.time() - t))
If needed, I can upload a sample data set that you can use to try running the code yourself. With that sample data set, training occasionally completes without any NaNs creeping in, but most of the time NaNs pop up during training.
I am using TensorFlow version 1.1.0 along with Python 2.7.13 from the Anaconda distribution.
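A minimal debugging sketch (an assumption on my part, not a confirmed fix for this graph): TF 1.x can pinpoint the first op that produces a NaN/Inf via tf.add_check_numerics_ops(), and gradient clipping with a smaller learning rate is a common mitigation when squared-error targets are large, as they are here (up to num_docs - 1).
## Sketch: locate and mitigate NaNs (assumes the same `loss` tensor defined above)
check_op = tf.add_check_numerics_ops()  # running this op raises on the first NaN/Inf, naming the culprit op
optimizer = tf.train.AdamOptimizer(0.001)  # smaller learning rate than the 0.1 above
gvs = optimizer.compute_gradients(loss)
capped_gvs = [(tf.clip_by_value(g, -1.0, 1.0), v) for g, v in gvs if g is not None]
train = optimizer.apply_gradients(capped_gvs)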

Related

Implementation difference between TensorFlow LSTMBlockFusedCell and PyTorch LSTM

I am attempting to translate a TensorFlow LSTMBlockFusedCell model to a PyTorch LSTM, but I'm not getting the same outputs with identical input and weights in both models. I believe this is due to how the weights are being set for the torch model; in the code snippet below, the TensorFlow weight has shape (400, 164), while the PyTorch weights have shapes (400, 64) and (400, 100) for torch_lstm.weight_ih_l0 and torch_lstm.weight_hh_l0 respectively. I addressed this inconsistency by using the first 64 rows as weight_ih_l0 and the remaining 100 rows as weight_hh_l0. According to this article, TensorFlow uses right-multiplication instead of PyTorch's left-multiplication, which is why I need to transpose the weights. I am also setting the bias to 0 (rendering it useless) for debugging.
import tensorflow as tf
import numpy as np
import torch
time_len, batch_size, input_size, num_units = 50, 1, 64, 100 # L, N, Hin, Hout with torch semantics
# setup tensorflow LSTM
tf_lstm = tf.contrib.rnn.LSTMBlockFusedCell(num_units=num_units)
inp = tf.placeholder(tf.float32, shape=(time_len, batch_size, input_size))
out, c = tf_lstm(inp, dtype=tf.float32)
tf_weight = tf_lstm.weights[0]
tf_bias = tf_lstm.weights[1]
# initialize weights
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# tf forward pass
a = np.random.randn(time_len, batch_size, input_size).astype(np.float32) # input
b = np.zeros(tf_bias.shape) # set lstm bias to zero
tf_out, lstm_weight, lstm_bias = sess.run([out, tf_weight, tf_bias], {inp: a, tf_bias: b})
assert (lstm_bias == 0).all() # make sure lstm bias was 0
# setup pytorch LSTM
torch_lstm = torch.nn.LSTM(input_size=input_size, hidden_size=num_units, num_layers=1, bias=False)
# set torch weights same as tensorflow weights (is this correct?)
w1 = lstm_weight[:input_size, :] # first 64 rows (input-to-hidden)
w2 = lstm_weight[input_size:, :] # remaining 100 rows (hidden-to-hidden)
torch_lstm.weight_ih_l0.data = torch.tensor(w1.T) # transpose and set first weight
torch_lstm.weight_hh_l0.data = torch.tensor(w2.T) # transpose and set second weight
# torch forward pass
torch_out, (hn, cn) = torch_lstm(torch.tensor(a))
torch_out = torch_out.detach().numpy() # convert to numpy for compatibility
# compare
assert torch_out.shape == tf_out.shape
print("np.allclose(torch_out, tf_out) = ", np.allclose(torch_out, tf_out))
print("normalized difference: ", np.linalg.norm(torch_out - tf_out))
output:
np.allclose(torch_out, tf_out) = False
normalized difference: 10.741002
Expected output:
np.allclose(torch_out, tf_out) = True
normalized difference: ~0.0
I am running on CPU with the following dependencies:
numpy==1.21.5
tensorflow-gpu==1.14.0
torch==1.11.0
I am running TensorFlow v1; the CPU version should work, and the Python wheel is available here for Python <= 3.7.
Any help is appreciated.
I believe I solved this by changing the order of the weights associated with each gate and setting forget_bias=0.0 in LSTMBlockFusedCell:
import tensorflow as tf
import numpy as np
import torch
import itertools as it
time_len, batch_size, input_size, num_units = 50, 1, 64, 100 # L, N, Hin, Hout with torch semantics
# setup tensorflow LSTM
tf_lstm = tf.contrib.rnn.LSTMBlockFusedCell(num_units=num_units, forget_bias=0.0, dtype=tf.float32)
inp = tf.placeholder(tf.float32, shape=(time_len, batch_size, input_size))
out, c = tf_lstm(inp, dtype=tf.float32)
tf_weight = tf_lstm.weights[0]
tf_bias = tf_lstm.weights[1]
# initialize weights
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# tf forward pass
a = np.random.randn(time_len, batch_size, input_size).astype(np.float32) # input
b = np.zeros(tf_bias.shape) # set lstm bias to zero
tf_out, lstm_weight, lstm_bias = sess.run([out, tf_weight, tf_bias], {inp: a, tf_bias: b})
assert (lstm_bias == 0).all() # make sure lstm bias was 0
# setup pytorch LSTM
torch_lstm = torch.nn.LSTM(input_size=input_size, hidden_size=num_units, num_layers=1, bias=False)
# weights associated with each gate
i = lstm_weight[:, 0:100].copy(), 'i'
f = lstm_weight[:, 100:200].copy(), 'f'
o = lstm_weight[:, 200:300].copy(), 'o'
g = lstm_weight[:, 300:400].copy(), 'g'
for i, f, o, g in it.permutations([i, f, o, g], 4):
    print(*[x[1] for x in (i, f, o, g)])
    i, f, o, g = (x[0] for x in (i, f, o, g))
    lstm_weight = np.concatenate([i, f, o, g], axis=1)
    # set torch weights same as tensorflow weights
    w1 = lstm_weight[:input_size, :] # first 64 rows (input-to-hidden)
    w2 = lstm_weight[input_size:, :] # remaining 100 rows (hidden-to-hidden)
    torch_lstm.weight_ih_l0.data = torch.tensor(w1.T) # transpose and set first weight
    torch_lstm.weight_hh_l0.data = torch.tensor(w2.T) # transpose and set second weight
    # torch forward pass
    torch_out, (hn, cn) = torch_lstm(torch.tensor(a))
    torch_out = torch_out.detach().numpy() # convert to numpy for comparison
    # compare
    assert torch_out.shape == tf_out.shape
    print("np.allclose(torch_out, tf_out) = ", np.allclose(torch_out, tf_out))
    print("normalized difference: ", np.linalg.norm(torch_out - tf_out))
This will print the difference for all permutations of the gate weights; the combination i o f g gave a difference of 1.7814435e-06, which is close enough.
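For reference, this winning permutation is consistent with TensorFlow's BlockLSTM storing its gates in i, c, f, o order while torch.nn.LSTM expects i, f, g, o (an inference from the result above, not something the original post confirms). A direct slicing sketch under that assumption, with each gate block num_units=100 columns wide:
i_w = lstm_weight[:, 0:100]   # input gate
c_w = lstm_weight[:, 100:200] # cell candidate; plays the role of torch's "g"
f_w = lstm_weight[:, 200:300] # forget gate
o_w = lstm_weight[:, 300:400] # output gate
lstm_weight_torch_order = np.concatenate([i_w, f_w, c_w, o_w], axis=1) # i, f, g, o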

Keras GPU memory overflow when using keras.utils.Sequence and a generator

Dataset.py
import os
import random
from skimage import io
import cv2
from skimage.transform import resize
import numpy as np
import tensorflow as tf
import keras
import Augmentor
def iter_sequence_infinite(seq):
"""Iterate indefinitely over a Sequence.
# Arguments
seq: Sequence object
# Returns
Generator yielding batches.
"""
while True:
for item in seq:
yield item
# data generator class
class DataGenerator(keras.utils.Sequence):
def __init__(self, ids, imgs_dir, masks_dir, batch_size=10, img_size=128, n_classes=1, n_channels=3, shuffle=True):
self.id_names = ids
self.indexes = np.arange(len(self.id_names))
self.imgs_dir = imgs_dir
self.masks_dir = masks_dir
self.batch_size = batch_size
self.img_size = img_size
self.n_classes = n_classes
self.n_channels = n_channels
self.shuffle = shuffle
self.on_epoch_end()
# for printing the statistics of the function
def on_epoch_end(self):
'Updates indexes after each epoch'
self.indexes = np.arange(len(self.id_names))
if self.shuffle == True:
np.random.shuffle(self.indexes)
def __data_generation__(self, id_name):
'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
# Initialization
img_path = os.path.join(self.imgs_dir, id_name) # polyp segmentation/images/id_name.jpg
mask_path = os.path.join(self.masks_dir, id_name) # polyp segmentation/masks/id_name.jpg
img = io.imread(img_path)
mask = cv2.imread(mask_path)
p = Augmentor.DataPipeline([[img, mask]])
p.resize(probability=1.0, width=self.img_size, height=self.img_size)
p.rotate_without_crop(probability=0.3, max_left_rotation=10, max_right_rotation=10)
#p.random_distortion(probability=0.3, grid_height=10, grid_width=10, magnitude=1)
p.shear(probability=0.3, max_shear_left=1, max_shear_right=1)
#p.skew_tilt(probability=0.3, magnitude=0.1)
p.flip_random(probability=0.3)
sample_p = p.sample(1)
sample_p = np.array(sample_p).squeeze()
p_img = sample_p[0]
p_mask = sample_p[1]
augmented_mask = (p_mask // 255) * 255 # denoising
q = Augmentor.DataPipeline([[p_img]])
q.random_contrast(probability=0.3, min_factor=0.2, max_factor=1.0) # low to High
q.random_brightness(probability=0.3, min_factor=0.2, max_factor=1.0) # dark to bright
sample_q = q.sample(1)
sample_q = np.array(sample_q).squeeze()
image = sample_q
mask = augmented_mask[::, ::, 0]
"""
# reading the image from dataset
## Reading Image
image = io.imread(img_path) # reading image to image vaiable
image = resize(image, (self.img_size, self.img_size), anti_aliasing=True) # resizing input image to 128 * 128
mask = io.imread(mask_path, as_gray=True) # mask image of same size with all zeros
mask = resize(mask, (self.img_size, self.img_size), anti_aliasing=True) # resizing mask to fit the 128 * 128 image
mask = np.expand_dims(mask, axis=-1)
"""
# image normalization
image = image / 255.0
mask = mask / 255.0
return image, mask
def __len__(self):
"Denotes the number of batches per epoch"
return int(np.floor(len(self.id_names) / self.batch_size))
def __getitem__(self, index): # index : batch no.
# Generate indexes of the batch
indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
batch_ids = [self.id_names[k] for k in indexes]
imgs = list()
masks = list()
for id_name in batch_ids:
img, mask = self.__data_generation__(id_name)
imgs.append(img)
masks.append(np.expand_dims(mask,-1))
imgs = np.array(imgs)
masks = np.array(masks)
return imgs, masks # return batch
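A quick shape sanity check for the generator (a sketch only; the directory names are the ones assumed in train.py below):
gen = DataGenerator(os.listdir('./data/train/imgs/'), './data/train/imgs/', './data/train/masks/', batch_size=2, img_size=384)
imgs, masks = gen[0] # first batch
print(imgs.shape, masks.shape) # expected: (2, 384, 384, 3) and (2, 384, 384, 1)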
train.py
import argparse
import logging
import os
import sys
from tqdm import tqdm # progress bar
import numpy as np
import matplotlib.pyplot as plt
from keras import optimizers
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import segmentation_models as sm
from segmentation_models.utils import set_trainable
from dataset import DataGenerator, iter_sequence_infinite
def train_model(model, train_gen, valid_gen, epochs, save_cp=True):
total_batch_count = 0
train_img_num = len(train_gen.id_names)
train_batch_num = len(train_gen)
train_gen_out = iter_sequence_infinite(train_gen)
valid_batch_num = len(valid_gen)
valid_img_num = len(valid_gen.id_names)
valid_gen_out = iter_sequence_infinite(valid_gen)
for epoch in range(epochs): # iterate over the epochs
set_trainable(model)
epoch_loss = 0 # loss in this epoch
epoch_iou = 0
count = 0
with tqdm(total=train_img_num, desc=f'Epoch {epoch + 1}/{epochs}', position=0, leave=True, unit='img') as pbar: # make progress bar
for _ in range(train_batch_num):
batch = next(train_gen_out)
imgs = batch[0]
true_masks = batch[1]
loss, iou = model.train_on_batch(imgs, true_masks) # value of loss of this batch
epoch_loss += loss
epoch_iou += iou
pbar.set_postfix(**{'Batch loss': loss, 'Batch IoU': iou}) # show the batch loss and IoU in the progress bar
pbar.update(imgs.shape[0]) # update progress
count += 1
total_batch_count += 1
train_gen.on_epoch_end()
print( "Epoch : loss: {}, IoU : {}".format(epoch_loss/count, epoch_iou/count))
# Do validation
validation_model(model, valid_gen_out, valid_batch_num, valid_img_num)
valid_gen.on_epoch_end()
if save_cp:
try:
if not os.path.isdir(checkpoint_dir):
os.mkdir(checkpoint_dir)
logging.info('Created checkpoint directory')
else:
pass
except OSError:
pass
model.save_weights(os.path.join(checkpoint_dir , f'CP_epoch{epoch + 1}.h5'))
logging.info(f'Checkpoint {epoch + 1} saved !')
def validation_model(model, valid_gen_out, valid_batch_num, valid_img_num):
epoch_loss = 0 # loss in this epoch
epoch_iou = 0
count = 0
with tqdm(total=valid_img_num, desc='Validation round', position=0, leave=True, unit='img') as pbar: # make progress bar
for _ in range(valid_batch_num):
batch = next(valid_gen_out)
imgs = batch[0]
true_masks = batch[1]
loss, iou = model.test_on_batch(imgs, true_masks) # value of loss of this batch
epoch_loss += loss
epoch_iou += iou
pbar.set_postfix(**{'Batch loss': loss, 'Batch IoU': iou}) # show the batch loss and IoU in the progress bar
pbar.update(imgs.shape[0]) # update progress
count += 1
print("Validation loss: {}, IoU: {}".format(epoch_loss / count, epoch_iou / count))
pred_mask = model.predict(np.expand_dims(imgs[0],0))
plt.subplot(131)
plt.imshow(imgs[0])
plt.subplot(132)
plt.imshow(true_masks[0].squeeze(), cmap="gray")
plt.subplot(133)
plt.imshow(pred_mask.squeeze(), cmap="gray")
plt.show()
print()
def get_args():
parser = argparse.ArgumentParser(description='Train the UNet on images and target masks',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-e', '--epochs', metavar='E', type=int, default=50,
help='Number of epochs', dest='epochs')
parser.add_argument('-b', '--batch_size', metavar='B', type=int, nargs='?', default=2,
help='Batch size', dest='batch_size')
parser.add_argument('-l', '--learning-rate', metavar='LR', type=float, nargs='?', default=1e-5,
help='Learning rate', dest='lr')
parser.add_argument('-bb', '--backbone', default='resnet50', metavar='FILE',
help="backcone name")
parser.add_argument('-w', '--weight', dest='load', type=str, default=False,
help='Load model from a .h5 file')
parser.add_argument('-s', '--resizing', dest='resizing', type=int, default=384,
help='Downscaling factor of the images')
parser.add_argument('-v', '--validation', dest='val', type=float, default=20.0,
help='Percent of the data that is used as validation (0-100)')
return parser.parse_args()
if __name__ == '__main__':
img_dir = './data/train/imgs/' # ./data/train/imgs/CVC_Original/'
mask_dir = './data/train/masks/' # ./data/train/masks/CVC_Ground Truth/'
checkpoint_dir = './checkpoints'
args = get_args()
# train path
train_ids = os.listdir(img_dir)
# Validation Data Size
n_val = int(len(train_ids) * args.val/100) # size of validation set
valid_ids = train_ids[:n_val] # list of image ids used for validation of result 0 to 9
train_ids = train_ids[n_val:] # list of image ids used for training dataset
# print(valid_ids, "\n\n")
print("training_size: ", len(train_ids), "validation_size: ", len(valid_ids))
train_gen = DataGenerator(train_ids, img_dir, mask_dir, img_size=args.resizing, batch_size=args.batch_size)
valid_gen = DataGenerator(valid_ids, img_dir, mask_dir, img_size=args.resizing, batch_size=args.batch_size)
print("total training batches: ", len(train_gen))
print("total validaton batches: ", len(valid_gen))
train_steps = len(train_ids) // args.batch_size
valid_steps = len(valid_ids) // args.batch_size
# define model
model = sm.Unet(args.backbone, encoder_weights='imagenet')
optimizer = optimizers.Adam(lr=args.lr, decay=1e-4)
model.compile(
optimizer=optimizer,
# "Adam",
loss=sm.losses.bce_dice_loss, # sm.losses.bce_jaccard_loss, # sm.losses.binary_crossentropy,
metrics=[sm.metrics.iou_score],
)
#model.summary()
callbacks = [
EarlyStopping(patience=6, verbose=1),
ReduceLROnPlateau(factor=0.1, patience=3, min_lr=1e-7, verbose=1),
ModelCheckpoint('./weights.Epoch{epoch:02d}-Loss{loss:.3f}-VIou{val_iou_score:.3f}.h5', verbose=1,
monitor='val_accuracy', save_best_only=True, save_weights_only=True)
]
train_model(model=model, train_gen=train_gen, valid_gen=valid_gen, epochs=args.epochs)
When I run this code, some epochs progress well, but within 20 epochs a GPU memory overflow (OOM) error occurs, like the one below:
(0) Resource exhausted: OOM when allocating tensor with shape[2,64,96,96] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[{{node decoder_stage2b_bn/FusedBatchNorm}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
So I think it is caused by the data generation. This code generates batches in the following order:
In train.py, a DataGenerator (the keras.utils.Sequence subclass implemented in Dataset.py) is initialized:
train_gen = DataGenerator(train_ids, img_dir, mask_dir, img_size=args.resizing, batch_size=args.batch_size)
valid_gen = DataGenerator(valid_ids, img_dir, mask_dir, img_size=args.resizing, batch_size=args.batch_size)
At the start of the function train_model, the Sequence is converted to a generator using iter_sequence_infinite:
train_gen_out = iter_sequence_infinite(train_gen)
valid_gen_out = iter_sequence_infinite(valid_gen)
Batches are then fetched with the built-in next():
batch = next(train_gen_out)
I thought there would be no memory problem, but one occurs.
What is the problem and how can I solve it?
Thanks.
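One comparison worth trying (a sketch, not a confirmed fix for the OOM): a keras.utils.Sequence can be passed to fit_generator directly, which handles batching, shuffling via on_epoch_end, and validation without the hand-rolled infinite iteration; this drops the custom progress bar and plotting from train_model above.
model.fit_generator(generator=train_gen,
validation_data=valid_gen,
epochs=args.epochs,
callbacks=callbacks)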

Odd problem with Multivariate Input Multi-Step LSTM Time Series Forecasting Models

I have developed Multivariate Input Multi-Step LSTM Time Series Forecasting Models for my dataset according to the tutorial (https://machinelearningmastery.com/how-to-develop-lstm-models-for-multi-step-time-series-forecasting-of-household-power-consumption/).
Yet I have a very odd problem: when I run the code with a small sample (50 samples for training, 10 for testing), the predictions are correct, but when I run the experiment with the full samples (4000 for training, 1000 for testing), the predictions contain NaN values, which leads to errors.
Then, when I add scaling plus relu activation functions plus regularization as in the following code, I can get predictions with the full samples (4000 for training, 1000 for testing), but the predictions are still not correct; I want to forecast 96 steps, yet every step I predict is the same number.
Can you give any useful suggestions for dealing with these forecast accuracy issues?
import time
from math import sqrt
from numpy import split
from numpy import array
from pandas import read_csv
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
import csv
import numpy
from sklearn.preprocessing import MinMaxScaler
from numpy import save
from timeit import default_timer as timer
def scale(train, test):
# fit scaler
scaler = MinMaxScaler(feature_range=(-1, 1))
train = train.astype(float)
test = test.astype(float)
scaler = scaler.fit(train)
# transform train
train = train.reshape(train.shape[0], train.shape[1])
train_scaled = scaler.transform(train)
# transform test
test = test.reshape(test.shape[0], test.shape[1])
test_scaled = scaler.transform(test)
return scaler, train_scaled, test_scaled
# split a univariate dataset into train/test sets
def split_dataset(data):
# split into standard weeks
train, test = data[0:387030, 10:26], data[387030:433881, 10:26]
# train, test = data[0:4850, 10:26], data[4850:5820, 10:26]
# train, test = data[0:387030], data[387029:433880]
# restructure into windows of weekly data
# numpy.savetxt("test.csv", data[387030:433881, :], delimiter=",")
# save('test.npy', data[387030:433881, :])
scaler, train_scaled, test_scaled = scale(train, test)
train_scaled = array(split(train_scaled, len(train_scaled) // 97)) # integer division: np.split needs an int section count
test_scaled = array(split(test_scaled, len(test_scaled) // 97))
return scaler, train_scaled, test_scaled
# create a list of configs to try
def model_configs():
# define scope of configs
# n_input = [12]
n_nodes = [100, 200, 300]
n_epochs = [50, 100]
n_batch = [64]
# n_diff = [12]
# create configs
configs = list()
# for i in n_input:
for j in n_nodes:
for k in n_epochs:
for l in n_batch:
cfg = [j, k, l]
configs.append(cfg)
print('Total configs: %d' % len(configs))
return configs
# evaluate one or more weekly forecasts against expected values
def evaluate_forecasts(actual, predicted):
scores = list()
# calculate an RMSE score for each day
for i in range(0, actual.shape[1], 97):
# for i in range():
# calculate mse
mse = mean_squared_error(actual[:, i, :], predicted[:, i, :])
# calculate rmse
rmse = sqrt(mse)
# store
scores.append(rmse)
# calculate overall RMSE
s = 0
for x in range(actual.shape[0]):
for y in range(actual.shape[1]):
for z in range(actual.shape[2]):
s += (actual[x, y, z] - predicted[x, y, z])**2
score = sqrt(s / (actual.shape[0] * actual.shape[1] * actual.shape[2]))
return score, scores
# convert history into inputs and outputs
def to_supervised(train, n_steps_in, n_steps_out=97, overlap=97):
# flatten data
sequences = train.reshape(
(train.shape[0] * train.shape[1], train.shape[2]))
X, y = list(), list()
for i in range(0, len(sequences), overlap):
end_ix = i + n_steps_in
out_end_ix = end_ix + n_steps_out
# check if we are beyond the dataset
if out_end_ix > len(sequences):
break
# gather input and output parts of the pattern
seq_x, seq_y = sequences[i:end_ix, :], sequences[end_ix:out_end_ix, :]
X.append(seq_x)
y.append(seq_y)
return array(X), array(y)
# train the model
def build_model(train, n_input, config):
# unpack config
n_nodes, n_epochs, n_batch = config
# prepare data
train_x, train_y = to_supervised(train, n_input)
# define parameters
verbose, epochs, batch_size = 0, n_epochs, n_batch
n_timesteps, n_features, n_outputs = train_x.shape[1], train_x.shape[2], train_y.shape[1]
# reshape output into [samples, timesteps, features]
train_y = train_y.reshape((train_y.shape[0], train_y.shape[1], n_features))
# define model
model = Sequential()
model.add(
LSTM(
n_nodes,
activation='relu',
input_shape=(
n_timesteps,
n_features), recurrent_dropout=0.6))
model.add(RepeatVector(n_outputs))
model.add(LSTM(n_nodes, activation='relu', return_sequences=True, recurrent_dropout=0.6))
model.add(TimeDistributed(Dense(n_nodes, activation='relu')))
model.add(TimeDistributed(Dense(n_features)))
model.compile(loss='mse', optimizer='adam')
# fit network
model.fit(
train_x,
train_y,
epochs=epochs,
batch_size=batch_size,
verbose=verbose)
return model
# make a forecast
def forecast(model, history, n_input):
# flatten data
data = array(history)
data = data.reshape((data.shape[0] * data.shape[1], data.shape[2]))
# retrieve last observations for input data
input_x = data[-n_input:, :]
# reshape into [1, n_input, n]
input_x = input_x.reshape((1, input_x.shape[0], input_x.shape[1]))
# forecast the next week
yhat = model.predict(input_x, verbose=0)
# we only want the vector forecast
yhat = yhat[0]
return yhat
# evaluate a single model
def evaluate_model(train, test, n_input, cfg):
start = timer()
# fit model
model = build_model(train, n_input, cfg)
# print("--- %s seconds ---" % (time.time() - start_time))
# history is a list of weekly data
history = [x for x in train]
# walk-forward validation over each week
predictions = list()
for i in range(len(test)):
# predict the week
yhat_sequence = forecast(model, history, n_input)
# store the predictions
predictions.append(yhat_sequence)
# get real observation and add to history for predicting the next week
history.append(test[i, :])
# evaluate predictions days for each week
predictions = array(predictions)
# invert scaling
predictions = predictions.reshape(
(predictions.shape[0] *
predictions.shape[1],
predictions.shape[2]))
predictions = scaler.inverse_transform(predictions)
test = test.reshape((test.shape[0] * test.shape[1], test.shape[2]))
test = scaler.inverse_transform(test)
predictions = array(split(predictions, len(predictions) // 97))
test = array(split(test, len(test) // 97))
score, scores = evaluate_forecasts(test, predictions)
run_time = timer() - start
return cfg[0], cfg[1], cfg[2], score, scores, run_time
# load the new file
dataset = read_csv(
'data_preproccess_5.csv',
header=0,
index_col=0)
# split into train and test
scaler, train_scaled, test_scaled = split_dataset(dataset.values)
# evaluate model and get scores
n_input = 7 * 97
# model configs
cfg_list = model_configs()
scores = [
evaluate_model(
train_scaled,
test_scaled,
n_input,
cfg) for cfg in cfg_list]
If you have multistep output, you can easily reshape your predictions and calculate the error.
My split datasets:
trainX, trainY, testX, testY
Get the prediction results:
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)
Reshape the predictions and the real values:
trainY = trainY.reshape(-1, )
trainPredict = trainPredict.reshape(-1, )
testY = testY.reshape(-1, )
testPredict = testPredict.reshape(-1, )
Calculate the root mean squared error:
print('Train Root mean squared error: {}'.format(math.sqrt(mean_squared_error(trainY, trainPredict))))
print('Test Root mean squared error: {}'.format(math.sqrt(mean_squared_error(testY, testPredict))))
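Pulled together as a runnable sketch (the arrays here are random placeholders, not the asker's data):
import math
import numpy as np
from sklearn.metrics import mean_squared_error
# hypothetical multistep output: 100 samples x 96 forecast steps
testY = np.random.rand(100, 96)
testPredict = np.random.rand(100, 96)
# flatten so every forecast step contributes one error term
rmse = math.sqrt(mean_squared_error(testY.reshape(-1), testPredict.reshape(-1)))
print('Test Root mean squared error: {}'.format(rmse))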

Email classification using word2vec

My goal is to classify emails, so each email has to correspond to a specific category like sport, clothes, and so on.
My idea is to use a Recurrent Neural Network with one layer based on embeddings (from word2vec). I'm using the following "Emojify v2" schema with some changes:
https://github.com/enggen/Deep-Learning-Coursera/blob/master/Sequence%20Models/Week2/Emojify/Emojify%20-%20v2.ipynb
and the word2vec pretrained model downloaded from:
http://hlt.isti.cnr.it/wordembeddings/
The problem is that this neural network does not work properly. When trying to train the network on only 2 categories (20 samples), I get an accuracy of 50-60% on the training set, which is of course too low.
Do you have any hints to improve my results?
What could be the problem?
a low number of samples?
a low number of nodes?
using loss='categorical_crossentropy'?
using optimizer='adam'?
the text of the email is too long?
the length of each email is different; could this be a problem?
I'm using keras (tensorflow).
Edit 1: code inserted below
import numpy as np
import emoji
import matplotlib.pyplot as plt
# utilities that will be needed later
import csv
import pandas as pd
from sklearn.metrics import confusion_matrix
def read_glove_vecs(glove_file):
with open(glove_file, 'r') as f:
words = set()
word_to_vec_map = {}
for line in f:
line = line.strip().split()
curr_word = line[0]
words.add(curr_word)
word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
i = 1
words_to_index = {}
index_to_words = {}
for w in sorted(words):
words_to_index[w] = i
index_to_words[i] = w
i = i + 1
return words_to_index, index_to_words, word_to_vec_map
def get_categories(filename):
categories_code = []
categories_name = []
with open(filename, newline='', encoding='utf-8') as f:
reader = csv.reader(f)
for row in reader:
category_code = row[1];
if (search(category_code, categories_code) == False):
categories_code.append(row[1])
categories_name.append(row[2])
Y = np.asarray(categories_code)
Z = np.asarray(categories_name)
return Y, Z
def search(category, categories):
for c in categories:
if (c == category):
return True
return False
def softmax(x):
"""Compute softmax values for each sets of scores in x."""
e_x = np.exp(x - np.max(x))
return e_x / e_x.sum()
def read_csv_for_email(filename):
name = []
text = []
category = []
category_code = []
with open(filename, newline='', encoding='utf-8') as f:
reader = csv.reader(f)
for row in reader:
name.append(row[0])
category_code.append(row[1])
category.append(row[2])
text.append(row[3])
X = np.asarray(name)
Y = np.asarray(category_code)
Z = np.asarray(category)
W = np.asarray(text)
return X, Y, Z, W
def convert_to_one_hot(Y, C):
Y = np.eye(C)[Y.reshape(-1)]
return Y
# convert to a one-hot vector
def my_convert_to_one_hot(Y, values):
#print(Y)
#print(values)
col = len(values)
row = len(Y)
#print(col)
#print(row)
R = np.zeros((row, col))
for res in range(row):
for i in range(col):
#print(Y[res])
#print(values[i])
if Y[res] == values[i]:
X = [0]*col;
X[i] = 1;
R[res] = np.array(X)
return R;
def print_predictions(X, pred):
print()
for i in range(X.shape[0]):
print(X[i], label_to_emoji(int(pred[i])))
def plot_confusion_matrix(y_actu, y_pred, title='Confusion matrix', cmap=plt.cm.gray_r):
df_confusion = pd.crosstab(y_actu, y_pred.reshape(y_pred.shape[0],), rownames=['Actual'], colnames=['Predicted'], margins=True)
df_conf_norm = df_confusion / df_confusion.sum(axis=1)
plt.matshow(df_confusion, cmap=cmap) # imshow
#plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(df_confusion.columns))
plt.xticks(tick_marks, df_confusion.columns, rotation=45)
plt.yticks(tick_marks, df_confusion.index)
#plt.tight_layout()
plt.ylabel(df_confusion.index.name)
plt.xlabel(df_confusion.columns.name)
def predict(X, Y, W, b, word_to_vec_map):
"""
Given X (sentences) and Y (emoji indices), predict emojis and compute the accuracy of your model over the given set.
Arguments:
X -- input data containing sentences, numpy array of shape (m, None)
Y -- labels, containing index of the label emoji, numpy array of shape (m, 1)
Returns:
pred -- numpy array of shape (m, 1) with your predictions
"""
m = X.shape[0]
pred = np.zeros((m, 1))
for j in range(m): # Loop over training examples
# Split jth test example (sentence) into list of lower case words
words = X[j].lower().split()
# Average words' vectors
avg = np.zeros((50,))
for w in words:
avg += word_to_vec_map[w]
avg = avg/len(words)
# Forward propagation
Z = np.dot(W, avg) + b
A = softmax(Z)
pred[j] = np.argmax(A)
print("Accuracy: " + str(np.mean((pred[:] == Y.reshape(Y.shape[0],1)[:]))))
return pred
import tensorflow as tf
sess = tf.Session()
from keras.layers import Input, Dense, Activation
import numpy as np
np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
np.random.seed(1)
# GRADED FUNCTION: sentences_to_indices
def sentences_to_indices(X, word_to_index, max_len):
"""
Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
The output shape should be such that it can be given to `Embedding()` (described in Figure 4).
Arguments:
X -- array of sentences (strings), of shape (m, 1)
word_to_index -- a dictionary mapping each word to its index
max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this.
Returns:
X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
"""
m = X.shape[0] # number of training examples
### START CODE HERE ###
# Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line)
X_indices = np.zeros((m, max_len))
for i in range(m): # loop over training examples
# Convert the ith training sentence to lower case and split it into words. You should get a list of words.
sentence_words = X[i].lower().split()
# Initialize j to 0
j = 0
# Loop over the words of sentence_words
for w in sentence_words:
# Set the (i,j)th entry of X_indices to the index of the correct word.
if w in word_to_index:
if (j >= max_len):
continue
X_indices[i, j] = word_to_index[w]
# Increment j to j + 1
j = j + 1
### END CODE HERE ###
return X_indices
from gensim.models import Word2Vec
model = Word2Vec.load('/home/ale/Scrivania/emoji_ita/embedding/glove_WIKI') # glove model
def getMyModels (model):
word_to_index = dict({})
index_to_word = dict({})
word_to_vec_map = dict({})
for idx, key in enumerate(model.wv.vocab):
word_to_index[key] = idx
index_to_word[idx] = key
word_to_vec_map[key] = model.wv[key]
return word_to_index, index_to_word, word_to_vec_map
code_train, category_train, category_code_train, text_train = read_csv_for_email('/home/ale/Scrivania/TestClassificazione/dataText2Categories.csv')
#FIXME
X_test, Y_test, Z_test, text_test = read_csv_for_email(r'C:\Users\Alessio\Desktop\3Febbraio\dataText2Categories.csv')
word_to_index, index_to_word, word_to_vec_map = getMyModels(model)
# EMBEDDING LAYER
# GRADED FUNCTION: pretrained_embedding_layer
# emb_dim define dimensionality of your GloVe word vectors (= 50)
def pretrained_embedding_layer(word_to_vec_map, word_to_index, emb_dim):
"""
Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
Arguments:
word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)
Returns:
embedding_layer -- pretrained layer Keras instance
"""
vocab_len = len(word_to_index) + 1 # adding 1 to fit Keras embedding (requirement)
### START CODE HERE ###
# Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors = emb_dim)
emb_matrix = np.zeros((vocab_len, emb_dim))
# Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
for word, index in word_to_index.items():
emb_matrix[index, :] = word_to_vec_map[word]
# Define the Keras embedding layer with the correct input/output sizes and freeze it: set trainable=False so the pretrained vectors are not updated.
embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)
### END CODE HERE ###
# Build the embedding layer, it is required before setting the weights of the embedding layer. Do not modify the "None".
embedding_layer.build((None,))
# Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
embedding_layer.set_weights([emb_matrix])
return embedding_layer
# EMOJIFY2 MODEL
def Classifier(input_shape, word_to_vec_map, word_to_index, emb_dim, num_activation):
"""
Function creating the Emojify-v2 model's graph.
Arguments:
input_shape -- shape of the input, usually (max_len,)
word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)
Returns:
model -- a model instance in Keras
"""
### START CODE HERE ###
# Define sentence_indices as the input of the graph, it should be of shape input_shape and dtype 'int32' (as it contains indices).
sentence_indices = Input(shape=input_shape, dtype=np.int32)
# Create the embedding layer pretrained with GloVe Vectors (≈1 line)
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index, emb_dim)
# Propagate sentence_indices through your embedding layer, you get back the embeddings
embeddings = embedding_layer(sentence_indices)
# Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
# Be careful, the returned output should be a batch of sequences.
X = LSTM(128, return_sequences=True)(embeddings)
# Add dropout with a probability of 0.5
X = Dropout(0.5)(X)
# Propagate X trough another LSTM layer with 128-dimensional hidden state
# Be careful, the returned output should be a single hidden state, not a batch of sequences.
X = LSTM(128)(X)
# Add dropout with a probability of 0.5
X = Dropout(0.5)(X)
# Propagate X through a Dense layer with softmax activation to get back a batch of num_activation-dimensional vectors.
# (A single softmax only; stacking Activation('softmax') on an already-softmaxed output flattens the distribution.)
X = Dense(num_activation, activation='softmax')(X)
# Create Model instance which converts sentence_indices into X.
model = Model(sentence_indices, X)
### END CODE HERE ###
return model
def max_text_length(filename):
max_len = 0
max_id = 0
with open(filename, newline='', encoding='utf-8') as f:
reader = csv.reader(f)
for row in reader:
text = row[3]
current_length = len(text)
if (current_length > max_len):
max_len = current_length
max_id = row[0]
#print(max_id)
return max_len
def create_values_list():
z, y = get_categories(r'C:\Users\Alessio\Desktop\3Febbraio\dataText2Categories.csv')
return y
values = create_values_list()
num_activation = len(values)
maxLen = 300
print(maxLen)
model = Classifier((maxLen,), word_to_vec_map, word_to_index, maxLen, num_activation)
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
Y_train_indices = sentences_to_indices(text_train, word_to_index, maxLen)
Z_train_oh = my_convert_to_one_hot(category_code_train, values)
print(Z_train_oh)
model.fit(Y_train_indices, Z_train_oh, epochs = 60, batch_size = 32, shuffle=True)
model.save('text_classification.h5')
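On the variable-length point raised above, a small sketch of the usual remedy (the email_texts list is hypothetical): index each email, then pad or truncate to a fixed maxLen, which is what sentences_to_indices does by hand; keras.preprocessing.sequence.pad_sequences is the library equivalent.
from keras.preprocessing.sequence import pad_sequences
email_texts = ['ordine scarpe e maglietta', 'risultati della partita'] # hypothetical samples
seqs = [[word_to_index[w] for w in t.lower().split() if w in word_to_index] for t in email_texts]
X_indices = pad_sequences(seqs, maxlen=maxLen, padding='post', truncating='post') # shape (2, maxLen)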

Why does my squared loss becomes negative in TensorFlow?

I've run into a really strange problem: my squared loss becomes negative. Here's my code.
#!/usr/bin/python
# -*- coding:utf8 -*-
from __future__ import print_function
from models.vgg16 import VGG16_fixed
from keras.backend.tensorflow_backend import set_session
from scipy.misc import imsave
from models.generative_model_v2 import gen_model_v2
from scripts.image_process import *
from scripts.utils_func import *
from tensorflow.python import debug as tf_debug
import tensorflow as tf
import os
import time
# configure gpu usage
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.5
set_session(tf.Session(config=config)) # pass gpu setting to Keras
# set learning phase, or batch norm won't work
K.set_learning_phase(1)
# dataset setting
width, height = 256, 256
coco_img_path = '../../dataset/coco/images/train2014/'
sl_img_path = './images/style/'
# a trade-off coefficient between content loss and style loss, which is multiplied with style loss
alpha = 1
# create placeholders for input images
if K.image_data_format() == 'channels_last':
content_img_shape = [width, height, 3]
style_img_shape = [width, height, 3]
else:
content_img_shape = [3, width, height]
style_img_shape = [3, width, height]
with tf.name_scope('input'):
content_img = tf.placeholder(dtype='float32',
shape=(None, content_img_shape[0], content_img_shape[1], content_img_shape[2]),
name='content_img')
style_img = tf.placeholder(dtype='float32',
shape=(None, style_img_shape[0], style_img_shape[1], style_img_shape[2]),
name='style_img')
# load model
main_model, outputs = gen_model_v2(input_content_tensor=content_img, input_style_tensor=style_img)
concat_input = K.concatenate([content_img,
outputs,
style_img], axis=0)
vgg16_model = VGG16_fixed(input_tensor=concat_input,
weights='imagenet', include_top=False)
# get the symbolic outputs of each "key" layer (we gave them unique names).
vgg16_outputs_dict = dict([(layer.name, layer.output) for layer in vgg16_model.layers])
# get relevant layers
content_feature_layers = 'block3_conv3'
style_feature_layers = ['block1_conv2', 'block2_conv2',
'block3_conv3', 'block4_conv3']
# content loss
ct_loss = K.variable(0.)
layer_features = vgg16_outputs_dict[content_feature_layers]
content_img_features = layer_features[0, :, :, :]
outputs_img_features = layer_features[1, :, :, :]
ct_loss += content_loss(content_img_features, outputs_img_features)
# style loss
sl_loss_temp = K.variable(0.)
for layer_name in style_feature_layers:
layer_features = vgg16_outputs_dict[layer_name]
outputs_img_features = layer_features[1, :, :, :]
style_img_features = layer_features[2, :, :, :]
sl = style_loss(style_img_features, outputs_img_features)
sl_loss_temp += (alpha / len(style_feature_layers)) * sl
sl_loss = sl_loss_temp
# combine loss
loss = ct_loss + sl_loss
# write in summary
tf.summary.scalar('content_loss', ct_loss)
tf.summary.scalar("style_loss", sl_loss)
tf.summary.scalar("loss", loss)
# optimization
train_op = tf.train.AdamOptimizer(learning_rate=0.001,
beta1=0.9,
beta2=0.999,
epsilon=1e-08).minimize(loss)
with tf.Session(config=config) as sess:
# Merge all the summaries and write them out to /tmp/mnist_logs (by default)
merged = tf.summary.merge_all()
train_writer = tf.summary.FileWriter('./logs/gen_model_v2',
sess.graph)
# initialize all variables
tf.global_variables_initializer().run()
# get training image
ct_img_name = [x for x in os.listdir(coco_img_path) if x.endswith(".jpg")]
ct_img_num = len(ct_img_name)
print("content image number: ", ct_img_num)
sl_img_name = [x for x in os.listdir(sl_img_path) if x.endswith(".jpg")]
sl_img_num = len(sl_img_name)
print("style image number: ", sl_img_num)
# start training
start_time = time.time()
for i in range(1):
itr = 0
for ct_name in ct_img_name:
if itr > 10: # used to train a small sample of ms coco
break
sl_name = sl_img_name[itr % sl_img_num]
_, loss_val, summary = sess.run([train_op, loss, merged],
feed_dict={content_img: preprocess_image(coco_img_path + ct_name, height, width),
style_img: preprocess_image(sl_img_path + sl_name, height, width)})
train_writer.add_summary(summary, itr * (i+1))
print('iteration', itr, 'loss =', loss_val)
itr += 1
end_time = time.time()
print('Training completed in %ds' % (end_time - start_time))
# save model
main_model.save('./models/gen_model_v2_1.h5')
# use images to test
test_ct_img_path = './images/content/train-1.jpg'
test_ct_img = preprocess_image(test_ct_img_path, height, width)
test_sl_img_path = './images/style/starry_night.jpg'
test_sl_img = preprocess_image(test_sl_img_path, height, width)
# feed test images into model
output = sess.run(outputs, feed_dict={content_img: test_ct_img, style_img: test_sl_img})
output = deprocess_image(output)
print('Output image shape:', output.shape[1:4])
imsave('./images/autoencoder/test_v2_1.png', output[0])
and my loss function is defined as below:
#!/usr/bin/python
# -*- coding:utf8 -*-
import numpy as np
from keras import backend as K
import tensorflow as tf
# the gram matrix of an image tensor (feature-wise outer product)
def gram_matrix(x):
assert K.ndim(x) == 3
if K.image_data_format() == 'channels_first':
features = K.batch_flatten(x)
else:
features = K.batch_flatten(K.permute_dimensions(x, (2, 0, 1)))
gram = K.dot(features, K.transpose(features))
return gram
def style_loss(featuremap_1, featuremap_2):
assert K.ndim(featuremap_1) == 3
assert K.ndim(featuremap_2) == 3
g1 = gram_matrix(featuremap_1)
g2 = gram_matrix(featuremap_2)
channels = 3
if K.image_data_format() == 'channels_first':
size = K.shape(featuremap_1)[1] * K.shape(featuremap_1)[2]
else:
size = K.shape(featuremap_1)[0] * K.shape(featuremap_1)[1]
size = K.cast(size, tf.float32)
return K.sum(K.square(g1 - g2)) / (4. * (channels ** 2) * (size ** 2))
def content_loss(base, combination):
return K.sum(K.square(combination - base))
As you can see, my loss value is squared using K.square(), so how can it be negative?
When I run the code, the loss decreases sharply into negative values, which seems impossible.
You're starting with ct_loss (and sl_loss_temp) as a K.variable(0.). A Keras variable is trainable, so AdamOptimizer.minimize(loss) updates it directly and drives it, and with it the total loss, negative. Just set ct_loss to the content loss:
ct_loss = content_loss(content_img_features, outputs_img_features)
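A sketch of how the loss section reads with that fix applied (same tensors as in the question; no K.variable involved, so the optimizer can no longer push the loss itself negative):
# content loss as a plain tensor
layer_features = vgg16_outputs_dict[content_feature_layers]
ct_loss = content_loss(layer_features[0, :, :, :], layer_features[1, :, :, :])
# style loss accumulated as a plain tensor
sl_loss = 0.
for layer_name in style_feature_layers:
layer_features = vgg16_outputs_dict[layer_name]
sl = style_loss(layer_features[2, :, :, :], layer_features[1, :, :, :])
sl_loss += (alpha / len(style_feature_layers)) * sl
loss = ct_loss + sl_loss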