Neural Network loss drastically jumps during training - tensorflow

During training of my LSTM network, the loss/accuracy went from 0.75/85% at epoch 42 to 4.97/17% at epoch 44.
Why would this happen?
I currently have only 1500 training examples and am overfitting heavily. Could that be the cause?
For context, I am using Keras with an LSTM network to predict reactions to text on Slack. My training data are label-encoded sentences, and I am predicting the reaction class, which is just a one-hot representation of all the possible classes.
Here is my model in Keras:
# create the model
embedding_vector_length = 128
model = Sequential()
model.add(Embedding(max_words, embedding_vector_length, input_length=max_message_length))
model.add(LSTM(1024))
model.add(Dense(classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
The loss and accuracy end up recovering by the 100th epoch. Is this something to worry about?
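One common way to watch for and damp spikes like this is to hold out a validation set and clip the gradient norm in Adam. A minimal sketch, assuming max_words, max_message_length, classes, X_train and y_train from the full script below (clipnorm=1.0 and the 20% split are illustrative, not tuned):
from keras.optimizers import Adam
# Sketch only: same architecture as above, with gradient-norm clipping and a
# validation split so a spike also shows up against held-out data.
embedding_vector_length = 128
model = Sequential()
model.add(Embedding(max_words, embedding_vector_length, input_length=max_message_length))
model.add(LSTM(1024))
model.add(Dense(classes, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(clipnorm=1.0),  # clip the gradient norm to damp sudden jumps
              metrics=['accuracy'])
model.fit(X_train, y_train, epochs=100, batch_size=64,
          validation_split=0.2)  # watch val_loss/val_acc around the spike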
# coding: utf-8
# In[1]:
import pandas as pd
import re
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import os
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 999
np.random.seed(7)
# In[2]:
raw_data = pd.DataFrame()
for file in os.listdir('data/random'):
    temp_df = pd.read_json(path_or_buf='data/random/' + file, orient='values', dtype=False)
    raw_data = pd.concat([raw_data, temp_df])
for file in os.listdir('data/company-wide'):
    temp_df = pd.read_json(path_or_buf='data/company-wide/' + file, orient='values', dtype=False)
    raw_data = pd.concat([raw_data, temp_df])
for file in os.listdir('data/politics'):
    temp_df = pd.read_json(path_or_buf='data/politics/' + file, orient='values', dtype=False)
    raw_data = pd.concat([raw_data, temp_df])
# In[3]:
raw_data.shape
# In[4]:
# Only select messages with reactions and non-empty text
data = raw_data.loc[raw_data['reactions'].notnull() & (raw_data['text'] != '')][['reactions', 'text']]
# In[5]:
data.shape
# In[6]:
def extractEmojiName(x):
    # return the name of the reaction with the highest count
    max_count = 0
    result = ''
    for emoji in x:
        if emoji['count'] > max_count:
            max_count = emoji['count']  # track the running maximum
            result = emoji['name']
    return result

def removeUrls(x):
    return re.sub(r"<(http|https).*>", "", x)

def removeUsername(x):
    return re.sub(r"<#.*>", "", x)
# In[7]:
data['reactions_parsed'] = data['reactions'].apply(lambda x: extractEmojiName(x))
# In[8]:
data['text'] = data['text'].apply(lambda x: removeUrls(x))
data['text'] = data['text'].apply(lambda x: removeUsername(x))
# In[9]:
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)  # num_words was called nb_words in older Keras
tokenizer.fit_on_texts(data['text'])
text_vectors = tokenizer.texts_to_sequences(data['text'])
data['text_vector'] = text_vectors
# In[10]:
encoder = LabelEncoder()
data['reactions_encoded'] = encoder.fit_transform(data['reactions_parsed'])
# In[11]:
data
# In[12]:
classes = len(data['reactions_parsed'].unique())
target_vector = data['reactions_encoded'].values
reactions_vector = np.eye(classes)[target_vector]
data['reactions_vector'] = reactions_vector.tolist()
# In[13]:
max_message_length = data['text_vector'].apply(lambda x: len(x)).max()
# In[14]:
X_train, X_test, y_train, y_test = train_test_split(text_vectors, reactions_vector, test_size=.2, stratify=reactions_vector)
# In[15]:
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)
# In[17]:
X_train = sequence.pad_sequences(X_train, maxlen=max_message_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_message_length)
# In[18]:
# create the model
embedding_vector_length = 128
model = Sequential()
model.add(Embedding(max_words, embedding_vector_length, input_length=max_message_length))
model.add(Dropout(0.2))
model.add(LSTM(1024))
model.add(Dropout(0.2))
model.add(Dense(classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
# In[ ]:
model.fit(X_train, y_train, epochs=35, batch_size=64)  # epochs was called nb_epoch in older Keras
# In[ ]:
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))
# In[45]:
scores
# In[56]:
def show_predictions(model, X_test, y_test):
    predictions = model.predict(X_test)
    for index, prediction in enumerate(predictions):
        # inverse_transform expects an array-like in current scikit-learn
        print('Prediction -> ' + encoder.inverse_transform([prediction.argmax()])[0])
        print('Actual -> ' + encoder.inverse_transform([y_test[index].argmax()])[0])
# In[57]:
show_predictions(model, X_test, y_test)
# In[58]:
show_predictions(model, X_train[0:100], y_train[0:100])
# In[ ]:

Related

ValueError: Input 0 of layer "Discriminator" is incompatible with the layer: expected shape=(None, 3), found shape=(100, 2)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, \
    accuracy_score, balanced_accuracy_score, classification_report, \
    plot_confusion_matrix, confusion_matrix
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from tensorflow.keras.layers import Input, Dense, Reshape, Flatten, Dropout, multiply, Concatenate
from tensorflow.keras.layers import BatchNormalization, Activation, Embedding, ZeroPadding2D, LeakyReLU
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import RandomNormal
import tensorflow.keras.backend as K
from sklearn.utils import shuffle
import pickle
from tqdm import tqdm
from scipy import stats
np.random.seed(1635848)
def get_data_XYZ_one_dimensional(n, a=-2, c=1/2, random_state=None, verbose=True):
    """
    Generates pseudo-random data distributed according to the distribution defined in section 2.1 of the
    document "Math/Confounders and data generation.pdf".
    :param n: Number of data points to generate.
    :param a: Mean of X.
    :param c: Shape parameter for the Weibull distribution.
    :param random_state: Used to seed numpy.random before generating random numbers.
    :param verbose: If True, displays a progress bar; if False, it does not.
    :return: Pandas DataFrame with three columns (corresponding to X, Y and Z) and n rows (corresponding
        to the n generated pseudo-random samples).
    """
    np.random.seed(random_state)
    output = []
    iterator = tqdm(range(n)) if verbose else range(n)
    for _ in iterator:
        X = stats.norm.rvs(loc=a, scale=1)  # use the `a` parameter rather than a hardcoded mean
        Y = stats.bernoulli.rvs(p=1/(1+np.exp(-X)))
        if Y == 0:
            Z = stats.expon.rvs(scale=np.exp(-X))  # note: np.exp(-X) could be cached for more computational efficiency but would render the code less useful
        elif Y == 1:
            Z = stats.weibull_min.rvs(c=c, scale=np.exp(-X))
        else:
            assert False
        output.append((X, Y, Z))
    return pd.DataFrame(output, columns=["Personal information", "Treatment", "Time to event"])
data = get_data_XYZ_one_dimensional(n=100, random_state=0)
print(data)
# The Architecture of CGAN
class cGAN():
    """
    Class containing 3 methods (and __init__): generator, discriminator and train.
    The generator is trained using random noise and a label as inputs. The discriminator
    is trained using real/fake samples and labels as inputs.
    """
    def __init__(self, latent_dim=100, out_shape=3):
        self.latent_dim = latent_dim
        self.out_shape = out_shape
        self.num_classes = 2
        # using Adam as our optimizer
        optimizer = Adam(0.0002, 0.5)
        # building the discriminator
        self.discriminator = self.discriminator()
        self.discriminator.compile(loss=['binary_crossentropy'],
                                   optimizer=optimizer,
                                   metrics=['accuracy'])
        # building the generator
        self.generator = self.generator()
        noise = Input(shape=(self.latent_dim,))
        label = Input(shape=(1,))
        gen_samples = self.generator([noise, label])
        # we don't train the discriminator when training the generator
        self.discriminator.trainable = False
        valid = self.discriminator([gen_samples, label])
        # combining both models
        self.combined = Model([noise, label], valid)
        self.combined.compile(loss=['binary_crossentropy'],
                              optimizer=optimizer,
                              metrics=['accuracy'])

    def generator(self):
        init = RandomNormal(mean=0.0, stddev=0.02)
        model = Sequential()
        model.add(Dense(128, input_dim=self.latent_dim))
        model.add(Dropout(0.2))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(256))
        model.add(Dropout(0.2))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(512))
        model.add(Dropout(0.2))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(self.out_shape, activation='tanh'))
        noise = Input(shape=(self.latent_dim,))
        label = Input(shape=(1,), dtype='int32')
        label_embedding = Flatten()(Embedding(self.num_classes, self.latent_dim)(label))
        model_input = multiply([noise, label_embedding])
        gen_sample = model(model_input)
        model.summary()
        return Model([noise, label], gen_sample, name="Generator")

    def discriminator(self):
        init = RandomNormal(mean=0.0, stddev=0.02)
        model = Sequential()
        model.add(Dense(512, input_dim=self.out_shape, kernel_initializer=init))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(256, kernel_initializer=init))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dropout(0.4))
        model.add(Dense(128, kernel_initializer=init))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dropout(0.4))
        model.add(Dense(1, activation='sigmoid'))
        gen_sample = Input(shape=(self.out_shape,))
        label = Input(shape=(1,), dtype='int32')
        label_embedding = Flatten()(Embedding(self.num_classes, self.out_shape)(label))
        model_input = multiply([gen_sample, label_embedding])
        validity = model(model_input)
        model.summary()
        return Model(inputs=[gen_sample, label], outputs=validity, name="Discriminator")

    def train(self, X_train, y_train, pos_index, neg_index, epochs, sampling=False,
              batch_size=32, sample_interval=100, plot=True):
        # though not recommended, defining losses as globals helps when analysing our cGAN outside the class
        global G_losses
        global D_losses
        G_losses = []
        D_losses = []
        # Adversarial ground truths
        valid = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))
        for epoch in range(epochs):
            # if sampling==True --> train the discriminator with 3 samples from the positive class
            # and the rest from the negative class
            if sampling:
                idx1 = np.random.choice(pos_index, 3)
                idx0 = np.random.choice(neg_index, batch_size - 3)
                idx = np.concatenate((idx1, idx0))
            # if sampling!=True --> train the discriminator using random instances in batches of 32
            else:
                idx = np.random.choice(len(y_train), batch_size)
            samples, labels = X_train[idx], y_train[idx]
            samples, labels = shuffle(samples, labels)
            # Sample noise as generator input
            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))
            gen_samples = self.generator.predict([noise, labels])
            # label smoothing
            if epoch < epochs // 1.5:
                valid_smooth = (valid + 0.1) - (np.random.random(valid.shape) * 0.1)
                fake_smooth = (fake - 0.1) + (np.random.random(fake.shape) * 0.1)
            else:
                valid_smooth = valid
                fake_smooth = fake
            # Train the discriminator
            self.discriminator.trainable = True
            d_loss_real = self.discriminator.train_on_batch([samples, labels], valid_smooth)
            d_loss_fake = self.discriminator.train_on_batch([gen_samples, labels], fake_smooth)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
            # Train the generator
            self.discriminator.trainable = False
            sampled_labels = np.random.randint(0, 2, batch_size).reshape(-1, 1)
            g_loss = self.combined.train_on_batch([noise, sampled_labels], valid)
            if (epoch + 1) % sample_interval == 0:
                print('[%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f'
                      % (epoch, epochs, d_loss[0], g_loss[0]))
                G_losses.append(g_loss[0])
                D_losses.append(d_loss[0])
            if plot:
                if epoch + 1 == epochs:
                    plt.figure(figsize=(10, 5))
                    plt.title("Generator and Discriminator Loss")
                    plt.plot(G_losses, label="G")
                    plt.plot(D_losses, label="D")
                    plt.xlabel("iterations")
                    plt.ylabel("Loss")
                    plt.legend()
                    plt.show()
data.Treatment.value_counts()
scaler = StandardScaler()
X = scaler.fit_transform(data.drop('Treatment', axis=1))
y = data['Treatment'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
lgb_1 = lgb.LGBMClassifier()
lgb_1.fit(X_train, y_train)
y_pred = lgb_1.predict(X_test)
# evaluation
print(classification_report(y_test, y_pred))
plot_confusion_matrix(lgb_1, X_test, y_test)
plt.show()
le = preprocessing.LabelEncoder()
for i in ['Personal information', 'Treatment', 'Time to event']:
    data[i] = le.fit_transform(data[i].astype(str))
y_train = y_train.reshape(-1, 1)
pos_index = np.where(y_train == 1)[0]
neg_index = np.where(y_train == 0)[0]
cgan = cGAN()  # assumed instantiation: the snippet calls cgan.train without creating cgan
cgan.train(X_train, y_train, pos_index, neg_index, epochs=500)
Here, training fails with ValueError: Input 0 of layer "Discriminator" is incompatible with the layer: expected shape=(None, 3), found shape=(100, 2). I understand that I have to fix the shape by changing the input, but where and how do I do it?
Also, there are 3 columns in data, so how do I go about making this work?
I think the fix is out_shape=2 and not 3, because the generated output has 2 features and you stated the number of classes to be 2 as well. Unless there is something else I am missing.
def __init__(self, latent_dim=100, out_shape=2):
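A quick sanity check of that shape, as a hypothetical snippet assuming the data and scaler from the question:
# Sketch only: confirm the discriminator's input width against the feature matrix.
X = scaler.fit_transform(data.drop('Treatment', axis=1))
print(X.shape)  # (100, 2): 'Personal information' and 'Time to event' remain,
                # matching the (100, 2) batches reported in the error
cgan = cGAN(out_shape=X.shape[1])  # derive out_shape from the data instead of hardcoding it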

Low validation accuracy despite high training accuracy, and very high validation loss

I have a question: when I'm training the model, why does it get high accuracy and low loss, while the validation accuracy is low and the validation loss is so high?
Is there any recommended way to improve this?
It trains on 256 samples and validates on 65 samples.
This is the code that I used:
import numpy as np
import cv2
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from keras.preprocessing.image import ImageDataGenerator
from keras.utils.np_utils import to_categorical
# from tensorflow.keras.models import Sequential
from keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from keras.layers import Conv2D,MaxPooling2D,Dense, Dropout, Activation, Flatten
# from tensorflow.keras.optimizers import Adam,RMSprop,SGD
from keras.optimizers import Adam , RMSprop, SGD
import pickle
from streamlit_webrtc import webrtc_streamer
################################################################
path = 'DatasetsBatik'
testRatio = 0.2
validationRatio = 0.2
imageDimension =(32,32,3)
batchSizeVal=2
epochsVal=200
stepsPerEpochVal=200
################################################################
images = []
classNumber = []
myList = os.listdir(path)
print("Total Number of Classes Detected : ", len(myList))
NumberOfClasses = len(myList)
print("Importing Classes ....")
for x in range(0, NumberOfClasses):
    # 0 represents Kawung,
    # 1 represents Megamendung,
    # 2 represents Parang,
    # 3 represents Sekar Jagad,
    # 4 represents Simbut
    myPicList = os.listdir(path + "/" + str(x))
    for y in myPicList:
        currentImage = cv2.imread(path + "/" + str(x) + "/" + y)
        currentImage = cv2.resize(currentImage, (imageDimension[0], imageDimension[1]))
        images.append(currentImage)
        classNumber.append(x)
    print(x, end=" ")
print("")
print(len(classNumber))
images = np.array(images)
classNumber = np.array(classNumber)
# print(images.shape)
# print(classNumber.shape)
### Split Data
X_train,X_test,Y_train,Y_test = train_test_split(images,classNumber,test_size = testRatio)
X_train,X_validation,Y_train,Y_validation= train_test_split(X_train,Y_train,test_size = validationRatio)
print("x_train.shape = " + str(X_train.shape))
print("x_test.shape = " +str(X_test.shape))
print("x_validation.shape = "+str(X_validation.shape))
NumberOfSamples = []
for x in range(0, NumberOfClasses):
    # print(len(np.where(Y_train == x)[0]))
    NumberOfSamples.append(len(np.where(Y_train == x)[0]))
print("Number Of Samples : " +str(NumberOfSamples))
plt.figure(figsize=(10,5))
plt.bar(range(0,NumberOfClasses),NumberOfSamples)
plt.title("Number of Images for each Class")
plt.xlabel("Class ID")
plt.ylabel("Number of Images")
plt.show()
# print(X_train)
def PreProcessing(img):
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = cv2.equalizeHist(img)
    img = img / 255
    return img
# img = X_train[30]
# img = cv2.resize(img,(300,300))
# cv2.imshow("PreProcessed",img)
# cv2.waitKey(0)
X_train=np.array(list(map(PreProcessing,X_train)))
# print(X_train)
X_test=np.array(list(map(PreProcessing,X_test)))
X_validation=np.array(list(map(PreProcessing,X_validation)))
# print(X_train.shape)
X_train = X_train.reshape(X_train.shape[0],X_train.shape[1],X_train.shape[2],1)
X_test = X_test.reshape(X_test.shape[0],X_test.shape[1],X_test.shape[2],1)
X_validation = X_validation.reshape(X_validation.shape[0],X_validation.shape[1],X_validation.shape[2],1)
# print(X_train.shape)
dataGen = ImageDataGenerator(width_shift_range=0.1,height_shift_range=0.1,zoom_range=0.2,shear_range=0.1,rotation_range=10)
dataGen.fit(X_train)
Y_train = to_categorical(Y_train,NumberOfClasses)
Y_test = to_categorical(Y_test,NumberOfClasses)
Y_validation = to_categorical(Y_validation,NumberOfClasses)
def myModel():
    NumberOfFilters1 = 128
    NumberOfFilters2 = 256
    sizeOfFilter1 = (5, 5)
    sizeOfFilter2 = (3, 3)
    sizeOfPool = (2, 2)
    NumberOfNode = 128
    model = Sequential()
    model.add(Conv2D(NumberOfFilters1, sizeOfFilter1, input_shape=(imageDimension[0], imageDimension[1], 1), activation='relu'))
    model.add(Dropout(0.5))
    model.add(MaxPooling2D(pool_size=sizeOfPool))
    # model.add(Conv2D(NumberOfFilters1, sizeOfFilter1, activation='relu'))
    # model.add(MaxPooling2D(pool_size=sizeOfPool))
    # model.add(Dropout(0.5))
    model.add(Conv2D(NumberOfFilters2, sizeOfFilter2, activation='relu'))
    model.add(MaxPooling2D(pool_size=sizeOfPool))
    model.add(Dropout(0.5))
    # model.add(Conv2D(NumberOfFilters2, sizeOfFilter2, activation='relu'))
    # model.add(MaxPooling2D(pool_size=sizeOfPool))
    # model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(NumberOfNode, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(NumberOfClasses, activation='softmax'))
    model.compile(Adam(lr=0.0005), loss='categorical_crossentropy', metrics=['accuracy'])
    return model
model = myModel()
print(model.summary())
history = model.fit_generator(dataGen.flow(X_train, Y_train, batch_size=batchSizeVal),
                              steps_per_epoch=stepsPerEpochVal,
                              epochs=epochsVal,
                              validation_data=(X_validation, Y_validation),
                              shuffle=1)
# history= model.fit(X_train, Y_train, batch_size=batchSizeVal,epochs=epochsVal,verbose=1,validation_data=(X_validation, Y_validation))
plt.figure(1)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['training','validation'])
plt.title('Loss')
plt.xlabel('epoch')
plt.figure(2)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.legend(['training','validation'])
plt.title('Accuracy')
plt.xlabel('epoch')
plt.show()
score= model.evaluate(X_test,Y_test,verbose=0)
print('Test Score =',score[0])
print('Score Accuracy = ',score[1])
print(score)
# print(model.metrics_names)
pickle_out=open("10_TestModel4","wb")
pickle.dump(model,pickle_out)
pickle_out.close()
This is the result that I have now:
(training log from Epoch 1 through Epoch 200 omitted)
I have tried removing and adding hidden layers, and also tried different learning rates, epochs, batch sizes, steps per epoch, and adding dropout between the layers. Any recommendation would be very helpful, thank you.
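One thing that may be worth checking (an assumption, not a confirmed fix): with batchSizeVal=2 and stepsPerEpochVal=200, each epoch draws 400 augmented samples from roughly 256 training images, so every epoch re-sees heavily augmented copies of the same few images. A minimal sketch that ties steps_per_epoch to the dataset size and stops once validation loss stalls (batch size and patience here are illustrative):
from keras.callbacks import EarlyStopping
# Sketch only: assumes X_train, Y_train, X_validation, Y_validation, dataGen,
# epochsVal and myModel() from the code above; restore_best_weights needs Keras >= 2.2.3.
batchSizeVal = 32
stepsPerEpochVal = len(X_train) // batchSizeVal  # one pass over the training data per epoch
early_stop = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
model = myModel()
history = model.fit_generator(dataGen.flow(X_train, Y_train, batch_size=batchSizeVal),
                              steps_per_epoch=stepsPerEpochVal,
                              epochs=epochsVal,
                              validation_data=(X_validation, Y_validation),
                              callbacks=[early_stop])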

Accuracy drops when using ImageDataGenerator

I am working on skin disease classification and trying to build a CNN model. I have approximately 200 images. I have split my data using train_test_split from sklearn.model_selection.
code snippet:
data = []  # this is where I will store all the data
for category in categories:
    path = os.path.join(data_dir, category)
    # print(path)
    class_num = categories.index(category)
    # print(class_num)
    for img in os.listdir(path):
        # print(img)
        try:
            img_array = cv2.imread(os.path.join(path, img))
            new_array = cv2.resize(img_array, img_size)
            data.append([new_array, class_num])
        except Exception as e:
            pass
X = []
Y = []
for features, label in data:
    X.append(features)
    Y.append(label)
X = np.array(X)
X = X.astype('float32')/255.0
X = X.reshape(-1,height, width,3)
Y = np.array(Y)
from keras.utils.np_utils import to_categorical
Y = to_categorical(Y, num_classes = 4)
from sklearn.model_selection import train_test_split
# train_ratio = 0.75
# validation_ratio = 0.15
# test_ratio = 0.10
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42, stratify=Y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=1, stratify=y_train)
import pickle
pickle.dump(X_train, open('/content/drive/MyDrive/Data/X_train', 'wb'))
pickle.dump(y_train, open('/content/drive/MyDrive/Data/y_train', 'wb'))
pickle.dump(X_test, open('/content/drive/MyDrive/Data/X_test', 'wb'))
pickle.dump(y_test, open('/content/drive/MyDrive/Data/y_test', 'wb'))
pickle.dump(X_val, open('/content/drive/MyDrive/Data/X_val', 'wb'))
pickle.dump(y_val, open('/content/drive/MyDrive/Data/y_val', 'wb'))
The accuracy drastically drops when I use ImageDataGenerator for data augmentation.
code snippet:
opt = Adam(learning_rate=0.00001, name='Adam')
# pass the optimizer instance to compile; the string 'Adam' would use default settings
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
epochs = 80
from tensorflow.keras import callbacks
import time
import keras
from keras.callbacks import EarlyStopping
es_callback = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=20)
datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=30,
    shear_range=0.3,
    zoom_range=0.3,
    width_shift_range=0.4,
    height_shift_range=0.4,
    horizontal_flip=True,
    fill_mode='nearest'
)
checkpoint = callbacks.ModelCheckpoint(
    filepath='/content/drive/MyDrive/Model1/model.{epoch:02d}-{accuracy:.2f}-{val_accuracy:.2f}.h5',
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    mode='auto'
)
history5 = model.fit(datagen.flow(X_train,
                                  y_train,
                                  batch_size=32),
                     epochs=epochs,
                     validation_data=(X_val, y_val))
Without data augmentation the validation accuracy is 55%; with data augmentation the validation accuracy is 30%.
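One possible contributor worth ruling out (an assumption based on the snippets above, not a confirmed diagnosis): X is already divided by 255 before the split, and the generator's rescale=1./255 scales the training batches a second time, while the validation data keeps its original [0, 1] range. A minimal sketch that scales only once:
from keras.preprocessing.image import ImageDataGenerator
# Sketch only: X_train/X_val are assumed to be already scaled to [0, 1] as above,
# so the generator applies augmentation but no further rescaling.
datagen = ImageDataGenerator(
    rotation_range=30,
    shear_range=0.3,
    zoom_range=0.3,
    width_shift_range=0.4,
    height_shift_range=0.4,
    horizontal_flip=True,
    fill_mode='nearest'
)
history5 = model.fit(datagen.flow(X_train, y_train, batch_size=32),
                     epochs=epochs,
                     validation_data=(X_val, y_val))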

Compare the example of Pytorch and Keras on Cifar10 data

I use the CIFAR10 dataset to learn how to code using Keras and PyTorch.
The environment is Python 3.6.7, Torch 1.0.0, Keras 2.2.4, TensorFlow 1.14.0.
I use the same batch size, number of epochs, learning rate and optimizer.
I use DenseNet121 as the model.
After training, Keras gets 69% accuracy on the test data.
PyTorch gets just 54% on the test data.
I know the results can differ, but why is the result so bad in PyTorch?
Here is the Keras code:
import os, keras
from keras.datasets import cifar10
from keras.applications.densenet import DenseNet121
batch_size = 32
num_classes = 10
epochs = 20
# The data, split between train and test sets:
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')
# Convert class vectors to binary class matrices.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
# model
model = DenseNet121(include_top=True, weights=None, input_shape=(32,32,3), classes=10)
# initiate SGD optimizer
opt = keras.optimizers.SGD(lr=0.001, momentum=0.9)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test),
          shuffle=True)
# Score trained model.
scores = model.evaluate(x_test, y_test, verbose=1)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])
Here is the PyTorch code:
import torch
import torchvision
import torchvision.transforms as transforms
from torch import flatten
import torch.optim as optim
from torchvision import transforms, models
from torch.nn import Linear, Softmax, Module, Sequential, CrossEntropyLoss
import numpy as np
from tqdm import tqdm
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
transform = transforms.Compose([transforms.ToTensor()])
trainset = torchvision.datasets.CIFAR10(root='./DataSet', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True, num_workers=0)
testset = torchvision.datasets.CIFAR10(root='./DataSet', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4, shuffle=False, num_workers=0)
import torch.nn as nn
import torch.nn.functional as F
class Net(Module):
    def __init__(self):
        super(Net, self).__init__()
        self.funFeatExtra = Sequential(*[i for i in list(models.densenet121().children())[:-1]])
        self.funFlatten = flatten
        self.funOutputLayer = Linear(1024, 10)
        self.funSoftmax = Softmax(dim=1)

    def forward(self, x):
        x = self.funFeatExtra(x)
        x = self.funFlatten(x, 1)
        x = self.funOutputLayer(x)
        x = self.funSoftmax(x)
        return x
net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
for epoch in range(20):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in tqdm(enumerate(trainloader, 0)):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = net.cuda()(inputs.cuda())
        loss = criterion(outputs, labels.cuda())
        loss.backward()
        optimizer.step()
        # print statistics
        running_loss += loss.item()
        # if i % 2000 == 1999:  # print every 2000 mini-batches
        #     print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2000))
        #     running_loss = 0.0
print('Finished Training')
########################################################################
# The results seem pretty good.
#
# Let us look at how the network performs on the whole dataset.
correct = 0
total = 0
with torch.no_grad():
    for data in tqdm(testloader):
        images, labels = data
        outputs = net.cpu()(images.cpu())
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print('Accuracy of the network on the 10000 test images: %d %%' % (100 * correct / total))
You are not supposed to softmax the model output before you pass it to CrossEntropyLoss. Per the documentation:
This criterion combines nn.LogSoftmax() and nn.NLLLoss() in one single class.
...
The input is expected to contain raw, unnormalized scores for each class.
You can softmax them separately (outside of forward()) when calculating accuracy.
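A minimal sketch of that change against the Net class above (the softmax is removed from forward(); argmax over the raw logits picks the same class, so probabilities are only needed if you explicitly want them):
# Sketch only: forward() returns raw logits, as CrossEntropyLoss expects.
def forward(self, x):
    x = self.funFeatExtra(x)
    x = self.funFlatten(x, 1)
    x = self.funOutputLayer(x)
    return x  # no softmax here; CrossEntropyLoss applies log-softmax internally

# At evaluation time, apply softmax outside the model if probabilities are needed:
# probs = torch.softmax(net(images), dim=1)
# _, predicted = torch.max(probs, 1)  # same argmax as on the raw logits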

The same model converged in Keras but not in TensorFlow; how is that possible?

I'm trying to work with LSTMs in TensorFlow, but I've got to the point where I can't make a simple IMDB sentiment model converge.
I took a Keras model and tried to duplicate the exact same model in TensorFlow. In Keras it trains and converges, but in TensorFlow it just gets stuck at some point (0.69 loss).
I tried to make them as equal as possible; the only difference I can tell is that in Keras the padding is before the sequence, while in TensorFlow I use 'post' padding due to TensorFlow conventions.
Any idea what's wrong with my TensorFlow model?
from __future__ import print_function
import random
import numpy as np
from tensorflow.contrib.keras.python.keras.preprocessing import sequence
from tensorflow.contrib.keras.python.keras.models import Sequential
from tensorflow.contrib.keras.python.keras.layers import Dense, Dropout, Activation
from tensorflow.contrib.keras.python.keras.layers import Embedding
from tensorflow.contrib.keras.python.keras.layers import LSTM
from tensorflow.contrib.keras.python.keras.layers import Conv1D, MaxPooling1D
from tensorflow.contrib.keras.python.keras.datasets import imdb
import tensorflow as tf
# Embedding
max_features = 30000
maxlen = 2494
embedding_size = 128
# Convolution
kernel_size = 5
filters = 64
pool_size = 4
# LSTM
lstm_output_size = 70
# Training
batch_size = 30
epochs = 2
class TrainData:
    def __init__(self, batch_sz=batch_size):
        (x_train, y_train), (_, _) = imdb.load_data(num_words=max_features)
        y_train = [[int(x == 1), int(x != 1)] for x in y_train]
        self._batch_size = batch_sz
        self._train_data = sequence.pad_sequences(x_train, padding='pre')
        self._train_labels = y_train

    def next_batch(self):
        if len(self._train_data) < self._batch_size:
            self.__init__()
        batch_x, batch_y = self._train_data[:self._batch_size], self._train_labels[:self._batch_size]
        self._train_data = self._train_data[self._batch_size:]
        self._train_labels = self._train_labels[self._batch_size:]
        return batch_x, batch_y

    def batch_generator(self):
        while True:
            if len(self._train_data) < self._batch_size:
                self.__init__()
            batch_x, batch_y = self._train_data[:self._batch_size], self._train_labels[:self._batch_size]
            self._train_data = self._train_data[self._batch_size:]
            self._train_labels = self._train_labels[self._batch_size:]
            yield batch_x, batch_y

    def get_num_batches(self):
        return int(len(self._train_data) / self._batch_size)


def length(sequence):
    used = tf.sign(tf.abs(sequence))
    length = tf.reduce_sum(used, reduction_indices=1)
    length = tf.cast(length, tf.int32)
    return length
def get_model(x, y):
    embedding = tf.get_variable("embedding", [max_features, embedding_size], dtype=tf.float32)
    embedded_x = tf.nn.embedding_lookup(embedding, x)
    print(x)
    print(embedded_x)
    print(length(x))
    cell_1 = tf.contrib.rnn.BasicLSTMCell(lstm_output_size)
    output_1, state_1 = tf.nn.dynamic_rnn(cell_1, embedded_x, dtype=tf.float32, scope="rnn_layer1",
                                          sequence_length=length(x))
    # Select last output.
    last_index = tf.shape(output_1)[1] - 1
    # reshaping to [seq_length, batch_size, num_units]
    output = tf.transpose(output_1, [1, 0, 2])
    last = tf.gather(output, last_index)
    # Softmax layer
    with tf.name_scope('fc_layer'):
        weight = tf.get_variable(name="weights", shape=[lstm_output_size, 2])
        bias = tf.get_variable(shape=[2], name="bias")
        logits = tf.matmul(last, weight) + bias
    loss = tf.losses.softmax_cross_entropy(y, logits=logits)
    optimizer = tf.train.AdamOptimizer()
    optimize_step = optimizer.minimize(loss=loss)
    return loss, optimize_step
def tf_model():
    x_holder = tf.placeholder(tf.int32, shape=[None, maxlen])
    y_holder = tf.placeholder(tf.int32, shape=[None, 2])
    loss, opt_step = get_model(x_holder, y_holder)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        step = 0
        for epoch in range(10):
            cost_epochs = []
            train_data = TrainData()
            cost_batch = 0
            for batch in range(train_data.get_num_batches()):
                x_train, y_train = train_data.next_batch()
                _, cost_batch = sess.run([opt_step, loss],
                                         feed_dict={x_holder: x_train,
                                                    y_holder: y_train})
                cost_epochs.append(cost_batch)
                step += 1
                # if step % 100 == 0:
            print("Epoch: " + str(epoch))
            print("\tcost: " + str(np.mean(cost_epochs)))
def keras_model():
    # print('Loading data...')
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    y_test = [[int(x == 1), int(x != 1)] for x in y_test]
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen, padding='pre')
    model = Sequential()
    model.add(Embedding(max_features, embedding_size, input_length=maxlen))
    model.add(LSTM(lstm_output_size))
    model.add(Dense(2))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    print('Train...')
    data = TrainData()
    model.fit_generator(data.batch_generator(), steps_per_epoch=data.get_num_batches(),
                        epochs=epochs,
                        validation_data=(x_test, y_test))


if __name__ == '__main__':
    # keras_model()
    tf_model()
EDIT
When I limit the sequence length to 100, both models converge, so I assume there is something different in the LSTM layer.
Check the initial values of your operations. In my case, the Adadelta optimizer in Keras had an initial learning rate of 1.0, while in tf.keras it was 0.001, so on the MNIST dataset it converged much more slowly.
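A minimal sketch of that kind of check against the code above (defaults differ between Keras releases, so pinning the learning rate explicitly removes the guesswork; the 0.001 here is illustrative, and the optimizer import path is assumed to mirror the question's tensorflow.contrib.keras imports):
# Sketch only: pass the learning rate explicitly on both sides so framework
# defaults cannot silently differ.
from tensorflow.contrib.keras.python.keras.optimizers import Adam
# Keras side (in keras_model): replace the 'adam' string with an explicit instance
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=0.001),
              metrics=['accuracy'])
# TF side (in get_model): pin the same rate
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
optimize_step = optimizer.minimize(loss=loss)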