Question: I am doing a multi-class image classification using TensorFlow 2.5 on Google Colab. I received three different values of classification accuracy and I do not know which one I should trust and why they are different.
Demonstration:
when I was evaluating on the test set, I received accuracy_1
29/29 [==============================] - 5s 147ms/step - loss: 1.1036 - accuracy: 0.3186
when I was predicting on the test set, I received accuracy_2, which is 0.22
              precision    recall  f1-score   support

           0       0.69      0.12      0.21      1305
           1       0.15      0.78      0.26       272
           2       0.14      0.13      0.13       231

    accuracy                           0.22      1808
   macro avg       0.33      0.34      0.20      1808
weighted avg       0.54      0.22      0.20      1808
Here's how I got accuracy_3, whose value is 0.2129424778761062:
from sklearn.metrics import accuracy_score
prediction = np.argmax(detector.predict(test_dataset), axis=1)
accuracy_3 = accuracy_score(
np.concatenate([label.numpy() for image, label in test_dataset.take(-1)]),
prediction
)
I discovered that if I run the code block that calculates accuracy_3 multiple times, I get a different result each time, but the results do not differ much from accuracy_2, which is 0.22. Here is the code for the calculation of accuracy_1 and accuracy_2:
from tensorflow.keras.callbacks import Callback
class Peek(Callback):
def on_epoch_begin(self, epoch, logs=None):
current_decayed_lr = self.model.optimizer._decayed_lr(tf.float32).numpy()
print(f"Current learning rate: {current_decayed_lr}")
def on_epoch_end(self, epoch, logs=None):
print("Evaluating...")
self.model.evaluate(test_dataset, verbose=1) # calculates accuracy_1
print("Predicting...")
predictions = np.argmax(self.model.predict(test_dataset), axis=1)
true_categories = np.array([label.numpy() for image, label in test_dataset.unbatch()])
print(classification_report(true_categories, predictions)) # calculates accuracy_2
The difference between accuracy_2 and accuracy_3 is most likely due to random chance, but accuracy_1 is much larger than the other two. I searched on Stack Overflow, and some posts say the difference could be due to shuffle=True in ImageDataGenerator when creating the test set. My case is different because, for performance's sake, I was not using ImageDataGenerator; I was loading data with TFRecords. Here is the full code.
import os
import math
import numpy as np
import tensorflow as tf
from glob import glob
from progressbar import progressbar
from os.path import basename, exists
from tensorflow.sparse import to_dense
from tensorflow.data import Dataset, Options, TFRecordDataset
from tensorflow.image import decode_jpeg, encode_jpeg, resize
from tensorflow.train import Feature, Features, BytesList, Int64List, FloatList, Example
from tensorflow.io import read_file, TFRecordWriter, FixedLenFeature, VarLenFeature, parse_single_example
from tensorflow.data.experimental import AUTOTUNE
class DataLoader:
def __init__(self, subset_name):
self.subset_name = subset_name
self.file_pattern = glob(
f"./dataset/{self.subset_name}/**/*.jpg",
recursive=True
)
self.target_size = (224, 224)
self.classes = [b"Negative", b"Positive", b"Unreadable"]
self.n_images = len(self.file_pattern)
self.n_shards = 32
self.write_shard_size = math.ceil(1.0 * self.n_images / self.n_shards)
self.read_shard_size = 64
self.output_dir = f"tfrecords-jpeg-{subset_name}-{'x'.join(map(lambda x: str(x), self.target_size))}"
def fetch_image_and_label(self, filename):
bits = read_file(filename)
image = decode_jpeg(bits)
image = resize(image, self.target_size)
height = tf.shape(image)[0]
width = tf.shape(image)[1]
image = tf.cast(image, tf.uint8)
image = encode_jpeg(image, optimize_size=True, chroma_downsampling=False)
label = tf.expand_dims(filename, axis=-1)
label = tf.strings.split(label, sep="/")
label = label.values[-2]
return image, label, height, width
@staticmethod
def _bytestring_feature(list_of_bytestrings):
return Feature(bytes_list=BytesList(value=list_of_bytestrings))
@staticmethod
def _int_feature(list_of_ints):
return Feature(int64_list=Int64List(value=list_of_ints))
@staticmethod
def _float_feature(list_of_floats):
return Feature(float_list=FloatList(value=list_of_floats))
def to_tfrecord(self, tfrec_filewriter, img_bytes, label, height, width):
class_num = np.argmax(np.array(self.classes) == label)
one_hot_class = np.eye(len(self.classes))[class_num]
feature = {
"image": self._bytestring_feature([img_bytes]),
"class": self._int_feature([class_num]),
"label": self._bytestring_feature([label]),
"size": self._int_feature([height, width]),
"one_hot_class": self._float_feature(one_hot_class.tolist())
}
return Example(features=Features(feature=feature))
def write_records(self):
print(f"{self.n_images} images, {self.n_shards} shards with {self.write_shard_size} images each.")
filenames = Dataset.list_files(self.file_pattern, seed=35155)
dataset = filenames.map(self.fetch_image_and_label, num_parallel_calls=AUTOTUNE).batch(self.write_shard_size)
if not exists(self.output_dir):
os.mkdir(self.output_dir)
print("Writing TFRecords...")
for shard, (image, label, height, width) in enumerate(dataset):
shard_size = image.numpy().shape[0]
filename = f"{self.output_dir}/{str(shard).zfill(2)}-{shard_size}.tfrec"
with TFRecordWriter(filename) as out_file:
for i in progressbar(range(shard_size)):
example = self.to_tfrecord(
out_file,
image.numpy()[i],
label.numpy()[i],
height.numpy()[i],
width.numpy()[i]
)
out_file.write(example.SerializeToString())
print(f"Wrote file {filename} containing {shard_size} records")
def _read_tfrecord(self, example):
features = {
"image": FixedLenFeature([], tf.string),
"class": FixedLenFeature([], tf.int64),
"label": FixedLenFeature([], tf.string),
"size": FixedLenFeature([2], tf.int64),
"one_hot_class": VarLenFeature(tf.float32)
}
example = parse_single_example(example, features)
image = decode_jpeg(example["image"], channels=3)
image = tf.reshape(image, [*self.target_size, 3])
class_num = example["class"]
label = example["label"]
height = example["size"][0]
width = example["size"][1]
one_hot_class = to_dense(example["one_hot_class"])
# return image, class_num, label, height, width, one_hot_class
# return only image and class_num because we're classifying images
return image, class_num
def read_records(self):
from tensorflow.io.gfile import glob
option_no_order = Options()
option_no_order.experimental_deterministic = False
filenames = glob(f"{self.output_dir}/*.tfrec")
dataset = TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
dataset = dataset.with_options(option_no_order)
dataset = dataset.map(self._read_tfrecord, num_parallel_calls=AUTOTUNE)
dataset = dataset.shuffle(10000)
dataset = dataset.prefetch(buffer_size=AUTOTUNE)
dataset = dataset.batch(self.read_shard_size)
return dataset
train_loader = DataLoader("train")
validation_loader = DataLoader("validation")
test_loader = DataLoader("test")
train_dataset = train_loader.read_records()
validation_dataset = validation_loader.read_records()
test_dataset = test_loader.read_records()
train_dataset = train_dataset.concatenate(validation_dataset)
The difference between accuracy_2 and accuracy_3 still exists, and accuracy_3 still changes every time I run the code block that computes it, even after dataset = dataset.shuffle(10000) is removed from read_records in the DataLoader class.
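To check whether the ordering of test_dataset is stable between iterations, a small diagnostic like the following might help (my own sketch, not part of the original pipeline; it assumes test_dataset and numpy as imported above):
import numpy as np
# Collect the labels from two separate passes over the same dataset object.
labels_pass_1 = np.concatenate([label.numpy() for image, label in test_dataset])
labels_pass_2 = np.concatenate([label.numpy() for image, label in test_dataset])
# If this prints False, each iteration yields batches in a different order, so labels
# gathered in one pass will not line up with predictions computed in another pass.
print("Same order across passes:", np.array_equal(labels_pass_1, labels_pass_2))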
I will also paste the code showing how the model was instantiated and compiled, to provide more background information.
from tensorflow.keras import Input, Model
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.applications.densenet import DenseNet201
from tensorflow.keras.layers import GlobalAveragePooling2D, Dropout, Dense
from tensorflow.keras.applications.densenet import preprocess_input
def create_model():
feature_extractor = DenseNet201(
weights="imagenet",
input_shape=(224, 224, 3),
include_top=False
)
feature_extractor.trainable = True
inputs = Input([224, 224, 3])
x = preprocess_input(inputs)
x = feature_extractor(x)
x = GlobalAveragePooling2D()(x)
x = Dense(32, activation="elu")(x)
x = Dropout(0.8)(x)
outputs = Dense(3, activation="softmax")(x)
detector = Model(inputs, outputs)
detector.compile(
optimizer=SGD(learning_rate=0.001, momentum=0.9),
loss=["sparse_categorical_crossentropy"],
metrics=["sparse_categorical_accuracy"]
)
return detector
detector = create_model()
peek = Peek()
detector.fit(
train_dataset,
epochs=1,
validation_data=test_dataset,
class_weight=class_weight,
callbacks=[peek],
)
Related
I want to train a mixture density model using tfd.MixtureSameFamily, but after several thousand epochs of training the result becomes NaN. Here is fully functioning code to replicate this.
import section
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers
import numpy as np
data generation
number_of_instances = 1000
x_data = np.linspace(-5.5,5.5,number_of_instances)
r_data = np.random.randn(number_of_instances)
y_data = 7*np.sin(x_data*0.75)+ x_data + r_data
x_data = x_data.astype("float32")
x_data = x_data.reshape(x_data.size,1)
y_data = y_data.astype("float32")
y_data = y_data.reshape(y_data.size,1)
model building section
hidden_units = 100
k_mixt = 5
l2_reg = 1e-3
learning_rate = 1e-3
hidden_dense = Dense(units=hidden_units,
input_dim=y_data,
activation=tf.nn.relu,
kernel_regularizer=regularizers.l2(l2_reg),
name=f'Dense',
trainable=True)
alpha_dense = Dense(units=k_mixt,
activation=tf.nn.softmax,
name='alpha',
trainable=True)
mu_dense = Dense(units=k_mixt,
activation=None,
name='mu',
trainable=True)
sigma_dense = Dense(k_mixt,
activation=tf.nn.softplus,
name='sigma',
trainable=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
training section
for epoch in range(int(5e4)):
with tf.GradientTape() as tape:
hidden = hidden_dense(y_data)
alpha = alpha_dense(hidden)
mu = mu_dense((hidden))
sigma=sigma_dense(hidden)
gm = tfd.MixtureSameFamily(mixture_distribution=tfd.Categorical(probs=alpha),components_distribution=tfd.Normal(loc=mu, scale=sigma))
loss = -tf.reduce_sum(gm.log_prob(tf.reshape(x_data,(-1,))))
grads = tape.gradient(loss,tape.watched_variables())
(grads_clipped, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
optimizer.apply_gradients(zip(grads_clipped, tape.watched_variables()))
if epoch%5e2 == 0:
print(epoch, loss)
What I found is that the NaN first appears in the sigma_dense layer and the hidden_dense layer in some epoch and then spreads to all other layers. It seems that the cause of the NaN is calculating the gradient of the loss with respect to sigma.
As I learnt from a YouTube video, the gradient of the log-likelihood with respect to sigma is:
d(ln L)/d(sigma) = -n/sigma + (1/sigma^3) * ((x1 - mu)^2 + ... + (xn - mu)^2)
Could this derivative formula be the cause of the NaN? Does anyone have any idea?
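To see why this derivative could blow up numerically, here is a tiny self-contained sketch (my own illustration, not code from the post) showing how the gradient of a Gaussian negative log-likelihood with respect to sigma grows as sigma shrinks:
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

x = tf.constant([0.5, -1.2, 2.0])
for sigma_value in [1.0, 1e-2, 1e-4]:
    sigma = tf.Variable(sigma_value)
    with tf.GradientTape() as tape:
        nll = -tf.reduce_sum(tfd.Normal(loc=0.0, scale=sigma).log_prob(x))
    # The magnitude scales roughly like 1/sigma^3, so a mixture component whose
    # softplus output collapses toward zero can overflow float32 and propagate NaNs.
    print(sigma_value, tape.gradient(nll, sigma).numpy())
One common mitigation (an assumption on my part, not something the post tried) is to add a small floor to the scale, e.g. scale=sigma + 1e-6, so that no component's standard deviation can collapse to zero.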
I am using graph convolutions in Deepchem/Keras for predicting molecular properties. Following the Deepchem tutorials I created a data generator. While there is no error in my code below, I fail to understand why the size of pred changes with epoch and batch_size.
First we create some dummy data.
!pip install --pre deepchem
!pip install --pre rdkit
import deepchem as dc
import numpy as np
import tensorflow as tf
from deepchem.feat.mol_graphs import ConvMol
from deepchem.models.layers import GraphConv, GraphGather  # layers used in TestModel below
from tensorflow.keras import layers
mol = ['C-C-O']*240
ftr = dc.feat.ConvMolFeaturizer(per_atom_fragmentation=False)
X=ftr.featurize(mol)
y = np.arange(0,240,1)
w = np.arange(0,240,1)
ids = np.arange(0,240,1)
ds = dc.data.NumpyDataset(X=X, y=y, ids=ids)
Edit: We use the following function as generator:
def data_generator(dataset, epochs=1, batch_size = 100, pad_batches = True):
print(dataset)
for ind, (X_b, y_b, w_b, ids_b) in enumerate(dataset.iterbatches(batch_size, epochs,
deterministic=False, pad_batches=pad_batches)):
multiConvMol = ConvMol.agglomerate_mols(X_b)
inputs = [multiConvMol.get_atom_features(), multiConvMol.deg_slice, np.array(multiConvMol.membership)]
for i in range(1, len(multiConvMol.get_deg_adjacency_lists())):
inputs.append(multiConvMol.get_deg_adjacency_lists()[i])
labels = [y_b]
weights = [w_b]
yield (inputs, labels, weights)
(end edit)
Then we define the model and fit it to the dataset generated above:
batch_size = 100
n_tasks = 1
class TestModel(tf.keras.Model):
def __init__(self, model = 1):
super(TestModel, self).__init__()
self.model = model
#____________Test Model 1___________
if self.model == 1:
self.gc1 = GraphConv(128, activation_fn=tf.nn.tanh)
self.readout = GraphGather(batch_size=batch_size,
activation_fn=tf.nn.tanh)
self.dense2 = layers.Dense(1)
def call(self, inputs):
#____________Test Model 1___________
if self.model == 1:
gc1_output = self.gc1(inputs)
readout_output = self.readout([gc1_output]+ inputs[1:])
dense2_output = self.dense2(readout_output)
return dense2_output
#Fit_generator
print("_________\nFitting:")
testmodel = dc.models.KerasModel(TestModel(1), loss=dc.models.losses.L2Loss())
testmodel.fit_generator(data_generator(ds, epochs=1, batch_size = 100))
Finally we try to predict the dataset labels setting epochs = 2:
#Predict
print("_________\nPredicting:")
pred = testmodel.predict_on_generator(data_generator(ds, epochs = 2, batch_size = 100, pad_batches = True))
print(ds.y.shape, pred.shape)
Giving:
_________
Predicting:
<NumpyDataset X.shape: (240,), y.shape: (240,), w.shape: (240,), ids: [0 1 2 ... 237 238 239], task_names: [0]>
(240,) (600, 1)
However, if I change epochs to 1, the size of pred changes to (300, 1), i.e. half of what we had before. Similarly, changing the batch_size affects the prediction size too.
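As a quick size check (my own sketch, reusing ds and data_generator from above), counting the examples the generator yields shows where 300 and 600 come from, since 240 examples padded to batches of 100 give 3 padded batches per epoch:
for epochs in (1, 2):
    n_examples = sum(len(labels[0]) for _, labels, _ in
                     data_generator(ds, epochs=epochs, batch_size=100, pad_batches=True))
    # Expect 300 for epochs=1 and 600 for epochs=2, matching pred.shape[0] above,
    # i.e. one prediction row per (padded) example the generator yields.
    print(f"epochs={epochs}: generator yields {n_examples} padded examples")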
Can anyone explain what I'm doing wrong?
I am trying to train a Keras ResNet50 model for image classification using a tutorial. Instead of the inbuilt data generator, I want to use the albumentations library for augmentation.
from albumentations import Compose
transforms = Compose([HorizontalFlip()])
I have read a few articles, but I could not figure out how to implement albumentations.
Which line of code should I modify to implement albumentations?
I am reproducing the code below after removing unnecessary lines.
NUM_CLASSES = 2
CHANNELS = 3
IMAGE_RESIZE = 224
RESNET50_POOLING_AVERAGE = 'avg'
DENSE_LAYER_ACTIVATION = 'softmax'
OBJECTIVE_FUNCTION = 'categorical_crossentropy'
LOSS_METRICS = ['accuracy']
NUM_EPOCHS = 300
EARLY_STOP_PATIENCE = 20
STEPS_PER_EPOCH_TRAINING = 20
STEPS_PER_EPOCH_VALIDATION = 20
BATCH_SIZE_TRAINING = 10
BATCH_SIZE_VALIDATION = 10
# %% ---------------------------------------------------------------------
TrainingData_directory = 'C:/datafolder/Train'
ValidationData_directory = 'C:/datafolder/Validation'
ModelCheckpointPath = 'C:/datafolder/ResNet50_Weights.hdf5'
# %% ---------------------------------------------------------------------
from albumentations import Compose
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
# %% ---------------------------------------------------------------------
model = Sequential()
model.add(ResNet50(include_top = False, pooling = RESNET50_POOLING_AVERAGE, weights = 'imagenet'))
model.add(Dense(NUM_CLASSES, activation = DENSE_LAYER_ACTIVATION))
model.layers[0].trainable = False
from tensorflow.keras import optimizers
sgd = optimizers.SGD(lr = 0.001, decay = 1e-6, momentum = 0.9, nesterov = True)
model.compile(optimizer = sgd, loss = OBJECTIVE_FUNCTION, metrics = LOSS_METRICS)
from keras.applications.resnet50 import preprocess_input
from keras.preprocessing.image import ImageDataGenerator
image_size = IMAGE_RESIZE
data_generator = ImageDataGenerator(preprocessing_function = preprocess_input)
train_generator = data_generator.flow_from_directory(TrainingData_directory,
target_size = (image_size, image_size),
batch_size = BATCH_SIZE_TRAINING,
class_mode = 'categorical')
validation_generator = data_generator.flow_from_directory(ValidationData_directory,
target_size = (image_size, image_size),
batch_size = BATCH_SIZE_VALIDATION,
class_mode = 'categorical')
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint
cb_early_stopper = EarlyStopping(monitor = 'val_loss', patience = EARLY_STOP_PATIENCE)
cb_checkpointer = ModelCheckpoint(filepath = ModelCheckpointPath,
monitor = 'val_loss', save_best_only = True, mode = 'auto')
fit_history = model.fit_generator(
train_generator,
steps_per_epoch=STEPS_PER_EPOCH_TRAINING,
epochs = NUM_EPOCHS,
validation_data=validation_generator,
validation_steps=STEPS_PER_EPOCH_VALIDATION,
callbacks=[cb_checkpointer, cb_early_stopper]
)
I think you can do it using the ImageDataGenerator preprocessing_function. The function should take in a single image as input and return an image. So in your case:
def augmentor(img):
    # place your code here to do the albumentations transforms;
    # it should result in a single transformed image, called aug_img here
    return aug_img / 127.5 - 1  # scales the pixels between -1 and +1, which is what preprocess_input does
data_generator = ImageDataGenerator(preprocessing_function = augmentor)
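For concreteness, a possible augmentor using the Compose pipeline from the question might look like this (my own sketch; the cast to uint8 and the scaling are assumptions, not the original answerer's code):
import numpy as np
from albumentations import Compose, HorizontalFlip
from tensorflow.keras.preprocessing.image import ImageDataGenerator

transforms = Compose([HorizontalFlip()])  # the pipeline from the question

def augmentor(img):
    # ImageDataGenerator passes one image at a time as a float array in [0, 255]
    aug_img = transforms(image=img.astype(np.uint8))['image']
    return aug_img / 127.5 - 1.0  # scale to [-1, +1], as described above

data_generator = ImageDataGenerator(preprocessing_function=augmentor)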
You can include it into the preprocessing function passed to ImageDataGenerator:
def preprocessing_function(x):
preprocessed_x = preprocess_input(x)
transformed_image = transforms(image=preprocessed_x)['image']
return transformed_image
ImageDataGenerator(preprocessing_function = preprocessing_function)
That's (IMO) the limitation, or loss of flexibility, that one might come across when using a built-in data generator (ImageDataGenerator). You should implement your own custom data generator.
Check this kernel: [TF.Keras]: SOTA Augmentation in Sequence Generator, where we've shown how one can use albumentations, cutmix, mixup, and fmix-type advanced augmentation in a custom generator. Here is a basic approach to using albumentations in a custom data generator.
import albumentations as A
# For Training
def albu_transforms_train(data_resize):
return A.Compose([
A.ToFloat(),
A.Resize(data_resize, data_resize),
A. [.....what ever......]
], p=1.)
class Generator(tf.keras.utils.Sequence):
def __getitem__(self, index):
...........
Data = np.empty((self.batch_size, *self.dim))
Target = np.empty((self.batch_size, 5), dtype = np.float32)
for i, k in enumerate(idx):
# load the image file using cv2
image = cv2.imread(self.img_path + self.data['image_id'][k])
image = cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
# call augmentor / albumentation
res = self.augment(image=image)
image = res['image']
# assign
Data[i,:, :, :] = image
Target[i,:] = self.label.loc[k, :].values
return Data, Target
# call the generator
check_gens = Generator(...., transform = albu_transforms_train(128))
I am training an image classification model on 21000 images. I have created a data pipeline with the help of the tf.data API of TensorFlow. My issue is that training is too slow despite using this API. I have also enabled the TensorFlow GPU version. Please help me out. At first I thought the Keras ImageDataGenerator was slowing down my training time, but now that I have changed to a tf.data pipeline it still does not utilize my GPU. Below is my whole code.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import models, layers
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.applications import ResNet50, EfficientNetB3, InceptionV3, DenseNet121
from tensorflow.keras.optimizers import Adam
# ignoring warnings
import warnings
warnings.simplefilter("ignore")
import os,cv2
base_dir = "D:/cassava-leaf-disease-classification/"
train_csv = pd.read_csv("D:/cassava-leaf-disease-classification/train.csv")
# print(train_csv.head())
df_sample = pd.read_csv("D:/cassava-leaf-disease-classification/sample_submission.csv")
train_images = "D:/cassava-leaf-disease-classification/train_images/"+train_csv['image_id']
# print(train_images)
# print(os.listdir(train_images))
train_labels = pd.read_csv(os.path.join(base_dir, "train.csv"))
# print(train_labels)
BATCH_SIZE = 16
EPOCHS = 25
STEPS_PER_EPOCH = len(train_labels)*0.8 / BATCH_SIZE
TARGET_SIZE = 300
# train_labels['label'] = train_labels.label.astype('str')
labels = train_labels.iloc[:,-1].values
# print(labels)
def build_decoder(with_labels=True, target_size=(TARGET_SIZE, TARGET_SIZE), ext='jpg'):
def img_decode(img_path):
file_bytes = tf.io.read_file(img_path)
if ext == 'png':
img = tf.image.decode_png(file_bytes, channels=3)
elif ext in ['jpg', 'jpeg']:
img = tf.image.decode_jpeg(file_bytes, channels=3)
else:
raise ValueError("Image extension not supported")
img = tf.cast(img, tf.float32) / 255.0
img = tf.image.resize(img, target_size)
return img
def decode_with_labels(img_path, label):
return img_decode(img_path), label
if with_labels == True:
return decode_with_labels
else:
return img_decode
def build_augmenter(with_labels=True):
def augment(img):
img = tf.image.random_flip_left_right(img)
img = tf.image.random_flip_up_down(img)
img = tf.image.random_brightness(img, 0.1)
img = tf.image.random_contrast(img, 0.9, 1.1)
img = tf.image.random_saturation(img, 0.9, 1.1)
return img
def augment_with_labels(img, label):
return augment(img), label
if with_labels == True:
return augment_with_labels
else:
return augment
def build_dataset(paths, labels=None, bsize=32, cache=True,
decode_fn=None, augment_fn=None,
augment=True, repeat=True, shuffle=1024,
cache_dir=""):
if cache_dir != "" and cache is True:
os.makedirs(cache_dir, exist_ok=True)
if decode_fn is None:
decode_fn = build_decoder(labels is not None)
if augment_fn is None:
augment_fn = build_augmenter(labels is not None)
AUTO = tf.data.experimental.AUTOTUNE
slices = paths if labels is None else (paths, labels)
dset = tf.data.Dataset.from_tensor_slices(slices)
dset = dset.map(decode_fn, num_parallel_calls=AUTO)
# dset = dset.cache(cache_dir) if cache else dset
dset = dset.map(augment_fn, num_parallel_calls=AUTO) if augment else dset
dset = dset.repeat() if repeat else dset
dset = dset.shuffle(shuffle) if shuffle else dset
dset = dset.batch(bsize).prefetch(AUTO)
return dset
# Train test split
(train_img, valid_img,train_labels,valid_labels) = train_test_split(train_images,labels,train_size = 0.8,random_state = 0)
# print(train, valid)
# Tensorflow datasets
train_df = build_dataset(
train_img, train_labels, bsize=BATCH_SIZE,
cache=True)
valid_df = build_dataset(
valid_img, valid_labels, bsize=BATCH_SIZE,
repeat=False, shuffle=False, augment=False,
cache=True)
def create_model():
model = models.Sequential()
model.add(EfficientNetB3(include_top=False, weights='imagenet',
input_shape=(TARGET_SIZE,TARGET_SIZE,3)))
model.add(layers.GlobalAveragePooling2D())
model.add(layers.Dense(5,activation='softmax'))
model.compile(optimizer=Adam(lr=0.001),
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
return model
model = create_model()
model.summary()
model_save = ModelCheckpoint('C:/Users/rosha/PycharmProjects/CLDD/saved_Models/EffNetB3_300_16_best_weights.h5',
save_best_only=True,
save_weights_only=True,
monitor='val_accuracy',
mode='max',
verbose=1
)
early_stop = EarlyStopping(monitor='val_accuracy',
min_delta=0.001,
patience=5,
mode='max',
verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy',
factor=0.3,
patience=2,
min_delta=0.001,
mode='max',
verbose=1)
history = model.fit(
train_df,
validation_data=valid_df,
steps_per_epoch=STEPS_PER_EPOCH,
epochs=EPOCHS,
callbacks=[model_save, early_stop, reduce_lr],
verbose=1,
)
plt.rcParams.update({'font.size': 16})
hist = pd.DataFrame(history.history)
fig, (ax1, ax2) = plt.subplots(figsize=(12, 12), nrows=2, ncols=1)
hist['loss'].plot(ax=ax1, c='k', label='training loss')
hist['val_loss'].plot(ax=ax1, c='r', linestyle='--', label='validation loss')
ax1.legend()
hist['accuracy'].plot(ax=ax2, c='k', label='training accuracy')
hist['val_accuracy'].plot(ax=ax2, c='r', linestyle='--', label='validation accuracy')
ax2.legend()
plt.show()
model.save('./EffNetB3_300_16.h5')
So here is a small checklist I like to go over:
Execute the following code to check whether the GPU is found by tensorflow:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
If the output is "Num GPUs Available: 0", then you should check that you indeed have tensorflow-gpu installed; you might also want to check that the supporting libraries are the GPU versions as well.
If your libraries are correct, you will need to check whether your CUDA driver installation is correct. This step is somewhat OS-dependent, but there are many tutorials online. My favourite for TF can be found on the official website: https://www.tensorflow.org/install/gpu
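As one more quick check (my addition, not part of the original checklist), you can confirm the installed TensorFlow build has CUDA support and watch where ops get placed:
import tensorflow as tf
print("Built with CUDA:", tf.test.is_built_with_cuda())
tf.debugging.set_log_device_placement(True)  # log the device chosen for each op
# This should log something like .../device:GPU:0 when a GPU is actually used.
print(tf.reduce_sum(tf.random.normal([1000, 1000])))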
Here I would like to reproduce a tutorial usage of LSTM in MXNet, based on the TensorFlow example (located at https://github.com/mouradmourafiq/tensorflow-lstm-regression/blob/master/lstm_sin.ipynb).
Here is my main code:
import mxnet as mx
import numpy as np
import pandas as pd
import argparse
import os
import sys
from data_processing import generate_data
import logging
head = '%(asctime)-15s %(message)s'
logging.basicConfig(level=logging.DEBUG, format=head)
TIMESTEPS = 3
BATCH_SIZE = 100
X, y = generate_data(np.sin, np.linspace(0, 100, 10000), TIMESTEPS, seperate=False)
train_iter = mx.io.NDArrayIter(X['train'], y['train'], batch_size=BATCH_SIZE, shuffle=True, label_name='lro_label')
eval_iter = mx.io.NDArrayIter(X['val'], y['val'], batch_size=BATCH_SIZE, shuffle=False)
test_iter = mx.io.NDArrayIter(X['test'], batch_size=BATCH_SIZE, shuffle=False)
num_layers = 3
num_hidden = 50
data = mx.sym.Variable('data')
label = mx.sym.Variable('lro_label')
stack = mx.rnn.SequentialRNNCell()
for i in range(num_layers):
stack.add(mx.rnn.LSTMCell(num_hidden=num_hidden, prefix='lstm_l%d_'%i))
#stack.reset()
outputs, states = stack.unroll(length=TIMESTEPS,
inputs=data,
layout='NTC',
merge_outputs=True)
outputs = mx.sym.reshape(outputs, shape=(BATCH_SIZE, -1))
# purpose of fc1 was to make shape change to (batch_size, *), or label shape won't match LSTM unrolled output shape.
outputs = mx.sym.FullyConnected(data=outputs, num_hidden=1, name='fc1')
label = mx.sym.reshape(label, shape=(-1,))
outputs = mx.sym.LinearRegressionOutput(data=outputs,
label=label,
name='lro')
contexts = mx.cpu(0)
model = mx.mod.Module(symbol = outputs,
data_names = ['data'],
label_names = ['lro_label'])
model.fit(train_iter, eval_iter,
optimizer_params = {'learning_rate':0.005},
num_epoch=4,
batch_end_callback=mx.callback.Speedometer(BATCH_SIZE, 2))
This code runs, but the train-accuracy is NaN.
How can I make it correct? And since the unrolled output shape includes sequence_length, how can it match the label shape? Does my fc1 layer make sense?
Passing auto_reset=False to the Speedometer callback, e.g. batch_end_callback=mx.callback.Speedometer(BATCH_SIZE, 2, auto_reset=False), should fix the NaN train-acc.
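Applied to the fit call from the question, that would look roughly like this (a sketch; eval_metric='mse' is my own addition, since accuracy is not a meaningful metric for regression targets):
model.fit(train_iter, eval_iter,
          optimizer_params={'learning_rate': 0.005},
          num_epoch=4,
          eval_metric='mse',  # report mean squared error instead of accuracy
          batch_end_callback=mx.callback.Speedometer(BATCH_SIZE, 2, auto_reset=False))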