How to define the data to fit the classifier - tensorflow

I am new to TensorFlow. I created a 204x4 matrix where the first 3 columns are features and the last column is the target. How do I need to convert the array so that TensorFlow can train on the data?
# Cut `seq` into three consecutive slices: the first `llength` rows for
# training, the next `tlength` rows for validation, the remainder for testing.
TRAINING_SET, VALIDATION_SET, TEST_SET = (
    np.asarray(part)
    for part in (
        seq[:llength],
        seq[llength:llength + tlength],
        seq[llength + tlength:],
    )
)
num_epochs = 100
batch_size = 32
# Number of columns in each row of the training array.
featureColumns = TRAINING_SET.shape[1]
# define a function to get data as batch, you can use this function for test and validation also by simply changing shuffle=False and replacing tf.train.shuffle_batch as tf.train.batch
def data_input_fn(trainset, batch_size, num_epochs, toShuffle):
    """Build batched (features, labels) tensors from a 2-D array.

    The last column of `trainset` is used as the label; every column before
    it is used as a feature (the column count comes from the module-level
    `featureColumns`).

    # Arguments
        trainset: 2-D array whose rows are examples.
        batch_size: number of examples per emitted batch.
        num_epochs: how many times the input producer may go over the data.
        toShuffle: if truthy, examples are shuffled and batched with
            `tf.train.shuffle_batch`; otherwise order is kept and
            `tf.train.batch` is used.

    # Returns
        A `(features_batch, labels_batch)` tuple of tensors.
    """
    data_f = trainset[:, :(featureColumns - 1)]
    data_l = trainset[:, (featureColumns - 1)]
    data_f_single, data_l_single = tf.train.slice_input_producer(
        [data_f, data_l], num_epochs=num_epochs, shuffle=toShuffle)
    if toShuffle:
        data_f_batch, data_l_batch = tf.train.shuffle_batch(
            [data_f_single, data_l_single],
            batch_size=batch_size,
            capacity=400,
            min_after_dequeue=2 * batch_size)
    else:
        # `min_after_dequeue` only exists on `shuffle_batch`; passing it to
        # `tf.train.batch` raises a TypeError.
        data_f_batch, data_l_batch = tf.train.batch(
            [data_f_single, data_l_single],
            batch_size=batch_size,
            capacity=400)
    return data_f_batch, data_l_batch
def main():
    """Train a DNN classifier, print its accuracies, and classify two samples.

    Uses the module-level TRAINING_SET / VALIDATION_SET / TEST_SET arrays and
    `data_input_fn` to feed the estimator.
    """
    # Specify that all features have real-value data
    feature_columns = [tf.contrib.layers.real_valued_column("", dimension=3)]
    # Build 3 layer DNN with 10, 20, 10 units respectively.
    classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                                hidden_units=[10, 20, 10],
                                                n_classes=10,
                                                model_dir="/tmp/iris_model")
    # Fit model.
    classifier.fit(
        input_fn=lambda: data_input_fn(TRAINING_SET, batch_size, num_epochs, True),
        steps=4000)
    # Evaluate accuracy. Each score is computed on the dataset it is named
    # after, so the labels in the printout below are accurate.
    accuracy_validation_score = classifier.evaluate(
        input_fn=lambda: data_input_fn(VALIDATION_SET, batch_size, num_epochs, False),
        steps=1)["accuracy"]
    accuracy_test_score = classifier.evaluate(
        input_fn=lambda: data_input_fn(TEST_SET, batch_size, num_epochs, False),
        steps=1)["accuracy"]
    print ("\nValidation Accuracy: {0:0.2f}\nTest Accuracy: {1:0.2f}\n".format(accuracy_validation_score,accuracy_test_score))

    # Classify two new flower samples.
    def new_samples():
        return np.array(
            [[327, 8, 3],
             [47, 8, 0]], dtype=np.float32)

    predictions = list(classifier.predict_classes(input_fn=new_samples))
gives
TypeError: 'Tensor' object is not callable

You need to pass a function as input_fn, not just a tensor:
# Consecutive slices of `seq`: training rows, then validation rows, then test rows.
TRAINING_SET, VALIDATION_SET, TEST_SET = (
    np.asarray(part)
    for part in (
        seq[:llength],
        seq[llength:llength + tlength],
        seq[llength + tlength:],
    )
)
num_epochs = 100
batch_size = 32


# Batch-producing input function. The same pattern works for test and
# validation data: pass shuffle=False and use tf.train.batch in place of
# tf.train.shuffle_batch.
def data_input_fn(trainset, batch_size, num_epochs):
    """Return shuffled (features, labels) batches; columns 0-2 are features, column 3 the label."""
    features, labels = trainset[:, :3], trainset[:, 3]
    one_feature, one_label = tf.train.slice_input_producer(
        [features, labels],
        shuffle=True,
        num_epochs=num_epochs,
    )
    features_batch, labels_batch = tf.train.shuffle_batch(
        [one_feature, one_label],
        batch_size=batch_size,
        min_after_dequeue=2 * batch_size,
        capacity=400,
    )
    return features_batch, labels_batch


# Pass the function (wrapped in a lambda), not its result, as input_fn.
classifier.fit(
    input_fn=lambda: data_input_fn(TRAINING_SET, batch_size, num_epochs),
    steps=4000,
)

Related

sklearn classification_report ValueError: Found input variables with inconsistent numbers of samples: [18, 576]

I'm working on a CNN classification problem. I used Keras and a pre-trained model. Now I want to evaluate my model and need the precision, recall and F1-score. When I use sklearn.metrics classification_report I get the above error. I know where the numbers are coming from: the first is the length of my test dataset in batches and the second is the number of actual samples (predictions) in there. However, I don't know how to "convert" them.
See my code down below:
# Training subset (60 % of the images under the directory).
train_ds = tf.keras.utils.image_dataset_from_directory(
    directory='/gdrive/My Drive/Flies_dt/224x224',
    subset="training",
    validation_split=0.40,
    image_size=(224, 224),
    shuffle=True,
    seed=123,
)
# Held-out subset (the remaining 40 %), loaded with the same seed and split.
val_ds = tf.keras.utils.image_dataset_from_directory(
    directory='/gdrive/My Drive/Flies_dt/224x224',
    subset="validation",
    validation_split=0.40,
    image_size=(224, 224),
    shuffle=True,
    seed=123,
)
# Give the first half of the held-out batches to test_ds and keep the rest
# as val_ds.
test_ds = val_ds.take(len(val_ds) // 2)
print('test_ds =', len(test_ds))
val_ds = val_ds.skip(len(val_ds) // 2)
print('val_ds =', len(val_ds))  # test_ds = 18 val_ds = 18
# Pre-trained VGG19 convolutional base, without its classification head.
base_model = keras.applications.vgg19.VGG19(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3),
)
# Stage 1: keep the base frozen and train only the new head.
base_model.trainable = False

inputs = keras.Input(shape=(224, 224, 3))
x = data_augmentation(inputs)  # apply data augmentation
x = tf.keras.applications.vgg19.preprocess_input(x)  # VGG19 preprocessing
# `training=False` keeps the base model in inference mode, including later
# on when it is unfrozen for fine-tuning.
x = base_model(x, training=False)
x = keras.layers.GlobalAveragePooling2D()(x)
x = keras.layers.Dropout(0.2)(x)  # Regularize with dropout
outputs = keras.layers.Dense(5, activation="softmax")(x)
model = keras.Model(inputs, outputs)

model.compile(
    optimizer="Adam",
    loss="sparse_categorical_crossentropy",
    metrics=['acc'],
)
model.fit(
    train_ds,
    epochs=8,
    validation_data=val_ds,
    callbacks=[tensorboard_callback],
)

# Stage 2: unfreeze the base for fine-tuning. Because it was called with
# `training=False` above, it keeps running in inference mode.
base_model.trainable = True
model.summary()
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.000001),  # Low learning rate
    loss="sparse_categorical_crossentropy",
    metrics=['acc'],
)
model.fit(train_ds, epochs=5, validation_data=val_ds)
# Evaluate
from sklearn.metrics import classification_report

# classification_report needs one true label per prediction, not the batched
# dataset object. test_ds yields (images, labels) batches and is shuffled, so
# labels and predictions are gathered in the same pass to keep them aligned.
y_true_batches = []
y_pred_batches = []
for images, labels in test_ds:
    y_true_batches.append(labels.numpy())
    y_pred_batches.append(model.predict_on_batch(images))
y_true = np.concatenate(y_true_batches)
y_pred = np.concatenate(y_pred_batches)
# Class index with the highest softmax score for each sample.
y_pred_bool = np.argmax(y_pred, axis=1)
print(classification_report(y_true, y_pred_bool))
I also tried something like this, but I'm not sure if this gives me the correct values for multiclass classification.
from keras import backend as K
def recall_m(y_true, y_pred):
    """Recall over the batch: rounded true positives / rounded actual positives."""
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    # K.epsilon() keeps the division defined when there are no positives.
    return K.sum(hits) / (K.sum(actual) + K.epsilon())
def precision_m(y_true, y_pred):
    """Precision over the batch: rounded true positives / rounded predicted positives."""
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))
    # K.epsilon() keeps the division defined when nothing is predicted positive.
    return K.sum(hits) / (K.sum(predicted) + K.epsilon())
def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of `precision_m` and `recall_m` for the batch."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half
# Compile with the custom metrics alongside accuracy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc', f1_m, precision_m, recall_m],
)
# Train, holding out 30 % of the training data for validation.
history = model.fit(
    Xtrain,
    ytrain,
    validation_split=0.3,
    epochs=10,
    verbose=0,
)
# evaluate() returns the loss followed by the metrics in the order given above.
loss, accuracy, f1_score, precision, recall = model.evaluate(
    Xtest,
    ytest,
    verbose=0,
)
This is a lot, Sorry. Hope somebody can help.

Using Keras Tuner for time series split

Is it possible to use Keras Tuner for tuning a NN using a time series split, similar to sklearn.model_selection.TimeSeriesSplit in sklearn?
For example consider a sample tuner class from https://towardsdatascience.com/hyperparameter-tuning-with-keras-tuner-283474fbfbe
from kerastuner import HyperModel
class SampleModel(HyperModel):
    """Hypermodel for a one-hidden-layer regression network.

    The hidden layer's width and activation are the tunable hyperparameters.
    """

    def __init__(self, input_shape):
        super().__init__()
        self.input_shape = input_shape

    def build(self, hp):
        """Build and compile a model from the hyperparameters in `hp`.

        # Arguments
            hp: hyperparameter container providing `Int` and `Choice`.

        # Returns
            A compiled `Sequential` model with a single output unit.
        """
        model = Sequential()
        model.add(
            layers.Dense(
                units=hp.Int('units', 8, 64, 4, default=8),
                activation=hp.Choice(
                    'dense_activation',
                    values=['relu', 'tanh', 'sigmoid'],
                    default='relu'),
                # Use the shape stored on the instance; a bare `input_shape`
                # is not defined inside this method.
                input_shape=self.input_shape
            )
        )
        model.add(layers.Dense(1))
        model.compile(
            optimizer='rmsprop', loss='mse', metrics=['mse']
        )
        return model
tuner:
# Random search over the hypermodel: 10 trials, each executed twice.
tuner_rs = RandomSearch(
    hypermodel,
    objective='mse',
    max_trials=10,
    executions_per_trial=2,
    seed=42,
)
# Run the search, holding out 20 % of the training data for validation.
tuner_rs.search(
    x_train_scaled,
    y_train,
    epochs=10,
    validation_split=0.2,
    verbose=0,
)
So instead of validation_split = 0.2, in the above line is it possible to do the following
from sklearn.model_selection import TimeSeriesSplit
# A 5-fold time-series splitter ...
tscv = TimeSeriesSplit(n_splits=5)
# ... handed to Keras Tuner in place of the validation fraction.
tuner_rs.search(
    x_train,
    y_train,
    epochs=10,
    validation_split=tscv,
    verbose=0,
)
I solved it in this way:
First I instantiated a class that performs a Blocking Time Series Split. I found out that it might be better to use this time series split rather than sklearn's TimeSeriesSplit, because we won't make our model train on instances with already-seen data. As you can see from the picture, if the number of splits is 5, BTSS will divide your training data into 5 parts with only the validation data in common across the splits. (Since StackOverflow doesn't allow me to upload images, I'll post a reference link: https://hub.packtpub.com/cross-validation-strategies-for-time-series-forecasting-tutorial/)
class BlockingTimeSeriesSplit():
    """Cross-validation splitter that cuts the data into disjoint blocks.

    The samples are divided into `n_splits` consecutive blocks of
    `len(X) // n_splits` samples (leftover samples at the end are not used).
    Within each block the leading part is the training fold and the trailing
    part is the test fold.

    # Arguments
        n_splits: number of blocks / folds.
        margin: number of samples skipped between the end of the training
            fold and the start of the test fold (default 0).
        train_fraction: fraction of each block used for training
            (default 0.8).
    """

    def __init__(self, n_splits, margin=0, train_fraction=0.8):
        self.n_splits = n_splits
        self.margin = margin
        self.train_fraction = train_fraction

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; the arguments are accepted but ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield `(train_indices, test_indices)` index arrays, one pair per block.

        Only `len(X)` is used; `y` and `groups` are ignored.
        """
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)
        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            # First index of the block that no longer belongs to training.
            mid = int(self.train_fraction * (stop - start)) + start
            yield indices[start:mid], indices[mid + self.margin:stop]
Then you will proceed by creating your own model:
def build_model(hp):
    """Placeholder for your own model-building function; currently returns None."""
    return None
Finally you can create your CVtuner as a class which will call back BlockingTimeSeriesSplit.
class CVTuner(kt.engine.tuner.Tuner):
    """Tuner that scores each trial by blocked time-series cross-validation."""

    def run_trial(self, trial, x, y, *args, **kwargs):
        """Train and evaluate one model per fold and report the mean accuracy.

        # Arguments
            trial: the trial whose hyperparameters are used.
            x, y: full feature and target arrays; they are indexed with the
                fold indices produced by `BlockingTimeSeriesSplit`.
            *args, **kwargs: accepted for interface compatibility; not used.
        """
        cv = BlockingTimeSeriesSplit(n_splits=5)
        val_accuracy_list = []
        # Start at 8, not 0: a batch size of 0 is not usable for training.
        batch_size = trial.hyperparameters.Int('batch_size', 8, 64, step=8)
        epochs = trial.hyperparameters.Int('epochs', 10, 100, step=10)
        for train_indices, test_indices in cv.split(x):
            x_train, x_test = x[train_indices], x[test_indices]
            y_train, y_test = y[train_indices], y[test_indices]
            model = self.hypermodel.build(trial.hyperparameters)
            model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs)
            # NOTE(review): this unpacking assumes the model is compiled with
            # exactly two metrics (accuracy, then AUC) -- confirm.
            val_loss, val_accuracy, val_auc = model.evaluate(x_test, y_test)
            val_accuracy_list.append(val_accuracy)
        self.oracle.update_trial(trial.trial_id, {'val_accuracy': np.mean(val_accuracy_list)})
        # The model saved for the trial is the one trained on the last fold.
        self.save_model(trial.trial_id, model)
# Use the model-building function defined above (`build_model`) as the
# hypermodel; Bayesian optimisation picks the hyperparameters for each trial.
tuner = CVTuner(
    oracle=kt.oracles.BayesianOptimization(objective='val_accuracy', max_trials=1),
    hypermodel=build_model)
stop_early = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=10)
tuner.search(X, Y, callbacks=[stop_early])
# Inspect and evaluate the best model found on out-of-sample data.
best_model = tuner.get_best_models()[0]
best_model.summary()
best_model.evaluate(x_out_of_sample, y_out_of_sample)

how to save tensorflow model with tf.estimator

I have the following example code to train and evaluate a cnn mnist model using tensorflow's estimator api:
def model_fn(features, labels, mode):
    """Estimator model function; only the PREDICT branch is shown in full."""
    images = tf.reshape(features, [-1, 28, 28, 1])
    logits = Model()(images)
    predicted_logit = tf.argmax(input=logits, axis=1, output_type=tf.int32)
    if mode != tf.estimator.ModeKeys.PREDICT:
        # Handling of the other modes is elided in the original snippet.
        ...
        return None
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions={
            'predicted_logit': predicted_logit,
            'probabilities': tf.nn.softmax(logits),
        },
    )
def mnist_train_and_eval(_):
    """Train the MNIST estimator, evaluating it after every epoch."""
    (train_data, train_labels,
     eval_data, eval_labels,
     val_data, val_labels) = get_mnist_data()

    def make_input_fn(data, targets, shuffle):
        # One pass over the data per call, in batches of _BATCH_SIZE.
        return tf.estimator.inputs.numpy_input_fn(
            x=data,
            y=targets,
            batch_size=_BATCH_SIZE,
            num_epochs=1,
            shuffle=shuffle)

    # Training batches are shuffled; evaluation batches keep their order.
    train_input_fn = make_input_fn(train_data, train_labels, True)
    eval_input_fn = make_input_fn(eval_data, eval_labels, False)
    # Estimator built around model_fn, checkpointing into _MODEL_DIR.
    image_classifier = tf.estimator.Estimator(model_fn=model_fn, model_dir=_MODEL_DIR)
    # One train() call is one epoch, followed by an evaluation.
    for _epoch in range(_NUM_EPOCHS):
        image_classifier.train(input_fn=train_input_fn)
        metrics = image_classifier.evaluate(input_fn=eval_input_fn)
How can I use the estimator.export_savedmodel to save the trained model for later inference? How should I write the serving_input_receiver_fn?
Thank you very much for your help!
You create a function with a dictionary of input features. Placeholder should match the shape of your image, with first dimension for batch_size.
def serving_input_receiver_fn():
    """Return a ServingInputReceiver fed by a single float placeholder keyed 'x'."""
    # The first dimension is left open (None); `Shape` is to be replaced by
    # the flattened input size.
    placeholder = tf.placeholder(tf.float32, [None, Shape])
    feature_map = {'x': placeholder}
    return tf.estimator.export.ServingInputReceiver(
        features=feature_map,
        receiver_tensors=feature_map,
    )
Or you can use TensorServingInputReceiver, which doesn't require a dict mapping:
# A single placeholder serves as both the feature and the receiver tensor.
inputs = tf.placeholder(tf.float32, shape=[None, 32 * 32 * 3])
tf.estimator.export.TensorServingInputReceiver(
    features=inputs,
    receiver_tensors=inputs,
)
This function returns new instance of ServingInputReceiver, which is passed to export_savedmodel or tf.estimator.FinalExporter
...
# Export the trained estimator under `saved_dir`, using
# `serving_input_receiver_fn` to describe the serving-time inputs.
image_classifier.export_savedmodel(saved_dir, serving_input_receiver_fn)

Why my GraphDef implementation exceeds the 2GB limit in TensorFlow?

I am using the tf.estimator API and I have the following model_fn function:
def model_fn(features, labels, mode, params):
    """Estimator model function for the word2vec network.

    Returns an EstimatorSpec for PREDICT, EVAL, or TRAIN and raises
    RuntimeError for any other mode.
    """
    mode_keys = tf.estimator.ModeKeys
    labels = tf.reshape(labels, (-1, 1))
    model = word2vec.create_model(features, params)
    if mode == mode_keys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=model)

    # Sampled-softmax loss, averaged into a scalar cost.
    loss = sampled_softmax_loss.create_loss(
        model['softmax_w'],
        model['softmax_b'],
        model['relu_layer_1'],
        labels,
        params['softmax_sample'],
        params['vocabulary_size'],
    )
    cost = tf.reduce_mean(loss)
    if mode == mode_keys.EVAL:
        return tf.estimator.EstimatorSpec(mode=mode, loss=cost)

    optimizer = adam_optimizer.create_optimizer(params['learning_rate'])
    train_operation = optimizer.minimize(cost)
    if mode != mode_keys.TRAIN:
        raise RuntimeError('Not a valid Mode value')
    return tf.estimator.EstimatorSpec(mode=mode, loss=cost, train_op=train_operation)
The word2vec.create_model function is given below. The function returns a python dictionary with the interesting nodes of the network (e.g. the embeddings matrix, the softmax weight and bias for training etc.).
def create_model(features, hyper_params):
    """Build the word2vec graph and return a dict of its nodes.

    The returned dict holds the softmax weight and bias variables; it is also
    passed to `dense_layer.create_layer` when the hidden layer is created.
    """
    vocabulary_size = hyper_params['vocabulary_size']
    hidden_size = hyper_params['hidden_size']
    feature_columns = hyper_params['feature_columns']
    model = {}
    with tf.variable_scope('word2vec'):
        # Embedding layer
        embedded = tf.feature_column.input_layer(features, feature_columns)
        # Hidden layer
        net = dense_layer.create_layer(model, embedded, hidden_size)
        # Softmax weight and bias variables for the sampled loss function
        initial_weights = tf.truncated_normal((vocabulary_size, hidden_size), stddev=0.1)
        model['softmax_w'] = tf.Variable(initial_weights, name='softmax_weights')
        model['softmax_b'] = tf.Variable(tf.zeros(vocabulary_size), name='softmax_bias')
    return model
Last but not least, my main function, which in turn I use the tf.app.run(main) command to run:
def main(argv=None):
    """Load the CSV data, then train and evaluate the word2vec estimator.

    # Arguments
        argv: command-line arguments. `tf.app.run(main)` calls `main` with
            this one positional argument, so the parameter must exist even
            though it is not used.
    """
    path = os.path.join('data', 'data.csv')
    (train_x, train_y), (test_x, test_y) = prepare_data.load_data(path, path, columns, columns[-1])
    vocabulary_size = len(train_x[columns[0]].unique())
    # One identity-categorical column per input key, each wrapped in a
    # 512-dimensional embedding column.
    feature_columns = []
    for key in train_x.keys():
        item_id = tf.feature_column.categorical_column_with_identity(key=key, num_buckets=vocabulary_size)
        feature_columns.append(tf.feature_column.embedding_column(item_id, 512))
    classifier = tf.estimator.Estimator(
        model_fn=model_fn,
        params={
            'feature_columns': feature_columns,
            'vocabulary_size': vocabulary_size,
            'hidden_size': 256,
            'learning_rate': 0.001,
            'softmax_sample': 100,
        })
    print('Training the classifier...')
    classifier.train(input_fn=lambda: prepare_data.train_input_fn(train_x, train_y, 128), steps=2)
    print('Evaluating on test dataset...')
    eval_result = classifier.evaluate(input_fn=lambda: prepare_data.eval_input_fn(test_x, test_y, 128))
    print('Printing results...')
    print(eval_result)
When I run this, I get a ValueError: GraphDef cannot be larger than 2GB. error. Why is that? What am I doing wrong?
Below is my train_input_fn:
def train_input_fn(features, labels, batch_size):
    """Return `({"Input": features_batch}, labels_batch)` from a generator-backed dataset."""
    def gen():
        # Emit one (feature, label) pair at a time.
        for pair in zip(features, labels):
            yield pair

    output_types = (tf.int64, tf.int64)
    output_shapes = (tf.TensorShape([None]), tf.TensorShape([None]))
    dataset = tf.data.Dataset.from_generator(gen, output_types, output_shapes)
    # Repeat indefinitely, then group into batches.
    dataset = dataset.repeat().batch(batch_size)
    feature, label = dataset.make_one_shot_iterator().get_next()
    return {"Input": feature}, label
The dataset is a simple csv like below:
Input Label
0 12600 838
1 12600 4558
2 12600 838
3 12600 4558
4 838 12600
Dataset.from_tensor_slices adds the whole dataset to the computational graph (see details), so it is better to use Dataset.from_generator. I have shown an example of how to do it using MNIST: How to load MNIST via TensorFlow (including download)?

Keras split train test set when using ImageDataGenerator

I have a single directory which contains sub-folders (according to labels) of images. I want to split this data into train and test set while using ImageDataGenerator in Keras. Although model.fit() in keras has argument validation_split for specifying the split, I could not find the same for model.fit_generator(). How to do it ?
# Augmenting generator: rescales pixel values and applies random shear, zoom
# and horizontal flips.
train_datagen = ImageDataGenerator(rescale=1./255,
shear_range=0.2,
zoom_range=0.2,
horizontal_flip=True)
# Reads every image under train_data_dir; with no validation split configured
# there is nothing to hold out.
train_generator = train_datagen.flow_from_directory(
train_data_dir,
target_size=(img_width, img_height),
batch_size=32,
class_mode='binary')
# `validation_data=??` is the open question of this post: where should the
# validation generator come from when there is only one directory?
model.fit_generator(
train_generator,
samples_per_epoch=nb_train_samples,
nb_epoch=nb_epoch,
validation_data=??,
nb_val_samples=nb_validation_samples)
I don't have separate directory for validation data, need to split it from the training data
Keras has now added Train / validation split from a single directory using ImageDataGenerator:
# validation_split reserves 20 % of the images for the 'validation' subset.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    validation_split=0.2,
)

# Both generators read the same directory; `subset` selects the part.
train_generator = train_datagen.flow_from_directory(
    train_data_dir,
    subset='training',  # training part of the split
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='binary',
)
validation_generator = train_datagen.flow_from_directory(
    train_data_dir,  # same directory as training data
    subset='validation',  # validation part of the split
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='binary',
)

# Step counts are the number of whole batches in each subset.
model.fit_generator(
    train_generator,
    steps_per_epoch=train_generator.samples // batch_size,
    validation_data=validation_generator,
    validation_steps=validation_generator.samples // batch_size,
    epochs=nb_epochs,
)
https://keras.io/preprocessing/image/
For example, you have folder like this
full_dataset
|--horse (40 images)
|--donkey (30 images)
|--cow (50 images)
|--zebra (70 images)
FIRST WAY
# One generator with a 20 % validation split; each flow picks its subset.
image_generator = ImageDataGenerator(rescale=1/255, validation_split=0.2)
train_dataset = image_generator.flow_from_directory(
    directory='full_dataset',
    subset="training",
    target_size=(280, 280),
    class_mode='categorical',
    batch_size=32,
    shuffle=True,
)
validation_dataset = image_generator.flow_from_directory(
    directory='full_dataset',
    subset="validation",
    target_size=(280, 280),
    class_mode='categorical',
    batch_size=32,
    shuffle=True,
)
SECOND WAY
import glob

# File paths for each class folder.
horse = glob.glob('full_dataset/horse/*.*')
donkey = glob.glob('full_dataset/donkey/*.*')
cow = glob.glob('full_dataset/cow/*.*')
zebra = glob.glob('full_dataset/zebra/*.*')

data = []
labels = []
# The position in this tuple is the integer label: horse=0, donkey=1, cow=2,
# zebra=3.
for label, files in enumerate((horse, donkey, cow, zebra)):
    for i in files:
        # color_mode must be lowercase; 'RGB' is not an accepted value.
        image = tf.keras.preprocessing.image.load_img(i, color_mode='rgb',
                                                      target_size=(280, 280))
        image = np.array(image)
        data.append(image)
        labels.append(label)

data = np.array(data)
labels = np.array(labels)

from sklearn.model_selection import train_test_split
# Hold out 20 % of the images for testing; fixed seed for a repeatable split.
X_train, X_test, ytrain, ytest = train_test_split(data, labels, test_size=0.2,
                                                  random_state=42)
The main drawback of the first way is that you can't use it to display a picture: it raises an error if you write validation_dataset[1]. With the second way it works: X_test[1].
With reference to this question https://github.com/keras-team/keras/issues/597 , you can use the following code to split the whole set into train and val:
# Training generator: augmentation plus a 20 % validation split.
train_datagen = ImageDataGenerator(rescale=1./255,
                                   rotation_range=20,
                                   width_shift_range=0.2,
                                   height_shift_range=0.2,
                                   horizontal_flip=True,
                                   validation_split=0.2)  # val 20%
# Validation generator: same rescaling and split, but no augmentation.
val_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)
train_data = train_datagen.flow_from_directory(train_path,
                                               target_size=(224, 224),
                                               color_mode='rgb',
                                               batch_size=BS,
                                               class_mode='categorical',
                                               shuffle=True,
                                               subset='training')
val_data = val_datagen.flow_from_directory(train_path,
                                           target_size=(224, 224),
                                           color_mode='rgb',
                                           batch_size=BS,
                                           class_mode='categorical',
                                           shuffle=False,
                                           subset='validation')
If we use subset in ImageDataGenerator then same augmentation will be applied to both training and validation. If you want to apply augmentation only on training set, you can split the folders using split-folders package which can be installed directly using pip.
https://pypi.org/project/split-folders/
This will separate the dataset into train, val and test directory and then you can create separate generator for each of them.
I have a PR for it. One way is to hash the filenames and do a variant assignment.
Example:
# -*- coding: utf-8 -*-
"""Train model using transfer learning."""
import os
import re
import glob
import hashlib
import argparse
import warnings
import six
import numpy as np
import tensorflow as tf
from tensorflow.python.platform import gfile
from keras.models import Model
from keras import backend as K
from keras.optimizers import SGD
from keras.layers import Dense, GlobalAveragePooling2D, Input
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing.image import (ImageDataGenerator, Iterator,
array_to_img, img_to_array, load_img)
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
# Seed passed to both generators in get_generators.
RANDOM_SEED = 0
# Modulus used by create_image_lists when hashing file names into the
# training / validation split.
MAX_NUM_IMAGES_PER_CLASS = 2 ** 27 - 1 # ~134M
# File extensions globbed for in each class folder.
VALID_IMAGE_FORMATS = frozenset(['jpg', 'jpeg', 'JPG', 'JPEG'])
# we chose to train the top 2 inception blocks
BATCH_SIZE = 100
TRAINABLE_LAYERS = 172
# NOTE: this instantiates an InceptionV3 model (without weights) at import
# time just to count its layers.
INCEPTIONV3_BASE_LAYERS = len(InceptionV3(weights=None, include_top=False).layers)
STEPS_PER_EPOCH = 625
VALIDATION_STEPS = 100
# Input image size; get_generators uses height and width as target_size.
MODEL_INPUT_WIDTH = 299
MODEL_INPUT_HEIGHT = 299
MODEL_INPUT_DEPTH = 3
FC_LAYER_SIZE = 1024
# Helper: Save the model.
# Checkpoint callback: only the best model so far is written; the file name
# carries the epoch number and validation loss.
checkpointer = ModelCheckpoint(
    filepath='./output/checkpoints/inception.{epoch:03d}-{val_loss:.2f}.hdf5',
    save_best_only=True,
    verbose=1,
)

# Early-stopping callback with a patience of 10 epochs.
early_stopper = EarlyStopping(patience=10)

# TensorBoard callback logging to ./output/.
tensorboard = TensorBoard(log_dir='./output/')
def as_bytes(bytes_or_text, encoding='utf-8'):
    """Return `bytes_or_text` as `bytes`, encoding text with `encoding`.

    # Arguments
        bytes_or_text: A `bytes`, `str`, or `unicode` object.
        encoding: Charset used when the input is text (utf-8 by default).

    # Returns
        A `bytes` object; input that is already `bytes` is returned as is.

    # Raises
        TypeError: If `bytes_or_text` is not a binary or unicode string.
    """
    if isinstance(bytes_or_text, bytes):
        return bytes_or_text
    if isinstance(bytes_or_text, six.text_type):
        return bytes_or_text.encode(encoding)
    raise TypeError('Expected binary or unicode string, got %r' %
                    (bytes_or_text,))
class CustomImageDataGenerator(ImageDataGenerator):
    """`ImageDataGenerator` that can also iterate over pre-built image lists."""

    def flow_from_image_lists(self, image_lists,
                              category, image_dir,
                              target_size=(256, 256), color_mode='rgb',
                              class_mode='categorical',
                              batch_size=32, shuffle=True, seed=None,
                              save_to_dir=None,
                              save_prefix='',
                              save_format='jpeg'):
        """Return an `ImageListIterator` over `image_lists[...][category]`.

        All arguments are forwarded to `ImageListIterator`, together with
        this generator and its `data_format`.
        """
        iterator_options = dict(
            target_size=target_size,
            color_mode=color_mode,
            class_mode=class_mode,
            data_format=self.data_format,
            batch_size=batch_size,
            shuffle=shuffle,
            seed=seed,
            save_to_dir=save_to_dir,
            save_prefix=save_prefix,
            save_format=save_format,
        )
        return ImageListIterator(image_lists, self, category, image_dir,
                                 **iterator_options)
class ImageListIterator(Iterator):
    """Iterator that reads the images named in `image_lists` from disk.

    # Arguments
        image_lists: Dictionary of training images for each label.
        image_data_generator: Instance of `ImageDataGenerator`
            to use for random transformations and normalization.
        category: Key of the per-label file list to iterate over
            (`get_generators` passes `'training'` or `'validation'`).
        image_dir: Root folder containing the label sub-folders; used to
            build each image's full path.
        target_size: tuple of integers, dimensions to resize input images to.
        color_mode: One of `"rgb"`, `"grayscale"`. Color mode to read images.
        class_mode: Mode for yielding the targets:
            `"binary"`: binary targets (if there are only two classes),
            `"categorical"`: categorical targets,
            `"sparse"`: integer targets,
            `None`: no targets get yielded (only input images are yielded).
        batch_size: Integer, size of a batch.
        shuffle: Boolean, whether to shuffle the data between epochs.
        seed: Random seed for data shuffling.
        data_format: String, one of `channels_first`, `channels_last`.
        save_to_dir: Optional directory where to save the pictures
            being yielded, in a viewable format. This is useful
            for visualizing the random transformations being
            applied, for debugging purposes.
        save_prefix: String prefix to use for saving sample
            images (if `save_to_dir` is set).
        save_format: Format to use for saving sample images
            (if `save_to_dir` is set).

    The class names are the keys of `image_lists`; each one is mapped to an
    integer id in key order.
    """

    def __init__(self, image_lists, image_data_generator,
                 category, image_dir,
                 target_size=(256, 256), color_mode='rgb',
                 class_mode='categorical',
                 batch_size=32, shuffle=True, seed=None,
                 data_format=None,
                 save_to_dir=None, save_prefix='', save_format='jpeg'):
        if data_format is None:
            data_format = K.image_data_format()
        classes = list(image_lists.keys())
        self.category = category
        self.num_class = len(classes)
        self.image_lists = image_lists
        self.image_dir = image_dir
        # Total number of files across all labels for this category.
        self.samples = sum(len(self.image_lists[label_name][category])
                           for label_name in classes)
        self.class2id = dict(zip(classes, range(len(classes))))
        self.id2class = dict((v, k) for k, v in self.class2id.items())
        self.classes = np.zeros((self.samples,), dtype='int32')
        self.image_data_generator = image_data_generator
        self.target_size = tuple(target_size)
        if color_mode not in {'rgb', 'grayscale'}:
            raise ValueError('Invalid color mode:', color_mode,
                             '; expected "rgb" or "grayscale".')
        self.color_mode = color_mode
        self.data_format = data_format
        # 3 channels for rgb, 1 for grayscale; the channel axis is last or
        # first depending on data_format.
        if self.color_mode == 'rgb':
            if self.data_format == 'channels_last':
                self.image_shape = self.target_size + (3,)
            else:
                self.image_shape = (3,) + self.target_size
        else:
            if self.data_format == 'channels_last':
                self.image_shape = self.target_size + (1,)
            else:
                self.image_shape = (1,) + self.target_size
        if class_mode not in {'categorical', 'binary', 'sparse', None}:
            raise ValueError('Invalid class_mode:', class_mode,
                             '; expected one of "categorical", '
                             '"binary", "sparse", or None.')
        self.class_mode = class_mode
        self.save_to_dir = save_to_dir
        self.save_prefix = save_prefix
        self.save_format = save_format
        # Record the class id and full path of every file, label by label.
        i = 0
        self.filenames = []
        for label_name in classes:
            for j, _ in enumerate(self.image_lists[label_name][category]):
                self.classes[i] = self.class2id[label_name]
                img_path = get_image_path(self.image_lists,
                                          label_name,
                                          j,
                                          self.image_dir,
                                          self.category)
                self.filenames.append(img_path)
                i += 1
        print("Found {} {} files".format(len(self.filenames), category))
        super(ImageListIterator, self).__init__(self.samples, batch_size, shuffle,
                                                seed)

    def next(self):
        """For python 2.x.

        # Returns
            The next batch: `(batch_x, batch_y)`, or only `batch_x` when
            `class_mode` is `None`.
        """
        with self.lock:
            index_array, current_index, current_batch_size = next(
                self.index_generator)
        # The transformation of images is not under thread lock
        # so it can be done in parallel
        batch_x = np.zeros((current_batch_size,) + self.image_shape,
                           dtype=K.floatx())
        grayscale = self.color_mode == 'grayscale'
        # build batch of image data
        for i, j in enumerate(index_array):
            img = load_img(self.filenames[j],
                           grayscale=grayscale,
                           target_size=self.target_size)
            x = img_to_array(img, data_format=self.data_format)
            x = self.image_data_generator.random_transform(x)
            x = self.image_data_generator.standardize(x)
            batch_x[i] = x
        # optionally save augmented images to disk for debugging purposes
        if self.save_to_dir:
            for i in range(current_batch_size):
                img = array_to_img(batch_x[i], self.data_format, scale=True)
                fname = '{prefix}_{index}_{hash}.{format}'.format(
                    prefix=self.save_prefix,
                    index=current_index + i,
                    hash=np.random.randint(10000),
                    format=self.save_format)
                img.save(os.path.join(self.save_to_dir, fname))
        # build batch of labels
        if self.class_mode == 'sparse':
            batch_y = self.classes[index_array]
        elif self.class_mode == 'binary':
            batch_y = self.classes[index_array].astype(K.floatx())
        elif self.class_mode == 'categorical':
            # One-hot encode the class ids.
            batch_y = np.zeros((len(batch_x), self.num_class),
                               dtype=K.floatx())
            for i, label in enumerate(self.classes[index_array]):
                batch_y[i, label] = 1.
        else:
            return batch_x
        return batch_x, batch_y
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/image_retraining/retrain.py
def create_image_lists(image_dir, validation_pct=10):
    """Builds a list of training images from the file system.

    Analyzes the sub folders in the image directory, splits them into stable
    training and validation sets, and returns a data structure describing the
    lists of images for each label and their paths. The split is decided by a
    hash of each file name, so a given file always lands in the same set.

    # Arguments
        image_dir: string path to a folder containing subfolders of images.
        validation_pct: integer percentage of images reserved for validation.

    # Returns
        dictionary keyed by label name; each value is a dict with the
        sub-folder name (`'dir'`) and the `'training'` and `'validation'`
        lists of file base names.

    # Raises
        ValueError: if `image_dir` is not an existing directory.
    """
    if not os.path.isdir(image_dir):
        raise ValueError("Image directory {} not found.".format(image_dir))
    image_lists = {}
    sub_dirs = [x[0] for x in os.walk(image_dir)]
    sub_dirs_without_root = sub_dirs[1:]  # first element is root directory
    for sub_dir in sub_dirs_without_root:
        dir_name = os.path.basename(sub_dir)
        if dir_name == image_dir:
            continue
        print("Looking for images in '{}'".format(dir_name))
        # Collect matches in a set so a file matched by two spellings of the
        # same extension (case-insensitive globbing) is listed once, then
        # sort so the resulting lists do not depend on set or glob order.
        matches = set()
        for extension in VALID_IMAGE_FORMATS:
            file_glob = os.path.join(image_dir, dir_name, '*.' + extension)
            matches.update(glob.glob(file_glob))
        file_list = sorted(matches)
        if not file_list:
            warnings.warn('No files found')
            continue
        if len(file_list) < 20:
            warnings.warn('Folder has less than 20 images, which may cause '
                          'issues.')
        elif len(file_list) > MAX_NUM_IMAGES_PER_CLASS:
            warnings.warn('WARNING: Folder {} has more than {} images. Some '
                          'images will never be selected.'
                          .format(dir_name, MAX_NUM_IMAGES_PER_CLASS))
        label_name = re.sub(r'[^a-z0-9]+', ' ', dir_name.lower())
        training_images = []
        validation_images = []
        for file_name in file_list:
            base_name = os.path.basename(file_name)
            # Get the hash of the file name and perform variant assignment.
            hash_name = hashlib.sha1(as_bytes(base_name)).hexdigest()
            hash_pct = ((int(hash_name, 16) % (MAX_NUM_IMAGES_PER_CLASS + 1)) *
                        (100.0 / MAX_NUM_IMAGES_PER_CLASS))
            if hash_pct < validation_pct:
                validation_images.append(base_name)
            else:
                training_images.append(base_name)
        image_lists[label_name] = {
            'dir': dir_name,
            'training': training_images,
            'validation': validation_images,
        }
    return image_lists
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/image_retraining/retrain.py
def get_image_path(image_lists, label_name, index, image_dir, category):
    """Returns a path to an image for a label at the given index.

    # Arguments
        image_lists: Dictionary mapping each label to a dictionary that holds
            the label's sub-folder under the key 'dir' and one list of image
            file names per category.
        label_name: Label string we want to get an image for.
        index: Int offset of the image we want. This will be moduloed by the
            available number of images for the label, so it can be
            arbitrarily large.
        image_dir: Root folder string of the subfolders containing the
            training images.
        category: Name string of set to pull images from - training, testing,
            or validation.

    # Returns
        File system path string to an image that meets the requested
        parameters.

    # Raises
        ValueError: if the label is unknown, the category is unknown for that
            label, or the category holds no images.
    """
    if label_name not in image_lists:
        raise ValueError('Label does not exist: {}'.format(label_name))
    label_lists = image_lists[label_name]
    if category not in label_lists:
        raise ValueError('Category does not exist: {}'.format(category))
    category_list = label_lists[category]
    if not category_list:
        # Interpolate the message here: exceptions do not apply %-formatting
        # to their arguments the way logging calls do.
        raise ValueError('Label {} has no images in the category {}.'
                         .format(label_name, category))
    # Wrap the index so that any integer selects a valid entry.
    base_name = category_list[index % len(category_list)]
    return os.path.join(image_dir, label_lists['dir'], base_name)
def get_generators(image_lists, image_dir):
    """Create the training and validation batch generators.

    Both generators rescale pixel values by 1/255; only the training one also
    enables horizontal flipping.

    # Arguments
        image_lists: Per-label image lists passed to flow_from_image_lists.
        image_dir: Root folder string passed to flow_from_image_lists.

    # Returns
        Tuple (train_generator, validation_generator).
    """
    augmenting_datagen = CustomImageDataGenerator(rescale=1. / 255,
                                                  horizontal_flip=True)
    plain_datagen = CustomImageDataGenerator(rescale=1. / 255)

    # Everything except the category is identical for the two flows.
    shared_flow_args = dict(
        image_lists=image_lists,
        image_dir=image_dir,
        target_size=(MODEL_INPUT_HEIGHT, MODEL_INPUT_WIDTH),
        batch_size=BATCH_SIZE,
        class_mode='categorical',
        seed=RANDOM_SEED,
    )
    training_flow = augmenting_datagen.flow_from_image_lists(
        category='training', **shared_flow_args)
    validation_flow = plain_datagen.flow_from_image_lists(
        category='validation', **shared_flow_args)
    return training_flow, validation_flow
def get_model(num_classes, weights='imagenet'):
    """Build an InceptionV3-based classifier with a new classification head.

    # Arguments
        num_classes: Number of units in the final softmax layer.
        weights: Passed through to InceptionV3 as its `weights` argument.

    # Returns
        A Model mapping the InceptionV3 input to the softmax predictions.
    """
    # Base network without its own top (classification) layers.
    backbone = InceptionV3(weights=weights, include_top=False)
    # New head: global average pooling -> fully-connected ReLU -> softmax.
    pooled = GlobalAveragePooling2D()(backbone.output)
    hidden = Dense(FC_LAYER_SIZE, activation='relu')(pooled)
    class_probabilities = Dense(num_classes, activation='softmax')(hidden)
    return Model(inputs=[backbone.input], outputs=[class_probabilities])
def get_top_layer_model(model):
    """Prepare the model for training only the layers on top of the base.

    Layers before index INCEPTIONV3_BASE_LAYERS are frozen, the remaining
    layers are made trainable, and the model is compiled with the 'rmsprop'
    optimizer and categorical cross-entropy loss.

    # Returns
        The same model object, compiled.
    """
    boundary = INCEPTIONV3_BASE_LAYERS
    base_layers = model.layers[:boundary]
    for frozen_layer in base_layers:
        frozen_layer.trainable = False
    head_layers = model.layers[boundary:]
    for open_layer in head_layers:
        open_layer.trainable = True
    # Compile only after the trainable flags are set.
    model.compile(
        optimizer='rmsprop',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model
def get_mid_layer_model(model):
    """Prepare the model for fine-tuning deeper layers.

    Layers before index TRAINABLE_LAYERS are frozen and the remaining layers
    are made trainable. The model is then recompiled with SGD (learning rate
    0.0001, momentum 0.9) and categorical cross-entropy loss.

    # Returns
        The same model object, recompiled.
    """
    boundary = TRAINABLE_LAYERS
    kept_frozen = model.layers[:boundary]
    for frozen_layer in kept_frozen:
        frozen_layer.trainable = False
    fine_tuned = model.layers[boundary:]
    for open_layer in fine_tuned:
        open_layer.trainable = True
    # Recompile after changing the trainable flags; a low learning rate is
    # used for this fine-tuning stage.
    low_rate_sgd = SGD(lr=0.0001, momentum=0.9)
    model.compile(
        optimizer=low_rate_sgd,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model
def train_model(model, epochs, generators, callbacks=None):
    """Fit the model from a (training, validation) generator pair.

    # Arguments
        model: Compiled model exposing fit_generator.
        epochs: Number of epochs to train for.
        generators: Two-element sequence (train_generator,
            validation_generator).
        callbacks: Optional callbacks forwarded to fit_generator.

    # Returns
        The model that was passed in, after fitting.
    """
    training_batches, validation_batches = generators
    fit_options = {
        'steps_per_epoch': STEPS_PER_EPOCH,
        'validation_data': validation_batches,
        'validation_steps': VALIDATION_STEPS,
        'epochs': epochs,
        'callbacks': callbacks,
    }
    model.fit_generator(training_batches, **fit_options)
    return model
def main(image_dir, validation_pct):
    """Train the classifier in two stages and save it.

    Stage one trains the top layers for 10 epochs; stage two fine-tunes the
    deeper layers for up to 100 epochs with the checkpoint, early-stopping
    and TensorBoard callbacks. The result is written to
    './output/model.hdf5'.

    # Arguments
        image_dir: Folder walked to count classes and to build image lists.
        validation_pct: Value forwarded to create_image_lists.
    """
    # Every directory reported by the walk except one is counted as a class
    # (presumably the excluded one is the root folder itself).
    directory_count = sum(1 for _ in gfile.Walk(image_dir))
    num_classes = directory_count - 1
    print("Number of classes found: {}".format(num_classes))
    model = get_model(num_classes)

    print("Using validation percent of %{}".format(validation_pct))
    generators = get_generators(
        create_image_lists(image_dir, validation_pct), image_dir)

    # Stage 1: train the top layers only.
    model = train_model(get_top_layer_model(model), epochs=10,
                        generators=generators)

    # Stage 2: unfreeze deeper layers and fine-tune.
    model = get_mid_layer_model(model)
    train_model(model, epochs=100, generators=generators,
                callbacks=[checkpointer, early_stopper, tensorboard])

    # Persist the final model.
    model.save('./output/model.hdf5', overwrite=True)
if __name__ == '__main__':
    # Command-line entry point: parse the options, make sure the checkpoint
    # folder exists, then run the training pipeline.
    parser = argparse.ArgumentParser()
    parser.add_argument('--image-dir', required=True, help='data directory')
    # type=int is required: without it a value given on the command line is
    # delivered as a string, while the default (10) is an int, and the
    # percentage is later compared against a number.
    parser.add_argument('--validation-pct', type=int, default=10,
                        help='validation percentage')
    args = parser.parse_args()
    os.makedirs('./output/checkpoints/', exist_ok=True)
    # argparse turns '--image-dir' / '--validation-pct' into the keyword
    # names image_dir / validation_pct expected by main().
    main(**vars(args))
If you simply want to divide the dataset into training and validation sets (without any augmentation, etc.):
from tensorflow.keras.applications.xception import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# A single generator object serves both subsets: it applies the Xception
# preprocess_input function and is configured with validation_split=0.2.
ds_gen = ImageDataGenerator(preprocessing_function=preprocess_input,
                            validation_split=0.2)

# Settings shared by the two subsets; adjust target_size and batch_size to
# your needs.
flow_settings = dict(
    seed=1,
    target_size=(150, 150),
    batch_size=32,
    class_mode='categorical',
)

# Same directory for both calls; `subset` selects which part is yielded.
train_ds = ds_gen.flow_from_directory("/path/to/dataset",
                                      subset='training',
                                      **flow_settings)
val_ds = ds_gen.flow_from_directory("/path/to/dataset",
                                    subset='validation',
                                    **flow_settings)
Here is the answer:
# Root folder of the dataset.
data_path = 'path/to/dir'

# One generator for both subsets: pixel values are rescaled by 1/255 and the
# generator is configured with validation_split=0.3.
data_gen = ImageDataGenerator(rescale=1./255, validation_split=0.3)

# Options common to both flows; only `subset` differs between them.
common_flow_options = dict(
    directory=data_path,
    target_size=img_size,
    batch_size=batch_size,
    seed=42,
    class_mode='binary',
)
train_data = data_gen.flow_from_directory(subset='training',
                                          **common_flow_options)
test_data = data_gen.flow_from_directory(subset='validation',
                                         **common_flow_options)
This is a simple TensorFlow 2.0 example:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
def image_data_generator(data_dir,
                         data_augment=False,
                         batch_size=BATCH_SIZE,
                         target_size=(100, 100),
                         color_mode='rgb',
                         class_mode='binary',
                         shuffle=True,
                         validation_split=0.2,
                         subset=None):
    """Create a directory iterator, optionally with augmentation and a split.

    Args:
        data_dir: Directory passed to flow_from_directory.
        data_augment: If True, apply rotation, shift, shear, zoom and
            horizontal-flip augmentation in addition to 1/255 rescaling.
        batch_size: Batch size of the returned iterator.
        target_size: (height, width) the images are resized to.
        color_mode: Colour mode passed to flow_from_directory.
        class_mode: Class mode passed to flow_from_directory.
        shuffle: Whether the iterator shuffles the data.
        validation_split: Fraction of the images reserved for validation.
            It only has an effect when `subset` is given.
        subset: 'training', 'validation' or None. None (the default) keeps
            the previous behaviour of yielding every image in `data_dir`.

    Returns:
        The iterator returned by flow_from_directory.
    """
    if data_augment:
        datagen = ImageDataGenerator(rescale=1./255,
                                     rotation_range=20,
                                     width_shift_range=0.2,
                                     height_shift_range=0.2,
                                     shear_range=0.2,
                                     zoom_range=0.2,
                                     validation_split=validation_split,
                                     horizontal_flip=True)
    else:
        # The split is configured here as well so that `subset` works
        # regardless of whether augmentation is enabled.
        datagen = ImageDataGenerator(rescale=1./255,
                                     validation_split=validation_split)
    # Without `subset` the split configured above is ignored, which is why
    # the argument has to be forwarded for the train/validation split to
    # take effect.
    generator = datagen.flow_from_directory(data_dir,
                                            target_size=target_size,
                                            color_mode=color_mode,
                                            batch_size=batch_size,
                                            shuffle=shuffle,
                                            class_mode=class_mode,
                                            subset=subset)
    return generator


train_generator = image_data_generator('Your_DataBase_Path',data_augment=True)
If you want to use the preprocessing function of the VGG16 model and split your dataset into 70% training and 30% validation, follow this approach:
# Template only: target_size, classes and batch_size are left blank and must
# be filled in before this runs (it is not valid Python as written).
# NOTE(review): each assignment is split from its expression by a line
# break; join them (or wrap the expression in parentheses) when filling in.
# Both generators are built with the same VGG16 preprocess_input function
# and validation_split=0.3; `subset` selects the training or validation part.
train_path = 'your dataset path'
train_batch=
ImageDataGenerator(preprocessing_function=tf.keras.applications.vgg16.preprocess_input, validation_split=0.3) \
.flow_from_directory(directory=train_path, target_size=(,), classes=['',''], batch_size= ,class_mode='categorical', subset='training')
val_batch=
ImageDataGenerator(preprocessing_function=tf.keras.applications.vgg16.preprocess_input, validation_split=0.3) \
.flow_from_directory(directory=train_path, target_size=(,), classes=['',''], batch_size=, class_mode='categorical', subset='validation')
If you are using TensorFlow 2.x, you can use the same fit() function and also use the validation_split parameter of ImageDataGenerator.
I don't know if you are still interested, but I found the following workaround. The most important function is GetTrainValidTestGeneratorFromDir; the other ones are just helpers used by it. The basic idea is that you first split the ImageDataGenerator in two using validation_split, which gives you two iterators. You can use the second one as the test iterator, and you further divide the first one in the following way:
First, call flow_from_directory with the training subset (so you can be sure that the test data are excluded). Then use that same generator to build a dataframe of its files and labels, and split it in two with the flow_from_dataframe function. You end up with three image data iterators without changing the folders.
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 15 10:15:18 2021
#author: Alessandro
"""
import pandas as pd
from keras.preprocessing.image import ImageDataGenerator
def ShuffleDataframe(thedataframe):
    """Return a row-shuffled copy of the dataframe with a fresh 0..n-1 index.

    The shuffle uses a fixed random_state (42), so the resulting order is
    reproducible. The input dataframe is not modified.
    """
    shuffled = thedataframe.sample(n=len(thedataframe), random_state=42)
    # drop=True discards the old index directly. Re-inserting it as a column
    # and dropping 'index' afterwards fails when the index has a name and
    # removes a genuine data column that happens to be called 'index'.
    return shuffled.reset_index(drop=True)
def TransformGeneratorClassNumberToLabels(theGenerator, theLabelsNumbers):
    """Translate class numbers into the generator's class names.

    The names are the keys of theGenerator.class_indices taken in the
    dictionary's own order; each number in theLabelsNumbers is used as a
    position into that list of keys.

    Returns a list of label strings, one per input number.
    """
    names_by_position = list(theGenerator.class_indices)
    return [names_by_position[class_number]
            for class_number in theLabelsNumbers]
def GetGeneratorDataframe(theGenerator):
    """Build a shuffled dataframe of the generator's files and labels.

    The 'File' column holds theGenerator.filenames and the 'Label' column
    holds the class names corresponding to theGenerator.classes. The rows
    are shuffled with ShuffleDataframe before being returned.
    """
    label_names = TransformGeneratorClassNumberToLabels(
        theGenerator, theGenerator.classes)
    listing = pd.DataFrame({'File': theGenerator.filenames,
                            'Label': label_names})
    return ShuffleDataframe(listing)
def GetTrainValidTestGeneratorFromDir(thedirectory,
                                      input_shape= (256, 256, 3),
                                      validation_split=0.1,
                                      rescaling = 1./255):
    """Create training, validation and test iterators from one directory.

    The directory is first split with flow_from_directory: the 'validation'
    subset becomes the test iterator. The files of the 'training' subset are
    then listed in a shuffled dataframe, which flow_from_dataframe splits
    again into the training and validation iterators.

    Args:
        thedirectory: Folder passed to flow_from_directory and used as the
            base directory for flow_from_dataframe.
        input_shape: Image shape; only its first two entries are used, as
            the target size.
        validation_split: Currently unused. Both splits use the fraction 0.2
            configured on the generator below.
            NOTE(review): honouring this argument would change the default
            split from 0.2 to 0.1 - confirm the intended value first.
        rescaling: Factor applied to pixel values by the generator.

    Returns:
        Tuple (training_generator, validation_generator, test_generator).
    """
    # `rescaling` is forwarded here so the argument actually takes effect.
    train_datagen = ImageDataGenerator(rescale=rescaling,
                                       validation_split=0.2)
    target_size = input_shape[0:2]

    # First split: everything that is not test data.
    # NOTE(review): save_to_dir='checkdir' looks like a debugging aid and
    # presumably needs that folder to exist - confirm it is still wanted.
    train_and_valid_generator = train_datagen.flow_from_directory(
        thedirectory,
        target_size=target_size,
        batch_size=20,
        class_mode="categorical",
        subset='training',
        save_to_dir='checkdir')
    test_generator = train_datagen.flow_from_directory(
        thedirectory,
        target_size=target_size,
        batch_size=20,
        class_mode="categorical",
        subset='validation')

    # Second split: list the non-test files and divide them once more.
    thedataframe = GetGeneratorDataframe(train_and_valid_generator)
    class_mode = "categorical"
    training_generator = train_datagen.flow_from_dataframe(
        dataframe=thedataframe,
        directory=thedirectory,
        target_size=target_size,
        x_col="File",
        y_col="Label",
        subset="training",
        class_mode=class_mode)
    validation_generator = train_datagen.flow_from_dataframe(
        dataframe=thedataframe,
        directory=thedirectory,
        target_size=target_size,
        x_col="File",
        y_col="Label",
        subset="validation",
        class_mode=class_mode)
    return training_generator, validation_generator, test_generator
# Example usage: build the three iterators for one folder.
input_shape = (450, 450, 3)
myDir = "MyFolder"
# input_shape is passed explicitly; otherwise the function falls back to its
# own default and the value defined above is never used.
(training_generator,
 validation_generator,
 test_generator) = GetTrainValidTestGeneratorFromDir(myDir,
                                                     input_shape=input_shape)
# The next part is just to verify what you got.
training_dataframe = GetGeneratorDataframe(training_generator)
valid_dataframe = GetGeneratorDataframe(validation_generator)
test_dataframe = GetGeneratorDataframe(test_generator)
Note that, starting from TF 2.9, the ImageDataGenerator() has been deprecated in favour of tf.keras.utils.image_dataset_from_directory() which achieves the same functionality.
It is highly likely that the former will be removed altogether in the upcoming TF versions.
Deprecated: tf.keras.preprocessing.image.ImageDataGenerator is not
recommended for new code. Prefer loading images with
tf.keras.utils.image_dataset_from_directory and transforming the
output tf.data.Dataset with preprocessing layers. For more
information, see the tutorials for loading images and augmenting
images, as well as the preprocessing layer guide.
In case you are looking for a train/validation split in generators for a segmentation task, you can use the following snippet:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
BATCH_SIZE = 16
val_fraction = 0.1

# Images are rescaled and brightness-augmented; masks are left untouched.
# Both generators are configured with the same validation fraction.
image_generator = ImageDataGenerator(rescale=1/255,
                                     brightness_range=[0.75, 1.75],
                                     validation_split=val_fraction)
mask_generator = ImageDataGenerator(validation_split=val_fraction)

# Options shared by the training and validation flows of each kind.
# class_mode=None is passed because no class labels are requested.
# NOTE(review): the identical seed is presumably what keeps images and masks
# in step; this also assumes df_img and df_gt list matching rows in the same
# order - confirm.
image_flow_options = dict(
    directory=image_dir,
    x_col='image',
    class_mode=None,
    color_mode='rgb',
    target_size=(INPUT_SIZE, INPUT_SIZE),
    batch_size=BATCH_SIZE,
    seed=1,
)
mask_flow_options = dict(
    directory=gt_dir,
    x_col='mask',
    color_mode='grayscale',
    class_mode=None,
    target_size=(INPUT_SIZE, INPUT_SIZE),
    batch_size=BATCH_SIZE,
    seed=1,
)

# Training flows set shuffle=True explicitly.
train_image_generator = image_generator.flow_from_dataframe(
    df_img, shuffle=True, subset='training', **image_flow_options)
train_mask_generator = mask_generator.flow_from_dataframe(
    df_gt, shuffle=True, subset='training', **mask_flow_options)

# Validation flows do not pass `shuffle`.
validation_image_generator = image_generator.flow_from_dataframe(
    df_img, subset='validation', **image_flow_options)
validation_mask_generator = mask_generator.flow_from_dataframe(
    df_gt, subset='validation', **mask_flow_options)

# Pair each image batch with the mask batch drawn alongside it.
train_generator = zip(train_image_generator, train_mask_generator)
validation_generator = zip(validation_image_generator, validation_mask_generator)