I followed the tutorial here to try to train my model using CIFAR-100. But I'm getting this error. What do I do?
ValueError: Data Params Error: The dataset label shape (100,) does not match the
number of classes (10) in the dataset. Please ensure the dataset
labels have 10 classes, or change the number of classes
to match the dataset.
Here is my setup, adapted from the tutorial but for CIFAR-100.
import tensorflow as tf
import tensorflow_datasets as tfds
import masterful
masterful = masterful.register()
TRAINING_PERCENTAGE = 5
(training_dataset,
test_dataset) = tfds.load('cifar100',
as_supervised=True,
split=[f'train[:{TRAINING_PERCENTAGE}%]', 'test'],
with_info=False)
def sparse_to_dense(image, label):
label = tf.cast(label, tf.int32)
one_hot_label = tf.one_hot(label, depth=100)
return image, one_hot_label
training_dataset = training_dataset.map(sparse_to_dense,
num_parallel_calls=tf.data.AUTOTUNE)
test_dataset = test_dataset.map(sparse_to_dense,
num_parallel_calls=tf.data.AUTOTUNE)
def get_model():
model = tf.keras.models.Sequential()
model.add(
tf.keras.layers.experimental.preprocessing.Rescaling(1. / 255,
input_shape=(32, 32,
3)))
model.add(tf.keras.layers.Conv2D(
16,
(3, 3),
activation='relu',
))
model.add(tf.keras.layers.GlobalAveragePooling2D())
model.add(tf.keras.layers.Dense(10))
return model
model = get_model()
model_params = masterful.architecture.learn_architecture_params(
model=model,
task=masterful.enums.Task.CLASSIFICATION,
input_range=masterful.enums.ImageRange.CIFAR10_TORCH,
num_classes=10,
prediction_logits=True,
)
training_dataset_params = masterful.data.learn_data_params(
dataset=training_dataset,
task=masterful.enums.Task.CLASSIFICATION,
image_range=masterful.enums.ImageRange.CIFAR10_TORCH,
num_classes=10,
sparse_labels=False,
)
optimization_params = masterful.optimization.learn_optimization_params(
model,
model_params,
training_dataset,
training_dataset_params,
)
# This is a set of parameters learned on CIFAR10 for
# small sized models.
regularization_params = masterful.regularization.parameters.CIFAR10_SMALL
training_report = masterful.training.train(
model,
model_params,
optimization_params,
regularization_params,
None,
training_dataset,
training_dataset_params,
)
Both of you are correct: the regularization parameters and the classification setup here support 10 classes, so for CIFAR-100 you may consider working through the configuration in multiple steps.
[ Sample ]:
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: DataSet
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
TRAINING_PERCENTAGE = 5
(training_dataset, test_dataset) = tfds.load('cifar100', as_supervised=True, split=[f'train[:{TRAINING_PERCENTAGE}%]', 'test'], with_info=False)
training_dataset = training_dataset.map(sparse_to_dense, num_parallel_calls=tf.data.AUTOTUNE)
test_dataset = test_dataset.map(sparse_to_dense, num_parallel_calls=tf.data.AUTOTUNE)
task = masterful.enums.Task.CLASSIFICATION
[ ENUM ]:
# Task.CLASSIFICATION
# Task.BINARY_CLASSIFICATION
# Task.MULTILABEL_CLASSIFICATION
# Task.DETECTION
# Task.LOCALIZATION
# Task.SEMANTIC_SEGMENTATION
# Task.INSTANCE_SEGMENTATION
# Task.KEYPOINT_DETECTION
[ Statistics ]:
# @enum.unique
# class DatasetStatistics(enum.Enum):
# """An enum for datasets statistics required for Masterful's normalization."""
# CIFAR_10_MEAN = [0.4914, 0.4822, 0.4465]
# CIFAR_10_STD = [0.247, 0.243, 0.261]
# CIFAR_10_BGR_255 = [113.8575, 122.961, 125.307]
# CIFAR_100_MEAN = [0.5071, 0.4867, 0.4408]
# CIFAR_100_STD = [0.2675, 0.2565, 0.2761]
# CIFAR_100_BGR_255 = [129.3105, 124.1085, 112.404]
# IMAGENET_MEAN = [0.485, 0.456, 0.406]
# IMAGENET_STD = [0.229, 0.224, 0.225]
# IMAGENET_MEAN_BGR_255 = [103.939, 116.779, 123.68]
[ Params ]:
# ArchitectureParams(task=<Task.CLASSIFICATION: 'classification'>, num_classes=10, ensemble_multiplier=1, custom_objects={}, model_config=None,
# backbone_only=False, input_shape=(32, 32, 3), input_range=<ImageRange.CIFAR100_TORCH: 'CIFAR100_TORCH'>, input_dtype=tf.float32, input_channels_last=True,
# prediction_logits=True, prediction_dtype=tf.float32, prediction_structure=<TensorStructure.SINGLE_TENSOR: 'single_tensor'>, prediction_shape=TensorShape([10]))
This ValueError message explains the problem.
The dataset labels, being CIFAR-100, are one-hot vectors of length 100,
but the call to masterful.data.learn_data_params is passed num_classes=10.
Update your model architecture with:
model.add(tf.keras.layers.Dense(100))
and pass num_classes=100 both to learn_architecture_params and to learn_data_params.
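Putting it together, a minimal sketch of the corrected pieces (everything else stays as in your setup; num_classes has to agree everywhere, and the params dump above shows an ImageRange.CIFAR100_TORCH you can use instead of the CIFAR-10 range):
model = get_model()  # with tf.keras.layers.Dense(100) as the final layer
model_params = masterful.architecture.learn_architecture_params(
    model=model,
    task=masterful.enums.Task.CLASSIFICATION,
    input_range=masterful.enums.ImageRange.CIFAR100_TORCH,  # was CIFAR10_TORCH
    num_classes=100,  # was 10
    prediction_logits=True,
)
training_dataset_params = masterful.data.learn_data_params(
    dataset=training_dataset,
    task=masterful.enums.Task.CLASSIFICATION,
    image_range=masterful.enums.ImageRange.CIFAR100_TORCH,  # was CIFAR10_TORCH
    num_classes=100,  # was 10
    sparse_labels=False,
)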
Related
I am trying to train on a colab TPU using data from my GCP account.
When I run the cell that starts the training, the cell just seems to hang, with no progress. I put a very low number of steps, so that the training should complete pretty quickly, about a minute on GPU, but it never finishes on TPU.
I am using a custom model, and I am loading files saved on GCP using the solution given in this Stack Overflow answer: How to connect to private storage bucket using the Google Colab TPU
The model trains/runs just fine on GPU/CPU.
The full code is in this colab notebook here
https://colab.research.google.com/drive/13HgRJru0glOzn7m0b7tmVCO_VrRpa1XS?usp=sharing
And here's a google drive link to the sample data file
https://drive.google.com/file/d/10EFyxau97jLfeGaKugMevIyX-bobsFe5/view?usp=sharing
And below is the code from the colab notebook
!pip install transformers --q
%tensorflow_version 2.x
!gcloud auth login
'''NEED TO RUN THIS CELL TWICE TO AVOID ERROR'''
from google.colab import auth
auth.authenticate_user()
project_id = 'machinelearning-264918'
!gcloud config set project {project_id}
!pip install tfa-nightly
import tensorflow_addons as tfa
from transformers import TFBertModel, AutoModel
import tensorflow as tf
from tensorflow.keras.layers import (Dense,
Dropout)
import os
import tensorflow_addons as tfa
logger = tf.get_logger()
logger.info(tf.__version__)
autotune = tf.data.experimental.AUTOTUNE
try:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.experimental.TPUStrategy(tpu)
logger.info('Running with TPUStrategy on TPU {} with {} cores '
.format(tpu.cluster_spec().as_dict()['worker'],
strategy.num_replicas_in_sync))
batch_size = 3 * strategy.num_replicas_in_sync
except Exception:
# raise ValueError
strategy = tf.distribute.OneDeviceStrategy(device='/gpu:0')
logger.warning('Failed initializing TPU! Running on GPU')
batch_size = 3
from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer as lso
from tensorflow.python.distribute import parameter_server_strategy
def _minimize(strategy, tape, optimizer, loss, trainable_variables):
with tape:
if isinstance(optimizer, lso.LossScaleOptimizer):
loss = optimizer.get_scaled_loss(loss)
gradients = tape.gradient(loss, trainable_variables)
# Whether to aggregate gradients outside of optimizer. This requires support
# of the optimizer and doesn't work with ParameterServerStrategy and
# CentralStorageStrategy.
aggregate_grads_outside_optimizer = (
optimizer._HAS_AGGREGATE_GRAD and # pylint: disable=protected-access
not isinstance(strategy.extended,
parameter_server_strategy.ParameterServerStrategyExtended))
if aggregate_grads_outside_optimizer:
# We aggregate gradients before unscaling them, in case a subclass of
# LossScaleOptimizer all-reduces in fp16. All-reducing in fp16 can only be
# done on scaled gradients, not unscaled gradients, for numeric stability.
gradients = optimizer._aggregate_gradients(zip(gradients, # pylint: disable=protected-access
trainable_variables))
if isinstance(optimizer, lso.LossScaleOptimizer):
gradients = optimizer.get_unscaled_gradients(gradients)
gradients = optimizer._clip_gradients(gradients) # pylint: disable=protected-access
if trainable_variables:
if aggregate_grads_outside_optimizer:
optimizer.apply_gradients(
zip(gradients, trainable_variables),
experimental_aggregate_gradients=False)
else:
optimizer.apply_gradients(zip(gradients, trainable_variables))
class CustomModel(tf.keras.Model):
def train_step(self, data):
# Unpack the data. Its structure depends on your model and
# on what you pass to `fit()`.
x, y = data
batch_label = tf.reshape(y, (tf.size(y)/2, 2), name=None)
rs = tf.ragged.stack(x, axis=0)
reg = rs.to_tensor()
batch_input = tf.reshape(reg, (tf.shape(reg)[0]*tf.shape(reg)[1], tf.shape(reg)[2]))
with tf.GradientTape() as tape:
y_pred = self(batch_input, training=True) # Forward pass
# Compute the loss value
# (the loss function is configured in `compile()`)
loss = self.compiled_loss(batch_label, y_pred, regularization_losses=self.losses)
# Compute gradients
_minimize(self.distribute_strategy, tape, self.optimizer, loss,
self.trainable_variables)
# Update weights
# self.optimizer.apply_gradients(zip(gradients, trainable_vars))
# Update metrics (includes the metric that tracks the loss)
self.compiled_metrics.update_state(y, y_pred)
# Return a dict mapping metric names to current value
return {m.name: m.result() for m in self.metrics}
def get_model(drop_out):
sciBert = TFBertModel.from_pretrained('bert-base-uncased', from_pt=True)
allFinal = tf.keras.Input(shape=(None,), dtype=tf.int32, name='inputN')
'''Should posFinal and negFinal be concatenated, so there's only one call to sciBert'''
allBertOut = sciBert(allFinal, training=True)
allPoolConcat = tf.concat([
allBertOut[0][:, 0], #output of ff layer after last hidden state since it seems to be untrained in roberta
tf.reduce_mean(allBertOut[0][:, 1:-1], axis=1)
],axis=1)
postLayer = tf.keras.layers.Dense(768, activation='swish', name='postff')
LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNormO")
postLayer2 = tf.keras.layers.Dense(768, activation='swish', name='2postff')
classifier = tf.keras.layers.Dense(2, name='classifierff')
postWeights = postLayer(allPoolConcat)
postWeights = LayerNorm(postWeights)
postWeights = Dropout(drop_out)(postWeights)
postWeights2 = postLayer2(postWeights)
allScores = classifier(postWeights2)
model = CustomModel(inputs=allFinal, outputs=allScores)
return model
#tf.function
def _parse_example(example_proto):
features = {
'sciBert_SentenceIndex': tf.io.VarLenFeature( dtype=tf.int64),
'SciBert_IDs': tf.io.VarLenFeature(dtype=tf.int64),
}
parsed_example_dict = tf.io.parse_single_example(example_proto, features)
sentencePositions = parsed_example_dict['sciBert_SentenceIndex']
passageIds = parsed_example_dict['SciBert_IDs']
sentencePositions = tf.sparse.to_dense(sentencePositions)
bertIds = tf.sparse.to_dense(passageIds)
sentencePositions = tf.cast(sentencePositions, dtype=tf.int32)
passageIds = tf.cast(passageIds, dtype=tf.int32)
length = tf.shape(
sentencePositions, out_type=tf.dtypes.int32, name='shape'
)
lengthMinusOne = tf.math.subtract(
length, 1, name='SubtractOne'
)
# create random numbers for a sentence index up to the 2nd-to-last index
# the last index is just the last position of the non-padded bertID
startRandSentIndex = tf.random.uniform(
shape=[1], minval=0, maxval=lengthMinusOne[0], dtype=tf.dtypes.int32, seed=None, name=None)
# Get the end point for that sentence
endRandSentIndex = tf.math.add(startRandSentIndex, 1, name=None)
# last position of the non-padded bertID
lastPosition = length-1
# extract BertID positions for sentence start/end and bertID end
startSentencePosit = tf.gather_nd(sentencePositions, [startRandSentIndex], batch_dims=0)
endSentencePosit = tf.gather_nd(sentencePositions, [endRandSentIndex], batch_dims=0)
lastPassagePosit = tf.gather_nd(sentencePositions, [lastPosition], batch_dims=0)
# Get slices of BertIDs for the query, and the rest
firstPiece = tf.slice(bertIds, [0], [startSentencePosit[0]] )
queryPiece = tf.slice(bertIds, [startSentencePosit[0]], [endSentencePosit[0]-startSentencePosit[0]] )
lastPiece = tf.slice(bertIds, [endSentencePosit[0]], [lastPassagePosit[0]-endSentencePosit[0]] )
# concat rest of passage
restPassagePiece = tf.concat( [firstPiece,lastPiece], axis=0 )
# Clip
queryPiece = queryPiece[0:256]
restPassagePiece = restPassagePiece[0:510]
# add special tokens for proper input into the model
return tf.cast(queryPiece, dtype=tf.int32), tf.cast(restPassagePiece, dtype=tf.int32)
#tf.function
def clip_seq_to_len(seq, num_tokens=512):
seq_len = tf.shape(seq)[0]
if seq_len > 511:
return seq[:511]
return seq[:]
#tf.function
def make_samples(query_a, passage_a, query_b, passage_b):
CLS_inputID = tf.constant([102])
SEP_inputID = tf.constant([103])
positive_sample_a = clip_seq_to_len(tf.concat([CLS_inputID, query_a, SEP_inputID, passage_a], axis=-1))
positive_sample_b = clip_seq_to_len(tf.concat([CLS_inputID, query_b, SEP_inputID, passage_b], axis=-1))
negative_sample_a = clip_seq_to_len(tf.concat([CLS_inputID, query_a, SEP_inputID, passage_b], axis=-1))
negative_sample_b = clip_seq_to_len(tf.concat([CLS_inputID, query_b, SEP_inputID, passage_a], axis=-1))
positive_sample_a = tf.concat([positive_sample_a, SEP_inputID], axis=-1)
positive_sample_b = tf.concat([positive_sample_b, SEP_inputID], axis=-1)
negative_sample_a = tf.concat([negative_sample_a, SEP_inputID], axis=-1)
negative_sample_b = tf.concat([negative_sample_b, SEP_inputID], axis=-1)
return positive_sample_a, positive_sample_b, negative_sample_a, negative_sample_b
#tf.function
def get_samples(example_a, example_b):
samples = make_samples(*_parse_example(example_a), *_parse_example(example_b))
return samples
config = {
'drop_out':0.1
}
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
with strategy.scope():
model = get_model(**config)
model.compile(loss=loss_fn,
optimizer=tfa.optimizers.AdamW(weight_decay=1e-5, learning_rate=3e-4, epsilon=1e-07), run_eagerly=False)
config_name = 'model_b'
base_dir = 'gs://bdora-semanticscholar'
model_dir = os.path.join(base_dir, config_name)
# tensorboard_dir = os.path.join(model_dir, 'logs_' + str(time()))
tfrecords_pattern_train = os.path.join(base_dir, 'VersionB_00022*')
tfrecords_pattern_train2 = os.path.join(base_dir, 'VersionB_00022*')
#tf.function
def gen():
while True:
yield ([1, 0], [1, 0], [0, 1], [0, 1] )
batchNumber = batch_size
run_eagerly = False
with strategy.scope():
filenames = tf.io.gfile.glob(tfrecords_pattern_train)
train_dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=autotune)
filenames = tf.io.gfile.glob(tfrecords_pattern_train)
neg_dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=autotune)
train_dataset = train_dataset.shuffle(150_000, seed=1000, reshuffle_each_iteration=True)
neg_dataset = neg_dataset.shuffle(150_000, seed=2000, reshuffle_each_iteration=True)
train_datasetC = tf.data.Dataset.zip((train_dataset, neg_dataset))
train_datasetC = train_datasetC.map(get_samples, num_parallel_calls=autotune)
train_datasetC = train_datasetC.shuffle(1024, seed=1000, reshuffle_each_iteration=True)
train_datasetC = train_datasetC.padded_batch(batchNumber, padding_values=(0, 0, 0, 0))
datasetLabels = tf.data.Dataset.from_generator(
gen,
(tf.int32, tf.int32, tf.int32, tf.int32),
(tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([None])))
datasetLabels = datasetLabels.batch(batchNumber)
train_datasetFinal = tf.data.Dataset.zip((train_datasetC, datasetLabels))
train_datasetFinal = train_datasetFinal.prefetch(autotune)
train_datasetFinal = train_datasetFinal.repeat()
train_datasetFinal = train_datasetFinal.apply(tf.data.experimental.ignore_errors())
model.fit(train_datasetFinal, steps_per_epoch=100, epochs=3)
And this is the only output I get
Epoch 1/3
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model/bert/pooler/dense/kernel:0', 'tf_bert_model/bert/pooler/dense/bias:0'] when minimizing the loss.
(the same warning is printed eight times in total)
I found this GitHub issue discussion [1] that you can refer to.
It's not an error; it just means those variables are not being updated. Those variables (the pooler) are not used when doing sequence classification.
[1] https://github.com/tensorflow/tensorflow/issues/37501
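If you want to confirm that only the pooler variables are missing gradients, a quick check along these lines works (a sketch, assuming you have the compiled model plus one prepared batch_input/batch_label pair, as built inside train_step above):
with tf.GradientTape() as tape:
    y_pred = model(batch_input, training=True)
    loss = model.compiled_loss(batch_label, y_pred)
grads = tape.gradient(loss, model.trainable_variables)
for var, grad in zip(model.trainable_variables, grads):
    if grad is None:
        # expect only the tf_bert_model/bert/pooler variables here
        print('No gradient for:', var.name)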
Dataset.py
import os
import random
from skimage import io
import cv2
from skimage.transform import resize
import numpy as np
import tensorflow as tf
import keras
import Augmentor
def iter_sequence_infinite(seq):
"""Iterate indefinitely over a Sequence.
# Arguments
seq: Sequence object
# Returns
Generator yielding batches.
"""
while True:
for item in seq:
yield item
# data generator class
class DataGenerator(keras.utils.Sequence):
def __init__(self, ids, imgs_dir, masks_dir, batch_size=10, img_size=128, n_classes=1, n_channels=3, shuffle=True):
self.id_names = ids
self.indexes = np.arange(len(self.id_names))
self.imgs_dir = imgs_dir
self.masks_dir = masks_dir
self.batch_size = batch_size
self.img_size = img_size
self.n_classes = n_classes
self.n_channels = n_channels
self.shuffle = shuffle
self.on_epoch_end()
# for printing the statistics of the function
def on_epoch_end(self):
'Updates indexes after each epoch'
self.indexes = np.arange(len(self.id_names))
if self.shuffle == True:
np.random.shuffle(self.indexes)
def __data_generation__(self, id_name):
'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
# Initialization
img_path = os.path.join(self.imgs_dir, id_name) # polyp segmentation/images/id_name.jpg
mask_path = os.path.join(self.masks_dir, id_name) # polyp segmenatation/masks/id_name.jpg
img = io.imread(img_path)
mask = cv2.imread(mask_path)
p = Augmentor.DataPipeline([[img, mask]])
p.resize(probability=1.0, width=self.img_size, height=self.img_size)
p.rotate_without_crop(probability=0.3, max_left_rotation=10, max_right_rotation=10)
#p.random_distortion(probability=0.3, grid_height=10, grid_width=10, magnitude=1)
p.shear(probability=0.3, max_shear_left=1, max_shear_right=1)
#p.skew_tilt(probability=0.3, magnitude=0.1)
p.flip_random(probability=0.3)
sample_p = p.sample(1)
sample_p = np.array(sample_p).squeeze()
p_img = sample_p[0]
p_mask = sample_p[1]
augmented_mask = (p_mask // 255) * 255 # denoising
q = Augmentor.DataPipeline([[p_img]])
q.random_contrast(probability=0.3, min_factor=0.2, max_factor=1.0) # low to High
q.random_brightness(probability=0.3, min_factor=0.2, max_factor=1.0) # dark to bright
sample_q = q.sample(1)
sample_q = np.array(sample_q).squeeze()
image = sample_q
mask = augmented_mask[::, ::, 0]
"""
# reading the image from dataset
## Reading Image
image = io.imread(img_path) # read the image into the image variable
image = resize(image, (self.img_size, self.img_size), anti_aliasing=True) # resizing input image to 128 * 128
mask = io.imread(mask_path, as_gray=True) # mask image of same size with all zeros
mask = resize(mask, (self.img_size, self.img_size), anti_aliasing=True) # resizing mask to fit the 128 * 128 image
mask = np.expand_dims(mask, axis=-1)
"""
# image normalization
image = image / 255.0
mask = mask / 255.0
return image, mask
def __len__(self):
"Denotes the number of batches per epoch"
return int(np.floor(len(self.id_names) / self.batch_size))
def __getitem__(self, index): # index : batch no.
# Generate indexes of the batch
indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
batch_ids = [self.id_names[k] for k in indexes]
imgs = list()
masks = list()
for id_name in batch_ids:
img, mask = self.__data_generation__(id_name)
imgs.append(img)
masks.append(np.expand_dims(mask,-1))
imgs = np.array(imgs)
masks = np.array(masks)
return imgs, masks # return batch
train.py
import argparse
import logging
import os
import sys
from tqdm import tqdm # progress bar
import numpy as np
import matplotlib.pyplot as plt
from keras import optimizers
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import segmentation_models as sm
from segmentation_models.utils import set_trainable
from dataset import DataGenerator, iter_sequence_infinite
def train_model(model, train_gen, valid_gen, epochs, save_cp=True):
total_batch_count = 0
train_img_num = len(train_gen.id_names)
train_batch_num = len(train_gen)
train_gen_out = iter_sequence_infinite(train_gen)
valid_batch_num = len(valid_gen)
valid_img_num = len(valid_gen.id_names)
valid_gen_out = iter_sequence_infinite(valid_gen)
for epoch in range(epochs): # iterate over the epochs
set_trainable(model)
epoch_loss = 0 # loss in this epoch
epoch_iou = 0
count = 0
with tqdm(total=train_img_num, desc=f'Epoch {epoch + 1}/{epochs}', position=0, leave=True, unit='img') as pbar: # make progress bar
for _ in range(train_batch_num):
batch = next(train_gen_out)
imgs = batch[0]
true_masks = batch[1]
loss, iou = model.train_on_batch(imgs, true_masks) # value of loss of this batch
epoch_loss += loss
epoch_iou += iou
pbar.set_postfix(**{'Batch loss': loss, 'Batch IoU': iou}) # show the current batch loss/IoU in the progress bar
pbar.update(imgs.shape[0]) # update progress
count += 1
total_batch_count += 1
train_gen.on_epoch_end()
print( "Epoch : loss: {}, IoU : {}".format(epoch_loss/count, epoch_iou/count))
# Do validation
validation_model(model, valid_gen_out, valid_batch_num, valid_img_num)
valid_gen.on_epoch_end()
if save_cp:
try:
if not os.path.isdir(checkpoint_dir):
os.mkdir(checkpoint_dir)
logging.info('Created checkpoint directory')
else:
pass
except OSError:
pass
model.save_weights(os.path.join(checkpoint_dir , f'CP_epoch{epoch + 1}.h5'))
logging.info(f'Checkpoint {epoch + 1} saved !')
def validation_model(model, valid_gen_out, valid_batch_num, valid_img_num):
epoch_loss = 0 # loss in this epoch
epoch_iou = 0
count = 0
with tqdm(total=valid_img_num, desc='Validation round', position=0, leave=True, unit='img') as pbar: # make progress bar
for _ in range(valid_batch_num):
batch = next(valid_gen_out)
imgs = batch[0]
true_masks = batch[1]
loss, iou = model.test_on_batch(imgs, true_masks) # value of loss of this batch
epoch_loss += loss
epoch_iou += iou
pbar.set_postfix(**{'Batch loss': loss, 'Batch IoU': iou}) # show the current batch loss/IoU in the progress bar
pbar.update(imgs.shape[0]) # update progress
count += 1
print("Validation loss: {}, IoU: {}".format(epoch_loss / count, epoch_iou / count))
pred_mask = model.predict(np.expand_dims(imgs[0],0))
plt.subplot(131)
plt.imshow(imgs[0])
plt.subplot(132)
plt.imshow(true_masks[0].squeeze(), cmap="gray")
plt.subplot(133)
plt.imshow(pred_mask.squeeze(), cmap="gray")
plt.show()
print()
def get_args():
parser = argparse.ArgumentParser(description='Train the UNet on images and target masks',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-e', '--epochs', metavar='E', type=int, default=50,
help='Number of epochs', dest='epochs')
parser.add_argument('-b', '--batch_size', metavar='B', type=int, nargs='?', default=2,
help='Batch size', dest='batch_size')
parser.add_argument('-l', '--learning-rate', metavar='LR', type=float, nargs='?', default=1e-5,
help='Learning rate', dest='lr')
parser.add_argument('-bb', '--backbone', default='resnet50', metavar='FILE',
help="backbone name")
parser.add_argument('-w', '--weight', dest='load', type=str, default=False,
help='Load model from a .h5 file')
parser.add_argument('-s', '--resizing', dest='resizing', type=int, default=384,
help='Target size the images are resized to')
parser.add_argument('-v', '--validation', dest='val', type=float, default=20.0,
help='Percent of the data that is used as validation (0-100)')
return parser.parse_args()
if __name__ == '__main__':
img_dir = './data/train/imgs/' # ./data/train/imgs/CVC_Original/'
mask_dir = './data/train/masks/' # ./data/train/masks/CVC_Ground Truth/'
checkpoint_dir = './checkpoints'
args = get_args()
# train path
train_ids = os.listdir(img_dir)
# Validation Data Size
n_val = int(len(train_ids) * args.val/100) # size of validation set
valid_ids = train_ids[:n_val] # list of image ids used for validation of result 0 to 9
train_ids = train_ids[n_val:] # list of image ids used for training dataset
# print(valid_ids, "\n\n")
print("training_size: ", len(train_ids), "validation_size: ", len(valid_ids))
train_gen = DataGenerator(train_ids, img_dir, mask_dir, img_size=args.resizing, batch_size=args.batch_size)
valid_gen = DataGenerator(valid_ids, img_dir, mask_dir, img_size=args.resizing, batch_size=args.batch_size)
print("total training batches: ", len(train_gen))
print("total validaton batches: ", len(valid_gen))
train_steps = len(train_ids) // args.batch_size
valid_steps = len(valid_ids) // args.batch_size
# define model
model = sm.Unet(args.backbone, encoder_weights='imagenet')
optimizer = optimizers.Adam(lr=args.lr, decay=1e-4)
model.compile(
optimizer=optimizer,
# "Adam",
loss=sm.losses.bce_dice_loss, # sm.losses.bce_jaccard_loss, # sm.losses.binary_crossentropy,
metrics=[sm.metrics.iou_score],
)
#model.summary()
callbacks = [
EarlyStopping(patience=6, verbose=1),
ReduceLROnPlateau(factor=0.1, patience=3, min_lr=1e-7, verbose=1),
ModelCheckpoint('./weights.Epoch{epoch:02d}-Loss{loss:.3f}-VIou{val_iou_score:.3f}.h5', verbose=1,
monitor='val_accuracy', save_best_only=True, save_weights_only=True)
]
train_model(model=model, train_gen=train_gen, valid_gen=valid_gen, epochs=args.epochs)
When I run this code, the first epochs progress well, but within about 20 epochs it raises the GPU memory overflow error below:
(0) Resource exhausted: OOM when allocating tensor with shape[2,64,96,96] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[{{node decoder_stage2b_bn/FusedBatchNorm}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
So I think the problem is in the data generation.
This code generates batches in the following order:
In train.py, the DataGenerator class (a Keras Sequence implemented in Dataset.py) is initialized:
train_gen = DataGenerator(train_ids, img_dir, mask_dir, img_size=args.resizing, batch_size=args.batch_size)
valid_gen = DataGenerator(valid_ids, img_dir, mask_dir, img_size=args.resizing, batch_size=args.batch_size)
First, inside the function 'train_model', the DataGenerator (Sequence) is converted to a plain generator using the function 'iter_sequence_infinite':
train_gen_out = iter_sequence_infinite(train_gen)
valid_gen_out = iter_sequence_infinite(valid_gen)
A batch is then fetched with the built-in 'next':
batch = next(train_gen_out)
I expected this to cause no memory problem, but one occurs.
What is the problem, and how can I solve it?
Thanks.
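For reference, a Keras Sequence can also be passed to fit_generator directly, without the infinite-generator wrapper; a minimal sketch under that assumption (Keras then drives on_epoch_end and the epoch boundaries itself):
model.fit_generator(
    train_gen,
    validation_data=valid_gen,
    epochs=args.epochs,
    callbacks=callbacks,
)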
I've been breaking my head on this for 3 days now.
I followed mainly this link to create my own datagenerator. But one way or another I'm doing something wrong and I can't figure out why. My error is:
ValueError: Error when checking input: expected dense_4_input to have 2 dimensions, but got array with shape (5, 128, 128, 3)
The MLP for the numerical input:
def create_mlp(dim, regress=False):
# define our MLP network
model = Sequential()
model.add(Dense(8, input_dim=dim, activation="relu"))
model.add(Dense(4, activation="relu"))
# check to see if the regression node should be added
if regress:
model.add(Dense(1, activation="linear"))
return model
The CNN for the image:
def create_cnn(inputshape, filters=(16, 32, 64), regress=True):
chanDim = -1
# define the model input
inputs = Input(shape=inputshape)
# loop over the number of filters
for (i, f) in enumerate(filters):
# if this is the first CONV layer then set the input
# appropriately
if i == 0:
x= inputs
# CONV => RELU => BN => POOL
x = Conv2D(f, (3, 3), padding="same")(x)
x = Activation("relu")(x)
x = BatchNormalization(axis=chanDim)(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
# flatten the volume, then FC => RELU => BN => DROPOUT
x = Flatten()(x)
x = Dense(16)(x)
x = Activation("relu")(x)
x = BatchNormalization(axis=chanDim)(x)
x = Dropout(0.5)(x)
# apply another FC layer, this one to match the number of nodes
# coming out of the MLP
x = Dense(4)(x)
x = Activation("relu")(x)
# check to see if the regression node should be added
if regress:
x = Dense(1, activation="linear")(x)
# construct the CNN
model = Model(inputs, x)
# return the CNN
return model
My own generator:
def aux_generator(img="todo", aux_input="todo", batch_size=3):
while True:
# Select files (paths/indices) for the batch
# todo make this random
img_path, gridnum, batch_output = get_batch_path()
batch_input_img = []
batch_input_sattelite = []
# Read in each input, perform preprocessing and get labels
for input_path in img_path:
input_img = get_input_image(input_path)
input = preprocess_input(image=input_img)
batch_input_img += [input]
for GridNum in gridnum:
# append is not good!
batch_input_sattelite.append(get_input_sattelite(GridNum))
# Return a tuple of (input,output) to feed the network
batch_x1 = np.array(batch_input_img)
batch_x2 = np.array(batch_input_sattelite)
batch_y = np.array(batch_output)
print("image shape : ", batch_x1.shape) #(5, 128, 128, 3)
print("Aux shape: ", batch_x2.shape, batch_x2) #(5,)
yield [batch_x1, batch_x2], batch_y
def get_batch_path():
# use the df we produced in downloadMpas to know where the images are and what their NO2 concentration is
img_info_df = pd.read_csv(r"Small/mappingTest.csv", delimiter=',', header=None,
names=['GridNum', 'id', 'score', 'lat', 'lon'])
img_info_df = img_info_df[img_info_df.score != "score"]
# the Keras network needs float for the score (not object, which is the default when it is read in)
img_info_df = img_info_df.astype({"GridNum": 'float64', "id": 'object', "score": 'float64'})
return img_info_df['id'].head(n=5), img_info_df['GridNum'].head(n=5), img_info_df['score'].head(n=5)
def get_input_image(path):
# get image
img = image.load_img(r"Small/" + path)
img = image.img_to_array(img)
# get the corresponding value of the sattelite data
return img
def get_input_sattelite(GridNum):
sattelite_no2 = sattelite_df[sattelite_df['GridNum'] == GridNum]['sattelite'].values[0]
print("sattelite no2:", sattelite_no2)
return sattelite_no2
def preprocess_input(image):
# do whatever we want to the images
return (image)
The main:
sattelite_df = pd.read_csv(r"Small/sattelite.csv", delimiter=',', header=None,
names=['GridNum', 'id', 'score', 'lat', 'lon', 'sattelite'])
input_img_shape = (128, 128, 3)
input_aux_shape = (1)
img_model = create_cnn(input_img_shape)
aux_model = create_mlp(input_aux_shape, regress=False)
combinedInput = concatenate([aux_model.output, img_model.output])
# our final FC layer head will have two dense layers, the final one
# being our regression head
x = Dense(4, activation="relu")(combinedInput)
x = Dense(1, activation="linear")(x)
# our final model will accept categorical/numerical data on the MLP
# input and images on the CNN input, outputting a single value (the
# predicted price of the house)
model = Model(inputs=[aux_model.input, img_model.input], outputs=x)
opt = Adam(lr=1e-3, decay=1e-3 / 200)
model.compile(loss="mean_absolute_percentage_error", optimizer=opt)
batch_size = 2
early = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)
ImageFile.LOAD_TRUNCATED_IMAGES = True
model.fit_generator(
aux_generator(batch_size=batch_size),
steps_per_epoch=10 // batch_size,
epochs=2,
validation_data=aux_generator(batch_size=3),
validation_steps=20 // batch_size,
callbacks=[early])
Any help is welcome, as I don't know what I'm doing wrong.
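One thing worth checking, given the printed shapes: the model is built with inputs=[aux_model.input, img_model.input], but the generator yields [batch_x1, batch_x2] with the images first, so the MLP input (dense_4_input, which expects 2-dimensional input) receives the (5, 128, 128, 3) image array. A minimal sketch of the reordered yield, assuming that ordering is the culprit:
# aux batch first, to match inputs=[aux_model.input, img_model.input]
yield [batch_x2, batch_x1], batch_y  # (5,) aux values, then (5, 128, 128, 3) images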
Hi, I used my own dataset to train the model, but I get the error mentioned below. My dataset has 124 classes with labels 0 to 123; the images are 60*60 grayscale; the batch size is 10; and the result is:
lables.eval() --> [ 1 101 101 103 103 103 103 100 102 1] -- len(lables.eval())= 10
orginal pic size -- > (?, 60, 60, 1)
First convolutional layer (?, 30, 30, 32)
Second convolutional layer. (?, 15, 15, 64)
flatten. (?, 14400)
dense .1 (?, 2048)
dense .2 (?, 124)
Error:
tensorflow.python.framework.errors_impl.InvalidArgumentError: logits and
labels must have the same first dimension, got logits shape [40,124] and
labels shape [10]
Code:
def model_fn(features, labels, mode, params):
# Reference to the tensor named "image" in the input-function.
x = features["image"]
# The convolutional layers expect 4-rank tensors
# but x is a 2-rank tensor, so reshape it.
net = tf.reshape(x, [-1, img_size, img_size, num_channels])
# First convolutional layer.
net = tf.layers.conv2d(inputs=net, name='layer_conv1',
filters=32, kernel_size=3,
padding='same', activation=tf.nn.relu)
net = tf.layers.max_pooling2d(inputs=net, pool_size=2, strides=2)
# Second convolutional layer.
net = tf.layers.conv2d(inputs=net, name='layer_conv2',
filters=64, kernel_size=3,
padding='same', activation=tf.nn.relu)
net = tf.layers.max_pooling2d(inputs=net, pool_size=2, strides=2)
# Flatten to a 2-rank tensor.
net = tf.contrib.layers.flatten(net)
# Eventually this should be replaced with:
# net = tf.layers.flatten(net)
# First fully-connected / dense layer.
# This uses the ReLU activation function.
net = tf.layers.dense(inputs=net, name='layer_fc1',
units=2048, activation=tf.nn.relu)
# Second fully-connected / dense layer.
# This is the last layer so it does not use an activation function.
net = tf.layers.dense(inputs=net, name='layer_fc_2',
units=num_classes)
# Logits output of the neural network.
logits = net
y_pred = tf.nn.softmax(logits=logits)
y_pred_cls = tf.argmax(y_pred, axis=1)
if mode == tf.estimator.ModeKeys.PREDICT:
spec = tf.estimator.EstimatorSpec(mode=mode,
predictions=y_pred_cls)
else:
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=labels, logits=logits)
loss = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate=params["learning_rate"])
train_op = optimizer.minimize(
loss=loss, global_step=tf.train.get_global_step())
metrics = \
{
"accuracy": tf.metrics.accuracy(labels, y_pred_cls)
}
spec = tf.estimator.EstimatorSpec(
mode=mode,
loss=loss,
train_op=train_op,
eval_metric_ops=metrics)
return spec
These labels come from here, via TFRecords:
def input_fn(filenames, train, batch_size=10, buffer_size=2048):
# Args:
# filenames: Filenames for the TFRecords files.
# train: Boolean whether training (True) or testing (False).
# batch_size: Return batches of this size.
# buffer_size: Read buffers of this size. The random shuffling
# is done on the buffer, so it must be big enough.
# Create a TensorFlow Dataset-object which has functionality
# for reading and shuffling data from TFRecords files.
dataset = tf.data.TFRecordDataset(filenames=filenames)
# Parse the serialized data in the TFRecords files.
# This returns TensorFlow tensors for the image and labels.
dataset = dataset.map(parse)
if train:
# If training then read a buffer of the given size and
# randomly shuffle it.
dataset = dataset.shuffle(buffer_size=buffer_size)
# Allow infinite reading of the data.
num_repeat = None
else:
# If testing then don't shuffle the data.
# Only go through the data once.
num_repeat = 1
# Repeat the dataset the given number of times.
dataset = dataset.repeat(num_repeat)
# Get a batch of data with the given size.
dataset = dataset.batch(batch_size)
# Create an iterator for the dataset and the above modifications.
iterator = dataset.make_one_shot_iterator()
# Get the next batch of images and labels.
images_batch, labels_batch = iterator.get_next()
# The input-function must return a dict wrapping the images.
x = {'image': images_batch}
y = labels_batch
print(x, ' - ', y.get_shape())
return x, y
I generate the labels via this code; for example, image name math-1 gives label 1:
def get_lable_and_image(path):
lbl = []
img = []
for filename in glob.glob(os.path.join(path, '*.png')):
img.append(filename)
lable = filename[41:].split()[0].split('-')[1]
lbl.append(int(lable))
lables = np.array(lbl)
images = np.array(img)
# print(images[1], lables[1])
return images, lables
I pass the images and labels in to create the TFRecords:
def convert(image_paths, labels, out_path):
# Args:
# image_paths List of file-paths for the images.
# labels Class-labels for the images.
# out_path File-path for the TFRecords output file.
print("Converting: " + out_path)
# Number of images. Used when printing the progress.
num_images = len(image_paths)
# Open a TFRecordWriter for the output-file.
with tf.python_io.TFRecordWriter(out_path) as writer:
# Iterate over all the image-paths and class-labels.
for i, (path, label) in enumerate(zip(image_paths, labels)):
# Print the percentage-progress.
print_progress(count=i, total=num_images-1)
# Load the image-file using matplotlib's imread function.
img = imread(path)
# Convert the image to raw bytes.
img_bytes = img.tostring()
# Create a dict with the data we want to save in the
# TFRecords file. You can add more relevant data here.
data = \
{
'image': wrap_bytes(img_bytes),
'label': wrap_int64(label)
}
# Wrap the data as TensorFlow Features.
feature = tf.train.Features(feature=data)
# Wrap again as a TensorFlow Example.
example = tf.train.Example(features=feature)
# Serialize the data.
serialized = example.SerializeToString()
# Write the serialized data to the TFRecords file.
writer.write(serialized)
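One detail worth checking in this pipeline: the comment above says the image is loaded with matplotlib's imread, which returns float32 pixels (4 bytes each) for PNG files. If the parse() function (not shown) decodes the raw bytes with tf.decode_raw(..., tf.uint8), every example carries 4x the expected number of values, and tf.reshape(x, [-1, img_size, img_size, num_channels]) silently turns each image into four, which would exactly produce logits shape [40, 124] against labels shape [10]. A sketch of a fix on the writing side, under that assumption (and assuming numpy is imported as np):
# inside convert(), before serializing:
img = imread(path)
# force 1 byte per pixel so it matches a tf.decode_raw(..., tf.uint8) in parse()
img = (img * 255).astype(np.uint8)
img_bytes = img.tostring()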
My neural network code ends up outputting vectors with NaNs most of the time. The code is given below:
from __future__ import division, print_function
from six.moves import xrange
import time
import os
from glob import glob
from zipfile import ZipFile, ZIP_DEFLATED
import numpy as np
import tensorflow as tf
## Defining variables which have to be provided by user
## Defining the number of units in the RNN. This is also the size of the word
## and document embeddings
num_units = 100
##The number of data elements in a batch
batch_size = 1
##The folder where the npz files with the numpy arrays are stored.
npz_files_folder = "npz_files"
## Name of the file to which we want the model to be saved
model_file = "rnn_trial"
## Number of labels sampled from the noise for NCE
num_sampled = 50
## The dropout probability for the NN
dropout = 0.2
## The learning rate for the optimizer
lr = 0.1
## The number of epochs
epochs = 10
## Reading in the list of npz files with vectors for each document
doc_files = sorted(glob(os.path.join(npz_files_folder, "*.npz")))
num_classes = num_docs = len(doc_files)
## The tensor for storing a batch of sentences where each sentence is a
## sequence of word embeddings. This is an input to the NN
sentences = tf.placeholder(tf.float32, [batch_size, None, num_units],
name='sentences')
## The tensor for storing a batch of documents where each document is a
## sequence of sentence embeddings. This is an input to the NN
documents = tf.placeholder(tf.float32, [batch_size, None, num_units])
## The tensor for storing the labels for each batch of documents
labels = tf.placeholder(tf.float32, [batch_size])
## Here we define the LSTM used in the first layer
sent_lstm = tf.contrib.rnn.BasicLSTMCell(num_units)
sent_lstm = tf.contrib.rnn.DropoutWrapper(sent_lstm,
output_keep_prob=1.0-dropout)
## We define the initial_state of the LSTM in first layer here
initial_state_sent_lstm = sent_lstm.zero_state(batch_size, tf.float32)
## Here we get the outputs and states from the first layer
outputs_lstm, states_lstm = tf.nn.dynamic_rnn(sent_lstm,
inputs=sentences, initial_state=initial_state_sent_lstm)
## Here we define the forward GRU used in the second layer
doc_gru_fw = tf.contrib.rnn.GRUCell(num_units//2)
initial_state_doc_gru_fw = doc_gru_fw.zero_state(batch_size, tf.float32)
## Here we define the reverse GRU used in second layer.
doc_gru_bw = tf.contrib.rnn.GRUCell(num_units-num_units//2)
initial_state_doc_gru_bw = doc_gru_bw.zero_state(batch_size, tf.float32)
## Here we get the outputs and states from the second layer
outputs_gru, states_gru = tf.nn.bidirectional_dynamic_rnn(cell_fw=doc_gru_fw,
cell_bw=doc_gru_bw, initial_state_fw=initial_state_doc_gru_fw,
initial_state_bw=initial_state_doc_gru_bw,
inputs=documents)
# outputs_gru, states_gru = tf.nn.bidirectional_dynamic_rnn(cell_fw=doc_gru_fw,
# cell_bw=doc_gru_bw,
# inputs=documents, dtype=tf.float32)
## The final document embeddings
final_output = tf.reduce_mean(tf.concat(outputs_gru, 2), axis=1)
sigmoid_W = tf.Variable(
tf.truncated_normal([num_units, 1],
stddev=1.0/np.sqrt(num_units)))
sigmoid_b = tf.Variable(tf.zeros([1], dtype=tf.float32))
logits = tf.matmul(final_output, sigmoid_W) + sigmoid_b
y_ = (num_docs - 1) * tf.sigmoid(tf.reshape(logits, [-1]))
loss = tf.reduce_sum(tf.square(y_ - labels))
## Defining the training step
train = tf.train.AdamOptimizer(lr).minimize(loss)
## Initializing the session
sess = tf.Session()
## Initializing the variables
sess.run(tf.global_variables_initializer())
t = time.time()
for n in xrange(epochs):
result = False
for j, doc in enumerate(doc_files):
# if j==100:
# break
try:
npz_file = np.load(doc, allow_pickle=False)
except ValueError:
continue
train_label = np.array([j])
sent_files = sorted(npz_file.files)
temp_doc = np.array([])
temp_doc = np.reshape(temp_doc, (0, num_units))
for i, sent_file in enumerate(sent_files):
sent_input = np.reshape(npz_file[sent_file], (1, -1, num_units))
if 0 in sent_input.shape:
continue
output_1 = sess.run(outputs_lstm,
feed_dict={sentences: sent_input})
sent_embed = output_1[:, -1:]
temp_doc = np.concatenate([temp_doc] + list(sent_embed), 0)
## Training the model
temp_doc = np.array([temp_doc])
_, doc_vector = sess.run([train, final_output], feed_dict={
documents: temp_doc, labels: train_label})
if np.isnan(np.sum(doc_vector)):
result = True
print(result)
print("Finished with epoch ", n)
print()
doc_vecs_file_name = model_file + "_doc_vecs.zip"
with ZipFile(doc_vecs_file_name, 'w', ZIP_DEFLATED, True) as myzip:
for doc in doc_files:
# if doc_files.index(doc)==100:
# break
try:
npz_file = np.load(doc, allow_pickle=False)
except ValueError:
continue
sent_files = sorted(npz_file.files)
temp_doc = np.array([])
temp_doc = np.reshape(temp_doc, (0, num_units))
for i, sent_file in enumerate(sent_files):
sent_input = np.reshape(npz_file[sent_file], (1, -1, num_units))
if 0 in sent_input.shape:
continue
output_1 = sess.run(outputs_lstm,
feed_dict={sentences: sent_input})
sent_embed = output_1[:, -1:]
temp_doc = np.concatenate([temp_doc] + list(sent_embed), 0)
## Training the model
temp_doc = np.array([temp_doc])
doc_vec = sess.run(final_output, feed_dict={documents: temp_doc})
temp_file = doc.split(os.sep)[-1][:-4] + ".csv"
np.savetxt(temp_file, doc_vec, delimiter=',')
myzip.write(temp_file)
os.remove(temp_file)
saver = tf.train.Saver()
saver.save(sess, model_file)
print("Time taken = ", (time.time() - t))
If needed, I can upload a sample data set which you can use to try running the code yourself. With that sample data set, occasionally the training is completed without any NaNs creeping in. But, most of the time, NaNs pop up while training.
I am using TensorFlow version 1.1.0 along with Python 2.7.13 from the Anaconda distribution.
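To localize where the NaNs first appear, TensorFlow 1.x has tf.add_check_numerics_ops(), which makes the session raise as soon as any op in the graph produces a NaN or Inf; a minimal sketch (build the op once after the graph is defined, then run it alongside the train op):
check_op = tf.add_check_numerics_ops()
_, doc_vector, _ = sess.run([train, final_output, check_op], feed_dict={
    documents: temp_doc, labels: train_label})
Lowering the learning rate (lr = 0.1 is high for Adam) or clipping gradients would also be worth trying.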