How to obtain the encoder from the make_csv_dataset? - tensorflow

I used this code from the tutorial:
def get_train_dataset(file_path, **kwargs):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=10,  # Artificially small to make examples easier to show.
        label_name=LABEL_COLUMN,
        na_value="?",
        num_epochs=1,
        ignore_errors=True,
        select_columns=CSV_COLUMNS,
        **kwargs)
    return dataset
Then I created the train set:
raw_train_data = get_train_dataset(train_file_path)
to train the model.
The question is: how do I get the encoder that was used for the training data, so I can encode new text with it?
I loaded the new data, but this doesn't use the same encoder as the training data:
raw_test_data = get_test_dataset(new_data_file_path)
How to obtain the original encoder when using tf.data.experimental.make_csv_dataset?
EDIT:
train_file_path = "./train.csv"
test_file_path = "./test.csv"
LABEL_COLUMN = 'target'
CSV_COLUMNS = ['text', 'target']
def get_train_dataset(file_path, **kwargs):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=10,  # Artificially small to make examples easier to show.
        label_name=LABEL_COLUMN,
        na_value="?",
        num_epochs=1,
        ignore_errors=True,
        select_columns=CSV_COLUMNS,
        **kwargs)
    return dataset

def get_test_dataset(file_path, **kwargs):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=10,  # Artificially small to make examples easier to show.
        na_value="?",
        num_epochs=1,
        ignore_errors=True,
        **kwargs)
    return dataset
sample_submission = pd.read_csv("./sample_submission.csv")
raw_train_data = get_train_dataset(train_file_path)
raw_test_data = get_test_dataset(test_file_path)
def extract_train_tensor(example, label):
    print(example)
    return example['text'], label

def extract_test_tensor(example):
    print(example)
    return example['text']
test_data = raw_test_data.map(lambda ex: extract_test_tensor(ex))
test_data_size = len(list(test_data))
print("test size: ", test_data_size)
train_data_all = raw_train_data.map(lambda ex, label: extract_train_tensor(ex, label))
train_data_all = train_data_all.shuffle(10000)
print(train_data_all)
train_data_size = len(list(train_data_all))
print("train size: ", train_data_size)
train_size = int(0.7 * train_data_size)
val_size = int(0.3 * train_data_size)
train_data = train_data_all.take(train_size)
val_data = train_data_all.skip(train_size)
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(embedding, input_shape=[],
                           dtype=tf.string, trainable=True)

model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.summary()

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(train_data,
                    epochs=20,
                    validation_data=val_data,
                    verbose=1)
import numpy as np
predictions = model.predict(test_data)
predictions = np.where(predictions > 0.5, 1, 0)
sample_submission['target'] = predictions
print(predictions)
The two calls to get_train_dataset() and get_test_dataset() generate the train and test data. The train data is split into train and validation sets, and the validation accuracy is great. However, the test accuracy is very low. Both data sets are strings of text, and I didn't do any encoding.

tf.data.experimental.make_csv_dataset does not do any encoding.
From its documentation, it:
Reads CSV files into a dataset, where each element is a (features,
labels) tuple that corresponds to a batch of CSV rows. The features
dictionary maps feature column names to Tensors containing the
corresponding feature data, and labels is a Tensor containing the
batch's label data.
So your get_test_dataset() function does not need to know anything about how get_train_dataset() generated its dataset.
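That said, if you do want an explicit, learned encoder, a minimal sketch (my own addition, assuming TF 2.6+ where tf.keras.layers.TextVectorization is available) is to fit the encoder on the training text once and reuse that same layer on the test set:
# Build one encoder from the training text and reuse it everywhere.
encoder = tf.keras.layers.TextVectorization(max_tokens=10000)
encoder.adapt(raw_train_data.map(lambda features, label: features['text']))

# Apply the *same* fitted encoder to both datasets.
train_encoded = raw_train_data.map(
    lambda features, label: (encoder(features['text']), label))
test_encoded = raw_test_data.map(
    lambda features: encoder(features['text']))
With the TF Hub embedding used in the model above, though, no such encoder is needed: the hub layer consumes raw strings directly.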
Regarding the low test performance:
You trained and validated your model on randomly drawn, possibly overlapping subsets of the same data:
train_data = train_data_all.take(train_size)
val_data = train_data_all.skip(train_size)
Because shuffle() reshuffles on every iteration by default, take() and skip() can yield some identical samples, so your validation is not a real measurement of the model's accuracy.
By contrast, the model never saw the test set's samples, so prediction on that set is a reliable measurement of performance.
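If you want the take()/skip() split to be valid, a minimal fix (a sketch of mine, not from the original answer) is to pin the shuffle order so the two subsets are disjoint:
# Shuffle once with a fixed order; by default shuffle() reshuffles on
# every iteration, which lets take() and skip() overlap across epochs.
train_data_all = train_data_all.shuffle(
    10000, seed=42, reshuffle_each_iteration=False)
train_data = train_data_all.take(train_size)
val_data = train_data_all.skip(train_size)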

Related

How to "update" from module tf.keras.preprocessing.image to tf.keras.utils.image_dataset_from_directory for features extraction

This part of the code is common to both "problematic" snippets below:
BATCH_SIZE = 32
IM_DIR = '/content/drive/My Drive/101_ObjectCategories/'
IM_HEIGHT = 224
IM_WIDTH = 224
NUM_IM = 8686
NUM_EPOCHS = int(math.ceil(NUM_IM / BATCH_SIZE))

# load pre-trained base model
model = ResNet50(weights='imagenet',
                 include_top=False,
                 input_shape=(IM_WIDTH, IM_HEIGHT, CH),  # CH: number of color channels, defined elsewhere (3 for RGB)
                 pooling='max')
The following code, which uses the tf.keras.preprocessing.image module, is what I successfully use to extract features from a set of images.
datagen = tf.keras.preprocessing.image.ImageDataGenerator(preprocessing_function=preprocess_input)
dataset = datagen.flow_from_directory(IM_DIR,
                                      target_size=(IM_HEIGHT, IM_WIDTH),
                                      class_mode=None,
                                      shuffle=False)
feature_list = []
feature_list = model.predict(dataset, num_epochs)
Thereafter I train a simple nearest-neighbor model with the brute-force algorithm and am able to find three other images that are really similar to the query image, as you can see below.
But as pointed out in the documentation, this preprocessing module is deprecated.
So I would like to "update" the code as suggested in the documentation: "Prefer loading data with tf.keras.utils.image_dataset_from_directory, and then transforming the output tf.data.Dataset with preprocessing layers".
For that I'm trying the following:
# load images
dataset = tf.keras.utils.image_dataset_from_directory(
    IM_DIR,
    labels='inferred',         # 'inferred', None
    label_mode='categorical',  # 'int', 'categorical', 'binary' or None
    class_names=None,
    color_mode='rgb',          # 'grayscale', 'rgb' or 'rgba'
    batch_size=BATCH_SIZE,
    image_size=(IM_HEIGHT, IM_WIDTH),
    shuffle=True,
    seed=51719,
    validation_split=None,
    subset=None,               # 'training', 'validation' or 'both'
    interpolation='bilinear',  # 'bilinear', 'nearest', 'bicubic', 'area', 'lanczos3', 'lanczos5', 'gaussian' or 'mitchellcubic'
    follow_links=False,
    crop_to_aspect_ratio=False
)
#"transforming the output with preprocessing layers"
#rescale (normalize) dataset
rescale_layer = tf.keras.layers.Rescaling(1./255)
rescaled_dataset = dataset.map(lambda x, y: (rescale_layer(x), y))
im_batch, labels_batch = next(iter(rescaled_dataset))
#configure dataset for performance
#https://www.tensorflow.org/tutorials/load_data/images#configure_the_dataset_for_performance
AUTOTUNE = tf.data.AUTOTUNE
tuned_dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
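(Note, hedged rather than definitive: the tuned pipeline above chains from dataset, not rescaled_dataset, so no rescaling is actually applied; also, the first pipeline used ResNet50's preprocess_input rather than 1/255 scaling, and shuffle=False, which keeps features aligned with filenames. A sketch of a loading pipeline consistent with the first one, for comparison:)
# Sketch for comparison; assumes the same IM_DIR and model as above.
from tensorflow.keras.applications.resnet50 import preprocess_input

comparison_dataset = tf.keras.utils.image_dataset_from_directory(
    IM_DIR,
    label_mode=None,    # images only, as with class_mode=None above
    batch_size=BATCH_SIZE,
    image_size=(IM_HEIGHT, IM_WIDTH),
    shuffle=False)      # keep order aligned with the file names
comparison_dataset = comparison_dataset.map(preprocess_input)
comparison_dataset = comparison_dataset.cache().prefetch(buffer_size=tf.data.AUTOTUNE)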
And now I begin with the feature extraction:
# feature extraction
# https://www.tensorflow.org/api_docs/python/tf/keras/Model#predict
feature_list = model.predict(
    tuned_dataset,
    batch_size=None,
    verbose='auto',
    steps=None,
    callbacks=None,
    max_queue_size=10,
    workers=1,
    use_multiprocessing=False
)

# save features
pickle.dump(
    feature_list,
    open(DATA_DIR + 'features.pickle', 'wb'))
After that I do the same and train the nearest-neighbor model on these features, but the results are catastrophic, as you can see below.
What am I doing so wrong that I get such different results?
== EDIT 1 ==
Answering @DWKOT: using the same image, we have the following results:
# Query image with first code
im_idx = 75
distances, indices = neighbors.kneighbors([feature_list[im_idx]])
plt.imshow(mpimg.imread(filenames[im_idx]), interpolation='lanczos')
# Similar image
plt.imshow(mpimg.imread(filenames[indices[0][1]]), interpolation='lanczos')
And the code that gives us the distances to the 5 nearest neighbors:
for i in range(5):
    print(distances[0][i])
With the following results:
0.0
185.60701
185.75049
195.71657
196.4056
With the second code we have the following result for the query / similar images (images not shown here).
And the following distances for the first five "similar" images:
0.0
0.81401
0.88622
0.92734
0.9346
This is also strange, as I would expect similar images to have values close to zero and different ones far from zero...

Unexpected behavior when subclassing TensorFlow Keras Model with 3D input to a dense layer

I am comparing two simple single-dense-layer regressors. They differ only in that the second one is passed indices of the data in its call function rather than the data itself, and uses the indices to retrieve the data for training.
class DenseRegressorV1(tf.keras.Model):
    def __init__(self, *args, **kwargs):
        super(DenseRegressorV1, self).__init__(*args, **kwargs)
        self.dense_layer = layers.Dense(units=1, name="dense")

    def call(self, inputs):
        logits = self.dense_layer(inputs)
        return logits

class DenseRegressorV2(tf.keras.Model):
    def __init__(self, data, *args, **kwargs):
        super(DenseRegressorV2, self).__init__(*args, **kwargs)
        self.data = data
        self.dense_layer = layers.Dense(units=1, name="dense")

    def call(self, inputs_idx):
        inputs = tf.gather(self.data, inputs_idx)
        logits = self.dense_layer(inputs)
        return logits
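As a shape sanity check (an illustrative snippet of my own, not from the question), the gather in V2's call turns a batch of indices into a batch of data rows:
# With data of shape (N, 80, 20) and indices of shape (B,),
# tf.gather returns a (B, 80, 20) batch, like fancy indexing.
data = tf.random.uniform((100, 80, 20))
idx = tf.constant([3, 7, 42])
print(tf.gather(data, idx).shape)  # (3, 80, 20)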
The input data is 3D. The y is a simple linear sum of x with some noise, which should be easy to predict. The data is generated as below:
x_train_3D = tf.random.uniform((80000, 80, 20))
y_train_3D = tf.expand_dims(tf.reduce_sum(x_train_3D, axis=2), axis=2) + 2 * tf.random.uniform((80000, 80, 1))
x_train_3D_idx = np.array(range(0, len(x_train_3D)))
The unexpected behavior is that V1 trains correctly and its val_loss converges to a very small number, but V2 does not, although I think they should perform the same.
I use the code below for training:
def train_model(model, x_train, y_train, num_epochs, batch_size, learning_rate):
    optimizer = keras.optimizers.Adam(learning_rate)
    tf.keras.metrics.RootMeanSquaredError()  # note: this metric instance is created but never used
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.mse,
        metrics=['mse'],
    )
    early_stopping = keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=20, restore_best_weights=True
    )
    model_history = model.fit(
        x=x_train,
        y=y_train,
        epochs=num_epochs,
        batch_size=batch_size,
        validation_split=0.18,
        callbacks=[early_stopping]
    )
    return model_history

dr_model_v1 = DenseRegressorV1()
dr_model_v2 = DenseRegressorV2(x_train_3D)
train_model(dr_model_v1, x_train_3D, y_train_3D, num_epochs=500, batch_size=1000, learning_rate=0.01)
train_model(dr_model_v2, x_train_3D_idx, y_train_3D, num_epochs=500, batch_size=1000, learning_rate=0.01)
So my questions are:
What causes the difference in training, and how can we fix the V2 model?
If I flatten the training data to 2D before feeding it into the model, I get the expected results for the V2 model. Why does the shape make a difference here?
Here is how I transformed the data to 2D for training with V2:
x_train_2D = tf.reshape(x_train_3D, (x_train_3D.get_shape()[0] * x_train_3D.get_shape()[1], x_train_3D.get_shape()[2]))
y_train_2D = tf.reshape(y_train_3D, (y_train_3D.get_shape()[0] * y_train_3D.get_shape()[1], y_train_3D.get_shape()[2]))
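For what it's worth, a Dense layer applied to a 3D tensor operates on the last axis only, so the per-sample arithmetic is the same in both layouts (an illustrative check of mine, not from the question):
# The same Dense(1) maps (B, 80, 20) -> (B, 80, 1) and (B*80, 20) -> (B*80, 1).
dense = tf.keras.layers.Dense(1)
x3 = tf.random.uniform((2, 80, 20))
print(dense(x3).shape)                         # (2, 80, 1)
print(dense(tf.reshape(x3, (160, 20))).shape)  # (160, 1)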

Keras Sequential Model accuracy is bad. Model is Ignoring/neglecting a class

A little background: I'm making a simple rock, paper, scissors image classifier program. Basically, I want the image classifier to be able to distinguish between a rock, paper, or scissors image.
Problem: The program works amazingly well for two of the classes, rock and paper, but completely fails whenever given a scissors test image. I've tried increasing my training data and a few other things, but no luck. I was wondering if anyone has any ideas on how to offset this.
Side note: I suspect it also has something to do with overfitting. I say this because the model has about 92% accuracy on the training data but 55% accuracy on the test data.
import numpy as np
import os
import cv2
import random
import tensorflow as tf
from tensorflow import keras
CATEGORIES = ['rock', 'paper', 'scissors']
IMG_SIZE = 400 # The size of the images that your neural network will use
CLASS_SIZE = len(CATEGORIES)
TRAIN_DIR = "../Train/"
def loadData(directoryPath):
    data = []
    for category in CATEGORIES:
        path = os.path.join(directoryPath, category)
        class_num = CATEGORIES.index(category)
        for img in os.listdir(path):
            try:
                img_array = cv2.imread(os.path.join(path, img), cv2.IMREAD_GRAYSCALE)
                new_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))
                data.append([new_array, class_num])
            except Exception as e:
                pass
    return data
training_data = loadData(TRAIN_DIR)
random.shuffle(training_data)

X = []  # features
y = []  # labels
for i in range(len(training_data)):
    features = training_data[i][0]
    label = training_data[i][1]
    X.append(features)
    y.append(label)

X = np.array(X)
y = np.array(y)
X = X/255.0
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(IMG_SIZE, IMG_SIZE)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(CLASS_SIZE)
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.fit(X, y, epochs=25)
TEST_DIR = "../Test/"
test_data = loadData( TEST_DIR )
random.shuffle(test_data)
test_images = []
test_labels = []
for i in range(len(test_data)):
features = test_data[i][0]
label = test_data[i][1]
test_images.append(features)
test_labels.append(label)
test_images = np.array(test_images)
test_images = test_images/255.0
test_labels = np.array(test_labels)
test_loss, test_acc = model.evaluate(test_images, test_labels, verbose=2)
print('\nTest accuracy:', test_acc)
# Saving the model
model_json = model.to_json()
with open("model.json", "w") as json_file :
json_file.write(model_json)
model.save_weights("model.h5")
print("Saved model to disk")
model.save('CNN.model')
If you want to create a massive amount of training data fast: https://github.com/ThomasStuart/RockPaperScissorsMachineLearning/blob/master/source/0.0-collectMassiveData.py
Thanks in advance to any help or ideas :)
You can simply test for overfitting by adding two additional layers: one dropout layer and one dense layer. Also, be sure to shuffle your train_data after each epoch, so the model keeps its learning general. And if I see this correctly, you are doing multi-class classification but do not have a softmax activation in the last layer; I would recommend using one.
With dropout and softmax, your model would look like this:
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(IMG_SIZE, IMG_SIZE)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.4),  # 0.4 means 40% of the neurons will be randomly unused
    keras.layers.Dense(CLASS_SIZE, activation="softmax")
])
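One caveat (my addition, not part of the original answer): the question's compile call uses SparseCategoricalCrossentropy(from_logits=True); once a softmax is on the last layer, the loss must be told the outputs are already probabilities:
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])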
As a last piece of advice: CNNs generally perform much better on tasks like this. You might want to switch to a CNN for even better performance.
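For instance, a minimal CNN sketch (my own illustration; it assumes the grayscale inputs get a trailing channel axis first, e.g. X = X[..., np.newaxis]):
model = keras.Sequential([
    keras.layers.Conv2D(32, 3, activation='relu', input_shape=(IMG_SIZE, IMG_SIZE, 1)),
    keras.layers.MaxPooling2D(),
    keras.layers.Conv2D(64, 3, activation='relu'),
    keras.layers.MaxPooling2D(),
    keras.layers.Flatten(),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.4),
    keras.layers.Dense(CLASS_SIZE, activation='softmax')
])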

How to load MNIST via TensorFlow (including download)?

The TensorFlow documentation for MNIST recommends multiple different ways to load the MNIST dataset:
https://www.tensorflow.org/tutorials/layers
https://www.tensorflow.org/versions/r1.2/get_started/mnist/beginners
https://www.tensorflow.org/versions/r1.2/get_started/mnist/pros
All of the ways described in the documentation throw many deprecation warnings with TensorFlow 1.8.
The way I'm currently loading MNIST and creating batches for training:
class MNIST:
    def __init__(self, optimizer):
        ...
        self.mnist_dataset = input_data.read_data_sets("/tmp/data/", one_hot=True)
        self.test_data = self.mnist_dataset.test.images.reshape((-1, self.timesteps, self.num_input))
        self.test_label = self.mnist_dataset.test.labels
        ...

    def train_run(self, sess):
        batch_input, batch_output = self.mnist_dataset.train.next_batch(self.batch_size, shuffle=True)
        batch_input = batch_input.reshape((self.batch_size, self.timesteps, self.num_input))
        _, loss = sess.run(fetches=[self.train_step, self.loss], feed_dict={self.input_placeholder: batch_input, self.output_placeholder: batch_output})
        ...

    def test_run(self, sess):
        loss = sess.run(fetches=[self.loss], feed_dict={self.input_placeholder: self.test_data, self.output_placeholder: self.test_label})
        ...
How could I do exactly the same thing, just with the current method of doing this?
I couldn't find any documentation on this.
It seems to me that the new way is something along the lines of:
train, test = tf.keras.datasets.mnist.load_data()
self.mnist_train_ds = tf.data.Dataset.from_tensor_slices(train)
self.mnist_test_ds = tf.data.Dataset.from_tensor_slices(test)
But how can I use these datasets in my train_run and test_run method?
An example of loading the MNIST dataset using TF dataset API:
Create a mnist dataset to load train, valid and test images:
You can create a dataset for numpy inputs, either using Dataset.from_tensor_slices or Dataset.from_generator. Dataset.from_tensor_slices adds the whole dataset to the computational graph, so we will use Dataset.from_generator instead.
# load mnist data
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

def create_mnist_dataset(data, labels, batch_size):
    def gen():
        for image, label in zip(data, labels):
            yield image, label
    ds = tf.data.Dataset.from_generator(gen, (tf.float32, tf.int32), ((28, 28), ()))
    return ds.repeat().batch(batch_size)

# train and validation dataset with different batch size
train_dataset = create_mnist_dataset(x_train, y_train, 10)
valid_dataset = create_mnist_dataset(x_test, y_test, 20)
#train and validation dataset with different batch size
train_dataset = create_mnist_dataset(x_train, y_train, 10)
valid_dataset = create_mnist_dataset(x_test, y_test, 20)
A feedable iterator that can toggle between training and validation:
handle = tf.placeholder(tf.string, shape=[])
iterator = tf.data.Iterator.from_string_handle(
    handle, train_dataset.output_types, train_dataset.output_shapes)
image, label = iterator.get_next()

train_iterator = train_dataset.make_one_shot_iterator()
valid_iterator = valid_dataset.make_one_shot_iterator()
A sample run:
# A toy network
y = tf.layers.dense(tf.layers.flatten(image), 1, activation=tf.nn.relu)
loss = tf.losses.mean_squared_error(tf.squeeze(y), label)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # The `Iterator.string_handle()` method returns a tensor that can be evaluated
    # and used to feed the `handle` placeholder.
    train_handle = sess.run(train_iterator.string_handle())
    valid_handle = sess.run(valid_iterator.string_handle())

    # Run training
    train_loss, train_img, train_label = sess.run([loss, image, label],
                                                  feed_dict={handle: train_handle})
    # train_img.shape = (10, 28, 28)

    # Run validation
    valid_pred, valid_img = sess.run([y, image],
                                     feed_dict={handle: valid_handle})
    # valid_img.shape = (20, 28, 28)
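For completeness, the same loading and batching in TF 2.x needs none of the handle/iterator machinery above; a sketch (my addition, TF 2.x eager mode):
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
train_ds = tf.data.Dataset.from_tensor_slices(
    (x_train.astype('float32'), y_train)).shuffle(60000).batch(10)
valid_ds = tf.data.Dataset.from_tensor_slices(
    (x_test.astype('float32'), y_test)).batch(20)

# Datasets are plain Python iterables in eager mode.
for images, labels in train_ds.take(1):
    print(images.shape, labels.shape)  # (10, 28, 28) (10,)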

Do not save a checkpoint for the final step with Estimator

I use Estimator and I train the model in a loop to feed data. Because each call runs only a single step, every step is a final step, and a checkpoint is saved at every final step as well. I want to avoid saving a checkpoint on every iteration to improve the training speed.
I cannot find any information on how to do this. Do you have any ideas/suggestions/solutions?
classifier = Estimator(
    model_fn=cnn_model_fn,
    model_dir="./temp_model_Adam",
    config=tf.contrib.learn.RunConfig(
        save_checkpoints_secs=None,
        save_checkpoints_steps=100,
        save_summary_steps=None
    )
)

# Train the model
for e in range(0, 10):
    numbers = np.arange(10000)
    np.random.shuffle(numbers)
    for step in range(0, 2000):
        classifier.fit(
            input_fn=lambda: read_images_for_training_as_batch(step, path, 5, numbers),
            steps=1
        )
Nowadays the API has changed a bit, but from what I can see you were using the fit (now train) method incorrectly: you should pass steps=2000 and have your input function return an iterator over your dataset. Today you have tf.estimator.inputs.numpy_input_fn at your disposal, which can help when you have a small data set; otherwise you have to use the tf.data.Dataset API.
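For example, a sketch of the numpy_input_fn route (my illustration, TF 1.x; x_train and y_train stand in for your numpy arrays, and the 'x' key must match what your model_fn expects):
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={'x': x_train},
    y=y_train,
    batch_size=64,
    num_epochs=None,  # repeat indefinitely; `steps` bounds the training
    shuffle=True)
classifier.train(input_fn=train_input_fn, steps=2000)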
Something like this (it loads .wav files):
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
from tensorflow.python.ops import io_ops
# ...

def input_fn(num_epochs, batch_size, shuffle=False, mode='training'):
    def input_fn_bound():
        def _read_file(fn, label):
            return io_ops.read_file(fn), label

        def _decode(data, label):
            pcm = contrib_audio.decode_wav(data,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
            return pcm.audio, label

        filenames = get_files(mode)
        classes = get_classes(mode)
        labels = {'class': np.array(classes)}
        dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
        if shuffle:
            dataset = dataset.shuffle(buffer_size=len(filenames))  # buffer over all files
        dataset = dataset.map(_read_file, num_parallel_calls=num_map_threads)
        dataset = dataset.map(_decode, num_parallel_calls=num_map_threads)
        dataset = dataset.map(lambda wav, label: ({'wav': wav}, label))
        dataset = dataset.repeat(num_epochs)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(2)  # To load the next batch while the current one is being processed on the GPU
        iter = dataset.make_one_shot_iterator()
        features, labels = iter.get_next()
        return features, labels
    return input_fn_bound
# ....
estimator.train(input_fn=input_fn(
                    num_epochs=None,
                    batch_size=64,
                    shuffle=True,
                    mode='training'),
                steps=10000)