How to get the labels from a TensorFlow dataset

ds_test = tf.data.experimental.make_csv_dataset(
    file_pattern="./dfj_test/part-*.csv.gz",
    batch_size=batch_size,
    num_epochs=1,
    #column_names=use_cols,
    label_name='label_id',
    #select_columns=select_cols,
    num_parallel_reads=30,
    compression_type='GZIP',
    shuffle_buffer_size=12800)
This is my test set during training. After training the model, I want to zip the predictions and labels columns for ds_test.
preds = model.predict(ds_test)
Getting the predictions is quite simple; they come back as a NumPy array. However, I don't know how to get the corresponding labels from ds_test.
I want to zip(preds, labels) for further analysis.
Any hint? Thanks.
(tf version 2.3.1)

You can map each example to return the field you want:
import numpy as np
import tensorflow as tf

# load some exemplary data
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
dataset = tf.data.experimental.make_csv_dataset(train_file_path, batch_size=100, num_epochs=1)

# get the field by unbatching
labels_iterator = dataset.unbatch().map(lambda x: x['survived']).as_numpy_iterator()
labels = np.array(list(labels_iterator))

# get the field by concatenating batches
labels_iterator = dataset.map(lambda x: x['survived']).as_numpy_iterator()
labels = np.concatenate(list(labels_iterator))
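For the original question, where ds_test was built with label_name='label_id', each element is a (features, labels) tuple, so the label comes from the second element of the tuple. A minimal sketch of pairing predictions with labels (assuming ds_test is created with shuffle=False, so the order is identical across passes):
# sketch: extract labels from a (features, labels) dataset and pair them with predictions;
# assumes the dataset is not shuffled, so both passes see the same order
labels_iterator = ds_test.map(lambda features, labels: labels).as_numpy_iterator()
labels = np.concatenate(list(labels_iterator))

preds = model.predict(ds_test)
pairs = list(zip(preds, labels))  # (prediction, label) pairs for further analysis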

Related

How to "update" from module tf.keras.preprocessing.image to tf.keras.utils.image_dataset_from_directory for features extraction

This part of the code is common to both "problematic" snippets below:
BATCH_SIZE = 32
IM_DIR = '/content/drive/My Drive/101_ObjectCategories/'
IM_HEIGHT = 224
IM_WIDTH = 224
NUM_IM = 8686
NUM_EPOCHS = int(math.ceil(NUM_IM / BATCH_SIZE))

# load pre-trained base model
model = ResNet50(weights='imagenet',
                 include_top=False,
                 input_shape=(IM_WIDTH, IM_HEIGHT, CH),
                 pooling='max')
I successfully use the following code to extract features from a set of images with the tf.keras.preprocessing.image module:
datagen = tf.keras.preprocessing.image.ImageDataGenerator(preprocessing_function=preprocess_input)
dataset = datagen.flow_from_directory(IM_DIR,
                                      target_size=(IM_HEIGHT, IM_WIDTH),
                                      class_mode=None,
                                      shuffle=False)
feature_list = []
feature_list = model.predict(dataset, num_epochs)
Thereafter I train a simple nearest-neighbor model using the brute-force algorithm, and I'm able to find three other images that are really similar to the query image, as you can see below:
But as pointed out in the documentation, this preprocessing module is deprecated.
So, I would like to "update" the code as suggested in the documentation: "Prefer loading data with tf.keras.utils.image_dataset_from_directory, and then transforming the output tf.data.Dataset with preprocessing layers".
For that I'm trying the following:
#load images
dataset = tf.keras.utils.image_dataset_from_directory(
    IM_DIR,
    labels='inferred',         # 'inferred', None
    label_mode='categorical',  # 'int', 'categorical', 'binary' or None
    class_names=None,
    color_mode='rgb',          # 'grayscale', 'rgb' or 'rgba'
    batch_size=BATCH_SIZE,
    image_size=(IM_HEIGHT, IM_WIDTH),
    shuffle=True,
    seed=51719,
    validation_split=None,
    subset=None,               # 'training', 'validation' or 'both'
    interpolation='bilinear',  # 'bilinear', 'nearest', 'bicubic', 'area', 'lanczos3', 'lanczos5', 'gaussian' or 'mitchellcubic'
    follow_links=False,
    crop_to_aspect_ratio=False
)
#"transforming the output with preprocessing layers"
#rescale (normalize) dataset
rescale_layer = tf.keras.layers.Rescaling(1./255)
rescaled_dataset = dataset.map(lambda x, y: (rescale_layer(x), y))
im_batch, labels_batch = next(iter(rescaled_dataset))
#configure dataset for performance
#https://www.tensorflow.org/tutorials/load_data/images#configure_the_dataset_for_performance
AUTOTUNE = tf.data.AUTOTUNE
tuned_dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
And now I begin with the feature extraction:
#feature extraction
#https://www.tensorflow.org/api_docs/python/tf/keras/Model#predict
feature_list = []
feature_list = model.predict(
    tuned_dataset,
    batch_size=None,
    verbose='auto',
    steps=None,
    callbacks=None,
    max_queue_size=10,
    workers=1,
    use_multiprocessing=False
)

#save features
pickle.dump(
    feature_list,
    open(DATA_DIR + 'features.pickle', 'wb'))
After that I do the same as before and train the nearest-neighbor model with these features, but the results are catastrophic, as you can see below:
What am I doing so wrong that I get such different results?
== EDIT 1 ==
Answering @DWKOT: using the same image we get the following results.
#query image with first code
im_idx = 75
distances, indices = neighbors.kneighbors([feature_list[im_idx]])
plt.imshow(mpimg.imread(filenames[im_idx]), interpolation='lanczos')

#similar image
plt.imshow(mpimg.imread(filenames[indices[0][1]]), interpolation='lanczos')
And the code that gives us the distances to the 5 nearest neighbors:
for i in range(5):
    print(distances[0][i])
With the following results:
0.0
185.60701
185.75049
195.71657
196.4056
With the second code we have the following result for the query / similar image:
And the following results for the first five "similar" images:
0.0
0.81401
0.88622
0.92734
0.9346
This is also strange, as I would expect similar images to have distances close to zero and different ones far from zero...
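One possible source of the discrepancy (an assumption, not a verified fix): the first pipeline preprocessed images with ResNet50's preprocess_input, while the second one rescales with 1./255 and then feeds the un-rescaled dataset to predict. A minimal sketch of keeping the tf.data pipeline's preprocessing equivalent to the original ImageDataGenerator setup:
# sketch, assuming the goal is to reproduce the first pipeline's preprocessing:
# apply the same preprocess_input used with ImageDataGenerator instead of Rescaling(1./255)
from tensorflow.keras.applications.resnet50 import preprocess_input

preprocessed_dataset = dataset.map(lambda x, y: (preprocess_input(x), y))
tuned_dataset = preprocessed_dataset.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
feature_list = model.predict(tuned_dataset)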

How to obtain the encoder from the make_csv_dataset?

I used this code from the tutorial:
def get_train_dataset(file_path, **kwargs):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=10,  # Artificially small to make examples easier to show.
        label_name=LABEL_COLUMN,
        na_value="?",
        num_epochs=1,
        ignore_errors=True,
        select_columns=CSV_COLUMNS,
        **kwargs)
    return dataset
Then I created the train set:
raw_train_data = get_train_dataset(train_file_path)
to train the model.
The question is: how do I get the encoder that was used for the training data, so I can encode new text?
I loaded the new data, but this doesn't use the same encoder as the training data:
raw_test_data = get_test_dataset(new_data_file_path)
How to obtain the original encoder when using tf.data.experimental.make_csv_dataset?
EDIT:
train_file_path = "./train.csv"
test_file_path = "./test.csv"

LABEL_COLUMN = 'target'
CSV_COLUMNS = ['text', 'target']

def get_train_dataset(file_path, **kwargs):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=10,  # Artificially small to make examples easier to show.
        label_name=LABEL_COLUMN,
        na_value="?",
        num_epochs=1,
        ignore_errors=True,
        select_columns=CSV_COLUMNS,
        **kwargs)
    return dataset

def get_test_dataset(file_path, **kwargs):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=10,  # Artificially small to make examples easier to show.
        na_value="?",
        num_epochs=1,
        ignore_errors=True,
        **kwargs)
    return dataset

sample_submission = pd.read_csv("./sample_submission.csv")
raw_train_data = get_train_dataset(train_file_path)
raw_test_data = get_test_dataset(test_file_path)

def extract_train_tensor(example, label):
    print(example)
    return example['text'], label

def extract_test_tensor(example):
    print(example)
    return example['text']

test_data = raw_test_data.map(lambda ex: extract_test_tensor(ex))
test_data_size = len(list(test_data))
print("test size: ", test_data_size)

train_data_all = raw_train_data.map(lambda ex, label: extract_train_tensor(ex, label))
train_data_all = train_data_all.shuffle(10000)
print(train_data_all)

train_data_size = len(list(train_data_all))
print("train size: ", train_data_size)

train_size = int(0.7 * train_data_size)
val_size = int(0.3 * train_data_size)
train_data = train_data_all.take(train_size)
val_data = train_data_all.skip(train_size)

embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(embedding, input_shape=[],
                           dtype=tf.string, trainable=True)

model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.summary()

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(train_data,
                    epochs=20,
                    validation_data=val_data,
                    verbose=1)

import numpy as np
predictions = model.predict(test_data)
predictions = np.where(predictions > 0.5, 1, 0)
sample_submission['target'] = predictions
print(predictions)
The two calls to get_train_dataset() and get_test_dataset() generate the train and test data. The train data is split into train and validation sets, and the accuracy is great. However, the test data accuracy is very low. Both data sets are strings of text and I didn't do any encoding.
tf.data.experimental.make_csv_dataset does not do any encoding.
According to the documentation, it:
Reads CSV files into a dataset, where each element is a (features,
labels) tuple that corresponds to a batch of CSV rows. The features
dictionary maps feature column names to Tensors containing the
corresponding feature data, and labels is a Tensor containing the
batch's label data.
So your get_test_dataset() function does not need to know anything about how get_train_dataset() generated the training dataset.
Regarding the low test performance:
You trained and validated your model on random combinations of samples drawn from the same data, with repetition:
train_data = train_data_all.take(train_size)
val_data = train_data_all.skip(train_size)
Thus you will probably have identical samples in both, so your validation does not give a real measurement of the model's accuracy.
By contrast, the model never saw the test set's samples, so prediction on this set is a reliable measurement of its performance.
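A minimal sketch of one way to keep the take/skip split disjoint (an assumption about the intended fix, not part of the original answer): fix the shuffle order so it is not reshuffled between the training and validation passes.
# sketch: a fixed shuffle order makes take()/skip() produce non-overlapping splits
train_data_all = raw_train_data.map(extract_train_tensor)
train_data_all = train_data_all.shuffle(10000, reshuffle_each_iteration=False)

train_data = train_data_all.take(train_size)
val_data = train_data_all.skip(train_size)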

Making prediction on Iris dataset

I have basic classification code for the Iris dataset.
import tensorflow as tf
import pandas as pd

COLUMN_NAMES = [
    'SepalLength',
    'SepalWidth',
    'PetalLength',
    'PetalWidth',
    'Species'
]

# Import training dataset
training_dataset = pd.read_csv('iris_training.csv', names=COLUMN_NAMES, header=0)
train_x = training_dataset.iloc[:, 0:4]
train_y = training_dataset.iloc[:, 4]

# Import testing dataset
test_dataset = pd.read_csv('iris_test.csv', names=COLUMN_NAMES, header=0)
test_x = test_dataset.iloc[:, 0:4]
test_y = test_dataset.iloc[:, 4]

columns_feat = [
    tf.feature_column.numeric_column(key='SepalLength'),
    tf.feature_column.numeric_column(key='SepalWidth'),
    tf.feature_column.numeric_column(key='PetalLength'),
    tf.feature_column.numeric_column(key='PetalWidth')
]

classifier = tf.estimator.DNNClassifier(
    feature_columns=columns_feat,
    # Two hidden layers of 10 nodes each.
    hidden_units=[10, 10],
    # The model is classifying 3 classes
    n_classes=3)

def train_function(inputs, outputs, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((dict(inputs), outputs))
    dataset = dataset.shuffle(1000).repeat().batch(batch_size)
    return dataset.make_one_shot_iterator().get_next()

# Train the Model.
classifier.train(
    input_fn=lambda: train_function(train_x, train_y, 100),
    steps=1000)

def evaluation_function(attributes, classes, batch_size):
    attributes = dict(attributes)
    if classes is None:
        inputs = attributes
    else:
        inputs = (attributes, classes)
    dataset = tf.data.Dataset.from_tensor_slices(inputs)
    assert batch_size is not None, "batch_size must not be None"
    dataset = dataset.batch(batch_size)
    return dataset.make_one_shot_iterator().get_next()

# Evaluate the model.
eval_result = classifier.evaluate(
    input_fn=lambda: evaluation_function(test_x, test_y, 100))
I evaluate the result, but how can I make a prediction on my own data? Right now I only get console info about loss, epochs, and accuracy. For example, suppose I have everything except the species: I want to give my own sepal length etc. and get a prediction of the species as another variable. Do I have to create variables like pred_x or pred_y (pandas DataFrame) and then put them into eval_result?
Is that what you mean? For example: new_samples = np.array([[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=np.float32). If you want to make predictions on new data like this, you can refer to this code: TensorFlow-Iris-Classification.
Like all estimator classes, the DNNClassifier class has a predict method that makes real-world predictions. The documentation is here.
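A minimal sketch of what that might look like for the code in the question, reusing evaluation_function with classes=None (the new_samples values are illustrative, taken from the comment above):
import numpy as np

# hypothetical new measurements: sepal length/width, petal length/width
new_samples = np.array([[6.4, 3.2, 4.5, 1.5],
                        [5.8, 3.1, 5.0, 1.7]], dtype=np.float32)
pred_x = dict(zip(['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth'],
                  new_samples.T))

predictions = classifier.predict(
    input_fn=lambda: evaluation_function(pred_x, None, batch_size=2))
for pred in predictions:
    print(pred['class_ids'])  # index of the predicted species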

Link prediction with input data

I have a list of files, and I use the KNN algorithm to classify these files.
dataset = pd.read_csv(file)
training_samples = get_sample_number(dataset)
X_train = dataset.iloc[:training_samples, 5:9]
y_train = dataset.iloc[:training_samples, 9]
X_test = dataset.iloc[training_samples:, 5:9]
# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)
# Fitting classifier to the training set
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
Now I have my categories in my y_pred array. But I want to save the result in the file where I read the dataset. How can I link a prediction to the right row in the file (or dataset)?
Your predictions in y_pred have a length of X_test.shape[0], which is obviously less than the length of the original dataset. If you want to attach the predictions to the original dataset that you read from file, you would need to make predictions on the whole dataset, and then do a simple concat to get it all together:
y_pred_all = classifier.predict(sc.transform(dataset.iloc[:, 5:9]))  # scale with the scaler fitted on the training data
dataset = pd.concat([dataset, pd.Series(y_pred_all, name='prediction')], axis=1)
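And to save the result back to the file the dataset was read from (writing to a new path works just as well):
# write the dataset, now including the prediction column, back to disk
dataset.to_csv(file, index=False)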

do not save check point for a final step for Estimator

I use Estimator and I train the model in a loop to feed the data. Every step is a final step, so a checkpoint is also saved at every step. I want to avoid saving a checkpoint in every iteration to increase the performance (speed) of training.
I cannot find any information on how to do this. Do you have any ideas/suggestions/solutions?
classifier = Estimator(
    model_fn=cnn_model_fn,
    model_dir="./temp_model_Adam",
    config=tf.contrib.learn.RunConfig(
        save_checkpoints_secs=None,
        save_checkpoints_steps=100,
        save_summary_steps=None
    )
)

# Train the model
for e in range(0, 10):
    numbers = np.arange(10000)
    np.random.shuffle(numbers)
    for step in range(0, 2000):
        classifier.fit(
            input_fn=lambda: read_images_for_training_as_batch(step, path, 5, numbers),
            steps=1
        )
Nowadays the API has changed a bit, but from what I see you were using the fit (currently train) method incorrectly: you should pass steps=2000 and have your input function return an iterator over your dataset. Today you have tf.estimator.inputs.numpy_input_fn at your disposal, which can help when you have small data sets; otherwise you have to use the tf.data.Dataset API.
Something like this (it loads .wav files):
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
from tensorflow.python.ops import io_ops
# ...

def input_fn(num_epochs, batch_size, shuffle=False, mode='training'):
    def input_fn_bound():
        def _read_file(fn, label):
            return io_ops.read_file(fn), label

        def _decode(data, label):
            pcm = contrib_audio.decode_wav(data,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
            return pcm.audio, label

        filenames = get_files(mode)
        classes = get_classes(mode)
        labels = {'class': np.array(classes)}
        dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
        if shuffle:
            dataset = dataset.shuffle(buffer_size=len(filenames))  # buffer covers the whole file list
        dataset = dataset.map(_read_file, num_parallel_calls=num_map_threads)
        dataset = dataset.map(_decode, num_parallel_calls=num_map_threads)
        dataset = dataset.map(lambda wav, label: ({'wav': wav}, label))
        dataset = dataset.repeat(num_epochs)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(2)  # load the next batch while the current one is being processed on GPU
        iterator = dataset.make_one_shot_iterator()
        features, labels = iterator.get_next()
        return features, labels
    return input_fn_bound
# ....
estimator.train(input_fn=input_fn(
                    num_epochs=None,
                    batch_size=64,
                    shuffle=True,
                    mode='training'),
                steps=10000)
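On the original concern about checkpoint frequency: once training runs as a single train call rather than a loop of steps=1 calls, checkpointing is controlled by the estimator's RunConfig. A sketch using the current tf.estimator API (the step values are illustrative):
# sketch: checkpoint at most every 1000 steps instead of at every fit/train call boundary
run_config = tf.estimator.RunConfig(
    save_checkpoints_steps=1000,   # illustrative value
    save_checkpoints_secs=None,
    save_summary_steps=500)        # illustrative value

classifier = tf.estimator.Estimator(
    model_fn=cnn_model_fn,
    model_dir="./temp_model_Adam",
    config=run_config)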