Related
I am following the official Tensorflow tutorial for preprocessing layers, and I am not sure I get why I end up getting these extra columns after the categorical encoding.
Here is a stripped-down minimal reproducible example (including the data):
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
import pathlib
# Location of the zipped PetFinder-mini dataset and the CSV path read below.
dataset_url = 'http://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip'
csv_file = 'datasets/petfinder-mini/petfinder-mini.csv'
# Download and extract the archive relative to the current directory
# (cache_dir='.'); csv_file above is presumably where the extracted CSV lands.
tf.keras.utils.get_file('petfinder_mini.zip', dataset_url, extract=True, cache_dir='.')
df = pd.read_csv(csv_file)
# In the original dataset "4" indicates the pet was not adopted.
# Binary label: 0 when AdoptionSpeed == 4, otherwise 1.
df['target'] = np.where(df['AdoptionSpeed']==4, 0, 1)
# Drop un-used columns.
df = df.drop(columns=['AdoptionSpeed', 'Description'])
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Convert a pandas DataFrame into a batched tf.data.Dataset.

    The 'target' column is split off as the label; the remaining columns
    become a dict of feature tensors keyed by column name.

    Args:
        dataframe: DataFrame containing a 'target' column. It is copied, so
            the caller's frame is left untouched.
        shuffle: when true, shuffle with a buffer as large as the frame.
        batch_size: number of rows per batch (also used as prefetch depth).

    Returns:
        A dataset yielding (features_dict, labels) batches.
    """
    features = dataframe.copy()
    target = features.pop('target')
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), target))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(features))
    return dataset.batch(batch_size).prefetch(batch_size)
batch_size = 5
ds = df_to_dataset(df, batch_size=batch_size)
[(train_features, label_batch)] = ds.take(1)
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Build a callable that encodes one categorical feature.

    Args:
        name: key of the feature in the dataset's feature dict.
        dataset: dataset yielding (features_dict, label) pairs.
        dtype: 'string' selects a StringLookup layer; any other value
            selects an IntegerLookup layer.
        max_tokens: optional vocabulary cap forwarded to the lookup layer.

    Returns:
        A function mapping raw feature values to the output of the adapted
        CategoryEncoding layer applied on top of the adapted lookup layer.
    """
    # Lookup layer: raw value -> integer index.
    if dtype == 'string':
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_values=max_tokens)

    # Adapt the lookup on the selected feature only (labels are dropped).
    raw_values = dataset.map(lambda x, y: x[name])
    index.adapt(raw_values)

    # Size the encoder to the lookup layer's full vocabulary, then adapt it
    # on the integer indices produced by the lookup.
    encoder = preprocessing.CategoryEncoding(max_tokens=index.vocab_size())
    encoder.adapt(raw_values.map(index))

    # The closure keeps both layers alive so they can be reused later,
    # e.g. inside a functional model.
    def encode(feature):
        return encoder(index(feature))

    return encode
So, after running
type_col = train_features['Type']
layer = get_category_encoding_layer('Type', ds, 'string')
layer(type_col)
I get a result as:
<tf.Tensor: shape=(5, 4), dtype=float32, numpy=
array([[0., 0., 1., 0.],
[0., 0., 1., 0.],
[0., 0., 0., 1.],
[0., 0., 0., 1.],
[0., 0., 0., 1.]], dtype=float32)>
similar to what is shown in the tutorial indeed.
Notice that this is a binary classification problem (Cat/Dog):
np.unique(type_col)
# array([b'Cat', b'Dog'], dtype=object)
So, what is the logic of the 2 extra columns after the categorical encoding shown in the result above? What do they represent, and why are there 2 of them (and not, say, 1, or 3, or more)?
(I am perfectly aware that, should I wish for a simple one-hot encoding, I could simply use to_categorical(), but this is not the question here)
As already implied in the question, categorical encoding is somewhat richer than simple one-hot encoding. To see what these two columns represent, it suffices to add a diagnostic print inside the get_category_encoding_layer() function (anywhere after the call to index.adapt()):
print(index.get_vocabulary())
Then the result of the last commands will be:
['', '[UNK]', 'Dog', 'Cat']
<tf.Tensor: shape=(5, 4), dtype=float32, numpy=
array([[0., 0., 1., 0.],
[0., 0., 1., 0.],
[0., 0., 0., 1.],
[0., 0., 0., 1.],
[0., 0., 0., 1.]], dtype=float32)>
The hint should hopefully be clear: the extra two columns here represent the empty value '' and unknown ones '[UNK]', respectively, which could be present in future (unseen) data.
This is actually determined from the default arguments, not of CategoryEncoding, but of the preceding StringLookup; from the docs:
mask_token=''
oov_token='[UNK]'
You can end up with a somewhat tighter encoding (only 1 extra column instead of 2) by disabling the mask token (mask_token=None) and asking for oov_token='' instead of oov_token='[UNK]'; replace the call to StringLookup in the get_category_encoding_layer() function with
index = preprocessing.StringLookup(oov_token='',mask_token=None, max_tokens=max_tokens)
after which, the result will be:
['', 'Dog', 'Cat']
<tf.Tensor: shape=(5, 3), dtype=float32, numpy=
array([[0., 1., 0.],
[0., 1., 0.],
[0., 0., 1.],
[0., 0., 1.],
[0., 0., 1.]], dtype=float32)>
i.e. with only 3 columns (without a dedicated one for '[UNK]'). AFAIK, this is the lowest you can go - attempting to set both mask_token and oov_token to None will result in an error.
I am following the official Tensorflow tutorial for preprocessing layers, and I am not sure I get why I end up getting these extra columns after the categorical encoding.
Here is a stripped-down minimal reproducible example (including the data):
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
import pathlib
# Location of the zipped PetFinder-mini dataset and the CSV path read below.
dataset_url = 'http://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip'
csv_file = 'datasets/petfinder-mini/petfinder-mini.csv'
# Download and extract the archive relative to the current directory
# (cache_dir='.'); csv_file above is presumably where the extracted CSV lands.
tf.keras.utils.get_file('petfinder_mini.zip', dataset_url, extract=True, cache_dir='.')
df = pd.read_csv(csv_file)
# In the original dataset "4" indicates the pet was not adopted.
# Binary label: 0 when AdoptionSpeed == 4, otherwise 1.
df['target'] = np.where(df['AdoptionSpeed']==4, 0, 1)
# Drop un-used columns.
df = df.drop(columns=['AdoptionSpeed', 'Description'])
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Convert a pandas DataFrame into a batched tf.data.Dataset.

    The 'target' column is split off as the label; the remaining columns
    become a dict of feature tensors keyed by column name.

    Args:
        dataframe: DataFrame containing a 'target' column. It is copied, so
            the caller's frame is left untouched.
        shuffle: when true, shuffle with a buffer as large as the frame.
        batch_size: number of rows per batch (also used as prefetch depth).

    Returns:
        A dataset yielding (features_dict, labels) batches.
    """
    features = dataframe.copy()
    target = features.pop('target')
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), target))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(features))
    return dataset.batch(batch_size).prefetch(batch_size)
batch_size = 5
ds = df_to_dataset(df, batch_size=batch_size)
[(train_features, label_batch)] = ds.take(1)
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Build a callable that encodes one categorical feature.

    Args:
        name: key of the feature in the dataset's feature dict.
        dataset: dataset yielding (features_dict, label) pairs.
        dtype: 'string' selects a StringLookup layer; any other value
            selects an IntegerLookup layer.
        max_tokens: optional vocabulary cap forwarded to the lookup layer.

    Returns:
        A function mapping raw feature values to the output of the adapted
        CategoryEncoding layer applied on top of the adapted lookup layer.
    """
    # Lookup layer: raw value -> integer index.
    if dtype == 'string':
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_values=max_tokens)

    # Adapt the lookup on the selected feature only (labels are dropped).
    raw_values = dataset.map(lambda x, y: x[name])
    index.adapt(raw_values)

    # Size the encoder to the lookup layer's full vocabulary, then adapt it
    # on the integer indices produced by the lookup.
    encoder = preprocessing.CategoryEncoding(max_tokens=index.vocab_size())
    encoder.adapt(raw_values.map(index))

    # The closure keeps both layers alive so they can be reused later,
    # e.g. inside a functional model.
    def encode(feature):
        return encoder(index(feature))

    return encode
So, after running
type_col = train_features['Type']
layer = get_category_encoding_layer('Type', ds, 'string')
layer(type_col)
I get a result as:
<tf.Tensor: shape=(5, 4), dtype=float32, numpy=
array([[0., 0., 1., 0.],
[0., 0., 1., 0.],
[0., 0., 0., 1.],
[0., 0., 0., 1.],
[0., 0., 0., 1.]], dtype=float32)>
similar to what is shown in the tutorial indeed.
Notice that this is a binary classification problem (Cat/Dog):
np.unique(type_col)
# array([b'Cat', b'Dog'], dtype=object)
So, what is the logic of the 2 extra columns after the categorical encoding shown in the result above? What do they represent, and why are there 2 of them (and not, say, 1, or 3, or more)?
(I am perfectly aware that, should I wish for a simple one-hot encoding, I could simply use to_categorical(), but this is not the question here)
As already implied in the question, categorical encoding is somewhat richer than simple one-hot encoding. To see what these two columns represent, it suffices to add a diagnostic print inside the get_category_encoding_layer() function (anywhere after the call to index.adapt()):
print(index.get_vocabulary())
Then the result of the last commands will be:
['', '[UNK]', 'Dog', 'Cat']
<tf.Tensor: shape=(5, 4), dtype=float32, numpy=
array([[0., 0., 1., 0.],
[0., 0., 1., 0.],
[0., 0., 0., 1.],
[0., 0., 0., 1.],
[0., 0., 0., 1.]], dtype=float32)>
The hint should hopefully be clear: the extra two columns here represent the empty value '' and unknown ones '[UNK]', respectively, which could be present in future (unseen) data.
This is actually determined from the default arguments, not of CategoryEncoding, but of the preceding StringLookup; from the docs:
mask_token=''
oov_token='[UNK]'
You can end up with a somewhat tighter encoding (only 1 extra column instead of 2) by disabling the mask token (mask_token=None) and asking for oov_token='' instead of oov_token='[UNK]'; replace the call to StringLookup in the get_category_encoding_layer() function with
index = preprocessing.StringLookup(oov_token='',mask_token=None, max_tokens=max_tokens)
after which, the result will be:
['', 'Dog', 'Cat']
<tf.Tensor: shape=(5, 3), dtype=float32, numpy=
array([[0., 1., 0.],
[0., 1., 0.],
[0., 0., 1.],
[0., 0., 1.],
[0., 0., 1.]], dtype=float32)>
i.e. with only 3 columns (without a dedicated one for '[UNK]'). AFAIK, this is the lowest you can go - attempting to set both mask_token and oov_token to None will result in an error.
I want to shift a tensor in a given axis. It's easy to do this in pandas or numpy. Like this:
import numpy as np
import pandas as pd
data = np.arange(0, 6).reshape(-1, 2)
pd.DataFrame(data).shift(1).fillna(0).values
Output is:
array([[0., 0.],
[0., 1.],
[2., 3.]])
But in tensorflow, the closest solution I found is tf.roll. However, it shifts the last row to the first row (which I don't want). So I have to use something like
tf.roll + tf.slice(remove the last row) + tf.concat(add tf.zeros to the first row).
It's really ugly.
Is there a better way to handle shift in tensorflow or keras?
Thanks.
I think I found a better way to solve this problem.
We could use tf.roll, then apply tf.math.multiply to set the first row to zeros.
Sample code is as follows:
Original tensor:
A = tf.cast(tf.reshape(tf.range(27), (-1, 3, 3)), dtype=tf.float32)
A
Output:
<tf.Tensor: id=117, shape=(3, 3, 3), dtype=float32, numpy=
array([[[ 0., 1., 2.],
[ 3., 4., 5.],
[ 6., 7., 8.]],
[[ 9., 10., 11.],
[12., 13., 14.],
[15., 16., 17.]],
[[18., 19., 20.],
[21., 22., 23.],
[24., 25., 26.]]], dtype=float32)>
Shift (like pd.shift):
B = tf.concat((tf.zeros((1, 3)), tf.ones((2, 3))), axis=0)
C = tf.expand_dims(B, axis=0)
tf.math.multiply(tf.roll(A, 1, axis=1), C)
Output:
<tf.Tensor: id=128, shape=(3, 3, 3), dtype=float32, numpy=
array([[[ 0., 0., 0.],
[ 0., 1., 2.],
[ 3., 4., 5.]],
[[ 0., 0., 0.],
[ 9., 10., 11.],
[12., 13., 14.]],
[[ 0., 0., 0.],
[18., 19., 20.],
[21., 22., 23.]]], dtype=float32)>
Try this:
import tensorflow as tf
input = tf.constant([[0, 1, 3], [4, 5, 6], [7, 8, 9]])
shifted_0dim = input[1:]
shifted_1dim = input[:, 1:]
shifted2 = input[2:]
Generalizing the accepted answer to arbitrary tensor shapes, desired shift, and axis to shift:
import tensorflow as tf
def tf_shift(tensor, shift=1, axis=0):
    """Shift `tensor` forward along `axis`, zero-filling the vacated slots.

    Implemented as tf.roll followed by a multiplication with a 0/1 mask that
    blanks the entries which wrapped around.

    Args:
        tensor: tensor with a statically known shape.
        shift: non-negative number of positions to shift by. Values larger
            than the axis length produce an all-zero result.
        axis: axis to shift along; negative values count from the last axis.

    Returns:
        A tensor of the same shape and dtype as `tensor`.

    Raises:
        ValueError: if `axis` is out of range or `shift` is negative.
    """
    dim = len(tensor.shape)
    if not -dim <= axis < dim:
        raise ValueError(
            f'Value of axis ({axis}) must be in the range [-{dim}, {dim}) '
            f'for a tensor with {dim} axes'
        )
    if shift < 0:
        raise ValueError(f'Value of shift ({shift}) must be non-negative')
    axis %= dim

    length = tensor.shape[axis]
    zero_dim = min(shift, length)
    # Build the mask in the tensor's own dtype so integer tensors work too.
    mask = tf.concat(
        [tf.zeros([zero_dim], dtype=tensor.dtype),
         tf.ones([length - zero_dim], dtype=tensor.dtype)],
        axis=0
    )
    # Size-1 everywhere except along `axis`, so the mask broadcasts over
    # all remaining dimensions.
    mask_shape = [1] * dim
    mask_shape[axis] = length
    mask = tf.reshape(mask, mask_shape)
    return tf.multiply(
        tf.roll(tensor, shift, axis),
        mask
    )
EDIT:
This code above doesn't allow for negative shift values, and is pretty slow. Here is a more efficient version utilizing tf.roll and tf.concat without creating a mask and multiplying the tensor of interest by it.
import tensorflow as tf
def tf_shift(values: tf.Tensor, shift: int = 1, axis: int = 0):
    """Shift `values` along `axis`, padding the vacated slots with zeros.

    Args:
        values: tensor with a statically known shape.
        shift: positions to shift by; positive moves entries towards higher
            indices, negative towards lower indices, 0 returns `values`
            unchanged. Magnitudes beyond the axis length give all zeros.
        axis: axis to shift along; negative values count from the last axis.

    Returns:
        A tensor with the same shape and dtype as `values`.

    Raises:
        ValueError: if `axis` is out of range for `values`.
    """
    dims = list(values.shape)
    rank = len(dims)
    if not -rank <= axis < rank:
        raise ValueError(
            f'axis ({axis}) is out of range for a tensor with {rank} axes'
        )
    # Normalise so the `i != axis` comparisons below also hold for
    # negative axes.
    axis %= rank

    if shift == 0:
        return values

    # Clamp: an over-long shift would otherwise give a negative slice size.
    offset = min(abs(shift), dims[axis])
    pad = tf.zeros(
        [val if i != axis else offset for i, val in enumerate(dims)],
        dtype=values.dtype,
    )
    # -1 means "everything that remains" for the untouched dimensions.
    size = [-1 if i != axis else val - offset for i, val in enumerate(dims)]

    if shift > 0:
        kept = tf.slice(values, [0] * rank, size)
        return tf.concat([pad, kept], axis=axis)

    begin = [0 if i != axis else offset for i in range(rank)]
    kept = tf.slice(values, begin, size)
    return tf.concat([kept, pad], axis=axis)
Assuming a 2d tensor, this function should mimic a Dataframe shift:
def shift_tensor(tensor, periods, fill_value):
    """Shift the rows of a 2-D tensor, similar to DataFrame.shift.

    Args:
        tensor: 2-D tensor (or array-like convertible to one) with a
            statically known shape.
        periods: rows to shift by; positive shifts rows down, negative
            shifts rows up, 0 leaves the tensor unchanged. Magnitudes larger
            than the number of rows yield a tensor made only of
            `fill_value`.
        fill_value: scalar written into the vacated rows; it is cast to the
            tensor's dtype so the concatenation cannot fail on a mismatch.

    Returns:
        A tensor with the same shape and dtype as `tensor`.
    """
    tensor = tf.convert_to_tensor(tensor)
    num_row, num_col = tensor.shape[0], tensor.shape[1]
    # Number of rows actually replaced; never more than the tensor has.
    count = min(abs(periods), num_row)
    pad = tf.fill([count, num_col], tf.cast(fill_value, tensor.dtype))
    if periods > 0:
        # Drop the last `count` rows and prepend the padding.
        return tf.concat((pad, tensor[:num_row - count, :]), axis=0)
    # Drop the first `count` rows and append the padding.
    return tf.concat((tensor[count:, :], pad), axis=0)
I'm trying to implement the model described by Professor Andrew Ng for object detection (explanation starts at 10:00).
He describes the first element of the output vector as the probability that an object was detected, followed by the coordinates of the bounding box of the object matched (when one is matched). The last part of the output vector is a softmax of all the classes your model knows.
As he explains it, using a simple squared error over the whole output vector is fine for the case when there is a detection, and just the squared difference (y^[0] - y[0])**2 for the case when there is none. I get that this is a naive approach. I just want to implement this for the learning experience.
My questions
How do I implement this conditional loss in tensorflow?
How do I handle this conditional about y^[0] when dealing with a batch.
How do I implement this conditional loss in tensorflow?
You can convert the loss function to:
Error = mask[0]*(y^[0]-y[0])**2 + mask[1]*(y^[1]-y[1])**2 + ... + mask[n]*(y^[n]-y[n])**2,
where mask = [1, 1,...1] for y[0] = 1 and [1, 0, ...0] for y[0] = 0
How do I handle this conditional about y^[0] when dealing with a
batch.
For a batch, you can construct the mask on the fly like:
# First column is always kept; the others copy y[:, 0] (0 or 1) regardless of
# the label values stored in y[:, 1:].
mask = tf.concat([tf.ones((tf.shape(y)[0], 1)), y[:, :1] * tf.ones_like(y[:, 1:])], axis=1)
Code:
# Demo of the masked squared error (TF1 graph mode).
# Predictions, plus two label batches: one with a detection (y[0] == 1) and
# one without (y[0] == 0).
y_hat_n = np.array([[3, 3, 3, 3], [3, 3, 3, 3]])
y_1 = np.array([[1, 1, 1, 1], [1, 1, 1, 1]])
y_0 = np.array([[0, 1, 1, 1], [0, 1, 1, 1]])

y = tf.placeholder(tf.float32, [None, 4])
y_hat = tf.placeholder(tf.float32, [None, 4])

# Mask column 0 is always 1; the remaining columns copy y[:, 0], so they are
# all 1 when an object is present and all 0 otherwise. Multiplying by
# ones_like (not by y[:, 1:] itself) keeps the mask 0/1 even when the other
# label entries are not exactly 1 (e.g. box coordinates, one-hot zeros).
mask = tf.concat([tf.ones((tf.shape(y)[0], 1)), y[:, :1] * tf.ones_like(y[:, 1:])], axis=1)
error = tf.losses.mean_squared_error(mask*y, mask*y_hat)

with tf.Session() as sess:
    print(sess.run([mask,error], {y:y_0, y_hat:y_hat_n}))
    print(sess.run([mask,error], {y:y_1, y_hat:y_hat_n}))
# Mask and error
#[array([[1., 0., 0., 0.],
# [1., 0., 0., 0.]], dtype=float32), 2.25]
#[array([[1., 1., 1., 1.],
# [1., 1., 1., 1.]], dtype=float32), 4.0]
Suppose I have a tensor in Tensorflow whose values are like:
A = [[0.7, 0.2, 0.1],[0.1, 0.4, 0.5]]
How can I change this tensor into the following:
B = [[1, 0, 0],[0, 0, 1]]
In other words I want to just keep the maximum and replace it with 1.
Any help would be appreciated.
I think that you can solve it with a one-liner:
import tensorflow as tf
import numpy as np
x_data = [[0.7, 0.2, 0.1],[0.1, 0.4, 0.5]]
# I am using hard-coded dimensions for simplicity
x = tf.placeholder(dtype=tf.float32, name="x", shape=(2,3))
session = tf.InteractiveSession()
session.run(tf.one_hot(tf.argmax(x, 1), 3), {x: x_data})
The result is the one that you expect:
Out[6]:
array([[ 1., 0., 0.],
[ 0., 0., 1.]], dtype=float32)