Why does the same Tensorflow model work with a list of arrays but doesn't work with tf.data.Dataset unbatched? - tensorflow

I have the following simple setup:
import tensorflow as tf


def make_mymodel():
    class MyModel(tf.keras.models.Model):
        def __init__(self):
            super(MyModel, self).__init__()
            self.model = tf.keras.Sequential([
                tf.keras.layers.Input(shape=(1, 2)),
                tf.keras.layers.Dense(1)
            ])

        def call(self, x):
            return self.model(x)

    mymodel = MyModel()
    return mymodel


model = make_mymodel()

X = [[[1, 1]],
     [[2, 2]],
     [[10, 10]],
     [[20, 20]],
     [[50, 50]]]
y = [1, 2, 10, 20, 50]

# ds_n_X = tf.data.Dataset.from_tensor_slices(X)
# ds_n_Y = tf.data.Dataset.from_tensor_slices(y)
# ds = tf.data.Dataset.zip((ds_n_X, ds_n_Y))
#
# for input, label in ds:
#     print(input.numpy(), label.numpy())

loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False)

model.build((1, 2))
model.compile(optimizer='adam',
              loss=loss_fn)
model.summary()
model.fit(X, y, epochs=10)

print(model.predict([
    [[25, 25]]
]))
This works fine (although I get strange predictions), but when I uncomment the ds lines and change model.fit(X, y, epochs=10) to model.fit(ds, epochs=10), I get the following error:
Traceback (most recent call last):
File "example_dataset.py", line 51, in <module>
model.fit(ds, epochs=10)
...
ValueError: slice index 0 of dimension 0 out of bounds. for '{{node strided_slice}} = StridedSlice[Index=DT_INT32, T=DT_INT32, begin_mask=0, ellipsis_mask=0, end_mask=0, new_axis_mask=0, shrink_axis_mask=1](Shape, strided_slice/stack, strided_slice/stack_1, strided_slice/stack_2)' with input shapes: [0], [1], [1], [1] and with computed input tensors: input[1] = <0>, input[2] = <1>, input[3] = <1>.
The error goes away when I run model.fit(ds.batch(2), epochs=10), i.e. when I batch the dataset.
I expect to be able to use a list of arrays and tf.data.Dataset interchangeably but, for some reason, I need to add a batch dimension to the dataset in order to use tf.data.Dataset. Is this expected behavior or am I conceptually missing something?

Because the model expects its input with a leading batch dimension, i.e. (batch_dim, input_dims...). So, for your data, each batch fed to the model should have shape (None, 1, 2).
Let's explore the dimensions of your data as an array and as a dataset. When you define your input as an array, the shape is:
>>> print(np.array(X).shape)
(5, 1, 2)
This is compatible with what the model expects. But when you define a dataset from your array, the shape of each element is:
>>> for input, label in ds.take(1):
...     print(input.numpy().shape)
(1, 2)
This is incompatible with what the model expects, because the batch dimension is missing. If we batch the data:
>>> ds = ds.batch(1)
>>> for input, label in ds.take(1):
...     print(input.numpy().shape)
(1, 1, 2)
then it is fine to pass the dataset to model.fit(). In short, model.fit batches array inputs for you (default batch_size=32), but it expects a tf.data.Dataset to be batched already.
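For completeness, here is a minimal sketch of the working dataset path, reusing the names from the question:
ds_n_X = tf.data.Dataset.from_tensor_slices(X)  # each element has shape (1, 2)
ds_n_Y = tf.data.Dataset.from_tensor_slices(y)
ds = tf.data.Dataset.zip((ds_n_X, ds_n_Y))

model.fit(ds.batch(2), epochs=10)  # batching restores the (None, 1, 2) input shape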

Related

Correct way to iterate over Keras ragged tensor

I have an input Tensorflow ragged tensor structured like this: [batch, num_images, width, height, channels], and I need to iterate over the num_images dimension to extract some features relevant for downstream applications.
Example code is the following:
from tensorflow.keras.applications.efficientnet import EfficientNetB7
from tensorflow.keras.layers import Input
import tensorflow as tf

eff_net = EfficientNetB7(weights='imagenet', include_top=False)
input_claim = Input(shape=(None, 600, 600, 3), name='input_1', ragged=True)
eff_out = tf.map_fn(fn=eff_net,
                    elems=input_claim, fn_output_signature=tf.float32)
The first Input dimension is set to None as it can differ across data points, and for this reason the input receives instances of tf.RaggedTensor.
This code breaks with the following TypeError: Could not build a TypeSpec for KerasTensor(type_spec=RaggedTensorSpec(TensorShape([None, None, 600, 600, 3]), tf.float32, 1, tf.int64), name='input_1', description="created by layer 'input_1'") of unsupported type <class 'keras.engine.keras_tensor.RaggedKerasTensor'>.
I suspect there is a better way to perform this type of preprocessing, though.
Update: num_images is needed because (although not described here) I perform a subsequent reduce operation over this dimension.
You can use tf.ragged.map_flat_values to achieve the same result. Create a model like this:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


def eff_net(x):  # dummy eff_net for testing that returns [batch, dim]
    return tf.random.normal(shape=tf.shape(x)[:2])


input_claim = keras.Input(shape=(None, 600, 600, 3), name='input_1', ragged=True)


class RaggedMapLayer(layers.Layer):
    def call(self, x):
        return tf.ragged.map_flat_values(eff_net, x)


outputs = RaggedMapLayer()(input_claim)
model = keras.Model(inputs=input_claim, outputs=outputs)
Testing:
inputs = tf.RaggedTensor.from_row_splits(
    tf.random.normal(shape=(10, 600, 600, 3)), row_splits=[0, 2, 5, 10])
# shape [3, None, 600, 600, 3]
model(inputs).shape
# [3, None, 600]
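Why this works: a ragged batch of shape [batch, None, 600, 600, 3] stores all images contiguously in x.flat_values, a dense tensor of shape [total_images, 600, 600, 3]. tf.ragged.map_flat_values applies the function once to that dense tensor and re-attaches the ragged row structure, so the mapped function only has to preserve the leading dimension. A tiny self-contained sketch of the same mechanic (not the poster's code):
import tensorflow as tf

rt = tf.RaggedTensor.from_row_splits(tf.ones((5, 2)), row_splits=[0, 2, 5])
out = tf.ragged.map_flat_values(lambda v: v * 2.0, rt)  # runs once on the (5, 2) flat_values
print(out.row_splits.numpy())  # [0 2 5] -- row structure preserved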

Tensorflow embeddings InvalidArgumentError: indices[18,16] = 11905 is not in [0, 11905) [[node sequential_1/embedding_1/embedding_lookup

I am using TF 2.2.0 and trying to create a Word2Vec CNN text classification model. However I tried, there was always an issue with the model or the embedding layers. I could not find clear solutions on the internet, so I decided to ask here.
import multiprocessing

import gensim
import numpy as np

# filtered_stopwords_list and tokenizer (a fitted Keras Tokenizer) are
# defined earlier in the script.
modelW2V = gensim.models.Word2Vec(filtered_stopwords_list, size=100, min_count=5,
                                  window=5, sg=0, iter=10,
                                  workers=multiprocessing.cpu_count() - 1)

model_save_location = "3000tweets_notbinary"
modelW2V.wv.save_word2vec_format(model_save_location)

word2vec = {}
with open('3000tweets_notbinary', encoding='UTF-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec

num_words = len(list(tokenizer.word_index))
embedding_matrix = np.random.uniform(-1, 1, (num_words, 100))
for word, i in tokenizer.word_index.items():
    if i < num_words:
        embedding_vector = word2vec.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            embedding_matrix[i] = np.zeros((100,))
I created my word2vec weights with the code above and then converted them to an embedding_matrix, following many tutorials. Since many words in my vocabulary have no word2vec embedding, I assign a zero vector when no embedding is available. I then feed the data and this embedding matrix to the tf sequential model below.
seq_leng = max_tokens
vocab_size = num_words
embedding_dim = 100
filter_sizes = [3, 4, 5]
num_filters = 512
drop = 0.5
epochs = 5
batch_size = 32

model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size,
                              output_dim=embedding_dim,
                              weights=[embedding_matrix],
                              input_length=max_tokens,
                              trainable=False),
    tf.keras.layers.Conv1D(num_filters, 7, activation="relu", padding="same"),
    tf.keras.layers.MaxPool1D(2),
    tf.keras.layers.Conv1D(num_filters, 7, activation="relu", padding="same"),
    tf.keras.layers.MaxPool1D(),
    tf.keras.layers.Dropout(drop),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(32, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(1e-4)),
    tf.keras.layers.Dense(3, activation="softmax")
])

model.compile(loss="categorical_crossentropy",
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-06),
              metrics=["accuracy", tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
model.summary()

history = model.fit(x_train_pad, y_train2, batch_size=60, epochs=epochs, shuffle=True, verbose=1)
But when I run this code, TensorFlow gives me the following error at a random point during training, and I could not find any solution to it. I tried adding +1 to vocab_size, but then I got a size-mismatch error that did not even let me compile my model. Can anyone please help me?
InvalidArgumentError: indices[18,16] = 11905 is not in [0, 11905)
[[node sequential_1/embedding_1/embedding_lookup (defined at <ipython-input-26-ef1b16cf85bf>:1) ]] [Op:__inference_train_function_1533]
Errors may have originated from an input operation.
Input Source operations connected to node sequential_1/embedding_1/embedding_lookup:
sequential_1/embedding_1/embedding_lookup/991 (defined at /usr/lib/python3.6/contextlib.py:81)
Function call stack:
train_function
I solved this myself. I was adding a new dimension to vocab_size by making it vocab_size + 1, as suggested by others. However, since the layer dimensions and the embedding-matrix size then no longer matched, the error remained. Adding a zero vector at the end of my embedding matrix solved the issue.
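In other words, a minimal sketch of the fix, reusing the variable names from the question (Keras Tokenizer indices start at 1, so valid indices run up to num_words and the table needs num_words + 1 rows):
vocab_size = num_words + 1  # one extra row, since token indices go up to num_words
# Pad the matrix with a zero row so its shape matches the new input_dim.
embedding_matrix = np.vstack([embedding_matrix, np.zeros((1, embedding_dim))])

embedding_layer = tf.keras.layers.Embedding(input_dim=vocab_size,
                                            output_dim=embedding_dim,
                                            weights=[embedding_matrix],
                                            input_length=max_tokens,
                                            trainable=False)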

Using TF Estimator with TFRecord generator

I am trying to create a simple NN that reads in a folder of tfrecords. Each record has a 1024-value 'mean_rgb' vector, and a category label. I am trying to create a simple feed-forward NN that learns the categories based on this feature vector.
import os

import tensorflow as tf
from tensorflow.keras import backend as K

# read_all_file_names, yt8m_dir_train, yt8m_dir_test and model_dir are
# defined elsewhere.


def generate(dir, shuffle, batch_size):
    def parse(serialized):
        features = {
            'mean_rgb': tf.FixedLenFeature([1024], tf.float32),
            'category': tf.FixedLenFeature([], tf.int64)
        }
        parsed_example = tf.parse_single_example(serialized=serialized, features=features)
        vrv = parsed_example['mean_rgb']
        label = parsed_example['category']
        d = dict(zip(['mean_rgb'], [vrv])), label
        return d

    dataset = tf.data.TFRecordDataset(dir).repeat(1)
    dataset = dataset.map(parse)
    if shuffle:
        dataset = dataset.shuffle(8000)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    next = iterator.get_next()
    print(next)
    return next


def batch_generator(dir, shuffle=False, batch_size=64):
    sess = K.get_session()
    while True:
        yield sess.run(generate(dir, shuffle, batch_size))


num_classes = 29
batch_size = 64

yt8m_train = [os.path.join(yt8m_dir_train, x) for x in read_all_file_names(yt8m_dir_train) if '.tfrecord' in x]
yt8m_test = [os.path.join(yt8m_dir_test, x) for x in read_all_file_names(yt8m_dir_test) if '.tfrecord' in x]

feature_columns = [tf.feature_column.numeric_column(k) for k in ['mean_rgb']]

# batch_generator(yt8m_test).__next__()

classifier = tf.estimator.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=[1024, 1024],
    n_classes=num_classes,
    model_dir=model_dir)

classifier.train(
    input_fn=lambda: generate(yt8m_train, True, batch_size))
However, I get the following error:
InvalidArgumentError (see above for traceback): Input to reshape is a
tensor with 65536 values, but the requested shape has 64
I am not sure why it sees the input as a 64x1024=65536 vector instead of a (64, 1024) tensor. When I print the next item in the generator, I get:
({'mean_rgb': <tf.Tensor: id=23, shape=(64, 1024), dtype=float32, numpy=
array([[ 0.9243997 , 0.28990048, -0.4130672 , ..., -0.096692 ,
0.27225342, 0.13346168],
[ 0.5853526 , 0.67050666, -0.24683481, ..., -0.6999033 ,
-0.4100128 , -0.00349384],
[ 0.49572858, 0.5231492 , -0.53445834, ..., 0.0449002 ,
0.10582132, -0.37333965],
...,
[ 0.5776026 , -0.07128889, -0.61762846, ..., 0.22194198,
0.61441416, -0.27355513],
[-0.01848815, 0.20132884, 1.1023484 , ..., 0.06496283,
0.29560333, 0.09157721],
[-0.25877073, -1.9552246 , 0.10309827, ..., 0.22032814,
-0.6812989 , -0.23649289]], dtype=float32)>}
which has the correct (64, 1024) shape.
The problem is in how feature_columns works. I had a similar problem and solved it by doing a reshape. Here is part of my code that will help you understand.
Defining the feature_columns:
feature_columns = {
    'images': tf.feature_column.numeric_column('images', self.shape),
}
Then, to create the input for the model:
with tf.name_scope('input'):
    feature_columns = list(self._features_columns().values())
    input_layer = tf.feature_column.input_layer(
        features=features, feature_columns=feature_columns)
    input_layer = tf.reshape(
        input_layer,
        shape=(-1, self.parameters.size, self.parameters.size,
               self.parameters.channels))
Pay attention to the last part: I had to reshape the tensor, and the -1 lets TensorFlow figure out the batch size.
I believe the issue was that feature_columns = [tf.feature_column.numeric_column(k) for k in ['mean_rgb']] assumes the column is a scalar, when it is actually a 1024-vector. I had to add shape=1024 to the numeric_column call. I also had to remove the existing saved checkpoint.
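Concretely, a minimal sketch of that fix with the TF 1.x feature-column API:
# Declare the column's true per-example shape so the input layer yields
# (batch_size, 1024) instead of flattening all 64 * 1024 values together.
feature_columns = [tf.feature_column.numeric_column('mean_rgb', shape=1024)]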

Tensorflow embedding for categorical feature

In machine learning, it is common to represent a categorical (specifically: nominal) feature with one-hot encoding. I am trying to learn how to use tensorflow's embedding layer to represent a categorical feature in a classification problem. I have tensorflow version 1.0.1 installed and I am using Python 3.6.
I am aware of the tensorflow tutorial for word2vec, but it is not very instructive for my case. While building the tf.Graph, it uses NCE-specific weights and tf.nn.nce_loss.
I just want a simple feed-forward net as below, with the input layer being an embedding. My attempt is below. It complains when I try to matrix-multiply the embedding with the hidden layer because of a shape incompatibility. Any ideas how I can fix this?
from __future__ import print_function

import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.preprocessing import LabelEncoder

if __name__ == '__main__':
    # 1 categorical input feature and a binary output
    df = pd.DataFrame({'cat2': np.array(['o', 'm', 'm', 'c', 'c', 'c', 'o', 'm', 'm', 'm']),
                       'label': np.array([0, 0, 1, 1, 0, 0, 1, 0, 1, 1])})

    encoder = LabelEncoder()
    encoder.fit(df.cat2.values)
    X = encoder.transform(df.cat2.values)

    Y = np.zeros((len(df), 2))
    Y[np.arange(len(df)), df.label.values] = 1

    # Neural net parameters
    training_epochs = 5
    learning_rate = 1e-3
    cardinality = len(np.unique(X))
    embedding_size = 2
    input_X_size = 1
    n_labels = len(np.unique(Y))
    n_hidden = 10

    # Placeholders for input, output
    x = tf.placeholder(tf.int32, [None, 1], name="input_x")
    y = tf.placeholder(tf.float32, [None, 2], name="input_y")

    # Neural network weights
    embeddings = tf.Variable(tf.random_uniform([cardinality, embedding_size], -1.0, 1.0))
    h = tf.get_variable(name='h2', shape=[embedding_size, n_hidden],
                        initializer=tf.contrib.layers.xavier_initializer())
    W_out = tf.get_variable(name='out_w', shape=[n_hidden, n_labels],
                            initializer=tf.contrib.layers.xavier_initializer())

    # Neural network operations
    embedded_chars = tf.nn.embedding_lookup(embeddings, x)
    layer_1 = tf.matmul(embedded_chars, h)
    layer_1 = tf.nn.relu(layer_1)
    out_layer = tf.matmul(layer_1, W_out)

    # Define loss and optimizer
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=out_layer, labels=y))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

    # Initializing the variables
    init = tf.global_variables_initializer()

    # Launch the graph
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(training_epochs):
            avg_cost = 0.
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost],
                            feed_dict={x: X, y: Y})
        print("Optimization Finished!")
EDIT: please see the error message below:
Traceback (most recent call last):
File "/home/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/common_shapes.py", line 671, in _call_cpp_shape_fn_impl
input_tensors_as_shapes, status)
File "/home/anaconda3/lib/python3.6/contextlib.py", line 89, in __exit__
next(self.gen)
File "/home/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 466, in raise_exception_on_not_ok_status
pywrap_tensorflow.TF_GetCode(status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: Shape must be rank 2 but is rank 3 for 'MatMul' (op: 'MatMul') with input shapes: [?,1,2], [2,10].
Just make your x placeholder have shape [None] instead of [None, 1].
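The reason, sketched with the names from the question's code: tf.nn.embedding_lookup appends the embedding dimension to the shape of its index tensor, so a [None, 1] index placeholder produces a rank-3 result of shape [None, 1, embedding_size], which tf.matmul rejects against the rank-2 weight matrix. With a rank-1 placeholder the shapes line up (and X from encoder.transform is already 1-D, so the feed_dict needs no change):
x = tf.placeholder(tf.int32, [None], name="input_x")    # indices, shape (?,)
embedded_chars = tf.nn.embedding_lookup(embeddings, x)  # shape (?, embedding_size)
layer_1 = tf.nn.relu(tf.matmul(embedded_chars, h))      # (?, 2) x (2, 10) -> (?, 10)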

Using Estimator for building an LSTM network

I am trying to build an LSTM network using an Estimator. My data looks like
X = [[1,2,3], [2,3,4], ... , [98,99,100]]
y = [2, 3, ... , 99]
I am using an Estimator:
regressor = learn.Estimator(model_fn=lstm_model,
                            params=model_params)
where the lstm_model function is
def lstm_model(features, targets, mode, params):
    def lstm_cells(layers):
        if isinstance(layers[0], dict):
            return [tf.nn.rnn_cell.BasicLSTMCell(layer['steps'], state_is_tuple=True)
                    for layer in layers]
        return [tf.nn.rnn_cell.BasicLSTMCell(steps, state_is_tuple=True) for steps in layers]

    stacked_lstm = tf.nn.rnn_cell.MultiRNNCell(lstm_cells(params['rnn_layers']), state_is_tuple=True)
    output, layers = tf.nn.rnn(stacked_lstm, [features], dtype=tf.float32)
    return learn.models.linear_regression(output, targets)
and params are
model_params = {
    'steps': 1000,
    'learning_rate': 0.03,
    'batch_size': 24,
    'time_steps': 3,
    'rnn_layers': [{'steps': 3}],
    'dense_layers': [10, 10]
}
and then I do the fitting
regressor.fit(X, y)
The issue I am facing is
output, layers = tf.nn.rnn(stacked_lstm, [features], dtype=tf.float32)
requires a sequence, but I am not sure how to split my features into a list of tensors. The shape of features inside the lstm_model function is (?, 3).
I have two questions: how do I do the training in batches, and how do I split 'features' so that
output, layers = tf.nn.rnn(stacked_lstm, [features], dtype=tf.float32)
doesn't throw an error? The error I am getting is
raise TypeError("%s that don't all match." % prefix)
TypeError: Tensors in list passed to 'values' of 'Concat' Op have types [float64, float32] that don't all match.
I am using tensorflow 0.12
I had to set the shape of features to (batch_size, time_step, 1) or (None, time_step, 1), and then unstack the features before feeding them into the RNN. Unstack along the "time_step" axis so that you get a list of time_step tensors, each with shape (None, 1) or (batch_size, 1).
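A minimal sketch of that unstacking inside lstm_model, assuming the TF 0.12-era API (tf.unpack was renamed tf.unstack in later releases); the cast also addresses the float64/float32 mismatch in the Concat error above:
# features arrives with shape (batch_size, time_steps, 1)
features = tf.cast(features, tf.float32)  # avoid mixing float64 and float32
x = tf.unpack(features, axis=1)           # list of time_steps tensors, each (batch_size, 1)
output, state = tf.nn.rnn(stacked_lstm, x, dtype=tf.float32)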