Word2Vec Tutorial: Tensorflow TypeError: Input 'y' of 'Mul' Op has type float32 that does not match type int32 of argument 'x' - tensorflow

Version of Tensorflow: 1.2.1
Version of Python: 3.5
Operating System: Windows 10
Another poster has asked about this same problem on Stack Overflow here, and he appears to be using code from the same Udacity Word2Vec tutorial. Maybe I'm dense, but the example's code is so busy and complex that I can't tell what fixed his problem.
The error occurs when I call tf.reduce_mean:
loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed,
                               train_labels, num_sampled, vocabulary_size))
Right before the call to tf.reduce_mean, the key variables have the following data types:
train_dataset.dtype
>> tf.int32
train_labels.dtype
>> tf.int32
valid_dataset.dtype
>> tf.int32
embeddings.dtype
>> tf.float32_ref
softmax_weights.dtype
>> tf.float32_ref
softmax_biases.dtype
>> tf.float32_ref
embed.dtype
>> tf.float32
I tried every permutation of data type in the definitions of the variables train_dataset, train_labels, and valid_dataset: making them all int64, all float32, all float64, and combinations of integer and floating point. Nothing worked. I didn't try altering the data types of softmax_weights and softmax_biases, because I'm afraid that might foul up the optimization algorithm. Don't these need to be floats to support the calculus that is done during backpropagation? (TensorFlow is often a very opaque black box with documentation that verges on completely useless, so I can suspect things but never know for sure.)
Program Flow at Time of Error:
After the call to tf.reduce_mean, program control transfers to sampled_softmax_loss() in nn_impl.py, which in turn calls _compute_sampled_logits():
logits, labels = _compute_sampled_logits(
    weights=weights,
    biases=biases,
    labels=labels,
    inputs=inputs,
    num_sampled=num_sampled,
    num_classes=num_classes,
    num_true=num_true,
    sampled_values=sampled_values,
    subtract_log_q=True,
    remove_accidental_hits=remove_accidental_hits,
    partition_strategy=partition_strategy,
    name=name)
At this point I check the data types of the passed-in parameters and get the following:
weights.dtype
>> tf.float32_ref
biases.dtype
>> tf.float32_ref
labels.dtype
>> tf.float32
inputs.dtype
>> tf.int32
On the very next step an exception occurs, and I am thrown into the StreamWrapper class in file ansitowin32.py. Running to the end, I get the following Traceback:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
C:\Anaconda3\envs\aind-dog\lib\site-packages\tensorflow\python\framework\op_def_library.py in apply_op(self, op_type_name, name, **keywords)
489 as_ref=input_arg.is_ref,
--> 490 preferred_dtype=default_dtype)
491 except TypeError as err:
C:\Anaconda3\envs\aind-dog\lib\site-packages\tensorflow\python\framework\ops.py in internal_convert_to_tensor(value, dtype, name, as_ref, preferred_dtype)
740 if ret is None:
--> 741 ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
742
C:\Anaconda3\envs\aind-dog\lib\site-packages\tensorflow\python\framework\ops.py in _TensorTensorConversionFunction(t, dtype, name, as_ref)
613 "Tensor conversion requested dtype %s for Tensor with dtype %s: %r"
--> 614 % (dtype.name, t.dtype.name, str(t)))
615 return t
ValueError: Tensor conversion requested dtype int32 for Tensor with dtype float32: 'Tensor("sampled_softmax_loss/Reshape_1:0", shape=(?, 1, ?), dtype=float32, device=/device:CPU:0)'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-7-66d378b94a16> in <module>()
34 loss = tf.reduce_mean(
35 tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed,
---> 36 train_labels, num_sampled, vocabulary_size))
37
38 # Optimizer.
C:\Anaconda3\envs\aind-dog\lib\site-packages\tensorflow\python\ops\nn_impl.py in sampled_softmax_loss(weights, biases, labels, inputs, num_sampled, num_classes, num_true, sampled_values, remove_accidental_hits, partition_strategy, name)
1266 remove_accidental_hits=remove_accidental_hits,
1267 partition_strategy=partition_strategy,
-> 1268 name=name)
1269 sampled_losses = nn_ops.softmax_cross_entropy_with_logits(labels=labels,
1270 logits=logits)
C:\Anaconda3\envs\aind-dog\lib\site-packages\tensorflow\python\ops\nn_impl.py in _compute_sampled_logits(weights, biases, labels, inputs, num_sampled, num_classes, num_true, sampled_values, subtract_log_q, remove_accidental_hits, partition_strategy, name)
1005 row_wise_dots = math_ops.multiply(
1006 array_ops.expand_dims(inputs, 1),
-> 1007 array_ops.reshape(true_w, new_true_w_shape))
1008 # We want the row-wise dot plus biases which yields a
1009 # [batch_size, num_true] tensor of true_logits.
C:\Anaconda3\envs\aind-dog\lib\site-packages\tensorflow\python\ops\math_ops.py in multiply(x, y, name)
284
285 def multiply(x, y, name=None):
--> 286 return gen_math_ops._mul(x, y, name)
287
288
C:\Anaconda3\envs\aind-dog\lib\site-packages\tensorflow\python\ops\gen_math_ops.py in _mul(x, y, name)
1375 A `Tensor`. Has the same type as `x`.
1376 """
-> 1377 result = _op_def_lib.apply_op("Mul", x=x, y=y, name=name)
1378 return result
1379
C:\Anaconda3\envs\aind-dog\lib\site-packages\tensorflow\python\framework\op_def_library.py in apply_op(self, op_type_name, name, **keywords)
524 "%s type %s of argument '%s'." %
525 (prefix, dtypes.as_dtype(attrs[input_arg.type_attr]).name,
--> 526 inferred_from[input_arg.type_attr]))
527
528 types = [values.dtype]
TypeError: Input 'y' of 'Mul' Op has type float32 that does not match type int32 of argument 'x'.
Here's the complete program:
# These are all the modules we'll be using later.
# Make sure you can import them before proceeding further.
# %matplotlib inline
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE

print("Working directory = %s\n" % os.getcwd())

def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words"""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

filename = 'text8.zip'
words = read_data(filename)
print('Data size %d' % len(words))

vocabulary_size = 50000

def build_dataset(words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    # Loop through the keys of the count collection dictionary
    # (apparently, zeroing out counts)
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0  # count of unknown words
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count = unk_count + 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
del words  # Hint to reduce memory.

data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels

print('data:', [reverse_dictionary[di] for di in data[:8]])
for num_skips, skip_window in [(2, 1), (4, 2)]:
    data_index = 0
    batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)
    print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))
    print(' batch:', [reverse_dictionary[bi] for bi in batch])
    print(' labels:', [reverse_dictionary[li] for li in labels.reshape(8)])

batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64    # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):
    # Input data.
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Variables.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    softmax_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Model.
    # Look up embeddings for inputs.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    # Compute the softmax loss, using a sample of the negative labels each time.
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed,
                                   train_labels, num_sampled, vocabulary_size))

    # Optimizer.
    # Note: The optimizer will optimize the softmax_weights AND the embeddings.
    # This is because the embeddings are defined as a variable quantity and the
    # optimizer's `minimize` method will by default modify all variable quantities
    # that contribute to the tensor it is passed.
    # See docs on `tf.train.Optimizer.minimize()` for more details.
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

    # Compute the similarity between minibatch examples and all embeddings.
    # We use the cosine distance:
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

I had the same issue, and it looks like two of the parameters that are passed to the loss function are swapped around.
If you look at the TensorFlow documentation for 'sampled_softmax_loss' (https://www.tensorflow.org/api_docs/python/tf/nn/sampled_softmax_loss):
sampled_softmax_loss(
    weights,
    biases,
    labels,
    inputs,
    num_sampled,
    num_classes,
    num_true=1,
    sampled_values=None,
    remove_accidental_hits=True,
    partition_strategy='mod',
    name='sampled_softmax_loss'
)
The third expected parameter is 'labels' and the fourth is 'inputs'. In the supplied code, these two parameters have been switched around. I'm a bit puzzled how this is possible; maybe this used to be different in an older version of TF. In any case, swapping those two parameters around will solve the problem.
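For reference, here is roughly what the corrected call could look like (a minimal sketch; passing the arguments by keyword makes the intended mapping explicit and avoids this kind of mix-up):

loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(weights=softmax_weights,
                               biases=softmax_biases,
                               labels=train_labels,   # integer target word IDs, shape [batch_size, 1]
                               inputs=embed,          # float32 input embeddings, shape [batch_size, embedding_size]
                               num_sampled=num_sampled,
                               num_classes=vocabulary_size))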

Related

feed_dict in for loop

I'm trying to make this old code work. TensorFlow has removed train and next_batch.
I adjusted the code as much as I could, but I've hit a hard wall here.
import numpy as np
import tensorflow as tf
from keras.datasets import mnist
import keras as K
(Xtr, Ytr), (Xte, Yte) = mnist.load_data()
#Xtr, Ytr = mnist.ds_train.batch(5000) #5000 for training (nn candidates)
#Xte, Yte = mnist.test.next_batch(200) #200 for testing
# tf Graph Input
xtr = tf.placeholder("float", [None, 784])
xte = tf.placeholder("float", [784])
# Nearest Neighbor calculation using L1 Distance
# Calculate L1 Distance
distance = tf.reduce_sum(tf.abs(tf.add(xtr, tf.negative(xte))), reduction_indices=1)
# Prediction: Get min distance index (Nearest neighbor)
pred = tf.arg_min(distance, 0)
accuracy = 0.
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()
xtr = tf.reshape(xtr, (-1, 28, 28))
xte = tf.reshape(xtr, (-1, 28, 28))
Training start
# Start training
with tf.Session() as sess:
    sess.run(init)
    # loop over test data
    for i in range(Xte.size):
        # Get nearest neighbor
        nn_index = sess.run(pred, feed_dict={xtr: Xtr, xte: Xte[i, :]})
        # Get nearest neighbor class label and compare it to its true label
        print("Test", i, "Prediction:", np.argmax(Ytr[nn_index]), "True Class:", np.argmax(Yte[i]))
        # Calculate accuracy
        if np.argmax(Ytr[nn_index]) == np.argmax(Yte[i]):
            accuracy += 1./len(Xte)
It seems to be impossible to index the array with the loop variable here.
The output is as follows:
(10000, 28, 28)
(28, 28)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-31-4c49a9166101> in <module>
9 Xtee = Xte[i, :, :]
10 print(Xtee.shape)
---> 11 nn_index = sess.run(pred, feed_dict={xtr: Xtr, xte: Xte[50, :]})
12 # Get nearest neighbor class label and compare it to its true label
13 print("Test", i, "Prediction:", np.argmax(Ytr[nn_index]), "True Class:", np.argmax(Yte[i]))
1 frames
/usr/local/lib/python3.7/dist-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1163 not subfeed_t.get_shape().is_compatible_with(np_val.shape)):
1164 raise ValueError(
-> 1165 f'Cannot feed value of shape {str(np_val.shape)} for Tensor '
1166 f'{subfeed_t.name}, which has shape '
1167 f'{str(subfeed_t.get_shape())}')
ValueError: Cannot feed value of shape (28, 28) for Tensor Reshape_3:0, which has shape (?, 28, 28)
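The error itself points at the mismatch: after the two tf.reshape calls, xte refers to a tensor of shape (?, 28, 28), while a single (28, 28) image is being fed. One possible way to reconcile the shapes (a rough sketch, assuming the intent is to keep the original [None, 784] / [784] placeholders) is to flatten the NumPy arrays (Keras loads MNIST as 28x28 images) and feed one flattened test vector per iteration, instead of reshaping the placeholder tensors:

# Sketch: flatten the data to match the placeholder shapes instead of
# reshaping the placeholders themselves.
Xtr = Xtr.reshape(-1, 784).astype("float32")
Xte = Xte.reshape(-1, 784).astype("float32")

with tf.Session() as sess:
    sess.run(init)
    for i in range(len(Xte)):   # iterate over test samples, not Xte.size
        nn_index = sess.run(pred, feed_dict={xtr: Xtr, xte: Xte[i]})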

Permutation Invariant Loss in Tensorflow

I am working on a permutation invariant loss in Tensorflow 2.8.
The loss takes two vectorized matrices of shape (N x 5), reshapes them to (N, 5), and then calculates all possible permutations (N!).
For each permutation a loss is calculated, and the minimum of these losses is used (for the best match).
However, I get the error message:
File "C:\Users\meist\anaconda3\envs\tf-2-8\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
File "C:\Users\meist\AppData\Roaming\Python\Python39\site-packages\tensorflow\python\framework\func_graph.py", line 1147, in autograph_handler
raise e.ag_error_metadata.to_exception(e)
ValueError: in user code:
File "C:\Users\meist\anaconda3\envs\tf-2-8\lib\site-packages\keras\engine\training.py", line 1021, in train_function *
return step_function(self, iterator)
File "C:\Users\meist\anaconda3\envs\tf-2-8\lib\site-packages\keras\engine\training.py", line 1010, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "C:\Users\meist\anaconda3\envs\tf-2-8\lib\site-packages\keras\engine\training.py", line 1000, in run_step **
outputs = model.train_step(data)
File "C:\Users\meist\anaconda3\envs\tf-2-8\lib\site-packages\keras\engine\training.py", line 863, in train_step
self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
File "C:\Users\meist\anaconda3\envs\tf-2-8\lib\site-packages\keras\optimizer_v2\optimizer_v2.py", line 532, in minimize
return self.apply_gradients(grads_and_vars, name=name)
File "C:\Users\meist\anaconda3\envs\tf-2-8\lib\site-packages\keras\optimizer_v2\optimizer_v2.py", line 633, in apply_gradients
grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars)
File "C:\Users\meist\anaconda3\envs\tf-2-8\lib\site-packages\keras\optimizer_v2\utils.py", line 73, in filter_empty_gradients
raise ValueError(f"No gradients provided for any variable: {variable}. "
ValueError: No gradients provided for any variable
Apparently there are no gradients. However, when I simply pass in a y_train and y_pred, I do get a loss. Here is the loss code:
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.losses import Loss
import tensorflow as tf
from itertools import permutations
import numpy as np
import keras.backend as K

class PermInvLoss(Loss):
    '''
    This loss is supposed to return the minimum loss, based on the best matching of y_true and y_pred.
    y_true is of dim [batchsize, Nmix x 5], and will be reshaped to [batchsize, Nmix, 5] in the call.
    Nmix is the number of vectors that can be permuted. The elements within each vector are fixed.
    The 5 elements are [class_value, class_value, reg_value, reg_value, reg_value].
    The two class values will be evaluated with CategoricalCrossentropy.
    The three regression values will be evaluated with MSE.
    '''
    def __init__(self, Nmix=3):
        super(PermInvLoss, self).__init__()  # is this correct?
        self.name = 'perm_inv_loss'
        self.cce = CategoricalCrossentropy()
        self.shape = (-1, Nmix, 5)  # for transforming y_true and y_pred
        variants = np.math.factorial(Nmix)  # number of possible permutations
        permutation_idx = list(permutations(np.arange(Nmix)))  # list of permutations
        perm = tf.constant(permutation_idx)
        self.perm_mat = tf.constant(np.eye(Nmix)[permutation_idx], dtype=tf.float32)  # permutation matrix for y_pred
        eye = tf.eye(Nmix, dtype=tf.int32)  # identity matrix
        self.rep_mat = tf.broadcast_to(eye[tf.newaxis, ...], (variants, Nmix, Nmix))  # repetition matrix for y_true

    def MSE(self, y_true, y_pred, axis=(-2, -1)):
        # simple MSE implementation with axis
        mse = K.mean(K.square(K.abs(y_true - y_pred)), axis=axis)
        return mse

    def call(self, y_true, y_pred):
        # reshape to [batchsize, Nmix, 5]
        y_true = K.reshape(y_true, self.shape)
        y_pred = K.reshape(y_pred, self.shape)
        # now y_pred is permuted in one extra dimension (variants)
        y_perm = tf.linalg.matmul(tf.cast(self.perm_mat, tf.float32), y_pred[:, tf.newaxis, ...])  # [batchsize x variants x Nmix x 5]
        # same for y_true, but with the repetition matrix
        y_true = tf.linalg.matmul(tf.cast(self.rep_mat, tf.float32), y_pred[:, tf.newaxis, ...])
        # print(y_perm.shape)  # [batchsize x variants x Nmix x 5]
        # print(y_true.shape)  # [batchsize x variants x Nmix x 5]
        # now we have, in the second dimension, all possible permutations of y_pred
        # and can evaluate them against y_true of the same shape
        # CategoricalCrossentropy for the first two values (classification)
        cce = CategoricalCrossentropy(reduction='none', axis=-1)
        CE = K.sum(cce(y_true[..., :2], y_perm[..., :2]), axis=-1)  # [batchsize x variants]
        # MSE for other values (regression)
        mse = self.MSE(y_true[..., 2:], y_perm[..., 2:])  # [batchsize x variants]
        loss = K.min(CE + mse, axis=-1)  # minimum loss over the variants [batchsize]
        return loss
Is the class wrong, or is there really no gradient?
I found the mistake, the line:
y_true = tf.linalg.matmul(tf.cast(self.rep_mat,tf.float32),y_pred[:,tf.newaxis,...])
should obviously be
y_true = tf.linalg.matmul(tf.cast(self.rep_mat,tf.float32),y_true[:,tf.newaxis,...])
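As a minimal usage sketch (the model and training-data names below are placeholders, not from the original post), the corrected class can then be passed straight to compile:

# Hypothetical usage: `model` outputs Nmix * 5 values per sample and
# y_train has shape (batchsize, Nmix * 5).
loss_fn = PermInvLoss(Nmix=3)
model.compile(optimizer='adam', loss=loss_fn)
model.fit(x_train, y_train, epochs=10)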

GPU goes out of memory during training large dataset

I am using a Transformer network for machine translation. During training, the GPU runs out of memory with a large dataset; it works fine with small data.
This is the self-attention part; the error comes during the computation of the matrices.
import tensorflow as tf

class SelfAttention(tf.keras.layers.Layer):
    def __init__(self, embed_size, head):
        super(SelfAttention, self).__init__()
        self.head = head
        self.embed_size = embed_size
        self.head_dim = embed_size // head
        assert (self.head_dim * head == embed_size), 'size of head_dim is not matching'
        self.query = tf.keras.layers.Dense(self.head_dim, activation='linear', use_bias=False)
        self.value = tf.keras.layers.Dense(self.head_dim, activation='linear', use_bias=False)
        self.key = tf.keras.layers.Dense(self.head_dim, activation='linear', use_bias=False)
        self.fc_layer = tf.keras.layers.Dense(self.embed_size, activation='linear')

    def call(self, value, key, query, mask):
        # Number of training examples
        N = query.shape[0]
        query_len, value_len, key_len = query.shape[1], value.shape[1], key.shape[1]

        # Reshape according to the number of examples and words
        query = tf.reshape(query, (N, query_len, self.head, self.head_dim))
        value = tf.reshape(value, (N, value_len, self.head, self.head_dim))
        key = tf.reshape(key, (N, key_len, self.head, self.head_dim))

        query = self.query(query)
        value = self.value(value)
        key = self.key(key)

        # energy shape: (N, head, query_len, key_len) try to imagine the shape in mind
        energy = tf.einsum("nqhd, nkhd->nhqk", query, key)
        if mask is not None:
            energy = energy * mask
            energy = tf.where(tf.equal(energy, 0), -1e20, energy)

        attention = tf.keras.activations.softmax(energy, axis=3)
        # attention shape: (N, head, query_len, key_len)
        # value shape: (N, value_len, head, head_dim)
        # output: (N, query_len, head, head_dim)
        output = tf.reshape(tf.einsum("nhql, nlhd->nqhd", attention, value), (N, query_len, self.head*self.head_dim))
        output = tf.keras.activations.linear(output)
        return output
The error is
2021-09-20 11:51:49.615495: I tensorflow/core/common_runtime/bfc_allocator.cc:1036] 1 Chunks of size 35477760 totalling 33.83MiB
2021-09-20 11:51:49.615502: I tensorflow/core/common_runtime/bfc_allocator.cc:1036] 1 Chunks of size 40866304 totalling 38.97MiB
2021-09-20 11:51:49.615509: I tensorflow/core/common_runtime/bfc_allocator.cc:1036] 1 Chunks of size 47409664 totalling 45.21MiB
2021-09-20 11:51:49.615516: I tensorflow/core/common_runtime/bfc_allocator.cc:1036] 1 Chunks of size 47547136 totalling 45.34MiB
/opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/ops.py in raise_from_not_ok_status(e, name)
6860 message = e.message + (" name: " + name if name is not None else "")
6861 # pylint: disable=protected-access
-> 6862 six.raise_from(core._status_to_exception(e.code, message), None)
6863 # pylint: enable=protected-access
6864
/opt/conda/lib/python3.7/site-packages/six.py in raise_from(value, from_value)
ResourceExhaustedError: OOM when allocating tensor with shape[32,334,25335] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:BiasAdd]
What should I do?
You can use a generator to load just a part of the dataset into memory at a time, and with that you will be able to train your model.
Here is an example of a simple generator for image classification that you will need to adapt to your NLP use case:
import numpy as np
import keras

class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, list_IDs, labels, batch_size=32, dim=(32,32,32), n_channels=1,
                 n_classes=10, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        # Generate data
        X, y = self.__data_generation(list_IDs_temp)
        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples'  # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size), dtype=int)
        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            X[i,] = np.load('data/' + ID + '.npy')
            # Store class
            y[i] = self.labels[ID]
        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)
And then pass it to .fit
params = {'dim': (32,32,32),
          'batch_size': 64,
          'n_classes': 6,
          'n_channels': 1,
          'shuffle': True}

# Datasets
partition = # IDs
labels = # Labels

# Generators
training_generator = DataGenerator(partition['train'], labels, **params)
validation_generator = DataGenerator(partition['validation'], labels, **params)

model.fit_generator(generator=training_generator,
                    validation_data=validation_generator)
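As a rough sketch of what the NLP adaptation might look like (the array names and the shifted-target convention below are assumptions, not from the original post), the same Sequence pattern can slice pre-tokenized source/target arrays instead of loading image files:

# Hypothetical generator for machine translation: batches are sliced from
# pre-tokenized integer arrays, so only one batch at a time is materialized.
class TranslationGenerator(keras.utils.Sequence):
    def __init__(self, src_tokens, tgt_tokens, batch_size=32, shuffle=True):
        self.src_tokens = src_tokens   # shape (num_pairs, src_len), int32
        self.tgt_tokens = tgt_tokens   # shape (num_pairs, tgt_len), int32
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.src_tokens) / self.batch_size))

    def __getitem__(self, index):
        idx = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        src = self.src_tokens[idx]
        tgt = self.tgt_tokens[idx]
        # decoder input is the target shifted right; the label is shifted left
        return (src, tgt[:, :-1]), tgt[:, 1:]

    def on_epoch_end(self):
        self.indexes = np.arange(len(self.src_tokens))
        if self.shuffle:
            np.random.shuffle(self.indexes)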

Tensorflow : Shape error in LSTM model expected shape=(None, None, 90), found shape=[90, 1, 78]

I am getting a shape error in this code and am not able to figure out what I am doing wrong:
Shape error in LSTM model: expected shape=(None, None, 90), found shape=[90, 1, 78]
I tried checking each and every shape. Please help.
This question is from a Coursera programming assignment (Deep Learning Specialization, course 5).
def music_inference_model(LSTM_cell, densor, Ty=100):
    """
    Uses the trained "LSTM_cell" and "densor" from model() to generate a sequence of values.

    Arguments:
    LSTM_cell -- the trained "LSTM_cell" from model(), Keras layer object
    densor -- the trained "densor" from model(), Keras layer object
    Ty -- integer, number of time steps to generate

    Returns:
    inference_model -- Keras model instance
    """
    # Get the shape of input values
    n_values = densor.units
    # Get the number of the hidden state vector
    n_a = LSTM_cell.units

    # Define the input of your model with a shape
    x0 = Input(shape=(1, n_values))

    # Define s0, initial hidden state for the decoder LSTM
    a0 = Input(shape=(n_a,), name='a0')
    c0 = Input(shape=(n_a,), name='c0')
    a = a0
    c = c0
    x = x0

    ### START CODE HERE ###
    # Step 1: Create an empty list of "outputs" to later store your predicted values (≈1 line)
    outputs = []

    # Step 2: Loop over Ty and generate a value at every time step
    for t in range(Ty):
        # Step 2.A: Perform one step of LSTM_cell (≈1 line)
        a, _, c = LSTM_cell(x, initial_state=[a, c])

        # Step 2.B: Apply Dense layer to the hidden state output of the LSTM_cell (≈1 line)
        out = densor(_)

        # Step 2.C: Append the prediction "out" to "outputs". out.shape = (None, 90) (≈1 line)
        outputs.append(out)

        # Step 2.D:
        # Select the next value according to "out",
        # Set "x" to be the one-hot representation of the selected value
        # See instructions above.
        x = tf.math.argmax(out)
        x = tf.one_hot(indices=x, depth=78)

        # Step 2.E:
        # Use RepeatVector(1) to convert x into a tensor with shape=(None, 1, 90)
        x = RepeatVector(1)(x)

    # Step 3: Create model instance with the correct "inputs" and "outputs" (≈1 line)
    inference_model = Model(inputs=[x0, a0, c0], outputs=outputs)

    ### END CODE HERE ###

    return inference_model

inference_model = music_inference_model(LSTM_cell, densor, Ty=50)
The error I am getting:
ValueError Traceback (most recent call last)
<ipython-input-19-a33998d93c7b> in <module>
----> 1 inference_model = music_inference_model(LSTM_cell, densor, Ty = 50)
<ipython-input-18-ead9bae0b252> in music_inference_model(LSTM_cell, densor, Ty)
38 for t in range(Ty):
39 # Step 2.A: Perform one step of LSTM_cell (≈1 line)
---> 40 a, _, c = LSTM_cell(x, initial_state=[a, c])
41
42 # Step 2.B: Apply Dense layer to the hidden state output of the LSTM_cell (≈1 line)
/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/layers/recurrent.py in __call__(self, inputs, initial_state, constants, **kwargs)
707 # Perform the call with temporarily replaced input_spec
708 self.input_spec = full_input_spec
--> 709 output = super(RNN, self).__call__(full_input, **kwargs)
710 # Remove the additional_specs from input spec and keep the rest. It is
711 # important to keep since the input spec was populated by build(), and
/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py in __call__(self, *args, **kwargs)
924 if _in_functional_construction_mode(self, inputs, args, kwargs, input_list):
925 return self._functional_construction_call(inputs, args, kwargs,
--> 926 input_list)
927
928 # Maintains info about the `Layer.call` stack.
/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py in _functional_construction_call(self, inputs, args, kwargs, input_list)
1090 # TODO(reedwm): We should assert input compatibility after the inputs
1091 # are casted, not before.
-> 1092 input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
1093 graph = backend.get_graph()
1094 # Use `self._name_scope()` to avoid auto-incrementing the name.
/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/input_spec.py in assert_input_compatibility(input_spec, inputs, layer_name)
225 ' is incompatible with layer ' + layer_name +
226 ': expected shape=' + str(spec.shape) +
--> 227 ', found shape=' + str(shape))
228
229
ValueError: Input 0 is incompatible with layer lstm: expected shape=(None, None, 90), found shape=[90, 1, 78]
Try fixing your code in Step 2.B and Step 2.D as follows:
This is your code:
# Step 2.B: Apply Dense layer to the hidden state output of the LSTM_cell
out = densor(_)
Change to:
out = densor(a)
This is your code:
# Step 2.D:
x = tf.math.argmax(out)
x = tf.one_hot(indices=x, depth=78)
Change to:
x = tf.math.argmax(out, axis = -1)
x = tf.one_hot(x, depth=n_values)
Make sure to restart the kernel and then rerun the fixed code!
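Put together, the body of the Step 2 loop then reads roughly like this (a sketch combining the two fixes, using the same variable names as the assignment code above):

for t in range(Ty):
    # Step 2.A: one step of the LSTM cell
    a, _, c = LSTM_cell(x, initial_state=[a, c])
    # Step 2.B: apply the Dense layer to the hidden state `a`, not to `_`
    out = densor(a)
    # Step 2.C: collect the prediction
    outputs.append(out)
    # Step 2.D: pick the most likely value and one-hot encode it with depth n_values (90)
    x = tf.math.argmax(out, axis=-1)
    x = tf.one_hot(x, depth=n_values)
    # Step 2.E: back to shape (None, 1, n_values) for the next LSTM step
    x = RepeatVector(1)(x)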

Tensorflow / Keras, trouble with Embedding and sparse_categorical_crossentropy

Keras noob here. I'm trying to build an LSTM network to generate text using Shakespeare's works (a bit like in this tutorial).
This is the method that generates my model:
def generate_model(seq_len=100, stateful=True):
    # Initialize model
    source = tf.keras.Input(
        name='seed', shape=(seq_len,), dtype=tf.int32)

    # Embed ascii character (0 to 255) into one hot encoding (0, 1, 0...)
    embedding = tf.keras.layers.Embedding(input_dim=256, output_dim=EMBEDDING_DIM, input_length=seq_len)(source)

    # Good old LSTM's
    lstm_1 = tf.keras.layers.LSTM(EMBEDDING_DIM, stateful=stateful, return_sequences=True)(embedding)
    lstm_2 = tf.keras.layers.LSTM(EMBEDDING_DIM, stateful=stateful, return_sequences=True)(lstm_1)

    # I honestly don't understand what the TimeDistributed method does
    predicted_char = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(256, activation='softmax'))(lstm_2)

    model = tf.keras.Model(inputs=[source], outputs=[predicted_char])
    model.compile(
        optimizer=tf.train.RMSPropOptimizer(learning_rate=0.01),
        loss='categorical_crossentropy',
        metrics=['categorical_accuracy'])
    return model
I thought the Embedding layer was responsible for one-hot encoding all my characters into vectors. If this is true, is that structure not preserved while passing through the LSTM layers? I'm a bit confused.
For reference, this is an example of an input:
[65 76 76 83 32 87 69 76 76 32]
(before encoding it would be)
['A', 'L', 'L', 'S', ' ', 'W', 'E', 'L', 'L', ' ']
The corresponding label is the next character in the sequence:
[84] ie ['T']
I'm struggling with what appears to be a common error among newcomers
tensorflow.python.framework.errors_impl.InvalidArgumentError: logits and labels must have the same first dimension, got logits shape [100,256] and labels shape [1]
[[{{node loss/time_distributed_loss/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]]
[[{{node training/TFOptimizer/gradients/embedding/embedding_lookup_grad/Reshape}}]]
From the research I've done, it would appear that this has to do with my use of sparse_categorical_crossentropy; however, if I use categorical_crossentropy I get the following error during training:
ValueError: You are passing a target array of shape (5524624, 1) while using as loss `categorical_crossentropy`. `categorical_crossentropy` expects targets to be binary matrices (1s and 0s) of shape (samples, classes). If your targets are integer classes, you can convert them to the expected format via:
I must be missing something. How can I get my model to train?
Thank you, I would really appreciate any help :))
Edit:
If it could be useful, this is all of my code:
import numpy as np
import tensorflow as tf

SHAKESPEARE_TXT = 'shakespeare.txt'

with open(SHAKESPEARE_TXT, 'r', encoding="utf8") as f:
    raw = f.read()

def encode(txt):
    # drop any non-ascii characters
    output = np.asarray([ord(c) for c in txt if ord(c) < 255 and c != '\r'], dtype=np.int32)
    return output

def decode(txt):
    return [chr(c) for c in txt]

def get_training_data(seq_len, txt=raw):
    source = encode(txt)
    x, y = [], []
    n = len(source) - seq_len
    #n=100
    for i in range(n):
        sequence = source[i: i + seq_len]
        x.append(sequence)
        y.append([source[i + seq_len]])
    return np.asarray(x), np.asarray(y)

# txt = encode(raw)
# print(decode(txt[0:100]))

'''
training_data = get_training_data(seq_len=10)
for i in range(10):
    print(decode(training_data[0][i]), decode(training_data[1][i]))
'''

EMBEDDING_DIM = 512

def generate_model(seq_len=100, stateful=True):
    # Initialize model
    source = tf.keras.Input(
        name='seed', shape=(seq_len,), dtype=tf.int32)

    # Embed ascii character (0 to 255) into one hot encoding (0, 1, 0...)
    embedding = tf.keras.layers.Embedding(input_dim=256, output_dim=EMBEDDING_DIM, input_length=seq_len)(source)

    # Good old LSTM's
    lstm_1 = tf.keras.layers.LSTM(EMBEDDING_DIM, stateful=stateful, return_sequences=True)(embedding)
    lstm_2 = tf.keras.layers.LSTM(EMBEDDING_DIM, stateful=stateful, return_sequences=True)(lstm_1)

    # I honestly don't understand what the TimeDistributed method does
    predicted_char = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(256, activation='softmax'))(lstm_2)

    model = tf.keras.Model(inputs=[source], outputs=[predicted_char])
    model.compile(
        optimizer=tf.train.RMSPropOptimizer(learning_rate=0.01),
        loss='categorical_crossentropy',
        metrics=['categorical_accuracy'])
    return model

def train():
    tf.keras.backend.clear_session()

    print("Creating model")
    training_model = generate_model(seq_len=100, stateful=False)
    '''
    tpu_model = tf.contrib.tpu.keras_to_tpu_model(
        training_model,
        strategy=tf.contrib.tpu.TPUDistributionStrategy(
            tf.contrib.cluster_resolver.TPUClusterResolver(TPU_WORKER)))
    '''

    print("Training")
    data = get_training_data(seq_len=100)
    '''
    print(data[0].shape)
    print(data[0][0])
    print(data[1].shape)
    print(data[1][0])
    '''

    # Start training
    training_model.fit(
        x=data[0],
        y=data[1],
        batch_size=1,
        # steps_per_epoch=100,
        epochs=2
    )

    print("Saving")
    training_model.save_weights('/tmp/bard.h5', overwrite=True)

train()
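For what it's worth, one direction that lines up with the shapes produced by get_training_data() (each input sequence has exactly one target character) is to predict only the next character rather than a per-timestep sequence, so that integer targets of shape (N, 1) match sparse_categorical_crossentropy. A minimal sketch, under that assumption:

# Sketch: single next-character prediction instead of a TimeDistributed output.
def generate_model_single_char(seq_len=100):
    source = tf.keras.Input(name='seed', shape=(seq_len,), dtype=tf.int32)
    embedding = tf.keras.layers.Embedding(input_dim=256, output_dim=EMBEDDING_DIM)(source)
    lstm_1 = tf.keras.layers.LSTM(EMBEDDING_DIM, return_sequences=True)(embedding)
    lstm_2 = tf.keras.layers.LSTM(EMBEDDING_DIM, return_sequences=False)(lstm_1)  # keep only the last step
    predicted_char = tf.keras.layers.Dense(256, activation='softmax')(lstm_2)     # shape (None, 256)
    model = tf.keras.Model(inputs=[source], outputs=[predicted_char])
    model.compile(optimizer='rmsprop',
                  loss='sparse_categorical_crossentropy',
                  metrics=['sparse_categorical_accuracy'])
    return model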