How should I append an element to each sequence in a tf.data.Dataset - tensorflow

I want to produce sequence data with char2int['EOS'] appended to the end of each sequence, using tf.data.Dataset.
The code I wrote is below:
import tensorflow as tf

def _get_generator(list_of_text, char2int):
    def gen():
        for text in list_of_text:
            yield [char2int[x] for x in text]  # transform chars to ints
    return gen

def get_dataset(list_of_text, char2int):
    gen = _get_generator(list_of_text, char2int)
    dataset = tf.data.Dataset.from_generator(gen, (tf.int32), tf.TensorShape([None]))
    dataset = dataset.map(lambda seq: seq + [char2int['EOS']])  # append EOS to the end of each line
    data_iter = dataset.make_initializable_iterator()
    return dataset, data_iter
char2int = {'EOS': 1, 'a': 2, 'b': 3, 'c': 4}
list_of_text = ['aaa', 'abc']  # the sequence data

with tf.Graph().as_default():
    dataset, data_iter = get_dataset(list_of_text, char2int)
    with tf.Session() as sess:
        sess.run(data_iter.initializer)
        tt1 = sess.run(data_iter.get_next())
        tt2 = sess.run(data_iter.get_next())
        print(tt1)  # got [3 3 3] but I want [2 2 2 1]
        print(tt2)  # got [3 4 5] but I want [2 3 4 1]
But I can't get what I want: it performs element-wise addition on each sequence instead. How should I fix it? Thanks.

In your map function you are adding 1 to each value instead of concatenating. You can change your _get_generator to:
def _get_generator(list_of_text, char2int):
    def gen():
        for text in list_of_text:
            yield [char2int[x] for x in text] + [char2int['EOS']]  # transform chars to ints and append EOS
    return gen
and remove the dataset.map() call.

As Vijay points out in his answer, the + operator on a tf.Tensor of type tf.int32 performs addition rather than concatenation. To concatenate an additional symbol onto the end of the sequence, instead use tf.concat() in the Dataset.map():
dataset = dataset.map(lambda seq: tf.concat([seq, [char2int['EOS']]], axis=0))
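For completeness, here is a quick eager-mode sketch of the same fix against TF 2.x (my own check, not from the answers above; it uses the newer output_signature argument of from_generator, available from TF 2.4):
import tensorflow as tf

char2int = {'EOS': 1, 'a': 2, 'b': 3, 'c': 4}
list_of_text = ['aaa', 'abc']

def gen():
    for text in list_of_text:
        yield [char2int[x] for x in text]

dataset = tf.data.Dataset.from_generator(
    gen, output_signature=tf.TensorSpec(shape=[None], dtype=tf.int32))
# tf.concat appends EOS; the + operator would add 1 element-wise instead
dataset = dataset.map(lambda seq: tf.concat([seq, [char2int['EOS']]], axis=0))

for seq in dataset:
    print(seq.numpy())  # [2 2 2 1], then [2 3 4 1]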

Related

How to concatenate two tensors with intervals in tensorflow?

I want to concatenate two tensors checkerboard-ly in tensorflow2, like the examples shown below:
example 1:
a = [[1,1],[1,1]]
b = [[0,0],[0,0]]
concated_a_and_b = [[1,0,1,0],[0,1,0,1]]
example 2:
a = [[1,1,1],[1,1,1],[1,1,1]]
b = [[0,0,0],[0,0,0],[0,0,0]]
concated_a_and_b = [[1,0,1,0,1,0],[0,1,0,1,0,1],[1,0,1,0,1,0]]
Is there a decent way in tensorflow2 to concatenate them like this?
A bit of background for this:
I first split a tensor c with a checkerboard mask into two halves a and b. After some transformations I have to concatenate them back into the original shape and order.
What I mean by checkerboard-ly:
Step 1: Generate a matrix with alternated values
You can do this by first concatenating into [1, 0] pairs, and then by applying a final reshape.
Step 2: Reverse some rows
I split the matrix into two parts, reverse the second part, and then rebuild the full matrix by picking alternately from the first and second parts.
Code sample:
import math
import numpy as np
import tensorflow as tf
a = tf.ones(shape=(3, 4))
b = tf.zeros(shape=(3, 4))
x = tf.expand_dims(a, axis=-1)
y = tf.expand_dims(b, axis=-1)
paired_ones_zeros = tf.concat([x, y], axis=-1)
alternated_values = tf.reshape(paired_ones_zeros, [-1, a.shape[1] + b.shape[1]])
num_samples = alternated_values.shape[0]
middle = math.ceil(num_samples / 2)
is_num_samples_odd = middle * 2 != num_samples
# Gather first part of the matrix, don't do anything to it
first_elements = tf.gather_nd(alternated_values, [[index] for index in range(middle)])
# Gather second part of the matrix and reverse its elements
second_elements = tf.reverse(tf.gather_nd(alternated_values, [[index] for index in range(middle, num_samples)]), axis=[1])
# Pick alternately between the first and second part of the matrix
indices = np.concatenate([[[index], [index + middle]] for index in range(middle)], axis=0)
if is_num_samples_odd:
    indices = indices[:-1]
output = tf.gather_nd(
    tf.concat([first_elements, second_elements], axis=0),
    indices
)
print(output)
I know this is not a decent way, as it hurts time and space complexity, but it solves the above problem:
def concat(tf1, tf2):
    result = []
    for index, (tf_item1, tf_item2) in enumerate(zip(tf1, tf2)):
        item = []
        for subitem1, subitem2 in zip(tf_item1, tf_item2):
            if index % 2 == 0:
                item.append(subitem1)
                item.append(subitem2)
            else:
                item.append(subitem2)
                item.append(subitem1)
        result.append(item)
    return result
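For what it's worth, here is a vectorized sketch of mine that builds both interleavings with stack + reshape and then picks one per row, avoiding the Python loops entirely:
import tensorflow as tf

a = tf.ones(shape=(3, 4))
b = tf.zeros(shape=(3, 4))

# Stacking on a new last axis gives shape (rows, cols, 2); reshaping
# flattens each (a_i, b_i) pair, producing [a0, b0, a1, b1, ...] per row.
ab = tf.reshape(tf.stack([a, b], axis=-1), (tf.shape(a)[0], -1))  # rows starting with a
ba = tf.reshape(tf.stack([b, a], axis=-1), (tf.shape(a)[0], -1))  # rows starting with b

# Even rows start with a, odd rows start with b.
even_row = tf.math.equal(tf.range(tf.shape(a)[0]) % 2, 0)
output = tf.where(even_row[:, None], ab, ba)
print(output)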

Keras custom layer on ragged tensor to reduce dimensionality

I'm trying to write a custom layer that will handle variable-length vectors and reduce them to vectors of the same length.
The possible lengths are known in advance, because the reason for the variable lengths is that I have several different data types, each encoded with a different number of features.
In a sense, it is similar to an Embedding layer, only for numerical values.
I've tried using padding, but the results were bad, so I'm trying this approach instead.
So, for example let's say I have 3 data types, which I encode with 3, 4, 6 length vectors.
arr = [
    # example one (data type 1 [len()==3], data type 3 [len()==6]) - force values as floats
    [[1.0, 2.0, 3], [1, 2, 3, 4, 5, 6]],
    # example two (data type 2 [len()==4], data type 3 [len()==6]) - force values as floats
    [[1.0, 2, 3, 4], [1, 2, 3, 4, 5, 6]],
]
I tried implementing a custom layer like:
class DimensionReducer(tf.keras.layers.Layer):
    def __init__(self, output_dim, expected_lengths):
        super(DimensionReducer, self).__init__()
        self._supports_ragged_inputs = True
        self.output_dim = output_dim
        for l in expected_lengths:
            setattr(self, f'w_{l}', self.add_weight(shape=(l, self.output_dim), initializer='random_normal', trainable=True))
            setattr(self, f'b_{l}', self.add_weight(shape=(self.output_dim,), initializer='random_normal', trainable=True))

    def call(self, inputs):
        print(inputs.shape)
        # batch
        if len(inputs.shape) == 3:
            print("batch")
            result = []
            for i, x in enumerate(inputs):
                _result = []
                for v in x:
                    l = len(v)
                    print(l)
                    print(v)
                    w = getattr(self, f'w_{l}')
                    b = getattr(self, f'b_{l}')
                    out = tf.matmul([v], w) + b
                    _result.append(out)
                result.append(tf.concat(_result, 0))
            r = tf.stack(result)
            print("batch output:", r.shape)
            return r
Which seems to work when called directly:
dim = DimensionReducer(3, [3,4,6])
dim(tf.ragged.constant(arr))
But when I try to incorporate it into a model, it fails:
import tensorflow as tf
val_ragged = tf.ragged.constant(arr)
inputs_ragged = tf.keras.layers.Input(shape=(None,None), ragged=True)
outputs_ragged = DimensionReducer(3, [3,4,6])(inputs_ragged)
model_ragged = tf.keras.Model(inputs=inputs_ragged, outputs=outputs_ragged)
# this one with a RaggedTensor doesn't work
print(model_ragged(val_ragged))
With
AttributeError: 'DimensionReducer' object has no attribute 'w_Tensor("dimension_reducer_98/strided_slice:0", shape=(), dtype=int32)'
I'm not sure how I am supposed to implement such a layer, or what I'm doing wrong.
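No answer is included here, but the error message itself points at the likely cause (my reading, not a confirmed fix): when Keras traces the layer into a graph, len(v) evaluates to a scalar Tensor rather than a Python int, so f'w_{l}' builds an attribute name containing the Tensor's repr, which was never registered. A minimal sketch of that failure mode:
import tensorflow as tf

@tf.function
def demo(v):
    l = tf.shape(v)[0]                    # a scalar Tensor during tracing
    print(f"attribute looked up: w_{l}")  # Python print runs at trace time
    return v

demo(tf.constant([1.0, 2.0, 3.0]))
# prints something like:
# attribute looked up: w_Tensor("strided_slice:0", shape=(), dtype=int32)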

Finding loss mask of variable length in keras tensorflow

I'm trying to build a loss function which captures the functionality below, masking the output values once 'end of sequence' is encountered.
Given a tensor of shape [BatchSize, MaxSequenceLength, OutputNodes], consider the example below:
Batch Size = 3
Max Sequence Length = 4
Output Nodes = 3
predicted = [[[0.1,0.3,0.2],[0.4,0.6,0.8],[0.5,0.2,0.3],[0.0,0.0,0.99]],
             [[0.1,0.3,0.2],[0.4,0.9,0.8],[0.5,0.2,0.9],[0.4,0.6,0.8]],
             [[0.1,0.3,0.2],[0.4,0.9,0.8],[0.5,0.2,0.1],[0.4,0.6,0.1]]]
I am dedicating the last output node to symbolise the 'end of sequence' (EOS), here node = 2. Nodes are labelled 0, 1, and 2.
Based on the predicted value, I have to return a mask which tries to find the first occurrence of EOS.
In the above example,
The first row has the following sequence (argmax) => 1,2,0,2
The second row has the following sequence => 1,1,2,2
The third row has the following sequence => 1,1,0,1
So my mask should be
[[1,0,0,0],
 [1,1,0,0],
 [1,1,1,1]]
The mask will ensure that the values after the EOS are ignored, i.e. not considered when calculating the loss.
Below is the code snippet I tried:
sequence_cluster_asign = keras.backend.argmax(sequence_values, axis=-1)
loss_mask = []
for seq in K.tf.unstack(sequence_cluster_asign):
    # append EOS - to make sure tf.where is not empty
    seq = tf.concat([seq, endOfSequenceTensor], axis=0)
    endOfSequenceLocation = K.tf.where(K.tf.equal(seq, endOfSequence))[0][0]
    loss_mask.append(tf.sequence_mask(endOfSequenceLocation, max_decoder_seq_length, dtype=tf.float32))
final_mask = K.stack(loss_mask)
Error encountered: ValueError: Cannot infer num from shape (?,?)
If you want to get the mask described in your question, you can use the following method:
import tensorflow as tf
import keras
from keras import backend as K
sequence_values = K.placeholder(shape=(None, 4, 3))
sequence_cluster_asign = keras.backend.argmax(sequence_values,axis=-1)
# keras version
result = K.cast(K.less(sequence_cluster_asign,sequence_values.get_shape().as_list()[-1]-1),dtype='int32')
result = K.cumprod(result,axis=-1)
# tensorflow version
# result = tf.cast(tf.less(sequence_cluster_asign,sequence_values.get_shape().as_list()[-1]-1),dtype=tf.int32)
# result = tf.cumprod(result,axis=-1)
predicted = [[[0.1,0.3,0.2],[0.4,0.6,0.8],[0.5,0.2,0.3],[0.0,0.0,0.99]],
             [[0.1,0.3,0.2],[0.4,0.9,0.8],[0.5,0.2,0.9],[0.4,0.6,0.8]],
             [[0.1,0.3,0.2],[0.4,0.9,0.8],[0.5,0.2,0.1],[0.4,0.6,0.1]]]

with tf.Session() as sess:
    print(result.eval(feed_dict={sequence_values: predicted}))
[[1 0 0 0]
 [1 1 0 0]
 [1 1 1 1]]
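The same cumprod trick works eagerly on TF 2.x; here is a minimal sketch of mine with the numbers from the question:
import tensorflow as tf

predicted = tf.constant(
    [[[0.1,0.3,0.2],[0.4,0.6,0.8],[0.5,0.2,0.3],[0.0,0.0,0.99]],
     [[0.1,0.3,0.2],[0.4,0.9,0.8],[0.5,0.2,0.9],[0.4,0.6,0.8]],
     [[0.1,0.3,0.2],[0.4,0.9,0.8],[0.5,0.2,0.1],[0.4,0.6,0.1]]])

eos = predicted.shape[-1] - 1            # EOS is the last output node
cluster = tf.argmax(predicted, axis=-1)  # [batch, time]
# 1 until the first EOS prediction, 0 from there on
mask = tf.math.cumprod(tf.cast(cluster < eos, tf.int32), axis=-1)
print(mask.numpy())
# [[1 0 0 0]
#  [1 1 0 0]
#  [1 1 1 1]]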

How to reverse the tf.image.per_image_standardization() function in tensorflow?

tf.image.per_image_standardization() in TensorFlow rescales each image to zero mean and unit variance, which helps avoid exploding gradients when training a deep learning model. But when we want to display the image array, how do we revert this z-score normalization step in TensorFlow?
By "display the image array" I assume you mean to display it in tensorboard. If this is the case then you don't need to do anything, tensorboard can handle images that have been standardized. If you want the original value for any other purpose why not just use the variable before you standardized it, such as:
img = tf.placeholder(...)
img_std = tf.image.per_image_standardization(img)
You can work with either img or img_std in any way you see fit.
If you somehow have a use case for denormalizing the standardized image that isn't covered above then you would need to compute the mean and standard deviation yourself, then multiply by the standard deviation and add the mean. Note that tf.image.per_image_standardization uses an adjusted_stddev that is defined in the documentation as:
adjusted_stddev = max(stddev, 1.0/sqrt(image.NumElements()))
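Here is a short TF 2.x sketch of that manual route (mine; in practice you would save mean and adjusted_stddev at standardization time, they are recomputed here only to verify the round trip):
import tensorflow as tf

img = tf.random.uniform((32, 32, 3))
std_img = tf.image.per_image_standardization(img)

# Recompute the statistics the op used, per the documented formula
n = tf.cast(tf.size(img), tf.float32)
mean = tf.reduce_mean(img)
adjusted_stddev = tf.maximum(tf.math.reduce_std(img), 1.0 / tf.sqrt(n))

recovered = std_img * adjusted_stddev + mean
print(tf.reduce_max(tf.abs(recovered - img)).numpy())  # ~0, up to float error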
The tf.image.per_image_standardization() layer will create some internal variables you can use to recover the original data. Please note that this is undocumented behavior and not guaranteed to stay the same. Still, for now, you can use the code below (tested) for reference how to get the relevant tensors and recover the original data:
import tensorflow as tf
import numpy as np

img_size = 3
a = tf.placeholder(shape=(img_size, img_size, 1), dtype=tf.float32)
b = tf.image.per_image_standardization(a)

with tf.Session() as sess:
    tensors, tensor_names = [], []
    for l in sess.graph.get_operations():
        tensors.append(sess.graph.get_tensor_by_name(l.name + ":0"))
        tensor_names.append(l.name)
    #mean_t = sess.graph.get_tensor_by_name("per_image_standardization/Mean:0")
    #stddev_t = sess.graph.get_tensor_by_name("per_image_standardization/Sqrt:0")
    foobar = np.reshape(np.array(range(img_size * img_size), dtype=np.float32), (img_size, img_size, 1))
    res = sess.run(tensors, feed_dict={a: foobar})
    #for i in range(len(res)):
    #    print(i, tensor_names[i] + ":")
    #    print(res[i])
    #    print()
    mean = res[6]           # "per_image_standardization/Mean:0"
    stddev = res[13]        # "per_image_standardization/Sqrt:0" - the adjusted standard deviation
    standardized = res[18]  # "per_image_standardization:0"
    original = standardized * stddev + mean
    print(original)
You can uncomment the mean_t and stddev_t lines to get references to the relevant tensors by name. (That needs a small rewrite of the sess.run() part.) You can uncomment the four lines starting with for i in range(... (no rewrite needed) to print all the created tensors for your edification. :)
The above code, as is, outputs:
[[[0.]
  [1.]
  [2.]]

 [[3.]
  [4.]
  [5.]]

 [[6.]
  [7.]
  [8.]]]
Which is exactly the data that was fed to the network.

Split a dataset created by Tensorflow dataset API in to Train and Test?

Does anyone know how to split a dataset created by the dataset API (tf.data.Dataset) in TensorFlow into test and train?
Assuming you have an all_dataset variable of type tf.data.Dataset:
test_dataset = all_dataset.take(1000)
train_dataset = all_dataset.skip(1000)
The test dataset now has the first 1000 elements and the rest go to training.
You may use Dataset.take() and Dataset.skip():
train_size = int(0.7 * DATASET_SIZE)
val_size = int(0.15 * DATASET_SIZE)
test_size = int(0.15 * DATASET_SIZE)
full_dataset = tf.data.TFRecordDataset(FLAGS.input_file)
full_dataset = full_dataset.shuffle(DATASET_SIZE)
train_dataset = full_dataset.take(train_size)
test_dataset = full_dataset.skip(train_size)
val_dataset = test_dataset.skip(val_size)
test_dataset = test_dataset.take(test_size)
For more generality, I gave an example using a 70/15/15 train/val/test split, but if you don't need a test or a val set, just ignore the last 2 lines.
Take:
Creates a Dataset with at most count elements from this dataset.
Skip:
Creates a Dataset that skips count elements from this dataset.
You may also want to look into Dataset.shard():
Creates a Dataset that includes only 1/num_shards of this dataset.
Disclaimer: I stumbled upon this question after answering this one, so I thought I'd spread the love.
Most of the answers here use take() and skip(), which requires knowing the size of your dataset beforehand. This isn't always possible, or is difficult/intensive to ascertain.
Instead, what you can do is essentially slice the dataset up so that 1 in every N records becomes a validation record.
To accomplish this, lets start with a simple dataset of 0-9:
dataset = tf.data.Dataset.range(10)
# [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Now for our example, we're going to slice it so that we have a 3/1 train/validation split. Meaning 3 records will go to training, then 1 record to validation, then repeat.
split = 3
dataset_train = dataset.window(split, split + 1).flat_map(lambda ds: ds)
# [0, 1, 2, 4, 5, 6, 8, 9]
dataset_validation = dataset.skip(split).window(1, split + 1).flat_map(lambda ds: ds)
# [3, 7]
So the first dataset.window(split, split + 1) says to grab split (3) elements, then advance split + 1 elements, and repeat. That + 1 effectively skips the 1 element we're going to use in our validation dataset.
The flat_map(lambda ds: ds) is because window() returns the results in batches, which we don't want. So we flatten it back out.
Then for the validation data we first skip(split), which skips over the first split (3) elements that were grabbed in the first training window, so we start our iteration on the 4th element. window(1, split + 1) then grabs 1 element, advances split + 1 (4), and repeats.
 
Note on nested datasets:
The above example works well for simple datasets, but flat_map() will generate an error if the dataset is nested. To address this, you can swap out the flat_map() with a more complicated version that can handle both simple and nested datasets:
.flat_map(lambda *ds: ds[0] if len(ds) == 1 else tf.data.Dataset.zip(ds))
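For instance, on a zipped (features, labels) dataset the nested-safe version re-zips each window before flattening (a sketch of mine):
import tensorflow as tf

features = tf.data.Dataset.range(10)
labels = tf.data.Dataset.range(10, 20)
ds = tf.data.Dataset.zip((features, labels))

split = 3
# Windowing a zipped dataset yields tuples of window datasets,
# so the map function re-zips them before flattening.
train = ds.window(split, split + 1).flat_map(
    lambda *w: w[0] if len(w) == 1 else tf.data.Dataset.zip(w))
for x, y in train:
    print(x.numpy(), y.numpy())  # (0, 10), (1, 11), (2, 12), (4, 14), ...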
@ted's answer will cause some overlap. Try this:
train_ds_size = int(0.64 * full_ds_size)
valid_ds_size = int(0.16 * full_ds_size)
train_ds = full_ds.take(train_ds_size)
remaining = full_ds.skip(train_ds_size)
valid_ds = remaining.take(valid_ds_size)
test_ds = remaining.skip(valid_ds_size)
Use the code below to test:
tf.enable_eager_execution()
dataset = tf.data.Dataset.range(100)
train_size = 20
valid_size = 30
test_size = 50
train = dataset.take(train_size)
remaining = dataset.skip(train_size)
valid = remaining.take(valid_size)
test = remaining.skip(valid_size)
for i in train:
    print(i)
for i in valid:
    print(i)
for i in test:
    print(i)
As of now, TensorFlow doesn't contain any tools for that.
You could use sklearn.model_selection.train_test_split to generate train/eval/test datasets, then create a tf.data.Dataset for each.
You can use shard:
dataset = dataset.shuffle(buffer_size)  # optional; note that shuffle() requires a buffer size
trainset = dataset.shard(2, 0)
testset = dataset.shard(2, 1)
See:
https://www.tensorflow.org/api_docs/python/tf/data/Dataset#shard
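For a quick intuition of what shard does here (my sketch): shard(2, i) keeps every second element starting at offset i, so the two shards are deterministic and disjoint:
import tensorflow as tf

ds = tf.data.Dataset.range(6)
print(list(ds.shard(2, 0).as_numpy_iterator()))  # [0, 2, 4]
print(list(ds.shard(2, 1).as_numpy_iterator()))  # [1, 3, 5]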
The upcoming TensorFlow 2.10.0 will have a tf.keras.utils.split_dataset function, see the rc3 release notes:
Added tf.keras.utils.split_dataset utility to split a Dataset object or a list/tuple of arrays into two Dataset objects (e.g. train/test).
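Based on the documented signature, usage should look roughly like the sketch below (left_size is the fraction that goes to the first returned dataset):
import tensorflow as tf

ds = tf.data.Dataset.range(10)
train_ds, test_ds = tf.keras.utils.split_dataset(
    ds, left_size=0.8, shuffle=True, seed=42)
print(len(list(train_ds)), len(list(test_ds)))  # 8 2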
In case size of the dataset is known:
from typing import Tuple
import tensorflow as tf
def split_dataset(dataset: tf.data.Dataset,
                  dataset_size: int,
                  train_ratio: float,
                  validation_ratio: float) -> Tuple[tf.data.Dataset, tf.data.Dataset, tf.data.Dataset]:
    assert (train_ratio + validation_ratio) < 1

    train_count = int(dataset_size * train_ratio)
    validation_count = int(dataset_size * validation_ratio)
    test_count = dataset_size - (train_count + validation_count)

    dataset = dataset.shuffle(dataset_size)

    train_dataset = dataset.take(train_count)
    validation_dataset = dataset.skip(train_count).take(validation_count)
    test_dataset = dataset.skip(validation_count + train_count).take(test_count)

    return train_dataset, validation_dataset, test_dataset
Example:
size_of_ds = 1001
train_ratio = 0.6
val_ratio = 0.2
ds = tf.data.Dataset.from_tensor_slices(list(range(size_of_ds)))
train_ds, val_ds, test_ds = split_dataset(ds, size_of_ds, train_ratio, val_ratio)
A robust way to split a dataset into two parts is to first deterministically map every item in the dataset into a bucket with, for example, tf.strings.to_hash_bucket_fast. Then you can split the dataset in two by filtering on the bucket. If you split your data into five buckets, you get an 80-20 split, assuming the hashing distributes the items evenly.
As an example, assume that your dataset contains dictionaries with key filename. We split the data into five buckets based on this key. With this add_fold function, we add the key "fold" in the dictionaries:
def add_fold(buckets: int):
    def add_(sample, label):
        fold = tf.strings.to_hash_bucket(sample["filename"], num_buckets=buckets)
        return {**sample, "fold": fold}, label
    return add_

dataset = dataset.map(add_fold(buckets=5))
Now we can split the dataset into two disjoint datasets with Dataset.filter:
def pick_fold(fold: int):
    def filter_fn(sample, _):
        return tf.math.equal(sample["fold"], fold)
    return filter_fn

def skip_fold(fold: int):
    def filter_fn(sample, _):
        return tf.math.not_equal(sample["fold"], fold)
    return filter_fn

train_dataset = dataset.filter(skip_fold(0))
val_dataset = dataset.filter(pick_fold(0))
The key that you use for hashing should be one that captures the correlations in the dataset. For example, if samples collected by the same person are correlated and you want all samples from the same collector to end up in the same bucket (and the same split), you should use the collector name or ID as the hashing column.
Of course, you can skip the part with dataset.map and do the hashing and filtering in one filter function. Here's a full example:
dataset = tf.data.Dataset.from_tensor_slices([f"value-{i}" for i in range(10000)])

def to_bucket(sample):
    return tf.strings.to_hash_bucket_fast(sample, 5)

def filter_train_fn(sample):
    return tf.math.not_equal(to_bucket(sample), 0)

def filter_val_fn(sample):
    return tf.math.logical_not(filter_train_fn(sample))

train_ds = dataset.filter(filter_train_fn)
val_ds = dataset.filter(filter_val_fn)

print(f"Length of training set: {len(list(train_ds.as_numpy_iterator()))}")
print(f"Length of validation set: {len(list(val_ds.as_numpy_iterator()))}")
This prints:
Length of training set: 7995
Length of validation set: 2005
Can't comment, but the above answer has overlap and is incorrect. Set BUFFER_SIZE to DATASET_SIZE for a perfect shuffle. Try different val/test sizes to verify. The answer should be:
DATASET_SIZE = tf.data.experimental.cardinality(full_dataset).numpy()
train_size = int(0.7 * DATASET_SIZE)
val_size = int(0.15 * DATASET_SIZE)
test_size = int(0.15 * DATASET_SIZE)
full_dataset = full_dataset.shuffle(BUFFER_SIZE, reshuffle_each_iteration=False)  # keep one fixed shuffle so take()/skip() stay disjoint
train_dataset = full_dataset.take(train_size)
test_dataset = full_dataset.skip(train_size)
val_dataset = test_dataset.take(val_size)
test_dataset = test_dataset.skip(val_size)