How to split the dataset into inputs and labels in TensorFlow?

Consider the code below. I want to split the tensorflow.python.data.ops.dataset_ops.BatchDataset into inputs and labels using the function below, yet I get the error 'BatchDataset' object is not subscriptable. Can anyone help me with that?
import tensorflow as tf

input_slice = 3
labels_slice = 2

def split_window(features):
    inputs = features[:, input_slice, :]
    labels = features[:, labels_slice, :]

# create a batch dataset
dataset = tf.data.Dataset.range(1, 25 + 1).batch(5)
# split the dataset into inputs and labels
dataset = split_window(dataset)
The dataset without the split window looks like this:
tf.Tensor([1 2 3 4 5], shape=(5,), dtype=int64)
tf.Tensor([ 6 7 8 9 10], shape=(5,), dtype=int64)
tf.Tensor([11 12 13 14 15], shape=(5,), dtype=int64)
tf.Tensor([16 17 18 19 20], shape=(5,), dtype=int64)
tf.Tensor([21 22 23 24 25], shape=(5,), dtype=int64)
But what I want is to display the inputs and labels like this:
Inputs:
[1 2 3 ]
[ 6 7 8 ]
[11 12 13 ]
[16 17 18 ]
[21 22 23 ]
Labels:
[4 5]
[9 10]
[14 15]
[19 20]
[24 25]

You can try this:
import tensorflow as tf

input_slice = 3
labels_slice = 2

def split_window(x):
    features = tf.slice(x, [0], [input_slice])
    labels = tf.slice(x, [input_slice], [labels_slice])
    return features, labels

dataset = tf.data.Dataset.range(1, 25 + 1).batch(5).map(split_window)

for i, j in dataset:
    print(i.numpy(), end="->")
    print(j.numpy())
[1 2 3]->[4 5]
[6 7 8]->[ 9 10]
[11 12 13]->[14 15]
[16 17 18]->[19 20]
[21 22 23]->[24 25]
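Note that inside .map() each batch is an ordinary tensor, so plain Python slicing works as well; a minimal equivalent sketch (same input_slice and labels_slice as above):

# Equivalent sketch: tensors passed to .map() support standard slicing,
# so tf.slice is not strictly required here.
dataset = tf.data.Dataset.range(1, 25 + 1).batch(5).map(
    lambda x: (x[:input_slice], x[input_slice:input_slice + labels_slice]))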

You can't apply a plain Python function directly to a tf.data.Dataset; you need to use the .map() method. Also, your function returns nothing.
import tensorflow as tf

input_slice = 3
labels_slice = 2

def split_window(features):
    inputs = tf.gather_nd(features, [input_slice])
    labels = tf.gather_nd(features, [labels_slice])
    return inputs, labels

dataset = tf.data.Dataset.range(1, 25 + 1).batch(5).map(split_window)

for x, y in dataset:
    print(x.numpy(), y.numpy())
4 3
9 8
14 13
19 18
24 23
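Note that tf.gather_nd with a single index such as [input_slice] picks one element per batch, which is why the output above is pairs of scalars rather than the ranges shown in the question. A variant sketch (not the answer's code) that gathers contiguous index ranges instead:

# Gather index ranges so the outputs match the question's [1 2 3] -> [4 5] pattern.
def split_window(features):
    inputs = tf.gather(features, tf.range(input_slice))                              # indices 0..2
    labels = tf.gather(features, tf.range(input_slice, input_slice + labels_slice))  # indices 3..4
    return inputs, labels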

Related

Conditional sum of two 3D arrays; condition is a 1D array

I want to conditionally add two 2D NumPy arrays of shape (N, 3), where the condition is specified as a 1D array of shape (N,).
What's an efficient (vectorized) way to do this? numpy.where() appears to require that all three arrays (including the condition) have matching dimensions.
For instance:
a = np.asarray([[1, 1], [2, 2], [3, 3], [4, 4]])
b = np.asarray([[0.2, 0.2], [0.3, 0.3], [0.4, 0.4], [0.5, 0.5]])
c = np.asarray([0, 1, 1, 0])
I would like to be able to do:
np.where(c == 1, a + b, a)
i.e., do an element-wise add of a + b as long as the corresponding element in c is equal to 1 at the same index in the array.
However, I get an error instead:
ValueError: operands could not be broadcast together with shapes (4,) (4,3) (4,3)
Use boolean indexing:
import numpy as np
a = np.random.randint(20, 30, size=(5, 3))
#[[23 29 23]
# [20 27 24]
# [28 26 26]
# [27 20 26]
# [23 24 23]]
b = np.random.randint(20, 30, size=(5, 3))
#[[22 25 20]
# [28 29 20]
# [29 22 29]
# [28 28 21]
# [22 26 27]]
c = np.random.randint(0, 2, size=5).astype(bool)
# [ True True True False False]
r = a + b
r[~c] = a[~c] # keeping the default value if the corresponding value is 0 (or False) in c.
print(r)
[[45 54 43]
[48 56 44]
[57 48 55]
[27 20 26]
[23 24 23]]
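If you'd rather keep np.where, broadcasting the 1D condition against the 2D arrays also works; a minimal sketch reusing the variables above:

# Add a trailing axis so c (shape (5,)) broadcasts against a and b (shape (5, 3)).
r = np.where(c[:, None], a + b, a)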

Numpy 3D Array dimensions and slicing

I have trouble understanding the Numpy representation for 3D arrays. I'm used to it being (rows, columns, depth) but with Numpy it seems to be (depth, rows, columns).
E.g.:
c = np.arange(12)
c = c.reshape(2,3,2)
d = c
d = d.reshape(2,2,3)
print(c)
print(d)
c:
[[[ 0 1]
[ 2 3]
[ 4 5]]
[[ 6 7]
[ 8 9]
[10 11]]]
d:
[[[ 0 1 2]
[ 3 4 5]]
[[ 6 7 8]
[ 9 10 11]]]
d is the representation I wish for. Now if I want to access the second 2D array I can write:
print(d[1,:,:])
[[ 6 7 8]
[ 9 10 11]]
So why does NumPy use this seemingly unintuitive representation? And how would I access all odd-indexed (the first, 3rd, 5th, ...) 2D arrays of a 3D array, given both representations?
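For the slicing half of the question, a step of 2 along the first axis selects the first, third, fifth, ... 2D blocks in either representation; a minimal sketch:

# Every other 2D block along axis 0, i.e. indices 0, 2, 4, ...
print(c[::2])
print(d[::2])  # the same pattern works for both representations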

The shape of the predicted_ids in the outputs of `tf.contrib.seq2seq.BeamSearchDecoder`

What is the shape of the contents in the outputs of tf.contrib.seq2seq.BeamSearchDecoder? I know that it is an instance of class BeamSearchDecoderOutput(scores, predicted_ids, parent_ids), but what are the shapes of scores, predicted_ids, and parent_ids?
I wrote the following toy code to explore it a little bit myself.
import tensorflow as tf

tgt_vocab_size = 20
embedding_decoder = tf.one_hot(list(range(0, tgt_vocab_size)), tgt_vocab_size)
batch_size = 2
start_tokens = tf.fill([batch_size], 0)
end_token = 1
beam_width = 3
num_units = 18

decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)
encoder_outputs = decoder_cell.zero_state(batch_size, dtype=tf.float32)
tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(encoder_outputs, multiplier=beam_width)

my_decoder = tf.contrib.seq2seq.BeamSearchDecoder(cell=decoder_cell,
                                                  embedding=embedding_decoder,
                                                  start_tokens=start_tokens,
                                                  end_token=end_token,
                                                  initial_state=tiled_encoder_outputs,
                                                  beam_width=beam_width)

# dynamic decoding
outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(
    my_decoder, maximum_iterations=4, output_time_major=True)

final_predicted_ids = outputs.predicted_ids
scores = outputs.beam_search_decoder_output.scores
predicted_ids = outputs.beam_search_decoder_output.predicted_ids
parent_ids = outputs.beam_search_decoder_output.parent_ids

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    final_predicted_ids_vals = sess.run(final_predicted_ids)
    print("final_predicted_ids shape:")
    print(final_predicted_ids_vals.shape)
    print("final_predicted_ids_vals: \n%s" % final_predicted_ids_vals)
    print("scores shape:")
    print(sess.run(scores).shape)
    print("scores values: \n %s" % sess.run(scores))
    print("predicted_ids shape: ")
    print(sess.run(predicted_ids).shape)
    print("predicted_ids values: \n %s" % sess.run(predicted_ids))
    print("parent_ids shape:")
    print(sess.run(parent_ids).shape)
    print("parent_ids values: \n %s" % sess.run(parent_ids))
The output is as follows:
final_predicted_ids shape:
(4, 2, 3)
final_predicted_ids_vals:
[[[ 1 8 8]
[ 1 8 8]]
[[ 1 13 13]
[ 1 13 13]]
[[ 1 13 13]
[ 1 13 13]]
[[ 1 13 2]
[ 1 13 2]]]
scores shape:
(4, 2, 3)
scores values:
[[[ -2.8376358 -2.843168 -2.8478816]
[ -2.8376358 -2.843168 -2.8478816]]
[[ -2.8478816 -5.655898 -5.6810265]
[ -2.8478816 -5.655898 -5.6810265]]
[[ -2.8478816 -8.478384 -8.495466 ]
[ -2.8478816 -8.478384 -8.495466 ]]
[[ -2.8478816 -11.292251 -11.307263 ]
[ -2.8478816 -11.292251 -11.307263 ]]]
predicted_ids shape:
(4, 2, 3)
predicted_ids values:
[[[ 8 13 1]
[ 8 13 1]]
[[ 1 13 13]
[ 1 13 13]]
[[ 1 13 12]
[ 1 13 12]]
[[ 1 13 2]
[ 1 13 2]]]
parent_ids shape:
(4, 2, 3)
parent_ids values:
[[[0 0 0]
[0 0 0]]
[[2 0 1]
[2 0 1]]
[[0 1 1]
[0 1 1]]
[[0 1 1]
[0 1 1]]]
The output of tf.contrib.seq2seq.dynamic_decode(BeamSearchDecoder) is actually an instance of class FinalBeamSearchDecoderOutput, which consists of:
predicted_ids: Final outputs returned by the beam search after all decoding is finished. A tensor of shape [batch_size, num_steps, beam_width] (or [num_steps, batch_size, beam_width] if output_time_major is True). Beams are ordered from best to worst.
beam_search_decoder_output: An instance of BeamSearchDecoderOutput that describes the state of the beam search.
So to get final predictions/translations of shape [beam_width, batch_size, num_steps], transpose with perm=[2, 0, 1] (when output_time_major=False), or simply tf.transpose(final_predicted_ids) (when output_time_major=True, since reversing [num_steps, batch_size, beam_width] gives exactly that shape).
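For the toy example above (which uses output_time_major=True), that amounts to the following sketch:

# predicted_ids has shape (num_steps=4, batch_size=2, beam_width=3) here;
# reversing the axes gives (beam_width, batch_size, num_steps).
ordered_predictions = tf.transpose(outputs.predicted_ids)  # same as perm=[2, 1, 0]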

Weird behavior of multiply in tensorflow

I am trying to use multiply in my program, but I find that this op behaves abnormally: it seems to be calculating the wrong results. Minimal example:
import tensorflow as tf

batchSize = 2
maxSteps = 3
max_cluster_size = 4

x = tf.Variable(tf.random_uniform(dtype=tf.int32, maxval=20, shape=[batchSize, maxSteps, max_cluster_size]))
y = tf.sequence_mask(tf.random_uniform(minval=1, maxval=max_cluster_size - 1, dtype=tf.int32,
                                       shape=[batchSize, maxSteps]), maxlen=max_cluster_size)
y = tf.cast(y, tf.int32)
z = tf.multiply(x, y)

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    x_v = sess.run(x)
    y_v = sess.run(y)
    z_v = sess.run(z)
    print(x_v.shape)
    print(x_v)
    print('----------------------------')
    print(y_v.shape)
    print(y_v)
    print('----------------------------')
    print(z_v.shape)
    print(z_v)
    print('----------------------------')
Result:
(2, 3, 4)
[[[ 7 12 19 3]
[10 18 15 7]
[18 9 2 7]]
[[ 4 5 16 1]
[ 2 14 15 14]
[ 5 18 8 18]]]
----------------------------
(2, 3, 4)
[[[1 1 0 0]
[1 0 0 0]
[1 1 0 0]]
[[1 1 0 0]
[1 1 0 0]
[1 1 0 0]]]
----------------------------
(2, 3, 4)
[[[ 7 12 0 0]
[10 0 0 0]
[18 0 0 0]]
[[ 4 5 0 0]
[ 2 0 0 0]
[ 5 0 0 0]]]
----------------------------
Where z_v is expected to be:
[[[ 7 12 0 0]
[10 0 0 0]
[18 9 0 0]]
[[ 4 5 0 0]
[ 2 14 0 0]
[ 5 18 0 0]]]
When I test multiply in other programs, it works just fine.
I suspect this may be related to x and y being random. Can anyone give a hint on this?
Instead of these lines:
x_v = sess.run(x)
y_v = sess.run(y)
z_v = sess.run(z)
you need to use this:
x_v, y_v, z_v = sess.run([x, y, z])
With the first, separate version, each sess.run() re-executes the fetched tensor's dependencies; in particular, the stateful random op behind y draws a new mask when you run sess.run(z), so the z_v you see is computed from a different y than the y_v you printed.
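A minimal illustration of the underlying behaviour (assuming TF 1.x graph mode): stateful random ops are re-executed on every sess.run(), so separate calls see independent draws.

import tensorflow as tf

# Each sess.run() re-executes the random op and draws a fresh value.
r = tf.random_uniform(shape=[], maxval=10, dtype=tf.int32)
with tf.Session() as sess:
    print(sess.run(r), sess.run(r))  # two independent draws, usually different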

Tensorflow: how to make sure all samples in each batch are with the same label?

I wonder whether there are ways to apply constraints on the batches generated in TensorFlow. For example, let's say we are training a CNN on a huge dataset to do image classification. Is it possible to force TensorFlow to generate batches where all samples belong to the same class? Like one batch of images all tagged "Apple", and another where all samples are tagged "Orange".
The reason I ask is that I want to run some experiments on how different levels of shuffling influence the final trained models. Sample-level shuffling is common practice for CNN training, and everybody does it. I just want to check it myself, to get more vivid, first-hand knowledge of it.
Thanks!
Dataset.filter() can be used:
import numpy as np
import tensorflow as tf

labels = np.random.randint(0, 10, (10000))
data = np.random.uniform(size=(10000, 5))

ds = tf.data.Dataset.from_tensor_slices((data, labels))
ds = ds.filter(lambda data, labels: tf.equal(labels, 1))  # comment this line out for the unfiltered case
ds = ds.batch(5)
iterator = ds.make_one_shot_iterator()
vals = iterator.get_next()

with tf.Session() as sess:
    for _ in range(5):
        py_data, py_labels = sess.run(vals)
        print(py_labels)
with ds.filter():
> [1 1 1 1 1]
[1 1 1 1 1]
[1 1 1 1 1]
[1 1 1 1 1]
[1 1 1 1 1]
without ds.filter():
> [8 0 7 6 3]
[2 4 7 6 1]
[1 8 5 5 5]
[7 1 7 4 0]
[7 1 8 0 0]
Edit: the following code shows how to use a feedable iterator to perform batch label selection on the fly. See "Creating an iterator" in the tf.data documentation.
import random
import tensorflow as tf

labels = ['Apple'] * 100 + ['Orange'] * 100
data = list(range(200))
random.shuffle(labels)
batch_size = 4

ds_apple = tf.data.Dataset.from_tensor_slices((data, labels)).filter(
    lambda data, label: tf.equal(label, 'Apple')).batch(batch_size)
ds_orange = tf.data.Dataset.from_tensor_slices((data, labels)).filter(
    lambda data, label: tf.equal(label, 'Orange')).batch(batch_size)

handle = tf.placeholder(tf.string, [])
iterator = tf.data.Iterator.from_string_handle(
    handle, ds_apple.output_types, ds_apple.output_shapes)
batch = iterator.get_next()

apple_iterator = ds_apple.make_one_shot_iterator()
orange_iterator = ds_orange.make_one_shot_iterator()

with tf.Session() as sess:
    apple_handle = sess.run(apple_iterator.string_handle())
    orange_handle = sess.run(orange_iterator.string_handle())
    # loop and switch back and forth between apples and oranges
    for _ in range(3):
        print(sess.run(batch, feed_dict={handle: apple_handle}))
        print(sess.run(batch, feed_dict={handle: orange_handle}))
Typical output for this is as follows. Note that the data values increase monotonically across the Apple and Orange batches, showing that the iterators are not resetting.
> (array([2, 3, 6, 7], dtype=int32), array([b'Apple', b'Apple', b'Apple', b'Apple'], dtype=object))
(array([0, 1, 4, 5], dtype=int32), array([b'Orange', b'Orange', b'Orange', b'Orange'], dtype=object))
(array([ 9, 13, 15, 19], dtype=int32), array([b'Apple', b'Apple', b'Apple', b'Apple'], dtype=object))
(array([ 8, 10, 11, 12], dtype=int32), array([b'Orange', b'Orange', b'Orange', b'Orange'], dtype=object))
(array([21, 22, 23, 25], dtype=int32), array([b'Apple', b'Apple', b'Apple', b'Apple'], dtype=object))
(array([14, 16, 17, 18], dtype=int32), array([b'Orange', b'Orange', b'Orange', b'Orange'], dtype=object))
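As an alternative sketch (assuming your TF version provides tf.data.experimental.choose_from_datasets), you can interleave whole single-label batches without a feedable iterator:

# Alternate batches between the two filtered datasets above:
# choice emits 0, 1, 0, 1, ... selecting ds_apple and ds_orange in turn.
choice = tf.data.Dataset.range(2).repeat(3)
ds = tf.data.experimental.choose_from_datasets([ds_apple, ds_orange], choice)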