Normalizing windows in tensorflow dataset

I am trying to build a windowed dataset from a univariate time series.
The idea is: if the series looks like [1, 2, 3, 4, 5, 6] and the window length is 2, then
I'd take windows of length 3 to account for the 2 X features plus the Y target output, so:
[[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6]]
Then I'll shuffle the windows to avoid bias from their ordering, and split the input features from the target output for each window: [[[1, 2], [3]], [[2, 3], [4]], [[3, 4], [5]], [[4, 5], [6]]]
def windowed_dataset(series):
    # Initially the data is (N,); expand dims to (N, 1)
    series = tf.expand_dims(series, axis=-1)
    # TensorFlow Dataset from the array
    ds = tf.data.Dataset.from_tensor_slices(series)
    # Create the windows that will serve as input features and label (hence +1)
    ds = ds.window(window_len + 1, shift=1, drop_remainder=True)
    ds = ds.flat_map(lambda w: w.batch(window_len + 1))
    # Randomize order
    ds = ds.shuffle(shuffle_buffer)
    # Separate the inputs from the target output (label)
    ds = ds.map(lambda w: (w[:-1], w[-1]))
    return ds.batch(batch_size).prefetch(1)
However, I'd like to add some normalization. For example, if my window is w = [1, 2, 3], then I'd like to normalize according to [p/w[0] - 1 for p in w].
I thought I could achieve this with ds.map and

def normalize_window(w):
    return [((i / w[0]) - 1) for i in w]

ds = ds.map(normalize_window)

because map is supposed to apply the function to each window in the dataset, but this didn't work. All the examples in the tf.data.Dataset docs use map with lambda functions, but I presume it works with regular functions too.
Does anyone know how it should be done?
EDIT
The traceback I get is:

<ipython-input-39-929295e1b775> in <module>()
----> 1 dataset = model_forecast_datasets(btc_model, np_data[:6])

11 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/autograph/impl/api.py in wrapper(*args, **kwargs)
    263     except Exception as e:  # pylint:disable=broad-except
    264       if hasattr(e, 'ag_error_metadata'):
--> 265         raise e.ag_error_metadata.to_exception(e)
    266       else:
    267         raise

OperatorNotAllowedInGraphError: in user code:

    <ipython-input-38-b3d0f7e17689>:12 normalize_window  *
        return [(i/w[0] -1) for i in w]
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py:561 __iter__
        self._disallow_iteration()
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py:557 _disallow_iteration
        self._disallow_in_graph_mode("iterating over `tf.Tensor`")
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py:537 _disallow_in_graph_mode
        " this function with @tf.function.".format(task))

    OperatorNotAllowedInGraphError: iterating over `tf.Tensor` is not allowed in Graph execution. Use Eager execution or decorate this function with @tf.function.

You would need a function that vectorizes the calculation, something like
def normalize(data):
    mean = tf.math.reduce_mean(data)
    std = tf.math.reduce_std(data)
    data = tf.subtract(data, mean)
    data = tf.divide(data, std)
    return data

ds = ds.map(normalize)
Edit: for your specific normalization this should work (divide by the first element, then subtract 1):

def normalize(data):
    data = tf.divide(data, data[0])
    data = tf.subtract(data, 1.0)
    return data

(this would have to go after the batching step, ds = ds.flat_map(...))
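Putting it together, a minimal end-to-end sketch (assuming TF 2.x eager execution, a float series, and placeholder values for window_len, shuffle_buffer and batch_size):

import tensorflow as tf

window_len, shuffle_buffer, batch_size = 2, 100, 2  # placeholder values

def normalize(w):
    # p / w[0] - 1 for every element p of the window
    return tf.divide(w, w[0]) - 1.0

def windowed_dataset(series):
    series = tf.expand_dims(series, axis=-1)
    ds = tf.data.Dataset.from_tensor_slices(series)
    ds = ds.window(window_len + 1, shift=1, drop_remainder=True)
    ds = ds.flat_map(lambda w: w.batch(window_len + 1))
    ds = ds.map(normalize)  # normalize each whole window before splitting
    ds = ds.shuffle(shuffle_buffer)
    ds = ds.map(lambda w: (w[:-1], w[-1]))
    return ds.batch(batch_size).prefetch(1)

ds = windowed_dataset(tf.constant([1., 2., 3., 4., 5., 6.]))
for x, y in ds:
    print(x.numpy(), y.numpy())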


Process output data from YOLOv5 TFlite

❔Question
Hi, I have successfully trained a custom model based on YOLOv5s and converted the model to TFlite. I feel silly asking, but how do you use the output data?
I get as output:
StatefulPartitionedCall:0 = [1, 25200, 7]
from the converted YOLOv5 model
Netron YOLOv5s.tflite model
But I expect an output like:
StatefulPartitionedCall:3 = [1, 10, 4] # boxes
StatefulPartitionedCall:2 = [1, 10] # classes
StatefulPartitionedCall:1 = [1, 10] #scores
StatefulPartitionedCall:0 = [1] #count
(this one is from a tensorflow lite mobilenet model (trained to give 10 output data, default for tflite))
Netron mobilenet.tflite model
It may also be some other form of output, but I honestly have no idea how to get the boxes, classes, scores from a [1,25200,7] array.
(on 15-January-2021 I updated pytorch, tensorflow and yolov5 to the latest version)
The data contained in the [1, 25200, 7] array can be found in this file: outputdata.txt
0.011428807862102985, 0.006756599526852369, 0.04274776205420494, 0.034441519528627396, 0.00012877583503723145, 0.33658933639526367, 0.4722323715686798
0.023071227595210075, 0.006947836373001337, 0.046426184475421906, 0.023744791746139526, 0.0002465546131134033, 0.29862138628959656, 0.4498370885848999
0.03636947274208069, 0.006819264497607946, 0.04913407564163208, 0.025004519149661064, 0.00013208389282226562, 0.3155967593193054, 0.4081345796585083
0.04930267855525017, 0.007249316666275263, 0.04969717934727669, 0.023645592853426933, 0.0001222355494974181, 0.3123127520084381, 0.40113094449043274
...
Should I add Non-Max Suppression or something else? Can someone help me please? (github YOLOv5 #1981)
Thanks to @Glenn Jocher I found the solution. The output is [xywh, conf, class0, class1, ...]
My current code is now:
import numpy as np

def classFilter(classdata):
    classes = []  # create a list
    for i in range(classdata.shape[0]):        # loop through all predictions
        classes.append(classdata[i].argmax())  # get the best classification location
    return classes  # return classes (int)

def YOLOdetect(output_data):  # input = interpreter output; output is boxes(xyxy), classes, scores
    output_data = output_data[0]                 # x(1, 25200, 7) to x(25200, 7)
    boxes = np.squeeze(output_data[..., :4])     # boxes  [25200, 4]
    scores = np.squeeze(output_data[..., 4:5])   # confidences  [25200, 1]
    classes = classFilter(output_data[..., 5:])  # get classes
    # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
    x, y, w, h = boxes[..., 0], boxes[..., 1], boxes[..., 2], boxes[..., 3]  # xywh
    xyxy = [x - w / 2, y - h / 2, x + w / 2, y + h / 2]  # xywh to xyxy  [4, 25200]
    return xyxy, classes, scores  # output is boxes(x,y,x,y), classes(int), scores(float) [predictions length]
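As an aside, the classFilter loop can be collapsed into a single vectorized NumPy call with the same result (the best class index per prediction):

classes = np.argmax(output_data[..., 5:], axis=-1)  # best class index for each of the 25200 predictions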
To get the output data:
"""Output data"""
output_data = interpreter.get_tensor(output_details[0]['index']) # get tensor x(1, 25200, 7)
xyxy, classes, scores = YOLOdetect(output_data) #boxes(x,y,x,y), classes(int), scores(float) [25200]
And for the boxes:
for i in range(len(scores)):
    if ((scores[i] > 0.1) and (scores[i] <= 1.0)):
        H = frame.shape[0]
        W = frame.shape[1]
        xmin = int(max(1, (xyxy[0][i] * W)))
        ymin = int(max(1, (xyxy[1][i] * H)))
        xmax = int(min(W, (xyxy[2][i] * W)))
        ymax = int(min(H, (xyxy[3][i] * H)))
        cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (10, 255, 0), 2)
...
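Since the 25200 raw predictions overlap heavily, you will almost certainly also want Non-Max Suppression on top of the score filter. A sketch using tf.image.non_max_suppression (the thresholds and max_output_size are placeholder values; the op expects boxes as [y1, x1, y2, x2]):

import numpy as np
import tensorflow as tf

# xyxy from YOLOdetect is [x1, y1, x2, y2]; reorder to [y1, x1, y2, x2]
boxes = np.stack([xyxy[1], xyxy[0], xyxy[3], xyxy[2]], axis=-1).astype(np.float32)
keep = tf.image.non_max_suppression(
    boxes,
    scores.astype(np.float32),
    max_output_size=10,     # placeholder: max detections to keep
    iou_threshold=0.45,     # placeholder IoU threshold
    score_threshold=0.25)   # placeholder confidence threshold
keep = keep.numpy()         # indices of the surviving predictions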

Tensorflow: When using slim.dataset.Dataset, is there a way to map label ID values to other values?

dataset = slim.dataset.Dataset(...)
provider = slim.dataset_data_provider.DatasetDataProvider(dataset, ...)
image, labels = provider.get(['image', 'label'])
Let's say, for an example in a dataset A, labels could be [1, 2, 1, 3]. However, for some reason (e.g, due to dataset B), I would like to map the label IDs to other values. The mapping could be like below.
# {old_label: target_label}
mapping = {0: 0, 1: 2, 2: 2, 3: 2, 4: 2, 5: 3, 6: 1}
For now, I am guessing two ways:
-- tf.data.Dataset seems to have a map(map_func) function that every example passes through, which could be the solution. However, I am more familiar with slim.dataset.Dataset. Is there a similar trick for slim.dataset.Dataset?
-- I was wondering if I can simply apply some mapping function to a label tensor, such as:

new_labels = tf.map_fn(lambda x: x + 1, labels, dtype=tf.int32)
# labels = [1 2 1 3] --> new_labels = [2 3 2 4]. This works.
new_labels = tf.map_fn(lambda x: mapping[x], labels, dtype=tf.int32)
# I wished, but this does not work!

The second call is what I need, but it didn't work. Could anyone please advise?
I think you can try tf.contrib.lookup:
keys = list(mapping.keys())
values = [mapping[k] for k in keys]
table = tf.contrib.lookup.HashTable(
    tf.contrib.lookup.KeyValueTensorInitializer(keys, values, key_dtype=tf.int64, value_dtype=tf.int64),
    -1
)
new_labels = table.lookup(labels)

sess = tf.Session()
sess.run(table.init)
print(sess.run(new_labels))
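When the old labels are small contiguous non-negative integers, as in this mapping, the same remap can be done without a lookup table; a sketch using tf.gather (assumes every incoming label is a valid index into the dense table and that labels is an integer tensor):

import tensorflow as tf

mapping = {0: 0, 1: 2, 2: 2, 3: 2, 4: 2, 5: 3, 6: 1}
# dense table: position i holds the target label for old label i
dense = tf.constant([mapping[i] for i in range(len(mapping))], dtype=tf.int32)

labels = tf.constant([1, 2, 1, 3], dtype=tf.int32)
new_labels = tf.gather(dense, labels)

with tf.Session() as sess:
    print(sess.run(new_labels))  # [2 2 2 2]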

How to find an index of the first matching element in TensorFlow

I am looking for a TensorFlow way of implementing something similar to Python's list.index() function.
Given a matrix and a value to find, I want to know the first occurrence of the value in each row of the matrix.
For example,
m is a <batch_size, 100> matrix of integers
val = 23
result = [0] * batch_size
for i, row_elems in enumerate(m):
    result[i] = row_elems.index(val)
I cannot assume that 'val' appears only once in each row, otherwise I would have implemented it using tf.argmax(m == val). In my case, it is important to get the index of the first occurrence of 'val' and not any.
It seems that tf.argmax works like np.argmax (according to the test), which will return the first index when there are multiple occurrences of the max value.
You can use tf.argmax(tf.cast(tf.equal(m, val), tf.int32), axis=1) to get what you want. However, currently the behavior of tf.argmax is undefined in case of multiple occurrences of the max value.
If you are worried about undefined behavior, you can apply tf.argmin on the return value of tf.where as #Igor Tsvetkov suggested.
For example,
# test with tensorflow r1.0
import tensorflow as tf

val = 3
m = tf.placeholder(tf.int32)
m_feed = [[0,   0,   val, 0,   val],
          [val, 0,   val, val, 0],
          [0,   val, 0,   0,   0]]

tmp_indices = tf.where(tf.equal(m, val))
result = tf.segment_min(tmp_indices[:, 1], tmp_indices[:, 0])

with tf.Session() as sess:
    print(sess.run(result, feed_dict={m: m_feed}))  # [2, 0, 1]
Note that tf.segment_min will raise InvalidArgumentError when some row contains no val. In your code, row_elems.index(val) would raise an exception too when row_elems doesn't contain val.
Looks a little ugly but works (assuming m and val are both tensors):
idx = list()
for t in tf.unpack(m, axis=0):
    idx.append(tf.reduce_min(tf.where(tf.equal(t, val))))
idx = tf.pack(idx, axis=0)
EDIT:
As Yaroslav Bulatov mentioned, you could achieve the same result with tf.map_fn:
def index1d(t):
    return tf.reduce_min(tf.where(tf.equal(t, val)))

idx = tf.map_fn(index1d, m, dtype=tf.int64)
Here is another solution to the problem, assuming there is a hit on every row.
import tensorflow as tf

val = 3
m = tf.constant([
    [0,   0,   val, 0,   val],
    [val, 0,   val, val, 0],
    [0,   val, 0,   0,   0]])

# replace every entry with its column index where it matches val, or an out-of-range number otherwise
match_indices = tf.where(                          # [[5, 5, 2, 5, 4],
    tf.equal(val, m),                              #  [0, 5, 2, 3, 5],
    x=tf.range(tf.shape(m)[1]) * tf.ones_like(m),  #  [5, 1, 5, 5, 5]]
    y=(tf.shape(m)[1]) * tf.ones_like(m))
result = tf.reduce_min(match_indices, axis=1)

with tf.Session() as sess:
    print(sess.run(result))  # [2, 0, 1]
Here is a solution that also handles the case where the element is not contained in the matrix (taken from a DeepMind github repository):

def get_first_occurrence_indices(sequence, eos_idx):
    '''
    args:
        sequence: [batch, length]
        eos_idx: scalar
    '''
    batch_size, maxlen = sequence.get_shape().as_list()
    eos_idx = tf.convert_to_tensor(eos_idx)
    tensor = tf.concat(
        [sequence, tf.tile(eos_idx[None, None], [batch_size, 1])], axis=-1)
    index_all_occurrences = tf.where(tf.equal(tensor, eos_idx))
    index_all_occurrences = tf.cast(index_all_occurrences, tf.int32)
    index_first_occurrences = tf.segment_min(index_all_occurrences[:, 1],
                                             index_all_occurrences[:, 0])
    index_first_occurrences.set_shape([batch_size])
    index_first_occurrences = tf.minimum(index_first_occurrences + 1, maxlen)
    return index_first_occurrences
And a usage example (note that this returns the first-occurrence index plus one, capped at maxlen, so a row without the element maps to maxlen):
import tensorflow as tf

mat = tf.Variable([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7], [0, 0, 0, 0, 0]], dtype=tf.int32)
idx = 3
first_occurrences = get_first_occurrence_indices(mat, idx)

sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run(first_occurrences)  # [3, 2, 1, 5]

New error during set_labels in pandas 0.19.2: ValueError: Unequal label lengths

After upgrading from Pandas 0.18.1 to 0.19.2, I am getting the following error when I try to add new levels and labels to my dataframe. Any idea what the problem is?
print index
MultiIndex(levels=[[u'1', u'2'], [u'nextLevel']],
           labels=[[0, 1], [0, 0]],
           names=[u'segment..ASRinfo..supportedUtt', u'label'])

print levels
[['1', '2', 'Total'], ['nextLevel']]

print labels
[[0, 1, 2], [0, 0, 0]]

index = index.set_levels(levels)
print index
MultiIndex(levels=[[u'Supported', u'Unsupported', u'Total'], [u'nextLevel']],
           labels=[[0, 1], [0, 0]],
           names=[u'segment..ASRinfo..supportedUtt', u'label'])

index = index.set_labels(labels)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-11-f6fb11fbbb3a> in <module>()
    288
    289 # Initialize dfplot
--> 290 slice_data()
    291
    292 if len(resultList)==1:

<ipython-input-11-f6fb11fbbb3a> in slice_data(*args)
     71     index = index.set_levels(levels)
     72     print index
---> 73     index = index.set_labels(labels)
     74     data_slice = data_slice.reindex(index)
     75

/Users/user1/anaconda/lib/python2.7/site-packages/pandas/indexes/multi.pyc in set_labels(self, labels, level, inplace, verify_integrity)
    350         idx = self._shallow_copy()
    351         idx._reset_identity()
--> 352         idx._set_labels(labels, level=level, verify_integrity=verify_integrity)
    353         if not inplace:
    354             return idx

/Users/user1/anaconda/lib/python2.7/site-packages/pandas/indexes/multi.pyc in _set_labels(self, labels, level, copy, validate, verify_integrity)
    285
    286         if verify_integrity:
--> 287             self._verify_integrity(labels=new_labels)
    288
    289         self._labels = new_labels

/Users/user1/anaconda/lib/python2.7/site-packages/pandas/indexes/multi.pyc in _verify_integrity(self, labels, levels)
    145             if len(label) != label_length:
    146                 raise ValueError("Unequal label lengths: %s" %
--> 147                                  ([len(lab) for lab in labels]))
    148             if len(label) and label.max() >= len(level):
    149                 raise ValueError("On level %d, label max (%d) >= length of"

ValueError: Unequal label lengths: [3, 3]
I'm wondering if it's a bug in the new pandas code. Perhaps self.labels[0] should be labels[0]?
def _verify_integrity(self, labels=None, levels=None):
    """
    Parameters
    ----------
    labels : optional list
        Labels to check for validity. Defaults to current labels.
    levels : optional list
        Levels to check for validity. Defaults to current levels.

    Raises
    ------
    ValueError
        * if length of levels and labels don't match or any label would
          exceed level bounds
    """
    # NOTE: Currently does not check, among other things, that cached
    # nlevels matches nor that sortorder matches actually sortorder.
    labels = labels or self.labels
    levels = levels or self.levels

    if len(levels) != len(labels):
        raise ValueError("Length of levels and labels must match. NOTE:"
                         " this index is in an inconsistent state.")

    label_length = len(self.labels[0])
    for i, (level, label) in enumerate(zip(levels, labels)):
        if len(label) != label_length:
            raise ValueError("Unequal label lengths: %s" %
                             ([len(lab) for lab in labels]))
        if len(label) and label.max() >= len(level):
            raise ValueError("On level %d, label max (%d) >= length of"
                             " level (%d). NOTE: this index is in an"
                             " inconsistent state" % (i, label.max(),
                                                      len(level)))
I tested my fix and it worked! I submitted a bug to Pandas:
https://github.com/pandas-dev/pandas/issues/15157
I'm not sure if it's a bug. I suppose Pandas could replace all the extra indexes with missing values doing it your way, but I think you should use reindex:

df.reindex(index2)

index = pd.MultiIndex(levels=[[u'1', u'2'], [u'nextLevel']],
                      labels=[[0, 1], [0, 0]],
                      names=[u'segment..ASRinfo..supportedUtt', u'label'])
index2 = pd.MultiIndex(levels=[['1', '2', 'Total'], ['nextLevel']],
                       labels=[[0, 1, 2], [0, 0, 0]],
                       names=[u'segment..ASRinfo..supportedUtt', u'label'])
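A minimal sketch of that approach (using the pandas-0.19-era MultiIndex constructor above and a hypothetical single 'count' column; the added ('Total', 'nextLevel') row comes back as NaN and can be filled in afterwards):

import pandas as pd

df = pd.DataFrame({'count': [10, 20]}, index=index)  # hypothetical data
df2 = df.reindex(index2)  # the ('Total', 'nextLevel') row is NaN
df2.loc[('Total', 'nextLevel')] = df['count'].sum()  # e.g. fill the Total row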
I am new to Pandas, and I found the documentation on MultiIndexing difficult to adapt to solving my own problem. Basically, I want to add some extra rows. This is the solution I came up with. There is probably a much better way to do it. Feel free to share if you'd like.
groupbyColumns = ['label0', 'label1']
data_slice = dataframe.groupby(by=groupbyColumns).sum()
index = data_slice.index

levels = list()
for levelIter in range(len(data_slice.index.levels)):
    levels.append([x for x in data_slice.index.levels[levelIter]])
levels[0].append('Total')
if len(resultList) == 2:
    levels[-1].append('Difference')
    addIndexCountForDifferenceRow = 1
else:
    addIndexCountForDifferenceRow = 0

# Create new indexing sequence since we are adding Total (and Difference if doing comparison) rows
labels = list()
for labelIter in range(len(data_slice.index.labels)):
    labels.append(list())
if len(data_slice.index.labels) == 2:
    labels0 = [x for x in data_slice.index.labels[0]]
    labels1 = [x for x in data_slice.index.labels[1]]
    for iter0 in range(max(labels0) + 2):
        for iter1 in range(max(labels1) + 1 + addIndexCountForDifferenceRow):
            labels[0].append(iter0)
            labels[1].append(iter1)
if len(data_slice.index.labels) == 3:
    labels0 = [x for x in data_slice.index.labels[0]]
    labels1 = [x for x in data_slice.index.labels[1]]
    labels2 = [x for x in data_slice.index.labels[2]]
    for iter0 in range(max(labels0) + 2):
        for iter1 in range(max(labels1) + 1):
            for iter2 in range(max(labels2) + 1 + addIndexCountForDifferenceRow):
                labels[0].append(iter0)
                labels[1].append(iter1)
                labels[2].append(iter2)

index = index.set_levels(levels)
index = index.set_labels(labels)
data_slice = data_slice.reindex(index)

get dynamic sequence length from PaddingFIFOQueue

I use tf.PaddingFIFOQueue or tf.contrib.data.PaddedBatchDataset to feed in sequences of varying lengths and dequeue_many to get a zero-padded batch out of it.
Is there some generic way to also get the sequence length for this batch?
My current solution is to explicitly provide the sequence length as an additional input to the queue, i.e. I have something like tf.PaddingFIFOQueue(names=["data", "seq_length"], ...). I could also use tf.ones_like(), but my current way seems cheaper and simpler. But I wonder if that is the canonical/standard way, or if there is some other way.
You can combine your data and seq_length into a tuple (or a list) and then push the tuple into the queue.
import tensorflow as tf

sess = tf.InteractiveSession()

q = tf.PaddingFIFOQueue(capacity=10, dtypes=[tf.int32, tf.int32], shapes=[[], [None]])
eq1 = q.enqueue([1, [1]])
eq2 = q.enqueue([2, [2, 3]])
eq3 = q.enqueue([3, [4, 5, 6]])
dq = q.dequeue()

sess.run(eq1)
sess.run(eq2)
sess.run(eq3)

sess.run(dq)  # [1, array([1], dtype=int32)]
sess.run(dq)  # [2, array([2, 3], dtype=int32)]
sess.run(dq)  # [3, array([4, 5, 6], dtype=int32)]
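The tf.data route works the same way; a sketch in TF 1.x style (the toy generator is a placeholder) that attaches each sequence's length before padding, so every batch carries the padded data together with the true lengths:

import tensorflow as tf

ds = tf.data.Dataset.from_generator(
    lambda: ([1], [2, 3], [4, 5, 6]), output_types=tf.int32)
ds = ds.map(lambda x: (x, tf.shape(x)[0]))           # pair each sequence with its length
ds = ds.padded_batch(2, padded_shapes=([None], []))  # scalar lengths need no padding

batch = ds.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    print(sess.run(batch))  # (array([[1, 0], [2, 3]]), array([1, 2]))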