I'm having some trouble with a pretty basic model. Am unable to create a pre-processing layer that simply normalizes all features. It is likely that my conceptual understanding of the situation is problematic. My thinking was that the input layer is a list or a dictionary of tf.keras.Input objects, which refer to the input tensors by "name", and indicates their shape and datatypes. Normalizer layers are built by first adapting them over the training dataset, and those layers can be accrued in a list and concatenated. After the input layer is defined, the preprocessing layer takes as input the input layer, and passes its results downstream. Each item in an input layer list is a symbolic representation of the tensors that will flow, and each normalizer will get the right tensors by virtue of having been adapted on that feature.
The error I get is as follows:
TypeError Traceback (most recent call last)
Input In [11], in <cell line: 63>()
59 concatenated_preprocessing_layer = tf.keras.layers.Concatenate(preprocessing_layers)
61 #outputs = concatenated_preprocessing_layer(input_layer.values())
---> 63 outputs = concatenated_preprocessing_layer(all_inputs)
File ~/.pyenv/versions/3.8.5/lib/python3.8/site-packages/keras/utils/traceback_utils.py:67, in filter_traceback.<locals>.error_handler(*args, **kwargs)
65 except Exception as e: # pylint: disable=broad-except
66 filtered_tb = _process_traceback_frames(e.__traceback__)
---> 67 raise e.with_traceback(filtered_tb) from None
68 finally:
69 del filtered_tb
File ~/.pyenv/versions/3.8.5/lib/python3.8/site-packages/keras/layers/merge.py:509, in Concatenate.build(self, input_shape)
507 shape_set = set()
508 for i in range(len(reduced_inputs_shapes)):
--> 509 del reduced_inputs_shapes[i][self.axis]
510 shape_set.add(tuple(reduced_inputs_shapes[i]))
512 if len(shape_set) != 1:
TypeError: list indices must be integers or slices, not ListWrapper
And the code is as follows:
import tensorflow as tf
filepath='./taxi_data.csv'
CSV_COLUMNS = [
'fare_amount',
'pickup_datetime',
'pickup_longitude',
'pickup_latitude',
'dropoff_longitude',
'dropoff_latitude',
'passenger_count',
'key',
]
LABEL_COLUMN = 'fare_amount'
STRING_COLS = ['pickup_datetime']
NUMERIC_COLS = ['pickup_longitude', 'pickup_latitude',
'dropoff_longitude', 'dropoff_latitude',
'passenger_count']
DEFAULTS = [[0.0], ['na'], [0.0], [0.0], [0.0], [0.0], [0.0], ['na']]
DAYS = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']
def map_features_and_labels(row, target_name):
label = row.pop(target_name)
row.pop('key')
row.pop('pickup_datetime')
return row, label
def create_dataset(filepath, target_name, batch_size=1, mode=tf.estimator.ModeKeys.EVAL, CSV_COLUMNS=None, column_defaults=None ):
dataset = tf.data.experimental.make_csv_dataset(file_pattern=filepath, column_names=CSV_COLUMNS, column_defaults=DEFAULTS, num_epochs=1, batch_size=1)
dataset = dataset.map(lambda X: map_features_and_labels(X, target_name))
if mode == tf.estimator.ModeKeys.TRAIN:
dataset = dataset.shuffle(buffer_size=1000).repeat()
return dataset
train_ds = create_dataset(filepath, target_name=LABEL_COLUMN, batch_size=1, CSV_COLUMNS=CSV_COLUMNS, column_defaults=DEFAULTS )
#The input layer is usually a dictionary of feature_name: Input object
input_layer = {
'pickup_longitude': tf.keras.Input(shape=(0,), name='pickup_longitude', dtype=tf.dtypes.float32),
'pickup_latitude': tf.keras.Input(shape=(0,), name='pickup_latitude', dtype=tf.dtypes.float32),
'dropoff_longitude': tf.keras.Input(shape=(0,), name='dropoff_longitude', dtype=tf.dtypes.float32),
'dropoff_latitude': tf.keras.Input(shape=(0,), name='dropoff_latitude', dtype=tf.dtypes.float32),
'passenger_count': tf.keras.Input(shape=(0,), name='passenger_count', dtype=tf.dtypes.float32),
}
preprocessing_layers = []
all_inputs = []
for column in NUMERIC_COLS:
feature_ds = train_ds.map(lambda X, y: X[column])
normalizer = tf.keras.layers.Normalization(axis=None)
normalizer.adapt(feature_ds)
preprocessing_layers.append(normalizer)
all_inputs.append(tf.keras.Input(shape=(0,), name=column, dtype=tf.dtypes.float32, ))
concatenated_preprocessing_layer = tf.keras.layers.Concatenate(preprocessing_layers)
#outputs = concatenated_preprocessing_layer(input_layer.values())
outputs = concatenated_preprocessing_layer(all_inputs)
And here is some of the data in the taxi_data.csv file
17,2014-10-25 21:39:42 UTC,-73.978713,40.78303,-74.008102,40.73881,2,unused
14.9,2012-08-22 12:01:00 UTC,-73.987667,40.728747,-74.003272,40.715202,2,unused
21.5,2013-12-18 23:26:12 UTC,-74.008969,40.716853,-73.97688,40.780289,2,unused
23.5,2014-10-04 21:58:00 UTC,-73.954153,40.806257,-74.00343,40.731867,2,unused
34.3,2012-12-17 15:23:00 UTC,-73.866917,40.770342,-73.968872,40.757482,2,unused
16.1,2009-09-24 17:37:31 UTC,-73.967549,40.762828,-73.97961,40.723133,2,unused
17.3,2010-04-26 20:52:36 UTC,-73.981381,40.749913,-73.966612,40.691132,2,unused
35,2014-08-13 20:16:00 UTC,-73.866107,40.771245,-74.013987,40.676437,2,unused
17.3,2010-12-30 17:55:00 UTC,-73.997803,40.725982,-73.982382,40.772225,2,unused
I was able to get this to work. Like I suspected, it was my conceptual understanding that was the issue. Specifically, I wasn't correctly hooking up the Input (input_placeholder) to the normalizer. The modified code is below:
preprocessing_layers = []
all_inputs = []
for column in NUMERIC_COLS:
normalizer = get_normalization_layer(column, train_ds)
input_placeholder = tf.keras.Input(shape=(1,), name=column, dtype=tf.dtypes.float32, )
encoded_feature = normalizer(input_placeholder)
preprocessing_layers.append(encoded_feature)
all_inputs.append(input_placeholder)
concatenated_preprocessing_layer = tf.keras.layers.concatenate(preprocessing_layers)
#outputs = concatenated_preprocessing_layer(input_layer.values())
preprocessing_new_model = tf.keras.Model(inputs=all_inputs, outputs=concatenated_preprocessing_layer)
preprocessing_new_model(train_features)
You need to concatenate preprocessing_layers and all_inputs by using the code below:
concatenated_preprocessing_layer = tf.keras.layers.Concatenate((preprocessing_layers,all_inputs))
As you have used
concatenated_preprocessing_layer = tf.keras.layers.Concatenate(preprocessing_layers)
You can concatenate all_inputs by using:
outputs =tf.keras.layers.Concatenate((concatenated_preprocessing_layer,all_inputs))
Please refer to this working gist for your reference.
I'm loading a dataset with multiple input images. The input image paths should only be decoded at batch time, in order to handle a large dataset.
The data set is N image path inputs and M float outputs. The images for each input have different resolutions.
Data is ([img_input_1.png, img_input_2.png, ...], [0.65, 0.7, 0.8])
The model is using the Keras functional api in symbolic mode.
Here is the most recently EDITED code
from itertools import zip_longest
def read_image(path, shape):
try:
image = tf.io.read_file(path)
image = tf.image.decode_png(image)
image = tf.image.resize(image, [shape[1],shape[2]])
image /= 255.0
return image
except:
print('ERROR: preprocess_image: bad path', path)
def load_image(x, y, shp):
pout = [(k, x[k]) for k in x.keys()]
l1 = tf.convert_to_tensor(list(x))
l2 = tf.convert_to_tensor(list(x.values()))
pl = tf.map_fn(
lambda args: (read_image(args[0], shp), args[1]), [l1, l2], dtype=(tf.float32, tf.float32)
)
pl = {path: (pl[0][i], pl[1][i]) for i, path in enumerate(x)}
return (pl,y)
def dataset_prep(json_data, seq, batch_size):
# LOAD DATA FROM JSON
x,y = json_parse_x_y(json_data[seq])
xx = [*zip_longest(*x)] # NOTE: goes from variable sized input to {'input_N':...}
yy = [*zip_longest(*y)]
# GET SHAPES (hard coded atm)
lns = [[len(xxx)] for xxx in xx]
rzs = [[24,512,1],[96,512,1]] # TEMP TODO! grab grom [(v['h'],v['w'],v['c']) for v in xx]
shp = [*zip_longest(*[lns,rzs])]
shp = [list(s) for s in shp]
shp = [[*itertools.chain.from_iterable(s)] for s in shp]
xd = dict([[ "input_{}".format(i+1),np.array(y)] for i,y in [*enumerate(xx)]])
yd = dict([["output_{}".format(i+1),np.array(y)] for i,y in [*enumerate(yy)]])
ds = tf.data.Dataset.from_tensor_slices((xd, yd))
ds = ds.shuffle(10000)
ds = ds.repeat()
ds = ds.map(map_func=lambda x,y: load_image(x, y, shp), num_parallel_calls=AUTOTUNE)
ds = ds.batch(batch_size) if batch_size else ds
ds = ds.prefetch(AUTOTUNE)
return ds
This is the error I'm getting:
Traceback (most recent call last):
File "/home/me/.local/bin/wavfeat", line 11, in <module>
load_entry_point('wavfeat==0.1.0', 'console_scripts', 'wavfeat')()
File "/home/me/.local/lib/python3.6/site-packages/wavfeat/__main__.py", line 91, in main
analysis_batch_sql(obj)
File "/home/me/.local/lib/python3.6/site-packages/wavfeat/analysis_run_csv.py", line 50, in analysis_batch_sql
qy = [*map(lambda c: run_elm(c[0], c[1]), ch)]
File "/home/me/.local/lib/python3.6/site-packages/wavfeat/analysis_run_csv.py", line 50, in <lambda>
qy = [*map(lambda c: run_elm(c[0], c[1]), ch)]
File "/home/me/.local/lib/python3.6/site-packages/wavfeat/analysis_run_csv.py", line 23, in run_elm
out = fn(input, elm)
File "/home/me/.local/lib/python3.6/site-packages/wavfeat/one_sec_onset.py", line 196, in one_sec_onset_train
return train(input, elm)
File "/home/me/.local/lib/python3.6/site-packages/wavfeat/one_sec_onset.py", line 182, in train
ts = dataset_prep(jd, 'train', bc)
File "/home/me/.local/lib/python3.6/site-packages/wavfeat/one_sec_onset.py", line 123, in dataset_prep
ds = ds.map(map_func=lambda x,y: load_image(x, y, shp), num_parallel_calls=AUTOTUNE)
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 1146, in map
self, map_func, num_parallel_calls, preserve_cardinality=True)
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 3264, in __init__
use_legacy_function=use_legacy_function)
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 2591, in __init__
self._function = wrapper_fn._get_concrete_function_internal()
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 1366, in _get_concrete_function_internal
*args, **kwargs)
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 1360, in _get_concrete_function_internal_garbage_collected
graph_function, _, _ = self._maybe_define_function(args, kwargs)
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 1648, in _maybe_define_function
graph_function = self._create_graph_function(args, kwargs)
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 1541, in _create_graph_function
capture_by_value=self._capture_by_value),
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/framework/func_graph.py", line 716, in func_graph_from_py_func
func_outputs = python_func(*func_args, **func_kwargs)
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 2585, in wrapper_fn
ret = _wrapper_helper(*args)
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 2530, in _wrapper_helper
ret = func(*nested_args)
File "/home/me/.local/lib/python3.6/site-packages/wavfeat/one_sec_onset.py", line 123, in <lambda>
ds = ds.map(map_func=lambda x,y: load_image(x, y, shp), num_parallel_calls=AUTOTUNE)
File "/home/me/.local/lib/python3.6/site-packages/wavfeat/one_sec_data_loader.py", line 91, in load_image
print("x['input_1'].values(): ", x['input_1'].values())
AttributeError: 'Tensor' object has no attribute 'values'
What am I doing that is preventing the paths from being loaded?
EDIT:
Attempting pandrey's fix, I'm getting input errors. Here is the data before from_tensor_slices and ds.map and then after:
pre_from_tensor_slices x: {'input_1': array(['/media/me/sp_data/sp_data/datasets/chr_01/one_sec_onset_11_oac-leg/7388_39216_30--id=7388__sql_table=oac_1__sql_idx=405167__pitch=30__onset=39216.png',
'/media/me/sp_data/sp_data/datasets/chr_01/one_sec_onset_11_oac-leg/2447_864_27--id=2447__sql_table=oac_1__sql_idx=415458__pitch=27__onset=864.png',
'/media/me/sp_data/sp_data/datasets/chr_01/one_sec_onset_11_oac-leg/2386_20208_38--id=2386__sql_table=oac_1__sql_idx=433248__pitch=38__onset=20208.png',
...,
'/media/me/sp_data/sp_data/datasets/chr_01/one_sec_onset_11_oac-leg/6261_24528_57--id=6261__sql_table=oac_1__sql_idx=449753__pitch=57__onset=24528.png',
'/media/me/sp_data/sp_data/datasets/chr_01/one_sec_onset_11_oac-leg/3727_22944_31--id=3727__sql_table=oac_1__sql_idx=407620__pitch=31__onset=22944.png',
'/media/me/sp_data/sp_data/datasets/chr_01/one_sec_onset_11_oac-leg/1668_7056_60--id=1668__sql_table=oac_1__sql_idx=381152__pitch=60__onset=7056.png'],
dtype='<U162'), 'input_2': array(['/media/me/sp_data/sp_data/datasets/mel_01/one_sec_onset_11_oac-leg/7388_39216_30--id=7388__sql_table=oac_1__sql_idx=405167__pitch=30__onset=39216.png',
'/media/me/sp_data/sp_data/datasets/mel_01/one_sec_onset_11_oac-leg/2447_864_27--id=2447__sql_table=oac_1__sql_idx=415458__pitch=27__onset=864.png',
'/media/me/sp_data/sp_data/datasets/mel_01/one_sec_onset_11_oac-leg/2386_20208_38--id=2386__sql_table=oac_1__sql_idx=433248__pitch=38__onset=20208.png',
...,
'/media/me/sp_data/sp_data/datasets/mel_01/one_sec_onset_11_oac-leg/6261_24528_57--id=6261__sql_table=oac_1__sql_idx=449753__pitch=57__onset=24528.png',
'/media/me/sp_data/sp_data/datasets/mel_01/one_sec_onset_11_oac-leg/3727_22944_31--id=3727__sql_table=oac_1__sql_idx=407620__pitch=31__onset=22944.png',
'/media/me/sp_data/sp_data/datasets/mel_01/one_sec_onset_11_oac-leg/1668_7056_60--id=1668__sql_table=oac_1__sql_idx=381152__pitch=60__onset=7056.png'],
dtype='<U162')}
pre_from_tensor_slices y: {'output_1': array([0.817, 0.018, 0.421, ..., 0.511, 0.478, 0.147])}
_________________________
y: {'output_1': <tf.Tensor 'args_2:0' shape=() dtype=float64>}
x: {'input_1': <tf.Tensor 'args_0:0' shape=() dtype=string>, 'input_2': <tf.Tensor 'args_1:0' shape=() dtype=string>}
x.values(): dict_values([<tf.Tensor 'args_0:0' shape=() dtype=string>, <tf.Tensor 'args_1:0' shape=() dtype=string>])
x['input_1']: Tensor("args_0:0", shape=(), dtype=string)
Running x['input_1'].values() throws an error: 'Tensor' object has no attribute 'values'
I get an error situated around map_fn
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/framework/constant_op.py", line 284, in _constant_impl
allow_broadcast=allow_broadcast))
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/framework/tensor_util.py", line 455, in make_tensor_proto
raise ValueError("None values not supported.")
ValueError: None values not supported.
EDIT 2
Attempting the latest I get the following error
Traceback (most recent call last):
File "/home/me/.local/bin/wavfeat", line 11, in <module>
load_entry_point('wavfeat==0.1.0', 'console_scripts', 'wavfeat')()
File "/home/me/.local/lib/python3.6/site-packages/wavfeat/__main__.py", line 91, in main
analysis_batch_sql(obj)
File "/home/me/.local/lib/python3.6/site-packages/wavfeat/analysis_run_csv.py", line 50, in analysis_batch_sql
qy = [*map(lambda c: run_elm(c[0], c[1]), ch)]
File "/home/me/.local/lib/python3.6/site-packages/wavfeat/analysis_run_csv.py", line 50, in <lambda>
qy = [*map(lambda c: run_elm(c[0], c[1]), ch)]
File "/home/me/.local/lib/python3.6/site-packages/wavfeat/analysis_run_csv.py", line 23, in run_elm
out = fn(input, elm)
File "/home/me/.local/lib/python3.6/site-packages/wavfeat/one_sec_onset.py", line 216, in one_sec_onset_train
return train(input, elm)
File "/home/me/.local/lib/python3.6/site-packages/wavfeat/one_sec_onset.py", line 203, in train
vs = validation_prep(jd, 'validation', bc)
File "/home/me/.local/lib/python3.6/site-packages/wavfeat/one_sec_onset.py", line 176, in validation_prep
ds = ds.map(map_func=load_and_preprocess_from_path_label, num_parallel_calls=AUTOTUNE)
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 1146, in map
self, map_func, num_parallel_calls, preserve_cardinality=True)
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 3264, in __init__
use_legacy_function=use_legacy_function)
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 2591, in __init__
self._function = wrapper_fn._get_concrete_function_internal()
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 1366, in _get_concrete_function_internal
*args, **kwargs)
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 1360, in _get_concrete_function_internal_garbage_collected
graph_function, _, _ = self._maybe_define_function(args, kwargs)
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 1648, in _maybe_define_function
graph_function = self._create_graph_function(args, kwargs)
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/eager/function.py", line 1541, in _create_graph_function
capture_by_value=self._capture_by_value),
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/framework/func_graph.py", line 716, in func_graph_from_py_func
func_outputs = python_func(*func_args, **func_kwargs)
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 2585, in wrapper_fn
ret = _wrapper_helper(*args)
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 2530, in _wrapper_helper
ret = func(*nested_args)
File "/home/me/.local/lib/python3.6/site-packages/wavfeat/one_sec_data_loader.py", line 47, in load_and_preprocess_from_path_label
pl = dict([(pk, tf.map_fn(load_and_preprocess_image, po, dtype=tf.float32)) for pk,po in pout])
File "/home/me/.local/lib/python3.6/site-packages/wavfeat/one_sec_data_loader.py", line 47, in <listcomp>
pl = dict([(pk, tf.map_fn(load_and_preprocess_image, po, dtype=tf.float32)) for pk,po in pout])
File "/home/me/.local/lib/python3.6/site-packages/tensorflow/python/ops/map_fn.py", line 214, in map_fn
raise ValueError("elems must be a 1+ dimensional Tensor, not a scalar")
ValueError: elems must be a 1+ dimensional Tensor, not a scalar
Add-on: not using dict structures
This is a full code (save for defining json_parse_x_y and declaring AUTOTUNE) to achieve what you are attempting without using dict structures.
I tested that make_dataset works (see example below), so if you encounter an issue it should be due to a specification error regarding load_tensors.
from itertools import zip_longest
import tensorflow as tf
# additionnally, `json_parse_x_y` must be defined
# and `AUTOTUNE` must be declared (in my example, I set it to 2)
def read_image(path, shape):
"""Read an image of givent filepath and tensor shape.
Return a float tensor of given shape.
"""
try:
image = tf.io.read_file(path)
image = tf.image.decode_png(image)
image = tf.image.resize(image, [shape[1], shape[2]])
image /= 255.0
return image
except:
raise FileNotFoundError("preprocess_image: bad path '%s'" % path)
def load_images(paths, shapes):
"""Load an ensemble of images (associated with a single sample).
paths : rank-1 string Tensor
shapes : list of images' shapes (same length as `paths`)
Return a tuple of float tensors containing the loaded images.
"""
return tuple((
read_image(paths[i], shapes[i])
for i in range(len(shapes))
))
def load_tensors(json_data, seq):
"""Load images descriptors from a json dump.
Return a tuple containing:
* a rank-2 tensor containing lists of image paths (str)
* a rank-2 tensor containing resolution values (float)
* a list of image shapes, of same length as the rank-2
tensor's second axis
"""
x,y = json_parse_x_y(json_data[seq])
xx = [*zip_longest(*x)] # NOTE: goes from variable sized input to {'input_N':...}
yy = [*zip_longest(*y)]
# GET SHAPES (hard coded atm)
lns = [[len(xxx)] for xxx in xx]
rzs = [[24,512,1],[96,512,1]] # TEMP TODO! grab grom [(v['h'],v['w'],v['c']) for v in xx]
shp = [*zip_longest(*[lns,rzs])]
shp = [list(s) for s in shp]
shp = [[*itertools.chain.from_iterable(s)] for s in shp]
return (xx, yy, shp)
def make_dataset(xx, yy, shp, batch_size):
"""Build a Dataset instance containing loaded images.
xx, yy, shp : see the specification of `load_tensors`'s outputs
batch_size : batch size to set on the Dataset
Return a Dataset instance where each batched sample is a tuple
containing two elements: first, a tuple containing N loaded images'
rank-3 tensors; second, a rank-1 tensor containing M float values.
(to be clear: batching adds a dimension to all those tensors)
"""
data = tf.data.Dataset.from_tensor_slices((xx, yy))
data = data.shuffle(10000)
data = data.map(lambda x, y: (load_images(x, shapes), y))
data = data.repeat()
data = data.batch(batch_size) if batch_size else data
data = data.prefetch(AUTOTUNE)
return data
def dataset_prep(json_data, seq, batch_size):
"""Full pipeline to making a Dataset from json."""
xx, yy, shapes = load_tensors(json_data, seq)
return make_dataset(xx, yy, shapes)
Example, using "hand-made' values ; all images are actually
this classic image, of shape [512, 512, 3].
import numpy as np
import tensorflow as tf
# import previous code
# Here, N = 2, and I make 2 samples.
x = tf.convert_to_tensor(np.array([
['image_1a.png', 'image_1b.png'],
['image_2a.png', 'image_2b.png']
]))
shapes = [[1, 512, 512], [1, 512, 512]] # images are initially [512, 512, 3]
# Here, M = 3, and I make 2 samples. Values are purely random.
y = tf.convert_to_tensor(np.array([
[.087, .92, .276],
[.242, .37, .205]
]))
# This should work.
data = make_dataset(x, y, shapes, batch_size=1)
# Output signature is <PrefetchDataset shapes:
# (((None, 512, 512, None), (None, 512, 512, None)), (None, 3)),
# types: ((tf.float32, tf.float32), tf.float64)
# >
# Where the first None is actually `batch_size`
# and the second is, in this case, 3.
Answer to the current question:
Okay, the problem you are now encountering is that the revised load_image function does not fit the specifications of the Dataset, hence the exception raising. Please find below a full edited code that seems to work (I ran a test using custom images on my computer, with xd / yd dict initialized to look like your reported x and y in-dataset tensors). It is not pretty, and I would personally advise to drop the dict structures, but it works:
from itertools import zip_longest
def read_image(path, shape):
try:
image = tf.io.read_file(path)
image = tf.image.decode_png(image)
image = tf.image.resize(image, [shape[1],shape[2]])
image /= 255.0
return image
except:
raise FileNotFoundError("preprocess_image: bad path '%s'" % path)
# CHANGED: load_image is actually useless
def dataset_prep(json_data, seq, batch_size):
# LOAD DATA FROM JSON
x,y = json_parse_x_y(json_data[seq])
xx = [*zip_longest(*x)] # NOTE: goes from variable sized input to {'input_N':...}
yy = [*zip_longest(*y)]
# GET SHAPES (hard coded atm)
lns = [[len(xxx)] for xxx in xx]
rzs = [[24,512,1],[96,512,1]] # TEMP TODO! grab grom [(v['h'],v['w'],v['c']) for v in xx]
shp = [*zip_longest(*[lns,rzs])]
shp = [list(s) for s in shp]
shp = [[*itertools.chain.from_iterable(s)] for s in shp]
xd = dict([[ "input_{}".format(i+1),np.array(y)] for i,y in [*enumerate(xx)]])
yd = dict([["output_{}".format(i+1),np.array(y)] for i,y in [*enumerate(yy)]])
ds = tf.data.Dataset.from_tensor_slices((xd, yd))
ds = ds.shuffle(10000)
# CHANGED: the following line, to run images import (also moved epeat instruction later)
ds = ds.map(
lambda x, y: (
{key: read_image(path, shp[i]) for i, (key, path) in enumerate(x.items())},
y
),
num_parallel_calls=AUTOTUNE
)
ds = ds.repeat()
ds = ds.batch(batch_size) if batch_size else ds
ds = ds.prefetch(AUTOTUNE)
return ds
Initial answer (before question edit):
I will only deal with the exception raised by load_image in this answer, but there might be additional work to perform on the rest - I did not test for that, not having a convenient dataset at hand.
The exception message is actually quite explicit: you are passing a scalar element (e.g. n in [(k, tf.map_fn(lambda x: read_image(x, shp), n, dtype=tf.float32)) for k,n in pout]) as elems argument to tf.map_fn, when it expects a tensor (or (possibly nested) list or tuple of tensors), as clearly specified in its documentation.
You are also using tf.map_fn the wrong way in the quoted line of code, because basically you are mixing it up with a python intention list, when you should use either one or the other.
With intention list (also replacing the useless previous lines of the load_image function):
pl = {path: (load_image(path, shp), res) for path, res in x.items()}
With tf.map_fn:
# Read all images, return two tensors, one with images, the other with resolutions.
# (so, resolutions inclusion in this is actually useless and should be redesigned)
pl = tf.map_fn(
lambda args: (read_image(args[0], shp), args[1]),
[tf.convert_to_tensor(list(x)), tf.convert_to_tensor(list(x.values()))],
dtype=(tf.float32, tf.float32)
)
# If you really, really want to return a dict, but is it an optimal design?
pl = {path: (pl[0][i], pl[1][i]) for i, path in enumerate(x)}
I do not know whether returning a dict specified in this way is optimal (or even compatible) with Dataset instantiation, however if the rest of your code is working, this should do the trick.
At any rate, if you want to iterate over a dict, go ahead and use either the first version or a modified version of the second one (which may have the advantage of parallelizing images reading).
I hope this helps :-)
I am using this github.com/Determined22/zh-NER-TF
I just used another train_data of the same format.
Nothing is wrong with the code because it's okay when I run with the original train_data. What can cause this?
Traceback (most recent call last):
File "main.py", line 83, in <module>
model.train(train=train_data, dev=dev_data)
File "/home/mengyuguang/NER/model.py", line 161, in train
self.run_one_epoch(sess, train, dev, self.tag2label, epoch, saver)
File "/home/mengyuguang/NER/model.py", line 221, in run_one_epoch
label_list_dev, seq_len_list_dev = self.dev_one_epoch(sess, dev)
File "/home/mengyuguang/NER/model.py", line 256, in dev_one_epoch
label_list_, seq_len_list_ = self.predict_one_batch(sess, seqs)
File "/home/mengyuguang/NER/model.py", line 277, in predict_one_batch
viterbi_seq, _ = viterbi_decode(logit[:seq_len], transition_params)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/crf/python/ops/crf.py", line 333, in viterbi_decode
trellis[0] = score[0]
IndexError: index 0 is out of bounds for axis 0 with size 0
def read_corpus(self, corpus_path):
data = []
with open(corpus_path, 'r') as r_file:
sent_, tag_ = [], []
for line in r_file:
line = line.strip()
if len(line) != 0 and line != '-DOCSTART-':
ls = line.split('\t')
char, tag = ls[0], ls[-1]
sent_.append(char)
tag_.append(tag)
else:
data.append((sent_, tag_))
sent_, tag_ = [], []
# Bug-fix
# Here, since the last tuple (sent_, tag_) will be added into data
# It will case IndexError in viterbi_decode since the sequence_length is 0
if sent_ and tag_:
data.append((sent_, tag_))
self.data = data
The code should be changed to the following:
def read_corpus(corpus_path):
"""
read corpus and return the list of samples
:param corpus_path:
:return: data
"""
data = []
with open(corpus_path, encoding='utf-8') as fr:
lines = fr.readlines()
sent_, tag_ = [], []
for line in lines:
if line != '\n' and line != '\t\n': #
[char, label] = line.strip().split()
sent_.append(char)
tag_.append(label)
#else:
elif sent_ !=[] and tag_ !=[]: #
data.append((sent_, tag_))
sent_, tag_ = [], []
return data