How to handle the categorical_column in parse_single_example function. - tensorflow

I am trying to use parse_single_example to process the TFRecordDataset file.
But it looks like it cannot handle the categorical_column in the Example proto.
For example:
In the example, there are 3 features. "a", "b", and "month".
month is an int64 feature; I'd like to parse this field and expand it into a categorical signal directly.
I am not sure whether parse_single_example supports this or whether my method is totally wrong.
# Map function for a TFRecordDataset: parses one serialized Example with
# tf.parse_single_example and returns a (features, label) pair.
def _parse_function(example_proto):
print(example_proto)
features = {"a": tf.VarLenFeature(tf.float32),
"b": tf.VarLenFeature(tf.float32),
# NOTE(review): unlike the other entries, this value is a feature column
# rather than a VarLenFeature/FixedLenFeature parsing spec; this is the
# entry the question is asking about.
"month": tf.feature_column.categorical_column_with_identity(key='month', num_buckets=12),
"label": tf.FixedLenFeature((), tf.float32, default_value=0.0)}
parsed_features = tf.parse_single_example(example_proto, features)
# NOTE(review): "close_pct" is not one of the keys declared in `features`
# (a, b, month, label); presumably "label" was meant - confirm.
return tf.concat([tf.sparse_tensor_to_dense(parsed_features["a"]),
tf.sparse_tensor_to_dense(parsed_features["b"]),
parsed_features["month"]], 0),\
tf.cast(parsed_features["close_pct"] + tf.convert_to_tensor(1.0, tf.float32), tf.int64)
https://www.tensorflow.org/api_docs/python/tf/io/parse_single_example
https://www.tensorflow.org/api_docs/python/tf/feature_column/categorical_column_with_identity

Use tf.one_hot after parsing the example.
return tf.concat([tf.sparse_tensor_to_dense(parsed_features["a"]),
tf.sparse_tensor_to_dense(parsed_features["b"]),
tf.one_hot(parsed_features["month"], categorical_size, axis=-1)], 0),\
tf.cast(parsed_features["close_pct"] + tf.convert_to_tensor(1.0, tf.float32), tf.int64)

Related

tensorflow2 how to print tensor value

I try to print the actual value of the tensor which is loaded from my custom tfds dataset.
I can't figure out how to do it. I am using Tensorflow2 so the session is no longer encouraged.
I tried using .numpy(), tf.print, and tf.executing_eagerly(), but nothing works. It either prints
just the tensor object, showing me the shape, or, in the case of .numpy(), it throws the error in the title.
I need the values of the tensors, and I need to bring them back to numpy in order to debug the code.
This is how I create the dataset:
class dt(tfds.core.GeneratorBasedBuilder):
    """Dataset builder for PNG images with matching PNG masks.

    Each example carries an integer id, an RGB image, a single-channel mask
    and a class label. ``('image', 'mask')`` are the supervised keys.
    """

    # Double check
    VERSION = tfds.core.Version('1.0.0')
    RELEASE_NOTES = {
        '1.0.0': 'Initial release.',
    }

    def _info(self) -> tfds.core.DatasetInfo:
        """Return the dataset metadata: feature spec and supervised keys."""
        return tfds.core.DatasetInfo(
            builder=self,
            features=tfds.features.FeaturesDict({
                "id": tf.int64,
                "image": tfds.features.Image(shape=(IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS), encoding_format='png'),
                "mask": tfds.features.Image(shape=(IMG_HEIGHT, IMG_WIDTH, 1), encoding_format='png'),
                "label": tfds.features.ClassLabel(names=CLASSES),
            }),
            supervised_keys=('image', 'mask')
        )

    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
        """Return the example generators for the "train" and "test" splits.

        Files are read from ``DATASETS_ROOT``; ``dl_manager`` is not used.
        """
        path = DATASETS_ROOT
        return {
            "train": self._generate_examples(
                images_path=os.path.join(path, "train/rgb"),
                masks_path=os.path.join(path, "train/masks")
            ),
            "test": self._generate_examples(
                images_path=os.path.join(path, "test/rgb"),
                masks_path=os.path.join(path, "test/masks")
            )
        }

    def _generate_examples(self, images_path, masks_path):
        """Yield ``(key, example)`` pairs for one split.

        Args:
            images_path: Directory holding the ``*.png`` input images.
            masks_path: Directory holding the ``*.png`` masks.

        Yields:
            The running index as key, and a dict with "id", "image", "mask"
            and "label" entries (image and mask are file paths).
        """
        # glob.glob returns files in arbitrary order, so both listings are
        # sorted before zipping to keep each image paired with its own mask.
        # Assumes an image and its mask sort to the same position (e.g. share
        # a file name) - TODO confirm.
        image_files = sorted(glob.glob(images_path + "/*.png"))
        mask_files = sorted(glob.glob(masks_path + "/*.png"))
        for i, (image, mask) in enumerate(zip(image_files, mask_files)):
            yield i, {
                "id": i,
                "image": image,
                "mask": mask,
                # Every example is given the same label, CLASSES[3].
                "label": CLASSES[3],
            }
This is how I try to extract the numpy array
# Loads the "train" split of the custom "dt" dataset (when training is true)
# and tries to print its contents.
def custom_load_X_Y(training=True):
if training:
# as_supervised=True with with_info=True: tfds.load returns the split plus its info object.
dt, dt_info = tfds.load("dt", split="train", shuffle_files=True, as_supervised=True, with_info=True)
print(f'EAGERLY {tf.executing_eagerly()}')
print(f'MOde type {type(dt)}')
# NOTE(review): `dt` is whatever tfds.load returned for the split (presumably
# a dataset object rather than a tensor), so `.numpy()` here is the call the
# question reports as failing - confirm.
tf.print(f"aaaaa {dt.numpy()} aaaaaa")
Console output:
Console output
You can print the image dataset by using tfds.show_examples()
import tensorflow_datasets as tfds
# Load the CIFAR-10 training split together with its info object.
ds, ds_info = tfds.load('cifar10', split='train', with_info=True)
# Show sample images from the dataset; the return value is kept in `fig`.
fig = tfds.show_examples(ds, ds_info)
output:
For more details, please refer to this gist. Thank you.

tensorflow 2.8 dataset get single element error

I would like to use "get_single_element" to get a one-batch dataset as a dict, where the keys are the feature names and the values are the corresponding feature values.
My code is in tf2.8 on EC2 instance and run from jupyter.
import pandas as pd
import tensorflow as tf
# Toy frame with six rows: an id, one float column, two string columns and a label.
df = pd.DataFrame({
'id': [1, 2, 6, 3, 5, 0],
'value_1': [10.892561, 7.210528, 1.2278101, -9.251782, 0.2118367, 6.9128551],
'value_2': ['large', 'small', 'mid','small','large', 'mid'],
'name_1': ['tyne', 'wnhp', 'ebhg','lpzhn','tyne', 'ebhg'],
'label': [0, 1, 0, 1, 1, 1]
})
# Build the dataset from a dict of columns, so each element is keyed by column name.
dataset = tf.data.Dataset.from_tensor_slices(dict(df))
# Group the rows two at a time.
dataset = dataset.batch(2)
print(type(dataset))
print(dataset)
print(len(dataset))
# Parsing specs used by process_sample: one dict for the input columns and
# one for the label.
# NOTE(review): the tf.io parsing specs are documented for float32, int64 and
# string dtypes; tf.int8 may be rejected - confirm against the TensorFlow docs.
class Features(object):
feature_data = {
"id": tf.io.FixedLenFeature((1,), dtype=tf.int8),
"value_1": tf.io.VarLenFeature(dtype=tf.float32),
"value_2": tf.io.FixedLenFeature((1,), dtype=tf.string),
"name_1": tf.io.FixedLenFeature((1,), dtype=tf.string)
}
label_data = {"label": tf.io.FixedLenFeature((1,), dtype=tf.int8)}
# Splits one dataset element into (features, labels) by running
# tf.io.parse_single_example once with each spec from Features.
def process_sample(ds):
print(f"ds type is {type(ds)}")
# NOTE(review): the dataset is built with from_tensor_slices(dict(df)), so
# `ds` is presumably a dict of column tensors rather than a serialized Example
# string, which is likely why the next call raises - confirm.
features = tf.io.parse_single_example(ds, Features.feature_data) # error !
labels = tf.io.parse_single_example(ds, Features.label_data)['label']
return (features, labels)
# Apply process_sample to every element of the (already batched) dataset.
dataset = dataset.map(lambda x: process_sample(x), num_parallel_calls=tf.data.AUTOTUNE)
# Re-batch with a batch size equal to the dataset length, then pull that one
# batch out as a single element. `dataset` is rebound to that element here.
dataset = tf.data.Dataset.get_single_element(dataset.batch(len(dataset)))
print(f"dataset type is {type(dataset)} dataset is {dataset}")
def get_dict(input_ds):
    """Convert ``input_ds`` to a plain ``dict``, print it, and return it.

    Args:
        input_ds: Any mapping or iterable of key/value pairs accepted by
            ``dict()``.

    Returns:
        The dictionary built from ``input_ds``.
    """
    feature_dict_tensors = dict(input_ds)
    # The message used to start with a backslash directly followed by
    # "feature...", which Python reads as a form-feed escape and which
    # swallowed the leading "f" of the name; a newline was presumably intended.
    print(f"\nfeature_dict_tensors type is {type(feature_dict_tensors)}, feature_dict_tensors is {feature_dict_tensors}")
    return feature_dict_tensors
# NOTE(review): `dataset` was rebound earlier to the result of
# get_single_element, so it is presumably no longer a Dataset with a `.map`
# method at this point - confirm.
ds = dataset.map(get_dict)
print(f"ds type is {type(ds)}")
print(ds)
I got error:
File "<ipython-input-4-e9407f42e0a7>", line 37, in None *
lambda x: process_sample(x), num_parallel_calls=tf.data.AUTOTUNE)
File "<ipython-input-4-e9407f42e0a7>", line 33, in process_sample *
features = tf.io.parse_single_example(ds, Features.feature_data)
TypeError: Expected any non-tensor type, but got a tensor instead.
Based on https://www.tensorflow.org/api_docs/python/tf/io/parse_single_example, the first argument should be
A scalar string Tensor, a single serialized Example.
Why do I get this error?
thanks

How can I preprocess my Mapdataset to fit my model input?

I use a MapDataset composed of a label stored as text and a vector of floats stored as a byte string.
Here is the way I read the content of my tfrecord:
def extract_data(tfrecord_ds):
    """Parse a dataset of serialized examples and prepare it for training.

    Every record is parsed into a dict with two scalar string entries,
    'classes_text' and 'data'. The parsed dataset is then cached, shuffled
    with a buffer of 1000, batched by 32 and prefetched.

    Args:
        tfrecord_ds: Dataset of serialized examples; must provide ``map``.

    Returns:
        The parsed, cached, shuffled, batched and prefetched dataset.
    """
    spec = {
        'classes_text': tf.io.FixedLenFeature((), tf.string),
        'data': tf.io.FixedLenFeature([], tf.string),
    }

    def _decode(serialized):
        # Both fields are read back as scalar strings.
        return tf.compat.v1.parse_single_example(serialized, spec)

    records = tfrecord_ds.map(_decode)
    records = records.cache()
    records = records.shuffle(1000)
    records = records.batch(32)
    return records.prefetch(tf.data.AUTOTUNE)
I want to convert the label text (classes_text) to an int according to the label.txt file, and the data string to a vector of floats.
I want to use this data to train a custom model like this:
# Classifier over 1024-dimensional float32 embeddings. The last Dense layer
# has no activation, so it outputs one raw score per class.
my_model = tf.keras.Sequential([
    # "(1024)" is just the int 1024; the one-element tuple is spelled out so
    # the shape is unambiguous.
    tf.keras.layers.Input(shape=(1024,), dtype=tf.float32,
                          name='input_embedding'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(num_classes)
], name='audio_detector')
How can I process my MapDataset from (string,string) to (int, float_array) to be able to train my model?
Edit:
Here is the way I encode my data:
# Serialize one (label, embedding) pair as a tf.train.Example and write it to
# the TFRecord writer. Both fields are stored as byte strings.
features = {}
features['classes_text'] = tf.train.Feature(
    bytes_list=tf.train.BytesList(value=[audio_data_generator.label.encode()]))
# Raw buffer of the embedding array; named so that it does not shadow the
# builtin `bytes`.
embedding_bytes = embedding.numpy().tobytes()
features['data'] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[embedding_bytes]))
tf_example = tf.train.Example(features=tf.train.Features(feature=features))
writer.write(tf_example.SerializeToString())
It is easier to encode the embedding using tf.train.FloatList.
When writing to tfrecords use:
# Store the label as bytes and the embedding as a float list.
# NOTE(review): FloatList presumably needs a flat sequence of floats; if
# `embedding` is a tensor it may have to be converted first - confirm.
features = {
'classes_text': tf.train.Feature(bytes_list=tf.train.BytesList(value=[label.encode()])),
'data': tf.train.Feature(float_list=tf.train.FloatList(value=embedding))
}
tf_example = tf.train.Example(features=tf.train.Features(feature=features))
writer.write(tf_example.SerializeToString())
And when reading give the embedding size to tf.io.FixedLenFeature, for example:
# Example value only. Presumably this has to equal the real embedding length
# (the model in the question takes 1024 inputs) - confirm.
embedding_size = 10
# The label is read as a scalar string, the embedding as a fixed-length float vector.
feature_description = {
'classes_text': tf.io.FixedLenFeature((), tf.string),
'data': tf.io.FixedLenFeature([embedding_size], tf.float32)
}
To convert label_text to int you can use tf.lookup.StaticVocabularyTable.
# Assuming label.txt contains a single label per line.
with open('label.txt', 'r') as fin:
    # Skip blank lines (for example a trailing empty line) so they do not
    # become a spurious empty-string class.
    categories = [line.strip() for line in fin if line.strip()]
# Each label gets its position in the file as its integer id.
init = tf.lookup.KeyValueTensorInitializer(
    keys=tf.constant(categories),
    values=tf.constant(list(range(len(categories))), dtype=tf.int64))
# One extra bucket is reserved for labels that are not listed in label.txt.
label_table = tf.lookup.StaticVocabularyTable(
    init,
    num_oov_buckets=1)
# Parsing spec: scalar string label plus a fixed-length float32 embedding.
feature_description = {
    'classes_text': tf.io.FixedLenFeature((), tf.string),
    'data': tf.io.FixedLenFeature([embedding_size], tf.float32),
}


def _parse_data_function(example_proto):
    """Parse one serialized example and swap its text label for an integer id.

    Args:
        example_proto: A serialized example, parsed with ``feature_description``.

    Returns:
        The parsed dict, with 'classes_text' replaced by the result of
        ``label_table.lookup``.
    """
    parsed = tf.compat.v1.parse_single_example(example_proto, feature_description)
    label_text = parsed['classes_text']
    parsed['classes_text'] = label_table.lookup(label_text)
    return parsed


parsed_dataset = tfrecord_ds.map(_parse_data_function)
# Cache, shuffle (buffer 1000), batch (32) and prefetch, one stage per line.
dataset = (
    parsed_dataset
    .cache()
    .shuffle(1000)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
Edit
If you wish to keep the way you save the data, you can use np.frombuffer to convert the binary strings back to numpy vectors. You will have to wrap this code in a tf.function and tf.py_function, though.
def decode_embedding(embedding_bytes):
    """Rebuild the embedding vector from its raw byte string.

    Args:
        embedding_bytes: Object whose ``.numpy()`` returns the raw bytes of
            the embedding (an eager string tensor when called through
            ``tf.py_function``).

    Returns:
        A 1-D float32 numpy array viewing those bytes.
    """
    # np.frombuffer reads float64 unless told otherwise, which would
    # misinterpret (or reject) a float32 buffer. Assumes the embedding was
    # written as float32, matching Tout=tf.float32 below - TODO confirm.
    return np.frombuffer(embedding_bytes.numpy(), dtype=np.float32)


# NOTE(review): the next line may be a disabled `@tf.function()` decorator - confirm.
#tf.function()
def tf_decode_embedding(embedding_bytes):
    """Run ``decode_embedding`` through ``tf.py_function``.

    Args:
        embedding_bytes: String tensor holding the raw embedding bytes.

    Returns:
        The float32 tensor produced by ``tf.py_function``.
    """
    return tf.py_function(decode_embedding, inp=[embedding_bytes], Tout=tf.float32)
# Parsing spec: both the label and the embedding are stored as scalar strings.
feature_description = {
    'classes_text': tf.io.FixedLenFeature((), tf.string),
    'data': tf.io.FixedLenFeature([], tf.string),
}


def _parse_data_function(example_proto):
    """Parse one serialized example and decode both of its fields.

    Args:
        example_proto: A serialized example, parsed with ``feature_description``.

    Returns:
        The parsed dict, with 'classes_text' replaced by the result of
        ``label_table.lookup`` and 'data' replaced by the result of
        ``tf_decode_embedding``.
    """
    record = tf.compat.v1.parse_single_example(example_proto, feature_description)
    label_id = label_table.lookup(record['classes_text'])
    embedding = tf_decode_embedding(record['data'])
    record['classes_text'] = label_id
    record['data'] = embedding
    return record


parsed_dataset = tfrecord_ds.map(_parse_data_function)
# Cache, shuffle (buffer 1000), batch (32) and prefetch, one stage per line.
dataset = (
    parsed_dataset
    .cache()
    .shuffle(1000)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

Performing inference with a BERT (TF 1.x) saved model

I'm stuck on one line of code and have been stalled on a project all weekend as a result.
I am working on a project that uses BERT for sentence classification. I have successfully trained the model, and I can test the results using the example code from run_classifier.py.
I can export the model using this example code (which has been reposted repeatedly, so I believe that it's right for this model):
def export(self):
    """Export ``self.estimator`` as a saved model under ``self.output_dir``.

    The serving input receiver takes four int32 placeholders: 'label_ids'
    with shape [None], and 'input_ids', 'input_mask' and 'segment_ids' with
    shape [None, self.max_seq_length].
    """
    def serving_input_fn():
        seq_shape = [None, self.max_seq_length]
        receiver_tensors = {
            'label_ids': tf.placeholder(tf.int32, [None], name='label_ids'),
            'input_ids': tf.placeholder(tf.int32, seq_shape, name='input_ids'),
            'input_mask': tf.placeholder(tf.int32, seq_shape, name='input_mask'),
            'segment_ids': tf.placeholder(tf.int32, seq_shape, name='segment_ids'),
        }
        build_receiver = tf.estimator.export.build_raw_serving_input_receiver_fn(
            receiver_tensors)
        # The builder returns a function; call it to get the receiver itself.
        return build_receiver()

    self.estimator._export_to_tpu = False
    self.estimator.export_savedmodel(self.output_dir, serving_input_fn)
I can also load the exported estimator (where the export function saves the exported model into a subdirectory labeled with a timestamp):
predict_fn = predictor.from_saved_model(self.output_dir + timestamp_number)
However, for the life of me, I cannot figure out what to provide to predict_fn as input for inference. Here is my best code at the moment:
# Builds features for one test sentence, loads the exported model and tries to
# run inference on it.
def predict(self):
# NOTE(review): `input` shadows the builtin of the same name.
input = 'Test input'
guid = 'predict-0'
text_a = tokenization.convert_to_unicode(input)
# The first entry of the label list is used as the example's label.
label = self.label_list[0]
examples = [InputExample(guid=guid, text_a=text_a, text_b=None, label=label)]
features = convert_examples_to_features(examples, self.label_list,
self.max_seq_length, self.tokenizer)
predict_input_fn = input_fn_builder(features, self.max_seq_length, False)
# NOTE(review): `timestamp_number` is not defined anywhere in this snippet.
predict_fn = predictor.from_saved_model(self.output_dir + timestamp_number)
# NOTE(review): an input function is passed here, while the export code
# registers a dict of four named placeholders (label_ids, input_ids,
# input_mask, segment_ids); presumably a dict with those keys is expected -
# confirm.
result = predict_fn(predict_input_fn) # this generates an error
print(result)
It doesn't seem to matter what I provide to predict_fn: the examples array, the features array, the predict_input_fn function. Clearly, predict_fn wants a dictionary of some type - but every single thing that I've tried generates an exception due to a tensor mismatch or other errors that generally mean: bad input.
I presumed that the from_saved_model function wants the same sort of input as the model test function - apparently, that's not the case.
It seems that lots of people have asked this very question - "how do I use an exported BERT TensorFlow model for inference?" - and have gotten no answers:
Thread #1
Thread #2
Thread #3
Thread #4
Any help? Thanks in advance.
Thank you for this post. Your serving_input_fn was the piece I was missing! Your predict function needs to be changed to feed the features dict directly, rather than use the predict_input_fn:
def predict(sentences):
    """Run the exported model on ``sentences`` and print the raw result.

    Args:
        sentences: Iterable of input strings, one example per string.
    """
    label_list = [0, 1]

    # guid "" and label 0 are placeholders.
    examples = []
    for sentence in sentences:
        examples.append(
            run_classifier.InputExample(
                guid="",
                text_a=sentence,
                text_b=None,
                label=0,
            )
        )

    features = run_classifier.convert_examples_to_features(
        examples, label_list, MAX_SEQ_LEN, tokenizer
    )

    # The predictor is fed a dict of parallel lists (this replaces the
    # pred_input_fn approach), filled in one pass over the features.
    pred_dict = {
        'input_ids': [],
        'input_mask': [],
        'segment_ids': [],
        'label_ids': [],
    }
    for feature in features:
        pred_dict['input_ids'].append(feature.input_ids)
        pred_dict['input_mask'].append(feature.input_mask)
        pred_dict['segment_ids'].append(feature.segment_ids)
        pred_dict['label_ids'].append(feature.label_id)

    predict_fn = predictor.from_saved_model('../testing/1589418540')
    result = predict_fn(pred_dict)
    print(result)
# Example sentences passed to predict() in one call.
pred_sentences = [
"That movie was absolutely awful",
"The acting was a bit lacking",
"The film was creative and surprising",
"Absolutely fantastic!",
]
predict(pred_sentences)
{'probabilities': array([[-0.3579178 , -1.2010787 ],
[-0.36648935, -1.1814401 ],
[-0.30407643, -1.3386648 ],
[-0.45970002, -0.9982413 ],
[-0.36113673, -1.1936386 ],
[-0.36672896, -1.1808994 ]], dtype=float32), 'labels': array([0, 0, 0, 0, 0, 0])}
However, the probabilities returned for the sentences in pred_sentences do not match the probabilities I get when using estimator.predict(predict_input_fn), where estimator is the fine-tuned model being used within the same (Python) session. For example, [-0.27276006, -1.4324446 ] using estimator vs [-0.26713806, -1.4505868 ] using predictor.

How can i use 38 classes instead of 1000 in model.predict decode predictions

I am getting an error in plant disease detection using the ResNet50 deep learning model: every time, decode_predictions raises an error message.
error
expects a batch of predictions (i.e. a 2D array of shape (samples, 1000)). Found array with shape: (1, 38)"
enter code here
# Start from an ImageNet ResNet50 without its top layers; it is replaced below
# if the fine-tuned checkpoint loads.
model = ResNet50(weights='imagenet',include_top=False,classes=38)
try:
    model = load_model('/content/drive/My Drive/color/checkpoints/ResNet50_model_weights.h5')
    print("model loaded")
except Exception as err:
    # Still best-effort, but the reason is reported, and KeyboardInterrupt /
    # SystemExit are no longer swallowed as they were by a bare `except:`.
    print("model not loaded")
    print(err)
img_path = '/content/drive/My Drive/color/test/0/appleblackrot188.jpg'
img = image.load_img(img_path, target_size=(300, 300))
x = image.img_to_array(img)
# Add a leading batch axis so the model receives a batch of one image.
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)
preds = model.predict(x)
# NOTE(review): this is the call the question is about. The reported error
# says decode_predictions expects (samples, 1000) but got (1, 38).
print('Predicted:', decode_predictions(preds,top=3)[0])
You can try using the preprocessing function:
import tensorflow as tf
# Uses the Keras API bundled with TensorFlow (standalone Keras should behave the same).
IMAGE = [] # Placeholder for the image data; the author captured it from a camera.
# Convert to an array, add a leading batch axis, then apply the ResNet50 preprocessing.
toPred = tf.keras.applications.resnet50.preprocess_input(np.expand_dims(tf.keras.preprocessing.image.img_to_array(IMAGE), axis=0))
Maybe that can help :)
decode_predictions works only for ImageNet (no. of classes = 1000). For these 38 classes of plants, you have to write your own decode predictions based on the ground truth label you've assigned for each plant.
First, you need an index JSON file and create a new decode_predictions function.
For example
Take HAM10000, which has 7 classes: you need to split the images into one folder per class, like this
then make an index JSON file like this
{
"0" : [
"AKIEC",
"akiec"
],
"1" : [
"BCC",
"bcc"
],
"2" : [
"BKL",
"bkl"
],
"3" : [
"DF",
"df"
],
"4" : [
"MEL",
"mel"
],
"5" : [
"NV",
"nv"
],
"6" : [
"VASC",
"vasc"
]
}
Then try this code
def decode_predictions(preds, top=4, class_list_path='/content/SKIN_DATA/HAM10000_index.json'):
    """Turn a batch of class scores into labelled top-``top`` predictions.

    The number of classes is taken from the class index file, so the same
    function works for any custom label set.

    Args:
        preds: 2D array of shape (samples, num_classes), where num_classes is
            the number of entries in the class index file.
        top: How many of the highest-scoring classes to return per sample.
        class_list_path: Path to a JSON file mapping each class index (as a
            string) to a list of names for that class.

    Returns:
        One list per sample. Each list holds tuples made of the class names
        followed by the score, ordered from highest to lowest score.

    Raises:
        ValueError: If ``preds`` is not 2D or its second dimension differs
            from the number of classes in the index file.
    """
    # Use a context manager so the index file is always closed.
    with open(class_list_path) as index_file:
        index_list = json.load(index_file)
    num_classes = len(index_list)

    if len(preds.shape) != 2 or preds.shape[1] != num_classes:
        # Report the class count that is actually expected.
        raise ValueError('`decode_predictions` expects '
                         'a batch of predictions '
                         '(i.e. a 2D array of shape (samples, ' + str(num_classes) + ')). '
                         'Found array with shape: ' + str(preds.shape))

    results = []
    for pred in preds:
        # argsort is ascending: take the last `top` indices and reverse them,
        # which already yields highest score first, so no further sort is needed.
        top_indices = pred.argsort()[-top:][::-1]
        results.append([tuple(index_list[str(i)]) + (pred[i],) for i in top_indices])
    return results