TFX Tensorflow model validator component - You passed a data dictionary with keys ['image_raw_xf']. Expected the following keys: ['input_1'] - tensorflow

I'm building a TFX pipeline based on the CIFAR-10 example: https://github.com/tensorflow/tfx/tree/master/tfx/examples/cifar10
The difference is that I don't want to convert the model to TFLite; instead I want to use a regular Keras-based TensorFlow model.
Everything works as expected until I get to the Evaluator component as it fails with the following error:
ValueError: Missing data for input "input_1". You passed a data dictionary with keys ['image_xf']. Expected the following keys: ['input_1']
[while running 'Run[Trainer]']
Not sure what I'm doing wrong, but so far I debugged/modified the code as follows:
[1] The preprocessing_fn outputs the key image_xf:
_IMAGE_KEY = 'image'
_LABEL_KEY = 'label'


def _transformed_name(key):
  """Returns `key` with the '_xf' suffix used for transformed features."""
  return key + '_xf'
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Decodes the PNG-encoded images, resizes them to 224x224 and applies the
  MobileNet input preprocessing. The label is passed through unchanged.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}

  # tf.io.decode_png function cannot be applied on a batch of data.
  # We have to use tf.map_fn
  image_features = tf.map_fn(
      lambda x: tf.io.decode_png(x[0], channels=3),
      inputs[_IMAGE_KEY],
      dtype=tf.uint8)
  # image_features = tf.cast(image_features, tf.float32)
  image_features = tf.image.resize(image_features, [224, 224])
  image_features = tf.keras.applications.mobilenet.preprocess_input(
      image_features)

  outputs[_transformed_name(_IMAGE_KEY)] = image_features
  #outputs["input_1"] = image_features
  # TODO(b/157064428): Support label transformation for Keras.
  # Do not apply label transformation as it will result in wrong evaluation.
  outputs[_transformed_name(_LABEL_KEY)] = inputs[_LABEL_KEY]

  return outputs
[2] When I build the model, I use transfer learning with an InputLayer that has the same name, image_xf.
def _build_keras_model() -> tf.keras.Model:
  """Creates a Image classification model with MobileNet backbone.

  Returns:
    The image classification Keras Model and the backbone MobileNet model
  """
  # We create a MobileNet model with weights pre-trained on ImageNet.
  # We remove the top classification layer of the MobileNet, which was
  # used for classifying ImageNet objects. We will add our own classification
  # layer for CIFAR10 later. We use average pooling at the last convolution
  # layer to get a 1D vector for classification, which is consistent with the
  # origin MobileNet setup
  base_model = tf.keras.applications.MobileNet(
      input_shape=(224, 224, 3),
      include_top=False,
      weights='imagenet',
      pooling='avg')
  base_model.input_spec = None

  # We add a Dropout layer at the top of MobileNet backbone we just created to
  # prevent overfitting, and then a Dense layer to classifying CIFAR10 objects
  model = tf.keras.Sequential([
      tf.keras.layers.InputLayer(
          input_shape=(224, 224, 3), name=_transformed_name(_IMAGE_KEY)),
      base_model,
      tf.keras.layers.Dropout(0.1),
      tf.keras.layers.Dense(10, activation='softmax')
  ])
  # (Excerpt ends here; the complete function, including compile() and the
  # return statement, is in the utils file listing further down.)
[3] The model signature is created accordingly:
def _get_serve_image_fn(model, tf_transform_output):
  """Returns a function that feeds the input tensor into the model.

  Args:
    model: the Keras model to call; the Transform features layer is attached
      to it as `model.tft_layer`.
    tf_transform_output: object providing `transform_features_layer()` and
      `raw_feature_spec()`.

  Returns:
    A `tf.function` that parses serialized tf.Examples, applies the Transform
    layer and returns the model output.
  """
  model.tft_layer = tf_transform_output.transform_features_layer()

  # Must be a tf.function: the caller invokes get_concrete_function() on it.
  @tf.function
  def serve_image_fn(serialized_tf_examples):
    feature_spec = tf_transform_output.raw_feature_spec()
    # The label is not part of a serving request.
    feature_spec.pop(_LABEL_KEY)
    parsed_features = tf.io.parse_example(serialized_tf_examples, feature_spec)
    transformed_features = model.tft_layer(parsed_features)
    return model(transformed_features)

  return serve_image_fn
def run_fn(fn_args: FnArgs):
  """Exports the trained model with a 'serving_default' signature.

  (Excerpt: building and training `model` is omitted here; the complete
  function is in the utils file listing further down.)

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
  tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)

  # The signature takes a batch of serialized tf.Example strings.
  serving_fn = _get_serve_image_fn(model, tf_transform_output)
  signatures = {
      'serving_default':
          serving_fn.get_concrete_function(
              tf.TensorSpec(shape=[None], dtype=tf.string, name=_IMAGE_KEY))
  }

  temp_saving_model_dir = os.path.join(fn_args.serving_model_dir)
  model.save(temp_saving_model_dir, save_format='tf', signatures=signatures)
Now, I suspect that TensorFlow is not saving the model correctly, because when I load the saved model the input layer is named input_1 instead of image_xf.
import tensorflow as tf
import numpy as np
import tensorflow.python.ops.numpy_ops.np_config as np_config
np_config.enable_numpy_behavior()
# Directory containing the exported SavedModel.
path = './model/Format-Serving/'
# Load the same export twice: once as a generic SavedModel (to list its
# signatures) and once as a Keras model (to inspect its layers).
imported = tf.saved_model.load(path)
model = tf.keras.models.load_model(path)
# The value returned by summary() is printed too, which is the extra "None"
# line in the output below.
print(model.summary())
print(list(imported.signatures.keys()))
# Name of the first layer inside the nested MobileNet backbone.
print(model.get_layer('mobilenet_1.00_224').layers[0].name)
The things to notice here are (1) that the InputLayer I added to the Sequential model above is missing, and (2) that the first layer of the MobileNet backbone is named input_1, which explains the mismatch.
2021-10-15 08:33:40.683034: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
mobilenet_1.00_224 (Function (None, 1024) 3228864
_________________________________________________________________
dropout (Dropout) (None, 1024) 0
_________________________________________________________________
dense (Dense) (None, 10) 10250
=================================================================
Total params: 3,239,114
Trainable params: 1,074,186
Non-trainable params: 2,164,928
_________________________________________________________________
None
['serving_default']
input_1
So how can I actually get the model to save correctly with the right input?
Here is the full code:
pipeline.py
# Lint as: python2, python3
# Copyright 2019 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""CIFAR10 image classification example using TFX.
This example demonstrates how to do data augmentation, transfer learning,
and inserting TFLite metadata with TFX.
The trained model can be plugged into MLKit for object detection.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import os
from typing import List, Text
import absl
from tfx import v1 as tfx
import tensorflow_model_analysis as tfma
from tfx.components import Evaluator
from tfx.components import ExampleValidator
from tfx.components import ImportExampleGen
from tfx.components import Pusher
from tfx.components import SchemaGen
from tfx.components import StatisticsGen
from tfx.components import Trainer
from tfx.components import Transform
from tfx.dsl.components.common import resolver
from tfx.dsl.experimental import latest_blessed_model_resolver
from tfx.orchestration import metadata
from tfx.orchestration import pipeline
from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner
from tfx.proto import example_gen_pb2
from tfx.proto import pusher_pb2
from tfx.proto import trainer_pb2
from tfx.types import Channel
from tfx.types.standard_artifacts import Model
from tfx.types.standard_artifacts import ModelBlessing
_pipeline_name = 'cifar10_native_keras'

# All locations below are resolved against the current working directory.
# The input data is expected under ./data (the pipeline reads the train/* and
# test/* patterns beneath it) and the utility module next to it. Feel free to
# customize as needed.
_cifar10_root = os.getcwd()
_data_root = os.path.join(_cifar10_root, 'data')
# Python module files to inject customized logic into the TFX components. The
# Transform and Trainer both require user-defined functions to run successfully.
_module_file = os.path.join(_cifar10_root, 'cifar10_utils_native_keras.py')
# Path which can be listened to by the model server. Pusher will output the
# trained model here.
_serving_model_dir_lite = os.path.join(_cifar10_root, 'serving_model_lite',
                                       _pipeline_name)

# Pipeline outputs and the metadata library live under ./tfx, but you can
# store these files anywhere on your local filesystem.
_tfx_root = os.path.join(os.getcwd(), 'tfx')
_pipeline_root = os.path.join(_tfx_root, 'pipelines', _pipeline_name)
# Sqlite ML-metadata db path.
_metadata_path = os.path.join(_tfx_root, 'metadata', _pipeline_name,
                              'metadata.db')
# Path to labels file for mapping model outputs.
_labels_path = os.path.join(_data_root, 'labels.txt')

# Pipeline arguments for Beam powered Components.
_beam_pipeline_args = [
    '--direct_running_mode=multi_processing',
    '--direct_num_workers=0',
]
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir_lite: Text,
                     metadata_path: Text,
                     labels_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
  """Implements the CIFAR10 image classification pipeline using TFX.

  Args:
    pipeline_name: Name given to the pipeline.
    pipeline_root: Root directory passed to the pipeline as `pipeline_root`.
    data_root: Directory holding the input data; the 'train' split is read
      from `train/*` and the 'eval' split from `test/*` beneath it.
    module_file: Python module handed to both Transform and Trainer.
    serving_model_dir_lite: Directory the Pusher writes the model to.
    metadata_path: Path of the SQLite ML-Metadata database.
    labels_path: Path to the labels file, forwarded to the Trainer through
      `custom_config`.
    beam_pipeline_args: Arguments for the Beam powered components.

  Returns:
    The assembled TFX pipeline.
  """
  # This is needed for datasets with pre-defined splits
  # Change the pattern argument to train_whole/* and test_whole/* to train
  # on the whole CIFAR-10 dataset
  input_config = example_gen_pb2.Input(splits=[
      example_gen_pb2.Input.Split(name='train', pattern='train/*'),
      example_gen_pb2.Input.Split(name='eval', pattern='test/*')
  ])

  # Brings data into the pipeline.
  example_gen = ImportExampleGen(
      input_base=data_root, input_config=input_config)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  schema_gen = SchemaGen(
      statistics=statistics_gen.outputs['statistics'], infer_feature_shape=True)

  # Performs anomaly detection based on statistics and data schema.
  example_validator = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=schema_gen.outputs['schema'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=schema_gen.outputs['schema'],
      module_file=module_file)

  # Resolves the most recent Model artifact; it is fed to the Trainer below
  # as `base_model`.
  # NOTE(review): despite its id ('latest_blessed_model_resolver') this uses
  # LatestArtifactStrategy and takes no blessing channel - confirm that an
  # unblessed model is an acceptable baseline.
  model_resolver = resolver.Resolver(
      #instance_name='latest_model_resolver',
      strategy_class=tfx.dsl.experimental.LatestArtifactStrategy,
      model=Channel(type=Model)).with_id('latest_blessed_model_resolver')

  # Uses user-provided Python function that trains a model.
  # When training on the whole dataset, use 18744 for train steps, 156 for eval
  # steps. 18744 train steps correspond to 24 epochs on the whole train set, and
  # 156 eval steps correspond to 1 epoch on the whole test set. The
  # configuration below is for training on the dataset we provided in the data
  # folder, which has 128 train and 128 test samples. The 160 train steps
  # correspond to 40 epochs on this tiny train set, and 4 eval steps correspond
  # to 1 epoch on this tiny test set.
  trainer = Trainer(
      module_file=module_file,
      examples=transform.outputs['transformed_examples'],
      transform_graph=transform.outputs['transform_graph'],
      schema=schema_gen.outputs['schema'],
      base_model=model_resolver.outputs['model'],
      train_args=trainer_pb2.TrainArgs(num_steps=160),
      eval_args=trainer_pb2.EvalArgs(num_steps=4),
      custom_config={'labels_path': labels_path})

  # Get the latest blessed model for model validation.
  # model_resolver = resolver.Resolver(
  #     strategy_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
  #     model=Channel(type=Model),
  #     model_blessing=Channel(
  #         type=ModelBlessing)).with_id('latest_blessed_model_resolver')

  # Uses TFMA to compute evaluation statistics over features of a model and
  # perform quality validation of a candidate model (compare to a baseline).
  eval_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(label_key='label')],
      slicing_specs=[tfma.SlicingSpec()],
      metrics_specs=[
          tfma.MetricsSpec(metrics=[
              tfma.MetricConfig(
                  class_name='SparseCategoricalAccuracy',
                  threshold=tfma.MetricThreshold(
                      value_threshold=tfma.GenericValueThreshold(
                          lower_bound={'value': 0.55}),
                      # Change threshold will be ignored if there is no
                      # baseline model resolved from MLMD (first run).
                      change_threshold=tfma.GenericChangeThreshold(
                          direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                          absolute={'value': -1e-3})))
          ])
      ])

  # Uses TFMA to compute the evaluation statistics over features of a model.
  # The Evaluator is given the raw examples produced by ExampleGen (not the
  # materialized output of Transform), with 'label' as the label key. No
  # baseline model is passed, so only the value threshold can apply.
  evaluator = Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      #baseline_model=model_resolver.outputs['model'],
      eval_config=eval_config)

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=evaluator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir_lite)))

  components = [
      example_gen, statistics_gen, schema_gen, example_validator, transform,
      trainer, model_resolver, evaluator, pusher
  ]

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=components,
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      beam_pipeline_args=beam_pipeline_args)
# To run this pipeline from the python CLI:
#   $python cifar_pipeline_native_keras.py
if __name__ == '__main__':
  # Raise every already-registered logger, and the root logger, to INFO.
  loggers = [
      logging.getLogger(name) for name in logging.root.manager.loggerDict
  ]
  for logger in loggers:
    logger.setLevel(logging.INFO)
  logging.getLogger().setLevel(logging.INFO)

  absl.logging.set_verbosity(absl.logging.FATAL)
  BeamDagRunner().run(
      _create_pipeline(
          pipeline_name=_pipeline_name,
          pipeline_root=_pipeline_root,
          data_root=_data_root,
          module_file=_module_file,
          serving_model_dir_lite=_serving_model_dir_lite,
          metadata_path=_metadata_path,
          labels_path=_labels_path,
          beam_pipeline_args=_beam_pipeline_args))
utils file:
# Lint as: python2, python3
# Copyright 2019 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Python source file includes CIFAR10 utils for Keras model.
The utilities in this file are used to build a model with native Keras.
This module file will be used in Transform and generic Trainer.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
from typing import List, Text, Tuple

import absl
import tensorflow as tf
import tensorflow_transform as tft
from tfx.components.trainer.fn_args_utils import DataAccessor
from tfx.components.trainer.fn_args_utils import FnArgs
from tfx.components.trainer.rewriting import converters
from tfx.components.trainer.rewriting import rewriter
from tfx.components.trainer.rewriting import rewriter_factory
from tfx.dsl.io import fileio
from tfx_bsl.tfxio import dataset_options
# import flatbuffers
# from tflite_support import metadata_schema_py_generated as _metadata_fb
# from tflite_support import metadata as _metadata
# Settings for the small sample dataset (128 train / 128 eval examples).
#
# When training on the whole dataset use the following values instead; that
# setting should give ~91% accuracy on the whole test set:
#   _TRAIN_DATA_SIZE = 50000
#   _EVAL_DATA_SIZE = 10000
#   _TRAIN_BATCH_SIZE = 64
#   _EVAL_BATCH_SIZE = 64
#   _CLASSIFIER_LEARNING_RATE = 3e-4
#   _FINETUNE_LEARNING_RATE = 5e-5
#   _CLASSIFIER_EPOCHS = 12

# Dataset and batch sizes.
_TRAIN_DATA_SIZE = 128
_EVAL_DATA_SIZE = 128
_TRAIN_BATCH_SIZE = 32
_EVAL_BATCH_SIZE = 32

# Learning rates for the two training phases (classifier head only, then
# fine-tuning), and the number of epochs spent in the first phase.
_CLASSIFIER_LEARNING_RATE = 1e-3
_FINETUNE_LEARNING_RATE = 7e-6
_CLASSIFIER_EPOCHS = 30

# Feature keys of the raw examples.
_IMAGE_KEY = 'image'
_LABEL_KEY = 'label'

_TFLITE_MODEL_NAME = 'tflite'
def _transformed_name(key):
  """Returns `key` with the '_xf' suffix used for transformed features."""
  return key + '_xf'
def _get_serve_image_fn(model, tf_transform_output):
  """Returns a function that feeds the input tensor into the model.

  Args:
    model: the Keras model to call; the Transform features layer is attached
      to it as `model.tft_layer`.
    tf_transform_output: object providing `transform_features_layer()` and
      `raw_feature_spec()`.

  Returns:
    A `tf.function` that parses serialized tf.Examples, applies the Transform
    layer and returns the model output.
  """
  model.tft_layer = tf_transform_output.transform_features_layer()

  # Must be a tf.function: run_fn invokes get_concrete_function() on it.
  @tf.function
  def serve_image_fn(serialized_tf_examples):
    feature_spec = tf_transform_output.raw_feature_spec()
    # The label is not part of a serving request.
    feature_spec.pop(_LABEL_KEY)
    parsed_features = tf.io.parse_example(serialized_tf_examples, feature_spec)
    transformed_features = model.tft_layer(parsed_features)
    return model(transformed_features)

  return serve_image_fn
def _image_augmentation(image_features):
  """Perform image augmentation on batches of images.

  Applies a random horizontal flip, then pads/crops to 250x250 and takes a
  random 224x224 crop.

  Args:
    image_features: a batch of image features

  Returns:
    The augmented image features
  """
  batch_size = tf.shape(image_features)[0]
  image_features = tf.image.random_flip_left_right(image_features)
  image_features = tf.image.resize_with_crop_or_pad(image_features, 250, 250)
  image_features = tf.image.random_crop(image_features,
                                        (batch_size, 224, 224, 3))
  return image_features
def _data_augmentation(feature_dict):
  """Perform data augmentation on batches of data.

  Only the transformed image feature is augmented; it is replaced in place in
  `feature_dict`.

  Args:
    feature_dict: a dict containing features of samples

  Returns:
    The feature dict with augmented features
  """
  image_key = _transformed_name(_IMAGE_KEY)
  feature_dict[image_key] = _image_augmentation(feature_dict[image_key])
  return feature_dict
def _input_fn(file_pattern: List[Text],
              data_accessor: DataAccessor,
              tf_transform_output: tft.TFTransformOutput,
              is_train: bool = False,
              batch_size: int = 200) -> tf.data.Dataset:
  """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    data_accessor: DataAccessor for converting input to RecordBatch.
    tf_transform_output: A TFTransformOutput.
    is_train: Whether the input dataset is train split or not.
    batch_size: representing the number of consecutive elements of returned
      dataset to combine in a single batch

  Returns:
    A dataset that contains (features, indices) tuple where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
  dataset = data_accessor.tf_dataset_factory(
      file_pattern,
      dataset_options.TensorFlowDatasetOptions(
          batch_size=batch_size, label_key=_transformed_name(_LABEL_KEY)),
      tf_transform_output.transformed_metadata.schema)

  # Apply data augmentation. We have to do data augmentation here because
  # we need to apply data augmentation on-the-fly during training. If we put
  # it in Transform, it will only be applied once on the whole dataset, which
  # will lose the point of data augmentation.
  if is_train:
    dataset = dataset.map(lambda x, y: (_data_augmentation(x), y))

  return dataset
def _freeze_model_by_percentage(model: tf.keras.Model, percentage: float):
  """Freeze part of the model based on specified percentage.

  The first `int(len(model.layers) * percentage)` layers are frozen; every
  remaining layer is made trainable.

  Args:
    model: The keras model need to be partially frozen
    percentage: the percentage of layers to freeze

  Raises:
    ValueError: Invalid values.
  """
  if percentage < 0 or percentage > 1:
    raise ValueError('Freeze percentage should between 0.0 and 1.0')

  if not model.trainable:
    raise ValueError(
        'The model is not trainable, please set model.trainable to True')

  num_layers = len(model.layers)
  num_layers_to_freeze = int(num_layers * percentage)
  for idx, layer in enumerate(model.layers):
    # Layers before the cut-off are frozen, the rest stay trainable.
    layer.trainable = idx >= num_layers_to_freeze
def _build_keras_model() -> Tuple[tf.keras.Model, tf.keras.Model]:
  """Creates a Image classification model with MobileNet backbone.

  The backbone is fully frozen before the model is compiled, so that only the
  new classifier on top is trained at first.

  Returns:
    A `(model, base_model)` tuple: the image classification Keras Model and
    the backbone MobileNet model.
  """
  # We create a MobileNet model with weights pre-trained on ImageNet.
  # We remove the top classification layer of the MobileNet, which was
  # used for classifying ImageNet objects. We will add our own classification
  # layer for CIFAR10 later. We use average pooling at the last convolution
  # layer to get a 1D vector for classification, which is consistent with the
  # origin MobileNet setup
  base_model = tf.keras.applications.MobileNet(
      input_shape=(224, 224, 3),
      include_top=False,
      weights='imagenet',
      pooling='avg')
  base_model.input_spec = None

  # We add a Dropout layer at the top of MobileNet backbone we just created to
  # prevent overfitting, and then a Dense layer to classifying CIFAR10 objects
  model = tf.keras.Sequential([
      tf.keras.layers.InputLayer(
          input_shape=(224, 224, 3), name=_transformed_name(_IMAGE_KEY)),
      base_model,
      tf.keras.layers.Dropout(0.1),
      tf.keras.layers.Dense(10, activation='softmax')
  ])

  # Freeze the whole MobileNet backbone to first train the top classifier only
  _freeze_model_by_percentage(base_model, 1.0)

  model.compile(
      loss='sparse_categorical_crossentropy',
      # `learning_rate` is the current name of the deprecated `lr` argument.
      optimizer=tf.keras.optimizers.RMSprop(
          learning_rate=_CLASSIFIER_LEARNING_RATE),
      metrics=['sparse_categorical_accuracy'])
  model.summary(print_fn=absl.logging.info)

  return model, base_model
# TFX Transform will call this function.
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Decodes the PNG-encoded images, resizes them to 224x224 and applies the
  MobileNet input preprocessing. The label is passed through unchanged.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}

  # tf.io.decode_png function cannot be applied on a batch of data.
  # We have to use tf.map_fn
  image_features = tf.map_fn(
      lambda x: tf.io.decode_png(x[0], channels=3),
      inputs[_IMAGE_KEY],
      dtype=tf.uint8)
  # image_features = tf.cast(image_features, tf.float32)
  image_features = tf.image.resize(image_features, [224, 224])
  image_features = tf.keras.applications.mobilenet.preprocess_input(
      image_features)

  outputs[_transformed_name(_IMAGE_KEY)] = image_features
  #outputs["input_1"] = image_features
  # TODO(b/157064428): Support label transformation for Keras.
  # Do not apply label transformation as it will result in wrong evaluation.
  outputs[_transformed_name(_LABEL_KEY)] = inputs[_LABEL_KEY]

  return outputs
# TFX Trainer will call this function.
# NOTE(review): the indentation of this listing was lost, so the extent of the
# `else:` branch below cannot be recovered from the text. If it ends right
# after `_build_keras_model()`, the baseline branch leaves train_dataset,
# eval_dataset and base_model undefined for the training code that follows -
# confirm against the original file before re-indenting.
def run_fn(fn_args: FnArgs):
"""Train the model based on given args.
Trains in two phases (classifier head first, then fine-tuning with 90% of the
backbone layers frozen) and saves the model with a 'serving_default'
signature to fn_args.serving_model_dir.
Args:
fn_args: Holds args used to train the model as name/value pairs. Reads
transform_output, base_model, train_files, eval_files, data_accessor,
model_run_dir, train_steps, eval_steps and serving_model_dir.
Raises:
ValueError: if _CLASSIFIER_EPOCHS exceeds the total number of epochs derived
from fn_args.train_steps.
"""
tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)
# Optional previously trained model to start from instead of a fresh one.
baseline_path = fn_args.base_model
if baseline_path is not None:
model = tf.keras.models.load_model(os.path.join(baseline_path))
else:
train_dataset = _input_fn(
fn_args.train_files,
fn_args.data_accessor,
tf_transform_output,
is_train=True,
batch_size=_TRAIN_BATCH_SIZE)
eval_dataset = _input_fn(
fn_args.eval_files,
fn_args.data_accessor,
tf_transform_output,
is_train=False,
batch_size=_EVAL_BATCH_SIZE)
model, base_model = _build_keras_model()
absl.logging.info('Tensorboard logging to {}'.format(fn_args.model_run_dir))
# Write logs to path
tensorboard_callback = tf.keras.callbacks.TensorBoard(
log_dir=fn_args.model_run_dir, update_freq='batch')
# Our training regime has two phases: we first freeze the backbone and train
# the newly added classifier only, then unfreeze part of the backbone and
# fine-tune with classifier jointly.
steps_per_epoch = int(_TRAIN_DATA_SIZE / _TRAIN_BATCH_SIZE)
total_epochs = int(fn_args.train_steps / steps_per_epoch)
if _CLASSIFIER_EPOCHS > total_epochs:
raise ValueError('Classifier epochs is greater than the total epochs')
absl.logging.info('Start training the top classifier')
# Phase 1: epochs [0, _CLASSIFIER_EPOCHS).
model.fit(
train_dataset,
epochs=_CLASSIFIER_EPOCHS,
steps_per_epoch=steps_per_epoch,
validation_data=eval_dataset,
validation_steps=fn_args.eval_steps,
callbacks=[tensorboard_callback])
absl.logging.info('Start fine-tuning the model')
# Unfreeze the top MobileNet layers and do joint fine-tuning
_freeze_model_by_percentage(base_model, 0.9)
# We need to recompile the model because layer properties have changed
model.compile(
loss='sparse_categorical_crossentropy',
optimizer=tf.keras.optimizers.RMSprop(lr=_FINETUNE_LEARNING_RATE),
metrics=['sparse_categorical_accuracy'])
model.summary(print_fn=absl.logging.info)
# Phase 2: continues from epoch _CLASSIFIER_EPOCHS up to total_epochs.
model.fit(
train_dataset,
initial_epoch=_CLASSIFIER_EPOCHS,
epochs=total_epochs,
steps_per_epoch=steps_per_epoch,
validation_data=eval_dataset,
validation_steps=fn_args.eval_steps,
callbacks=[tensorboard_callback])
# Save the Keras model in SavedModel format with a 'serving_default'
# signature that takes a batch of serialized tf.Example strings.
signatures = {
'serving_default':
_get_serve_image_fn(model,tf_transform_output).get_concrete_function(
tf.TensorSpec(
shape=[None],
dtype=tf.string,
name=_IMAGE_KEY))
}
temp_saving_model_dir = os.path.join(fn_args.serving_model_dir)
model.save(temp_saving_model_dir, save_format='tf', signatures=signatures)
# tfrw = rewriter_factory.create_rewriter(
# rewriter_factory.TFLITE_REWRITER,
# name='tflite_rewriter')
# converters.rewrite_saved_model(temp_saving_model_dir,
# fn_args.serving_model_dir, tfrw,
# rewriter.ModelType.TFLITE_MODEL)
# # Add necessary TFLite metadata to the model in order to use it within MLKit
# # TODO(dzats#): Handle label map file path more properly, currently
# # hard-coded.
# tflite_model_path = os.path.join(fn_args.serving_model_dir,
# _TFLITE_MODEL_NAME)
# # TODO(dzats#): Extend the TFLite rewriter to be able to add TFLite metadata
# ## to the model.
# _write_metadata(
# model_path=tflite_model_path,
# label_map_path=fn_args.custom_config['labels_path'],
# mean=[127.5],
# std=[127.5])
# fileio.rmtree(temp_saving_model_dir)

OK, I found the answer. Because the model expects the input name input_1, _get_serve_image_fn has to pass the transformed image under that dictionary key, like this:
def _get_serve_image_fn(model, tf_transform_output):
  """Returns a function that feeds the input tensor into the model.

  The transformed image is passed to the model under the input name of the
  nested MobileNet backbone rather than under its Transform output name.

  Args:
    model: the Keras model to call; it must contain a layer named
      'mobilenet_1.00_224'. The Transform features layer is attached to it as
      `model.tft_layer`.
    tf_transform_output: object providing `transform_features_layer()` and
      `raw_feature_spec()`.

  Returns:
    A `tf.function` that parses serialized tf.Examples, applies the Transform
    layer and returns the model output.
  """
  model.tft_layer = tf_transform_output.transform_features_layer()
  # Name of the first layer of the backbone ('input_1' in the question).
  model_input_name = model.get_layer('mobilenet_1.00_224').layers[0].name

  # Must be a tf.function: run_fn invokes get_concrete_function() on it.
  @tf.function
  def serve_image_fn(serialized_tf_examples):
    feature_spec = tf_transform_output.raw_feature_spec()
    # The label is not part of a serving request.
    feature_spec.pop(_LABEL_KEY)
    parsed_features = tf.io.parse_example(serialized_tf_examples, feature_spec)
    transformed_features = model.tft_layer(parsed_features)
    # Re-key the image from '<image>_xf' to the name the model expects.
    # pop() first, so the entry survives even if both names are equal.
    transformed_features[model_input_name] = transformed_features.pop(
        _transformed_name(_IMAGE_KEY))
    return model(transformed_features)

  return serve_image_fn

Related

degraded accuracy performance with overfitting when downgrading from tensorflow 2.3.1 to tensorflow 1.14 or 1.15 on multiclass categorization

I made a script in tensorflow 2.x but I had to downconvert it to tensorflow 1.x (tested in 1.14 and 1.15). However, the tf1 version performs very differently (10% accuracy lower on the test set). See also the plot for train and validation performance (diagram is attached below).
Looking at the operations needed for the migration from tf1 to tf2 (see the TensorFlow migration guide), it seems that only the Adam learning rate may be a problem, but I'm defining it explicitly.
I've reproduced the same behavior both locally on GPU and CPU and on colab. The keras used was the one built-in in tensorflow (tf.keras). I've used the following functions (both for train,validation and test), using a sparse categorization (integers):
# Image generator for the training set; no rescaling or preprocessing
# function is applied here.
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    horizontal_flip=horizontal_flip,
    #rescale=None, #not needed for resnet50
    preprocessing_function=None,
    validation_split=None,
)
# Shuffled batches read from `train_dir`, with integer ('sparse') labels.
train_dataset = train_datagen.flow_from_directory(
    directory=train_dir,
    target_size=image_size,
    class_mode='sparse',
    batch_size=batch_size,
    shuffle=True,
)
And the model is a simple resnet50 with a new layer on top:
IMG_SHAPE = img_size + (3,)

# The model takes uint8 images and casts them to float32 itself.
inputs = Input(shape=IMG_SHAPE, name='image_input', dtype=tf.uint8)
x = tf.cast(inputs, tf.float32)
# not working in this version of keras. inserted in imageGenerator
x = preprocess_input_resnet50(x)

# ImageNet-pretrained ResNet50 without its classification head.
base_model = tf.keras.applications.ResNet50(
    include_top=False,
    input_shape=IMG_SHAPE,
    pooling=None,
    weights='imagenet',
)
# Freeze the pretrained weights
base_model.trainable = False
x = base_model(x)

# Rebuild top
x = GlobalAveragePooling2D(data_format='channels_last', name="avg_pool")(x)
top_dropout_rate = 0.2
x = Dropout(top_dropout_rate, name="top_dropout")(x)
outputs = Dense(num_classes, activation="softmax", name="pred_out")(x)

model = Model(inputs=inputs, outputs=outputs, name="ResNet50_comp")
optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy'],
)
And then I'm calling the fit function:
# Train for `initial_epochs` epochs, validating after each one; `stopping`
# is the only callback.
history = model.fit_generator(
    train_dataset,
    steps_per_epoch=n_train_batches,
    validation_data=validation_dataset,
    validation_steps=n_val_batches,
    epochs=initial_epochs,
    verbose=1,
    callbacks=[stopping],
)
I've reproduced the same behavior for example with the following full script (applied to my dataset and changed to adam and removed intermediate final dense layer):
deep learning sandbox
The easiest way to replicate this behavior is to run the same script in a tf2 environment with and without the following line. However, I've also tested on tf1 environments (1.14 and 1.15):
tf.compat.v1.disable_v2_behavior()
Sadly I cannot provide the dataset.
Update 26/11/2020
For full reproducibility I've obtained a similar behaviour by means of the food101 (101 categories) dataset enabling tf1 behaviour with 'tf.compat.v1.disable_v2_behavior()'. The following is the script executed with tensorflow-gpu 2.2.0:
#%% ref https://medium.com/deeplearningsandbox/how-to-use-transfer-learning-and-fine-tuning-in-keras-and-tensorflow-to-build-an-image-recognition-94b0b02444f2
import os
import sys
import glob
import argparse
import matplotlib.pyplot as plt
import tensorflow as tf
# enable and disable this to obtain tf1 behaviour
tf.compat.v1.disable_v2_behavior()
from tensorflow.keras import __version__
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
# since i'm using resnet50 weights from imagenet, i'm using food101 for
# similar but different categorization tasks
# pip install tensorflow-datasets if tensorflow_dataset not found
import tensorflow_datasets as tfds
# Load the food101 train/validation splits together with the dataset info.
(train_ds, validation_ds), info = tfds.load(
    'food101',
    split=['train', 'validation'],
    shuffle_files=True,
    with_info=True,
)
assert isinstance(train_ds, tf.data.Dataset)
print(train_ds)
#%%
# Image size, number of epochs and batch size used by the script.
IM_WIDTH, IM_HEIGHT = 224, 224
NB_EPOCHS = 10
BAT_SIZE = 32
def get_nb_files(directory):
  """Get number of files by searching directory recursively.

  For every sub-directory found while walking `directory`, the entries
  matching `<sub-directory>/*` are counted. Entries directly inside
  `directory` itself are not counted.

  Args:
    directory: path of the directory to search.

  Returns:
    The number of matched entries, or 0 if `directory` does not exist.
  """
  if not os.path.exists(directory):
    return 0
  cnt = 0
  for r, dirs, files in os.walk(directory):
    for dr in dirs:
      cnt += len(glob.glob(os.path.join(r, dr + "/*")))
  return cnt
def setup_to_transfer_learn(model, base_model):
  """Freeze all layers of the base model and compile the model.

  Args:
    model: the full model; compiled with the 'rmsprop' optimizer, sparse
      categorical cross-entropy loss and the accuracy metric.
    base_model: the backbone whose layers are all made non-trainable.
  """
  for layer in base_model.layers:
    layer.trainable = False
  model.compile(
      optimizer='rmsprop',
      loss='sparse_categorical_crossentropy',
      metrics=['accuracy'])
def add_new_last_layer(base_model, nb_classes):
  """Add last layer to the convnet.

  Global average pooling followed by a softmax Dense layer is stacked on the
  output of `base_model`.

  Args:
    base_model: keras model excluding top
    nb_classes: # of classes

  Returns:
    new keras model with last layer
  """
  x = base_model.output
  x = GlobalAveragePooling2D()(x)
  #x = Dense(FC_SIZE, activation='relu')(x) #new FC layer, random init
  predictions = Dense(nb_classes, activation='softmax')(x) #new softmax layer
  model = Model(inputs=base_model.input, outputs=predictions)
  return model
def train(nb_epoch, batch_size):
    """Use transfer learning to train a ResNet50-based classifier.

    Reads the module-level ``train_ds``, ``validation_ds`` and ``info``
    objects returned by ``tfds.load``.

    Args:
        nb_epoch: number of epochs passed to ``model.fit``.
        batch_size: batch size used to batch both datasets and to derive
            the number of training/validation steps per epoch.

    Returns:
        The object returned by ``model.fit`` (the training history).
    """
    nb_train_samples = info.splits['train'].num_examples
    nb_val_samples = info.splits['validation'].num_examples
    nb_classes = info.features['label'].num_classes

    def preprocess(features):
        """Resize, randomly flip and preprocess one example; return (image, label)."""
        image = tf.image.resize(features['image'], [224, 224])
        # data augmentation
        # NOTE(review): this function is mapped over the validation set too,
        # so validation images are randomly flipped as well.
        image = tf.image.random_flip_left_right(image)
        image = preprocess_input(image)
        # integer labels, as expected by sparse categorical cross-entropy
        return image, features['label']

    # pre-process the datasets to a fixed image size and batch them
    train_generator = train_ds.map(preprocess).batch(batch_size).repeat()
    validation_generator = validation_ds.map(preprocess).batch(batch_size).repeat()

    # setup model; include_top=False excludes the final FC layer
    base_model = ResNet50(weights='imagenet', include_top=False)
    model = add_new_last_layer(base_model, nb_classes)

    # transfer learning
    setup_to_transfer_learn(model, base_model)

    # Step counts must follow the batch size actually used to batch the
    # datasets, not the module-level BAT_SIZE constant.
    return model.fit(
        train_generator,
        epochs=nb_epoch,
        steps_per_epoch=nb_train_samples // batch_size,
        validation_data=validation_generator,
        validation_steps=nb_val_samples // batch_size)
#execute
history = train(nb_epoch=NB_EPOCHS, batch_size=BAT_SIZE)
And the performance on food101 dataset:
update 27/11/2020
It's possible to see the discrepancy also in the way smaller oxford_flowers102 dataset:
(train_ds,validation_ds,test_ds),info= tfds.load('oxford_flowers102', split=['train','validation','test'], shuffle_files=True, with_info=True)
Nb: the above plot shows confidences given by running the same training multiple times and evaluating the mean and std to check for the effects of random weight initialization and data augmentation.
Moreover I've tried some hyperparameter tuning on tf2 resulting in the following picture:
changing optimizer (adam and rmsprop)
not applying horizontal flipping augmentation
deactivating keras resnet50 preprocess_input
Thanks in advance for every suggestion. Here are the accuracy and validation performance on tf1 and tf2 on my dataset:
Update 14/12/2020
I'm sharing the colab for reproducibility on oxford_flowers at the click of a button:
colab script
I came across something similar, when doing the opposite migration (from TF1+Keras to TF2).
Running this code below:
# using TF2
import numpy as np
from tensorflow.keras.applications.resnet50 import ResNet50
fe = ResNet50(include_top=False, pooling="avg")
out = fe.predict(np.ones((1,224,224,3))).flatten()
sum(out)
>>> 212.3205274187726
# using TF1+Keras
import numpy as np
from keras.applications.resnet50 import ResNet50
fe = ResNet50(include_top=False, pooling="avg")
out = fe.predict(np.ones((1,224,224,3))).flatten()
sum(out)
>>> 187.23898954353717
you can see the same model from the same library on different versions does not return the same value (using sum as a quick check-up). I found the answer to this mysterious behavior in this other SO answer: ResNet model in keras and tf.keras give different output for the same image
Another recommendation I'd give you is, try using pooling from inside applications.resnet50.ResNet50 class, instead of the additional layer in your function, for simplicity, and to remove possible problem-generators :)

How to export test values on Tensorflow

I'm using a similar code to this as main train/test database and this to run the model.
I can print predictions in JSON, but I can't print the test values to see which prediction refers to which test example.
How can I do that?
I would like to export the tested data.
Here is my code for importing the data:
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A dataset loader for imports85.data."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import numpy as np
import tensorflow as tf
try:
import pandas as pd # pylint: disable=g-import-not-at-top
except ImportError:
pass
# Order is important for the csv-readers, so we use an OrderedDict here.
# Column order matters to the csv readers, hence the OrderedDict.
# Each value is a one-element list holding the column's default value.
defaults = collections.OrderedDict()
defaults["mes"] = [""]
defaults["marca"] = [""]
defaults["linha"] = [""]
defaults["grupo"] = [""]
defaults["capacidade"] = [0.0]
defaults["grade"] = [0.0]
defaults["custo"] = [0.0]
defaults["benef"] = [""]
defaults["desenvolvimento"] = [""]
defaults["leadtime"] = [0.0]

# Python type of every column, derived from its default value.
types = collections.OrderedDict(
    zip(defaults.keys(), (type(default[0]) for default in defaults.values())))
def dataset(file_name="treino.csv", y_name="leadtime", train_fraction=0.7):
    """Build a (train, test) pair of `Dataset`s from a ';'-separated text file.

    Each dataset generates (features_dict, label) pairs.

    Args:
      file_name: Path of the text file to read, one record per line.
      y_name: The name of the column to use as the label.
      train_fraction: A float, the fraction of data to use for training. The
          remainder will be used for evaluation.

    Returns:
      A (train, test) pair of `Datasets`
    """

    def decode_line(line):
        """Parse one text line into a (features_dict, label) pair."""
        # One tensor per column, typed after the entries of `defaults`.
        fields = tf.decode_csv(line, list(defaults.values()), field_delim=';')
        features = dict(zip(defaults.keys(), fields))
        # The label column must not remain among the features.
        label = features.pop(y_name)
        return features, label

    def has_no_question_marks(line):
        """Return a boolean tensor, true if the line contains no '?'."""
        characters = tf.string_split(line[tf.newaxis], "").values
        return ~tf.reduce_any(tf.equal(characters, "?"))

    def in_training_set(line):
        """Return a boolean tensor, true if the line is in the training set."""
        # Hashing the raw line gives a split that is deterministic per
        # example, so it stays the same across sessions and does not
        # require caching the whole dataset.
        num_buckets = 1000000
        bucket_id = tf.string_to_hash_bucket_fast(line, num_buckets)
        return bucket_id < int(train_fraction * num_buckets)

    def in_test_set(line):
        """Return a boolean tensor, true if the line is in the test set."""
        # `~` rather than `not`: the operand is a symbolic tensor.
        return ~in_training_set(line)

    # Read the file line by line and drop lines with question marks.
    lines = tf.contrib.data.TextLineDataset(file_name).filter(
        has_no_question_marks)

    # Training lines are decoded first and the decoded pairs are cached.
    train = lines.filter(in_training_set).map(decode_line).cache()
    # Test lines are cached raw and decoded afterwards.
    test = lines.filter(in_test_set).cache().map(decode_line)
    return train, test
def raw_dataframe(file_name="treino.csv", sep=";"):
    """Load the data file as a pd.DataFrame.

    Args:
      file_name: Path of the file to read. Defaults to the same file that
        `dataset()` reads by default.
      sep: Field separator. Defaults to ';', the delimiter `dataset()`
        uses when decoding lines.

    Returns:
      A DataFrame whose columns are named and typed after `types`; "?"
      entries are read as missing values.
    """
    # `file_name` used to be an undefined name here (NameError on every call);
    # it is now a parameter with a default.
    return pd.read_csv(
        file_name,
        names=list(types.keys()),
        dtype=types,
        na_values="?",
        sep=sep)
def load_data(y_name="leadtime", train_fraction=0.7, seed=None):
    """Load the data as pandas objects split into train and test subsets.

    Args:
      y_name: the column to return as the label.
      train_fraction: the fraction of the dataset to use for training.
      seed: The random seed to use when sampling the training rows. It is
        also passed to `np.random.seed`.

    Returns:
      a pair of pairs where the first pair is the training data, and the second
      is the test data:
      `(x_train, y_train), (x_test, y_test) = load_data(...)`
      `x` contains a pandas DataFrame of features, while `y` contains the label
      column.
    """
    # Load the raw columns and discard rows with unknowns.
    frame = raw_dataframe().dropna()

    # Seeds numpy's global random state.
    np.random.seed(seed)

    # Training rows are sampled; every remaining row goes to the test subset.
    x_train = frame.sample(frac=train_fraction, random_state=seed)
    x_test = frame.drop(x_train.index)

    # Move the label column out of each feature frame.
    y_train = x_train.pop(y_name)
    y_test = x_test.pop(y_name)
    return (x_train, y_train), (x_test, y_test)
and here is my code to test, evaluate and predict
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Regression using the DNNRegressor Estimator."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import pandas as pd
import importar_dados # pylint: disable=g-bad-import-order
STEPS = 100
LT_NORM_FACTOR = 199
def my_dnn_regression_fn(features, labels, mode, params):
    """A model function implementing DNN regression for a custom Estimator.

    Args:
      features: Features passed to `tf.feature_column.input_layer`.
      labels: Labels compared against the predictions; not read in
        `PREDICT` mode.
      mode: One of the `tf.estimator.ModeKeys` values.
      params: Dict with "feature_columns" (required) and the optional keys
        "hidden_units" (default [100]), "optimizer" (default
        `tf.train.AdamOptimizer`) and "learning_rate".

    Returns:
      A `tf.estimator.EstimatorSpec` for the requested mode.
    """
    # Extract the input into a dense layer, according to the feature_columns.
    top = tf.feature_column.input_layer(features, params["feature_columns"])

    # Iterate over the "hidden_units" list of layer sizes, default is [100].
    for units in params.get("hidden_units", [100]):
        # Add a hidden layer, densely connected on top of the previous layer.
        top = tf.layers.dense(inputs=top, units=units, activation=tf.nn.relu)

    # Connect a linear output layer on top.
    output_layer = tf.layers.dense(inputs=top, units=1)

    # Reshape the output layer to a 1-dim Tensor to return predictions
    predictions = tf.squeeze(output_layer, 1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # In `PREDICT` mode we only need to return predictions.
        return tf.estimator.EstimatorSpec(
            mode=mode, predictions={"leadtime": predictions})

    # Calculate loss using mean squared error
    average_loss = tf.losses.mean_squared_error(labels, predictions)

    # Pre-made estimators use the total_loss instead of the average,
    # so report total_loss for compatibility.
    batch_size = tf.shape(labels)[0]
    total_loss = tf.to_float(batch_size) * average_loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer_cls = params.get("optimizer", tf.train.AdamOptimizer)
        learning_rate = params.get("learning_rate")
        # Only forward a learning rate that was actually supplied; passing
        # None would override the optimizer's own default with None.
        if learning_rate is None:
            optimizer = optimizer_cls()
        else:
            optimizer = optimizer_cls(learning_rate)
        train_op = optimizer.minimize(
            loss=average_loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=total_loss, train_op=train_op)

    # In evaluation mode we will calculate evaluation metrics.
    assert mode == tf.estimator.ModeKeys.EVAL

    # Calculate root mean squared error
    rmse = tf.metrics.root_mean_squared_error(labels, predictions)

    # Add the rmse to the collection of evaluation metrics.
    eval_metrics = {"rmse": rmse}

    return tf.estimator.EstimatorSpec(
        mode=mode,
        # Report sum of error for compatibility with pre-made estimators
        loss=total_loss,
        eval_metric_ops=eval_metrics)
def main(argv):
    """Builds, trains, and evaluates the model, then prints its predictions.

    Args:
      argv: Command-line arguments as passed by `tf.app.run`; exactly one
        entry is expected.
    """
    assert len(argv) == 1
    (train, test) = importar_dados.dataset()

    # Scale the labels down by LT_NORM_FACTOR for better convergence.
    def normalize_lt(features, labels):
        return features, labels / LT_NORM_FACTOR

    train = train.map(normalize_lt)
    test = test.map(normalize_lt)

    # Build the training input_fn.
    def input_train():
        return (
            # Shuffle the training examples so they are well mixed.
            train.shuffle(1000).batch(128)
            # Repeat forever
            .repeat().make_one_shot_iterator().get_next())

    # Build the validation input_fn.
    def input_test():
        # The test set is deliberately NOT shuffled: evaluation does not
        # need it, and a fixed order keeps the predictions aligned with the
        # order of the test examples.
        return test.batch(128).make_one_shot_iterator().get_next()

    def vocabulary_column(key, vocabulary):
        """Categorical column whose categories are listed in `vocabulary`."""
        return tf.feature_column.categorical_column_with_vocabulary_list(
            key=key, vocabulary_list=vocabulary)

    # Each category gets its own weight. The vocabulary is specified with a
    # list of options; it can also be specified with a vocabulary file (using
    # `categorical_column_with_vocabulary_file`). For features covering a
    # range of positive integers use `categorical_column_with_identity`.
    marca_vocab = ["ANIMALE","FABULA","FARM","A.BRAND","F.Y.I","MAS ANIMALE"]
    marca = vocabulary_column("marca", marca_vocab)

    mes_vocab = ["1","2","3","4","5","6","7","8","9","10","11","12"]
    mes = vocabulary_column("mes", mes_vocab)

    linha_vocab = ["A+","SEDA","TRICOLINE","MALHA","JNS","SARJA","TECIDO","TECIDO PLANO","DESFILE ABRAND","ARTESANAL",
                   "TREND","NOITE","BB","JEANS","HANDMADE","ESI","ALFAIATARIA","PRO","COURO","EST","CONCEPT","OFF PREMIUM",
                   "ACESSORIOS","MOVE","NOITE CASUAL","TAT","RESORT","EMI","EMT","FITNESS","BALADA","HOME VESTUARIO",
                   "UNIFORME","BOT","VTL","TECIDO PLANO BASICO","HOM","PRAIA","INTIMATES","BTP","TRICOT","QUERO","EMB",
                   "ATL","BMA","SAPATO","PRINCESS","BLUE","BOLSA","ESB","TECIDO PLANO ELABORADO","NOVOS DESEJOS","FESTA",
                   "FANTASIA","MARKETING","ACE","TECIDO PLANO ESTAMPADO","ADMINISTRATIVO","FAN","TECIDO PLANO LISO","AGA",
                   "CDO","AGE","BIJOUX","COBRANDING","NEUTROS","ESM"]
    linha = vocabulary_column("linha", linha_vocab)

    grupo_vocab = ["VESTIDOS","TOP","TOP NEUTRO","TOP ELABORADO","SHORT","BLUSA","TOP BASICO","BOTTOM BASICO","VESTIDO BASICO",
                   "BLUSA ESTAMPADA","BOTTOM","MACACAO","TOP FUN","OVERTOPS","VESTIDO ESTAMPADO","BOTTOM ESTAMPADO",
                   "BOTTOM ELABORADO","CALCAS","CAMISA","SAIAS","AGASALHO","CALCA ESTAMPADA","ACESSORIOS","DIVERSOS",
                   "CINTOS","BIQUINI","TOP TECIDO","BIQUINI/MAIO","VESTUARIO","OVERTOP ESTAMPADO","CALCINHA","BERMUDA",
                   "LINGERIE","MAIO","VESTIDOS ELABORADO","OUTROS","SAPATOS","BOLSAS","CAMISA ESTAMPADA","LENCO","CHAPEU",
                   "FANTASIA","OVERTOP PESADO","TOP LEVE","HOME","PRAIA","OVERTOP LEVE","OVERTOP ELABORADO","STREET","ESPECIAL",
                   "PIJAMA","CANGA","BRINCO","SOUTIEN","OVERTOP BASICO","UNDERWEAR"]
    grupo = vocabulary_column("grupo", grupo_vocab)

    benef_vocab = ["S","N"]
    benef = vocabulary_column("benef", benef_vocab)

    desenvolvimento_vocab = ["INT","EX"]
    desenvolvimento = vocabulary_column("desenvolvimento", desenvolvimento_vocab)

    # Since this is a DNN model, the categorical columns are converted from
    # sparse to dense by wrapping them in an `indicator_column` (one-hot
    # vector). An `embedding_column` would give a trainable vector instead.
    feature_columns = [
        tf.feature_column.indicator_column(mes),
        tf.feature_column.indicator_column(marca),
        tf.feature_column.indicator_column(linha),
        tf.feature_column.indicator_column(grupo),
        tf.feature_column.numeric_column(key="capacidade"),
        tf.feature_column.numeric_column(key="grade"),
        tf.feature_column.numeric_column(key="custo"),
        tf.feature_column.indicator_column(benef),
        tf.feature_column.indicator_column(desenvolvimento),
    ]

    # Build a custom Estimator, using the model_fn.
    # `params` is passed through to the `model_fn`.
    model = tf.estimator.Estimator(
        model_fn=my_dnn_regression_fn,
        params={
            "feature_columns": feature_columns,
            "learning_rate": 0.001,
            "optimizer": tf.train.AdamOptimizer,
            "hidden_units": [100,500,100]
        },
        model_dir="resultados")

    # Train the model.
    model.train(input_fn=input_train, steps=STEPS)

    # Evaluate how the model performs on data it has not yet seen.
    eval_result = model.evaluate(input_fn=input_test)

    # Predictions come back in the order produced by `input_test`.
    pred_result = model.predict(input_fn=input_test)

    # Print the Root Mean Square Error (RMSE).
    print("\n" + 80 * "*")
    print("\nRMS error for the test set: {:.0f} Dias"
          .format(LT_NORM_FACTOR * eval_result["rmse"]))

    print(list(pred_result))
    print()


if __name__ == "__main__":
    # The Estimator periodically generates "INFO" logs; make these logs visible.
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.app.run(main=main)

Create keras callback to save model predictions and targets for each batch during training

I am building a simple Sequential model in Keras (tensorflow backend). During training I want to inspect the individual training batches and model predictions. Therefore, I am trying to create a custom Callback that saves the model predictions and targets for each training batch. However, the model is not using the current batch for prediction, but the entire training data.
How can I hand over only the current training batch to the Callback?
And how can I access the batches and targets that the Callback saves in self.predhis and self.targets?
My current version looks as follows:
callback_list = [prediction_history((self.x_train, self.y_train))]
self.model.fit(self.x_train, self.y_train, batch_size=self.batch_size, epochs=self.n_epochs, validation_data=(self.x_val, self.y_val), callbacks=callback_list)
class prediction_history(keras.callbacks.Callback):
    """Record model predictions and targets at the end of every batch.

    NOTE: predictions are computed on the complete ``train_data`` handed to
    the constructor, not on the batch that has just been processed.
    """

    def __init__(self, train_data):
        """Store the data to predict on.

        Args:
            train_data: an ``(x_train, y_train)`` pair.
        """
        # Let the base class initialise its own attributes first.
        super(prediction_history, self).__init__()
        self.train_data = train_data
        self.predhis = []  # one prediction array per finished batch
        self.targets = []  # the matching targets, appended alongside

    def on_batch_end(self, epoch, logs=None):
        """Predict on the stored data and record predictions and targets.

        Args:
            epoch: first positional argument supplied by Keras for this
                hook; unused here.
            logs: metric results supplied by Keras; unused here.
        """
        x_train, y_train = self.train_data
        self.targets.append(y_train)
        prediction = self.model.predict(x_train)
        self.predhis.append(prediction)
        tf.logging.info("Prediction shape: {}".format(prediction.shape))
        tf.logging.info("Targets shape: {}".format(y_train.shape))
NOTE: this answer is outdated and only works with TF1. Check @bers's answer for a solution tested on TF2.
After model compilation, the placeholder tensor for y_true is in model.targets and y_pred is in model.outputs.
To save the values of these placeholders at each batch, you can:
First copy the values of these tensors into variables.
Evaluate these variables in on_batch_end, and store the resulting arrays.
Now step 1 is a bit involved because you'll have to add a tf.assign op to the training function model.train_function. Using the current Keras API, this can be done by providing a fetches argument to K.function() when the training function is constructed.
In model._make_train_function(), there's a line:
self.train_function = K.function(inputs,
[self.total_loss] + self.metrics_tensors,
updates=updates,
name='train_function',
**self._function_kwargs)
The fetches argument containing the tf.assign ops can be provided via model._function_kwargs (only works after Keras 2.1.0).
As an example:
from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import Callback
from keras import backend as K
import tensorflow as tf
import numpy as np
class CollectOutputAndTarget(Callback):
    """Collect the per-batch values of two TF variables during training."""

    def __init__(self):
        super(CollectOutputAndTarget, self).__init__()
        # The shape of these 2 variables changes with the batch shape;
        # `validate_shape=False` lets them hold the (smaller) "last batch".
        self.var_y_true = tf.Variable(0., validate_shape=False)
        self.var_y_pred = tf.Variable(0., validate_shape=False)
        self.targets = []  # collected y_true batches
        self.outputs = []  # collected y_pred batches

    def on_batch_end(self, batch, logs=None):
        # Evaluate both variables, then store the resulting arrays.
        y_true_batch = K.eval(self.var_y_true)
        y_pred_batch = K.eval(self.var_y_pred)
        self.targets.append(y_true_batch)
        self.outputs.append(y_pred_batch)
# Build a simple model. It has to be compiled first so that model.targets
# and model.outputs are prepared.
model = Sequential([Dense(5, input_shape=(10,))])
model.compile(loss='mse', optimizer='adam')

# Create the callback, then one `tf.assign` op per variable it exposes.
cbk = CollectOutputAndTarget()
fetches = [
    tf.assign(variable, tensor, validate_shape=False)
    for variable, tensor in (
        (cbk.var_y_true, model.targets[0]),
        (cbk.var_y_pred, model.outputs[0]),
    )
]
# use `model._function_kwargs` if using `Model` instead of `Sequential`
model._function_kwargs = {'fetches': fetches}

# Fit the model on random data and check the results.
X = np.random.rand(10, 10)
Y = np.random.rand(10, 5)
model.fit(X, Y, batch_size=8, callbacks=[cbk])
Unless the number of samples can be divided by the batch size, the final batch will have a different size than other batches. So K.variable() and K.update() can't be used in this case. You'll have to use tf.Variable(..., validate_shape=False) and tf.assign(..., validate_shape=False) instead.
To verify the correctness of the saved arrays, you can add one line in training.py to print out the shuffled index array:
if shuffle == 'batch':
index_array = _batch_shuffle(index_array, batch_size)
elif shuffle:
np.random.shuffle(index_array)
print('Index array:', repr(index_array)) # Add this line
batches = _make_batches(num_train_samples, batch_size)
The shuffled index array should be printed out during fitting:
Epoch 1/1
Index array: array([8, 9, 3, 5, 4, 7, 1, 0, 6, 2])
10/10 [==============================] - 0s 23ms/step - loss: 0.5670
And you can check if cbk.targets is the same as Y[index_array]:
index_array = np.array([8, 9, 3, 5, 4, 7, 1, 0, 6, 2])
print(Y[index_array])
[[ 0.75325592 0.64857277 0.1926653 0.7642865 0.38901153]
[ 0.77567689 0.13573623 0.4902501 0.42897559 0.55825652]
[ 0.33760938 0.68195038 0.12303088 0.83509441 0.20991668]
[ 0.98367778 0.61325065 0.28973401 0.28734073 0.93399794]
[ 0.26097574 0.88219054 0.87951941 0.64887846 0.41996446]
[ 0.97794604 0.91307569 0.93816428 0.2125808 0.94381495]
[ 0.74813435 0.08036688 0.38094272 0.83178364 0.16713736]
[ 0.52609421 0.39218962 0.21022047 0.58569125 0.08012982]
[ 0.61276627 0.20679494 0.24124858 0.01262245 0.0994412 ]
[ 0.6026137 0.25620512 0.7398164 0.52558182 0.09955769]]
print(cbk.targets)
[array([[ 0.7532559 , 0.64857274, 0.19266529, 0.76428652, 0.38901153],
[ 0.77567691, 0.13573623, 0.49025011, 0.42897558, 0.55825651],
[ 0.33760938, 0.68195039, 0.12303089, 0.83509439, 0.20991668],
[ 0.9836778 , 0.61325067, 0.28973401, 0.28734073, 0.93399793],
[ 0.26097575, 0.88219053, 0.8795194 , 0.64887846, 0.41996446],
[ 0.97794604, 0.91307569, 0.93816429, 0.2125808 , 0.94381493],
[ 0.74813437, 0.08036689, 0.38094273, 0.83178365, 0.16713737],
[ 0.5260942 , 0.39218962, 0.21022047, 0.58569127, 0.08012982]], dtype=float32),
array([[ 0.61276627, 0.20679495, 0.24124858, 0.01262245, 0.0994412 ],
[ 0.60261369, 0.25620511, 0.73981643, 0.52558184, 0.09955769]], dtype=float32)]
As you can see, there are two batches in cbk.targets (one "full batch" of size 8 and the final batch of size 2), and the row order is the same as Y[index_array].
Long edit (almost a new answer) for the following reasons:
Yu-Yang's 2017 answer relies on the private _make_train_function and _function_kwargs APIs, which work only in TF1 (and maybe in TF1 compatibility, so-called non-eager mode).
Similarly, Binyan Hu's 2020 answer relies on _make_test_function and does not work in TF2 by default (requiring non-eager mode as well).
My own Jan 2020 answer, which was already subject to several required configuration settings, seems to have stopped working with (or before) TF 2.5, and I was not able to make model.inputs or model.outputs work any longer.
Finally, the earlier version of this answer requires potentially expensive model evaluation to obtain the predictions for each batch. A similar solution to obtain activation histograms even led to OOM issues with repeated training of different models.
So I set out to find a way to obtain all possible quantities (inputs, targets, predictions, activations), batch-wise, without using any private APIs. The aim was to be able to call .numpy() on the intended quantities, so Keras callbacks can run ordinary Python code to ease debugging (I suppose that is what this question is mainly about - for maximum performance, one would probably try to integrate as many computations as possible into TensorFlow's graph operations anyway).
This is the common base model for all solutions:
"""Demonstrate batch data access."""
import tensorflow as tf
from tensorflow import keras
class DataCallback(keras.callbacks.Callback):
    """This class is where all implementations differ.

    Placeholder with no behavior of its own: it only subclasses
    ``keras.callbacks.Callback`` so that the surrounding base script is
    complete.
    """
def tf_nan(dtype):
    """Return a NaN-initialized tf.Variable of ``dtype`` whose shape is left unspecified, for later assign()."""
    unspecified_shape = tf.TensorShape(None)
    return tf.Variable(float("nan"), dtype=dtype, shape=unspecified_shape)
def main():
    """Fit a one-unit dense model on small synthetic data with the callback attached, then save it."""
    # 7 training rows and 11 validation rows with 2 features each.
    x_train = tf.transpose(tf.range(7.0) + [[0.2], [0.4]])
    y_train = tf.transpose(tf.range(7.0) + 10 + [[0.5]])
    x_val = tf.transpose(tf.range(11.0) + 30 + [[0.6], [0.7]])
    y_val = tf.transpose(tf.range(11.0) + 40 + [[0.9]])

    model = keras.Sequential([keras.layers.Dense(1, input_shape=(2,))])
    callback = DataCallback()
    model.compile(loss="mse", optimizer="adam")
    model.fit(
        x=x_train,
        y=y_train,
        validation_data=(x_val, y_val),
        shuffle=False,
        batch_size=3,
        epochs=2,
        verbose=0,
        callbacks=[callback],
    )
    model.save("tmp.tf")


if __name__ == "__main__":
    main()
The following three snippets show one possible solution each, each with their own pros and cons. The core trick is always the same: allocate a tf.Variable and use tf.Variable.assign to export the intended quantity, from some Keras code run in graph mode, into the callback. The methods differ slightly in callback initialization and (in one case) model compilation, and most importantly, in the quantities they can access, which is why I summarize them above each snippet.
Custom metric
Using a custom (fake) metric (similar to my Jan 2020 answer), while we cannot seem to access model.inputs nor model.outputs any more (and model.(_)targets does not even exist any longer), we can access y_true and y_pred, which represent the model targets and outputs:
[ ] Inputs/Samples (x)
[ ] Weights (w)
[+] Targets/Labels (y_true)
[+] Outputs/Predictions (y_pred)
[ ] All layers (or only final input/output layers)
"""Demonstrate batch data access using a custom metric."""
import tensorflow as tf
from tensorflow import keras
class DataCallback(keras.callbacks.Callback):  # diff
    """Callback to operate on batch data from metric.

    ``metric`` is meant to be passed to ``model.compile(metrics=[...])``; it
    copies the tensors it receives into tf.Variables that the batch-end
    hook can read with ``.numpy()``.
    """

    def __init__(self):
        """Offer a metric to access batch data."""
        super().__init__()
        self.y_true = None
        self.y_pred = None

    def set_model(self, model):
        """Initialize variables when model is set."""
        # Keep the base-class behavior (it records the model on the
        # callback) in addition to allocating the variables.
        super().set_model(model)
        self.y_true = tf_nan(model.output.dtype)
        self.y_pred = tf_nan(model.output.dtype)

    def metric(self, y_true, y_pred):
        """Fake metric: store both arguments and always report 0."""
        self.y_true.assign(y_true)
        self.y_pred.assign(y_pred)
        return 0

    def on_train_batch_end(self, _batch, _logs=None):
        """See keras.callbacks.Callback.on_train_batch_end."""
        print("y_true =", self.y_true.numpy())
        print("y_pred =", self.y_pred.numpy())

    def on_train_end(self, _logs=None):
        """Clean up."""
        del self.y_true, self.y_pred
def tf_nan(dtype):
    """Return a NaN-initialized tf.Variable of ``dtype`` whose shape is left unspecified, for later assign()."""
    unspecified_shape = tf.TensorShape(None)
    return tf.Variable(float("nan"), dtype=dtype, shape=unspecified_shape)
def main():
    """Fit a one-unit dense model on small synthetic data, with the callback's metric compiled in, then save it."""
    # 7 training rows and 11 validation rows with 2 features each.
    x_train = tf.transpose(tf.range(7.0) + [[0.2], [0.4]])
    y_train = tf.transpose(tf.range(7.0) + 10 + [[0.5]])
    x_val = tf.transpose(tf.range(11.0) + 30 + [[0.6], [0.7]])
    y_val = tf.transpose(tf.range(11.0) + 40 + [[0.9]])

    model = keras.Sequential([keras.layers.Dense(1, input_shape=(2,))])
    callback = DataCallback()
    model.compile(loss="mse", optimizer="adam", metrics=[callback.metric])  # diff
    model.fit(
        x=x_train,
        y=y_train,
        validation_data=(x_val, y_val),
        shuffle=False,
        batch_size=3,
        epochs=2,
        verbose=0,
        callbacks=[callback],
    )
    model.save("tmp.tf")


if __name__ == "__main__":
    main()
Custom training step
A custom training step is what I used in an earlier version of this answer. The idea still works in principle, but y_pred can be expensive and it might make sense to use a custom metric (see above) if that is required.
[+] Inputs/Samples (x)
[+] Weights (w)
[+] Targets/Labels (y_true)
[~] Outputs/Predictions (y_pred) [expensive!]
[ ] All layers (or only final input/output layers)
"""Demonstrate batch data access using a custom training step."""
import tensorflow as tf
from tensorflow import keras
class DataCallback(keras.callbacks.Callback):  # diff
    """Callback to operate on batch data from training step.

    ``set_model`` replaces ``model.train_step`` with a wrapper that copies
    the batch inputs, sample weights, targets and a fresh prediction into
    tf.Variables that the batch-end hook can read with ``.numpy()``.
    """

    def __init__(self):
        """Initialize tf.Variables."""
        super().__init__()
        self.x = None
        self.w = None
        self.y_true = None
        self.y_pred = None

    def set_model(self, model):
        """Wrap the model.train_step function to access training batch data."""
        # Keep the base-class behavior (it records the model on the
        # callback) in addition to installing the wrapper.
        super().set_model(model)
        self.x = tf_nan(model.input.dtype)
        # pylint:disable=protected-access (replace by proper dtype if you know it)
        if model.compiled_loss._user_loss_weights is not None:
            self.w = tf_nan(model.compiled_loss._user_loss_weights.dtype)
        self.y_true = tf_nan(model.output.dtype)
        self.y_pred = tf_nan(model.output.dtype)

        model_train_step = model.train_step

        def outer_train_step(data):
            # https://github.com/keras-team/keras/blob/v2.7.0/keras/engine/training.py
            x, y_true, w = keras.utils.unpack_x_y_sample_weight(data)

            self.x.assign(x)
            if w is not None:
                self.w.assign(w)
            self.y_true.assign(y_true)

            result = model_train_step(data)

            # Extra forward pass, run after the wrapped training step.
            y_pred = model(x)
            self.y_pred.assign(y_pred)

            return result

        model.train_step = outer_train_step

    def on_train_batch_end(self, _batch, _logs=None):
        """See keras.callbacks.Callback.on_train_batch_end."""
        print("x =", self.x.numpy())
        if self.w is not None:
            print("w =", self.w.numpy())
        print("y_true =", self.y_true.numpy())
        print("y_pred =", self.y_pred.numpy())

    def on_train_end(self, _logs=None):
        """Clean up."""
        del self.x, self.w, self.y_true, self.y_pred
def tf_nan(dtype):
    """Return a NaN-initialized tf.Variable of ``dtype`` whose shape is left unspecified, for later assign()."""
    unspecified_shape = tf.TensorShape(None)
    return tf.Variable(float("nan"), dtype=dtype, shape=unspecified_shape)
def main():
    """Fit a one-unit dense model on small synthetic data with the callback attached, then save it."""
    # 7 training rows and 11 validation rows with 2 features each.
    x_train = tf.transpose(tf.range(7.0) + [[0.2], [0.4]])
    y_train = tf.transpose(tf.range(7.0) + 10 + [[0.5]])
    x_val = tf.transpose(tf.range(11.0) + 30 + [[0.6], [0.7]])
    y_val = tf.transpose(tf.range(11.0) + 40 + [[0.9]])

    model = keras.Sequential([keras.layers.Dense(1, input_shape=(2,))])
    callback = DataCallback()
    model.compile(loss="mse", optimizer="adam")
    model.fit(
        x=x_train,
        y=y_train,
        validation_data=(x_val, y_val),
        shuffle=False,
        batch_size=3,
        epochs=2,
        verbose=0,
        callbacks=[callback],
    )
    model.save("tmp.tf")


if __name__ == "__main__":
    main()
Custom layer call
A custom layer call is a super-flexible way of accessing each layer's inputs and outputs. The callback handles patching of the call functions for a list of layers. While we cannot access weights and targets (as these quantities do not make sense at the level of individual layers), it allows us to access individual layer activations, which can be handy for questions such as How does one log activations using `tf.keras.callbacks.TensorBoard`?.
[+] Inputs/Samples (x)
[ ] Weights (w)
[ ] Targets/Labels (y_true)
[+] Outputs/Predictions (y_pred)
[+] All layers (or only final input/output layers)
"""Demonstrate batch data access using custom layer calls."""
import tensorflow as tf
from tensorflow import keras
class DataCallback(keras.callbacks.Callback):
    """Callback to operate on batch data from selected (to be wrapped) layers.

    For every wrapped layer, a recording stand-in for ``layer.call`` is
    installed when a training batch begins and the original ``call`` is put
    back when the batch ends, after which the captured values are printed.
    """

    def __init__(self, layers):
        """Prepare storage and a recording call for each layer in *layers*."""
        super().__init__()
        self.data = {}
        self.inner_calls = {}
        self.outer_calls = {}
        for layer in layers:
            # Variables that will hold the most recent batch seen by the layer.
            self.data[layer] = {
                "inputs": tf_nan(layer.input.dtype),
                "outputs": tf_nan(layer.output.dtype),
            }
            original_call = layer.call
            self.inner_calls[layer] = original_call
            self.outer_calls[layer] = self._recording_call(layer, original_call)

    def _recording_call(self, layer, layer_call):
        """Return a replacement for *layer_call* that stores its input and output."""

        def outer_call(inputs):
            self.data[layer]["inputs"].assign(inputs)
            outputs = layer_call(inputs)
            self.data[layer]["outputs"].assign(outputs)
            return outputs

        return outer_call

    def on_train_batch_begin(self, _epoch, _logs=None):
        """Wrap layer calls during each batch."""
        for layer in self.outer_calls:
            layer.call = self.outer_calls[layer]

    def on_train_batch_end(self, _epoch, _logs=None):
        """Restore original layer calls for ModelCheckpoint, model.save, ...

        Once the original calls are back in place, the inputs and outputs
        recorded for every wrapped layer are printed.
        """
        for layer in self.inner_calls:
            layer.call = self.inner_calls[layer]
        for layer in self.data:
            captured = self.data[layer]
            print("Layer =", layer)
            print("Inputs =", captured["inputs"].numpy())
            print("Outputs =", captured["outputs"].numpy())
def tf_nan(dtype):
    """Build a placeholder variable for batch data of the given *dtype*.

    The variable starts out as NaN and is created with
    ``shape=tf.TensorShape(None)``, i.e. without a fixed shape, so that it
    can be the target of ``assign()`` later on.
    """
    placeholder = tf.Variable(
        float("nan"),
        dtype=dtype,
        shape=tf.TensorShape(None),
    )
    return placeholder
def main():
    """Run main.

    Fits a single-unit dense model for two epochs while a ``DataCallback``
    wraps all of the model's layers, then saves the model to ``tmp.tf``.
    """
    model = keras.Sequential([keras.layers.Dense(1, input_shape=(2,))])
    # This variant of the callback is given the layers it should wrap.
    callback = DataCallback(model.layers)
    model.compile(loss="mse", optimizer="adam")

    # Deterministic toy data: 7 training rows and 11 validation rows, each
    # with two input columns and one target column.
    training_inputs = tf.transpose(tf.range(7.0) + [[0.2], [0.4]])
    training_targets = tf.transpose(tf.range(7.0) + 10 + [[0.5]])
    validation_pair = (
        tf.transpose(tf.range(11.0) + 30 + [[0.6], [0.7]]),
        tf.transpose(tf.range(11.0) + 40 + [[0.9]]),
    )

    model.fit(
        x=training_inputs,
        y=training_targets,
        validation_data=validation_pair,
        shuffle=False,
        batch_size=3,
        epochs=2,
        verbose=0,
        callbacks=[callback],
    )
    model.save("tmp.tf")
if __name__ == "__main__":
main()
When to use which and open to-dos
I think the snippets above each solution nicely summarize what each approach is capable of. Generally,
a custom training step will be ideal to access the model input, such as batched dataset generators, effects of shuffling, etc;
a custom layer call is ideal to access the in-betweens of the model; and
a custom metric is ideal to access the outputs of the model.
I am fairly certain (but have not tried) that one can combine all approaches to be able to access all batch quantities simultaneously. I have not tested anything but training mode - each method can have further pros and cons relating to their usefulness in testing or prediction mode. Finally, I assume, but have not tested either, that there should be only minor differences between tf.keras and keras. Having tested this code on TF2.8.rc1 and Keras 2.8.0, which has moved the tf.keras code back into the keras pip package, and not using any private APIs, I believe this assumption is justified.
It would be great if this approach could be extended to access model.inputs and model.outputs again. Currently, I am getting errors such as this one:
TypeError: You are passing KerasTensor(...), an intermediate Keras symbolic input/output, to a TF API that does not allow registering custom dispatchers, such as tf.cond, tf.function, gradient tapes, or tf.map_fn. Keras Functional model construction only supports TF API calls that do support dispatching, such as tf.math.add or tf.reshape. Other APIs cannot be called directly on symbolic Kerasinputs/outputs. You can work around this limitation by putting the operation in a custom Keras layer call and calling that layer on this symbolic input/output.
Previous answer
From TF 2.2 on, you can use custom training steps rather than callbacks to achieve what you want. Here's a demo that works with tensorflow==2.2.0rc1, using inheritance to improve the keras.Sequential model. Performance-wise, this is not ideal as predictions are made twice, once in self(x, training=True) and once in super().train_step(data). But you get the idea.
This works in eager mode and does not use private APIs, so it should be pretty stable. One caveat is that you have to use tf.keras (standalone keras does not support Model.train_step), but I feel standalone keras is becoming more and more deprecated anyway. (In fact, tf.keras migrates to keras in TF2.8.)
"""Demonstrate access to Keras batch tensors in a tf.keras custom training step."""
import numpy as np
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.python.keras.engine import data_adapter
in_shape = (2,)
out_shape = (1,)
batch_size = 3
n_samples = 7
class SequentialWithPrint(keras.Sequential):
    """Sequential model whose training step prints the tensors of each batch."""

    def train_step(self, original_data):
        """Print sample weights, inputs, targets and predictions, then train.

        The prediction is computed here only for printing; the actual update
        is delegated to the parent ``train_step``, which receives the
        untouched *original_data* and whose result is returned.
        """
        # Basically copied one-to-one from https://git.io/JvDTv
        expanded = data_adapter.expand_1d(original_data)
        x, y_true, w = data_adapter.unpack_x_y_sample_weight(expanded)
        y_pred = self(x, training=True)

        # this is pretty much like on_train_batch_begin
        printed = (
            (w, "Sample weight (w) ="),
            (x, "Batch input (x) ="),
            (y_true, "Batch output (y_true) ="),
            (y_pred, "Prediction (y_pred) ="),
        )
        for tensor, message in printed:
            K.print_tensor(tensor, message)

        # add anything after this call for on_train_batch_end-like behavior
        return super().train_step(original_data)
# Model
model = SequentialWithPrint([keras.layers.Dense(out_shape[0], input_shape=in_shape)])
model.compile(loss="mse", optimizer="adam")
# Example data
X = np.random.rand(n_samples, *in_shape)
Y = np.random.rand(n_samples, *out_shape)
model.fit(X, Y, batch_size=batch_size)
print("X: ", X)
print("Y: ", Y)
Finally, here is a simpler example without inheritance:
"""Demonstrate access to Keras batch tensors in a tf.keras custom training step."""
import tensorflow as tf
IN_SHAPE = (2,)
OUT_SHAPE = (1,)
BATCH_SIZE = 3
N_SAMPLES = 7
def make_print_data_and_train_step(keras_model):
    """Return a train_step function that prints data batches.

    The returned function prints the batch's sample weights, inputs, targets
    and a fresh prediction of *keras_model*, then calls the ``train_step``
    the model had when this factory was invoked and returns its result.
    """
    wrapped_step = keras_model.train_step

    def print_data_and_train_step(data):
        # Adapted from https://git.io/JvDTv, skipping data_adapter.expand_1d
        x, y_true, w = tf.keras.utils.unpack_x_y_sample_weight(data)
        y_pred = keras_model(x, training=True)

        # this is pretty much like on_train_batch_begin
        print_tensor = tf.keras.backend.print_tensor
        print_tensor(w, "Sample weight (w) =")
        print_tensor(x, "Batch input (x) =")
        print_tensor(y_true, "Batch output (y_true) =")
        print_tensor(y_pred, "Prediction (y_pred) =")

        # add anything after this call for on_train_batch_end-like behavior
        return wrapped_step(data)

    return print_data_and_train_step
# Model
model = tf.keras.Sequential([tf.keras.layers.Dense(OUT_SHAPE[0], input_shape=IN_SHAPE)])
model.train_step = make_print_data_and_train_step(model)
model.compile(loss="mse", optimizer="adam")
# Example data
X = tf.random.normal((N_SAMPLES, *IN_SHAPE))
Y = tf.random.normal((N_SAMPLES, *OUT_SHAPE))
model.fit(X, Y, batch_size=BATCH_SIZE)
print("X: ", X)
print("Y: ", Y)
Update: This approach has stopped working. See my other answer for a number of solutions compatible with TF2.8 (and hopefully beyond).
One problem with @Yu-Yang's solution is that it relies on model._function_kwargs, which is not guaranteed to work as it is not part of the API. In particular, in TF2 with eager execution, session kwargs seem to be either not accepted at all or run preemptively due to eager mode.
Therefore, here is my solution tested on tensorflow==2.1.0. The trick is to replace fetches by a Keras metric, in which the assignment operations from fetches are made during training.
This even enables a Keras-only solution if the batch size divides the number of samples; otherwise, another trick has to be applied when initializing TensorFlow variables with a None shape, similar to validate_shape=False in earlier solutions (compare https://github.com/tensorflow/tensorflow/issues/35667).
Importantly, tf.keras behaves differently from keras (sometimes just ignoring assignments, or seeing variables as Keras symbolic tensors), so this updated solution takes care of both implementations (Keras==2.3.1 and tensorflow==2.1.0).
"""Demonstrate access to Keras symbolic tensors in a (tf.)keras.Callback."""
import numpy as np
import tensorflow as tf
use_tf_keras = True
if use_tf_keras:
from tensorflow import keras
from tensorflow.keras import backend as K
tf.config.experimental_run_functions_eagerly(False)
compile_kwargs = {"run_eagerly": False, "experimental_run_tf_function": False}
else:
import keras
from keras import backend as K
compile_kwargs = {}
in_shape = (2,)
out_shape = (1,)
batch_size = 3
n_samples = 7
class CollectKerasSymbolicTensorsCallback(keras.callbacks.Callback):
    """Collect Keras symbolic tensors.

    The variables ``input``, ``target`` and ``output`` are evaluated after
    every batch and the values appended to ``inputs``, ``targets`` and
    ``outputs``; everything collected is printed when training ends.
    """

    def __init__(self):
        """Initialize intermediate variables for batches and lists."""
        super().__init__()
        # Collect batches here
        self.inputs, self.targets, self.outputs = [], [], []

        # # For a pure Keras solution, we need to know the shapes beforehand;
        # # in particular, batch_size must divide n_samples:
        # self.input = K.variable(np.empty((batch_size, *in_shape)))
        # self.target = K.variable(np.empty((batch_size, *out_shape)))
        # self.output = K.variable(np.empty((batch_size, *out_shape)))

        # If the shape of these variables will change (e.g., last batch), initialize
        # arbitrarily and specify `shape=tf.TensorShape(None)`:
        self.input, self.target, self.output = (
            tf.Variable(0.0, shape=tf.TensorShape(None)) for _ in range(3)
        )

    def on_batch_end(self, batch, logs=None):
        """Evaluate the variables and save them into lists."""
        pairs = (
            (self.inputs, self.input),
            (self.targets, self.target),
            (self.outputs, self.output),
        )
        for collected, variable in pairs:
            collected.append(K.eval(variable))

    def on_train_end(self, logs=None):
        """Print all variables."""
        report = (
            ("Inputs: ", self.inputs),
            ("Targets: ", self.targets),
            ("Outputs: ", self.outputs),
        )
        for label, collected in report:
            print(label, *collected)
#tf.function
def assign_keras_symbolic_tensors_metric(_foo, _bar):
    """
    Return the assignment operations as a metric to have them evaluated by Keras.

    This replaces `fetches` from the TF1/non-eager-execution solution. Both
    arguments are ignored; the function copies the first input, target and
    output tensor of the module-level ``model`` into the variables of the
    module-level ``callback`` and always returns 0.
    """
    # tf.keras keeps the targets in a private attribute, standalone Keras in a
    # public one.
    target_tensors = model._targets if use_tf_keras else model.targets
    sources = (model.inputs[0], target_tensors[0], model.outputs[0])
    destinations = (callback.input, callback.target, callback.output)
    for dest, src in zip(destinations, sources):
        dest.assign(src)
    return 0
callback = CollectKerasSymbolicTensorsCallback()
metrics = [assign_keras_symbolic_tensors_metric]
# Example model
model = keras.Sequential([keras.layers.Dense(out_shape[0], input_shape=in_shape)])
model.compile(loss="mse", optimizer="adam", metrics=metrics, **compile_kwargs)
# Example data
X = np.random.rand(n_samples, *in_shape)
Y = np.random.rand(n_samples, *out_shape)
model.fit(X, Y, batch_size=batch_size, callbacks=[callback])
print("X: ", X)
print("Y: ", Y)
Inspired by the way tf.keras.callbacks.TensorBoard saves v1 (graph) summaries.
No variable assignments and no redundant metrics.
For use with tensorflow>=2.0.0, graph (disable eager) mode during evaluating.
Extensive operations on the numpy predictions can be implemented by overriding SavePrediction._pred_callback.
import numpy as np
import tensorflow as tf
from tensorflow import keras
tf.compat.v1.disable_eager_execution()
in_shape = (2,)
out_shape = (1,)
batch_size = 2
n_samples = 32
class SavePrediction(keras.callbacks.Callback):
    """Collect the model's predictions batch by batch during evaluation.

    The model's first output tensor is added to the fetches of the model's
    test function while a test run is in progress; each fetched value is
    passed to ``_pred_callback``, which can be overridden for custom handling.
    """

    def __init__(self):
        """Start with no prediction tensor selected and nothing collected."""
        super().__init__()
        self._get_pred = None
        self.preds = []

    def _pred_callback(self, preds):
        """Store one batch of fetched predictions."""
        self.preds.append(preds)

    def set_model(self, model):
        """Remember the model's first output the first time a model is set."""
        super().set_model(model)
        if self._get_pred is not None:
            return
        self._get_pred = self.model.outputs[0]

    def on_test_begin(self, logs):
        """Register the prediction tensor as an extra fetch of the test function."""
        self.model._make_test_function()  # pylint: disable=protected-access
        test_function = self.model.test_function
        if self._get_pred not in test_function.fetches:
            test_function.fetches.append(self._get_pred)
            test_function.fetch_callbacks[self._get_pred] = self._pred_callback

    def on_test_end(self, logs):
        """Unregister the extra fetch again and print the collected predictions."""
        test_function = self.model.test_function
        if self._get_pred in test_function.fetches:
            test_function.fetches.remove(self._get_pred)
        if self._get_pred in test_function.fetch_callbacks:
            test_function.fetch_callbacks.pop(self._get_pred)
        print(self.preds)
model = keras.Sequential([
keras.layers.Dense(out_shape[0], input_shape=in_shape)
])
model.compile(loss="mse", optimizer="adam")
X = np.random.rand(n_samples, *in_shape)
Y = np.random.rand(n_samples, *out_shape)
model.evaluate(X, Y,
batch_size=batch_size,
callbacks=[SavePrediction()])

Tensor Flow Estimator Template based save and restore of models

I took the neural networks simple example from tensorflow github and have tried to split it into two parts. The first part is training+test, and the second part is separating out the test part which requires a restore. The restore seems to work, but it cannot find the predict function.
Here is the first part:
from __future__ import print_function
from tensorflow.python.saved_model import builder as saved_model_builder
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=False)
import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import shutil
matplotlib.use('TkAgg')
# Parameters
learning_rate = 0.1
num_steps = 1000
batch_size = 128
display_step = 100
# Network Parameters
n_hidden_1 = 256 # 1st layer number of neurons
n_hidden_2 = 256 # 2nd layer number of neurons
num_input = 784 # MNIST data input (img shape: 28*28)
num_classes = 10 # MNIST total classes (0-9 digits)
#init = tf.initialize_all_variables()
sess = tf.Session()
# Define the input function for training
input_fn = tf.estimator.inputs.numpy_input_fn(
x={'images': mnist.train.images}, y=mnist.train.labels,
batch_size=batch_size, num_epochs=None, shuffle=True)
# Define the neural network
def neural_net(x_dict):
    """Build the three stacked dense layers and return the output-layer tensor.

    *x_dict* is the Estimator feature dict; only its ``'images'`` entry is
    used. The layers are named ``layer_1``, ``layer_2`` and ``out_layer`` and
    are created without passing an activation argument.
    """
    # (units, variable-scope name) for each fully connected layer, in order.
    layer_specs = (
        (n_hidden_1, "layer_1"),
        (n_hidden_2, "layer_2"),
        (num_classes, "out_layer"),
    )
    tensor = x_dict['images']
    for units, name in layer_specs:
        tensor = tf.layers.dense(tensor, units, name=name)
    return tensor
# Define the model function (following TF Estimator Template)
def model_fn(features, labels, mode):
    """Model function following the TF Estimator template.

    Args:
        features: feature dict handed to ``neural_net``.
        labels: class labels; not used in prediction mode.
        mode: one of the ``tf.estimator.ModeKeys`` values.

    Returns:
        A ``tf.estimator.EstimatorSpec``. In PREDICT mode it only carries the
        predicted classes; otherwise it also carries the loss, the training
        op and an accuracy metric.
    """
    logits = neural_net(features)
    # Index of the largest logit per example.
    pred_classes = tf.argmax(logits, axis=1)

    # Prediction needs neither loss nor optimizer, so return early.
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions=pred_classes)

    # Mean sparse softmax cross-entropy; the labels are cast to int32 first.
    loss_op = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=tf.cast(labels, dtype=tf.int32)))
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(
        loss_op, global_step=tf.train.get_global_step())

    acc_op = tf.metrics.accuracy(labels=labels, predictions=pred_classes)

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=pred_classes,
        loss=loss_op,
        train_op=train_op,
        eval_metric_ops={'accuracy': acc_op})
# Build the Estimator
model = tf.estimator.Estimator(model_fn)
# Train the Model
model.train(input_fn, steps=num_steps)
# Evaluate the Model
# Define the input function for evaluating
input_fn = tf.estimator.inputs.numpy_input_fn(
x={'images': mnist.test.images}, y=mnist.test.labels,
batch_size=batch_size, shuffle=False)
# Use the Estimator 'evaluate' method
model.evaluate(input_fn)
#model.export_savedmodel(".", input_fn)
init = tf.global_variables_initializer()
sess.run(init)
tf.add_to_collection("nn_model", model)
# Add ops to save and restore all the variables.
#saver = tf.train.Saver()
#save_path = saver.save(sess, "model/model.ckpt")
# Remove the output of a previous run before exporting to "model" again.
# A missing directory is fine, so removal errors are ignored explicitly
# instead of swallowing every exception with a bare ``except``.
shutil.rmtree("model", ignore_errors=True)
builder = saved_model_builder.SavedModelBuilder("model")
builder.add_meta_graph_and_variables(sess, ["nn"])
builder.save()
print("Model saved in file")
# Predict single images
n_images = 4
# Get images from test set
test_images = mnist.test.images[:n_images]
# Prepare the input data
input_fn = tf.estimator.inputs.numpy_input_fn(
x={'images': test_images}, shuffle=False)
# Use the model to predict the images class
preds = list(model.predict(input_fn))
# Display
for i in range(n_images):
plt.imshow(np.reshape(test_images[i], [28, 28]), cmap='gray')
plt.show()
print("Model prediction:", preds[i])
The above program works fine. It appears to save the model (I am not sure whether it does so correctly), as I can see all the directories being created. It does, however, give one warning:
WARNING:tensorflow:Error encountered when serializing nn_model.
Type is unsupported, or the types of the items don't match field type in CollectionDef.
'Estimator' object has no attribute 'name'
Here is the "apply" program that restores and tries to apply and fails at the predict() line:
import tensorflow as tf
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=False)
sess=tf.Session()
#First let's load meta graph and restore weights
#saver = tf.train.import_meta_graph('model/model.ckpt.meta')
#saver.restore(sess,tf.train.latest_checkpoint('nn_model'))
tf.saved_model.loader.load(sess, ["nn"], "model")
model = tf.get_collection('nn_model')
# Predict single images
n_images = 4
# Get images from test set
test_images = mnist.test.images[:n_images]
# Prepare the input data
input_fn = tf.estimator.inputs.numpy_input_fn(
x={'images': test_images}, shuffle=False)
# Use the model to predict the images class
preds = list(model.predict(input_fn))
# Display
for i in range(n_images):
plt.imshow(np.reshape(test_images[i], [28, 28]), cmap='gray')
plt.show()
print("Model prediction:", preds[i])
The error it gives is:
Traceback (most recent call last):
File "applynn.py", line 35, in
preds = list(model.predict(input_fn))
AttributeError: 'module' object has no attribute 'predict'
So what is missing here?
So this problem is now fixed. Here is what I had to do to fix this.
The first part is:
from __future__ import print_function
from tensorflow.python.saved_model import builder as saved_model_builder
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=False)
import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import shutil
matplotlib.use('TkAgg')
# Parameters
learning_rate = 0.1
num_steps = 1000
batch_size = 128
display_step = 100
# Network Parameters
n_hidden_1 = 256 # 1st layer number of neurons
n_hidden_2 = 256 # 2nd layer number of neurons
num_input = 784 # MNIST data input (img shape: 28*28)
num_classes = 10 # MNIST total classes (0-9 digits)
#init = tf.initialize_all_variables()
sess = tf.Session()
# Define the input function for training
input_fn = tf.estimator.inputs.numpy_input_fn(
x={'images': mnist.train.images}, y=mnist.train.labels,
batch_size=batch_size, num_epochs=None, shuffle=True)
# Define the neural network
def neural_net(x_dict):
    """Build the three stacked dense layers and return the output-layer tensor.

    *x_dict* is the Estimator feature dict; only its ``'images'`` entry is
    used. The layers are named ``layer_1``, ``layer_2`` and ``out_layer`` and
    are created without passing an activation argument.
    """
    # (units, variable-scope name) for each fully connected layer, in order.
    layer_specs = (
        (n_hidden_1, "layer_1"),
        (n_hidden_2, "layer_2"),
        (num_classes, "out_layer"),
    )
    tensor = x_dict['images']
    for units, name in layer_specs:
        tensor = tf.layers.dense(tensor, units, name=name)
    return tensor
# Define the model function (following TF Estimator Template)
def model_fn(features, labels, mode):
    """Model function following the TF Estimator template.

    Args:
        features: feature dict handed to ``neural_net``.
        labels: class labels; not used in prediction mode.
        mode: one of the ``tf.estimator.ModeKeys`` values.

    Returns:
        A ``tf.estimator.EstimatorSpec``. In PREDICT mode it only carries the
        predicted classes; otherwise it also carries the loss, the training
        op and an accuracy metric.
    """
    logits = neural_net(features)
    # Index of the largest logit per example.
    pred_classes = tf.argmax(logits, axis=1)

    # Prediction needs neither loss nor optimizer, so return early.
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions=pred_classes)

    # Mean sparse softmax cross-entropy; the labels are cast to int32 first.
    loss_op = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=tf.cast(labels, dtype=tf.int32)))
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(
        loss_op, global_step=tf.train.get_global_step())

    acc_op = tf.metrics.accuracy(labels=labels, predictions=pred_classes)

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=pred_classes,
        loss=loss_op,
        train_op=train_op,
        eval_metric_ops={'accuracy': acc_op})
# Build the Estimator
estimator = tf.estimator.Estimator(model_fn, model_dir='estimator')
# Train the Model
estimator.train(input_fn, steps=num_steps)
# Evaluate the Model
# Define the input function for evaluating
input_fn = tf.estimator.inputs.numpy_input_fn(
x={'images': mnist.test.images}, y=mnist.test.labels,
batch_size=batch_size, shuffle=False)
# Use the Estimator 'evaluate' method
estimator.evaluate(input_fn)
#model.export_savedmodel(".", input_fn)
init = tf.global_variables_initializer()
sess.run(init)
tf.add_to_collection("nn_model", estimator)
# Add ops to save and restore all the variables.
#saver = tf.train.Saver()
#save_path = saver.save(sess, "model/model.ckpt")
# Remove the output of a previous run before exporting to "model" again.
# A missing directory is fine, so removal errors are ignored explicitly
# instead of swallowing every exception with a bare ``except``.
shutil.rmtree("model", ignore_errors=True)
builder = saved_model_builder.SavedModelBuilder("model")
builder.add_meta_graph_and_variables(sess, ["nn"])
builder.save()
print("Model saved in file")
# Predict single images
n_images = 4
# Get images from test set
test_images = mnist.test.images[:n_images]
# Prepare the input data
input_fn = tf.estimator.inputs.numpy_input_fn(
x={'images': test_images}, shuffle=False)
# Use the model to predict the images class
preds = list(estimator.predict(input_fn))
# Display
for i in range(n_images):
plt.imshow(np.reshape(test_images[i], [28, 28]), cmap='gray')
plt.show()
print("Model prediction:", preds[i])
The second part is:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=False)
# Network Parameters
n_hidden_1 = 256 # 1st layer number of neurons
n_hidden_2 = 256 # 2nd layer number of neurons
num_classes = 10 # MNIST total classes (0-9 digits)
# Define the neural network
def neural_net(x_dict):
    """Build the three stacked dense layers and return the output-layer tensor.

    *x_dict* is the Estimator feature dict; only its ``'images'`` entry is
    used. The layers are named ``layer_1``, ``layer_2`` and ``out_layer`` and
    are created without passing an activation argument.
    """
    # (units, variable-scope name) for each fully connected layer, in order.
    layer_specs = (
        (n_hidden_1, "layer_1"),
        (n_hidden_2, "layer_2"),
        (num_classes, "out_layer"),
    )
    tensor = x_dict['images']
    for units, name in layer_specs:
        tensor = tf.layers.dense(tensor, units, name=name)
    return tensor
# Define the model function (following TF Estimator Template)
def model_fn(features, labels, mode):
    """Model function following the TF Estimator template.

    Args:
        features: feature dict handed to ``neural_net``.
        labels: class labels; not used in prediction mode.
        mode: one of the ``tf.estimator.ModeKeys`` values.

    Returns:
        A ``tf.estimator.EstimatorSpec``. In PREDICT mode it only carries the
        predicted classes; otherwise it also carries the loss, the training
        op and an accuracy metric.
    """
    logits = neural_net(features)
    # Index of the largest logit per example.
    pred_classes = tf.argmax(logits, axis=1)

    # Prediction needs neither loss nor optimizer, so return early.
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions=pred_classes)

    # Mean sparse softmax cross-entropy; the labels are cast to int32 first.
    loss_op = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=tf.cast(labels, dtype=tf.int32)))
    # NOTE(review): `learning_rate` is read from module scope, but the
    # parameter section of this second script only defines n_hidden_1,
    # n_hidden_2 and num_classes. As written, only PREDICT mode (which
    # returns above) can run here -- define learning_rate before training or
    # evaluating with this file.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(
        loss_op, global_step=tf.train.get_global_step())

    acc_op = tf.metrics.accuracy(labels=labels, predictions=pred_classes)

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=pred_classes,
        loss=loss_op,
        train_op=train_op,
        eval_metric_ops={'accuracy': acc_op})
sess=tf.Session()
estimator = tf.estimator.Estimator(model_fn, model_dir='estimator')
# Predict single images
n_images = 4
# Get images from test set
test_images = mnist.test.images[:n_images]
# Prepare the input data
input_fn = tf.estimator.inputs.numpy_input_fn(
x={'images': test_images}, shuffle=False)
# Use the model to predict the images class
preds = list(estimator.predict(input_fn))
# Display
for i in range(n_images):
plt.imshow(np.reshape(test_images[i], [28, 28]), cmap='gray')
plt.show()
print("Model prediction:", preds[i])
Note that I have renamed the model variable to estimator, since it really is the estimator. Also, I am passing a model_dir to serialize the estimator separately from the other variables. I also had to explicitly make sure the second Python file has access to the two functions and any variables they depend upon. A couple of other minor fixes were made in the code.

Exporting cifar10 model from checkpoint file to tensorflow serving

I tried to modify the inception_export.py for CIFAR10 model, but I get the errors:
raise type(e)(node_def, op, message) tensorflow.python.framework.errors.InvalidArgumentError: Assign requires shapes of both tensors to match. lhs shape= [18,384] rhs shape= [2304,384]
[[Node: save/Assign_5 = Assign[T=DT_FLOAT, _class=["loc:@local3/weights"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/cpu:0"](local3/weights, save/restore_slice_5)]] Caused by op u'save/Assign_5', defined at:
I am still very new to tensorflow, any help is much appreciated, thanks
EDIT1: here is my code. I haven't installed tensorflow serving, so the related block is commented out. I also changed the image_size to 24 to fit the CIFAR10 model.
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#!/usr/bin/env python2.7
"""Modified for CIFAR10 model from https://github.com/tensorflow/serving/blob/master/tensorflow_serving/example/inception_export.py
"""
import os.path
import sys
# This is a placeholder for a Google-internal import.
import tensorflow as tf
from tensorflow.models.image.cifar10 import cifar10
#from inception import inception_model
#from tensorflow_serving.session_bundle import exporter
tf.app.flags.DEFINE_string('checkpoint_dir', '/tmp/cifar10_train',
"""Directory where to read training checkpoints.""")
tf.app.flags.DEFINE_string('export_dir', '/tmp/cifar10_export',
"""Directory where to export inference model.""")
tf.app.flags.DEFINE_integer('image_size', 24,
"""Needs to provide same value as in training.""")
FLAGS = tf.app.flags.FLAGS
NUM_CLASSES = 10
NUM_TOP_CLASSES = 2
WORKING_DIR = os.path.dirname(os.path.realpath(__file__))
SYNSET_FILE = os.path.join(WORKING_DIR, 'imagenet_lsvrc_2015_synsets.txt')
METADATA_FILE = os.path.join(WORKING_DIR, 'imagenet_metadata.txt')
def export():
    """Build a single-image CIFAR-10 inference graph and restore its checkpoint.

    The graph decodes one JPEG string, crops and resizes it to
    ``FLAGS.image_size``, rescales it to [-1, 1], runs ``cifar10.inference``
    and maps the top ``NUM_TOP_CLASSES`` predictions to class names. The
    moving-average variables are then restored from the latest checkpoint in
    ``FLAGS.checkpoint_dir``. The actual TensorFlow Serving export is still
    disabled (see the commented-out code at the end).

    Returns:
        None. A message is printed on success or when no checkpoint is found.
    """
    # Disabled: the ImageNet synset/metadata lookup of the original inception
    # exporter is not needed because the class names are hard-coded below.
    # # Create index->synset mapping
    # synsets = []
    # with open(SYNSET_FILE) as f:
    #     synsets = f.read().splitlines()
    # # Create synset->metadata mapping
    # texts = {}
    # with open(METADATA_FILE) as f:
    #     for line in f.read().splitlines():
    #         parts = line.split('\t')
    #         assert len(parts) == 2
    #         texts[parts[0]] = parts[1]
    with tf.Graph().as_default():
        # Build inference model.
        # Input transformation.
        # TODO(b/27776734): Add batching support.
        jpegs = tf.placeholder(tf.string, shape=(1))
        image_buffer = tf.squeeze(jpegs, [0])
        # Decode the string as an RGB JPEG.
        # Note that the resulting image contains an unknown height and width
        # that is set dynamically by decode_jpeg. In other words, the height
        # and width of image is unknown at compile-time.
        image = tf.image.decode_jpeg(image_buffer, channels=3)
        # After this point, all image pixels reside in [0,1)
        # until the very end, when they're rescaled to (-1, 1). The various
        # adjust_* ops all require this range for dtype float.
        image = tf.image.convert_image_dtype(image, dtype=tf.float32)
        # Crop the central region of the image with an area containing 87.5% of
        # the original image.
        image = tf.image.central_crop(image, central_fraction=0.875)
        # Resize the image to the height and width the model expects.
        image = tf.expand_dims(image, 0)
        image = tf.image.resize_bilinear(image,
                                         [FLAGS.image_size, FLAGS.image_size],
                                         align_corners=False)
        image = tf.squeeze(image, [0])
        # Finally, rescale to [-1,1] instead of [0, 1)
        image = tf.sub(image, 0.5)
        image = tf.mul(image, 2.0)
        images = tf.expand_dims(image, 0)

        # cifar10.inference() flattens the conv output with
        # tf.reshape(pool2, [FLAGS.batch_size, -1]). With the training default
        # (128) a single image yields local3/weights of shape [18, 384], which
        # cannot be restored from the checkpoint's [2304, 384]. The graph built
        # here always holds exactly one image, so the batch size must be 1.
        FLAGS.batch_size = 1

        # Run inference.
        logits = cifar10.inference(images)

        # Transform output to topK result.
        values, indices = tf.nn.top_k(logits, NUM_TOP_CLASSES)

        # Create a constant string Tensor where the i'th element is
        # the human readable class description for the i'th index.
        class_descriptions = ['airplane', 'automobile', 'bird', 'cat', 'deer',
                              'dog', 'frog', 'horse', 'ship', 'truck']
        # for s in synsets:
        #     class_descriptions.append(texts[s])
        class_tensor = tf.constant(class_descriptions)
        classes = tf.contrib.lookup.index_to_string(tf.to_int64(indices),
                                                    mapping=class_tensor)

        # Restore variables from training checkpoint, using the moving
        # averages kept during training.
        variable_averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        with tf.Session() as sess:
            # Restore variables from training checkpoints.
            ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
                # Assuming model_checkpoint_path looks something like:
                #   /my-favorite-path/cifar10_train/model.ckpt-0,
                # extract global_step from it.
                global_step = (
                    ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
                print('Successfully loaded model from %s at step=%s.' %
                      (ckpt.model_checkpoint_path, global_step))
            else:
                print('No checkpoint file found at %s' % FLAGS.checkpoint_dir)
                return

            # Not exporting yet because tensorflow serving is not installed.
            # # Export inference model.
            # init_op = tf.group(tf.initialize_all_tables(), name='init_op')
            # model_exporter = exporter.Exporter(saver)
            # signature = exporter.classification_signature(
            #     input_tensor=jpegs, classes_tensor=classes,
            #     scores_tensor=values)
            # model_exporter.init(default_graph_signature=signature,
            #                     init_op=init_op)
            # model_exporter.export(FLAGS.export_dir,
            #                       tf.constant(global_step), sess)
            # print('Successfully exported model to %s' % FLAGS.export_dir)
def main(unused_argv=None):
    """Run the export; the argument is accepted but ignored."""
    export()
if __name__ == '__main__':
tf.app.run()