How to export test values in TensorFlow

I'm using code similar to this as my main train/test data source, and this to run the model.
I can print the predictions as JSON, but I can't print the test values to see which prediction refers to which test example.
How can I do that?
I'd like to export the test data.
Here is my code for importing the data:
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A dataset loader for imports85.data."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import numpy as np
import tensorflow as tf
try:
import pandas as pd # pylint: disable=g-import-not-at-top
except ImportError:
pass
# Order is important for the csv-readers, so we use an OrderedDict here.
defaults = collections.OrderedDict([
("mes", [""]),
("marca", [""]),
("linha", [""]),
("grupo", [""]),
("capacidade", [0.0]),
("grade", [0.0]),
("custo", [0.0]),
("benef", [""]),
("desenvolvimento", [""]),
("leadtime", [0.0])
]) # pyformat: disable
types = collections.OrderedDict((key, type(value[0]))
for key, value in defaults.items())
def dataset(file_name="treino.csv", y_name="leadtime", train_fraction=0.7):
"""Load the imports85 data as a (train,test) pair of `Dataset`.
Each dataset generates (features_dict, label) pairs.
  Args:
    file_name: Path to the CSV file to load.
    y_name: The name of the column to use as the label.
    train_fraction: A float, the fraction of data to use for training. The
      remainder will be used for evaluation.
Returns:
A (train,test) pair of `Datasets`
"""
  # Point at the local data file.
path = file_name
# Define how the lines of the file should be parsed
def decode_line(line):
"""Convert a csv line into a (features_dict,label) pair."""
# Decode the line to a tuple of items based on the types of
# csv_header.values().
    items = tf.decode_csv(line, list(defaults.values()), field_delim=';')
# Convert the keys and items to a dict.
pairs = zip(defaults.keys(), items)
features_dict = dict(pairs)
# Remove the label from the features_dict
label = features_dict.pop(y_name)
return features_dict, label
def has_no_question_marks(line):
"""Returns True if the line of text has no question marks."""
# split the line into an array of characters
chars = tf.string_split(line[tf.newaxis], "").values
# for each character check if it is a question mark
is_question = tf.equal(chars, "?")
any_question = tf.reduce_any(is_question)
no_question = ~any_question
return no_question
def in_training_set(line):
"""Returns a boolean tensor, true if the line is in the training set."""
# If you randomly split the dataset you won't get the same split in both
# sessions if you stop and restart training later. Also a simple
# random split won't work with a dataset that's too big to `.cache()` as
# we are doing here.
num_buckets = 1000000
bucket_id = tf.string_to_hash_bucket_fast(line, num_buckets)
# Use the hash bucket id as a random number that's deterministic per example
return bucket_id < int(train_fraction * num_buckets)
def in_test_set(line):
"""Returns a boolean tensor, true if the line is in the training set."""
# Items not in the training set are in the test set.
# This line must use `~` instead of `not` because `not` only works on python
# booleans but we are dealing with symbolic tensors.
return ~in_training_set(line)
base_dataset = (tf.contrib.data
# Get the lines from the file.
.TextLineDataset(path)
# drop lines with question marks.
.filter(has_no_question_marks))
train = (base_dataset
# Take only the training-set lines.
.filter(in_training_set)
# Decode each line into a (features_dict, label) pair.
.map(decode_line)
# Cache data so you only decode the file once.
.cache())
# Do the same for the test-set.
  test = (base_dataset.filter(in_test_set).map(decode_line).cache())
return train, test
def raw_dataframe(file_name="treino.csv"):
  """Load the data as a pd.DataFrame."""
  # Read the local, semicolon-delimited data file (the same delimiter
  # decode_line uses above).
  path = file_name
  # Load it into a pandas dataframe.
  df = pd.read_csv(path, sep=";", names=types.keys(), dtype=types, na_values="?")
  return df
def load_data(y_name="leadtime", train_fraction=0.7, seed=None):
"""Get the imports85 data set.
A description of the data is available at:
https://archive.ics.uci.edu/ml/datasets/automobile
The data itself can be found at:
https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data
Args:
y_name: the column to return as the label.
train_fraction: the fraction of the dataset to use for training.
seed: The random seed to use when shuffling the data. `None` generates a
unique shuffle every run.
Returns:
a pair of pairs where the first pair is the training data, and the second
is the test data:
    `(x_train, y_train), (x_test, y_test) = load_data(...)`
`x` contains a pandas DataFrame of features, while `y` contains the label
array.
"""
# Load the raw data columns.
data = raw_dataframe()
# Delete rows with unknowns
data = data.dropna()
# Shuffle the data
np.random.seed(seed)
# Split the data into train/test subsets.
x_train = data.sample(frac=train_fraction, random_state=seed)
x_test = data.drop(x_train.index)
# Extract the label from the features dataframe.
y_train = x_train.pop(y_name)
y_test = x_test.pop(y_name)
return (x_train, y_train), (x_test, y_test)
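Side note: load_data at the end of this file already returns the test split as pandas objects, so if all you need is the raw test rows, they can be exported directly. A minimal sketch, assuming this module is saved as importar_dados.py (note this random pandas split is not the same split that dataset() produces with its hash buckets, so it will not line up with the Estimator's predictions):

import importar_dados

# Recover the pandas test split and export it together with its labels.
(x_train, y_train), (x_test, y_test) = importar_dados.load_data()
x_test.assign(leadtime=y_test).to_csv("test_data.csv", index=False)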
And here is my code to train, evaluate, and predict:
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Regression using the DNNRegressor Estimator."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import pandas as pd
import importar_dados # pylint: disable=g-bad-import-order
STEPS = 100
LT_NORM_FACTOR = 199
def my_dnn_regression_fn(features, labels, mode, params):
"""A model function implementing DNN regression for a custom Estimator."""
# Extract the input into a dense layer, according to the feature_columns.
top = tf.feature_column.input_layer(features, params["feature_columns"])
  # Iterate over the "hidden_units" list of layer sizes; the default is [100].
for units in params.get("hidden_units", [100]):
# Add a hidden layer, densely connected on top of the previous layer.
top = tf.layers.dense(inputs=top, units=units, activation=tf.nn.relu)
# Connect a linear output layer on top.
output_layer = tf.layers.dense(inputs=top, units=1)
# Reshape the output layer to a 1-dim Tensor to return predictions
predictions = tf.squeeze(output_layer, 1)
if mode == tf.estimator.ModeKeys.PREDICT:
# In `PREDICT` mode we only need to return predictions.
return tf.estimator.EstimatorSpec(
mode=mode, predictions={"leadtime": predictions})
# Calculate loss using mean squared error
average_loss = tf.losses.mean_squared_error(labels, predictions)
# Pre-made estimators use the total_loss instead of the average,
# so report total_loss for compatibility.
batch_size = tf.shape(labels)[0]
total_loss = tf.to_float(batch_size) * average_loss
if mode == tf.estimator.ModeKeys.TRAIN:
optimizer = params.get("optimizer", tf.train.AdamOptimizer)
optimizer = optimizer(params.get("learning_rate", None))
train_op = optimizer.minimize(
loss=average_loss, global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(
mode=mode, loss=total_loss, train_op=train_op)
# In evaluation mode we will calculate evaluation metrics.
assert mode == tf.estimator.ModeKeys.EVAL
# Calculate root mean squared error
rmse = tf.metrics.root_mean_squared_error(labels, predictions)
# Add the rmse to the collection of evaluation metrics.
eval_metrics = {"rmse": rmse}
return tf.estimator.EstimatorSpec(
mode=mode,
# Report sum of error for compatibility with pre-made estimators
loss=total_loss,
eval_metric_ops=eval_metrics)
def main(argv):
"""Builds, trains, and evaluates the model."""
assert len(argv) == 1
(train, test) = importar_dados.dataset()
  # Normalize the labels (divide by LT_NORM_FACTOR) for better convergence.
def normalize_lt(features, labels):
return features, labels / LT_NORM_FACTOR
train = train.map(normalize_lt)
test = test.map(normalize_lt)
# Build the training input_fn.
def input_train():
return (
# Shuffling with a buffer larger than the data set ensures
# that the examples are well mixed.
train.shuffle(1000).batch(128)
# Repeat forever
.repeat().make_one_shot_iterator().get_next())
# Build the validation input_fn.
def input_test():
return (test.shuffle(1000).batch(128)
.make_one_shot_iterator().get_next())
# The first way assigns a unique weight to each category. To do this you must
# specify the category's vocabulary (values outside this specification will
# receive a weight of zero). Here we specify the vocabulary using a list of
# options. The vocabulary can also be specified with a vocabulary file (using
# `categorical_column_with_vocabulary_file`). For features covering a
# range of positive integers use `categorical_column_with_identity`.
marca_vocab = ["ANIMALE","FABULA","FARM","A.BRAND","F.Y.I","MAS ANIMALE"]
marca = tf.feature_column.categorical_column_with_vocabulary_list(
key="marca", vocabulary_list=marca_vocab)
mes_vocab = ["1","2","3","4","5","6","7","8","9","10","11","12"]
mes = tf.feature_column.categorical_column_with_vocabulary_list(
key="mes", vocabulary_list=mes_vocab)
linha_vocab = ["A+","SEDA","TRICOLINE","MALHA","JNS","SARJA","TECIDO","TECIDO PLANO","DESFILE ABRAND","ARTESANAL",
"TREND","NOITE","BB","JEANS","HANDMADE","ESI","ALFAIATARIA","PRO","COURO","EST","CONCEPT","OFF PREMIUM",
"ACESSORIOS","MOVE","NOITE CASUAL","TAT","RESORT","EMI","EMT","FITNESS","BALADA","HOME VESTUARIO",
"UNIFORME","BOT","VTL","TECIDO PLANO BASICO","HOM","PRAIA","INTIMATES","BTP","TRICOT","QUERO","EMB",
"ATL","BMA","SAPATO","PRINCESS","BLUE","BOLSA","ESB","TECIDO PLANO ELABORADO","NOVOS DESEJOS","FESTA",
"FANTASIA","MARKETING","ACE","TECIDO PLANO ESTAMPADO","ADMINISTRATIVO","FAN","TECIDO PLANO LISO","AGA",
"CDO","AGE","BIJOUX","COBRANDING","NEUTROS","ESM"]
linha = tf.feature_column.categorical_column_with_vocabulary_list(
key="linha", vocabulary_list=linha_vocab)
grupo_vocab = ["VESTIDOS","TOP","TOP NEUTRO","TOP ELABORADO","SHORT","BLUSA","TOP BASICO","BOTTOM BASICO","VESTIDO BASICO",
"BLUSA ESTAMPADA","BOTTOM","MACACAO","TOP FUN","OVERTOPS","VESTIDO ESTAMPADO","BOTTOM ESTAMPADO",
"BOTTOM ELABORADO","CALCAS","CAMISA","SAIAS","AGASALHO","CALCA ESTAMPADA","ACESSORIOS","DIVERSOS",
"CINTOS","BIQUINI","TOP TECIDO","BIQUINI/MAIO","VESTUARIO","OVERTOP ESTAMPADO","CALCINHA","BERMUDA",
"LINGERIE","MAIO","VESTIDOS ELABORADO","OUTROS","SAPATOS","BOLSAS","CAMISA ESTAMPADA","LENCO","CHAPEU",
"FANTASIA","OVERTOP PESADO","TOP LEVE","HOME","PRAIA","OVERTOP LEVE","OVERTOP ELABORADO","STREET","ESPECIAL",
"PIJAMA","CANGA","BRINCO","SOUTIEN","OVERTOP BASICO","UNDERWEAR"]
grupo = tf.feature_column.categorical_column_with_vocabulary_list(
key="grupo", vocabulary_list=grupo_vocab)
benef_vocab = ["S","N"]
benef = tf.feature_column.categorical_column_with_vocabulary_list(
key="benef", vocabulary_list=benef_vocab)
desenvolvimento_vocab = ["INT","EX"]
desenvolvimento = tf.feature_column.categorical_column_with_vocabulary_list(
key="desenvolvimento", vocabulary_list=desenvolvimento_vocab)
# make = tf.feature_column.categorical_column_with_hash_bucket(
# key="make", hash_bucket_size=50)
feature_columns = [
tf.feature_column.indicator_column(mes),
tf.feature_column.indicator_column(marca),
tf.feature_column.indicator_column(linha),
tf.feature_column.indicator_column(grupo),
tf.feature_column.numeric_column(key="capacidade"),
tf.feature_column.numeric_column(key="grade"),
tf.feature_column.numeric_column(key="custo"),
# Since this is a DNN model, convert categorical columns from sparse
# to dense.
# Wrap them in an `indicator_column` to create a
# one-hot vector from the input.
tf.feature_column.indicator_column(benef),
tf.feature_column.indicator_column(desenvolvimento)#,
# Or use an `embedding_column` to create a trainable vector for each
# index.
# tf.feature_column.embedding_column(make, dimension=3),
]
# Build a custom Estimator, using the model_fn.
# `params` is passed through to the `model_fn`.
model = tf.estimator.Estimator(
model_fn=my_dnn_regression_fn,
params={
"feature_columns": feature_columns,
"learning_rate": 0.001,
"optimizer": tf.train.AdamOptimizer,
"hidden_units": [100,500,100]
},
model_dir="resultados")
# Train the model.
model.train(input_fn=input_train, steps=STEPS)
# Evaluate how the model performs on data it has not yet seen.
eval_result = model.evaluate(input_fn=input_test)
  pred_result = model.predict(input_fn=input_test,
                              predict_keys=None,
                              hooks=None,
                              checkpoint_path=None)
sess = tf.Session()
# Print the Root Mean Square Error (RMSE).
print("\n" + 80 * "*")
print("\nRMS error for the test set: {:.0f} Dias"
.format(LT_NORM_FACTOR * eval_result["rmse"]))
#prediction_df = pd.DataFrame(list(pred_result))
#prediction_df.to_csv('prediction.csv')
print(list(pred_result))
print()
if __name__ == "__main__":
# The Estimator periodically generates "INFO" logs; make these logs visible.
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run(main=main)
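To see which prediction refers to which test example, the key point is that the predict input_fn must not shuffle; with input_test above, the order of pred_result cannot be matched back to the rows. A minimal sketch (not part of the original code, placed at the end of main() where test, model, and LT_NORM_FACTOR are in scope): materialize the test rows once in a deterministic order, predict with the same ordered input_fn, and zip the two together to build the prediction.csv hinted at in the commented-out lines above. String features come back as bytes here.

  def input_test_ordered():
    # Same as input_test but WITHOUT shuffle, so the row order is deterministic.
    return test.batch(128).make_one_shot_iterator().get_next()

  # 1) Materialize the test features and labels, in order.
  rows = []
  features, labels = input_test_ordered()
  with tf.Session() as sess:
    try:
      while True:
        f_batch, l_batch = sess.run((features, labels))
        for i in range(len(l_batch)):
          row = {k: v[i] for k, v in f_batch.items()}
          row["leadtime_real"] = l_batch[i] * LT_NORM_FACTOR
          rows.append(row)
    except tf.errors.OutOfRangeError:
      pass

  # 2) Predict with the same ordered input_fn and pair each row with its prediction.
  for row, pred in zip(rows, model.predict(input_fn=input_test_ordered)):
    row["leadtime_pred"] = pred["leadtime"] * LT_NORM_FACTOR
  pd.DataFrame(rows).to_csv("prediction.csv", index=False)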

Related

Use `sentence-transformers` inside of a TensorFlow Recommenders Keras model in SageMaker

I've been going crazy for a few days over a problem that I thought was trivial. My end goal is to deploy to AWS SageMaker a TensorFlow model that takes a simple string as input, calculates the embedding using a pre-trained 'sentence-transformer' model, and finally uses TensorFlow Recommenders to suggest the kNN among a collection of embeddings I have already calculated. I would like to do this entirely inside the model, including the preprocessing (tokenization).
I got the predictions working with different approaches in my notebook. The trouble starts when I try to save the model.
The problem seems to be that HF's AutoTokenizer needs a pure list of strings as input, so I hit a roadblock whenever I try to save my model, and trying to work around this with tf.py_function using this approach results in problems with SageMaker.
My approaches so far:
1. THE 'I THOUGHT IT WAS SO SIMPLE'
def text_to_startup_model(
    startups_ids: list, startup_vectors
):
import tensorflow as tf
import tensorflow_recommenders as tfrs
import numpy as np
from random import randint
exported_model = tfrs.layers.factorized_top_k.BruteForce(SentenceTransformer("all-mpnet-base-v2").encode)
exported_model.index(np.array(startup_vectors), np.array(startups_ids))
# TESTS the model
#for some reason this seems to be needed in order to save the model :/
# https://github.com/tensorflow/recommenders/issues/131
test = exported_model(['Test Text Query'])
print(test)
return exported_model
text_to_startup_model(search_db_ids, search_db_embeddings)
#--> WORKS PERFECTLY, AS I GET SOME SUGGESTIONS
tf.saved_model.save(text_to_startup_model(search_db_ids, search_db_embeddings), export_dir="/home/nicholas/test_model_save/1")
#TypeError Traceback (most recent call last)
# /home/nicholas/Documents/Dev/Rialto-predict-1/notebooks/t2s_different_approaches.ipynb Cell 5 in <cell line: 22>()
# 19 text_to_startup_model(search_db_ids, search_db_embeddings)
# 20 #--> WORKS PERFECTLY, AS I GET SOME SUGGESTIONS
# ---> 22 tf.saved_model.save(text_to_startup_model(search_db_ids, search_db_embeddings), export_dir="/home/nicholas/test_model_save/1")
# File ~/Documents/Dev/Rialto-predict-1/venv/lib/python3.10/site-packages/tensorflow/python/saved_model/save.py:1334, in save(obj, export_dir, signatures, options)
# 1332 # pylint: enable=line-too-long
# 1333 metrics.IncrementWriteApi(_SAVE_V2_LABEL)
# -> 1334 save_and_return_nodes(obj, export_dir, signatures, options)
# 1335 metrics.IncrementWrite(write_version="2")
#
# .........
#
#
# File ~/Documents/Dev/Rialto-predict-1/venv/lib/python3.10/site-packages/tensorflow/python/eager/def_function.py:677, in Function._defun_with_scope.<locals>.wrapped_fn(*args, **kwds)
# 673 with default_graph._variable_creator_scope(scope, priority=50): # pylint: disable=protected-access
# 674 # __wrapped__ allows AutoGraph to swap in a converted function. We give
# 675 # the function a weak reference to itself to avoid a reference cycle.
# 676 with OptionalXlaContext(compile_with_xla):
# --> 677 out = weak_wrapped_fn().__wrapped__(*args, **kwds)
# 678 return out
# File ~/Documents/Dev/Rialto-predict-1/venv/lib/python3.10/site-packages/tensorflow/python/framework/func_graph.py:1147, in func_graph_from_py_func.<locals>.autograph_handler(*args, **kwargs)
# 1145 except Exception as e: # pylint:disable=broad-except
# 1146 if hasattr(e, "ag_error_metadata"):
# -> 1147 raise e.ag_error_metadata.to_exception(e)
# 1148 else:
# 1149 raise
# TypeError: in user code:
# File "/home/nicholas/Documents/Dev/Rialto-predict-1/venv/lib/python3.10/site-packages/keras/saving/saving_utils.py", line 138, in _wrapped_model *
# outputs = model(*args, **kwargs)
# File "/home/nicholas/Documents/Dev/Rialto-predict-1/venv/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler **
# raise e.with_traceback(filtered_tb) from None
# TypeError: Exception encountered when calling layer "brute_force_3" (type BruteForce).
# in user code:
# File "/home/nicholas/Documents/Dev/Rialto-predict-1/venv/lib/python3.10/site-packages/tensorflow_recommenders/layers/factorized_top_k.py", line 567, in call *
# queries = self.query_model(queries)
# File "/home/nicholas/Documents/Dev/Rialto-predict-1/venv/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py", line 160, in encode *
# features = self.tokenize(sentences_batch)
# File "/home/nicholas/Documents/Dev/Rialto-predict-1/venv/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py", line 318, in tokenize *
# return self._first_module().tokenize(texts)
# File "/home/nicholas/Documents/Dev/Rialto-predict-1/venv/lib/python3.10/site-packages/sentence_transformers/models/Transformer.py", line 102, in tokenize *
# batch1.append(text_tuple[0])
# TypeError: 'NoneType' object is not subscriptable
# ...
# Call arguments received:
# • queries=['None']
# • k=None
2. THE tf.py_function
From my understanding, the problem with the first approach is that it has no knowledge of the input type/value. This second approach, from Use `sentence-transformers` inside of a keras model, was supposedly going to work, as it uses tf.py_function to accept a list of strings as the first input without complaining.
def approach_2(startups_ids: list, startup_vectors):
import tensorflow as tf
import tensorflow_recommenders as tfrs
import numpy as np
from transformers import MPNetTokenizer, TFMPNetModel
# Here it loads the specific pre-trained model we are using for Rialto
tokenizer = MPNetTokenizer.from_pretrained(
"sentence-transformers/all-mpnet-base-v2"
)
model = TFMPNetModel.from_pretrained(
"sentence-transformers/all-mpnet-base-v2", from_pt=True
)
class SBert(tf.keras.layers.Layer):
def __init__(self, tokenizer, model):
super(SBert, self).__init__()
self.tokenizer = tokenizer
self.model = model
def tf_encode(self, inputs):
def encode(inputs):
inputs = [x[0].decode("utf-8") for x in inputs.numpy()]
outputs = self.tokenizer(
inputs, padding=True, truncation=True, return_tensors="tf"
)
return outputs["input_ids"], outputs["attention_mask"]
return tf.py_function(
func=encode, inp=[inputs], Tout=[tf.int32, tf.int32]
)
def process(self, i, a):
def __call(i, a):
model_output = self.model(
{"input_ids": i.numpy(), "attention_mask": a.numpy()}
)
return model_output[0]
return tf.py_function(func=__call, inp=[i, a], Tout=[tf.float32])
def mean_pooling(self, model_output, attention_mask):
token_embeddings = tf.squeeze(tf.stack(model_output), axis=0)
input_mask_expanded = tf.cast(
tf.broadcast_to(
tf.expand_dims(attention_mask, -1), tf.shape(token_embeddings)
),
tf.float32,
)
a = tf.math.reduce_sum(token_embeddings * input_mask_expanded, axis=1)
b = tf.clip_by_value(
tf.math.reduce_sum(input_mask_expanded, axis=1),
1e-9,
tf.float32.max,
)
embeddings = a / b
embeddings, _ = tf.linalg.normalize(embeddings, 2, axis=1)
return embeddings
def call(self, inputs):
input_ids, attention_mask = self.tf_encode(inputs)
model_output = self.process(input_ids, attention_mask)
embeddings = self.mean_pooling(model_output, attention_mask)
return embeddings
# Uses the keras-ified model in a Keras model
sbert = SBert(tokenizer, model)
inputs = tf.keras.layers.Input((1,), dtype=tf.string)
outputs = sbert(inputs)
model = tf.keras.Model(inputs, outputs)
# Implements the model we just build for top KNN retrieval, from the pool of pre-calculated startups embeddings.
exported_model = tfrs.layers.factorized_top_k.BruteForce(model)
exported_model.index(np.array(startup_vectors), np.array(startups_ids))
# TESTS the model
# for some reason this seems to be needed in order to save the model :/
# https://github.com/tensorflow/recommenders/issues/131
print(exported_model(tf.constant(["'Test Text Query'"])))
return exported_model
model_to_store_1 = approach_2(search_db_ids, search_db_embeddings)
tf.saved_model.save(model_to_store_1, export_dir="/home/nicholas/test_model_save/2")
# THIS ONE WORKS LIKE A CHARM: saving the model succeeds, and deploying to SageMaker is successful.
# BUT IT FAILS AT INFERENCE TIME ON SAGEMAKER. BELOW ARE THE LOGS WHEN THE MODEL IS CALLED
# ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from model with message "{
# "error": "No OpKernel was registered to support Op 'EagerPyFunc' used by {{node StatefulPartitionedCall/brute_force/model/s_bert/EagerPyFunc}} with these attrs: [is_async=false, Tin=[DT_STRING], _output_shapes=[<unknown>, <unknown>], Tout=[DT_INT32, DT_INT32], token=\"pyfunc_4\"]\nRegistered devices: [CPU]\nRegistered kernels:\n <no registered kernels>\n\n\t [[StatefulPartitionedCall/brute_force/model/s_bert/EagerPyFunc]]\n\t [[StatefulPartitionedCall]]"
# }". See https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logEventViewer:group=/aws/sagemaker/Endpoints/rialto-t2s-model-endpoint in account 634470116418 for more information
As you can see from the log, the problem seems to be with eager mode and py_functions. I tried to Google it and found absolutely nothing on how to address this issue.
3. THE Classes approach
I've tried implementing something building upon this article, but I am running into issues similar to the first approach: when I go to save the model, the expected input clashes with the requirements of the tokenizer.
EDIT 1 - here is a Colab showcasing the approach: https://colab.research.google.com/drive/1gibFdEoHTs0hzD5yiXzLT_-asmilUoAQ?usp=sharing#scrollTo=TibAssWm3D5e
All of this journey triggered some questions:
Question 1: Is this even a best practice? Should I serve my model the tokenized sentences as a tensor?
Question 2: How the hell do I make it work? :)
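On Question 1, one direction worth sketching (hypothetical names, not the author's code) is to keep tokenization client-side and export a model whose serving signature accepts the already-tokenized tensors. That removes the EagerPyFunc op from the SavedModel graph entirely, which is exactly the op the SageMaker runtime reports no kernel for:

import tensorflow as tf
from transformers import MPNetTokenizer, TFMPNetModel

tokenizer = MPNetTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
hf_model = TFMPNetModel.from_pretrained("sentence-transformers/all-mpnet-base-v2", from_pt=True)

class PreTokenizedEncoder(tf.Module):
    """Mean-pooled, L2-normalized sentence embeddings from pre-tokenized input."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    @tf.function(input_signature=[
        tf.TensorSpec([None, None], tf.int32, name="input_ids"),
        tf.TensorSpec([None, None], tf.int32, name="attention_mask"),
    ])
    def __call__(self, input_ids, attention_mask):
        # Pure-TF graph: no py_function anywhere.
        token_embeddings = self.model(
            {"input_ids": input_ids, "attention_mask": attention_mask})[0]
        mask = tf.cast(tf.expand_dims(attention_mask, -1), tf.float32)
        summed = tf.reduce_sum(token_embeddings * mask, axis=1)
        counts = tf.clip_by_value(tf.reduce_sum(mask, axis=1), 1e-9, tf.float32.max)
        embeddings, _ = tf.linalg.normalize(summed / counts, 2, axis=1)
        return embeddings

encoder = PreTokenizedEncoder(hf_model)
tf.saved_model.save(encoder, "/tmp/pretokenized_encoder",
                    signatures=encoder.__call__.get_concrete_function())

# Client side: tokenize first, then call the endpoint with plain int tensors.
batch = tokenizer(["Test Text Query"], padding=True, truncation=True, return_tensors="tf")
print(encoder(batch["input_ids"], batch["attention_mask"]))

The tfrs BruteForce layer could then be indexed on embeddings produced this way, with the raw-string handling kept outside the SavedModel.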

TFX Tensorflow model validator component - You passed a data dictionary with keys ['image_raw_xf']. Expected the following keys: ['input_1']

I'm building a TFX pipeline based on the cifar10 example: https://github.com/tensorflow/tfx/tree/master/tfx/examples/cifar10
The difference is that I don't want to convert it to a TFLite model; instead, I use a regular Keras-based TensorFlow model.
Everything works as expected until I get to the Evaluator component as it fails with the following error:
ValueError: Missing data for input "input_1". You passed a data dictionary with keys ['image_xf']. Expected the following keys: ['input_1']
[while running 'Run[Trainer]']
Not sure what I'm doing wrong, but so far I debugged/modified the code as follows:
[1] The preprocessing_fn output is outputting the key image_xf:
_IMAGE_KEY = 'image'
_LABEL_KEY = 'label'
def _transformed_name(key):
return key + '_xf'
def preprocessing_fn(inputs):
"""tf.transform's callback function for preprocessing inputs.
Args:
inputs: map from feature keys to raw not-yet-transformed features.
Returns:
Map from string feature key to transformed feature operations.
"""
outputs = {}
# tf.io.decode_png function cannot be applied on a batch of data.
# We have to use tf.map_fn
image_features = tf.map_fn(
lambda x: tf.io.decode_png(x[0], channels=3),
inputs[_IMAGE_KEY],
dtype=tf.uint8)
# image_features = tf.cast(image_features, tf.float32)
image_features = tf.image.resize(image_features, [224, 224])
image_features = tf.keras.applications.mobilenet.preprocess_input(
image_features)
outputs[_transformed_name(_IMAGE_KEY)] = image_features
#outputs["input_1"] = image_features
# TODO(b/157064428): Support label transformation for Keras.
# Do not apply label transformation as it will result in wrong evaluation.
outputs[_transformed_name(_LABEL_KEY)] = inputs[_LABEL_KEY]
return outputs
[2] When I build the model, I am using transfer learning with an InputLayer with the same name, image_xf.
def _build_keras_model() -> tf.keras.Model:
"""Creates a Image classification model with MobileNet backbone.
Returns:
    The image classification Keras Model and the backbone MobileNet model
"""
# We create a MobileNet model with weights pre-trained on ImageNet.
# We remove the top classification layer of the MobileNet, which was
# used for classifying ImageNet objects. We will add our own classification
# layer for CIFAR10 later. We use average pooling at the last convolution
  # layer to get a 1D vector for classification, which is consistent with the
  # original MobileNet setup.
base_model = tf.keras.applications.MobileNet(
input_shape=(224, 224, 3),
include_top=False,
weights='imagenet',
pooling='avg')
base_model.input_spec = None
  # We add a Dropout layer at the top of the MobileNet backbone we just created
  # to prevent overfitting, and then a Dense layer for classifying CIFAR10 objects.
model = tf.keras.Sequential([
tf.keras.layers.InputLayer(
input_shape=(224, 224, 3), name=_transformed_name(_IMAGE_KEY)),
base_model,
tf.keras.layers.Dropout(0.1),
tf.keras.layers.Dense(10, activation='softmax')
])
[3] The model signature is created accordingly:
def _get_serve_image_fn(model, tf_transform_output):
"""Returns a function that feeds the input tensor into the model."""
model.tft_layer = tf_transform_output.transform_features_layer()
  @tf.function
def serve_image_fn(serialized_tf_examples):
feature_spec = tf_transform_output.raw_feature_spec()
feature_spec.pop(_LABEL_KEY)
parsed_features = tf.io.parse_example(serialized_tf_examples, feature_spec)
transformed_features = model.tft_layer(parsed_features)
return model(transformed_features)
return serve_image_fn
def run_fn(fn_args: FnArgs):
tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)
signatures = {
'serving_default':
_get_serve_image_fn(model,tf_transform_output).get_concrete_function(
tf.TensorSpec(
shape=[None],
dtype=tf.string,
name=_IMAGE_KEY))
}
temp_saving_model_dir = os.path.join(fn_args.serving_model_dir)
model.save(temp_saving_model_dir, save_format='tf', signatures=signatures)
Now, I suspect that TensorFlow is not saving the model correctly, because when I inspect the saved model, the input layer is input_1 instead of image_xf.
import tensorflow as tf
import numpy as np
import tensorflow.python.ops.numpy_ops.np_config as np_config
np_config.enable_numpy_behavior()
path = './model/Format-Serving/'
imported = tf.saved_model.load(path)
model = tf.keras.models.load_model(path)
print(model.summary())
print(list(imported.signatures.keys()))
print(model.get_layer('mobilenet_1.00_224').layers[0].name)
The things to notice here are (1) the InputLayer I added in the Sequential model above is missing, and (2) the MobileNet's first layer is input_1, so it makes sense that I'm getting a mismatch.
2021-10-15 08:33:40.683034: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
mobilenet_1.00_224 (Function (None, 1024) 3228864
_________________________________________________________________
dropout (Dropout) (None, 1024) 0
_________________________________________________________________
dense (Dense) (None, 10) 10250
=================================================================
Total params: 3,239,114
Trainable params: 1,074,186
Non-trainable params: 2,164,928
_________________________________________________________________
None
['serving_default']
input_1
So how can I actually get the model to save correctly with the right input?
Here is the full code:
pipeline.py
# Lint as: python2, python3
# Copyright 2019 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""CIFAR10 image classification example using TFX.
This example demonstrates how to do data augmentation, transfer learning,
and inserting TFLite metadata with TFX.
The trained model can be plugged into MLKit for object detection.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import os
from typing import List, Text
import absl
from tfx import v1 as tfx
import tensorflow_model_analysis as tfma
from tfx.components import Evaluator
from tfx.components import ExampleValidator
from tfx.components import ImportExampleGen
from tfx.components import Pusher
from tfx.components import SchemaGen
from tfx.components import StatisticsGen
from tfx.components import Trainer
from tfx.components import Transform
from tfx.dsl.components.common import resolver
from tfx.dsl.experimental import latest_blessed_model_resolver
from tfx.orchestration import metadata
from tfx.orchestration import pipeline
from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner
from tfx.proto import example_gen_pb2
from tfx.proto import pusher_pb2
from tfx.proto import trainer_pb2
from tfx.types import Channel
from tfx.types.standard_artifacts import Model
from tfx.types.standard_artifacts import ModelBlessing
_pipeline_name = 'cifar10_native_keras'
# This example assumes that CIFAR10 train set data is stored in
# ~/cifar10/data/train, test set data is stored in ~/cifar10/data/test, and
# the utility function is in ~/cifar10. Feel free to customize as needed.
_cifar10_root = os.path.join(os.getcwd())
_data_root = os.path.join(_cifar10_root, 'data')
# Python module files to inject customized logic into the TFX components. The
# Transform and Trainer both require user-defined functions to run successfully.
_module_file = os.path.join(_cifar10_root, 'cifar10_utils_native_keras.py')
# Path which can be listened to by the model server. Pusher will output the
# trained model here.
_serving_model_dir_lite = os.path.join(_cifar10_root, 'serving_model_lite',
_pipeline_name)
# Directory and data locations. This example assumes all of the images,
# example code, and metadata library is relative to $HOME, but you can store
# these files anywhere on your local filesystem.
_tfx_root = os.path.join(os.getcwd(), 'tfx')
_pipeline_root = os.path.join(_tfx_root, 'pipelines', _pipeline_name)
# Sqlite ML-metadata db path.
_metadata_path = os.path.join(_tfx_root, 'metadata', _pipeline_name,
'metadata.db')
# Path to labels file for mapping model outputs.
_labels_path = os.path.join(_data_root, 'labels.txt')
# Pipeline arguments for Beam powered Components.
_beam_pipeline_args = [
'--direct_running_mode=multi_processing',
'--direct_num_workers=0',
]
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
module_file: Text, serving_model_dir_lite: Text,
metadata_path: Text,
labels_path: Text,
beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
"""Implements the CIFAR10 image classification pipeline using TFX."""
# This is needed for datasets with pre-defined splits
# Change the pattern argument to train_whole/* and test_whole/* to train
# on the whole CIFAR-10 dataset
input_config = example_gen_pb2.Input(splits=[
example_gen_pb2.Input.Split(name='train', pattern='train/*'),
example_gen_pb2.Input.Split(name='eval', pattern='test/*')
])
# Brings data into the pipeline.
example_gen = ImportExampleGen(
input_base=data_root, input_config=input_config)
# Computes statistics over data for visualization and example validation.
statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
# Generates schema based on statistics files.
schema_gen = SchemaGen(
statistics=statistics_gen.outputs['statistics'], infer_feature_shape=True)
# Performs anomaly detection based on statistics and data schema.
example_validator = ExampleValidator(
statistics=statistics_gen.outputs['statistics'],
schema=schema_gen.outputs['schema'])
# Performs transformations and feature engineering in training and serving.
transform = Transform(
examples=example_gen.outputs['examples'],
schema=schema_gen.outputs['schema'],
module_file=module_file)
model_resolver = resolver.Resolver(
#instance_name='latest_model_resolver',
strategy_class=tfx.dsl.experimental.LatestArtifactStrategy,
model=Channel(type=Model)).with_id('latest_blessed_model_resolver')
# Uses user-provided Python function that trains a model.
  # When training on the whole dataset, use 18744 for train steps, 156 for eval
# steps. 18744 train steps correspond to 24 epochs on the whole train set, and
# 156 eval steps correspond to 1 epoch on the whole test set. The
# configuration below is for training on the dataset we provided in the data
# folder, which has 128 train and 128 test samples. The 160 train steps
# correspond to 40 epochs on this tiny train set, and 4 eval steps correspond
# to 1 epoch on this tiny test set.
trainer = Trainer(
module_file=module_file,
examples=transform.outputs['transformed_examples'],
transform_graph=transform.outputs['transform_graph'],
schema=schema_gen.outputs['schema'],
base_model=model_resolver.outputs['model'],
train_args=trainer_pb2.TrainArgs(num_steps=160),
eval_args=trainer_pb2.EvalArgs(num_steps=4),
custom_config={'labels_path': labels_path})
# Get the latest blessed model for model validation.
# model_resolver = resolver.Resolver(
# strategy_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
# model=Channel(type=Model),
# model_blessing=Channel(
# type=ModelBlessing)).with_id('latest_blessed_model_resolver')
# Uses TFMA to compute evaluation statistics over features of a model and
# perform quality validation of a candidate model (compare to a baseline).
eval_config = tfma.EvalConfig(
model_specs=[tfma.ModelSpec(label_key='label')],
slicing_specs=[tfma.SlicingSpec()],
metrics_specs=[
tfma.MetricsSpec(metrics=[
tfma.MetricConfig(
class_name='SparseCategoricalAccuracy',
threshold=tfma.MetricThreshold(
value_threshold=tfma.GenericValueThreshold(
lower_bound={'value': 0.55}),
# Change threshold will be ignored if there is no
# baseline model resolved from MLMD (first run).
change_threshold=tfma.GenericChangeThreshold(
direction=tfma.MetricDirection.HIGHER_IS_BETTER,
absolute={'value': -1e-3})))
])
])
# Uses TFMA to compute the evaluation statistics over features of a model.
# We evaluate using the materialized examples that are output by Transform
# because
  # 1. the decoding_png function currently performed within Transform is not
# compatible with TFLite.
# 2. MLKit requires deserialized (float32) tensor image inputs
# Note that for deployment, the same logic that is performed within Transform
# must be reproduced client-side.
evaluator = Evaluator(
examples=example_gen.outputs['examples'],
model=trainer.outputs['model'],
#baseline_model=model_resolver.outputs['model'],
eval_config=eval_config)
# Checks whether the model passed the validation steps and pushes the model
# to a file destination if check passed.
pusher = Pusher(
model=trainer.outputs['model'],
model_blessing=evaluator.outputs['blessing'],
push_destination=pusher_pb2.PushDestination(
filesystem=pusher_pb2.PushDestination.Filesystem(
base_directory=serving_model_dir_lite)))
components = [
example_gen, statistics_gen, schema_gen, example_validator, transform,
trainer, model_resolver, evaluator, pusher
]
return pipeline.Pipeline(
pipeline_name=pipeline_name,
pipeline_root=pipeline_root,
components=components,
enable_cache=True,
metadata_connection_config=metadata.sqlite_metadata_connection_config(
metadata_path),
beam_pipeline_args=beam_pipeline_args)
# To run this pipeline from the python CLI:
# $python cifar_pipeline_native_keras.py
if __name__ == '__main__':
loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
for logger in loggers:
logger.setLevel(logging.INFO)
logging.getLogger().setLevel(logging.INFO)
absl.logging.set_verbosity(absl.logging.FATAL)
BeamDagRunner().run(
_create_pipeline(
pipeline_name=_pipeline_name,
pipeline_root=_pipeline_root,
data_root=_data_root,
module_file=_module_file,
serving_model_dir_lite=_serving_model_dir_lite,
metadata_path=_metadata_path,
labels_path=_labels_path,
beam_pipeline_args=_beam_pipeline_args))
utils file:
# Lint as: python2, python3
# Copyright 2019 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Python source file includes CIFAR10 utils for Keras model.
The utilities in this file are used to build a model with native Keras.
This module file will be used in Transform and generic Trainer.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from typing import List, Text
import absl
import tensorflow as tf
import tensorflow_transform as tft
from tfx.components.trainer.fn_args_utils import DataAccessor
from tfx.components.trainer.fn_args_utils import FnArgs
from tfx.components.trainer.rewriting import converters
from tfx.components.trainer.rewriting import rewriter
from tfx.components.trainer.rewriting import rewriter_factory
from tfx.dsl.io import fileio
from tfx_bsl.tfxio import dataset_options
# import flatbuffers
# from tflite_support import metadata_schema_py_generated as _metadata_fb
# from tflite_support import metadata as _metadata
# When training on the whole dataset use following constants instead.
# This setting should give ~91% accuracy on the whole test set
# _TRAIN_DATA_SIZE = 50000
# _EVAL_DATA_SIZE = 10000
# _TRAIN_BATCH_SIZE = 64
# _EVAL_BATCH_SIZE = 64
# _CLASSIFIER_LEARNING_RATE = 3e-4
# _FINETUNE_LEARNING_RATE = 5e-5
# _CLASSIFIER_EPOCHS = 12
_TRAIN_DATA_SIZE = 128
_EVAL_DATA_SIZE = 128
_TRAIN_BATCH_SIZE = 32
_EVAL_BATCH_SIZE = 32
_CLASSIFIER_LEARNING_RATE = 1e-3
_FINETUNE_LEARNING_RATE = 7e-6
_CLASSIFIER_EPOCHS = 30
_IMAGE_KEY = 'image'
_LABEL_KEY = 'label'
_TFLITE_MODEL_NAME = 'tflite'
def _transformed_name(key):
return key + '_xf'
def _get_serve_image_fn(model, tf_transform_output):
"""Returns a function that feeds the input tensor into the model."""
model.tft_layer = tf_transform_output.transform_features_layer()
  @tf.function
def serve_image_fn(serialized_tf_examples):
feature_spec = tf_transform_output.raw_feature_spec()
feature_spec.pop(_LABEL_KEY)
parsed_features = tf.io.parse_example(serialized_tf_examples, feature_spec)
transformed_features = model.tft_layer(parsed_features)
return model(transformed_features)
return serve_image_fn
def _image_augmentation(image_features):
"""Perform image augmentation on batches of images .
Args:
image_features: a batch of image features
Returns:
The augmented image features
"""
batch_size = tf.shape(image_features)[0]
image_features = tf.image.random_flip_left_right(image_features)
image_features = tf.image.resize_with_crop_or_pad(image_features, 250, 250)
image_features = tf.image.random_crop(image_features,
(batch_size, 224, 224, 3))
return image_features
def _data_augmentation(feature_dict):
"""Perform data augmentation on batches of data.
Args:
feature_dict: a dict containing features of samples
Returns:
The feature dict with augmented features
"""
image_features = feature_dict[_transformed_name(_IMAGE_KEY)]
image_features = _image_augmentation(image_features)
feature_dict[_transformed_name(_IMAGE_KEY)] = image_features
return feature_dict
def _input_fn(file_pattern: List[Text],
data_accessor: DataAccessor,
tf_transform_output: tft.TFTransformOutput,
is_train: bool = False,
batch_size: int = 200) -> tf.data.Dataset:
"""Generates features and label for tuning/training.
Args:
file_pattern: List of paths or patterns of input tfrecord files.
data_accessor: DataAccessor for converting input to RecordBatch.
tf_transform_output: A TFTransformOutput.
is_train: Whether the input dataset is train split or not.
batch_size: representing the number of consecutive elements of returned
dataset to combine in a single batch
Returns:
A dataset that contains (features, indices) tuple where features is a
dictionary of Tensors, and indices is a single Tensor of label indices.
"""
dataset = data_accessor.tf_dataset_factory(
file_pattern,
dataset_options.TensorFlowDatasetOptions(
batch_size=batch_size, label_key=_transformed_name(_LABEL_KEY)),
tf_transform_output.transformed_metadata.schema)
# Apply data augmentation. We have to do data augmentation here because
# we need to apply data agumentation on-the-fly during training. If we put
# it in Transform, it will only be applied once on the whole dataset, which
# will lose the point of data augmentation.
if is_train:
dataset = dataset.map(lambda x, y: (_data_augmentation(x), y))
return dataset
def _freeze_model_by_percentage(model: tf.keras.Model, percentage: float):
"""Freeze part of the model based on specified percentage.
Args:
model: The keras model need to be partially frozen
percentage: the percentage of layers to freeze
Raises:
ValueError: Invalid values.
"""
if percentage < 0 or percentage > 1:
    raise ValueError('Freeze percentage should be between 0.0 and 1.0')
if not model.trainable:
raise ValueError(
'The model is not trainable, please set model.trainable to True')
num_layers = len(model.layers)
num_layers_to_freeze = int(num_layers * percentage)
for idx, layer in enumerate(model.layers):
if idx < num_layers_to_freeze:
layer.trainable = False
else:
layer.trainable = True
def _build_keras_model() -> tf.keras.Model:
"""Creates a Image classification model with MobileNet backbone.
Returns:
    The image classification Keras Model and the backbone MobileNet model
"""
# We create a MobileNet model with weights pre-trained on ImageNet.
# We remove the top classification layer of the MobileNet, which was
# used for classifying ImageNet objects. We will add our own classification
# layer for CIFAR10 later. We use average pooling at the last convolution
  # layer to get a 1D vector for classification, which is consistent with the
  # original MobileNet setup.
base_model = tf.keras.applications.MobileNet(
input_shape=(224, 224, 3),
include_top=False,
weights='imagenet',
pooling='avg')
base_model.input_spec = None
  # We add a Dropout layer at the top of the MobileNet backbone we just created
  # to prevent overfitting, and then a Dense layer for classifying CIFAR10 objects.
model = tf.keras.Sequential([
tf.keras.layers.InputLayer(
input_shape=(224, 224, 3), name=_transformed_name(_IMAGE_KEY)),
base_model,
tf.keras.layers.Dropout(0.1),
tf.keras.layers.Dense(10, activation='softmax')
])
  # Freeze the whole MobileNet backbone to first train the top classifier only.
_freeze_model_by_percentage(base_model, 1.0)
model.compile(
loss='sparse_categorical_crossentropy',
optimizer=tf.keras.optimizers.RMSprop(lr=_CLASSIFIER_LEARNING_RATE),
metrics=['sparse_categorical_accuracy'])
model.summary(print_fn=absl.logging.info)
return model, base_model
# TFX Transform will call this function.
def preprocessing_fn(inputs):
"""tf.transform's callback function for preprocessing inputs.
Args:
inputs: map from feature keys to raw not-yet-transformed features.
Returns:
Map from string feature key to transformed feature operations.
"""
outputs = {}
# tf.io.decode_png function cannot be applied on a batch of data.
# We have to use tf.map_fn
image_features = tf.map_fn(
lambda x: tf.io.decode_png(x[0], channels=3),
inputs[_IMAGE_KEY],
dtype=tf.uint8)
# image_features = tf.cast(image_features, tf.float32)
image_features = tf.image.resize(image_features, [224, 224])
image_features = tf.keras.applications.mobilenet.preprocess_input(
image_features)
outputs[_transformed_name(_IMAGE_KEY)] = image_features
#outputs["input_1"] = image_features
# TODO(b/157064428): Support label transformation for Keras.
# Do not apply label transformation as it will result in wrong evaluation.
outputs[_transformed_name(_LABEL_KEY)] = inputs[_LABEL_KEY]
return outputs
# TFX Trainer will call this function.
def run_fn(fn_args: FnArgs):
"""Train the model based on given args.
Args:
fn_args: Holds args used to train the model as name/value pairs.
Raises:
ValueError: if invalid inputs.
"""
tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)
baseline_path = fn_args.base_model
if baseline_path is not None:
model = tf.keras.models.load_model(os.path.join(baseline_path))
else:
train_dataset = _input_fn(
fn_args.train_files,
fn_args.data_accessor,
tf_transform_output,
is_train=True,
batch_size=_TRAIN_BATCH_SIZE)
eval_dataset = _input_fn(
fn_args.eval_files,
fn_args.data_accessor,
tf_transform_output,
is_train=False,
batch_size=_EVAL_BATCH_SIZE)
model, base_model = _build_keras_model()
absl.logging.info('Tensorboard logging to {}'.format(fn_args.model_run_dir))
# Write logs to path
tensorboard_callback = tf.keras.callbacks.TensorBoard(
log_dir=fn_args.model_run_dir, update_freq='batch')
# Our training regime has two phases: we first freeze the backbone and train
# the newly added classifier only, then unfreeze part of the backbone and
# fine-tune with classifier jointly.
steps_per_epoch = int(_TRAIN_DATA_SIZE / _TRAIN_BATCH_SIZE)
total_epochs = int(fn_args.train_steps / steps_per_epoch)
if _CLASSIFIER_EPOCHS > total_epochs:
raise ValueError('Classifier epochs is greater than the total epochs')
absl.logging.info('Start training the top classifier')
model.fit(
train_dataset,
epochs=_CLASSIFIER_EPOCHS,
steps_per_epoch=steps_per_epoch,
validation_data=eval_dataset,
validation_steps=fn_args.eval_steps,
callbacks=[tensorboard_callback])
absl.logging.info('Start fine-tuning the model')
# Unfreeze the top MobileNet layers and do joint fine-tuning
_freeze_model_by_percentage(base_model, 0.9)
# We need to recompile the model because layer properties have changed
model.compile(
loss='sparse_categorical_crossentropy',
optimizer=tf.keras.optimizers.RMSprop(lr=_FINETUNE_LEARNING_RATE),
metrics=['sparse_categorical_accuracy'])
model.summary(print_fn=absl.logging.info)
model.fit(
train_dataset,
initial_epoch=_CLASSIFIER_EPOCHS,
epochs=total_epochs,
steps_per_epoch=steps_per_epoch,
validation_data=eval_dataset,
validation_steps=fn_args.eval_steps,
callbacks=[tensorboard_callback])
# Prepare the TFLite model used for serving in MLKit
signatures = {
'serving_default':
_get_serve_image_fn(model,tf_transform_output).get_concrete_function(
tf.TensorSpec(
shape=[None],
dtype=tf.string,
name=_IMAGE_KEY))
}
temp_saving_model_dir = os.path.join(fn_args.serving_model_dir)
model.save(temp_saving_model_dir, save_format='tf', signatures=signatures)
# tfrw = rewriter_factory.create_rewriter(
# rewriter_factory.TFLITE_REWRITER,
# name='tflite_rewriter')
# converters.rewrite_saved_model(temp_saving_model_dir,
# fn_args.serving_model_dir, tfrw,
# rewriter.ModelType.TFLITE_MODEL)
# # Add necessary TFLite metadata to the model in order to use it within MLKit
# # TODO(dzats#): Handle label map file path more properly, currently
# # hard-coded.
# tflite_model_path = os.path.join(fn_args.serving_model_dir,
# _TFLITE_MODEL_NAME)
# # TODO(dzats#): Extend the TFLite rewriter to be able to add TFLite metadata
# ## to the model.
# _write_metadata(
# model_path=tflite_model_path,
# label_map_path=fn_args.custom_config['labels_path'],
# mean=[127.5],
# std=[127.5])
# fileio.rmtree(temp_saving_model_dir)
OK, I found the answer. Because the model is expecting the input_1 name, in _get_serve_image_fn I need to remap the dictionary key, like so:
def _get_serve_image_fn(model, tf_transform_output):
"""Returns a function that feeds the input tensor into the model."""
model.tft_layer = tf_transform_output.transform_features_layer()
  @tf.function
def serve_image_fn(serialized_tf_examples):
feature_spec = tf_transform_output.raw_feature_spec()
feature_spec.pop(_LABEL_KEY)
parsed_features = tf.io.parse_example(serialized_tf_examples, feature_spec)
transformed_features = model.tft_layer(parsed_features)
transformed_features[model.get_layer('mobilenet_1.00_224').layers[0].name] = transformed_features[_transformed_name(_IMAGE_KEY)]
del transformed_features[_transformed_name(_IMAGE_KEY)]
return model(transformed_features)
return serve_image_fn
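A quick sanity check after re-saving (a sketch, reusing the inspection code from earlier): the serving signature should still accept the serialized tf.Example strings, and calling it should no longer raise the input_1 key error:

import tensorflow as tf

loaded = tf.saved_model.load('./model/Format-Serving/')
serving_fn = loaded.signatures['serving_default']
# Inputs: the raw serialized-example strings the signature was exported with.
print(serving_fn.structured_input_signature)
# Outputs: the (None, 10) softmax tensor.
print(serving_fn.structured_outputs)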

estimator.train throws ValueError: model_fn should return an EstimatorSpec

Here's the code I'm using...
I've set a breakpoint at what is, for me, line 304...
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
Has anyone seen this? I'm certain I have the correct versions of TensorFlow and BERT installed.
The complete stack trace is as follows:
Exception has occurred: ValueError
model_fn should return an EstimatorSpec.
File "C:\Program Files\Python36\Lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1153, in _call_model_fn
raise ValueError('model_fn should return an EstimatorSpec.')
File "C:\Program Files\Python36\Lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1191, in _train_model_default
features, labels, ModeKeys.TRAIN, self.config)
File "C:\Program Files\Python36\Lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1161, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "C:\Program Files\Python36\Lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 370, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "C:\Users\brownru\eclipse-workspace\tiaaNLPPython\org\tiaa\ai\penelope\bertNLP\sentiment\sentiment.py", line 304, in <module>
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
File "C:\Program Files\Python36\Lib\runpy.py", line 85, in _run_code
exec(code, run_globals)
File "C:\Program Files\Python36\Lib\runpy.py", line 96, in _run_module_code
mod_name, mod_spec, pkg_name, script_name)
File "C:\Program Files\Python36\Lib\runpy.py", line 263, in run_path
pkg_name=pkg_name, script_name=fname)
ValueError: model_fn should return an EstimatorSpec.
This code is my attempt to run some Google Colab code from here:
https://colab.research.google.com/github/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb#scrollTo=t6Nukby2EB6-
# Copyright 2019 Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# install --proxy http://proxy.ops.tiaa-cref.org:8080 tensorFlow
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_estimator as tfe
from datetime import datetime
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
# Set the output directory for saving model file
# Optionally, set a GCP bucket location
OUTPUT_DIR = r'C:\Users\brownru\Documents\npsExplanationComplains\sentimentOutput'
#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = True #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = False #@param {type:"boolean"}
BUCKET = 'BUCKET_NAME' #@param {type:"string"}
if USE_BUCKET:
OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)
  #from google.colab import auth
#auth.authenticate_user()
if DO_DELETE:
try:
tf.gfile.DeleteRecursively(OUTPUT_DIR)
except:
# Doesn't matter if the directory didn't exist
pass
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))
'''
First, let's download the dataset, hosted by Stanford. The code below, which downloads, extracts, and imports the IMDB Large Movie Review Dataset, is borrowed from [this Tensorflow tutorial](https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub).
'''
from tensorflow import keras
import os
import re
# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
data = {}
data["sentence"] = []
data["sentiment"] = []
for file_path in os.listdir(directory):
with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
data["sentence"].append(f.read())
data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
return pd.DataFrame.from_dict(data)
# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
pos_df = load_directory_data(os.path.join(directory, "pos"))
neg_df = load_directory_data(os.path.join(directory, "neg"))
pos_df["polarity"] = 1
neg_df["polarity"] = 0
return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)
# Download and process the dataset files.
def download_and_load_datasets():
#dataset = tf.keras.utils.get_file(fname="aclImdb.tar.gz", origin="http://chapdc3sas51.ops.tiaa-cref.org/nlpAssets/aclImdb_v1.tar.gz", extract=True)
trainPath = r'C:\Users\brownru\.keras\datasets\aclImdb\train'
testPath = r'C:\Users\brownru\.keras\datasets\aclImdb\test'
train_df = load_dataset(trainPath)
test_df = load_dataset(testPath)
return train_df, test_df
train, test = download_and_load_datasets()
#To keep training fast, we'll take a sample of 5000 train and test examples, respectively.
train = train.sample(5000)
test = test.sample(5000)
train.columns
#Index(['sentence', 'sentiment', 'polarity'], dtype='object')
#For us, our input data is the 'sentence' column and our label is the 'polarity' column (0, 1 for negative and positive, respectively)
DATA_COLUMN = 'sentence'
LABEL_COLUMN = 'polarity'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]
#Data Preprocessing: We'll need to transform our data into a format BERT understands. This involves two steps. First, we create InputExamples using the constructor provided in the BERT library.
#text_a is the text we want to classify, which in this case is the 'sentence' column in our DataFrame.
#text_b is used if we're training a model to understand the relationship between sentences (i.e. is text_b a translation of text_a? Is text_b an answer to the question asked by text_a?). This doesn't apply to our task, so we can leave text_b blank.
#label is the label for our example, i.e. True, False
# Use the InputExample class from BERT's run_classifier code to create examples from the data
train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
text_a = x[DATA_COLUMN],
text_b = None,
label = x[LABEL_COLUMN]), axis = 1)
test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None,
text_a = x[DATA_COLUMN],
text_b = None,
label = x[LABEL_COLUMN]), axis = 1)
# This is a path to an uncased (all lowercase) version of BERT
BERT_MODEL_HUB = "http://chapdc3sas51.ops.tiaa-cref.org/nlpAssets/1.tar.gz"
def create_tokenizer_from_hub_module():
with tf.Graph().as_default():
bert_module = hub.Module(BERT_MODEL_HUB)
tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
with tf.Session() as sess:
vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],tokenization_info["do_lower_case"]])
return bert.tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
tokenizer = create_tokenizer_from_hub_module()
tokenizer.tokenize("This here's an example of using the BERT tokenizer")
# We'll set sequences to be at most 128 tokens long.
MAX_SEQ_LENGTH = 128
# Convert our train and test features to InputFeatures that BERT understands.
train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
#Creating a model
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels, num_labels):
#Creates a classification model.
bert_module = hub.Module(BERT_MODEL_HUB,trainable=True)
bert_inputs = dict(input_ids=input_ids,input_mask=input_mask,segment_ids=segment_ids)
bert_outputs = bert_module(inputs=bert_inputs,signature="tokens",as_dict=True)
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_outputs" for token-level output.
output_layer = bert_outputs["pooled_output"]
hidden_size = output_layer.shape[-1].value
# Create our own layer to tune for politeness data.
output_weights = tf.get_variable("output_weights", [num_labels, hidden_size],initializer=tf.truncated_normal_initializer(stddev=0.02))
output_bias = tf.get_variable("output_bias", [num_labels], initializer=tf.zeros_initializer())
with tf.variable_scope("loss"):
# Dropout helps prevent overfitting
output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
logits = tf.matmul(output_layer, output_weights, transpose_b=True)
logits = tf.nn.bias_add(logits, output_bias)
log_probs = tf.nn.log_softmax(logits, axis=-1)
# Convert labels into one-hot encoding
one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))
# If we're predicting, we want predicted labels and the probabilities.
if is_predicting:
return (predicted_labels, log_probs)
# If we're train/eval, compute loss between predicted and actual label
per_example_loss = tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
loss = tf.reduce_mean(per_example_loss)
return (loss, predicted_labels, log_probs)
'''Next we'll wrap our model function in a model_fn_builder function that adapts our model to work for training, evaluation, and prediction.'''
# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps,
num_warmup_steps):
# Returns a `model_fn` closure for the Estimator.
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument
# The `model_fn` for the Estimator.
input_ids = features["input_ids"]
input_mask = features["input_mask"]
segment_ids = features["segment_ids"]
label_ids = features["label_ids"]
is_predicting = (mode == tfe.estimator.ModeKeys.PREDICT)
# TRAIN and EVAL
if not is_predicting:
(loss, predicted_labels, log_probs) = create_model(is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)
train_op = bert.optimization.create_optimizer(loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)
# Calculate evaluation metrics.
def metric_fn(label_ids, predicted_labels):
accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
f1_score = tf.contrib.metrics.f1_score(
label_ids,
predicted_labels)
auc = tf.metrics.auc(
label_ids,
predicted_labels)
recall = tf.metrics.recall(
label_ids,
predicted_labels)
precision = tf.metrics.precision(
label_ids,
predicted_labels)
true_pos = tf.metrics.true_positives(
label_ids,
predicted_labels)
true_neg = tf.metrics.true_negatives(
label_ids,
predicted_labels)
false_pos = tf.metrics.false_positives(
label_ids,
predicted_labels)
false_neg = tf.metrics.false_negatives(
label_ids,
predicted_labels)
return {
"eval_accuracy": accuracy,
"f1_score": f1_score,
"auc": auc,
"precision": precision,
"recall": recall,
"true_positives": true_pos,
"true_negatives": true_neg,
"false_positives": false_pos,
"false_negatives": false_neg
}
eval_metrics = metric_fn(label_ids, predicted_labels)
if mode == tfe.estimator.ModeKeys.TRAIN:
return tfe.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
else:
return tfe.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metrics)
else:
(predicted_labels, log_probs) = create_model(is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)
predictions = {'probabilities': log_probs, 'labels': predicted_labels}
return tfe.estimator.EstimatorSpec(mode, predictions=predictions)
# Return the actual model function in the closure
return model_fn
# Compute train and warmup steps from batch size
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where the learning rate
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100
# Compute # train and warmup steps from batch size
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
# Specify output directory and number of checkpoint steps to save
run_config = tfe.estimator.RunConfig(
model_dir=OUTPUT_DIR,
save_summary_steps=SAVE_SUMMARY_STEPS,
save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)
model_fn = model_fn_builder(
num_labels=len(label_list),
learning_rate=LEARNING_RATE,
num_train_steps=num_train_steps,
num_warmup_steps=num_warmup_steps)
estimator = tfe.estimator.Estimator(
model_fn=model_fn,
config=run_config,
params={"batch_size": BATCH_SIZE}
)
# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = bert.run_classifier.input_fn_builder(
features=train_features,
seq_length=MAX_SEQ_LENGTH,
is_training=True,
drop_remainder=False)
#Now we train our model! For me, using a Colab notebook running on Google's GPUs, my training time was about 14 minutes.
print('Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)
#Now let's use our test data to see how well our model did:
test_input_fn = run_classifier.input_fn_builder(
features=test_features,
seq_length=MAX_SEQ_LENGTH,
is_training=False,
drop_remainder=False)
estimator.evaluate(input_fn=test_input_fn, steps=None)
def getPrediction(in_sentences):
labels = ["Negative", "Positive"]
input_examples = [run_classifier.InputExample(guid="", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, "" is just a dummy label
input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)
predictions = estimator.predict(predict_input_fn)
return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)]
pred_sentences = [
"That movie was absolutely awful",
"The acting was a bit lacking",
"The film was creative and surprising",
"Absolutely fantastic!"
]
predictions = getPrediction(pred_sentences)
predictions
Horrifyingly, the answer to this problem was all about indentation. The Google Colab example posted above defines a function called model_fn, which model_fn_builder wraps to create the model function that is passed to the TensorFlow Estimator. While debugging this in VS Code I'd placed a breakpoint in the function to work out what was happening, and execution kept skipping over the middle section that computes the evaluation metrics (false_pos, false_neg, etc.).
Evidently I'd broken the indentation somehow while editing in VS Code, and the functions ended up nested in a way that pylint didn't flag as a syntax problem; the interpreter simply skipped over the mis-indented block.
The fix was to recopy the entire model_fn function from the Colab notebook, and voilà, it worked.
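For anyone comparing against their own copy, here is a minimal sketch of the nesting that matters (names follow the snippet above; the bodies are elided, so this is illustrative rather than the full model):
import tensorflow as tf
def model_fn_builder(num_labels, learning_rate, num_train_steps, num_warmup_steps):
    def model_fn(features, labels, mode, params):  # must be nested inside the builder
        is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)
        if not is_predicting:
            # metric_fn must sit here, inside model_fn; if its indentation
            # slips out a level, Python still parses the file without
            # complaint, but the Estimator gets a model_fn that never runs
            # this block.
            def metric_fn(label_ids, predicted_labels):
                return {"eval_accuracy": tf.metrics.accuracy(label_ids, predicted_labels)}
        # ... loss / EstimatorSpec construction elided ...
    return model_fn  # returned by the builder, not defined beside it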

Distributed Tensorflow: non-chief worker blocked

I am trying out distributed TensorFlow, and my code is shown below. The problem is that the chief worker runs as expected; however, a non-chief worker blocks at:
sess = sv.prepare_or_wait_for_session(target, config=sess_config)
Could anybody help me solve this problem?
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A library to train Inception using multiple replicas with synchronous update.
Please see accompanying README.md for details and instructions.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from datetime import datetime
import os.path
import time
import numpy as np
import tensorflow as tf
from inception.slim.datasets import dataset_factory
from inception.slim.nets import nets_factory
from inception.slim.preprocessing import preprocessing_factory
from inception import inception_model as inception
from inception.slim import slim
#from inception import image_processing
sslim = tf.contrib.slim
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string(
'dataset_name', 'imagenet', 'The name of the dataset to load.')
tf.app.flags.DEFINE_string(
'dataset_split_name', 'train', 'The name of the train/test split.')
tf.app.flags.DEFINE_integer(
'train_image_size', None, 'Train image size')
tf.app.flags.DEFINE_string(
'dataset_dir', None, 'The directory where the dataset files are stored.')
tf.app.flags.DEFINE_string('job_name', '', 'One of "ps", "worker"')
tf.app.flags.DEFINE_string('ps_hosts', '',
"""Comma-separated list of hostname:port for the """
"""parameter server jobs. e.g. """
"""'machine1:2222,machine2:1111,machine2:2222'""")
tf.app.flags.DEFINE_string('worker_hosts', '',
"""Comma-separated list of hostname:port for the """
"""worker jobs. e.g. """
"""'machine1:2222,machine2:1111,machine2:2222'""")
tf.app.flags.DEFINE_float(
'weight_decay', 0.00004, 'The weight decay on the model weights.')
tf.app.flags.DEFINE_string('train_dir', '/tmp/imagenet_train',
"""Directory where to write event logs """
"""and checkpoint.""")
tf.app.flags.DEFINE_integer('max_steps', 100, 'Number of batches to run.')
tf.app.flags.DEFINE_string('subset', 'train', 'Either "train" or "validation".')
tf.app.flags.DEFINE_boolean('log_device_placement', False,
'Whether to log device placement.')
tf.app.flags.DEFINE_string(
'model_name', 'inception_v3', 'The name of the architecture to train.')
tf.app.flags.DEFINE_integer(
'batch_size', 32, 'The number of samples in each batch.')
tf.app.flags.DEFINE_string(
'preprocessing_name', None, 'The name of the preprocessing to use. If left '
'as `None`, then the model_name flag is used.')
# Task ID is used to select the chief and also to access the local_step for
# each replica to check staleness of the gradients in sync_replicas_optimizer.
tf.app.flags.DEFINE_integer(
'task_id', 0, 'Task ID of the worker/replica running the training.')
# More details can be found in the sync_replicas_optimizer class:
# tensorflow/python/training/sync_replicas_optimizer.py
tf.app.flags.DEFINE_integer('num_replicas_to_aggregate', -1,
"""Number of gradients to collect before """
"""updating the parameters.""")
tf.app.flags.DEFINE_integer('save_interval_secs', 10 * 60,
'Save interval seconds.')
tf.app.flags.DEFINE_integer('save_summaries_secs', 10 * 60,
'Save summaries interval seconds.')
# **IMPORTANT**
# Please note that this learning rate schedule is heavily dependent on the
# hardware architecture, batch size and any changes to the model architecture
# specification. Selecting a finely tuned learning rate schedule is an
# empirical process that requires some experimentation. Please see README.md
# for more guidance and discussion.
#
# Learning rate decay factor selected from https://arxiv.org/abs/1604.00981
tf.app.flags.DEFINE_float('initial_learning_rate', 0.045,
'Initial learning rate.')
tf.app.flags.DEFINE_float('num_epochs_per_decay', 2.0,
'Epochs after which learning rate decays.')
tf.app.flags.DEFINE_float('learning_rate_decay_factor', 0.94,
'Learning rate decay factor.')
# Constants dictating the learning rate schedule.
RMSPROP_DECAY = 0.9 # Decay term for RMSProp.
RMSPROP_MOMENTUM = 0.9 # Momentum in RMSProp.
RMSPROP_EPSILON = 1.0 # Epsilon term for RMSProp.
def train(target, dataset, cluster_spec):
"""Train Inception on a dataset for a number of steps."""
# The numbers of workers and parameter servers are inferred from the worker
# and ps hosts strings.
num_workers = len(cluster_spec.as_dict()['worker'])
num_parameter_servers = len(cluster_spec.as_dict()['ps'])
# If no value is given, num_replicas_to_aggregate defaults to the number of
# workers.
if FLAGS.num_replicas_to_aggregate == -1:
num_replicas_to_aggregate = num_workers
else:
num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate
# Both should be greater than 0 in a distributed training.
assert num_workers > 0 and num_parameter_servers > 0, (' num_workers and '
'num_parameter_servers'
' must be > 0.')
# Choose worker 0 as the chief. Note that any worker could be the chief
# but there should be only one chief.
is_chief = (FLAGS.task_id == 0)
# Ops are assigned to worker by default.
with tf.device('/job:worker/task:%d' % FLAGS.task_id):
# Variables and their related init/assign ops are assigned to ps.
with slim.scopes.arg_scope(
[slim.variables.variable, slim.variables.global_step],
device=slim.variables.VariableDeviceChooser(num_parameter_servers)):
# Create a variable to count the number of train() calls. This equals the
# number of updates applied to the variables.
global_step = slim.variables.global_step()
# Calculate the learning rate schedule.
num_batches_per_epoch = (dataset.num_examples_per_epoch() /
FLAGS.batch_size)
# Decay steps need to be divided by the number of replicas to aggregate.
decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay /
num_replicas_to_aggregate)
# Decay the learning rate exponentially based on the number of steps.
lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
global_step,
decay_steps,
FLAGS.learning_rate_decay_factor,
staircase=True)
# Add a summary to track the learning rate.
tf.summary.scalar('learning_rate', lr)
# Create an optimizer that performs gradient descent.
opt = tf.train.RMSPropOptimizer(lr,
RMSPROP_DECAY,
momentum=RMSPROP_MOMENTUM,
epsilon=RMSPROP_EPSILON)
dataset = dataset_factory.get_dataset(
FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
network_fn = nets_factory.get_network_fn(
FLAGS.model_name,
num_classes=(dataset.num_classes),
weight_decay=FLAGS.weight_decay,
is_training=True)
preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
image_preprocessing_fn = preprocessing_factory.get_preprocessing(
preprocessing_name,
is_training=True)
provider = sslim.dataset_data_provider.DatasetDataProvider(
dataset,
num_readers=4,
common_queue_capacity=20 * FLAGS.batch_size,
common_queue_min=10 * FLAGS.batch_size)
[image, label] = provider.get(['image', 'label'])
train_image_size = FLAGS.train_image_size or network_fn.default_image_size
image = image_preprocessing_fn(image, train_image_size, train_image_size)
images, labels = tf.train.batch(
[image, label],
batch_size=FLAGS.batch_size,
num_threads=4,
capacity=5 * FLAGS.batch_size)
# Number of classes in the Dataset label set plus 1.
# Label 0 is reserved for an (unused) background class.
num_classes = 1001
logits, end_points = network_fn(images)
batch_size=FLAGS.batch_size
# Add classification loss.
sparse_labels = tf.reshape(labels, [batch_size, 1])
indices = tf.reshape(tf.range(batch_size), [batch_size, 1])
#concated = tf.concat(1, [indices, sparse_labels])
sparse_labels = tf.cast(sparse_labels, tf.int32)
concated = tf.concat([indices, sparse_labels], 1)
dense_labels = tf.sparse_to_dense(concated,
[batch_size, 1001],
1.0, 0.0)
slim.losses.cross_entropy_loss(
logits, dense_labels, label_smoothing=0.01, weight=1.0)
# Gather all of the losses including regularization losses.
losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
total_loss = tf.add_n(losses, name='total_loss')
if is_chief:
# Compute the moving average of all individual losses and the
# total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss;
# do the same for the averaged version of the losses.
for l in losses + [total_loss]:
loss_name = l.op.name
# Name each loss as '(raw)' and name the moving average version of the
# loss as the original loss name.
tf.summary.scalar(loss_name + '_raw', l)
tf.summary.scalar(loss_name, loss_averages.average(l))
# Add dependency to compute loss_averages.
with tf.control_dependencies([loss_averages_op]):
total_loss = tf.identity(total_loss)
# Track the moving averages of all trainable variables.
# Note that we maintain a 'double-average' of the BatchNormalization
# global statistics.
# This is not needed when the number of replicas is small, but it is
# important for synchronous distributed training with tens of workers/replicas.
exp_moving_averager = tf.train.ExponentialMovingAverage(
inception.MOVING_AVERAGE_DECAY, global_step)
variables_to_average = (
tf.trainable_variables() + tf.moving_average_variables())
# Add histograms for model variables.
for var in variables_to_average:
tf.summary.histogram(var.op.name, var)
# Create synchronous replica optimizer.
opt = tf.train.SyncReplicasOptimizer(
opt,
replicas_to_aggregate=num_replicas_to_aggregate,
total_num_replicas=num_workers,
variable_averages=exp_moving_averager,
variables_to_average=variables_to_average)
# Compute gradients with respect to the loss.
grads = opt.compute_gradients(total_loss)
# Add histograms for gradients.
for grad, var in grads:
if grad is not None:
tf.summary.histogram(var.op.name + '/gradients', grad)
apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)
with tf.control_dependencies([apply_gradients_op]):
train_op = tf.identity(total_loss, name='train_op')
# Get the chief queue_runners and init_tokens, which are used to
# synchronize replicas.
# More details can be found in sync_replicas_optimizer.
chief_queue_runners = [opt.get_chief_queue_runner()]
init_tokens_op = opt.get_init_tokens_op()
# Build the summary operation based on the TF collection of Summaries.
summary_op = tf.summary.merge_all()
# Build an initialization operation to run below.
init_op = tf.global_variables_initializer()
# We run the summaries in the same thread as the training operations by
# passing in None for summary_op to avoid a summary_thread being started.
# Running summaries and training operations in parallel could run out of
# GPU memory.
sv = tf.train.Supervisor(is_chief=is_chief,
logdir=FLAGS.train_dir,
init_op=init_op,
summary_op=None,
global_step=global_step,
#saver=saver,
saver=None,
save_model_secs=FLAGS.save_interval_secs)
tf.logging.info('%s Supervisor' % datetime.now())
sess_config = tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement)
# Get a session.
sess = sv.prepare_or_wait_for_session(target, config=sess_config)
# Start the queue runners.
queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
sv.start_queue_runners(sess, queue_runners)
tf.logging.info('Started %d queues for processing input data.',
len(queue_runners))
if is_chief:
sv.start_queue_runners(sess, chief_queue_runners)
sess.run(init_tokens_op)
# Train, checking for NaNs. Concurrently run the summary operation at a
# specified interval. Note that the summary_op and train_op never run
# simultaneously in order to prevent running out of GPU memory.
#sess = sv.managed_session(target)
next_summary_time = time.time() + FLAGS.save_summaries_secs
while not sv.should_stop():
try:
start_time = time.time()
loss_value, step = sess.run([train_op, global_step])
assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
if step > FLAGS.max_steps:
break
duration = time.time() - start_time
if step % 10 == 0:
examples_per_sec = FLAGS.batch_size / float(duration)
format_str = ('Worker %d: %s: step %d, loss = %.2f '
'(%.1f examples/sec; %.3f sec/batch)')
tf.logging.info(format_str %
(FLAGS.task_id, datetime.now(), step, loss_value,
examples_per_sec, duration))
# Determine if the summary_op should be run on the chief worker.
if is_chief and next_summary_time < time.time():
tf.logging.info('Running Summary operation on the chief.')
summary_str = sess.run(summary_op)
sv.summary_computed(sess, summary_str)
tf.logging.info('Finished running Summary operation.')
# Determine the next time for running the summary.
next_summary_time += FLAGS.save_summaries_secs
except:
if is_chief:
tf.logging.info('About to execute sync_clean_up_op!')
#sess.run(clean_up_op)
raise
# Stop the supervisor. This also waits for service threads to finish.
sv.stop()
SyncReplicasOptimizer creates a local_step counter, which is a local variable. VariableDeviceChooser doesn't distinguish global variables from local ones, so this setup won't work until the device chooser is fixed. Thanks for reporting it, though.
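In the meantime, a possible workaround (a minimal sketch, assuming TF 1.x; the cluster hosts and task id below are placeholders, not from the question) is to place variables with tf.train.replica_device_setter, which pins variables to the ps job while keeping everything else, including local variables, on the worker:
import tensorflow as tf
# Hypothetical cluster; replace the hosts with your own ps/worker addresses.
cluster = tf.train.ClusterSpec({
    "ps": ["machine1:2222"],
    "worker": ["machine2:2222", "machine3:2222"],
})
task_id = 0  # this worker's index
with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_id,
        cluster=cluster)):
    global_step = tf.train.get_or_create_global_step()
    # build the model, losses, and SyncReplicasOptimizer here, as above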
I'm also concerned about this issue. Can you post your command line here?

Exporting cifar10 model from checkpoint file to tensorflow serving

I tried to modify inception_export.py for the CIFAR10 model, but I get these errors:
raise type(e)(node_def, op, message) tensorflow.python.framework.errors.InvalidArgumentError: Assign requires shapes of both tensors to match. lhs shape= [18,384] rhs shape= [2304,384]
[[Node: save/Assign_5 = Assign[T=DT_FLOAT, _class=["loc:#local3/weights"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/cpu:0"](local3/weights, save/restore_slice_5)]] Caused by op u'save/Assign_5', defined at:
I am still very new to TensorFlow; any help is much appreciated, thanks.
EDIT1: here is my code. I haven't installed TensorFlow Serving, so the related block is commented out. I also changed the image_size to 24 to fit the CIFAR10 model.
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#!/usr/bin/env python2.7
"""Modified for CIFAR10 model from https://github.com/tensorflow/serving/blob/master/tensorflow_serving/example/inception_export.py
"""
import os.path
import sys
# This is a placeholder for a Google-internal import.
import tensorflow as tf
from tensorflow.models.image.cifar10 import cifar10
#from inception import inception_model
#from tensorflow_serving.session_bundle import exporter
tf.app.flags.DEFINE_string('checkpoint_dir', '/tmp/cifar10_train',
"""Directory where to read training checkpoints.""")
tf.app.flags.DEFINE_string('export_dir', '/tmp/cifar10_export',
"""Directory where to export inference model.""")
tf.app.flags.DEFINE_integer('image_size', 24,
"""Needs to provide same value as in training.""")
FLAGS = tf.app.flags.FLAGS
NUM_CLASSES = 10
NUM_TOP_CLASSES = 2
WORKING_DIR = os.path.dirname(os.path.realpath(__file__))
SYNSET_FILE = os.path.join(WORKING_DIR, 'imagenet_lsvrc_2015_synsets.txt')
METADATA_FILE = os.path.join(WORKING_DIR, 'imagenet_metadata.txt')
def export():
"""can be deleted if my simply define the constant string manually below?
# Create index->synset mapping
synsets = []
with open(SYNSET_FILE) as f:
synsets = f.read().splitlines()
# Create synset->metadata mapping
texts = {}
with open(METADATA_FILE) as f:
for line in f.read().splitlines():
parts = line.split('\t')
assert len(parts) == 2
texts[parts[0]] = parts[1]
"""
with tf.Graph().as_default():
# Build inference model.
# Please refer to Tensorflow inception model for details.
# Input transformation.
# TODO(b/27776734): Add batching support.
jpegs = tf.placeholder(tf.string, shape=(1))
image_buffer = tf.squeeze(jpegs, [0])
# Decode the string as an RGB JPEG.
# Note that the resulting image contains an unknown height and width
# that is set dynamically by decode_jpeg. In other words, the height
# and width of image is unknown at compile-time.
image = tf.image.decode_jpeg(image_buffer, channels=3)
# After this point, all image pixels reside in [0,1)
# until the very end, when they're rescaled to (-1, 1). The various
# adjust_* ops all require this range for dtype float.
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
# Crop the central region of the image with an area containing 87.5% of
# the original image.
image = tf.image.central_crop(image, central_fraction=0.875)
# Resize the image to the height and width expected by the model.
image = tf.expand_dims(image, 0)
image = tf.image.resize_bilinear(image,
[FLAGS.image_size, FLAGS.image_size],
align_corners=False)
image = tf.squeeze(image, [0])
# Finally, rescale to [-1,1] instead of [0, 1)
image = tf.sub(image, 0.5)
image = tf.mul(image, 2.0)
images = tf.expand_dims(image, 0)
# Run inference.
logits = cifar10.inference(images)
# Transform output to topK result.
values, indices = tf.nn.top_k(logits, NUM_TOP_CLASSES)
# Create a constant string Tensor where the i'th element is
# the human readable class description for the i'th index.
# Note that the 0th index is an unused background class
# (see inception model definition code).
class_descriptions = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
# for s in synsets:
# class_descriptions.append(texts[s])
class_tensor = tf.constant(class_descriptions)
classes = tf.contrib.lookup.index_to_string(tf.to_int64(indices),
mapping=class_tensor)
# Restore variables from training checkpoint.
variable_averages = tf.train.ExponentialMovingAverage(
cifar10.MOVING_AVERAGE_DECAY)
variables_to_restore = variable_averages.variables_to_restore()
saver = tf.train.Saver(variables_to_restore)
with tf.Session() as sess:
# Restore variables from training checkpoints.
ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
if ckpt and ckpt.model_checkpoint_path:
saver.restore(sess, ckpt.model_checkpoint_path)
# Assuming model_checkpoint_path looks something like:
# /my-favorite-path/imagenet_train/model.ckpt-0,
# extract global_step from it.
global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
print('Successfully loaded model from %s at step=%s.' %
(ckpt.model_checkpoint_path, global_step))
else:
print('No checkpoint file found at %s' % FLAGS.checkpoint_dir)
return
""" Not exporting yet because I haven't installed tensorflow serving
# Export inference model.
init_op = tf.group(tf.initialize_all_tables(), name='init_op')
model_exporter = exporter.Exporter(saver)
signature = exporter.classification_signature(
input_tensor=jpegs, classes_tensor=classes, scores_tensor=values)
model_exporter.init(default_graph_signature=signature, init_op=init_op)
model_exporter.export(FLAGS.export_dir, tf.constant(global_step), sess)
print('Successfully exported model to %s' % FLAGS.export_dir)
"""
def main(unused_argv=None):
export()
if __name__ == '__main__':
tf.app.run()
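No fix was posted for this one, but the shape arithmetic points at a likely cause (my reading, not a confirmed answer): cifar10.inference flattens pool2 using FLAGS.batch_size, which defaults to 128 in cifar10.py, while this export graph feeds a single image. A 24x24 input leaves pool2 with 6*6*64 = 2304 values per image, and 2304 / 128 = 18, which is exactly the mismatched lhs shape [18, 384]. If that's right, forcing the batch size to 1 before building the inference graph should make local3/weights come out as [2304, 384] to match the checkpoint:
# Sketch of the suspected fix; images is the single-image batch built in
# export() above. Set the flag before cifar10.inference constructs local3,
# so the pool2 flattening uses a batch size of 1.
FLAGS.batch_size = 1
logits = cifar10.inference(images)  # local3/weights now [2304, 384]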