While trying to follow the sample code for the TensorFlow Linear Model Tutorial, I made no change to the structure of the code; only the columns are changed.
While running, I encounter the error "ValueError: Features are incompatible with given information."
Given Features:
'COLUMN_1': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x00000000094C2D30>
'COLUMN_2': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x000000000C4D4B38>
'COLUMN_3': <tf.Tensor 'Const_1:0' shape=(3,) dtype=float64>
Required signatures:
'COLUMN_1': TensorSignature(dtype=tf.string, shape=None, is_sparse=True)
'COLUMN_2': TensorSignature(dtype=tf.string, shape=None, is_sparse=True)
'COLUMN_3': TensorSignature(dtype=tf.float64, shape=TensorShape([Dimension(3)]), is_sparse=False)
The issue is that the test data and the training data are processed by the same function, so the output structure should be the same.
I tested using the training data as the test data, and it works with no error.
So how does the data in the test set affect the feature signature?
The same error has been asked about here.
Full error message:
Traceback (most recent call last):
File "C:\Users\USERA\Desktop\USERA\New Projects 2017\Machine Learning\Renewal\TensorFlowLinearModelRenewal.py", line 308, in <module>
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
File "C:\Users\USERA\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\platform\app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "C:\Users\USERA\Desktop\USERA\New Projects 2017\Machine Learning\Renewal\TensorFlowLinearModelRenewal.py", line 270, in main
FLAGS.train_data, FLAGS.test_data)
File "C:\Users\USERA\Desktop\USERA\New Projects 2017\Machine Learning\Renewal\TensorFlowLinearModelRenewal.py", line 255, in train_and_eval
results = m.evaluate(input_fn=lambda: input_fn(df_test), steps=1)
File "C:\Users\USERA\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\util\deprecation.py", line 289, in new_func
return func(*args, **kwargs)
File "C:\Users\USERA\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\contrib\learn\python\learn\estimators\estimator.py", line 543, in evaluate
log_progress=log_progress)
File "C:\Users\USERA\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\contrib\learn\python\learn\estimators\estimator.py", line 827, in _evaluate_model
self._check_inputs(features, labels)
File "C:\Users\USERA\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\contrib\learn\python\learn\estimators\estimator.py", line 757, in _check_inputs
(str(features), str(self._features_info)))
ValueError: Features are incompatible with given information. Given features: {'POL_SRC_BUS_CODE': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x00000000152A2320>, 'POL_SUB_PRD_CODE': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x00000000120DE080>, 'ASSR_TYPE': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x00000000152A2518>, 'POL_PYMT_MODE': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x000000000E523FD0>, 'POL_PREM_RENEWAL': <tf.Tensor 'Const_2:0' shape=(12,) dtype=float64>, 'POL_JACKET_CODE': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000000015D4C7F0>, 'POL_AGE': <tf.Tensor 'Const:0' shape=(12,) dtype=int64>, 'ASSR_GENDER': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x000000000D92C2B0>, 'ACCOUNT_CLASS': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x000000000B40FE80>, 'POL_PREM_ORIGINAL': <tf.Tensor 'Const_1:0' shape=(12,) dtype=float64>, 'POL_SCHEME_PLAN': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x00000000152A2908>, 'POL_CUST_CODE': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x000000000BCF27F0>, 'CLAIM_INCURRED': <tf.Tensor 'Const_4:0' shape=(12,) dtype=float64>, 'POL_END_NO_IDX': <tf.Tensor 'Const_3:0' shape=(12,) dtype=int64>, 'ASSR_MAR_STATUS': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x00000000152A2208>, 'POL_OCC_DESC': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x00000000152A27F0>, 'ASSR_NATIONALITY': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x000000000DB4F588>}, required signatures: {'ASSR_GENDER': TensorSignature(dtype=tf.string, shape=None, is_sparse=True), 'POL_PREM_RENEWAL': TensorSignature(dtype=tf.float64, shape=TensorShape([Dimension(21)]), is_sparse=False), 'POL_PREM_ORIGINAL': TensorSignature(dtype=tf.float64, shape=TensorShape([Dimension(21)]), is_sparse=False), 'POL_SUB_PRD_CODE': TensorSignature(dtype=tf.string, shape=None, is_sparse=True), 'ASSR_NATIONALITY': TensorSignature(dtype=tf.string, shape=None, is_sparse=True), 'POL_CUST_CODE': TensorSignature(dtype=tf.string, shape=None, is_sparse=True), 'POL_SRC_BUS_CODE': TensorSignature(dtype=tf.string, shape=None, is_sparse=True), 'ASSR_MAR_STATUS': TensorSignature(dtype=tf.string, shape=None, is_sparse=True), 'CLAIM_INCURRED': TensorSignature(dtype=tf.int64, shape=TensorShape([Dimension(21)]), is_sparse=False), 'POL_END_NO_IDX': TensorSignature(dtype=tf.int64, shape=TensorShape([Dimension(21)]), is_sparse=False), 'POL_SCHEME_PLAN': TensorSignature(dtype=tf.string, shape=None, is_sparse=True), 'ASSR_TYPE': TensorSignature(dtype=tf.string, shape=None, is_sparse=True), 'POL_PYMT_MODE': TensorSignature(dtype=tf.string, shape=None, is_sparse=True), 'ACCOUNT_CLASS': TensorSignature(dtype=tf.string, shape=None, is_sparse=True), 'POL_JACKET_CODE': TensorSignature(dtype=tf.string, shape=None, is_sparse=True), 'POL_OCC_DESC': TensorSignature(dtype=tf.string, shape=None, is_sparse=True), 'POL_AGE': TensorSignature(dtype=tf.int64, shape=TensorShape([Dimension(21)]), is_sparse=False)}.
My code:
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Example code for TensorFlow Wide & Deep Tutorial using TF.Learn API."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import sys
import tempfile
from six.moves import urllib
import pandas as pd
import tensorflow as tf
COLUMNS = ["EXPIRY_MONTH", "POL_SYS_ID","POL_DEPT_CODE","ACCOUNT_CLASS","POL_CUST_CODE","POL_SUB_PRD_CODE",
"POL_SRC_BUS_CODE","POL_OCC_DESC","POL_AGE","POL_INSURED_ID","ASSR_TYPE","ASSR_GENDER","ASSR_MAR_STATUS","ASSR_NATIONALITY",
"POL_SCHEME_PLAN","POL_PYMT_MODE","POL_POSTAL_CODE","POL_JACKET_CODE","POL_PREM_ORIGINAL","POL_PREM_RENEWAL","POL_END_NO_IDX",
"CLAIM_INCURRED","REVIEW_STATUS",]
LABEL_COLUMN = "label"
CATEGORICAL_COLUMNS = [ "ACCOUNT_CLASS", "POL_CUST_CODE", "POL_SUB_PRD_CODE",
"POL_SRC_BUS_CODE", "POL_OCC_DESC", "ASSR_TYPE", "ASSR_GENDER",
"ASSR_MAR_STATUS", "ASSR_NATIONALITY", "POL_SCHEME_PLAN", "POL_PYMT_MODE",
"POL_JACKET_CODE"
]
CONTINUOUS_COLUMNS = ["POL_AGE", "POL_PREM_ORIGINAL", "POL_PREM_RENEWAL","POL_END_NO_IDX","CLAIM_INCURRED"]
def maybe_download(train_data, test_data):
"""Maybe downloads training data and returns train and test file names."""
print('-----start of maybe_download')
if train_data:
train_file_name = train_data
else:
train_file = open('Renewal Listing 2015 export2.csv')
train_file_name = train_file.name
train_file.close()
print("------maybe_download()-----Training data is downloaded to %s" % train_file_name)
if test_data:
test_file_name = test_data
else:
test_file = open('Renewal Listing 2015 export2 Test.csv')
test_file_name = test_file.name
test_file.close()
print("------maybe_download()-----Test data is downloaded to %s" % test_file_name)
print('-----end of maybe_download')
return train_file_name, test_file_name
def build_estimator(model_dir, model_type):
"""Build an estimator."""
print('-----start of build_estimator')
# Sparse base columns.
ACCOUNT_CLASS = tf.contrib.layers.sparse_column_with_hash_bucket(
"ACCOUNT_CLASS", hash_bucket_size=1000)
POL_CUST_CODE = tf.contrib.layers.sparse_column_with_hash_bucket(
"POL_CUST_CODE", hash_bucket_size=100)
POL_SUB_PRD_CODE = tf.contrib.layers.sparse_column_with_hash_bucket(
"POL_SUB_PRD_CODE", hash_bucket_size=100)
POL_SRC_BUS_CODE = tf.contrib.layers.sparse_column_with_hash_bucket(
"POL_SRC_BUS_CODE", hash_bucket_size=1000)
POL_OCC_DESC = tf.contrib.layers.sparse_column_with_hash_bucket(
"POL_OCC_DESC", hash_bucket_size=1000)
ASSR_NATIONALITY = tf.contrib.layers.sparse_column_with_hash_bucket(
"ASSR_NATIONALITY", hash_bucket_size=1000)
POL_JACKET_CODE = tf.contrib.layers.sparse_column_with_hash_bucket(
"POL_JACKET_CODE", hash_bucket_size=1000)
print("----build_estimator()----hash_bucket_size: columns are processed")
ASSR_GENDER = tf.contrib.layers.sparse_column_with_keys(column_name="ASSR_GENDER",
keys=["M", "F"])
ASSR_TYPE = tf.contrib.layers.sparse_column_with_keys(column_name="ASSR_TYPE",
keys=["I", "C","M"])
ASSR_MAR_STATUS = tf.contrib.layers.sparse_column_with_keys(column_name="ASSR_MAR_STATUS",
keys=["D", "M","S","W"])
POL_SCHEME_PLAN = tf.contrib.layers.sparse_column_with_keys(column_name="POL_SCHEME_PLAN",
keys=["Y", "N"])
POL_PYMT_MODE = tf.contrib.layers.sparse_column_with_keys(column_name="POL_PYMT_MODE",
keys=["C", "CH","CR"])
print("----build_estimator()----sparse_column_with_keys: columns are processed")
# Continuous base columns.
POL_AGE = tf.contrib.layers.real_valued_column("POL_AGE")
POL_PREM_ORIGINAL = tf.contrib.layers.real_valued_column("POL_PREM_ORIGINAL")
POL_PREM_RENEWAL = tf.contrib.layers.real_valued_column("POL_PREM_RENEWAL")
POL_END_NO_IDX = tf.contrib.layers.real_valued_column("POL_END_NO_IDX")
CLAIM_INCURRED = tf.contrib.layers.real_valued_column("CLAIM_INCURRED")
print("----build_estimator()----Continuous base columns; are processed")
# Transformations.
age_buckets = tf.contrib.layers.bucketized_column(POL_AGE,
boundaries=[
18, 25, 30, 35, 40, 45,
50, 55, 60, 65
])
print("----build_estimator()----Transformations for age to boundaries is processed")
# Wide columns and deep columns.
wide_columns = [ACCOUNT_CLASS, POL_CUST_CODE, POL_SUB_PRD_CODE, POL_SRC_BUS_CODE, POL_OCC_DESC,
ASSR_NATIONALITY, POL_JACKET_CODE,ASSR_GENDER,ASSR_TYPE,ASSR_MAR_STATUS,POL_SCHEME_PLAN,POL_PYMT_MODE,
POL_PREM_ORIGINAL,POL_PREM_RENEWAL,POL_END_NO_IDX,CLAIM_INCURRED,
age_buckets,
tf.contrib.layers.crossed_column([POL_CUST_CODE, POL_SUB_PRD_CODE],
hash_bucket_size=int(1e4)),
tf.contrib.layers.crossed_column(
[age_buckets, POL_OCC_DESC],
hash_bucket_size=int(1e4)),
tf.contrib.layers.crossed_column([ACCOUNT_CLASS, ASSR_TYPE],
hash_bucket_size=int(1e4))]
deep_columns = [
tf.contrib.layers.embedding_column(POL_OCC_DESC, dimension=8),
tf.contrib.layers.embedding_column(POL_SRC_BUS_CODE, dimension=8),
tf.contrib.layers.embedding_column(ASSR_GENDER, dimension=8),
tf.contrib.layers.embedding_column(POL_JACKET_CODE, dimension=8),
tf.contrib.layers.embedding_column(ASSR_NATIONALITY,
dimension=8),
tf.contrib.layers.embedding_column(POL_PYMT_MODE, dimension=8),
POL_AGE,
POL_PREM_ORIGINAL,
POL_PREM_RENEWAL,
POL_END_NO_IDX,
CLAIM_INCURRED,
]
if model_type == "wide":
m = tf.contrib.learn.LinearClassifier(model_dir=model_dir,
feature_columns=wide_columns)
elif model_type == "deep":
m = tf.contrib.learn.DNNClassifier(model_dir=model_dir,
feature_columns=deep_columns,
hidden_units=[100, 50])
else:
m = tf.contrib.learn.DNNLinearCombinedClassifier(
model_dir=model_dir,
linear_feature_columns=wide_columns,
dnn_feature_columns=deep_columns,
dnn_hidden_units=[100, 50],
fix_global_step_increment_bug=True)
print("-----end of build_estimator")
return m
def input_fn(df):
"""Input builder function."""
# Creates a dictionary mapping from each continuous feature column name (k) to
# the values of that column stored in a constant Tensor.
print('---------------------------------------------')
print('-----start of input_fn')
print('-----input_fn()----print df')
#print(df)
print('-----input_fn()----print df end')
continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}
print('-----input_fn()----print continuous_cols')
#print(continuous_cols)
print('-----input_fn()----print continuous_cols end')
# Creates a dictionary mapping from each categorical feature column name (k)
# to the values of that column stored in a tf.SparseTensor.
categorical_cols = {
k: tf.SparseTensor(
indices=[[i, 0] for i in range(df[k].size)],
values=df[k].values,
dense_shape=[df[k].size, 1])
for k in CATEGORICAL_COLUMNS}
print('-----input_fn()----print categorical_cols')
#print(categorical_cols)
print('-----input_fn()----print categorical_cols end')
# Merges the two dictionaries into one.
feature_cols = dict(continuous_cols)
feature_cols.update(categorical_cols)
print('-----input_fn()----print feature_cols')
#print(feature_cols)
print('-----input_fn()----end of print feature_cols')
# Converts the label column into a constant Tensor.
label = tf.constant(df[LABEL_COLUMN].values)
# Returns the feature columns and the label.
print('-----input_fn()----print label')
#print(label)
print('-----input_fn()----end of print label')
print('---------------------------------------------')
print('-----end of input_fn')
return feature_cols, label
def train_and_eval(model_dir, model_type, train_steps, train_data, test_data):
"""Train and evaluate the model."""
print('---------------------------------------------')
print('-----start of train_and_eval')
print("-----train_and_eval()-----start to get files")
train_file_name, test_file_name = maybe_download(train_data, test_data)
print("-----train_and_eval()-----got train and test data")
df_train = pd.read_csv(
tf.gfile.Open(train_file_name),
names=COLUMNS,
skipinitialspace=True,
engine="python")
df_test = pd.read_csv(
tf.gfile.Open(test_file_name),
names=COLUMNS,
skipinitialspace=True,
skiprows=1,
engine="python")
#print("----print df_train file")
#for row in df_train:
# print(row)
# remove NaN elements
df_train = df_train.dropna(how='any', axis=0)
df_test = df_test.dropna(how='any', axis=0)
df_train[LABEL_COLUMN] = (
df_train["REVIEW_STATUS"].apply(lambda x: "RENEWED" in x)).astype(int)
df_test[LABEL_COLUMN] = (
df_test["REVIEW_STATUS"].apply(lambda x: "RENEWED" in x)).astype(int)
model_dir = tempfile.mkdtemp() if not model_dir else model_dir
print("-----train_and_eval()-----model directory = %s" % model_dir)
m = build_estimator(model_dir, model_type)
print(m)
print('-----train_and_eval()-----finished build_estimator')
print('----------------------------------')
print('----------------------------------')
print('-----start of input_fn(df_train)-------------')
print('========== compare train and test')
print('train')
print(df_train.shape)
print(df_train['POL_JACKET_CODE'].shape)
print('test')
print(df_test.shape)
print(df_test)
print('=====================================')
m.fit(input_fn=lambda: input_fn(df_train), steps=train_steps)
print('-----end of input_fn(df_train)-------------')
print('-----beginning of evaluate input_fn(df_test)')
results = m.evaluate(input_fn=lambda: input_fn(df_test), steps=1)
print('-----train_and_eval()-----end of input_fn(df_test)')
print(' start to print -----results--------')
print(results)
for key in sorted(results):
print("%s: %s" % (key, results[key]))
print('---------------------------------------------')
print('-----end of train_and_eval')
FLAGS = None
def main(_):
print("-------main()----- program start")
train_and_eval(FLAGS.model_dir, FLAGS.model_type, FLAGS.train_steps,
FLAGS.train_data, FLAGS.test_data)
print("------main()----- program ended: ")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument(
"--model_dir",
type=str,
default="",
help="Base directory for output models."
)
parser.add_argument(
"--model_type",
type=str,
default="wide_n_deep",
help="Valid model types: {'wide', 'deep', 'wide_n_deep'}."
)
parser.add_argument(
"--train_steps",
type=int,
default=200,
help="Number of training steps."
)
parser.add_argument(
"--train_data",
type=str,
default="",
help="Path to the training data."
)
parser.add_argument(
"--test_data",
type=str,
default="",
help="Path to the test data."
)
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
I have found out the issue: when pandas reads a CSV, the inferred data type depends on the data itself.
In my case, there is a column whose values are all 0 in the test set, but it is a floating-point column. When the program reads the test set CSV, that column is inferred as int, so it does not match the float type inferred from the training data, hence the error. Changing 0 to 0.00, or using real data where the column contains both 0 and floating-point values, solved the issue.
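For completeness, here is a minimal, self-contained sketch of that behaviour and of a pandas-level workaround. This is illustrative only (not the original data); CLAIM_INCURRED is just one of the continuous columns, used as an example.
import io
import pandas as pd

# A column that is all zeros in one file is inferred as int64, while the same
# column with decimals in another file is inferred as float64 -- exactly the
# dtype mismatch behind the TensorSignature error above.
train_csv = io.StringIO("CLAIM_INCURRED\n10.5\n0.0\n3.2\n")
test_csv = io.StringIO("CLAIM_INCURRED\n0\n0\n0\n")
print(pd.read_csv(train_csv).dtypes)  # CLAIM_INCURRED    float64
print(pd.read_csv(test_csv).dtypes)   # CLAIM_INCURRED    int64

# Forcing the dtype at read time removes the dependence on the data itself.
test_csv.seek(0)
df_test = pd.read_csv(test_csv, dtype={"CLAIM_INCURRED": "float64"})
print(df_test.dtypes)                 # CLAIM_INCURRED    float64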
Related
I'm trying to code a layer to interface between a data set (numerical and categorical features) and a model, so the data can be fed in.
I can't understand the error I get when it comes to categorical columns.
ValueError: Exception encountered when calling layer (type CategoryEncoding).
When output_mode is not 'int', maximum supported output rank is 2. Received
output_mode multi_hot and input shape (10, 7, 1), which would result in output rank 3.
From what I understand, the batch size should not be counted in the output rank, but it is, and that seems to break.
Note that reproducing with only numerical features works fine.
Thank you for your help.
import tensorflow as tf
import pandas as pd
import numpy as np
# Simulate a data set of categorical and numerical values
# Configure simulation specifications: {feature: number of unique categories or None for numerical}
theSimSpecs = {'Cat1': 54, 'Cat2': 2, 'Cat3': 4, 'Num1': None, 'Num2': None}
# theSimSpecs = {'Num1': None, 'Num2': None}
# batch size and timesteps
theBatchSz, theTimeSteps = 10, 4
# Creation of the dataset as pandas.DataFrame
theDFs = []
for theFeature, theUniques in theSimSpecs.items():
if theUniques is None:
theDF = pd.DataFrame(np.random.random(size=theBatchSz * theTimeSteps), columns=[theFeature])
else:
theDF = pd.DataFrame(np.random.randint(low=0, high=theUniques, size=theBatchSz * theTimeSteps),
columns=[theFeature]).astype('category')
theDFs.append(theDF)
theDF = pd.concat(theDFs, axis=1)
# code excerpt
# inventory of the categorical features' values ( None for the numerical)
theCatCodes = {theCol: (theDF[theCol].unique().tolist() if str(theDF[theCol].dtypes) == "category" else None)
for theCol in theDF.columns}
# Creation of the batched tensorflow.data.Dataset
theDS = tf.data.Dataset.from_tensor_slices(dict(theDF))
theDS = theDS.window(size=theTimeSteps, shift=1, stride=1, drop_remainder=True)
theDS = theDS.flat_map(lambda x: tf.data.Dataset.zip(x))
theDS = theDS.batch(batch_size=theTimeSteps, drop_remainder=True)
theDS = theDS.batch(batch_size=theBatchSz, drop_remainder=True)
# extracting one batch
theBatch = next(iter(theDS))
tf.print(theBatch)
# Creation of the components for the interface layer
theFeaturesInputs = {}
theFeaturesEncoded = {}
for theFeature, theCodes in theCatCodes.items():
if theCodes is None: # Pass-through for numerical features
theNumInput = tf.keras.layers.Input(shape=[], dtype=tf.float32, name=theFeature)
theFeaturesInputs[theFeature] = theNumInput
theFeatureExp = tf.expand_dims(input=theNumInput, axis=-1)
theFeaturesEncoded[theFeature] = theFeatureExp
else: # Process for categorical features
theCatInput = tf.keras.layers.Input(shape=[], dtype=tf.int64, name=theFeature)
theFeaturesInputs[theFeature] = theCatInput
theFeatureExp = tf.expand_dims(input=theCatInput, axis=-1)
theEncodingLayer = tf.keras.layers.CategoryEncoding(num_tokens=theSimSpecs[theFeature], name=f"{theFeature}_enc",
output_mode="multi_hot", sparse=False)
theFeaturesEncoded[theFeature] = theEncodingLayer(theFeatureExp)
theStackedInputs = tf.concat(tf.nest.flatten(theFeaturesEncoded), axis=1)
theModel = tf.keras.Model(inputs=theFeaturesInputs, outputs=theStackedInputs)
theOutput = theModel(theBatch)
tf.print(theOutput)
I am trying to create a batched environment version of an SAC agent example from the Tensorflow Agents library, the original code can be found here. I am also using a custom environment.
I am pursuing a batched environment setup to better leverage GPU resources and speed up training. My understanding is that by passing batches of trajectories to the GPU, there will be less overhead when moving data from the host (CPU) to the device (GPU).
My custom environment is called SacEnv, and I attempt to create a batched environment like so:
py_envs = [SacEnv() for _ in range(0, batch_size)]
batched_env = batched_py_environment.BatchedPyEnvironment(envs=py_envs)
tf_env = tf_py_environment.TFPyEnvironment(batched_env)
My hope is that this will create a batched environment consisting of a 'batch' of non-batched environments. However, I am receiving the following error when running the code:
ValueError: Cannot assign value to variable ' Accumulator:0': Shape mismatch.The variable shape (1,), and the assigned value shape (32,) are incompatible.
with the stack trace:
Traceback (most recent call last):
File "/home/gary/Desktop/code/sac_test/sac_main2.py", line 370, in <module>
app.run(main)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/absl/app.py", line 312, in run
_run_main(main, args)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/absl/app.py", line 258, in _run_main
sys.exit(main(argv))
File "/home/gary/Desktop/code/sac_test/sac_main2.py", line 366, in main
train_eval(FLAGS.root_dir)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/gin/config.py", line 1605, in gin_wrapper
utils.augment_exception_message_and_reraise(e, err_str)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/gin/utils.py", line 41, in augment_exception_message_and_reraise
raise proxy.with_traceback(exception.__traceback__) from None
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/gin/config.py", line 1582, in gin_wrapper
return fn(*new_args, **new_kwargs)
File "/home/gary/Desktop/code/sac_test/sac_main2.py", line 274, in train_eval
results = metric_utils.eager_compute(
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/gin/config.py", line 1605, in gin_wrapper
utils.augment_exception_message_and_reraise(e, err_str)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/gin/utils.py", line 41, in augment_exception_message_and_reraise
raise proxy.with_traceback(exception.__traceback__) from None
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/gin/config.py", line 1582, in gin_wrapper
return fn(*new_args, **new_kwargs)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/eval/metric_utils.py", line 163, in eager_compute
common.function(driver.run)(time_step, policy_state)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/drivers/dynamic_episode_driver.py", line 211, in run
return self._run_fn(
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/utils/common.py", line 188, in with_check_resource_vars
return fn(*fn_args, **fn_kwargs)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/drivers/dynamic_episode_driver.py", line 238, in _run
tf.while_loop(
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/drivers/dynamic_episode_driver.py", line 154, in loop_body
observer_ops = [observer(traj) for observer in self._observers]
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/drivers/dynamic_episode_driver.py", line 154, in <listcomp>
observer_ops = [observer(traj) for observer in self._observers]
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/metrics/tf_metric.py", line 93, in __call__
return self._update_state(*args, **kwargs)
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/metrics/tf_metric.py", line 81, in _update_state
return self.call(*arg, **kwargs)
ValueError: in user code:
File "/home/gary/anaconda3/envs/py39/lib/python3.9/site-packages/tf_agents/metrics/tf_metrics.py", line 176, in call *
self._return_accumulator.assign(
ValueError: Cannot assign value to variable ' Accumulator:0': Shape mismatch.The variable shape (1,), and the assigned value shape (32,) are incompatible.
In call to configurable 'eager_compute' (<function eager_compute at 0x7fa4d6e5e040>)
In call to configurable 'train_eval' (<function train_eval at 0x7fa4c8622dc0>)
I have dug through the tf_metric.py code to try to understand the error, but I have been unsuccessful. A related issue was solved when I added the batch size (32) to the initializer of the AverageReturnMetric instance, and this issue seems related.
The full code is:
# coding=utf-8
# Copyright 2020 The TF-Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python2, python3
r"""Train and Eval SAC.
All hyperparameters come from the SAC paper
https://arxiv.org/pdf/1812.05905.pdf
To run:
```bash
tensorboard --logdir $HOME/tmp/sac/gym/HalfCheetah-v2/ --port 2223 &
python tf_agents/agents/sac/examples/v2/train_eval.py \
--root_dir=$HOME/tmp/sac/gym/HalfCheetah-v2/ \
--alsologtostderr
```
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from sac_env import SacEnv
import os
import time
from absl import app
from absl import flags
from absl import logging
import gin
from six.moves import range
import tensorflow as tf # pylint: disable=g-explicit-tensorflow-version-import
from tf_agents.agents.ddpg import critic_network
from tf_agents.agents.sac import sac_agent
from tf_agents.agents.sac import tanh_normal_projection_network
from tf_agents.drivers import dynamic_step_driver
#from tf_agents.environments import suite_mujoco
from tf_agents.environments import tf_py_environment
from tf_agents.environments import batched_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import actor_distribution_network
from tf_agents.policies import greedy_policy
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common
from tf_agents.train.utils import strategy_utils
flags.DEFINE_string('root_dir', os.getenv('TEST_UNDECLARED_OUTPUTS_DIR'),
'Root directory for writing logs/summaries/checkpoints.')
flags.DEFINE_multi_string('gin_file', None, 'Path to the trainer config files.')
flags.DEFINE_multi_string('gin_param', None, 'Gin binding to pass through.')
FLAGS = flags.FLAGS
gpus = tf.config.list_physical_devices('GPU')
if gpus:
try:
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
logical_gpus = tf.config.experimental.list_logical_devices('GPU')
print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
except RuntimeError as e:
print(e)
#gin.configurable
def train_eval(
root_dir,
env_name='SacEnv',
# The SAC paper reported:
# Hopper and Cartpole results up to 1000000 iters,
# Humanoid results up to 10000000 iters,
# Other mujoco tasks up to 3000000 iters.
num_iterations=3000000,
actor_fc_layers=(256, 256),
critic_obs_fc_layers=None,
critic_action_fc_layers=None,
critic_joint_fc_layers=(256, 256),
# Params for collect
# Follow https://github.com/haarnoja/sac/blob/master/examples/variants.py
# HalfCheetah and Ant take 10000 initial collection steps.
# Other mujoco tasks take 1000.
# Different choices roughly keep the initial episodes about the same.
#initial_collect_steps=10000,
initial_collect_steps=2000,
collect_steps_per_iteration=1,
replay_buffer_capacity=31250, # 1000000 / 32
# Params for target update
target_update_tau=0.005,
target_update_period=1,
# Params for train
train_steps_per_iteration=1,
#batch_size=256,
batch_size=32,
actor_learning_rate=3e-4,
critic_learning_rate=3e-4,
alpha_learning_rate=3e-4,
td_errors_loss_fn=tf.math.squared_difference,
gamma=0.99,
reward_scale_factor=0.1,
gradient_clipping=None,
use_tf_functions=True,
# Params for eval
num_eval_episodes=30,
eval_interval=10000,
# Params for summaries and logging
train_checkpoint_interval=50000,
policy_checkpoint_interval=50000,
rb_checkpoint_interval=50000,
log_interval=1000,
summary_interval=1000,
summaries_flush_secs=10,
debug_summaries=False,
summarize_grads_and_vars=False,
eval_metrics_callback=None):
"""A simple train and eval for SAC."""
root_dir = os.path.expanduser(root_dir)
train_dir = os.path.join(root_dir, 'train')
eval_dir = os.path.join(root_dir, 'eval')
train_summary_writer = tf.compat.v2.summary.create_file_writer(
train_dir, flush_millis=summaries_flush_secs * 1000)
train_summary_writer.set_as_default()
eval_summary_writer = tf.compat.v2.summary.create_file_writer(
eval_dir, flush_millis=summaries_flush_secs * 1000)
eval_metrics = [
tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
]
global_step = tf.compat.v1.train.get_or_create_global_step()
with tf.compat.v2.summary.record_if(
lambda: tf.math.equal(global_step % summary_interval, 0)):
py_envs = [SacEnv() for _ in range(0, batch_size)]
batched_env = batched_py_environment.BatchedPyEnvironment(envs=py_envs)
tf_env = tf_py_environment.TFPyEnvironment(batched_env)
eval_py_envs = [SacEnv() for _ in range(0, batch_size)]
eval_batched_env = batched_py_environment.BatchedPyEnvironment(envs=eval_py_envs)
eval_tf_env = tf_py_environment.TFPyEnvironment(eval_batched_env)
time_step_spec = tf_env.time_step_spec()
observation_spec = time_step_spec.observation
action_spec = tf_env.action_spec()
strategy = strategy_utils.get_strategy(tpu=False, use_gpu=True)
with strategy.scope():
actor_net = actor_distribution_network.ActorDistributionNetwork(
observation_spec,
action_spec,
fc_layer_params=actor_fc_layers,
continuous_projection_net=tanh_normal_projection_network
.TanhNormalProjectionNetwork)
critic_net = critic_network.CriticNetwork(
(observation_spec, action_spec),
observation_fc_layer_params=critic_obs_fc_layers,
action_fc_layer_params=critic_action_fc_layers,
joint_fc_layer_params=critic_joint_fc_layers,
kernel_initializer='glorot_uniform',
last_kernel_initializer='glorot_uniform')
tf_agent = sac_agent.SacAgent(
time_step_spec,
action_spec,
actor_network=actor_net,
critic_network=critic_net,
actor_optimizer=tf.compat.v1.train.AdamOptimizer(
learning_rate=actor_learning_rate),
critic_optimizer=tf.compat.v1.train.AdamOptimizer(
learning_rate=critic_learning_rate),
alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
learning_rate=alpha_learning_rate),
target_update_tau=target_update_tau,
target_update_period=target_update_period,
td_errors_loss_fn=td_errors_loss_fn,
gamma=gamma,
reward_scale_factor=reward_scale_factor,
gradient_clipping=gradient_clipping,
debug_summaries=debug_summaries,
summarize_grads_and_vars=summarize_grads_and_vars,
train_step_counter=global_step)
tf_agent.initialize()
# Make the replay buffer.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
data_spec=tf_agent.collect_data_spec,
batch_size=batch_size,
max_length=replay_buffer_capacity,
device="/device:GPU:0")
replay_observer = [replay_buffer.add_batch]
train_metrics = [
tf_metrics.NumberOfEpisodes(),
tf_metrics.EnvironmentSteps(),
tf_metrics.AverageReturnMetric(
buffer_size=num_eval_episodes, batch_size=tf_env.batch_size),
tf_metrics.AverageEpisodeLengthMetric(
buffer_size=num_eval_episodes, batch_size=tf_env.batch_size),
]
eval_policy = greedy_policy.GreedyPolicy(tf_agent.policy)
initial_collect_policy = random_tf_policy.RandomTFPolicy(
tf_env.time_step_spec(), tf_env.action_spec())
collect_policy = tf_agent.collect_policy
train_checkpointer = common.Checkpointer(
ckpt_dir=train_dir,
agent=tf_agent,
global_step=global_step,
metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
policy_checkpointer = common.Checkpointer(
ckpt_dir=os.path.join(train_dir, 'policy'),
policy=eval_policy,
global_step=global_step)
rb_checkpointer = common.Checkpointer(
ckpt_dir=os.path.join(train_dir, 'replay_buffer'),
max_to_keep=1,
replay_buffer=replay_buffer)
train_checkpointer.initialize_or_restore()
rb_checkpointer.initialize_or_restore()
initial_collect_driver = dynamic_step_driver.DynamicStepDriver(
tf_env,
initial_collect_policy,
observers=replay_observer + train_metrics,
num_steps=initial_collect_steps)
collect_driver = dynamic_step_driver.DynamicStepDriver(
tf_env,
collect_policy,
observers=replay_observer + train_metrics,
num_steps=collect_steps_per_iteration)
if use_tf_functions:
initial_collect_driver.run = common.function(initial_collect_driver.run)
collect_driver.run = common.function(collect_driver.run)
tf_agent.train = common.function(tf_agent.train)
if replay_buffer.num_frames() == 0:
# Collect initial replay data.
logging.info(
'Initializing replay buffer by collecting experience for %d steps '
'with a random policy.', initial_collect_steps)
initial_collect_driver.run()
results = metric_utils.eager_compute(
eval_metrics,
eval_tf_env,
eval_policy,
num_episodes=num_eval_episodes,
train_step=global_step,
summary_writer=eval_summary_writer,
summary_prefix='Metrics',
)
if eval_metrics_callback is not None:
eval_metrics_callback(results, global_step.numpy())
metric_utils.log_metrics(eval_metrics)
time_step = None
policy_state = collect_policy.get_initial_state(tf_env.batch_size)
timed_at_step = global_step.numpy()
time_acc = 0
# Prepare replay buffer as dataset with invalid transitions filtered.
def _filter_invalid_transition(trajectories, unused_arg1):
return ~trajectories.is_boundary()[0]
dataset = replay_buffer.as_dataset(
sample_batch_size=batch_size,
num_steps=2).unbatch().filter(
_filter_invalid_transition).batch(batch_size).prefetch(5)
# Dataset generates trajectories with shape [Bx2x...]
iterator = iter(dataset)
def train_step():
experience, _ = next(iterator)
return tf_agent.train(experience)
if use_tf_functions:
train_step = common.function(train_step)
global_step_val = global_step.numpy()
while global_step_val < num_iterations:
start_time = time.time()
time_step, policy_state = collect_driver.run(
time_step=time_step,
policy_state=policy_state,
)
for _ in range(train_steps_per_iteration):
train_loss = train_step()
time_acc += time.time() - start_time
global_step_val = global_step.numpy()
if global_step_val % log_interval == 0:
logging.info('step = %d, loss = %f', global_step_val,
train_loss.loss)
steps_per_sec = (global_step_val - timed_at_step) / time_acc
logging.info('%.3f steps/sec', steps_per_sec)
tf.compat.v2.summary.scalar(
name='global_steps_per_sec', data=steps_per_sec, step=global_step)
timed_at_step = global_step_val
time_acc = 0
for train_metric in train_metrics:
train_metric.tf_summaries(
train_step=global_step, step_metrics=train_metrics[:2])
if global_step_val % eval_interval == 0:
results = metric_utils.eager_compute(
eval_metrics,
eval_tf_env,
eval_policy,
num_episodes=num_eval_episodes,
train_step=global_step,
summary_writer=eval_summary_writer,
summary_prefix='Metrics',
)
if eval_metrics_callback is not None:
eval_metrics_callback(results, global_step_val)
metric_utils.log_metrics(eval_metrics)
if global_step_val % train_checkpoint_interval == 0:
train_checkpointer.save(global_step=global_step_val)
if global_step_val % policy_checkpoint_interval == 0:
policy_checkpointer.save(global_step=global_step_val)
if global_step_val % rb_checkpoint_interval == 0:
rb_checkpointer.save(global_step=global_step_val)
return train_loss
def main(_):
tf.compat.v1.enable_v2_behavior()
logging.set_verbosity(logging.INFO)
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param)
train_eval(FLAGS.root_dir)
if __name__ == '__main__':
flags.mark_flag_as_required('root_dir')
app.run(main)
What is the appropriate way to create a batched environment from a custom, non-batched environment? I can share my custom environment, but I don't believe the issue lies there, as the code works fine with a batch size of 1.
Also, any tips on increasing GPU utilization in reinforcement learning scenarios would be greatly appreciated. I have looked at examples of using the TensorBoard profiler to measure GPU utilization, but they seem to require callbacks and a fit function, which don't appear to be applicable in RL use cases.
It turns out I neglected to pass batch_size when initializing the AverageReturnMetric and AverageEpisodeLengthMetric instances.
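In the posted train_eval this corresponds to building eval_metrics the same way train_metrics are already built. A minimal sketch of that change is below; batch_size and num_eval_episodes are the existing function parameters, and the eval environment is batched with the same batch_size.
from tf_agents.metrics import tf_metrics

# Sketch of the fix: give the eval metrics the environment batch size so their
# internal return accumulators are created with shape (batch_size,) rather
# than the default (1,), which is what triggered the shape-mismatch error.
eval_metrics = [
    tf_metrics.AverageReturnMetric(
        buffer_size=num_eval_episodes, batch_size=batch_size),
    tf_metrics.AverageEpisodeLengthMetric(
        buffer_size=num_eval_episodes, batch_size=batch_size),
]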
I'm running the wide_deep.py script for linear regression in TensorFlow.
I have also cloned the models directory as part of the process.
But I'm getting an error like AttributeError: 'list' object has no attribute 'model_dir'.
If I hard-code this particular variable, I get other errors such as AttributeError: 'list' object has no attribute 'data_dir', and so on.
Code:
"""Example code for TensorFlow Wide & Deep Tutorial using tf.estimator API."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import shutil
from absl import app as absl_app
from absl import flags
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.utils.flags import core as flags_core
from official.utils.logs import hooks_helper
from official.utils.misc import model_helpers
_CSV_COLUMNS = [
'age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'gender',
'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
'income_bracket'
]
_CSV_COLUMN_DEFAULTS = [[0], [''], [0], [''], [0], [''], [''], [''], [''], [''],
[0], [0], [0], [''], ['']]
_NUM_EXAMPLES = {
'train': 32561,
'validation': 16281,
}
LOSS_PREFIX = {'wide': 'linear/', 'deep': 'dnn/'}
def define_wide_deep_flags():
"""Add supervised learning flags, as well as wide-deep model type."""
flags_core.define_base()
flags.adopt_module_key_flags(flags_core)
flags.DEFINE_enum(
name="model_type", short_name="mt", default="wide_deep",
enum_values=['wide', 'deep', 'wide_deep'],
help="Select model topology.")
flags_core.set_defaults(data_dir='/tmp/census_data',
model_dir='/tmp/census_model',
train_epochs=40,
epochs_between_evals=2,
batch_size=40)
def build_model_columns():
"""Builds a set of wide and deep feature columns."""
# Continuous columns
age = tf.feature_column.numeric_column('age')
education_num = tf.feature_column.numeric_column('education_num')
capital_gain = tf.feature_column.numeric_column('capital_gain')
capital_loss = tf.feature_column.numeric_column('capital_loss')
hours_per_week = tf.feature_column.numeric_column('hours_per_week')
education = tf.feature_column.categorical_column_with_vocabulary_list(
'education', [
'Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
'5th-6th', '10th', '1st-4th', 'Preschool', '12th'])
marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
'marital_status', [
'Married-civ-spouse', 'Divorced', 'Married-spouse-absent',
'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'])
relationship = tf.feature_column.categorical_column_with_vocabulary_list(
'relationship', [
'Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried',
'Other-relative'])
workclass = tf.feature_column.categorical_column_with_vocabulary_list(
'workclass', [
'Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov',
'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'])
# To show an example of hashing:
occupation = tf.feature_column.categorical_column_with_hash_bucket(
'occupation', hash_bucket_size=1000)
# Transformations.
age_buckets = tf.feature_column.bucketized_column(
age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
# Wide columns and deep columns.
base_columns = [
education, marital_status, relationship, workclass, occupation,
age_buckets,
]
crossed_columns = [
tf.feature_column.crossed_column(
['education', 'occupation'], hash_bucket_size=1000),
tf.feature_column.crossed_column(
[age_buckets, 'education', 'occupation'], hash_bucket_size=1000),
]
wide_columns = base_columns + crossed_columns
deep_columns = [
age,
education_num,
capital_gain,
capital_loss,
hours_per_week,
tf.feature_column.indicator_column(workclass),
tf.feature_column.indicator_column(education),
tf.feature_column.indicator_column(marital_status),
tf.feature_column.indicator_column(relationship),
# To show an example of embedding
tf.feature_column.embedding_column(occupation, dimension=8),
]
return wide_columns, deep_columns
def build_estimator(model_dir, model_type):
"""Build an estimator appropriate for the given model type."""
wide_columns, deep_columns = build_model_columns()
hidden_units = [100, 75, 50, 25]
# Create a tf.estimator.RunConfig to ensure the model is run on CPU, which
# trains faster than GPU for this model.
run_config = tf.estimator.RunConfig().replace(
session_config=tf.ConfigProto(device_count={'GPU': 0}))
if model_type == 'wide':
return tf.estimator.LinearClassifier(
model_dir=model_dir,
feature_columns=wide_columns,
config=run_config)
elif model_type == 'deep':
return tf.estimator.DNNClassifier(
model_dir=model_dir,
feature_columns=deep_columns,
hidden_units=hidden_units,
config=run_config)
else:
return tf.estimator.DNNLinearCombinedClassifier(
model_dir=model_dir,
linear_feature_columns=wide_columns,
dnn_feature_columns=deep_columns,
dnn_hidden_units=hidden_units,
config=run_config)
def input_fn(data_file, num_epochs, shuffle, batch_size):
"""Generate an input function for the Estimator."""
assert tf.gfile.Exists(data_file), (
'%s not found. Please make sure you have run data_download.py and '
'set the --data_dir argument to the correct path.' % data_file)
def parse_csv(value):
print('Parsing', data_file)
columns = tf.decode_csv(value, record_defaults=_CSV_COLUMN_DEFAULTS)
features = dict(zip(_CSV_COLUMNS, columns))
labels = features.pop('income_bracket')
return features, tf.equal(labels, '>50K')
# Extract lines from input files using the Dataset API.
dataset = tf.data.TextLineDataset(data_file)
if shuffle:
dataset = dataset.shuffle(buffer_size=_NUM_EXAMPLES['train'])
dataset = dataset.map(parse_csv, num_parallel_calls=5)
# We call repeat after shuffling, rather than before, to prevent separate
# epochs from blending together.
dataset = dataset.repeat(num_epochs)
dataset = dataset.batch(batch_size)
return dataset
def export_model(model, model_type, export_dir):
"""Export to SavedModel format.
Args:
model: Estimator object
model_type: string indicating model type. "wide", "deep" or "wide_deep"
export_dir: directory to export the model.
"""
wide_columns, deep_columns = build_model_columns()
if model_type == 'wide':
columns = wide_columns
elif model_type == 'deep':
columns = deep_columns
else:
columns = wide_columns + deep_columns
feature_spec = tf.feature_column.make_parse_example_spec(columns)
example_input_fn = (
tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec))
model.export_savedmodel(export_dir, example_input_fn)
def run_wide_deep(flags_obj):
"""Run Wide-Deep training and eval loop.
Args:
flags_obj: An object containing parsed flag values.
"""
# Clean up the model directory if present
shutil.rmtree(flags_obj.model_dir, ignore_errors=True)
model = build_estimator(flags_obj.model_dir, flags_obj.model_type)
train_file = os.path.join(flags_obj.data_dir, 'adult.data')
test_file = os.path.join(flags_obj.data_dir, 'adult.test')
# Train and evaluate the model every `flags.epochs_between_evals` epochs.
def train_input_fn():
return input_fn(
train_file, flags_obj.epochs_between_evals, True, flags_obj.batch_size)
def eval_input_fn():
return input_fn(test_file, 1, False, flags_obj.batch_size)
loss_prefix = LOSS_PREFIX.get(flags_obj.model_type, '')
train_hooks = hooks_helper.get_train_hooks(
flags_obj.hooks, batch_size=flags_obj.batch_size,
tensors_to_log={'average_loss': loss_prefix + 'head/truediv',
'loss': loss_prefix + 'head/weighted_loss/Sum'})
# Train and evaluate the model every `flags.epochs_between_evals` epochs.
for n in range(flags_obj.train_epochs // flags_obj.epochs_between_evals):
model.train(input_fn=train_input_fn, hooks=train_hooks)
results = model.evaluate(input_fn=eval_input_fn)
# Display evaluation metrics
print('Results at epoch', (n + 1) * flags_obj.epochs_between_evals)
print('-' * 60)
for key in sorted(results):
print('%s: %s' % (key, results[key]))
if model_helpers.past_stop_threshold(
flags_obj.stop_threshold, results['accuracy']):
break
# Export the model
if flags_obj.export_dir is not None:
export_model(model, flags_obj.model_type, flags_obj.export_dir)
def main(_):
run_wide_deep(flags.FLAGS)
if __name__ == '__main__':
tf.logging.set_verbosity(tf.logging.INFO)
define_wide_deep_flags()
absl_app.run(main)
Hunter, I tried to run it without hardcoding but still faced issues with the attributes, so I tried hard-coding to avoid this.
But the issue is resolved now.
I cloned the directory again, and instead of copying wide_deep.py to another directory and running it from there (which I was doing before), I ran it directly from the same directory, and now it is working fine.
I have exported the textsum model using the export_textsum.py file shown below, and when I connect using the textsumclient.py file below I receive the following error:
Traceback (most recent call last):
  File "textsum_client.py", line 90, in <module>
    tf.app.run()
  File "/usr/local/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 48, in run
    _sys.exit(main(_sys.argv[:1] + flags_passthrough))
  File "textsum_client.py", line 83, in main
    FLAGS.concurrency, FLAGS.num_tests)
  File "textsum_client.py", line 72, in do_singleDecode
    result = stub.Predict(request, 5.0)  # 5 seconds
  File "/usr/local/lib/python2.7/site-packages/grpc/beta/_client_adaptations.py", line 324, in __call__
    self._request_serializer, self._response_deserializer)
  File "/usr/local/lib/python2.7/site-packages/grpc/beta/_client_adaptations.py", line 210, in _blocking_unary_unary
    raise _abortion_error(rpc_error_call)
grpc.framework.interfaces.face.face.AbortionError: AbortionError(code=StatusCode.INVALID_ARGUMENT, details="input size does not match signature")
I believe it may have something to do with the building of tf_example in my export_textsum file, but I honestly have not had any luck figuring this out yet. Does anyone with a bit more experience know what I am doing wrong here? If you have any ideas to help me narrow down exactly what is going on, I am open to any advice. Thanks.
textsumclient.py
from __future__ import print_function
import sys
import threading
# This is a placeholder for a Google-internal import.
from grpc.beta import implementations
import numpy
import tensorflow as tf
from datetime import datetime
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2
#from tensorflow_serving.example import mnist_input_data
tf.app.flags.DEFINE_integer('concurrency', 1,
'maximum number of concurrent inference requests')
tf.app.flags.DEFINE_integer('num_tests', 10, 'Number of test images')
tf.app.flags.DEFINE_string('server', '172.17.0.2:9000', 'PredictionService host:port')
tf.app.flags.DEFINE_string('work_dir', '/tmp', 'Working directory. ')
FLAGS = tf.app.flags.FLAGS
def do_singleDecode(hostport, work_dir, concurrency, num_tests):
#Connect to server
host, port = hostport.split(':')
channel = implementations.insecure_channel(host, int(port))
stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)
#Prepare our request object
request = predict_pb2.PredictRequest()
request.model_spec.name = 'textsum_model'
request.model_spec.signature_name = 'predict'
#Make some test data
test_data_set = ['This is a test','This is a sample']
#Lets test her out
now = datetime.now()
article, abstract = test_data_set
#***** POPULATE REQUEST INPUTS *****
request.inputs['article'].CopyFrom(
tf.contrib.util.make_tensor_proto(test_data_set[0], shape=[len(test_data_set[0])]))
request.inputs['abstract'].CopyFrom(
tf.contrib.util.make_tensor_proto(test_data_set[1], shape=[len(test_data_set[1])]))
result = stub.Predict(request, 5.0) # 5 seconds
waiting = datetime.now() - now
return result, waiting.microseconds
def main(_):
if not FLAGS.server:
print('please specify server host:port')
return
result, waiting = do_singleDecode(FLAGS.server, FLAGS.work_dir,
FLAGS.concurrency, FLAGS.num_tests)
print('\nTextsum result: %s%%' % result)
print('Waiting time is: ', waiting, 'microseconds.')
if __name__ == '__main__':
tf.app.run()
export_textsum.py
decode_mdl_hps = hps
# Only need to restore the 1st step and reuse it since
# we keep and feed in state for each step's output.
decode_mdl_hps = hps._replace(dec_timesteps=1)
model = seq2seq_attention_model.Seq2SeqAttentionModel(
decode_mdl_hps, vocab, num_gpus=FLAGS.num_gpus)
decoder = seq2seq_attention_decode.BSDecoder(model, batcher, hps, vocab)
serialized_output = tf.placeholder(tf.string, name='tf_output')
serialized_tf_example = tf.placeholder(tf.string, name='tf_example')
feature_configs = {
'article': tf.FixedLenFeature(shape=[1], dtype=tf.string),
'abstract': tf.FixedLenFeature(shape=[1], dtype=tf.string),
}
tf_example = tf.parse_example(serialized_tf_example, feature_configs)
saver = tf.train.Saver()
config = tf.ConfigProto(allow_soft_placement = True)
with tf.Session(config = config) as sess:
# Restore variables from training checkpoints.
ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
if ckpt and ckpt.model_checkpoint_path:
saver.restore(sess, ckpt.model_checkpoint_path)
global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
print('Successfully loaded model from %s at step=%s.' %
(ckpt.model_checkpoint_path, global_step))
else:
print('No checkpoint file found at %s' % FLAGS.checkpoint_dir)
return
# ************** EXPORT MODEL ***************
export_path = os.path.join(FLAGS.export_dir,str(FLAGS.export_version))
print('Exporting trained model to %s' % export_path)
#-------------------------------------------
tensor_info_inputs = tf.saved_model.utils.build_tensor_info(serialized_tf_example)
tensor_info_outputs = tf.saved_model.utils.build_tensor_info(serialized_output)
prediction_signature = (
tf.saved_model.signature_def_utils.build_signature_def(
inputs={ tf.saved_model.signature_constants.PREDICT_INPUTS: tensor_info_inputs},
outputs={tf.saved_model.signature_constants.PREDICT_OUTPUTS:tensor_info_outputs},
method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME
))
#----------------------------------
legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
builder = saved_model_builder.SavedModelBuilder(export_path)
builder.add_meta_graph_and_variables(
sess=sess,
tags=[tf.saved_model.tag_constants.SERVING],
signature_def_map={
'predict':prediction_signature,
},
legacy_init_op=legacy_init_op)
builder.save()
print('Successfully exported model to %s' % export_path)
except:
traceback.print_exc()
pass
def main(_):
Export()
if __name__ == "__main__":
tf.app.run()
It looks like you should specify a shape of [1] both in your client and graph definition.
export_textsum.py
feature_configs = {
'article': tf.FixedLenFeature(shape=[1], dtype=tf.string),
'abstract': tf.FixedLenFeature(shape=[1], dtype=tf.string),
}
textsumclient.py
request.inputs['article'].CopyFrom(
tf.contrib.util.make_tensor_proto([test_data_set[0]], shape=[1]))
request.inputs['abstract'].CopyFrom(
tf.contrib.util.make_tensor_proto([test_data_set[1]], shape=[1]))
Or perhaps using shape=[len(test_data_set[0])] would be more appropriate
QuantumLicht, I again just want to thank you for your assistance here, as it was one part of my issue. It seemed to have something to do with the keys used in the feature config. I am still using TF 1.2, and I remember reading some time back that fixes were made in newer versions so that proper key names can be used. That said, as I debugged I noticed that it was expecting a single input named "inputs". So I removed "abstract" and mapped the article to "inputs". I then had to modify the output of decode, and the final issue was that I was only loading the model but never actually running the function against it to get back the output that I needed to send into tensor_info_outputs.
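For reference, here is a hedged illustration of the mismatch described in that comment (not the poster's exact fix): the signature exported above keys its single input on tf.saved_model.signature_constants.PREDICT_INPUTS, whose value is the literal string 'inputs', so a client built like textsumclient.py would need to populate that key instead of 'article' and 'abstract'.
# Illustrative fragment only, reusing the request/test_data_set names from
# textsumclient.py above; the exported signature_def exposes one input keyed
# on PREDICT_INPUTS ('inputs'), so the request must use that same key.
request.model_spec.signature_name = 'predict'
request.inputs['inputs'].CopyFrom(
    tf.contrib.util.make_tensor_proto([test_data_set[0]], shape=[1]))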
To get started with TF, I wanted to learn a predictor of match outcomes for a game. There are three features: the 5 heroes on team 0, the 5 heroes on team 1, and the map. The winner is the label, 0 or 1. I want to represent the teams and the maps as SparseTensors. Out of a possible 71 heroes, five will be selected. Likewise for maps: out of a possible 13, one will be selected.
import tensorflow as tf
import packunpack as source
import tempfile
from collections import namedtuple
GameRecord = namedtuple('GameRecord', 'team_0 team_1 game_map winner')
def parse(line):
parts = line.rstrip().split("\t")
return GameRecord(
game_map = parts[1],
team_0 = parts[2].split(","),
team_1 = parts[3].split(","),
winner = int(parts[4]))
def conjugate(record):
return GameRecord(
team_0 = record.team_1,
team_1 = record.team_0,
game_map = record.game_map,
winner = 0 if record.winner == 1 else 1)
def sparse_team(team):
indices = list(map(lambda x: [x], map(source.encode_hero, team)))
return tf.SparseTensor(indices=indices, values = [1] * len(indices), dense_shape=[len(source.heroes_array)])
def sparse_map(map_name):
return tf.SparseTensor(indices=[[source.encode_hero(map_name)]], values = [1], dense_shape=[len(source.maps_array)])
def make_input_fn(filename, shuffle = True, add_conjugate_games = True):
def _fn():
records = []
with open(filename, "r") as raw:
i = 0
for line in raw:
record = parse(line)
records.append(record)
if add_conjugate_games:
# since 0 and 1 are arbitrary team labels, learn and test the conjugate game whenever
# learning the original inference
records.append(conjugate(record))
print("Making team 0")
team_0s = tf.constant(list(map(lambda r: sparse_team(r.team_0), records)))
print("Making team 1")
team_1s = tf.constant(list(map(lambda r: sparse_team(r.team_1), records)))
print("making maps")
maps = tf.constant(list(map(lambda r: sparse_map(r.game_map), records)))
print("Making winners")
winners = tf.constant(list(map(lambda r: tf.constant([r.winner]), records)))
return {
"team_0": team_0s,
"team_1": team_1s,
"game_map": maps,
}, winners
#Please help me finish this function?
return _fn
team_0 = tf.feature_column.embedding_column(
tf.feature_column.categorical_column_with_vocabulary_list("team_0", source.heroes_array), len(source.heroes_array))
team_1 = tf.feature_column.embedding_column(
tf.feature_column.categorical_column_with_vocabulary_list("team_1", source.heroes_array), len(source.heroes_array))
game_map = tf.feature_column.embedding_column(
tf.feature_column.categorical_column_with_vocabulary_list("game_map", source.maps_array), len(source.maps_array))
model_dir = tempfile.mkdtemp()
m = tf.estimator.DNNClassifier(
model_dir=model_dir,
hidden_units = [1024, 512, 256],
feature_columns=[team_0, team_1, game_map])
def main():
m.train(input_fn=make_input_fn("tiny.txt"), steps = 100)
if __name__ == "__main__":
main()
This fails on team_0s = tf.constant(list(map(lambda r: sparse_team(r.team_0), records)))
It's very difficult to understand what tf wants me to return in my input_fn, because all of the examples I can find in the docs ultimately call out to a pandas or numpy helper function, and I'm not familiar with those frameworks. I thought that each dictionary value should be a Tensor containing all examples of a single feature. Each of my examples is a SparseTensor, and I want to simply embed them as their dense versions for the sake of the DNNClassifier.
I'm sure my mental model is horribly broken right now, and I appreciate any help setting it straight.
Error output:
python3 estimator.py
Making team 0
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/tensor_util.py", line 468, in make_tensor_proto
str_values = [compat.as_bytes(x) for x in proto_values]
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/tensor_util.py", line 468, in <listcomp>
str_values = [compat.as_bytes(x) for x in proto_values]
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/compat.py", line 65, in as_bytes
(bytes_or_text,))
TypeError: Expected binary or unicode string, got <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7fe8b4d7aef0>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "estimator.py", line 79, in <module>
main()
File "estimator.py", line 76, in main
m.train(input_fn=make_input_fn("tiny.txt"), steps = 100)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 302, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 709, in _train_model
input_fn, model_fn_lib.ModeKeys.TRAIN)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 577, in _get_features_and_labels_from_input_fn
result = self._call_input_fn(input_fn, mode)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 663, in _call_input_fn
return input_fn(**kwargs)
File "estimator.py", line 44, in _fn
team_0s = tf.constant(list(map(lambda r: sparse_team(r.team_0), records)))
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/constant_op.py", line 208, in constant
value, dtype=dtype, shape=shape, verify_shape=verify_shape))
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/tensor_util.py", line 472, in make_tensor_proto
"supported type." % (type(values), values))
TypeError: Failed to convert object of type <class 'list'> to Tensor. Contents: [<tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7fe8b4d7aef0>, <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7fe8b4d7af28>, <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7fe8b4d7af60>, <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7fe8b4d7aeb8> ...]
Ultimately it wasn't necessary to convert my text representation into sparse vectors in my input_fn. Instead I had to tell the model to expect an input of an array of strings, which it understands how to convert into a "bag of words" or n-hot vector and how to embed as dense vectors.
import tensorflow as tf
import tempfile
import os
import packunpack as source  # provides heroes_array and maps_array, as in the original script
from collections import namedtuple
GameRecord = namedtuple('GameRecord', 'team_0 team_1 game_map winner')
def parse(line):
parts = line.rstrip().split("\t")
return GameRecord(
game_map = parts[1],
team_0 = parts[2].split(","),
team_1 = parts[3].split(","),
winner = int(parts[4]))
def conjugate(record):
return GameRecord(
team_0 = record.team_1,
team_1 = record.team_0,
game_map = record.game_map,
winner = 0 if record.winner == 1 else 1)
def make_input_fn(filename, batch_size=128, shuffle = True, add_conjugate_games = True, epochs=1):
def _fn():
records = []
with open(filename, "r") as raw:
i = 0
for line in raw:
record = parse(line)
records.append(record)
if add_conjugate_games:
records.append(conjugate(record))
team_0s = tf.constant(list(map(lambda r: r.team_0, records)))
team_1s = tf.constant(list(map(lambda r: r.team_1, records)))
maps = tf.constant(list(map(lambda r: r.game_map, records)))
        winners = tf.constant(list(map(lambda r: [r.winner], records)))
return {
"team_0": team_0s,
"team_1": team_1s,
"game_map": maps,
}, winners
return _fn
team_0 = tf.feature_column.embedding_column(
tf.feature_column.categorical_column_with_vocabulary_list("team_0", source.heroes_array), dimension=len(source.heroes_array))
team_1 = tf.feature_column.embedding_column(
tf.feature_column.categorical_column_with_vocabulary_list("team_1", source.heroes_array), dimension=len(source.heroes_array))
game_map = tf.feature_column.embedding_column(
tf.feature_column.categorical_column_with_vocabulary_list("game_map", source.maps_array), dimension=len(source.maps_array))
model_dir = "DNNClassifierModel_00"
os.mkdir(model_dir)
m = tf.estimator.DNNClassifier(
model_dir=model_dir,
hidden_units = [1024, 512, 256],
feature_columns=[team_0, team_1, game_map])
def main():
m.train(input_fn=make_input_fn("training.txt"))
results = m.evaluate(input_fn=make_input_fn("validation.txt"))
print("model directory = %s" % model_dir)
for key in sorted(results):
print("%s: %s" % (key, results[key]))
if __name__ == "__main__":
main()
Note that this code isn't perfect yet. I need to add in batching.
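One possible way to add that batching, sketched under the assumption that the parse/conjugate helpers and GameRecord above are reused (make_batched_input_fn is a hypothetical name, not part of the original code), is to return a tf.data.Dataset from the input_fn:
def make_batched_input_fn(filename, batch_size=128, epochs=1, shuffle=True,
                          add_conjugate_games=True):
    def _fn():
        records = []
        with open(filename, "r") as raw:
            for line in raw:
                record = parse(line)
                records.append(record)
                if add_conjugate_games:
                    records.append(conjugate(record))
        features = {
            "team_0": [r.team_0 for r in records],      # [N, 5] strings
            "team_1": [r.team_1 for r in records],      # [N, 5] strings
            "game_map": [r.game_map for r in records],  # [N] strings
        }
        labels = [[r.winner] for r in records]
        dataset = tf.data.Dataset.from_tensor_slices((features, labels))
        if shuffle:
            dataset = dataset.shuffle(buffer_size=len(records))
        # The Estimator accepts a Dataset of (features, labels) batches.
        return dataset.repeat(epochs).batch(batch_size)
    return _fn

# Usage would mirror the existing main():
#   m.train(input_fn=make_batched_input_fn("training.txt"))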