I would like to use "get_single_element" to get the one-batch dataset as a dict whose keys are the feature names and whose values are the feature tensors.
My code runs under TF 2.8, on an EC2 instance, from Jupyter.
import pandas as pd
import tensorflow as tf
df = pd.DataFrame({
    'id': [1, 2, 6, 3, 5, 0],
    'value_1': [10.892561, 7.210528, 1.2278101, -9.251782, 0.2118367, 6.9128551],
    'value_2': ['large', 'small', 'mid', 'small', 'large', 'mid'],
    'name_1': ['tyne', 'wnhp', 'ebhg', 'lpzhn', 'tyne', 'ebhg'],
    'label': [0, 1, 0, 1, 1, 1]
})
dataset = tf.data.Dataset.from_tensor_slices(dict(df))
dataset = dataset.batch(2)
print(type(dataset))
print(dataset)
print(len(dataset))
class Features(object):
    feature_data = {
        "id": tf.io.FixedLenFeature((1,), dtype=tf.int8),
        "value_1": tf.io.VarLenFeature(dtype=tf.float32),
        "value_2": tf.io.FixedLenFeature((1,), dtype=tf.string),
        "name_1": tf.io.FixedLenFeature((1,), dtype=tf.string)
    }
    label_data = {"label": tf.io.FixedLenFeature((1,), dtype=tf.int8)}

def process_sample(ds):
    print(f"ds type is {type(ds)}")
    features = tf.io.parse_single_example(ds, Features.feature_data)  # error!
    labels = tf.io.parse_single_example(ds, Features.label_data)['label']
    return (features, labels)
dataset = dataset.map(lambda x: process_sample(x), num_parallel_calls=tf.data.AUTOTUNE)
dataset = tf.data.Dataset.get_single_element(dataset.batch(len(dataset)))
print(f"dataset type is {type(dataset)} dataset is {dataset}")
def get_dict(input_ds):
    feature_dict_tensors = dict(input_ds)
    print(f"\nfeature_dict_tensors type is {type(feature_dict_tensors)}, feature_dict_tensors is {feature_dict_tensors}")
    return feature_dict_tensors
ds = dataset.map(get_dict)
print(f"ds type is {type(ds)}")
print(ds)
I get the following error:
File "<ipython-input-4-e9407f42e0a7>", line 37, in None *
lambda x: process_sample(x), num_parallel_calls=tf.data.AUTOTUNE)
File "<ipython-input-4-e9407f42e0a7>", line 33, in process_sample *
features = tf.io.parse_single_example(ds, Features.feature_data)
TypeError: Expected any non-tensor type, but got a tensor instead.
Based on https://www.tensorflow.org/api_docs/python/tf/io/parse_single_example, the first argument should be "A scalar string Tensor, a single serialized Example."
Why am I getting this error?
Thanks
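A hedged guess at the cause, with a minimal sketch (an assumption, not a verified fix): from_tensor_slices(dict(df)) already yields dicts of tensors, not serialized tf.train.Example protos, so parse_single_example has nothing to parse, which is exactly what the TypeError complains about. The label can be split off directly in the map function instead:
def process_sample(ds):
    labels = ds.pop('label')   # ds keeps the remaining feature tensors
    return ds, labels

dataset = dataset.map(process_sample, num_parallel_calls=tf.data.AUTOTUNE)
# collapse everything into one element: a (feature dict, labels) pair
features, labels = tf.data.Dataset.get_single_element(dataset.unbatch().batch(len(df)))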
I am trying to create an estimated distribution function from the data in data.dat using Gaussian process regression and set it as the objective function for PSO, but I keep getting the error below.
I would like to create a function that returns LD when I pass x, but it doesn't work.
In PSO the positions of all particles are passed at once, so I split x row by row and return an LD for each particle.
I tried to define the function by filling an empty array, result; it might be wrong to start from empty data.
Code
import numpy as np
import pandas as pd
import pyswarms as ps
import GPy
# define the random seed to fix
np.random.seed(0)
# optimization conditions
n_particles = 5
n = n_particles
iters = 10
bounds = (np.array([10,6,2,12,38,0,3.6,4,8,0]),np.array([18,13,7,18,42,9,9,10,18,7.5]))#(min,max)
# Determine hyperparameter
options = {"c1": 0.5, "c2": 0.3, "w":0.9}
# Input the dimensions of design parameters
ndim = 10
Ndim = np.arange(ndim)
# import the data
data = pd.read_csv('data.dat', header=None, sep=" ")
data.columns = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"]
print(data.shape) # (96,12)
N = len(data)
# import the input values to X and the output values to LD
X = data.iloc[:, 1:11]
fn = data.loc[:, "12"]
# change the data type from 'list' to 'numpy.ndarray'
x = np.array(X, dtype=float)
Y = np.array(fn, dtype=float)
# reshape the output into a column vector
y = Y.reshape(N, 1)
# choose the kernel for the regression
kernel = GPy.kern.RBF(ndim, ARD=True)
# create a model from the data by Gaussian Process Regression with this kernel
m = GPy.models.GPRegression(x, y, kernel)
# define an objective function
def func2(x):
    print(x.shape)  # (5, 10)
    for i in range(n):
        xi = x[i, :]
        xi = xi.reshape(1, len(xi))
        print(xi.shape)  # (1, 10)
        LD = m.predict(xi, include_likelihood=False)
        # print('LD =', LD)      # tuple
        # print('LD0 =', LD[0])  # list
        result = np.empty((n, 1))
        result[i, 0] = LD[0]
    print(result.shape)  # (5, 1)
    return result
# start optimization
optimizer = ps.single.GlobalBestPSO(n_particles=n, dimensions=ndim, options=options, bounds=bounds)
cost, pos = optimizer.optimize(objective_func=func2, iters=iters)
Error
Traceback (most recent call last):
File "e:\PSO\approx4pso.py", line 132, in <module>
cost, pos = optimizer.optimize (objective_func = func2 , iters = iters)
File "C:\Users\taku_\anaconda3\lib\site-packages\pyswarms\single\global_best.py", line 210, in optimize
self.swarm.pbest_pos, self.swarm.pbest_cost = compute_pbest(self.swarm)
File "C:\Users\taku_\anaconda3\lib\site-packages\pyswarms\backend\operators.py", line 69, in compute_pbest
new_pbest_pos = np.where(~mask_pos, swarm.pbest_pos, swarm.position)
File "<__array_function__ internals>", line 180, in where
ValueError: operands could not be broadcast together with shapes (5,10,5) (5,10) (5,10)
How do I define the function? I don't have to stick to this approach, so if this code can't work, please let me know of other ways to define an objective function using Gaussian process regression.
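A hedged sketch of a possible fix (an assumption, not a verified solution): pyswarms expects the objective to return a 1-D cost array of shape (n_particles,); returning result with shape (n, 1), and re-allocating it inside the loop, would explain the broadcast error inside compute_pbest. With m and n taken from the code above:
def func2(x):
    result = np.empty(n)                 # allocate once, outside the loop
    for i in range(n):
        xi = x[i, :].reshape(1, -1)      # one particle, shape (1, ndim)
        mean, _ = m.predict(xi, include_likelihood=False)
        result[i] = mean[0, 0]           # predicted LD for particle i
    return result                        # shape (n,), not (n, 1)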
I'm trying to code a layer to interface between a data set (numerical and categorical features) and a model, so the data can be fed in.
I can't understand the error I get when it comes to categorical columns.
ValueError: Exception encountered when calling layer (type CategoryEncoding).
When output_mode is not 'int', maximum supported output rank is 2. Received
output_mode multi_hot and input shape (10, 7, 1), which would result in output rank 3.
From what I understand, the batch size should not be counted toward the output rank, but it is, and that seems to break the layer.
Note that reproducing with only numerical features works fine.
Thank you for your help.
import tensorflow as tf
import pandas as pd
import numpy as np
# Simulate a data set of categorical and numerical values
# Configure simulation specifications: {feature: number of unique categories or None for numerical}
theSimSpecs = {'Cat1': 54, 'Cat2': 2, 'Cat3': 4, 'Num1': None, 'Num2': None}
# theSimSpecs = {'Num1': None, 'Num2': None}
# batch size and timesteps
theBatchSz, theTimeSteps = 10, 4
# Creation of the dataset as pandas.DataFrame
theDFs = []
for theFeature, theUniques in theSimSpecs.items():
    if theUniques is None:
        theDF = pd.DataFrame(np.random.random(size=theBatchSz * theTimeSteps), columns=[theFeature])
    else:
        theDF = pd.DataFrame(np.random.randint(low=0, high=theUniques, size=theBatchSz * theTimeSteps),
                             columns=[theFeature]).astype('category')
    theDFs.append(theDF)
theDF = pd.concat(theDFs, axis=1)
# code excerpt
# inventory of the categorical features' values ( None for the numerical)
theCatCodes = {theCol: (theDF[theCol].unique().tolist() if str(theDF[theCol].dtypes) == "category" else None)
for theCol in theDF.columns}
# Creation of the batched tensorflow.data.Dataset
theDS = tf.data.Dataset.from_tensor_slices(dict(theDF))
theDS = theDS.window(size=theTimeSteps, shift=1, stride=1, drop_remainder=True)
theDS = theDS.flat_map(lambda x: tf.data.Dataset.zip(x))
theDS = theDS.batch(batch_size=theTimeSteps, drop_remainder=True)
theDS = theDS.batch(batch_size=theBatchSz, drop_remainder=True)
# extracting one batch
theBatch = next(iter(theDS))
tf.print(theBatch)
# Creation of the components for the interface layer
theFeaturesInputs = {}
theFeaturesEncoded = {}
for theFeature, theCodes in theCatCodes.items():
    if theCodes is None:  # pass-through for numerical features
        theNumInput = tf.keras.layers.Input(shape=[], dtype=tf.float32, name=theFeature)
        theFeaturesInputs[theFeature] = theNumInput
        theFeatureExp = tf.expand_dims(input=theNumInput, axis=-1)
        theFeaturesEncoded[theFeature] = theFeatureExp
    else:  # encoding for categorical features
        theCatInput = tf.keras.layers.Input(shape=[], dtype=tf.int64, name=theFeature)
        theFeaturesInputs[theFeature] = theCatInput
        theFeatureExp = tf.expand_dims(input=theCatInput, axis=-1)
        theEncodingLayer = tf.keras.layers.CategoryEncoding(num_tokens=theSimSpecs[theFeature],
                                                            name=f"{theFeature}_enc",
                                                            output_mode="multi_hot", sparse=False)
        theFeaturesEncoded[theFeature] = theEncodingLayer(theFeatureExp)
theStackedInputs = tf.concat(tf.nest.flatten(theFeaturesEncoded), axis=1)
theModel = tf.keras.Model(inputs=theFeaturesInputs, outputs=theStackedInputs)
theOutput = theModel(theBatch)
tf.print(theOutput)
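A hedged sketch of one possible workaround (an assumption, not a confirmed fix): since CategoryEncoding with output_mode="multi_hot" only supports output rank 2, the time axis can be folded into the batch axis before encoding and restored afterwards. theTokens below is a stand-in for theSimSpecs[theFeature]:
import tensorflow as tf

theTimeSteps, theTokens = 4, 54  # assumed values matching the simulation above
theCatInput = tf.keras.layers.Input(shape=[theTimeSteps], dtype=tf.int64, name="Cat1")
theFlat = tf.reshape(theCatInput, [-1, 1])                      # (batch*T, 1): rank 2
theEncoded = tf.keras.layers.CategoryEncoding(
    num_tokens=theTokens, output_mode="multi_hot")(theFlat)     # (batch*T, tokens)
theSeq = tf.reshape(theEncoded, [-1, theTimeSteps, theTokens])  # (batch, T, tokens)
theModel = tf.keras.Model(inputs=theCatInput, outputs=theSeq)
print(theModel(tf.zeros([10, theTimeSteps], dtype=tf.int64)).shape)  # (10, 4, 54)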
I'm trying to train a LightGBM model on the Kaggle Iowa housing dataset and I wrote a small script to randomly try different parameters within a given range. I'm not sure what's wrong with my code, but the script returns the same score with different parameters, which shouldn't be happening. I tried the same script with Catboost and it works as expected, so I'm guessing the issue is with LGBM.
The code:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from random import choice, randrange, uniform
complete_train = pd.read_csv("train.csv", encoding="UTF-8", index_col="Id")
complete_test = pd.read_csv("test.csv", encoding="UTF-8", index_col="Id")

def encode_impute(*datasets):
    for dataset in datasets:
        for column in dataset.columns:
            dataset[column].fillna(-999, inplace=True)
            if dataset[column].dtype == "object":
                dataset[column] = dataset[column].astype("category", copy=False)

encode_impute(complete_train, complete_test)
X = complete_train.drop(columns="SalePrice")
y = complete_train["SalePrice"]
X_train, X_valid, y_train, y_valid = train_test_split(X, y)
def objective():
    while True:
        params = {
            "boosting_type": choice(["gbdt", "goss", "dart", "rf"]),
            "num_leaves": randrange(10000),
            "learning_rate": uniform(0.01, 1),
            "subsample_for_bin": randrange(100000000),
            "min_data_in_leaf": randrange(100000000),
            "reg_alpha": uniform(0, 1),
            "reg_lambda": uniform(0, 1),
            "feature_fraction": uniform(0, 1),
            "bagging_fraction": uniform(0, 1),
            "bagging_freq": randrange(1, 100)}
        params["bagging_fraction"] = 1.0 if params["boosting_type"] == "goss" else params["bagging_fraction"]
        model = LGBMRegressor().set_params(**params)
        model.fit(X_train, y_train)
        predictions = model.predict(X_valid)
        error_rate = mean_absolute_error(y_valid, predictions)
        print(f"Score = {error_rate} with parameters: {params}", "\n" * 5)

objective()
Example of the output I'm getting:
Score = 55967.70375930444 with parameters: {'boosting_type': 'gbdt', 'num_leaves': 6455, 'learning_rate': 0.2479700848039991, 'subsample_for_bin': 83737077, 'min_data_in_leaf': 51951103, 'reg_alpha': 0.1856001984332697, 'reg_lambda': 0.7849262049058852, 'feature_fraction': 0.10550627738309537, 'bagging_fraction': 0.2613298736131875, 'bagging_freq': 96}
Score = 55967.70375930444 with parameters: {'boosting_type': 'dart', 'num_leaves': 9678, 'learning_rate': 0.28670432435369037, 'subsample_for_bin': 24246091, 'min_data_in_leaf': 559094, 'reg_alpha': 0.07261459695501371, 'reg_lambda': 0.8834743560240725, 'feature_fraction': 0.5361519020265366, 'bagging_fraction': 0.9120030047714073, 'bagging_freq': 10}
Score = 55967.70375930444 with parameters: {'boosting_type': 'goss', 'num_leaves': 4898, 'learning_rate': 0.09237499846487345, 'subsample_for_bin': 32620066, 'min_data_in_leaf': 71317820, 'reg_alpha': 0.9818297737748625, 'reg_lambda': 0.11638265354331834, 'feature_fraction': 0.4230342728468828, 'bagging_fraction': 1.0, 'bagging_freq': 64}
I would point out that the min_data_in_leaf parameter in all of these runs seems very high, and I suspect that the model is not learning anything and is just returning the average value of the response variable from the root node alone.
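If that suspicion is right, a hedged sketch of saner sampling ranges (assumed, untuned values) would look like this; the key change is keeping min_data_in_leaf far below the roughly 1,100 rows left in X_train after the split, so the trees can actually grow:
from random import choice, randrange, uniform

params = {
    "boosting_type": choice(["gbdt", "dart", "goss"]),
    "num_leaves": randrange(8, 256),
    "learning_rate": uniform(0.01, 0.3),
    "subsample_for_bin": randrange(20000, 300000),
    "min_data_in_leaf": randrange(5, 100),   # must stay well below len(X_train)
    "reg_alpha": uniform(0, 1),
    "reg_lambda": uniform(0, 1),
    "feature_fraction": uniform(0.4, 1.0),
    "bagging_fraction": uniform(0.4, 1.0),
    "bagging_freq": randrange(1, 10),
}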
I have a TensorFlow model on GCP ML Engine, however I have a problem with the JSON string below:
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
import json
credentials = GoogleCredentials.get_application_default()
api = discovery.build('ml', 'v1', credentials=credentials,
                      discoveryServiceUrl='https://storage.googleapis.com/cloud-ml/discovery/ml_v1_discovery.json')
request_data = {'instances':
    [{
        'inputs': {
            'clump_thickness': 2,
            'size_uniformity': 1,
            'shape_uniformity': 1,
            'marginal_adhesion': 1,
            'epithelial_size': 2,
            'bland_chromatin': 1,
            'bare_nucleoli': 2,
            'normal_nucleoli': 1,
            'mitoses': 1
        }
    }]
}
parent = 'projects/%s/models/%s/versions/%s' % (PROJECT,
    'breastCancer_optimized_06152018_2_2_a', 'v1')
response = api.projects().predict(body=request_data, name=parent).execute()
print(response)
I get the following error:
{'error': "Prediction failed: Error processing input: Expected string, got {u'epithelial_size': 2, u'marginal_adhesion': 1, u'clump_thickness': 2, u'size_uniformity': 1, u'shape_uniformity': 1, u'normal_nucleoli': 1, u'mitoses': 1, u'bland_chromatin': 1, u'bare_nucleoli': 2} of type 'dict' instead."}
I can't seem to format request_data properly. Does anyone see what is wrong?
original serving function:
clump_thickness = tf.feature_column.numeric_column("clump_thickness")
size_uniformity = tf.feature_column.numeric_column("size_uniformity")
shape_uniformity = tf.feature_column.numeric_column("shape_uniformity")
marginal_adhesion = tf.feature_column.numeric_column("marginal_adhesion")
epithelial_size = tf.feature_column.numeric_column("epithelial_size")
bare_nucleoli = tf.feature_column.numeric_column("bare_nucleoli")
bland_chromatin = tf.feature_column.numeric_column("bland_chromatin")
normal_nucleoli = tf.feature_column.numeric_column("normal_nucleoli")
mitoses = tf.feature_column.numeric_column("mitoses")
feature_columns = [clump_thickness, size_uniformity, shape_uniformity, marginal_adhesion, epithelial_size,
                   bare_nucleoli, bland_chromatin, normal_nucleoli, mitoses]
feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
export_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
estimator.export_savedmodel(output_dir, export_input_fn, as_text=False)
Then I tried:
def serving_input_fn():
    feature_placeholders = {
        'clump_thickness': tf.placeholder(tf.float32, [None]),
        'size_uniformity': tf.placeholder(tf.float32, [None]),
        'shape_uniformity': tf.placeholder(tf.float32, [None]),
        'marginal_adhesion': tf.placeholder(tf.float32, [None]),
        'epithelial_size': tf.placeholder(tf.float32, [None]),
        'bare_nucleoli': tf.placeholder(tf.float32, [None]),
        'bland_chromatin': tf.placeholder(tf.float32, [None]),
        'normal_nucleoli': tf.placeholder(tf.float32, [None]),
        'mitoses': tf.placeholder(tf.float32, [None]),
    }
    features = feature_placeholders  # no transformation needed
    return tf.estimator.export.ServingInputReceiver(features, feature_placeholders)
And in the train_and_eval function:
estimator.export_savedmodel(output_dir, serving_input_fn, as_text=False)
But now I get the following error:
{'error': "Prediction failed: Expected tensor name: inputs, got tensor name: [u'epithelial_size', u'marginal_adhesion', u'clump_thickness', u'size_uniformity', u'shape_uniformity', u'normal_nucleoli', u'mitoses', u'bland_chromatin', u'bare_nucleoli']."}
The estimator.export_savedmodel appears to create a model which requires a tensor input (in the request_data line).
When I use the model created with either serving function the following works fine:
predict_fn = tf.contrib.predictor.from_saved_model("gs://test-203900/breastCancer_optimized_06182018/9/1529432417")
# Test inputs represented by Pandas DataFrame.
inputs = pd.DataFrame({
    'clump_thickness': [2, 5, 4],
    'size_uniformity': [1, 10, 8],
    'shape_uniformity': [1, 10, 6],
    'marginal_adhesion': [1, 3, 4],
    'epithelial_size': [2, 7, 3],
    'bland_chromatin': [1, 3, 4],
    'bare_nucleoli': [2, 8, 10],
    'normal_nucleoli': [1, 10, 6],
    'mitoses': [1, 2, 1],
})
# Convert input data into serialized Example strings.
examples = []
for index, row in inputs.iterrows():
    feature = {}
    for col, value in row.iteritems():
        feature[col] = tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    examples.append(example.SerializeToString())
# Make predictions.
predictions = predict_fn({'inputs': examples})
It depends on what your serving input function is. It appears from the error message that 'inputs' needs to be a string (maybe comma-separated?)
Try this:
saved_model_cli show --dir $MODEL_LOCATION --tag_set serve --signature_def serving_default
It will tell you what your serving input function is set to.
I suspect that what you want is for your serving input function to be:
def serving_input_fn():
    feature_placeholders = {
        'size_uniformity': tf.placeholder(tf.float32, [None]),
        'shape_uniformity': tf.placeholder(tf.float32, [None])
    }
    features = feature_placeholders  # no transformation needed
    return tf.estimator.export.ServingInputReceiver(features, feature_placeholders)
and for your input format to be:
request_data = {'instances':
    [
        {
            'size_uniformity': 1,
            'shape_uniformity': 1
        }
    ]
}
I'm using the commits from breznak for the encoders (I wasn't able to figure out "git checkout ..." with GitHub, so I just carefully copied over the three files: base.py, multi.py, and multi_test.py).
I ran multi_test.py without any problems.
Then I adjusted my model parameters (MODEL_PARAMS), so that the encoders portion of 'sensorParams' looks like this:
'encoders': {
    'frequency': {
        'fieldname': u'frequency',
        'type': 'SimpleVector',
        'length': 5,
        'minVal': 0,
        'maxVal': 210
    }
},
I also adjusted the modelInput portion of my code, so it looked like this:
model = ModelFactory.create(model_params.MODEL_PARAMS)
model.enableInference({'predictedField': 'frequency'})
y = [1,2,3,4,5]
modelInput = {"frequency": y}
result = model.run(modelInput)
But I get the following error, regardless of whether I instantiate 'y' as a list or a numpy.ndarray:
File "nta/eng/lib/python2.7/site-packages/nupic/encoders/base.py", line 183, in _getInputValue
return getattr(obj, fieldname)
AttributeError: 'list' object has no attribute 'idx0'
I also tried initializing a SimpleVector encoder inline with my modelInput, directly encoding my array, then passing it through modelInput. That violated the input parameters of my SimpleVector, because I was now double-encoding. So I removed the encoders portion of my model parameters dictionary. That failed too, because some part of my model was looking for that portion of the dictionary.
Any suggestions on what I should do next?
Edit: Here're the files I'm using with the OPF.
sendAnArray.py
import numpy
from nupic.frameworks.opf.modelfactory import ModelFactory
import model_params

class sendAnArray():
    def __init__(self):
        self.model = ModelFactory.create(model_params.MODEL_PARAMS)
        self.model.enableInference({'predictedField': 'frequency'})
        for i in range(100):
            self.run()

    def run(self):
        y = [1, 2, 3, 4, 5]
        modelInput = {"frequency": y}
        result = self.model.run(modelInput)
        anomalyScore = result.inferences['anomalyScore']
        print y, anomalyScore

sAA = sendAnArray()
model_params.py
MODEL_PARAMS = {
    'model': "CLA",
    'version': 1,
    'predictAheadTime': None,
    'modelParams': {
        'inferenceType': 'TemporalAnomaly',
        'sensorParams': {
            'verbosity': 0,
            'encoders': {
                'frequency': {
                    'fieldname': u'frequency',
                    'type': 'SimpleVector',
                    'length': 5,
                    'minVal': 0,
                    'maxVal': 210
                }
            },
            'sensorAutoReset': None,
        },
        'spEnable': True,
        'spParams': {
            'spVerbosity': 0,
            'globalInhibition': 1,
            'columnCount': 2048,
            'inputWidth': 5,
            'numActivePerInhArea': 60,
            'seed': 1956,
            'coincInputPoolPct': 0.5,
            'synPermConnected': 0.1,
            'synPermActiveInc': 0.1,
            'synPermInactiveDec': 0.01,
        },
        'tpEnable': True,
        'tpParams': {
            'verbosity': 0,
            'columnCount': 2048,
            'cellsPerColumn': 32,
            'inputWidth': 2048,
            'seed': 1960,
            'temporalImp': 'cpp',
            'newSynapseCount': 20,
            'maxSynapsesPerSegment': 32,
            'maxSegmentsPerCell': 128,
            'initialPerm': 0.21,
            'permanenceInc': 0.1,
            'permanenceDec': 0.1,
            'globalDecay': 0.0,
            'maxAge': 0,
            'minThreshold': 12,
            'activationThreshold': 16,
            'outputType': 'normal',
            'pamLength': 1,
        },
        'clParams': {
            'regionName': 'CLAClassifierRegion',
            'clVerbosity': 0,
            'alpha': 0.0001,
            'steps': '5',
        },
        'anomalyParams': {
            u'anomalyCacheRecords': None,
            u'autoDetectThreshold': None,
            u'autoDetectWaitRecords': 2184
        },
        'trainSPNetOnlyIfRequested': False,
    },
}
The problem seems to be that the SimpleVector class accepts an array instead of a dict as its input, and then reconstructs it internally as {'list': {'idx0': 1, 'idx1': 2, ...}} (i.e., as if this dict had been the input). That is fine if it is done consistently, but your error shows that it breaks down somewhere. Have a word with #breznak about this.
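A toy illustration (plain Python, inferred from the traceback rather than from NuPIC's actual internals) of why the AttributeError appears: _getInputValue resolves the encoder's sub-field names with getattr, which a bare list cannot satisfy.
y = [1, 2, 3, 4, 5]
getattr(y, 'idx0')  # AttributeError: 'list' object has no attribute 'idx0'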
Working through the OPF was difficult. I wanted to input an array of indices into the temporal pooler, so I opted to interface directly with the algorithms (I relied heavily on hello_tp.py). I ignored SimpleVector altogether and instead worked through the BitmapArray encoder.
Subutai has a useful email on the nupic-discuss listserv, where he breaks down the three main areas of the NuPIC API: algorithms, networks/regions, and the OPF. That helped me understand my options better.