Invalid parameter n_estimators for estimator Pipeline - pandas

I'm trying to create a pipline and to fine tune hyperparameters but I try to use fit, I get the error
ValueError: Invalid parameter n_esitmators for estimator Pipeline(steps=[('rfc', RandomForestClassifier())]). Check the list of available parameters with `estimator.get_params().keys()`.
I'd love to get some help with this please.
This is the code I'm using:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
#Classifier Pipeline
pipeline = Pipeline([
('rfc', RandomForestClassifier())
])
# Params for classifier
params = {'n_estimators': [5,20,50,100,150],
"max_depth": [1, 3,5,10,20,30,50],
"max_features": [1, 3,5,10,20,30,45],
"min_samples_split": [1, 3,5,10],
"min_samples_leaf": [1, 3,5,10]}
# Grid Search Execute
rf_rnd = RandomizedSearchCV(estimator=pipeline, param_distributions = params, cv=5, verbose=2, random_state=0, n_jobs=-1)
rf_rnd.fit(training_data, target)
Made some changes:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
#Classifier Pipeline
pipeline = Pipeline([
('rfc', RandomForestClassifier())
])
# Params for classifier
params = {'estimator__rfc__n_estimators': [5,20,50,100,150],
"estimator__rfc__max_depth": [1, 3,5,10,20,30,50],
"estimator__rfc__max_features": [1, 3,5,10,20,30,45],
"estimator__rfc__min_samples_split": [1, 3,5,10],
"estimator__rfc__min_samples_leaf": [1, 3,5,10]}
# Grid Search Execute
rf_rnd = RandomizedSearchCV(estimator=pipeline, param_distributions = params, cv=5, random_state=0, n_jobs=-1, return_train_score=True)
Now the error is:
ValueError: Invalid parameter estimator for estimator Pipeline(steps=[('rfc', RandomForestClassifier())]). Check the list of available parameters with estimator.get_params().keys().
And this is what estimator.get_params().keys() outputs:
dict_keys(['cv', 'error_score', 'estimator__memory', 'estimator__steps', 'estimator__verbose', 'estimator__rfc', 'estimator__rfc__bootstrap', 'estimator__rfc__ccp_alpha', 'estimator__rfc__class_weight', 'estimator__rfc__criterion', 'estimator__rfc__max_depth', 'estimator__rfc__max_features', 'estimator__rfc__max_leaf_nodes', 'estimator__rfc__max_samples', 'estimator__rfc__min_impurity_decrease', 'estimator__rfc__min_impurity_split', 'estimator__rfc__min_samples_leaf', 'estimator__rfc__min_samples_split', 'estimator__rfc__min_weight_fraction_leaf', 'estimator__rfc__n_estimators', 'estimator__rfc__n_jobs', 'estimator__rfc__oob_score', 'estimator__rfc__random_state', 'estimator__rfc__verbose', 'estimator__rfc__warm_start', 'estimator', 'iid', 'n_iter', 'n_jobs', 'param_distributions', 'pre_dispatch', 'random_state', 'refit', 'return_train_score', 'scoring', 'verbose'])

When you use a pipeline with any search (randomized or grid) the param_grid keys have to follow this syntax (step name)__(param name). So you should use rfc__n_estimators and so on to the other parameters.
Summarizing just remove the estimator__ part that you've added in your last modification

Related

Error finding attribute `feature_names_in_` that exists in docs

I'm getting the error AttributeError: 'LogisticRegression' object has no attribute 'feature_names_in_' even though that attribute is written in the docs.
I'm on scikit-learn version 1.0.2.
I created an object LogisticRegression and I am trying to use the documented attribute of feature_names_in_ but it's returning an error.
#imports
import numpy as np
import pandas as pd
import statistics
import scipy.sparse
from scipy.stats import chi2_contingency
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
# train_test_split()
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state = 42)
#create functions for preprocessing
# function to replace NaN's in the ordinal and interval data
def replace_NAN_median(X_df):
opinions = ['opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults',
'household_children']
for column in opinions:
X_df[column].replace(np.nan, X_df[column].median(), inplace = True)
return X_df
# function to replace NaN's in the catagorical data
def replace_NAN_mode(X_df):
miss_cat_features = ['education', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status']
for column in miss_cat_features:
X_df[column].replace(np.nan, statistics.mode(X_df[column]), inplace = True)
return X_df
# Instantiate transformers
NAN_median = FunctionTransformer(replace_NAN_median)
NAN_mode = FunctionTransformer(replace_NAN_mode)
col_transformer = ColumnTransformer(transformers=
# replace NaN's in the binary data
[("NAN_0", SimpleImputer(missing_values=np.nan, strategy='constant', fill_value = 0),
['behavioral_antiviral_meds', 'behavioral_avoidance','behavioral_face_mask' ,
'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home',
'behavioral_touch_face', 'doctor_recc_seasonal', 'chronic_med_condition',
'child_under_6_months', 'health_worker', 'health_insurance']),
# MinMaxScaler on our numeric ordinal and interval data
("scaler", MinMaxScaler(), ['opinion_seas_vacc_effective', 'opinion_seas_risk',
'opinion_seas_sick_from_vacc',
'household_adults', 'household_children']),
# OHE catagorical string data
("ohe", OneHotEncoder(sparse = False), ['age_group','education', 'race', 'sex',
'income_poverty', 'marital_status', 'rent_or_own',
'employment_status', 'census_msa'])],
remainder="passthrough")
# Preprocessing Pipeline
preprocessing_pipe = Pipeline(steps=[
("NAN_median", NAN_median),
("NAN_mode", NAN_mode),
("col_transformer", col_transformer)
])
# model
logreg_optimized_pipe = Pipeline(steps=[("preprocessing_pipe", preprocessing_pipe),
("log_reg", LogisticRegression(solver = 'liblinear', random_state = 42, C = 10, penalty= 'l1'))])
#fit model to training data
logreg_optimized_pipe.fit(X_train, y_train)
#trying to get feature names
logreg_optimized_pipe.named_steps["log_reg"].feature_names_in_
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-38-512bfaf5962d> in <module>
----> 1 logreg_optimized_pipe.named_steps["log_reg"].feature_names_in_
AttributeError: 'LogisticRegression' object has no attribute 'feature_names_in_'
I'm open to alternative suggestions on how to get the feature names as well.
Docs says the following:
feature_names_in_ndarray of shape (n_features_in_,)
Names of features seen during fit. Defined only when X has feature names that are all strings.
You should make sure that data that reaches model has names in.
Also, it is defined only when fit is called.
Link to the docs for your version 1.0.2
LogisticRegression
So it turns out that SimpleImputer returns an array - thereby removing the column names. I replaced SimpleImputer with a function to fix this. I wasn't able to figure out how to use .feature_names_in_ on the LogisticRegression() model, but it did work when I called it on the preprocessing pipeline ColumnTransformer, and most importantly I was able to use .get_feature_names_out() on the preprocessing pipeline to get the feature names that were fed into the model.
Code:
#imports
import numpy as np
import pandas as pd
import statistics
import scipy.sparse
from scipy.stats import chi2_contingency
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
# train_test_split()
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state = 42)
#create functions for preprocessing
# function to replace NaN's in the ordinal and interval data
def replace_NAN_median(X_df):
opinions = ['opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults',
'household_children']
for column in opinions:
X_df[column].replace(np.nan, X_df[column].median(), inplace = True)
return X_df
# function to replace NaN's in the catagorical data
def replace_NAN_mode(X_df):
miss_cat_features = ['education', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status']
for column in miss_cat_features:
X_df[column].replace(np.nan, statistics.mode(X_df[column]), inplace = True)
return X_df
# function to replace NaN's in the binary data
def replace_NAN_0(X_df):
miss_binary = ['behavioral_antiviral_meds', 'behavioral_avoidance','behavioral_face_mask' ,
'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home',
'behavioral_touch_face', 'doctor_recc_seasonal', 'chronic_med_condition',
'child_under_6_months', 'health_worker', 'health_insurance']
for column in miss_binary:
X_df[column].replace(np.nan, 0, inplace = True)
return X_df
# Instantiate transformers
NAN_median = FunctionTransformer(replace_NAN_median)
NAN_mode = FunctionTransformer(replace_NAN_mode)
NAN_0 = FunctionTransformer(replace_NAN_0)
col_transformer = ColumnTransformer(transformers= [
# MinMaxScaler on our numeric ordinal and interval data
("scaler", MinMaxScaler(), ['opinion_seas_vacc_effective', 'opinion_seas_risk',
'opinion_seas_sick_from_vacc',
'household_adults', 'household_children']),
# OHE catagorical string data
("ohe", OneHotEncoder(sparse = False), ['age_group','education', 'race', 'sex',
'income_poverty', 'marital_status', 'rent_or_own',
'employment_status', 'census_msa'])],
remainder="passthrough")
# Preprocessing Pipeline
preprocessing_pipe = Pipeline(steps=[
("NAN_median", NAN_median),
("NAN_mode", NAN_mode),
("NAN_0", NAN_0),
("col_transformer", col_transformer)
])
# model
logreg_optimized_pipe = Pipeline(steps=[("preprocessing_pipe", preprocessing_pipe),
("log_reg", LogisticRegression(solver = 'liblinear', random_state = 42, C = 10, penalty= 'l1'))])
#fit model to training data
logreg_optimized_pipe.fit(X_train, y_train)
#trying to get feature names
logreg_optimized_pipe.named_steps["preprocessing_pipe"][3].feature_names_in_
#output - feature names put into `ColumnTransformer`
array(['respondent_id', 'behavioral_antiviral_meds',
'behavioral_avoidance', 'behavioral_face_mask',
'behavioral_wash_hands', 'behavioral_large_gatherings',
'behavioral_outside_home', 'behavioral_touch_face',
'doctor_recc_seasonal', 'chronic_med_condition',
'child_under_6_months', 'health_worker', 'health_insurance',
'opinion_seas_vacc_effective', 'opinion_seas_risk',
'opinion_seas_sick_from_vacc', 'age_group', 'education', 'race',
'sex', 'income_poverty', 'marital_status', 'rent_or_own',
'employment_status', 'census_msa', 'household_adults',
'household_children'], dtype=object)
logreg_optimized_pipe.named_steps["preprocessing_pipe"][3].get_feature_names_out()
#output - feature names after `ColumnTransformer`
array(['scaler__opinion_seas_vacc_effective', 'scaler__opinion_seas_risk',
'scaler__opinion_seas_sick_from_vacc', 'scaler__household_adults',
'scaler__household_children', 'ohe__age_group_18 - 34 Years',
'ohe__age_group_35 - 44 Years', 'ohe__age_group_45 - 54 Years',
'ohe__age_group_55 - 64 Years', 'ohe__age_group_65+ Years',
'ohe__education_12 Years', 'ohe__education_< 12 Years',
'ohe__education_College Graduate', 'ohe__education_Some College',
'ohe__race_Black', 'ohe__race_Hispanic',
'ohe__race_Other or Multiple', 'ohe__race_White',
'ohe__sex_Female', 'ohe__sex_Male',
'ohe__income_poverty_<= $75,000, Above Poverty',
'ohe__income_poverty_> $75,000',
'ohe__income_poverty_Below Poverty', 'ohe__marital_status_Married',
'ohe__marital_status_Not Married', 'ohe__rent_or_own_Own',
'ohe__rent_or_own_Rent', 'ohe__employment_status_Employed',
'ohe__employment_status_Not in Labor Force',
'ohe__employment_status_Unemployed',
'ohe__census_msa_MSA, Not Principle City',
'ohe__census_msa_MSA, Principle City', 'ohe__census_msa_Non-MSA',
'remainder__respondent_id', 'remainder__behavioral_antiviral_meds',
'remainder__behavioral_avoidance',
'remainder__behavioral_face_mask',
'remainder__behavioral_wash_hands',
'remainder__behavioral_large_gatherings',
'remainder__behavioral_outside_home',
'remainder__behavioral_touch_face',
'remainder__doctor_recc_seasonal',
'remainder__chronic_med_condition',
'remainder__child_under_6_months', 'remainder__health_worker',
'remainder__health_insurance'], dtype=object)

XGboost + GridSearch : wired warning

Below is a code I wrote for Hyperparameter tuning of XGboost using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, auc
from pprint import pprint
from xgboost import XGBClassifier
import time
# instantiate XGBoost model
clf = XGBClassifier(missing=np.nan, nthreads=-1)
# Define scoring metrics
scorers = {
'accuracy_score': make_scorer(accuracy_score),
'precision_score': make_scorer(precision_score),
'recall_score': make_scorer(recall_score)
}
param_grid_dummy = {
"n_estimators": [25, 250],
"max_depth": [3,5],
"learning_rate": [0.0005, 0,005],
}
def random_search_wrapper(refit_score = 'precision_score'):
"""
fits a RandomizedSearchCV classifier using refit_score for optimization
prints classifier performance metrics
"""
rf_random = RandomizedSearchCV(estimator = clf, param_distributions = param_grid_dummy, n_iter = 3, scoring=scorers, refit = refit_score, cv = 3, return_train_score= True, n_jobs= -1)
rf_random.fit(X_train_df, Y_train)
# make the predictions
Y_pred = rf_random.predict(X_test_df)
print('Best params for {}'.format(refit_score))
print(rf_random.best_params_)
# confusion matrix on test data
print('\nConfusion matrix of Random Forest optimized for {} on the test data: '.format(refit_score))
print(pd.DataFrame(confusion_matrix(Y_test, Y_pred),
columns = ['pred_neg', 'pred_pos'], index = ['neg', 'pos']))
return rf_random
# Optimize classifier for recall score
start = time.time()
rf_random_cl = random_search_wrapper(refit_score='precision_score')
# Print time
end = time.time()
print()
print((end - start)/60, "minutes")
I get a wired warning.
/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
if diff:
Can someone pls help me understand what wrong am I doing here?
when I do simple clf.fit(X_train_df, Y_train). It works perfectly fine
This is an issue with sklearn version. few versions < 0.20.1 throw this this error
Code is correct.

ValueError: Cannot Convert String to Float With Pandas and Amazon Sagemaker

I'm trying to deploy a simple ML model on SageMaker to get the hang of it, and I am not having any luck because I get the following error:
ValueError: could not convert string to float: '6.320000000000000097e-03 1.800000000000000000e+01 2.310000000000000053e+00 0.000000000000000000e+00 5.380000000000000338e-01 6.575000000000000178e+00 6.520000000000000284e+01 4.089999999999999858e+00 1.000000000000000000e+00 2.960000000000000000e+02 1.530000000000000071e+01 3.968999999999999773e+02 4.980000000000000426e+00 2.400000000000000000e+01'
This is the first row of my dataframe.
This is the code in my notebook that I'm using right now:
from sagemaker import get_execution_role, Session
from sagemaker.sklearn.estimator import SKLearn
work_dir = 'data'
session = Session()
role = get_execution_role()
train_input = session.upload_data('data')
script = 'boston_housing_prep.py'
model = SKLearn(
entry_point = script,
train_instance_type = 'ml.c4.xlarge',
role = role,
sagemaker_session = session,
hyperparameters = {'alpha': 10}
)
model.fit({'train': train_input})
My script for boston_housing_prep.py looks like this:
import argparse
import pandas as pd
import os
from sklearn.linear_model import Ridge
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler
import numpy as np
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--alpha', type=int, default=1)
parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
args = parser.parse_args()
input_files = [ os.path.join(args.train, file) for file in os.listdir(args.train) ]
if len(input_files) == 0:
raise ValueError(('There are no files in {}.\n' +
'This usually indicates that the channel ({}) was incorrectly specified,\n' +
'the data specification in S3 was incorrectly specified or the role specified\n' +
'does not have permission to access the data.').format(args.train, "train"))
raw_data = [ pd.read_csv(file, header=None, engine="python") for file in input_files ]
df = pd.concat(raw_data)
y_train = df.iloc[:, -1]
X_train = df.iloc[:, :5]
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
alpha = args.alpha
clf = Ridge(alpha=alpha)
clf = clf.fit(X_train, y_train)
joblib.dump(clf, os.path.join(args.model_dir, "model.joblib"))
def model_fn(model_dir):
clf = joblib.load(os.path.join(model_dir, "model.joblib"))
return clf
The line that's giving the problem is this one:
X_train = scaler.fit_transform(X_train)
I tried df = df.astype(np.float) after I loaded in the df, but that didn't work either.
This file loads in without a problem when I'm not in SageMaker.

small test_set xgb predict

i would like to ask a question about a problem that i have for the last couple days.
First of all i am a beginner in machine learning and this is my first time using the XGBoost algorithm so excuse me for any mistakes I have done.
I trained my model to predict whether a log file is malicious or not. After i save and reload my model on a different session i use the predict function which seems to be working normally ( with a few deviations in probabilities but that is another topic, I know I, have seen it in another topic )
The problem is this: Sometimes when i try to predict a "small" csv file after load it seems to be broken predicting only the Zero label, even for indexes that are categorized correct previously.
For example, i load a dataset containing 20.000 values , the predict() is working. I keep only the first 5 of these values using pandas drop, again its working. If i save the 5 values on a different csv and reload it its not working. The same error happens if i just remove by hand all indexes (19.995) and save file only with 5 remaining.
I would bet it is a size of file problem but when i drop the indexes on the dataframe through pandas it seems to be working
Also the number 5 ( of indexes ) is for example purpose the same happens if I delete a large portion of the dataset.
I first came up with this problem after trying to verify by hand some completely new logs, which seem to be classified correctly if thrown into the big csv file but not in a new file on their own.
Here is my load and predict code
##IMPORTS
import os
import pandas as pd
from pandas.compat import StringIO
from datetime import datetime
from langid.langid import LanguageIdentifier, model
import langid
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.externals import joblib
from ggplot import ggplot, aes, geom_line
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import average_precision_score
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from collections import defaultdict
import pickle
df = pd.read_csv('big_test.csv')
df3 = pd.read_csv('small_test.csv')
#This one is necessary for the loaded_model
class ColumnSelector(BaseEstimator, TransformerMixin):
def init(self, column_list):
self.column_list = column_list
def fit(self, x, y=None):
return self
def transform(self, x):
if len(self.column_list) == 1:
return x[self.column_list[0]].values
else:
return x[self.column_list].to_dict(orient='records')
loaded_model = joblib.load('finalized_model.sav')
result = loaded_model.predict(df)
print(result)
df2=df[:5]
result2 = loaded_model.predict(df2)
print(result2)
result3 = loaded_model.predict(df3)
print(result3)
The results i get are these:
[1 0 1 ... 0 0 0]
[1 0 1 0 1]
[0 0 0 0 0]
I can provide any code even from training or my dataset if necessary.
*EDIT: I use a pipeline for my data. I tried to reproduce the error after using xgb to fit the iris data and i could not. Maybe there is something wrong with my pipeline? the code is below :
df = pd.read_csv('big_test.csv')
# df.info()
# Split Dataset
attributes = ['uri','code','r_size','DT_sec','Method','http_version','PenTool','has_referer', 'Lang','LangProb','GibberFlag' ]
x_train, x_test, y_train, y_test = train_test_split(df[attributes], df['Scan'], test_size=0.2,
stratify=df['Scan'], random_state=0)
x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train, test_size=0.2,
stratify=y_train, random_state=0)
# print('Train:', len(y_train), 'Dev:', len(y_dev), 'Test:', len(y_test))
# set up graph function
def plot_precision_recall_curve(y_true, y_pred_scores):
precision, recall, thresholds = precision_recall_curve(y_true, y_pred_scores)
return ggplot(aes(x='recall', y='precision'),
data=pd.DataFrame({"precision": precision, "recall": recall})) + geom_line()
# XGBClassifier
class ColumnSelector(BaseEstimator, TransformerMixin):
def __init__(self, column_list):
self.column_list = column_list
def fit(self, x, y=None):
return self
def transform(self, x):
if len(self.column_list) == 1:
return x[self.column_list[0]].values
else:
return x[self.column_list].to_dict(orient='records')
count_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 2), min_df=10)
dict_vectorizer = DictVectorizer()
xgb = XGBClassifier(seed=0)
pipeline = Pipeline([
("feature_union", FeatureUnion([
('text_features', Pipeline([
('selector', ColumnSelector(['uri'])),
('count_vectorizer', count_vectorizer)
])),
('categorical_features', Pipeline([
('selector', ColumnSelector(['code','r_size','DT_sec','Method','http_version','PenTool','has_referer', 'Lang','LangProb','GibberFlag' ])),
('dict_vectorizer', dict_vectorizer)
]))
])),
('xgb', xgb)
])
pipeline.fit(x_train, y_train)
filename = 'finalized_model.sav'
joblib.dump(pipeline, filename)
Thats due to different dtypes in big and small file.
When you do:
df = pd.read_csv('big_test.csv')
The dtypes are these:
print(df.dtypes)
# Output
uri object
code object # <== Observe this
r_size object # <== Observe this
Scan int64
...
...
...
Now when you do:
df3 = pd.read_csv('small_test.csv')
the dtypes are changed:
print(df3.dtypes)
# Output
uri object
code int64 # <== Now this has changed
r_size int64 # <== Now this has changed
Scan int64
...
...
You see, pandas will try to determine the dtypes of the columns by itself. When you load the big_test.csv, there are some values in code and r_size column which are of string types, due to this whole column dtype is changed to string, which is not done in small_test.csv.
Now due to this change, the dictVectorizer encodes the data in a different way than before and the features are changed, and hence the results are also changed.
If you do this:
df3[['code', 'r_size']] = df3[['code', 'r_size']].astype(str)
and then call the predict(), the results are same again.

One-hot encoding Tensorflow Strings

I have a list of strings as labels for training a neural network. Now I want to convert them via one_hot encoding so that I can use them for my tensorflow network.
My input list looks like this:
labels = ['"car"', '"pedestrian"', '"car"', '"truck"', '"car"']
The requested outcome should be something like
one_hot [0,1,0,2,0]
What is the easiest way to do this? Any help would be much appreciated.
Cheers,
Andi
the desired outcome looks like LabelEncoder in sklearn, not like OneHotEncoder - in tf you need CategoryEncoder - BUT it is A preprocessing layer which encodes integer features.:
inp = layers.Input(shape=[X.shape[0]])
x0 = layers.CategoryEncoding(
num_tokens=3, output_mode="multi_hot")(inp)
model = keras.Model(inputs=[inp], outputs=[x0])
model.compile(optimizer= 'adam',
loss='categorical_crossentropy',
metrics=[tf.keras.metrics.CategoricalCrossentropy()])
print(model.summary())
this part gets encoding of unique values... And you can make another branch in this model to input your initial vector & fit it according labels from this reference-branch (it is like join reference-table with fact-table in any database) -- here will be ensemble of referenced-data & your needed data & output...
pay attention to -- num_tokens=3, output_mode="multi_hot" -- are being given explicitly... AND numbers from class_names get apriory to model use, as is Feature Engineering - like this (in pd.DataFrame)
import numpy as np
import pandas as pd
d = {'transport_col':['"car"', '"pedestrian"', '"car"', '"truck"', '"car"']}
dataset_df = pd.DataFrame(data=d)
classes = dataset_df['transport_col'].unique().tolist()
print(f"Label classes: {classes}")
df= dataset_df['transport_col'].map(classes.index).copy()
print(df)
from manual example REF: Encode the categorical label into an integer.
Details: This stage is necessary if your classification label is represented as a string. Note: Keras expected classification labels to be integers.
in another architecture, perhaps, you could use StringLookup
vocab= np.array(np.unique(labels))
inp = tf.keras.Input(shape= labels.shape[0], dtype=tf.string)
x = tf.keras.layers.StringLookup(vocabulary=vocab)(inp)
but labels are dependent vars usually, as opposed to features, and shouldn't be used at Input
Everything in keras.docs
possible FULL CODE:
import numpy as np
import pandas as pd
import keras
X = np.array([['"car"', '"pedestrian"', '"car"', '"truck"', '"car"']])
vocab= np.unique(X)
print(vocab)
y= np.array([[0,1,0,2,0]])
inp = layers.Input(shape=[X.shape[0]], dtype='string')
x0= tf.keras.layers.StringLookup(vocabulary=vocab, name='finish')(inp)
model = keras.Model(inputs=[inp], outputs=[x0])
model.compile(optimizer= 'adam',
loss='categorical_crossentropy',
metrics=[tf.keras.metrics.categorical_crossentropy])
print(model.summary())
from tensorflow.keras import backend as K
for layerIndex, layer in enumerate(model.layers):
print(layerIndex)
func = K.function([model.get_layer(index=0).input], layer.output)
layerOutput = func([X]) # input_data is a numpy array
print(layerOutput)
if layerIndex==1: # the last layer here
scale = lambda x: x - 1
print(scale(layerOutput))
res:
[[0 1 0 2 0]]
another possible Solution for your case - layers.TextVectorization
import numpy as np
import keras
input_array = np.atleast_2d(np.array(['"car"', '"pedestrian"', '"car"', '"truck"', '"car"']))
vocab= np.unique(input_array)
input_data = keras.Input(shape=(None,), dtype='string')
layer = layers.TextVectorization( max_tokens=None, standardize=None, split=None, output_mode="int", vocabulary=vocab)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
print(output_dataset) # starts from 2 ... probably [0, 1] somehow concerns binarization ?
scale = lambda x: x - 2
print(scale(output_dataset))
result:
array([[0, 1, 0, 2, 0]])