Error finding attribute `feature_names_in_` that exists in docs - scikit-learn
I'm getting the error AttributeError: 'LogisticRegression' object has no attribute 'feature_names_in_' even though that attribute is listed in the docs.
I'm on scikit-learn version 1.0.2.
I created a LogisticRegression object and am trying to use its documented feature_names_in_ attribute, but it raises an error.
#imports
import numpy as np
import pandas as pd
import statistics
import scipy.sparse
from scipy.stats import chi2_contingency
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
# train_test_split()
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state = 42)
#create functions for preprocessing
# function to replace NaNs in the ordinal and interval data
def replace_NAN_median(X_df):
    opinions = ['opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc',
                'household_adults', 'household_children']
    for column in opinions:
        X_df[column].replace(np.nan, X_df[column].median(), inplace=True)
    return X_df

# function to replace NaNs in the categorical data
def replace_NAN_mode(X_df):
    miss_cat_features = ['education', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status']
    for column in miss_cat_features:
        X_df[column].replace(np.nan, statistics.mode(X_df[column]), inplace=True)
    return X_df
# Instantiate transformers
NAN_median = FunctionTransformer(replace_NAN_median)
NAN_mode = FunctionTransformer(replace_NAN_mode)
col_transformer = ColumnTransformer(transformers=[
    # replace NaNs in the binary data
    ("NAN_0", SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0),
     ['behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask',
      'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home',
      'behavioral_touch_face', 'doctor_recc_seasonal', 'chronic_med_condition',
      'child_under_6_months', 'health_worker', 'health_insurance']),
    # MinMaxScaler on our numeric ordinal and interval data
    ("scaler", MinMaxScaler(), ['opinion_seas_vacc_effective', 'opinion_seas_risk',
                                'opinion_seas_sick_from_vacc',
                                'household_adults', 'household_children']),
    # OHE categorical string data
    ("ohe", OneHotEncoder(sparse=False), ['age_group', 'education', 'race', 'sex',
                                          'income_poverty', 'marital_status', 'rent_or_own',
                                          'employment_status', 'census_msa'])],
    remainder="passthrough")
# Preprocessing Pipeline
preprocessing_pipe = Pipeline(steps=[
("NAN_median", NAN_median),
("NAN_mode", NAN_mode),
("col_transformer", col_transformer)
])
# model
logreg_optimized_pipe = Pipeline(steps=[("preprocessing_pipe", preprocessing_pipe),
("log_reg", LogisticRegression(solver = 'liblinear', random_state = 42, C = 10, penalty= 'l1'))])
#fit model to training data
logreg_optimized_pipe.fit(X_train, y_train)
#trying to get feature names
logreg_optimized_pipe.named_steps["log_reg"].feature_names_in_
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-38-512bfaf5962d> in <module>
----> 1 logreg_optimized_pipe.named_steps["log_reg"].feature_names_in_
AttributeError: 'LogisticRegression' object has no attribute 'feature_names_in_'
I'm open to alternative suggestions on how to get the feature names as well.
The docs say the following:

feature_names_in_ : ndarray of shape (n_features_in_,)
    Names of features seen during fit. Defined only when X has feature names that are all strings.

You should make sure that the data that reaches the model still has feature names.
Also, the attribute is only defined once fit has been called.
Link to the docs for your version, 1.0.2: LogisticRegression
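To see what the docs mean, here is a minimal sketch (toy data with made-up column names, not the survey data) showing that the attribute only appears when fit receives a DataFrame with string column names:

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

# hypothetical toy data with string column names
X_df = pd.DataFrame({"age": [25, 32, 47, 51], "income": [40, 55, 80, 62]})
y = np.array([0, 0, 1, 1])

clf = LogisticRegression().fit(X_df, y)
print(clf.feature_names_in_)               # ['age' 'income']

clf = LogisticRegression().fit(X_df.to_numpy(), y)
print(hasattr(clf, "feature_names_in_"))   # False: a plain array carries no names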
So it turns out that SimpleImputer returns an array, thereby removing the column names. I replaced SimpleImputer with a function to fix this. I wasn't able to figure out how to use .feature_names_in_ on the LogisticRegression() model itself, but it did work when I called it on the ColumnTransformer inside the preprocessing pipeline, and, most importantly, I was able to use .get_feature_names_out() on that ColumnTransformer to get the feature names that were fed into the model.
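To see why the names disappear, here is a quick sketch (toy columns, not the survey data): in scikit-learn 1.0.2, SimpleImputer.fit_transform() returns a plain NumPy array, so every step after it loses the pandas column names.

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

df = pd.DataFrame({"a": [1.0, np.nan], "b": [np.nan, 2.0]})
imputed = SimpleImputer(strategy="constant", fill_value=0).fit_transform(df)
print(type(imputed))   # <class 'numpy.ndarray'> - the column names are gone

(Newer scikit-learn releases, 1.2+, add set_output(transform="pandas") to keep DataFrames flowing through transformers, but that isn't available in 1.0.2.)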
Code:
#imports
import numpy as np
import pandas as pd
import statistics
import scipy.sparse
from scipy.stats import chi2_contingency
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
# train_test_split()
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state = 42)
#create functions for preprocessing
# function to replace NaNs in the ordinal and interval data
def replace_NAN_median(X_df):
    opinions = ['opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc',
                'household_adults', 'household_children']
    for column in opinions:
        X_df[column].replace(np.nan, X_df[column].median(), inplace=True)
    return X_df

# function to replace NaNs in the categorical data
def replace_NAN_mode(X_df):
    miss_cat_features = ['education', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status']
    for column in miss_cat_features:
        X_df[column].replace(np.nan, statistics.mode(X_df[column]), inplace=True)
    return X_df

# function to replace NaNs in the binary data
def replace_NAN_0(X_df):
    miss_binary = ['behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask',
                   'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home',
                   'behavioral_touch_face', 'doctor_recc_seasonal', 'chronic_med_condition',
                   'child_under_6_months', 'health_worker', 'health_insurance']
    for column in miss_binary:
        X_df[column].replace(np.nan, 0, inplace=True)
    return X_df
# Instantiate transformers
NAN_median = FunctionTransformer(replace_NAN_median)
NAN_mode = FunctionTransformer(replace_NAN_mode)
NAN_0 = FunctionTransformer(replace_NAN_0)
col_transformer = ColumnTransformer(transformers=[
    # MinMaxScaler on our numeric ordinal and interval data
    ("scaler", MinMaxScaler(), ['opinion_seas_vacc_effective', 'opinion_seas_risk',
                                'opinion_seas_sick_from_vacc',
                                'household_adults', 'household_children']),
    # OHE categorical string data
    ("ohe", OneHotEncoder(sparse=False), ['age_group', 'education', 'race', 'sex',
                                          'income_poverty', 'marital_status', 'rent_or_own',
                                          'employment_status', 'census_msa'])],
    remainder="passthrough")
# Preprocessing Pipeline
preprocessing_pipe = Pipeline(steps=[
("NAN_median", NAN_median),
("NAN_mode", NAN_mode),
("NAN_0", NAN_0),
("col_transformer", col_transformer)
])
# model
logreg_optimized_pipe = Pipeline(steps=[("preprocessing_pipe", preprocessing_pipe),
("log_reg", LogisticRegression(solver = 'liblinear', random_state = 42, C = 10, penalty= 'l1'))])
#fit model to training data
logreg_optimized_pipe.fit(X_train, y_train)
#trying to get feature names
logreg_optimized_pipe.named_steps["preprocessing_pipe"][3].feature_names_in_
#output - feature names put into `ColumnTransformer`
array(['respondent_id', 'behavioral_antiviral_meds',
'behavioral_avoidance', 'behavioral_face_mask',
'behavioral_wash_hands', 'behavioral_large_gatherings',
'behavioral_outside_home', 'behavioral_touch_face',
'doctor_recc_seasonal', 'chronic_med_condition',
'child_under_6_months', 'health_worker', 'health_insurance',
'opinion_seas_vacc_effective', 'opinion_seas_risk',
'opinion_seas_sick_from_vacc', 'age_group', 'education', 'race',
'sex', 'income_poverty', 'marital_status', 'rent_or_own',
'employment_status', 'census_msa', 'household_adults',
'household_children'], dtype=object)
logreg_optimized_pipe.named_steps["preprocessing_pipe"][3].get_feature_names_out()
#output - feature names after `ColumnTransformer`
array(['scaler__opinion_seas_vacc_effective', 'scaler__opinion_seas_risk',
'scaler__opinion_seas_sick_from_vacc', 'scaler__household_adults',
'scaler__household_children', 'ohe__age_group_18 - 34 Years',
'ohe__age_group_35 - 44 Years', 'ohe__age_group_45 - 54 Years',
'ohe__age_group_55 - 64 Years', 'ohe__age_group_65+ Years',
'ohe__education_12 Years', 'ohe__education_< 12 Years',
'ohe__education_College Graduate', 'ohe__education_Some College',
'ohe__race_Black', 'ohe__race_Hispanic',
'ohe__race_Other or Multiple', 'ohe__race_White',
'ohe__sex_Female', 'ohe__sex_Male',
'ohe__income_poverty_<= $75,000, Above Poverty',
'ohe__income_poverty_> $75,000',
'ohe__income_poverty_Below Poverty', 'ohe__marital_status_Married',
'ohe__marital_status_Not Married', 'ohe__rent_or_own_Own',
'ohe__rent_or_own_Rent', 'ohe__employment_status_Employed',
'ohe__employment_status_Not in Labor Force',
'ohe__employment_status_Unemployed',
'ohe__census_msa_MSA, Not Principle City',
'ohe__census_msa_MSA, Principle City', 'ohe__census_msa_Non-MSA',
'remainder__respondent_id', 'remainder__behavioral_antiviral_meds',
'remainder__behavioral_avoidance',
'remainder__behavioral_face_mask',
'remainder__behavioral_wash_hands',
'remainder__behavioral_large_gatherings',
'remainder__behavioral_outside_home',
'remainder__behavioral_touch_face',
'remainder__doctor_recc_seasonal',
'remainder__chronic_med_condition',
'remainder__child_under_6_months', 'remainder__health_worker',
'remainder__health_insurance'], dtype=object)
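With those names in hand, here is a sketch of one way to line them up with the fitted model's coefficients (this reuses the pipeline above and assumes a binary target, so coef_ has a single row):

import pandas as pd

ct = logreg_optimized_pipe.named_steps["preprocessing_pipe"][3]
feature_names = ct.get_feature_names_out()
coefs = logreg_optimized_pipe.named_steps["log_reg"].coef_[0]

# one coefficient per transformed feature, sorted for readability
print(pd.Series(coefs, index=feature_names).sort_values())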
Related
ValueError: could not convert string to float: in python colab
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files

uploded = files.upload()
df = pd.read_csv("quotation_test_data.csv")
print(df)

import seaborn as sns
df.describe()        # describe the data: shows mean, count, etc.
df.isnull().sum()    # check whether any column is missing data
df["Net profit"].value_counts()
sns.countplot(df['Net profit'])

# split data into x and y, then into train and test sets
x = df.iloc[:, :-1]  # all columns except the last one
y = df.iloc[:, -1]   # the last column (the target)

# now split the data into train and test with sklearn
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=100)
x_train.shape
y_train.shape

from sklearn.tree import DecisionTreeClassifier
# initialize decision tree
clf = DecisionTreeClassifier(criterion="gini", max_depth=7, min_samples_split=10, random_state=10)

import pandas as pd
data = pd.read_csv('quotation_test_data.csv')
dataconver = data.replace('[^\d.]', '', regex=True).astype(float)

I was trying to build a decision tree when this error was shown:

ValueError Traceback (most recent call last)
<ipython-input-75-df55a55b03a4> in <module>
----> 1 dataconver = data.replace('[^\d.]', regex=True).astype(float)
2
7 frames
/usr/local/lib/python3.8/dist-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
   1199     if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype):
   1200         # Explicit copy, or required since NumPy can't view from / to object.
-> 1201         return arr.astype(dtype, copy=True)
   1202
   1203     return arr.astype(dtype, copy=copy)
ValueError: could not convert string to float: 'Kellie Scott'
How to solve this AttributeError: 'DataFrame' object has no attribute 'as_matrix' (using Python 3.8)
Hello guys, I am getting AttributeError: 'DataFrame' object has no attribute 'as_matrix' when I run the code below in a Jupyter notebook. It refers to these 2 lines:

# create x & y variables
X = features_df.as_matrix()
y = df['Price'].as_matrix()

My whole code is below:

# developing model to predict house prices in Australia
# importing needed libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
import sklearn.externals

# importing the file path
df = pd.read_csv('~/mypython/machine_learning/machine_learning/housing/Melbourne_housing_FULL.csv')

# removing less related dimensions
del df['Address']
del df['Method']
del df['SellerG']
del df['Date']
del df['Postcode']
del df['Lattitude']
del df['Longtitude']
del df['Regionname']
del df['Propertycount']

# delete rows with any empty value
df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)

# converting non-numerical values to numerical values using pandas
features_df = pd.get_dummies(df, columns=['Suburb', 'CouncilArea', 'Type'])

# delete price because it's the dependent variable
del features_df['Price']

# create x & y variables
X = features_df.as_matrix()
y = df['Price'].as_matrix()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

model = ensemble.GradientBoostingRegressor(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=30,
    min_samples_split=4,
    min_samples_leaf=6,
    max_features=0.6,
    loss="huber")

model.fit(X_train, y_train)
joblib.dump(model, "house_train_model.pkl")
mse = mean_absolute_error(y_train, model.predict(X_train))
print("Training set mean absolute error: %.2f" % mse)
You should use this instead:

X = features_df.values
y = df['Price'].values
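For context, as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0, so with a current pandas on Python 3.8 it no longer exists; .values, or the now-preferred .to_numpy(), is the drop-in replacement. A minimal sketch with made-up data:

import pandas as pd

df = pd.DataFrame({"Rooms": [3, 4], "Price": [500000, 650000]})
X = df[["Rooms"]].to_numpy()   # equivalent to df[["Rooms"]].values
y = df["Price"].to_numpy()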
ValueError: Cannot Convert String to Float With Pandas and Amazon Sagemaker
I'm trying to deploy a simple ML model on SageMaker to get the hang of it, and I am not having any luck because I get the following error:

ValueError: could not convert string to float: '6.320000000000000097e-03 1.800000000000000000e+01 2.310000000000000053e+00 0.000000000000000000e+00 5.380000000000000338e-01 6.575000000000000178e+00 6.520000000000000284e+01 4.089999999999999858e+00 1.000000000000000000e+00 2.960000000000000000e+02 1.530000000000000071e+01 3.968999999999999773e+02 4.980000000000000426e+00 2.400000000000000000e+01'

This is the first row of my dataframe. This is the code in my notebook that I'm using right now:

from sagemaker import get_execution_role, Session
from sagemaker.sklearn.estimator import SKLearn

work_dir = 'data'
session = Session()
role = get_execution_role()
train_input = session.upload_data('data')
script = 'boston_housing_prep.py'

model = SKLearn(
    entry_point=script,
    train_instance_type='ml.c4.xlarge',
    role=role,
    sagemaker_session=session,
    hyperparameters={'alpha': 10}
)

model.fit({'train': train_input})

My script for boston_housing_prep.py looks like this:

import argparse
import pandas as pd
import os
from sklearn.linear_model import Ridge
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler
import numpy as np

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--alpha', type=int, default=1)
    parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    args = parser.parse_args()

    input_files = [os.path.join(args.train, file) for file in os.listdir(args.train)]
    if len(input_files) == 0:
        raise ValueError(('There are no files in {}.\n' +
                          'This usually indicates that the channel ({}) was incorrectly specified,\n' +
                          'the data specification in S3 was incorrectly specified or the role specified\n' +
                          'does not have permission to access the data.').format(args.train, "train"))

    raw_data = [pd.read_csv(file, header=None, engine="python") for file in input_files]
    df = pd.concat(raw_data)

    y_train = df.iloc[:, -1]
    X_train = df.iloc[:, :5]

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)

    alpha = args.alpha
    clf = Ridge(alpha=alpha)
    clf = clf.fit(X_train, y_train)

    joblib.dump(clf, os.path.join(args.model_dir, "model.joblib"))

def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf

The line that's giving the problem is this one:

X_train = scaler.fit_transform(X_train)

I tried df = df.astype(np.float) after I loaded in the df, but that didn't work either. This file loads in without a problem when I'm not in SageMaker.
Use matplotlib to plot scikit learn linear regression results
How can you plot the linear regression results from scikit-learn after the analysis, to see the "testing" data (real values vs. predicted values) at the end of the program? The code below is close, but I believe it is missing a scaling factor.

Input:

import pandas as pd
import numpy as np
import datetime
pd.core.common.is_list_like = pd.api.types.is_list_like  # temp fix
import fix_yahoo_finance as yf
from pandas_datareader import data, wb
from datetime import date
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing, cross_validation, svm
import matplotlib.pyplot as plt

df = yf.download('MMM', start=date(2012, 1, 1), end=date(2018, 1, 1), progress=False)
df_low = df[['Low']]  # create a new df with only the low column

forecast_out = int(5)  # predicting some days into the future
df_low['low_prediction'] = df_low[['Low']].shift(-forecast_out)  # new column based on the existing col, shifted some days

X_low = np.array(df_low.drop(['low_prediction'], 1))
X_low = preprocessing.scale(X_low)        # scaling the input values
X_low_forecast = X_low[-forecast_out:]    # set X_forecast equal to last 5 days
X_low = X_low[:-forecast_out]             # remove last 5 days from X

y_low = np.array(df_low['low_prediction'])
y_low = y_low[:-forecast_out]

X_low_train, X_low_test, y_low_train, y_low_test = cross_validation.train_test_split(X_low, y_low, test_size=0.2)

clf_low = LinearRegression()  # classifier
clf_low.fit(X_low_train, y_low_train)  # training
confidence_low = clf_low.score(X_low_test, y_low_test)  # testing
print("confidence for lows: ", confidence_low)

forecast_prediction_low = clf_low.predict(X_low_forecast)
print(forecast_prediction_low)

plt.figure(figsize=(17, 9))
plt.grid(True)
plt.plot(X_low_test, color="red")
plt.plot(y_low_test, color="green")
plt.show()

image: (screenshot of the resulting plot omitted)
You plot y_low_test and X_low_test, while you should plot y_low_test against clf_low.predict(X_low_test) instead if you want to compare target and predicted values. By the way, clf_low in your code is not a classifier, it is a regressor; it's also better to use the alias model instead of clf.
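A sketch of that fix, reusing the variable names from the question:

import matplotlib.pyplot as plt

y_low_pred = clf_low.predict(X_low_test)  # predictions for the held-out test rows

plt.figure(figsize=(17, 9))
plt.grid(True)
plt.plot(y_low_test, color="green", label="actual")
plt.plot(y_low_pred, color="red", label="predicted")
plt.legend()
plt.show()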
small test_set xgb predict
I would like to ask a question about a problem that I have had for the last couple of days. First of all, I am a beginner in machine learning and this is my first time using the XGBoost algorithm, so excuse me for any mistakes I have made. I trained my model to predict whether a log file is malicious or not. After I save and reload my model in a different session, I use the predict function, which seems to be working normally (with a few deviations in probabilities, but that is another topic; I have seen it discussed elsewhere).

The problem is this: sometimes when I try to predict a "small" csv file after loading, prediction seems to be broken, returning only the zero label, even for indexes that were categorized correctly previously. For example, I load a dataset containing 20,000 values, and predict() works. I keep only the first 5 of these values using pandas drop; again it works. If I save the 5 values to a different csv and reload it, it does not work. The same error happens if I just remove all 19,995 other indexes by hand and save the file with only the 5 remaining. I would bet it is a file-size problem, but when I drop the indexes on the dataframe through pandas it seems to work. Also, the number 5 is just for example purposes; the same happens if I delete a large portion of the dataset. I first came across this problem after trying to verify some completely new logs by hand, which seem to be classified correctly if thrown into the big csv file, but not in a new file on their own.

Here is my load and predict code:

##IMPORTS
import os
import pandas as pd
from pandas.compat import StringIO
from datetime import datetime
from langid.langid import LanguageIdentifier, model
import langid
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.externals import joblib
from ggplot import ggplot, aes, geom_line
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import average_precision_score
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from collections import defaultdict
import pickle

df = pd.read_csv('big_test.csv')
df3 = pd.read_csv('small_test.csv')

# This one is necessary for the loaded model
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column_list):
        self.column_list = column_list
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        if len(self.column_list) == 1:
            return x[self.column_list[0]].values
        else:
            return x[self.column_list].to_dict(orient='records')

loaded_model = joblib.load('finalized_model.sav')
result = loaded_model.predict(df)
print(result)

df2 = df[:5]
result2 = loaded_model.predict(df2)
print(result2)

result3 = loaded_model.predict(df3)
print(result3)

The results I get are these:

[1 0 1 ... 0 0 0]
[1 0 1 0 1]
[0 0 0 0 0]

I can provide any code, even from training, or my dataset if necessary.

*EDIT: I use a pipeline for my data. I tried to reproduce the error after using xgb to fit the iris data and I could not. Maybe there is something wrong with my pipeline?
The code is below:

df = pd.read_csv('big_test.csv')
# df.info()

# Split Dataset
attributes = ['uri', 'code', 'r_size', 'DT_sec', 'Method', 'http_version', 'PenTool', 'has_referer',
              'Lang', 'LangProb', 'GibberFlag']
x_train, x_test, y_train, y_test = train_test_split(df[attributes], df['Scan'], test_size=0.2,
                                                    stratify=df['Scan'], random_state=0)
x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train, test_size=0.2,
                                                  stratify=y_train, random_state=0)
# print('Train:', len(y_train), 'Dev:', len(y_dev), 'Test:', len(y_test))

# set up graph function
def plot_precision_recall_curve(y_true, y_pred_scores):
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_scores)
    return ggplot(aes(x='recall', y='precision'),
                  data=pd.DataFrame({"precision": precision, "recall": recall})) + geom_line()

# XGBClassifier
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column_list):
        self.column_list = column_list
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        if len(self.column_list) == 1:
            return x[self.column_list[0]].values
        else:
            return x[self.column_list].to_dict(orient='records')

count_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 2), min_df=10)
dict_vectorizer = DictVectorizer()
xgb = XGBClassifier(seed=0)

pipeline = Pipeline([
    ("feature_union", FeatureUnion([
        ('text_features', Pipeline([
            ('selector', ColumnSelector(['uri'])),
            ('count_vectorizer', count_vectorizer)
        ])),
        ('categorical_features', Pipeline([
            ('selector', ColumnSelector(['code', 'r_size', 'DT_sec', 'Method', 'http_version',
                                         'PenTool', 'has_referer', 'Lang', 'LangProb', 'GibberFlag'])),
            ('dict_vectorizer', dict_vectorizer)
        ]))
    ])),
    ('xgb', xgb)
])

pipeline.fit(x_train, y_train)

filename = 'finalized_model.sav'
joblib.dump(pipeline, filename)
That's due to different dtypes in the big and small files. When you do:

df = pd.read_csv('big_test.csv')

the dtypes are these:

print(df.dtypes)

# Output
uri       object
code      object   # <== Observe this
r_size    object   # <== Observe this
Scan      int64
...

Now when you do:

df3 = pd.read_csv('small_test.csv')

the dtypes are changed:

print(df3.dtypes)

# Output
uri       object
code      int64    # <== Now this has changed
r_size    int64    # <== Now this has changed
Scan      int64
...

You see, pandas will try to determine the dtypes of the columns by itself. When you load big_test.csv, there are some values in the code and r_size columns which are of string type; because of this, each whole column's dtype becomes string, which does not happen for small_test.csv. Due to this change, the DictVectorizer encodes the data in a different way than before, the features change, and hence the results change too. If you do this:

df3[['code', 'r_size']] = df3[['code', 'r_size']].astype(str)

and then call predict(), the results are the same again.
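Equivalently, one could pin the dtypes at read time so both files are parsed the same way; a sketch using the column names from the question:

import pandas as pd

# force the two ambiguous columns to strings while reading, matching big_test.csv
df3 = pd.read_csv('small_test.csv', dtype={'code': str, 'r_size': str})
result3 = loaded_model.predict(df3)
print(result3)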