ValueError: could not convert string to float: in Python Colab

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
uploaded = files.upload()
df = pd.read_csv("quotation_test_data.csv")
print(df)
import seaborn as sns
df.describe()  # describe the data: count, mean, std, etc.
df.isnull().sum()  # check whether any column has missing values
df["Net profit"].value_counts()
sns.countplot(df['Net profit'])
# split the data into x and y, then into train and test sets
x = df.iloc[:, :-1]
y = df.iloc[:, -1]
# [:, :-1] keeps every column except the last one; [:, -1] keeps only the last (the target)
# now split the data into train and test sets using sklearn
from sklearn.model_selection import train_test_split
x_train , x_test,y_train,y_test = train_test_split(x,y,random_state=100)
x_train.shape
y_train.shape
from sklearn.tree import DecisionTreeClassifier  # initialize decision tree
clf = DecisionTreeClassifier(criterion="gini", max_depth=7, min_samples_split=10, random_state=10)
data = pd.read_csv('quotation_test_data.csv')
dataconver = data.replace(r'[^\d.]', '', regex=True).astype(float)
I was trying to make a decision tree, and then this error was shown:
ValueError Traceback (most recent call last)
<ipython-input-75-df55a55b03a4> in <module>
----> 1 dataconver = data.replace('[^\d.]', regex=True).astype(float)
2
7 frames
/usr/local/lib/python3.8/dist-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
1199 if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype):
1200 # Explicit copy, or required since NumPy can't view from / to object.
-> 1201 return arr.astype(dtype, copy=True)
1202
1203 return arr.astype(dtype, copy=copy)
ValueError: could not convert string to float: 'Kellie Scott'
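A minimal sketch of one way around this (assuming, as the traceback's 'Kellie Scott' suggests, that the CSV mixes text columns such as names with numeric ones): encode the text columns instead of forcing them to float.
import pandas as pd

data = pd.read_csv('quotation_test_data.csv')
# label-encode text columns; stripping non-digits from a name just leaves
# an empty string, which still cannot be converted to float
for col in data.select_dtypes(include='object').columns:
    data[col] = data[col].astype('category').cat.codes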

Related

Error finding attribute `feature_names_in_` that exists in docs

I'm getting the error AttributeError: 'LogisticRegression' object has no attribute 'feature_names_in_' even though that attribute is written in the docs.
I'm on scikit-learn version 1.0.2.
I created a LogisticRegression object and am trying to use the documented attribute feature_names_in_, but it returns an error.
#imports
import numpy as np
import pandas as pd
import statistics
import scipy.sparse
from scipy.stats import chi2_contingency
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
# train_test_split()
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state = 42)
#create functions for preprocessing
# function to replace NaN's in the ordinal and interval data
def replace_NAN_median(X_df):
    opinions = ['opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc',
                'household_adults', 'household_children']
    for column in opinions:
        X_df[column].replace(np.nan, X_df[column].median(), inplace=True)
    return X_df
# function to replace NaN's in the categorical data
def replace_NAN_mode(X_df):
    miss_cat_features = ['education', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status']
    for column in miss_cat_features:
        X_df[column].replace(np.nan, statistics.mode(X_df[column]), inplace=True)
    return X_df
# Instantiate transformers
NAN_median = FunctionTransformer(replace_NAN_median)
NAN_mode = FunctionTransformer(replace_NAN_mode)
col_transformer = ColumnTransformer(transformers=[
    # replace NaN's in the binary data
    ("NAN_0", SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0),
     ['behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask',
      'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home',
      'behavioral_touch_face', 'doctor_recc_seasonal', 'chronic_med_condition',
      'child_under_6_months', 'health_worker', 'health_insurance']),
    # MinMaxScaler on our numeric ordinal and interval data
    ("scaler", MinMaxScaler(), ['opinion_seas_vacc_effective', 'opinion_seas_risk',
                                'opinion_seas_sick_from_vacc',
                                'household_adults', 'household_children']),
    # OHE categorical string data
    ("ohe", OneHotEncoder(sparse=False), ['age_group', 'education', 'race', 'sex',
                                          'income_poverty', 'marital_status', 'rent_or_own',
                                          'employment_status', 'census_msa'])],
    remainder="passthrough")
# Preprocessing Pipeline
preprocessing_pipe = Pipeline(steps=[
    ("NAN_median", NAN_median),
    ("NAN_mode", NAN_mode),
    ("col_transformer", col_transformer)
])
# model
logreg_optimized_pipe = Pipeline(steps=[
    ("preprocessing_pipe", preprocessing_pipe),
    ("log_reg", LogisticRegression(solver='liblinear', random_state=42, C=10, penalty='l1'))])
#fit model to training data
logreg_optimized_pipe.fit(X_train, y_train)
#trying to get feature names
logreg_optimized_pipe.named_steps["log_reg"].feature_names_in_
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-38-512bfaf5962d> in <module>
----> 1 logreg_optimized_pipe.named_steps["log_reg"].feature_names_in_
AttributeError: 'LogisticRegression' object has no attribute 'feature_names_in_'
I'm open to alternative suggestions on how to get the feature names as well.
The docs say the following:
feature_names_in_ : ndarray of shape (n_features_in_,)
    Names of features seen during fit. Defined only when X has feature names that are all strings.
You should make sure that the data that reaches the model has feature names, and note that the attribute is only defined once fit has been called.
Link to the docs for your version, 1.0.2: LogisticRegression
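As a minimal illustration (assuming scikit-learn >= 1.0; the toy columns here are made up), the attribute appears only when the estimator is fit directly on a DataFrame with string column names:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

X_df = pd.DataFrame({"a": [0.0, 1.0, 2.0, 3.0], "b": [1.0, 0.0, 1.0, 0.0]})
y = np.array([0, 0, 1, 1])

clf = LogisticRegression().fit(X_df, y)
print(clf.feature_names_in_)               # ['a' 'b'] - fit on a DataFrame
clf2 = LogisticRegression().fit(X_df.values, y)
print(hasattr(clf2, "feature_names_in_"))  # False - fit on a bare ndarray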
So it turns out that SimpleImputer returns an array - thereby removing the column names. I replaced SimpleImputer with a function to fix this. I wasn't able to figure out how to use .feature_names_in_ on the LogisticRegression() model, but it did work when I called it on the preprocessing pipeline ColumnTransformer, and most importantly I was able to use .get_feature_names_out() on the preprocessing pipeline to get the feature names that were fed into the model.
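As a quick check of that claim (a toy frame; in this scikit-learn version SimpleImputer.fit_transform returns a plain ndarray, so column names are dropped):
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

toy = pd.DataFrame({"a": [1.0, np.nan], "b": [3.0, 4.0]})
out = SimpleImputer(strategy='constant', fill_value=0).fit_transform(toy)
print(type(out))  # <class 'numpy.ndarray'> - the column names are gone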
Code:
#imports
import numpy as np
import pandas as pd
import statistics
import scipy.sparse
from scipy.stats import chi2_contingency
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
# train_test_split()
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state = 42)
#create functions for preprocessing
# function to replace NaN's in the ordinal and interval data
def replace_NAN_median(X_df):
    opinions = ['opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc',
                'household_adults', 'household_children']
    for column in opinions:
        X_df[column].replace(np.nan, X_df[column].median(), inplace=True)
    return X_df
# function to replace NaN's in the categorical data
def replace_NAN_mode(X_df):
    miss_cat_features = ['education', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status']
    for column in miss_cat_features:
        X_df[column].replace(np.nan, statistics.mode(X_df[column]), inplace=True)
    return X_df
# function to replace NaN's in the binary data
def replace_NAN_0(X_df):
    miss_binary = ['behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask',
                   'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home',
                   'behavioral_touch_face', 'doctor_recc_seasonal', 'chronic_med_condition',
                   'child_under_6_months', 'health_worker', 'health_insurance']
    for column in miss_binary:
        X_df[column].replace(np.nan, 0, inplace=True)
    return X_df
# Instantiate transformers
NAN_median = FunctionTransformer(replace_NAN_median)
NAN_mode = FunctionTransformer(replace_NAN_mode)
NAN_0 = FunctionTransformer(replace_NAN_0)
col_transformer = ColumnTransformer(transformers=[
    # MinMaxScaler on our numeric ordinal and interval data
    ("scaler", MinMaxScaler(), ['opinion_seas_vacc_effective', 'opinion_seas_risk',
                                'opinion_seas_sick_from_vacc',
                                'household_adults', 'household_children']),
    # OHE categorical string data
    ("ohe", OneHotEncoder(sparse=False), ['age_group', 'education', 'race', 'sex',
                                          'income_poverty', 'marital_status', 'rent_or_own',
                                          'employment_status', 'census_msa'])],
    remainder="passthrough")
# Preprocessing Pipeline
preprocessing_pipe = Pipeline(steps=[
    ("NAN_median", NAN_median),
    ("NAN_mode", NAN_mode),
    ("NAN_0", NAN_0),
    ("col_transformer", col_transformer)
])
# model
logreg_optimized_pipe = Pipeline(steps=[
    ("preprocessing_pipe", preprocessing_pipe),
    ("log_reg", LogisticRegression(solver='liblinear', random_state=42, C=10, penalty='l1'))])
#fit model to training data
logreg_optimized_pipe.fit(X_train, y_train)
#trying to get feature names
logreg_optimized_pipe.named_steps["preprocessing_pipe"][3].feature_names_in_
#output - feature names put into `ColumnTransformer`
array(['respondent_id', 'behavioral_antiviral_meds',
'behavioral_avoidance', 'behavioral_face_mask',
'behavioral_wash_hands', 'behavioral_large_gatherings',
'behavioral_outside_home', 'behavioral_touch_face',
'doctor_recc_seasonal', 'chronic_med_condition',
'child_under_6_months', 'health_worker', 'health_insurance',
'opinion_seas_vacc_effective', 'opinion_seas_risk',
'opinion_seas_sick_from_vacc', 'age_group', 'education', 'race',
'sex', 'income_poverty', 'marital_status', 'rent_or_own',
'employment_status', 'census_msa', 'household_adults',
'household_children'], dtype=object)
logreg_optimized_pipe.named_steps["preprocessing_pipe"][3].get_feature_names_out()
#output - feature names after `ColumnTransformer`
array(['scaler__opinion_seas_vacc_effective', 'scaler__opinion_seas_risk',
'scaler__opinion_seas_sick_from_vacc', 'scaler__household_adults',
'scaler__household_children', 'ohe__age_group_18 - 34 Years',
'ohe__age_group_35 - 44 Years', 'ohe__age_group_45 - 54 Years',
'ohe__age_group_55 - 64 Years', 'ohe__age_group_65+ Years',
'ohe__education_12 Years', 'ohe__education_< 12 Years',
'ohe__education_College Graduate', 'ohe__education_Some College',
'ohe__race_Black', 'ohe__race_Hispanic',
'ohe__race_Other or Multiple', 'ohe__race_White',
'ohe__sex_Female', 'ohe__sex_Male',
'ohe__income_poverty_<= $75,000, Above Poverty',
'ohe__income_poverty_> $75,000',
'ohe__income_poverty_Below Poverty', 'ohe__marital_status_Married',
'ohe__marital_status_Not Married', 'ohe__rent_or_own_Own',
'ohe__rent_or_own_Rent', 'ohe__employment_status_Employed',
'ohe__employment_status_Not in Labor Force',
'ohe__employment_status_Unemployed',
'ohe__census_msa_MSA, Not Principle City',
'ohe__census_msa_MSA, Principle City', 'ohe__census_msa_Non-MSA',
'remainder__respondent_id', 'remainder__behavioral_antiviral_meds',
'remainder__behavioral_avoidance',
'remainder__behavioral_face_mask',
'remainder__behavioral_wash_hands',
'remainder__behavioral_large_gatherings',
'remainder__behavioral_outside_home',
'remainder__behavioral_touch_face',
'remainder__doctor_recc_seasonal',
'remainder__chronic_med_condition',
'remainder__child_under_6_months', 'remainder__health_worker',
'remainder__health_insurance'], dtype=object)

"im2col_out_cpu" not implemented for 'Byte'

I am trying to generate overlapping patches from an image of size (112, 112), but I am unable to do so. I have already tried a lot, but it didn't work out.
Code:
import torch
import numpy as np
import torch.nn as nn
from PIL import Image
import cv2
import os
import math
import torch.nn.functional as F
import torchvision.transforms as T
from timm import create_model
from typing import List
import matplotlib.pyplot as plt
from torchvision import io, transforms
from utils_torch import Image, ImageDraw
from torchvision.transforms.functional import to_pil_image
IMG_SIZE = 112
# PATCH_SIZE = 64
resize = transforms.Resize((IMG_SIZE, IMG_SIZE))
img = resize(io.read_image("Adam_Brody_233.png"))
img = img.to(torch.float32)
image_size = 112
patch_size = 28
ac_patch_size = 12
pad = 4
img = img.unsqueeze(0)
soft_split = nn.Unfold(kernel_size=(ac_patch_size, ac_patch_size), stride=(patch_size, patch_size), padding=(pad, pad))
patches = soft_split(img).transpose(1, 2)
fig, ax = plt.subplots(16, 16)
for i in range(16):
    for j in range(16):
        sub_img = patches[:, i, j]
        ax[i][j].imshow(to_pil_image(sub_img))
        ax[i][j].axis('off')
plt.show()
Traceback
Traceback (most recent call last):
File "/home/cvpr/Documents/OPVT/unfold_ours.py", line 32, in <module>
patches = soft_split(img).transpose(1, 2)
File "/home/cvpr/anaconda3/envs/OPVT/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/cvpr/anaconda3/envs/OPVT/lib/python3.7/site-packages/torch/nn/modules/fold.py", line 295, in forward
self.padding, self.stride)
File "/home/cvpr/anaconda3/envs/OPVT/lib/python3.7/site-packages/torch/nn/functional.py", line 3831, in unfold
_pair(dilation), _pair(padding), _pair(stride))
RuntimeError: "im2col_out_cpu" not implemented for 'Byte'
Yes, this is an open issue in PyTorch. A simple fix is just to convert your image tensor from ints to floats; you can do it like this:
img = img.to(torch.float32)
This should solve your problem.
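For context, a minimal sketch of the full flow (a random uint8 tensor stands in for the image read; patch settings follow the question). The float conversion avoids the Byte error, and reshaping each unfolded column back to (C, k, k) is what makes a patch displayable:
import torch
import torch.nn as nn

img = torch.randint(0, 256, (1, 3, 112, 112), dtype=torch.uint8)  # stand-in for a read image
img = img.to(torch.float32)  # without this, Unfold raises the 'Byte' RuntimeError on CPU

unfold = nn.Unfold(kernel_size=12, stride=28, padding=4)
cols = unfold(img)                                       # shape (1, 3*12*12, L)
L = cols.shape[-1]                                       # number of patches (16 here)
patches = cols.transpose(1, 2).reshape(1, L, 3, 12, 12)  # each patch is (C, 12, 12)
print(patches.shape)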

Simple logistic regression with Statsmodels: Adding an intercept and visualizing the logistic regression equation

Using Statsmodels, I am trying to generate a simple logistic regression model to predict whether a person smokes or not (Smoke) based on their height (Hgt).
I have a feeling that an intercept needs to be included in the logistic regression model, but I am not sure how to implement one using the add_constant() function. Also, I am unsure why the error below is generated.
This is the dataset, Pulse.CSV: https://drive.google.com/file/d/1FdUK9p4Dub4NXsc-zHrYI-AGEEBkX98V/view?usp=sharing
The full code and output are in this PDF file: https://drive.google.com/file/d/1kHlrAjiU7QvFXF2a7tlTSFPgfpq9bOXJ/view?usp=sharing
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
raw_data = pd.read_csv('Pulse.csv')
raw_data
x1 = raw_data['Hgt']
y = raw_data['Smoke']
reg_log = sm.Logit(y,x1,missing='Drop')
results_log = reg_log.fit()
def f(x, b0, b1):
    return np.array(np.exp(b0 + x*b1) / (1 + np.exp(b0 + x*b1)))
f_sorted = np.sort(f(x1,results_log.params[0],results_log.params[1]))
x_sorted = np.sort(np.array(x1))
plt.scatter(x1,y,color='C0')
plt.xlabel('Hgt', fontsize = 20)
plt.ylabel('Smoked', fontsize = 20)
plt.plot(x_sorted,f_sorted,color='C8')
plt.show()
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
4729 try:
-> 4730 return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
4731 except KeyError as e1:
((( Truncated for brevity )))
IndexError: index out of bounds
An intercept is not added by default in Statsmodels regressions, but you can include one manually if needed.
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
raw_data = pd.read_csv('Pulse.csv')
raw_data
x1 = raw_data['Hgt']
y = raw_data['Smoke']
x1 = sm.add_constant(x1)
reg_log = sm.Logit(y,x1,missing='Drop')
results_log = reg_log.fit()
results_log.summary()
def f(x, b0, b1):
    return np.array(np.exp(b0 + x*b1) / (1 + np.exp(b0 + x*b1)))
# use the Hgt column only; after add_constant, x1 also contains the constant column
f_sorted = np.sort(f(x1['Hgt'], results_log.params[0], results_log.params[1]))
x_sorted = np.sort(np.array(x1['Hgt']))
plt.scatter(x1['Hgt'],y,color='C0')
plt.xlabel('Hgt', fontsize = 20)
plt.ylabel('Smoked', fontsize = 20)
plt.plot(x_sorted,f_sorted,color='C8')
plt.show()
This will also resolve the error, as there was no intercept in your initial code. (Source)
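As an aside, a sketch using the formula API, which adds the intercept automatically so add_constant() is not needed (column names as in Pulse.csv):
import pandas as pd
import statsmodels.formula.api as smf

raw_data = pd.read_csv('Pulse.csv')
# the formula interface includes an intercept term by default
results_log = smf.logit('Smoke ~ Hgt', data=raw_data, missing='drop').fit()
print(results_log.summary())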

hstack csr matrix with pandas array

I am doing an exercise on Amazon Reviews; below is the code.
Basically, I am not able to add a column (a pandas array) to the CSR matrix I got after applying BoW.
Even though the number of rows in both matrices matches, I am not able to get through.
import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
from sklearn.manifold import TSNE
#Create Connection to sqlite3
con = sqlite3.connect('C:/Users/609316120/Desktop/Python/Amazon_Review_Exercise/database/database.sqlite')
filtered_data = pd.read_sql_query("""select * from Reviews where Score != 3""", con)
def partition(x):
    if x < 3:
        return 'negative'
    return 'positive'
actualScore = filtered_data['Score']
actualScore.head()
positiveNegative = actualScore.map(partition)
positiveNegative.head(10)
filtered_data['Score'] = positiveNegative
filtered_data.head(1)
filtered_data.shape
display = pd.read_sql_query("""select * from Reviews where Score !=3 and Userid="AR5J8UI46CURR" ORDER BY PRODUCTID""", con)
sorted_data = filtered_data.sort_values('ProductId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
final=sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
final.shape
display = pd.read_sql_query(""" select * from reviews where score != 3 and id=44737 or id = 64422 order by productid""", con)
final=final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]
final['Score'].value_counts()
count_vect = CountVectorizer()
final_counts = count_vect.fit_transform(final['Text'].values)
final_counts.shape
type(final_counts)
positive_negative = final['Score']
# the line below is the one giving the error
final_counts = hstack((final_counts,positive_negative))
sparse.hstack combines the COO-format versions of the inputs into a new COO-format matrix.
final_counts is a CSR matrix, so the sparse.coo_matrix(final_counts) conversion is trivial.
positive_negative is a column of a DataFrame. Look at
sparse.coo_matrix(positive_negative)
It is probably a (1,n) sparse matrix, but to combine it with final_counts it needs to be (n,1) shaped.
Try creating the sparse matrix and transposing it:
sparse.hstack((final_counts, sparse.coo_matrix(positive_negative).T))
I used the below but am still getting an error:
merged_data = scipy.sparse.hstack((final_counts, scipy.sparse.coo_matrix(positive_negative).T))
Below is the error:
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
NameError: name 'sparse' is not defined
>>> merged_data = scipy.sparse.hstack((final_counts, sparse.coo_matrix(positive_negative).T))
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
NameError: name 'sparse' is not defined
>>> merged_data = scipy.sparse.hstack((final_counts, scipy.sparse.coo_matrix(positive_negative).T))
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "C:\Python34\lib\site-packages\scipy\sparse\construct.py", line 464, in hstack
    return bmat([blocks], format=format, dtype=dtype)
  File "C:\Python34\lib\site-packages\scipy\sparse\construct.py", line 600, in bmat
    dtype = upcast(*all_dtypes) if all_dtypes else None
  File "C:\Python34\lib\site-packages\scipy\sparse\sputils.py", line 52, in upcast
    raise TypeError('no supported conversion for types: %r' % (args,))
TypeError: no supported conversion for types: (dtype('int64'), dtype('O'))
I was facing the same issue with sparse matrices. You can convert the CSR matrix to dense with todense() and then use np.hstack((dataframe.values, converted_dense_matrix)); that will work. You can't handle sparse matrices with numpy.hstack.
However, for a very large dataset, converting to a dense matrix is not a good idea. In your case scipy's hstack won't work because the dtypes differ: hstack(int, object).
Try positive_negative = final['Score'].values and then scipy.sparse.hstack it. If that doesn't work, can you give me the output of positive_negative.dtype?

small test_set xgb predict

I would like to ask a question about a problem I have had for the last couple of days.
First of all, I am a beginner in machine learning, and this is my first time using the XGBoost algorithm, so excuse me for any mistakes I have made.
I trained my model to predict whether a log file is malicious or not. After I save and reload my model in a different session, I use the predict function, which seems to work normally (with a few deviations in probabilities, but that is another topic; I have seen it covered elsewhere).
The problem is this: sometimes when I try to predict a "small" csv file after loading, prediction seems to be broken, returning only the zero label, even for rows that were categorized correctly before.
For example, I load a dataset containing 20,000 values and predict() works. I keep only the first 5 of these values using pandas drop; it still works. If I save the 5 values to a different csv and reload it, it does not work. The same error happens if I just remove all other rows (19,995) by hand and save the file with only the 5 remaining.
I would bet it is a file-size problem, but when I drop the rows in the dataframe through pandas it works.
Also, the number 5 (of rows) is just for example purposes; the same happens if I delete a large portion of the dataset.
I first came across this problem after trying to verify some completely new logs by hand, which are classified correctly if thrown into the big csv file but not in a new file on their own.
Here is my load and predict code:
##IMPORTS
import os
import pandas as pd
from pandas.compat import StringIO
from datetime import datetime
from langid.langid import LanguageIdentifier, model
import langid
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.externals import joblib
from ggplot import ggplot, aes, geom_line
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import average_precision_score
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from collections import defaultdict
import pickle
df = pd.read_csv('big_test.csv')
df3 = pd.read_csv('small_test.csv')
#This one is necessary for the loaded_model
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column_list):
        self.column_list = column_list
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        if len(self.column_list) == 1:
            return x[self.column_list[0]].values
        else:
            return x[self.column_list].to_dict(orient='records')
loaded_model = joblib.load('finalized_model.sav')
result = loaded_model.predict(df)
print(result)
df2=df[:5]
result2 = loaded_model.predict(df2)
print(result2)
result3 = loaded_model.predict(df3)
print(result3)
The results i get are these:
[1 0 1 ... 0 0 0]
[1 0 1 0 1]
[0 0 0 0 0]
I can provide any code even from training or my dataset if necessary.
*EDIT: I use a pipeline for my data. I tried to reproduce the error after using xgb to fit the iris data and I could not. Maybe there is something wrong with my pipeline? The code is below:
df = pd.read_csv('big_test.csv')
# df.info()
# Split Dataset
attributes = ['uri','code','r_size','DT_sec','Method','http_version','PenTool','has_referer', 'Lang','LangProb','GibberFlag' ]
x_train, x_test, y_train, y_test = train_test_split(df[attributes], df['Scan'], test_size=0.2,
                                                    stratify=df['Scan'], random_state=0)
x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train, test_size=0.2,
                                                  stratify=y_train, random_state=0)
# print('Train:', len(y_train), 'Dev:', len(y_dev), 'Test:', len(y_test))
# set up graph function
def plot_precision_recall_curve(y_true, y_pred_scores):
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_scores)
    return ggplot(aes(x='recall', y='precision'),
                  data=pd.DataFrame({"precision": precision, "recall": recall})) + geom_line()
# XGBClassifier
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column_list):
        self.column_list = column_list
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        if len(self.column_list) == 1:
            return x[self.column_list[0]].values
        else:
            return x[self.column_list].to_dict(orient='records')
count_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 2), min_df=10)
dict_vectorizer = DictVectorizer()
xgb = XGBClassifier(seed=0)
pipeline = Pipeline([
    ("feature_union", FeatureUnion([
        ('text_features', Pipeline([
            ('selector', ColumnSelector(['uri'])),
            ('count_vectorizer', count_vectorizer)
        ])),
        ('categorical_features', Pipeline([
            ('selector', ColumnSelector(['code', 'r_size', 'DT_sec', 'Method', 'http_version', 'PenTool', 'has_referer', 'Lang', 'LangProb', 'GibberFlag'])),
            ('dict_vectorizer', dict_vectorizer)
        ]))
    ])),
    ('xgb', xgb)
])
pipeline.fit(x_train, y_train)
filename = 'finalized_model.sav'
joblib.dump(pipeline, filename)
That's due to different dtypes in the big and small files.
When you do:
df = pd.read_csv('big_test.csv')
The dtypes are these:
print(df.dtypes)
# Output
uri object
code object # <== Observe this
r_size object # <== Observe this
Scan int64
...
...
...
Now when you do:
df3 = pd.read_csv('small_test.csv')
the dtypes are changed:
print(df3.dtypes)
# Output
uri object
code int64 # <== Now this has changed
r_size int64 # <== Now this has changed
Scan int64
...
...
You see, pandas tries to determine the dtypes of the columns by itself. When you load big_test.csv, some values in the code and r_size columns are strings; because of this, each whole column's dtype becomes object (string), which does not happen with small_test.csv.
Because of this change, the DictVectorizer encodes the data differently than before, the features change, and hence the results change as well.
If you do this:
df3[['code', 'r_size']] = df3[['code', 'r_size']].astype(str)
and then call predict(), the results are the same again.
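A sketch of an alternative (file and column names as in the question): pinning the dtypes at read time, so both files load identically:
import pandas as pd

# force the ambiguous columns to str at load time; this keeps the
# DictVectorizer features identical between the big and small files
df3 = pd.read_csv('small_test.csv', dtype={'code': str, 'r_size': str})
print(df3.dtypes)  # code and r_size now load as object, matching big_test.csv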