LightGBM model returning the same score with different parameters - data-science

I'm trying to train a LightGBM model on the Kaggle Iowa housing dataset and I wrote a small script to randomly try different parameters within a given range. I'm not sure what's wrong with my code, but the script returns the same score with different parameters, which shouldn't be happening. I tried the same script with Catboost and it works as expected, so I'm guessing the issue is with LGBM.
The code:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from random import choice, randrange, uniform
complete_train = pd.read_csv(
"train.csv",
encoding = "UTF-8",
index_col = "Id")
complete_test = pd.read_csv(
"test.csv",
encoding = "UTF-8",
index_col = "Id")
def encode_impute(*datasets):
    """Fill missing values and convert text columns to categoricals, in place.

    For every column of every DataFrame passed in, missing values are
    replaced by the sentinel ``-999``. Columns whose dtype is ``object``
    after filling are then converted to the pandas ``category`` dtype.

    Args:
        *datasets: pandas DataFrames to modify in place.

    Returns:
        None. The DataFrames themselves are mutated.
    """
    for dataset in datasets:
        for column in dataset.columns:
            # Assign the filled Series back rather than calling
            # ``dataset[column].fillna(..., inplace=True)``: the chained
            # in-place form works on a temporary Series and does not update
            # the DataFrame when pandas copy-on-write is active.
            filled = dataset[column].fillna(-999)
            if filled.dtype == "object":
                filled = filled.astype("category")
            dataset[column] = filled
encode_impute(
complete_train,
complete_test)
X = complete_train.drop(
columns = "SalePrice")
y = complete_train[
"SalePrice"]
X_train, X_valid, y_train, y_valid = train_test_split(X, y)
def objective(n_trials=None):
    """Random-search LightGBM hyper-parameters and print the validation MAE.

    Each trial samples a parameter set, fits an ``LGBMRegressor`` on
    ``X_train``/``y_train`` and prints the mean absolute error measured on
    ``X_valid``/``y_valid`` together with the sampled parameters.

    Args:
        n_trials: Number of trials to run. ``None`` (the default) keeps
            sampling forever, so the search has to be interrupted manually.
    """
    n_rows = len(X_train)
    trial = 0
    while n_trials is None or trial < n_trials:
        trial += 1
        params = {
            "boosting_type": choice(["gbdt", "goss", "dart", "rf"]),
            # LightGBM requires num_leaves > 1.
            "num_leaves": randrange(2, 10000),
            "learning_rate": uniform(0.01, 1),
            # Must be strictly positive.
            "subsample_for_bin": randrange(1, 100000000),
            # A split needs at least this many rows in *each* child. The
            # previous upper bound of 100000000 was far above the number of
            # training rows, so no split was ever possible and every model
            # predicted the mean of the target, giving the same score for
            # every parameter set.
            "min_data_in_leaf": randrange(1, max(2, n_rows // 2 + 1)),
            "reg_alpha": uniform(0, 1),
            "reg_lambda": uniform(0, 1),
            "feature_fraction": uniform(0, 1),
            "bagging_fraction": uniform(0, 1),
            "bagging_freq": randrange(1, 100)}
        # GOSS cannot be combined with bagging, so disable row subsampling.
        if params["boosting_type"] == "goss":
            params["bagging_fraction"] = 1.0
        model = LGBMRegressor(**params)
        model.fit(X_train, y_train)
        predictions = model.predict(X_valid)
        error_rate = mean_absolute_error(y_valid, predictions)
        print(f"Score = {error_rate} with parameters: {params}","\n" *5)
objective()
Example of the output I'm getting:
Score = 55967.70375930444 with parameters: {'boosting_type': 'gbdt', 'num_leaves': 6455, 'learning_rate': 0.2479700848039991, 'subsample_for_bin': 83737077, 'min_data_in_leaf': 51951103, 'reg_alpha': 0.1856001984332697, 'reg_lambda': 0.7849262049058852, 'feature_fraction': 0.10550627738309537, 'bagging_fraction': 0.2613298736131875, 'bagging_freq': 96}
Score = 55967.70375930444 with parameters: {'boosting_type': 'dart', 'num_leaves': 9678, 'learning_rate': 0.28670432435369037, 'subsample_for_bin': 24246091, 'min_data_in_leaf': 559094, 'reg_alpha': 0.07261459695501371, 'reg_lambda': 0.8834743560240725, 'feature_fraction': 0.5361519020265366, 'bagging_fraction': 0.9120030047714073, 'bagging_freq': 10}
Score = 55967.70375930444 with parameters: {'boosting_type': 'goss', 'num_leaves': 4898, 'learning_rate': 0.09237499846487345, 'subsample_for_bin': 32620066, 'min_data_in_leaf': 71317820, 'reg_alpha': 0.9818297737748625, 'reg_lambda': 0.11638265354331834, 'feature_fraction': 0.4230342728468828, 'bagging_fraction': 1.0, 'bagging_freq': 64}

I would point out that the min_data_in_leaf parameter is very high in all of the sampled parameter sets. I suspect the model is not learning anything: with only a root node it cannot split, so it just returns the average value of the response variable.

Related

How to concat laserembeddings with huggingface funnel transformers simple CLS output for NLP sequence classification task?

I am working on an NLP sequence classification problem (3 classes) using Hugging Face transformers (funnel-transformer/large) and TensorFlow.
First, I created the LASER embeddings like this:
from laserembeddings import Laser
laser = Laser()
df = pd.read_csv("mycsv.csv")
embeds = laser.embed_sentences(df['text'].values, lang='en')
write_pickle_to_file('train.pkl', embeds )
part 1 : Tensorflow version
for data preparation i use code like below :
df['text']=temp['column1']+tokenizer.sep_token+temp['column2']+tokenizer.sep_token+temp['column3']
def encode_text(texts):
    """Tokenize texts into fixed-length id, mask and segment arrays.

    Every text is padded or truncated to ``cfg.max_len`` tokens.

    Args:
        texts: List of strings to encode.

    Returns:
        A three-element list of int64 NumPy arrays, in this order:
        input ids, attention mask, token type ids.
    """
    # ``padding='max_length'`` already requests fixed-length padding; the
    # deprecated ``pad_to_max_length`` flag was redundant and is dropped.
    encoded = tokenizer.batch_encode_plus(
        texts,
        padding='max_length',
        truncation=True,
        return_token_type_ids=True,
        max_length=cfg.max_len,
    )
    return [
        np.asarray(encoded[key], dtype=np.int64)
        for key in ('input_ids', 'attention_mask', 'token_type_ids')
    ]
then inside training function :
x_train = encode_text(df.text.to_list())
train_ds = (
tf.data.Dataset
.from_tensor_slices((
{
"input_ids": x_train[0],
"input_masks": x_train[1],
"input_segments": x_train[2],
"lasers": np.array( train[laser_columns].values, dtype=np.float32 ) #laser_columns contains all the laser embedded columns
},
tf.one_hot(df["label"].to_list(), 3) #3 class
))
.repeat()
.shuffle(2048)
.batch(cfg.batch_size)
.prefetch(AUTO)
)
i add laser embedding in my model like this :
def create_model():
    """Build and compile the classifier that fuses transformer and LASER features.

    The first-token (CLS) vector of the transformer output is concatenated
    with a dense projection of the LASER embedding and passed through a
    small MLP that ends in a 3-way softmax.

    Returns:
        A compiled Keras ``Model`` with the inputs ``input_ids``,
        ``input_masks``, ``input_segments`` and ``lasers``.
    """
    transformer = transformers.TFAutoModel.from_pretrained(cfg.pretrained, config=config, from_pt=True)
    # NOTE(review): presumably this has to equal cfg.max_len used when the
    # texts are encoded -- confirm.
    max_len = 512

    # transformer
    input_ids = Input(shape=(max_len,), dtype="int32", name="input_ids")
    input_masks = Input(shape=(max_len,), dtype="int32", name="input_masks")
    input_segments = Input(shape=(max_len,), dtype="int32", name="input_segments")
    sequence_output = transformer(input_ids, attention_mask=input_masks, token_type_ids=input_segments)[0]
    cls_token = sequence_output[:, 0, :]

    # lasers
    lasers = Input(shape=(n_lasers,), dtype=tf.float32, name="lasers")  # n_lasers = 1024
    lasers_output = tf.keras.layers.Dense(n_lasers, activation='tanh')(lasers)

    x = tf.keras.layers.Concatenate()([cls_token, lasers_output])
    x = tf.keras.layers.Dropout(0.1)(x)
    x = tf.keras.layers.Dense(2048, activation='tanh')(x)
    x = tf.keras.layers.Dropout(0.1)(x)
    out = tf.keras.layers.Dense(3, activation='softmax')(x)

    model = Model(inputs=[input_ids, input_masks, input_segments, lasers], outputs=out)
    # ``lr`` is a deprecated alias that newer Keras releases reject;
    # ``learning_rate`` is the supported argument name.
    model.compile(
        Adam(learning_rate=1e-5),
        loss=losses.CategoricalCrossentropy(),
        metrics=["acc", metrics.CategoricalCrossentropy(name='xentropy')],
    )
    return model
Now my question is: how do we do the same with PyTorch for the exact same problem and the same dataset?
part 2 : pytorch version
df = pd.read_csv("mytrain.csv")
class myDataset(Dataset):
    """Dataset that tokenizes a text pair built from four DataFrame columns.

    The first sequence is ``column1 column2 column3`` joined by single
    spaces, the second sequence is ``column4``. When ``training`` is true
    the ``label`` column is returned under the ``target`` key.
    """

    def __init__(self, df, max_length, tokenizer, training=True):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        # Cache the raw column arrays once so item access avoids DataFrame
        # indexing.
        self.column1 = df['column1'].values
        self.column2 = df['column2'].values
        self.column3 = df['column3'].values
        self.column4 = df['column4'].values
        self.training = training
        if training:
            self.targets = df['label'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        parts = (self.column1[index], self.column2[index], self.column3[index])
        second_text = self.column4[index]
        first_text = parts[0] + ' ' + parts[1] + ' ' + parts[2]

        encoded = self.tokenizer.encode_plus(
            first_text,
            second_text,
            truncation=True,
            add_special_tokens=True,
            return_token_type_ids=True,
            is_split_into_words=False,
            max_length=self.max_len,
        )

        item = {key: encoded[key] for key in ('input_ids', 'attention_mask')}
        # Segment ids are forwarded only when the tokenizer returns them.
        if 'token_type_ids' in encoded:
            item['token_type_ids'] = encoded['token_type_ids']
        if self.training:
            item['target'] = self.targets[index]
        return item
collate_fn = DataCollatorWithPadding(tokenizer=CONFIG['tokenizer'])
class myModel(nn.Module):
    """Pretrained transformer encoder with a linear 3-class head.

    The head is applied to the hidden state of the first token of the
    encoder output.

    Args:
        model_name: Name or path accepted by ``AutoConfig.from_pretrained``
            and ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        # Build and adjust the config *before* loading the model and pass it
        # to from_pretrained. Previously the overrides were applied to a
        # separate config object after the model had been created, so they
        # never reached the encoder.
        # NOTE(review): which of these keys an architecture actually reads
        # depends on its config class -- confirm for the model in use.
        self.config = AutoConfig.from_pretrained(model_name)
        self.config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": 0.0,
                "layer_norm_eps": 1e-7,
                "add_pooling_layer": False,
                "attention_probs_dropout_prob": 0.0,
            }
        )
        self.model = AutoModel.from_pretrained(model_name, config=self.config)
        print("using gradient_checkpoint...")
        self.model.gradient_checkpointing_enable()
        self.fc = nn.Linear(self.config.hidden_size, 3)

    def forward(self, ids, mask):
        """Return the 3-class logits for a batch of token ids and masks."""
        out = self.model(input_ids=ids, attention_mask=mask, output_hidden_states=False)
        # First element of the model output, sliced at sequence position 0.
        first_token = out[0][:, 0, :]
        return self.fc(first_token)
and in train and validation loop i have code like this :
bar = tqdm(enumerate(dataloader), total=len(dataloader))
for step, data in bar:
ids = data['input_ids'].to(device, dtype = torch.long)
mask = data['attention_mask'].to(device, dtype = torch.long)
targets = data['target'].to(device, dtype=torch.long)
batch_size = ids.size(0)
optimizer.zero_grad()
# forward pass with `autocast` context manager
with autocast(enabled=True):
outputs = model(ids, mask)
loss = loss_fct(outputs, targets)
I would like to know where and how in my Hugging Face PyTorch pipeline I can use the LASER embeddings that I created earlier and used in the TensorFlow Hugging Face model.
I would like to concatenate the LASER embeddings with the funnel transformer's simple CLS token output and train the transformer model with the LASER embeddings as an extra feature, exactly as I did in the TensorFlow example. Do you know how to modify my PyTorch code to make this work? The TensorFlow implementation with the concatenated LASER embeddings posted above works well; I just want to do the same in the PyTorch implementation. Your help is highly appreciated, thanks in advance.

tensorflow Exception encountered when calling layer (type CategoryEncoding)

I'm trying to code a layer to interface between a data set (numerical and categorical features) so it can be fed into a model.
I can't understand the error I get when it comes to categorical columns.
ValueError: Exception encountered when calling layer (type CategoryEncoding).
When output_mode is not 'int', maximum supported output rank is 2. Received
output_mode multi_hot and input shape (10, 7, 1), which would result in output rank 3.
From what I understand, the batch size should not have been counted in, but it is. And that seems to break.
Note that reproducing with only numerical features works fine.
Thank you for your help.
import tensorflow as tf
import pandas as pd
import numpy as np
# Simulate a data set of categorical and numerical values
# Configure simulation specifications: {feature: number of unique categories or None for numerical}
theSimSpecs = {'Cat1': 54, 'Cat2': 2, 'Cat3': 4, 'Num1': None, 'Num2': None}
# theSimSpecs = {'Num1': None, 'Num2': None}
# batch size and timesteps
theBatchSz, theTimeSteps = 10, 4
# Creation of the dataset as pandas.DataFrame
theDFs = []
for theFeature, theUniques in theSimSpecs.items():
if theUniques is None:
theDF = pd.DataFrame(np.random.random(size=theBatchSz * theTimeSteps), columns=[theFeature])
else:
theDF = pd.DataFrame(np.random.randint(low=0, high=theUniques, size=theBatchSz * theTimeSteps),
columns=[theFeature]).astype('category')
theDFs.append(theDF)
theDF = pd.concat(theDFs, axis=1)
# code excerpt
# inventory of the categorical features' values ( None for the numerical)
theCatCodes = {theCol: (theDF[theCol].unique().tolist() if str(theDF[theCol].dtypes) == "category" else None)
for theCol in theDF.columns}
# Creation of the batched tensorflow.data.Dataset
theDS = tf.data.Dataset.from_tensor_slices(dict(theDF))
theDS = theDS.window(size=theTimeSteps, shift=1, stride=1, drop_remainder=True)
theDS = theDS.flat_map(lambda x: tf.data.Dataset.zip(x))
theDS = theDS.batch(batch_size=theTimeSteps, drop_remainder=True)
theDS = theDS.batch(batch_size=theBatchSz, drop_remainder=True)
# extracting one batch
theBatch = next(iter(theDS))
tf.print(theBatch)
# Creation of the components for the interface layer
theFeaturesInputs = {}
theFeaturesEncoded = {}
for theFeature, theCodes in theCatCodes.items():
if theCodes is None: # Pass-through for numerical features
theNumInput = tf.keras.layers.Input(shape=[], dtype=tf.float32, name=theFeature)
theFeaturesInputs[theFeature] = theNumInput
theFeatureExp = tf.expand_dims(input=theNumInput, axis=-1)
theFeaturesEncoded[theFeature] = theFeatureExp
else: # Process for categorical features
theCatInput = tf.keras.layers.Input(shape=[], dtype=tf.int64, name=theFeature)
theFeaturesInputs[theFeature] = theCatInput
theFeatureExp = tf.expand_dims(input=theCatInput, axis=-1)
theEncodingLayer = tf.keras.layers.CategoryEncoding(num_tokens=theSimSpecs[theFeature], name=f"{theFeature}_enc",
output_mode="multi_hot", sparse=False)
theFeaturesEncoded[theFeature] = theEncodingLayer(theFeatureExp)
theStackedInputs = tf.concat(tf.nest.flatten(theFeaturesEncoded), axis=1)
theModel = tf.keras.Model(inputs=theFeaturesInputs, outputs=theStackedInputs)
theOutput = theModel(theBatch)
tf.print(theOutput)

GoogleNet fails to classify images

I built Keras Google Net from here:
https://www.analyticsvidhya.com/blog/2018/10/understanding-inception-network-from-scratch/
The only difference is that I replaced 1000 classes in output layers with 3. data is prepared this way :
def grey_preprocessor(xarray):
    """Linearly rescale values so that 0 maps to -1 and 255 maps to 1."""
    return xarray / 127.5 - 1
img_resol = (224,224)
train_batches = ImageDataGenerator(horizontal_flip = True, preprocessing_function = grey_preprocessor).flow_from_directory(
directory = train_path, target_size=img_resol, classes = ['bacterial', 'healthy', 'viral'], batch_size = 10)
valid_batches = ImageDataGenerator(horizontal_flip = True, preprocessing_function = grey_preprocessor).flow_from_directory(
directory = valid_path, target_size=img_resol, classes = ['bacterial', 'healthy', 'viral'], batch_size = 10)
test_batches = ImageDataGenerator(horizontal_flip = True, preprocessing_function = grey_preprocessor).flow_from_directory(
directory = test_path, target_size=img_resol, classes = ['bacterial', 'healthy', 'viral'], batch_size = 10, shuffle = False)
assert train_batches.n == 4222
assert valid_batches.n == 300
assert test_batches.n == 150
assert train_batches.num_classes == valid_batches.num_classes == test_batches.num_classes == 3
I train it like this:
history = model.fit(train_batches, validation_data=valid_batches, epochs=epochs, batch_size=256, callbacks=[lr_sc])
However, all the accuracies on every batch are 0.3333, which means it doesn't classify at all. I understand that it can be anything. What is a good way to troubleshoot it?
If you want to normalize your grayscale image use this!
def gray_preprocessor(xarray):
    """Divide the values by 255 so that 255 maps to 1.0."""
    return xarray / 255.0
or you can also use lambda function:
gray_preprocessor = lambda xarray : xarray / 255.0

AttributeError: 'list' object has no attribute 'model_dir'

I'm running a wide_deep.py script for linear regression in tensorflow.
I have cloned the models directory also as a part of process.
But I'm getting an error: AttributeError: 'list' object has no attribute 'model_dir'.
If I hard-code this particular variable, I get other errors such as AttributeError: 'list' object has no attribute 'data_dir', and so on.
Code:
"""Example code for TensorFlow Wide & Deep Tutorial using tf.estimator API."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import shutil
from absl import app as absl_app
from absl import flags
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.utils.flags import core as flags_core
from official.utils.logs import hooks_helper
from official.utils.misc import model_helpers
_CSV_COLUMNS = [
'age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'gender',
'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
'income_bracket'
]
_CSV_COLUMN_DEFAULTS = [[0], [''], [0], [''], [0], [''], [''], [''], [''], [''],
[0], [0], [0], [''], ['']]
_NUM_EXAMPLES = {
'train': 32561,
'validation': 16281,
}
LOSS_PREFIX = {'wide': 'linear/', 'deep': 'dnn/'}
def define_wide_deep_flags():
    """Register the base flags and the ``model_type`` selector, then set defaults."""
    flags_core.define_base()
    flags.adopt_module_key_flags(flags_core)

    flags.DEFINE_enum(
        name="model_type",
        short_name="mt",
        default="wide_deep",
        enum_values=['wide', 'deep', 'wide_deep'],
        help="Select model topology.",
    )

    # Default values used by this example.
    example_defaults = {
        'data_dir': '/tmp/census_data',
        'model_dir': '/tmp/census_model',
        'train_epochs': 40,
        'epochs_between_evals': 2,
        'batch_size': 40,
    }
    flags_core.set_defaults(**example_defaults)
def build_model_columns():
    """Create the feature columns for the wide and the deep part of the model.

    Returns:
        A ``(wide_columns, deep_columns)`` tuple of feature-column lists.
    """
    fc = tf.feature_column
    vocab_column = fc.categorical_column_with_vocabulary_list

    # Continuous columns
    age = fc.numeric_column('age')
    education_num = fc.numeric_column('education_num')
    capital_gain = fc.numeric_column('capital_gain')
    capital_loss = fc.numeric_column('capital_loss')
    hours_per_week = fc.numeric_column('hours_per_week')

    # Categorical columns with a fixed vocabulary
    education = vocab_column(
        'education', [
            'Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
            'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
            '5th-6th', '10th', '1st-4th', 'Preschool', '12th'])
    marital_status = vocab_column(
        'marital_status', [
            'Married-civ-spouse', 'Divorced', 'Married-spouse-absent',
            'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'])
    relationship = vocab_column(
        'relationship', [
            'Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried',
            'Other-relative'])
    workclass = vocab_column(
        'workclass', [
            'Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov',
            'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'])

    # Hashed categorical column (no explicit vocabulary)
    occupation = fc.categorical_column_with_hash_bucket(
        'occupation', hash_bucket_size=1000)

    # Bucketized version of the numeric age column
    age_buckets = fc.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

    # Wide part: sparse base columns plus feature crosses
    wide_columns = [
        education, marital_status, relationship, workclass, occupation,
        age_buckets,
        fc.crossed_column(
            ['education', 'occupation'], hash_bucket_size=1000),
        fc.crossed_column(
            [age_buckets, 'education', 'occupation'], hash_bucket_size=1000),
    ]

    # Deep part: numeric columns, indicator columns and one embedding
    deep_columns = [age, education_num, capital_gain, capital_loss, hours_per_week]
    deep_columns += [
        fc.indicator_column(column)
        for column in (workclass, education, marital_status, relationship)
    ]
    deep_columns.append(fc.embedding_column(occupation, dimension=8))

    return wide_columns, deep_columns
def build_estimator(model_dir, model_type):
    """Return the estimator matching ``model_type``.

    Args:
        model_dir: Directory passed to the estimator as ``model_dir``.
        model_type: ``'wide'`` selects a linear classifier, ``'deep'`` a DNN
            classifier; any other value selects the combined model.

    Returns:
        A ``tf.estimator`` classifier instance.
    """
    wide_columns, deep_columns = build_model_columns()
    hidden_units = [100, 75, 50, 25]

    # Create a tf.estimator.RunConfig to ensure the model is run on CPU, which
    # trains faster than GPU for this model.
    cpu_only = tf.ConfigProto(device_count={'GPU': 0})
    shared_kwargs = {
        'model_dir': model_dir,
        'config': tf.estimator.RunConfig().replace(session_config=cpu_only),
    }

    if model_type == 'wide':
        return tf.estimator.LinearClassifier(
            feature_columns=wide_columns, **shared_kwargs)
    if model_type == 'deep':
        return tf.estimator.DNNClassifier(
            feature_columns=deep_columns,
            hidden_units=hidden_units,
            **shared_kwargs)
    return tf.estimator.DNNLinearCombinedClassifier(
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=hidden_units,
        **shared_kwargs)
def input_fn(data_file, num_epochs, shuffle, batch_size):
    """Build the ``tf.data`` pipeline that feeds the Estimator.

    Args:
        data_file: Path of the CSV file to read line by line.
        num_epochs: Number of times the data is repeated.
        shuffle: Whether to shuffle the lines before parsing.
        batch_size: Number of examples per batch.

    Returns:
        A dataset of ``(features, labels)`` batches, where ``labels`` is the
        boolean result of comparing ``income_bracket`` with ``'>50K'``.
    """
    assert tf.gfile.Exists(data_file), (
        '%s not found. Please make sure you have run data_download.py and '
        'set the --data_dir argument to the correct path.' % data_file)

    def parse_csv(value):
        print('Parsing', data_file)
        parsed = tf.decode_csv(value, record_defaults=_CSV_COLUMN_DEFAULTS)
        features = dict(zip(_CSV_COLUMNS, parsed))
        income = features.pop('income_bracket')
        return features, tf.equal(income, '>50K')

    # Extract lines from input files using the Dataset API.
    lines = tf.data.TextLineDataset(data_file)
    if shuffle:
        lines = lines.shuffle(buffer_size=_NUM_EXAMPLES['train'])

    # We call repeat after shuffling, rather than before, to prevent separate
    # epochs from blending together.
    return (
        lines
        .map(parse_csv, num_parallel_calls=5)
        .repeat(num_epochs)
        .batch(batch_size)
    )
def export_model(model, model_type, export_dir):
    """Export to SavedModel format.

    Args:
      model: Estimator object
      model_type: string indicating model type. "wide", "deep" or "wide_deep"
      export_dir: directory to export the model.
    """
    wide_columns, deep_columns = build_model_columns()

    # Pick the columns for the requested model; any other model type uses
    # both sets.
    columns_by_type = {'wide': wide_columns, 'deep': deep_columns}
    columns = columns_by_type.get(model_type, wide_columns + deep_columns)

    serving_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
        tf.feature_column.make_parse_example_spec(columns))
    model.export_savedmodel(export_dir, serving_input_fn)
def run_wide_deep(flags_obj):
    """Train, periodically evaluate and optionally export the model.

    Args:
      flags_obj: An object containing parsed flag values.
    """
    # Start from an empty model directory.
    shutil.rmtree(flags_obj.model_dir, ignore_errors=True)
    model = build_estimator(flags_obj.model_dir, flags_obj.model_type)

    train_file = os.path.join(flags_obj.data_dir, 'adult.data')
    test_file = os.path.join(flags_obj.data_dir, 'adult.test')

    def train_input_fn():
        return input_fn(
            train_file, flags_obj.epochs_between_evals, True,
            flags_obj.batch_size)

    def eval_input_fn():
        return input_fn(test_file, 1, False, flags_obj.batch_size)

    prefix = LOSS_PREFIX.get(flags_obj.model_type, '')
    logged_tensors = {
        'average_loss': prefix + 'head/truediv',
        'loss': prefix + 'head/weighted_loss/Sum',
    }
    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks, batch_size=flags_obj.batch_size,
        tensors_to_log=logged_tensors)

    # Train and evaluate the model every `flags.epochs_between_evals` epochs.
    n_cycles = flags_obj.train_epochs // flags_obj.epochs_between_evals
    for cycle in range(1, n_cycles + 1):
        model.train(input_fn=train_input_fn, hooks=train_hooks)
        results = model.evaluate(input_fn=eval_input_fn)

        # Display evaluation metrics
        print('Results at epoch', cycle * flags_obj.epochs_between_evals)
        print('-' * 60)
        for key in sorted(results):
            print('%s: %s' % (key, results[key]))

        if model_helpers.past_stop_threshold(
                flags_obj.stop_threshold, results['accuracy']):
            break

    # Export the model
    if flags_obj.export_dir is not None:
        export_model(model, flags_obj.model_type, flags_obj.export_dir)
def main(_):
    """Run the training loop with the globally parsed flags.

    The single positional argument is ignored.
    """
    parsed_flags = flags.FLAGS
    run_wide_deep(parsed_flags)
if __name__ == '__main__':
tf.logging.set_verbosity(tf.logging.INFO)
define_wide_deep_flags()
absl_app.run(main)
Hunter, I tried to run it without hard-coding but still faced issues with the attributes, so I tried hard-coding them to avoid this.
However, the issue is resolved now.
I cloned the directory again and, instead of copying wide_deep.py to another directory and running it from there (which is what I was doing before), I ran it directly from its original directory, and now it works fine.

Scikit-pandas, cross val score number of features

I am getting familiar with scikit and its pandas integration using the Titanic tutorial on Kaggle. I have cleaned my data and would like to make some prediction. I can do it calling a pipeline fit and transform - unfortunately I get an error trying to do the same with cross_val_score.
I am using the sklearn-pandas cross_val_score
The code is as follows:
mapping = [
('Age', None),
('Embarked',LabelBinarizer()),
('Fare',None),
('Pclass',LabelBinarizer()),
('Sex',LabelBinarizer()),
('Group',LabelBinarizer()),
('familySize',None),
('familyType',LabelBinarizer()),
('Title',LabelBinarizer())
]
pipe = Pipeline([
('featurize', DataFrameMapper(mapping)),
('logReg', LogisticRegression())
])
X = df_train[df_train.columns.drop('Survived')]
y = df_train['Survived']
#model = pipe.fit(X = X, y = y)
#prediction = model.predict(df_train)
score = cross_val_score(pipe, X = X, y = y, scoring = 'accuracy')
df_train is a Pandas dataframe containing all my training set, including outcomes. The two commented lines:
model = pipe.fit(X = X, y = y)
prediction = model.predict(df_train)
Work fine and prediction returns me an array with predicted outcomes. Using the same with cross_val_score, I get the following error:
X has 20 features per sample; expecting 19
Full code below, can be run with the Titanic CSV files on Kaggle (https://www.kaggle.com/c/titanic/data)
#%% Libraries import
import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
#%% Read the data
path = 'E:/Kaggle/Titanic/Data/'
file_training = 'train.csv'
file_test = 'test.csv'
#Import the training and test dataset and concatenate them
df_training = pd.read_csv(path + file_training, header = 0, index_col = 'PassengerId')
df_test = pd.read_csv(path + file_test, header = 0, index_col = 'PassengerId')
# Work on the concatenated training and test data for feature engineering and clean-up
df = pd.concat([df_training, df_test], keys = ['train','test'])
#%% Initial data exploration and cleaning
df.describe(include = 'all')
pd.isnull(df).sum() > 0
#%% Preprocesing and Cleanup
#Create new columns with the name (to identify individuals part of a family)
df['LName'] = df['Name'].apply(lambda x:x.split(',')[0].strip())
df['FName'] = df['Name'].apply(lambda x:x.split(',')[1].split('.')[1].strip())
#Get the title
df['Title'] = df['Name'].apply(lambda x:x.split(',')[1].split('.')[0].strip())
titleDic = {
'Master' : 'kid',
'Mlle' : 'unmarriedWoman',
'Miss' : 'unmarriedWoman',
'Ms' : 'unmarriedWoman',
'Jonkheer' : 'noble',
'Don' : 'noble',
'Dona' : 'noble',
'Sir' : 'noble',
'Lady' : 'noble',
'the Countess' : 'noble',
'Capt' : 'ranked',
'Major' : 'ranked',
'Col' : 'ranked',
'Mr' : 'standard',
'Mme' : 'standard',
'Mrs' : 'standard',
'Dr' : 'academic',
'Rev' : 'academic'
}
df['Group'] = df['Title'].map(titleDic)
#%% Working with the family size
#Get the family size
df['familySize'] = df['Parch'] + df['SibSp'] + 1
#Add a family tag (single, couple, small, large)
df['familyType'] = pd.cut(df['familySize'],
[1,2,3,5,np.inf],
labels = ['single','couple','sFamily','bFamily'],
right = False)
#%% Filling empty values
#Fill empty values with the median, mode or mean of the column
#Fill the missing ages with the median age per group and sex. Store the value in the AgeFull column
agePivot = pd.DataFrame(df.groupby(['Group', 'Sex'])['Age'].median())
agePivot.columns = ['AgeFull']
df = pd.merge(df, agePivot, left_on = ['Group', 'Sex'], right_index = True)
df.loc[df['Age'].isnull(),['Age']] = df['AgeFull']
#Embark location missing values
embarkPivot = pd.DataFrame(df.groupby(['Group'])['Embarked'].agg(lambda x:x.value_counts().index[0]))
embarkPivot.columns = ['embarkFull']
df = pd.merge(df, embarkPivot, left_on = ['Group'], right_index = True)
df.loc[df['Embarked'].isnull(),['Embarked']] = df['embarkFull']
#Fill the missing fare value
df.loc[df['Fare'].isnull(), 'Fare'] = df['Fare'].mean()
#%% Final clean-up (drop temporary columns)
# Name the axis explicitly: passing it positionally (``drop(labels, 1)``) was
# deprecated and is rejected by pandas 2.0 and later.
df = df.drop(['AgeFull', 'embarkFull'], axis=1)
#%% Preparation for training
df_train = df.loc['train']
df_test = df.loc['test']
#Creation of dummy variables
mapping = [
('Age', None),
('Embarked',LabelBinarizer()),
('Fare',None),
('Pclass',LabelBinarizer()),
('Sex',LabelBinarizer()),
('Group',LabelBinarizer()),
('familySize',None),
('familyType',LabelBinarizer()),
('Title',LabelBinarizer())
]
pipe = Pipeline(steps = [
('featurize', DataFrameMapper(mapping)),
('logReg', LogisticRegression())
])
#Uncommenting the line below fixes the code - why?
#df_train = df_train.sort_index()
X = df_train[df_train.columns.drop(['Survived'])]
y = df_train.Survived
score = cross_val_score(pipe, X = df_train, y = df_train.Survived, scoring = 'accuracy')
This is very interesting. I have solved the issue just by sorting the DataFrame by its index before passing it to cross_val_score with the pipeline.
df_train = df_train.sort_index()
Could anyone explain to me why this would have an impact on how scikit-learn works?