Related
I've been stressing out on this problem for so long and I can't seem to find a solution.
I want to train a NER model to recognise animal and species names.
I created a mock training set to test it out. However, I keep getting a ValueError: [E973] Unexpected type for NER data
I have tried other solutions on other posts on StackOverflow, including:
Double checking if my formatting and type of the training set was right
Using spacy.load('en_core_web_sm') instead of spacy.blank('en')
Installing spacy-lookups-data
All of these result in the same error.
import os
import spacy
from spacy.lang.en import English
from spacy.training.example import Example
import random
def train_spacy(data, iterations = 30):
TRAIN_DATA = data
nlp = spacy.blank("en") #start with a blank model
if "ner" not in nlp.pipe_names:
ner = nlp.add_pipe("ner", last = True)
for _, annotations in TRAIN_DATA:
for ent in annotations.get("entities"):
ner.add_label(ent[2])
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
optimizer = nlp.begin_training()
for itn in range(iterations):
print ("Starting iterations "+str(itn))
random.shuffle(TRAIN_DATA)
losses = {}
for text, annotations in TRAIN_DATA:
doc = nlp.make_doc(text)
print(isinstance(annotations["entities"], (list,tuple))) #this prints True
example = Example.from_dict(doc, {"entities":annotations})
nlp.update(
[example],
drop = 0.2,
sgd = optimizer,
losses = losses
)
print(losses)
return (nlp)
if __name__ == "__main__":
#mock training set
TRAIN_DATA=[('Dog is an animal',{'entities':[(0,3,'ANIMAL')]}),
('Cat is on the table',{'entities':[(0,3,'ANIMAL')]}),
('Rats are pets',{'entities':[(0,4,'ANIMAL')]})]
nlp = train_spacy(TRAIN_DATA)
The error message
File "c:\...\summarizer\src\feature_extraction\feature_extraction.py", line 49, in <module>
nlp = train_spacy(TRAIN_DATA)
File "c:\...\summarizer\src\feature_extraction\feature_extraction.py", line 35, in train_spacy
example = Example.from_dict(doc, {"entities":annotations})
File "spacy\training\example.pyx", line 118, in spacy.training.example.Example.from_dict
File "spacy\training\example.pyx", line 24, in spacy.training.example.annotations_to_doc
File "spacy\training\example.pyx", line 388, in spacy.training.example._add_entities_to_doc
ValueError: [E973] Unexpected type for NER data```
I had the same problem when I migrated a code that I had from a 2.x version of spacy to a 3.x version since several things changed.
Also, in your case it looks like you have a mix of spacy 2.x and 3.x syntaxt. The next version of your code with a few changes work for me using spacy 3.2.1
import random
import spacy
from spacy.training import Example
def train_spacy(data, iterations=30):
TRAIN_DATA = data
# nlp = spacy.blank("en") # start with a blank model
nlp = spacy.load("en_core_web_lg")
if "ner" not in nlp.pipe_names:
ner = nlp.add_pipe("ner", last=True)
else:
ner = nlp.get_pipe("ner")
for _, annotations in TRAIN_DATA:
for ent in annotations.get("entities"):
ner.add_label(ent[2])
# other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
# with nlp.disable_pipes(*other_pipes):
losses = None
optimizer = nlp.create_optimizer()
for itn in range(iterations):
print("Starting iterations " + str(itn))
random.shuffle(TRAIN_DATA)
losses = {}
for text, annotations in TRAIN_DATA:
doc = nlp.make_doc(text)
print(isinstance(annotations["entities"], (list, tuple))) # this prints True
example = Example.from_dict(doc, annotations)
losses = nlp.update(
[example],
drop=0.2,
sgd=optimizer
)
print(losses)
return nlp
if __name__ == "__main__":
# mock training set
TRAIN_DATA = [('Dog is an animal', {'entities': [(0, 3, 'ANIMAL')]}),
('Cat is on the table', {'entities': [(0, 3, 'ANIMAL')]}),
('Rats are pets', {'entities': [(0, 4, 'ANIMAL')]})]
nlp = train_spacy(TRAIN_DATA)
Notice the following changes:
I changed your import of Example class to from spacy.training import Example. I think you were importing the wrong clase.
I'm using en_core_web_lg but with a blank model it should work too!
I commented other pipeline models disabling because in spacy 3.x pipeline is more complex and I think you can't disable the whole pipeline for NER task. How ever feel free to read official documentation and try if some of the other models are not needed.
Optimizer now is initialized using nlp.create_optimizer() instead of nlp.begin_training()
Note that annotations are already a dictionary in the expected format so you don't need to wrap it in a new dictionary: Example.from_dict(doc, annotations) should do the job.
Finally the loss now is returned as a result of model update instead of being passed as parameter.
I hope this help you and please ask questions if you need more help.
Best regards!
EDIT:
I also want to suggest some changes in your training script to take more advantage of spacy utils:
Use spacy.utilis.minibatch util to create mini batches from your training data.
Pass a whole minibacth of examples to update method instead of a minibatch of only one example.
Your code including this improve among other minor changes would looks as follos:
import random
import spacy
from spacy.training import Example
def train_spacy(data, iterations=30):
TRAIN_DATA = data
# nlp = spacy.blank("en") # start with a blank model
nlp = spacy.load("en_core_web_lg")
if "ner" not in nlp.pipe_names:
ner = nlp.add_pipe("ner", last=True)
else:
ner = nlp.get_pipe("ner")
for _, annotations in TRAIN_DATA:
for ent in annotations.get("entities"):
ner.add_label(ent[2])
# Init loss
losses = None
# Init and configure optimizer
optimizer = nlp.create_optimizer()
optimizer.learn_rate = 0.001 # Change to some lr you prefers
batch_size = 32 # Choose batch size you prefers
for itn in range(iterations):
print("Starting iterations " + str(itn))
random.shuffle(TRAIN_DATA)
losses = {}
# Batch the examples and iterate over them
for batch in spacy.util.minibatch(TRAIN_DATA, size=batch_size):
# Create Example instance for each training example in mini batch
examples = [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in batch]
# Update model with mini batch
losses = nlp.update(examples, drop=0.2, sgd=optimizer)
print(losses)
return nlp
if __name__ == "__main__":
# mock training set
TRAIN_DATA = [('Dog is an animal', {'entities': [(0, 3, 'ANIMAL')]}),
('Cat is on the table', {'entities': [(0, 3, 'ANIMAL')]}),
('Rats are pets', {'entities': [(0, 4, 'ANIMAL')]})]
nlp = train_spacy(TRAIN_DATA)
I am trying to get a GLCM implementation running in a custom Keras Layer in a reasonable fast time. So far I took the _glcm_loop from skimage-implementation, reduced it to what I needed and put it into a basic layer, like this:
import numpy as np
import tensorflow as tf
from time import time
from tensorflow import keras
from tensorflow.keras.preprocessing import image
from tensorflow.keras import layers
from skimage.feature import *
from numpy import array
from math import sin, cos
from time import time
import matplotlib.pyplot as plt
class GLCMLayer(keras.layers.Layer):
def __init__(self, greylevels=32, angles=[0], distances=[1], name=None, **kwargs):
self.greylevels = greylevels
self.angles = angles
self.distances = distances
super(GLCMLayer, self).__init__(name=name, **kwargs)
def _glcm_loop(self, image, distances, angles, levels, out):
rows = image.shape[0]
cols = image.shape[1]
for a_idx in range(len(angles)):
angle = angles[a_idx]
for d_idx in range(len(distances)):
distance = distances[d_idx]
offset_row = round(sin(angle) * distance)
offset_col = round(cos(angle) * distance)
start_row = max(0, -offset_row)
end_row = min(rows, rows - offset_row)
start_col = max(0, -offset_col)
end_col = min(cols, cols - offset_col)
for r in range(start_row, end_row):
for c in range(start_col, end_col):
i = image[r, c]
row = r + offset_row
col = c + offset_col
j = image[row, col]
out[i, j, d_idx, a_idx] += 1
def call(self, inputs):
P = np.zeros((self.greylevels, self.greylevels, len(self.distances), len(self.angles)), dtype=np.uint32, order='C')
self._glcm_loop(inputs, self.distances, self.angles, self.greylevels, P)
return P
def get_config(self):
config = {
'angle': self.angle,
'distance': self.distance,
'greylevel': self.greylevel,
}
base_config = super(GLCMLayer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
My execution code looks like this:
def quant(img, greylevels):
return array(img)//(256//greylevels)
if __name__ == "__main__":
source_file = "<some sour file>"
img_raw = image.load_img(source_file, target_size=(150,150), color_mode="grayscale")
img = quant(img_raw, 32)
layer = GLCMLayer()
start = time()
aug = layer(img)
tf.print(time()-start)
This is my first step to create it as a preprocessing layer. The second step then will be to modify it to run it also as hidden layer inside a model. That is why I didn't put it to a complete model yet, but I feel like there will be additional changes required when doing so.
For some reason the execution time is about 15-20 seconds long. Executing the code on the CPU without the layer takes about 0.0009 seconds. Obviously, something is going wrong here.
I am fairly new to tf and keras, so I fear I am missing something in the way on how to use the framework. In order to resolve it, I read about (which doesn't mean I understood):
do not use np-functions inside tensorflow, but tf-functions instead,
use tf.Variable,
use tf.Data,
unfolding is not possible in some way (whatever that means)
I tried a little here and there, but couldn't get them running, instead finding various different exceptions. So my questions are:
What is the correct way to use tf-functions in a GLCM to get the best performance on the GPU?
What do I need to take care of when using the layer in a complete model?
From that point on, I should hopefully be able to then implement the GLCM properties.
Any help is greatly appreciated.
(Disclaimer: I assume that there is a lot of other stuff not optimal yet, if anything comes to your mind just add it.)
SOLUTION BELOW:
Scenario:
I am trying to compute the jacobian of a user defined function many, many times in a loop. I am able to do this with TF 2's GradientTape as well as the older session based tf.gradients() method. The problem is that GradientTape is terribly slow (100x slower) than tf.gradients(). It has features i'd like to use (bath_jacobian, hessian support, etc), but if it's 100x slower then i can't use it.
The Question:
It's not clear to me if i'm simply misusing GradientTape, or if it will always be slower because it has to re-differentiate the provided function every time its called (my suspicion). I'm asking for tips to fix my use of GradientTape or a confirmation that it will always be fundamentally slower than tf.gradients by orders of magnitude.
Related Questions:
Repeated use of GradientTape for multiple Jacobian calculations - same scenario, unanswered
Does `GradientTape` need to re-differentiate each evaluation of a derivative? - same scenario, unanswered
using one GradientTape with global context - loosely related, having trouble applyng that solution to my scenario
Fully contained minimum example to compare GradientTape and tf.gradients():
import tensorflow as tf
from tensorflow.python.framework.ops import disable_eager_execution
import numpy as np
# from tensorflow.python.ops.parallel_for.gradients import jacobian, batch_jacobian
import timeit
class FunctionCaller(object):
def __init__(self, func, nX, dtype=tf.float64, useSessions=True):
if useSessions:
disable_eager_execution()
self.func = func
self.nX = nX
self.useSessions = useSessions
self.dtype = dtype
self.sess = tf.compat.v1.Session() if useSessions else None
if not useSessions:
return
#
# we are in session mode, so build the graph and take the batch-jacobian of the function's outputs
#
xTensor = tf.compat.v1.placeholder(dtype, shape=[None, nX])
# add function to graph and guarantee its output shape
func_tensor = tf.reshape(func(xTensor), [-1, nX])
# take the gradient for each output, one at a time, and stack the results back together
each_output = tf.unstack(func_tensor, nX, axis=1)
jac_x = tf.stack([tf.gradients(output, xTensor, unconnected_gradients='zero')[0]
for output in each_output], axis=1)
# record these tensors so we can use them later with session.run()
self.xTensor = xTensor
self.func_tensor = func_tensor
self.jac_func_tensor = jac_x
def jac(self, x_i):
if self.useSessions:
return self.sess.run(self.jac_func_tensor, {self.xTensor: x_i})
else:
return self._useGradientTape(x_i)
# THIS FUNCTION IS SUPER INEFFICIENT.
def _useGradientTape(self, x_i):
with tf.GradientTape(persistent=True) as g:
xTensor = tf.Variable(x_i, dtype=self.dtype) # is this my problem??? i recreate x every time?
y = tf.reshape(self.func(xTensor), [-1, self.nX])
jac_x_at_i = g.batch_jacobian(y, xTensor)
# del g
return jac_x_at_i.numpy()
def __del__(self):
if self.sess is not None:
self.sess.close()
def main():
#tf.function
def Xdot(x_i):
x_0, x_1, x_2 = tf.split(x_i, 3, axis=1)
return tf.concat([x_2 * tf.sin(x_2), x_2 * tf.cos(x_2), x_2], axis=1)
nT = 20
nX = 3
# create some trash data
x_i = np.arange(nT*nX).reshape([-1, nX])
nTrials = 100
# try the eager version first
caller_eager = FunctionCaller(Xdot, nX, useSessions=False)
start_time = timeit.default_timer()
for _ in range(nTrials):
jac_eager = caller_eager.jac(x_i)
elapsed = timeit.default_timer() - start_time
print("eager code took {} sec: {} sec/trial".format(elapsed, elapsed/nTrials))
# now try the sessions version
caller_sessions = FunctionCaller(Xdot, nX, useSessions=True)
start_time = timeit.default_timer()
caller_sessions.jac(x_i) # call it once to do its graph building stuff?
for _ in range(nTrials):
jac_session = caller_sessions.jac(x_i)
elapsed = timeit.default_timer() - start_time
print("session code took {} sec: {} sec/trial".format(elapsed, elapsed/nTrials))
residual = np.max(np.abs(jac_eager - jac_session))
print('residual between eager and session trials is {}'.format(residual))
if __name__ == "__main__":
main()
EDIT - SOLUTION:
xdurch0 pointed out below that I should wrap _useGradientTape() in a #tf.function - something I was unsuccessful with before for other reasons. Once I did that, I had to move xTensor's definition outside the #tf.function wrapper by making it a member variable and using tf.assign().
With all this done, I find that GradientTape (for this simple example) is now on the same order of magnitude as tf.gradints. When running enough trials (~1E5), it's twice as fast as tf.gradients. awesome!
import tensorflow as tf
from tensorflow.python.framework.ops import disable_eager_execution
import numpy as np
import timeit
class FunctionCaller(object):
def __init__(self, func, nT, nX, dtype=tf.float64, useSessions=True):
if useSessions:
disable_eager_execution()
self.func = func
self.nX = nX
self.useSessions = useSessions
self.dtype = dtype
self.sess = tf.compat.v1.Session() if useSessions else None
if not useSessions:
# you should be able to create without an initial value, but tf is demanding one
# despite what the docs say. bug?
# tf.Variable(initial_value=None, shape=[None, nX], validate_shape=False, dtype=self.dtype)
self.xTensor = tf.Variable([[0]*nX]*nT, dtype=self.dtype) # x needs to be properly sized once
return
#
# we are in session mode, so build the graph and take the batch-jacobian of the function's outputs
#
xTensor = tf.compat.v1.placeholder(dtype, shape=[None, nX])
# add function to graph and guarantee its output shape
func_tensor = tf.reshape(func(xTensor), [-1, nX])
# take the gradient for each output, one at a time, and stack the results back together
each_output = tf.unstack(func_tensor, nX, axis=1)
jac_x = tf.stack([tf.gradients(output, xTensor, unconnected_gradients='zero')[0]
for output in each_output], axis=1)
# record these tensors so we can use them later with session.run()
self.xTensor = xTensor
self.func_tensor = func_tensor
self.jac_func_tensor = jac_x
def jac(self, x_i):
if self.useSessions:
return self.sess.run(self.jac_func_tensor, {self.xTensor: x_i})
else:
return self._useGradientTape(x_i).numpy()
#tf.function # THIS IS CRUCIAL
def _useGradientTape(self, x_i):
with tf.GradientTape(persistent=True) as g:
self.xTensor.assign(x_i) # you need to create the variable once outside the graph
y = tf.reshape(self.func(self.xTensor), [-1, self.nX])
jac_x_at_i = g.batch_jacobian(y, self.xTensor)
# del g
return jac_x_at_i
def __del__(self):
if self.sess is not None:
self.sess.close()
def main():
#tf.function
def Xdot(x_i):
x_0, x_1, x_2 = tf.split(x_i, 3, axis=1)
return tf.concat([x_2 * tf.sin(x_2), x_2 * tf.cos(x_2), x_2], axis=1)
nT = 20
nX = 3
# create some trash data
x_i = np.random.random([nT, nX])
nTrials = 1000 # i find that nTrials<=1E3, eager is slower, it's faster for >=1E4, it's TWICE as fast for >=1E5
# try the eager version first
caller_eager = FunctionCaller(Xdot, nT, nX, useSessions=False)
start_time = timeit.default_timer()
for _ in range(nTrials):
jac_eager = caller_eager.jac(x_i)
elapsed = timeit.default_timer() - start_time
print("eager code took {} sec: {} sec/trial".format(elapsed, elapsed/nTrials))
# now try the sessions version
caller_sessions = FunctionCaller(Xdot, nT, nX, useSessions=True)
start_time = timeit.default_timer()
for _ in range(nTrials):
jac_session = caller_sessions.jac(x_i)
elapsed = timeit.default_timer() - start_time
print("session code took {} sec: {} sec/trial".format(elapsed, elapsed/nTrials))
residual = np.max(np.abs(jac_eager - jac_session))
print('residual between eager and session trials is {}'.format(residual))
if __name__ == "__main__":
main()
I have my data in multiple pickle files stored on disk. I want to use tensorflow's tf.data.Dataset to load my data into training pipeline. My code goes:
def _parse_file(path):
image, label = *load pickle file*
return image, label
paths = glob.glob('*.pkl')
print(len(paths))
dataset = tf.data.Dataset.from_tensor_slices(paths)
dataset = dataset.map(_parse_file)
iterator = dataset.make_one_shot_iterator()
Problem is I don't know how to implement the _parse_file fuction. The argument to this function, path, is of tensor type. I tried
def _parse_file(path):
with tf.Session() as s:
p = s.run(path)
image, label = pickle.load(open(p, 'rb'))
return image, label
and got error message:
InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'arg0' with dtype string
[[Node: arg0 = Placeholder[dtype=DT_STRING, shape=<unknown>, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
After some search on the Internet I still have no idea how to do it. I will be grateful to anyone providing me a hint.
I have solved this myself. I should use tf.py_func as in this doc.
This is how I solved this issue. I didn't use the tf.py_func; check out function "load_encoding()" below, which is what's doing the pickle reading. The FACELIB_DIR contains directories of pickled vggface2 encodings, each directory named for the person of those face encodings.
import tensorflow as tf
import pickle
import os
FACELIB_DIR='/var/noggin/FaceEncodings'
# Get list of all classes & build a quick int-lookup dictionary
labelNames = sorted([x for x in os.listdir(FACELIB_DIR) if os.path.isdir(os.path.join(FACELIB_DIR,x)) and not x.startswith('.')])
labelStrToInt = dict([(x,i) for i,x in enumerate(labelNames)])
# Function load_encoding - Loads Encoding data from enc2048 file in filepath
# This reads an encoding from disk, and through the file path gets the label oneHot value, returns both
def load_encoding(file_path):
with open(os.path.join(FACELIB_DIR,file_path),'rb') as fin:
A,_ = pickle.loads(fin.read()) # encodings, source_image_name
label_str = tf.strings.split(file_path, os.path.sep)[-2]
return (A, labelStrToInt[label_str])
# Build the dataset of every enc2048 file in our data library
encpaths = []
for D in sorted([x for x in os.listdir(FACELIB_DIR) if os.path.isdir(os.path.join(FACELIB_DIR,x)) and not x.startswith('.')]):
# All the encoding files
encfiles = sorted(filter((lambda x: x.endswith('.enc2048')), os.listdir(os.path.join(FACELIB_DIR, D))))
encpaths += [os.path.join(D,x) for x in encfiles]
dataset = tf.data.Dataset.from_tensor_slices(encpaths)
# Shuffle and speed improvements on the dataset
BATCH_SIZE = 64
from tensorflow.data import AUTOTUNE
dataset = (dataset
.shuffle(1024)
.cache()
.repeat()
.batch(BATCH_SIZE)
.prefetch(AUTOTUNE)
)
# Benchmark our tf.data pipeline
import time
datasetGen = iter(dataset)
NUM_STEPS = 10000
start_time = time.time()
for i in range(0, NUM_STEPS):
X = next(datasetGen)
totalTime = time.time() - start_time
print('==> tf.data generated {} tensors in {:.2f} seconds'.format(BATCH_SIZE * NUM_STEPS, totalTime))
tf.py_func
This function is used to solved that problem and also as menstion in doc.
i would like to ask a question about a problem that i have for the last couple days.
First of all i am a beginner in machine learning and this is my first time using the XGBoost algorithm so excuse me for any mistakes I have done.
I trained my model to predict whether a log file is malicious or not. After i save and reload my model on a different session i use the predict function which seems to be working normally ( with a few deviations in probabilities but that is another topic, I know I, have seen it in another topic )
The problem is this: Sometimes when i try to predict a "small" csv file after load it seems to be broken predicting only the Zero label, even for indexes that are categorized correct previously.
For example, i load a dataset containing 20.000 values , the predict() is working. I keep only the first 5 of these values using pandas drop, again its working. If i save the 5 values on a different csv and reload it its not working. The same error happens if i just remove by hand all indexes (19.995) and save file only with 5 remaining.
I would bet it is a size of file problem but when i drop the indexes on the dataframe through pandas it seems to be working
Also the number 5 ( of indexes ) is for example purpose the same happens if I delete a large portion of the dataset.
I first came up with this problem after trying to verify by hand some completely new logs, which seem to be classified correctly if thrown into the big csv file but not in a new file on their own.
Here is my load and predict code
##IMPORTS
import os
import pandas as pd
from pandas.compat import StringIO
from datetime import datetime
from langid.langid import LanguageIdentifier, model
import langid
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.externals import joblib
from ggplot import ggplot, aes, geom_line
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import average_precision_score
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from collections import defaultdict
import pickle
df = pd.read_csv('big_test.csv')
df3 = pd.read_csv('small_test.csv')
#This one is necessary for the loaded_model
class ColumnSelector(BaseEstimator, TransformerMixin):
def init(self, column_list):
self.column_list = column_list
def fit(self, x, y=None):
return self
def transform(self, x):
if len(self.column_list) == 1:
return x[self.column_list[0]].values
else:
return x[self.column_list].to_dict(orient='records')
loaded_model = joblib.load('finalized_model.sav')
result = loaded_model.predict(df)
print(result)
df2=df[:5]
result2 = loaded_model.predict(df2)
print(result2)
result3 = loaded_model.predict(df3)
print(result3)
The results i get are these:
[1 0 1 ... 0 0 0]
[1 0 1 0 1]
[0 0 0 0 0]
I can provide any code even from training or my dataset if necessary.
*EDIT: I use a pipeline for my data. I tried to reproduce the error after using xgb to fit the iris data and i could not. Maybe there is something wrong with my pipeline? the code is below :
df = pd.read_csv('big_test.csv')
# df.info()
# Split Dataset
attributes = ['uri','code','r_size','DT_sec','Method','http_version','PenTool','has_referer', 'Lang','LangProb','GibberFlag' ]
x_train, x_test, y_train, y_test = train_test_split(df[attributes], df['Scan'], test_size=0.2,
stratify=df['Scan'], random_state=0)
x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train, test_size=0.2,
stratify=y_train, random_state=0)
# print('Train:', len(y_train), 'Dev:', len(y_dev), 'Test:', len(y_test))
# set up graph function
def plot_precision_recall_curve(y_true, y_pred_scores):
precision, recall, thresholds = precision_recall_curve(y_true, y_pred_scores)
return ggplot(aes(x='recall', y='precision'),
data=pd.DataFrame({"precision": precision, "recall": recall})) + geom_line()
# XGBClassifier
class ColumnSelector(BaseEstimator, TransformerMixin):
def __init__(self, column_list):
self.column_list = column_list
def fit(self, x, y=None):
return self
def transform(self, x):
if len(self.column_list) == 1:
return x[self.column_list[0]].values
else:
return x[self.column_list].to_dict(orient='records')
count_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 2), min_df=10)
dict_vectorizer = DictVectorizer()
xgb = XGBClassifier(seed=0)
pipeline = Pipeline([
("feature_union", FeatureUnion([
('text_features', Pipeline([
('selector', ColumnSelector(['uri'])),
('count_vectorizer', count_vectorizer)
])),
('categorical_features', Pipeline([
('selector', ColumnSelector(['code','r_size','DT_sec','Method','http_version','PenTool','has_referer', 'Lang','LangProb','GibberFlag' ])),
('dict_vectorizer', dict_vectorizer)
]))
])),
('xgb', xgb)
])
pipeline.fit(x_train, y_train)
filename = 'finalized_model.sav'
joblib.dump(pipeline, filename)
Thats due to different dtypes in big and small file.
When you do:
df = pd.read_csv('big_test.csv')
The dtypes are these:
print(df.dtypes)
# Output
uri object
code object # <== Observe this
r_size object # <== Observe this
Scan int64
...
...
...
Now when you do:
df3 = pd.read_csv('small_test.csv')
the dtypes are changed:
print(df3.dtypes)
# Output
uri object
code int64 # <== Now this has changed
r_size int64 # <== Now this has changed
Scan int64
...
...
You see, pandas will try to determine the dtypes of the columns by itself. When you load the big_test.csv, there are some values in code and r_size column which are of string types, due to this whole column dtype is changed to string, which is not done in small_test.csv.
Now due to this change, the dictVectorizer encodes the data in a different way than before and the features are changed, and hence the results are also changed.
If you do this:
df3[['code', 'r_size']] = df3[['code', 'r_size']].astype(str)
and then call the predict(), the results are same again.