Related
I trained a custom spaCy NER model following https://towardsdatascience.com/train-ner-with-custom-training-data-using-spacy-525ce748fab7 and https://spacy.io/usage/processing-pipelines, using a small sample dataset, so that it finds currency codes in a given text.
Example dataset:
TRAIN_DATA = [('This is AFN currency', {'entities': [(8, 11, 'CUR')]}),
('I have EUR european currency', {'entities': [(7, 10, 'CUR')]}),
('let as have ALL money', {'entities': [(12, 15, 'CUR')]}),
('DZD is a dollar', {'entities': [(0, 3, 'CUR')]}),
('money USD united states', {'entities': [(6, 9, 'CUR')]})
]
I trained the model successfully and named it 'currency'. It predicts the trained dataset well with the proper label, but on untrained text it mostly predicts the wrong label.
Input test line: 'I have AZWSQTS lot LOT of Indian MZW currency USD INR'
output:
AZWSQTS - CUR , LOT - CUR, MZW - CUR, USD - CUR, INR - CUR
Here, 'AZWSQTS' and 'LOT' are not currencies, but the model still labels them as CUR. This is the problem I am getting.
Complete code:
from __future__ import unicode_literals, print_function
import random
from pathlib import Path
import spacy
from tqdm import tqdm
from spacy.training import Example
def spacy_train_model():
    '''Sample training dataset format'''
    '''list of currency'''
    currency_list = ['AFN', 'EUR', 'EUR', 'ALL', 'DZD', 'USD', 'EUR', 'AOA', 'XCD', 'XCD', 'ARS',
        'AMD', 'AWG', 'SHP', 'AUD', 'EUR', 'AZN', '', 'BSD', 'BHD', 'BDT', 'BBD', 'BYN', 'EUR', 'BZD',
        'XOF', 'BMD', 'BTN', 'BOB', 'USD', 'BAM', 'BWP', 'BRL', 'USD', 'USD', 'BND', 'BGN', 'XOF', 'BIF',
        'CVE', 'KHR', 'XAF', 'CAD', 'USD', 'KYD', 'XAF', 'XAF', 'NZD', 'CLP', 'CNY', 'AUD', 'AUD', 'COP',
        'KMF', 'CDF', 'XAF', 'none', 'CRC', 'XOF', 'HRK', 'CUP', 'ANG', 'EUR', 'CZK', '', 'DKK', 'DJF',
        'XCD', 'DOP', '', 'USD', 'EGP', 'USD', 'XAF', 'ERN', 'EUR', 'SZL', 'ETB', '', 'FKP', 'FJD',
        'EUR', 'EUR', 'EUR', 'XPF', '', 'XAF', 'GMD', 'GEL', 'EUR', 'GHS', 'GIP', 'EUR', 'DKK', 'XCD',
        'EUR', 'USD', 'GTQ', 'GGP', 'GNF', 'XOF', 'GYD', '', 'HTG', 'HNL', 'HKD', 'HUF', 'ISK', 'INR',
        'IDR', 'XDR', 'IRR', 'IQD', 'EUR', 'IMP', 'ILS', 'EUR', '', 'JMD', 'JPY', 'JEP', 'JOD',
        'KZT', 'KES', 'AUD', 'EUR', 'KWD', 'KGS', '', 'LAK', 'EUR', 'LBP', 'LSL', 'LRD', 'LYD', 'CHF',
        'EUR', 'EUR', '', 'MOP', 'MGA', 'MWK', 'MYR', 'MVR', 'XOF', 'EUR', 'USD', 'EUR', 'MRU', 'MUR',
        'EUR', 'MXN', 'USD', 'MDL', 'EUR', 'MNT', 'EUR', 'XCD', 'MAD', 'MZN', 'MMK', '', 'NAD', 'AUD',
        'NPR', 'EUR', 'XPF', 'NZD', 'NIO', 'XOF', 'NGN', 'NZD', 'AUD', 'USD', 'KPW', 'MKD', 'NOK',
        'OMR', 'PKR', 'USD', 'ILS', 'USD', 'PGK', 'PYG', 'PEN', 'PHP', 'NZD', 'PLN', 'EUR', 'USD', 'QAR',
        'EUR', 'RON', 'RUB', 'RWF', '', 'USD', 'EUR', 'SHP', 'XCD', 'XCD', 'EUR', 'EUR', 'XCD', 'WST',
        'EUR', 'STN', 'SAR', 'XOF', 'RSD', 'SCR', 'SLL', 'SGD', 'USD', 'ANG', 'EUR', 'EUR', 'SBD', 'SOS',
        'ZAR', 'GBP', 'KRW', 'SSP', 'EUR', 'LKR', 'SDG', 'SRD', 'NOK', 'SEK', 'CHF', 'SYP', '', 'TWD',
        'TJS', 'TZS', 'THB', 'USD', 'XOF', 'NZD', 'TOP', 'TTD', 'GBP', 'TND', 'TRY', 'TMT', 'USD', 'AUD',
        'UGX', 'UAH', 'AED', 'GBP', 'USD', 'UYU', 'USD', 'UZS', '', 'VUV', 'EUR', 'VES', 'VND', '',
        'USD', 'XPF', 'YER', 'ZMW', 'USD']
    TRAIN_DATA = [('This is AFN currency', {'entities': [(8, 11, 'CUR')]}),
                  ('I have EUR european currency', {'entities': [(7, 10, 'CUR')]}),
                  ('let as have ALL money', {'entities': [(12, 15, 'CUR')]}),
                  ('DZD is a dollar', {'entities': [(0, 3, 'CUR')]}),
                  ('money USD united states', {'entities': [(6, 9, 'CUR')]})
                  ]
    # model = "en_core_web_lg"
    model = None
    output_dir = Path(r"D:\currency")  # Path to save trained model - create new empty directory
    n_iter = 100
    # load the model
    if model is not None:
        nlp = spacy.load(model)
        optimise = nlp.create_optimizer()
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')
        optimise = nlp.begin_training()
        print("Created blank 'en' model")
    # set up the pipeline
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner', last=True)
    else:
        ner = nlp.get_pipe('ner')
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.initialize()
        # optimizer = optimise
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update(
                    [example],
                    drop=0.5,
                    sgd=optimizer,
                    losses=losses)
            print(losses)
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

def test_model(text):
    nlp = spacy.load(r'D:\currency')
    for tex in text.split('\n'):
        doc = nlp(tex)
        for token in doc.ents:
            print(token.text, token.label_)

spacy_train_model()  # Training the model
test_model('text')   # Testing the model
Couple of thoughts here...
You can't train a model with only five examples. Maybe this is just example code and you have more, but you generally need hundreds of examples.
If you only need to recognize currency names like USD or GBP, use spaCy's rule-based matchers. You would only need an NER model if these are ambiguous somehow. Like if ALL is a currency, but you don't want to recognize it in "I ate ALL the donuts", an NER model can help, but that's a pretty hard distinction to learn, so you'll need hundreds of examples.
What is probably happening in your example problem is that the NER model has learned that any all capital token is a currency. If you want to fix that with an NER model, you'll need to give it examples where an all capital token isn't currency to learn from.
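For the rule-based route, here is a minimal sketch (assuming spaCy v3 and the currency code list from your question) using an EntityRuler:
import spacy

# Build a lightweight pipeline that tags known currency codes as CUR via exact matches.
nlp = spacy.blank('en')
ruler = nlp.add_pipe('entity_ruler')
currency_codes = ['AFN', 'EUR', 'ALL', 'DZD', 'USD', 'INR']  # extend with the full ISO list
ruler.add_patterns([{'label': 'CUR', 'pattern': code} for code in currency_codes])

doc = nlp('I have AZWSQTS lot LOT of Indian MZW currency USD INR')
print([(ent.text, ent.label_) for ent in doc.ents])  # only USD and INR are tagged
Unlike the statistical model, this will never fire on unseen tokens like AZWSQTS, though it also can't disambiguate cases like ALL appearing in ordinary prose.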
I'm trying to populate a dictionary of dictionaries from a Pandas data frame in Python, by iterating through the nested dictionary and filling the values of each sub-dictionary with the entries of one row of the data frame.
Although there are as many sub-dictionaries as there are rows in the data frame, all the sub-dictionaries end up populated with the data from the last row of the data frame, instead of each one getting its own row.
Here is a toy reproducible example.
import pandas as pd
# initialize an empty df
data = pd.DataFrame()
# populate data frame with entries
data['name'] = ['Joe Smith', 'Mary James', 'Charles Williams']
data['school'] = ["Jollywood Secondary", "Northgate Sixth From", "Brompton High"]
data['subjects'] = [['Maths', 'Art', 'Biology'], ['English', 'French', 'History'], ['Chemistry', 'Biology', 'English']]
# use dictionary comprehensions to set up main dictionary and sub-dictionary templates
# sub-dictionary
keys = ['name', 'school', 'subjects']
record = {key: None for key in keys}
# main dictionary
keys2 = ['cand1', 'cand2', 'cand3']
candidates = {key: record for key in keys2}
# as a result i get something like this
# {'cand1': {'name': None, 'school': None, 'subjects': None},
# 'cand2': {'name': None, 'school': None, 'subjects': None},
# 'cand3': {'name': None, 'school': None, 'subjects': None}}
# iterate through main dictionary and populate each sub-dict with row of df
for i, d in enumerate(candidates.items()):
d[1]['name'] = data['name'].iloc[i]
d[1]['school'] = data['school'].iloc[i]
d[1]['subjcts'] = data['subjects'].iloc[i]
# what i end up with is the last row entry in each sub-dictionary
#{'cand1': {'name': 'Charles Williams',
# 'school': 'Brompton High',
# 'subjects': None,
# 'subjcts': ['Chemistry', 'Biology', 'English']},
# 'cand2': {'name': 'Charles Williams',
# 'school': 'Brompton High',
# 'subjects': None,
# 'subjcts': ['Chemistry', 'Biology', 'English']},
# 'cand3': {'name': 'Charles Williams',
# 'school': 'Brompton High',
# 'subjects': None,
# 'subjcts': ['Chemistry', 'Biology', 'English']}}
How do I need to modify my code to get each dictionary populated with a different row from my data frame?
I did not work through your code to look for the bug, because the solution is a one-liner with the method to_dict.
Here is a minimal working example with your sample data.
import pandas as pd
# initialize an empty df
data = pd.DataFrame()
# populate data frame with entries
data['name'] = ['Joe Smith', 'Mary James', 'Charles Williams']
data['school'] = ["Jollywood Secondary", "Northgate Sixth From", "Brompton High"]
data['subjects'] = [['Maths', 'Art', 'Biology'], ['English', 'French', 'History'], ['Chemistry', 'Biology', 'English']]
# redefine index to match your keys
data.index = ['cand{}'.format(i) for i in range(1,len(data)+1)]
# convert to dict
data_dict = data.to_dict(orient='index')
print(data_dict)
The output will look something like this:
{'cand1': {
'name': 'Joe Smith',
'school': 'Jollywood Secondary',
'subjects': ['Maths', 'Art', 'Biology']},
'cand2': {
'name': 'Mary James',
'school': 'Northgate Sixth From',
'subjects': ['English', 'French', 'History']},
'cand3': {
'name': 'Charles Williams',
'school': 'Brompton High',
'subjects': ['Chemistry', 'Biology', 'English']}}
Consider avoiding the roundabout way of building the dictionary, as Pandas provides methods such as to_dict and to_json to render nested structures. Specifically, consider adding a new column, cand, and setting it as the index for the to_dict output:
data['cand'] = 'cand' + pd.Series((data.index.astype('int') + 1).astype('str'))
mydict = data.set_index('cand').to_dict(orient='index')
print(mydict)
{'cand1': {'name': 'Joe Smith', 'school': 'Jollywood Secondary',
'subjects': ['Maths', 'Art', 'Biology']},
'cand2': {'name': 'Mary James', 'school': 'Northgate Sixth From',
'subjects': ['English', 'French', 'History']},
'cand3': {'name': 'Charles Williams', 'school': 'Brompton High',
'subjects': ['Chemistry', 'Biology', 'English']}}
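As a side note on why the original loop misbehaved: the comprehension candidates = {key: record for key in keys2} stores the same record object under every key, so each iteration overwrites one shared sub-dictionary. If you do want to keep the loop, a minimal sketch of a fix is to give every key its own copy:
# Give each candidate an independent copy of the template dictionary
candidates = {key: dict(record) for key in keys2}

for i, key in enumerate(candidates):
    candidates[key]['name'] = data['name'].iloc[i]
    candidates[key]['school'] = data['school'].iloc[i]
    candidates[key]['subjects'] = data['subjects'].iloc[i]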
I want to use Lucid to analyze the feature extraction of a detection model I trained using the tensorflow Object Detection API on my own dataset. The model used is one from the Tensorflow Object Detection Zoo, namely faster_rcnn_resnet101.
I followed the Lucid tutorial to import my own model and saved a frozen graph of the model with the node /all_class_predictions_with_background as output_node.
I'm having trouble finding the input node of the graph to make Lucid run on it.
Furthermore, I'm not sure I have the right approach. Maybe I should first extract the classification part of the detection model and freeze a new graph with only that part before going to Lucid.
Or maybe I should just import a resnet_101 classification model and copy the correct weights from the detection model onto it?
But I don't really know how to do those kinds of things.
Can someone help me? I really want to try running Lucid on my detection network.
Yes, you should export an inference (frozen) graph to work with in Lucid.
I use the following script to export a graph from the training checkpoint files.
Useful information about the nodes in the exported file is logged to the console.
training_model="ssd_mnet_v2_ppn_512x288.config"
model_signature="eb_13_v09_ppmn2_13_256_adam_512x288_tf_1.14_200k"
# the specific checkpoint to export from
checkpoint_path="/TRAIN/models/model/train/model.ckpt-200000"
# directory to export into
output_path="/XYZ/graphs/${model_signature}"
# ensure these graph nodes are exported, and everything in between
additional_output_tensor_names="Preprocessor/sub,concat_1"
#
python export_inference_graph.py \
--input_type=image_tensor \
--pipeline_config_path /TRAIN/models/model/$training_model \
--trained_checkpoint_prefix=$checkpoint_path \
--output_directory=$output_path \
--additional_output_tensor_names=$additional_output_tensor_names
I found it convenient to make my own Lucid Model class, after reviewing the examples in the Lucid model zoo.
You have to examine your graph carefully as you need to specify the input node, and provide a list of layers that Lucid can work with.
from lucid.modelzoo.vision_base import Model, _layers_from_list_of_dicts

# the input node "Preprocessor/sub" is appropriate for image injection
class SSD_Mnet2_PPN(Model):
    def __init__(self, image_shape=None, graph_path=None, labels_path=None):
        self.model_path = graph_path
        self.labels_path = labels_path
        self.image_shape = image_shape
        self.image_value_range = (-1, 1)
        self.input_name = "Preprocessor/sub"
        super().__init__()

# a hand-crafted list of layers - by inspection of the graph
SSD_Mnet2_PPN.layers = _layers_from_list_of_dicts(SSD_Mnet2_PPN, [
    {'id': 0, 'tags': ['conv'], 'name': 'FeatureExtractor/MobilenetV2/expanded_conv_2/add', 'depth': 24, 'shape': [1, 72, 128, 24], 'transform_id': 2},
    {'id': 2, 'tags': ['conv'], 'name': 'FeatureExtractor/MobilenetV2/expanded_conv_5/add', 'depth': 32, 'shape': [1, 36, 64, 32], 'transform_id': 2},
    {'id': 5, 'tags': ['conv'], 'name': 'FeatureExtractor/MobilenetV2/expanded_conv_9/add', 'depth': 64, 'shape': [1, 18, 32, 64], 'transform_id': 2},
    {'id': 7, 'tags': ['conv'], 'name': 'FeatureExtractor/MobilenetV2/expanded_conv_12/add', 'depth': 96, 'shape': [1, 18, 32, 96], 'transform_id': 2},
    {'id': 9, 'tags': ['conv'], 'name': 'FeatureExtractor/MobilenetV2/expanded_conv_15/add', 'depth': 160, 'shape': [1, 9, 16, 160], 'transform_id': 2},
    {'id': 11, 'tags': ['concat'], 'name': 'concat_1', 'depth': 13, 'shape': [1, 1212, 13], 'transform_id': 4},
])

def model_for_version(version=None, path=None):
    if "320x180" in version:
        return SSD_Mnet2_PPN(graph_path=path, image_shape=[320, 180, 3])
    if "480x270" in version:
        return SSD_Mnet2_PPN(graph_path=path, image_shape=[480, 270, 3])
    if "512x288" in version:
        return SSD_Mnet2_PPN(graph_path=path, image_shape=[512, 288, 3])
    if "720x405" in version:
        return SSD_Mnet2_PPN(graph_path=path, image_shape=[720, 405, 3])
    raise ValueError("No model for graph_version: {}".format(version))
Then you can write code as follows:
from lucid.optvis import render
model = model_for_version(
version = "eb_13_v09_ppmn2_13_256_adam_512x288_tf_1.14",
path = "/XYZ/graphs/eb_13_v09_ppmn2_13_256_adam_512x288_tf_1.14_200k/frozen_inference_graph.pb"
)
model.load_graphdef()
_ = render.render_vis( model, "FeatureExtractor/MobilenetV2/expanded_conv_15/add:17", thresholds=( 32, 256, 1024 ) )
Inevitably, one has to experiment quite a bit.
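As a starting point for that inspection, here is a minimal sketch (TF 1.x API, reusing the frozen graph path from above) that prints candidate node names so you can pick the input node and the layers to list:
import tensorflow as tf

# Sketch: list operations in a frozen graph to find input / feature nodes
# (e.g. "Preprocessor/sub" or the FeatureExtractor layers referenced above).
graph_def = tf.GraphDef()
with tf.gfile.GFile('/XYZ/graphs/eb_13_v09_ppmn2_13_256_adam_512x288_tf_1.14_200k/frozen_inference_graph.pb', 'rb') as f:
    graph_def.ParseFromString(f.read())
for node in graph_def.node:
    if 'Preprocessor' in node.name or 'FeatureExtractor' in node.name:
        print(node.name, node.op)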
I'm using Keras 2.2.0 and am trying to do something like the following:
import keras.backend as K
import tensorflow as tf

K.clear_session()
sess = tf.Session()
K.set_session(sess)
...
with K.get_session() as sess:
However, I get errors saying AttributeError: 'module' object has no attribute 'clear_session'. So it seems this functionality is no longer in keras.backend?
For instance, if I do dir(keras.backend), I get:
['Function', 'NAME_SCOPE_STACK', 'Print', 'RandomStreams', 'T', 'T_softsign', '_BACKEND', '__builtins__', '__doc__', '__file__', '__name__', '__package__', '__path__', '_backend', '_config', '_config_path', '_epsilon', '_floatx', '_image_data_format', '_keras_base_dir', '_keras_dir', 'abs', 'absolute_import', 'all', 'any', 'arange', 'argmax', 'argmin', 'backend', 'batch_dot', 'batch_flatten', 'batch_get_value', 'batch_normalization', 'batch_set_value', 'bias_add', 'binary_crossentropy', 'cast', 'cast_to_floatx', 'categorical_crossentropy', 'clip', 'common', 'concatenate', 'constant', 'contextmanager', 'conv1d', 'conv2d', 'conv2d_transpose', 'conv3d', 'conv3d_transpose', 'cos', 'count_params', 'ctc_batch_cost', 'ctc_cost', 'ctc_create_skip_idxs', 'ctc_interleave_blanks', 'ctc_path_probs', 'ctc_update_log_p', 'cumprod', 'cumsum', 'defaultdict', 'depthwise_conv2d', 'division', 'dot', 'dropout', 'dtype', 'elu', 'epsilon', 'equal', 'eval', 'exp', 'expand_dims', 'eye', 'f', 'flatten', 'floatx', 'foldl', 'foldr', 'function', 'gather', 'get_uid', 'get_value', 'get_variable_shape', 'gradients', 'greater', 'greater_equal', 'hard_sigmoid', 'has_arg', 'identity', 'ifelse', 'image_data_format', 'image_dim_ordering', 'importlib', 'in_test_phase', 'in_top_k', 'in_train_phase', 'int_shape', 'is_keras_tensor', 'is_placeholder', 'is_sparse', 'is_tensor', 'json', 'l2_normalize', 'learning_phase', 'less', 'less_equal', 'local_conv1d', 'local_conv2d', 'log', 'logsumexp', 'map_fn', 'max', 'maximum', 'mean', 'min', 'minimum', 'moving_average_update', 'name_scope', 'ndim', 'normalize_batch_in_training', 'not_equal', 'np', 'one_hot', 'ones', 'ones_like', 'os', 'pattern_broadcast', 'permute_dimensions', 'placeholder', 'pool', 'pool2d', 'pool3d', 'pow', 'print_function', 'print_tensor', 'prod', 'py_all', 'py_any', 'py_slice', 'py_sum', 'random_binomial', 'random_normal', 'random_normal_variable', 'random_uniform', 'random_uniform_variable', 'relu', 'repeat', 'repeat_elements', 'reset_uids', 'reshape', 'resize_images', 'resize_volumes', 'reverse', 'rnn', 'round', 'separable_conv1d', 'separable_conv2d', 'set_epsilon', 'set_floatx', 'set_image_data_format', 'set_image_dim_ordering', 'set_learning_phase', 'set_value', 'shape', 'sigmoid', 'sign', 'sin', 'slice', 'softmax', 'softplus', 'softsign', 'sparse_categorical_crossentropy', 'spatial_2d_padding', 'spatial_3d_padding', 'sqrt', 'square', 'squeeze', 'stack', 'std', 'stop_gradient', 'sum', 'switch', 'sys', 'tanh', 'temporal_padding', 'th_sparse_module', 'theano', 'theano_backend', 'tile', 'to_dense', 'transpose', 'truncated_normal', 'update', 'update_add', 'update_sub', 'var', 'variable', 'zeros', 'zeros_like']
and I don't see any of those three functions in there.
How should I be writing this code in modern Keras?
Thanks!
EDIT: https://github.com/keras-team/keras/issues/11015
Seems like it is not available and I may have to downgrade.
It might be that your backend is set to Theano (I believe clear_session is only available with the TensorFlow backend in Keras); the theano and theano_backend entries in your dir(keras.backend) output point in that direction. Change the backend setting in your keras.json to TensorFlow and clear_session should be available to you.
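For reference, a minimal sketch of that change (assuming the standard config location ~/.keras/keras.json; adjust the path if your setup differs):
import json
import os

# Sketch: check which backend Keras is configured with and switch it to TensorFlow.
config_path = os.path.expanduser('~/.keras/keras.json')
with open(config_path) as f:
    cfg = json.load(f)
print('current backend:', cfg.get('backend'))

cfg['backend'] = 'tensorflow'
with open(config_path, 'w') as f:
    json.dump(cfg, f, indent=4)
# Restart the Python process afterwards so Keras re-reads the config.
You can of course also edit the file by hand; the key that matters is "backend": "tensorflow".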
I'm trying to get to grips with scikit-learn for some simple machine learning projects, but I'm coming unstuck with Pipelines and wonder what I've done wrong...
I'm trying to work through a tutorial on Kaggle.
Here's my code:
import pandas as pd
train = pd.read_csv(local path to training data)
train_labels = pd.read_csv(local path to labels)
from numpy import arange
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV
pca = PCA()
clf = LinearSVC()
n_components = arange(1, 39)
loss =['l1','l2']
penalty =['l1','l2']
C = arange(0, 1, .1)
whiten = [True, False]
from sklearn.pipeline import Pipeline
#set up pipeline
pipe = Pipeline(steps=[('pca', pca), ('clf', clf)])
#set up GridsearchCV
estimator = GridSearchCV(pipe, dict(pca__n_components = n_components, pca__whiten = whiten,
clf__loss = loss, clf__penalty = penalty, clf__C = C))
estimator
Returns:
GridSearchCV(cv=None,
estimator=Pipeline(steps=[('pca', PCA(copy=True, n_components=None, whiten=False)), ('clf', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
random_state=None, tol=0.0001, verbose=0))]),
fit_params={}, iid=True, loss_func=None, n_jobs=1,
param_grid={'clf__penalty': ['l1', 'l2'], 'clf__loss': ['l1', 'l2'], 'clf__C': array([ 0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]), 'pca__n_components': array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
35, 36, 37, 38]), 'pca__whiten': [True, False]},
pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
verbose=0)
But when I try to train data:
estimator.fit(train, train_labels)
The error is:
428 for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)):
429 for label, (_, test_split) in zip(unique_labels, per_label_splits):
--> 430 label_test_folds = test_folds[y == label]
431 # the test split can be too big because we used
432 # KFold(max(c, self.n_folds), self.n_folds) instead of
IndexError: too many indices for array
Can anyone point me in the right direction?
It turns out that the Pandas data frame is the wrong shape: the cross-validation code expects the labels as a 1-D array rather than a data frame.
estimator.fit(train.values, train_labels[0].values)
works, although I also had to drop the penalty term.
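For anyone hitting the same IndexError: the stratified cross-validation that GridSearchCV applies to classifiers needs a 1-D label array. A quick way to check and fix the shapes, reusing train, train_labels and estimator from the question above:
import numpy as np

# The labels come in as a 2-D data frame; flatten them to a 1-D array for CV.
print(train_labels.shape)            # e.g. (n_samples, 1)
y = np.ravel(train_labels.values)    # shape (n_samples,)
estimator.fit(train.values, y)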