How to add multiple custom entity recognition patterns using spaCy 3.x - spacy

Here is input.json:
[
"How to preorder the iPhone X",
"iPhone X is coming",
"Should I pay $1,000 for the iPhone X?",
"The iPhone 8 reviews are here",
"iPhone 11 vs iPhone 8: What's the difference?",
"I need a new phone! Any tips?",
"Why the iPhone 21",
"iPhone 9 is coming",
"Should I pay $1,000 for the X iPhone?",
"The iPhone 29 reviews are here",
"iPhone 999 vs iPhone 8: What's the difference?",
"I need a new phone! Any tips?",
"Are we talking about phones"
]
Question: why is "coming" not showing in the output? It only shows if I remove the matcher.add call for GADGET; otherwise it does not.
The final question is: how can we add multiple patterns in spaCy?
All the details are given below.
import json
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span, DocBin
import random
with open("input.json", encoding="utf8") as f:
TEXTS = json.loads(f.read())
nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# Add patterns to the matcher
pattern1 = ([{"LOWER": "iphone"}, {"LOWER": "x"}])
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]
matcher.add("GADGET", [pattern1, pattern2])
pattern3 = [{"LOWER": "coming"}]
matcher.add("COME", [pattern3])
docs = []
for doc in nlp.pipe(TEXTS):
    matches = matcher(doc)
    spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]
    print(spans)
    doc.ents = spans
    print(f'spans is: {spans} with match: {matches}')
    # print([(ent.text, ent.label_) for ent in doc.ents])
    docs.append(doc)
print(docs)
random.shuffle(docs)
train_docs = docs[:len(docs) // 2]
dev_docs = docs[len(docs) // 2:]
# Create and save a collection of training docs
train_docbin = DocBin(docs=train_docs)
train_docbin.to_disk("./train.spacy")
# Create and save a collection of evaluation docs
dev_docbin = DocBin(docs=dev_docs)
dev_docbin.to_disk("./dev.spacy")
#the below given code is triggered for config and training
# For config
#python -m spacy init config ./config.cfg --lang en --pipeline ner
# For training
# python -m spacy train ./config.cfg --output ./output --paths.train train.spacy --paths.dev dev.spacy
# Question: why is "coming" not showing in the output? It only shows if I remove the
# matcher.add call for GADGET; otherwise it does not.
# The final question is: how can we add multiple patterns in spaCy?
nlp = spacy.load("output/model-best")
doc = nlp("iPhone 55 vs iPhone 8: What's the difference? coming")
print(doc.ents)
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)
End of code.
The complete details have been added above.
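For reference, a single Matcher can hold several rules, each with several patterns, and spacy.util.filter_spans is commonly used to drop overlapping spans before assigning doc.ents. Below is a minimal sketch of that approach (illustrative only, not a confirmed fix for the missing COME entities):

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy.util import filter_spans

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

# Several patterns can live under one rule ID, and several rules under one matcher.
matcher.add("GADGET", [[{"LOWER": "iphone"}, {"LOWER": "x"}],
                       [{"LOWER": "iphone"}, {"IS_DIGIT": True}]])
matcher.add("COME", [[{"LOWER": "coming"}]])

doc = nlp("iPhone X is coming")
spans = [Span(doc, start, end, label=match_id)
         for match_id, start, end in matcher(doc)]
# filter_spans drops overlapping spans (keeping the longest), so the
# doc.ents assignment cannot fail with an overlap error.
doc.ents = filter_spans(spans)
print([(ent.text, ent.label_) for ent in doc.ents])  # shows both a GADGET and a COME span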

Related

TensorFlow and tflearn chatbot keeps getting high probability even when user input is wrong

I coded a simple AI chatbot with TensorFlow and tflearn and it runs just fine, but there is an issue: when the user inputs something wrong, the bot is supposed to say it doesn't understand if the prediction confidence is less than 70%, yet the bot always scores above that even if the user types gibberish like "rjrigrejfr", and assumes they are greeting it. The patterns it is supposed to learn from in the JSON are "patterns": ["Hi", "How are you", "Wassup", "Hello", "Good day", "Waddup", "Yo"]. I can share the JSON file if needed; it's short. Anyway, this is the Python code:
import numpy as np
import nltk
import tensorflow
import tflearn
import random
import json
import pickle
# Some extra configuration:
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()
nltk.download('punkt')
# Load the data from the json file into a variable.
with open("intents.json") as file:
data = json.load(file)
# If we already have saved data, we do not need to retrain the model and waste time (could develop into an issue in more complex programs. Save in pickle. )
try:
    with open("data.pickle", "rb") as f:  # rb stands for read bytes.
        words, labels, training, output = pickle.load(f)
# --- Pre-training data preparation ---
except:
    words = []
    docsx = []   # Stores patterns
    docsy = []   # Stores intents
    labels = []  # All the specific tag values such as greeting, contact, etc.
    for intent in data["intents"]:
        for pattern in intent["patterns"]:
            w = nltk.word_tokenize(pattern)  # nltk function that splits each sentence in the intent into a list of words.
            words.extend(w)                  # Add the tokenized list to the words list.
            docsx.append(w)
            docsy.append(intent["tag"])      # Append the classification of the sentence.
        if intent["tag"] not in labels:
            labels.append(intent["tag"])
    words = [stemmer.stem(w.lower()) for w in words if w not in ".?!"]  # Stem the words down to their roots and convert all to lowercase.
    words = sorted(list(set(words)))  # The set removes duplicates; then convert back to a list and sort it.
    labels = sorted(labels)
    training = []
    output = []
    out_empty = [0 for i in range(len(labels))]  # A list of 0s, one per tag. Used later when binarizing.
    # One-hot encode the intent categories. One-hot ("binarized") data is what the model trains on.
    # In this case, we have a list of 0s and 1s: if a word appears it is assigned a 1, else a 0.
    for x, doc in enumerate(docsx):
        bag = []  # Bag of words, i.e. the one-hot encoded data for the model.
        docx_word_stemmed = [stemmer.stem(word) for word in doc]  # Stem the words in this pattern.
        # Build the one-hot encoded bag-of-words list.
        for i in words:
            if i in docx_word_stemmed:  # Check against the stemmed words:
                # Word exists
                bag.append(1)
            else:
                bag.append(0)
        output_row = out_empty[:]  # Copy out_empty.
        # Find the index of this pattern's tag in the labels list and set it to 1.
        output_row[labels.index(docsy[x])] = 1
        training.append(bag)
        output.append(output_row)
    # tflearn requires numpy arrays; they are also faster.
    training = np.array(training)
    output = np.array(output)
    # Save the data so we do not need to redo the preparation every time.
    with open("data.pickle", "wb") as f:
        pickle.dump((words, labels, training, output), f)
try:
    model.load('model.tflearn')
except:
    tensorflow.compat.v1.reset_default_graph()
    net = tflearn.input_data(shape=[None, len(training[0])])
    net = tflearn.fully_connected(net, 8)
    net = tflearn.fully_connected(net, 8)
    net = tflearn.fully_connected(net, len(output[0]), activation='softmax')
    net = tflearn.regression(net)
    model = tflearn.DNN(net)
    model.fit(training, output, n_epoch=1000, batch_size=8, show_metric=True)
    model.save("model.tflearn")
def bagofwords(sentence, words):
    bag = [0 for _ in range(len(words))]  # blank bag of words.
    # Tokenize the sentence and then stem it.
    sentence_words = nltk.word_tokenize(sentence)
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    for string in sentence_words:
        for i, word in enumerate(words):
            if word == string:
                bag[i] = 1
    return np.array(bag)
def chat():
    print("Hello there! I'm the SRO AI Virtual Assistant. How may I help you?")
    # Figure out the error slime!
    while True:
        user_input = input("Type here:")
        if user_input == "quit":
            break
        result = model.predict([bagofwords(user_input, words)])[0]  # Use bagofwords and predict to score what the user is saying.
        best_result = np.argmax(result)  # We want to only use the best result.
        tag = labels[best_result]
        print(result[best_result])
        # Open the JSON data and pick a response.
        if result[best_result] > 0.7:
            for tg in data["intents"]:
                if tg['tag'] == tag:
                    responses = tg['responses']
            print(random.choice(responses))
        else:
            print("I don't quite understand")
chat()
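One thing worth noting about the behaviour described above: a softmax over a handful of intent classes always sums to 1, so even a bag of words that matches nothing (pure gibberish) can produce a winning probability above 0.7. Here is a small illustrative sketch of that effect and of a common guard; the class count, logits, and all-zero check are assumptions for illustration, not taken from the question's model or intents.json:

import numpy as np

def softmax(z):
    # Numerically stable softmax.
    e = np.exp(z - np.max(z))
    return e / e.sum()

# Hypothetical logits a tiny network might emit for an all-zero input:
# with only a few classes, the largest probability is often well above 0.7.
logits = np.array([1.9, 0.4, 0.1, -0.2])
probs = softmax(logits)
print(probs, probs.max())

# A common guard: if the sentence contains no known word at all,
# skip the model and answer with the fallback response directly.
bag = np.zeros(50)
if bag.sum() == 0:
    print("I don't quite understand")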

How to merge same consecutive entity types using Spacy

This is a sample example that uses the EntityRuler to create patterns, but I want to merge consecutive entities of the same type into one entity and token.
import spacy
from spacy.pipeline import EntityRuler
from spacy.util import filter_spans

nlp = spacy.load("en_core_web_sm")  # nlp was not defined in the original snippet; assuming the same model as in the answer below
ent_list_sample = ["brain", "ischimia", "heart failufe", "parenchyma"]
print("Adding patterns to EntityRuler:\n-----------")
patterns = []
for concept in ent_list_sample:
    doc = nlp.make_doc(concept)
    if len(doc) > 1:
        patterns.append({"label": "SCI", "pattern": [{"LOWER": term.text.lower()} for term in doc]})
    else:
        patterns.append({"label": "SCI", "pattern": doc.text.lower()})
ruler = EntityRuler(nlp)
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp("It has a brain and also might have brain parenchyma ")
print("Entities:")
print(doc.ents)
output: (brain, brain, parenchyma)
expected: (brain, brain parenchyma)
PS: how can we reach the expected output without adding an extra pattern for "brain parenchyma"?
import spacy
from spacy.language import Language
from spacy.tokens import Span
from spacy.pipeline import EntityRuler
from spacy.util import filter_spans
from spacy.pipeline import merge_entities

nlp = spacy.load("en_core_web_sm")
ent_list_sample = ['algorithm', 'data', 'engineering', 'software']
patterns = []
for concept in ent_list_sample:
    doc = nlp.make_doc(concept)
    if len(doc) > 1:
        patterns.append({"label": "SCI", "pattern": [{"LOWER": term.text.lower()} for term in doc]})
    else:
        patterns.append({"label": "SCI", "pattern": doc.text.lower()})
ent_list_sample1 = ["brain", "ischimia", "heart failufe", "parenchyma"]
patterns1 = []
for concept in ent_list_sample1:
    doc = nlp.make_doc(concept)
    if len(doc) > 1:
        patterns1.append({"label": "HE", "pattern": [{"LOWER": term.text.lower()} for term in doc]})
    else:
        patterns1.append({"label": "HE", "pattern": doc.text.lower()})
ruler = EntityRuler(nlp)
ruler.add_patterns(patterns + patterns1)
nlp.add_pipe(ruler, before="ner")

class EntityRetokenizeComponent:
    def __init__(self, nlp):
        pass

    def __call__(self, doc):
        new_ents = []
        for ent in doc.ents:
            if ent.label_ == doc[ent.start - 1].ent_type_ and ent.start != 0:
                new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
                new_ents.append(new_ent)
            else:
                new_ents.append(ent)
        doc.ents = filter_spans(new_ents + list(doc.ents))
        return doc

retokenizer = EntityRetokenizeComponent(nlp)
nlp.add_pipe(retokenizer, name='merge_phrases', last=True)
nlp.add_pipe(merge_entities, last=True)
nlp.pipe_names
doc = nlp("I love Ann is good as well data software is good for brain parenchyma and Apple is good company")
print([(ent.text, ent.label_) for ent in doc.ents])
This gave me the desired output:
[('Ann', 'PERSON'), ('data software', 'SCI'), ('brain parenchyma', 'HE'), ('Apple', 'ORG')]
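Since the page title asks about spaCy 3.x: the answer above uses the spaCy v2 API (passing component instances to nlp.add_pipe). A rough, untested adaptation to the v3 API might look like the sketch below; the component name merge_same_label_ents and the toy patterns are my own, not part of the original answer:

import spacy
from spacy.language import Language
from spacy.tokens import Span
from spacy.util import filter_spans

@Language.component("merge_same_label_ents")
def merge_same_label_ents(doc):
    # Extend each entity backwards while the previous token shares its label.
    new_ents = []
    for ent in doc.ents:
        if ent.start != 0 and doc[ent.start - 1].ent_type_ == ent.label_:
            new_ents.append(Span(doc, ent.start - 1, ent.end, label=ent.label))
        else:
            new_ents.append(ent)
    doc.ents = filter_spans(new_ents + list(doc.ents))
    return doc

nlp = spacy.load("en_core_web_sm")
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns([{"label": "HE", "pattern": [{"LOWER": "brain"}]},
                    {"label": "HE", "pattern": [{"LOWER": "parenchyma"}]}])
nlp.add_pipe("merge_same_label_ents", last=True)
nlp.add_pipe("merge_entities", last=True)
print([(ent.text, ent.label_) for ent in nlp("brain parenchyma is mentioned")])

In v3, built-in components such as the entity ruler and merge_entities are added by their string names, and custom stateless components are registered with the @Language.component decorator.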

How do I train a pseudo-projective parser on spaCy?

I am trying to train a parser for custom semantics following the sample code from https://raw.githubusercontent.com/explosion/spaCy/master/examples/training/train_intent_parser.py
The idea is to get a non-projective parse, so when I pass a text like ROOT AAAA BBBB 12 21, 12 becomes a child of AAAA and 21 becomes a child of BBBB. To test this I am training on only this case and testing on this same case, but it doesn't seem to work; what I get as a response is:
[('ROOT', 'ROOT', 'ROOT'), ('AAAA', 'LETTERS', 'ROOT'), ('BBBB', 'LETTERS', 'ROOT'), ('12', 'NUMBERS', 'BBBB'), ('21', 'NUMBERS', 'BBBB')]
As you can see both numbers are dependent on BBBB when 12 should be dependent on AAAA.
The code I am using to train and test is:
import plac
import random
import spacy
from spacy.util import minibatch, compounding

TRAIN_DATA = list()
samples = 1000
for _ in range(samples):
    sample = (
        'ROOT AAAA BBBB 12 21',
        {
            'heads': [0, 0, 0, 1, 2],
            'deps': ['ROOT', 'LETTERS', 'LETTERS', 'NUMBERS', 'NUMBERS']
        }
    )
    TRAIN_DATA.append(sample)

def test_model(nlp):
    texts = ['ROOT AAAA BBBB 12 21']
    docs = nlp.pipe(texts)
    for doc in docs:
        print(doc.text)
        print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != "-"])

@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    n_iter=("Number of training iterations", "option", "n", int),
)
# Just in case, I am using the German model since it supports pseudo-projective parsing (https://explosion.ai/blog/german-model#word-order)
def main(model='de_core_news_sm', n_iter=15):
    """Load the model, set up the pipeline and train the parser."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    # We'll use the built-in dependency parser class, but we want to create a
    # fresh instance – just in case.
    if "parser" in nlp.pipe_names:
        nlp.remove_pipe("parser")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser, first=True)
    for text, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)
    pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)
    # test the trained model
    test_model(nlp)

if __name__ == "__main__":
    plac.call(main)
So, what am I doing wrong?
Thank you in advance for any help on this!
The problem is that the simple training example script isn't projectivizing the training instances when initializing and training the model. The parsing algorithm itself can only handle projective parses, but if the parser component finds projectivized labels in its output, they're deprojectivized in a postprocessing step. You don't need to modify any parser settings (so starting with a German model makes no difference); just provide projectivized input in the right format.
The initial projectivization is handled automatically by the train CLI, which uses GoldCorpus.train_docs() to prepare the training examples for nlp.update() and sets make_projective=True when creating the GoldParses. In general, I'd recommend switching to the train CLI (which also requires switching to the internal JSON training format, admittedly a minor hassle), because it sets a lot of better defaults.
However, a toy example also works fine as long as you create projectivized training examples (with GoldParse(..., make_projective=True)), add all the projectivized dependency labels to the parser, and train with Doc and the projectivized GoldParse input instead of the text/annotation input:
# tested with spaCy v2.2.4
import random

import plac
import spacy
from spacy.util import minibatch, compounding
from spacy.gold import GoldParse

TRAIN_DATA = [
    (
        'ROOT AAAA BBBB 12 21',
        {
            'heads': [0, 0, 0, 1, 2],
            'deps': ['ROOT', 'LETTERS', 'LETTERS', 'NUMBERS', 'NUMBERS']
        }
    )
]
samples = 200

def test_model(nlp):
    texts = ["ROOT AAAA BBBB 12 21"]
    for doc in nlp.pipe(texts):
        print(doc.text)
        print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != "-"])
        spacy.displacy.serve(doc)

@plac.annotations(
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(n_iter=10):
    """Load the model, set up the pipeline and train the parser."""
    nlp = spacy.blank("xx")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser)
    docs_golds = []
    for text, annotation in TRAIN_DATA:
        doc = nlp.make_doc(text)
        gold = GoldParse(doc, **annotation, make_projective=True)
        # add the projectivized labels
        for dep in gold.labels:
            parser.add_label(dep)
        docs_golds.append((doc, gold))
    # duplicate the training instances
    docs_golds = docs_golds * samples
    pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training(min_action_freq=1)
        for itn in range(n_iter):
            random.shuffle(docs_golds)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(docs_golds, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                docs, golds = zip(*batch)
                nlp.update(docs, golds, sgd=optimizer, losses=losses)
            print("Losses", losses)
    # test the trained model
    test_model(nlp)

if __name__ == "__main__":
    plac.call(main)

Where does the "compound" DEP come from?

In the following spaCy code:
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp("It was yesterday morning.")
doc[2].dep_
# 'compound'
doc[2].dep
# 7037928807040764755
doc[3].dep_
# 'npadvmod'
doc[3].dep
# 428
I can track npadvmod back to the symbols.pxd enum.
But where is 'compound' defined (both the string and the ID)?
[SOLVED]
Some DEPs come from the symbols enum; others are added dynamically by the language itself. In that case the value is the hash of the string name, and the strings are stored in a StringStore.
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp("It was yesterday morning.")
doc[2].dep_
# 'compound'
doc[2].dep
# 7037928807040764755
nlp.vocab.strings[doc[2].dep]
# 'compound'
nlp.vocab.strings['compound']
# 7037928807040764755
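A quick way to see the same hashing behaviour for any label that is not in the symbols enum (an illustrative snippet, not from the original answer; the label name my_custom_dep is made up):

import spacy

nlp = spacy.load('en_core_web_sm')
# Any string added to the StringStore maps to a 64-bit hash,
# which is what Token.dep stores for labels outside the symbols enum.
key = nlp.vocab.strings.add("my_custom_dep")
print(key)                                        # 64-bit hash of "my_custom_dep"
print(nlp.vocab.strings[key])                     # 'my_custom_dep'
print(key == nlp.vocab.strings["my_custom_dep"])  # True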

How to set the "band description" option/tag of a GeoTIFF file using GDAL (gdalwarp/gdal_translate)

Does anybody know how to change or set the "Description" option/tag of a GeoTIFF file using GDAL?
To specify what I mean, this is an example of gdalinfo return from a GeoTIFF file with set "Description":
Band 1 Block=64x64 Type=UInt16, ColorInterp=Undefined
  Description = AVHRR Channel 1: 0.58 micrometers -- 0.68 micrometers
  Min=0.000 Max=814.000
  Minimum=0.000, Maximum=814.000, Mean=113.177, StdDev=152.897
  Metadata:
    LAYER_TYPE=athematic
    STATISTICS_MAXIMUM=814
    STATISTICS_MEAN=113.17657236931
    STATISTICS_MINIMUM=0
    STATISTICS_STDDEV=152.89720574652
In the example you can see: Description = AVHRR Channel 1: 0.58 micrometers -- 0.68 micrometers
How do I set this parameter using GDAL?
In Python you can set the band description like this:
from osgeo import gdal, osr
import numpy

# Define output image name, size and projection info:
OutputImage = 'test.tif'
SizeX = 20
SizeY = 20
CellSize = 1
X_Min = 563220.0
Y_Max = 699110.0
N_Bands = 10

srs = osr.SpatialReference()
srs.ImportFromEPSG(2157)
srs = srs.ExportToWkt()
GeoTransform = (X_Min, CellSize, 0, Y_Max, 0, -CellSize)

# Create the output image:
Driver = gdal.GetDriverByName('GTiff')
Raster = Driver.Create(OutputImage, SizeX, SizeY, N_Bands, 2)  # Datatype = 2 same as gdal.GDT_UInt16
Raster.SetProjection(srs)
Raster.SetGeoTransform(GeoTransform)

# Iterate over each band
for band in range(N_Bands):
    BandNumber = band + 1
    BandName = 'SomeBandName ' + str(BandNumber).zfill(3)
    RasterBand = Raster.GetRasterBand(BandNumber)
    RasterBand.SetNoDataValue(0)
    RasterBand.SetDescription(BandName)  # This sets the band name!
    RasterBand.WriteArray(numpy.ones((SizeX, SizeY)))

# close the output image
Raster = None
print("Done.")
Unfortunately, I'm not sure whether ArcGIS or QGIS are able to read the band descriptions. However, the band names are clearly visible in TuiView.
GDAL includes a Python application called gdal_edit.py, which can be used to modify the metadata of a file in place. I am not familiar with the Description field you are referring to, but this tool should be the one to use.
Here is the man page: gdal_edit.py
Here is an example script using an ortho-image I downloaded from the USGS Earth-Explorer.
#!/bin/sh
# Image to modify
IMAGE_PATH='11skd505395.tif'
# Field to modify
IMAGE_FIELD='TIFFTAG_IMAGEDESCRIPTION'
# Print the tiff image description tag
gdalinfo $IMAGE_PATH | grep $IMAGE_FIELD
# Change the Field
CMD="gdal_edit.py -mo ${IMAGE_FIELD}='Lake-Tahoe' $IMAGE_PATH"
echo $CMD
$CMD
# Print the new field value
gdalinfo $IMAGE_PATH | grep $IMAGE_FIELD
Output
$ ./gdal-script.py
TIFFTAG_IMAGEDESCRIPTION=OrthoVista
gdal_edit.py -mo TIFFTAG_IMAGEDESCRIPTION='Lake-Tahoe' 11skd505395.tif
TIFFTAG_IMAGEDESCRIPTION='Lake-Tahoe'
Here is another link that should provide useful info.
https://gis.stackexchange.com/questions/111610/how-to-overwrite-metadata-in-a-tif-file-with-gdal
Here's a single-purpose Python command-line script to edit the band description in place.
''' Set image band description to specified text'''
import sys
from osgeo import gdal

gdal.UseExceptions()

if len(sys.argv) < 4:
    print(f"Usage: {sys.argv[0]} [in_file] [band#] [text]")
    sys.exit(1)

infile = sys.argv[1]       # source filename and path
inband = int(sys.argv[2])  # source band number
descrip = sys.argv[3]      # description text

data_in = gdal.Open(infile, gdal.GA_Update)
band_in = data_in.GetRasterBand(inband)
old_descrip = band_in.GetDescription()
band_in.SetDescription(descrip)
new_descrip = band_in.GetDescription()

# de-reference the dataset, which triggers GDAL to save the change
data_in = None

print(f"Description was: {old_descrip}")
print(f"Description now: {new_descrip}")
In use:
$ python scripts\gdal-edit-band-desc.py test-edit.tif 1 "Red please"
Description was:
Description now: Red please
$ gdal-edit-band-desc test-edit.tif 1 "Red please also"
$ python t:\ENV.558\scripts\gdal-edit-band-desc.py test-edit.tif 1 "Red please also"
Description was: Red please
Description now: Red please also
Properly it should be added to gdal_edit.py, but I don't know enough to feel safe adding it directly.
gdal_edit.py with the -mo flag can be used to edit the band descriptions, with the bands numbered starting from 1:
gdal_edit.py -mo BAND_1=AVHRR_Channel_1_p58_p68_um -mo BAND_2=AVHRR_Channel_2 avhrr.tif
I didn't try it with special characters, but that might work if you use the right quotes.