spaCy does not recognise people's names when they contain an apostrophe

import spacy
nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])
doc = nlp("That had been Megan's plan when she got him dressed earlier.")
labels = [ent.label_ for ent in doc.ents]
entity_text = [ent.text for ent in doc.ents]
print(labels)
print(entity_text)
This returns ['ORG'] for Megan instead of ['PERSON']. What am I doing wrong? I didn't think a simple apostrophe could throw off the NER model like that.

Related

Why doesn't the spaCy morphologizer work when we use a custom tokenizer?

I don't understand why, when I do this:
import spacy
from copy import deepcopy

nlp = spacy.load("fr_core_news_lg")

class MyTokenizer:
    def __init__(self, tokenizer):
        self.tokenizer = deepcopy(tokenizer)

    def __call__(self, text):
        return self.tokenizer(text)

nlp.tokenizer = MyTokenizer(nlp.tokenizer)
doc = nlp("Un texte en français.")
the tokens don't have any morph assigned:
print([tok.morph for tok in doc])
> ['','','','','']
Is this behavior expected? If yes, why? (spaCy v3.0.7)
The pipeline expects nlp.vocab and nlp.tokenizer.vocab to refer to the exact same Vocab object, which isn't the case after running deepcopy.
I admit that I'm not entirely sure off the top of my head why you end up with empty analyses instead of more specific errors, but I think the MorphAnalysis objects, which are stored centrally in the vocab in vocab.morphology, end up out-of-sync between the two vocabs.
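Based on that explanation, a minimal sketch of a fix is to keep a reference to the original tokenizer instead of deep-copying it, so the wrapper and the pipeline keep sharing the same Vocab:

import spacy

nlp = spacy.load("fr_core_news_lg")

class MyTokenizer:
    def __init__(self, tokenizer):
        # Keep a reference rather than a deepcopy, so that
        # self.tokenizer.vocab is the exact same Vocab object as nlp.vocab.
        self.tokenizer = tokenizer

    def __call__(self, text):
        return self.tokenizer(text)

nlp.tokenizer = MyTokenizer(nlp.tokenizer)
doc = nlp("Un texte en français.")
print([tok.morph for tok in doc])  # the morph analyses should no longer be empty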

How to merge consecutive entities of the same type using spaCy

This is a sample example which uses the EntityRuler to create patterns, but I want to merge consecutive entities of the same type into one entity and one token:
import spacy
from spacy.pipeline import EntityRuler
from spacy.util import filter_spans

nlp = spacy.load("en_core_web_sm")  # nlp is used below but was missing from the original snippet

ent_list_sample = ["brain", "ischimia", "heart failufe", "parenchyma"]

print("Adding patterns to EntityRuler:\n-----------")
patterns = []
for concept in ent_list_sample:
    doc = nlp.make_doc(concept)
    if len(doc) > 1:
        patterns.append({"label": "SCI", "pattern": [{"LOWER": term.text.lower()} for term in doc]})
    else:
        patterns.append({"label": "SCI", "pattern": doc.text.lower()})

ruler = EntityRuler(nlp)
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)

doc = nlp("It has a brain and also might have brain parenchyma ")
print("Entities:")
print(doc.ents)
output: (brain, brain, parenchyma)
expected: (brain, brain parenchyma)
PS: how can we reach the expected output without adding an extra pattern for "brain parenchyma"?
import spacy
from spacy.language import Language
from spacy.tokens import Span
from spacy.pipeline import EntityRuler
from spacy.util import filter_spans
from spacy.pipeline import merge_entities

nlp = spacy.load("en_core_web_sm")

ent_list_sample = ['algorithm', 'data', 'engineering', 'software']
patterns = []
for concept in ent_list_sample:
    doc = nlp.make_doc(concept)
    if len(doc) > 1:
        patterns.append({"label": "SCI", "pattern": [{"LOWER": term.text.lower()} for term in doc]})
    else:
        patterns.append({"label": "SCI", "pattern": doc.text.lower()})

ent_list_sample1 = ["brain", "ischimia", "heart failufe", "parenchyma"]
patterns1 = []
for concept in ent_list_sample1:
    doc = nlp.make_doc(concept)
    if len(doc) > 1:
        patterns1.append({"label": "HE", "pattern": [{"LOWER": term.text.lower()} for term in doc]})
    else:
        patterns1.append({"label": "HE", "pattern": doc.text.lower()})

ruler = EntityRuler(nlp)
ruler.add_patterns(patterns + patterns1)
nlp.add_pipe(ruler, before="ner")

class EntityRetokenizeComponent:
    def __init__(self, nlp):
        pass

    def __call__(self, doc):
        new_ents = []
        for ent in doc.ents:
            # If the previous token carries the same entity type, extend the span one
            # token to the left (check ent.start first so we never index doc[-1]).
            if ent.start != 0 and ent.label_ == doc[ent.start - 1].ent_type_:
                new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
                new_ents.append(new_ent)
            else:
                new_ents.append(ent)
        doc.ents = filter_spans(new_ents + list(doc.ents))
        return doc

retokenizer = EntityRetokenizeComponent(nlp)
nlp.add_pipe(retokenizer, name='merge_phrases', last=True)
nlp.add_pipe(merge_entities, last=True)  # built-in component that merges each entity span into a single token
nlp.pipe_names

doc = nlp("I love Ann is good as well data software is good for brain parenchyma and Apple is good company")
print([(ent.text, ent.label_) for ent in doc.ents])
This gave me the desired output:
[('Ann', 'PERSON'), ('data software', 'SCI'), ('brain parenchyma', 'HE'), ('Apple', 'ORG')]

Remove stopwords in TensorFlow Extended

I have to preprocess NLP data, so I have to remove the stopwords (from the nltk library) from a TensorFlow dataset. I tried many things like this:
docs = tf.data.Dataset.from_tensor_slices([['Never tell me the odds.'], ["It's a trap!"]])
tokenizer = text.WhitespaceTokenizer()
tokenized_docs = docs.map(lambda x: tokenizer.tokenize(x))
data = tokenized_docs.filter(lambda x: x not in stop_words)
or this:
tokens = docs.map(lambda x: tokenizer.tokenize(x))
data = tokens.filter(lambda x: tf.strings.strip(x).ref() not in stopwords)
But it didn't work. The first snippet raises an error like: RaggedTensor is unhashable.
From what I can tell, TensorFlow supports basic string normalization (lowercasing + punctuation stripping) via the standardize callback. There doesn't appear to be support for more advanced options, like removing stop words, without doing it yourself.
It's probably easier to just do the standardization beforehand, outside of TensorFlow, and then pass the result on.
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

def parse_text(text):
    print(f'Input: {text}')
    text = re.sub("[^a-zA-Z]", ' ', text)
    print(f'Remove punctuation and numbers: {text}')
    text = text.lower().split()
    print(f'Lowercase and split: {text}')
    swords = set(stopwords.words("english"))
    text = [w for w in text if w not in swords]
    print(f'Remove stop words: {text}')
    text = " ".join(text)
    print(f'Final: {text}')
    return text

list1 = [["Never tell me the odds."], ["It's a trap!"]]
for sublist in list1:
    for i in range(len(sublist)):
        sublist[i] = parse_text(sublist[i])

print(list1)
# [['never tell odds'], ['trap']]
You can use this to remove stopwords when using TFX:
from nltk.corpus import stopwords
outputs['review'] = tf.strings.regex_replace(inputs['review'], r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*',"")
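For context, a line like that would typically live inside a TFX Transform preprocessing_fn. Here is a minimal sketch under that assumption; the feature name 'review' is taken from the snippet above and may differ in your schema:

import tensorflow as tf
from nltk.corpus import stopwords

# Build the stopword regex once, outside the preprocessing function.
STOPWORD_PATTERN = r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*'

def preprocessing_fn(inputs):
    """TFX Transform entry point: strip English stopwords from the 'review' feature."""
    outputs = dict(inputs)
    outputs['review'] = tf.strings.regex_replace(inputs['review'], STOPWORD_PATTERN, "")
    return outputs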

Cannot replace spaCy lemmatized pronouns (-PRON-) through text

I'm trying to lemmatise a text with spaCy. Since spaCy uses -PRON- as lemma for personal pronouns, I want to keep the original text in all those cases.
Here's the relevant section of my code:
...
fout = open('test.txt', 'w+')
doc = nlp(text)
for word in doc:
    if word.lemma_ == "-PRON-":
        write = word.text
        print(write)
    else:
        write = word.lemma_
    fout.write(str(write))
    fout.write(" ")
...
The print statement does print the original words for the cases where spaCy attributes the lemma '-PRON-'.
However, my output file (test.txt) always contains '-PRON-' in those cases, even though I would expect it to contain the original words (I, us, etc.).
What am I missing?
I tried different versions, including using the pos_ tag to identify the pronouns, etc., but always with the same result: my output contains '-PRON-'s.
Try this somewhat altered code snippet to see what you get...
import spacy

nlp = spacy.load('en_core_web_sm')
text = 'Did he write the code for her?'
doc = nlp(text)

out_sent = [w.lemma_ if w.lemma_ != '-PRON-' else w.text for w in doc]
out_sent = ' '.join(out_sent)
print(out_sent)

with open('out_sent.txt', 'w') as f:
    f.write(out_sent + '\n')
This should produce...
do he write the code for her ?

My question is about "module 'textacy' has no attribute 'Doc'"

I can't work out why I get "module 'textacy' has no attribute 'Doc'".
I am trying to extract verb phrases with spaCy, but there is no such attribute. Please help me: how can I extract the verb phrases or adjective phrases using spaCy? I want to do full shallow parsing.
def extract_named_nouns(row_series):
    """Combine nouns and non-numerical entities.

    Keyword arguments:
    row_series -- a Pandas Series object
    """
    ents = set()
    idxs = set()
    # remove duplicates and merge two lists together
    for noun_tuple in row_series['nouns']:
        for named_ents_tuple in row_series['named_ents']:
            if noun_tuple[1] == named_ents_tuple[1]:
                idxs.add(noun_tuple[1])
                ents.add(named_ents_tuple)
        if noun_tuple[1] not in idxs:
            ents.add(noun_tuple)
    return sorted(list(ents), key=lambda x: x[1])


def add_named_nouns(df):
    """Create new column in data frame with nouns and named ents.

    Keyword arguments:
    df -- a dataframe object
    """
    df['named_nouns'] = df.apply(extract_named_nouns, axis=1)
from __future__ import unicode_literals

import spacy, en_core_web_sm
import textacy
from textacy import io

# using spacy for nlp
nlp = en_core_web_sm.load()

sentence = 'The author is writing a new book.'
pattern = r'<VERB>?<ADV>*<VERB>+'

doc = textacy.Doc.load(sentence, metadata=metadata, lang='en_core_web_sm')
# doc = textacy.corpus.Corpus(sentence, lang='en_core_web_sm')
lists = textacy.extract.pos_regex_matches(doc, pattern)
for list in lists:
    print(list.text)
module 'textacy' has no attribute 'Doc'
Try following the examples here: https://chartbeat-labs.github.io/textacy/getting_started/quickstart.html#make-a-doc
It should be as simple as:
doc = textacy.make_spacy_doc("The author is writing a new book.", lang='en_core_web_sm')
You might look into just using spacy (without textacy) with its built-in Matcher instead (https://spacy.io/usage/rule-based-matching).
Alternatively, you can load the spaCy pipeline through textacy and run it over the sentence:
spacy_lang = textacy.load_spacy_lang("en_core_web_sm")
docx_textacy = spacy_lang(sentence)
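If you want to go the plain spaCy Matcher route mentioned above, here is a minimal sketch translating the <VERB>?<ADV>*<VERB>+ POS regex into Matcher syntax (the pattern name "VERB_PHRASE" is just illustrative):

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Rough Matcher equivalent of the <VERB>?<ADV>*<VERB>+ pattern.
verb_phrase_pattern = [
    {"POS": "VERB", "OP": "?"},
    {"POS": "ADV", "OP": "*"},
    {"POS": "VERB", "OP": "+"},
]
matcher.add("VERB_PHRASE", [verb_phrase_pattern])

doc = nlp("The author is writing a new book.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)

Note that the Matcher returns all (possibly overlapping) matches, so you may want to run spacy.util.filter_spans over the resulting spans to keep only the longest ones.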