Problem with "Span.as_doc()" method in Spacy - spacy

I am working on extracting datives and direct objects using spaCy. The noun chunks in doc.noun_chunks already carry dependency tags for their roots, such as dative and dobj; what I am trying to do is get a Span and save it as a Doc to apply further analysis.
I have the following code:
import spacy
nlp = spacy.load("en_core_web_lg")
doc = nlp(open("/-textfile").read())
So far so good; next, I get the Span objects:
datives = []
for dat in doc.noun_chunks:
    if dat.root.dep_ == "dative" and dat.root.head.pos_ == "VERB":
        datives.append(dat.sent)
Now I have all the sentences containing noun chunks whose root is a dative and whose root head is a VERB.
However, I would like to get token data from the datives list, like this:
dativesent = datives.as_doc()
But the problem is that datives is a list, so I cannot convert it to a Doc.
How can I save the sentences with dative noun chunks as a Doc?

You can iterate over a sentence (which is a Span) just like a Doc to access the tokens:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("She gave the dog a bone. He read a book. They gave her a book.")

dative_sents = []
for nc in doc.noun_chunks:
    if nc.root.dep_ == "dative" and nc.root.head.pos_ == "VERB":
        dative_sents.append(nc.sent)

for dative_sent in dative_sents:
    print("Sentence with dative:", dative_sent.text)
    for token in dative_sent:
        print(token.text, token.pos_, token.dep_)
    print()
Output:
Sentence with dative: She gave the dog a bone.
She PRON nsubj
gave VERB ROOT
the DET det
dog NOUN dative
a DET det
bone NOUN dobj
. PUNCT punct
Sentence with dative: They gave her a book.
They PRON nsubj
gave VERB ROOT
her PRON dative
a DET det
book NOUN dobj
. PUNCT punct
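If you specifically need Doc objects rather than Spans, note that as_doc() is a method on an individual Span, not on a list. A minimal sketch, building on the dative_sents list above:
# as_doc() converts one Span at a time; call it per sentence, not on the list
dative_docs = [sent.as_doc() for sent in dative_sents]
for dative_doc in dative_docs:
    print(type(dative_doc), dative_doc.text)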

Related

spaCy does not recognise people's names when they contain an apostrophe

import spacy
nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])
doc = nlp("That had been Megan's plan when she got him dressed earlier.")
labels = [ent.label_ for ent in doc.ents]
entity_text = [ent.text for ent in doc.ents]
print(labels)
print(entity_text)
This returns [ORG] for Megan instead of [PERSON]. What am I doing wrong? I didn't think a simple apostrophe could throw off the NER model like that.
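One thing worth checking (an observation, not from the original post): in the en_core_web_sm pipeline the ner component shares token-to-vector features with tok2vec, so disabling tok2vec can itself degrade NER predictions. Comparing against the pipeline with nothing disabled isolates whether the apostrophe is really the problem:
import spacy

# Load the full pipeline; disabling components that NER depends on
# (such as tok2vec) can change entity predictions on its own.
nlp = spacy.load("en_core_web_sm")
doc = nlp("That had been Megan's plan when she got him dressed earlier.")
print([(ent.text, ent.label_) for ent in doc.ents])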

How to add multiple custom entity recognition patterns using spaCy 3.x

Here is input.json:
[
"How to preorder the iPhone X",
"iPhone X is coming",
"Should I pay $1,000 for the iPhone X?",
"The iPhone 8 reviews are here",
"iPhone 11 vs iPhone 8: What's the difference?",
"I need a new phone! Any tips?",
"Why the iPhone 21",
"iPhone 9 is coming",
"Should I pay $1,000 for the X iPhone?",
"The iPhone 29 reviews are here",
"iPhone 999 vs iPhone 8: What's the difference?",
"I need a new phone! Any tips?",
"Are we talking about phones"
]
Question: why "coming" is not showing in the output. If I remove matcher.add for GADGET
then only it shows otherwise not
The final question is how can we add multiple patterns in SpaCy
All the detail is given below
import json
import random

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span, DocBin

with open("input.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

# Add patterns to the matcher
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]
matcher.add("GADGET", [pattern1, pattern2])
pattern3 = [{"LOWER": "coming"}]
matcher.add("COME", [pattern3])

docs = []
for doc in nlp.pipe(TEXTS):
    matches = matcher(doc)
    spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]
    print(spans)
    doc.ents = spans
    print(f'spans is: {spans} with match: {matches}')
    #print([(ent.text, ent.label_) for ent in doc.ents])
    docs.append(doc)
print(docs)

random.shuffle(docs)
train_docs = docs[:len(docs) // 2]
dev_docs = docs[len(docs) // 2:]

# Create and save a collection of training docs
train_docbin = DocBin(docs=train_docs)
train_docbin.to_disk("./train.spacy")
# Create and save a collection of evaluation docs
dev_docbin = DocBin(docs=dev_docs)
dev_docbin.to_disk("./dev.spacy")

# The commands below are run for config and training.
# For config:
# python -m spacy init config ./config.cfg --lang en --pipeline ner
# For training:
# python -m spacy train ./config.cfg --output ./output --paths.train train.spacy --paths.dev dev.spacy

nlp = spacy.load("output/model-best")
doc = nlp("iPhone 55 vs iPhone 8: What's the difference? coming")
print(doc.ents)
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)
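A side note (not from the original post): if two patterns ever match overlapping tokens, assigning the raw match spans to doc.ents raises an error. spaCy's filter_spans utility keeps the longest non-overlapping spans and is a common way to combine several pattern sets safely. A minimal, self-contained sketch of that step:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy.util import filter_spans

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("GADGET", [[{"LOWER": "iphone"}, {"LOWER": "x"}],
                       [{"LOWER": "iphone"}, {"IS_DIGIT": True}]])
matcher.add("COME", [[{"LOWER": "coming"}]])

doc = nlp("iPhone X is coming")
spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matcher(doc)]
doc.ents = filter_spans(spans)  # keeps the longest span when matches overlap
print([(ent.text, ent.label_) for ent in doc.ents])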

How to merge same consecutive entity types using Spacy

This is a sample example, which uses EntityRuler to create patterns, but I want to merge consecutive entities of the same type into one entity and token.
import spacy
from spacy.pipeline import EntityRuler
from spacy.util import filter_spans

nlp = spacy.load("en_core_web_sm")  # nlp was missing from the original snippet

ent_list_sample = ["brain", "ischimia", "heart failufe", "parenchyma"]

print("Adding patterns to EntityRuler:\n-----------")
patterns = []
for concept in ent_list_sample:
    doc = nlp.make_doc(concept)
    if len(doc) > 1:
        patterns.append({"label": "SCI", "pattern": [{"LOWER": term.text.lower()} for term in doc]})
    else:
        patterns.append({"label": "SCI", "pattern": doc.text.lower()})

ruler = EntityRuler(nlp)
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)  # spaCy v2 style

doc = nlp("It has a brain and also might have brain parenchyma ")
print("Entities:")
print(doc.ents)
Output: (brain, brain, parenchyma)
Expected: (brain, brain parenchyma)
PS: How can we reach the expected output without adding an extra pattern for "brain parenchyma"?
import spacy
from spacy.language import Language
from spacy.tokens import Span
from spacy.pipeline import EntityRuler
from spacy.util import filter_spans
from spacy.pipeline import merge_entities

nlp = spacy.load("en_core_web_sm")

ent_list_sample = ['algorithm', 'data', 'engineering', 'software']
patterns = []
for concept in ent_list_sample:
    doc = nlp.make_doc(concept)
    if len(doc) > 1:
        patterns.append({"label": "SCI", "pattern": [{"LOWER": term.text.lower()} for term in doc]})
    else:
        patterns.append({"label": "SCI", "pattern": doc.text.lower()})

ent_list_sample1 = ["brain", "ischimia", "heart failufe", "parenchyma"]
patterns1 = []
for concept in ent_list_sample1:
    doc = nlp.make_doc(concept)
    if len(doc) > 1:
        patterns1.append({"label": "HE", "pattern": [{"LOWER": term.text.lower()} for term in doc]})
    else:
        patterns1.append({"label": "HE", "pattern": doc.text.lower()})

ruler = EntityRuler(nlp)
ruler.add_patterns(patterns + patterns1)
nlp.add_pipe(ruler, before="ner")

class EntityRetokenizeComponent:
    def __init__(self, nlp):
        pass

    def __call__(self, doc):
        new_ents = []
        for ent in doc.ents:
            # Extend the entity one token to the left when the preceding
            # token carries the same entity type.
            if ent.label_ == doc[ent.start - 1].ent_type_ and ent.start != 0:
                new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
                new_ents.append(new_ent)
            else:
                new_ents.append(ent)
        doc.ents = filter_spans(new_ents + list(doc.ents))
        return doc

retokenizer = EntityRetokenizeComponent(nlp)
nlp.add_pipe(retokenizer, name='merge_phrases', last=True)
nlp.add_pipe(merge_entities, last=True)
print(nlp.pipe_names)

doc = nlp("I love Ann is good as well data software is good for brain parenchyma and Apple is good company")
print([(ent.text, ent.label_) for ent in doc.ents])
This gave me the desired output:
[('Ann', 'PERSON'), ('data software', 'SCI'), ('brain parenchyma', 'HE'), ('Apple', 'ORG')]
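Note that the snippet above uses the spaCy v2 API (passing component objects to nlp.add_pipe). A minimal sketch of the same idea under spaCy v3, where custom components are registered by name (the component name merge_adjacent_ents and the two sample patterns are my own choices):
import spacy
from spacy.language import Language
from spacy.tokens import Span
from spacy.util import filter_spans

nlp = spacy.load("en_core_web_sm")
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns([{"label": "HE", "pattern": "brain"},
                    {"label": "HE", "pattern": "parenchyma"}])

@Language.component("merge_adjacent_ents")
def merge_adjacent_ents(doc):
    new_ents = []
    for ent in doc.ents:
        # Extend the span left while the preceding token has the same entity type.
        if ent.start != 0 and doc[ent.start - 1].ent_type_ == ent.label_:
            new_ents.append(Span(doc, ent.start - 1, ent.end, label=ent.label))
        else:
            new_ents.append(ent)
    doc.ents = filter_spans(new_ents + list(doc.ents))
    return doc

nlp.add_pipe("merge_adjacent_ents", last=True)
nlp.add_pipe("merge_entities", last=True)  # built-in factory name in v3

doc = nlp("It has a brain and also might have brain parenchyma")
print([(ent.text, ent.label_) for ent in doc.ents])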

Cannot replace spaCy lemmatized pronouns (-PRON-) through text

I'm trying to lemmatise a text with spaCy. Since spaCy uses -PRON- as lemma for personal pronouns, I want to keep the original text in all those cases.
Here's the relevant section of my code:
...
fout = open('test.txt', 'w+')
doc = nlp(text)
for word in doc:
    if word.lemma_ == "-PRON-":
        write = word.text
        print(write)
    else:
        write = word.lemma_
    fout.write(str(write))
    fout.write(" ")
...
The print statement does print the original words in the cases where spaCy assigns the lemma '-PRON-'.
However, my output file (test.txt) always contains '-PRON-' in those cases, even though I would expect it to contain the original words (I, us, etc.).
What am I missing?
I tried different versions, including using the pos_ tag to identify the pronouns, but always with the same result: my output contains '-PRON-'s.
Try this somewhat altered code snippet to see what you get...
import spacy

nlp = spacy.load('en_core_web_sm')
text = 'Did he write the code for her?'
doc = nlp(text)

out_sent = [w.lemma_ if w.lemma_ != '-PRON-' else w.text for w in doc]
out_sent = ' '.join(out_sent)
print(out_sent)

with open('out_sent.txt', 'w') as f:
    f.write(out_sent + '\n')
This should produce...
do he write the code for her ?
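A version note (my addition, not from the original thread): the '-PRON-' placeholder only exists in spaCy v2. In spaCy v3 the lemmatizer returns real word forms for pronouns, so no special-casing is needed there:
import spacy

# Assumes a spaCy v3 pipeline, where "-PRON-" no longer appears as a lemma.
nlp = spacy.load('en_core_web_sm')
doc = nlp('Did he write the code for her?')
print(' '.join(token.lemma_ for token in doc))
# Pronoun lemmas come out as ordinary strings such as "he", not "-PRON-".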

My question is about "module 'textacy' has no attribute 'Doc'"

I get the error: module 'textacy' has no attribute 'Doc'.
I am trying to extract verb phrases with spaCy, but I cannot find the right API for this. Please help me: how can I extract verb phrases or adjective phrases using spaCy? I want to do full shallow parsing.
def extract_named_nouns(row_series):
    """Combine nouns and non-numerical entities.

    Keyword arguments:
    row_series -- a Pandas Series object
    """
    ents = set()
    idxs = set()
    # remove duplicates and merge two lists together
    for noun_tuple in row_series['nouns']:
        for named_ents_tuple in row_series['named_ents']:
            if noun_tuple[1] == named_ents_tuple[1]:
                idxs.add(noun_tuple[1])
                ents.add(named_ents_tuple)
        if noun_tuple[1] not in idxs:
            ents.add(noun_tuple)
    return sorted(list(ents), key=lambda x: x[1])

def add_named_nouns(df):
    """Create new column in data frame with nouns and named ents.

    Keyword arguments:
    df -- a dataframe object
    """
    df['named_nouns'] = df.apply(extract_named_nouns, axis=1)
from __future__ import unicode_literals

import spacy
import en_core_web_sm
import textacy
from textacy import io

# using spacy for nlp
nlp = en_core_web_sm.load()
sentence = 'The author is writing a new book.'
pattern = r'<VERB>?<ADV>*<VERB>+'

doc = textacy.Doc.load(sentence, metadata=metadata, lang='en_core_web_sm')  # this line raises the error below
# doc = textacy.corpus.Corpus(sentence, lang='en_core_web_sm')
lists = textacy.extract.pos_regex_matches(doc, pattern)
for list in lists:
    print(list.text)
module 'textacy' has no attribute 'Doc'
Try following the examples here: https://chartbeat-labs.github.io/textacy/getting_started/quickstart.html#make-a-doc
It should be as simple as:
doc = textacy.make_spacy_doc("The author is writing a new book.", lang='en_core_web_sm')
You might look into just using spacy (without textacy) with its built-in Matcher instead (https://spacy.io/usage/rule-based-matching).
spacy_lang = textacy.load_spacy_lang("en_core_web_sm")
docx_textacy = spacy_lang(sentence)
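To illustrate the Matcher alternative mentioned above, here is a minimal sketch that mirrors the textacy regex <VERB>?<ADV>*<VERB>+ with token patterns (the pattern name VERB_PHRASE is my own choice):
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
verb_phrase = [
    {"POS": "VERB", "OP": "?"},
    {"POS": "ADV", "OP": "*"},
    {"POS": "VERB", "OP": "+"},
]
# greedy="LONGEST" keeps only the longest match when matches overlap
matcher.add("VERB_PHRASE", [verb_phrase], greedy="LONGEST")

doc = nlp("The author is writing a new book.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)
# Prints "writing" with models that tag "is" as AUX rather than VERB.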