How to merge same consecutive entity types using spaCy

This is a sample example that uses the EntityRuler to create patterns, but I want to merge consecutive entities of the same type into one entity and token.
import spacy
from spacy.pipeline import EntityRuler
from spacy.util import filter_spans

nlp = spacy.load("en_core_web_sm")  # model load (any English pipeline; the original snippet omits this line)

ent_list_sample = ["brain", "ischimia", "heart failufe", "parenchyma"]
print("Adding patterns to EntityRuler:\n-----------")
patterns = []
for concept in ent_list_sample:
    doc = nlp.make_doc(concept)
    if len(doc) > 1:
        patterns.append({"label": "SCI", "pattern": [{"LOWER": term.text.lower()} for term in doc]})
    else:
        patterns.append({"label": "SCI", "pattern": doc.text.lower()})

ruler = EntityRuler(nlp)
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)

doc = nlp("It has a brain and also might have brain parenchyma ")
print("Entities:")
print(doc.ents)
Output: (brain, brain, parenchyma)
Expected: (brain, brain parenchyma)
PS: How can we reach the expected output without adding an extra pattern for "brain parenchyma"?

import spacy
from spacy.language import Language
from spacy.tokens import Span
from spacy.pipeline import EntityRuler
from spacy.util import filter_spans
from spacy.pipeline import merge_entities

nlp = spacy.load("en_core_web_sm")

ent_list_sample = ['algorithm', 'data', 'engineering', 'software']
patterns = []
for concept in ent_list_sample:
    doc = nlp.make_doc(concept)
    if len(doc) > 1:
        patterns.append({"label": "SCI", "pattern": [{"LOWER": term.text.lower()} for term in doc]})
    else:
        patterns.append({"label": "SCI", "pattern": doc.text.lower()})

ent_list_sample1 = ["brain", "ischimia", "heart failufe", "parenchyma"]
patterns1 = []
for concept in ent_list_sample1:
    doc = nlp.make_doc(concept)
    if len(doc) > 1:
        patterns1.append({"label": "HE", "pattern": [{"LOWER": term.text.lower()} for term in doc]})
    else:
        patterns1.append({"label": "HE", "pattern": doc.text.lower()})

ruler = EntityRuler(nlp)
ruler.add_patterns(patterns + patterns1)
nlp.add_pipe(ruler, before="ner")

class EntityRetokenizeComponent:
    def __init__(self, nlp):
        pass

    def __call__(self, doc):
        new_ents = []
        for ent in doc.ents:
            # If the previous token carries the same entity label, grow the span to include it.
            if ent.start != 0 and ent.label_ == doc[ent.start - 1].ent_type_:
                new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
                new_ents.append(new_ent)
            else:
                new_ents.append(ent)
        doc.ents = filter_spans(new_ents + list(doc.ents))
        return doc

retokenizer = EntityRetokenizeComponent(nlp)
nlp.add_pipe(retokenizer, name='merge_phrases', last=True)
nlp.add_pipe(merge_entities, last=True)
nlp.pipe_names

doc = nlp("I love Ann is good as well data software is good for brain parenchyma and Apple is good company")
print([(ent.text, ent.label_) for ent in doc.ents])
This gave me the desired output:
[('Ann', 'PERSON'), ('data software', 'SCI'), ('brain parenchyma', 'HE'), ('Apple', 'ORG')]

Can I use OR-Tools for TSP with a partial distance matrix (for a huge set of nodes)?

I'm trying to solve TSP with OR-Tools for a problem of roughly 80,000 nodes. The problem is that I need a huge distance matrix that takes too much memory, so it's infeasible and I don't get a solution.
So:
Is there an option to work with a partial distance matrix in OR-Tools?
If not, is there a way to improve my code?
Is there another external solver that can work for this task in Python?
import math
import random
import time
from collections import namedtuple
from math import sqrt

import numpy as np
import numba
from scipy.spatial import distance_matrix
from sklearn.metrics.pairwise import euclidean_distances

Point = namedtuple("Point", ['x', 'y'])

# Euclidean tour-length helper (used below; not shown in the original post)
def length(point1, point2):
    return math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2)
def solve_it(input_data):
    # Modify this code to run your optimization algorithm
    global POINTS

    # parse the input
    lines = input_data.split('\n')
    nodeCount = int(lines[0])
    points = []
    for i in range(1, nodeCount + 1):
        line = lines[i]
        parts = line.split()
        points.append(Point(float(parts[0]), float(parts[1])))

    # 2. routing with OR-Tools
    def dist_matrix(nodeCount, points):
        data = []
        for k in range(len(points)):
            data.append([int(points[k].x), int(points[k].y)])
        D = euclidean_distances(data, data)
        return D

    def create_data_model(D):
        """Stores the data for the problem."""
        data = {}
        data['distance_matrix'] = D  # yapf: disable
        data['num_vehicles'] = 1
        data['depot'] = 0
        return data

    def print_solution(manager, routing, solution):
        index = routing.Start(0)
        plan_output = []  # Route for vehicle 0
        route_distance = 0
        while not routing.IsEnd(index):
            plan_output.append(manager.IndexToNode(index))
            index = solution.Value(routing.NextVar(index))
        return plan_output

    def or_main(nodeCount, points):
        """Entry point of the program."""
        from ortools.constraint_solver import routing_enums_pb2
        from ortools.constraint_solver import pywrapcp

        # Instantiate the data problem.
        global sol
        D = dist_matrix(nodeCount, points)
        data = create_data_model(D)

        # Create the routing index manager.
        manager = pywrapcp.RoutingIndexManager(len(data['distance_matrix']),
                                               data['num_vehicles'], data['depot'])

        # Create the routing model.
        routing = pywrapcp.RoutingModel(manager)

        def distance_callback(from_index, to_index):
            """Returns the distance between the two nodes."""
            # Convert from routing variable Index to distance matrix NodeIndex.
            from_node = manager.IndexToNode(from_index)
            to_node = manager.IndexToNode(to_index)
            return data['distance_matrix'][from_node][to_node]

        transit_callback_index = routing.RegisterTransitCallback(distance_callback)

        # Define cost of each arc.
        routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)

        # Setting first solution heuristic.
        search_parameters = pywrapcp.DefaultRoutingSearchParameters()
        search_parameters.local_search_metaheuristic = (
            routing_enums_pb2.LocalSearchMetaheuristic.GUIDED_LOCAL_SEARCH)
        k = 100
        if nodeCount <= 100:
            k = 30
        elif 100 <= nodeCount <= 1000:
            k = 300
        elif nodeCount > 1000:
            k = 17000
        search_parameters.time_limit.seconds = k
        search_parameters.log_search = True

        # Solve the problem.
        solution = routing.SolveWithParameters(search_parameters)

        # Print solution on console.
        if solution:
            sol = print_solution(manager, routing, solution)
        return sol

    ######################################################################
    solution = or_main(nodeCount, points)

    # calculate the length of the tour
    obj = length(points[solution[-1]], points[solution[0]])
    for index in range(0, nodeCount - 1):
        obj += length(points[solution[index]], points[solution[index + 1]])

    # prepare the solution in the specified output format
    output_data = '%.2f' % obj + ' ' + str(0) + '\n'
    output_data += ' '.join(map(str, solution))
    return output_data

if __name__ == '__main__':
    import sys
    if len(sys.argv) > 1:
        file_location = sys.argv[1].strip()
        with open(file_location, 'r') as input_data_file:
            input_data = input_data_file.read()
        print(solve_it(input_data))
    else:
        print('This test requires an input file. Please select one from the data directory. (i.e. python solver.py ./data/tsp_51_1)')
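One way around the memory problem (a sketch, not from the original post, reusing the Point namedtuple above): skip the precomputed matrix entirely and let the transit callback compute each arc cost on demand. Whether OR-Tools itself scales to 80,000 nodes within your time limit is a separate question.
from math import sqrt
from ortools.constraint_solver import pywrapcp, routing_enums_pb2

def or_main_no_matrix(points, time_limit_s=60):
    """Solve a TSP instance without materialising the full distance matrix."""
    manager = pywrapcp.RoutingIndexManager(len(points), 1, 0)
    routing = pywrapcp.RoutingModel(manager)

    def distance_callback(from_index, to_index):
        # Distances are computed lazily from coordinates; OR-Tools expects integer costs.
        a = points[manager.IndexToNode(from_index)]
        b = points[manager.IndexToNode(to_index)]
        return int(sqrt((a.x - b.x) ** 2 + (a.y - b.y) ** 2))

    transit_callback_index = routing.RegisterTransitCallback(distance_callback)
    routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)

    search_parameters = pywrapcp.DefaultRoutingSearchParameters()
    search_parameters.first_solution_strategy = (
        routing_enums_pb2.FirstSolutionStrategy.PATH_CHEAPEST_ARC)
    search_parameters.time_limit.seconds = time_limit_s

    solution = routing.SolveWithParameters(search_parameters)
    if solution is None:
        return None

    # Walk the route to return the visiting order.
    tour, index = [], routing.Start(0)
    while not routing.IsEnd(index):
        tour.append(manager.IndexToNode(index))
        index = solution.Value(routing.NextVar(index))
    return tour
The trade-off is that every distance is recomputed each time the solver asks for it, so it trades memory for CPU time.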

Remove stopwords in TensorFlow Extended

I have to preprocess NLP data, so I have to remove the stopwords (from the nltk library) from a TensorFlow dataset. I tried many things like this:
import tensorflow as tf
import tensorflow_text as text  # the tokenizer comes from tensorflow_text

docs = tf.data.Dataset.from_tensor_slices([['Never tell me the odds.'], ["It's a trap!"]])
tokenizer = text.WhitespaceTokenizer()
tokenized_docs = docs.map(lambda x: tokenizer.tokenize(x))
data = tokenized_docs.filter(lambda x: x not in stop_words)
or this:
tokens = docs.map(lambda x: tokenizer.tokenize(x))
data = tokens.filter(lambda x: tf.strings.strip(x).ref() not in stopwords)
But it didn't work. The first code shows an error like: RaggedTensor is unhashable.
From what I can tell, TensorFlow supports basic string normalization (lowercasing + punctuation stripping) through the standardize callback. There doesn't appear to be support for more advanced options, like removing stop words, without doing it yourself.
It's probably easier to just do the standardization beforehand, outside of TensorFlow, and then pass the result on.
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

def parse_text(text):
    print(f'Input: {text}')
    text = re.sub("[^a-zA-Z]", ' ', text)
    print(f'Remove punctuation and numbers: {text}')
    text = text.lower().split()
    print(f'Lowercase and split: {text}')
    swords = set(stopwords.words("english"))
    text = [w for w in text if w not in swords]
    print(f'Remove stop words: {text}')
    text = " ".join(text)
    print(f'Final: {text}')
    return text

list1 = [["Never tell me the odds."], ["It's a trap!"]]
for sublist in list1:
    for i in range(len(sublist)):
        sublist[i] = parse_text(sublist[i])

print(list1)
# [['never tell odds'], ['trap']]
You can use this to remove stopwords when using TFX:
from nltk.corpus import stopwords
outputs['review'] = tf.strings.regex_replace(inputs['review'], r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*',"")
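For context, here is a minimal sketch of where that line could live inside a tf.Transform preprocessing_fn; the feature name 'review' and the surrounding function are assumptions, not part of the original answer.
import tensorflow as tf
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
# Build one regex alternation covering all English stopwords.
STOPWORD_PATTERN = r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*'

def preprocessing_fn(inputs):
    """tf.Transform preprocessing_fn: strip English stopwords from the text feature."""
    outputs = dict(inputs)
    outputs['review'] = tf.strings.regex_replace(inputs['review'], STOPWORD_PATTERN, "")
    return outputs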

Pandas UDF (PySpark) - Incorrect type Error

I'm trying entity extraction with spaCy and a Pandas UDF (PySpark), but I get an error.
Using a plain UDF works without errors but is slow. What am I doing wrong?
Loading the model inside the function every time is to avoid the load error: Can't find model 'en_core_web_lg'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.
Working UDF:
import spacy
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql import types as T

def __get_entities(x):
    global nlp
    nlp = spacy.load("en_core_web_lg")
    ents = []
    doc = nlp(x)
    for ent in doc.ents:
        if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            ents.append(ent.label_)
    return ents

get_entities_udf = F.udf(__get_entities, T.ArrayType(T.StringType()))
Pandas UDF with error:
def __get_entities(x):
    global nlp
    nlp = spacy.load("en_core_web_lg")
    ents = []
    doc = nlp(x)
    for ent in doc.ents:
        if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            ents.append(ent.label_)
    return pd.Series(ents)

get_entities_udf = F.pandas_udf(lambda x: __get_entities(x), "array<string>", F.PandasUDFType.SCALAR)
Error message:
TypeError: Argument 'string' has incorrect type (expected str, got Series)
Sample Spark DataFrame:
df = spark.createDataFrame([
    ['John Doe'],
    ['Jane Doe'],
    ['Microsoft Corporation'],
    ['Apple Inc.'],
]).toDF("name",)
New column:
df_new = df.withColumn('entity',get_entities_udf('name'))
You need to treat the input as a pd.Series instead of a single value.
I was able to get it working by refactoring the code a bit. Notice the x.apply call, which is pandas-specific and applies a function to a pd.Series.
from pyspark.sql.functions import pandas_udf, PandasUDFType

def entities(x):
    global nlp
    import spacy
    nlp = spacy.load("en_core_web_lg")
    ents = []
    doc = nlp(x)
    for ent in doc.ents:
        if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            ents.append(ent.label_)
    return ents

def __get_entities(x):
    return x.apply(entities)

get_entities_udf = pandas_udf(lambda x: __get_entities(x), "array<string>", PandasUDFType.SCALAR)

df_new = df.withColumn('entity', get_entities_udf('name'))
df_new.show()
+--------------------+--------+
| name| entity|
+--------------------+--------+
| John Doe|[PERSON]|
| Jane Doe|[PERSON]|
|Microsoft Corpora...| [ORG]|
| Apple Inc.| [ORG]|
+--------------------+--------+
I'm using: pyspark 3.1.1 and python 3.7
The answer above didn't work for me, and I spent quite some time making things work, so I thought I'd share the solution I came up with.
Setting things up
Creating a sample of 16 random person and company names:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.functions import pandas_udf, PandasUDFType
from faker import Faker
import spacy

spark = SparkSession.builder.appName("pyspark_sandbox").getOrCreate()

names = []
fake = Faker()
for _ in range(8):
    names.append(f"{fake.company()} {fake.company_suffix()}")
    names.append(fake.name())

df = spark.createDataFrame(names, StringType())
As it is
First, checking the currently proposed solution. I'm just adding a print statement upon loading the spaCy model to see how many times we load it.
# printing a msg each time we load the model
def load_spacy_model():
    print("Loading spacy model...")
    return spacy.load("en_core_web_sm")

def entities(x):
    global nlp
    import spacy
    nlp = load_spacy_model()
    ents = []
    doc = nlp(x)
    for ent in doc.ents:
        if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            ents.append(ent.label_)
    return ents

def __get_entities(x):
    return x.apply(entities)

get_entities_udf = pandas_udf(lambda x: __get_entities(x), "array<string>", PandasUDFType.SCALAR)

df_new = df.withColumn('entity', get_entities_udf('value'))
df_new.show()
We can then see that the model is loaded 16 times, once for every single entry we process. Not what I want.
Batch processing
Rewriting using the decorator introduced in Spark 3.0+, which relies on type hints (Python 3.6+). Our UDF then uses nlp.pipe() to batch-process the entire pd.Series.
# printing a msg each time we load the model
def load_spacy_model():
    print("Loading spacy model...")
    return spacy.load("en_core_web_sm")

# decorator indicating that this function is a pandas_udf
# and that it's gonna process a list of strings
@pandas_udf(ArrayType(StringType()))
# function receiving a pd.Series and returning a pd.Series
def entities(list_of_text: pd.Series) -> pd.Series:
    global nlp
    nlp = load_spacy_model()
    docs = nlp.pipe(list_of_text)
    # retrieving the str representation of entity label
    # as we are limited in the types of obj
    # we can return from a pandas_udf
    # we couldn't return a Span obj for example
    ents = [
        [ent.label_ for ent in doc.ents]
        for doc in docs
    ]
    return pd.Series(ents)

df_new = df.withColumn('entity', entities('value'))
df_new.show()
In my case the model was loaded 4 times, which is better: once each time a Python worker is created to process a batch. The number will depend on how many cores Spark is using and, more critically in my case, on how our data is partitioned. So it's not optimal yet.
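As an aside (an assumption on my part, not something I benchmarked carefully): you can influence how many batches, and therefore how many model loads, you get by repartitioning the DataFrame before applying the UDF.
# fewer partitions -> fewer tasks/batches -> fewer model loads (at the cost of parallelism)
df = df.repartition(2)
df_new = df.withColumn('entity', entities('value'))
df_new.show()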
Broadcasting the nlp object
# printing a msg each time we load the model
def load_spacy_model():
    print("Loading spacy model...")
    return spacy.load("en_core_web_sm")

@pandas_udf(ArrayType(StringType()))
def entities(list_of_text: pd.Series) -> pd.Series:
    nlp = broadcasted_nlp.value
    docs = nlp.pipe(list_of_text)
    # retrieving the str representation of entity label
    # as we are limited in the types of obj
    # we can return from a pandas_udf
    # we couldn't return a Span obj for example
    ents = [
        [ent.label_ for ent in doc.ents]
        for doc in docs
    ]
    return pd.Series(ents)

broadcasted_nlp = spark.sparkContext.broadcast(load_spacy_model())

df_new = df.withColumn('entity', entities('value'))
df_new.show()
Now the model is loaded only once and then broadcast to every Python worker that gets spawned.
The complete Code
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.functions import pandas_udf, PandasUDFType
from faker import Faker
import spacy

spark = SparkSession.builder.appName("pyspark_sandbox").getOrCreate()

# creating our set of fake person and company names
names = []
fake = Faker()
for _ in range(8):
    names.append(f"{fake.company()} {fake.company_suffix()}")
    names.append(fake.name())

df = spark.createDataFrame(names, StringType())

# printing a msg each time we load the model
def load_spacy_model():
    print("Loading spacy model...")
    return spacy.load("en_core_web_sm")

# decorator indicating that this function is a pandas_udf
# and that it's gonna process a list of strings
@pandas_udf(ArrayType(StringType()))
# function receiving a pd.Series and returning a pd.Series
def entities(list_of_text: pd.Series) -> pd.Series:
    # retrieving the shared nlp object
    nlp = broadcasted_nlp.value
    # batch processing our list of text
    docs = nlp.pipe(list_of_text)
    # retrieving the str representation of entity label
    # as we are limited in the types of obj
    # we can return from a pandas_udf
    # we couldn't return a Span obj for example
    ents = [
        [ent.label_ for ent in doc.ents]
        for doc in docs
    ]
    return pd.Series(ents)

# we load the spacy model and broadcast it
broadcasted_nlp = spark.sparkContext.broadcast(load_spacy_model())

df_new = df.withColumn('entity', entities('value'))
df_new.show(truncate=False)
Result
+----------------------------------+--------------------------------+
|value |entity |
+----------------------------------+--------------------------------+
|Ferguson, Price and Green Ltd |[ORG, ORG, ORG] |
|Cassandra Goodman MD |[PERSON] |
|Solis Ltd LLC |[ORG] |
|Laurie Foster |[PERSON] |
|Lane-Vasquez Group |[ORG] |
|Matthew Wright |[PERSON] |
|Scott, Pugh and Rodriguez and Sons|[PERSON, PERSON, PERSON, PERSON]|
|Tina Cooke |[PERSON] |
|Watkins, Blake and Foster Ltd |[ORG] |
|Charles Reyes |[PERSON] |
|Cooper, Norris and Roberts PLC |[ORG] |
|Michael Tate |[PERSON] |
|Powell, Lawson and Perez and Sons |[PERSON, PERSON, PERSON, PERSON]|
|James Wolf PhD |[PERSON] |
|Greer-Swanson PLC |[ORG] |
|Nicholas Hale |[PERSON] |
+----------------------------------+--------------------------------+

My question is about "module 'textacy' has no attribute 'Doc'"

I can't get past the error module 'textacy' has no attribute 'Doc'.
I am trying to extract verb phrases with textacy/spaCy, but there is no such attribute. Please help me: how can I extract verb phrases or adjective phrases using spaCy? I want to do full shallow parsing.
def extract_named_nouns(row_series):
    """Combine nouns and non-numerical entities.

    Keyword arguments:
    row_series -- a Pandas Series object
    """
    ents = set()
    idxs = set()
    # remove duplicates and merge two lists together
    for noun_tuple in row_series['nouns']:
        for named_ents_tuple in row_series['named_ents']:
            if noun_tuple[1] == named_ents_tuple[1]:
                idxs.add(noun_tuple[1])
                ents.add(named_ents_tuple)
        if noun_tuple[1] not in idxs:
            ents.add(noun_tuple)
    return sorted(list(ents), key=lambda x: x[1])

def add_named_nouns(df):
    """Create new column in data frame with nouns and named ents.

    Keyword arguments:
    df -- a dataframe object
    """
    df['named_nouns'] = df.apply(extract_named_nouns, axis=1)
from __future__ import unicode_literals
import spacy, en_core_web_sm
import textacy
from textacy import io

# using spacy for nlp
nlp = en_core_web_sm.load()
sentence = 'The author is writing a new book.'
pattern = r'<VERB>?<ADV>*<VERB>+'
doc = textacy.Doc.load(sentence, metadata=metadata, lang='en_core_web_sm')
# doc = textacy.corpus.Corpus(sentence, lang='en_core_web_sm')
lists = textacy.extract.pos_regex_matches(doc, pattern)
for list in lists:
    print(list.text)
module 'textacy' has no attribute 'Doc'
Try following the examples here: https://chartbeat-labs.github.io/textacy/getting_started/quickstart.html#make-a-doc
It should be as simple as:
doc = textacy.make_spacy_doc("The author is writing a new book.", lang='en_core_web_sm')
You might look into just using spaCy (without textacy) with its built-in Matcher instead (https://spacy.io/usage/rule-based-matching).
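For example, a rough token-pattern equivalent of the <VERB>?<ADV>*<VERB>+ regex (a sketch assuming spaCy v3's Matcher.add signature; note that newer models may tag auxiliaries like "is" as AUX rather than VERB):
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# token-level analogue of <VERB>?<ADV>*<VERB>+
verb_phrase = [
    {"POS": "VERB", "OP": "?"},
    {"POS": "ADV", "OP": "*"},
    {"POS": "VERB", "OP": "+"},
]
matcher.add("VERB_PHRASE", [verb_phrase])

doc = nlp("The author is writing a new book.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)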
spacy_lang = textacy.load_spacy_lang("en_core_web_sm")
docx_textacy = spacy_lang(sentence)

POS Tags to Wordnet in Pandas Dataframe

I am using NLTK on a dataset stored as a pandas dataframe. All the raw text processing procedures worked fine until I tried to convert the Treebank POS tags to WordNet POS tags. This is the code that worked fine for me.
import pandas as pd
import string
import nltk
from nltk import WordPunctTokenizer, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn, stopwords

# Example dataframe
df = pd.DataFrame([[2, "I am new at programming."],
                   [7, "Leaves are falling from the tree."],
                   [4, "Sophia has been studying since this morning."]], columns=['ID', 'Text'])

# Tokenize text
tokenizer = nltk.WordPunctTokenizer()
df["Tokens"] = df["Text"].str.lower().apply(tokenizer.tokenize)

# Remove punctuation
pattern = string.punctuation
print(pattern)

def remove_punctuation(tokens):
    filtered = [word for word in tokens if word not in pattern]
    return filtered

df["Tokens"] = df["Tokens"].apply(remove_punctuation)

# Remove stopwords
stopwords = stopwords.words('english')

def remove_stopwords(tokens):
    filtered_words = [word for word in tokens if word not in stopwords]
    return filtered_words

df["Tokens"] = df["Tokens"].apply(remove_stopwords)
The following lines of code did not work, and I got this error:
ValueError: too many values to unpack (expected 2)
def wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wn.ADJ
    elif pos_tag.startswith('V'):
        return wn.VERB
    elif pos_tag.startswith('N'):
        return wn.NOUN
    elif pos_tag.startswith('R'):
        return wn.ADV
    else:
        return None

def wordnet(tokens):
    pos_tokens = [nltk.pos_tag(token) for token in tokens]
    pos_tokens = [(word, wordnet_pos(pos_tag)) for (word, pos_tag) in pos_tokens]
    return pos_tokens

df["Wordnet"] = df["Tokens"].apply(wordnet)
This is what I had hoped to achieve: to create df["Wordnet"] with the WordNet POS tags.
print(df["Wordnet"])
0 [(new, a), (programming, n)]
1 [(leaves, n), (falling, v), (tree, n)]
2 [(sophia, n), (studying, v), (since, n), (...
Name: Wordnet, dtype: object
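A likely cause (a sketch, not a confirmed fix): nltk.pos_tag is being called on each token string individually, so NLTK ends up tagging single characters and the (word, pos_tag) unpacking fails. Tagging the whole token list once per row, reusing the wordnet_pos helper above, should give the expected result:
def wordnet(tokens):
    # tag the full token list once, then map Treebank tags to WordNet tags
    pos_tokens = nltk.pos_tag(tokens)
    return [(word, wordnet_pos(tag)) for (word, tag) in pos_tokens]

df["Wordnet"] = df["Tokens"].apply(wordnet)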