"module 'textacy' has no attribute 'Doc'" - spacy

I get the error module 'textacy' has no attribute 'Doc'.
I am trying to extract verb phrases with spaCy, but there is no such attribute. Please help me: how can I extract verb phrases or adjective phrases using spaCy? I want to do full shallow parsing.
def extract_named_nouns(row_series):
    """Combine nouns and non-numerical entities.
    Keyword arguments:
    row_series -- a Pandas Series object
    """
    ents = set()
    idxs = set()
    # remove duplicates and merge two lists together
    for noun_tuple in row_series['nouns']:
        for named_ents_tuple in row_series['named_ents']:
            if noun_tuple[1] == named_ents_tuple[1]:
                idxs.add(noun_tuple[1])
                ents.add(named_ents_tuple)
        if noun_tuple[1] not in idxs:
            ents.add(noun_tuple)
    return sorted(list(ents), key=lambda x: x[1])

def add_named_nouns(df):
    """Create new column in data frame with nouns and named ents.
    Keyword arguments:
    df -- a dataframe object
    """
    df['named_nouns'] = df.apply(extract_named_nouns, axis=1)
from __future__ import unicode_literals
import spacy, en_core_web_sm
import textacy
from textacy import io

# using spacy for nlp
nlp = en_core_web_sm.load()
sentence = 'The author is writing a new book.'
pattern = r'<VERB>?<ADV>*<VERB>+'
doc = textacy.Doc.load(sentence, metadata=metadata, lang='en_core_web_sm')
# doc = textacy.corpus.Corpus(sentence, lang='en_core_web_sm')
lists = textacy.extract.pos_regex_matches(doc, pattern)
for list in lists:
    print(list.text)
module 'textacy' has no attribute 'Doc'

Try following the examples here: https://chartbeat-labs.github.io/textacy/getting_started/quickstart.html#make-a-doc
It should be as simple as:
doc = textacy.make_spacy_doc("The author is writing a new book.", lang='en_core_web_sm')
You might look into just using spacy (without textacy) with its built-in Matcher instead (https://spacy.io/usage/rule-based-matching).
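For example, a rough sketch of the same <VERB>?<ADV>*<VERB>+ pattern written for spaCy's Matcher might look like this (assuming spaCy v3; on v2 the call would be matcher.add("VERB_PHRASE", None, pattern)):

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# optional VERB, any number of ADVs, one or more VERBs
pattern = [
    {"POS": "VERB", "OP": "?"},
    {"POS": "ADV", "OP": "*"},
    {"POS": "VERB", "OP": "+"},
]
matcher.add("VERB_PHRASE", [pattern])

doc = nlp("The author is writing a new book.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)

Note that recent models tag auxiliaries such as "is" as AUX rather than VERB, so the matched spans may differ slightly from what the textacy POS regex used to return.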

spacy_lang = textacy.load_spacy_lang("en_core_web_sm")
docx_textacy = spacy_lang(sentence)

Related

How to merge same consecutive entity types using Spacy

This is a sample example that uses EntityRuler to create patterns, but I want to merge consecutive entities of the same type into one entity and token.
import spacy
from spacy.pipeline import EntityRuler
from spacy.util import filter_spans

nlp = spacy.load("en_core_web_sm")

ent_list_sample = ["brain", "ischimia", "heart failufe", "parenchyma"]
print("Adding patterns to EntityRuler:\n-----------")
patterns = []
for concept in ent_list_sample:
    doc = nlp.make_doc(concept)
    if len(doc) > 1:
        patterns.append({"label": "SCI", "pattern": [{"LOWER": term.text.lower()} for term in doc]})
    else:
        patterns.append({"label": "SCI", "pattern": doc.text.lower()})

ruler = EntityRuler(nlp)
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)

doc = nlp("It has a brain and also might have brain parenchyma ")
print("Entities:")
print(doc.ents)
output: (brain, brain, parenchyma)
expected: (brain, brain parenchyma)
PS: how can we reach the expected output without adding an extra pattern for "brain parenchyma"?
import spacy
from spacy.language import Language
from spacy.tokens import Span
from spacy.pipeline import EntityRuler
from spacy.util import filter_spans
from spacy.pipeline import merge_entities

nlp = spacy.load("en_core_web_sm")

ent_list_sample = ['algorithm', 'data', 'engineering', 'software']
patterns = []
for concept in ent_list_sample:
    doc = nlp.make_doc(concept)
    if len(doc) > 1:
        patterns.append({"label": "SCI", "pattern": [{"LOWER": term.text.lower()} for term in doc]})
    else:
        patterns.append({"label": "SCI", "pattern": doc.text.lower()})

ent_list_sample1 = ["brain", "ischimia", "heart failufe", "parenchyma"]
patterns1 = []
for concept in ent_list_sample1:
    doc = nlp.make_doc(concept)
    if len(doc) > 1:
        patterns1.append({"label": "HE", "pattern": [{"LOWER": term.text.lower()} for term in doc]})
    else:
        patterns1.append({"label": "HE", "pattern": doc.text.lower()})

ruler = EntityRuler(nlp)
ruler.add_patterns(patterns + patterns1)
nlp.add_pipe(ruler, before="ner")

class EntityRetokenizeComponent:
    def __init__(self, nlp):
        pass

    def __call__(self, doc):
        new_ents = []
        for ent in doc.ents:
            # if the previous token carries the same entity label, extend the span
            if ent.label_ == doc[ent.start - 1].ent_type_ and ent.start != 0:
                new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
                new_ents.append(new_ent)
            else:
                new_ents.append(ent)
        doc.ents = filter_spans(new_ents + list(doc.ents))
        return doc

retokenizer = EntityRetokenizeComponent(nlp)
nlp.add_pipe(retokenizer, name='merge_phrases', last=True)
nlp.add_pipe(merge_entities, last=True)
nlp.pipe_names

doc = nlp("I love Ann is good as well data software is good for brain parenchyma and Apple is good company")
print([(ent.text, ent.label_) for ent in doc.ents])
This gave me the desired output:
[('Ann', 'PERSON'), ('data software', 'SCI'), ('brain parenchyma', 'HE'), ('Apple', 'ORG')]

Remove stopwords in Tensorflow extended

I have to preprocess NLP data, so I have to remove the stopwords (from the nltk library) from a TensorFlow dataset. I tried many things like this:
import tensorflow as tf
import tensorflow_text as text

docs = tf.data.Dataset.from_tensor_slices([['Never tell me the odds.'], ["It's a trap!"]])
tokenizer = text.WhitespaceTokenizer()
tokenized_docs = docs.map(lambda x: tokenizer.tokenize(x))
data = tokenized_docs.filter(lambda x: x not in stop_words)
or this:
tokens = docs.map(lambda x: tokenizer.tokenize(x))
data = tokens.filter(lambda x: tf.strings.strip(x).ref() not in stopwords)
But it didn't work. The first snippet gives an error like: RaggedTensor is unhashable.
From what I can tell, TensorFlow supports basic string normalization (lowercasing + punctuation stripping) via a custom standardize callback, but there doesn't appear to be built-in support for more advanced options like removing stop words, short of doing it yourself.
It's probably easier to just do the standardization beforehand, outside of TensorFlow, and then pass the result on.
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

def parse_text(text):
    print(f'Input: {text}')
    text = re.sub("[^a-zA-Z]", ' ', text)
    print(f'Remove punctuation and numbers: {text}')
    text = text.lower().split()
    print(f'Lowercase and split: {text}')
    swords = set(stopwords.words("english"))
    text = [w for w in text if w not in swords]
    print(f'Remove stop words: {text}')
    text = " ".join(text)
    print(f'Final: {text}')
    return text

list1 = [["NEver tell me the odds."], ["It's a trap!"]]
for sublist in list1:
    for i in range(len(sublist)):
        sublist[i] = parse_text(sublist[i])

print(list1)
# [['never tell odds'], ['trap']]
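As a side note, the standardize callback mentioned above accepts a custom callable, so the basic normalization can be tweaked inside a Keras TextVectorization layer. A minimal sketch, assuming TF 2.x where the layer is available as tf.keras.layers.TextVectorization:

import re
import string
import tensorflow as tf

def custom_standardize(text):
    # basic normalization only: lowercase, then strip punctuation
    text = tf.strings.lower(text)
    return tf.strings.regex_replace(text, '[%s]' % re.escape(string.punctuation), '')

vectorizer = tf.keras.layers.TextVectorization(standardize=custom_standardize)
vectorizer.adapt(["Never tell me the odds.", "It's a trap!"])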
You can use this to remove stopwords when using TFX:
from nltk.corpus import stopwords
outputs['review'] = tf.strings.regex_replace(inputs['review'], r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*',"")
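For context, that line is meant to run inside a tf.Transform preprocessing_fn; a minimal sketch, assuming a feature named 'review' as in the snippet above:

import tensorflow as tf
from nltk.corpus import stopwords

# regex matching any English stop word as a whole word
STOPWORD_PATTERN = r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*'

def preprocessing_fn(inputs):
    # tf.Transform callback used by the TFX Transform component
    outputs = dict(inputs)
    outputs['review'] = tf.strings.regex_replace(inputs['review'], STOPWORD_PATTERN, "")
    return outputs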

Pandas UDF (PySpark) - Incorrect type Error

I'm trying entity extraction with spaCy and Pandas UDF (PySpark) but I get an error.
Using a UDF works without errors but is slow. What am I doing wrong?
Loading the model inside the function every time is to avoid the load error: Can't find model 'en_core_web_lg'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.
Working UDF:
def __get_entities(x):
    global nlp
    nlp = spacy.load("en_core_web_lg")
    ents = []
    doc = nlp(x)
    for ent in doc.ents:
        if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            ents.append(ent.label_)
    return ents

get_entities_udf = F.udf(__get_entities, T.ArrayType(T.StringType()))
Pandas UDF with error:
def __get_entities(x):
    global nlp
    nlp = spacy.load("en_core_web_lg")
    ents = []
    doc = nlp(x)
    for ent in doc.ents:
        if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            ents.append(ent.label_)
    return pd.Series(ents)

get_entities_udf = F.pandas_udf(lambda x: __get_entities(x), "array<string>", F.PandasUDFType.SCALAR)
Error message:
TypeError: Argument 'string' has incorrect type (expected str, got Series)
Sample Spark DataFrame:
df = spark.createDataFrame([
    ['John Doe'],
    ['Jane Doe'],
    ['Microsoft Corporation'],
    ['Apple Inc.'],
]).toDF("name")
New column:
df_new = df.withColumn('entity',get_entities_udf('name'))
You need to treat the input as a pd.Series instead of a single value.
I was able to get it working by refactoring the code a bit. Notice the x.apply call, which is pandas-specific and applies the function to each element of the pd.Series.
from pyspark.sql.functions import pandas_udf, PandasUDFType

def entities(x):
    global nlp
    import spacy
    nlp = spacy.load("en_core_web_lg")
    ents = []
    doc = nlp(x)
    for ent in doc.ents:
        if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            ents.append(ent.label_)
    return ents

def __get_entities(x):
    return x.apply(entities)

get_entities_udf = pandas_udf(lambda x: __get_entities(x), "array<string>", PandasUDFType.SCALAR)

df_new = df.withColumn('entity', get_entities_udf('name'))
df_new.show()
+--------------------+--------+
| name| entity|
+--------------------+--------+
| John Doe|[PERSON]|
| Jane Doe|[PERSON]|
|Microsoft Corpora...| [ORG]|
| Apple Inc.| [ORG]|
+--------------------+--------+
I'm using: pyspark 3.1.1 and python 3.7
The answer above didn't work for me, and I spent quite some time making things work, so I thought I'd share the solution I came up with.
Setting things up
Creating a sample of 16 random person and company names:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.functions import pandas_udf, PandasUDFType
from faker import Faker
import spacy

spark = SparkSession.builder.appName("pyspark_sandbox").getOrCreate()

names = []
fake = Faker()
for _ in range(8):
    names.append(f"{fake.company()} {fake.company_suffix()}")
    names.append(fake.name())

df = spark.createDataFrame(names, StringType())
As it is
First, checking the currently proposed solution. I'm just adding a print statement upon loading the spaCy model to see how many times we load it.
# printing a msg each time we load the model
def load_spacy_model():
    print("Loading spacy model...")
    return spacy.load("en_core_web_sm")

def entities(x):
    global nlp
    import spacy
    nlp = load_spacy_model()
    ents = []
    doc = nlp(x)
    for ent in doc.ents:
        if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            ents.append(ent.label_)
    return ents

def __get_entities(x):
    return x.apply(entities)

get_entities_udf = pandas_udf(lambda x: __get_entities(x), "array<string>", PandasUDFType.SCALAR)

df_new = df.withColumn('entity', get_entities_udf('value'))
df_new.show()
We can then see that the model is loaded 16 times, once for every single entry we process. Not what I want.
Batch processing
Rewriting it using the decorator introduced in Spark 3.0+, which relies on type hints (Python 3.6+). The UDF then uses nlp.pipe() to batch-process the entire pd.Series.
# printing a msg each time we load the model
def load_spacy_model():
    print("Loading spacy model...")
    return spacy.load("en_core_web_sm")

# decorator indicating that this function is a pandas_udf
# and that it's gonna process a list of strings
@pandas_udf(ArrayType(StringType()))
# function receiving a pd.Series and returning a pd.Series
def entities(list_of_text: pd.Series) -> pd.Series:
    global nlp
    nlp = load_spacy_model()
    docs = nlp.pipe(list_of_text)
    # retrieving the str representation of entity label
    # as we are limited in the types of obj
    # we can return from a pandas_udf
    # we couldn't return a Span obj for example
    ents = [
        [ent.label_ for ent in doc.ents]
        for doc in docs
    ]
    return pd.Series(ents)

df_new = df.withColumn('entity', entities('value'))
df_new.show()
In my case the model was loaded 4 times; that's better. It's loaded each time a Python worker is created to process a batch. So the number will depend on how many cores Spark is using but, more critically in my case, on how partitioned our data is. So it's not yet optimal.
Broadcasting the nlp object
# printing a msg each time we load the model
def load_spacy_model():
    print("Loading spacy model...")
    return spacy.load("en_core_web_sm")

@pandas_udf(ArrayType(StringType()))
def entities(list_of_text: pd.Series) -> pd.Series:
    nlp = broadcasted_nlp.value
    docs = nlp.pipe(list_of_text)
    # retrieving the str representation of entity label
    # as we are limited in the types of obj
    # we can return from a pandas_udf
    # we couldn't return a Span obj for example
    ents = [
        [ent.label_ for ent in doc.ents]
        for doc in docs
    ]
    return pd.Series(ents)

broadcasted_nlp = spark.sparkContext.broadcast(load_spacy_model())

df_new = df.withColumn('entity', entities('value'))
df_new.show()
Now the model is loaded only once and then broadcast to every Python worker that gets spawned.
The complete Code
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.functions import pandas_udf, PandasUDFType
from faker import Faker
import spacy

spark = SparkSession.builder.appName("pyspark_sandbox").getOrCreate()

# creating our set of fake person and company names
names = []
fake = Faker()
for _ in range(8):
    names.append(f"{fake.company()} {fake.company_suffix()}")
    names.append(fake.name())

df = spark.createDataFrame(names, StringType())

# printing a msg each time we load the model
def load_spacy_model():
    print("Loading spacy model...")
    return spacy.load("en_core_web_sm")

# decorator indicating that this function is a pandas_udf
# and that it's gonna process a list of strings
@pandas_udf(ArrayType(StringType()))
# function receiving a pd.Series and returning a pd.Series
def entities(list_of_text: pd.Series) -> pd.Series:
    # retrieving the shared nlp object
    nlp = broadcasted_nlp.value
    # batch processing our list of text
    docs = nlp.pipe(list_of_text)
    # retrieving the str representation of entity label
    # as we are limited in the types of obj
    # we can return from a pandas_udf
    # we couldn't return a Span obj for example
    ents = [
        [ent.label_ for ent in doc.ents]
        for doc in docs
    ]
    return pd.Series(ents)

# we load the spacy model and broadcast it
broadcasted_nlp = spark.sparkContext.broadcast(load_spacy_model())

df_new = df.withColumn('entity', entities('value'))
df_new.show(truncate=False)
Result
+----------------------------------+--------------------------------+
|value |entity |
+----------------------------------+--------------------------------+
|Ferguson, Price and Green Ltd |[ORG, ORG, ORG] |
|Cassandra Goodman MD |[PERSON] |
|Solis Ltd LLC |[ORG] |
|Laurie Foster |[PERSON] |
|Lane-Vasquez Group |[ORG] |
|Matthew Wright |[PERSON] |
|Scott, Pugh and Rodriguez and Sons|[PERSON, PERSON, PERSON, PERSON]|
|Tina Cooke |[PERSON] |
|Watkins, Blake and Foster Ltd |[ORG] |
|Charles Reyes |[PERSON] |
|Cooper, Norris and Roberts PLC |[ORG] |
|Michael Tate |[PERSON] |
|Powell, Lawson and Perez and Sons |[PERSON, PERSON, PERSON, PERSON]|
|James Wolf PhD |[PERSON] |
|Greer-Swanson PLC |[ORG] |
|Nicholas Hale |[PERSON] |
+----------------------------------+--------------------------------+

How to quickly convert a pandas dataframe to a list of tuples

I have a pandas dataframe as follows.
thi 0.969378
text 0.969378
is 0.969378
anoth 0.699030
your 0.497120
first 0.497120
book 0.497120
third 0.445149
the 0.445149
for 0.445149
analysi 0.445149
I want to convert it to a list of tuples as follows.
[["this", 0.969378], ["text", 0.969378], ..., ["analysi", 0.445149]]
My code is as follows.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer

def tokenize(text):
    tokens = word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems

# your corpus
text = ["This is your first text book", "This is the third text for analysis", "This is another text"]
# word tokenize and stem
text = [" ".join(tokenize(txt.lower())) for txt in text]

vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(text).todense()
# transform the matrix to a pandas df
matrix = pd.DataFrame(matrix, columns=vectorizer.get_feature_names())
# sum over each document (axis=0)
top_words = matrix.sum(axis=0).sort_values(ascending=False)
print(top_words)
I tried the following two options.
list(zip(*map(top_words.get, top_words)))
I got the error as TypeError: cannot do label indexing on <class 'pandas.core.indexes.base.Index'> with these indexers [0.9693779251346359] of <class 'float'>
list(top_words.itertuples(index=True))
I got the error as AttributeError: 'Series' object has no attribute 'itertuples'.
Please let me know a quick way of doing this in pandas.
I am happy to provide more details if needed.
Use zip with the index, then map the tuples to lists:
a = list(map(list,zip(top_words.index,top_words)))
Or convert the index to a column, convert to a numpy array and then to lists:
a = top_words.reset_index().to_numpy().tolist()
print (a)
[['thi', 0.9693780000000001], ['text', 0.9693780000000001],
['is', 0.9693780000000001], ['anoth', 0.69903],
['your', 0.49712], ['first', 0.49712], ['book', 0.49712],
['third', 0.44514899999999996], ['the', 0.44514899999999996],
['for', 0.44514899999999996], ['analysi', 0.44514899999999996]]
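If you actually need a list of tuples rather than lists (as in the question title), the Series can produce that directly; a small sketch:

# (index, value) pairs as real tuples
a = list(top_words.items())
# [('thi', 0.969378...), ('text', 0.969378...), ..., ('analysi', 0.445149...)]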

Lemmatize tokenised column in pandas

I'm trying to lemmatize the tokenized column comments_tokenized.
I do:
import nltk
from nltk.stem import WordNetLemmatizer

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in df1["comments_tokenized"]]

df1['comments_lemmatized'] = df1["comments_tokenized"].apply(lemmatize_text)
but get
TypeError: unhashable type: 'list'
What can I do to lemmatize a column containing lists of tokens (bags of words)?
And also, how can I avoid the tokenization problem that splits [don't] into [do, n't]?
You were close with your function! Since you are using apply on the Series, you don't need to refer to the column inside the function. You are also not using the input text at all in your function. So change
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in df1["comments_tokenized"]]
to
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]  # notice the use of text
An example:
df = pd.DataFrame({'A': [["cats", "cacti", "geese", "rocks"]]})

                             A
0  [cats, cacti, geese, rocks]

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]

df['A'].apply(lemmatize_text)

0    [cat, cactus, goose, rock]