Where does the "compound" DEP come from? - spacy

In the following SpaCy code:
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp("It was yesterday morning.")
doc[2].dep_
# 'compound'
doc[2].dep
# 7037928807040764755
doc[3].dep_
# 'npadmod'
doc[3].dep
# 428
I can trace npadmod back to the enum in symbols.pxd.
But, where is 'compound' defined (both string and ID) ?
[SOLVED]
Some DEPs come from the symbols enum; others are added dynamically by the language model itself. For the latter, the value is the hash of the string name, and both are stored in a StringStore.
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp("It was yesterday morning.")
doc[2].dep_
# 'compound'
doc[2].dep
# 7037928807040764755
nlp.vocab.strings[doc[2].dep]
# 'compound'
nlp.vocab.strings['compound']
# 7037928807040764755

Related

Tensor flow and tflearn Chatbot keeps on getting high probability even when user input is wrong

I coded a simple AI chatbot with TensorFlow and tflearn and it runs just fine, but there is an issue when the user inputs the wrong thing: the bot is supposed to say it doesn't understand if the prediction confidence is less than 70%, but it always scores above that, even if the user types gibberish like "rjrigrejfr". The bot assumes they're greeting it. The patterns it's supposed to study in the JSON are "patterns": ["Hi", "How are you", "Wassup", "Hello", "Good day", "Waddup", "Yo"]. I can share the JSON file if needed; it's short. Anyway, this is the Python code:
import numpy as np
import nltk
import tensorflow
import tflearn
import random
import json
import pickle
# Some extra configuration:
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()
nltk.download('punkt')
# Load the data from the json file into a variable.
with open("intents.json") as file:
data = json.load(file)
# If we already have saved data, we do not need to retrain the model and waste time (could develop into an issue in more complex programs. Save in pickle. )
try:
with open("data.pickle", "rb") as f: # rb stands for bytes.
words, labels, training, output = pickle.load(f)
# --- Pre-training data preparation ---
except:
words = []
docsx = [] # Stores patterns
docsy = [] # Stores intents
labels = [] # All the specific tag values such as greeting, contact, etc.
for intent in data["intents"]:
for pattern in intent["patterns"]:
w = nltk.word_tokenize(pattern) # nltk function that splits the sentences inside intent into words list.
words.extend(w) # Add the tokenized list to words list.
docsx.append(w)
docsy.append(intent["tag"]) # append the classification of the sentence
if intent["tag"] not in labels:
labels.append(intent["tag"])
words = [stemmer.stem(w.lower()) for w in words if w not in ".?!"] # Stemming the words to remove unnecessary elements leaving their root. Convert all to lowercase.
words = sorted(list(set(words))) # Set ensures no duplicate elements then we convert back to list and sort it.
labels = sorted(labels)
training = []
output = []
out_empty = [0 for i in range(len(labels))] # Gives a list of 0 ints based on # of tags. This is useful later in the program when binerizing.
# One hot encoding the intent categories. Need to one-hot code the data which improves the efficiency of the ML to "binerize" the data.
# In this case, we have a list of 0s and 1s if the word appears it is assigned a 1 else a 0.
for x, doc in enumerate(docsx):
bag = [] # Bag of words or the one-hot coded data for the ML.
docx_word_stemmed = [stemmer.stem(word) for word in doc] # Stemming the data in docx.
# Now adding and transforming data into the one-hot coded list/bag of words data.
for i in words:
if i in docx_word_stemmed: # Checking against stemmed words:
# Word exists
bag.append(1)
else:
bag.append(0)
output_row = out_empty[:] # Copying out_empty
# Going through the labels list using .index() and for the occurance of docx value in docy, assign binary 1.
output_row[labels.index(docsy[x])] = 1
training.append(bag)
output.append(output_row)
# Required to use numpy arrays for use in tflearn. It is also faster.
training = np.array(training)
output = np.array(output)
# Saving the data so we do not need to do the data configuration every time.
with open("data.pickle", "wb") as f:
pickle.dump((words, labels, training, output), f)
# Build the network before trying to load weights: model.load() is a method
# of an existing DNN object, so calling it before `model` is defined raised a
# NameError that the bare `except` silently turned into "retrain every run".
tensorflow.compat.v1.reset_default_graph()
net = tflearn.input_data(shape=[None, len(training[0])])
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, len(output[0]), activation='softmax')
net = tflearn.regression(net)
model = tflearn.DNN(net)
try:
    # Reuse previously saved weights when they exist.
    model.load('model.tflearn')
except Exception:
    # No usable checkpoint: train from scratch and save the result.
    model.fit(training, output, n_epoch=1000, batch_size=8, show_metric=True)
    model.save("model.tflearn")
def bagofwords(sentence, words):
    """Encode *sentence* as a bag-of-words vector over the vocabulary *words*.

    The sentence is tokenized with nltk, and each token is lowercased and
    stemmed with the module-level ``stemmer``. The returned numpy array has
    one entry per vocabulary word: 1 if that word equals one of the stemmed
    tokens, otherwise 0.
    """
    tokens = nltk.word_tokenize(sentence)
    # A set gives constant-time membership tests instead of rescanning the
    # whole vocabulary once per token.
    stems = {stemmer.stem(token.lower()) for token in tokens}
    return np.array([1 if word in stems else 0 for word in words])
def chat():
    """Run the interactive console loop until the user types "quit".

    Each input is encoded with ``bagofwords`` and classified by ``model``.
    A random response for the predicted tag is printed when the top
    probability exceeds 0.7; otherwise a fallback message is printed.
    """
    print("Hello there! I'm the SRO AI Virtual Assistant. How can I help you?")
    while True:
        user_input = input("Type here:")
        if user_input == "quit":
            break
        bag = bagofwords(user_input, words)
        # An all-zero bag means no token of the input is in the training
        # vocabulary. The prediction would then not depend on the input at
        # all and can still score above the threshold, so reject it here.
        if not bag.any():
            print("I don't quite understand")
            continue
        result = model.predict([bag])[0]
        best_result = np.argmax(result)  # Index of the most probable tag.
        tag = labels[best_result]
        print(result[best_result])
        if result[best_result] > 0.7:
            # Look up the responses of the predicted intent in the JSON data.
            for tg in data["intents"]:
                if tg['tag'] == tag:
                    responses = tg['responses']
            print(random.choice(responses))
        else:
            print("I don't quite understand")
chat()

how to add multiple custom entity recognition using spacy 3.x

Here is Input.json
[
"How to preorder the iPhone X",
"iPhone X is coming",
"Should I pay $1,000 for the iPhone X?",
"The iPhone 8 reviews are here",
"iPhone 11 vs iPhone 8: What's the difference?",
"I need a new phone! Any tips?",
"Why the iPhone 21",
"iPhone 9 is coming",
"Should I pay $1,000 for the X iPhone?",
"The iPhone 29 reviews are here",
"iPhone 999 vs iPhone 8: What's the difference?",
"I need a new phone! Any tips?",
"Are we talking about phones"
]
Question: why is "coming" not showing in the output? It only shows up if I remove the matcher.add call for GADGET.
The final question is: how can we add multiple patterns in spaCy?
All the detail is given below
import json
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span, DocBin
import random
with open("input.json", encoding="utf8") as f:
TEXTS = json.loads(f.read())
nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# Add patterns to the matcher
pattern1 = ([{"LOWER": "iphone"}, {"LOWER": "x"}])
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]
matcher.add("GADGET", [pattern1, pattern2])
pattern3 = [{"LOWER": "coming"}]
matcher.add("COME", [pattern3])
docs = []
for doc in nlp.pipe(TEXTS):
matches = matcher(doc)
spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]
print (spans)
doc.ents = spans
print(f'spans is: {spans} with match: {matches}')
#print([(ent.text, ent.label_) for ent in doc.ents])
docs.append(doc)
print (docs)
random.shuffle(docs)
train_docs = docs[:len(docs) // 2]
dev_docs = docs[len(docs) // 2:]
# Create and save a collection of training docs
train_docbin = DocBin(docs=train_docs)
train_docbin.to_disk("./train.spacy")
# Create and save a collection of evaluation docs
dev_docbin = DocBin(docs=dev_docs)
dev_docbin.to_disk("./dev.spacy")
#the below given code is triggered for config and training
# For config
#python -m spacy init config ./config.cfg --lang en --pipeline ner
# For training
# python -m spacy train ./config.cfg --output ./output --paths.train train.spacy --paths.dev dev.spacy
# Question: why "coming" is not showing in the output. If I remove matcher.add for GADGET
# then only it shows otherwise not
# The final question is how can we add multiple patterns in SpaCy
nlp = spacy.load("output/model-best")
doc = nlp("iPhone 55 vs iPhone 8: What's the difference? coming")
print(doc.ents)
for ent in doc.ents:
# Print the entity text and its label
print(ent.text, ent.label_)
End of code
The complete detail has been added

Pandas UDF (PySpark) - Incorrect type Error

I'm trying entity extraction with spaCy and Pandas UDF (PySpark) but I get an error.
Using a plain UDF works without errors but is slow. What am I doing wrong?
The model is loaded on every call to avoid this load error: Can't find model 'en_core_web_lg'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.
Working UDF:
def __get_entities(x):
    """Return the labels of the PERSON and ORG entities spaCy finds in *x*.

    *x* is a single string. One label is appended per matching entity, so
    the list may contain repeated labels.
    """
    global nlp
    # The model is (re)loaded inside the function on every call; the text
    # above explains this avoids a "Can't find model" load error.
    nlp = spacy.load("en_core_web_lg")
    ents = []
    doc = nlp(x)
    for ent in doc.ents:
        if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            ents.append(ent.label_)
    return ents


get_entities_udf = F.udf(__get_entities, T.ArrayType(T.StringType()))
Pandas UDF with error:
def __get_entities(x):
    """Pandas-UDF attempt: collect PERSON/ORG labels and return a Series.

    NOTE: this version still reproduces the error discussed in this thread.
    A SCALAR pandas UDF passes *x* as a pandas Series, while nlp() expects a
    single str, which raises the TypeError quoted below.
    """
    global nlp
    nlp = spacy.load("en_core_web_lg")
    ents = []
    doc = nlp(x)
    for ent in doc.ents:
        if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
            ents.append(ent.label_)
    return pd.Series(ents)


get_entities_udf = F.pandas_udf(lambda x: __get_entities(x), "array<string>", F.PandasUDFType.SCALAR)
Error message:
TypeError: Argument 'string'has incorrect type (expected str, got series)
Sample Spark DataFrame:
df = spark.createDataFrame([
['John Doe'],
['Jane Doe'],
['Microsoft Corporation'],
['Apple Inc.'],
]).toDF("name",)
New column:
df_new = df.withColumn('entity',get_entities_udf('name'))
You need to treat the input as a pd.Series instead of a single value.
I was able to get it working by refactoring the code a bit. Notice the x.apply call, which is pandas-specific and applies a function to each element of a pd.Series.
def entities(x):
global nlp
import spacy
nlp = spacy.load("en_core_web_lg")
ents=[]
doc = nlp(x)
for ent in doc.ents:
if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
ents.append(ent.label_)
return ents
def __get_entities(x):
return x.apply(entities)
get_entities_udf = pandas_udf(lambda x: __get_entities(x), "array<string>", PandasUDFType.SCALAR)
df_new = df.withColumn('entity',get_entities_udf('name'))
df_new.show()
+--------------------+--------+
| name| entity|
+--------------------+--------+
| John Doe|[PERSON]|
| Jane Doe|[PERSON]|
|Microsoft Corpora...| [ORG]|
| Apple Inc.| [ORG]|
+--------------------+--------+
I'm using: pyspark 3.1.1 and python 3.7
The answer above didn't work for me, and I spent quite some time making things work, so I thought I'd share the solution I came up with.
Setting things up
creating a sample of 16 random person and company names
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.functions import pandas_udf, PandasUDFType
from faker import Faker
import spacy
spark = SparkSession.builder.appName("pyspark_sandbox").getOrCreate()
names = []
fake = Faker()
for _ in range(8):
names.append(f"{fake.company()} {fake.company_suffix()}")
names.append(fake.name())
df = spark.createDataFrame(names, StringType())
As it is
First, checking the solution currently proposed. I'm just adding a print statement when the spaCy model is loaded, to see how many times we load it.
# printing a msg each time we load the model
def load_spacy_model():
print("Loading spacy model...")
return spacy.load("en_core_web_sm")
def entities(x):
global nlp
import spacy
nlp = load_spacy_model()
ents=[]
doc = nlp(x)
for ent in doc.ents:
if ent.label_ == 'PERSON' or ent.label_ == 'ORG':
ents.append(ent.label_)
return ents
def __get_entities(x):
return x.apply(entities)
get_entities_udf = pandas_udf(lambda x: __get_entities(x), "array<string>", PandasUDFType.SCALAR)
df_new = df.withColumn('entity',get_entities_udf('value'))
df_new.show()
We can then see that the model is loaded 16 times, i.e. once for every single entry we process. Not what I want.
Batch processing
Rewriting using the decorator introduced in Spark 3.0+, which relies on type hints (Python 3.6+). Our UDF then uses nlp.pipe() to batch-process the entire pd.Series.
# printing a msg each time we load the model
def load_spacy_model():
print("Loading spacy model...")
return spacy.load("en_core_web_sm")
# Decorator declaring this function as a pandas UDF that returns an array of
# strings for each input row. The function receives a pd.Series and returns
# a pd.Series.
@pandas_udf(ArrayType(StringType()))
def entities(list_of_text: pd.Series) -> pd.Series:
    """Return, for each text in the batch, the list of its entity labels."""
    global nlp
    # Loaded once per call, i.e. once per batch handed to this UDF.
    nlp = load_spacy_model()
    docs = nlp.pipe(list_of_text)
    # Keep only the string label of each entity: a pandas UDF is limited in
    # the object types it can return, so a Span object, for example, could
    # not be returned.
    ents = [
        [ent.label_ for ent in doc.ents]
        for doc in docs
    ]
    return pd.Series(ents)
df_new = df.withColumn('entity',entities('value'))
df_new.show()
In my case the model was loaded 4 times, which is better. It is loaded each time a Python worker is created to process a batch, so the number depends on how many cores Spark is using and, more critically in my case, on how the data is partitioned. So it is not yet optimal.
broadcasting the nlp object
# printing a msg each time we load the model
def load_spacy_model():
print("Loading spacy model...")
return spacy.load("en_core_web_sm")
@pandas_udf(ArrayType(StringType()))
def entities(list_of_text: pd.Series) -> pd.Series:
    """Return, for each text in the batch, the list of its entity labels.

    The spaCy pipeline is read from the ``boardcasted_nlp`` broadcast
    variable instead of being loaded inside the UDF.
    """
    nlp = boardcasted_nlp.value
    docs = nlp.pipe(list_of_text)
    # Keep only the string label of each entity: a pandas UDF is limited in
    # the object types it can return, so a Span object, for example, could
    # not be returned.
    ents = [
        [ent.label_ for ent in doc.ents]
        for doc in docs
    ]
    return pd.Series(ents)
boardcasted_nlp = spark.sparkContext.broadcast(load_spacy_model())
df_new = df.withColumn('entity',entities('value'))
df_new.show()
Now the model is loaded only once and then broadcast to every Python worker that gets spawned.
The complete Code
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.functions import pandas_udf, PandasUDFType
from faker import Faker
import spacy
spark = SparkSession.builder.appName("pyspark_sandbox").getOrCreate()
# creating our set of fake person and company names
names = []
fake = Faker()
for _ in range(8):
names.append(f"{fake.company()} {fake.company_suffix()}")
names.append(fake.name())
df = spark.createDataFrame(names, StringType())
# printing a msg each time we load the model
def load_spacy_model():
print("Loading spacy model...")
return spacy.load("en_core_web_sm")
# Decorator declaring this function as a pandas UDF that returns an array of
# strings for each input row. The function receives a pd.Series and returns
# a pd.Series.
@pandas_udf(ArrayType(StringType()))
def entities(list_of_text: pd.Series) -> pd.Series:
    """Return, for each text in the batch, the list of its entity labels."""
    # Retrieve the shared nlp object from the broadcast variable.
    nlp = boardcasted_nlp.value
    # Batch-process the whole series of texts.
    docs = nlp.pipe(list_of_text)
    # Keep only the string label of each entity: a pandas UDF is limited in
    # the object types it can return, so a Span object, for example, could
    # not be returned.
    ents = [
        [ent.label_ for ent in doc.ents]
        for doc in docs
    ]
    return pd.Series(ents)
# we load the spacy model and broadcast it
boardcasted_nlp = spark.sparkContext.broadcast(load_spacy_model())
df_new = df.withColumn('entity',entities('value'))
df_new.show(truncate=False)
Result
+----------------------------------+--------------------------------+
|value |entity |
+----------------------------------+--------------------------------+
|Ferguson, Price and Green Ltd |[ORG, ORG, ORG] |
|Cassandra Goodman MD |[PERSON] |
|Solis Ltd LLC |[ORG] |
|Laurie Foster |[PERSON] |
|Lane-Vasquez Group |[ORG] |
|Matthew Wright |[PERSON] |
|Scott, Pugh and Rodriguez and Sons|[PERSON, PERSON, PERSON, PERSON]|
|Tina Cooke |[PERSON] |
|Watkins, Blake and Foster Ltd |[ORG] |
|Charles Reyes |[PERSON] |
|Cooper, Norris and Roberts PLC |[ORG] |
|Michael Tate |[PERSON] |
|Powell, Lawson and Perez and Sons |[PERSON, PERSON, PERSON, PERSON]|
|James Wolf PhD |[PERSON] |
|Greer-Swanson PLC |[ORG] |
|Nicholas Hale |[PERSON] |
+----------------------------------+--------------------------------+

"Wrong" TF IDF Scores

I have 1000 .txt files and plan to search them for various keywords and calculate their TF-IDF scores. But for some reason the results are > 1. I ran a test with just 2 .txt files: "I am studying nfc" and "You don't need AI". For nfc and AI the TF-IDF should be 0.25, but when I open the .csv it says 1.4054651081081644.
I must admit that I did not choose the most efficient way to write the code. I think the mistake is with the folders, since I originally planned to check the documents by year (annual reports from 2000-2010). But I cancelled those plans and decided to treat all annual reports as a single corpus. I think the folder workaround is still the problem. I placed the 2 .txt files into the folder "-". Is there a way to make it count correctly?
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from pathlib import Path
# root dir
root = '/Users/Tom/PycharmProjects/TextMining/'
#
words_to_find = ['AI', 'nfc']
# tf_idf file writing
wrote_tf_idf_header = False
tf_idf_file_idx = 0
#
vectorizer_tf_idf = TfidfVectorizer(max_df=.80, min_df=1, stop_words='english', use_idf=True, norm=None, vocabulary=words_to_find, ngram_range=(1, 3))
vectorizer_cnt = CountVectorizer(stop_words='english', vocabulary=words_to_find, ngram_range=(1, 3))
#
years = ['-']
year_folders = [root + folder for folder in years]
# remove previous results file
if os.path.isfile('summary.csv'):
os.remove('summary.csv')
if os.path.isfile('tf_idf.csv'):
os.remove('tf_idf.csv')
#process every folder (for every year)
for year_idx, year_folder in enumerate(year_folders):
# get file paths in folder
file_paths = []
for file in Path(year_folder).rglob("*.txt"):
file_paths.append(file)
# count of files for each year
file_cnt = len(file_paths)
# read every file's text as string
docs_per_year = []
words_in_folder = 0
for txt_file in file_paths:
with open(txt_file, encoding='utf-8', errors="replace") as f:
txt_file_as_string = f.read()
words_in_folder += len(txt_file_as_string.split())
docs_per_year.append(txt_file_as_string)
#
tf_idf_documents_as_array = vectorizer_tf_idf.fit_transform(docs_per_year).toarray()
# tf_idf_documents_as_array = vectorizer_tf_idf.fit_transform([' '.join(docs_per_year)]).toarray()
#
cnt_documents_as_array = vectorizer_cnt.fit_transform(docs_per_year).toarray()
#
# Append one summary row per search term for the current folder.
with open('summary.csv', 'a') as f:
    f.write('Index;Term;Count;Df;Idf;Rel. Frequency\n')
    for idx, word in enumerate(words_to_find):
        # Total occurrences of the term across all documents in the folder.
        abs_freq = cnt_documents_as_array[:, idx].sum()
        # Number of documents that contain the term at least once.
        doc_freq = np.count_nonzero(cnt_documents_as_array[:, idx])
        # Write Count before Df so the values line up with the header;
        # previously the two columns were written in swapped order.
        f.write('{};{};{};{};{};{}\n'.format(idx + 1,
                                             word,
                                             abs_freq,
                                             doc_freq,
                                             vectorizer_tf_idf.idf_[idx],
                                             abs_freq / words_in_folder))
    f.write('\n')
with open('tf_idf.csv', 'a') as f:
if not wrote_tf_idf_header:
f.write('{}\n'.format(years[year_idx]))
f.write('Index;Year;File;')
for word in words_to_find:
f.write('{};'.format(word))
f.write('Sum\n')
wrote_tf_idf_header = True
for idx, tf_idfs in enumerate(tf_idf_documents_as_array):
f.write('{};{};{};'.format(tf_idf_file_idx, years[year_idx], file_paths[idx].name))
for word_idx, _ in enumerate(words_to_find):
f.write('{};'.format(tf_idf_documents_as_array[idx][word_idx]))
f.write('{}\n'.format(sum(tf_idf_documents_as_array[idx])))
tf_idf_file_idx += 1
print()
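I think the mistake is that you set norm=None. Without normalisation the vectorizer returns the raw tf-idf products, so values above 1 are expected; set norm to 'l1' or 'l2', as specified in the documentation, to get normalised scores.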

My question is about "module 'textacy' has no attribute 'Doc'"

I get the error: module 'textacy' has no attribute 'Doc'.
I am trying to extract verb phrases with spaCy, but I can't find a library call that does it. Please help me: how can I extract verb phrases or adjective phrases using spaCy? I want to do full shallow parsing.
def extract_named_nouns(row_series):
    """Combine nouns and non-numerical entities.

    Tuples are matched on their element at position 1 (presumably a token
    index -- confirm against the code that builds these columns). A noun
    that shares that value with one or more named entities is replaced by
    those entities; any other noun is kept as is. Named entities that match
    no noun are not included.

    Keyword arguments:
    row_series -- a Pandas Series object (or any mapping) with 'nouns' and
                  'named_ents' entries, each an iterable of hashable tuples

    Returns the merged, de-duplicated tuples as a list sorted by position 1.
    """
    # Group the named entities by their position-1 value once, so each noun
    # needs a single lookup instead of a scan over every entity.
    named_by_key = {}
    for named_ents_tuple in row_series['named_ents']:
        named_by_key.setdefault(named_ents_tuple[1], []).append(named_ents_tuple)

    ents = set()
    for noun_tuple in row_series['nouns']:
        matches = named_by_key.get(noun_tuple[1])
        if matches:
            ents.update(matches)
        else:
            ents.add(noun_tuple)
    return sorted(ents, key=lambda x: x[1])
def add_named_nouns(df):
    """Create new column in data frame with nouns and named ents.

    Applies extract_named_nouns to every row and stores the result in a
    'named_nouns' column. The data frame is modified in place and nothing
    is returned.

    Keyword arguments:
    df -- a dataframe object
    """
    df['named_nouns'] = df.apply(extract_named_nouns, axis=1)
from __future__ import unicode_literals
import spacy,en_core_web_sm
import textacy
from textacy import io
#using spacy for nlp
nlp = en_core_web_sm.load()
sentence = 'The author is writing a new book.'
pattern = r'<VERB>?<ADV>*<VERB>+'
doc = textacy.Doc.load(sentence, metadata=metadata, lang='en_core_web_sm')
# doc = textacy.corpus.Corpus(sentence, lang='en_core_web_sm')
lists = textacy.extract.pos_regex_matches(doc, pattern)
for list in lists:
print(list.text)
module 'textacy' has no attribute 'Doc'
Try following the examples here: https://chartbeat-labs.github.io/textacy/getting_started/quickstart.html#make-a-doc
It should be as simple as:
doc = textacy.make_spacy_doc("The author is writing a new book.", lang='en_core_web_sm')
You might look into just using spacy (without textacy) with its built-in Matcher instead (https://spacy.io/usage/rule-based-matching).
# Load the same spaCy model used elsewhere in this thread ("en_core_web_en"
# is not a spaCy model name), then run it on the sentence to get a Doc.
spacy_lang = textacy.load_spacy_lang("en_core_web_sm")
docx_textacy = spacy_lang(sentence)