spaCy NLP example: can't convert token to lowercase, throwing error - spacy

from collections import defaultdict

item_ratings = defaultdict(list)
for idx, review in data.iterrows():
    doc = nlp(review.text)
    matches = matcher(doc)
    found_items = set(doc[match[1]:match[2]].lower_ for match in matches)
    for item in found_items:
        item_ratings[item].append(review.stars)
Error:
AttributeError: 'spacy.tokens.span.Span' object has no attribute 'lower_'
I need the tokens to be lowercase; how can I do that?

found_items = set(doc[match[1]:match[2]].text.lower() for match in matches)
Like the error says, Span objects don't have lower_. You can use .text, which is just a string, so you can lowercase it.
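To illustrate the difference (a standalone sketch; the model name and example sentence are assumptions, since the question's pipeline isn't shown):

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('I LOVED the Cheese Steak')
span = doc[2:5]  # a Span covering 'the Cheese Steak'

print(span.text.lower())                 # Span.text is a plain str, so .lower() works
print([token.lower_ for token in span])  # individual Tokens do have .lower_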

Related

Remove stopwords in Tensorflow extended

I have to preprocess NLP data, so I have to remove the stop words (from the nltk library) from a TensorFlow dataset. I tried many things like this:
docs = tf.data.Dataset.from_tensor_slices([['Never tell me the odds.'], ["It's a trap!"]])
tokenizer = text.WhitespaceTokenizer()
tokenized_docs = docs.map(lambda x: tokenizer.tokenize(x))
data = tokenized_docs.filter(lambda x: x not in stop_words)
or this:
tokens = docs.map(lambda x: tokenizer.tokenize(x))
data = tokens.filter(lambda x: tf.strings.strip(x).ref() not in stopwords)
But it didn't work. The first snippet throws an error like: RaggedTensor is unhashable.
From what I can tell, TensorFlow supports basic string normalization (lowercasing plus punctuation stripping) via a custom standardization function passed as the standardize callback. There doesn't appear to be support for more advanced options, like removing stop words, without doing it yourself.
It's probably easier to just do the standardization beforehand, outside of TensorFlow, and then pass the result on.
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

def parse_text(text):
    print(f'Input: {text}')
    text = re.sub("[^a-zA-Z]", ' ', text)
    print(f'Remove punctuation and numbers: {text}')
    text = text.lower().split()
    print(f'Lowercase and split: {text}')
    swords = set(stopwords.words("english"))
    text = [w for w in text if w not in swords]
    print(f'Remove stop words: {text}')
    text = " ".join(text)
    print(f'Final: {text}')
    return text

list1 = [["Never tell me the odds."], ["It's a trap!"]]
for sublist in list1:
    for i in range(len(sublist)):
        sublist[i] = parse_text(sublist[i])

print(list1)
# [['never tell odds'], ['trap']]
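To complete the "pass the result on" step, the cleaned strings can then be wrapped in a tf.data pipeline as before (a small sketch reusing list1 from above):

import tensorflow as tf

# Build the dataset from the already-cleaned sentences.
docs = tf.data.Dataset.from_tensor_slices(list1)
for doc in docs:
    print(doc.numpy())  # e.g. [b'never tell odds']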
You can use this to remove stop words when using TFX:
from nltk.corpus import stopwords
outputs['review'] = tf.strings.regex_replace(inputs['review'], r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*',"")
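For context, here is a minimal sketch of where that line might live inside a TFX Transform preprocessing_fn; the 'review' feature name is taken from the snippet above, everything else is an assumption:

import tensorflow as tf
from nltk.corpus import stopwords

def preprocessing_fn(inputs):
    # Hypothetical tf.Transform preprocessing_fn for a TFX pipeline.
    stopword_pattern = r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*'
    outputs = {}
    # Lowercase first so the (lowercase) stop-word pattern actually matches.
    outputs['review'] = tf.strings.regex_replace(
        tf.strings.lower(inputs['review']), stopword_pattern, '')
    return outputs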

How to quickly convert a pandas dataframe to a list of tuples

I have a pandas dataframe as follows.
thi 0.969378
text 0.969378
is 0.969378
anoth 0.699030
your 0.497120
first 0.497120
book 0.497120
third 0.445149
the 0.445149
for 0.445149
analysi 0.445149
I want to convert it to a list of tuples as follows.
[["this", 0.969378], ["text", 0.969378], ..., ["analysi", 0.445149]]
My code is as follows.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
def tokenize(text):
tokens = word_tokenize(text)
stems = []
for item in tokens: stems.append(PorterStemmer().stem(item))
return stems
# your corpus
text = ["This is your first text book", "This is the third text for analysis", "This is another text"]
# word tokenize and stem
text = [" ".join(tokenize(txt.lower())) for txt in text]
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(text).todense()
# transform the matrix to a pandas df
matrix = pd.DataFrame(matrix, columns=vectorizer.get_feature_names())
# sum over each document (axis=0)
top_words = matrix.sum(axis=0).sort_values(ascending=False)
print(top_words)
I tried the following two options.
list(zip(*map(top_words.get, top_words)))
I got this error: TypeError: cannot do label indexing on <class 'pandas.core.indexes.base.Index'> with these indexers [0.9693779251346359] of <class 'float'>
list(top_words.itertuples(index=True))
I got this error: AttributeError: 'Series' object has no attribute 'itertuples'.
Please let me know a quick way of doing this in pandas.
I am happy to provide more details if needed.
Use zip with the index, and map the resulting tuples to lists:
a = list(map(list,zip(top_words.index,top_words)))
Or convert the index to a column, convert to a numpy array, and then to lists:
a = top_words.reset_index().to_numpy().tolist()
print(a)
[['thi', 0.9693780000000001], ['text', 0.9693780000000001],
['is', 0.9693780000000001], ['anoth', 0.69903],
['your', 0.49712], ['first', 0.49712], ['book', 0.49712],
['third', 0.44514899999999996], ['the', 0.44514899999999996],
['for', 0.44514899999999996], ['analysi', 0.44514899999999996]]
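As a side note (not part of the original answer): if genuine tuples are acceptable, a pandas Series can also be turned into (label, value) pairs directly:

# list of (word, score) tuples straight from the Series
pairs = list(top_words.items())
print(pairs[:2])  # [('thi', 0.969378...), ('text', 0.969378...)]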

Cannot replace spaCy lemmatized pronouns (-PRON-) through text

I'm trying to lemmatise a text with spaCy. Since spaCy uses -PRON- as lemma for personal pronouns, I want to keep the original text in all those cases.
Here's the relevant section of my code:
...
fout = open('test.txt', 'w+')
doc = nlp(text)
for word in doc:
    if word.lemma_ == "-PRON-":
        write = word.text
        print(write)
    else:
        write = word.lemma_
    fout.write(str(write))
    fout.write(" ")
...
The print statement does print the original words for the cases where spaCy attributes the lemma '-PRON-'.
However, my output file (test.txt) always contains '-PRON-' for those cases, even though I would expect it to write the original words (I, us, etc.).
What am I missing?
I tried different versions, including using the pos_ tag to identify the pronouns, etc., but always with the same result, i.e. my output contains '-PRON-'s.
Try this somewhat altered code snippet to see what you get...
import spacy

nlp = spacy.load('en_core_web_sm')
text = 'Did he write the code for her?'
doc = nlp(text)
out_sent = [w.lemma_ if w.lemma_ != '-PRON-' else w.text for w in doc]
out_sent = ' '.join(out_sent)
print(out_sent)

with open('out_sent.txt', 'w') as f:
    f.write(out_sent + '\n')
This should produce...
do he write the code for her ?

My question is about "module 'textacy' has no attribute 'Doc'"

I'm getting the error: module 'textacy' has no attribute 'Doc'.
I am trying to extract verb phrases with spaCy, but this attribute no longer exists. Please help me: how can I extract verb phrases or adjective phrases using spaCy? I want to do full shallow parsing.
def extract_named_nouns(row_series):
    """Combine nouns and non-numerical entities.

    Keyword arguments:
    row_series -- a Pandas Series object
    """
    ents = set()
    idxs = set()
    # remove duplicates and merge two lists together
    for noun_tuple in row_series['nouns']:
        for named_ents_tuple in row_series['named_ents']:
            if noun_tuple[1] == named_ents_tuple[1]:
                idxs.add(noun_tuple[1])
                ents.add(named_ents_tuple)
        if noun_tuple[1] not in idxs:
            ents.add(noun_tuple)
    return sorted(list(ents), key=lambda x: x[1])

def add_named_nouns(df):
    """Create new column in data frame with nouns and named ents.

    Keyword arguments:
    df -- a dataframe object
    """
    df['named_nouns'] = df.apply(extract_named_nouns, axis=1)
from __future__ import unicode_literals

import spacy, en_core_web_sm
import textacy
from textacy import io

# using spacy for nlp
nlp = en_core_web_sm.load()
sentence = 'The author is writing a new book.'
pattern = r'<VERB>?<ADV>*<VERB>+'
doc = textacy.Doc.load(sentence, metadata=metadata, lang='en_core_web_sm')
# doc = textacy.corpus.Corpus(sentence, lang='en_core_web_sm')
lists = textacy.extract.pos_regex_matches(doc, pattern)
for list in lists:
    print(list.text)
module 'textacy' has no attribute 'Doc'
Try following the examples here: https://chartbeat-labs.github.io/textacy/getting_started/quickstart.html#make-a-doc
It should be as simple as:
doc = textacy.make_spacy_doc("The author is writing a new book.", lang='en_core_web_sm')
You might look into just using spacy (without textacy) with its built-in Matcher instead (https://spacy.io/usage/rule-based-matching).
Another option is to load the spaCy pipeline through textacy and run it on the sentence directly:
spacy_lang = textacy.load_spacy_lang("en_core_web_sm")
docx_textacy = spacy_lang(sentence)
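Following up on the Matcher suggestion above, a rough spaCy-only equivalent of the <VERB>?<ADV>*<VERB>+ pattern (a sketch assuming spaCy v3 and the en_core_web_sm model) could look like this:

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
doc = nlp('The author is writing a new book.')

# Token-pattern approximation of the old textacy POS regex <VERB>?<ADV>*<VERB>+
matcher = Matcher(nlp.vocab)
matcher.add('VERB_PHRASE', [[
    {'POS': 'VERB', 'OP': '?'},
    {'POS': 'ADV', 'OP': '*'},
    {'POS': 'VERB', 'OP': '+'},
]])

for match_id, start, end in matcher(doc):
    print(doc[start:end].text)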

Autocorrect a column in a pandas dataframe using pyenchant

I tried to apply the code from the accepted answer of this question to one of my dataframe columns, where each row is a sentence, but it didn't work.
My code looks like this:
from enchant.checker import SpellChecker

checker = SpellChecker("id_ID")
h = df['Jawaban'].astype(str).str.lower()
hayo = []
for text in h:
    checker.set_text(text)
    for s in checker:
        sug = s.suggest()[0]
        s.replace(sug)
    hayo.append(checker.get_text())
I got the following error:
IndexError: list index out of range
Any help is greatly appreciated.
I don't get the error using your code. The only thing I'm doing differently is using a different dictionary for the spell checker.
import pandas as pd
from enchant.checker import SpellChecker

checker = SpellChecker('en_US', 'en_UK')  # not using id_ID

# sample data
ds = pd.DataFrame({'text': ['here is a spllng mstke', 'the wrld is grwng']})
p = ds['text'].str.lower()
hayo = []
for text in p:
    checker.set_text(text)
    for s in checker:
        sug = s.suggest()[0]
        s.replace(sug)
    print(checker.get_text())
    hayo.append(checker.get_text())

print(hayo)
print(hayo)
here is a spelling mistake
the world is growing
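One more note, offered as an assumption (the id_ID dictionary isn't available to test here): the IndexError in the original loop is exactly what s.suggest()[0] raises when suggest() returns an empty list, so a guarded version that also writes the results back into the dataframe could look like this:

def correct_text(text):
    # Run the checker over one lowercased sentence and replace each
    # flagged word with its first suggestion, if there is one.
    checker.set_text(text.lower())
    for err in checker:
        suggestions = err.suggest()
        if suggestions:  # avoid IndexError when there are no suggestions
            err.replace(suggestions[0])
    return checker.get_text()

ds['corrected'] = ds['text'].astype(str).apply(correct_text)
print(ds)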