spaCy: How to update doc.ents when using doc.retokenize()

I am trying to update the tokens of a pre-trained model using the retokenizer. I created a custom pipeline component to do this. In this component, I also set "ENT_TYPE" when merging the tokens.
import re

from spacy.language import Language


@Language.factory("re_tokenize")
def re_tokenize(nlp, name):
    return ReTokenize(nlp.vocab)


class ReTokenize:
    pattern = ""

    def __init__(self, vocab):
        self.pattern = r"[a-zA-Z0-9]+\[{0,1}[a-zA-Z0-9_]+\]{0,1}\[{0,1}[a-zA-Z0-9_]+\]{0,1}\[{0,1}[a-zA-Z0-9_]+\]{0,1}#{0,1}"

    def __call__(self, doc):
        # Collect all character spans that match the pattern.
        spans = []
        for match in re.finditer(self.pattern, doc.text):
            start, end = match.span()
            span = doc.char_span(start, end)
            if span is not None:
                spans.append(span)
        # Merge each matched span into a single token and tag it as VAR.
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span, attrs={"ENT_TYPE": "VAR"})
        return doc
Using this pipeline, I can tokenize the words correctly. Also, the data in ent_type_ seems to be updated.
BEFORE:
import mojimoji
import spacy

# Set model
nlp = spacy.load("ja_ginza")

text = "aaa_bbbとaaa_CCCの2バイトマップ"
text = mojimoji.zen_to_han(text).lower()
doc = nlp(text)
print([token.text for token in doc])
print([token.ent_type_ for token in doc])
['aaa', '_', 'bbb', 'と', 'aaa', '_', 'ccc', 'の', '2', 'バイト', 'マップ']
['Product_Other', 'Product_Other', 'Product_Other', '', 'Product_Other', 'Product_Other', 'Product_Other', '', 'N_Product', 'N_Product', 'N_Product']
AFTER:
nlp.add_pipe("re_tokenize", before="parser")
doc = nlp(text)
print([token.text for token in doc])
print([token.ent_type_ for token in doc])
['aaa_bbb', 'と', 'aaa_ccc', 'の', '2', 'バイト', 'マップ']
['VAR', '', 'VAR', '', 'N_Product', 'N_Product', 'N_Product']
However, it seems that doc.ents is not being updated:
print([ent.label_ for ent in doc.ents])
['N_Product']
How do I also update doc.ents?

To add a single new entity to a doc without modifying any other entity annotation, use doc.set_ents():
span = doc.char_span(start, end, label="VAR")
doc.set_ents(entities=[span], default="unmodified")
More docs: https://spacy.io/api/doc#set_ents
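For example, the component from the question could record the matched character offsets, merge the tokens, and then hand the rebuilt spans to doc.set_ents() so that doc.ents also picks up the new "VAR" entities. A minimal, untested sketch along those lines (the offsets list and the None filtering are my additions, not part of the original code):
def __call__(self, doc):
    # Record the character offsets of the matches before retokenizing.
    offsets = []
    for match in re.finditer(self.pattern, doc.text):
        start, end = match.span()
        if doc.char_span(start, end) is not None:
            offsets.append((start, end))
    # Merge each matched span into a single token.
    with doc.retokenize() as retokenizer:
        for start, end in offsets:
            retokenizer.merge(doc.char_span(start, end), attrs={"ENT_TYPE": "VAR"})
    # Rebuild the spans on the new tokenization and register them as entities,
    # leaving all other entity annotation untouched.
    new_ents = [doc.char_span(start, end, label="VAR") for start, end in offsets]
    doc.set_ents(entities=[e for e in new_ents if e is not None], default="unmodified")
    return doc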

Related

Vocab for LSA Topic Modelling returning letters rather than words

I am trying to topic model a list of descriptions using LSA. When I tokenize and then create a vocab from the descriptions, the vocab returns letters rather than words.
import re

import nltk

my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•#'
custom_stopwords = ['author', 'book', 'books', 'story', 'stories', 'novel', 'series', 'collection', 'edition', 'volume', 'readers', 'reader', 'reprint', 'writer', 'writing']
final_stopword_list = custom_stopwords + my_stopwords

# cleaning master function
def clean_tokens(tokens):
    tokens = tokens.lower()  # lower case
    tokens = re.sub('[' + my_punctuation + ']+', ' ', tokens)  # strip punctuation
    tokens = re.sub('([0-9]+)', '', tokens)  # remove numbers
    token_list = [word for word in tokens.split(' ') if word not in final_stopword_list]  # remove stopwords
    tokens = ' '.join(token_list)
    return tokens
This is my tokenizer
from sklearn.feature_extraction.text import CountVectorizer

count_vectoriser = CountVectorizer(tokenizer=clean_tokens)
bag_of_words = count_vectoriser.fit_transform(df.Description)
vocab = count_vectoriser.get_feature_names_out()
print(vocab[:10])
And my vocab, which returns
[' ' '#' '\\' 'a' 'b' 'c' 'd' 'e' 'f' 'g']
when I want it to give me words. I am tokenizing from a pandas DataFrame, so I don't know if that is altering the way I am tokenizing.
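One likely cause (offered as a guess, since only the output is shown): a callable passed as CountVectorizer's tokenizer is expected to return a list of tokens, but clean_tokens joins the tokens back into a single string, so the vectorizer ends up iterating over individual characters. A minimal sketch of the fix, keeping the cleaning steps above unchanged:
def clean_tokens(tokens):
    tokens = tokens.lower()  # lower case
    tokens = re.sub('[' + my_punctuation + ']+', ' ', tokens)  # strip punctuation
    tokens = re.sub('([0-9]+)', '', tokens)  # remove numbers
    # Return the list of tokens directly instead of re-joining them into a string,
    # so CountVectorizer counts words rather than characters.
    return [word for word in tokens.split() if word and word not in final_stopword_list]
With the tokenizer returning a list, get_feature_names_out() should then yield words instead of single letters.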

What is the most efficient way of creating a tf.dataset from multiple json.gz files with multiple text records?

I have thousands of json.gz files, each with a variety of information about scientific papers. For each file, I have to extract the relevant information, e.g. title and labels, to build a dataset and then transform it into a tf.dataset. However, this is quite inefficient, since I cannot filter by subject directly or shuffle the records in a single step.
I would like to read them using tf.dataset.interleave in order to shuffle them, but also to filter them according to specific labels.
Here is how I'm doing it so far.
import datetime
import gzip
import json

import pandas as pd
import tensorflow as tf

# For relevant feature extraction
def load_file(file):
    #with gzip.open(bytes.decode(file), 'r') as fin: # 4. gzip
    with gzip.open(file, 'r') as fin:
        json_bytes = fin.read()
    json_str = json_bytes.decode('utf-8')  # 2. string (i.e. JSON)
    bb = json.loads(json_str)
    bb = pd.json_normalize(bb, 'items', ['indexed', ['title', 'publisher', 'type', 'indexed.date-parts', 'subject']],
                           errors='ignore')
    bb.dropna(subset=['title', 'publisher', 'type', 'indexed.date-parts', 'subject'], inplace=True)
    bb.subject = bb.subject.apply(lambda x: int(themes[list(set(x) & set(list(themes.keys())))[0]]) if len(list(set(x) & set(list(themes.keys())))) > 0 else len(list(themes.keys())) + 1)
    bb.title = bb.title.str.join('').values
    #bb['author'] = bb['author'].apply(lambda x: '; '.join([', '.join([i['given'], i['family']]) for i in x]))
    bb['indexed.date-parts'] = bb['indexed.date-parts'].apply(lambda tpl: datetime.datetime.strptime('-'.join(str(x) for x in tpl[0]), '%Y-%m-%d').strftime('%Y-%m-%d'))
    #bb = bb.sample(n=32, replace=True)
    #return bb.title.str.join('').values, bb.subject.str.join(', ').values
    return dict(bb[['title', 'publisher', 'type', 'indexed.date-parts', 'subject']])
file_list = ['file_2021_01/10625.json.gz',
             'file_2021_01/23897.json.gz',
             'file_2021_01/12169.json.gz',
             'file_2021_01/427.json.gz', ...]

filenames = tf.data.Dataset.list_files(file_list, shuffle=True)
dataset = filenames.apply(
    tf.data.experimental.parallel_interleave(
        lambda x: tf.data.Dataset.from_tensor_slices(tf.numpy_function(load_file, [x], (tf.int64))),
        cycle_length=1))
However, it results in an error:
InternalError: Unsupported object type dict
[[{{node PyFunc}}]] [Op:IteratorGetNext]
Thanks
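One direction that may help (a rough, untested sketch, not a verified fix): tf.numpy_function cannot return a Python dict; it must return a tensor or a tuple of tensors whose dtypes match Tout. So load_file could return a tuple of arrays, and the dict can be rebuilt on the TensorFlow side; load_file_tuple and make_slices below are my names, and load_file and filenames come from the snippet above. Note also that tf.data.experimental.parallel_interleave is deprecated in favour of Dataset.interleave with num_parallel_calls.
import numpy as np

def load_file_tuple(path):
    # Reuse the dict-returning load_file above and unpack it into plain arrays.
    d = load_file(path)
    return (d['title'].to_numpy(dtype=str),
            d['publisher'].to_numpy(dtype=str),
            d['type'].to_numpy(dtype=str),
            d['indexed.date-parts'].to_numpy(dtype=str),
            d['subject'].to_numpy(dtype=np.int64))

def make_slices(path):
    title, publisher, type_, date, subject = tf.numpy_function(
        load_file_tuple, [path],
        (tf.string, tf.string, tf.string, tf.string, tf.int64))
    # Rebuild the dict from the individual tensors so each dataset element is one record.
    return tf.data.Dataset.from_tensor_slices(
        {'title': title, 'publisher': publisher, 'type': type_,
         'indexed.date-parts': date, 'subject': subject})

dataset = filenames.interleave(make_slices, cycle_length=4,
                               num_parallel_calls=tf.data.AUTOTUNE)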

I have several problems with Logistic Regression using pandas

I can't create the X_train and X_test DataFrames (from 2 different CSV files) and also can't use them as integers.
import pandas as pd

data = pd.read_csv('action_train.csv', delimiter=';', header=0)
data = data.replace(to_replace='[act1_]', value='', regex=True) \
           .replace(to_replace='[act2_]', value='', regex=True) \
           .replace(to_replace='[type ]', value='', regex=True)
print(data.shape)
print(list(data.columns))

data1 = pd.read_csv('action_test.csv', delimiter=';', header=0)
data1 = data1.replace(to_replace='[act1_]', value='', regex=True) \
             .replace(to_replace='[act2_]', value='', regex=True) \
             .replace(to_replace='[type ]', value='', regex=True)
print(data1.shape)
print(list(data1.columns))

X_train = data['action_id', 'char_1', 'char_2', 'char_3', 'char_4', 'char_5', 'char_6', 'char_7', 'char_8', 'char_9', 'char_10']
print(X_train)
y_train = data['result']

X_test = data1['action_id', 'char_1', 'char_2', 'char_3', 'char_4', 'char_5', 'char_6', 'char_7', 'char_8', 'char_9', 'char_10']
print(X_test)
y_test = data1['result']
I tried to use them in different ways but got a tuple instead of an array. I also can't convert the object dtype to integer.
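A likely explanation, offered as a guess since no traceback is shown: data['action_id', 'char_1', ...] passes a single tuple as the key, which pandas treats as one (nonexistent) column label; selecting several columns needs a list inside the brackets. A minimal sketch of the fix, reusing the column names from the code above (the errors='coerce' conversion is an assumption about the data):
feature_cols = ['action_id', 'char_1', 'char_2', 'char_3', 'char_4', 'char_5',
                'char_6', 'char_7', 'char_8', 'char_9', 'char_10']

# A list of labels inside the brackets returns a DataFrame with those columns.
X_train = data[feature_cols]
y_train = data['result']
X_test = data1[feature_cols]
y_test = data1['result']

# Convert the cleaned (still object-typed) columns to integers; invalid values become NaN.
X_train = X_train.apply(pd.to_numeric, errors='coerce').astype('Int64')
X_test = X_test.apply(pd.to_numeric, errors='coerce').astype('Int64')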

Pool apply function hangs and never executes

I am trying to fetch Rally data using its Python library, pyral. The same code works sequentially, but it is slow.
I thought of using Python's multiprocessing package, however my pool.apply call gets stuck and never executes. I tried running it in the PyCharm IDE as well as the Windows command prompt.
import pandas as pd
from pyral import Rally
from multiprocessing import Pool, Manager
from pyral.entity import Project


def process_row(sheetHeaders: list, item: Project, L: list):
    print('processing row : ' + item.Name)  ## this print never gets called
    row = []  # must be a list, not a tuple, to support append()
    for header in sheetHeaders:
        row.append(process_cell(header, item))
    L.append(row)


def process_cell(attr, item: Project):
    param = getattr(item, attr)
    if param is None:
        return None
    try:
        if attr == 'Owner':
            return param.__getattr__('Name')
        elif attr == 'Parent':
            return param.__getattr__('ObjectID')
        else:
            return param
    except KeyError as e:
        print(e)

# Projects
# PortfolioItem
# User Story
# Hierarchical Req
# tasks
# defects

# -------------MAIN-----------------
def main():
    # Rally connection
    rally = Rally('rally1.rallydev.com', apikey='<my_key>')
    file = 'rally_data.xlsx'
    headers = {
        'Project': ['Name', 'Description', 'CreationDate', 'ObjectID', 'Parent', 'Owner', 'State'],
    }
    sheetName = 'Project'
    sheetHeaders = headers.get(sheetName)

    p = Pool(1)
    result = rally.get(sheetName, fetch=True, pagesize=10)

    with Manager() as manager:
        L = manager.list()
        for item in result:
            print('adding row for : ' + item.Name)
            p.apply_async(func=process_row, args=(sheetHeaders, item, L))  ## gets stuck here
        p.close()
        p.join()
        pd.DataFrame(L).to_excel(file, sheet_name=sheetName)


if __name__ == '__main__':
    main()
I also tried without the Manager list, without any difference in the outcome:
def main():
    # Rally connection
    rally = Rally('rally1.rallydev.com', apikey='<key>')
    file = 'rally_data.xlsx'
    headers = {
        'Project': ['Name', 'Description', 'CreationDate', 'ObjectID', 'Parent', 'Owner', 'State'],
    }
    sheetName = 'Project'
    sheetHeaders = headers.get(sheetName)

    result = rally.get(sheetName, fetch=True, pagesize=10)

    async_results = []
    with Pool(50) as p:
        for item in result:
            print('adding row for : ' + item.Name)
            async_results.append(p.apply_async(func=process_row, args=(sheetHeaders, item)))
        res = [r.get() for r in async_results]
    pd.DataFrame(res).to_excel(file, sheet_name=sheetName)
I don't know why, but replacing multiprocessing with multiprocessing.dummy in the import statement worked.
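For reference, multiprocessing.dummy exposes the same Pool API but is backed by threads rather than processes, so the pyral objects never have to be pickled and shipped to a child process; that is a plausible reason the hang disappears, though it isn't confirmed here. A small sketch of the swap, assuming the three-argument process_row from the first version (collect_rows is a hypothetical helper name):
from multiprocessing.dummy import Pool  # thread-backed drop-in for multiprocessing.Pool

def collect_rows(result, sheetHeaders):
    rows = []  # a plain list is fine with threads; no Manager needed
    with Pool(8) as p:
        async_results = [p.apply_async(process_row, args=(sheetHeaders, item, rows))
                         for item in result]
        for r in async_results:
            r.get()  # re-raises any exception from the worker instead of silently dropping it
    return rows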

Scrapy Spider not writing to Postgres in the correct format

I'm scraping the Science of Us website for articles related to mental health and trying to dump them into a Postgres database I'm running locally. The Scrapy output is stored in a dictionary that looks like articles = {'title': [], 'teaser': [], 'link': [], 'date': [], 'author': [], 'source': []}.
On running my code, it dumps the entire list of values for each key into the column whose name matches the key. Instead, I would like each article to be one row in the database, e.g. article 1 would have its own row with its title, teaser, link, date, author and source in the corresponding columns.
Here is the relevant code:
1) spider.py
from scrapy.spiders import Spider
from scrapy import Request
from mhnewsbot_app.items import SOUItem
import string

mh_search_terms = ["DEPRESS", "MENTAL HEALTH", "EMOTIONAL HEALTH", "MENTAL DISORDER", "DIGITAL MEDICINE", "ANXI", "PSYCH", "THERAPY", "THERAPIST"]
tbl = string.maketrans('-', ' ')  # To protect against cases where the article has hyphens or other special characters
articles = {'title': [], 'teaser': [], 'link': [], 'date': [], 'author': [], 'source': []}

def url_lister():
    url_list = []
    article_count = 0
    while article_count < 150:
        url = 'http://nymag.com/scienceofus/?start=%s' % article_count
        url_list.append(url)
        article_count += 50
    return url_list

class SOUSpider(Spider):
    name = 'scienceofus'
    start_urls = url_lister()

    def parse(self, response):
        for article in response.xpath('//ul[@class="newsfeed-article-list"]'):
            title = article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/a[@class="headline-link"]/h3[@class="headline"]').extract()
            for i in title:
                for search_term in mh_search_terms:
                    if search_term in i.upper().strip():
                        articles['title'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/a[@class="headline-link"]/h3[@class="headline"]/text()').extract()[title.index(i)])
                        articles['teaser'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/p[@class = "teaser"]/text()').extract()[title.index(i)])
                        articles['link'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/a[@class = "read-more"]/@href').extract()[title.index(i)])
                        articles['date'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/div[@class="headline-above"]/time/text()').extract()[title.index(i)])
                        articles['author'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/span[@class="by-authors"]/span/span[@class="author"]/text()').extract()[title.index(i)])
                        articles['source'].append('Science Of Us')
        return articles
2) pipelines.py
from sqlalchemy.orm import sessionmaker
from models import Articles, db_connect, create_articles_table

class ArticlesPipeline(object):
    def __init__(self):
        engine = db_connect()
        create_articles_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        session = self.Session()
        article = Articles(**item)
        try:
            session.add(article)
            session.commit()
        except:
            session.rollback()
            raise
        finally:
            session.close()
        return item
You are outputting one item with multiple values in each of its fields. It's better to output one item per article (one value per field), because that's how your database seems to expect to receive them:
def parse(self, response):
    for article in response.xpath('//ul[@class="newsfeed-article-list"]'):
        title = article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/a[@class="headline-link"]/h3[@class="headline"]').extract()
        for i in title:
            for search_term in mh_search_terms:
                if search_term in i.upper().strip():
                    article_item = {}
                    article_item['title'] = article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/a[@class="headline-link"]/h3[@class="headline"]/text()').extract()[title.index(i)]
                    article_item['teaser'] = article.xpath('.//li[contains(@class, "newsfeed-article")]/p[@class = "teaser"]/text()').extract()[title.index(i)]
                    article_item['link'] = article.xpath('.//li[contains(@class, "newsfeed-article")]/a[@class = "read-more"]/@href').extract()[title.index(i)]
                    article_item['date'] = article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/div[@class="headline-above"]/time/text()').extract()[title.index(i)]
                    article_item['author'] = article.xpath('.//li[contains(@class, "newsfeed-article")]/span[@class="by-authors"]/span/span[@class="author"]/text()').extract()[title.index(i)]
                    article_item['source'] = 'Science Of Us'
                    yield article_item
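A follow-up note on the pipeline side (my addition, not part of the original answer): since process_item builds Articles(**item), each yielded dict now maps onto exactly one row. If you prefer to keep the SOUItem class that the spider imports, a field-per-column definition along these lines should also work; the field names below are an assumption based on the dict keys above:
# items.py -- hypothetical SOUItem matching the per-article dict yielded above
import scrapy

class SOUItem(scrapy.Item):
    title = scrapy.Field()
    teaser = scrapy.Field()
    link = scrapy.Field()
    date = scrapy.Field()
    author = scrapy.Field()
    source = scrapy.Field()
The spider would then yield SOUItem(**article_item) instead of the plain dict.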