Get annotation text from its position (PDFMiner) - pdfminer

I want to extract the text of annotations (such as highlighted text of hyperlinks) from its position. For this I could scrape the positions and urls by using PDFminer as in the below code. Is that possible passing this position to a layout object and get out the text?
Here are the code blocks I used for this purpose.
First part includes a function, named parse_annotation, to parse annotations from each page.
def parse_annotations(page):
positions = []
urls = []
for annot in pdftypes.resolve1(page.annots):
if isinstance(annot, pdftypes.PDFObjRef):
annotationDict = annot.resolve()
# Skip over any annotations that are not links
if str(annotationDict["Subtype"]) != "/'Link'":
continue
destID = 0
position = annotationDict["Rect"]
uriDict = "None"
if any(k in annotationDict for k in {"Dest", "D"}):
destID = (annotationDict["Dest"][0]).objid
url = "Cross reference"
elif "A" in annotationDict:
# Key A contains PDFObjRef, then resolve it again
if isinstance(annotationDict["A"], pdftypes.PDFObjRef):
uriDict = pdftypes.resolve1(annotationDict["A"])
if any(k in uriDict for k in {"Dest", "D"}):
destID = (uriDict["D"][0]).objid
else:
uriDict = annotationDict["A"]
# Check if the key exists within resolved uriDict
if str(uriDict["S"]) == "/'GoTo'":
url = "Cross reference"
elif str(uriDict["S"]) == "/'URI'":
url = str(uriDict["URI"])
url = url.lstrip("b")
url = url.replace("'", "")
else:
# Skip if key S in uriDict does not contain value URI, GoTo
continue
else:
sys.stderr.write("Warning: unknown key in annotationDict : ", annotationDict)
#print(annot, '\n', annotationDict, '\n', destID, '\n', position, '\n', uriDict, '\n', url, '\n')
print(position, '\n', url, '\n')
positions.append(position)
urls.append(url)
else:
sys.stderr.write("Warning: unknown annotation: %s\n" % annot)
return positions, urls
Example PDF file can be found from the following link below.
https://www2.ed.gov/about/offices/list/ocr/docs/20200512-qa-psi-covid-19.pdf
Now, by using PDFMiner, created a document object and start looping over the pages found in PDF.
manager = PDFResourceManager()
output = StringIO()
codec = 'utf-8'
laparams = LAParams()
converter = TextConverter(manager, output, codec=codec, laparams=laparams)
device = PDFPageAggregator(manager, laparams=laparams)
interpreter = PDFPageInterpreter(manager, device)
page_interpreter = PDFPageInterpreter(manager, converter)
filename = '20200512-qa-psi-covid-19.pdf'
fp = open(filename, 'rb')
parser = PDFParser(fp)
document = PDFDocument(parser)
if not document.is_extractable:
raise PDFTextExtractionNotAllowed
page_no = 0
for pageNumber, page in enumerate(PDFPage.create_pages(document)):
print("\n================ PageNumber ", pageNumber+1, "===================\n")
if pageNumber == page_no:
page_interpreter.process_page(page)
raw_text = output.getvalue()
output.truncate(0)
output.seek(0)
interpreter.process_page(page)
layout = device.get_result()
if page.annots:
positions, urls = parse_annotations(page)
for obj in layout:
print('Object name and position %s \t %s \n' % (obj.__class__.__name__ , obj.bbox))
page_no += 1
fp.close()
converter.close()
output.close()
device.close()
Thanks in advance,
A.

Related

Webcrawler - Scrapy Python

I need help with my webcrawler.
I got an invalid syntax here:
"f.write("{},{},{}\n".format(word,url,count))"
and also when I command "scrapy crawl FirstSpider > wordlist.csv" a csv file shows up but either is empty or not as structured as I want it to be.
I want to crawl 300 websites and need the data as structured as possible.
How can I get a csv file with the urls structured and then the count of the certain keywords next to it,
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.item import Item
import requests
def find_all_substrings(string, sub):
import re
starts = [match.start() for match in re.finditer(re.escape(sub), string)]
return starts
class FirstSpider(CrawlSpider):
name = "FirstSpider"
allowed_domains = ["www.example.com"]
start_urls = ["https://www.example.com/"]
rules = [Rule(LinkExtractor(), follow=True, callback="check_buzzwords")]
crawl_count = 0
words_found = 0
def check_buzzwords(self, response):
self.__class__.crawl_count += 1
wordlist = [
"keyword1",
"keyword2",
"keyword3"
]
url = response.url
data = response.body.decode('utf-8')
count = 0
for word in wordlist:
substrings = find_all_substrings(data, word)
count = 0
word_counts = {}
links = []
"f = open('wordlist.csv', 'w')"
for pos in substrings:
ok = False
if not ok:
count += 1
word_counts[word] = {url: count}
for link in links:
page = requests.get(link)
data = page.text
for word in wordlist:
substrings = find_all_substrings(data, word)
count = 0
for word in wordlist:
substrings = find_all_substrings(data, word)
for pos in substrings:
ok = False
if not ok:
"f.write("{},{},{}\n".format(word,url,count))"
self.__class__.words_found += 1
print(word + ";" + url + ";" + str(count) + ";")
with open('wordlist.csv', 'w') as f:
for word, data in word_counts.items():
for url, count in data.items():
f.write("{},{},{}\n".format(word, url, count))
f.close()
return Item()
def _requests_to_follow(self, response):
if getattr(response, "encoding", None) != None:
return CrawlSpider._requests_to_follow(self, response)
else:
return []
I want to crawl websites for certain keywords (wordlist). My output should be a csv file with the following information: url, count of keyword found on the website.
I got an invalid syntax for the following ``` "f.write("{},{},{}\n".format(word,url,count))"
And the output csv file is often empty or does not crawl all the urls.
You have unnecessary quotation marks around lines 41 and 61
line 41 ---> "f = open('wordlist.csv', 'w')"
line 61 ---> "f.write("{},{},{}\n".format(word,url,count))"
Also usually you don't need to manually save data to a file because Scrapy has a built-in mechanism - Feed export
By using FEED_EXPORT_FIELDS setting you can specify which fields of the item should be exported and their order.
Here is the command to run the spider and save data to a file:
scrapy crawl FirstSpider -O url.csv
-O (capital 'O') means "rewrite a file"
-o (lowercase 'o') means "append to an existent file".

Numbering tickets on a PDF

I have a PDF with the art of a ticket for a fundraising dinner. I am providing a mock-up here so you can reproduce my problem:
mock up ticket
I would like to run the following pseudocode:
for i in 1:200
copy "mock up.pdf" to $i.pdf
inject $i into $i.pdf using font "OpenDyslexic" # place the ticket number in the pdf
end
create "final.pdf"
i = 0
for p in 1:20
add page to "final.pdf"
for column in 1:2
for row in 1:5
i = i + 1
inject $i.pdf in "final.pdf" in row, column of page p
end
end
end
Thank you!
I might have a solution:
#!/bin/env python3
# adapted from https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-insert-text
# and https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-combine-single-pages
import fitz # in fact, pip install pymupdf
#from sys import argv # in a future, I might get all the parameter via command line
width, height = fitz.paper_size("A4")
r = fitz.Rect(0, 0, width, height)
doc = fitz.open("mock up.pdf")
page = doc[0]
print("input file information:")
print(doc.metadata)
print("rotation=",page.rotation)
print("cropbox=",page.cropbox)
print("mediabox=",page.mediabox)
print("rect=",page.rect)
page.set_rotation(0)
artsize=page.rect
(nx, ny) = (200,140) # position of the ticket number inside the pdf
(dx, dy) = page.mediabox_size # the displacement of the ticket inside the output pdf
ntickets=25
nrows=5 # of tickets vertically
ncols=2 # of tickets horizontally
ntickets_per_page = nrows*ncols
outpdf = fitz.open()
nrow = ncol = 0
i = 0
while i < ntickets:
if i % ntickets_per_page == 0:
#print("new page for ticket #",i)
newpage = outpdf.new_page()
nrow, ncol = 0, 0
for ncol in range(1,ncols+1):
for nrow in range(1,nrows+1):
i += 1
if i > ntickets:
break
text = "{:04d}".format(i)
locr = fitz.Rect((ncol-1)*dx,(nrow-1)*dy,ncol*dx,nrow*dy)
#print("location of the ticket:", locr)
newpage.show_pdf_page(locr,doc,0)
p = fitz.Point(nx+(ncol-1)*dx,ny+(nrow-1)*dy)
#print("location of the number for ticket ", i, ": ", p)
rc = newpage.insert_text(p, # bottom left of 1st char
text,
fontname="tibo", # Times, bold
fontsize=12,
rotate=0,
)
i -= 1
print("%i lines printed on %i tickets." % (rc, i))
outpdf.save("tmp{:04d}.pdf".format(i))

PyGtk Serialization

I am currently working on a Note taking app in pyGtk and have set up a TextView where a user can type and add text tags for Bold Underline and Italics.
However, when it comes to saving the formatted text I cannot figure out how to do so.
I am trying to save in Gtk's native tagset format however after using
tag_format = TextBuffer.register_serialize_tagset()
content = TextBuffer.serialize(self, tag_format, start,end)
I cannot write this to a file with
open(filename, 'w').write(content)
because I get an error which states that it cannot write in bytes and needs a string instead.
I am currently working on a Note taking app in pyGtk and have set up a TextView where a user can type and add text tags for Bold Underline and Italics.
However, when it comes to saving the formatted text I cannot figure out how to do so.
I am trying to save in Gtk's native tagset format however after using
tag_format = TextBuffer.register_serialize_tagset()
content = TextBuffer.serialize(self, tag_format, start,end)
I cannot write this to a file with
open(filename, 'w').write(content)
because I get an error which states that it cannot write in bytes and needs a string instead.
import gi
gi.require_version('Gtk', '3.0')
from gi.repository import Gtk, Pango
I am currently working on a Note taking app in pyGtk and have set up a TextView where a user can type and add text tags for Bold Underline and Italics.
However, when it comes to saving the formatted text I cannot figure out how to do so.
I am trying to save in Gtk's native tagset format however after using
tag_format = TextBuffer.register_serialize_tagset()
content = TextBuffer.serialize(self, tag_format, start,end)
I cannot write this to a file with
open(filename, 'w').write(content)
because I get an error which states that it cannot write in bytes and needs a string instead.
File "example.py", line 87, in save_file
open(filename, 'w').write(content)
TypeError: write() argument must be str, not bytes
Here is sample code you can run and test by typing and then saving
import gi
gi.require_version('Gtk', '3.0')
from gi.repository import Gtk, Pango
class MainWindow(Gtk.ApplicationWindow):
def __init__(self):
Gtk.Window.__init__(self, title = "TwoNote")
self.grid = Gtk.Grid()
self.toolbar = Gtk.Toolbar()
self.grid.add(self.toolbar)
#buttons for toolbar
self.button_bold = Gtk.ToggleToolButton()
self.button_italic = Gtk.ToggleToolButton()
self.button_underline = Gtk.ToggleToolButton()
self.button_save = Gtk.ToolButton()
self.button_open = Gtk.ToolButton()
self.mytext = TextSet(self.button_bold, self.button_italic, self.button_underline)
self.button_bold.set_icon_name("format-text-bold-symbolic")
self.toolbar.insert(self.button_bold, 0)
self.button_italic.set_icon_name("format-text-italic-symbolic")
self.toolbar.insert(self.button_italic, 1)
self.button_underline.set_icon_name("format-text-underline-symbolic")
self.toolbar.insert(self.button_underline, 2)
self.toolbar.insert(self.button_save, 3)
self.toolbar.insert(self.button_open, 4)
self.button_open.set_icon_name("document-open-data")
self.button_save.set_icon_name("document-save")
self.button_save.connect("clicked", self.save_file)
self.button_open.connect("clicked", self.open_file)
self.button_bold.connect("toggled", self.mytext.on_button_clicked, "Bold", self.button_italic, self.button_underline)
self.button_italic.connect("toggled", self.mytext.on_button_clicked, "Italic", self.button_bold, self.button_underline)
self.button_underline.connect("toggled", self.mytext.on_button_clicked, "Underline", self.button_bold, self.button_italic)
self.grid.attach_next_to(self.mytext, self.toolbar, Gtk.PositionType.BOTTOM, 10,30)
self.add(self.grid)
filename = "Untitled"
def open_file(self, widget):
open_dialog = Gtk.FileChooserDialog("Open an existing file", self, Gtk.FileChooserAction.OPEN,(Gtk.STOCK_CANCEL,Gtk.ResponseType.CANCEL,Gtk.STOCK_OPEN, Gtk.ResponseType.OK))
open_response = open_dialog.run()
if open_response == Gtk.ResponseType.OK:
filename = open_dialog.get_filename()
text = open(filename).read()
self.mytext.get_buffer().set_text(text)
open_dialog.destroy()
elif open_response == Gtk.ResponseType.CANCEL:
print("Cancel clicked")
open_dialog.destroy()
def save_file(self, widget):
savechooser = Gtk.FileChooserDialog('Save File', self, Gtk.FileChooserAction.SAVE, (Gtk.STOCK_CANCEL, Gtk.ResponseType.CANCEL, Gtk.STOCK_SAVE, Gtk.ResponseType.OK))
allfilter = Gtk.FileFilter()
allfilter.set_name('All files')
allfilter.add_pattern('*')
savechooser.add_filter(allfilter)
txtFilter = Gtk.FileFilter()
txtFilter.set_name('Text file')
txtFilter.add_pattern('*.txt')
savechooser.add_filter(txtFilter)
response = savechooser.run()
if response == Gtk.ResponseType.OK:
filename = savechooser.get_filename()
print(filename, 'selected.')
buf = self.mytext.get_buffer()
start, end = buf.get_bounds()
tag_format = buf.register_serialize_tagset()
content = buf.serialize(buf, tag_format, start, end)
try:
open(filename, 'w').write(content)
except SomeError as e:
print('Could not save %s: %s' % (filename, err))
savechooser.destroy()
elif response == Gtk.ResponseType.CANCEL:
print('Closed, file not saved.')
savechooser.destroy()
class TextSet(Gtk.TextView):
def __init__(self, buttonBold, buttonItalic, buttonUnderline, interval = 1 ):
# Textview Setup
Gtk.TextView.__init__(self)
self.set_vexpand(True)
self.set_indent(10)
self.set_top_margin(90)
self.set_left_margin(20)
self.set_right_margin(20)
self.set_wrap_mode(Gtk.WrapMode.CHAR)
self.tb = TextBuffer()
self.set_buffer(self.tb)
# Thread setup
self.button_bold = buttonBold
self.button_italic = buttonItalic
self.button_underline = buttonUnderline
def on_button_clicked(self, widget, tagname, widget1, widget2):
state = widget.get_active()
name = widget.get_icon_name()
bounds = self.tb.get_selection_bounds()
self.tagname = tagname
if(state):
widget1.set_active(False)
widget2.set_active(False)
#highlighting
if(len(bounds) != 0):
start, end = bounds
myIter = self.tb.get_iter_at_mark(self.tb.get_insert())
myTags = myIter.get_tags()
if(myTags == [] and state == True):
self.tb.apply_tag_by_name(tagname, start, end)
elif(myTags != [] and state == True):
self.tb.remove_all_tags(start, end)
self.tb.apply_tag_by_name(tagname, start, end)
else:
for i in range(len(myTags)):
if(myTags[i].props.name == tagname):
self.tb.remove_tag_by_name(tagname,start,end)
myTags = []
self.tb.markup(widget, tagname)
def mouse_clicked(self, window, event):
self.button_bold.set_active(False)
self.button_italic.set_active(False)
self.button_underline.set_active(False)
class TextBuffer(Gtk.TextBuffer):
def __init__(self):
Gtk.TextBuffer.__init__(self)
self.connect_after('insert-text', self.text_inserted)
# A list to hold our active tags
self.taglist_on = []
# Our Bold tag.
self.tag_bold = self.create_tag("Bold", weight=Pango.Weight.BOLD)
self.tag_none = self.create_tag("None", weight=Pango.Weight.NORMAL)
self.tag_italic = self.create_tag("Italic", style=Pango.Style.ITALIC)
self.tag_underline = self.create_tag("Underline", underline=Pango.Underline.SINGLE)
def get_iter_position(self):
return self.get_iter_at_mark(self.get_insert())
def markup(self, widget, tagname):
self.tag_name = tagname
self.check = True
''' add "bold" to our active tags list '''
if(widget.get_active() == True):
if(self.tag_name == 'Bold'):
if 'Bold' in self.taglist_on:
del self.taglist_on[self.taglist_on.index('Bold')]
else:
self.taglist_on.append('Bold')
if(self.tag_name == 'Italic'):
if 'Italic' in self.taglist_on:
del self.taglist_on[self.taglist_on.index('Italic')]
else:
self.taglist_on.append('Italic')
if(self.tag_name == 'Underline'):
if 'Underline' in self.taglist_on:
del self.taglist_on[self.taglist_on.index('Underline')]
else:
self.taglist_on.append('Underline')
else:
self.check = False
def text_inserted(self, buffer, iter, text, length):
# A text was inserted in the buffer. If there are ny tags in self.tags_on, apply them
#if self.taglist_None or self.taglist_Italic or self.taglist_Underline or self.taglist_Bold:
if self.taglist_on:
# This sets the iter back N characters
iter.backward_chars(length)
# And this applies tag from iter to end of buffer
if(self.check == True):
if(self.tag_name == 'Italic'):
self.apply_tag_by_name('Italic', self.get_iter_position(), iter)
if(self.tag_name == 'Bold'):
self.apply_tag_by_name('Bold', self.get_iter_position(), iter)
if(self.tag_name == 'Underline'):
self.apply_tag_by_name('Underline', self.get_iter_position(), iter)
else:
self.remove_all_tags(self.get_iter_position(), iter)
win = MainWindow()
win.connect("delete-event", Gtk.main_quit)
win.show_all()
Gtk.main()
I figured it out rather than using
open(filename, 'w').write(content)
to save the content I imported GLib and used
GLib.file_set_contents(filename, content)

python TypeError: expected string or buffer when parsing JSON from a file

I realize this problem has been answered for other folks but none of the threads are helping me solve it. I'm trying to parse a JSON structure and add all values in the sent_file when the keys match with the tweet_file. The error I'm getting
import sys
import json
def main():
sent_file = open(sys.argv[1])
tweet_file = open(sys.argv[2])
scores = {}
#tweet = {}
#tweet_text = {}
#hw()
#lines(sent_file)
#lines(tweet_file)
for line in sent_file:
term,score = line.split("\t")
scores[term] = int(score)
#print scores.items()
for tweets in tweet_file:
current_sent_value = 0
tweet = {} #this is a dict
#print type(tweets) str
tweet = json.loads(tweets)#[0] #this assignment changes tweet to a list. Why?
if 'text' in tweet:
tweet_text = {}
unicode_string = tweet['text']
encoded_string = unicode_string.encode('utf-8')
tweet_text = encoded_string.split()
for key in tweet_text:
for key in scores:
#print type(tweet_text) -- list
#print type(scores) --dict
if tweet_text.get(key) == scores.get(key): # get() does not work on a list. tweet_text is a list.
current_sent_value += scores(value)
print current_sent_value
if name == 'main':
main()
The error is here \assignment1\tweet_sentiment2.py", line 42, in main
if tweet_text.get(key) == scores.get(key): # get() does not work on a list. tweet_text is a list.
AttributeError: 'list' object has no attribute 'get'

Extracted outlook.msg body to text as below. I'ld like to search patterns in text file lines which is being read but chars are preceeded with '\x00'

extract outlook.msg body to text files
def getEmailBodyFromMsg():
mapi.MAPIInitialize ((mapi.MAPI_INIT_VERSION, 0))
storage_flags = win32com.storagecon.STGM_DIRECT | win32com.storagecon.STGM_READ | win32com.storagecon.STGM_SHARE_EXCLUSIVE
filepathList = glob.glob('*.msg')
for filepath in filepathList :
txtFilepath = os.path.splitext(ntpath.basename(filepath))[0]
resultFile = txtFilepath + datetime.now().strftime('%Y-%m-%d %H_%M_%S')+".txt"
#get body of email and save as txt
storage = pythoncom.StgOpenStorage (filepath, None, storage_flags, None, 0)
mapi_session = mapi.OpenIMsgSession ()
message = mapi.OpenIMsgOnIStg (mapi_session, None, storage, None, 0, mapi.MAPI_UNICODE)
#write to txt file
CHUNK_SIZE = 10000
stream = message.OpenProperty (win32com.mapi.mapitags.PR_BODY, pythoncom.IID_IStream, 0, 0)
text = u""
while True:
bytes = stream.read (CHUNK_SIZE)
if bytes:
text += bytes
else:
break
with codecs.open(resultFile, mode='w', encoding='utf-8') as a_file:
a_file.write(text)
opening the file written to above to search for text the lines are as:
with codecs.open(absFilepath, 'rb', encoding='utf-8') as inFile :
for index, line in enumerate(inFile) :
mymatch = re.search(csResultEmailPattern, line, re.UNICODE)
#line = 'R\x00e\x00s\x00u\x00l\x00t\x00s\x00 \x00f\x00r\x00o\x00m\x00\n'
#OR line = u'R\x00e\x00s\x00u\x00l\x00t\x00s\x00 \x00f\x00r\x00o\x00m\x00\r'
I would like to know if they is a valid way to specify a regex like resultEmailPattern = ur'Results' that would match the 'Rx00e..line as above or better way to encode the txt file