The text in the PDF files is in text format, not scanned. PDFMiner does not support Python 3; are there any other solutions?
There is also the pdfminer2 fork, which supports Python 3.4 and is available through pip3.
https://github.com/metachris/pdfminer
This thread helped me patch something together.
from urllib.request import urlopen
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO, BytesIO

def readPDF(pdfFile):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(pdfFile, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
        interpreter.process_page(page)
    device.close()
    textstr = retstr.getvalue()
    retstr.close()
    return textstr

if __name__ == "__main__":
    # scrape = open("../warandpeace/chapter1.pdf", 'rb')  # for local files
    scrape = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")  # for external files
    pdfFile = BytesIO(scrape.read())
    outputString = readPDF(pdfFile)
    print(outputString)
    pdfFile.close()
For Python 3, you can install pdfminer with:
python -m pip install pdfminer.six
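Recent versions of pdfminer.six also ship a high-level helper, which may be enough if you don't need the low-level control shown above. A minimal sketch (the 'example.pdf' filename is only a placeholder):

from pdfminer.high_level import extract_text

# Extract all text from a (born-digital, non-scanned) PDF in one call
text = extract_text('example.pdf')
print(text)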
tika worked the best for me. It wouldn't be wrong to say it's better than PyPDF2 and pdfminer. It made it really easy to extract each line of the PDF into a list. You can install it with pip install tika
Then use the code below:
from tika import parser
rawText = parser.from_file(path_to_pdf)
rawList = rawText['content'].splitlines()
print(rawList)
I need help making pytesseract able to select or choose a folder or location to save the output result.
This is the code that I use:
from tkinter import filedialog
import pytesseract as tess
from PIL import Image

tess.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

def extract():
    files = filedialog.askopenfilenames(title="Select Images")
    text = []
    n = len(files)
    for i in range(0, n):
        img = Image.open(files[i])
        text.append(tess.image_to_pdf_or_hocr(img, lang='nld', config='--psm 11 --oem 3'))
        text_files = open(f'test{i+1}.pdf', "+wb")
        text_files.write(text[i])

if __name__ == "__main__":
    extract()
I don't have a Python background but I am eager to learn more. Thanks.
I have tried a few pieces of code but they are still not working.
It would be even better if someone could write and add the code to make it work.
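One possible approach (a minimal sketch, not tested against your files): ask the user for an output folder with filedialog.askdirectory and join it to the output file name; the test{i+1}.pdf naming simply mirrors your code and is otherwise arbitrary.

import os
from tkinter import filedialog
import pytesseract as tess
from PIL import Image

tess.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

def extract():
    files = filedialog.askopenfilenames(title="Select Images")
    out_dir = filedialog.askdirectory(title="Select Output Folder")  # user picks where to save
    for i, image_path in enumerate(files):
        img = Image.open(image_path)
        pdf_bytes = tess.image_to_pdf_or_hocr(img, lang='nld', config='--psm 11 --oem 3')
        out_path = os.path.join(out_dir, f'test{i + 1}.pdf')  # same naming as the original code
        with open(out_path, 'wb') as f:
            f.write(pdf_bytes)

if __name__ == "__main__":
    extract()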
Following the instructions in Colab, I could get a buffer and even build a pd.DataFrame from it (the file is just an example)...
# ... authentication
file_id = '1S1w0Z7g3bI1PGLPR49PW5VBRo7c_KYgU' # titanic
# loading data
import io
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
drive_service = build('drive', 'v3') # , credentials=creds
request = drive_service.files().get_media(fileId=file_id)
buf = io.BytesIO()
downloader = MediaIoBaseDownload(buf, request)
done = False
while not done:
    _, done = downloader.next_chunk()  # download the file in chunks
buf.seek(0)
import pandas as pd
df = pd.read_csv(buf)
print(df.head())
But I have trouble creating the data flow into a Dataset correctly - the "buf" variable does not work in =>
dataset = tf.data.experimental.make_csv_dataset(csv_file_path,
batch_size=100, num_epochs=1)
only "csv_file_path" as 1st arg. Is it possible in Colab to get IO from my GoogleDrive's csv-file into Dataset (used further in training)? And how to do it in a memory-efficient manner?..
P.S.
I understand that I could perhaps make the file public (in Google Drive) and get a URL to use the simple way:
#TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TRAIN_DATA_URL = "https://drive.google.com/file/d/1S1w0Z7g3bI1PGLPR49PW5VBRo7c_KYgU/view?usp=sharing"
train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
dataset = tf.data.experimental.make_csv_dataset(train_file_path, batch_size=100, num_epochs=1)
But I DON'T want to share the real file... How can I keep the file confidential and still get IO from it (in Google Drive) into a tf.data.Dataset in Colab? (Preferably the shortest code - there will be much more code in the real project tested in Colab.)
drive.CreateFile HELPED (link) - as I understand it, when working in Colab I am working in a separate environment (separate from my PC and internet environment)... So I tried (following the link):
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# https://drive.google.com/file/d/1S1w0Z7g3bI1PGLPR49PW5VBRo7c_KYgU/view?usp=sharing
link = 'https://drive.google.com/open?id=1S1w0Z7g3bI1PGLPR49PW5VBRo7c_KYgU'
fluff, id = link.split('=')
print (id) # Verify that you have everything after '='
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('Filename.csv')
import tensorflow as tf
ds = tf.data.experimental.make_csv_dataset('Filename.csv', batch_size=100, num_epochs=1)
iterator = ds.as_numpy_iterator()
print(next(iterator))
It works for me. Thanks for the interest in the topic (if anybody tried it).
Even simpler:
import tensorflow as tf
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')
_types = [float(), float(), float(), float(), str()]
_lines = tf.data.TextLineDataset('/content/drive/My Drive/iris.csv')
ds=_lines.skip(1).map(lambda x: tf.io.decode_csv(x, record_defaults=_types) )
ds0= ds.take(2)
print(*ds0.as_numpy_iterator(), sep='\n') # print list with sep => by rows.
Or from a DataFrame (and batched for economical memory usage):
import numpy as np
import pandas as pd
import tensorflow as tf
# Load the Drive helper and mount
from google.colab import drive
drive.flush_and_unmount()
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/My Drive/iris.csv', dtype='float32', converters={'variety': str}, nrows=20, decimal='.')
ds = tf.data.Dataset.from_tensor_slices(dict(df)) # if mixed types
ds = ds.shuffle(20, reshuffle_each_iteration=False ) # for train.ds ONLY!
ds = ds.batch(batch_size=4)
ds = ds.prefetch(4)
# labels
label= ds.map(lambda x: x['variety'])
print(list(label.as_numpy_iterator()))
# features
#features = ds.map(lambda x: (x['sepal.length'], x['sepal.width']))
# Or with dynamic keys:
features = ds.map(lambda x: (list(map(x.get, list(np.setdiff1d(list(x.keys()),['variety']))))))
print(list(features.as_numpy_iterator()))
with any transformations you like applied in map...
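For instance, one such transformation inside map (a sketch; the petal column names are assumed to match the usual iris.csv header) stacks the numeric columns into a single feature tensor:

# Assumed column names; 'sepal.length'/'sepal.width' appear above, the petal ones are guessed.
feature_cols = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']
features = ds.map(lambda x: tf.stack([x[c] for c in feature_cols], axis=-1))
print(list(features.as_numpy_iterator()))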
This script, which was not written by me, allows me to remove a watermark from a PDF file. The problem is that it doesn't have an option for batch operations, and I have more than 1,000 PDF files in the same directory.
from PyPDF2 import PdfFileReader, PdfFileWriter

def removeWatermark(input_fname: str, output_fname: str):
    with open(input_fname, "rb") as inputFile, open(output_fname, "wb") as outputFile:
        reader = PdfFileReader(inputFile)
        writer = PdfFileWriter()
        for n in range(reader.numPages):
            page = reader.getPage(n)
            del page["/Contents"][-1]
            writer.addPage(page)
        writer.write(outputFile)

if __name__ == "__main__":
    import sys
    if len(sys.argv) != 3:
        raise RuntimeError("Arguments not correct!")
    removeWatermark(sys.argv[1], sys.argv[2])
If anyone can help me, I would appreciate it.
Dependencies: PyPDF2 and PDFtk
I solved the problem:
import os
from PyPDF2 import PdfFileReader, PdfFileWriter

path = os.path.dirname(os.path.abspath(__file__))  # folder the script lives in
files = os.listdir(path)

def removeWater(inputfilename, outputfilename):
    """Remove the last content stream (the watermark) from every page."""
    inputFile = open(inputfilename, "rb")
    outputFile = open(outputfilename, "wb")
    pdfReader = PdfFileReader(inputFile)
    pdfWriter = PdfFileWriter()
    for n in range(pdfReader.numPages):
        page = pdfReader.getPage(n)
        del page["/Contents"][-1]
        pdfWriter.addPage(page)
    pdfWriter.write(outputFile)
    inputFile.close()
    outputFile.close()

if __name__ == '__main__':
    for filename in files:
        if filename.endswith('.pdf'):
            base = os.path.splitext(filename)[0]
            removeWater(os.path.join(path, base + ".pdf"),
                        os.path.join(path, base + "- limpo" + ".pdf"))
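If you prefer to point the script at an arbitrary folder rather than its own directory, a hypothetical variant of the main loop (the command-line folder argument is only an illustration; it reuses removeWater defined above):

import glob
import os
import sys

folder = sys.argv[1] if len(sys.argv) > 1 else "."  # folder passed on the command line
for pdf_path in glob.glob(os.path.join(folder, "*.pdf")):
    base, _ = os.path.splitext(pdf_path)
    removeWater(pdf_path, base + "- limpo.pdf")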
Could someone please tell me whether TensorBoard supports exporting CSV files from the command line? The reason I ask is that I have a lot of logging directories, and I am hoping to have a script that automates the process. Thanks.
The API supports reading event files programmatically. Here's an example of extracting the data for a tag and saving it to a .csv in a format similar to those generated by TensorBoard:
import argparse
import numpy as np
import tensorflow as tf

def save_tag_to_csv(fn, tag='test_metric', output_fn=None):
    if output_fn is None:
        output_fn = '{}.csv'.format(tag.replace('/', '_'))
    print("Will save to {}".format(output_fn))
    sess = tf.InteractiveSession()
    wall_step_values = []
    with sess.as_default():
        for e in tf.train.summary_iterator(fn):
            for v in e.summary.value:
                if v.tag == tag:
                    wall_step_values.append((e.wall_time, e.step, v.simple_value))
    np.savetxt(output_fn, wall_step_values, delimiter=',', fmt='%10.5f')

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('fn')
    parser.add_argument('--tag', default='test_metric')
    args = parser.parse_args()
    save_tag_to_csv(args.fn, tag=args.tag)
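Since you have many logging directories, one way to automate it (a sketch; it assumes your logs live under ./logs and reuses save_tag_to_csv from above):

import os

log_root = 'logs'  # assumed root containing all run directories
for dirpath, dirnames, filenames in os.walk(log_root):
    for fname in filenames:
        if fname.startswith('events.out.tfevents'):
            event_file = os.path.join(dirpath, fname)
            out_csv = os.path.join(dirpath, 'test_metric.csv')  # one CSV per run directory
            save_tag_to_csv(event_file, tag='test_metric', output_fn=out_csv)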
I am trying to show a matplotlib plot with axes labeled using gettext's _("label") construct. Trying to create a minimal example, I came up with the following Python code. It runs fine through NullTranslations() like this:
python mpl_i18n_test.py
But when I switch to Japanese, I get nothing but small squares in the plot -- though on the command line, the translations look fine:
LANG=ja_JP.utf8 python mpl_i18n_test.py
Here is the file mpl_i18n_test.py
Note that this requires the mona-sazanami font to be installed, and the following Python modules: pygtk, numpy, matplotlib, gettext and polib.
So my question: is there some trick to getting matplotlib to play nicely with gettext? Am I missing something obvious here? Thank you.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import gtk
import numpy as np
import matplotlib as mpl
from matplotlib.figure import Figure
from matplotlib.backends.backend_gtkagg import \
    FigureCanvasGTKAgg as FigureCanvas
from matplotlib.backends.backend_gtkagg import \
    NavigationToolbar2GTKAgg as NavigationToolbar
import locale
import gettext
import polib

mpl.rcParams['font.family'] = 'mona-sazanami'

def append(po, msg):
    occurances = []
    for i, l in enumerate(open(__file__, 'r')):
        if "_('" + msg[0] + "')" in l:
            occurances += [(__file__, str(i+1))]
    entry = polib.POEntry(msgid=msg[0],
                          msgstr=msg[1],
                          occurrences=occurances)
    print msg
    print occurances
    po.append(entry)

def generate_ja_mo_file():
    po = polib.POFile()
    msgs = [
        (u'hello', u'こんにちは'),
        (u'good-bye', u'さようなら'),
    ]
    for msg in msgs:
        append(po, msg)
    po.save('mpl_i18n_test.po')
    po.save_as_mofile('mpl_i18n_test.mo')
    return 'mpl_i18n_test.mo'

def initialize():
    '''prepare i18n/l10n'''
    locale.setlocale(locale.LC_ALL, '')
    loc, enc = locale.getlocale()
    lang, country = loc.split('_')
    l = lang.lower()
    if l == 'ja':
        filename = generate_ja_mo_file()
        trans = gettext.GNUTranslations(open(filename, 'rb'))
    else:
        trans = gettext.NullTranslations()
    trans.install()

if __name__ == '__main__':
    initialize()  # provides _() method for translations

    win = gtk.Window(gtk.WINDOW_TOPLEVEL)
    win.connect("destroy", lambda x: gtk.main_quit())
    win.connect("delete_event", lambda x, y: False)
    win.set_default_size(400, 300)
    win.set_title("Test of unicode in plot")

    fig = Figure()
    fig.subplots_adjust(bottom=.14)
    ax = fig.add_subplot(1, 1, 1)
    xx = np.linspace(0, 10, 100)
    yy = xx*xx + np.random.normal(0, 1, 100)
    ax.plot(xx, yy)

    print 'hello --> ', _('hello')
    print 'good-bye --> ', _('good-bye')

    ax.set_title(u'こんにちは')
    ax.set_xlabel(_('hello'))
    ax.set_ylabel(_('good-bye'))

    can = FigureCanvas(fig)
    tbar = NavigationToolbar(can, None)

    vbox = gtk.VBox()
    vbox.pack_start(can, True, True, 0)
    vbox.pack_start(tbar, False, False, 0)
    win.add(vbox)
    win.show_all()

    gtk.main()
A solution I found was to merely specify unicode when the translation is "installed." It was a one-line change:
trans.install(unicode=True)
I will add that this is only needed in Python 2.7; it is not needed in Python 3. It looks like Python 2.6 and earlier still have issues with this.