Convert PDF to .txt file with Google Cloud Storage

I have this code working in Python on a local file system.
What is the equivalent Python client API for os.getcwd() and os.listdir()?
I want this code to work on files stored in GCS.
To point at the GCS folders, I included this code:
from google.cloud import storage
client = storage.Client()
bucket = client.bucket('my-bucket')
pdfDir = bucket.get_blob('uploads/pdf/')
txtDir = bucket.get_blob('uploads/txt/')
from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt

# converts a pdf, returns its text content as a string
def convert(fname, pages=None):
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    infile = open(fname, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close()
    return text
# converts all pdfs in directory pdfDir, saves all resulting txt files to txtDir
def PDF2txt(pdfDir, txtDir):
    if pdfDir == "": pdfDir = os.getcwd() + "\\"  # if no pdfDir passed in
    for pdf in os.listdir(pdfDir):  # iterate through pdfs in pdf directory
        fileExtension = pdf.split(".")[-1]
        if fileExtension == "pdf":
            pdfFilename = pdfDir + pdf
            text = convert(pdfFilename)  # get string of text content of pdf
            textFilename = txtDir + pdf + ".txt"
            textFile = open(textFilename, "w")  # make text file
            textFile.write(text)  # write text to text file
            textFile.close()

pdfDir = "C:/pdftotxt/pdfs/"
txtDir = "C:/pdftotxt/txt/"
PDF2txt(pdfDir, txtDir)

I assume that what you want is to list the objects in a bucket, and the objects inside particular folders of a bucket. For that you can directly use the Python client library that Google Cloud Storage provides: bucket.list_blobs() lists the whole bucket, and bucket.list_blobs(prefix=prefix, delimiter=delimiter) lists a particular folder or object.
More detailed documentation can be found here [1] and the Git repository containing the whole library here [2].
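To tie that back to the question's PDF2txt flow, here is a minimal sketch of the loop running against GCS instead of the local file system. The bucket name and the uploads/pdf/ and uploads/txt/ prefixes come from the question; convert_bytes is a hypothetical variant of the question's convert() that reads from an in-memory buffer instead of a filename, and download_as_bytes() assumes a reasonably recent google-cloud-storage (older releases call it download_as_string()).
from io import BytesIO
from google.cloud import storage

client = storage.Client()
bucket = client.bucket('my-bucket')

# list_blobs(prefix=...) plays the role of os.listdir() for a GCS "folder"
for blob in bucket.list_blobs(prefix='uploads/pdf/'):
    if not blob.name.endswith('.pdf'):
        continue
    # download the PDF into memory instead of opening a local file
    pdf_buffer = BytesIO(blob.download_as_bytes())
    text = convert_bytes(pdf_buffer)  # hypothetical in-memory variant of convert()
    # write the result back to the txt "folder" of the same bucket
    out_name = 'uploads/txt/' + blob.name.rsplit('/', 1)[-1] + '.txt'
    bucket.blob(out_name).upload_from_string(text)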

Related

Using Python 3.8, I would like to extract text from a random PDF file

I would like to import a PDF file and find the most common words.
import PyPDF2
# Open the PDF file and read the text
pdf_file = open("nita20.pdf", "rb")
pdf_reader = PyPDF2.PdfReader(pdf_file)
text = ""
for page in range(pdf_reader.pages):
    text += pdf_reader.getPage(page).extractText()
I get this error:
TypeError: '_VirtualList' object cannot be interpreted as an integer
How can I resolve this issue so I can extract every word from the PDF file? Thanks.
I got some deprecation warnings on your code, but this works (tested on Python 3.11, PyPDF2 version 3.0.1):
import PyPDF2
# Open the PDF file and read the text
pdf_file = open("../test.pdf", "rb")
pdf_reader = PyPDF2.PdfReader(pdf_file)
text = ""
i = 0
print(len(pdf_reader.pages))
for page in range(len(pdf_reader.pages)):
    text += pdf_reader.pages[i].extract_text()
    i = i + 1
print(text)
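For what it's worth, the manual index isn't needed: in PyPDF2 3.x the pages property is directly iterable, so the same loop can be written more tightly (a small sketch, same assumptions as above):
import PyPDF2

# PdfReader accepts a path directly, and pages is iterable
pdf_reader = PyPDF2.PdfReader("../test.pdf")
text = ""
for page in pdf_reader.pages:
    text += page.extract_text()
print(text)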

Read pdf object from S3

I am trying to create a Lambda function that will access a PDF form uploaded to S3, strip out the data entered into the form, and send it elsewhere.
I am able to do this when I can download the file locally. So the script below works and lets me read the data from the PDF into my pandas dataframe:
import boto3  # needed for the s3 resource below
import PyPDF2 as pypdf
import pandas as pd

s3 = boto3.resource('s3')
s3.meta.client.download_file(bucket_name, asset_key, './target.pdf')
pdfobject = open("./target.pdf", 'rb')
pdf = pypdf.PdfFileReader(pdfobject)
data = pdf.getFormTextFields()
pdf_df = pd.DataFrame(data, columns=get_cols(data), index=[0])
But with Lambda I cannot save the file locally, because I get a "read-only filesystem" error.
I have tried using the s3.get_object() method like below:
s3_response_object = s3.get_object(
    Bucket='pdf-forms-bucket',
    Key='target.pdf',
)
pdf_bytes = s3_response_object['Body'].read()
But I have no idea how to convert the resulting bytes into an object that can be parsed with PyPDF2. The output that I need and that PyPDF2 will produce is like below:
{'form1[0].#subform[0].nameandmail[0]': 'Burt Lancaster',
'form1[0].#subform[0].mailaddress[0]': '675 Creighton Ave, Washington DC',
'form1[0].#subform[0].Principal[0]': 'David St. Hubbins',
'Principal[1]': None,
'form1[0].#subform[0].Principal[2]': 'Bart Simpson',
'Principal[3]': None}
So in summary, I need to be able to read a PDF with fillable forms into memory and parse it without downloading the file, because my Lambda function environment won't allow local temp files.
Solved:
This does the trick:
import boto3
from PyPDF2 import PdfFileReader
from io import BytesIO

bucket_name = "pdf-forms-bucket"
item_name = "form.pdf"

s3 = boto3.resource('s3')
obj = s3.Object(bucket_name, item_name)
fs = obj.get()['Body'].read()  # read the object's bytes straight into memory
pdf = PdfFileReader(BytesIO(fs))  # BytesIO wraps the bytes in the file-like object PyPDF2 expects
data = pdf.getFormTextFields()
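For anyone on PyPDF2 3.x, where PdfFileReader and getFormTextFields() were removed in favor of new names, the same trick should look roughly like this (a sketch, not tested against this exact bucket):
import boto3
from PyPDF2 import PdfReader
from io import BytesIO

s3 = boto3.resource('s3')
obj = s3.Object("pdf-forms-bucket", "form.pdf")
# read the object into memory and hand PyPDF2 a file-like object
pdf = PdfReader(BytesIO(obj.get()['Body'].read()))
data = pdf.get_form_text_fields()  # modern equivalent of getFormTextFields()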

How to convert all type of images to text using python tesseract

I'm trying to convert all types of images in a folder to text using Python tesseract. Below is the code that I'm using; with it, only .png files are converted to .txt, and the other types are not converted to text.
import os
import pytesseract
import cv2
import re
import glob
import concurrent.futures
import time

def ocr(img_path):
    out_dir = "Output//"
    img = cv2.imread(img_path)
    text = pytesseract.image_to_string(img, lang='eng', config='--psm 6')
    out_file = re.sub(".png", ".txt", img_path.split("\\")[-1])
    out_path = out_dir + out_file
    fd = open(out_path, "w")
    fd.write("%s" % text)
    fd.close()
    return out_file
os.environ['OMP_THREAD_LIMIT'] = '1'

def main():
    path = input("Enter the path : ")
    if os.path.isdir(path) == 1:
        out_dir = "ocr_results//"
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
            image_list = glob.glob(path + "\\*.*")
            for img_path, out_file in zip(image_list, executor.map(ocr, image_list)):
                print(img_path.split("\\")[-1], ',', out_file, ', processed')

if __name__ == '__main__':
    start = time.time()
    main()
    end = time.time()
    print(end - start)
How can I convert all types of image files to text? Please help me with the above code.
There is a bug in the ocr function.
First of all, the following line does convert all types of image files to text:
text = pytesseract.image_to_string(img, lang='eng', config='--psm 6')
However, what the next chunk of code does is:
Select the files with a .png extension using a regex
Create a new path with the same filename and a .txt extension
Write the OCR output to the newly created text file.
out_file = re.sub(".png",".txt",img_path.split("\\")[-1])
out_path = out_dir + out_file
fd = open(out_path,"w")
fd.write("%s" %text)
In other words, all types of image files are converted, but not all of them are written back correctly. The regex only replaces .png with .txt when assigning to out_path. When there is no .png in the name (the other image types), the variable keeps the same value as the original filename (e.g. sample.jpg). The next lines of code then open the original image file and overwrite it with the OCR result.
One way to fix this is to add all the image formats you want to cover to the regex, escaping the dots and anchoring at the end of the name so that nothing else in the filename is touched.
For example,
out_file = re.sub(r"\.(png|jpg|bmp|tiff)$", ".txt", img_path.split("\\")[-1])
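An alternative that side-steps the regex entirely, and therefore handles any image extension, is os.path.splitext; a minimal sketch of the same renaming step:
import os

img_path = "images/sample.jpg"  # stands in for the question's img_path
base = os.path.basename(img_path)              # "sample.jpg"
out_file = os.path.splitext(base)[0] + ".txt"  # "sample.txt"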

Can't convert 'bytes' object to str implicitly for DCM to raw file

I'm learning how to convert a DCM file to a raw file. I got the code from GitHub:
https://github.com/xiasun/dicom2raw/blob/master/dicom2raw.py
It throws the error "Can't convert 'bytes' object to str implicitly" on the line
allInOne += dataset.PixelData
I tried using encode("utf-8"), but that makes allInOne empty.
By the way, is there any code to generate the .mhd file corresponding to the .raw file?
import dicom
import os
import numpy
import sys

dicomPath = "C:/DataLuna16pen/dcmdata/"
lstFilesDCM = []  # create an empty list
for dirName, subdirList, fileList in os.walk(dicomPath):
    allInOne = ""
    print(subdirList)
    i = 0
    for filename in fileList:
        i += 1
        if "".join(filename).endswith((".dcm", ".DCM")):
            path = dicomPath + "".join(filename)
            dataset = dicom.read_file(path)
            for n, val in enumerate(dataset.pixel_array.flat):
                dataset.pixel_array.flat[n] = val / 60
                if val < 0:
                    dataset.pixel_array.flat[n] = 0
            dataset.PixelData = numpy.uint8(dataset.pixel_array).tostring()
            allInOne += dataset.PixelData
            print("slice " + "".join(filename) + " done ", end=" ")
    print(i)

newFile = open("./all_in_one.raw", "wb")
newFile.write(allInOne)
newFile.close()
print("RAW file generated")
There are several things:
PyDicom still doesn't read compressed DICOMs properly (lossless JPEG). You should check the Transfer Syntax of the files to confirm whether this is the case. As a workaround you can use the GDCM tool dcmdjpeg.
You should not convert the byte array into a string (numpy's tostring() in fact returns bytes); see the sketch below.
For writing mha files, take a look at MedPy. You can also use ITK directly: there is a Python wrapper, and SimpleITK, a kind of lightweight modification of ITK.
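To make the second point concrete, here is a minimal self-contained sketch of the fix: initialize allInOne as bytes, and the += with PixelData stops failing. The small array literal just stands in for the question's pixel_array, and tobytes() is the current name of the deprecated tostring():
import numpy

allInOne = b""  # an empty bytes object, not an empty str

# stand-in for one slice; in the question this is
# numpy.uint8(dataset.pixel_array).tostring()
pixel_bytes = numpy.uint8([[0, 1], [2, 3]]).tobytes()

allInOne += pixel_bytes  # bytes += bytes, no implicit str conversion

with open("./all_in_one.raw", "wb") as newFile:
    newFile.write(allInOne)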

minimal example of how to export a jupyter notebook to pdf using nbconvert and PDFExporter()

I am trying to export a PDF copy of a Jupyter notebook using nbconvert from within a notebook cell. I have read the documentation, but I just cannot find any basic code that actually executes the nbconvert command and exports to PDF.
I was able to get this far, but I was hoping that someone could just fill in the final gaps.
from nbconvert import PDFExporter
notebook_pdf = PDFExporter()
notebook_pdf.template_file = '../print_script/pdf_nocode.tplx'
Not sure how to get from here to actually getting the PDF created.
Any help would be appreciated.
I'm no expert, but managed to get this working. The key is that you need to preprocess the notebook, which then lets you use the PDFExporter.from_notebook_node() function. This gives you your pdf_data in byte format, which can then be written to file:
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
from nbconvert import PDFExporter

notebook_filename = "notebook.ipynb"
with open(notebook_filename) as f:
    nb = nbformat.read(f, as_version=4)

ep = ExecutePreprocessor(timeout=600, kernel_name='python3')
ep.preprocess(nb, {'metadata': {'path': 'notebooks/'}})

pdf_exporter = PDFExporter()
pdf_data, resources = pdf_exporter.from_notebook_node(nb)

with open("notebook.pdf", "wb") as f:
    f.write(pdf_data)
It's worth noting that ep.preprocess() requires a resources dict as its second argument, even though we don't otherwise use it in this example.
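If the notebook has already been executed and only the conversion step is needed, the preprocessing can be skipped entirely. A minimal sketch (this still needs the LaTeX toolchain that nbconvert's PDF export depends on):
from nbconvert import PDFExporter

# Export an already-executed notebook straight to PDF
pdf_exporter = PDFExporter()
pdf_data, resources = pdf_exporter.from_filename("notebook.ipynb")

with open("notebook.pdf", "wb") as f:
    f.write(pdf_data)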
The following is a REST API that converts a .ipynb file into .html:
POST: http://URL/export/<id>
GET: http://URL/export/<id> will return id.html
import os
from flask import Flask, render_template, make_response
from flask_cors import CORS
from flask_restful import reqparse, abort, Api, Resource
from nbconvert.exporters import HTMLExporter

exporter = HTMLExporter()

app = Flask(__name__)
cors = CORS(app, resources={r"/export/*": {"origins": "*"}})
api = Api(app)

parser = reqparse.RequestParser()
parser.add_argument('path')

notebook_file_srv = '/path of your .ipynb file'

def notebook_doesnt_exist(nb):
    abort(404, message="Notebook {} doesn't exist".format(nb))

class Notebook(Resource):
    def get(self, id):
        headers = {'Content-Type': 'text/html'}
        return make_response(render_template(id + '.html'), 200, headers)

    def post(self, id):
        args = parser.parse_args()
        notebook_file = args['path']
        notebook_file = notebook_file_srv + id + '.ipynb'
        if not os.path.exists(notebook_file):
            return 'notebook \'.ipynb\' file not found', 404
        else:
            nb_name, _ = os.path.splitext(os.path.basename(notebook_file))
            # dirname = os.path.dirname(notebook_file)
            output_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'templates')
            output_path = os.path.join(output_path, '{}.html'.format(nb_name))
            output, resources = exporter.from_filename(notebook_file)
            f = open(output_path, 'wb')
            f.write(output.encode('utf8'))
            f.close()
            return 'done', 201

api.add_resource(Notebook, '/export/<id>')

if __name__ == '__main__':
    app.run(debug=True)