I want to parse multiple HTML documents with Beautiful Soup but I can't make it work

Is there a way to use Beautiful Soup to parse multiple HTML documents at the same time? I am modifying code I found online that extracts HTML .txt files from EDGAR with Beautiful Soup so they can be downloaded as formatted files; however, my code now only outputs one EDGAR document (it is intended to output 5) and I don't know what's wrong with it.
import csv
import requests
import re
from bs4 import BeautifulSoup
with open('General Motors Co 11-15.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for line in reader:
        fn1 = line[0]
        fn2 = re.sub(r'[/\\]', '', line[1])
        fn3 = re.sub(r'[/\\]', '', line[2])
        fn4 = line[3]
        saveas = '-'.join([fn1, fn2, fn3, fn4])
        # Reorganize to rename the output filename.
        url = 'https://www.sec.gov/Archives/' + line[4].strip()
        bodytext = requests.get(url).text
        parsedContent = BeautifulSoup(bodytext, 'html.parser')
        for script in parsedContent(["script", "style"]):
            script.extract()
        text = parsedContent.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        with open(saveas, 'wb') as f:
            f.write(requests.get('%s' % text).content)
        print(file, 'downloaded and wrote to text file')
Do you know what's wrong with my code?

I would guess that you're overwriting the existing document every time you write to the file. Try changing with open(saveas, 'wb') as f: to with open(saveas, 'ab') as f:
Opening a file with 'wb' creates a new document with the same name as saveas, essentially clearing the existing document.
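For reference, a minimal sketch of that change, keeping the rest of the loop as in the question; writing the extracted text directly (rather than passing it to requests.get) is also an assumption here, since the original call treats the text as a URL:
# Inside the `for line in reader:` loop, after `text` has been built:
with open(saveas, 'ab') as f:          # 'ab' appends instead of truncating the file
    f.write(text.encode('utf-8'))      # assumed intent: write the extracted text itself
print(saveas, 'downloaded and wrote to text file')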

Related

Using Python 3.8, I would like to extract text from a random PDF file

I would like to import a PDF file and find the most common words.
import PyPDF2
# Open the PDF file and read the text
pdf_file = open("nita20.pdf", "rb")
pdf_reader = PyPDF2.PdfReader(pdf_file)
text = ""
for page in range(pdf_reader.pages):
    text += pdf_reader.getPage(page).extractText()
I get this error:
TypeError: '_VirtualList' object cannot be interpreted as an integer
How can I resolve this issue so I can extract every word from the PDF file? Thanks.
I got some deprecation warnings with your code, but this works (tested on Python 3.11, PyPDF2 version 3.0.1):
import PyPDF2
# Open the PDF file and read the text
pdf_file = open("..\test.pdf", "rb")
pdf_reader = PyPDF2.PdfReader(pdf_file)
text = ""
i=0
print(len(pdf_reader.pages))
for page in range(len(pdf_reader.pages)):
    text += pdf_reader.pages[i].extract_text()
    i = i + 1
print(text)
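Since the question also asks for the most common words, here is a small follow-on sketch (an assumption about the intended next step, not part of the answer above) that expects text to already hold the extracted text:
from collections import Counter
import re

# Split the extracted text into lowercase word tokens and count them.
words = re.findall(r"\w+", text.lower())
print(Counter(words).most_common(20))   # the 20 most frequent words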

I want to read and print the data shown in the picture using the following code, but I run into problems with this program. How can I fix this code?

import re
file_path = 'D:/Speech/data/test2.txt'
useful_regex = re.compile(r'\[.+\]\n', re.IGNORECASE)
with open(file_path) as f:
    file_content = f.read()

info_lines = re.findall(useful_regex, file_content)
len(info_lines)

for l in info_lines[1:10]:
    print(l.strip().split('\t'))

How to open Russian-language PDFs for NLTK processing

I'm trying to extract text from a pdf file in Russian, and use this text as data for tokenisation, lemmatisation etc. with NLTK on Jupyter Notebook. I'm using PyPDF2, but I keep running into problems.
I am creating a function and passing the PDF file name to it as input:
from PyPDF2 import PdfFileReader
def getTextPDF(pdfFileName):
    pdf_file = open(pdfFileName, "rb")
    read_pdf = PdfFileReader(pdf_file)
    text = []
    for i in range(0, read_pdf.getNumPages()):
        text.append(read_pdf.getPage(i).extractText())
    return "\n".join(text)
Then I call the function:
pdfFile = "sample_russian.pdf"
print("PDF: \n", myreader_pdf.getTextPDF(pdfFile))
But I get a long pink list of the same warning repeated:
PdfReadWarning: Superfluous whitespace found in object header b'1' b'0' [pdf.py:.....]
Any ideas would be very helpful! Thanks in advance!
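A workaround that is often suggested for this warning (a sketch, assuming a PyPDF2 version where PdfFileReader still accepts the strict flag; whether it silences this exact message can vary by version) is to open the reader in non-strict mode. The messages are warnings rather than errors, so the extracted text is usually still returned:
from PyPDF2 import PdfFileReader

def getTextPDF(pdfFileName):
    with open(pdfFileName, "rb") as pdf_file:
        # strict=False asks PyPDF2 to tolerate minor structural issues
        # (such as superfluous whitespace in object headers) instead of complaining.
        read_pdf = PdfFileReader(pdf_file, strict=False)
        return "\n".join(read_pdf.getPage(i).extractText()
                         for i in range(read_pdf.getNumPages()))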

How to convert all type of images to text using python tesseract

I'm trying to convert all types of images in a folder to text using python tesseract. Below is the code I'm using; with this, only .png files are being converted to .txt, and other types are not being converted to text.
import os
import pytesseract
import cv2
import re
import glob
import concurrent.futures
import time
def ocr(img_path):
    out_dir = "Output//"
    img = cv2.imread(img_path)
    text = pytesseract.image_to_string(img, lang='eng', config='--psm 6')
    out_file = re.sub(".png", ".txt", img_path.split("\\")[-1])
    out_path = out_dir + out_file
    fd = open(out_path, "w")
    fd.write("%s" % text)
    return out_file

os.environ['OMP_THREAD_LIMIT'] = '1'

def main():
    path = input("Enter the path : ")
    if os.path.isdir(path) == 1:
        out_dir = "ocr_results//"
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
            image_list = glob.glob(path + "\\*.*")
            for img_path, out_file in zip(image_list, executor.map(ocr, image_list)):
                print(img_path.split("\\")[-1], ',', out_file, ', processed')

if __name__ == '__main__':
    start = time.time()
    main()
    end = time.time()
    print(end - start)
How can I convert all types of image files to text? Please help me with the above code.
There is a bug in the ocr function.
First of all, the following line does convert all types of image files to text:
text = pytesseract.image_to_string(img, lang='eng', config='--psm 6')
However, what the next chunk of code does is:
1. Select files with a .png extension using a regex.
2. Create a new path with the same filename and a .txt extension.
3. Write the OCR output to the newly created text file.
out_file = re.sub(".png",".txt",img_path.split("\\")[-1])
out_path = out_dir + out_file
fd = open(out_path,"w")
fd.write("%s" %text)
In other words, all types of image files are converted, but not all of them are written back correctly. The regex matching logic only replaces .png with .txt before assigning to out_path. When there is no .png (other image types), the variable gets the same value as the original filename (e.g. sample.jpg). The next lines of code then open a file named after the original image and overwrite it with the OCR result.
One way to fix this is to add all the image formats you want to cover to the regex.
For example,
out_file = re.sub(".png|.jpg|.bmp|.tiff",".txt",img_path.split("\\")[-1])

How to see link extension before downloading the content?

I have a question that I think is quite interesting. I have collected lots of links with web scraping and I would like to download the content from normal links, so I ignored all links with extensions like .PDF, .avi, .jpeg and similar during the scraping phase.
So I have a list of scraped links without extensions, but when I start to download the content, some of them turn out to be a PDF, music file, image or MS Word document. How can I ignore them and foresee that "hidden" extension of a link before downloading the content?
Examples:
PDF: http://www.komunala-radovljica.si/library/includes/file.asp?FileId=168
PDF: http://www.hyundai.si/files/9861/HY-Mursak15_204x280-Motorevija_TISK.pdf?download
(Here I should probably look for string ".PDF" in the link)
MS Word: http://www.plinarna-maribor.si/bin?bin.svc=obj&bin.id=2D7F844C-C294-34B6-CECC-A65C2ADCF92A
Image: http://www.ddmaribor.si/index.php/fotografije/70-lep-literarnoglasbeni-vecer-s-ferijem-lainsckom/detail/1874-lep-literarnoglasbeni-vecer-s-ferijem-lainsckom?tmpl=component&phocadownload=2
MP4: http://www.hyundai.si/files/9865/Hyundai_Hokej_Mursak_Zvok_17sek_MP4.mp4?download
(here I should probably look for string "MP4" in the link)
CSS: http://global.careers.ppg.com/CMSPages/GetResource.ashx?stylesheetname=CPJobsLayout
My code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# encoding=UTF-8
#
# DOWNLOADER
# To grab the text content of webpages and save it to TinyDB database.
import re, time, urllib, requests, bs4
from bs4 import BeautifulSoup

start_time = time.time()

# Open file with urls.
with open("Q:/SIIT/JV_Marko_Boro/Detector/test_podjetja_2015/podjetja_0_100_url_test.txt") as f:
    urls = f.readlines()

# Open file to write content to.
with open("Q:/SIIT/JV_Marko_Boro/Detector/test_podjetja_2015/podjetja_0_100_vsebina_test.txt", 'wb') as v:
    # Read the urls one by one
    for url in urls[0:len(urls)]:
        # HTTP
        if str(url)[0:7] == "http://":
            print "URL " + str(url)
            # Read the HTML of url
            soup = BeautifulSoup(urllib.urlopen(url).read(), "html.parser")
            # EXTRACT TEXT
            # kill all script and style elements
            for script in soup(["script", "style"]):
                script.extract()  # rip it out
            # get text
            text = soup.get_text().encode('utf-8')
            # break into lines and remove leading and trailing space on each
            lines = (line.strip() for line in text.splitlines())
            # break multi-headlines into a line each
            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
            # drop blank lines
            text = '\n'.join(chunk for chunk in chunks if chunk)
            # manually insert Slavic characters
            text = text.replace('ÄŤ', 'č')
            text = text.replace('ÄŤ', 'č')
            text = text.replace('ÄŚ', 'Č')
            text = text.replace('Ĺľ', 'ž')
            text = text.replace('Ĺľ', 'ž')
            text = text.replace('Ĺ˝', 'Ž')
            text = text.replace('Ĺ˝', 'Ž')
            text = text.replace('š', 'š')
            text = text.replace('š', 'š')
            text = text.replace('Ĺ ', 'Š')
            text = text.replace('Â', '')
            text = text.replace('–', '')
            # Write url to file.
            v.write(url)
            # Write delimiter between url and text
            v.write("__delimiter_*_between_*_url_*_and_*_text__")
            v.write(text)
            # Delimiter to separate contents. Stupid way of writing content to file but due to problems with čšž characters ...
            v.write("__delimiter_*_between_*_two_*_webpages__")
        # HTTPS
        elif str(url)[0:8] == "https://":
            print "URL " + str(url)
            r = requests.get(url, verify=True)
            html = r.text.encode('utf-8')
            #soup = BeautifulSoup(html, "lxml")
            soup = BeautifulSoup(html, "html.parser")
            # EXTRACT TEXT
            # kill all script and style elements
            for script in soup(["script", "style"]):
                script.extract()  # rip it out
            # get text
            text = soup.get_text().encode('utf-8')
            # break into lines and remove leading and trailing space on each
            lines = (line.strip() for line in text.splitlines())
            # break multi-headlines into a line each
            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
            # drop blank lines
            text = '\n'.join(chunk for chunk in chunks if chunk)
            # manually insert Slavic characters
            text = text.replace('ž', 'ž')
            text = text.replace('Ž', 'Ž')
            text = text.replace('Å¡', 'š')
            text = text.replace('Å ', 'Š')
            text = text.replace('Ä', 'č')
            #text = text.replace('•', '')
            # Write url to file.
            v.write(url)
            # Write delimiter between url and text
            v.write("__delimiter_*_between_*_url_*_and_*_text__")
            v.write(text)
            # Delimiter to separate contents. Stupid way of writing content to file but due to problems with čšž characters ...
            v.write("__delimiter_*_between_*_two_*_webpages__")
        else:
            print "URL ERROR"

print "--- %s seconds ---" % round((time.time() - start_time), 2)