How to split a PDF every n page using PyPDF2?

How to split a PDF every n page using PyPDF2? - pdf

I'm trying to learn how to split a PDF every n pages.
In my case I want to split a 64-page PDF into several chunks containing four pages each: file 1: p.1-4, file 2: p.5-8 etc.
I'm trying to understand PyPDF2 but my noobness overwhelms me:
from PyPDF2 import PdfFileWriter, PdfFileReader
pdf = PdfFileReader('my_pdf.pdf')
I guess I need to make a loop of sorts using addPage and write files till there's no pages left?

Little late but I ran into your question while looking for help trying to do the same thing.
I ended up doing the following, which does what you're asking. Mind you it's probably more than you're asking for, but the answer is in there. It's a rough first draft, in heavy need of refactoring and some variable renaming.
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
def split_pdf(in_pdf, step=1):
    """Split a pdf into consecutive pdfs of ``step`` pages each.

    The output files are saved to a subfolder of the input pdf's folder
    called ``<name>_splitted``, where ``<name>`` is the input filename
    without its extension. Each output file is named
    ``<name>_p<first>-<last>.pdf`` (``<name>_p<page>.pdf`` when it holds a
    single page), using 1-based page numbers of the input pdf.

    Arguments:
        in_pdf: [str] Absolute path (and filename) of the
                input pdf or just the filename, if the file
                is in the current directory.
        step:   [int] Desired number of pages in each of the
                output pdfs. The last output pdf holds the
                remaining pages when the page count is not a
                multiple of ``step``.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        ValueError: if ``step`` is smaller than 1.
    """
    #TODO: Add choice for output dir
    #TODO: Add logging instead of prints
    if step < 1:
        raise ValueError('step must be a positive number of pages')

    input_dir, filename = os.path.split(in_pdf)
    filename = os.path.splitext(filename)[0]
    # os.path.join keeps the folder relative when only a filename was given
    # (plain string concatenation produced "/<name>_splitted/", i.e. a folder
    # in the filesystem root). The trailing '' keeps the path separator at
    # the end so the printed messages look as before.
    output_dir = os.path.join(input_dir, f'{filename}_splitted', '')
    naming = f'{filename}_p'
    # Bound before the try block so the summary below also works when the
    # input file cannot be opened.
    count = 0

    try:
        with open(in_pdf, 'rb') as in_file:
            input_pdf = PdfFileReader(in_file)
            num_pages = input_pdf.numPages
            # exist_ok: running the split a second time must not fail.
            os.makedirs(output_dir, exist_ok=True)

            for first in range(0, num_pages, step):
                # Clamp so the last chunk (and its name) stops at the last page.
                last = min(first + step, num_pages)
                output_pdf = PdfFileWriter()
                for i in range(first, last):
                    output_pdf.addPage(input_pdf.getPage(i))

                if last - first == 1:
                    nums = f'{first + 1}'
                else:
                    nums = f'{first + 1}-{last}'

                with open(f'{output_dir}{naming}{nums}.pdf', 'wb') as outfile:
                    output_pdf.write(outfile)
                print(f'{naming}{nums}.pdf written to {output_dir}')
                count += 1
    except FileNotFoundError as err:
        print('Cannot find the specified file. Check your input:')
        print(err)

    print(f'{count} pdf files written to {output_dir}')
Hope it helps you.

from PyPDF2 import PdfFileReader, PdfFileWriter
import os
# Method to split the pdf at every given n pages.
def split_at_every(self, infile, step=1):
    """Split the pdf at ``infile`` into chunks of ``step`` pages and save each one.

    Each chunk is collected in a ``PdfFileWriter`` and handed to
    ``self.write_to_file(filename, writer)``. The filename is
    ``<name>_page_<n>.pdf``, where ``<name>`` is the input file's base name
    without extension and ``<n>`` is the 1-based number of the last page in
    the chunk. The final chunk holds the remaining pages when the page count
    is not a multiple of ``step``.

    Args:
        infile: Path of the pdf to split.
        step: Number of pages per output pdf.
    """
    # Base name of the input file, without directory and extension.
    fname = os.path.splitext(os.path.basename(infile))[0]

    # The input stays open until every chunk has been handed to write_to_file;
    # presumably the writer still pulls page data from this stream when it is
    # written out. The `with` block closes the handle afterwards (the original
    # never closed it).
    with open(infile, "rb") as pdf_file:
        input_pdf = PdfFileReader(pdf_file)
        # PdfFileReader exposes the page count as `numPages`.
        pdf_len = input_pdf.numPages

        # If there are 10 pages in a pdf and the step is 2, the chunks start
        # at pages 0, 2, 4, 6, 8 (0-based).
        for start in range(0, pdf_len, step):
            # Clamp the end so the last chunk stops at the last page.
            stop = min(start + step, pdf_len)
            output = PdfFileWriter()
            for page in range(start, stop):
                output.addPage(input_pdf.getPage(page))
            # Frame the output file name from the chunk's last page (1-based).
            output_filename = '{}_page_{}.pdf'.format(fname, stop)
            # Write the output content to the file and save it.
            self.write_to_file(output_filename, output)

Related

Getting an IndexError when trying to run pcolormesh from a pandas DataFrame

I'm trying to generate a pcolormesh plot from a large dataset, where the columns are in units of hertz, the rows are individual files, and the body is an array of magnitude values per file for each frequency. My DataFrame gets constructed correctly with correct labels, but when I pass it in to pcolormesh, it throws the exception "arrays used as indices must be of integer (or boolean) type". The code I am attaching reflects a conversion of the frequency array to an integer array using .astype(int). Note, if I convert the PSD_array (magnitudes) to integers, it DOES work (but isn't helpful), but it doesn't like it otherwise. I also played around with other pcolormesh generations using decimals as the body of the DataFrame and it worked fine.
Ideas would be lovely, I'll keep working on it.
Code: (Note: specific call file paths redacted).
'''def file_List():
# Lists the entries of the data directory (its path is redacted as '###' in
# this post) and returns them ordered by natsorted().
files = [file for file in os.listdir('###)]
# Element-by-element copy of `files` into a new list.
file_list = []
for file in files:
file_list += [file]
#print(natsorted(file_list))
#print(len(file_list))
return(natsorted(file_list))
###Using Fast Fourier Transform, take in file list generated from
###file_List program and perform FFT on each file.
###We read the files by adding them to the file directory. Could improve
###by making an overarching program that runs everything with an input
###file directory.
def FFT():
'''
runs through FFT for the files in a file list as determined by the
file_List() program.
'''
# Returns a DataFrame built from PSD_array: one row per appended spectrum,
# with the integer-truncated frequencies of the first spectrum as column labels.
file_list = file_List() #runs file_List() program and saves
#the list of files as a variable.
df = pd.DataFrame()
# Accumulators with zero rows and 204800 columns; np.append(axis=0) below
# assumes every file yields exactly 204800 samples -- TODO confirm.
freq_array = np.empty((0,204800))
PSD_array = np.empty((0,204800))
print(len(file_list))
count = 0
for file in file_list:
# NOTE(review): indentation was lost in this post. If this `while` is nested
# in the `for` as written, the first file is read ten times and the remaining
# files are skipped, because `count` is never reset -- confirm whether an
# `if`/`break` was intended.
while count < 10:
file_read = pd.read_csv('###'+file,skiprows=22,sep = '\t')
df = pd.DataFrame(file_read, columns = ['X_Value','Acceleration'])
#print(df.head())
q = df['Acceleration'] #data set input
n = len(df['Acceleration']) #number of data points
# presumably each recording spans 2 seconds, giving a sample spacing of 2/n -- TODO confirm
dt = 2/len(df['X_Value']) #
f_hat = np.fft.fft(q,n) #Runs FFT
# NOTE(review): f_hat * np.conj(f_hat) keeps a complex dtype even though its
# imaginary part is zero; this may be why pcolormesh rejects the data unless
# it is cast to int -- consider taking the real part; confirm.
PSD = f_hat * np.conj(f_hat) / n #Power Spectral Density
# With dt = 2/n the factor 1/(dt*n) equals 0.5, so the axis is 0, 0.5, 1.0, ...
freq = (1/(dt*n)) * np.arange(n) #Creates x axis of frequencies
freq_array = np.append(freq_array,np.array([freq]),axis=0)
PSD_array = np.append(PSD_array,np.array([PSD]),axis=0)
count += 1
#trans_freq = np.transpose(freq_array)
#trans_PSD = np.transpose(PSD_array)
print(freq_array)
# Truncates the frequencies to integers; with the 0.5 spacing above this
# produces pairs of identical column labels (0, 0, 1, 1, ...).
int_freq = freq_array.astype(int)
print(int_freq)
#PSD_int = PSD_array.astype(int)
# Rows are indexed 0..len(PSD_array)-1; columns come from the first row of int_freq.
PSD_df = pd.DataFrame(PSD_array, index = np.arange(len(PSD_array)), columns = int_freq[0])
#print(np.arange(len(PSD_array)))
print(PSD_df)
return(PSD_df)
def heatmap(df):
'''
Constructs a heatmap given an input dataframe
'''
# Hands the DataFrame straight to plt.pcolormesh; nothing is returned and no
# show/savefig call is made inside this function.
plt.pcolormesh(df)
'''

"Wrong" TF IDF Scores

I have 1000 .txt files and planned searching for various keywords and calculate their TF-IDF Score. But for some reason the results are > 1. I did a test with 2 .txt files then: "I am studying nfc" and "You don't need AI" . For nfc and AI the TF-IDF should be 0.25 but when I open the .csv it says 1.4054651081081644.
I must admit that I did not choose the most efficient way for the code. I think the mistake is with the folders since I originally planned to check the documents by their year (annual reports from 2000-2010). But I canceled those plans and decided to check all annual reports as a whole corpus. I think the folders workaround is the problem still. I placed the 2 txt. files into the folder "-". Is there a way to make it count right?
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from pathlib import Path
# root dir
root = '/Users/Tom/PycharmProjects/TextMining/'
#
# NOTE(review): TfidfVectorizer/CountVectorizer presumably lower-case tokens by
# default, in which case the vocabulary entry 'AI' can never match -- verify.
words_to_find = ['AI', 'nfc']
# tf_idf file writing
wrote_tf_idf_header = False
tf_idf_file_idx = 0
#
# norm=None: the rows are not normalised, so the values written to tf_idf.csv
# are raw tf * idf products.
# NOTE(review): with scikit-learn's default smoothing the idf is
# ln((1 + n) / (1 + df)) + 1, which is never below 1; for n = 2 documents and
# df = 1 that is about 1.405 -- confirm against the TfidfTransformer docs.
vectorizer_tf_idf = TfidfVectorizer(max_df=.80, min_df=1, stop_words='english', use_idf=True, norm=None, vocabulary=words_to_find, ngram_range=(1, 3))
vectorizer_cnt = CountVectorizer(stop_words='english', vocabulary=words_to_find, ngram_range=(1, 3))
#
# A single pseudo-"year" folder named "-" is processed.
years = ['-']
year_folders = [root + folder for folder in years]
# remove previous results file
if os.path.isfile('summary.csv'):
os.remove('summary.csv')
if os.path.isfile('tf_idf.csv'):
os.remove('tf_idf.csv')
#process every folder (for every year)
for year_idx, year_folder in enumerate(year_folders):
# get file paths in folder
file_paths = []
# rglob also descends into sub-folders of year_folder
for file in Path(year_folder).rglob("*.txt"):
file_paths.append(file)
# count of files for each year
file_cnt = len(file_paths)
# read every file's text as string
docs_per_year = []
# whitespace-separated word total over all files of this folder
words_in_folder = 0
for txt_file in file_paths:
with open(txt_file, encoding='utf-8', errors="replace") as f:
txt_file_as_string = f.read()
words_in_folder += len(txt_file_as_string.split())
docs_per_year.append(txt_file_as_string)
#
# One row per document, one column per entry of words_to_find.
tf_idf_documents_as_array = vectorizer_tf_idf.fit_transform(docs_per_year).toarray()
# tf_idf_documents_as_array = vectorizer_tf_idf.fit_transform([' '.join(docs_per_year)]).toarray()
#
cnt_documents_as_array = vectorizer_cnt.fit_transform(docs_per_year).toarray()
#
with open('summary.csv', 'a') as f:
f.write('Index;Term;Count;Df;Idf;Rel. Frequency\n')
for idx, word in enumerate(words_to_find):
# total number of occurrences of the term over all documents
abs_freq = cnt_documents_as_array[:, idx].sum()
# NOTE(review): the header above lists "Count;Df", but the values are written
# as number-of-documents-containing-the-term (count_nonzero) first and the
# total count (abs_freq) second -- the two columns look swapped.
f.write('{};{};{};{};{};{}\n'.format(idx + 1,
word,
np.count_nonzero(cnt_documents_as_array[:, idx]),
abs_freq,
vectorizer_tf_idf.idf_[idx],
abs_freq / words_in_folder))
f.write('\n')
with open('tf_idf.csv', 'a') as f:
# The header is written only once, guarded by wrote_tf_idf_header.
if not wrote_tf_idf_header:
f.write('{}\n'.format(years[year_idx]))
f.write('Index;Year;File;')
for word in words_to_find:
f.write('{};'.format(word))
f.write('Sum\n')
wrote_tf_idf_header = True
# One csv row per document: running index, year label, file name, the score
# of every search term, then the sum of that document's scores.
for idx, tf_idfs in enumerate(tf_idf_documents_as_array):
f.write('{};{};{};'.format(tf_idf_file_idx, years[year_idx], file_paths[idx].name))
for word_idx, _ in enumerate(words_to_find):
f.write('{};'.format(tf_idf_documents_as_array[idx][word_idx]))
f.write('{}\n'.format(sum(tf_idf_documents_as_array[idx])))
tf_idf_file_idx += 1
print()

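I think the mistake is that you are defining the norm as norm=None, while it should be 'l1' or 'l2' as specified in the documentation. With norm=None the raw tf·idf product is written out, and scikit-learn's smoothed idf is ln((1 + n) / (1 + df)) + 1; for n = 2 documents and df = 1 this gives ln(1.5) + 1 ≈ 1.405, which is exactly the value you see.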

How can I use a loop to apply a function to a list of csv files?

I'm trying to loop through all files in a directory and add "indicator" data to them. I had the code working where I could select 1 file and do this, but now am trying to make it work on all files. The problem is when I make the loop it says
ValueError: Invalid file path or buffer object type: <class 'list'>
The goal would be for each loop to read another file from list, make changes, and save file back to folder with changes.
Here is complete code w/o imports. I copied 1 of the "file_path"s from the list and put in comment at bottom.
### open dialog to select file
#file_path = filedialog.askopenfilename()
###create list from dir
listdrs = os.listdir('c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/Sentdex Tutorial/stock_dfs/')
###append full path to list
string = 'c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/Sentdex Tutorial/stock_dfs/'
listdrs_path = [ string + x for x in listdrs]
print (listdrs_path)
###start loop, for each "file" in listdrs run the 2 functions below and overwrite saved csv.
for file in listdrs_path:
# NOTE(review): this assigns the whole list, not the loop variable `file`, so
# the read_csv call on the next line receives a list.
file_path = listdrs_path
data = pd.read_csv(file_path, index_col=0)
########################################
####function 1
# Not called anywhere in this snippet; it ignores `ticker` and reads `file_path`.
def get_price_hist(ticker):
# Put stock price data in dataframe
data = pd.read_csv(file_path)
#listdr = os.listdir('Users\17409\AppData\Local\Programs\Python\Python38\Indicators\Sentdex Tutorial\stock_dfs')
# NOTE(review): `listdr` is only assigned in the commented-out line above, so
# this print would raise NameError if the function were called.
print(listdr)
# Convert date to timestamp and make index
data.index = data["Date"].apply(lambda x: pd.Timestamp(x))
data.drop("Date", axis=1, inplace=True)
return data
df = data
##print(data)
######Indicator data#####################
# Adds the MACD, MA10/MA30 and RSI columns to `data` in place and returns it.
def get_indicators(data):
# Get MACD
data["macd"], data["macd_signal"], data["macd_hist"] = talib.MACD(data['Close'])
# Get MA10 and MA30
data["ma10"] = talib.MA(data["Close"], timeperiod=10)
data["ma30"] = talib.MA(data["Close"], timeperiod=30)
# Get RSI
data["rsi"] = talib.RSI(data["Close"])
return data
#####end functions#######
data2 = get_indicators(data)
print(data2)
# Overwrites the path held in file_path with the augmented data.
data2.to_csv(file_path)
###################################################
#here is an example of what path from list looks like
#'c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/Sentdex Tutorial/stock_dfs/A.csv'

The problem is in the first two lines of your loop body. The filename is in the loop variable file, but you pass file_path to pd.read_csv, and file_path has been assigned the whole file list (listdrs_path). Because of this you are getting the ValueError. Try this:
### open dialog to select file
#file_path = filedialog.askopenfilename()

###directory that holds one csv file per ticker
string = 'c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/Sentdex Tutorial/stock_dfs/'
###create list from dir (csv files only, so sub-folders or stray files are not fed to read_csv)
listdrs = [x for x in os.listdir(string) if x.lower().endswith('.csv')]
###append full path to list
listdrs_path = [os.path.join(string, x) for x in listdrs]
print (listdrs_path)


########################################
####function 1
def get_price_hist(ticker):
    """Load the csv at the current ``file_path`` with "Date" as a timestamp index.

    ``ticker`` is kept for compatibility but is not used; the file that is
    read is whatever the module-level ``file_path`` points to at call time.
    """
    # Put stock price data in dataframe
    data = pd.read_csv(file_path)
    # Convert date to timestamp and make index
    data.index = data["Date"].apply(lambda x: pd.Timestamp(x))
    data.drop("Date", axis=1, inplace=True)
    return data


######Indicator data#####################
def get_indicators(data):
    """Add MACD, MA10, MA30 and RSI columns (computed from "Close") to ``data``.

    The frame is modified in place and also returned.
    """
    # Get MACD
    data["macd"], data["macd_signal"], data["macd_hist"] = talib.MACD(data['Close'])
    # Get MA10 and MA30
    data["ma10"] = talib.MA(data["Close"], timeperiod=10)
    data["ma30"] = talib.MA(data["Close"], timeperiod=30)
    # Get RSI
    data["rsi"] = talib.RSI(data["Close"])
    return data
#####end functions#######


###start loop, for each file in listdrs_path add the indicators and overwrite saved csv.
# The functions are defined once above instead of being re-created on every
# iteration; the loop variable itself is the path handed to read_csv.
for file_path in listdrs_path:
    data = pd.read_csv(file_path, index_col=0)
    data2 = get_indicators(data)
    print(data2)
    data2.to_csv(file_path)
Let me know if it helps.

Break document sections into list for export Python

I am very new to Python, and I am trying to break some legal documents into sections for export into SQL. I need to do two things:
Define the section numbers by the table of contents, and
Break up the document given the defined section numbers
The table of contents lists section numbers: 1.1, 1.2, 1.3, etc.
Then the document itself is broken up by those section numbers:
1.1 "...Text...",
1.2 "...Text...",
1.3 "...Text...", etc.
Similar to the chapters of a book, but delimited by ascending decimal numbers.
I have the document parsed using Tika, and I've been able to create a list of sections with some basic regex:
import tika
import re
from tika import parser
parsed = parser.from_file('test.pdf')
content = (parsed["content"])
# Pattern: zero or more digits, a literal dot, then exactly one digit. It
# therefore captures only the first digit after the dot ("1.12" yields "1.1")
# and also matches any other decimal number in the text, not just headings.
headers = re.findall("[0-9]*[.][0-9]",content)
Now I need to do something like this:
splitsections = content.split() by headers
var_string = ', '.join('?' * len(splitsections))
query_string = 'INSERT INTO table VALUES (%s);' % var_string
cursor.execute(query_string, splitsections)
Sorry if all this is unclear. Still very new to this.
Any help you can provide would be most appreciated.

Everything tested except the last part with DB. Also the code can be improved, but this is another task. The main task is done.
In the list split_content there are all pieces of info you wanted (i.e. the text between 2.1 and 2.2, then 2.2 and 2.3, and so on, EXCLUDING num+name of sections itself (i.e. excluding 2.1 Continuation, 2.2 Name and so on).
I replaced tika by PyPDF2, as tika does not provide instruments needed for this task (i.e. I did not find how to provide the num of page I need and get its content).
def get_pdf_content(pdf_path,
                    start_page_table_contents, end_page_table_contents,
                    first_parsing_page, last_phrase_to_stop,
                    cursor=None):
    """
    Split the body of a PDF into the sections listed in its table of contents.

    The section headings ("2.1 Continuation", "2.2 Name", ...) are read from
    the table-of-contents pages. All pages from ``first_parsing_page`` onwards
    are then merged into one string, and the text between each heading and the
    next one is collected. The headings themselves are not part of the
    collected pieces.

    :param pdf_path: Full path to the PDF file
    :param start_page_table_contents: The page where the "Contents table" starts
    :param end_page_table_contents: The page where the "Contents Table" ends
        (i.e. the number of the page where Contents Table ENDs, i.e. not the next one)
    :param first_parsing_page: The 1st page where we need to start data grabbing
    :param last_phrase_to_stop: The phrase that tells the code where to stop grabbing.
        The phrase must match exactly what is written in PDF.
        This phrase will be excluded from the grabbed data.
    :param cursor: Optional DB-API cursor. When given, every piece is inserted
        as a one-column row (this part is untested, as in the original answer).
    :return: List with the text of every section, in table-of-contents order.
    :raises ValueError: if a section heading (or the stop phrase) cannot be
        found in the merged page text.
    """
    section_pattern = re.compile(r"[\d]+[.][\d][™\s\w;,-]+")

    # The file is only needed while pages are extracted; `with` closes it.
    with open(pdf_path, "rb") as open_file:
        pdf = PyPDF2.PdfFileReader(open_file)

        # ======== GRAB TABLE OF CONTENTS ========
        sections_of_articles = []  # ['2.1 Continuation', '2.2 Name', ... ]
        # Page numbers are 1-based for the caller and 0-based for PyPDF2.
        for page_num in range(start_page_table_contents - 1, end_page_table_contents):
            page_content = pdf.getPage(page_num).extractText()
            for section in section_pattern.findall(page_content):
                sections_of_articles.append(section.replace('\n', '').strip())

        # ======== GRAB ALL NECESSARY CONTENT (MERGE ALL PAGES) ========
        cleared_pages = []
        for parsing_page in range(first_parsing_page - 1, pdf.getNumPages()):
            cleared_page = pdf.getPage(parsing_page).extractText().replace('\n', '')
            # Remove the page number from the start of the page text.
            # Covers pages whose text starts with, for example,
            # "616.6 Liability to Partners. (a) It is understood that",
            # i.e. "61" is the page num and "6.6 Liability ..." is the section data:
            # if a known heading sits near the start, cut everything before it.
            page_head = cleared_page[:51]
            for section in sections_of_articles:
                if section in page_head:
                    cleared_page = cleared_page[cleared_page.index(section):]
                    break
            else:
                # Covers all other cases: drop a leading run of digits.
                page_num_to_remove = re.match(r'^\d+', cleared_page)
                if page_num_to_remove:
                    cleared_page = cleared_page[page_num_to_remove.end():]
            cleared_pages.append(cleared_page)

    full_parsing_content = ''.join(cleared_pages)

    # ======== BREAK ALL CONTENT INTO PIECES ACCORDING TO TABLE CONTENTS ========
    split_content = []
    last_index = len(sections_of_articles) - 1
    for num_section, start in enumerate(sections_of_articles):
        # The last heading has no following heading, so the caller-supplied
        # stop phrase closes it.
        if num_section == last_index:
            end = last_phrase_to_stop
        else:
            end = sections_of_articles[num_section + 1]

        # Both delimiters are literal text, so they are escaped: headings
        # contain "." and a phrase such as "[Signature Page Follows]" would
        # otherwise be read as a character class. The non-greedy group stops
        # at the FIRST occurrence of `end` after `start`.
        found = re.search('%s(.*?)%s' % (re.escape(start), re.escape(end)),
                          full_parsing_content)
        if found is None:
            raise ValueError(
                'Could not find the text between %r and %r' % (start, end))

        cleared_piece = found.group(1).replace('™', "'").strip()
        # Drop the ". " left over when the heading ends with a full stop.
        if cleared_piece.startswith('. '):
            cleared_piece = cleared_piece[2:]
        split_content.append(cleared_piece)

    # ======== INSERT TO DB ========
    # Did not test this section. One row per piece, passed as a bound
    # parameter; "table" is the placeholder table name from the question.
    if cursor is not None:
        for piece in split_content:
            cursor.execute('INSERT INTO table VALUES (?);', (piece,))

    return split_content
How to use: (one of the possible way):
1) Save the code above in my_pdf_code.py
2) In the python shell:
import path.to.my_pdf_code as the_code
the_code.get_pdf_content('/home/username/Apollo_Investment_Fund_VIII_LPA_S1.pdf', 2, 4, 24, '[Signature Page Follows]')

How to see link extension before downloading the content?

I have a question that I think is quite interesting. I have collected lots of links with web scraping and I would like to download the content from normal links, so I ignored all links with extensions like .PDF, .avi, .jpeg and similar during the scraping phase.
So I have a list of scraped links without extensions, but when I start to
download the content, some of them turn out to be a PDF, music file, image or MS Word document. How can I ignore them and foresee that "hidden" extension of the link before downloading the content?
Examples:
PDF: http://www.komunala-radovljica.si/library/includes/file.asp?FileId=168
PDF: http://www.hyundai.si/files/9861/HY-Mursak15_204x280-Motorevija_TISK.pdf?download
(Here I should probably look for string ".PDF" in the link)
MS Word: http://www.plinarna-maribor.si/bin?bin.svc=obj&bin.id=2D7F844C-C294-34B6-CECC-A65C2ADCF92A
Image: http://www.ddmaribor.si/index.php/fotografije/70-lep-literarnoglasbeni-vecer-s-ferijem-lainsckom/detail/1874-lep-literarnoglasbeni-vecer-s-ferijem-lainsckom?tmpl=component&phocadownload=2
MP4: http://www.hyundai.si/files/9865/Hyundai_Hokej_Mursak_Zvok_17sek_MP4.mp4?download
(here I should probably look for string "MP4" in the link)
CSS: http://global.careers.ppg.com/CMSPages/GetResource.ashx?stylesheetname=CPJobsLayout
My code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# encoding=UTF-8
#
# DOWNLOADER
# To grab the text content of webpages and save it to TinyDB database.
import re, time, urllib, requests, bs4
from bs4 import BeautifulSoup
start_time = time.time()
# Open file with urls.
with open("Q:/SIIT/JV_Marko_Boro/Detector/test_podjetja_2015/podjetja_0_100_url_test.txt") as f:
# readlines() keeps the trailing newline of every url.
urls = f.readlines()
# Open file to write content to.
with open("Q:/SIIT/JV_Marko_Boro/Detector/test_podjetja_2015/podjetja_0_100_vsebina_test.txt", 'wb') as v:
# Read the urls one by one
# (the slice 0:len(urls) covers the whole list)
for url in urls[0:len(urls)]:
# HTTP
if str(url)[0:7] == "http://":
print "URL " + str(url)
# Read the HTML of url
# NOTE(review): the whole response body is read and parsed as HTML without
# looking at the response headers first; inspecting the Content-Type header
# before reading may be the place to skip PDFs, images etc. -- confirm.
soup = BeautifulSoup(urllib.urlopen(url).read(), "html.parser")
# EXTRACT TEXT
# kill all script and style elements
for script in soup(["script", "style"]):
script.extract() # rip it out
# get text
text = soup.get_text().encode('utf-8')
# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)
# manually insert Slavic characters
text = text.replace('Ă„Ĺ¤', 'č')
text = text.replace('ÄŤ', 'č')
text = text.replace('ÄŚ', 'Č')
text = text.replace('ÄąÄľ', 'ž')
text = text.replace('Ĺľ', 'ž')
text = text.replace('ÄąËť', 'Ž')
text = text.replace('Ĺ˝', 'Ž')
text = text.replace('Ĺˇ', 'š')
text = text.replace('ÄąË‡', 'š')
text = text.replace('Ĺ ', 'Š')
text = text.replace('Â', '')
text = text.replace('â€“', '')
# Write url to file.
v.write(url)
# Write delimiter between url and text
v.write("__delimiter_*_between_*_url_*_and_*_text__")
v.write(text)
# Delimiter to separate contents. Stupid way of writing content to file but due to problems with čšž characters ...
v.write("__delimiter_*_between_*_two_*_webpages__")
# HTTPS
# This branch fetches with requests (certificate verification on) instead of
# urllib.
elif str(url)[0:8] == "https://":
print "URL " + str(url)
r = requests.get(url, verify=True)
# r.text is the decoded body; it is re-encoded to UTF-8 bytes before parsing.
# NOTE(review): r.headers is not inspected, so non-HTML responses are parsed
# like pages -- confirm whether a Content-Type check belongs here.
html = r.text.encode('utf-8')
#soup = BeautifulSoup(html, "lxml")
soup = BeautifulSoup(html, "html.parser")
# EXTRACT TEXT
# kill all script and style elements
for script in soup(["script", "style"]):
script.extract() # rip it out
# get text
text = soup.get_text().encode('utf-8')
# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)
# manually insert Slavic characters
text = text.replace('Å¾', 'ž')
text = text.replace('Å½', 'Ž')
text = text.replace('Å¡', 'š')
text = text.replace('Å ', 'Š')
text = text.replace('Ä', 'č')
#text = text.replace('â€˘', '')
# Write url to file.
v.write(url)
# Write delimiter between url and text
v.write("__delimiter_*_between_*_url_*_and_*_text__")
v.write(text)
# Delimiter to separate contents. Stupid way of writing content to file but due to problems with čšž characters ...
v.write("__delimiter_*_between_*_two_*_webpages__")
else:
print "URL ERROR"
print "--- %s seconds ---" % round((time.time() - start_time),2)

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

How to split a PDF every n page using PyPDF2? - pdf

Related

Getting an IndexError when trying to run pcolormesh from a pandas DataFrame

"Wrong" TF IDF Scores

How can I use a loop to apply a function to a list of csv files?

Break document sections into list for export Python

How to see link extension before downloading the content?

Categories

Resources