Using BeautifulSoup to exploit a URL and its dependent pages and store results in csv? - pandas

This code does not crash, which is good. However, it generates an empty icao_publications.csv file. I want to populate icao_publications.csv with all records from all the pages at the URL and capture every page. The dataset should be about 10,000 rows in all.
I want to get these 10,000 or so rows into the csv file.
import requests, csv
from bs4 import BeautifulSoup

url = 'https://www.icao.int/publications/DOC8643/Pages/Search.aspx'

with open('Test1_Aircraft_Type_Designators.csv', "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Manufacturers", "Model", "Type_Designator", "Description", "Engine_Type", "Engine_Count", "WTC"])
    while True:
        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'html.parser')
        for row in soup.select('table tbody tr'):
            writer.writerow([c.text if c.text else '' for c in row.select('td')])
        if soup.select_one('li.paginate_button.active + li a'):
            url = soup.select_one('li.paginate_button.active + li a')['href']
        else:
            break

Here you go:
import requests
import pandas as pd

# The search page is populated from a JSON endpoint, so request that directly
url = 'https://www4.icao.int/doc8643/External/AircraftTypes'
resp = requests.post(url).json()

df = pd.DataFrame(resp)
df.to_csv('aircraft.csv', encoding='utf-8', index=False)
print('Saved to aircraft.csv')
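If you want to sanity-check the result against the roughly 10,000 rows mentioned in the question, you can inspect the DataFrame before writing it out. A minimal sketch; the exact column names depend on the JSON the endpoint returns:
# Assumes `df` was built from the JSON response as above
print(len(df))                 # row count; expected to be on the order of 10,000
print(df.columns.tolist())     # column names come straight from the JSON keys
print(df.head())               # preview the first few records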

Related

Pandas: Creating multiple new columns from function with multiple output values

I'm trying to scrape a website for multiple values for a list of books. The links to the book pages are stored in a dataframe. Now I need a function that iterates over those links and adds the book values to new columns in the dataframe. I don't want to request the page again every time I scrape a new book value, so I want to do it all in one function.
The problem is that the function then returns multiple values (e.g. book_title and book_rating), which I don't know how best to add to the dataframe.
I tried the following, which I know can't work but I'm stuck:
import requests as rq
from bs4 import BeautifulSoup as bs
import pandas as pd

# function to get the book page
def get_book_page(page):
    # Construct the URL
    books_page_url = page
    # Get the HTML page content using requests
    # (`headers` is assumed to be defined elsewhere, e.g. a User-Agent dict)
    response = rq.get(books_page_url, headers=headers)
    # Ensure that the response is valid
    if response.status_code != 200:
        print('Status code:', response.status_code)
        raise Exception('Failed to fetch web page ' + books_page_url)
    # Construct a beautiful soup document
    doc = bs(response.content, "html.parser")
    return doc

# function to scrape the book title
def scrape_book_title(book_content):
    try:
        title_tag = book_content.find("h1", class_="bc-heading bc-color-base bc-size-large bc-text-bold").text.strip()
    except:
        title_tag = "fehlt"
    return title_tag

# function to scrape the book rating
def scrape_book_rating(book_content):
    star_tag = book_content.find("li", class_="bc-list-item ratingsLabel")
    try:
        rating_tag = star_tag.find("span", class_="bc-text bc-pub-offscreen").text.strip()
    except:
        rating_tag = "fehlt"
    return rating_tag

# function I'm trying to fix
def get_book_title(links):
    bs_page = get_book_page(links)
    bs_content = bs_page.find("ul", class_="bc-list bc-spacing-s2 bc-color-secondary bc-list-nostyle")
    book_title = scrape_book_title(bs_content)
    book_rating = scrape_book_rating(bs_content)
    return book_title, book_rating

# here I would like to add the columns "A_Titel" and "A_Rating" with the values of "book_title" and "book_rating"
df['A_Titel'], df['A_Rating'] = df.apply(lambda x: get_book_title(x.Link), axis=1)
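One way to get both returned values into two new columns is to let apply expand the returned tuple into columns instead of unpacking the whole result on the left-hand side. A minimal sketch, assuming df has a Link column and get_book_title is defined as above:
# result_type='expand' turns the (title, rating) tuple returned per row into two columns
df[['A_Titel', 'A_Rating']] = df.apply(lambda x: get_book_title(x.Link), axis=1, result_type='expand')

# Equivalent alternative: compute the pairs once, then unzip them
# pairs = df['Link'].apply(get_book_title)
# df['A_Titel'], df['A_Rating'] = zip(*pairs)
This way the page is still fetched only once per link, and both values from that single request end up in their own columns.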

How to improve the speed of getting request content via the requests module

The functions below extract content from 'http://thegreyhoundrecorder.com.au/form-guides/' and append it all to a list. They work fine, although the speed at which the content is scraped from the website is slow. The line tree = html.fromstring(page.content) in particular slows down the process. Is there a way I can improve the speed of my requests?
import lxml
from lxml import html
import requests
import re
import pandas as pd
from requests.exceptions import ConnectionError

greyhound_url = 'http://thegreyhoundrecorder.com.au/form-guides/'

def get_page(url):
    """fxn takes a page url and returns the links to the articles (Field) we
    want to scrape, in a list.
    """
    page = requests.get(url)
    tree = html.fromstring(page.content)
    my_list = tree.xpath('//tbody/tr/td[2]/a/@href')  # grab all links
    print('Length of all links = ', len(my_list))
    my_url = [page.url.split('/form-guides')[0] + str(s) for s in my_list]
    return my_url

def extract_data(my_url):
    """
    fxn takes a list of urls and extracts the needed information from the
    greyhound website.
    return: a list with the extracted fields
    """
    new_list = []
    try:
        for t in my_url:
            print(t)
            page_detail = requests.get(t)
            tree_1 = html.fromstring(page_detail.content)
            title = ''.join(tree_1.xpath('//div/h1[@class="title"]/text()'))
            race_number = tree_1.xpath("//tr[@id = 'tableHeader']/td[1]/text()")
            Distance = tree_1.xpath("//tr[@id = 'tableHeader']/td[3]/text()")
            TGR_Grade = tree_1.xpath("//tr[@id = 'tableHeader']/td[4]/text()")
            TGR1 = tree_1.xpath("//tbody/tr[@class='fieldsTableRow raceTipsRow']//div/span[1]/text()")
            TGR2 = tree_1.xpath("//tbody/tr[@class='fieldsTableRow raceTipsRow']//div/span[2]/text()")
            TGR3 = tree_1.xpath("//tbody/tr[@class='fieldsTableRow raceTipsRow']//div/span[3]/text()")
            TGR4 = tree_1.xpath("//tbody/tr[@class='fieldsTableRow raceTipsRow']//div/span[4]/text()")
            clean_title = title.split(' ')[0].strip()
            # clean title and extract track number
            Track = title.split(' ')[0].strip()
            # clean title and extract track date
            date = title.split('-')[1].strip()
            # clean title and extract track year
            year = pd.to_datetime('now').year
            # convert date to pandas datetime
            race_date = pd.to_datetime(date + ' ' + str(year)).strftime('%d/%m/%Y')
            # extract race number
            new_rn = []
            for number in race_number:
                match = re.search(r'^(.).*?(\d+)$', number)
                new_rn.append(match.group(1) + match.group(2))
            new_list.append((race_date, Track, new_rn, Distance, TGR_Grade, TGR1, TGR2, TGR3, TGR4))
        return new_list
    except ConnectionError as e:
        print('Connection error, connect to a stronger network or reload the page')
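The main cost here is that every detail page is fetched one after another; parsing with lxml is comparatively cheap. A common way to speed this up is to issue the requests concurrently with concurrent.futures from the standard library. A minimal sketch, reusing get_page, greyhound_url and the xpath expressions from the code above; fetch_tree and extract_all are illustrative names, and the per-field extraction is abbreviated:
import requests
from lxml import html
from concurrent.futures import ThreadPoolExecutor

def fetch_tree(url):
    # Fetch one detail page and parse it; runs in a worker thread
    page_detail = requests.get(url)
    return url, html.fromstring(page_detail.content)

def extract_all(urls, max_workers=10):
    results = []
    # Issue up to max_workers requests at a time instead of strictly sequentially
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for url, tree_1 in executor.map(fetch_tree, urls):
            title = ''.join(tree_1.xpath('//div/h1[@class="title"]/text()'))
            results.append((url, title))  # extend with the other xpath fields as above
    return results

my_url = get_page(greyhound_url)
print(extract_all(my_url))
The parsing itself stays single-threaded; only the network waits overlap, which is usually where most of the time goes.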

Emails Scraping using BeautifulSoup from a list of URLs

Below is a simple script using BeautifulSoup to scrape emails from a single website. How do I modify the script if I have a list of URLs saved in an Excel file, and then save the results into a csv file?
I am wondering whether I should read the list of URLs using pandas so it gets converted to a DataFrame?
from bs4 import BeautifulSoup
import re
import csv
from urllib.request import urlopen

f = urlopen('http://www.nus.edu.sg/contact')
s = BeautifulSoup(f, 'html.parser')
s = s.get_text()

phone = re.findall(r"((?:\d{3}|\(\d{3}\))?(?:\s|-|\.)?\d{3}(?:\s|-|\.)\d{4})", s)
emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,3}", s)

if len(phone) == 0:
    print("Sorry, no phone number found.")
    print('------------')
    print()
else:
    count = 1
    for item in phone:
        print(count, ' phone number(s) found : ', item)
        count += 1
    print('------------')
    print()

if len(emails) == 0:
    print("Sorry, no email address found.")
    print('------------')
    print()
else:
    count = 1
    for item in emails:
        print(count, ' email address(es) found : ', item)
        count += 1
findAll / find_all can search for text based on a regex pattern.
You can use re.compile(email_pattern) and then pass that to findAll: findAll(text=email_pattern).
The email pattern used here follows RFC 5322.
Email scraping from a single URL
from bs4 import BeautifulSoup, Comment
import re
from urllib.request import urlopen

email_pattern = re.compile(r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""")

f = urlopen('http://www.nus.edu.sg/contact').read()
soup = BeautifulSoup(f, 'html5lib')
emails = [x for x in soup.findAll(text=email_pattern) if not isinstance(x, Comment)]
print(emails)
Output:
[' oarconnect@nus.edu.sg', ' ocssec@nus.edu.sg', ' dpo@nus.edu.sg', ' enterprise@nus.edu.sg', ' ohrsharedservices@nus.edu.sg', ' itcare@nus.edu.sg', ' askalib@nus.edu.sg', ' ofmhelp@nus.edu.sg', ' nus.comms@nus.edu.sg', ' whistleblow@nus.edu.sg']
Reading urls from excel and saving in csv
You can just read the urls from the excel file column, loop over each one, get the emails, and write them to the csv file. You don't have to use pandas for this (although you can); you can use openpyxl to read the excel file.
websites.xlsx
Code
from bs4 import BeautifulSoup, Comment
import re
import csv
from urllib.request import urlopen
from openpyxl import load_workbook

email_pattern = re.compile(r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""")

# The source xlsx file is named websites.xlsx
wb = load_workbook("websites.xlsx")
ws = wb.active
# The default name of the first column is A
# change the column letter if your links are in a different column
first_column = ws['A']

with open('output.csv', 'w') as output_file:
    writer = csv.writer(output_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for x in range(len(first_column)):
        link = first_column[x].value.strip()
        f = urlopen(link).read()
        soup = BeautifulSoup(f, 'html5lib')
        emails = [x for x in soup.findAll(text=email_pattern) if not isinstance(x, Comment)]
        # Add the link also
        emails.insert(0, link)
        writer.writerow(emails)
output.csv
http://www.nus.edu.sg/contact, oarconnect@nus.edu.sg, ocssec@nus.edu.sg, dpo@nus.edu.sg, enterprise@nus.edu.sg, ohrsharedservices@nus.edu.sg, itcare@nus.edu.sg, askalib@nus.edu.sg, ofmhelp@nus.edu.sg, nus.comms@nus.edu.sg, whistleblow@nus.edu.sg
https://sloanreview.mit.edu/contact/,smrfeedback@mit.edu,smr-help@mit.edu
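If you prefer the pandas route the question mentions, the same loop can read the URL column with pandas.read_excel instead of openpyxl. A minimal sketch, reusing urlopen, BeautifulSoup, Comment and email_pattern from the code above, and assuming the spreadsheet has the links in its first column with no header row:
import csv
import pandas as pd

# Read the first column of websites.xlsx into a Series of links
# (reading .xlsx with pandas still needs openpyxl installed under the hood)
urls = pd.read_excel("websites.xlsx", header=None)[0].dropna().str.strip()

rows = []
for link in urls:
    f = urlopen(link).read()
    soup = BeautifulSoup(f, 'html5lib')
    emails = [x for x in soup.findAll(text=email_pattern) if not isinstance(x, Comment)]
    rows.append([link] + emails)

# Rows can differ in length, so write them with the csv module rather than DataFrame.to_csv
with open('output.csv', 'w', newline='') as output_file:
    csv.writer(output_file).writerows(rows)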
Ref:
How to validate an email address using a regular expression?

How to get the value of a cell in an HTML page for each link in a list of links?

I have a list of about 5000 links.
For example, 2 of the 5000 links:
https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019
https://racevietnam.com/runner/drtungnguyen83/ecopark-marathon-2019
...
I want to get the value in the Time Of Day column on the Finish row of each link.
For example:
09:51:07 AM - https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019
07:50:55 AM - https://racevietnam.com/runner/ngocsondknb/ecopark-marathon-2019
I have scraped user info from another website before, where the elements had an id and class, like the code below. But the table at https://racevietnam.com/runner/ngocsondknb/ecopark-marathon-2019 has no id or class, so I can't do the same there.
#!/usr/bin/python
from urllib.request import urlopen
from bs4 import BeautifulSoup

list_user = []
for userID in range(1, 100000):
    link = "https://example.com/member.php?u=" + str(userID)
    html = urlopen(link)
    bsObj = BeautifulSoup(html, "lxml")
    user_name = bsObj.find("div", {"id": "main_userinfo"}).h1.get_text()
    list_user.append(user_name)
    print("username", userID, "is: ", user_name)
    with open("result.txt", "a") as myfile:
        myfile.write(user_name)
Please help me.
Thank you.
Using bs4 4.7.1.
There is only one table and you want the second column (td) of the last row. You can use :last-child to select the last row; it should be used in conjunction with the tbody type selector and the > child combinator, so as not to pick up the header row. You can use :nth-of-type to specify which td cell to return.
Now you may wish to develop this in at least two ways:
Handle cases where an element is not found, e.g.
name = getattr(soup.select_one('title'), 'text', 'N/A')
timing = getattr(soup.select_one('tbody > tr:last-child td:nth-of-type(2)'), 'text', 'N/A')
Add items to a list or other data structure, which can be output as a dataframe at the end and written out as csv (a sketch of this follows the code below). Or you may wish to stick with your current method.
Python:
import requests
from bs4 import BeautifulSoup as bs

urls = ['https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019', 'https://racevietnam.com/runner/drtungnguyen83/ecopark-marathon-2019']

with requests.Session() as s:
    for url in urls:
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        name = soup.select_one('title').text
        timing = soup.select_one('tbody > tr:last-child td:nth-of-type(2)').text
        print(name, timing)
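A minimal sketch of the second suggestion above, collecting the results into a list and writing them out as a dataframe at the end; the column names are illustrative:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

urls = ['https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019',
        'https://racevietnam.com/runner/drtungnguyen83/ecopark-marathon-2019']

results = []
with requests.Session() as s:
    for url in urls:
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        # getattr keeps the loop alive when a selector finds nothing
        name = getattr(soup.select_one('title'), 'text', 'N/A')
        timing = getattr(soup.select_one('tbody > tr:last-child td:nth-of-type(2)'), 'text', 'N/A')
        results.append({'url': url, 'name': name, 'finish_time_of_day': timing})

pd.DataFrame(results).to_csv('finish_times.csv', index=False, encoding='utf-8')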
This is my code.
It's working Ok.
import requests
from bs4 import BeautifulSoup

f = open("input.ecopark", "r")
f_content = f.readlines()
f.close()

for url in f_content:
    r = requests.get(url.rstrip())
    soup = BeautifulSoup(r.text, 'html.parser')
    result = soup.select("table tbody tr td")
    x = ""
    for i in result:
        if not x:
            if i.get_text() == "Finish":
                x = 1
                continue
        if x:
            print(url.rstrip() + " " + i.get_text())
            break

automatic crawling web site

I got help here before to crawl law.go.kr with the code below.
I'm trying to crawl other websites like http://lawbot.org, http://law.go.kr, and https://casenote.kr.
But the problem is that I have little understanding of HTML...
I understood all of the code below and how to get the HTML address for it, but it's different on the other websites...
I want to know how to use the code below to crawl other web pages.
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # Using requests, fetch the list page. pg=1 is the page number, outmax is the number of items per page
    response = requests.post(
        "http://law.go.kr/precScListR.do?q=*&section=evtNm&outmax=79329&pg=1&fsort=21,10,30&precSeq=0&dtlYn=N")
    # Parse html using BeautifulSoup
    page = BeautifulSoup(response.text, "html.parser")
    # Go through all pages and collect post numbers in items
    items = []
    for i in range(1, 2):
        # Get all links
        links = page.select("#viewHeightDiv .s_tit a")
        # Loop over all links and collect post numbers
        for link in links:
            # Parse the post number from the "onclick" attribute
            items.append(''.join([n for n in link.attrs["onclick"] if n.isdigit()]))
    # Open all posts and collect them in a posts dictionary with keys: number, url and text
    posts = []
    for item in items:
        url = "http://law.go.kr/precInfoR.do?precSeq=%s&vSct=*" % item
        response = requests.get(url)
        parsed = BeautifulSoup(response.text, "html.parser")
        text = parsed.find('div', attrs={'id': 'contentBody'}).text  # save the full text
        # 'id': 'contentBody' saves everything; 'class': 'pgroup' saves it excluding the title
        title = parsed.select_one("h2").text
        posts.append({'number': item, 'url': url, 'text': text, 'title': title})
        with open("D://\LAWGO_DATA/" + item + '.txt', 'w', encoding='utf8') as f:
            f.write(text)
One more example for lawbot.org:
import requests
from bs4 import BeautifulSoup

base_url = 'http://lawbot.org'
search_url = base_url + '/?q=유죄'
response = requests.get(search_url)
page = BeautifulSoup(response.text, "html.parser")
lastPageNumber = int(page.select_one("li.page-item:not(.next):nth-last-child(2)").text)

casesList = []
for i in range(1, lastPageNumber + 1):
    if i > 1:
        response = requests.get(search_url + "&page=" + str(i))
        page = BeautifulSoup(response.text, "html.parser")
    cases = page.select("div.panre_center > ul.media-list li.panre_lists")
    for case in cases:
        title = case.findChild("h6").text
        caseDocNumber = case.findChild(attrs={"class": "caseDocNumber"}).text
        caseCourt = case.findChild(attrs={"class": "caseCourt"}).text
        case_url = base_url + case.findChild("a")['href']
        casesList.append({"title": title, "caseDocNumber": caseDocNumber, "caseCourt": caseCourt, "case_url": case_url})
        # print("title:{}, caseDocNumber:{}, caseCourt:{}, caseUrl:{}".format(title, caseDocNumber, caseCourt, case_url))

for case in casesList:
    response = requests.get(case["case_url"])
    page = BeautifulSoup(response.text, "html.parser")
    body = page.find(attrs={"class": "panre_body"}).text
    print(body)