Email scraping using BeautifulSoup from a list of URLs

Below is a simple script using BeautifulSoup to scrape emails from a single website. How do I modify the script if I have a list of URLs saved in an Excel file, so that the results are saved into a CSV file?
I am wondering whether I should read the list of URLs using pandas so it gets converted to a DataFrame?
from bs4 import BeautifulSoup
import re
import csv
from urllib.request import urlopen

f = urlopen('http://www.nus.edu.sg/contact')
s = BeautifulSoup(f, 'html.parser')
s = s.get_text()

phone = re.findall(r"((?:\d{3}|\(\d{3}\))?(?:\s|-|\.)?\d{3}(?:\s|-|\.)\d{4})", s)
emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,3}", s)

if len(phone) == 0:
    print("Sorry, no phone number found.")
    print('------------')
    print()
else:
    count = 1
    for item in phone:
        print(count, ' phone number(s) found : ', item)
        count += 1
    print('------------')
    print()

if len(emails) == 0:
    print("Sorry, no email address found.")
    print('------------')
    print()
else:
    count = 1
    for item in emails:
        print(count, ' email address(es) found : ', item)
        count += 1

findAll / find_all can search for text based on a regex pattern.
You can use re.compile(email_pattern) and then pass the compiled pattern to findAll:
findAll(text=email_pattern)
The email pattern used below is the one from RFC 5322.
Email scraping from a single url
from bs4 import BeautifulSoup, Comment
import re
from urllib.request import urlopen
email_pattern = re.compile(r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""")
f = urlopen('http://www.nus.edu.sg/contact').read()
soup = BeautifulSoup(f, 'html5lib')
emails = [x for x in soup.findAll(text=email_pattern) if not isinstance(x, Comment)]
print(emails)
Output:
[' oarconnect@nus.edu.sg', ' ocssec@nus.edu.sg', ' dpo@nus.edu.sg', ' enterprise@nus.edu.sg', ' ohrsharedservices@nus.edu.sg', ' itcare@nus.edu.sg', ' askalib@nus.edu.sg', ' ofmhelp@nus.edu.sg', ' nus.comms@nus.edu.sg', ' whistleblow@nus.edu.sg']
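The matched text nodes come back with their surrounding whitespace (and on other pages could contain more than just the address), so a small clean-up step reusing the same compiled pattern can help; a minimal sketch:

# Pull just the address out of each matched text node and drop duplicates
clean_emails = sorted({email_pattern.search(str(x)).group(0) for x in emails})
print(clean_emails)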
Reading urls from excel and saving in csv
You can just read the URLs from the Excel file's column, loop over each, get the emails, and write them to the CSV file. You don't have to use pandas for this (although you can; a pandas-based sketch follows the sample output below). You can use openpyxl to read the Excel file.
websites.xlsx (the URLs sit in column A, one per row)
Code
from bs4 import BeautifulSoup, Comment
import re
import csv
from urllib.request import urlopen
from openpyxl import load_workbook

# RFC 5322 email pattern
email_pattern = re.compile(r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""")

# The source xlsx file is named websites.xlsx
wb = load_workbook("websites.xlsx")
ws = wb.active

# The URLs are in the first column (column A);
# change the column letter if yours is different
first_column = ws['A']

with open('output.csv', 'w', newline='') as output_file:
    writer = csv.writer(output_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for x in range(len(first_column)):
        link = first_column[x].value.strip()
        f = urlopen(link).read()
        soup = BeautifulSoup(f, 'html5lib')
        emails = [x for x in soup.findAll(text=email_pattern) if not isinstance(x, Comment)]
        # Add the link as the first column of the row
        emails.insert(0, link)
        writer.writerow(emails)
output.csv
http://www.nus.edu.sg/contact, oarconnect@nus.edu.sg, ocssec@nus.edu.sg, dpo@nus.edu.sg, enterprise@nus.edu.sg, ohrsharedservices@nus.edu.sg, itcare@nus.edu.sg, askalib@nus.edu.sg, ofmhelp@nus.edu.sg, nus.comms@nus.edu.sg, whistleblow@nus.edu.sg
https://sloanreview.mit.edu/contact/,smrfeedback@mit.edu,smr-help@mit.edu
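If you prefer the pandas route mentioned in the question, a minimal sketch could look like the following. It assumes the URLs sit in the first column of websites.xlsx with no header row, and it uses a simplified email regex as a stand-in for the RFC 5322 pattern above:

import re
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup, Comment

# Simplified stand-in for the RFC 5322 pattern compiled earlier
email_pattern = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")

# Assumes the URLs are in the first column of websites.xlsx, no header row
urls = pd.read_excel("websites.xlsx", header=None)[0].dropna()

rows = []
for link in urls:
    link = str(link).strip()
    soup = BeautifulSoup(urlopen(link).read(), "html5lib")
    emails = [str(x).strip() for x in soup.findAll(text=email_pattern)
              if not isinstance(x, Comment)]
    rows.append([link] + emails)

# Rows can have different lengths; DataFrame pads the short ones with NaN
pd.DataFrame(rows).to_csv("output.csv", index=False, header=False)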
Ref:
How to validate an email address using a regular expression?

Related

Webcrawler - Scrapy Python

I need help with my web crawler.
I get an invalid syntax error here:
"f.write("{},{},{}\n".format(word,url,count))"
Also, when I run the command "scrapy crawl FirstSpider > wordlist.csv", a CSV file shows up, but it is either empty or not as structured as I want it to be.
I want to crawl 300 websites and need the data as structured as possible.
How can I get a CSV file with the URLs, and next to each one the count of the given keywords?
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.item import Item
import requests


def find_all_substrings(string, sub):
    import re
    starts = [match.start() for match in re.finditer(re.escape(sub), string)]
    return starts


class FirstSpider(CrawlSpider):
    name = "FirstSpider"
    allowed_domains = ["www.example.com"]
    start_urls = ["https://www.example.com/"]
    rules = [Rule(LinkExtractor(), follow=True, callback="check_buzzwords")]

    crawl_count = 0
    words_found = 0

    def check_buzzwords(self, response):
        self.__class__.crawl_count += 1
        wordlist = [
            "keyword1",
            "keyword2",
            "keyword3"
        ]
        url = response.url
        data = response.body.decode('utf-8')
        count = 0
        for word in wordlist:
            substrings = find_all_substrings(data, word)
            count = 0
            word_counts = {}
            links = []
            "f = open('wordlist.csv', 'w')"
            for pos in substrings:
                ok = False
                if not ok:
                    count += 1
            word_counts[word] = {url: count}
            for link in links:
                page = requests.get(link)
                data = page.text
                for word in wordlist:
                    substrings = find_all_substrings(data, word)
                    count = 0
                for word in wordlist:
                    substrings = find_all_substrings(data, word)
                    for pos in substrings:
                        ok = False
                        if not ok:
                            "f.write("{},{},{}\n".format(word,url,count))"
                            self.__class__.words_found += 1
                            print(word + ";" + url + ";" + str(count) + ";")
            with open('wordlist.csv', 'w') as f:
                for word, data in word_counts.items():
                    for url, count in data.items():
                        f.write("{},{},{}\n".format(word, url, count))
            f.close()
        return Item()

    def _requests_to_follow(self, response):
        if getattr(response, "encoding", None) != None:
            return CrawlSpider._requests_to_follow(self, response)
        else:
            return []
I want to crawl websites for certain keywords (wordlist). My output should be a CSV file with the following information: url, count of the keyword found on the website.
I get an invalid syntax error for the following: "f.write("{},{},{}\n".format(word,url,count))"
Also, the output CSV file is often empty or does not cover all the URLs.
You have unnecessary quotation marks around lines 41 and 61
line 41 ---> "f = open('wordlist.csv', 'w')"
line 61 ---> "f.write("{},{},{}\n".format(word,url,count))"
Also, usually you don't need to save data to a file manually, because Scrapy has a built-in mechanism for this - Feed exports.
By using the FEED_EXPORT_FIELDS setting you can specify which fields of the item should be exported and their order.
Here is the command to run the spider and save data to a file:
scrapy crawl FirstSpider -O url.csv
-O (capital 'O') means "overwrite the file"
-o (lowercase 'o') means "append to an existing file".
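For reference, a minimal sketch of the feed-export approach: the spider yields plain dicts instead of writing the file itself, and FEED_EXPORT_FIELDS fixes the column order. The keyword list and the simple counting logic are illustrative stand-ins for yours:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class FirstSpider(CrawlSpider):
    name = "FirstSpider"
    allowed_domains = ["www.example.com"]
    start_urls = ["https://www.example.com/"]
    rules = [Rule(LinkExtractor(), follow=True, callback="check_buzzwords")]

    # Controls which fields end up in the CSV and in what order
    custom_settings = {"FEED_EXPORT_FIELDS": ["url", "word", "count"]}

    wordlist = ["keyword1", "keyword2", "keyword3"]

    def check_buzzwords(self, response):
        text = response.text
        for word in self.wordlist:
            count = text.count(word)
            if count:
                # Each yielded dict becomes one row in the exported CSV
                yield {"url": response.url, "word": word, "count": count}

Run it with scrapy crawl FirstSpider -O wordlist.csv and Scrapy writes the rows for you.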

Using BeautifulSoup to exploit a URL and its dependent pages and store results in csv?

This code does not crash, which is good. However, it generates an empty icao_publications.csv file. I want to populate icao_publications.csv with all the records from all the pages at the URL, capturing every page. The dataset should be about 10,000 rows or thereabouts in all.
I want to get these 10,000 or so rows into the CSV file.
import requests, csv
from bs4 import BeautifulSoup

url = 'https://www.icao.int/publications/DOC8643/Pages/Search.aspx'

with open('Test1_Aircraft_Type_Designators.csv', "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Manufacturers", "Model", "Type_Designator", "Description", "Engine_Type", "Engine_Count", "WTC"])
    while True:
        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'html.parser')
        for row in soup.select('table tbody tr'):
            writer.writerow([c.text if c.text else '' for c in row.select('td')])
        if soup.select_one('li.paginate_button.active + li a'):
            url = soup.select_one('li.paginate_button.active + li a')['href']
        else:
            break
Here you go. The search page builds its table with JavaScript, so the HTML that requests receives contains no rows; you can fetch the data directly from the underlying JSON endpoint instead:
import requests
import pandas as pd
url = 'https://www4.icao.int/doc8643/External/AircraftTypes'
resp = requests.post(url).json()
df = pd.DataFrame(resp)
df.to_csv('aircraft.csv',encoding='utf-8',index=False)
print('Saved to aircraft.csv')
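If you only want the columns from your original header row, you can select them from the DataFrame before saving. The field names below are assumptions about the JSON response, so inspect df.columns first and adjust:

import requests
import pandas as pd

url = 'https://www4.icao.int/doc8643/External/AircraftTypes'
df = pd.DataFrame(requests.post(url).json())

# See what the endpoint actually calls each field before selecting
print(df.columns.tolist())

# Hypothetical field names -- replace with the real ones printed above
wanted = ["ManufacturerCode", "ModelFullName", "Designator",
          "AircraftDescription", "EngineType", "EngineCount", "WTC"]
df[[c for c in wanted if c in df.columns]].to_csv(
    'aircraft_selected.csv', encoding='utf-8', index=False)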

How to improve the speed of getting request content via the requests module

The functions below extract content from 'http://thegreyhoundrecorder.com.au/form-guides/' and append all the content to a list. The functions work fine, but the speed at which the content is scraped from the website is slow. This line in particular slows down the process: tree = html.fromstring(page.content). Is there a way I can improve the speed of my requests?
import lxml
from lxml import html
import requests
import re
import pandas as pd
from requests.exceptions import ConnectionError

greyhound_url = 'http://thegreyhoundrecorder.com.au/form-guides/'


def get_page(url):
    """fxn takes a page url and returns the links to the articles (Field) we
    want to scrape, in a list.
    """
    page = requests.get(url)
    tree = html.fromstring(page.content)
    my_list = tree.xpath('//tbody/tr/td[2]/a/@href')  # grab all links
    print('Length of all links = ', len(my_list))
    my_url = [page.url.split('/form-guides')[0] + str(s) for s in my_list]
    return my_url


def extract_data(my_url):
    """
    fxn takes a list of urls and extracts the needed information from the
    greyhound website.
    return: a list with the extracted fields
    """
    new_list = []
    try:
        for t in my_url:
            print(t)
            page_detail = requests.get(t)
            tree_1 = html.fromstring(page_detail.content)
            title = ''.join(tree_1.xpath('//div/h1[@class="title"]/text()'))
            race_number = tree_1.xpath("//tr[@id = 'tableHeader']/td[1]/text()")
            Distance = tree_1.xpath("//tr[@id = 'tableHeader']/td[3]/text()")
            TGR_Grade = tree_1.xpath("//tr[@id = 'tableHeader']/td[4]/text()")
            TGR1 = tree_1.xpath("//tbody/tr[@class='fieldsTableRow raceTipsRow']//div/span[1]/text()")
            TGR2 = tree_1.xpath("//tbody/tr[@class='fieldsTableRow raceTipsRow']//div/span[2]/text()")
            TGR3 = tree_1.xpath("//tbody/tr[@class='fieldsTableRow raceTipsRow']//div/span[3]/text()")
            TGR4 = tree_1.xpath("//tbody/tr[@class='fieldsTableRow raceTipsRow']//div/span[4]/text()")
            clean_title = title.split(' ')[0].strip()
            # clean title and extract track name
            Track = title.split(' ')[0].strip()
            # clean title and extract track date
            date = title.split('-')[1].strip()
            # clean title and extract track year
            year = pd.to_datetime('now').year
            # convert date to pandas datetime
            race_date = pd.to_datetime(date + ' ' + str(year)).strftime('%d/%m/%Y')
            # extract race number
            new_rn = []
            for number in race_number:
                match = re.search(r'^(.).*?(\d+)$', number)
                new_rn.append(match.group(1) + match.group(2))
            new_list.append((race_date, Track, new_rn, Distance, TGR_Grade, TGR1, TGR2, TGR3, TGR4))
        return new_list
    except ConnectionError as e:
        print('Connection error, connect to a stronger network or reload the page')
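One common way to speed this kind of scraping up (a sketch, not tested against this site) is to reuse a single requests.Session for connection pooling and fetch the detail pages concurrently; the parsing in extract_data can then run over the already-downloaded trees:

import concurrent.futures

import requests
from lxml import html


def fetch_tree(session, url):
    """Download one page and return (url, parsed lxml tree)."""
    page = session.get(url, timeout=30)
    return url, html.fromstring(page.content)


def fetch_all(urls, max_workers=8):
    """Fetch all pages concurrently, reusing one Session for connection pooling."""
    trees = {}
    with requests.Session() as session:
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = [pool.submit(fetch_tree, session, u) for u in urls]
            for fut in concurrent.futures.as_completed(futures):
                url, tree = fut.result()
                trees[url] = tree
    return trees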

Why is my code giving me data in one column when it should give me two different columns?

I need to know what is happening in my code. It should give data in separate columns, but it is giving me the same data in both columns.
I tried changing the value of the row variable but couldn't find the reason.
import requests
import csv
from bs4 import BeautifulSoup
import pandas as pd
import time

arrayofRequest = []
prices = []
location = []
columns = ['Price', 'Location']
df = pd.DataFrame(columns=columns)

for i in range(0, 50):
    arrayofRequest.append("https://www.zameen.com/Homes/Karachi-2-" + str(i+1) + ".html?gclid=Cj0KCQjw3JXtBRC8ARIsAEBHg4mj4jX1zZUt3WzGScjH6nfwzrEqkuILarcmg372imSneelSXPj0fGIaArNeEALw_wcB")
    request = requests.get(arrayofRequest[i])
    soupobj = BeautifulSoup(request.content, "lxml")
    # print(soupobj.prettify())
    links = soupobj.find_all('span', {'class': 'f343d9ce'})
    addresses = soupobj.find_all('div', {'class': '_162e6469'})
    price = ""
    for i in range(0, len(links)):
        price = str(links[i]).split(">")
        price = price[len(price)-2].split("<")[0]
        prices.append(price)
        address = str(addresses[i]).split(">")
        address = address[len(address)-2].split("<")[0]
        location.append(address)
        row = location[i] + "," + prices[i]
        df = df.append(pd.Series(row, index=columns), ignore_index=False)
        # filewriter = csv.writer(csvfile, delimiter=',',filewriter.writerow(['Price', 'Location']),filewriter.writerow([prices[0],location[0]])

df.to_csv('DATA.csv', index=False)
Because of this:
pd.Series(row, index=columns)
row is a single comma-joined string here, so it gets broadcast into both columns. Try something like:
df = df.append(pd.DataFrame([[location[i], prices[i]]], columns=columns))
However, this is better done only once, outside of your for loop:
df = pd.DataFrame(list(zip(location, prices)), columns=columns)
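A minimal sketch of the build-it-once approach (the placeholder values stand in for what the scraping loop appends):

import pandas as pd

prices = []     # filled with price strings inside the scraping loop
location = []   # filled with address strings inside the scraping loop

# ... scraping loop appends to prices and location ...
prices.extend(["price1", "price2"])        # placeholders for illustration
location.extend(["address1", "address2"])  # placeholders for illustration

# Build the DataFrame once, after the loop, then save
df = pd.DataFrame({"Price": prices, "Location": location})
df.to_csv("DATA.csv", index=False)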

How to get the value of a cell in an HTML page when clicking through a list of links?

I have a list of about 5,000 links.
Ex, 2 of the 5,000 links:
https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019
https://racevietnam.com/runner/drtungnguyen83/ecopark-marathon-2019
...
I want to get the value of the Time of Day column in the Finish row for each link.
Ex:
09:51:07 AM - https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019
07:50:55 AM - https://racevietnam.com/runner/ngocsondknb/ecopark-marathon-2019
I have scraped user info from a website before, and that website had id and class attributes. But the table at https://racevietnam.com/runner/ngocsondknb/ecopark-marathon-2019 has no id or class, so I can't do the same here.
#!/usr/bin/python
from urllib.request import urlopen
from bs4 import BeautifulSoup

list_user = []
for userID in range(1, 100000):
    link = "https://example.com/member.php?u=" + str(userID)
    html = urlopen(link)
    bsObj = BeautifulSoup(html, "lxml")
    user_name = bsObj.find("div", {"id": "main_userinfo"}).h1.get_text()
    list_user.append(user_name)
    print("username", userID, "is: ", user_name)
    with open("result.txt", "a") as myfile:
        myfile.write(user_name)
Please help me.
Thank you.
Using bs4 4.7.1.
There is only one table, and you want the second column (td) of the last row. You can use :last-child to select the last row; this should be used in conjunction with the tbody type selector and the > child combinator, so as not to pick up the header row. You can use :nth-of-type to specify which td cell to return.
Now you may wish to develop this in at least two ways:
Handle cases where elements are not found, e.g.
name = getattr(soup.select_one('title'), 'text', 'N/A')
timing = getattr(soup.select_one('tbody > tr:last-child td:nth-of-type(2)'), 'text', 'N/A')
Add items to lists/a data structure, which can be output as a dataframe at the end and written out as csv. Or you may wish to stick with your current method.
Python:
import requests
from bs4 import BeautifulSoup as bs

urls = ['https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019', 'https://racevietnam.com/runner/drtungnguyen83/ecopark-marathon-2019']

with requests.Session() as s:
    for url in urls:
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        name = soup.select_one('title').text
        timing = soup.select_one('tbody > tr:last-child td:nth-of-type(2)').text
        print(name, timing)
This is my code.
It's working Ok.
import requests
from bs4 import BeautifulSoup

f = open("input.ecopark", "r")
f_content = f.readlines()
f.close()

for url in f_content:
    r = requests.get(url.rstrip())
    soup = BeautifulSoup(r.text, 'html.parser')
    result = soup.select("table tbody tr td")
    x = ""
    for i in result:
        if not x:
            # Keep scanning cells until the "Finish" marker is seen
            if i.get_text() == "Finish":
                x = 1
            continue
        if x:
            # The first cell after "Finish" holds the Time of Day value
            print(url.rstrip() + " " + i.get_text())
            break
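A variant of the same idea that reads the links from the input file but uses the CSS selector from the first answer, printing the "time - url" format asked for (a sketch; it assumes, as above, that the finish time sits in the last row, second cell, of the only table):

import requests
from bs4 import BeautifulSoup

with open("input.ecopark") as f:
    urls = [line.strip() for line in f if line.strip()]

with requests.Session() as s:
    for url in urls:
        soup = BeautifulSoup(s.get(url).text, "html.parser")
        # Last row of the only table, second cell = Time of Day at Finish
        cell = soup.select_one("tbody > tr:last-child td:nth-of-type(2)")
        timing = cell.get_text(strip=True) if cell else "N/A"
        print(f"{timing} - {url}")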