automatic crawling web site - selenium

I got help from here to crawl on law.go.kr with the code below.
I'm trying to crawl other websites like http://lawbot.org, http://law.go.kr, https://casenote.kr.
But the problem is that I have no understanding of HTML...
I understand all of the code below and how to find the HTML address it uses, but things are different on other websites...
I want to know how to adapt the code below to crawl other web pages.
import requests
from bs4 import BeautifulSoup
if __name__ == '__main__':
    # Request the search-result listing. In the query string, pg is the page
    # number and outmax is the number of items returned per page.
    response = requests.post(
        "http://law.go.kr/precScListR.do?q=*&section=evtNm&outmax=79329&pg=1&fsort=21,10,30&precSeq=0&dtlYn=N")
    # Parse the listing HTML.
    page = BeautifulSoup(response.text, "html.parser")

    # Collect the numeric id of every post linked from the listing.
    items = []
    # NOTE(review): this loop runs exactly once and never re-requests the
    # listing with another pg value; it is kept as the hook for real paging.
    for _ in range(1, 2):
        # Result titles are links matched by this selector.
        links = page.select("#viewHeightDiv .s_tit a")
        for link in links:
            # The post number is the digits of the "onclick" attribute.
            items.append(''.join(n for n in link.attrs["onclick"] if n.isdigit()))

    # Download each post, keep it in `posts` (keys: number, url, text, title)
    # and write its text to one file per post.
    posts = []
    for item in items:
        url = "http://law.go.kr/precInfoR.do?precSeq=%s&vSct=*" % item
        response = requests.get(url)
        parsed = BeautifulSoup(response.text, "html.parser")
        # Save the full text from the div with id "contentBody". (Original
        # note: to save the text without the title, select class "pgroup".)
        text = parsed.find('div', attrs={'id': 'contentBody'}).text
        title = parsed.select_one("h2").text
        posts.append({'number': item, 'url': url, 'text': text, 'title': title})
        # Same path value as before; the backslash is now escaped explicitly
        # instead of relying on the invalid "\L" escape sequence.
        with open("D://\\LAWGO_DATA/" + item + '.txt', 'w', encoding='utf8') as f:
            f.write(text)

One more example for lawbot.org:
import requests
from bs4 import BeautifulSoup
base_url = 'http://lawbot.org'
search_url = base_url + '/?q=유죄'

# The first results page is also used to read the number of the last page.
response = requests.get(search_url)
page = BeautifulSoup(response.text, "html.parser")
last_page_item = page.select_one("li.page-item:not(.next):nth-last-child(2)")
lastPageNumber = int(last_page_item.text)

# Walk every results page and record one dict per listed case.
casesList = []
for i in range(1, lastPageNumber + 1):
    if i > 1:
        # Page 1 is already loaded above; later pages need their own request.
        response = requests.get("{}&page={}".format(search_url, i))
        page = BeautifulSoup(response.text, "html.parser")
    for case in page.select("div.panre_center > ul.media-list li.panre_lists"):
        casesList.append({
            "title": case.findChild("h6").text,
            "caseDocNumber": case.findChild(attrs={"class": "caseDocNumber"}).text,
            "caseCourt": case.findChild(attrs={"class": "caseCourt"}).text,
            "case_url": base_url + case.findChild("a")['href'],
        })

# Fetch each case page and print the text of its "panre_body" element.
for case in casesList:
    response = requests.get(case["case_url"])
    page = BeautifulSoup(response.text, "html.parser")
    body_element = page.find(attrs={"class": "panre_body"})
    body = body_element.text
    print(body)

Related

Using BeautifulSoup to exploit a URL and its dependent pages and store results in csv?

This code does not crash, which is good. However, it generates an empty icao_publications.csv file. I want to populate icao_publications.csv with every record from every page reachable from the URL. The dataset should be about 10,000 rows or thereabouts in all.
I want to get these 10,000 or so rows into the csv file.
import requests, csv
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = 'https://www.icao.int/publications/DOC8643/Pages/Search.aspx'
# newline="" hands line-ending control to the csv module; without it every
# row is followed by a blank line on Windows.
with open('Test1_Aircraft_Type_Designators.csv', "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Manufacturers", "Model", "Type_Designator", "Description", "Engine_Type", "Engine_Count", "WTC"])
    while True:
        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'html.parser')
        # One csv row per table row, one column per <td>.
        for row in soup.select('table tbody tr'):
            writer.writerow([c.text if c.text else '' for c in row.select('td')])
        # The link after the active pagination button points at the next page.
        next_link = soup.select_one('li.paginate_button.active + li a')
        next_href = next_link.get('href') if next_link else None
        if not next_href:
            break
        # Resolve relative hrefs against the current page, and stop if the
        # link leads back to this page (e.g. href="#") so the loop terminates.
        next_url = urljoin(url, next_href)
        if next_url == url:
            break
        url = next_url
Here you go:
import requests
import pandas as pd
# POST to the endpoint and decode the JSON body it answers with.
url = 'https://www4.icao.int/doc8643/External/AircraftTypes'
reply = requests.post(url)
resp = reply.json()

# Load the decoded records into a DataFrame and dump them as UTF-8 csv
# without the index column.
df = pd.DataFrame(resp)
df.to_csv('aircraft.csv', index=False, encoding='utf-8')
print('Saved to aircraft.csv')

How to improve the speed of getting request content via the request module

The below functions extract content from 'http://thegreyhoundrecorder.com.au/form-guides/' and append all content to a list. The function works fine, although the speed at which the content is scraped from the website is slow. This line tree = html.fromstring(page.content) in particular slows down the process. Is there a way I can improve on the speed of my request.
import lxml
from lxml import html
import requests
import re
import pandas as pd
from requests.exceptions import ConnectionError
greyhound_url = 'http://thegreyhoundrecorder.com.au/form-guides/'


def get_page(url):
    """Return the absolute URLs of the field articles linked from *url*.

    The page is downloaded, the href of the link in the second cell of every
    table-body row is collected, and each href is prefixed with the part of
    the final response URL that precedes '/form-guides'.

    :param url: address of the form-guides listing page.
    :return: list of article URLs (strings).
    """
    page = requests.get(url)
    tree = html.fromstring(page.content)
    # Attribute access in XPath is spelled "@href"; "#href" is not valid.
    my_list = tree.xpath('//tbody/tr/td[2]/a/@href')  # grab all links
    print('Length of all links = ', len(my_list))
    site_root = page.url.split('/form-guides')[0]
    return [site_root + str(href) for href in my_list]
def extract_data(my_url):
    """Download each URL in *my_url* and extract the race fields from it.

    :param my_url: iterable of article URLs (as produced by ``get_page``).
    :return: list with one tuple per URL:
        ``(race_date, track, race_numbers, distance, grade, tgr1, tgr2,
        tgr3, tgr4)``; every element after ``track`` is a list of XPath text
        results. Returns ``None`` when a ``ConnectionError`` occurs.
    """
    header_cell = "//tr[@id = 'tableHeader']/td[%d]/text()"
    tip_span = "//tbody/tr[@class='fieldsTableRow raceTipsRow']//div/span[%d]/text()"
    # First character of the cell plus its trailing digits, e.g. "Race 7" -> "R7".
    race_number_re = re.compile(r'^(.).*?(\d+)$')
    # The date taken from the title has no year, so the current one is added.
    year = pd.to_datetime('now').year

    new_list = []
    try:
        # A single Session keeps the connection open between requests rather
        # than opening a new one for every article.
        with requests.Session() as session:
            for t in my_url:
                print(t)
                page_detail = session.get(t)
                tree_1 = html.fromstring(page_detail.content)
                title = ''.join(tree_1.xpath('//div/h1[@class="title"]/text()'))
                race_number = tree_1.xpath(header_cell % 1)
                Distance = tree_1.xpath(header_cell % 3)
                TGR_Grade = tree_1.xpath(header_cell % 4)
                tips = tuple(tree_1.xpath(tip_span % n) for n in range(1, 5))
                # The track is the first word of the title, the date follows
                # the first '-'.
                Track = title.split(' ')[0].strip()
                date = title.split('-')[1].strip()
                race_date = pd.to_datetime(date + ' ' + str(year)).strftime('%d/%m/%Y')
                # Shorten each race-number cell; cells that do not end in
                # digits are skipped instead of raising on a failed match.
                new_rn = []
                for number in race_number:
                    match = race_number_re.search(number)
                    if match:
                        new_rn.append(match.group(1) + match.group(2))
                new_list.append((race_date, Track, new_rn, Distance, TGR_Grade) + tips)
        return new_list
    except ConnectionError:
        print('Connection error, connect to a stronger network or reload the page')
        return None

Capture product url data in case of pagination using BeautifulSoup?

I have fetched the html data from a site and trying to fetch product urls:
def get_soup(url):
    """Fetch *url* and return its parsed HTML, or None unless the status is 200."""
    response = requests.get(url)
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.content, "html.parser")
def get_category_urls(url):
    """Return the href of every link inside the ``div#menu_oc`` element of *url*.

    :param url: address of the page to read.
    :return: list of href strings; empty when the page could not be loaded,
        has no ``menu_oc`` div, or the div holds no links with an href.
    """
    soup = get_soup(url)
    # get_soup returns None for any non-200 response.
    if soup is None:
        return []
    categories = soup.find('div', attrs={'id': 'menu_oc'})
    if categories is None:
        return []
    cat_urls = []
    for c in categories.findAll('a'):
        # .get() returns None for anchors without an href instead of raising.
        href = c.get('href')
        if href is not None:
            cat_urls.append(href)
    return cat_urls
def get_product_urls(url):
    """Return the hrefs of the numbered pagination links found on *url*.

    :param url: address of the page to read.
    :return: list of href strings; empty when the page could not be loaded
        or it has no ``div.pagination`` element.
    """
    soup = get_soup(url)
    prod_urls = []
    # get_soup returns None for any non-200 response.
    if soup is not None and soup.find('div', attrs={'class': 'pagination'}):
        for link in soup.select('div.links a'):
            # Keep only links whose text is a number; this drops the
            # next/last links. link.string is None when the anchor wraps
            # other tags, hence the fallback to ''.
            if (link.string or '').isdecimal():
                prod_urls.append(link['href'])
    print("Found following product urls::", prod_urls)
    return prod_urls
if __name__ == '__main__':
    # NOTE(review): URL is not defined anywhere in this snippet; it is
    # presumably set elsewhere - confirm before running.
    category_urls = get_category_urls(URL)
    product_urls = get_product_urls(URL)
How can I efficiently identify the pagination condition in the code above?
screen shots of actual site with pagination:
and without pagination:
site link
pagination category
should be okay
from bs4 import BeautifulSoup
import requests
def get_soup(url):
    """Fetch *url* and return its parsed HTML, or None unless the status is 200."""
    response = requests.get(url)
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.content, "html.parser")
def get_category_urls(url):
    """Return the href of every link inside the ``div#menu_oc`` element of *url*.

    :param url: address of the page to read.
    :return: list of href strings; empty when the page could not be loaded,
        has no ``menu_oc`` div, or the div holds no links with an href.
    """
    soup = get_soup(url)
    # get_soup returns None for any non-200 response.
    if soup is None:
        return []
    categories = soup.find('div', attrs={'id': 'menu_oc'})
    if categories is None:
        return []
    cat_urls = []
    for c in categories.findAll('a'):
        # .get() returns None for anchors without an href instead of raising.
        href = c.get('href')
        if href is not None:
            cat_urls.append(href)
    return cat_urls
def get_all_products(url):
    """Collect product links from *url* and from each of its pagination pages.

    :param url: address of the first category page.
    :return: list of lists - one list of product hrefs per page visited,
        starting with the page at *url*.
    """
    first_page = get_soup(url)
    prod_urls = [get_product_urls(first_page)]
    links = get_pagination(first_page)
    print("Found those pages:", links)
    if links:
        # Visit every numbered pagination link and add that page's products.
        for page_url in links:
            prod_urls.append(get_product_urls(get_soup(page_url)))
        print("Found following product urls:", prod_urls)
    return prod_urls
def get_product_urls(soup):
    """Return the href of every product-name link selected from *soup*."""
    hrefs = []
    for anchor in soup.select('div.product-list .span .name a'):
        hrefs.append(anchor['href'])
    return hrefs
def get_pagination(soup):
    """Return the hrefs of the numbered pagination links in *soup*.

    Only links whose text consists solely of decimal digits are kept, which
    leaves out non-numeric navigation links. A link whose ``.string`` is
    None (for example an anchor that wraps other tags) is skipped rather
    than raising AttributeError.

    :param soup: parsed page exposing ``select``.
    :return: list of href strings, possibly empty.
    """
    pages = soup.select('div.pagination div.links a')
    return [link['href'] for link in pages if (link.string or '').isdecimal()]
if __name__ == '__main__':
    # Category page used as the entry point for both lookups.
    URL = 'http://www.example.com/shop/index.php?route=product/category&path=63_64'
    category_urls = get_category_urls(URL)
    product_urls = get_all_products(URL)

Getting repeated requests from same url with different values

I am trying to crawl some data as a side project, but I am having a problem gathering it. I have been trying for two days without much luck.
First problem:
When I crawl the posts from the main page I get the wrong token.
Second problem:
I have read and I have tried to implement scrapy docs request to get the phone number but in vain,
or this answer
stackoverflow
Third problem:
How would I go about implementing the next page (the commented-out code inside gumtree.py)?
Fourth problem:
I am now able to get the phone numbers but I am getting repeated requests to the same url with different values, [see results]
I would really appreciate if anyone could give me a direction.
My main goal is to crawl post that have phone numbers
I have tried to search stackoverflow but I couldn't find the proper post.
Many Thanks
setting.py
# Scrapy project settings for the "crawler" bot.
BOT_NAME = 'crawler'

# Spider modules, and the module where newly generated spiders are created.
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'

# User-Agent header sent with requests (a desktop Chrome string).
USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"

TELNETCONSOLE_ENABLED = False
gumtree.py [UPDATED]
# -*- coding: utf-8 -*-
import re
import json
import scrapy
from scrapy import Request, Item, Field, Selector
def complete_link(string):
    """Return *string* unchanged."""
    return string
class MyItem(Item):
    """Item holding the values the Gumtree spider stores for one listing."""

    # Token sent in the X-GUMTREE-TOKEN header of the phone request.
    token = Field()
    # Trailing digits of the listing URL, and the listing URL itself.
    post_id = Field()
    post_url = Field()
    # Value of the "data" key in the phone-number JSON response.
    phone_num = Field()
    # Declared but not written by the spider code shown with it.
    phone_url = Field()
class GumtreeSpider(scrapy.Spider):
    """Crawl the Gumtree car listings and fetch each seller's phone number.

    Flow: ``parse`` (search page) -> ``parse_post`` (listing page, extracts
    the reveal token) -> ``parse_phone`` (AJAX endpoint, yields the item).
    """

    name = "gumtree"
    allowed_domains = ["gumtree.com"]
    start_urls = [
        'https://www.gumtree.com/search?search_category=cars',
    ]

    def parse(self, response):
        """Schedule one ``parse_post`` request per listing link on the page."""
        for href in response.css('a.listing-link::attr(href)').extract():
            domain = 'https://www.gumtree.com' + href
            # A fresh item per listing. Sharing one item object between all
            # requests lets concurrent callbacks overwrite each other's
            # fields, which produces repeated/mixed-up results.
            item = MyItem()
            yield Request(domain, callback=self.parse_post, meta={'domain': domain, 'item': item})
        # Pagination is left disabled, as in the original:
        # next_page = response.css('li.pagination-next a::attr("href")').extract_first()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield Request(next_page, callback=self.parse)

    def parse_post(self, response):
        """Extract post id and token, then request the phone number.

        Nothing is yielded when the URL has no trailing digits or the token
        cannot be located, because the phone request needs both.
        """
        item = response.meta['item']
        item['post_url'] = response.meta['domain']
        post_id = re.match('.*?([0-9]+)$', item['post_url'])
        if not post_id:
            return
        item['post_id'] = post_id.group(1)
        token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
        arr_token = re.findall(r'"([^"]*)"', str(token))
        # NOTE(review): the code relies on the script containing exactly 15
        # quoted strings with the token second from last - confirm against
        # the live page, this is fragile.
        if len(arr_token) != 15:
            return
        item['token'] = arr_token[-2]
        yield Request(
            'https://www.gumtree.com/ajax/account/seller/reveal/number/' + item['post_id'],
            headers={'X-GUMTREE-TOKEN': item['token']},
            callback=self.parse_phone,
            meta={'item': item},
        )

    def parse_phone(self, response):
        """Store the ``data`` value of the JSON response as the phone number."""
        item = response.meta['item']
        # response.text is the decoded body; body_as_unicode() is the
        # deprecated spelling of the same thing.
        phone = json.loads(response.text)
        item['phone_num'] = phone['data']
        return item
results: [scrapy crawl gumtree -o ..\result.json]
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "01527853397"},
Have you checked that meta['item'] is actually being passed to parse_token()?
I'd do the following:
meta = { 'item': item }
request = Request(response.urljoin(href), meta=meta, callback=self.parse_token)
yield request
I have found the solution.
# -*- coding: utf-8 -*-
import re, json, scrapy
from crawler.items import CrawlerItem
from scrapy import Request, Item, Field, Selector
gumtree = 'https://www.gumtree.com'
getphone = 'https://www.gumtree.com/ajax/account/seller/reveal/number/'


class GumtreeSpider(scrapy.Spider):
    """Crawl the Gumtree car listings, page by page, for seller phone numbers.

    Flow: ``parse`` (search page, follows pagination) -> ``parse_post``
    (listing page, extracts the reveal token) -> ``parse_phone`` (AJAX
    endpoint, returns the finished item).
    """

    name = "gumtree"
    allowed_domains = ["gumtree.com"]
    start_urls = [
        'https://www.gumtree.com/search?search_category=cars',
    ]

    def parse(self, response):
        """Schedule a ``parse_post`` request per listing, then the next page."""
        for href in response.css('a.listing-link::attr(href)').extract():
            href = href.strip()
            if not href:
                continue
            # The post id is the run of digits at the end of the link.
            # Listings without one are skipped: the phone endpoint needs it,
            # and keeping id and URL together avoids the index mismatch of
            # two parallel lists.
            post_id = re.match('.*?([0-9]+)$', href)
            if not post_id:
                continue
            url = gumtree + href
            # One item per request so callbacks never share mutable state.
            item = CrawlerItem()
            yield Request(
                url,
                callback=self.parse_post,
                meta={'url': url, 'item': item, 'pid': post_id.group(1)},
                headers={'Referer': gumtree},
            )
        next_page = response.css('li.pagination-next a::attr("href")').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield Request(next_page, callback=self.parse)

    def parse_post(self, response):
        """Extract the reveal token and return the phone-number request.

        Returns None when the token cannot be located.
        """
        item = response.meta['item']
        item['post_id'] = response.meta['pid']
        item['post_url'] = response.meta['url']
        token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
        arr_token = re.findall(r'"([^"]*)"', str(token))
        # NOTE(review): relies on the script containing exactly 15 quoted
        # strings with the token second from last - confirm, this is fragile.
        if len(arr_token) != 15:
            return None
        item['token'] = arr_token[-2]
        return Request(
            getphone + item['post_id'],
            callback=self.parse_phone,
            headers={'X-GUMTREE-TOKEN': item['token'], 'Referer': item['post_url']},
            meta={'url': response.meta['url'], 'item': item},
        )

    def parse_phone(self, response):
        """Store the ``data`` value of the JSON response as the phone number."""
        item = response.meta['item']
        item['post_url'] = response.meta['url']
        # response.text is the decoded body; body_as_unicode() is the
        # deprecated spelling of the same thing.
        phone = json.loads(response.text)
        # Kept as text: encoding to UTF-8 here would store bytes on Python 3.
        item['phone_num'] = u''.join(phone['data']).strip()
        return item

requests + bs4 no results from pages

Here the code that can get info from https://www.gabar.org/membersearchresults.cfm
but cannot from https://www.gabar.org/membersearchresults.cfm?start=1&id=70FFBD1B-9C8E-9913-79DBB8B989DED6C1
from bs4 import BeautifulSoup
import requests
import traceback
# Module-level accumulators filled by all_results().
links_to_visit = []
navigation_links = []  # for testing next button
base_url = 'https://www.gabar.org'


def make_soup(link):
    """Download *link* and return its body parsed with html.parser."""
    page = requests.get(link)
    return BeautifulSoup(page.content, 'html.parser')
def all_results(url):
    """Load *url* and sort the links of its ``cs_control`` div into the lists.

    Links whose text is 'Next' are appended to ``navigation_links``; links
    whose href contains '/MemberSearchDetail.cfm?ID=' are appended to
    ``links_to_visit``; every other link is ignored. When the page has no
    ``cs_control`` div a message is printed and nothing is collected.

    :param url: address of a search-results page.
    """
    soup = make_soup(url)
    print(soup)
    div = soup.find('div', {'class': 'cs_control'})
    if div is None:
        # Without this guard the next line raises AttributeError on None.
        print('no cs_control block found at', url)
        return
    links = div.find_all('a')
    print(links)
    for link in links:
        href = link.get('href')
        if link.text == 'Next':  # prev, next, new search
            navigation_links.append(link)
            print('got it')
        elif href and '/MemberSearchDetail.cfm?ID=' in href:
            # href is None for anchors without the attribute; those are
            # skipped here instead of being caught by a bare except.
            links_to_visit.append(link)
    print(len(links_to_visit))
    print(links_to_visit)
def start(last_record=60716, page_size=25):
    """Follow the most recent 'Next' link until the results are exhausted.

    Stops when the counter reaches *last_record*, when no 'Next' link has
    been collected, or when a freshly loaded page contributes no new 'Next'
    link (otherwise the same page would be requested again and again).

    :param last_record: upper bound for the running counter.
        NOTE(review): presumably the total number of records - confirm.
    :param page_size: amount added to the counter per page.
        NOTE(review): presumably the number of results per page - confirm.
    """
    page = 1
    while page < last_record:
        # Nothing to follow: the first page yielded no 'Next' link.
        if not navigation_links:
            break
        next_link = navigation_links[-1]
        if next_link.text != 'Next':
            break
        known_links = len(navigation_links)
        page += page_size
        next_url = base_url + next_link.get('href')
        print(next_url)
        all_results(next_url)
        print('page is:', page)
        # all_results appends a link only when the page has a 'Next' button.
        if len(navigation_links) == known_links:
            break
if __name__ == '__main__':
    # Load the first results page, then follow its 'Next' links.
    all_results('https://www.gabar.org/membersearchresults.cfm')
    start()
What do I need to understand or do if I want to get the full results?
What you need to understand is that there is more than a URL to an HTTP-request. In this case, a search result is only available to the session that executed the search and can therefore only be paged through if you are the "owner" of that session. Most websites identify a session using session-cookies that you need to send along with your HTTP-request.
This can be a huge hassle, but luckily Python's requests library takes care of all of that for you with requests.session. Instead of using requests.get(url), you initialize a session with session = requests.session() and then use that session for subsequent requests via session.get(url). This will automagically preserve cookies for you and in many ways behave like an actual browser would.
You can read more about how requests.session works here.
And last but not least, your fixed code =)
from bs4 import BeautifulSoup
import requests
import traceback
# Module-level accumulators filled by all_results().
links_to_visit = []
navigation_links = []  # for testing next button
# One shared session so cookies set by the first response are sent back on
# every later request.
session = requests.session()
base_url = 'https://www.gabar.org'


def make_soup(link):
    """Download *link* through the shared session and return the parsed body."""
    page = session.get(link)
    return BeautifulSoup(page.content, 'html.parser')
def all_results(url):
    """Load *url* and sort the links of its ``cs_control`` div into the lists.

    Links whose text is 'Next' are appended to ``navigation_links``; links
    whose href contains '/MemberSearchDetail.cfm?ID=' are appended to
    ``links_to_visit``; every other link is ignored. When the page has no
    ``cs_control`` div a message is printed and nothing is collected.

    :param url: address of a search-results page.
    """
    # The module-level lists are only mutated, never rebound, so no
    # ``global`` declaration is needed.
    soup = make_soup(url)
    print(soup)
    div = soup.find('div', {'class': 'cs_control'})
    if div is None:
        # Without this guard the next line raises AttributeError on None.
        print('no cs_control block found at', url)
        return
    links = div.find_all('a')
    print(links)
    for link in links:
        href = link.get('href')
        if link.text == 'Next':  # prev, next, new search
            navigation_links.append(link)
            print('got it')
        elif href and '/MemberSearchDetail.cfm?ID=' in href:
            # href is None for anchors without the attribute; those are
            # skipped here instead of being caught by a bare except.
            links_to_visit.append(link)
    print(len(links_to_visit))
    print(links_to_visit)
def start(last_record=60716, page_size=25):
    """Follow the most recent 'Next' link until the results are exhausted.

    Stops when the counter reaches *last_record*, when no 'Next' link has
    been collected, or when a freshly loaded page contributes no new 'Next'
    link (otherwise the same page would be requested again and again).

    :param last_record: upper bound for the running counter.
        NOTE(review): presumably the total number of records - confirm.
    :param page_size: amount added to the counter per page.
        NOTE(review): presumably the number of results per page - confirm.
    """
    page = 1
    while page < last_record:
        # Nothing to follow: the first page yielded no 'Next' link.
        if not navigation_links:
            break
        next_link = navigation_links[-1]
        if next_link.text != 'Next':
            break
        known_links = len(navigation_links)
        page += page_size
        next_url = base_url + next_link.get('href')
        print(next_url)
        all_results(next_url)
        print('page is:', page)
        # all_results appends a link only when the page has a 'Next' button.
        if len(navigation_links) == known_links:
            break
if __name__ == '__main__':
    # Load the first results page, then follow its 'Next' links.
    all_results('https://www.gabar.org/membersearchresults.cfm')
    start()