I'm trying to make a crawler for YouTube and I've run into some strange behaviour.
In the following source code, driver.page_source is obtained with Selenium, and the result is passed to BeautifulSoup for parsing.
The problem is that the length of driver.page_source keeps changing.
How can this happen? Does anyone have an idea what is going on?
elif 'src' in seq:
    print('video-src')
    print(seq['src'])
    soup = bs(driver.page_source, "html.parser")
    print('driver.page_source length=' + str(len(driver.page_source)))
    f = open('test.txt', 'w', encoding='UTF-8')
    f.write(driver.page_source)
    f.close()
    print('driver.page_source length=' + str(len(driver.page_source)))
    tag = '<span dir="auto" class="style-scope yt-formatted-string">'
    # '댓글' is Korean for "comments"
    find_start = driver.page_source.find(tag + '댓글')
    print('driver.page_source length=' + str(len(driver.page_source)))
    tag_value = driver.page_source[find_start:find_start + 200]
    print('driver.page_source length=' + str(len(driver.page_source)))
    p = re.compile(r'>([\d,]+)<')
    m = p.search(tag_value)
    if m:
        print(m.group(1))
        video[item['name']] = m.group(1)
    else:
        print('error')
        print(tag_value)
driver.page_source length=4103114
driver.page_source length=4102392
driver.page_source length=4102392
driver.page_source length=4103129
The page_source can change: elements can be loaded later by scripts that keep modifying the DOM after the initial load.
Instead of only checking the page_source length, you can save the different driver.page_source snapshots to text files and compare them to see what actually differs. One way to do that is with difflib:
import difflib

source1 = driver.page_source
file1 = open("file1.txt", "w", encoding="utf-8")
file1.write(source1)
file1.close()

source2 = driver.page_source
file2 = open("file2.txt", "w", encoding="utf-8")
file2.write(source2)
file2.close()

with open('file1.txt', encoding="utf-8") as file_1:
    file_1_text = file_1.readlines()

with open('file2.txt', encoding="utf-8") as file_2:
    file_2_text = file_2.readlines()

# Find and print the diff:
for line in difflib.unified_diff(
        file_1_text, file_2_text, fromfile='file1.txt',
        tofile='file2.txt', lineterm=''):
    print(line)
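Another way to make the numbers consistent is to read driver.page_source once and keep working on that snapshot, since every access re-serializes a DOM that may still be changing. This is just a minimal sketch of that idea, reusing the tag, the '댓글' marker and the regex from the question; it does not change how or when the page finishes loading:

import re
from bs4 import BeautifulSoup as bs

# take one snapshot of the DOM and work only on that copy
html = driver.page_source
print('snapshot length=' + str(len(html)))

soup = bs(html, "html.parser")

tag = '<span dir="auto" class="style-scope yt-formatted-string">'
find_start = html.find(tag + '댓글')   # '댓글' means "comments"
tag_value = html[find_start:find_start + 200]

m = re.search(r'>([\d,]+)<', tag_value)
if m:
    print(m.group(1))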
Related
I'm having some issues with scraping fish images off a website.
species_with_foto = ['/fangster/aborre-perca-fluviatilis/1',
                     '/fangster/almindelig-tangnaal-syngnathus-typhle/155',
                     '/fangster/ansjos-engraulis-encrasicholus/66',
                     '/fangster/atlantisk-tun-blaafinnet-tun-thunnus-thynnus-/137']

titles = []
species = []

for x in species_with_foto:
    specie_page = 'https://www.fiskefoto.dk' + x
    driver.get(specie_page)
    content = driver.page_source
    soup = BeautifulSoup(content)
    brutto = soup.find_all('img', attrs={'class': 'rapportBillede'})
    species.append(brutto)
    #print(brutto)
    titles.append(x)

    try:
        driver.find_element(by=By.XPATH, value='/html/body/form/div[4]/div[1]/div/div[13]/div[2]/div/div').click()
        print('Clicked next', x)
    except NoSuchElementException:
        print('Successfully finished -', x)
    time.sleep(2)
This returns a list of lists with the sublist looking like this:
[<img alt="Aborre (Perca fluviatilis) aborrefiskeri, striber, rygfinne, regnorm, majs, spinner, " class="rapportBillede" src="/images/400/aborre-perca-fluviatilis-medefiskeri-bundrig-0,220kg-24cm-striber-rygfinne-regnorm-majs-spinner-358-22-29-14-740-2013-21-4.jpg" style="width:50%;"/>,
<img alt="Aborre (Perca fluviatilis) aborrefiskeri, striber, rygfinne, regnorm, majs, spinner, " class="rapportBillede" src="/images/400/aborre-perca-fluviatilis-medefiskeri-prop-flaad-med-levende-skalle-paa-enkeltkrog-1.6kg-46cm-6604-1724617.jpg" style="width:calc(50% - 6px);margin-bottom:7px;"/>]
How can I clean up the list and keep only the src="/images/400/aborre-perca-fluviatilis-medefiskeri-prop-flaad-med-levende-skalle-paa-enkeltkrog-1.6kg-46cm-6604-1724617.jpg" part? I have tried other arguments in soup.find_all but can't get it to work.
(The Selenium part is also not functioning properly, but I'll get to that later.)
EDIT:
This is my code now, and I'm getting really close :) One issue is that my photos are now saved in a flat list instead of a list of lists, and for the life of me I don't understand why this happens.
Help fixing and understanding this would be greatly appreciated!
titles = []
fish_photos = []

for x in species_with_foto_mini:
    site = "https://www.fiskefoto.dk/" + x
    html = urlopen(site)
    bs = BeautifulSoup(html, 'html.parser')
    titles.append(x)

    try:
        images = bs.find_all('img', attrs={'class': 'rapportBillede'})
        for img in images:
            if img.has_attr('src'):
                #print(img['src'])
                a = (img['src'])
                fish_photos.append(a)
    except KeyError:
        print('No src')

    # navigate pages
    try:
        driver.find_element(by=By.XPATH, value='/html/body/form/div[4]/div[1]/div/div[13]/div[2]/div/div').click()
        print('Clicked next', x)
    except NoSuchElementException:
        print('Successfully finished -', x)
    time.sleep(2)
EDIT:
I need the end result to be a list of lists looking something like this:
fish_photos = [
    ['/images/400/aborre-perca-fluviatilis-medefiskeri-bundrig-0,220kg-24cm-striber-rygfinne-regnorm-majs-spinner-358-22-29-14-740-2013-21-4.jpg',
     '/images/400/aborre-perca-fluviatilis-medefiskeri-prop-flaad-med-levende-skalle-paa-enkeltkrog-1.6kg-46cm-6604-1724617.jpg'],
    ['/images/400/tungehvarre-arnoglossus-laterna-medefiskeri-6650-2523403.jpg',
     '/images/400/ulk-myoxocephalus-scorpius-medefiskeri-bundrig-koebenhavner-koebenhavner-torsk-mole-sild-boersteorm-pigge-351-18-48-9-680-2013-6-4.jpg'],
    ['/images/400/graeskarpe-ctenopharyngodon-idella-medefiskeri-bobleflaad-med-toastbroed-paa-enkeltkrog-5.02kg-77cm-6436-7486.jpg',
     '/images/400/graeskarpe-ctenopharyngodon-idella-medefiskeri-bobleflaad-med-toastbroed-paa-enkeltkrog-10.38kg-96cm-6337-4823146.jpg']]
EDIT:
My output now is a list of identical lists. I need it to put every species in its own list, like this: fish_photo_list = [[trout1, trout2, trout3], [other fish1, other fish2, other], [salmon1, salmon2]]
My initial code did this, but it doesn't anymore.
Here is an example that you can adapt:
from urllib.request import urlopen
from bs4 import BeautifulSoup

site = "[insert name of the site]"
html = urlopen(site)
bs = BeautifulSoup(html, 'html.parser')

try:
    images = bs.find_all('img')
    for img in images:
        if img.has_attr('src'):
            print(img['src'])
except KeyError:
    print('No src')
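To get the list-of-lists result described in the edits, the usual fix is to build a fresh per-page list inside the loop and append that whole list to fish_photos, instead of appending each src string directly. This is only a sketch built around the names from the edited code (species_with_foto_mini, fish_photos, the rapportBillede class); the page-navigation part with the driver is left out:

from urllib.request import urlopen
from bs4 import BeautifulSoup

titles = []
fish_photos = []

for x in species_with_foto_mini:
    site = "https://www.fiskefoto.dk/" + x
    # use a name other than bs so it does not shadow the BeautifulSoup import
    soup = BeautifulSoup(urlopen(site), 'html.parser')
    titles.append(x)

    # collect this species' images in their own list ...
    page_photos = []
    for img in soup.find_all('img', attrs={'class': 'rapportBillede'}):
        if img.has_attr('src'):
            page_photos.append(img['src'])

    # ... and append the whole sublist, not the individual strings
    fish_photos.append(page_photos)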
I got help here earlier to crawl law.go.kr with the code below.
Now I'm trying to crawl other websites like http://lawbot.org, http://law.go.kr, https://casenote.kr.
The problem is that I have very little understanding of HTML.
I understand all of the code below and how to build the request URL for it, but the other websites are structured differently.
I want to know how to adapt the code below to crawl other web pages.
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # Using requests, fetch the first result page. pg=1 is the page number, outmax is the number of items per page
    response = requests.post(
        "http://law.go.kr/precScListR.do?q=*&section=evtNm&outmax=79329&pg=1&fsort=21,10,30&precSeq=0&dtlYn=N")

    # Parse html using BeautifulSoup
    page = BeautifulSoup(response.text, "html.parser")

    # Go through all pages and collect post numbers in items
    items = []
    for i in range(1, 2):
        # Get all links
        links = page.select("#viewHeightDiv .s_tit a")
        # Loop all links and collect post numbers
        for link in links:
            # Parse post number from "onclick" attribute
            items.append(''.join([n for n in link.attrs["onclick"] if n.isdigit()]))

    # Open all posts and collect in posts dictionary with keys: number, url and text
    posts = []
    for item in items:
        url = "http://law.go.kr/precInfoR.do?precSeq=%s&vSct=*" % item
        response = requests.get(url)
        parsed = BeautifulSoup(response.text, "html.parser")
        # 'id': 'contentBody' saves the full text; 'class': 'pgroup' would save it without the title
        text = parsed.find('div', attrs={'id': 'contentBody'}).text
        title = parsed.select_one("h2").text
        posts.append({'number': item, 'url': url, 'text': text, 'title': title})

        with open("D://\LAWGO_DATA/" + item + '.txt', 'w', encoding='utf8') as f:
            f.write(text)
One more example for lawbot.org:
import requests
from bs4 import BeautifulSoup

base_url = 'http://lawbot.org'
search_url = base_url + '/?q=유죄'  # '유죄' means "guilty"
response = requests.get(search_url)
page = BeautifulSoup(response.text, "html.parser")
lastPageNumber = int(page.select_one("li.page-item:not(.next):nth-last-child(2)").text)
casesList = []

for i in range(1, lastPageNumber + 1):
    if i > 1:
        response = requests.get(search_url + "&page=" + str(i))
        page = BeautifulSoup(response.text, "html.parser")
    cases = page.select("div.panre_center > ul.media-list li.panre_lists")
    for case in cases:
        title = case.findChild("h6").text
        caseDocNumber = case.findChild(attrs={"class": "caseDocNumber"}).text
        caseCourt = case.findChild(attrs={"class": "caseCourt"}).text
        case_url = base_url + case.findChild("a")['href']
        casesList.append({"title": title, "caseDocNumber": caseDocNumber, "caseCourt": caseCourt, "case_url": case_url})
        # print("title:{}, caseDocNumber:{}, caseCourt:{}, caseUrl:{}".format(title, caseDocNumber, caseCourt, case_url))

for case in casesList:
    response = requests.get(case["case_url"])
    page = BeautifulSoup(response.text, "html.parser")
    body = page.find(attrs={"class": "panre_body"}).text
    print(body)
I made a web scraping program using Python and webdriver, and I want to extract the ASIN from 2 different pages. I would like one XPath to work for both links at the same time.
These are the Amazon pages: https://www.amazon.com/Hydro-Flask-Wide-Mouth-Flip/dp/B01ACATW7E/ref=sr_1_3?s=kitchen&ie=UTF8&qid=1520348607&sr=1-3&keywords=-gfds and
https://www.amazon.com/Ubbi-Saving-Special-Required-Locking/dp/B00821FLSU/ref=sr_1_1?s=baby-products&ie=UTF8&qid=1520265799&sr=1-1&keywords=-hgfd&th=1. They have the same parent nodes (id, classes). How can I make this program work for both links at the same time?
So the problem is with lines 36 and 41 of my code:
asin = driver.find_element_by_xpath('//div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[4]').text
and
asin = driver.find_element_by_xpath('//div[@id="detail-bullets_feature_div"]/div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[5]').text
I have to change these lines so the CSV ends up with the ASINs for both products. For the first link it prints the wrong information, and for the second it prints the ASIN.
I attached the code. I would appreciate any help.
from selenium import webdriver
import csv
import io

# set the proxies to hide actual IP
proxies = {
    'http': 'http://5.189.133.231:80',
    'https': 'https://27.111.43.178:8080'
}

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server="%s"' % ';'.join(['%s=%s' % (k, v) for k, v in proxies.items()]))

driver = webdriver.Chrome(executable_path="C:\\Users\Andrei-PC\Downloads\webdriver\chromedriver.exe",
                          chrome_options=chrome_options)

header = ['Product title', 'ASIN']
with open('csv/bot_1.csv', "w") as output:
    writer = csv.writer(output)
    writer.writerow(header)

links = ['https://www.amazon.com/Hydro-Flask-Wide-Mouth-Flip/dp/B01ACATW7E/ref=sr_1_3?s=kitchen&ie=UTF8&qid=1520348607&sr=1-3&keywords=-gfds',
         'https://www.amazon.com/Ubbi-Saving-Special-Required-Locking/dp/B00821FLSU/ref=sr_1_1?s=baby-products&ie=UTF8&qid=1520265799&sr=1-1&keywords=-hgfd&th=1'
         ]

for i in range(len(links)):
    driver.get(links[i])
    product_title = driver.find_elements_by_xpath('//*[@id="productTitle"][1]')
    prod_title = [x.text for x in product_title]

    try:
        asin = driver.find_element_by_xpath('//div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[4]').text
    except:
        print('no ASIN template one')
    try:
        asin = driver.find_element_by_xpath('//div[@id="detail-bullets_feature_div"]/div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[5]').text
    except:
        print('no ASIN template two')

    try:
        data = [prod_title[0], asin]
    except:
        print('no items v3 ')

    with io.open('csv/bot_1.csv', "a", newline="", encoding="utf-8") as output:
        writer = csv.writer(output)
        writer.writerow(data)
You can simply use
//li[b="ASIN:"]
to get the required element on both pages.
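As a rough sketch of how that XPath could replace the two template-specific lookups in the loop above (reusing the question's driver and the old-style find_element_by_xpath call; the exact "ASIN: ..." text format of the list item is an assumption about the page):

from selenium.common.exceptions import NoSuchElementException

try:
    # the <li> whose <b> child reads "ASIN:" should contain text like "ASIN: B01ACATW7E"
    asin_line = driver.find_element_by_xpath('//li[b="ASIN:"]').text
    asin = asin_line.replace('ASIN:', '').strip()
except NoSuchElementException:
    print('no ASIN found')
    asin = None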
I'm trying to use the following XPath expressions for this page, but they are not picking up the values correctly.
groups = ".//*[contains(#class, 'sl-CouponParticipantWithBookCloses_Name ')]"
xp_bp1 = ".//following::div[contains(#class,'sl-MarketCouponValuesExplicit33')][./div[contains(#class,'gl-MarketColumnHeader')][.='1']]//span[#class='gl-ParticipantOddsOnly_Odds']"
The output currently is..
[['3.00'], ['3.00'], ['3.00'] etc,,
Desired:
[['3.00'], ['1.30'], ['1.25'] etc,,
Data I am after
Script:
import csv
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException

driver = webdriver.Chrome()
driver.set_window_size(1024, 600)
driver.maximize_window()
driver.get('https://www.bet365.com.au/#/AC/B1/C1/D13/E108/F16/S1/')
driver.get('https://www.bet365.com.au/#/AC/B1/C1/D13/E108/F16/S1/')
time.sleep(10)

groups = ".//*[contains(@class, 'sl-CouponParticipantWithBookCloses_Name ')]"
#//div[contains(@class, 'gl-ParticipantOddsOnlyDarker gl-ParticipantOddsOnly gl-Participant_General sl-MarketCouponAdvancedBase_LastChild ')]
xp_bp1 = ".//following::div[contains(@class,'sl-MarketCouponValuesExplicit33')][./div[contains(@class,'gl-MarketColumnHeader')][.='1']]//span[@class='gl-ParticipantOddsOnly_Odds']"

while True:
    try:
        time.sleep(2)
        data = []
        for elem in driver.find_elements_by_xpath(groups):
            try:
                bp1 = elem.find_element_by_xpath(xp_bp1).text
            except:
                bp1 = None
            url1 = driver.current_url
            data.append([bp1])
        print(data)
        url1 = driver.current_url
        with open('test.csv', 'a', newline='', encoding="utf-8") as outfile:
            writer = csv.writer(outfile)
            for row in data:
                writer.writerow(row + [url1])
    except TimeoutException as ex:
        pass
    except NoSuchElementException as ex:
        print(ex)
        break
Here is code that can get info from https://www.gabar.org/membersearchresults.cfm
but cannot get it from https://www.gabar.org/membersearchresults.cfm?start=1&id=70FFBD1B-9C8E-9913-79DBB8B989DED6C1
from bs4 import BeautifulSoup
import requests
import traceback

links_to_visit = []
navigation_links = []  # for testing next button
base_url = 'https://www.gabar.org'

def make_soup(link):
    r = requests.get(link)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

def all_results(url):
    global links_to_visit
    global navigation_links
    soup = make_soup(url)
    print(soup)
    div = soup.find('div', {'class': 'cs_control'})
    links = div.find_all('a')
    print(links)
    for link in links:
        try:
            if link.text == 'Next':  # prev, next, new search
                navigation_links.append(link)
                print('got it')
            elif not '/MemberSearchDetail.cfm?ID=' in link.get('href'):
                pass  # I dont need that link
            else:
                links_to_visit.append(link)
        except:
            traceback.print_exc()
    print(len(links_to_visit))
    print(links_to_visit)
    #print(links_to_visit[-1].get('href'))

def start():
    flag = 1
    page = 1
    while page < 60716:
        flag = 0
        if navigation_links[-1].text == 'Next':
            flag = 1
            next_link = navigation_links[-1]
            #print(next_link.get('href'))
            page += 25
            print(base_url + next_link.get('href'))
            all_results(base_url + next_link.get('href'))
            print('page is:', page)

if __name__ == '__main__':
    all_results('https://www.gabar.org/membersearchresults.cfm')
    start()
What do I need to understand or do if I want to get the full result?
What you need to understand is that there is more to an HTTP request than just the URL. In this case, a search result is only available to the session that executed the search, and it can therefore only be paged through if you are the "owner" of that session. Most websites identify a session using session cookies that you need to send along with your HTTP requests.
This can be a huge hassle, but luckily Python's requests takes care of all of that for you with requests.session. Instead of using requests.get(url), you initialize a session with session = requests.session() and then use that session for subsequent requests with session.get(url). This will automagically preserve cookies for you and in many ways behave like an actual browser would.
You can read more about how requests.session works in the requests documentation.
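In its smallest form the change looks like this. This is only a sketch using the two URLs from the question; the id parameter would normally come from the search you run in the same session, so treat it as an illustration rather than a guaranteed-working request:

import requests

# one session object for the whole crawl
session = requests.session()

# the search page sets the session cookies ...
first = session.get('https://www.gabar.org/membersearchresults.cfm')

# ... and the paged URL is requested with those same cookies,
# so the server can treat it as part of the same search
second = session.get('https://www.gabar.org/membersearchresults.cfm?start=1&id=70FFBD1B-9C8E-9913-79DBB8B989DED6C1')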
And last but not least, your fixed code =)
from bs4 import BeautifulSoup
import requests
import traceback

links_to_visit = []
navigation_links = []  # for testing next button

# we initialize the session here
session = requests.session()
base_url = 'https://www.gabar.org'

def make_soup(link):
    # r = requests.get(link)
    # we use the session here in order to preserve cookies across requests
    r = session.get(link)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

def all_results(url):
    # globals are almost never needed or recommended and certainly not here.
    # you can just leave this out
    # global links_to_visit
    # global navigation_links
    soup = make_soup(url)
    print(soup)
    div = soup.find('div', {'class': 'cs_control'})
    links = div.find_all('a')
    print(links)
    for link in links:
        try:
            if link.text == 'Next':  # prev, next, new search
                navigation_links.append(link)
                print('got it')
            elif not '/MemberSearchDetail.cfm?ID=' in link.get('href'):
                pass  # I dont need that link
            else:
                links_to_visit.append(link)
        except:
            traceback.print_exc()
    print(len(links_to_visit))
    print(links_to_visit)
    #print(links_to_visit[-1].get('href'))

def start():
    flag = 1
    page = 1
    while page < 60716:
        flag = 0
        if navigation_links[-1].text == 'Next':
            flag = 1
            next_link = navigation_links[-1]
            #print(next_link.get('href'))
            page += 25
            print(base_url + next_link.get('href'))
            all_results(base_url + next_link.get('href'))
            print('page is:', page)

if __name__ == '__main__':
    all_results('https://www.gabar.org/membersearchresults.cfm')
    start()