Scrapy/Selenium: How do I call out Selenium in Spider? - selenium

I am new to web-scraping.
I would like to follow all the links in Webpage_A. For each link, I need to select some xpath and download the data in an Excel file.
In the below code, I used
yield response.follow
to follow all the links in Webpage_A
In
def parse_table
I used webdriver and driver.get to access the webpages of the links.
Is there another way to access the webpages instead? Eg, SeleniumRequest?
I noted driver.implicitly_wait(120) doesn't work in letting Chrome remain open.
I have to use sleep. Why doesn't implicitly_wait work?
Finally, why does the Chrome browser auto-close although I didn't use driver.close()?
class testSpider2(scrapy.Spider):
name = 'test_2'
def start_requests(self):
yield SeleniumRequest(
url='CONFIDENTIAL',
wait_time=15,
screenshot=True,
callback=self.parse
)
def parse(self, response):
tables_name = response.xpath("//div[#class='contain wrap:l m-x:a']//li")
for t in tables_name:
name=t.xpath(".//a/span/text()").get()
link = t.xpath(".//a/#href").get()
if link :
yield response.follow(url=link, callback=self.parse_table, meta={'table_name': name, 'link_t': link})
def parse_table(self, response):
chrome_path = "C:/Users/chromedriver.exe"
driver = webdriver.Chrome(executable_path=chrome_path)
name = response.request.meta['table_name']
link_t = response.request.meta['link_t']
driver.get(link_t)
driver.implicitly_wait(120)
button_select=driver.find_element_by_xpath("(//a[text()='Select All'])").click()
button_st_yr=driver.find_element_by_xpath("//select[#name='ctl00$ContentPlaceHolder1$StartYearDropDownList'] /option[1]").click()
button_end_mth=driver.find_element_by_xpath("//select[#name='ctl00$ContentPlaceHolder1$EndMonthDropDownList']/option[text()='Dec']").click()
button_download=driver.find_element_by_xpath("//input[#id='ctl00_ContentPlaceHolder1_DownloadButton']").click()
time.sleep(120)

Related

How to scrape all article links from Reuters website using Scrapy when older links are dynamically loaded after scrolling down?

I am trying to scrape all the hyperlinks of the website for all the news articles to extract the content. I am able to retrieve all the data from all the articles which are loaded when you open the website but when you scroll down on the site more articles are automatically loaded due to an event. Currently I am using Scrapy_Splash but I receive the same amount of links when I do not use splash. I hope you can help me out. The spider is called class FinanceNewsScraperSpider(scrapy.Spider):. Below you can see my code:
name = "audinewsarticles"
def start_requests(self):
start_urls = ['https://www.reuters.com/companies/NSUG.DE/news',
]
urls = start_urls
for url in urls:
yield scrapy.Request(url=url, callback=self.parse_newspage)
def parse_newspage(self, response):
links = response.xpath('//a[contains(#href,"/article/")]/#href').extract() #extract hyperlink
for url in links:
yield SplashRequest(url=url,
callback=self.parse_article,
endpoint='render.html')
def parse_article(self, response):
item = AudiItem()
item['article_link'] = response.url
item['article_headline'] = response.xpath('//*[contains(#class,"ArticleHeader_headline")]/text()').extract()
item['article_date'] = response.xpath('//*[contains(#class,"ArticleHeader_date")]/text()').extract()
item['article_text'] = response.xpath('//div[#class="StandardArticleBody_body"]//p/text()').extract()
print(item)
#saving data to file.
path = 'news/'
file = 'audinews_' + str(datetime.now().strftime("%Y%m%d-%H%M")) + '.csv'
file_name = open(path + file, 'a')
fieldnames = ['article_link', 'article_headline','article_date','article_text'] #adding header to file
writer = csv.writer(file_name, lineterminator='\n')
writer.writerow([item[key] for key in item.keys()])
Please let me know if you need more information from me.

Need to return Scrapy callback method data to calling function

In below code I am trying to collect email ids from a website. It can be on contact or about us page.
From parse method I follow extemail method for all those pages.
From every page I collected few email ids.
Now I need to print them with original record sent to init method.
For example:
record = "https://www.wockenfusscandies.com/"
I want to print output as,
https://www.wockenfusscandies.com/|abc#gamil.com|def#outlook.com
I am not able to store them in self.emails and deliver back to init method.
Please help.
import scrapy
from scrapy.crawler import CrawlerProcess
class EmailSpider(scrapy.Spider):
def __init__(self, record):
self.record = record
self.emails = []
url = record.split("|")[4]
if not url.startswith("http"):
url = "http://{}".format(url)
if url:
self.start_urls = ["https://www.wockenfusscandies.com/"]
else:
self.start_urls = []
def parse(self, response):
contact_list = [a.attrib['href'] for a in response.css('a') if 'contact' in a.attrib['href'] or 'about' in a.attrib['href']]
contact_list.append(response.request.url)
for fllink in contact_list:
yield response.follow(fllink, self.extemail)
def extemail(self, response):
emails = response.css('body').re('[a-zA-Z0-9_.+-]+#[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
yield {
'emails': emails
}
process = CrawlerProcess({
'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
f = open("/Users/kalpesh/work/data/test.csv")
for rec in f:
process.crawl(EmailSpider, record=rec)
f.close()
process.start()
If I understand your intend correctly you could try the following proceeding:
a) collect the mail-ids in self.emails like
def extemail(self, response):
emails = response.css('body').re('[a-zA-Z0-9_.+-]+#[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
self.emails = emails.copy()
yield {
'emails': emails
}
(Or on what other way you get the email-ids from emails)
b) add a close(self, reason) method as in GitHub-Example which is called when the spider has finished
def close(self, reason):
mails_for_record = ""
for mail in self.emails:
mails_for_record += mail + "|"
print(self.record + mails_for_record)
Please also note, I read somewhere that for some versions of Scrapy it is def close(self, reason), for others it is def closed(self, reason).
Hope, this proceeding helps you.
You should visit all the site pages before yielding result for this one site.
This means that you should have queue of pages to visit and results storage.
It can be done using meta.
Some pseudocode:
def parse(self, response):
meta = response.meta
if not meta.get('seen'):
# -- finding urls of contact and about us pages --
# -- putting it to meta['queue'] --
# -- setting meta['seen'] = True
page_emails_found = ...getting emails here...
# --- extending already discovered emails
# --- from other pages/initial empty list with new ones
meta['emails'].extend(page_emails_found)
# if queue isn't empty - yielding new request
if meta['queue']:
next_url = meta['queue'].pop()
yield Request(next_url, callback=self.parse, meta=copy(meta))
# if queue is empty - yielding result from meta
else:
yield {'url': current_domain, 'emails': meta['emails']}
Something like this..

How to use the yield function to scrape data from multiple pages

I'm trying to scrape data from amazon India website. I am not able collect response and parse the elements using the yield() method when:
1) I have to move from product page to review page
2) I have to move from one review page to another review page
Product page
Review page
Code flow:
1) customerReviewData() calls the getCustomerRatingsAndComments(response)
2) The getCustomerRatingsAndComments(response)
finds the URL of the review page and call the yield request method with getCrrFromReviewPage(request) as callback method, with url of this review page
3) getCrrFromReviewPage() gets new response of the firstreview page and scrape all the elements from the first review page (page loaded) and add it to customerReviewDataList[]
4) get URL of the next page if it exists and recursively call getCrrFromReviewPage() method, and crawl elements from next page, until all the review page is crawled
5) All the reviews gets added to the customerReviewDataList[]
I have tried playing around with yield() changing the parameters and also looked up the scrapy documentation for yield() and Request/Response yield
# -*- coding: utf-8 -*-
import scrapy
import logging
customerReviewDataList = []
customerReviewData = {}
#Get product name in <H1>
def getProductTitleH1(response):
titleH1 = response.xpath('normalize-space(//*[#id="productTitle"]/text())').extract()
return titleH1
def getCustomerRatingsAndComments(response):
#Fetches the relative url
reviewRelativePageUrl = response.css('#reviews-medley-footer a::attr(href)').extract()[0]
if reviewRelativePageUrl:
#get absolute URL
reviewPageAbsoluteUrl = response.urljoin(reviewRelativePageUrl)
yield Request(url = reviewPageAbsoluteUrl, callback = getCrrFromReviewPage())
self.log("yield request complete")
return len(customerReviewDataList)
def getCrrFromReviewPage():
userReviewsAndRatings = response.xpath('//div[#id="cm_cr-review_list"]/div[#data-hook="review"]')
for userReviewAndRating in userReviewsAndRatings:
customerReviewData[reviewTitle] = response.css('#cm_cr-review_list .review-title span ::text').extract()
customerReviewData[reviewDescription] = response.css('#cm_cr-review_list .review-text span::text').extract()
customerReviewDataList.append(customerReviewData)
reviewNextPageRelativeUrl = response.css('#cm_cr-pagination_bar .a-pagination .a-last a::attr(href)')[0].extract()
if reviewNextPageRelativeUrl:
reviewNextPageAbsoluteUrl = response.urljoin(reviewNextPageRelativeUrl)
yield Request(url = reviewNextPageAbsoluteUrl, callback = getCrrFromReviewPage())
class UsAmazonSpider(scrapy.Spider):
name = 'Test_Crawler'
allowed_domains = ['amazon.in']
start_urls = ['https://www.amazon.in/Philips-Trimmer-Cordless-Corded-QT4011/dp/B00JJIDBIC/ref=sr_1_3?keywords=philips&qid=1554266853&s=gateway&sr=8-3']
def parse(self, response):
titleH1 = getProductTitleH1(response),
customerReviewData = getCustomerRatingsAndComments(response)
yield{
'Title_H1' : titleH1,
'customer_Review_Data' : customerReviewData
}
I'm getting the following response:
{'Title_H1': (['Philips Beard Trimmer Cordless and Corded for Men QT4011/15'],), 'customer_Review_Data': <generator object getCustomerRatingsAndComments at 0x048AC630>}
The "Customer_review_Data" should be a list of dict of title and review
I am not able to figure out as to what mistake I am doing here.
When I use the log() or print() to see what data is captured in customerReviewDataList[], unable to see the data in the console either.
I am able to scrape all the reviews in customerReviewDataList[], if they are present in the product page,
In this scenario where I have to use the yield function I am getting the output stated above like this [https://ibb.co/kq8w6cf]
This is the kind of output I am looking for:
{'customerReviewTitle': ['Difficult to find a charger adapter'],'customerReviewComment': ['I already have a phillips trimmer which was only cordless. ], 'customerReviewTitle': ['Good Product'],'customerReviewComment': ['Solves my need perfectly HK']}]}
Any help is appreciated. Thanks in advance.
You should complete the Scrapy tutorial. The Following links section should be specially helpful to you.
This is a simplified version of your code:
def data_request_iterator():
yield Request('https://example.org')
class MySpider(Spider):
name = 'myspider'
start_urls = ['https://example.com']
def parse(self, response):
yield {
'title': response.css('title::text').get(),
'data': data_request_iterator(),
}
Instead, it should look like this:
class MySpider(Spider):
name = 'myspider'
start_urls = ['https://example.com']
def parse(self, response):
item = {
'title': response.css('title::text').get(),
}
yield Request('https://example.org', meta={'item': item}, callback=self.parse_data)
def parse_data(self, response):
item = response.meta['item']
# TODO: Extend item with data from this second response as needed.
yield item

How to include try and Exceptions tests in a thousands downloads program that uses selenium and requests?

I have a program to download photos on various websites. Each url is formed at the end of the address by codes, which are accessed in a dataframe. In a dataframe of 8,583 lines
The sites have javascript, so I use selenium to access the src of the photos. And I download it with urllib.request.urlretrieve
Example of a photo site: http://divulgacandcontas.tse.jus.br/divulga/#/candidato/2018/2022802018/PB/150000608817
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import time
import urllib.request, urllib.parse, urllib.error
# Root URL of the site that is accessed to fetch the photo link
url_raiz = 'http://divulgacandcontas.tse.jus.br/divulga/#/candidato/2018/2022802018/'
# Accesses the dataframe that has the "sequencial" type codes
candidatos = pd.read_excel('candidatos_2018.xlsx',sheet_name='Sheet1', converters={'sequencial': lambda x: str(x), 'cpf': lambda x: str(x),'numero_urna': lambda x: str(x)})
# Function that opens each page and takes the link from the photo
def pegalink(url):
profile = webdriver.FirefoxProfile()
browser = webdriver.Firefox(profile)
browser.get(url)
time.sleep(10)
html = browser.page_source
soup = BeautifulSoup(html, "html.parser")
browser.close()
link = soup.find("img", {"class": "img-thumbnail img-responsive dvg-cand-foto"})['src']
return link
# Function that downloads the photo and saves it with the code name "cpf"
def baixa_foto(nome, url):
urllib.request.urlretrieve(url, nome)
# Iteration in the dataframe
for num, row in candidatos.iterrows():
cpf = (row['cpf']).strip()
uf = (row['uf']).strip()
print(cpf)
print("-/-")
sequencial = (row['sequencial']).strip()
# Creates full page address
url = url_raiz + uf + '/' + sequencial
link_foto = pegalink(url)
baixa_foto(cpf, link_foto)
Please I look guidance for:
Put a try-Exception type to wait for the page to load (I'm having errors reading the src - after many hits the site takes more than ten seconds to load)
And I would like to record all possible errors - in a file or dataframe - to write down the "sequencial" code that gave error and continue the program
Would anyone know how to do it? The guidelines below were very useful, but I was unable to move forward
I put in a folder a part of the data I use and the program, if you want to look: https://drive.google.com/drive/folders/1lAnODBgC5ZUDINzGWMcvXKTzU7tVZXsj?usp=sharing
put your code within :
try:
WebDriverWait(browser, 30).until(wait_for(page_has_loaded))
# here goes your code
except: Exception
print "This is an unexpected condition!"
For waitForPageToLoad :
def page_has_loaded():
page_state = browser.execute_script(
'return document.readyState;'
)
return page_state == 'complete'
30 above is time in seconds. You can adjust it as per your need.
Approach 2 :
class wait_for_page_load(object):
def __init__(self, browser):
self.browser = browser
def __enter__(self):
self.old_page = self.browser.find_element_by_tag_name('html')
def page_has_loaded(self):
new_page = self.browser.find_element_by_tag_name('html')
return new_page.id != self.old_page.id
def __exit__(self, *_):
wait_for(self.page_has_loaded)
def pegalink(url):
profile = webdriver.FirefoxProfile()
browser = webdriver.Firefox(profile)
browser.get(url)
try:
with wait_for_page_load(browser):
html = browser.page_source
soup = BeautifulSoup(html, "html.parser")
browser.close()
link = soup.find("img", {"class": "img-thumbnail img-responsive dvg-cand-foto"})['src']
except Exception:
print ("This is an unexpected condition!")
print("Erro em: ", url)
link = "Erro"
return link

Update response.body in scrapy(without reload)

I use scrapy and selenium for crawl! my site use ajax for pagination! actully , url no change and so response.body also no change! I want to click with selenium (for pagination) and get self.driver.page_source and use it instead response.body!
So i writed this code :
res = scrapy.http.TextResponse(url=self.driver.current_url, body=self.driver.page_source,
encoding='utf-8')
print(str(res)) //nothing to print!
for quote in res.css("#ctl00_ContentPlaceHolder1_Grd_Dr_DXMainTable > tr.dxgvDataRow_Office2003Blue"):
i = i+1
item = dict()
item['id'] = int(quote.css("td.dxgv:nth-child(1)::text").extract_first())
And no error !
You can replace body of original response in scrapy by using response.replace() method:
def parse(self, response):
response = response.replace(body=driver.page_source)