Setup
Using Scrapy as the framework, I implemented the following SeleniumMiddleware to render the URL:
class SeleniumMiddleware(object):

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        request.meta['driver'] = self.driver
        self.driver.get(request.url)
        # Click button
        self.driver.find_element(
            By.XPATH, '/html/body/header/div[2]/div/div/div[3]/nav/div[2]/div/div/ul/li[1]/a'
        ).click()
        body = to_bytes(self.driver.page_source)
        return HtmlResponse(self.driver.current_url, body=body, encoding='utf-8', request=request)

    def spider_opened(self, spider):
        self.driver = webdriver.Firefox(
            service=Service(r'C:\Users\j.metzger\Downloads\geckodriver-v0.30.0-win64\geckodriver.exe')
        )

    def spider_closed(self, spider):
        pass
        # self.driver.close()
Problem
The HtmlResponse generated by process_request is not fully rendered when it is received by the spider:
def parse(self, response, **kwargs):
    open_in_browser(response)
Therefore, I don't have access to all HTML elements in the spider.
However, the URL gets fully rendered in the Selenium browser itself. I have included some wait/sleep commands, without any success.
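For reference, this is roughly how an explicit wait could slot into process_request before the page source is captured; both selectors below are placeholders rather than the site's real ones:

from scrapy.http import HtmlResponse
from scrapy.utils.python import to_bytes
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def process_request(self, request, spider):
    request.meta['driver'] = self.driver
    self.driver.get(request.url)

    # Wait until the click target is clickable before interacting with it.
    WebDriverWait(self.driver, 15).until(
        EC.element_to_be_clickable((By.XPATH, '//nav//ul/li[1]/a'))  # placeholder XPath
    )
    self.driver.find_element(By.XPATH, '//nav//ul/li[1]/a').click()

    # Wait for an element that only appears once the JavaScript-rendered content exists.
    WebDriverWait(self.driver, 15).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '.rendered-content'))  # placeholder selector
    )

    body = to_bytes(self.driver.page_source)
    return HtmlResponse(self.driver.current_url, body=body, encoding='utf-8', request=request)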
Related
I used Selenium with Firefox to successfully reach a specific API and saved the session cookies using pickle. Where I am stuck now is loading those cookies into the Scrapy spider so that the request gets a 200 status.
Below is the unsuccessful approach I used:
import scrapy
import os
import json
import pickle


class ProductsSpider(scrapy.Spider):
    name = "Products"
    start_urls = ["https://www.woolworths.com.au/apis/ui/products/305224,221667,305223,317793,341058,201689,221228,230414,201688,221029"]
    params = {"excludeUnavailable": "true", "source": "RR-Best Sellers"}

    # with open("./woolworths.json", 'r') as inputfile:
    #     cookie = json.load(inputfile)

    with open("./woolworths.pkl", 'rb') as f:
        cookies = pickle.load(f)

    def start_requests(self):
        url = self.start_urls[0]
        yield scrapy.Request(url=url, cookies=self.cookies, meta=self.params, callback=self.parse)

    def parse(self, response):
        data = response.json()
        for a in data:
            print(a)
            yield a['Name']
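If woolworths.pkl holds the list returned by driver.get_cookies(), each entry is a dict with keys such as name, value, domain and path, while the cookies argument of scrapy.Request expects either a plain {name: value} dict or a list of dicts keyed exactly that way. A minimal sketch of converting the pickled list before the request is made (the shortened product URL is only a placeholder):

import pickle

import scrapy


class ProductsSpider(scrapy.Spider):
    name = "Products"
    start_urls = ["https://www.woolworths.com.au/apis/ui/products/305224,221667"]

    def start_requests(self):
        # Collapse Selenium's list of cookie dicts into the {name: value}
        # mapping that scrapy.Request accepts directly.
        with open("./woolworths.pkl", "rb") as f:
            selenium_cookies = pickle.load(f)
        cookie_jar = {c["name"]: c["value"] for c in selenium_cookies}

        yield scrapy.Request(
            url=self.start_urls[0],
            cookies=cookie_jar,
            callback=self.parse,
        )

    def parse(self, response):
        self.logger.info(response.status)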
I wanted to set up multiple spiders and run them, where URL_LIST contains links to the main URLs and DATA_LIST contains URLs embedded in the main URLs. The example here does not represent that case, since I am using the quotes URLs, but that is the purpose of the set-up. I then want to crawl the spiders and store their results. However, I am unsure how to call the spiders to crawl them, because there are two separate spiders.
For example, if I run scrapy crawl val in the terminal I get:
raise error.ReactorAlreadyInstalledError("reactor already installed")
twisted.internet.error.ReactorAlreadyInstalledError: reactor already installed
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy import signals
import scrapy

URL_LIST = ['https://quotes.toscrape.com/tag/love/',
            'https://quotes.toscrape.com/tag/inspirational/']
DATA_LIST = ['https://quotes.toscrape.com/tag/life/',
             'https://quotes.toscrape.com/tag/humor/']


def store_url(*args, **kwargs):
    URL_LIST.append(kwargs['item'])


def store_data(*args, **kwargs):
    DATA_LIST.append(kwargs['item'])


class QuotesSpiderWebsiteA(scrapy.Spider):
    name = 'val'
    start_urls = URL_LIST
    custom_settings = {'FEEDS': {
        'quotes.jl': {
            'format': 'jsonlines'
        }
    }}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                self.parse
            )

    def parse(self, response):
        content = response.xpath('//div[@class = "row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get(),
            }


class QuotesSpiderWebsiteB(scrapy.Spider):
    name = 'valb'
    start_urls = DATA_LIST
    custom_settings = {'FEEDS': {
        'quotes.jl': {
            'format': 'jsonlines'
        }
    }}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                self.parse
            )

    def parse(self, response):
        content = response.xpath('//div[@class = "row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get(),
            }


if __name__ == '__main__':
    configure_logging()
    runner = CrawlerRunner()

    @defer.inlineCallbacks
    def crawl():
        crawler1 = runner.create_crawler(QuotesSpiderWebsiteA)
        crawler2 = runner.create_crawler(QuotesSpiderWebsiteB)
        crawler1.signals.connect(store_url, signals.item_scraped)
        crawler2.signals.connect(store_data, signals.item_scraped)
        yield runner.crawl(crawler1)
        yield runner.crawl(crawler2)
        reactor.stop()
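For reference, the CrawlerRunner pattern from the Scrapy docs for running spiders sequentially requires crawl() to actually be invoked and the reactor to be started afterwards, and the script has to be launched with python script.py rather than scrapy crawl, which installs its own reactor. A rough sketch of that tail end, reusing the spiders defined above:

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

if __name__ == '__main__':
    configure_logging()
    runner = CrawlerRunner()

    @defer.inlineCallbacks
    def crawl():
        # Run the two spiders one after the other on the same reactor.
        yield runner.crawl(QuotesSpiderWebsiteA)
        yield runner.crawl(QuotesSpiderWebsiteB)
        reactor.stop()

    crawl()
    reactor.run()  # blocks here until both crawls are finished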
I am trying to log in to a website using Selenium and then pass the authenticated session to Scrapy to extract stuff.
The issue is that after I pass the session to Scrapy, I am still not logged in.
class LoginSpider(scrapy.Spider):
    name = 'login'
    allowed_domains = ['*****']
    start_urls = ['*****']

    def __init__(self):
        self.driver = webdriver.Firefox()

    def start_requests(self):
        # driver = webdriver.Firefox()
        self.driver.get('*****')
        time.sleep(5)
        portalButton = self.driver.find_element_by_xpath('//*[@id="fb_submit"]')
        portalButton.click()
        time.sleep(2)
        self.driver.find_element_by_xpath('//*[@id="email"]').send_keys('******')
        self.driver.find_element_by_xpath('//*[@id="password"]').send_keys('******')
        self.driver.find_element_by_xpath('//*[@id="btn-login"]').click()
        time.sleep(5)
        for cookie in self.driver.get_cookies():
            c = {cookie['name']: cookie['value']}
        yield Request(url="****", cookies=c, callback=self.parse)

    def parse(self, response):
        # self.log("->>>>>>>>>>>>")
        open_in_browser(response)
        # view(response)
        self.log("->>>>>>>>>>>>")
I would suggest changing that step a bit:
for cookie in self.driver.get_cookies():
    c = {cookie['name']: cookie['value']}
to something like this:
_cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
yield Request(url="****", cookies=_cookies, callback=self.parse)
In each iteration you re-create c with a new {cookie['name']: cookie['value']}, so the request never receives the full set of cookies.
My code example:
import time

import scrapy
from scrapy import Request
from scrapy.utils.response import open_in_browser
from selenium import webdriver
from selenium.webdriver.common.by import By


class LoginSpider(scrapy.Spider):
    name = 'login'
    start_urls = ['URL']

    def __init__(self):
        super().__init__()
        self.driver = webdriver.Chrome()

    def start_requests(self):
        self.driver.get('URL')
        time.sleep(5)
        self.driver.find_element(By.ID, 'email').send_keys('EMAIL')
        self.driver.find_element(By.ID, 'passwd').send_keys('PASSWORD')
        self.driver.find_element(By.ID, 'SubmitLogin').click()
        _cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
        yield Request(url='URL',
                      cookies=_cookies,
                      callback=self.parse)
        self.driver.quit()

    def parse(self, response, **kwargs):
        open_in_browser(response)
        self.log(response)
I'm using Scrapy + Selenium + PhantomJS to get data from a page that loads its content with JavaScript. When I use Chrome it works well, but when I switch the browser to PhantomJS it doesn't work (the data loaded by JavaScript is missing). Here is the code (Windows 7 platform):
class MyCustomDownloaderMiddleware(object):
    def __init__(self):
        # self.driver = webdriver.Chrome()
        self.driver = webdriver.PhantomJS()

    def process_request(self, request, spider):
        if r"http://wenshu.court.gov.cn/list/list/?sorttype=1&conditions=searchWord+QWJS+++%E5%85%A8%E6%96%87%E6%A3%80%E7%B4%A2:%E6%88%90%E9%83%BD%E9%93%B6%E8%A1%8C%E8%82%A1%E4%BB%BD%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8" == request.url:
            self.driver.get(request.url)
            try:
                element = WebDriverWait(self.driver, 20).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "dataItem"))
                )
                body = self.driver.page_source
                return HtmlResponse(self.driver.current_url, body=body, encoding='utf-8', request=request)
            except:
                body = self.driver.page_source
                return HtmlResponse(self.driver.current_url, body=body, encoding='utf-8', request=request)
        else:
            pass

    def __del__(self):
        self.driver.quit()
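PhantomJS starts with a very small default viewport and ships its own user-agent string, so a page that renders fine in Chrome can behave differently there. A rough standalone debugging sketch, assuming the same target page; the viewport size, shortened URL and screenshot path are arbitrary:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException

driver = webdriver.PhantomJS()
driver.set_window_size(1366, 768)  # avoid PhantomJS's tiny default viewport
try:
    driver.get("http://wenshu.court.gov.cn/list/list/?sorttype=1")  # shortened stand-in for the URL above
    try:
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, "dataItem"))
        )
        print("dataItem rendered")
    except TimeoutException:
        # Capture what PhantomJS actually rendered when the wait times out.
        driver.save_screenshot("phantomjs_debug.png")
        print(driver.page_source[:2000])
finally:
    driver.quit()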
For some reasons, I would like to reset the list of seen URLs that Scrapy maintains internally at some point in my spider code.
I know that by default Scrapy uses the RFPDupeFilter class and that there is a fingerprints set.
How can this set be cleared from within the spider code?
To be more specific: I'd like to clear the set in a custom idle_handler method that is called by the spider_idle signal.
You can access the current dupefilter object used by the spider via self.crawler.engine.slot.scheduler.df.
from scrapy import signals, Spider
from scrapy.xlib.pydispatch import dispatcher


class ExampleSpider(Spider):
    name = "example"
    start_urls = ['http://www.example.com/']

    def __init__(self, *args, **kwargs):
        super(ExampleSpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.reset_dupefilter, signals.spider_idle)

    def reset_dupefilter(self, spider):
        # clear the fingerprints stored by the dupefilter when the spider goes idle
        self.crawler.engine.slot.scheduler.df.fingerprints = set()

    def parse(self, response):
        pass
You can reset the fingerprint set by initializing fingerprints
self.crawler.engine.slot.scheduler.df.fingerprints = set()
to an empty set.
Put the following code in your spider.
def reset_filter(self, spider):
    self.crawler.engine.slot.scheduler.df.fingerprints = set()

# overriding the default from_crawler class method to access scrapy core components
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    spider = super(MySpider, cls).from_crawler(crawler, *args, **kwargs)
    # initiate an event signal when the spider is idle
    crawler.signals.connect(spider.reset_filter, signals.spider_idle)
    return spider
Please refer to https://github.com/scrapy/scrapy/issues/1762 for more information.