Scrapy sends multiple Documents to Elastic

We use Scrapy to crawl a website where you need to be logged in.
It is one website with different pages to crawl, so we have, for example, 3 different spiders and only need a single login.
So we tried to use one driver for all spiders, and we need to run the spiders sequentially:
#...
from selenium.webdriver.firefox.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from scrapy.utils.project import get_project_settings
# imports implied by the code below
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from twisted.internet import defer, reactor
#...


class LoginAndCrawl():
    login_url = "https://example.com"
    retry_count = 0
    max_retries = 10
    webdriver_timeout = 30
    crawler_delay = 1

    def __init__(self):
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(options=options)
        self.prerender()
        configure_logging()
        self.runner = CrawlerRunner(get_project_settings())
        self.crawl()
        reactor.run()
        self.driver.close()

    @defer.inlineCallbacks
    def crawl(self):
        yield self.runner.crawl(MySpider1, driver=self.driver)
        yield self.runner.crawl(MySpider2, driver=self.driver)
        yield self.runner.crawl(MySpider3, driver=self.driver)
        reactor.stop()

    def prerender(self):
        try:
            self.log_in()
        except Exception as e:
            self.retry_count += 1
            if self.retry_count > self.max_retries:
                self.driver.close()
                self.driver = None
            else:
                self.prerender()

    def log_in(self):
        #... login code


helper = LoginAndCrawl()


class MySpider1(AbstractSpider):
    def __init__(self, driver=None, **kwargs):
        if not driver:
            raise Exception("no driver")
        super().__init__(**kwargs)
        self.driver = driver

    # do some stuff


class MySpider2(MySpider1):
    def __init__(self, driver=None, **kwargs):
        if not driver:
            raise Exception("no driver")
        super().__init__(driver, **kwargs)

    # do some stuff


class MySpider3(MySpider1):
    def __init__(self, driver=None, **kwargs):
        if not driver:
            raise Exception("no driver")
        super().__init__(driver, **kwargs)

    # do some stuff
If we just run one spider, everything is fine. But if we run more than one, the crawled documents are stored multiple times in our Elasticsearch index. For example, every document from MySpider1 is stored 3 times, every document from MySpider2 twice, and every document from MySpider3 once.
We checked whether the duplicates are created in our pipeline before being passed to Elasticsearch, but no duplicates are passed from our side.
Our impression is that the Elasticsearch pipeline somehow keeps the documents from each spider and then saves them again for every following spider.
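To make that suspicion concrete, here is a purely hypothetical pipeline sketch (not our real Elasticsearch pipeline, whose internals we don't know, and send_bulk_to_elasticsearch is just a placeholder): if the buffer feeding the bulk insert were a class-level list that gets flushed on every spider close but never cleared, it would reproduce exactly the 3x / 2x / 1x pattern described above.

# hypothetical illustration only -- not our actual pipeline
class BufferingEsPipeline:
    items = []  # class-level, so it is shared across all three sequential crawls

    def process_item(self, item, spider):
        self.items.append(item)
        return item

    def close_spider(self, spider):
        # flushes everything collected so far without clearing the buffer:
        # after MySpider1 it holds S1; after MySpider2 it holds S1 + S2;
        # after MySpider3 it holds S1 + S2 + S3 -> S1 stored 3x, S2 2x, S3 1x
        send_bulk_to_elasticsearch(self.items)  # placeholder for the bulk insert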
Is there any known issue with this implementation?
Can someone confirm this wrong behavior?
Is there any way to fix this problem?

Related

Scrapy Selenium: Why pagination is not working for scrapy-selenium?

I am trying to get data using scrapy-selenium, but there is some issue with the pagination. I have tried my best with different selectors and methods, but nothing changes; it is only able to scrape the 1st page. I have also checked other solutions, but I am still unable to make it work. Looking forward to experts' advice.
Source: https://www.gumtree.com/property-for-sale/london
import scrapy
from urllib.parse import urljoin
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


class Basic2Spider(scrapy.Spider):
    name = 'basic2'

    def start_requests(self):
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
        driver.set_window_size(1920, 1080)
        driver.get("https://www.gumtree.com/property-for-sale/london")
        time.sleep(2)

        property_xpath = driver.find_elements(By.XPATH, "(//article[@class='listing-maxi']/a)[position()>=2 and position()<30]")
        for detail in property_xpath:
            href = detail.get_attribute('href')
            time.sleep(2)
            yield SeleniumRequest(
                url=href,
            )
        driver.quit()
        return super().start_requests()

    def parse(self, response):
        yield {
            'Title': response.xpath("//div[@class='css-w50tn5 e1pt9h6u11']/h1/text()").get(),
            'Price': response.xpath("//h3[@itemprop='price']/text()").get(),
            'Add Posted': response.xpath("//*[@id='content']/div[1]/div/main/div[5]/section/div[1]/dl[1]/dd/text()").get(),
            'Links': response.url
        }

        next_page = response.xpath("//li[@class='pagination-currentpage']/following-sibling::li[1]/a/text()").get()
        if next_page:
            abs_url = f'https://www.gumtree.com/property-for-sale/london/page{next_page}'
            yield SeleniumRequest(
                url=abs_url,
                wait_time=5,
                callback=self.parse
            )
Your code seems to be correct, but you are getting a TCP/IP block. I also tried an alternative way where the code is correct and pagination works; this kind of pagination is about two times faster than the others, but it sometimes gives me strange results and sometimes gets an IP block.
import scrapy
from scrapy import Selector
from scrapy_selenium import SeleniumRequest


class Basic2Spider(scrapy.Spider):
    name = 'basic2'
    responses = []

    def start_requests(self):
        url = 'https://www.gumtree.com/property-for-sale/london/page{page}'
        for page in range(1, 6):
            print(page)
            yield SeleniumRequest(
                url=url.format(page=page),
                callback=self.parse,
                wait_time=5
            )

    def parse(self, response):
        driver = response.meta['driver']
        intial_page = driver.page_source
        self.responses.append(intial_page)
        for resp in self.responses:
            r = Selector(text=resp)
            property_xpath = r.xpath("(//article[@class='listing-maxi']/a)[position()>=2 and position()<30]")
            for detail in property_xpath:
                yield {
                    'Title': detail.xpath('.//*[@class="listing-title"]/text()').get().strip(),
                    'Price': detail.xpath('.//*[@class="listing-price"]/strong/text()').get(),
                    'Add Posted': detail.xpath('.//*[@class="listing-posted-date txt-sub"]/span//text()').getall()[2].strip(),
                    'Links': response.url
                }
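Note that in this alternative version parse() appends each page to the class-level responses list and then loops over everything collected so far, so items from earlier pages are re-emitted on every callback; that alone could explain the "strange results". A minimal variation (a sketch reusing the same selectors) that only parses the page which triggered the callback:

import scrapy
from scrapy import Selector
from scrapy_selenium import SeleniumRequest


class Basic2Spider(scrapy.Spider):
    name = 'basic2'

    def start_requests(self):
        url = 'https://www.gumtree.com/property-for-sale/london/page{page}'
        for page in range(1, 6):
            yield SeleniumRequest(url=url.format(page=page), callback=self.parse, wait_time=5)

    def parse(self, response):
        # parse only the current page; no shared list, so nothing is yielded twice
        r = Selector(text=response.meta['driver'].page_source)
        for detail in r.xpath("(//article[@class='listing-maxi']/a)[position()>=2 and position()<30]"):
            yield {
                'Title': (detail.xpath('.//*[@class="listing-title"]/text()').get() or '').strip(),
                'Price': detail.xpath('.//*[@class="listing-price"]/strong/text()').get(),
                'Links': response.url,
            }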

Need screenshot from URL using chromedriver and selenium

I am able to take a screenshot of a website, but I need to take a screenshot from a URL the way we do with a snipping tool: based on screen size/pixels, or with some scroll up and scroll down operations to capture the other images. Please suggest a way to get the picture from the URL below.
https://artsandculture.google.com/asset/FgEEOnrrqsn9OA
I am using the code below in AWS Lambda:
import json
# coding=utf-8
import time
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
import logging
import os


class WebDriver(object):
    def __init__(self):
        self.options = Options()
        self.options.binary_location = '/opt/headless-chromium'
        #self.options.add_argument('--headless')
        self.options.add_argument('--no-sandbox')
        #self.options.add_argument('--start-maximized')
        #self.options.add_argument('--start-fullscreen')
        self.options.add_argument('--single-process')
        self.options.add_argument('--disable-dev-shm-usage')
        self.options.add_argument("--kiosk")

    def get(self):
        driver = Chrome('/opt/chromedriver', options=self.options)
        return driver


def lambda_handler(event, context):
    instance_ = WebDriver()
    driver = instance_.get()
    options = Options()
    a = os.listdir('/tmp')
    for x in a:
        print(x)
    URL = os.environ.get("URL")
    driver.get(URL)
    S = lambda X: driver.execute_script('return document.body.parentNode.scroll' + X)
    #driver.set_window_size(S('Width'), S('Height'))
    #driver.set_window_size(1280, 720)
    # May need manual adjustment
    driver.find_element_by_tag_name('body').screenshot('/tmp/job_status.png')
    driver.quit()
I am using XPath to get a particular division of the page by class:
driver.find_element_by_xpath('/html/body/div/div[1]/section[2]/div[1]/div/div[2]').screenshot('/tmp/daily_job_status.png')
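To capture the whole page rather than just the visible viewport, one common approach is to resize the window to the document's scroll dimensions before taking the body screenshot, which is what the commented-out set_window_size(S('Width'), S('Height')) line was aiming at. A minimal sketch (full_page_screenshot is a hypothetical helper; it assumes headless Chrome, so the window can be made larger than the physical screen, and uses the same Selenium 3 style API as the code above):

def full_page_screenshot(driver, path='/tmp/full_page.png'):
    # measure the full document size via JavaScript
    width = driver.execute_script('return document.body.parentNode.scrollWidth')
    height = driver.execute_script('return document.body.parentNode.scrollHeight')
    # headless Chrome accepts window sizes larger than the physical screen
    driver.set_window_size(width, height)
    # a screenshot of <body> now covers the entire page
    driver.find_element_by_tag_name('body').screenshot(path)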

Passing authenticated session from selenium to scrapy

I am trying to log in to a website using Selenium and then pass the authenticated session to Scrapy to extract stuff.
The issue is that after I pass the session to Scrapy, I am still not logged in.
class LoginSpider(scrapy.Spider):
    name = 'login'
    allowed_domains = ['*****']
    start_urls = ['*****']

    def __init__(self):
        self.driver = webdriver.Firefox()

    def start_requests(self):
        # driver = webdriver.Firefox()
        self.driver.get('*****')
        time.sleep(5)
        portalButton = self.driver.find_element_by_xpath('//*[@id="fb_submit"]')
        portalButton.click()
        time.sleep(2)
        self.driver.find_element_by_xpath('//*[@id="email"]').send_keys('******')
        self.driver.find_element_by_xpath('//*[@id="password"]').send_keys('******')
        self.driver.find_element_by_xpath('//*[@id="btn-login"]').click()
        time.sleep(5)
        for cookie in self.driver.get_cookies():
            c = {cookie['name']: cookie['value']}
        yield Request(url="****", cookies=c, callback=self.parse)

    def parse(self, response):
        # self.log("->>>>>>>>>>>>")
        open_in_browser(response)
        # view(response)
        self.log("->>>>>>>>>>>>")
I would suggest changing that step a bit:
for cookie in self.driver.get_cookies():
    c = {cookie['name']: cookie['value']}
to something like this:
_cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
yield Request(url="****", cookies=_cookies, callback=self.parse)
In each iteration you re-create c with a new {cookie['name']: cookie['value']}, so by the time the request is yielded, c holds only a single cookie instead of the full authenticated session.
My code example:
import time

import scrapy
from scrapy import Request
from scrapy.utils.response import open_in_browser
from selenium import webdriver
from selenium.webdriver.common.by import By


class LoginSpider(scrapy.Spider):
    name = 'login'
    start_urls = ['URL']

    def __init__(self):
        super().__init__()
        self.driver = webdriver.Chrome()

    def start_requests(self):
        self.driver.get('URL')
        time.sleep(5)
        self.driver.find_element(By.ID, 'email').send_keys('EMAIL')
        self.driver.find_element(By.ID, 'passwd').send_keys('PASSWORD')
        self.driver.find_element(By.ID, 'SubmitLogin').click()
        _cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
        yield Request(url='URL',
                      cookies=_cookies,
                      callback=self.parse)
        self.driver.quit()

    def parse(self, response, **kwargs):
        open_in_browser(response)
        self.log(response)

How to reduce number of selenium webdriver instances being spawned by scrapy on running crawl on a spider?

On running the crawl process for any spider, Scrapy tends to spawn a lot of Firefox instances (27 on average, varying between 19 and 30), even if the spider being run is not using Selenium.
I have tried driver.quit() inside def __del__(self) in each of the spiders using selenium. The problem still persists.
The Firefox instances stay open even after the crawling process is finished.
example spider using selenium:
import logging
import time
from os.path import abspath, dirname, join

import requests
import scrapy
import selenium
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.remote.remote_connection import LOGGER
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

LOGGER.setLevel(logging.ERROR)

PATH_DIR = dirname(abspath(__file__))
GECKODRIVER_PATH = abspath(join(PATH_DIR, "../../geckodriver"))
WAIT_TIME = 10


class ExampleSpider(sso_singapore.SsoSpider):
    name = "Example"

    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options, executable_path=GECKODRIVER_PATH)

    def __del__(self):
        self.driver.quit()

    def parse(self, response):
        meta = response.meta
        try:
            self.driver.get(response.url)
            body = self.driver.page_source
            try:
                element = WebDriverWait(self.driver, WAIT_TIME).until(
                    EC.presence_of_element_located(
                        # locator is an XPath expression, so By.XPATH is needed here
                        (By.XPATH, '//select[@id="rows_sort"]/option[text()="All"]')
                    )
                )
            except:
                pass
            response = HtmlResponse(
                self.driver.current_url, body=body, encoding="utf-8"
            )
        except Exception as e:
            logging.error(str(e))
        finally:
            self.driver.quit()

        # Create Items based on response

    def start_requests(self):
        for url, meta in zip(urls, meta_list):
            yield scrapy.Request(url, callback=self.parse, meta=meta)
Any help will be much appreciated.
from scrapy import signals
# Options and webdriver as imported in the question's spider
from selenium import webdriver
from selenium.webdriver.firefox.options import Options


class ExampleSpider(sso_singapore.SsoSpider):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(options=options, executable_path="your_path")

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(ExampleSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        self.driver.quit()
This should do the job.
More on Scrapy signals:
https://docs.scrapy.org/en/latest/topics/signals.html
You can also use a pipeline if you have many spiders and don't want to add the same driver.quit() logic to each of them:
class YourPipeline:
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signal=signals.spider_closed)
        return pipeline

    def spider_closed(self, spider):
        if hasattr(spider, 'driver'):
            spider.driver.quit()
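For the pipeline route to take effect, it also has to be enabled in the project settings. A minimal sketch, assuming the class lives in a hypothetical yourproject/pipelines.py (the module path and the priority value are placeholders):

# settings.py
ITEM_PIPELINES = {
    'yourproject.pipelines.YourPipeline': 800,  # any unused priority between 0 and 1000 works
}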

How can I use the driver initialized in setupModule?

When using unittest in Python 3, I tried this:
import unittest
from selenium import webdriver


def setupModule():
    driver = webdriver.Firefox()
    driver.maximize_window()
    driver.get('www.google.com')


def teardownModule():
    driver.close()


class test_01(unittest.TestCase):
    def setUp(self):
        driver.xxxx

    def tearDown(self):
        driver.xxxx

    def test_0001(self):
        driver.yyyy

    def test_0002(self):
        driver.zzzz


class test_02(unittest.TestCase):
    def setUp(self):
        driver.xxxx

    def tearDown(self):
        driver.xxxx

    def test_0001(self):
        driver.yyyy

    def test_0002(self):
        driver.zzzz
The driver can't be resolved inside the classes or in teardownModule. Is there any way to make it available?
I don't want to put driver = webdriver.Firefox() outside of a def, because if I have 2 .py files for different cases, it would start 2 or more Firefox instances up front, whether or not the cases in those files are actually run, so a browser would always stay open.
I'd recommend having a base class to handle the webdriver setup and teardown, e.g.:
class BaseTest(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Firefox()

    def tearDown(self):
        self.driver.quit()


class test_01(BaseTest):
    def test_0001(self):
        self.driver.xxx


class test_02(BaseTest):
    def test_0002(self):
        self.driver.xxx
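If you specifically want one shared browser per module, as in the setupModule idea from the question, a module-level global also works. A minimal sketch; note that unittest only calls these fixtures when they are named exactly setUpModule and tearDownModule:

import unittest
from selenium import webdriver

driver = None  # shared by every test class in this module


def setUpModule():
    # runs once, before any test in this file
    global driver
    driver = webdriver.Firefox()
    driver.maximize_window()


def tearDownModule():
    # runs once, after the last test in this file
    driver.quit()


class test_01(unittest.TestCase):
    def test_0001(self):
        driver.get('https://www.google.com')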