How can I schedule a Scrapy spider to crawl again after a certain time? - scrapy

I want to schedule my spider to run again 1 hour after a crawl finishes. In my code the spider_closed method is called after the crawl ends. How can I run the spider again from this method? Or is there an available setting to schedule the Scrapy spider?
Here is my basic spider code.
import scrapy
import codecs
from a2i.items import A2iItem
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.conf import settings
from scrapy.crawler import CrawlerProcess
from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy.xlib.pydispatch import dispatcher


class A2iSpider(scrapy.Spider):
    name = "notice"
    f = open("urls.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()
    allowed_domains = ["prothom-alo.com"]

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def parse(self, response):
        for href in response.css("a::attr('href')"):
            url = response.urljoin(href.extract())
            print "*"*70
            print url
            print "\n\n"
            yield scrapy.Request(url, callback=self.parse_page, meta={'depth': 2, 'url': url})

    def parse_page(self, response):
        filename = "response.txt"
        depth = response.meta['depth']

        with open(filename, 'a') as f:
            f.write(str(depth))
            f.write("\n")
            f.write(response.meta['url'])
            f.write("\n")

        for href in response.css("a::attr('href')"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_page, meta={'depth': depth + 1, 'url': url})

    def spider_closed(self, spider):
        print "$"*2000

You can use cron.
crontab -e to create a schedule and run scripts as root, or
crontab -u [user] -e to run as a specific user.
At the bottom you can add:
0 * * * * cd /path/to/your/scrapy && scrapy crawl [yourScrapy] >> /path/to/log/scrapy_log.log
0 * * * * makes the script run hourly; you can find more details about the cron syntax online.
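If you would rather re-schedule the crawl from inside one Python process instead of cron, a minimal sketch using CrawlerRunner and the Twisted reactor (assuming the project settings can locate the "notice" spider from the question, and an hourly delay) could look like this:

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
runner = CrawlerRunner(get_project_settings())


def crawl():
    # run one crawl; when it finishes, schedule the next one in an hour
    deferred = runner.crawl("notice")
    deferred.addCallback(lambda _: reactor.callLater(3600, crawl))


crawl()
reactor.run()  # blocks here; the crawl repeats every hour

This keeps a single process alive, so the spider_closed hook inside the spider itself stays untouched.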

You can run your spider with the JOBDIR setting; it persists the requests loaded in the scheduler so a crawl can be paused and resumed:
scrapy crawl somespider -s JOBDIR=crawls/somespider-1
https://doc.scrapy.org/en/latest/topics/jobs.html
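The same setting can also be pinned in the spider itself; a minimal sketch (the directory name is just an example):

import scrapy


class SomeSpider(scrapy.Spider):
    name = "somespider"
    custom_settings = {
        # persist scheduler state so the crawl can be stopped and resumed
        "JOBDIR": "crawls/somespider-1",
    }

You can then stop the crawl gracefully with a single Ctrl-C and resume it later by running the same crawl command again.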

Related

How to reduce number of selenium webdriver instances being spawned by scrapy on running crawl on a spider?

On running the crawl process for any spider, Scrapy tends to spawn a lot of Firefox instances (27 on average, varying between 19 and 30), even if the spider being run does not use Selenium.
I have tried calling driver.quit() inside def __del__(self) in each of the spiders that use Selenium, but the problem persists.
The Firefox instances stay open even after the crawling process is finished.
Example spider using Selenium:
import logging
import time
from os.path import abspath, dirname, join

import requests
import scrapy
import selenium
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.remote.remote_connection import LOGGER
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

LOGGER.setLevel(logging.ERROR)

PATH_DIR = dirname(abspath(__file__))
GECKODRIVER_PATH = abspath(join(PATH_DIR, "../../geckodriver"))
WAIT_TIME = 10


class ExampleSpider(sso_singapore.SsoSpider):
    name = "Example"

    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options, executable_path=GECKODRIVER_PATH)

    def __del__(self):
        self.driver.quit()

    def parse(self, response):
        meta = response.meta
        try:
            self.driver.get(response.url)
            body = self.driver.page_source
            try:
                element = WebDriverWait(self.driver, WAIT_TIME).until(
                    EC.presence_of_element_located(
                        (By.ID, '//select[@id="rows_sort"]/option[text()="All"]')
                    )
                )
            except:
                pass
            response = HtmlResponse(
                self.driver.current_url, body=body, encoding="utf-8"
            )
        except Exception as e:
            logging.error(str(e))
        finally:
            self.driver.quit()

        # Create Items based on response

    def start_requests(self):
        for url, meta in zip(urls, meta_list):
            yield scrapy.Request(url, callback=parse, meta=meta)
Any help will be much appreciated.
from scrapy import signals


class ExampleSpider(sso_singapore.SsoSpider):

    def __init__(self, *args, **kwargs):
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(options=options, executable_path="your_path")

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(ExampleSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        self.driver.quit()
This should do the job.
More on Scrapy signals:
https://docs.scrapy.org/en/latest/topics/signals.html
You can also use a pipeline if you have many spiders and don't want to repeat the same driver.quit() logic in each of them:
class YourPipeline:

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_closed(self, spider):
        if hasattr(spider, 'driver'):
            spider.driver.quit()
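For the pipeline to take effect it still has to be enabled in the project settings; a minimal sketch, assuming the class lives in myproject/pipelines.py (adjust the module path to your project):

# settings.py -- register the pipeline so Scrapy instantiates it for every spider
ITEM_PIPELINES = {
    "myproject.pipelines.YourPipeline": 300,  # 300 is an arbitrary priority value
}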

Unable to connect spider_closed when yielding to an S3 bucket

# -*- coding: utf-8 -*-
import scrapy
from scrapy.utils.response import open_in_browser
from pydispatch import dispatcher
from scrapy.signalmanager import SignalManager
# from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals


class ExampleSpider(scrapy.Spider):
    name = 'forever'
    allowed_domains = ['example.com']
    kohlind = max_kohls = 0
    total_products = 0
    colected = 0
    items = []
    # AWS_ACCESS_KEY_ID = 'id'
    # AWS_SECRET_ACCESS_KEY = 'pass'
    start_urls = ['https://www.example.com/']

    custom_settings = {
        'FEED_URI': f's3://example-products/fulltest.csv',
        'FEED_EXPORT_FIELDS': ['ITEM_ID', 'URL', 'SELLER', 'PRICE', 'SALE_PRICE',
                               'MAIN_IMAGE', 'OTHER_IMAGE', 'SKU', 'PRODUCT_NAME']
    }

    def __init__(self):
        SignalManager(dispatcher.Any).connect(receiver=self._close, signal=signals.spider_closed)

    # spider code

    def _close(self):
        print(f"\n\nClosing Spider with {len(self.items)} New Items")
        for i in self.items:
            if "URL" in i.keys():
                yield item
        print("Done")
The program never reaches the _close function, no error is raised, and the items yielded in the spider code are uploaded normally (everything works except _close).
When I remove the S3 feed from the settings, it works fine (i.e. it enters the _close function).
How can I fix this?
Try the code below; it should work:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher


class ExampleSpider(scrapy.Spider):
    name = 'forever'

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        print(f"\n\nClosing Spider with {len(self.items)} New Items")
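A likely reason the handler body never seems to run is that _close contains a yield, which makes it a generator function: the signal dispatcher calls it, gets back a generator object that nothing ever iterates, and none of the body executes, so no print and no error appear. Items also cannot be fed into the feed export from a spider_closed handler, since the feed is already being finalised at that point. A minimal sketch of a non-generator handler that simply dumps the leftover items itself (the file name and the dict() conversion are assumptions for illustration):

import json


def _close(self, spider):
    # plain function, no yield: the body actually executes when the signal fires
    leftovers = [i for i in self.items if "URL" in i]
    print(f"\n\nClosing Spider with {len(leftovers)} New Items")
    # items returned from a signal handler do not go through the feed export,
    # so write them somewhere yourself if you still need them
    with open("leftover_items.json", "w") as f:
        json.dump([dict(i) for i in leftovers], f)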

Selenium & Scrapy: Last URL overwrites other URLs

I am currently trying to crawl data from three websites (three different URLs), so I use a text file to load the different URLs into start_urls.
At the moment there are three URLs in my file, but the data from the last URL overwrites the data saved for the two URLs before it.
This is my code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Spider
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.http import Request
from time import sleep
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import re
import csv


class AlltipsSpider(Spider):
    name = 'alltips'
    allowed_domains = ['blogabet.com']

    def start_requests(self):
        self.driver = webdriver.Chrome('C:\webdrivers\chromedriver.exe')
        with open("urls.txt", "rt") as f:
            start_urls = [l.strip() for l in f.readlines()]
        self.driver = webdriver.Chrome('C:\webdrivers\chromedriver.exe')

        for url in start_urls:
            self.driver.get(url)
            self.driver.find_element_by_id('currentTab').click()
            sleep(3)
            self.logger.info('Sleeping for 5 sec.')
            self.driver.find_element_by_xpath('//*[@id="_blog-menu"]/div[2]/div/div[2]/a[3]').click()
            sleep(7)
            self.logger.info('Sleeping for 7 sec.')
            yield Request(self.driver.current_url, callback=self.crawltips)

    def crawltips(self, response):
        sel = Selector(text=self.driver.page_source)
        allposts = sel.xpath('//*[@class="block media _feedPick feed-pick"]')

        for post in allposts:
            username = post.xpath('.//div[@class="col-sm-7 col-lg-6 no-padding"]/a/@title').extract()
            publish_date = post.xpath('.//*[@class="bet-age text-muted"]/text()').extract()

            yield {'Username': username,
                   'Publish date': publish_date
                   }
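The overwriting follows from the control flow: the yielded Requests are only downloaded and passed to crawltips after the start_requests loop has already driven the single shared webdriver on to the next (and finally the last) URL, so the Selector is always built from the last page's page_source; identical current_url values can additionally be collapsed by the duplicate filter. One possible way around it, sketched here with the names from the question, is to snapshot the rendered HTML at yield time and pass it along with the request:

    def start_requests(self):
        self.driver = webdriver.Chrome('C:\webdrivers\chromedriver.exe')
        with open("urls.txt", "rt") as f:
            start_urls = [l.strip() for l in f.readlines()]

        for url in start_urls:
            self.driver.get(url)
            self.driver.find_element_by_id('currentTab').click()
            sleep(3)
            self.driver.find_element_by_xpath('//*[@id="_blog-menu"]/div[2]/div/div[2]/a[3]').click()
            sleep(7)
            # snapshot the rendered HTML now, before the loop moves the
            # shared driver on to the next URL
            yield Request(self.driver.current_url,
                          callback=self.crawltips,
                          meta={'page_source': self.driver.page_source},
                          dont_filter=True)

    def crawltips(self, response):
        # build the selector from the snapshot taken in start_requests,
        # not from the live driver (which is already on the last page)
        sel = Selector(text=response.meta['page_source'])
        # ... extract Username / Publish date as before ...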

scrapy yield Request not working

I wrote the following Scrapy spider, but it does not continue crawling after the initial request, although I've yielded more scrapy.Requests for Scrapy to follow.
import regex as re
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Spider


class myspider(Spider):
    name = 'haha'
    allowed_domains = ['https://blog.scrapinghub.com/']
    start_urls = ['https://blog.scrapinghub.com/']
    extractor = LinkExtractor(allow=allowed_domains)

    def parse(self, response):
        # To extract all the links on this page
        links_in_page = self.extractor.extract_links(response)

        for link in links_in_page:
            yield scrapy.Request(link.url, callback=self.parse)
allowed_domains needs to be a list of domains, not a list of URLs.
So it should be:
allowed_domains = ['blog.scrapinghub.com']
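Putting the fix into the original spider, a minimal sketch (only allowed_domains changes; the allow= pattern is shown as an explicit regex, which it is matched against full URLs):

class myspider(Spider):
    name = 'haha'
    # domains only -- the offsite middleware filters followed links against these
    allowed_domains = ['blog.scrapinghub.com']
    start_urls = ['https://blog.scrapinghub.com/']
    extractor = LinkExtractor(allow=r'blog\.scrapinghub\.com')

    def parse(self, response):
        for link in self.extractor.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)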

Scrapy won't get results from first page

Here is my spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem


class vriskoSpider(CrawlSpider):
    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']
    rules = (
        Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse_vrisko'),
    )

    def parse_vrisko(self, response):
        hxs = HtmlXPathSelector(response)
        vriskoit = VriskoItem()
        vriskoit['eponimia'] = hxs.select("//a[@itemprop='name']/text()").extract()
        vriskoit['address'] = hxs.select("//div[@class='results_address_class']/text()").extract()
        print ' '.join(vriskoit['eponimia']).join(vriskoit['address'])
        return vriskoit
The pages I try to crawl have the format http://www.blabla.com/blabla/bla?page=x, where x is any integer.
My problem is that my spider crawls every page except the first one!
Any ideas why this happens?
Thank you in advance!
If you look into the Scrapy docs, responses for start_urls go to the parse method, so you can change your rule like this:
rules = (
    Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse'),
)
and rename the method from def parse_vrisko(self, response): to def parse(self, response):.
Or you can remove start_urls and start your spider from def start_requests(self) with a callback to parse_vrisko.
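Another documented option, which leaves the rules untouched, is CrawlSpider's parse_start_url hook: it is invoked for the responses of start_urls, so you can route the first page through the existing callback. A minimal sketch (the rest of the spider stays as above):

class vriskoSpider(CrawlSpider):
    # ... name, allowed_domains, start_urls and rules unchanged ...

    def parse_start_url(self, response):
        # CrawlSpider calls this for start_urls responses, so the first page
        # gets the same treatment as the ?page=N pages matched by the rule
        return self.parse_vrisko(response)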