Normally you provide hard-coded start_urls, but what if I want to supply a particular listing URL one at a time? I know I could do a plain text-file open operation, but I wonder whether there is a Scrapy way to perform the task.
The start_urls will be saved in a file.
Let's say you put your config files inside a configs directory under the spiders directory, so the overall path is scrapy_project -> spiders -> configs -> <spider_name>.txt.
Then you can override __init__() of your spider to populate start_urls, something like this:
import os

def __init__(self, *args, **kwargs):
    # resolve configs/<spider_name>.txt relative to the spider module
    script_dir = os.path.dirname(__file__)
    abs_file_path = os.path.join(script_dir, "configs/%s.txt" % self.name)
    with open(abs_file_path) as f:
        self.start_urls = [line.strip() for line in f.readlines()]
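If you would rather not touch __init__ at all, a rough alternative sketch (assuming the same configs/<spider_name>.txt layout as above, and a hypothetical spider name) is to read the file lazily in start_requests():

import os

import scrapy


class FileUrlsSpider(scrapy.Spider):
    # hypothetical spider; configs/file_urls.txt is assumed to hold one URL per line
    name = "file_urls"

    def start_requests(self):
        script_dir = os.path.dirname(__file__)
        abs_file_path = os.path.join(script_dir, "configs/%s.txt" % self.name)
        with open(abs_file_path) as f:
            for line in f:
                url = line.strip()
                if url:  # skip blank lines
                    yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        self.logger.info("Visited %s", response.url)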
You can do it, but you have to modify __init__() of your spider:
from scrapy import Spider


class MySpider(Spider):
    name = 'start_urls'
    start_urls = ['http://google.com']

    def __init__(self, **kwargs):
        super(MySpider, self).__init__(**kwargs)
        if isinstance(self.start_urls, str):
            self.start_urls = self.start_urls.split(',')

    def parse(self, response):
        print(response.url)
Now you can override start_urls by passing a comma-separated list via the -a argument:
scrapy crawl start_urls -a "start_urls=http://stackoverflow.com,http://httpbin.org"
Unfortunately, AFAIK you can only pass arguments as strings via -a, so you need to handle the conversion yourself, in this case turning the string into a list: start_requests() iterates over the start_urls value, and if it is a string it will iterate over every character and break.
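If a comma-separated list gets unwieldy, the same -a mechanism can carry a file path instead. Here is a rough sketch (the url_file argument name and the spider name are invented for the example) that reads one URL per line from the given file:

import scrapy


class FileArgSpider(scrapy.Spider):
    name = 'file_arg_spider'  # hypothetical name

    def __init__(self, url_file=None, **kwargs):
        super(FileArgSpider, self).__init__(**kwargs)
        if url_file:
            # file passed with -a url_file=urls.txt, one URL per line
            with open(url_file) as f:
                self.start_urls = [line.strip() for line in f if line.strip()]

    def parse(self, response):
        print(response.url)

Run it with something like: scrapy crawl file_arg_spider -a url_file=urls.txt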
I want to set up multiple spiders and run them, where URL_LIST contains links to the main URLs and DATA_LIST contains URLs embedded in the main URLs. The example I am using does not represent this case, since I am using the quotes URLs, but that is the purpose of the set-up. I then want to run the crawls and store their results, but I am unsure how to invoke the crawl because there are two separate spiders.
For example, if I run scrapy crawl val in the terminal I get:
raise error.ReactorAlreadyInstalledError("reactor already installed")
twisted.internet.error.ReactorAlreadyInstalledError: reactor already installed
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy import signals
import scrapy

URL_LIST = ['https://quotes.toscrape.com/tag/love/',
            'https://quotes.toscrape.com/tag/inspirational/']
DATA_LIST = ['https://quotes.toscrape.com/tag/life/',
             'https://quotes.toscrape.com/tag/humor/']


def store_url(*args, **kwargs):
    URL_LIST.append(kwargs['item'])


def store_data(*args, **kwargs):
    DATA_LIST.append(kwargs['item'])


class QuotesSpiderWebsiteA(scrapy.Spider):
    name = 'val'
    start_urls = URL_LIST
    custom_settings = {'FEEDS': {
        'quotes.jl': {
            'format': 'jsonlines'
        }
    }}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                self.parse
            )

    def parse(self, response):
        content = response.xpath('//div[@class = "row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get(),
            }


class QuotesSpiderWebsiteB(scrapy.Spider):
    name = 'valb'
    start_urls = DATA_LIST
    custom_settings = {'FEEDS': {
        'quotes.jl': {
            'format': 'jsonlines'
        }
    }}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                self.parse
            )

    def parse(self, response):
        content = response.xpath('//div[@class = "row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get(),
            }


if __name__ == '__main__':
    configure_logging()
    runner = CrawlerRunner()

    @defer.inlineCallbacks
    def crawl():
        crawler1 = runner.create_crawler(QuotesSpiderWebsiteA)
        crawler2 = runner.create_crawler(QuotesSpiderWebsiteB)
        crawler1.signals.connect(store_url, signals.item_scraped)
        crawler2.signals.connect(store_data, signals.item_scraped)
        yield runner.crawl(crawler1)
        yield runner.crawl(crawler2)
        reactor.stop()

    # kick off the chained crawls and start the reactor
    crawl()
    reactor.run()
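Since the script above manages its own Twisted reactor, it collides with the reactor that scrapy crawl has already installed. One possible way around this (a sketch only, not a drop-in fix) is to launch the script directly with python and let CrawlerProcess drive both spiders, since it owns the reactor itself:

from scrapy.crawler import CrawlerProcess

if __name__ == '__main__':
    process = CrawlerProcess()
    # queue both spiders; run this with `python script.py`, not `scrapy crawl`
    process.crawl(QuotesSpiderWebsiteA)
    process.crawl(QuotesSpiderWebsiteB)
    process.start()  # blocks until both crawls finish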
scrapy crawl test -o test123.csv
How can I get the output filename from code? That is, I would like to use the filename entered in the terminal inside the spider_closed function.
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    spider = super(MySpider, cls).from_crawler(crawler, *args, **kwargs)
    crawler.signals.connect(spider.spider_closed, signal=scrapy.signals.spider_closed)
    return spider

def spider_closed(self):
    # read test123.csv (whatever the filename is)
    pass
You can use self.settings.attributes["FEED_URI"].value in your spider to get the output file name.
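For example, a sketch of using that setting inside spider_closed (this assumes the feed was given with -o as above; on newer Scrapy versions feeds are configured through the FEEDS dict instead, so FEED_URI may not be set):

def spider_closed(self):
    # FEED_URI reflects the value passed with -o, e.g. 'test123.csv'
    filename = self.settings.get('FEED_URI')
    self.logger.info("Feed written to %s", filename)
    # open(filename) here to post-process the exported file if needed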
Scrapy newbie here. I'm currently trying to extend my CrawlSpider so it can take multiple arguments from a text document (instead of manually typing each argument into the command line, like scrapy crawl crawl5 -a start_url="argument"). Currently I can input one argument and it generates a few items, but I'd like some guidance on two problems:
How can I create an item for each argument?
How can I use that item as a container for the items I generate from each argument?
My goal here is to more or less imitate running my CrawlSpider multiple times, while keeping the items returned from each argument separate.
EDIT: here's my code; as you can see, it's a scraper for thesaurus.com.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from thesaurus.items import ThesaurusItem


class MySpider(CrawlSpider):
    name = 'crawl5'

    def __init__(self, *args, **kwargs):
        self.start_urls = ["http://www.thesaurus.com/browse/%s" % kwargs.get('start_url')]
        self.allowed_domains = ["thesaurus.com"]
        self.rules = (
            Rule(LinkExtractor(restrict_xpaths=("//div[@id='paginator']//a/@href"))),
            Rule(LinkExtractor(allow=('http://www.thesaurus.com/browse/%s/.$' % kwargs.get('start_url'),
                                      'http://www.thesaurus.com/browse/%s/..$' % kwargs.get('start_url'))),
                 callback='parse_item', follow=True)
        )
        super(MySpider, self).__init__(*args, **kwargs)

    def parse_start_url(self, response):
        for sel in response.xpath("//div[contains(@class, 'syn_of_syns')]"):
            print(sel)
            item = ThesaurusItem()
            item['mainsynonym'] = sel.xpath("div/div/div/a/text()").extract()
            item['definition'] = sel.xpath("div/div/div[@class='def']/text()").extract()
            item['secondarysynonym'] = sel.xpath('div/div/ul/li/a/text()').extract()
            yield item

    def parse_item(self, response):
        for sel in response.xpath("//div[contains(@class, 'syn_of_syns')]"):
            print(sel)
            item = ThesaurusItem()
            item['mainsynonym'] = sel.xpath("div/div/div/a/text()").extract()
            item['definition'] = sel.xpath("div/div/div[@class='def']/text()").extract()
            item['secondarysynonym'] = sel.xpath('div/div/ul/li/a/text()').extract()
            yield item
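One way to approach the two questions above (only a sketch; the word_file argument, the parse_word callback, the spider name, and the item's word field are invented for the example, and the CrawlSpider rules are omitted for brevity) is to read all arguments from a text file, issue one request per argument, and tag each request with the argument it came from so the resulting items stay grouped:

import scrapy

from thesaurus.items import ThesaurusItem


class MultiWordSpider(scrapy.Spider):
    name = 'crawl_multi'  # hypothetical name

    def __init__(self, word_file=None, *args, **kwargs):
        super(MultiWordSpider, self).__init__(*args, **kwargs)
        # one search term per line, passed with -a word_file=words.txt
        with open(word_file) as f:
            self.words = [line.strip() for line in f if line.strip()]

    def start_requests(self):
        for word in self.words:
            yield scrapy.Request(
                "http://www.thesaurus.com/browse/%s" % word,
                callback=self.parse_word,
                meta={'word': word},  # remember which argument produced this request
            )

    def parse_word(self, response):
        for sel in response.xpath("//div[contains(@class, 'syn_of_syns')]"):
            item = ThesaurusItem()
            item['mainsynonym'] = sel.xpath("div/div/div/a/text()").extract()
            item['definition'] = sel.xpath("div/div/div[@class='def']/text()").extract()
            item['secondarysynonym'] = sel.xpath('div/div/ul/li/a/text()').extract()
            # assumes ThesaurusItem defines a 'word' field to keep results separable per argument
            item['word'] = response.meta['word']
            yield item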
For various reasons, I would like to reset the list of seen URLs that Scrapy maintains internally at some point in my spider code.
I know that by default Scrapy uses the RFPDupeFilter class and that it keeps a set of fingerprints.
How can this set be cleared from within spider code?
To be more specific: I'd like to clear the set in a custom idle_handler method that is called by the spider_idle signal.
You can access the current dupefilter object used by the spider via self.crawler.engine.slot.scheduler.df.
from scrapy import signals, Spider
from scrapy.xlib.pydispatch import dispatcher


class ExampleSpider(Spider):
    name = "example"
    start_urls = ['http://www.example.com/']

    def __init__(self, *args, **kwargs):
        super(ExampleSpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.reset_dupefilter, signals.spider_idle)

    def reset_dupefilter(self, spider):
        # clear the fingerprints stored by the dupefilter when idle
        self.crawler.engine.slot.scheduler.df.fingerprints = set()

    def parse(self, response):
        pass
You can reset the fingerprint set by re-initializing self.crawler.engine.slot.scheduler.df.fingerprints to an empty set. Put the following code in your spider:
def reset_filter(self, spider):
    self.crawler.engine.slot.scheduler.df.fingerprints = set()

# overriding the default from_crawler class method to access scrapy core components
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    spider = super(MySpider, cls).from_crawler(crawler, *args, **kwargs)
    # initiate an event signal when spider is idle
    crawler.signals.connect(spider.reset_filter, signals.spider_idle)
    return spider
Please refer to https://github.com/scrapy/scrapy/issues/1762 for more information.
Here is my spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem


class vriskoSpider(CrawlSpider):
    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']

    rules = (
        Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse_vrisko'),
    )

    def parse_vrisko(self, response):
        hxs = HtmlXPathSelector(response)
        vriskoit = VriskoItem()
        vriskoit['eponimia'] = hxs.select("//a[@itemprop='name']/text()").extract()
        vriskoit['address'] = hxs.select("//div[@class='results_address_class']/text()").extract()
        print ' '.join(vriskoit['eponimia']).join(vriskoit['address'])
        return vriskoit
The pages I try to crawl have the format http://www.blabla.com/blabla/bla?page=x, where x is any integer.
My problem is that my spider crawls every page except the first one!
Any idea why this happens?
Thanks in advance!
If you look into the Scrapy docs, the responses for start_urls go to the parse method, so you can change your rule like this:
rules = (
    Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse'),
)
and rename the method from def parse_vrisko(self, response): to def parse(self, response):
Or you can remove start_urls and start your spider with def start_requests(self), using parse_vrisko as the callback; a sketch of that follows.
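For reference, a minimal sketch of that second option (just an outline, not tested against the site): start_requests() yields the initial request with parse_vrisko as the explicit callback, so the first page goes through the same parsing code as the paginated ones.

from scrapy.http import Request

# inside vriskoSpider, replacing start_urls
def start_requests(self):
    # the URL is the one from the question; the response is routed to parse_vrisko
    yield Request(
        'http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF',
        callback=self.parse_vrisko,
    )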