Scrapy newbie here. I'm currently trying to extend my CrawlSpider so it can take multiple arguments from a text file (instead of manually typing each argument into the command line, like scrapy crawl crawl5 -a start_url="argument"). Currently I can pass in one argument and that generates a few items, but I'd like some guidance on two problems:
How can I create an item for each argument?
How can I use that item as a container for the items I generate from each argument?
My goal here is to roughly imitate running my CrawlSpider multiple times, while keeping the items returned from each argument separate.
EDIT: here's my code. As you can see, it's a scraper for thesaurus.com.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from thesaurus.items import ThesaurusItem
class MySpider(CrawlSpider):
    name = 'crawl5'

    def __init__(self, *args, **kwargs):
        self.start_urls = ["http://www.thesaurus.com/browse/%s" % kwargs.get('start_url')]
        self.allowed_domains = ["thesaurus.com"]
        self.rules = (
            Rule(LinkExtractor(restrict_xpaths=("//div[@id='paginator']//a/@href"))),
            Rule(LinkExtractor(allow=('http://www.thesaurus.com/browse/%s/.$' % kwargs.get('start_url'), 'http://www.thesaurus.com/browse/%s/..$' % kwargs.get('start_url'))), callback='parse_item', follow=True)
        )
        super(MySpider, self).__init__(*args, **kwargs)

    def parse_start_url(self, response):
        for sel in response.xpath("//div[contains(@class, 'syn_of_syns')]"):
            print(sel)
            item = ThesaurusItem()
            item['mainsynonym'] = sel.xpath("div/div/div/a/text()").extract()
            item['definition'] = sel.xpath("div/div/div[@class='def']/text()").extract()
            item['secondarysynonym'] = sel.xpath('div/div/ul/li/a/text()').extract()
            yield item

    def parse_item(self, response):
        for sel in response.xpath("//div[contains(@class, 'syn_of_syns')]"):
            print(sel)
            item = ThesaurusItem()
            item['mainsynonym'] = sel.xpath("div/div/div/a/text()").extract()
            item['definition'] = sel.xpath("div/div/div[@class='def']/text()").extract()
            item['secondarysynonym'] = sel.xpath('div/div/ul/li/a/text()').extract()
            yield item
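For illustration, here is a minimal sketch of one possible direction: a plain Spider that reads one search term per line from a text file and tags every item with the term that produced it, so results stay separable per argument. The words.txt file name and the source_word field are made up for the example (source_word would need to be added to ThesaurusItem), and the CrawlSpider rules are dropped here to keep the sketch short.
import scrapy
from thesaurus.items import ThesaurusItem

class WordListSpider(scrapy.Spider):
    name = 'crawl_from_file'

    def start_requests(self):
        # one search term per line; each term plays the role of one -a start_url argument
        with open('words.txt') as f:
            words = [line.strip() for line in f if line.strip()]
        for word in words:
            yield scrapy.Request(
                'http://www.thesaurus.com/browse/%s' % word,
                callback=self.parse_word,
                meta={'word': word},  # remember which argument produced this page
            )

    def parse_word(self, response):
        word = response.meta['word']
        for sel in response.xpath("//div[contains(@class, 'syn_of_syns')]"):
            item = ThesaurusItem()
            item['mainsynonym'] = sel.xpath("div/div/div/a/text()").extract()
            item['definition'] = sel.xpath("div/div/div[@class='def']/text()").extract()
            item['secondarysynonym'] = sel.xpath('div/div/ul/li/a/text()').extract()
            # hypothetical extra field: lets the output be grouped per input word later
            item['source_word'] = word
            yield item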
Related
I wanted to set up multiple spiders and run them, where URL_LIST contains links to the main URLs and DATA_LIST contains URLs embedded in the main URLs. The example I am using does not really represent this case, since I am using quotes URLs, but that is the purpose of the set-up. I then wanted to crawl the spiders and store the results. However, I am unsure how to call the spiders to crawl, because there are two separate spiders.
For example, if I run scrapy crawl val in the terminal I get:
raise error.ReactorAlreadyInstalledError("reactor already installed")
twisted.internet.error.ReactorAlreadyInstalledError: reactor already installed
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy import signals
import scrapy
URL_LIST = ['https://quotes.toscrape.com/tag/love/',
            'https://quotes.toscrape.com/tag/inspirational/']
DATA_LIST = ['https://quotes.toscrape.com/tag/life/',
             'https://quotes.toscrape.com/tag/humor/']

def store_url(*args, **kwargs):
    URL_LIST.append(kwargs['item'])

def store_data(*args, **kwargs):
    DATA_LIST.append(kwargs['item'])

class QuotesSpiderWebsiteA(scrapy.Spider):
    name = 'val'
    start_urls = URL_LIST
    custom_settings = {'FEEDS': {
        'quotes.jl': {
            'format': 'jsonlines'
        }
    }}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                self.parse
            )

    def parse(self, response):
        content = response.xpath('//div[@class="row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get(),
            }

class QuotesSpiderWebsiteB(scrapy.Spider):
    name = 'valb'
    start_urls = DATA_LIST
    custom_settings = {'FEEDS': {
        'quotes.jl': {
            'format': 'jsonlines'
        }
    }}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                self.parse
            )

    def parse(self, response):
        content = response.xpath('//div[@class="row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get(),
            }

if __name__ == '__main__':
    configure_logging()
    runner = CrawlerRunner()

    @defer.inlineCallbacks
    def crawl():
        crawler1 = runner.create_crawler(QuotesSpiderWebsiteA)
        crawler2 = runner.create_crawler(QuotesSpiderWebsiteB)
        crawler1.signals.connect(store_url, signals.item_scraped)
        crawler2.signals.connect(store_data, signals.item_scraped)
        yield runner.crawl(crawler1)
        yield runner.crawl(crawler2)
        reactor.stop()
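For reference, the pattern from the Scrapy docs for driving two crawlers sequentially looks roughly like the sketch below. It assumes the file is started as a plain Python script (python script.py); launching it through scrapy crawl val starts Scrapy's own reactor first, which is what typically produces the ReactorAlreadyInstalledError shown above. The final crawl() / reactor.run() calls are the part missing from the snippet above.
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()

@defer.inlineCallbacks
def crawl():
    # run the two spiders one after the other, then stop the event loop
    yield runner.crawl(QuotesSpiderWebsiteA)
    yield runner.crawl(QuotesSpiderWebsiteB)
    reactor.stop()

crawl()
reactor.run()  # the script blocks here until both crawls have finished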
I am trying to remove duplicate timestamps when I scrape the following site for BTC data. I want the duplicates removed after each batch of requests is sent, so that Scrapy handles the deduplication.
However, I cannot work out how the duplicates should be removed when the data comes from a JSON response. I thought putting the JSON into a dataframe would remove them, but it does not.
Here's the items pipeline:
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
class DuplicatesPipeline:
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        for time in adapter['data']['timestamp']:
            if time in self.ids_seen:
                raise DropItem(f"Duplicate item found: {item!r}")
            else:
                self.ids_seen.add(time)
        return item
The pipeline doesn't seem to produce any errors, but it isn't removing the duplicate timestamps, so it isn't working.
Here's the script that I am using to grab the data.
import scrapy
import numpy as np
from collections import defaultdict
import pandas as pd
import time
def storeBitcoin(response):
    bitcoin = defaultdict(list)
    resp = response.json()['data']['KdataInfo']
    for row in range(0, len(resp)):
        bitcoin['timestamp'].append(resp[row]['T'])
        bitcoin['open'].append(resp[row]['O'])
        bitcoin['closed'].append(resp[row]['C'])
        bitcoin['high'].append(resp[row]['H'])
        bitcoin['low'].append(resp[row]['L'])
    return bitcoin

sec_begin = [55, 75]
sec_end = [15, 35]

class BtcSpider(scrapy.Spider):
    name = 'btcc2'
    start_urls = ['https://www.btcc.com/quot/history?']
    custom_settings = {
        'DOWNLOAD_DELAY': 0.2
    }

    def start_requests(self):
        for urls in self.start_urls:
            for begin, end in zip(sec_begin, sec_end):
                yield scrapy.FormRequest(
                    url=urls,
                    method="GET",
                    formdata={
                        'codeid': '3223607',
                        'token': 'm19JU98eIFQjRgwsf9b3eXXI1jmDSW9N',
                        'interval': '35',
                        'from': f'16517697{begin}',
                        'to': f'16518562{end}',
                    },
                    callback=self.parse,
                )

    def parse(self, response):
        data = pd.DataFrame(storeBitcoin(response))
        data = data.drop_duplicates(subset=['timestamp'])
        yield data
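A minimal sketch of one way a pipeline could drop only the repeated rows rather than the whole item, assuming the spider yields the plain dict returned by storeBitcoin (Scrapy expects dicts or Item objects, not DataFrames) and that the item keys match the ones built above. The class name DedupeTimestampPipeline is made up for the example.
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

class DedupeTimestampPipeline:
    def __init__(self):
        self.seen = set()  # timestamps observed across all items so far

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        # indices of rows whose timestamp has not been seen yet
        keep = [i for i, ts in enumerate(adapter['timestamp']) if ts not in self.seen]
        if not keep:
            raise DropItem("all timestamps in this item were duplicates")
        self.seen.update(adapter['timestamp'][i] for i in keep)
        # rebuild every column so the parallel lists stay aligned
        for key in ('timestamp', 'open', 'closed', 'high', 'low'):
            adapter[key] = [adapter[key][i] for i in keep]
        return item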
In a CrawlSpider, how can I scrape the field marked "4 days ago" in the image before extracting each link?
The CrawlSpider below works fine, but in parse_item I want to add a new field named 'Add posted' that should hold the value marked in the image.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class PropertySpider(CrawlSpider):
    name = 'property'
    allowed_domains = ['www.openrent.co.uk']
    start_urls = [
        'https://www.openrent.co.uk/properties-to-rent/london?term=London&skip=' + str(x) for x in range(0, 5, 20)
    ]

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//div[@id='property-data']/a"), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        yield {
            'Title': response.xpath("//h1[@class='property-title']/text()").get(),
            'Price': response.xpath("//h3[@class='perMonthPrice price-title']/text()").get(),
            'Links': response.url,
            'Add posted': ?
        }
When using the Rule object of the Scrapy CrawlSpider, the extracted link's text is saved in a meta field of the request named link_text. You can obtain this value in the parse_item method and extract the time information with a regex. You can read more about it in the docs. See the example below.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
class PropertySpider(CrawlSpider):
    name = 'property'
    allowed_domains = ['www.openrent.co.uk']
    start_urls = [
        'https://www.openrent.co.uk/properties-to-rent/london?term=London&skip=' + str(x) for x in range(0, 5, 20)
    ]

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//div[@id='property-data']/a"), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        link_text = response.request.meta.get("link_text")
        posted = None  # fallback if the link text has no "Last Updated ... ago" part
        m = re.search(r"(Last Updated.*ago)", link_text)
        if m:
            posted = m.group(1).replace("\xa0", " ")
        yield {
            'Title': response.xpath("//h1[@class='property-title']/text()").get(),
            'Price': response.xpath("//h3[@class='perMonthPrice price-title']/text()").get(),
            'Links': response.url,
            "Add posted": posted
        }
To get that data point in a loop instead, you can use the following XPath:
x = response.xpath('//div[@class="timeStamp"]')
for i in x:
    yield {'result': i.xpath("./i/following-sibling::text()").get().strip()}
For certain reasons, I would like to reset the list of seen URLs that Scrapy maintains internally at some point in my spider code.
I know that by default Scrapy uses the RFPDupeFilter class and that there is a fingerprint set.
How can this set be cleared from spider code?
To be more specific: I'd like to clear the set in a custom idle_handler method that is called by the spider_idle signal.
You can access the current dupefilter object used by the spider via self.crawler.engine.slot.scheduler.df.
from scrapy import signals, Spider
from scrapy.xlib.pydispatch import dispatcher
class ExampleSpider(Spider):
    name = "example"
    start_urls = ['http://www.example.com/']

    def __init__(self, *args, **kwargs):
        super(ExampleSpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.reset_dupefilter, signals.spider_idle)

    def reset_dupefilter(self, spider):
        # clear the fingerprints stored by the dupefilter when the spider goes idle
        self.crawler.engine.slot.scheduler.df.fingerprints = set()

    def parse(self, response):
        pass
You can reset the fingerprint set by reinitializing it to an empty set:
self.crawler.engine.slot.scheduler.df.fingerprints = set()
Put the following code in your spider.
def reset_filter(self, spider):
    self.crawler.engine.slot.scheduler.df.fingerprints = set()

# overriding the default from_crawler class method to access scrapy core components
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    spider = super(MySpider, cls).from_crawler(crawler, *args, **kwargs)
    # connect the handler so it fires on the spider_idle signal
    crawler.signals.connect(spider.reset_filter, signals.spider_idle)
    return spider
Please refer to https://github.com/scrapy/scrapy/issues/1762 for more information.
Here is my spider:
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem
class vriskoSpider(CrawlSpider):
    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']

    rules = (
        Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse_vrisko'),
    )

    def parse_vrisko(self, response):
        hxs = HtmlXPathSelector(response)
        vriskoit = VriskoItem()
        vriskoit['eponimia'] = hxs.select("//a[@itemprop='name']/text()").extract()
        vriskoit['address'] = hxs.select("//div[@class='results_address_class']/text()").extract()
        print ' '.join(vriskoit['eponimia']).join(vriskoit['address'])
        return vriskoit
The pages I try to crawl have the format http://www.blabla.com/blabla/bla?page=x, where x is any integer.
My problem is that my spider crawls all pages except the first one!
Any ideas why this happens?
Thank you in advance!
If you look into the Scrapy docs, the responses for start_urls go to the parse method, so you can change your rule like this:
rules = (
    Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse'),
)
and rename the method from def parse_vrisko(self, response) to def parse(self, response).
Or you can remove start_urls and start your spider with def start_requests(self), using parse_vrisko as the callback.
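As another option, not mentioned above, CrawlSpider also exposes a parse_start_url hook that receives the responses for start_urls, so the first page can be routed through the same parsing logic without touching the rules. A minimal sketch:
class vriskoSpider(CrawlSpider):
    # ... name, allowed_domains, start_urls and rules as above ...

    def parse_start_url(self, response):
        # CrawlSpider sends the responses for start_urls here, so the first
        # page gets the same treatment as the ?page=N pages
        return self.parse_vrisko(response)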