Scrapy not yielding result (crawled 0 pages) - scrapy

Trying to figure out how scrapy works and using it to find information on forums.
items.py
import scrapy
class BodybuildingItem(scrapy.Item):
# define the fields for your item here like:
title = scrapy.Field()
pass
spider.py
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from bodybuilding.items import BodybuildingItem
class BodyBuildingSpider(BaseSpider):
name = "bodybuilding"
allowed_domains = ["forum.bodybuilding.nl"]
start_urls = [
"https://forum.bodybuilding.nl/fora/supplementen.22/"
]
def parse(self, response):
responseSelector = Selector(response)
for sel in responseSelector.css('li.past.line.event-item'):
item = BodybuildingItem()
item['title'] = sel.css('a.data-previewUrl::text').extract()
yield item
The forum I'm trying to get the post titles from in this example is this: https://forum.bodybuilding.nl/fora/supplementen.22/
However I keep getting no results:
class BodyBuildingSpider(BaseSpider): 2017-10-07 00:42:28
[scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: bodybuilding)
2017-10-07 00:42:28 [scrapy.utils.log] INFO: Overridden settings:
{'NEWSPIDER_MODULE': 'bodybuilding.spiders', 'SPIDER_MODULES':
['bodybuilding.spiders'], 'ROBOTSTXT_OBEY': True, 'BOT_NAME':
'bodybuilding'} 2017-10-07 00:42:28 [scrapy.middleware] INFO: Enabled
extensions: ['scrapy.extensions.memusage.MemoryUsage',
'scrapy.extensions.logstats.LogStats',
'scrapy.extensions.corestats.CoreStats'] 2017-10-07 00:42:28
[scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats'] 2017-10-07
00:42:28 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware'] 2017-10-07 00:42:28
[scrapy.middleware] INFO: Enabled item pipelines: [] 2017-10-07
00:42:28 [scrapy.core.engine] INFO: Spider opened 2017-10-07 00:42:28
[scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min),
scraped 0 items (at 0 items/min) 2017-10-07 00:42:28
[scrapy.core.engine] DEBUG: Crawled (404) <GET https://forum.bodybuilding.nl/robots.txt> (referer: None) 2017-10-07
00:42:29 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://forum.bodybuilding.nl/fora/supplementen.22/> (referer: None)
2017-10-07 00:42:29 [scrapy.core.engine] INFO: Closing spider
(finished) 2017-10-07 00:42:29 [scrapy.statscollectors] INFO: Dumping
Scrapy stats: {'downloader/request_bytes': 469,
'downloader/request_count': 2, 'downloader/request_method_count/GET':
2, 'downloader/response_bytes': 22878, 'downloader/response_count':
2, 'downloader/response_status_count/200': 1,
'downloader/response_status_count/404': 1, 'finish_reason':
'finished', 'finish_time': datetime.datetime(2017, 10, 6, 22, 42, 29,
223305), 'log_count/DEBUG': 2, 'log_count/INFO': 7, 'memusage/max':
31735808, 'memusage/startup': 31735808, 'response_received_count':
2, 'scheduler/dequeued': 1, 'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1, 'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2017, 10, 6, 22, 42, 28, 816043)}
2017-10-07 00:42:29 [scrapy.core.engine] INFO: Spider closed
(finished)
I have been following the guide here: http://blog.florian-hopf.de/2014/07/scrapy-and-elasticsearch.html
Update 1:
As someone told me, I needed to update my code to the new standards, which I did, but it didn't change the outcome:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from bodybuilding.items import BodybuildingItem
class BodyBuildingSpider(BaseSpider):
name = "bodybuilding"
allowed_domains = ["forum.bodybuilding.nl"]
start_urls = [
"https://forum.bodybuilding.nl/fora/supplementen.22/"
]
def parse(self, response):
for sel in response.css('li.past.line.event-item'):
item = BodybuildingItem()
yield {'title': title.css('a.data-previewUrl::text').extract_first()}
yield item
Last update with fix
After some good help I finally got it working with this spider:
import scrapy
class BlogSpider(scrapy.Spider):
    """Collect thread titles from the forum listing and follow its pagination."""

    name = 'bodybuilding'
    start_urls = ['https://forum.bodybuilding.nl/fora/supplementen.22/']

    def parse(self, response):
        """Yield one ``{'title': ...}`` dict per ``h3.title`` element, then
        request the next listing page when a "Volgende >" link is present.
        """
        for title in response.css('h3.title'):
            yield {'title': title.css('a::text').extract_first()}

        # '@href' selects the link's href attribute; '/#href' is not valid XPath.
        next_page_url = response.xpath("//a[text()='Volgende >']/@href").extract_first()
        if next_page_url:
            # The extracted href may be relative; response.follow handles that.
            yield response.follow(next_page_url, callback=self.parse)

You should use response.css('li.past.line.event-item') and there is no need for responseSelector = Selector(response).
Also, the CSS selector you are using, li.past.line.event-item, is no longer valid, so you need to update it first based on the current web page.
To get the next page URL you can use
>>> response.css("a.text::attr(href)").extract_first()
'fora/supplementen.22/page-2'
And then use response.follow to follow this relative url
Edit-2: Next Page processing correction
The previous edit didn't work because on the next page it matches the previous page's URL, so you need to use the following instead:
next_page_url = response.xpath("//a[text()='Volgende >']/@href").extract_first()
if next_page_url:
yield response.follow(next_page_url, callback=self.parse)
Edit-1: Next Page processing
next_page_url = response.css("a.text::attr(href)").extract_first()
if next_page_url:
yield response.follow(next_page_url, callback=self.parse)

Related

Scrapy - Multiple requests by looping over a JSON file

I'm trying to get the latitude and longitude of different cities. The names of the cities are stored in a JSON file. Here is my code:
import scrapy
import json
with open('C:/Users/coppe/tutorial/cities.json') as json_file:
cities = json.load(json_file)
class communes_spider(scrapy.Spider):
name = "geo"
start_urls = ['https://www.latlong.net/']
def parse(self, response):
for city in cities:
return scrapy.FormRequest.from_response(response, formdata={'place': city['city']}, callback=self.get_geo)
def get_geo(self, response):
yield {response.css('span.coordinatetxt::text').get()}
The objective is to loop through the JSON file and, for each city, send a request to the form at the URL "https://www.latlong.net/". However, nothing comes back from this request. Is this a bad way to write the loop? Should I load the JSON file inside the class?
Log:
2019-04-01 16:27:17 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: tutorial)
2019-04-01 16:27:17 [scrapy.utils.log] INFO: Versions: lxml 4.2.5.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.19.0, Twisted 18.9.0, Python 3.7.1 (default, Oct 28 2018, 08:39:03) [MSC v.1912 64 bit (AMD64)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2p 14 Aug 2018), cryptography 2.3.1, Platform Windows-10-10.0.17763-SP0
2019-04-01 16:27:17 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'tutorial', 'NEWSPIDER_MODULE': 'tutorial.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['tutorial.spiders']}
2019-04-01 16:27:17 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2019-04-01 16:27:17 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2019-04-01 16:27:17 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2019-04-01 16:27:17 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2019-04-01 16:27:17 [scrapy.core.engine] INFO: Spider opened
2019-04-01 16:27:17 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2019-04-01 16:27:17 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2019-04-01 16:27:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.latlong.net/robots.txt> (referer: None)
2019-04-01 16:27:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.latlong.net/> (referer: None)
2019-04-01 16:27:18 [scrapy.core.engine] DEBUG: Crawled (200) <POST https://www.latlong.net/> (referer: https://www.latlong.net/)
2019-04-01 16:27:18 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.latlong.net/>
{'latlong': '0,0'}
2019-04-01 16:27:18 [scrapy.core.engine] INFO: Closing spider (finished)
2019-04-01 16:27:18 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 874,
'downloader/request_count': 3,
'downloader/request_method_count/GET': 2,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 29252,
'downloader/response_count': 3,
'downloader/response_status_count/200': 3,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2019, 4, 1, 14, 27, 18, 923987),
'item_scraped_count': 1,
'log_count/DEBUG': 5,
'log_count/INFO': 7,
'request_depth_max': 1,
'response_received_count': 3,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2019, 4, 1, 14, 27, 17, 773592)}
2019-04-01 16:27:18 [scrapy.core.engine] INFO: Spider closed (finished)
Your parse method should be a generator, so you need to use yield instead of return inside the for loop; otherwise you'll exit the loop on the first iteration. Furthermore, the get_geo method is returning a set, but it must return a Request, BaseItem, dict or None.
I suggest changing the code as follow:
import scrapy
import json
with open('C:/Users/coppe/tutorial/cities.json') as json_file:
cities = json.load(json_file)
class communes_spider(scrapy.Spider):
    """Submit the latlong.net search form once per city and emit the result text."""

    name = "geo"
    start_urls = ['https://www.latlong.net/']

    def parse(self, response):
        """Yield one form submission for each entry of the module-level ``cities``."""
        for city in cities:
            # yield (rather than return) so the loop continues past the first city
            yield scrapy.FormRequest.from_response(
                response,
                formdata={'place': city['city']},
                callback=self.get_geo,
            )

    def get_geo(self, response):
        """Yield the text of the first ``span.coordinatetxt`` node as a dict item."""
        # A dict is yielded here; a bare set is not an accepted callback output.
        yield {'coord': response.css('span.coordinatetxt::text').get()}
https://www.geeksforgeeks.org/use-yield-keyword-instead-return-keyword-python/

Spider Runs using scrapy but there is no data stored into a csv

I have written a spider that follows links within a web page to extract data and then moves on to the next page. The links are the "about" links for each author on quotes.toscrape.com.
import scrapy
class TestSpider(scrapy.Spider):
name = 'test'
allowed_domains = ['quotes.toscrape.com']
start_urls = ['http://quotes.toscrape.com',]
def parse(self, response):
linkto = response.css('div.quote > span > a::attr(href)').extract()
for links in linkto:
links = response.urljoin(links)
yield scrapy.Request(url=links, callback = scrapy.parse_about)
nextp = response.css('li.next > a::attr(href)').extract()
if nextp:
nextp = response.urljoin(nextp)
yield scrapy.Request(url=nextp, callback=self.parse)
def parse_about(self, response):
yield {
'date_of_birth': response.css('span.author-born-date::text').extract(),
'author': response.css('h3.author-title::text').extract(),
}
I executed in the command prompt:
scrapy crawl test -o test.csv
but the results I got:
2019-03-20 16:36:03 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: quotestoscrape)
2019-03-20 16:36:03 [scrapy.utils.log] INFO: Versions: lxml 4.1.1.0, libxml2 2.9.9, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 17.5.0, Python 2.7.15 |Anaconda, Inc.| (default, Nov 13 2018, 17:33:26) [MSC v.1500 64 bit (AMD64)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1 11 Sep 2018), cryptography 2.5, Platform Windows-10-10.0.17134
2019-03-20 16:36:03 [scrapy.crawler] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'quotestoscrape.spiders', 'SPIDER_MODULES': ['quotestoscrape.spiders'], 'ROBOTSTXT_OBEY': True, 'BOT_NAME': 'quotestoscrape'}
2019-03-20 16:36:03 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.corestats.CoreStats']
2019-03-20 16:36:03 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2019-03-20 16:36:03 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2019-03-20 16:36:03 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2019-03-20 16:36:03 [scrapy.core.engine] INFO: Spider opened
2019-03-20 16:36:03 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2019-03-20 16:36:03 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2019-03-20 16:36:03 [scrapy.core.engine] DEBUG: Crawled (404) <GET http://quotes.toscrape.com/robots.txt> (referer: None)
2019-03-20 16:36:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://quotes.toscrape.com> (referer: None)
2019-03-20 16:36:04 [scrapy.core.scraper] ERROR: Spider error processing <GET http://quotes.toscrape.com> (referer: None)
Traceback (most recent call last):
File "C:\Users\kenny\Anaconda3\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback
yield next(it)
File "C:\Users\kenny\Anaconda3\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 30, in process_spider_output
for x in result:
File "C:\Users\kenny\Anaconda3\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "C:\Users\kenny\Anaconda3\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\kenny\Anaconda3\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\kenny\quotestoscrape\quotestoscrape\spiders\QuoteTestSpider.py", line 13, in parse
yield scrapy.Request(url=links, callback = scrapy.parse_about)
AttributeError: 'module' object has no attribute 'parse_about'
2019-03-20 16:36:04 [scrapy.core.engine] INFO: Closing spider (finished)
2019-03-20 16:36:04 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 446,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 2701,
'downloader/response_count': 2,
'downloader/response_status_count/200': 1,
'downloader/response_status_count/404': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2019, 3, 20, 21, 36, 4, 41000),
'log_count/DEBUG': 3,
'log_count/ERROR': 1,
'log_count/INFO': 7,
'response_received_count': 2,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'spider_exceptions/AttributeError': 1,
'start_time': datetime.datetime(2019, 3, 20, 21, 36, 3, 468000)}
2019-03-20 16:36:04 [scrapy.core.engine] INFO: Spider closed (finished)
And my CSV file is empty:
enter image description here
Please let me know what I am doing wrong
According to your log, the parse_about method is not called because you are referencing scrapy.parse_about instead of the spider's own self.parse_about:
....
for links in linkto:
links = response.urljoin(links)
yield scrapy.Request(url=links, callback = self.parse_about)
Because your application doesn't scrape any data, it creates an empty CSV file as a result.

Not sure how to use scrapy's itemLoaders

I'm trying to learn how to use Scrapy's ItemLoaders. Can anybody tell me what I am doing wrong? Thank you in advance.
import scrapy
from items.items import ItemsItem
from scrapy.loader import ItemLoader
class ItemspiderSpider(scrapy.Spider):
name = 'itemspider'
allowed_domains = ['yellowpages.com']
start_urls = ['https://www.yellowpages.com/search?search_terms=handyman&geo_location_terms=Miami%2C+FL']
def parse(self, response):
#create the loader using the response
l = ItemLoader(item=ItemsItem(), response=response)
#create a for loop
for listing in response.css('div.search-results.organic div.srp-listing'):
l.add_css('Name', listing.css('a.business-name span::text').extract())
l.add_css('Details', response.urljoin(listing.css('a.business-name::attr(href)')))
l.add_css('WebSite', listing.css('a.track-visit-website::attr(href)').extract_first())
l.add_css('Phones', listing.css('div.phones::text').extract())
yield l.load_item()
When I run the code I keep getting this error:
root@debian:~/Desktop/items/items/spiders# scrapy runspider itemspider.py -o item.csv
/usr/local/lib/python3.5/dist-packages/scrapy/spiderloader.py:37: UserWarning: There are several spiders with the same name:
ItemspiderSpider named 'itemspider' (in items.spiders.itemspider)
ItemspiderSpider named 'itemspider' (in items.spiders.itemspiderLog)
This can cause unexpected behavior.
warnings.warn(msg, UserWarning)
2017-07-04 16:33:20 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: items)
2017-07-04 16:33:20 [scrapy.utils.log] INFO: Overridden settings: {'BOT_NAME': 'items', 'FEED_FORMAT': 'csv', 'SPIDER_LOADER_WARN_ONLY': True, 'SPIDER_MODULES': ['items.spiders'], 'FEED_URI': 'item.csv', 'ROBOTSTXT_OBEY': True, 'NEWSPIDER_MODULE': 'items.spiders'}
2017-07-04 16:33:20 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.memusage.MemoryUsage',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.feedexport.FeedExporter',
'scrapy.extensions.logstats.LogStats']
2017-07-04 16:33:20 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2017-07-04 16:33:20 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2017-07-04 16:33:20 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2017-07-04 16:33:20 [scrapy.core.engine] INFO: Spider opened
2017-07-04 16:33:20 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2017-07-04 16:33:20 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2017-07-04 16:33:21 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.yellowpages.com/robots.txt> (referer: None)
2017-07-04 16:33:23 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.yellowpages.com/search?search_terms=handyman&geo_location_terms=Miami%2C+FL> (referer: None)
2017-07-04 16:33:24 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.yellowpages.com/search?search_terms=handyman&geo_location_terms=Miami%2C+FL> (referer: None)
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "/usr/local/lib/python3.5/dist-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/usr/local/lib/python3.5/dist-packages/scrapy/spidermiddlewares/referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/local/lib/python3.5/dist-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/local/lib/python3.5/dist-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/root/Desktop/items/items/spiders/itemspider.py", line 17, in parse
l.add_css('Details', response.urljoin(listing.css('a.business-name::attr(href)')))
File "/usr/local/lib/python3.5/dist-packages/scrapy/http/response/text.py", line 82, in urljoin
return urljoin(get_base_url(self), url)
File "/usr/lib/python3.5/urllib/parse.py", line 416, in urljoin
base, url, _coerce_result = _coerce_args(base, url)
File "/usr/lib/python3.5/urllib/parse.py", line 112, in _coerce_args
raise TypeError("Cannot mix str and non-str arguments")
TypeError: Cannot mix str and non-str arguments
2017-07-04 16:33:24 [scrapy.core.engine] INFO: Closing spider (finished)
2017-07-04 16:33:24 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 503,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 52924,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 7, 4, 21, 33, 24, 121098),
'log_count/DEBUG': 3,
'log_count/ERROR': 1,
'log_count/INFO': 7,
'memusage/max': 49471488,
'memusage/startup': 49471488,
'response_received_count': 2,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'spider_exceptions/TypeError': 1,
'start_time': datetime.datetime(2017, 7, 4, 21, 33, 20, 705391)}
2017-07-04 16:33:24 [scrapy.core.engine] INFO: Spider closed (finished)
Not sure what is going on; this is actually the first time I have tried to use ItemLoaders.
There are a few issues with your code:
response.urljoin() expects a single string as parameter, not a list. You are passing the result of listing.css(), which is a SelectorList. You can use response.urljoin(listing.css('a.business-name::attr(href)').extract_first())
you need to instantiate one item loader per loop iteration, otherwise, you're accumulating values for each field of a single yielded item
you are using .add_css() with some values (the result of .extract...() calls). .add_css() needs a CSS selector string, not the result of a selector extraction. The CSS extraction will then be done by the item loader. Or, you can use .add_value() if you want to pass the "final" field value directly.
Here are 2 versions that should get you going:
import scrapy
from items.items import ItemsItem
from scrapy.loader import ItemLoader
class ItemspiderSpider(scrapy.Spider):
    """Build one ``ItemsItem`` per search-result listing using ``add_value``."""

    name = 'itemspider'
    allowed_domains = ['yellowpages.com']
    start_urls = ['https://www.yellowpages.com/search?search_terms=handyman&geo_location_terms=Miami%2C+FL']

    def parse(self, response):
        """Yield a loaded item for every organic listing on the results page."""
        for listing in response.css('div.search-results.organic div.srp-listing'):
            # create a fresh loader inside the loop so that field values
            # do not accumulate across listings
            loader = ItemLoader(item=ItemsItem())
            # use .add_value() since we pass the extraction result directly
            loader.add_value('Name', listing.css('a.business-name span::text').extract())
            # pass a single string (not a SelectorList) to response.urljoin()
            loader.add_value(
                'Details',
                response.urljoin(
                    listing.css('a.business-name::attr(href)').extract_first()
                ),
            )
            loader.add_value('WebSite', listing.css('a.track-visit-website::attr(href)').extract_first())
            loader.add_value('Phones', listing.css('div.phones::text').extract())
            yield loader.load_item()
Or, using .add_css():
import scrapy
from items.items import ItemsItem
from scrapy.loader import ItemLoader
class ItemspiderSpider(scrapy.Spider):
    """Build one ``ItemsItem`` per search-result listing using ``add_css``."""

    name = 'itemspider'
    allowed_domains = ['yellowpages.com']
    start_urls = ['https://www.yellowpages.com/search?search_terms=handyman&geo_location_terms=Miami%2C+FL']

    def parse(self, response):
        """Yield a loaded item for every organic listing on the results page."""
        for listing in response.css('div.search-results.organic div.srp-listing'):
            # pass the 'listing' selector to the item loader
            # so that CSS selection is relative to it
            loader = ItemLoader(ItemsItem(), selector=listing)
            loader.add_css('Name', 'a.business-name span::text')
            # NOTE(review): the href is stored as extracted, without
            # response.urljoin(); it may be a relative URL - confirm.
            loader.add_css('Details', 'a.business-name::attr(href)')
            loader.add_css('WebSite', 'a.track-visit-website::attr(href)')
            loader.add_css('Phones', 'div.phones::text')
            yield loader.load_item()

Conflicts When Generating Start Urls

I'm working on retrieving information from the National Gallery of Art's online catalog. Due to the catalog's structure, I can't navigate by extracting and following links from entry to entry. Fortunately, each object in the collection has a predictable url. I want my spider to navigate the collection by generating start urls.
I have attempted to solve my problem by implementing the solution from this thread. Unfortunately, this seems to break another part of my spider. The error log reveals that my urls are being successfully generated, but they aren't being processed correctly. If I'm interpreting the log correctly—which I suspect I'm not—there is a conflict between the redefinition of the start_urls that allows me to generate the urls I need and the rules section of the spider. As things stand now, the spider also doesn't respect the number of pages that I ask it to crawl.
You'll find my spider and a typical error below. I appreciate any help you can offer.
Spider:
URL = "http://www.nga.gov/content/ngaweb/Collection/art-object-page.%d"
starting_number = 1312
number_of_pages = 10
class NGASpider(CrawlSpider):
name = 'ngamedallions'
allowed_domains = ['nga.gov']
start_urls = [URL % starting_number]
rules = (
Rule(LinkExtractor(allow=('art-object-page.*','objects/*')),callback='parse_CatalogRecord',
follow=True))
def __init__(self):
self.page_number = starting_number
def start_requests(self):
for i in range (self.page_number, number_of_pages, -1):
yield Request(url = URL % i + ".html" , callback=self.parse)
def parse_CatalogRecord(self, response):
CatalogRecord = ItemLoader(item=NgamedallionsItem(), response=response)
CatalogRecord.default_output_processor = TakeFirst()
CatalogRecord.image_urls_out = scrapy.loader.processors.Identity()
keywords = "medal|medallion"
r = re.compile('.*(%s).*' % keywords, re.IGNORECASE|re.MULTILINE|re.UNICODE)
if r.search(response.body_as_unicode()):
CatalogRecord.add_xpath('title', './/dl[@class="artwork-details"]/dt[@class="title"]/text()')
CatalogRecord.add_xpath('accession', './/dd[@class="accession"]/text()')
CatalogRecord.add_xpath('inscription', './/div[@id="inscription"]/p/text()')
CatalogRecord.add_xpath('image_urls', './/img[@class="mainImg"]/@src')
return CatalogRecord.load_item()
Typical Error:
2016-04-29 15:35:00 [scrapy] ERROR: Spider error processing <GET http://www.nga.gov/content/ngaweb/Collection/art-object-page.1178.html> (referer: None)
Traceback (most recent call last):
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/offsite.py", line 28, in process_spider_output
for x in result:
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/spidermiddlewares/depth.py", line 54, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/spiders/crawl.py", line 73, in _parse_response
for request_or_item in self._requests_to_follow(response):
File "/usr/lib/pymodules/python2.7/scrapy/spiders/crawl.py", line 51, in _requests_to_follow
for n, rule in enumerate(self._rules):
AttributeError: 'NGASpider' object has no attribute '_rules'
Update in Response to eLRuLL's Solution
Simply removing def __init__ and start_urls allows my spider to crawl my generated urls. However, it also seems to prevent 'def parse_CatalogRecord(self, response)' from being applied. When I run the spider now, it only scrapes pages from outside the range of generated urls. My revised spider and log output follow below.
Spider:
URL = "http://www.nga.gov/content/ngaweb/Collection/art-object-page.%d"
starting_number = 1312
number_of_pages = 1311
class NGASpider(CrawlSpider):
    """Crawl NGA art-object pages and collect records that mention medals.

    Start requests are generated from the module-level ``URL`` template,
    counting down from ``starting_number`` and stopping before
    ``number_of_pages``.  Links extracted by ``rules`` are handed to
    ``parse_CatalogRecord``.
    """

    name = 'ngamedallions'
    allowed_domains = ['nga.gov']
    # The trailing comma matters: without it the outer parentheses are plain
    # grouping and ``rules`` would be a single Rule instead of a tuple of
    # rules.
    rules = (
        Rule(
            LinkExtractor(allow=('art-object-page.*', 'objects/*')),
            callback='parse_CatalogRecord',
            follow=True,
        ),
    )

    def start_requests(self):
        """Yield one request for every generated art-object URL."""
        self.page_number = starting_number
        for i in range(self.page_number, number_of_pages, -1):
            # The response is handed to CrawlSpider.parse, which applies
            # ``rules`` to the links on the page; the start page itself is
            # not passed to parse_CatalogRecord.
            yield Request(url=URL % i + ".html", callback=self.parse)

    def parse_CatalogRecord(self, response):
        """Load a catalog item from *response* when the page mentions a medal.

        The page body is searched case-insensitively for "medal" or
        "medallion"; only matching pages have their fields extracted.
        """
        CatalogRecord = ItemLoader(item=NgamedallionsItem(), response=response)
        CatalogRecord.default_output_processor = TakeFirst()
        # image_urls must stay a list, so it bypasses the TakeFirst default.
        CatalogRecord.image_urls_out = scrapy.loader.processors.Identity()
        keywords = "medal|medallion"
        r = re.compile(
            '.*(%s).*' % keywords,
            re.IGNORECASE | re.MULTILINE | re.UNICODE,
        )
        if r.search(response.body_as_unicode()):
            # XPath attribute tests use "@" (e.g. @class), not "#".
            CatalogRecord.add_xpath(
                'title',
                './/dl[@class="artwork-details"]/dt[@class="title"]/text()',
            )
            CatalogRecord.add_xpath('accession', './/dd[@class="accession"]/text()')
            CatalogRecord.add_xpath('inscription', './/div[@id="inscription"]/p/text()')
            CatalogRecord.add_xpath('image_urls', './/img[@class="mainImg"]/@src')
            # NOTE(review): the original indentation was lost; the return is
            # assumed to belong to the keyword branch, so pages without a
            # match produce no item -- confirm against the original spider.
            return CatalogRecord.load_item()
Log:
2016-05-02 15:50:02 [scrapy] INFO: Scrapy 1.0.5.post4+g4b324a8 started (bot: ngamedallions)
2016-05-02 15:50:02 [scrapy] INFO: Optional features available: ssl, http11
2016-05-02 15:50:02 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'ngamedallions.spiders', 'FEED_URI': 'items.json', 'SPIDER_MODULES': ['ngamedallions.spiders'], 'BOT_NAME': 'ngamedallions', 'FEED_FORMAT': 'json', 'DOWNLOAD_DELAY': 3}
2016-05-02 15:50:02 [scrapy] INFO: Enabled extensions: CloseSpider, FeedExporter, TelnetConsole, LogStats, CoreStats, SpiderState
2016-05-02 15:50:02 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2016-05-02 15:50:02 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2016-05-02 15:50:02 [scrapy] INFO: Enabled item pipelines: ImagesPipeline
2016-05-02 15:50:02 [scrapy] INFO: Spider opened
2016-05-02 15:50:02 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-05-02 15:50:02 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-05-02 15:50:02 [scrapy] DEBUG: Crawled (200) <GET http://www.nga.gov/content/ngaweb/Collection/art-object-page.1312.html> (referer: None)
2016-05-02 15:50:02 [scrapy] DEBUG: Filtered duplicate request: <GET http://www.nga.gov/content/ngaweb/Collection/art-object-page.1312.html> - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates)
2016-05-02 15:50:05 [scrapy] DEBUG: Crawled (200) <GET http://www.nga.gov/content/ngaweb/Collection/art-object-page.1313.html> (referer: http://www.nga.gov/content/ngaweb/Collection/art-object-page.1312.html)
2016-05-02 15:50:05 [scrapy] DEBUG: File (uptodate): Downloaded image from <GET http://media.nga.gov/public/objects/1/3/1/3/1313-primary-0-440x400.jpg> referred in <None>
2016-05-02 15:50:05 [scrapy] DEBUG: Scraped from <200 http://www.nga.gov/content/ngaweb/Collection/art-object-page.1313.html>
{'accession': u'1942.9.163.b',
'image_urls': [u'http://media.nga.gov/public/objects/1/3/1/3/1313-primary-0-440x400.jpg'],
'images': [{'checksum': '9d5f2e30230aeec1582ca087bcde6bfa',
'path': 'full/3a692347183d26ffefe9ba0af80b0b6bf247fae5.jpg',
'url': 'http://media.nga.gov/public/objects/1/3/1/3/1313-primary-0-440x400.jpg'}],
'inscription': u'around top circumference: TRINACRIA IANI; upper center: PELORVS ; across center: PA LI; across bottom: BELAVRA',
'title': u'House between Two Hills [reverse]'}
2016-05-02 15:50:05 [scrapy] INFO: Closing spider (finished)
2016-05-02 15:50:05 [scrapy] INFO: Stored json feed (1 items) in: items.json
2016-05-02 15:50:05 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 631,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 26324,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'dupefilter/filtered': 3,
'file_count': 1,
'file_status_count/uptodate': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 5, 2, 19, 50, 5, 810570),
'item_scraped_count': 1,
'log_count/DEBUG': 6,
'log_count/INFO': 8,
'request_depth_max': 2,
'response_received_count': 2,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2016, 5, 2, 19, 50, 2, 455508)}
2016-05-02 15:50:05 [scrapy] INFO: Spider closed (finished)
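Don't override the __init__ method unless you also call the parent's __init__ via super(): CrawlSpider.__init__ is what builds the _rules attribute, which is why the traceback reports it as missing.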
Now, you don't really need to declare start_urls for your spider to work if you are going to use start_requests.
Just remove your __init__ method; start_urls then does not need to exist.
UPDATE
Ok my mistake, looks like CrawlSpider needs the start_urls attribute, so just create it instead of using the start_requests method:
start_urls = [URL % i + '.html' for i in range (starting_number, number_of_pages, -1)]
and remove start_requests

Scrapy crawl resume does not crawl anything and just finishes

I start a crawl with a CrawlSpider-derived class, and pause it with Ctrl+C. When I execute the command again to resume it, it does not continue.
My start and resume command:
scrapy crawl mycrawler -s JOBDIR=crawls/test5_mycrawl
Scrapy creates the folder. The permissions are 777.
When I resume the crawl, it just outputs:
/home/adminuser/.virtualenvs/rg_harvest/lib/python2.7/site-packages/twisted/internet/_sslverify.py:184: UserWarning: You do not have the service_identity module installed. Please install it from <https://pypi.python.org/pypi/service_identity>. Without the service_identity module and a recent enough pyOpenSSL tosupport it, Twisted can perform only rudimentary TLS client hostnameverification. Many valid certificate/hostname mappings may be rejected.
verifyHostname, VerificationError = _selectVerifyImplementation()
2014-11-21 11:05:10-0500 [scrapy] INFO: Scrapy 0.24.4 started (bot: rg_harvest_scrapy)
2014-11-21 11:05:10-0500 [scrapy] INFO: Optional features available: ssl, http11, django
2014-11-21 11:05:10-0500 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'rg_harvest_scrapy.spiders', 'SPIDER_MODULES': ['rg_harvest_scrapy.spiders'], 'BOT_NAME': 'rg_harvest_scrapy'}
2014-11-21 11:05:10-0500 [scrapy] INFO: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2014-11-21 11:05:10-0500 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2014-11-21 11:05:10-0500 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2014-11-21 11:05:10-0500 [scrapy] INFO: Enabled item pipelines: ValidateMandatory, TypeConversion, ValidateRange, ValidateLogic, RestegourmetImagesPipeline, SaveToDB
2014-11-21 11:05:10-0500 [mycrawler] INFO: Spider opened
2014-11-21 11:05:10-0500 [mycrawler] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2014-11-21 11:05:10-0500 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2014-11-21 11:05:10-0500 [scrapy] DEBUG: Web service listening on 127.0.0.1:6080
2014-11-21 11:05:10-0500 [mycrawler] DEBUG: Crawled (200) <GET http://eatsmarter.de/suche/rezepte> (referer: None)
2014-11-21 11:05:10-0500 [mycrawler] DEBUG: Filtered duplicate request: <GET http://eatsmarter.de/suche/rezepte?page=1> - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates)
2014-11-21 11:05:10-0500 [mycrawler] INFO: Closing spider (finished)
2014-11-21 11:05:10-0500 [mycrawler] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 225,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 19242,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'dupefilter/filtered': 29,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2014, 11, 21, 16, 5, 10, 733196),
'log_count/DEBUG': 4,
'log_count/INFO': 7,
'request_depth_max': 1,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/disk': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/disk': 1,
'start_time': datetime.datetime(2014, 11, 21, 16, 5, 10, 528629)}
I have one start_url. Could this be the reason? My crawler uses one start_url, follows the pagination via a Rule with a LinkExtractor, and calls parse_item for URLs that match a specific format:
My Spider code:
class MyCrawlSpiderBase(CrawlSpider):
    """Base spider that stamps every item with its URL and the crawl start."""

    name = 'test_spider'
    testmode = True
    # Evaluated once, when the class body runs, so every item produced by
    # the process carries the same timestamp.
    crawl_start = datetime.utcnow().isoformat()

    def __init__(self, testmode=True, urls=None, *args, **kwargs):
        """Store the test-mode flag and initialise the CrawlSpider machinery.

        ``testmode`` is passed through int() before bool(), so "0"/"1"
        strings map to False/True.  ``urls`` is accepted but not used in
        this snippet.
        """
        self.testmode = bool(int(testmode))
        super(MyCrawlSpiderBase, self).__init__(*args, **kwargs)

    def parse_item(self, response):
        """Return an item holding the response URL and the crawl start time."""
        # Item Values
        loader = MyItemLoader(RecipeItem(), response=response)
        loader.replace_value('url', response.url)
        loader.replace_value('crawl_start', self.crawl_start)
        return loader.load_item()
class MyCrawlSpider(MyCrawlSpiderBase):
    """Spider for example.de: follows result pagination and parses entries."""

    name = 'example_de'
    allowed_domains = ['example.de']
    start_urls = [
        "http://example.de",
    ]
    rules = (
        # Pagination links: no callback is given for this rule.
        Rule(
            LinkExtractor(
                # Raw string so that "\?" reaches the regex engine unchanged.
                allow=(r'/search/entry\?page=', )
            )
        ),
        # Entry pages: handed to parse_item.
        Rule(
            LinkExtractor(
                # NOTE(review): the range "A-z" also covers the punctuation
                # characters between "Z" and "a" -- confirm this is intended.
                allow=(r'/entry/[0-9A-z\-]{3,}$', ),
            ),
            callback='parse_item'
        ),
    )

    def parse_item(self, response):
        """Extend the base item with fields scraped from the entry page."""
        item = super(MyCrawlSpider, self).parse_item(response)
        loader = MyItemLoader(item=item, response=response)
        # XPath attribute tests use "@" (e.g. @class), not "#".
        loader.replace_xpath("name", "//h1[@class='fn title']/text()")
        # Placeholder kept from the original post: further fields elided.
        (...)
        return loader.load_item()
Since your URL is always the same, the requests are most likely being filtered.
You can solve this in two ways:
In your settings.py file, add this line:
DUPEFILTER_CLASS = 'scrapy.dupefilter.BaseDupeFilter'
This replaces the default RFPDupeFilter with the BaseDupeFilter, which will not filter any requests. This may not be what you want if you do in fact want to filter out some other requests not relevant to this question.
You can get more involved in the process of creating requests, and create them with the parameter dont_filter=True, which will disable filtering on a per-request basis. To achieve this, you could remove the start_urls and replace it with a method start_requests() that would yield the requests for parsing. Check out more info in the official documentation.
If you press Ctrl+C twice (force stop), the crawl cannot be resumed. Press Ctrl+C just once and wait for the spider to shut down.