Scrapy finishing process before all pages are scraped - scrapy

I set up a test Scrapy scraper which looks like this:
import scrapy


class testSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://www.realestate.com.kh/buy/']

    def parse(self, response):
        nr_pages = response.xpath('//div[@class="desktop-buttons"]/a[@class="css-1en2dru"]//text()').getall()

        for nr in range(1, 40):
            req_url = f'?page={nr}'
            self.item = {}
            self.item['page'] = nr
            yield scrapy.Request(url=response.urljoin(req_url), callback=self.parse_page, meta={'item': self.item})

    def parse_page(self, response):
        page = response.meta['item']['page']
        ads = response.xpath('//*[@class="featured css-ineky e1jqslr40"]//a/@href')

        for url in ads:
            absolute_url = response.urljoin(url.extract())
            self.item = {}
            self.item['page'] = page
            yield scrapy.Request(absolute_url, callback=self.parse_ad, meta={'item': self.item})

    def parse_ad(self, response):
        page = response.meta['item']['page']

        # DO THINGS
        yield {
            'page': page
        }
It loads each https://www.realestate.com.kh/buy/?page=NR, where NR is every number between 1 and 40.
On each of these pages, it gets all the ads.
On each ad of each page, it scrapes things and yields them.
It works fine for the first 26 items (the first two pages, plus 2 or 3 items from the 3rd one, out of 40) and then finishes the scraping without an error.
Here are the stats:
{'downloader/request_bytes': 23163,
'downloader/request_count': 66,
'downloader/request_method_count/GET': 66,
'downloader/response_bytes': 3801022,
'downloader/response_count': 66,
'downloader/response_status_count/200': 66,
'elapsed_time_seconds': 5.420036,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 10, 22, 19, 48, 37, 549853),
'item_scraped_count': 26,
'log_count/INFO': 9,
'memusage/max': 49963008,
'memusage/startup': 49963008,
'request_depth_max': 2,
'response_received_count': 66,
'scheduler/dequeued': 66,
'scheduler/dequeued/memory': 66,
'scheduler/enqueued': 66,
'scheduler/enqueued/memory': 66,
'start_time': datetime.datetime(2020, 10, 22, 19, 48, 32, 129817)}
What could be ending the scraping so early?

Your spider is actually going through all the pages; the problem is that in parse_page the selector for ads only works on the earlier pages, because the class name changes on the later ones. The class name seems to be dynamically generated, so you need an XPath that doesn't select by class.
The XPath '//div/header/parent::div' returns the same div element as '//*[@class="featured css-ineky e1jqslr40"]', so replacing this line should allow you to select all ads from all pages:
ads = response.xpath('//div/header/parent::div/article/a/@href')
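For context, here is a minimal sketch of parse_page with just that one line swapped in (everything else kept from the original spider):

def parse_page(self, response):
    page = response.meta['item']['page']
    # select ads by structure (a div that contains a header) rather than by generated class names
    ads = response.xpath('//div/header/parent::div/article/a/@href')

    for url in ads:
        absolute_url = response.urljoin(url.extract())
        self.item = {}
        self.item['page'] = page
        yield scrapy.Request(absolute_url, callback=self.parse_ad, meta={'item': self.item})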
Unrelated note:
This isn't causing any problems yet, but it's a recipe for future problems.
for url in ads:
    absolute_url = response.urljoin(url.extract())
    self.item = {}
    self.item['page'] = page
    yield scrapy.Request(absolute_url, callback=self.parse_ad, meta={'item': self.item})
Scrapy works in an asynchronous way, so most of the time using an instance variable (like self.item) gives the wrong intuition, as you don't really control the order in which the requests are parsed. That's why, when you need to pass information between methods, you use meta (or cb_kwargs) and not just store it in an instance variable.
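As an illustration, a minimal sketch (spider methods, Scrapy 1.7+ for cb_kwargs) of passing the page number through cb_kwargs instead of an instance variable:

def parse(self, response):
    for nr in range(1, 40):
        # each request carries its own copy of the value; nothing is shared on self
        yield scrapy.Request(
            response.urljoin(f'?page={nr}'),
            callback=self.parse_page,
            cb_kwargs={'page': nr},
        )

def parse_page(self, response, page):
    # 'page' arrives as a keyword argument, scoped to this response only
    self.logger.info('parsing page %s of %s', page, response.url)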

You are going about getting the pages the wrong way: there are 50 pages on the site right now, so you should walk through them by following the next-page link instead. Look at this code:
import scrapy
from scrapy.shell import inspect_response


class testSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://www.realestate.com.kh/buy/']

    def parse(self, response):
        page = response.xpath('//div[@class="list"]//div[@class="desktop-buttons"]/a[@class="css-owq2hj"]/span/text()').get()
        ads = response.xpath('//div[@class="list"]/div/header/a/@href').getall()

        for url in ads:
            yield scrapy.Request(response.urljoin(url), callback=self.parse_ad, meta={'page': page})

        # next page
        url = response.xpath('//div[@class="list"]//div[@class="desktop-buttons"]/a[@class="css-owq2hj"]/following-sibling::a[1]/@href').get()
        if url:
            yield scrapy.Request(response.urljoin(url), callback=self.parse)

    def parse_ad(self, response):
        page = response.meta['page']

        # DO THINGS
        yield {
            'page': page
        }

Related

Scrapy Pagination - Works for 2 pages but not after that

I'm crawling the cdw.com website. For a given URL, there are around 17 pages. The script that I have written is able to fetch data from Page 1 and Page 2. The spider closes on its own after giving the results of the first 2 pages. Please let me know how I can fetch data for the remaining 15 pages.
TIA.
import scrapy
from cdwfinal.items import CdwfinalItem
from scrapy.selector import Selector
import datetime
import pandas as pd
import time


class CdwSpider(scrapy.Spider):
    name = 'cdw'
    allowed_domains = ['www.cdw.com']
    start_urls = ['http://www.cdw.com/']
    base_url = 'http://www.cdw.com'

    def start_requests(self):
        yield scrapy.Request(url='https://www.cdw.com/search/?key=axiom', callback=self.parse)

    def parse(self, response):
        item = []
        hxs = Selector(response)
        item = CdwfinalItem()
        abc = hxs.xpath('//*[@id="main"]//*[@class="grid-row"]')
        for i in range(len(abc)):
            try:
                item['mpn'] = hxs.xpath("//div[contains(@class,'search-results')]/div[contains(@class,'search-result')][" + str(i+1) + "]//*[@class='mfg-code']/text()").extract()
            except:
                item['mpn'] = 'NA'
            try:
                item['part_no'] = hxs.xpath("//div[contains(@class,'search-results')]/div[contains(@class,'search-result')][" + str(i+1) + "]//*[@class='cdw-code']/text()").extract()
            except:
                item['part_no'] = 'NA'
            yield item

        next_page = hxs.xpath('//*[@id="main"]//*[@class="no-hover" and @aria-label="Next Page"]').extract()
        if next_page:
            new_page_href = hxs.xpath('//*[@id="main"]//*[@class="no-hover" and @aria-label="Next Page"]/@href').extract_first()
            new_page_url = response.urljoin(new_page_href)
            yield scrapy.Request(new_page_url, callback=self.parse, meta={"searchword": '123'})
LOG:
2023-02-11 15:39:55 [scrapy_user_agents.middlewares] DEBUG: Assigned User-Agent Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36
2023-02-11 15:39:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.cdw.com/search/?key=axiom&pcurrent=3> (referer: https://www.cdw.com/search/?key=axiom&pcurrent=2) ['cached']
2023-02-11 15:39:55 [scrapy.core.engine] INFO: Closing spider (finished)
2023-02-11 15:39:55 [scrapy.extensions.feedexport] INFO: Stored csv feed (48 items) in: Test5.csv
2023-02-11 15:39:55 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 2178,
'downloader/request_count': 3,
'downloader/request_method_count/GET': 3,
'downloader/response_bytes': 68059,
'downloader/response_count': 3,
'downloader/response_status_count/200': 3,
'elapsed_time_seconds': 1.30903,
'feedexport/success_count/FileFeedStorage': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2023, 2, 11, 10, 9, 55, 327740),
'httpcache/hit': 3,
'httpcompression/response_bytes': 384267,
'httpcompression/response_count': 3,
'item_scraped_count': 48,
'log_count/DEBUG': 62,
'log_count/INFO': 11,
'log_count/WARNING': 45,
'request_depth_max': 2,
'response_received_count': 3,
'scheduler/dequeued': 3,
'scheduler/dequeued/memory': 3,
'scheduler/enqueued': 3,
'scheduler/enqueued/memory': 3,
'start_time': datetime.datetime(2023, 2, 11, 10, 9, 54, 18710)}
Your next_page selector is failing to extract the information for the next page. In general your selectors are more complicated than they need to be; for example, you should be using relative XPath expressions in your for loop.
Here is an example that replicates the same behaviour as your spider, except that it uses much simpler selectors and successfully extracts the results from all of the pages.
import scrapy


class CdwSpider(scrapy.Spider):
    name = 'cdw'
    allowed_domains = ['www.cdw.com']
    custom_settings = {
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
    }

    def start_requests(self):
        yield scrapy.Request(url='https://www.cdw.com/search/?key=axiom', callback=self.parse)

    def parse(self, response):
        for row in response.xpath('//div[@class="grid-row"]'):
            mpn = row.xpath(".//span[@class='mfg-code']/text()").get()
            cdw = row.xpath('.//span[@class="cdw-code"]/text()').get()
            yield {"mpn": mpn, "part_no": cdw}

        current = response.css("div.search-pagination-active")
        next_page = current.xpath('./following-sibling::a/@href').get()
        if next_page:
            new_page_url = response.urljoin(next_page)
            yield scrapy.Request(new_page_url, callback=self.parse)
EDIT
The only non-default setting I am using is the user agent.
I have made adjustments in the example above to reflect that.
Partial output:
2023-02-11 22:10:58 [scrapy.core.engine] INFO: Closing spider (finished)
2023-02-11 22:10:58 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 106555,
'downloader/request_count': 41,
'downloader/request_method_count/GET': 41,
'downloader/response_bytes': 1099256,
'downloader/response_count': 41,
'downloader/response_status_count/200': 41,
'elapsed_time_seconds': 22.968986,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2023, 2, 12, 6, 10, 58, 700080),
'httpcompression/response_bytes': 7962149,
'httpcompression/response_count': 41,
'item_scraped_count': 984,
'log_count/DEBUG': 1028,
'log_count/INFO': 10,
'request_depth_max': 40,
'response_received_count': 41,
'scheduler/dequeued': 41,
'scheduler/dequeued/memory': 41,
'scheduler/enqueued': 41,
'scheduler/enqueued/memory': 41,
'start_time': datetime.datetime(2023, 2, 12, 6, 10, 35, 731094)}
2023-02-11 22:10:58 [scrapy.core.engine] INFO: Spider closed (finished)

How to store all scraped stats moment before spider closes?

I want to store all the stats collected by the spider in a single output file in JSON format. However, I get this error:
'MemoryStatsCollector' object has no attribute 'get_all'
The documentation mentions that stats.get_all is how you get all the stats. What is the correct way to implement this?
import scrapy
from scrapy import signals
from scrapy import crawler
import jsonlines


class TestSpider(scrapy.Spider):
    name = 'stats'
    start_urls = ['http://quotes.toscrape.com']

    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # spider = super(TestSpider, cls).from_crawler(crawler, *args, **kwargs)
        stat = cls(crawler.stats)
        crawler.signals.connect(stat.spider_closed, signals.spider_closed)
        return stat

    def spider_closed(self):
        # self.stats = stat
        txt_file = 'some_text.jl'
        with jsonlines.open(txt_file, 'w') as f:
            f.write(self.stats.get_all())

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse
            )

    def parse(self, response):
        content = response.xpath('//div[@class = "row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get()
            }
It turns out there is no get_all method; instead I had to use get_stats(). The documentation provides examples of a few, such as:
stats.get_value()
stats.get_stats()
stats.max_value()/stats.min_value()
stats.inc_value()
stats.set_value()
Some further information is provided in the documentation for stats.
The working part:
def spider_closed(self):
    # self.stats = stat
    txt_file = 'some_text.jl'
    with jsonlines.open(txt_file, 'w') as f:
        # f.write(f'{self.stats.get_all()}') --- Changed
        f.write(f'{self.stats.get_stats()}')
Output:
{
"log_count/INFO": 10,
"log_count/DEBUG": 3,
"start_time": datetime.datetime(2022, 7, 6, 16, 16, 30, 553373),
"memusage/startup": 59895808,
"memusage/max": 59895808,
"scheduler/enqueued/memory": 1,
"scheduler/enqueued": 1,
"scheduler/dequeued/memory": 1,
"scheduler/dequeued": 1,
"downloader/request_count": 1,
"downloader/request_method_count/GET": 1,
"downloader/request_bytes": 223,
"downloader/response_count": 1,
"downloader/response_status_count/200": 1,
"downloader/response_bytes": 2086,
"httpcompression/response_bytes": 11053,
"httpcompression/response_count": 1,
"response_received_count": 1,
"item_scraped_count": 1,
"elapsed_time_seconds": 0.34008,
"finish_time": datetime.datetime(2022, 7, 6, 16, 16, 30, 893453),
"finish_reason": "finished",
}
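For reference, a minimal sketch of how a few of the other methods listed above might be used from inside a spider callback (this assumes access to the stats collector via self.crawler.stats; the custom/* key names are just illustrative):

def parse(self, response):
    stats = self.crawler.stats
    # read a single entry (returns None if the key does not exist)
    started = stats.get_value('start_time')
    # set and increment custom entries
    stats.set_value('custom/last_url', response.url)
    stats.inc_value('custom/pages_seen')
    # keep a running maximum across responses
    stats.max_value('custom/largest_response', len(response.body))
    self.logger.info('crawl started at %s', started)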

Scrapy Crawlspider is only crawling the first 5 pages in a category

Currently, my CrawlSpider will only crawl roughly 20,000 products of the over 6.5M available. It seems that each category is being scraped, but only the first 5 pages of each category. I believe it is something with my LinkExtractor, but I am not sure.
CrawlSpider:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector


class DigikeyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    partnumber = scrapy.Field()
    manufacturer = scrapy.Field()
    description = scrapy.Field()
    quanity = scrapy.Field()
    minimumquanity = scrapy.Field()
    price = scrapy.Field()


class DigikeySpider(CrawlSpider):
    name = 'digikey'
    allowed_domains = ['digikey.com']
    start_urls = ['https://www.digikey.com/products/en']
    rules = (
        Rule(LinkExtractor(allow=('products', )), callback='parse_item'),
    )

    def parse_item(self, response):
        for row in response.css('table#productTable tbody tr'):
            item = DigikeyItem()
            item['partnumber'] = row.css('.tr-mfgPartNumber [itemprop="name"]::text').extract_first()
            item['manufacturer'] = row.css('[itemprop="manufacture"] [itemprop="name"]::text').extract_first()
            item['description'] = row.css('.tr-description::text').extract_first()
            item['quanity'] = row.css('.tr-qtyAvailable::text').extract_first()
            item['price'] = row.css('.tr-unitPrice::text').extract_first()
            item['minimumquanity'] = row.css('.tr-minQty::text').extract_first()
            yield item
Settings:
BOT_NAME = 'digikey'
SPIDER_MODULES = ['digikey.spiders']
NEWSPIDER_MODULE = 'digikey.spiders'
ROBOTSTXT_OBEY = False
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
}
Output:
2017-11-01 10:53:11 [scrapy.core.engine] INFO: Closing spider (finished)
2017-11-01 10:53:11 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 6,
'downloader/exception_type_count/twisted.web._newclient.ResponseNeverReceived': 6,
'downloader/request_bytes': 1198612,
'downloader/request_count': 988,
'downloader/request_method_count/GET': 988,
'downloader/response_bytes': 23932614,
'downloader/response_count': 982,
'downloader/response_status_count/200': 982,
'dupefilter/filtered': 46,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 11, 1, 17, 53, 11, 421641),
'item_scraped_count': 21783,
'log_count/DEBUG': 22773,
'log_count/ERROR': 2,
'log_count/INFO': 10,
'request_depth_max': 1,
'response_received_count': 982,
'retry/count': 4,
'retry/max_reached': 2,
'retry/reason_count/twisted.web._newclient.ResponseNeverReceived': 4,
'scheduler/dequeued': 988,
'scheduler/dequeued/memory': 988,
'scheduler/enqueued': 988,
'scheduler/enqueued/memory': 988,
'start_time': datetime.datetime(2017, 11, 1, 17, 49, 38, 427669)}
2017-11-01 10:53:11 [scrapy.core.engine] INFO: Spider closed (finished)
PS C:\Users\dalla_000\digikey>
With this particular site, it might make sense to do a two-stage crawl:
get a list of all the category URLs
iterate over all the URLs
One approach could be to use two spiders and a message queue. The first spider might look like this:
import scrapy
from bs4 import BeautifulSoup
import re
import math
import urllib

from kafka import KafkaClient, SimpleProducer

ITEMS_PER_PAGE = 500


class CreateXxxxxxxUrlListSpider(scrapy.Spider):
    kafka = KafkaClient('10.0.1.12:9092')
    producer = SimpleProducer(kafka)

    name = "create_xxxxxxx_url_list"
    allowed_domains = ["xxxxxxx.com"]
    start_urls = [
        "http://www.xxxxxxx.com/product-search/en?stock=1"
    ]

    def parse(self, response):
        soup = BeautifulSoup(response.body)
        catfilterlinks = soup.find_all('a', {'class': 'catfilterlink'})
        for catfilterlink in catfilterlinks:
            location = catfilterlink['href'].split("?")[0]
            items = re.match(".*\(([0-9]+) items\).*", catfilterlink.next_sibling).group(1)
            for page in range(int(math.ceil(float(items) / ITEMS_PER_PAGE))):
                if page == 0:
                    url = "http://www.xxxxxxx.com" + location + "?" + urllib.urlencode({"stock": 1})
                    self.producer.send_messages("xxxxxxx_search_page_urls", url)
                else:
                    url = "http://www.xxxxxxx.com" + location + "/page/" + str(page + 1) + "?" + urllib.urlencode({"stock": 1})
                    self.producer.send_messages("xxxxxxx_search_page_urls", url)
The first spider gets a list of all the pages to crawl and then writes them to a message queue (e.g. Kafka).
The second spider would consume URLs from the Kafka topic and crawl them. It might look something like this:
from scrapy_kafka.spiders import ListeningKafkaSpider
from ..items import PageHtml
from calendar import timegm
import time


class CrawlXxxxxxxUrlsSpider(ListeningKafkaSpider):
    name = 'crawl_xxxxxxx_urls_spider'
    allowed_domains = ["xxxxxxx.com"]
    topic = "xxxxxxx_search_page_urls"

    def parse(self, response):
        item = PageHtml()
        item['url'] = response.url
        item['html'] = response.body_as_unicode()
        item['ts'] = timegm(time.gmtime())
        return item
        # .... or whatever

why the scrapy-plugins/scrapy-jsonrpc can't get the spider's stats

I just want to monitor my running spider's stats. I got the latest scrapy-plugins/scrapy-jsonrpc and configured the spider as follows:
EXTENSIONS = {
    'scrapy_jsonrpc.webservice.WebService': 500,
}
JSONRPC_ENABLED = True
JSONRPC_PORT = [60853]
but when I browse http://localhost:60853/, it just returns
{"resources": ["crawler"]}
and I can only get the running spiders' names, not the stats.
Can anyone tell me what I set up wrong? Thanks!
http://localhost:60853/ returns the resources available, /crawler being the only top-level one.
If you want to get stats for a spider, you'll need to query the /crawler/stats endpoint and call get_stats().
Here's an example using python-jsonrpc: (here I configured the webservice to listen on localhost and port 6024)
>>> import pyjsonrpc
>>> http_client = pyjsonrpc.HttpClient('http://localhost:6024/crawler/stats')
>>> http_client.call('get_stats', 'httpbin')
{u'log_count/DEBUG': 4, u'scheduler/dequeued': 4, u'log_count/INFO': 9, u'downloader/response_count': 2, u'downloader/response_status_count/200': 2, u'log_count/WARNING': 1, u'scheduler/enqueued/memory': 4, u'downloader/response_bytes': 639, u'start_time': u'2016-09-28 08:49:57', u'scheduler/dequeued/memory': 4, u'scheduler/enqueued': 4, u'downloader/request_bytes': 862, u'response_received_count': 2, u'downloader/request_method_count/GET': 4, u'downloader/request_count': 4}
>>> http_client.call('get_stats')
{u'log_count/DEBUG': 4, u'scheduler/dequeued': 4, u'log_count/INFO': 9, u'downloader/response_count': 2, u'downloader/response_status_count/200': 2, u'log_count/WARNING': 1, u'scheduler/enqueued/memory': 4, u'downloader/response_bytes': 639, u'start_time': u'2016-09-28 08:49:57', u'scheduler/dequeued/memory': 4, u'scheduler/enqueued': 4, u'downloader/request_bytes': 862, u'response_received_count': 2, u'downloader/request_method_count/GET': 4, u'downloader/request_count': 4}
>>> from pprint import pprint
>>> pprint(http_client.call('get_stats'))
{u'downloader/request_bytes': 862,
u'downloader/request_count': 4,
u'downloader/request_method_count/GET': 4,
u'downloader/response_bytes': 639,
u'downloader/response_count': 2,
u'downloader/response_status_count/200': 2,
u'log_count/DEBUG': 4,
u'log_count/INFO': 9,
u'log_count/WARNING': 1,
u'response_received_count': 2,
u'scheduler/dequeued': 4,
u'scheduler/dequeued/memory': 4,
u'scheduler/enqueued': 4,
u'scheduler/enqueued/memory': 4,
u'start_time': u'2016-09-28 08:49:57'}
>>>
You can also use jsonrpc_client_call from scrapy_jsonrpc.jsonrpc.
>>> from scrapy_jsonrpc.jsonrpc import jsonrpc_client_call
>>> jsonrpc_client_call('http://localhost:6024/crawler/stats', 'get_stats', 'httpbin')
{u'log_count/DEBUG': 5, u'scheduler/dequeued': 4, u'log_count/INFO': 11, u'downloader/response_count': 3, u'downloader/response_status_count/200': 3, u'log_count/WARNING': 1, u'scheduler/enqueued/memory': 4, u'downloader/response_bytes': 870, u'start_time': u'2016-09-28 09:01:47', u'scheduler/dequeued/memory': 4, u'scheduler/enqueued': 4, u'downloader/request_bytes': 862, u'response_received_count': 3, u'downloader/request_method_count/GET': 4, u'downloader/request_count': 4}
This is what you get "on the wire" for a request made with a modified example-client.py (see code a bit below, the example in https://github.com/scrapy-plugins/scrapy-jsonrpc is outdated as I write these lines):
POST /crawler/stats HTTP/1.1
Accept-Encoding: identity
Content-Length: 73
Host: localhost:6024
Content-Type: application/x-www-form-urlencoded
Connection: close
User-Agent: Python-urllib/2.7
{"params": ["httpbin"], "jsonrpc": "2.0", "method": "get_stats", "id": 1}
And the response
HTTP/1.1 200 OK
Content-Length: 504
Access-Control-Allow-Headers: X-Requested-With
Server: TwistedWeb/16.4.1
Connection: close
Date: Tue, 27 Sep 2016 11:21:43 GMT
Access-Control-Allow-Origin: *
Access-Control-Allow-Methods: GET, POST, PATCH, PUT, DELETE
Content-Type: application/json
{"jsonrpc": "2.0", "result": {"log_count/DEBUG": 5, "scheduler/dequeued": 4, "log_count/INFO": 11, "downloader/response_count": 3, "downloader/response_status_count/200": 3, "log_count/WARNING": 3, "scheduler/enqueued/memory": 4, "downloader/response_bytes": 870, "start_time": "2016-09-27 11:16:25", "scheduler/dequeued/memory": 4, "scheduler/enqueued": 4, "downloader/request_bytes": 862, "response_received_count": 3, "downloader/request_method_count/GET": 4, "downloader/request_count": 4}, "id": 1}
Here's the modified client to query /crawler/stats, which I called with ./example-client.py -H localhost -P 6024 get-spider-stats httpbin (for a running "httpbin" spider, JSONRPC_PORT being 6024 for me)
#!/usr/bin/env python
"""
Example script to control a Scrapy server using its JSON-RPC web service.

It only provides a reduced functionality as its main purpose is to illustrate
how to write a web service client. Feel free to improve or write you own.

Also, keep in mind that the JSON-RPC API is not stable. The recommended way for
controlling a Scrapy server is through the execution queue (see the "queue"
command).
"""

from __future__ import print_function
import sys, optparse, urllib, json
from six.moves.urllib.parse import urljoin

from scrapy_jsonrpc.jsonrpc import jsonrpc_client_call, JsonRpcError


def get_commands():
    return {
        'help': cmd_help,
        'stop': cmd_stop,
        'list-available': cmd_list_available,
        'list-running': cmd_list_running,
        'list-resources': cmd_list_resources,
        'get-global-stats': cmd_get_global_stats,
        'get-spider-stats': cmd_get_spider_stats,
    }


def cmd_help(args, opts):
    """help - list available commands"""
    print("Available commands:")
    for _, func in sorted(get_commands().items()):
        print("  ", func.__doc__)


def cmd_stop(args, opts):
    """stop <spider> - stop a running spider"""
    jsonrpc_call(opts, 'crawler/engine', 'close_spider', args[0])


def cmd_list_running(args, opts):
    """list-running - list running spiders"""
    for x in json_get(opts, 'crawler/engine/open_spiders'):
        print(x)


def cmd_list_available(args, opts):
    """list-available - list name of available spiders"""
    for x in jsonrpc_call(opts, 'crawler/spiders', 'list'):
        print(x)


def cmd_list_resources(args, opts):
    """list-resources - list available web service resources"""
    for x in json_get(opts, '')['resources']:
        print(x)


def cmd_get_spider_stats(args, opts):
    """get-spider-stats <spider> - get stats of a running spider"""
    stats = jsonrpc_call(opts, 'crawler/stats', 'get_stats', args[0])
    for name, value in stats.items():
        print("%-40s %s" % (name, value))


def cmd_get_global_stats(args, opts):
    """get-global-stats - get global stats"""
    stats = jsonrpc_call(opts, 'crawler/stats', 'get_stats')
    for name, value in stats.items():
        print("%-40s %s" % (name, value))


def get_wsurl(opts, path):
    return urljoin("http://%s:%s/" % (opts.host, opts.port), path)


def jsonrpc_call(opts, path, method, *args, **kwargs):
    url = get_wsurl(opts, path)
    return jsonrpc_client_call(url, method, *args, **kwargs)


def json_get(opts, path):
    url = get_wsurl(opts, path)
    return json.loads(urllib.urlopen(url).read())


def parse_opts():
    usage = "%prog [options] <command> [arg] ..."
    description = "Scrapy web service control script. Use '%prog help' " \
                  "to see the list of available commands."
    op = optparse.OptionParser(usage=usage, description=description)
    op.add_option("-H", dest="host", default="localhost", \
        help="Scrapy host to connect to")
    op.add_option("-P", dest="port", type="int", default=6080, \
        help="Scrapy port to connect to")
    opts, args = op.parse_args()
    if not args:
        op.print_help()
        sys.exit(2)
    cmdname, cmdargs, opts = args[0], args[1:], opts
    commands = get_commands()
    if cmdname not in commands:
        sys.stderr.write("Unknown command: %s\n\n" % cmdname)
        cmd_help(None, None)
        sys.exit(1)
    return commands[cmdname], cmdargs, opts


def main():
    cmd, args, opts = parse_opts()
    try:
        cmd(args, opts)
    except IndexError:
        print(cmd.__doc__)
    except JsonRpcError as e:
        print(str(e))
        if e.data:
            print("Server Traceback below:")
            print(e.data)


if __name__ == '__main__':
    main()
In the example command above, I got this:
log_count/DEBUG 5
scheduler/dequeued 4
log_count/INFO 11
downloader/response_count 3
downloader/response_status_count/200 3
log_count/WARNING 3
scheduler/enqueued/memory 4
downloader/response_bytes 870
start_time 2016-09-27 11:16:25
scheduler/dequeued/memory 4
scheduler/enqueued 4
downloader/request_bytes 862
response_received_count 3
downloader/request_method_count/GET 4
downloader/request_count 4

Why does my "scrapy" not scrape anything?

I don't know where the issue lies; it's probably super easy to fix since I am new to Scrapy. I hope to find a solution. Thanks in advance.
I am using Ubuntu 14.04, Python 3.4
My Spider:
class EnActressSpider(scrapy.Spider):
    name = "en_name"
    allowed_domains = ["www.r18.com/", "r18.com/"]
    start_urls = ["http://www.r18.com/videos/vod/movies/actress/letter=a/sort=popular/page=1",]

    def parse(self, response):
        for sel in response.xpath('//*[@id="contents"]/div[2]/section/div[3]/ul/li'):
            item = En_Actress()
            item['image_urls'] = sel.xpath('a/p/img/@src').extract()
            name_link = sel.xpath('a/@href').extract()
            request = scrapy.Request(name_link, callback=self.parse_item, dont_filter=True)
            request.meta['item'] = item
            yield request

        next_page = response.css("#contents > div.main > section > div.cmn-sec-item01.pb00 > div > ol > li.next > a::attr('href')")
        if next_page:
            url = response.urljoin(next_page[0].extract())
            yield scrapy.Request(url, self.parse, dont_filter=True)

    def parse_item(self, response):
        item = reponse.meta['item']
        name = response.xpath('//*[@id="contents"]/div[1]/ul/li[5]/span/text()')
        item['name'] = name[0].encode('utf-8')
        yield item
LOG:
{'downloader/request_bytes': 988,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 48547,
'downloader/response_count': 2,
'downloader/response_status_count/200': 1,
'downloader/response_status_count/301': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 7, 25, 6, 46, 36, 940936),
'log_count/DEBUG': 1,
'log_count/INFO': 1,
'response_received_count': 1,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'spider_exceptions/TypeError': 1,
'start_time': datetime.datetime(2016, 7, 25, 6, 46, 35, 908281)}
Any help is greatly appreciated.
There seem to be a few syntax errors. I've cleaned them up and it seems to be working fine here.
Another edit I made was removing the dont_filter parameter from the Request objects, since you don't want to scrape duplicates. I also adjusted allowed_domains, since it was filtering out some content.
In the future you should post the whole log.
import scrapy


class EnActressSpider(scrapy.Spider):
    name = "en_name"
    allowed_domains = ["r18.com"]
    start_urls = ["http://www.r18.com/videos/vod/movies/actress/letter=a/sort=popular/page=1", ]

    def parse(self, response):
        for sel in response.xpath('//*[@id="contents"]/div[2]/section/div[3]/ul/li'):
            item = dict()
            item['image_urls'] = sel.xpath('a/p/img/@src').extract()
            name_link = sel.xpath('a/@href').extract_first()
            request = scrapy.Request(name_link, callback=self.parse_item)
            request.meta['item'] = item
            yield request

        next_page = response.css(
            "#contents > div.main > section > div.cmn-sec-item01.pb00 > "
            "div > ol > li.next > a::attr('href')").extract_first()
        if next_page:
            url = response.urljoin(next_page)
            yield scrapy.Request(url, self.parse)

    def parse_item(self, response):
        item = response.meta['item']
        name = response.xpath('//*[@id="contents"]/div[1]/ul/li[5]/span/text()').extract_first()
        item['name'] = name.encode('utf-8')
        yield item