Why does my "scrapy" not scrape anything? - scrapy

I don't know where the issue lies; it's probably super easy to fix since I am new to Scrapy. I hope to find a solution. Thanks in advance.
I am using Ubuntu 14.04 and Python 3.4.
My Spider:
```
class EnActressSpider(scrapy.Spider):
    name = "en_name"
    allowed_domains = ["www.r18.com/", "r18.com/"]
    start_urls = ["http://www.r18.com/videos/vod/movies/actress/letter=a/sort=popular/page=1",]

    def parse(self, response):
        for sel in response.xpath('//*[@id="contents"]/div[2]/section/div[3]/ul/li'):
            item = En_Actress()
            item['image_urls'] = sel.xpath('a/p/img/@src').extract()
            name_link = sel.xpath('a/@href').extract()
            request = scrapy.Request(name_link, callback = self.parse_item, dont_filter=True)
            request.meta['item'] = item
            yield request

        next_page = response.css("#contents > div.main > section > div.cmn-sec-item01.pb00 > div > ol > li.next > a::attr('href')")
        if next_page:
            url = response.urljoin(next_page[0].extract())
            yield scrapy.Request(url, self.parse, dont_filter=True)

    def parse_item(self, response):
        item = reponse.meta['item']
        name = response.xpath('//*[@id="contents"]/div[1]/ul/li[5]/span/text()')
        item['name'] = name[0].encode('utf-8')
        yield item
```
LOG:
```
{'downloader/request_bytes': 988,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 48547,
'downloader/response_count': 2,
'downloader/response_status_count/200': 1,
'downloader/response_status_count/301': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 7, 25, 6, 46, 36, 940936),
'log_count/DEBUG': 1,
'log_count/INFO': 1,
'response_received_count': 1,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'spider_exceptions/TypeError': 1,
'start_time': datetime.datetime(2016, 7, 25, 6, 46, 35, 908281)}
```
Any help is greatly appreciated.

There seem to be a few syntax errors. I've cleaned it up and it seems to be working fine here.
Another edit I made is removing the dont_filter parameter from the Request objects, since you don't want to scrape duplicates. I also adjusted allowed_domains, since it was filtering out some content.
In the future you should post the whole log.
import scrapy


class EnActressSpider(scrapy.Spider):
    name = "en_name"
    allowed_domains = ["r18.com"]
    start_urls = ["http://www.r18.com/videos/vod/movies/actress/letter=a/sort=popular/page=1", ]

    def parse(self, response):
        for sel in response.xpath('//*[@id="contents"]/div[2]/section/div[3]/ul/li'):
            item = dict()
            item['image_urls'] = sel.xpath('a/p/img/@src').extract()
            name_link = sel.xpath('a/@href').extract_first()
            request = scrapy.Request(name_link, callback=self.parse_item)
            request.meta['item'] = item
            yield request

        next_page = response.css(
            "#contents > div.main > section > div.cmn-sec-item01.pb00 > "
            "div > ol > li.next > a::attr('href')").extract_first()
        if next_page:
            url = response.urljoin(next_page)
            yield scrapy.Request(url, self.parse)

    def parse_item(self, response):
        item = response.meta['item']
        name = response.xpath('//*[@id="contents"]/div[1]/ul/li[5]/span/text()').extract_first()
        item['name'] = name.encode('utf-8')
        yield item

Related

Scrapy Pagination - Works for 2 pages but not after that

I'm crawling the cdw.com website. For a given URL, there are around 17 pages. The script that I have written is able to fetch data from page 1 and page 2. The spider closes on its own after giving the results of the first 2 pages. Please let me know how I can fetch data for the remaining 15 pages.
TIA.
import scrapy
from cdwfinal.items import CdwfinalItem
from scrapy.selector import Selector
import datetime
import pandas as pd
import time


class CdwSpider(scrapy.Spider):
    name = 'cdw'
    allowed_domains = ['www.cdw.com']
    start_urls = ['http://www.cdw.com/']
    base_url = 'http://www.cdw.com'

    def start_requests(self):
        yield scrapy.Request(url='https://www.cdw.com/search/?key=axiom', callback=self.parse)

    def parse(self, response):
        item = []
        hxs = Selector(response)
        item = CdwfinalItem()
        abc = hxs.xpath('//*[@id="main"]//*[@class="grid-row"]')
        for i in range(len(abc)):
            try:
                item['mpn'] = hxs.xpath("//div[contains(@class,'search-results')]/div[contains(@class,'search-result')][" + str(i+1) + "]//*[@class='mfg-code']/text()").extract()
            except:
                item['mpn'] = 'NA'
            try:
                item['part_no'] = hxs.xpath("//div[contains(@class,'search-results')]/div[contains(@class,'search-result')][" + str(i+1) + "]//*[@class='cdw-code']/text()").extract()
            except:
                item['part_no'] = 'NA'
            yield item

        next_page = hxs.xpath('//*[@id="main"]//*[@class="no-hover" and @aria-label="Next Page"]').extract()
        if next_page:
            new_page_href = hxs.xpath('//*[@id="main"]//*[@class="no-hover" and @aria-label="Next Page"]/@href').extract_first()
            new_page_url = response.urljoin(new_page_href)
            yield scrapy.Request(new_page_url, callback=self.parse, meta={"searchword": '123'})
LOG:
2023-02-11 15:39:55 [scrapy_user_agents.middlewares] DEBUG: Assigned User-Agent Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36
2023-02-11 15:39:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.cdw.com/search/?key=axiom&pcurrent=3> (referer: https://www.cdw.com/search/?key=axiom&pcurrent=2) ['cached']
2023-02-11 15:39:55 [scrapy.core.engine] INFO: Closing spider (finished)
2023-02-11 15:39:55 [scrapy.extensions.feedexport] INFO: Stored csv feed (48 items) in: Test5.csv
2023-02-11 15:39:55 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 2178,
'downloader/request_count': 3,
'downloader/request_method_count/GET': 3,
'downloader/response_bytes': 68059,
'downloader/response_count': 3,
'downloader/response_status_count/200': 3,
'elapsed_time_seconds': 1.30903,
'feedexport/success_count/FileFeedStorage': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2023, 2, 11, 10, 9, 55, 327740),
'httpcache/hit': 3,
'httpcompression/response_bytes': 384267,
'httpcompression/response_count': 3,
'item_scraped_count': 48,
'log_count/DEBUG': 62,
'log_count/INFO': 11,
'log_count/WARNING': 45,
'request_depth_max': 2,
'response_received_count': 3,
'scheduler/dequeued': 3,
'scheduler/dequeued/memory': 3,
'scheduler/enqueued': 3,
'scheduler/enqueued/memory': 3,
'start_time': datetime.datetime(2023, 2, 11, 10, 9, 54, 18710)}
Your next_page selector is failing to extract the information for the next page. In general your selectors are more complicated than they need to be; for example, you should be using relative XPath expressions in your for loop.
Here is an example that replicates the same behaviour as your spider, except it uses much simpler selectors and successfully extracts the results from all of the pages.
import scrapy


class CdwSpider(scrapy.Spider):
    name = 'cdw'
    allowed_domains = ['www.cdw.com']
    custom_settings = {
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
    }

    def start_requests(self):
        yield scrapy.Request(url='https://www.cdw.com/search/?key=axiom', callback=self.parse)

    def parse(self, response):
        for row in response.xpath('//div[@class="grid-row"]'):
            mpn = row.xpath(".//span[@class='mfg-code']/text()").get()
            cdw = row.xpath('.//span[@class="cdw-code"]/text()').get()
            yield {"mpn": mpn, "part_no": cdw}

        current = response.css("div.search-pagination-active")
        next_page = current.xpath('./following-sibling::a/@href').get()
        if next_page:
            new_page_url = response.urljoin(next_page)
            yield scrapy.Request(new_page_url, callback=self.parse)
EDIT
The only non-default setting I am using is the user agent.
I have made adjustments in the example above to reflect that.
Partial output:
2023-02-11 22:10:58 [scrapy.core.engine] INFO: Closing spider (finished)
2023-02-11 22:10:58 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 106555,
'downloader/request_count': 41,
'downloader/request_method_count/GET': 41,
'downloader/response_bytes': 1099256,
'downloader/response_count': 41,
'downloader/response_status_count/200': 41,
'elapsed_time_seconds': 22.968986,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2023, 2, 12, 6, 10, 58, 700080),
'httpcompression/response_bytes': 7962149,
'httpcompression/response_count': 41,
'item_scraped_count': 984,
'log_count/DEBUG': 1028,
'log_count/INFO': 10,
'request_depth_max': 40,
'response_received_count': 41,
'scheduler/dequeued': 41,
'scheduler/dequeued/memory': 41,
'scheduler/enqueued': 41,
'scheduler/enqueued/memory': 41,
'start_time': datetime.datetime(2023, 2, 12, 6, 10, 35, 731094)}
2023-02-11 22:10:58 [scrapy.core.engine] INFO: Spider closed (finished)

Scrapy not returning all items from pagination

I want to scrape all monitor items from the site https://www.startech.com.bd. But the problem arises when I run my spider: it returns only 60 results.
Here is my code, which doesn't work right:
import scrapy
import time


class StartechSpider(scrapy.Spider):
    name = 'startech'
    allowed_domains = ['startech.com.bd']
    start_urls = ['https://www.startech.com.bd/monitor/']

    def parse(self, response):
        monitors = response.xpath("//div[@class='p-item']")
        for monitor in monitors:
            item = monitor.xpath(".//h4[@class = 'p-item-name']/a/text()").get()
            price = monitor.xpath(".//div[@class = 'p-item-price']/span/text()").get()
            yield {
                'item': item,
                'price': price
            }

        next_page = response.xpath("//ul[@class = 'pagination']/li/a/@href").get()
        print(next_page)
        if next_page:
            yield response.follow(next_page, callback=self.parse)
Any help is much appreciated!
//ul[@class = 'pagination']/li/a/@href selects 10 items/pages at once, but you have to select only the unique next page. The following XPath expression grabs the right pagination.
Code:
next_page = response.xpath("//a[contains(text(), 'NEXT')]/@href").get()
print(next_page)
if next_page:
    yield response.follow(next_page, callback=self.parse)
Output:
2022-11-26 01:45:06 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.startech.com.bd/monitor?page=19> (referer: https://www.startech.com.bd/monitor?page=18)
2022-11-26 01:45:06 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.startech.com.bd/monitor?page=19>
{'item': 'HP E27q G4 27 Inch 2K QHD IPS Monitor', 'price': '41,000৳'}
None
2022-11-26 01:45:06 [scrapy.core.engine] INFO: Closing spider (finished)
2022-11-26 01:45:06 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 6702,
'downloader/request_count': 19,
'downloader/request_method_count/GET': 19,
'downloader/response_bytes': 546195,
'downloader/response_count': 19,
'downloader/response_status_count/200': 19,
'elapsed_time_seconds': 9.939978,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 11, 25, 19, 45, 6, 915772),
'httpcompression/response_bytes': 6200506,
'httpcompression/response_count': 19,
'item_scraped_count': 361,

How to store all scraped stats a moment before the spider closes?

I want to store all the stats collected from the spider into a single output file in JSON format. However, I get this error:
'MemoryStatsCollector' object has no attribute 'get_all'
The documentation mentions that stats.get_all is how you get all the stats. What is the correct method of implementation for this?
import scrapy
from scrapy import signals
from scrapy import crawler
import jsonlines


class TestSpider(scrapy.Spider):
    name = 'stats'
    start_urls = ['http://quotes.toscrape.com']

    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # spider = super(TestSpider, cls).from_crawler(crawler, *args, **kwargs)
        stat = cls(crawler.stats)
        crawler.signals.connect(stat.spider_closed, signals.spider_closed)
        return stat

    def spider_closed(self):
        # self.stats = stat
        txt_file = 'some_text.jl'
        with jsonlines.open(txt_file, 'w') as f:
            f.write(self.stats.get_all())

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse
            )

    def parse(self, response):
        content = response.xpath('//div[@class = "row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get()
            }
It turns out there is no get_all method; instead I had to use get_stats(). The documentation provides a few examples of the available methods:
stats.get_value()
stats.get_stats()
stats.max_value()/stats.min_value()
stats.inc_value()
stats.set_value()
Some further information is provided in the documentation for stats.
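As an illustration (not from the original answer), here is a minimal sketch of how a couple of those stats methods could be used from inside a spider; the custom stat keys such as custom/parsed_pages are invented for the example:
import scrapy


class StatsDemoSpider(scrapy.Spider):
    # Sketch: reading and writing stats through the crawler's stats collector.
    name = "stats_demo"
    start_urls = ["http://quotes.toscrape.com"]

    def parse(self, response):
        stats = self.crawler.stats                        # the active StatsCollector
        stats.inc_value("custom/parsed_pages")            # increment a custom counter
        stats.set_value("custom/last_url", response.url)  # store a single value
        # read a built-in stat (may be None if nothing was recorded yet)
        self.logger.info("responses so far: %s", stats.get_value("response_received_count"))
        yield {"url": response.url}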
The working part:
def spider_closed(self):
    # self.stats = stat
    txt_file = 'some_text.jl'
    with jsonlines.open(txt_file, 'w') as f:
        # f.write(f'{self.stats.get_all()}') --- Changed
        f.write(f'{self.stats.get_stats()}')
Output:
{
"log_count/INFO": 10,
"log_count/DEBUG": 3,
"start_time": datetime.datetime(2022, 7, 6, 16, 16, 30, 553373),
"memusage/startup": 59895808,
"memusage/max": 59895808,
"scheduler/enqueued/memory": 1,
"scheduler/enqueued": 1,
"scheduler/dequeued/memory": 1,
"scheduler/dequeued": 1,
"downloader/request_count": 1,
"downloader/request_method_count/GET": 1,
"downloader/request_bytes": 223,
"downloader/response_count": 1,
"downloader/response_status_count/200": 1,
"downloader/response_bytes": 2086,
"httpcompression/response_bytes": 11053,
"httpcompression/response_count": 1,
"response_received_count": 1,
"item_scraped_count": 1,
"elapsed_time_seconds": 0.34008,
"finish_time": datetime.datetime(2022, 7, 6, 16, 16, 30, 893453),
"finish_reason": "finished",
}

Scrapy finishing process before all pages are scraped

I set up a test Scrapy scraper which looks like this:
import scrapy


class testSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://www.realestate.com.kh/buy/']

    def parse(self, response):
        nr_pages = response.xpath('//div[@class="desktop-buttons"]/a[@class="css-1en2dru"]//text()').getall()
        for nr in range(1, 40):
            req_url = f'?page={nr}'
            self.item = {}
            self.item['page'] = nr
            yield scrapy.Request(url=response.urljoin(req_url), callback=self.parse_page, meta={'item': self.item})

    def parse_page(self, response):
        page = response.meta['item']['page']
        ads = response.xpath('//*[@class="featured css-ineky e1jqslr40"]//a/@href')
        for url in ads:
            absolute_url = response.urljoin(url.extract())
            self.item = {}
            self.item['page'] = page
            yield scrapy.Request(absolute_url, callback=self.parse_ad, meta={'item': self.item})

    def parse_ad(self, response):
        page = response.meta['item']['page']
        # DO THINGS
        yield {
            'page': page
        }
It goes through and loads each https://www.realestate.com.kh/buy/?page=NR, where NR is each number between 1 and 40.
On each of these pages, it gets all the ads.
On each ad of each page, it scrapes things and yields them.
It works fine for the first 26 items (the first two pages, and 2 or 3 items from the third one, out of 40) and then finishes the scraping without an error.
Here are the stats:
{'downloader/request_bytes': 23163,
'downloader/request_count': 66,
'downloader/request_method_count/GET': 66,
'downloader/response_bytes': 3801022,
'downloader/response_count': 66,
'downloader/response_status_count/200': 66,
'elapsed_time_seconds': 5.420036,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 10, 22, 19, 48, 37, 549853),
'item_scraped_count': 26,
'log_count/INFO': 9,
'memusage/max': 49963008,
'memusage/startup': 49963008,
'request_depth_max': 2,
'response_received_count': 66,
'scheduler/dequeued': 66,
'scheduler/dequeued/memory': 66,
'scheduler/enqueued': 66,
'scheduler/enqueued/memory': 66,
'start_time': datetime.datetime(2020, 10, 22, 19, 48, 32, 129817)}
What could be ending the scraping so early?
Your spider is actually going through all the pages; the problem is that in parse_page the selector for ads only works on the earlier pages, because on later pages the class name changes. The class name seems to be dynamically generated, so you need an XPath that won't select by class.
The XPath '//div/header/parent::div' returns the same div elements as '//*[@class="featured css-ineky e1jqslr40"]', so replacing this line should allow you to select all ads from all pages:
ads = response.xpath('//div/header/parent::div/article/a/@href')
Unrelated note:
This isn't causing any problems yet, but it's a recipe for future problems.
for url in ads:
    absolute_url = response.urljoin(url.extract())
    self.item = {}
    self.item['page'] = page
    yield scrapy.Request(absolute_url, callback=self.parse_ad, meta={'item': self.item})
Scrapy works in an asynchronous way, so most of the time using an instance variable (like self.item) gives the wrong intuition, as you don't really control the order in which the requests are parsed. That's why when you need to pass information between methods you use meta (or cb_kwargs) and not just store it in an instance variable.
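For illustration (this is not from the original answer, and assumes Scrapy 1.7+, which introduced cb_kwargs), a minimal sketch of passing the page number with each request via cb_kwargs instead of an instance variable could look like this:
import scrapy


class PagePassingSpider(scrapy.Spider):
    # Sketch: per-request data travels in cb_kwargs, not in self.item
    name = 'page_passing'
    start_urls = ['https://www.realestate.com.kh/buy/']

    def parse(self, response):
        for nr in range(1, 40):
            yield scrapy.Request(
                url=response.urljoin(f'?page={nr}'),
                callback=self.parse_page,
                cb_kwargs={'page': nr},  # bound to this request only
            )

    def parse_page(self, response, page):
        # same class-independent ad selector suggested above
        for url in response.xpath('//div/header/parent::div/article/a/@href').getall():
            yield scrapy.Request(
                response.urljoin(url),
                callback=self.parse_ad,
                cb_kwargs={'page': page},
            )

    def parse_ad(self, response, page):
        # DO THINGS
        yield {'page': page}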
You are going about getting the pages the wrong way; there are 50 pages on the site right now. You should instead walk through them using the next-page link. Look at this code:
import scrapy
from scrapy.shell import inspect_response


class testSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://www.realestate.com.kh/buy/']

    def parse(self, response):
        page = response.xpath('//div[@class="list"]//div[@class="desktop-buttons"]/a[@class="css-owq2hj"]/span/text()').get()
        ads = response.xpath('//div[@class="list"]/div/header/a/@href').getall()
        for url in ads:
            yield scrapy.Request(response.urljoin(url), callback=self.parse_ad, meta={'page': page})

        # next page
        url = response.xpath('//div[@class="list"]//div[@class="desktop-buttons"]/a[@class="css-owq2hj"]/following-sibling::a[1]/@href').get()
        if url:
            yield scrapy.Request(response.urljoin(url), callback=self.parse)

    def parse_ad(self, response):
        page = response.meta['page']
        # DO THINGS
        yield {
            'page': page
        }

Scrapy Crawlspider is only crawling the first 5 pages in a category

Currently, my CrawlSpider will only crawl roughly 20,000 products of the over 6.5M available. It seems that each category is being scraped, but only the first 5 pages of each category. I believe it is something with my LinkExtractor, but I am not sure.
CrawlSpider:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector


class DigikeyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    partnumber = scrapy.Field()
    manufacturer = scrapy.Field()
    description = scrapy.Field()
    quanity = scrapy.Field()
    minimumquanity = scrapy.Field()
    price = scrapy.Field()


class DigikeySpider(CrawlSpider):
    name = 'digikey'
    allowed_domains = ['digikey.com']
    start_urls = ['https://www.digikey.com/products/en']
    rules = (
        Rule(LinkExtractor(allow=('products', )), callback='parse_item'),
    )

    def parse_item(self, response):
        for row in response.css('table#productTable tbody tr'):
            item = DigikeyItem()
            item['partnumber'] = row.css('.tr-mfgPartNumber [itemprop="name"]::text').extract_first()
            item['manufacturer'] = row.css('[itemprop="manufacture"] [itemprop="name"]::text').extract_first()
            item['description'] = row.css('.tr-description::text').extract_first()
            item['quanity'] = row.css('.tr-qtyAvailable::text').extract_first()
            item['price'] = row.css('.tr-unitPrice::text').extract_first()
            item['minimumquanity'] = row.css('.tr-minQty::text').extract_first()
            yield item
Settings:
BOT_NAME = 'digikey'
SPIDER_MODULES = ['digikey.spiders']
NEWSPIDER_MODULE = 'digikey.spiders'
ROBOTSTXT_OBEY = False
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
}
Output:
2017-11-01 10:53:11 [scrapy.core.engine] INFO: Closing spider (finished)
2017-11-01 10:53:11 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 6,
'downloader/exception_type_count/twisted.web._newclient.ResponseNeverReceived': 6,
'downloader/request_bytes': 1198612,
'downloader/request_count': 988,
'downloader/request_method_count/GET': 988,
'downloader/response_bytes': 23932614,
'downloader/response_count': 982,
'downloader/response_status_count/200': 982,
'dupefilter/filtered': 46,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 11, 1, 17, 53, 11, 421641),
'item_scraped_count': 21783,
'log_count/DEBUG': 22773,
'log_count/ERROR': 2,
'log_count/INFO': 10,
'request_depth_max': 1,
'response_received_count': 982,
'retry/count': 4,
'retry/max_reached': 2,
'retry/reason_count/twisted.web._newclient.ResponseNeverReceived': 4,
'scheduler/dequeued': 988,
'scheduler/dequeued/memory': 988,
'scheduler/enqueued': 988,
'scheduler/enqueued/memory': 988,
'start_time': datetime.datetime(2017, 11, 1, 17, 49, 38, 427669)}
2017-11-01 10:53:11 [scrapy.core.engine] INFO: Spider closed (finished)
PS C:\Users\dalla_000\digikey>
With this particular site, it might make sense to do a two-stage crawl:
get a list of all the category URLs
iterate over all the URLs
One approach could be to use two spiders and a message queue. The first spider might look like this:
import scrapy
from bs4 import BeautifulSoup
import re
import math
import urllib
from kafka import KafkaClient, SimpleProducer

ITEMS_PER_PAGE = 500


class CreateXxxxxxxUrlListSpider(scrapy.Spider):
    kafka = KafkaClient('10.0.1.12:9092')
    producer = SimpleProducer(kafka)
    name = "create_xxxxxxx_url_list"
    allowed_domains = ["xxxxxxx.com"]
    start_urls = [
        "http://www.xxxxxxx.com/product-search/en?stock=1"
    ]

    def parse(self, response):
        soup = BeautifulSoup(response.body)
        catfilterlinks = soup.find_all('a', {'class': 'catfilterlink'})
        for catfilterlink in catfilterlinks:
            location = catfilterlink['href'].split("?")[0]
            items = re.match(".*\(([0-9]+) items\).*", catfilterlink.next_sibling).group(1)
            for page in range(int(math.ceil(float(items) / ITEMS_PER_PAGE))):
                if page == 0:
                    url = "http://www.xxxxxxx.com" + location + "?" + urllib.urlencode({"stock": 1})
                    self.producer.send_messages("xxxxxxx_search_page_urls", url)
                else:
                    url = "http://www.xxxxxxx.com" + location + "/page/" + str(page + 1) + "?" + urllib.urlencode({"stock": 1})
                    self.producer.send_messages("xxxxxxx_search_page_urls", url)
The first spider gets a list of all the pages to crawl and then writes them to a message queue (e.g. Kafka).
The second spider consumes URLs from the Kafka topic and crawls them. It might look something like this:
from scrapy_kafka.spiders import ListeningKafkaSpider
from ..items import PageHtml
from calendar import timegm
import time


class CrawlXxxxxxxUrlsSpider(ListeningKafkaSpider):
    name = 'crawl_xxxxxxx_urls_spider'
    allowed_domains = ["xxxxxxx.com"]
    topic = "xxxxxxx_search_page_urls"

    def parse(self, response):
        item = PageHtml()
        item['url'] = response.url
        item['html'] = response.body_as_unicode()
        item['ts'] = timegm(time.gmtime())
        return item
        # .... or whatever