Scrapy not returning data but instead a JSON error

I'm trying to scrape the artists from a site, and then scrape each artist's songs and their durations. However, I keep getting an error and I'm not sure what is causing it.
Here's what I have tried:
import scrapy
from scrapy.crawler import CrawlerProcess

class LyricalSpider(scrapy.Spider):
    name = 'lyrical'
    artists = [0, 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
               'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    start_urls = []
    for art in artists:
        start_urls.append(f'https://www.lyrics.com/artists/{art}')

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse
            )

    def parse(self, response):
        container = response.xpath("//table[@class='tdata']//tbody//tr")
        for artists in container:
            links = artists.xpath("(.//a)[position() mod 2 = 1]//@href").get()
            yield response.follow(
                url=response.urljoin(links),
                callback=self.parse_artists
            )

    def parse_artists(self, response):
        table = response.xpath("//div[@class='tdata-ext']//table")
        for items in table:
            yield {
                'songs': items.xpath("(.//td)[position() mod 2=1]//text()"),
                'duration': items.xpath("(.//td)[position() mod 2=0]/text()")
            }

process = CrawlerProcess(
    settings={
        'FEED_URI': 'artists.jl',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(LyricalSpider)
process.start()
The error, along with part of the crawl log:
TypeError: Object of type Selector is not JSON serializable
2022-01-23 14:50:26 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.lyrics.com/artist/H-monroy/2138107522> (referer: https://www.lyrics.com/artists/H)
2022-01-23 14:50:26 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.lyrics.com/artist/U-God/278759> (referer: https://www.lyrics.com/artists/U)
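For context: this error typically means the yielded items still contain Selector objects; response.xpath(...) returns a SelectorList until .get() or .getall() is called on it. A minimal sketch of how the last callback could yield plain strings instead, reusing the XPath expressions from the spider above:
def parse_artists(self, response):
    table = response.xpath("//div[@class='tdata-ext']//table")
    for items in table:
        yield {
            # .getall() converts each SelectorList into a list of plain strings,
            # which the jsonlines feed exporter can serialize
            'songs': items.xpath("(.//td)[position() mod 2=1]//text()").getall(),
            'duration': items.xpath("(.//td)[position() mod 2=0]/text()").getall()
        }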

Related

Scrapy not returning all items from pagination

I want to scrape all monitor items from the site https://www.startech.com.bd, but when I run my spider it returns only 60 results.
Here is my code, which doesn't work right:
import scrapy
import time

class StartechSpider(scrapy.Spider):
    name = 'startech'
    allowed_domains = ['startech.com.bd']
    start_urls = ['https://www.startech.com.bd/monitor/']

    def parse(self, response):
        monitors = response.xpath("//div[@class='p-item']")
        for monitor in monitors:
            item = monitor.xpath(".//h4[@class = 'p-item-name']/a/text()").get()
            price = monitor.xpath(".//div[@class = 'p-item-price']/span/text()").get()
            yield {
                'item': item,
                'price': price
            }

        next_page = response.xpath("//ul[@class = 'pagination']/li/a/@href").get()
        print(next_page)
        if next_page:
            yield response.follow(next_page, callback=self.parse)
Any help is much appreciated!
//ul[@class = 'pagination']/li/a/@href selects 10 page links at once, but you have to select a unique one, meaning only the next page. The following XPath expression grabs the right pagination link.
Code:
next_page = response.xpath("//a[contains(text(), 'NEXT')]/@href").get()
print(next_page)
if next_page:
    yield response.follow(next_page, callback=self.parse)
Output:
2022-11-26 01:45:06 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.startech.com.bd/monitor?page=19> (referer: https://www.startech.com.bd/monitor?page=18)
2022-11-26 01:45:06 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.startech.com.bd/monitor?page=19>
{'item': 'HP E27q G4 27 Inch 2K QHD IPS Monitor', 'price': '41,000৳'}
None
2022-11-26 01:45:06 [scrapy.core.engine] INFO: Closing spider (finished)
2022-11-26 01:45:06 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 6702,
'downloader/request_count': 19,
'downloader/request_method_count/GET': 19,
'downloader/response_bytes': 546195,
'downloader/response_count': 19,
'downloader/response_status_count/200': 19,
'elapsed_time_seconds': 9.939978,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 11, 25, 19, 45, 6, 915772),
'httpcompression/response_bytes': 6200506,
'httpcompression/response_count': 19,
'item_scraped_count': 361,
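As a side note (a sketch, not taken from the answer above): instead of picking a single pagination link with .get(), another option is to follow every link in the pagination bar and let Scrapy's built-in duplicate filter drop pages that have already been requested:
# Follow all pagination links; the scheduler's dupefilter skips URLs it has seen before
for href in response.xpath("//ul[@class = 'pagination']/li/a/@href").getall():
    yield response.follow(href, callback=self.parse)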

How to store all scraped stats the moment before the spider closes?

I want to store all the stats collected by the spider in a single output file in JSON format. However, I get this error:
'MemoryStatsCollector' object has no attribute 'get_all'
The documentation mentions that stats.get_all is how you get all the stats. What is the correct way to implement this?
import scrapy
from scrapy import signals
from scrapy import crawler
import jsonlines

class TestSpider(scrapy.Spider):
    name = 'stats'
    start_urls = ['http://quotes.toscrape.com']

    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # spider = super(TestSpider, cls).from_crawler(crawler, *args, **kwargs)
        stat = cls(crawler.stats)
        crawler.signals.connect(stat.spider_closed, signals.spider_closed)
        return stat

    def spider_closed(self):
        # self.stats = stat
        txt_file = 'some_text.jl'
        with jsonlines.open(txt_file, 'w') as f:
            f.write(self.stats.get_all())

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse
            )

    def parse(self, response):
        content = response.xpath('//div[@class = "row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get()
            }
It turns out there is no get_all method; instead I had to use get_stats(). The documentation lists a few of the available methods, such as:
stats.get_value()
stats.get_stats()
stats.max_value()/stats.min_value()
stats.inc_value()
stats.set_value()
Further information is provided in the documentation for stats.
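For illustration, a minimal sketch of how these methods can be called from code that holds a reference to crawler.stats, as the spider above does (the custom key names are just examples):
# self.stats is crawler.stats, as wired up in from_crawler above
count = self.stats.get_value('item_scraped_count', 0)  # read one stat, with a default
self.stats.set_value('custom/last_run', 'ok')          # store a custom value (example key)
self.stats.inc_value('custom/pages_seen')              # bump a counter (example key)
all_stats = self.stats.get_stats()                     # the whole stats dict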
The working part:
def spider_closed(self):
    # self.stats = stat
    txt_file = 'some_text.jl'
    with jsonlines.open(txt_file, 'w') as f:
        # f.write(f'{self.stats.get_all()}') --- Changed
        f.write(f'{self.stats.get_stats()}')
Output:
{
"log_count/INFO": 10,
"log_count/DEBUG": 3,
"start_time": datetime.datetime(2022, 7, 6, 16, 16, 30, 553373),
"memusage/startup": 59895808,
"memusage/max": 59895808,
"scheduler/enqueued/memory": 1,
"scheduler/enqueued": 1,
"scheduler/dequeued/memory": 1,
"scheduler/dequeued": 1,
"downloader/request_count": 1,
"downloader/request_method_count/GET": 1,
"downloader/request_bytes": 223,
"downloader/response_count": 1,
"downloader/response_status_count/200": 1,
"downloader/response_bytes": 2086,
"httpcompression/response_bytes": 11053,
"httpcompression/response_count": 1,
"response_received_count": 1,
"item_scraped_count": 1,
"elapsed_time_seconds": 0.34008,
"finish_time": datetime.datetime(2022, 7, 6, 16, 16, 30, 893453),
"finish_reason": "finished",
}
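One caveat (an observation, not part of the original answer): writing the f-string stores the dict's string representation rather than strict JSON, since values such as the datetime objects above are not JSON serializable as-is. If valid JSON output is wanted, a sketch along these lines could work:
import json

def spider_closed(self):
    with open('some_text.jl', 'w') as f:
        # default=str converts datetimes (and anything else non-serializable) to strings
        f.write(json.dumps(self.stats.get_stats(), default=str))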

How to get cookies from the response of scrapy-splash

I want to get the cookie values from the Splash response object, but it is not working as I expected.
Here is the spider code:
class AmazonSpider(scrapy.Spider):
    name = 'amazon'
    allowed_domains = ['amazon.com']

    def start_requests(self):
        url = 'https://www.amazon.com/gp/goldbox?ref_=nav_topnav_deals'
        yield SplashRequest(url, self.parse, args={'wait': 0.5})

    def parse(self, response):
        print(response.headers)
Output log:
2019-08-17 11:53:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/robots.txt> (referer: None)
2019-08-17 11:53:08 [scrapy.core.engine] DEBUG: Crawled (404) <GET http://192.168.99.100:8050/robots.txt> (referer: None)
2019-08-17 11:53:24 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/gp/goldbox?ref_=nav_topnav_deals via http://192.168.99.100:8050/render.html> (referer: None)
{b'Date': [b'Sat, 17 Aug 2019 06:23:09 GMT'], b'Server': [b'TwistedWeb/18.9.0'], b'Content-Type': [b'text/html; charset=utf-8']}
2019-08-17 11:53:24 [scrapy.core.engine] INFO: Closing spider (finished)
You can try the following approach:
- write a small Lua script that returns the html + the cookies:
lua_request = """
function main(splash)
    splash:init_cookies(splash.args.cookies)
    assert(splash:go(splash.args.url))
    splash:wait(0.5)
    return {
        html = splash:html(),
        cookies = splash:get_cookies()
    }
end
"""
Change your Request to the following:
yield SplashRequest(
    url,
    self.parse,
    endpoint='execute',
    args={'lua_source': self.lua_request}
)
Then find the cookies in your parse method as follows:
def parse(self, response):
    cookies = response.data['cookies']
    headers = response.headers
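Everything the Lua script returns ends up in response.data, so the rendered HTML is available alongside the cookies. A small illustrative sketch (the log message and item key are just examples):
def parse(self, response):
    # response.data holds the table returned by the Lua script
    cookies = response.data['cookies']   # list of cookie dicts from splash:get_cookies()
    html = response.data['html']         # the rendered page source
    self.logger.info("Splash returned %d cookies", len(cookies))
    yield {'cookies': cookies}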

Scrapy Crawlspider is only crawling the first 5 pages in a category

Currently, my CrawlSpider will only crawl roughly 20,000 products of the over 6.5M available. It seems that each category is being scraped, but only the first 5 pages of each category. I believe it is something with my LinkExtractor, but I am not sure.
CrawlSpider:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector

class DigikeyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    partnumber = scrapy.Field()
    manufacturer = scrapy.Field()
    description = scrapy.Field()
    quanity = scrapy.Field()
    minimumquanity = scrapy.Field()
    price = scrapy.Field()

class DigikeySpider(CrawlSpider):
    name = 'digikey'
    allowed_domains = ['digikey.com']
    start_urls = ['https://www.digikey.com/products/en']
    rules = (
        Rule(LinkExtractor(allow=('products', )), callback='parse_item'),
    )

    def parse_item(self, response):
        for row in response.css('table#productTable tbody tr'):
            item = DigikeyItem()
            item['partnumber'] = row.css('.tr-mfgPartNumber [itemprop="name"]::text').extract_first()
            item['manufacturer'] = row.css('[itemprop="manufacture"] [itemprop="name"]::text').extract_first()
            item['description'] = row.css('.tr-description::text').extract_first()
            item['quanity'] = row.css('.tr-qtyAvailable::text').extract_first()
            item['price'] = row.css('.tr-unitPrice::text').extract_first()
            item['minimumquanity'] = row.css('.tr-minQty::text').extract_first()
            yield item
Settings:
BOT_NAME = 'digikey'
SPIDER_MODULES = ['digikey.spiders']
NEWSPIDER_MODULE = 'digikey.spiders'
ROBOTSTXT_OBEY = False
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
}
Output:
2017-11-01 10:53:11 [scrapy.core.engine] INFO: Closing spider (finished)
2017-11-01 10:53:11 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 6,
'downloader/exception_type_count/twisted.web._newclient.ResponseNeverReceived': 6,
'downloader/request_bytes': 1198612,
'downloader/request_count': 988,
'downloader/request_method_count/GET': 988,
'downloader/response_bytes': 23932614,
'downloader/response_count': 982,
'downloader/response_status_count/200': 982,
'dupefilter/filtered': 46,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 11, 1, 17, 53, 11, 421641),
'item_scraped_count': 21783,
'log_count/DEBUG': 22773,
'log_count/ERROR': 2,
'log_count/INFO': 10,
'request_depth_max': 1,
'response_received_count': 982,
'retry/count': 4,
'retry/max_reached': 2,
'retry/reason_count/twisted.web._newclient.ResponseNeverReceived': 4,
'scheduler/dequeued': 988,
'scheduler/dequeued/memory': 988,
'scheduler/enqueued': 988,
'scheduler/enqueued/memory': 988,
'start_time': datetime.datetime(2017, 11, 1, 17, 49, 38, 427669)}
2017-11-01 10:53:11 [scrapy.core.engine] INFO: Spider closed (finished)
PS C:\Users\dalla_000\digikey>
With this particular site, it might make sense to do a two-stage crawl:
- get a list of all the category URLs
- iterate over all the URLs
One approach could be to use two spiders and a message queue. The first spider might look like this:
import scrapy
from bs4 import BeautifulSoup
import re
import math
import urllib
from kafka import KafkaClient, SimpleProducer

ITEMS_PER_PAGE = 500

class CreateXxxxxxxUrlListSpider(scrapy.Spider):
    kafka = KafkaClient('10.0.1.12:9092')
    producer = SimpleProducer(kafka)
    name = "create_xxxxxxx_url_list"
    allowed_domains = ["xxxxxxx.com"]
    start_urls = [
        "http://www.xxxxxxx.com/product-search/en?stock=1"
    ]

    def parse(self, response):
        soup = BeautifulSoup(response.body)
        catfilterlinks = soup.find_all('a', {'class': 'catfilterlink'})
        for catfilterlink in catfilterlinks:
            location = catfilterlink['href'].split("?")[0]
            items = re.match(".*\(([0-9]+) items\).*", catfilterlink.next_sibling).group(1)
            for page in range(int(math.ceil(float(items) / ITEMS_PER_PAGE))):
                if page == 0:
                    url = "http://www.xxxxxxx.com" + location + "?" + urllib.urlencode({"stock": 1})
                    self.producer.send_messages("xxxxxxx_search_page_urls", url)
                else:
                    url = "http://www.xxxxxxx.com" + location + "/page/" + str(page + 1) + "?" + urllib.urlencode({"stock": 1})
                    self.producer.send_messages("xxxxxxx_search_page_urls", url)
The first spider gets a list of all the pages to crawl and then writes them to a message queue (e.g. Kafka).
The second spider would consume URLs from the Kafka topic and crawl them. It might look something like this:
from scrapy_kafka.spiders import ListeningKafkaSpider
from ..items import PageHtml
from calendar import timegm
import time

class CrawlXxxxxxxUrlsSpider(ListeningKafkaSpider):
    name = 'crawl_xxxxxxx_urls_spider'
    allowed_domains = ["xxxxxxx.com"]
    topic = "xxxxxxx_search_page_urls"

    def parse(self, response):
        item = PageHtml()
        item['url'] = response.url
        item['html'] = response.body_as_unicode()
        item['ts'] = timegm(time.gmtime())
        return item
        # .... or whatever
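The same two-stage idea can also be sketched without a message queue, by letting a single spider collect the page URLs in one callback and scrape them in another (the selectors and URL below are illustrative assumptions, not tested against the real site):
import scrapy

class TwoStageSpider(scrapy.Spider):
    name = 'two_stage'
    start_urls = ['http://www.xxxxxxx.com/product-search/en?stock=1']  # placeholder, as above

    def parse(self, response):
        # Stage 1: collect the category/page URLs
        for href in response.css('a.catfilterlink::attr(href)').getall():
            yield response.follow(href, callback=self.parse_page)

    def parse_page(self, response):
        # Stage 2: scrape the rows on each listing page
        for row in response.css('table#productTable tbody tr'):
            yield {'partnumber': row.css('.tr-mfgPartNumber [itemprop="name"]::text').get()}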

scrapy: Why is the parse_item function not called?

Here is my Spider:
import scrapy
import urlparse
from scrapy.http import Request

class BasicSpider(scrapy.Spider):
    name = "basic2"
    allowed_domains = ["cnblogs"]
    start_urls = (
        'http://www.cnblogs.com/kylinlin/',
    )

    def parse(self, response):
        next_site = response.xpath(".//*[@id='nav_next_page']/a/@href")
        for url in next_site.extract():
            yield Request(urlparse.urljoin(response.url, url))

        item_selector = response.xpath(".//*[@class='postTitle']/a/@href")
        for url in item_selector.extract():
            yield Request(url=urlparse.urljoin(response.url, url),
                          callback=self.parse_item)

    def parse_item(self, response):
        print "+=====================>>test"
Here is the output:
2016-08-12 14:46:20 [scrapy] INFO: Spider opened
2016-08-12 14:46:20 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-08-12 14:46:20 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-08-12 14:46:20 [scrapy] DEBUG: Crawled (200) <GET http://www.cnblogs.com/robots.txt> (referer: None)
2016-08-12 14:46:20 [scrapy] DEBUG: Crawled (200) <GET http://www.cnblogs.com/kylinlin/> (referer: None)
2016-08-12 14:46:20 [scrapy] DEBUG: Filtered offsite request to 'www.cnblogs.com': <GET http://www.cnblogs.com/kylinlin/default.html?page=2>
2016-08-12 14:46:20 [scrapy] INFO: Closing spider (finished)
2016-08-12 14:46:20 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 445,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 5113,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 8, 12, 6, 46, 20, 420000),
'log_count/DEBUG': 4,
'log_count/INFO': 7,
'offsite/domains': 1,
'offsite/filtered': 11,
'request_depth_max': 1,
'response_received_count': 2,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2016, 8, 12, 6, 46, 20, 131000)}
2016-08-12 14:46:20 [scrapy] INFO: Spider closed (finished)
Why are 0 pages crawled?
I cannot understand why there is no output like "+=====================>>test".
Could someone help me out?
Notice this line in your log:
2016-08-12 14:46:20 [scrapy] DEBUG: Filtered offsite request to 'www.cnblogs.com': <GET http://www.cnblogs.com/kylinlin/default.html?page=2>
The request is filtered as offsite because allowed_domains has to list the domains the spider is allowed to crawl, and yours is set to:
allowed_domains = ["cnblogs"]
which is not even a domain. It should be:
allowed_domains = ["cnblogs.com"]
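As a quick check (a sketch, not part of the original answer): an entry in allowed_domains also covers its subdomains, so "cnblogs.com" allows www.cnblogs.com and the offsite filter will no longer drop the follow-up requests. An individual request can also bypass the offsite filter by setting dont_filter=True:
allowed_domains = ["cnblogs.com"]  # also matches www.cnblogs.com and other subdomains

# ...or bypass the offsite filter for one specific request:
yield Request(urlparse.urljoin(response.url, url), dont_filter=True)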