CsvItemExporter for multiple files in custom item pipeline not exporting all items - scrapy

I have created an item pipeline as an answer to this question.
It is supposed to create a new file for every page according to the page_no value set in the item. This works mostly fine.
The problem is with the last csv file generated by the pipeline/item exporter, page-10.csv.
The last 10 values are not exported, so the file stays empty.
What could be the reason for this behaviour?
pipelines.py
from scrapy.exporters import CsvItemExporter
class PerFilenameExportPipeline:
    """Distribute items across multiple CSV files according to their 'page_no' field"""

    def open_spider(self, spider):
        # Maps 'page-<n>' to the CsvItemExporter that writes 'page-<n>.csv'.
        self.filename_to_exporter = {}

    def spider_closed(self, spider):
        # NOTE(review): this is the behaviour the question is about. Nothing
        # here registers `spider_closed` as a signal handler, and the hook
        # Scrapy calls on item pipelines is `close_spider`, so this method
        # presumably never runs. The files opened in `_exporter_for_item`
        # are also never closed, so buffered rows may never reach disk.
        for exporter in self.filename_to_exporter.values():
            exporter.finish_exporting()

    def _exporter_for_item(self, item):
        """Return the exporter for the item's page, creating it on first use.

        Removes 'page_no' from the item so it is not written as a column.
        """
        filename = 'page-' + str(item['page_no'])
        del item['page_no']
        if filename not in self.filename_to_exporter:
            f = open(f'{filename}.csv', 'wb')
            exporter = CsvItemExporter(f, export_empty_fields=True)
            exporter.start_exporting()
            self.filename_to_exporter[filename] = exporter
        return self.filename_to_exporter[filename]

    def process_item(self, item, spider):
        """Export the item to its page's CSV file and pass it on unchanged."""
        exporter = self._exporter_for_item(item)
        exporter.export_item(item)
        return item
spider
import scrapy
from ..pipelines import PerFilenameExportPipeline
class spidey(scrapy.Spider):
    """Crawl quotes.toscrape.com and tag every quote with its page number."""

    name = "idk"
    custom_settings = {
        'ITEM_PIPELINES': {
            PerFilenameExportPipeline: 100
        }
    }

    def start_requests(self):
        # The first page is numbered 1; `parse` receives it via cb_kwargs.
        yield scrapy.Request("http://quotes.toscrape.com/", cb_kwargs={'page_no': 1})

    def parse(self, response, page_no):
        """Yield one dict per quote on the page, then follow the 'next' link."""
        for qts in response.xpath("//*[@class=\"quote\"]"):
            yield {
                'page_no': page_no,
                'author' : qts.xpath("./span[2]/small/text()").get(),
                'quote' : qts.xpath("./*[@class=\"text\"]/text()").get()
            }
        next_pg = response.xpath('//li[@class="next"]/a/@href').get()
        if next_pg is not None:
            # Pass the incremented page number on to the next callback.
            yield response.follow(next_pg, cb_kwargs={'page_no': page_no + 1})

I know, 2 years later, but still - it might turn out helpful for someone.
It looks like you're never closing the file you're writing to (as you're using inline open). Please compare your code to the one in Scrapy's docs (the "Using Item Exporters" section): https://docs.scrapy.org/en/latest/topics/exporters.html
Besides, the method should now be called "close_spider", not "spider_closed"
Changing your code to the following should help:
from scrapy.exporters import CsvItemExporter
class PerFilenameExportPipeline:
    """Distribute items across multiple CSV files according to their 'page_no' field.

    Each output file is kept together with its exporter so that both can be
    finished and closed when the spider closes.
    """

    def open_spider(self, spider):
        # Maps 'page-<n>' to an (exporter, open file object) tuple.
        self.filename_to_exporter = {}

    def close_spider(self, spider):
        # iterating over exporter-file tuples instead of only exporters
        for exporter, csv_file in self.filename_to_exporter.values():
            exporter.finish_exporting()
            # closing the file
            csv_file.close()

    def _exporter_for_item(self, item):
        """Return the exporter for the item's page, creating it on first use.

        Removes 'page_no' from the item so it is not written as a column.
        """
        filename = 'page-' + str(item['page_no'])
        del item['page_no']
        if filename not in self.filename_to_exporter:
            csv_file = open(f'{filename}.csv', 'wb')
            # The exporter must be given the file that was just opened.
            exporter = CsvItemExporter(csv_file, export_empty_fields=True)
            exporter.start_exporting()
            # adding both exporter & file to later be closed as the dict's value
            self.filename_to_exporter[filename] = (exporter, csv_file)
        # picking only the exporter via [0]
        return self.filename_to_exporter[filename][0]

    def process_item(self, item, spider):
        """Export the item to its page's CSV file and pass it on unchanged."""
        exporter = self._exporter_for_item(item)
        exporter.export_item(item)
        return item

Related

No adapter found for objects of type: 'itemadapter.adapter.ItemAdapter'

I want to change the names of images downloaded from a webpage. I want to use standard names given by the website as opposed to cleaning the request url for it.
I have the following pipeline.py
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
class ScrapyExercisesPipeline:
    """Pipeline from the question that wraps each item in an ItemAdapter."""

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        # NOTE(review): the adapter, not the item, is returned, so whatever
        # runs next receives an ItemAdapter. This matches the reported
        # "No adapter found for objects of type ... ItemAdapter" error.
        return adapter
class DownfilesPipeline(ImagesPipeline):
    """Images pipeline from the question that tries to name files via another pipeline."""

    def file_path(self, request, response=None, info=None, item=None):
        # NOTE(review): kept as asked in the question. `process_item` is
        # called without its `item` and `spider` arguments, and the other
        # pipeline is also registered in ITEM_PIPELINES.
        adapter = ScrapyExercisesPipeline().process_item()[0]
        image_name: str = f'{adapter}.jpg'
        return image_name
This produces the following error:
raise TypeError(f"No adapter found for objects of type: {type(item)} ({item})")
TypeError: No adapter found for objects of type: <class 'itemadapter.adapter.ItemAdapter'> (<ItemAdapter for ScrapyExercisesItem(name='unknown267', images=['https://bl-web-assets.britishland.com/live/meadowhall/s3fs-public/styles/retailer_thumbnail/public/retailer/boots_1.jpg?qQ.NHRs04tdmGxoyZKerRHcrhCImB3JH&itok=PD5LxLmS&cb=1657061667-curtime&v=1657061667-curtime'])>)
scraper.py:
import scrapy
from scrapy_exercises.items import ScrapyExercisesItem
class TestSpider(scrapy.Spider):
    """Collect image URLs and their alt-text names from the listing page."""

    name = 'test'
    #allowed_domains = ['x']
    start_urls = ['https://www.meadowhall.co.uk/eatdrinkshop?page=1']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                cb_kwargs = {'pg':0}
            )

    def parse(self, response,pg):
        """Yield one item per <img> found under the 'view-content' container.

        Images with an empty alt text are named 'unknown<N>', where N counts
        the images seen so far on this page.
        """
        # The counter always restarts at 0, whatever value was passed in.
        pg=0
        content_page = response.xpath("//div[@class='view-content']//div")
        for cnt in content_page:
            image_url = cnt.xpath(".//img//@src").get()
            image_name = cnt.xpath(".//img//@alt").get()
            if image_url is None:
                continue
            pg+=1
            items = ScrapyExercisesItem()
            # Only an empty alt text gets the fallback; a missing alt
            # attribute (None) is stored as-is, as in the original.
            items['name'] = 'unknown'+f'{pg}' if image_name == '' else image_name
            items['images'] = [image_url]
            yield items
settings.py
# Item pipelines enabled for the project, each mapped to its order value.
ITEM_PIPELINES = {
# The stock ImagesPipeline is left disabled; the DownfilesPipeline subclass is registered instead.
#'scrapy.pipelines.images.ImagesPipeline': 1,
'scrapy_exercises.pipelines.ScrapyExercisesPipeline':45,
'scrapy_exercises.pipelines.DownfilesPipeline': 55
}
from pathlib import Path
import os
# Parent of the directory that contains this settings file.
BASE_DIR = Path(__file__).resolve().parent.parent
# <BASE_DIR>/images; presumably where the images pipeline stores downloads (confirm against the Scrapy docs).
IMAGES_STORE = os.path.join(BASE_DIR, 'images')
# 'images' matches the item field the spider fills with image URLs.
IMAGES_URLS_FIELD = 'images'
IMAGES_RESULT_FIELD = 'results'
You are calling a pipeline from within another pipeline while the first one is also registered in your settings to run as a pipeline. It would be simpler to just read the name field from your item in your DownfilesPipeline and return it.
Change your pipelines.py file to:
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
class DownfilesPipeline(ImagesPipeline):
    """Images pipeline that names each downloaded file after the item's 'name' field."""

    def file_path(self, request, response=None, info=None, item=None):
        """Return '<name>.jpg', using the 'name' value of the item being processed."""
        return item['name'] + '.jpg'
You also need to turn off the ScrapyExercisesPipeline in your settings

How to store scraped links in Scrapy

I did a lot of searches on the web but I couldn't find anything related or maybe it has to do with the wording used.
Basically, I would like to write a spider that is able to save the scraped links and to check whether other links have already been scraped. Is there any built-in function in Scrapy to do so?
Many thanks
You can write your own method for this purpose. I have written one in my project that you can use as a reference. It keeps a dictionary called already_parsed_urls, and I update this dictionary in every callback.
You can look at the below code snippet and take reference.
from scrapy.spiders import CrawlSpider
from scrapy_splash import SplashRequest
class Spider(CrawlSpider):
    """Skeleton spider that remembers which URLs it has already parsed."""

    name = 'test'
    allowed_domains = []
    web_url = ''
    start_urls = ['']
    counter = 0
    # URL -> 1 for every response that `parse_items` has already handled.
    already_parsed_urls = {}
    wait_time = 3
    timeout = '90'

    def start_requests(self):
        for start_url in self.start_urls:
            yield SplashRequest(start_url, callback=self.parse_courses,
                                args={'wait': self.wait_time, 'timeout': self.timeout})

    def parse_courses(self, response):
        # NOTE(review): placeholder from the answer. `course_urls` must be
        # filled from `response` before use; as written, indexing the empty
        # list raises IndexError.
        course_urls = []
        yield SplashRequest(course_urls[0], callback=self.parse_items, args={'wait': self.wait_time})

    def parse_items(self, response):
        """Record the response URL on first sight; return {} for a repeat."""
        if not self.already_parsed_urls.get(response.url):
            # Get Program URL
            program_url = response.url
            self.already_parsed_urls[response.url] = 1
        else:
            return {}

Count scraped items from scrapy

Looking to just count the number of things scraped. I am new to Python and scraping, just following the example, and want to know how to count the number of times Albert Einstein shows up and print it to a JSON file. I just cannot get it to print to a file using print, yield, or return.
import scrapy
class QuotesSpider(scrapy.Spider):
    """Spider from the question: counts 'Albert Einstein' quotes per page."""

    name = "author"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]

    def parse(self, response):
        # NOTE(review): kept as asked in the question. `i` is local to this
        # call, restarts at 0 for every page, and is never yielded or stored.
        i=0
        for quote in response.css('div.quote'):
            author = quote.css("small.author::text").get()
            if author == "Albert Einstein":
                i+=1
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
I found out how to get to the item_scraped_count that shows up in the log output at the end of the spider.
import scrapy
from scrapy import signals
class CountSpider(scrapy.Spider):
    """Spider that reads the scraped-item count from the crawler stats on close."""

    name = 'count'
    start_urls = ['https://example.com']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and connect `spider_closed` to the matching signal."""
        spider = super(CountSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        stats = spider.crawler.stats.get_stats()
        # Fall back to 0 rather than raising KeyError when the key is absent.
        numcount = str(stats.get('item_scraped_count', 0))
Here I can create a csv file with the stats
In Scrapy, requests are made asynchronously, and each request calls back to the parse function independently. Your i variable is not an instance variable, so its scope is limited to each function call.
Even if that wasn't the case, the recursion would turn your counter to 0 in each callback.
I would suggest you take a look at Scrapy items; at the end of the Scrapy process it will return a counter with the number of scraped items. That may be overkill, though, if you don't want to store any information other than the number of occurrences of "Albert Einstein".
If that's all you want, you can use a dirtier solution: make your counter an instance variable and have the parse method increment it, like this:
import scrapy
class QuotesSpider(scrapy.Spider):
    """Count 'Albert Einstein' quotes across all pages in `self.counter`."""

    name = "author"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]
    # Running total; the first `+=` creates an instance attribute from it.
    counter = 0

    def parse(self, response):
        for quote in response.css('div.quote'):
            author = quote.css("small.author::text").get()
            if author == "Albert Einstein":
                self.counter += 1
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

Is there a way to get ID of the starting URL from database in scrapy with some function, make_requests_from_url

I am pulling start URLs from a database and also need the IDs associated with the URLs, so that I can pass them to the item pipeline and store them in the table along with the items.
I am using "make_requests_from_url(row[1])" to pass the start URLs to "start_urls = []", which forms the list of starting URLs. The ID, row[0], is what I need to pass to the items when the respective items are crawled.
Below is my spider code:
import scrapy
import mysql.connector
from ..items import AmzProductinfoItem
class AmzProductinfoSpiderSpider(scrapy.Spider):
    """Spider from the question whose start URLs are read from a MySQL table."""

    name = 'amz_ProductInfo_Spider'
    nextPageNumber = 2
    allowed_domains = ['amazon.in']
    start_urls = []
    url_fid = []

    def __init__(self, *args, **kwargs):
        # Forward spider arguments so the base class still initialises itself.
        super().__init__(*args, **kwargs)
        self.connection = mysql.connector.connect(host='localhost', database='datacollecter', user='root', password='', charset="utf8", use_unicode=True)
        self.cursor = self.connection.cursor()

    def start_requests(self):
        sql_get_StartUrl = 'SELECT * FROM database.table'
        self.cursor.execute(sql_get_StartUrl)
        rows = self.cursor.fetchall()
        for row in rows:
            # NOTE(review): kept as asked in the question. Only the URL
            # (row[1]) is used; the id in row[0] is not passed along.
            yield self.make_requests_from_url(row[1])
I have tried comparing "response.url" in the parse method, but that changes as the spider moves on to the next page.
I am not sure how I can achieve this. Any direction is appreciated.
It's not clear why you need self.make_requests_from_url. You can yield your requests directly:
def start_requests(self):
    """Yield one request per database row, carrying the row id in `meta`."""
    sql_get_StartUrl = 'SELECT * FROM database.table'
    self.cursor.execute(sql_get_StartUrl)
    rows = self.cursor.fetchall()
    for row in rows:
        # row[0] is used as the id and row[1] as the URL.
        yield scrapy.Request(url=row[1], meta={'url_id': row[0]}, callback=self.parse)
def parse(self, response):
    # Read back the id that was attached to the request's meta.
    url_id = response.meta["url_id"]

Scrape multiple URLs with Scrapy

How can I scrape multiple URLs with Scrapy?
Am I forced to make multiple crawlers?
class TravelSpider(BaseSpider):
name = "speedy"
allowed_domains = ["example.com"]
start_urls = ["http://example.com/category/top/page-%d/" % i for i in xrange(4),"http://example.com/superurl/top/page-%d/" % i for i in xrange(55)]
def parse(self, response):
hxs = HtmlXPathSelector(response)
items = []
item = TravelItem()
item['url'] = hxs.select('//a[#class="out"]/#href').extract()
out = "\n".join(str(e) for e in item['url']);
print out
Python says:
NameError: name 'i' is not defined
But when I use one URL it works fine!
start_urls = ["http://example.com/category/top/page-%d/" % i for i in xrange(4)"]
Your python syntax is incorrect, try:
start_urls = ["http://example.com/category/top/page-%d/" % i for i in xrange(4)] + \
["http://example.com/superurl/top/page-%d/" % i for i in xrange(55)]
If you need to write code to generate start requests, you can define a start_requests() method instead of using start_urls.
You can initialize start_urls in the __init__ method:
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
class TravelItem(Item):
    """Item holding the links extracted from a page."""

    url = Field()
class TravelSpider(BaseSpider):
def __init__(self, name=None, **kwargs):
self.start_urls = []
self.start_urls.extend(["http://example.com/category/top/page-%d/" % i for i in xrange(4)])
self.start_urls.extend(["http://example.com/superurl/top/page-%d/" % i for i in xrange(55)])
super(TravelSpider, self).__init__(name, **kwargs)
name = "speedy"
allowed_domains = ["example.com"]
def parse(self, response):
hxs = HtmlXPathSelector(response)
items = []
item = TravelItem()
item['url'] = hxs.select('//a[#class="out"]/#href').extract()
out = "\n".join(str(e) for e in item['url']);
print out
Hope that helps.
Python resolves names through only four scopes (LEGB: Local, Enclosing, Global, Built-in). The class body and the list comprehension each have their own local scope, and because a class body is not a function, it does not form an Enclosing scope for the comprehension. They are therefore two separate local scopes that cannot access each other's names.
So, don't use a comprehension's 'for' together with class variables.