No adapter found for objects of type: 'itemadapter.adapter.ItemAdapter' - scrapy

I want to change the names of images downloaded from a webpage. I want to use standard names given by the website as opposed to cleaning the request url for it.
I have the following pipeline.py
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
class ScrapyExercisesPipeline:
def process_item(self, item, spider):
adapter = ItemAdapter(item)
return adapter
class DownfilesPipeline(ImagesPipeline):
def file_path(self, request, response=None, info=None, item=None):
adapter = ScrapyExercisesPipeline().process_item()[0]
image_name: str = f'{adapter}.jpg'
return image_name
This produces the following error:
raise TypeError(f"No adapter found for objects of type: {type(item)} ({item})")
TypeError: No adapter found for objects of type: <class 'itemadapter.adapter.ItemAdapter'> (<ItemAdapter for ScrapyExercisesItem(name='unknown267', images=['https://bl-web-assets.britishland.com/live/meadowhall/s3fs-public/styles/retailer_thumbnail/public/retailer/boots_1.jpg?qQ.NHRs04tdmGxoyZKerRHcrhCImB3JH&itok=PD5LxLmS&cb=1657061667-curtime&v=1657061667-curtime'])>)
scraper.py:
import scrapy
from scrapy_exercises.items import ScrapyExercisesItem
class TestSpider(scrapy.Spider):
name = 'test'
#allowed_domains = ['x']
start_urls = ['https://www.meadowhall.co.uk/eatdrinkshop?page=1']
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url=url,
callback=self.parse,
cb_kwargs = {'pg':0}
)
def parse(self, response,pg):
pg=0
content_page = response.xpath("//div[#class='view-content']//div")
for cnt in content_page:
image_url = cnt.xpath(".//img//#src").get()
image_name = cnt.xpath(".//img//#alt").get()
if image_url != None:
pg+=1
items = ScrapyExercisesItem()
if image_name == '':
items['name'] = 'unknown'+f'{pg}'
items['images'] = [image_url]
yield items
else:
items['name'] = image_name
items['images'] = [image_url]
yield items
settings.py
ITEM_PIPELINES = {
#'scrapy.pipelines.images.ImagesPipeline': 1,
'scrapy_exercises.pipelines.ScrapyExercisesPipeline':45,
'scrapy_exercises.pipelines.DownfilesPipeline': 55
}
from pathlib import Path
import os
BASE_DIR = Path(__file__).resolve().parent.parent
IMAGES_STORE = os.path.join(BASE_DIR, 'images')
IMAGES_URLS_FIELD = 'images'
IMAGES_RESULT_FIELD = 'results'

You are calling on a pipeline from within your pipeline while that pipeline is also registered in your settings to be run as a pipeline. It would be simpler to just extract the name field from your item in your DownFilesPipeLine and return it.
Change your pipelines.py file to:
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
class DownfilesPipeline(ImagesPipeline):
def file_path(self, request, response=None, info=None, item=None):
return item['name'] + '.jpg'
You also need to turn off the ScrapyExercisesPipeline in your settings

Related

scrapy-playwright returning nothing but an error

I'm learning Scrapy-playwright and it's fighting me. I'm attempting to gather store locations from a site using the CrawlSpider with a rule including a process_request that triggers the request to run through playwright. In my callback def I can print a value found on the page, but not return or yield anything. I've attempted to cache the data into an item, and return/yield a dict, all of which produces the error.
ERROR: Spider must return request, item, or None, got 'Deferred'
I'm stumped.
import re
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from banners.items import StoreItem
from scrapy_playwright.page import PageCoroutine
from scrapy.http.response import Response
def set_playwright_true(request, response):
request.meta["playwright"] = True
request.meta["playwright_include_page"] = True
request.meta["playwright_page_coroutines"] = ('wait_for_selector', 'span.store-name-city')
return request
class StoreSpider(CrawlSpider):
name = "retailer"
allowed_domains = ['retailer.com']
start_urls = ['https://www.retailer.com/store/0000-city-ak']
custom_settings = {
'ROBOTSTXT_OBEY': True ,
#'DOWNLOAD_DELAY': .5 ,
#'CONCURRENT_REQUESTS_PER_DOMAIN': 3 ,
'DOWNLOAD_HANDLERS': {
"http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler" ,
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler" ,
} ,
'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor" ,
}
rules = (
Rule(LinkExtractor(allow=('directory/ak/anchorage'))),
Rule(LinkExtractor(allow=(r'store/[0-9]+'), deny=(r'store/[0-9]+.+/.+')), callback='parse_item', follow=False, process_request=set_playwright_true),
)
async def parse_item(self, response):
items = []
item = StoreItem()
self.logger.info('***** Start processing ' + response.url + '. *****')
Name = response.css('meta[itemprop=alternateName]').attrib['content'] + ' - ' + response.css('span.store-name-city::text').get()
print(Name)
item['Name'] = Name
item['StoreID'] = response.css('meta[itemprop=storeID]').attrib['content']
item['Address1'] = response.css('span.store-address-line-1::text').get()
item['City'] = response.css('span.store-address-city::text').get()
item['State'] = response.css('span.store-address-state::text').get()
item['Zip'] = response.css('span.store-address-postal::text').get()
item['Phone'] = response.css('div.store-phone::text').get()
item['Latitude'] = response.css('meta[itemprop=latitude]').attrib['content']
item['Longitude'] = response.css('meta[itemprop=longitude]').attrib['content']
items.append(item)
return(items)
Changing parse_item from an async def to a plain def resolved the issue.
async def parse_item(self, response):
changed to
def parse_item(self, response):

CsvItemExporter for multiple files in custom item pipeline not exporting all items

I have created an item pipeline as an answer to this question.
It is supposed to create a new file for every page according to the page_no value set in the item. This works mostly fine.
The problem is with the last csv file generated by the pipeline/item exporter, page-10.csv.
The last 10 values are not exported, so the file stays empty.
What could be the reason for this behaviour?
pipelines.py
from scrapy.exporters import CsvItemExporter
class PerFilenameExportPipeline:
"""Distribute items across multiple CSV files according to their 'page_no' field"""
def open_spider(self, spider):
self.filename_to_exporter = {}
def spider_closed(self, spider):
for exporter in self.filename_to_exporter.values():
exporter.finish_exporting()
def _exporter_for_item(self, item):
filename = 'page-' + str(item['page_no'])
del item['page_no']
if filename not in self.filename_to_exporter:
f = open(f'{filename}.csv', 'wb')
exporter = CsvItemExporter(f, export_empty_fields=True)
exporter.start_exporting()
self.filename_to_exporter[filename] = exporter
return self.filename_to_exporter[filename]
def process_item(self, item, spider):
exporter = self._exporter_for_item(item)
exporter.export_item(item)
return item
spider
import scrapy
from ..pipelines import PerFilenameExportPipeline
class spidey(scrapy.Spider):
name = "idk"
custom_settings = {
'ITEM_PIPELINES': {
PerFilenameExportPipeline: 100
}
}
def start_requests(self):
yield scrapy.Request("http://quotes.toscrape.com/", cb_kwargs={'page_no': 1})
def parse(self, response, page_no):
for qts in response.xpath("//*[#class=\"quote\"]"):
yield {
'page_no': page_no,
'author' : qts.xpath("./span[2]/small/text()").get(),
'quote' : qts.xpath("./*[#class=\"text\"]/text()").get()
}
next_pg = response.xpath('//li[#class="next"]/a/#href').get()
if next_pg is not None:
yield response.follow(next_pg, cb_kwargs={'page_no': page_no + 1})
I know, 2 years later, but still - it might turn out helpful for someone.
It looks like you're never closing the file you're writing to (as you're using inline open). Please compare your code to the one in Scrapy's docs (the "Using Item Exporters" section): https://docs.scrapy.org/en/latest/topics/exporters.html
Besides, the method should now be called "close_spider", not "spider_closed"
Changing your code to the following should help:
from scrapy.exporters import CsvItemExporter
class PerFilenameExportPipeline:
def open_spider(self, spider):
self.filename_to_exporter = {}
def close_spider(self, spider):
#iterating over exporter-file tuples instead of only exporters
for exporter, csv_file in self.filename_to_exporter.values():
exporter.finish_exporting()
#closing the file
csv_file.close()
def _exporter_for_item(self, item):
filename = 'page-' + str(item['page_no'])
del item['page_no']
if filename not in self.filename_to_exporter:
csv_file = open(f'{filename}.csv', 'wb')
exporter = CsvItemExporter(f, export_empty_fields=True)
exporter.start_exporting()
#adding both exporter & file to later be closed as the dict's value
self.filename_to_exporter[filename] = (exporter, csv_file)
#picking only the exporter via [0]
return self.filename_to_exporter[filename][0]
def process_item(self, item, spider):
exporter = self._exporter_for_item(item)
exporter.export_item(item)
return item

Following urls in javascript - Scrapy Splash

I am extremely new to web scraping. I manage to extract information from static websites but am now trying my hand following urls and extracting data (which ofcourse involves some javascript). I have installed scrapy-splash for the same which is running perfectly fine.
The website I am trying to scrape is https://www.ta.com/portfolio/investments/ari-network-services-inc and the button to the top right side takes you to the next page (which is javascript, hence splash). I want to scrape some basic data (like company name, sectors etc) on all the pages till the last one. This is what I have done so far and I need help to correct this to successfully execute.
import scrapy
from scrapy_splash import SplashRequest
import urllib.parse as urlparse
class TAFolio(scrapy.Spider):
name = 'Portfolio'
start_urls = ['https://www.ta.com/portfolio/investments/ari-network-services-inc']
def start_requests(self):
for url in self.start_urls:
yield SplashRequest(url=url, callback = self.parse, args={"wait" : 3})
def parse(self, response):
companyname = response.css('h1.item_detail-main-info-heading::text').extract_first()
sectors = response.css('.item_detail-main-info-group-item::text')[0].extract()
investmentyear = response.css('.item_detail-main-info-group-item::text')[1].extract()
status = response.css('.item_detail-main-info-group-item::text')[2].extract()
location = response.css('.item_detail-main-info-group-item::text')[3].extract()
region = response.css('.item_detail-main-info-group-item::text')[4].extract()
team = response.css('div.item_detail-main-info-group a::text').extract()
yield {
'companyname': companyname,
'sectors': sectors,
'investmentyear': investmentyear,
'status': status,
'location': location,
'region': region,
'team': team
}
next_page = response.css('li.item_detail-nav-item--next a::attr(href)').extract()
if next_page is not None:
yield SplashRequest(urlparse.urljoin('https://www.ta.com',next_page),callback=self.parse, args={"wait":3})
This gives me the correct information for the start_url but doesn't proceed to the next page.
Update. The issue was in the order in which I had the scraping of websites. Below is the updated code which worked well.
import scrapy
from scrapy_splash import SplashRequest
import urllib.parse as urlparse
class TAFolio(scrapy.Spider):
name = 'Portfolio'
start_urls = [
'https://www.ta.com/portfolio/business-services',
'https://www.ta.com/portfolio/consumer',
'https://www.ta.com/portfolio/financial-services',
'https://www.ta.com/portfolio/healthcare',
'https://www.ta.com/portfolio/technology'
]
def start_requests(self):
for url in self.start_urls:
yield SplashRequest(url=url, callback = self.parse, args={"wait" : 3})
def parse(self, response):
companylink = response.css('div.tiles.js-portfolio-tiles a::attr(href)').extract()
for i in companylink:
yield response.follow('https://www.ta.com' + str(i), callback=self.parse1)
def parse1(self, response):
companyname = response.css('h1.item_detail-main-info-heading::text').extract_first()
sectors = response.css('.item_detail-main-info-group-item::text')[0].extract()
investmentyear = response.css('.item_detail-main-info-group-item::text')[1].extract()
status = response.css('.item_detail-main-info-group-item::text')[2].extract()
location = response.css('.item_detail-main-info-group-item::text')[3].extract()
region = response.css('.item_detail-main-info-group-item::text')[4].extract()
team = response.css('div.item_detail-main-info-group a::text').extract()
about_company = response.css('h2.item_detail-main-content-heading::text').extract()
about_company_detail = response.css('div.markdown p::text').extract()
yield {
'companyname': companyname,
'sectors': sectors,
'investmentyear': investmentyear,
'status': status,
'location': location,
'region': region,
'team': team,
'about_company': about_company,
'about_company_detail' : about_company_detail
}

Broad Scrapy Crawl: sgmlLinkextractor rule does not work

I've spent a lot of time playing around and using google but I could not solve my problem. I am new to Scrapy and I hope you can help me.
Part of the spider that works: I define my start_requests urls out of a MySQL Datbase. With the 'parse_item' statement I write the response into seperate files. Both of these steps work fine.
My Problem: Additionally I want to follow every url which contains '.ch' and - as I do for the start_requests - send them to the 'parse_item' method. I therefore defined a rule with a sgmlLinkExtractor and the 'parse_item' method as the callback. This does not work. After completion, I only have the files for the urls defined in 'start_requests'. I don't get any error messages.
Here is my code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import mysql.connector
from scrapy.http import Request
class FirstSpider(CrawlSpider):
name = 'firstspider'
start_urls = []
def start_requests(self):
conn = mysql.connector.connect(user='root', password = 'root', host= 'localhost', database = 'Eistee')
cursor = conn.cursor()
query = ("SELECT Domain, CompanyName FROM Crawlbydomain LIMIT 300, 100")
cursor.execute(query)
results = cursor.fetchall()
for result in results:
urlrequest = 'http://'+result[0]
yield Request(urlrequest, callback = self.parse_item )
rules = (Rule (SgmlLinkExtractor(allow=('.ch', )), callback='parse_item', follow= True),)
def parse_item(self, response):
filename = response.url.translate(None, './')
open(filename, 'wb').write(response.body)
Can you help me?
To make CrawlSpider do its "magic" you need the requests to go through CrawlSpider's parse() callback.
So in start_requests() your Requests must use callback=self.parse (or not set the callback argument)
If you also want the start Requests to go through parse_item you need to set a parse_start_url attribute in your spider set to parse_item.
So you need to have something like:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import mysql.connector
from scrapy.http import Request
class FirstSpider(CrawlSpider):
name = 'firstspider'
def start_requests(self):
conn = mysql.connector.connect(user='root', password = 'root', host= 'localhost', database = 'Eistee')
cursor = conn.cursor()
query = ("SELECT Domain, CompanyName FROM Crawlbydomain LIMIT 300, 100")
cursor.execute(query)
results = cursor.fetchall()
for result in results:
urlrequest = 'http://'+result[0]
yield Request(urlrequest)
rules = (Rule (SgmlLinkExtractor(allow=('.ch', )), callback='parse_item', follow= True),)
def parse_item(self, response):
filename = response.url.translate(None, './')
open(filename, 'wb').write(response.body)
parse_start_url = parse_item

Scrape multiple URLs with Scrapy

How can I scrape multiple URLs with Scrapy?
Am I forced to make multiple crawlers?
class TravelSpider(BaseSpider):
name = "speedy"
allowed_domains = ["example.com"]
start_urls = ["http://example.com/category/top/page-%d/" % i for i in xrange(4),"http://example.com/superurl/top/page-%d/" % i for i in xrange(55)]
def parse(self, response):
hxs = HtmlXPathSelector(response)
items = []
item = TravelItem()
item['url'] = hxs.select('//a[#class="out"]/#href').extract()
out = "\n".join(str(e) for e in item['url']);
print out
Python says:
NameError: name 'i' is not defined
But when I use one URL it works fine!
start_urls = ["http://example.com/category/top/page-%d/" % i for i in xrange(4)"]
Your python syntax is incorrect, try:
start_urls = ["http://example.com/category/top/page-%d/" % i for i in xrange(4)] + \
["http://example.com/superurl/top/page-%d/" % i for i in xrange(55)]
If you need to write code to generate start requests, you can define a start_requests() method instead of using start_urls.
You can initialize start_urls in __init__.py method:
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
class TravelItem(Item):
url = Field()
class TravelSpider(BaseSpider):
def __init__(self, name=None, **kwargs):
self.start_urls = []
self.start_urls.extend(["http://example.com/category/top/page-%d/" % i for i in xrange(4)])
self.start_urls.extend(["http://example.com/superurl/top/page-%d/" % i for i in xrange(55)])
super(TravelSpider, self).__init__(name, **kwargs)
name = "speedy"
allowed_domains = ["example.com"]
def parse(self, response):
hxs = HtmlXPathSelector(response)
items = []
item = TravelItem()
item['url'] = hxs.select('//a[#class="out"]/#href').extract()
out = "\n".join(str(e) for e in item['url']);
print out
Hope that helps.
There are only four ranges in Python: LEGB, because the local scope of the class definition and the local extent of the list derivation are not nested functions, so they do not form the Enclosing scope.Therefore, they are two separate local scopes that cannot be accessed from each other.
so, don't use 'for' and class variables at the same time