scrapy-playwright returning nothing but an error - scrapy

I'm learning scrapy-playwright and it's fighting me. I'm attempting to gather store locations from a site using a CrawlSpider with a rule whose process_request flags the request to run through Playwright. In my callback I can print a value found on the page, but I cannot return or yield anything. I've tried caching the data into an item and returning/yielding a dict, all of which produces the error below.
ERROR: Spider must return request, item, or None, got 'Deferred'
I'm stumped.
import re
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from banners.items import StoreItem
from scrapy_playwright.page import PageCoroutine
from scrapy.http.response import Response


def set_playwright_true(request, response):
    request.meta["playwright"] = True
    request.meta["playwright_include_page"] = True
    request.meta["playwright_page_coroutines"] = ('wait_for_selector', 'span.store-name-city')
    return request


class StoreSpider(CrawlSpider):
    name = "retailer"
    allowed_domains = ['retailer.com']
    start_urls = ['https://www.retailer.com/store/0000-city-ak']
    custom_settings = {
        'ROBOTSTXT_OBEY': True,
        #'DOWNLOAD_DELAY': .5,
        #'CONCURRENT_REQUESTS_PER_DOMAIN': 3,
        'DOWNLOAD_HANDLERS': {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    }

    rules = (
        Rule(LinkExtractor(allow=('directory/ak/anchorage'))),
        Rule(LinkExtractor(allow=(r'store/[0-9]+'), deny=(r'store/[0-9]+.+/.+')),
             callback='parse_item', follow=False, process_request=set_playwright_true),
    )

    async def parse_item(self, response):
        items = []
        item = StoreItem()
        self.logger.info('***** Start processing ' + response.url + '. *****')
        Name = response.css('meta[itemprop=alternateName]').attrib['content'] + ' - ' + response.css('span.store-name-city::text').get()
        print(Name)
        item['Name'] = Name
        item['StoreID'] = response.css('meta[itemprop=storeID]').attrib['content']
        item['Address1'] = response.css('span.store-address-line-1::text').get()
        item['City'] = response.css('span.store-address-city::text').get()
        item['State'] = response.css('span.store-address-state::text').get()
        item['Zip'] = response.css('span.store-address-postal::text').get()
        item['Phone'] = response.css('div.store-phone::text').get()
        item['Latitude'] = response.css('meta[itemprop=latitude]').attrib['content']
        item['Longitude'] = response.css('meta[itemprop=longitude]').attrib['content']
        items.append(item)
        return items

Changing parse_item from an async def to a plain def resolved the issue.
async def parse_item(self, response):
changed to
def parse_item(self, response):
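As a side note (unrelated to the Deferred error): scrapy-playwright expects the playwright_page_coroutines meta key to hold PageCoroutine objects rather than a bare tuple, which is presumably why PageCoroutine is imported above but never used. A minimal sketch, assuming the same selector:

def set_playwright_true(request, response):
    request.meta["playwright"] = True
    request.meta["playwright_include_page"] = True
    # wait until the store name is rendered before the response is returned
    request.meta["playwright_page_coroutines"] = [
        PageCoroutine("wait_for_selector", "span.store-name-city"),
    ]
    return request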


No adapter found for objects of type: 'itemadapter.adapter.ItemAdapter'

I want to change the names of images downloaded from a webpage. I want to use the standard names given by the website rather than cleaning them out of the request URL.
I have the following pipelines.py:
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline


class ScrapyExercisesPipeline:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        return adapter


class DownfilesPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, item=None):
        adapter = ScrapyExercisesPipeline().process_item()[0]
        image_name: str = f'{adapter}.jpg'
        return image_name
This produces the following error:
raise TypeError(f"No adapter found for objects of type: {type(item)} ({item})")
TypeError: No adapter found for objects of type: <class 'itemadapter.adapter.ItemAdapter'> (<ItemAdapter for ScrapyExercisesItem(name='unknown267', images=['https://bl-web-assets.britishland.com/live/meadowhall/s3fs-public/styles/retailer_thumbnail/public/retailer/boots_1.jpg?qQ.NHRs04tdmGxoyZKerRHcrhCImB3JH&itok=PD5LxLmS&cb=1657061667-curtime&v=1657061667-curtime'])>)
scraper.py:
import scrapy
from scrapy_exercises.items import ScrapyExercisesItem


class TestSpider(scrapy.Spider):
    name = 'test'
    #allowed_domains = ['x']
    start_urls = ['https://www.meadowhall.co.uk/eatdrinkshop?page=1']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                cb_kwargs={'pg': 0}
            )

    def parse(self, response, pg):
        pg = 0
        content_page = response.xpath("//div[@class='view-content']//div")
        for cnt in content_page:
            image_url = cnt.xpath(".//img//@src").get()
            image_name = cnt.xpath(".//img//@alt").get()
            if image_url is not None:
                pg += 1
                items = ScrapyExercisesItem()
                if image_name == '':
                    items['name'] = 'unknown' + f'{pg}'
                    items['images'] = [image_url]
                    yield items
                else:
                    items['name'] = image_name
                    items['images'] = [image_url]
                    yield items
settings.py
ITEM_PIPELINES = {
    #'scrapy.pipelines.images.ImagesPipeline': 1,
    'scrapy_exercises.pipelines.ScrapyExercisesPipeline': 45,
    'scrapy_exercises.pipelines.DownfilesPipeline': 55
}

from pathlib import Path
import os

BASE_DIR = Path(__file__).resolve().parent.parent
IMAGES_STORE = os.path.join(BASE_DIR, 'images')
IMAGES_URLS_FIELD = 'images'
IMAGES_RESULT_FIELD = 'results'
You are calling a pipeline from within your pipeline, while that pipeline is also registered in your settings to run as a pipeline. It would be simpler to just extract the name field from the item inside your DownfilesPipeline and return it.
Change your pipelines.py file to:
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline


class DownfilesPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, item=None):
        return item['name'] + '.jpg'
You also need to turn off the ScrapyExercisesPipeline in your settings.
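With that change, ITEM_PIPELINES only needs the images pipeline; a minimal sketch, assuming the same module path as above:

ITEM_PIPELINES = {
    # ScrapyExercisesPipeline removed; DownfilesPipeline handles naming and downloading
    'scrapy_exercises.pipelines.DownfilesPipeline': 55,
}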

Scraper not grabbing all text in list/div

My scraper seems to skip information. For example, I want to extract all the countries and the highway codes belonging to each country, but when I iterate I only get one country and one highway code for each list of numbered highways.
How do I get all the highway routes and their respective countries?
Here's the scraper that I am working with:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst, MapCompose
from itemloaders import ItemLoader


class roadItem(scrapy.Item):
    highway_names = Field(output_processor=TakeFirst())
    country = Field(output_processor=TakeFirst())
    route_name = Field(output_processor=TakeFirst())


class roadSpider(scrapy.Spider):
    name = "road"
    start_urls = ["https://en.wikipedia.org/w/index.php?title=Category:Lists_of_roads_sharing_the_same_title&pageuntil=092%0AList+of+highways+numbered+92#mw-pages"]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse
            )

    def parse(self, response):
        list_data = response.xpath('(//div[@class="mw-category-group"][last()])[2]//ul')
        for items in list_data:
            for link in items.xpath(".//a/@href").getall():
                yield response.follow(
                    response.urljoin(link),
                    callback=self.parse_roads
                )
        next_page = response.xpath("//div[@id='mw-pages']/a[1]//@href").get()
        if next_page is not None:
            yield response.follow(
                response.urljoin(next_page),
                callback=self.parse
            )

    def parse_roads(self, response):
        loader = ItemLoader(roadItem())
        loader.add_value("highway_names", response.xpath("//h1[@id='firstHeading']//text()").get())
        data = response.xpath("//div[@id='mw-content-text']")
        for list_h2 in data:
            if list_h2.xpath("(((//h2)[position() >1]//span)[position() mod 4=1])[position() < last()]//text()").get():
                loader.add_value("country", list_h2.xpath("(((//h2)[position() >1]//span)[position() mod 4=1])[position() < last()]//text()").get())
            else:
                loader.add_value("country", list_h2.xpath("(//h2//span)[1]//text()").get())
            if list_h2.xpath("(//div[@class='mw-parser-output']//ul)[position() > 1 and position() < last()]//li//@title").getall():
                for routes in list_h2.xpath("(//div[@class='mw-parser-output']//ul)[position() > 1 and position() < last()]//li//@title").getall():
                    loader.add_value('route_name', routes)
            else:
                for routess in list_h2.xpath("//ul//li//@title").getall():
                    loader.add_value('route_name', routess)
        yield loader.load_item()
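A hedged observation on the item definition above: TakeFirst() keeps only the first value added to a field, so even though the loader collects every country and route title, load_item() returns a single value per field, which matches the behaviour described. A minimal sketch that keeps the collected values as lists by dropping the output processor on those fields:

class roadItem(scrapy.Item):
    highway_names = Field(output_processor=TakeFirst())
    country = Field()      # keep every country value added by the loader
    route_name = Field()   # keep every route title added by the loader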

scrapy pagination not working on tripadvisor

I'm trying to scrape the restaurant pages on TripAdvisor (just to learn how it works).
However, I only get the first page.
What am I missing?
Here is the code, thanks!
import scrapy


class TripadvSpider(scrapy.Spider):
    name = 'tripadv'
    allowed_domains = ['tripadvisor.com']
    start_urls = ['https://www.tripadvisor.com/Restaurants-g60795-oa0-Philadelphia_Pennsylvania.html#EATERY_LIST_CONTENTS']

    def parse(self, response):
        for stores in response.css('div.emrzT'):
            yield {
                'name': stores.css('a.bHGqj::text').extract(),
                'link': stores.css('a.bHGqj').xpath("@href").extract()
            }
        next_page = ('http://tripadvisor.com' + response.css('a.nav').attrib['href']).extract()
        ##next_page = response.xpath('//a[contains(text(), "Next")]/@href').extract()
        #next_page = ('http://tripadvisor.com' + response.css('a:contains("Next")').attrib['href'].extract())
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
@djmystica, now it's working fine:
import scrapy


class TripadvSpider(scrapy.Spider):
    name = 'tripadv'
    allowed_domains = ['tripadvisor.com']
    start_urls = [
        'https://www.tripadvisor.com/Restaurants-g60795-oa0-Philadelphia_Pennsylvania.html#EATERY_LIST_CONTENTS']

    def parse(self, response):
        for stores in response.css('div.emrzT'):
            yield {
                'name': stores.css('a.bHGqj::text').extract_first(),
                'link': stores.css('a.bHGqj').xpath("@href").extract_first()}
        #next_page = ('http://tripadvisor.com' + response.css('a.nav').attrib['href']).extract()
        next_page = response.xpath('//a[contains(text(), "Next")]/@href').extract_first()
        #next_page = ('http://tripadvisor.com' + response.css('a:contains("Next")').attrib['href'].extract())
        if next_page is not None:
            abs_next_page = f'https://www.tripadvisor.com{next_page}'
            yield response.follow(abs_next_page, callback=self.parse)
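Since response.follow() resolves relative URLs against response.url, the manual join is not strictly required; a minimal sketch of the same pagination step:

next_page = response.xpath('//a[contains(text(), "Next")]/@href').extract_first()
if next_page is not None:
    # response.follow turns the relative href into an absolute request URL
    yield response.follow(next_page, callback=self.parse)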

Following urls in javascript - Scrapy Splash

I am extremely new to web scraping. I managed to extract information from static websites, but now I am trying my hand at following URLs and extracting data (which of course involves some JavaScript). I have installed scrapy-splash for this, and it is running perfectly fine.
The website I am trying to scrape is https://www.ta.com/portfolio/investments/ari-network-services-inc and the button at the top right takes you to the next page (which is JavaScript, hence Splash). I want to scrape some basic data (like company name, sectors, etc.) on all the pages up to the last one. This is what I have done so far, and I need help correcting it so it executes successfully.
import scrapy
from scrapy_splash import SplashRequest
import urllib.parse as urlparse


class TAFolio(scrapy.Spider):
    name = 'Portfolio'
    start_urls = ['https://www.ta.com/portfolio/investments/ari-network-services-inc']

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, args={"wait": 3})

    def parse(self, response):
        companyname = response.css('h1.item_detail-main-info-heading::text').extract_first()
        sectors = response.css('.item_detail-main-info-group-item::text')[0].extract()
        investmentyear = response.css('.item_detail-main-info-group-item::text')[1].extract()
        status = response.css('.item_detail-main-info-group-item::text')[2].extract()
        location = response.css('.item_detail-main-info-group-item::text')[3].extract()
        region = response.css('.item_detail-main-info-group-item::text')[4].extract()
        team = response.css('div.item_detail-main-info-group a::text').extract()
        yield {
            'companyname': companyname,
            'sectors': sectors,
            'investmentyear': investmentyear,
            'status': status,
            'location': location,
            'region': region,
            'team': team
        }
        next_page = response.css('li.item_detail-nav-item--next a::attr(href)').extract()
        if next_page is not None:
            yield SplashRequest(urlparse.urljoin('https://www.ta.com', next_page), callback=self.parse, args={"wait": 3})
This gives me the correct information for the start_url but doesn't proceed to the next page.
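A hedged aside on the snippet above: .extract() returns a list, so next_page ends up being a list (and an empty list is not None, so the check always passes); .extract_first() or .get() returns a single href or None:

next_page = response.css('li.item_detail-nav-item--next a::attr(href)').extract_first()
if next_page is not None:
    # urljoin now receives a string instead of a list
    yield SplashRequest(urlparse.urljoin('https://www.ta.com', next_page),
                        callback=self.parse, args={"wait": 3})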
Update: the issue was the order in which I was scraping the pages. Below is the updated code, which worked well.
import scrapy
from scrapy_splash import SplashRequest
import urllib.parse as urlparse


class TAFolio(scrapy.Spider):
    name = 'Portfolio'
    start_urls = [
        'https://www.ta.com/portfolio/business-services',
        'https://www.ta.com/portfolio/consumer',
        'https://www.ta.com/portfolio/financial-services',
        'https://www.ta.com/portfolio/healthcare',
        'https://www.ta.com/portfolio/technology'
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, args={"wait": 3})

    def parse(self, response):
        companylink = response.css('div.tiles.js-portfolio-tiles a::attr(href)').extract()
        for i in companylink:
            yield response.follow('https://www.ta.com' + str(i), callback=self.parse1)

    def parse1(self, response):
        companyname = response.css('h1.item_detail-main-info-heading::text').extract_first()
        sectors = response.css('.item_detail-main-info-group-item::text')[0].extract()
        investmentyear = response.css('.item_detail-main-info-group-item::text')[1].extract()
        status = response.css('.item_detail-main-info-group-item::text')[2].extract()
        location = response.css('.item_detail-main-info-group-item::text')[3].extract()
        region = response.css('.item_detail-main-info-group-item::text')[4].extract()
        team = response.css('div.item_detail-main-info-group a::text').extract()
        about_company = response.css('h2.item_detail-main-content-heading::text').extract()
        about_company_detail = response.css('div.markdown p::text').extract()
        yield {
            'companyname': companyname,
            'sectors': sectors,
            'investmentyear': investmentyear,
            'status': status,
            'location': location,
            'region': region,
            'team': team,
            'about_company': about_company,
            'about_company_detail': about_company_detail
        }

Getting repeated requests from same url with different values

I am trying to crawl some data as a side project, but I am having trouble gathering it. I have been trying for two days without much luck.
First problem:
When I crawl the posts from the main page I get a wrong token.
Second problem:
I have read the Scrapy docs on Request, and also tried this answer on Stack Overflow, to get the phone number, but in vain.
Third problem:
How would I go about implementing the next page (the commented-out code inside gumtree.py)?
Fourth problem:
I am now able to get the phone numbers, but I am getting repeated requests to the same URL with different values (see results below).
I would really appreciate it if anyone could point me in the right direction.
My main goal is to crawl posts that have phone numbers.
I have tried searching Stack Overflow but couldn't find the right post.
Many thanks.
settings.py
BOT_NAME = 'crawler'

SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'

USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"

TELNETCONSOLE_ENABLED = False
gumtree.py [UPDATED]
# -*- coding: utf-8 -*-
import re
import json
import scrapy
from scrapy import Request, Item, Field, Selector


def complete_link(string):
    return string


class MyItem(Item):
    token = Field()
    post_id = Field()
    post_url = Field()
    phone_num = Field()
    phone_url = Field()


class GumtreeSpider(scrapy.Spider):
    name = "gumtree"
    allowed_domains = ["gumtree.com"]
    start_urls = [
        'https://www.gumtree.com/search?search_category=cars',
    ]

    def parse(self, response):
        item = MyItem()
        for href in response.css('a.listing-link::attr(href)').extract():
            domain = 'https://www.gumtree.com' + href
            request = Request(domain, callback=self.parse_post, meta={'domain': domain, 'item': item})
            yield request

        # next_page = response.css('li.pagination-next a::attr("href")').extract_first()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield Request(next_page, callback=self.parse)

    def parse_post(self, response):
        item = response.meta['item']
        item['post_url'] = response.meta['domain']
        post_id = re.match('.*?([0-9]+)$', item['post_url'])
        if post_id:
            item['post_id'] = post_id.group(1)
            token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
            arr_token = re.findall(r'"([^"]*)"', str(token))
            if len(arr_token) == 15:
                item['token'] = arr_token[-2]
            request = Request('https://www.gumtree.com/ajax/account/seller/reveal/number/' + item['post_id'],
                              headers={'X-GUMTREE-TOKEN': item['token']},
                              callback=self.parse_phone, meta={'item': item})
            yield request

    def parse_phone(self, response):
        item = response.meta['item']
        phone = json.loads(response.body_as_unicode())
        item['phone_num'] = phone['data']
        return item
results: [scrapy crawl gumtree -o ..\result.json]
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "01527853397"},
Have you checked that meta['item'] is actually being passed to parse_token()?
I'd do the following:
meta = { 'item': item }
request = Request(response.urljoin(href), meta=meta, callback=self.parse_token)
yield request
I have found the solution.
# -*- coding: utf-8 -*-
import re, json, scrapy
from crawler.items import CrawlerItem
from scrapy import Request, Item, Field, Selector

gumtree = 'https://www.gumtree.com'
getphone = 'https://www.gumtree.com/ajax/account/seller/reveal/number/'


class GumtreeSpider(scrapy.Spider):
    name = "gumtree"
    allowed_domains = ["gumtree.com"]
    start_urls = [
        'https://www.gumtree.com/search?search_category=cars',
    ]

    def parse(self, response):
        item = CrawlerItem()
        pid = []
        arr_url = []
        for href in response.css('a.listing-link::attr(href)').extract():
            if len(href) > 0:
                post_id = u''.join(href).encode('utf-8').strip()
                post_id = re.match('.*?([0-9]+)$', post_id)
                if post_id:
                    pid.append(post_id.group(1))
                    domain = gumtree + href
                    arr_url.append(domain)
        i = 0
        while i < len(arr_url):
            url = u''.join(arr_url[i]).encode('utf-8').strip()
            request = Request(url, callback=self.parse_post,
                              meta={'url': url, 'item': item, 'pid': pid[i]},
                              headers={'Referer': gumtree})
            i += 1
            yield request

        next_page = response.css('li.pagination-next a::attr("href")').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield Request(next_page, callback=self.parse)

    def parse_post(self, response):
        item = response.meta['item']
        item['post_id'] = response.meta['pid']
        item['post_url'] = response.meta['url']
        token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
        arr_token = re.findall(r'"([^"]*)"', str(token))
        if len(arr_token) == 15:
            item['token'] = arr_token[-2]
            ref = item['post_url']
            req = Request(getphone + item['post_id'], callback=self.parse_phone,
                          headers={'X-GUMTREE-TOKEN': item['token'], 'Referer': ref},
                          meta={'url': response.meta['url'], 'item': item})
            return req

    def parse_phone(self, response):
        item = response.meta['item']
        item['post_url'] = response.meta['url']
        phone = json.loads(response.body_as_unicode())
        item['phone_num'] = u''.join(phone['data']).encode('utf-8').strip()
        return item
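A hedged note on both versions above: a single item instance is created in parse() and passed to every request via meta, so all the concurrent callbacks mutate the same object, which is a plausible source of the repeated and overwritten values in the results. A minimal sketch that creates a fresh item per listing instead (assuming the same CrawlerItem fields):

def parse(self, response):
    for href in response.css('a.listing-link::attr(href)').extract():
        item = CrawlerItem()  # one item per listing, not one shared instance
        item['post_url'] = gumtree + href
        yield Request(item['post_url'], callback=self.parse_post,
                      meta={'item': item}, headers={'Referer': gumtree})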