Why can Scrapy not find an XPath that works in my browser? - scrapy

I'm new to Scrapy, and with the code below I can extract the name but not the price.
Any idea what I'm doing wrong with the price? Thank you!
This is the code:
import scrapy

class BfPreciosSpider(scrapy.Spider):
    name = 'BF_precios'
    allowed_domains = ['https://www.boerse-frankfurt.de']
    start_urls = ['https://www.boerse-frankfurt.de/anleihe/xs1186131717-fce-bank-plc-1-134-15-22']

    def parse(self, response):
        what_name = response.xpath('/html/body/app-root/app-wrapper/div/div[2]/app-bond/div[1]/div/app-widget-datasheet-header/div/div/div/div/div[1]/div/h1/text()').extract_first()
        what_price = response.xpath('/html/body/app-root/app-wrapper/div/div[2]/app-bond/div[2]/div[3]/div[1]/font/text()').extract_first()
        yield {'name': what_name, 'price': what_price}
These are the two items in question, the name and the price (highlighted in red in a screenshot, omitted here).

The name information is available directly on the page, but the price information is obtained from an API. If you inspect the network traffic, you will find an API call that returns the price data. Below is an example of how you could obtain it.
import scrapy
from time import time

class RealtorSpider(scrapy.Spider):
    name = 'BF_precios'
    allowed_domains = ['boerse-frankfurt.de']
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36'
    }
    start_urls = ['https://www.boerse-frankfurt.de/anleihe/xs1186131717-fce-bank-plc-1-134-15-22']

    def parse(self, response):
        item = {}
        current_time = int(time())
        name = response.xpath('//h1/text()').get()
        isin = response.xpath("//span[contains(text(),'ISIN:')]/text()").re_first(r"ISIN:\s(.*)$")
        mic = response.xpath('//app-widget-index-price-information/@mic').get()
        api_url = (
            'https://api.boerse-frankfurt.de/v1/tradingview/lightweight/history/single'
            '?resolution=D&isKeepResolutionForLatestWeeksIfPossible=false'
            f'&from={current_time}&to={current_time}&isBidAskPrice=false&symbols={mic}%3A{isin}'
        )
        item['name'] = name
        item['isin'] = isin
        item['mic'] = mic
        yield response.follow(api_url, callback=self.parse_price, cb_kwargs={"item": item})

    def parse_price(self, response, item):
        item['price'] = response.json()[0]['quotes']['timeValuePairs'][0]['value']
        yield item
Running the above spider will yield a dictionary similar to the one below:
{'name': 'FCE Bank PLC 1,134% 15/22', 'isin': 'XS1186131717', 'mic': 'XFRA', 'price': 99.955}
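If you want to sanity-check the API call outside Scrapy first, here is a minimal standalone sketch using requests (this assumes the endpoint behaves the same without Scrapy; the XFRA/XS1186131717 symbol pair is taken from the example output above):

import time
import requests

# Standalone check of the price endpoint used by the spider above.
# The mic/isin pair (XFRA / XS1186131717) comes from the example output.
now = int(time.time())
url = (
    "https://api.boerse-frankfurt.de/v1/tradingview/lightweight/history/single"
    "?resolution=D&isKeepResolutionForLatestWeeksIfPossible=false"
    f"&from={now}&to={now}&isBidAskPrice=false&symbols=XFRA%3AXS1186131717"
)
resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
print(resp.json()[0]["quotes"]["timeValuePairs"][0]["value"])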

Related

How can I get the UPC of a product from Best Buy using Scrapy?

Hi there,
I need to scrape Best Buy. I am currently using Scrapy and was able to get most of the data I need, but I have faced some problems trying to get the specification data section where the UPC is. I was able to get the features, but I am not able to grab that part of the data.
I'd really appreciate your help. This is my code:
from scrapy import Spider
from bestbuy_spider.items import BestbuyProductItem
from scrapy import Request
import re
import json

class Bestbuy2Spider(Spider):
    name = 'bestbuy2'
    # allowed_domains = ['https://www.bestbuy.com']
    allowed_domains = ['bestbuy.com']
    # https://www.bestbuy.com/site/searchpage.jsp?cp=1&searchType=search&browsedCategory=pcmcat209400050001&ks=960&sp=-bestsellingsort%20skuidsaas&sc=Global&list=y&usc=All%20Categories&type=page&id=pcat17071&iht=n&nrp=15&seeAll=&st=categoryid%24pcmcat209400050001&qp=carrier_facet%3DCarrier~Verizon
    # start_urls = ['https://www.bestbuy.com/site/laptop-computers/all-laptops/pcmcat138500050001.c?id=pcmcat138500050001']
    start_urls = ['https://www.bestbuy.com/site/searchpage.jsp?id=pcat17071&qp=storepickupstores_facet%3DStore%20Availability%20-%20In%20Store%20Pickup~237&st=%2A']

    def parse(self, response):
        text = response.xpath('//div[@class="left-side"]/span/text()').extract_first()
        _, items_page, total = tuple(map(lambda x: int(x), re.findall(r'\d+', text)))
        num_pages = total // items_page
        # print('number of pages:', num_pages)
        urls = [
            'https://www.bestbuy.com/site/searchpage.jsp?cp={}&id=pcat17071&qp=storepickupstores_facet%3DStore%20Availability%20-%20In%20Store%20Pickup~237&st=%2A'.format(
                x) for x in range(1, num_pages + 1)]
        for url in urls[:1]:
            # product list page
            yield Request(url=url, callback=self.parse_product_list)

    def parse_product_list(self, response):
        # product list
        rows = response.xpath('//ol[@class="sku-item-list"]/li')
        # print(len(rows))
        # print('=' * 50)
        for row in rows:
            url = row.xpath('.//div[@class="sku-title"]/h4/a/@href').extract_first()
            print(url)
            yield Request(url='https://www.bestbuy.com' + str(url), callback=self.parse_product)

    # '//ul[@class="thumbnail-list"]//@src'
    def parse_product(self, response):
        price_txt = response.xpath('//div[@class="pricing-price__regular-price"]/text()').extract_first()
        # reg_price = price_txt.replace('Was ', '')
        item = BestbuyProductItem(
            product = response.xpath('//div[@class="sku-title"]/h1/text()').extract_first(),
            # color = response.xpath('li[@class="image selected"]/div/a/@title').extract_first(),
            # skuId = response.xpath('//div[@class="sku product-data"]/span[2]/text()').extract_first(),
            # price = response.xpath('//div[@class="priceView-hero-price priceView-customer-price"]/span[1]/text()').extract_first(),
            # model = response.xpath('//div[@class="model product-data"]/span[2]/text()').extract_first(),
            # main_image = response.xpath('//img[@class="primary-image"]/@src').extract_first(),
            # images = response.xpath('//*[@class="thumbnail-list"]//img/@src').extract(),
            # description = response.xpath('//div[@class="long-description-container body-copy "]//div/text()').extract(),
            # features = response.xpath('//div[@class="list-row"]/p/text()').extract(),
            # regular_price = price_txt,
            Location = response.xpath('//div[@class="fulfillment-fulfillment-summary"]//div/p[1]/span/text()').extract()
        )
        yield item
Looking at the code of one product page (https://www.bestbuy.com/site/sony-65-class-bravia-xr-x95j-4k-uhd-smart-google-tv/6459306.p?skuId=6459306), I notice there is a JSON object with the gtin13 field (the UPC code you are looking for). It should be easy to parse with the json module and get what you need; see the sketch after the JSON below.
{
    "@context": "http://schema.org/",
    "@type": "Product",
    "name": "Sony - 65\" class BRAVIA XR X95J 4K UHD Smart Google TV",
    "image": "https://pisces.bbystatic.com/image2/BestBuy_US/images/products/6459/6459306_sd.jpg",
    "url": "https://www.bestbuy.com/site/sony-65-class-bravia-xr-x95j-4k-uhd-smart-google-tv/6459306.p?skuId=6459306",
    "description": "Shop Sony 65\" class BRAVIA XR X95J 4K UHD Smart Google TV at Best Buy. Find low everyday prices and buy online for delivery or in-store pick-up. Price Match Guarantee.",
    "sku": "6459306",
    "gtin13": "0027242921818",
    "model": "XR65X95J",
    "width": {
        "@type": "http://schema.org/QuantitativeValue",
        "unitCode": "INH",
        "value": "56.87"
    },
    "color": "Black",
    "brand": {
        "@type": "Brand",
        "name": "Sony"
    },
    "aggregateRating": {
        "@type": "AggregateRating",
        "ratingValue": "4.7",
        "reviewCount": "221"
    },
    "offers": {
        "@type": "AggregateOffer",
        "priceCurrency": "USD",
        "seller": {
            "@type": "Organization",
            "name": "Best Buy"
        },
        "lowPrice": "1184.99",
        "highPrice": "1499.99",
        "offercount": 5,
        "offers": [
            {
                "@type": "Offer",
                "priceCurrency": "USD",
                "price": "1499.99",
                "availability": "http://schema.org/InStock",
                "itemCondition": "http://schema.org/NewCondition",
                "description": "New"
            },
            {
                "@type": "Offer",
                "priceCurrency": "USD",
                "price": "1319.99",
                "itemCondition": "http://schema.org/UsedCondition",
                "description": "Open-Box Excellent - Certified"
            },
            {
                "@type": "Offer",
                "priceCurrency": "USD",
                "price": "1274.99",
                "itemCondition": "http://schema.org/UsedCondition",
                "description": "Open-Box Excellent"
            },
            {
                "@type": "Offer",
                "priceCurrency": "USD",
                "price": "1229.99",
                "itemCondition": "http://schema.org/UsedCondition",
                "description": "Open-Box Satisfactory"
            },
            {
                "@type": "Offer",
                "priceCurrency": "USD",
                "price": "1184.99",
                "itemCondition": "http://schema.org/UsedCondition",
                "description": "Open-Box Fair"
            }
        ]
    }
}
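A minimal sketch of that parsing step (this assumes the JSON sits in a <script type="application/ld+json"> tag, the usual convention for schema.org data; verify the selector against the live page). It is meant as a drop-in variant of the parse_product method above:

import json

def parse_product(self, response):
    # schema.org data is conventionally embedded in <script type="application/ld+json"> tags.
    for raw in response.xpath('//script[@type="application/ld+json"]/text()').getall():
        data = json.loads(raw)
        # Pick out the Product block, which carries the gtin13 (UPC) field shown above.
        if isinstance(data, dict) and data.get("@type") == "Product":
            yield {
                "sku": data.get("sku"),
                "upc": data.get("gtin13"),
                "model": data.get("model"),
            }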

How can I extract the item id from the response in Scrapy?

import scrapy

class FarmtoolsSpider(scrapy.Spider):
    name = 'farmtools'
    allowed_domains = ['www.donedeal.ie']
    start_urls = ['https://www.donedeal.ie/farmtools/']

    def parse(self, response):
        rows = response.xpath('//ul[@class="card-collection"]/li')
        for row in rows:
            yield {
                'item_id': row.xpath('.//a/@href').get(),
                'item_title': row.xpath('.//div[1]/p[@class="card__body-title"]/text()').get(),
                'item_county': row.xpath('.//ul[@class="card__body-keyinfo"]/li[2]/text()').get(),
                'item_price': row.xpath('.//p[@class="card__price"]/span[1]/text()').get()
            }
I want to extract the item number from the item_id value, which is a URL.
Is it possible to do this?
The response looks like this:
{'item_id': 'https://www.donedeal.ie/farmtools-for-sale/international-784-tractor/25283884?campaign=3', 'item_title': 'INTERNATIONAL 784 TRACTOR', 'item_county': 'Derry', 'item_price': '3,000'}
I'd appreciate any advice, thanks
Something like this would work. It is not clean, but it splits the string up until you get the id you want.
def parse(self, response):
    rows = response.xpath('//ul[@class="card-collection"]/li')
    for row in rows:
        link = row.xpath('.//a/@href').get()
        link_split = link.split('/')[-1]
        link_id = link_split.split('?')[0]
        yield {
            'item_id': link_id,
            'item_title': row.xpath('.//div[1]/p[@class="card__body-title"]/text()').get(),
            'item_county': row.xpath('.//ul[@class="card__body-keyinfo"]/li[2]/text()').get(),
            'item_price': row.xpath('.//p[@class="card__price"]/span[1]/text()').get()
        }
Update in response to comment
Complete code example
import scrapy

class TestSpider(scrapy.Spider):
    name = 'test'
    allowed_domains = ['donedeal.ie']
    start_urls = ['https://www.donedeal.ie/farmtools/']

    def parse(self, response):
        rows = response.xpath('//ul[@class="card-collection"]/li')
        for row in rows:
            link = row.xpath('.//a/@href').get()
            link_split = link.split('/')[-1]
            link_id = link_split.split('?')[0]
            yield {
                'item_id': link_id,
                'item_title': row.xpath('.//p[@class="card__body-title"]/text()').get(),
                'item_county': row.xpath('.//ul[@class="card__body-keyinfo"]/li[2]/text()').get(),
                'item_price': row.xpath('.//p[@class="card__price"]/span[1]/text()').get()
            }
A note: when looping over each 'card', you don't need to specify the div if you're selecting an element with a unique class like card__body-title.
Please note that yielding a dictionary is only one of the ways of collecting data in Scrapy. Consider using Items and ItemLoaders:
Items: https://docs.scrapy.org/en/latest/topics/items.html
ItemLoaders (with examples): https://docs.scrapy.org/en/latest/topics/loaders.html
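For illustration, a minimal sketch of the same spider using an Item and an ItemLoader (current Scrapy API; the FarmtoolsItem and FarmtoolsLoaderSpider names are made up for this example):

import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst

class FarmtoolsItem(scrapy.Item):
    # Hypothetical Item mirroring the dictionary fields above.
    item_id = scrapy.Field(output_processor=TakeFirst())
    item_title = scrapy.Field(output_processor=TakeFirst())
    item_county = scrapy.Field(output_processor=TakeFirst())
    item_price = scrapy.Field(output_processor=TakeFirst())

class FarmtoolsLoaderSpider(scrapy.Spider):
    name = 'farmtools_loader'
    allowed_domains = ['donedeal.ie']
    start_urls = ['https://www.donedeal.ie/farmtools/']

    def parse(self, response):
        for row in response.xpath('//ul[@class="card-collection"]/li'):
            loader = ItemLoader(item=FarmtoolsItem(), selector=row)
            # The re= argument extracts the numeric id straight from the href.
            loader.add_xpath('item_id', './/a/@href', re=r'/(\d+)\?')
            loader.add_xpath('item_title', './/p[@class="card__body-title"]/text()')
            loader.add_xpath('item_county', './/ul[@class="card__body-keyinfo"]/li[2]/text()')
            loader.add_xpath('item_price', './/p[@class="card__price"]/span[1]/text()')
            yield loader.load_item()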
A cleaner alternative would be to use a regex. You can even apply it through Scrapy selectors (see https://docs.scrapy.org/en/latest/topics/selectors.html):
'item_id': row.xpath('.//a/@href').re_first(r'/(\d+)\?campaign')
In the snippet above, the regex will return a string with only the digits between / and ?campaign.
In this particular URL https://www.donedeal.ie/farmtools-for-sale/international-784-tractor/25283884?campaign=3 it would return '25283884'
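For reference, the same pattern in plain Python, handy for experimenting outside a spider:

import re

# The example URL from the question.
url = "https://www.donedeal.ie/farmtools-for-sale/international-784-tractor/25283884?campaign=3"
match = re.search(r'/(\d+)\?campaign', url)
print(match.group(1))  # -> '25283884'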

What are the correct tags and properties to select?

I want to crawl a website (http://theschoolofkyiv.org/participants/220/dan-acostioaei) to extract only the artist's name and biography. When I define the tags and properties, the result comes out without any of the text I want to see.
I am using Scrapy to crawl the site, and it works fine for other websites. I have tested my code, but it seems I cannot define the correct tags or properties. Can you please have a look at it?
This is the code that I used to crawl the website:
import scrapy
from scrapy.selector import Selector
from artistlist.items import ArtistlistItem

class ArtistlistSpider(scrapy.Spider):
    name = "artistlist"
    allowed_domains = ["theschoolofkyiv.org"]
    start_urls = ['http://theschoolofkyiv.org/participants/220/dan-acostioaei']

    def parse(self, response):
        titles = response.xpath("//div[@id='participants']")
        for titles in titles:
            item = ArtistlistItem()
            item['artist'] = response.css('.ng-binding::text').extract()
            item['biography'] = response.css('p::text').extract()
            yield item
This is the output that I get:
{'artist': [],
'biography': ['\n ',
'\n ',
'\n ',
'\n ',
'\n ',
'\n ']}
Simple illustration (assuming you already know about the AJAX request mentioned by Tony Montana):
import scrapy
import re
import json
from artistlist.items import ArtistlistItem

class ArtistlistSpider(scrapy.Spider):
    name = "artistlist"
    allowed_domains = ["theschoolofkyiv.org"]
    start_urls = ['http://theschoolofkyiv.org/participants/220/dan-acostioaei']

    def parse(self, response):
        # Extract the numeric participant id from the page URL, guarding against no match.
        match = re.search(r'participants/(\d+)', response.url)
        if match:
            participant_id = match.group(1)
            yield scrapy.Request(
                url="http://theschoolofkyiv.org/wordpress/wp-json/posts/{participant_id}".format(participant_id=participant_id),
                callback=self.parse_participant,
            )

    def parse_participant(self, response):
        data = json.loads(response.body)
        item = ArtistlistItem()
        item['artist'] = data["title"]
        item['biography'] = data["acf"]["en_participant_bio"]
        yield item
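Before wiring in the second request, you can confirm what the endpoint returns with a quick standalone check (a sketch using requests; the participant id 220 comes from the start URL):

import requests

# Standalone check of the WordPress JSON endpoint used in the spider above.
resp = requests.get("http://theschoolofkyiv.org/wordpress/wp-json/posts/220")
data = resp.json()
print(data["title"])                      # artist name
print(data["acf"]["en_participant_bio"])  # biography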

Scrapy - Copying only the xpath into .csv file

I have many other scripts with similar basic code that work, but when I run this spider from the command line and open the .csv file to look at the saved titles, I get the XPath selector itself copied into Excel. Any idea why?
import scrapy

class MovieSpider(scrapy.Spider):
    name = 'movie'
    allowed_domains = ['https://www.imdb.com/search/title?start=1']
    start_urls = ['https://www.imdb.com/search/title?start=1/']

    def parse(self, response):
        titles = response.xpath('//*[@id="main"]/div/div/div[3]/div[1]/div[3]/h3/a')
        pass
        print(titles)
        for title in titles:
            yield {'Title': title}
--- Try two below: ---
for subject in titles:
    yield {
        'Title': subject.xpath('.//h3[@class="lister-item-header"]/a/text()').extract_first(),
        'Runtime': subject.xpath('.//p[@class="text-muted"]/span/text()').extract_first(),
        'Description': subject.xpath('.//p[@class="text-muted"]/p/text()').extract_first(),
        'Director': subject.xpath('.//*[@id="main"]/a/text()').extract_first(),
        'Rating': subject.xpath('.//div[@class="inline-block ratings-imdb-rating"]/strong/text()').extract_first()
    }
Use extract() or extract_first() to get text out of the selectors, and use shorter, more robust XPath expressions:
import scrapy

class MovieSpider(scrapy.Spider):
    name = 'movie'
    allowed_domains = ['imdb.com']
    start_urls = ['https://www.imdb.com/search/title?start=1/']

    def parse(self, response):
        subjects = response.xpath('//div[@class="lister-item mode-advanced"]')
        for subject in subjects:
            yield {
                'Title': subject.xpath('.//h3[@class="lister-item-header"]/a/text()').extract_first(),
                'Rating': subject.xpath('.//div[@class="inline-block ratings-imdb-rating"]/strong/text()').extract_first(),
                'Runtime': subject.xpath('.//span[@class="runtime"]/text()').extract_first(),
                'Description': subject.xpath('.//p[@class="text-muted"]/text()').extract_first(),
                'Director': subject.xpath('.//p[contains(text(), "Director")]/a[1]/text()').extract_first(),
            }
Output: each yielded item is a dictionary with the Title, Rating, Runtime, Description, and Director fields (screenshot omitted).
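To reproduce the CSV export from the question, run the spider with a feed export, e.g. scrapy crawl movie -o movies.csv; each yielded dictionary becomes one CSV row. This is also why the first attempt put XPath text into Excel: it yielded the selector objects themselves, so their repr ended up in the file.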

Correct way to nest Item data in Scrapy

What is the correct way to nest Item data?
For example, I want the output of a product:
{
    'price': price,
    'title': title,
    'meta': {
        'url': url,
        'added_on': added_on
    }
}
I have a scrapy.Item of:
class ProductItem(scrapy.Item):
    url = scrapy.Field(output_processor=TakeFirst())
    price = scrapy.Field(output_processor=TakeFirst())
    title = scrapy.Field(output_processor=TakeFirst())
    added_on = scrapy.Field(output_processor=TakeFirst())
Now, the way I do it is just to reformat the whole item in the pipeline according to a new item template:
class FormatedItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()
    meta = scrapy.Field()
and in the pipeline:
def process_item(self, item, spider):
    formated_item = FormatedItem()
    formated_item['title'] = item['title']
    formated_item['price'] = item['price']
    formated_item['meta'] = {
        'url': item['url'],
        'added_on': item['added_on']
    }
    return formated_item
Is this the correct way to approach this, or is there a more straightforward way that does not break the philosophy of the framework?
UPDATE from the comments: it looks like nested loaders are the updated approach. Another comment suggests this approach will cause errors during serialization.
The best way to approach this is by creating a main and a meta item class/loader.
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader  # scrapy.contrib.loader in old Scrapy versions
from itemloaders.processors import TakeFirst  # scrapy.contrib.loader.processor in old versions

class MetaItem(Item):
    url = Field()
    added_on = Field()

class MainItem(Item):
    price = Field()
    title = Field()
    meta = Field(serializer=MetaItem)

class MainItemLoader(ItemLoader):
    default_item_class = MainItem
    default_output_processor = TakeFirst()

class MetaItemLoader(ItemLoader):
    default_item_class = MetaItem
    default_output_processor = TakeFirst()
Sample usage:
from scrapy.spiders import Spider  # scrapy.spider in old Scrapy versions
from scrapy.selector import Selector
from qwerty.items import MainItemLoader, MetaItemLoader

class DmozSpider(Spider):
    name = "dmoz"
    allowed_domains = ["example.com"]
    start_urls = ["http://example.com"]

    def parse(self, response):
        mainloader = MainItemLoader(selector=Selector(response))
        mainloader.add_value('title', 'test')
        mainloader.add_value('price', 'price')
        mainloader.add_value('meta', self.get_meta(response))
        return mainloader.load_item()

    def get_meta(self, response):
        metaloader = MetaItemLoader(selector=Selector(response))
        metaloader.add_value('url', response.url)
        metaloader.add_value('added_on', 'now')
        return metaloader.load_item()
After that, you can easily expand your items in the future by creating more "sub-items."
I think it would be more straightforward to construct the dictionary in the spider. Below are two different ways of doing it, both achieving the same result. The only possible dealbreaker is that the processors apply to the item['meta'] field, not to the item['meta']['added_on'] and item['meta']['url'] fields.
def parse(self, response):
    item = MyItem()
    # First way: assign the nested dict in one go.
    item['meta'] = {'added_on': response.css("a::text").extract()[0]}
    # Second way: set nested keys individually.
    item['meta']['url'] = response.xpath("//a/@href").extract()[0]
    return item
Is there a specific reason you want to construct it that way instead of unpacking the meta field?