Get extra details from another page for each list item using the Scrapy framework

I have managed to parse the list of advertisements, put some information into AdvertItem, and load the item with AdvertLoader. But I cannot figure out how to get extra information about each advertisement from its detail page, put that additional information into the same AdvertItem object, and then load the item with all of the information using AdvertLoader.
class AdvertLoader(ItemLoader):
    default_input_processor = MapCompose(unicode.strip, remove_tags)
    default_output_processor = Join()


class AdvertSpider(scrapy.Spider):
    name = "adverts"
    start_urls = [
        "http://blablaadverts.com/",
    ]

    adverts_list_xpath = '//table[@class="object-list-table"]/tbody/tr[@class="object-type-apartment"]'
    advert_item_fields = {
        'id': './@id',
        'link': './/td[@class="object-name"]/h2[contains(@class, "object-title")]/a/@href',
        'status': 'normalize-space(.//td[contains(@class, "object-media")]/div/p/a/span[contains(@class, '
                  '"sold-overlay-list")]/span/text())',
        'state': './/td[@class="object-name"]/h2[contains(@class, "object-title")]/a/text()',
        'city': './/td[@class="object-name"]/h2[contains(@class, "object-title")]/a/text()',
        'zone': './/td[@class="object-name"]/h2[contains(@class, "object-title")]/a/text()',
        'address': './/td[@class="object-name"]/h2[contains(@class, "object-title")]/a/text()',
        'rooms': './/td[contains(@class, "object-rooms")]/text()',
        'area': 'normalize-space(.//td[contains(@class, "object-m2")]/text())',
        'price': 'normalize-space(.//td[contains(@class, "object-price")]/p/text())',
    }
    advert_details_xpath = '//table[contains(@class, "object-data-meta")]/tbody/tr'
    advert_item_details_fields = {
        'floor': './/td/text()',
        'built_in_year': './/td/text()',
        'condition': './/td/text()',
        'ownership': './/td/text()',
        'energy_level': './/td/text()',
    }
    contact_name = '//div[contains(@class, "object-article-contact")]/p[@class="fn"]/text()'
    next_page = '//li[contains(@class, "next")]/a/@href'

    def parse(self, response):
        selector = Selector(response)
        for advert in selector.xpath(self.adverts_list_xpath):
            loader = AdvertLoader(AdvertItem(), selector=advert)
            for field, xpath in self.advert_item_fields.iteritems():
                loader.add_xpath(field, xpath)
            # This request is not working as I expect.
            yield scrapy.Request("http://blablaadverts.com/index.htmlnr=55&search_key=ca41231a29d2ab921aed02e864152c0e",
                                 callback=self.parse_page2, meta={'loader': loader})
            yield loader.load_item()
        next_page = response.xpath(self.next_page).extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield Request(next_page, callback=self.parse)

    def parse_page2(self, response):
        selector = Selector(response)
        loader = response.meta['loader']  # type: AdvertLoader
        loader.selector = selector
        loader.add_xpath('contact_name', self.contact_name)
        # yield loader.load_item()
The code above saves only the information about each advertisement from the list page, without the extra details from the item detail page.
parse_page2() works when I run it separately from parse().
How can I collect all of the information and only then load my AdvertItem with the loader?

I am not sure whether I understood you correctly, but change this part of the code

    # This request is not working as I expect.
    yield scrapy.Request("http://blablaadverts.com/index.htmlnr=55&search_key=ca41231a29d2ab921aed02e864152c0e",
                         callback=self.parse_page2, meta={'loader': loader})
    yield loader.load_item()

to

    # Pass the loader along with the request; do not load the item here.
    yield scrapy.Request("http://blablaadverts.com/index.htmlnr=55&search_key=ca41231a29d2ab921aed02e864152c0e",
                         callback=self.parse_page2, meta={'loader': loader})

and then yield the item in parse_page2(), once all the information is available:
    def parse_page2(self, response):
        selector = Selector(response)
        loader = response.meta['loader']  # type: AdvertLoader
        loader.selector = selector
        loader.add_xpath('contact_name', self.contact_name)
        yield loader.load_item()
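Putting the two callbacks together, a minimal sketch of the whole flow could look like the following. It assumes each advert's detail URL is the link value already extracted from the list row (the hard-coded URL in the question looks like a placeholder), and it keeps passing the loader through the request meta:

    def parse(self, response):
        for advert in response.xpath(self.adverts_list_xpath):
            loader = AdvertLoader(AdvertItem(), selector=advert)
            for field, xpath in self.advert_item_fields.items():
                loader.add_xpath(field, xpath)
            # Assumption: the advert's own detail page is the extracted 'link' value.
            detail_url = advert.xpath(self.advert_item_fields['link']).extract_first()
            yield scrapy.Request(response.urljoin(detail_url),
                                 callback=self.parse_page2, meta={'loader': loader})
        next_page = response.xpath(self.next_page).extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_page2(self, response):
        loader = response.meta['loader']
        loader.selector = Selector(response)
        loader.add_xpath('contact_name', self.contact_name)
        # The item is loaded only here, after the detail-page fields have been added.
        yield loader.load_item()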

Related

How can I get the UPC of a product from Best Buy using Scrapy

Hi there. I need to scrape Best Buy and I am currently using Scrapy. I was able to get most of the data I need, but I have had problems getting the specification data section where the UPC is. I was able to get the features, but I am not able to grab that part of the data.
I'd really appreciate your help. This is my code:
from scrapy import Spider
from bestbuy_spider.items import BestbuyProductItem
from scrapy import Request
import re
import json


class Bestbuy2Spider(Spider):
    name = 'bestbuy2'
    # allowed_domains = ['https://www.bestbuy.com']
    allowed_domains = ['bestbuy.com']
    # https://www.bestbuy.com/site/searchpage.jsp?cp=1&searchType=search&browsedCategory=pcmcat209400050001&ks=960&sp=-bestsellingsort%20skuidsaas&sc=Global&list=y&usc=All%20Categories&type=page&id=pcat17071&iht=n&nrp=15&seeAll=&st=categoryid%24pcmcat209400050001&qp=carrier_facet%3DCarrier~Verizon
    # start_urls = ['https://www.bestbuy.com/site/laptop-computers/all-laptops/pcmcat138500050001.c?id=pcmcat138500050001']
    start_urls = ['https://www.bestbuy.com/site/searchpage.jsp?id=pcat17071&qp=storepickupstores_facet%3DStore%20Availability%20-%20In%20Store%20Pickup~237&st=%2A']

    def parse(self, response):
        text = response.xpath('//div[@class="left-side"]/span/text()').extract_first()
        _, items_page, total = tuple(map(lambda x: int(x), re.findall(r'\d+', text)))
        num_pages = total // items_page
        # print('number of pages:', num_pages)
        urls = [
            'https://www.bestbuy.com/site/searchpage.jsp?cp={}&id=pcat17071&qp=storepickupstores_facet%3DStore%20Availability%20-%20In%20Store%20Pickup~237&st=%2A'.format(
                x) for x in range(1, num_pages + 1)]
        for url in urls[:1]:
            # product list page
            yield Request(url=url, callback=self.parse_product_list)

    def parse_product_list(self, response):
        # product list
        rows = response.xpath('//ol[@class="sku-item-list"]/li')
        # print(len(rows))
        # print('=' * 50)
        for row in rows:
            url = row.xpath('.//div[@class="sku-title"]/h4/a/@href').extract_first()
            print(url)
            yield Request(url='https://www.bestbuy.com' + str(url), callback=self.parse_product)

    # '//ul[@class="thumbnail-list"]//@src'
    def parse_product(self, response):
        price_txt = response.xpath('//div[@class="pricing-price__regular-price"]/text()').extract_first()
        # reg_price = price_txt.replace('Was ', '')
        item = BestbuyProductItem(
            product = response.xpath('//div[@class="sku-title"]/h1/text()').extract_first(),
            # color = response.xpath('li[@class="image selected"]/div/a/@title').extract_first(),
            # skuId = response.xpath('//div[@class="sku product-data"]/span[2]/text()').extract_first(),
            # price = response.xpath('//div[@class="priceView-hero-price priceView-customer-price"]/span[1]/text()').extract_first(),
            # model = response.xpath('//div[@class="model product-data"]/span[2]/text()').extract_first(),
            # main_image = response.xpath('//img[@class="primary-image"]/@src').extract_first(),
            # images = response.xpath('//*[@class="thumbnail-list"]//img/@src').extract(),
            # description = response.xpath('//div[@class="long-description-container body-copy "]//div/text()').extract(),
            # features = response.xpath('//div[@class="list-row"]/p/text()').extract(),
            # regular_price = price_txt,
            Location = response.xpath('//div[@class="fulfillment-fulfillment-summary"]//div/p[1]/span/text()').extract()
        )
        yield item
Looking at the source of one product page (https://www.bestbuy.com/site/sony-65-class-bravia-xr-x95j-4k-uhd-smart-google-tv/6459306.p?skuId=6459306), I notice there is a JSON block with a gtin13 field (the UPC code you are looking for). It should be easy to parse it with the json module and get what you need; a sketch follows the JSON below.
{
  "@context": "http://schema.org/",
  "@type": "Product",
  "name": "Sony - 65\" class BRAVIA XR X95J 4K UHD Smart Google TV",
  "image": "https://pisces.bbystatic.com/image2/BestBuy_US/images/products/6459/6459306_sd.jpg",
  "url": "https://www.bestbuy.com/site/sony-65-class-bravia-xr-x95j-4k-uhd-smart-google-tv/6459306.p?skuId=6459306",
  "description": "Shop Sony 65\" class BRAVIA XR X95J 4K UHD Smart Google TV at Best Buy. Find low everyday prices and buy online for delivery or in-store pick-up. Price Match Guarantee.",
  "sku": "6459306",
  "gtin13": "0027242921818",
  "model": "XR65X95J",
  "width": {
    "@type": "http://schema.org/QuantitativeValue",
    "unitCode": "INH",
    "value": "56.87"
  },
  "color": "Black",
  "brand": {
    "@type": "Brand",
    "name": "Sony"
  },
  "aggregateRating": {
    "@type": "AggregateRating",
    "ratingValue": "4.7",
    "reviewCount": "221"
  },
  "offers": {
    "@type": "AggregateOffer",
    "priceCurrency": "USD",
    "seller": {
      "@type": "Organization",
      "name": "Best Buy"
    },
    "lowPrice": "1184.99",
    "highPrice": "1499.99",
    "offercount": 5,
    "offers": [
      {
        "@type": "Offer",
        "priceCurrency": "USD",
        "price": "1499.99",
        "availability": "http://schema.org/InStock",
        "itemCondition": "http://schema.org/NewCondition",
        "description": "New"
      },
      {
        "@type": "Offer",
        "priceCurrency": "USD",
        "price": "1319.99",
        "itemCondition": "http://schema.org/UsedCondition",
        "description": "Open-Box Excellent - Certified"
      },
      {
        "@type": "Offer",
        "priceCurrency": "USD",
        "price": "1274.99",
        "itemCondition": "http://schema.org/UsedCondition",
        "description": "Open-Box Excellent"
      },
      {
        "@type": "Offer",
        "priceCurrency": "USD",
        "price": "1229.99",
        "itemCondition": "http://schema.org/UsedCondition",
        "description": "Open-Box Satisfactory"
      },
      {
        "@type": "Offer",
        "priceCurrency": "USD",
        "price": "1184.99",
        "itemCondition": "http://schema.org/UsedCondition",
        "description": "Open-Box Fair"
      }
    ]
  }
}
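A minimal sketch of that idea, assuming the JSON sits in a standard script type="application/ld+json" tag on the product page (the XPath and the field names are assumptions taken from the JSON above, not verified against the live page):

    def parse_product(self, response):
        # Look through the JSON-LD blocks for the one describing the product.
        for raw in response.xpath('//script[@type="application/ld+json"]/text()').extract():
            try:
                data = json.loads(raw)
            except ValueError:
                continue
            if isinstance(data, dict) and data.get('@type') == 'Product':
                # gtin13 carries the UPC/EAN for this SKU.
                yield {
                    'sku': data.get('sku'),
                    'upc': data.get('gtin13'),
                    'model': data.get('model'),
                }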

Scrapy/Selenium: How do I follow the links in 1 webpage?

I am new to web scraping.
I want to go to Webpage_A and follow all the links there.
Each of the links leads to a page where I can select some buttons and download the data as an Excel file.
I tried the code below, but I believe there is an error with

    if link:
        yield SeleniumRequest(

Instead of using "SeleniumRequest" to follow the links, what should I use?
If using pure Scrapy, I know I can use

    yield response.follow(

(a rough sketch of that pattern is included after the spider code below). Thank you.
class testSpider(scrapy.Spider):
    name = 'test_s'

    def start_requests(self):
        yield SeleniumRequest(
            url='CONFIDENTIAL',
            wait_time=15,
            screenshot=True,
            callback=self.parse
        )

    def parse(self, response):
        tables_name = response.xpath("//div[@class='contain wrap:l']//li")
        for t in tables_name:
            name = t.xpath(".//a/span/text()").get()
            link = t.xpath(".//a/@href").get()
            if link:
                yield SeleniumRequest(
                    meta={'table_name': name},
                    url=link,
                    wait_time=15,
                    screenshot=True,
                    callback=self.parse_table
                )

    def parse_table(self, response):
        name = response.request.meta['table_name']
        button_select = response.find_element_by_xpath("(//a[text()='Select All'])").click()
        button_st_yr = response.find_element_by_xpath("//select[@name='ctl00$ContentPlaceHolder1$StartYearDropDownList']/option[1]").click()
        button_end_mth = response.find_element_by_xpath("//select[@name='ctl00$ContentPlaceHolder1$EndMonthDropDownList']/option[text()='Dec']").click()
        button_download = response.find_element_by_xpath("//input[@id='ctl00_ContentPlaceHolder1_DownloadButton']").click()
        yield {
            'table_name': name
        }
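For reference, a rough sketch of the pure-Scrapy pattern mentioned in the question, using the same XPaths and covering only the link-following part (the Selenium button clicks are a separate concern). Unlike a hand-built request, response.follow resolves relative hrefs against the current page:

    def parse(self, response):
        for t in response.xpath("//div[@class='contain wrap:l']//li"):
            name = t.xpath(".//a/span/text()").get()
            link = t.xpath(".//a/@href").get()
            if link:
                # response.follow accepts relative URLs and forwards the meta dict.
                yield response.follow(link, callback=self.parse_table,
                                      meta={'table_name': name})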

Scrapy - Copying only the xpath into .csv file

I have many other scripts with similar basic code that work, but when I run this spider from the command line and open the .csv file to look at the saved titles, I get the XPath selector copied into Excel instead of the text. Any idea why?
import scrapy


class MovieSpider(scrapy.Spider):
    name = 'movie'
    allowed_domains = ['https://www.imdb.com/search/title?start=1']
    start_urls = ['https://www.imdb.com/search/title?start=1/']

    def parse(self, response):
        titles = response.xpath('//*[@id="main"]/div/div/div[3]/div[1]/div[3]/h3/a')
        pass
        print(titles)
        for title in titles:
            yield {'Title': title}

--- Try Two Below: ---

        for subject in titles:
            yield {
                'Title': subject.xpath('.//h3[@class="lister-item-header"]/a/text()').extract_first(),
                'Runtime': subject.xpath('.//p[@class="text-muted"]/span/text()').extract_first(),
                'Description': subject.xpath('.//p[@class="text-muted"]/p/text()').extract_first(),
                'Director': subject.xpath('.//*[@id="main"]/a/text()').extract_first(),
                'Rating': subject.xpath('.//div[@class="inline-block ratings-imdb-rating"]/strong/text()').extract_first()
            }
Use extract() or extract_first(): yielding a Selector object writes its repr (which contains the XPath) into the CSV, whereas extract() / extract_first() return the matched text. Also use shorter, more robust XPath notation:
import scrapy


class MovieSpider(scrapy.Spider):
    name = 'movie'
    allowed_domains = ['https://www.imdb.com/search/title?start=1']
    start_urls = ['https://www.imdb.com/search/title?start=1/']

    def parse(self, response):
        subjects = response.xpath('//div[@class="lister-item mode-advanced"]')
        for subject in subjects:
            yield {
                'Title': subject.xpath('.//h3[@class="lister-item-header"]/a/text()').extract_first(),
                'Rating': subject.xpath('.//div[@class="inline-block ratings-imdb-rating"]/strong/text()').extract_first(),
                'Runtime': subject.xpath('.//span[@class="runtime"]/text()').extract_first(),
                'Description': subject.xpath('.//p[@class="text-muted"]/text()').extract_first(),
                'Director': subject.xpath('.//p[contains(text(), "Director")]/a[1]/text()').extract_first(),
            }

Correct way to nest Item data in Scrapy

What is the correct way to nest Item data?
For example, I want the output for a product to be:

{
    'price': price,
    'title': title,
    'meta': {
        'url': url,
        'added_on': added_on
    }
}

I have a scrapy.Item of:
class ProductItem(scrapy.Item):
    url = scrapy.Field(output_processor=TakeFirst())
    price = scrapy.Field(output_processor=TakeFirst())
    title = scrapy.Field(output_processor=TakeFirst())
    added_on = scrapy.Field(output_processor=TakeFirst())
Now, the way I do it is just to reformat the whole item in the pipeline according to a new item template:
class FormatedItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()
    meta = scrapy.Field()

and in the pipeline:

def process_item(self, item, spider):
    formated_item = FormatedItem()
    formated_item['title'] = item['title']
    formated_item['price'] = item['price']
    formated_item['meta'] = {
        'url': item['url'],
        'added_on': item['added_on']
    }
    return formated_item
Is this the correct way to approach this, or is there a more straightforward way that doesn't break the philosophy of the framework?
UPDATE from comments: it looks like nested loaders are the updated approach. Another comment suggests this approach will cause errors during serialization.
The best way to approach this is by creating a main and a meta item class/loader.
from scrapy.item import Item, Field
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import TakeFirst


class MetaItem(Item):
    url = Field()
    added_on = Field()


class MainItem(Item):
    price = Field()
    title = Field()
    meta = Field(serializer=MetaItem)


class MainItemLoader(ItemLoader):
    default_item_class = MainItem
    default_output_processor = TakeFirst()


class MetaItemLoader(ItemLoader):
    default_item_class = MetaItem
    default_output_processor = TakeFirst()
Sample usage:
from scrapy.spider import Spider
from qwerty.items import MainItemLoader, MetaItemLoader
from scrapy.selector import Selector


class DmozSpider(Spider):
    name = "dmoz"
    allowed_domains = ["example.com"]
    start_urls = ["http://example.com"]

    def parse(self, response):
        mainloader = MainItemLoader(selector=Selector(response))
        mainloader.add_value('title', 'test')
        mainloader.add_value('price', 'price')
        mainloader.add_value('meta', self.get_meta(response))
        return mainloader.load_item()

    def get_meta(self, response):
        metaloader = MetaItemLoader(selector=Selector(response))
        metaloader.add_value('url', response.url)
        metaloader.add_value('added_on', 'now')
        return metaloader.load_item()
After that, you can easily expand your items in the future by creating more "sub-items."
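For reference, the sample spider above should produce an item roughly like this (the meta field is itself a loaded MetaItem, shown here as a plain dict):

    {
        'title': 'test',
        'price': 'price',
        'meta': {'url': 'http://example.com', 'added_on': 'now'}
    }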
I think it would be more straightforward to construct the dictionary in the spider. The snippet below shows two ways of filling it (a dict literal and key-by-key assignment), both achieving the same result. The only possible dealbreaker is that the processors apply to the item['meta'] field as a whole, not to the item['meta']['added_on'] and item['meta']['url'] fields.
def parse(self, response):
    item = MyItem()
    item['meta'] = {'added_on': response.css("a::text").extract()[0]}
    item['meta']['url'] = response.xpath("//a/@href").extract()[0]
    return item
Is there a specific reason why you want to construct it that way instead of unpacking the meta field?

Scrapy: having problems in crawling a .aspx page

I'm trying to crawl an .aspx page, but it redirects me to a page that doesn't exist.
To work around this, I tried setting 'dont_merge_cookies': True and 'dont_redirect': True and overriding my start_requests, but now it gives me the error "'Response' object has no attribute 'body_as_unicode'", and my response class type is 'scrapy.http.response.Response'.
Here's my code:
class Inon_Spider(BaseSpider):
    name = 'Inon'
    allowed_domains = ['www.shop.inonit.in']
    start_urls = ['http://www.shop.inonit.in/Products/Inonit-Men-Jackets/QUIRK-BOX/Toy-Factory-Jacket---Soldiers/pid-1177471.aspx?Rfs=&pgctl=713619&cid=CU00049295']
    # redirects to http://www.shop.inonit.in/Products/Inonit-Men-Jackets/QUIRK-BOX/Toy-Factory-Jacket---Soldiers/1177471

    def start_requests(self):
        start_urls = ['http://www.shop.inonit.in/Products/Inonit-Men-Jackets/QUIRK-BOX/Toy-Factory-Jacket---Soldiers/pid-1177471.aspx?Rfs=&pgctl=713619&cid=CU00049295']
        for i in start_urls:
            yield Request(i, meta={
                'dont_merge_cookies': True,
                'dont_redirect': True,
                'handle_httpstatus_list': [302]
            }, callback=self.parse)

    def parse(self, response):
        print "Response %s" % response.__class__
        resp = TextResponse
        item = DealspiderItem()
        hxs = HtmlXPathSelector(resp)
        title = hxs.select('//div[@class="aboutproduct"]/div[@class="container9"]/div[@class="ctl_aboutbrand"]/h1/text()').extract()
        price = hxs.select('//span[@id="ctl00_ContentPlaceHolder1_Price_ctl00_spnWebPrice"]/span[@class="offer"]/span[@id="ctl00_ContentPlaceHolder1_Price_ctl00_lblOfferPrice"]/text()').extract()
        prc = price[0].replace("Rs. ", "")
        description = []
        item['price'] = prc
        item['title'] = title
        item['description'] = description
        item['url'] = response.url
        return item