For whatever reason all items are returned as tuples. Not sure what I am missing. In all other spiders and projects it was just a list (when i use extract()).
{'acne': (None,),
'function': ([u'\u2027Preservative'],),
'function0': u'\u2027Preservative',
'irritant': (None,),
'name': (u'Potassium Sorbate',),
'safety': (u'3',),
'url': 'http://cosdna.com/eng/383bb7435.html'}
Here is my spider code.
def parse(self, response):
inspect_response(response, self)
a = response.xpath('//table//tr')
for i in a:
item = CosdnaExtItem()
item['name'] = i.xpath('./td/a/text()').extract_first(),
item['url'] = i.xpath('./td/a/#href').extract_first(),
item['function'] = i.xpath('.//td[2]/span//text()').extract(),
item['acne'] = i.xpath('.//td[3]/span//text()').extract_first(),
item['irritant'] = i.xpath('.//td[4]/span//text()').extract_first(),
item['safety'] = i.xpath('.//td[5]/div//text()').extract_first(),
yield item
Note extra commas at the end of lines:
item['function'] = i.xpath('.//td[2]/span//text()').extract(),
in Python
x = y,
is the same as
x = (y,)
Related
I am attempting to paginate through the data table on this page, located below the search form.
My code successfully scrapes the first page and I successfully click the next button (using Selenium) to get the next page of results.
However, attempting to create a Response instance and passing it to self.parse() does not work:
page_source = self.driver.page_source
r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
print(" >>>> calling self.parse again")
return self.parse(r)
Also, even though if you analyze the call stack, I am returning None from self.parse, I get this warning when running this scrapy spider:
The "SignalStartSpider.parse" method is a generator and includes a "return" statement with a value different than None. This could lead to unexpected behaviour. Please see https://docs.python.org/3/reference/simple_stmts.html#the-return-statement for details about the semantics of the "return" statement within generators
warn_on_generator_with_return_value(spider, callback)
Here is my current source code:
# -*- coding: utf-8 -*-
import scrapy
from behold import Behold
import html_text
import durations
from selenium import webdriver
URL_20 = "https://www.signalstart.com/search-signals"
URL_1000="https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=1000&p=1&z=0.410257937140464"
class Provider(scrapy.Item):
rank = scrapy.Field()
name = scrapy.Field()
gain = scrapy.Field()
pips = scrapy.Field()
drawdown = scrapy.Field()
trades = scrapy.Field()
type = scrapy.Field()
monthly = scrapy.Field()
# chart = scrapy.Field()
price = scrapy.Field()
age = scrapy.Field()
# added = scrapy.Field()
# action = scrapy.Field()
won = scrapy.Field()
profit_factor = scrapy.Field()
daily = scrapy.Field()
monthly = scrapy.Field()
def raw_page_url(i=1):
"""
Return raw page of 100 results. There are 8 such pages
:param i: which page number
:return:
"""
return "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=100&p={}&z=0.024967722664414493".format(i)
class SignalStartSpider(scrapy.Spider):
page = 1
name = 'signalstart'
start_urls = [
# raw_page_url(page),
URL_20
]
def __init__(self):
#self.driver = webdriver.Firefox(executable_path = r'C:\Users\terre\AppData\Local\Taurus\bin\geckodriver.exe')
self.driver = webdriver.Firefox(executable_path=r'/cygdrive/c/Users/terre/AppData/Local/Taurus/bin/geckodriver.exe')
def parse_details(self, response):
class Details(scrapy.Item):
xpath = scrapy.Field()
extractor = scrapy.Field() # I thought different fields would be extracted differently. But turns out they dont.
fields = {
'won': Details(),
'profit_factor': Details(),
'daily': Details(),
'monthly': Details()
}
fields['won']['xpath'] = "//li[contains(text(),'Won:')]"
fields['profit_factor']['xpath'] = "//li[#class='list-group-item popovers']"
fields['daily']['xpath'] = "//li[contains(text(),'Daily:')]"
fields['monthly']['xpath'] = "//li[contains(text(),'Monthly:')]"
for field, field_processor in fields.items():
print(f" Process {field}")
elem = response.xpath(field_processor['xpath'])
_, value = html_text.extract_text(elem.get()).split(':')
response.meta["data_row"][field] = value
yield response.meta["data_row"]
def parse(self, response):
print(" >>>>>> URL of the response object is {}".format(response.url))
if len (response.url) > 10:
self.driver.get(response.url)
cols = "rank name gain pips drawdown trades type monthly chart price age added action"
skip = [7, 8, 11, 12]
def age_to_months(t):
t = t.replace('m', 'M')
d = durations.Duration(t);
return d.to_months()
postprocess = {
'age': lambda t: age_to_months(t)
}
td = dict()
for i, col in enumerate(cols.split()):
td[i] = col
Behold().show('td')
for provider in response.xpath("//div[#class='row']//tr"):
data_row = Provider()
Behold().show('provider')
details_url = None
for i, datum in enumerate(provider.xpath('td')):
Behold().show('i', 'datum')
if i == 1: # name
details_url = datum.css("a::attr(href)").get()
if i in skip:
print(".....skipping")
continue
text = html_text.extract_text(datum.get())
column_name = td[i]
if column_name in postprocess:
text = postprocess[column_name](text)
data_row[column_name] = text
if details_url:
yield scrapy.Request(url=details_url, callback=self.parse_details, meta={'data_row': data_row})
print("------------------------------- next page logic --------------------------------------")
next = self.driver.find_element_by_css_selector('.fa-angle-right')
if next is not None:
print(" **** NEXT IS -NOT- NONE")
next.click()
page_source = self.driver.page_source
r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
print(" >>>> calling self.parse again")
return self.parse(r)
else:
print(" **** NEXT IS NONE")
return None
# next_page = response.css('.fa-angle-right').get()
# if next_page is not None:
# yield response.follow(next_page, self.parse)
Instead of recursively calling self.parse it is better to use a while loop and simply re-bind the Response instance with the page_source from Selenium webdriver. working code:
# -*- coding: utf-8 -*-
import scrapy
from behold import Behold
import html_text
import durations
from selenium import webdriver
URL_20 = "https://www.signalstart.com/search-signals"
URL_1000="https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=1000&p=1&z=0.410257937140464"
class Provider(scrapy.Item):
rank = scrapy.Field()
name = scrapy.Field()
gain = scrapy.Field()
pips = scrapy.Field()
drawdown = scrapy.Field()
trades = scrapy.Field()
type = scrapy.Field()
monthly = scrapy.Field()
# chart = scrapy.Field()
price = scrapy.Field()
age = scrapy.Field()
# added = scrapy.Field()
# action = scrapy.Field()
won = scrapy.Field()
profit_factor = scrapy.Field()
daily = scrapy.Field()
monthly = scrapy.Field()
def raw_page_url(i=1):
"""
Return raw page of 100 results. There are 8 such pages
:param i: which page number
:return:
"""
return "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=100&p={}&z=0.024967722664414493".format(i)
class SignalStartSpider(scrapy.Spider):
page = 1
name = 'signalstart'
start_urls = [
# raw_page_url(page),
URL_20
]
def __init__(self):
#self.driver = webdriver.Firefox(executable_path = r'C:\Users\terre\AppData\Local\Taurus\bin\geckodriver.exe')
self.driver = webdriver.Firefox(executable_path=r'/cygdrive/c/Users/terre/AppData/Local/Taurus/bin/geckodriver.exe')
def parse_details(self, response):
class Details(scrapy.Item):
xpath = scrapy.Field()
extractor = scrapy.Field() # I thought different fields would be extracted differently. But turns out they dont.
fields = {
'won': Details(),
'profit_factor': Details(),
'daily': Details(),
'monthly': Details()
}
fields['won']['xpath'] = "//li[contains(text(),'Won:')]"
fields['profit_factor']['xpath'] = "//li[#class='list-group-item popovers']"
fields['daily']['xpath'] = "//li[contains(text(),'Daily:')]"
fields['monthly']['xpath'] = "//li[contains(text(),'Monthly:')]"
for field, field_processor in fields.items():
print(f" Process {field}")
elem = response.xpath(field_processor['xpath'])
_, value = html_text.extract_text(elem.get()).split(':')
response.meta["data_row"][field] = value
yield response.meta["data_row"]
def parse(self, response):
print(" >>>>>> URL of the response object is {}".format(response.url))
if len (response.url) > 10:
self.driver.get(response.url)
cols = "rank name gain pips drawdown trades type monthly chart price age added action"
skip = [7, 8, 11, 12]
def age_to_months(t):
t = t.replace('m', 'M')
d = durations.Duration(t);
return d.to_months()
postprocess = {
'age': lambda t: age_to_months(t)
}
td = dict()
for i, col in enumerate(cols.split()):
td[i] = col
Behold().show('td')
while True:
for provider in response.xpath("//div[#class='row']//tr"):
data_row = Provider()
Behold().show('provider')
details_url = None
for i, datum in enumerate(provider.xpath('td')):
Behold().show('i', 'datum')
if i == 1: # name
details_url = datum.css("a::attr(href)").get()
if i in skip:
print(".....skipping")
continue
text = html_text.extract_text(datum.get())
column_name = td[i]
if column_name in postprocess:
text = postprocess[column_name](text)
data_row[column_name] = text
if details_url:
yield scrapy.Request(url=details_url, callback=self.parse_details, meta={'data_row': data_row})
print("------------------------------- next page logic --------------------------------------")
next = self.driver.find_element_by_css_selector('.fa-angle-right')
if next is not None:
print(" **** NEXT IS -NOT- NONE")
next.click()
page_source = self.driver.page_source
r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
print(" >>>> looping self.parse again")
response = r
else:
print(" **** NEXT IS NONE")
break
# next_page = response.css('.fa-angle-right').get()
# if next_page is not None:
# yield response.follow(next_page, self.parse)
My spider starts off with the start_urls, being:
https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page=1&size=8&isocode=nl-NL
Based on a keywords.csv file, located in my resource folder, the keywordsID (number 20035386) will change. Once the number changed, the spider will fetch the data from another product.
I also have a chunk of code which constantly checks the page if isTruncated = true, if that's the case, it will change the page number in the URL to +1. The only problem I am having right now, is that I don't know how to set a second variable in one string (URL). When isTruncated = true the code need to adjust the URL's page number AND keywordsID accordingly. Currently, I only managed to add a variable for the page number.
Currently the chunk of code is:
if data["isTruncated"]:
yield scrapy.Request(
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page={page}&size=8&isocode=nl-NL".format(page=next_page),
callback=self.parse,
meta={'page': next_page, "category": category},
)
However, it should become something like:
if data["isTruncated"]:
yield scrapy.Request(
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/ {keywordsid} ?page={page}&size=8&isocode=nl-NL".format(page=next_page),
callback=self.parse,
meta={'page': next_page, "category": category},
)
When I run the spider, it will crawl all the pages of the product with keywordsID 20035386, but it will only crawl the first page of all the other products listed in the keywords.csv file.
FULL CODE
./krc/spiders/krc_spider.py
# -*- coding: utf-8 -*-
import scrapy
from krc.items import KrcItem
import json
import os
import csv
import time
import datetime
class KRCSpider(scrapy.Spider):
name = "krc_spider"
allowed_domains = ["kaercher.com"]
start_urls = ['https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page=1&size=8&isocode=nl-NL']
def start_requests(self):
"""Read keywords from keywords file amd construct the search URL"""
with open(os.path.join(os.path.dirname(__file__), "../resources/keywords.csv")) as search_keywords:
for keyword in csv.DictReader(search_keywords):
search_text=keyword["keyword"]
category = keyword["keywordtype"]
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/{0}?page=1&size=8&isocode=nl-NL".format(search_text)
# The meta is used to send our search text into the parser as metadata
yield scrapy.Request(url, callback = self.parse, meta = {"search_text": search_text, "category": category})
def parse(self, response):
category = response.meta["category"]
current_page = response.meta.get("page", 1)
next_page = current_page + 1
#Printing the timestamp when fetching the data, using default timezone from the requesting machine
ts = time.time()
timestamp = datetime.datetime.fromtimestamp(ts).strftime('%d-%m-%Y %H:%M:%S')
#Defining the items
item = KrcItem()
data = json.loads(response.text)
for company in data.get('products', []):
item["productid"] = company["id"]
item["category"] = category
item["name"] = company["name"]
item["description"] = company["description"]
item["price"] = company["priceFormatted"].replace("\u20ac","").strip()
item["timestamp"] = timestamp
yield item
#Checking whether "isTruncated" is true (boolean), if so, next page will be triggered
if data["isTruncated"]:
yield scrapy.Request(
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page={page}&size=8&isocode=nl-NL".format(page=next_page),
callback=self.parse,
meta={'page': next_page, "category": category},
)
./krc/resources/keywords.csv
keyword,keywordtype
20035386,Hogedrukreiniger
20035424,Window Vacs
Current Output
When I run the spider it fetches the data from all the page's of the product with keywordsID 20035386. From all the other products with a different keywordsID, only the data from the first page will be fetched.
Use response.meta for this:
def start_requests(self):
"""Read keywords from keywords file amd construct the search URL"""
with open(os.path.join(os.path.dirname(__file__), "../resources/keywords.csv")) as search_keywords:
for keyword in csv.DictReader(search_keywords):
product_id = keyword["keyword"]
category = keyword["keywordtype"]
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/{0}?page=1&size=8&isocode=nl-NL".format(product_id)
# The meta is used to send our search text into the parser as metadata
yield scrapy.Request(url, callback = self.parse, meta = {"category": category, "product_id": product_id})
def parse(self, response):
category = response.meta["category"]
product_id = response.meta["product_id"]
current_page = response.meta.get("page", 1)
next_page = current_page + 1
#Printing the timestamp when fetching the data, using default timezone from the requesting machine
ts = time.time()
timestamp = datetime.datetime.fromtimestamp(ts).strftime('%d-%m-%Y %H:%M:%S')
#Defining the items
item = KrcItem()
data = json.loads(response.text)
for company in data.get('products', []):
item["productid"] = company["id"]
item["category"] = category
item["name"] = company["name"]
item["description"] = company["description"]
item["price"] = company["priceFormatted"].replace("\u20ac","").strip()
item["timestamp"] = timestamp
yield item
#Checking whether "isTruncated" is true (boolean), if so, next page will be triggered
if data["isTruncated"]:
yield scrapy.Request(
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/{product_id}?page={page}&size=8&isocode=nl-NL".format(page=next_page, product_id=product_id),
callback=self.parse,
meta={'page': next_page, "category": category, "product_id": product_id},
)
I believe you need a nested for when your search_text change.
for [first iterating variable] in [outer loop]: # Outer loop
[do something] # Optional
for [second iterating variable] in [nested loop]: # Nested loop
[do something]
Checks this out, it might help you.
For Loops
I think adding the keyword to the url would be the following. It may or may not need + signs before and after search_text, my knowledge is limited.
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/"search_text"?page={page}&size=8&isocode=nl-NL".format(page=next_page),
though I'm not really following what this line is doing, at least the format(search_text) portion of it.
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/{0}?page=1&size=8&isocode=nl-NL".format(search_text)
my spider: autospd.py
class AutospdSpider(scrapy.Spider):
name = 'autospd'
start_urls = ['http://news.dayoo.com/guangzhou/150960_2.shtml']
dt_ft = "%Y-%m-%d %H:%M"
def parse(self, response):
list_objs = response.css("div.dy-list>div")
for li in list_objs:
loader = AutopjtItemLoader(item=AutopjtItem(), selector=li, context=self.dt_ft)
print(loader.context.items()) #please see print-1
loader.nested_css("h2>a").add_css("title", "::text")
loader.nested_css("h2>a").add_css("url", "::attr(href)")
loader.nested_css("div.txt-area>div.news-time").add_xpath("pub_time", "string()")
yield loader.load_item()
print-1: dict_items([('context', '%Y-%m-%d %H:%M'), ('selector',
\r\n '>), ('response', None), ('item',
{}) ])
items.py
def func(value, loader_context):
print(loader_context.items()) # please see print-2
# ft = loader_context.get("context")
# time_dt = datetime.strptime(value, ft)
return value
class AutopjtItemLoader(ItemLoader):
default_output_processor = TakeFirst()
pub_time_in = MapCompose(func)
class AutopjtItem(scrapy.Item):
title = scrapy.Field()
url = scrapy.Field()
pub_time = scrapy.Field()
print-2: [('selector', [2019-06-12 08:59< '>]), ('response',
None), ('item', {})]
Why don't have "context" in loader_context?
def nested_xpath(self, xpath, **context):
selector = self.selector.xpath(xpath)
context.update(selector=selector)
subloader = self.__class__(
item=self.item, parent=self, **context
)
return subloader
def nested_css(self, css, **context):
selector = self.selector.css(css)
context.update(selector=selector)
subloader = self.__class__(
item=self.item, parent=self, **context
)
return subloader
From the scrapy's source code, if you use nested_css or nested_xpath, you must add your context. eg:
loader.nested_css("div.txt-area>div.news-time", dt_ft=self.dt_ft).add_xpath("pub_time", "string()")
Scrapy does not work for pages with pagination page> 1, although the links given are correct. My code:
Linux, Debian 9, Python 3.5, MongoDB, Scrapy, Scrapy-Splash
code
import scrapy
import copy
import datetime
import json
import pymongo
from webscrapy.items import WebscrapyItem
from scrapy.conf import settings
from bson.objectid import ObjectId
class YooxSpiderSpider(scrapy.Spider):
name = 'yoox-spider'
allowed_domains = ['yoox.com']
base_url = 'https://www.yoox.com'
job = {}
start_url = ''
splash_url = ''
connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
settings['MONGODB_PORT'])
db = connection[settings['MONGODB_DB']]
def __init__(self, job_id):
self.job = self.db.jobs.find_one({'_id':ObjectId(job_id)})
self.start_url = self.job['start_url']
self.splash_url = self.job['splash_url']
def start_requests(self):
# job['start_url'] - This is the starting link for the desired category, for example Yoox/Woman or Yoox/Men
print("------------- start ---------")
yield scrapy.Request(url=''.join((self.splash_url, self.start_url)), callback=self.parse)
def parse(self, response):
for cat in [response.xpath(
"//div[#id='teleyooxCategories']/div[#class='teleyoox-section-content']/div[#class='teleyoox-section-padding']/ul[#class='text-size-default']/li")[0]]:
#url_category = response.urljoin('/render.html?url=https://www.yoox.com' + cat.xpath('./a/#href').extract_first())
sub_url_category = cat.xpath('./a/#href').extract_first()
if sub_url_category:
url_category = ''.join((self.base_url, cat.xpath('./a/#href').extract_first()))
Item = WebscrapyItem()
Item['job_id'] = self.job['_id']
Item['basecat'] = self.job['basecat']
Item['gender'] = self.job['gender']
Item['category'] = cat.xpath('./a/text()').extract_first().strip()
Item['url_category'] = url_category
yield scrapy.Request(url=''.join((self.splash_url, url_category)), meta={'Item': Item}, callback=self.parse_data, dont_filter=True)
def parse_data(self, response):
print('')
print('')
print(' ++++++++ current page ++++++++ ', response.url)
print('')
print('')
# Getting product references and product_id
for product in response.xpath("//div[#id='itemsGrid']/div[#id='srpage1']/div[#class='col-8-24']"):
sub_url_product = product.xpath('./div/div/a/#href').extract_first()
#url_product = response.urljoin('/render.html?url=https://www.yoox.com' + product.xpath('./div/div/a/#href').extract_first())
if sub_url_product:
Item = copy.deepcopy(response.meta['Item'])
product_id = product.xpath('./div/#id').extract_first()
price = product.xpath("./div/div[#class='itemData text-center']/a[#class='itemlink']/div[#class='price']/span/text()").extract()[-1][2:]
sizes = [size for size in product.xpath("./div/div/a[#class='itemlink']/div[#class='colorSize']/div[#class='size text-light']/span/text()").extract()]
available_products = {
'basecat': Item['basecat'],
'category': Item['category'],
'job_id': Item['job_id'],
'product_id': product_id,
}
#if not self.db.data.find(available_products).count():
#print('NEW product: ', product_id)
cutout_images = [
product.xpath("./div/div/a/img/#data-original").extract_first(),
product.xpath("./div/div/a/img/#rel").extract_first(),
]
Item['dt'] = datetime.datetime.utcnow()
Item['product_id'] = product_id
Item['url_product'] = ''.join((self.base_url, sub_url_product))
Item['data'] = {
'sku':'',
'date':'',
'cutout_image': cutout_images,
'data-category': product.xpath("./div/#data-category").extract_first(),
'microcategory': product.xpath("./div/div/a[#class='itemlink']/div[#class='microcategory font-sans']/text()").extract_first().strip(),
'description':'',
'price': price,
#currency - получаю из первого символа стоимости товара
'currency': product.xpath("./div/div[#class='itemData text-center']/a[#class='itemlink']/div[#class='price']/span/text()").extract()[-1][0],
'brand': product.xpath("./div/div/a[#class='itemlink']/div[#class='brand font-bold text-uppercase']/text()").extract_first(),
'merchant':'',
'sizes':sizes,
#response.xpath().extract_first()
}
#yield scrapy.Request(url=''.join((self.splash_url, Item['url_product'])), meta={'Item': Item}, callback=self.parse_details, dont_filter=True)
yield Item
#next_page_url = response.xpath("//div[#id='navigation-bar-top']/div[#class='col-6-24']/div[#id='pagination-lite']/a[#class='pure-menu-item nextPage js-track-me']/#href").extract_first()
next_page_url = response.xpath(
"//div[#id='navigation-bar-bottom']/div[#class='col-16-24']/ul[#class='pagination list-inline pull-right text-center js-pagination']/li[#class='next-page']/a/#href").extract_first()
if next_page_url:
print('')
print('')
print(' ++++++++ next page ++++++++ ', next_page_url)
print('')
print('')
yield scrapy.Request(url=''.join((self.splash_url, next_page_url)), callback=self.parse_data, dont_filter=True)
else:
print(' ++++++++ NEXT CATEGORY ++++++++ ')
pass
def parse_details(self, response):
# Производим глубокое копирование для избежания перемешивания данных
Item = copy.deepcopy(response.meta['Item'])
#other_data = json.loads(response.xpath('//section[#id="product"]/script[#type="application/ld+json"]//text()').extract_first())
Item['details'] = {
'header': response.xpath("//div[#id='itemData']/div[#id='js-item-details']/div[#id='itemTitle']/h1/a/text()").extract_first().strip(),
'price': response.xpath("//div[#id='itemData']/div[#id='js-item-details']/div[#id='item-price']/span[#class='font-bold']/span[1]/text()").extract_first(),
'priceCurrency': response.xpath("//div[#id='itemData']/div[#id='js-item-details']/div[#id='item-price']/span[#class='font-bold']/span[2]/#content").extract_first(),
#'colorName': response.xpath("//div[#id='js-item-color-size']/div[#id='itemColors']/div[#class='dataTitleBox font-bold text-uppercase text-size-xs margin-bottom']/span[#class='select-color-size-label']/text()").extract_first(),
#'reference': response.xpath("//div[#class='info-section']/div[#class='product-info-wrapper _product-info'][1]/p[#class='product-color']/span[2]/text()").extract_first(),
'description': response.xpath("//div[#id='itemContent']/div[#class='row text-size-default info-2cols']/div[#class='info-col-1 item-info-column col-1-2']/ul/li[#id='itemDescription']/div[#class='info-body font-sans padding-half-top']/text()").extract_first(),
#'sizeList': response.xpath("//div[#id='js-item-details']/div[#id='js-item-color-size']/div[#id='itemSizes']/ul").extract_first(),
#'other_data': other_data,
}
print('')
print('')
print(Item)
print('')
print('')
yield Item
Parsing works only for the first pages of all categories, although there is code and refers to callback = pars_data:
if next_page_url:
yield scrapy.Request(url=''.join((self.splash_url, next_page_url)), callback=self.parse_data)
else:
pass
A message is displayed on the console url for new page=2, but every current page url is new category (((. Please help my.
++++++++ next page ++++++++ https://***/us/women/shoponline/underwear_mc/2#/dept=women&gender=D&page=2&attributes=%7b%27ctgr%27%3a%5b%27ntm%27%5d%7d&season=X
Could anyone tell me why the number of index variable in parse() is 10013 at all time?
class GetsourcesSpider(scrapy.Spider):
name = 'getSources'
allowed_domains = ['bizhi.feihuo.com']
base_url = 'http://bizhi.feihuo.com/wallpaper/share?rsid={index}/'
def start_requests(self):
for index in range(10010, 10014):#11886
yield scrapy.Request(url=self.base_url.format(index=index), callback=lambda response:self.parse(response,index))
def parse(self, response, index):
video_label = response.xpath('//video')[0]
item = DynamicdesktopItem()
item['index'] = index # response.url[-6:-1]
item['video'] = video_label.attrib['src']
item['image'] = video_label.attrib['poster']
yield item
That is because you are giving the index variable reference and not a value, that's why you get the last value. You need to use meta object for the same. Please see the updated code below
class GetsourcesSpider(scrapy.Spider):
name = 'getSources'
allowed_domains = ['bizhi.feihuo.com']
base_url = 'http://bizhi.feihuo.com/wallpaper/share?rsid={index}/'
def start_requests(self):
for index in range(10010, 10014):#11886
yield scrapy.Request(url=self.base_url.format(index=index), callback=self.parse, meta = {'index': index})
def parse(self, response):
index = response.meta['index']
video_label = response.xpath('//video')[0]
item = DynamicdesktopItem()
item['index'] = index # response.url[-6:-1]
item['video'] = video_label.attrib['src']
item['image'] = video_label.attrib['poster']
yield item
Because index variable referred to from all lambdas is not copied to their local scope. It is rewritten on each next loop iteration.
Consider this snippet:
lambdas = []
for i in range(3):
lambdas.append(lambda: print(i))
for fn in lambdas:
fn()
this will print three 2's, the last value of i.
Instead of doing lambda callbacks you should utilize meta= keyword of a Request class:
https://doc.scrapy.org/en/latest/topics/request-response.html#request-meta-special-keys