I'm trying to extract the input fields from a website's pages, together with the page URL that contains those inputs, and store them in a database.
The code runs without errors, but it doesn't produce the output I want.
Spider code:
class MySpider(CrawlSpider):
name = 'isa_spider'
allowed_domains = ['testaspnet.vulnweb.com']
start_urls = ['http://testaspnet.vulnweb.com']
rules = (
Rule(SgmlLinkExtractor(allow=('/*' ) ),callback='parse_item'),)
def parse_item(self, response):
hxs = HtmlXPathSelector(response)
item=IsaItem()
item['response_fld']=response.url
res = hxs.select("//input[(@id or @name) and (@type = 'text')]/@id").extract()
item['text_input'] = res[0] if res else None # None is default value in case no field found
res = hxs.select("//input[(@id or @name) and (@type = 'password')]/@id").extract()
item['pass_input'] = res[0] if res else None # None is default value in case no field found
res = hxs.select("//input[(@id or @name) and (@type = 'file')]/@id").extract()
item['file_input'] = res[0] if res else None # None is default value in case no field found
return item
Pipeline code:
class SQLiteStorePipeline(object):
def __init__(self):
self.conn = sqlite3.connect('./project.db')
self.cur = self.conn.cursor()
def process_item(self, item, spider):
self.cur.execute("insert into inputs ( input_name) values(?)", (item['text_input'],))
self.cur.execute("insert into inputs ( input_name) values(?)", (item['pass_input'],))
self.cur.execute("insert into inputs ( input_name) values(?)", (item['file_input'],))
self.cur.execute("insert into links (link) values(?)", (item['response_fld'],))
self.conn.commit()
return item
Database schema picture
Required output picture
(Sorry for not inserting the images directly; my reputation is below 10.)
Haven't tested this:
class SQLiteStorePipeline(object):
def __init__(self):
self.conn = sqlite3.connect('./project.db')
self.cur = self.conn.cursor()
def process_item(self, item, spider):
cursor = self.cur
target_id = ...  # determine the target id here (placeholder)
cursor.execute("insert into links (target, link) values(?, ?)", (target_id, item['response_fld'],))
link_id = cursor.lastrowid # find out just inserted link id
cursor.execute("insert into inputs (link_id, input_name, input_type) values(?, ?, ?)", (link_id, item['text_input'], 1))
cursor.execute("insert into inputs (link_id, input_name, input_type) values(?, ?, ?)", (link_id, item['pass_input'], 2))
cursor.execute("insert into inputs (link_id, input_name, input_type) values(?, ?, ?)", (link_id, item['file_input'], 3))
self.conn.commit()
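Since the schema and required-output pictures are not visible, here is a minimal sketch of the two tables this pipeline assumes, with column names inferred from the INSERT statements above (an assumption, so adjust names and types to your real schema):
import sqlite3

conn = sqlite3.connect('./project.db')
conn.executescript("""
CREATE TABLE IF NOT EXISTS links (
    id     INTEGER PRIMARY KEY AUTOINCREMENT,
    target INTEGER,           -- whatever 'target id' means in your schema
    link   TEXT
);
CREATE TABLE IF NOT EXISTS inputs (
    id         INTEGER PRIMARY KEY AUTOINCREMENT,
    link_id    INTEGER REFERENCES links(id),  -- ties each input to the page it was found on
    input_name TEXT,
    input_type INTEGER        -- 1 = text, 2 = password, 3 = file, as used above
);
""")
conn.commit()
conn.close()
With link_id in place, a simple JOIN between inputs and links gives the page URL next to each input name, which is what the required output asks for.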
Related
I use Scrapy to get data from a website (www.imensa.de) and I need to persist those items into a SQLite3 database. The spider works fine, but I think there is something wrong with the pipeline.
Within the pipeline, the new .db file is created properly with the specified tables and columns, but it is not getting populated.
Spider.py
# Import library
from __future__ import absolute_import
import scrapy
from scrapy.loader import Item
from scrapy.loader import ItemLoader
from ..items import Mensen_Table
from ..items import Mensa_Meal
import datetime
from datetime import date
from datetime import timedelta
global base_url
global meal_plan_day_list
meal_plan_day_list = []
# Create Spider class
class mensen_crawler(scrapy.Spider):
# Name of spider
name = 'mensen'
# Website to scrape
allowed_domains = ['imensa.de']
start_urls = ['https://www.imensa.de/']
# STATE_NAME / STATE_LINK
def parse(self, response):
base_url = 'https://www.imensa.de/'
# Loop through response to parse state_name & state_link
for i, (name, link) in enumerate(zip(response.css('.group a::text'),response.css('.group a::attr(href)'))):
# Condition to only get german states (16 states, index starting from 0 -> 15)
if i < 16:
state_name = name.get()
state_link = base_url + link.get()
yield scrapy.Request(state_link, callback=self.parse_layer2, cb_kwargs={'state_name': state_name, 'state_link':state_link})
# CITY_NAME / CITY_LINK
def parse_layer2(self, response, state_name, state_link):
global base_url
base_url = 'https://www.imensa.de/'
for (city, link) in zip(response.css('.group a::text'), response.css('.group a::attr(href)')):
city_name = city.get()
yield print('current_city: ',city_name,' (state: ',state_name,')')
city_link_part = link.get().split('/')[0]
yield print('city_link_part: ', city_link_part)
city_link_real = base_url + city_link_part + '/index.html'
yield scrapy.Request(city_link_real, callback=self.parse_layer3, cb_kwargs={'state_name': state_name, 'state_link':state_link, 'city_name': city_name,'city_link': city_link_real})
# MENSA_NAME/MENSA_LINK
def parse_layer3(self, response, state_name, state_link, city_name, city_link):
base_url = 'https://www.imensa.de/'
for group in response.css('.group'):
uni_name = group.css('h2::text').get()
yield print('UNI_NAME: ',uni_name)
for mensa in group.css('.element'):
mensa_name = mensa.css('a::text').get()
yield print('mensa_name: ',mensa_name,' (state: ',state_name,') (city: ',city_name,') (uni_name: ',uni_name,')')
mensa_link = base_url + city_link.replace('https://www.imensa.de/','').split('/')[0] + '/' + mensa.css('a::attr(href)').get()
yield print('mensa_link: ', mensa_link)
yield scrapy.Request(mensa_link, callback=self.parse_layer4, cb_kwargs={'state_name': state_name,
'state_link':state_link,
'city_name': city_name,
'city_link': city_link,
'uni_name': uni_name,
'mensa_name': mensa_name,
'mensa_link': mensa_link
})
# CREATE MENSA ITEM -----------------------------------------------------------------------------------------------------------------
def parse_layer4(self, response, state_name, state_link, city_name, city_link, uni_name, mensa_name, mensa_link):
l = ItemLoader(item=Mensen_Table(), response=response)
try:
rating_avg = response.css('.aw-ratings-average::text').get()
except:
rating_avg = 0
try:
rating_count = response.css('.aw-ratings-count::text').get()
except:
rating_count = 0
address_list = []
for address_element in response.css('a.panel-body::text'):
address_list.append(address_element.get())
mensa_location = ', '.join(address_list)
yield print('mensa_location: ', mensa_location)
yield print('parse_layer_3 -- rating_avg: ',rating_avg)
yield print('parse_layer_3 -- rating_count: ', rating_count)
l.add_value('state_name', state_name)
l.add_value('state_link', state_link)
l.add_value('city_name', city_name)
l.add_value('city_link', city_link)
l.add_value('uni_name', uni_name)
l.add_value('mensa_name', mensa_name)
l.add_value('mensen_link', mensa_link)
l.add_value('mensa_address', mensa_location)
l.add_value('mensen_rating_avg', rating_avg)
l.add_value('mensen_rating_count', rating_count)
yield l.load_item()
for i,x in enumerate(response.css('.col-md-4.no-padding-xs .list-group')):
if i == 0:
date_list = x.css('.pull-right::text').extract()
day_list = x.css('a::text').extract()
link_list = x.css('a::attr(href)').extract()
yield print('date_list: ',date_list)
yield print('day_list: ', day_list)
yield print('link_list: ',link_list)
yield print('mensa_link: ',mensa_link)
# PROCESS DATE LIST
#------------------------------------------
meal_plan_date_list = []
for ele in date_list:
if ele == 'heute':
today = datetime.date.today().strftime('%d.%m.%Y')
meal_plan_date_list.append(today)
elif ele == 'morgen':
today = datetime.date.today()
tomorrow = today + datetime.timedelta(days=1)
meal_plan_date_list.append(tomorrow.strftime('%d.%m.%Y'))
else:
meal_plan_date_list.append(ele)
yield print('meal_plan_date_list: ',meal_plan_date_list)
# PROCESS LINK LIST
#--------------------------------------------
meal_plan_link_list = []
for ele in day_list:
link = mensa_link.replace('index.html','') + ele.lower() + '.html'
meal_plan_link_list.append(link)
yield print('meal_plan_link_list: ',meal_plan_link_list)
#meal_plan_list = []
#meal_plan_prep_list = zip(meal_plan_date_list, day_list, meal_plan_link_list)
#for item in meal_plan_prep_list:
# yield print('meal_plan_list_item: ', item)
for date, day, link in zip(meal_plan_date_list, day_list, meal_plan_link_list):
yield scrapy.Request(link, callback=self.parse_layer5, cb_kwargs={'mensa_name': mensa_name, 'mensa_link': link, 'day': day, 'date': date})
# PARSE MEAL PLAN --------------------------------------------------------------------------
def parse_layer5(self, response, mensa_name, mensa_link, day, date):
for element in response.css('.aw-meal-category'):
for sub in element.css('.aw-meal.row.no-margin-xs'):
l = ItemLoader(item=Mensa_Meal(),response=response,selector=sub)
meal_name = sub.css('p.aw-meal-description::text').get()
try:
meal_price = sub.css('.col-sm-2.no-padding-xs.aw-meal-price::text').get().replace('€','').strip()
except:
meal_price = 0
try:
meal_attributes = sub.css('.small.aw-meal-attributes span::text').extract_first().replace(u'\xa0', u'')
except:
meal_attributes = ''
#if not meal_price == None:
l.add_value('mensa_name', mensa_name)
l.add_value('date_of_meal_plan', date)
l.add_value('meal_name', meal_name)
l.add_value('meal_attributes', meal_attributes)
l.add_value('meal_price', meal_price)
yield l.load_item()
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
from itemadapter import ItemAdapter
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags
from scrapy.item import Item
class Mensen_Table(scrapy.Item):
state_name = scrapy.Field()
state_link = scrapy.Field()
city_name = scrapy.Field()
city_link = scrapy.Field()
uni_name = scrapy.Field()
mensa_name = scrapy.Field()
mensen_link = scrapy.Field()
mensa_address = scrapy.Field()
mensen_rating_avg = scrapy.Field()
mensen_rating_count = scrapy.Field()
five_star_ratings = scrapy.Field()
four_star_ratings = scrapy.Field()
three_star_ratings = scrapy.Field()
two_star_ratings = scrapy.Field()
one_star_ratings = scrapy.Field()
class Mensa_Meal(scrapy.Item):
mensa_name = scrapy.Field()
date_of_meal_plan = scrapy.Field()
meal_name = scrapy.Field()
meal_attributes = scrapy.Field()
meal_price = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import item
from scrapy.exceptions import DropItem
from itemadapter import ItemAdapter
import sqlite3
# useful for handling different item types with a single interface
from items import Mensen_Table, Mensa_Meal
class IwCrawlerPipeline:
def __init__(self):
self.con = sqlite3.connect('imensa.db')
self.cur = self.con.cursor()
self.create_table_mensen()
self.create_table_meal()
def create_table_mensen(self):
print('TABLE MENSEN CREATED')
self.cur.execute("""CREATE TABLE IF NOT EXISTS mensen_table (
state_name TEXT,
city_name TEXT,
uni_name TEXT,
mensa_name TEXT,
mensa_address LONG,
mensen_rating_avg FLOAT,
mensen_rating_count TEXT)
""")
def create_table_meal(self):
return print('TABLE MEAL CREATED')
self.cur.execute("""CREATE TABLE IF NOT EXISTS meal_table (
mensa_name TEXT,
date_of_meal_plan DATE,
meal_name LONG,
meal_attributes LONG,
meal_price FLOAT)
""")
def process_item(self, item, spider):
if isinstance(item, Mensen_Table):
print('MENSEN TABLE ITEM PROCESSED')
self.cur.execute("""INSERT INTO mensen_table (state_name, city_name, uni_name, mensa_name, mensa_address, mensen_rating_avg, mensen_rating_count) VALUES (?, ?, ?, ?, ?, ?, ?)""",
(item['state_name'], item['city_name'], item['uni_name'], item['mensa_name'], item['mensa_address'], item['mensen_rating_avg'], item['mensen_rating_count']))
self.con.commit()
return item
if isinstance(item, Mensa_Meal):
print('MEAL TABLE ITEM PROCESSED')
self.cur.execute("""INSERT INTO meal_table (mensa_name, date_of_meal_plan, meal_name, meal_attributes, meal_price) VALUES (?, ?, ?, ?, ?)""",
(item['mensa_name'], item['date_of_meal_plan'], item['meal_name'], item['meal_attributes'], item['meal_price']))
self.con.commit()
return item
What am I doing wrong? The items are displayed properly, but they don't reach the database file.
Any help would be highly appreciated!
The only problem I see is that you never close the connection to the database. I am not sure this alone will solve your issue, but it might, and it is good practice anyway.
In your pipeline, add a close_spider method that closes the connection to the database. This method is triggered automatically by the Scrapy engine just before it deletes the spider and closes the Twisted reactor.
class IwCrawlerPipeline:
def __init__(self):
self.con = sqlite3.connect('imensa.db')
self.cur = self.con.cursor()
self.create_table_mensen()
self.create_table_meal()
...
def close_spider(self, spider):
self.con.close()
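One more thing worth double-checking, unrelated to closing the connection: as the boilerplate comment in pipelines.py says, process_item only runs if the pipeline is registered in ITEM_PIPELINES. A minimal sketch of the settings.py entry, where 'iw_crawler' is an assumed package name (use your project's actual module path):
# settings.py
ITEM_PIPELINES = {
    # 'iw_crawler' is an assumption based on the class name; adjust to your project package
    'iw_crawler.pipelines.IwCrawlerPipeline': 300,
}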
I am attempting to paginate through the data table on this page, located below the search form.
My code scrapes the first page successfully, and I can click the next button (using Selenium) to get the next page of results.
However, creating a Response instance and passing it to self.parse() does not work:
page_source = self.driver.page_source
r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
print(" >>>> calling self.parse again")
return self.parse(r)
Also, even though (if you analyze the call stack) I am ultimately returning None from self.parse, I get this warning when running this Scrapy spider:
The "SignalStartSpider.parse" method is a generator and includes a "return" statement with a value different than None. This could lead to unexpected behaviour. Please see https://docs.python.org/3/reference/simple_stmts.html#the-return-statement for details about the semantics of the "return" statement within generators
warn_on_generator_with_return_value(spider, callback)
Here is my current source code:
# -*- coding: utf-8 -*-
import scrapy
from behold import Behold
import html_text
import durations
from selenium import webdriver
URL_20 = "https://www.signalstart.com/search-signals"
URL_1000="https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=1000&p=1&z=0.410257937140464"
class Provider(scrapy.Item):
rank = scrapy.Field()
name = scrapy.Field()
gain = scrapy.Field()
pips = scrapy.Field()
drawdown = scrapy.Field()
trades = scrapy.Field()
type = scrapy.Field()
monthly = scrapy.Field()
# chart = scrapy.Field()
price = scrapy.Field()
age = scrapy.Field()
# added = scrapy.Field()
# action = scrapy.Field()
won = scrapy.Field()
profit_factor = scrapy.Field()
daily = scrapy.Field()
monthly = scrapy.Field()
def raw_page_url(i=1):
"""
Return raw page of 100 results. There are 8 such pages
:param i: which page number
:return:
"""
return "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=100&p={}&z=0.024967722664414493".format(i)
class SignalStartSpider(scrapy.Spider):
page = 1
name = 'signalstart'
start_urls = [
# raw_page_url(page),
URL_20
]
def __init__(self):
#self.driver = webdriver.Firefox(executable_path = r'C:\Users\terre\AppData\Local\Taurus\bin\geckodriver.exe')
self.driver = webdriver.Firefox(executable_path=r'/cygdrive/c/Users/terre/AppData/Local/Taurus/bin/geckodriver.exe')
def parse_details(self, response):
class Details(scrapy.Item):
xpath = scrapy.Field()
extractor = scrapy.Field() # I thought different fields would be extracted differently. But turns out they dont.
fields = {
'won': Details(),
'profit_factor': Details(),
'daily': Details(),
'monthly': Details()
}
fields['won']['xpath'] = "//li[contains(text(),'Won:')]"
fields['profit_factor']['xpath'] = "//li[@class='list-group-item popovers']"
fields['daily']['xpath'] = "//li[contains(text(),'Daily:')]"
fields['monthly']['xpath'] = "//li[contains(text(),'Monthly:')]"
for field, field_processor in fields.items():
print(f" Process {field}")
elem = response.xpath(field_processor['xpath'])
_, value = html_text.extract_text(elem.get()).split(':')
response.meta["data_row"][field] = value
yield response.meta["data_row"]
def parse(self, response):
print(" >>>>>> URL of the response object is {}".format(response.url))
if len (response.url) > 10:
self.driver.get(response.url)
cols = "rank name gain pips drawdown trades type monthly chart price age added action"
skip = [7, 8, 11, 12]
def age_to_months(t):
t = t.replace('m', 'M')
d = durations.Duration(t);
return d.to_months()
postprocess = {
'age': lambda t: age_to_months(t)
}
td = dict()
for i, col in enumerate(cols.split()):
td[i] = col
Behold().show('td')
for provider in response.xpath("//div[@class='row']//tr"):
data_row = Provider()
Behold().show('provider')
details_url = None
for i, datum in enumerate(provider.xpath('td')):
Behold().show('i', 'datum')
if i == 1: # name
details_url = datum.css("a::attr(href)").get()
if i in skip:
print(".....skipping")
continue
text = html_text.extract_text(datum.get())
column_name = td[i]
if column_name in postprocess:
text = postprocess[column_name](text)
data_row[column_name] = text
if details_url:
yield scrapy.Request(url=details_url, callback=self.parse_details, meta={'data_row': data_row})
print("------------------------------- next page logic --------------------------------------")
next = self.driver.find_element_by_css_selector('.fa-angle-right')
if next is not None:
print(" **** NEXT IS -NOT- NONE")
next.click()
page_source = self.driver.page_source
r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
print(" >>>> calling self.parse again")
return self.parse(r)
else:
print(" **** NEXT IS NONE")
return None
# next_page = response.css('.fa-angle-right').get()
# if next_page is not None:
# yield response.follow(next_page, self.parse)
Instead of recursively calling self.parse, it is better to use a while loop and simply rebind the Response instance to a new HtmlResponse built from the Selenium webdriver's page_source. Working code:
# -*- coding: utf-8 -*-
import scrapy
from behold import Behold
import html_text
import durations
from selenium import webdriver
URL_20 = "https://www.signalstart.com/search-signals"
URL_1000="https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=1000&p=1&z=0.410257937140464"
class Provider(scrapy.Item):
rank = scrapy.Field()
name = scrapy.Field()
gain = scrapy.Field()
pips = scrapy.Field()
drawdown = scrapy.Field()
trades = scrapy.Field()
type = scrapy.Field()
monthly = scrapy.Field()
# chart = scrapy.Field()
price = scrapy.Field()
age = scrapy.Field()
# added = scrapy.Field()
# action = scrapy.Field()
won = scrapy.Field()
profit_factor = scrapy.Field()
daily = scrapy.Field()
monthly = scrapy.Field()
def raw_page_url(i=1):
"""
Return raw page of 100 results. There are 8 such pages
:param i: which page number
:return:
"""
return "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=100&p={}&z=0.024967722664414493".format(i)
class SignalStartSpider(scrapy.Spider):
page = 1
name = 'signalstart'
start_urls = [
# raw_page_url(page),
URL_20
]
def __init__(self):
#self.driver = webdriver.Firefox(executable_path = r'C:\Users\terre\AppData\Local\Taurus\bin\geckodriver.exe')
self.driver = webdriver.Firefox(executable_path=r'/cygdrive/c/Users/terre/AppData/Local/Taurus/bin/geckodriver.exe')
def parse_details(self, response):
class Details(scrapy.Item):
xpath = scrapy.Field()
extractor = scrapy.Field() # I thought different fields would be extracted differently. But turns out they dont.
fields = {
'won': Details(),
'profit_factor': Details(),
'daily': Details(),
'monthly': Details()
}
fields['won']['xpath'] = "//li[contains(text(),'Won:')]"
fields['profit_factor']['xpath'] = "//li[@class='list-group-item popovers']"
fields['daily']['xpath'] = "//li[contains(text(),'Daily:')]"
fields['monthly']['xpath'] = "//li[contains(text(),'Monthly:')]"
for field, field_processor in fields.items():
print(f" Process {field}")
elem = response.xpath(field_processor['xpath'])
_, value = html_text.extract_text(elem.get()).split(':')
response.meta["data_row"][field] = value
yield response.meta["data_row"]
def parse(self, response):
print(" >>>>>> URL of the response object is {}".format(response.url))
if len (response.url) > 10:
self.driver.get(response.url)
cols = "rank name gain pips drawdown trades type monthly chart price age added action"
skip = [7, 8, 11, 12]
def age_to_months(t):
t = t.replace('m', 'M')
d = durations.Duration(t);
return d.to_months()
postprocess = {
'age': lambda t: age_to_months(t)
}
td = dict()
for i, col in enumerate(cols.split()):
td[i] = col
Behold().show('td')
while True:
for provider in response.xpath("//div[@class='row']//tr"):
data_row = Provider()
Behold().show('provider')
details_url = None
for i, datum in enumerate(provider.xpath('td')):
Behold().show('i', 'datum')
if i == 1: # name
details_url = datum.css("a::attr(href)").get()
if i in skip:
print(".....skipping")
continue
text = html_text.extract_text(datum.get())
column_name = td[i]
if column_name in postprocess:
text = postprocess[column_name](text)
data_row[column_name] = text
if details_url:
yield scrapy.Request(url=details_url, callback=self.parse_details, meta={'data_row': data_row})
print("------------------------------- next page logic --------------------------------------")
next = self.driver.find_element_by_css_selector('.fa-angle-right')
if next is not None:
print(" **** NEXT IS -NOT- NONE")
next.click()
page_source = self.driver.page_source
r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
print(" >>>> looping self.parse again")
response = r
else:
print(" **** NEXT IS NONE")
break
# next_page = response.css('.fa-angle-right').get()
# if next_page is not None:
# yield response.follow(next_page, self.parse)
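A side note on the warning quoted in the question: Scrapy emits it because parse is a generator (it contains yield) yet also returns a value, and a value returned from a generator only ends up on StopIteration, where Scrapy never reads it, so any items produced by the recursive call would be lost. A minimal illustration, not tied to this spider (parse_next is a hypothetical generator callback):
def callback_bad(response):
    yield {'page': 1}
    return parse_next(response)      # the returned generator is silently discarded - this triggers the warning

def callback_ok(response):
    yield {'page': 1}
    yield from parse_next(response)  # delegate to the other generator so its items are actually yielded
The while loop above sidesteps the issue entirely, because every item and request is yielded from a single generator.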
my spider: autospd.py
class AutospdSpider(scrapy.Spider):
name = 'autospd'
start_urls = ['http://news.dayoo.com/guangzhou/150960_2.shtml']
dt_ft = "%Y-%m-%d %H:%M"
def parse(self, response):
list_objs = response.css("div.dy-list>div")
for li in list_objs:
loader = AutopjtItemLoader(item=AutopjtItem(), selector=li, context=self.dt_ft)
print(loader.context.items()) #please see print-1
loader.nested_css("h2>a").add_css("title", "::text")
loader.nested_css("h2>a").add_css("url", "::attr(href)")
loader.nested_css("div.txt-area>div.news-time").add_xpath("pub_time", "string()")
yield loader.load_item()
print-1: dict_items([('context', '%Y-%m-%d %H:%M'), ('selector', <Selector ...>), ('response', None), ('item', {})])
items.py
def func(value, loader_context):
print(loader_context.items()) # please see print-2
# ft = loader_context.get("context")
# time_dt = datetime.strptime(value, ft)
return value
class AutopjtItemLoader(ItemLoader):
default_output_processor = TakeFirst()
pub_time_in = MapCompose(func)
class AutopjtItem(scrapy.Item):
title = scrapy.Field()
url = scrapy.Field()
pub_time = scrapy.Field()
print-2: [('selector', [<Selector ... 2019-06-12 08:59 ...>]), ('response', None), ('item', {})]
Why is "context" missing from loader_context?
def nested_xpath(self, xpath, **context):
selector = self.selector.xpath(xpath)
context.update(selector=selector)
subloader = self.__class__(
item=self.item, parent=self, **context
)
return subloader
def nested_css(self, css, **context):
selector = self.selector.css(css)
context.update(selector=selector)
subloader = self.__class__(
item=self.item, parent=self, **context
)
return subloader
As Scrapy's source code above shows, the subloader created by nested_css or nested_xpath only receives the context you pass to it explicitly (plus the selector), so you must add your context yourself, e.g.:
loader.nested_css("div.txt-area>div.news-time", dt_ft=self.dt_ft).add_xpath("pub_time", "string()")
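To illustrate, once the key is passed through nested_css, the func input processor from items.py can read it via loader_context. A minimal sketch based on the commented-out strptime lines in the question, assuming the dt_ft key from the call above (on older Scrapy versions the processors live in scrapy.loader.processors instead of itemloaders.processors):
from datetime import datetime

from itemloaders.processors import MapCompose, TakeFirst
from scrapy.loader import ItemLoader

def func(value, loader_context):
    # 'dt_ft' is only present because it was passed into nested_css(...) above
    ft = loader_context.get('dt_ft', '%Y-%m-%d %H:%M')
    try:
        return datetime.strptime(value.strip(), ft)
    except ValueError:
        return value  # leave the raw text untouched if it does not match the format

class AutopjtItemLoader(ItemLoader):
    default_output_processor = TakeFirst()
    pub_time_in = MapCompose(func)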
Scrapy does not work for pagination pages > 1, although the generated links are correct. My code:
Linux, Debian 9, Python 3.5, MongoDB, Scrapy, Scrapy-Splash
code
import scrapy
import copy
import datetime
import json
import pymongo
from webscrapy.items import WebscrapyItem
from scrapy.conf import settings
from bson.objectid import ObjectId
class YooxSpiderSpider(scrapy.Spider):
name = 'yoox-spider'
allowed_domains = ['yoox.com']
base_url = 'https://www.yoox.com'
job = {}
start_url = ''
splash_url = ''
connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
settings['MONGODB_PORT'])
db = connection[settings['MONGODB_DB']]
def __init__(self, job_id):
self.job = self.db.jobs.find_one({'_id':ObjectId(job_id)})
self.start_url = self.job['start_url']
self.splash_url = self.job['splash_url']
def start_requests(self):
# job['start_url'] - This is the starting link for the desired category, for example Yoox/Woman or Yoox/Men
print("------------- start ---------")
yield scrapy.Request(url=''.join((self.splash_url, self.start_url)), callback=self.parse)
def parse(self, response):
for cat in [response.xpath(
"//div[@id='teleyooxCategories']/div[@class='teleyoox-section-content']/div[@class='teleyoox-section-padding']/ul[@class='text-size-default']/li")[0]]:
#url_category = response.urljoin('/render.html?url=https://www.yoox.com' + cat.xpath('./a/@href').extract_first())
sub_url_category = cat.xpath('./a/@href').extract_first()
if sub_url_category:
url_category = ''.join((self.base_url, cat.xpath('./a/@href').extract_first()))
Item = WebscrapyItem()
Item['job_id'] = self.job['_id']
Item['basecat'] = self.job['basecat']
Item['gender'] = self.job['gender']
Item['category'] = cat.xpath('./a/text()').extract_first().strip()
Item['url_category'] = url_category
yield scrapy.Request(url=''.join((self.splash_url, url_category)), meta={'Item': Item}, callback=self.parse_data, dont_filter=True)
def parse_data(self, response):
print('')
print('')
print(' ++++++++ current page ++++++++ ', response.url)
print('')
print('')
# Getting product references and product_id
for product in response.xpath("//div[@id='itemsGrid']/div[@id='srpage1']/div[@class='col-8-24']"):
sub_url_product = product.xpath('./div/div/a/@href').extract_first()
#url_product = response.urljoin('/render.html?url=https://www.yoox.com' + product.xpath('./div/div/a/@href').extract_first())
if sub_url_product:
Item = copy.deepcopy(response.meta['Item'])
product_id = product.xpath('./div/@id').extract_first()
price = product.xpath("./div/div[@class='itemData text-center']/a[@class='itemlink']/div[@class='price']/span/text()").extract()[-1][2:]
sizes = [size for size in product.xpath("./div/div/a[@class='itemlink']/div[@class='colorSize']/div[@class='size text-light']/span/text()").extract()]
available_products = {
'basecat': Item['basecat'],
'category': Item['category'],
'job_id': Item['job_id'],
'product_id': product_id,
}
#if not self.db.data.find(available_products).count():
#print('NEW product: ', product_id)
cutout_images = [
product.xpath("./div/div/a/img/#data-original").extract_first(),
product.xpath("./div/div/a/img/#rel").extract_first(),
]
Item['dt'] = datetime.datetime.utcnow()
Item['product_id'] = product_id
Item['url_product'] = ''.join((self.base_url, sub_url_product))
Item['data'] = {
'sku':'',
'date':'',
'cutout_image': cutout_images,
'data-category': product.xpath("./div/@data-category").extract_first(),
'microcategory': product.xpath("./div/div/a[@class='itemlink']/div[@class='microcategory font-sans']/text()").extract_first().strip(),
'description':'',
'price': price,
# currency - taken from the first character of the product's price string
'currency': product.xpath("./div/div[@class='itemData text-center']/a[@class='itemlink']/div[@class='price']/span/text()").extract()[-1][0],
'brand': product.xpath("./div/div/a[@class='itemlink']/div[@class='brand font-bold text-uppercase']/text()").extract_first(),
'merchant':'',
'sizes':sizes,
#response.xpath().extract_first()
}
#yield scrapy.Request(url=''.join((self.splash_url, Item['url_product'])), meta={'Item': Item}, callback=self.parse_details, dont_filter=True)
yield Item
#next_page_url = response.xpath("//div[@id='navigation-bar-top']/div[@class='col-6-24']/div[@id='pagination-lite']/a[@class='pure-menu-item nextPage js-track-me']/@href").extract_first()
next_page_url = response.xpath(
"//div[@id='navigation-bar-bottom']/div[@class='col-16-24']/ul[@class='pagination list-inline pull-right text-center js-pagination']/li[@class='next-page']/a/@href").extract_first()
if next_page_url:
print('')
print('')
print(' ++++++++ next page ++++++++ ', next_page_url)
print('')
print('')
yield scrapy.Request(url=''.join((self.splash_url, next_page_url)), callback=self.parse_data, dont_filter=True)
else:
print(' ++++++++ NEXT CATEGORY ++++++++ ')
pass
def parse_details(self, response):
# Deep-copy the item to avoid mixing data between requests
Item = copy.deepcopy(response.meta['Item'])
#other_data = json.loads(response.xpath('//section[@id="product"]/script[@type="application/ld+json"]//text()').extract_first())
Item['details'] = {
'header': response.xpath("//div[@id='itemData']/div[@id='js-item-details']/div[@id='itemTitle']/h1/a/text()").extract_first().strip(),
'price': response.xpath("//div[@id='itemData']/div[@id='js-item-details']/div[@id='item-price']/span[@class='font-bold']/span[1]/text()").extract_first(),
'priceCurrency': response.xpath("//div[@id='itemData']/div[@id='js-item-details']/div[@id='item-price']/span[@class='font-bold']/span[2]/@content").extract_first(),
#'colorName': response.xpath("//div[@id='js-item-color-size']/div[@id='itemColors']/div[@class='dataTitleBox font-bold text-uppercase text-size-xs margin-bottom']/span[@class='select-color-size-label']/text()").extract_first(),
#'reference': response.xpath("//div[@class='info-section']/div[@class='product-info-wrapper _product-info'][1]/p[@class='product-color']/span[2]/text()").extract_first(),
'description': response.xpath("//div[@id='itemContent']/div[@class='row text-size-default info-2cols']/div[@class='info-col-1 item-info-column col-1-2']/ul/li[@id='itemDescription']/div[@class='info-body font-sans padding-half-top']/text()").extract_first(),
#'sizeList': response.xpath("//div[@id='js-item-details']/div[@id='js-item-color-size']/div[@id='itemSizes']/ul").extract_first(),
#'other_data': other_data,
}
print('')
print('')
print(Item)
print('')
print('')
yield Item
Parsing works only for the first page of each category, even though the code yields a request with callback=self.parse_data:
if next_page_url:
yield scrapy.Request(url=''.join((self.splash_url, next_page_url)), callback=self.parse_data)
else:
pass
The console prints the URL for the new page=2, but every "current page" URL that parse_data actually receives belongs to a new category instead. Please help me.
++++++++ next page ++++++++ https://***/us/women/shoponline/underwear_mc/2#/dept=women&gender=D&page=2&attributes=%7b%27ctgr%27%3a%5b%27ntm%27%5d%7d&season=X
In Odoo 9 I override the search_read method. The super call works fine. With the returned data I want to apply a filter; the filter value is in the context, and it was assigned by the click of a button coming from the view.
<button name="status_instalacion" string="Instalación" type="action" icon="fa-wrench fa-2x" context="{'stage_id' : 1, 'current_id': active_id}"/>
The problem occurs when I inspect the context in the search_read method: it exists, but it doesn't contain the values I set.
context on click of button:
self._context
{u'lang': u'en_US', u'stage_id': 1, u'tz': False, u'uid': 1, u'current_id': 40, u'tipo_validacion': u'Sistemas Cr\xedticos', u'sistema_critico': u'AGUA'}
the stage_id is the value I want
context on search_read:
self._context
{u'lang': u'en_US', u'bin_size': True, u'tipo_validacion': u'Sistemas Cr\xedticos', u'tz': False, u'uid': 1,
u'active_test': False, u'sistema_critico': u'AGUA'}
As you can see, the 'stage_id' value is missing.
I also tried assigning the value to an attribute of the class, but the value never changes; it always keeps its initial value.
from logging import getLogger
from openerp import api, fields, models
_logger = getLogger(__name__)
class MgmtsystemSistemasEquipos(models.Model):
""" Equipos."""
_name = 'mgmtsystem.sistemas.equipos'
dmy = 99 # ---> this value never changes
def dummy(self): # ---> tried calling a function; doesn't work
return self.dmy
def set_dummy(self, id): # ----> set the value
self.dmy = id or self.dmy
codigo = fields.Char(
string=u'Código',
help=u"Código equipo",
required=True,
size=30)
name = fields.Char(
string=u'Nombre equipo',
required=True,
readonly=False,
index=True,
help="Nombre corto equipo",
size=30)
stage_id = fields.Many2one(
'mgmtsystem.action.stage',
'Fase',
default=_default_stage,
readonly=True)
@api.multi
def status_instalacion(self):
import pudb
pu.db
# save value to variable dmy to retrieve later
id = self._context.get('stage_id')
self.set_dummy(id)
@api.model
def search_read(
self, domain=None, fields=None, offset=0,
limit=None, order=None):
import pudb
pu.db
# here the variable allways has the original value (99)
current_stage_id = self.dmy
current_stage_id = self.dummy()
current_stage_id = getattr(self, dmy)
res = super(MgmtsystemSistemasEquipos, self).search_read(
domain, fields, offset, limit, order)
current_id = res[0]['id']
valid_protocols_ids = self._get_ids(
current_stage_id, current_id,
'mgmtsystem_equipos_protocolos',
'mgmtsystem_equipos_protocolos_rel',
'protocolo_id')
# # remove ids
res[0]['protocolos_ids'] = valid_protocols_ids
res[0]['informes_ids'] = valid_informes_ids
res[0]['anexos_ids'] = valid_anexos_ids
return res
# @api.multi
def _get_ids(self, current_stage_id, current_id, model, model_rel, field_rel):
import pudb
pu.db
# in this method the value of the variable is allways the original
current_stage_id = self.dummy()
sql = """ select a.id from
%s as a
join %s as b
on a.id = b.%s where b.equipo_id = %s
and a.stage_id = %s; """ % (model, model_rel, field_rel,
current_id, current_stage_id)
import psycopg2
try:
self.env.cr.execute(sql)
except psycopg2.ProgrammingError, ex:
message = 'Error trying to download data from server. \n {0} \n {1}'.format(ex.pgerror, sql)
_logger.info(message)
return False
rows = self.env.cr.fetchall()
list_of_ids = []
for row in rows:
list_of_ids.append(row[0])
return list_of_ids
I don't know Python very well, and that is the cause of my misunderstanding of how to read the value of the variable.
But then again, why is the context modified in the search_read method?
Thank you.
You should try the following.
@api.model
def search_read(self, domain=None, fields=None, offset=0, limit=None, order=None):
import pudb
pu.db
# Here you need to get the value from the context, falling back to the class attribute if it is missing.
current_stage_id = self._context.get('stage_id', self.dmy)
res = super(MgmtsystemSistemasEquipos, self).search_read(domain=domain, fields=fields, offset=offset, limit=limit, order=order)
current_id = res[0]['id']
valid_protocols_ids = self._get_ids(
current_stage_id, current_id,
'mgmtsystem_equipos_protocolos',
'mgmtsystem_equipos_protocolos_rel',
'protocolo_id')
# # remove ids
res[0]['protocolos_ids'] = valid_protocols_ids
res[0]['informes_ids'] = valid_informes_ids
res[0]['anexos_ids'] = valid_anexos_ids
return res
In your code those lines won't work simply because there is no recordset available in self (that is correct behaviour: search_read must have the @api.model decorator).
# here the variable allways has the original value (99)
current_stage_id = self.dmy
current_stage_id = self.dummy()
current_stage_id = getattr(self, dmy)
So just remove those lines and apply some other logic to get the data.