Scrapy Item Pipeline does not process items to SQLite

I use Scrapy to get data from a website (www.imensa.de) and need to persist those items into a SQLite3 database. The spider works fine, but I think something is wrong with the pipeline.
Within the pipeline, the new .db file is created properly with the specified tables and columns, but it never gets populated.
Spider.py
# Import library
from __future__ import absolute_import
import scrapy
from scrapy.loader import Item
from scrapy.loader import ItemLoader
from ..items import Mensen_Table
from ..items import Mensa_Meal
import datetime
from datetime import date
from datetime import timedelta

global base_url
global meal_plan_day_list
meal_plan_day_list = []

# Create Spider class
class mensen_crawler(scrapy.Spider):
    # Name of spider
    name = 'mensen'
    # Website to scrape
    allowed_domains = ['imensa.de']
    start_urls = ['https://www.imensa.de/']

    # STATE_NAME / STATE_LINK
    def parse(self, response):
        base_url = 'https://www.imensa.de/'
        # Loop through response to parse state_name & state_link
        for i, (name, link) in enumerate(zip(response.css('.group a::text'), response.css('.group a::attr(href)'))):
            # Condition to only get german states (16 states, index starting from 0 -> 15)
            if i < 16:
                state_name = name.get()
                state_link = base_url + link.get()
                yield scrapy.Request(state_link, callback=self.parse_layer2, cb_kwargs={'state_name': state_name, 'state_link': state_link})

    # CITY_NAME / CITY_LINK
    def parse_layer2(self, response, state_name, state_link):
        global base_url
        base_url = 'https://www.imensa.de/'
        for (city, link) in zip(response.css('.group a::text'), response.css('.group a::attr(href)')):
            city_name = city.get()
            yield print('current_city: ', city_name, ' (state: ', state_name, ')')
            city_link_part = link.get().split('/')[0]
            yield print('city_link_part: ', city_link_part)
            city_link_real = base_url + city_link_part + '/index.html'
            yield scrapy.Request(city_link_real, callback=self.parse_layer3, cb_kwargs={'state_name': state_name, 'state_link': state_link, 'city_name': city_name, 'city_link': city_link_real})

    # MENSA_NAME / MENSA_LINK
    def parse_layer3(self, response, state_name, state_link, city_name, city_link):
        base_url = 'https://www.imensa.de/'
        for group in response.css('.group'):
            uni_name = group.css('h2::text').get()
            yield print('UNI_NAME: ', uni_name)
            for mensa in group.css('.element'):
                mensa_name = mensa.css('a::text').get()
                yield print('mensa_name: ', mensa_name, ' (state: ', state_name, ') (city: ', city_name, ') (uni_name: ', uni_name, ')')
                mensa_link = base_url + city_link.replace('https://www.imensa.de/', '').split('/')[0] + '/' + mensa.css('a::attr(href)').get()
                yield print('mensa_link: ', mensa_link)
                yield scrapy.Request(mensa_link, callback=self.parse_layer4, cb_kwargs={'state_name': state_name,
                                                                                        'state_link': state_link,
                                                                                        'city_name': city_name,
                                                                                        'city_link': city_link,
                                                                                        'uni_name': uni_name,
                                                                                        'mensa_name': mensa_name,
                                                                                        'mensa_link': mensa_link
                                                                                        })

    # CREATE MENSA ITEM -----------------------------------------------------------------------------------------------------------------
    def parse_layer4(self, response, state_name, state_link, city_name, city_link, uni_name, mensa_name, mensa_link):
        l = ItemLoader(item=Mensen_Table(), response=response)
        try:
            rating_avg = response.css('.aw-ratings-average::text').get()
        except:
            rating_avg = 0
        try:
            rating_count = response.css('.aw-ratings-count::text').get()
        except:
            rating_count = 0
        address_list = []
        for address_element in response.css('a.panel-body::text'):
            address_list.append(address_element.get())
        mensa_location = ', '.join(address_list)
        yield print('mensa_location: ', mensa_location)
        yield print('parse_layer_3 -- rating_avg: ', rating_avg)
        yield print('parse_layer_3 -- rating_count: ', rating_count)
        l.add_value('state_name', state_name)
        l.add_value('state_link', state_link)
        l.add_value('city_name', city_name)
        l.add_value('city_link', city_link)
        l.add_value('uni_name', uni_name)
        l.add_value('mensa_name', mensa_name)
        l.add_value('mensen_link', mensa_link)
        l.add_value('mensa_address', mensa_location)
        l.add_value('mensen_rating_avg', rating_avg)
        l.add_value('mensen_rating_count', rating_count)
        yield l.load_item()

        for i, x in enumerate(response.css('.col-md-4.no-padding-xs .list-group')):
            if i == 0:
                date_list = x.css('.pull-right::text').extract()
                day_list = x.css('a::text').extract()
                link_list = x.css('a::attr(href)').extract()

        yield print('date_list: ', date_list)
        yield print('day_list: ', day_list)
        yield print('link_list: ', link_list)
        yield print('mensa_link: ', mensa_link)

        # PROCESS DATE LIST
        # ------------------------------------------
        meal_plan_date_list = []
        for ele in date_list:
            if ele == 'heute':
                today = datetime.date.today().strftime('%d.%m.%Y')
                meal_plan_date_list.append(today)
            elif ele == 'morgen':
                today = datetime.date.today()
                tomorrow = today + datetime.timedelta(days=1)
                meal_plan_date_list.append(tomorrow.strftime('%d.%m.%Y'))
            else:
                meal_plan_date_list.append(ele)
        yield print('meal_plan_date_list: ', meal_plan_date_list)

        # PROCESS LINK LIST
        # --------------------------------------------
        meal_plan_link_list = []
        for ele in day_list:
            link = mensa_link.replace('index.html', '') + ele.lower() + '.html'
            meal_plan_link_list.append(link)
        yield print('meal_plan_link_list: ', meal_plan_link_list)

        #meal_plan_list = []
        #meal_plan_prep_list = zip(meal_plan_date_list, day_list, meal_plan_link_list)
        #for item in meal_plan_prep_list:
        #    yield print('meal_plan_list_item: ', item)

        for date, day, link in zip(meal_plan_date_list, day_list, meal_plan_link_list):
            yield scrapy.Request(link, callback=self.parse_layer5, cb_kwargs={'mensa_name': mensa_name, 'mensa_link': link, 'day': day, 'date': date})

    # PARSE MEAL PLAN --------------------------------------------------------------------------
    def parse_layer5(self, response, mensa_name, mensa_link, day, date):
        for element in response.css('.aw-meal-category'):
            for sub in element.css('.aw-meal.row.no-margin-xs'):
                l = ItemLoader(item=Mensa_Meal(), response=response, selector=sub)
                meal_name = sub.css('p.aw-meal-description::text').get()
                try:
                    meal_price = sub.css('.col-sm-2.no-padding-xs.aw-meal-price::text').get().replace('€', '').strip()
                except:
                    meal_price = 0
                try:
                    meal_attributes = sub.css('.small.aw-meal-attributes span::text').extract_first().replace(u'\xa0', u'')
                except:
                    meal_attributes = ''
                #if not meal_price == None:
                l.add_value('mensa_name', mensa_name)
                l.add_value('date_of_meal_plan', date)
                l.add_value('meal_name', meal_name)
                l.add_value('meal_attributes', meal_attributes)
                l.add_value('meal_price', meal_price)
                yield l.load_item()
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
from itemadapter import ItemAdapter
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags
from scrapy.item import Item
class Mensen_Table(scrapy.Item):
    state_name = scrapy.Field()
    state_link = scrapy.Field()
    city_name = scrapy.Field()
    city_link = scrapy.Field()
    uni_name = scrapy.Field()
    mensa_name = scrapy.Field()
    mensen_link = scrapy.Field()
    mensa_address = scrapy.Field()
    mensen_rating_avg = scrapy.Field()
    mensen_rating_count = scrapy.Field()
    five_star_ratings = scrapy.Field()
    four_star_ratings = scrapy.Field()
    three_star_ratings = scrapy.Field()
    two_star_ratings = scrapy.Field()
    one_star_ratings = scrapy.Field()

class Mensa_Meal(scrapy.Item):
    mensa_name = scrapy.Field()
    date_of_meal_plan = scrapy.Field()
    meal_name = scrapy.Field()
    meal_attributes = scrapy.Field()
    meal_price = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import item
from scrapy.exceptions import DropItem
from itemadapter import ItemAdapter
import sqlite3
# useful for handling different item types with a single interface
from items import Mensen_Table, Mensa_Meal
class IwCrawlerPipeline:

    def __init__(self):
        self.con = sqlite3.connect('imensa.db')
        self.cur = self.con.cursor()
        self.create_table_mensen()
        self.create_table_meal()

    def create_table_mensen(self):
        print('TABLE MENSEN CREATED')
        self.cur.execute("""CREATE TABLE IF NOT EXISTS mensen_table (
                            state_name TEXT,
                            city_name TEXT,
                            uni_name TEXT,
                            mensa_name TEXT,
                            mensa_address LONG,
                            mensen_rating_avg FLOAT,
                            mensen_rating_count TEXT)
                            """)

    def create_table_meal(self):
        return print('TABLE MEAL CREATED')
        self.cur.execute("""CREATE TABLE IF NOT EXISTS meal_table (
                            mensa_name TEXT,
                            date_of_meal_plan DATE,
                            meal_name LONG,
                            meal_attributes LONG,
                            meal_price FLOAT)
                            """)

    def process_item(self, item, spider):
        if isinstance(item, Mensen_Table):
            print('MENSEN TABLE ITEM PROCESSED')
            self.cur.execute("""INSERT INTO mensen_table (state_name, city_name, uni_name, mensa_name, mensa_address, mensen_rating_avg, mensen_rating_count) VALUES (?, ?, ?, ?, ?, ?, ?)""",
                             (item['state_name'], item['city_name'], item['uni_name'], item['mensa_name'], item['mensa_address'], item['mensen_rating_avg'], item['mensen_rating_count']))
            self.con.commit()
            return item

        if isinstance(item, Mensa_Meal):
            print('MEAL TABLE ITEM PROCESSED')
            self.cur.execute("""INSERT INTO meal_table (mensa_name, date_of_meal_plan, meal_name, meal_attributes, meal_price) VALUES (?, ?, ?, ?, ?)""",
                             (item['mensa_name'], item['date_of_meal_plan'], item['meal_name'], item['meal_attributes'], item['meal_price']))
            self.con.commit()
            return item
What am I doing wrong? The items are displayed properly, but they don't reach the database file.
Any help would be highly appreciated!

The only problem I see is that you never close the connection to the database. I am not sure this will solve your issue, but it might, and it is good practice in any case.
In your pipeline, add a close_spider method that closes the connection to the database. This method is triggered automatically by the Scrapy engine just before it deletes the spider and closes the Twisted reactor.
class IwCrawlerPipeline:

    def __init__(self):
        self.con = sqlite3.connect('imensa.db')
        self.cur = self.con.cursor()
        self.create_table_mensen()
        self.create_table_meal()

    ...

    def close_spider(self, spider):
        self.con.close()
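For illustration only, here is a minimal sketch (not the code from the question) of the same pipeline idea with the connection opened in open_spider, a commit per item in process_item, and the connection closed in close_spider. The class name MealPipeline is invented; the table and field names are taken from the question's Mensa_Meal item.

import sqlite3

class MealPipeline:

    def open_spider(self, spider):
        # called once when the spider is opened
        self.con = sqlite3.connect('imensa.db')
        self.cur = self.con.cursor()
        self.cur.execute("""CREATE TABLE IF NOT EXISTS meal_table (
                            mensa_name TEXT,
                            date_of_meal_plan TEXT,
                            meal_name TEXT,
                            meal_attributes TEXT,
                            meal_price TEXT)""")
        self.con.commit()

    def process_item(self, item, spider):
        # insert one row per scraped item and commit immediately
        self.cur.execute(
            "INSERT INTO meal_table VALUES (?, ?, ?, ?, ?)",
            (item.get('mensa_name'), item.get('date_of_meal_plan'),
             item.get('meal_name'), item.get('meal_attributes'), item.get('meal_price')))
        self.con.commit()
        return item

    def close_spider(self, spider):
        # triggered by the engine just before the spider is closed
        self.con.close()

As with the original pipeline, the class still has to be registered in ITEM_PIPELINES in settings.py, otherwise process_item is never called.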

Related

Scraper not grabbing all text in list/div

My scraper seems to skip information. For example, I want to extract all the countries and the highway codes belonging to each country, but when I iterate over this I only get one country and one highway code per list of numbered highways.
How do I get all the highway routes and their respective countries?
Here is the scraper I am working with:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst, MapCompose
from itemloaders import ItemLoader

class roadItem(scrapy.Item):
    highway_names = Field(output_processor=TakeFirst())
    country = Field(output_processor=TakeFirst())
    route_name = Field(output_processor=TakeFirst())

class roadSpider(scrapy.Spider):
    name = "road"
    start_urls = ["https://en.wikipedia.org/w/index.php?title=Category:Lists_of_roads_sharing_the_same_title&pageuntil=092%0AList+of+highways+numbered+92#mw-pages"]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse
            )

    def parse(self, response):
        list_data = response.xpath('(//div[@class="mw-category-group"][last()])[2]//ul')
        for items in list_data:
            for link in items.xpath(".//a/@href").getall():
                yield response.follow(
                    response.urljoin(link),
                    callback=self.parse_roads
                )
        next_page = response.xpath("//div[@id='mw-pages']/a[1]//@href").get()
        if next_page is not None:
            yield response.follow(
                response.urljoin(next_page),
                callback=self.parse
            )

    def parse_roads(self, response):
        loader = ItemLoader(roadItem())
        loader.add_value("highway_names", response.xpath("//h1[@id='firstHeading']//text()").get())
        data = response.xpath("//div[@id='mw-content-text']")
        for list_h2 in data:
            if list_h2.xpath("(((//h2)[position() >1]//span)[position() mod 4=1])[position() < last()]//text()").get():
                loader.add_value("country", list_h2.xpath("(((//h2)[position() >1]//span)[position() mod 4=1])[position() < last()]//text()").get())
            else:
                loader.add_value("country", list_h2.xpath("(//h2//span)[1]//text()").get())
            if list_h2.xpath("(//div[@class='mw-parser-output']//ul)[position() > 1 and position() < last()]//li//@title").getall():
                for routes in list_h2.xpath("(//div[@class='mw-parser-output']//ul)[position() > 1 and position() < last()]//li//@title").getall():
                    loader.add_value('route_name', routes)
            else:
                for routess in list_h2.xpath("//ul//li//@title").getall():
                    loader.add_value('route_name', routess)
            yield loader.load_item()

scrapy-playwright returning nothing but an error

I'm learning Scrapy-playwright and it's fighting me. I'm attempting to gather store locations from a site using the CrawlSpider with a rule including a process_request that triggers the request to run through playwright. In my callback def I can print a value found on the page, but not return or yield anything. I've attempted to cache the data into an item, and return/yield a dict, all of which produces the error.
ERROR: Spider must return request, item, or None, got 'Deferred'
I'm stumped.
import re
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from banners.items import StoreItem
from scrapy_playwright.page import PageCoroutine
from scrapy.http.response import Response

def set_playwright_true(request, response):
    request.meta["playwright"] = True
    request.meta["playwright_include_page"] = True
    request.meta["playwright_page_coroutines"] = ('wait_for_selector', 'span.store-name-city')
    return request

class StoreSpider(CrawlSpider):
    name = "retailer"
    allowed_domains = ['retailer.com']
    start_urls = ['https://www.retailer.com/store/0000-city-ak']
    custom_settings = {
        'ROBOTSTXT_OBEY': True,
        #'DOWNLOAD_DELAY': .5,
        #'CONCURRENT_REQUESTS_PER_DOMAIN': 3,
        'DOWNLOAD_HANDLERS': {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    }
    rules = (
        Rule(LinkExtractor(allow=('directory/ak/anchorage'))),
        Rule(LinkExtractor(allow=(r'store/[0-9]+'), deny=(r'store/[0-9]+.+/.+')), callback='parse_item', follow=False, process_request=set_playwright_true),
    )

    async def parse_item(self, response):
        items = []
        item = StoreItem()
        self.logger.info('***** Start processing ' + response.url + '. *****')
        Name = response.css('meta[itemprop=alternateName]').attrib['content'] + ' - ' + response.css('span.store-name-city::text').get()
        print(Name)
        item['Name'] = Name
        item['StoreID'] = response.css('meta[itemprop=storeID]').attrib['content']
        item['Address1'] = response.css('span.store-address-line-1::text').get()
        item['City'] = response.css('span.store-address-city::text').get()
        item['State'] = response.css('span.store-address-state::text').get()
        item['Zip'] = response.css('span.store-address-postal::text').get()
        item['Phone'] = response.css('div.store-phone::text').get()
        item['Latitude'] = response.css('meta[itemprop=latitude]').attrib['content']
        item['Longitude'] = response.css('meta[itemprop=longitude]').attrib['content']
        items.append(item)
        return(items)
Changing parse_item from an async def to a plain def resolved the issue.
async def parse_item(self, response):
changed to
def parse_item(self, response):
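For clarity, here is a minimal sketch (not the asker's full callback, only a couple of fields taken from the question's StoreItem) of the corrected plain def, which yields the item directly instead of returning a list:

def parse_item(self, response):
    item = StoreItem()
    # same selectors as in the question, shortened here
    item['Name'] = response.css('span.store-name-city::text').get()
    item['StoreID'] = response.css('meta[itemprop=storeID]').attrib.get('content')
    yield item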

How to parse two different items in scrapy?

I am using Scrapy 2.1 for parsing a category result page.
There are two different things I want to scrape from that site:
1. Category information, e.g. title and URL
2. Product items within that category page
Number 2 works, but I am struggling with how to implement the storage of the category info. My first attempt is to create another item class, CatItem:
class CatItem(scrapy.Item):
    title = scrapy.Field()  # char -
    url = scrapy.Field()  # char -
    level = scrapy.Field()  # int -

class ProductItem(scrapy.Item):
    title = scrapy.Field()  # char -
Let's parse the page:
def parse_item(self, response):
    # save category info
    category = CatItem()
    category['url'] = response.url
    category['title'] = response.url
    category['level'] = 1
    yield category

    # now let's parse all products within that category
    for selector in response.xpath("//article//ul/div[@data-qa-id='result-list-entry']"):
        product = ProductItem()
        product['title'] = selector.xpath(".//a/h2/text()").extract_first()
        yield product
My Pipeline:
class mysql_pipeline(object):

    def __init__(self):
        self.create_connection()

    def create_connection(self):
        settings = get_project_settings()

    def process_item(self, item, spider):
        self.store_db(item, spider)
        return item
Now I am not sure how to proceed: there is only one "item" within the process_item definition.
How can I pass the category information to the store_db method as well?
You can check the item type in your pipeline:
from your_project.items import CatItem, ProductItem

class YourPipeline(object):
    ...

    def process_item(self, item, spider):
        if isinstance(item, CatItem):
            save_category(item)
        return item
UPDATE: Simple PoC code:
import scrapy
import csv
from scrapy.crawler import CrawlerProcess

class BooksPipeline(object):
    def process_item(self, item, spider):
        filename = None
        if isinstance(item, CategoryItem):
            filename = 'Categories.csv'
        elif isinstance(item, BookItem):
            filename = 'Books.csv'
        with open(filename, 'a', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['Title', 'URL'], lineterminator="\n")
            writer.writerow(item)
        return item

class BooksSpider(scrapy.Spider):
    name = "books"
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        for book_url in response.xpath('//ol/li//h3/a/@href').getall():
            yield scrapy.Request(
                url=response.urljoin(book_url),
                callback=self.parse_book,
            )

    def parse_book(self, response):
        category = CategoryItem()
        category['Title'] = response.xpath('//ul[@class="breadcrumb"]/li[last() - 1]/a/text()').get()
        category['URL'] = response.xpath('//ul[@class="breadcrumb"]/li[last() - 1]/a/@href').get()
        yield category

        book = BookItem()
        book['Title'] = response.xpath('//h1/text()').get()
        book['URL'] = response.url
        yield book

class BookItem(scrapy.Item):
    Title = scrapy.Field()
    URL = scrapy.Field()

class CategoryItem(scrapy.Item):
    Title = scrapy.Field()
    URL = scrapy.Field()

if __name__ == "__main__":
    process = CrawlerProcess(
        {
            'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
            'DOWNLOAD_TIMEOUT': 100,
            'ITEM_PIPELINES': {
                '__main__.BooksPipeline': 300,
            }
        }
    )
    process.crawl(BooksSpider)
    process.start()

pagination in Scrapy on javascript-driven page navigation via Selenium webdriver

I am attempting to paginate through the data table on this page, located below the search form.
My code successfully scrapes the first page and I successfully click the next button (using Selenium) to get the next page of results.
However, attempting to create a Response instance and passing it to self.parse() does not work:
page_source = self.driver.page_source
r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
print(" >>>> calling self.parse again")
return self.parse(r)
Also, even though I am returning None from self.parse (as you can see by analyzing the call stack), I get this warning when running this Scrapy spider:
The "SignalStartSpider.parse" method is a generator and includes a "return" statement with a value different than None. This could lead to unexpected behaviour. Please see https://docs.python.org/3/reference/simple_stmts.html#the-return-statement for details about the semantics of the "return" statement within generators
warn_on_generator_with_return_value(spider, callback)
Here is my current source code:
# -*- coding: utf-8 -*-
import scrapy
from behold import Behold
import html_text
import durations
from selenium import webdriver

URL_20 = "https://www.signalstart.com/search-signals"
URL_1000 = "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=1000&p=1&z=0.410257937140464"

class Provider(scrapy.Item):
    rank = scrapy.Field()
    name = scrapy.Field()
    gain = scrapy.Field()
    pips = scrapy.Field()
    drawdown = scrapy.Field()
    trades = scrapy.Field()
    type = scrapy.Field()
    monthly = scrapy.Field()
    # chart = scrapy.Field()
    price = scrapy.Field()
    age = scrapy.Field()
    # added = scrapy.Field()
    # action = scrapy.Field()
    won = scrapy.Field()
    profit_factor = scrapy.Field()
    daily = scrapy.Field()
    monthly = scrapy.Field()

def raw_page_url(i=1):
    """
    Return raw page of 100 results. There are 8 such pages
    :param i: which page number
    :return:
    """
    return "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=100&p={}&z=0.024967722664414493".format(i)

class SignalStartSpider(scrapy.Spider):
    page = 1
    name = 'signalstart'
    start_urls = [
        # raw_page_url(page),
        URL_20
    ]

    def __init__(self):
        #self.driver = webdriver.Firefox(executable_path = r'C:\Users\terre\AppData\Local\Taurus\bin\geckodriver.exe')
        self.driver = webdriver.Firefox(executable_path=r'/cygdrive/c/Users/terre/AppData/Local/Taurus/bin/geckodriver.exe')

    def parse_details(self, response):
        class Details(scrapy.Item):
            xpath = scrapy.Field()
            extractor = scrapy.Field()  # I thought different fields would be extracted differently. But turns out they dont.
        fields = {
            'won': Details(),
            'profit_factor': Details(),
            'daily': Details(),
            'monthly': Details()
        }
        fields['won']['xpath'] = "//li[contains(text(),'Won:')]"
        fields['profit_factor']['xpath'] = "//li[@class='list-group-item popovers']"
        fields['daily']['xpath'] = "//li[contains(text(),'Daily:')]"
        fields['monthly']['xpath'] = "//li[contains(text(),'Monthly:')]"
        for field, field_processor in fields.items():
            print(f" Process {field}")
            elem = response.xpath(field_processor['xpath'])
            _, value = html_text.extract_text(elem.get()).split(':')
            response.meta["data_row"][field] = value
        yield response.meta["data_row"]

    def parse(self, response):
        print(" >>>>>> URL of the response object is {}".format(response.url))
        if len(response.url) > 10:
            self.driver.get(response.url)
        cols = "rank name gain pips drawdown trades type monthly chart price age added action"
        skip = [7, 8, 11, 12]

        def age_to_months(t):
            t = t.replace('m', 'M')
            d = durations.Duration(t);
            return d.to_months()

        postprocess = {
            'age': lambda t: age_to_months(t)
        }
        td = dict()
        for i, col in enumerate(cols.split()):
            td[i] = col
        Behold().show('td')
        for provider in response.xpath("//div[@class='row']//tr"):
            data_row = Provider()
            Behold().show('provider')
            details_url = None
            for i, datum in enumerate(provider.xpath('td')):
                Behold().show('i', 'datum')
                if i == 1:  # name
                    details_url = datum.css("a::attr(href)").get()
                if i in skip:
                    print(".....skipping")
                    continue
                text = html_text.extract_text(datum.get())
                column_name = td[i]
                if column_name in postprocess:
                    text = postprocess[column_name](text)
                data_row[column_name] = text
            if details_url:
                yield scrapy.Request(url=details_url, callback=self.parse_details, meta={'data_row': data_row})
        print("------------------------------- next page logic --------------------------------------")
        next = self.driver.find_element_by_css_selector('.fa-angle-right')
        if next is not None:
            print(" **** NEXT IS -NOT- NONE")
            next.click()
            page_source = self.driver.page_source
            r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
            print(" >>>> calling self.parse again")
            return self.parse(r)
        else:
            print(" **** NEXT IS NONE")
            return None
        # next_page = response.css('.fa-angle-right').get()
        # if next_page is not None:
        #     yield response.follow(next_page, self.parse)
Instead of recursively calling self.parse, it is better to use a while loop and simply re-bind the Response instance with the page_source from the Selenium webdriver. Working code:
# -*- coding: utf-8 -*-
import scrapy
from behold import Behold
import html_text
import durations
from selenium import webdriver

URL_20 = "https://www.signalstart.com/search-signals"
URL_1000 = "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=1000&p=1&z=0.410257937140464"

class Provider(scrapy.Item):
    rank = scrapy.Field()
    name = scrapy.Field()
    gain = scrapy.Field()
    pips = scrapy.Field()
    drawdown = scrapy.Field()
    trades = scrapy.Field()
    type = scrapy.Field()
    monthly = scrapy.Field()
    # chart = scrapy.Field()
    price = scrapy.Field()
    age = scrapy.Field()
    # added = scrapy.Field()
    # action = scrapy.Field()
    won = scrapy.Field()
    profit_factor = scrapy.Field()
    daily = scrapy.Field()
    monthly = scrapy.Field()

def raw_page_url(i=1):
    """
    Return raw page of 100 results. There are 8 such pages
    :param i: which page number
    :return:
    """
    return "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=100&p={}&z=0.024967722664414493".format(i)

class SignalStartSpider(scrapy.Spider):
    page = 1
    name = 'signalstart'
    start_urls = [
        # raw_page_url(page),
        URL_20
    ]

    def __init__(self):
        #self.driver = webdriver.Firefox(executable_path = r'C:\Users\terre\AppData\Local\Taurus\bin\geckodriver.exe')
        self.driver = webdriver.Firefox(executable_path=r'/cygdrive/c/Users/terre/AppData/Local/Taurus/bin/geckodriver.exe')

    def parse_details(self, response):
        class Details(scrapy.Item):
            xpath = scrapy.Field()
            extractor = scrapy.Field()  # I thought different fields would be extracted differently. But turns out they dont.
        fields = {
            'won': Details(),
            'profit_factor': Details(),
            'daily': Details(),
            'monthly': Details()
        }
        fields['won']['xpath'] = "//li[contains(text(),'Won:')]"
        fields['profit_factor']['xpath'] = "//li[@class='list-group-item popovers']"
        fields['daily']['xpath'] = "//li[contains(text(),'Daily:')]"
        fields['monthly']['xpath'] = "//li[contains(text(),'Monthly:')]"
        for field, field_processor in fields.items():
            print(f" Process {field}")
            elem = response.xpath(field_processor['xpath'])
            _, value = html_text.extract_text(elem.get()).split(':')
            response.meta["data_row"][field] = value
        yield response.meta["data_row"]

    def parse(self, response):
        print(" >>>>>> URL of the response object is {}".format(response.url))
        if len(response.url) > 10:
            self.driver.get(response.url)
        cols = "rank name gain pips drawdown trades type monthly chart price age added action"
        skip = [7, 8, 11, 12]

        def age_to_months(t):
            t = t.replace('m', 'M')
            d = durations.Duration(t);
            return d.to_months()

        postprocess = {
            'age': lambda t: age_to_months(t)
        }
        td = dict()
        for i, col in enumerate(cols.split()):
            td[i] = col
        Behold().show('td')
        while True:
            for provider in response.xpath("//div[@class='row']//tr"):
                data_row = Provider()
                Behold().show('provider')
                details_url = None
                for i, datum in enumerate(provider.xpath('td')):
                    Behold().show('i', 'datum')
                    if i == 1:  # name
                        details_url = datum.css("a::attr(href)").get()
                    if i in skip:
                        print(".....skipping")
                        continue
                    text = html_text.extract_text(datum.get())
                    column_name = td[i]
                    if column_name in postprocess:
                        text = postprocess[column_name](text)
                    data_row[column_name] = text
                if details_url:
                    yield scrapy.Request(url=details_url, callback=self.parse_details, meta={'data_row': data_row})
            print("------------------------------- next page logic --------------------------------------")
            next = self.driver.find_element_by_css_selector('.fa-angle-right')
            if next is not None:
                print(" **** NEXT IS -NOT- NONE")
                next.click()
                page_source = self.driver.page_source
                r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
                print(" >>>> looping self.parse again")
                response = r
            else:
                print(" **** NEXT IS NONE")
                break
        # next_page = response.css('.fa-angle-right').get()
        # if next_page is not None:
        #     yield response.follow(next_page, self.parse)

ItemLoader doesn't pass the loader context to input processors

My spider: autospd.py
class AutospdSpider(scrapy.Spider):
    name = 'autospd'
    start_urls = ['http://news.dayoo.com/guangzhou/150960_2.shtml']
    dt_ft = "%Y-%m-%d %H:%M"

    def parse(self, response):
        list_objs = response.css("div.dy-list>div")
        for li in list_objs:
            loader = AutopjtItemLoader(item=AutopjtItem(), selector=li, context=self.dt_ft)
            print(loader.context.items())  # please see print-1
            loader.nested_css("h2>a").add_css("title", "::text")
            loader.nested_css("h2>a").add_css("url", "::attr(href)")
            loader.nested_css("div.txt-area>div.news-time").add_xpath("pub_time", "string()")
            yield loader.load_item()
print-1: dict_items([('context', '%Y-%m-%d %H:%M'), ('selector', ...), ('response', None), ('item', {})])
items.py
def func(value, loader_context):
    print(loader_context.items())  # please see print-2
    # ft = loader_context.get("context")
    # time_dt = datetime.strptime(value, ft)
    return value

class AutopjtItemLoader(ItemLoader):
    default_output_processor = TakeFirst()
    pub_time_in = MapCompose(func)

class AutopjtItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    pub_time = scrapy.Field()
print-2: [('selector', ...), ('response', None), ('item', {})]
Why is there no "context" key in loader_context?
def nested_xpath(self, xpath, **context):
    selector = self.selector.xpath(xpath)
    context.update(selector=selector)
    subloader = self.__class__(
        item=self.item, parent=self, **context
    )
    return subloader

def nested_css(self, css, **context):
    selector = self.selector.css(css)
    context.update(selector=selector)
    subloader = self.__class__(
        item=self.item, parent=self, **context
    )
    return subloader
As Scrapy's source code above shows, if you use nested_css or nested_xpath you must pass your context explicitly, e.g.:
loader.nested_css("div.txt-area>div.news-time", dt_ft=self.dt_ft).add_xpath("pub_time", "string()")