Extend scrapy settings value per spider

Assume we want to add a specific item pipeline for a particular spider. To comply with the DRY principle, I just want to read the current pipelines from the settings, add my specific pipeline, and set the result back on the spider's settings.
This cannot be accomplished via the custom_settings class attribute, and even setting it via from_crawler does not work:
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    crawler.settings.setdict(
        {'ITEM_PIPELINES': {
            **dict(crawler.settings.getdict('ITEM_PIPELINES')),
            'myscrapers.pipelines.CustomPipeline': 11}},
        priority='spider')
    return super().from_crawler(cls, crawler, *args, **kwargs)
That causes this error:
TypeError: Trying to modify an immutable Settings object
How can we correctly extend a settings value in Scrapy at the spider level?

You can set the settings for the process:
import scrapy
from itemadapter import ItemAdapter
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings
import re


class ExampleSpider(scrapy.Spider):
    name = 'exampleSpider'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']

    def parse(self, response):
        item = ExampleItem()
        item['title'] = response.xpath('//h3/text()').get()
        item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
        yield item


class ExampleItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()


class ItemPipeline1:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price < 15:
                print('Cheap enough')
            return item
        else:
            raise DropItem(f"Missing price in {item}")


class ItemPipeline2:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price > 10:
                print('Too expensive')
            return item
        else:
            raise DropItem(f"Missing price in {item}")


if __name__ == "__main__":
    spidername = 'exampleSpider'
    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    settings['ITEM_PIPELINES'] = {'tempbuffer.spiders.yetanotherspider.ItemPipeline1': 300}
    process = CrawlerProcess(settings)
    process.crawl(spidername)
    settings['ITEM_PIPELINES'] = {'tempbuffer.spiders.yetanotherspider.ItemPipeline2': 300}
    process.crawl(spidername)
    process.start()
But if you really want to do all this inside the spider, you can override the update_settings method:
import scrapy
from itemadapter import ItemAdapter
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings
import re


class ExampleSpider(scrapy.Spider):
    name = 'exampleSpider'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']
    custom_settings1 = {'ITEM_PIPELINES': {'tempbuffer.spiders.yetanotherspider.ItemPipeline1': 300}}
    custom_settings2 = {'ITEM_PIPELINES': {'tempbuffer.spiders.yetanotherspider.ItemPipeline2': 300}}

    @classmethod
    def update_settings(cls, settings):
        custom = 'custom_settings1' if getattr(cls, 'is_pipeline_1', True) else 'custom_settings2'
        settings.setdict(getattr(cls, custom, None) or {}, priority='spider')

    def parse(self, response):
        item = ExampleItem()
        item['title'] = response.xpath('//h3/text()').get()
        item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
        yield item


class ExampleItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()


class ItemPipeline1:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price < 15:
                print('Cheap enough')
            return item
        else:
            raise DropItem(f"Missing price in {item}")


class ItemPipeline2:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price > 10:
                print('Too expensive')
            return item
        else:
            raise DropItem(f"Missing price in {item}")


if __name__ == "__main__":
    spidername = 'exampleSpider'
    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    process = CrawlerProcess(settings)
    ExampleSpider.is_pipeline_1 = True
    process.crawl(ExampleSpider)
    ExampleSpider.is_pipeline_1 = False
    process.crawl(ExampleSpider)
    process.start()
But honestly I think the first way is better...
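If the goal from the original question is specifically to extend the project-level ITEM_PIPELINES rather than replace them, the same update_settings hook can read the dict that is already configured and merge in the extra entry. A minimal sketch, reusing the CustomPipeline path from the question (any other pipeline path works the same way):

import scrapy


class MySpider(scrapy.Spider):
    name = 'myspider'

    @classmethod
    def update_settings(cls, settings):
        # update_settings runs before the crawler freezes the settings,
        # so the project-level pipelines can still be read and extended here
        super().update_settings(settings)  # applies custom_settings as usual
        pipelines = settings.getdict('ITEM_PIPELINES')
        pipelines['myscrapers.pipelines.CustomPipeline'] = 11
        settings.set('ITEM_PIPELINES', pipelines, priority='spider')

This keeps the project settings as the single source of truth and only layers the spider-specific pipeline on top.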

Related

How to get log and show it on GUI from multiprocessing work?

I am trying to get logs from multiprocessing work and show them in a GUI, based on this document.
gui.py:
from PyQt5 import QtCore, QtWidgets
import logging
from log_test import main

Signal = QtCore.pyqtSignal
Slot = QtCore.pyqtSlot


class Signaller(QtCore.QObject):
    signal = Signal(str, logging.LogRecord)


class QtHandler(logging.Handler):
    def __init__(self, slotfunc, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.signaller = Signaller()
        self.signaller.signal.connect(slotfunc)

    def emit(self, record):
        s = self.format(record)
        self.signaller.signal.emit(s, record)


class Worker(QtCore.QObject):
    finished = Signal()

    @Slot()
    def start(self):
        main()
        self.finished.emit()


class Ui_Dialog(QtCore.QObject):
    def __init__(self):
        super().__init__()

    def setupUi(self, Dialog):
        Dialog.setObjectName("Dialog")
        Dialog.setEnabled(True)
        Dialog.resize(530, 440)
        self.verticalLayout = QtWidgets.QVBoxLayout(Dialog)
        self.verticalLayout.setObjectName("verticalLayout")
        self.button = QtWidgets.QPushButton(Dialog)
        self.button.setText("start working")
        self.verticalLayout.addWidget(self.button)
        self.logWidget = QtWidgets.QPlainTextEdit(Dialog)
        self.logWidget.setReadOnly(True)
        self.verticalLayout.addWidget(self.logWidget)
        self.handler = QtHandler(self.update_log_gui)
        logging.getLogger('log').addHandler(self.handler)
        self.button.clicked.connect(self.start_work)

    @Slot(str, logging.LogRecord)
    def update_log_gui(self, status, record):
        self.logWidget.appendPlainText(status)

    def config_thread(self):
        self.worker_thread = QtCore.QThread()
        self.worker_thread.setObjectName('WorkerThread')
        self.worker = Worker()
        self.worker.moveToThread(self.worker_thread)
        self.worker_thread.started.connect(self.worker.start)
        self.worker.finished.connect(self.worker_thread.quit)
        self.worker.finished.connect(self.worker.deleteLater)
        self.worker_thread.finished.connect(self.worker_thread.deleteLater)
        self.worker_thread.finished.connect(lambda: self.button.setEnabled(True))

    def start_work(self):
        self.config_thread()
        self.worker_thread.start()
        self.button.setEnabled(False)


if __name__ == "__main__":
    import sys
    QtCore.QThread.currentThread().setObjectName('MainThread')
    app = QtWidgets.QApplication(sys.argv)
    Dialog = QtWidgets.QDialog()
    ui = Ui_Dialog()
    ui.setupUi(Dialog)
    Dialog.show()
    sys.exit(app.exec_())
log_test.py (where multiprocessing work happens)
import logging
import time
from multiprocessing import Pool


def f(name):
    logger = logging.getLogger('log.' + name)
    logger.error('hello there 1')
    time.sleep(0.5)
    logger.error('hello there 2')
    time.sleep(0.5)
    logger.error('hello there 3')
    time.sleep(0.5)


def main():
    with Pool(5) as p:
        p.map(f, ['aaa', 'bbb', 'ccc'])
At first I thought working in a single thread was causing the problem, so I added a QThread.
Later, while debugging, I found that QtHandler.emit() receives the log messages fine, but the connected slot function, update_log_gui(), somehow never runs.
I solved it myself.
@Alexander was right: my QtHandler does have a problem with multiprocessing, though I don't know exactly why. Instead, you want to implement a QueueHandler. An example in this article (written in Korean) helped me.
from PyQt5 import QtCore, QtWidgets
import logging
import multiprocessing
from log_test import main

Signal = QtCore.pyqtSignal
Slot = QtCore.pyqtSlot
QThread = QtCore.QThread


class Signaller(QtCore.QObject):
    signal = Signal(logging.LogRecord)


class Worker(QtCore.QObject):
    finished = Signal()

    def __init__(self, q):
        super().__init__()
        self.q = q

    @Slot()
    def start(self):
        main(self.q)
        self.finished.emit()


class Consumer(QThread):
    popped = Signaller()

    def __init__(self, q):
        super().__init__()
        self.q = q
        self.setObjectName('ConsumerThread')

    def run(self):
        while True:
            if not self.q.empty():
                record = self.q.get()
                self.popped.signal.emit(record)


class Ui_Dialog(QtCore.QObject):
    def __init__(self, app):
        super().__init__()
        self.app = app

    def setupUi(self, Dialog):
        Dialog.setObjectName("Dialog")
        Dialog.setEnabled(True)
        Dialog.resize(530, 440)
        self.verticalLayout = QtWidgets.QVBoxLayout(Dialog)
        self.verticalLayout.setObjectName("verticalLayout")
        self.button = QtWidgets.QPushButton(Dialog)
        self.button.setText("start working")
        self.verticalLayout.addWidget(self.button)
        self.logWidget = QtWidgets.QPlainTextEdit(Dialog)
        self.logWidget.setReadOnly(True)
        self.verticalLayout.addWidget(self.logWidget)
        self.button.clicked.connect(self.start_work)
        self.q = multiprocessing.Manager().Queue()
        self.consumer = Consumer(self.q)
        self.consumer.popped.signal.connect(self.update_log_gui)
        self.consumer.start()
        app.aboutToQuit.connect(self.shutdown_consumer)

    @Slot(logging.LogRecord)
    def update_log_gui(self, record):
        self.logWidget.appendPlainText(str(record.msg))

    def config_thread(self):
        self.worker_thread = QtCore.QThread()
        self.worker_thread.setObjectName('WorkerThread')
        self.worker = Worker(self.q)
        self.worker.moveToThread(self.worker_thread)
        self.worker_thread.started.connect(self.worker.start)
        self.worker.finished.connect(self.worker_thread.quit)
        self.worker.finished.connect(self.worker.deleteLater)
        self.worker_thread.finished.connect(self.worker_thread.deleteLater)
        self.worker_thread.finished.connect(lambda: self.button.setEnabled(True))

    def start_work(self):
        self.config_thread()
        self.worker_thread.start()
        self.button.setEnabled(False)

    def shutdown_consumer(self):
        if self.consumer.isRunning():
            self.consumer.requestInterruption()
            self.consumer.quit()
            self.consumer.wait()


if __name__ == "__main__":
    import sys
    QtCore.QThread.currentThread().setObjectName('MainThread')
    app = QtWidgets.QApplication(sys.argv)
    Dialog = QtWidgets.QDialog()
    ui = Ui_Dialog(app)
    ui.setupUi(Dialog)
    Dialog.show()
    sys.exit(app.exec_())
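The modified log_test.py is not shown above; a minimal sketch of what it might look like, assuming main() now receives the Manager queue created in setupUi and each worker process forwards its LogRecords into it via logging.handlers.QueueHandler:

# log_test.py -- hypothetical adaptation of the original file
import logging
import logging.handlers
import time
from multiprocessing import Pool


def init_worker(q):
    # each worker process pushes its LogRecords onto the shared queue,
    # where the GUI's Consumer thread picks them up
    logging.getLogger('log').addHandler(logging.handlers.QueueHandler(q))


def f(name):
    logger = logging.getLogger('log.' + name)
    for i in (1, 2, 3):
        logger.error('hello there %d', i)
        time.sleep(0.5)


def main(q):
    with Pool(5, initializer=init_worker, initargs=(q,)) as p:
        p.map(f, ['aaa', 'bbb', 'ccc'])

QueueHandler formats each record before queuing it, so the record.msg arriving in update_log_gui() is already the final message string.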

Exporting the results from multiple spiders

I wanted to set up and run multiple spiders, where URL_LIST contains links to the main URLs, and DATA_LIST contains URLs embedded in the main URLs. The example I am using does not represent this case, as I am using the quotes URLs, but that is the purpose of the set-up. I then wanted to crawl the spiders and store the results. However, I am unsure how to call the spiders to crawl, because there are two separate spiders.
For example, if I run scrapy crawl val in the terminal I get:
raise error.ReactorAlreadyInstalledError("reactor already installed")
twisted.internet.error.ReactorAlreadyInstalledError: reactor already installed
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy import signals
import scrapy

URL_LIST = ['https://quotes.toscrape.com/tag/love/',
            'https://quotes.toscrape.com/tag/inspirational/']
DATA_LIST = ['https://quotes.toscrape.com/tag/life/',
             'https://quotes.toscrape.com/tag/humor/']


def store_url(*args, **kwargs):
    URL_LIST.append(kwargs['item'])


def store_data(*args, **kwargs):
    DATA_LIST.append(kwargs['item'])


class QuotesSpiderWebsiteA(scrapy.Spider):
    name = 'val'
    start_urls = URL_LIST
    custom_settings = {'FEEDS': {
        'quotes.jl': {
            'format': 'jsonlines'
        }
    }}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse)

    def parse(self, response):
        content = response.xpath('//div[@class = "row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get(),
            }


class QuotesSpiderWebsiteB(scrapy.Spider):
    name = 'valb'
    start_urls = DATA_LIST
    custom_settings = {'FEEDS': {
        'quotes.jl': {
            'format': 'jsonlines'
        }
    }}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse)

    def parse(self, response):
        content = response.xpath('//div[@class = "row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get(),
            }


if __name__ == '__main__':
    configure_logging()
    runner = CrawlerRunner()

    @defer.inlineCallbacks
    def crawl():
        crawler1 = runner.create_crawler(QuotesSpiderWebsiteA)
        crawler2 = runner.create_crawler(QuotesSpiderWebsiteB)
        crawler1.signals.connect(store_url, signals.item_scraped)
        crawler2.signals.connect(store_data, signals.item_scraped)
        yield runner.crawl(crawler1)
        yield runner.crawl(crawler2)
        reactor.stop()
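As posted, the crawl() callback is defined but never invoked and the reactor is never started, so running the file directly does nothing. Following the CrawlerRunner pattern from the Scrapy docs, the __main__ block would normally end with something like:

    crawl()
    reactor.run()  # blocks until reactor.stop() is called inside crawl()

The ReactorAlreadyInstalledError seen with scrapy crawl val is most likely caused by the module-level from twisted.internet import reactor sitting inside the spiders package: importing it installs the default reactor before Scrapy can install its own. Keeping this runner script outside the spiders package and launching it with python rather than scrapy crawl is the usual way to avoid the clash.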

How is the actual value transmitted to the formal parameter in the slot function (pyqtSlot())?

First, see the code below:
import sys
from PyQt5.QtCore import (Qt, pyqtSignal, pyqtSlot)
from PyQt5.QtWidgets import (QWidget, QLCDNumber, QSlider,
                             QVBoxLayout, QApplication)


class Example(QWidget):
    def __init__(self):
        super().__init__()
        self.initUI()

    def printLabel(self, str):
        print(str)

    @pyqtSlot(int)
    def on_sld_valueChanged(self, value):
        self.lcd.display(value)
        self.printLabel(value)

    def initUI(self):
        self.lcd = QLCDNumber(self)
        self.sld = QSlider(Qt.Horizontal, self)
        vbox = QVBoxLayout()
        vbox.addWidget(self.lcd)
        vbox.addWidget(self.sld)
        self.setLayout(vbox)
        self.sld.valueChanged.connect(self.on_sld_valueChanged)
        self.setGeometry(300, 300, 250, 150)
        self.setWindowTitle('Signal & slot')
        self.show()


if __name__ == '__main__':
    app = QApplication(sys.argv)
    ex = Example()
    sys.exit(app.exec_())
I'm a little puzzled about how the actual value of the slider is transmitted to the formal parameter value in the slot function def on_sld_valueChanged(self, value), because I can't see anything like self.sld.valueChanged.connect(partial(self.on_sld_valueChanged, self.sld.value)).
Could someone explain that?
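The signal itself carries the argument. QSlider.valueChanged is declared as valueChanged(int); when the slider emits it, Qt calls every connected slot with that int, so no partial or explicit getter is needed. A minimal standalone sketch of the same mechanism with a custom signal (the names here are made up for illustration):

from PyQt5.QtCore import QObject, pyqtSignal


class Counter(QObject):
    # declared with an int parameter, just like QSlider.valueChanged(int)
    changed = pyqtSignal(int)

    def set_value(self, value):
        self.changed.emit(value)  # emit() supplies the argument


def on_changed(value):
    # Qt forwards the emitted int to every connected slot
    print('received', value)


c = Counter()
c.changed.connect(on_changed)
c.set_value(42)  # prints "received 42"

In the question's code, moving the slider makes Qt emit valueChanged(new_position), and that new_position lands in the value parameter of on_sld_valueChanged.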

Relative path in scrapyd

import scrapy
import csv
from series.items import SeriesItem


class EpisodeScraperSpider(scrapy.Spider):
    name = "episode_scraper"
    allowed_domains = ["imdb.com"]
    start_urls = []

    def __init__(self, id=None, series=None, *args, **kwargs):
        super(EpisodeScraperSpider, self).__init__(*args, **kwargs)
        if id is not None:
            self.start_urls = ['http://www.imdb.com/title/{!s}/episodes?season={!s}'.format(id, series)]
        else:
            with open('series_episode.csv') as f:
                f_csv = csv.DictReader(f)
                for row in f_csv:
                    self.start_urls.append('http://www.imdb.com/title/{!s}/episodes?season={!s}'.format(row["id"], row["series"]))

    def parse(self, response):
        episodes = response.xpath('//div[contains(@class, "list_item")]')
        title = response.xpath('//h3/a/text()').extract()[0]
        for episode in episodes:
            item = SeriesItem()
            item['series_episode'] = episode.xpath('div/a/div[contains(@data-const,"tt")]/div/text()').extract()
            item['title'] = '{!s}: {!s}'.format(title, episode.xpath('div[@class="info"]/strong/a/text()').extract())
            item['imdb_id'] = episode.xpath('div[@class="image"]/a/div/@data-const').extract()
            item['airdate'] = [x.strip() for x in episode.xpath('div/div[@class="airdate"]/text()').extract()]
            yield item
When I run this spider under scrapyd I get no result, although it does produce results when run with scrapy directly. I think the problem is in this line:
with open('series_episode.csv') as f:
I don't know where to put my csv file.
Please help me!
Thanks
One option would be to save it in /tmp:
with open('/tmp/series_episode.csv') as f:
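Since scrapyd runs the project from its own working directory (deployed as an egg), relative paths resolve differently than when running scrapy crawl inside the project folder. Besides hard-coding /tmp, one option is to pass the csv location in as a spider argument; a sketch against the spider above (only __init__ changes):

    def __init__(self, id=None, series=None, csv_path='/tmp/series_episode.csv', *args, **kwargs):
        super(EpisodeScraperSpider, self).__init__(*args, **kwargs)
        if id is not None:
            self.start_urls = ['http://www.imdb.com/title/{!s}/episodes?season={!s}'.format(id, series)]
        else:
            # csv_path can be supplied as an extra parameter of scrapyd's schedule.json call
            with open(csv_path) as f:
                for row in csv.DictReader(f):
                    self.start_urls.append('http://www.imdb.com/title/{!s}/episodes?season={!s}'.format(row["id"], row["series"]))

Extra parameters passed to schedule.json become spider arguments, so scheduling would look something like curl http://localhost:6800/schedule.json -d project=series -d spider=episode_scraper -d csv_path=/data/series_episode.csv (the project name series and the /data path are assumptions here).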

Scrapy issue with csv output

Here is my spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem
from scrapy.http import Request


class vriskoSpider(CrawlSpider):
    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']
    rules = (Rule(SgmlLinkExtractor(allow=('\?page=\d')), 'parse_start_url', follow=True),)

    def parse_start_url(self, response):
        hxs = HtmlXPathSelector(response)
        subpages = hxs.select('//a[@class="detailsHyper_class"]/@href').extract()
        ep = hxs.select('//a[@itemprop="name"]/text()').extract()
        ad = hxs.select('//div[@class="results_address_class"]/text()').extract()
        for eponimia, address, subpage in zip(ep, ad, subpages):
            vriskoit = VriskoItem()
            vriskoit['eponimia'] = eponimia
            vriskoit['address'] = address
            request = Request(subpage, callback=self.subPage)
            request.meta['vriskoit'] = vriskoit
            yield request

    def subPage(self, response):
        vriskoit = response.meta['vriskoit']
        hxs = HtmlXPathSelector(response)
        vriskoit['category'] = hxs.select('//div[@class="category_class"]/span/text()').extract()
        yield vriskoit
and here is my pipeline:
import csv


class myExporter(object):
    def __init__(self):
        self.brandCategoryCsv = csv.writer(open('brandCategoryTable.csv', 'wb'))
        self.brandCategoryCsv.writerow(['eponimia', 'address', 'category'])

    def process_item(self, item, spider):
        for e, a, c in zip(item['eponimia'], item['address'], item['category']):
            self.brandCategoryCsv.writerow([e.encode('utf-8'), a.encode('utf-8'), c.encode('utf-8')])
        return item
My problem is that for the first two fields (eponimia, address), only the first character is written to the output csv file, and I can't find out why.
Any help would be much appreciated; I am out of ideas.
Remove the zip call from myExporter.process_item:
def process_item(self, item, spider):
    self.brandCategoryCsv.writerow([item['eponimia'].encode('utf-8'),
                                    item['address'].encode('utf-8'),
                                    item['category'].encode('utf-8')])
    return item
You already converted the lists into individual items in vriskoSpider.parse_start_url, so item['eponimia'] and item['address'] are plain strings; zip iterates over your strings character by character:
In [1]: a = 'test1'
In [2]: b = 'test2'
In [3]: for x, y in zip(a, b):
   ...:     print x, y
   ...:
t t
e e
s s
t t
1 2
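For what it's worth, on a recent Scrapy version a hand-rolled csv pipeline like this is rarely needed: the built-in feed exports can write the csv (and handle the utf-8 encoding) directly. A sketch of equivalent settings, assuming the same three fields:

# settings.py -- let the feed exporter produce the csv instead of a pipeline
FEEDS = {
    'brandCategoryTable.csv': {
        'format': 'csv',
        'fields': ['eponimia', 'address', 'category'],
    },
}

The same result can also be had ad hoc with scrapy crawl vrisko -o brandCategoryTable.csv.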