Extend scrapy settings value per spider

Assume we want to add a specific item pipeline for a particular spider. To comply with the DRY principle, I just want to read the current pipelines from the settings, add my specific pipeline, and set the result back on the spider's settings.
This cannot be accomplished via the custom_settings class attribute, and even setting it via from_crawler does not work:
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    crawler.settings.setdict(
        {'ITEM_PIPELINES': {
            **dict(crawler.settings.getdict('ITEM_PIPELINES')),
            'myscrapers.pipelines.CustomPipeline': 11}},
        priority='spider')
    return super().from_crawler(cls, crawler, *args, **kwargs)
That causes this error:
TypeError: Trying to modify an immutable Settings object
How can we correctly extend a settings value in Scrapy at the spider level?

You can set the settings for the process:
import scrapy
from itemadapter import ItemAdapter
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings
import re


class ExampleSpider(scrapy.Spider):
    name = 'exampleSpider'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']

    def parse(self, response):
        item = ExampleItem()
        item['title'] = response.xpath('//h3/text()').get()
        item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
        yield item


class ExampleItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()


class ItemPipeline1:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price < 15:
                print('Cheap enough')
            return item
        else:
            raise DropItem(f"Missing price in {item}")


class ItemPipeline2:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price > 10:
                print('Too expensive')
            return item
        else:
            raise DropItem(f"Missing price in {item}")


if __name__ == "__main__":
    spidername = 'exampleSpider'
    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    settings['ITEM_PIPELINES'] = {'tempbuffer.spiders.yetanotherspider.ItemPipeline1': 300}
    process = CrawlerProcess(settings)
    process.crawl(spidername)
    settings['ITEM_PIPELINES'] = {'tempbuffer.spiders.yetanotherspider.ItemPipeline2': 300}
    process.crawl(spidername)
    process.start()
But if you really want to do all this inside the spider, you can override the update_settings method:
import scrapy
from itemadapter import ItemAdapter
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings
import re


class ExampleSpider(scrapy.Spider):
    name = 'exampleSpider'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']
    custom_settings1 = {'ITEM_PIPELINES': {'tempbuffer.spiders.yetanotherspider.ItemPipeline1': 300}}
    custom_settings2 = {'ITEM_PIPELINES': {'tempbuffer.spiders.yetanotherspider.ItemPipeline2': 300}}

    @classmethod
    def update_settings(cls, settings):
        custom = 'custom_settings1' if getattr(cls, 'is_pipeline_1', True) else 'custom_settings2'
        settings.setdict(getattr(cls, custom, None) or {}, priority='spider')

    def parse(self, response):
        item = ExampleItem()
        item['title'] = response.xpath('//h3/text()').get()
        item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
        yield item


class ExampleItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()


class ItemPipeline1:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price < 15:
                print('Cheap enough')
            return item
        else:
            raise DropItem(f"Missing price in {item}")


class ItemPipeline2:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price > 10:
                print('Too expensive')
            return item
        else:
            raise DropItem(f"Missing price in {item}")


if __name__ == "__main__":
    spidername = 'exampleSpider'
    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    process = CrawlerProcess(settings)
    ExampleSpider.is_pipeline_1 = True
    process.crawl(ExampleSpider)
    ExampleSpider.is_pipeline_1 = False
    process.crawl(ExampleSpider)
    process.start()
But honestly I think the first way is better...
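If the goal from the original question is specifically to extend the project-level ITEM_PIPELINES rather than replace them, the same update_settings hook can read the dict that is already configured and merge in the extra entry. A minimal sketch, reusing the CustomPipeline path from the question (any other pipeline path works the same way):

import scrapy


class MySpider(scrapy.Spider):
    name = 'myspider'

    @classmethod
    def update_settings(cls, settings):
        # update_settings runs before the crawler freezes the settings,
        # so the project-level pipelines can still be read and extended here
        super().update_settings(settings)  # applies custom_settings as usual
        pipelines = settings.getdict('ITEM_PIPELINES')
        pipelines['myscrapers.pipelines.CustomPipeline'] = 11
        settings.set('ITEM_PIPELINES', pipelines, priority='spider')

This keeps the project settings as the single source of truth and only layers the spider-specific pipeline on top.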

Related

How to get log and show it on GUI from multiprocessing work?

I am trying to get logs from multiprocessing work and show them in a GUI, based on this document.
gui.py:
from PyQt5 import QtCore, QtWidgets
import logging
from log_test import main

Signal = QtCore.pyqtSignal
Slot = QtCore.pyqtSlot


class Signaller(QtCore.QObject):
    signal = Signal(str, logging.LogRecord)


class QtHandler(logging.Handler):
    def __init__(self, slotfunc, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.signaller = Signaller()
        self.signaller.signal.connect(slotfunc)

    def emit(self, record):
        s = self.format(record)
        self.signaller.signal.emit(s, record)


class Worker(QtCore.QObject):
    finished = Signal()

    @Slot()
    def start(self):
        main()
        self.finished.emit()


class Ui_Dialog(QtCore.QObject):
    def __init__(self):
        super().__init__()

    def setupUi(self, Dialog):
        Dialog.setObjectName("Dialog")
        Dialog.setEnabled(True)
        Dialog.resize(530, 440)
        self.verticalLayout = QtWidgets.QVBoxLayout(Dialog)
        self.verticalLayout.setObjectName("verticalLayout")
        self.button = QtWidgets.QPushButton(Dialog)
        self.button.setText("start working")
        self.verticalLayout.addWidget(self.button)
        self.logWidget = QtWidgets.QPlainTextEdit(Dialog)
        self.logWidget.setReadOnly(True)
        self.verticalLayout.addWidget(self.logWidget)
        self.handler = QtHandler(self.update_log_gui)
        logging.getLogger('log').addHandler(self.handler)
        self.button.clicked.connect(self.start_work)

    @Slot(str, logging.LogRecord)
    def update_log_gui(self, status, record):
        self.logWidget.appendPlainText(status)

    def config_thread(self):
        self.worker_thread = QtCore.QThread()
        self.worker_thread.setObjectName('WorkerThread')
        self.worker = Worker()
        self.worker.moveToThread(self.worker_thread)
        self.worker_thread.started.connect(self.worker.start)
        self.worker.finished.connect(self.worker_thread.quit)
        self.worker.finished.connect(self.worker.deleteLater)
        self.worker_thread.finished.connect(self.worker_thread.deleteLater)
        self.worker_thread.finished.connect(lambda: self.button.setEnabled(True))

    def start_work(self):
        self.config_thread()
        self.worker_thread.start()
        self.button.setEnabled(False)


if __name__ == "__main__":
    import sys
    QtCore.QThread.currentThread().setObjectName('MainThread')
    app = QtWidgets.QApplication(sys.argv)
    Dialog = QtWidgets.QDialog()
    ui = Ui_Dialog()
    ui.setupUi(Dialog)
    Dialog.show()
    sys.exit(app.exec_())
log_test.py (where multiprocessing work happens)
import logging
import time
from multiprocessing import Pool


def f(name):
    logger = logging.getLogger('log.' + name)
    logger.error('hello there 1')
    time.sleep(0.5)
    logger.error('hello there 2')
    time.sleep(0.5)
    logger.error('hello there 3')
    time.sleep(0.5)


def main():
    with Pool(5) as p:
        p.map(f, ['aaa', 'bbb', 'ccc'])
At first I thought working in a single thread was causing the problem, so I added a QThread.
Later, while debugging, I found that QtHandler.emit() receives the log messages fine, but the connected slot function, update_log_gui(), somehow never runs.
I solved it myself.
@Alexander was right: my QtHandler does have a problem with multiprocessing, though I don't know exactly why. Instead, you want to implement a QueueHandler. An example in this article (written in Korean) helped me.
from PyQt5 import QtCore, QtWidgets
import logging
import multiprocessing
from log_test import main

Signal = QtCore.pyqtSignal
Slot = QtCore.pyqtSlot
QThread = QtCore.QThread


class Signaller(QtCore.QObject):
    signal = Signal(logging.LogRecord)


class Worker(QtCore.QObject):
    finished = Signal()

    def __init__(self, q):
        super().__init__()
        self.q = q

    @Slot()
    def start(self):
        main(self.q)
        self.finished.emit()


class Consumer(QThread):
    popped = Signaller()

    def __init__(self, q):
        super().__init__()
        self.q = q
        self.setObjectName('ConsumerThread')

    def run(self):
        while True:
            if not self.q.empty():
                record = self.q.get()
                self.popped.signal.emit(record)


class Ui_Dialog(QtCore.QObject):
    def __init__(self, app):
        super().__init__()
        self.app = app

    def setupUi(self, Dialog):
        Dialog.setObjectName("Dialog")
        Dialog.setEnabled(True)
        Dialog.resize(530, 440)
        self.verticalLayout = QtWidgets.QVBoxLayout(Dialog)
        self.verticalLayout.setObjectName("verticalLayout")
        self.button = QtWidgets.QPushButton(Dialog)
        self.button.setText("start working")
        self.verticalLayout.addWidget(self.button)
        self.logWidget = QtWidgets.QPlainTextEdit(Dialog)
        self.logWidget.setReadOnly(True)
        self.verticalLayout.addWidget(self.logWidget)
        self.button.clicked.connect(self.start_work)
        self.q = multiprocessing.Manager().Queue()
        self.consumer = Consumer(self.q)
        self.consumer.popped.signal.connect(self.update_log_gui)
        self.consumer.start()
        app.aboutToQuit.connect(self.shutdown_consumer)

    @Slot(logging.LogRecord)
    def update_log_gui(self, record):
        self.logWidget.appendPlainText(str(record.msg))

    def config_thread(self):
        self.worker_thread = QtCore.QThread()
        self.worker_thread.setObjectName('WorkerThread')
        self.worker = Worker(self.q)
        self.worker.moveToThread(self.worker_thread)
        self.worker_thread.started.connect(self.worker.start)
        self.worker.finished.connect(self.worker_thread.quit)
        self.worker.finished.connect(self.worker.deleteLater)
        self.worker_thread.finished.connect(self.worker_thread.deleteLater)
        self.worker_thread.finished.connect(lambda: self.button.setEnabled(True))

    def start_work(self):
        self.config_thread()
        self.worker_thread.start()
        self.button.setEnabled(False)

    def shutdown_consumer(self):
        if self.consumer.isRunning():
            self.consumer.requestInterruption()
            self.consumer.quit()
            self.consumer.wait()


if __name__ == "__main__":
    import sys
    QtCore.QThread.currentThread().setObjectName('MainThread')
    app = QtWidgets.QApplication(sys.argv)
    Dialog = QtWidgets.QDialog()
    ui = Ui_Dialog(app)
    ui.setupUi(Dialog)
    Dialog.show()
    sys.exit(app.exec_())
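The modified log_test.py is not shown above; a minimal sketch of what it might look like, assuming main() now receives the Manager queue created in setupUi and each worker process forwards its LogRecords into it via logging.handlers.QueueHandler:

# log_test.py -- hypothetical adaptation of the original file
import logging
import logging.handlers
import time
from multiprocessing import Pool


def init_worker(q):
    # each worker process pushes its LogRecords onto the shared queue,
    # where the GUI's Consumer thread picks them up
    logging.getLogger('log').addHandler(logging.handlers.QueueHandler(q))


def f(name):
    logger = logging.getLogger('log.' + name)
    for i in (1, 2, 3):
        logger.error('hello there %d', i)
        time.sleep(0.5)


def main(q):
    with Pool(5, initializer=init_worker, initargs=(q,)) as p:
        p.map(f, ['aaa', 'bbb', 'ccc'])

QueueHandler formats each record before queuing it, so the record.msg arriving in update_log_gui() is already the final message string.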

Exporting the results from multiple spiders

I wanted to set up and run multiple spiders, where URL_LIST contains links to the main URLs, and DATA_LIST contains URLs embedded in the main URLs. The example I am using does not represent this case, as I am using the quotes URLs, but that is the purpose of the set-up. I then wanted to crawl the spiders and store the results. However, I am unsure how to call the spiders to crawl, because there are two separate spiders.
For example, if I run scrapy crawl val in the terminal I get:
raise error.ReactorAlreadyInstalledError("reactor already installed")
twisted.internet.error.ReactorAlreadyInstalledError: reactor already installed
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy import signals
import scrapy

URL_LIST = ['https://quotes.toscrape.com/tag/love/',
            'https://quotes.toscrape.com/tag/inspirational/']
DATA_LIST = ['https://quotes.toscrape.com/tag/life/',
             'https://quotes.toscrape.com/tag/humor/']


def store_url(*args, **kwargs):
    URL_LIST.append(kwargs['item'])


def store_data(*args, **kwargs):
    DATA_LIST.append(kwargs['item'])


class QuotesSpiderWebsiteA(scrapy.Spider):
    name = 'val'
    start_urls = URL_LIST
    custom_settings = {'FEEDS': {
        'quotes.jl': {
            'format': 'jsonlines'
        }
    }}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse)

    def parse(self, response):
        content = response.xpath('//div[@class = "row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get(),
            }


class QuotesSpiderWebsiteB(scrapy.Spider):
    name = 'valb'
    start_urls = DATA_LIST
    custom_settings = {'FEEDS': {
        'quotes.jl': {
            'format': 'jsonlines'
        }
    }}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse)

    def parse(self, response):
        content = response.xpath('//div[@class = "row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get(),
            }


if __name__ == '__main__':
    configure_logging()
    runner = CrawlerRunner()

    @defer.inlineCallbacks
    def crawl():
        crawler1 = runner.create_crawler(QuotesSpiderWebsiteA)
        crawler2 = runner.create_crawler(QuotesSpiderWebsiteB)
        crawler1.signals.connect(store_url, signals.item_scraped)
        crawler2.signals.connect(store_data, signals.item_scraped)
        yield runner.crawl(crawler1)
        yield runner.crawl(crawler2)
        reactor.stop()
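As posted, the crawl() callback is defined but never invoked and the reactor is never started, so running the file directly does nothing. Following the CrawlerRunner pattern from the Scrapy docs, the __main__ block would normally end with something like:

    crawl()
    reactor.run()  # blocks until reactor.stop() is called inside crawl()

The ReactorAlreadyInstalledError seen with scrapy crawl val is most likely caused by the module-level from twisted.internet import reactor sitting inside the spiders package: importing it installs the default reactor before Scrapy can install its own. Keeping this runner script outside the spiders package and launching it with python rather than scrapy crawl is the usual way to avoid the clash.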

How is the actual value transmitted to the formal parameter in the slot function (pyqtSlot())?

First, see the code below:
import sys
from PyQt5.QtCore import (Qt, pyqtSignal, pyqtSlot)
from PyQt5.QtWidgets import (QWidget, QLCDNumber, QSlider,
                             QVBoxLayout, QApplication)


class Example(QWidget):
    def __init__(self):
        super().__init__()
        self.initUI()

    def printLabel(self, str):
        print(str)

    @pyqtSlot(int)
    def on_sld_valueChanged(self, value):
        self.lcd.display(value)
        self.printLabel(value)

    def initUI(self):
        self.lcd = QLCDNumber(self)
        self.sld = QSlider(Qt.Horizontal, self)
        vbox = QVBoxLayout()
        vbox.addWidget(self.lcd)
        vbox.addWidget(self.sld)
        self.setLayout(vbox)
        self.sld.valueChanged.connect(self.on_sld_valueChanged)
        self.setGeometry(300, 300, 250, 150)
        self.setWindowTitle('Signal & slot')
        self.show()


if __name__ == '__main__':
    app = QApplication(sys.argv)
    ex = Example()
    sys.exit(app.exec_())
I'm a little puzzled about how the actual value of the slider is transmitted to the formal parameter value in the slot function def on_sld_valueChanged(self, value), because I can't see anything like self.sld.valueChanged.connect(partial(self.on_sld_valueChanged, self.sld.value)).
Could someone explain that?
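The signal itself carries the argument. QSlider.valueChanged is declared as valueChanged(int); when the slider emits it, Qt calls every connected slot with that int, so no partial or explicit getter is needed. A minimal standalone sketch of the same mechanism with a custom signal (the names here are made up for illustration):

from PyQt5.QtCore import QObject, pyqtSignal


class Counter(QObject):
    # declared with an int parameter, just like QSlider.valueChanged(int)
    changed = pyqtSignal(int)

    def set_value(self, value):
        self.changed.emit(value)  # emit() supplies the argument


def on_changed(value):
    # Qt forwards the emitted int to every connected slot
    print('received', value)


c = Counter()
c.changed.connect(on_changed)
c.set_value(42)  # prints "received 42"

In the question's code, moving the slider makes Qt emit valueChanged(new_position), and that new_position lands in the value parameter of on_sld_valueChanged.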

Relative path in scrapyd

import scrapy
import csv
from series.items import SeriesItem


class EpisodeScraperSpider(scrapy.Spider):
    name = "episode_scraper"
    allowed_domains = ["imdb.com"]
    start_urls = []

    def __init__(self, id=None, series=None, *args, **kwargs):
        super(EpisodeScraperSpider, self).__init__(*args, **kwargs)
        if id is not None:
            self.start_urls = ['http://www.imdb.com/title/{!s}/episodes?season={!s}'.format(id, series)]
        else:
            with open('series_episode.csv') as f:
                f_csv = csv.DictReader(f)
                for row in f_csv:
                    self.start_urls.append('http://www.imdb.com/title/{!s}/episodes?season={!s}'.format(row["id"], row["series"]))

    def parse(self, response):
        episodes = response.xpath('//div[contains(@class, "list_item")]')
        title = response.xpath('//h3/a/text()').extract()[0]
        for episode in episodes:
            item = SeriesItem()
            item['series_episode'] = episode.xpath('div/a/div[contains(@data-const,"tt")]/div/text()').extract()
            item['title'] = '{!s}: {!s}'.format(title, episode.xpath('div[@class="info"]/strong/a/text()').extract())
            item['imdb_id'] = episode.xpath('div[@class="image"]/a/div/@data-const').extract()
            item['airdate'] = [x.strip() for x in episode.xpath('div/div[@class="airdate"]/text()').extract()]
            yield item
When I run this spider under scrapyd I get no result, although it does produce results when run with scrapy directly. I think the problem is in this line:
with open('series_episode.csv') as f:
I don't know where to put my csv file.
Please help me!
Thanks
One option would be to save it in /tmp:
with open('/tmp/series_episode.csv') as f:
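Since scrapyd runs the project from its own working directory (deployed as an egg), relative paths resolve differently than when running scrapy crawl inside the project folder. Besides hard-coding /tmp, one option is to pass the csv location in as a spider argument; a sketch against the spider above (only __init__ changes):

    def __init__(self, id=None, series=None, csv_path='/tmp/series_episode.csv', *args, **kwargs):
        super(EpisodeScraperSpider, self).__init__(*args, **kwargs)
        if id is not None:
            self.start_urls = ['http://www.imdb.com/title/{!s}/episodes?season={!s}'.format(id, series)]
        else:
            # csv_path can be supplied as an extra parameter of scrapyd's schedule.json call
            with open(csv_path) as f:
                for row in csv.DictReader(f):
                    self.start_urls.append('http://www.imdb.com/title/{!s}/episodes?season={!s}'.format(row["id"], row["series"]))

Extra parameters passed to schedule.json become spider arguments, so scheduling would look something like curl http://localhost:6800/schedule.json -d project=series -d spider=episode_scraper -d csv_path=/data/series_episode.csv (the project name series and the /data path are assumptions here).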

Scrapy issue with csv output

Here is my spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem
from scrapy.http import Request


class vriskoSpider(CrawlSpider):
    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']
    rules = (Rule(SgmlLinkExtractor(allow=('\?page=\d')), 'parse_start_url', follow=True),)

    def parse_start_url(self, response):
        hxs = HtmlXPathSelector(response)
        subpages = hxs.select('//a[@class="detailsHyper_class"]/@href').extract()
        ep = hxs.select('//a[@itemprop="name"]/text()').extract()
        ad = hxs.select('//div[@class="results_address_class"]/text()').extract()
        for eponimia, address, subpage in zip(ep, ad, subpages):
            vriskoit = VriskoItem()
            vriskoit['eponimia'] = eponimia
            vriskoit['address'] = address
            request = Request(subpage, callback=self.subPage)
            request.meta['vriskoit'] = vriskoit
            yield request

    def subPage(self, response):
        vriskoit = response.meta['vriskoit']
        hxs = HtmlXPathSelector(response)
        vriskoit['category'] = hxs.select('//div[@class="category_class"]/span/text()').extract()
        yield vriskoit
and here is my pipeline:
import csv


class myExporter(object):
    def __init__(self):
        self.brandCategoryCsv = csv.writer(open('brandCategoryTable.csv', 'wb'))
        self.brandCategoryCsv.writerow(['eponimia', 'address', 'category'])

    def process_item(self, item, spider):
        for e, a, c in zip(item['eponimia'], item['address'], item['category']):
            self.brandCategoryCsv.writerow([e.encode('utf-8'), a.encode('utf-8'), c.encode('utf-8')])
        return item
My problem is that for the first two fields (eponimia, address), only the first character is written to the output csv file, and I can't find out why.
Any help would be much appreciated; I am out of ideas.
Remove the zip call from myExporter.process_item:
def process_item(self, item, spider):
    self.brandCategoryCsv.writerow([item['eponimia'].encode('utf-8'),
                                    item['address'].encode('utf-8'),
                                    item['category'].encode('utf-8')])
    return item
You already converted the lists into individual items in vriskoSpider.parse_start_url, so item['eponimia'] and item['address'] are plain strings; zip iterates over your strings character by character:
In [1]: a = 'test1'
In [2]: b = 'test2'
In [3]: for x, y in zip(a, b):
   ...:     print x, y
   ...:
t t
e e
s s
t t
1 2
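For what it's worth, on a recent Scrapy version a hand-rolled csv pipeline like this is rarely needed: the built-in feed exports can write the csv (and handle the utf-8 encoding) directly. A sketch of equivalent settings, assuming the same three fields:

# settings.py -- let the feed exporter produce the csv instead of a pipeline
FEEDS = {
    'brandCategoryTable.csv': {
        'format': 'csv',
        'fields': ['eponimia', 'address', 'category'],
    },
}

The same result can also be had ad hoc with scrapy crawl vrisko -o brandCategoryTable.csv.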