Here is a simple Python file, test.py:
import math

class myClass():
    def myFun(self, x):
        return math.sqrt(x)

if __name__ == "__main__":
    myInstance = myClass()
    print(myInstance.myFun(9))
Running python test.py prints 3.0. Let's analyse the execution:
1. An instance of myClass is created and assigned to myInstance.
2. myFun is called on that instance and the result is printed.
Now it is Scrapy's turn.
In the Scrapy 1.4 manual, quotes_spider.py looks like this:
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
The spider is run with scrapy crawl quotes, and I am puzzled:
1. Where is the main function or main body of the spider?
2. Which class is instantiated?
3. Which method is called?
Somewhere there must be something like:
mySpider = QuotesSpider(scrapy.Spider)
mySpider.parse(response)
How does scrapy crawl work exactly?
So let's start. Assuming you are on Linux/Mac, let's check where scrapy lives:
$ which scrapy
/Users/tarun.lalwani/.virtualenvs/myproject/bin/scrapy
Let's look at the content of this file
$ cat /Users/tarun.lalwani/.virtualenvs/myproject/bin/scrapy
#!/Users/tarun.lalwani/.virtualenvs/myproject/bin/python3.6
# -*- coding: utf-8 -*-
import re
import sys

from scrapy.cmdline import execute

if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
    sys.exit(execute())
So this runs the execute method from cmdline.py, and here is your main method.
cmdline.py
from __future__ import print_function
....
....

def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError: pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)

if __name__ == '__main__':
    execute()
If you look at the execute method, you will notice that it processes the arguments you pass, which are crawl quotes in your case. It scans the project for spider classes and checks which one has name defined as quotes, then creates a CrawlerProcess, and that runs the whole show.
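Conceptually (this is a simplified sketch of my own, not Scrapy's actual source), what scrapy crawl quotes boils down to once execute() has resolved the command is roughly:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# hypothetical import path; in reality the crawl command looks the spider up
# by its name attribute ("quotes") via the project's spider loader
from myproject.spiders.quotes_spider import QuotesSpider

process = CrawlerProcess(get_project_settings())
process.crawl(QuotesSpider)  # instantiates the spider and schedules its requests
process.start()              # starts the Twisted reactor and blocks until done

So the "main body" the question asks about lives inside Scrapy itself: CrawlerProcess instantiates QuotesSpider, calls start_requests(), and later calls parse() as the callback for each downloaded response.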
Scrapy is built on the Twisted Python framework, which is a scheduler-based, event-driven framework.
Consider this part of the code:
for url in urls:
    yield scrapy.Request(url=url, callback=self.parse)
When the engine executes this function, the first yield is executed and the value is returned to the engine. The engine then looks at the other tasks that are pending and executes them (when they yield, some other pending task gets its chance). So yield is what allows a function's execution to be broken into parts, and that is what lets Scrapy/Twisted do its work.
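To see the idea in isolation, here is a minimal plain-Python sketch (my own illustration, not Scrapy code) of how generators let a scheduler interleave several tasks by resuming each one at its last yield:

def task(name, steps):
    for i in range(steps):
        print("%s step %d" % (name, i))
        yield  # hand control back to the scheduler

def scheduler(tasks):
    # resume each task in turn until all of them are exhausted
    pending = list(tasks)
    while pending:
        for t in list(pending):
            try:
                next(t)
            except StopIteration:
                pending.remove(t)

scheduler([task("spider-A", 2), task("spider-B", 3)])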
You can get a detailed overview at the link below:
https://doc.scrapy.org/en/latest/topics/architecture.html
Related
I have a config file that holds details for many websites. I take a user input argument in Scrapy using the -a parameter and pull the matching allowed_domains and start_urls from the config file. Since this is a generic spider, I am using a rule-based link extractor.
Below is my code:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup
import yaml
import re
import scrapy

with open("/scrapyConfig.yaml", "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

def cleanHtml(raw_html):
    CLEANR = re.compile('<.*?>')
    cleanText = str(re.sub(CLEANR, '', raw_html))
    return cleanText

def remove_tags(html):
    soup = BeautifulSoup(html, "html.parser")
    for data in soup(['style', 'script']):
        data.decompose()
    noTagsData = str(' '.join(soup.stripped_strings))
    return noTagsData

class SpiderSpider(CrawlSpider):
    name = 'spider1'

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        userInp = self.site
        self.allowed_domains = config[userInp]['allowed_domain']
        self.start_urls = config[userInp]['start_url']

    rules = [(Rule(LinkExtractor(unique=False, allow=(config[self.site]['regex1'], config[self.site]['regex2'])), callback='parse_item', follow=True))]

    def parse_item(self, response):
        uncleanText = response.xpath(config[self.site]['xpath1']).extract()
        cleanText = [x.replace("\n", "") for x in uncleanText]
        cleanText = [x.replace("\t", " ") for x in cleanText]
        cleanText = [x.replace("\r", "") for x in cleanText]
        cleanText = [x.replace("\xa0", "") for x in cleanText]
        cleanText = [x.replace(":", " ") for x in cleanText]
        cleanText = remove_tags(str(cleanText))
        finalCleanJD = cleanHtml(str(cleanText))
        yield {"URL": response.url, "Job Description": finalCleanJD}
I am able to take the user input and fetch the corresponding allowed_domains and start_urls from the config file in the __init__ function, but when I pass the same argument to the rule's link extractor, it does not recognise self.site. If I put the rule inside __init__, the spider does not scrape the page: the terminal just reports the page as crawled and then exits. The rules variable is not even highlighted when it is inside __init__, which suggests it is not used anywhere, whereas outside __init__ it is highlighted but self.site is not recognised. How can I make this generic spider take a user input argument, pull the matching details from the config file, and start scraping?
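For what it's worth, one common pattern for this situation (a sketch of my own, assuming the -a argument is named site; not a verified fix for the exact code above) is to accept the argument explicitly in __init__, build self.rules there, and only then call super().__init__(), because CrawlSpider compiles its rules during initialisation:

class SpiderSpider(CrawlSpider):
    name = 'spider1'

    def __init__(self, site=None, *args, **kwargs):
        # read the per-site settings first, while self.rules can still be set
        self.site = site
        self.allowed_domains = config[site]['allowed_domain']
        self.start_urls = config[site]['start_url']
        self.rules = (
            Rule(LinkExtractor(unique=False,
                               allow=(config[site]['regex1'],
                                      config[site]['regex2'])),
                 callback='parse_item', follow=True),
        )
        # CrawlSpider.__init__ runs _compile_rules(), so it must come after
        # self.rules has been assigned
        super().__init__(*args, **kwargs)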
I have an issue with the scrapyd API.
I wrote a simple spider; it takes a domain URL as an argument.
import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'

    def __init__(self, domains=None):
        self.allowed_domains = [domains]
        self.start_urls = ['http://{}/'.format(domains)]

    def parse(self, response):
        # time.sleep(int(self.sleep))
        item = {}
        item['title'] = response.xpath('//head/title/text()').extract()
        yield item
It works perfectly if I run it like this:
scrapy crawl quotes -a domains=quotes.toscrape.com
But when it comes time to run it via scrapyd_api, it goes wrong:
from scrapyd_api import ScrapydAPI
scrapyd = ScrapydAPI('http://localhost:6800')
scrapyd.schedule(project='pd', spider='quotes', domains='http://quotes.toscrape.com/')
I get builtins.TypeError: __init__() got an unexpected keyword argument '_job'.
How can I start Scrapy spiders via the scrapyd API with arguments?
Here is an answer.
According to this answer, I was handling the super() call incorrectly.
Now my code looks like this:
class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = []

    def __init__(self, *args, **kwargs):
        super(QuotesSpider, self).__init__(*args, **kwargs)
        self.allowed_domains = [kwargs.get('domains')]
        self.start_urls.append('http://{}/'.format(kwargs.get('domains')))
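With __init__ now accepting *args and **kwargs, the scrapyd-api call from the question should go through, because the extra _job argument that scrapyd passes is simply absorbed by **kwargs and set as a spider attribute. A usage sketch (passing the bare domain, since the spider builds the http://{}/ URL itself):

from scrapyd_api import ScrapydAPI

scrapyd = ScrapydAPI('http://localhost:6800')
scrapyd.schedule(project='pd', spider='quotes', domains='quotes.toscrape.com')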
I am trying to make this code output to a CSV file when calling the spider with -o output.csv.
# -*- coding: utf-8 -*-
import scrapy

from scrapy.spiders import SitemapSpider
from scrapy.spiders import Spider
from scrapy.http import Request, XmlResponse
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
from scrapy.utils.gz import gunzip, is_gzipped
import re
import requests

class GetpagesfromsitemapSpider(SitemapSpider):
    name = "test"
    handle_httpstatus_list = [404]

    def parse(self, response):
        print response.url

    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                self.logger.info('Ignoring invalid sitemap: %s', response.url)
                return
            s = Sitemap(body)
            sites = []
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(s):
                    for r, c in self._cbs:
                        if r.search(loc):
                            sites.append(loc)
                            break
            print sites

    def __init__(self, spider=None, *a, **kw):
        super(GetpagesfromsitemapSpider, self).__init__(*a, **kw)
        self.spider = spider
        l = []
        url = "https://channelstore.roku.com"
        resp = requests.head(url + "/sitemap.xml")
        if (resp.status_code != 404):
            l.append(resp.url)
        else:
            resp = requests.head(url + "/robots.txt")
            if (resp.status_code == 200):
                l.append(resp.url)
        self.sitemap_urls = l
        print self.sitemap_urls

def iterloc(it, alt=False):
    for d in it:
        yield d['loc']

        # Also consider alternate URLs (xhtml:link rel="alternate")
        if alt and 'alternate' in d:
            for l in d['alternate']:
                yield l
I have tried changing the print response.url statement in the parse method to a few things, but I can't seem to make this script output to a CSV; all I can manage is seeing the exact information I want, but on the terminal screen.
This code comes from here, but I am struggling with what should be the easy part of finishing it.
Any help is greatly appreciated!
It is not clear from your example, but it looks like you are not passing the command-line arguments (-o) to your SitemapSpider.
A simpler solution, instead of passing the -o argument, is to just redirect your output to a file:
my_script.py > output.csv
OR
my_script.py | tee output.csv   <-- this writes to the file and also prints to your terminal
EDIT:
Not the most efficient way, but without seeing a full script:
def parse(self, response):
    with open('output.csv', 'a') as fh:
        fh.write('{}\n'.format(response.url))
This will append each response.url to a new line in the output.csv file.
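A different sketch, based on my assumption about what the asker ultimately wants: if parse() yields items instead of printing them, Scrapy's built-in feed exports can produce the CSV, so running scrapy crawl test -o output.csv works without any manual file handling.

def parse(self, response):
    # yielded items are picked up by the feed exporter configured via -o
    yield {'url': response.url}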
I would like to repeatedly scrape the same URLs with different delays. After researching the issue, it seemed that the appropriate solution was to use something like
nextreq = scrapy.Request(url, dont_filter=True)
d = defer.Deferred()
delay = 1
reactor.callLater(delay, d.callback, nextreq)
yield d
in parse.
However, I have been unable to make this work. I am getting the error message
ERROR: Spider must return Request, BaseItem, dict or None, got 'Deferred'
I am not familiar with Twisted, so I hope I am just missing something obvious.
Is there a better way of achieving my goal that doesn't fight the framework so much?
I finally found an answer in an old PR:
def parse(self, response):
    req = scrapy.Request(...)
    delay = 0
    reactor.callLater(delay, self.crawler.engine.schedule, request=req, spider=self)
However, the spider can then exit too early because it goes idle. Based on the outdated middleware https://github.com/ArturGaspar/scrapy-delayed-requests, this can be remedied with:
from scrapy import signals
from scrapy.exceptions import DontCloseSpider

class ImmortalSpiderMiddleware(object):

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_idle, signal=signals.spider_idle)
        return s

    @classmethod
    def spider_idle(cls, spider):
        raise DontCloseSpider()
The final option, updating the middleware by ArturGaspar, led to:
from weakref import WeakKeyDictionary

from scrapy import signals
from scrapy.exceptions import DontCloseSpider, IgnoreRequest
from twisted.internet import reactor

class DelayedRequestsMiddleware(object):
    requests = WeakKeyDictionary()

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)
        return ext

    @classmethod
    def spider_idle(cls, spider):
        if cls.requests.get(spider):
            spider.log("delayed requests pending, not closing spider")
            raise DontCloseSpider()

    def process_request(self, request, spider):
        delay = request.meta.pop('delay_request', None)
        if delay:
            self.requests.setdefault(spider, 0)
            self.requests[spider] += 1
            reactor.callLater(delay, self.schedule_request, request.copy(),
                              spider)
            raise IgnoreRequest()

    def schedule_request(self, request, spider):
        spider.crawler.engine.schedule(request, spider)
        self.requests[spider] -= 1
It can then be used in parse like this:
yield Request(..., meta={'delay_request': 5})
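One detail worth noting (my assumption about project layout, not part of the answer above): the middleware only runs once it is enabled in settings.py, e.g. assuming it lives in a hypothetical module myproject/middlewares.py:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.DelayedRequestsMiddleware': 543,
}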
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from items import BackpageItem, CityvibeItem
from scrapy.shell import inspect_response
import re
import time
import sys

class MySpider(CrawlSpider):
    name = 'example'
    allowed_domains = ['www.example.com']

    # Set last_page to decide how many pages are crawled
    last_page = 10
    start_urls = ['http://www.example.com/washington/?page=%s' % page for page in xrange(1, last_page)]

    rules = (
        # Follow all links inside <div class="cat"> and call parse_item on each link
        Rule(LinkExtractor(
            restrict_xpaths=('//a[@name="listing_link"]')),
            callback='parse_item'),
    )

    # Extract relevant text from the website into an ExampleItem
    def parse_item(self, response):
        item = ExampleItem()
        item['title'] = response.xpath('string(//h2[@class="post-title"]/text())').extract()
        item['desc'] = response.xpath('string(//div[@class="section post-body"]/text())').extract()
        item['url'] = response.url
        item['location'] = response.xpath('string(//div[@class="posting"]/div[2]/text())').extract()
        item['posted_date'] = response.xpath('string(//div[@class="post-date"]/span/text())').extract()  # .re("(?<=Posted\s*).*")
        item['crawled_date'] = time.strftime("%c")
        # not sure how to get the other image urls right now
        item['image_urls'] = response.xpath('string(//div[@class="section post-contact-container"]/div/div/img/@src)').extract()
        # I can't find this section on any pages right now
        item['other_ad_urls'] = response.xpath('//a[@name="listing_link"]/@href').extract()
        item['phone_number'] = "".join(response.xpath('//div[@class="post-info"]/span[contains(text(), "Phone")]/following-sibling::a/text()').extract())
        item['email'] = "".join(response.xpath('//div[@class="post-info"]/span[contains(text(), "Email")]/following-sibling::a/text()').extract())
        item['website'] = "".join(response.xpath('//div[@class="post-info limit"]/span[contains(text(), "Website")]/following-sibling::a/text()').extract())
        item['name'] = response.xpath('//div[@class="post-name"]/text()').extract()
        # uncomment for debugging
        # inspect_response(response, self)
        return item

# process1 = CrawlerProcess({
#     'ITEM_PIPELINES': {
#         # 'scrapy.contrib.pipeline.images.ImagesPipeline': 1
#         'backpage.pipelines.GeolocationPipeline': 4,
#         'backpage.pipelines.LocationExtractionPipeline': 3,
#         'backpage.pipelines.BackpagePipeline': 5
#     }
# });

process1 = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process1.crawl(MySpider)
process1.start()
My spider works perfectly when I run it from the command line with
scrapy crawl example
but I will need to run multiple spiders, so I want to put them all in a script and use CrawlerProcess. When I try to run this, I get the error
AttributeError: 'CrawlerProcess' object has no attribute 'crawl'
This is scrapy version 0.24.6.
All items and pipelines are correct, because the spider works from the command line.
There is (was?) a compatibility problem between Scrapy and Scrapyd. I needed to run Scrapy 0.24 and Scrapyd 1.0.1.
Here is the issue on GitHub:
https://github.com/scrapy/scrapyd/issues/100#issuecomment-115268880
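For reference, on Scrapy versions where CrawlerProcess.crawl() is available, running several spiders from one script looks roughly like this (a sketch; AnotherSpider is a hypothetical second spider, not from the question):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl(MySpider)
process.crawl(AnotherSpider)  # hypothetical second spider class
process.start()  # blocks until all crawls finish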