Why does my CrawlerProcess not have the function "crawl"? - scrapy

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from items import BackpageItem, CityvibeItem
from scrapy.shell import inspect_response
import re
import time
import sys
class MySpider(CrawlSpider):
name = 'example'
allowed_domains = ['www.example.com']
# Set last_age to decide how many pages are crawled
last_page = 10
start_urls = ['http://www.example.com/washington/?page=%s' % page for page in xrange(1,last_page)]
rules = (
#Follow all links inside <div class="cat"> and calls parse_item on each link
Rule(LinkExtractor(
restrict_xpaths=('//a[#name="listing_link"]')),
callback='parse_item'),
)
# Extract relevent text from the website into a ExampleItem
def parse_item(self, response):
item = ExampleItem()
item['title'] = response.xpath('string(//h2[#class="post-title"]/text())').extract()
item['desc'] = response.xpath('string(//div[#class="section post-body"]/text())').extract()
item['url'] = response.url
item['location'] = response.xpath('string(//div[#class="posting"]/div[2]/text())').extract()
item['posted_date'] = response.xpath('string(//div[#class="post-date"]/span/text())').extract()#.re("(?<=Posted\s*).*")
item['crawled_date'] = time.strftime("%c")
# not sure how to get the other image urls right now
item['image_urls'] = response.xpath('string(//div[#class="section post-contact-container"]/div/div/img/#src)').extract()
# I can't find this section on any pages right now
item['other_ad_urls'] = response.xpath('//a[#name="listing_link"]/#href').extract()
item['phone_number'] = "".join(response.xpath('//div[#class="post-info"]/span[contains(text(), "Phone")]/following-sibling::a/text()').extract())
item['email'] = "".join(response.xpath('//div[#class="post-info"]/span[contains(text(), "Email")]/following-sibling::a/text()').extract())
item['website'] = "".join(response.xpath('//div[#class="post-info limit"]/span[contains(text(), "Website")]/following-sibling::a/text()').extract())
item['name'] = response.xpath('//div[#class="post-name"]/text()').extract()
#uncomment for debugging
#inspect_response(response, self)
return item
# process1 = CrawlerProcess({
# 'ITEM_PIPELINES': {
# #'scrapy.contrib.pipeline.images.ImagesPipeline': 1
# 'backpage.pipelines.GeolocationPipeline': 4,
# 'backpage.pipelines.LocationExtractionPipeline': 3,
# 'backpage.pipelines.BackpagePipeline': 5
# }
# });
process1 = CrawlerProcess({
'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process1.crawl(MySpider)
process1.start()
My spider works perfectly when I run it from the command line with
scrapy crawl example
but I will need to run multiple spiders, so I want to put them all in a script and use CrawlerProcess. When I try to run this I get the error,
AttributeError: 'CrawlerProcess' object has no attribute 'crawl'
This is scrapy version 0.24.6.
All items and pipelines are correct, because the spider works from the command line.

There is (was?) a compatibility problem between Scrapy and Scrapyd. I needed to run Scrapy 0.24 and Scrapyd 1.0.1.
Here is the issue on Github
https://github.com/scrapy/scrapyd/issues/100#issuecomment-115268880

Related

How to take user argument and pass it to Rule extractor in Scrapy

I have a config file in which many website details are present. I am taking user input argument in scrapy using -a parameter and taking out matching allowed_domains and start_urls from config file. Since this is a generic spider, I am using rule extractor.
Below is my code:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup
import yaml
import re
import scrapy
with open("/scrapyConfig.yaml", "r") as f:
config = yaml.load(f, Loader=yaml.FullLoader)
def cleanHtml(raw_html):
CLEANR = re.compile('<.*?>')
cleanText = str(re.sub(CLEANR,'', raw_html))
return cleanText
def remove_tags(html):
soup = BeautifulSoup(html, "html.parser")
for data in soup(['style', 'script']):
data.decompose()
noTagsData = str(' '.join(soup.stripped_strings))
return noTagsData
class SpiderSpider(CrawlSpider):
name = 'spider1'
def __init__(self, **kwargs):
super().__init__(**kwargs)
userInp = self.site
self.allowed_domains=config[userInp]['allowed_domain']
self.start_urls=config[userInp]['start_url']
rules = [(Rule(LinkExtractor(unique=False,allow=(config[self.site]['regex1'],config[self.site]['regex2'])),callback='parse_item',follow=True))]
def parse_item(self,response):
uncleanText = response.xpath(config[self.site]['xpath1']).extract()
cleanText = [x.replace("\n","") for x in uncleanText]
cleanText = [x.replace("\t"," ") for x in cleanText]
cleanText = [x.replace("\r","") for x in cleanText]
cleanText = [x.replace("\xa0","") for x in cleanText]
cleanText = [x.replace(":"," ") for x in cleanText]
cleanText = remove_tags(str(cleanText))
finalCleanJD = cleanHtml(str(cleanText))
yield {"URL":response.url,"Job Description":finalCleanJD}
I am able to take the user input and fetch corresponding allowed_domains and start_urls from config file using init function but when I am passing the same argument in rule extractor, it is not recognising self.site and if I put this rule extractor inside init function then spider is not scraping the page. It's just written as crawled in terminal and then it exits. Even the rule variable is not highlighted when it is inside init function which tells that rule variable is not used anywhere but when it is put outside init function it is getting highlighted but it is not recognizing self.site variable. How can I make this generic spider take user input argument and take out the matching details from config file and start scraping?

How do I change shell text to save to a CSV output?

I am trying to make this code output to a csv file when calling the spider with -o output.csv
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import SitemapSpider
from scrapy.spiders import Spider
from scrapy.http import Request, XmlResponse
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
from scrapy.utils.gz import gunzip, is_gzipped
import re
import requests
class GetpagesfromsitemapSpider(SitemapSpider):
name = "test"
handle_httpstatus_list = [404]
def parse(self, response):
print response.url
def _parse_sitemap(self, response):
if response.url.endswith('/robots.txt'):
for url in sitemap_urls_from_robots(response.body):
yield Request(url, callback=self._parse_sitemap)
else:
body = self._get_sitemap_body(response)
if body is None:
self.logger.info('Ignoring invalid sitemap: %s', response.url)
return
s = Sitemap(body)
sites = []
if s.type == 'sitemapindex':
for loc in iterloc(s, self.sitemap_alternate_links):
if any(x.search(loc) for x in self._follow):
yield Request(loc, callback=self._parse_sitemap)
elif s.type == 'urlset':
for loc in iterloc(s):
for r, c in self._cbs:
if r.search(loc):
sites.append(loc)
break
print sites
def __init__(self, spider=None, *a, **kw):
super(GetpagesfromsitemapSpider, self).__init__(*a, **kw)
self.spider = spider
l = []
url = "https://channelstore.roku.com"
resp = requests.head(url + "/sitemap.xml")
if (resp.status_code != 404):
l.append(resp.url)
else:
resp = requests.head(url + "/robots.txt")
if (resp.status_code == 200):
l.append(resp.url)
self.sitemap_urls = l
print self.sitemap_urls
def iterloc(it, alt=False):
for d in it:
yield d['loc']
# Also consider alternate URLs (xhtml:link rel="alternate")
if alt and 'alternate' in d:
for l in d['alternate']:
yield l
I have tried changing the print response url on line 18 to a few things but I cant seem to make this script output to a CSV, all I can manage is seeing the exact information I want but on the terminal screen.
This code is from here but I am not working well with the easy part of completing the code.
Any help is greatly appreciated!
Not clear from your example, but it looks like you are not passing the command line arguments (-o) to your SitemapSpider.
A simpler solution, instead of passing the -o argument, is to just redirect your output to a file:
my_script.py > output.csv
OR
my_script.py | tee output.csv <-- this way will write to file, and also output in your terminal
EDIT:
Not the most efficient way, but without seeing a full script:
def parse(self, response):
with open('output.csv', 'a') as fh:
fh.write('{}\n'.format(response.url))
This will append each response.url to a new line in the output.csv file

How scrapy crawl work:which class instanced and which method called?

Here is a simple python file--test.py.
import math
class myClass():
def myFun(self,x):
return(math.sqrt(x))
if __name__ == "__main__":
myInstance=myClass()
print(myInstance.myFun(9))
It print 3 with python test.py,let's analyse the running process.
1. to instance myClass and assign it to myInstance.
2.to call myFun function and print the result.
It is scrapy's turn.
In the scrapy1.4 manual,quotes_spider.py is as below.
import scrapy
class QuotesSpider(scrapy.Spider):
name = "quotes"
def start_requests(self):
urls = [
'http://quotes.toscrape.com/page/1/',
'http://quotes.toscrape.com/page/2/',
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
page = response.url.split("/")[-2]
filename = 'quotes-%s.html' % page
with open(filename, 'wb') as f:
f.write(response.body)
self.log('Saved file %s' % filename)
To run the spider with scrapy crawl quotes,i am puzzled:
1.Where is the main function or main body for the spider?
2.Which class was instanced?
3.Which method was called?
mySpider = QuotesSpider(scrapy.Spider)
mySpider.parse(response)
How scrapy crawl work exactly?
So let's start. Assuming you use linux/mac. Let's check where us scrapy
$ which scrapy
/Users/tarun.lalwani/.virtualenvs/myproject/bin/scrapy
Let's look at the content of this file
$ cat /Users/tarun.lalwani/.virtualenvs/myproject/bin/scrapy
#!/Users/tarun.lalwani/.virtualenvs/myproject/bin/python3.6
# -*- coding: utf-8 -*-
import re
import sys
from scrapy.cmdline import execute
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
sys.exit(execute())
So this executes execute method from cmdline.py and her is your main method.
cmdline.py
from __future__ import print_function
....
....
def execute(argv=None, settings=None):
if argv is None:
argv = sys.argv
# --- backwards compatibility for scrapy.conf.settings singleton ---
if settings is None and 'scrapy.conf' in sys.modules:
from scrapy import conf
if hasattr(conf, 'settings'):
settings = conf.settings
# ------------------------------------------------------------------
if settings is None:
settings = get_project_settings()
# set EDITOR from environment if available
try:
editor = os.environ['EDITOR']
except KeyError: pass
else:
settings['EDITOR'] = editor
check_deprecated_settings(settings)
# --- backwards compatibility for scrapy.conf.settings singleton ---
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
with warnings.catch_warnings():
warnings.simplefilter("ignore", ScrapyDeprecationWarning)
from scrapy import conf
conf.settings = settings
# ------------------------------------------------------------------
inproject = inside_project()
cmds = _get_commands_dict(settings, inproject)
cmdname = _pop_command_name(argv)
parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
conflict_handler='resolve')
if not cmdname:
_print_commands(settings, inproject)
sys.exit(0)
elif cmdname not in cmds:
_print_unknown_command(settings, cmdname, inproject)
sys.exit(2)
cmd = cmds[cmdname]
parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
parser.description = cmd.long_desc()
settings.setdict(cmd.default_settings, priority='command')
cmd.settings = settings
cmd.add_options(parser)
opts, args = parser.parse_args(args=argv[1:])
_run_print_help(parser, cmd.process_options, args, opts)
cmd.crawler_process = CrawlerProcess(settings)
_run_print_help(parser, _run_command, cmd, args, opts)
sys.exit(cmd.exitcode)
if __name__ == '__main__':
execute()
Now if you notice execute method it processes the arguments passed by you. which is crawl quotes in your case. The execute methods scans the projects for classes and check which has name defined as quotes. It creates the CrawlerProcess class and that runs the whole show.
Scrapy is based on Twisted Python Framework. Which is a scheduler based framework.
Consider the below part of the code
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
When the engine executes this function and first yield is execute. The value is returned to the engined. The engine now looks at other task that are pending executes them, (when they yield, some other pending task queue function gets a chance). So yield is what allows to break a function execution into parts and help Scrapy/Twisted work.
You can get a detailed overview on the link below
https://doc.scrapy.org/en/latest/topics/architecture.html

How to make rules of CrawlSpider context-sensitive?

I notice that, the rule of CrawlSpider extract urls on every none-leaf pages.
Can I enable rule only when current page meets some condition (for example: url matches a regex)?
I have two pages:
-------------------Page A-------------------
Page URL: http://www.site.com/pattern-match.html
--------------------------------------------
- [link](http://should-extract-this)
- [link](http://should-extract-this)
- [link](http://should-extract-this)
--------------------------------------------
--------------------Page B--------------------
Page URL: http://www.site.com/pattern-not-match.html
-----------------------------------------------
- [link](http://should-not-extract-this)
- [link](http://should-not-extract-this)
- [link](http://should-not-extract-this)
-----------------------------------------------
So, the rule should only extract urls from PageA. How to do it? Thanks!
I just found a dirty way to inject response to rule.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from scrapy.http import Request, HtmlResponse
from scrapy.contrib.spiders import CrawlSpider, Rule
import inspect
class MyCrawlSpider(CrawlSpider):
def _requests_to_follow(self, response):
if not isinstance(response, HtmlResponse):
return
seen = set()
for n, rule in enumerate(self._rules):
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
if links and rule.process_links:
links = rule.process_links(links)
seen = seen.union(links)
for link in links:
r = Request(url=link.url, callback=self._response_downloaded)
r.meta.update(rule=n, link_text=link.text)
# ***>>> HACK <<<***
# pass `response` as additional argument to `process_request`
fun = rule.process_request
if not hasattr(fun, 'nargs'):
fun.nargs = len(inspect.getargs(fun.func_code).args)
if fun.nargs==1:
yield fun(r)
elif fun.nargs==2:
yield fun(r, response)
else:
raise Exception('too many arguments')
Try it out:
def process_request(request, response):
if 'magick' in response.url:
return request
class TestSpider(MyCrawlSpider):
name = 'test'
allowed_domains = ['test.com']
start_urls = ['http://www.test.com']
rules = [
Rule(SgmlLinkExtractor(restrict_xpaths='//a'), callback='parse_item', process_request=process_request),
]
def parse_item(self, response):
print response.url

Broad Scrapy Crawl: sgmlLinkextractor rule does not work

I've spent a lot of time playing around and using google but I could not solve my problem. I am new to Scrapy and I hope you can help me.
Part of the spider that works: I define my start_requests urls out of a MySQL Datbase. With the 'parse_item' statement I write the response into seperate files. Both of these steps work fine.
My Problem: Additionally I want to follow every url which contains '.ch' and - as I do for the start_requests - send them to the 'parse_item' method. I therefore defined a rule with a sgmlLinkExtractor and the 'parse_item' method as the callback. This does not work. After completion, I only have the files for the urls defined in 'start_requests'. I don't get any error messages.
Here is my code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import mysql.connector
from scrapy.http import Request
class FirstSpider(CrawlSpider):
name = 'firstspider'
start_urls = []
def start_requests(self):
conn = mysql.connector.connect(user='root', password = 'root', host= 'localhost', database = 'Eistee')
cursor = conn.cursor()
query = ("SELECT Domain, CompanyName FROM Crawlbydomain LIMIT 300, 100")
cursor.execute(query)
results = cursor.fetchall()
for result in results:
urlrequest = 'http://'+result[0]
yield Request(urlrequest, callback = self.parse_item )
rules = (Rule (SgmlLinkExtractor(allow=('.ch', )), callback='parse_item', follow= True),)
def parse_item(self, response):
filename = response.url.translate(None, './')
open(filename, 'wb').write(response.body)
Can you help me?
To make CrawlSpider do its "magic" you need the requests to go through CrawlSpider's parse() callback.
So in start_requests() your Requests must use callback=self.parse (or not set the callback argument)
If you also want the start Requests to go through parse_item you need to set a parse_start_url attribute in your spider set to parse_item.
So you need to have something like:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import mysql.connector
from scrapy.http import Request
class FirstSpider(CrawlSpider):
name = 'firstspider'
def start_requests(self):
conn = mysql.connector.connect(user='root', password = 'root', host= 'localhost', database = 'Eistee')
cursor = conn.cursor()
query = ("SELECT Domain, CompanyName FROM Crawlbydomain LIMIT 300, 100")
cursor.execute(query)
results = cursor.fetchall()
for result in results:
urlrequest = 'http://'+result[0]
yield Request(urlrequest)
rules = (Rule (SgmlLinkExtractor(allow=('.ch', )), callback='parse_item', follow= True),)
def parse_item(self, response):
filename = response.url.translate(None, './')
open(filename, 'wb').write(response.body)
parse_start_url = parse_item