Using scrapy in a script and passing args - scrapy

I want to use scrapy in a larger project, but I am unsure how to pass args like name,start_urls,and allowed_domains. As I understand it name,start_urls,and allowed_domains variables are settings for process.crawl, but I am not able to use self.var like I have with line- site = self.site since self obviously isn't defined there. There is also the problem of the proper way to return. At the end of the day I just want a way to crawl all urls on a single domain from within a script.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from urllib.parse import urlparse
from scrapy.crawler import CrawlerProcess
#from project.spiders.test_spider import SpiderName
from scrapy.utils.project import get_project_settings
process = CrawlerProcess(settings={
'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
crawledUrls = []
class MySpider(CrawlSpider):
name = 'spider_example_name'
def __init__(self,site):
self.site=site
site = self.site
domain = urlparse(site).netloc
start_urls = [site]
allowed_domains = [domain]
rules = (
Rule(LinkExtractor(unique=True), callback='parse_item', follow=True),
)
def parse_item(self, response):
#I think there is a way to do this with yeild
print(self.site)
crawledUrls.append(response.url)
def main():
spider = MySpider('http://quotes.toscrape.com')
process.crawl(spider)
process.start() # the script will block here until the crawling is finished
print("###########################################")
print(len(crawledUrls))
print(crawledUrls)
print("###########################################")
if __name__ == "__main__":
main()

See this comment on the scrapy github:
https://github.com/scrapy/scrapy/issues/1823#issuecomment-189731464
It appears you made the same mistakes as the reporter in that comment, where
process.crawl(...) takes a class, not instance, of Spider
params can be specified within the call to process.crawl(...) as keyword arguments. Check the possible kwargs in the Scrapy docs for CrawlerProcess.
So, for example, your main could look like this:
def main():
process.crawl(
MySpider,
start_urls=[
"http://example.com",
"http://example.org"
)
process.start()
...

Related

Simple scraper with Scrapy API

I am writing a scraper with Scrapy within a larger project, and I'm trying to keep it as minimal as possible (without create a whole scrapy project). This code downloads a single URL correctly:
import scrapy
from scrapy.crawler import CrawlerProcess
class WebsiteSpider(scrapy.Spider):
"""
https://docs.scrapy.org/en/latest/
"""
custom_settings = {'DOWNLOAD_DELAY': 1, 'DEPTH_LIMIT': 3}
name = 'my_website_scraper'
def parse(self,response):
html = response.body
url = response.url
# process page here
process = CrawlerProcess()
process.crawl(WebsiteSpider, start_urls=['https://www.bbc.co.uk/'])
process.start()
How can I enrich this code to keep scraping the links found in the start URLs (with a maximum depth, for example of 3)?
Try this.
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain
class WebsiteSpider(Spider):
name = 'bbc.co.uk'
allowed_domains = ['.bbc.co.uk']
start_urls = ['https://www.bbc.co.uk/']
# refresh_urls = True # For debug. If efresh_urls = True, start_urls will be crawled again.
def extract(self, url, html, models, modelNames):
doc = SimplifiedDoc(html)
lstA = doc.listA(url=url["url"]) # Get link data for subsequent crawling
data = [{"title": doc.title.text}] # Get target data
return {"Urls": lstA, "Data": data} # Return data to framework
SimplifiedMain.startThread(WebsiteSpider()) # Start crawling

how Login session maintained in scrapy

I have a simple login and another page behind the login. Once I complete the login (I can get user session). If I request another page the session is gone
The sample implementation
from scrapy.item import Item, Field
from scrapy.http import FormRequest
from scrapy.spider import Spider
from scrapy.utils.response import open_in_browser
class TestSpider(Spider):
name = "test"
allowed_domains = ["example.com"]
start_urls = ["https://example.com/customer/account/login/"]
def parse(self, response):
token = response.xpath(".//input[contains(#name,'token')]/#value").extract()[0]
yield FormRequest.from_response(
response,
formnumber=1,
formxpath=".//*[#id='form-account-login']",
formdata={
'token' : token,
'LoginForm[email]': 'xxxx',
'LoginForm[password]': 'xxxx',
},
clickdata={'id': 'customer-account-login'},
callback=self.parse1,
)
def parse1(self, response):
return scrapy.Request(url="https://example.com/customer/account/list/", callback = self.parse_2, errback=self.error)
def parse1(self,response):
open_in_browser(response)
Make sure that you've got
COOKIES_ENABLED = True
in your settings.py file
UPD:
You define parse1 method twice in your code.
def parse1(self, response):
return scrapy.Request(url="https://example.com/customer/account/list/", callback = self.parse_2, errback=self.error)
def parse1(self,response):
open_in_browser(response)

Crawl whole site except links under specific path

I've got a scrappy spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
class ExampleSpider(CrawlSpider):
name = "spidermaster"
allowed_domains = ["www.test.com"]
start_urls = ["http://www.test.com/"]
rules = [Rule(SgmlLinkExtractor(allow=()),
follow=True),
Rule(SgmlLinkExtractor(allow=()), callback='parse_item'),
]
def parse_item(self,response):
self.log('A response from %s just arrived!' % response.url)
What im trying is to crawl the whole webpage except what is under an specific path.
For example, i want to crawl all the test web site except www.test.com/too_much_links.
Thanks in advance
I usually do it in this way:
ignore = ['too_much_links', 'many_links']
rules = [Rule(SgmlLinkExtractor(allow=(), deny=ignore), follow=True),
Rule(SgmlLinkExtractor(allow=(), deny=ignore), callback='parse_item'),
]

Emailing when Scrapy project is finished

So i re-read this page in docs and still can't grasp in which files in project should i insert these lines?
from scrapy.mail import MailSender
mailer = MailSender()
mailer.send(to=["someone#example.com"], subject="Some subject", body="Some body", cc=["another#example.com"])
# ...
from scrapy.mail import MailSender
# ...
class MailSpider(Spider):
# ...
#classmethod
def from_crawler(cls, crawler):
spider = cls()
spider.mailer = MailSender()
crawler.signals.connect(spider.spider_closed, signals.spider_closed)
return spider
def spider_closed(self, spider):
spider.mailer.send(to=["someone#example.com"], subject="Some subject", body="Some body", cc=["another#example.com"])
# ...
You could use signals like this to send the e-mail after the spider has closed. But I am not sure if this is the best way of doing this.
Also I believe you could send e-mails anywhere a python code is allowed.

ScrapyDeprecationWaring: Command's default `crawler` is deprecated and will be removed. Use `create_crawler` method to instantiate crawlers

Scrapy version 0.19
I am using the code at this page ( Run multiple scrapy spiders at once using scrapyd ). When I run scrapy allcrawl, I got
ScrapyDeprecationWaring: Command's default `crawler` is deprecated and will be removed. Use `create_crawler` method to instantiate crawlers
Here is the code:
from scrapy.command import ScrapyCommand
import urllib
import urllib2
from scrapy import log
class AllCrawlCommand(ScrapyCommand):
requires_project = True
default_settings = {'LOG_ENABLED': False}
def short_desc(self):
return "Schedule a run for all available spiders"
def run(self, args, opts):
url = 'http://localhost:6800/schedule.json'
for s in self.crawler.spiders.list(): #this line raise the warning
values = {'project' : 'YOUR_PROJECT_NAME', 'spider' : s}
data = urllib.urlencode(values)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req)
log.msg(response)
How do I fix the DeprecationWarning ?
Thanks
Use:
crawler = self.crawler_process.create_crawler()