Crawl whole site except links under specific path - scrapy

I've got a Scrapy spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class ExampleSpider(CrawlSpider):
    name = "spidermaster"
    allowed_domains = ["www.test.com"]
    start_urls = ["http://www.test.com/"]

    rules = [
        Rule(SgmlLinkExtractor(allow=()), follow=True),
        Rule(SgmlLinkExtractor(allow=()), callback='parse_item'),
    ]

    def parse_item(self, response):
        self.log('A response from %s just arrived!' % response.url)
What I'm trying to do is crawl the whole website except what is under a specific path.
For example, I want to crawl all of the test website except www.test.com/too_much_links.
Thanks in advance.

I usually do it in this way:
ignore = ['too_much_links', 'many_links']

rules = [
    Rule(SgmlLinkExtractor(allow=(), deny=ignore), follow=True),
    Rule(SgmlLinkExtractor(allow=(), deny=ignore), callback='parse_item'),
]
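For newer Scrapy versions, where scrapy.contrib and SgmlLinkExtractor have been removed, the same deny approach still applies. Here is a minimal sketch, assuming the paths from the question and a single rule that both follows and parses:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class ExampleSpider(CrawlSpider):
    name = "spidermaster"
    allowed_domains = ["www.test.com"]
    start_urls = ["http://www.test.com/"]

    # deny takes regular expressions matched against the absolute URL,
    # so any link whose URL contains one of these fragments is skipped.
    rules = [
        Rule(
            LinkExtractor(deny=(r'/too_much_links', r'/many_links')),
            callback='parse_item',
            follow=True,
        ),
    ]

    def parse_item(self, response):
        self.log('A response from %s just arrived!' % response.url)

A single Rule with both callback and follow=True is usually enough; two separate rules are not required for this.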

Related

Simple scraper with Scrapy API

I am writing a scraper with Scrapy within a larger project, and I'm trying to keep it as minimal as possible (without creating a whole Scrapy project). This code downloads a single URL correctly:
import scrapy
from scrapy.crawler import CrawlerProcess

class WebsiteSpider(scrapy.Spider):
    """
    https://docs.scrapy.org/en/latest/
    """
    custom_settings = {'DOWNLOAD_DELAY': 1, 'DEPTH_LIMIT': 3}
    name = 'my_website_scraper'

    def parse(self, response):
        html = response.body
        url = response.url
        # process page here

process = CrawlerProcess()
process.crawl(WebsiteSpider, start_urls=['https://www.bbc.co.uk/'])
process.start()
How can I enrich this code to keep scraping the links found in the start URLs (with a maximum depth of, for example, 3)?
Try this.
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain

class WebsiteSpider(Spider):
    name = 'bbc.co.uk'
    allowed_domains = ['.bbc.co.uk']
    start_urls = ['https://www.bbc.co.uk/']
    # refresh_urls = True  # For debugging. If refresh_urls = True, start_urls will be crawled again.

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        lstA = doc.listA(url=url["url"])  # Get link data for subsequent crawling
        data = [{"title": doc.title.text}]  # Get target data
        return {"Urls": lstA, "Data": data}  # Return data to the framework

SimplifiedMain.startThread(WebsiteSpider())  # Start crawling
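If you would rather stay with plain Scrapy, a sketch along the lines of the question's own code is below: parse() yields a follow-up request for every link it finds, and the DEPTH_LIMIT setting (already present in the question) caps how deep the crawl goes. The allowed_domains value is an assumption to keep the crawl on one site.

import scrapy
from scrapy.crawler import CrawlerProcess

class WebsiteSpider(scrapy.Spider):
    name = 'my_website_scraper'
    allowed_domains = ['bbc.co.uk']  # assumption: keep the crawl on one site
    custom_settings = {'DOWNLOAD_DELAY': 1, 'DEPTH_LIMIT': 3}

    def parse(self, response):
        # process the page here, e.g. yield a small item
        yield {'url': response.url, 'title': response.css('title::text').get()}

        # follow every link on the page; DEPTH_LIMIT stops the recursion
        for href in response.css('a::attr(href)').getall():
            yield response.follow(href, callback=self.parse)

process = CrawlerProcess()
process.crawl(WebsiteSpider, start_urls=['https://www.bbc.co.uk/'])
process.start()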

Using scrapy in a script and passing args

I want to use Scrapy in a larger project, but I am unsure how to pass args like name, start_urls, and allowed_domains. As I understand it, name, start_urls, and allowed_domains are settings for process.crawl, but I am not able to use self.var the way I do on the line site = self.site, since self obviously isn't defined there. There is also the problem of the proper way to return. At the end of the day I just want a way to crawl all URLs on a single domain from within a script.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from urllib.parse import urlparse
from scrapy.crawler import CrawlerProcess
#from project.spiders.test_spider import SpiderName
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(settings={
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

crawledUrls = []

class MySpider(CrawlSpider):
    name = 'spider_example_name'

    def __init__(self, site):
        self.site = site

    site = self.site
    domain = urlparse(site).netloc
    start_urls = [site]
    allowed_domains = [domain]

    rules = (
        Rule(LinkExtractor(unique=True), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # I think there is a way to do this with yield
        print(self.site)
        crawledUrls.append(response.url)

def main():
    spider = MySpider('http://quotes.toscrape.com')
    process.crawl(spider)
    process.start()  # the script will block here until the crawling is finished
    print("###########################################")
    print(len(crawledUrls))
    print(crawledUrls)
    print("###########################################")

if __name__ == "__main__":
    main()
See this comment on the scrapy github:
https://github.com/scrapy/scrapy/issues/1823#issuecomment-189731464
It appears you made the same mistakes as the reporter in that comment:
process.crawl(...) takes a class, not an instance, of Spider;
spider parameters can be specified in the call to process.crawl(...) as keyword arguments. Check the possible kwargs in the Scrapy docs for CrawlerProcess.
So, for example, your main could look like this:
def main():
    process.crawl(
        MySpider,
        start_urls=[
            "http://example.com",
            "http://example.org",
        ],
    )
    process.start()
    ...
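Applied to the script in the question, a sketch might look like the following: the site is passed to process.crawl as a keyword argument, and start_urls / allowed_domains are derived from it inside __init__ (calling super().__init__ so CrawlSpider can still compile its rules). The module-level crawledUrls list from the question is kept so main() can print it afterwards.

from urllib.parse import urlparse

from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

crawledUrls = []

class MySpider(CrawlSpider):
    name = 'spider_example_name'

    rules = (
        Rule(LinkExtractor(unique=True), callback='parse_item', follow=True),
    )

    def __init__(self, site, *args, **kwargs):
        super().__init__(*args, **kwargs)  # lets CrawlSpider compile its rules
        self.site = site
        self.start_urls = [site]
        self.allowed_domains = [urlparse(site).netloc]

    def parse_item(self, response):
        crawledUrls.append(response.url)

def main():
    process = CrawlerProcess(settings={
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    })
    process.crawl(MySpider, site='http://quotes.toscrape.com')
    process.start()  # blocks here until the crawl is finished
    print(len(crawledUrls))
    print(crawledUrls)

if __name__ == "__main__":
    main()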

Looping through pages of Web Page's Request URL with Scrapy

I'm looking to adapt this tutorial (https://medium.com/better-programming/a-gentle-introduction-to-using-scrapy-to-crawl-airbnb-listings-58c6cf9f9808) to scrape this site of tiny home listings: https://tinyhouselistings.com/.
The tutorial uses the request URL to get a very complete and clean JSON file, but does so for the first page only. It seems that looping through the 121 pages of my tinyhouselistings request URL should be pretty straightforward, but I have not been able to get anything to work. The tutorial does not loop through the pages of the request URL, but rather uses Scrapy Splash, run within a Docker container, to get all the listings. I am willing to try that, but I just feel like it should be possible to loop through this request URL.
This outputs only the first page of the tinyhouselistings request URL for my project:
import scrapy

class TinyhouselistingsSpider(scrapy.Spider):
    name = 'tinyhouselistings'
    allowed_domains = ['tinyhouselistings.com']
    start_urls = ['http://www.tinyhouselistings.com']

    def start_requests(self):
        url = 'https://thl-prod.global.ssl.fastly.net/api/v1/listings/search?area_min=0&measurement_unit=feet&page=1'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        _file = "tiny_listings.json"
        with open(_file, 'wb') as f:
            f.write(response.body)
I've tried this:
class TinyhouselistingsSpider(scrapy.Spider):
    name = 'tinyhouselistings'
    allowed_domains = ['tinyhouselistings.com']
    start_urls = ['']

    def start_requests(self):
        url = 'https://thl-prod.global.ssl.fastly.net/api/v1/listings/search?area_min=0&measurement_unit=feet&page='
        for page in range(1, 121):
            self.start_urls.append(url + str(page))
            yield scrapy.Request(url=start_urls, callback=self.parse)
But I'm not sure how to then pass start_urls to parse so as to write the response to the json being written at the end of the script.
Any help would be much appreciated!
Remove allowed_domains = ['tinyhouselistings.com'], because otherwise requests to thl-prod.global.ssl.fastly.net will be filtered out by Scrapy.
Since you are using the start_requests method, you do not need start_urls; you only need one of the two.
import json
import scrapy

class TinyhouselistingsSpider(scrapy.Spider):
    name = 'tinyhouselistings'
    listings_url = 'https://thl-prod.global.ssl.fastly.net/api/v1/listings/search?area_min=0&measurement_unit=feet&page={}'

    def start_requests(self):
        page = 1
        yield scrapy.Request(url=self.listings_url.format(page),
                             meta={"page": page},
                             callback=self.parse)

    def parse(self, response):
        resp = json.loads(response.body)
        for ad in resp["listings"]:
            yield ad

        page = int(response.meta['page']) + 1
        if page < int(resp['meta']['pagination']['page_count']):
            yield scrapy.Request(url=self.listings_url.format(page),
                                 meta={"page": page},
                                 callback=self.parse)
From the terminal, run the spider with -o to save the scraped data to a JSON file:
scrapy crawl tinyhouselistings -o output_file.json
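As an alternative (an assumption on my part, not from the original answer), you could keep the offsite filter and simply list both hosts in allowed_domains, so the API requests are allowed through while everything else stays blocked:

import scrapy

class TinyhouselistingsSpider(scrapy.Spider):
    name = 'tinyhouselistings'
    # Listing both hosts keeps the offsite filter active while still
    # allowing the fastly.net API requests issued in start_requests.
    allowed_domains = [
        'tinyhouselistings.com',
        'thl-prod.global.ssl.fastly.net',
    ]
    # ... start_requests / parse exactly as in the answer above ...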

Scrapy LinkExtractor Specific Url

I'm using Scrapy's LinkExtractor to crawl a website. However, the current code redirects me and does not crawl from the URL I want.
URL:
http://www.example.com/book/diff/
Where diff can be anything except /.
To add on, I only want to crawl URLs that match this pattern.
Here is my current code:
name = "testing"
allowed_domains = ['example.com']
start_urls = [
'http://www.example.com/book/',
]
rules = (Rule(LinkExtractor(allow=(r'^http://www.example.com/book/[^/]*/$')),
callback='parse_page',follow=True),)
rules = (Rule(LinkExtractor(allow=(r'^http://www.example.com/book/')), callback='parse_page',follow=True),)
This should be enough.
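For completeness, a minimal full spider built around that rule might look like the sketch below. The class name BookSpider and the body of parse_page are placeholders; the regex is the stricter one from the question (with the dots escaped) if only /book/<something>/ pages one level deep should be parsed.

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class BookSpider(CrawlSpider):
    name = "testing"
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/book/']

    rules = (
        Rule(
            LinkExtractor(allow=(r'^http://www\.example\.com/book/[^/]+/$',)),
            callback='parse_page',
            follow=True,
        ),
    )

    def parse_page(self, response):
        # placeholder callback; replace with the real extraction logic
        yield {'url': response.url}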

Easier way to follow links with Scrapy

I have the following code in a scrapy spider:
class ContactSpider(Spider):
    name = "contact"
    # allowed_domains = ["http://www.domain.com/"]
    start_urls = [
        "http://web.domain.com/DECORATION"
    ]
    BASE_URL = "http://web.domain.com"

    def parse(self, response):
        links = response.selector.xpath('//*[contains(@class,"MAIN")]/a/@href').extract()
        for link in links:
            absolute_url = self.BASE_URL + link
            yield Request(absolute_url, headers=headers, callback=self.second)
I'm surprised there is not a simpler way in Scrapy to follow links rather than building each absolute_url. Is there a better way to do this?
For absolute URLs you can use urljoin (urllib.parse.urljoin in Python 3); Response already has a shortcut for that via response.urljoin(link). So your code could easily be replaced by:
def parse(self, response):
    links = response.selector.xpath('//*[contains(@class,"MAIN")]/a/@href').extract()
    for link in links:
        yield Request(response.urljoin(link), headers=headers, callback=self.second)
You can also use Scrapy's LinkExtractor, which extracts links according to some rules and manages all of the joining automatically.
from scrapy.linkextractors import LinkExtractor

def parse(self, response):
    # restrict_xpaths should point at the region containing the <a> tags,
    # not at the href attribute itself
    le = LinkExtractor(restrict_xpaths='//*[contains(@class,"MAIN")]/a')
    links = le.extract_links(response)
    for link in links:
        yield Request(link.url, headers=headers, callback=self.second)
Regarding a more automated crawling experience, Scrapy has CrawlSpider, which uses a set of rules to extract and follow links on each page. You can read more about it here: http://doc.scrapy.org/en/latest/topics/spiders.html#crawlspider
The docs have some examples of it as well.
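A minimal CrawlSpider sketch along those lines, reusing the question's start URL and "MAIN" class as placeholders (the callback name second is kept from the question):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class ContactCrawlSpider(CrawlSpider):
    name = "contact_crawl"
    allowed_domains = ["web.domain.com"]
    start_urls = ["http://web.domain.com/DECORATION"]

    rules = (
        # Follow only links found inside elements whose class contains "MAIN"
        # and hand each followed page to the callback.
        Rule(
            LinkExtractor(restrict_xpaths='//*[contains(@class,"MAIN")]'),
            callback='second',
            follow=True,
        ),
    )

    def second(self, response):
        # placeholder: extract here whatever the original second() extracted
        yield {'url': response.url}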