Scrapy CrawlSpider does not follow links

I'm trying to make a crawler that will crawl an entire website and output a list of all the domains that the site links to (without duplicates).
I have come up with the following code:
import scrapy
from crawler.items import CrawlerItem
from crawler.functions import urlToDomain
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class domainSpider(CrawlSpider):
    global allDomains
    allDomains = []

    name = "domainSpider"
    allowed_domains = ["example.com"]
    start_urls = [
        "https://example.com/"
    ]
    rules = (
        Rule(LinkExtractor(), callback='parse', follow=True),
    )

    def parse(self, response):
        urls = response.xpath("//a/@href").extract()

        # formatting all URL formats to the same one (https://url.com)
        urlsOk = []
        for elt in urls:
            if elt[:2] == "//":  # link is external, append http
                urlsOk.append(elt)
            elif elt[:4] == "http":
                urlsOk.append(elt)

        domaines = list(set([urlToDomain(x) for x in urlsOk]))

        item = CrawlerItem()
        item["domaines"] = []
        item["url"] = response.url

        for elt in domaines:
            if elt not in allDomains:
                item['domaines'].append(elt)
                allDomains.append(elt)
                yield({
                    'domaines': elt
                })
This works exactly as expected in retrieving the domains, but it stops crawling (finished) after the very first page.

I was overriding a built-in CrawlSpider method (parse) which caused the bug...
The solution was to change the callback method's name from parse to anything else.
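A minimal sketch of that fix for the spider above; the callback name parse_page is arbitrary, anything other than parse works:
    rules = (
        # CrawlSpider reserves parse() for its own link-following logic,
        # so the rule must point at a differently named callback
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        # same body as the original parse() method above
        urls = response.xpath("//a/@href").extract()
        ...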

Related

Changing next page url within scraper and loading

I am trying to go through several URLs of a webpage and pass the response on to the next parser to grab another set of URLs on each page. From that page I need to grab the next-page URLs, but I wanted to try doing this by manipulating the page string: parsing it and then passing it along as the next page. However, the scraper crawls but returns nothing, not even the output of the final parser where I load the item.
Note: I know that I can grab the next page rather simply with an if-statement on the href, but I wanted to try something different in case I ever face a situation where I have to do it this way.
Here's my scraper:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader

class ZooplasItem(scrapy.Item):
    stuff = Field()

class ZooplasSpider(scrapy.Spider):
    name = 'zooplas'
    start_urls = ['https://www.zoopla.co.uk/overseas/']

    def start_request(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback = self.parse, )

    def parse(self, response):
        container = response.xpath("//ul[@class='list-inline list-unstyled']//li")
        for links in container:
            urls = links.xpath(".//a/@href").get()
            yield response.follow(
                urls, callback = self.parse_places
            )

    def parse_places(self, response):
        container = response.xpath("//ul[@class='listing-results clearfix js-gtm-list']//li")
        for links in container:
            urls = links.xpath('(//div[@class="listing-results-right clearfix"]//a)[position() mod 3=1]//@href').get()
            yield response.follow(
                urls, callback = self.parse_listings
            )

        if response.xpath("//div[@id='content']//div//h1//text()").extract_first():
            page_on = response.xpath("//div[@id='content']//div//h1//text()").extract_first()
            name_of_page = page_on.split()[-1]
        else:
            pass

        if response.xpath("(//div[@class='paginate bg-muted'])//a[last()-1]//href").extract_first():
            url_link = response.xpath("(//div[@class='paginate bg-muted'])//a[last()-1]//href").extract_first()
            url_link = url_link.split('/')
            last_page = url_link[-1].split('=')[-1]
        else:
            pass

        all_pages = []
        for index, n in enumerate(url_link):
            for page_name, page_num in zip(name_of_page, last_page):
                if index == 5:
                    url_link[index] = page_name
                    testit = '/'.join(url_link)
                    equal_split = testit.split('=')
                    for another_i, n2 in enumerate(equal_split):
                        if another_i == 3:
                            for range_val in range(1, page_num+1):
                                equal_split[another_i] = str(2)
                                all_pages.append('='.join(equal_split))

        for urls in all_pages:
            yield response.follow(
                urls, callback = self.parse.places
            )

    def parse_listings(self, response):
        loader = ItemLoader(ZooplasItem(), response=response)
        loader.default.output_processor = TakeFirst()
        loader.add_xpath("//article[@class='dp-sidebar-wrapper__summary']//h1//text()")
        yield loader.load_item()

process = CrawlerProcess(
    settings = {
        'FEED_URI': 'zoopla.jl',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(ZooplasSpider)
process.start()
I know the way of grabbing the urls works as I have tried it on a single url using the following:
url = "https://www.zoopla.co.uk/overseas/property/ireland/?new_homes=include&include_sold=false&pn=16"
list_of_stuff = ['Ireland', 'Germany','France']
pages_of_stuff = [5, 7, 6]
test = []
all_pages = []
j=0
for index, n in enumerate(a):
for l_stuff, p_stuff in zip(list_of_stuff,pages_of_stuff):
if index == 5:
a[index] = l_stuff
testit='/'.join(a)
equal_split = testit.split('=')
for another_i, n2 in enumerate(equal_split):
if another_i == 3:
for range_val in range(1, p_stuff+1):
equal_split[another_i] = str(range_val)
print('='.join(equal_split))
This is the same as the code used above, just with different variable names. It outputs the following links and they work:
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=1
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=2
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=3
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=4
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=5
https://www.zoopla.co.uk/overseas/property/Germany/?new_homes=include&include_sold=false&pn=1
https://www.zoopla.co.uk/overseas/property/Germany/?new_homes=include&include_sold=false&pn=2
...
Your use case is suited to Scrapy's crawl spider. You can write rules for how to extract the links to the properties and how to extract the links to the next pages. I have changed your code to use a crawl spider class, and I have changed your feed settings to the recommended FEEDS setting; FEED_URI and FEED_FORMAT are deprecated in newer versions of Scrapy.
Read more about the CrawlSpider in the docs.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader

class ZooplasItem(scrapy.Item):
    stuff = Field()
    country = Field()

class ZooplasSpider(CrawlSpider):
    name = 'zooplas'
    allowed_domains = ['zoopla.co.uk']
    start_urls = ['https://www.zoopla.co.uk/overseas/']

    rules = (
        Rule(LinkExtractor(restrict_css='a.link-novisit'), follow=True),  # follow the countries links
        Rule(LinkExtractor(restrict_css='div.paginate'), follow=True),  # follow pagination links
        Rule(LinkExtractor(restrict_xpaths="//a[contains(@class,'listing-result')]"), callback='parse_item', follow=True),  # follow the link to actual property listing
    )

    def parse_item(self, response):
        # here you are on the details page for each property
        loader = ItemLoader(ZooplasItem(), response=response)
        loader.default_output_processor = TakeFirst()
        loader.add_xpath("stuff", "//article[@class='dp-sidebar-wrapper__summary']//h1//text()")
        loader.add_xpath("country", "//li[@class='ui-breadcrumbs__item'][3]/a/text()")
        yield loader.load_item()

if __name__ == '__main__':
    process = CrawlerProcess(
        settings = {
            'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36',
            'FEEDS': {
                'zoopla.jl': {
                    'format': 'jsonlines'
                }
            }
        }
    )
    process.crawl(ZooplasSpider)
    process.start()

Scrapy only crawls one picture

I want to crawl the images from this link: "http://vnexpress.net/photo/cuoc-song-do-day/nguoi-trung-quoc-ra-be-boi-danh-mat-chuoc-tranh-nong-3445592.html", but the code only crawls one picture on my computer while it crawls all pictures on my friend's computer. Please help me.
import scrapy
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.contrib.linkextractors import LinkExtractor
from imgur.items import ImgurItem

class ImgurSpider(CrawlSpider):
    name = 'imgur'
    allowed_domains = ['vnexpress.net']
    start_urls = ['http://vnexpress.net/photo/cuoc-song-do-day/nguoi-trung-quoc-ra-be-boi-danh-mat-chuoc-tranh-nong-3445592.html']
    # rules = [Rule(LinkExtractor(allow=['/*']), 'parse123')]

    def parse(self, response):
        image = ImgurItem()
        # image['title'] = response.xpath(\
        #     "//img[data-notes-url=""]").extract()
        rel = response.xpath("//div[@id='article_content']//img/@src").extract()
        image['image_urls'] = [rel[0]]
        return image
rel = response.xpath("//div[@id='article_content']//img/@src").extract()
image['image_urls'] = [rel[0]]
You take only one link by specifying the [0] index.
Try
image['image_urls'] = rel
You can also split your code into a URL-parsing function and a callback for downloading the images.
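A rough sketch of what that split could look like, assuming the same ImgurItem with an image_urls field; the link XPath and the parse_article name are illustrative, not taken from the original code:
    def parse(self, response):
        # first stage: collect article links and hand each one to a dedicated callback
        # (the XPath below is an assumption about the site's markup)
        for href in response.xpath("//article//a/@href").extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_article)

    def parse_article(self, response):
        # second stage: keep every image URL instead of only the first one
        image = ImgurItem()
        image['image_urls'] = response.xpath("//div[@id='article_content']//img/@src").extract()
        yield image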

Relative URL to absolute URL Scrapy

I need help converting relative URLs to absolute URLs in a Scrapy spider.
I need to convert the links on my start pages to absolute URLs to get the images of the crawled items, which are on the start pages. I unsuccessfully tried different ways to achieve this and I'm stuck. Any suggestion?
class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ["example.com"]
    start_urls = [
        "http://www.example.com/billboard",
        "http://www.example.com/billboard?page=1"
    ]

    def parse(self, response):
        image_urls = response.xpath('//div[@class="content"]/section[2]/div[2]/div/div/div/a/article/img/@src').extract()
        relative_url = response.xpath(u'''//div[contains(concat(" ", normalize-space(@class), " "), " content ")]/a/@href''').extract()
        for image_url, url in zip(image_urls, absolute_urls):
            item = ExampleItem()
            item['image_urls'] = image_urls
            request = Request(url, callback=self.parse_dir_contents)
            request.meta['item'] = item
            yield request
There are mainly three ways to achieve that:
Using the urljoin function from urllib:
from urllib.parse import urljoin
# Same as: from w3lib.url import urljoin
url = urljoin(base_url, relative_url)
Using the response's urljoin wrapper method, as mentioned by Steve.
url = response.urljoin(relative_url)
If you also want to yield a request from that link, you can use the handy response.follow method:
# It will create a new request using the above "urljoin" method
yield response.follow(relative_url, callback=self.parse)
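Applied to the parse method from the question, a minimal sketch could look like this; it reuses the question's own XPaths, names (ExampleItem, parse_dir_contents) and the response.urljoin approach, and gives each item its own absolutized image URL instead of the whole list:
    def parse(self, response):
        image_urls = response.xpath('//div[@class="content"]/section[2]/div[2]/div/div/div/a/article/img/@src').extract()
        relative_urls = response.xpath('//div[contains(concat(" ", normalize-space(@class), " "), " content ")]/a/@href').extract()
        # turn the relative links into absolute ones before pairing them with the images
        absolute_urls = [response.urljoin(url) for url in relative_urls]
        for image_url, url in zip(image_urls, absolute_urls):
            item = ExampleItem()
            item['image_urls'] = [response.urljoin(image_url)]
            request = Request(url, callback=self.parse_dir_contents)
            request.meta['item'] = item
            yield request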

Scrapy Spider Does Not Scrape Page 1

I want my spider to scrape the listings on every page of a website. I used CrawlSpider and LinkExtractor, but when I looked at the CSV file, nothing from the first page (i.e. the start URL) was scraped; the scraped items started from page 2. I tested my crawler in the Scrapy shell and it seemed fine. I can't figure out where the problem lies. Below is my spider code. Please help. Thanks a lot!
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from shputuo.items_shputuo import ShputuoItem

class Shputuo(CrawlSpider):
    name = "shputuo"
    allowed_domains = ["shpt.gov.cn"]  # DO NOT use www in allowed domains
    start_urls = ["http://www.shpt.gov.cn/gb/n6132/n6134/n6156/n7110/n7120/index.html"]

    rules = (
        Rule(LinkExtractor(allow=(), restrict_xpaths=("//div[@class = 'page']/ul/li[5]/a",)), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        for sel in response.xpath("//div[@class = 'neirong']/ul/li"):
            item = ShputuoItem()
            word = sel.xpath("a/text()").extract()[0]
            item['id'] = word[3:11]
            item['title'] = word[11:len(word)]
            item['link'] = "http://www.shpt.gov.cn" + sel.xpath("a/@href").extract()[0]
            item['time2'] = sel.xpath("span/text()").extract()[0][1:11]

            request = scrapy.Request(item['link'], callback=self.parse_content)
            request.meta['item'] = item
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        item['question'] = response.xpath("//div[@id = 'ivs_content']/p[2]/text()").extract()[0]
        item['question'] = "".join(map(unicode.strip, item['question']))  # get rid of unwanted spaces and other characters
        item['reply'] = response.xpath("//div[@id = 'ivs_content']/p[3]/text()").extract()[0]
        item['reply'] = "".join(map(unicode.strip, item['reply']))
        item['agency'] = item['reply'][6:10]
        item['time1'] = "2015-" + item['question'][0] + "-" + item['question'][2]

        yield item
It looks like what you really need to do is to parse the elements of the start_urls responses, not only follow the rules.
For that, use the parse_start_url method, which is the default callback for the start_urls requests.
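A minimal sketch of that override, reusing the spider's existing parse_items so that page 1 goes through the same item extraction as the pages reached via the rules:
class Shputuo(CrawlSpider):
    # ... name, allowed_domains, start_urls and rules stay exactly as above ...

    def parse_start_url(self, response):
        # CrawlSpider calls this for the responses of start_urls;
        # delegating to parse_items makes page 1 yield items too
        return self.parse_items(response)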

Scrapy isn't extracting the title correctly

In this code I want to scrape the title, subtitle, and the data inside the links, but I'm having issues on pages beyond 1 and 2, where only one item gets scraped. I want to extract only those entries whose title is "delhivery".
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from delhivery.items import DelhiveryItem

class criticspider(CrawlSpider):
    name = "delh"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery&page=2"]

    def parse(self, response):
        sites = response.xpath('//table[@width="100%"]')
        items = []
        for site in sites:
            item = DelhiveryItem()
            item['title'] = site.xpath('.//td[@class="complaint"]/a/span[@style="background-color:yellow"]/text()').extract()[0]
            #item['title'] = site.xpath('.//td[@class="complaint"]/a[text() = "%s Delivery Courier %s"]/text()').extract()[0]
            item['subtitle'] = site.xpath('.//td[@class="compl-text"]/div/b[1]/text()').extract()[0]
            item['date'] = site.xpath('.//td[@class="small"]/text()').extract()[0].strip()
            item['username'] = site.xpath('.//td[@class="small"]/a[2]/text()').extract()[0]
            item['link'] = site.xpath('.//td[@class="complaint"]/a/@href').extract()[0]
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)
            items.append(item)

    def anchor_page(self, response):
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//td[@style="padding-bottom:15px"]/div/text()').extract()[0]
        yield old_item
You need to change item['title'] to this:
item['title'] = ''.join(site.xpath('//table[@width="100%"]//span[text() = "Delhivery"]/parent::*//text()').extract()[0])
Also edit sites so that it extracts only the required links (the ones with Delhivery in them):
sites = response.xpath('//table//span[text()="Delhivery"]/ancestor::div')
EDIT:
So I understand now that you need to add a pagination rule to your code.
It should be something like this; you just need to add your imports and write the new XPaths from the item's link itself, such as this one:
class criticspider(CrawlSpider):
    name = "delh"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]

    rules = (
        # Extracting pages, allowing only links with page=number to be extracted
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div[@class="pagelinks"]', ), allow=('page=\d+', ), unique=True), follow=True),
        # Extract links of items on each page the spider gets from the first rule
        Rule(SgmlLinkExtractor(restrict_xpaths=('//td[@class="complaint"]', )), callback='parse_item'),
    )

    def parse_item(self, response):
        item = DelhiveryItem()
        # populate the item object here the same way you did; this function will be called for each item link.
        # This means that you'll be extracting data from pages like this one:
        # http://www.consumercomplaints.in/complaints/delhivery-last-mile-courier-service-poor-delivery-service-c772900.html#c1880509
        item['title'] = response.xpath('<write xpath>').extract()[0]
        item['subtitle'] = response.xpath('<write xpath>').extract()[0]
        item['date'] = response.xpath('<write xpath>').extract()[0].strip()
        item['username'] = response.xpath('<write xpath>').extract()[0]
        item['link'] = response.url
        item['data'] = response.xpath('<write xpath>').extract()[0]
        yield item
Also, I suggest that when you write an XPath you avoid styling attributes; try to use @class or @id, and only fall back to @width, @style, or other styling parameters when there is no other way.
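For example, based on the selectors used above, a class-keyed XPath is far less likely to break when the page's styling changes (a generic illustration, not a selector verified against the site):
# fragile: tied to inline styling that can change at any time
response.xpath('//table[@width="100%"]//a/@href')
# more robust: keyed on a semantic class attribute
response.xpath('//td[@class="complaint"]//a/@href')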