Scrapy multiple Rules for SgmlLinkExtractor don't work - scrapy

I want to crawl entire site and extract the links conditionally.
As suggested in this link I tried with multiple Rules but it doesn't work. Scrapy doesn't crawl all pages
I tried with this code but it doesn't scrape any details.
class BusinesslistSpider(CrawlSpider):
    """Crawl www.businesslist.ae and scrape company detail pages.

    Fix applied: XPath attribute tests use '@' -- the '#' characters in the
    original were extraction garbling and make every selector invalid.
    """
    name = 'businesslist'
    allowed_domains = ['www.businesslist.ae']
    start_urls = ['http://www.businesslist.ae/']

    rules = (
        # NOTE(review): this catch-all rule matches every link first, so the
        # specific rule below never gets applied -- the symptom described in
        # the question; the answer suggests commenting it out.
        Rule(SgmlLinkExtractor()),
        Rule(SgmlLinkExtractor(allow=r'company/(\d)+/'), callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract company name, address and location from a detail page."""
        self.log('Hi, this is an item page! %s' % response.url)
        hxs = HtmlXPathSelector(response)
        i = BusinesslistItem()
        company = hxs.select('//div[@class="text companyname"]/strong/text()').extract()[0]
        address = hxs.select('//div[@class="text location"]/text()').extract()[0]
        location = hxs.select('//div[@class="text location"]/a/text()').extract()[0]
        i['url'] = response.url
        i['company'] = company
        i['address'] = address
        i['location'] = location
        return i
In my case it doesn't apply second rule, so it doesn't parse the detail pages.

First rule Rule(SgmlLinkExtractor()) matches every links, and scrapy just ignores the second one.
Try followings:
...
start_urls = ['http://www.businesslist.ae/sitemap.html']
...
# Rule(SgmlLinkExtractor()),

Related

Extract page from start_urls and find pdf link from every extracted page using Scrapy

I'm trying to extract some fields from the start_url, and want to add a PDF link field obtained from each URL that has been extracted. I tried Scrapy but had no luck adding the PDF fields. Here is my code,
import scrapy
class MybookSpider(scrapy.Spider):
    """Question's original spider: collects book-listing fields and tries
    (incorrectly) to attach a PDF link from each book's detail page.

    Fix applied: '@' restored in all XPaths -- the '#' characters were
    extraction garbling and are invalid XPath syntax.
    """
    name = 'mybooks'
    allowed_domains = ['gln.kemdikbud.go.id']
    start_urls = ['https://gln.kemdikbud.go.id/glnsite/category/modul-gls/page/1/']

    def parse(self, response):
        #pass
        # gathering all links
        book_urls = response.xpath("//div[@class='td-module-thumb']//a/@href").getall()
        total_url = len(book_urls)
        i = 0
        for a in range(total_url):
            title = response.xpath("//h3[@class='entry-title td-module-title']//a/text()")[i].extract()
            url_source = response.xpath("//div[@class='td-module-thumb']//a/@href")[i].get()
            thumbnail = response.xpath('//*[@class="td-block-span4"]//*[has-class("entry-thumb")]//@src')[i].extract()
            # NOTE(review): this stores a Request object, not the PDF URL --
            # the bug discussed below; the accepted answer fixes it by
            # yielding the request with cb_kwargs instead.
            pdf = scrapy.Request(book_urls[i], self.find_details)
            yield {
                'Book Title': title,
                'URL': url_source,
                'Mini IMG': thumbnail,
                'PDF Link': pdf
            }
            i += 1

    def find_details(self, response):
        # find PDF link
        pdf = response.xpath("//div[@class='td-post-content']//a/@href").get()
        return pdf
How do I add a PDF link field correctly when I export it as CSV? Thanks in advance
pdf = scrapy.Request(book_urls[i], self.find_details)
It means pdf variable is a request.
Scrapy is asynchronous so you'll have trouble to get a return value from a function. Just make a request and pass the details to the callback with cb_kwargs.
import scrapy
class MybookSpider(scrapy.Spider):
    """Answer's corrected spider: forwards each partially-filled item to the
    detail-page callback via cb_kwargs so the PDF link can be added there.

    Fix applied: '@' restored in all XPaths -- the '#' characters were
    extraction garbling and are invalid XPath syntax.
    """
    name = 'mybooks'
    allowed_domains = ['gln.kemdikbud.go.id']
    start_urls = ['https://gln.kemdikbud.go.id/glnsite/category/modul-gls/page/1/']

    def parse(self, response):
        # gathering all links
        book_urls = response.xpath("//div[@class='td-module-thumb']//a/@href").getall()
        total_url = len(book_urls)
        for i in range(total_url):
            item = dict()
            item['title'] = response.xpath("//h3[@class='entry-title td-module-title']//a/text()")[i].extract()
            item['url_source'] = response.xpath("//div[@class='td-module-thumb']//a/@href")[i].get()
            item['thumbnail'] = response.xpath('//*[@class="td-block-span4"]//*[has-class("entry-thumb")]//@src')[i].extract()
            # hand the partial item to the detail-page callback
            yield scrapy.Request(url=book_urls[i], callback=self.find_details, cb_kwargs={'item': item})

    def find_details(self, response, item):
        # find PDF link on the detail page and complete the item
        item['pdf'] = response.xpath("//div[@class='td-post-content']//a/@href").get()
        yield item

How to scrape all article links from Reuters website using Scrapy when older links are dynamically loaded after scrolling down?

I am trying to scrape all the hyperlinks of the website for all the news articles to extract the content. I am able to retrieve all the data from all the articles which are loaded when you open the website but when you scroll down on the site more articles are automatically loaded due to an event. Currently I am using Scrapy_Splash but I receive the same amount of links when I do not use splash. I hope you can help me out. The spider is called class FinanceNewsScraperSpider(scrapy.Spider):. Below you can see my code:
# Spider name used by the `scrapy crawl` command.
name = "audinewsarticles"
def start_requests(self):
    """Kick off the crawl with the Reuters company-news page for Audi."""
    urls = [
        'https://www.reuters.com/companies/NSUG.DE/news',
    ]
    for target in urls:
        yield scrapy.Request(url=target, callback=self.parse_newspage)
def parse_newspage(self, response):
    """Extract article hyperlinks and render each through Splash.

    Fix applied: '@' restored in the XPath -- '#' is invalid XPath syntax
    (extraction garbling).
    """
    links = response.xpath('//a[contains(@href,"/article/")]/@href').extract()  # extract hyperlinks
    for url in links:
        yield SplashRequest(url=url,
                            callback=self.parse_article,
                            endpoint='render.html')
def parse_article(self, response):
    """Extract article fields and append them to a timestamped CSV file.

    Fixes applied: '@' restored in XPaths ('#' is invalid XPath syntax),
    and the CSV file handle is now closed via a `with` block -- the
    original opened it and never closed it, leaking the handle and risking
    unflushed rows.
    """
    item = AudiItem()
    item['article_link'] = response.url
    item['article_headline'] = response.xpath('//*[contains(@class,"ArticleHeader_headline")]/text()').extract()
    item['article_date'] = response.xpath('//*[contains(@class,"ArticleHeader_date")]/text()').extract()
    item['article_text'] = response.xpath('//div[@class="StandardArticleBody_body"]//p/text()').extract()
    print(item)

    # saving data to file
    path = 'news/'
    file = 'audinews_' + str(datetime.now().strftime("%Y%m%d-%H%M")) + '.csv'
    fieldnames = ['article_link', 'article_headline','article_date','article_text'] #adding header to file
    with open(path + file, 'a') as file_name:
        writer = csv.writer(file_name, lineterminator='\n')
        writer.writerow([item[key] for key in item.keys()])
Please let me know if you need more information from me.

Extract data from two pages with Scrapy

I have an agenda as a starting page. This page contains the start times and titles of events and links to the detail page of each event.
My spider extracts all the event details (description, location, etc.) from the detail page of each single event, except the start time, which I have to extract from the start page.
How can I extract the start time from the start page and the other data from each detail page?
What is the Scrapy way to go? Using meta['item']? I don't get it...
This is my spider for now. Any help greatly appreciated!
class LuSpider(scrapy.Spider):
    """Question's spider: follows agenda links and scrapes each event's
    detail page (the start time from the agenda page is still missing).

    Fix applied: '@' restored in XPaths -- the '#' characters were
    extraction garbling and are invalid XPath syntax.
    """
    name = "lu"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/agenda"]

    def parse(self, response):
        for href in response.css("div.toggle_container_show > div > a::attr('href')"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_agenda_contents)

    def parse_agenda_contents(self, response):
        for sel in response.xpath('//div[@class="container"]'):
            item = LuItem()
            item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
            item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
            yield item
Edit:
I tried to extract the start time from the start page using request.meta['item'], but I get a list of all the start times on the start page for every event. How do I get the start time for each individual event?
Can someone show me the right direction?
class LuSpider(scrapy.Spider):
    """Question's second attempt: passes an item through request.meta, but
    fills StartTime with ALL start times from the agenda page at once.

    Fix applied: '@' restored in XPaths -- the '#' characters were
    extraction garbling and are invalid XPath syntax.
    """
    name = "lu"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/agenda"]

    def parse(self, response):
        item = LuItem()
        # NOTE(review): this extracts every start time on the page into one
        # list and shares a single item across all requests -- the problem
        # the working version below solves by zipping times with URLs.
        item['StartTime'] = response.xpath('//div[contains(., "H")]/span/text()').extract()
        for href in response.css("div.toggle_container_show > div > a::attr('href')"):
            url = response.urljoin(href.extract())
            request = scrapy.Request(url, callback=self.parse_agenda_contents)
            request.meta['item'] = item
            yield request

    def parse_agenda_contents(self, response):
        for sel in response.xpath('//div[@class="container"]'):
            item = response.meta['item']
            item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
            item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
            yield item
You are right. Using meta would do it in your case. Please see the official documentation here: http://doc.scrapy.org/en/latest/topics/request-response.html#passing-additional-data-to-callback-functions
def parse_page1(self, response):
    """Create the item on the first page and forward it to parse_page2
    through the request's meta dict."""
    item = MyItem()
    item['main_url'] = response.url
    followup = scrapy.Request(
        "http://www.example.com/some_page.html",
        callback=self.parse_page2,
    )
    followup.meta['item'] = item
    return followup
def parse_page2(self, response):
    """Complete the item started in parse_page1 with this page's URL."""
    carried = response.meta['item']
    carried['other_url'] = response.url
    return carried
This worked :
class LuSpider(scrapy.Spider):
    """Working version: pairs each start time with its detail-page URL and
    carries a fresh item to the detail page via request.meta.

    Fix applied: '@' restored in XPaths -- the '#' characters were
    extraction garbling and are invalid XPath syntax.
    """
    name = "lu"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/agenda"]

    def parse(self, response):
        StartTimes = response.xpath('//div[@class="toggle_container_show"]/div/span/text()').extract()
        urls = response.xpath('//div[@class="toggle_container_show"]/div/a/@href').extract()
        # one item per (start time, detail URL) pair
        for StartTime, url in zip(StartTimes, urls):
            item = LuItem()
            item['StartTime'] = StartTime
            request = Request(url, callback=self.parse_agenda_contents)
            request.meta['item'] = item
            yield request

    def parse_agenda_contents(self, response):
        for sel in response.xpath('//div[@class="container"]'):
            item = response.meta['item']
            item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
            item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
            yield item

Scrapy Spider Does Not Scrape Page 1

I want my spider to scrape the listings on every page of a website. I used CrawlSpider and LinkExtractor. But when I looked at the csv file, nothing on the first page (i.e. start url) was scraped. The scraped items started from page 2. I tested my crawler on the Scrapy shell and it seemed fine. I can't figure out where the problem lies. Below is my spider code. Please help. Thanks a lot!
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from shputuo.items_shputuo import ShputuoItem
class Shputuo(CrawlSpider):
    """Crawl shpt.gov.cn listing pages and scrape each entry's detail page.

    Fix applied: '@' restored in XPaths -- the '#' characters were
    extraction garbling and are invalid XPath syntax. Note the code uses
    Python 2 `unicode`, consistent with the era of the question.
    """
    name = "shputuo"
    allowed_domains = ["shpt.gov.cn"]  # DO NOT use www in allowed domains
    start_urls = ["http://www.shpt.gov.cn/gb/n6132/n6134/n6156/n7110/n7120/index.html"]

    # NOTE(review): rules only apply to links FOUND on start_urls pages;
    # the first page itself is handled by parse_start_url -- which is
    # exactly the fix the answer below proposes.
    rules = (
        Rule(LinkExtractor(allow=(), restrict_xpaths=("//div[@class = 'page']/ul/li[5]/a",)), callback="parse_items", follow= True),
    )

    def parse_items(self, response):
        """Build one item per listing row, then fetch its detail page."""
        for sel in response.xpath("//div[@class = 'neirong']/ul/li"):
            item = ShputuoItem()
            word = sel.xpath("a/text()").extract()[0]
            item['id'] = word[3:11]
            item['title'] = word[11:len(word)]
            item['link'] = "http://www.shpt.gov.cn" + sel.xpath("a/@href").extract()[0]
            item['time2'] = sel.xpath("span/text()").extract()[0][1:11]
            request = scrapy.Request(item['link'], callback = self.parse_content)
            request.meta['item'] = item
            yield request

    def parse_content(self, response):
        """Fill the question/reply fields from the detail page."""
        item = response.meta['item']
        item['question'] = response.xpath("//div[@id = 'ivs_content']/p[2]/text()").extract()[0]
        item['question'] = "".join(map(unicode.strip, item['question'])) # get rid of unwated spaces and others
        item['reply'] = response.xpath("//div[@id = 'ivs_content']/p[3]/text()").extract()[0]
        item['reply'] = "".join(map(unicode.strip, item['reply']))
        item['agency'] = item['reply'][6:10]
        item['time1'] = "2015-" + item['question'][0] + "-" + item['question'][2]
        yield item
It looks like what you really need to do is parse the elements of the start_urls requests as well, not only follow the rules.
For that use the parse_start_url method which is the callback by default of the start_urls requests.

scrapy isn't working right in extracting the title

In this code I want to scrape the title, subtitle and the data inside the links, but I am having issues on pages beyond 1 and 2, as only 1 item gets scraped. I want to extract only those entries having the title "delhivery".
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from delhivery.items import DelhiveryItem
class criticspider(CrawlSpider):
    """Question's spider: scrapes complaint listings for 'delhivery' and
    follows each complaint link for its full text.

    Fixes applied: '@' restored in XPaths ('#' is invalid XPath syntax,
    an extraction-garbling artifact), and the dead `items` accumulator
    removed -- it was appended to but never used (parse is a generator).
    """
    name = "delh"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery&page=2"]

    def parse(self, response):
        sites = response.xpath('//table[@width="100%"]')
        for site in sites:
            item = DelhiveryItem()
            item['title'] = site.xpath('.//td[@class="complaint"]/a/span[@style="background-color:yellow"]/text()').extract()[0]
            #item['title'] = site.xpath('.//td[@class="complaint"]/a[text() = "%s Delivery Courier %s"]/text()').extract()[0]
            item['subtitle'] = site.xpath('.//td[@class="compl-text"]/div/b[1]/text()').extract()[0]
            item['date'] = site.xpath('.//td[@class="small"]/text()').extract()[0].strip()
            item['username'] = site.xpath('.//td[@class="small"]/a[2]/text()').extract()[0]
            item['link'] = site.xpath('.//td[@class="complaint"]/a/@href').extract()[0]
            if item['link']:
                # make relative complaint links absolute before requesting
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)

    def anchor_page(self, response):
        """Add the complaint body text to the item carried in meta."""
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//td[@style="padding-bottom:15px"]/div/text()').extract()[0]
        yield old_item
You need to change the item['title'] to this:
item['title'] = ''.join(site.xpath('//table[@width="100%"]//span[text() = "Delhivery"]/parent::*//text()').extract()[0])
Also edit sites to this to extract the required links only (ones with Delhivery in it)
sites = response.xpath('//table//span[text()="Delhivery"]/ancestor::div')
EDIT:
so I understand now that you need to add a pagination rule to your code.
it should be something like this:
You just need to add your imports and write the new xpaths from the item's link itself, such as this one
class criticspider(CrawlSpider):
    """Answer's spider: one rule follows pagination links, the other sends
    each complaint link to parse_item.

    Fix applied: '@' restored in XPaths -- the '#' characters were
    extraction garbling and are invalid XPath syntax.
    """
    name = "delh"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]

    rules = (
        # Extracting pages, allowing only links with page=number to be extracted
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div[@class="pagelinks"]', ), allow=('page=\d+', ),unique=True),follow=True),
        # Extract links of items on each page the spider gets from the first rule
        Rule(SgmlLinkExtractor(restrict_xpaths=('//td[@class="complaint"]', )), callback='parse_item'),
    )

    def parse_item(self, response):
        """Called once per complaint detail page; fill in the real XPaths."""
        item = DelhiveryItem()
        # populate the item object here the same way you did; this function
        # will be called for each item link, i.e. pages like this one:
        # http://www.consumercomplaints.in/complaints/delhivery-last-mile-courier-service-poor-delivery-service-c772900.html#c1880509
        item['title'] = response.xpath('<write xpath>').extract()[0]
        item['subtitle'] = response.xpath('<write xpath>').extract()[0]
        item['date'] = response.xpath('<write xpath>').extract()[0].strip()
        item['username'] = response.xpath('<write xpath>').extract()[0]
        item['link'] = response.url
        item['data'] = response.xpath('<write xpath>').extract()[0]
        yield item
Also, I suggest that when you write an XPath you don't use any styling parameters: try to use @class or @id, and only fall back to @width, @style or other styling attributes if it's the only way.