Extract data from two pages with Scrapy - scrapy

I have an agenda as a starting page. This page contains the start times and titles of events and links to the detail page of each event.
My spider extracts all event details (description, location, etc.) from the detail page of each event, except the start time, which I have to extract from the start page.
How can I extract the start time from the start page and the other data from each detail page?
What is the Scrapy way to do this? Using meta['item']? I don't get it...
This is my spider for now. Any help greatly appreciated!
class LuSpider(scrapy.Spider):
    """Follow every event link on the agenda page and scrape its detail page."""

    name = "lu"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/agenda"]

    def parse(self, response):
        """Yield one request per event link found on the start page."""
        for href in response.css("div.toggle_container_show > div > a::attr('href')"):
            # Resolve the href against the URL of the page it was found on.
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_agenda_contents)

    def parse_agenda_contents(self, response):
        """Yield one item per ``div.container`` found on a detail page."""
        for sel in response.xpath('//div[@class="container"]'):
            item = LuItem()
            item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
            item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
            yield item
Edit:
I tried to extract the start time from the start page using request.meta['item'], but every event ends up with a list of all the start times on the start page. How do I get only the start time that belongs to each event?
Can someone show me the right direction ?
class LuSpider(scrapy.Spider):
    """Attempt to carry the start time from the agenda page to the detail pages."""

    name = "lu"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/agenda"]

    def parse(self, response):
        """Yield one request per event link, attaching an item via ``meta``."""
        # NOTE: the item is created once, outside the loop, and ``extract()``
        # returns every matching start time on the page. All requests therefore
        # share this single item, whose 'StartTime' is the full list -- this is
        # the behaviour the question is asking about.
        item = LuItem()
        item['StartTime'] = response.xpath('//div[contains(., "H")]/span/text()').extract()
        for href in response.css("div.toggle_container_show > div > a::attr('href')"):
            url = response.urljoin(href.extract())
            request = scrapy.Request(url, callback=self.parse_agenda_contents)
            request.meta['item'] = item
            yield request

    def parse_agenda_contents(self, response):
        """Fill in title and description on the item passed through ``meta``."""
        for sel in response.xpath('//div[@class="container"]'):
            item = response.meta['item']
            item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
            item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
            yield item

You are right. Using meta would do it in your case. Please see the official documentation here: http://doc.scrapy.org/en/latest/topics/request-response.html#passing-additional-data-to-callback-functions
def parse_page1(self, response):
    """Start an item on the first page and hand it to the second callback.

    The partially filled item travels with the request in ``request.meta``.
    """
    item = MyItem()
    item['main_url'] = response.url
    request = scrapy.Request("http://www.example.com/some_page.html",
                             callback=self.parse_page2)
    request.meta['item'] = item
    return request


def parse_page2(self, response):
    """Complete the item received through ``response.meta`` and return it."""
    item = response.meta['item']
    item['other_url'] = response.url
    return item

This worked :
class LuSpider(scrapy.Spider):
    """Pair each start time on the agenda page with its event detail page."""

    name = "lu"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/agenda"]

    def parse(self, response):
        """Yield one request per event, each carrying its own item in ``meta``."""
        start_times = response.xpath('//div[@class="toggle_container_show"]/div/span/text()').extract()
        urls = response.xpath('//div[@class="toggle_container_show"]/div/a/@href').extract()
        # Both lists are in document order, so zip pairs the n-th start time
        # with the n-th link.
        for start_time, url in zip(start_times, urls):
            # A fresh item per event, so requests do not share state.
            item = LuItem()
            item['StartTime'] = start_time
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            request = scrapy.Request(response.urljoin(url),
                                     callback=self.parse_agenda_contents)
            request.meta['item'] = item
            yield request

    def parse_agenda_contents(self, response):
        """Fill in title and description on the item passed through ``meta``."""
        for sel in response.xpath('//div[@class="container"]'):
            item = response.meta['item']
            item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
            item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
            yield item

Related

Extract page from start_urls and find pdf link from every extracted page using Scrapy

I'm trying to extract some fields from the start URL and add a PDF link field obtained from each book URL found on that page. I tried Scrapy but had no luck adding the PDF field. Here is my code:
import scrapy
class MybookSpider(scrapy.Spider):
    """Scrape the book listing page (the question's original attempt)."""

    name = 'mybooks'
    allowed_domains = ['gln.kemdikbud.go.id']
    start_urls = ['https://gln.kemdikbud.go.id/glnsite/category/modul-gls/page/1/']

    def parse(self, response):
        """Yield one dict per book found on the listing page."""
        # gathering all links
        book_urls = response.xpath("//div[@class='td-module-thumb']//a/@href").getall()
        total_url = len(book_urls)
        for i in range(total_url):
            title = response.xpath("//h3[@class='entry-title td-module-title']//a/text()")[i].extract()
            url_source = response.xpath("//div[@class='td-module-thumb']//a/@href")[i].get()
            thumbnail = response.xpath('//*[@class="td-block-span4"]//*[has-class("entry-thumb")]//@src')[i].extract()
            # NOTE: this binds a Request object, not the value returned by
            # find_details, so 'PDF Link' below is a Request -- this is the
            # problem the question is asking about.
            pdf = scrapy.Request(book_urls[i], self.find_details)
            yield {
                'Book Title': title,
                'URL': url_source,
                'Mini IMG': thumbnail,
                'PDF Link': pdf
            }

    def find_details(self, response):
        """Return the first link found inside the post content."""
        # find PDF link
        pdf = response.xpath("//div[@class='td-post-content']//a/@href").get()
        return pdf
How do I add a PDF link field correctly when I export it as CSV? Thanks in advance
pdf = scrapy.Request(book_urls[i], self.find_details)
It means pdf variable is a request.
Scrapy is asynchronous so you'll have trouble to get a return value from a function. Just make a request and pass the details to the callback with cb_kwargs.
import scrapy
class MybookSpider(scrapy.Spider):
    """Scrape the book listing page and add the PDF link from each book page."""

    name = 'mybooks'
    allowed_domains = ['gln.kemdikbud.go.id']
    start_urls = ['https://gln.kemdikbud.go.id/glnsite/category/modul-gls/page/1/']

    def parse(self, response):
        """Yield one request per book, passing the partial item via cb_kwargs."""
        # Run each query once instead of once per loop iteration.
        titles = response.xpath("//h3[@class='entry-title td-module-title']//a/text()").getall()
        book_urls = response.xpath("//div[@class='td-module-thumb']//a/@href").getall()
        thumbnails = response.xpath('//*[@class="td-block-span4"]//*[has-class("entry-thumb")]//@src').getall()
        # zip stops at the shortest list, so a listing entry that lacks a
        # title or thumbnail no longer raises IndexError.
        for title, book_url, thumbnail in zip(titles, book_urls, thumbnails):
            item = {
                'title': title,
                'url_source': book_url,
                'thumbnail': thumbnail,
            }
            yield scrapy.Request(url=book_url, callback=self.find_details, cb_kwargs={'item': item})

    def find_details(self, response, item):
        """Add the first link inside the post content to the item and yield it."""
        # find PDF link
        item['pdf'] = response.xpath("//div[@class='td-post-content']//a/@href").get()
        yield item

scrapy isn't working right in extracting the title

In this code I want to scrape the title, the subtitle and the data behind each link, but on pages other than 1 and 2 only one item gets scraped. I also want to extract only those entries whose title is "Delhivery".
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from delhivery.items import DelhiveryItem
class criticspider(CrawlSpider):
    """Scrape complaint rows from a search result page and follow each link."""

    name = "delh"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery&page=2"]

    def parse(self, response):
        """Build an item per result table and request its linked page."""
        sites = response.xpath('//table[@width="100%"]')
        for site in sites:
            item = DelhiveryItem()
            item['title'] = site.xpath('.//td[@class="complaint"]/a/span[@style="background-color:yellow"]/text()').extract()[0]
            # Alternative title query tried by the author:
            #item['title'] = site.xpath('.//td[@class="complaint"]/a[text() = "%s Delivery Courier %s"]/text()').extract()[0]
            item['subtitle'] = site.xpath('.//td[@class="compl-text"]/div/b[1]/text()').extract()[0]
            item['date'] = site.xpath('.//td[@class="small"]/text()').extract()[0].strip()
            item['username'] = site.xpath('.//td[@class="small"]/a[2]/text()').extract()[0]
            item['link'] = site.xpath('.//td[@class="complaint"]/a/@href').extract()[0]
            if item['link']:
                # Make the link absolute before requesting it.
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)

    def anchor_page(self, response):
        """Add the complaint text from the linked page and yield the item."""
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//td[@style="padding-bottom:15px"]/div/text()').extract()[0]
        yield old_item
You need to change the item['title'] to this:
# Take the first text node under the parent of the span whose text is "Delhivery".
item['title'] = ''.join(site.xpath('//table[@width="100%"]//span[text() = "Delhivery"]/parent::*//text()').extract()[0])
Also edit sites to this to extract the required links only (ones with Delhivery in it)
# Select the <div> ancestors of every in-table <span> whose text is exactly "Delhivery".
sites = response.xpath('//table//span[text()="Delhivery"]/ancestor::div')
EDIT:
so I understand now that you need to add a pagination rule to your code.
it should be something like this:
You just need to add your imports and write the new xpaths from the item's link itself, such as this one
class criticspider(CrawlSpider):
    """Follow pagination links and parse every complaint page they lead to."""

    name = "delh"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]
    rules = (
        # Extracting pages, allowing only links with page=number to be extracted
        # (raw string so that \d reaches the regex engine unchanged)
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div[@class="pagelinks"]', ), allow=(r'page=\d+', ), unique=True), follow=True),
        # Extract links of items on each page the spider gets from the first rule
        Rule(SgmlLinkExtractor(restrict_xpaths=('//td[@class="complaint"]', )), callback='parse_item'),
    )

    def parse_item(self, response):
        """Build one item from a single complaint page."""
        item = DelhiveryItem()
        #populate item object here the same way you did, this function will be called for each item link.
        #This means that you'll be extracting data from pages like this one :
        #http://www.consumercomplaints.in/complaints/delhivery-last-mile-courier-service-poor-delivery-service-c772900.html#c1880509
        item['title'] = response.xpath('<write xpath>').extract()[0]
        item['subtitle'] = response.xpath('<write xpath>').extract()[0]
        item['date'] = response.xpath('<write xpath>').extract()[0].strip()
        item['username'] = response.xpath('<write xpath>').extract()[0]
        item['link'] = response.url
        item['data'] = response.xpath('<write xpath>').extract()[0]
        yield item
Also, when you write an XPath, I suggest avoiding styling attributes: prefer @class or @id, and use @width, @style or other styling attributes only when there is no other way.

How to continue crawling after return item in Scrapy?

In my spider after receiving response I want to download and show captcha image and then continue crawling:
def get_captcha(self, response):
    """Return an item whose ``image_urls`` holds the captcha image URL."""
    # Parenthesised so the call is valid on both Python 2 and Python 3.
    print('\nLoading captcha...\n')
    item = CaptchaItem()
    hxs = HtmlXPathSelector(response)
    captcha_img_src = hxs.select('//*[@id="captcha-image"]/@src').extract()[0]
    item['image_urls'] = [captcha_img_src]
    return item
But I don't know when image is loaded and how to continue crawling after that.
FYI: Captcha image can't be downloaded without cookies.
Thanks in advance!
Use yield instead of return:
def get_captcha(self, response):
    """Yield the captcha item first, then the follow-up request."""
    # Parenthesised so the call is valid on both Python 2 and Python 3.
    print('\nLoading captcha...\n')
    item = CaptchaItem()
    hxs = HtmlXPathSelector(response)
    captcha_img_src = hxs.select('//*[@id="captcha-image"]/@src').extract()[0]
    item['image_urls'] = [captcha_img_src]
    yield item
    #you may display here your scraped item and after that
    #your further post request goes here...
    # ``your_request`` is a placeholder for the request you build yourself.
    yield your_request

Scrapy multiple Rules for SgmlLinkExtractor don't work

I want to crawl entire site and extract the links conditionally.
As suggested in this link I tried with multiple Rules but it doesn't work. Scrapy doesn't crawl all pages
I tried with this code but it doesn't scrap any details.
class BusinesslistSpider(CrawlSpider):
    """Crawl businesslist.ae and parse company detail pages."""

    name = 'businesslist'
    allowed_domains = ['www.businesslist.ae']
    start_urls = ['http://www.businesslist.ae/']
    # NOTE: the rule order is kept exactly as posted; it is the subject of
    # the question.
    rules = (
        Rule(SgmlLinkExtractor()),
        Rule(SgmlLinkExtractor(allow=r'company/(\d)+/'), callback='parse_item'),
    )

    def parse_item(self, response):
        """Return an item with the company name, address and location."""
        self.log('Hi, this is an item page! %s' % response.url)
        hxs = HtmlXPathSelector(response)
        i = BusinesslistItem()
        company = hxs.select('//div[@class="text companyname"]/strong/text()').extract()[0]
        address = hxs.select('//div[@class="text location"]/text()').extract()[0]
        location = hxs.select('//div[@class="text location"]/a/text()').extract()[0]
        i['url'] = response.url
        i['company'] = company
        i['address'] = address
        i['location'] = location
        return i
In my case it doesn't apply second rule, so it doesn't parse the detail pages.
The first rule, Rule(SgmlLinkExtractor()), matches every link, so Scrapy never applies the second rule to those links.
Try followings:
...
start_urls = ['http://www.businesslist.ae/sitemap.html']
...
# Rule(SgmlLinkExtractor()),

Sequentially crawl website using scrapy

Is there a way to tell scrapy to stop crawling based upon condition in the 2nd level page? I am doing the following:
I have a start_url to begin with (1st level page)
I have set of urls extracted from the start_url using parse(self,
response)
Then I add queue the links using Request with callback as parseDetailPage(self, response)
Under parseDetail (2nd level page) I come to know if I can stop crawling or not
Right now I am using CloseSpider() to accomplish this, but the problem is that the urls to be parsed are already queued by the time I start crawling second level pages and I do not know how to remove them from the queue. Is there a way to sequentially crawl the list of links and then be able to stop in parseDetailPage?
# Declares job_in_range as global; none of the statements here assigns it.
global job_in_range
# Single entry point of the crawl.
start_urls = ["http://sfbay.craigslist.org/sof/"]
def __init__(self):
    """Start with ``job_in_range`` set, i.e. crawling is allowed."""
    # NOTE(review): no base-class initializer is called here; if the enclosing
    # class derives from a Scrapy spider, confirm that this is intended.
    self.job_in_range = True
def parse(self, response):
    """Return a list of detail-page requests for the listing links.

    Links are taken in document order. Collection stops at the link equal
    to ``self.end_url``, and links rejected by ``WPUtil.validateUrl`` are
    skipped.
    """
    hxs = HtmlXPathSelector(response)
    results = hxs.select('//blockquote[@id="toc_rows"]')
    items = []
    if results:
        links = results.select('.//p[@class="row"]/a/@href')
        for link in links:
            next_url = link.extract()
            # Compare the extracted URL by value. The original tested the
            # selector object with ``is``, which can never match a URL.
            # Assumes self.end_url holds a URL string -- TODO confirm.
            if next_url == self.end_url:
                break
            if WPUtil.validateUrl(next_url):
                item = WoodPeckerItem()
                item['url'] = next_url
                # The item rides along in meta so parseDetailPage can finish it.
                request = Request(next_url, meta={'item': item}, callback=self.parseDetailPage)
                items.append(request)
    else:
        self.error.log('Could not parse the document')
    return items
def parseDetailPage(self, response):
    """Complete the item from ``response.meta`` with data from the detail page.

    Raises CloseSpider when ``self.job_in_range`` is already False, or when
    the posting title is 'Admin' (which also clears ``self.job_in_range``).
    """
    if self.job_in_range is False:
        raise CloseSpider('End date reached - No more crawling for ' + self.name)
    hxs = HtmlXPathSelector(response)
    # Parenthesised so the call is valid on both Python 2 and Python 3.
    print(response)
    body = hxs.select('//article[@id="pagecontainer"]/section[@class="body"]')
    item = response.meta['item']
    item['postDate'] = body.select('.//section[@class="userbody"]/div[@class="postinginfos"]/p')[1].select('.//date/text()')[0].extract()
    # Extract the title before testing it: the original read item['jobTitle']
    # before assigning it and compared strings with ``is`` instead of ``==``.
    item['jobTitle'] = body.select('.//h2[@class="postingtitle"]/text()')[0].extract()
    if item['jobTitle'] == 'Admin':
        self.job_in_range = False
        raise CloseSpider('Stop crawling')
    item['description'] = body.select('.//section[@class="userbody"]/section[@id="postingbody"]').extract()
    return item
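Do you mean that you would like to stop the spider and later resume it without re-crawling the URLs that have already been processed?
If so, you can set the JOBDIR setting. It makes Scrapy persist the scheduler's pending request queue (and the set of requests it has already seen) in the given directory on disk, so a later run with the same JOBDIR continues where the previous one stopped.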