Extract pages from start_urls and find the PDF link on every extracted page using Scrapy

I'm trying to extract some fields from the start_urls page, and I want to add a PDF link field obtained from each URL found on that page. I'm using Scrapy but have had no luck adding the PDF field. Here is my code:
import scrapy

class MybookSpider(scrapy.Spider):
    name = 'mybooks'
    allowed_domains = ['gln.kemdikbud.go.id']
    start_urls = ['https://gln.kemdikbud.go.id/glnsite/category/modul-gls/page/1/']

    def parse(self, response):
        # gathering all links
        book_urls = response.xpath("//div[@class='td-module-thumb']//a/@href").getall()
        total_url = len(book_urls)
        i = 0
        for a in range(total_url):
            title = response.xpath("//h3[@class='entry-title td-module-title']//a/text()")[i].extract()
            url_source = response.xpath("//div[@class='td-module-thumb']//a/@href")[i].get()
            thumbnail = response.xpath('//*[@class="td-block-span4"]//*[has-class("entry-thumb")]//@src')[i].extract()
            pdf = scrapy.Request(book_urls[i], self.find_details)
            yield {
                'Book Title': title,
                'URL': url_source,
                'Mini IMG': thumbnail,
                'PDF Link': pdf
            }
            i += 1

    def find_details(self, response):
        # find PDF link
        pdf = response.xpath("//div[@class='td-post-content']//a/@href").get()
        return pdf
How do I add the PDF link field correctly so that it appears when I export to CSV? Thanks in advance.

pdf = scrapy.Request(book_urls[i], self.find_details)
This means the pdf variable holds a Request object, not a PDF link.
Scrapy is asynchronous, so you cannot get a return value out of a callback like that. Instead, yield the request and pass the fields you have already scraped to the callback with cb_kwargs.
import scrapy

class MybookSpider(scrapy.Spider):
    name = 'mybooks'
    allowed_domains = ['gln.kemdikbud.go.id']
    start_urls = ['https://gln.kemdikbud.go.id/glnsite/category/modul-gls/page/1/']

    def parse(self, response):
        # gathering all links
        book_urls = response.xpath("//div[@class='td-module-thumb']//a/@href").getall()
        total_url = len(book_urls)
        for i in range(total_url):
            item = dict()
            item['title'] = response.xpath("//h3[@class='entry-title td-module-title']//a/text()")[i].extract()
            item['url_source'] = response.xpath("//div[@class='td-module-thumb']//a/@href")[i].get()
            item['thumbnail'] = response.xpath('//*[@class="td-block-span4"]//*[has-class("entry-thumb")]//@src')[i].extract()
            yield scrapy.Request(url=book_urls[i], callback=self.find_details, cb_kwargs={'item': item})

    def find_details(self, response, item):
        # find PDF link
        item['pdf'] = response.xpath("//div[@class='td-post-content']//a/@href").get()
        yield item
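For the CSV part of the question: once every field lives in the single item yielded from find_details, the standard feed export handles it, and running scrapy crawl mybooks -o books.csv writes one row per book. As a minimal sketch (an addition not in the original answer, assuming Scrapy 2.1 or newer where the FEEDS setting exists), the export can also be declared on the spider itself:

class MybookSpider(scrapy.Spider):
    name = 'mybooks'
    # equivalent to running: scrapy crawl mybooks -o books.csv
    custom_settings = {
        'FEEDS': {
            'books.csv': {'format': 'csv'},
        },
    }
    # start_urls, parse() and find_details() exactly as above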

Related

How to scrape all article links from Reuters website using Scrapy when older links are dynamically loaded after scrolling down?

I am trying to scrape all the hyperlinks to the news articles on the website in order to extract their content. I am able to retrieve all the data from the articles that are loaded when you first open the page, but more articles are loaded automatically when you scroll down. Currently I am using scrapy-splash, but I receive the same number of links as when I do not use Splash. I hope you can help me out. The spider is called FinanceNewsScraperSpider(scrapy.Spider). Below you can see my code:
name = "audinewsarticles"
def start_requests(self):
start_urls = ['https://www.reuters.com/companies/NSUG.DE/news',
]
urls = start_urls
for url in urls:
yield scrapy.Request(url=url, callback=self.parse_newspage)
def parse_newspage(self, response):
links = response.xpath('//a[contains(#href,"/article/")]/#href').extract() #extract hyperlink
for url in links:
yield SplashRequest(url=url,
callback=self.parse_article,
endpoint='render.html')
def parse_article(self, response):
item = AudiItem()
item['article_link'] = response.url
item['article_headline'] = response.xpath('//*[contains(#class,"ArticleHeader_headline")]/text()').extract()
item['article_date'] = response.xpath('//*[contains(#class,"ArticleHeader_date")]/text()').extract()
item['article_text'] = response.xpath('//div[#class="StandardArticleBody_body"]//p/text()').extract()
print(item)
#saving data to file.
path = 'news/'
file = 'audinews_' + str(datetime.now().strftime("%Y%m%d-%H%M")) + '.csv'
file_name = open(path + file, 'a')
fieldnames = ['article_link', 'article_headline','article_date','article_text'] #adding header to file
writer = csv.writer(file_name, lineterminator='\n')
writer.writerow([item[key] for key in item.keys()])
Please let me know if you need more information from me.

Crawling RSS: Scrapy returned no data

Here's my code to crawl the BBC RSS feed, but it returns nothing.
I checked the XPath interactively using "Inspect" in Chrome and it seemed OK.
import scrapy

class BbcSpider(scrapy.Spider):
    name = "bbc"
    allowed_domains = ["feeds.bbci.co.uk/news/world/rss.xml"]
    start_urls = ["https://feeds.bbci.co.uk/news/world/rss.xml"]

    def parse(self, response):
        all_rss = response.xpath('//div[@id="item"]/ul/li')
        for rss in all_rss:
            rss_url = rss.xpath('//a/@href').extract_first()
            rss_title = rss.xpath('//a/text()').extract_first()
            rss_short_content = rss.xpath('//div/text()').extract_first()
            yield {
                "URL": rss_url,
                "Title": rss_title,
                "Short Content": rss_short_content
            }
Any help would be greatly appreciated!
The response is an RSS (XML) document, not HTML, so you can parse it in the following way:
import scrapy

class BbcSpider(scrapy.Spider):
    name = "bbc"
    allowed_domains = ["feeds.bbci.co.uk/news/world/rss.xml"]
    start_urls = ["https://feeds.bbci.co.uk/news/world/rss.xml"]

    def parse(self, response):
        rss_url = response.xpath('//link/text()').extract()[2:]
        rss_title = response.xpath('//title/text()').extract()[2:]
        rss_short_content = response.xpath('//description/text()').extract()
        for i in range(len(rss_url)):
            yield {
                "URL": rss_url[i],
                "Title": rss_title[i],
                "Short Content": rss_short_content[i],
            }
The first two URLs and titles had nothing to do with news so I dropped them.
The main reason this crawler yields no data is that the all_rss list is empty. Scrapy only sees the raw response of the initial GET request; if you view the page source with Ctrl/Cmd + U you will not find any element with id="item", because that markup is generated by the browser when it renders the feed. Your
response.xpath('//div[@id="item"]/ul/li') selector therefore returns an empty list and the for loop never executes.
Try this:

for rss in response.css('item'):
    rss_url = rss.css('link::text').extract_first()
    rss_title = rss.css('title::text').extract_first()
    rss_short_content = rss.css('description::text').extract_first()
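Put together, a minimal sketch of the corrected spider (assuming Scrapy parses the feed as XML, which it does when the server sends an XML content type; item, link, title and description are the standard RSS element names):

import scrapy

class BbcSpider(scrapy.Spider):
    name = "bbc"
    allowed_domains = ["feeds.bbci.co.uk"]
    start_urls = ["https://feeds.bbci.co.uk/news/world/rss.xml"]

    def parse(self, response):
        # each <item> element in the feed is one news entry
        for rss in response.css('item'):
            yield {
                "URL": rss.css('link::text').get(),
                "Title": rss.css('title::text').get(),
                "Short Content": rss.css('description::text').get(),
            }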

How to use the yield function to scrape data from multiple pages

I'm trying to scrape data from the Amazon India website. I am not able to collect the response and parse the elements using the yield() method when:
1) I have to move from the product page to the review page
2) I have to move from one review page to another review page
(Screenshots: product page, review page)
Code flow:
1) customerReviewData() calls getCustomerRatingsAndComments(response)
2) getCustomerRatingsAndComments(response) finds the URL of the review page and yields a request with getCrrFromReviewPage() as the callback and the URL of that review page
3) getCrrFromReviewPage() gets the new response of the first review page, scrapes all the elements from that page and adds them to customerReviewDataList[]
4) it then gets the URL of the next page, if one exists, and recursively calls getCrrFromReviewPage() to crawl the elements of the next page, until all the review pages are crawled
5) all the reviews get added to customerReviewDataList[]
I have tried playing around with yield, changing the parameters, and I have also looked up the Scrapy documentation for yield and Request/Response.
# -*- coding: utf-8 -*-
import scrapy
import logging

customerReviewDataList = []
customerReviewData = {}

# Get product name in <H1>
def getProductTitleH1(response):
    titleH1 = response.xpath('normalize-space(//*[@id="productTitle"]/text())').extract()
    return titleH1

def getCustomerRatingsAndComments(response):
    # Fetches the relative url
    reviewRelativePageUrl = response.css('#reviews-medley-footer a::attr(href)').extract()[0]
    if reviewRelativePageUrl:
        # get absolute URL
        reviewPageAbsoluteUrl = response.urljoin(reviewRelativePageUrl)
        yield Request(url=reviewPageAbsoluteUrl, callback=getCrrFromReviewPage())
        self.log("yield request complete")
    return len(customerReviewDataList)

def getCrrFromReviewPage():
    userReviewsAndRatings = response.xpath('//div[@id="cm_cr-review_list"]/div[@data-hook="review"]')
    for userReviewAndRating in userReviewsAndRatings:
        customerReviewData[reviewTitle] = response.css('#cm_cr-review_list .review-title span ::text').extract()
        customerReviewData[reviewDescription] = response.css('#cm_cr-review_list .review-text span::text').extract()
        customerReviewDataList.append(customerReviewData)
    reviewNextPageRelativeUrl = response.css('#cm_cr-pagination_bar .a-pagination .a-last a::attr(href)')[0].extract()
    if reviewNextPageRelativeUrl:
        reviewNextPageAbsoluteUrl = response.urljoin(reviewNextPageRelativeUrl)
        yield Request(url=reviewNextPageAbsoluteUrl, callback=getCrrFromReviewPage())

class UsAmazonSpider(scrapy.Spider):
    name = 'Test_Crawler'
    allowed_domains = ['amazon.in']
    start_urls = ['https://www.amazon.in/Philips-Trimmer-Cordless-Corded-QT4011/dp/B00JJIDBIC/ref=sr_1_3?keywords=philips&qid=1554266853&s=gateway&sr=8-3']

    def parse(self, response):
        titleH1 = getProductTitleH1(response),
        customerReviewData = getCustomerRatingsAndComments(response)
        yield {
            'Title_H1': titleH1,
            'customer_Review_Data': customerReviewData
        }
I'm getting the following response:
{'Title_H1': (['Philips Beard Trimmer Cordless and Corded for Men QT4011/15'],), 'customer_Review_Data': <generator object getCustomerRatingsAndComments at 0x048AC630>}
The "Customer_review_Data" should be a list of dict of title and review
I am not able to figure out as to what mistake I am doing here.
When I use the log() or print() to see what data is captured in customerReviewDataList[], unable to see the data in the console either.
I am able to scrape all the reviews in customerReviewDataList[], if they are present in the product page,
In this scenario where I have to use the yield function I am getting the output stated above like this [https://ibb.co/kq8w6cf]
This is the kind of output I am looking for:
[{'customerReviewTitle': ['Difficult to find a charger adapter'], 'customerReviewComment': ['I already have a phillips trimmer which was only cordless.']}, {'customerReviewTitle': ['Good Product'], 'customerReviewComment': ['Solves my need perfectly HK']}]
Any help is appreciated. Thanks in advance.
You should complete the Scrapy tutorial. The "Following links" section should be especially helpful to you.
This is a simplified version of your code:
from scrapy import Request, Spider

def data_request_iterator():
    yield Request('https://example.org')

class MySpider(Spider):
    name = 'myspider'
    start_urls = ['https://example.com']

    def parse(self, response):
        yield {
            'title': response.css('title::text').get(),
            'data': data_request_iterator(),
        }
Instead, it should look like this:
from scrapy import Request, Spider

class MySpider(Spider):
    name = 'myspider'
    start_urls = ['https://example.com']

    def parse(self, response):
        item = {
            'title': response.css('title::text').get(),
        }
        yield Request('https://example.org', meta={'item': item}, callback=self.parse_data)

    def parse_data(self, response):
        item = response.meta['item']
        # TODO: Extend item with data from this second response as needed.
        yield item
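As a side note, and assuming a reasonably recent Scrapy version: since Scrapy 1.7 the same hand-off can be done with cb_kwargs instead of meta, which delivers the partial item as a plain keyword argument to the callback:

def parse(self, response):
    item = {'title': response.css('title::text').get()}
    # cb_kwargs entries arrive as keyword arguments of the callback (Scrapy >= 1.7)
    yield Request('https://example.org', cb_kwargs={'item': item}, callback=self.parse_data)

def parse_data(self, response, item):
    # extend the item with data from this second response as needed
    yield item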

Extract data from two pages with Scrapy

I have an agenda page as my starting page. This page contains the start times and titles of events and links to the detail page of each event.
My spider extracts all the event details (description, location, etc.) from the detail page of each single event, except the start time, which I have to extract from the start page.
How can I extract the start time from the start page and the other data from each detail page?
What is the Scrapy way to go? Using meta['item']? I don't get it...
This is my spider for now. Any help greatly appreciated!
class LuSpider(scrapy.Spider):
    name = "lu"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/agenda"]

    def parse(self, response):
        for href in response.css("div.toggle_container_show > div > a::attr('href')"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_agenda_contents)

    def parse_agenda_contents(self, response):
        for sel in response.xpath('//div[@class="container"]'):
            item = LuItem()
            item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
            item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
            yield item
Edit:
I tried to extract the start time from the start page using request.meta['item'], but I get a list of all the start times on the start page for every event. How do I get the start time for each individual event?
Can someone show me the right direction?
class LuSpider(scrapy.Spider):
    name = "lu"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/agenda"]

    def parse(self, response):
        item = LuItem()
        item['StartTime'] = response.xpath('//div[contains(., "H")]/span/text()').extract()
        for href in response.css("div.toggle_container_show > div > a::attr('href')"):
            url = response.urljoin(href.extract())
            request = scrapy.Request(url, callback=self.parse_agenda_contents)
            request.meta['item'] = item
            yield request

    def parse_agenda_contents(self, response):
        for sel in response.xpath('//div[@class="container"]'):
            item = response.meta['item']
            item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
            item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
            yield item
You are right. Using meta would do it in your case. Please see the official documentation here: http://doc.scrapy.org/en/latest/topics/request-response.html#passing-additional-data-to-callback-functions
def parse_page1(self, response):
    item = MyItem()
    item['main_url'] = response.url
    request = scrapy.Request("http://www.example.com/some_page.html",
                             callback=self.parse_page2)
    request.meta['item'] = item
    return request

def parse_page2(self, response):
    item = response.meta['item']
    item['other_url'] = response.url
    return item
This worked:
class LuSpider(scrapy.Spider):
    name = "lu"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/agenda"]

    def parse(self, response):
        StartTimes = response.xpath('//div[@class="toggle_container_show"]/div/span/text()').extract()
        urls = response.xpath('//div[@class="toggle_container_show"]/div/a/@href').extract()
        for StartTime, url in zip(StartTimes, urls):
            item = LuItem()
            item['StartTime'] = StartTime
            request = scrapy.Request(url, callback=self.parse_agenda_contents)
            request.meta['item'] = item
            yield request

    def parse_agenda_contents(self, response):
        for sel in response.xpath('//div[@class="container"]'):
            item = response.meta['item']
            item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
            item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
            yield item
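One detail worth noting, assuming the agenda links can be relative (the first version of parse() above ran them through response.urljoin): it is safer to make them absolute before building the request, and urljoin is a no-op for URLs that are already absolute:

for StartTime, url in zip(StartTimes, urls):
    item = LuItem()
    item['StartTime'] = StartTime
    # make relative hrefs absolute; leaves absolute URLs untouched
    request = scrapy.Request(response.urljoin(url), callback=self.parse_agenda_contents)
    request.meta['item'] = item
    yield request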

Scrapy isn't working right when extracting the title

In this code I want to scrape the title, subtitle and the data inside the links, but I'm having issues on pages beyond 1 and 2: only 1 item gets scraped. I also want to extract only those entries whose title contains "delhivery".
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from delhivery.items import DelhiveryItem

class criticspider(CrawlSpider):
    name = "delh"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery&page=2"]

    def parse(self, response):
        sites = response.xpath('//table[@width="100%"]')
        items = []
        for site in sites:
            item = DelhiveryItem()
            item['title'] = site.xpath('.//td[@class="complaint"]/a/span[@style="background-color:yellow"]/text()').extract()[0]
            # item['title'] = site.xpath('.//td[@class="complaint"]/a[text() = "%s Delivery Courier %s"]/text()').extract()[0]
            item['subtitle'] = site.xpath('.//td[@class="compl-text"]/div/b[1]/text()').extract()[0]
            item['date'] = site.xpath('.//td[@class="small"]/text()').extract()[0].strip()
            item['username'] = site.xpath('.//td[@class="small"]/a[2]/text()').extract()[0]
            item['link'] = site.xpath('.//td[@class="complaint"]/a/@href').extract()[0]
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)
            items.append(item)

    def anchor_page(self, response):
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//td[@style="padding-bottom:15px"]/div/text()').extract()[0]
        yield old_item
You need to change item['title'] to this:

item['title'] = ''.join(site.xpath('//table[@width="100%"]//span[text() = "Delhivery"]/parent::*//text()').extract()[0])

Also edit sites so that it extracts only the required links (the ones with Delhivery in them):

sites = response.xpath('//table//span[text()="Delhivery"]/ancestor::div')
EDIT:
So I understand now that you need to add a pagination rule to your code.
It should be something like the following; you just need to add your imports and write the new XPaths against the item page itself, such as the one linked in the comments below.
class criticspider(CrawlSpider):
    name = "delh"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]

    rules = (
        # Extracting pages, allowing only links with page=number to be extracted
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div[@class="pagelinks"]', ), allow=('page=\d+', ), unique=True), follow=True),
        # Extract links of items on each page the spider gets from the first rule
        Rule(SgmlLinkExtractor(restrict_xpaths=('//td[@class="complaint"]', )), callback='parse_item'),
    )

    def parse_item(self, response):
        item = DelhiveryItem()
        # populate the item object here the same way you did; this function will be called for each item link.
        # This means that you'll be extracting data from pages like this one:
        # http://www.consumercomplaints.in/complaints/delhivery-last-mile-courier-service-poor-delivery-service-c772900.html#c1880509
        item['title'] = response.xpath('<write xpath>').extract()[0]
        item['subtitle'] = response.xpath('<write xpath>').extract()[0]
        item['date'] = response.xpath('<write xpath>').extract()[0].strip()
        item['username'] = response.xpath('<write xpath>').extract()[0]
        item['link'] = response.url
        item['data'] = response.xpath('<write xpath>').extract()[0]
        yield item
Also, when you write an XPath, I suggest you avoid styling attributes: prefer @class or @id, and only use @width, @style or other styling attributes if there is no other way.
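For example, reusing selectors already shown in the question (not verified against the live page), the complaint links are addressed more robustly by their class than the surrounding table is by its width:

# brittle: tied to presentation, breaks as soon as the table width changes
sites = response.xpath('//table[@width="100%"]')

# more robust: keyed on a semantic class attribute
links = response.xpath('//td[@class="complaint"]/a/@href').extract()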