Scrapy & Selenium - Load next pages

I am trying to scrape the following website: https://sabobic.blogabet.com
My crawler already extracts the content I need. But after clicking the "See older" button, I do not know how to start my method "crawltips" again.
This is my current code:
import scrapy
from scrapy import Spider, Request
from scrapy.selector import Selector
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from time import sleep

class AlltipsSpider(Spider):
    name = 'alltips'
    allowed_domains = ['blogabet.com']

    def start_requests(self):
        self.driver = webdriver.Chrome(r'C:\webdrivers\chromedriver.exe')
        # Place all user urls here
        url = "https://sabobic.blogabet.com"
        self.driver.get(url)
        yield scrapy.http.Request(url, callback=self.crawltips)

    def crawltips(self, response):
        sel = Selector(text=self.driver.page_source)
        allposts = sel.xpath('//*[@class="block media _feedPick feed-pick"]')
        for post in allposts:
            username = post.xpath('.//div[@class="col-sm-7 col-lg-6 no-padding"]/a/@title').extract()
            publish_date = post.xpath('.//*[@class="bet-age text-muted"]/text()').extract()
            yield {'Username': username,
                   'Publish date': publish_date}

        try:
            self.driver.find_element_by_id('last_item').click()
            sleep(5)
        except NoSuchElementException:
            self.logger.info('No more tips')

        yield Request(url, callback=self.crawltips)
I think something is wrong with the yield because I don't have a new URL...

The following should work:
yield scrapy.Request(self.driver.current_url, callback=self.crawltips)
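Note that Scrapy's duplicate filter drops a second request to the same URL by default, and clicking "See older" usually does not change the URL, so you may also need dont_filter=True. A minimal sketch of what the end of crawltips could look like under that assumption:

        try:
            self.driver.find_element_by_id('last_item').click()
            sleep(5)
            # same URL after the click, so bypass the dupefilter to re-enter crawltips
            yield scrapy.Request(self.driver.current_url,
                                 callback=self.crawltips,
                                 dont_filter=True)
        except NoSuchElementException:
            self.logger.info('No more tips')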

Related

How to set default cookies for SitemapSpider?

I am trying to set my own headers and cookies when crawling using SitemapSpider:
import scrapy
from scrapy.spiders import SitemapSpider

class MySpider(SitemapSpider):
    name = 'myspider'
    sitemap_urls = ['https://www.sitemap-1.xml']
    headers = {'pragma': 'no-cache'}
    cookies = {"sdsd": "23234"}

    def _request_sitemaps(self, response):
        for url in self.sitemap_urls:
            yield scrapy.Request(url=url, headers=self.headers, cookies=self.cookies, callback=self._parse_sitemap)

    def parse(self, response, **cb_kwargs):
        print(response.css('title::text').get())
... but it doesn't work (the cookies and headers are not passed). How can I implement it?
My solution:
from scrapy import Request
from scrapy.spiders import SitemapSpider

class MySpider(SitemapSpider):
    name = 'spider'
    sitemap_urls = ['https://www.sitemap-1.xml']
    headers = {'authority': 'www.example.com'}
    cookies = {"dsd": "jdjsj233"}

    def start_requests(self):
        for url in self.sitemap_urls:
            yield Request(url, self._parse_sitemap)

    def _parse_sitemap(self, response):
        response = response.replace(body=self._get_sitemap_body(response))
        for request in super()._parse_sitemap(response):
            endpoint_request = request.replace(
                callback=self.parse,
                headers=self.headers,
                cookies=self.cookies,
            )
            yield endpoint_request

    def parse(self, response, **cb_kwargs):
        print(response.css('title::text').get())
According to the source code of SitemapSpider, I think renaming _request_sitemaps to start_requests should do the trick.
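For illustration, a minimal sketch of that rename, assuming the headers and cookies only need to be set on the sitemap requests themselves (the solution above additionally propagates them to the endpoint requests via _parse_sitemap):

import scrapy
from scrapy.spiders import SitemapSpider

class MySpider(SitemapSpider):
    name = 'myspider'
    sitemap_urls = ['https://www.sitemap-1.xml']
    headers = {'pragma': 'no-cache'}
    cookies = {"sdsd": "23234"}

    # overrides SitemapSpider.start_requests, which is what Scrapy actually calls
    def start_requests(self):
        for url in self.sitemap_urls:
            yield scrapy.Request(url, headers=self.headers, cookies=self.cookies,
                                 callback=self._parse_sitemap)

    def parse(self, response, **cb_kwargs):
        print(response.css('title::text').get())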

Extract page from start_urls and find pdf link from every extracted page using Scrapy

I'm trying to extract some fields from the start_urls page, and I want to add a PDF link field obtained from each extracted URL. I tried Scrapy but had no luck adding the PDF field. Here is my code:
import scrapy

class MybookSpider(scrapy.Spider):
    name = 'mybooks'
    allowed_domains = ['gln.kemdikbud.go.id']
    start_urls = ['https://gln.kemdikbud.go.id/glnsite/category/modul-gls/page/1/']

    def parse(self, response):
        # gathering all links
        book_urls = response.xpath("//div[@class='td-module-thumb']//a/@href").getall()
        total_url = len(book_urls)
        i = 0
        for a in range(total_url):
            title = response.xpath("//h3[@class='entry-title td-module-title']//a/text()")[i].extract()
            url_source = response.xpath("//div[@class='td-module-thumb']//a/@href")[i].get()
            thumbnail = response.xpath('//*[@class="td-block-span4"]//*[has-class("entry-thumb")]//@src')[i].extract()
            pdf = scrapy.Request(book_urls[i], self.find_details)
            yield {
                'Book Title': title,
                'URL': url_source,
                'Mini IMG': thumbnail,
                'PDF Link': pdf
            }
            i += 1

    def find_details(self, response):
        # find PDF link
        pdf = response.xpath("//div[@class='td-post-content']//a/@href").get()
        return pdf
How do I add a PDF link field correctly when I export it as CSV? Thanks in advance
pdf = scrapy.Request(book_urls[i], self.find_details)
This means the pdf variable is a Request object, not the actual link.
Scrapy is asynchronous, so you'll have trouble getting a return value from a function. Just make a request and pass the details to the callback with cb_kwargs.
import scrapy

class MybookSpider(scrapy.Spider):
    name = 'mybooks'
    allowed_domains = ['gln.kemdikbud.go.id']
    start_urls = ['https://gln.kemdikbud.go.id/glnsite/category/modul-gls/page/1/']

    def parse(self, response):
        # gathering all links
        book_urls = response.xpath("//div[@class='td-module-thumb']//a/@href").getall()
        total_url = len(book_urls)
        for i in range(total_url):
            item = dict()
            item['title'] = response.xpath("//h3[@class='entry-title td-module-title']//a/text()")[i].extract()
            item['url_source'] = response.xpath("//div[@class='td-module-thumb']//a/@href")[i].get()
            item['thumbnail'] = response.xpath('//*[@class="td-block-span4"]//*[has-class("entry-thumb")]//@src')[i].extract()
            yield scrapy.Request(url=book_urls[i], callback=self.find_details, cb_kwargs={'item': item})

    def find_details(self, response, item):
        # find PDF link
        item['pdf'] = response.xpath("//div[@class='td-post-content']//a/@href").get()
        yield item
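With this approach the PDF link ends up on the same item as the other fields, so a plain feed export should include it, for example (assuming the spider name above):

    scrapy crawl mybooks -o books.csv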

Scrapy-Selenium Pagination

Can anyone help me? I'm practicing and I can't understand what I did wrong with the pagination. It only returns the first page, and sometimes this error comes up:
"The source list for the Content Security Policy directive 'frame-src' contains an invalid source '*trackcmp.net' It will be ignored", source: https://naturaldaterra.com.br/hortifruti.html?page=2"
import scrapy
from scrapy_selenium import SeleniumRequest

class ComputerdealsSpider(scrapy.Spider):
    name = 'produtos'

    def start_requests(self):
        yield SeleniumRequest(
            url='https://naturaldaterra.com.br/hortifruti.html?page=1',
            wait_time=3,
            callback=self.parse
        )

    def parse(self, response):
        for produto in response.xpath("//div[@class='gallery-items-1IC']/div"):
            yield {
                'nome_produto': produto.xpath(".//div[@class='item-nameContainer-1kz']/span/text()").get(),
                'valor_produto': produto.xpath(".//span[@class='itemPrice-price-1R-']/text()").getall(),
            }

        next_page = response.xpath("//button[@class='tile-root-1uO'][1]/text()").get()
        if next_page:
            absolute_url = f"https://naturaldaterra.com.br/hortifruti.html?page={next_page}"
            yield SeleniumRequest(
                url=absolute_url,
                wait_time=3,
                callback=self.parse
            )
The problem is that your XPath selector returns None instead of the next page number. Consider changing it from
next_page = response.xpath("//button[#class='tile-root-1uO'][1]/text()").get()
to
next_page = response.xpath("//button[#class='tile-root_active-TUl tile-root-1uO']/following-sibling::button[1]/text()").get()
For your future projects, consider using scrapy-playwright to scrape JS-rendered websites. It is faster and simpler to use. Here is a sample implementation of your scraper using scrapy-playwright:
import scrapy
from scrapy.crawler import CrawlerProcess

class ComputerdealsSpider(scrapy.Spider):
    name = 'produtos'

    def start_requests(self):
        yield scrapy.Request(
            url='https://naturaldaterra.com.br/hortifruti.html?page=1',
            meta={"playwright": True}
        )

    def parse(self, response):
        for produto in response.xpath("//div[@class='gallery-items-1IC']/div"):
            yield {
                'nome_produto': produto.xpath(".//div[@class='item-nameContainer-1kz']/span/text()").get(),
                'valor_produto': produto.xpath(".//span[@class='itemPrice-price-1R-']/text()").getall(),
            }

        # scrape next page (the selector returns None on the last page)
        next_page = response.xpath(
            "//button[@class='tile-root_active-TUl tile-root-1uO']/following-sibling::button[1]/text()").get()
        if next_page:
            yield scrapy.Request(
                url='https://naturaldaterra.com.br/hortifruti.html?page=' + next_page,
                meta={"playwright": True}
            )

if __name__ == "__main__":
    process = CrawlerProcess(settings={
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
    })
    process.crawl(ComputerdealsSpider)
    process.start()
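If scrapy-playwright is not installed yet, it typically needs both the package and the browser binaries before this will run, something like:

    pip install scrapy-playwright
    playwright install chromium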

Scrapy Splash - set input value?

I've successfully been able to load JavaScript-generated HTML with scrapy-splash. Now I want to set a couple of input values which are not part of a form. As soon as I put in a value, the content on the site changes. I haven't found a way to set the input values and re-scrape the adjusted HTML. Is this possible?
import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ["example.com"]
    start_urls = (
        'https://example.com',
    )

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse, meta={
                'splash': {
                    'endpoint': 'render.html',
                    'args': {'wait': 3}
                }
            })

    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = 'screener-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
You need to put the interaction inside a Lua script (lua_source), as someone suggested in the comments. The following is an example that clicks a button:
script ="""
function main(splash)
local url = splash.args.url
assert(splash:go(url))
assert(splash:runjs('document.getElementsByClassName("nameofbutton").click()'))
assert(splash:wait(0.75))
-- return result as a JSON object
return {
html = splash:html()
}
end
"""
Then execute the script like this:
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse_item, meta={
                'splash': {
                    'args': {'lua_source': self.script},
                    'endpoint': 'execute',
                }
            })
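For setting an input value rather than clicking a button, the same pattern applies. A minimal sketch, assuming the input can be reached with a CSS selector; the '#myInput' selector and the value are placeholders, and the dispatched 'input' event is there so the page's own JavaScript notices the change:

script = """
function main(splash)
    assert(splash:go(splash.args.url))
    assert(splash:wait(1))
    -- set the value and fire an input event so the page re-renders
    assert(splash:runjs([[
        var el = document.querySelector('#myInput');
        el.value = 'my value';
        el.dispatchEvent(new Event('input', {bubbles: true}));
    ]]))
    assert(splash:wait(1))
    return {
        html = splash:html()
    }
end
"""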

Extract data from two pages with Scrapy

I have an agenda as a starting page. This page contains the start times and titles of events, and links to the detail page of each event.
My spider extracts all the event details (description, location, etc.) on the detail page of each single event, except the start time, which I have to extract from my start page.
How can I extract the start time from the start page and the other data on each detail page?
What is the Scrapy way to go? Using meta['item']? I don't get it...
This is my spider for now. Any help greatly appreciated!
import scrapy
# LuItem is the project's Item class, defined in items.py

class LuSpider(scrapy.Spider):
    name = "lu"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/agenda"]

    def parse(self, response):
        for href in response.css("div.toggle_container_show > div > a::attr('href')"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_agenda_contents)

    def parse_agenda_contents(self, response):
        for sel in response.xpath('//div[@class="container"]'):
            item = LuItem()
            item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
            item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
            yield item
Edit:
I tried to extract the start time from the start page using request.meta['item'], but I get a list of all the start times on the start page for every event. How do I get the start time for each individual event?
Can someone show me the right direction?
class LuSpider(scrapy.Spider):
    name = "lu"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/agenda"]

    def parse(self, response):
        item = LuItem()
        item['StartTime'] = response.xpath('//div[contains(., "H")]/span/text()').extract()
        for href in response.css("div.toggle_container_show > div > a::attr('href')"):
            url = response.urljoin(href.extract())
            request = scrapy.Request(url, callback=self.parse_agenda_contents)
            request.meta['item'] = item
            yield request

    def parse_agenda_contents(self, response):
        for sel in response.xpath('//div[@class="container"]'):
            item = response.meta['item']
            item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
            item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
            yield item
You are right. Using meta would do it in your case. Please see the official documentation here: http://doc.scrapy.org/en/latest/topics/request-response.html#passing-additional-data-to-callback-functions
def parse_page1(self, response):
    item = MyItem()
    item['main_url'] = response.url
    request = scrapy.Request("http://www.example.com/some_page.html",
                             callback=self.parse_page2)
    request.meta['item'] = item
    return request

def parse_page2(self, response):
    item = response.meta['item']
    item['other_url'] = response.url
    return item
This worked:
class LuSpider(scrapy.Spider):
    name = "lu"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/agenda"]

    def parse(self, response):
        StartTimes = response.xpath('//div[@class="toggle_container_show"]/div/span/text()').extract()
        urls = response.xpath('//div[@class="toggle_container_show"]/div/a/@href').extract()
        for StartTime, url in zip(StartTimes, urls):
            item = LuItem()
            item['StartTime'] = StartTime
            request = Request(url, callback=self.parse_agenda_contents)
            request.meta['item'] = item
            yield request

    def parse_agenda_contents(self, response):
        for sel in response.xpath('//div[@class="container"]'):
            item = response.meta['item']
            item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
            item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
            yield item
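On newer Scrapy versions (1.7+), cb_kwargs is the documented alternative to request.meta for passing data to callbacks. A minimal sketch of the same zip-based approach under that assumption:

    def parse(self, response):
        start_times = response.xpath('//div[@class="toggle_container_show"]/div/span/text()').extract()
        urls = response.xpath('//div[@class="toggle_container_show"]/div/a/@href').extract()
        for start_time, url in zip(start_times, urls):
            # pass the start time straight into the detail-page callback
            yield scrapy.Request(url, callback=self.parse_agenda_contents,
                                 cb_kwargs={'start_time': start_time})

    def parse_agenda_contents(self, response, start_time):
        for sel in response.xpath('//div[@class="container"]'):
            item = LuItem()
            item['StartTime'] = start_time
            item['EventTitle'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]/h1[@id]/text()').extract()
            item['Description'] = sel.xpath('div[@class="content"]/div/div[@class="sliderContent"]//p').extract()
            yield item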