I've successfully been able to load JavaScript-generated HTML with scrapy-splash. Now I want to set a couple of input values which are not part of a form. As soon as I put in a value, the content on the site changes. I haven't found a way to set the input values and re-scrape the adjusted HTML. Is this possible?
class ExampleSpider(scrapy.Spider):
    """Fetch each start URL through Splash and save the returned HTML to disk."""

    name = "example"
    allowed_domains = ["example.com"]
    start_urls = (
        'https://example.com',
    )

    def start_requests(self):
        """Yield one request per start URL, routed to Splash's render.html endpoint."""
        for start_url in self.start_urls:
            # Build a fresh options dict per request so requests never share state.
            splash_options = {
                'endpoint': 'render.html',
                'args': {'wait': 3},
            }
            yield scrapy.Request(start_url, self.parse, meta={'splash': splash_options})

    def parse(self, response):
        """Write the response body to ``screener-<page>.html`` and log the filename."""
        # The second-to-last "/"-separated piece of the URL names the output file.
        url_parts = response.url.split("/")
        filename = 'screener-%s.html' % url_parts[-2]
        with open(filename, 'wb') as out_file:
            out_file.write(response.body)
        self.log('Saved file %s' % filename)
You need to set the input from inside a Lua script, as someone suggested in the comments. The following example shows how to click a button:
# Lua script for Splash's `execute` endpoint: load the page, click a button,
# wait briefly for the page to update, then return the resulting HTML.
script = """
function main(splash)
  local url = splash.args.url
  assert(splash:go(url))
  -- getElementsByClassName returns a collection, so pick the first match before clicking
  assert(splash:runjs('document.getElementsByClassName("nameofbutton")[0].click()'))
  assert(splash:wait(0.75))
  -- return result as a JSON object
  return {
    html = splash:html()
  }
end
"""
then execute the script like this:
def start_requests(self):
    """Yield one request per start URL that runs ``self.script`` via Splash's execute endpoint."""
    for start_url in self.start_urls:
        splash_meta = {
            'endpoint': 'execute',
            'args': {'lua_source': self.script},
        }
        yield scrapy.Request(start_url, self.parse_item, meta={'splash': splash_meta})
Related
I am trying to set my own headers and cookies when crawling using SitemapSpider:
class MySpider(SitemapSpider):
    """Sitemap spider that declares custom headers and cookies for its requests."""

    name = 'myspider'
    sitemap_urls = ['https://www.sitemap-1.xml']
    headers = {'pragma': 'no-cache'}
    cookies = {"sdsd": "23234"}

    def _request_sitemaps(self, response):
        """Yield a request for every sitemap URL, carrying the class headers and cookies."""
        for sitemap_url in self.sitemap_urls:
            yield scrapy.Request(
                url=sitemap_url,
                headers=self.headers,
                cookies=self.cookies,
                callback=self._parse_sitemap,
            )

    def parse(self, response, **cb_kwargs):
        """Print the text of the page's <title> element."""
        page_title = response.css('title::text').get()
        print(page_title)
... but it doesn't work (cookies and headers are not passed), how can I implement it?
My solution:
class MySpider(SitemapSpider):
    """Sitemap spider that attaches custom headers and cookies to every request."""

    name = 'spider'
    sitemap_urls = ['https://www.sitemap-1.xml']
    headers = {'authority': 'www.example.com'}
    cookies = {"dsd": "jdjsj233"}

    def start_requests(self):
        """Request each configured sitemap with the custom headers and cookies."""
        for url in self.sitemap_urls:
            yield Request(
                url,
                self._parse_sitemap,
                headers=self.headers,
                cookies=self.cookies,
            )

    def _parse_sitemap(self, response):
        """Re-emit the parent's requests with the custom headers and cookies added.

        The callback chosen by the parent class is left untouched, so requests
        for nested sitemaps still go back to ``_parse_sitemap`` and page
        requests still go to whatever callback the sitemap rules selected.
        """
        for request in super()._parse_sitemap(response):
            yield request.replace(headers=self.headers, cookies=self.cookies)

    def parse(self, response, **cb_kwargs):
        """Print the text of the page's <title> element."""
        print(response.css('title::text').get())
According to the source code of the SitemapSpider I think renaming _request_sitemaps to start_requests should do the trick.
I'm trying to extract some fields from start_url, and I want to add a PDF link field that is obtained from each of the book URLs found there. I tried Scrapy but had no luck adding the PDF field. Here is my code:
import scrapy
class MybookSpider(scrapy.Spider):
    """Scrape title, URL and thumbnail for each book on the listing page."""

    name = 'mybooks'
    allowed_domains = ['gln.kemdikbud.go.id']
    start_urls = ['https://gln.kemdikbud.go.id/glnsite/category/modul-gls/page/1/']

    def parse(self, response):
        """Yield one item per book link found on the listing page."""
        # gathering all links
        book_urls = response.xpath("//div[@class='td-module-thumb']//a/@href").getall()
        total_url = len(book_urls)
        for i in range(total_url):
            title = response.xpath("//h3[@class='entry-title td-module-title']//a/text()")[i].extract()
            url_source = response.xpath("//div[@class='td-module-thumb']//a/@href")[i].get()
            thumbnail = response.xpath('//*[@class="td-block-span4"]//*[has-class("entry-thumb")]//@src')[i].extract()
            # NOTE: this is a Request object, not the PDF link itself -- this is
            # the behaviour the question is asking about.
            pdf = scrapy.Request(book_urls[i], self.find_details)
            yield {
                'Book Title': title,
                'URL': url_source,
                'Mini IMG': thumbnail,
                'PDF Link': pdf
            }

    def find_details(self, response):
        """Return the first link found inside the post content."""
        # find PDF link
        pdf = response.xpath("//div[@class='td-post-content']//a/@href").get()
        return pdf
How do I add a PDF link field correctly when I export it as CSV? Thanks in advance
pdf = scrapy.Request(book_urls[i], self.find_details)
This means the pdf variable is a Request object, not the PDF link.
Scrapy is asynchronous, so you'll have trouble getting a return value from a callback function. Instead, make a request and pass the item details to the callback with cb_kwargs.
import scrapy
class MybookSpider(scrapy.Spider):
    """Scrape book details from the listing page and add the PDF link from each book page."""

    name = 'mybooks'
    allowed_domains = ['gln.kemdikbud.go.id']
    start_urls = ['https://gln.kemdikbud.go.id/glnsite/category/modul-gls/page/1/']

    def parse(self, response):
        """Build a partial item per book and request its detail page.

        The partial item travels to ``find_details`` through ``cb_kwargs``.
        """
        # Run each query once instead of once per loop iteration.
        titles = response.xpath("//h3[@class='entry-title td-module-title']//a/text()").getall()
        book_urls = response.xpath("//div[@class='td-module-thumb']//a/@href").getall()
        thumbnails = response.xpath(
            '//*[@class="td-block-span4"]//*[has-class("entry-thumb")]//@src'
        ).getall()

        # zip() stops at the shortest list, so a missing title or thumbnail
        # shortens the output instead of raising IndexError.
        for title, book_url, thumbnail in zip(titles, book_urls, thumbnails):
            item = {
                'title': title,
                'url_source': book_url,
                'thumbnail': thumbnail,
            }
            yield scrapy.Request(url=book_url, callback=self.find_details, cb_kwargs={'item': item})

    def find_details(self, response, item):
        """Add the first link inside the post content to the item and yield it."""
        # find PDF link
        item['pdf'] = response.xpath("//div[@class='td-post-content']//a/@href").get()
        yield item
Can anyone help me? I'm practicing and I can't understand what I did wrong on pagination! It only returns the first page to me and sometimes an error comes up. When it works, it just returns the first page.
"The source list for the Content Security Policy directive 'frame-src' contains an invalid source '*trackcmp.net' It will be ignored", source: https://naturaldaterra.com.br/hortifruti.html?page=2"
import scrapy
from scrapy_selenium import SeleniumRequest
class ComputerdealsSpider(scrapy.Spider):
    """Scrape product names and prices from the listing pages using Selenium."""

    name = 'produtos'

    def start_requests(self):
        """Request the first listing page through Selenium."""
        yield SeleniumRequest(
            url='https://naturaldaterra.com.br/hortifruti.html?page=1',
            wait_time=3,
            callback=self.parse
        )

    def parse(self, response):
        """Yield one item per product, then request the next page if one is found."""
        for produto in response.xpath("//div[@class='gallery-items-1IC']/div"):
            yield {
                'nome_produto': produto.xpath(".//div[@class='item-nameContainer-1kz']/span/text()").get(),
                'valor_produto': produto.xpath(".//span[@class='itemPrice-price-1R-']/text()").getall(),
            }

        # The text of the selected button is used as the next page number.
        next_page = response.xpath("//button[@class='tile-root-1uO'][1]/text()").get()
        if next_page:
            absolute_url = f"https://naturaldaterra.com.br/hortifruti.html?page={next_page}"
            yield SeleniumRequest(
                url=absolute_url,
                wait_time=3,
                callback=self.parse
            )
The problem is that your xpath selector returns None instead of the next page number. Consider changing it from
next_page = response.xpath("//button[@class='tile-root-1uO'][1]/text()").get()
to
next_page = response.xpath("//button[@class='tile-root_active-TUl tile-root-1uO']/following-sibling::button[1]/text()").get()
For your future projects, consider using scrapy-playwright to scrape JS-rendered websites. It is faster and simpler to use. Here is a sample implementation of your scraper using scrapy-playwright:
import scrapy
from scrapy.crawler import CrawlerProcess
class ComputerdealsSpider(scrapy.Spider):
    """Scrape product names and prices from the listing pages using scrapy-playwright."""

    name = 'produtos'

    def start_requests(self):
        """Request the first listing page, rendered through Playwright."""
        yield scrapy.Request(
            url='https://naturaldaterra.com.br/hortifruti.html?page=1',
            meta={"playwright": True}
        )

    def parse(self, response):
        """Yield one item per product, then follow the next page while one exists."""
        for produto in response.xpath("//div[@class='gallery-items-1IC']/div"):
            yield {
                'nome_produto': produto.xpath(".//div[@class='item-nameContainer-1kz']/span/text()").get(),
                'valor_produto': produto.xpath(".//span[@class='itemPrice-price-1R-']/text()").getall(),
            }

        # scrape next page: the button right after the active one holds its number
        next_page = response.xpath(
            "//button[@class='tile-root_active-TUl tile-root-1uO']/following-sibling::button[1]/text()").get()
        # When no following button is found, next_page is None; stop instead of
        # raising TypeError on the string concatenation.
        if next_page:
            yield scrapy.Request(
                url='https://naturaldaterra.com.br/hortifruti.html?page=' + next_page,
                meta={"playwright": True}
            )
if __name__ == "__main__":
    # Run the spider in-process, using the asyncio reactor and the Playwright
    # download handler for https URLs.
    crawler_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
    }
    process = CrawlerProcess(settings=crawler_settings)
    process.crawl(ComputerdealsSpider)
    process.start()
I am trying to scrape the following website: https://sabobic.blogabet.com
My crawler already crawls the content I need. But after clicking on the "See older"-Button I do not know how to start my method "crawltips" again.
This is my current code:
class AlltipsSpider(Spider):
    """Scrape tips from a blogabet page, using a Selenium driver to load older posts."""

    name = 'alltips'
    allowed_domains = ['blogabet.com']

    def start_requests(self):
        """Open the page in Chrome and hand control to ``crawltips``."""
        # Raw string: the backslashes in the Windows path are literal, not escapes.
        self.driver = webdriver.Chrome(r'C:\webdrivers\chromedriver.exe')
        # Place all user urls here
        url = "https://sabobic.blogabet.com"
        self.driver.get(url)
        yield scrapy.http.Request(url, callback=self.crawltips)

    def crawltips(self, response):
        """Yield the posts currently shown by the driver, then click "See older"."""
        # Parse what the Selenium driver currently shows, not the Scrapy response.
        sel = Selector(text=self.driver.page_source)
        allposts = sel.xpath('//*[@class="block media _feedPick feed-pick"]')
        for post in allposts:
            username = post.xpath('.//div[@class="col-sm-7 col-lg-6 no-padding"]/a/@title').extract()
            publish_date = post.xpath('.//*[@class="bet-age text-muted"]/text()').extract()
            yield {
                'Username': username,
                'Publish date': publish_date
            }
        try:
            self.driver.find_element_by_id('last_item').click()
            sleep(5)
        except NoSuchElementException:
            self.logger.info('No more tipps')
        # NOTE(review): `url` is not defined in this method -- this is the
        # problem the question is asking about.
        yield Request(url, callback=self.crawltips)
I think something is wrong with the yield statement because I don't have a new URL...
The following should work:
yield scrapy.Request(self.driver.current_url,callback=self.crawltips)
I am looking for a Scrapy spider that, instead of taking URLs and crawling them, takes a WARC file as input (preferably from S3) and sends its content to the parse method.
I actually need to skip the whole download phase. That means that from the start_requests method I would like to return a Response that is then sent to the parse method.
This is what i have so far:
class WarcSpider(Spider):
    """Spider that reads "response" records from a gzipped WARC file."""

    name = "warc_spider"

    def start_requests(self):
        """Yield a Response built from every "response" record in the WARC file."""
        warc_file = warc.WARCFile(fileobj=gzip.open("file.war.gz"))
        for record in warc_file:
            if record.type != "response":
                continue
            raw_payload = record.payload.read()
            # Everything before the first blank line is the header section.
            headers, body = raw_payload.split('\r\n\r\n', 1)
            target_url = record['WARC-Target-URI']
            yield Response(url=target_url, status=200, body=body, headers=headers)

    def parse(self, response):
        """Placeholder for the item-building logic."""
        #code that creates item
        pass
Any ideas on what the Scrapy way of doing that is?
What you want to do is something like this:
class DummyMdw(object):
    """Middleware that builds a Response from the WARC record stored in ``request.meta``."""

    def process_request(self, request, spider):
        """Return a 200 Response assembled from ``request.meta['record']``."""
        warc_record = request.meta['record']
        raw_payload = warc_record.payload.read()
        # Everything before the first blank line is the header section.
        headers, body = raw_payload.split('\r\n\r\n', 1)
        target_url = warc_record['WARC-Target-URI']
        return Response(
            url=target_url,
            status=200,
            body=body,
            headers=headers,
        )
class WarcSpider(Spider):
    """Spider that yields one request per "response" record of a gzipped WARC file.

    Each record travels in ``request.meta['record']``; the middleware
    configured in ``custom_settings`` is expected to turn it into a Response.
    """

    name = "warc_spider"
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {'x.DummyMdw': 1}
    }

    def start_requests(self):
        """Yield a request for every "response" record in the WARC file."""
        f = warc.WARCFile(fileobj=gzip.open("file.war.gz"))
        for record in f:
            if record.type == "response":
                # The request URL comes from the record itself.
                url = record['WARC-Target-URI']
                yield Request(url, callback=self.parse, meta={'record': record})

    def parse(self, response):
        """Placeholder for the item-building logic."""
        #code that creates item
        pass