Scrapy Spider which reads from a WARC file

I am looking for a Scrapy Spider that, instead of taking URLs and crawling them, takes a WARC file as input (preferably from S3) and sends the content to the parse method.
I actually need to skip the whole download phase, which means that from the start_requests method I would like to return a Response that is then sent to the parse method.
This is what I have so far:
import gzip

import warc
from scrapy import Spider
from scrapy.http import Response


class WarcSpider(Spider):
    name = "warc_spider"

    def start_requests(self):
        f = warc.WARCFile(fileobj=gzip.open("file.war.gz"))
        for record in f:
            if record.type == "response":
                payload = record.payload.read()
                headers, body = payload.split('\r\n\r\n', 1)
                url = record['WARC-Target-URI']
                yield Response(url=url, status=200, body=body, headers=headers)

    def parse(self, response):
        # code that creates the item
        pass
Any ideas on what the Scrapy way of doing this is?

What you want to do is something like this:

import gzip

import warc
from scrapy import Request, Spider
from scrapy.http import Response


class DummyMdw(object):

    def process_request(self, request, spider):
        # Build the Response directly from the WARC record attached to the
        # request, so the download phase is skipped entirely.
        record = request.meta['record']
        payload = record.payload.read()
        headers, body = payload.split('\r\n\r\n', 1)
        url = record['WARC-Target-URI']
        return Response(url=url, status=200, body=body, headers=headers)


class WarcSpider(Spider):
    name = "warc_spider"

    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {'x.DummyMdw': 1}
    }

    def start_requests(self):
        f = warc.WARCFile(fileobj=gzip.open("file.war.gz"))
        for record in f:
            if record.type == "response":
                url = record['WARC-Target-URI']
                yield Request(url, callback=self.parse, meta={'record': record})

    def parse(self, response):
        # code that creates the item
        pass
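Since the question mentions reading the WARC from S3: a minimal sketch of streaming it with boto3 instead of opening a local file (the bucket name, key and helper function are assumptions, not part of the code above):

import gzip
import io

import boto3  # assumption: boto3 is installed and AWS credentials are configured
import warc


def warc_records_from_s3(bucket="my-bucket", key="crawl/file.warc.gz"):
    """Yield 'response' records from a gzipped WARC file stored in S3."""
    raw = boto3.client("s3").get_object(Bucket=bucket, Key=key)["Body"].read()
    warc_file = warc.WARCFile(fileobj=gzip.GzipFile(fileobj=io.BytesIO(raw)))
    for record in warc_file:
        if record.type == "response":
            yield record

In start_requests you would then iterate over warc_records_from_s3() instead of the local WARCFile.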

Related

How to set default cookies for SitemapSpider?

I am trying to set my own headers and cookies when crawling using SitemapSpider:
import scrapy
from scrapy.spiders import SitemapSpider


class MySpider(SitemapSpider):
    name = 'myspider'
    sitemap_urls = ['https://www.sitemap-1.xml']
    headers = {'pragma': 'no-cache'}
    cookies = {"sdsd": "23234"}

    def _request_sitemaps(self, response):
        for url in self.sitemap_urls:
            yield scrapy.Request(url=url, headers=self.headers,
                                 cookies=self.cookies, callback=self._parse_sitemap)

    def parse(self, response, **cb_kwargs):
        print(response.css('title::text').get())

... but it doesn't work (the cookies and headers are not passed). How can I implement this?
My solution:

from scrapy import Request
from scrapy.spiders import SitemapSpider


class MySpider(SitemapSpider):
    name = 'spider'
    sitemap_urls = ['https://www.sitemap-1.xml']
    headers = {'authority': 'www.example.com'}
    cookies = {"dsd": "jdjsj233"}

    def start_requests(self):
        for url in self.sitemap_urls:
            yield Request(url, self._parse_sitemap)

    def _parse_sitemap(self, response):
        response = response.replace(body=self._get_sitemap_body(response))
        for request in super()._parse_sitemap(response):
            url = request.url
            endpoint_request = request.replace(
                url=url,
                callback=self.parse,
                headers=self.headers,
                cookies=self.cookies,
            )
            yield endpoint_request

    def parse(self, response, **cb_kwargs):
        print(response.css('title::text').get())
According to the source code of the SitemapSpider I think renaming _request_sitemaps to start_requests should do the trick.
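For reference, a minimal sketch of that rename (a sketch only; it reuses the attributes from the question and assumes the headers and cookies only need to be applied to the sitemap requests themselves):

import scrapy
from scrapy.spiders import SitemapSpider


class MySpider(SitemapSpider):
    name = 'myspider'
    sitemap_urls = ['https://www.sitemap-1.xml']
    headers = {'pragma': 'no-cache'}
    cookies = {"sdsd": "23234"}

    def start_requests(self):
        # Same body as _request_sitemaps in the question, just under the
        # name Scrapy actually calls when the spider is opened.
        for url in self.sitemap_urls:
            yield scrapy.Request(url=url, headers=self.headers,
                                 cookies=self.cookies, callback=self._parse_sitemap)

    def parse(self, response, **cb_kwargs):
        print(response.css('title::text').get())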

Scrapy: how to send the items to the site via the API

Now my spiders are sending data to my site in this way:

def parse_product(self, response, **cb_kwargs):
    item = {}
    item['url'] = response.url

    data = {
        "source_id": 505,
        "token": f"{API_TOKEN}",
        "products": [item]
    }
    headers = {'Content-Type': 'application/json'}
    url = 'http://some.site.com/api/'
    requests.post(url=url, headers=headers, data=json.dumps(data))

Is it possible to implement this through a pipeline or middleware instead, because it is inconvenient to write this in every spider?
P.S. The data needs to be sent in JSON format (json.dumps(data)); if I make the item a MyItemClass() instance, an error occurs...
It can be done using a pipeline fairly easily. You can also use Scrapy's Item and Field classes, as long as you cast the item to a dict before calling json.dumps.
For example:

import json

import requests


class Pipeline:

    def process_item(self, item, spider):
        data = dict(item)
        headers = {'Content-Type': 'application/json'}
        url = 'http://some.site.com/api/'
        requests.post(url=url, headers=headers, data=json.dumps(data))
        return item

If you use this example, it will be called for each and every item you yield from your spider. Just remember to activate it in your settings.py file.
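For example, if the pipeline lives in myproject/pipelines.py (the module path is an assumption; use your project's actual path):

# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.Pipeline': 300,
}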
I found another solution (on GitHub); maybe someone will find it useful...
pipeline.py

import json
import logging

import requests
from scrapy.utils.serialize import ScrapyJSONEncoder
from twisted.internet.defer import DeferredLock
from twisted.internet.threads import deferToThread

default_serialize = ScrapyJSONEncoder().encode


class HttpPostPipeline(object):
    settings = None
    items_buffer = []

    DEFAULT_HTTP_POST_PIPELINE_BUFFERED = False
    DEFAULT_HTTP_POST_PIPELINE_BUFFER_SIZE = 100

    def __init__(self, url, headers=None, serialize_func=default_serialize):
        """Initialize pipeline.

        Parameters
        ----------
        url : str
            Endpoint the items are POSTed to.
        headers : dict, optional
            Extra HTTP headers sent with each request.
        serialize_func : callable
            Items serializer function.
        """
        self.url = url
        self.headers = headers if headers else {}
        self.serialize_func = serialize_func
        self._lock = DeferredLock()

    @classmethod
    def from_crawler(cls, crawler):
        params = {
            'url': crawler.settings.get('HTTP_POST_PIPELINE_URL'),
        }
        if crawler.settings.get('HTTP_POST_PIPELINE_HEADERS'):
            params['headers'] = crawler.settings['HTTP_POST_PIPELINE_HEADERS']
        ext = cls(**params)
        ext.settings = crawler.settings
        return ext

    def process_item(self, item, spider):
        if self.settings.get('HTTP_POST_PIPELINE_BUFFERED', self.DEFAULT_HTTP_POST_PIPELINE_BUFFERED):
            self._lock.run(self._process_items, item)
            return item
        else:
            return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        data = self.serialize_func(item)
        requests.post(self.url, json=json.loads(data), headers=self.headers)
        return item

    def _process_items(self, item):
        self.items_buffer.append(item)
        if len(self.items_buffer) >= int(self.settings.get('HTTP_POST_PIPELINE_BUFFER_SIZE',
                                                           self.DEFAULT_HTTP_POST_PIPELINE_BUFFER_SIZE)):
            deferToThread(self.send_items, self.items_buffer)
            self.items_buffer = []

    def send_items(self, items):
        logging.debug("Sending batch of {} items".format(len(items)))
        serialized_items = [self.serialize_func(item) for item in items]
        requests.post(self.url, json=[json.loads(data) for data in serialized_items], headers=self.headers)

    def close_spider(self, spider):
        if len(self.items_buffer) > 0:
            deferToThread(self.send_items, self.items_buffer)
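To wire it up, something like the following in settings.py should work (the pipeline path is an assumption; the setting names come from the code above):

# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.HttpPostPipeline': 300,
}
HTTP_POST_PIPELINE_URL = 'http://some.site.com/api/'
HTTP_POST_PIPELINE_HEADERS = {'Content-Type': 'application/json'}
HTTP_POST_PIPELINE_BUFFERED = True    # batch items instead of posting one by one
HTTP_POST_PIPELINE_BUFFER_SIZE = 100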

Need to return Scrapy callback method data to calling function

In the code below I am trying to collect email ids from a website. They can be on the contact or about-us page.
From the parse method I follow up with the extemail method for all those pages.
From every page I collect a few email ids.
Now I need to print them together with the original record sent to the __init__ method.
For example:
record = "https://www.wockenfusscandies.com/"
I want to print the output as:
https://www.wockenfusscandies.com/|abc@gmail.com|def@outlook.com
I am not able to store them in self.emails and deliver them back to the calling code.
Please help.
import scrapy
from scrapy.crawler import CrawlerProcess


class EmailSpider(scrapy.Spider):

    def __init__(self, record):
        self.record = record
        self.emails = []

        url = record.split("|")[4]
        if not url.startswith("http"):
            url = "http://{}".format(url)

        if url:
            self.start_urls = ["https://www.wockenfusscandies.com/"]
        else:
            self.start_urls = []

    def parse(self, response):
        contact_list = [a.attrib['href'] for a in response.css('a')
                        if 'contact' in a.attrib['href'] or 'about' in a.attrib['href']]
        contact_list.append(response.request.url)

        for fllink in contact_list:
            yield response.follow(fllink, self.extemail)

    def extemail(self, response):
        emails = response.css('body').re(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
        yield {
            'emails': emails
        }


process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

f = open("/Users/kalpesh/work/data/test.csv")
for rec in f:
    process.crawl(EmailSpider, record=rec)
f.close()

process.start()
If I understand your intent correctly, you could try the following approach:
a) collect the email ids in self.emails, like

def extemail(self, response):
    emails = response.css('body').re(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
    self.emails = emails.copy()
    yield {
        'emails': emails
    }

(or however else you extract the email ids from emails)
b) add a close(self, reason) method, as in the GitHub example, which is called when the spider has finished:

def close(self, reason):
    mails_for_record = ""
    for mail in self.emails:
        mails_for_record += mail + "|"
    print(self.record + mails_for_record)

Please also note: I read somewhere that for some versions of Scrapy it is def close(self, reason), for others it is def closed(self, reason).
Hope this approach helps you.
You should visit all of the site's pages before yielding a result for that site.
This means you need a queue of pages to visit and storage for the results.
It can be done using meta.
Some pseudocode:

def parse(self, response):
    meta = response.meta
    if not meta.get('seen'):
        # -- find the urls of the contact and about-us pages --
        # -- put them into meta['queue'] --
        # -- set meta['seen'] = True --

    page_emails_found = ...getting emails here...

    # extend the already discovered emails
    # (from other pages / the initial empty list) with the new ones
    meta['emails'].extend(page_emails_found)

    # if the queue isn't empty - yield a new request
    if meta['queue']:
        next_url = meta['queue'].pop()
        yield Request(next_url, callback=self.parse, meta=copy(meta))
    # if the queue is empty - yield the result from meta
    else:
        yield {'url': current_domain, 'emails': meta['emails']}

Something like this...
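A slightly more concrete sketch of that idea (an illustration only, not the asker's exact code; the link filtering and the email regex are assumptions):

import scrapy


class EmailQueueSpider(scrapy.Spider):
    name = 'email_queue_spider'
    start_urls = ['https://www.wockenfusscandies.com/']

    def parse(self, response):
        meta = response.meta
        if not meta.get('seen'):
            # First page of the site: build the queue of contact/about pages.
            queue = [response.urljoin(href)
                     for href in response.css('a::attr(href)').getall()
                     if 'contact' in href or 'about' in href]
            meta = {'seen': True, 'queue': queue, 'emails': []}

        # Collect the emails found on the current page.
        found = response.css('body').re(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
        meta['emails'].extend(found)

        if meta['queue']:
            # Pages left to visit: carry the accumulated state forward.
            next_url = meta['queue'].pop()
            yield scrapy.Request(next_url, callback=self.parse, meta=dict(meta))
        else:
            # Queue exhausted: emit one item for the whole site.
            yield {'url': response.url, 'emails': meta['emails']}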

scrapy splash set input value?

I've successfully been able to load JavaScript-generated HTML with scrapy-splash. Now I want to set a couple of input values which are not part of a form. As soon as I put in a value, the content on the site changes. I haven't found a way to set the input values and re-scrape the adjusted HTML. Is this possible?
import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ["example.com"]
    start_urls = (
        'https://example.com',
    )

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse, meta={
                'splash': {
                    'endpoint': 'render.html',
                    'args': {'wait': 3}
                }
            })

    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = 'screener-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
You need to put the input handling inside a lua_source script, as someone suggested in the comments. Here is an example that clicks a button:

script = """
function main(splash)
    local url = splash.args.url
    assert(splash:go(url))
    assert(splash:runjs('document.getElementsByClassName("nameofbutton")[0].click()'))
    assert(splash:wait(0.75))

    -- return the result as a JSON object
    return {
        html = splash:html()
    }
end
"""

then execute the script like this:

def start_requests(self):
    for url in self.start_urls:
        yield scrapy.Request(url, self.parse_item, meta={
            'splash': {
                'args': {'lua_source': self.script},
                'endpoint': 'execute',
            }
        })
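For setting an input value rather than clicking a button, the same pattern applies; below is a rough sketch (the CSS selector, the value, and the event the page listens for are assumptions, adjust them to the target site):

script = """
function main(splash)
    local url = splash.args.url
    assert(splash:go(url))
    -- set the input's value and fire a 'change' event so the page reacts;
    -- the selector 'input#my-field' and the value '42' are assumptions
    assert(splash:runjs([[
        var el = document.querySelector('input#my-field');
        el.value = '42';
        var ev = document.createEvent('HTMLEvents');
        ev.initEvent('change', true, false);
        el.dispatchEvent(ev);
    ]]))
    assert(splash:wait(0.75))
    return {
        html = splash:html()
    }
end
"""

The request is then sent through the 'execute' endpoint exactly as in the example above.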

Is it possible to pass a variable from start_requests() to parse() for each individual request?

I'm using a loop to generate my requests inside start_requests() and I'd like to pass the loop index to parse() so it can be stored in the item. However, when I use self.i, the output has the maximum value of i (from the last loop iteration) for every item. I could extract the index from response.url with a regex, but I wonder if there is a clean way to pass a variable from start_requests() to parse().
You can use scrapy.Request meta attribute:
import scrapy


class MySpider(scrapy.Spider):
    name = 'myspider'

    def start_requests(self):
        urls = [...]
        for index, url in enumerate(urls):
            yield scrapy.Request(url, meta={'index': index})

    def parse(self, response):
        print(response.url)
        print(response.meta['index'])
You can pass the cb_kwargs argument to scrapy.Request():
https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.cb_kwargs

import scrapy


class MySpider(scrapy.Spider):
    name = 'myspider'

    def start_requests(self):
        urls = [...]
        for index, url in enumerate(urls):
            yield scrapy.Request(url, callback=self.parse, cb_kwargs={'index': index})

    def parse(self, response, index):
        pass
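A small usage sketch of storing that index in the yielded item (the item keys are assumptions):

def parse(self, response, index):
    # cb_kwargs entries arrive as keyword arguments of the callback
    yield {'url': response.url, 'index': index}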