Scrapy: how to send the items to the site via the API

Now my spiders are sending data to my site in this way:
import requests
import json

def parse_product(response, **cb_kwargs):
    item = {}
    item['url'] = response.url
    data = {
        "source_id": 505,
        "token": f"{API_TOKEN}",
        "products": [item]
    }
    headers = {'Content-Type': 'application/json'}
    url = 'http://some.site.com/api/'
    requests.post(url=url, headers=headers, data=json.dumps(data))
Is it possible to implement this through a pipeline or middleware somehow? It is inconvenient to write this out for every spider.
P.S. The data needs to be sent as JSON (json.dumps(data)); if I make the item an instance of an Item class (item = MyItemClass()), an error occurs...

It can be done fairly easily using a pipeline. You can also use Scrapy's Item class and Field class, as long as you cast the item to a dict before calling json.dumps.
For example:
import json
import requests

class Pipeline:
    def process_item(self, item, spider):
        data = dict(item)  # cast the Item (or dict) to a plain dict before serializing
        headers = {'Content-Type': 'application/json'}
        url = 'http://some.site.com/api/'
        requests.post(url=url, headers=headers, data=json.dumps(data))
        return item
With this in place the pipeline is called for each and every item you yield from your spider. Just remember to activate it in your settings.py file.
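For instance, a minimal activation could look like this (the module path myproject.pipelines.Pipeline is an assumption about your project layout):

# settings.py -- the module path below is hypothetical; adjust it to your project
ITEM_PIPELINES = {
    'myproject.pipelines.Pipeline': 300,
}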

I found another solution (on GitHub); maybe someone will find it interesting...
pipeline.py
import json
import logging

import requests
from scrapy.utils.serialize import ScrapyJSONEncoder
from twisted.internet.defer import DeferredLock
from twisted.internet.threads import deferToThread

default_serialize = ScrapyJSONEncoder().encode


class HttpPostPipeline(object):
    settings = None
    items_buffer = []

    DEFAULT_HTTP_POST_PIPELINE_BUFFERED = False
    DEFAULT_HTTP_POST_PIPELINE_BUFFER_SIZE = 100

    def __init__(self, url, headers=None, serialize_func=default_serialize):
        """Initialize pipeline.

        Parameters
        ----------
        url : str
            Endpoint the items are posted to.
        headers : dict, optional
            Extra HTTP headers sent with every request.
        serialize_func : callable
            Items serializer function.
        """
        self.url = url
        self.headers = headers if headers else {}
        self.serialize_func = serialize_func
        self._lock = DeferredLock()

    @classmethod
    def from_crawler(cls, crawler):
        params = {
            'url': crawler.settings.get('HTTP_POST_PIPELINE_URL'),
        }
        if crawler.settings.get('HTTP_POST_PIPELINE_HEADERS'):
            params['headers'] = crawler.settings['HTTP_POST_PIPELINE_HEADERS']
        ext = cls(**params)
        ext.settings = crawler.settings
        return ext

    def process_item(self, item, spider):
        if self.settings.get('HTTP_POST_PIPELINE_BUFFERED', self.DEFAULT_HTTP_POST_PIPELINE_BUFFERED):
            self._lock.run(self._process_items, item)
            return item
        else:
            return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        data = self.serialize_func(item)
        requests.post(self.url, json=json.loads(data), headers=self.headers)
        return item

    def _process_items(self, item):
        self.items_buffer.append(item)
        if len(self.items_buffer) >= int(self.settings.get('HTTP_POST_PIPELINE_BUFFER_SIZE',
                                                           self.DEFAULT_HTTP_POST_PIPELINE_BUFFER_SIZE)):
            deferToThread(self.send_items, self.items_buffer)
            self.items_buffer = []

    def send_items(self, items):
        logging.debug("Sending batch of {} items".format(len(items)))
        serialized_items = [self.serialize_func(item) for item in items]
        requests.post(self.url, json=[json.loads(data) for data in serialized_items], headers=self.headers)

    def close_spider(self, spider):
        if len(self.items_buffer) > 0:
            deferToThread(self.send_items, self.items_buffer)
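The pipeline reads its configuration from the crawler settings in from_crawler, so enabling it comes down to something like the following in settings.py (the module path in ITEM_PIPELINES is an assumption):

# settings.py -- 'myproject.pipelines.HttpPostPipeline' is a hypothetical path
ITEM_PIPELINES = {
    'myproject.pipelines.HttpPostPipeline': 300,
}
HTTP_POST_PIPELINE_URL = 'http://some.site.com/api/'
HTTP_POST_PIPELINE_HEADERS = {'Content-Type': 'application/json'}
HTTP_POST_PIPELINE_BUFFERED = True       # buffer items and post them in batches
HTTP_POST_PIPELINE_BUFFER_SIZE = 100     # batch size used when buffering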


Scrapy: upgrade the pipeline to send items

I have a class in pipelines.py that sends items to my server's API:
class MyPipeline:
    def process_item(self, item, spider):
        data = {
            "source_id": 'name_of_the_running_spider',
            "token": "token",
            "products": [dict(item)],
        }
        headers = {'Content-Type': 'application/json'}
        url = 'http://for.example.com/my-api/'
        requests.post(url=url, headers=headers, data=json.dumps(data))
        return item
The problem is that the pipeline sends a request for every single item ("products": [dict(item)]). Is it possible to somehow pass a list of items to "products" (for example, batches of 10)? In the spider itself this could be organized with a loop and a counter, but how can it be implemented in pipeline.py?
After some testing I have come up with a possible solution. It adds functionality to the pipeline by storing each item in a list, with a separate method that tracks how many items have been collected, automatically triggers the request once the list reaches a certain threshold, and then resets the list back to empty. In the pipeline's close_spider method you can then check whether any remaining items haven't been sent yet and send those.
For the spider name: the pipeline's process_item method receives the spider instance, so to get the spider's name attribute all you need to do is use spider.name. If instead you want the name of the spider class, you can either do some regex on type(spider) or simply add the class name as an attribute to the spider and read it through spider.classname.
For example:
pipelines.py
class MyPipeline:
    def __init__(self):
        self._request_data = []
        self._url = 'http://for.example.com/my-api/'
        self._headers = {'Content-Type': 'application/json'}
        self._max_number_of_requests = 10

    def process_item(self, item, spider):
        spidername = spider.name
        if len(self._request_data) >= self._max_number_of_requests:
            self.send_post_request(spidername)
        self._request_data.append(dict(item))  # cast so json.dumps also works with Item objects
        return item

    def send_post_request(self, spidername):
        data = {"source_id": spidername,
                "token": "token",
                "products": self._request_data}
        response = requests.post(url=self._url,
                                 headers=self._headers,
                                 data=json.dumps(data))
        if response.status_code != 200:
            print(f"REQUEST FAILED: status code {response.status_code}")
        self._request_data = []

    def close_spider(self, spider):
        # flush whatever is still buffered when the spider finishes
        if len(self._request_data) > 0:
            self.send_post_request(spider.name)
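As a small aside on the class-name option mentioned above: you don't actually need a regex, since type(spider).__name__ gives the class name directly. A purely illustrative sketch:

class NamePeekPipeline:
    """Hypothetical pipeline showing both ways to identify the running spider."""
    def process_item(self, item, spider):
        spider_name = spider.name                  # e.g. 'my_spider'
        spider_classname = type(spider).__name__   # e.g. 'MySpider', no regex needed
        return item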

Changing next page url within scraper and loading

I am trying to get inside several URLs of a webpage and follow each response to the next parser to grab another set of URLs on that page. From there I need to grab the next-page URLs, but I wanted to try building them by parsing and manipulating the page string and then passing the result as the next page. However, the scraper crawls but returns nothing, not even the output of the final parser where I load the item.
Note: I know that I can grab the next page rather simply with an if-statement on the href. However, I wanted to try something different in case I had to face a situation where I would have to do this.
Here's my scraper:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader

class ZooplasItem(scrapy.Item):
    stuff = Field()

class ZooplasSpider(scrapy.Spider):
    name = 'zooplas'
    start_urls = ['https://www.zoopla.co.uk/overseas/']

    def start_request(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse, )

    def parse(self, response):
        container = response.xpath("//ul[@class='list-inline list-unstyled']//li")
        for links in container:
            urls = links.xpath(".//a/@href").get()
            yield response.follow(
                urls, callback=self.parse_places
            )

    def parse_places(self, response):
        container = response.xpath("//ul[@class='listing-results clearfix js-gtm-list']//li")
        for links in container:
            urls = links.xpath('(//div[@class="listing-results-right clearfix"]//a)[position() mod 3=1]//@href').get()
            yield response.follow(
                urls, callback=self.parse_listings
            )
        if response.xpath("//div[@id='content']//div//h1//text()").extract_first():
            page_on = response.xpath("//div[@id='content']//div//h1//text()").extract_first()
            name_of_page = page_on.split()[-1]
        else:
            pass
        if response.xpath("(//div[@class='paginate bg-muted'])//a[last()-1]//href").extract_first():
            url_link = response.xpath("(//div[@class='paginate bg-muted'])//a[last()-1]//href").extract_first()
            url_link = url_link.split('/')
            last_page = url_link[-1].split('=')[-1]
        else:
            pass
        all_pages = []
        for index, n in enumerate(url_link):
            for page_name, page_num in zip(name_of_page, last_page):
                if index == 5:
                    url_link[index] = page_name
                    testit = '/'.join(url_link)
                    equal_split = testit.split('=')
                    for another_i, n2 in enumerate(equal_split):
                        if another_i == 3:
                            for range_val in range(1, page_num+1):
                                equal_split[another_i] = str(2)
                                all_pages.append('='.join(equal_split))
        for urls in all_pages:
            yield response.follow(
                urls, callback=self.parse.places
            )

    def parse_listings(self, response):
        loader = ItemLoader(ZooplasItem(), response=response)
        loader.default.output_processor = TakeFirst()
        loader.add_xpath("//article[@class='dp-sidebar-wrapper__summary']//h1//text()")
        yield loader.load_item()

process = CrawlerProcess(
    settings={
        'FEED_URI': 'zoopla.jl',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(ZooplasSpider)
process.start()
I know the way of grabbing the URLs works, as I have tried it on a single URL using the following:
url = "https://www.zoopla.co.uk/overseas/property/ireland/?new_homes=include&include_sold=false&pn=16"
a = url.split('/')  # same split as url_link in the spider above
list_of_stuff = ['Ireland', 'Germany', 'France']
pages_of_stuff = [5, 7, 6]
test = []
all_pages = []
j = 0
for index, n in enumerate(a):
    for l_stuff, p_stuff in zip(list_of_stuff, pages_of_stuff):
        if index == 5:
            a[index] = l_stuff
            testit = '/'.join(a)
            equal_split = testit.split('=')
            for another_i, n2 in enumerate(equal_split):
                if another_i == 3:
                    for range_val in range(1, p_stuff+1):
                        equal_split[another_i] = str(range_val)
                        print('='.join(equal_split))
This is the same as the code used above, just with a change of variables. It outputs the following links, and they work:
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=1
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=2
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=3
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=4
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=5
https://www.zoopla.co.uk/overseas/property/Germany/?new_homes=include&include_sold=false&pn=1
https://www.zoopla.co.uk/overseas/property/Germany/?new_homes=include&include_sold=false&pn=2
...
Your use case is suited to Scrapy's crawl spider. You can write rules for how to extract the links to the properties and how to extract the links to the next pages. I have changed your code to use a crawl spider class, and I have changed your feed settings to the recommended FEEDS setting, since FEED_URI and FEED_FORMAT are deprecated in newer versions of Scrapy.
Read more about the crawl spider in the docs.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader

class ZooplasItem(scrapy.Item):
    stuff = Field()
    country = Field()

class ZooplasSpider(CrawlSpider):
    name = 'zooplas'
    allowed_domains = ['zoopla.co.uk']
    start_urls = ['https://www.zoopla.co.uk/overseas/']

    rules = (
        Rule(LinkExtractor(restrict_css='a.link-novisit'), follow=True),  # follow the country links
        Rule(LinkExtractor(restrict_css='div.paginate'), follow=True),  # follow pagination links
        Rule(LinkExtractor(restrict_xpaths="//a[contains(@class,'listing-result')]"), callback='parse_item', follow=True),  # follow the link to the actual property listing
    )

    def parse_item(self, response):
        # here you are on the details page for each property
        loader = ItemLoader(ZooplasItem(), response=response)
        loader.default_output_processor = TakeFirst()
        loader.add_xpath("stuff", "//article[@class='dp-sidebar-wrapper__summary']//h1//text()")
        loader.add_xpath("country", "//li[@class='ui-breadcrumbs__item'][3]/a/text()")
        yield loader.load_item()

if __name__ == '__main__':
    process = CrawlerProcess(
        settings={
            'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36',
            'FEEDS': {
                'zoopla.jl': {
                    'format': 'jsonlines'
                }
            }
        }
    )
    process.crawl(ZooplasSpider)
    process.start()
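If you run the spider from a regular Scrapy project rather than a standalone script, the same feed export configuration would go into settings.py, for example:

# settings.py equivalent of the FEEDS option passed to CrawlerProcess above
FEEDS = {
    'zoopla.jl': {
        'format': 'jsonlines',
    },
}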

Need to return Scrapy callback method data to calling function

In the code below I am trying to collect email IDs from a website. They can be on the contact or the about-us page.
From the parse method I follow up with the extemail method for all those pages.
From every page I collect a few email IDs.
Now I need to print them together with the original record passed to the __init__ method.
For example:
record = "https://www.wockenfusscandies.com/"
I want to print the output as:
https://www.wockenfusscandies.com/|abc@gmail.com|def@outlook.com
I am not able to store them in self.emails and deliver them back to the __init__ method.
Please help.
import scrapy
from scrapy.crawler import CrawlerProcess

class EmailSpider(scrapy.Spider):
    def __init__(self, record):
        self.record = record
        self.emails = []
        url = record.split("|")[4]
        if not url.startswith("http"):
            url = "http://{}".format(url)
        if url:
            self.start_urls = ["https://www.wockenfusscandies.com/"]
        else:
            self.start_urls = []

    def parse(self, response):
        contact_list = [a.attrib['href'] for a in response.css('a') if 'contact' in a.attrib['href'] or 'about' in a.attrib['href']]
        contact_list.append(response.request.url)
        for fllink in contact_list:
            yield response.follow(fllink, self.extemail)

    def extemail(self, response):
        emails = response.css('body').re('[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
        yield {
            'emails': emails
        }

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

f = open("/Users/kalpesh/work/data/test.csv")
for rec in f:
    process.crawl(EmailSpider, record=rec)
f.close()
process.start()
If I understand your intent correctly, you could try the following approach:
a) collect the mail IDs in self.emails, like
def extemail(self, response):
    emails = response.css('body').re('[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
    self.emails.extend(emails)  # accumulate results from all pages instead of overwriting
    yield {
        'emails': emails
    }
(or in whatever other way you extract the email IDs from emails)
b) add a close(self, reason) method, as in the GitHub example, which is called when the spider has finished:
def close(self, reason):
    mails_for_record = ""
    for mail in self.emails:
        mails_for_record += mail + "|"
    print(self.record + mails_for_record)
Please also note: I read somewhere that for some versions of Scrapy it is def close(self, reason), for others it is def closed(self, reason).
Hope this approach helps you.
You should visit all of the site's pages before yielding a result for that site.
This means you should keep a queue of pages to visit and storage for the results.
It can be done using meta.
Some pseudocode:
def parse(self, response):
    meta = response.meta
    if not meta.get('seen'):
        # -- finding urls of contact and about us pages --
        # -- putting them into meta['queue'] --
        # -- setting meta['seen'] = True --

    page_emails_found = ...getting emails here...
    # --- extending already discovered emails
    # --- from other pages/initial empty list with new ones
    meta['emails'].extend(page_emails_found)

    # if queue isn't empty - yielding new request
    if meta['queue']:
        next_url = meta['queue'].pop()
        yield Request(next_url, callback=self.parse, meta=copy(meta))
    # if queue is empty - yielding result from meta
    else:
        yield {'url': current_domain, 'emails': meta['emails']}
Something like this..
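A minimal concrete sketch of that pseudocode might look like this; the contact/about link filter and the email regex are taken from the question, while the rest (spider name, start URL handling) is illustrative:

import scrapy
from copy import copy

class EmailQueueSpider(scrapy.Spider):
    name = 'email_queue'
    start_urls = ['https://www.wockenfusscandies.com/']

    def parse(self, response):
        meta = response.meta
        if not meta.get('seen'):
            # first hit on the site: build the queue of contact/about pages
            meta['queue'] = [a.attrib['href'] for a in response.css('a')
                             if 'contact' in a.attrib.get('href', '') or 'about' in a.attrib.get('href', '')]
            meta['emails'] = []
            meta['seen'] = True
        # collect the emails found on the current page
        meta['emails'].extend(
            response.css('body').re(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'))
        if meta['queue']:
            next_url = meta['queue'].pop()
            yield response.follow(next_url, callback=self.parse, meta=copy(meta))
        else:
            yield {'url': response.url, 'emails': meta['emails']}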

Scrapy Spider which reads from Warc file

I am looking for a Scrapy spider that, instead of getting URLs and crawling them, gets a WARC file as input (preferably from S3) and sends the content to the parse method.
I actually need to skip the whole download phase, meaning that from the start_requests method I would like to return a Response that is then sent to the parse method.
This is what I have so far:
import gzip

import warc
from scrapy import Spider
from scrapy.http import Response

class WarcSpider(Spider):
    name = "warc_spider"

    def start_requests(self):
        f = warc.WARCFile(fileobj=gzip.open("file.war.gz"))
        for record in f:
            if record.type == "response":
                payload = record.payload.read()
                headers, body = payload.split('\r\n\r\n', 1)
                url = record['WARC-Target-URI']
                yield Response(url=url, status=200, body=body, headers=headers)

    def parse(self, response):
        # code that creates the item
        pass
Any ideas on what the Scrapy way of doing that is?
What you want to do is something like this: build the Response from the WARC record in a downloader middleware, so the actual download is skipped and your callback still receives a normal Response.
import gzip

import warc
from scrapy import Spider, Request
from scrapy.http import Response

class DummyMdw(object):
    def process_request(self, request, spider):
        # build the Response from the WARC record instead of downloading anything
        record = request.meta['record']
        payload = record.payload.read()
        headers, body = payload.split('\r\n\r\n', 1)
        url = record['WARC-Target-URI']
        return Response(url=url, status=200, body=body, headers=headers)

class WarcSpider(Spider):
    name = "warc_spider"

    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {'x.DummyMdw': 1}
    }

    def start_requests(self):
        f = warc.WARCFile(fileobj=gzip.open("file.war.gz"))
        for record in f:
            if record.type == "response":
                url = record['WARC-Target-URI']
                yield Request(url, callback=self.parse, meta={'record': record})

    def parse(self, response):
        # code that creates the item
        pass

how to generate javascript request to scrapy+webkit?

I need to scrape a webpage generated by JavaScript. The href looks like javascript:__doPostBack('pager','4') and is rendered as the link 4.
I wrote a WebkitDownloader middleware in Scrapy and it works.
class WebkitDownloader( object ):
    def process_request( self, request, spider ):
        if spider.name in settings.WEBKIT_DOWNLOADER:
            if( type(request) is not FormRequest ):
                print '===================Processing non-FormRequest=================='
                webview = webkit.WebView()
                webview.connect( 'load-finished', lambda v,f: gtk.main_quit() )
                webview.load_uri( request.url )
                gtk.main()
                js = jswebkit.JSContext( webview.get_main_frame().get_global_context() )
                renderedBody = str( js.EvaluateScript( 'document.documentElement.innerHTML' ) )
                #print renderedBody
                return HtmlResponse( request.url, body=renderedBody )
I still don't know how to send a request to the WebkitDownloader from my spider's parse method.
Can I use something like request.append("javascript:__doPostBack('pager','4')") and send that JavaScript request to the WebkitDownloader? Or how do I build the JavaScript request?
You can use this code in your middleware:
from scrapy.http import HtmlResponse
import gtk
import webkit
import jswebkit

class WebkitDownloader( object ):

    def stop_gtk(self, v, f):
        gtk.main_quit()

    def _get_webview(self):
        webview = webkit.WebView()
        props = webview.get_settings()
        props.set_property('enable-java-applet', False)
        props.set_property('enable-plugins', False)
        props.set_property('enable-page-cache', False)
        return webview

    def process_request( self, request, spider ):
        if 'renderjs' in request.meta:
            webview = self._get_webview()
            webview.connect('load-finished', self.stop_gtk)
            webview.load_uri(request.url)
            gtk.main()
            ctx = jswebkit.JSContext(webview.get_main_frame().get_global_context())
            url = ctx.EvaluateScript('window.location.href')
            html = ctx.EvaluateScript('document.documentElement.innerHTML')
            return HtmlResponse(url, encoding='utf-8', body=html.encode('utf-8'))
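The middleware only renders requests that carry 'renderjs' in their meta, so the spider has to opt in per request and the middleware has to be enabled; the module path, priority and callback name below are assumptions:

# settings.py -- 'myproject.middlewares.WebkitDownloader' is a hypothetical path
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.WebkitDownloader': 543,
}

# in the spider callback: flag the request so the middleware renders it
yield scrapy.Request(url, callback=self.parse_page, meta={'renderjs': True})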