Modified Scrapy middleware blocks the main program - scrapy

I replaced the request library inside my downloader middleware, but now the requests are synchronous and I lose Scrapy's asynchronous performance; the library I use to send requests cannot work asynchronously. What should I do?
Code snippet:
import pyhttpx
from scrapy.http import HtmlResponse
from scrapy.utils.project import get_project_settings

class Ja3DownloaderMiddleware:
    def __init__(self):
        self.session = pyhttpx.HttpSession(browser_type='chrome')

    def process_request(self, request, spider):
        proxies = {
            'https': '127.0.0.1:7890',
            'http': '127.0.0.1:7890',
        }
        headers = get_project_settings().get('DEFAULT_REQUEST_HEADERS')
        if request.method == 'GET':
            response = self.session.get(request.url, headers=headers, proxies=proxies)
        elif request.method == 'POST':
            response = self.session.post(request.url, headers=headers, proxies=proxies)
        else:
            raise ValueError(f"Unknown request method: {request.method}")
        # body = str.encode(response.body)
        return HtmlResponse(response.request.url, body=response.text, encoding='utf-8', request=request)
I tried to make it asynchronous, but this library does not support it. I would like the middleware not to block after this modification.
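One workaround (not from the original post) is to keep the blocking pyhttpx call off the reactor thread by running it in Twisted's thread pool with deferToThread. A minimal sketch, assuming a Scrapy 2.x version whose downloader middleware manager waits for a Deferred returned from process_request, and assuming pyhttpx's HttpSession can be shared across threads; verify both against your versions:

import pyhttpx
from scrapy.http import HtmlResponse
from twisted.internet import threads

class ThreadedJa3DownloaderMiddleware:
    def __init__(self):
        self.session = pyhttpx.HttpSession(browser_type='chrome')
        self.proxies = {'https': '127.0.0.1:7890', 'http': '127.0.0.1:7890'}

    def process_request(self, request, spider):
        # Run the blocking call in Twisted's thread pool and return the
        # Deferred; the reactor thread stays free for other requests.
        return threads.deferToThread(self._fetch, request)

    def _fetch(self, request):
        headers = dict(request.headers.to_unicode_dict())
        if request.method == 'GET':
            response = self.session.get(request.url, headers=headers, proxies=self.proxies)
        elif request.method == 'POST':
            # Passing request.body as the POST payload is an assumption about pyhttpx's API.
            response = self.session.post(request.url, headers=headers,
                                         proxies=self.proxies, data=request.body)
        else:
            raise ValueError(f"Unknown request method: {request.method}")
        # Wrap the result so Scrapy treats it as a downloaded response.
        return HtmlResponse(request.url, body=response.text, encoding='utf-8', request=request)

Concurrency is then bounded by the reactor thread pool rather than CONCURRENT_REQUESTS alone, so you may also want to raise REACTOR_THREADPOOL_MAXSIZE.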

Related

How to get the response of a scrapy request

I am making a Scrapy request to get all the data of a site. I am trying to get the response of the full request, but I am not getting any result. The code is attached below. Thanks for your help.
class FilminSpider(scrapy.Spider):
    name = 'filmin'
    allowed_domains = ['filmin.es']
    start_urls = ['https://www.filmin.es/wapi/catalog/browse?type=film&page=2&limit=60']

    def get_all_movies_data(self):
        url = 'https://www.filmin.es/wapi/catalog/browse?type=film&page=2&limit=60'
        headers = {"x-requested-with": "XMLHttpRequest"}
        request = Request(url=url, method='GET', dont_filter=True,
                          headers=headers)

    def parse(self, response):
        return response.request
Please have a look at the example below; you may find it helpful.
import scrapy
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError

class ErrbackSpider(scrapy.Spider):
    name = "errback_example"
    start_urls = [
        "http://www.httpbin.org/",              # HTTP 200 expected
        "http://www.httpbin.org/status/404",    # Not found error
        "http://www.httpbin.org/status/500",    # server issue
        "http://www.httpbin.org:12345/",        # non-responding host, timeout expected
        "http://www.httphttpbinbin.org/",       # DNS error expected
    ]

    def start_requests(self):
        for u in self.start_urls:
            yield scrapy.Request(u, callback=self.parse_httpbin,
                                 errback=self.errback_httpbin,
                                 dont_filter=True)

    def parse_httpbin(self, response):
        self.logger.info('Got successful response from {}'.format(response.url))
        # do something useful here...

    def errback_httpbin(self, failure):
        # log all failures
        self.logger.error(repr(failure))

        # in case you want to do something special for some errors,
        # you may need the failure's type:
        if failure.check(HttpError):
            # these exceptions come from HttpError spider middleware
            # you can get the non-200 response
            response = failure.value.response
            self.logger.error('HttpError on %s', response.url)
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.logger.error('DNSLookupError on %s', request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)
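For the original question, here is a minimal sketch of a spider that actually reads the data back (assuming the /wapi/catalog/browse endpoint returns JSON; wrapping the result in a {"data": ...} item is purely illustrative):

import json
import scrapy

class FilminSpider(scrapy.Spider):
    name = 'filmin'
    allowed_domains = ['filmin.es']

    def start_requests(self):
        url = 'https://www.filmin.es/wapi/catalog/browse?type=film&page=2&limit=60'
        headers = {"x-requested-with": "XMLHttpRequest"}
        yield scrapy.Request(url=url, headers=headers, callback=self.parse)

    def parse(self, response):
        # parse() receives the downloaded response; yield the data
        # instead of returning response.request.
        yield {"data": json.loads(response.text)}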

How to get the redirected url in the process_request() of scrapy RedirectMiddleware?

For example:
the url is http://www.wandoujia.com/search?key=saber
It will be redirected to the new url http://www.wandoujia.com/search/3161097853842468421.
I want to get the new url in the process_request() of scrapy RedirectMiddleware.
Following is my code:
import logging
import re

from scrapy import Request

class RedirectMiddleware(object):
    def process_request(self, request, spider):
        new_url = request.url
        logging.debug('new_url = ' + new_url)
        logging.debug('****************************')
        patterns = spider.request_pattern
        logging.debug(patterns)
        for pattern in patterns:
            obj = re.match(pattern, new_url)
            if obj:
                return Request(new_url)
PS: request.url here is still the old url. I want to get the new url correctly.
Try replacing the default middleware with something like this (the method you are looking for is process_response, because it is the response that "contains" the redirection):
from scrapy.downloadermiddlewares.redirect import RedirectMiddleware

class CustomRedirectMiddleware(RedirectMiddleware):
    def process_response(self, request, response, spider):
        redirected = super(CustomRedirectMiddleware, self).process_response(
            request, response, spider)
        if isinstance(redirected, request.__class__):
            print("Original url: <{}>".format(request.url))
            print("Redirected url: <{}>".format(redirected.url))
            return redirected
        return response
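To actually swap in the subclass, disable the built-in middleware and register the custom one in settings.py; a sketch, where 'myproject.middlewares' is a placeholder module path:

# settings.py (sketch; adjust the module path to your project)
DOWNLOADER_MIDDLEWARES = {
    # Disable the built-in RedirectMiddleware (its default priority is 600)...
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
    # ...and register the subclass at the same position in the chain.
    'myproject.middlewares.CustomRedirectMiddleware': 600,
}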

Scrapy : How to write a UserAgentMiddleware?

I want to write a UserAgentMiddleware for scrapy,
the docs says:
Middleware that allows spiders to override the default user agent.
In order for a spider to override the default user agent, its user_agent attribute must be set.
docs:
https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#module-scrapy.downloadermiddlewares.useragent
But there is no example, and I have no idea how to write it.
Any suggestions?
You can look at the built-in implementation in your Scrapy installation path:
/Users/tarun.lalwani/.virtualenvs/project/lib/python3.6/site-packages/scrapy/downloadermiddlewares/useragent.py
"""Set User-Agent header per spider or use a default value from settings"""
from scrapy import signals
class UserAgentMiddleware(object):
"""This middleware allows spiders to override the user_agent"""
def __init__(self, user_agent='Scrapy'):
self.user_agent = user_agent
#classmethod
def from_crawler(cls, crawler):
o = cls(crawler.settings['USER_AGENT'])
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
return o
def spider_opened(self, spider):
self.user_agent = getattr(spider, 'user_agent', self.user_agent)
def process_request(self, request, spider):
if self.user_agent:
request.headers.setdefault(b'User-Agent', self.user_agent)
You can see an example of setting a random user agent here:
https://github.com/alecxe/scrapy-fake-useragent/blob/master/scrapy_fake_useragent/middleware.py
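Given how spider_opened above reads the attribute, overriding the default user agent from a spider is just a matter of setting user_agent on it. A minimal sketch (the spider name, UA string and test URL are made up):

import scrapy

class UaExampleSpider(scrapy.Spider):
    name = 'ua_example'
    # Picked up by UserAgentMiddleware.spider_opened via getattr(spider, 'user_agent', ...)
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
    start_urls = ['http://www.httpbin.org/user-agent']

    def parse(self, response):
        # httpbin echoes back the User-Agent header that was actually sent.
        self.logger.info(response.text)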
First visit some website and collect some of the newest user agents. Then, in your standard downloader middleware, do something like the following; this is also the place where you would set up your own proxy settings. The example grabs a random UA from useragents.txt and puts it into the headers. For simplicity the file is re-read (and closed) on every request; in practice you would probably load the user agents into a list once at the top of the module.
import random

from scrapy import signals

class GdataDownloaderMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        with open('useragents.txt', 'r') as f:
            user_agents = f.readlines()
        user_agent = random.choice(user_agents).strip()
        request.headers.setdefault(b'User-Agent', user_agent)
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
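For Scrapy to run this middleware instead of the built-in user agent handling, it has to be enabled in settings.py; a sketch, where 'myproject.middlewares' is a placeholder module path:

# settings.py (sketch; adjust the module path to your project)
DOWNLOADER_MIDDLEWARES = {
    # Disable the built-in UserAgentMiddleware (default priority 400)
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'myproject.middlewares.GdataDownloaderMiddleware': 400,
}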

Scrapy downloader middleware fails to schedule request from process_response

I want to push requests whose response code is 423 back to the scheduler, so I created a downloader middleware:
class MyMiddleware(object):
    def process_response(self, request, response, spider):
        if response.status == 423:
            return request
        else:
            return response
But it does not work; the request never reaches the scheduler again.
Thank you for your help!
Your new request is probably getting filtered out by scrapy's dupefilter.
You can try adding the dont_filter=True parameter:
def process_response(self, request, response, spider):
    if response.status == 423:
        request = request.replace(dont_filter=True)
        return request
    else:
        return response
Alternatively, you can add these two settings to your Scrapy settings:
RETRY_HTTP_CODES = [423]
RETRY_TIMES = 10
and Scrapy will manage the retries for you.

Scrapy how to remove a url from httpcache or prevent adding to cache

I am using the latest Scrapy version, v1.3.
I crawl a website page by page, by following the urls in its pagination. On some pages the website detects that I am using a bot and returns the error inside the HTML. Since it is still a successful request, the page gets cached, and when I run the spider again I get the same error.
What I need is a way to prevent that page from getting into the cache. Or, if I cannot do that, I need to remove it from the cache after I notice the error in the parse method, so that I can retry and get the correct page.
I have a partial solution: I yield all requests with "dont_cache": False in meta so I make sure they use the cache. Where I detect the error and retry the request, I put dont_filter=True along with "dont_cache": True to make sure I get a fresh copy of the erroneous url.
def parse(self, response):
    page = response.meta["page"] + 1
    html = Selector(response)
    counttext = html.css('h2#s-result-count::text').extract_first()
    if counttext is None:
        page = page - 1
        yield Request(url=response.url, callback=self.parse,
                      meta={"page": page, "dont_cache": True}, dont_filter=True)
I also tried a custom retry middleware, where I managed to get it running before the cache, but I couldn't read response.body successfully. I suspect it is compressed somehow, as it is binary data.
class CustomRetryMiddleware(RetryMiddleware):
    def process_response(self, request, response, spider):
        with open('debug.txt', 'wb') as outfile:
            outfile.write(response.body)
        html = Selector(text=response.body)
        url = response.url
        counttext = html.css('h2#s-result-count::text').extract_first()
        if counttext is None:
            log.msg("Automated process error: %s" % url, level=log.INFO)
            reason = 'Automated process error %d' % response.status
            return self._retry(request, reason, spider) or response
        return response
Any suggestion is appreciated.
Thanks
Mehmet
The middleware responsible for request/response caching is HttpCacheMiddleware. Under the hood it is driven by cache policies - special classes which decide which requests and responses should or should not be cached. You can implement your own cache policy class and use it with the setting
HTTPCACHE_POLICY = 'my.custom.cache.Class'
More information in docs: https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
Source code of built-in policies: https://github.com/scrapy/scrapy/blob/master/scrapy/extensions/httpcache.py#L18
Thanks to mizhgun, I managed to develop a solution using custom policies.
Here is what I did:
from scrapy.utils.httpobj import urlparse_cached

class CustomPolicy(object):
    def __init__(self, settings):
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self.ignore_http_codes = [int(x) for x in settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES')]

    def should_cache_request(self, request):
        return urlparse_cached(request).scheme not in self.ignore_schemes

    def should_cache_response(self, response, request):
        return response.status not in self.ignore_http_codes

    def is_cached_response_fresh(self, response, request):
        if "refresh_cache" in request.meta:
            return False
        return True

    def is_cached_response_valid(self, cachedresponse, response, request):
        if "refresh_cache" in request.meta:
            return False
        return True
And when I catch the error (after the response has already been cached, of course):
def parse(self, response):
    html = Selector(response)
    counttext = html.css('selector').extract_first()
    if counttext is None:
        yield Request(url=response.url, callback=self.parse,
                      meta={"refresh_cache": True}, dont_filter=True)
When you add refresh_cache to meta, it can be caught in the custom policy class.
Don't forget to add dont_filter=True, otherwise the second request will be filtered out as a duplicate.
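To wire this up, the custom policy still has to be registered through HTTPCACHE_POLICY as mentioned above, with the cache middleware enabled; a sketch, where the module path is a placeholder:

# settings.py (sketch; adjust the module path to your project)
HTTPCACHE_ENABLED = True
HTTPCACHE_POLICY = 'myproject.httpcache.CustomPolicy'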