I want to enable an HTTP proxy for some spiders and disable it for other spiders.
Can I do something like this?
# settings.py
proxy_spiders = ['a1', 'b2']

if spider in proxy_spiders:  # how to get the spider name ???
    HTTP_PROXY = 'http://127.0.0.1:8123'
    DOWNLOADER_MIDDLEWARES = {
        'myproject.middlewares.RandomUserAgentMiddleware': 400,
        'myproject.middlewares.ProxyMiddleware': 410,
        'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None
    }
else:
    DOWNLOADER_MIDDLEWARES = {
        'myproject.middlewares.RandomUserAgentMiddleware': 400,
        'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None
    }
If the code above doesn't work, is there any other suggestion?
A bit late, but since release 1.0.0 there is a feature in Scrapy that lets you override settings per spider, like this:
class MySpider(scrapy.Spider):
    name = "my_spider"
    custom_settings = {
        "HTTP_PROXY": 'http://127.0.0.1:8123',
        "DOWNLOADER_MIDDLEWARES": {
            'myproject.middlewares.RandomUserAgentMiddleware': 400,
            'myproject.middlewares.ProxyMiddleware': 410,
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
        },
    }

class MySpider2(scrapy.Spider):
    name = "my_spider2"
    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {
            'myproject.middlewares.RandomUserAgentMiddleware': 400,
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
        },
    }
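As an aside, the myproject.middlewares.ProxyMiddleware referenced in these settings is never shown in the question, so its contents are an assumption; a minimal version that reads HTTP_PROXY from the (possibly per-spider) settings might look roughly like this:

# Hypothetical ProxyMiddleware - the real implementation isn't shown in the
# question, so this is only a sketch of what it might do.
class ProxyMiddleware(object):
    def __init__(self, http_proxy):
        self.http_proxy = http_proxy

    @classmethod
    def from_crawler(cls, crawler):
        # Picks up HTTP_PROXY from the settings, including any custom_settings override
        return cls(crawler.settings.get('HTTP_PROXY'))

    def process_request(self, request, spider):
        if self.http_proxy:
            request.meta['proxy'] = self.http_proxy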
There is a new and easier way to do this.
class MySpider(scrapy.Spider):
    name = 'myspider'
    custom_settings = {
        'SOME_SETTING': 'some value',
    }
I use Scrapy 1.3.1
You can add settings.overrides within the spider.py file.
Example that works:
from scrapy.conf import settings
settings.overrides['DOWNLOAD_TIMEOUT'] = 300
For you, something like this should also work
from scrapy.conf import settings

settings.overrides['DOWNLOADER_MIDDLEWARES'] = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None
}
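For the spiders that do need the proxy, the same mechanism can override HTTP_PROXY and enable the proxy middleware as well; a sketch that simply reuses the paths and values from the question:

from scrapy.conf import settings

# Values taken from the question's settings.py
settings.overrides['HTTP_PROXY'] = 'http://127.0.0.1:8123'
settings.overrides['DOWNLOADER_MIDDLEWARES'] = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,
    'myproject.middlewares.ProxyMiddleware': 410,
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None
}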
You can define your own proxy middleware, something straightforward like this:
from scrapy.contrib.downloadermiddleware.httpproxy import HttpProxyMiddleware

class ConditionalProxyMiddleware(HttpProxyMiddleware):
    def process_request(self, request, spider):
        if getattr(spider, 'use_proxy', None):
            return super(ConditionalProxyMiddleware, self).process_request(request, spider)
Then define the attribute use_proxy = True in the spiders that you want to have the proxy enabled. Don't forget to disable the default proxy middleware and enable your modified one.
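Something along these lines in settings.py should cover that last step (a sketch; the myproject path is a placeholder, and 750 is the slot the built-in proxy middleware normally occupies):

DOWNLOADER_MIDDLEWARES = {
    # disable the stock proxy middleware...
    'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': None,
    # ...and enable the conditional one in its place
    'myproject.middlewares.ConditionalProxyMiddleware': 750,
}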
Why not use two projects rather than only one?
Let's name these two projects proj1 and proj2. In proj1's settings.py, put these settings:
HTTP_PROXY = 'http://127.0.0.1:8123'
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,
    'myproject.middlewares.ProxyMiddleware': 410,
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None
}
In proj2's settings.py, put these settings:
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None
}
My middlewares settings:
from w3lib.http import basic_auth_header
class CustomProxyMiddleware(object):
    def process_request(self, request, spider):
        request.meta['proxy'] = "111.11.11.111:1111"
        request.headers['Proxy - Authorization'] = basic_auth_header('login', 'password')
My settings:
DOWNLOADER_MIDDLEWARES = {
    'my_project.middlewares.CustomProxyMiddleware': 350,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
}
After launching, I get an error:
scrapy.core.downloader.handlers.http11.TunnelError: Could not open CONNECT tunnel with proxy 217.29.53.106:51725 [{'status': 407, 'reason': b'Proxy Authentication Required'}]
What is the reason, and how do I fix it? (I use valid HTTPS proxies.)
Try changing the header name to Proxy-Authorization:
request.headers['Proxy-Authorization'] = basic_auth_header('login', 'password')
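For reference, the whole middleware with the corrected header might look like this (a sketch: the proxy URL in the question has no scheme, so one is added here on the assumption that it is an HTTP proxy, and the address and credentials are the question's placeholders):

from w3lib.http import basic_auth_header

class CustomProxyMiddleware(object):
    def process_request(self, request, spider):
        # Note the explicit scheme on the proxy URL and the dash-separated header name
        request.meta['proxy'] = "http://111.11.11.111:1111"
        request.headers['Proxy-Authorization'] = basic_auth_header('login', 'password')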
proxies = {
    'http': 'http://{user}:{password}@{host}:{port}',
    'https': 'https://{user}:{password}@{host}:{port}',
}

yield scrapy.Request(url=url, meta={'proxy': proxies['https']})

Doesn't this work? (Scrapy takes a single proxy URL in request.meta['proxy'], with the credentials embedded as user:password@host:port.)
I want to customize the Scrapy feed URI to s3 to include the dimensions of the uploaded file. Currently I have the following in settings.py file:
FEEDS = {
    's3://path-to-file/file_to_have_dimensions.csv': {
        'format': 'csv',
        'encoding': 'utf8',
        'store_empty': False,
        'indent': 4,
    }
}
But I would like to have something like the following:
NUMBER_OF_ROWS_IN_CSV = file.height()
FEEDS = {
    f's3://path-to-files/file_to_have_dimensions_{NUMBER_OF_ROWS_IN_CSV}.csv': {
        'format': 'csv',
        'encoding': 'utf8',
        'store_empty': False,
        'indent': 4,
    }
}
Note that I would like the number of rows to be inserted automatically.
Is it possible to do this solely by changing settings.py, or is it necessary to change other parts of the Scrapy code?
The feed file is created when the spider starts running, at which point the number of items is not yet known. However, when the spider finishes, Scrapy calls a method named closed, from which you can access the spider stats and settings, and also perform any other tasks you want to run after the spider has finished scraping and saving items.
In the example below I rename the feed file from initial_file.csv to final_file_{item_count}.csv.
Since you cannot rename files in S3, I use the boto3 library to copy initial_file.csv to a new object whose name includes the item_count value, and then delete the initial file.
import scrapy
import boto3


class SampleSpider(scrapy.Spider):
    name = 'sample'
    start_urls = [
        'http://quotes.toscrape.com/',
    ]

    custom_settings = {
        'FEEDS': {
            's3://path-to-file/initial_file.csv': {
                'format': 'csv',
                'encoding': 'utf8',
                'store_empty': False,
                'indent': 4,
            }
        }
    }

    def parse(self, response):
        for quote in response.xpath('//div[@class="quote"]'):
            yield {
                'text': quote.xpath('./span[@class="text"]/text()').extract_first(),
                'author': quote.xpath('.//small[@class="author"]/text()').extract_first(),
                'tags': quote.xpath('.//div[@class="tags"]/a[@class="tag"]/text()').extract()
            }

    def closed(self, reason):
        item_count = self.crawler.stats.get_value('item_scraped_count')
        try:
            session = boto3.Session(aws_access_key_id='awsAccessKey', aws_secret_access_key='awsSecretAccessKey')
            s3 = session.resource('s3')
            s3.Object('my_bucket', f'path-to-file/final_file_{item_count}.csv').copy_from(CopySource='my_bucket/path-to-file/initial_file.csv')
            s3.Object('my_bucket', 'path-to-file/initial_file.csv').delete()
        except Exception:
            self.logger.info("unable to rename s3 file")
I am getting this in the stats after scrapy crawl test:
'dupefilter/filtered': 288
How can I store the filtered requests in a .txt file (or any other format) so I can view them later?
To achieve this you need to do two things:
Set the DUPEFILTER_DEBUG setting to True - it will add all filtered requests to the log.
Set LOG_FILE to save the log to a .txt file.
One possible way to do it is by setting the custom_settings spider attribute:
....
class SomeSpider(scrapy.Spider):
    ....
    custom_settings = {
        "DUPEFILTER_DEBUG": True,
        "LOG_FILE": "log.txt",
    }
    ....

    def parse(self, response):
        ....
You will have log lines like this:
2019-12-21 20:34:07 [scrapy.dupefilters] DEBUG: Filtered duplicate request: <GET http://quotes.toscrape.com/page/4/> (referer: http://quotes.toscrape.com/page/3/)
UPDATE
To save only dupefilter logs:
....
from logging import FileHandler

class SomeSpider(scrapy.Spider):
    ....
    custom_settings = {
        "DUPEFILTER_DEBUG": True,
        # "LOG_FILE": "log.txt",  # optional
    }
    ....

    def start_requests(self):
        # Adding a file handler to the dupefilter logger:
        dupefilter_log_filename = "df_log.txt"
        self.crawler.engine.slot.scheduler.df.logger.addHandler(
            FileHandler(dupefilter_log_filename, delay=False, encoding="utf-8"))
        # ...then yield the start requests as usual:
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        ....
Additional info:
Scrapy logging documentation
Python logging module documentation
I wrote a simple test to validate an HTTPS proxy in Scrapy, but it didn't work.
class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://www.baidu.com/']

    def parse(self, response):
        if response.status == 200:
            print(response.text)
The middlewares file looks like this:
class DynamicProxyDownloaderMiddleware(object):
    def process_request(self, request, spider):
        request.meta['proxy'] = 'https://183.159.88.182:8010'
And the settings file:
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
    'requestTest.middlewares.DynamicProxyDownloaderMiddleware': 100
}
When I use the requests library, the HTTPS proxy works, but when I switch to Scrapy it doesn't, which confuses me. Does anybody know why?
The log file shows:
[scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://www.baidu.com/> (failed 1 times): TCP connection timed out: 10060:
The proxy address is https://183.159.88.182:8010.
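For comparison, the requests-based check that reportedly works would look roughly like this (a reconstruction, since the question doesn't show it; the target URL and timeout are assumptions):

import requests

# Same proxy address as in the middleware above
proxies = {
    'http': 'https://183.159.88.182:8010',
    'https': 'https://183.159.88.182:8010',
}

response = requests.get('http://www.baidu.com/', proxies=proxies, timeout=10)
print(response.status_code)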
I'm getting an HTTP 500 error when trying to run the sample code (Python) for inserting an activity on behalf of a user. I've set up domain-wide delegation and included all the correct scopes. I've successfully run with domain delegation for creating circles, adding people to circles, and reading posts, comments, and profiles, but for some reason I cannot get the code to work for inserting a post on behalf of a user. Any ideas?
Code and error follow (private info redacted):
import httplib2
import pprint

from apiclient.discovery import build
from oauth2client.client import SignedJwtAssertionCredentials

SERVICE_ACCOUNT_EMAIL = 'svc-acct@developer.gserviceaccount.com'
SERVICE_ACCOUNT_PKCS12_FILE_PATH = '/path/privatekey.pem'
USER_EMAIL = 'email@domain.com'
SCOPES = ['https://www.googleapis.com/auth/plus.me',
          'https://www.googleapis.com/auth/plus.stream.read',
          'https://www.googleapis.com/auth/plus.stream.write',
          'https://www.googleapis.com/auth/plus.circles.read',
          'https://www.googleapis.com/auth/plus.circles.write',
          'https://www.googleapis.com/auth/plus.profiles.read']

def authenticate():
    print 'Authenticate the domain for %s' % USER_EMAIL
    f = open(SERVICE_ACCOUNT_PKCS12_FILE_PATH, 'rb')
    key = f.read()
    f.close()
    credentials = SignedJwtAssertionCredentials(SERVICE_ACCOUNT_EMAIL, key,
                                                scope=SCOPES, sub=USER_EMAIL)
    http = httplib2.Http()
    http = credentials.authorize(http)
    return build('plusDomains', 'v1', http=http)

def activitiesInsert(service):
    user_id = 'me'
    print 'Inserting activity'
    result = service.activities().insert(
        userId=user_id,
        body={
            'object': {
                'originalContent': 'Happy Monday! #caseofthemondays'
            },
            'access': {
                'items': [{
                    'type': 'domain'
                }],
                # Required, this does the domain restriction
                'domainRestricted': True
            }
        }).execute()
    print 'result = %s' % pprint.pformat(result)

if __name__ == '__main__':
    service = authenticate()
    activitiesInsert(service)
python addpost.py
Authenticate the domain for email@domain.com
Inserting activity
Traceback (most recent call last):
  File "addpost.py", line 72, in <module>
    activitiesInsert(service)
  File "addpost.py", line 64, in activitiesInsert
    'domainRestricted': True
  File "build/bdist.macosx-10.6-intel/egg/oauth2client/util.py", line 132, in positional_wrapper
  File "build/bdist.macosx-10.6-intel/egg/apiclient/http.py", line 723, in execute
apiclient.errors.HttpError: <HttpError 500 when requesting https://www.googleapis.com/plusDomains/v1/people/me/activities?alt=json returned "">