Twisted trial error with SSL

SSLTest.testError passes, but Trial raises an exception after tearDown. For comparison, there is RegularTest.testOk, which works fine.
I have not found any Twisted bug that explains this, so I assume I'm doing something wrong, given how easy this is to reproduce. Any ideas?
Here's the code:
from twisted.web import resource
from twisted.internet import ssl, reactor
from twisted.web.server import Site
from twisted.web.client import Agent, WebClientContextFactory
from twisted.trial.unittest import TestCase


class DummyServer(resource.Resource):
    isLeaf = True

    def render(self, request):
        return 'hello world'


class SSLTest(TestCase):
    def setUp(self):
        site = Site(DummyServer())
        SSLFactory = ssl.DefaultOpenSSLContextFactory('../server.key',
                                                      '../server.crt')
        port = reactor.listenSSL(0, site, contextFactory=SSLFactory)
        self.port = port
        self.portNumber = port._realPortNumber

    def tearDown(self):
        self.port.stopListening()

    def testError(self):
        def finished(result):
            self.assertEquals(result.code, 200)

        url = 'https://127.0.0.1:%s' % self.portNumber
        agent = Agent(reactor, WebClientContextFactory())
        d = agent.request('GET', url)
        d.addCallback(finished)
        return d


class RegularTest(TestCase):
    def setUp(self):
        site = Site(DummyServer())
        port = reactor.listenTCP(0, site)
        self.port = port
        self.portNumber = port._realPortNumber

    def tearDown(self):
        self.port.stopListening()

    def testOk(self):
        def finished(result):
            self.assertEquals(result.code, 200)

        url = 'http://127.0.0.1:%s' % self.portNumber
        agent = Agent(reactor)
        d = agent.request('GET', url)
        d.addCallback(finished)
        return d
Here's stdout:
$ trial trialerror.py
trialerror
  RegularTest
    testOk ...                                                             [OK]
  SSLTest
    testError ...                                                          [OK]
                                                                        [ERROR]

===============================================================================
[ERROR]
Traceback (most recent call last):
Failure: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean.
Selectables:
  <TLSMemoryBIOProtocol #0 on 51135>

trialerror.SSLTest.testError
-------------------------------------------------------------------------------
Ran 2 tests in 0.018s

FAILED (errors=1, successes=2)

Jonathan Lange wrote about this problem and its solutions. You may also want to consider not using real network connections in your unit tests: Agent already works, and so do Site, reactor.listenSSL, etc. Try to write unit tests that exercise your code, not lots and lots of code from the libraries your code depends on.
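For instance, here is a minimal sketch of testing the resource from the question without opening any network connection at all; StubRequest is a hypothetical stand-in, and the import assumes the question's code lives in trialerror.py:

from twisted.trial.unittest import TestCase
from trialerror import DummyServer


class StubRequest(object):
    """Hypothetical stand-in for a twisted.web request; DummyServer.render
    never touches its argument, so an empty object is enough."""


class DummyServerRenderTest(TestCase):
    def test_render(self):
        # Exercises only our code: no reactor, no listening port, no agent,
        # and therefore nothing for trial's reactor-cleanliness check to find.
        self.assertEqual(DummyServer().render(StubRequest()), 'hello world')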

Related

Is flask_restful compatible with virtual environments?

I am making an API with Flask-RESTful, but when I make the POST request
http://127.0.0.1:5000/bot?id_articulo=1&url_articulo=www.wiki.org
I get the message
"message": "The browser (or proxy) sent a request that this server could not understand."
My python code is
from flask import Flask
from flask_restful import Resource, Api, reqparse
import pandas as pd

app = Flask(__name__)
api = Api(app)


class Bot(Resource):
    def post(self):
        parser = reqparse.RequestParser()
        parser.add_argument('id_articulo', required=True, type=int)
        parser.add_argument('url_articulo', required=True, type=str)
        args = parser.parse_args()
        print(args)
        data_articulo = pd.read_csv('articulos.csv')
        print(data_articulo)
        if args['url_articulo'] in list(data_articulo['url']):
            return {
                'mensage': f"El artículo '{args['url_articulo']}' ya existe."
            }, 409
        else:
            nueva_columna = pd.DataFrame({
                'id_articulo': [args['id_articulo']],
                'url': [args['url_articulo']],
            })
            data_articulo = data_articulo.append(nueva_columna, ignore_index=True)
            data_articulo.to_csv('articulos.csv', index=False)
            return {'data': data_articulo.to_dict()}, 200


api.add_resource(Bot, '/bot', methods=['POST'])

if __name__ == '__main__':
    app.run()
Now, I noticed that the error message is thrown only when I am in a virtual environment whose requirements.txt is
aniso8601==9.0.1
click==8.1.3
colorama==0.4.5
Flask==2.1.2
Flask-RESTful==0.3.9
importlib-metadata==4.12.0
itsdangerous==2.1.2
Jinja2==3.1.2
joblib==1.1.0
MarkupSafe==2.1.1
numpy==1.23.1
pandas==1.4.3
python-dateutil==2.8.2
pytz==2022.1
six==1.16.0
Werkzeug==2.1.2
zipp==3.8.0
So far I don't have a clue about what is going on. It makes me think that the flask_restful library has issues with virtual environments, and I would like to know how to make this work properly in one.
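One hedged observation rather than a verified fix: the pinned Werkzeug 2.1 rejects attempts to read a JSON body from a request that does not carry one, and reqparse looks at the JSON body by default, which can produce exactly this 400 message. A minimal sketch of pinning both arguments to the query string:

parser = reqparse.RequestParser()
# location='args' restricts parsing to the query string (request.args), so
# reqparse never tries to read a JSON body that this POST does not carry.
parser.add_argument('id_articulo', required=True, type=int, location='args')
parser.add_argument('url_articulo', required=True, type=str, location='args')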

Reverse geocoding - geopy.Nominatim module throws urlopen error [SSL: UNKNOWN PROTOCOL]

I am trying to get the address details from latitude and longitude using the geopy.Nominatim module. I am getting the error "<urlopen error [SSL: UNKNOWN_PROTOCOL] unknown protocol (_ssl.c:727)>".
Version Details :
Python version : 2.7
geopy version : 1.23.0
geographiclib : 1.50 (Dependency with geopy)
requests : 2.25.1
chardet : 3.0.4 (Dependency with requests)
urllib3 : 1.25.10 (Dependency with requests)
idna : 2.10 (Dependency with requests)
certifi : 2020.6.20 (Dependency with requests)
Code:
=====
from geopy.geocoders.osm import Nominatim
from geopy.exc import GeocoderServiceError


def reverse(lat, long):
    app = Nominatim(user_agent='reverse-geocoding')
    coordinates = "{},{}".format(lat, long)  # not giving the actual co-ordinates
    try:
        address_details = app.reverse(coordinates, language="en").raw
        return address_details
    except GeocoderServiceError as e1:
        print(str(e1))


result = reverse(lat, long)
print(result)
================
I have tried the following workarounds with the same script; I am still getting the "<urlopen error [SSL: UNKNOWN_PROTOCOL] unknown protocol (_ssl.c:727)>" error.
Workaround 1: use the CA bundle used by the requests library:
from geopy.geocoders.osm import Nominatim
from geopy.geocoders import options
from geopy.exc import GeocoderServiceError
import ssl
import certifi


def reverse(lat, long):
    ctx = ssl.create_default_context(cafile=certifi.where())
    options.default_ssl_context = ctx
    app = Nominatim(user_agent='reverse-geocoding')
    coordinates = "{},{}".format(lat, long)  # not giving the actual co-ordinates
    try:
        address_details = app.reverse(coordinates, language="en").raw
        return address_details
    except GeocoderServiceError as e1:
        print(str(e1))


result = reverse(lat, long)
print(result)
Workaround 2: disable TLS certificate verification completely:
from geopy.geocoders.osm import Nominatim
from geopy.geocoders import options
from geopy.exc import GeocoderServiceError
import ssl


def reverse(lat, long):
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    options.default_ssl_context = ctx
    app = Nominatim(user_agent='reverse-geocoding')
    coordinates = "{},{}".format(lat, long)  # not giving the actual co-ordinates
    try:
        address_details = app.reverse(coordinates, language="en").raw
        return address_details
    except GeocoderServiceError as e1:
        print(str(e1))


result = reverse(lat, long)
print(result)
Could anyone please help me find a fix for this issue?
The UNKNOWN_PROTOCOL error is in all probability due to the fact that your request is going via a proxy.
I looked into your code from Workaround 2. Please mention the proxy explicitly in your code; try using the code lines below:
ctx = ssl.create_default_context()
ctx.check_hostname = True
ctx.verify_mode = ssl.CERT_REQUIRED
options.default_ssl_context = ctx

proxies = {'https': 'https://your-proxy-server.com:port'}
app = Nominatim(user_agent="your-agent", proxies=proxies, timeout=10)
This also saves you from having to disable SSL verification completely.
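For completeness, a hypothetical call through the geocoder configured above (the coordinates are placeholders):

# Placeholder coordinates; the call is the same as in the question, only
# routed through the explicitly configured proxy.
location = app.reverse("51.5074,-0.1278", language="en")
print(location.raw)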

Why does calling a scrapy spider from pywikibot give a ReactorNotRestartable error?

I am able to call a scrapy spider from another Python script using either CrawlerRunner or CrawlerProcess. But when I try to call the same spider-calling class from a pywikibot robot, I get a ReactorNotRestartable error. Why is this, and how can I fix it?
Here is the error:
File ".\scripts\userscripts\ReplicationWiki\RWLoad.py", line 161, in format_new_page
aea = AEAMetadata(url=DOI_url)
File ".\scripts\userscripts\ReplicationWiki\GetAEAMetadata.py", line 39, in __init__
reactor.run() # the script will block here until all crawling jobs are finished
File "C:\Users\lextr\.conda\envs\py37\lib\site-packages\twisted\internet\base.py", line 1282, in run
self.startRunning(installSignalHandlers=installSignalHandlers)
File "C:\Users\lextr\.conda\envs\py37\lib\site-packages\twisted\internet\base.py", line 1262, in startRunning
ReactorBase.startRunning(self)
File "C:\Users\lextr\.conda\envs\py37\lib\site-packages\twisted\internet\base.py", line 765, in startRunning
raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable
CRITICAL: Exiting due to uncaught exception <class 'twisted.internet.error.ReactorNotRestartable'>
Here is the script which calls my scrapy spider. It runs fine if I just call the class from main.
from twisted.internet import reactor, defer
from scrapy import signals
from scrapy.crawler import Crawler, CrawlerProcess, CrawlerRunner
from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings
from Scrapers.spiders.ScrapeAEA import ScrapeaeaSpider


class AEAMetadata:
    """
    Helper to run ScrapeAEA spider and return JEL codes and data links
    for a given AEA article link.
    """

    def __init__(self, *args, **kwargs):
        """Initializer"""
        url = kwargs.get('url')
        if not url:
            raise ValueError('No article url given')
        self.items = []

        def collect_items(item, response, spider):
            self.items.append(item)

        settings = get_project_settings()
        crawler = Crawler(ScrapeaeaSpider, settings)
        crawler.signals.connect(collect_items, signals.item_scraped)

        runner = CrawlerRunner(settings)
        d = runner.crawl(crawler, url=url)
        d.addBoth(lambda _: reactor.stop())
        reactor.run()  # the script will block here until all crawling jobs are finished

        #process = CrawlerProcess(settings)
        #process.crawl(crawler, url=url)
        #process.start()  # the script will block here until the crawling is finished

    def get_jelcodes(self):
        jelcodes = self.items[0]['jelcodes']
        return jelcodes


def main():
    aea = AEAMetadata(url='https://doi.org/10.1257/app.20180286')
    jelcodes = aea.get_jelcodes()
    print(jelcodes)


if __name__ == '__main__':
    main()
Update: a simple test that instantiates the AEAMetadata class twice.
Here is the calling code in my pywikibot bot which fails:
from GetAEAMetadata import AEAMetadata


def main(*args):
    for _ in [1, 2]:
        print('Top')
        url = 'https://doi.org/10.1257/app.20170442'
        aea = AEAMetadata(url=url)
        print('After AEAMetadata')
        jelcodes = aea.get_jelcodes()
        print(jelcodes)


if __name__ == '__main__':
    main()
My call to AEAMetadata was embedded in a larger script, which fooled me into thinking the AEAMetadata class was only instantiated once before the failure. In fact, AEAMetadata was called twice.
I also thought that the script would block after reactor.run(), because the comments in all the scrapy examples said that was the case.
However, the second deferred callback is reactor.stop(), which unblocks reactor.run().
A more basic incorrect assumption was that the reactor was deleted and recreated on each iteration. In fact, the reactor is instantiated and initialized when it is first imported. It is a global object which lives as long as the underlying process and was not designed to be restarted. The extremes actually needed to delete and restart a reactor are described here:
http://www.blog.pythonlibrary.org/2016/09/14/restarting-a-twisted-reactor/
So, I guess I've answered my own question.
And, I'm rewriting my script so it doesn't try to use the reactor in a way it was never intended to be used.
And, thanks Gallaecio for getting me thinking in the right direction.
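For reference, a minimal sketch of that rewrite, using the documented Scrapy pattern of starting the reactor once and chaining every crawl onto the same run (the URLs are the ones from the tests above):

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from Scrapers.spiders.ScrapeAEA import ScrapeaeaSpider


@defer.inlineCallbacks
def crawl_all(urls):
    # One runner, one reactor run: every crawl is chained onto the same
    # global reactor instead of trying to restart it per URL.
    runner = CrawlerRunner(get_project_settings())
    for url in urls:
        yield runner.crawl(ScrapeaeaSpider, url=url)
    reactor.stop()


crawl_all(['https://doi.org/10.1257/app.20180286',
           'https://doi.org/10.1257/app.20170442'])
reactor.run()  # blocks until crawl_all() calls reactor.stop()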

SSL in python3 with HTTPServer

I found an (apparently) working HTTPS server for Python 2 here: http://code.activestate.com/recipes/442473-simple-http-server-supporting-ssl-secure-communica/?c=15536
I'm trying to port it to Python 3, but without good results. This is my code:
from socketserver import BaseServer
import string, cgi, time
from os import curdir, sep
from http.server import SimpleHTTPRequestHandler, HTTPServer
import ssl
import os  # os.path
import socket


class SecureHTTPServer(HTTPServer):
    def __init__(self, server_address, HandlerClass):
        BaseServer.__init__(self, server_address, HandlerClass)
        ctx = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        # server.pem's location (containing the server private key and
        # the server certificate).
        fpem = 'certificate1.pem'
        ctx.load_verify_locations(fpem)
        self.socket = ctx.wrap_socket(socket.socket(self.address_family,
                                                    self.socket_type))
        self.server_bind()
        self.server_activate()


class SecureHTTPRequestHandler(SimpleHTTPRequestHandler):
    def setup(self):
        self.connection = self.request
        self.rfile = socket._fileobject(self.request, "rb", self.rbufsize)
        self.wfile = socket._fileobject(self.request, "wb", self.wbufsize)

    def do_GET(self):
        print('get recieved!')
        self.send_error(404, 'File Not Found: %s' % self.path)


def test(HandlerClass=SecureHTTPRequestHandler,
         ServerClass=SecureHTTPServer):
    server_address = ('', 1443)  # (address, port)
    httpd = ServerClass(server_address, HandlerClass)
    sa = httpd.socket.getsockname()
    print("Serving HTTPS on", sa[0], "port", sa[1], "...")
    httpd.serve_forever()


if __name__ == '__main__':
    test()
When I run it I get no errors, but when I connect to localhost:1443 (with https) I get no response, and the print('get recieved!') isn't triggered.
I found another (simpler) solution here: http://www.piware.de/2011/01/creating-an-https-server-in-python/
This is my working port to Python 3:
from http.server import HTTPServer, SimpleHTTPRequestHandler
import ssl

httpd = HTTPServer(('localhost', 1443), SimpleHTTPRequestHandler)
httpd.socket = ssl.wrap_socket(httpd.socket, certfile='certificate.pem', server_side=True)
httpd.serve_forever()
Since Python 3.7, ssl.wrap_socket is deprecated; use SSLContext.wrap_socket instead.
See: https://docs.python.org/3.7/library/ssl.html#ssl.wrap_socket
And since version 3.10, constructing an SSLContext without a protocol argument is deprecated.
See: https://docs.python.org/3.10/library/ssl.html#ssl.SSLContext
from http.server import HTTPServer, SimpleHTTPRequestHandler
import ssl

httpd = HTTPServer(('localhost', 1443), SimpleHTTPRequestHandler)

# Since version 3.10: SSLContext without protocol argument is deprecated.
# sslctx = ssl.SSLContext()
sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
sslctx.check_hostname = False  # if True, only a hostname matching the certificate is accepted
sslctx.load_cert_chain(certfile='certificate.pem', keyfile="private.pem")
httpd.socket = sslctx.wrap_socket(httpd.socket, server_side=True)
httpd.serve_forever()
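To sanity-check the server, a minimal client sketch (this assumes certificate.pem was issued for localhost; otherwise certificate verification will fail):

import ssl
import urllib.request

# Trust our own self-signed certificate for this request.
ctx = ssl.create_default_context(cafile='certificate.pem')
with urllib.request.urlopen('https://localhost:1443/', context=ctx) as resp:
    print(resp.status, resp.read(80))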

Setting referer in Selenium

I'm working with the Selenium remote driver to automate actions on a site. I can open the page I need directly by engineering the URL, since the site's URL schema is very consistent. This speeds up the script, as it does not have to work through several pages before it gets to the one it needs.
To make the automation seem organic, is there a way to set a referral page in Selenium?
If you're checking the referrer on the server, then using a proxy (as mentioned in other answers) will be the way to go.
However, if you need access to the referrer in JavaScript, using a proxy will not work. To set the JavaScript referrer I did the following:
Go to the referral website.
Inject this JavaScript onto the page via the Selenium API: document.write('<script>window.location.href = "<my website>";</script>')
I'm using a Python wrapper around Selenium, so I cannot provide the function you need to inject the code in your language, but it should be easy to find.
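In the Python bindings, a sketch of the same idea (both URLs are placeholders):

from selenium import webdriver

driver = webdriver.Firefox()
driver.get('https://referrer-site.example/')  # the page that should appear as the referrer
# Script-driven navigation makes the browser send the current page as the
# Referer header, which is what the document.write trick above relies on.
driver.execute_script('window.location.href = arguments[0];',
                      'https://target-site.example/')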
What you are looking for is referer spoofing.
Selenium does not have a built-in method to do this; however, it can be accomplished by using a proxy such as Fiddler.
Fiddler also provides an API-only version of the FiddlerCore component, and programmatic access to all of the proxy's settings and data, thus allowing you to modify the headers of the HTTP response.
Here is a solution in Python to do exactly that:
https://github.com/j-bennet/selenium-referer
I described the use case and the solution in the README. I think the github repo won't go anywhere, but I'll quote the relevant pieces here just in case.
The solution uses libmproxy to implement a proxy server that only does one thing: it adds a Referer header. The header is specified as a command-line parameter when running the proxy. Code:
# -*- coding: utf-8 -*-
"""
Proxy server to add a specified Referer: header to the request.
"""
from optparse import OptionParser
from libmproxy import controller, proxy
from libmproxy.proxy.server import ProxyServer


class RefererMaster(controller.Master):
    """
    Adds a specified referer header to the request.
    """

    def __init__(self, server, referer):
        """
        Init the proxy master.
        :param server: ProxyServer
        :param referer: string
        """
        controller.Master.__init__(self, server)
        self.referer = referer

    def run(self):
        """
        Basic run method.
        """
        try:
            print('Running...')
            return controller.Master.run(self)
        except KeyboardInterrupt:
            self.shutdown()

    def handle_request(self, flow):
        """
        Adds a Referer header.
        """
        flow.request.headers['referer'] = [self.referer]
        flow.reply()

    def handle_response(self, flow):
        """
        Does not do anything extra.
        """
        flow.reply()


def start_proxy_server(port, referer):
    """
    Start proxy server and return an instance.
    :param port: int
    :param referer: string
    :return: RefererMaster
    """
    config = proxy.ProxyConfig(port=port)
    server = ProxyServer(config)
    m = RefererMaster(server, referer)
    m.run()


if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option("-r", "--referer", dest="referer",
                      help="Referer URL.")
    parser.add_option("-p", "--port", dest="port", type="int",
                      help="Port number (int) to run the server on.")
    popts, pargs = parser.parse_args()
    start_proxy_server(popts.port, popts.referer)
Then, in the setUp() method of the test, the proxy server is started as an external process using pexpect, and stopped in tearDown(). A method called proxy() returns proxy settings to configure the Firefox driver with:
# -*- coding: utf-8 -*-
import os
import sys
import pexpect
import unittest
from selenium.webdriver.common.proxy import Proxy, ProxyType
import utils


class ProxyBase(unittest.TestCase):
    """
    We have to use our own proxy server to set a Referer header, because Selenium does not
    allow to interfere with request headers.

    This is the base class. Change `proxy_referer` to set different referers.
    """

    base_url = 'http://www.facebook.com'
    proxy_server = None
    proxy_address = '127.0.0.1'
    proxy_port = 8888
    proxy_referer = None
    proxy_command = '{0} {1} --referer {2} --port {3}'

    def setUp(self):
        """
        Create the environment.
        """
        print('\nSetting up.')
        self.start_proxy()
        self.driver = utils.create_driver(proxy=self.proxy())

    def tearDown(self):
        """
        Cleanup the environment.
        """
        print('\nTearing down.')
        utils.close_driver(self.driver)
        self.stop_proxy()

    def proxy(self):
        """
        Create proxy settings for our Firefox profile.
        :return: Proxy
        """
        proxy_url = '{0}:{1}'.format(self.proxy_address, self.proxy_port)
        p = Proxy({
            'proxyType': ProxyType.MANUAL,
            'httpProxy': proxy_url,
            'ftpProxy': proxy_url,
            'sslProxy': proxy_url,
            'noProxy': 'localhost, 127.0.0.1'
        })
        return p

    def start_proxy(self):
        """
        Start the proxy process.
        """
        if not self.proxy_referer:
            raise Exception('Set the proxy_referer in child class!')
        python_path = sys.executable
        current_dir = os.path.dirname(__file__)
        proxy_file = os.path.normpath(os.path.join(current_dir, 'referer_proxy.py'))
        command = self.proxy_command.format(
            python_path, proxy_file, self.proxy_referer, self.proxy_port)
        print('Running the proxy command:')
        print(command)
        self.proxy_server = pexpect.spawnu(command)
        self.proxy_server.expect_exact(u'Running...', 2)

    def stop_proxy(self):
        """
        Override in child class to use a proxy.
        """
        print('Stopping proxy server...')
        self.proxy_server.close(True)
        print('Proxy server stopped.')
I wanted my unit tests to start and stop the proxy server without any user interaction, and could not find any Python samples doing that, which is why I created the github repo (link above).
Hope this helps someone.
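As a usage sketch, a hypothetical child class (the test body is a placeholder; utils.create_driver comes from the repo above):

class GoogleRefererTest(ProxyBase):
    # Every request made by this test's driver will carry this Referer.
    proxy_referer = 'https://www.google.com/'

    def test_page_loads_through_proxy(self):
        self.driver.get(self.base_url)
        # A real test would assert on whatever the page does differently
        # when it sees this referer; here we just check that it loaded.
        self.assertTrue(self.driver.title)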
I'm not sure if I understand your question correctly, but if you want to override your HTTP requests, there is no way to do it directly with WebDriver. You must run your requests through a proxy. I prefer using BrowserMob; you can get it through Maven or similar.
ProxyServer server = new ProxyServer(proxy_port); // net.lightbody.bmp.proxy.ProxyServer
server.start();
server.setCaptureHeaders(true);

Proxy proxy = server.seleniumProxy(); // org.openqa.selenium.Proxy
proxy.setHttpProxy("localhost").setSslProxy("localhost");

server.addRequestInterceptor(new RequestInterceptor() {
    @Override
    public void process(BrowserMobHttpRequest browserMobHttpRequest, Har har) {
        browserMobHttpRequest.addRequestHeader("Referer", "blabla");
    }
});

// configure it as a desired capability
DesiredCapabilities capabilities = new DesiredCapabilities();
capabilities.setCapability(CapabilityType.PROXY, proxy);

// start the driver
driver = new FirefoxDriver(capabilities);
Or blacklist/whitelist anything:
server.blacklistRequests("https?://.*\\.google-analytics\\.com/.*", 410);
server.whitelistRequests("https?://*.*.yoursite.com/.*. https://*.*.someOtherYourSite.*".split(","), 200);