WebKit pyGTK: handle a custom protocol (apt://)

I'd like a regular GTK box containing a web view. The page it loads has install buttons that use apt links (Firefox opens those URLs in the Software Center).
self.web = builder.get_object('boxWeb')
self.web_view = WebKit.WebView()
self.web_view.open("http://web_with_apt_links")
self.web_view.show()
self.web.add(self.web_view)
But when I try it, I get a URL error:
Unable to load page
Problem occurred while loading the URL apt:package
URL cannot be shown
Can I capture the apt links in Linux? Thanks in advance!

You have to connect to the navigation-requested signal. Here is an example:
from gi.repository import Gtk, WebKit

class window(Gtk.Window):
    def __init__(self):
        Gtk.Window.__init__(self)
        self.connect('delete-event', Gtk.main_quit)
        webview = WebKit.WebView()
        self.add(webview)
        webview.connect('navigation-requested', self.on_navigation_requested)
        webview.open('http://google.de')
        # webview.open('apt://test')  # uncomment to test apt URIs

    def on_navigation_requested(self, view, frame, req):
        uri = req.get_uri()
        if uri and uri.startswith('apt'):
            print('apt uri')
            return WebKit.NavigationResponse.IGNORE
        return WebKit.NavigationResponse.ACCEPT

if __name__ == '__main__':
    win = window()
    win.show_all()
    Gtk.main()
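If you want the apt URI to actually launch the system handler (the way Firefox hands it to the Software Center), one option is to pass it to GIO. A minimal sketch of just the handler, assuming the rest of the example above stays the same plus one extra import (Gio); not tested against every WebKit version:

from gi.repository import Gio

    def on_navigation_requested(self, view, frame, req):
        uri = req.get_uri()
        if uri and uri.startswith('apt'):
            # Hand the apt:// URI to whatever the desktop has registered for it
            # (e.g. the Software Center) instead of letting WebKit try to load it.
            Gio.AppInfo.launch_default_for_uri(uri, None)
            return WebKit.NavigationResponse.IGNORE
        return WebKit.NavigationResponse.ACCEPT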

Related

unable to open database file on a hosting service pythonanywhere

I want to deploy my project on PythonAnywhere. error.log says the server is unable to open my database file, though everything works fine on my local machine. I followed a Pretty Printed video from YouTube.
This is how I initialize the database in app.py:
db_session.global_init("db/data.sqlite")
and this is in db_session:
def global_init(db_file):
    global __factory
    if __factory:
        return
    if not db_file or not db_file.strip():
        raise Exception("You must specify a database file.")
    conn_str = f'sqlite:///{db_file.strip()}?check_same_thread=False'
    print(f"Connecting to the database at {conn_str}")
    engine = sa.create_engine(conn_str, echo=False)
    __factory = orm.sessionmaker(bind=engine)
    from . import __all_models
    SqlAlchemyBase.metadata.create_all(engine)

def create_session() -> Session:
    global __factory
    return __factory()
The last thing is my wsgi.py:
import sys

path = '/home/r1chter/Chicken-beta'
if path not in sys.path:
    sys.path.append(path)

import os
from dotenv import load_dotenv

project_folder = os.path.expanduser(path)
load_dotenv(os.path.join(project_folder, '.env'))

import app  # noqa
application = app.app()
Usually, errors like this on PythonAnywhere are due to providing a relative path instead of an absolute path.
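A minimal sketch of what that could look like here, assuming the db directory sits next to the module that calls global_init (the exact layout is an assumption):

import os

# Build an absolute path so the file is found no matter which directory
# the PythonAnywhere WSGI process is started from.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
db_session.global_init(os.path.join(BASE_DIR, "db", "data.sqlite"))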

How to fill JavaScript form using Python?

I want to use Python to fill this form.
I tried using Mechanize but this is a Microsoft Form which uses JavaScript and has no form tag and no GET/POST URL. Maybe BeautifulSoup/Selenium can do this, but I do not have any experience in scraping JS forms. Can anyone help me out and suggest how to go about this?
Here's what I've tried; Mechanize is unable to recognize any form on the page:
import mechanize

def main():
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.set_handle_refresh(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    response = br.open("https://forms.office.com/Pages/ResponsePage.aspx?id=8Pm7rtoj40mYvzIXGrvJvCxQDveyljlCrKN2Teo3EHFUQVNaWDlYRkhYR09JRTZWRFpKTTNIQU9HUC4u")
    for form in br.forms():
        print("Form name:", form.name)  # prints nothing
        print(form)  # prints nothing

if __name__ == '__main__':
    main()
Selenium works fine.
You'll need to install the components:
Install Selenium: pip install selenium
Make sure you download the correct chromedriver (or other driver) for your browser and OS versions and add it to your PATH.
Then this runs:
from selenium import webdriver
driver = webdriver.Chrome()
url = "https://forms.office.com/Pages/ResponsePage.aspx?id=8Pm7rtoj40mYvzIXGrvJvCxQDveyljlCrKN2Teo3EHFUQVNaWDlYRkhYR09JRTZWRFpKTTNIQU9HUC4u"
driver.get(url)
name = driver.find_element_by_xpath("//div[@class='question-title-box'][.//span[text()='NAME']]/following-sibling::*//input")
name.send_keys("hello, World")
sectionSelection = "F"
section = driver.find_element_by_xpath("//div[@class='question-title-box'][.//span[text()='Section']]/following-sibling::*//input[@value='" + sectionSelection + "']")
section.click()
date = driver.find_element_by_xpath("//input[contains(@placeholder, 'Please input date')]")
date.send_keys("01/12/2020")
submit = driver.find_element_by_xpath("//div[text()='Submit']")
submit.click()
The XPaths are a little long, but they're based on the question text, so they should be reasonably stable.
For an alternative approach: when you say there is no POST URL, did you check devtools? That exposes the destination of the form:
Request URL: https://forms.office.com/formapi/api/aebbf9f0-23da-49e3-98bf-32171abbc9bc/users/f70e502c-96b2-4239-aca3-764dea371071/forms('8Pm7rtoj40mYvzIXGrvJvCxQDveyljlCrKN2Teo3EHFUQVNaWDlYRkhYR09JRTZWRFpKTTNIQU9HUC4u')/responses
Request Method: POST
It also exposes the payload. This is the first submit:
{startDate: "2020-08-17T10:40:18.504Z", submitDate: "2020-08-17T10:40:18.507Z",…}
answers: "[{"questionId":"r8f09d63e6f6f42feb2f8f4f8ed3f9389","answer1":"Hello, World"},{"questionId":"r28fe12073dfa47399f8ce95ae679dccf","answer1":"G"},{"questionId":"r8f9e9fedcc2e410c80bfa1e0e3ef9750","answer1":"2020-08-28"}]"
startDate: "2020-08-17T10:40:18.504Z"
submitDate: "2020-08-17T10:40:18.507Z"
Those POST URL UUIDs/GUIDs and question IDs seem to be static for this form; they don't change between runs. This is the second run:
{startDate: "2020-08-17T10:43:48.544Z", submitDate: "2020-08-17T10:43:48.546Z",…}
answers: "[{"questionId":"r8f09d63e6f6f42feb2f8f4f8ed3f9389","answer1":"test me"},{"questionId":"r28fe12073dfa47399f8ce95ae679dccf","answer1":"G"},{"questionId":"r8f9e9fedcc2e410c80bfa1e0e3ef9750","answer1":"2020-08-12"}]"
startDate: "2020-08-17T10:43:48.544Z"
submitDate: "2020-08-17T10:43:48.546Z"
Once you've captured this, you'll probably be able to do it through the API without a GUI.
Just to make sure, I tried it and it succeeds:
import requests
url = "https://forms.office.com/formapi/api/aebbf9f0-23da-49e3-98bf-32171abbc9bc/users/f70e502c-96b2-4239-aca3-764dea371071/forms('8Pm7rtoj40mYvzIXGrvJvCxQDveyljlCrKN2Teo3EHFUQVNaWDlYRkhYR09JRTZWRFpKTTNIQU9HUC4u')/responses"
myobj = {"startDate":"2020-08-17T10:48:40.118Z","submitDate":"2020-08-17T10:48:40.121Z","answers":"[{\"questionId\":\"r8f09d63e6f6f42feb2f8f4f8ed3f9389\",\"answer1\":\"Hello again, World\"},{\"questionId\":\"r28fe12073dfa47399f8ce95ae679dccf\",\"answer1\":\"F\"},{\"questionId\":\"r8f9e9fedcc2e410c80bfa1e0e3ef9750\",\"answer1\":\"2020-08-26\"}]"}
x = requests.post(url, data = myobj)
My answers are just hard coded into the data object but it seems to work.
Remember to pip install requests if you don't already have it
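Since the answers field is just JSON serialized into a string, you could also build the payload programmatically instead of hard-coding it. A sketch using the question IDs captured from devtools above (they would differ for any other form):

import json
import requests

url = "https://forms.office.com/formapi/api/aebbf9f0-23da-49e3-98bf-32171abbc9bc/users/f70e502c-96b2-4239-aca3-764dea371071/forms('8Pm7rtoj40mYvzIXGrvJvCxQDveyljlCrKN2Teo3EHFUQVNaWDlYRkhYR09JRTZWRFpKTTNIQU9HUC4u')/responses"

# Question IDs as captured from devtools for this particular form.
answers = [
    {"questionId": "r8f09d63e6f6f42feb2f8f4f8ed3f9389", "answer1": "Hello again, World"},
    {"questionId": "r28fe12073dfa47399f8ce95ae679dccf", "answer1": "F"},
    {"questionId": "r8f9e9fedcc2e410c80bfa1e0e3ef9750", "answer1": "2020-08-26"},
]
payload = {
    "startDate": "2020-08-17T10:48:40.118Z",
    "submitDate": "2020-08-17T10:48:40.121Z",
    # The form API expects the answers list serialized as a JSON string.
    "answers": json.dumps(answers),
}
response = requests.post(url, data=payload)
print(response.status_code)  # a 2xx status indicates the submission was accepted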

Why does calling a scrapy spider from pywikibot give a ReactorNotRestartable error?

I am able to call a scrapy spider from another Python script using either CrawlerRunner or CrawlerProcess. But when I try to call the same spider-calling class from a pywikibot robot, I get a ReactorNotRestartable error. Why is this, and how can I fix it?
Here is the error:
File ".\scripts\userscripts\ReplicationWiki\RWLoad.py", line 161, in format_new_page
aea = AEAMetadata(url=DOI_url)
File ".\scripts\userscripts\ReplicationWiki\GetAEAMetadata.py", line 39, in __init__
reactor.run() # the script will block here until all crawling jobs are finished
File "C:\Users\lextr\.conda\envs\py37\lib\site-packages\twisted\internet\base.py", line 1282, in run
self.startRunning(installSignalHandlers=installSignalHandlers)
File "C:\Users\lextr\.conda\envs\py37\lib\site-packages\twisted\internet\base.py", line 1262, in startRunning
ReactorBase.startRunning(self)
File "C:\Users\lextr\.conda\envs\py37\lib\site-packages\twisted\internet\base.py", line 765, in startRunning
raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable
CRITICAL: Exiting due to uncaught exception <class 'twisted.internet.error.ReactorNotRestartable'>
Here is the script which calls my scrapy spider. It runs fine if I just call the class from main.
from twisted.internet import reactor, defer
from scrapy import signals
from scrapy.crawler import Crawler, CrawlerProcess, CrawlerRunner
from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings
from Scrapers.spiders.ScrapeAEA import ScrapeaeaSpider

class AEAMetadata:
    """
    Helper to run ScrapeAEA spider and return JEL codes and data links
    for a given AEA article link.
    """

    def __init__(self, *args, **kwargs):
        """Initializer"""
        url = kwargs.get('url')
        if not url:
            raise ValueError('No article url given')
        self.items = []

        def collect_items(item, response, spider):
            self.items.append(item)

        settings = get_project_settings()
        crawler = Crawler(ScrapeaeaSpider, settings)
        crawler.signals.connect(collect_items, signals.item_scraped)
        runner = CrawlerRunner(settings)
        d = runner.crawl(crawler, url=url)
        d.addBoth(lambda _: reactor.stop())
        reactor.run()  # the script will block here until all crawling jobs are finished

        # process = CrawlerProcess(settings)
        # process.crawl(crawler, url=url)
        # process.start()  # the script will block here until the crawling is finished

    def get_jelcodes(self):
        jelcodes = self.items[0]['jelcodes']
        return jelcodes

def main():
    aea = AEAMetadata(url='https://doi.org/10.1257/app.20180286')
    jelcodes = aea.get_jelcodes()
    print(jelcodes)

if __name__ == '__main__':
    main()
Update: a simple test that instantiates the AEAMetadata class twice.
Here is the calling code in my pywikibot bot which fails:
from GetAEAMetadata import AEAMetadata

def main(*args):
    for _ in [1, 2]:
        print('Top')
        url = 'https://doi.org/10.1257/app.20170442'
        aea = AEAMetadata(url=url)
        print('After AEAMetadata')
        jelcodes = aea.get_jelcodes()
        print(jelcodes)

if __name__ == '__main__':
    main()
My call to AEAMetadata was embedded in a larger script which fooled me into thinking the AEAMetadata class was only instantiated once before failure.
In fact, AEAMetadata was called twice.
I also thought that the script would block after reactor.run(), because the comment in all the scrapy examples states that is the case.
However, the deferred callback added with addBoth is reactor.stop(), which unblocks reactor.run() once the crawl finishes.
A more basic incorrect assumption was that the reactor was deleted and recreated on each iteration. In fact, the reactor is instantiated and initialized when it is first imported. And, it is a global object which lives as long as the underlying process and was not designed to be restarted. The extremes actually needed to delete and restart a reactor are described here:
http://www.blog.pythonlibrary.org/2016/09/14/restarting-a-twisted-reactor/
So, I guess I've answered my own question.
I'm rewriting my script so it doesn't try to use the reactor in a way it was never intended to be used.
Thanks, Gallaecio, for getting me thinking in the right direction.
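For reference, one common way to sidestep the restriction is to keep the reactor out of the calling process entirely and run each crawl in a child process. A rough sketch of that shape, reusing ScrapeaeaSpider from the code above (not necessarily the final rewrite):

import multiprocessing

from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from Scrapers.spiders.ScrapeAEA import ScrapeaeaSpider

def _crawl_in_child(url, queue):
    """Runs in a fresh process, so it gets a fresh Twisted reactor every time."""
    items = []

    def collect_items(item, response, spider):
        items.append(item)

    process = CrawlerProcess(get_project_settings())
    crawler = process.create_crawler(ScrapeaeaSpider)
    crawler.signals.connect(collect_items, signal=signals.item_scraped)
    process.crawl(crawler, url=url)
    process.start()  # blocks until the crawl finishes, then stops its own reactor
    queue.put(items)

def scrape_aea(url):
    """Run one crawl in a separate process and return the scraped items."""
    queue = multiprocessing.Queue()
    child = multiprocessing.Process(target=_crawl_in_child, args=(url, queue))
    child.start()
    items = queue.get()  # collected items come back over the queue
    child.join()
    return items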

Setting referer in Selenium

I'm working with the Selenium remote driver to automate actions on a site. I can open the page I need directly by constructing the URL, since the site's URL schema is very consistent. This speeds up the script, as it doesn't have to work through several pages before it gets to the one it needs.
To make the automation seem organic, is there a way to set a referrer page in Selenium?
If you're checking the referrer on the server, then using a proxy (as mentioned in other answers) will be the way to go.
However, if you need access to the referrer in JavaScript, using a proxy will not work. To set the JavaScript referrer I did the following:
Go to the referral website.
Inject this JavaScript into the page via the Selenium API: document.write('<script>window.location.href = "<my website>";</script>')
I'm using a Python wrapper around selenium, so I cannot provide the function you need to inject the code in your language, but it should be easy to find.
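In Python the injection is a one-liner with execute_script. A minimal sketch, with placeholder URLs:

from selenium import webdriver

driver = webdriver.Firefox()
# 1. Load the page you want to appear as the referrer (placeholder URL).
driver.get("https://referring-site.example/")
# 2. Navigate from within the page's own JavaScript, so the browser records
#    the current page as document.referrer on the next page.
driver.execute_script('window.location.href = "https://target-site.example/";')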
What you are looking for is referer spoofing.
Selenium does not have a built-in method to do this, but it can be accomplished by using a proxy such as Fiddler.
Fiddler also provides an API-only version of the FiddlerCore component, and programmatic access to all of the proxy's settings and data, thus allowing you to modify the headers of the HTTP request.
Here is a solution in Python to do exactly that:
https://github.com/j-bennet/selenium-referer
I described the use case and the solution in the README. I don't think the GitHub repo is going anywhere, but I'll quote the relevant pieces here just in case.
The solution uses libmproxy to implement a proxy server that does only one thing: add a Referer header. The header is specified as a command-line parameter when running the proxy. Code:
# -*- coding: utf-8 -*-
"""
Proxy server to add a specified Referer: header to the request.
"""
from optparse import OptionParser
from libmproxy import controller, proxy
from libmproxy.proxy.server import ProxyServer

class RefererMaster(controller.Master):
    """
    Adds a specified referer header to the request.
    """

    def __init__(self, server, referer):
        """
        Init the proxy master.
        :param server: ProxyServer
        :param referer: string
        """
        controller.Master.__init__(self, server)
        self.referer = referer

    def run(self):
        """
        Basic run method.
        """
        try:
            print('Running...')
            return controller.Master.run(self)
        except KeyboardInterrupt:
            self.shutdown()

    def handle_request(self, flow):
        """
        Adds a Referer header.
        """
        flow.request.headers['referer'] = [self.referer]
        flow.reply()

    def handle_response(self, flow):
        """
        Does not do anything extra.
        """
        flow.reply()

def start_proxy_server(port, referer):
    """
    Start proxy server and return an instance.
    :param port: int
    :param referer: string
    :return: RefererMaster
    """
    config = proxy.ProxyConfig(port=port)
    server = ProxyServer(config)
    m = RefererMaster(server, referer)
    m.run()

if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option("-r", "--referer", dest="referer",
                      help="Referer URL.")
    parser.add_option("-p", "--port", dest="port", type="int",
                      help="Port number (int) to run the server on.")
    popts, pargs = parser.parse_args()
    start_proxy_server(popts.port, popts.referer)
Then, in the setUp() method of the test, the proxy server is started as an external process using pexpect, and stopped in tearDown(). The proxy() method returns proxy settings to configure the Firefox driver with:
# -*- coding: utf-8 -*-
import os
import sys
import pexpect
import unittest
from selenium.webdriver.common.proxy import Proxy, ProxyType
import utils

class ProxyBase(unittest.TestCase):
    """
    We have to use our own proxy server to set a Referer header, because Selenium does not
    allow to interfere with request headers.
    This is the base class. Change `proxy_referer` to set different referers.
    """

    base_url = 'http://www.facebook.com'
    proxy_server = None
    proxy_address = '127.0.0.1'
    proxy_port = 8888
    proxy_referer = None
    proxy_command = '{0} {1} --referer {2} --port {3}'

    def setUp(self):
        """
        Create the environment.
        """
        print('\nSetting up.')
        self.start_proxy()
        self.driver = utils.create_driver(proxy=self.proxy())

    def tearDown(self):
        """
        Cleanup the environment.
        """
        print('\nTearing down.')
        utils.close_driver(self.driver)
        self.stop_proxy()

    def proxy(self):
        """
        Create proxy settings for our Firefox profile.
        :return: Proxy
        """
        proxy_url = '{0}:{1}'.format(self.proxy_address, self.proxy_port)
        p = Proxy({
            'proxyType': ProxyType.MANUAL,
            'httpProxy': proxy_url,
            'ftpProxy': proxy_url,
            'sslProxy': proxy_url,
            'noProxy': 'localhost, 127.0.0.1'
        })
        return p

    def start_proxy(self):
        """
        Start the proxy process.
        """
        if not self.proxy_referer:
            raise Exception('Set the proxy_referer in child class!')
        python_path = sys.executable
        current_dir = os.path.dirname(__file__)
        proxy_file = os.path.normpath(os.path.join(current_dir, 'referer_proxy.py'))
        command = self.proxy_command.format(
            python_path, proxy_file, self.proxy_referer, self.proxy_port)
        print('Running the proxy command:')
        print(command)
        self.proxy_server = pexpect.spawnu(command)
        self.proxy_server.expect_exact(u'Running...', 2)

    def stop_proxy(self):
        """
        Override in child class to use a proxy.
        """
        print('Stopping proxy server...')
        self.proxy_server.close(True)
        print('Proxy server stopped.')
I wanted my unit tests to start and stop the proxy server without any user interaction, and could not find any Python samples doing that, which is why I created the GitHub repo (linked above).
Hope this helps someone.
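For completeness, a usage sketch of the base class above; the child class name and referer URL here are made up:

class TestWithExampleReferer(ProxyBase):
    # Each child class pins the referer it wants the proxy to inject.
    proxy_referer = 'http://referring-site.example/'

    def test_referer_is_sent(self):
        # setUp() has already started the proxy and created self.driver,
        # so this request goes out with the configured Referer header.
        self.driver.get(self.base_url)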
Not sure if I understand your question correctly, but if you want to modify your HTTP requests, there is no way to do it directly with WebDriver. You must run your requests through a proxy. I prefer using BrowserMob, which you can get through Maven or similar.
ProxyServer server = new ProxyServer(proxy_port); // net.lightbody.bmp.proxy.ProxyServer
server.start();
server.setCaptureHeaders(true);

Proxy proxy = server.seleniumProxy(); // org.openqa.selenium.Proxy
proxy.setHttpProxy("localhost").setSslProxy("localhost");

server.addRequestInterceptor(new RequestInterceptor() {
    @Override
    public void process(BrowserMobHttpRequest browserMobHttpRequest, Har har) {
        browserMobHttpRequest.addRequestHeader("Referer", "blabla");
    }
});

// configure it as a desired capability
DesiredCapabilities capabilities = new DesiredCapabilities();
capabilities.setCapability(CapabilityType.PROXY, proxy);

// start the driver
driver = new FirefoxDriver(capabilities);
Or black/whitelist anything:
server.blacklistRequests("https?://.*\\.google-analytics\\.com/.*", 410);
server.whitelistRequests("https?://*.*.yoursite.com/.*. https://*.*.someOtherYourSite.*".split(","), 200);
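Since most of this thread is Python, roughly the same idea with the browsermob-proxy Python client might look like the sketch below. This assumes the BrowserMob Proxy binary is installed locally; the launcher path and URLs are placeholders:

from browsermobproxy import Server
from selenium import webdriver

# Placeholder path to the BrowserMob Proxy launcher script.
server = Server("/path/to/browsermob-proxy/bin/browsermob-proxy")
server.start()
proxy = server.create_proxy()

# Ask the proxy to add a Referer header to outgoing requests.
proxy.headers({"Referer": "http://referring-site.example/"})

# Point the browser at the proxy (Selenium 3-style API).
driver = webdriver.Firefox(proxy=proxy.selenium_proxy())
driver.get("http://www.example.com/")

driver.quit()
server.stop()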

ScrapyDeprecationWarning: Command's default `crawler` is deprecated and will be removed. Use `create_crawler` method to instantiate crawlers

Scrapy version 0.19
I am using the code from this page (Run multiple scrapy spiders at once using scrapyd). When I run scrapy allcrawl, I get:
ScrapyDeprecationWarning: Command's default `crawler` is deprecated and will be removed. Use `create_crawler` method to instantiate crawlers
Here is the code:
from scrapy.command import ScrapyCommand
import urllib
import urllib2
from scrapy import log

class AllCrawlCommand(ScrapyCommand):
    requires_project = True
    default_settings = {'LOG_ENABLED': False}

    def short_desc(self):
        return "Schedule a run for all available spiders"

    def run(self, args, opts):
        url = 'http://localhost:6800/schedule.json'
        for s in self.crawler.spiders.list():  # this line raises the warning
            values = {'project': 'YOUR_PROJECT_NAME', 'spider': s}
            data = urllib.urlencode(values)
            req = urllib2.Request(url, data)
            response = urllib2.urlopen(req)
            log.msg(response)
How do I fix this deprecation warning?
Thanks
Use:
crawler = self.crawler_process.create_crawler()
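In the context of the command above, that would look roughly like this; a sketch against the Scrapy 0.19 command API, where only the way the crawler is obtained changes:

    def run(self, args, opts):
        url = 'http://localhost:6800/schedule.json'
        # Instantiate a crawler explicitly instead of relying on the deprecated default.
        crawler = self.crawler_process.create_crawler()
        for s in crawler.spiders.list():
            values = {'project': 'YOUR_PROJECT_NAME', 'spider': s}
            data = urllib.urlencode(values)
            req = urllib2.Request(url, data)
            response = urllib2.urlopen(req)
            log.msg(response)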