How to do an internal redirect in Python Falcon - falconframework

How can I make an "internal redirect" in Falcon?
I set up static routes with:
app.add_static_route('/', os.path.abspath(here + '/static/'))
I want to redirect '/' to '/index.html', but not with an HTTP 3xx response. I want to do it internally, so that the path, as far as the browser is concerned, is still '/' while the content served is that of '/static/index.html'.

As of Falcon 1.4.1 there is no built-in way to do an "internal redirect" or to reply with a single file, but I was able to implement this using a sink and a regex:
import os

import falcon

WORKING_DIRECTORY = os.getcwd()
STATIC = 'static/'

app = falcon.API()  # assumed here; the original snippet references an existing Falcon 1.x app object

def apex(req, resp):
    # Serve static/index.html for '/' without issuing a 3xx redirect
    resp.content_type = 'text/html; charset=utf-8'
    filename = os.path.abspath(os.path.join(WORKING_DIRECTORY, STATIC, 'index.html'))
    with open(filename, 'rt') as f:
        resp.body = f.read()

app.add_sink(apex, prefix='^/$')
app.add_static_route('/', os.path.abspath(os.path.join(WORKING_DIRECTORY, STATIC)))
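To try this out locally, a minimal sketch (not part of the original answer, and assuming the app object defined above) is to serve the WSGI app with the standard library's wsgiref; requesting http://localhost:8000/ then returns the contents of static/index.html without any 3xx redirect:
from wsgiref.simple_server import make_server

# Serve the Falcon app defined above on port 8000 (sketch for local testing only)
httpd = make_server('', 8000, app)
print('Serving on http://localhost:8000/ ...')
httpd.serve_forever()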

Related

Save Pandas or Pyspark dataframe from Databricks to Azure Blob Storage

Is there a way I can save a PySpark or Pandas dataframe from Databricks to blob storage without mounting it or installing libraries?
I was able to achieve this after mounting the storage container into Databricks and using the library com.crealytics.spark.excel, but I was wondering if I can do the same without the library and without mounting, because I will be working on clusters without these two permissions.
Here is the code for saving the dataframe locally to DBFS.
# export
from os import path

folder = "export"
name = "export"
file_path_name_on_dbfs = path.join("/tmp", folder, name)

# Writing to DBFS
# .coalesce(1) is used to generate only one file; if the dataframe is too big this won't work,
# so you'll end up with multiple part files and need to copy them later one by one
sampleDF \
    .coalesce(1) \
    .write \
    .mode("overwrite") \
    .option("header", "true") \
    .option("delimiter", ";") \
    .option("encoding", "UTF-8") \
    .csv(file_path_name_on_dbfs)

# path of the destination file, which will be sent to Azure storage
dest = file_path_name_on_dbfs + ".csv"

# Renaming part-000...csv to our file name
target_file = list(filter(lambda file: file.name.startswith("part-00000"),
                          dbutils.fs.ls(file_path_name_on_dbfs)))
if len(target_file) > 0:
    dbutils.fs.mv(target_file[0].path, dest)
    dbutils.fs.cp(dest, f"file://{dest}")  # added for Community Edition only, because /dbfs is not recognized, so we copy the file locally
    dbutils.fs.rm(file_path_name_on_dbfs, True)
The code that will send the file to Azure storage:
import requests

sas = "YOUR_SAS_TOKEN_PREVIOUSLY_CREATED"  # follow the link below to create a SAS token (using a SAS is slightly more secure than the raw storage key)
blob_account_name = "YOUR_BLOB_ACCOUNT_NAME"
container = "YOUR_CONTAINER_NAME"
destination_path_w_name = "export/export.csv"

url = f"https://{blob_account_name}.blob.core.windows.net/{container}/{destination_path_w_name}?{sas}"

# here we read the content of our previously exported df -> csv
# if you are not on Community Edition you might want to use "/dbfs" + dest
payload = open(dest).read()
headers = {
    'x-ms-blob-type': 'BlockBlob',
    'Content-Type': 'text/csv'  # you can change the content type according to your needs
}
response = requests.request("PUT", url, headers=headers, data=payload)
# if response.status_code is 201, your file was created successfully
print(response.status_code)
Follow this link to set up a SAS token.
Remember that anyone who has the SAS token can access your storage, depending on the permissions you set when creating the token.
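If you prefer to generate the SAS token programmatically instead of through the portal, here is a minimal sketch using the azure-storage-blob v12 SDK; this is not part of the original answer, and the account name, key, container, and expiry are placeholders:
from datetime import datetime, timedelta
from azure.storage.blob import generate_container_sas, ContainerSasPermissions

# Generate a short-lived, write-only SAS for the target container (sketch; values are placeholders)
sas = generate_container_sas(
    account_name="YOUR_BLOB_ACCOUNT_NAME",
    container_name="YOUR_CONTAINER_NAME",
    account_key="YOUR_STORAGE_ACCOUNT_KEY",
    permission=ContainerSasPermissions(write=True),
    expiry=datetime.utcnow() + timedelta(hours=1),
)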
Code for the Excel export version (using com.crealytics:spark-excel_2.12:0.14.0)
Saving the dataframe:
data = [
    ('a', 25, 'ast'),
    ('b', 15, 'phone'),
    ('c', 32, 'dlp'),
    ('d', 45, 'rare'),
    ('e', 60, 'phq')
]
columns = ["column1", "column2", "column3"]
sampleDF = spark.createDataFrame(data=data, schema=columns)
sampleDF.show()

# export
from os import path

folder = "export"
name = "export"
file_path_name_on_dbfs = path.join("/tmp", folder, name)

# Writing to DBFS
sampleDF.write.format("com.crealytics.spark.excel") \
    .option("header", "true") \
    .mode("overwrite") \
    .save(file_path_name_on_dbfs + ".xlsx")

# excel
dest = file_path_name_on_dbfs + ".xlsx"
dbutils.fs.cp(dest, f"file://{dest}")  # added for Community Edition only, because /dbfs is not recognized, so we copy the file locally
Uploading the file to Azure storage:
import requests

sas = "YOUR_SAS_TOKEN_PREVIOUSLY_CREATED"  # follow the link below to create a SAS token (using a SAS is slightly more secure than the raw storage key)
blob_account_name = "YOUR_BLOB_ACCOUNT_NAME"
container = "YOUR_CONTAINER_NAME"
destination_path_w_name = "export/export.xlsx"
# destination_path_w_name = "export/export.csv"

url = f"https://{blob_account_name}.blob.core.windows.net/{container}/{destination_path_w_name}?{sas}"

# here we read the content of our previously exported df -> xlsx
# if you are not on Community Edition you might want to use "/dbfs" + dest
# payload = open(dest).read()
payload = open(dest, 'rb').read()
headers = {
    'x-ms-blob-type': 'BlockBlob',
    # 'Content-Type': 'text/csv'
    'Content-Type': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
}
response = requests.request("PUT", url, headers=headers, data=payload)
# if response.status_code is 201, your file was created successfully
print(response.status_code)

Storing cache results

I have activated the scrapy.extensions.httpcache.FilesystemCacheStorage extension to store the cached results as gzipped files in a folder when scraping. However, I get the following error:
raise BadGzipFile('Not a gzipped file (%r)' % magic)
gzip.BadGzipFile: Not a gzipped file (b'\x80\x04')
I think the issue is that the file names are saved as the following:
My settings.py:
HTTPCACHE_ENABLED = True
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_DBM_MODULE = 'dbm'
HTTPCACHE_GZIP = True
How do I correctly activate the extension and store the cache as files in my working directory?
Example scraper:
import scrapy

class email_spider(scrapy.Spider):
    name = 'email'
    start_urls = ['http://quotes.toscrape.com/tag/humor/']

    def parse(self, response):
        content = response.xpath("(//div[@class='col-md-8'])[2]//div")
        for stuff in content:
            yield {
                'stuff': stuff.xpath(".//a//@href").get(),
            }

Simple scraper with Scrapy API

I am writing a scraper with Scrapy within a larger project, and I'm trying to keep it as minimal as possible (without creating a whole Scrapy project). This code downloads a single URL correctly:
import scrapy
from scrapy.crawler import CrawlerProcess

class WebsiteSpider(scrapy.Spider):
    """
    https://docs.scrapy.org/en/latest/
    """
    custom_settings = {'DOWNLOAD_DELAY': 1, 'DEPTH_LIMIT': 3}
    name = 'my_website_scraper'

    def parse(self, response):
        html = response.body
        url = response.url
        # process page here

process = CrawlerProcess()
process.crawl(WebsiteSpider, start_urls=['https://www.bbc.co.uk/'])
process.start()
How can I enrich this code to keep scraping the links found in the start URLs (with a maximum depth of, for example, 3)?
Try this.
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain

class WebsiteSpider(Spider):
    name = 'bbc.co.uk'
    allowed_domains = ['.bbc.co.uk']
    start_urls = ['https://www.bbc.co.uk/']
    # refresh_urls = True  # For debugging. If refresh_urls = True, start_urls will be crawled again.

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        lstA = doc.listA(url=url["url"])  # Get link data for subsequent crawling
        data = [{"title": doc.title.text}]  # Get target data
        return {"Urls": lstA, "Data": data}  # Return data to framework

SimplifiedMain.startThread(WebsiteSpider())  # Start crawling
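For comparison, the same crawl can also be kept inside plain Scrapy by following the links found on each page. The sketch below is only an illustration, not part of the original answer; it relies on the documented response.follow() helper and the DEPTH_LIMIT setting already used in the question:
import scrapy
from scrapy.crawler import CrawlerProcess

class FollowingSpider(scrapy.Spider):
    name = 'my_website_scraper'
    # DEPTH_LIMIT stops the recursion after three levels of links
    custom_settings = {'DOWNLOAD_DELAY': 1, 'DEPTH_LIMIT': 3}
    allowed_domains = ['bbc.co.uk']  # keep the crawl on one site (an assumption for this sketch)

    def parse(self, response):
        html = response.body
        url = response.url
        # process the page here, then follow every link found on it
        for href in response.css('a::attr(href)').getall():
            yield response.follow(href, callback=self.parse)

process = CrawlerProcess()
process.crawl(FollowingSpider, start_urls=['https://www.bbc.co.uk/'])
process.start()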

Setting referer in Selenium

I'm working with the Selenium remote driver to automate actions on a site. I can open the page I need directly by engineering the URL, as the site's URL schema is very consistent. This speeds up the script, as it does not have to work through several pages before it gets to the one it needs.
To make the automation seem organic, is there a way to set a referrer page in Selenium?
If you're checking the referrer on the server, then using a proxy (as mentioned in other answers) will be the way to go.
However, if you need access to the referrer in JavaScript, using a proxy will not work. To set the JavaScript referrer I did the following:
1. Go to the referral website.
2. Inject this JavaScript onto the page via the Selenium API: document.write('<script>window.location.href = "<my website>";</script>')
I'm using a Python wrapper around Selenium, so I cannot provide the exact function you need to inject the code in your language, but it should be easy to find.
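For reference, a minimal sketch of those two steps with the plain Selenium Python bindings (the URLs are placeholders, and this simply mirrors the injection described above, so treat it as an illustration rather than a tested recipe):
from selenium import webdriver

REFERRER_URL = "https://example.com/referrer-page"  # hypothetical page that should appear as the referrer
TARGET_URL = "https://example.com/target-page"      # hypothetical page that should see document.referrer

driver = webdriver.Firefox()

# Step 1: load the page that should appear as the referrer
driver.get(REFERRER_URL)

# Step 2: navigate away via JavaScript injected through the Selenium API,
# so the browser records REFERRER_URL as the referrer of the target page
injected = 'document.write(\'<script>window.location.href = "%s";</script>\')' % TARGET_URL
driver.execute_script(injected)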
What you are looking for is referrer spoofing.
Selenium does not have a built-in method to do this; however, it can be accomplished by using a proxy such as Fiddler.
Fiddler also provides an API-only version of the FiddlerCore component, and programmatic access to all of the proxy's settings and data, thus allowing you to modify the HTTP request headers.
Here is a solution in Python to do exactly that:
https://github.com/j-bennet/selenium-referer
I described the use case and the solution in the README. I think the GitHub repo won't go anywhere, but I'll quote the relevant pieces here just in case.
The solution uses libmproxy to implement a proxy server that does only one thing: it adds a Referer header. The header is specified as a command-line parameter when running the proxy. Code:
# -*- coding: utf-8 -*-
"""
Proxy server to add a specified Referer: header to the request.
"""
from optparse import OptionParser
from libmproxy import controller, proxy
from libmproxy.proxy.server import ProxyServer


class RefererMaster(controller.Master):
    """
    Adds a specified referer header to the request.
    """

    def __init__(self, server, referer):
        """
        Init the proxy master.
        :param server: ProxyServer
        :param referer: string
        """
        controller.Master.__init__(self, server)
        self.referer = referer

    def run(self):
        """
        Basic run method.
        """
        try:
            print('Running...')
            return controller.Master.run(self)
        except KeyboardInterrupt:
            self.shutdown()

    def handle_request(self, flow):
        """
        Adds a Referer header.
        """
        flow.request.headers['referer'] = [self.referer]
        flow.reply()

    def handle_response(self, flow):
        """
        Does not do anything extra.
        """
        flow.reply()


def start_proxy_server(port, referer):
    """
    Start proxy server and return an instance.
    :param port: int
    :param referer: string
    :return: RefererMaster
    """
    config = proxy.ProxyConfig(port=port)
    server = ProxyServer(config)
    m = RefererMaster(server, referer)
    m.run()


if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option("-r", "--referer", dest="referer",
                      help="Referer URL.")
    parser.add_option("-p", "--port", dest="port", type="int",
                      help="Port number (int) to run the server on.")
    popts, pargs = parser.parse_args()
    start_proxy_server(popts.port, popts.referer)
Then, in the setUp() method of the test, the proxy server is started as an external process using pexpect, and stopped in tearDown(). A method called proxy() returns the proxy settings used to configure the Firefox driver:
# -*- coding: utf-8 -*-
import os
import sys
import pexpect
import unittest
from selenium.webdriver.common.proxy import Proxy, ProxyType
import utils


class ProxyBase(unittest.TestCase):
    """
    We have to use our own proxy server to set a Referer header, because Selenium does not
    allow to interfere with request headers.

    This is the base class. Change `proxy_referer` to set different referers.
    """

    base_url = 'http://www.facebook.com'
    proxy_server = None
    proxy_address = '127.0.0.1'
    proxy_port = 8888
    proxy_referer = None
    proxy_command = '{0} {1} --referer {2} --port {3}'

    def setUp(self):
        """
        Create the environment.
        """
        print('\nSetting up.')
        self.start_proxy()
        self.driver = utils.create_driver(proxy=self.proxy())

    def tearDown(self):
        """
        Cleanup the environment.
        """
        print('\nTearing down.')
        utils.close_driver(self.driver)
        self.stop_proxy()

    def proxy(self):
        """
        Create proxy settings for our Firefox profile.
        :return: Proxy
        """
        proxy_url = '{0}:{1}'.format(self.proxy_address, self.proxy_port)
        p = Proxy({
            'proxyType': ProxyType.MANUAL,
            'httpProxy': proxy_url,
            'ftpProxy': proxy_url,
            'sslProxy': proxy_url,
            'noProxy': 'localhost, 127.0.0.1'
        })
        return p

    def start_proxy(self):
        """
        Start the proxy process.
        """
        if not self.proxy_referer:
            raise Exception('Set the proxy_referer in child class!')
        python_path = sys.executable
        current_dir = os.path.dirname(__file__)
        proxy_file = os.path.normpath(os.path.join(current_dir, 'referer_proxy.py'))
        command = self.proxy_command.format(
            python_path, proxy_file, self.proxy_referer, self.proxy_port)
        print('Running the proxy command:')
        print(command)
        self.proxy_server = pexpect.spawnu(command)
        self.proxy_server.expect_exact(u'Running...', 2)

    def stop_proxy(self):
        """
        Override in child class to use a proxy.
        """
        print('Stopping proxy server...')
        self.proxy_server.close(True)
        print('Proxy server stopped.')
I wanted my unit tests to start and stop the proxy server without any user interaction, and could not find any Python samples doing that, which is why I created the GitHub repo (link above).
Hope this helps someone.
Not sure if I understand your question correctly, but if you want to override your HTTP requests there is no way to do it directly with WebDriver; you must run your requests through a proxy. I prefer using BrowserMob Proxy; you can get it through Maven or similar.
ProxyServer server = new ProxyServer(proxy_port); // net.lightbody.bmp.proxy.ProxyServer
server.start();
server.setCaptureHeaders(true);
Proxy proxy = server.seleniumProxy(); // org.openqa.selenium.Proxy
proxy.setHttpProxy("localhost").setSslProxy("localhost");

server.addRequestInterceptor(new RequestInterceptor() {
    @Override
    public void process(BrowserMobHttpRequest browserMobHttpRequest, Har har) {
        browserMobHttpRequest.addRequestHeader("Referer", "blabla");
    }
});

// configure it as a desired capability
DesiredCapabilities capabilities = new DesiredCapabilities();
capabilities.setCapability(CapabilityType.PROXY, proxy);

// start the driver
driver = new FirefoxDriver(capabilities);
Or black/whitelist anything:
server.blacklistRequests("https?://.*\\.google-analytics\\.com/.*", 410);
server.whitelistRequests("https?://*.*.yoursite.com/.*. https://*.*.someOtherYourSite.*".split(","), 200);

ScrapyDeprecationWarning: Command's default `crawler` is deprecated and will be removed. Use `create_crawler` method to instantiate crawlers

Scrapy version 0.19
I am using the code from this page (Run multiple scrapy spiders at once using scrapyd). When I run scrapy allcrawl, I get:
ScrapyDeprecationWarning: Command's default `crawler` is deprecated and will be removed. Use `create_crawler` method to instantiate crawlers
Here is the code:
from scrapy.command import ScrapyCommand
import urllib
import urllib2
from scrapy import log

class AllCrawlCommand(ScrapyCommand):
    requires_project = True
    default_settings = {'LOG_ENABLED': False}

    def short_desc(self):
        return "Schedule a run for all available spiders"

    def run(self, args, opts):
        url = 'http://localhost:6800/schedule.json'
        for s in self.crawler.spiders.list():  # this line raises the warning
            values = {'project': 'YOUR_PROJECT_NAME', 'spider': s}
            data = urllib.urlencode(values)
            req = urllib2.Request(url, data)
            response = urllib2.urlopen(req)
            log.msg(response)
How do I fix the DeprecationWarning?
Thanks
Use:
crawler = self.crawler_process.create_crawler()
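Applied to the command above, that means creating the crawler explicitly instead of relying on the deprecated self.crawler default. A sketch of the adjusted run() method (same behaviour as before, just without the warning):
    def run(self, args, opts):
        url = 'http://localhost:6800/schedule.json'
        crawler = self.crawler_process.create_crawler()  # replaces the deprecated self.crawler default
        for s in crawler.spiders.list():
            values = {'project': 'YOUR_PROJECT_NAME', 'spider': s}
            data = urllib.urlencode(values)
            req = urllib2.Request(url, data)
            response = urllib2.urlopen(req)
            log.msg(response)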