fastapi throws 400 bad request when I upload a large file

I provisioned and configured a Fedora 34 VM on VirtualBox with 2048 MB of RAM to serve this FastAPI application on localhost:7070. The full application source code, dependencies, and instructions are here. Below is the smallest reproducible example I could make.
main.py
import os, pathlib

import fastapi as fast
import aiofiles

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
RESULTS_DIR = pathlib.Path('/'.join((ROOT_DIR, 'results')))

app = fast.FastAPI()


@app.post('/api')
async def upload(
        request: fast.Request,
        file: fast.UploadFile = fast.File(...),
        filedir: str = ''):
    dest = RESULTS_DIR.joinpath(filedir, file.filename)
    dest.parent.mkdir(parents=True, exist_ok=True)
    async with aiofiles.open(dest, 'wb') as buffer:
        await file.seek(0)
        contents = await file.read()
        await buffer.write(contents)
    return f'localhost:7070/{dest.parent.name}/{dest.name}'
start.sh (starts the server application)
#! /bin/bash
uvicorn --host "0.0.0.0" --log-level debug --port 7070 main:app
client.py
import httpx
from pathlib import Path
import asyncio


async def async_post_file_req(url: str, filepath: Path):
    async with httpx.AsyncClient(
            timeout=httpx.Timeout(write=None, read=None, connect=None, pool=None)) as client:
        r = await client.post(
            url,
            files={
                'file': (filepath.name, filepath.open('rb'), 'application/octet-stream')
            }
        )


if __name__ == '__main__':
    url = 'http://localhost:7070'
    asyncio.run(
        async_post_file_req(
            f'{url}/api',
            Path('~/1500M.txt').expanduser()
        ))
create a 1500 MB file
truncate -s 1500M 1500M.txt
When uploading a 1500 MB file, the current implementation of upload appears to read the whole file into memory, and then the server responds with {status: 400, reason: 'Bad Request', details: 'There was an error parsing the body.'}, and the file is not written to disk. When uploading an 825 MB file, the server responds with 200, and the file is written to disk. I don't understand why there is an error in parsing the larger file.
What's going on?
How do I upload files that are larger than the machine's available memory?
Do I have to stream the body?

Digging into the source code, I found that FastAPI throws the HTTP exception with status code 400 and detail "There was an error parsing the body" in exactly one place, when it is trying to figure out whether the request form or body needs to be read. The FastAPI Request is basically the Starlette Request, so I reimplemented the FastAPI server as a Starlette application, hoping to bypass this exception handler and get more information about the issue.
main.py
from starlette.applications import Starlette
from starlette.responses import JSONResponse
from starlette.routing import Route


async def homepage(request):
    return JSONResponse({'hello': 'world'})


async def upload(request):
    form = await request.form()
    print(type(form['upload_file']))
    filename = form['upload_file'].filename or 'not found'
    contents = await form['upload_file'].read()
    b = len(contents) or -1
    return JSONResponse({
        'filename': filename,
        'bytes': b
    })


app = Starlette(debug=True, routes=[
    Route('/', homepage),
    Route('/api', upload, methods=['POST'])
])
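For completeness, a call against this endpoint might look like the sketch below; note the multipart field name has to be 'upload_file' to match the form key the handler reads, and the file path is just a placeholder.
import httpx
from pathlib import Path


def post_upload(path: Path):
    # The key 'upload_file' must match form['upload_file'] in the handler above.
    with path.expanduser().open('rb') as f:
        r = httpx.post(
            'http://localhost:7070/api',
            files={'upload_file': (path.name, f, 'application/octet-stream')},
            timeout=None,
        )
    return r.json()


if __name__ == '__main__':
    print(post_upload(Path('~/1500M.txt')))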
Pipfile
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
starlette = "*"
uvicorn = "*"
uvloop = "*"
httpx = "*"
watchgod = "*"
python-multipart = "*"
[dev-packages]
[requires]
python_version = "3.9"
On posting a file of 989 MiB or larger, the Starlette application throws OS error 28, no space left on device; a file of 988 MiB or smaller causes no error.
INFO: 10.0.2.2:46996 - "POST /api HTTP/1.1" 500 Internal Server Error
ERROR: Exception in ASGI application
Traceback (most recent call last):
File "/usr/local/lib/python3.9/site-packages/uvicorn/protocols/http/httptools_impl.py", line 398, in run_asgi
result = await app(self.scope, self.receive, self.send)
File "/usr/local/lib/python3.9/site-packages/uvicorn/middleware/proxy_headers.py", line 45, in __call__
return await self.app(scope, receive, send)
File "/usr/local/lib/python3.9/site-packages/starlette/applications.py", line 112, in __call__
await self.middleware_stack(scope, receive, send)
File "/usr/local/lib/python3.9/site-packages/starlette/middleware/errors.py", line 181, in __call__
raise exc from None
File "/usr/local/lib/python3.9/site-packages/starlette/middleware/errors.py", line 159, in __call__
await self.app(scope, receive, _send)
File "/usr/local/lib/python3.9/site-packages/starlette/exceptions.py", line 82, in __call__
raise exc from None
File "/usr/local/lib/python3.9/site-packages/starlette/exceptions.py", line 71, in __call__
await self.app(scope, receive, sender)
File "/usr/local/lib/python3.9/site-packages/starlette/routing.py", line 580, in __call__
await route.handle(scope, receive, send)
File "/usr/local/lib/python3.9/site-packages/starlette/routing.py", line 241, in handle
await self.app(scope, receive, send)
File "/usr/local/lib/python3.9/site-packages/starlette/routing.py", line 52, in app
response = await func(request)
File "/home/vagrant/star-file-server/./main.py", line 11, in upload
form = await request.form()
File "/usr/local/lib/python3.9/site-packages/starlette/requests.py", line 240, in form
self._form = await multipart_parser.parse()
File "/usr/local/lib/python3.9/site-packages/starlette/formparsers.py", line 231, in parse
await file.write(message_bytes)
File "/usr/local/lib/python3.9/site-packages/starlette/datastructures.py", line 445, in write
await run_in_threadpool(self.file.write, data)
File "/usr/local/lib/python3.9/site-packages/starlette/concurrency.py", line 40, in run_in_threadpool
return await loop.run_in_executor(None, func, *args)
File "/usr/lib64/python3.9/concurrent/futures/thread.py", line 52, in run
result = self.fn(*self.args, **self.kwargs)
File "/usr/lib64/python3.9/tempfile.py", line 755, in write
rv = file.write(s)
OSError: [Errno 28] No space left on device
Starlette's UploadFile data structure uses a SpooledTemporaryFile. This object writes to your OS's temporary directory. My temporary directory is /tmp because I'm on Fedora 34 and I have not set any environment variable to tell Python to use anything else as the temporary directory.
[vagrant@fedora star-file-server]$ python
Python 3.9.5 (default, May 14 2021, 00:00:00)
[GCC 11.1.1 20210428 (Red Hat 11.1.1-1)] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import tempfile
>>> tempfile.gettempdir()
'/tmp'
[vagrant@fedora star-file-server]$ df -h
Filesystem Size Used Avail Use% Mounted on
devtmpfs 974M 0 974M 0% /dev
tmpfs 989M 168K 989M 1% /dev/shm
tmpfs 396M 5.6M 390M 2% /run
/dev/sda1 40G 1.6G 36G 5% /
tmpfs 989M 0 989M 0% /tmp
tmpfs 198M 84K 198M 1% /run/user/1000
Starlette sets max_size for the SpooledTemporaryFile to 1 MiB. From the Python tempfile documentation, that means the upload is buffered in memory only until it exceeds 1 MiB; after that it is rolled over to a real file in the temporary directory. Although it is off by 1 MiB, 989 MiB appears to be the correct hard boundary on the UploadFile size, because the SpooledTemporaryFile is bound by the storage available to the system's temporary directory.
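To see the rollover behavior in isolation, here is a small sketch using only the standard library, independent of Starlette (_rolled is an implementation detail, used here only to make the switch visible):
import tempfile

# Data is kept in memory until a write pushes the total past max_size,
# then the buffer moves to a real file in tempfile.gettempdir().
with tempfile.SpooledTemporaryFile(max_size=1024 * 1024) as spool:
    spool.write(b'x' * (1024 * 1024))   # exactly 1 MiB: still in memory
    print(spool._rolled)                # False
    spool.write(b'x')                   # one more byte: spills to disk
    print(spool._rolled)                # True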
If I still want to use UploadFile, I can set an environment variable that points the temporary directory at a device known to always have enough space available, even for the largest uploads.
export TMPDIR=/huge_storage_device
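Alternatively (a sketch, not something the Starlette docs prescribe), the override can be done from inside the application before any temporary file is created, since tempfile honors the module-level tempdir attribute:
import tempfile

# Must run before the first temporary file is created;
# /huge_storage_device is a placeholder for a mount with enough free space.
tempfile.tempdir = '/huge_storage_device'
print(tempfile.gettempdir())   # -> /huge_storage_device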
The approach I prefer uses the request's stream, which avoids writing the file twice: first to the local temporary directory and then again to its permanent destination.
import os, pathlib

import fastapi as fast
import aiofiles

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
RESULTS_DIR = pathlib.Path('/'.join((ROOT_DIR, 'results')))

app = fast.FastAPI()


@app.post('/stream')
async def stream(
        request: fast.Request,
        filename: str,
        filedir: str = ''
):
    dest = RESULTS_DIR.joinpath(filedir, filename)
    dest.parent.mkdir(parents=True, exist_ok=True)
    async with aiofiles.open(dest, 'wb') as buffer:
        async for chunk in request.stream():
            await buffer.write(chunk)
    return {
        'loc': f'localhost:7070/{dest.parent.name}/{dest.name}'
    }
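On the client side, the body has to be streamed rather than sent as multipart form data. A minimal sketch, assuming the same httpx and aiofiles packages as above (the 1 MiB chunk size is an arbitrary choice):
import asyncio
from pathlib import Path

import aiofiles
import httpx


async def file_chunks(path: Path, chunk_size: int = 1024 * 1024):
    # Yield the file in chunks so httpx streams the request body
    # instead of loading the whole file into memory.
    async with aiofiles.open(path, 'rb') as f:
        while chunk := await f.read(chunk_size):
            yield chunk


async def stream_file(url: str, path: Path):
    async with httpx.AsyncClient(timeout=None) as client:
        r = await client.post(
            f'{url}/stream',
            params={'filename': path.name},
            content=file_chunks(path),
        )
        return r.json()


if __name__ == '__main__':
    print(asyncio.run(stream_file('http://localhost:7070', Path('~/1500M.txt').expanduser())))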
Using this approach, when I uploaded files (5 MB, 450 MB, and 988 MB, each with two repeated measures) to the server running on a Fedora VM with 2048 MiB of memory, the server never exhausted its memory, never crashed, and the average latency dropped by about 40% (i.e. the latency of posting to /stream was about 60% of the latency of posting to /api).

Related

How to read an image sent in body of req in falcon

I'm sending a JPG image in the body of a POST request (using Postman to do so) and reading it with the following code.
image_text_similarity.py:
import json


class ImageTextSimilarity():
    def on_post(self, req, resp):
        image_raw = json.loads(req.stream.read())
which errors out with
Traceback (most recent call last):
File "/home/dario/.local/lib/python3.6/site-packages/gunicorn/workers/sync.py", line 134, in handle
self.handle_request(listener, req, client, addr)
File "/home/dario/.local/lib/python3.6/site-packages/gunicorn/workers/sync.py", line 175, in handle_request
respiter = self.wsgi(environ, resp.start_response)
File "falcon/api.py", line 274, in falcon.api.API.__call__
File "falcon/api.py", line 269, in falcon.api.API.__call__
File "/home/dario/ImageTextSimilarityApp/image_text_similarity.py", line 95, in on_post
image_raw = json.loads(req.stream.read())
File "/usr/lib/python3.6/json/__init__.py", line 349, in loads
s = s.decode(detect_encoding(s), 'surrogatepass')
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
How do we read the image from the body of the POST request?
The rest of the code is
image_similarity_app.py:
import falcon
from image_text_similarity import ImageTextSimilarity
api = application = falcon.API()
api.req_options.auto_parse_form_urlencoded = True
image_text_similarity_object = ImageTextSimilarity()
api.add_route('/image_text_similarity', image_text_similarity_object)
And starting the service with gunicorn image_similarity_app
I'm not an expert at Postman, but it appears that by choosing binary, you are sending your JPEG image data as the request body: Postman Chrome: What is the difference between form-data, x-www-form-urlencoded and raw
In Falcon, you can simply read the request payload as
jpeg_data = req.stream.read()
(Note that on some app servers such as the stdlib's wsgiref.simple_server, you may need to use the safe Request.bounded_stream wrapper.)
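For example, a minimal resource might look like this sketch (the destination path and route are placeholders, not part of your app):
import falcon


class ImageResource:
    def on_post(self, req, resp):
        # Read the raw request body; bounded_stream is the safe wrapper
        # mentioned above and avoids hanging on servers that don't signal
        # EOF, such as wsgiref.simple_server.
        jpeg_data = req.bounded_stream.read()
        with open('/tmp/uploaded.jpg', 'wb') as f:   # placeholder destination
            f.write(jpeg_data)
        resp.media = {'received_bytes': len(jpeg_data)}


api = falcon.API()
api.add_route('/image_text_similarity', ImageResource())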
See also Falcon's WSGI and ASGI tutorials for inspiration; they use a closely related topic (building an image service) to illustrate the basic concepts of the framework. You'll find examples of how to handle RESTful image resources: upload, convert, store, list, serve, cache, etc.

Dynamically created JPG image using AWS Lambda service

I am trying to dynamically create a graph as a JPG file that I could use in Alexa Skill standard cards as part of a response. The following code creates a JPG image when I run it locally on my computer and open the URL "http://localhost:5000/image.jpg" in a browser.
from flask import send_file
from flask import Flask
from PIL import Image, ImageDraw
from io import BytesIO

app = Flask(__name__)
app.config['DEBUG'] = True


def serve_pil_image(pil_img):
    img_io = BytesIO()
    pil_img.save(img_io, 'JPEG', quality=70)
    img_io.seek(0)
    return send_file(img_io, mimetype='image/jpeg')


@app.route('/image.jpg')
def serve_img():
    size = (128, 128)
    background = (128, 128, 55)
    xy = [(0, 0), (10, 10), (20, 20), (30, 12), (50, 50), (70, 9), (90, 70)]
    img = Image.new('RGB', size, background)
    draw = ImageDraw.Draw(img)
    draw.line(xy, fill=128, width=5)
    return serve_pil_image(img)


if __name__ == '__main__':
    app.run(debug=True)
However, when I deploy the same code to AWS Lambda service using Zappa I am getting the following error message (from CloudWatch logs):
An error occurred during JSON serialization of response: 'utf8' codec can't decode byte 0xff in position 0: invalid start byte
Traceback (most recent call last):
File "/usr/lib64/python2.7/json/__init__.py", line 250, in dumps
sort_keys=sort_keys, **kw).encode(obj)
File "/usr/lib64/python2.7/json/encoder.py", line 207, in encode
chunks = self.iterencode(o, _one_shot=True)
File "/usr/lib64/python2.7/json/encoder.py", line 270, in iterencode
return _iterencode(o, 0)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xff in position 0: invalid start byte
Is there some configuration option to fix this problem? I haven't found any so far.
Binary Support is finally here! You should look at it and try again.
If you want to serve binary data (in this case Base64 images) through API Gateway, you need to set the following:
In the Method Response of your method:
Set Content-Type as image/jpeg in the HTTP 200 Status Response Header.
In the Integration Response of your method:
Set Content-Type as 'image/jpeg' in Header Mappings. Mind the quotes!
With the AWS CLI, set the contentHandling attribute to CONVERT_TO_BINARY on your Integration Response.
Check out the entire process in this great step-by-step guide: https://stackoverflow.com/a/41434295/720665
(the example is for a base64-encoded PNG image, but the gist of it is the same)
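A sketch of the same Integration Response tweak done programmatically with boto3 (the API, resource, and method IDs here are placeholders you'd look up for your own API):
import boto3

apigw = boto3.client('apigateway')

apigw.put_integration_response(
    restApiId='abc123',
    resourceId='def456',
    httpMethod='GET',
    statusCode='200',
    contentHandling='CONVERT_TO_BINARY',  # return the base64 payload as raw bytes
    responseParameters={
        # Static header values must be wrapped in single quotes.
        'method.response.header.Content-Type': "'image/jpeg'",
    },
)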

Scrapyd with Polipo and Tor

UPDATE: I am now running this command:
scrapyd-deploy <project_name>
And getting this error:
504 Connect to localhost:8123 failed: General SOCKS server failure
I am trying to deploy my scrapy spider through scrapyd-deploy, the following is the command I use:
scrapyd-deploy -L <project_name>
I get the following error message:
Traceback (most recent call last):
File "/usr/local/bin/scrapyd-deploy", line 269, in <module>
main()
File "/usr/local/bin/scrapyd-deploy", line 74, in main
f = urllib2.urlopen(req)
File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 410, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 448, in error
return self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 531, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 404: Not found
The following is my scrapy.cfg file:
[settings]
default = <project_name>.settings
[deploy:<project_name>]
url = http://localhost:8123
project = <project_name>
eggs_dir = eggs
logs_dir = logs
items_dir = items
jobs_to_keep = 5
dbs_dir = dbs
max_proc = 0
max_proc_per_cpu = 4
finished_to_keep = 100
poll_interval = 5
http_port = 8123
debug = on
runner = scrapyd.runner
application = scrapyd.app.application
launcher = scrapyd.launcher.Launcher
[services]
schedule.json = scrapyd.webservice.Schedule
cancel.json = scrapyd.webservice.Cancel
addversion.json = scrapyd.webservice.AddVersion
listprojects.json = scrapyd.webservice.ListProjects
listversions.json = scrapyd.webservice.ListVersions
listspiders.json = scrapyd.webservice.ListSpiders
delproject.json = scrapyd.webservice.DeleteProject
delversion.json = scrapyd.webservice.DeleteVersion
listjobs.json = scrapyd.webservice.ListJobs
I am running Tor and Polipo, with the Polipo proxy at http://localhost:8123. I can perform a wget and download that page without any problems. The proxy is working correctly; I can connect to the internet and so on. Please ask if you need more clarification.
Thanks!
urllib2.HTTPError: HTTP Error 404: Not found
The url is not reached.
Anything interesting in /var/log/polipo/polipo.log? What comes from tail -100 /var/log/polipo/polipo.log?
Apparently this is because I forgot to run the main command. It is easy to miss because it is mentioned on the Overview page of the documentation, not the Deployment page. The following is the command:
scrapyd
504 Connect to localhost:8123 failed: General SOCKS server failure
You're asking Polipo to connect to localhost:8123; Polipo passes the request to tor, which returns a failure result which is dutifully returned by Polipo ("General SOCKS server failure").
url = http://localhost:8123
This is certainly not what you meant.
http_port = 8123
I'm also pretty sure you didn't want to run scrapyd on the same port as Polipo.
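In other words, the deploy target should point at scrapyd itself; a sketch of what that section might look like, assuming scrapyd keeps its default port 6800 while Polipo stays on 8123:
[deploy:<project_name>]
url = http://localhost:6800/
project = <project_name>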

How to disable or change the path of ghostdriver.log?

The question is straightforward, but some context may help.
I'm trying to deploy Scrapy while using Selenium and PhantomJS as the downloader, but it keeps saying "permission denied" when deploying. So I want to change the path of ghostdriver.log or just disable it. Looking at phantomjs -h and the GhostDriver GitHub page, I couldn't find the answer; my friend Google let me down as well.
$ scrapy deploy
Building egg of crawler-1370960743
'build/scripts-2.7' does not exist -- can't clean it
zip_safe flag not set; analyzing archive contents...
tests.fake_responses.__init__: module references __file__
Deploying crawler-1370960743 to http://localhost:6800/addversion.json
Server response (200):
Traceback (most recent call last):
File "/usr/lib/pymodules/python2.7/scrapyd/webservice.py", line 18, in render
return JsonResource.render(self, txrequest)
File "/usr/lib/pymodules/python2.7/scrapy/utils/txweb.py", line 10, in render
r = resource.Resource.render(self, txrequest)
File "/usr/lib/python2.7/dist-packages/twisted/web/resource.py", line 216, in render
return m(request)
File "/usr/lib/pymodules/python2.7/scrapyd/webservice.py", line 66, in render_POST
spiders = get_spider_list(project)
File "/usr/lib/pymodules/python2.7/scrapyd/utils.py", line 65, in get_spider_list
raise RuntimeError(msg.splitlines()[-1])
RuntimeError: IOError: [Errno 13] Permission denied: 'ghostdriver.log
When using the PhantomJS driver, add the following parameter:
driver = webdriver.PhantomJS(service_log_path='/var/log/phantomjs/ghostdriver.log')
Related code; it would be nice to have an option to turn off logging, though it seems that's not supported:
selenium/webdriver/phantomjs/service.py
class Service(object):
    """
    Object that manages the starting and stopping of PhantomJS / Ghostdriver
    """

    def __init__(self, executable_path, port=0, service_args=None, log_path=None):
        """
        Creates a new instance of the Service

        :Args:
         - executable_path : Path to PhantomJS binary
         - port : Port the service is running on
         - service_args : A List of other command line options to pass to PhantomJS
         - log_path: Path for PhantomJS service to log to
        """
        self.port = port
        self.path = executable_path
        self.service_args = service_args
        if self.port == 0:
            self.port = utils.free_port()
        if self.service_args is None:
            self.service_args = []
        self.service_args.insert(0, self.path)
        self.service_args.append("--webdriver=%d" % self.port)
        if not log_path:
            log_path = "ghostdriver.log"
        self._log = open(log_path, 'w')
# Reduce logging level
driver = webdriver.PhantomJS(service_args=["--webdriver-loglevel=SEVERE"])

# Remove logging
import os
driver = webdriver.PhantomJS(service_log_path=os.path.devnull)
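Combining both ideas, a sketch that moves the log to a writable location and keeps it quiet (the /tmp path is just an example):
from selenium import webdriver

# Write ghostdriver.log somewhere the deploy user can write to,
# and only record severe messages.
driver = webdriver.PhantomJS(
    service_log_path='/tmp/ghostdriver.log',
    service_args=['--webdriver-loglevel=SEVERE'],
)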

https with jython2.7 + trusting all certificates does not work. Result: httplib.BadStatusLine

UPDATE: The problem is related to a bug in Jython 2.7b1. See the bug report: http://bugs.jython.org/issue2021. The Jython developers are working on a fix!
After changing from Jython 2.5.3 to Jython 2.7beta1, I am no longer able to read the content of web pages using SSL, httplib, and "trusting all certificates". The response from the HTTPS page is always an empty string, resulting in an httplib.BadStatusLine exception from httplib.py in Jython.
I need to be able to read from a web page that requires authentication, and I do not want to set up any certificate store since I need portability. Therefore my solution is to use the excellent implementation provided by http://tech.pedersen-live.com/2010/10/trusting-all-certificates-in-jython/
Example code is detailed below. Twitter might not be the best example, since it does not require certificate trusting; but the result is the same with or without the decorator.
#! /usr/bin/python
import sys

from javax.net.ssl import TrustManager, X509TrustManager
from jarray import array
from javax.net.ssl import SSLContext


class TrustAllX509TrustManager(X509TrustManager):
    # Define a custom TrustManager which will blindly
    # accept all certificates

    def checkClientTrusted(self, chain, auth):
        pass

    def checkServerTrusted(self, chain, auth):
        pass

    def getAcceptedIssuers(self):
        return None


# Create a static reference to an SSLContext which will use
# our custom TrustManager
trust_managers = array([TrustAllX509TrustManager()], TrustManager)
TRUST_ALL_CONTEXT = SSLContext.getInstance("SSL")
TRUST_ALL_CONTEXT.init(None, trust_managers, None)

# Keep a static reference to the JVM's default SSLContext for restoring
# at a later time
DEFAULT_CONTEXT = SSLContext.getDefault()


def trust_all_certificates(f):
    # Decorator function that will make it so the context of the decorated
    # method will run with our TrustManager that accepts all certificates
    def wrapped(*args, **kwargs):
        # Only do this if running under Jython
        if 'java' in sys.platform:
            from javax.net.ssl import SSLContext
            SSLContext.setDefault(TRUST_ALL_CONTEXT)
            print "SSLContext set to TRUST_ALL"
            try:
                res = f(*args, **kwargs)
                return res
            finally:
                SSLContext.setDefault(DEFAULT_CONTEXT)
        else:
            return f(*args, **kwargs)
    return wrapped


@trust_all_certificates
def read_page(host):
    import httplib
    print "Host: " + host
    conn = httplib.HTTPSConnection(host)
    conn.set_debuglevel(1)
    conn.request('GET', '/example')
    response = conn.getresponse()
    print response.read()


read_page("twitter.com")
This results in:
Host: twitter.com
send: 'GET /example HTTP/1.1\r\nHost: twitter.com\r\nAccept-Encoding: identity\r\n\r\n'
reply: ''
Traceback (most recent call last):
File "jytest.py", line 62, in <module>
read_page("twitter.com")
File "jytest.py", line 59, in read_page
response = conn.getresponse()
File "/Users/erikiveroth/Workspace/Procera/sandbox/jython/jython2.7.jar/Lib/httplib.py", line 1030, in getresponse
File "/Users/erikiveroth/Workspace/Procera/sandbox/jython/jython2.7.jar/Lib/httplib.py", line 407, in begin
File "/Users/erikiveroth/Workspace/Procera/sandbox/jython/jython2.7.jar/Lib/httplib.py", line 371, in _read_status
httplib.BadStatusLine: ''
Changing back to jython2.5.3 gives me parseable output from twitter.
Have any of you seen this before? I cannot find any bug tickets on the Jython project page about this, nor can I understand what changes could result in this behaviour (other than maybe #1309, but I do not know whether it is related to my problem).
Cheers