Download file from S3 using Boto via a proxy server

I have a file at this address:
http://s3.amazonaws.com/bucket-name/sdile_pr_2_1_1/pr/0/2/1/1/dile_0_2_1_1.nc
in an S3 bucket that I want to make accessible via a Flask app.
To do so, I created a view function that looks like this:
@app.route('/select/dile')
def select_dile_by_uri():
    uri = request.args.get('uri')
    if uri is not None:
        if uri.startswith("http://s3.amazonaws.com/"):
            path = uri.replace("http://s3.amazonaws.com/", "")
            bname, kstr = path.split("/", 1)  # split the bucket name from the key string
            conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
            try:
                bucket = conn.get_bucket(bname)
            except:
                print "BUCKET NOT FOUND"
                return str("ERROR: bucket " + bname + " not found")
            else:
                print "BUCKET CONNECTED"
                try:
                    key = bucket.get_key(kstr)
                    print "KEY: ", key
                except:
                    print "KEY NOT FOUND"
                    return str("ERROR: key " + kstr + " not found")
                else:
                    try:
                        key.open_read()                        # open the key for reading
                        headers = dict(key.resp.getheaders())  # grab the response headers
                        return Response(key, headers=headers)  # stream the key back as the response
                    except S3ResponseError as e:
                        return Response(e.body, status=e.status, headers=key.resp.getheaders())
    abort(400)
The download works, but the name of the downloaded file is just "dile" instead of dile_0_2_1_1.nc.
How come? Is there something I needed to set?

What I needed to do was add a field to the headers, specifically:
headers["Content-Disposition"] = "inline; filename=myfilename"
where myfilename is the name you want the file to have.
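For example, a minimal sketch of the end of the view above, reusing key and kstr from the question (posixpath.basename just pulls the last segment out of the key string), could look like this:
import posixpath

# Sketch: derive the download name from the key string and advertise it
# via Content-Disposition before streaming the key back to the client.
key.open_read()
headers = dict(key.resp.getheaders())
headers["Content-Disposition"] = "inline; filename=%s" % posixpath.basename(kstr)  # e.g. dile_0_2_1_1.nc
return Response(key, headers=headers)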

Related

Getting the root directory of a file in Google Drive using the API

I need to get the full path of folders where a file is located in Google Drive. I'm getting the files themselves using the Google Drive API, but I need information about their parent folders.
I'm using the following code to get the list of spreadsheets in a Shared Drive:
from googleapiclient import discovery
from httplib2 import Http
from oauth2client import file, client, tools

# Change the value of SCOPES to 'https://www.googleapis.com/auth/drive'
# if you want to be able to read and write to the user's Google Drive.
SCOPES = 'https://www.googleapis.com/auth/drive'
store = file.Storage('storage.json')
creds = store.get()
if not creds or creds.invalid:
    flow = client.flow_from_clientsecrets('client_secret.json', SCOPES)
    creds = tools.run_flow(flow, store)
DRIVE = discovery.build('drive', 'v3', http=creds.authorize(Http()))

folder_id = "1Z1GzY-D3I3qwQu3oxIW-L1a9nXgD0PXl"
query = "mimeType='application/vnd.google-apps.spreadsheet'"
query += " and fullText contains 'CLAS' and trashed = false"
# query += " and parents in '" + folder_id + "'"

spreadsheets = []
# Initialize the page token
next_page_token = None
# Loop until all pages of results have been retrieved
while True:
    # Execute the list request
    response = DRIVE.files().list(
        q=query,
        corpora='drive',
        includeItemsFromAllDrives=True,
        driveId='0AEJNMySKcEzsUk9PVA',
        supportsAllDrives=True,
        # orderBy='folder',
        pageSize=1000,
        fields='nextPageToken, files(id, name, parents, mimeType, webViewLink)',
        pageToken=next_page_token,
    ).execute()
    # Append the results to the list
    spreadsheets.extend(response.get('files', []))
    # Check if there is another page of results
    next_page_token = response.get('nextPageToken', None)
    if next_page_token is None:
        break
    # Set the page token for the next iteration
    # parameters['pageToken'] = next_page_token

# Print the number of results
print(f'Last spreadsheet found: {spreadsheets[-1]["name"]}. Number of spreadsheets: {len(spreadsheets)}')
This returns a list of dictionaries with the specified fields. I would like to know the names of the parent folders for each file, for which I'm trying:
from googleapiclient.errors import HttpError

for item in spreadsheets:
    if 'parents' in item:
        parent_folders_list = []
        parent_id = item['parents'][0]
        try:
            while parent_id:
                folder = DRIVE.files().get(fileId=parent_id, fields='name, id, parents').execute()
                parent_folders_list.append(folder.get("parents", []))
                if parent_id:
                    parent_id = parent_id[0]
        except HttpError as error:
            print('An error occurred: %s' % error)
        print(f'{item["name"]} is in {parent_folders_list}')
I've verified that parent_id is retrieved correctly and that I can access it, since I was able to open it in the browser. However, I get back 'File not found' errors for every parent_id. I wonder whether DRIVE.files().get(fileId=...) is the correct way to retrieve a folder using the API.
Any help would be greatly appreciated.
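For what it's worth, a minimal sketch of walking up the chain with files().get() might look like the following. It reuses the DRIVE and spreadsheets objects from the question, advances using the parents field returned by each call, and passes supportsAllDrives=True because the items live in a Shared Drive; that flag and the loop structure are assumptions on my part, not something from the question.
# Sketch: collect the names of all ancestor folders of each item.
for item in spreadsheets:
    parent_names = []
    parent_id = item['parents'][0] if item.get('parents') else None
    while parent_id:
        folder = DRIVE.files().get(
            fileId=parent_id,
            fields='name, id, parents',
            supportsAllDrives=True,  # needed for items stored in a Shared Drive
        ).execute()
        parent_names.append(folder['name'])
        parents = folder.get('parents', [])
        parent_id = parents[0] if parents else None
    print(f'{item["name"]} is in {parent_names}')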

Wait until the blob storage folder is created

I would like to download a picture into a blob folder, and the folder needs to be created first.
The code below is what I am doing.
The issue is that the folder seems to need time to be created: when execution reaches with open(abs_file_name, "wb") as f: it cannot find the folder.
I am wondering whether there is an 'await'-like way to know that the folder creation has completed before doing the write operation.
for index, row in data.iterrows():
    url = row['Creatives']
    file_name = url.split('/')[-1]
    r = requests.get(url)
    abs_file_name = lake_root + file_name
    dbutils.fs.mkdirs(abs_file_name)
    if r.status_code == 200:
        with open(abs_file_name, "wb") as f:
            f.write(r.content)
When using dbutils.fs.mkdirs() on blob storage with the full file path, the final path segment is not created as a usable folder. It creates an entry with that name which is treated as a directory, so the path can no longer be opened as a file. Look at the following demonstration:
dbutils.fs.mkdirs('/mnt/repro/s1/s2/s3.csv')
When I then try to open this path as a file, the error says that it is a directory.
This is likely the issue with the code. So, try using the following code instead:
for index, row in data.iterrows():
    url = row['Creatives']
    file_name = url.split('/')[-1]
    r = requests.get(url)
    abs_file_name = lake_root + 'fail'  # create a dummy directory (to counter the problem we are facing above)
    dbutils.fs.mkdirs(abs_file_name)
    if r.status_code == 200:
        with open(lake_root + file_name, "wb") as f:
            f.write(r.content)
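If the only goal is to make sure the destination folder exists before writing, one possible simplification is to create it once outside the loop, mirroring the path usage in the question (this assumes lake_root is the same mounted folder path, ending with a slash):
# Sketch: ensure the parent folder exists once, then write each file into it.
dbutils.fs.mkdirs(lake_root)
for index, row in data.iterrows():
    url = row['Creatives']
    file_name = url.split('/')[-1]
    r = requests.get(url)
    if r.status_code == 200:
        with open(lake_root + file_name, "wb") as f:
            f.write(r.content)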

Cannot download_file from S3 to lambda after notification that S3 object was created

I have configured SES to put some emails into S3 bucket and set a S3 trigger to fire lambda function on object created. In lambda, I need to parse and process the email. Here is my lambda (relevant part):
s3client = boto3.client('s3')

def lambda_handler(event, context):
    my_bucket = s3.Bucket(‘xxxxxxxx')
    my_key = event['Records'][0]['s3']['object']['key']
    filename = '/tmp/' + my_key
    logger.info('Target file: ' + filename)
    s3client.download_file(my_bucket, my_key, filename)
    # Process email file
download_file throws an exception:
expected string or bytes-like object: TypeError
Traceback (most recent call last):
  File "/var/task/lambda_function.py", line 22, in lambda_handler
    s3client.download_file(my_bucket, my_key, filename)
  ...
  File "/var/runtime/botocore/handlers.py", line 217, in validate_bucket_name
    if VALID_BUCKET.search(bucket) is None:
TypeError: expected string or bytes-like object
Any idea what is wrong? The bucket is fine, object exists in the bucket.
The error is related to the bucket name (and you have a strange curly quote in your code).
The recommended way to retrieve the object details is:
for record in event['Records']:
    bucket = record['s3']['bucket']['name']
    key = record['s3']['object']['key']
    ...
    s3_client.download_file(bucket, key, download_path)
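One caveat worth adding (not part of the original answer): the object key delivered in an S3 event notification is URL-encoded, so keys containing spaces or special characters are usually decoded before being passed to the S3 client, for example:
import urllib.parse

# S3 event notifications URL-encode the object key (spaces arrive as '+'),
# so decode it before using it in API calls.
key = urllib.parse.unquote_plus(record['s3']['object']['key'])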
Edit: My first answer was probably wrong, here's another attempt
The validation function that throws the exception can be found in botocore's handlers.py:
# From the S3 docs:
# The rules for bucket names in the US Standard region allow bucket names
# to be as long as 255 characters, and bucket names can contain any
# combination of uppercase letters, lowercase letters, numbers, periods
# (.), hyphens (-), and underscores (_).
VALID_BUCKET = re.compile(r'^[a-zA-Z0-9.\-_]{1,255}$')

# [I excluded unrelated code here]

def validate_bucket_name(params, **kwargs):
    if 'Bucket' not in params:
        return
    bucket = params['Bucket']
    if VALID_BUCKET.search(bucket) is None:
        error_msg = (
            'Invalid bucket name "%s": Bucket name must match '
            'the regex "%s"' % (bucket, VALID_BUCKET.pattern))
        raise ParamValidationError(report=error_msg)
boto3 uses the S3Transfer Download Manager under the hood, which then uses the download method that is defined as follows:
def download(self, bucket, key, fileobj, extra_args=None,
             subscribers=None):
    """Downloads a file from S3

    :type bucket: str
    :param bucket: The name of the bucket to download from
    ...
It expects the bucket parameter to be a string and you're passing an s3.Bucket(‘xxxxxxxx') object, which probably isn't a string.
I'd try to pass the bucket name to download_file as a string.
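In other words, a minimal sketch of the handler with that change (the bucket name here is just a placeholder string):
s3client = boto3.client('s3')

def lambda_handler(event, context):
    # Pass the bucket name as a plain string rather than an s3.Bucket(...) object.
    my_bucket = 'xxxxxxxx'  # placeholder: your bucket name
    my_key = event['Records'][0]['s3']['object']['key']
    filename = '/tmp/' + my_key
    s3client.download_file(my_bucket, my_key, filename)
    # Process email file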
Old and most likely wrong answer as pointed out in the comments
Some sample code in the Boto Documentation shows us how downloads from S3 can be performed:
import boto3
import botocore

BUCKET_NAME = 'my-bucket'   # replace with your bucket name
KEY = 'my_image_in_s3.jpg'  # replace with your object key

s3 = boto3.resource('s3')
try:
    s3.Bucket(BUCKET_NAME).download_file(KEY, 'my_local_image.jpg')
except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == "404":
        print("The object does not exist.")
    else:
        raise
Looking at your code, it seems as if you're calling the download_file method the wrong way; you need to call the method on the Bucket object, like this:
s3client = boto3.client('s3')

def lambda_handler(event, context):
    my_bucket = s3.Bucket(‘xxxxxxxx')
    my_key = event['Records'][0]['s3']['object']['key']
    filename = '/tmp/' + my_key
    logger.info('Target file: ' + filename)
    my_bucket.download_file(my_key, filename)
    # Process email file
The important part is my_bucket.download_file(my_key, filename)

How to convert raw emails (MIME) from AWS SES to Gmail?

I have a gmail account linked to my domain account.
AWS SES will send messages to my S3 bucket. From there, SNS will forward the message in a raw format to my gmail address.
How do I automatically convert the raw message into a standard email format?
The raw message is in the standard email format. I think what you want to know is how to parse that standard raw email into an object that you can manipulate so that you can forward it to yourself and have it look like a standard email. AWS provides a tutorial on how to forward emails with a lambda function, through SES, by first storing them in your S3 bucket: https://aws.amazon.com/blogs/messaging-and-targeting/forward-incoming-email-to-an-external-destination/
If you follow those instructions, you'll find that the email you receive comes as an attachment, not looking like a standard email. The following code is an alteration of the Python code provided by AWS that does what you're looking for (substitute this for the code provided in the tutorial):
# Copyright 2010-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# Altered from original by Adam Winter
#
# This file is licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License. A copy of the
# License is located at
#
# http://aws.amazon.com/apache2.0/
#
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
# OF ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import os
import boto3
import email
import re
import html
from botocore.exceptions import ClientError
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.application import MIMEApplication
from email.mime.image import MIMEImage

region = os.environ['Region']


def get_message_from_s3(message_id):
    incoming_email_bucket = os.environ['MailS3Bucket']
    incoming_email_prefix = os.environ['MailS3Prefix']
    if incoming_email_prefix:
        object_path = (incoming_email_prefix + "/" + message_id)
    else:
        object_path = message_id
    object_http_path = (f"http://s3.console.aws.amazon.com/s3/object/{incoming_email_bucket}/{object_path}?region={region}")
    # Create a new S3 client.
    client_s3 = boto3.client("s3")
    # Get the email object from the S3 bucket.
    object_s3 = client_s3.get_object(Bucket=incoming_email_bucket,
                                     Key=object_path)
    # Read the content of the message.
    file = object_s3['Body'].read()
    file_dict = {
        "file": file,
        "path": object_http_path
    }
    return file_dict


def create_message(file_dict):
    stringMsg = file_dict['file'].decode('utf-8')
    # Create a MIME container.
    msg = MIMEMultipart('alternative')
    sender = os.environ['MailSender']
    recipient = os.environ['MailRecipient']
    # Parse the email body.
    mailobject = email.message_from_string(file_dict['file'].decode('utf-8'))
    #print(mailobject.as_string())
    # Get original sender for reply-to
    from_original = mailobject['Return-Path']
    from_original = from_original.replace('<', '')
    from_original = from_original.replace('>', '')
    print(from_original)
    # Create a new subject line.
    subject = mailobject['Subject']
    print(subject)
    if mailobject.is_multipart():
        index = stringMsg.find('Content-Type: multipart/')
        stringBody = stringMsg[index:]
        #print(stringBody)
        stringData = 'Subject: ' + subject + '\nTo: ' + sender + '\nreply-to: ' + from_original + '\n' + stringBody
        message = {
            "Source": sender,
            "Destinations": recipient,
            "Data": stringData
        }
        return message

        # Note: this loop is unreachable because of the return above.
        for part in mailobject.walk():
            ctype = part.get_content_type()
            cdispo = str(part.get('Content-Disposition'))
            # case for each common content type
            if ctype == 'text/plain' and 'attachment' not in cdispo:
                bodyPart = MIMEText(part.get_payload(decode=True), 'plain', part.get_content_charset())
                msg.attach(bodyPart)
            if ctype == 'text/html' and 'attachment' not in cdispo:
                mt = MIMEText(part.get_payload(decode=True), 'html', part.get_content_charset())
                email.encoders.encode_quopri(mt)
                del mt['Content-Transfer-Encoding']
                mt.add_header('Content-Transfer-Encoding', 'quoted-printable')
                msg.attach(mt)
            if 'attachment' in cdispo and 'image' in ctype:
                mi = MIMEImage(part.get_payload(decode=True), ctype.replace('image/', ''))
                del mi['Content-Type']
                del mi['Content-Disposition']
                mi.add_header('Content-Type', ctype)
                mi.add_header('Content-Disposition', cdispo)
                msg.attach(mi)
            if 'attachment' in cdispo and 'application' in ctype:
                ma = MIMEApplication(part.get_payload(decode=True), ctype.replace('application/', ''))
                del ma['Content-Type']
                del ma['Content-Disposition']
                ma.add_header('Content-Type', ctype)
                ma.add_header('Content-Disposition', cdispo)
                msg.attach(ma)
    # not multipart - i.e. plain text, no attachments, keeping fingers crossed
    else:
        body = MIMEText(mailobject.get_payload(decode=True), 'UTF-8')
        msg.attach(body)

    # The file name to use for the attached message. Uses regex to remove all
    # non-alphanumeric characters, and appends a file extension.
    filename = re.sub('[^0-9a-zA-Z]+', '_', subject_original)

    # Add subject, from and to lines.
    msg['Subject'] = subject
    msg['From'] = sender
    msg['To'] = recipient
    msg['reply-to'] = mailobject['Return-Path']

    # Create a new MIME object.
    att = MIMEApplication(file_dict["file"], filename)
    att.add_header("Content-Disposition", 'attachment', filename=filename)

    # Attach the file object to the message.
    msg.attach(att)

    message = {
        "Source": sender,
        "Destinations": recipient,
        "Data": msg.as_string()
    }
    return message


def send_email(message):
    aws_region = os.environ['Region']
    # Create a new SES client.
    client_ses = boto3.client('ses', region)
    # Send the email.
    try:
        # Provide the contents of the email.
        response = client_ses.send_raw_email(
            Source=message['Source'],
            Destinations=[
                message['Destinations']
            ],
            RawMessage={
                'Data': message['Data']
            }
        )
    # Display an error if something goes wrong.
    except ClientError as e:
        print('send email ClientError Exception')
        output = e.response['Error']['Message']
    else:
        output = "Email sent! Message ID: " + response['MessageId']
    return output


def lambda_handler(event, context):
    # Get the unique ID of the message. This corresponds to the name of the file
    # in S3.
    message_id = event['Records'][0]['ses']['mail']['messageId']
    print(f"Received message ID {message_id}")
    # Retrieve the file from the S3 bucket.
    file_dict = get_message_from_s3(message_id)
    # Create the message.
    message = create_message(file_dict)
    # Send the email and print the result.
    result = send_email(message)
    print(result)
For those getting this error:
'bytes' object has no attribute 'encode'
in this line:
body = MIMEText(mailobject.get_payload(decode=True), 'UTF-8')
I was able to make it work with the change below. I am not an expert on this, so the code might need some improvement, and the email body still includes HTML tags, but at least it got delivered.
If decoding the email still fails, the error message will appear in your CloudWatch log, and you will also receive an email containing the error message.
payload = mailobject.get_payload(decode=True)
try:
    decodedPayload = payload.decode()
    body = MIMEText(decodedPayload, 'UTF-8')
    msg.attach(body)
except Exception as error:
    errorMsg = "An error occurred when decoding the email payload:\n" + str(error)
    print(errorMsg)
    body = errorMsg + "\nPlease download it manually from the S3 bucket."
    msg.attach(MIMEText(body, 'plain'))
It is up to you which information you want to add to the error email, such as the subject or the from address.
Just another hint: with the above code you will get an error because subject_original is undefined. Just delete the following lines:
# The file name to use for the attached message. Uses regex to remove all
# non-alphanumeric characters, and appends a file extension.
filename = re.sub('[^0-9a-zA-Z]+', '_', subject_original)

# Create a new MIME object.
att = MIMEApplication(file_dict["file"], filename)
att.add_header("Content-Disposition", 'attachment', filename=filename)

# Attach the file object to the message.
msg.attach(att)
As far as I understand, this code is supposed to add the original email as an attachment, which was not what I wanted.

Create case-based redirects for lowercase files in Amazon S3

Here is the situation:
I have a static website hosted on Amazon S3. All files in it have lowercase names, for example: file.html
I am looking for a script/program/tool to find all lowercase files in an S3 site and create several cased 301 redirects.
E.g. create the two files File.html and FILE.html and use the new 301 redirect feature to redirect requests with capital letters to the real lowercase file.
Please advise.
I've hacked together a script which does what you want. It's not well rounded by any means, but it should do the trick. I've put it up on GitHub at https://github.com/mikewirth/s3-caseredirect.
Usage:
python makeredirects.py access_code secret bucketname key_for_your_file
I've tried a version which uses the Redirection Rules feature, but that didn't work because there is a limit of around 20 rules. This script will therefore create LOTS of empty keys.
For completeness, and because it's so small, here's the script:
#!/usr/bin/env python
"""
This script takes a file on S3 and creates a redirect from every possible
permutation of case to the original file.

Author: Michael Wirth (https://github.com/mikewirth/s3-caseredirect/)
"""
import sys
import os.path
import argparse

try:
    import boto.s3.connection
except:
    print "boto library (http://code.google.com/p/boto/) for aws needs to be installed"
    sys.exit(1)

filenames = None


def make_case_insensitive(bucket, access, secret, key):
    """ Get filename permutations """
    global filenames
    filenames = []
    filename = os.path.basename(key)
    path = os.path.dirname(key)
    filename_permutations(filename)

    connection = boto.s3.connection.S3Connection(access, secret, True)
    b = connection.get_bucket(bucket)
    for fname in filenames:
        if fname == filename:
            continue
        k = b.new_key(os.path.join(path, fname))
        k.set_redirect(key)


def filename_permutations(filename, pos=0):
    if len(filename) == pos:
        filenames.append(filename)
    else:
        upper = filename[:pos] + filename[pos:pos+1].upper() + filename[pos+1:]
        lower = filename[:pos] + filename[pos:pos+1].lower() + filename[pos+1:]
        if upper != lower:
            filename_permutations(upper, pos+1)
            filename_permutations(lower, pos+1)
        else:
            filename_permutations(filename, pos+1)


def main():
    """ CLI """
    parser = argparse.ArgumentParser()
    parser.add_argument("access", help="AWS credentials: access code")
    parser.add_argument("secret", help="AWS credentials: secret")
    parser.add_argument("bucket", help="Name of Amazon S3 bucket")
    parser.add_argument("key", help="Name of the key to make available case-insensitively. (Starts with a slash.)")
    args = parser.parse_args()
    make_case_insensitive(args.bucket, args.access, args.secret, args.key)


if __name__ == "__main__":
    main()