How to load a zip file (containing shp) from s3 bucket to Geopandas? - amazon-s3

I zipped the name.shp, name.shx and name.dbf files and uploaded the archive to an AWS S3 bucket. Now I want to load this zip file and convert the contained shapefile into a geopandas GeoDataFrame.
I can do it perfectly if the file is a zipped geojson instead of zipped shapefile.
import io
import boto3
import geopandas as gpd
import zipfile
# Build an S3 client from explicit credentials (ak / sk are not defined in this snippet).
cliente = boto3.client("s3", aws_access_key_id=ak, aws_secret_access_key=sk)
bucket_name = 'bucketname'
object_key = 'myfolder/locations.zip'
# Download the whole zip archive into an in-memory buffer.
bytes_buffer = io.BytesIO()
cliente.download_fileobj(Bucket=bucket_name, Key=object_key, Fileobj=bytes_buffer)
# Raw bytes of the archive; this variable is not used below.
geojson = bytes_buffer.getvalue()
with zipfile.ZipFile(bytes_buffer) as zi:
    # Only the .shp member is opened; the .shx / .dbf members are never read.
    with zi.open("locations.shp") as file:
        # NOTE(review): the binary .shp content is decoded to a str and handed to
        # read_file, which presumably treats a str as a path -- consistent with the
        # "No such file or directory" error reported below. Confirm.
        print(gpd.read_file(file.read().decode('ISO-8859-9')))
I got this error:
ç­¤íEÀ¡ËÆ3À: No such file or directory

Basically geopandas package allows to read files directly from S3. And as mentioned in the answer above it allows to read zip files also. So below you can see the code which will read zip file from s3 without downloading it. You need to enter zip+s3:// in the beginning, then add the path in S3.
# Read the zipped shapefile straight from S3 via the zip+s3:// scheme.
# (Plain string literal: the original f-string had no placeholders.)
geopandas.read_file('zip+s3://bucket-name/file.zip')

You can read zip directly, no need to use zipfile. You need all parts of Shapefile, not just .shp itself. That is why it works with geojson. You just need to pass it with zip:///. So instead of
# Reads the bare .shp file by its path.
gpd.read_file('path/file.shp')
You go with
# Reads the whole zip archive through the zip:// scheme instead of a single .shp file.
gpd.read_file('zip:///path/file.zip')
I am not familiar enough with boto3 to know at which point you actually have this path, but I think it will help.

I do not know if it can be of any help, but I faced a similar problem recently, though I only wanted to read the .shp with fiona. I ended up like others zipping the relevant shp, dbf, cpg and shx on the bucket.
And to read from the bucket, I do like so:
from io import BytesIO
from pathlib import Path
from typing import List
from typing import Union
import boto3
from fiona.io import ZipMemoryFile
from pydantic import BaseSettings
from shapely.geometry import Point
from shapely.geometry import Polygon
import fiona
class S3Configuration(BaseSettings):
    """
    S3 configuration class

    Groups the connection settings used to build the S3 resource. Every string
    field defaults to an empty string and ``s3_use`` defaults to False.
    NOTE(review): as a pydantic ``BaseSettings`` subclass the values are presumably
    overridden from the environment / .env file -- confirm for the pydantic
    version in use.
    """
    # Credentials handed to boto3 when the resource is created.
    s3_access_key_id: str = ''
    s3_secret_access_key: str = ''
    # Region and endpoint handed to boto3 when the resource is created.
    s3_region_name: str = ''
    s3_endpoint_url: str = ''
    # Name of the bucket the objects are read from.
    s3_bucket_name: str = ''
    # Whether shapefiles should be read from S3 rather than from disk.
    s3_use: bool = False
# Settings instance shared by the helpers in this module.
S3_CONF = S3Configuration()
S3_STR = 's3'
S3_SESSION = boto3.session.Session()
# Every setting defaults to '' when it is not configured. Passing '' through
# to boto3 is not the same as leaving the option unset (an empty endpoint URL
# is rejected as invalid), so unset values are mapped to None to let boto3
# fall back to its own defaults.
S3 = S3_SESSION.resource(
    service_name=S3_STR,
    aws_access_key_id=S3_CONF.s3_access_key_id or None,
    aws_secret_access_key=S3_CONF.s3_secret_access_key or None,
    endpoint_url=S3_CONF.s3_endpoint_url or None,
    region_name=S3_CONF.s3_region_name or None,
    use_ssl=True,
    verify=True,
)
# Bucket every object lookup goes through.
BUCKET = S3_CONF.s3_bucket_name
# Return type alias used by the shapefile loaders.
CordexShape = Union[Polygon, List[Polygon], List[Point]]
# Suffix of the archive that bundles the shapefile parts on the bucket.
ZIP_EXT = '.zip'
def get_shapefile_data(file_path: Path, s3_use: bool = S3_CONF.s3_use) -> CordexShape:
    """
    Retrieves the shapefile content associated to the passed file_path (either on disk or on S3).
    file_path is a .shp file.

    :param file_path: path of the .shp file; on S3 the sibling ``.zip`` archive is fetched instead
    :param s3_use: read from S3 when true, from disk otherwise. Defaults to the
        configured ``S3_CONF.s3_use`` (the original used that value as the
        annotation instead of as the default).
    :return: the records loaded from the shapefile
    """
    if s3_use:
        # The bucket stores the shapefile parts bundled as <name>.zip.
        zipped_data = get_s3_object(file_path.with_suffix(ZIP_EXT))
        return load_zipped_shp(zipped_data, file_path)
    return load_shp(file_path)
def get_s3_object(file_path: Path) -> bytes:
    """
    Download the bucket object matching file_path and return its content as bytes.

    The object key is derived from file_path with forge_key.
    """
    s3_object = S3.Object(bucket_name=BUCKET, key=forge_key(file_path))
    response = s3_object.get()
    return response['Body'].read()
def forge_key(file_path: Path) -> str:
    """
    Edit this code at your convenience to forge the bucket key out of the passed file_path

    The key is file_path with its first two path components removed.

    :param file_path: path to turn into a bucket key
    :return: the remaining path as a string
    """
    # Build the prefix as a single path: passing several positional arguments
    # to relative_to() is deprecated since Python 3.12.
    prefix = Path(*file_path.parts[:2])
    return str(file_path.relative_to(prefix))
def load_shp(file_path: Path) -> CordexShape:
    """
    Open the shapefile at file_path with fiona and return all of its records as a list
    """
    with fiona.open(file_path) as collection:
        return list(collection)
def load_zipped_shp(zipped_data: bytes, file_path: Path) -> CordexShape:
    """
    Open the in-memory zip archive with fiona and return, as a list, all records of
    the member named after file_path (only its file name is used)
    """
    archive = BytesIO(zipped_data)
    with ZipMemoryFile(archive) as zip_memory_file, \
            zip_memory_file.open(file_path.name) as collection:
        return list(collection)
There is quite a lot of code, but the first part makes it easy to use a MinIO proxy for local development (you just have to change the .env).
The key to solving the issue for me was fiona's ZipMemoryFile, which is not well documented (in my opinion) but was a life saver (in my case :)).

Related

how can I get a s3 zip file and attached it in my email using boto3?

I'm trying to get a zip file from my S3 bucket and then attach it to my email using boto3. I tried this, but it doesn't work:
# Container message the attachment is added to (MIMEMultipart is not imported in this snippet).
msg = MIMEMultipart()
def get_object(bucket,key):
    # Create an S3 client and return the raw get_object response for the key.
    client = boto3.client("s3")
    return client.get_object(Bucket=bucket, Key=key)
# BUCKET and key are not defined in this snippet.
file = get_object(BUCKET,key)
from email import encoders
from email.mime.base import MIMEBase
# NOTE(review): MIMEBase is documented as taking both a main type and a subtype;
# only the main type is passed here, which is a likely cause of the failure
# described above -- confirm.
msg_1 = MIMEBase('application')
# The whole object body is read into memory and used as the payload.
msg_1.set_payload(file['Body'].read())
encoders.encode_base64(msg_1)
msg_1.add_header('Content-Disposition', 'attachment',
                 filename='file.zip')
msg.attach(msg_1)

Delete Cache-Control metadata from S3 using boto3 [duplicate]

boto3 documentation does not clearly specify how to update the user metadata of an already existing S3 Object.
It can be done using the copy_from() method -
import boto3
# Update the user metadata of an existing object by copying it onto itself.
s3 = boto3.resource('s3')
s3_object = s3.Object('bucket-name', 'key')
s3_object.metadata['id'] = 'value'
copy_source = {'Bucket': 'bucket-name', 'Key': 'key'}
s3_object.copy_from(
    CopySource=copy_source,
    Metadata=s3_object.metadata,
    MetadataDirective='REPLACE',
)
You can do this using copy_from() on the resource (like this answer) mentions, but you can also use the client's copy_object() and specify the same source and destination. The methods are equivalent and invoke the same code underneath.
import boto3
# Replace the metadata of an object by copying it onto itself with the client API.
s3 = boto3.client("s3")
src_bucket = "my-bucket"
src_key = "my-key"
copy_source = {"Bucket": src_bucket, "Key": src_key}
new_metadata = {"my_new_key": "my_new_val"}
s3.copy_object(
    Bucket=src_bucket,
    Key=src_key,
    CopySource=copy_source,
    Metadata=new_metadata,
    MetadataDirective="REPLACE",
)
The 'REPLACE' value specifies that the metadata passed in the request should overwrite the source metadata entirely. If you mean to only add new key-values, or delete only some keys, you'd have to first read the original data, edit it and call the update.
To replace only a subset of the metadata correctly:
Retrieve the original metadata with head_object(Key=src_key, Bucket=src_bucket). Also take note of the Etag in the response
Make desired changes to the metadata locally.
Call copy_object as above to upload the new metadata, but pass CopySourceIfMatch=original_etag in the request to ensure the remote object has the metadata you expect before overwriting it. original_etag is the one you got in step 1. In case the metadata (or the data itself) has changed since head_object was called (e.g. by another program running simultaneously), copy_object will fail with an HTTP 412 error.
Reference: boto3 issue 389
Similar to this answer but with the existing Metadata preserved while modifying only what is needed. From the system defined meta data, I've only preserved ContentType and ContentDisposition in this example. Other system defined meta data can also be preserved similarly.
import boto3
# Modify some user metadata of an existing object while keeping the rest.
# bucket_name and object_name are expected to be defined by the caller.
s3 = boto3.client('s3')
response = s3.head_object(Bucket=bucket_name, Key=object_name)
response['Metadata']['new_meta_key'] = "new_value"
response['Metadata']['existing_meta_key'] = "new_value"
# Forward only the system-defined headers actually present in the head_object
# response: indexing a header that is not set on the object (typically
# ContentDisposition) would raise KeyError.
preserved_headers = {
    name: response[name]
    for name in ('ContentDisposition', 'ContentType')
    if name in response
}
result = s3.copy_object(Bucket=bucket_name, Key=object_name,
                        CopySource={'Bucket': bucket_name,
                                    'Key': object_name},
                        Metadata=response['Metadata'],
                        MetadataDirective='REPLACE', TaggingDirective='COPY',
                        **preserved_headers)
You can either update metadata by adding something or updating a current metadata value with a new one, here is the piece of code I am using :
import sys
import os
import boto3
import pprint
from boto3 import client
from botocore.utils import fix_s3_host
param_1= YOUR_ACCESS_KEY
param_2= YOUR_SECRETE_KEY
param_3= YOUR_END_POINT
param_4= YOUR_BUCKET
# S3 client bound to the endpoint and credentials configured above
s3ressource = client(
    service_name='s3',
    aws_access_key_id=param_1,
    aws_secret_access_key=param_2,
    endpoint_url=param_3,
    use_ssl=True,
)
# Building a list of of object per bucket
def BuildObjectListPerBucket (variablebucket):
    """Collect the .jpg / .png keys of a bucket and delete every other object.

    WARNING: every object whose key does not end with .jpg or .png is DELETED
    from the bucket as a side effect.

    The result is stored in the module-level ``listofObjectstobeanalyzed``
    (read by the driver code) and also returned.

    :param variablebucket: name of the bucket to scan
    :return: list of the keys ending with .jpg or .png
    """
    global listofObjectstobeanalyzed
    listofObjectstobeanalyzed = []
    extensions = ('.jpg', '.png')
    # Paginate so that buckets with more objects than a single listing page are
    # fully covered.
    paginator = s3ressource.get_paginator('list_objects')
    for page in paginator.paginate(Bucket=variablebucket):
        # "Contents" is absent from the response when nothing is listed.
        for entry in page.get('Contents', []):
            onemoreObject = entry['Key']
            if onemoreObject.endswith(extensions):
                listofObjectstobeanalyzed.append(onemoreObject)
            else:
                s3ressource.delete_object(Bucket=variablebucket, Key=onemoreObject)
    return listofObjectstobeanalyzed
# upload the local file named objectname under the same key, with initial metadata
def createmetdata(bucketname,objectname):
    """Upload ``objectname`` to ``bucketname`` with three fixed metadata entries."""
    initial_metadata = {
        "metadata1": "ImageName",
        "metadata2": "ImagePROPERTIES",
        "metadata3": "ImageCREATIONDATE",
    }
    s3ressource.upload_file(
        objectname,
        bucketname,
        objectname,
        ExtraArgs={"Metadata": initial_metadata},
    )
# for a given existing object, add new metadata
def ADDmetadata(bucketname,objectname):
    """Add the ``new_metadata`` entry to the user metadata of an existing object.

    The current metadata is read with head_object, extended locally and written
    back by copying the object onto itself with MetadataDirective='REPLACE'.

    :param bucketname: bucket holding the object
    :param objectname: key of the object to update
    """
    # Only the headers are needed. The original also issued a get_object call
    # whose result (including an open body stream) was never used.
    k = s3ressource.head_object(Bucket=bucketname, Key=objectname)
    m = k["Metadata"]
    m["new_metadata"] = "ImageNEWMETADATA"
    s3ressource.copy_object(
        Bucket=bucketname,
        Key=objectname,
        # Dict form avoids hand-building the "bucket/key" source string.
        CopySource={'Bucket': bucketname, 'Key': objectname},
        Metadata=m,
        MetadataDirective='REPLACE',
    )
# for a given existing object, update a metadata with new value
def CHANGEmetadata(bucketname,objectname):
    """Set the ``watson_visual_rec_dic`` entry in the user metadata of an existing object.

    The current metadata is read with head_object, updated locally and written
    back by copying the object onto itself with MetadataDirective='REPLACE'.

    :param bucketname: bucket holding the object
    :param objectname: key of the object to update
    """
    # Only the headers are needed. The original also issued a get_object call
    # whose result (including an open body stream) was never used.
    k = s3ressource.head_object(Bucket=bucketname, Key=objectname)
    m = k["Metadata"]
    m.update({'watson_visual_rec_dic':'ImageCREATIONDATEEEEEEEEEEEEEEEEEEEEEEEEEE'})
    s3ressource.copy_object(
        Bucket=bucketname,
        Key=objectname,
        # Dict form avoids hand-building the "bucket/key" source string.
        CopySource={'Bucket': bucketname, 'Key': objectname},
        Metadata=m,
        MetadataDirective='REPLACE',
    )
def readmetadata (bucketname,objectname):
    """Print the user metadata dictionary of an object.

    :param bucketname: bucket holding the object
    :param objectname: key of the object to inspect
    """
    # head_object returns the metadata without opening the object's body stream.
    ALLDATAOFOBJECT = s3ressource.head_object(Bucket=bucketname, Key=objectname)
    ALLDATAOFOBJECTMETADATA = ALLDATAOFOBJECT['Metadata']
    # print() call instead of the Python 2 print statement (a syntax error on Python 3).
    print(ALLDATAOFOBJECTMETADATA)
# Build the per-bucket object list, then show the metadata of every listed
# object before and after each modification.
for objectitem in BuildObjectListPerBucket(param_4):
    readmetadata(param_4, objectitem)
    for modify in (ADDmetadata, CHANGEmetadata):
        modify(param_4, objectitem)
        readmetadata(param_4, objectitem)

Python boto3 load model tar file from s3 and unpack it

I am using Sagemaker and have a bunch of model.tar.gz files that I need to unpack and load in sklearn. I've been testing using list_objects with delimiter to get to the tar.gz files:
# List the keys below the models prefix.
response = s3.list_objects(
    Bucket = bucket,
    Prefix = 'aleks-weekly/models/',
    Delimiter = '.csv'
)
# "Contents" is missing from the response when no key is returned, so fall
# back to an empty list instead of raising KeyError.
for i in response.get('Contents', []):
    print(i['Key'])
And then I plan to extract with
import io
import tarfile
# tarfile.open() treats its first positional argument as a file NAME, so the
# archive bytes have to be wrapped in a file object and passed as fileobj.
# The default mode detects the gzip compression transparently.
with tarfile.open(fileobj=io.BytesIO(model.read())) as tf:
    tf.extractall()
But how do I get to the actual tar.gz file from S3 instead of some boto3 object?
You can download objects to files using s3.download_file(). This will make your code look like:
import os

import boto3

s3 = boto3.client('s3')
bucket = 'my-bukkit'
prefix = 'aleks-weekly/models/'
# List objects matching your criteria
response = s3.list_objects(
    Bucket = bucket,
    Prefix = prefix,
    Delimiter = '.csv'
)
# Iterate over each file found and download it.
# "Contents" is absent when nothing matches, hence the empty-list fallback.
for i in response.get('Contents', []):
    key = i['Key']
    if key.endswith('/'):
        # Key ending with a slash: nothing to write to a local file.
        continue
    dest = os.path.join('/tmp', key)
    # The key carries the prefix directories; create them locally first,
    # otherwise download_file cannot create the destination file.
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    print("Downloading file",key,"from bucket",bucket)
    s3.download_file(
        Bucket = bucket,
        Key = key,
        Filename = dest
    )

Read and parse CSV file in S3 without downloading the entire file using Python

So, I want to read a large CSV file from an S3 bucket, but I don't want the file to be completely downloaded into memory; what I want to do is stream the file in chunks and process it as it arrives.
This is what I have done so far, but I don't think it is going to solve the problem.
import logging
import boto3
import codecs
import os
import csv
# Root logger, set to INFO level.
LOGGER = logging.getLogger()
LOGGER.setLevel(logging.INFO)
# Module-level S3 client, created once when the module is loaded.
s3 = boto3.client('s3')
def lambda_handler(event, context):
    """Stream a CSV object named in an S3 event and process it in chunks of lines.

    Objects whose key does not end with '.csv' are ignored. Lines are collected
    into batches of ``chunksize`` and each batch is handed to process_chunk,
    including the final, possibly shorter, batch.

    :param event: S3 event; bucket name and object key are read from its first record
    :param context: Lambda context (unused)
    """
    from urllib.parse import unquote_plus

    # retrieve bucket name and file_key from the S3 event
    s3_info = event['Records'][0]['s3']
    bucket_name = s3_info['bucket']['name']
    # Object keys arrive URL-encoded in S3 event notifications.
    file_key = unquote_plus(s3_info['object']['key'])
    chunk, chunksize = [], 1000
    if not file_key.endswith('.csv'):
        return
    LOGGER.info('Reading {} from {}'.format(file_key, bucket_name))
    # get the object
    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    file_object = obj['Body']
    # Iterating the body directly yields fixed-size byte chunks, not lines;
    # iter_lines() streams the object line by line.
    for line in file_object.iter_lines():
        chunk.append(line)
        if len(chunk) == chunksize:
            process_chunk(chunk)
            del chunk[:]
    # Flush the last, partially filled chunk (the original silently dropped it).
    if chunk:
        process_chunk(chunk)
def process_chunk(chuck):
    """Print the number of items in the given chunk."""
    item_count = len(chuck)
    print(item_count)
This will do what you want to achieve. It wont download the whole file in the memory, instead will download in chunks, process and proceed:
from smart_open import smart_open
import csv
def get_s3_file_stream(s3_path):
    """
    This function will return a stream of the s3 file.
    The s3_path should be of the format: '<bucket_name>/<file_path_inside_the_bucket>'

    aws_access_key_id and aws_secret_access_key are expected to be defined at
    module level.
    """
    # This is the full path with credentials. They are separated from the
    # bucket by '@' (s3://key:secret@bucket/path); the original used '#'.
    complete_s3_path = 's3://' + aws_access_key_id + ':' + aws_secret_access_key + '@' + s3_path
    return smart_open(complete_s3_path, encoding='utf8')
def download_and_process_csv(s3_path):
    """
    Lazily read the CSV at s3_path and yield process_csv(row) for every row.

    The original definition had no parameter list (a syntax error) and used an
    undefined s3_path; the path is now an explicit parameter.

    :param s3_path: '<bucket_name>/<file_path_inside_the_bucket>', as expected by get_s3_file_stream
    """
    datareader = csv.DictReader(get_s3_file_stream(s3_path))
    for row in datareader:
        yield process_csv(row)  # write a function to do whatever you want to do with the CSV
Did you try AWS Athena (https://aws.amazon.com/athena/)?
It is extremely good: serverless and pay-as-you-go. It does everything you want without downloading the file.
BlazingSQL is open source and is also useful for big-data problems.

Create a cased redirection for small cased file in Amazon S3

Here is the situation:
I have a static website host on Amazon S3. All files in it are small letters, for example: file.html
I am looking for a script/program/tool to find all small letter files in a S3 site and create several cased 301 redirection.
E.g. Create File.html and FILE.html two files and use the new 301 redirect feature to redirect the requests with capital letters to small letters real file.
Please advise.
I've hacked together a script which does what you want. It's not well rounded by any means, but it should do the trick. I've put it up on GitHub at https://github.com/mikewirth/s3-caseredirect.
Usage:
python makeredirects.py access_code secret bucketname key_for_your_file
I've tried a version which uses the Redirection Rules feature, but that didn't work because there is a limit of around 20 rules. This script will therefore create LOTS of empty keys.
For completeness and because it's so small here's the script:
#!/usr/bin/env python
"""
This script takes a file on S3 and creates a redirect from every possible
permutation of case to the original file.
Author: Michael Wirth (https://github.com/mikewirth/s3-caseredirect/)
"""
import sys
import os.path
import argparse
# Fail early with a readable message when boto is not installed.
try:
    import boto.s3.connection
except ImportError:
    # Catch only the missing-module case (the original bare except hid every
    # other error) and use the print() function so the script parses on Python 3.
    print("boto library (http://code.google.com/p/boto/) for aws needs to be installed")
    sys.exit(1)
# Filled by filename_permutations() with every case variant of a file name.
filenames = None
def make_case_insensitive(bucket, access, secret, key):
    """ Get filename permutations

    Creates, for every case variant of the file name of ``key`` except the
    original one, a key in the same "directory" that redirects to ``key``.

    :param bucket: name of the S3 bucket
    :param access: AWS access key
    :param secret: AWS secret key
    :param key: key of the existing file the variants redirect to
    """
    # S3 keys always use '/' as separator, so use posixpath rather than the
    # platform-dependent os.path (which joins with '\\' on Windows).
    import posixpath

    global filenames
    filenames = []
    filename = posixpath.basename(key)
    path = posixpath.dirname(key)
    # Populates the module-level ``filenames`` list.
    filename_permutations(filename)
    connection = boto.s3.connection.S3Connection(access, secret, is_secure=True)
    b = connection.get_bucket(bucket)
    for fname in filenames:
        if fname == filename:
            # The original file itself must not become a redirect.
            continue
        k = b.new_key(posixpath.join(path, fname))
        k.set_redirect(key)
def filename_permutations(filename, pos=0):
    """ Append every upper/lower case variant of filename to the global filenames list.

    Characters before ``pos`` are left untouched; each later character that has
    distinct upper and lower forms is tried in both, upper case first.
    """
    if pos == len(filename):
        filenames.append(filename)
        return
    head, char, tail = filename[:pos], filename[pos:pos+1], filename[pos+1:]
    upper = head + char.upper() + tail
    lower = head + char.lower() + tail
    if upper == lower:
        # Caseless character: keep it and move on.
        filename_permutations(filename, pos+1)
        return
    filename_permutations(upper, pos+1)
    filename_permutations(lower, pos+1)
def main():
    """ CLI: parse access, secret, bucket and key, then create the redirects """
    positional_arguments = (
        ("access", "AWS credentials: access code"),
        ("secret", "AWS credentials: secret"),
        ("bucket", "Name of Amazon S3 bucket"),
        ("key", "Name of the key to make available case-insensitively. (Starts with a slash.)"),
    )
    parser = argparse.ArgumentParser()
    for arg_name, arg_help in positional_arguments:
        parser.add_argument(arg_name, help=arg_help)
    args = parser.parse_args()
    make_case_insensitive(args.bucket, args.access, args.secret, args.key)


if __name__ == "__main__":
    main()