Process CSV line by line from S3 using python on Lambda - amazon-s3

I am trying to process a .csv file (30 MB) that sits in an S3 bucket using AWS Lambda (Python). I wrote my Python code locally to process the file and am now trying to execute it in Lambda, but I am having a hard time reading the file line by line.
Please let me know how I can traverse the file line by line using boto3 or S3 methods. Thanks
In Lambda:
s3 = boto3.client("s3")
file_obj = event["Records"][0]
filename=str(file_obj['s3']['object']['key'])
#print('file name is :', filename)
fileObj = s3.get_object(Bucket=<mybucket>, Key=filename)
file_content = fileObj["Body"].read().decode('utf-8')
My original code:
import csv
import pandas as pd
import datetime
#from datetime import datetime, timedelta
import numpy as np

with open('sample.csv', 'r') as file_name:
    csv_reader = csv.reader(file_name, delimiter=',')
    Time = []
    Latitude = []
    Longitude = []
    Org_Units = []
    Org_Unit_Type = []
    Variable_Name = []
    # New columns
    Year = []
    Month = []
    Day = []
    Celsius = []
    Far = []
    Conv_Units = []
    Conv_Unit_Type = []
    header = ['Time', 'Latitude', 'Longitude', 'Org_Units', 'Org_Unit_Type', 'Conv_Units', 'Conv_Unit_Type', 'Variable_Name']
    out_filename = 'Write' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # need to rename based on the org file name
    with open(out_filename + '.csv', 'w') as csvFile:
        outputwriter = csv.writer(csvFile, delimiter=',')
        outputwriter.writerow(header)
        next(csv_reader, None)  # skip the header row
        for row in csv_reader:
            # print(row)
            Time = row[0]
            Org_Lat = row[1]
            Org_Long = row[2]
            Org_Units = row[3]
            Org_Unit_Type = row[4]
            Variable_Name = row[5]
            # print(Time, Org_Lat, Org_Long, Org_Units, Org_Unit_Type, Variable_Name)
            if Org_Unit_Type == 'm s-1':
                Conv_Units = round(float(Org_Units) * 1.151, 2)
                Conv_Unit_Type = 'miles'
            if Org_Unit_Type == 'm':
                Conv_Units = round(float(Org_Units) / 1609.344, 2)
                # print(Org_Units, Conv_Units)
                Conv_Unit_Type = 'miles'
            if Org_Unit_Type == 'Pa':
                Conv_Units = round(float(Org_Units) / 6894.757, 2)
                Conv_Unit_Type = 'Psi'
            # print(type(Time))
            date_time_obj = datetime.datetime.strptime(Time, '%m-%d-%Y, %H:%M')
            # Year = time.strptime(date_time_obj, "%B")
            # print(date_time_obj)
            # write the parsed values (not the empty Latitude/Longitude lists)
            f_row = [Time, Org_Lat, Org_Long, Org_Units, Org_Unit_Type, Conv_Units, Conv_Unit_Type, Variable_Name]
            outputwriter.writerow(f_row)
        csvFile.close()  # redundant inside a with-block, but harmless
    print("done")

I think this should work. The only thing you need to check is that your Lambda has a role with a policy that grants read access on the S3 bucket.
Initially, for testing, I would give the Lambda full access to S3 (AmazonS3FullAccess):
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": "s3:*",
            "Resource": "*"
        }
    ]
}
Python code:
import boto3

s3 = boto3.client('s3')

def lambda_handler(event, context):
    # Get the object that triggered the event
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']
    obj = s3.get_object(Bucket=bucket, Key=key)
    rows = obj['Body'].read().decode('utf-8').split('\n')
    print("rows:", rows)

Rather than reading the whole object into memory with .read(), you might find it easier to download the object to local storage:
s3_client = boto3.client('s3', region_name='ap-southeast-2')
s3_client.download_file(bucket, key, '/tmp/local_file.csv')
You can then use your original program to process the file.
Once you have finished, be sure to delete the temporary file, because the AWS Lambda container might be reused and there is only 500 MB of disk space available in /tmp.
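A rough sketch of that flow inside a handler (the bucket and key come from the S3 event; the per-row print is just a stand-in for the processing in your original script):
import csv
import os

import boto3

s3_client = boto3.client('s3', region_name='ap-southeast-2')

def lambda_handler(event, context):
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']

    local_path = '/tmp/local_file.csv'  # /tmp is the only writable directory in Lambda
    s3_client.download_file(bucket, key, local_path)
    try:
        with open(local_path, 'r') as f:
            for row in csv.reader(f):
                print(row)  # replace with the processing from your original script
    finally:
        os.remove(local_path)  # clean up in case the container is reused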

Related

Read pdf object from S3

I am trying to create a Lambda function that will access a PDF form uploaded to S3, strip out the data entered into the form, and send it elsewhere.
I am able to do this when I can download the file locally. So the script below works and lets me read the data from the PDF into my pandas dataframe:
import boto3
import PyPDF2 as pypdf
import pandas as pd

s3 = boto3.resource('s3')
s3.meta.client.download_file(bucket_name, asset_key, './target.pdf')
pdfobject = open("./target.pdf", 'rb')
pdf = pypdf.PdfFileReader(pdfobject)
data = pdf.getFormTextFields()
pdf_df = pd.DataFrame(data, columns=get_cols(data), index=[0])
But with Lambda I cannot save the file locally, because I get a "read-only filesystem" error.
I have tried using the s3.get_object() method like below:
s3_response_object = s3.get_object(
    Bucket='pdf-forms-bucket',
    Key='target.pdf',
)
pdf_bytes = s3_response_object['Body'].read()
But I have no idea how to convert the resulting bytes into an object that can be parsed with PyPDF2. The output that I need, and that PyPDF2 will produce, is like below:
{'form1[0].#subform[0].nameandmail[0]': 'Burt Lancaster',
'form1[0].#subform[0].mailaddress[0]': '675 Creighton Ave, Washington DC',
'form1[0].#subform[0].Principal[0]': 'David St. Hubbins',
'Principal[1]': None,
'form1[0].#subform[0].Principal[2]': 'Bart Simpson',
'Principal[3]': None}
So in summary, I need to be able to read a PDF with fillable forms into memory and parse it without downloading the file, because my Lambda function environment won't allow local temp files.
Solved:
This does the trick:
import boto3
from PyPDF2 import PdfFileReader
from io import BytesIO
bucket_name ="pdf-forms-bucket"
item_name = "form.pdf"
s3 = boto3.resource('s3')
obj = s3.Object(bucket_name, item_name)
fs = obj.get()['Body'].read()
pdf = PdfFileReader(BytesIO(fs))
data = pdf.getFormTextFields()
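The extracted fields can then go into a DataFrame much like the local version did (a sketch; here the columns are simply taken from the dict keys rather than the original get_cols() helper):
import pandas as pd

# data is the dict returned by pdf.getFormTextFields() above
pdf_df = pd.DataFrame(data, columns=list(data.keys()), index=[0])
print(pdf_df.head())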

from python aws-lambda how to read files on S3 bucket using GeoPandas reader?

There is a lot of literature on the topic, but none of the examples I could find use a GeoPandas reader.
The purpose of my code is to identify whether a point is located inside a polygon described in a .shp file stored in S3. It is then expected to return a boolean True or False.
I use the python-lambda-local module to test my Python script locally from PyCharm.
import geopandas as gpd
from geopandas.geoseries import *
import boto3
from io import BytesIO

def search(event, context):
    dep = event['Dep']
    arr = event['Arr']
    point_1 = GeoSeries(dep)
    point_2 = GeoSeries(arr)
    s3 = boto3.client("s3")
    bucket = "mybucket"
    obj_key = "filename.shp"
    # bytes_buffer = BytesIO()
    # client.download_fileobj(Bucket=bucket, Key=obj_key, Fileobj=bytes_buffer)
    obj = s3.download_file(Bucket=bucket, Key="filename.shp", Filename=obj_key)
    geo = obj['body'].read().decode('ISO-8859-9')
    # geo = bytes_buffer.get_key(obj_key).get_contents_as_string()
    answer = gpd.read_file(geo)
    print(answer)
As you can see in the code, I tried a few different ways of combining BytesIO and the reader, always unsuccessfully.
And this is the error message:
MacBook-Pro:IdPolygons me$ python-lambda-local -l lib/ -f search -t 4 IdAircraft.py event.json
This is the point I'm trying to identify as inside or outside the polygon:
[root - INFO - 2019-12-24 07:33:54,388] Event: {'Dep': '(40.7128, 74.0060)', 'Arr': '(48.8566, 2.3522)'}
[root - INFO - 2019-12-24 07:33:54,388] START RequestId: 629c76c7-1008-40cc-8c09-6c5dd3877125 Version:
[botocore.credentials - INFO - 2019-12-24 07:33:54,923] Found credentials in shared credentials file: ~/.aws/credentials stored
[root - INFO - 2019-12-24 07:33:55,576] END RequestId: 629c76c7-1008-40cc-8c09-6c5dd3877125
[root - INFO - 2019-12-24 07:33:55,577] REPORT RequestId: 629c76c7-1008-40cc-8c09-6c5dd3877125 Duration: 663.91 ms
[root - INFO - 2019-12-24 07:33:55,577] RESULT:
{
    "errorMessage": "'NoneType' object has no attribute 'startswith'",
    "stackTrace": [
        " File \"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/lambda_local/main.py\", line 153, in execute\n result = func(event, context._activate())\n",
        " File \"IdAircraft.py\", line 30, in search\n df1 = gpd.read_file(obj)\n",
        " File \"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/geopandas/io/file.py\", line 77, in read_file\n with reader(path_or_bytes, **kwargs) as features:\n",
        " File \"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/fiona/env.py\", line 397, in wrapper\n return f(*args, **kwargs)\n",
        " File \"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/fiona/__init__.py\", line 249, in open\n path = parse_path(fp)\n",
        " File \"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/fiona/path.py\", line 132, in parse_path\n elif path.startswith('/vsi'):\n"
    ],
    "errorType": "AttributeError"
}
Thank you for taking the time.
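For reference, download_file() returns None, which is what read_file() eventually chokes on with the 'startswith' error. A minimal sketch of an alternative, assuming the companion .shx and .dbf files sit next to the .shp in the same bucket, is to download the shapefile components to /tmp and point gpd.read_file() at the local path:
import boto3
import geopandas as gpd

s3 = boto3.client("s3")
bucket = "mybucket"

def search(event, context):
    # A shapefile is really several files; .shx and .dbf are assumed to live
    # next to filename.shp in the same bucket.
    for ext in (".shp", ".shx", ".dbf"):
        s3.download_file(bucket, "filename" + ext, "/tmp/filename" + ext)

    polygons = gpd.read_file("/tmp/filename.shp")
    print(polygons)
    return polygons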

Reading multiple files from S3 bucket and processing them using Lambda trigger

I am reading multiple files from S3, processing them, and then making tables in AWS RDS from the processed dataframes. I am doing all this on macOS using PyCharm.
I want to read these CSV files from the S3 bucket and run the same Python script to process them in AWS, not on my local system. I want to use Lambda to trigger this script, and it should run only when all the needed files have been uploaded to the bucket.
How would the code vary in AWS Lambda?
My present code is as below -
import boto3
import pandas as pd
import numpy as np
import sys
client = boto3.client('s3')
resource = boto3.resource('s3')
my_bucket = resource.Bucket('test-s3')
#CREATE ALL THE NEEDED OBJECTS
obj1 = client.get_object(Bucket='test-s3', Key='file1.csv')
obj2 = client.get_object(Bucket='test-s3', Key='file2.csv')
obj3 = client.get_object(Bucket='test-s3', Key='file3.csv')
obj4 = client.get_object(Bucket='test-s3', Key='file4.csv')
obj5 = client.get_object(Bucket='test-s3', Key='file5.csv')
obj6 = client.get_object(Bucket='test-s3', Key='file6.csv')
obj7 = client.get_object(Bucket='test-s3', Key='file7.csv')
obj8 = client.get_object(Bucket='test-s3', Key='file8.csv')
obj9 = client.get_object(Bucket='test-s3', Key='file9.csv')
obj10 = client.get_object(Bucket='test-s3', Key='file10.csv')
obj11 = client.get_object(Bucket='test-s3', Key='file11.csv')
obj12 = client.get_object(Bucket='test-s3', Key='file12.csv')
obj13 = client.get_object(Bucket='test-s3', Key='file13.csv')
obj14 = client.get_object(Bucket='test-s3', Key='file14.csv')
obj15 = client.get_object(Bucket='test-s3', Key='file15.csv')
#CREATE ALL THE DATAFRAMES FROM RESPECTIVE OBJECTS
df_file1 = pd.read_csv(obj1['Body'], encoding='utf-8', sep = ',')
df_file2 = pd.read_csv(obj2['Body'], encoding='utf-8', sep = ',')
df_file3 = pd.read_csv(obj3['Body'], encoding='utf-8', sep = ',')
df_file4 = pd.read_csv(obj4['Body'], encoding='utf-8', sep = ',')
df_file5 = pd.read_csv(obj5['Body'], encoding='utf-8', sep = ',')
df_file6 = pd.read_csv(obj6['Body'], encoding='utf-8', sep = ',')
df_file7 = pd.read_csv(obj7['Body'], encoding='utf-8', sep = ',')
df_file8 = pd.read_csv(obj8['Body'], encoding='utf-8', sep = ',')
df_file9 = pd.read_csv(obj9['Body'], encoding='utf-8', sep = ',')
df_file10 = pd.read_csv(obj10['Body'], encoding='utf-8', sep = ',')
df_file11 = pd.read_csv(obj11['Body'], encoding='utf-8', sep = ',')
df_file12 = pd.read_csv(obj12['Body'], encoding='utf-8', sep = ',')
df_file13 = pd.read_csv(obj13['Body'], encoding='utf-8', sep = ',')
df_file14 = pd.read_csv(obj14['Body'], encoding='utf-8', sep = ',')
df_file15 = pd.read_csv(obj15['Body'], encoding='utf-8', sep = ',')
#+++++++++++ make a function to process the data frames ++++++++++++
def function(df_file1, df_file2):
    *** some logic ***
    return df_final
## MAKE THE TABLES IN RDS
from sqlalchemy import create_engine
import psycopg2
engine = create_engine('postgresql://USERNAME:PASSWORD#***.eu-central-1.rds.amazonaws.com:5432/DBNAME')
df_final.to_sql('table name', engine, schema='data')
I am a noob at AWS Lambda. How do I run this script on Lambda?
After taking Ninad's suggestion, I edited the script. It's as below:
import boto3
import pandas as pd
import numpy as np
import sys

client = boto3.client('s3')
resource = boto3.resource('s3')
my_bucket = resource.Bucket('test-s3')

def function(df_file1, df_file2):
    *** some logic ***
    return df_final

def lambda_handler(event, context):
    obj1 = client.get_object(Bucket='test-s3', Key='file1.csv')
    obj2 = client.get_object(Bucket='test-s3', Key='file2.csv')
    obj3 = client.get_object(Bucket='test-s3', Key='file3.csv')
    df_file1 = pd.read_csv(obj1['Body'], encoding='utf-8', sep=',')
    df_file2 = pd.read_csv(obj2['Body'], encoding='utf-8', sep=',')
    df_file3 = pd.read_csv(obj3['Body'], encoding='utf-8', sep=',')
    df_final = function(df_file1, df_file2)
    from sqlalchemy import create_engine
    import psycopg2
    engine = create_engine('postgresql://USERNAME:PASSWORD#***.eu-central-1.rds.amazonaws.com:5432/DBNAME')
    df_final.to_sql('table name', engine, schema='data')
I made a virtual environment on my local system and installed all the packages - pandas, SQLAlchemy, etc. I zipped these packages together with the script and uploaded the archive to Lambda. Now I am getting this error -
[ERROR] Runtime.ImportModuleError: Unable to import module 'lambda_function': No module named 'pandas'
I have followed the AWS package deployment link to package everything necessary. Why do I still get an error?
Use the console to create a Lambda. Select the Python version you want, make sure you have allotted enough memory, and set the timeout to 15 minutes (the maximum). When creating the Lambda it will also let you attach a role to it. Create a role and attach a policy to it that lets you access the S3 bucket where your CSVs will be.
The next step is to create a layer for your Lambda which has all the dependencies your script needs to run. Lambda has the boto3 package installed by default, but you will need to install pandas (with all its dependencies), sqlalchemy and psycopg2. You can find a simple tutorial on how to do this here.
Now that you have created a layer, attach that layer to your Lambda.
We can finally move on to your script. Since you need to read all CSV files on your S3 path, you will have to change your script to read them dynamically; currently you have hardcoded the file names. You can change your script to first get all the keys in your bucket using something like:
response = client.list_objects_v2(
    Bucket='test-s3'  # list_objects_v2 expects the bucket name as a string, not the Bucket resource
)['Contents']
This will give you a list of your keys. Filter them if you need.
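For example, a small sketch that keeps only the CSV keys:
# Keep only objects whose key ends in .csv
csv_objects = [obj for obj in response if obj['Key'].endswith('.csv')]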
Next you can create multiple dataframes by looping through the response like this:
d = {}
for idx, obj in enumerate(response):
    # str(idx) so the key concatenation doesn't raise a TypeError
    d['df_' + str(idx)] = pd.read_csv(client.get_object(Bucket='test-s3', Key=obj['Key'])['Body'], encoding='utf-8', sep=',')
This will create a dictionary d with all your dataframes. Please try this code out locally first to correct any mistakes.
Now copy your final code into the Lambda editor above def lambda_handler(): and call your function from within the lambda_handler function, as sketched below.
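Putting the pieces together, a rough outline of the final Lambda module might look like this (a sketch: the placeholder logic in function() and the connection string need to be replaced with your own):
import boto3
import pandas as pd
from sqlalchemy import create_engine

client = boto3.client('s3')
BUCKET = 'test-s3'

def function(df_file1, df_file2):
    # *** your existing processing logic goes here ***
    df_final = pd.concat([df_file1, df_file2])  # placeholder so the sketch runs
    return df_final

def lambda_handler(event, context):
    # List the bucket and keep only the CSV keys
    response = client.list_objects_v2(Bucket=BUCKET)['Contents']
    csv_keys = [obj['Key'] for obj in response if obj['Key'].endswith('.csv')]

    # One dataframe per CSV file, keyed by object key
    frames = {
        key: pd.read_csv(client.get_object(Bucket=BUCKET, Key=key)['Body'], encoding='utf-8')
        for key in csv_keys
    }

    # file1.csv / file2.csv are the names from the original script
    df_final = function(frames['file1.csv'], frames['file2.csv'])

    # Write the result to RDS (replace the placeholder connection string)
    engine = create_engine('postgresql://USERNAME:PASSWORD@HOST:5432/DBNAME')
    df_final.to_sql('table_name', engine, schema='data')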

minimal example of how to export a jupyter notebook to pdf using nbconvert and PDFExporter()

I am trying to export a pdf copy of a jupyter notebook using nbconvert from within a notebook cell. I have read the documentation, but I just cannot find some basic code to actually execute the nbconvert command and export to pdf.
I was able to get this far, but I was hoping that someone could just fill in the final gaps.
from nbconvert import PDFExporter
notebook_pdf = PDFExporter()
notebook_pdf.template_file = '../print_script/pdf_nocode.tplx'
Not sure how to get from here to actually getting the PDF created.
Any help would be appreciated.
I'm no expert, but managed to get this working. The key is that you need to preprocess the notebook which will allow you to use the PDFExporter.from_notebook_node() function. This will give you your pdf_data in byte format that can then be written to file:
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
from nbconvert import PDFExporter

notebook_filename = "notebook.ipynb"

with open(notebook_filename) as f:
    nb = nbformat.read(f, as_version=4)

ep = ExecutePreprocessor(timeout=600, kernel_name='python3')
ep.preprocess(nb, {'metadata': {'path': 'notebooks/'}})

pdf_exporter = PDFExporter()
pdf_data, resources = pdf_exporter.from_notebook_node(nb)

with open("notebook.pdf", "wb") as f:
    f.write(pdf_data)
    f.close()  # redundant inside a with-block, but harmless
It's worth noting that ep.preprocess() requires a resources dict (here just the notebook's working path), and that from_notebook_node() returns one as well; we don't use the returned resources in this example.
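If you don't need the notebook re-executed before export, the preprocessing step can presumably be dropped and the loaded notebook node passed straight to the exporter (a sketch, assuming a LaTeX installation is available for the PDF conversion):
import nbformat
from nbconvert import PDFExporter

with open("notebook.ipynb") as f:
    nb = nbformat.read(f, as_version=4)

# Export the notebook as-is, without running its cells first
pdf_data, _ = PDFExporter().from_notebook_node(nb)

with open("notebook.pdf", "wb") as f:
    f.write(pdf_data)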
The following is a REST API that converts an .ipynb file into .html:
POST: http://URL/export/<id> converts the notebook
GET: http://URL/export/<id> returns id.html
import os
from flask import Flask, render_template, make_response
from flask_cors import CORS
from flask_restful import reqparse, abort, Api, Resource
from nbconvert.exporters import HTMLExporter

exporter = HTMLExporter()

app = Flask(__name__)
cors = CORS(app, resources={r"/export/*": {"origins": "*"}})
api = Api(app)

parser = reqparse.RequestParser()
parser.add_argument('path')

notebook_file_srv = '/path of your .ipynb file'

def notebook_doesnt_exist(nb):
    abort(404, message="Notebook {} doesn't exist".format(nb))

class Notebook(Resource):
    def get(self, id):
        headers = {'Content-Type': 'text/html'}
        return make_response(render_template(id + '.html'), 200, headers)

    def post(self, id):
        args = parser.parse_args()
        notebook_file = args['path']
        notebook_file = notebook_file_srv + id + '.ipynb'
        if not os.path.exists(notebook_file):
            return 'notebook \'.ipynb\' file not found', 404
        else:
            nb_name, _ = os.path.splitext(os.path.basename(notebook_file))
            # dirname = os.path.dirname(notebook_file)
            output_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'templates')
            output_path = os.path.join(output_path, '{}.html'.format(nb_name))
            output, resources = exporter.from_filename(notebook_file)
            f = open(output_path, 'wb')
            f.write(output.encode('utf8'))
            f.close()
            return 'done', 201

api.add_resource(Notebook, '/export/<id>')

if __name__ == '__main__':
    app.run(debug=True)
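A hedged usage sketch with the requests library (the notebook id 'report' is hypothetical; report.ipynb must exist under notebook_file_srv):
import requests

# Convert report.ipynb to templates/report.html
resp = requests.post("http://localhost:5000/export/report")
print(resp.status_code)  # 201 when the conversion succeeded

# Fetch the rendered HTML produced by the POST above
html = requests.get("http://localhost:5000/export/report")
print(html.text[:200])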

How to use the PyPy as the notebook interpreter?

I have a script that extracts data from some CSV files and splits the data into different Excel files. I am using IPython for this, and I am sure it uses CPython as the default interpreter.
But the script is taking too much time for the whole process to finish. Can someone please explain how to run this script with PyPy, as I have heard it is much faster than CPython?
The script is something like this:
import pandas as pd
import xlsxwriter as xw
import csv
import pymsgbox as py

file1 = "vDashOpExel_Change_20150109.csv"
file2 = "vDashOpExel_T3Opened_20150109.csv"
path = r"C:\Users\Abhishek\Desktop\Pandas Anlaysis"  # raw string so the backslashes are not treated as escapes

def uniq(words):
    seen = set()
    for word in words:
        l = word.lower()
        if l in seen:
            continue
        seen.add(l)
        yield word

def files(file_name):
    df = pd.read_csv(path + '\\' + file_name, sep=',', encoding='utf-16')
    final_frame = df.dropna(how='all')
    file_list = list(uniq(list(final_frame['DOEClient'])))
    return file_list, final_frame

def fill_data(f_list, frame1=None, frame2=None):
    if f_list is not None:
        for client in f_list:
            writer = pd.ExcelWriter(path + '\\' + 'Accounts' + '\\' + client + '.xlsx', engine='xlsxwriter')
            if frame1 is not None:
                data1 = frame1[frame1.DOEClient == client]  # filter the data
                data1.to_excel(writer, 'Change', index=False, header=True)  # write the data to the Excel file
            if frame2 is not None:
                data2 = frame2[frame2.DOEClient == client]  # filter the data
                data2.to_excel(writer, 'Opened', index=False, header=True)  # write the data to the Excel file
    else:
        py.alert('Please enter the First Parameter !!!', 'Error')

list1, frame1 = files(file1)
list2, frame2 = files(file2)
final_list = set(list1 + list2)