ERROR in getting file from Amazon S3 using python(3.8) - amazon-s3

I am trying to get a file from Amazon S3 (such as a .pkl file) and store it in a variable.
And I am getting the following error:
ERROR:
expected str, bytes or os.PathLike object, not _io.BytesIO
CODE for S3
class S3Mgr:
    def __init__(self, bucketName):
        self.aws_access_key_id = CONFIG.S3[0]
        self.aws_secret_access_key = CONFIG.S3[1]
        self.region_name = CONFIG.S3[2]
        self.bucketName = bucketName

    def __connect(self):
        self.s3 = boto3.client(
            's3',
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
            region_name=self.region_name
        )

    def retrieveModel(self, fileName):
        self.__connect()
        a = self.s3.download_fileobj(Bucket=self.bucketName, Key="fcm/project/" + str(fileName))
        return a['Body'].read()
CODE for pickle
import pickle
from io import BytesIO

S3obj = S3Mgr("mybucket")
model = S3obj.retrieveModel("model.pkl")
data = BytesIO(model)
model = pickle.load(data)
prediction = model.predict(inputArray)
Above, inputArray is the array of inputs.

Try replacing download_fileobj with get_object in your S3Mgr class's retrieveModel method.
Something like this:
def retrieveModel(self, fileName):
    self.__connect()
    a = self.s3.get_object(Bucket=self.bucketName, Key="fcm/project/" + str(fileName))
    return a['Body'].read()
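Alternatively, if you want to keep download_fileobj, note that it does not return a response dict at all; it streams the object into a file-like object you pass as Fileobj. A minimal sketch of that variant, assuming the same bucket and key layout as above and that from io import BytesIO is available in the class's module:

def retrieveModel(self, fileName):
    self.__connect()
    buffer = BytesIO()
    # download_fileobj writes the object's bytes into the buffer you provide
    self.s3.download_fileobj(
        Bucket=self.bucketName,
        Key="fcm/project/" + str(fileName),
        Fileobj=buffer,
    )
    return buffer.getvalue()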
Hope it works. Ping me if you make any progress or need any help.
Cheers 👍!

Related

Read pdf object from S3

I am trying to create a Lambda function that will access a PDF form uploaded to S3, strip out the data entered into the form, and send it elsewhere.
I am able to do this when I can download the file locally. So the script below works and lets me read the data from the PDF into my pandas dataframe:
import boto3
import PyPDF2 as pypdf
import pandas as pd

s3 = boto3.resource('s3')
s3.meta.client.download_file(bucket_name, asset_key, './target.pdf')
pdfobject = open("./target.pdf", 'rb')
pdf = pypdf.PdfFileReader(pdfobject)
data = pdf.getFormTextFields()
pdf_df = pd.DataFrame(data, columns=get_cols(data), index=[0])
But with lambda I cannot save the file locally because I get a "read only filesystem" error.
I have tried using the s3.get_object() method like below:
s3_response_object = s3.get_object(
    Bucket='pdf-forms-bucket',
    Key='target.pdf',
)
pdf_bytes = s3_response_object['Body'].read()
But I have no idea how to convert the resulting bytes into an object that can be parsed with PyPDF2. The output that I need, and that PyPDF2 will produce, looks like this:
{'form1[0].#subform[0].nameandmail[0]': 'Burt Lancaster',
'form1[0].#subform[0].mailaddress[0]': '675 Creighton Ave, Washington DC',
'form1[0].#subform[0].Principal[0]': 'David St. Hubbins',
'Principal[1]': None,
'form1[0].#subform[0].Principal[2]': 'Bart Simpson',
'Principal[3]': None}
So in summary, I need to be able to read a PDF with fillable forms into memory and parse it without downloading the file, because my Lambda function environment won't allow local temp files.
Solved:
This does the trick:
import boto3
from PyPDF2 import PdfFileReader
from io import BytesIO
bucket_name ="pdf-forms-bucket"
item_name = "form.pdf"
s3 = boto3.resource('s3')
obj = s3.Object(bucket_name, item_name)
fs = obj.get()['Body'].read()
pdf = PdfFileReader(BytesIO(fs))
data = pdf.getFormTextFields()
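For reference, a minimal sketch of how that in-memory approach might sit inside a Lambda handler. The event parsing below assumes a standard S3 put-event trigger; adjust it to however your function is actually invoked:

import boto3
from io import BytesIO
from PyPDF2 import PdfFileReader

s3 = boto3.resource('s3')

def lambda_handler(event, context):
    # standard S3 event shape: bucket name and object key come from the first record
    record = event['Records'][0]['s3']
    bucket_name = record['bucket']['name']
    item_name = record['object']['key']

    obj = s3.Object(bucket_name, item_name)
    fs = obj.get()['Body'].read()
    pdf = PdfFileReader(BytesIO(fs))
    return pdf.getFormTextFields()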

Printing value of tensorflow variable inside object

I am trying to print the value of a TensorFlow variable that is defined inside an object. To better illustrate my issue, I am currently trying to run the monodepth library. It has two main files, dataloader and main. Basically the dataloader iterates over a text file of filenames:
class MonodepthDataloader(object):
    """monodepth dataloader"""

    def __init__(self, data_path, filenames_file, params, dataset, mode):
        self.data_path = data_path
        self.params = params
        self.dataset = dataset
        self.mode = mode
        self.left_image_batch = None
        self.right_image_batch = None

        input_queue = tf.train.string_input_producer([filenames_file], shuffle=False)
        line_reader = tf.TextLineReader()
        _, line = line_reader.read(input_queue)
        split_line = tf.string_split([line]).values
        ...
        left_image_path = tf.string_join([self.data_path, split_line[0]])
        left_image_o = self.read_image(left_image_path)
I am trying to print out left_image_path to verify that it is being generated correctly. However, this is in an object being called by monodepth_main. That is, monodepth_main calls the dataloader with the following lines:
dataloader = MonodepthDataloader(args.data_path, args.filenames_file, params, args.dataset, args.mode)
left = dataloader.left_image_batch
As a result I can't just use sess.run(x). I have also tried using tf.Print(line, [line]) but nothing shows up.
How do I print out the value of a TensorFlow variable inside an object? Specifically left_image_path?
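One pattern that usually works with this TF1-style queue input pipeline is to keep the tensor on the object and evaluate it in a session with the queue runners started. A rough sketch, assuming you add self.left_image_path = left_image_path inside __init__ (that attribute does not exist in the original code):

import tensorflow as tf

dataloader = MonodepthDataloader(args.data_path, args.filenames_file,
                                 params, args.dataset, args.mode)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    # string_input_producer / TextLineReader only produce values once the queue runners run
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    print(sess.run(dataloader.left_image_path))
    coord.request_stop()
    coord.join(threads)

tf.Print only emits output when the printed tensor is actually part of something you run, which may be why nothing showed up.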

Getting data from odo.resource(source) to odo.resource(target)

I'm trying to extend the odo library with functionality to convert a GDAL dataset (a raster with spatial information) to a NetCDF file.
Reading in the GDAL dataset goes fine, but in the creation stage of the NetCDF I need some metadata from the GDAL dataset (metadata that is not yet known when calling odo.odo(source, target)). How could I achieve this?
A short version of my code so far:
import odo
from odo import resource, append
import gdal
import netCDF4 as nc4
import numpy as np

@resource.register('.+\.tif')
def resource_gdal(uri, **kwargs):
    ds = gdal.Open(uri)
    # metadata I need to transfer to netcdf
    b = ds.GetGeoTransform()  # bbox, interval
    return ds

@resource.register('.+\.nc')
def resource_netcdf(uri, dshape=None, **kwargs):
    ds = nc4.Dataset(uri, 'w')
    # create lat lon dimensions and variables
    ds.createDimension('lat', dshape[0].val)
    ds.createDimension('lon', dshape[1].val)
    lat = ds.createVariable('lat', 'f4', ('lat',))
    lon = ds.createVariable('lon', 'f4', ('lon',))
    # create a range from the **gdal metadata**
    lat_array = np.arange(dshape[0].val) * b[1] + b[0]
    lon_array = np.arange(dshape[1].val) * b[5] + b[3]
    # assign the range to the netcdf variable
    lat[:] = lat_array
    lon[:] = lon_array
    # create the variable which will hold the gdal data
    data = ds.createVariable('data', 'f4', ('lat', 'lon',))
    return data

@append.register(nc4.Variable, gdal.Dataset)
def append_gdal_to_nc4(tgt, src, **kwargs):
    arr = src.ReadAsArray()
    tgt[:] = arr
    return tgt
Thanks!
I don't have much experience with odo, but from browsing the source code and docs it looks like resource_netcdf() should not be involved in translating gdal data to netcdf. Translating should be the job of a gdal_to_netcdf() function decorated by convert.register(). In that case, the gdal.Dataset object returned by resource_gdal would carry all the information (georeferencing, pixel size) needed to make a netcdf.
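Whichever registration you settle on, the key point is that the gdal.Dataset itself carries the geotransform, so the receiving side can derive the coordinate arrays on its own instead of smuggling b across functions. As a plain illustration of that idea (the paragraph above recommends convert.register for the real translation; this is just to show where the metadata can come from), here is a hedged rework of the question's own append function, assuming a single-band raster and registering against nc4.Dataset rather than nc4.Variable so that dimensions can be created:

import numpy as np
import netCDF4 as nc4
import gdal
from odo import append

@append.register(nc4.Dataset, gdal.Dataset)
def append_gdal_to_nc4(tgt, src, **kwargs):
    # georeferencing comes straight from the source dataset
    x0, dx, _, y0, _, dy = src.GetGeoTransform()
    arr = src.ReadAsArray()  # 2-D for a single-band raster
    nrows, ncols = arr.shape

    tgt.createDimension('lat', nrows)
    tgt.createDimension('lon', ncols)
    lat = tgt.createVariable('lat', 'f4', ('lat',))
    lon = tgt.createVariable('lon', 'f4', ('lon',))
    lat[:] = y0 + np.arange(nrows) * dy
    lon[:] = x0 + np.arange(ncols) * dx

    data = tgt.createVariable('data', 'f4', ('lat', 'lon'))
    data[:] = arr
    return tgt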

minimal example of how to export a jupyter notebook to pdf using nbconvert and PDFExporter()

I am trying to export a PDF copy of a Jupyter notebook using nbconvert from within a notebook cell. I have read the documentation, but I just cannot find some basic code to actually execute the nbconvert command and export to PDF.
I was able to get this far, but I was hoping that someone could just fill in the final gaps.
from nbconvert import PDFExporter
notebook_pdf = PDFExporter()
notebook_pdf.template_file = '../print_script/pdf_nocode.tplx'
Not sure how to get from here to actually getting the PDF created.
Any help would be appreciated.
I'm no expert, but managed to get this working. The key is that you need to preprocess the notebook, which then allows you to use the PDFExporter.from_notebook_node() function. This gives you your pdf_data in byte format, which can then be written to file:
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
from nbconvert import PDFExporter

notebook_filename = "notebook.ipynb"

with open(notebook_filename) as f:
    nb = nbformat.read(f, as_version=4)

ep = ExecutePreprocessor(timeout=600, kernel_name='python3')
ep.preprocess(nb, {'metadata': {'path': 'notebooks/'}})

pdf_exporter = PDFExporter()
pdf_data, resources = pdf_exporter.from_notebook_node(nb)

with open("notebook.pdf", "wb") as f:
    f.write(pdf_data)
It's worth noting that the ExecutePreprocessor requires the resources dict, but we don't use it in this example.
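If you also want the custom template from the question, it should just be a matter of setting it on the exporter before converting (the path below is the one from the question):

pdf_exporter = PDFExporter()
pdf_exporter.template_file = '../print_script/pdf_nocode.tplx'
pdf_data, resources = pdf_exporter.from_notebook_node(nb)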
The following is a REST API that converts a .ipynb file into .html:
POST: http://URL/export/<id> converts the notebook <id>.ipynb to HTML.
GET: http://URL/export/<id> returns the rendered <id>.html.
import os
from flask import Flask, render_template, make_response
from flask_cors import CORS
from flask_restful import reqparse, abort, Api, Resource
from nbconvert.exporters import HTMLExporter

exporter = HTMLExporter()

app = Flask(__name__)
cors = CORS(app, resources={r"/export/*": {"origins": "*"}})
api = Api(app)

parser = reqparse.RequestParser()
parser.add_argument('path')

notebook_file_srv = '/path of your .ipynb file'

def notebook_doesnt_exist(nb):
    abort(404, message="Notebook {} doesn't exist".format(nb))

class Notebook(Resource):
    def get(self, id):
        headers = {'Content-Type': 'text/html'}
        return make_response(render_template(id + '.html'), 200, headers)

    def post(self, id):
        args = parser.parse_args()
        notebook_file = args['path']
        notebook_file = notebook_file_srv + id + '.ipynb'
        if not os.path.exists(notebook_file):
            return 'notebook \'.ipynb\' file not found', 404
        else:
            nb_name, _ = os.path.splitext(os.path.basename(notebook_file))
            # dirname = os.path.dirname(notebook_file)
            output_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'templates')
            output_path = os.path.join(output_path, '{}.html'.format(nb_name))
            output, resources = exporter.from_filename(notebook_file)
            f = open(output_path, 'wb')
            f.write(output.encode('utf8'))
            f.close()
            return 'done', 201

api.add_resource(Notebook, '/export/<id>')

if __name__ == '__main__':
    app.run(debug=True)
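Usage from Python, assuming the Flask dev server is running on its default port 5000 and using a hypothetical notebook id mynotebook (i.e. a mynotebook.ipynb file on the server):

import requests

# POST triggers the conversion, GET returns the rendered HTML
requests.post('http://localhost:5000/export/mynotebook')
html = requests.get('http://localhost:5000/export/mynotebook').text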

Unable to reload data as a csv file from IPython Notebook

I have the following IPython Notebook, in which I am trying to access a database of movies from the Rotten Tomatoes website.
But Rotten Tomatoes limits you to 10,000 API requests a day, so I don't want to re-run this function every time I restart the notebook. Instead, I am trying to save and reload this data as a CSV file. When I convert the data to a CSV file I get the processing symbol [*] inside the IPython notebook, and after some time I get the following error:
ConnectionError: HTTPConnectionPool(host='api.rottentomatoes.com', port=80): Max retries exceeded with url: /api/public/v1.0/movie_alias.json?apikey=5xr26r2qtgf9h3kcq5kt6y4v&type=imdb&id=0113845 (Caused by <class 'socket.gaierror'>: [Errno 11002] getaddrinfo failed)
Is this problem due to a slow internet connection? Should I make some changes to my code? Kindly help me with this.
The code for the file is shown below:
%matplotlib inline
import json
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
api_key = '5xr26r2qtgf9h3kcq5kt6y4v'
movie_id = '770672122' # toy story 3
url = 'http://api.rottentomatoes.com/api/public/v1.0/movies/%s/reviews.json' % movie_id
#these are "get parameters"
options = {'review_type': 'top_critic', 'page_limit': 20, 'page': 1, 'apikey': api_key}
data = requests.get(url, params=options).text
data = json.loads(data) # load a json string into a collection of lists and dicts
print json.dumps(data['reviews'][0], indent=2) # dump an object into a json string
from io import StringIO
movie_txt = requests.get('https://raw.github.com/cs109/cs109_data/master/movies.dat').text
movie_file = StringIO(movie_txt) # treat a string like a file
movies = pd.read_csv(movie_file,delimiter='\t')
movies
#print the first row
movies[['id', 'title', 'imdbID', 'year']]
def base_url():
    return 'http://api.rottentomatoes.com/api/public/v1.0/'

def rt_id_by_imdb(imdb):
    """
    Queries the RT movie_alias API. Returns the RT id associated with an IMDB ID,
    or raises a KeyError if no match was found
    """
    url = base_url() + 'movie_alias.json'
    imdb = "%7.7i" % imdb
    params = dict(id=imdb, type='imdb', apikey=api_key)
    r = requests.get(url, params=params).text
    r = json.loads(r)
    return r['id']

def _imdb_review(imdb):
    """
    Query the RT reviews API, to return the first page of reviews
    for a movie specified by its IMDB ID
    Returns a list of dicts
    """
    rtid = rt_id_by_imdb(imdb)
    url = base_url() + 'movies/{0}/reviews.json'.format(rtid)
    params = dict(review_type='top_critic',
                  page_limit=20,
                  page=1,
                  country='us',
                  apikey=api_key)
    data = json.loads(requests.get(url, params=params).text)
    data = data['reviews']
    data = [dict(fresh=r['freshness'],
                 quote=r['quote'],
                 critic=r['critic'],
                 publication=r['publication'],
                 review_date=r['date'],
                 imdb=imdb, rtid=rtid
                 ) for r in data]
    return data

def fetch_reviews(movies, row):
    m = movies.irow(row)
    try:
        result = pd.DataFrame(_imdb_review(m['imdbID']))
        result['title'] = m['title']
    except KeyError:
        return None
    return result

def build_table(movies, rows):
    dfs = [fetch_reviews(movies, r) for r in range(rows)]
    dfs = [d for d in dfs if d is not None]
    return pd.concat(dfs, ignore_index=True)
critics = build_table(movies, 3000)
critics.to_csv('critics.csv', index=False)
critics = pd.read_csv('critics.csv')
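To avoid re-firing the 3000 API calls every time the notebook restarts, one option is to only rebuild the table when the CSV is not there yet. A small sketch, reusing the critics.csv filename already used above:

import os

if os.path.exists('critics.csv'):
    critics = pd.read_csv('critics.csv')
else:
    critics = build_table(movies, 3000)
    critics.to_csv('critics.csv', index=False)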