How to use Drive with an external account in Colab

So, I have all my important files in my Drive account, and I need to import those files into the Colab local machine. Sharing the files can be dangerous, and for some reason the files copy themselves repeatedly into the Drive, filling it up. I tried to use PyDrive, but I can't automate the login process with a user account. I need a way to automate the login process from Colab. Ideas?
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google_drive_downloader import GoogleDriveDownloader as gdd

gauth = GoogleAuth()

file = open("client_secrets.json", "w")
file.write('{"web": __')
file.close()

file = open("credentials.json", "w")
file.write('{"access_token": __"}')
file.close()

file = open("settings.json", "w")
file.write('')
file.close()

if gauth.credentials is None:
    gauth.CommandLineAuth()
elif gauth.access_token_expired:
    gauth.Refresh()
else:
    gauth.Authorize()

drive = GoogleDrive(gauth)

( __ = private information)
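One way to automate the login, for what it's worth: PyDrive2 can persist credentials to a file, so the interactive step only happens once and later runs load the saved token silently. A minimal sketch, assuming a valid client_secrets.json is already present; the credentials file name mycreds.txt is my own placeholder:

from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive

gauth = GoogleAuth()
# Try to reuse credentials saved by a previous run
# ("mycreds.txt" is a placeholder name; any path works).
gauth.LoadCredentialsFile("mycreds.txt")

if gauth.credentials is None:
    # First run only: interactive auth. CommandLineAuth() prints a URL
    # and asks for the verification code, which works inside Colab.
    gauth.CommandLineAuth()
elif gauth.access_token_expired:
    # Silently refresh the expired token using the stored refresh token.
    gauth.Refresh()
else:
    gauth.Authorize()

# Persist the (possibly refreshed) credentials for the next run.
gauth.SaveCredentialsFile("mycreds.txt")

drive = GoogleDrive(gauth)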

Related

How to obtain the current path to my google colab file?

I work with Google Colab on my Google Drive. When I do
import os
cwd = os.getcwd()
print(cwd)
it returns /content. How can I obtain the path to my Google Colab file, like
/content/drive/path_to_my_current_google_colab_file
from google.colab import drive
import os, glob

my_name = 'my.ipynb'
drive.mount(os.getcwd() + '/drive')

# Search the mounted Drive recursively for the notebook by name.
my_path = glob.glob(os.getcwd() + '/**/' + my_name, recursive=True)

# Some 'list comprehension': keep the hit whose path is longest.
my_path = [my_i for my_i in my_path
           if len(my_i) in [max([len(my_i) for my_i in my_path])]][0]
print(my_path)
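If all that final comprehension does is keep the longest match, a simpler equivalent would be (same behavior, just a sketch):

# Pick the longest hit, i.e. the most deeply nested matching path.
my_path = max(my_path, key=len)
print(my_path)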

Read pdf object from S3

I am trying to create a Lambda function that will access a PDF form uploaded to S3, strip out the data entered into the form, and send it elsewhere.
I am able to do this when I can download the file locally. So the script below works and allows me to read the data from the PDF into my pandas DataFrame:
import boto3
import PyPDF2 as pypdf
import pandas as pd

s3 = boto3.resource('s3')
s3.meta.client.download_file(bucket_name, asset_key, './target.pdf')

pdfobject = open("./target.pdf", 'rb')
pdf = pypdf.PdfFileReader(pdfobject)
data = pdf.getFormTextFields()
pdf_df = pd.DataFrame(data, columns=get_cols(data), index=[0])
But with Lambda I cannot save the file locally, because I get a "read-only filesystem" error.
I have tried using the s3.get_object() method like below:
s3 = boto3.client('s3')  # get_object() is a client method, so use a client here
s3_response_object = s3.get_object(
    Bucket='pdf-forms-bucket',
    Key='target.pdf',
)
pdf_bytes = s3_response_object['Body'].read()
But I have no idea how to convert the resulting bytes into an object that can be parsed with PyPDF2. The output that I need, and that PyPDF2 will produce, looks like this:
{'form1[0].#subform[0].nameandmail[0]': 'Burt Lancaster',
'form1[0].#subform[0].mailaddress[0]': '675 Creighton Ave, Washington DC',
'form1[0].#subform[0].Principal[0]': 'David St. Hubbins',
'Principal[1]': None,
'form1[0].#subform[0].Principal[2]': 'Bart Simpson',
'Principal[3]': None}
So in summary, I need to be able to read a PDF with fillable forms into memory and parse it without downloading the file, because my Lambda function environment won't allow local temp files.
Solved:
This does the trick:
import boto3
from PyPDF2 import PdfFileReader
from io import BytesIO

bucket_name = "pdf-forms-bucket"
item_name = "form.pdf"

s3 = boto3.resource('s3')
obj = s3.Object(bucket_name, item_name)
fs = obj.get()['Body'].read()      # raw PDF bytes, never written to disk
pdf = PdfFileReader(BytesIO(fs))   # BytesIO wraps the bytes in a file-like object
data = pdf.getFormTextFields()
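A side note: PdfFileReader and getFormTextFields are the old PyPDF2 1.x names. If you are on PyPDF2 3.x or its successor pypdf, the same read looks like this (a sketch, assuming fs holds the PDF bytes as above):

from io import BytesIO
from pypdf import PdfReader  # or: from PyPDF2 import PdfReader

pdf = PdfReader(BytesIO(fs))
data = pdf.get_form_text_fields()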

Access local folder in colab

I want to create a function to read files from a folder one by one in Colab.
I have tried the following, i.e. I uploaded my folder into My Drive and then tried to access it from there.
But it still shows the error.
import os
from google.colab import drive

drive.mount('/content/gdrive')
mypath = "/gdrive/My Drive/resume pdf"  # enter your path here where you saved the resumes
onlyfiles = [os.path.join(mypath, f) for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]
Error-->
FileNotFoundError Traceback (most recent call last)
<ipython-input-15-a7a8abc74cc6> in <module>()
1 mypath = "/gdrive/My Drive/resume pdf" #enter your path here where you saved the resumes
----> 2 onlyfiles = [os.path.join(mypath, f) for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]
FileNotFoundError: [Errno 2] No such file or directory: '/gdrive/My Drive/resume pdf'
I have cross-checked that there is a folder named "resume pdf" in my Drive.
So how can I upload a local folder and access each of its files one by one in Colab?
You mounted the drive at /content/gdrive, but you try to access it at /gdrive; that's the error.
Just change the path to
mypath = "/content/gdrive/My Drive/resume pdf"

Convert PDF to .txt file with Google Cloud Storage

I have this code for Python on a local file system.
What is the equivalent Python client API for os.getcwd() and os.listdir()?
I want this code to work using files from GCS.
In order to use GCS folders, I include this code:
from google.cloud import storage
client = storage.Client()
bucket = client.bucket('my-bucket')
pdfDir = bucket.get_blob('uploads/pdf/')
txtDir = bucket.get_blob('uploads/txt/')
from io import StringIO  # cStringIO is Python 2 only
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt

# converts pdf, returns its text content as a string
def convert(fname, pages=None):
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    infile = open(fname, 'rb')  # file() is Python 2 only
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close()
    return text

# converts all pdfs in directory pdfDir, saves all resulting txt files to txtDir
def PDF2txt(pdfDir, txtDir):
    if pdfDir == "": pdfDir = os.getcwd() + "\\"  # if no pdfDir passed in
    for pdf in os.listdir(pdfDir):  # iterate through pdfs in pdf directory
        fileExtension = pdf.split(".")[-1]
        if fileExtension == "pdf":
            pdfFilename = pdfDir + pdf
            text = convert(pdfFilename)  # get string of text content of pdf
            textFilename = txtDir + pdf + ".txt"
            textFile = open(textFilename, "w")  # make text file
            textFile.write(text)  # write text to text file
            textFile.close()

pdfDir = "C:/pdftotxt/pdfs/"
txtDir = "C:/pdftotxt/txt/"
PDF2txt(pdfDir, txtDir)
I assume that what you want is to list objects in a bucket, including objects in particular folders inside the bucket. For that you can directly use the Python client library that Google Cloud Storage provides: bucket.list_blobs() lists the whole bucket, and bucket.list_blobs(prefix=prefix, delimiter=delimiter) lists a particular folder or object.
More detailed documentation can be found here [1] and the Git repository containing the whole library here [2].
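For instance, a sketch under the question's own names (bucket my-bucket, prefixes uploads/pdf/ and uploads/txt/); list_blobs, download_as_bytes and upload_from_string are all part of the official google-cloud-storage library, while convert_pdf_bytes is a hypothetical helper standing in for the convert() logic above:

from google.cloud import storage

client = storage.Client()
bucket = client.bucket('my-bucket')

# The GCS equivalent of os.listdir: list objects under a prefix.
# The delimiter keeps the listing from descending into sub-"folders".
for blob in client.list_blobs('my-bucket', prefix='uploads/pdf/', delimiter='/'):
    if blob.name.endswith('.pdf'):
        pdf_bytes = blob.download_as_bytes()  # read the object into memory
        text = convert_pdf_bytes(pdf_bytes)   # hypothetical: adapt convert() above to take bytes
        out_name = 'uploads/txt/' + blob.name.split('/')[-1] + '.txt'
        bucket.blob(out_name).upload_from_string(text)  # write the result back to GCS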

Minimal example of how to export a jupyter notebook to pdf using nbconvert and PDFExporter()

I am trying to export a pdf copy of a jupyter notebook using nbconvert from within a notebook cell. I have read the documentation, but I just cannot find some basic code to actually execute the nbconvert command and export to pdf.
I was able to get this far, but I was hoping that someone could just fill in the final gaps.
from nbconvert import PDFExporter
notebook_pdf = PDFExporter()
notebook_pdf.template_file = '../print_script/pdf_nocode.tplx'
Not sure how to get from here to actually getting the PDF created.
Any help would be appreciated.
I'm no expert, but I managed to get this working. The key is that you need to preprocess the notebook, which then allows you to use the PDFExporter.from_notebook_node() function. That gives you your pdf_data in byte format, which can then be written to file:
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
from nbconvert import PDFExporter

notebook_filename = "notebook.ipynb"

with open(notebook_filename) as f:
    nb = nbformat.read(f, as_version=4)

# Execute the notebook first so the exported PDF contains fresh output.
ep = ExecutePreprocessor(timeout=600, kernel_name='python3')
ep.preprocess(nb, {'metadata': {'path': 'notebooks/'}})

pdf_exporter = PDFExporter()
pdf_data, resources = pdf_exporter.from_notebook_node(nb)

with open("notebook.pdf", "wb") as f:
    f.write(pdf_data)
It's worth noting that ep.preprocess() takes a resources dict as its second argument, but we don't use it in this example. Also, the PDF exporter relies on a LaTeX toolchain being installed, since nbconvert builds the PDF through LaTeX.
The following is a REST API that converts an .ipynb file into .html:
POST: http://URL/export/<id> generates <id>.html from <id>.ipynb
GET: http://URL/export/<id> returns the <id>.html
import os
from flask import Flask, render_template, make_response
from flask_cors import CORS
from flask_restful import reqparse, abort, Api, Resource
from nbconvert.exporters import HTMLExporter

exporter = HTMLExporter()

app = Flask(__name__)
cors = CORS(app, resources={r"/export/*": {"origins": "*"}})
api = Api(app)

parser = reqparse.RequestParser()
parser.add_argument('path')

notebook_file_srv = '/path of your .ipynb file'

def notebook_doesnt_exist(nb):
    abort(404, message="Notebook {} doesn't exist".format(nb))

class Notebook(Resource):
    def get(self, id):
        headers = {'Content-Type': 'text/html'}
        return make_response(render_template(id + '.html'), 200, headers)

    def post(self, id):
        args = parser.parse_args()
        notebook_file = args['path']
        # the path argument is immediately overridden: the notebook is
        # always looked up under notebook_file_srv by its <id>
        notebook_file = notebook_file_srv + id + '.ipynb'
        if not os.path.exists(notebook_file):
            return 'notebook \'.ipynb\' file not found', 404
        else:
            nb_name, _ = os.path.splitext(os.path.basename(notebook_file))
            # dirname = os.path.dirname(notebook_file)
            output_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'templates')
            output_path = os.path.join(output_path, '{}.html'.format(nb_name))
            output, resources = exporter.from_filename(notebook_file)
            f = open(output_path, 'wb')
            f.write(output.encode('utf8'))
            f.close()
            return 'done', 201

api.add_resource(Notebook, '/export/<id>')

if __name__ == '__main__':
    app.run(debug=True)
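For completeness, a hedged usage sketch against the running dev server (localhost:5000 is Flask's default dev address; my-notebook is a hypothetical notebook id, i.e. the server must find my-notebook.ipynb under notebook_file_srv):

import requests

base = "http://localhost:5000/export/my-notebook"  # my-notebook is hypothetical

# POST renders my-notebook.ipynb to templates/my-notebook.html.
print(requests.post(base).status_code)  # expect 201 on success

# GET then serves the rendered HTML page.
print(requests.get(base).text[:200])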