I am trying to read a config file from an S3 bucket via a Python script run by AWS Glue, but the file is not read.
from __future__ import print_function
from pyspark import SparkContext
from awsglue.utils import getResolvedOptions
import sys
import ConfigParser
from os import path
sc = SparkContext.getOrCreate()
print("My New Job")
args = getResolvedOptions(sys.argv, ['config_path', 'section_type'])
config_path = args["config_path"]
print("config_path")
print(config_path)
config = ConfigParser.ConfigParser()
filenameread = config.read(['s3://temp-bucket/config/myconfig.ini','s3://temp-bucket/config/myconfig.config','s3://temp-bucket/config/myconfig.txt'])
print("filenameread")
print(filenameread)
print("sections")
print(config.sections())
The log appears as below; the config sections are empty.
{
"timestamp": 1551705899133,
"message": "Container: container_somenumber on ip-somenumber.ec2.internal_somenumber\nLogType:stdout\nLog Upload Time:Mon Mar 04 13:24:51 +0000 2019\nLogLength:175\nLog Contents:\nMy New Job\nconfig_path\ns3://temp-bucket/config/myconfig.ini\nfilenameread\n[]\nsections\n[]\nEnd of LogType:stdout\n",
"ingestionTime": 1551705899785
},
Can anybody help with this? I am using Python 2.7.15.
This can be achieved by defining the --extra-files key in the job parameters section.
The file path(s) (comma-separated in case of multiple files) given as the value of this key are copied into the runtime environment during execution.
Then, using configparser, one can access the file as a conventional config file.
For more details, see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html
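As a minimal sketch: assuming the job is started with --extra-files set to s3://temp-bucket/config/myconfig.ini, Glue copies that file into the script's working directory, so it can then be read with a plain local filename (Python 2 syntax to match the question):
import ConfigParser
config = ConfigParser.ConfigParser()
# after Glue copies it in, the file sits in the working directory, so no s3:// prefix is needed
files_read = config.read(['myconfig.ini'])
print(files_read)
print(config.sections())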
I'm trying to set up the sample code for PDF data extraction using the Python SDK for the Adobe PDF API service in a Databricks environment. The cluster has just one driver node. But I'm facing an issue while accessing the configuration files uploaded to a DBFS folder.
Please let me know how I can fix this issue.
Here is the code snippet:
import logging
import os.path
from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import \
ExtractRenditionsElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.table_structure_type import TableStructureType
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation
#logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
credentials = Credentials.service_account_credentials_builder() \
.from_file("/dbfs/FileStore/pdfservices_api_credentials.json") \
.build()
execution_context = ExecutionContext.create(credentials)
extract_pdf_operation = ExtractPDFOperation.create_new()
source = FileRef.create_from_local_file("/dbfs/FileStore/form.pdf")
extract_pdf_operation.set_input(source)
# Build ExtractPDF options and set them into the operation
extract_pdf_options: ExtractPDFOptions = ExtractPDFOptions.builder() \
.with_elements_to_extract([ExtractElementType.TEXT, ExtractElementType.TABLES]) \
.with_element_to_extract_renditions(ExtractRenditionsElementType.TABLES) \
.with_table_structure_format(TableStructureType.CSV) \
.build()
extract_pdf_operation.set_options(extract_pdf_options)
# Execute the operation.
result: FileRef = extract_pdf_operation.execute(execution_context)
result.save_as(base_path + "/output/ExtractTextInfoFromPDF.zip")
Here are the error details:
INFO:adobe.pdfservices.operation.pdfops.extract_pdf_operation:All validations successfully done. Beginning ExtractPDF operation execution
INFO:py4j.java_gateway:Received command c on object id p0
INFO:py4j.java_gateway:Received command c on object id p0
INFO:py4j.java_gateway:Received command c on object id p0
INFO:py4j.java_gateway:Received command c on object id p0
SdkException: description =Exception in fetching access token, requestTrackingId=(<class 'AttributeError'>, AttributeError("'str' object has no attribute 'get'"), <traceback object at 0x7f7572a3fd00>)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/local_disk0/.ephemeral_nfs/envs/pythonEnv-150d96ca-003d-4671-a6d9-ab8e566616d1/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/auth/jwt_authenticator.py in refresh_token(self)
62 data=access_token_request_payload, headers={})
---> 63 response = http_client.process_request(http_request=http_request, success_status_codes=[HTTPStatus.OK],
64 error_response_handler=self.handle_ims_failure)
/local_disk0/.ephemeral_nfs/envs/pythonEnv-150d96ca-003d-4671-a6d9-ab8e566616d1/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py in process_request(http_request, success_status_codes, error_response_handler)
37 response = _execute_request(http_request)
---> 38 if _handle_response_and_retry(response, success_status_codes,
39 error_response_handler, not http_request.authenticator, http_request.request_key) and http_request.retryable:
/local_disk0/.ephemeral_nfs/envs/pythonEnv-150d96ca-003d-4671-a6d9-ab8e566616d1/lib/python3.8/site-packages/adobe/pdfservices/operation/internal/http/http_client.py in _handle_response_and_retry(response, success_status_codes, error_response_handler, is_ims_api, request_key)
94 "Failure response code {error_code} encountered from backend".format(error_code=response.status_code))
---> 95 should_retry = ResponseUtil.handle_api_failures(response, request_key, is_ims_api)
96 return should_retry if should_retry else error_response_handler(response)
It looks like you're missing a forward slash in the file name, so the file is treated as a path relative to the current directory /databricks/driver/. Instead of dbfs/tmp/pdf/pdfservices-api-credentials.json, try /dbfs/tmp/pdf/pdfservices-api-credentials.json.
Update for Community Edition: there is no support for the /dbfs mount on Community Edition, so you need to copy the file from DBFS to the local file system before using it. You can do that with the dbutils.fs.cp command:
# copy file to local disk
dbutils.fs.cp("/tmp/pdf/pdfservices-api-credentials.json",
"file:/tmp/pdfservices-api-credentials.json")
# use it
credentials = Credentials.service_account_credentials_builder() \
.from_file("/tmp/pdfservices-api-credentials.json") \
.build()
I have a csv file with the following columns:
Name Adress/1 Adress/2 City State
When I try to read this CSV file from local disk I have no issue.
But when I try to read it from S3 with the code below, I get an error when I use io.StringIO.
When I use io.BytesIO, each record is displayed as one column. Though the file is ',' separated, some columns contain '/n' or '/t' in them; I believe these are causing the issue.
I used AWS Wrangler with no issue, but my requirement is to read this CSV file with boto3.
import io
import pandas as pd
import boto3

s3 = boto3.resource('s3', aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
my_bucket = s3.Bucket(AWS_S3_BUCKET)
csv_obj = my_bucket.Object(key=key).get().get('Body').read().decode('utf16')
data = io.BytesIO(csv_obj)  # io.StringIO(csv_obj)
sdf = pd.read_csv(data, delimiter=sep, names=cols, header=None, skiprows=1)
print(sdf)
Any suggestions, please?
try get_object():
obj = boto3.client('s3').get_object(Bucket=AWS_S3_BUCKET, Key=key)
data = io.StringIO(obj['Body'].read().decode('utf-8'))
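A fuller sketch of feeding that into pandas, assuming the object really is UTF-16 encoded as in the question (the sep and cols variables are carried over from the question's snippet):
import io
import boto3
import pandas as pd

obj = boto3.client('s3').get_object(Bucket=AWS_S3_BUCKET, Key=key)
# decode with the file's actual encoding; the question suggests UTF-16
body = obj['Body'].read().decode('utf-16')
sdf = pd.read_csv(io.StringIO(body), delimiter=sep, names=cols, header=None, skiprows=1)
print(sdf)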
The following code snippet is from a Google tutorial; it simply prints the names of files on GCP in a given bucket:
from google.cloud import storage

def list_blobs(bucket_name):
    """Lists all the blobs in the bucket."""
    # bucket_name = "your-bucket-name"
    storage_client = storage.Client()
    # Note: Client.list_blobs requires at least package version 1.17.0.
    blobs = storage_client.list_blobs(bucket_name)
    for blob in blobs:
        print(blob.name)

list_blobs('sn_project_data')
Now from the command line I can run:
$ python path/file.py
And in my terminal the files in said bucket are printed out. Great, it works!
However, this isn't quite my goal. I'm looking to open a file and act upon it. For example:
df = pd.read_excel(filename)
print(df.iloc[0])
However, when I pass the path to the above, the error returned reads "invalid file path." So I'm sure there is some sort of GCP-specific function call needed to actually access these files...
What command(s) should I run?
Edit: This video https://www.youtube.com/watch?v=ED5vHa3fE1Q shows a trick to open files, which uses StringIO in the process. But that doesn't support Excel files, so it's not an effective solution.
read_excel() does not support a Google Cloud Storage file path as of now, but it can read data as bytes.
pandas.read_excel(io, sheet_name=0, header=0, names=None,
index_col=None, usecols=None, squeeze=False, dtype=None, engine=None,
converters=None, true_values=None, false_values=None, skiprows=None,
nrows=None, na_values=None, keep_default_na=True, na_filter=True,
verbose=False, parse_dates=False, date_parser=None, thousands=None,
comment=None, skipfooter=0, convert_float=True, mangle_dupe_cols=True,
storage_options=None)
Parameters: io : str, bytes, ExcelFile, xlrd.Book, path object, or
file-like object
What you can do is take the blob object and call download_as_bytes() to get the object's contents as bytes.
Download the contents of this blob as a bytes object.
For this example I just used a random sample xlsx file and read the 1st sheet:
from google.cloud import storage
import pandas as pd
bucket_name = "your-bucket-name"
blob_name = "SampleData.xlsx"
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(blob_name)
data_bytes = blob.download_as_bytes()
df = pd.read_excel(data_bytes)
print(df)
I am trying to read a CSV into a DataFrame from an S3 bucket, but I am facing issues. Can you let me know where I am making mistakes here?
from pyspark import SparkConf, SparkContext

conf = SparkConf()
conf.setMaster('local')
conf.setAppName('sparkbasic')
sc = SparkContext.getOrCreate(conf=conf)
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", "abc")
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", "xyz")
sc._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
sc._jsc.hadoopConfiguration().set("com.amazonaws.services.s3.enableV4", "true")
sc._jsc.hadoopConfiguration().set("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.BasicAWSCredentialsProvider")
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "mybucket/path/fileeast-1.redshift.amazonaws.com")
from pyspark.sql import SparkSession
sc = SparkSession.builder.appName('sparkbasic').getOrCreate()
This is the code where I get the error
csvDf = sc.read.csv("s3a://bucket/path/file/*.csv")
This is the error I get. I tried the links given in Stack Overflow answers, but nothing has worked for me so far.
java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found
Maybe you can have a look at S3Fs.
Given your details, maybe a configuration like this could work:
import s3fs
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': 'fileeast-1.redshift.amazonaws.com',
"aws_access_key_id": "abc",
"aws_secret_access_key": "xyz"})
To check that you can interact with S3, you can try the following command (NB: change somefile.csv to an existing object):
fs.info('s3://bucket/path/file/somefile.csv')
Note that in fs.info we start the path with s3. If you do not encounter an error, you might hope the following command works:
csvDf = sc.read.csv("s3a://bucket/path/file/*.csv")
This time the path begins with s3a.
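If the Hadoop S3A classpath error persists, one fallback sketch (an assumption on my part, not part of the original answer) is to skip Spark and read the objects with pandas through s3fs, provided the files fit in driver memory:
import pandas as pd
import s3fs

fs = s3fs.S3FileSystem(key="abc", secret="xyz")  # same credentials as above
# read a single CSV object directly into a pandas DataFrame
with fs.open('s3://bucket/path/file/somefile.csv', 'rb') as f:
    df = pd.read_csv(f)
print(df.head())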
I have a CGI form which takes a CSV sheet and an email address and calls two individual Python scripts which run in the background. These take about 15 minutes to execute. I want to make an asynchronous call to these scripts so that I can display a message and prevent an Apache timeout.
Here is my code
import os
import cgi, cgitb
import csv
import sys
import subprocess
import io
cgitb.enable()
form = cgi.FieldStorage()
filedata = form['file']
filecontent = filedata.file.read().splitlines()
email=form.getvalue('email_address')
email = str(email)
subprocess.Popen([sys.executable, 'giw.py', str(email)], shell=False,
stdin=None, stdout=None, stderr=None, close_fds=True)
subprocess.Popen([sys.executable, 'mailer.py', str(email)], shell=False,
stdin=None, stdout=None, stderr=None, close_fds=True)
This worked for me:
Run background process in Python and do NOT wait
import subprocess
import sys

subprocess.Popen([sys.executable, 'giw.py', str(email)],
                 stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
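One caveat, shown here as a variant sketch rather than the accepted fix: with stdout=subprocess.PIPE and nothing reading the pipe, a chatty long-running child can eventually block on a full pipe buffer. Discarding the output avoids that, and the CGI script can respond to the browser immediately (the response text is only an illustration; subprocess.DEVNULL requires Python 3.3+):
import subprocess
import sys

# fire-and-forget: discard child output so the child never blocks on the pipe
subprocess.Popen([sys.executable, 'giw.py', str(email)],
                 stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
                 close_fds=True)

# respond right away instead of waiting ~15 minutes for the scripts to finish
print("Content-Type: text/html\n")
print("<p>Your file is being processed; you will receive an email when it is done.</p>")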