Save Pandas or Pyspark dataframe from Databricks to Azure Blob Storage - pandas

Is there a way I can save a Pyspark or Pandas dataframe from Databricks to a blob storage without mounting or installing libraries?
I was able to achieve this after mounting the storage container into Databricks and using the library com.crealytics.spark.excel, but I was wondering if I can do the same without the library or without mounting because I will be working on clusters without these 2 permissions.

Here the code for saving the dataframe locally to dbfs.
# export
from os import path
folder = "export"
name = "export"
file_path_name_on_dbfs = path.join("/tmp", folder, name)
# Writing to DBFS
# .coalesce(1) used to generate only 1 file, if the dataframe is too big this won't work so you'll have multiple files qnd you need to copy them later one by one
sampleDF \
.coalesce(1) \
.write \
.mode("overwrite") \
.option("header", "true") \
.option("delimiter", ";") \
.option("encoding", "UTF-8") \
.csv(file_path_name_on_dbfs)
# path of destination, which will be sent to az storage
dest = file_path_name_on_dbfs + ".csv"
# Renaming part-000...csv to our file name
target_file = list(filter(lambda file: file.name.startswith("part-00000"), dbutils.fs.ls(file_path_name_on_dbfs)))
if len(target_file) > 0:
dbutils.fs.mv(target_file[0].path, dest)
dbutils.fs.cp(dest, f"file://{dest}") # this line is added for community edition only cause /dbfs is not recognized, so we copy the file locally
dbutils.fs.rm(file_path_name_on_dbfs,True)
The code that will send the file into az storage :
import requests
sas="YOUR_SAS_TOKEN_PREVIOUSLY_CREATED" # follow the link below to create SAS token (using sas is slightly more secure than raw key storage)
blob_account_name = "YOUR_BLOB_ACCOUNT_NAME"
container = "YOUR_CONTAINER_NAME"
destination_path_w_name = "export/export.csv"
url = f"https://{blob_account_name}.blob.core.windows.net/{container}/{destination_path_w_name}?{sas}"
# here we read the content of our previously exported df -> csv
# if you are not on community edition you might want to use /dbfs + dest
payload=open(dest).read()
headers = {
'x-ms-blob-type': 'BlockBlob',
'Content-Type': 'text/csv' # you can change the content type according to your needs
}
response = requests.request("PUT", url, headers=headers, data=payload)
# if response.status_code is 201 it means your file was created successfully
print(response.status_code)
Follow this link to setup a SAS token
Remember that anyone who got the sas token can access your storage depending on permissions you set while creating the sas token
Code for Excel export version (using com.crealytics:spark-excel_2.12:0.14.0)
Saving the dataframe :
data = [
('a',25,'ast'),
('b',15,'phone'),
('c',32,'dlp'),
('d',45,'rare'),
('e',60,'phq' )
]
colums = ["column1" ,"column2" ,"column3"]
sampleDF = spark.createDataFrame(data=data, schema = colums)
sampleDF.show()
# export
from os import path
folder = "export"
name = "export"
file_path_name_on_dbfs = path.join("/tmp", folder, name)
# Writing to DBFS
sampleDF.write.format("com.crealytics.spark.excel")\
.option("header", "true")\
.mode("overwrite")\
.save(file_path_name_on_dbfs + ".xlsx")
# excel
dest = file_path_name_on_dbfs + ".xlsx"
dbutils.fs.cp(dest, f"file://{dest}") # this line is added for community edition only cause /dbfs is not recognized, so we copy the file locally
Uploading the file to azure storage :
import requests
sas="YOUR_SAS_TOKEN_PREVIOUSLY_CREATED" # follow the link below to create SAS token (using sas is slightly more secure than raw key storage)
blob_account_name = "YOUR_BLOB_ACCOUNT_NAME"
container = "YOUR_CONTAINER_NAME"
destination_path_w_name = "export/export.xlsx"
# destination_path_w_name = "export/export.csv"
url = f"https://{blob_account_name}.blob.core.windows.net/{container}/{destination_path_w_name}?{sas}"
# here we read the content of our previously exported df -> csv
# if you are not on community edition you might want to use /dbfs + dest
# payload=open(dest).read()
payload=open(dest, 'rb').read()
headers = {
'x-ms-blob-type': 'BlockBlob',
# 'Content-Type': 'text/csv'
'Content-Type': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
}
response = requests.request("PUT", url, headers=headers, data=payload)
# if response.status_code is 201 it means your file was created successfully
print(response.status_code)

Related

AWS s3 object upload to google cloud storage

we are trying to migrate data from aws s3 to gcp storage. we tried transfer job in gcp and its working fine. So we wanted to achieve that programmatically with aws lambda since we have dependencies on aws.
When i tried importing google.cloud module I am getting this error
lambda cloudwatch logs
Here is my code:
import os
import logging
import boto3
#from StringIO import StringIO
from google.cloud import storage
#import google-cloud-storage
# Setup logging
LOG = logging.getLogger(__name__)
LOG.setLevel(os.environ.get('LOG_LEVEL', 'INFO'))
GCS_BUCKET_NAME=os.environ['GCS_BUCKET_NAME']
S3 = boto3.client('s3')
def lambda_handler(event, context):
try:
l_t_bucketKey = _getKeys(event)
# Create google client
storage_client = storage.Client()
gcs_bucket = storage_client.get_bucket(os.environ['GCS_BUCKET_NAME'])
LOG.debug('About to copy %d files', len(l_t_bucketKey))
for bucket, key in l_t_bucketKey:
try:
inFileObj = StringIO()
S3.download_fileobj(
Bucket=bucket,
Key=key,
Fileobj=inFileObj
)
blob = gcs_bucket.blob(key)
blob.upload_from_file(inFileObj, rewind=True) # seek(0) before reading file obj
LOG.info('Copied s3://%s/%s to gcs://%s/%s', bucket, key, GCS_BUCKET_NAME, key)
except:
LOG.exception('Error copying file: {k}'.format(k=key))
return 'SUCCESS'
except Exception as e:
LOG.exception("Lambda function failed:")
return 'ERROR'
def _getKeys(d_event):
"""
Extracts (bucket, key) from event
:param d_event: Event dict
:return: List of tuples (bucket, key)
"""
l_t_bucketKey = []
if d_event:
if 'Records' in d_event and d_event['Records']:
for d_record in d_event['Records']:
try:
bucket = d_record['s3']['bucket']['name']
key = d_record['s3']['object']['key']
l_t_bucketKey.append((bucket, key))
except:
LOG.warn('Error extracting bucket and key from event')
return l_t_bucketKey
And I downloaded google-cloud-storage module from pypi website and imported that in aws lambda layer. Please help in providing me the best link for downloading this module.
Google Storage Bucket can be used with S3 APIs, so you can just use it in your Lambda functions without any extra GCP libraries.
source_client = boto3.client(
's3',
endpoint_url='https://storage.googleapis.com',
aws_access_key_id=os.environ['GCP_KEY'],
aws_secret_access_key=os.environ['GCP_SECRET']
To get access_key and secret - go to the GS bucket settings -> Interoperability -> Access keys for your user account -> Create a key

Is there a way to authenticate OAUTH2.0 of google API through terminal?

I am fetching google photos from my account using Google Photo API. Now there is a need for me to execute that php file via terminal, but the problem is that I can't authenticate with Google API in doing so. Is there a way to do this, and if yes, then how shall it be done?
Yes, it is possible, you need an interactive login for the first authentication but then you can save the token and refresh it automatically as required.
I have implemented this class in Python to do just that.
from requests.adapters import HTTPAdapter
from requests_oauthlib import OAuth2Session
from pathlib import Path
from urllib3.util.retry import Retry
from typing import List, Optional
from json import load, dump, JSONDecodeError
import logging
log = logging.getLogger(__name__)
# OAuth endpoints given in the Google API documentation
authorization_base_url = "https://accounts.google.com/o/oauth2/v2/auth"
token_uri = "https://www.googleapis.com/oauth2/v4/token"
class Authorize:
def __init__(
self, scope: List[str], token_file: Path,
secrets_file: Path, max_retries: int = 5
):
""" A very simple class to handle Google API authorization flow
for the requests library. Includes saving the token and automatic
token refresh.
Args:
scope: list of the scopes for which permission will be granted
token_file: full path of a file in which the user token will be
placed. After first use the previous token will also be read in from
this file
secrets_file: full path of the client secrets file obtained from
Google Api Console
"""
self.max_retries = max_retries
self.scope: List[str] = scope
self.token_file: Path = token_file
self.session = None
self.token = None
try:
with secrets_file.open('r') as stream:
all_json = load(stream)
secrets = all_json['installed']
self.client_id = secrets['client_id']
self.client_secret = secrets['client_secret']
self.redirect_uri = secrets['redirect_uris'][0]
self.token_uri = secrets['token_uri']
self.extra = {
'client_id': self.client_id,
'client_secret': self.client_secret}
except (JSONDecodeError, IOError):
print('missing or bad secrets file: {}'.format(secrets_file))
exit(1)
def load_token(self) -> Optional[str]:
try:
with self.token_file.open('r') as stream:
token = load(stream)
except (JSONDecodeError, IOError):
return None
return token
def save_token(self, token: str):
with self.token_file.open('w') as stream:
dump(token, stream)
self.token_file.chmod(0o600)
def authorize(self):
""" Initiates OAuth2 authentication and authorization flow
"""
token = self.load_token()
if token:
self.session = OAuth2Session(self.client_id, token=token,
auto_refresh_url=self.token_uri,
auto_refresh_kwargs=self.extra,
token_updater=self.save_token)
else:
self.session = OAuth2Session(self.client_id, scope=self.scope,
redirect_uri=self.redirect_uri,
auto_refresh_url=self.token_uri,
auto_refresh_kwargs=self.extra,
token_updater=self.save_token)
# Redirect user to Google for authorization
authorization_url, _ = self.session.authorization_url(
authorization_base_url,
access_type="offline",
prompt="select_account")
print('Please go here and authorize,', authorization_url)
# Get the authorization verifier code from the callback url
response_code = input('Paste the response token here:')
# Fetch the access token
self.token = self.session.fetch_token(
self.token_uri, client_secret=self.client_secret,
code=response_code)
self.save_token(self.token)
# note we want retries on POST as well, need to review this once we
# start to do methods that write to Google Photos
retries = Retry(total=self.max_retries,
backoff_factor=0.1,
status_forcelist=[500, 502, 503, 504],
method_whitelist=frozenset(['GET', 'POST']),
raise_on_status=False)
self.session.mount('https://', HTTPAdapter(max_retries=retries))

Uploading Multiple files in AWS S3 from terraform

I want to upload multiple files to AWS S3 from a specific folder in my local device. I am running into the following error.
Here is my terraform code.
resource "aws_s3_bucket" "testbucket" {
bucket = "test-terraform-pawan-1"
acl = "private"
tags = {
Name = "test-terraform"
Environment = "test"
}
}
resource "aws_s3_bucket_object" "uploadfile" {
bucket = "test-terraform-pawan-1"
key = "index.html"
source = "/home/pawan/Documents/Projects/"
}
How can I solve this problem?
As of Terraform 0.12.8, you can use the fileset function to get a list of files for a given path and pattern. Combined with for_each, you should be able to upload every file as its own aws_s3_bucket_object:
resource "aws_s3_bucket_object" "dist" {
for_each = fileset("/home/pawan/Documents/Projects/", "*")
bucket = "test-terraform-pawan-1"
key = each.value
source = "/home/pawan/Documents/Projects/${each.value}"
# etag makes the file update when it changes; see https://stackoverflow.com/questions/56107258/terraform-upload-file-to-s3-on-every-apply
etag = filemd5("/home/pawan/Documents/Projects/${each.value}")
}
See terraform-providers/terraform-provider-aws : aws_s3_bucket_object: support for directory uploads #3020 on GitHub.
Note: This does not set metadata like content_type, and as far as I can tell there is no built-in way for Terraform to infer the content type of a file. This metadata is important for things like HTTP access from the browser working correctly. If that's important to you, you should look into specifying each file manually instead of trying to automatically grab everything out of a folder.
You are trying to upload a directory, whereas Terraform expects a single file in the source field. It is not yet supported to upload a folder to an S3 bucket.
However, you can invoke awscli commands using null_resource provisioner, as suggested here.
resource "null_resource" "remove_and_upload_to_s3" {
provisioner "local-exec" {
command = "aws s3 sync ${path.module}/s3Contents s3://${aws_s3_bucket.site.id}"
}
}
Since June 9, 2020, terraform has a built-in way to infer the content type (and a few other attributes) of a file which you may need as you upload to a S3 bucket
HCL format:
module "template_files" {
source = "hashicorp/dir/template"
base_dir = "${path.module}/src"
template_vars = {
# Pass in any values that you wish to use in your templates.
vpc_id = "vpc-abc123"
}
}
resource "aws_s3_bucket_object" "static_files" {
for_each = module.template_files.files
bucket = "example"
key = each.key
content_type = each.value.content_type
# The template_files module guarantees that only one of these two attributes
# will be set for each file, depending on whether it is an in-memory template
# rendering result or a static file on disk.
source = each.value.source_path
content = each.value.content
# Unless the bucket has encryption enabled, the ETag of each object is an
# MD5 hash of that object.
etag = each.value.digests.md5
}
JSON format:
{
"resource": {
"aws_s3_bucket_object": {
"static_files": {
"for_each": "${module.template_files.files}"
#...
}}}}
#...
}
Source: https://registry.terraform.io/modules/hashicorp/dir/template/latest
My objective was to make this dynamic, so whenever i create a folder in a directory, terraform automatically uploads that new folder and its contents into S3 bucket with the same key structure.
Heres how i did it.
First you have to get a local variable with a list of each Folder and the files under it. Then we can loop through that list to upload the source to S3 bucket.
Example: I have a folder called "Directories" with 2 sub folders called "Folder1" and "Folder2" each with their own files.
- Directories
- Folder1
* test_file_1.txt
* test_file_2.txt
- Folder2
* test_file_3.txt
Step 1: Get the local var.
locals{
folder_files = flatten([for d in flatten(fileset("${path.module}/Directories/*", "*")) : trim( d, "../") ])
}
Output looks like this:
folder_files = [
"Folder1/test_file_1.txt",
"Folder1/test_file_2.txt",
"Folder2/test_file_3.txt",
]
Step 2: dynamically upload s3 objects
resource "aws_s3_object" "this" {
for_each = { for idx, file in local.folder_files : idx => file }
bucket = aws_s3_bucket.this.bucket
key = "/Directories/${each.value}"
source = "${path.module}/Directories/${each.value}"
etag = "${path.module}/Directories/${each.value}"
}
This loops over the local var,
So in your S3 bucket, you will have uploaded in the same structure, the local Directory and its sub directories and files:
Directory
- Folder1
- test_file_1.txt
- test_file_2.txt
- Folder2
- test_file_3.txt

Serving STATIC FILES in development and Amazon S3 together

I would like to server static files from Amazon S3 and local server?
also I don't know how to setup MEDIA_URL STATIC_ROOT and MEDIA_ROOT
Context:
I am serving my static files from Amazon S3 using django-boto and my settings/base.py are:
STATICFILES_LOCATION = 'assets'
STATICFILES_STORAGE = 'custom_storages.StaticStorage'
STATIC_URL = "https://%s/%s/" % (AWS_S3_CUSTOM_DOMAIN, STATICFILES_LOCATION)
MEDIAFILES_LOCATION = 'media'
MEDIA_URL = "https://%s/%s/" % (AWS_S3_CUSTOM_DOMAIN, MEDIAFILES_LOCATION)
DEFAULT_FILE_STORAGE = 'custom_storages.MediaStorage'
My custom_storages.py file content is:
from django.conf import settings
# from storages.backends.s3boto3 import S3Boto3Storage
from storages.backends.s3boto import S3BotoStorage
class StaticStorage(S3BotoStorage):
location = settings.STATICFILES_LOCATION
class MediaStorage(S3BotoStorage):
location = settings.MEDIAFILES_LOCATION
And all of this is working fine.
When I execute collectstatic my static files are being uploaded to my bucket in Amazon S3.
The problem I have is that every time I make a change in my css or js files, I need to do the collectstatic command.
How can I setup my project (settings) to serve my static files from S3 and my django in a local server together?
I have a settings/development.py file in which I am overriding the following settings:
STATIC_URL = '/assets/'
STATICFILES_LOCATION = 'assets'
MEDIAFILES_LOCATION = 'media/'
MEDIA_URL = MEDIAFILES_LOCATION
STATIC_ROOT = os.path.join(BASE_DIR, "assets")
MEDIA_ROOT = os.path.join(BASE_DIR, "media")
And my urls.py main file I have this condition:
if settings.DEBUG:
urlpatterns += static(settings.STATIC_URL, document_root=settings.STATIC_ROOT)
urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)

Cant read web2py uploaded .txt from the shell

I have a simple table:
db.define_table('myfiles',
Field('title','string'),
Field('myfile','upload))
Then i run my app from shell:
python web2py.py -S myapp -M
Choose my file_path:
file_path = os.path.join(request.folder,'upload',db.myfiles[1].myfile)
but then i try to read my uploaded file, i get "File not open for reading"
with open(file_path, 'wb') as f: data = f.readlines()
I even tried the same process with copy-paste my file to private folder but still get the same error.
First, the default folder for uploaded files is "uploads", not "upload":
file_path = os.path.join(request.folder, 'uploads', db.myfiles[1].myfile)
Second, you should open the file for reading rather than writing:
with open(file_path, 'rb') as f:
data = f.readlines()