AWS Glue - Create date partition from timestamp field - amazon-s3

Having a data frame with a timestamp field, like so:
timestamp                  id    version
2022-01-01 01:02:00.000    1     2
2022-01-01 05:12:00.000    1     2
I've created a Glue job that uses ApplyMapping to save the data to a new S3 location. Currently I've added id and version partitions by selecting those fields in the visual editor, and my data is saved with the following structure: id=1/version=2/. I would like to parse the timestamp and extract the date value so the filesystem structure would be id=1/version=2/dt=2022-01-01/. However, in the visual editor I can only select the timestamp and can't perform any manipulation on the field. I'm guessing I need to change the code, but I'm not sure how.
Code:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# Script generated for node S3 bucket
S3bucket_node1 = glueContext.create_dynamic_frame.from_options(
    format_options={},
    connection_type="s3",
    format="parquet",
    connection_options={"paths": ["s3://my-data"], "recurse": True},
    transformation_ctx="S3bucket_node1",
)

# Script generated for node ApplyMapping
ApplyMapping_node2 = ApplyMapping.apply(
    frame=S3bucket_node1,
    mappings=[
        ("timestamp", "timestamp", "timestamp", "timestamp"),
        ("id", "string", "id", "string"),
        ("version", "string", "version", "string"),
    ],
    transformation_ctx="ApplyMapping_node2",
)

# Script generated for node S3 bucket
S3bucket_node3 = glueContext.write_dynamic_frame.from_options(
    frame=ApplyMapping_node2,
    connection_type="s3",
    format="glueparquet",
    connection_options={
        "path": "s3://target-data",
        "partitionKeys": ["id", "version"],
    },
    format_options={"compression": "gzip"},
    transformation_ctx="S3bucket_node3",
)
job.commit()

Use the Map class.
Add this method to your script:
def AddDate(rec):
    # Take the date portion (YYYY-MM-DD) of the timestamp string as the partition value
    ts = str(rec["timestamp"])
    rec["dt"] = ts[:10]
    return rec
Insert the Map transform after the ApplyMapping step:
Mapped_dyF = Map.apply(frame=ApplyMapping_node2, f=AddDate)
Update the write-to-S3 step; note the changes to frame and partitionKeys:
S3bucket_node3 = glueContext.write_dynamic_frame.from_options(
    frame=Mapped_dyF,
    connection_type="s3",
    format="glueparquet",
    connection_options={
        "path": "s3://target-data",
        "partitionKeys": ["id", "version", "dt"],
    },
    format_options={"compression": "gzip"},
    transformation_ctx="S3bucket_node3",
)
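If you would rather derive the date with Spark instead of the Map transform, a hedged alternative sketch (reusing the column and node names from the script above) is to convert the DynamicFrame to a DataFrame, add dt with date_format, and convert back:

from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import col, date_format

# Add a dt column (yyyy-MM-dd) derived from the timestamp column
mapped_df = ApplyMapping_node2.toDF().withColumn(
    "dt", date_format(col("timestamp"), "yyyy-MM-dd")
)
Mapped_dyF = DynamicFrame.fromDF(mapped_df, glueContext, "Mapped_dyF")

The write step then stays exactly as above, with partitionKeys set to ["id", "version", "dt"].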

Related

Error in loading nested and repeated data to BigQuery?

I am getting a JSON response from an API.
I want to get 5 columns from it; 4 are normal, but 1 column is of RECORD REPEATED type.
I want to load that data into a BigQuery table.
Below is my code, in which the schema is defined.
import requests
from requests.auth import HTTPBasicAuth
import json
from google.cloud import bigquery
import pandas
import pandas_gbq
URL='<API>'
auth = HTTPBasicAuth('username', 'password')
# sending get request and saving the response as response object
r = requests.get(url=URL, auth=auth)
data = r.json()
---------------------- JSON response ----------------
{
    "data": {
        "id": "jfp695q8",
        "origin": "taste",
        "title": "Christmas pudding martini recipe",
        "subtitle": null,
        "customTitles": [{
            "name": "editorial",
            "value": "Christmas pudding martini"
        }]
    }
}
id=data['data']['id']
origin=data['data']['origin']
title=data['data']['title']
subtitle=data['data']['subtitle']
customTitles=json.dumps(data['data']['customTitles'])
# print(customTitles)
df = pandas.DataFrame(
    {
        'id': id,
        'origin': origin,
        'title': title,
        'subtitle': 'subtitle',
        'customTitles': customTitles
    }, index=[0]
)
# df.head()
client = bigquery.Client(project='ncau-data-newsquery-sit')
table_id = 'sdm_adpoint.testfapi'
job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("id", "STRING"),
        bigquery.SchemaField("origin", "STRING"),
        bigquery.SchemaField("title", "STRING"),
        bigquery.SchemaField("subtitle", "STRING"),
        bigquery.SchemaField(
            "customTitles",
            "RECORD",
            mode="REPEATED",
            fields=[
                bigquery.SchemaField("name", "STRING", mode="NULLABLE"),
                bigquery.SchemaField("value", "STRING", mode="NULLABLE"),
            ])
    ],
    autodetect=False
)
df.head()
job = client.load_table_from_dataframe(
    df, table_id, job_config=job_config
)
job.result()
customTitles is a RECORD REPEATED field with two keys, name and value, so I have built the schema accordingly.
Below is my table schema.
Below is the output of df.head():
jfp695q8 taste Christmas pudding martini recipe subtitle [{"name": "editorial", "value": "Christmas pudding martini"}]
Up to here it's all good.
But when I try to load the data into the table, it throws the error below:
ArrowTypeError: Could not convert '[' with type str: was expecting tuple of (key, value) pair
Can anyone tell me what's wrong here?
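The error is most likely caused by the json.dumps call: the customTitles column ends up holding a JSON string, while the REPEATED RECORD schema makes pyarrow expect a list of dicts per cell. A minimal, untested sketch of the likely fix (keeping the rest of the script unchanged):

# Keep customTitles as a Python list of dicts instead of a JSON string,
# so pyarrow can map it onto the REPEATED RECORD column.
customTitles = data['data']['customTitles']   # e.g. [{"name": "editorial", "value": "..."}]

df = pandas.DataFrame(
    {
        'id': id,
        'origin': origin,
        'title': title,
        'subtitle': subtitle,
        'customTitles': [customTitles],       # one list-of-records cell for the single row
    },
    index=[0],
)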

Import only CSV files from GCS to Dataflow and BigQuery using Cloud Composer - Apache Airflow

I have a use case: there are several file types in GCS, such as JSON, CSV and TXT, but I only want to pick up the CSV files, use Dataflow in Python to transform them (for example, rename fields), and then write them to BigQuery. The main requirement is to use Airflow sensors, without Cloud Functions, to trigger the pipeline whenever a new CSV file lands in GCS.
Here is my code:
from datetime import timedelta, datetime
from airflow.models import DAG
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.providers.google.cloud.sensors.gcs import GCSObjectExistenceSensor
from airflow.contrib.operators.dataflow_operator import DataflowTemplateOperator

PROJECT = 'abc'
ZONE = 'us-central1-c'
BUCKET_NAME = 'bucket_testing'
BQ_DATASET = "abc.dataset_name"
LOCATION = "US"

DEFAULT_DAG_ARGS = {
    'owner': 'gcs to bigquery using dataflow',
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'schedule_interval': '@daily',
    'dataflow_default_options': {
        'project': PROJECT,
        'zone': ZONE,
        'stagingLocation': BUCKET_NAME
    }
}

ENVIRONMENT = {
    "bypassTempDirValidation": "false",
    "maxWorkers": "20",
    "numWorkers": "1",
    "serviceAccountEmail": "abc8932097-compute@developer.gserviceaccount.com",
    "tempLocation": "gs://composer_bucket",
    "ipConfiguration": "WORKER_IP_UNSPECIFIED",
    "additionalExperiments": [
        "sideinput_io_metrics"
    ]
}

PARAMETERS = {
    "outputTable": "abc:dataset_name.how_to_define_here",  # how to get multiple tables from multiple csv?
    "bigQueryLoadingTemporaryDirectory": "gs://composer_bucket",
}

with DAG('dag_sensor', default_args=DEFAULT_DAG_ARGS, dagrun_timeout=timedelta(hours=3), schedule_interval='00 * * * *') as dag:
    gcs_file_exists = GCSObjectExistenceSensor(
        task_id="gcs_object_sensor",
        bucket=BUCKET_NAME,
        object='*.csv',
        mode='poke',
    )
    my_dataflow_job = DataflowTemplateOperator(
        task_id='transfer_from_gcs_to_bigquery',
        template='???',  # what do I need to write here?
        parameters=PARAMETERS,
        environment=ENVIRONMENT,
        dag=dag
    )
    my_bq_result = BigQueryOperator(
        task_id='write_to_bq',
        use_legacy_sql=False,
        write_disposition='WRITE_TRUNCATE',
        create_disposition='CREATE_IF_NEEDED',
        dag=dag
    )
    gcs_file_exists >> my_dataflow_job >> my_bq_result
I am a newbie here, so please point me to a detailed example.
Many thanks!
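One possible direction, as a sketch only with illustrative names (WaitForCsvSensor, the prefix default): GCSObjectExistenceSensor expects an exact object name, so object='*.csv' will not match anything. A small custom sensor built on GCSHook can poll the bucket and succeed only when a .csv object is present:

from airflow.providers.google.cloud.hooks.gcs import GCSHook
from airflow.sensors.base import BaseSensorOperator


class WaitForCsvSensor(BaseSensorOperator):
    """Succeeds once at least one .csv object exists under the given prefix."""

    def __init__(self, bucket, prefix="", gcp_conn_id="google_cloud_default", **kwargs):
        super().__init__(**kwargs)
        self.bucket = bucket
        self.prefix = prefix
        self.gcp_conn_id = gcp_conn_id

    def poke(self, context):
        hook = GCSHook(gcp_conn_id=self.gcp_conn_id)
        objects = hook.list(self.bucket, prefix=self.prefix)
        # Only react to CSV objects, ignoring json, txt, etc.
        return any(name.endswith(".csv") for name in objects)


gcs_csv_exists = WaitForCsvSensor(
    task_id="gcs_csv_sensor",
    bucket=BUCKET_NAME,
    poke_interval=60,
    timeout=60 * 60,
    mode="poke",
)

For the template argument of DataflowTemplateOperator, the Google-provided gs://dataflow-templates/latest/GCS_Text_to_BigQuery template is one option if a classic template fits the transform; otherwise point it at your own staged template.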

MongoDB delete data using regex

I was able to use the following to delete data using pandas:
import re
repl = {r'<[^>]+>': '',
        r'\r\n': ' ',
        r'Share to facebook|Share to twitter|Share to linkedin|Share on Facebook|Share on Twitter|Share on Messenger|Share on Whatsapp': ''}

articles['content'] = articles['content'].replace(repl, regex=True)
How can I do the same on the actual database that's in Atlas?
My data structure is:
_id:
title:
url:
description:
author:
publishedAt:
content:
source_id:
urlToImage:
summarization:
MongoDB does not (for now) have any built-in operator to perform a regex replace in place.
You can instead loop through the matching documents, using a regex find, and apply the replacement in the programming language of your choice.
from pymongo import MongoClient
import re

m_client = MongoClient("<MONGODB-URI-STRING>")
db = m_client["<DB-NAME>"]
collection = db["<COLLECTION-NAME>"]

replace_dictionary = {
    r'<[^>]+>': '',
    r'\r\n': ' ',
    r'Share to facebook|Share to twitter|Share to linkedin|Share on Facebook|Share on Twitter|Share on Messenger|Share on Whatsapp': ''
}

count = 0
for it in collection.find({
    # Merge all regex finds into a single list
    "$or": [{"content": re.compile(x, re.IGNORECASE)} for x in replace_dictionary.keys()]
}, {
    # Project only the field to be replaced for faster execution of the script
    "content": 1
}):
    # Iterate over regexes and replacements and apply them using `re.sub`
    for k, v in replace_dictionary.items():
        it["content"] = re.sub(
            pattern=k,
            repl=v,
            string=it["content"],
        )

    # Update the regex-replaced string
    collection.update_one({
        "_id": it["_id"]
    }, {
        "$set": {
            "content": it['content']
        }
    })

    # Count to keep track of completion
    count += 1
    print("\r", count, end='')

print("DONE!!!")

Is there a way to use a dynamic dataset name in BigQuery?

Problem Statement :
I am trying to use the BigQueryOperator in Airflow. The aim is to reuse the same queries while dynamically changing the dataset name, i.e. the dataset name will be passed in as a parameter.
example:
project.dataset1_layer1.tablename1, project.dataset2_layer1.tablename1
Expected:
I want to maintain a single copy of the SQL in which I can pass the dataset name as a parameter that gets substituted for each particular dataset.
Error messages:
I tried to pass the dynamic dataset name as part of query_params, but it failed with the error message below.
The query was parsed as:
INFO - Executing: [u'SELECT col1, col2 FROM project.@partner_layer1.tablename']
ERROR - BigQuery job failed. Final error was: {u'reason': u'invalidQuery', u'message': u'Query parameters cannot be used in place of table names at [1:37]', u'location': u'query'}. u'CREATE_IF_NEEDED', u'query': u'SELECT col1, col2 FROM project.@partner_layer1.tablename'}, u'jobType': u'QUERY'}}
Things I have tried so far
The query template temp.sql is as below:
SELECT col1, col2 FROM `project.@partner_layer1.tablename`;
The Airflow BigQueryOperator is used as below:
query_template_dict = {
    'partner_list': ['val1', 'val2', 'val3', 'val4'],
    'google_project': 'project_name',
    'queries': {
        'layer3': {
            'template': 'temp.sql',
            'output_dataset': '_layer3',
            'output_tbl': 'table_{}'.format(table_date),
            'output_tbl_schema': 'temp.txt'
        }
    },
    'applicable_tasks': {
        'val1': {
            'table_layer3': []
        },
        'val2': {
            'table_layer3': []
        },
        'val3': {
            'table_layer3': []
        },
        'val4': {
            'table_layer3': []
        }
    }
}

for partner in query_template_dict['partner_list']:
    # Loop over applicable report queries for a partner
    applicable_tasks = query_template_dict['applicable_tasks'][partner].keys()
    for task in applicable_tasks:
        destination_tbl = '{}.{}{}.{}'.format(query_template_dict['google_project'], partner,
                                              query_template_dict['queries'][task]['output_dataset'],
                                              query_template_dict['queries'][task]['output_tbl'])
        # Actual destination table structure
        # destination_tbl = 'project.partner_layer3.table_20200223'
        run_bq_cmd = BigQueryOperator(
            task_id=partner + '-' + task,
            sql=[query_template_dict['queries'][task]['template']],
            destination_dataset_table=destination_tbl,
            use_legacy_sql=False,
            write_disposition='WRITE_APPEND',
            create_disposition='CREATE_IF_NEEDED',
            allow_large_results=True,
            query_params=[
                {
                    "name": "partner",
                    "parameterType": {"type": "STRING"},
                    "parameterValue": {"value": partner}
                },
                {
                    "name": "batch_date",
                    "parameterType": {"type": "STRING"},
                    "parameterValue": {"value": batch_date}
                }
            ],
            dag=dag,
        )
Can anybody help me with this issue?
Is there a limitation in BigQuery on dynamically passing dataset names?
Replace the dataset name in Airflow, not in BigQuery.
So do this before the query is sent to BigQuery: use Python string replacement within Airflow.
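A minimal sketch of that idea, assuming temp.sql uses a plain Python placeholder such as {partner} instead of a BigQuery query parameter (dataset and table names can never be query parameters, but they can be ordinary string substitutions done at DAG-build time):

# Render the SQL in Python before handing it to BigQueryOperator.
with open('temp.sql') as f:
    sql_template = f.read()   # e.g. "SELECT col1, col2 FROM `project.{partner}_layer1.tablename`"

for partner in query_template_dict['partner_list']:
    rendered_sql = sql_template.format(partner=partner)
    run_bq_cmd = BigQueryOperator(
        task_id=partner + '-layer3',
        sql=rendered_sql,
        use_legacy_sql=False,
        write_disposition='WRITE_APPEND',
        create_disposition='CREATE_IF_NEEDED',
        # keep query_params only for real value parameters such as batch_date
        dag=dag,
    )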

How to export pandas data to elasticsearch?

It is possible to export pandas DataFrame data to Elasticsearch using elasticsearch-py. For example, here is some code:
https://www.analyticsvidhya.com/blog/2017/05/beginners-guide-to-data-exploration-using-elastic-search-and-kibana/
There are a lot of similar methods, like to_excel, to_csv and to_sql.
Is there a to_elastic method? If not, where should I request it?
The following script works for localhost:
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)), columns=list('ABCD'))
INDEX = "dataframe"
TYPE = "record"

def rec_to_actions(df):
    import json
    for record in df.to_dict(orient="records"):
        yield ('{ "index" : { "_index" : "%s", "_type" : "%s" }}' % (INDEX, TYPE))
        yield (json.dumps(record, default=int))

from elasticsearch import Elasticsearch

e = Elasticsearch()  # no args, connect to localhost:9200
if not e.indices.exists(INDEX):
    raise RuntimeError('index does not exists, use `curl -X PUT "localhost:9200/%s"` and try again' % INDEX)

r = e.bulk(rec_to_actions(df))  # return a dict
print(not r["errors"])
Verify using curl -g 'http://localhost:9200/dataframe/_search?q=A:[29%20TO%2039]'
There are many little things that can be added to suit different needs, but the main part is there.
I'm not aware of any to_elastic method integrated into pandas. You can always raise an issue on the pandas GitHub repo or create a pull request.
However, there is espandas, which allows importing a pandas DataFrame into Elasticsearch. The following example from the README has been tested with Elasticsearch 6.2.1.
import pandas as pd
import numpy as np
from espandas import Espandas
df = (100 * pd.DataFrame(np.round(np.random.rand(100, 5), 2))).astype(int)
df.columns = ['A', 'B', 'C', 'D', 'E']
df['indexId'] = (df.index + 100).astype(str)
INDEX = 'foo_index'
TYPE = 'bar_type'
esp = Espandas()
esp.es_write(df, INDEX, TYPE)
Retrieving the mappings with GET foo_index/_mappings:
{
  "foo_index": {
    "mappings": {
      "bar_type": {
        "properties": {
          "A": {
            "type": "long"
          },
          "B": {
            "type": "long"
          },
          "C": {
            "type": "long"
          },
          "D": {
            "type": "long"
          },
          "E": {
            "type": "long"
          },
          "indexId": {
            "type": "text",
            "fields": {
              "keyword": {
                "type": "keyword",
                "ignore_above": 256
              }
            }
          }
        }
      }
    }
  }
}
Maybe you can use:
pip install es_pandas
pip install progressbar2
This package should work on Python 3 (>=3.4), and Elasticsearch should be version 5.x, 6.x or 7.x.
import time
import pandas as pd
from es_pandas import es_pandas

# Information of the es cluster
es_host = 'localhost:9200'
index = 'demo'

# create an es_pandas instance
ep = es_pandas(es_host)

# Example data frame
df = pd.DataFrame({'Alpha': [chr(i) for i in range(97, 128)],
                   'Num': [x for x in range(31)],
                   'Date': pd.date_range(start='2019/01/01', end='2019/01/31')})

# init template if you want
doc_type = 'demo'
ep.init_es_tmpl(df, doc_type)

# Example of writing data to es, using the template you created
ep.to_es(df, index, doc_type=doc_type)

# set use_index=True if you want to use the DataFrame index as the records' _id
ep.to_es(df, index, doc_type=doc_type, use_index=True)
Here is the documentation: https://pypi.org/project/es-pandas/
If es_pandas can't solve your problem, you could look at another solution: https://towardsdatascience.com/exporting-pandas-data-to-elasticsearch-724aa4dd8f62
You could use elasticsearch-py, or if you won't use elasticsearch-py, you may find an answer to your question here => index-a-pandas-dataframe-into-elasticsearch-without-elasticsearch-py
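If you do go with elasticsearch-py directly, a minimal hedged sketch using the bulk helper looks like this (the index name "dataframe" and the host are placeholders; recent client versions no longer use _type):

import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

df = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})

es = Elasticsearch("http://localhost:9200")   # adjust to your cluster

# One action per DataFrame row; "dataframe" is a placeholder index name.
actions = (
    {"_index": "dataframe", "_source": record}
    for record in df.to_dict(orient="records")
)

success, errors = bulk(es, actions)
print(success, errors)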