How to troubleshot 'timeout' error on Airflow - error-handling

I have a new DAG that performs a 3 task operation, the DAG runs fine but every now and then i'm getting a 'timeout' error massage on the top in red. i have now idea why this is.
Does anybody knows what can be the cause ?
Here is my code (with a few parameters i changed for discretion reasons):
from airflow import DAG
from airflow.operators.mysql_operator import MySqlOperator
from datetime import datetime
from airflow.operators.sensors import NamedHivePartitionSensor
from airflow.hooks.presto_hook import PrestoHook
import sys
import os
import logging
sys.path.append(os.environ['SSSSSS'] + '/WWW/WWWWW')
from utils import sql_to_string, parse_exec_to_time, parse_exec_to_date, NewPrestoOperator
from config import emails
from NotifyOperator import NotifyOperator
########################################################################
# Parameters to be set
default_args = {
'owner': 'etl',
'start_date': datetime(2019, 04, 15, 0, 0),
'depends_on_past': True,
'wait_for_downstream': True,
'email': data_team_emails,
'email_on_failure': True,
'email_on_retry': False
}
dag = DAG(dag_id='g13-new_lead_form_alert',
default_args=default_args,
max_active_runs=1,
schedule_interval='0 * * * *')
def _get_records_pandas(query):
start_time = datetime.now()
logging.log(logging.INFO, "Extract Query={}".format(query))
records = PrestoHook(presto_conn_id='{0}-new'.format(os.environ['YYYYY'])).get_pandas_df(query)
logging.log(logging.INFO, "Extract completed. it took:{}".format(str(datetime.now() - start_time)))
return records
SELECT_ALL_QUERY = 'select title, pageloadid from mysql.{0}.agg_pageloadid_lead_form'.format(os.environ['DDDDDD'])
t0 = NamedHivePartitionSensor(task_id='g13-00-wait_for_partition',
partition_names=['{2}.table/dt={0}/tm={1}/'.format(
'{{ (execution_date + macros.timedelta(minutes=60)).strftime(\'%Y-%m-%d\')}}',
'{{ (execution_date + macros.timedelta(minutes=60)).strftime(\'%H\')}}',
os.environ['XXXXX'])],
metastore_conn_id='RRRRRR',
dag=dag,
soft_fail=True,
pool='sensor_tasks',
retries=5
)
t1 = MySqlOperator(
task_id='g13-01-truncate',
sql='''
truncate table {0}.agg_pageloaduid_lead_form
'''.format(os.environ['LLLLL']),
mysql_conn_id='AAAA',
dag=dag)
t2 = NewPrestoOperator(
task_id="g13-02-insert_new_lead",
sql=sql_to_string("/g13_insert_new_lead.sql").format(
os.environ['YYYYY'],
'{{execution_date.strftime(\'%Y-%m-%d\')}}',
'{{execution_date.strftime(\'%H\')}}',
os.environ['ETL_ENVIRONMENT']),
presto_conn_id='{0}-new'.format(os.environ['XXXXX']),
provide_context=True,
fail_on_zero_rows=False,
retries=5,
retry_delay=60,
pool='presto_tasks',
dag=dag
)
t3 = NotifyOperator(
task_id='g13-03-notification',
channels=['test'],
email_recipients=[],
email_subject='New Lead Alert',
email_template="""abc""",
op_kwargs={
'title': 'New Lead Form',
'response': _get_records_pandas(SELECT_ALL_QUERY)
},
dag=dag
)
t0 >> t1 >> t2 >> t3
Any idea what could be causing this ?

Related

Slack automate calendar events

I'm looking for a way to automate the creation of calendar events. I'm part of multiple spaces in my school and they keep on posting some events that are happening on a regular basis.
I was wondering is there's a way to automate these calendar events. I want to write a script with Slack api's that can read the messages from all the spaces I'm part of and scan them to see if there's any event related information and create a new calendar event in my google calendars. I want to run this at the end of the day on all the messages from all the spaces.
from __future__ import print_function
import os
import json
import pprint
import time
import parsedatetime
from datetime import datetime
from datetime import timedelta
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
def get_google_service():
creds = None
SCOPES = ['https://www.googleapis.com/auth/calendar']
if os.path.exists('token.json'):
creds = Credentials.from_authorized_user_file('token.json', SCOPES)
return build('calendar', 'v3', credentials=creds)
def send_google_calendar_invite(service, channel_name, start_time, end_time):
try:
# f = open("template.json", "r")
# template_data = f.read()
template_data = '''
{
"summary": "event_name",
"location": "event_location",
"description": "event_description",
"start": {
"dateTime": "event_start_time",
"timeZone": "America/Los_Angeles"
},
"end": {
"dateTime": "event_end_time",
"timeZone": "America/Los_Angeles"
}
}
'''
template_data = template_data.replace('event_name', channel_name)
template_data = template_data.replace('event_location', channel_name+'-meeting')
template_data = template_data.replace('event_description', channel_name+'-desrpition')
template_data = template_data.replace('event_start_time', start_time)
template_data = template_data.replace('event_end_time', end_time)
json_object = json.loads(template_data)
json_formatted_str = json.dumps(json_object, indent=2)
print(json_formatted_str)
event = service.events().insert(calendarId='primary', body=json_object).execute()
print('Event created: %s' % (event.get('htmlLink')))
except HttpError as error:
print('An error occurred: %s' % error)
def read_slack_messages():
channel_id = "C04QL76V21X"
try:
lastHourDateTime = datetime.now() - timedelta(hours=24)
client = WebClient(token=open("secrets.txt", "r").read())
conversation_history = client.conversations_history(channel=channel_id, oldest=time.mktime(lastHourDateTime.timetuple()))
channel_info_result = client.conversations_info(channel=channel_id)
channel_name = channel_info_result['channel']['name']
conversation_messages = conversation_history["messages"]
print("{} messages found in {}".format(len(conversation_messages), id))
# import pdb; pdb.set_trace();
service = get_google_service()
for message in conversation_messages[:2]:
chat_message = message['text']
try:
cal = parsedatetime.Calendar()
dates = cal.parse(chat_message)
print(dates)
start_time = time.strftime('%Y-%m-%dT%H:%M:%S-%000:00', (dates[0]))
end_time = start_time[:11]+f"{int(start_time[11:13])+1:02}"+start_time[13:]
print(chat_message, ' : ', start_time, ' ||| ', end_time)
send_google_calendar_invite(service, channel_name, start_time, end_time)
except TypeError as e:
print(' : Nope : ', e);
except SlackApiError as e:
print("Error getting conversation: {}".format(e))
if __name__ == '__main__':
read_slack_messages()

AttributeError:'str' object has no attribute 'unique' (Pandas.unique)

In my script, I use pandas module. When I execute my file.py - everything works well. But I've converted my file.py to file.exe with auto-py-to-exe and got an error: AttributeError:'str' object has no attribute 'unique'. It's strange because it worked normally. The line where becomes an error: wells=list(file[0].unique()). Who knows this issue, please help.
import tkinter as tk
import tkinter.filedialog as fd
import pandas as pd
import os
import datetime
from datetime import datetime, date
import numpy as np
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 80)
pd.set_option('display.max_rows', 200)
pd.set_option('display.width', 800)
def resource_path(relative_path):
try:
base_path = sys._MEIPASS
except Exception:
base_path = os.path.abspath(".")
return os.path.join(base_path, relative_path)
def open():
global file_excel, name
file_excel = fd.askopenfilename(initialdir='/Desktop', title='Открыть файл', filetypes = [("Excel", "*.xlsx")])
name = os.path.basename(file_excel)
name=os.path.splitext(name)[0]
file_excel=pd.read_excel(file_excel, skiprows=[0], header=None)
win.destroy()
return file_excel, name
win = tk.Tk()
path = resource_path("image.png")
photo = tk.PhotoImage(file=path)
win.iconphoto(False, photo)
win.config(bg='#FFC')
win.title('Конвертация в формат .ev')
win.geometry('400x130+500+500')
win.resizable(False, False)
label_1 = tk.Label(win, text = 'Выберите файл с испытаниями скважин:',
bg = '#FFC',
font=('Arial', 10, 'bold'),
padx=20,
pady=10).pack()
btn_1 = tk.Button(win, text = 'Выбрать Excel',
command = open,
activebackground = '#6F6',
font=('Arial', 12, 'bold'),
padx=20,
pady=10,
relief = tk.RAISED,
bd=2).pack()
win.mainloop()
wells=list(file_excel[0].unique())
file_excel[1] = pd.to_datetime(file_excel[1], errors='coerce').dt.strftime("%d/%m/%Y")
file_excel[4] = np.where(file_excel[1].str, 'Perforation', np.nan)
file_excel.iloc[:,[2,3]]=file_excel.iloc[:,[2,3]].abs()
col_list = list(file_excel)
col_list[4], col_list[2] = col_list[2], col_list[4]
file_excel.columns = col_list
Perforation=pd.DataFrame(data=None)
for i in wells:
well_name=pd.DataFrame({'WELLNAME '+i}, columns=[1])
Perforation=Perforation.append(well_name)
Perforation=Perforation.append(file_excel.iloc[:,[1,2,3,4]][file_excel.iloc[:,0]==i])
Perforation=Perforation.append(pd.Series(dtype = 'object'), ignore_index=True)
def SaveFile():
Save=fd.asksaveasfile(mode='w',defaultextension=".ev", initialfile=name)
Save.write(Perforation.to_string(index=False, header=False, na_rep=' '))
win.destroy()
win = tk.Tk()
path = resource_path("image.png")
photo = tk.PhotoImage(file=path)
win.iconphoto(False, photo)
win.config(bg='#FFC')
win.title('Конвертация в формат .ev')
win.geometry('400x130+500+500')
win.resizable(False, False)
label_1 = tk.Label(win, text = 'Сохранение:',
bg = '#FFC',
font=('Arial', 10, 'bold'),
padx=20,
pady=10).pack()
btn_1 = tk.Button(win, text = 'Сохранить как',
command = SaveFile,
activebackground = '#6F6',
font=('Arial', 12, 'bold'),
padx=20,
pady=10,
relief = tk.RAISED,
bd=2).pack()
win.mainloop()
type of file[0]
Error screen
When I created virtual env I should have added openpyxl module. And I made it and everything is fine now

airflow BigQueryOperator ERROR - 400 Syntax error: Unexpected token at [1:244] - while using params

I have 2 BigQueryOperator tasks in a loop. The first task works perfectly, however the second task (create_partition_table_agent_intensity_{v_list[i]}) throws an error:
ERROR - 400 Syntax error: Unexpected "{" at [1:244]
I can't understand what is the difference between the tasks.
Maybe someone can point me to the right direction?
Here is my entire code:
from airflow.models import (DAG, Variable)
import os
from airflow.operators.dummy import DummyOperator
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
import datetime
import json
import pandas as pd
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from google.cloud import bigquery
from airflow.contrib.hooks.bigquery_hook import BigQueryHook
from airflow.providers.google.cloud.operators.bigquery import BigQueryDeleteTableOperator
default_args = {
'start_date': datetime.datetime(2020, 1, 1),
}
PROJECT_ID = os.environ.get("GCP_PROJECT_ID", "bigquery_default")
PROJECT_ID_GCP = os.environ.get("GCP_PROJECT_ID", "my_project")
DATASET_MRR = os.environ.get("GCP_BIGQUERY_DATASET_NAME", "LP_RAW")
DATASET_STG = os.environ.get("GCP_BIGQUERY_DATASET_NAME", "LP_STG")
MRR_AGENT_ACTIVITY = "RPT_FA_AGENT_ACTIVITY_VW"
MRR_AGENT_INTENSITY = "RPT_AGG_15M_MSG_AGENT_INTENSITY_VW"
STG_AGENT_ACTIVITY_PARTITIONED = "agent_acitivity_partitioned"
STG_AGENT_INTENSITY_PARTITIONED = "agent_intensity_partitioned"
def list_dates_in_df(ti):
hook = BigQueryHook(bigquery_conn_id=PROJECT_ID,
use_legacy_sql=False)
bq_client = bigquery.Client(project = hook._get_field("project"),
credentials = hook._get_credentials())
query = "select distinct(cast(PARTITION_KEY as string)) as PARTITION_KEY \
FROM LP_MNG.PartitionStatusMonitoring\
where SOURCE_TABLE in ('RPT_FA_AGENT_ACTIVITY_VW','RPT_AGG_15M_MSG_AGENT_INTENSITY_VW')\
and IS_LOAD_COMPLETED = false;"
df = bq_client.query(query).to_dataframe()
res = df.values.tolist()
#unpack the list of lists, l is a list inside res list, take item from res, now each item is l
my_list = [item for l in res for item in l]
ti.xcom_push(key = 'list_of_dates', value = my_list)
def update_variable(ti):
updated_file_list = ti.xcom_pull(key = 'list_of_dates',task_ids='list_dates')
Variable.set(key="updated_dates", value=json.dumps(updated_file_list))
print(updated_file_list)
print(type(updated_file_list))
with DAG(
'test_with_mng_table_list',
schedule_interval=None,
catchup = False,
default_args=default_args
) as dag:
list_dates = PythonOperator(
task_id ='list_dates',
python_callable = list_dates_in_df
)
set_list = PythonOperator(
task_id= 'set_list',
python_callable=update_variable
)
v_list = Variable.get("updated_dates", deserialize_json=True)
end_job = BashOperator(
task_id='end_job',
bash_command='echo end_job.',
trigger_rule = 'all_done', )
for i in range(len(v_list)):
create_partition_table_agent_activity = BigQueryOperator(
task_id=f"create_partition_table_agent_activity_{v_list[i]}",
sql="select ACCOUNT_ID,timestamp_trunc(CHANGE_EVENT_TIME_15M,HOUR) as ANALYSIS_DATE,\
AGENT_ID,AGENT_GROUP_ID,USER_TYPE_ID,\
sum(AWAY_ENGAGED_TIME) AWAY_ENGAGED_TIME,sum(BACKIN5_ENGAGED_TIME) BACKIN5_ENGAGED_TIME,\
sum(DURATION_DAYS) DURATION_DAYS,sum(ONLINE_TIME) ONLINE_TIME,\
sum(BACK_IN_5_TIME) BACK_IN_5_TIME,sum(AWAY_TIME) AWAY_TIME\
from {{ params.PROJECT_ID }}.{{ params.DATASET_MRR }}.{{ params.MRR1 }}\
where cast(CHANGE_EVENT_TIME_15M as STRING FORMAT 'YYYY-MM-DD') = cast('{{ params.date_a }}' as STRING) \
group by 1,2,3,4,5;",
params={"PROJECT_ID":PROJECT_ID_GCP ,
"DATASET_MRR":DATASET_MRR,
"MRR1":MRR_AGENT_ACTIVITY,
"date_a" : v_list[i]
},
destination_dataset_table=f"{PROJECT_ID_GCP}.{DATASET_STG}.{STG_AGENT_ACTIVITY_PARTITIONED}{v_list[i]}",
create_disposition='CREATE_IF_NEEDED',
write_disposition='WRITE_TRUNCATE',
#bigquery_conn_id=CONNECTION_ID,
use_legacy_sql=False,
dag=dag
)
create_partition_table_agent_intensity = BigQueryOperator(
task_id=f"create_partition_table_agent_intensity_{v_list[i]}",
sql=f"select ACCOUNT_ID,timestamp_trunc(AGG_DATE,HOUR) as ANALYSIS_DATE,\
AGENT_ID, GROUP_ID as AGENT_GROUP_ID,\
USER_TYPE_ID, SUM(SUM_CONVERSATION_LOAD_RATE) as SUM_CONVERSATION_LOAD_RATE,\
SUM(NO_EVENTS) AS NO_EVENTS\
from {{ params.PROJECT_ID }}.{{ params.DATASET_MRR }}.{{ params.MRR2 }}\
where cast(AGG_DATE as STRING FORMAT 'YYYY-MM-DD') = cast('{{ params.date_a }}' as STRING) \
group by 1,2,3,4,5;",
params={"PROJECT_ID":PROJECT_ID_GCP ,
"DATASET_MRR":DATASET_MRR,
"MRR2":MRR_AGENT_INTENSITY,
"date_a" : v_list[i]
},
destination_dataset_table=f"{PROJECT_ID_GCP}.{DATASET_STG}.{STG_AGENT_INTENSITY_PARTITIONED}{v_list[i]}",
create_disposition='CREATE_IF_NEEDED',
write_disposition='WRITE_TRUNCATE',
#bigquery_conn_id=CONNECTION_ID,
use_legacy_sql=False,
dag=dag
)
d2 = DummyOperator(task_id='generate_data_{0}'.format(v_list[i]),dag=dag)
list_dates >> set_list >> [
create_partition_table_agent_activity,create_partition_table_agent_intensity
] >> d2 >> end_job
I do not have playground to test it, but I think you should not use f-string for sql parameter. If you use {{something}} in f-string it returns string {something} so parameters for query are not inserted and this results in SQL syntax error as query is run without parameters. Please try to remove f before string for sql in 2nd task.

Using Json Input Variables In Airflow EMR Operator Steps

I'm currently following the template given here: https://github.com/apache/airflow/blob/master/airflow/contrib/example_dags/example_emr_job_flow_manual_steps.py to create a DAG to call for a emr instance using spark submit. When setting up the spark_test_steps, I need to include variables passed in from a POST Json to fill the spark submit like below:
SPARK_TEST_STEPS = [
{
'Name': 'calculate_pi',
'ActionOnFailure': 'CONTINUE',
'HadoopJarStep': {
'Jar': 'command-runner.jar',
'Args': [
'/usr/lib/spark/bin/run-example',
'SparkPi',
kwargs['dag_run'].conf['var_1']
kwargs['dag_run'].conf['var_2']
'10'
]
}
}
]
How can I pass in variables given by the POST Json while still following the format given in the git link to look like below?
from datetime import timedelta
import airflow
from airflow import DAG
from airflow.contrib.operators.emr_create_job_flow_operator import EmrCreateJobFlowOperator
from airflow.contrib.operators.emr_add_steps_operator import EmrAddStepsOperator
from airflow.contrib.sensors.emr_step_sensor import EmrStepSensor
from airflow.contrib.operators.emr_terminate_job_flow_operator import EmrTerminateJobFlowOperator
DEFAULT_ARGS = {
'owner': 'Airflow',
'depends_on_past': False,
'start_date': airflow.utils.dates.days_ago(2),
'email': ['airflow#example.com'],
'email_on_failure': False,
'email_on_retry': False
}
dag = DAG(
'emr_job_flow_manual_steps_dag',
default_args=DEFAULT_ARGS,
dagrun_timeout=timedelta(hours=2),
schedule_interval='0 3 * * *'
)
var_1 = ''
var_2 = ''
SPARK_TEST_STEPS = []
def define_param(**kwargs):
global var_1
global var_2
global SPARK_TEST_STEPS
var_1 = str(kwargs['dag_run'].conf['var_1'])
var_2 = str(kwargs['dag_run'].conf['var_2'])
SPARK_TEST_STEPS = [
{
'Name': 'calculate_pi',
'ActionOnFailure': 'CONTINUE',
'HadoopJarStep': {
'Jar': 'command-runner.jar',
'Args': [
'/usr/lib/spark/bin/run-example',
'SparkPi',
kwargs['dag_run'].conf['var_1']
kwargs['dag_run'].conf['var_2']
'10'
]
}
}
]
return SPARK_TEST_STEPS
DEFINE_PARAMETERS = PythonOperator(
task_id='DEFINE_PARAMETERS',
python_callable=define_param,
provide_context=True,
dag=dag)
cluster_creator = EmrCreateJobFlowOperator(
task_id='create_job_flow',
job_flow_overrides=JOB_FLOW_OVERRIDES,
aws_conn_id='aws_default',
emr_conn_id='emr_default',
dag=dag
)
step_adder = EmrAddStepsOperator(
task_id='add_steps',
job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
aws_conn_id='aws_default',
steps='{{ ti.xcom_pull(task_ids="DEFINE_PARAMETERS") }}',
dag=dag
)
step_checker = EmrStepSensor(
task_id='watch_step',
job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
step_id="{{ task_instance.xcom_pull('add_steps', key='return_value')[0] }}",
aws_conn_id='aws_default',
dag=dag
)
cluster_remover = EmrTerminateJobFlowOperator(
task_id='remove_cluster',
job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
aws_conn_id='aws_default',
dag=dag
)
cluster_creator.set_downstream(step_adder)
step_adder.set_downstream(step_checker)
step_checker.set_downstream(cluster_remover)
I cannot use Variable.get and Variable.set as this will not allow multiple dag calls for different variable types at the same time due to the constant changing of airflow global variables. I have tried calling SPARK_TEST_STEPS using xcom but the return type of xcom is string and EmrAddStepsOperator steps requires a list.
I solved a similar problem by creating a custom operator that parses the json prior to executing. The cause of the problem is that when you pass steps='{{ ti.xcom_pull(task_ids="DEFINE_PARAMETERS") }}',. you are literally passing a string with the value interpolated by the templating engine, it is not deserialized.
from airflow.contrib.hooks.emr_hook import EmrHook
from airflow.exceptions import AirflowException
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults
from airflow.contrib.operators.emr_add_steps_operator import EmrAddStepsOperator
import json
class DynamicEmrStepsOperator(EmrAddStepsOperator):
template_fields = ['job_flow_id', 'steps']
template_ext = ()
ui_color = '#f9c915'
#apply_defaults
def __init__(
self,
job_flow_id=None,
steps="[]",
*args, **kwargs):
super().__init__(
job_flow_id = job_flow_id,
steps = steps,
*args, **kwargs)
def execute(self, context):
self.steps = json.loads(self.steps)
return super().execute(context)

BigQuery Python Client Library - Named Parameters Error

I'm trying to write a simple query using the Python client library named "parameter", but kept encountering errors.
I keep getting "Undeclared query parameters" when I try to run the code. Did I miss out anything?
My Code:
import datetime
import os
from google.cloud import bigquery
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=<path>
client = bigquery.Client(project='project_id')
query = """
SELECT * from `<project_id>.<dataset_id>.*`
WHERE CAST(REGEXP_EXTRACT(_TABLE_SUFFIX, r"^(\d{8})$") AS INT64) = #date
limit 10;
"""
query_params = [
bigquery.ScalarQueryParameter(
'date',
'INT64',
int((datetime.date.today().strftime('%Y%m%d'))
)
]
job_config = bigquery.QueryJobConfig()
job_config.query_parameters = query_params
query_job = client.query(
query,
location = 'US')
for row in query_job:
print(row)
assert query_job.state == 'DONE'
It looks like you are missing to enter your job_config into the arguments of your client.query() method. You should have:
query_job = client.query(
query,
location = 'US',
job_config=job_config)
Official docs here.