How to dynamically update parameters of an existing Airflow (version 1.9) Connection within code? - ssh

I have defined an SSH connection via the Airflow Admin UI. However, I am only defining the service account, host and port in the UI. I retrieve the password in the first task instance, need to update the SSH connection with that password in the second task instance, and then use the updated connection in the third task instance.
t1: call an R function to retrieve the password for the svc account (stored via xcom_push)
t2: update the SSH connection with this password (I am using SSHHook): ssh02.password = password (retrieved via xcom_pull)
t3: call a server using the previously updated connection (ssh02)
Currently t1 and t2 work as expected; however, t3 fails because the password is not getting updated and it falls back to .ssh key file based authentication. Can someone please suggest how this can be implemented?
Here is my code snippet:
from airflow import models
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta
from airflow.contrib.hooks.ssh_hook import SSHHook
from airflow.models import Variable
from airflow.models import Connection
from airflow.settings import Session
from airflow.utils import db
from airflow.utils.db import provide_session
from airflow import DAG
import logging
import os
svcpassword = 'XXXX'
logging.getLogger().setLevel(logging.DEBUG)
ssh01 = SSHHook(ssh_conn_id='ssh_conn1')
ssh02 = SSHHook(ssh_conn_id='ssh_conn2')
default_args = {
    'owner': 'user',
    'depends_on_past': False,
    'start_date': datetime.now(),
    'email': ['abcd@gmail.com'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

dag = DAG('dag_POC', default_args=default_args,
          schedule_interval="@once")

path1 = '/home/user/R_samplescript'

t1 = SSHOperator(
    task_id='SSHTask',
    command='Rscript ' + path1 + '.R',
    ssh_hook=ssh01,
    params={},
    retries=1,
    do_xcom_push=True,
    dag=dag
)

def create_new_connection(**kwargs):
    ti = kwargs['ti']
    pwd = ti.xcom_pull(task_ids='SSHTask')
    password = str(pwd).replace("\\n", "\n")
    password = password[password.find(' ') + 1:]
    password = password.strip()
    svcpassword = password
    db.merge_conn(models.Connection(
        conn_id='ssh_conn2', conn_type='SSH',
        host='server_name', port='XXXX',
        login='account_name', password=svcpassword))

t2 = PythonOperator(
    task_id='Create_Connection',
    python_callable=create_new_connection,
    provide_context=True,
    dag=dag
)

t3 = SSHOperator(
    task_id='RemoteCallTest',
    command="R command",
    ssh_hook=SSHHook().get_conn('ssh_conn2'),
    do_xcom_push=False,
    retries=1,
    dag=dag
)
t1 >> t2 >> t3

You need to leverage the session wrapper to persist the change to the metadata database:
from airflow.models import Connection
from airflow.utils.db import provide_session

@provide_session
def set_password(conn_id, password, session=None):
    # Look up the Connection row directly, update its password and commit
    conn = session.query(Connection).filter(Connection.conn_id == conn_id).first()
    conn.set_password(password)
    session.add(conn)
    session.commit()
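As a rough illustration of how this could plug into the DAG above (hypothetical wiring, not from the original answer: the callable body and the ssh_hook argument for t3 are assumptions):
def create_new_connection(**kwargs):
    ti = kwargs['ti']
    pwd = str(ti.xcom_pull(task_ids='SSHTask')).strip()
    # persist the password on the existing ssh_conn2 row via the session-backed helper above
    set_password('ssh_conn2', pwd)

t3 = SSHOperator(
    task_id='RemoteCallTest',
    command="R command",
    # pass the conn_id so the hook looks the connection up (with the new
    # password) when the task runs, instead of calling get_conn() at parse time
    ssh_hook=SSHHook(ssh_conn_id='ssh_conn2'),
    do_xcom_push=False,
    dag=dag)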

Related

Google people API returning empty / no results in Python

I'm trying to read contacts from my personal Gmail account, and the code from Google's People API instructions returns an empty list. I'm not sure why. I've tried another solution from a few years ago, but that doesn't seem to work. I've pasted my code below. Any help troubleshooting this is appreciated!
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
# If modifying these scopes, delete the file token.json.
SCOPES = ['https://www.googleapis.com/auth/contacts.readonly']
from google.oauth2 import service_account
SERVICE_ACCOUNT_FILE = '<path name hidden>.json'
credentials = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE, scopes=SCOPES)

def main():
    # Shows basic usage of the People API.
    # Prints the name of the first 10 connections.
    creds = None
    service = build('people', 'v1', credentials=credentials)

    # Call the People API
    print('List 10 connection names')
    results = service.people().connections().list(
        resourceName='people/me',
        pageSize=10,
        personFields='names,emailAddresses').execute()
    connections = results.get('connections', [])
    request = service.people().searchContacts(pageSize=10, query="A", readMask="names")
    results = service.people().connections().list(
        resourceName='people/me',
        personFields='names,emailAddresses',
        fields='connections,totalItems,nextSyncToken').execute()
    for i in results:
        print('result', i)
    for person in connections:
        names = person.get('names', [])
        if names:
            name = names[0].get('displayName')
            print(name)

if __name__ == '__main__':
    main()

Unload data from Redshift to S3 using Airflow

I would like to unload data from the Redshift db to an S3 bucket, which would later be used to copy into another database. I have written my DAG as below:
from airflow.operators import RedshiftToS3Transfer
from datetime import datetime, timedelta
from airflow import DAG
default_args = {
    'owner': 'me',
    'start_date': datetime.today(),
    'max_active_runs': 1,
}

dag = DAG(dag_id='redshift_S3',
          default_args=default_args,
          schedule_interval="@once",
          catchup=False
          )

unload_to_S3 = RedshiftToS3Transfer(
    task_id='unload_to_S3',
    schema='schema_name',
    table='table_name',
    s3_bucket='bucket_name',
    s3_key='s3_key',
    redshift_conn_id='redshift',
    aws_conn_id='my_s3_conn',
    dag=dag
)
But I get the error "Broken DAG: cannot import name 'RedshiftToS3Transfer' from 'airflow.operators' (unknown location)". Any ideas on how to import RedshiftToS3Transfer would be of help.
The right way to import this is:
from airflow.operators.redshift_to_s3_operator import RedshiftToS3Transfer

How do you authenticate a websocket connection with Knox token authentication on django channels?

I understand you can write custom authentication middleware to use in django channels 2.
This works fine with Django's built-in token authentication but using django-rest-knox tokens is a different story.
Knox stores its tokens in an encrypted form so it is not as easy as simply retrieving the user from the database by looking up the token.
Please help.
Figured it out!
from knox.auth import TokenAuthentication
from rest_framework import HTTP_HEADER_ENCODING
...
# tokenString is the raw Knox token pulled from the connection (e.g. a header or cookie)
knoxAuth = TokenAuthentication()
user, auth_token = knoxAuth.authenticate_credentials(tokenString.encode(HTTP_HEADER_ENCODING))
scope['user'] = user
Integrate the above code with: https://gist.github.com/rluts/22e05ed8f53f97bdd02eafdf38f3d60a
In order to authenticate a user with token authentication, you must use cookies, because the headers you can send over WebSockets are limited. You must also implement your own "TokenAuthMiddleware" to handle the cookie. For Channels 2 you also have to handle database access correctly; below is how to do that:
from channels.auth import AuthMiddlewareStack
from channels.db import database_sync_to_async
from knox.auth import TokenAuthentication
from django.contrib.auth.models import AnonymousUser
from django.db import close_old_connections
from rest_framework.exceptions import AuthenticationFailed
import re


class TokenAuthMiddlewareInstance:
    def __init__(self, scope, middleware):
        self.middleware = middleware
        self.scope = dict(scope)
        self.inner = self.middleware.inner

    async def __call__(self, receive, send):
        self.scope['user'] = AnonymousUser()
        cookie = dict(self.scope.get('headers', {})).get(b'cookie')
        if cookie:
            token = re.findall(r'X-Authorization=(\w*)', cookie.decode('ascii'))
            if len(token):
                self.scope['user'] = await self._g_user(token)
        inner = self.inner(self.scope)
        return await inner(receive, send)

    @database_sync_to_async
    def _g_user(self, token):
        try:
            token_key = token[0]
            user, token = TokenAuthentication().authenticate_credentials(token_key.encode('ascii'))
            close_old_connections()
            return user
        except AuthenticationFailed:
            return AnonymousUser()


class TokenAuthMiddleware:
    def __init__(self, inner):
        self.inner = inner

    def __call__(self, scope):
        return TokenAuthMiddlewareInstance(scope, self)


TokenAuthMiddlewareStack = lambda inner: TokenAuthMiddleware(AuthMiddlewareStack(inner))
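For reference, a minimal sketch of wiring this middleware stack into a Channels 2 routing file (the module paths, URL pattern and consumer name here are placeholders, not from the original answer):
# routing.py -- hypothetical wiring of the TokenAuthMiddlewareStack defined above
from channels.routing import ProtocolTypeRouter, URLRouter
from django.urls import path

from .middleware import TokenAuthMiddlewareStack  # wherever the classes above live
from .consumers import ChatConsumer               # placeholder consumer

application = ProtocolTypeRouter({
    'websocket': TokenAuthMiddlewareStack(
        URLRouter([
            path('ws/chat/', ChatConsumer),
        ])
    ),
})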

How to increase request timeout for http4s

I have a request that talks to a backend DB that takes time to respond, and http4s is throwing a request timeout. I wanted to know if there is a property to increase the request timeout.
Thanks
Saad.
Server Timeouts
BlazeBuilder can easily be adjusted. The default implementation is
import org.http4s._
import scala.concurrent.duration._
BlazeBuilder(
  socketAddress = InetSocketAddress.createUnresolved(LoopbackAddress, 8080),
  serviceExecutor = DefaultPool, // org.http4s.util.threads - ExecutorService
  idleTimeout = 30.seconds,
  isNio2 = false,
  connectorPoolSize = math.max(4, Runtime.getRuntime.availableProcessors() + 1),
  bufferSize = 64*1024,
  enableWebSockets = true,
  sslBits = None,
  isHttp2Enabled = false,
  maxRequestLineLen = 4*1024,
  maxHeadersLen = 40*1024,
  serviceMounts = Vector.empty
)
We can utilize the default and change that, as the class has a copy method implemented.
import org.http4s._
import scala.concurrent.duration._
BlazeBuilder.copy(idleTimeout = 5.minutes)
You can then proceed with your server however you would like, adding your services and then serving.
Client Timeouts
BlazeClient takes a config class called BlazeClientConfig
The default is
import org.http4s._
import org.http4s.client._
BlazeClientConfig(
  idleTimeout = 60.seconds,
  requestTimeout = Duration.Inf,
  userAgent = Some(
    `User-Agent`(AgentProduct("http4s-blaze", Some(BuildInfo.version)))
  ),
  sslContext = None,
  checkEndpointIdentification = true,
  maxResponseLineSize = 4*1024,
  maxHeaderLength = 40*1024,
  maxChunkSize = Integer.MAX_VALUE,
  lenientParser = false,
  bufferSize = 8*1024,
  customExecutor = None,
  group = None
)
However, since a default config already exists and it is a case class, you are probably better off modifying the default. Use PooledHttp1Client in most cases.
import scala.concurrent.duration._
import org.http4s.client._
val longTimeoutConfig =
  BlazeClientConfig
    .defaultConfig
    .copy(idleTimeout = 5.minutes)

val client = PooledHttp1Client(
  maxTotalConnections = 10,
  config = longTimeoutConfig
)

How to retrieve a value from Airflow XCom pushed via SSHExecuteOperator

I have the following DAG with two SSHExecuteOperator tasks. The first task executes a stored procedure which returns a parameter. The second task needs this parameter as an input.
Could you please explain how to pull the value from the XCom pushed in task1, in order to use it in task2?
from airflow import DAG
from datetime import datetime, timedelta
from airflow.contrib.hooks.ssh_hook import SSHHook
from airflow.contrib.operators.ssh_execute_operator import SSHExecuteOperator
from airflow.models import Variable
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.now(),
    'email': ['my@email.com'],
    'email_on_failure': True,
    'retries': 0
}

# server must be changed to point to the correct environment; to do so, update the DataQualitySSHHook variable in Airflow admin
DataQualitySSHHook = Variable.get('DataQualitySSHHook')
print('Connecting to: ' + DataQualitySSHHook)
sshHookEtl = SSHHook(conn_id=DataQualitySSHHook)
sshHookEtl.no_host_key_check = True

# create dag
dag = DAG(
    'ed_data_quality_test-v0.0.3',  # update version whenever you change something
    default_args=default_args,
    schedule_interval="0 0 * * *",
    dagrun_timeout=timedelta(hours=24),
    max_active_runs=1)

# create tasks
task1 = SSHExecuteOperator(
    task_id='run_remote_sp_audit_batch_register',
    bash_command="bash /opt/scripts/data_quality/EXEC_SP_AUDIT_BATCH.sh 'ED_DATA_QUALITY_MANUAL' 'REGISTER' '1900-01-01 00:00:00.000000' '2999-12-31 00:00:00.000000' ",  # keep the space at the end
    ssh_hook=sshHookEtl,
    xcom_push=True,
    retries=0,
    dag=dag)

task2 = SSHExecuteOperator(
    task_id='run_remote_sp_audit_module_session_start',
    bash_command="echo {{ ti.xcom_pull(task_ids='run_remote_sp_audit_batch_register') }}",
    ssh_hook=sshHookEtl,
    retries=0,
    dag=dag)

# create dependencies
task1.set_downstream(task2)
So the solution I found is: when task1 executes the shell script, you have to make sure the parameter you want captured in the XCom variable is the last thing printed by your script (using echo).
Then I was able to retrieve the XCom variable value with the following code snippet:
{{ task_instance.xcom_pull(task_ids='run_remote_sp_audit_batch_register') }}
Instead of xcom_push=True, try do_xcom_push=True. It will push all of the stdout to XCom with the key return_value.
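For example, a minimal sketch of that suggestion applied to task1, assuming the newer airflow.contrib.operators.ssh_operator.SSHOperator (which takes command rather than bash_command) and reusing sshHookEtl and dag from the DAG above:
from airflow.contrib.operators.ssh_operator import SSHOperator

task1 = SSHOperator(
    task_id='run_remote_sp_audit_batch_register',
    command="bash /opt/scripts/data_quality/EXEC_SP_AUDIT_BATCH.sh ... ",  # same script and arguments as above
    ssh_hook=sshHookEtl,
    do_xcom_push=True,  # stdout is pushed to XCom under the key 'return_value'
    retries=0,
    dag=dag)

# downstream, the value can then be pulled with:
# {{ ti.xcom_pull(task_ids='run_remote_sp_audit_batch_register') }}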