I'm trying to write a simple query with a named parameter using the Python client library, but I keep encountering errors.
I keep getting "Undeclared query parameters" when I try to run the code. Did I miss anything?
My Code:
import datetime
import os
from google.cloud import bigquery

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = <path>
client = bigquery.Client(project='project_id')

query = """
SELECT * from `<project_id>.<dataset_id>.*`
WHERE CAST(REGEXP_EXTRACT(_TABLE_SUFFIX, r"^(\d{8})$") AS INT64) = @date
limit 10;
"""

query_params = [
    bigquery.ScalarQueryParameter(
        'date',
        'INT64',
        int(datetime.date.today().strftime('%Y%m%d'))
    )
]

job_config = bigquery.QueryJobConfig()
job_config.query_parameters = query_params

query_job = client.query(
    query,
    location='US')

for row in query_job:
    print(row)

assert query_job.state == 'DONE'
It looks like you forgot to pass your job_config in the arguments of your client.query() method. You should have:
query_job = client.query(
    query,
    location='US',
    job_config=job_config)
Official docs here.
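For reference, here is a minimal sketch of the corrected flow in one piece (the project and dataset IDs are placeholders, not real ones); passing job_config is what declares @date to BigQuery:

import datetime
from google.cloud import bigquery

client = bigquery.Client(project='project_id')  # placeholder project ID

query = """
SELECT *
FROM `project_id.dataset_id.*`
WHERE CAST(REGEXP_EXTRACT(_TABLE_SUFFIX, r"^(\d{8})$") AS INT64) = @date
LIMIT 10
"""

job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter(
            'date', 'INT64', int(datetime.date.today().strftime('%Y%m%d'))
        )
    ]
)

# job_config carries the parameter declaration, so @date is no longer "undeclared".
query_job = client.query(query, location='US', job_config=job_config)
for row in query_job:
    print(row)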
I'm running this Flask app:
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS, cross_origin
import json
import pandas as pd

# Create the app object
app = Flask(__name__)
cors = CORS(app, resources={r"/*": {'origins': "*"}})

# importing function for calculations
from Record_Matching import Matching

@app.route("/query", methods=['get'])
@cross_origin()
def query():
    # service_account_creds = request.json
    query1 = request.args.get('query1', type=str)
    query2 = request.args.get('query2', type=str)
    querycolumns = request.args.get('querycolumns')
    project_id = request.args.get('project_id', type=str)
    service_account_creds = request.args.get('service_account')
    SS = request.args.get('SS', type=float)
    TT = request.args.get('TT', type=float)
    result = Matching(query1, query2, SS, TT, service_account_creds, project_id, querycolumns)
    return result

if __name__ == "__main__":
    app.run(host="localhost", port=8080, debug=True)
and I'm importing the Matching function from this Python script:
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
import recordlinkage
from recordlinkage.preprocessing import phonetic
from pandas.io.json import json_normalize
import uuid
from uuid import uuid4
import random
import string
import json
import ast
# Results to data frame function
def gcp2df(sql, client):
    query = client.query(sql)
    results = query.result()
    return results.to_dataframe()
# Exporting df to bigquery - table parameter example: "dataset.tablename"
# def insert(df, table):
# client = bigquery.Client()
# job_config = bigquery.LoadJobConfig(write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE)
# return client.load_table_from_dataframe(df, table, job_config = job_config)
def pair(df1, df2, TT, querycolumns):
    # function to take pair from list and compare:
    L = querycolumns
    l = len(querycolumns)
    p1 = 0
    p2 = 1
    # To generate phonetics we need to make sure all names are in English,
    # thus we'll replace non-English words with random English strings
    df1[L[p1]] = df1[L[p1]].astype(str)
    df2[L[p2]] = df2[L[p2]].astype(str)
    for i in range(0, len(df1)):
        if df1[L[p1]][i].isascii() == False:
            df1[L[p1]][i] = ''.join(random.choices(string.ascii_lowercase, k=5))
    for i in range(0, len(df2)):
        if df2[L[p2]][i].isascii() == False:
            df2[L[p2]][i] = ''.join(random.choices(string.ascii_lowercase, k=5))
    compare = recordlinkage.Compare()
    df1["phonetic_given_name"] = phonetic(df1[L[p1]], "soundex")
    df2["phonetic_given_name"] = phonetic(df2[L[p2]], "soundex")
    df1["initials"] = (df1[L[p1]].str[0] + df1[L[p1]].str[-1])
    df2["initials"] = (df2[L[p2]].str[0] + df2[L[p2]].str[-1])
    indexer = recordlinkage.Index()
    indexer.block('initials')
    candidate_links = indexer.index(df1, df2)
    compare.exact('phonetic_given_name', 'phonetic_given_name', label="phonetic_given_name")
    # O(n): two pointers walk consecutive column pairs from the input list
    while p2 <= l:
        compare.string(L[p1], L[p2], method='jarowinkler', threshold=TT, label=L[p1])
        p1 += 2
        p2 += 2
    features = compare.compute(candidate_links, df1, df2)
    return features
def Matching(query1, query2, SS, TT, service_account_creds, project_id, querycolumns):
    service_account_creds = ast.literal_eval(service_account_creds)
    credentials = service_account.Credentials(service_account_creds, service_account_creds['client_email'],
                                              service_account_creds['token_uri'])
    job_config = bigquery.LoadJobConfig()
    client = bigquery.Client(project=project_id)
    SS = int(SS)
    TT = float(TT)
    df1 = gcp2df("""{}""".format(query1), client)
    df2 = gcp2df("""{}""".format(query2), client)
    querycolumns = json.loads(querycolumns)
    querycolumns = list(querycolumns.values())
    features = pair(df1, df2, TT, querycolumns)
    features['Similarity_score'] = features.sum(axis=1)
    features = features[features['Similarity_score'] >= SS].reset_index()
    final = features[['level_0', 'level_1']]
    final.rename(columns={'level_0': 'df1_index', 'level_1': 'df2_index'}, inplace=True)
    final['Unique_ID'] = [uuid.uuid4() for _ in range(len(final.index))]
    final['Unique_ID'] = final['Unique_ID'].astype(str)
    final['Similarity_Score'] = SS
    final_duplicates = final['df1_index'].value_counts().max()
    # insert(final, "test-ahmed-project.Record_Linkage.Matching_Indices")
    message = "Mission accomplished!, your highest number of duplicates is " + str(final_duplicates)
    return {'message': message, 'final': final.to_dict('records'), 'df1': df1.to_dict('records')}
I'm not sure why returning df1 as a dictionary raises a ValueError when I call the function from the Flask app, yet when I run it in a Jupyter notebook with the same dataframe pulled from BigQuery, it works just fine. Why does it not work in the Flask app?
I tried to_dict('record') to convert the dataframe to a dictionary.
Looking online, many resources suggest the error occurs because the data contains missing values, but that shouldn't be the problem here, because converting the same dataframe to a dictionary in a Jupyter notebook works just fine.
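As an aside, one thing that may be worth ruling out (an assumption on my part, since the full traceback isn't shown): recent pandas releases no longer accept abbreviated orient values, so to_dict('record') raises ValueError while to_dict('records') works. If the Flask environment runs a newer pandas than the notebook, that alone could explain the difference. A minimal check:

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})

# 'records' is the full orient name and works across pandas versions.
print(df.to_dict('records'))

# The abbreviation 'record' was removed in newer pandas releases and raises
# ValueError there, while older versions still accept it (hypothetical cause).
try:
    print(df.to_dict('record'))
except ValueError as exc:
    print("abbreviated orient rejected:", exc)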
I have 2 BigQueryOperator tasks in a loop. The first task works perfectly; however, the second task (create_partition_table_agent_intensity_{v_list[i]}) throws an error:
ERROR - 400 Syntax error: Unexpected "{" at [1:244]
I can't understand what the difference is between the tasks.
Maybe someone can point me in the right direction?
Here is my entire code:
from airflow.models import (DAG, Variable)
import os
from airflow.operators.dummy import DummyOperator
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
import datetime
import json
import pandas as pd
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from google.cloud import bigquery
from airflow.contrib.hooks.bigquery_hook import BigQueryHook
from airflow.providers.google.cloud.operators.bigquery import BigQueryDeleteTableOperator
default_args = {
    'start_date': datetime.datetime(2020, 1, 1),
}
PROJECT_ID = os.environ.get("GCP_PROJECT_ID", "bigquery_default")
PROJECT_ID_GCP = os.environ.get("GCP_PROJECT_ID", "my_project")
DATASET_MRR = os.environ.get("GCP_BIGQUERY_DATASET_NAME", "LP_RAW")
DATASET_STG = os.environ.get("GCP_BIGQUERY_DATASET_NAME", "LP_STG")
MRR_AGENT_ACTIVITY = "RPT_FA_AGENT_ACTIVITY_VW"
MRR_AGENT_INTENSITY = "RPT_AGG_15M_MSG_AGENT_INTENSITY_VW"
STG_AGENT_ACTIVITY_PARTITIONED = "agent_acitivity_partitioned"
STG_AGENT_INTENSITY_PARTITIONED = "agent_intensity_partitioned"
def list_dates_in_df(ti):
    hook = BigQueryHook(bigquery_conn_id=PROJECT_ID,
                        use_legacy_sql=False)
    bq_client = bigquery.Client(project=hook._get_field("project"),
                                credentials=hook._get_credentials())
    query = "select distinct(cast(PARTITION_KEY as string)) as PARTITION_KEY \
             FROM LP_MNG.PartitionStatusMonitoring \
             where SOURCE_TABLE in ('RPT_FA_AGENT_ACTIVITY_VW','RPT_AGG_15M_MSG_AGENT_INTENSITY_VW') \
             and IS_LOAD_COMPLETED = false;"
    df = bq_client.query(query).to_dataframe()
    res = df.values.tolist()
    # unpack the list of lists: each l is a list inside res, take every item from each l
    my_list = [item for l in res for item in l]
    ti.xcom_push(key='list_of_dates', value=my_list)
def update_variable(ti):
    updated_file_list = ti.xcom_pull(key='list_of_dates', task_ids='list_dates')
    Variable.set(key="updated_dates", value=json.dumps(updated_file_list))
    print(updated_file_list)
    print(type(updated_file_list))
with DAG(
    'test_with_mng_table_list',
    schedule_interval=None,
    catchup=False,
    default_args=default_args
) as dag:

    list_dates = PythonOperator(
        task_id='list_dates',
        python_callable=list_dates_in_df
    )

    set_list = PythonOperator(
        task_id='set_list',
        python_callable=update_variable
    )

    v_list = Variable.get("updated_dates", deserialize_json=True)

    end_job = BashOperator(
        task_id='end_job',
        bash_command='echo end_job.',
        trigger_rule='all_done',
    )

    for i in range(len(v_list)):
        create_partition_table_agent_activity = BigQueryOperator(
            task_id=f"create_partition_table_agent_activity_{v_list[i]}",
            sql="select ACCOUNT_ID, timestamp_trunc(CHANGE_EVENT_TIME_15M, HOUR) as ANALYSIS_DATE, \
                AGENT_ID, AGENT_GROUP_ID, USER_TYPE_ID, \
                sum(AWAY_ENGAGED_TIME) AWAY_ENGAGED_TIME, sum(BACKIN5_ENGAGED_TIME) BACKIN5_ENGAGED_TIME, \
                sum(DURATION_DAYS) DURATION_DAYS, sum(ONLINE_TIME) ONLINE_TIME, \
                sum(BACK_IN_5_TIME) BACK_IN_5_TIME, sum(AWAY_TIME) AWAY_TIME \
                from {{ params.PROJECT_ID }}.{{ params.DATASET_MRR }}.{{ params.MRR1 }} \
                where cast(CHANGE_EVENT_TIME_15M as STRING FORMAT 'YYYY-MM-DD') = cast('{{ params.date_a }}' as STRING) \
                group by 1,2,3,4,5;",
            params={"PROJECT_ID": PROJECT_ID_GCP,
                    "DATASET_MRR": DATASET_MRR,
                    "MRR1": MRR_AGENT_ACTIVITY,
                    "date_a": v_list[i]
                    },
            destination_dataset_table=f"{PROJECT_ID_GCP}.{DATASET_STG}.{STG_AGENT_ACTIVITY_PARTITIONED}{v_list[i]}",
            create_disposition='CREATE_IF_NEEDED',
            write_disposition='WRITE_TRUNCATE',
            # bigquery_conn_id=CONNECTION_ID,
            use_legacy_sql=False,
            dag=dag
        )

        create_partition_table_agent_intensity = BigQueryOperator(
            task_id=f"create_partition_table_agent_intensity_{v_list[i]}",
            sql=f"select ACCOUNT_ID, timestamp_trunc(AGG_DATE, HOUR) as ANALYSIS_DATE, \
                AGENT_ID, GROUP_ID as AGENT_GROUP_ID, \
                USER_TYPE_ID, SUM(SUM_CONVERSATION_LOAD_RATE) as SUM_CONVERSATION_LOAD_RATE, \
                SUM(NO_EVENTS) AS NO_EVENTS \
                from {{ params.PROJECT_ID }}.{{ params.DATASET_MRR }}.{{ params.MRR2 }} \
                where cast(AGG_DATE as STRING FORMAT 'YYYY-MM-DD') = cast('{{ params.date_a }}' as STRING) \
                group by 1,2,3,4,5;",
            params={"PROJECT_ID": PROJECT_ID_GCP,
                    "DATASET_MRR": DATASET_MRR,
                    "MRR2": MRR_AGENT_INTENSITY,
                    "date_a": v_list[i]
                    },
            destination_dataset_table=f"{PROJECT_ID_GCP}.{DATASET_STG}.{STG_AGENT_INTENSITY_PARTITIONED}{v_list[i]}",
            create_disposition='CREATE_IF_NEEDED',
            write_disposition='WRITE_TRUNCATE',
            # bigquery_conn_id=CONNECTION_ID,
            use_legacy_sql=False,
            dag=dag
        )

        d2 = DummyOperator(task_id='generate_data_{0}'.format(v_list[i]), dag=dag)

        list_dates >> set_list >> [
            create_partition_table_agent_activity, create_partition_table_agent_intensity
        ] >> d2 >> end_job
I don't have a playground to test this, but I think you should not use an f-string for the sql parameter. Inside an f-string, {{ something }} is rendered as the literal text { something }, so Jinja never sees the template fields, the parameters are not inserted, and the query reaches BigQuery with raw braces, which causes the SQL syntax error. Try removing the f prefix from the sql string in the second task.
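A quick way to see the effect outside Airflow (a standalone illustration, not the asker's DAG):

# With the f prefix, Python collapses the doubled braces before Jinja ever sees them,
# so the rendered SQL contains literal '{' characters that BigQuery rejects.
template = f"select * from {{ params.PROJECT_ID }}.{{ params.DATASET_MRR }}.some_table"
print(template)  # select * from { params.PROJECT_ID }.{ params.DATASET_MRR }.some_table

# Without the f prefix, the '{{ ... }}' markers survive for Jinja to render at runtime.
template = "select * from {{ params.PROJECT_ID }}.{{ params.DATASET_MRR }}.some_table"
print(template)  # select * from {{ params.PROJECT_ID }}.{{ params.DATASET_MRR }}.some_table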
This is a continuation of a previous post: how to get results from BigQuery based on user input parameters.
I tried to use EXECUTE IMMEDIATE and USING, as these articles describe:
https://cloud.google.com/bigquery/docs/parameterized-queries
https://towardsdatascience.com/how-to-use-dynamic-sql-in-bigquery-8c04dcc0f0de
But when I run the SQL, I get a syntax error. I'd like to have my SQL checked. I guess the error is caused by the line breaks, but I want to keep them for readability. Sorry for my poor coding skills. Could you give me advice?
I'm also a little worried that BigQuery doesn't support dynamic parameters in Python, because the articles above seem to use these statements in the console, not in Python.
The error
File "/srv/main.py", line 14 SELECT EXISTS(SELECT 1
SyntaxError: invalid syntax
SQL
query = """EXECUTE IMMEDIATE format("""
SELECT EXISTS(SELECT 1
FROM `test-266778.conversion_log.conversion_log_2020*` as p
WHERE p.luid = @request_luid AND orderid != '' limit 1000)""")"""
USING "request_luid" as request_luid;
/home/user/api_dev/main.py
from flask import Flask, request, jsonify
from google.cloud import bigquery

app = Flask(__name__)

@app.route('/')
def get_request():
    request_luid = request.args.get('luid') or ''
    client = bigquery.Client()
    query = """EXECUTE IMMEDIATE format("""
    SELECT EXISTS(SELECT 1
    FROM `test-266778.conversion_log.conversion_log_2020*` as p
    WHERE p.luid = @request_luid AND orderid != '' limit 1000)""")"""
    USING "request_luid" as request_luid;
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("request_luid", "STRING", request_luid)
        ]
    )
    query_job = client.query(query, job_config=job_config)
    query_res = query_job.result()
    first_row = next(iter(query_job.result()))
    for row in query_res:
        return str(row)
    # return jsonify({request_luid: query_res.total_rows})

if __name__ == "__main__":
    app.run()
You can try this:
def get_request():
    request_luid = request.args.get("luid") or ""
    client = bigquery.Client()
    query = """SELECT EXISTS(
        SELECT 1
        FROM `test-266778.conversion_log.conversion_log_2020*` as p
        WHERE p.luid = {}
        AND p.orderid is not null limit 1000)""".format(request_luid)
    query_job = client.query(query)
    query_res = query_job.result()
    first_row = next(iter(query_job.result()))
    for row in query_res:
        return str(row)
Note: if luid is non-numeric, use '{}' (quoted) instead.
You can try this:
EXECUTE IMMEDIATE
"""SELECT EXISTS(SELECT 1 FROM `test-266778.conversion_log.conversion_log_2020*` WHERE luid = ? AND orderid is not null limit 1000)"""
USING
"string-value";
For a numeric input value, don't use the double quotes.
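As a third option (a sketch only, building on the ScalarQueryParameter the asker already constructs rather than on either answer above): the dynamic-SQL layer can be dropped entirely and the plain query run with @request_luid bound through job_config. The helper name luid_exists is made up for illustration:

from google.cloud import bigquery

def luid_exists(client: bigquery.Client, request_luid: str) -> bool:
    # Plain parameterized query: no EXECUTE IMMEDIATE and no string formatting needed.
    query = """
    SELECT EXISTS(
        SELECT 1
        FROM `test-266778.conversion_log.conversion_log_2020*` AS p
        WHERE p.luid = @request_luid AND p.orderid != ''
        LIMIT 1000) AS found
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("request_luid", "STRING", request_luid)
        ]
    )
    row = next(iter(client.query(query, job_config=job_config).result()))
    return bool(row["found"])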
Using the Python Connector I can query Snowflake:
import snowflake.connector
# Gets the version
ctx = snowflake.connector.connect(
user=USER,
password=PASSWORD,
account=ACCOUNT,
authenticator='https://XXXX.okta.com',
)
ctx.cursor().execute('USE warehouse MY_WH')
ctx.cursor().execute('USE MYDB.MYSCHEMA')
query = '''
select * from MYDB.MYSCHEMA.MYTABLE
LIMIT 10;
'''
cur = ctx.cursor().execute(query)
The result is a snowflake.connector.cursor.SnowflakeCursor. How can I convert that to a pandas DataFrame?
You can use DataFrame.from_records() or pandas.read_sql() with snowflake-sqlalchemy. The snowflake-sqlalchemy option has a simpler API.
pd.DataFrame.from_records(iter(cur), columns=[x[0] for x in cur.description])
will return a DataFrame with proper column names taken from the SQL result. The iter(cur) will convert the cursor into an iterator and cur.description gives the names and types of the columns.
So the complete code will be
import snowflake.connector
import pandas as pd
# Gets the version
ctx = snowflake.connector.connect(
user=USER,
password=PASSWORD,
account=ACCOUNT,
authenticator='https://XXXX.okta.com',
)
ctx.cursor().execute('USE warehouse MY_WH')
ctx.cursor().execute('USE MYDB.MYSCHEMA')
query = '''
select * from MYDB.MYSCHEMA.MYTABLE
LIMIT 10;
'''
cur = ctx.cursor().execute(query)
df = pd.DataFrame.from_records(iter(cur), columns=[x[0] for x in cur.description])
If you prefer using pandas.read_sql, then you can:
import pandas as pd
from sqlalchemy import create_engine
from snowflake.sqlalchemy import URL
url = URL(
account = 'xxxx',
user = 'xxxx',
password = 'xxxx',
database = 'xxx',
schema = 'xxxx',
warehouse = 'xxx',
role='xxxxx',
authenticator='https://xxxxx.okta.com',
)
engine = create_engine(url)
connection = engine.connect()
query = '''
select * from MYDB.MYSCHEMA.MYTABLE
LIMIT 10;
'''
df = pd.read_sql(query, connection)
There is now a method fetch_pandas_all() for this; no need for SQLAlchemy anymore.
Note that you need to install the Snowflake connector with pandas support:
pip install snowflake-connector-python[pandas]
Full documentation here
import pandas as pd
import snowflake.connector
conn = snowflake.connector.connect(
user="xxx",
password="xxx",
account="xxx",
warehouse="xxx",
database="MYDB",
schema="MYSCHEMA"
)
cur = conn.cursor()
# Execute a statement that will generate a result set.
sql = "select * from MYTABLE limit 10"
cur.execute(sql)
# Fetch the result set from the cursor and deliver it as the Pandas DataFrame.
df = cur.fetch_pandas_all()
I just want to leave here a small change I made to the code to ensure that the columns have correct names (in my case the fetch call returned long column names that included information beyond the name itself). I leave it here in case someone needs it:
import snowflake.connector
import pandas as pd
def fetch_pandas(cur, sql):
    cur.execute(sql)
    frames = []
    while True:
        # n is the global batch size defined below
        dat = cur.fetchmany(n)
        if not dat:
            break
        # Column names come from the cursor's description
        cols = [desc[0] for desc in cur.description]
        frames.append(pd.DataFrame(dat, columns=cols))
    # Concatenate the batches so results larger than one fetchmany() chunk are kept
    return pd.concat(frames, ignore_index=True)
n = 100000
conn = snowflake.connector.connect(
user='xxxxx',
password='yyyyyy',
account='zzzzz',
warehouse = 'wwwww',
database = 'mmmmmm',
schema = 'nnnnn'
)
cursor = conn.cursor()
fetch_pandas(cursor, 'select * from "mmmmmm"."wwwww"."table"')
I need to run parameterized queries using arrays.
Python Client Library for BigQuery API
id_pull = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
query = "SELECT column1 FROM `table1` WHERE id = #get_id;"
query_params = [
bigquery.ArrayQueryParameter(
'get_id', 'INT64', id_pull)
]
job_config = bigquery.QueryJobConfig()
job_config.query_parameters = query_params
query_job = client.query(query, location='US', job_config=job_config) #API request-starts query
results = query_job.result() # Waits for job to complete.
I followed the instructions from the documentation; however, this error appears after execution:
raise self._exception
google.api_core.exceptions.BadRequest: 400 No matching signature for operator = for
argument types: INT64, ARRAY<INT64>. Supported signatures: ANY = ANY at [1:67]
Does someone know what the problem is and how to fix it?
I think the issue is in your WHERE clause
Instead of
WHERE id = @get_id
it should be something like
WHERE id IN UNNEST(@get_id)
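Putting it together, a minimal sketch of the corrected call (the table and column names are the placeholders from the question):

from google.cloud import bigquery

client = bigquery.Client()

id_pull = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# UNNEST expands the array parameter so IN can match individual INT64 values.
query = "SELECT column1 FROM `table1` WHERE id IN UNNEST(@get_id);"

job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter('get_id', 'INT64', id_pull)
    ]
)

query_job = client.query(query, location='US', job_config=job_config)
results = query_job.result()  # Waits for the job to complete.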