Way to optimize this case when sql query - sql

WITH cte AS
(
SELECT
channelGrouping,
visitnumber AS times,
COUNT(*) AS number_of_visitor
FROM
`bigquery-public-data.google_analytics_sample.*`
WHERE
_TABLE_SUFFIX BETWEEN 'ga_sessions_20160801' AND 'ga_sessions_20171213'
AND FullvisitorID IN (SELECT fullVisitorID
FROM `bigquery-public-data.google_analytics_sample.*`
WHERE _TABLE_SUFFIX BETWEEN 'ga_sessions_20160801'
AND 'ga_sessions_20171213'
GROUP BY fullvisitorID
HAVING COUNT(fullvisitorid) > 1)
GROUP BY
channelgrouping,
visitnumber
ORDER BY
channelgrouping,
times
)
SELECT
*,
(number_of_visitor * 100 /
(CASE
WHEN channelgrouping = 'Organic Search'
THEN (SELECT number_of_visitor
FROM cte
WHERE times = 1 AND channelgrouping = 'Organic Search')
WHEN channelgrouping = 'Social'
THEN (SELECT number_of_visitor
FROM cte
WHERE times = 1 AND channelgrouping = 'Social')
WHEN channelgrouping = 'Direct'
THEN (SELECT number_of_visitor
FROM cte
WHERE times = 1 AND channelgrouping
='Direct')
WHEN channelgrouping = 'Referral'
THEN (SELECT number_of_visitor
FROM cte
WHERE times = 1 AND channelgrouping = 'Referral')
WHEN channelgrouping = 'Paid Search'
THEN (SELECT number_of_visitor
FROM cte
WHERE times = 1 AND channelgrouping ='Paid Search')
WHEN channelgrouping = 'Affiliates'
THEN (SELECT number_of_visitor
FROM cte
WHERE times = 1 AND channelgrouping = 'Affiliates')
WHEN channelgrouping = 'Display'
THEN (SELECT number_of_visitor
FROM cte
WHERE times = 1 AND channelgrouping = 'Display')
WHEN channelgrouping = '(Other)'
THEN (SELECT number_of_visitor
FROM cte
WHERE times = 1 AND channelgrouping = '(Other)')
END)) AS retention_rate
FROM
cte
WHERE
times > 1
ORDER BY
cte.channelgrouping, cte.times
There are only 8 channelgrouping, so I can list them. Just wonder if there is a better way to reduce the repetition? What if there are 100 channelgroupings?

There is no need to repeat the same query multiple times in the CASE statement. In fact, there doesn't appear to be any need for a CASE statement at all, since the inner query can reference the outer query and you say these are the only possible values.
You should be able to do something more like:
WITH cte AS (
...
)
SELECT *,
number_of_visitor * 100 /
(
SELECT number_of_visitor
FROM cte cte2
WHERE cte2.times = 1 AND cte1.channelgrouping = cte2.channelgrouping
)
FROM cte cte1
WHERE
times > 1
ORDER BY
cte1.channelgrouping, cte1.times
I should note that this reduces the repetition in the existing query which I think is the intention of your question. I'm clarifying since "optimize" when talking about queries usually has a performance connotation. What I'm suggesting will make the query easier to read and easier to maintain ("What if there are 100 channelgroupings"), but won't necessarily improve performance.

Related

Find the count by conditioned group

I have a table on hand:
order_id, order_status (2=completed, 1=canceled), client_id, fee
I need to find how many client completed 1 order, and how many client completed more than 1 order.
I tried case as below but no clue:
select client_id, count(case order_status when '2' then 'completed' end) as cnt_completed, sum(fee)
from order
where cnt_completed = 1;
Are there any straightforward ways to get the numbers of clients who completed 1 order and numbers of clients who completed more than 1 order, along with the sum of fee for each group? Thanks.
Use two levels of aggregation to get this broken out by the number of orders:
select cnt, count(*), sum(sum_fee)
from (select client_id, count(*) as cnt, sum(fee) as sum_fee
from order o
where order_status = '2'
group by client_id
) o
group by cnt;
For just two rows, you can use:
select (case when cnt = 1 then 1 else 2 end), count(*), sum(sum_fee)
from (select client_id, count(*) as cnt, sum(fee) as sum_fee
from order o
where order_status = '2'
group by client_id
) o
group by (case when cnt = 1 then 1 else 2 end);

Getting sum of distinct values when grouping by an unnested value in google big query

I am querying google big query table which has many rows but the ones I am interested in looks like this:
date fullVisitorId hits.product.productSKU hits.product.v2ProductName hits.transaction.transactionId
20210427 63546815 MM52AF panda 149816182
20210427 65198162 KGSA5A giraffe 321498182
I am trying to count the total transactions by counting distinct hits.transaction.transactionId.
with t1 as
(
SELECT
DATE_TRUNC(PARSE_DATE("%Y%m%d", date), MONTH) as month,
fullVisitorId,
product.productSKU as sku,
product.v2ProductName as v2,
case when hits.ecommerceaction.action_type = '2' then 1 else 0 end as pdp_visitor,
count(case when hits.ecommerceaction.action_type = '2' then fullvisitorid else null end) AS views_pdp,
count(case when hits.ecommerceaction.action_type = '3' then fullvisitorid else null end) AS add_cart,
count(case when hits.ecommerceaction.action_type = '6' then hits.transaction.transactionid else null end) AS conversions,
count(distinct(hits.transaction.transactionId)) as transaction_id_cnt,
FROM `table` AS nr,
UNNEST(hits) hits,
UNNEST(product) product
GROUP BY 1,2,3,4,5
)
select
month,
product.productSKU as sku,
product.v2ProductName as v2,
sum(views_pdp) as pdp
,sum(add_cart) as add_cart
,sum(conversions) as conversions
,sum(transaction_id_cnt) as transactions
from t1
group by 1
order by 1 desc;
Which returns:
month sku v2 pdp add_cart conversions transactions
2021-04-01 AHBS 615 10146410 365569 46885 46640
2021-03-01 HERD 154 10074095 399483 58162 57811
But transactions is not correct, I get the correct output using this:
with t1 as
(
SELECT
DATE_TRUNC(PARSE_DATE("%Y%m%d", date), MONTH) as month,
fullVisitorId,
ARRAY_AGG(DISTINCT product.productSKU IGNORE NULLS) AS productSKU_list, -- changed this
ARRAY_AGG(DISTINCT product.v2ProductName IGNORE NULLS) AS productName_list, -- changed this
case when hits.ecommerceaction.action_type = '2' then 1 else 0 end as pdp_visitor,
0 AS views_impressions,
count(case when hits.ecommerceaction.action_type = '2' then fullvisitorid else null end) AS views_pdp,
count(case when hits.ecommerceaction.action_type = '3' then fullvisitorid else null end) AS add_cart,
0 AS add_shortlist,
count(case when hits.ecommerceaction.action_type = '5' then fullvisitorid else null end) AS checkouts,
count(case when hits.ecommerceaction.action_type = '6' then hits.transaction.transactionid else null end) AS conversions,
count(distinct(hits.transaction.transactionId)) as transaction_id_cnt,
FROM `table` AS nr,
UNNEST(hits) hits,
UNNEST(product) product
GROUP BY 1,2,5
)
select
month,
sum(views_pdp) as pdp
,sum(add_cart) as add_cart
,sum(conversions) as conversions
,sum(transaction_id_cnt) as transactions
from t1
group by 1
order by 1 desc;
Which returns the correct transactions
month pdp add_cart conversions transactions
2021-04-01 9978511 396333 46885 30917
2021-03-01 15101718 568904 58162 23017
But using this :
...
ARRAY_AGG(DISTINCT product.productSKU IGNORE NULLS) AS productSKU_list,
ARRAY_AGG(DISTINCT product.v2ProductName IGNORE NULLS) AS productName_list,
...
Does not allow me to group or select productSKU_list and productName_list in my second select statement.
I believe this is because if one order is made with multiple items in the basket there are multiple lines in google big query with the same hits.transaction.transactionId
I tried confirming this with:
select distinct(hits.transaction.transactionId), count(distinct hits.transaction.transactionId) as total
FROM `table` AS nr,
UNNEST(hits) hits,
UNNEST(product) product
WHERE _TABLE_SUFFIX between '200101' AND '210428'
GROUP BY 1
order by 2 desc
But I get:
transactionId total
ABSAD54 1
515ABDG 1
So at this point, I am lost, as I am unsure why I get the correct answer if I use the second script or when I comment out this part from the first query.
--product.productSKU,
--product.v2ProductName,
Any tips on how google big query works is accepted.
My goal is to have the correct output of transactions which is achieved in the second script but still be able to group and have values of product.productSKU and product.v2ProductName.
in your second query , you need to aggregate them again :
select
month,
sum(views_pdp) as pdp
,sum(add_cart) as add_cart
,sum(conversions) as conversions
,sum(transaction_id_cnt) as transactions
,ARRAY_AGG(productSKU_list)
,ARRAY_AGG(productName_list)
from t1
group by month
order by month desc;

Sum of distinct values after grouping explodes a metric

I am using
with t1 as
(
SELECT
DATE_TRUNC(PARSE_DATE("%Y%m%d", date), MONTH) as month,
fullVisitorId,
product.productSKU,
product.v2ProductName,
case when hits.ecommerceaction.action_type = '2' then 1 else 0 end as pdp_visitor,
count(case when hits.ecommerceaction.action_type = '2' then fullvisitorid else null end) AS views_pdp,
count(case when hits.ecommerceaction.action_type = '3' then fullvisitorid else null end) AS add_cart,
count(case when hits.ecommerceaction.action_type = '6' then hits.transaction.transactionid else null end) AS conversions,
count(distinct(hits.transaction.transactionId)) as transaction_id_cnt,
FROM `table` AS nr,
UNNEST(hits) hits,
UNNEST(product) product
GROUP BY 1,2,3,4,5
)
select
month,
sum(views_pdp) as pdp
,sum(add_cart) as add_cart
,sum(conversions) as conversions
,sum(transaction_id_cnt)
from t1
group by 1
order by 1 desc;
Which returns
month pdp add_cart conversions f0_
2021-02-01 500 100 20 10
2021-01-01 600 200 30 20
I know that f0_ ( count(distinct(hits.transaction.transactionId)) ) is bad here because of product.productSKU and product.v2ProductName grouping.
In general, when user makes an order with 3 items in his basket, I want to count this as one order, whereas now it is counted as 3.
This count(distinct(hits.transaction.transactionId)) as transaction_id_cnt results in the correct output if I comment out product.productSKU and product.v2ProductName.
Running this query:
with t1 as
(
SELECT
DATE_TRUNC(PARSE_DATE("%Y%m%d", date), MONTH) as month,
fullVisitorId,
-- product.productSKU, # commented out
-- product.v2ProductName, # commented out
case when hits.ecommerceaction.action_type = '2' then 1 else 0 end as pdp_visitor,
count(case when hits.ecommerceaction.action_type = '2' then fullvisitorid else null end) AS views_pdp,
count(case when hits.ecommerceaction.action_type = '3' then fullvisitorid else null end) AS add_cart,
count(case when hits.ecommerceaction.action_type = '6' then hits.transaction.transactionid else null end) AS conversions,
count(distinct(hits.transaction.transactionId)) as transaction_id_cnt,
FROM `table` AS nr,
UNNEST(hits) hits,
UNNEST(product) product
GROUP BY 1,2,3,4,5
)
select
month,
sum(views_pdp) as pdp
,sum(add_cart) as add_cart
,sum(conversions) as conversions
,sum(transaction_id_cnt)
from t1
group by 1
order by 1 desc;
Returns what is expected, but now I don't have productSKU and v2ProductName which I need. I suspect that the problem is that each order is a new line in google big query and when I ask to to select it by product name and SKU, I count the uniques and then sum it.
How can I achieve the correct summation of count(distinct(hits.transaction.transactionId)) without losing the grouping by product.productSKU and product.v2ProductName which explodes this metric?
On the group by Query you could cherry pick them as array(so you don't group by them):
ARRAY_AGG(DISTINCT product.productSKU IGNORE NULLS) AS productSKU_list,
ARRAY_AGG(DISTINCT product.v2ProductName IGNORE NULLS) AS productName_list,
Update per your below comment: If you want to use them in further group by just save them as string instead of array.
STRING_AGG(DISTINCT product.productSKU, ',') AS productSKU_list,
STRING_AGG(DISTINCT product.v2ProductName, ',') AS productName_list,

query tuning for performance

I need to rewrite the below query for performance optimization.It is currently consuming > 50000 CPU seconds. I see the problem with group by cube. Can anyone suggest how to rewrite it
select
"platform", "subscriptions","rptg_dt","store_front_id","engaged_subscriptions",
"state_type","subscribers","qualified_subscribers","hardware_detail",
"engaged_subscribers","adam_id"
from (
select
'2020-03-16' as rptg_dt,
coalesce(abc.adam_id, 'ALL_ITEMS') as adam_id,
trim(coalesce(abc.store_front_id, 'ALL_ITEMS')) as store_front_id,
coalesce(abc.state_type, 'ALL_ITEMS') as state_type,
trim(coalesce(abc.hardware_detail, 'ALL_ITEMS')) as hardware_detail,
coalesce(abc.platform, 'ALL_ITEMS') as platform,
abc.subscriptions as subscriptions,
abc.subscribers as subscribers,
abc.qualified_subscribers as qualified_subscribers,
abc.engaged_subscribers as engaged_subscribers,
abc.engaged_subscriptions as engaged_subscriptions
from (
select store_front_id as store_front_id,
cast(cast(a.adam_id as integer) as varchar(12)) as adam_id,
case when subscn_type = 'Harmony' then 'Harmony Trial' else trim(state_type) end as state_type,
coalesce(hardware_detail, 'Unknown') as hardware_detail, --state_type,hardware_detail,platform
trim(platform_name) as platform,
count(distinct subscn_id) as subscriptions,
count(distinct acct_id) as subscribers,
count(distinct case when qualified_ind = 1 or subscn_owner_ind = 1 then acct_id end) as qualified_subscribers,
count(distinct case when engaged_ind = 1 then acct_id end) as engaged_subscribers,
count(distinct case when engaged_ind = 1 then subscn_id end) as engaged_subscriptions
from itsp_amr.atv_state_daily a
where calendar_type = 'F'
and adam_id in (1472441559,1478184786)
and a.rptg_dt = '2020-03-16'
and state_type <>'CHURNED'
group by cube /*(store_front_id,adam_id,state_type,hardware_detail,platform)*/ (1,2,3,4,5)
) abc
) mandela_temp

SemanticException Failed to breakup Windowing invocations into Groups. At least 1 group must only depend on input columns

The below query is working fine in Oracle but it is not working in hive.
SELECT Q.tm_mo_id,
'1380' AS mrc_cd,
NVL (R.itm_profit_ctr_cd, '99') AS profit_center_cd,
MAX(CASE R.itm_profit_ctr_cd
WHEN NULL THEN 'UNASSIGN PROFIT CNTR'
ELSE R.itm_profit_ctr_ds
END) profit_center_desc,
SUM(Q.bp_grs_quota_am) AS mth_bp_plan_gts_am_usd,
SUM(Q.grs_quota_am) AS mth_ju_plan_gts_am_usd
FROM v_l_0002_gb_gds_us_quota_v_1 Q
LEFT JOIN
(SELECT * FROM
(SELECT ph_dtl_id,
itm_profit_ctr_cd,
MIN (itm_profit_ctr_ds) AS itm_profit_ctr_ds,
ROW_NUMBER () OVER (
PARTITION BY ph_dtl_id
ORDER BY COUNT(CASE profit_ctr_cd
WHEN 'JNJDUMMY' THEN NULL
WHEN '99' THEN NULL
ELSE profit_ctr_cd
END) DESC,
itm_profit_ctr_cd ASC) rn
FROM v_l_0002_gb_gds_us_sku_to_profit_center_lookup_v_1
GROUP BY ph_dtl_id,
itm_profit_ctr_cd) E
WHERE rn = 1 ) R
ON (Q.ph_dtl_id = R.ph_dtl_id)
WHERE SUBSTR (Q.tm_mo_id, 1, 4) = '2016'
GROUP BY Q.tm_mo_id,
NVL(R.itm_profit_ctr_cd, '99')