How to do a union with where clause - sql

I am still learning to write it and below query is not giving me accurate results plus it is also not optimized.
So one of the main things that I am trying to do is create 3 date types:
sales_shift,
refund,
sales_reg
in my data sets.
To achieve this I am doing a union.
All 3 data sets are querying the same source sales_main. The problem is in my 2nd data set with ‘Refund’ as date_type, it is not pulling the actual rows because of ColumnA = ‘B’ condition.
I would like this set to look at only those records that were pulled in 1st data set (i.e ‘Sales_shift’ as date_type) and then apply the condition ColumnA = ‘B’. How do I do that?
My 3rd data set with ‘sales_reg’ as date_type should be same as 1st set except with transaction_date not shifted. How do I do that. I was thinking of where exists but do not know how to apply it.
Any help would be awesome. Thanks much
create table sales_refund as select * from (
with table1 as (select 'Sales_shift' as date_type,
add_months(date_trunc('month',transaction_date),1) as event_date,
sku,
sum(sales) as Sales_total,
sum(refund) as Refund_total,
case when region in ('US','EU') then 'type A' else 'type B' end as Flag_field1
from sales_main where transaction_date >= add_months(current_date(),-6)
group by sku,
'Sales_shift'
add_months(date_trunc('month',transaction_date),1),
Flag_field1),
table2 as (select sku,
date_type,
event_date,
Sales_total,
Refund_total,
Flag_field1,
case when Flag_field1 = 'type B' or (Flag_field1 = 'type A' and Sales_total > 20000) then 'Yes' else 'No' end as Flag_field2
from table1 )
Select sku,
date_type,
event_date,
Sales_total,
Refund_total,
Flag_field1
fromm table2 where Flag_field2 = 'Yes' )
union all
select sku,
'Refund' as date_type,
date_trunc('month',refund_date) as event_date,
sum(sales) as Sales_total,
sum(refund) as Refund_total,
'X' as Flag_field1
from sales_main where transaction_date >= add_months(current_date(),-6) and ColumnA = 'B'
group by sku,
'Refund'
date_trunc('month',refund_date),
'X'
union all
select sku,
'Sales_Reg' as date_type,
date_trunc('month',transaction_date) as event_date,
sum(sales) as Sales_total,
sum(refund) as Refund_total,
'Y' as Flag_field1
from sales_main where transaction_date >= add_months(current_date(),-6)
group by sku,
'Refund'
date_trunc('month',transaction_date),
'Y'

Related

Getting sum of distinct values when grouping by an unnested value in google big query

I am querying google big query table which has many rows but the ones I am interested in looks like this:
date fullVisitorId hits.product.productSKU hits.product.v2ProductName hits.transaction.transactionId
20210427 63546815 MM52AF panda 149816182
20210427 65198162 KGSA5A giraffe 321498182
I am trying to count the total transactions by counting distinct hits.transaction.transactionId.
with t1 as
(
SELECT
DATE_TRUNC(PARSE_DATE("%Y%m%d", date), MONTH) as month,
fullVisitorId,
product.productSKU as sku,
product.v2ProductName as v2,
case when hits.ecommerceaction.action_type = '2' then 1 else 0 end as pdp_visitor,
count(case when hits.ecommerceaction.action_type = '2' then fullvisitorid else null end) AS views_pdp,
count(case when hits.ecommerceaction.action_type = '3' then fullvisitorid else null end) AS add_cart,
count(case when hits.ecommerceaction.action_type = '6' then hits.transaction.transactionid else null end) AS conversions,
count(distinct(hits.transaction.transactionId)) as transaction_id_cnt,
FROM `table` AS nr,
UNNEST(hits) hits,
UNNEST(product) product
GROUP BY 1,2,3,4,5
)
select
month,
product.productSKU as sku,
product.v2ProductName as v2,
sum(views_pdp) as pdp
,sum(add_cart) as add_cart
,sum(conversions) as conversions
,sum(transaction_id_cnt) as transactions
from t1
group by 1
order by 1 desc;
Which returns:
month sku v2 pdp add_cart conversions transactions
2021-04-01 AHBS 615 10146410 365569 46885 46640
2021-03-01 HERD 154 10074095 399483 58162 57811
But transactions is not correct, I get the correct output using this:
with t1 as
(
SELECT
DATE_TRUNC(PARSE_DATE("%Y%m%d", date), MONTH) as month,
fullVisitorId,
ARRAY_AGG(DISTINCT product.productSKU IGNORE NULLS) AS productSKU_list, -- changed this
ARRAY_AGG(DISTINCT product.v2ProductName IGNORE NULLS) AS productName_list, -- changed this
case when hits.ecommerceaction.action_type = '2' then 1 else 0 end as pdp_visitor,
0 AS views_impressions,
count(case when hits.ecommerceaction.action_type = '2' then fullvisitorid else null end) AS views_pdp,
count(case when hits.ecommerceaction.action_type = '3' then fullvisitorid else null end) AS add_cart,
0 AS add_shortlist,
count(case when hits.ecommerceaction.action_type = '5' then fullvisitorid else null end) AS checkouts,
count(case when hits.ecommerceaction.action_type = '6' then hits.transaction.transactionid else null end) AS conversions,
count(distinct(hits.transaction.transactionId)) as transaction_id_cnt,
FROM `table` AS nr,
UNNEST(hits) hits,
UNNEST(product) product
GROUP BY 1,2,5
)
select
month,
sum(views_pdp) as pdp
,sum(add_cart) as add_cart
,sum(conversions) as conversions
,sum(transaction_id_cnt) as transactions
from t1
group by 1
order by 1 desc;
Which returns the correct transactions
month pdp add_cart conversions transactions
2021-04-01 9978511 396333 46885 30917
2021-03-01 15101718 568904 58162 23017
But using this :
...
ARRAY_AGG(DISTINCT product.productSKU IGNORE NULLS) AS productSKU_list,
ARRAY_AGG(DISTINCT product.v2ProductName IGNORE NULLS) AS productName_list,
...
Does not allow me to group or select productSKU_list and productName_list in my second select statement.
I believe this is because if one order is made with multiple items in the basket there are multiple lines in google big query with the same hits.transaction.transactionId
I tried confirming this with:
select distinct(hits.transaction.transactionId), count(distinct hits.transaction.transactionId) as total
FROM `table` AS nr,
UNNEST(hits) hits,
UNNEST(product) product
WHERE _TABLE_SUFFIX between '200101' AND '210428'
GROUP BY 1
order by 2 desc
But I get:
transactionId total
ABSAD54 1
515ABDG 1
So at this point, I am lost, as I am unsure why I get the correct answer if I use the second script or when I comment out this part from the first query.
--product.productSKU,
--product.v2ProductName,
Any tips on how google big query works is accepted.
My goal is to have the correct output of transactions which is achieved in the second script but still be able to group and have values of product.productSKU and product.v2ProductName.
in your second query , you need to aggregate them again :
select
month,
sum(views_pdp) as pdp
,sum(add_cart) as add_cart
,sum(conversions) as conversions
,sum(transaction_id_cnt) as transactions
,ARRAY_AGG(productSKU_list)
,ARRAY_AGG(productName_list)
from t1
group by month
order by month desc;

Sum of distinct values after grouping explodes a metric

I am using
with t1 as
(
SELECT
DATE_TRUNC(PARSE_DATE("%Y%m%d", date), MONTH) as month,
fullVisitorId,
product.productSKU,
product.v2ProductName,
case when hits.ecommerceaction.action_type = '2' then 1 else 0 end as pdp_visitor,
count(case when hits.ecommerceaction.action_type = '2' then fullvisitorid else null end) AS views_pdp,
count(case when hits.ecommerceaction.action_type = '3' then fullvisitorid else null end) AS add_cart,
count(case when hits.ecommerceaction.action_type = '6' then hits.transaction.transactionid else null end) AS conversions,
count(distinct(hits.transaction.transactionId)) as transaction_id_cnt,
FROM `table` AS nr,
UNNEST(hits) hits,
UNNEST(product) product
GROUP BY 1,2,3,4,5
)
select
month,
sum(views_pdp) as pdp
,sum(add_cart) as add_cart
,sum(conversions) as conversions
,sum(transaction_id_cnt)
from t1
group by 1
order by 1 desc;
Which returns
month pdp add_cart conversions f0_
2021-02-01 500 100 20 10
2021-01-01 600 200 30 20
I know that f0_ ( count(distinct(hits.transaction.transactionId)) ) is bad here because of product.productSKU and product.v2ProductName grouping.
In general, when user makes an order with 3 items in his basket, I want to count this as one order, whereas now it is counted as 3.
This count(distinct(hits.transaction.transactionId)) as transaction_id_cnt results in the correct output if I comment out product.productSKU and product.v2ProductName.
Running this query:
with t1 as
(
SELECT
DATE_TRUNC(PARSE_DATE("%Y%m%d", date), MONTH) as month,
fullVisitorId,
-- product.productSKU, # commented out
-- product.v2ProductName, # commented out
case when hits.ecommerceaction.action_type = '2' then 1 else 0 end as pdp_visitor,
count(case when hits.ecommerceaction.action_type = '2' then fullvisitorid else null end) AS views_pdp,
count(case when hits.ecommerceaction.action_type = '3' then fullvisitorid else null end) AS add_cart,
count(case when hits.ecommerceaction.action_type = '6' then hits.transaction.transactionid else null end) AS conversions,
count(distinct(hits.transaction.transactionId)) as transaction_id_cnt,
FROM `table` AS nr,
UNNEST(hits) hits,
UNNEST(product) product
GROUP BY 1,2,3,4,5
)
select
month,
sum(views_pdp) as pdp
,sum(add_cart) as add_cart
,sum(conversions) as conversions
,sum(transaction_id_cnt)
from t1
group by 1
order by 1 desc;
Returns what is expected, but now I don't have productSKU and v2ProductName which I need. I suspect that the problem is that each order is a new line in google big query and when I ask to to select it by product name and SKU, I count the uniques and then sum it.
How can I achieve the correct summation of count(distinct(hits.transaction.transactionId)) without losing the grouping by product.productSKU and product.v2ProductName which explodes this metric?
On the group by Query you could cherry pick them as array(so you don't group by them):
ARRAY_AGG(DISTINCT product.productSKU IGNORE NULLS) AS productSKU_list,
ARRAY_AGG(DISTINCT product.v2ProductName IGNORE NULLS) AS productName_list,
Update per your below comment: If you want to use them in further group by just save them as string instead of array.
STRING_AGG(DISTINCT product.productSKU, ',') AS productSKU_list,
STRING_AGG(DISTINCT product.v2ProductName, ',') AS productName_list,

MSSQL Group by and Select rows from grouping

I'm trying to figure out if what I'm trying to do is possible. Instead of resorting to multiple queries on a table, I wanted to group the records by business date and id then group by the id and select one date for a field and another date for the other field.
SELECT
*
{AMOUNT FROM DATE}
{AMOUNT FROM OTHER DATE}
FROM (
SELECT
date,
id,
SUM(amount) AS amount
FROM
table
GROUP BY id, date
AS subquery
GROUP BY id
It seems that you're looking to do a pivot query. I usually use cross tabs for this. Based on the query you posted, it could look like:
SELECT
id,
SUM(CASE WHEN date = '20190901' THEN amount ELSE 0 END) AmountFromSept01,
SUM(CASE WHEN date = '20191001' THEN amount ELSE 0 END) AmountFromOct01
FROM (
SELECT
date,
id,
SUM(amount) AS amount
FROM
table
GROUP BY id, date
)AS subquery
GROUP BY id;
You could also use a CTE.
WITH CTE AS(
SELECT
date,
id,
SUM(amount) AS amount
FROM
table
GROUP BY id, date
)
SELECT
id,
SUM(CASE WHEN date = '20190901' THEN amount ELSE 0 END) AmountFromSept01,
SUM(CASE WHEN date = '20191001' THEN amount ELSE 0 END) AmountFromOct01
FROM CTE
GROUP BY id;
Or even be a rebel and do the operation directly.
SELECT
id,
SUM(CASE WHEN date = '20190901' THEN amount ELSE 0 END) AmountFromSept01,
SUM(CASE WHEN date = '20191001' THEN amount ELSE 0 END) AmountFromOct01
FROM CTE
GROUP BY id;
However, some people have tested for performance and found that pre-aggregating can improve performance.
If I understand you correctly, then you're just trying to pivot, but only with two particular dates:
select id,
date1 = sum(iif(date = '2000-01-01', amount, null)),
date2 = sum(iif(date = '2000-01-02', amount, null))
from [table]
group by id

How to split columns based on field title

I am working with a single warehouse table with columns like "2015 Fee", "2015 Revenue", "2016 Fee", "2016 Revenue", etc. I need to split these out into "Revenue", "Fee", and "Billed Year" in order to do some analysis. Many of the records have fee and revenue in multiple years.
The CASE statement I tried only pulls in the first year, but I need it to pull in all years.
Here's my case statements:
(CASE when 2015_revenue IS NOT NULL then 2015_revenue
when 2016_revenue_$ IS NOT NULL then 2016_revenue
END) as revenue,
(CASE when 2015_fee IS NOT NULL then 2015_fee
when 2016_fee IS NOT NULL then 2016_fee
END) as fee,
(CASE when 2015_revenue IS NOT NULL then '2015'
when 2015_fee IS NOT NULL then '2015'
when 2016_revenue IS NOT NULL then '2016'
when 2016_fee IS NOT NULL then '2016'
end) as bill_year
Any ideas?
It's not a CASE statement, it's creating a temp table with UNIONs and joining to that:
SELECT p1.[bunch of fields from table]
amts.fee
amts.revenue
amts.bill_year
FROM table p1
JOIN (SELECT id, 2015_revenue as revenue, 2015_fee as fee, '2015' as bill_year FROM table
UNION
SELECT id, 2016_revenue as revenue, 2016_fee as fee, '2016' as bill_year FROM table
U) amts on amts.id = p1.id;

Invalid number in Oracle SQL Case

Hi im habing trouble with a SQL case, the problem is im trying to run a case with 7 different columns, the columns can have different kinds of data (string,date,number) depending on an id.
This means that under some id's the rows in a column will be string while under other ids the rows in a column will be number.
I realise this isn't a conventional use of astructured database, but this specific table serve a specific purpose where this approach was deemed usefull in the past.
The case is supposed to only select a "then" when the column does have a number. However when i run it i get a invalid number ORA-01722. because one of the, rows will hold a string og date.
I realise its properly because oracle asses the sql before executing, and doesnt execute sequential, therefore giving errors on these column even though it wouldn actually have to calculate on the column under a given ID.
The code im trying to execute is the following, The hardcoded 1 and 2 before 'then' will change depending on ctrl_id (the unique id) and it will be the one securing that we only look and a list_val column/row with a number
WITH sampledata1 AS
(SELECT '1' ctrl_id, '23' list_val1, 'Textfield' list_val2
FROM dual),
sampledata2 AS
(SELECT '2' ctrl_id, 'Textfield' list_val1, '45' list_val2
FROM dual),
sampledata3 AS
(SELECT *
FROM sampledata1
UNION
SELECT *
FROM sampledata2)
SELECT CASE
WHEN ctrl_id = 1 THEN
AVG(list_val1)
over(PARTITION BY qd.ctrl_id ORDER BY qd.ctrl_id ASC)
WHEN ctrl_id = 2 THEN
AVG(list_val2)
over(PARTITION BY qd.ctrl_id ORDER BY qd.ctrl_id ASC)
END AS avg_val
FROM sampledata3 qd
Any suggestions to how i can make this work. either a workaround or a different approach ?
Thx in advance.
--------- Solution below
I used some of the suggestions and solutions posted below and got this code samble working. I will try and implement it with the system. Thx for the help everyone you saved me alot of headache.
WITH sampledata1
AS (SELECT '1' ctrl_id, '23' list_val1, 'Textfield' list_val2 FROM DUAL),
sampledata2
AS (SELECT '2' ctrl_id, 'Textfield' list_val1, '45' list_val2 FROM DUAL),
sampledata3
AS (SELECT * FROM sampledata1
UNION
SELECT * FROM sampledata2)
select ctrl_id,
avg(CASE WHEN TRIM(TRANSLATE(list_val1, ' +-.0123456789', ' ')) is null
then list_val1 else null end) over(PARTITION BY qd.ctrl_id ORDER BY qd.ctrl_id ASC) list_val1,
avg(CASE WHEN TRIM(TRANSLATE(list_val2, ' +-.0123456789', ' ')) is null
then list_val2 else null end) over(PARTITION BY qd.ctrl_id ORDER BY qd.ctrl_id ASC) list_val2
from sampledata3 qd
You can try to filtered out values with non-numeric symbols with something like
AVG(CASE WHEN TRIM(TRANSLATE(list_val1, ' +-.0123456789', ' ')) is null then list_val1 else null end) OVER (...)
NB! Unfortunately strings like '+12-.3' also will be recognized as a numeric and in this case you will get same ora-01722
The aggregate function like AVG does not work with VARCHAR data type, NUMBER or INTEGER is a must when such functions are being used.
I have modified the query to have number instead of a string,
WITH sampledata1
AS (SELECT '1' ctrl_id, '23' list_val1, '43' list_val2 FROM DUAL),
sampledata2
AS (SELECT '2' ctrl_id, '34' list_val1, '45' list_val2 FROM DUAL),
sampledata3
AS (SELECT * FROM sampledata1
UNION
SELECT * FROM sampledata2)
SELECT CASE
WHEN ctrl_id = 1
THEN
AVG (list_val1)
OVER (PARTITION BY qd.ctrl_id ORDER BY qd.ctrl_id ASC)
WHEN ctrl_id = 2
THEN
AVG (list_val2)
OVER (PARTITION BY qd.ctrl_id ORDER BY qd.ctrl_id ASC)
END
AS avg_val
FROM sampledata3 qd
The output is
AVG_VAL
----------
23
45
Edit 1
Perhaps you can do something like the below to first ascertain row return NUMERIC or NON-NUMERIC.
Change as per your requirements.
WITH sampledata1
AS (SELECT '1' ctrl_id, '23' list_val1, 'Textfield' list_val2 FROM DUAL),
sampledata2
AS (SELECT '2' ctrl_id, 'Textfield' list_val1, '45' list_val2 FROM DUAL),
sampledata3
AS (SELECT * FROM sampledata1
UNION
SELECT * FROM sampledata2),
sampledata4
AS (SELECT LENGTH (TRIM (TRANSLATE (ctrl_id, ' +-.0123456789', ' ')))
ctrl_id,
LENGTH (TRIM (TRANSLATE (list_val1, ' +-.0123456789', ' ')))
list_val1,
LENGTH (TRIM (TRANSLATE (list_val2, ' +-.0123456789', ' ')))
list_val2
FROM sampledata3 qd -- group by ctrl_id
)
( SELECT CASE WHEN ctrl_id IS NULL THEN AVG (ctrl_id) ELSE 0 END ctrl_id,
CASE WHEN list_val1 IS NULL THEN AVG (list_val1) ELSE 0 END list_val1,
CASE WHEN list_val2 IS NULL THEN AVG (list_val2) ELSE 0 END list_val2
FROM sampledata4
GROUP BY ctrl_id, list_val1, list_val2)
Not sure why you are using that Analytic function. For your basic problem this would work:
SELECT AVG(CASE
WHEN ctrl_id = 1 THEN list_val1
WHEN ctrl_id = 2 THEN list_val2
END) AS avg_val
FROM sampledata3 qd
You're trying to get Average sum of list_val1, However list_val1 from sampledata2 query returns VARCHAR value.
You shouldn't use AVG on non-numeric values.
WITH sampledata1 AS
(SELECT '1' ctrl_id, '23' list_val1, '10' list_val2
FROM dual),
sampledata2 AS
(SELECT '2' ctrl_id, '45' list_vall, '90' list_val2
FROM dual),
sampledata3 AS
(SELECT *
FROM sampledata1
UNION
SELECT *
FROM sampledata2)
SELECT CASE
WHEN ctrl_id = 1 THEN
AVG(list_val1)
over(PARTITION BY qd.ctrl_id ORDER BY qd.ctrl_id ASC)
WHEN ctrl_id = 2 THEN
AVG(list_val2)
over(PARTITION BY qd.ctrl_id ORDER BY qd.ctrl_id ASC)
END AS avg_val
FROM sampledata3 qd