How to mimic an ON condition while using Union? - sql

I have a query like this:
select
yyyy_mm_dd,
xml_id,
feature,
status
from
schema.t1
where
yyyy_mm_dd >= '2019-02-02'
union all
select
yyyy_mm_dd,
p_id as xml_id,
'payment' as feature,
case
when payment = 1 then 1
else 0
end as status
from
schema.t2
where
yyyy_mm_dd >= '2019-02-02'
Is there a way I can ensure no side of the union has a greater date than the other? With a join I could enforce this with an on condition on yyyy_mm_dd. I want to maintain the union but only until the max date which is available in both tables.
Is there a more efficient way to solve this than the solution I've come up with?
select
c.yyyy_mm_dd,
xml_id,
feature,
status
from
schema.t1 c
left join(
select
max(yyyy_mm_dd) as yyyy_mm_dd
from
schema.t2
where
yyyy_mm_dd >= '2020-10-01'
) m on m.yyyy_mm_dd = c.yyyy_mm_dd
where
c.yyyy_mm_dd >= '2020-10-01'
and m.yyyy_mm_dd is null
union all
select
c.yyyy_mm_dd,
p_id as xml_id,
'payment' as feature,
case
when payment = 1 then 1
else 0
end as status
from
schema.t2 c
left join(
select
max(yyyy_mm_dd) as yyyy_mm_dd
from
schema.t1
where
yyyy_mm_dd >= '2020-10-01'
) m on m.yyyy_mm_dd = c.yyyy_mm_dd
where
c.yyyy_mm_dd >= '2020-10-01'
and m.yyyy_mm_dd is not null

Create 2 CTEs for each of your queries and then select only the rows of each CTE that have matching yyyy_mm_dds in the other CTE:
with
cte1 as (
select yyyy_mm_dd, xml_id, feature, status
from schema.t1
where yyyy_mm_dd >= '2019-02-02'
),
cte2 as (
select yyyy_mm_dd, p_id as xml_id, 'payment' as feature,
case when payment = 1 then 1 else 0 end as status
from schema.t2
where yyyy_mm_dd >= '2019-02-02'
)
select c1.* from cte1 c1
where exists (select 1 from cte2 c2 where c2.yyyy_mm_dd = c1.yyyy_mm_dd)
union all
select c2.* from cte2 c2
where exists (select 1 from cte1 c1 where c1.yyyy_mm_dd = c2.yyyy_mm_dd)

Related

SQL Case When Slowing Down Query

What I'm looking to do is quantify the total value of purchases and the number of months in which a purchase was made within three different timeframes by account. I only want to look at accounts who made a purchase between 1-1-2020 and 4-1-2021.
I'm wondering if there is a more streamlined way to pull in the fields I'm creating using CASE WHEN below (maybe through a series of queries to create the calculations and the left joining?). This query is taking extremely long to pull back, so I'd like to enhance this code where I can. All of my code and desired output is listed below. Thank you!
Creating a temporary table to pull account numbers:
DROP TABLE IF EXISTS #accounts
SELECT DISTINCT s.account_no, c.code, c.code_desc
INTO #accounts
FROM sales AS s
LEFT JOIN customer AS c ON s.account_no = c.account_no
WHERE s.tran_date BETWEEN '2020-01-01' AND '2021-04-01'
GROUP BY s.account_no, c.code, c.code_desc;
Confirming row counts:
SELECT COUNT (*)
FROM #accounts
ORDER BY account_no;
Creating Sales and Sales period count columns for three timeframes:
SELECT
s.account_no, c.code, c.code_desc
SUM(CASE
WHEN s.tran_date BETWEEN '2020-01-01' AND '2021-04-01'
THEN VALUE_USD
END) AS Total_Spend_Pre,
SUM(CASE
WHEN s.tran_date BETWEEN '2021-04-01' AND '2022-03-31'
THEN VALUE_USD
END) Total_Spend_During,
SUM(CASE
WHEN s.tran_date > '2022-04-01'
THEN VALUE_USD
END) Total_Spend_Post,
COUNT(DISTINCT CASE WHEN s.tran_date BETWEEN '2020-01-01' AND '2021-04-01' THEN CONCAT(s.bk_month, s.bk_year) END) Pre_Periods,
COUNT(DISTINCT CASE WHEN s.tran_date BETWEEN '2021-04-01' AND '2022-03-31' THEN CONCAT(s.bk_month, s.bk_year) END) During_Periods,
COUNT(DISTINCT CASE WHEN s.tran_date > '2022-04-01' THEN CONCAT(s.bk_month, s.bk_year) END) Post_Periods
FROM
sales AS s
LEFT JOIN
customer AS c ON s.account_no = c.account_no
WHERE
c.account_no IN (SELECT DISTINCT account_no
FROM #accounts)
GROUP BY
s.account_no, c.code, c.code_desc;
Desired output:
account_no
code
code_desc
Total_Spend_Pre
Total_Spend_During
Total_Spend_Post
Pre_Periods
During_Periods
Post_Periods
25
1234
OTHER
1000
2005
500
2
14
5
11
5678
PC
500
100
2220
5
11
2
You may use your date ranges to join with dataset, and 'Tag' your result like below, this will result in 3 rows, for each group. If you need them in a single row, have PIVOTE over it
;With DateRanges AS (
SELECT CAST('2020-01-01' AS DATE) StartDate, CAST('2021-04-01' AS DATE) EndDate, 'Pre' Tag UNION
SELECT '2021-04-01', '2022-03-31', 'During' UNION
SELECT '2022-04-01', Null, 'Post'
)
SELECT s.account_no, c.code, c.code_desc, d.Tag,
SUM(VALUE_USD) AS Total_Spend,
COUNT(DISTINCT CONCAT(s.bk_month, s.bk_year)) RecordCount
FROM sales as s
LEFT JOIN customer as c
INNER JOIN DateRanges D ON s.tran_date BETWEEN D.StartDate AND ISNULL(D.EndDate,s.tran_date)
ON s.account_no = c.account_no
WHERE c.account_no IN (SELECT DISTINCT account_no FROM #accounts)
GROUP BY s.account_no, c.code, c.code_desc;
with [cte_accountActivityPeriods] as (
select [PeriodOrdinal] = 1, [PeriodName] = 'Total Spend Pre', [PeriodStart] = convert(date,'2020-01-01',23) , [PeriodFinish] = convert(date,'2021-03-31',23) union
select [PeriodOrdinal] = 2, [PeriodName] = 'Total Spend During', [PeriodStart] = convert(date,'2021-04-01',23) , [PeriodFinish] = convert(date,'2022-03-31',23) union
select [PeriodOrdinal] = 3, [PeriodName] = 'Total Spend Post', [PeriodStart] = convert(date,'2022-04-01',23) , [PeriodFinish] = convert(date,'9999-12-31',23)
)
, [cte_allsalesForActivityPeriod]
SELECT s.account_no, bk_month, bk_year, [PeriodOrdinal], s.tran_date, s.value_usd
FROM sales as s
cross join [cte_accountActivityPeriods]
on s.[tran_date] between [cte_ActivityPeriods].[PeriodStart] and [cte_ActivityPeriods].[PeriodFinish]
)
, [cte_uniqueAccounts] as ( /*Unique and qualifying Accounts*/
select distinct account_no from [cte_allsalesForActivityPeriod]
inner join #accounts accs on accs.[account_no] = [cte_allsalesForActivityPeriod].[account_no]
)
, [cte_AllSalesAggregatedByPeriod] as (
select account_no, [PeriodOrdinal], bk_month, bk_year, [PeriodTotalSpend] = sum([value_usd])
from [cte_allsalesForActivityPeriod]
group by s.account_no, [PeriodOrdinal], bk_month, bk_year
)
, [cte_PeriodAnalysis] as (
select account_no, [PeriodOrdinal], [ActivePeriods] = count(distinct concat(bk_month, bk_year))
from [cte_AllSalesAggregatedByPeriod]
group by s.account_no, [PeriodOrdinal]
)
, [cte_pivot_clumsily] as (
/* Aggregations already done - so simple pivot */
select [cte_uniqueAccounts].[account_no]
, [Total_Spend_Pre] = case when [SaleVal].[PeriodOrdinal] in (1) then [SaleVal].[PeriodTotalSpend] else 0 end
, [Total_Spend_During] = case when [SaleVal].[PeriodOrdinal] in (2) then [SaleVal].[PeriodTotalSpend] else 0 end
, [Total_Spend_Post] = case when [SaleVal].[PeriodOrdinal] in (3) then [SaleVal].[PeriodTotalSpend] else 0 end
, [Pre_Periods] = case when [SalePrd].[PeriodOrdinal] in (1) then [SalePrd].[ActivePeriods] else 0 end
, [During_Periods] = case when [SalePrd].[PeriodOrdinal] in (2) then [SalePrd].[ActivePeriods] else 0 end
, [Post_Periods] = case when [SalePrd].[PeriodOrdinal] in (3) then [SalePrd].[ActivePeriods] else 0 end
from [cte_uniqueAccounts]
left join [cte_AllSalesAggregatedByPeriod] [SaleVal] on [SaleVal].[account_no] = [cte_uniqueAccounts].[account_no]
left join [cte_PeriodAnalysis] [SalePrd] on [SalePrd].[account_no] = [cte_uniqueAccounts].[account_no]
)
select c.code, c.code_desc, [cte_pivot_clumsily].*
from [cte_pivot_clumsily]
LEFT JOIN customer as c
ON [cte_pivot_clumsily].account_no = c.account_no

SQL - find row with closest date but different column value

i'm new to SQL and i would need an help.
I have a TAB and I need to find for any item B in the TAB the item A with the closest date. In this case the A with 02.09.2021 04:25:30
Date.
Item
07.09.2021 05:02:05
A
06.09.2021 05:01:02
A
05.09.2021 05:00:02
A
04.09.2021 04:59:01
A
03.09.2021 04:58:03
A
02.09.2021 04:56:55
A
02.09.2021 04:33:56
B
02.09.2021 04:25:30
A
WITH CTE(DATE,ITEM)AS
(
SELECT '20210907 05:02:05' , 'A'UNION ALL
SELECT '20210906 05:01:02' , 'A'UNION ALL
SELECT '20210905 05:00:02' , 'A'UNION ALL
SELECT'20210904 04:59:01' , 'A'UNION ALL
SELECT'20210903 04:58:03' , 'A'UNION ALL
SELECT'20210902 04:56:55' , 'A'UNION ALL
SELECT'20210902 04:33:56' , 'B'UNION ALL
SELECT'20210902 04:25:30' , 'A'
)
SELECT
CAST(C.DATE AS DATETIME)X_DATE,C.ITEM,Q.CLOSEST
FROM CTE AS C
OUTER APPLY
(
SELECT TOP 1 CAST(X.DATE AS DATETIME)CLOSEST
FROM CTE AS X
WHERE X.ITEM='A'AND CAST(X.DATE AS DATETIME)<CAST(C.DATE AS DATETIME)
ORDER BY CAST(X.DATE AS DATETIME) ASC
)Q
WHERE C.ITEM='B'
You can use OUTER APPLY-approach as in the above query.
Please also take a look that datetime-column (DATE)is written in the ISO-compliant form
Your data has only two columns. If you want the only the closest A timestamp, then the fastest way is probably window functions:
select t.*,
(case when prev_a_date is null then next_a_date
when next_a_date is null then prev_a_date
when datediff(second, prev_a_date, date) <= datediff(second, date, next_a_date) then prev_a_date
else next_a_date
end) as a_date
from (select t.*,
max(case when item = 'A' then date end) over (order by date) as prev_a_date,
min(case when item = 'A' then date end) over (order by date desc) as next_a_date
from t
) t
where item = 'B';
This uses seconds to measure the time difference, but you can use a smaller unit if appropriate.
You can also do this using apply if you have more columns from the "A" rows that you want:
select tb.*, ta.*
from t b outer apply
(select top (1) ta.*
from t ta
where item = 'A'
order by abs(datediff(second, a.date, b.date))
) t
where item = 'B';

Oracle SQL Hierarchy Summation

I have a table TRANS that contains the following records:
TRANS_ID TRANS_DT QTY
1 01-Aug-2020 5
1 01-Aug-2020 1
1 03-Aug-2020 2
2 02-Aug-2020 1
The expected output:
TRANS_ID TRANS_DT BEGBAL TOTAL END_BAL
1 01-Aug-2020 0 6 6
1 02-Aug-2020 6 0 6
1 03-Aug-2020 6 2 8
2 01-Aug-2020 0 0 0
2 02-Aug-2020 0 1 1
2 03-Aug-2020 1 0 1
Each trans_id starts with a beginning balance of 0 (01-Aug-2020). For succeeding days, the beginning balance is the ending balance of the previous day and so on.
I can create PL/SQL block to create the output. Is it possible to get the output in 1 SQL statement?
Thanks.
Try this following script using CTE-
Demo Here
WITH CTE
AS
(
SELECT DISTINCT A.TRANS_ID,B.TRANS_DT
FROM your_table A
CROSS JOIN (SELECT DISTINCT TRANS_DT FROM your_table) B
),
CTE2
AS
(
SELECT C.TRANS_ID,C.TRANS_DT,SUM(D.QTY) QTY
FROM CTE C
LEFT JOIN your_table D
ON C.TRANS_ID = D.TRANS_ID
AND C.TRANS_DT = D.TRANS_DT
GROUP BY C.TRANS_ID,C.TRANS_DT
ORDER BY C.TRANS_ID,C.TRANS_DT
)
SELECT F.TRANS_ID,F.TRANS_DT,
(
SELECT COALESCE (SUM(QTY), 0) FROM CTE2 E
WHERE E.TRANS_ID = F.TRANS_ID AND E.TRANS_DT < F.TRANS_DT
) BEGBAL,
(
SELECT COALESCE (SUM(QTY), 0) FROM CTE2 E
WHERE E.TRANS_ID = F.TRANS_ID AND E.TRANS_DT = F.TRANS_DT
) TOTAL ,
(
SELECT COALESCE (SUM(QTY), 0) FROM CTE2 E
WHERE E.TRANS_ID = F.TRANS_ID AND E.TRANS_DT <= F.TRANS_DT
) END_BAL
FROM CTE2 F
You can as well do like this (I would assume it's a bit faster): Demo
with
dt_between as (
select mindt + level - 1 as trans_dt
from (select min(trans_dt) as mindt, max(trans_dt) as maxdt from t)
connect by level <= maxdt - mindt + 1
),
dt_for_trans_id as (
select *
from dt_between, (select distinct trans_id from t)
),
qty_change as (
select distinct trans_id, trans_dt,
sum(qty) over (partition by trans_id, trans_dt) as total,
sum(qty) over (partition by trans_id order by trans_dt) as end_bal
from t
right outer join dt_for_trans_id using (trans_id, trans_dt)
)
select
trans_id,
to_char(trans_dt, 'DD-Mon-YYYY') as trans_dt,
nvl(lag(end_bal) over (partition by trans_id order by trans_dt), 0) as beg_bal,
nvl(total, 0) as total,
nvl(end_bal, 0) as end_bal
from qty_change q
order by trans_id, trans_dt
dt_between returns all the days between min(trans_dt) and max(trans_dt) in your data.
dt_for_trans_id returns all these days for each trans_id in your data.
qty_change finds difference for each day (which is TOTAL in your example) and cumulative sum over all the days (which is END_BAL in your example).
The main select takes END_BAL from previous day and calls it BEG_BAL, it also does some formatting of final output.
First of all, you need to generate dates, then you need to aggregate your values by TRANS_DT, and then left join your aggregated data to dates. The easiest way to get required sums is to use analitic window functions:
with dates(dt) as ( -- generating dates between min(TRANS_DT) and max(TRANS_DT) from TRANS
select min(trans_dt) from trans
union all
select dt+1 from dates
where dt+1<=(select max(trans_dt) from trans)
)
,trans_agg as ( -- aggregating QTY in TRANS
select TRANS_ID,TRANS_DT,sum(QTY) as QTY
from trans
group by TRANS_ID,TRANS_DT
)
select -- using left join partition by to get data on daily basis for each trans_id:
dt,
trans_id,
nvl(sum(qty) over(partition by trans_id order by dates.dt range between unbounded preceding and 1 preceding),0) as BEGBAL,
nvl(qty,0) as TOTAL,
nvl(sum(qty) over(partition by trans_id order by dates.dt),0) as END_BAL
from dates
left join trans_agg tr
partition by (trans_id)
on tr.trans_dt=dates.dt;
Full example with sample data:
alter session set nls_date_format='dd-mon-yyyy';
with trans(TRANS_ID,TRANS_DT,QTY) as (
select 1,to_date('01-Aug-2020'), 5 from dual union all
select 1,to_date('01-Aug-2020'), 1 from dual union all
select 1,to_date('03-Aug-2020'), 2 from dual union all
select 2,to_date('02-Aug-2020'), 1 from dual
)
,dates(dt) as ( -- generating dates between min(TRANS_DT) and max(TRANS_DT) from TRANS
select min(trans_dt) from trans
union all
select dt+1 from dates
where dt+1<=(select max(trans_dt) from trans)
)
,trans_agg as ( -- aggregating QTY in TRANS
select TRANS_ID,TRANS_DT,sum(QTY) as QTY
from trans
group by TRANS_ID,TRANS_DT
)
select
dt,
trans_id,
nvl(sum(qty) over(partition by trans_id order by dates.dt range between unbounded preceding and 1 preceding),0) as BEGBAL,
nvl(qty,0) as TOTAL,
nvl(sum(qty) over(partition by trans_id order by dates.dt),0) as END_BAL
from dates
left join trans_agg tr
partition by (trans_id)
on tr.trans_dt=dates.dt;
You can use a recursive query to generate the overall date range, cross join it with the list of distinct tran_id, then bring the table with a left join. The last step is aggregation and window functions:
with all_dates (trans_dt, max_dt) as (
select min(trans_dt), max(trans_dt) from trans group by trans_id
union all
select trans_dt + interval '1' day, max_dt from all_dates where trans_dt < max_dt
)
select
i.trans_id,
d.trans_dt,
coalesce(sum(sum(t.qty)) over(partition by i.trans_id order by d.trans_dt), 0) - coalesce(sum(t.qty), 0) begbal,
coalesce(sum(t.qty), 0) total,
coalesce(sum(sum(t.qty)) over(partition by i.trans_id order by d.trans_dt), 0) endbal
from all_dates d
cross join (select distinct trans_id from trans) i
left join trans t on t.trans_id = i.trans_id and t.trans_dt = d.trans_dt
group by i.trans_id, d.trans_dt
order by i.trans_id, d.trans_dt

conditional running sum

I'm trying to return the number of unique users that converted over time.
So I have the following query:
WITH CTE
As
(
SELECT '2020-04-01' as date,'userA' as user,1 as goals Union all
SELECT '2020-04-01','userB',0 Union all
SELECT '2020-04-01','userC',0 Union all
SELECT '2020-04-03','userA',1 Union all
SELECT '2020-04-05','userC',1 Union all
SELECT '2020-04-06','userC',0 Union all
SELECT '2020-04-06','userB',0
)
select
date,
COUNT(DISTINCT
IF
(goals >= 1,
user,
NULL)) AS cad_converters
from CTE
group by date
I'm trying to count distinct user but I need to find a way to apply the distinct count to the whole date. I probably need to do something like a cumulative some...
expected result would be something like this
date, goals, total_unique_converted_users
'2020-04-01',1,1
'2020-04-01',0,1
'2020-04-01',0,1
'2020-04-03',1,2
'2020-04-05',1,2
'2020-04-06',0,2
'2020-04-06',0,2
Below is for BigQuery Standard SQL
#standardSQL
SELECT t.date, t.goals, total_unique_converted_users
FROM `project.dataset.table` t
LEFT JOIN (
SELECT a.date,
COUNT(DISTINCT IF(b.goals >= 1, b.user, NULL)) AS total_unique_converted_users
FROM `project.dataset.table` a
CROSS JOIN `project.dataset.table` b
WHERE a.date >= b.date
GROUP BY a.date
)
USING(date)
I would approach this by tagging when the first goal is scored for each name. Then simply do a cumulative sum:
select cte.* except (seqnum), countif(seqnum = 1) over (order by date)
from (select cte.*,
(case when goals = 1 then row_number() over (partition by user, goals order by date) end) as seqnum
from cte
) cte;
I realize this can be expressed without the case in the subquery:
select cte.* except (seqnum), countif(seqnum = 1 and goals = 1) over (order by date)
from (select cte.*,
row_number() over (partition by user, goals order by date) as seqnum
from cte
) cte;

How to find latest status of the day in SQL Server

I have a SQL Server question that I'm trying to figure out at work:
There is a table with a status field which can contain a status called "Participate." I am only trying to find records if the latest status of the day is "Participate" and only if the status changed on the same day from another status to "Participate."
I don't want any records where the status was already "Participate." It must have changed to that status on the same day. You can tell when the status was changed by the datetime field ChangedOn.
In the sample below I would only want to bring back ID 1880 since the status of "Participated" has the latest timestamp. I would not bring back ID 1700 since the last record is "Other," and I would not bring back ID 1600 since "Participated" is the only status of that day.
ChangedOn Status ID
02/01/17 15:23 Terminated 1880
02/01/17 17:24 Participated 1880
02/01/17 09:00 Other 1880
01/31/17 01:00 Terminated 1700
01/31/17 02:00 Participated 1700
01/31/17 03:00 Other 1700
01/31/17 02:00 Participated 1600
I was thinking of using a Window function, but I'm not sure how to get started on this. It's been a few months since I've written a query like this so I'm a bit out of practice.
Thanks!
You can use window functions for this:
select t.*
from (select t.*,
row_number() over (partition by cast(ChangedOn as date)
order by ChangedOn desc
) as seqnum,
sum(case when status <> 'Participate' then 1 else 0 end) over (partition by cast(ChangedOn as date)) as num_nonparticipate
from t
) t
where (seqnum = 1 and ChangedOn = 'Participate') and
num_nonparticipate > 0;
Can you check this?
WITH sample_table(ChangedOn,Status,ID)AS(
SELECT CONVERT(DATETIME,'02/01/2017 15:23'),'Terminated',1880 UNION ALL
SELECT '02/01/2017 17:24','Participated',1880 UNION ALL
SELECT '02/01/2017 09:00','Other',1880 UNION ALL
SELECT '01/31/2017 01:00','Terminated',1700 UNION ALL
SELECT '01/31/2017 02:00','Participated',1700 UNION ALL
SELECT '01/31/2017 03:00','Other',1700 UNION ALL
SELECT '01/31/2017 02:00','Participated',1600
)
SELECT ID FROM (
SELECT *
,ROW_NUMBER()OVER(PARTITION BY ID,CONVERT(VARCHAR,ChangedOn,112) ORDER BY ChangedOn) AS rn
,COUNT(0)OVER(PARTITION BY ID,CONVERT(VARCHAR,ChangedOn,112)) AS cnt
,CASE WHEN Status<>'Participated' THEN 1 ELSE 0 END AS ss
,SUM(CASE WHEN Status!='Participated' THEN 1 ELSE 0 END)OVER(PARTITION BY ID,CONVERT(VARCHAR,ChangedOn,112)) AS OtherStatusCnt
FROM sample_table
) AS t WHERE t.rn=t.cnt AND t.Status='Participated' AND t.OtherStatusCnt>0
--Return:
1880
try this with other sample data,
declare #t table(ChangedOn datetime,Status varchar(50),ID int)
insert into #t VALUES
('02/01/17 15:23', 'Terminated' ,1880)
,('02/01/17 17:24', 'Participated' ,1880)
,('02/01/17 09:00', 'Other' ,1880)
,('01/31/17 01:00', 'Terminated' ,1700)
,('01/31/17 02:00', 'Participated' ,1700)
,('01/31/17 03:00', 'Other' ,1700)
,('01/31/17 02:00', 'Participated' ,1600)
;
WITH CTE
AS (
SELECT *
,row_number() OVER (
PARTITION BY id
,cast(ChangedOn AS DATE) ORDER BY ChangedOn DESC
) AS seqnum
FROM #t
)
SELECT *
FROM cte c
WHERE seqnum = 1
AND STATUS = 'Participated'
AND EXISTS (
SELECT id
FROM cte c1
WHERE seqnum > 1
AND c.id = c1.id
)
2nd query,this is better
here CTE is same
SELECT *
FROM cte c
WHERE seqnum = 1
AND STATUS = 'Participated'
AND EXISTS (
SELECT id
FROM cte c1
WHERE STATUS != 'Participated'
AND c.id = c1.id
)