Overlapping effective dates aggregation - sql

I am trying to aggregate overlapping effective dates. Any gaps between dates should be considered as separate rows. I am using min and max and I am getting below output but would like to see expected output.
My query
WITH test_data AS (
SELECT '2020-01-01' AS date_from,
'2020-01-03' AS date_to,
'1' AS product
UNION ALL
SELECT '2020-01-05' AS date_from,
'2020-01-07' AS date_to,
'1' AS product
UNION ALL
SELECT '2020-01-06' AS date_from,
'2020-01-10' AS date_to,
'1' AS product
)
SELECT product,
MIN(date_from) AS date_from,
MAX(date_to) AS date_to
FROM test_data
GROUP BY 1;
Source data
date_from
date_to
product
2020-01-01
2020-01-03
1
2020-01-05
2020-01-07
1
2020-01-06
2020-01-10
1
Output table
date_from
date_to
product
2020-01-01
2020-01-10
1
Expected output
date_from
date_to
product
2020-01-01
2020-01-03
1
2020-01-05
2020-01-10
1
Thanks in advance !

This is a type of gaps-and-islands problem. I recommend an approach like this:
SELECT product,
MIN(date_from) AS date_from,
MAX(date_to) AS date_to
FROM (SELECT td.*,
SUM(CASE WHEN prev_date_to >= date_from THEN 0 ELSE 1 END) OVER (PARTITION BY product ORDER BY date_to) as grp
FROM (SELECT td.*,
MAX(date_to) OVER (PARTITION BY product ORDER BY date_from ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) as prev_date_to
FROM test_data td
) td
) td
GROUP BY grp, product
ORDER BY product, MIN(date_from);
Here is a db<>fiddle.
What is this doing? The innermost subquery is getting the latest date_to on previous rows. This is used to determine if each row is "connected" to the previous row or if it starts a new grouping.
The middle subquery has logic which is a cumulative sum of when the rows start a new group. The outer query then aggregates by this grouping.

Merging of date ranges could be achieved with MATCH_RECOGNIZE.
Data preparation:
CREATE OR REPLACE TABLE test_data AS
SELECT '2020-01-01'::DATE AS date_from, '2020-01-03'::DATE AS date_to, '1' AS product
UNION ALL
SELECT '2020-01-05'::DATE AS date_from, '2020-01-07'::DATE AS date_to, '1' AS product
UNION ALL
SELECT '2020-01-06'::DATE AS date_from, '2020-01-10'::DATE AS date_to, '1' AS product;
Query:
SELECT *
FROM test_data t
MATCH_RECOGNIZE(
PARTITION BY product
ORDER BY date_from, date_to
MEASURES FIRST(date_from) date_from, MAX(date_to) date_to
PATTERN(a* b)
DEFINE a AS MAX(date_to) OVER() >= NEXT(date_from)
) mr;
db<>fiddle demo - Oracle
Related reading: Merging Overlapping Date Ranges with MATCH_RECOGNIZE by stewashton

Related

create time range with 2 columns date_time

The problem I am facing is how to find distinct time periods from multiple time periods with overlap in Teradata ANSI SQL.
For example, the attached tables contain multiple overlapping time periods, how can I combine those time periods into 3 unique time periods in Teradata SQL???
I think I can do it in python with the loop function, but not sure how to do it in SQL
ID
Start Date
End Date
001
2005-01-01
2006-01-01
001
2005-01-01
2007-01-01
001
2008-01-01
2008-06-01
001
2008-04-01
2008-12-01
001
2010-01-01
2010-05-01
001
2010-04-01
2010-12-01
001
2010-11-01
2012-01-01
My expected result is:
ID
start_Date
end_date
001
2005-01-01
2007-01-01
001
2008-01-01
2008-12-01
001
2010-01-01
2012-01-01
From Oracle 12, you can use MATCH_RECOGNIZE to perform a row-by-row comparison:
SELECT *
FROM table_name
MATCH_RECOGNIZE(
PARTITION BY id
ORDER BY start_date
MEASURES
FIRST(start_date) AS start_date,
MAX(end_date) AS end_date
ONE ROW PER MATCH
PATTERN (overlapping_ranges* last_range)
DEFINE overlapping_ranges AS NEXT(start_date) <= MAX(end_date)
)
Which, for the sample data:
CREATE TABLE table_name (ID, Start_Date, End_Date) AS
SELECT '001', DATE '2005-01-01', DATE '2006-01-01' FROM DUAL UNION ALL
SELECT '001', DATE '2005-01-01', DATE '2007-01-01' FROM DUAL UNION ALL
SELECT '001', DATE '2008-01-01', DATE '2008-06-01' FROM DUAL UNION ALL
SELECT '001', DATE '2008-04-01', DATE '2008-12-01' FROM DUAL UNION ALL
SELECT '001', DATE '2010-01-01', DATE '2010-05-01' FROM DUAL UNION ALL
SELECT '001', DATE '2010-04-01', DATE '2010-12-01' FROM DUAL UNION ALL
SELECT '001', DATE '2010-11-01', DATE '2012-01-01' FROM DUAL;
Outputs:
ID
START_DATE
END_DATE
001
2005-01-01 00:00:00
2007-01-01 00:00:00
001
2008-01-01 00:00:00
2008-12-01 00:00:00
001
2010-01-01 00:00:00
2012-01-01 00:00:00
db<>fiddle here
Update: Alternative query
SELECT id,
start_date,
end_date
FROM (
SELECT id,
dt,
SUM(cnt) OVER (PARTITION BY id ORDER BY dt) AS grp,
cnt
FROM (
SELECT ID,
dt,
SUM(type) OVER (PARTITION BY id ORDER BY dt, ROWNUM) * type AS cnt
FROM table_name
UNPIVOT (dt FOR type IN (start_date AS 1, end_date AS -1))
)
WHERE cnt IN (1,0)
)
PIVOT (MAX(dt) FOR cnt IN (1 AS start_date, 0 AS end_date))
Or, an equivalent that does not use UNPIVOT, PIVOT or ROWNUM and works in both Oracle and PostgreSQL:
SELECT id,
MAX(CASE cnt WHEN 1 THEN dt END) AS start_date,
MAX(CASE cnt WHEN 0 THEN dt END) AS end_date
FROM (
SELECT id,
dt,
SUM(cnt) OVER (PARTITION BY id ORDER BY dt) AS grp,
cnt
FROM (
SELECT ID,
dt,
SUM(type) OVER (PARTITION BY id ORDER BY dt, rn) * type AS cnt
FROM (
SELECT r.*,
ROW_NUMBER() OVER (PARTITION BY id ORDER BY dt ASC, type DESC) AS rn
FROM (
SELECT id, 1 AS type, start_date AS dt FROM table_name
UNION ALL
SELECT id, -1 AS type, end_date AS dt FROM table_name
) r
) p
) s
WHERE cnt IN (1,0)
) t
GROUP BY id, grp
Update 2: Another Alternative
SELECT id,
MIN(start_date) AS start_date,
MAX(end_Date) AS end_date
FROM (
SELECT t.*,
SUM(CASE WHEN start_date <= prev_max THEN 0 ELSE 1 END)
OVER (PARTITION BY id ORDER BY start_date) AS grp
FROM (
SELECT t.*,
MAX(end_date) OVER (
PARTITION BY id ORDER BY start_date
ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
) AS prev_max
FROM table_name t
) t
) t
GROUP BY id, grp
db<>fiddle Oracle PostgreSQL
This is a gaps and islands problem. Try this:
with u as
(select ID, start_date, end_date,
case
when start_date <= lag(end_date) over(partition by ID order by start_date, end_date) then 0
else 1 end as grp
from table_name),
v as
(select ID, start_date, end_date,
sum(grp) over(partition by ID order by start_date, end_date) as island
from u)
select ID, min(start_date) as start_Date, max(end_date) as end_date
from v
group by ID, island;
Fiddle
Basically you can identify "islands" by comparing start_date of current row to end_date of previous row (ordered by start_date, end_date), if it precedes it then it's the same island. Then you can do a rolling sum() to get the island numbers. Finally select min(start_date) and max(end_date) from each island to get the desired output.
This may work ,with little bit of change in function , I tried it in Dbeaver :
select ID,Start_Date,End_Date
from
(
select t.*,
dense_rank () over(partition by extract (year from Start_Date) order BY End_Date desc) drnk
from testing_123 t
) temp
where temp.drnk = 1
ORDER BY Start_Date;
Try this
WITH a as (
SELECT
ID,
LEFT(Start_Date, 4) as Year,
MIN(Start_Date) as New_Start_Date
FROM
TAB1
GROUP BY
ID,
LEFT(Start_Date, 4)
), b as (
SELECT
a.ID,
Year,
New_Start_Date,
End_Date
FROM
a
LEFT JOIN
TAB1
ON LEFT(a.New_Start_Date, 4) = LEFT(TAB1.Start_Date, 4)
)
select
ID,
New_Start_Date as Start_Date,
MAX(End_Date)
from
b
GROUP BY
ID,
New_Start_Date;
Example: https://dbfiddle.uk/?rdbms=mysql_8.0&fiddle=97f91b68c635aebfb752538cdd752ace

LAG with condition

I want to get a value from the previous row that matches a certain condition.
For example: here I want for each row to get the timestamp from the last event = 1.
I feel I can do it without joins with LAG and PARTITION BY with CASE but I am not able to crack it.
Please help.
Here is one approach using analytic functions:
WITH cte AS (
SELECT *, COUNT(CASE WHEN event = 1 THEN 1 END) OVER
(PARTITION BY customer_id ORDER BY ts) cnt
FROM yourTable
)
SELECT ts, customer_id, event,
MAX(CASE WHEN event = 1 THEN ts END) OVER
(PARTITION BY customer_id, cnt) AS desired_result
FROM cte
ORDER BY customer_id, ts;
Demo
We can articulate your problem by saying that your want the desired_result column to contain the most recent timestamp value when the event was 1. The count (cnt) in the CTE above computes a pseudo group of records for each time the event is 1. Then we simply do a conditional aggregation over customer and pseudo group to find the timestamp value.
One more approach with "one query":
with data as
(
select sysdate - 0.29 ts, 111 customer_id, 1 event from dual union all
select sysdate - 0.28 ts, 111 customer_id, 2 event from dual union all
select sysdate - 0.27 ts, 111 customer_id, 3 event from dual union all
select sysdate - 0.26 ts, 111 customer_id, 1 event from dual union all
select sysdate - 0.25 ts, 111 customer_id, 1 event from dual union all
select sysdate - 0.24 ts, 111 customer_id, 2 event from dual union all
select sysdate - 0.23 ts, 111 customer_id, 1 event from dual union all
select sysdate - 0.22 ts, 111 customer_id, 1 event from dual
)
select
ts, event,
last_value(case when event=1 then ts end) ignore nulls
over (partition by customer_id order by ts) desired_result,
max(case when event=1 then ts end)
over (partition by customer_id order by ts) desired_result_2
from data
order by ts
Edit: As suggested by MatBailie the max(case...) works as well and is a more general approach. The "last_value ... ignore nulls" is Oracle specific.

Query to find changes in a row wrt previous row in SQL query

I have a table per_all_Assignments_f with date_from and date_to and following column structure :
PERSON_ID DATE_FROM DATE_TO GRADE
--------- ------------ ----------- -----
12 01-Jan-2018 28-Feb-2018 c
12 01-Mar-2018 29-Mar-2018 a
12 30-Mar-2018 31-dec-4712 b
13 01-jan-2018 31-dec-4712 c
In the above table, I have to retrieve the latest grade change i.e. for person_id '12', I have to retrieve both record rows : 30-mar-2018 to 31 dec 4712 being the latest and one prior row. What function can i use for this ?
solved by :
SELECT person_id,
asg.grade_id,
lag(asg.grade_id) Over (Partition By person_ID Order By start_date) as prev_ppg_line1,
lag(start_date) Over (Partition By person_ID Order By start_date)
as prev_ppg_effective_start_date,
start_date,
row_Number() Over (Partition By person_ID Order By effective_start_date) as rn
FROM asg_table asg
WHERE person_id = 12;
This query will fetch 3 rows with all the previous changes. I want to fetch the latest change only without using max on effective start date
You can use row_number and lead analytic functions together inside the subquery as :
select person_id, date_From, date_to, grade
from
(
with per_all_Assignments_f(person_id, date_From, date_to, grade) as
(
select 12,date'2018-01-01',date'2018-02-28','c' from dual union all
select 12,date'2018-03-01',date'2018-03-29','a' from dual union all
select 12,date'2018-03-30',date'4172-12-31','b' from dual union all
select 13,date'2018-01-01',date'4172-12-31','c' from dual
)
select t.*,
lead(grade) over (order by date_From desc) as ld,
row_number() over (order by date_From desc) as rn
from per_all_Assignments_f t
)
where rn <= 2
and grade != ld
order by rn desc;
PERSON_ID DATE_FROM DATE_TO GRADE
---------- ----------- ---------- -------
12 01.03.2018 29.03.2018 a
12 30.03.2018 31.12.4172 b
Rextester Demo
Seems like you just want all with a row_number() of 1 or 2 partitioned by the person and ordered by the beginning descending.
SELECT person_id,
date_from,
date_to,
grade
FROM (SELECT person_id,
date_from,
date_to,
grade,
row_number() OVER (PARTITION BY person_id
ORDER BY date_from DESC) rn
FROM per_all_assignments_f t) x
WHERE rn IN (1, 2)
ORDER BY person_id ASC,
date_from DESC;

select periods from date

I have a problem with choosing from the list of absences, those that follow one another and grouping them into periods.
date_from (data_od) date_to(data_do)
--------------------------
18/08/01 - 18/08/15
18/08/16 - 18/08/20
18/08/21 - 18/08/31
18/09/01 - 18/09/08
18/05/01 - 18/05/31
18/06/01 - 18/06/30
18/03/01 - 18/03/18
18/02/14 - 18/02/28
above is a list of absences, and the result of which should be a table:
date_from (data_od) date_to(data_do)
--------------------------
18/08/01 18/09/08
18/05/01 18/06/30
18/02/14 18/03/18
For now, I did something like this, but I only research in twos :(
SELECT u1.data_od,u2.data_do
FROM l_absencje u1 CROSS APPLY
(SELECT * FROM l_absencje labs
WHERE labs.prac_id=u1.prac_id AND
TRUNC(labs.data_od) = TRUNC(u1.data_do)+1
ORDER BY id DESC FETCH FIRST 1 ROWS ONLY
) u2 where u1.prac_id=1067 ;
And give me that:
18/08/01 18/08/20 bad
18/08/16 18/08/31 bad
18/08/21 18/09/08 bad
18/05/01 18/06/30 good
18/02/14 18/03/18 good
You can use a combination of the LAG(), LEAD() and LAST_VALUE() analytic functions:
SQL Fiddle
Oracle 11g R2 Schema Setup:
CREATE TABLE absences ( date_from, date_to ) AS
SELECT DATE '2018-08-01', DATE '2018-08-15' FROM DUAL UNION ALL
SELECT DATE '2018-08-16', DATE '2018-08-20' FROM DUAL UNION ALL
SELECT DATE '2018-08-21', DATE '2018-08-31' FROM DUAL UNION ALL
SELECT DATE '2018-09-01', DATE '2018-09-08' FROM DUAL UNION ALL
SELECT DATE '2018-05-01', DATE '2018-05-31' FROM DUAL UNION ALL
SELECT DATE '2018-06-01', DATE '2018-06-30' FROM DUAL UNION ALL
SELECT DATE '2018-03-01', DATE '2018-03-18' FROM DUAL UNION ALL
SELECT DATE '2018-02-14', DATE '2018-02-28' FROM DUAL;
Query 1:
SELECT *
FROM (
SELECT CASE
WHEN date_to IS NOT NULL
THEN LAST_VALUE( date_from ) IGNORE NULLS
OVER( ORDER BY ROWNUM )
END AS date_from,
date_to
FROM (
SELECT CASE date_from
WHEN LAG( date_to ) OVER ( ORDER BY date_to )
+ INTERVAL '1' DAY
THEN NULL
ELSE date_from
END AS date_from,
CASE date_to
WHEN LEAD( date_from ) OVER ( ORDER BY date_from )
- INTERVAL '1' DAY
THEN NULL
ELSE date_to
END AS date_to
FROM absences
)
)
WHERE date_from IS NOT NULL
AND date_to IS NOT NULL
Results:
| DATE_FROM | DATE_TO |
|----------------------|----------------------|
| 2018-02-14T00:00:00Z | 2018-03-18T00:00:00Z |
| 2018-05-01T00:00:00Z | 2018-06-30T00:00:00Z |
| 2018-08-01T00:00:00Z | 2018-09-08T00:00:00Z |

SQL min() max() function with exception

Here is my simplied code:
SELECT
a.user_id as User_ID,
min(b.a_day) as Date_from,
max(b.a_day) as Date_to,
c.code as ID
FROM a, b, c
WHERE
a_day > (day, -15, getdate())
GROUP BY
a.user_id,
c.code
Query gives the following output:
User ID date_from date_to id
1234567 2016-06-13 2016-06-13 B
1234567 2016-06-17 2016-06-17 A
12345672016-06-18 2016-06-18 A
1234567 2016-06-19 2016-06-19 A
1234567 2016-06-20 2016-06-20 A
1234567 2016-06-21 2016-06-21 B
I need something like this:
User ID date_from date_to id
1234567 2016-06-13 2016-06-13 B
1234567 2016-06-17 2016-06-20 A
1234567 2016-06-21 2016-06-21 B
When I use min() and max() function with group by, it aggregates fine for all records with ID=A but there should be exception for ID=B. I have to aggregate only dates with the same ID day after day.
Any ideas?
Thanks in advance.
You can combine these rows using the following strategy:
Determine where a new grouping begins.
Do a cumulative sum of the flag from (1) to identify each grouping.
Then do the aggregation.
This looks like:
select min(date_from) as date_from, max(date_to) as date_to, id
from (select t.*,
sum(isNewGroup) over (partition by id order by date_from) as grp
from (select t.*,
(case when lag(date_to) over (partition by id order by date_from) >= date_from
then 0 else 1
end) as isNewGroup
from t
) t
) t
group by id, grp;
it's my solution to get min/max continuous date.
try to run the SQL in your oracle.
is it helpful for you?
WITH TEST_DATA AS (
SELECT TO_DATE('20160613', 'YYYYMMDD') AS DATE_FROM, TO_DATE('20160613', 'YYYYMMDD') AS DATE_TO, 'B' AS ID FROM DUAL
UNION ALL
SELECT TO_DATE('20160617', 'YYYYMMDD') AS DATE_FROM, TO_DATE('20160617', 'YYYYMMDD') AS DATE_TO, 'A' AS ID FROM DUAL
UNION ALL
SELECT TO_DATE('20160618', 'YYYYMMDD') AS DATE_FROM, TO_DATE('20160618', 'YYYYMMDD') AS DATE_TO, 'A' AS ID FROM DUAL
UNION ALL
SELECT TO_DATE('20160619', 'YYYYMMDD') AS DATE_FROM, TO_DATE('20160619', 'YYYYMMDD') AS DATE_TO, 'A' AS ID FROM DUAL
UNION ALL
SELECT TO_DATE('20160620', 'YYYYMMDD') AS DATE_FROM, TO_DATE('20160620', 'YYYYMMDD') AS DATE_TO, 'A' AS ID FROM DUAL
UNION ALL
SELECT TO_DATE('20160621', 'YYYYMMDD') AS DATE_FROM, TO_DATE('20160621', 'YYYYMMDD') AS DATE_TO, 'B' AS ID FROM DUAL
)
SELECT
MIN(ID) AS ID,
MIN(DATE_FROM) AS DATE_FROM,
MAX(DATE_TO) AS DATE_TO
FROM (
SELECT
CONNECT_BY_ROOT(DATE_FROM) || CONNECT_BY_ROOT(ID) AS GROUP_KEY,
ROW_NUMBER() OVER(PARTITION BY ID, DATE_FROM, DATE_TO ORDER BY ID, LEVEL DESC) AS DISTINCT_FLG,
DATE_FROM,
DATE_TO,
ID
FROM
TEST_DATA
WHERE ID = CONNECT_BY_ROOT(ID)
CONNECT BY DATE_FROM = PRIOR DATE_TO + 1
ORDER BY DATE_FROM
)
WHERE
DISTINCT_FLG = 1
GROUP BY
GROUP_KEY
Here is mysql solution:
select grp, min(f) f, max(t) t, i
from
(
select x.*
,case when #lastu = i and datediff(f, #lastf)=1 then #gr:=#gr else #gr:=#gr+1 end grp
,#lastu:= i
,#lastf:= f
from
(
select '2016-06-13' f,'2016-06-13' t ,'B' i union all
select '2016-06-17','2016-06-17','A' union all
select '2016-06-18','2016-06-18','A' union all
select '2016-06-19','2016-06-19','A' union all
select '2016-06-20','2016-06-20','A' union all
select '2016-06-21','2016-06-21','B'
order by i, f, t
) x
, (select #gr:=0, #lastu:='', #lastf:='' ) b
) xx
group by grp, i