SQL: Identify first and last date within each consecutive group of days - sql

Objective:
The objective is to find the first and last observation date for which the room has a constant price using postgresql SQL queries.
We are completely lost so any guidance would be highly appreciated.
Create example:
-- Sample table of room price observations used by the question.
CREATE TABLE table_prices (
    pk               integer GENERATED BY DEFAULT AS IDENTITY,
    room_id          varchar(50) COLLATE pg_catalog."default",
    check_in         date,
    price            integer,
    observation_date date,
    PRIMARY KEY (pk)
);
Insert data:
-- Sample rows: room '1' goes 100 -> 150 -> 100 over nine observation dates; room '2' stays at 200.
insert into table_prices (room_id, check_in, price, observation_date) values
-- room '1', price 100
('1', '2019-05-01', 100, '2019-01-01'),
('1', '2019-05-01', 100, '2019-01-02'),
('1', '2019-05-01', 100, '2019-01-03'),
-- room '1', price 150
('1', '2019-05-01', 150, '2019-01-04'),
('1', '2019-05-01', 150, '2019-01-05'),
('1', '2019-05-01', 150, '2019-01-06'),
('1', '2019-05-01', 150, '2019-01-07'),
-- room '1', back to price 100
('1', '2019-05-01', 100, '2019-01-08'),
('1', '2019-05-01', 100, '2019-01-09'),
-- room '2', constant price 200
('2', '2019-05-01', 200, '2019-01-01'),
('2', '2019-05-01', 200, '2019-01-02'),
('2', '2019-05-01', 200, '2019-01-03'),
('2', '2019-05-01', 200, '2019-01-04'),
('2', '2019-05-01', 200, '2019-01-05'),
('2', '2019-05-01', 200, '2019-01-06'),
('2', '2019-05-01', 200, '2019-01-07'),
('2', '2019-05-01', 200, '2019-01-08'),
('2', '2019-05-01', 200, '2019-01-09')
Expected outcome:
room_id, check_in, first_observation, last_observation, price
1, 2019-05-01, 2019-01-01, 2019-01-03, 100
1, 2019-05-01, 2019-01-04, 2019-01-07, 150
1, 2019-05-01, 2019-01-08, 2019-01-09, 100
2, 2019-05-01, 2019-01-01, 2019-01-09, 200

This is a gaps-and-islands problem; you can solve it using row_number().
DEMO
-- Gaps-and-islands: collapse each run of consecutive observations that share
-- the same price into one row per (room_id, check_in).
SELECT
    room_id,
    check_in,
    MIN(observation_date) AS first_observation,
    MAX(observation_date) AS last_observation,
    price
FROM (
    SELECT
        room_id,
        check_in,
        price,
        observation_date,
        -- The difference between the two row numbers stays constant while the
        -- price is unchanged and shifts when a different price interrupts the
        -- run. PostgreSQL requires "expr AS alias"; the T-SQL form
        -- "alias = expr" would be read as a comparison with a column named
        -- island, which does not exist. check_in is part of both partitions
        -- because the outer GROUP BY also separates rows by check_in.
        ROW_NUMBER() OVER (PARTITION BY room_id, check_in ORDER BY observation_date)
        - ROW_NUMBER() OVER (PARTITION BY room_id, check_in, price ORDER BY observation_date) AS island
    FROM table_prices
) AS a
GROUP BY room_id, check_in, island, price
-- Deterministic output order, matching the expected result listing.
ORDER BY room_id, check_in, first_observation;
OUTPUT:
room_id check_in first_observation last_observation price
1 01/05/2019 00:00:00 01/01/2019 00:00:00 03/01/2019 00:00:00 100
1 01/05/2019 00:00:00 04/01/2019 00:00:00 07/01/2019 00:00:00 150
1 01/05/2019 00:00:00 08/01/2019 00:00:00 09/01/2019 00:00:00 100
2 01/05/2019 00:00:00 01/01/2019 00:00:00 09/01/2019 00:00:00 200

Related

fetching startdate and enddate from timestemp column and partition by modes in SQL

I am trying to solve the following problem. Given table1 below, I want to produce the output shown further down, but I cannot work out the logic for deriving a start time and an end time from the same timestemp column in SQL.
-- Sample table as posted in the question; `timestemp` is stored as text in a VARCHAR(8) column.
CREATE TABLE table1 (
`batch` INTEGER,
`timestemp` VARCHAR(8),
`mo` INTEGER,
`speed` INTEGER
);
-- One reading per hour for batch 1. Every value is written as a quoted string,
-- including those destined for the INTEGER columns.
INSERT INTO table1
(`batch`, `timestemp`, `mo`, `speed`)
VALUES
('1', '00:18:00', '0', '0'),
('1', '01:18:00', '0', '0'),
('1', '02:18:00', '0', '0'),
('1', '03:18:00', '1', '5'),
('1', '04:18:00', '1', '6'),
('1', '05:18:00', '1', '7'),
('1', '06:18:00', '2', '10'),
('1', '07:18:00', '2', '9'),
('1', '08:18:00', '2', '8'),
('1', '09:18:00', '3', '12'),
('1', '10:18:00', '3', '23'),
('1', '11:18:00', '3', '21'),
('1', '12:18:00', '4', '20'),
('1', '13:18:00', '4', '22');
mo=mode
batch
timestemp
mo
speed
1
00:18:00
0
0
1
01:18:00
0
0
1
02:18:00
0
0
1
03:18:00
1
5
1
04:18:00
1
6
1
05:18:00
1
7
1
06:18:00
2
10
1
07:18:00
2
9
1
08:18:00
2
8
1
09:18:00
3
12
1
10:18:00
3
23
1
11:18:00
3
21
1
12:18:00
4
20
1
13:18:00
4
22
Output:
batch
start time
end time
mode
1
00:18:00
03:17:00
0
1
03:18:00
06:17:00
1
1
06:18:00
09:17:00
2
1
09:18:00
12:17:00
3
1
12:18:00
13:18:00
4
Schema (MySQL v8.0)
-- Same sample table as in the question, but with `timestemp` typed as TIME.
CREATE TABLE table1 (
    `batch`     INTEGER,
    `timestemp` TIME,
    `mo`        INTEGER,
    `speed`     INTEGER
);

-- One reading per hour for batch 1; `mo` is the mode active at that time.
INSERT INTO table1 (`batch`, `timestemp`, `mo`, `speed`)
VALUES
    (1, '00:18:00', 0,  0),
    (1, '01:18:00', 0,  0),
    (1, '02:18:00', 0,  0),
    (1, '03:18:00', 1,  5),
    (1, '04:18:00', 1,  6),
    (1, '05:18:00', 1,  7),
    (1, '06:18:00', 2, 10),
    (1, '07:18:00', 2,  9),
    (1, '08:18:00', 2,  8),
    (1, '09:18:00', 3, 12),
    (1, '10:18:00', 3, 23),
    (1, '11:18:00', 3, 21),
    (1, '12:18:00', 4, 20),
    (1, '13:18:00', 4, 22);
Query
-- One row per (batch, mode): the mode's first timestamp, and as end time the
-- minute before the next mode of the same batch starts. The last mode of a
-- batch has no successor and keeps its own last timestamp.
-- NOTE: the GROUP BY assumes each mode occurs as one contiguous run per batch.
SELECT
    batch,
    mode,
    start_time,
    COALESCE(
        -- PARTITION BY batch keeps LEAD from picking up the first mode of a
        -- different batch as the "next" start time.
        SUBTIME(LEAD(start_time) OVER (PARTITION BY batch ORDER BY start_time), '00:01:00'),
        end_time
    ) AS end_time
FROM (
    SELECT
        batch,
        MIN(timestemp) AS start_time,
        MAX(timestemp) AS end_time,
        mo AS mode
    FROM table1
    GROUP BY batch, mo
) AS min_max
ORDER BY batch, start_time;
batch
mode
start_time
end_time
1
0
00:18:00
03:17:00
1
1
03:18:00
06:17:00
1
2
06:18:00
09:17:00
1
3
09:18:00
12:17:00
1
4
12:18:00
13:18:00
View on DB Fiddle

Oracle SQL - How to return common date periods and "divide" when there are gaps between periods

I'm trying to return common date periods (per id) from below data, but I cannot find a way to handle case when date periods have a gap between common periods. Can anyone help?
|id|code_id|code|date_from|date_to|
|--|--|--|--|--|
|10|100| 1000 |02/02/2022 |03/02/2022 23:57:00|
|10|100| 1000 |07/02/2022 01:00:00 |08/02/2022 |
|10|100| 2000 |02/02/2022 |02/02/2022 23:00:00|
|10|100| 2000 |07/02/2022 03:00:00 |08/02/2022 |
|10|200| 2000 |02/02/2022 02:14:00 |04/02/2022 21:37:00|
|20|100| 1000 |01/02/2022 05:00:00 |03/02/2022 |
|30|100| 2000 |02/02/2022 |02/02/2022 23:00:00|
|30|200| 2000 |02/02/2022 02:14:00 |04/02/2022 |
|40|100| 2000 |07/02/2022 03:00:00 |08/02/2022 23:10:00|
|50|200| 2000 |04/02/2022 |04/02/2022 21:37:00|
|50|200| 3000 |04/02/2022 02:12:00 |05/02/2022 23:31:00|
Below simple query works fine, but only for ids which have one common period (with no gaps).
I would expect for id = 10 to return two rows (as there is a gap between dates) for periods which are:
I) 02/02/2022 00:00:00 <-> 04/02/2022 21:37:00
II) 07/02/2022 01:00:00 <-> 08/02/2022 00:00:00
-- Widest period per id: earliest start and latest end, ignoring any gaps.
SELECT   g.id,
         MIN(g.date_from) AS date_from,
         MAX(g.date_to)   AS date_to
FROM     my_gtt g
GROUP BY g.id
ORDER BY g.id
Current results (but id = 10 is incorrect):
|id|date_from|date_to|
|--|--|--|
|10| 02/02/2022 |08/02/2022 |
|20| 01/02/2022 05:00:00 |03/02/2022 |
|30| 02/02/2022 |04/02/2022 |
|40| 07/02/2022 03:00:00 |08/02/2022 23:10:00|
|50| 04/02/2022 |05/02/2022 23:31:00|
Data and table creation:
-- Session-scoped scratch table holding the sample periods; rows survive COMMIT.
CREATE GLOBAL TEMPORARY TABLE my_gtt (
    id        NUMBER(10),
    code_id   NUMBER(10),
    code      NUMBER(10),
    date_from DATE,
    date_to   DATE
) ON COMMIT PRESERVE ROWS;
-- Sample periods from the question. Values are positional: id, code_id, code, date_from, date_to.
-- Dates given without a time part are midnight.
INSERT INTO my_gtt VALUES (10, 100, 1000, TO_DATE('02-02-2022', 'dd-mm-yyyy'), TO_DATE('03-02-2022 23:57:00', 'dd-mm-yyyy hh24:mi:ss'));
INSERT INTO my_gtt VALUES (10, 100, 1000, TO_DATE('07-02-2022 01:00:00', 'dd-mm-yyyy hh24:mi:ss'), TO_DATE('08-02-2022', 'dd-mm-yyyy'));
INSERT INTO my_gtt VALUES (10, 100, 2000, TO_DATE('02-02-2022', 'dd-mm-yyyy'), TO_DATE('02-02-2022 23:00:00', 'dd-mm-yyyy hh24:mi:ss'));
INSERT INTO my_gtt VALUES (10, 100, 2000, TO_DATE('07-02-2022 03:00:00', 'dd-mm-yyyy hh24:mi:ss'), TO_DATE('08-02-2022', 'dd-mm-yyyy'));
INSERT INTO my_gtt VALUES (10, 200, 2000, TO_DATE('02-02-2022 02:14:00', 'dd-mm-yyyy hh24:mi:ss'), TO_DATE('04-02-2022 21:37:00', 'dd-mm-yyyy hh24:mi:ss'));
INSERT INTO my_gtt VALUES (20, 100, 1000, TO_DATE('01-02-2022 05:00:00', 'dd-mm-yyyy hh24:mi:ss'), TO_DATE('03-02-2022', 'dd-mm-yyyy'));
INSERT INTO my_gtt VALUES (30, 100, 2000, TO_DATE('02-02-2022', 'dd-mm-yyyy'), TO_DATE('02-02-2022 23:00:00', 'dd-mm-yyyy hh24:mi:ss'));
INSERT INTO my_gtt VALUES (30, 200, 2000, TO_DATE('02-02-2022 02:14:00', 'dd-mm-yyyy hh24:mi:ss'), TO_DATE('04-02-2022', 'dd-mm-yyyy'));
INSERT INTO my_gtt VALUES (40, 100, 2000, TO_DATE('07-02-2022 03:00:00', 'dd-mm-yyyy hh24:mi:ss'), TO_DATE('08-02-2022 23:10:00', 'dd-mm-yyyy hh24:mi:ss'));
INSERT INTO my_gtt VALUES (50, 200, 2000, TO_DATE('04-02-2022', 'dd-mm-yyyy'), TO_DATE('04-02-2022 21:37:00', 'dd-mm-yyyy hh24:mi:ss'));
INSERT INTO my_gtt VALUES (50, 200, 3000, TO_DATE('04-02-2022 02:12:00', 'dd-mm-yyyy hh24:mi:ss'), TO_DATE('05-02-2022 23:31:00', 'dd-mm-yyyy hh24:mi:ss'));
From Oracle 12, MATCH_RECOGNIZE is the simplest solution:
-- Merge overlapping periods per id with row pattern matching.
SELECT id,
       start_date,
       end_date
FROM   my_gtt
       MATCH_RECOGNIZE (
           PARTITION BY id
           ORDER BY date_from, date_to
           MEASURES
               MIN(date_from) AS start_date,
               MAX(date_to)   AS end_date
           ONE ROW PER MATCH
           -- any number of rows that reach into the next row, then one closing row
           PATTERN (overlapping* closing_row)
           DEFINE
               -- the latest end seen so far in the match reaches the next row's start
               overlapping AS MAX(date_to) >= NEXT(date_from)
       );
However, if you are on an earlier version you can find the output using:
-- Alternative for earlier versions: turn every interval into a +1 event (date_from) and a
-- -1 event (date_to), then use running sums to find where a merged period opens and closes.
SELECT id,
MIN(dt) AS date_from,
MAX(dt) AS date_to
FROM (
SELECT id,
dt,
-- Running total of the kept values: it grows by 1 at each period-opening event (value = 1),
-- while closing events (value = 0) add nothing, so both ends of a period share one match_no.
SUM(value) OVER (PARTITION BY id ORDER BY dt, ROWNUM) AS match_no
FROM (
SELECT id,
dt,
-- SUM(type) is the number of intervals still open after this event. Multiplied by type it is
-- 1 for a start that opens the only open interval and 0 for an end that closes the last one.
type * SUM(type) OVER (PARTITION BY id ORDER BY dt, ROWNUM) AS value
FROM my_gtt
UNPIVOT (dt FOR type IN (date_from AS 1, date_to AS -1))
)
-- Keep only the events that open (1) or close (0) a merged period.
WHERE value IN (1,0)
)
GROUP BY id, match_no
Which, for the sample data:
-- The question's sample periods. Values are positional: id, code_id, code, date_from, date_to.
INSERT INTO my_gtt VALUES (10, 100, 1000, TO_DATE('02-02-2022', 'dd-mm-yyyy'), TO_DATE('03-02-2022 23:57:00', 'dd-mm-yyyy hh24:mi:ss'));
INSERT INTO my_gtt VALUES (10, 100, 1000, TO_DATE('07-02-2022 01:00:00', 'dd-mm-yyyy hh24:mi:ss'), TO_DATE('08-02-2022', 'dd-mm-yyyy'));
INSERT INTO my_gtt VALUES (10, 100, 2000, TO_DATE('02-02-2022', 'dd-mm-yyyy'), TO_DATE('02-02-2022 23:00:00', 'dd-mm-yyyy hh24:mi:ss'));
INSERT INTO my_gtt VALUES (10, 100, 2000, TO_DATE('07-02-2022 03:00:00', 'dd-mm-yyyy hh24:mi:ss'), TO_DATE('08-02-2022', 'dd-mm-yyyy'));
INSERT INTO my_gtt VALUES (10, 200, 2000, TO_DATE('02-02-2022 02:14:00', 'dd-mm-yyyy hh24:mi:ss'), TO_DATE('04-02-2022 21:37:00', 'dd-mm-yyyy hh24:mi:ss'));
INSERT INTO my_gtt VALUES (20, 100, 1000, TO_DATE('01-02-2022 05:00:00', 'dd-mm-yyyy hh24:mi:ss'), TO_DATE('03-02-2022', 'dd-mm-yyyy'));
INSERT INTO my_gtt VALUES (30, 100, 2000, TO_DATE('02-02-2022', 'dd-mm-yyyy'), TO_DATE('02-02-2022 23:00:00', 'dd-mm-yyyy hh24:mi:ss'));
INSERT INTO my_gtt VALUES (30, 200, 2000, TO_DATE('02-02-2022 02:14:00', 'dd-mm-yyyy hh24:mi:ss'), TO_DATE('04-02-2022', 'dd-mm-yyyy'));
INSERT INTO my_gtt VALUES (40, 100, 2000, TO_DATE('07-02-2022 03:00:00', 'dd-mm-yyyy hh24:mi:ss'), TO_DATE('08-02-2022 23:10:00', 'dd-mm-yyyy hh24:mi:ss'));
INSERT INTO my_gtt VALUES (50, 200, 2000, TO_DATE('04-02-2022', 'dd-mm-yyyy'), TO_DATE('04-02-2022 21:37:00', 'dd-mm-yyyy hh24:mi:ss'));
INSERT INTO my_gtt VALUES (50, 200, 3000, TO_DATE('04-02-2022 02:12:00', 'dd-mm-yyyy hh24:mi:ss'), TO_DATE('05-02-2022 23:31:00', 'dd-mm-yyyy hh24:mi:ss'));
-- Additional id = 60 rows beyond the question's data: periods nested inside or overlapping
-- one another, followed by a second, separate pair.
INSERT INTO my_gtt VALUES (60, 200, 3000, DATE '2022-01-01', DATE '2022-01-10');
INSERT INTO my_gtt VALUES (60, 200, 3000, DATE '2022-01-02', DATE '2022-01-04');
INSERT INTO my_gtt VALUES (60, 200, 3000, DATE '2022-01-06', DATE '2022-01-11');
INSERT INTO my_gtt VALUES (60, 200, 3000, DATE '2022-01-13', DATE '2022-01-16');
INSERT INTO my_gtt VALUES (60, 200, 3000, DATE '2022-01-14', DATE '2022-01-15');
Both output:
ID
START_DATE
END_DATE
10
2022-02-02 00:00:00
2022-02-04 21:37:00
10
2022-02-07 01:00:00
2022-02-08 00:00:00
20
2022-02-01 05:00:00
2022-02-03 00:00:00
30
2022-02-02 00:00:00
2022-02-04 00:00:00
40
2022-02-07 03:00:00
2022-02-08 23:10:00
50
2022-02-04 00:00:00
2022-02-05 23:31:00
60
2022-01-01 00:00:00
2022-01-11 00:00:00
60
2022-01-13 00:00:00
2022-01-16 00:00:00
db<>fiddle here
SQL pattern matching can help:
-- One output row per run of overlapping periods within each id.
SELECT id,
       start_date,
       end_date
FROM   my_gtt
       MATCH_RECOGNIZE (
           PARTITION BY id
           ORDER BY date_from, date_to
           MEASURES
               MIN(date_from) AS start_date,
               MAX(date_to)   AS end_date
           ONE ROW PER MATCH
           -- rows that reach into their successor, closed by one row that does not need to
           PATTERN (joined* tail)
           DEFINE
               -- the next row starts no later than the latest end seen so far
               joined AS MAX(date_to) >= NEXT(date_from)
       );
ID START_DATE END_DATE
---------- -------------------- --------------------
10 02-FEB-2022 00:00:00 04-FEB-2022 21:37:00
10 07-FEB-2022 01:00:00 08-FEB-2022 00:00:00
20 01-FEB-2022 05:00:00 03-FEB-2022 00:00:00
30 02-FEB-2022 00:00:00 04-FEB-2022 00:00:00
40 07-FEB-2022 03:00:00 08-FEB-2022 23:10:00
50 04-FEB-2022 00:00:00 05-FEB-2022 23:31:00
I discuss how this works in more detail in pattern matching use cases

Select rows until running sum reaches specific value

I have the following data:
-- Sample data. A table variable must be named with "@"; "DECLARE #t TABLE" is not valid T-SQL.
-- The trailing comments show the running total of amount per usr.
DECLARE @t TABLE (usr VARCHAR(100), dt DATE, amount INT);
INSERT INTO @t (usr, dt, amount) VALUES
('a', '2018-01-01', 100), -- 100
('a', '2018-02-01', 100), -- 200
('a', '2018-03-01', 100), -- 300
('a', '2018-04-01', 100), -- 400
('a', '2018-05-01', 100), -- 500
('b', '2018-01-01', 150), -- 150
('b', '2018-02-01', 150), -- 300
('b', '2018-03-01', 150), -- 450
('b', '2018-04-01', 150), -- 600
('b', '2018-05-01', 150); -- 750
And a value such as 300 or 301 (a user variable or column). I want to select rows until running total of amount reaches the specified value, with the following twist:
For 300 I want to select first 3 rows for a and first 2 rows for b
For 301 I want to select first 4 rows for a and first 3 rows for b
This is supposed to be simple but the solutions I found do not handle the second case.
-- Sample data in a table variable (table variables are named with "@").
DECLARE @t TABLE (usr VARCHAR(100), dt DATE, amount INT);
INSERT INTO @t (usr, dt, amount) VALUES
('a', '2018-01-01', 100), -- 100
('a', '2018-02-01', 100), -- 200
('a', '2018-03-01', 100), -- 300
('a', '2018-04-01', 100), -- 400
('a', '2018-05-01', 100), -- 500
('b', '2018-01-01', 150), -- 150
('b', '2018-02-01', 150), -- 300
('b', '2018-03-01', 150), -- 450
('b', '2018-04-01', 150), -- 600
('b', '2018-05-01', 150); -- 750

-- Target value the running total has to reach.
DECLARE @Total INT = 301;

WITH cte AS
(
    SELECT usr,
           dt,
           amount,
           -- ROWS frame: a true row-by-row running total. The default RANGE frame
           -- would add all rows sharing the same dt at once.
           SUM(amount) OVER (PARTITION BY usr ORDER BY dt ROWS UNBOUNDED PRECEDING) AS RunTotal
    FROM @t
)
SELECT usr, dt, amount, RunTotal
FROM cte
-- Include a row while the running total BEFORE it (RunTotal - amount) is still
-- below the target; this also returns the row that crosses the target.
WHERE cte.RunTotal - cte.amount < @Total
ORDER BY usr, dt;
-- Sample data in a table variable (table variables are named with "@").
DECLARE @t TABLE (usr VARCHAR(100), dt DATE, amount INT);
INSERT INTO @t (usr, dt, amount) VALUES
('a', '2018-01-01', 100), -- 100
('a', '2018-02-01', 100), -- 200
('a', '2018-03-01', 100), -- 300
('a', '2018-04-01', 100), -- 400
('a', '2018-05-01', 100), -- 500
('b', '2018-01-01', 150), -- 150
('b', '2018-02-01', 150), -- 300
('b', '2018-03-01', 150), -- 450
('b', '2018-04-01', 150), -- 600
('b', '2018-05-01', 150); -- 750

-- Target value the running total has to reach.
declare @target int = 300;

with cte_RunningTotal as
(
    select
        usr,
        dt,
        amount,
        sum(amount) over (partition by usr order by dt rows unbounded preceding) as runningTotal
    from @t
)
select usr, dt, amount, runningTotal
from cte_RunningTotal
-- Equivalent to "running total before this row < @target": keeps every row up to
-- and including the one on which the target is reached.
where runningTotal < @target + amount
order by usr, dt;

SQL Server episode identification

I am working with a blood pressure database in SQL Server which contains patient_id, timestamp (per minute) and systolicBloodPressure.
My goals are to find:
the number of episodes in which a patient is under a certain blood pressure threshold
An episode runs from the timestamp where the patient drops below a certain threshold until the timestamp where the patient comes back above the threshold.
the mean blood pressure per episode per patient
the duration of the episode per episode per patient
What I have tried so far:
I am able to identify episodes by just making a new column which sets to 1 if threshold is reached.
-- Flag each reading that is under the threshold; other rows get NULL.
-- A CASE expression must be closed with END, and the column is given a name.
select *
, CASE
when sys < threshold THEN '1'
END as below_threshold
from BPDATA
However , I am not able to 'identify' different episodes within the patient; episode1 episode 2 with their relative timestamps.
Could someone help me with this? Or is there someone with a better different solution?
EDIT: Sample data with example threshold 100
ID Timestamp SysBP below Threshold
----------------------------------------------------
1 9:38 110 Null
1 9:39 105 Null
1 9:40 96 1
1 9:41 92 1
1 9:42 102 Null
2 12:23 95 1
2 12:24 98 1
2 12:25 102 Null
2 12:26 104 Null
2 12:27 94 1
2 12:28 88 1
2 12:29 104 Null
Thanks for the sample data.
This should work:
-- Sample readings from the question. Table variables and scalar variables are
-- named with "@" ("declare #t table" is not valid T-SQL).
declare @t table (ID int, Timestamp time, SysBP int, belowThreshold bit)
insert into @t (ID, Timestamp, SysBP, belowThreshold)
values
(1, '9:38', 110, null),
(1, '9:39', 105, null),
(1, '9:40', 96, 1),
(1, '9:41', 92, 1),
(1, '9:42', 102, null),
(2, '12:23', 95, 1),
(2, '12:24', 98, 1),
(2, '12:25', 102, null),
(2, '12:26', 104, null),
(2, '12:27', 94, 1),
(2, '12:28', 88, 1),
(2, '12:29', 104, null)
declare @threshold int = 100
;with y as (
    -- epg is 0 only when the previous row's flag equals this row's flag. A NULL on
    -- either side makes the comparison unknown, so those rows get 1. The first
    -- below-threshold row of each run therefore carries a 1.
    select *, case when lag(belowThreshold, 1, 0) over(partition by id order by timestamp) = belowThreshold then 0 else 1 end epg
    from @t
),
z as (
    -- Only below-threshold rows remain; the running sum of epg numbers the episodes per id.
    select *, sum(epg) over(partition by id order by timestamp) episode
    from y
    where sysbp < @threshold
)
-- One row per (id, episode). The window count runs over the grouped rows, so it is the
-- number of episodes of that id. Duration is in minutes, counting both end points.
select id, episode, count(episode) over(partition by id) number_of_episodes_per_id, avg(sysbp) avg_sysbp, datediff(minute, min(timestamp), max(timestamp))+1 episode_duration
from z
group by id, episode
This answer relies on the LEAD() and LAG() functions, so it only works on SQL Server 2012 or later:
Setup:
-- Temporary table (# prefix) holding the sample readings for the answer below.
CREATE TABLE #bloodpressure
(
Patient_id int,
[TimeStamp] SmallDateTime,
SystolicBloodPressure INT
)
-- One reading per minute per patient; values are positional
-- (Patient_id, TimeStamp, SystolicBloodPressure).
INSERT INTO #bloodpressure
VALUES
-- patient 1
(1, '2017-01-01 09:01', 60),
(1, '2017-01-01 09:02', 55),
(1, '2017-01-01 09:03', 60),
(1, '2017-01-01 09:04', 70),
(1, '2017-01-01 09:05', 72),
(1, '2017-01-01 09:06', 75),
(1, '2017-01-01 09:07', 60),
(1, '2017-01-01 09:08', 50),
(1, '2017-01-01 09:09', 52),
(1, '2017-01-01 09:10', 53),
(1, '2017-01-01 09:11', 65),
(1, '2017-01-01 09:12', 71),
(1, '2017-01-01 09:13', 73),
(1, '2017-01-01 09:14', 74),
-- patient 2
(2, '2017-01-01 09:01', 70),
(2, '2017-01-01 09:02', 75),
(2, '2017-01-01 09:03', 80),
(2, '2017-01-01 09:04', 70),
(2, '2017-01-01 09:05', 72),
(2, '2017-01-01 09:06', 75),
(2, '2017-01-01 09:07', 60),
(2, '2017-01-01 09:08', 50),
(2, '2017-01-01 09:09', 52),
(2, '2017-01-01 09:10', 53),
(2, '2017-01-01 09:11', 65),
(2, '2017-01-01 09:12', 71),
(2, '2017-01-01 09:13', 73),
(2, '2017-01-01 09:14', 74),
-- patient 3: a single one-minute dip
(3, '2017-01-01 09:12', 71),
(3, '2017-01-01 09:13', 60),
(3, '2017-01-01 09:14', 74)
Now using Lead And Lag to find the previous rows values, to find whether this is the beginning or end of a sequence of low blood pressures, in combination with a common table expression. Using a UNION of start and end events ensures that an event which covers just one minute is recorded as both a start and an end event.
-- Pair the start and the end of every run of readings below 70 for each patient.
;WITH readings
AS
(
    -- Each reading together with the previous and next value of the same patient.
    SELECT Patient_id,
           TimeStamp,
           SystolicBloodPressure,
           LAG(SystolicBloodPressure, 1)
               OVER (PARTITION BY Patient_id ORDER BY TimeStamp) AS PrevValue,
           LEAD(SystolicBloodPressure, 1)
               OVER (PARTITION BY Patient_id ORDER BY TimeStamp) AS NextValue
    FROM #bloodpressure
),
events
AS
(
    -- Start events (EventType 1): a low reading with no predecessor or a predecessor >= 70.
    SELECT 1 AS [EventType], Patient_id, TimeStamp,
           ROW_NUMBER() OVER (ORDER BY Patient_id, TimeStamp) AS RN
    FROM readings
    WHERE SystolicBloodPressure < 70
      AND (PrevValue IS NULL OR PrevValue >= 70)
    -- UNION ALL: the two branches differ in EventType, so there is nothing to
    -- de-duplicate and the implicit DISTINCT of UNION is unnecessary work.
    UNION ALL
    -- End events (EventType 2): a low reading with no successor or a successor >= 70.
    SELECT 2 AS [EventType], Patient_id, TimeStamp,
           ROW_NUMBER() OVER (ORDER BY Patient_id, TimeStamp) AS RN
    FROM readings
    WHERE SystolicBloodPressure < 70
      AND (NextValue IS NULL OR NextValue >= 70)
)
-- Starts and ends are numbered in the same order, so the n-th start pairs with the n-th end.
SELECT s.Patient_id, s.TimeStamp AS EventStart, e.TimeStamp AS EventEnd
FROM events s
INNER JOIN events e
    ON s.Patient_id = e.Patient_id AND s.RN = e.RN
WHERE s.EventType = 1 AND e.EventType = 2
ORDER BY s.Patient_id, s.TimeStamp

Postgresql. Merge and split date ranges from two tables by set of keys

I'm trying to combine multiple date ranges from two identically structured tables that hold the same or different data. (PostgreSQL 9.*)
Tables structure:
-- Two identically structured activity tables.
-- "serial" creates the backing sequence (<table>_id_seq) together with the table and
-- yields an int4 NOT NULL column defaulting to nextval(...), so the script no longer
-- depends on sequences that must already exist.
CREATE TABLE "first_activities" (
    "id"            serial PRIMARY KEY,
    "start_time"    timestamptz,
    "end_time"      timestamptz,
    "activity_type" int2,
    "user_id"       int4
);

CREATE TABLE "second_activities" (
    "id"            serial PRIMARY KEY,
    "start_time"    timestamptz,
    "end_time"      timestamptz,
    "activity_type" int2,
    "user_id"       int4
);
Data in First table:
-- Rows for the first table. "id" is left out so its default assigns the key: an explicit
-- NULL would violate the NOT NULL primary key. The list ends with ";" instead of a
-- dangling comma.
INSERT INTO "first_activities" ("start_time", "end_time", "activity_type", "user_id") VALUES
('2014-10-31 01:00:00', '2014-10-31 02:00:00', 3, 1),
('2014-10-31 02:00:00', '2014-10-31 03:00:00', 4, 1),
('2014-10-31 03:00:00', '2014-10-31 04:00:00', 2, 1),
('2014-10-31 04:30:00', '2014-10-31 05:00:00', 3, 1),
('2014-10-31 05:30:00', '2014-11-01 06:00:00', 4, 1),
('2014-11-01 06:30:00', '2014-11-01 07:00:00', 2, 1),
('2014-11-01 07:30:00', '2014-11-01 08:00:00', 1, 1),
('2014-11-01 08:00:00', '2014-11-01 09:00:00', 3, 1),
('2014-11-01 09:00:00', '2014-11-02 10:00:00', 4, 1),
('2014-08-27 10:00:00', '2014-08-27 11:00:00', 2, 1),
('2014-08-27 11:00:00', '2014-08-27 12:00:00', 1, 1);
Data in Second table:
-- Rows for the second table. "id" is left out so its default assigns the key: an explicit
-- NULL would violate the NOT NULL primary key.
INSERT INTO "second_activities" ("start_time", "end_time", "activity_type", "user_id") VALUES
('2014-10-31 01:00:00', '2014-10-31 02:00:00', 3, 1),
('2014-10-31 02:00:00', '2014-10-31 03:00:00', 4, 1),
-- Difference from first table
('2014-10-31 03:30:00', '2014-10-31 04:00:00', 1, 1),
('2014-10-31 04:25:00', '2014-10-31 04:35:00', 3, 1),
('2014-10-31 04:45:00', '2014-10-31 05:35:00', 3, 1),
-- End of difference from first table
('2014-08-27 10:00:00', '2014-08-27 11:00:00', 2, 1),
('2014-08-27 11:00:00', '2014-08-27 12:00:00', 1, 1);
How can I filter the result set of the following query:
-- All rows of both activity tables, earliest start first.
SELECT id, start_time, end_time, activity_type, user_id
FROM first_activities
UNION ALL
SELECT id, start_time, end_time, activity_type, user_id
FROM second_activities
ORDER BY start_time ASC;
to get final result set.
Final Result:
-- merge same data by user_id and activity_type and combine with
-- and split data with range intersection but not same user_id and acitvity_type
-- start_time end_time type user_id
'2014-10-31 01:00:00', '2014-10-31 02:00:00', '3', '1');
'2014-10-31 02:00:00', '2014-10-31 03:00:00', '4', '1');
--data dont merge. Splitting with range intersection
'2014-10-31 03:00:00', '2014-10-31 03:30:00', '2', '1'); -- from first table
'2014-10-31 03:30:00', '2014-10-31 04:00:00', '1', '1'); -- from second table
-- data merged by same user_id and activity_type
'2014-10-31 04:25:00', '2014-10-31 05:35:00', '3', '1');
'2014-10-31 05:30:00', '2014-11-01 06:00:00', '4', '1');
'2014-11-01 06:30:00', '2014-11-01 07:00:00', '2', '1');
'2014-11-01 07:30:00', '2014-11-01 08:00:00', '1', '1');
'2014-11-01 08:00:00', '2014-11-01 09:00:00', '3', '1');
'2014-11-01 09:00:00', '2014-11-02 10:00:00', '4', '1');
'2014-08-27 10:00:00', '2014-08-27 11:00:00', '2', '1');
'2014-08-27 11:00:00', '2014-08-27 12:00:00', '1', '1');
The issue can be reduced to the question of how to combine (compact) a group of adjacent (overlapping) ranges into one. I had to deal with this some time ago and found it a bit complicated in plain SQL. There is a simple solution using loop in a plpgsql code, but I found also a general solution with the use of custom aggregate.
The function compact_ranges(anyrange, anyrange) returns the sum of ranges if they are adjacent (overlapping) or the second range otherwise:
-- State-transition function: extend the running range ($1) with the next range ($2)
-- when the two overlap (&&) or are adjacent (-|-); otherwise restart from $2.
CREATE OR REPLACE FUNCTION compact_ranges(anyrange, anyrange)
RETURNS anyrange
LANGUAGE sql
AS $$
    SELECT CASE
               WHEN $1 && $2 OR $1 -|- $2 THEN $1 + $2
               ELSE $2
           END
$$;

-- Aggregate that folds compact_ranges over its input ranges.
CREATE AGGREGATE compact_ranges_agg (anyrange) (
    SFUNC = compact_ranges,
    STYPE = anyrange
);
The aggregate has a narrow scope of usage, it should be called as a progressive window function like in the example:
-- Demo: run the aggregate as a running window over ranges sorted by rng, then keep
-- the widest accumulated range for each distinct lower bound.
WITH test(rng) AS (
    VALUES
        ('[ 1, 2)'::int4range),
        ('[ 3, 7)'),    -- group 1
        ('[ 5, 10)'),   -- group 1
        ('[ 6, 8)'),    -- group 1
        ('[11, 17)'),   -- group 2
        ('[12, 16)'),   -- group 2
        ('[15, 16)'),   -- group 2
        ('[18, 19)')
),
running AS (
    SELECT
        test.*,
        compact_ranges_agg(rng) OVER (ORDER BY rng) AS new_rng
    FROM test
)
SELECT DISTINCT ON (lower(new_rng))
    new_rng
FROM running
ORDER BY lower(new_rng), new_rng DESC;
new_rng
---------
[1,2)
[3,10)
[11,17)
[18,19)
(4 rows)
In the same way you can use it for your tables:
-- Merge the ranges of both tables per (user_id, activity_type), then cut each merged
-- range where the next one (by start time) begins.
WITH merged AS (
    -- UNION also removes rows that are identical in both tables.
    SELECT tstzrange(start_time, end_time) AS rng, activity_type, user_id
    FROM first_activities
    UNION
    SELECT tstzrange(start_time, end_time) AS rng, activity_type, user_id
    FROM second_activities
),
compacted AS (
    -- Per distinct lower bound, keep the widest accumulated range (new_rng DESC sorts it first).
    SELECT DISTINCT ON (user_id, activity_type, lower(new_rng))
        lower(new_rng) AS start_time,
        upper(new_rng) AS end_time,
        activity_type,
        user_id
    FROM (
        SELECT
            user_id,
            activity_type,
            compact_ranges_agg(rng) OVER (PARTITION BY user_id, activity_type ORDER BY rng) AS new_rng
        FROM merged
    ) AS s
    ORDER BY user_id, activity_type, lower(new_rng), new_rng DESC
)
SELECT
    start_time,
    -- Clip a range that runs into the next one. The alias keeps the output column
    -- named end_time, as in the documented result, instead of the default "case".
    CASE
        WHEN end_time > lead(start_time) OVER w THEN lead(start_time) OVER w
        ELSE end_time
    END AS end_time,
    activity_type,
    user_id
FROM compacted
WINDOW w AS (ORDER BY start_time)
ORDER BY start_time;
The result:
start_time | end_time | activity_type | user_id
------------------------+------------------------+---------------+---------
2014-08-27 10:00:00+02 | 2014-08-27 11:00:00+02 | 2 | 1
2014-08-27 11:00:00+02 | 2014-08-27 12:00:00+02 | 1 | 1
2014-10-31 01:00:00+01 | 2014-10-31 02:00:00+01 | 3 | 1
2014-10-31 02:00:00+01 | 2014-10-31 03:00:00+01 | 4 | 1
2014-10-31 03:00:00+01 | 2014-10-31 03:30:00+01 | 2 | 1
2014-10-31 03:30:00+01 | 2014-10-31 04:00:00+01 | 1 | 1
2014-10-31 04:25:00+01 | 2014-10-31 05:30:00+01 | 3 | 1
2014-10-31 05:30:00+01 | 2014-11-01 06:00:00+01 | 4 | 1
2014-11-01 06:30:00+01 | 2014-11-01 07:00:00+01 | 2 | 1
2014-11-01 07:30:00+01 | 2014-11-01 08:00:00+01 | 1 | 1
2014-11-01 08:00:00+01 | 2014-11-01 09:00:00+01 | 3 | 1
2014-11-01 09:00:00+01 | 2014-11-02 10:00:00+01 | 4 | 1
(12 rows)