Optimize a query of weekday statistics between two dates - SQL

I have a table with two fields: start_date and end_date. Now I want to count the total amount of overtime work. I have created a new calendar table to maintain the working-day status of each date.
table: workdays

id           status
-----------  ------
2020-01-01   4
2020-01-02   1
2020-01-03   1
2020-01-04   2

status codes: 1 = weekday, 2 = weekend, 4 = holiday
I created a function to count the weekdays between two dates (excluding weekends and holidays).
create or replace function get_workday_count (start_date in date, end_date in date)
return number is
  day_count int;
begin
  select count(0) into day_count from WORKDAYS
   where TRUNC(ID) >= TRUNC(start_date)
     and TRUNC(ID) <= TRUNC(end_date)
     and status in (1, 3, 5);
  return day_count;
end;
When I execute the following query it takes about 5 minutes to display the results; the erp_sj table has about 200,000 rows of data.
select count(0) from ERP_SJ where GET_WORKDAY_COUNT(start_date, end_date) > 5;
The fields used in the query are indexed.
How can I optimize this? Or is there a better solution?

First of all, optimize your function:
1. Add pragma udf (for faster execution when called from SQL)
2. Add the deterministic clause (for caching)
3. Replace count(0) with count(*) (to let the CBO optimize the count)
4. Replace return number with return int
create or replace function get_workday_count (start_date in date, end_date in date)
return int deterministic is
  pragma udf;
  day_count int;
begin
  select count(*) into day_count from WORKDAYS w
   where w.ID >= TRUNC(start_date)
     and w.ID <= TRUNC(end_date)
     and status in (1, 3, 5);
  return day_count;
end;
Then you don't need to call your function at all when (end_date - start_date) is already smaller than the required number of days. Moreover, ideally you would use a scalar subquery instead of the function:
select count(*)
from ERP_SJ
where case
        when trunc(end_date) - trunc(start_date) > 5
        then GET_WORKDAY_COUNT(trunc(start_date), trunc(end_date))
        else 0
      end > 5
Or using a subquery:
select count(*)
from ERP_SJ e
where case
        when trunc(end_date) - trunc(start_date) > 5
        then (select count(*) from WORKDAYS w
               where w.ID >= TRUNC(e.start_date)
                 and w.ID <= TRUNC(e.end_date)
                 and w.status in (1, 3, 5))
        else 0
      end > 5

WORKDAY_STATUSES table (just for completeness, not used below):
create table workday_statuses
( status number(1) constraint workday_statuses_pk primary key
, status_name varchar2(10) not null constraint workday_status_name_uk unique );
insert all
into workday_statuses values (1, 'Weekday')
into workday_statuses values (2, 'Weekend')
into workday_statuses values (3, 'Unknown 1')
into workday_statuses values (4, 'Holiday')
into workday_statuses values (5, 'Unknown 2')
select * from dual;
WORKDAYS table: one row for each day in 2020:
create table workdays
( id date constraint workdays_pk primary key
, status references workday_statuses not null )
organization index;
insert into workdays (id, status)
select date '2019-12-31' + rownum
, case
when to_char(date '2019-12-31' + rownum, 'Dy', 'nls_language = English') like 'S%' then 2
when date '2019-12-31' + rownum in
( date '2020-01-01', date '2020-04-10', date '2020-04-13'
, date '2020-05-08', date '2020-05-25', date '2020-08-31'
, date '2020-12-25', date '2020-12-26', date '2020-12-28' ) then 4
else 1
end
from xmltable('1 to 366')
where date '2019-12-31' + rownum < date '2021-01-01';
ERP_SJ table containing 30K rows with random data:
create table erp_sj
( id integer generated always as identity
, start_date date not null
, end_date date not null
, filler varchar2(100) );
insert into erp_sj (start_date, end_date, filler)
select dt, dt + dbms_random.value(0,7), dbms_random.string('x',100)
from ( select date '2019-12-31' + dbms_random.value(1,366) as dt
from xmltable('1 to 30000') );
commit;
get_workday_count() function:
create or replace function get_workday_count
( start_date in date, end_date in date )
return integer
deterministic -- Cache some results
parallel_enable -- In case you want to use it in parallel queries
as
pragma udf; -- Tell compiler to optimise for SQL
day_count integer;
begin
select count(*) into day_count
from workdays w
where w.id between trunc(start_date) and end_date
and w.status in (1, 3, 5);
return day_count;
end;
Notice that you should not truncate w.id, because all values have the time as 00:00:00 already. (I'm assuming that if end_date falls somewhere in the middle of a day, you want to count that day, so I have not truncated the end_date parameter.)
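If you want to verify that assumption, a quick illustrative check (not part of the original answer):
select count(*) from workdays where id <> trunc(id);
-- expected: 0, i.e. every ID is already at midnight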
Test:
select count(*) from erp_sj
where get_workday_count(start_date, end_date) > 5;
COUNT(*)
--------
1302
Results returned in around 1.4 seconds.
Execution plan for the query within the function:
select count(*)
from workdays w
where w.id between trunc(sysdate) and sysdate +10
and w.status in (1, 3, 5);
--------------------------------------------------------------------------------------------
| Id | Operation | Name | Starts | E-Rows | A-Rows | A-Time | Buffers |
--------------------------------------------------------------------------------------------
| 0 | SELECT STATEMENT | | 1 | | 1 |00:00:00.01 | 1 |
| 1 | SORT AGGREGATE | | 1 | 1 | 1 |00:00:00.01 | 1 |
|* 2 | FILTER | | 1 | | 7 |00:00:00.01 | 1 |
|* 3 | INDEX RANGE SCAN| WORKDAYS_PK | 1 | 7 | 7 |00:00:00.01 | 1 |
--------------------------------------------------------------------------------------------
Now try adding the function as a virtual column and indexing it:
alter table erp_sj add workday_count as (get_workday_count(start_date, end_date));
create index erp_sj_workday_count_ix on erp_sj(workday_count);
select count(*) from erp_sj
where workday_count > 5;
Same result in 0.035 seconds. Plan:
-------------------------------------------------------------------------------------------------------
| Id | Operation | Name | Starts | E-Rows | A-Rows | A-Time | Buffers |
-------------------------------------------------------------------------------------------------------
| 0 | SELECT STATEMENT | | 1 | | 1 |00:00:00.01 | 5 |
| 1 | SORT AGGREGATE | | 1 | 1 | 1 |00:00:00.01 | 5 |
|* 2 | INDEX RANGE SCAN| ERP_SJ_WORKDAY_COUNT_IX | 1 | 1302 | 1302 |00:00:00.01 | 5 |
-------------------------------------------------------------------------------------------------------
Tested in 19.0.0.
Edit: As Sayan pointed out, the index on the virtual column won't be automatically updated if there are any changes in WORKDAYS, so there is a risk of wrong results with this approach. However, if performance is critical you could work around it by rebuilding the index on ERP_SJ every time you updated WORKDAYS. Maybe you could do this in a statement-level trigger on WORKDAYS, or just through scheduled IT maintenance processes if updates are very infrequent and ERP_SJ isn't so big that an index rebuild is impractical. If the index is partitioned, rebuilding affected partitions could be an option.
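For illustration, a minimal sketch of the statement-level trigger idea, assuming you have scheduler privileges; the trigger and job names are made up, and the rebuild is deferred to a one-off scheduler job because DDL cannot run directly inside the triggering transaction:
create or replace trigger workdays_rebuild_count_ix  -- illustrative name
after insert or update or delete on workdays
begin
  -- queue a background job that rebuilds the dependent index;
  -- the job name must be unique, so a second change arriving before
  -- the job runs would need extra handling in a real implementation
  dbms_scheduler.create_job(
    job_name   => 'REBUILD_ERP_SJ_WORKDAY_IX',
    job_type   => 'PLSQL_BLOCK',
    job_action => 'begin execute immediate ''alter index erp_sj_workday_count_ix rebuild''; end;',
    start_date => systimestamp,
    enabled    => true);
end;
/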
Or, don't have an index and live with the 1.4 seconds query execution time.

I understand that the columns ID and status have plain indexes on them (not a function-based index on TRUNC(ID)). So use this query
SELECT count(0)
INTO day_count
FROM WORKDAYS
WHERE ID BETWEEN TRUNC(start_date) AND TRUNC(end_date)
AND status in (1, 3, 5);
in order to exploit the index on the date column ID as well.
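For reference, a single composite index covering the whole predicate would serve this query even better (the index name is illustrative, assuming it does not clash with an existing primary key index):
create index workdays_id_status_ix on workdays (id, status);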

Maybe try scalar subquery caching
(in case there are many erp_sj records with the same start_date and end_date):
select count(0) from ERP_SJ where
(select GET_WORKDAY_COUNT(start_date, end_date) from dual) > 5
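Speculatively, this combines with the CASE short-circuit from the first answer, so the cached subquery is only evaluated for rows that survive the cheap date arithmetic:
select count(0) from ERP_SJ
where case
        when trunc(end_date) - trunc(start_date) > 5
        then (select GET_WORKDAY_COUNT(start_date, end_date) from dual)
        else 0
      end > 5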

You are dealing with a data warehouse query (not an OLTP query).
Best practice says you should:
get rid of functions - avoid the context switch (this can be somewhat mitigated with the UDF pragma, but why use a function if you don't need it?)
get rid of indexes - quick for a few rows, slow for a large number of records
get rid of caching - caching is basically a workaround for repeating the same thing
So the data warehouse approach to the problem consists of two steps.
Extend the Workday Table
With a small query, the workday table can be extended with a new column MIN_END_DAY that defines, for each (start) day, the minimum end day required to reach the limit of 5 working days.
The query uses the LEAD analytic function to get the 4th working day ahead (check the PARTITION BY clause, which distinguishes between working days and other days).
For the non-working days you simply take the LAST_VALUE of the next working day.
Example
with wd as (
select ID, STATUS,
case when status in (1, 3, 5) then
lead(id,4) over (partition by case when status in (1, 3, 5) then 'workday' end order by id) /* 4 working days ahead */
end as min_stop_day
from workdays),
wd2 as (
select ID, STATUS,
last_value(MIN_STOP_DAY) ignore nulls over (order by id desc) MIN_END_DAY
from wd)
select ID, STATUS, MIN_END_DAY
from wd2
order by 1;
ID                   STATUS  MIN_END_DAY
-------------------  ------  -------------------
01.01.2020 00:00:00  4       08.01.2020 00:00:00
02.01.2020 00:00:00  1       08.01.2020 00:00:00
03.01.2020 00:00:00  1       09.01.2020 00:00:00
04.01.2020 00:00:00  2       10.01.2020 00:00:00
05.01.2020 00:00:00  2       10.01.2020 00:00:00
06.01.2020 00:00:00  1       10.01.2020 00:00:00
Join to the Base Table
Now you can simply join your base table to the extended workday table on the start day, and filter rows by comparing end_date with MIN_END_DAY.
Query
with wd as (
select ID, STATUS,
case when status in (1, 3, 5) then
lead(id,4) over (partition by case when status in (1, 3, 5) then 'workday' end order by id)
end as min_stop_day
from workdays),
wd2 as (
select ID, STATUS,
last_value(MIN_STOP_DAY) ignore nulls over (order by id desc) MIN_END_DAY
from wd)
select count(*) from erp_sj
join wd2
on trunc(erp_sj.start_date) = wd2.ID
where trunc(erp_sj.end_date) >= wd2.MIN_END_DAY
For large tables this will lead to the expected HASH JOIN execution plan.
Note that I assume 1) the workday table is complete (otherwise you couldn't use an inner join) and 2) it contains enough future data (the last 5 rows are obviously not usable).
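Both assumptions are easy to verify up front; an illustrative check (not part of the original answer):
select count(*) as uncovered_rows
from erp_sj
where trunc(start_date) not in (select id from workdays)
   or trunc(end_date) > (select max(id) from workdays);
-- expected: 0 before trusting the inner join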

Related

SQL: Calculate number of days since last success

The following table represents the results of a given test.
Every result for the same test is either a pass (error_id = 0) or a fail (error_id <> 0).
I need help writing a query that returns the number of runs since the last good run (error_id = 0) and the date of that run.
| Date | test_id | error_id |
-----------------------------------
| 2019-12-20 | 123 | 23
| 2019-12-19 | 123 | 23
| 2019-12-17 | 123 | 22
| 2019-12-18 | 123 | 0
| 2019-12-16 | 123 | 11
| 2019-12-15 | 123 | 11
| 2019-12-13 | 123 | 11
| 2019-12-12 | 123 | 0
So the result for this example should be:
| 2019-12-18 | 123 | 4
as the test 123 was PASS on 2019-12-18 and this happened 4 runs ago.
I have a query to determine whether a given run is an error or not, but I am having trouble applying the appropriate window function to it to get the wanted result:
select test_id, Date, error_id, (CASE WHEN error_id <> 0 THEN 1 ELSE 0 END) as is_error
from testresults
You can generate a row number, in reverse order from the sorting of the query itself:
SELECT test_date, test_id, error_code,
(row_number() OVER (ORDER BY test_date asc) - 1) as runs_since_last_pass
FROM tests
WHERE test_date >= (SELECT MAX(test_date) FROM tests WHERE error_code=0)
ORDER BY test_date DESC
LIMIT 1;
Note that this will run into issues if test_date is not unique. Better use a timestamp (precise to the millisecond) instead of a date.
Here's a DBFiddle: https://www.db-fiddle.com/f/8gSHVcXMztuRiFcL8zLeEx/0
If there's more than one test_id, you'll want to add a PARTITION BY clause to the row number function, and the subquery would become a bit more complex. It may be more efficient to come up with a way to do this by a JOIN instead of a subquery, but it would be more cognitively complex.
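A sketch of that partitioned variant, using PostgreSQL's DISTINCT ON to keep the latest row per test (my extrapolation of the answer above, not code from it):
SELECT DISTINCT ON (test_id)
       test_date, test_id,
       (row_number() OVER (PARTITION BY test_id ORDER BY test_date ASC) - 1) AS runs_since_last_pass
FROM tests t
WHERE test_date >= (SELECT MAX(t2.test_date) FROM tests t2
                    WHERE t2.test_id = t.test_id AND t2.error_code = 0)
ORDER BY test_id, test_date DESC;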
I think you just want aggregation and some filtering:
select test_id, count(*),
       max(date) filter (where error_id = 0) as last_success_date
from testresults
where date >= (select max(t2.date) from testresults t2 where t2.error_id = 0)
group by test_id;
You have to use the maximum date of the good runs for every test_id in your query. You can try this query:
select tr2.date_error, tr.test_id, count(tr.error_id)
from testresults tr
inner join (select max(date_error) as date_error, test_id
            from testresults
            where error_id = 0
            group by test_id) tr2
  on tr.test_id = tr2.test_id and tr.date_error >= tr2.date_error
group by tr.test_id, tr2.date_error
This should do the trick:
select count(*) from tests t,
(select max(test_date) as test_date from tests where error_code = 0) good
where t.test_date >= good.test_date
Basically you are counting the rows that have a date >= the date of the last success.
Please note: if you need the number of days instead, it is a completely different query:
select now()::date - max(test_date) last_valid from tests
where error_code = 0;

Unable to calculate difference between CTE subquery outputs for use in larger PostgreSQL query output column

Using PostgreSQL v9.4.5 from Shell I created a database called moments in psql by running create database moments. I then created a moments table:
CREATE TABLE moments
(
id SERIAL4 PRIMARY KEY,
moment_type BIGINT NOT NULL,
flag BIGINT NOT NULL,
time TIMESTAMP NOT NULL,
UNIQUE(moment_type, time)
);
INSERT INTO moments (moment_type, flag, time) VALUES (1, 7, '2016-10-29 12:00:00');
INSERT INTO moments (moment_type, flag, time) VALUES (1, -30, '2016-10-29 13:00:00');
INSERT INTO moments (moment_type, flag, time) VALUES (3, 5, '2016-10-29 14:00:00');
INSERT INTO moments (moment_type, flag, time) VALUES (2, 9, '2016-10-29 18:00:00');
INSERT INTO moments (moment_type, flag, time) VALUES (2, -20, '2016-10-29 17:00:00');
INSERT INTO moments (moment_type, flag, time) VALUES (3, 10, '2016-10-29 16:00:00');
I run select * from moments to view the table:
Moments Table
id | moment_type | flag | time
----+-------------+------+---------------------
1 | 1 | 7 | 2016-10-29 12:00:00
2 | 1 | -30 | 2016-10-29 13:00:00
3 | 3 | 5 | 2016-10-29 14:00:00
4 | 2 | 9 | 2016-10-29 18:00:00
5 | 2 | -20 | 2016-10-29 17:00:00
6 | 3 | 10 | 2016-10-29 16:00:00
I then try to write an SQL query that produces the following output: for each pair of duplicate moment_type values, it returns the difference between the flag value of the row with the most recent time and the flag value of the row with the second most recent time, and lists the results in ascending order by moment_type.
Expected SQL Query Output
moment_type | flag |
------------+------+
1 | -37 | (i.e. -30 - 7)
2 | 29 | (i.e. 9 - -20)
3 | 5 | (i.e. 10 - 5)
The SQL query that I came up with is as follows. It uses a WITH query to define multiple Common Table Expression (CTE) subqueries for use as temporary tables in the larger SELECT query at the end. I also use an SQL function to calculate the difference between two of the subquery outputs (alternatively I think I could have just used difference(most_recent_flag, second_most_recent_flag) AS flag inline instead of defining the function):
CREATE FUNCTION difference(most_recent_flag, second_most_recent_flag) RETURNS numeric AS $$
SELECT $1 - $2;
$$ LANGUAGE SQL;
-- get two flags that have the most recent timestamps
WITH two_most_recent_flags AS (
SELECT moments.flag
FROM moments
ORDER BY moments.time DESC
LIMIT 2
),
-- get one flag that has the most recent timestamp
most_recent_flag AS (
SELECT *
FROM two_most_recent_flags
ORDER BY flag DESC
LIMIT 1
),
-- get one flag that has the second most recent timestamp
second_most_recent_flag AS (
SELECT *
FROM two_most_recent_flags
ORDER BY flag ASC
LIMIT 1
)
SELECT DISTINCT ON (moments.moment_type)
moments.moment_type,
difference(most_recent_flag, second_most_recent_flag) AS flag
FROM moments
ORDER BY moment_type ASC
LIMIT 2;
But when I run the above SQL query in PostgreSQL, it returns the following error:
ERROR: column "most_recent_flag" does not exist
LINE 21: difference(most_recent_flag, second_most_recent_flag) AS fla...
Question
What techniques can I use and how may I apply them to overcome this error, and calculate and display the differences in the flag column to achieve the Expected SQL Query Output?
Note: Perhaps a window function could be used somehow, since they perform calculations across table rows.
Use the lag() window function:
select moment_type, difference
from (
select *, flag - lag(flag) over w as difference
from moments
window w as (partition by moment_type order by time)
) s
where difference is not null
order by moment_type
moment_type | difference
-------------+------------
1 | -37
2 | 29
3 | 5
(3 rows)
One method is to use conditional aggregation. The window function row_number() can be used to identify the first and last time values:
select m.moment_type,
(max(case when seqnum_desc = 1 then flag end) -
min(case when seqnum_asc = 1 then flag end)
)
from (select m.*,
row_number() over (partition by m.moment_type order by m.time) as seqnum_asc,
row_number() over (partition by m.moment_type order by m.time desc) as seqnum_desc
from moments m
) m
group by m.moment_type;
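Since the question targets PostgreSQL 9.4, which introduced aggregate FILTER clauses, the same conditional aggregation can also be written this way (my variant of the query above, not a separate answer):
select m.moment_type,
       (max(flag) filter (where seqnum_desc = 1) -
        max(flag) filter (where seqnum_asc = 1)) as flag
from (select m.*,
             row_number() over (partition by m.moment_type order by m.time) as seqnum_asc,
             row_number() over (partition by m.moment_type order by m.time desc) as seqnum_desc
      from moments m) m
group by m.moment_type
order by m.moment_type;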

How to create an end date that is one day less than the next start date produced by another query with SQL?

I query a table that pulls in anyone who has a working time percentage of less than 100, along with all of their working time records if they meet the less-than-100 criterion.
This table contains the columns: id, eff_date (of the working time percentage), and percentage. It does not contain end_date.
Problem: how do I build on top of the query below and add a new column called end_date that is one day less than the next eff_date?
Current query
select
j1.id, j1.eff_date, j1.percentage
from
working_time_table j1
where
exists (select 1
from working_time_table j2
where j2.id = j1.id and j2.percentage < 100)
Data returned from the query above:
ID | EFF_DATE| PERCENTAGE
------------------------
12 | 01-JUN-2012 | 70
12 | 03-MAR-2013 | 100
12 | 13-DEC-2014 | 85
The desired result set is:
ID | EFF_DATE | PERCENTAGE | END_DATE
-------------------------------------------
12 | 01-JUN-2012 | 70 | 02-MAR-2013
12 | 03-MAR-2013 | 100 | 12-DEC-2014
12 | 13-DEC-2014 | 85 | null
You didn't state your DBMS so this is ANSI SQL using window functions:
select j1.id,
j1.eff_date,
j1.percentage,
lead(j1.eff_date) over (partition by j1.id order by j1.eff_date) - interval '1' day as end_date
from working_time_table j1
where exists (select 1
from working_time_table j2
where j2.id = j1.id and j2.percentage < 100);
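If the DBMS happens to be Oracle (the DD-MON-YYYY dates in the sample output hint at it, though that is only a guess), plain date arithmetic replaces the interval literal:
select j1.id,
       j1.eff_date,
       j1.percentage,
       lead(j1.eff_date) over (partition by j1.id order by j1.eff_date) - 1 as end_date
from working_time_table j1
where exists (select 1
              from working_time_table j2
              where j2.id = j1.id and j2.percentage < 100);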
First off, I am curious whether the id column is unique or holds duplicate values like the 12's in your sample - is it a unique column or possibly a primary key? It would be WAY easier to do this if there were a unique id column that held the order. If you don't have one, are you able to add one to the table? Again, that would simplify this tremendously.
This took forever to get right; I hope this helps, as I burned many hours on it.
Props to Akhil for helping me finally get the query right. He is a true SQL genius.
Here is the SQLFIDDLE:
SELECT id,
       firstTbl.eff_date,
       UPPER(DATE_FORMAT(DATE_SUB(
           STR_TO_DATE(secondTbl.eff_date, '%d-%M-%Y'),
           INTERVAL 1 DAY), '%d-%b-%Y')) todate,
       percentage
FROM (SELECT (@cnt := @cnt + 1) rownum,
             id, eff_date, percentage
      FROM working_time_table,
           (SELECT @cnt := 0) s) firstTbl
LEFT JOIN (SELECT (@cnt1 := @cnt1 + 1) rownum,
                  eff_date
           FROM working_time_table,
                (SELECT @cnt1 := 0) s) secondTbl
  ON (firstTbl.rownum + 1) = secondTbl.rownum
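For what it's worth, on MySQL 8.0+ (window functions were not available when this was written) the whole variable dance collapses into a LEAD(), much like the ANSI answer above; this sketch assumes eff_date is a real DATE column rather than the '%d-%M-%Y' strings the fiddle works with:
SELECT id, eff_date, percentage,
       DATE_SUB(LEAD(eff_date) OVER (PARTITION BY id ORDER BY eff_date),
                INTERVAL 1 DAY) AS end_date
FROM working_time_table;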

Finding gaps in huge event streams?

I have about 1 million events in a PostgreSQL database that are of this format:
id | stream_id | timestamp
----------+-----------------+-----------------
1 | 7 | ....
2 | 8 | ....
There are about 50,000 unique streams.
I need to find all of the events where the time between any two of the events is over a certain time period. In other words, I need to find event pairs where there was no event in a certain period of time.
For example:
a b c d e f g h i j k
| | | | | | | | | | |
\____2 mins____/
In this scenario, I would want to find the pair (f, g) since those are the events immediately surrounding a gap.
I don't care if the query is (that) slow, i.e. on 1 million records it's fine if it takes an hour or so. However, the data set will keep growing, so hopefully if it's slow it scales sanely.
I also have the data in MongoDB.
What's the best way to perform this query?
You can do this with the lag() window function over a partition by stream_id, ordered by the timestamp. The lag() function gives you access to previous rows in the partition; without an explicit offset it returns the immediately previous row. So if the partition on stream_id is ordered by time, then the previous row is the previous event for that stream_id.
SELECT stream_id, start_id, end_id, diff
FROM (
  SELECT stream_id, lag(id) OVER pair AS start_id, id AS end_id,
         ("timestamp" - lag("timestamp") OVER pair) AS diff
  FROM my_table
  WINDOW pair AS (PARTITION BY stream_id ORDER BY "timestamp")
) gaps
WHERE diff > interval '2 minutes';
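Since the data set keeps growing, it may also be worth an index matching the window's partitioning and ordering, so the sort can feed off an index scan (an illustrative addition, with a made-up name):
CREATE INDEX my_table_stream_ts_idx ON my_table (stream_id, "timestamp");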
In Postgres it can be done very easily with the help of the lag() window function. Check the fiddle below as an example:
SQL Fiddle
PostgreSQL 9.3 Schema Setup:
CREATE TABLE Table1
("id" int, "stream_id" int, "timestamp" timestamp)
;
INSERT INTO Table1
("id", "stream_id", "timestamp")
VALUES
(1, 7, '2015-06-01 15:20:30'),
(2, 7, '2015-06-01 15:20:31'),
(3, 7, '2015-06-01 15:20:32'),
(4, 7, '2015-06-01 15:25:30'),
(5, 7, '2015-06-01 15:25:31')
;
Query 1:
with c as (select *,
lag("timestamp") over(partition by stream_id order by id) as pre_time,
lag(id) over(partition by stream_id order by id) as pre_id
from Table1
)
select * from c where "timestamp" - pre_time > interval '2 sec'
Results:
| id | stream_id | timestamp | pre_time | pre_id |
|----|-----------|------------------------|------------------------|--------|
| 4 | 7 | June, 01 2015 15:25:30 | June, 01 2015 15:20:32 | 3 |

Can I use a SQL Server CTE to merge intersecting dates?

I'm writing an app that handles scheduling time off for some of our employees. As part of this, I need to calculate how many minutes throughout the day that they have requested off.
In the first version of this tool, we disallowed overlapping time off requests, because we wanted to be able to just add up the total of StartTime minus EndTime for all requests. Preventing overlaps makes this calculation very fast.
This has become problematic, because Managers now want to schedule team meetings but are unable to do so when someone has already asked for the day off.
So, in the new version of the tool, we have a requirement to allow overlapping requests.
Here is an example set of data like what we have:
UserId | StartDate | EndDate
----------------------------
1 | 2:00 | 4:00
1 | 3:00 | 5:00
1 | 3:45 | 9:00
2 | 6:00 | 9:00
2 | 7:00 | 8:00
3 | 2:00 | 3:00
3 | 4:00 | 5:00
4 | 1:00 | 7:00
The result that I need to get, as efficiently as possible, is this:
UserId | StartDate | EndDate
----------------------------
1 | 2:00 | 9:00
2 | 6:00 | 9:00
3 | 2:00 | 3:00
3 | 4:00 | 5:00
4 | 1:00 | 7:00
We can easily detect overlaps with this query:
select *
from requests r1
cross join requests r2
where r1.RequestId < r2.RequestId
  and r1.StartTime < r2.EndTime
  and r2.StartTime < r1.EndTime
This is, in fact, how we were detecting and preventing the problems originally.
Now, we are trying to merge the overlapping items, but I'm reaching the limits of my SQL ninja skills.
It wouldn't be too hard to come up with a method using temp tables, but we want to avoid this if at all possible.
Is there a set-based way to merge overlapping rows?
Edit:
It would also be acceptable for all of the rows to show up, as long as they were collapsed into just their time. For example, if someone wants off from three to five and from four to six, it would be acceptable for them to have two rows: one from three to five and the next from five to six, OR one from three to four and the next from four to six.
Also, here is a little test bench:
DECLARE @requests TABLE
(
UserId int,
StartDate time,
EndDate time
)
INSERT INTO @requests (UserId, StartDate, EndDate) VALUES
(1, '2:00', '4:00'),
(1, '3:00', '5:00'),
(1, '3:45', '9:00'),
(2, '6:00', '9:00'),
(2, '7:00', '8:00'),
(3, '2:00', '3:00'),
(3, '4:00', '5:00'),
(4, '1:00', '7:00');
Complete Rewrite:
;WITH new_grp AS (
SELECT r1.UserId, r1.StartTime
FROM @requests r1
WHERE NOT EXISTS (
SELECT *
FROM @requests r2
WHERE r1.UserId = r2.UserId
AND r2.StartTime < r1.StartTime
AND r2.EndTime >= r1.StartTime)
GROUP BY r1.UserId, r1.StartTime -- there can be > 1
),r AS (
SELECT r.RequestId, r.UserId, r.StartTime, r.EndTime
,count(*) AS grp -- guaranteed to be 1+
FROM @requests r
JOIN new_grp n ON n.UserId = r.UserId AND n.StartTime <= r.StartTime
GROUP BY r.RequestId, r.UserId, r.StartTime, r.EndTime
)
SELECT min(RequestId) AS RequestId
,UserId
,min(StartTime) AS StartTime
,max(EndTime) AS EndTime
FROM r
GROUP BY UserId, grp
ORDER BY UserId, grp
Now produces the requested result and really covers all possible cases, including disjunct sub-groups and duplicates.
Have a look at the comments to the test data in the working demo at data.SE.
CTE 1
Find the (unique!) points in time where a new group of overlapping intervals starts.
CTE 2
Count the starts of new groups up to (and including) each individual interval, thereby forming a unique group number per user.
Final SELECT
Merge the groups, taking the earliest start and latest end per group.
I faced some difficulty because the T-SQL window aggregates max() and sum() do not accept an ORDER BY clause in a window. They can only compute one value per partition, which makes it impossible to compute a running sum / count per partition. This would work in PostgreSQL or Oracle (but not in MySQL, of course - it has neither window functions nor CTEs).
The final solution uses one extra CTE and should be just as fast.
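As an aside: on SQL Server 2012 and later (which postdates this answer), window aggregates do accept ORDER BY with a frame, so the running count works directly; a sketch of that gaps-and-islands form against the test bench above:
WITH flagged AS (
    -- a row starts a new group unless it overlaps the furthest end seen so far
    SELECT UserId, StartDate, EndDate,
           CASE WHEN StartDate <= MAX(EndDate) OVER (PARTITION BY UserId
                                                     ORDER BY StartDate, EndDate
                                                     ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING)
                THEN 0 ELSE 1 END AS is_new_group
    FROM @requests
), grouped AS (
    -- running sum of the start flags yields a group number per user
    SELECT UserId, StartDate, EndDate,
           SUM(is_new_group) OVER (PARTITION BY UserId ORDER BY StartDate, EndDate
                                   ROWS UNBOUNDED PRECEDING) AS grp
    FROM flagged
)
SELECT UserId, MIN(StartDate) AS StartDate, MAX(EndDate) AS EndDate
FROM grouped
GROUP BY UserId, grp;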
OK, it is possible to do with CTEs. I did not know how to use them at the beginning of the night, but here are the results of my research:
A recursive CTE has 2 parts, the "anchor" statement and the "recursive" statements.
The crucial part about the recursive statement is that when it is evaluated, only the rows that have not already been evaluated will show up in the recursion.
So, for example, if we wanted to use CTEs to get an all-inclusive list of times for these users, we could use something like this:
WITH
sorted_requests as (
SELECT
UserId, StartDate, EndDate,
ROW_NUMBER() OVER (PARTITION BY UserId ORDER BY StartDate, EndDate DESC) Instance
FROM @requests
),
no_overlap(UserId, StartDate, EndDate, Instance) as (
SELECT *
FROM sorted_requests
WHERE Instance = 1
UNION ALL
SELECT s.*
FROM sorted_requests s
INNER JOIN no_overlap n
ON s.UserId = n.UserId
AND s.Instance = n.Instance + 1
)
SELECT *
FROM no_overlap
Here, the "anchor" statement is just the first instance for every user, WHERE Instance = 1.
The "recursive" statement joins each row to the next row in the set, using the s.UserId = n.UserId AND s.Instance = n.Instance + 1
Now, we can use the property of the data, when sorted by start date, that any overlapping row will have a start date that is less than the previous row's end date. If we continually propagate the row number of the first intersecting row, every subsequent overlapping row will share that row number.
Using this query:
WITH
sorted_requests as (
SELECT
UserId, StartDate, EndDate,
ROW_NUMBER() OVER (PARTITION BY UserId ORDER BY StartDate, EndDate DESC) Instance
FROM
@requests
),
no_overlap(UserId, StartDate, EndDate, Instance, ConnectedGroup) as (
SELECT
UserId,
StartDate,
EndDate,
Instance,
Instance as ConnectedGroup
FROM sorted_requests
WHERE Instance = 1
UNION ALL
SELECT
s.UserId,
s.StartDate,
CASE WHEN n.EndDate >= s.EndDate
THEN n.EndDate
ELSE s.EndDate
END EndDate,
s.Instance,
CASE WHEN n.EndDate >= s.StartDate
THEN n.ConnectedGroup
ELSE s.Instance
END ConnectedGroup
FROM sorted_requests s
INNER JOIN no_overlap n
ON s.UserId = n.UserId AND s.Instance = n.Instance + 1
)
SELECT
UserId,
MIN(StartDate) StartDate,
MAX(EndDate) EndDate
FROM no_overlap
GROUP BY UserId, ConnectedGroup
ORDER BY UserId
We group by the aforementioned "first intersecting row" (called ConnectedGroup in this query) and find the minimum start time and maximum end time in that group.
The first intersecting row is propagated using this statement:
CASE WHEN n.EndDate >= s.StartDate
THEN n.ConnectedGroup
ELSE s.Instance
END ConnectedGroup
Which basically says, "if this row intersects with the previous row (based on us being sorted by start date), then consider this row to have the same 'row grouping' as the previous row. Otherwise, use this row's own row number as the 'row grouping' for itself."
This gives us exactly what we were looking for.
EDIT
When I had originally thought this up on my whiteboard, I knew that I would have to advance the EndDate of each row, to ensure that it would intersect with the next row, if any of the previous rows in the connected group would have intersected. I accidentally left that out. This has been corrected.
This works for Postgres. SQL Server might need some modifications.
SET search_path='tmp';
DROP TABLE tmp.schedule CASCADE;
CREATE TABLE tmp.schedule
( person_id INTEGER NOT NULL
, dt_from timestamp with time zone
, dt_to timestamp with time zone
);
INSERT INTO schedule( person_id, dt_from, dt_to) VALUES
( 1, '2011-12-03 02:00:00' , '2011-12-03 04:00:00' )
, ( 1, '2011-12-03 03:00:00' , '2011-12-03 05:00:00' )
, ( 1, '2011-12-03 03:45:00' , '2011-12-03 09:00:00' )
, ( 2, '2011-12-03 06:00:00' , '2011-12-03 09:00:00' )
, ( 2, '2011-12-03 07:00:00' , '2011-12-03 08:00:00' )
, ( 3, '2011-12-03 02:00:00' , '2011-12-03 03:00:00' )
, ( 3, '2011-12-03 04:00:00' , '2011-12-03 05:00:00' )
, ( 4, '2011-12-03 01:00:00' , '2011-12-03 07:00:00' );
ALTER TABLE schedule ADD PRIMARY KEY (person_id,dt_from)
;
CREATE UNIQUE INDEX ON schedule (person_id,dt_to);
SELECT * FROM schedule ORDER BY person_id, dt_from;
WITH RECURSIVE ztree AS (
-- Terminal part
SELECT p1.person_id AS person_id
, p1.dt_from AS dt_from
, p1.dt_to AS dt_to
FROM schedule p1
UNION
-- Recursive part
SELECT p2.person_id AS person_id
, LEAST(p2.dt_from, zzt.dt_from) AS dt_from
, GREATEST(p2.dt_to, zzt.dt_to) AS dt_to
FROM ztree AS zzt
, schedule AS p2
WHERE 1=1
AND p2.person_id = zzt.person_id
AND (p2.dt_from < zzt.dt_from AND p2.dt_to >= zzt.dt_from)
)
SELECT *
FROM ztree zt
WHERE NOT EXISTS (
SELECT * FROM ztree nx
WHERE nx.person_id = zt.person_id
-- the recursive query returns *all possible combinations of
-- touching or overlapping intervals
-- we'll have to filter, keeping only the biggest ones
-- (the ones for which there is no bigger overlapping interval)
AND ( (nx.dt_from <= zt.dt_from AND nx.dt_to > zt.dt_to)
OR (nx.dt_from < zt.dt_from AND nx.dt_to >= zt.dt_to)
)
)
ORDER BY zt.person_id,zt.dt_from
;
Result:
DROP TABLE
CREATE TABLE
INSERT 0 8
NOTICE: ALTER TABLE / ADD PRIMARY KEY will create implicit index "schedule_pkey" for table "schedule"
ALTER TABLE
CREATE INDEX
person_id | dt_from | dt_to
-----------+------------------------+------------------------
1 | 2011-12-03 02:00:00+01 | 2011-12-03 04:00:00+01
1 | 2011-12-03 03:00:00+01 | 2011-12-03 05:00:00+01
1 | 2011-12-03 03:45:00+01 | 2011-12-03 09:00:00+01
2 | 2011-12-03 06:00:00+01 | 2011-12-03 09:00:00+01
2 | 2011-12-03 07:00:00+01 | 2011-12-03 08:00:00+01
3 | 2011-12-03 02:00:00+01 | 2011-12-03 03:00:00+01
3 | 2011-12-03 04:00:00+01 | 2011-12-03 05:00:00+01
4 | 2011-12-03 01:00:00+01 | 2011-12-03 07:00:00+01
(8 rows)
person_id | dt_from | dt_to
-----------+------------------------+------------------------
1 | 2011-12-03 02:00:00+01 | 2011-12-03 09:00:00+01
2 | 2011-12-03 06:00:00+01 | 2011-12-03 09:00:00+01
3 | 2011-12-03 02:00:00+01 | 2011-12-03 03:00:00+01
3 | 2011-12-03 04:00:00+01 | 2011-12-03 05:00:00+01
4 | 2011-12-03 01:00:00+01 | 2011-12-03 07:00:00+01
(5 rows)