Only select records that do not start within the time frame of another record - sql

I'm trying to achieve the following goal using MS SQL Server 2005 but do not know how to do it.
The goal is to select only records that do not start within the same time period as an anchor record.
Rows that have same ID are a group and evaluated as part of that group.
Start with the earliest date (A) based on StartDate, compare to the next row (B) that has the same ID.
If B starts within A, mark B as invalid. Continue to compare A against all remaining records that have the same ID. Mark any starting within A as invalid.
Flag the next record that does not overlap with A as Valid. Now repeat the same process as above (i.e. check to see if any subsequent records start within the time frame of the new valid record).
Repeat this process until all records have been analyzed.
Example: Create the following table.
-- Recreate the sample table from scratch so the script can be re-run in the same session.
if object_id ('tempdb..#Dates') is not null drop table #Dates
create table #Dates (ID int, StartDate datetime, EndDate datetime)
-- Twelve rows, all for ID = 1, listed in ascending StartDate order.
-- NOTE(review): the 'm/d/yyyy' literals presumably rely on an mdy DATEFORMAT session setting - confirm.
Insert into #Dates
Select 1, '7/23/2003' , '8/22/2003' union all
select 1, '8/21/2003' , '11/19/2003' union all
select 1, '11/18/2003' , '12/18/2003' union all
select 1, '12/17/2003' , '1/16/2004' union all
select 1, '1/15/2004' , '2/14/2004' union all
select 1, '2/11/2004' , '2/26/2004' union all
select 1, '9/14/2004' , '10/14/2004' union all
select 1, '10/5/2004' , '10/20/2004' union all
select 1, '11/20/2004' , '12/20/2004' union all
select 1, '12/19/2004' , '1/18/2005' union all
select 1, '1/12/2005' , '1/27/2005' union all
select 1, '2/27/2005' , '3/11/2005'
Expected output after applying the overlap logic rules:
ID StartDate EndDate Valid
-- --------- --------- -----
1 7/23/2003 8/22/2003 1
1 8/21/2003 11/19/2003 0
1 11/18/2003 12/18/2003 1
1 12/17/2003 1/16/2004 0
1 1/15/2004 2/14/2004 1
1 2/11/2004 2/26/2004 0
1 9/14/2004 10/14/2004 1
1 10/5/2004 10/20/2004 0
1 11/20/2004 12/20/2004 1
1 12/19/2004 1/18/2005 0
1 1/12/2005 1/27/2005 1
1 2/27/2005 3/11/2005 1

I figured out how to answer my own question. Used recursive SQL after ordering the records using row_number.
-- Rebuild the sample table. The rows are deliberately inserted out of StartDate
-- order so that the ordering step that follows is actually exercised.
IF OBJECT_ID('tempdb..#Dates') IS NOT NULL
    DROP TABLE #Dates

CREATE TABLE #Dates
(
    ID        int,
    StartDate datetime,
    EndDate   datetime
)

INSERT INTO #Dates (ID, StartDate, EndDate)
SELECT 1, '7/23/2003',  '8/22/2003'  UNION ALL
SELECT 1, '8/21/2003',  '11/19/2003' UNION ALL
SELECT 1, '11/18/2003', '12/18/2003' UNION ALL
SELECT 1, '12/19/2004', '1/18/2005'  UNION ALL
SELECT 1, '1/12/2005',  '1/27/2005'  UNION ALL
SELECT 1, '2/27/2005',  '3/11/2005'  UNION ALL
SELECT 1, '12/17/2003', '1/16/2004'  UNION ALL
SELECT 1, '1/15/2004',  '2/14/2004'  UNION ALL
SELECT 1, '2/11/2004',  '2/26/2004'  UNION ALL
SELECT 1, '9/14/2004',  '10/14/2004' UNION ALL
SELECT 1, '10/5/2004',  '10/20/2004' UNION ALL
SELECT 1, '11/20/2004', '12/20/2004'
-- Phase 1: number each ID's rows chronologically (earliest StartDate first;
-- for equal StartDates the longest-running row comes first).
IF OBJECT_ID('tempdb..#OrderedRecords') IS NOT NULL
    DROP TABLE #OrderedRecords

SELECT
    ID,
    StartDate,
    EndDate,
    ROW_NUMBER() OVER (PARTITION BY ID ORDER BY StartDate ASC, EndDate DESC) AS N
INTO #OrderedRecords
FROM #Dates
--Phase 2: Apply Overlap Rules (Subsume records that overlap)
-- Walks each ID's rows in N order. IntermediateStartDate / IntermediateEndDate carry the
-- date window of the most recent Valid row forward, so every later row is tested against
-- that window rather than against its immediate predecessor.
;with Subsume (ID, N, StartDate, EndDate, IntermediateStartDate, IntermediateEndDate, Valid) as
(
-- Anchor member: row N = 1 of each ID is always Valid and opens the first window.
select ID, N, StartDate, EndDate, IntermediateStartDate = StartDate, IntermediateEndDate = EndDate,
Valid = 1
from #OrderedRecords
where N = 1
UNION ALL
-- Recursive member 1: the next row (N + 1) when its EndDate is on or before the carried window start.
-- NOTE(review): this branch outputs the previous row's StartDate/EndDate (y.*) instead of the
-- current row's (c.*) - confirm that is intended.
select c.ID, c.N, y.StartDate, y.EndDate,
IntermediateStartDate = case when c.StartDate between y.IntermediateStartDate and y.IntermediateEndDate then y.IntermediateStartDate else c.StartDate end,
IntermediateEndDate = case when c.StartDate between y.IntermediateStartDate and y.IntermediateEndDate then y.IntermediateEndDate else c.EndDate end,
Valid = case when (c.StartDate between y.IntermediateStartDate and y.IntermediateEndDate) then 0 else 1 end
from #OrderedRecords c
join Subsume y
on y.ID = c.ID
and y.N = c.n - 1
and y.IntermediateStartDate >= c.EndDate
UNION ALL
-- Recursive member 2: the next row (N + 1) when its EndDate is after the carried window start.
-- A row whose StartDate falls inside the carried window (BETWEEN, inclusive) is flagged 0 and the
-- window is kept; otherwise it is flagged 1 and its own dates become the new window.
select c.ID, c.N, c.StartDate, c.EndDate,
IntermediateStartDate = case when c.StartDate between y.IntermediateStartDate and y.IntermediateEndDate then y.IntermediateStartDate else c.StartDate end,
IntermediateEndDate = case when c.StartDate between y.IntermediateStartDate and y.IntermediateEndDate then y.IntermediateEndDate else c.EndDate end,
Valid = case when (c.StartDate between y.IntermediateStartDate and y.IntermediateEndDate) then 0 else 1 end
from #OrderedRecords c
join Subsume y
on y.ID = c.ID
and y.N = c.n - 1
and y.IntermediateStartDate < c.EndDate
)
-- MAXRECURSION 0 removes the recursion-depth cap so long per-ID chains do not abort the query.
-- No ORDER BY is specified, so the row order of the result is not guaranteed.
Select ID, StartDate, EndDate, Valid
from Subsume
OPTION (MAXRECURSION 0)

Related

Hive query takes forever on Superset

I have a query that was written in Presto SQL format (100 lines of insert a query result to a table that already exists) and takes within 10 minutes to get the result.
Now I am going to use Airflow and need to change the query to Hive SQL format to append previous month's data, there is no error, but it is taking 75+ minutes now and the query is still running and not returning any result.
Shall I 'stop' it or is there anything else to consider?
SET hive.limit.query.max.table.partition = 1000000;

-- Appends, for the previous calendar month, one row per account with its streaming hours
-- and whether the account had activated a theme by the end of the month before.
--
-- Change from the original: the theme-event history was read four times (one slice per
-- year) and glued together with UNION over "select *". The slices were contiguous
-- (2019-01-01 up to the end of last month) and only MIN() per user is taken from the
-- result, so UNION's de-duplication added a large shuffle without changing the answer.
-- It is now a single range scan that reads only the three columns actually used.
-- Assumes the end of last month is on or after 2022-01-01, which holds for any run date
-- after January 2022.
INSERT INTO TABLE schema.temp_tbl partition(year_month_key)
Select
    distinct
    tbl.account_id,
    tbl.theme_status,
    streaming.streaming_hours,
    tbl.year_month as year_month_key
From
(
    Select
        tbl_0.year_month,
        tbl_0.account_id,
        case when max(tbl_0.theme_status) = 1 then 'With Theme' else 'No Theme' end as theme_status
    From
    (
        Select
            streaming.year_month,
            streaming.account_id,
            case when theme_events.account_id is not null then 1 else 0 end as theme_status
        from
        (
            Select
                substring(date_key, 1, 7) as year_month,
                last_day(add_months(date_key, -1)) as year_month_ed,
                date_key,
                upper(account_id) as account_id,
                play_seconds
            from agg_device_streaming_metrics_daily
            Where date_key between date_add(last_day(add_months(current_date, -2)),1) and last_day(add_months(current_date, -1))
            and play_seconds > 0
        ) streaming
        left join
        (
            Select
                upper(theme.virtualuserid) as account_id,
                min(theme.createddate) as min_createddate,
                min(theme.date_key) as date_key
            From
            (
                select virtualuserid, createddate, date_key
                from theme_activate_event_history
                where date_key between '2019-01-01' and last_day(add_months(current_date, -1))
                and activate = 'true' and themetype in ('ThemeBundle','ScreenSaver','Skin','Audio')
            ) theme
            group by theme.virtualuserid
        ) theme_events
        on streaming.account_id = theme_events.account_id
        and date(theme_events.date_key) <= date(streaming.year_month_ed)
    ) tbl_0
    group by tbl_0.year_month, tbl_0.account_id
) tbl
inner join
(
    Select
        substring(date_key, 1, 7) as year_month,
        upper(account_id) as account_id,
        cast(sum(play_seconds) / 3600 as double) as streaming_hours
    from agg_device_streaming_metrics_daily
    Where date_key between date_add(last_day(add_months(current_date, -2)),1) and last_day(add_months(current_date, -1))
    and play_seconds > 0
    group by substring(date_key, 1, 7), upper(account_id)
) streaming
on tbl.account_id = streaming.account_id and tbl.year_month = streaming.year_month;

SQL - '1' IF hour in month EXISTS, '0' IF NOT EXISTS

I have a table that has aggregations down to the hour level YYYYMMDDHH. The data is aggregated and loaded by an external process (I don't have control over). I want to test the data on a monthly basis.
The question I am looking to answer is: Does every hour in the month exist?
I'm looking to produce output that will return a 1 if the hour exists or 0 if the hour does not exist.
The aggregation table looks something like this...
YYYYMM YYYYMMDD YYYYMMDDHH DATA_AGG
201911 20191101 2019110100 100
201911 20191101 2019110101 125
201911 20191101 2019110103 135
201911 20191101 2019110105 95
… … … …
201911 20191130 2019113020 100
201911 20191130 2019113021 110
201911 20191130 2019113022 125
201911 20191130 2019113023 135
And defined as...
-- Hourly aggregation table. Each row carries its month, day and hour keys as text.
-- (The original had a stray AS before the column list; AS is only valid in
-- CREATE TABLE ... AS SELECT, not in front of column definitions.)
CREATE TABLE YYYYMMDDHH_DATA_AGG (
    YYYYMM     VARCHAR,
    YYYYMMDD   VARCHAR,
    YYYYMMDDHH VARCHAR,
    DATA_AGG   INT
);
I'm looking to produce the following below...
YYYYMMDDHH HOUR_EXISTS
2019110100 1
2019110101 1
2019110102 0
2019110103 1
2019110104 0
2019110105 1
... ...
In the example above, two hours do not exist, 2019110102 and 2019110104.
I assume I'd have to join the aggregation table against a computed table that contains all the YYYYMMDDHH combos???
The database is Snowflake, but assume most generic ANSI SQL queries will work.
You can get what you want with a recursive CTE
The recursive CTE generates the list of possible Hours. And then a simple left outer join gets you the flag for if you have any records that match that hour.
-- Generate one row per hour from the earliest to the latest loaded hour, then flag
-- each generated hour according to whether the aggregation table has a matching row.
WITH RECURSIVE CTE (YYYYMMDDHH) AS (
    -- Anchor: start from the earliest hour present in the table.
    SELECT SRC.YYYYMMDDHH
    FROM YYYYMMDDHH_DATA_AGG SRC
    WHERE SRC.YYYYMMDDHH = (SELECT MIN(YYYYMMDDHH) FROM YYYYMMDDHH_DATA_AGG)

    UNION ALL

    -- Step: add one hour, stopping once the latest loaded hour has been emitted.
    SELECT TO_VARCHAR(DATEADD(HOUR, 1, TO_TIMESTAMP(PREV.YYYYMMDDHH, 'YYYYMMDDHH')), 'YYYYMMDDHH') AS YYYYMMDDHH
    FROM CTE PREV
    WHERE TO_VARCHAR(DATEADD(HOUR, 1, TO_TIMESTAMP(PREV.YYYYMMDDHH, 'YYYYMMDDHH')), 'YYYYMMDDHH')
          <= (SELECT MAX(YYYYMMDDHH) FROM YYYYMMDDHH_DATA_AGG)
)
SELECT
    ALL_HOURS.YYYYMMDDHH,
    CASE WHEN LOADED.YYYYMMDDHH IS NULL THEN 0 ELSE 1 END AS HOUR_EXISTS
FROM CTE ALL_HOURS
LEFT JOIN YYYYMMDDHH_DATA_AGG LOADED
    ON ALL_HOURS.YYYYMMDDHH = LOADED.YYYYMMDDHH;
If your timerange is too long you'll have issues with the cte recursing too much. You can create a table or temp table with all of the possible hours instead. For example:
-- Build a list of consecutive hours starting at the earliest loaded hour.
-- ROW_NUMBER() is used for the hour offset because SEQ4() is not guaranteed to be
-- gap-free; a gap would silently drop hours from the list and hide missing data.
-- ROWCOUNT caps the list at 10000 hours; raise it for longer ranges.
CREATE OR REPLACE TEMPORARY TABLE HOURS (YYYYMMDDHH VARCHAR) AS
SELECT TO_VARCHAR(
           DATEADD(
               HOUR,
               ROW_NUMBER() OVER (ORDER BY SEQ4()) - 1,
               TO_TIMESTAMP((SELECT MIN(YYYYMMDDHH) FROM YYYYMMDDHH_DATA_AGG), 'YYYYMMDDHH')
           ),
           'YYYYMMDDHH'
       ) AS YYYYMMDDHH
FROM TABLE(GENERATOR(ROWCOUNT => 10000)) V
ORDER BY YYYYMMDDHH;

-- Flag every generated hour up to the latest loaded hour: 1 if present, 0 if missing.
SELECT
    H.YYYYMMDDHH,
    IFF(A.YYYYMMDDHH IS NOT NULL, 1, 0) HOUR_EXISTS
FROM HOURS H
LEFT OUTER JOIN YYYYMMDDHH_DATA_AGG A
    ON H.YYYYMMDDHH = A.YYYYMMDDHH
WHERE H.YYYYMMDDHH <= (SELECT MAX(YYYYMMDDHH) FROM YYYYMMDDHH_DATA_AGG);
You can then fiddle with the generator count to make sure you have enough hours.
You can generate a table with every hour of the month and LEFT OUTER JOIN your aggregation to it:
-- EVERY_HOUR: for each distinct month in the table, generate hour offsets 0..744 from the
-- first of the month and keep only the hours that still belong to that month.
WITH EVERY_HOUR AS (
SELECT TO_CHAR(DATEADD(HOUR, HH, TO_DATE(YYYYMM::TEXT, 'YYYYMM')),
'YYYYMMDDHH')::NUMBER YYYYMMDDHH
FROM (SELECT DISTINCT YYYYMM FROM YYYYMMDDHH_DATA_AGG) t
CROSS JOIN (
-- 745 rows cover the 744 hours of a 31-day month; shorter months are trimmed below.
SELECT ROW_NUMBER() OVER (ORDER BY NULL) - 1 HH
FROM TABLE(GENERATOR(ROWCOUNT => 745))
) h
-- Keeps hours numerically below (YYYYMM + 1) followed by four zeros, i.e. before the next month.
-- NOTE(review): relies on YYYYMM (VARCHAR) being implicitly cast to a number here - confirm.
QUALIFY YYYYMMDDHH < (YYYYMM + 1) * 10000
)
-- NOTE(review): h.YYYYMMDDHH is a NUMBER while a.YYYYMMDDHH is VARCHAR, so the join
-- presumably depends on implicit conversion - confirm.
SELECT h.YYYYMMDDHH, NVL2(a.YYYYMM, 1, 0) HOUR_EXISTS
FROM EVERY_HOUR h
LEFT OUTER JOIN YYYYMMDDHH_DATA_AGG a ON a.YYYYMMDDHH = h.YYYYMMDDHH
Here's something that might help get you started. I'm guessing you want to have 'synthetic' [YYYYMMDD] values? Otherwise, if the value aren't there, then they shouldn't appear in the list
DROP TABLE IF EXISTS #_hours
DROP TABLE IF EXISTS #_temp

-- Populate a table with hours ranging from 00 to 23.
-- The loop counter must be a local variable (@ prefix); a # prefix denotes a temp
-- table and is not valid in DECLARE / WHILE / SET.
CREATE TABLE #_hours ([hour_value] VARCHAR(2))

DECLARE @_i INT = 0
WHILE (@_i < 24)
BEGIN
    -- FORMAT(..., '0#') zero-pads to two digits: 0 -> '00', 9 -> '09', 23 -> '23'
    INSERT INTO #_hours ([hour_value])
    SELECT FORMAT(@_i, '0#')

    SET @_i += 1
END
-- Replicate OP's sample data set (keys stored as integers in this demo)
CREATE TABLE #_temp
(
    [YYYYMM]     INTEGER,
    [YYYYMMDD]   INTEGER,
    [YYYYMMDDHH] INTEGER,
    [DATA_AGG]   INTEGER
)

INSERT INTO #_temp ([YYYYMM], [YYYYMMDD], [YYYYMMDDHH], [DATA_AGG])
VALUES
    (201911, 20191101, 2019110100, 100),
    (201911, 20191101, 2019110101, 125),
    (201911, 20191101, 2019110103, 135),
    (201911, 20191101, 2019110105, 95),
    (201911, 20191130, 2019113020, 100),
    (201911, 20191130, 2019113021, 110),
    (201911, 20191130, 2019113022, 125),
    (201911, 20191130, 2019113023, 135)
-- Start from the full list of hours and attach any sample rows whose hour suffix
-- matches; hours with no matching row come back with NULL keys and HOUR_EXISTS = '0'.
SELECT
    src.YYYYMM,
    src.YYYYMMDD,
    src.YYYYMMDDHH,
    CASE WHEN src.target_hours IS NOT NULL THEN '1' ELSE '0' END AS [HOUR_EXISTS]
FROM #_hours AS hrs
LEFT JOIN (
    -- The last two characters of [YYYYMMDDHH] are the hour of day
    SELECT
        T.*,
        RIGHT(CAST(T.[YYYYMMDDHH] AS VARCHAR(10)), 2) AS [target_hours]
    FROM #_temp AS T
) AS src
    ON hrs.hour_value = src.target_hours
Sample output:
YYYYMM YYYYMMDD YYYYMMDDHH HOUR_EXISTS
201911 20191101 2019110100 1
201911 20191101 2019110101 1
NULL NULL NULL 0
201911 20191101 2019110103 1
NULL NULL NULL 0
201911 20191101 2019110105 1
NULL NULL NULL 0
With (almost) standard sql, you can do a cross join of the distinct values of YYYYMMDD to a list of all possible hours and then left join to the table:
-- Every day present in the table, crossed with the 24 hours of a day, gives the full
-- set of expected slots; a left join back to the table tells which slots exist.
with days as (
    select distinct YYYYMMDD
    from tablename
),
hours as (
    select '00' as hh union all select '01' union all select '02' union all select '03' union all
    select '04' union all select '05' union all select '06' union all select '07' union all
    select '08' union all select '09' union all select '10' union all select '11' union all
    select '12' union all select '13' union all select '14' union all select '15' union all
    select '16' union all select '17' union all select '18' union all select '19' union all
    select '20' union all select '21' union all select '22' union all select '23'
),
slots as (
    select concat(days.YYYYMMDD, hours.hh) as YYYYMMDDHH
    from days
    cross join hours
)
select
    slots.YYYYMMDDHH,
    case when t.YYYYMMDDHH is not null then 1 else 0 end as hour_exists
from slots
left join tablename as t
    on slots.YYYYMMDDHH = t.YYYYMMDDHH
order by slots.YYYYMMDDHH
Maybe in Snowflake you can construct the list of hours with a sequence much easier instead of all those UNION ALLs.
This version accounts for the full range of days, across months and years. It's a simple cross join of the set of possible days with the set of possible hours of the day -- left joined to actual dates.
-- First and last day present in the table, as YYYYMMDD numbers.
set first = (select min(yyyymmdd::number) from YYYYMMDDHH_DATA_AGG);
set last = (select max(yyyymmdd::number) from YYYYMMDDHH_DATA_AGG);

-- $last - $first + 1 is a difference of YYYYMMDD *numbers*, not of days: it equals the
-- day count only inside one month and is larger across month or year boundaries
-- (20191101 .. 20191205 gives 105 rows for 35 days). It is always at least the real day
-- count, so it is kept as the generator size and the surplus days are removed by the
-- WHERE clause below instead of being reported as missing hours.
with
hours as (select row_number() over (order by null) - 1 h from table(generator(rowcount=>24))),
days as (
    select
        row_number() over (order by null) - 1 as n,
        to_date($first::text, 'YYYYMMDD')::date + n as d,
        to_char(d, 'YYYYMMDD') as yyyymmdd
    from table(generator(rowcount=>($last-$first+1)))
)
select days.yyyymmdd || lpad(hours.h,2,0) as YYYYMMDDHH, nvl2(t.yyyymmddhh,1,0) as HOUR_EXISTS
from days cross join hours
left join YYYYMMDDHH_DATA_AGG t on t.yyyymmddhh = days.yyyymmdd || lpad(hours.h,2,0)
where days.yyyymmdd::number <= $last
order by YYYYMMDDHH
;
$first and $last can be packed in as sub-queries if you prefer.

Get the aggregated result of a GROUP BY for each value on WHERE clause in TSQL

I have a table in SQL Server with the following format
MType (Integer), MDate (Datetime), Status (SmallInt)
1, 10-05-2018, 1
1, 15-05-2018, 1
2, 25-3-2018, 0
3, 12-01-2018, 1
....
I want to get the MIN MDate for specific MTypes for future dates. In case there isn't one, then the MType should be returned but with NULL value.
Here is what I have done until now:
-- Earliest future MDate per requested MType, counting only rows with Status = 1.
-- The WHERE filters are applied before grouping, so an MType that has no qualifying
-- row produces no group at all and is absent from the result (rather than NULL).
SELECT m.MType,
MIN(m.MDate)
FROM MyTypes m
WHERE m.MType IN ( 1, 2, 3, 4)
AND m.MDate > GETDATE()
AND m.Status = 1
GROUP BY m.MType
Obviously, the above will return only the following:
1, 10-05-2018
Since there aren't any other rows with a future date and a status equal to 1.
However, the results I want are:
1, 10-05-2018
2, NULL
3, NULL
4, NULL //this is missing in general from the table. No MType with value 4
The table is big, so performance is something to take into account. Any ideas how to proceed?
One way is to join the table to itself and filter the date in the ON clause.
-- The outer side supplies the requested MTypes that exist in the table; the date and
-- status filters live in the ON clause so types without a match survive with NULL.
SELECT
    wanted.Mtype,
    MIN(upcoming.MDate)
FROM MyTypes AS wanted
LEFT JOIN MyTypes AS upcoming
    ON upcoming.MType = wanted.MType
    AND upcoming.Status = 1
    AND upcoming.MDate > GETDATE()
WHERE wanted.MType IN (1, 2, 3)
GROUP BY wanted.MType
Here's a Demo.
I don't know the logic behind this requirement, but it looks like a case for a look-up table of the requested MTypes:
-- The inline VALUES list is the set of MTypes to report, whether or not they exist in
-- MyTypes; the derived table holds the earliest future Status = 1 date per type.
SELECT
    requested.MType,
    next_dates.MDate
FROM (VALUES (1), (2), (3), (4)) AS requested (MType)
LEFT JOIN (
    SELECT
        m.MType,
        MIN(m.MDate) AS MDate
    FROM MyTypes AS m
    WHERE m.Status = 1
      AND m.MDate > GETDATE()
    GROUP BY m.MType
) AS next_dates
    ON next_dates.MType = requested.MType
Use a window function and a union to a numbers table:
-- Sample data in a table variable. Table variables take the @ prefix; DECLARE with a
-- # prefix is not valid T-SQL.
declare @t table (MType int, MDate datetime, [Status] smallint)
Insert into @t values (1, convert(date, '10-05-2018', 103), 1)
,(1, convert(date, '15-05-2018', 103), 1)
,(2, convert(date, '25-03-2018', 103), 0)
,(3, convert(date, '12-01-2018', 103), 1)

-- The numbers rows (MType 1..10000, dummy date, Status 0) guarantee that every requested
-- MType appears at least once; the windowed MIN ignores rows that are not future/Status 1.
Select DISTINCT Mtype
, min(iiF(MDate>getdate() and status = 1, MDate, NUll)) over (Partition By Mtype) as MDate
from ( SELECT TOP 10000 row_number() over(order by t1.number) as MType
, '1900-01-01' as MDate, 0 as [Status]
FROM master..spt_values t1
CROSS JOIN master..spt_values t2
union
Select Mtype, MDate, [Status] from @t
) x
where MType in (1,2,3,4)
order by x.MType

SQL Last activity of given type

So I have a Visitor table, and a Visitor_activity table. Say:
Visitor
Visitor_ID Int
Visitor_name varchar(20)
Visitor_Activity
ID Int
Visitor_ID Int
Activity_Type char(3) -- values IN or OUT
Activity_Time datetime
Visitors might sign in and out multiple times in a day.
I'd like a nice query to tell me all visitors who are in: i.e. the last activity for today (on activity_time) was an "IN" not an "OUT". Any advice much appreciated.
It's T-SQL by the way, but I think it's more of an in-principle question.
One way to solve this is to use a correlated not exists predicate:
-- Every IN record that is not followed, on the same calendar date, by an OUT record
-- for the same visitor.
select
    ins.Activity_Time,
    ins.Visitor_ID
from Visitor_Activity ins
where ins.Activity_Type = 'IN'
  and not exists (
        select 1
        from Visitor_Activity outs
        where outs.Visitor_ID = ins.Visitor_ID
          and outs.Activity_Type = 'OUT'
          and outs.Activity_Time > ins.Activity_Time
          and cast(outs.Activity_Time as date) = cast(ins.Activity_Time as date)
      )
This basically says get all visitor_id that have type = IN for which there doesn't exists any type = OUT record with a later time (on the same date).
Sample SQL Fiddle
-- Visitors that have an IN record with no later OUT record.
-- The table is named Visitor (singular) in the schema given in the question; the
-- original referenced a non-existent Visitors table.
-- NOTE(review): there is no date filter, so activity from any day is considered, and a
-- visitor with several qualifying IN records is returned once per record.
SELECT
    v.*
FROM
    Visitor v
    JOIN Visitor_Activity va ON va.Visitor_ID = v.Visitor_ID
WHERE
    va.Activity_Type = 'IN'
    AND NOT EXISTS ( SELECT
                        *
                     FROM
                        Visitor_Activity va_out
                     WHERE
                        va_out.Visitor_ID = va.Visitor_ID
                        AND va_out.Activity_Type = 'OUT'
                        AND va_out.Activity_Time > va.Activity_Time )
-- Latest IN and latest OUT time per visitor for today; a visitor is "in" when the
-- latest IN is after the latest OUT, or when there is no OUT at all today.
-- Today is expressed as a half-open range on Activity_Time (midnight today up to, but
-- not including, midnight tomorrow) instead of wrapping the column in DATEDIFF, so an
-- index on Activity_Time can be used. Both forms select the same rows.
with visitorsInOut as (
    select Visitor_id,
           max(case when Activity_Type = 'in' then Activity_Time else null end) inTime,
           max(case when Activity_Type = 'out' then Activity_Time else null end) outTime
    from Visitor_Activity
    where Activity_Time >= dateadd(dd, datediff(dd, 0, getdate()), 0)
      and Activity_Time <  dateadd(dd, datediff(dd, 0, getdate()) + 1, 0)
    group by Visitor_id)
select Visitor_id
from visitorsInOut
where inTime > outTime or outTime is null
This uses a CTE to find the activity record with the greatest Activity_Time where the Activity_Type = 'IN' and assigns it RowNum 1. Then you can INNER JOIN the CTE to the Visitor table, filtering by the CTE results where RowNum = 1.
-- Ranks each visitor's IN records for today, newest first, and returns the visitors
-- that have one. The outer filter must use the alias defined in the CTE (RowNum); the
-- original referenced Row_Num, which does not exist.
-- NOTE(review): only IN records are ranked, so a later OUT on the same day does not
-- exclude the visitor.
; WITH VisAct AS(
    SELECT act.Visitor_ID
         , ROW_NUMBER() OVER(PARTITION BY Visitor_ID ORDER BY Activity_Time DESC) AS RowNum
    FROM Visitor_Activity act
    WHERE act.Activity_Type = 'IN'
      AND act.Activity_Time >= CAST(GETDATE() AS DATE)
)
SELECT vis.Visitor_ID, vis.Visitor_name
FROM Visitor vis
INNER JOIN VisAct act
    ON act.Visitor_ID = vis.Visitor_ID
WHERE act.RowNum = 1
You can pull the most recent action for each visitor, and then only return those where the last action for today was to check in.
-- Rank today's activity per visitor, newest first, then keep visitors whose newest
-- record is an IN.
;WITH TodaysActivity AS (
    SELECT
        Visitor_ID,
        Activity_Type,
        Activity_Time,
        RANK() OVER (PARTITION BY Visitor_ID ORDER BY Activity_Time DESC) AS LastAction
    FROM Visitor_Activity
    -- checks for today, can be omitted if you still want
    -- to see someone checked in from yesterday
    WHERE DATEDIFF(d, 0, Activity_Time) = DATEDIFF(d, 0, getdate())
)
SELECT
    v.Visitor_ID,
    v.Visitor_Name,
    ta.Activity_Type,
    ta.Activity_Time
FROM Visitor AS v
INNER JOIN TodaysActivity AS ta
    ON ta.Visitor_ID = v.Visitor_ID
WHERE ta.LastAction = 1
  AND ta.Activity_Type = 'IN'
With CROSS APPLY:
-- Demo with table variables. Local variables and table variables take the @ prefix;
-- the # prefix in the original is not valid in DECLARE.
DECLARE @d DATE = '20150320'

DECLARE @v TABLE
    (
      visitor_id INT ,
      visitor_name NVARCHAR(MAX)
    )
DECLARE @a TABLE
    (
      visitor_id INT ,
      type CHAR(3) ,
      time DATETIME
    )

INSERT INTO @v
VALUES ( 1, 'A' ),
       ( 2, 'B' ),
       ( 3, 'C' )

INSERT INTO @a
VALUES ( 1, 'in', '2015-03-20 19:32:27.513' ),
       ( 1, 'out', '2015-03-20 19:32:27.514' ),
       ( 1, 'in', '2015-03-20 19:32:27.515' ),
       ( 2, 'in', '2015-03-20 19:32:27.516' ),
       ( 2, 'out', '2015-03-20 19:32:27.517' ),
       ( 3, 'in', '2015-03-20 19:32:27.518' ),
       ( 3, 'out', '2015-03-20 19:32:27.519' ),
       ( 3, 'in', '2015-03-20 19:32:27.523' )

-- For each visitor, take the single latest activity on day @d (half-open day range);
-- the outer filter keeps it only when it is an 'in'. CROSS APPLY drops visitors for
-- whom the applied query returns no row.
SELECT *
FROM @v v
    CROSS APPLY ( SELECT *
                  FROM ( SELECT TOP 1
                                type
                         FROM @a a
                         WHERE a.visitor_id = v.visitor_id
                               AND a.time >= @d
                               AND a.time < DATEADD(dd, 1, @d)
                         ORDER BY time DESC
                       ) i
                  WHERE type = 'in'
                ) c
Output:
visitor_id visitor_name type
1 A in
3 C in
The principle:
First you are selecting all visitors.
Then, for each visitor, you apply a query that returns that visitor's last activity of the day:
-- Latest activity type of visitor v on day @d (fragment of the CROSS APPLY query;
-- v, @a and @d come from the enclosing statement)
SELECT TOP 1
        type
FROM @a a
WHERE a.visitor_id = v.visitor_id
      AND a.time >= @d
      AND a.time < DATEADD(dd, 1, @d)
ORDER BY time DESC
Then you are selecting from previous step in order to get empty set which will filter out visitors whose last activity was not 'in'. If last activity was 'in' you get one row in result and thus applying works. If last activity is 'out' then outer query will result in empty set, and by design CROSS APPLY will eliminate such visitor.

How to merge time intervals in SQL Server

Suppose I have the following an event table with personId, startDate and endDate.
I want to know how much time the person X spent doing an event (the events can override each other).
If the person just has 1 event, its easy: datediff(dd, startDate, endDate)
If the person has 2 events it gets tricky.
I'll set some scenarios for the expected results.
Scenario 1
startDate endDate
1 4
3 5
This means the result should be the datediff from 1 to 5
Scenario 2
startDate endDate
1 3
6 9
This means the result should be the sum of datediff(dd,1,3) and datediff(dd,6,9)
How can I get this result on an sql query? I can only think of a bunch of if statements, but the same person can have n events so the query will be really confusing.
Shredder Edit: I'd like to add a 3rd scenario:
startDate endDate
1 5
4 8
11 15
Desired result to Shredder scenario:
(1,5) and (4,8) merge in (1,8) since they overlap then we need to datediff(1,8) + datediff(11,15) => 7 + 4 => 11
You can use a recursive CTE to build a list of dates and then count the distinct dates.
-- Sample intervals in a table variable (table variables take the @ prefix).
declare @T table
(
  startDate date,
  endDate date
);

insert into @T values
('2011-01-01', '2011-01-05'),
('2011-01-04', '2011-01-08'),
('2011-01-11', '2011-01-15');

-- Expand every interval into one row per day from startDate up to, but not including,
-- endDate; counting the distinct days merges overlapping intervals automatically.
with C as
(
  select startDate,
         endDate
  from @T
  union all
  select dateadd(day, 1, startDate),
         endDate
  from C
  where dateadd(day, 1, startDate) < endDate
)
select count(distinct startDate) as DayCount
from C
option (MAXRECURSION 0)
Result:
DayCount
-----------
11
Or you can use a numbers table. Here I use master..spt_values:
-- Same day count using the numbers in master..spt_values (type 'P') as day offsets
-- from the earliest start date. Runs in the same batch as the @T table variable.
declare @MinStartDate date

select @MinStartDate = min(startDate)
from @T

-- A day is counted when it falls inside some interval, end date excluded.
select count(distinct N.number)
from @T as T
  inner join master..spt_values as N
    on dateadd(day, N.Number, @MinStartDate) between T.startDate and dateadd(day, -1, T.endDate)
where N.type = 'P'
Here's a solution that uses the Tally table idea (which I first heard of in an article by Itzik Ben-Gan -- I still cut and paste his code whenever the subject comes up). The idea is to generate a list of ascending integers, join the source data by range against the numbers, and then count the number of distinct numbers, as follows. (This code uses syntax from SQL Server 2008, but with minor modifications would work in SQL 2005.)
First set up some testing data:
-- Test data: person 1 has two overlapping events, person 2 has two separate events.
CREATE TABLE #EventTable
(
    PersonId  int      NOT NULL,
    startDate datetime NOT NULL,
    endDate   datetime NOT NULL
)

INSERT #EventTable (PersonId, startDate, endDate)
VALUES
    (1, 'Jan 1, 2011', 'Jan 4, 2011'),
    (1, 'Jan 3, 2011', 'Jan 5, 2011'),
    (2, 'Jan 1, 2011', 'Jan 3, 2011'),
    (2, 'Jan 6, 2011', 'Jan 9, 2011')
Determine some initial values
-- Working variables (local variables take the @ prefix, not #)
DECLARE
  @Interval bigint      -- number of days spanned by the person's events
 ,@FirstDay datetime    -- earliest start date for the person
 ,@PersonId int = 1     -- (or whatever)
Get the first day and the maximum possible number of dates (to keep the cte from generating extra values):
-- Span, in days (inclusive), from the person's earliest start to latest end, plus the
-- earliest start itself; used to bound how many tally numbers are needed.
SELECT
  @Interval = datediff(dd, min(startDate), max(endDate)) + 1
 ,@FirstDay = min(startDate)
from #EventTable
where PersonId = @PersonId
Cut and paste over the one routine and modify and test it to only return as many integers as we'll need:
/*
-- Reference copy of the tally generator, returning 1..@Interval
;WITH
Pass0 as (select 1 as C union all select 1), --2 rows
Pass1 as (select 1 as C from Pass0 as A, Pass0 as B),--4 rows
Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows
Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows
Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows
Pass5 as (select 1 as C from Pass4 as A, Pass4 as B),--4,294,967,296 rows
Tally as (select row_number() over(order by C) as Number from Pass5)
select Number from Tally where Number <= @Interval
*/
And now revise it by first joining to the intervals defined in each source row, and then count each distinct value found:
-- Count the distinct calendar days covered by a person's events.
-- Each PassN squares the row count of the previous one; Tally numbers the rows 1..n.
-- Day number k maps to @FirstDay + (k - 1); a day is counted once if it lies inside any
-- event (both ends inclusive). @Interval, @FirstDay and @PersonId are local variables
-- set earlier in the batch (@ prefix, not #).
;WITH
 Pass0 as (select 1 as C union all select 1), --2 rows
 Pass1 as (select 1 as C from Pass0 as A cross join Pass0 as B),--4 rows
 Pass2 as (select 1 as C from Pass1 as A cross join Pass1 as B),--16 rows
 Pass3 as (select 1 as C from Pass2 as A cross join Pass2 as B),--256 rows
 Pass4 as (select 1 as C from Pass3 as A cross join Pass3 as B),--65536 rows
 Pass5 as (select 1 as C from Pass4 as A cross join Pass4 as B),--4,294,967,296 rows
 Tally as (select row_number() over(order by C) as Number from Pass5)
SELECT PersonId, count(distinct Number) EventDays
from #EventTable et
 inner join Tally
  on dateadd(dd, Tally.Number - 1, @FirstDay) between et.startDate and et.endDate
where et.PersonId = @PersonId
 and Number <= @Interval
group by PersonId
Take out the #PersonId filter and you'd get it for all persons. And with minor modification you can do it for any time interval, not just days (which is why I set the Tally table to generate severely large numbers.)
The following SQL is for the three scenarios you've described
-- Time spent per person, merging an event with the next one when they overlap.
-- Limitation: only adjacent pairs (rn, rn + 1) are merged, so chains of three or more
-- mutually overlapping events are not handled.
with sampleData
     AS (
      SELECT 1 personid,1 startDate,4 endDate
      UNION ALL SELECT 1,3,5
      UNION ALL SELECT 2,1,3
      UNION ALL SELECT 2,6,9
      UNION ALL SELECT 3,1,5
      UNION ALL SELECT 3,4,8
      UNION ALL SELECT 3,11, 15
     ),
     -- Number the events in (person, start) order so neighbours can be paired
     cte
     AS (SELECT personid,
                startdate,
                enddate,
                Row_number() OVER(ORDER BY personid, startdate) AS rn
         FROM sampledata),
     -- Adjacent events of one person that overlap: span from the first start to the second end
     overlaps
     AS (SELECT a.personid,
                a.startdate,
                b.enddate,
                a.rn id1,
                b.rn id2
         FROM cte a
              INNER JOIN cte b
                ON a.personid = b.personid
                   AND a.enddate > b.startdate
                   AND a.rn = b.rn - 1),
     -- Events that take part in no overlapping pair
     nooverlaps
     AS (SELECT a.personid,
                a.startdate,
                a.enddate
         FROM cte a
              LEFT JOIN overlaps b
                ON a.rn = b.id1
                    OR a.rn = b.id2
         WHERE b.id1 IS NULL)
SELECT personid,
       SUM(timespent) timespent
FROM (SELECT personid,
             enddate - startdate timespent
      FROM nooverlaps
      -- UNION ALL, not UNION: UNION would collapse two spans of the same person that
      -- happen to have the same length (e.g. 1-3 and 6-8) into one row and undercount.
      UNION ALL
      SELECT personid,
             enddate - startdate
      FROM overlaps) t
GROUP BY personid
Produces this result
Personid timeSpent
----------- -----------
1 4
2 5
3 11
Notes: I used the simple integers but the DateDiffs should work too
Correctness issue There is a correctness issue if your data is allowed to have multiple overlaps as Cheran S noted, the results won't be correct and you should use one of the other answers instead. His example used [1,5],[4,8],[7,11] for the same person ID
Algebra. If B-n is the ending time of the nth event, and A-n is the starting time of the nth event, then the sum of the differences is the difference of the sums. So you can write
select everything else, sum(cast(endDate as int)) - sum(cast(startDate as int)) as daysSpent
If your dates have no time component, this works. Otherwise, you could use a real.
Try something like this
-- Sum of the individual event lengths, in days, per person (overlaps are not merged)
select
    e.personId,
    sum(datediff(dd, e.startDate, e.endDate)) as TotalDuration
from yourEventTable e
group by e.personId
-- One row per uid holding the summed (b - a) difference of that uid's rows
;WITH cte (gap) AS
(
    SELECT SUM(b - a)
    FROM xxx
    GROUP BY uid
)
SELECT gap
FROM cte
Edit 1: I have modified both solutions to get correct results.
Edit 2: I have done comparative tests using the solutions proposed by Mikael Eriksson, Conrad Frix, Philip Kelley and me. All tests use an EventTable with the following structure:
-- Events per person used by the comparative tests; every event must end after it starts.
CREATE TABLE EventTable
(
    EventID   INT IDENTITY(1, 1) PRIMARY KEY,
    PersonId  INT      NOT NULL,
    StartDate DATETIME NOT NULL,
    EndDate   DATETIME NOT NULL,
    CONSTRAINT CK_StartDate_Before_EndDate CHECK (StartDate < EndDate)
);
Also, all tests use warm buffer (no DBCC DROPCLEANBUFFERS) and cold [plan] cache (I have executed DBCC FREEPROCCACHE before every test). Because some solutions use a filter(PersonId = 1) and others not, I have inserted into EventTable rows for only one person (INSERT ...(PersonId,...) VALUES (1,...)).
These are the results:
My solutions use recursive CTEs.
Solution 1:
-- Total days per person with overlapping events merged.
-- BaseCTE numbers each person's events in (StartDate, EndDate) order.
WITH BaseCTE
AS
(
SELECT e.StartDate
,e.EndDate
,e.PersonId
,ROW_NUMBER() OVER(PARTITION BY e.PersonId ORDER BY e.StartDate, e.EndDate) RowNumber
FROM EventTable e
), RecursiveCTE
AS
(
-- Anchor: the first event of each person opens interval 1.
SELECT b.PersonId
,b.RowNumber
,b.StartDate
,b.EndDate
,b.EndDate AS MaxEndDate
,1 AS PseudoDenseRank
FROM BaseCTE b
WHERE b.RowNumber = 1
UNION ALL
-- Step: visit the next event. MaxEndDate is the running maximum end date seen so far;
-- an event starting on or before it stays in the current interval, otherwise it opens
-- a new one (PseudoDenseRank + 1).
SELECT crt.PersonId
,crt.RowNumber
,crt.StartDate
,crt.EndDate
,CASE WHEN crt.EndDate > prev.MaxEndDate THEN crt.EndDate ELSE prev.MaxEndDate END
,CASE WHEN crt.StartDate <= prev.MaxEndDate THEN prev.PseudoDenseRank ELSE prev.PseudoDenseRank + 1 END
FROM RecursiveCTE prev
INNER JOIN BaseCTE crt ON prev.PersonId = crt.PersonId
AND prev.RowNumber + 1 = crt.RowNumber
), SumDaysPerPersonAndInterval
AS
(
-- Length in days of each merged interval: earliest start to latest end within it.
SELECT src.PersonId
,src.PseudoDenseRank --Interval ID
,DATEDIFF(DAY, MIN(src.StartDate), MAX(src.EndDate)) Days
FROM RecursiveCTE src
GROUP BY src.PersonId, src.PseudoDenseRank
)
-- Recursion advances one event per level, so MAXRECURSION 32767 limits a person to
-- that many chained events.
SELECT x.PersonId, SUM( x.Days ) DaysPerPerson
FROM SumDaysPerPersonAndInterval x
GROUP BY x.PersonId
OPTION(MAXRECURSION 32767);
Solution 2:
-- Same algorithm as Solution 1, but the numbered events are first materialised into a
-- table variable with a (PersonID, RowNumber) primary key, which the recursive step
-- then joins against. Table variables take the @ prefix; DECLARE with # is invalid.
DECLARE @Base TABLE --or a temporary table: CREATE TABLE #Base (...)
(
     PersonID   INT NOT NULL
    ,StartDate  DATETIME NOT NULL
    ,EndDate    DATETIME NOT NULL
    ,RowNumber  INT NOT NULL
    ,PRIMARY KEY(PersonID, RowNumber)
);

-- Number each person's events in (StartDate, EndDate) order.
INSERT @Base (PersonID, StartDate, EndDate, RowNumber)
SELECT   e.PersonId
        ,e.StartDate
        ,e.EndDate
        ,ROW_NUMBER() OVER(PARTITION BY e.PersonID ORDER BY e.StartDate, e.EndDate) RowNumber
FROM    EventTable e;

WITH RecursiveCTE
AS
(
    -- Anchor: the first event of each person opens interval 1.
    SELECT   b.PersonId
            ,b.RowNumber
            ,b.StartDate
            ,b.EndDate
            ,b.EndDate AS MaxEndDate
            ,1 AS PseudoDenseRank
    FROM    @Base b
    WHERE   b.RowNumber = 1
    UNION ALL
    -- Step: the next event stays in the current interval when it starts on or before
    -- the running maximum end date; otherwise it opens a new interval.
    SELECT   crt.PersonId
            ,crt.RowNumber
            ,crt.StartDate
            ,crt.EndDate
            ,CASE WHEN crt.EndDate > prev.MaxEndDate THEN crt.EndDate ELSE prev.MaxEndDate END
            ,CASE WHEN crt.StartDate <= prev.MaxEndDate THEN prev.PseudoDenseRank ELSE prev.PseudoDenseRank + 1 END
    FROM    RecursiveCTE prev
    INNER JOIN @Base crt ON prev.PersonId = crt.PersonId
    AND     prev.RowNumber + 1 = crt.RowNumber
), SumDaysPerPersonAndInterval
AS
(
    -- Length in days of each merged interval
    SELECT   src.PersonId
            ,src.PseudoDenseRank --Interval ID
            ,DATEDIFF(DAY, MIN(src.StartDate), MAX(src.EndDate)) Days
    FROM    RecursiveCTE src
    GROUP BY src.PersonId, src.PseudoDenseRank
)
SELECT  x.PersonId, SUM( x.Days ) DaysPerPerson
FROM    SumDaysPerPersonAndInterval x
GROUP BY x.PersonId
OPTION(MAXRECURSION 32767);