Eliminate and reduce overlapping data ranges using SQL - sql

I have a dataset in SQL Server Management Studio. The data looks like the following: each row has an identifier for the person (UserID), the date of the record (date), a start time (startime) and a finish time (endtime).
UserID date startime endtime
1 20110203 09:30 09:35
1 20110203 09:31 09:38
1 20110203 10:03 10:05
1 20110203 10:04:00 10:35:00
2 20110203 11:02 11:05
For each person, I want to check whether any of their time ranges overlap. If they do, I want to keep the smallest startime and the largest endtime of the overlapping ranges; if there is no overlap, I keep the original row. In addition, I want to calculate the duration between the largest endtime and the smallest startime.
The result I want should look like the following. Can anyone show me how to code this, please?
UserID date startime endtime diff
1 20110203 09:30 09:38 00:08
1 20110203 10:03 10:35 00:32
2 20110203 11:02 11:05 00:03

It seems that a SELECT with a CTE would need to recursively merge an undetermined number of rows. In that case I would prefer a safe, CURSOR-based solution:
-- Merge overlapping time ranges per user and day.
-- Rows are visited in (UserId, Date, StartTime) order; a row whose StartTime
-- falls inside the running range extends it, otherwise the running range is
-- written to @result and a new one is started.
DECLARE @t TABLE
(
    UserId    int,
    [Date]    date,
    StartTime time,
    EndTime   time
);

INSERT INTO @t (UserId, [Date], StartTime, EndTime)
VALUES
    (1, '2011-02-03', '09:30:00', '09:35:00'),
    (1, '2011-02-03', '09:31:00', '09:38:00'),
    (1, '2011-02-03', '09:36:00', '09:41:00'),
    (1, '2011-02-03', '09:40:00', '09:45:00'),
    (1, '2011-02-03', '09:42:00', '09:43:00'),
    (1, '2011-02-03', '10:03:00', '10:05:00'),
    (2, '2011-02-03', '11:02:00', '11:05:00'),
    (1, '2011-02-03', '12:00:00', '12:05:00'),
    (1, '2011-02-03', '12:04:00', '12:06:00');
------------------
DECLARE @result TABLE
(
    UserId    int,
    [Date]    date,
    StartTime time,
    EndTime   time
);

-- Current row fetched from the cursor.
DECLARE @UserId int;
DECLARE @Date date;
DECLARE @StartTime time;
DECLARE @EndTime time;
-- Running (merged) range that has not been written to @result yet.
DECLARE @LastUserId int;
DECLARE @LastDate date;
DECLARE @LastStartTime time;
DECLARE @LastEndTime time;

-- The ORDER BY is essential: merging only looks at the previous range.
DECLARE cur CURSOR LOCAL FAST_FORWARD FOR
    SELECT UserId, [Date], StartTime, EndTime
    FROM @t
    ORDER BY UserId, [Date], StartTime;

OPEN cur;
FETCH NEXT FROM cur INTO @UserId, @Date, @StartTime, @EndTime;

-- Remember whether there was any row at all, so that an empty source does not
-- produce a single all-NULL row in @result.
DECLARE @HasRows bit = CASE WHEN @@FETCH_STATUS = 0 THEN 1 ELSE 0 END;

SET @LastUserId = @UserId;
SET @LastDate = @Date;
SET @LastStartTime = @StartTime;
SET @LastEndTime = @EndTime;

WHILE @@FETCH_STATUS = 0
BEGIN
    IF @UserId = @LastUserId AND @Date = @LastDate AND @StartTime <= @LastEndTime
        -- Overlap (or touching ranges): extend the running range if needed.
        SET @LastEndTime = CASE WHEN @LastEndTime > @EndTime THEN @LastEndTime ELSE @EndTime END;
    ELSE
    BEGIN
        -- Gap, new day or new user: flush the running range and start over.
        INSERT @result (UserId, [Date], StartTime, EndTime)
        VALUES (@LastUserId, @LastDate, @LastStartTime, @LastEndTime);

        SET @LastUserId = @UserId;
        SET @LastDate = @Date;
        SET @LastStartTime = @StartTime;
        SET @LastEndTime = @EndTime;
    END;

    FETCH NEXT FROM cur INTO @UserId, @Date, @StartTime, @EndTime;
END;

-- Flush the last running range (only when at least one row was read).
IF @HasRows = 1
    INSERT @result (UserId, [Date], StartTime, EndTime)
    VALUES (@LastUserId, @LastDate, @LastStartTime, @LastEndTime);

CLOSE cur;
DEALLOCATE cur;

SELECT UserId,
       [Date],
       StartTime,
       EndTime,
       -- Duration rendered as a time value (ranges never cross midnight here).
       CAST(DATEADD(second, DATEDIFF(second, StartTime, EndTime), '2000-01-01') AS time) Diff
FROM @result
ORDER BY UserId, [Date], StartTime;
which returns
1 2011-02-03 09:30:00.0000000 09:45:00.0000000 00:15:00.0000000
1 2011-02-03 10:03:00.0000000 10:05:00.0000000 00:02:00.0000000
1 2011-02-03 12:00:00.0000000 12:06:00.0000000 00:06:00.0000000
2 2011-02-03 11:02:00.0000000 11:05:00.0000000 00:03:00.0000000

Below is a redesigned version of my previous CTE approach. It will still have problems if there are multiple records for the same user with an identical start time. I didn't have time to fix that, but as far as I understood, this cannot happen in the described process.
--
-- This part is temporary and has to be replaced by your tables.
-- Several more records are included now.
-- There is still a glitch if the start time is identical for two records - but as far as I understood, this is not possible in the described case?
-- NOTE(review): `declare #t table` is not valid T-SQL as written; presumably the table-variable sigil `@` was lost in transcription - confirm before running.
--
declare #t table (userid int, date int, starttime time, endtime time);
insert into #t values (1, 20110203, '09:30:00', '09:35:00'), (1, 20110203, '09:31:00', '09:38:00'), (1, 20110203, '09:36:00', '09:41:00'), (1, 20110203, '10:03:00', '10:05:00'),(1, 20110203, '10:04:00', '10:35:00'),
(2, 20110203, '11:02:00', '11:05:00'), (2, 20110203, '11:03:00', '11:20:00'), (2, 20110203, '11:04:00', '11:35:00'), (2, 20110203, '13:02:00', '13:05:00'), (2, 20110203, '13:04:00', '13:15:00');
--
-- First cte (recursive): for every row, pick the earlier start and later end
-- of the row itself and any row of the same user that started before it and
-- ends after its start; each recursion level widens the span one step further.
-- NOTE(review): the joins compare userid, starttime and endtime only; the
-- date column is not part of the join condition.
--
WITH cte AS(
SELECT 1 lvl, a.userid
,CASE WHEN a.starttime <= ISNULL(b.starttime, a.starttime) THEN a.starttime ELSE b.starttime END AS starttime
,CASE WHEN a.endtime >= ISNULL(b.endtime, a.endtime) THEN a.endtime ELSE b.endtime END AS endtime
FROM #t as a
LEFT OUTER JOIN #t AS b ON b.userid = a.userid
AND b.starttime < a.starttime
AND b.endtime > a.starttime
UNION ALL
-- Recursive member: the aliases xStart/xEnd are ignored; the column names
-- of the cte come from the anchor member (starttime, endtime).
select a.lvl+1, a.userid
,CASE WHEN a.starttime <= ISNULL(b.starttime, a.starttime) THEN a.starttime ELSE b.starttime END AS xStart
,CASE WHEN a.endtime >= ISNULL(b.endtime, a.endtime) THEN a.endtime ELSE b.endtime END AS xEnd
from cte as a
INNER JOIN #t AS b ON b.userid = a.userid
AND b.starttime < a.starttime
AND b.endtime > a.starttime
),
--
-- Second cte: get the max. lvl reached per user by the recursive cte
--
cteUserMaxLvl AS (
SELECT userid, max(lvl) AS MaxLvl
FROM cte
GROUP BY userid
),
--
-- Third cte: get the rows matching the max. lvl; their timespan should cover the desired min. start and max. end
--
cteNoMoreOverlap AS(
SELECT a.userid, starttime, endtime
FROM cte AS a
JOIN cteUserMaxLvl AS b ON a.userid = b.userid AND a.lvl = b.MaxLvl
)
--
-- Select the rows from the "No more overlap" cte
--
SELECT userid, starttime, endtime
FROM cteNoMoreOverlap
UNION ALL
--
-- And finally select all rows which are not covered by the previously selected timespan,
-- collapsed to one min(start)/max(end) row per user
--
SELECT a.userid, min(a.starttime) AS starttime, max(a.endtime) AS endtime
FROM cte AS a
JOIN cteNoMoreOverlap AS b ON a.userid = b.userid AND a.starttime NOT BETWEEN b.starttime AND b.endtime
GROUP BY a.userid
order by userid, starttime, endtime

I believe when you say overlapping time, you are saying within the same hour on the same day. If that is what you mean, following solution might work. Attached is the screenshot of my results.
-- Sample data; start and end times are stored as 'hh:mm' text.
CREATE TABLE #OverlappingDates
(
    UserID    INT,
    [date]    DATE,
    starttime VARCHAR(5),
    endtime   VARCHAR(5)
);

INSERT INTO #OverlappingDates (UserID, [date], starttime, endtime)
VALUES
    (1, '20110203', '09:30', '09:35'),
    (1, '20110203', '09:31', '09:38'),
    (1, '20110203', '10:03', '10:05'),
    (2, '20110203', '11:02', '11:05'),
    (2, '20110203', '11:05', '11:15'),
    (2, '20110203', '11:05', '12:00');

-- One row per user, day, start hour and end hour: rows count as "overlapping"
-- when both their start and their end fall into the same clock hours.
WITH hourly_ranges AS
(
    SELECT UserID,
           [date],
           MIN(starttime) AS StartTime,
           MAX(endtime)   AS EndTime
    FROM #OverlappingDates
    GROUP BY UserID,
             [date],
             LEFT(starttime, 2),
             LEFT(endtime, 2)
)
SELECT r.UserID,
       r.[date],
       r.StartTime,
       r.EndTime,
       -- Elapsed time formatted as hh:mm.
       RIGHT('0' + CAST(d.elapsed_seconds / 3600 AS VARCHAR(2)), 2)
       + ':'
       + RIGHT('0' + CAST((d.elapsed_seconds / 60) % 60 AS VARCHAR(2)), 2) AS Diff
FROM hourly_ranges AS r
CROSS APPLY
(
    -- Rebuild full datetimes from the date and the 'hh:mm' text, then take the
    -- difference in seconds once instead of repeating the expression.
    SELECT DATEDIFF(SECOND,
                    CAST(CONCAT(CAST(r.[date] AS VARCHAR(10)), ' ', r.StartTime) AS DATETIME),
                    CAST(CONCAT(CAST(r.[date] AS VARCHAR(10)), ' ', r.EndTime) AS DATETIME)) AS elapsed_seconds
) AS d;

Related

Getting Minutes by Hour for Date Range

I'm trying to write a SQL query (SQL Server) and part of it is determining the number of minutes per hour between two datetimes.
Example: 11/1/2018 09:05 - 11/1/2018 13:15
Hour 09: 55 minutes
Hour 10: 60 minutes
Hour 11: 60 minutes
Hour 12: 60 minutes
Hour 13: 15 minutes
These would then get put into a temp table and grouped by some other data which will then be used to calculate dollar amounts from these minutes.
Is there a way to accomplish something like this via SQL that isn't too slow or laborious?
Thanks!
I think a recursive CTE is possibly the best approach:
-- Split every (startTime, endTime) row of table t into one row per clock hour
-- and report how many minutes of the range fall into each hour.
with cte as (
      -- Anchor: the hour in which the range starts, clipped at endTime.
      select startTime, endTime,
             startTime_hour as hourStart,
             (case when endTime < dateadd(hour, 1, startTime_hour) then endTime
                   else dateadd(hour, 1, startTime_hour)
              end) as hourEnd
      from (select t.*,
                   -- startTime truncated to the full hour
                   dateadd(hour, datediff(hour, 0, startTime), 0) as startTime_hour
            from t
           ) t
      union all
      -- Recursive step: move on to the next hour until endTime is reached.
      select startTime, endTime,
             dateadd(hour, 1, hourStart) as hourStart,
             (case when endTime < dateadd(hour, 2, hourStart) then endTime
                   else dateadd(hour, 2, hourStart)
              end) as hourEnd
      from cte
      where hourEnd < endTime
     )
select cte.hourStart,
       -- The first hour is measured from startTime, later hours from the full hour.
       (case when hourStart > startTime then datediff(minute, hourStart, hourEnd) else datediff(minute, startTime, hourEnd) end) as minutes
from cte
order by hourStart
-- One recursion level per hour: lift the default limit of 100 levels so that
-- ranges longer than about four days do not abort the query.
option (maxrecursion 0);
Here is a db<>fiddle.
Here is an alternative dynamic solution that you can work with two parameters (start/end dates) only:
-- Break the range @startTime .. @EndTime into one row per clock hour with the
-- number of minutes of the range that fall into that hour.
create table #temp
([hour] int, [minutes] int);

-- ISO 8601 literals are interpreted the same way under every language/DATEFORMAT.
declare @startTime datetime = '2018-11-01T09:05:00';
declare @EndTime datetime = '2018-11-01T13:15:00';
declare @tempStartTime datetime = @startTime;   -- start of the slice being processed
declare @nextTimeRounded datetime;              -- end of the slice (next full hour or @EndTime)
declare @hourdiff int = DATEDIFF(HOUR, @startTime, @EndTime);
-- Hour label of the current slice; keeps counting past 23 for multi-day ranges.
declare @counter int = DATEPART(HH, @startTime);
declare @limit int = @counter + @hourdiff + 1;
declare @minutes int;

while @counter < @limit
begin
    -- Next full hour after @tempStartTime: midnight of that day + (hour + 1).
    set @nextTimeRounded = dateadd(hour,
                                   1 + datepart(hour, @tempStartTime),
                                   cast(convert(varchar(10), @tempStartTime, 112) as datetime));
    if @nextTimeRounded > @EndTime
        set @nextTimeRounded = @EndTime;

    set @minutes = DATEDIFF(MINUTE, @tempStartTime, @nextTimeRounded);

    -- A zero-length slice only occurs when the range ends exactly on the hour
    -- (or is empty); it contributes no minutes, so no row is written for it.
    if @minutes > 0
        insert into #temp ([hour], [minutes]) values (@counter, @minutes);

    set @counter = @counter + 1;
    set @tempStartTime = @nextTimeRounded;
end

select [hour], [minutes]
from #temp
order by [hour];
Sample Data
Below, we insert four time ranges, with associated values, into a table. The ranges come in two lengths: the first two rows each span 9h 45m, and the last two rows each span 10h 30m.
-- Sample data: four ranges with an associated value each.
-- NOTE(review): `declare #times table` is not valid T-SQL as written;
-- presumably the table-variable sigil `@` was lost in transcription - confirm.
-- NOTE(review): the columns are declared as `time` while the literals carry a
-- date part; presumably only the time of day is kept on insert - confirm.
declare #times table (
startTime time,
endTime time,
val float
);
-- Rows 1-2 run from 01:00 to 10:45, rows 3-4 from 01:00 to 11:30.
insert #times values
('2018-10-01 01:00:00', '2018-10-01 10:45:00', 7),
('2018-10-02 01:00:00', '2018-10-02 10:45:00', 8),
('2018-10-01 01:00:00', '2018-10-01 11:30:00', 1),
('2018-10-02 01:00:00', '2018-10-02 11:30:00', 3);
Solution
You can use the 'datediff' function to aggregate as you so desire. Use the modulo operator to convert your minutes into just the minutes that remain over when whole hours are discounted.
-- Sum val per distinct range length, expressed as whole hours (h) plus
-- remaining minutes (m).
-- NOTE(review): the table name is kept exactly as written in the accompanying
-- sample-data snippet.
select ap.h,
       ap.m,
       sumVal = sum(val)
from #times
cross apply (select
                 -- Derive the hours from the minute difference: datediff(hour, ...)
                 -- counts crossed hour boundaries, so 01:50 -> 02:10 would be
                 -- reported as 1 hour although only 20 minutes elapsed.
                 h = datediff(minute, startTime, endTime) / 60,
                 m = datediff(minute, startTime, endTime) % 60
            ) ap
group by ap.h,
         ap.m

Based on day fetch all dates - sql

I have a start date, an end date and the names of some weekdays. How can I fetch all dates between those two dates that fall on those specific weekdays in SQL?
example data:
start_date:4/11/2018
end_date: 5/11/2018
days: monday, thursday
expected output: all dates between start and end date which comes on monday and thursday and store them in table
updated
my present code(not working)
-- The asker's attempt, quoted as posted (reported as not working).
-- NOTE(review): the `#P_...` / `#USER_ID` names are presumably `@` variables
-- or parameters whose sigil was lost in transcription - confirm.
-- Recursive CTE: one row per day from the start date up to the end date.
; WITH CTE(dt)
AS
(
SELECT #P_FROM_DATE
UNION ALL
SELECT DATEADD(dw, 1, dt) FROM CTE
WHERE dt < #P_TO_DATE
)
INSERT INTO Table_name
(
ID
,DATE_TIME
,STATUS
,CREATED_DATE
,CREATED_BY
)
-- The SELECT below has no FROM clause, so it yields a single row; the date is
-- taken from a scalar subquery that returns one row per matching day. With
-- more than one matching day the subquery returns several values, which is
-- not allowed in a scalar position. The `null` entry in the IN list never
-- matches anything.
SELECT #P_ID
,(SELECT dt FROM CTE WHERE DATENAME(dw, dt) In ('tuesday','friday',null))
,'NOT SENT'
,CAST(GETDATE() AS DATE)
,#USER_ID
Another approach for generating dates between ranges can be like following query. This will be faster compared to CTE or WHILE loop.
-- List every Monday and Thursday between @StartDate and @EndDate (inclusive)
-- using the rows of master..spt_values as a ready-made number source.
-- 'yyyymmdd' literals are read the same way under every language setting.
DECLARE @StartDate DATETIME = '20180411';
DECLARE @EndDate DATETIME = '20180515';

SELECT DATEADD(DAY, T.RN, @StartDate) AS [DATE]
FROM
(
    -- 0-based running number, one per row of the helper table.
    SELECT CAST(ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) - 1 AS INT) AS RN
    FROM master..[spt_values] AS T1
) AS T
WHERE T.RN <= DATEDIFF(DAY, @StartDate, @EndDate)
  -- DATENAME returns the day name in the session language (English assumed).
  AND DATENAME(dw, DATEADD(DAY, T.RN, @StartDate)) IN ('Monday', 'Thursday')
ORDER BY [DATE];
Note:
If the row count present in master..[spt_values] is not sufficient for the provided range, you can make a cross join with the same to get a bigger range like following.
-- 0-based running number over the helper table cross-joined with itself,
-- which squares the number of available rows.
SELECT ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) - 1 AS RN
FROM master..[spt_values] AS T1
     CROSS JOIN master..[spt_values] AS T2
With this you will be able to generate the dates for a range spanning up to 6436369 days.
You can use a recursive common table expression (CTE) to generate a list of days. With datepart(dw, ...) you can filter for specific days of the week.
An example that creates a list of Mondays and Thursdays between March 1st and today:
-- Store every Monday and Thursday from 2018-03-01 up to and including today.
create table ListOfDates (dt date);

with cte as
(
    select cast('2018-03-01' as date) as dt -- First day of interval
    union all
    select dateadd(day, 1, dt)
    from cte
    -- Compare against today's date, not the current timestamp: with
    -- "dt < getdate()" today's row still qualifies and tomorrow is generated.
    where dt < cast(getdate() as date) -- Last day of interval
)
insert into ListOfDates
    (dt)
select dt
from cte
-- Adding @@datefirst makes the weekday number independent of SET DATEFIRST:
-- 0=Saturday, 1=Sunday, 2=Monday, ... 5=Thursday, 6=Friday.
where (datepart(dw, dt) + @@datefirst) % 7 in (2, 5)
-- One recursion level per day; lift the default limit of 100.
option (maxrecursion 0);
See it working at SQL Fiddle.
This will work for you:
-- Collect every date between @StartDate and @EndDate whose weekday name is
-- listed in @Days.
DECLARE @table TABLE(
    ID INT IDENTITY(1,1),
    [Date] DATETIME,
    [Day] VARCHAR(50)
);

-- Wanted weekday names (must match DATENAME output for the session language).
DECLARE @Days TABLE(
    ID INT IDENTITY(1,1),
    [Day] VARCHAR(50)
);
INSERT INTO @Days ([Day]) VALUES ('Monday'), ('Thursday');

-- 'yyyymmdd' is read the same way under every language setting.
DECLARE @StartDate DATETIME = '20180101';
DECLARE @EndDate DATETIME = GETDATE();
DECLARE @TempDate DATETIME = @StartDate;

-- Walk the range one day at a time; compare as dates so the time of day of
-- GETDATE() does not cut off the last day.
WHILE CAST(@TempDate AS DATE) <= CAST(@EndDate AS DATE)
BEGIN
    IF EXISTS (SELECT 1 FROM @Days WHERE [Day] = DATENAME(dw, @TempDate))
    BEGIN
        INSERT INTO @table ([Date], [Day])
        VALUES (@TempDate, DATENAME(dw, @TempDate));
    END

    SET @TempDate = DATEADD(DAY, 1, @TempDate);
END

SELECT ID, [Date], [Day]
FROM @table;
-- Copy the dates of tab that lie within startdate..enddate (both inclusive)
-- and fall on a Monday or a Thursday.
INSERT INTO TargetTab (dateCOL)
SELECT dateCOL
FROM tab
WHERE dateCOL BETWEEN startdate AND enddate
  AND DATENAME(dw, dateCOL) IN ('Thursday', 'Monday')
Try this query to get your result.
Use a recursive CTE to generate your dates, then filter by week day.
-- List all dates between @StartDate and @EndDate (inclusive) whose weekday
-- number is contained in @WeekDays.
SET DATEFIRST 1; -- 1: Monday, 7: Sunday (session setting; DATEPART(WEEKDAY) depends on it)

DECLARE @StartDate DATE = '2018-04-11';
DECLARE @EndDate DATE = '2018-05-15';

-- Wanted weekdays, numbered according to the DATEFIRST setting above.
DECLARE @WeekDays TABLE (WeekDayNumber INT);
INSERT INTO @WeekDays (
    WeekDayNumber)
VALUES
    (1), -- Monday
    (4); -- Thursday

;WITH GeneratingDates AS
(
    -- Anchor: the first day of the range.
    SELECT
        GeneratedDate = @StartDate,
        WeekDay = DATEPART(WEEKDAY, @StartDate)
    UNION ALL
    -- Recursive step: one more day until @EndDate is reached.
    SELECT
        GeneratedDate = DATEADD(DAY, 1, G.GeneratedDate),
        WeekDay = DATEPART(WEEKDAY, DATEADD(DAY, 1, G.GeneratedDate))
    FROM
        GeneratingDates AS G -- Notice that we are referencing a CTE that we are also declaring
    WHERE
        G.GeneratedDate < @EndDate
)
SELECT
    G.GeneratedDate
FROM
    GeneratingDates AS G
    INNER JOIN @WeekDays AS W ON G.WeekDay = W.WeekDayNumber
OPTION
    (MAXRECURSION 30000); -- one level per day; the default limit is 100
Try this:
-- Generate every Monday and Thursday between @start and @end (inclusive) and
-- store them, together with fixed status columns, in MY_TABLE.
-- 'yyyy-mm-dd' is read unambiguously when converted to the date type.
declare @start date = '2018-04-11';
declare @end date = '2018-05-11';
declare @P_ID int = 1;
declare @USER_ID int = 11;

;with cte as (
    select @start as [date]
    union all
    select dateadd(DAY, 1, [date]) from cte
    where [date] < @end
)
--if MY_TABLE doesn't exist
-- (SELECT ... INTO needs a name for every column, hence the aliases)
select @P_ID as ID,
       [date] as DATE_TIME,
       'NOT SENT' as STATUS,
       cast(getdate() as date) as CREATED_DATE,
       @USER_ID as CREATED_BY
into MY_TABLE
from cte
--here you can specify days: 1 - Sunday, 2 - Monday, ... 6 - Friday, 0 - Saturday
--(adding @@DATEFIRST makes the number independent of the SET DATEFIRST setting)
where (DATEPART(dw, [date]) + @@DATEFIRST) % 7 in (2, 5)
option (maxrecursion 0);
--if MY_TABLE does exist
--insert into MY_TABLE
--select @P_ID,
--       [date],
--       'NOT SENT',
--       cast(getdate() as date),
--       @USER_ID
--from cte
--where (DATEPART(dw, [date]) + @@DATEFIRST) % 7 in (2, 5)
--option (maxrecursion 0)

distribute accumulated working hours through days

I have the date and time when an engine started working and how long it worked, but sometimes it can work for more than 24 hours.
If it worked for 28 hours from the starting date, I will have a record like this:
Name started_working Finished working hours_worked
obj-00123 07/02/2018 13:30 08/02/2018 17:30 28
I need to to have record that will show that engine has worked for 10:30 in 07 and 17:30 in 08.
Name started_working Finished working hours_worked
obj-00123 07/02/2018 13:30 07/02/2018 00:00 10:30
obj-00123 07/02/2018 13:30 08/02/2018 17:30 17:30
or something like that. I don't have any idea how can i get this done. can you give me some clues. i dont ask for writing code if its not too easy.
thank you
This might do the trick for you
--Using CTE to show sample data
-- Splits a working period that ends on a different day than it started into
-- two rows: start -> midnight after the start day, and midnight of the finish
-- day -> finish. Periods within a single day are returned unchanged.
-- NOTE(review): the sample values are strings; how '07/02/2018' is read
-- (month/day vs. day/month) presumably depends on the session's date format - confirm.
;WITH cteX( Name,started_working,Finished_working)
AS
(
SELECT
'obj-00123','07/02/2018 13:30','08/02/2018 17:30' UNION ALL
SELECT 'obj-00155','07/02/2018 15:00','07/02/2018 22:30'
)
SELECT
X.Name
, X.started_working
, X.Finished_working
-- Elapsed minutes added to date 0 and rendered with style 114 (hh:mi:ss:mmm)
, HoursWorked = CONVERT(VARCHAR(12), DATEADD(minute, DATEDIFF(minute, X.started_working, X.Finished_working), 0), 114)
FROM
(
-- Part 1: only for periods spanning more than one day - from the start up to the following midnight
SELECT
T1.Name
,T1.started_working
,Finished_working = DATEADD(SECOND,0,DATEADD(DAY, DATEDIFF(DAY,-1,T1.started_working),0)) -- Dummy finish time @ midnight
FROM
cteX T1
WHERE
DATEDIFF(DAY,T1.started_working,T1.Finished_working) <> 0 --Create a dummy finish time @ midnight when start and end are not on the same day
UNION ALL
-- Part 2: every period - from midnight of the finish day (or the real start for same-day periods) to the finish
SELECT
T2.Name
,started_working = CASE WHEN DATEDIFF(DAY,T2.started_working,T2.Finished_working) <> 0
THEN DATEADD(DAY, DATEDIFF(DAY, 0, T2.Finished_working), 0) --Start @ midnight
ELSE T2.started_working
END
,T2.Finished_working
FROM
cteX T2
) X
ORDER BY
X.Name, X.started_working
OUTPUT
Name started_working Finished_working HoursWorked
obj-00123 2018-07-02 13:30:00.000 2018-07-03 00:00:00.000 10:30:00:000
obj-00123 2018-08-02 00:00:00.000 2018-08-02 17:30:00.000 17:30:00:000
obj-00155 2018-07-02 15:00:00.000 2018-07-02 22:30:00.000 07:30:00:000
According to your sample data, a working period may span several days. In that case you need to use a tally table or a recursive CTE. I have used a recursive CTE since it makes the result fields easier to handle. Note that the result contains two columns named started_working and started_working2: started_working is the one from your expected output, but I believe the one you actually need is started_working2.
-- Split every working period into one row per calendar day it touches and
-- report the time worked on that day as hh:mm.
declare @T as table (
    Name varchar(100)
    , started_working datetime
    , finished_working datetime
    --, hours_worked int
);

insert into @T (Name, started_working, finished_working)
values
    ('obj-00123', '20180207 13:30', '20180208 17:30')
    , ('obj-00123', '20180208 19:00', '20180209 05:00')
    , ('obj-00123', '20180209 19:00', '20180209 22:00')
    , ('obj-00123', '20180210 19:00', '20180213 22:00');

;with rcte as (
    -- Anchor: the first day of each period; next_date is the following midnight.
    select
        Name, started_working, finished_working
        , started_working2 = started_working
        , next_date = cast(dateadd(dd, 1, started_working) as date)
        , 1 step
    from
        @T
    union all
    -- Recursive step: one further day per level while the period continues
    -- past the current midnight.
    select
        Name, started_working, finished_working
        , cast(next_date as datetime)
        , dateadd(dd, 1, next_date)
        , step + 1
    from
        rcte
    where
        next_date < finished_working
)
select
    Name, started_working, started_working2, finished_working
    -- minutes rendered as zero-padded hh:mm
    , right(replace(str(diff / 60), ' ', 0), 2) + ':' + right(replace(str(diff % 60), ' ', 0), 2) hours_worked
from (
    select
        Name, started_working
        -- slice start: the real start on the first day, midnight afterwards
        , case
            when step = 1 then started_working
            else started_working2
        end started_working2
        -- slice end: the real finish on the last day, the next midnight otherwise
        , case
            when step = max(step) over (partition by Name, started_working)
                then finished_working else next_date
        end finished_working
    from
        rcte
) t
cross apply (select datediff(mi, started_working2, finished_working) diff) ca
-- One recursion level per day: lift the default limit of 100 levels so that
-- very long periods do not abort the query.
option (maxrecursion 0);
I'd approach the solution something like this:
-- Hours an engine ran on each calendar day, at five-minute resolution:
-- generate every 5-minute mark of the considered period, attach the marks
-- that fall inside each engine run, then count the marks per day.
WITH dynamic_twelths_of_hr_table(datetime2_value) AS
(
    -- The anchor must have the same type as the recursive DATEADD result,
    -- so the literal is cast explicitly.
    SELECT CAST('2017-01-01' AS DATETIME2)
    UNION ALL
    SELECT DATEADD(MINUTE, 5, datetime2_value)
    FROM dynamic_twelths_of_hr_table
    WHERE DATEADD(MINUTE, 5, datetime2_value) <= '2019-01-01'
)
,twelths_hr_table AS
(
    -- Each 5-minute mark together with the midnight of its day.
    SELECT
        DATEADD(DAY, DATEDIFF(DAY, 0, datetime2_value), 0) AS date_value
        ,datetime2_value
    FROM dynamic_twelths_of_hr_table
)
,modified_source_table AS
(
    -- Runs without a stop time are treated as running until now.
    SELECT
        name
        ,objectid
        ,engine_start
        ,ISNULL(engine_stop, GETDATE()) AS engine_stop
        ,IIF(engine_start IS NULL OR engine_stop IS NULL, 1, 0) AS is_still_running
    FROM [YOUR_SOURCE_TABLE]
)
SELECT
    name
    ,objectid
    ,is_still_running
    ,date_value
    -- twelve 5-minute marks make one hour
    ,(COUNT(datetime2_value)/12.0) AS hours_run_on_this_day
FROM
    modified_source_table
LEFT JOIN
    twelths_hr_table AS tht
    ON (tht.datetime2_value BETWEEN engine_start AND engine_stop)
GROUP BY
    name, objectid, is_still_running, date_value
ORDER BY
    name, objectid, is_still_running, date_value
-- Two years of 5-minute marks need far more than the default 100 recursion levels.
OPTION (MAXRECURSION 0);
Note I haven't tested this code so please excuse any small syntax errors.
I've also baked in an assumption about the range of dates to be considered (it can be widened, or made dynamic based on when the query runs), and the query has a 5-minute resolution (at a glance, I could only see one value in the engine_stop column that didn't fall on a 5-minute boundary, so I assume sub-5-minute precision is not required).
Basically, it expands each engine row into 5-minute windows (twelfths of an hour), then groups these by day and counts the number of windows per day during which the engine was running.
For currently running engines, it calculates how long the engine has run so far. I trust you can tweak the code to your exact requirements.
thank you to all. this worked perfectly. it needed slight polishing and recursion needed to be set to 0.
But creating view is a trouble with CTE.
-- Per-day breakdown of engine working periods.
-- A view may contain only a single SELECT (optionally preceded by CTEs):
-- DECLARE, table variables, INSERT and the OPTION clause are not allowed, so
-- the staging table variable is replaced by reading the source table directly
-- in the anchor of the recursive CTE.
-- The recursion limit cannot be set inside the view; periods longer than 100
-- days need the hint on the querying statement, e.g.
--   select * from mroobjectenginerowkinghoursdeclare option (maxrecursion 0);
-- CREATE VIEW must be the first statement of its batch.
create view mroobjectenginerowkinghoursdeclare as
with rcte as (
    -- Anchor: first day of each period. STOPTO is the period start and
    -- next_stopfrom the period end; both are cast to datetime so that the
    -- column types match the recursive member.
    select
        src.NAME as Name
        , src.OBJECTID
        , cast(src.STOPTO as datetime) as started_working
        , src.STOPFROM
        , src.STARTDATE
        , src.STOPDATE
        , src.MODIFIEDDATETIME
        , src.START_STOP
        , src.STARTDESCRIPTION
        , src.STOPDESCRIPTION
        , src.wattage
        , src.purpose
        , src.location
        , cast(src.next_stopfrom as datetime) as finished_working
        , src.diff as oldDiff
        , cast(src.STOPTO as datetime) as started_working2
        , cast(dateadd(dd, 1, src.STOPTO) as date) as next_date
        , 1 as step
    from
        [MicrosoftDynamicsAX].[dbo].[mroobjectengineworkinghours] as src
    union all
    -- Recursive step: one further day per level while the period continues
    -- past the current midnight.
    select
        Name, OBJECTID, started_working, STOPFROM, STARTDATE, STOPDATE, MODIFIEDDATETIME, START_STOP, STARTDESCRIPTION
        , STOPDESCRIPTION, wattage
        , purpose
        , location, finished_working, oldDiff
        , cast(next_date as datetime)
        , dateadd(dd, 1, next_date)
        , step + 1
    from
        rcte
    where
        next_date < finished_working
)
select
    Name, OBJECTID, started_working, STOPFROM, STARTDATE, STOPDATE, MODIFIEDDATETIME, START_STOP, STARTDESCRIPTION
    , STOPDESCRIPTION, wattage
    , purpose
    , location, oldDiff, started_working2, finished_working
    -- minutes rendered as zero-padded hh:mm
    , right(replace(str(diff / 60), ' ', 0), 2) + ':' + right(replace(str(diff % 60), ' ', 0), 2) hours_worked
from (
    select
        Name, OBJECTID, started_working, STOPFROM, STARTDATE, STOPDATE, MODIFIEDDATETIME, START_STOP, STARTDESCRIPTION
        , STOPDESCRIPTION, wattage
        , purpose
        , location, oldDiff
        -- slice start: the real start on the first day, midnight afterwards
        , case
            when step = 1 then started_working
            else started_working2
        end started_working2
        -- slice end: the real finish on the last day, the next midnight otherwise
        , case
            when step = max(step) over (partition by Name, started_working)
                then finished_working else next_date
        end finished_working
    from
        rcte
) t
cross apply (select datediff(mi, started_working2, finished_working) diff) ca;

How can I get aggregate values for all dates, even when missing data for some days?

I have the data with users tracking time. The data is in segments and each row represent one segment. Here is the sample data
http://sqlfiddle.com/#!6/2fa61
How can I get the data on daily basis i.e. if a complete day is of 1440 minutes then I want to know how many minutes the user was tracked in a day. I also want to show 0 on the day when there is no data.
I am expecting the following output
Use table of numbers. I personally have a permanent table Numbers with 100K numbers in it.
Once you have a set of numbers you can generate a set of dates for the range that you need. In this query I'll take MIN and MAX dates from your data, but since you may not have data for some dates, it is better to have explicit parameters defining the range.
For each date I have the beginning and ending of a day - our grouping interval.
For each date we are searching among track rows for those that intersect with this interval. Two intervals (DayStart, DayEnd) and (StartTime, EndTime) intersect if StartTime < DayEnd and EndTime > DayStart. This goes into WHERE.
For each intersecting intervals we are calculating the range that belongs to both intervals: from MAX(DayStart, StartTime) to MIN(DayEnd, EndTime).
Finally, we group by day and sum up durations of all ranges.
I added a row to your sample data to test the case when interval covers the whole day. From 2015-02-14 20:50:43 to 2015-02-16 19:49:59. I chose this interval to be well before intervals in your sample, so that results for the dates in your example are not affected. Here is SQL Fiddle.
-- Tracked minutes per e-mail and calendar day, including days without data.
-- Requires a table Numbers(Number) holding the integers 1, 2, 3, ...
DECLARE @track table
(
    Email varchar(20),
    StartTime datetime,
    EndTime datetime,
    DurationInSeconds int,
    FirstDate datetime,
    LastUpdate datetime
);

INSERT INTO @track (Email, StartTime, EndTime, DurationInSeconds, FirstDate, LastUpdate)
VALUES
    ('ABC', '2015-02-20 08:49:43.000', '2015-02-20 14:49:59.000', 21616, '2015-02-19 00:00:00.000', '2015-02-28 11:45:27.000'),
    ('ABC', '2015-02-20 14:49:59.000', '2015-02-20 22:12:07.000', 26528, '2015-02-19 00:00:00.000', '2015-02-28 11:45:27.000'),
    ('ABC', '2015-02-20 22:12:07.000', '2015-02-21 07:00:59.000', 31732, '2015-02-19 00:00:00.000', '2015-02-28 11:45:27.000'),
    ('ABC', '2015-02-21 09:49:43.000', '2015-02-21 16:30:10.000', 24027, '2015-02-19 00:00:00.000', '2015-02-28 11:45:27.000'),
    ('ABC', '2015-02-21 16:30:10.000', '2015-02-22 09:49:30.000', 62360, '2015-02-19 00:00:00.000', '2015-02-28 11:45:27.000'),
    ('ABC', '2015-02-22 09:55:43.000', '2015-02-22 11:49:59.000', 5856, '2015-02-19 00:00:00.000', '2015-02-28 11:45:27.000'),
    ('ABC', '2015-02-22 11:49:10.000', '2015-02-23 08:49:59.000', 75649, '2015-02-19 00:00:00.000', '2015-02-28 11:45:27.000'),
    ('ABC', '2015-02-23 10:59:43.000', '2015-02-23 12:49:59.000', 6616, '2015-02-19 00:00:00.000', '2015-02-28 11:45:27.000'),
    ('ABC', '2015-02-23 12:50:43.000', '2015-02-24 19:49:59.000', 111556, '2015-02-19 00:00:00.000', '2015-02-28 11:45:27.000'),
    ('ABC', '2015-02-28 08:49:43.000', '2015-02-28 14:49:59.000', 21616, '2015-02-19 00:00:00.000', '2015-02-28 11:45:27.000'),
    -- extra row: an interval that covers a whole day (2015-02-15)
    ('ABC', '2015-02-14 20:50:43.000', '2015-02-16 19:49:59.000', 0, '2015-02-19 00:00:00.000', '2015-02-28 11:45:27.000');

;WITH
CTE_Dates
AS
(
    -- First and last tracked day per e-mail.
    SELECT
        Email
        ,CAST(MIN(StartTime) AS date) AS StartDate
        ,CAST(MAX(EndTime) AS date) AS EndDate
    FROM @track
    GROUP BY Email
)
SELECT
    CTE_Dates.Email
    ,DayStart AS xDate
    -- seconds of all clipped ranges of the day, as whole minutes; 0 for days without data
    ,ISNULL(SUM(DATEDIFF(second, RangeStart, RangeEnd)) / 60, 0) AS TrackMinutes
FROM
    Numbers
    CROSS JOIN CTE_Dates -- this generates list of dates without gaps
    CROSS APPLY
    (
        SELECT
            DATEADD(day, Numbers.Number-1, CTE_Dates.StartDate) AS DayStart
            ,DATEADD(day, Numbers.Number, CTE_Dates.StartDate) AS DayEnd
    ) AS A_Date -- this is midnight of each current and next day
    OUTER APPLY
    (
        SELECT
            -- MAX(DayStart, StartTime)
            CASE WHEN DayStart > StartTime THEN DayStart ELSE StartTime END AS RangeStart
            -- MIN(DayEnd, EndTime)
            ,CASE WHEN DayEnd < EndTime THEN DayEnd ELSE EndTime END AS RangeEnd
        FROM @track AS T
        WHERE
            T.Email = CTE_Dates.Email
            AND T.StartTime < DayEnd
            AND T.EndTime > DayStart
    ) AS A_Track -- this is all tracks that intersect with the current day
WHERE
    Numbers.Number <= DATEDIFF(day, CTE_Dates.StartDate, CTE_Dates.EndDate)+1
GROUP BY DayStart, CTE_Dates.Email
ORDER BY DayStart, CTE_Dates.Email;
Result
Email xDate TrackMinutes
ABC 2015-02-14 189
ABC 2015-02-15 1440
ABC 2015-02-16 1189
ABC 2015-02-17 0
ABC 2015-02-18 0
ABC 2015-02-19 0
ABC 2015-02-20 910
ABC 2015-02-21 1271
ABC 2015-02-22 1434
ABC 2015-02-23 1309
ABC 2015-02-24 1189
ABC 2015-02-25 0
ABC 2015-02-26 0
ABC 2015-02-27 0
ABC 2015-02-28 360
You can still get TrackMinutes more than 1440, if two or more intervals in your data overlap.
update
You said in the comments that you have few rows in your data, where intervals do overlap and result has values more than 1440. You can wrap SUM into CASE to hide these errors in the data, but ultimately it is better to find these rows with problems and fix the data. You saw only few rows with values more than 1440, but there could be many more other rows with the same problem, which is not so visible. So, it is better to write a query that finds such overlapping rows and check how many there are and then decide what to do with them. The danger here is that at the moment you think that there are only few, but there could be a lot. This is beyond the scope of this question.
To hide the problem replace this line in the query above:
,ISNULL(SUM(DATEDIFF(second, RangeStart, RangeEnd)) / 60, 0) AS TrackMinutes
with this:
,CASE
WHEN ISNULL(SUM(DATEDIFF(second, RangeStart, RangeEnd)) / 60, 0) > 1440
THEN 1440
ELSE ISNULL(SUM(DATEDIFF(second, RangeStart, RangeEnd)) / 60, 0)
END AS TrackMinutes
I am making some guesses on the date ranges but this should be pretty close.
On my system I keep a view named cteTally which is my version of a tally table. Here is the code to create it.
-- Tally view: the integers 1 .. 10,000, built from cross-joined constant rows
-- without touching any table.
create View [dbo].[cteTally] as
WITH
    -- 10 rows
    E1(N) AS (select 1 from (values (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) dt(n)),
    -- 10 x 10 = 100 rows
    E2(N) AS (SELECT 1 FROM E1 a CROSS JOIN E1 b),
    -- 100 x 100 = 10,000 rows max
    E4(N) AS (SELECT 1 FROM E2 a CROSS JOIN E2 b),
    -- number the rows 1, 2, 3, ...
    cteTally(N) AS
    (
        SELECT ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
    )
select N from cteTally
Now we can utilize this to build your results. We just need to put in a couple other CTEs to get the date ranges established.
-- Tracked minutes per day for the whole reporting period, one row per day and
-- e-mail; days without any segment appear with a NULL e-mail and 0 minutes.
-- Each segment is attributed entirely to the day on which it starts.
with DateRange as
(
    -- Reporting period: earliest FirstDate to latest LastUpdate.
    select MIN(FirstDate) as StartDate
        , MAX(LastUpdate) as EndDate
    from track
)
, AllDates as
(
    -- One row per calendar day of the period. The start is cast to date so
    -- that BaseDate carries no time of day and matches the join below.
    select DateAdd(DAY, t.N - 1, cast(StartDate as date)) BaseDate
    from DateRange dr
    cross join cteTally t
    where t.N <= DATEDIFF(day, StartDate, EndDate) + 1
)
select t.Email
    , ad.BaseDate as xDate
    -- DurationInSeconds is in seconds: sum per day and convert to whole minutes
    , coalesce(sum(t.DurationInSeconds), 0) / 60 as TrackMinutes
from AllDates ad
left join track t on cast(t.StartTime as date) = ad.BaseDate
group by t.Email
    , ad.BaseDate
order by ad.BaseDate
    , t.Email
Create a table variable for the dates
Populate table in a WHILE loop
Cross join to tracker data with the dates table variable
Convert values in column [DurationInSeconds] into minutes
Replace nulls with zero
Code:
-- Builds one row per calendar day between the earliest and latest track start,
-- then reports total tracked minutes per (Email, day); days without data show 0.
DECLARE @dates TABLE ( ReportDates DATE )
DECLARE @BeginDate AS DATE
      , @EndDate AS DATE
      , @RunDate AS DATE
SELECT @BeginDate = MIN(starttime)
     , @EndDate = MAX(starttime)
FROM dbo.track
SET @RunDate = @BeginDate
-- With an empty track table both bounds are NULL and the loop body never runs.
WHILE @RunDate <= @EndDate
BEGIN
    -- Insert first, then advance: this keeps @BeginDate in the list and stops
    -- at @EndDate instead of adding the day after it.
    INSERT INTO @dates ( ReportDates )
    VALUES ( @RunDate )
    SET @RunDate = DATEADD(DAY, 1, @RunDate)
END;
SELECT e.Email
     , e.ReportDates
     -- sum the seconds first, then convert, so partial minutes are not lost per row
     , ISNULL(SUM(t.DurationInSeconds) / 60, 0) AS TotDurationInMinutes
FROM ( -- every (day, Email) combination, so missing days still produce a row
       SELECT d.ReportDates
            , t.Email
       FROM @dates AS d
       CROSS JOIN track AS t
       GROUP BY d.ReportDates, t.Email ) AS e
LEFT JOIN track AS t
    ON t.Email = e.Email                            -- only this Email's own rows
   AND CAST(t.StartTime AS DATE) = e.ReportDates
GROUP BY e.ReportDates, e.Email
ORDER BY e.Email, e.ReportDates
Results:
Email ReportDates TotDurationInMinutes
----- ----------- ----------------------
ABC 2015-02-21 1439
ABC 2015-02-22 1357
ABC 2015-02-23 1969
ABC 2015-02-24 0
ABC 2015-02-25 0
ABC 2015-02-26 0
ABC 2015-02-27 0
ABC 2015-02-28 360
ABC 2015-03-01 0
You should group by the day value. You can get the day with the DATEPART function, as in: DATEPART(d, [StartTime])
-- Total minutes between StartTime and EndTime, per calendar day of StartTime.
SELECT d.track_date AS [date]
     , SUM(DATEDIFF(minute, t.[StartTime], t.[EndTime])) AS [min]
FROM [test].[dbo].[track] AS t
CROSS APPLY ( SELECT CAST(t.[StartTime] AS date) AS track_date ) AS d
-- the calendar date already determines the day-of-month, so it is the only key needed
GROUP BY d.track_date
hope it helps
-- Cursor-based daily roll-up of track.DurationInSeconds into #temp_table:
-- one row per day that has track rows (durations accumulated), plus
-- zero-duration filler rows for days on which track has no rows.
-- NOTE(review): days are matched without regard to Email, so rows of different
-- Emails on the same day are merged into one row - confirm this is intended.
SET NOCOUNT ON;
-- Guarded drop so the script also runs when the temp table does not exist yet.
IF OBJECT_ID('tempdb..#temp_table') IS NOT NULL
    DROP TABLE #temp_table
CREATE TABLE #temp_table (
    Email VARCHAR(20)
    ,StartTime DATETIME
    ,DurationInSeconds INT
)
DECLARE @Email VARCHAR(20)
    ,@StartTime DATETIME
    ,@DurationInSeconds INT
    ,@lastduration INT
    ,@currentduration INT
    ,@FirstDate DATETIME
-- Midnight of the earliest tracked day; used as the running "next expected day".
SET @FirstDate = (
        SELECT TOP 1 CAST(CAST(StartTime AS DATE) AS DATETIME)
        FROM track
        ORDER BY StartTime
        )
-- Rows are processed in chronological order so the day bookkeeping is deterministic.
DECLARE vendor_cursor CURSOR
FOR
SELECT Email
    ,StartTime
    ,DurationInSeconds
FROM track
ORDER BY StartTime
OPEN vendor_cursor
FETCH NEXT
FROM vendor_cursor
INTO @Email
    ,@StartTime
    ,@DurationInSeconds
WHILE @@FETCH_STATUS = 0
BEGIN
    -- Days are compared as DATE values rather than as formatted strings.
    IF EXISTS (
            SELECT 1
            FROM #temp_table
            WHERE CAST(StartTime AS DATE) = CAST(@StartTime AS DATE)
            )
    BEGIN
        -- A row for this day already exists: add this duration to it.
        SELECT @lastduration = DurationInSeconds
        FROM #temp_table
        WHERE CAST(StartTime AS DATE) = CAST(@StartTime AS DATE)
        SET @currentduration = @lastduration + @DurationInSeconds
        UPDATE #temp_table
        SET DurationInSeconds = @currentduration
        WHERE CAST(StartTime AS DATE) = CAST(@StartTime AS DATE)
    END
    ELSE
    BEGIN
        -- First row seen for this day.
        INSERT INTO #temp_table (Email, StartTime, DurationInSeconds)
        SELECT @Email
            ,@StartTime
            ,@DurationInSeconds
        SET @FirstDate = DATEADD(day, 1, @FirstDate)
    END
    -- If track has no row on the next expected day, emit a zero filler for it.
    IF NOT EXISTS (
            SELECT 1
            FROM track
            WHERE CAST(StartTime AS DATE) = CAST(@FirstDate AS DATE)
            )
    BEGIN
        INSERT INTO #temp_table (Email, StartTime, DurationInSeconds)
        SELECT @Email
            ,@FirstDate
            ,0
        SET @FirstDate = DATEADD(day, 1, @FirstDate)
    END
    -- Get the next track row.
    FETCH NEXT
    FROM vendor_cursor
    INTO @Email
        ,@StartTime
        ,@DurationInSeconds
END
CLOSE vendor_cursor;
DEALLOCATE vendor_cursor;
SELECT Email
    ,StartTime
    ,DurationInSeconds
FROM #temp_table
ORDER BY StartTime

In a set of overlapping, version-numbered intervals, find the most recent version at each point in time

I'm working with a set of date intervals where each interval has a version number and new intervals will frequently overlap old ones, or even be subsets of them. From this data I need to calculate a new set of intervals that shows the most recent version number, at each point in time. Is there a set-based solution to this problem?
Here's an illustration:
Interval 1: 11111111111111111111111
Interval 2: 2222222222
Interval 3: 33333333333333
Interval 4: 444444444
Interval 5: 555555555
Result : 11333333333333331155555555544
Here is a sample of the data I'm working with:
groupId startDate endDate version
-------- --------- ---------- ------
1 1/1/2010 1/1/2011 1
1 10/1/2010 7/5/2011 2
1 7/5/2011 8/13/2012 3
1 8/13/2012 12/31/2012 6
1 10/1/2012 11/1/2012 8
... and the desired output:
groupId startDate endDate version
-------- --------- ---------- ------
1 1/1/2010 10/1/2010 1
1 10/1/2010 7/5/2011 2
1 7/5/2011 8/13/2012 3
1 8/13/2012 10/1/2012 6
1 10/1/2012 11/1/2012 8 << note how version 8 supersedes version 6
1 11/1/2012 12/31/2012 6 << version 6 is split into two records
I haven't found any other examples of this problem, my googling only turns up queries that identify gaps and islands or covering sets.
I think I have an iterative solution (SQL Server 2008). It starts with a temp table for intervals in the result set and defines the start and end points for the range that we want to cover by inserting records with special version numbers. Then, it repeatedly identifies gaps between result set intervals and attempts to fill them with the most recent records from the original data set, until there are no more gaps or no more records to add:
GO
-- Create data set and results table
CREATE TABLE #Data (
    groupId INT
    ,startDate DATE
    ,endDate DATE
    ,versionId INT
)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES
     (1, '2007-12-22', '2008-12-22', 8)
    ,(1, '2008-12-22', '2009-12-22', 9)
    ,(1, '2009-12-22', '2010-12-22', 10)
    ,(1, '2010-12-22', '2011-12-22', 11)
    ,(1, '2011-01-01', '2011-11-30', 500)
    ,(1, '2011-12-22', '2012-12-22', 12)
    ,(1, '2012-01-22', '2012-12-22', 13)
    ,(1, '2012-01-22', '2012-12-22', 14)
    ,(1, '2012-04-22', '2012-12-22', 17)
    ,(1, '2012-04-22', '2012-12-22', 19)
    ,(2, '2010-01-01', '2011-01-01', 1)
    ,(2, '2010-10-01', '2011-07-05', 2)
    ,(2, '2011-07-05', '2012-08-13', 3)
    ,(2, '2012-08-13', '2012-12-31', 6)
    ,(2, '2012-10-01', '2012-11-01', 8)
-- groupId has the same type as #Data.groupId so joins and ORDER BY need no conversion;
-- versionId is BIGINT because the placeholder id does not fit in INT.
CREATE TABLE #Results (
    groupId INT
    ,startDate DATE
    ,endDate DATE
    ,versionId BIGINT
)
DECLARE @startDate DATE
DECLARE @endDate DATE
DECLARE @placeholderId BIGINT
-- Range to cover, and the special version number that marks boundary rows.
SET @startDate = '20030101'
SET @endDate = '20121231'
SET @placeholderId = 999999999999999
-- Seed #Results with two placeholder rows per group:
--   leading row : from the earlier to the later of (@startDate, first data startDate)
--   trailing row: from the earlier to the later of (@endDate, last data endDate)
INSERT #Results (groupId, startDate, endDate, versionId)
SELECT
    groupId
    ,CASE WHEN MIN(startDate) < @startDate THEN MIN(startDate) ELSE @startDate END
    ,CASE WHEN MIN(startDate) < @startDate THEN @startDate ELSE MIN(startDate) END
    ,@placeholderId
FROM #Data
GROUP BY groupId
UNION ALL
SELECT
    groupId
    ,CASE WHEN MAX(endDate) < @endDate THEN MAX(endDate) ELSE @endDate END
    ,CASE WHEN MAX(endDate) < @endDate THEN @endDate ELSE MAX(endDate) END
    ,@placeholderId
FROM #Data
GROUP BY groupId
GO
-- Fill gaps in results table
-- Repeatedly finds the uncovered gaps between rows of #Results and fills each
-- gap with the highest eligible version from #Data, clipped to the gap.
DECLARE @startDate DATE
DECLARE @endDate DATE
DECLARE @placeholderId BIGINT
-- Must match the values used when the placeholder rows were seeded.
SET @startDate = '20030101'
SET @endDate = '20121231'
SET @placeholderId = 999999999999999
DECLARE @counter INT
SET @counter = 0
-- At most 10 passes; raise the cap if the data needs more rounds of filling.
WHILE @counter < 10
BEGIN
    SET @counter = @counter + 1;
    WITH Gaps AS (
        SELECT
            gs.groupId
            ,gs.startDate
            ,MIN(ge.endDate) as endDate          -- nearest gap end at or after the gap start
            ,ROW_NUMBER() OVER (ORDER BY gs.groupId, gs.startDate) as gapId
        FROM (
            -- Gap starts: result end dates that no other-version result row extends past,
            -- excluding the trailing placeholder.
            SELECT groupId, endDate as startDate
            FROM #Results r1
            WHERE NOT EXISTS (
                    SELECT *
                    FROM #Results r2
                    WHERE r2.groupId = r1.groupId
                    AND r2.versionId <> r1.versionId
                    AND r2.startDate <= r1.endDate
                    AND r2.endDate > r1.endDate
                )
            AND NOT (endDate >= @endDate AND versionId = @placeholderId)
        ) gs
        INNER JOIN (
            -- Gap ends: result start dates that no other-version result row reaches from before,
            -- excluding the leading placeholder.
            SELECT groupId, startDate as endDate
            FROM #Results r1
            WHERE NOT EXISTS (
                    SELECT *
                    FROM #Results r2
                    WHERE r2.groupId = r1.groupId
                    AND r2.versionId <> r1.versionId
                    AND r2.endDate >= r1.startDate
                    AND r2.startDate < r1.startDate
                )
            AND NOT (startDate <= @startDate AND versionId = @placeholderId)
        ) ge
            ON ge.groupId = gs.groupId
            AND ge.endDate >= gs.startDate
        GROUP BY gs.groupId, gs.startDate
    )
    INSERT #Results (
        groupId
        ,startDate
        ,endDate
        ,versionId
    )
    SELECT
        d.groupId
        -- clip the data interval to the gap
        ,CASE WHEN d.startDate < g.startDate THEN g.startDate ELSE d.startDate END
        ,CASE WHEN d.endDate > g.endDate THEN g.endDate ELSE d.endDate END
        ,d.versionId
    FROM #Data d
    INNER JOIN Gaps g
        ON g.groupId = d.groupId
        AND g.startDate <= d.endDate
        AND g.endDate >= d.startDate
    INNER JOIN (
        -- Per gap: the highest version overlapping the gap that is lower than the
        -- versions already placed on either side of it and that is not entirely
        -- covered by a higher-version data row.
        SELECT
            d.groupId
            ,gapId
            ,MAX(d.versionId) as versionId
        FROM #Data d
        INNER JOIN Gaps g
            ON g.groupId = d.groupId
            AND g.startDate <= d.endDate
            AND g.endDate >= d.startDate
        WHERE d.versionId < (
                SELECT MIN(versionId)
                FROM #Results r
                WHERE r.groupId = d.groupId
                AND (r.startDate = g.endDate OR r.endDate = g.startDate)
            )
        AND NOT EXISTS (
                SELECT *
                FROM #Data dsup
                WHERE dsup.groupId = d.groupId
                AND dsup.versionId > d.versionId
                AND dsup.startDate <= d.startDate
                AND dsup.endDate >= d.endDate
            )
        GROUP BY
            d.groupId
            ,g.gapId
    ) mg
        ON mg.groupId = g.groupId
        AND mg.gapId = g.gapId
        AND mg.versionId = d.versionId
    -- Nothing inserted means #Results is unchanged, so further passes would
    -- insert nothing either.
    IF @@ROWCOUNT = 0 BREAK
END
SELECT groupId
    ,startDate
    ,endDate
    ,versionId
FROM #Results
WHERE versionId <> @placeholderId
order by groupId, startDate
A set-based solution would be much more useful, but I've struggled to find one. Any ideas?
-- Calendar table: one row per day, keyed by the date itself.
create table dates (thedate date primary key clustered);
-- Year offsets are counted from 1900 (date 0), so 100..150 yields 2000..2050;
-- change the bounds to cover a different span.
;with calendar_days(thedate) as (
    select dateadd(yy, y.number, 0) + n.number
    from master..spt_values as y
    inner join master..spt_values as n
        on n.type='p'
       -- day offsets 0 .. (days in that year - 1); the year length is the
       -- day-of-year of the last day before the next year starts
       and n.number < datepart(dy, dateadd(yy, y.number + 1, 0) - 1)
    where y.type='p'
      and y.number >= 100
      and y.number <= 150
)
insert dbo.dates
select thedate
from calendar_days;
-- Daily snapshot: for every (group, day) covered by at least one interval,
-- keep the highest version that covers that day.
select
    src.groupId,
    cal.thedate,
    max(src.versionId) as versionId
into #tmp1
from #Data as src
inner join dates as cal
    on cal.thedate >= src.startDate
   and cal.thedate <= src.endDate
group by
    src.groupId,
    cal.thedate;
-- Clustered index to support the per-group, per-day lookups on #tmp1.
create clustered index cix_tmp1
    on #tmp1 (groupId, thedate, versionId);
-- Collapse the daily snapshot in #tmp1 into runs of consecutive days with the
-- same prevailing version: one output row per run, with its first and last day.
;with t as (
    -- run starts: days whose previous day does not carry the same version
    select a.groupId, a.thedate, a.versionId,
           rn = row_number() over (partition by a.groupId order by a.thedate)
    from #tmp1 a
    where not exists (
            select *
            from #tmp1 b
            where b.groupId = a.groupId
              and b.versionId = a.versionId
              and b.thedate = dateadd(d, -1, a.thedate)
          )
)
select c.groupId,
       c.thedate as startdate,
       -- Last snapshot day that belongs to this run: the latest day before the
       -- next run start, or the group's last day for the final run. Taking it from
       -- the data (instead of "next start - 1") gives the final run a real end
       -- date and stops a run from being stretched over days nothing covers.
       (select max(x.thedate)
        from #tmp1 x
        where x.groupId = c.groupId
          and x.thedate >= c.thedate
          and (d.thedate is null or x.thedate < d.thedate)) as enddate,
       c.versionId
from t c
left join t d on d.rn = c.rn + 1 and c.groupId = d.groupId
order by c.groupId, startdate;
Of course, you can do everything in "one query" but do it at your peril, as the performance goes down the drain, big time.
DO NOT USE - for academic interest only-
-- Single-statement variant: calendar, daily snapshot and run detection in one query.
;with dates(thedate) as (
    -- generated calendar; year offsets are counted from 1900, so
    -- 100-150 creates dates in the year range 2000-2050 (adjust as required)
    select dateadd(yy,years.number,0)+days.number
    from master..spt_values years
    inner join master..spt_values days
        on days.type='p' and days.number < datepart(dy,dateadd(yy,years.number+1,0)-1)
    where years.type='p' and years.number between 100 and 150
), tmp1 as (
    -- prevailing (highest) version per group and day
    select t.groupId, d.thedate, max(t.versionId) versionId
    from dates d
    inner join #Data t on t.startDate <= d.thedate and d.thedate <= t.endDate
    group by t.groupId, d.thedate
), t as (
    -- run starts: days whose previous day does not carry the same version
    select a.groupId, a.thedate, a.versionId,
           rn = row_number() over (partition by a.groupId order by a.thedate)
    from tmp1 a
    where not exists (
            select *
            from tmp1 b
            where b.groupId = a.groupId
              and b.versionId = a.versionId
              and b.thedate = dateadd(d, -1, a.thedate)
          )
)
select c.groupId,
       c.thedate as startdate,
       -- Last covered day of this run: the latest snapshot day before the next run
       -- start, or the group's last day for the final run (which would otherwise
       -- get a NULL end date).
       (select max(x.thedate)
        from tmp1 x
        where x.groupId = c.groupId
          and x.thedate >= c.thedate
          and (d.thedate is null or x.thedate < d.thedate)) as enddate,
       c.versionId
from t c
left join t d on d.rn = c.rn + 1 and c.groupId = d.groupId
order by c.groupId, startdate;
Updated due to some feedback from the comments. I'm not going to worry about the end cases that a few people have pointed out since they've been proven trivial to solve in other Answers, but I wanted to go ahead and get a working version out that didn't require DDL... I figure it's just good to have options. :-)
This code should work:
-- Splits each group's timeline at every interval start and at the day after every
-- interval end, then reports the highest version covering the end of each segment.
;with boundary as (
    -- candidate segment starts; UNION removes duplicate dates within a group
    select groupID as groupId, startDate from #Data
    union
    select groupID, DATEADD(DAY, 1, endDate) from #Data
),
numbered as (
    select groupId,
           startDate,
           ROW_NUMBER() over (partition by groupId order by startDate) as rownumber
    from boundary
),
paired as (
    -- a segment ends the day before the next boundary; the last one ends at the fixed date
    select cur.groupId,
           cur.startDate,
           coalesce(DATEADD(DAY, -1, nxt.startDate), ('2012-12-31')) as segment_end_date
    from numbered cur
    left outer join numbered nxt
        on nxt.groupId = cur.groupId
       and nxt.rownumber - 1 = cur.rownumber
),
segment as (
    -- drop segments that would end before they start
    select groupId, startDate, segment_end_date
    from paired
    where startDate <= segment_end_date
)
select seg.groupId, seg.startDate, seg.segment_end_date, Max(src.versionId)
from segment seg
left outer join #Data src
    on src.groupId = seg.groupId
   and seg.segment_end_date between src.startDate and src.endDate
group by seg.groupId, seg.startDate, seg.segment_end_date
order by seg.groupId, seg.startDate
There are a couple of tiny caveats I had to do to get it into a single SQL statement. First, the max end date is not dynamic; I hard coded '2012-12-31'. You can replace it with a MAX(endDate), but you can't put that in the GROUP BY statement. If you can do this in a procedure, you can do:
-- Capture the latest end date in a variable, then use @max_end_date in the query
-- in place of the '2012-12-31' literal.
declare @max_end_date date
select @max_end_date = MAX(endDate) from #Data
and replace '2012-12-31' with #max_end_date.
Second, I do not guarantee that two adjacent segments won't have the same value! This may or may not be important to you... that is, if you had the following:
Interval 1: 111111
Interval 2: 22222222222222
Your output would be:
Interval 1: 2222
Interval 2: 2222222222
Still, I think it's worth hitting it in a simple and efficient SQL query. It may not be hard to fix those caveats, but it didn't matter to what I was working on, so I haven't bothered yet.
If the end dates are important, as well as gaps, here's a way you can do it. This solution could also be adapted to work if your versions are datetimes instead of just dates.
First a bunch of functions
One to get the version at a given date
-- Returns the highest VersionID among the intervals of @GroupID that contain
-- @Date, or NULL when no interval contains it.
Create Function dbo.VersionAtDate(@GroupID int, @Date datetime) Returns int as
Begin
    Declare @Ret int = Null
    Select
        @Ret = Max(VersionID)
    From
        dbo.VersionedIntervals iv
    Where
        iv.GroupID = @GroupID And
        iv.StartDate <= @Date And
        -- EndDate is inclusive, so the interval runs until the start of the next day.
        -- DateAdd is used because "+ 1" is not allowed on a DATE column.
        -- If dates were half open intervals this would just be iv.EndDate > @Date
        DateAdd(Day, 1, iv.EndDate) > @Date
    Return @Ret
End
Next to get the midpoint of two datetimes (minute resolution):
-- Returns the point halfway between @Start and @End, computed in whole minutes:
-- @Start plus half (integer division) of the minute difference.
Create Function dbo.Midpoint(@Start datetime, @End datetime) Returns datetime as
Begin
    Return DateAdd(Minute, DateDiff(Minute, @Start, @End) / 2, @Start)
End
Version at a midpoint:
-- Returns the version in effect (per dbo.VersionAtDate) at the midpoint
-- (per dbo.Midpoint) of @Start and @End for @GroupID.
Create Function dbo.VersionAtMidpoint(@GroupID int, @Start datetime, @End datetime) returns int as
Begin
    Return dbo.VersionAtDate(@GroupID, dbo.Midpoint(@Start, @End))
End;
Finally a table valued function to help with the fact that some points are the start of one range and the end of another, and it helps to get two rows from one input for this:
-- Advances a run of version @Version (spanning @Start..@End) to the next bound @Next.
-- Returns one row when the run simply continues or a different run takes over,
-- and two rows if a point is the end of one interval and the start of another.
Create Function dbo.EndPoints(@GroupID int, @RN bigint, @Start datetime, @End datetime, @Next datetime, @Version int)
Returns @EndPoints Table (
    GroupID int,
    RN bigint,
    Version int,
    StartDate datetime,
    EndDate datetime
) As
Begin
    Declare @NextVersion int, @VersionAtMidpoint int
    Set @NextVersion = dbo.VersionAtDate(@GroupID, @Next)
    If @NextVersion = @Version
        -- interval carries on
        Insert Into @EndPoints (GroupID, RN, Version, StartDate, EndDate)
        Select @GroupID, @RN, @Version, @Start, @Next
    Else
    Begin
        -- interval has ended
        Set @VersionAtMidpoint = dbo.VersionAtMidpoint(@GroupID, @End, @Next)
        If @VersionAtMidpoint != @Version
            -- we have something like this, start a run of 3s (run of 4s is already ended by previous call)
            -- 3333333
            -- 44
            Insert Into @EndPoints (GroupID, RN, Version, StartDate, EndDate)
            Select @GroupID, @RN, @VersionAtMidpoint, @End, @Next
        Else
        Begin
            -- We have something like this, end the run of 3s and start the run of fours
            -- 33333
            -- 444
            -- The closing row is emitted with RN = -1; the new run starts as a
            -- zero-length interval at @Next and carries the real row number.
            Insert Into @EndPoints (GroupID, RN, Version, StartDate, EndDate)
            Select @GroupID, -1, @Version, @Start, @Next
            Insert Into @EndPoints (GroupID, RN, Version, StartDate, EndDate)
            Select @GroupID, @RN, @NextVersion, @Next, @Next
        End
    End
    Return
End
With all this machinery in place, finally a recursive CTE plus a table variable; you'll need to set maxrecursion appropriately:
-- @Bounds: every distinct interval start/end date per group, numbered in date order.
Declare @Bounds Table (GroupID int, RN bigint, BoundDate datetime, Primary Key (GroupID, RN))
Insert Into
    @Bounds (GroupID, RN, BoundDate)
Select
    GroupID,
    Row_Number() Over (Partition By GroupID Order By BoundDate),
    BoundDate
From (
    Select
        GroupID,
        StartDate As BoundDate
    From
        dbo.VersionedIntervals
    Union    -- Union (not Union All): a date that is both a start and an end is one bound
    Select
        GroupID,
        EndDate
    From
        dbo.VersionedIntervals
) a
-- Walks the bounds of each group in order, one recursion level per bound, extending
-- or closing runs through dbo.EndPoints. The recursion depth therefore grows with
-- the number of bounds in a group; set the MaxRecursion option accordingly.
;With VersionedBounds (GroupID, RN, StartDate, EndDate, Version) as (
    -- anchor: the first bound of every group, as a zero-length run
    Select
        GroupID,
        RN,
        BoundDate,
        BoundDate,
        dbo.VersionAtDate(GroupID, BoundDate)
    From
        @Bounds
    Where
        RN = 1
    Union All
    -- step: advance each run to the next bound; rows returned with RN = -1 find no
    -- matching bound (row numbers start at 1) and so are not advanced further
    Select
        e.GroupID,
        e.RN,
        e.StartDate,
        e.EndDate,
        e.Version
    From
        @Bounds b
            Inner Join
        VersionedBounds v
            On v.GroupID = b.GroupID And b.RN = v.RN + 1
            Cross Apply
        dbo.EndPoints(v.GroupID, b.RN, v.StartDate, v.EndDate, b.BoundDate, v.Version) e
)
-- One row per run start, keeping the furthest end date reached for it.
Select
    GroupID,
    StartDate,
    Max(EndDate) As EndDate,
    Max(Version) As Version
From
    VersionedBounds
Group By
    GroupID,
    StartDate
Order By
    GroupID,
    StartDate
http://sqlfiddle.com/#!6/b95bd/2