How can I update extreme columns within range fast? - sql

I have 40 tables that look like following, and each table contains 30 million records.
Table RawData : PK(CategoryID, Time)
CategoryID Time IsSampled Value
-----------------------------------------------------------
1 2012-07-01 00:00:00.000 0 -> 1 65.36347
1 2012-07-01 00:00:11.000 0 80.16729
1 2012-07-01 00:00:14.000 0 29.19716
1 2012-07-01 00:00:25.000 0 -> 1 7.05847
1 2012-07-01 00:00:36.000 0 -> 1 98.08257
1 2012-07-01 00:00:57.000 0 75.35524
1 2012-07-01 00:00:59.000 0 35.35524
As of now, the IsSampled column is 0 for all records.
I need to update the records, so that for each CategoryID and for each minute range, the records with Max(Value), Min(Value), and the first record should have 1 for IsSampled.
Following is the procedural query I've created, but it takes too long to run. (approx. 2h 30m for each table)
-- Procedural version from the question: walks the month one minute at a time
-- and, for every CategoryID inside the current minute window, flags the rows
-- that carry the maximum Value, the minimum Value, or the earliest Time.
-- One UPDATE is issued per minute between @startRange and @endTime.
DECLARE @startRange datetime
DECLARE @endRange datetime
DECLARE @endTime datetime

SET @startRange = '2012-07-01 00:00:00.000'
SET @endTime = '2012-08-01 00:00:00.000'

WHILE (@startRange < @endTime)
BEGIN
    -- The current window is the half-open interval [@startRange, @endRange).
    SET @endRange = DATEADD(MI, 1, @startRange)

    UPDATE r1
    SET IsSampled = 1
    FROM RawData AS r1
    JOIN
    (
        -- Per-category extremes and first timestamp inside the window.
        SELECT r2.CategoryID,
               MAX(Value) AS MaxValue,
               MIN(Value) AS MinValue,
               MIN([Time]) AS FirstTime
        FROM RawData AS r2
        WHERE @startRange <= [Time] AND [Time] < @endRange
        GROUP BY CategoryID
    ) AS samples
        ON r1.CategoryID = samples.CategoryID
       AND (r1.Value = samples.MaxValue
            OR r1.Value = samples.MinValue
            OR r1.[Time] = samples.FirstTime)
       AND @startRange <= r1.[Time] AND r1.[Time] < @endRange

    -- Advance to the next minute.
    SET @startRange = DATEADD(MI, 1, @startRange)
END
Is there a way to update these tables faster(presumably in non-procedural way)? Thanks!

I'm not sure what the performance of this will be like, but it's a more set-based approach than your current one:
-- Set-based version: every row is assigned to its minute bin, ranked three
-- ways inside its (CategoryID, minute) bin, and the rank-1 rows are flagged
-- in a single UPDATE. @T is a table variable holding the question's sample rows.
declare @T table (CategoryID int not null,Time datetime2 not null,IsSampled bit not null,Value decimal(10,5) not null)
insert into @T (CategoryID,Time,IsSampled,Value) values
(1,'2012-07-01T00:00:00.000',0,65.36347),
(1,'2012-07-01T00:00:11.000',0,80.16729),
(1,'2012-07-01T00:00:14.000',0,29.19716),
(1,'2012-07-01T00:00:25.000',0,7.05847),
(1,'2012-07-01T00:00:36.000',0,98.08257),
(1,'2012-07-01T00:00:57.000',0,75.35524),
(1,'2012-07-01T00:00:59.000',0,35.35524)
;with BinnedValues as (
    -- TimeBin is Time truncated to the whole minute.
    select CategoryID,Time,IsSampled,Value,DATEADD(minute,DATEDIFF(minute,0,Time),0) as TimeBin
    from @T
), MinMax as (
    -- Position 1 in each ordering marks the min Value, max Value and earliest
    -- Time of the bin. ROW_NUMBER breaks ties arbitrarily, so exactly one row
    -- is picked per criterion.
    select CategoryID,Time,IsSampled,Value,TimeBin,
        ROW_NUMBER() OVER (PARTITION BY CategoryID, TimeBin ORDER BY Value) as MinPos,
        ROW_NUMBER() OVER (PARTITION BY CategoryID, TimeBin ORDER BY Value desc) as MaxPos,
        ROW_NUMBER() OVER (PARTITION BY CategoryID, TimeBin ORDER BY Time) as Earliest
    from
        BinnedValues
)
-- The CTE is updatable because IsSampled comes straight from @T.
update MinMax set IsSampled = 1 where MinPos=1 or MaxPos=1 or Earliest=1
select CategoryID,Time,IsSampled,Value from @T
Result:
CategoryID Time IsSampled Value
----------- ---------------------- --------- ---------------------------------------
1 2012-07-01 00:00:00.00 1 65.36347
1 2012-07-01 00:00:11.00 0 80.16729
1 2012-07-01 00:00:14.00 0 29.19716
1 2012-07-01 00:00:25.00 1 7.05847
1 2012-07-01 00:00:36.00 1 98.08257
1 2012-07-01 00:00:57.00 0 75.35524
1 2012-07-01 00:00:59.00 0 35.35524
It could possibly be sped up if the TimeBin column could be added as a computed column to the table and added to appropriate indexes.
It should also be noted that this will mark a maximum of 3 rows as sampled - if the earliest is also the min or max value, it will only be marked once (obviously), but the next nearest min or max value will not be. Also, if multiple rows have the same Value, and that is the min or max value, one of the rows will be selected arbitrarily.

You could rewrite update in the loop to something like:
-- Replacement for the UPDATE inside the per-minute loop; @startRange and
-- @endRange are the loop's window bounds.
-- A row is flagged when it is the earliest row, OR holds the minimum Value,
-- OR holds the maximum Value of its category within the window. Each
-- criterion needs its own NOT EXISTS: one combined NOT EXISTS would only
-- flag a row that is first, minimum and maximum at the same time.
UPDATE r1
SET IsSampled = 1
FROM RawData AS r1
WHERE r1.[Time] >= @startRange AND r1.[Time] < @endRange
  AND (
        -- no earlier row in the window => r1 is the first record
        NOT EXISTS
        (
            SELECT *
            FROM RawData AS r2
            WHERE r2.CategoryID = r1.CategoryID
              AND r2.[Time] >= @startRange AND r2.[Time] < @endRange
              AND r2.[Time] < r1.[Time]
        )
        -- no smaller Value in the window => r1 holds the minimum
        OR NOT EXISTS
        (
            SELECT *
            FROM RawData AS r2
            WHERE r2.CategoryID = r1.CategoryID
              AND r2.[Time] >= @startRange AND r2.[Time] < @endRange
              AND r2.Value < r1.Value
        )
        -- no larger Value in the window => r1 holds the maximum
        OR NOT EXISTS
        (
            SELECT *
            FROM RawData AS r2
            WHERE r2.CategoryID = r1.CategoryID
              AND r2.[Time] >= @startRange AND r2.[Time] < @endRange
              AND r2.Value > r1.Value
        )
      )
To get actual performance improvement you need an index on Time column.

Hi try this query.
-- Flags rows of the sample table variable @T whose time-of-day equals the
-- latest or earliest non-midnight time-of-day of their category, or midnight.
-- NOTE(review): the selection is driven by the time-of-day, not by Value, and
-- it is not done per minute, so it does not implement the max/min Value
-- requirement from the question. Confirm before reusing.
declare @T table (CategoryID int not null,Time datetime2 not null,IsSampled bit not null,Value decimal(10,5) not null)
insert into @T (CategoryID,Time,IsSampled,Value) values
(1,'2012-07-01T00:00:00.000',0,65.36347),
(1,'2012-07-01T00:00:11.000',0,80.16729),
(1,'2012-07-01T00:00:14.000',0,29.19716),
(1,'2012-07-01T00:00:25.000',0,7.05847),
(1,'2012-07-01T00:00:36.000',0,98.08257),
(1,'2012-07-01T00:00:57.000',0,75.35524),
(1,'2012-07-01T00:00:59.000',0,35.35524)
;WITH CTE as (
    -- Strip the date part, keeping only the time-of-day.
    SELECT CategoryID,CAST([Time] as Time) as time,IsSampled,Value FROM @T
)
,CTE2 as (
    -- Per category: latest and earliest non-midnight time-of-day, plus the
    -- literal midnight value as "start".
    SELECT CategoryID,Max(time) mx,MIN(time) mn,'00:00:00.0000000' as start
    FROM CTE
    where time <> '00:00:00.0000000'
    Group by CategoryID
)
-- Update through the alias so the target is unambiguously the joined @T instance.
update t SET IsSampled=1
FROM CTE2 c
inner join @T t
    on c.CategoryID = t.CategoryID
   and (CAST(t.[Time] as Time)=c.mx or CAST(t.[Time] as Time)=c.mn or CAST(t.[Time] as Time)=c.start)
select CategoryID,Time,IsSampled,Value from @T

Hi Here is the latest updated query.
Check the query for performance:
-- Flags rows of the sample table variable @T whose Value equals the max or
-- min Value among the non-midnight rows of their category, or the max Value
-- among the midnight rows.
-- NOTE(review): extremes are computed per category over the whole table, not
-- per minute, so this only matches the requirement when all rows fall within
-- one minute, as in the sample. Confirm before reusing.
declare @T table (CategoryID int not null,Time datetime2 not null,IsSampled bit not null,Value decimal(10,5) not null)
insert into @T (CategoryID,Time,IsSampled,Value) values
(1,'2012-07-01T00:00:00.000',0,65.36347),
(1,'2012-07-01T00:00:11.000',0,80.16729),
(1,'2012-07-01T00:00:14.000',0,29.19716),
(1,'2012-07-01T00:00:25.000',0,7.05847),
(1,'2012-07-01T00:00:36.000',0,98.08257),
(1,'2012-07-01T00:00:57.000',0,75.35524),
(1,'2012-07-01T00:00:59.000',0,35.35524)
;WITH CTE as (
    SELECT CategoryID,Time,CAST([Time] as Time) as timepart,IsSampled,Value FROM @T
)
-- The original also filtered on "Time <= DATEADD(MM,1,Time)", which holds for
-- every row; that no-op predicate has been removed from CTE2 and CTE3.
,CTE2 as (
    -- Extremes over the rows that are not exactly at midnight.
    SELECT CategoryID,Max(value) mx,MIN(value) mn FROM CTE
    where timepart <> '00:00:00.0000000'
    Group by CategoryID
)
,CTE3 as (
    -- Extremes over the rows that are exactly at midnight.
    SELECT CategoryID,Max(value) mx,MIN(value) mn FROM CTE
    where timepart = '00:00:00.0000000'
    Group by CategoryID
)
-- Update through the alias so the target is unambiguously the joined @T instance.
update t SET IsSampled=1
FROM @T t
left join CTE2 c1
    on (t.CategoryID = c1.CategoryID and (t.Value = c1.mn or t.Value = c1.mx))
left join CTE3 c3
    on (t.CategoryID = c3.CategoryID and t.Value = c3.mx)
-- Keep only rows that matched at least one of the two joins.
where (c1.CategoryID is not null or c3.CategoryID is not null)
select CategoryID,Time,IsSampled,Value from @T

Related

Selecting count of consecutives dates before and after a specified date based on start/end

I'm trying to determine the number of records with consecutive dates (previous record ends on the same date as the start date of the next record) before and after a specified date, and ignore any consecutive records as soon as there is a break in the chain.
If I have the following data:
-- Question's script: builds a temp table of date ranges, copies each row's
-- predecessor endDate into prevEndDate, then counts rows whose startDate
-- equals that predecessor endDate before and after @dateToCheck.
-- As the question itself states, these counts include every consecutive row
-- and do not stop at a break in the chain.

-- declare vars
DECLARE @dateToCheck date = '2020-09-20'
DECLARE @numRecsBefore int = 0
DECLARE @numRecsAfter int = 0
DECLARE @tempID int
-- temp table
CREATE TABLE #dates
(
    [idx] INT IDENTITY(1,1),
    [startDate] DATETIME ,
    [endDate] DATETIME,
    [prevEndDate] DATETIME
)
-- insert temp table
-- NOTE(review): the last row ends before it starts ('2020-10-01' .. '2020-09-05');
-- possibly a typo for '2020-10-05' - confirm.
INSERT INTO #dates
( [startDate], [endDate] )
VALUES ( '2020-09-01', '2020-09-04' ),
( '2020-09-04', '2020-09-10' ),
( '2020-09-10', '2020-09-16' ),
( '2020-09-17', '2020-09-19' ),
( '2020-09-19', '2020-09-20' ),
--
( '2020-09-20', '2020-09-23' ),
( '2020-09-25', '2020-09-26' ),
( '2020-09-27', '2020-09-28' ),
( '2020-09-28', '2020-09-30' ),
( '2020-10-01', '2020-09-05' )
-- update with previous records endDate
DECLARE @maxRows int = (SELECT MAX(idx) FROM #dates)
DECLARE @intCount int = 0
WHILE @intCount <= @maxRows
BEGIN
    -- Row idx = @intCount receives the endDate of row idx = @intCount - 1
    -- (NULL when no such row exists, e.g. for the first row).
    UPDATE #dates SET prevEndDate = (SELECT endDate FROM #dates WHERE idx = (@intCount - 1) ) WHERE idx=@intCount
    SET @intCount = @intCount + 1
END
-- clear any breaks in the chain?
-- number of consecutive records before this date
SET @numRecsBefore = (SELECT COUNT(idx) FROM #dates WHERE startDate = prevEndDate AND endDate <= @dateToCheck)
-- number of consecutive records after this date
SET @numRecsAfter = (SELECT COUNT(idx) FROM #dates WHERE startDate = prevEndDate AND endDate >= @dateToCheck)
-- return & clean up
SELECT * FROM #dates
SELECT @numRecsBefore AS numBefore, @numRecsAfter AS numAfter
DROP TABLE #dates
With the specified date being '2020-09-20', I would expect @numRecsBefore = 2 and @numRecsAfter = 1. That is not what I am getting, as it's counting all the consecutive records.
There has to be a better way to do this. I know the loop isn't optimal, but I couldn't get LAG() or LEAD() to work. I've spent all morning trying different methods and searching, but nothing I find deals with two dates, or with breaks in the chain.
This reads like a gaps-and-islands problem. Islands represent rows whose date ranges are adjacent, and you want to count how many records precede or follow a given date in the same island.
You could do:
-- Counts the rows that precede / follow @dateToCheck inside the same island
-- of adjacent date ranges in #dates. @dateToCheck and #dates come from the
-- question's script.
select
    -- Row whose range contains the date (open at its start): running count up to that row.
    max(case when @dateToCheck > startdate and @dateToCheck <= enddate then numRecsBefore end) as numRecsBefore,
    -- Row whose range contains the date (open at its end): running count from that row onward.
    max(case when @dateToCheck >= startdate and @dateToCheck < enddate then numRecsAfter end) as numRecsAfter
from (
    select d.*,
        -- Running row counts within the island, forwards and backwards.
        count(*) over(partition by grp order by startdate) as numRecsBefore,
        count(*) over(partition by grp order by startdate desc) as numRecsAfter
    from (
        select d.*,
            -- A row starts a new island when its startdate differs from the
            -- previous row's enddate; the running sum numbers the islands.
            sum(case when startdate = lag_enddate then 0 else 1 end) over(order by startdate) as grp
        from (
            select d.*,
                lag(enddate) over(order by startdate) as lag_enddate
            from #dates d
        ) d
    ) d
) d
This uses lag() and a cumulative sum() to define the islands. Then a window count gives the number of preceding and following records on the same island. The final step is conditional aggregation; extra care needs to be taken with the inequalities to take into account various possibilities (typically, the date you search for might not always match a range bound).
Demo on DB Fiddle
I think this is what you are after; however, it does not give the results stated in your question, and I suspect that is because those aren't the correct expected results. One of the conditional aggregates may also need to use >= or <=, but I don't know which:
-- Counts, on either side of @dateToCheck, the rows of #dates whose startDate
-- equals the previous row's endDate. @dateToCheck and #dates come from the
-- question's script. Breaks in the chain are not taken into account.
WITH CTE AS(
    SELECT startDate,
           endDate,
           -- 1 when this row starts where the previous one ended, otherwise NULL.
           CASE startDate WHEN LAG(endDate) OVER (ORDER BY startDate ASC) THEN 1 END AS IsSame
    FROM #dates d)
-- COUNT ignores NULLs, so only IsSame = 1 rows on each side are counted.
SELECT COUNT(CASE WHEN startDate < @dateToCheck THEN IsSame END) AS numBefore,
       COUNT(CASE WHEN startDate > @dateToCheck THEN IsSame END) AS numAfter
FROM CTE;

SQL sum a particular row based on condition?

I am using MS SQL Server. This is the table I have:
-- Sample table for the question: one value per row with the period
-- (startdate .. enddate) during which it applies.
-- Val is declared NVARCHAR although it holds numbers, so any aggregate over
-- it needs an explicit cast.
Create table tblVal
(
    Id int identity(1,1),
    Val NVARCHAR(100),
    startdate datetime,
    enddate datetime
)
--Inserting Records--
-- Explicit column lists keep the inserts valid if the table layout changes;
-- Id is generated by the identity property.
Insert into tblVal (Val, startdate, enddate) values(500,'20180907','20191212')
Insert into tblVal (Val, startdate, enddate) values(720,'20190407','20191212')
Insert into tblVal (Val, startdate, enddate) values(539,'20190708','20201212')
Insert into tblVal (Val, startdate, enddate) values(341,'20190221','20190712')
Table as this:
Id |Val |startdate |enddate
--- ----------------------------------------------
1 |500 |2018-09-07 |2019-12-12
2 |720 |2019-04-07 |2019-12-12
3 |539 |2019-07-08 |2020-12-12
4 |341 |2019-02-21 |2019-07-12
This is what I want:
Mon | Total
------------------
Jan | 500
Feb | 841
March | 841
April | 1561
May | 1561
June | 1561
July | 2100
........|.........
I want to sum Val column if it lies in that particular month. For ex. in case of April month it lies between two of the rows. I have to check both the condition start date and end date. and then sum the values.
This is what I have tried:
-- Question's attempt: copies tblVal into a temp table and runs one SELECT
-- per counter value 0..11.
-- NOTE(review): both CASE branches return 0, so every iteration can only
-- produce 0 (or NULL for an empty table); this is why the attempt does not
-- yield the wanted totals.
select *
into #ControlTable
from dbo.tblVal
DECLARE @cnt INT = 0;
while @cnt<12
begin
    select sum(CASE
                   WHEN MONTH(startdate) BETWEEN @cnt and MONTH(enddate) THEN 0
                   ELSE 0
               END)
    from #ControlTable
    SET @cnt = @cnt + 1;
end
drop table #ControlTable
but from above I was unable to achieve the result.
How do I solve this? Thanks.
I believe you want something like this:
-- Produces one row per calendar month between the earliest startdate and the
-- latest enddate in tblVal, with the total of Val over all rows whose
-- [startdate, enddate] period overlaps that month.
with dates as (
    -- Anchor: first day of the earliest start month and of the latest end month.
    select min(datefromparts(year(startdate), month(startdate), 1)) as dte,
           max(datefromparts(year(enddate), month(enddate), 1)) as enddte
    from tblVal
    union all
    -- Recursive step: advance one month until the last month is reached.
    select dateadd(month, 1, dte), enddte
    from dates
    where dte < enddte
)
select d.dte,
       -- Val is NVARCHAR(100); SUM does not accept character types, so cast first.
       sum(cast(t.val as int)) as total
from dates d left join
     tblval t
     -- Overlap test with a half-open month [dte, dte + 1 month), so a
     -- startdate with a time part on the last day of the month still counts.
     on t.startdate < dateadd(month, 1, d.dte) and
        t.enddate >= d.dte
group by d.dte
order by d.dte;
This does the calculation for all months in the data.
The results are a bit different from your sample results, but seem more consistent with the data provided.
Here is a db<>fiddle.
Hi, if I understand your question correctly, I think this query answers it:
-- Builds temp copies of the sample data and a 1..12 month list, then returns a
-- running total of Val over the months, attributing each row to the month of
-- its startdate only.
-- NOTE(review): enddate and the year are not considered, so a value is not
-- limited to the months its period actually covers - confirm this matches the
-- wanted output.
Create table #tblVal
(
Id int identity(1,1),
Val NVARCHAR(100),
startdate datetime,
enddate datetime
)
--Inserting Records--
Insert into #tblVal values(500,'20180907','20191212')
Insert into #tblVal values(720,'20190407','20191212')
Insert into #tblVal values(539,'20190708','20201212')
Insert into #tblVal values(341,'20190221','20190712')
-- Month numbers 1..12 to drive the report.
Create table #tblMonth ( iMonth int)
Insert into #tblMonth values(1),(2),(3),(4),(5),(6),(7),(8),(9),(10),(11),(12);
select * from #tblVal
select * from #tblMonth
-- Months without a matching row contribute 0; Val is NVARCHAR so it is cast to int.
SELECT *, SUM(case when Val is null then 0 else cast (Val as int) end) OVER(ORDER BY iMonth
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as 'Totaltime'
FROM #tblMonth
LEFT JOIN #tblVal ON MONTH(startdate) = iMonth
ORDER BY iMonth
drop table #tblVal
drop table #tblMonth
Note: you need SQL Server 2012 or later to use OVER(ORDER BY iMonth
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW); the ROWS/RANGE window frame is not available in earlier versions.
Link :
https://learn.microsoft.com/en-us/sql/t-sql/queries/select-over-clause-transact-sql?view=sql-server-2017
If you have older version you can use CTE or JOIN ON select .
-- Fills a table variable with one row per month number 1..12 holding the
-- total of Val over the tblVal rows whose start month .. end month range
-- contains that month number.
-- NOTE(review): only MONTH() is compared, so the year is ignored and periods
-- that cross a year boundary are not handled - confirm.
DECLARE @outpuTable Table(
    MOn INT,
    Total nvarchar(MAX)
)
DECLARE @cnt INT = 1;
while (@cnt<=12)
begin
    INSERT INTo @outpuTable (MOn, Total) VALUES(@cnt,
        -- Val is NVARCHAR, so convert before summing; 0 when no row matches.
        (select ISNULL(sum(CONVERT(INT,Val)),0)
         from tblVal
         WHERE @cnt BETWEEN MONTH(startdate) and MONTH(enddate) ))
    SET @cnt = @cnt + 1;
end
Select MOn, Total from @outpuTable

SQL - Selecting rows with dates before and after column value change

I have a table called test.
In test I have An ID, a value and a date.
The dates are ordered for each ID.
I want to select rows for an ID, before and after a change of value, so the following example table.
RowNum--------ID------- Value -------- Date
1------------------001 ---------1----------- 01/01/2015
2------------------001 ---------1----------- 02/01/2015
3------------------001 ---------1----------- 04/01/2015
4------------------001 ---------1----------- 05/01/2015
5------------------001 ---------1----------- 06/01/2015
6------------------001 ---------1----------- 08/01/2015
7------------------001 ---------0----------- 09/01/2015
8------------------001 ---------0----------- 10/01/2015
9------------------001 ---------0----------- 11/01/2015
10-----------------001 ---------1----------- 12/01/2015
11-----------------001 ---------1----------- 14/01/2015
12------------------002 ---------1----------- 01/01/2015
13------------------002 ---------1----------- 04/01/2015
14------------------002 ---------0----------- 05/01/2015
15------------------002 ---------0----------- 07/01/2015
The result would return rows 6, 7, 9, 10, 13, 14
You could use analytic functions LAG() and LEAD() to access value in preceding and following rows, then check that it does not match value in current row.
-- Returns the rows of test that sit immediately before or after a change of
-- Value within the same ID.
SELECT RowNum,
       ID,
       Value,
       Date,
       PrevValue,
       NextValue
FROM (
    SELECT RowNum,
           ID,
           Value,
           Date,
           -- PARTITION BY ID keeps the comparison inside one ID, so the last
           -- row of one ID is never compared with the first row of the next.
           -- The third argument makes the first/last row of each ID compare
           -- equal to itself instead of NULL.
           LAG(VALUE, 1, VALUE) OVER(PARTITION BY ID ORDER BY RowNum) PrevValue,
           LEAD(VALUE, 1, VALUE) OVER(PARTITION BY ID ORDER BY RowNum) NextValue
    FROM test
) t -- SQL Server requires an alias on a derived table
WHERE PrevValue <> Value
   OR NextValue <> Value
Params passed to this functions are
some scalar expression (column name in this case);
offset (1 row before or after);
default value (LAG() will return NULL for first row and LEAD() will return NULL for last row, but they don't seem special in your question, so I used column value as default).
Refer the below one for without using LEAD and LAG:
-- Loop-based alternative to LAG/LEAD: copies test into a numbered temp table
-- ordered by (ID, Date), walks it row by row, and stores the pair of rows on
-- either side of every change of Value within one ID in #result.
-- Fixes over the original: the working table now carries the value column it
-- reads, the source is the question's table "test" (the original read from
-- the keyword "table"), the later reads use #temp1 (the original used a
-- non-existent #temp), and a change of ID is no longer reported as a change
-- of Value.
DECLARE @i        INT = 1,
        @cnt      INT,
        @srcid    INT,
        @dstid    INT,
        @dstvalue INT,
        @srcvalue INT

CREATE TABLE #result
(
    id     INT,
    mydate DATE
)

CREATE TABLE #temp1
(
    rn     INT IDENTITY(1, 1),
    id     INT,
    value  INT,
    mydate DATE
)

-- The ORDER BY makes the identity values follow (ID, Date) order.
INSERT INTO #temp1
            (id,
             value,
             mydate)
SELECT ID,
       Value,
       [Date]
FROM   test
ORDER  BY ID,
          [Date]

SELECT @cnt = Count(*)
FROM   #temp1

-- Seed the "previous row" state with the first row.
SELECT @srcid = id,
       @srcvalue = value
FROM   #temp1
WHERE  rn = @i

WHILE ( @i <= @cnt )
  BEGIN
      SELECT @dstid = id,
             @dstvalue = value
      FROM   #temp1
      WHERE  rn = @i

      -- Same ID and same value as the previous row: nothing to report.
      IF( @dstid = @srcid AND @srcvalue = @dstvalue )
        BEGIN
            SET @i = @i + 1
            CONTINUE;
        END

      -- Same ID but a different value: store the rows before and after the change.
      IF( @dstid = @srcid )
        BEGIN
            INSERT INTO #result
                        (id,
                         mydate)
            SELECT id,
                   mydate
            FROM   #temp1
            WHERE  rn = @i - 1
            UNION ALL
            SELECT id,
                   mydate
            FROM   #temp1
            WHERE  rn = @i
        END

      -- Carry the current row forward; on a new ID this simply restarts the comparison.
      SET @srcid = @dstid
      SET @srcvalue = @dstvalue
      SET @i = @i + 1
  END

SELECT id,
       mydate
FROM   #result
The answer using lag() and lead() is the right answer. If you are using a pre-SQL Server 2012 version, then you can do essentially the same thing using cross apply or a correlated subquery:
-- Pre-2012 alternative to LAG/LEAD: for every row of test, look up the
-- closest earlier and closest later row of the same ID and keep the row when
-- its value differs from either neighbour.
-- Fixes over the original: the lookups are correlated on id, so neighbours
-- from another ID are not picked up; each neighbour is compared with the
-- current row rather than with the other neighbour; and OUTER APPLY keeps the
-- first and last row of an ID, which have only one neighbour.
select t.*
from test t
outer apply
     (select top 1 tprev.value
      from test tprev
      where tprev.id = t.id
        and tprev.date < t.date
      order by tprev.date desc
     ) tprev
outer apply
     (select top 1 tnext.value
      from test tnext
      where tnext.id = t.id
        and tnext.date > t.date
      order by tnext.date asc
     ) tnext
-- A missing neighbour yields NULL, which never satisfies <>.
where tprev.value <> t.value
   or tnext.value <> t.value;

Convert Procedural Approach into Set Based Approach in Sql-Server

We are using procedural approach (while loop) for inserting records into a particular table. the insert syntax is like below,
-- Question's procedural insert: runs seven iterations, and for each one
-- stores a row in #Tmpdata whose flag tells whether the date falls inside any
-- YEARRANGE tax season.
-- NOTE(review): @WEEK is advanced by @CNT days from its previous value on
-- every pass (0, 1, 2, ... days), so the dates are not seven consecutive days
-- - confirm whether that is intended.
-- NOTE(review): @lvCounter is not declared in this snippet.
DECLARE @CNT INT = 0,
        @WEEK DATE = '2015-11-01',
        @FLAG INT
CREATE TABLE #Tmpdata (officeId int,id smallint, weekDate date,startsOn varchar(10),endsOn varchar(10),flag bit);
WHILE (@CNT <7)
BEGIN
    SET @WEEK = DATEADD(D,@CNT,@WEEK )
    -- Flag is 1 when the date lies inside at least one tax season (bounds inclusive).
    IF EXISTS
        (SELECT 1
         FROM YEARRANGE
         WHERE @WEEK BETWEEN CONVERT(DATE,taxseasonbegin)
                         AND CONVERT (DATE,taxSeasonEnd)
        )
    BEGIN
        SET @FLAG =1
    END
    ELSE
    BEGIN
        SET @FLAG = 0
    END
    INSERT INTO #Tmpdata
    (
        officeId,id,weekDate,startsOn,endsOn,flag
    )
    VALUES
    (
        5134,@lvCounter,@WEEK,'09:00 AM','05:00 PM',@FLAG
    );
    SET @CNT=@CNT+1;
end
(NOTE : TaxSeason is from january to august).
Is it possible to re-write the above logic in set based approach?
This makes a number of assumptions because you didn't post DDL or any consumable sample data. Also, there is a variable @lvCounter that is not defined in your code. This is a perfect opportunity to use a tally or numbers table instead of a loop.
-- Set-based replacement for the loop: a 10-row tally supplies N = 1..7, each
-- N becomes one day starting at @WEEK, and Flag tells whether that day falls
-- inside any YearRange tax season.
-- @lvCounter is given a placeholder value because the question never defines it.
declare @lvCounter int = 42;
DECLARE @WEEK DATE = '2015-11-01';
WITH
E1(N) AS (select 1 from (values (1),(1),(1),(1),(1),(1),(1),(1),(1),(1))dt(n))
, cteTally(N) AS
(
    SELECT ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E1
)
select 5134 as officeId
    , @lvCounter as Id
    , d.weekDate
    , '09:00 AM' as startsOn
    , '05:00 PM' as EndOn
    , y.Flag
from cteTally t
cross apply
(
    -- Compute the day once so the output column and the flag test use the same
    -- date (the original tested N days ahead while emitting N - 1).
    select DATEADD(DAY, t.N - 1, @WEEK) as weekDate
) d
cross apply
(
    -- Inclusive on both ends, like the BETWEEN in the question's loop.
    -- Any positive count casts to bit 1.
    select CAST(count(*) as bit) as Flag
    from YearRange
    where d.weekDate >= CONVERT(DATE,taxseasonbegin)
      AND d.weekDate <= CONVERT(DATE,taxSeasonEnd)
) y
where t.N <= 7;
Please can you provide sample data?
You can do something like:
SELECT DateIncrement = SUM(DATEADD(D,#CNT,#WEEK)) OVER (ORDER BY officeID)
FROM...
This gets an incremented date value for each record which you can then check against your start and end dates.
You could try something like this. It gives you the data I think you need for your insert. I do not have a table named YEARRANGE, so I couldn't test it completely.
-- Set-based insert into #Tmpdata: seven generated rows (cnt = 0..6), each
-- with a date derived from @WEEK and a flag telling whether that date lies in
-- any YEARRANGE tax season.
DECLARE @WEEK DATE = '2015-11-01';
CREATE TABLE #Tmpdata (officeId int,id smallint, weekDate date,startsOn varchar(10),endsOn varchar(10),flag bit);
WITH CTE AS
(
    SELECT num AS cnt,
           -- The day offset is the running sum 0, 1, 3, 6, ... of num, which
           -- mirrors how the question's loop keeps adding @CNT to @WEEK.
           DATEADD(D, SUM(num) OVER(ORDER BY num ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
                    , @WEEK) AS [week]
    FROM
    (
        SELECT ROW_NUMBER() OVER (ORDER BY nl) -1 AS num
        FROM
        (SELECT NULL AS nl UNION ALL SELECT NULL AS nl UNION ALL SELECT NULL AS nl UNION ALL SELECT NULL AS nl
         UNION ALL SELECT NULL AS nl UNION ALL SELECT NULL AS nl UNION ALL SELECT NULL AS nl
        ) AS ni
    ) AS no
)
INSERT INTO #Tmpdata (officeId,id,weekDate,startsOn,endsOn,flag)
SELECT 5134 AS officeID,
       c.cnt AS id,
       c.[week],
       '09:00 AM' AS startsOn,
       '05:00 PM' AS endsOn,
       -- EXISTS yields exactly one row per date; the original OUTER APPLY
       -- returned one row per matching YEARRANGE row and could duplicate dates.
       CASE WHEN EXISTS (SELECT 1
                         FROM YEARRANGE
                         WHERE c.[week] BETWEEN CONVERT(DATE,taxseasonbegin)
                                            AND CONVERT (DATE,taxSeasonEnd))
            THEN 1 ELSE 0
       END AS flag
FROM CTE AS c;

In a set of overlapping, version-numbered intervals, find the most recent version at each point in time

I'm working with a set of date intervals where each interval has a version number and new intervals will frequently overlap old ones, or even be subsets of them. From this data I need to calculate a new set of intervals that shows the most recent version number, at each point in time. Is there a set-based solution to this problem?
Here's an illustration:
Interval 1: 11111111111111111111111
Interval 2: 2222222222
Interval 3: 33333333333333
Interval 4: 444444444
Interval 5: 555555555
Result : 11333333333333331155555555544
Here is a sample of the data I'm working with:
groupId startDate endDate version
-------- --------- ---------- ------
1 1/1/2010 1/1/2011 1
1 10/1/2010 7/5/2011 2
1 7/5/2011 8/13/2012 3
1 8/13/2012 12/31/2012 6
1 10/1/2012 11/1/2012 8
... and the desired output:
groupId startDate endDate version
-------- --------- ---------- ------
1 1/1/2010 10/1/2010 1
1 10/1/2010 7/5/2011 2
1 7/5/2011 8/13/2012 3
1 8/13/2012 10/1/2012 6
1 10/1/2012 11/1/2012 8 << note how version 8 supersedes version 6
1 11/1/2012 12/31/2012 6 << version 6 is split into two records
I haven't found any other examples of this problem, my googling only turns up queries that identify gaps and islands or covering sets.
I think I have an iterative solution (SQL Server 2008). It starts with a temp table for intervals in the result set and defines the start and end points for the range that we want to cover by inserting records with special version numbers. Then, it repeatedly identifies gaps between result set intervals and attempts to fill them with the most recent records from the original data set, until there are no more gaps or no more records to add:
GO
-- Create data set and results table
CREATE TABLE #Data (
    groupId INT
    ,startDate DATE
    ,endDate DATE
    ,versionId INT
)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2007-12-22', '2008-12-22', 8)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2008-12-22', '2009-12-22', 9)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2009-12-22', '2010-12-22', 10)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2010-12-22', '2011-12-22', 11)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2011-01-01', '2011-11-30', 500)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2011-12-22', '2012-12-22', 12)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2012-01-22', '2012-12-22', 13)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2012-01-22', '2012-12-22', 14)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2012-04-22', '2012-12-22', 17)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2012-04-22', '2012-12-22', 19)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (2, '2010-01-01', '2011-01-01', 1)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (2, '2010-10-01', '2011-07-05', 2)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (2, '2011-07-05', '2012-08-13', 3)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (2, '2012-08-13', '2012-12-31', 6)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (2, '2012-10-01', '2012-11-01', 8)
-- NOTE(review): groupId is VARCHAR(10) here but INT in #Data, so joins between
-- the two rely on implicit conversion - confirm this is intended.
CREATE TABLE #Results (
    groupId VARCHAR(10)
    ,startDate DATE
    ,endDate DATE
    ,versionId BIGINT
)
DECLARE @startDate DATE
DECLARE @endDate DATE
DECLARE @placeholderId BIGINT
SET @startDate = '20030101'
SET @endDate = '20121231'
-- Special version number that marks the two boundary rows of every group.
SET @placeholderId = 999999999999999
-- Seed #Results with one placeholder interval at the start and one at the end
-- of each group's range; each CASE pair orders the two dates so that
-- startDate <= endDate.
INSERT #Results
SELECT DISTINCT
    groupId
    ,CASE WHEN MIN(startDate) < @startDate THEN MIN(startDate) ELSE @startDate END
    ,CASE WHEN MIN(startDate) < @startDate THEN @startDate ELSE MIN(startDate) END
    ,@placeholderId
FROM #data
GROUP BY groupId
UNION ALL
SELECT DISTINCT
    groupId
    ,CASE WHEN MAX(endDate) < @endDate THEN MAX(endDate) ELSE @endDate END
    ,CASE WHEN MAX(endDate) < @endDate THEN @endDate ELSE MAX(endDate) END
    ,@placeholderId
FROM #data
GROUP BY groupId
GO
-- Fill gaps in results table
-- Runs at most 10 passes. Each pass finds the gaps between the intervals
-- already in #Results and inserts, for each gap, the highest-version #Data
-- row that qualifies, clipped to the gap.
-- Variables are re-declared because the preceding GO ended the batch in which
-- they were first declared.
DECLARE @startDate DATE
DECLARE @endDate DATE
DECLARE @placeholderId BIGINT
SET @startDate = '20030101'
-- NOTE(review): the setup batch uses '20121231' for @endDate - confirm the
-- different value here is intended.
SET @endDate = '20111231'
SET @placeholderId = 999999999999999
DECLARE @counter INT
SET @counter = 0
WHILE @counter < 10
BEGIN
    SET @counter = @counter + 1;
    WITH Gaps AS (
        -- A gap starts at an interval end that no other-version interval
        -- covers, and ends at the nearest following uncovered interval start.
        SELECT
            gs.groupId
            ,gs.startDate
            ,MIN(ge.endDate) as endDate
            ,ROW_NUMBER() OVER (ORDER BY gs.groupId, gs.startDate) as gapId
        FROM (
            -- Candidate gap starts.
            SELECT groupId, endDate as startDate
            FROM #Results r1
            WHERE NOT EXISTS (
                SELECT *
                FROM #Results r2
                WHERE r2.groupId = r1.groupId
                AND r2.versionId <> r1.versionId
                AND r2.startDate <= r1.endDate
                AND r2.endDate > r1.endDate
            )
            AND NOT (endDate >= @endDate AND versionId = @placeholderId)
        ) gs
        INNER JOIN (
            -- Candidate gap ends.
            SELECT groupId, startDate as endDate
            FROM #Results r1
            WHERE NOT EXISTS (
                SELECT *
                FROM #Results r2
                WHERE r2.groupId = r1.groupId
                AND r2.versionId <> r1.versionId
                AND r2.endDate >= r1.startDate
                AND r2.startDate < r1.startDate
            )
            AND NOT (startDate <= @startDate AND versionId = @placeholderId)
        ) ge
        ON ge.groupId = gs.groupId
        AND ge.endDate >= gs.startDate
        GROUP BY gs.groupId, gs.startDate
    )
    INSERT #Results (
        groupId
        ,startDate
        ,endDate
        ,versionId
    )
    SELECT
        d.groupId
        -- Clip the data interval to the gap.
        ,CASE WHEN d.startDate < g.startDate THEN g.startDate ELSE d.startDate END
        ,CASE WHEN d.endDate > g.endDate THEN g.endDate ELSE d.endDate END
        ,d.versionId
    FROM #Data d
    INNER JOIN Gaps g
        ON g.groupId = d.groupId
        AND g.startDate <= d.endDate
        AND g.endDate >= d.startDate
    INNER JOIN (
        -- Per gap: the highest version that is lower than the versions
        -- bordering the gap and is not fully covered by a higher version.
        SELECT
            d.groupId
            ,gapId
            ,MAX(d.versionId) as versionId
        FROM #Data d
        INNER JOIN Gaps g
            ON g.groupId = d.groupId
            AND g.startDate <= d.endDate
            AND g.endDate >= d.startDate
        WHERE d.versionId < (
            SELECT MIN(versionId)
            FROM #Results r
            WHERE r.groupId = d.groupId
            AND (r.startDate = g.endDate OR r.endDate = g.startDate)
        )
        AND NOT EXISTS (
            SELECT *
            FROM #Data dsup
            WHERE dsup.groupId = d.groupId
            AND dsup.versionId > d.versionId
            AND dsup.startDate <= d.startDate
            AND dsup.endDate >= d.endDate
        )
        GROUP BY
            d.groupId
            ,g.gapId
    ) mg
        ON mg.groupId = g.groupId
        AND mg.gapId = g.gapId
        AND mg.versionId = d.versionId
END
-- Final result: everything except the placeholder boundary rows.
SELECT *
FROM #Results
WHERE versionId <> @placeholderId
order by groupId, startDate
A set-based solution would be much more useful, but I've struggled to find one. Any ideas?
-- Calendar-table approach: expand every interval to one row per day, keep the
-- highest version per (group, day), then collapse consecutive days that share
-- a version back into intervals.
-- create a dates table
create table dates (thedate date primary key clustered);
-- The CTE below is also named "dates"; the INSERT names the target as
-- dbo.dates and reads from the CTE.
;with dates(thedate) as (
select dateadd(yy,years.number,0)+days.number
from master..spt_values years
join master..spt_values days
on days.type='p' and days.number < datepart(dy,dateadd(yy,years.number+1,0)-1)
where years.type='p' and years.number between 100 and 150
-- note: 100-150 creates dates in the year range 2000-2050
-- adjust as required
)
insert dbo.dates select * from dates;
-- for each date, determine the prevailing version
-- (an interval covers a date when startDate <= date <= endDate)
select t.groupId, d.thedate, max(t.versionId) versionId
into #tmp1
from dates d
join #Data t on t.startDate <= d.thedate and d.thedate <= t.endDate
group by t.groupId, d.thedate;
-- create index to help
create clustered index cix_tmp1 on #tmp1(groupId, thedate, versionId);
-- find the start dates
-- A day starts a new run when the previous day has no row with the same
-- group and version; rn numbers those run starts per group.
;with t as (
select a.*, rn=row_number() over (partition by a.groupId order by a.thedate)
from #tmp1 a
left join #tmp1 b on b.thedate = dateadd(d,-1,a.thedate) and a.groupId = b.groupId and a.versionId = b.versionId
where b.versionId is null
)
-- Each run ends the day before the next run of the same group starts; the
-- last run of a group has no successor, so its enddate is NULL.
select c.groupId, c.thedate startdate, dateadd(d,-1,d.thedate) enddate, c.versionId
from t c
left join t d on d.rn=c.rn+1 and c.groupId = d.groupId
order by groupId, startdate;
Of course, you can do everything in "one query" but do it at your peril, as the performance goes down the drain, big time.
DO NOT USE - for academic interest only-
-- Single-statement variant of the calendar approach: the date list, the
-- per-day prevailing version and the run starts are all CTEs instead of a
-- permanent table and an indexed temp table.
;with dates(thedate) as (
select dateadd(yy,years.number,0)+days.number
from master..spt_values years
join master..spt_values days
on days.type='p' and days.number < datepart(dy,dateadd(yy,years.number+1,0)-1)
where years.type='p' and years.number between 100 and 150
-- note: 100-150 creates dates in the year range 2000-2050
-- adjust as required
), tmp1 as (
-- highest version covering each (group, day)
select t.groupId, d.thedate, max(t.versionId) versionId
from dates d
join #Data t on t.startDate <= d.thedate and d.thedate <= t.endDate
group by t.groupId, d.thedate
), t as (
-- days whose previous day has no row with the same group and version, numbered per group
select a.*, rn=row_number() over (partition by a.groupId order by a.thedate)
from tmp1 a
left join tmp1 b on b.thedate = dateadd(d,-1,a.thedate) and a.groupId = b.groupId and a.versionId = b.versionId
where b.versionId is null
)
-- pair each run start with the next one; enddate is NULL for a group's last run
select c.groupId, c.thedate startdate, dateadd(d,-1,d.thedate) enddate, c.versionId
from t c
left join t d on d.rn=c.rn+1 and c.groupId = d.groupId
order by groupId, startdate;
Updated due to some feedback from the comments. I'm not going to worry about the end cases that a few people have pointed out since they've been proven trivial to solve in other Answers, but I wanted to go ahead and get a working version out that didn't require DDL... I figure it's just good to have options. :-)
This code should work:
-- Boundary-pair approach: collect every interval start and every
-- (end + 1 day) per group as segment boundaries, pair each boundary with the
-- next one to form segments, then take the highest version among the #Data
-- rows covering each segment's end date.
select nesty.groupId, nesty.startDate, nesty.segment_end_date, Max(bob.versionId)
from(
select starter.groupId, starter.startDate,
-- a segment ends the day before the next boundary; the last segment falls back to the hard-coded '2012-12-31'
coalesce(DATEADD(DAY,-1,ender.startDate),('2012-12-31')) AS segment_end_date
from
-- starter and ender are the same numbered boundary list; UNION removes duplicate dates
(select groupId, startDate, ROW_NUMBER() over (partition by groupID order by startDate) as rownumber from
(select groupID, startDate from #Data union select groupID, DATEADD(DAY, 1,endDate) as startDate from #Data) xx) starter
left outer join
(select groupId, startDate, ROW_NUMBER() over (partition by groupID order by startDate) as rownumber from
(select groupID, startDate from #Data union select groupID, DATEADD(DAY, 1,endDate) as startDate from #Data) xy) ender on
starter.groupId = ender.groupId and
starter.rownumber = ender.rownumber - 1
where
-- drop segments that would end before they start
starter.startDate<= coalesce(DATEADD(DAY,-1,ender.startDate),('2012-12-31'))
) nesty
-- a left join keeps segments that no interval covers; their version is NULL
left outer join #Data bob on
bob.groupId = nesty.groupId and
nesty.segment_end_date between bob.startDate and bob.endDate
group by nesty.groupId, nesty.startDate, nesty.segment_end_date
order by nesty.groupId, nesty.startDate
There are a couple of tiny caveats I had to do to get it into a single SQL statement. First, the max end date is not dynamic; I hard coded '2012-12-31'. You can replace it with a MAX(endDate), but you can't put that in the GROUP BY statement. If you can do this in a procedure, you can do:
select into #max_end_date MAX(endDate) from #Data
and replace '2012-12-31' with #max_end_date.
Second, I do not guarantee that two adjacent segments won't have the same value! This may or may not be important to you... that is, if you had the following:
Interval 1: 111111
Interval 2: 22222222222222
Your output would be:
Interval 1: 2222
Interval 2: 2222222222
Still, I think it's worth hitting it in a simple and efficient SQL query. It may not be hard to fix those caveats, but it didn't matter to what I was working on, so I haven't bothered yet.
If the end dates are important, as well as gaps, here's a way you can do it. This solution could also be adapted to work if your versions are datetimes instead of just dates.
First a bunch of functions
One to get the version at a given date
-- Returns the highest VersionID among the VersionedIntervals rows of
-- @GroupID that cover @Date, or NULL when no interval covers it.
--   @GroupID  group whose intervals are searched
--   @Date     point in time to look up
Create Function dbo.VersionAtDate(@GroupID int, @Date datetime) Returns int as
Begin
    Declare @Ret int = Null
    Select
        @Ret = Max(VersionID)
    From
        VersionedIntervals iv
    Where
        iv.GroupID = @GroupID And
        iv.StartDate <= @Date And
        iv.EndDate + 1 > @Date -- if dates were half open intervals this would just be iv.EndDate > @Date
    Return @Ret
End
Next to get the midpoint of two datetimes (minute resolution):
-- Returns the datetime halfway between @Start and @End, at minute
-- resolution: half of the whole-minute difference is added to @Start.
Create Function dbo.Midpoint(@Start datetime, @End datetime) Returns datetime as
Begin
    Return DateAdd(Minute, DateDiff(Minute, @Start, @End) / 2, @Start)
End
Version at a midpoint:
-- Returns the version in force for @GroupID at the midpoint between @Start
-- and @End, by combining dbo.Midpoint and dbo.VersionAtDate.
Create Function dbo.VersionAtMidpoint(@GroupID int, @Start datetime, @End datetime) returns int as
Begin
    Return dbo.VersionAtDate(@GroupID, dbo.Midpoint(@Start, @End))
End;
Finally a table valued function to help with the fact that some points are the start of one range and the end of another, and it helps to get two rows from one input for this:
-- returns two rows if a point is the end of one interval and the
-- start of another
-- Given the run currently being tracked (@Version, from @Start to @End) and
-- the next boundary date @Next, emits the row(s) that extend or close that
-- run. @RN is the row number of @Next, written to the emitted rows.
Create Function dbo.EndPoints(@GroupID int, @RN bigint, @Start datetime, @End datetime, @Next datetime, @Version int)
Returns @EndPoints Table (
    GroupID int,
    RN bigint,
    Version int,
    StartDate datetime,
    EndDate datetime
) As
Begin
    Declare @NextVersion int, @VersionAtMidpoint int
    Set @NextVersion = dbo.VersionAtDate(@GroupID, @Next)
    If @NextVersion = @Version
        -- interval carries on
        Insert Into @EndPoints Select @GroupID, @RN, @Version, @Start, @Next
    Else
    Begin
        -- interval has ended
        Set @VersionAtMidpoint = dbo.VersionAtMidPoint(@GroupID, @End, @Next)
        If @VersionAtMidpoint != @Version
            -- we have something like this, start a run of 3s (run of 4s is already ended by previous call)
            -- 3333333
            -- 44
            Insert Into @EndPoints Select @GroupID, @RN, @VersionAtMidpoint, @End, @Next
        Else
        Begin
            -- We have something like this, end the run of 3s and start the run of fours
            -- 33333
            -- 444
            -- The closing row is written with RN = -1; the new run starts at @Next.
            Insert Into @EndPoints Select @GroupID, -1, @Version, @Start, @Next
            Insert Into @EndPoints Select @GroupID, @RN, @NextVersion, @Next, @Next
        End
    End
    Return
End
With all this machinery in place, the final step is a recursive CTE plus a table variable; you'll need to set maxrecursion appropriately:
-- Collects every distinct interval boundary per group into @Bounds, numbered
-- in date order, then walks the boundaries with a recursive CTE that calls
-- dbo.EndPoints for each step, and finally reports one row per
-- (GroupID, StartDate) with the latest EndDate and highest Version seen.
-- Requires dbo.VersionedIntervals, dbo.VersionAtDate and dbo.EndPoints.
Declare @Bounds Table (GroupID int, RN bigint, BoundDate datetime, Primary Key (GroupID, RN))
Insert Into
    @Bounds
Select
    GroupID,
    Row_Number() Over (Partition By GroupID Order By BoundDate),
    BoundDate
From (
    -- UNION removes duplicates, so a date that is both a start and an end
    -- appears only once per group.
    Select
        GroupID,
        StartDate As BoundDate
    From
        dbo.VersionedIntervals
    Union
    Select
        GroupID,
        EndDate
    From
        dbo.VersionedIntervals
    ) a
;With VersionedBounds (GroupID, RN, StartDate, EndDate, Version) as (
    -- Anchor: the first boundary of every group, as a zero-length run.
    Select
        GroupID,
        RN,
        BoundDate,
        BoundDate,
        dbo.VersionAtDate(GroupID, BoundDate)
    From
        @Bounds
    Where
        RN = 1
    Union All
    -- Step: feed the current run and the next boundary to dbo.EndPoints.
    Select
        e.GroupID,
        e.RN,
        e.StartDate,
        e.EndDate,
        e.Version
    From
        @Bounds b
            Inner Join
        VersionedBounds v
            On v.GroupID = b.GroupID And b.RN = v.RN + 1
            Cross Apply
        dbo.EndPoints(v.GroupID, b.RN, v.StartDate, v.EndDate, b.BoundDate, v.Version) e
)
Select
    GroupID,
    StartDate,
    Max(EndDate) As EndDate,
    Max(Version) As Version
From
    VersionedBounds
Group By
    GroupID,
    StartDate
Order By
    GroupID,
    StartDate
http://sqlfiddle.com/#!6/b95bd/2