Selecting count of consecutive dates before and after a specified date based on start/end - sql

I'm trying to determine the number of records with consecutive dates (previous record ends on the same date as the start date of the next record) before and after a specified date, and ignore any consecutive records as soon as there is a break in the chain.
If I have the following data:
-- Declare variables. T-SQL local variables use the @ sigil; the # prefix is
-- only valid for temp-table names such as #dates.
DECLARE @dateToCheck date = '2020-09-20'
DECLARE @numRecsBefore int = 0
DECLARE @numRecsAfter int = 0
DECLARE @tempID int -- declared but never used below

-- Temp table: one row per date range. prevEndDate is filled in by the loop
-- below with the endDate of the row whose idx is one lower.
CREATE TABLE #dates
(
    [idx] INT IDENTITY(1,1),
    [startDate] DATETIME,
    [endDate] DATETIME,
    [prevEndDate] DATETIME
)

-- Sample data. A row continues the chain when its startDate equals the
-- previous row's endDate.
INSERT INTO #dates
    ( [startDate], [endDate] )
VALUES
    ( '2020-09-01', '2020-09-04' ),
    ( '2020-09-04', '2020-09-10' ),
    ( '2020-09-10', '2020-09-16' ),
    ( '2020-09-17', '2020-09-19' ),
    ( '2020-09-19', '2020-09-20' ),
    -- @dateToCheck (2020-09-20) sits on this boundary
    ( '2020-09-20', '2020-09-23' ),
    ( '2020-09-25', '2020-09-26' ),
    ( '2020-09-27', '2020-09-28' ),
    ( '2020-09-28', '2020-09-30' ),
    ( '2020-10-01', '2020-10-05' ) -- was '2020-09-05', i.e. a range ending before it started

-- Copy each row's predecessor endDate into prevEndDate, one idx at a time.
-- The first pass (@intCount = 0) touches no row because idx starts at 1.
DECLARE @maxRows int = (SELECT MAX(idx) FROM #dates)
DECLARE @intCount int = 0
WHILE @intCount <= @maxRows
BEGIN
    UPDATE #dates
    SET prevEndDate = (SELECT endDate FROM #dates WHERE idx = (@intCount - 1))
    WHERE idx = @intCount

    SET @intCount = @intCount + 1
END

-- clear any breaks in the chain?
-- NOTE: the two counts below match EVERY chained row on the given side of
-- @dateToCheck; nothing stops at the first break in the chain, which is the
-- behaviour the question is asking how to change.

-- number of consecutive records before this date
SET @numRecsBefore = (
    SELECT COUNT(idx)
    FROM #dates
    WHERE startDate = prevEndDate
      AND endDate <= @dateToCheck
)

-- number of consecutive records after this date
SET @numRecsAfter = (
    SELECT COUNT(idx)
    FROM #dates
    WHERE startDate = prevEndDate
      AND endDate >= @dateToCheck
)

-- return & clean up
SELECT * FROM #dates
SELECT @numRecsBefore AS numBefore, @numRecsAfter AS numAfter
DROP TABLE #dates
With the specified date being '2020-09-20', I would expect #numRecsBefore = 2 and #numRecsAfter = 1. That is not what I am getting, as it's counting all the consecutive records.
There has to be a better way to do this. I know the loop isn't optimal, but I couldn't get LAG() or LEAD() to work. I've spent all morning trying different methods and searching, but everything I find doesn't deal with two dates, or breaks in the chain.

This reads like a gaps-and-islands problem. Islands represent rows whose date ranges are adjacent, and you want to count how many records precede or follow the given date in the same island.
You could do:
-- Counts the rows that precede / follow @dateToCheck inside the same "island"
-- (a run of rows where each startdate equals the previous row's enddate).
-- @dateToCheck must be declared by the surrounding batch; it is a variable
-- (@), not a temp table (#).
;WITH with_prev AS (
    -- enddate of the previous row in startdate order
    SELECT
        d.*,
        LAG(enddate) OVER (ORDER BY startdate) AS lag_enddate
    FROM #dates AS d
),
islands AS (
    -- running count of chain breaks = island number
    SELECT
        p.*,
        SUM(CASE WHEN startdate = lag_enddate THEN 0 ELSE 1 END)
            OVER (ORDER BY startdate) AS grp
    FROM with_prev AS p
),
counted AS (
    -- position of each row counted from the start and from the end of its island
    SELECT
        i.*,
        COUNT(*) OVER (PARTITION BY grp ORDER BY startdate)      AS numRecsBefore,
        COUNT(*) OVER (PARTITION BY grp ORDER BY startdate DESC) AS numRecsAfter
    FROM islands AS i
)
SELECT
    -- row whose range ends at (or spans) the date: rows up to and including it
    MAX(CASE WHEN @dateToCheck > startdate AND @dateToCheck <= enddate
             THEN numRecsBefore END) AS numRecsBefore,
    -- row whose range starts at (or spans) the date: rows from it to the island end
    MAX(CASE WHEN @dateToCheck >= startdate AND @dateToCheck < enddate
             THEN numRecsAfter END) AS numRecsAfter
FROM counted;
This uses lag() and a cumulative sum() to define the islands. Then a window count gives the number of preceding and following records on the same island. The final step is conditional aggregation; extra care needs to be taken with the inequalities to take into account the various possibilities (typically, the date you search for might not always match a range bound).
Demo on DB Fiddle

I think this is what you are after; however, this does not give the results in your query; I suspect that is because they aren't the expected results? One of the conditional aggregates may also want to be a >= or <=, but I don't know which:
-- Flags every row whose startDate equals the previous row's endDate, then
-- counts the flagged rows on each side of @dateToCheck.
-- NOTE: this counts all chained rows on each side; it does not stop at the
-- first break in the chain.
;WITH CTE AS (
    SELECT
        startDate,
        endDate,
        -- 1 when this row continues the previous one, otherwise NULL so COUNT skips it
        CASE startDate
            WHEN LAG(endDate) OVER (ORDER BY startDate ASC) THEN 1
        END AS IsSame
    FROM #dates d
)
SELECT
    COUNT(CASE WHEN startDate < @dateToCheck THEN IsSame END) AS numBefore,
    COUNT(CASE WHEN startDate > @dateToCheck THEN IsSame END) AS numAfter
FROM CTE;

Related

How to calculate MTD given daily account balance in SQL Server?

I have a table with columns [accountid], [DateEnding], and [AccountBalance].
I need to calculate MTD using the balance of the current month and subtracting the account balance from the last day of the previous month for each accountid.
So far I have this:
-- Attempted MTD: each balance minus the balance of the latest row in its
-- partition, where rows are partitioned by account and by the year/month of
-- DateEnding shifted back one month.
SELECT
    t.[accountid],
    t.[DateEnding],
    t.[AccountBalance],
    t.AccountBalance - FIRST_VALUE(t.AccountBalance) OVER (
        PARTITION BY
            t.accountid,
            YEAR(DATEADD(mm, -1, t.[DateEnding])),
            MONTH(DATEADD(mm, -1, t.[DateEnding]))
        ORDER BY t.[DateEnding] DESC
    ) AS [AccountBalance MTD Last]
FROM [test] AS t
ORDER BY
    t.accountid,
    t.DateEnding;
Here, for each distinct account, we find the latest record available according to DateEnding
We then find the last day of the previous month by subtracting a number of days equal to the current day of the month, e.g. for 23rd April 2019 we subtract 23 days to get 31st March 2019.
we can then find the balance on that day.
Then put the calculation together in the SELECT
-- One row per account: its latest balance, the balance on the last day of the
-- month before that latest row, and the difference between the two.
-- CROSS APPLY drops an account when either lookup finds no row.
SELECT
    acct.accountid,
    latest.DateEnding,
    prior_eom.EOMbalance,
    latest.LatestBalance,
    latest.LatestBalance - prior_eom.EOMbalance AS EOM
FROM (
    SELECT DISTINCT src.accountid
    FROM test AS src
) AS acct
CROSS APPLY (
    -- most recent row for this account
    SELECT TOP 1
        cur.AccountBalance AS LatestBalance,
        cur.[DateEnding]
    FROM test AS cur
    WHERE cur.[accountid] = acct.accountid
    ORDER BY cur.[DateEnding] DESC
) AS latest
CROSS APPLY (
    -- subtracting the day-of-month lands on the last day of the previous month
    SELECT TOP 1
        eom.AccountBalance AS EOMbalance
    FROM test AS eom
    WHERE eom.[accountid] = acct.accountid
      AND eom.[DateEnding] = DATEADD(day, 0 - DAY(latest.DateEnding), latest.DateEnding)
    ORDER BY eom.[DateEnding] DESC
) AS prior_eom
The first answer seems a little complicated for this problem (Cross Apply isn't necessary here).
The following may be easier for you:
I first look at the current day's account balances in subquery 'a'.
Then I look at the account balances from the last day of last month's data, in subquery 'b'.
Then it's just a matter of subtracting the two to show the MTD delta:
-- MTD delta per account: today's balance minus the balance on the last day of
-- the previous month. Both dates are taken relative to GETDATE().
WITH today_balance AS (
    SELECT
        accountid,
        DateEnding,
        AccountBalance
    FROM #test
    WHERE DateEnding = CAST(GETDATE() AS date)
),
prior_month_end AS (
    SELECT
        accountid,
        AccountBalance
    FROM #test
    -- this expression returns the last day of last month
    WHERE DateEnding = DATEADD(MONTH, DATEDIFF(MONTH, -1, GETDATE()) - 1, -1)
)
SELECT
    cur.accountid,
    cur.DateEnding,
    cur.AccountBalance AS [Current AccountBalance],
    eom.AccountBalance AS [EOM prior AccountBalance], -- added for clarity
    cur.AccountBalance - eom.AccountBalance AS [AccountBalance MTD Last]
FROM today_balance AS cur
LEFT JOIN prior_month_end AS eom
    ON cur.accountid = eom.accountid
Here is the SQL that makes this sample data and #test table. Simply execute it to have your own '#test' table to run against:
/*drop table #test
drop table #dates */

-- Builds the sample data: #dates holds one ranked row per calendar day,
-- #test holds one random balance per account per day.
create table #test ([accountid] varchar(255), [DateEnding] date, [AccountBalance] decimal(16,2))
create table #dates (rnk int, dt date)

-- Variables use the @ sigil; only the temp tables keep the # prefix.
declare @basedate date = '20180101'
declare @d int = 1
-- loop bound hoisted out of the loop: offsets 1 .. (days since @basedate) + 1
declare @lastOffset int = datediff(day, @basedate, getdate()) + 2

insert into #dates (dt)
values (@basedate)

-- one row per day after @basedate, ending at tomorrow's date
while @d < @lastOffset
begin
    insert into #dates (dt)
    values (dateadd(day, @d, @basedate))

    set @d = @d + 1
end

-- number the days in date order
update a
set a.rnk = b.rnk
from #dates a
left join (select rank() over (order by dt) rnk, dt from #dates) b on a.dt = b.dt

declare @maxRnk int = (select max(rnk) from #dates)
declare @a int = 1
declare @i int = 1

-- accounts 1..19; for each, one balance per day for every rank below the highest
-- (i.e. up to today). Kept row-by-row so rand() is re-evaluated for each balance.
while @a < 20
begin
    while @i < @maxRnk
    begin
        insert into #test ([accountid], [DateEnding], [AccountBalance])
        values (@a, (select dt from #dates where rnk = @i), cast(rand()*1000.0 + @i as decimal(16,2)))

        set @i = @i + 1
    end

    set @a = @a + 1
    set @i = 1
end

get best sales rep weekly SQL

I need a bit of help with a SQL Server issue.
I have 2 tables:
complete_sales_raw
(
Id int Identity(1,1) PK,
RepId int FK in sale_reps,
Revenue decimal(15,2),
Sale_date datetime2(7)
)
and
sale_reps
(
Id int Identity(1,1) PK,
RepName nvarchar(50)
)
What I need to do is get best sales rep based on the total revenue for each week, starting with 2014-06-01 and ending at current date.
Each week has 7 days and the first day is 2014-06-01.
So far I got to here:
-- Best rep by total revenue for the first week only: sales dated 0 to 6 days
-- after 2014-06-01. The result is stored in #tmp1.
SELECT TOP (1)
    sr.RepName       AS RepName,
    SUM(csr.Revenue) AS Revenue
INTO #tmp1
FROM complete_sales_raw AS csr
INNER JOIN sale_reps AS sr
    ON csr.RepId = sr.Id
WHERE DATEDIFF(d, '2014-06-01', Sale_date) BETWEEN 0 AND 6
GROUP BY sr.RepName
ORDER BY SUM(csr.Revenue) DESC
But this only returns the best sale rep for the first week and I need it for each week.
All help is appreciated.
ok so, I created a week table like so
IF ( OBJECT_ID('dbo.tmp4') IS NOT NULL )
    DROP TABLE dbo.tmp4
GO

-- One row per 7-day week, starting on 2014-06-01.
CREATE TABLE tmp4 (
    StartDate datetime,
    Enddate datetime,
    WeekNo varchar(20)
)

-- Variables use the @ sigil (# is for temp tables).
DECLARE @week_start DATETIME = '2014-06-01'
DECLARE @end_date DATETIME = '2015-01-03'

-- Weeks are fixed 7-day windows counted from the first start date.
-- Detecting boundaries with DATEPART(WEEK, ...) depends on the session's
-- DATEFIRST setting, cuts the week that spans 1 January in two, and never
-- emits the final week, so the windows are generated directly instead.
WHILE @week_start <= @end_date
BEGIN
    INSERT INTO tmp4 (StartDate, Enddate, WeekNo)
    VALUES (@week_start, DATEADD(day, 6, @week_start), DATEPART(WEEK, @week_start))

    SET @week_start = DATEADD(day, 7, @week_start)
END
GO
and then I used Gordon's answer and made this:
-- Total revenue per rep per week (a week is the 7 days starting at
-- tmp4.StartDate), ranked within each week; the result is stored in tmp1.
SELECT
    wk.StartDate     AS StartDate,
    sr.RepName       AS RepName,
    SUM(csr.Revenue) AS Revenue,
    RANK() OVER (
        PARTITION BY (wk.StartDate)
        ORDER BY SUM(csr.Revenue) DESC
    ) AS seqnum
INTO tmp1
FROM tmp4 AS wk
CROSS JOIN complete_sales_raw AS csr
INNER JOIN sale_reps AS sr
    ON csr.RepId = sr.Id
WHERE DATEDIFF(d, wk.StartDate, MAS_PostDate) BETWEEN 0 AND 6
GROUP BY
    sr.RepName,
    wk.StartDate

-- keep only the top-ranked rep(s) of each week
SELECT
    StartDate,
    RepName,
    Revenue,
    seqnum
FROM tmp1
WHERE seqnum = 1
ORDER BY StartDate
which returns the best sales_rep for each week
You can do an aggregation to get the total sales by week. This requires some manipulation of the dates to calculate the number of weeks -- basically dividing the days by 7.
Then, use rank() (or row_number() if you only want one when there are ties) to get the top value:
-- Best rep per week, where week N covers days 7*N .. 7*N+6 after 2014-06-01.
-- NOTE(review): the date column is written MAS_PostDate here, while the schema
-- in the question calls it Sale_date; confirm the real column name.
SELECT s.*
FROM (
    SELECT
        sr.RepName AS RepName, -- alias was "tsr", which is not defined in this query
        -- integer division: days 0-6 -> week 0, days 7-13 -> week 1, ...
        DATEDIFF(day, '2014-06-01', MAS_PostDate) / 7 AS weeknum,
        SUM(csr.Revenue) AS Revenue,
        -- DESC so that rank 1 is the highest revenue of the week
        RANK() OVER (
            PARTITION BY DATEDIFF(day, '2014-06-01', MAS_PostDate) / 7
            ORDER BY SUM(csr.Revenue) DESC
        ) AS seqnum
    FROM complete_sales_raw AS csr
    INNER JOIN sale_reps AS sr
        ON csr.RepId = sr.Id
    -- every sale from the first day onward, not just the first seven days
    WHERE MAS_PostDate >= '2014-06-01'
    GROUP BY
        sr.RepName,
        DATEDIFF(day, '2014-06-01', MAS_PostDate) / 7
) AS s
WHERE s.seqnum = 1;

Select date + 3 days, not including weekends and holidays

I've found a number of answers to the problem of doing a date-diff, in SQL, not including weekends and holidays. My problem is that I need to do a date comparison - how many child records are there whose work date is within three days of the parent record's send date?
Most of the date-diff answers involve a calendar table, and I think if I can build a sub-select that returns the date+3, I can work out the rest. But I can't figure out how to return a date+3.
So:
-- Calendar lookup: one row per date with weekday and holiday flags.
CREATE TABLE calendar (
    thedate   DATETIME NOT NULL,
    isweekday SMALLINT NULL,
    isholiday SMALLINT NULL
);
And:
SELECT thedate AS fromdate, xxx AS todate
FROM calendar
What I want is for todate to be fromdate + 72 hours, not counting weekends and holidays. Doing a COUNT(*) where isweekday and not isholiday is simple enough, but doing a DATEADD() is another matter.
I'm not sure where to start.
EDIT:
Changed to include non-workdays as valid fromDates.
-- For every calendar date (working day or not), returns the date three
-- working days after the most recent working day on or before it.
WITH rankedDates AS (
    -- working days only, numbered in date order
    SELECT
        c.thedate,
        ROW_NUMBER() OVER (ORDER BY c.thedate) AS dateRank
    FROM calendar AS c
    WHERE c.isweekday = 1
      AND c.isholiday = 0
)
SELECT
    cal.thedate AS fromDate,
    rd2.thedate AS todate
FROM calendar AS cal
OUTER APPLY (
    -- rank of the latest working day on or before this date (NULL if none)
    SELECT TOP 1
        rd.dateRank
    FROM rankedDates AS rd
    WHERE rd.thedate <= cal.thedate
    ORDER BY rd.thedate DESC
) AS anchor
LEFT JOIN rankedDates AS rd2
    ON anchor.dateRank + 3 = rd2.dateRank
You could put a date rank column on the calendar table to simplify this and avoid the CTE:
-- Calendar with a precomputed rank for working days.
CREATE TABLE calendar
(
    TheDate   DATETIME PRIMARY KEY,
    isweekday BIT NOT NULL,
    isHoliday BIT NOT NULL DEFAULT 0,
    -- Nullable on purpose: the rank is only set on non-holiday weekdays, so
    -- weekends and holidays have no value to store here.
    dateRank  INT NULL
);
Then you'd set the daterank column only where it's a non-holiday weekday.
This should do the trick, change the number in the "top" to the number of days you want to include.
-- Latest of the first three working days on or after @date.
-- The range is inclusive: when @date is itself a working day it counts as the
-- first of the three. Change the TOP value to include more or fewer days.
declare @date as datetime
set @date = '5/23/13'

select
    max(_businessDates.thedate)
from (
    select top 3
        _Calendar.thedate
    from calendar _Calendar
    where _Calendar.isWeekday = 1
      and _Calendar.isholiday = 0
      and _Calendar.thedate >= @date
    order by
        _Calendar.thedate
) as _businessDates
For a dynamic version that can go forward or backward a certain number of days try this:
-- Working day that lies @DayOffset working days after (positive) or before
-- (negative) @date. Forward searches include @date itself, hence the extra row.
declare @date as datetime
declare @DayOffset as int
set @date = '5/28/13'
set @DayOffset = -3

select
    (case when @DayOffset >= 0 then
        max(_businessDates.thedate)
    else
        min(_businessDates.thedate)
    end)
from (
    select
        top (abs(@DayOffset) + (case when @DayOffset >= 0 then 1 else 0 end)) _Calendar.thedate
    from calendar _Calendar
    where _Calendar.isWeekday = 1
      and _Calendar.isholiday = 0
      and ( (@DayOffset >= 0 and _Calendar.thedate >= @date)
         or (@DayOffset < 0 and _Calendar.thedate < @date) )
    order by
        -- Nearest dates first in the search direction. Ordering on the datetime
        -- itself avoids CAST(... AS int), which rounds the time of day.
        case when @DayOffset >= 0 then _Calendar.thedate end asc,
        case when @DayOffset < 0 then _Calendar.thedate end desc
) as _businessDates
You can set #DayOffset to a positive or negative number.
You just need DATEADD, unless I'm not understanding your question.
DATEADD(DAY,3,fromdate)
Edit: I see, not counting weekends or Holidays, will update momentarily.
Update: Well looks like Jason nailed it, but on the off chance you're using SQL2012, here's the simple version:
-- For each working day, the working day three rows later in date order.
-- Only working days appear as fromdate; the last three rows get a NULL todate.
SELECT
    fromdate = thedate,
    todate = LEAD(thedate, 3) OVER (ORDER BY thedate)
FROM calendar
WHERE isweekday = 1
  AND isHoliday = 0
Try this if you need it as a query with dateAdd:
-- For each calendar date in the range, the earliest working day that falls on
-- or after that date plus three calendar days.
SELECT
    allDates.thedate     AS fromDate,
    MIN(workday.thedate) AS toDate
FROM calendar AS allDates
LEFT JOIN calendar AS workday
    ON workday.isweekday = 1
   AND workday.isholiday = 0
   AND DATEADD(d, 3, allDates.thedate) <= workday.thedate
WHERE allDates.thedate BETWEEN '5/20/13' AND '5/31/13'
GROUP BY
    allDates.thedate

T-SQL function loop [closed]

This question is unlikely to help any future visitors; it is only relevant to a small geographic area, a specific moment in time, or an extraordinarily narrow situation that is not generally applicable to the worldwide audience of the internet. For help making this question more broadly applicable, visit the help center.
Closed 9 years ago.
I was wondering if I could get some help on a T-SQL function I am trying to create:
Here is some sample data that needs to be queried:
Simplified table:
ID|PersonID|ValueTypeID|ValueTypeDescription|Value
1|ZZZZZ000L6|ZZZZZ00071|Start Prison Date|3/28/2012
2|ZZZZZ000L6|ZZZZZ00071|Start Prison Date|10/10/2012
3|ZZZZZ000L6|ZZZZZ00072|End Prison Date |3/29/2012
4|ZZZZZ000MD|ZZZZZ00071|Start Prison Date|1/15/2012
5|ZZZZZ000MD|ZZZZZ00072|End Prison Date |2/15/2012
6|ZZZZZ000MD|ZZZZZ00071|Start Prison Date|4/1/2012
7|ZZZZZ000MD|ZZZZZ00072|End Prison Date |4/5/2012
8|ZZZZZ000MD|ZZZZZ00071|Start Prison Date|9/3/2012
9|ZZZZZ000MD|ZZZZZ00072|End Prison Date |12/1/2012
What I need is a T-SQL function that accepts the PersonID and the Year (#PID, #YR) and returns the number of days that person has been in prison for that year.
dbo.NumDaysInPrison(#PID, #YR) as int
Example:
dbo.NumDaysInPrison('ZZZZZ000L6', 2012) returns 84
dbo.NumDaysInPrison('ZZZZZ000MD', 2012) returns 124
So far, I have come up with this query that gives me the answer sometimes.
-- Asker's attempt: pairs the Nth start date of the year with the Nth end date
-- of the year and sums the day differences. A start with no matching end is
-- counted through 31 December of @Year.
-- (Variables use the @ sigil; # would denote a temp table.)
DECLARE @Year int
DECLARE @PersonID nvarchar(50)
SET @Year = 2012
SET @PersonID = 'ZZZZZ000AA'

;WITH StartDates AS
(
    SELECT
        Value,
        ROW_NUMBER() OVER (ORDER BY Value) AS RowNumber
    FROM Prisoners
    WHERE ValueTypeDescription = 'Start Prison Date'
      AND PersonID = @PersonID
      AND YEAR(Value) = @Year
), EndDates AS
(
    SELECT
        Value,
        ROW_NUMBER() OVER (ORDER BY Value) AS RowNumber
    FROM Prisoners
    WHERE ValueTypeDescription = 'End Prison Date'
      AND PersonID = @PersonID
      AND YEAR(Value) = @Year
)
SELECT
    -- @Year*10000 + 12*100 + 31 builds the yyyymmdd number for 31 December
    SUM(DATEDIFF(d, s.Value, ISNULL(e.Value, cast(str(@Year*10000+12*100+31) as date)))) AS NumDays
FROM StartDates s
LEFT OUTER JOIN EndDates e ON s.RowNumber = e.RowNumber
This fails to capture if a record earlier in the year was left without an end date:
for example if a person has only two records:
ID|PersonID|ValueTypeID|ValueTypeDescription|Value
1|ZZZZZ000AA|ZZZZZ00071|Start Prison Date|3/28/2012
2|ZZZZZ000AA|ZZZZZ00071|Start Prison Date|10/10/2012
(3/28/2012 -> End of Year)
(10/10/2012 -> End of Year)
will return 360, not 278.
So it seems that you have the data that you need to split out your 'start date' values and your 'end date' values. You don't really need to loop through anything, you can just pull out your start values then your end values based on your person and compare them.
The important thing is to pull out all you need to begin with and then compare the appropriate values.
Here's an example based on your data above. It would need some heavy tweaking to work with production data; it makes assumptions about the Value data. It's also a bad idea to hard-code valuetypeid as I have here; if you're making a function, you'd want to handle that, I think.
-- Days in prison per stay for one person and one year.
-- @pid is a string because the sample PersonIDs look like 'ZZZZZ000L6'; an INT
-- variable could not be compared with them. Both variables are given sample
-- values here; replace them with the function's parameters.
DECLARE @pid NVARCHAR(50) = 'ZZZZZ000L6', @yr INT = 2012;

WITH startdatecalc AS
(
    -- all start dates (value type hard-coded, as in the original)
    SELECT personid, CAST([value] AS date) AS startdate, DATEPART(YEAR, CAST([value] AS date)) AS startyear
    FROM incarctbl
    WHERE valuetypeid = 'ZZZZZ00071'
),
enddatecalc AS
(
    -- all end dates
    SELECT personid, CAST([value] AS date) AS enddate, DATEPART(YEAR, CAST([value] AS date)) AS endyear
    FROM incarctbl
    WHERE valuetypeid = 'ZZZZZ00072'
)
SELECT
    CASE
        -- stay began before @yr: count from 1 January of @yr
        WHEN startyear < @yr
            THEN DATEDIFF(day, CAST(CAST(@yr AS VARCHAR(4)) + '-01-01' AS date), ISNULL(enddatecalc.enddate, CURRENT_TIMESTAMP))
        ELSE DATEDIFF(DAY, startdate, ISNULL(enddatecalc.enddate, CURRENT_TIMESTAMP))
    END AS NumDaysInPrison
FROM startdatecalc
LEFT JOIN enddatecalc
    ON startdatecalc.personid = enddatecalc.personid
   AND enddatecalc.enddate >= startdatecalc.startdate
   AND NOT EXISTS
       (SELECT 1 FROM enddatecalc xref
        WHERE xref.personid = enddatecalc.personid
          AND xref.enddate < enddatecalc.enddate
          AND xref.enddate >= startdatecalc.startdate
          AND xref.endyear < @yr)
WHERE startdatecalc.personid = @pid
  AND startdatecalc.startyear <= @yr
  AND (enddatecalc.personid IS NULL OR endyear >= @yr);
EDIT: Added existence check to attempt to handle if the same personid was used multiple times in the same year.
Here's my implementation with test tables and data. You'll have to change where appropriate. NOTE: i take datediff + 1 for days in prison, so if you go in on monday and leave on tuesday, that counts as two days. if you want it to count as one day, remove the "+ 1"
-- Test table: one row per prison event.
CREATE TABLE PrisonRegistry
(
    id          INT NOT NULL IDENTITY(1,1) PRIMARY KEY,
    PersonId    INT NOT NULL,
    ValueTypeId INT NOT NULL,
    Value       DATE
)

-- ValueTypeIDs: 1 = start prison date, 2 = end prison date
-- Person 1: two starts and one end in 2012
INSERT INTO PrisonRegistry (PersonId, ValueTypeId, Value) VALUES (1, 1, '2012-03-28')
INSERT INTO PrisonRegistry (PersonId, ValueTypeId, Value) VALUES (1, 1, '2012-10-12')
INSERT INTO PrisonRegistry (PersonId, ValueTypeId, Value) VALUES (1, 2, '2012-03-29')
-- Person 2: three complete start/end pairs in 2012
INSERT INTO PrisonRegistry (PersonId, ValueTypeId, Value) VALUES (2, 1, '2012-01-15')
INSERT INTO PrisonRegistry (PersonId, ValueTypeId, Value) VALUES (2, 2, '2012-02-15')
INSERT INTO PrisonRegistry (PersonId, ValueTypeId, Value) VALUES (2, 1, '2012-04-01')
INSERT INTO PrisonRegistry (PersonId, ValueTypeId, Value) VALUES (2, 2, '2012-04-05')
INSERT INTO PrisonRegistry (PersonId, ValueTypeId, Value) VALUES (2, 1, '2012-09-03')
INSERT INTO PrisonRegistry (PersonId, ValueTypeId, Value) VALUES (2, 2, '2012-12-1')
GO
-- Returns the number of days @personId spent in prison during @year.
-- Walks that person's start (1) and end (2) events of the year in date order.
-- Both the start day and the end day of a stay are counted (DATEDIFF + 1).
-- An end with no preceding start in the year counts from 1 January; a start
-- with no following end counts through 31 December. Returns 0 when the person
-- has no events in the year.
create function dbo.NumDaysInPrison(
    @personId int
    , @year int
)
returns int
as
begin
    declare @retVal int = 0
    declare @valueTypeId int
    declare @value date
    declare @noDates bit = 1
    declare @yearStart date = DATEFROMPARTS( @year, 1, 1 )
    declare @yearEnd date = DATEFROMPARTS( @year, 12, 31 )
    -- open "start" of the current stay; initially the first day of the year
    declare @startDate date = @yearStart

    declare prisonCursor cursor local fast_forward for
        select
            pr.ValueTypeId
            , pr.Value
        from
            PrisonRegistry pr
        where
            -- range predicate on the date column itself instead of DATEPART(...)
            pr.Value >= @yearStart
            and pr.Value <= @yearEnd
            and pr.ValueTypeId in (1,2)
            and pr.PersonId = @personId
        order by
            pr.Value

    open prisonCursor

    fetch next from prisonCursor
    into @valueTypeId, @value

    while @@FETCH_STATUS = 0
    begin
        set @noDates = 0

        if 2 = @valueTypeId
        begin
            -- End date: close the open stay. When two end dates arrive in a
            -- row there is no open stay; skip the second one instead of adding
            -- NULL, which would turn the whole total into NULL.
            if @startDate is not null
            begin
                set @retVal = @retVal + DATEDIFF( dd, @startDate, @value ) + 1
            end

            set @startDate = null
        end
        else if 1 = @valueTypeId
        begin
            set @startDate = @value
        end

        fetch next from prisonCursor
        into @valueTypeId, @value
    end

    close prisonCursor
    deallocate prisonCursor

    -- a stay still open after the last event runs to the end of the year
    if @startDate is not null and 0 = @noDates
    begin
        set @retVal = @retVal + DATEDIFF( dd, @startDate, @yearEnd ) + 1
    end

    return @retVal
end
go
select dbo.NumDaysInPrison( 1, 2012 )
select dbo.NumDaysInPrison( 2, 2012 )
select dbo.NumDaysInPrison( 2, 2011 )
This is a complicated question. It is not so much "asking for a function" as it is dealing with two competing problems. The first is organizing the data, which is transaction-based, into records with start and stop dates for the prison period. The second is summarizing this for time spent within another given span of time (a year).
I think you need to spend some time investigating the data to understand the anomalies in it, before progressing to writing a function. The following query should help you. It does the calculation for all prisoners for a given year (which is the year in the first CTE):
-- Days in prison per person for the year set in "vals".
-- Each start row is paired with the NEXT end date on or after it; stays are
-- clipped to the year and summed. A start with no later end runs to year end.
-- NOTE(review): "t" stands for the real table; the comparisons assume "value"
-- is stored as a date type rather than a string such as '3/28/2012'. Confirm.
with vals as (
    select 2012 as yr
),
const as (
    select cast(CAST(yr as varchar(255)) + '-01-01' as DATE) as periodstart,
           cast(CAST(yr as varchar(255)) + '-12-31' as DATE) as periodend
    from vals
)
select t.personId,
       SUM(datediff(d,
               (case when StartDate < const.periodStart then const.periodStart else StartDate end),
               -- the stray comma that preceded "else" was a syntax error
               (case when EndDate > const.PeriodEnd or EndDate is NULL then const.periodEnd else EndDate end)
           )
       ) as daysInYear
from (select t.*, t.value as StartDate,
             (select top 1 value
              from t t2
              where t.personId = t2.personId
                and t2.Value >= t.Value
                and t2.ValueTypeDescription = 'End Prison Date'
              -- ascending: the first end date after the start closes the stay;
              -- descending would pair every start with the person's last end date
              order by value asc
             ) as EndDate
      from t
      where valueTypeDescription = 'Start Prison Date'
     ) t cross join
     const
where StartDate <= const.periodend and (EndDate >= const.periodstart or EndDate is NULL)
group by t.PersonId;
This query can be adapted as a function. But, I would encourage you to investigate the data before going there. Once you wrap things up in a function, it will be much more difficult to find and understand anomalies -- why did someone go in and out on the same day? Who has the longest periods in prison? And so on.

In a set of overlapping, version-numbered intervals, find the most recent version at each point in time

I'm working with a set of date intervals where each interval has a version number and new intervals will frequently overlap old ones, or even be subsets of them. From this data I need to calculate a new set of intervals that shows the most recent version number, at each point in time. Is there a set-based solution to this problem?
Here's an illustration:
Interval 1: 11111111111111111111111
Interval 2: 2222222222
Interval 3: 33333333333333
Interval 4: 444444444
Interval 5: 555555555
Result : 11333333333333331155555555544
Here is a sample of the data I'm working with:
groupId startDate endDate version
-------- --------- ---------- ------
1 1/1/2010 1/1/2011 1
1 10/1/2010 7/5/2011 2
1 7/5/2011 8/13/2012 3
1 8/13/2012 12/31/2012 6
1 10/1/2012 11/1/2012 8
... and the desired output:
groupId startDate endDate version
-------- --------- ---------- ------
1 1/1/2010 10/1/2010 1
1 10/1/2010 7/5/2011 2
1 7/5/2011 8/13/2012 3
1 8/13/2012 10/1/2012 6
1 10/1/2012 11/1/2012 8 << note how version 8 supersedes version 6
1 11/1/2012 12/31/2012 6 << version 6 is split into two records
I haven't found any other examples of this problem, my googling only turns up queries that identify gaps and islands or covering sets.
I think I have an iterative solution (SQL Server 2008). It starts with a temp table for intervals in the result set and defines the start and end points for the range that we want to cover by inserting records with special version numbers. Then, it repeatedly identifies gaps between result set intervals and attempts to fill them with the most recent records from the original data set, until there are no more gaps or no more records to add:
GO
-- Create data set and results table
CREATE TABLE #Data (
    groupId INT
    ,startDate DATE
    ,endDate DATE
    ,versionId INT
)

INSERT INTO #Data (groupId, startDate, endDate, versionId)
VALUES
    (1, '2007-12-22', '2008-12-22', 8),
    (1, '2008-12-22', '2009-12-22', 9),
    (1, '2009-12-22', '2010-12-22', 10),
    (1, '2010-12-22', '2011-12-22', 11),
    (1, '2011-01-01', '2011-11-30', 500),
    (1, '2011-12-22', '2012-12-22', 12),
    (1, '2012-01-22', '2012-12-22', 13),
    (1, '2012-01-22', '2012-12-22', 14),
    (1, '2012-04-22', '2012-12-22', 17),
    (1, '2012-04-22', '2012-12-22', 19),
    (2, '2010-01-01', '2011-01-01', 1),
    (2, '2010-10-01', '2011-07-05', 2),
    (2, '2011-07-05', '2012-08-13', 3),
    (2, '2012-08-13', '2012-12-31', 6),
    (2, '2012-10-01', '2012-11-01', 8)

-- NOTE(review): groupId is VARCHAR here but INT in #Data, so comparisons
-- between the two tables rely on implicit conversion.
CREATE TABLE #Results (
    groupId VARCHAR(10)
    ,startDate DATE
    ,endDate DATE
    ,versionId BIGINT
)

-- Variables use the @ sigil; #Data and #Results are the temp tables.
DECLARE @startDate DATE
DECLARE @endDate DATE
DECLARE @placeholderId BIGINT
SET @startDate = '20030101'
SET @endDate = '20121231'
SET @placeholderId = 999999999999999

-- Seed #Results with two placeholder rows per group: one spanning from the
-- earlier to the later of (@startDate, first startDate) and one spanning from
-- the earlier to the later of (last endDate, @endDate). GROUP BY already yields
-- one row per group, so DISTINCT is not needed.
INSERT #Results (groupId, startDate, endDate, versionId)
SELECT
    groupId
    ,CASE WHEN MIN(startDate) < @startDate THEN MIN(startDate) ELSE @startDate END
    ,CASE WHEN MIN(startDate) < @startDate THEN @startDate ELSE MIN(startDate) END
    ,@placeholderId
FROM #Data
GROUP BY groupId
UNION ALL
SELECT
    groupId
    ,CASE WHEN MAX(endDate) < @endDate THEN MAX(endDate) ELSE @endDate END
    ,CASE WHEN MAX(endDate) < @endDate THEN @endDate ELSE MAX(endDate) END
    ,@placeholderId
FROM #Data
GROUP BY groupId
GO
-- Fill gaps in results table.
-- Runs up to 10 passes. Each pass finds the gaps between the intervals already
-- in #Results and inserts, per gap, the highest-version #Data rows that overlap
-- it, clipped to the gap.
-- (Variables use the @ sigil; #Data and #Results are the temp tables.)
DECLARE @startDate DATE
DECLARE @endDate DATE
DECLARE @placeholderId BIGINT
SET @startDate = '20030101'
-- NOTE(review): the setup batch uses '20121231' for @endDate; confirm that the
-- different value here is intended.
SET @endDate = '20111231'
SET @placeholderId = 999999999999999

DECLARE @counter INT
SET @counter = 0

WHILE @counter < 10
BEGIN
    SET @counter = @counter + 1;

    WITH Gaps AS (
        SELECT
            gs.groupId
            ,gs.startDate
            ,MIN(ge.endDate) as endDate
            ,ROW_NUMBER() OVER (ORDER BY gs.groupId, gs.startDate) as gapId
        FROM (
            -- gap starts: interval ends not covered by another version's interval
            SELECT groupId, endDate as startDate
            FROM #Results r1
            WHERE NOT EXISTS (
                    SELECT *
                    FROM #Results r2
                    WHERE r2.groupId = r1.groupId
                      AND r2.versionId <> r1.versionId
                      AND r2.startDate <= r1.endDate
                      AND r2.endDate > r1.endDate
                )
              AND NOT (endDate >= @endDate AND versionId = @placeholderId)
        ) gs
        INNER JOIN (
            -- gap ends: interval starts not covered by another version's interval
            SELECT groupId, startDate as endDate
            FROM #Results r1
            WHERE NOT EXISTS (
                    SELECT *
                    FROM #Results r2
                    WHERE r2.groupId = r1.groupId
                      AND r2.versionId <> r1.versionId
                      AND r2.endDate >= r1.startDate
                      AND r2.startDate < r1.startDate
                )
              AND NOT (startDate <= @startDate AND versionId = @placeholderId)
        ) ge
            ON ge.groupId = gs.groupId
           AND ge.endDate >= gs.startDate
        GROUP BY gs.groupId, gs.startDate
    )
    INSERT #Results (
        groupId
        ,startDate
        ,endDate
        ,versionId
    )
    SELECT
        d.groupId
        -- clip the source interval to the gap
        ,CASE WHEN d.startDate < g.startDate THEN g.startDate ELSE d.startDate END
        ,CASE WHEN d.endDate > g.endDate THEN g.endDate ELSE d.endDate END
        ,d.versionId
    FROM #Data d
    INNER JOIN Gaps g
        ON g.groupId = d.groupId
       AND g.startDate <= d.endDate
       AND g.endDate >= d.startDate
    INNER JOIN (
        -- highest eligible version per gap
        SELECT
            d.groupId
            ,gapId
            ,MAX(d.versionId) as versionId
        FROM #Data d
        INNER JOIN Gaps g
            ON g.groupId = d.groupId
           AND g.startDate <= d.endDate
           AND g.endDate >= d.startDate
        WHERE d.versionId < (
                -- must be older than the lowest version bordering the gap
                SELECT MIN(versionId)
                FROM #Results r
                WHERE r.groupId = d.groupId
                  AND (r.startDate = g.endDate OR r.endDate = g.startDate)
            )
          AND NOT EXISTS (
                -- skip rows fully covered by a newer version
                SELECT *
                FROM #Data dsup
                WHERE dsup.groupId = d.groupId
                  AND dsup.versionId > d.versionId
                  AND dsup.startDate <= d.startDate
                  AND dsup.endDate >= d.endDate
            )
        GROUP BY
            d.groupId
            ,g.gapId
    ) mg
        ON mg.groupId = g.groupId
       AND mg.gapId = g.gapId
       AND mg.versionId = d.versionId
END

-- everything except the placeholder boundary rows
SELECT *
FROM #Results
WHERE versionId <> @placeholderId
order by groupId, startDate
A set-based solution would be much more useful, but I've struggled to find one. Any ideas?
-- create a dates table
CREATE TABLE dates (thedate DATE PRIMARY KEY CLUSTERED);

-- Every day of each generated year: year start plus a day offset taken from
-- master..spt_values, limited to the number of days in that year.
;WITH calendar_days (thedate) AS (
    SELECT DATEADD(yy, years.number, 0) + days.number
    FROM master..spt_values AS years
    JOIN master..spt_values AS days
        ON days.type = 'p'
       AND days.number < DATEPART(dy, DATEADD(yy, years.number + 1, 0) - 1)
    WHERE years.type = 'p'
      AND years.number BETWEEN 100 AND 150
    -- note: 100-150 creates dates in the year range 2000-2050
    -- adjust as required
)
INSERT dbo.dates
SELECT thedate
FROM calendar_days;

-- for each date, determine the prevailing (highest) version
SELECT
    t.groupId,
    d.thedate,
    MAX(t.versionId) AS versionId
INTO #tmp1
FROM dates AS d
JOIN #Data AS t
    ON t.startDate <= d.thedate
   AND d.thedate <= t.endDate
GROUP BY
    t.groupId,
    d.thedate;

-- create index to help
CREATE CLUSTERED INDEX cix_tmp1 ON #tmp1 (groupId, thedate, versionId);

-- find the start dates: days whose previous day does not carry the same
-- version for the same group
;WITH run_start AS (
    SELECT
        cur.*,
        ROW_NUMBER() OVER (PARTITION BY cur.groupId ORDER BY cur.thedate) AS rn
    FROM #tmp1 AS cur
    LEFT JOIN #tmp1 AS prev
        ON prev.thedate = DATEADD(d, -1, cur.thedate)
       AND cur.groupId = prev.groupId
       AND cur.versionId = prev.versionId
    WHERE prev.versionId IS NULL
)
-- each run ends the day before the next run of the same group starts
SELECT
    s.groupId,
    s.thedate AS startdate,
    DATEADD(d, -1, nxt.thedate) AS enddate,
    s.versionId
FROM run_start AS s
LEFT JOIN run_start AS nxt
    ON nxt.rn = s.rn + 1
   AND s.groupId = nxt.groupId
ORDER BY groupId, startdate;
Of course, you can do everything in "one query" but do it at your peril, as the performance goes down the drain, big time.
DO NOT USE - for academic interest only-
-- Single-statement variant of the dates-table approach: the date list, the
-- per-day prevailing version and the run starts are chained as CTEs instead of
-- being materialised in tables.
;with dates(thedate) as (
-- one row per day: year start plus a day offset from master..spt_values
select dateadd(yy,years.number,0)+days.number
from master..spt_values years
join master..spt_values days
on days.type='p' and days.number < datepart(dy,dateadd(yy,years.number+1,0)-1)
where years.type='p' and years.number between 100 and 150
-- note: 100-150 creates dates in the year range 2000-2050
-- adjust as required
), tmp1 as (
-- highest version covering each date, per group
select t.groupId, d.thedate, max(t.versionId) versionId
from dates d
join #Data t on t.startDate <= d.thedate and d.thedate <= t.endDate
group by t.groupId, d.thedate
), t as (
-- run starts: days whose previous day does not carry the same group and version
select a.*, rn=row_number() over (partition by a.groupId order by a.thedate)
from tmp1 a
left join tmp1 b on b.thedate = dateadd(d,-1,a.thedate) and a.groupId = b.groupId and a.versionId = b.versionId
where b.versionId is null
)
-- each run ends the day before the next run of the same group starts
select c.groupId, c.thedate startdate, dateadd(d,-1,d.thedate) enddate, c.versionId
from t c
left join t d on d.rn=c.rn+1 and c.groupId = d.groupId
order by groupId, startdate;
Updated due to some feedback from the comments. I'm not going to worry about the end cases that a few people have pointed out since they've been proven trivial to solve in other Answers, but I wanted to go ahead and get a working version out that didn't require DDL... I figure it's just good to have options. :-)
This code should work:
-- Cut each group's timeline at every interval start and at the day after every
-- interval end, then label each resulting segment with the highest version
-- whose interval covers the segment's last day.
;WITH boundary AS (
    -- Distinct cut points per group (UNION removes duplicates).
    SELECT groupID, startDate FROM #Data
    UNION
    SELECT groupID, DATEADD(DAY, 1, endDate) AS startDate FROM #Data
),
numbered AS (
    SELECT
        groupId,
        startDate,
        ROW_NUMBER() OVER (PARTITION BY groupID ORDER BY startDate) AS rownumber
    FROM boundary
),
segment AS (
    SELECT
        cur.groupId,
        cur.startDate,
        -- The last cut point has no successor; fall back to the hard-coded end date.
        COALESCE(DATEADD(DAY, -1, nxt.startDate), ('2012-12-31')) AS segment_end_date
    FROM numbered AS cur
    LEFT OUTER JOIN numbered AS nxt
        ON nxt.groupId = cur.groupId
       AND nxt.rownumber = cur.rownumber + 1
)
SELECT
    seg.groupId,
    seg.startDate,
    seg.segment_end_date,
    MAX(cover.versionId)
FROM segment AS seg
LEFT OUTER JOIN #Data AS cover
    ON cover.groupId = seg.groupId
   AND seg.segment_end_date BETWEEN cover.startDate AND cover.endDate
-- Drop segments that would end before they start.
WHERE seg.startDate <= seg.segment_end_date
GROUP BY
    seg.groupId,
    seg.startDate,
    seg.segment_end_date
ORDER BY
    seg.groupId,
    seg.startDate
There are a couple of small caveats I had to accept to get it into a single SQL statement. First, the max end date is not dynamic; I hard-coded '2012-12-31'. You can replace it with a MAX(endDate), but you can't put that in the GROUP BY clause. If you can do this in a procedure, you can do:
select @max_end_date = MAX(endDate) from #Data
(after declaring @max_end_date with the same type as endDate) and replace '2012-12-31' with @max_end_date.
Second, I do not guarantee that two adjacent segments won't have the same value! This may or may not be important to you... that is, if you had the following:
Interval 1: 111111
Interval 2: 22222222222222
Your output would be:
Interval 1: 2222
Interval 2: 2222222222
Still, I think it's worth hitting it in a simple and efficient SQL query. It may not be hard to fix those caveats, but it didn't matter to what I was working on, so I haven't bothered yet.
If the end dates are important, as well as gaps, here's a way you can do it. This solution could also be adapted to work if your versions are datetimes instead of just dates.
First a bunch of functions
One to get the version at a given date
-- Returns the highest VersionID among the intervals of @GroupID that cover
-- @Date, or NULL when no interval of that group covers it.
--   @GroupID  group whose intervals are searched
--   @Date     point in time to look up
-- Variables use the @ sigil; a leading # denotes a temp table in T-SQL and is
-- not valid for parameters or local variables.
Create Function dbo.VersionAtDate(@GroupID int, @Date datetime) Returns int as
Begin
    Declare @Ret int = Null
    Select
        @Ret = Max(VersionID)
    From
        dbo.VersionedIntervals iv
    Where
        iv.GroupID = @GroupID And
        iv.StartDate <= @Date And
        iv.EndDate + 1 > @Date -- if dates were half open intervals this would just be iv.EndDate > @Date
    Return @Ret
End
Next to get the midpoint of two datetimes (minute resolution):
-- Returns the datetime halfway between @Start and @End, at minute resolution.
-- The minute difference is halved with integer division, so the fractional
-- half-minute is discarded.
-- Parameters use the @ sigil; # is not valid for parameters in T-SQL.
Create Function dbo.Midpoint(@Start datetime, @End datetime) Returns datetime as
Begin
    Return DateAdd(Minute, DateDiff(Minute, @Start, @End) / 2, @Start)
End
Version at a midpoint:
-- Returns the version in effect for @GroupID at the midpoint of
-- [@Start, @End], by passing dbo.Midpoint's result to dbo.VersionAtDate.
-- Parameters use the @ sigil; # is not valid for parameters in T-SQL.
Create Function dbo.VersionAtMidpoint(@GroupID int, @Start datetime, @End datetime) returns int as
Begin
    Return dbo.VersionAtDate(@GroupID, dbo.Midpoint(@Start, @End))
End;
Finally a table valued function to help with the fact that some points are the start of one range and the end of another, and it helps to get two rows from one input for this:
-- Emits the row(s) that describe what happens to the current run when the
-- next bound date is reached.
-- returns two rows if a point is the end of one interval and the
-- start of another
--   @GroupID  group being processed
--   @RN       row number written to the emitted row(s)
--   @Start    start of the current run
--   @End      end of the current run so far
--   @Next     next bound date to examine
--   @Version  version of the current run
-- NOTE(review): a NULL version (no covering interval) makes the = / !=
-- comparisons below evaluate to unknown, so control falls to the Else branch.
-- Parameters, locals and the return table use the @ sigil; # is not valid
-- for them in T-SQL.
Create Function dbo.EndPoints(@GroupID int, @RN bigint, @Start datetime, @End datetime, @Next datetime, @Version int)
Returns @EndPoints Table (
    GroupID int,
    RN bigint,
    Version int,
    StartDate datetime,
    EndDate datetime
) As
Begin
    Declare @NextVersion int, @VersionAtMidpoint int
    Set @NextVersion = dbo.VersionAtDate(@GroupID, @Next)
    If @NextVersion = @Version
        -- interval carries on
        Insert Into @EndPoints Select @GroupID, @RN, @Version, @Start, @Next
    Else
    Begin
        -- interval has ended
        -- name cased as in the function's definition, for case-sensitive collations
        Set @VersionAtMidpoint = dbo.VersionAtMidpoint(@GroupID, @End, @Next)
        If @VersionAtMidpoint != @Version
            -- we have something like this, start a run of 3s (run of 4s is already ended by previous call)
            -- 3333333
            -- 44
            Insert Into @EndPoints Select @GroupID, @RN, @VersionAtMidpoint, @End, @Next
        Else
        Begin
            -- We have something like this, end the run of 3s and start the run of fours
            -- 33333
            -- 444
            -- RN -1 on the closing row: presumably so a caller stepping RN by 1 does not continue from it
            Insert Into @EndPoints Select @GroupID, -1, @Version, @Start, @Next
            Insert Into @EndPoints Select @GroupID, @RN, @NextVersion, @Next, @Next
        End
    End
    Return
End
With all this machinery in place, finally a recursive CTE plus a table variable; you'll need to set maxrecursion appropriately:
-- Every distinct start or end date of each group, numbered in date order.
-- Table variables take the @ sigil; "Declare #Bounds Table" is not valid T-SQL.
Declare @Bounds Table (GroupID int, RN bigint, BoundDate datetime, Primary Key (GroupID, RN))
Insert Into
    @Bounds
Select
    GroupID,
    Row_Number() Over (Partition By GroupID Order By BoundDate),
    BoundDate
From (
    Select
        GroupID,
        StartDate As BoundDate
    From
        dbo.VersionedIntervals
    Union -- not Union All: a date that is both a start and an end is kept once
    Select
        GroupID,
        EndDate
    From
        dbo.VersionedIntervals
) a
-- Walk the bounds of each group in order. The recursion goes one level deeper
-- per bound, so a group with more bounds than the MAXRECURSION limit (100 by
-- default) needs an Option (MaxRecursion n) hint on the final Select.
;With VersionedBounds (GroupID, RN, StartDate, EndDate, Version) as (
    -- anchor: the first bound of each group, as a zero-length range
    Select
        GroupID,
        RN,
        BoundDate,
        BoundDate,
        dbo.VersionAtDate(GroupID, BoundDate)
    From
        @Bounds
    Where
        RN = 1
    Union All
    -- step: move to the next bound and let dbo.EndPoints emit the row(s) for it
    Select
        e.GroupID,
        e.RN,
        e.StartDate,
        e.EndDate,
        e.Version
    From
        @Bounds b
    Inner Join
        VersionedBounds v
        On v.GroupID = b.GroupID And b.RN = v.RN + 1
    Cross Apply
        dbo.EndPoints(v.GroupID, b.RN, v.StartDate, v.EndDate, b.BoundDate, v.Version) e
)
-- Collapse the rows that share a start date, keeping the latest end date
-- and the highest version.
Select
    GroupID,
    StartDate,
    Max(EndDate) As EndDate,
    Max(Version) As Version
From
    VersionedBounds
Group By
    GroupID,
    StartDate
Order By
    GroupID,
    StartDate
http://sqlfiddle.com/#!6/b95bd/2