SQL server group / partition to condense history table - sql

Got a table of dates someone was in a particular category like this:
drop table if exists #category
create table #category (personid int, categoryid int, startdate datetime, enddate datetime)
insert into #category
select * from
(
select 1 Personid, 1 CategoryID, '01/04/2010' StartDate, '31/07/2016' EndDate union
select 1 Personid, 5 CategoryID, '07/08/2016' StartDate, '31/03/2019' EndDate union
select 1 Personid, 5 CategoryID, '01/04/2019' StartDate, '01/04/2019' EndDate union
select 1 Personid, 5 CategoryID, '02/04/2019' StartDate, '11/08/2019' EndDate union
select 1 Personid, 4 CategoryID, '12/08/2019' StartDate, '03/11/2019' EndDate union
select 1 Personid, 5 CategoryID, '04/11/2019' StartDate, '22/03/2020' EndDate union
select 1 Personid, 5 CategoryID, '23/03/2020' StartDate, NULL EndDate union
select 2 Personid, 1 CategoryID, '01/04/2010' StartDate, '09/04/2015' EndDate union
select 2 Personid, 4 CategoryID, '10/04/2015' StartDate, '31/03/2018' EndDate union
select 2 Personid, 4 CategoryID, '01/04/2018' StartDate, '31/03/2019' EndDate union
select 2 Personid, 4 CategoryID, '01/04/2019' StartDate, '23/06/2019' EndDate union
select 2 Personid, 4 CategoryID, '24/06/2019' StartDate, NULL EndDate
) x
order by personid, startdate
I'm trying to condense it so I get this:
PersonID
categoryid
startdate
EndDate
1
1
01/04/2010
31/07/2016
1
5
07/08/2016
11/08/2019
1
4
12/08/2019
03/11/2019
1
5
04/11/2019
NULL
2
1
01/04/2010
09/04/2015
2
4
01/04/2015
NULL
I'm having issues with people like personid 1 where they are in (e.g.) category 5, then go into category 4 and them back into category 5.
So doing something like:
select
personid,
categoryid,
min(startdate) startdate,
max(enddate) enddate
from #category
group by
personid, categoryid
gives me the earliest date from category 5's first period, and the latest date from the second period - and means it creates an overlapping period.
So I tried partitioning it with a rownum or rank, but it still does the same thing - i.e. treats the 'category 5's as the same group:
select
rank() over (partition by personid, categoryid order by personid, startdate) rank,
c.*
from #category c
order by personid, startdate
rank
personid
categoryid
startdate
enddate
1
1
1
2010-04-01 00:00:00.000
2016-07-31 00:00:00.000
1
1
5
2016-08-07 00:00:00.000
2019-03-31 00:00:00.000
2
1
5
2019-04-01 00:00:00.000
2019-04-01 00:00:00.000
3
1
5
2019-04-02 00:00:00.000
2019-08-11 00:00:00.000
1
1
4
2019-08-12 00:00:00.000
2019-11-03 00:00:00.000
4
1
5
2019-11-04 00:00:00.000
2020-03-22 00:00:00.000
5
1
5
2020-03-23 00:00:00.000
NULL
1
2
1
2010-04-01 00:00:00.000
2015-04-09 00:00:00.000
1
2
4
2015-04-10 00:00:00.000
2018-03-31 00:00:00.000
2
2
4
2018-04-01 00:00:00.000
2019-03-31 00:00:00.000
3
2
4
2019-04-01 00:00:00.000
2019-06-23 00:00:00.000
4
2
4
2019-06-24 00:00:00.000
NULL
You can see in the rank column that the category 5's start off 1,2,3, miss a row and carry on 4, 5 so obvs in the same partition - I thought that adding the order by clause would force it to start a new partition when the category changed from 5 to 4 and back again.
Any thoughts?

This is a type of gaps and islands problem. However, if your data tiles perfectly (no gaps) as it does in your example data, then you can do this without any aggregation at all -- which should be the most efficient method:
select personid, categoryid, startdate,
dateadd(day, -1, lead(startdate) over (partition by personid order by startdate)) as enddate
from (select c.*,
lag(categoryid) over (partition by personid order by startdate) as prev_categoryid
from #category c
) c
where prev_categoryid is null or prev_categoryid <> categoryid;
The where clause only selects the rows where the category changes. The lead() then gets the next start date -- and subtracts 1 for your desired enddate.

Related

Find max of total days using Partition By [duplicate]

This question already has answers here:
Merge overlapping date intervals
(9 answers)
Closed 4 months ago.
Need SQL Query to get a max of the sum of Days where enddate is equal to startdate
.Below is a table
ID
StartDate
EndDate
Days
121
01-01-2022
01-03-2022
2
121
01-03-2022
01-04-2022
1
121
01-04-2022
01-06-2022
2
121
01-07-2022
01-08-2022
1
121
01-08-2022
01-09-2022
1
In the above table, the 01-01-2022 to 01-06-2022 sum is 5 which is greater than the sum of 2 from 01-07-2022 to 01-09-2022.
Output required
ID
Days
121
5
I have written as per my understanding. correct me if I did worse
with table as (
select 121 id, "12-28-2021" startdate, "12-30-2021" enddate, 2 days
union all
select 121 id, "12-30-2021" startdate, "12-31-2021" enddate, 1 days
union all
select 121 id, "01-01-2022" startdate, "01-03-2022" enddate, 2 days
union all
select 121 id, "01-03-2022" startdate, "01-04-2022" enddate, 1 days
union all
select 121 id, "01-04-2022" startdate, "01-06-2022" enddate, 2 days
union all
select 121 id, "01-07-2022" startdate, "01-08-2022" enddate, 1 days
union all
select 121 id, "01-08-2022" startdate, "01-09-2022" enddate, 1 days
)
select
table3.id,
max(days) days
from
(
select
table2.id,
sum(days) days
from
(
select
table1.*,
case
when sum(x) over(rows between unbounded preceding and 1 preceding) is null then 0
else sum(x) over(rows between unbounded preceding and 1 preceding)
end as y
from
(
select
table.*,
case
when enddate = lead(startdate) over(order by startdate) then 0
else 1
end as x
from table
) table1
) table2
group by id,y
) table3
group by id

Get max date for each from either of 2 columns

I have a table like below
AID BID CDate
-----------------------------------------------------
1 2 2018-11-01 00:00:00.000
8 1 2018-11-08 00:00:00.000
1 3 2018-11-09 00:00:00.000
7 1 2018-11-15 00:00:00.000
6 1 2018-12-24 00:00:00.000
2 5 2018-11-02 00:00:00.000
2 7 2018-12-15 00:00:00.000
And I am trying to get a result set as follows
ID MaxDate
-------------------
1 2018-12-24 00:00:00.000
2 2018-12-15 00:00:00.000
Each value in the id columns(AID,BID) should return the max of CDate .
ex: in the case of 1, its max CDate is 2018-12-24 00:00:00.000 (here 1 appears under BID)
in the case of 2 , max date is 2018-12-15 00:00:00.000 . (here 2 is under AID)
I tried the following.
1.
select
g.AID,g.BID,
max(g.CDate) as 'LastDate'
from dbo.TT g
inner join
(select AID,BID,max(CDate) as maxdate
from dbo.TT
group by AID,BID)a
on (a.AID=g.AID or a.BID=g.BID)
and a.maxdate=g.CDate
group by g.AID,g.BID
and 2.
SELECT
AID,
CDate
FROM (
SELECT
*,
max_date = MAX(CDate) OVER (PARTITION BY [AID])
FROM dbo.TT
) AS s
WHERE CDate= max_date
Please suggest a 3rd solution.
You can assemble the data in a table expression first, and the compute the max for each value is simple. For example:
select
id, max(cdate)
from (
select aid as id, cdate from t
union all
select bid, cdate from t
) x
group by id
You seem to only care about values that are in both columns. If this interpretation is correct, then:
select id, max(cdate)
from ((select aid as id, cdate, 1 as is_a, 0 as is_b
from t
) union all
(select bid as id, cdate, 1 as is_a, 0 as is_b
from t
)
) ab
group by id
having max(is_a) = 1 and max(is_b) = 1;

How can I add a sequence / order a data in SQL

I am trying to add a sequence order to my raw data and was wondering if there is an efficient way to do this without a while exists loop as I have more than million records to order.
Example dataset:
CustomerID StartDate EndDate EnrollID
-------------------------------------------
1 1/1/1990 1/1/1991 14994
2 1/1/1990 1/1/1992 14995
2 1/1/1993 1/1/1995 14997
1 1/1/1992 1/1/1993 14996
1 1/1/1993 1/1/1994 14997
2 1/1/1995 1/1/1996 14998
3 1/1/1990 1/1/1991 15000
3 1/1/1992 1/1/1993 15001
3 1/1/1995 1/1/1996 15007
Re-ordered data should add a sequence/ order for each customer id based on min(startdate), min(enddate) , min(enrollid)
Final output Dataset should look like below where each customerID records are ordered by min(StartDate), min(EndDate), min(EnrollID)
CustomerID StartDate EndDate EnrollID Sequence_Order
----------------------------------------------------------
1 1/1/1990 1/1/1991 14994 1
1 1/1/1992 1/1/1993 14996 2
1 1/1/1993 1/1/1994 14997 3
2 1/1/1990 1/1/1992 14995 1
2 1/1/1993 1/1/1995 14997 2
2 1/1/1995 1/1/1996 14998 3
3 1/1/1990 1/1/1991 15000 1
3 1/1/1992 1/1/1993 15001 2
3 1/1/1995 1/1/1996 15007 3
Need the fastest way to do this in T-SQL
Use ROW_NUMBER()
SELECT CustomerID, StartDate, EndDate, EnrollID,
ROW_NUMBER() OVER(
PARTITION BY CustomerId
ORDER BY StartDate
,EndDate
,EnrollID
) AS Sequence_Order
FROM Table1
OUTPUT:
CustomerID StartDate EndDate EnrollID Sequence_Order
1 1990-01-01 1991-01-01 14994 1
1 1992-01-01 1993-01-01 14996 2
1 1993-01-01 1994-01-01 14997 3
2 1990-01-01 1992-01-01 14995 1
2 1993-01-01 1995-01-01 14997 2
2 1995-01-01 1996-01-01 14998 3
3 1990-01-01 1991-01-01 15000 1
3 1992-01-01 1993-01-01 15001 2
3 1995-01-01 1996-01-01 15007 3
Follow the link to the demo:
http://sqlfiddle.com/#!18/dbe66/2
Use Row_number
select
*, row_number() over (partition by CustomerID order by StartDate, EndDate, EnrollID) as Sequence_Order
from myTable
The Order= ROW_NUMBER() OVER (ORDER BY column) is uses for. try the below answer
SELECT (Order= ROW_NUMBER() OVER ( ORDER BY CustomerID)) as Sequence ,
CustomerID,
StartDate,
EndDate,
EnrollID
FROM dbo.SomeTable
Reference
Try this:
SELECT customerId, startDate, endDate, enrollID,
ROW_NUMBER() OVER(
PARTITION BY customerId
ORDER BY startDate
,endDate
,enrollID
) AS seq
FROM Table1
ORDER BY customerId
,startDate
,endDate
,enrollID
Order by is required in the end to sort the final output

Date range with minimum and maximum dates from dataset having records with continuous date range

I have a dataset with id ,Status and date range of employees.
The input dataset given below are the details of one employee.
The date ranges in the records are continuous(in exact order) such that startdate of second row will be the next date of enddate of first row.
If an employee takes leave continuously for different months, then the table is storing the info with date range as separated for different months.
For example: In the input set, the employee has taken Sick leave from '16-10-2016' to '31-12-2016' and joined back on '1-1-2017'.
So there are 3 records for this item but the dates are continuous.
In the output I need this as one record as shown in the expected output dataset.
INPUT
Id Status StartDate EndDate
1 Active 1-9-2007 15-10-2016
1 Sick 16-10-2016 31-10-2016
1 Sick 1-11-2016 30-11-2016
1 Sick 1-12-2016 31-12-2016
1 Active 1-1-2017 4-2-2017
1 Unpaid 5-2-2017 9-2-2017
1 Active 10-2-2017 11-2-2017
1 Unpaid 12-2-2017 28-2-2017
1 Unpaid 1-3-2017 31-3-2017
1 Unpaid 1-4-2017 30-4-2017
1 Active 1-5-2017 13-10-2017
1 Sick 14-10-2017 11-11-2017
1 Active 12-11-2017 NULL
EXPECTED OUTPUT
Id Status StartDate EndDate
1 Active 1-9-2007 15-10-2016
1 Sick 16-10-2016 31-12-2016
1 Active 1-1-2017 4-2-2017
1 Unpaid 5-2-2017 9-2-2017
1 Active 10-2-2017 11-2-2017
1 Unpaid 12-2-2017 30-4-2017
1 Active 1-5-2017 13-10-2017
1 Sick 14-10-2017 11-11-2017
1 Active 12-11-2017 NULL
I can't take min(startdate) and max(EndDate) group by id,status because if the same employee has taken another Sick leave then that end date ('11-11-2017' in the example) will come as the End date.
can anyone help me with the query in SQL server 2014?
It suddenly hit me that this is basically a gaps and islands problem - so I've completely changed my solution.
For this solution to work, the dates does not have to be consecutive.
First, create and populate sample table (Please save us this step in your future questions):
DECLARE #T AS TABLE
(
Id int,
Status varchar(10),
StartDate date,
EndDate date
);
SET DATEFORMAT DMY; -- This is needed because how you specified your dates.
INSERT INTO #T (Id, Status, StartDate, EndDate) VALUES
(1, 'Active', '1-9-2007', '15-10-2016'),
(1, 'Sick', '16-10-2016', '31-10-2016'),
(1, 'Sick', '1-11-2016', '30-11-2016'),
(1, 'Sick', '1-12-2016', '31-12-2016'),
(1, 'Active', '1-1-2017', '4-2-2017'),
(1, 'Unpaid', '5-2-2017', '9-2-2017'),
(1, 'Active', '10-2-2017', '11-2-2017'),
(1, 'Unpaid', '12-2-2017', '28-2-2017'),
(1, 'Unpaid', '1-3-2017', '31-3-2017'),
(1, 'Unpaid', '1-4-2017', '30-4-2017'),
(1, 'Active', '1-5-2017', '13-10-2017'),
(1, 'Sick', '14-10-2017', '11-11-2017'),
(1, 'Active', '12-11-2017', NULL);
The (new) common table expression:
;WITH CTE AS
(
SELECT Id,
Status,
StartDate,
EndDate,
ROW_NUMBER() OVER(PARTITION BY Id ORDER BY StartDate)
- ROW_NUMBER() OVER(PARTITION BY Id, Status ORDER BY StartDate) As IslandId,
ROW_NUMBER() OVER(PARTITION BY Id ORDER BY StartDate DESC)
- ROW_NUMBER() OVER(PARTITION BY Id, Status ORDER BY StartDate DESC) As ReverseIslandId
FROM #T
)
The (new) query:
SELECT DISTINCT Id,
Status,
MIN(StartDate) OVER(PARTITION BY IslandId, ReverseIslandId) As StartDate,
NULLIF(MAX(ISNULL(EndDate, '9999-12-31')) OVER(PARTITION BY IslandId, ReverseIslandId), '9999-12-31') As EndDate
FROM CTE
ORDER BY StartDate
(new) Results:
Id Status StartDate EndDate
1 Active 01.09.2007 15.10.2016
1 Sick 16.10.2016 31.12.2016
1 Active 01.01.2017 04.02.2017
1 Unpaid 05.02.2017 09.02.2017
1 Active 10.02.2017 11.02.2017
1 Unpaid 12.02.2017 30.04.2017
1 Active 01.05.2017 13.10.2017
1 Sick 14.10.2017 11.11.2017
1 Active 12.11.2017 NULL
You can see a live demo on rextester.
Please note that string representation of dates in SQL should be acccording to ISO 8601 - meaning either yyyy-MM-dd or yyyyMMdd as it's unambiguous and will always be interpreted correctly by SQL Server.
It's an example of GROUPING AND WINDOW.
First you set a reset point for each Status
Sum to set a group
Then get max/min dates of each group.
;with x as
(
select Id, Status, StartDate, EndDate,
iif (lag(Status) over (order by Id, StartDate) = Status, null, 1) rst
from emp
), y as
(
select Id, Status, StartDate, EndDate,
sum(rst) over (order by Id, StartDate) grp
from x
)
select Id,
MIN(Status) as Status,
MIN(StartDate) StartDate,
MAX(EndDate) EndDate
from y
group by Id, grp
order by Id, grp
GO
Id | Status | StartDate | EndDate
-: | :----- | :------------------ | :------------------
1 | Active | 01/09/2007 00:00:00 | 15/10/2016 00:00:00
1 | Sick | 16/10/2016 00:00:00 | 31/12/2016 00:00:00
1 | Active | 01/01/2017 00:00:00 | 04/02/2017 00:00:00
1 | Unpaid | 05/02/2017 00:00:00 | 09/02/2017 00:00:00
1 | Active | 10/02/2017 00:00:00 | 11/02/2017 00:00:00
1 | Unpaid | 12/02/2017 00:00:00 | 30/04/2017 00:00:00
1 | Active | 01/05/2017 00:00:00 | 13/10/2017 00:00:00
1 | Sick | 14/10/2017 00:00:00 | 11/11/2017 00:00:00
1 | Active | 12/11/2017 00:00:00 | null
dbfiddle here
Here's an alternative answer that doesn't use LAG.
First I need to take a copy of your test data:
DECLARE #table TABLE (Id INT, [Status] VARCHAR(50), StartDate DATE, EndDate DATE);
INSERT INTO #table SELECT 1, 'Active', '20070901', '20161015';
INSERT INTO #table SELECT 1, 'Sick', '20161016', '20161031';
INSERT INTO #table SELECT 1, 'Sick', '20161101', '20161130';
INSERT INTO #table SELECT 1, 'Sick', '20161201', '20161231';
INSERT INTO #table SELECT 1, 'Active', '20170101', '20170204';
INSERT INTO #table SELECT 1, 'Unpaid', '20170205', '20170209';
INSERT INTO #table SELECT 1, 'Active', '20170210', '20170211';
INSERT INTO #table SELECT 1, 'Unpaid', '20170212', '20170228';
INSERT INTO #table SELECT 1, 'Unpaid', '20170301', '20170331';
INSERT INTO #table SELECT 1, 'Unpaid', '20170401', '20170430';
INSERT INTO #table SELECT 1, 'Active', '20170501', '20171013';
INSERT INTO #table SELECT 1, 'Sick', '20171014', '20171111';
INSERT INTO #table SELECT 1, 'Active', '20171112', NULL;
Then the query is:
WITH add_order AS (
SELECT
*,
ROW_NUMBER() OVER (ORDER BY StartDate) AS order_id
FROM
#table),
links AS (
SELECT
a1.Id,
a1.[Status],
a1.order_id,
MIN(a1.order_id) AS start_order_id,
MAX(ISNULL(a2.order_id, a1.order_id)) AS end_order_id,
MIN(a1.StartDate) AS StartDate,
MAX(ISNULL(a2.EndDate, a1.EndDate)) AS EndDate
FROM
add_order a1
LEFT JOIN add_order a2 ON a2.Id = a1.Id AND a2.[Status] = a1.[Status] AND a2.order_id = a1.order_id + 1 AND a2.StartDate = DATEADD(DAY, 1, a1.EndDate)
GROUP BY
a1.Id,
a1.[Status],
a1.order_id),
merged AS (
SELECT
l1.Id,
l1.[Status],
l1.[StartDate],
ISNULL(l2.EndDate, l1.EndDate) AS EndDate,
ROW_NUMBER() OVER (PARTITION BY l1.Id, l1.[Status], ISNULL(l2.EndDate, l1.EndDate) ORDER BY l1.order_id) AS link_id
FROM
links l1
LEFT JOIN links l2 ON l2.order_id = l1.end_order_id)
SELECT
Id,
[Status],
StartDate,
EndDate
FROM
merged
WHERE
link_id = 1
ORDER BY
StartDate;
Results are:
Id Status StartDate EndDate
1 Active 2007-09-01 2016-10-15
1 Sick 2016-10-16 2016-12-31
1 Active 2017-01-01 2017-02-04
1 Unpaid 2017-02-05 2017-02-09
1 Active 2017-02-10 2017-02-11
1 Unpaid 2017-02-12 2017-04-30
1 Active 2017-05-01 2017-10-13
1 Sick 2017-10-14 2017-11-11
1 Active 2017-11-12 NULL
How does it work? First I add a sequence number, to assist with merging contiguous rows together. Then I determine the rows that can be merged together, add a number to identify the first row in each set that can be merged, and finally pick the first rows out of the final CTE. Note that I also have to handle rows that can't be merged, hence the LEFT JOINs and ISNULL statements.
Just for interest, this is what the output from the final CTE looks like, before I filter out all but the rows with a link_id of 1:
Id Status StartDate EndDate link_id
1 Active 2007-09-01 2016-10-15 1
1 Sick 2016-10-16 2016-12-31 1
1 Sick 2016-11-01 2016-12-31 2
1 Sick 2016-12-01 2016-12-31 3
1 Active 2017-01-01 2017-02-04 1
1 Unpaid 2017-02-05 2017-02-09 1
1 Active 2017-02-10 2017-02-11 1
1 Unpaid 2017-02-12 2017-04-30 1
1 Unpaid 2017-03-01 2017-04-30 2
1 Unpaid 2017-04-01 2017-04-30 3
1 Active 2017-05-01 2017-10-13 1
1 Sick 2017-10-14 2017-11-11 1
1 Active 2017-11-12 NULL 1
You could use lag() and lead() function together to check the previous and next status
WITH CTE AS
(
select *,
COALESCE(LEAD(status) OVER(ORDER BY (select 1)), '0') Nstatus,
COALESCE(LAG(status) OVER(ORDER BY (select 1)), '0') Pstatus
from table
)
SELECT * FROM CTE
WHERE (status <> Nstatus AND status <> Pstatus) OR
(status <> Pstatus)

T/SQL - Group/Multiply records

Source date:
CREATE TABLE #Temp (ID INT Identity(1,1) Primary Key, BeginDate datetime, EndDate datetime, GroupBy INT)
INSERT INTO #Temp
SELECT '2015-06-05 00:00:00.000','2015-06-12 00:00:00.000',7
UNION
SELECT '2015-06-05 00:00:00.000', '2015-06-08 00:00:00.000',7
UNION
SELECT '2015-10-22 00:00:00.000', '2015-10-31 00:00:00.000',7
SELECT *, DATEDIFF(DAY,BeginDate, EndDate) TotalDays FROM #Temp
DROP TABLE #Temp
ID BeginDate EndDate GroupBy TotalDays
1 6/5/15 0:00 6/8/15 0:00 7 3
2 6/5/15 0:00 6/12/15 0:00 7 7
3 10/22/15 0:00 10/31/15 0:00 7 9
Desired Output:
ID BeginDate EndDate GroupBy TotalDays GroupCnt GroupNum
1 6/5/15 0:00 6/8/15 0:00 7 3 1 1
2 6/5/15 0:00 6/12/15 0:00 7 7 1 1
3 10/22/15 0:00 10/29/15 0:00 7 9 2 1
3 10/29/15 0:00 10/31/15 0:00 7 9 2 2
Goal:
Group the records based on ID/BeginDate/EndDate.
Based on the GroupBy number (# of days) and TotalDays (days diff),
if the GroupBy => TotalDays, keep a single group record
else multiply the group records (1 record per GroupBy count) while staying within TotalDays limit.
Apologies if it's confusing but basically, in the above example, there should be one record for each group (ID/BeginDate/EndDate) for the record where days diff b/w Begin/End date = 7 or less (GroupBy).
If the days diff goes above 7 days, create another record (for every additional 7 days diff).
So since 1st two records have days diff of 7 days or less, there's only one record.
The 3rd record has days diff of 9 (7 + 2). Therefore, there should be 2 records (1st for the first 7 days and 2nd for the additional 2 days).
GroupCNT = how many records there're of the grouped records after applying the above records.
GroupNum is basically row number of the group.
GroupBy # can be different for each record. Dataset is huge so performance does matter.
One pattern I was able to figure out was related to the modulus b/w GroupBy and days diff.
When the GroupBy value is < days diff, modulus is always less than GroupBy. When the GroupBy value = days diff, modulus is always 0. And when the GroupBy value > days diff, modulus is always equals GroupBy. I'm not sure if/how to use that to group/multiply records to meet the requirement.
SELECT DISTINCT
ID
, BeginDate
, EndDate
, GroupBy
, DATEDIFF(DAY,BeginDate, EndDate) TotalDays
, CAST(GroupBy as decimal(18,6))%CAST(DATEDIFF(DAY,BeginDate, EndDate) AS decimal(18,6)) Modulus
, CASE WHEN DATEDIFF(DAY,BeginDate, EndDate) <= GroupBy THEN BeginDate END NewBeginDate
, CASE WHEN DATEDIFF(DAY,BeginDate, EndDate) <= GroupBy THEN EndDate END NewEndDate
FROM #Temp
Update:
Forgot to mention/include that the begin/enddate, when the records gets multiplied, will change accordingly. In other words, begin/end date will reflect the GroupBy - desired output shows what I mean more clearly in the 3rd and 4th record.
Also, GroupCnt/GroupNum are not as important to calculate as grouping/multiplying the records.
You could do something like this using a recursive CTE..
;WITH cte AS (
SELECT ID,
BeginDate,
EndDate,
GroupBy,
DATEDIFF(DAY, BeginDate, EndDate) AS TotalDays,
1 AS GroupNum
FROM #Temp
UNION ALL
SELECT ID,
BeginDate,
EndDate,
GroupBy,
TotalDays,
GroupNum + 1
FROM cte
WHERE GroupNum * GroupBy < TotalDays
)
SELECT ID,
BeginDate = CASE WHEN GroupNum = 1 THEN BeginDate
ELSE DATEADD(DAY, GroupBy * (GroupNum - 1), BeginDate)
END ,
EndDate = CASE WHEN TotalDays <= GroupBy THEN EndDate
WHEN DATEADD(DAY, GroupBy * GroupNum, BeginDate) > EndDate THEN EndDate
ELSE DATEADD(DAY, GroupBy * GroupNum, BeginDate)
END ,
GroupBy,
TotalDays,
COUNT(*) OVER (PARTITION BY ID) GroupCnt,
GroupNum
FROM cte
OPTION (MAXRECURSION 0)
the cte builds out a recordset like this.
ID BeginDate EndDate GroupBy TotalDays GroupNum
----------- ----------------------- ----------------------- ----------- ----------- -----------
1 2015-06-05 00:00:00.000 2015-06-08 00:00:00.000 7 3 1
2 2015-06-05 00:00:00.000 2015-06-12 00:00:00.000 7 7 1
3 2015-10-22 00:00:00.000 2015-10-31 00:00:00.000 7 9 1
3 2015-10-22 00:00:00.000 2015-10-31 00:00:00.000 7 9 2
then you just have to take this and use some case statements to determine what the begin and end date should be.
you should end up with
ID BeginDate EndDate GroupBy TotalDays GroupCnt GroupNum
----------- ----------------------- ----------------------- ----------- ----------- ----------- -----------
1 2015-06-05 00:00:00.000 2015-06-08 00:00:00.000 7 3 1 1
2 2015-06-05 00:00:00.000 2015-06-12 00:00:00.000 7 7 1 1
3 2015-10-22 00:00:00.000 2015-10-29 00:00:00.000 7 9 2 1
3 2015-10-29 00:00:00.000 2015-10-31 00:00:00.000 7 9 2 2
since you're using SQL 2012, you can also use the LAG and LEAD functions in your final query.
;WITH cte AS (
SELECT ID,
BeginDate,
EndDate,
GroupBy,
DATEDIFF(DAY, BeginDate, EndDate) AS TotalDays,
1 AS GroupNum
FROM #Temp
UNION ALL
SELECT ID,
BeginDate,
EndDate,
GroupBy,
TotalDays,
GroupNum + 1
FROM cte
WHERE GroupNum * GroupBy < TotalDays
)
SELECT ID,
BeginDate = COALESCE(LAG(BeginDate) OVER (PARTITION BY ID ORDER BY GroupNum) + GroupBy * (GroupNum - 1), BeginDate),
EndDate = COALESCE(LEAD(BeginDate) OVER (PARTITION BY ID ORDER BY GroupNum) + GroupBy * GroupNum, EndDate),
GroupBy,
TotalDays,
COUNT(*) OVER (PARTITION BY ID) GroupCnt,
GroupNum
FROM cte
OPTION (MAXRECURSION 0)
CREATE TABLE dim_number (id INT);
INSERT INTO dim_number VALUES ((0), (1), (2), (3)); -- Populate this to a large number
SELECT
#Temp.Id,
CASE WHEN dim_number.id = 0
THEN #Temp.BeginDate
ELSE DATEADD(DAY, dim_number.id * #Temp.GroupBy, #Temp.BeginDate)
END AS BeginDate,
CASE WHEN dim_number.id = parts.count
THEN #Temp.EndDate
ELSE DATEADD(DAY, (dim_number.id + 1) * #Temp.GroupBy, #Temp.BeginDate)
END AS EndDate,
#Temp.GroupBy AS GroupBy,
DATEDIFF(DAY, #Temp.BeginDate, #Temp.EndDate) AS TotalDays,
parts.count + 1 AS GroupCnt,
dim_number.id + 1 AS GroupNum
FROM
#Temp
CROSS APPLY
(SELECT DATEDIFF(DAY, #Temp.BeginDate, #Temp.EndDate) / #Temp.GroupBy AS count) AS parts
INNER JOIN
dim_number
ON dim_number.id >= 0
AND dim_number.id <= parts.count