How to aggregate (counting distinct items) over a sliding window in SQL Server? - sql

I am currently using this query (in SQL Server) to count the number of unique item each day:
-- Distinct items seen on each individual date.
select
    Date,
    count(distinct item)
from myTable
group by Date
order by Date
How can I transform this to get for each date the number of unique item over the past 3 days (including the current day)?
The output should be a table with 2 columns:
One column with all dates in the original table, and a second column with the number of unique items per date.
for instance if original table is:
Date Item
01/01/2018 A
01/01/2018 B
02/01/2018 C
03/01/2018 C
04/01/2018 C
With my query above I currently get the unique count for each day:
Date count
01/01/2018 2
02/01/2018 1
03/01/2018 1
04/01/2018 1
and I am looking to get as result the unique count over 3 days rolling window:
Date count
01/01/2018 2
02/01/2018 3 (because items ABC on 1st and 2nd Jan)
03/01/2018 3 (because items ABC on 1st,2nd,3rd Jan)
04/01/2018 1 (because only item C on 2nd,3rd,4th Jan)

Using an apply provides a convenient way to form sliding windows
-- Sample table: one row per (date, item) observation.
CREATE TABLE myTable
([DateCol] datetime, [Item] varchar(1))
;
INSERT INTO myTable
([DateCol], [Item])
VALUES
('2018-01-01 00:00:00', 'A'),
('2018-01-01 00:00:00', 'B'),
('2018-01-02 00:00:00', 'C'),
('2018-01-03 00:00:00', 'C'),
('2018-01-04 00:00:00', 'C')
;
-- Index the column the window predicate filters on. The table defines it as
-- DateCol; there is no column named Date.
CREATE NONCLUSTERED INDEX IX_DateCol
ON myTable([DateCol])
;
Query:
-- For every row's date, count the distinct items dated from two days earlier
-- through that date. DISTINCT collapses the repeated output rows produced
-- when a date occurs on more than one input row.
SELECT DISTINCT
    cur.dateCol,
    win.ItemCount
FROM myTable AS cur
OUTER APPLY (
    SELECT COUNT(DISTINCT prev.item) AS ItemCount
    FROM myTable AS prev
    WHERE prev.DateCol >= DATEADD(day, -2, cur.DateCol)
      AND prev.DateCol <= cur.DateCol
) AS win
ORDER BY cur.dateCol ASC
Results:
| dateCol | ItemCount |
|----------------------|-----------|
| 2018-01-01T00:00:00Z | 2 |
| 2018-01-02T00:00:00Z | 3 |
| 2018-01-03T00:00:00Z | 3 |
| 2018-01-04T00:00:00Z | 1 |
There may be some performance gains by reducing the date column prior to using the apply, like so:
-- Rolling 3-day distinct count where the date list is de-duplicated first,
-- so the APPLY is driven by one row per distinct date.
-- Column names follow the sample table definition (DateCol, Item).
select
    d.DateCol
  , oa.ItemCount
from (
    select distinct t1.DateCol
    from myTable t1
) d
outer apply (
    select count(distinct t2.Item) as ItemCount
    from myTable t2
    where t2.DateCol between dateadd(day, -2, d.DateCol) and d.DateCol
) oa
order by d.DateCol ASC
;
Instead of using select distinct in that subquery you could use group by instead but the execution plan will remain the same.
Demo at SQL Fiddle

The most straight forward solution is to join the table with itself based on dates:
-- Self-join: pair each row with every row dated inside the 3-day window that
-- ends on its date, then count the distinct items per anchor date.
select
    anchor.DateCol,
    count(distinct win.Item) as C
from testdata as anchor
left join testdata as win
    on win.DateCol >= dateadd(dd, -2, anchor.DateCol)
   and win.DateCol <= anchor.DateCol
group by anchor.DateCol
order by anchor.DateCol
Output:
| DateCol | C |
|-------------------------|---|
| 2018-01-01 00:00:00.000 | 2 |
| 2018-01-02 00:00:00.000 | 3 |
| 2018-01-03 00:00:00.000 | 3 |
| 2018-01-04 00:00:00.000 | 1 |

GROUP BY should be faster than DISTINCT (make sure to have an index on your Date column)
-- Table variables take an @ prefix; "DECLARE #tbl TABLE" does not parse.
DECLARE @tbl TABLE([Date] DATE, [Item] VARCHAR(100))
;
INSERT INTO @tbl ([Date], [Item]) VALUES
('2018-01-01 00:00:00', 'A'),
('2018-01-01 00:00:00', 'B'),
('2018-01-02 00:00:00', 'C'),
('2018-01-03 00:00:00', 'C'),
('2018-01-04 00:00:00', 'C');
-- One output row per date (GROUP BY); both sub-selects look at rows dated
-- from two days earlier through that date.
SELECT t.[Date]
--Just for control. You can take this part away
,(SELECT DISTINCT t2.[Item] AS [*]
FROM @tbl AS t2
WHERE t2.[Date]<=t.[Date]
AND t2.[Date]>=DATEADD(DAY,-2,t.[Date]) FOR XML PATH('')) AS CountedItems
--This sub-select comes back with your counts
,(SELECT COUNT(DISTINCT t2.[Item])
FROM @tbl AS t2
WHERE t2.[Date]<=t.[Date]
AND t2.[Date]>=DATEADD(DAY,-2,t.[Date])) AS ItemCount
FROM @tbl AS t
GROUP BY t.[Date];
The result
Date CountedItems ItemCount
2018-01-01 AB 2
2018-01-02 ABC 3
2018-01-03 ABC 3
2018-01-04 C 1

This solution is different from other solutions. Can you check performance of this query on real data with comparison to other answers?
The basic idea is that each row can participate in the window for its own date, the day after, or the day after that. So this first expands the row out into three rows with those different dates attached and then it can just use a regular COUNT(DISTINCT) aggregating on the computed date. The HAVING clause is just to avoid returning results for dates that were solely computed and not present in the base data.
-- Sample rows. The literals are dd/mm/yyyy (1st to 4th of January), so they
-- are parsed with CONVERT style 103 rather than a CAST whose result depends
-- on the session's date format.
with cte(Date, Item) as (
select convert(datetime, a, 103), b
from (values
('01/01/2018','A')
,('01/01/2018','B')
,('02/01/2018','C')
,('03/01/2018','C')
,('04/01/2018','C')) t(a,b)
)
-- Each row is copied onto its own date and the two following dates
-- (n = 0, 1, 2); a plain COUNT(DISTINCT) per computed date then gives the
-- rolling count.
select
[Date] = dateadd(dd, n, Date), [Count] = count(distinct Item)
from
cte
cross join (values (0),(1),(2)) t(n)
group by dateadd(dd, n, Date)
-- Keep only computed dates that have at least one n = 0 row, i.e. dates that
-- exist in the base data.
having max(iif(n = 0, 1, 0)) = 1
option (force order)
Output:
| Date | Count |
|-------------------------|-------|
| 2018-01-01 00:00:00.000 | 2 |
| 2018-01-02 00:00:00.000 | 3 |
| 2018-01-03 00:00:00.000 | 3 |
| 2018-01-04 00:00:00.000 | 1 |
It might be faster if you have many duplicate rows:
-- Variant that de-duplicates (Date, Item) pairs before fanning each pair out
-- to offsets 0, 1 and 2. It reads from a `cte` that this fragment does not
-- define itself.
select
    [Date]  = dateadd(dd, offs.n, src.Date),
    [Count] = count(distinct src.Item)
from
    (select distinct Date, Item from cte) as src
    cross join (values (0), (1), (2)) as offs(n)
group by
    dateadd(dd, offs.n, src.Date)
having
    max(case when offs.n = 0 then 1 else 0 end) = 1
option (force order)

Use GETDATE() function to get current date, and DATEADD() to get the last 3 days
-- Per-date distinct item counts, limited to rows dated no earlier than three
-- days before the current date and time.
select
    Date,
    count(distinct item)
from myTable
where [Date] >= dateadd(day, -3, getdate())
group by Date
order by Date

SQL
-- One row per distinct date, with a correlated subquery counting the distinct
-- items dated from two days earlier through that date.
select distinct
    Date,
    (
        select count(distinct w.item)
        from myTable as w
        where w.Date >= dateadd(day, -2, d.Date)
          and w.Date <= d.Date
    ) as count
from myTable as d
order by Date;
Demo
Rextester demo: http://rextester.com/ZRDQ22190

Since COUNT(DISTINCT item) OVER (PARTITION BY [Date]) is not supported you can use dense_rank to emulate that:
-- Within each date partition, the ascending rank plus the descending rank of
-- an item, minus one, is used as the number of distinct items.
select
    Date,
    dense_rank() over (partition by [Date] order by [item] desc)
    + dense_rank() over (partition by [Date] order by [item] asc)
    - 1 as count_distinct_item
from myTable
One thing to note is that dense_rank will count NULL as a distinct value, whereas COUNT will not.
Refer this post for more details.

Here is a simple solution that uses myTable itself as the source of grouping dates (edited for SQLServer dateadd). Note that this query assumes there will be at least one record in myTable for every date; if any date is absent, it will not appear in the query results, even if there are records for the 2 days prior:
-- Drive the result from the distinct dates in myTable; for each one, count
-- the distinct items among the de-duplicated (date, item) pairs dated from
-- two days earlier through that date.
select
    date,
    (
        select count(distinct item)
        from (select distinct date, item from myTable) as pairs
        where pairs.date >= dateadd(day, -2, d.date)
          and pairs.date <= d.date
    ) as count
from (select distinct date from myTable) as d

I solve this question with Math.
z (any day) = 3x + y (where y is the value of z mod 3)
I need from 3 * (x - 1) + y + 1 to 3 * (x - 1) + y + 3
3 * (x- 1) + y + 1 = 3* (z / 3 - 1) + z % 3 + 1
In that case; I can use group by (between 3* (z / 3 - 1) + z % 3 + 1 and z)
-- Counts joined order-detail rows per grouping key. The key is OrderDate when
-- OrderDate lies between a computed lower bound and itself, otherwise 0; the
-- same expression is repeated in SELECT, GROUP BY and ORDER BY.
-- NOTE(review): with integer division, 3 * (z / 3 - 1) + z % 3 + 1 looks like
-- it simplifies to z - 2, which would make the BETWEEN always true and the
-- key equal to OrderDate - confirm before relying on this as a 3-day window.
SELECT iif(OrderDate between 3 * (cast(OrderDate as int) / 3 - 1) + (cast(OrderDate as int) % 3) + 1
and orderdate, Orderdate, 0)
, count(sh.SalesOrderID) FROM Sales.SalesOrderDetail shd
JOIN Sales.SalesOrderHeader sh on sh.SalesOrderID = shd.SalesOrderID
group by iif(OrderDate between 3 * (cast(OrderDate as int) / 3 - 1) + (cast(OrderDate as int) % 3) + 1
and orderdate, Orderdate, 0)
order by iif(OrderDate between 3 * (cast(OrderDate as int) / 3 - 1) + (cast(OrderDate as int) % 3) + 1
and orderdate, Orderdate, 0)
If you need a different number of days per group, you can use:
-- Same grouping expression with the period length held in a variable.
-- Local variables take an @ prefix, and the trailing note has to be a
-- comment for the batch to parse.
declare @n int = 4; -- another day count
SELECT iif(OrderDate between @n * (cast(OrderDate as int) / @n - 1) + (cast(OrderDate as int) % @n) + 1
and orderdate, Orderdate, 0)
, count(sh.SalesOrderID) FROM Sales.SalesOrderDetail shd
JOIN Sales.SalesOrderHeader sh on sh.SalesOrderID = shd.SalesOrderID
group by iif(OrderDate between @n * (cast(OrderDate as int) / @n - 1) + (cast(OrderDate as int) % @n) + 1
and orderdate, Orderdate, 0)
order by iif(OrderDate between @n * (cast(OrderDate as int) / @n - 1) + (cast(OrderDate as int) % @n) + 1
and orderdate, Orderdate, 0)

Related

Loop within id and combine dates between rows in SQL [duplicate]

I have a table in the following format
Id StartDate EndDate Type
1 2012-02-18 2012-03-18 1
1 2012-03-17 2012-06-29 1
1 2012-06-27 2012-09-27 1
1 2014-08-23 2014-09-24 3
1 2014-09-23 2014-10-24 3
1 2014-10-23 2014-11-24 3
2 2015-07-04 2015-08-06 1
2 2015-08-04 2015-09-06 1
3 2013-11-01 2013-12-01 0
3 2018-01-09 2018-02-09 0
I found similar questions here, but not something that could help me solve my problem. I want to merge rows that has the same Id, Type and overlapping date periods.
The result from the above table should be
Id StartDate EndDate Type
1 2012-02-18 2012-09-27 1
1 2014-08-23 2014-11-24 3
2 2015-07-04 2015-09-06 1
3 2013-11-01 2013-12-01 0
3 2018-01-09 2018-02-09 0
In another server, I was able to do it with the following restrictions and the query below:
Didn't care about the Type column, but just the Id
Had a newer version of SQL Server (2012), but now I have 2008 which the code is not compatible
-- Asker's earlier query: sm is a running sum over (Id, StartDate) that grows
-- whenever the CASE yields 1; rows are then aggregated per (Id, sm) to get
-- the min start and max end of each run.
-- NOTE(review): derived table "a" exposes two columns named EndDate (the
-- base column and the LAG result). Presumably the LAG alias was meant to be
-- different; confirm against the original query before reusing this.
SELECT Id
, MIN(StartDate) AS StartDate
, MAX(EndDate) AS EndDate
FROM (
SELECT *
, SUM(CASE WHEN a.EndDate = a.StartDate THEN 0
ELSE 1
END
) OVER (ORDER BY Id, StartDate) sm
FROM (
SELECT Id
, StartDate
, EndDate
, LAG(EndDate, 1, NULL) OVER (PARTITION BY Id ORDER BY Id, EndDate) EndDate
FROM #temptable
) a
) b
GROUP BY Id, sm
Any advice how I can
Include Type on the process
Make it work on SQL Server 2008
This approach uses an additional temp table to identify the groups of overlapping dates, and then performs a quick aggregate based on the groupings.
-- Seed #G: every row gets a UID and an initial GroupId from ROW_NUMBER over
-- the same (Id, Type) ordering.
SELECT *, ROW_NUMBER() OVER (ORDER BY Id, Type) AS UID,
ROW_NUMBER() OVER (ORDER BY Id, Type) AS GroupId INTO #G FROM #TempTable
-- Repeat until an UPDATE changes no rows. @@ROWCOUNT is the system function
-- that reports the row count of the previous statement; "##ROWCOUNT" does
-- not parse.
WHILE @@ROWCOUNT <> 0 BEGIN
-- Pull each row's GroupId down to the smaller GroupId of any overlapping row
-- with the same Id and Type.
UPDATE T1 SET
GroupId = T2.GroupId
FROM #G T1
INNER JOIN (
SELECT T1.UID, CASE WHEN T1.GroupId < T2.GroupId THEN T1.GroupId ELSE T2.GroupId END
FROM #G T1
LEFT OUTER JOIN #G T2
ON T1.Id = T2.Id AND T1.Type = T2.Type AND T1.GroupId <> T2.GroupId
AND T1.StartDate <= T2.EndDate AND T2.StartDate <= T1.EndDate
) T2 (UID, GroupId)
ON T1.UID = T2.UID
WHERE T1.GroupId <> T2.GroupId
END
-- Collapse each group to its earliest start and latest end.
SELECT Id, MIN(StartDate) AS StartDate, MAX(EndDate) AS EndDate, Type
FROM #G G GROUP BY GroupId, Id, Type
This returns the expected values
Id StartDate EndDate Type
----------- ---------- ---------- -----------
1 2012-02-18 2012-09-27 1
1 2014-08-23 2014-11-24 3
2 2015-07-04 2015-09-06 1
3 2013-11-01 2013-12-01 0
3 2018-01-09 2018-02-09 0
This is 2008 compatible. A CTE really is the best way to link up all overlapping records in my opinion. The date overlap logic came from this thread: SO Date Overlap
I added extra data that's more complex to make sure that it's working as expected.
-- Table variables take an @ prefix; "DECLARE #Data table" does not parse.
DECLARE @Data table (Id INT, StartDate DATE, EndDate DATE, Type INT)
INSERT INTO @Data
SELECT 1,'2/18/2012' ,'3/18/2012', 1 UNION ALL
select 1,'3/17/2012','6/29/2012',1 UNION ALL
select 1,'6/27/2012','9/27/2012',1 UNION ALL
select 1,'8/23/2014','9/24/2014',3 UNION ALL
select 1,'9/23/2014','10/24/2014',3 UNION ALL
select 1,'10/23/2014','11/24/2014',3 UNION ALL
select 2,'7/4/2015','8/6/2015',1 UNION ALL
select 2,'8/4/2015','9/6/2015',1 UNION ALL
select 3,'11/1/2013','12/1/2013',0 UNION ALL
select 3,'1/9/2018','2/9/2018',0 UNION ALL
select 4,'1/1/2018','1/2/2018',0 UNION ALL --many non overlapping dates
select 4,'1/4/2018','1/5/2018',0 UNION ALL
select 4,'1/7/2018','1/9/2018',0 UNION ALL
select 4,'1/11/2018','1/13/2018',0 UNION ALL
select 4,'2/7/2018','2/8/2018',0 UNION ALL --many overlapping dates
select 4,'2/8/2018','2/9/2018',0 UNION ALL
select 4,'2/9/2018','2/10/2018',0 UNION all
select 4,'2/10/2018','2/11/2018',0 UNION all
select 4,'2/11/2018','2/12/2018',0 UNION all
select 4,'2/12/2018','2/13/2018',0 UNION all
select 4,'3/7/2018','3/8/2018',0 UNION ALL --many overlapping dates, second instance of id 4, type 0
select 4,'3/8/2018','3/9/2018',0 UNION ALL
select 4,'3/9/2018','3/10/2018',0 UNION all
select 4,'3/10/2018','3/11/2018',0 UNION all
select 4,'3/11/2018','3/12/2018',0 UNION all
select 4,'3/12/2018','3/13/2018',0
;
-- Anchor: rows that no earlier-starting row of the same Id/Type overlaps.
-- Recursive step: extend the running range with any overlapping row that
-- starts after the row consumed last (CurrentStart).
WITH cdata
AS (SELECT Id,
d.Type,
d.StartDate,
d.EndDate,
CurrentStart = d.StartDate
FROM @Data d
WHERE
NOT EXISTS (
SELECT * FROM @Data x WHERE x.StartDate < d.StartDate AND d.StartDate <= x.EndDate AND d.EndDate >= x.StartDate AND d.Id = x.Id AND d.Type = x.Type --get first records for overlapping ranges
)
UNION ALL
SELECT d.Id,
d.Type,
StartDate = CASE WHEN d2.StartDate < d.StartDate THEN d2.StartDate ELSE d.StartDate END,
EndDate = CASE WHEN d2.EndDate > d.EndDate THEN d2.EndDate ELSE d.EndDate END,
CurrentStart = d2.StartDate
FROM cdata d
INNER JOIN @Data d2
ON (
d.StartDate <= d2.EndDate
AND d.EndDate >= d2.StartDate
)
AND d2.Id = d.Id
AND d2.Type = d.Type
AND d2.StartDate > d.CurrentStart)
-- Keep the furthest EndDate reached for each starting point.
SELECT cdata.Id, cdata.Type, cdata.StartDate, EndDate = MAX(cdata.EndDate)
FROM cdata
GROUP BY cdata.Id, cdata.Type, cdata.StartDate
This looks like a Packing Intervals problem. See the post by Itzik Ben-Gan for all the details and what indexes he recommends to make it work efficiently. He presents a solution without recursive CTE.
Two notes.
The query below assumes that intervals are [closed; open), i.e. StartDate is inclusive and EndDate is exclusive. This way to represent such data is often the most convenient. (in the same sense as having arrays as zero-based instead of 1-based is usually more convenient in programming languages).
I added a RowID column to have unambiguous sorting.
Sample data
-- Sample data in a local temp table. "DECLARE #T TABLE" does not parse
-- (DECLARE needs an @-prefixed name); creating #T keeps every later
-- statement that references #T valid.
CREATE TABLE #T
(
RowID int IDENTITY,
id int,
StartDate date,
EndDate date,
tp int
);
INSERT INTO #T(Id, StartDate, EndDate, tp) VALUES
(1, '2012-02-18', '2012-03-18', 1),
(1, '2012-03-17', '2012-06-29', 1),
(1, '2012-06-27', '2012-09-27', 1),
(1, '2014-08-23', '2014-09-24', 3),
(1, '2014-09-23', '2014-10-24', 3),
(1, '2014-10-23', '2014-11-24', 3),
(2, '2015-07-04', '2015-08-06', 1),
(2, '2015-08-04', '2015-09-06', 1),
(3, '2013-11-01', '2013-12-01', 0),
(3, '2018-01-09', '2018-02-09', 0);
-- Make EndDate an opened interval, make it exclusive
-- [Start; End)
-- (intentionally applied to every row, hence no WHERE clause)
UPDATE #T
SET EndDate = DATEADD(day, 1, EndDate)
;
Recommended indexes
-- indexes to support solutions
-- (created on #T, the table name the sample data and the query use)
CREATE UNIQUE INDEX idx_start_id ON #T(id, tp, StartDate, RowID);
CREATE UNIQUE INDEX idx_end_id ON #T(id, tp, EndDate, RowID);
Query
Read the Itzik's post to understand what is going on. He has nice illustrations there. In short, each timestamp (start or end) is treated as an event. Each event has a + or - type. Each time we encounter a + event (some interval starts) we increase the running counter. Each time we encounter a - event (some interval ends) we decrease the running counter. When the running counter is 0 it means that the streak of overlapping intervals is over.
I took Itzik's query as is and simply changed the column names to match your names.
-- Packs overlapping intervals per (id, tp). C1 turns every row of #T into a
-- start event (+1) and an end event (-1); C2 numbers all events in time
-- order; C3 keeps only the events that open or close a packed group and
-- pairs them up.
WITH C1 AS
-- let e = end ordinals, let s = start ordinals
(
SELECT
RowID, id, tp, StartDate AS ts, +1 AS EventType,
NULL AS e,
ROW_NUMBER() OVER(PARTITION BY id, tp ORDER BY StartDate, RowID) AS s
FROM #T
UNION ALL
SELECT
RowID, id, tp, EndDate AS ts, -1 AS EventType,
ROW_NUMBER() OVER(PARTITION BY id, tp ORDER BY EndDate, RowID) AS e,
NULL AS s
FROM #T
),
C2 AS
-- let se = start or end ordinal, namely, how many events (start or end) happened so far
-- (EventType DESC orders a start event before an end event with the same ts)
(
SELECT C1.*,
ROW_NUMBER() OVER(PARTITION BY id, tp ORDER BY ts, EventType DESC, RowID) AS se
FROM C1
),
C3 AS
-- For start events, the expression s - (se - s) - 1 represents how many sessions were active
-- just before the current (hence - 1)
--
-- For end events, the expression (se - e) - e represents how many sessions are active
-- right after this one
--
-- The above two expressions are 0 exactly when a group of packed intervals
-- either starts or ends, respectively
--
-- After filtering only events when a group of packed intervals either starts or ends,
-- group each pair of adjacent start/end events
(
SELECT id, tp, ts,
((ROW_NUMBER() OVER(PARTITION BY id, tp ORDER BY ts) - 1) / 2 + 1)
AS grpnum
FROM C2
WHERE COALESCE(s - (se - s) - 1, (se - e) - e) = 0
)
-- MIN(ts) is the group's start; one day is subtracted from MAX(ts) to report
-- the end date.
SELECT id, tp, MIN(ts) AS StartDate, DATEADD(day, -1, MAX(ts)) AS EndDate
FROM C3
GROUP BY id, tp, grpnum
ORDER BY id, tp, StartDate;
Result
+----+----+------------+------------+
| id | tp | StartDate | EndDate |
+----+----+------------+------------+
| 1 | 1 | 2012-02-18 | 2012-09-27 |
| 1 | 3 | 2014-08-23 | 2014-11-24 |
| 2 | 1 | 2015-07-04 | 2015-09-06 |
| 3 | 0 | 2013-11-01 | 2013-12-01 |
| 3 | 0 | 2018-01-09 | 2018-02-09 |
+----+----+------------+------------+
-- Temp table with sample intervals; the values are supplied as strings and
-- converted to the column types on insert.
create table #table (
    Id int,
    StartDate date,
    EndDate date,
    Type int
)
insert into #table
values
    ('1','2012-02-18','2012-03-18','1'),
    ('1','2012-03-19','2012-06-19','1'),
    ('1','2012-06-27','2012-09-27','1'),
    ('1','2014-08-23','2014-09-24','3'),
    ('1','2014-09-23','2014-10-24','3'),
    ('1','2014-10-23','2014-11-24','3'),
    ('2','2015-07-04','2015-08-06','1'),
    ('2','2015-08-04','2015-09-06','1'),
    ('3','2013-11-01','2013-12-01','0'),
    ('3','2018-01-09','2018-02-09','0')
-- Collapse rows that share Id, Type and the calendar years of both dates.
select
    ID,
    min(startdate) sd,
    max(EndDate) ed,
    type
from #table
group by ID, TYPE, year(startdate), year(EndDate)
this can be easily achieved by using some window-functions and CTE's. Here is the solution
-- Table variables take an @ prefix; "DECLARE #table TABLE" does not parse.
DECLARE @table TABLE
(id INT,
StartDate DATE,
EndDate DATE,
[Type] INT
);
INSERT INTO @table(Id, StartDate, EndDate, [Type]) VALUES
(1, '2012-02-18', '2012-03-18', 1),
(1, '2012-03-17', '2012-06-29', 1),
(1, '2012-06-27', '2012-09-27', 1),
(1, '2014-08-23', '2014-09-24', 3),
(1, '2014-09-23', '2014-10-24', 3),
(1, '2014-10-23', '2014-11-24', 3),
(2, '2015-07-04', '2015-08-06', 1),
(2, '2015-08-04', '2015-09-06', 1),
(3, '2013-11-01', '2013-12-01', 0),
(3, '2018-01-09', '2018-02-09', 0);
-- C1: for each row, the latest EndDate among all earlier rows of the same
-- Id/Type (NULL for the first row of a partition).
WITH C1 AS
(
SELECT *,
MAX(EndDate) OVER(PARTITION BY Id, [Type]
ORDER BY StartDate, EndDate
ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS PrevEnd
FROM @table
),
-- C2: StartFlag is 1 when the row does not start on or before PrevEnd; the
-- running sum of the flags is the group number.
C2 AS
(
SELECT *,
SUM(StartFlag) OVER(PARTITION BY Id, [Type]
ORDER BY StartDate, EndDate
ROWS UNBOUNDED PRECEDING) AS GroupID
FROM C1
CROSS APPLY ( VALUES(CASE WHEN StartDate <= PrevEnd THEN NULL ELSE 1 END) ) AS A(StartFlag)
)
SELECT Id, [Type], MIN(StartDate) AS StartDate, MAX(EndDate) AS EndDate
FROM C2
GROUP BY Id, [Type], GroupID;

Merge rows if date columns are overlapping in TSQL

I have a table in the following format
Id StartDate EndDate Type
1 2012-02-18 2012-03-18 1
1 2012-03-17 2012-06-29 1
1 2012-06-27 2012-09-27 1
1 2014-08-23 2014-09-24 3
1 2014-09-23 2014-10-24 3
1 2014-10-23 2014-11-24 3
2 2015-07-04 2015-08-06 1
2 2015-08-04 2015-09-06 1
3 2013-11-01 2013-12-01 0
3 2018-01-09 2018-02-09 0
I found similar questions here, but not something that could help me solve my problem. I want to merge rows that has the same Id, Type and overlapping date periods.
The result from the above table should be
Id StartDate EndDate Type
1 2012-02-18 2012-09-27 1
1 2014-08-23 2014-11-24 3
2 2015-07-04 2015-09-06 1
3 2013-11-01 2013-12-01 0
3 2018-01-09 2018-02-09 0
In another server, I was able to do it with the following restrictions and the query below:
Didn't care about the Type column, but just the Id
Had a newer version of SQL Server (2012), but now I have 2008 which the code is not compatible
-- Asker's earlier query: sm is a running sum over (Id, StartDate) that grows
-- whenever the CASE yields 1; rows are then aggregated per (Id, sm) to get
-- the min start and max end of each run.
-- NOTE(review): derived table "a" exposes two columns named EndDate (the
-- base column and the LAG result). Presumably the LAG alias was meant to be
-- different; confirm against the original query before reusing this.
SELECT Id
, MIN(StartDate) AS StartDate
, MAX(EndDate) AS EndDate
FROM (
SELECT *
, SUM(CASE WHEN a.EndDate = a.StartDate THEN 0
ELSE 1
END
) OVER (ORDER BY Id, StartDate) sm
FROM (
SELECT Id
, StartDate
, EndDate
, LAG(EndDate, 1, NULL) OVER (PARTITION BY Id ORDER BY Id, EndDate) EndDate
FROM #temptable
) a
) b
GROUP BY Id, sm
Any advice how I can
Include Type on the process
Make it work on SQL Server 2008
This approach uses an additional temp table to identify the groups of overlapping dates, and then performs a quick aggregate based on the groupings.
-- Seed #G: every row gets a UID and an initial GroupId from ROW_NUMBER over
-- the same (Id, Type) ordering.
SELECT *, ROW_NUMBER() OVER (ORDER BY Id, Type) AS UID,
ROW_NUMBER() OVER (ORDER BY Id, Type) AS GroupId INTO #G FROM #TempTable
-- Repeat until an UPDATE changes no rows. @@ROWCOUNT is the system function
-- that reports the row count of the previous statement; "##ROWCOUNT" does
-- not parse.
WHILE @@ROWCOUNT <> 0 BEGIN
-- Pull each row's GroupId down to the smaller GroupId of any overlapping row
-- with the same Id and Type.
UPDATE T1 SET
GroupId = T2.GroupId
FROM #G T1
INNER JOIN (
SELECT T1.UID, CASE WHEN T1.GroupId < T2.GroupId THEN T1.GroupId ELSE T2.GroupId END
FROM #G T1
LEFT OUTER JOIN #G T2
ON T1.Id = T2.Id AND T1.Type = T2.Type AND T1.GroupId <> T2.GroupId
AND T1.StartDate <= T2.EndDate AND T2.StartDate <= T1.EndDate
) T2 (UID, GroupId)
ON T1.UID = T2.UID
WHERE T1.GroupId <> T2.GroupId
END
-- Collapse each group to its earliest start and latest end.
SELECT Id, MIN(StartDate) AS StartDate, MAX(EndDate) AS EndDate, Type
FROM #G G GROUP BY GroupId, Id, Type
This returns the expected values
Id StartDate EndDate Type
----------- ---------- ---------- -----------
1 2012-02-18 2012-09-27 1
1 2014-08-23 2014-11-24 3
2 2015-07-04 2015-09-06 1
3 2013-11-01 2013-12-01 0
3 2018-01-09 2018-02-09 0
This is 2008 compatible. A CTE really is the best way to link up all overlapping records in my opinion. The date overlap logic came from this thread: SO Date Overlap
I added extra data that's more complex to make sure that it's working as expected.
-- Table variables take an @ prefix; "DECLARE #Data table" does not parse.
DECLARE @Data table (Id INT, StartDate DATE, EndDate DATE, Type INT)
INSERT INTO @Data
SELECT 1,'2/18/2012' ,'3/18/2012', 1 UNION ALL
select 1,'3/17/2012','6/29/2012',1 UNION ALL
select 1,'6/27/2012','9/27/2012',1 UNION ALL
select 1,'8/23/2014','9/24/2014',3 UNION ALL
select 1,'9/23/2014','10/24/2014',3 UNION ALL
select 1,'10/23/2014','11/24/2014',3 UNION ALL
select 2,'7/4/2015','8/6/2015',1 UNION ALL
select 2,'8/4/2015','9/6/2015',1 UNION ALL
select 3,'11/1/2013','12/1/2013',0 UNION ALL
select 3,'1/9/2018','2/9/2018',0 UNION ALL
select 4,'1/1/2018','1/2/2018',0 UNION ALL --many non overlapping dates
select 4,'1/4/2018','1/5/2018',0 UNION ALL
select 4,'1/7/2018','1/9/2018',0 UNION ALL
select 4,'1/11/2018','1/13/2018',0 UNION ALL
select 4,'2/7/2018','2/8/2018',0 UNION ALL --many overlapping dates
select 4,'2/8/2018','2/9/2018',0 UNION ALL
select 4,'2/9/2018','2/10/2018',0 UNION all
select 4,'2/10/2018','2/11/2018',0 UNION all
select 4,'2/11/2018','2/12/2018',0 UNION all
select 4,'2/12/2018','2/13/2018',0 UNION all
select 4,'3/7/2018','3/8/2018',0 UNION ALL --many overlapping dates, second instance of id 4, type 0
select 4,'3/8/2018','3/9/2018',0 UNION ALL
select 4,'3/9/2018','3/10/2018',0 UNION all
select 4,'3/10/2018','3/11/2018',0 UNION all
select 4,'3/11/2018','3/12/2018',0 UNION all
select 4,'3/12/2018','3/13/2018',0
;
-- Anchor: rows that no earlier-starting row of the same Id/Type overlaps.
-- Recursive step: extend the running range with any overlapping row that
-- starts after the row consumed last (CurrentStart).
WITH cdata
AS (SELECT Id,
d.Type,
d.StartDate,
d.EndDate,
CurrentStart = d.StartDate
FROM @Data d
WHERE
NOT EXISTS (
SELECT * FROM @Data x WHERE x.StartDate < d.StartDate AND d.StartDate <= x.EndDate AND d.EndDate >= x.StartDate AND d.Id = x.Id AND d.Type = x.Type --get first records for overlapping ranges
)
UNION ALL
SELECT d.Id,
d.Type,
StartDate = CASE WHEN d2.StartDate < d.StartDate THEN d2.StartDate ELSE d.StartDate END,
EndDate = CASE WHEN d2.EndDate > d.EndDate THEN d2.EndDate ELSE d.EndDate END,
CurrentStart = d2.StartDate
FROM cdata d
INNER JOIN @Data d2
ON (
d.StartDate <= d2.EndDate
AND d.EndDate >= d2.StartDate
)
AND d2.Id = d.Id
AND d2.Type = d.Type
AND d2.StartDate > d.CurrentStart)
-- Keep the furthest EndDate reached for each starting point.
SELECT cdata.Id, cdata.Type, cdata.StartDate, EndDate = MAX(cdata.EndDate)
FROM cdata
GROUP BY cdata.Id, cdata.Type, cdata.StartDate
This looks like a Packing Intervals problem. See the post by Itzik Ben-Gan for all the details and what indexes he recommends to make it work efficiently. He presents a solution without recursive CTE.
Two notes.
The query below assumes that intervals are [closed; open), i.e. StartDate is inclusive and EndDate is exclusive. This way to represent such data is often the most convenient. (in the same sense as having arrays as zero-based instead of 1-based is usually more convenient in programming languages).
I added a RowID column to have unambiguous sorting.
Sample data
-- Sample data in a local temp table. "DECLARE #T TABLE" does not parse
-- (DECLARE needs an @-prefixed name); creating #T keeps every later
-- statement that references #T valid.
CREATE TABLE #T
(
RowID int IDENTITY,
id int,
StartDate date,
EndDate date,
tp int
);
INSERT INTO #T(Id, StartDate, EndDate, tp) VALUES
(1, '2012-02-18', '2012-03-18', 1),
(1, '2012-03-17', '2012-06-29', 1),
(1, '2012-06-27', '2012-09-27', 1),
(1, '2014-08-23', '2014-09-24', 3),
(1, '2014-09-23', '2014-10-24', 3),
(1, '2014-10-23', '2014-11-24', 3),
(2, '2015-07-04', '2015-08-06', 1),
(2, '2015-08-04', '2015-09-06', 1),
(3, '2013-11-01', '2013-12-01', 0),
(3, '2018-01-09', '2018-02-09', 0);
-- Make EndDate an opened interval, make it exclusive
-- [Start; End)
-- (intentionally applied to every row, hence no WHERE clause)
UPDATE #T
SET EndDate = DATEADD(day, 1, EndDate)
;
Recommended indexes
-- indexes to support solutions
-- (created on #T, the table name the sample data and the query use)
CREATE UNIQUE INDEX idx_start_id ON #T(id, tp, StartDate, RowID);
CREATE UNIQUE INDEX idx_end_id ON #T(id, tp, EndDate, RowID);
Query
Read the Itzik's post to understand what is going on. He has nice illustrations there. In short, each timestamp (start or end) is treated as an event. Each event has a + or - type. Each time we encounter a + event (some interval starts) we increase the running counter. Each time we encounter a - event (some interval ends) we decrease the running counter. When the running counter is 0 it means that the streak of overlapping intervals is over.
I took Itzik's query as is and simply changed the column names to match your names.
-- Packs overlapping intervals per (id, tp). C1 turns every row of #T into a
-- start event (+1) and an end event (-1); C2 numbers all events in time
-- order; C3 keeps only the events that open or close a packed group and
-- pairs them up.
WITH C1 AS
-- let e = end ordinals, let s = start ordinals
(
SELECT
RowID, id, tp, StartDate AS ts, +1 AS EventType,
NULL AS e,
ROW_NUMBER() OVER(PARTITION BY id, tp ORDER BY StartDate, RowID) AS s
FROM #T
UNION ALL
SELECT
RowID, id, tp, EndDate AS ts, -1 AS EventType,
ROW_NUMBER() OVER(PARTITION BY id, tp ORDER BY EndDate, RowID) AS e,
NULL AS s
FROM #T
),
C2 AS
-- let se = start or end ordinal, namely, how many events (start or end) happened so far
-- (EventType DESC orders a start event before an end event with the same ts)
(
SELECT C1.*,
ROW_NUMBER() OVER(PARTITION BY id, tp ORDER BY ts, EventType DESC, RowID) AS se
FROM C1
),
C3 AS
-- For start events, the expression s - (se - s) - 1 represents how many sessions were active
-- just before the current (hence - 1)
--
-- For end events, the expression (se - e) - e represents how many sessions are active
-- right after this one
--
-- The above two expressions are 0 exactly when a group of packed intervals
-- either starts or ends, respectively
--
-- After filtering only events when a group of packed intervals either starts or ends,
-- group each pair of adjacent start/end events
(
SELECT id, tp, ts,
((ROW_NUMBER() OVER(PARTITION BY id, tp ORDER BY ts) - 1) / 2 + 1)
AS grpnum
FROM C2
WHERE COALESCE(s - (se - s) - 1, (se - e) - e) = 0
)
-- MIN(ts) is the group's start; one day is subtracted from MAX(ts) to report
-- the end date.
SELECT id, tp, MIN(ts) AS StartDate, DATEADD(day, -1, MAX(ts)) AS EndDate
FROM C3
GROUP BY id, tp, grpnum
ORDER BY id, tp, StartDate;
Result
+----+----+------------+------------+
| id | tp | StartDate | EndDate |
+----+----+------------+------------+
| 1 | 1 | 2012-02-18 | 2012-09-27 |
| 1 | 3 | 2014-08-23 | 2014-11-24 |
| 2 | 1 | 2015-07-04 | 2015-09-06 |
| 3 | 0 | 2013-11-01 | 2013-12-01 |
| 3 | 0 | 2018-01-09 | 2018-02-09 |
+----+----+------------+------------+
-- Temp table with sample intervals; the values are supplied as strings and
-- converted to the column types on insert.
create table #table (
    Id int,
    StartDate date,
    EndDate date,
    Type int
)
insert into #table
values
    ('1','2012-02-18','2012-03-18','1'),
    ('1','2012-03-19','2012-06-19','1'),
    ('1','2012-06-27','2012-09-27','1'),
    ('1','2014-08-23','2014-09-24','3'),
    ('1','2014-09-23','2014-10-24','3'),
    ('1','2014-10-23','2014-11-24','3'),
    ('2','2015-07-04','2015-08-06','1'),
    ('2','2015-08-04','2015-09-06','1'),
    ('3','2013-11-01','2013-12-01','0'),
    ('3','2018-01-09','2018-02-09','0')
-- Collapse rows that share Id, Type and the calendar years of both dates.
select
    ID,
    min(startdate) sd,
    max(EndDate) ed,
    type
from #table
group by ID, TYPE, year(startdate), year(EndDate)
this can be easily achieved by using some window-functions and CTE's. Here is the solution
-- Table variables take an @ prefix; "DECLARE #table TABLE" does not parse.
DECLARE @table TABLE
(id INT,
StartDate DATE,
EndDate DATE,
[Type] INT
);
INSERT INTO @table(Id, StartDate, EndDate, [Type]) VALUES
(1, '2012-02-18', '2012-03-18', 1),
(1, '2012-03-17', '2012-06-29', 1),
(1, '2012-06-27', '2012-09-27', 1),
(1, '2014-08-23', '2014-09-24', 3),
(1, '2014-09-23', '2014-10-24', 3),
(1, '2014-10-23', '2014-11-24', 3),
(2, '2015-07-04', '2015-08-06', 1),
(2, '2015-08-04', '2015-09-06', 1),
(3, '2013-11-01', '2013-12-01', 0),
(3, '2018-01-09', '2018-02-09', 0);
-- C1: for each row, the latest EndDate among all earlier rows of the same
-- Id/Type (NULL for the first row of a partition).
WITH C1 AS
(
SELECT *,
MAX(EndDate) OVER(PARTITION BY Id, [Type]
ORDER BY StartDate, EndDate
ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS PrevEnd
FROM @table
),
-- C2: StartFlag is 1 when the row does not start on or before PrevEnd; the
-- running sum of the flags is the group number.
C2 AS
(
SELECT *,
SUM(StartFlag) OVER(PARTITION BY Id, [Type]
ORDER BY StartDate, EndDate
ROWS UNBOUNDED PRECEDING) AS GroupID
FROM C1
CROSS APPLY ( VALUES(CASE WHEN StartDate <= PrevEnd THEN NULL ELSE 1 END) ) AS A(StartFlag)
)
SELECT Id, [Type], MIN(StartDate) AS StartDate, MAX(EndDate) AS EndDate
FROM C2
GROUP BY Id, [Type], GroupID;

Split date range to year-month rows on SQL Server 2005

I need to create an output where I got one row per year-month.
Assume the dataset is:
id | dateStart | dateEnd
1 | 2015-01-01 00:00:00.000 | 2015-03-31 00:00:00.000
2 | 2014-07-01 00:00:00.000 | 2014-08-31 00:00:00.000
...
I need the following output:
id | year-month
1 | 2015-01
1 | 2015-02
1 | 2015-03
2 | 2014-07
2 | 2014-08
The output can be any datatype since I can just change that later.
That is for 2015-01 the following is ok, "2015-01-01 00:00:00.000", "2015-01-01", "201501", "2015 | jan" ect.
Note I'm using SQL Server 2005.
Here is a method that uses recursive CTEs:
-- One row per (id, month) covered by [dateStart, dateEnd].
-- The anchor snaps dateStart to the first day of its month and the recursion
-- keeps stepping while the next month start is on or before dateEnd, so the
-- month containing dateEnd is emitted even when dateStart is mid-month or
-- dateEnd falls exactly on the first of a month.
with CTE as (
    select id,
           dateadd(month, datediff(month, 0, dateStart), 0) as dte,
           dateEnd
    from t
    union all
    select id, dateadd(month, 1, dte), dateEnd
    from CTE
    where dateadd(month, 1, dte) <= dateEnd
)
select id, dte
from CTE;
You can convert the final result into any format you like. For instance:
select id, year(dte) * 100 + month(dte) as yyyymm_int -- e.g. 2015 * 100 + 1 = 201501
or
select id, cast(year(dte) * 100 + month(dte) as varchar(255)) as yyyymm -- e.g. '201501'
Generate a tally table (just make sure you get enough rows there). The tally will contain the values 0, 1, 2, ..., n. Then you join on a condition that adds these values as months to startDate until the result is greater than endDate:
-- Table variables take an @ prefix; "DECLARE #t TABLE" does not parse.
DECLARE @t TABLE
(
id INT ,
dateStart DATETIME ,
dateEnd DATETIME
)
INSERT INTO @t (id, dateStart, dateEnd)
VALUES ( 1, '2015-01-01 00:00:00.000', '2015-03-31 00:00:00.000' ),
( 2, '2014-07-01 00:00:00.000', '2014-08-31 00:00:00.000' )
-- Tally of 100 rows numbered 0..99 (10 x 10 cross join).
;WITH cte AS(SELECT -1 + ROW_NUMBER() OVER(ORDER BY t1.m) m
FROM(VALUES(0),(0),(0),(0),(0),(0),(0),(0),(0),(0))t1(m) CROSS JOIN
(VALUES(0),(0),(0),(0),(0),(0),(0),(0),(0),(0))t2(m))
-- Keep every month offset whose shifted start date is still on or before
-- dateEnd.
SELECT t.id,
DATEADD(mm, c.m, t.dateStart) AS year_month
FROM cte c
JOIN @t t ON DATEADD(mm, c.m, t.dateStart) <= t.dateEnd
ORDER BY t.id, year_month
Output:
id year_month
1 2015-01-01 00:00:00.000
1 2015-02-01 00:00:00.000
1 2015-03-01 00:00:00.000
2 2014-07-01 00:00:00.000
2 2014-08-01 00:00:00.000
In an ideal world you would have a calendar table, then your query would simply be:
-- Calendar rows flagged as day 1 of a month whose FirstDayOfMonth falls
-- inside each source row's [DateStart, DateEnd] range.
SELECT
    src.id,
    cal.FirstDayOfMonth
FROM dbo.Calendar AS cal
INNER JOIN YourTable AS src
    ON cal.FirstDayOfMonth BETWEEN src.DateStart AND src.DateEnd
WHERE cal.DayOfMonth = 1;
Assuming that you don't have a calendar table then you can do it with a list of numbers generated on the fly (Read this article for more on this). The following will generate a list from 1-10,000:
-- Ten rows, cross-joined to 100, cross-joined again to 10,000, then numbered
-- from 1.
WITH Ten (N) AS (
    SELECT 1
    FROM (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) AS seed (N)
),
Hundred (N) AS (
    SELECT 1 FROM Ten AS a CROSS JOIN Ten AS b
),
TenThousand (N) AS (
    SELECT 1 FROM Hundred AS a CROSS JOIN Hundred AS b
),
Numbers (Number) AS (
    SELECT ROW_NUMBER() OVER (ORDER BY TenThousand.N) FROM TenThousand
)
SELECT * FROM Numbers;
Then you can join this to your original table:
-- Table variables take an @ prefix; "DECLARE #T TABLE" does not parse.
DECLARE @T TABLE (id INT, DateStart DATE, DateEnd DATE);
INSERT @T (ID, DateStart, DateEnd)
VALUES (1, '20150101', '20150331'), (2, '20140701', '20140831');
WITH N1 AS (SELECT N FROM (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) N (N)),
N2 (N) AS (SELECT 1 FROM N1 AS N1 CROSS JOIN N1 AS N2),
N3 (N) AS (SELECT 1 FROM N2 AS N1 CROSS JOIN N2 AS N2),
Numbers (Number) AS (SELECT ROW_NUMBER() OVER(ORDER BY N3.N) FROM N3)
-- Number starts at 1, so the month offset is Number - 1 in both the output
-- expression and the join predicate; without the "- 1" in the output, every
-- row would land one month after the intended month.
SELECT t.ID,
[year-month] = DATEADD(MONTH, n.Number - 1 + DATEDIFF(MONTH, 0, t.DateStart), 0)
FROM @T AS t
INNER JOIN Numbers AS N
ON N.Number - 1 <= DATEDIFF(MONTH, t.DateStart, t.DateEnd);

SQL breakout date range to rows

I am trying to take given date ranges found in a data set and divide them into unique rows for each day in the range (example below). Doing the opposite in SQL is pretty straight forward, but I am struggling to achieve the desired query output.
Beginning data:
ITEM START_DATE END_DATE
A 1/1/2015 1/5/2015
B 2/5/2015 2/7/2015
Desired query output:
ITEM DATE_COVERED
A 1/1/2015
A 1/2/2015
A 1/3/2015
A 1/4/2015
A 1/5/2015
B 2/5/2015
B 2/6/2015
B 2/7/2015
The fastest way will be some tally table:
-- Table variables take an @ prefix; "DECLARE #t TABLE" does not parse.
DECLARE @t TABLE
(
ITEM CHAR(1) ,
START_DATE DATE ,
END_DATE DATE
)
-- ISO literals so the dates do not depend on the session's date format.
INSERT INTO @t (ITEM, START_DATE, END_DATE)
VALUES ( 'A', '2015-01-01', '2015-01-05' ),
( 'B', '2015-02-05', '2015-02-07' )
-- Tally of 10,000 rows numbered 0..9999 (10 x 10 x 10 x 10 cross join).
;WITH cte AS(SELECT -1 + ROW_NUMBER() OVER(ORDER BY (SELECT NULL)) d FROM
(VALUES(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t1(n) CROSS JOIN
(VALUES(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t2(n) CROSS JOIN
(VALUES(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t3(n) CROSS JOIN
(VALUES(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t4(n))
-- For each item, keep the day offsets that land inside its date range.
SELECT t.ITEM, ca.DATE_COVERED FROM @t t
CROSS APPLY(SELECT DATEADD(dd, d, t.START_DATE) AS DATE_COVERED
FROM cte
WHERE DATEADD(dd, d, t.START_DATE) BETWEEN t.START_DATE AND t.END_DATE) ca
ORDER BY t.ITEM, ca.DATE_COVERED
Query:
SQLFiddleExample
-- Pair every item with the type 'P' numbers from master..spt_values and keep
-- the offsets whose shifted start date lies within [START_DATE, END_DATE].
SELECT
    t.ITEM,
    DATEADD(day, n.number, t.START_DATE) AS DATE_COVERED
FROM Table1 AS t
CROSS JOIN (
    SELECT number
    FROM master..spt_values
    WHERE [type] = 'P'
) AS n
WHERE DATEADD(day, n.number, t.START_DATE) >= t.START_DATE
  AND DATEADD(day, n.number, t.START_DATE) <= t.END_DATE
Result:
| ITEM | DATE_COVERED |
|------|--------------|
| A | 2015-01-01 |
| A | 2015-01-02 |
| A | 2015-01-03 |
| A | 2015-01-04 |
| A | 2015-01-05 |
| B | 2015-02-05 |
| B | 2015-02-06 |
| B | 2015-02-07 |
NOTE: this only works if the difference between your startdate and enddate is a maximum of 2047 days (master..spt_values only allows 0..2047 range of values)
-- Join each range to the type 'P' numbers from 0 up to the day difference
-- between its start and end, and add that many days to the start date.
select
    item,
    dateadd(day, v.number, src.start_date) as adate
from begindata as src
inner join master..spt_values as v
    on v.type = 'P'
   and v.number >= 0
   and v.number <= datediff(day, start_date, end_date)
order by adate;
I'd like to say I did this myself but I got the code from this
Here is a fiddle with your expected result
TRY THIS...
-- Sample schema and data: one row per item with an inclusive date range.
CREATE TABLE Table1
    ([ITEM] varchar(1), [START_DATE] date, [END_DATE] date);

INSERT INTO Table1 ([ITEM], [START_DATE], [END_DATE])
VALUES ('A', '2015-01-01', '2015-01-05'),
       ('B', '2015-02-05', '2015-02-07');

-- Recursive CTE: the anchor emits each item's START_DATE; every recursive step
-- adds one day for that item until END_DATE is reached.
WITH Days AS (
    SELECT ITEM, START_DATE AS [Date], 1 AS [level]
    FROM Table1
    UNION ALL
    SELECT Table1.ITEM, DATEADD(DAY, 1, Days.[Date]), Days.[level] + 1
    FROM Days
    INNER JOIN Table1
        ON Days.ITEM = Table1.ITEM
    WHERE Days.[Date] < Table1.END_DATE
)
-- Return the item alongside each covered date (the requested ITEM /
-- DATE_COVERED shape). DISTINCT collapses duplicates that would arise if the
-- same ITEM appeared in more than one Table1 row.
SELECT DISTINCT
    ITEM,
    [Date] AS DATE_COVERED
FROM Days
ORDER BY ITEM, DATE_COVERED
-- The default recursion cap is 100 levels, which a range longer than about
-- 100 days would exceed; 0 removes the cap.
OPTION (MAXRECURSION 0);
DEMO

Query to return all the days of a month

This problem is related to this, which has no solution in sight: here
I have a table that shows me all sessions of an area.
This session has a start date.
I need to get all the days of the month of each session's start date, for a specific area (area 1 in this case).
I have this query:
SELECT idArea, idSession, startDate FROM SessionsPerArea WHERE idArea = 1
idArea | idSession | startDate |
1 | 1 | 01-01-2013 |
1 | 2 | 04-01-2013 |
1 | 3 | 07-02-2013 |
And I want something like this:
date | Session |
01-01-2013 | 1 |
02-01-2013 | NULL |
03-01-2013 | NULL |
04-01-2013 | 1 |
........ | |
29-01-2013 | NULL |
30-01-2013 | NULL |
In this case, the table returns me all the days of January.
The second column is the number of sessions that occur on that day, because there may be several sessions on the same day.
Can anyone help me?
Please try:
-- Lists every day of the month that starts at @RepMonth and left-joins the
-- area-1 sessions that start on each day (session columns are NULL on days
-- without a session).
--
-- Table variables and scalar variables must be named with a leading @;
-- a leading # is not valid in DECLARE / SET.
DECLARE @SessionsPerArea TABLE (idArea INT, idSession INT, startDate DATETIME);

INSERT @SessionsPerArea VALUES (1, 1, '2013-01-01');
INSERT @SessionsPerArea VALUES (1, 2, '2013-01-04');
INSERT @SessionsPerArea VALUES (1, 3, '2013-07-02');

-- First day of the month to report on.
DECLARE @RepMonth AS DATETIME;
SET @RepMonth = '01/01/2013';

-- Recursive day generator: starts at @RepMonth and adds one day per step
-- until the last day of that month (first day of next month minus one day).
WITH DayList (DayDate) AS
(
    SELECT @RepMonth
    UNION ALL
    SELECT DATEADD(d, 1, DayDate)
    FROM DayList
    WHERE (DayDate < DATEADD(d, -1, DATEADD(m, 1, @RepMonth)))
)
SELECT
    t1.DayDate,
    t2.idArea,
    t2.idSession,
    t2.startDate
FROM DayList AS t1
-- The area filter stays in ON so that days without a session are preserved.
LEFT JOIN @SessionsPerArea AS t2
    ON t1.DayDate = t2.startDate
   AND t2.idArea = 1;
This will work:
-- For every month that contains an area-1 session, lists each day of that
-- month together with the number of sessions starting on it (NULL when none).
--
-- A table variable must be declared with a leading @, not #.
DECLARE @SessionsPerArea TABLE (idArea INT, idSession INT, startDate DATE);

INSERT @SessionsPerArea VALUES
    (1, 1, '2013-01-01'),
    (1, 2, '2013-01-04'),
    (1, 3, '2013-07-02');

-- t1: one row per distinct start date in area 1, with the first and last day
-- of that date's month and the number of sessions starting on that date.
WITH t1 AS
(
    SELECT startDate
        , DATEADD(MONTH, DATEDIFF(MONTH, '1900-01-01', startDate), '1900-01-01') firstInMonth
        , DATEADD(DAY, -1, DATEADD(MONTH, DATEDIFF(MONTH, '1900-01-01', startDate) + 1, '1900-01-01')) lastInMonth
        , COUNT(*) cnt
    FROM @SessionsPerArea
    WHERE idArea = 1
    GROUP BY
        startDate
)
-- calendar: every day of each month found in t1. Day offsets come from the
-- type = 'P' rows of master..spt_values; DISTINCT removes the duplicates
-- produced when several start dates fall in the same month.
, calendar AS
(
    SELECT DISTINCT DATEADD(DAY, c.number, t1.firstInMonth) d
    FROM t1
    INNER JOIN master..spt_values c ON
        c.type = 'P'
        AND DATEADD(DAY, c.number, t1.firstInMonth) BETWEEN t1.firstInMonth AND t1.lastInMonth
)
SELECT c.d AS [date]
    , t1.cnt AS [Session]
FROM calendar c
LEFT JOIN t1 ON t1.startDate = c.d
ORDER BY c.d;
It uses simple join on master..spt_values table to generate rows.
This is just an example of a calendar table. To return data for a month, adjust the number of days (the `< 32` bound in the recursive step); for a year, use 365+1. You can calculate the number of days in a month, or between start and end dates, with a query; I'm not sure how to do this in SQL Server. I'm using hardcoded values to display all dates in Jan-2013. You can adjust the start and end dates for a different month, or obtain the start/end dates with queries:
-- Recursive calendar generator (Oracle-style DATE literals).
-- The anchor row (seq = 1) is a placeholder that the final filter discards;
-- each recursive step yields DATE '2013-01-01' plus (seq - 1) days.
WITH day_seq (seq, start_date) AS
(
    SELECT
        1 seq,
        DATE '2012-12-31' start_date
    FROM any_table -- use dual in Oracle
    UNION ALL
    SELECT
        seq + 1,
        DATE '2013-01-01' + seq - 1
    FROM day_seq
    WHERE seq < 32 -- days between start and end date, plus one
)
SELECT start_date
FROM day_seq
WHERE seq > 1
/
START_DATE
----------
1/1/2013
1/2/2013
1/3/2013
...
...
1/31/2013