SELECT DateTime not in SQL - sql

I have the following table:
oDateTime pvalue
2017-06-01 00:00:00 70
2017-06-01 01:00:00 65
2017-06-01 02:00:00 90
ff.
2017-08-01 08:00:00 98
The oDateTime field is an hourly data which is impossible to have a duplicate value.
My question is, how can I know if the oDateTime data is correct? I meant, I need to make sure the data is not jump? It should be always 'hourly' base.
Am I missing the date? Am I missing the time?
Please advice. Thank you.

Based on this answer, you can get the missing times form your table MyLogTable it like this:
DECLARE #StartDate DATETIME = '20170601', #EndDate DATETIME = '20170801'
SELECT DATEADD(hour, nbr - 1, #StartDate)
FROM ( SELECT ROW_NUMBER() OVER ( ORDER BY c.object_id ) AS Nbr
FROM sys.columns c
) nbrs
WHERE nbr - 1 <= DATEDIFF(hour, #StartDate, #EndDate) AND
NOT EXISTS (SELECT 1 FROM MyLogTable WHERE DATEADD(hour, nbr - 1, #StartDate)= oDateTime )
If you need to check longer period, you can just add CROSS JOIN like this
FROM sys.columns c
CROSS JOIN sys.columns c1
It enables you to check much more than cca thousand records (rowcount of sys.columns table) in one query.

Since your table is not having any unique id number, use a row_number() to get the row number in the cte , then perform an self inner join with the row id and next id ,take the difference of oDateTime accordingly, this will show exactly which row do not have time difference of one hour
;with cte(oDateTime,pValue,Rid)
As
(
select *,row_number() over(order by oDateTime) from [YourTableName] t1
)
select *,datediff(HH,c1.oDateTime,c2.oDateTime) as HourDiff from cte c1
inner join cte c2
on c1.Rid=c2.Rid-1 where datediff(HH,c1.oDateTime,c2.oDateTime) >1

You could use DENSE_RANK() for numbering the hours in a day from 1 to 24. Then all you have to do is to check whether the max rank is 24 or not for a day. if there is at least one entry for each hour, then dense ranking will have max value of 24.
Use the following query to find the date when you have a oDateTime missing.
SELECT [date]
FROM
(
SELECT *
, CAST(oDateTime AS DATE) AS [date]
, DENSE_RANK() OVER(PARTITION BY CAST(oDateTime AS DATE) ORDER BY DATEPART(HOUR, oDateTime)) AS rank_num
FROM Test
) AS t
GROUP BY [date]
HAVING(MAX(rank_num) != 24);
If you need validation for each row of oDateTime, you could do self join based on rank and get the missing hour for each oDateTime.

Perhaps you are looking for this? This will return dates having count < 24 - which indicates a "jump"
;WITH datecount
AS ( SELECT CAST(oDateTime AS DATE) AS [date] ,
COUNT(CAST(oDateTime AS DATE)) AS [count]
FROM #temp
GROUP BY ( CAST(oDateTime AS DATE) )
)
SELECT *
FROM datecount
WHERE [count] < 24;
EDIT: Since you changed the requirement from "How to know if there is missing" to "What is the missing", here's an updated query.
DECLARE #calendar AS TABLE ( oDateTime DATETIME )
DECLARE #min DATETIME = (SELECT MIN([oDateTime]) FROM #yourTable)
DECLARE #max DATETIME = (SELECT MAX([oDateTime]) FROM #yourTable)
WHILE ( #min <= #max )
BEGIN
INSERT INTO #calendar
VALUES ( #min );
SET #min = DATEADD(hh, 1, #min);
END;
SELECT t1.[oDateTime]
FROM #calendar t1
LEFT JOIN #yourTable t2 ON t1.[oDateTime] = t2.[oDateTime]
GROUP BY t1.[oDateTime]
HAVING COUNT(t2.[oDateTime]) = 0;
I first created a hourly calendar based on your MAX and MIN Datetime, then compared your actual table to the calendar to find out if there is a "jump".

Related

How to insert values based on another column value

Below is a subset of my table (for the first id)
date
id
value
01/01/2022
1
5
08/01/2022
1
2
For each id, the dates are not consecutive (e.g., for id 1, the min date is 01/01/2022 and the max date is 08/01/2022)--there are 7 days in between both dates. I want to insert rows to make the dates for each id consecutive and contiguous - the value for the value field/column to be filled with 0s so that the updated table looks like:
date
id
value
01/01/2022
1
5
02/01/2022
1
0
03/01/2022
1
0
04/01/2022
1
0
05/01/2022
1
0
06/01/2022
1
0
07/01/2022
1
0
08/01/2022
1
2
Any SQL code on how to implement this would be highly appreciated. I have a calendar table but am unsure how to join it with the above table so that I fill in missing dates dynamically for each id with 0s.
My calendar table looks like:
date
01/01/2022
02/01/2022
03/01/2022
04/01/2022
Considering you state you have a calendar table, it seems what you need to do with JOIN to it with the MIN and MAX dates from your other table, and the LEFT JOIN back to your table:
WITH MinMax AS(
SELECT ID,
MIN(date) AS MinDate,
MAX(date) AS MaxDate
FROM dbo.YourTable
GROUP BY ID),
Dates AS(
SELECT MM.ID,
C.CalendarDate AS [Date]
FROM MinMax MM
JOIN dbo.CalendarTable C ON MM.MinDate <= C.CalendarDate
AND MM.MaxDate >= C.CalendarDate)
SELECT D.ID,
D.[Date],
ISNULL(YT.[Value],0) AS [Value]
FROM Dates D
LEFT JOIN dbo.YourTable YT ON D.ID = YT.ID
AND D.[Date] = YT.[Date];
SET DATEFORMAT DMY
-- CREATE A TABLE WITH OUR INPUT DATA
DROP TABLE IF EXISTS #TheData
GO
CREATE TABLE #TheData
(TheDate DATE, id INT, TheValue INT)
INSERT INTO #TheData
(TheDate,id,Thevalue)
VALUES
('01/01/2022',1,5),
('08/01/2022',1,2),
('17/01/2022',2,7),
('25/01/2022',2,7),
('15/02/2022',2,7)
-- CREATE A CALENDAR CTE
DECLARE #StartDate date = '20210101';
DECLARE #CutoffDate date = DATEADD(DAY, -1, DATEADD(YEAR, 2, #StartDate));
;WITH DateSeq(TheDate) AS
(
SELECT #StartDate
UNION ALL
SELECT DATEADD(dd,1,TheDate) FROM DateSeq
WHERE TheDate < #CutoffDate
)
-- CROSS JOIN OUR CALENDAR CTE TO OUR SOURCE DATA. DERIVED TABLE TO GET FIRST AND LAST OF EACH RANGE TO USE FOR JOIN
SELECT
ds.*
,SourceDataRangesByID.ID
,ISNULL(td.TheValue,0) AS TheValue
FROM
DateSeq ds
CROSS JOIN
(
SELECT
d.ID
,MIN(d.TheDate) AS MinDatePerID
,MAX(d.TheDate) AS MaxDatePerID
FROM #TheData d
GROUP BY d.ID
) SourceDataRangesByID
LEFT JOIN #TheData td ON td.id = SourceDataRangesByID.ID AND td.TheDate = ds.TheDate
WHERE ds.TheDate >= SourceDataRangesByID.MinDatePerID
AND ds.TheDate <= SourceDataRangesByID.MaxDatePerID
OPTION (MAXRECURSION 0);
try the generate_series to create a date table then right join with it and coalesce for the non null value
SELECT generate_series('2016-01-01', -- series start date
'2018-06-30', -- series end date
'1 day'::interval)::date AS day) AS daily_series
from mytable
See Generate_Series for TSQL
https://dba.stackexchange.com/questions/255165/does-ms-sql-server-have-generate-series-function
(Sql server 2022)
https://learn.microsoft.com/en-us/sql/t-sql/functions/generate-series-transact-sql?view=sql-server-ver16

Join Generated Date Sequence

Currently I'm trying to join a date table to a ledger table so I can fill the gaps of the ledger table whenever there are no transactions in certain instances (e.g. there are transactions on March 1st and in March 3rd, but no transaction in March 2nd. And by joining both tables March 2nd would appear in the ledger table but with 0 for the variable we're analyzing.)
The challenge is that I can't create a Date object/table/dimension because I don't have permissions to create tables in the database. Therefore I've been generating a date sequence with this code:
DECLARE #startDate date = CAST('2016-01-01' AS date),
#endDate date = CAST(GETDATE() AS date);
SELECT DATEADD(day, number - 1, #startDate) AS [Date]
FROM (
SELECT ROW_NUMBER() OVER (
ORDER BY n.object_id
)
FROM sys.all_objects n
) S(number)
WHERE number <= DATEDIFF(day, #startDate, #endDate) + 1;
So, is there the possibility to join both tables into the same statement? Let's say the ledger table looks like this:
SELECT
date,cost
FROM ledger
I'd assume it can be done by using a subquery but I don't know how.
Thank you.
There is a very good article by Aaron Bertrand showing several methods for generating a sequence of numbers (or dates) in SQL Server: Generate a set or sequence without loops – part 1.
Try them out and see for yourself which is faster or more convenient to you. (spoiler - Recursive CTE is rather slow)
Once you've picked your preferred method you can wrap it in a CTE (common-table expression).
Here I'll use your method from the question
WITH
CTE_Dates
AS
(
SELECT
DATEADD(day, number - 1, #startDate) AS dt
FROM (
SELECT ROW_NUMBER() OVER (
ORDER BY n.object_id
)
FROM sys.all_objects n
) S(number)
WHERE number <= DATEDIFF(day, #startDate, #endDate) + 1
)
SELECT
...
FROM
CTE_Dates
LEFT JOIN Ledger ON Ledger.dt = CTE_Dates.dt
;
You can use your generated date sequence as a CTE and LEFT JOIN that to your ledger table. For example:
DECLARE #startDate date = CAST('2020-02-01' AS date);
DECLARE #endDate date = CAST(GETDATE() AS date);
WITH dates AS (
SELECT DATEADD(day, number - 1, #startDate) AS [Date]
FROM (
SELECT ROW_NUMBER() OVER (
ORDER BY n.object_id
)
FROM sys.all_objects n
) S(number)
WHERE number <= DATEDIFF(day, #startDate, #endDate) + 1
)
SELECT dates.Date, COALESCE(ledger.cost, 0)
FROM dates
LEFT JOIN (VALUES ('2020-02-02', 14), ('2020-02-05', 10)) AS ledger([Date], [cost]) ON dates.Date = ledger.Date
Output:
Date cost
2020-02-01 0
2020-02-02 14
2020-02-03 0
2020-02-04 0
2020-02-05 10
2020-02-06 0
Demo on dbfiddle

Find missing date as compare to calendar

I am explain problem in short.
select distinct DATE from #Table where DATE >='2016-01-01'
Output :
Date
2016-11-23
2016-11-22
2016-11-21
2016-11-19
2016-11-18
Now i need to find out missing date a compare to our calender dates from year '2016'
i.e. Here date '2016-11-20' is missing.
I want list of missing dates.
Thanks for reading this. Have nice day.
You need to generate dates and you have to find missing ones. Below with recursive cte i have done it
;WITH CTE AS
(
SELECT CONVERT(DATE,'2016-01-01') AS DATE1
UNION ALL
SELECT DATEADD(DD,1,DATE1) FROM CTE WHERE DATE1<'2016-12-31'
)
SELECT DATE1 MISSING_ONE FROM CTE
EXCEPT
SELECT * FROM #TABLE1
option(maxrecursion 0)
Using CTE and get all dates in CTE table then compare with your table.
CREATE TABLE #yourTable(_Values DATE)
INSERT INTO #yourTable(_Values)
SELECT '2016-11-23' UNION ALL
SELECT '2016-11-22' UNION ALL
SELECT '2016-11-21' UNION ALL
SELECT '2016-11-19' UNION ALL
SELECT '2016-11-18'
DECLARE #DATE DATE = '2016-11-01'
;WITH CTEYear (_Date) AS
(
SELECT #DATE
UNION ALL
SELECT DATEADD(DAY,1,_Date)
FROM CTEYear
WHERE _Date < EOMONTH(#DATE,0)
)
SELECT * FROM CTEYear
WHERE NOT EXISTS(SELECT 1 FROM #yourTable WHERE _Date = _Values)
OPTION(maxrecursion 0)
You need to generate the dates and then find the missing ones. A recursive CTE is one way to generate a handful of dates. Another way is to use master..spt_values as a list of numbers:
with n as (
select row_number() over (order by (select null)) - 1 as n
from master..spt_values
),
d as (
select dateadd(day, n.n, cast('2016-01-01' as date)) as dte
from n
where n <= 365
)
select d.date
from d left join
#table t
on d.dte = t.date
where t.date is null;
If you are happy enough with ranges of missing dates, you don't need a list of dates at all:
select date, (datediff(day, date, next_date) - 1) as num_missing
from (select t.*, lead(t.date) over (order by t.date) as next_date
from #table t
where t.date >= '2016-01-01'
) t
where next_date <> dateadd(day, 1, date);

How can I sum values per day and then plot them on calendar from start date to last date

I have a table, part of which is given below. It contain multiple values (durations) per day. I need two things 1) addition of durations per day. 2) plotting them on calendar in such a way that startdate is first_date from the table and last_date is Last_update from the table. I want to mention 0 for which date there is no duration. I think it will something like below but need help.
;WITH AllDates AS(
SELECT #Fromdate As TheDate
UNION ALL
SELECT TheDate + 1
FROM AllDates
WHERE TheDate + 1 <= #ToDate
)SELECT UserId,
TheDate,
COALESCE(
SUM(
-- When the game starts and ends in the same date
CASE WHEN DATEDIFF(DAY, GameStartTime, GameEndTime) = 0
Here is what I am looking for
Another way to generate the date range you are after would be something like .....
;WITH DateLimits AS
(
SELECT MIN(First_Date) FirstDate
,MAX(Last_Update) LastDate
FROM TableName
),
DateRange AS
(
SELECT TOP (SELECT DATEDIFF(DAY,FirstDate,LastDate ) FROM DateLimits)
DATEADD(DAY
,ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
, (SELECT FirstDate FROM DateLimits)
) AS Dates
FROM master..spt_values a cross join master..spt_values b
)
SELECT * FROM DateRange --<-- you have the desired date range here
-- other query whatever you need.

Merge overlapping date intervals

Is there a better way of merging overlapping date intervals?
The solution I came up with is so simple that now I wonder if someone else has a better idea of how this could be done.
/***** DATA EXAMPLE *****/
DECLARE #T TABLE (d1 DATETIME, d2 DATETIME)
INSERT INTO #T (d1, d2)
SELECT '2010-01-01','2010-03-31' UNION SELECT '2010-04-01','2010-05-31'
UNION SELECT '2010-06-15','2010-06-25' UNION SELECT '2010-06-26','2010-07-10'
UNION SELECT '2010-08-01','2010-08-05' UNION SELECT '2010-08-01','2010-08-09'
UNION SELECT '2010-08-02','2010-08-07' UNION SELECT '2010-08-08','2010-08-08'
UNION SELECT '2010-08-09','2010-08-12' UNION SELECT '2010-07-04','2010-08-16'
UNION SELECT '2010-11-01','2010-12-31' UNION SELECT '2010-03-01','2010-06-13'
/***** INTERVAL ANALYSIS *****/
WHILE (1=1) BEGIN
UPDATE t1 SET t1.d2 = t2.d2
FROM #T AS t1 INNER JOIN #T AS t2 ON
DATEADD(day, 1, t1.d2) BETWEEN t2.d1 AND t2.d2
IF ##ROWCOUNT = 0 BREAK
END
/***** RESULT *****/
SELECT StartDate = MIN(d1) , EndDate = d2
FROM #T
GROUP BY d2
ORDER BY StartDate, EndDate
/***** OUTPUT *****/
/*****
StartDate EndDate
2010-01-01 2010-06-13
2010-06-15 2010-08-16
2010-11-01 2010-12-31
*****/
I was looking for the same solution and came across this post on Combine overlapping datetime to return single overlapping range record.
There is another thread on Packing Date Intervals.
I tested this with various date ranges, including the ones listed here, and it works correctly every time.
SELECT
s1.StartDate,
--t1.EndDate
MIN(t1.EndDate) AS EndDate
FROM #T s1
INNER JOIN #T t1 ON s1.StartDate <= t1.EndDate
AND NOT EXISTS(SELECT * FROM #T t2
WHERE t1.EndDate >= t2.StartDate AND t1.EndDate < t2.EndDate)
WHERE NOT EXISTS(SELECT * FROM #T s2
WHERE s1.StartDate > s2.StartDate AND s1.StartDate <= s2.EndDate)
GROUP BY s1.StartDate
ORDER BY s1.StartDate
The result is:
StartDate | EndDate
2010-01-01 | 2010-06-13
2010-06-15 | 2010-06-25
2010-06-26 | 2010-08-16
2010-11-01 | 2010-12-31
You asked this back in 2010 but don't specify any particular version.
An answer for people on SQL Server 2012+
WITH T1
AS (SELECT *,
MAX(d2) OVER (ORDER BY d1) AS max_d2_so_far
FROM #T),
T2
AS (SELECT *,
CASE
WHEN d1 <= DATEADD(DAY, 1, LAG(max_d2_so_far) OVER (ORDER BY d1))
THEN 0
ELSE 1
END AS range_start
FROM T1),
T3
AS (SELECT *,
SUM(range_start) OVER (ORDER BY d1) AS range_group
FROM T2)
SELECT range_group,
MIN(d1) AS d1,
MAX(d2) AS d2
FROM T3
GROUP BY range_group
Which returns
+-------------+------------+------------+
| range_group | d1 | d2 |
+-------------+------------+------------+
| 1 | 2010-01-01 | 2010-06-13 |
| 2 | 2010-06-15 | 2010-08-16 |
| 3 | 2010-11-01 | 2010-12-31 |
+-------------+------------+------------+
DATEADD(DAY, 1 is used because your desired results show you want a period ending on 2010-06-25 to be collapsed into one starting 2010-06-26. For other use cases this may need adjusting.
Here is a solution with just three simple scans. No CTEs, no recursion, no joins, no table updates in a loop, no "group by" — as a result, this solution should scale the best (I think).
I think number of scans can be reduced to two, if min and max dates are known in advance;
the logic itself just needs two scans — find gaps, applied twice.
declare #datefrom datetime, #datethru datetime
DECLARE #T TABLE (d1 DATETIME, d2 DATETIME)
INSERT INTO #T (d1, d2)
SELECT '2010-01-01','2010-03-31'
UNION SELECT '2010-03-01','2010-06-13'
UNION SELECT '2010-04-01','2010-05-31'
UNION SELECT '2010-06-15','2010-06-25'
UNION SELECT '2010-06-26','2010-07-10'
UNION SELECT '2010-08-01','2010-08-05'
UNION SELECT '2010-08-01','2010-08-09'
UNION SELECT '2010-08-02','2010-08-07'
UNION SELECT '2010-08-08','2010-08-08'
UNION SELECT '2010-08-09','2010-08-12'
UNION SELECT '2010-07-04','2010-08-16'
UNION SELECT '2010-11-01','2010-12-31'
select #datefrom = min(d1) - 1, #datethru = max(d2) + 1 from #t
SELECT
StartDate, EndDate
FROM
(
SELECT
MAX(EndDate) OVER (ORDER BY StartDate) + 1 StartDate,
LEAD(StartDate ) OVER (ORDER BY StartDate) - 1 EndDate
FROM
(
SELECT
StartDate, EndDate
FROM
(
SELECT
MAX(EndDate) OVER (ORDER BY StartDate) + 1 StartDate,
LEAD(StartDate) OVER (ORDER BY StartDate) - 1 EndDate
FROM
(
SELECT d1 StartDate, d2 EndDate from #T
UNION ALL
SELECT #datefrom StartDate, #datefrom EndDate
UNION ALL
SELECT #datethru StartDate, #datethru EndDate
) T
) T
WHERE StartDate <= EndDate
UNION ALL
SELECT #datefrom StartDate, #datefrom EndDate
UNION ALL
SELECT #datethru StartDate, #datethru EndDate
) T
) T
WHERE StartDate <= EndDate
The result is:
StartDate EndDate
2010-01-01 2010-06-13
2010-06-15 2010-08-16
2010-11-01 2010-12-31
The idea is to simulate the scanning algorithm for merging intervals. My solution makes sure it works across a wide range of SQL implementations. I've tested it on MySQL, Postgres, SQL-Server 2017, SQLite and even Hive.
Assuming the table schema is the following.
CREATE TABLE t (
a DATETIME,
b DATETIME
);
We also assume the interval is half-open like [a,b).
When (a,i,j) is in the table, it shows that there are j intervals covering a, and there are i intervals covering the previous point.
CREATE VIEW r AS
SELECT a,
Sum(d) OVER (ORDER BY a ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS i,
Sum(d) OVER (ORDER BY a ROWS UNBOUNDED PRECEDING) AS j
FROM (SELECT a, Sum(d) AS d
FROM (SELECT a, 1 AS d FROM t
UNION ALL
SELECT b, -1 AS d FROM t) e
GROUP BY a) f;
We produce all the endpoints in the union of the intervals and pair up adjacent ones. Finally, we produce the set of intervals by only picking the odd-numbered rows.
SELECT a, b
FROM (SELECT a,
Lead(a) OVER (ORDER BY a) AS b,
Row_number() OVER (ORDER BY a) AS n
FROM r
WHERE j=0 OR i=0 OR i is null) e
WHERE n%2 = 1;
I've created a sample DB-fiddle and SQL-fiddle. I also wrote a blog post on union intervals in SQL.
A Geometric Approach
Here and elsewhere I've noticed that date packing questions don't provide a geometric approach to this problem. After all, any range, date-ranges included, can be interpreted as a line. So why not convert them to a sql geometry type and utilize geometry::UnionAggregate to merge the ranges.
Why?
This has the advantage of handling all types of overlaps, including fully nested ranges. It also works like any other aggregate query, so it's a little more intuitive in that respect. You also get the bonus of a visual representation of your results if you care to use it. Finally, it is the approach I use for simultaneous range packing (you work with rectangles instead of lines in that case, and there are many more considerations). I just couldn't get the existing approaches to work in that scenario.
This has the disadvantage of requiring more recent versions of SQL Server. It also requires a numbers table and it's annoying to extract the individually produced lines from the aggregated shape. But hopefully in the future Microsoft adds a TVF that allows you to do this easily without a numbers table (or you can just build one yourself). Also, geometrical objects work with floats, so you have conversion annoyances and precision concerns to keep in mind.
Performance-wise I don't know how it compares, but I've done a few things (not shown here) to make it work for me even with large datasets.
Code Description
In 'numbers':
I build a table representing a sequence
Swap it out with your favorite way to make a numbers table.
For a union operation, you won't ever need more rows than in
your original table, so I just use it as the base to build it.
In 'mergeLines':
I convert the dates to floats and use those floats
to create geometrical points.
In this problem, we're working in
'integer space,' meaning there are no time considerations, and so
an begin date in one range that is one day apart from an end date
in another should be merged with that other. In order to make
that merge happen, we need to convert to 'real space.', so we
add 1 to the tail of all ranges (we undo this later).
I then connect these points via STUnion and STEnvelope.
Finally, I merge all these lines via UnionAggregate. The resulting
'lines' geometry object might contain multiple lines, but if they
overlap, they turn into one line.
In the outer query:
I use the numbers CTE to extract the individual lines inside 'lines'.
I envelope the lines which here ensures that the lines are stored
only as its two endpoints.
I read the endpoint x values and convert them back to their time
representations, ensuring to put them back into 'integer space'.
The Code
with
numbers as (
select row_number() over (order by (select null)) i
from #t
),
mergeLines as (
select lines = geometry::UnionAggregate(line)
from #t
cross apply (select line =
geometry::Point(convert(float, d1), 0, 0).STUnion(
geometry::Point(convert(float, d2) + 1, 0, 0)
).STEnvelope()
) l
)
select ap.StartDate,
ap.EndDate
from mergeLines ml
join numbers n on n.i between 1 and ml.lines.STNumGeometries()
cross apply (select line = ml.lines.STGeometryN(i).STEnvelope()) l
cross apply (select
StartDate = convert(datetime,l.line.STPointN(1).STX),
EndDate = convert(datetime,l.line.STPointN(3).STX) - 1
) ap
order by ap.StartDate;
In this solution, I created a temporary Calendar table which stores a value for every day across a range. This type of table can be made static. In addition, I'm only storing 400 some odd dates starting with 2009-12-31. Obviously, if your dates span a larger range, you would need more values.
In addition, this solution will only work with SQL Server 2005+ in that I'm using a CTE.
With Calendar As
(
Select DateAdd(d, ROW_NUMBER() OVER ( ORDER BY s1.object_id ), '1900-01-01') As [Date]
From sys.columns as s1
Cross Join sys.columns as s2
)
, StopDates As
(
Select C.[Date]
From Calendar As C
Left Join #T As T
On C.[Date] Between T.d1 And T.d2
Where C.[Date] >= ( Select Min(T2.d1) From #T As T2 )
And C.[Date] <= ( Select Max(T2.d2) From #T As T2 )
And T.d1 Is Null
)
, StopDatesInUse As
(
Select D1.[Date]
From StopDates As D1
Left Join StopDates As D2
On D1.[Date] = DateAdd(d,1,D2.Date)
Where D2.[Date] Is Null
)
, DataWithEariestStopDate As
(
Select *
, (Select Min(SD2.[Date])
From StopDatesInUse As SD2
Where T.d2 < SD2.[Date] ) As StopDate
From #T As T
)
Select Min(d1), Max(d2)
From DataWithEariestStopDate
Group By StopDate
Order By Min(d1)
EDIT The problem with using dates in 2009 has nothing to do with the final query. The problem is that the Calendar table is not big enough. I started the Calendar table at 2009-12-31. I have revised it start at 1900-01-01.
Try this
;WITH T1 AS
(
SELECT d1, d2, ROW_NUMBER() OVER(ORDER BY (SELECT 0)) AS R
FROM #T
), NUMS AS
(
SELECT ROW_NUMBER() OVER(ORDER BY (SELECT 0)) AS R
FROM T1 A
CROSS JOIN T1 B
CROSS JOIN T1 C
), ONERANGE AS
(
SELECT DISTINCT DATEADD(DAY, ROW_NUMBER() OVER(PARTITION BY T1.R ORDER BY (SELECT 0)) - 1, T1.D1) AS ELEMENT
FROM T1
CROSS JOIN NUMS
WHERE NUMS.R <= DATEDIFF(DAY, d1, d2) + 1
), SEQUENCE AS
(
SELECT ELEMENT, DATEDIFF(DAY, '19000101', ELEMENT) - ROW_NUMBER() OVER(ORDER BY ELEMENT) AS rownum
FROM ONERANGE
)
SELECT MIN(ELEMENT) AS StartDate, MAX(ELEMENT) as EndDate
FROM SEQUENCE
GROUP BY rownum
The basic idea is to first unroll the existing data, so you get a separate row for each day. This is done in ONERANGE
Then, identify the relationship between how dates increment and the way the row numbers do.
The difference remains constant within an existing range/island. As soon as you get to a new data island, the difference between them increases because the date increments by more than 1, while the row number increments by 1.
This Solution is similar to the 1st solution with additional Deletion Condition.
This will sort the data in the main table itself instead of using different table to store the result.
DROP TABLE IF EXISTS #SampleTable;
CREATE TABLE #SampleTable (StartTime DATETIME NULL, EndTime DATETIME NULL);
INSERT INTO #SampleTable(StartTime, EndTime)
VALUES
(N'2010-01-01T00:00:00', N'2010-03-31T00:00:00'),
(N'2010-03-01T00:00:00', N'2010-06-13T00:00:00'),
(N'2010-04-01T00:00:00', N'2010-05-31T00:00:00'),
(N'2010-06-15T00:00:00', N'2010-06-25T00:00:00'),
(N'2010-06-26T00:00:00', N'2010-07-10T00:00:00'),
(N'2010-07-04T00:00:00', N'2010-08-16T00:00:00'),
(N'2010-08-01T00:00:00', N'2010-08-05T00:00:00'),
(N'2010-08-01T00:00:00', N'2010-08-09T00:00:00'),
(N'2010-08-02T00:00:00', N'2010-08-07T00:00:00'),
(N'2010-08-08T00:00:00', N'2010-08-08T00:00:00'),
(N'2010-08-09T00:00:00', N'2010-08-12T00:00:00'),
(N'2010-11-01T00:00:00', N'2010-12-31T00:00:00');
--
DECLARE #RowCount INT=0;
WHILE(1=1) --
BEGIN
SET #RowCount=0;
--
UPDATE T1
SET T1.EndTime=T2.EndTime
FROM dbo.#SampleTable AS T1
INNER JOIN dbo.#SampleTable AS T2 ON DATEADD(DAY, 1, T1.EndTime) BETWEEN T2.StartTime AND T2.EndTime;
--
SET #RowCount=#RowCount+##ROWCOUNT;
--
DELETE T1
FROM dbo.#SampleTable AS T1
INNER JOIN dbo.#SampleTable AS T2 ON T1.EndTime=T2.EndTime AND T1.StartTime>T2.StartTime;
--
SET #RowCount=#RowCount+##ROWCOUNT;
--
IF #RowCount=0 --
BREAK;
END;
SELECT * FROM #SampleTable
I was inspired by the Geometric Approach given by pwilcox, but wanted to try a different approach. This is using Trino, but I hope the functions used can also be found in other versions of SQL.
WITH Geo AS (
SELECT
transform( -- 6) See Below~
ST_Geometries( -- 5) Extracts an array of individual lines from the union.
geometry_union( -- 4) Returns the union of aggregated lines, melding all lines together into a single geometric multi-line.
array_agg( -- 3) Aggregation function that joins all lines together.
ST_LineString( -- 2) Makes the pairs of geometric points into lines.
ARRAY[ST_Point(0, to_unixtime(d1)), ST_Point(0, to_unixtime(d2))] -- 1) Takes unix time start and end values and makes them into an array of geometric points.
)
)
)
)
, x -> (ST_YMin(x), ST_Length(x))) AS timestamp_duration -- 6) From the array of lines, The minimum value and length of each line is extracted.
FROM #T -- The miniumum value is a timestamp, length is duration.
WHERE d1 <> d2 -- I had errors any time this was the case.
)
-- 7) Finally, I unnest the produced array and convert the values back into timestamps.
SELECT from_unixtime(timestamp) AS StartDate
, from_unixtime(timestamp + duration) AS EndDate
FROM Geo
CROSS JOIN UNNEST(timestamp_duration) AS t(timestamp, duration)
For reference, this took my company cluster about 2 minutes to make 400k start/end timestamps into 700 distinct start/end timestamps.
It also runs in just 2 stages.