Related
Requirement: Is to insert rows ONLY FOR those rows whose difference b/w SUM of DLY rows are less than WKLY value and the DATES of DLY are within the range of DATES of WKLY
DDL:
create or replace table table_a
(
ID number,
qty number,
date_from date,
date_to date,
grain String
);
insert into tempdw.table_a values (1,102,'2020-07-04','2020-07-04','DLY');
insert into tempdw.table_a values (1,1028,'2020-07-05','2020-07-05','DLY');
insert into tempdw.table_a values (1,2828,'2020-07-06','2020-07-06','DLY');
insert into tempdw.table_a values (1,3870,'2020-07-05','2020-07-11','WKLY');
I need to insert a new row (yellow) with the difference of SUM of DLY(Orange) and WKLY(Green)
Tried :
select ID , sum(impression) over(partition by id , time_grain),date_from,date_to,time_grain
from tempdw.test_impress;
I don't have access to Snowflake but here's an example worked out (and tested with your sample data) using PostgreSQL. Hopefully you can tweak it for your own flavour of SQL.
INSERT INTO table_a
SELECT id,
missing_qty,
missing_date,
missing_date,
'DLY' AS grain
FROM ( /* NOTE: Use Average of b.qty because the value repeats on each row selected */
SELECT a.id,
Cast(Avg(b.qty) - Sum(a.qty) AS INTEGER) AS missing_qty,
( /* NOTE: Find an unused date in the week */
SELECT date(date_from + interval '1 day')
FROM table_a
WHERE grain = 'DLY'
AND date_from + interval '1 day' NOT IN
(
SELECT date_from
FROM table_a
WHERE id = a.id
AND grain <> 'WKLY') ) AS missing_date
FROM table_a a
JOIN table_a b
ON a.id = b.id
AND a.date_from BETWEEN b.date_from AND b.date_to
AND a.grain = 'DLY'
AND b.grain = 'WKLY'
GROUP BY a.id ) x
WHERE missing_qty > 0
This seems to work based on the data you've provided:
alter session set week_start = 7; -- Sets start of week to Sunday
insert into table_a (ID, qty, date_from, date_to, grain)
with t1 as (
select *
, concat(year(date_from),'-',week(date_from)) as year_week -- Week used to group records
, max(date_to) over (partition by grain, year_week) as max_dly_date -- Max date already used within week
,dateadd(day,1,max_dly_date) as new_dly_date -- Next date after the max date
,sum(qty) over (partition by grain, year_week) as sum_dly_qty -- Total qty by week and grain
from table_a
)
select dly.ID, (wkly.qty - dly.sum_dly_qty), dly.new_dly_date, dly.new_dly_date, 'DLY'
from t1 dly
inner join t1 wkly on dly.year_week = wkly.year_week and wkly.grain = 'WKLY'
where dly.grain = 'DLY' and dly.date_to = dly.max_dly_date; -- We only need one DLY record in each week
I have a table organized as follows:
id lateAt
1231235 2019/09/14
1242123 2019/09/13
3465345 NULL
5676548 2019/09/28
8986475 2019/09/23
Where lateAt is a timestamp of when a certain loan's payment became late. So, for each current date - I need to look at these numbers daily - there's a certain amount of entries which are late for 0-15, 15-30, 30-45, 45-60, 60-90 and 90+ days.
This is my desired output:
lateGroup Count
0-15 20
15-30 22
30-45 25
45-60 32
60-90 47
90+ 57
This is something I can easily calculate in R, but to get the results back to my BI dashboard I'd have to create a new table in my database, which I don't think is a good practice. What is the SQL-native approach to this problem?
I would define the "late groups" using a range, the join against the number of days:
with groups (grp) as (
values
(int4range(0,15, '[)')),
(int4range(15,30, '[)')),
(int4range(30,45, '[)')),
(int4range(45,60, '[)')),
(int4range(60,90, '[)')),
(int4range(90,null, '[)'))
)
select grp, count(t.user_id)
from groups g
left join the_table t on g.grp #> current_date - t.late_at
group by grp
order by grp;
int4range(0,15, '[)') creates a range from 0 (inclusive) and 15 (exclusive)
Online example: https://rextester.com/QJSN89445
The quick and dirty way to do this in SQL is:
SELECT '0-15' AS lateGroup,
COUNT(*) AS lateGroupCount
FROM my_table t
WHERE (CURRENT_DATE - t.lateAt) >= 0
AND (CURRENT_DATE - t.lateAt) < 15
UNION
SELECT '15-30' AS lateGroup,
COUNT(*) AS lateGroupCount
FROM my_table t
WHERE (CURRENT_DATE - t.lateAt) >= 15
AND (CURRENT_DATE - t.lateAt) < 30
UNION
SELECT '30-45' AS lateGroup,
COUNT(*) AS lateGroupCount
FROM my_table t
WHERE (CURRENT_DATE - t.lateAt) >= 30
AND (CURRENT_DATE - t.lateAt) < 45
-- Etc...
For production code, you would want to do something more like Ross' answer.
You didn't mention which DBMS you're using, but nearly all of them will have a construct known as a "value constructor" like this:
select bins.lateGroup, bins.minVal, bins.maxVal FROM
(VALUES
('0-15',0,15),
('15-30',15.0001,30), -- increase by a small fraction so bins don't overlap
('30-45',30.0001,45),
('45-60',45.0001,60),
('60-90',60.0001,90),
('90-99999',90.0001,99999)
) AS bins(lateGroup,minVal,maxVal)
If your DBMS doesn't have it, then you can probably use UNION ALL:
SELECT '0-15' as lateGroup, 0 as minVal, 15 as maxVal
union all SELECT '15-30',15,30
union all SELECT '30-45',30,45
Then your complete query, with the sample data you provided, would look like this:
--- example from SQL Server 2012 SP1
--- first let's set up some sample data
create table #temp (id int, lateAt datetime);
INSERT #temp (id, lateAt) values
(1231235,'2019-09-14'),
(1242123,'2019-09-13'),
(3465345,NULL),
(5676548,'2019-09-28'),
(8986475,'2019-09-23');
--- here's the actual query
select lateGroup, count(*) as Count
from #temp as T,
(VALUES
('0-15',0,15),
('15-30',15.0001,30), -- increase by a small fraction so bins don't overlap
('30-45',30.0001,45),
('45-60',45.0001,60),
('60-90',60.0001,90),
('90-99999',90.0001,99999)
) AS bins(lateGroup,minVal,maxVal)
) AS bins(lateGroup,minVal,maxVal)
where datediff(day,lateAt,getdate()) between minVal and maxVal
group by lateGroup
order by lateGroup
--- remove our sample data
drop table #temp;
Here's the output:
lateGroup Count
15-30 2
30-45 2
Note: rows with null lateAt are not counted.
I think you can do it all in one clear query :
with cte_lategroup as
(
select *
from (values(0,15,'0-15'),(15,30,'15-30'),(30,45,'30-45')) as t (mini, maxi, designation)
)
select
t2.designation
, count(*)
from test t
left outer join cte_lategroup t2
on current_date - t.lateat >= t2.mini
and current_date - lateat < t2.maxi
group by t2.designation;
With a preset like yours :
create table test
(
id int
, lateAt date
);
insert into test
values (1231235, to_date('2019/09/14', 'yyyy/mm/dd'))
,(1242123, to_date('2019/09/13', 'yyyy/mm/dd'))
,(3465345, null)
,(5676548, to_date('2019/09/28', 'yyyy/mm/dd'))
,(8986475, to_date('2019/09/23', 'yyyy/mm/dd'));
We have 2 tables:
sales
hourt (only 1 field (hourt) of numbers: 0 to 23)
The goal is to list all dates and all 24 hours for each day and group hours that have sales. For hours that do not have sales, zero will be shown.
This query cross joins the sales table with the hourt table and does list all dates and 24 hours. However, there are also many duplicate rows. How can we avoid the duplicates?
We're using Amazon Redshift (based on Postgres 8.0).
with h as (
SELECT
a.purchase_date,
CAST(DATE_PART("HOUR", AT_TIME_ZONE(AT_TIME_ZONE(CAST(a.purchase_date AS
DATETIME), "0:00"), "PST")) as INTEGER) AS Hour,
COUNT(a.quantity) AS QtyCount,
SUM(a.quantity) AS QtyTotal,
SUM((a.price) AS Price
FROM sales a
GROUP BY CAST(DATE_PART("HOUR",
AT_TIME_ZONE(AT_TIME_ZONE(CAST(a.purchase_date AS DATETIME), "0:00"),
"PST")) as INTEGER),
DATE_FORMAT(AT_TIME_ZONE(AT_TIME_ZONE(CAST(a.purchase_date AS DATETIME),
"0:00"), "PST"), "yyyy-MM-dd")
ORDER by a.purchase_date
),
hr as (
SELECT
CAST(hourt AS INTEGER) AS hourt
FROM hourt
),
joined as (
SELECT
purchase_date,
hourt,
QtyCount,
QtyTotal,
Price
FROM h
cross JOIN hr
)
SELECT *
FROM joined
Order by purchase_date,hourt
Sample Tables:
Before the cross join, query returned correct sales and grouped hours, as seen in the below table.
Desired results table:
Need to create a series of all the hour values and left join your data back to that. Comments inline explain the logic.
WITH data AS (-- Do the basic aggregation first
SELECT DATE_TRUNC('hour',a.purchase_date) purchase_hour --Truncate timestamp to the hour is simpler
,COUNT(a.quantity) AS QtyCount
,SUM(a.quantity) AS QtyTotal
,SUM((a.price) AS Price
FROM sales a
GROUP BY DATE_TRUNC('hour',a.purchase_date)
ORDER BY DATE_TRUNC('hour',a.purchase_date)
-- SELECT '2017-01-13 12:00:00'::TIMESTAMP purchase_hour, 1 qty_count, 1 qty_total, 119 price
-- UNION ALL SELECT '2017-01-13 15:00:00'::TIMESTAMP purchase_hour, 1 qty_count, 1 qty_total, 119 price
-- UNION ALL SELECT '2017-01-14 21:00:00'::TIMESTAMP purchase_hour, 1 qty_count, 1 qty_total, 119 price
)
,time_range AS (--Calculate the start and end **date** values
SELECT DATE_TRUNC('day',MIN(purchase_hour)) start_date
, DATE_TRUNC('day',MAX(purchase_hour))+1 end_date
FROM data
)
,hr AS (--Generate all hours between start and end
SELECT (SELECT start_date
FROM time_range
LIMIT 1) --Limit 1 so the optimizer knows it's not a correlated subquery
+ ((n-1) --Make the series start at zero so we don't miss the starting value
* INTERVAL '1 hour') AS "hour"
FROM (SELECT ROW_NUMBER() OVER () n
FROM stl_query --Can use any table here as long as it enough rows
LIMIT 100) series
WHERE "hour" < (SELECT end_date FROM time_range LIMIT 1)
)
--Use NVL to replace missing values with zeroes
SELECT hr.hour AS purchase_hour --Timestamp like `2017-01-13 12:00:00`
, NVL(data.qty_count, 0) AS qty_count
, NVL(data.qty_total, 0) AS qty_total
, NVL(data.price, 0) AS price
FROM hr
LEFT JOIN data
ON hr.hour = data.purchase_hour
ORDER BY hr.hour
;
I achieved the desired results by using Left Join (table A with table B) instead of Cross Join of these two tables:
Table A has all the dates and hours
Table B is the first part of the original query
I am trying to join two tables in Oracle SQL. One table has a DATE data type which represents a date(go figure) the other has an NUMBER data type which represents a month. I need to join the tables on the DATE's month and the NUMBER. I tried TO_CHAR() but it didn't work. Any suggestions?
Oracle's EXTRACT() function may do the trick ( https://docs.oracle.com/cd/B19306_01/server.102/b14200/functions050.htm ). Suppose we have 2 tables, populated with test data, like so:
create table numbers_ (num_ number);
create table dates_ (date_ date);
begin
for i in 1 .. 12
loop
insert into numbers_ values (i);
end loop;
insert into dates_ values ('15-JUL-2017');
insert into dates_ values ('16-AUG-2017');
insert into dates_ values ('17-SEP-2017');
end;
/
We can use EXTRACT to get the "months" from the dates_ table:
select extract (month from date_) from dates_;
EXTRACT(MONTHFROMDATE_)
7
8
9
Use the "extracted" months for joining the tables:
select *
from
numbers_ N,
( select extract( month from date_ ) month from dates_ ) D
where N.num_ = D.month;
-- output
NUM_ MONTH
7 7
8 8
9 9
If you need more columns from the dates_ table, add them into the subquery (and to the main SELECT clause). Example:
select
N.num_
, D.date_
, D.month
from
numbers_ N,
( select
extract( month from date_ ) month
, date_
from dates_ ) D
where N.num_ = D.month;
(See also: dbfiddle)
Or - better (as #Wernfried Domscheit suggested):
select
N.num_
, D.date_
from
numbers_ N join dates_ D
on extract(month from D.date_) = N.num_ ;
Is there a better way of merging overlapping date intervals?
The solution I came up with is so simple that now I wonder if someone else has a better idea of how this could be done.
/***** DATA EXAMPLE *****/
DECLARE #T TABLE (d1 DATETIME, d2 DATETIME)
INSERT INTO #T (d1, d2)
SELECT '2010-01-01','2010-03-31' UNION SELECT '2010-04-01','2010-05-31'
UNION SELECT '2010-06-15','2010-06-25' UNION SELECT '2010-06-26','2010-07-10'
UNION SELECT '2010-08-01','2010-08-05' UNION SELECT '2010-08-01','2010-08-09'
UNION SELECT '2010-08-02','2010-08-07' UNION SELECT '2010-08-08','2010-08-08'
UNION SELECT '2010-08-09','2010-08-12' UNION SELECT '2010-07-04','2010-08-16'
UNION SELECT '2010-11-01','2010-12-31' UNION SELECT '2010-03-01','2010-06-13'
/***** INTERVAL ANALYSIS *****/
WHILE (1=1) BEGIN
UPDATE t1 SET t1.d2 = t2.d2
FROM #T AS t1 INNER JOIN #T AS t2 ON
DATEADD(day, 1, t1.d2) BETWEEN t2.d1 AND t2.d2
IF ##ROWCOUNT = 0 BREAK
END
/***** RESULT *****/
SELECT StartDate = MIN(d1) , EndDate = d2
FROM #T
GROUP BY d2
ORDER BY StartDate, EndDate
/***** OUTPUT *****/
/*****
StartDate EndDate
2010-01-01 2010-06-13
2010-06-15 2010-08-16
2010-11-01 2010-12-31
*****/
I was looking for the same solution and came across this post on Combine overlapping datetime to return single overlapping range record.
There is another thread on Packing Date Intervals.
I tested this with various date ranges, including the ones listed here, and it works correctly every time.
SELECT
s1.StartDate,
--t1.EndDate
MIN(t1.EndDate) AS EndDate
FROM #T s1
INNER JOIN #T t1 ON s1.StartDate <= t1.EndDate
AND NOT EXISTS(SELECT * FROM #T t2
WHERE t1.EndDate >= t2.StartDate AND t1.EndDate < t2.EndDate)
WHERE NOT EXISTS(SELECT * FROM #T s2
WHERE s1.StartDate > s2.StartDate AND s1.StartDate <= s2.EndDate)
GROUP BY s1.StartDate
ORDER BY s1.StartDate
The result is:
StartDate | EndDate
2010-01-01 | 2010-06-13
2010-06-15 | 2010-06-25
2010-06-26 | 2010-08-16
2010-11-01 | 2010-12-31
You asked this back in 2010 but don't specify any particular version.
An answer for people on SQL Server 2012+
WITH T1
AS (SELECT *,
MAX(d2) OVER (ORDER BY d1) AS max_d2_so_far
FROM #T),
T2
AS (SELECT *,
CASE
WHEN d1 <= DATEADD(DAY, 1, LAG(max_d2_so_far) OVER (ORDER BY d1))
THEN 0
ELSE 1
END AS range_start
FROM T1),
T3
AS (SELECT *,
SUM(range_start) OVER (ORDER BY d1) AS range_group
FROM T2)
SELECT range_group,
MIN(d1) AS d1,
MAX(d2) AS d2
FROM T3
GROUP BY range_group
Which returns
+-------------+------------+------------+
| range_group | d1 | d2 |
+-------------+------------+------------+
| 1 | 2010-01-01 | 2010-06-13 |
| 2 | 2010-06-15 | 2010-08-16 |
| 3 | 2010-11-01 | 2010-12-31 |
+-------------+------------+------------+
DATEADD(DAY, 1 is used because your desired results show you want a period ending on 2010-06-25 to be collapsed into one starting 2010-06-26. For other use cases this may need adjusting.
Here is a solution with just three simple scans. No CTEs, no recursion, no joins, no table updates in a loop, no "group by" — as a result, this solution should scale the best (I think).
I think number of scans can be reduced to two, if min and max dates are known in advance;
the logic itself just needs two scans — find gaps, applied twice.
declare #datefrom datetime, #datethru datetime
DECLARE #T TABLE (d1 DATETIME, d2 DATETIME)
INSERT INTO #T (d1, d2)
SELECT '2010-01-01','2010-03-31'
UNION SELECT '2010-03-01','2010-06-13'
UNION SELECT '2010-04-01','2010-05-31'
UNION SELECT '2010-06-15','2010-06-25'
UNION SELECT '2010-06-26','2010-07-10'
UNION SELECT '2010-08-01','2010-08-05'
UNION SELECT '2010-08-01','2010-08-09'
UNION SELECT '2010-08-02','2010-08-07'
UNION SELECT '2010-08-08','2010-08-08'
UNION SELECT '2010-08-09','2010-08-12'
UNION SELECT '2010-07-04','2010-08-16'
UNION SELECT '2010-11-01','2010-12-31'
select #datefrom = min(d1) - 1, #datethru = max(d2) + 1 from #t
SELECT
StartDate, EndDate
FROM
(
SELECT
MAX(EndDate) OVER (ORDER BY StartDate) + 1 StartDate,
LEAD(StartDate ) OVER (ORDER BY StartDate) - 1 EndDate
FROM
(
SELECT
StartDate, EndDate
FROM
(
SELECT
MAX(EndDate) OVER (ORDER BY StartDate) + 1 StartDate,
LEAD(StartDate) OVER (ORDER BY StartDate) - 1 EndDate
FROM
(
SELECT d1 StartDate, d2 EndDate from #T
UNION ALL
SELECT #datefrom StartDate, #datefrom EndDate
UNION ALL
SELECT #datethru StartDate, #datethru EndDate
) T
) T
WHERE StartDate <= EndDate
UNION ALL
SELECT #datefrom StartDate, #datefrom EndDate
UNION ALL
SELECT #datethru StartDate, #datethru EndDate
) T
) T
WHERE StartDate <= EndDate
The result is:
StartDate EndDate
2010-01-01 2010-06-13
2010-06-15 2010-08-16
2010-11-01 2010-12-31
The idea is to simulate the scanning algorithm for merging intervals. My solution makes sure it works across a wide range of SQL implementations. I've tested it on MySQL, Postgres, SQL-Server 2017, SQLite and even Hive.
Assuming the table schema is the following.
CREATE TABLE t (
a DATETIME,
b DATETIME
);
We also assume the interval is half-open like [a,b).
When (a,i,j) is in the table, it shows that there are j intervals covering a, and there are i intervals covering the previous point.
CREATE VIEW r AS
SELECT a,
Sum(d) OVER (ORDER BY a ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS i,
Sum(d) OVER (ORDER BY a ROWS UNBOUNDED PRECEDING) AS j
FROM (SELECT a, Sum(d) AS d
FROM (SELECT a, 1 AS d FROM t
UNION ALL
SELECT b, -1 AS d FROM t) e
GROUP BY a) f;
We produce all the endpoints in the union of the intervals and pair up adjacent ones. Finally, we produce the set of intervals by only picking the odd-numbered rows.
SELECT a, b
FROM (SELECT a,
Lead(a) OVER (ORDER BY a) AS b,
Row_number() OVER (ORDER BY a) AS n
FROM r
WHERE j=0 OR i=0 OR i is null) e
WHERE n%2 = 1;
I've created a sample DB-fiddle and SQL-fiddle. I also wrote a blog post on union intervals in SQL.
A Geometric Approach
Here and elsewhere I've noticed that date packing questions don't provide a geometric approach to this problem. After all, any range, date-ranges included, can be interpreted as a line. So why not convert them to a sql geometry type and utilize geometry::UnionAggregate to merge the ranges.
Why?
This has the advantage of handling all types of overlaps, including fully nested ranges. It also works like any other aggregate query, so it's a little more intuitive in that respect. You also get the bonus of a visual representation of your results if you care to use it. Finally, it is the approach I use for simultaneous range packing (you work with rectangles instead of lines in that case, and there are many more considerations). I just couldn't get the existing approaches to work in that scenario.
This has the disadvantage of requiring more recent versions of SQL Server. It also requires a numbers table and it's annoying to extract the individually produced lines from the aggregated shape. But hopefully in the future Microsoft adds a TVF that allows you to do this easily without a numbers table (or you can just build one yourself). Also, geometrical objects work with floats, so you have conversion annoyances and precision concerns to keep in mind.
Performance-wise I don't know how it compares, but I've done a few things (not shown here) to make it work for me even with large datasets.
Code Description
In 'numbers':
I build a table representing a sequence
Swap it out with your favorite way to make a numbers table.
For a union operation, you won't ever need more rows than in
your original table, so I just use it as the base to build it.
In 'mergeLines':
I convert the dates to floats and use those floats
to create geometrical points.
In this problem, we're working in
'integer space,' meaning there are no time considerations, and so
an begin date in one range that is one day apart from an end date
in another should be merged with that other. In order to make
that merge happen, we need to convert to 'real space.', so we
add 1 to the tail of all ranges (we undo this later).
I then connect these points via STUnion and STEnvelope.
Finally, I merge all these lines via UnionAggregate. The resulting
'lines' geometry object might contain multiple lines, but if they
overlap, they turn into one line.
In the outer query:
I use the numbers CTE to extract the individual lines inside 'lines'.
I envelope the lines which here ensures that the lines are stored
only as its two endpoints.
I read the endpoint x values and convert them back to their time
representations, ensuring to put them back into 'integer space'.
The Code
with
numbers as (
select row_number() over (order by (select null)) i
from #t
),
mergeLines as (
select lines = geometry::UnionAggregate(line)
from #t
cross apply (select line =
geometry::Point(convert(float, d1), 0, 0).STUnion(
geometry::Point(convert(float, d2) + 1, 0, 0)
).STEnvelope()
) l
)
select ap.StartDate,
ap.EndDate
from mergeLines ml
join numbers n on n.i between 1 and ml.lines.STNumGeometries()
cross apply (select line = ml.lines.STGeometryN(i).STEnvelope()) l
cross apply (select
StartDate = convert(datetime,l.line.STPointN(1).STX),
EndDate = convert(datetime,l.line.STPointN(3).STX) - 1
) ap
order by ap.StartDate;
In this solution, I created a temporary Calendar table which stores a value for every day across a range. This type of table can be made static. In addition, I'm only storing 400 some odd dates starting with 2009-12-31. Obviously, if your dates span a larger range, you would need more values.
In addition, this solution will only work with SQL Server 2005+ in that I'm using a CTE.
With Calendar As
(
Select DateAdd(d, ROW_NUMBER() OVER ( ORDER BY s1.object_id ), '1900-01-01') As [Date]
From sys.columns as s1
Cross Join sys.columns as s2
)
, StopDates As
(
Select C.[Date]
From Calendar As C
Left Join #T As T
On C.[Date] Between T.d1 And T.d2
Where C.[Date] >= ( Select Min(T2.d1) From #T As T2 )
And C.[Date] <= ( Select Max(T2.d2) From #T As T2 )
And T.d1 Is Null
)
, StopDatesInUse As
(
Select D1.[Date]
From StopDates As D1
Left Join StopDates As D2
On D1.[Date] = DateAdd(d,1,D2.Date)
Where D2.[Date] Is Null
)
, DataWithEariestStopDate As
(
Select *
, (Select Min(SD2.[Date])
From StopDatesInUse As SD2
Where T.d2 < SD2.[Date] ) As StopDate
From #T As T
)
Select Min(d1), Max(d2)
From DataWithEariestStopDate
Group By StopDate
Order By Min(d1)
EDIT The problem with using dates in 2009 has nothing to do with the final query. The problem is that the Calendar table is not big enough. I started the Calendar table at 2009-12-31. I have revised it start at 1900-01-01.
Try this
;WITH T1 AS
(
SELECT d1, d2, ROW_NUMBER() OVER(ORDER BY (SELECT 0)) AS R
FROM #T
), NUMS AS
(
SELECT ROW_NUMBER() OVER(ORDER BY (SELECT 0)) AS R
FROM T1 A
CROSS JOIN T1 B
CROSS JOIN T1 C
), ONERANGE AS
(
SELECT DISTINCT DATEADD(DAY, ROW_NUMBER() OVER(PARTITION BY T1.R ORDER BY (SELECT 0)) - 1, T1.D1) AS ELEMENT
FROM T1
CROSS JOIN NUMS
WHERE NUMS.R <= DATEDIFF(DAY, d1, d2) + 1
), SEQUENCE AS
(
SELECT ELEMENT, DATEDIFF(DAY, '19000101', ELEMENT) - ROW_NUMBER() OVER(ORDER BY ELEMENT) AS rownum
FROM ONERANGE
)
SELECT MIN(ELEMENT) AS StartDate, MAX(ELEMENT) as EndDate
FROM SEQUENCE
GROUP BY rownum
The basic idea is to first unroll the existing data, so you get a separate row for each day. This is done in ONERANGE
Then, identify the relationship between how dates increment and the way the row numbers do.
The difference remains constant within an existing range/island. As soon as you get to a new data island, the difference between them increases because the date increments by more than 1, while the row number increments by 1.
This Solution is similar to the 1st solution with additional Deletion Condition.
This will sort the data in the main table itself instead of using different table to store the result.
DROP TABLE IF EXISTS #SampleTable;
CREATE TABLE #SampleTable (StartTime DATETIME NULL, EndTime DATETIME NULL);
INSERT INTO #SampleTable(StartTime, EndTime)
VALUES
(N'2010-01-01T00:00:00', N'2010-03-31T00:00:00'),
(N'2010-03-01T00:00:00', N'2010-06-13T00:00:00'),
(N'2010-04-01T00:00:00', N'2010-05-31T00:00:00'),
(N'2010-06-15T00:00:00', N'2010-06-25T00:00:00'),
(N'2010-06-26T00:00:00', N'2010-07-10T00:00:00'),
(N'2010-07-04T00:00:00', N'2010-08-16T00:00:00'),
(N'2010-08-01T00:00:00', N'2010-08-05T00:00:00'),
(N'2010-08-01T00:00:00', N'2010-08-09T00:00:00'),
(N'2010-08-02T00:00:00', N'2010-08-07T00:00:00'),
(N'2010-08-08T00:00:00', N'2010-08-08T00:00:00'),
(N'2010-08-09T00:00:00', N'2010-08-12T00:00:00'),
(N'2010-11-01T00:00:00', N'2010-12-31T00:00:00');
--
DECLARE #RowCount INT=0;
WHILE(1=1) --
BEGIN
SET #RowCount=0;
--
UPDATE T1
SET T1.EndTime=T2.EndTime
FROM dbo.#SampleTable AS T1
INNER JOIN dbo.#SampleTable AS T2 ON DATEADD(DAY, 1, T1.EndTime) BETWEEN T2.StartTime AND T2.EndTime;
--
SET #RowCount=#RowCount+##ROWCOUNT;
--
DELETE T1
FROM dbo.#SampleTable AS T1
INNER JOIN dbo.#SampleTable AS T2 ON T1.EndTime=T2.EndTime AND T1.StartTime>T2.StartTime;
--
SET #RowCount=#RowCount+##ROWCOUNT;
--
IF #RowCount=0 --
BREAK;
END;
SELECT * FROM #SampleTable
I was inspired by the Geometric Approach given by pwilcox, but wanted to try a different approach. This is using Trino, but I hope the functions used can also be found in other versions of SQL.
WITH Geo AS (
SELECT
transform( -- 6) See Below~
ST_Geometries( -- 5) Extracts an array of individual lines from the union.
geometry_union( -- 4) Returns the union of aggregated lines, melding all lines together into a single geometric multi-line.
array_agg( -- 3) Aggregation function that joins all lines together.
ST_LineString( -- 2) Makes the pairs of geometric points into lines.
ARRAY[ST_Point(0, to_unixtime(d1)), ST_Point(0, to_unixtime(d2))] -- 1) Takes unix time start and end values and makes them into an array of geometric points.
)
)
)
)
, x -> (ST_YMin(x), ST_Length(x))) AS timestamp_duration -- 6) From the array of lines, The minimum value and length of each line is extracted.
FROM #T -- The miniumum value is a timestamp, length is duration.
WHERE d1 <> d2 -- I had errors any time this was the case.
)
-- 7) Finally, I unnest the produced array and convert the values back into timestamps.
SELECT from_unixtime(timestamp) AS StartDate
, from_unixtime(timestamp + duration) AS EndDate
FROM Geo
CROSS JOIN UNNEST(timestamp_duration) AS t(timestamp, duration)
For reference, this took my company cluster about 2 minutes to make 400k start/end timestamps into 700 distinct start/end timestamps.
It also runs in just 2 stages.