Select continuous time intervals in SQL

I have a table with datetimes and I need to select continuous time intervals.
My table:
Id  Time
1   2021-01-01 10:00:00
1   2021-01-01 10:01:00
1   2021-01-01 10:02:00
1   2021-01-01 10:04:00
2   2021-01-01 10:03:00
2   2021-01-01 10:04:00
2   2021-01-01 10:06:00
2   2021-01-01 10:07:00
The result I need:

id  date_from            date_to
1   2021-01-01 10:00:00  2021-01-01 10:02:00
1   2021-01-01 10:04:00  2021-01-01 10:04:00
2   2021-01-01 10:03:00  2021-01-01 10:04:00
2   2021-01-01 10:06:00  2021-01-01 10:07:00
I tried it like this, but I can't get it right:
select id,
       min(date_from) over (partition by id, date_to order by id) as date_from,
       max(date_to) over (partition by id, date_from order by id) as date_to
from (
    select id,
           MIN(time) over (PARTITION by id, diff2 between 0 and 60 ORDER BY id, time) as date_from,
           max(MINUTE) over (PARTITION by id, diff between 0 and 60 ORDER BY id, time) as date_to
    from (
        select *,
               unix_timestamp(date_lead) - unix_timestamp(time) as diff,
               unix_timestamp(time) - unix_timestamp(date_lag) as diff2
        from (
            select id, time,
                   NVL(LEAD(time) over (PARTITION by id ORDER BY id, time), time) as date_lead,
                   NVL(LAG(time) over (PARTITION by id ORDER BY id, time), time) as date_lag
            from my_table)
    )
)

select id, MIN(time), max(time)
from (SELECT *,
             interval '1 minute' * -1 * DENSE_RANK() OVER (PARTITION BY id ORDER BY time)
               + TO_TIMESTAMP(time, 'YYYY-MM-DD HH24:MI:SS') as drank
      from event) t1
GROUP by drank, id
order by id

Assuming your timestamps are precise (no seconds or fractions of a second), you can subtract an enumerated number of minutes from the time column. This value is constant for "adjacent" rows:
select id, min(time), max(time)
from (select t.*,
             row_number() over (partition by id order by time) as seqnum
      from t
     ) t
group by id, time - seqnum * interval '1 minute';
If you have seconds and fractional seconds, then you might want to adjust the logic using date_trunc(). If this is an issue, I would suggest that you ask a new question with appropriate sample data and desired results.
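As a quick illustration of the grouping key (a sketch assuming, as in the answer, that the question's table is named t and time is a timestamp): rows in a continuous run share the same value of time minus seqnum minutes, so the GROUP BY collapses each run into one interval.
select id, time,
       row_number() over (partition by id order by time) as seqnum,
       time - row_number() over (partition by id order by time) * interval '1 minute' as grp_key
from t
order by id, time;
-- id 1: 10:00 -> 09:59, 10:01 -> 09:59, 10:02 -> 09:59 (one group), 10:04 -> 10:00 (new group)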

Related

Explode time duration defined by start and end timestamp by the hour

I have a table with work shifts (1 row per shift) that include date, start and end time.
Main goal: I want to aggregate the number of working hours per hour per store.
This is what my shift table looks like:
employee_id  store  start_timestamp   end_timestamp
1            1      2022-01-01T07:00  2022-01-01T11:30
2            1      2022-01-01T08:30  2022-01-01T12:30
...          ...    ...               ...
I want to "explode" the information into a table like this:

hour   employee_id  store  date        scheduled_work (h)
07:00  1            1      2022-01-01  1
08:00  1            1      2022-01-01  1
09:00  1            1      2022-01-01  1
10:00  1            1      2022-01-01  1
11:00  1            1      2022-01-01  0.5
08:00  2            1      2022-01-01  0.5
09:00  2            1      2022-01-01  1
10:00  2            1      2022-01-01  1
11:00  2            1      2022-01-01  1
12:00  2            1      2022-01-01  0.5
...    ...          ...    ...         ...
I have tried a method using cross joins, but it consumed a lot of memory. It looks like this:
with test as (
select 1 as employee_id, 1 as store_id, timestamp('2022-01-01 07:00:00') as start_timestamp, timestamp('2022-01-01 11:30:00') as end_timestamp union all
select 2 as employee_id, 1 as store_id, timestamp('2022-01-01 08:30:00') as start_timestamp, timestamp('2022-01-01 12:30:00') as end_timestamp
)
, cte as (
select ts
, test.*
, safe_divide(
timestamp_diff(
least(date_add(ts, interval 1 hour), end_timestamp)
, greatest(ts, start_timestamp)
, millisecond
)
, 3600000
) as scheduled_work
from test
cross join unnest(generate_timestamp_array(timestamp('2022-01-01 07:00:00'),
timestamp('2022-01-01 12:30:00'), interval 1 hour)) as ts
order by employee_id, ts)
select * from cte
where scheduled_work >= 0;
It's working but I know this will not be good when the number of shifts starts to add up. Does anyone have another solution that is more efficient?
I'm using BigQuery.
You might want to remove the ORDER BY inside the cte subquery; it will affect query performance.
And another similar approach:
WITH test AS (
select 1 as employee_id, 1 as store_id, timestamp('2022-01-01 07:00:00') as start_timestamp, timestamp('2022-01-01 11:30:00') as end_timestamp union all
select 2 as employee_id, 1 as store_id, timestamp('2022-01-01 08:30:00') as start_timestamp, timestamp('2022-01-01 12:30:00') as end_timestamp
),
explodes AS (
SELECT employee_id, store_id, EXTRACT(DATE FROM h) date, TIME_TRUNC(EXTRACT(TIME FROM h), HOUR) hour, 1 AS scheduled_work
FROM test,
UNNEST (GENERATE_TIMESTAMP_ARRAY(
TIMESTAMP_TRUNC(start_timestamp + INTERVAL 1 HOUR, HOUR),
TIMESTAMP_TRUNC(end_timestamp - INTERVAL 1 HOUR, HOUR), INTERVAL 1 HOUR
)) h
UNION ALL
SELECT employee_id, store_id, EXTRACT(DATE FROM h), TIME_TRUNC(EXTRACT(TIME FROM h), HOUR),
CASE offset
WHEN 0 THEN 1 - (EXTRACT(MINUTE FROM h) * 60 + EXTRACT(SECOND FROM h)) / 3600
WHEN 1 THEN (EXTRACT(MINUTE FROM h) * 60 + EXTRACT(SECOND FROM h)) / 3600
END
FROM test, UNNEST([start_timestamp, end_timestamp]) h WITH OFFSET
)
SELECT * FROM explodes WHERE scheduled_work > 0;
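To make the boundary arithmetic concrete, here is a minimal standalone sketch (BigQuery, with hypothetical literals): the offset-0 branch keeps the unconsumed remainder of the start hour, and the offset-1 branch keeps the consumed part of the end hour.
SELECT h, offset,
       CASE offset
         WHEN 0 THEN 1 - (EXTRACT(MINUTE FROM h) * 60 + EXTRACT(SECOND FROM h)) / 3600  -- remainder of the start hour
         WHEN 1 THEN (EXTRACT(MINUTE FROM h) * 60 + EXTRACT(SECOND FROM h)) / 3600      -- consumed part of the end hour
       END AS fraction
FROM UNNEST([TIMESTAMP '2022-01-01 08:30:00', TIMESTAMP '2022-01-01 12:30:00']) h WITH OFFSET;
-- 08:30 (offset 0) -> 0.5 and 12:30 (offset 1) -> 0.5, matching the 0.5 rows in the expected output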
Consider the approach below:
with temp as (
select * replace(
parse_time('%H:%M', start_time) as start_time,
parse_time('%H:%M', end_time) as end_time
)
from your_table
)
select * except(start_time, end_time),
case
when hour = time_trunc(start_time, hour) then (60 - time_diff(start_time, hour, minute)) / 60
when hour = time_trunc(end_time, hour) then time_diff(end_time, hour, minute) / 60
else 1
end as scheduled_work
from (
select time_add(time_trunc(start_time, hour), interval delta hour) as hour,
employee_id, store, date, start_time, end_time
from temp, unnest(generate_array(0,time_diff(end_time, start_time, hour))) delta
)
order by employee_id, hour
If applied to the sample data in your question, the output matches the expected result shown above.

Create time range with 2 columns date_time

The problem I am facing is how to find distinct time periods from multiple overlapping time periods in Teradata ANSI SQL.
For example, the attached tables contain multiple overlapping time periods; how can I combine those into 3 unique time periods in Teradata SQL?
I think I could do it in Python with a loop, but I am not sure how to do it in SQL.
ID   Start Date  End Date
001  2005-01-01  2006-01-01
001  2005-01-01  2007-01-01
001  2008-01-01  2008-06-01
001  2008-04-01  2008-12-01
001  2010-01-01  2010-05-01
001  2010-04-01  2010-12-01
001  2010-11-01  2012-01-01
My expected result is:

ID   start_date  end_date
001  2005-01-01  2007-01-01
001  2008-01-01  2008-12-01
001  2010-01-01  2012-01-01
From Oracle 12, you can use MATCH_RECOGNIZE to perform a row-by-row comparison:
SELECT *
FROM table_name
MATCH_RECOGNIZE(
  PARTITION BY id
  ORDER BY start_date
  MEASURES
    FIRST(start_date) AS start_date,
    MAX(end_date) AS end_date
  ONE ROW PER MATCH
  PATTERN (overlapping_ranges* last_range)
  DEFINE overlapping_ranges AS NEXT(start_date) <= MAX(end_date)
)
Which, for the sample data:
CREATE TABLE table_name (ID, Start_Date, End_Date) AS
SELECT '001', DATE '2005-01-01', DATE '2006-01-01' FROM DUAL UNION ALL
SELECT '001', DATE '2005-01-01', DATE '2007-01-01' FROM DUAL UNION ALL
SELECT '001', DATE '2008-01-01', DATE '2008-06-01' FROM DUAL UNION ALL
SELECT '001', DATE '2008-04-01', DATE '2008-12-01' FROM DUAL UNION ALL
SELECT '001', DATE '2010-01-01', DATE '2010-05-01' FROM DUAL UNION ALL
SELECT '001', DATE '2010-04-01', DATE '2010-12-01' FROM DUAL UNION ALL
SELECT '001', DATE '2010-11-01', DATE '2012-01-01' FROM DUAL;
Outputs:

ID   START_DATE           END_DATE
001  2005-01-01 00:00:00  2007-01-01 00:00:00
001  2008-01-01 00:00:00  2008-12-01 00:00:00
001  2010-01-01 00:00:00  2012-01-01 00:00:00
db<>fiddle here
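If you want to inspect which source rows each match consumed, a debugging sketch of the same query (Oracle, assuming the table above) can switch to ALL ROWS PER MATCH and expose MATCH_NUMBER() and CLASSIFIER():
SELECT *
FROM table_name
MATCH_RECOGNIZE(
  PARTITION BY id
  ORDER BY start_date
  MEASURES MATCH_NUMBER() AS match_no, CLASSIFIER() AS row_role
  ALL ROWS PER MATCH
  PATTERN (overlapping_ranges* last_range)
  DEFINE overlapping_ranges AS NEXT(start_date) <= MAX(end_date)
)
Each island then appears as one match_no, with every row labelled overlapping_ranges except the final one, which is labelled last_range.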
Update: Alternative query
SELECT id,
       start_date,
       end_date
FROM (
  SELECT id,
         dt,
         SUM(cnt) OVER (PARTITION BY id ORDER BY dt) AS grp,
         cnt
  FROM (
    SELECT ID,
           dt,
           SUM(type) OVER (PARTITION BY id ORDER BY dt, ROWNUM) * type AS cnt
    FROM table_name
    UNPIVOT (dt FOR type IN (start_date AS 1, end_date AS -1))
  )
  WHERE cnt IN (1, 0)
)
PIVOT (MAX(dt) FOR cnt IN (1 AS start_date, 0 AS end_date))
Or, an equivalent that does not use UNPIVOT, PIVOT or ROWNUM and works in both Oracle and PostgreSQL:
SELECT id,
       MAX(CASE cnt WHEN 1 THEN dt END) AS start_date,
       MAX(CASE cnt WHEN 0 THEN dt END) AS end_date
FROM (
  SELECT id,
         dt,
         SUM(cnt) OVER (PARTITION BY id ORDER BY dt) AS grp,
         cnt
  FROM (
    SELECT ID,
           dt,
           SUM(type) OVER (PARTITION BY id ORDER BY dt, rn) * type AS cnt
    FROM (
      SELECT r.*,
             ROW_NUMBER() OVER (PARTITION BY id ORDER BY dt ASC, type DESC) AS rn
      FROM (
        SELECT id, 1 AS type, start_date AS dt FROM table_name
        UNION ALL
        SELECT id, -1 AS type, end_date AS dt FROM table_name
      ) r
    ) p
  ) s
  WHERE cnt IN (1, 0)
) t
GROUP BY id, grp
Update 2: Another Alternative
SELECT id,
       MIN(start_date) AS start_date,
       MAX(end_date) AS end_date
FROM (
  SELECT t.*,
         SUM(CASE WHEN start_date <= prev_max THEN 0 ELSE 1 END)
           OVER (PARTITION BY id ORDER BY start_date) AS grp
  FROM (
    SELECT t.*,
           MAX(end_date) OVER (
             PARTITION BY id ORDER BY start_date
             ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
           ) AS prev_max
    FROM table_name t
  ) t
) t
GROUP BY id, grp
db<>fiddle Oracle PostgreSQL
This is a gaps and islands problem. Try this:
with u as
(select ID, start_date, end_date,
        case when start_date <= lag(end_date) over(partition by ID order by start_date, end_date)
             then 0 else 1 end as grp
 from table_name),
v as
(select ID, start_date, end_date,
        sum(grp) over(partition by ID order by start_date, end_date) as island
 from u)
select ID, min(start_date) as start_date, max(end_date) as end_date
from v
group by ID, island;
Fiddle
Basically, you can identify "islands" by comparing the start_date of the current row to the end_date of the previous row (ordered by start_date, end_date); if it does not start after the previous end, it belongs to the same island. Then a rolling sum() assigns the island numbers. Finally, select min(start_date) and max(end_date) from each island to get the desired output.
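To make that concrete, here is a sketch of the intermediate values the query computes for the sample data (the flag from cte u, plus the running island number from v):
select ID, start_date, end_date,
       lag(end_date) over(partition by ID order by start_date, end_date) as prev_end,
       case when start_date <= lag(end_date) over(partition by ID order by start_date, end_date)
            then 0 else 1 end as grp
from table_name;
-- 2005-01-01  2006-01-01  prev_end null        grp 1  -> island 1
-- 2005-01-01  2007-01-01  prev_end 2006-01-01  grp 0  -> island 1
-- 2008-01-01  2008-06-01  prev_end 2007-01-01  grp 1  -> island 2
-- 2008-04-01  2008-12-01  prev_end 2008-06-01  grp 0  -> island 2
-- 2010-01-01  2010-05-01  prev_end 2008-12-01  grp 1  -> island 3
-- 2010-04-01  2010-12-01  prev_end 2010-05-01  grp 0  -> island 3
-- 2010-11-01  2012-01-01  prev_end 2010-12-01  grp 0  -> island 3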
This may work, with a little bit of change in the function; I tried it in DBeaver:
select ID,Start_Date,End_Date
from
(
select t.*,
dense_rank () over(partition by extract (year from Start_Date) order BY End_Date desc) drnk
from testing_123 t
) temp
where temp.drnk = 1
ORDER BY Start_Date;
Try this
WITH a as (
SELECT
ID,
LEFT(Start_Date, 4) as Year,
MIN(Start_Date) as New_Start_Date
FROM
TAB1
GROUP BY
ID,
LEFT(Start_Date, 4)
), b as (
SELECT
a.ID,
Year,
New_Start_Date,
End_Date
FROM
a
LEFT JOIN
TAB1
ON LEFT(a.New_Start_Date, 4) = LEFT(TAB1.Start_Date, 4)
)
select
ID,
New_Start_Date as Start_Date,
MAX(End_Date)
from
b
GROUP BY
ID,
New_Start_Date;
Example: https://dbfiddle.uk/?rdbms=mysql_8.0&fiddle=97f91b68c635aebfb752538cdd752ace

Overlapping effective dates aggregation

I am trying to aggregate overlapping effective dates. Any gaps between dates should be treated as separate rows. I am using MIN and MAX, and I am getting the output below, but I would like to see the expected output.
My query
WITH test_data AS (
SELECT '2020-01-01' AS date_from,
'2020-01-03' AS date_to,
'1' AS product
UNION ALL
SELECT '2020-01-05' AS date_from,
'2020-01-07' AS date_to,
'1' AS product
UNION ALL
SELECT '2020-01-06' AS date_from,
'2020-01-10' AS date_to,
'1' AS product
)
SELECT product,
MIN(date_from) AS date_from,
MAX(date_to) AS date_to
FROM test_data
GROUP BY 1;
Source data:

date_from   date_to     product
2020-01-01  2020-01-03  1
2020-01-05  2020-01-07  1
2020-01-06  2020-01-10  1
Output table:

date_from   date_to     product
2020-01-01  2020-01-10  1
Expected output:

date_from   date_to     product
2020-01-01  2020-01-03  1
2020-01-05  2020-01-10  1

Thanks in advance!
This is a type of gaps-and-islands problem. I recommend an approach like this:
SELECT product,
MIN(date_from) AS date_from,
MAX(date_to) AS date_to
FROM (SELECT td.*,
SUM(CASE WHEN prev_date_to >= date_from THEN 0 ELSE 1 END) OVER (PARTITION BY product ORDER BY date_to) as grp
FROM (SELECT td.*,
MAX(date_to) OVER (PARTITION BY product ORDER BY date_from ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) as prev_date_to
FROM test_data td
) td
) td
GROUP BY grp, product
ORDER BY product, MIN(date_from);
Here is a db<>fiddle.
What is this doing? The innermost subquery gets the latest date_to from the previous rows; this determines whether each row is "connected" to the previous row or starts a new grouping.
The middle subquery computes a cumulative sum of the rows that start a new group. The outer query then aggregates by this grouping.
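For the three sample rows, the intermediate values look like this (a sketch exposing just the innermost subquery):
select td.*,
       max(date_to) over (partition by product order by date_from
                          rows between unbounded preceding and 1 preceding) as prev_date_to
from test_data td;
-- 2020-01-01 .. 2020-01-03  prev_date_to null        -> new group (grp 1)
-- 2020-01-05 .. 2020-01-07  prev_date_to 2020-01-03  -> new group (grp 2), there is a gap
-- 2020-01-06 .. 2020-01-10  prev_date_to 2020-01-07  -> same group (grp 2), the ranges overlap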
Merging of date ranges could be achieved with MATCH_RECOGNIZE.
Data preparation:
CREATE OR REPLACE TABLE test_data AS
SELECT '2020-01-01'::DATE AS date_from, '2020-01-03'::DATE AS date_to, '1' AS product
UNION ALL
SELECT '2020-01-05'::DATE AS date_from, '2020-01-07'::DATE AS date_to, '1' AS product
UNION ALL
SELECT '2020-01-06'::DATE AS date_from, '2020-01-10'::DATE AS date_to, '1' AS product;
Query:
SELECT *
FROM test_data t
MATCH_RECOGNIZE(
PARTITION BY product
ORDER BY date_from, date_to
MEASURES FIRST(date_from) date_from, MAX(date_to) date_to
PATTERN(a* b)
DEFINE a AS MAX(date_to) OVER() >= NEXT(date_from)
) mr;
db<>fiddle demo - Oracle
Related reading: Merging Overlapping Date Ranges with MATCH_RECOGNIZE by stewashton

How do I calculate the Rolling Average for the difference of days between events on BigQuery?

I have a table of events that goes like this:
date event_category event_planner
2019-09-22T00:00:00 soccer_night Marcus
2019-09-25T00:00:00 comedy_night John
2019-09-28T00:00:00 dance_party John
2019-10-02T00:00:00 soccer_night Marcus
The idea here is to get the rolling average of the difference between dates for every planner.
So far I have the distance in days for each planner separated by category with the following:
DATE_DIFF(SAFE_CAST(date AS date),LAG(SAFE_CAST(date AS date)) OVER (PARTITION BY event_category, event_planner ORDER BY date), day) AS result
What I expect is something like this:
date event_category event_planner rolling_avg
2019-09-22T00:00:00 soccer_night Marcus 0
2019-09-25T00:00:00 comedy_night John 0
2019-09-28T00:00:00 comedy_night John 3
2019-10-02T00:00:00 soccer_night Marcus 10
2019-10-10T00:00:00 comedy_night John 7
Below is for BigQuery Standard SQL
#standardSQL
SELECT * EXCEPT(day, diff), IFNULL(AVG(diff) OVER(PARTITION BY event_category, event_planner ORDER BY day), 0) rolling_avg
FROM (
SELECT *, DATE_DIFF(day, LAG(day) OVER(PARTITION BY event_category, event_planner ORDER BY day), DAY) diff
FROM (
SELECT *, SAFE_CAST(date AS DATE) AS day
FROM `project.dataset.table`
)
)
If applied to the sample data from your question,
WITH `project.dataset.table` AS (
SELECT TIMESTAMP '2019-09-22T00:00:00' date, 'soccer_night' event_category, 'Marcus' event_planner UNION ALL
SELECT '2019-09-25T00:00:00', 'comedy_night', 'John' UNION ALL
SELECT '2019-09-28T00:00:00', 'comedy_night', 'John' UNION ALL
SELECT '2019-10-02T00:00:00', 'soccer_night', 'Marcus' UNION ALL
SELECT '2019-10-10T00:00:00', 'comedy_night', 'John'
)
the result is:
Row date event_category event_planner rolling_avg
1 2019-09-22 00:00:00 UTC soccer_night Marcus 0
2 2019-09-25 00:00:00 UTC comedy_night John 0
3 2019-09-28 00:00:00 UTC comedy_night John 3.0
4 2019-10-02 00:00:00 UTC soccer_night Marcus 10.0
5 2019-10-10 00:00:00 UTC comedy_night John 7.5
How should I modify this to use the average of the last three events of the same type by the same planner?
#standardSQL
SELECT * EXCEPT(day, diff),
IFNULL(AVG(diff) OVER(PARTITION BY event_category, event_planner ORDER BY day ROWS BETWEEN 2 PRECEDING AND CURRENT ROW), 0) rolling_avg
FROM (
SELECT *, DATE_DIFF(day, LAG(day) OVER(PARTITION BY event_category, event_planner ORDER BY day), DAY) diff
FROM (
SELECT *, SAFE_CAST(date AS DATE) AS day
FROM `project.dataset.table`
)
)
You could compute the previous date in a subquery using lag(), and then compute a rolling average in the outer query:
select
  t.*,
  avg(date_diff(date, lag_date, day)) over(
    partition by event_category, event_planner order by date
  ) rolling_avg
from (
  select
    t.*,
    lag(date) over(
      partition by event_category, event_planner order by date
    ) lag_date
  from mytable t
) t
For the average you can use:
(DATE_DIFF(SAFE_CAST(date AS date),
           MIN(SAFE_CAST(date AS date)) OVER (PARTITION BY event_category, event_planner),
           day
 ) /
 NULLIF(COUNT(*) OVER (PARTITION BY event_category, event_planner) - 1, 0)
) AS result
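As a worked check against the sample data: for John's comedy_night events on 2019-09-25, 2019-09-28 and 2019-10-10, the last row gives DATE_DIFF('2019-10-10', '2019-09-25', day) / (3 - 1) = 15 / 2 = 7.5, matching the rolling_avg in the earlier result. Note that COUNT(*) here spans the whole partition, so this expression agrees with the running average only on each planner's latest row.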

Calculating concurrency from a set of ranges

I have a set of rows containing a start timestamp and a duration. I want to perform various summaries using the overlap or concurrency.
For example: peak daily concurrency, peak concurrency grouped on another column.
Example data:
timestamp,duration
2016-01-01 12:00:00,300
2016-01-01 12:01:00,300
2016-01-01 12:06:00,300
I would like to know that peak for the period was 12:01:00-12:05:00 at 2 concurrent.
Any ideas on how to achieve this using BigQuery or, less exciting, a Map/Reduce job?
For a per-minute resolution, with session lengths of up to 255 minutes:
SELECT session_minute, COUNT(*) c
FROM (
SELECT start, DATE_ADD(start, i, 'MINUTE') session_minute FROM (
SELECT * FROM (
SELECT TIMESTAMP("2015-04-30 10:14") start, 7 minutes
),(
SELECT TIMESTAMP("2015-04-30 10:15") start, 12 minutes
),(
SELECT TIMESTAMP("2015-04-30 10:15") start, 12 minutes
),(
SELECT TIMESTAMP("2015-04-30 10:18") start, 12 minutes
),(
SELECT TIMESTAMP("2015-04-30 10:23") start, 3 minutes
)
) a
CROSS JOIN [fh-bigquery:public_dump.numbers_255] b
WHERE a.minutes>b.i
)
GROUP BY 1
ORDER BY 1
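On current BigQuery Standard SQL, the same per-minute expansion needs no numbers table; a minimal sketch, assuming a table like the inline sample above with columns start TIMESTAMP and minutes INT64 (your_table is a placeholder name):
SELECT session_minute, COUNT(*) AS c
FROM your_table,
     UNNEST(GENERATE_TIMESTAMP_ARRAY(
       start,
       TIMESTAMP_ADD(start, INTERVAL minutes - 1 MINUTE),  -- last minute the session touches
       INTERVAL 1 MINUTE)) AS session_minute
GROUP BY session_minute
ORDER BY session_minute;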
STEP 1 - First you need to find all periods (start and end) with their respective concurrent entries
SELECT ts AS start, LEAD(ts) OVER(ORDER BY ts) AS finish,
SUM(entry) OVER(ORDER BY ts) AS concurrent_entries
FROM (
SELECT ts, SUM(entry)AS entry
FROM
(SELECT ts, 1 AS entry FROM yourTable),
(SELECT DATE_ADD(ts, duration, 'second') AS ts, -1 AS entry FROM yourTable)
GROUP BY ts
HAVING entry != 0
)
ORDER BY ts
Assuming input as below
(SELECT TIMESTAMP('2016-01-01 12:00:00') AS ts, 300 AS duration),
(SELECT TIMESTAMP('2016-01-01 12:01:00') AS ts, 300 AS duration),
(SELECT TIMESTAMP('2016-01-01 12:06:00') AS ts, 300 AS duration),
(SELECT TIMESTAMP('2016-01-01 12:07:00') AS ts, 300 AS duration),
(SELECT TIMESTAMP('2016-01-01 12:10:00') AS ts, 300 AS duration),
(SELECT TIMESTAMP('2016-01-01 12:11:00') AS ts, 300 AS duration)
the output of the above query will look something like this:
start finish concurrent_entries
2016-01-01 12:00:00 UTC 2016-01-01 12:01:00 UTC 1
2016-01-01 12:01:00 UTC 2016-01-01 12:05:00 UTC 2
2016-01-01 12:05:00 UTC 2016-01-01 12:07:00 UTC 1
2016-01-01 12:07:00 UTC 2016-01-01 12:10:00 UTC 2
2016-01-01 12:10:00 UTC 2016-01-01 12:12:00 UTC 3
2016-01-01 12:12:00 UTC 2016-01-01 12:15:00 UTC 2
2016-01-01 12:15:00 UTC 2016-01-01 12:16:00 UTC 1
2016-01-01 12:16:00 UTC null 0
You might still want to polish the above query a little, but it mainly does what you need.
STEP 2 - Now you can compute any stats off of the above result.
For example, the peak over the whole period:
SELECT
start, finish, concurrent_entries, RANK() OVER(ORDER BY concurrent_entries DESC) AS peak
FROM (
SELECT ts AS start, LEAD(ts) OVER(ORDER BY ts) AS finish,
SUM(entry) OVER(ORDER BY ts) AS concurrent_entries
FROM (
SELECT ts, SUM(entry)AS entry FROM
(SELECT ts, 1 AS entry FROM yourTable),
(SELECT DATE_ADD(ts, duration, 'second') AS ts, -1 AS entry FROM yourTable)
GROUP BY ts
HAVING entry != 0
)
)
ORDER BY peak
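The same +1/-1 sweep carries over to BigQuery Standard SQL almost verbatim; a sketch, assuming yourTable has columns ts TIMESTAMP and duration INT64 (seconds):
WITH edges AS (
  SELECT ts, 1 AS entry FROM yourTable
  UNION ALL
  SELECT TIMESTAMP_ADD(ts, INTERVAL duration SECOND), -1 FROM yourTable
)
SELECT ts AS start,
       LEAD(ts) OVER (ORDER BY ts) AS finish,
       SUM(entry) OVER (ORDER BY ts) AS concurrent_entries  -- running sum of +1/-1 = concurrency during [start, finish)
FROM (
  SELECT ts, SUM(entry) AS entry
  FROM edges
  GROUP BY ts
  HAVING SUM(entry) != 0
)
ORDER BY ts;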