Exclude overlapping periods in time aggregate function - sql

I have a table containing each a start and and end date:
DROP TABLE temp_period;
CREATE TABLE public.temp_period
(
id integer NOT NULL,
"startDate" date,
"endDate" date
);
INSERT INTO temp_period(id,"startDate","endDate") VALUES(1,'2010-01-01','2010-03-31');
INSERT INTO temp_period(id,"startDate","endDate") VALUES(2,'2013-05-17','2013-07-18');
INSERT INTO temp_period(id,"startDate","endDate") VALUES(3,'2010-02-15','2010-05-31');
INSERT INTO temp_period(id,"startDate","endDate") VALUES(7,'2014-01-01','2014-12-31');
INSERT INTO temp_period(id,"startDate","endDate") VALUES(56,'2014-03-31','2014-06-30');
Now I want to know the total duration of all periods stored there. I need just the time as an interval. That's pretty easy:
SELECT sum(age("endDate","startDate")) FROM temp_period;
However, the problem is: Those periods do overlap. And I want to eliminate all overlapping periods, so that I get the total amount of time which is covered by at least one record in the table.
You see, there are quite some gaps in between the times, so passing the smallest start date and the most recent end date to the age function won't do the trick. However, I thought about doing that and subtracting the total amount of gaps, but no elegant way to do that came into my mind.
I use PostgreSQL 9.6.

What about this:
WITH
/* get all time points where something changes */
points AS (
SELECT "startDate" AS p
FROM temp_period
UNION SELECT "endDate"
FROM temp_period
),
/*
* Get all date ranges between these time points.
* The first time range will start with NULL,
* but that will be excluded in the next CTE anyway.
*/
inter AS (
SELECT daterange(
lag(p) OVER (ORDER BY p),
p
) i
FROM points
),
/*
* Get all date ranges that are contained
* in at least one of the intervals.
*/
overlap AS (
SELECT DISTINCT i
FROM inter
CROSS JOIN temp_period
WHERE i <# daterange("startDate", "endDate")
)
/* sum the lengths of the date ranges */
SELECT sum(age(upper(i), lower(i)))
FROM overlap;
For your data it will return:
┌──────────┐
│ interval │
├──────────┤
│ 576 days │
└──────────┘
(1 row)

You could try to use recursive cte to calculate the period. For each record, we will check if it's overlapped with previous records. If it is, we only calculate the period that is not overlapping.
WITH RECURSIVE days_count AS
(
SELECT startDate,
endDate,
AGE(endDate, startDate) AS total_days,
rowSeq
FROM ordered_data
WHERE rowSeq = 1
UNION ALL
SELECT GREATEST(curr.startDate, prev.endDate) AS startDate,
GREATEST(curr.endDate, prev.endDate) AS endDate,
AGE(GREATEST(curr.endDate, prev.endDate), GREATEST(curr.startDate, prev.endDate)) AS total_days,
curr.rowSeq
FROM ordered_data curr
INNER JOIN days_count prev
ON curr.rowSeq > 1
AND curr.rowSeq = prev.rowSeq + 1),
ordered_data AS
(
SELECT *,
ROW_NUMBER() OVER (ORDER BY startDate) AS rowSeq
FROM temp_period)
SELECT SUM(total_days) AS total_days
FROM days_count;
I've created a demo here

Actually there is a case that is not covered by the previous examples.
What if we have such a period ?
INSERT INTO temp_period(id,"startDate","endDate") VALUES(100,'2010-01-03','2010-02-10');
We have the following intervals:
Interval No. | | start_date | | end_date
--------------+------------------+------------+----------------+------------
1 | Interval start | 2010-01-01 | Interval end | 2010-03-31
2 | Interval start | 2010-01-03 | Interval end | 2010-02-10
3 | Interval start | 2010-02-15 | Interval end | 2010-05-31
4 | Interval start | 2013-05-17 | Interval end | 2013-07-18
5 | Interval start | 2014-01-01 | Interval end | 2014-12-31
6 | Interval start | 2014-03-31 | Interval end | 2014-06-30
Even though segment 3 overlaps segment 1, it's seen as a new segment, hence the (wrong) result:
sum
-----
620
(1 row)
The solution is to tweak the core of the query
CASE WHEN start_date < lag(end_date) OVER (ORDER BY start_date, end_date) then NULL ELSE start_date END
needs to be replaced by
CASE WHEN start_date < max(end_date) OVER (ORDER BY start_date, end_date rows between unbounded preceding and 1 preceding) then NULL ELSE start_date END
then it works as expected
sum
-----
576
(1 row)
Summary:
SELECT sum(e - s)
FROM (
SELECT left_edge as s, max(end_date) as e
FROM (
SELECT start_date, end_date, max(new_start) over (ORDER BY start_date, end_date) as left_edge
FROM (
SELECT start_date, end_date, CASE WHEN start_date < max(end_date) OVER (ORDER BY start_date, end_date rows between unbounded preceding and 1 preceding) then NULL ELSE start_date END AS new_start
FROM temp_period
) s1
) s2
GROUP BY left_edge
) s3;

This one required two outer joins on a complex query. One join to identify all overlaps with a startdate larger than THIS and to expand the timespan to match the larger of the two. The second join is needed to match records with no overlaps. Take the Min of the min and the max of the max, including non matched. I was using MSSQL so the syntax may be a bit different.
DECLARE #temp_period TABLE
(
id int NOT NULL,
startDate datetime,
endDate datetime
)
INSERT INTO #temp_period(id,startDate,endDate) VALUES(1,'2010-01-01','2010-03-31')
INSERT INTO #temp_period(id,startDate,endDate) VALUES(2,'2013-05-17','2013-07-18')
INSERT INTO #temp_period(id,startDate,endDate) VALUES(3,'2010-02-15','2010-05-31')
INSERT INTO #temp_period(id,startDate,endDate) VALUES(3,'2010-02-15','2010-07-31')
INSERT INTO #temp_period(id,startDate,endDate) VALUES(7,'2014-01-01','2014-12-31')
INSERT INTO #temp_period(id,startDate,endDate) VALUES(56,'2014-03-31','2014-06-30')
;WITH OverLaps AS
(
SELECT
Main.id,
OverlappedID=Overlaps.id,
OverlapMinDate,
OverlapMaxDate
FROM
#temp_period Main
LEFT OUTER JOIN
(
SELECT
This.id,
OverlapMinDate=CASE WHEN This.StartDate<Prior.StartDate THEN This.StartDate ELSE Prior.StartDate END,
OverlapMaxDate=CASE WHEN This.EndDate>Prior.EndDate THEN This.EndDate ELSE Prior.EndDate END,
PriorID=Prior.id
FROM
#temp_period This
LEFT OUTER JOIN #temp_period Prior ON Prior.endDate > This.startDate AND Prior.startdate < this.endDate AND This.Id<>Prior.ID
) Overlaps ON Main.Id=Overlaps.PriorId
)
SELECT
T.Id,
--If has overlapped then sum all overlapped records prior to this one, else not and overlap get the start and end
MinDate=MIN(COALESCE(HasOverlapped.OverlapMinDate,startDate)),
MaxDate=MAX(COALESCE(HasOverlapped.OverlapMaxDate,endDate))
FROM
#temp_period T
LEFT OUTER JOIN OverLaps IsAOverlap ON IsAOverlap.OverlappedID=T.id
LEFT OUTER JOIN OverLaps HasOverlapped ON HasOverlapped.Id=T.id
WHERE
IsAOverlap.OverlappedID IS NULL -- Exclude older records that have overlaps
GROUP BY
T.Id

Beware: the answer by Laurenz Albe has a huge scalability issue.
I was more than happy when I found it. I customized it for our needs. We deployed to staging and very soon, the server took several minutes to return the results.
Then I found this answer on postgresql.org. Much more efficient.
https://wiki.postgresql.org/wiki/Range_aggregation
SELECT sum(e - s)
FROM (
SELECT left_edge as s, max(end_date) as e
FROM (
SELECT start_date, end_date, max(new_start) over (ORDER BY start_date, end_date) as left_edge
FROM (
SELECT start_date, end_date, CASE WHEN start_date < lag(end_date) OVER (ORDER BY start_date, end_date) then NULL ELSE start_date END AS new_start
FROM temp_period
) s1
) s2
GROUP BY left_edge
) s3;
Result:
sum
-----
576
(1 row)

Related

Get Start and End date from multiple rows of dates, excluding weekends

I'm trying figure out how to return Start Date and End date based on data like in the below table:
Name
Date From
Date To
A
2022-01-03
2022-01-03
A
2021-12-29
2021-12-31
A
2021-12-28
2021-12-28
A
2021-12-27
2021-12-27
A
2021-12-23
2021-12-24
A
2021-11-08
2021-11-09
The result I am after would show like this:
Name
Date From
Date To
A
2021-12-23
2022-01-03
A
2021-11-08
2021-11-09
The dates in first table will sometimes go over weekends with the Date From and Date To, but in cases where the row ends on a Friday and next row starts on following Monday it will need to be classified as the same "block", as presented in the second table. I was hoping to use DATEFIRST setting to cater for the weekends to avoid using a calendar table, as per How do I exclude Weekend days in a SQL Server query?, but if calendar table ends up being the easiest way out I'm happy to look into creating one.
In above example I only have 1 Name, but the table will have multiple names and it will need to be grouped by that.
The only examples of this I am seeing are using only 1 date column for records and I struggled changing their code around to cater for my example. The closest example I found doesn't work for me as it is based on datetime fields and the time differences - find start and stop date for contiguous dates in multiple rows
This is a Gaps & Island problem with the twist that you need to consider weekend continuity.
You can do:
select max(name) as name, min(date_from) as date_from, max(date_to) as date_to
from (
select *, sum(inc) over(order by date_to) as grp
from (
select *,
case when lag(ext_to) over(order by date_to) = date_from
then 0 else 1 end as inc
from (
select *,
case when (datepart(weekday, date_to) = 6)
then dateadd(day, 3, date_to)
else dateadd(day, 1, date_to) end as ext_to
from t
) x
) y
) z
group by grp
Result:
name date_from date_to
---- ---------- ----------
A 2021-11-08 2021-11-09
A 2021-12-23 2022-01-03
See running example at db<>fiddle #1.
Note: Your question doesn't mention it, but you probably want to segment per person. I didn't do it.
EDIT: Adding partition by name
Partitioning by name is quite easy actually. The following query does it:
select name, min(date_from) as date_from, max(date_to) as date_to
from (
select *, sum(inc) over(partition by name order by date_to) as grp
from (
select *,
case when lag(ext_to) over(partition by name order by date_to) = date_from
then 0 else 1 end as inc
from (
select *,
case when (datepart(weekday, date_to) = 6)
then dateadd(day, 3, date_to)
else dateadd(day, 1, date_to) end as ext_to
from t
) x
) y
) z
group by name, grp
order by name, grp
See running query at db<>fiddle #2.
with extended as (
select name,
date_from,
case when datepart(weekday, date_to) = 6
then dateadd(day, 2, date_to) else date_to end as date_to
from t
), adjacent as (
select *,
case when dateadd(day, 1,
lag(date_to) over (partition by name order by date_from)) = date_from
then 0 else 1 end as brk
from extended
), blocked as (
select *, sum(brk) over (partition by name order by date_from) as grp
from adjacent
)
select name, min(date_from), max(date_to) from blocked
group by name, grp;
I'm assuming that ranges do no overlap and that all input dates do fall on weekdays. While hammering this out on my cellphone I originally made two mistakes. For some reason I got to and from dates reversed in my head and then I was thinking that Friday is 5 (as with ##datefirst) rather than 6. (Of course this could otherwise vary with the regional setting anyway.) One advantage of using table expressions is to modularize and bury certain details in lower levels of the logic. In this case it would be very easy to adjust dates should some of these assumptions prove to be wrong.
https://dbfiddle.uk/?rdbms=sqlserver_2019&fiddle=42e0c452d57d474232bcf991d6d3c43c

Count Data by Loop Calendar SQL/Oracle

I need to get the data that generates count of total ID by date between date_active and date_end using date ranges for each. If the dates are crossing each other the ID will adding up. here is the data I have right now,
TABLE CONTRACT:
ID DATE_ACTIVE DATE_END
1 05-FEB-13 08-NOV-13
1 21-DEC-18 06-OCT-19
2 05-FEB-13 27-JAN-14
3 05-FEB-13 07-NOV-13
4 06-FEB-13 02-NOV-13
4 25-OCT-14 13-APR-16
TABLE CALENDAR:
DT
05-FEB-13
06-FEB-13
07-FEB-13
08-FEB-13
09-FEB-13
..-DEC-19
what I want out is basically like this:
DT COUNT(ID)
05-FEB-13 3
06-FEB-13 4
07-FEB-13 4
08-FEB-13 4
09-FEB-13 4
10-FEB-13 4
....
03-NOV-13 3
....
08-NOV-13 2
09-NOV-13 1
....
28-JAN-14 0
....
25-OCT-14 1
....
13-APR-16 1
14-APR-16 0
....
21-DEC-18 1
....
06-OCT-19 1
07-OCT-19 0
....
....
And here is my query to get that result
with contract as (
select * from contract
where id in ('1','2','3','4')
)
,
cal as
(
select TRUNC (SYSDATE - ROWNUM) dt
from dual
connect by rownum < sysdate - to_date('05-FEB-13')
)
select aa.dt,count(distinct bb.id)id from cal aa
left join contract bb on aa.dt >= bb.date_active and aa.dt<= bb.date_end
group by aa.dt
order by 1
but the problem is I have 6 mio of ID and if I use this kind of query, the result maybe will take forever, and I'm having a hard times to figured out how to get the result with different query. It will be my pleasure if somebody can help me out of this. Thank you so much.
If you group your events by date_active and date_end, you will get the numbers of events which have started and ended on each separate day.
Not a lot of days have passed between 2013 and 2019 (about 2 000), so the grouped resultsets will be relatively short.
Now that you have the two groups, you can notice that the number of events on each given date is the number of events which have started on or before this date, minus the number of events which have finished on or before this date (I'm assuming the end dates are non-inclusive).
In other words, the number of events on every given day is:
The number of events on the previous date,
plus the number of events started on this date,
minus the number of events ended on this date.
This can be easily done using a window function.
This will require a join between the calendar table and the two groups, but fortunately all of them are relatively short (thousands of records) and the join would be fast.
Here's the query: http://sqlfiddle.com/#!4/b21ce/5
WITH cal AS
(
SELECT TRUNC (to_date('01-NOV-13') - ROWNUM) dt
FROM dual
CONNECT BY
rownum < to_date('01-NOV-13')- to_date('01-FEB-13')
),
started_on AS
(
SELECT date_active AS dt, COUNT(*) AS cnt_start
FROM contract
GROUP BY
date_active
),
ended_on AS
(
SELECT date_end AS dt, COUNT(*) AS cnt_end
FROM contract
GROUP BY
date_end
)
SELECT dt,
SUM(COALESCE(cnt_start, 0) - COALESCE(cnt_end, 0)) OVER (ORDER BY dt) cnt
FROM cal c
LEFT JOIN
started_on s
USING (dt)
LEFT JOIN
ended_on e
USING (dt)
(I used a fixed date instead of SYSDATE to keep the resultset short, but the idea is the same)
This query requires that the calendar starts before the earliest event, otherwise every result will be off by a fixed amount, the number of events before the beginning of the calendar.
You can replace the fixed date in the calendar condition with (SELECT MIN(date_active) FROM contract) which is instant if date_active is indexed.
Update:
If your contract dates can overlap and you want to collapse multiple overlapping contracts into a one continuous contract, you can use window functions to do so.
WITH cal AS
(
SELECT TRUNC (to_date('01-NOV-13') - ROWNUM) dt
FROM dual
CONNECT BY
rownum <= to_date('01-NOV-13')- to_date('01-FEB-13')
),
collapsed_contract AS
(
SELECT *
FROM (
SELECT c.*,
COALESCE(LAG(date_end_effective) OVER (PARTITION BY id ORDER BY date_active), date_active) AS date_start_effective
FROM (
SELECT c.*,
MAX(date_end) OVER (PARTITION BY id ORDER BY date_active) AS date_end_effective
FROM contract c
) c
) c
WHERE date_start_effective < date_end_effective
),
started_on AS
(
SELECT date_start_effective AS dt, COUNT(*) AS cnt_start
FROM collapsed_contract
GROUP BY
date_start_effective
),
ended_on AS
(
SELECT date_end_effective AS dt, COUNT(*) AS cnt_end
FROM collapsed_contract
GROUP BY
date_end_effective
)
SELECT dt,
SUM(COALESCE(cnt_start, 0) - COALESCE(cnt_end, 0)) OVER (ORDER BY dt) cnt
FROM cal c
LEFT JOIN
started_on s
USING (dt)
LEFT JOIN
ended_on e
USING (dt)
http://sqlfiddle.com/#!4/adeba/1
The query might seem bulky, but that's to make it more efficient, as all these window functions can be calculated in a single pass over the table.
Note however that this single pass relies on the table being sorted on (id, date_active) so an index on these two fields is crucial.
Firstly, row_number() over (order by id,date_active) analytic function is used in order to generate unique ID values those will be substituted in
connect by level <= ... and prior id = id syntax to get unpivoted hierarchical data :
with t0 as
(
select row_number() over (order by id,date_active) as id, date_active, date_end
from contract
), t1 as
(
select date_active + level - 1 as dt
from t0
connect by level <= date_end - date_active + 1
and prior id = id
and prior sys_guid() is not null
)
select dt, count(*)
from t1
group by dt
order by dt
Demo

Google Big Query SQL - Get most recent unique value by date

#EDIT - Following the comments, I rephrase my question
I have a BigQuery table that i want to use to get some KPI of my application.
In this table, I save each create or update as a new line in order to keep a better history.
So I have several times the same data with a different state.
Example of the table :
uuid |status |date
––––––|–––––––––––|––––––––––
3 |'inactive' |2018-05-12
1 |'active' |2018-05-10
1 |'inactive' |2018-05-08
2 |'active' |2018-05-08
3 |'active' |2018-05-04
2 |'inactive' |2018-04-22
3 |'inactive' |2018-04-18
We can see that we have multiple value of each data.
What I would like to get:
I would like to have the number of current 'active' entry (So there must be no 'inactive' entry with the same uuid after). And to complicate everything, I need this total per day.
So for each day, the amount of 'active' entries, including those from previous days.
So with this example I should have this result :
date | actives
____________|_________
2018-05-02 | 0
2018-05-03 | 0
2018-05-04 | 1
2018-05-05 | 1
2018-05-06 | 1
2018-05-07 | 1
2018-05-08 | 2
2018-05-09 | 2
2018-05-10 | 3
2018-05-11 | 3
2018-05-12 | 2
Actually i've managed to get the good amount of actives for one day. But my problem is when i want the results for each days.
What I've tried:
I'm stuck with two solutions that each return a different error.
First solution :
WITH
dates AS(
SELECT GENERATE_DATE_ARRAY(
DATE_SUB(CURRENT_DATE(), INTERVAL 6 MONTH), CURRENT_DATE(), INTERVAL 1 DAY)
arr_dates )
SELECT
i_date date,
(
SELECT COUNT(uuid)
FROM (
SELECT
uuid, status, date,
RANK() OVER(PARTITION BY uuid ORDER BY date DESC) rank
FROM users
WHERE
PARSE_DATE("%Y-%m-%d", FORMAT_DATETIME("%Y-%m-%d",date)) <= i_date
)
WHERE
status = 'active'
and rank = 1
## rank is the condition which causes the error
) users
FROM
dates, UNNEST(arr_dates) i_date
ORDER BY i_date;
The SELECT with the RANK() OVER correctly returns the users with a rank column that allow me to know which entry is the last for each uuid.
But when I try this, I got a :
Correlated subqueries that reference other tables are not supported unless they can be de-correlated, such as by transforming them into an efficient JOIN. because of the rank = 1 condition.
Second solution :
WITH
dates AS(
SELECT GENERATE_DATE_ARRAY(
DATE_SUB(CURRENT_DATE(), INTERVAL 6 MONTH), CURRENT_DATE(), INTERVAL 1 DAY)
arr_dates )
SELECT
i_date date,
(
SELECT
COUNT(t1.uuid)
FROM
users t1
WHERE
t1.date = (
SELECT MAX(t2.date)
FROM users t2
WHERE
t2.uuid = t1.uuid
## Here that's the i_date condition which causes problem
AND PARSE_DATE("%Y-%m-%d", FORMAT_DATETIME("%Y-%m-%d", t2.date)) <= i_date
)
AND status='active' ) users
FROM
dates,
UNNEST(arr_dates) i_date
ORDER BY i_date;
Here, the second select is working too and correctly returning the number of active user for a current day.
But the problem is when i try to use i_date to retrieve datas among the multiple days.
And Here i got a LEFT OUTER JOIN cannot be used without a condition that is an equality of fields from both sides of the join. error...
Which solution is more able to succeed ? What should i change ?
And, if my way of storing the data isn't good, how should i proceed in order to keep a precise history ?
Below is for BigQuery Standard SQL
#standardSQL
SELECT date, COUNT(DISTINCT uuid) total_active
FROM `project.dataset.table`
WHERE status = 'active'
GROUP BY date
-- ORDER BY date
Update to address your "rephrased" question :o)
Below example is using dummy data from your question
#standardSQL
WITH `project.dataset.users` AS (
SELECT 3 uuid, 'inactive' status, DATE '2018-05-12' date UNION ALL
SELECT 1, 'active', '2018-05-10' UNION ALL
SELECT 1, 'inactive', '2018-05-08' UNION ALL
SELECT 2, 'active', '2018-05-08' UNION ALL
SELECT 3, 'active', '2018-05-04' UNION ALL
SELECT 2, 'inactive', '2018-04-22' UNION ALL
SELECT 3, 'inactive', '2018-04-18'
), dates AS (
SELECT day FROM UNNEST((
SELECT GENERATE_DATE_ARRAY(MIN(date), MAX(date))
FROM `project.dataset.users`
)) day
), active_users AS (
SELECT uuid, status, date first, DATE_SUB(next_status.date, INTERVAL 1 DAY) last FROM (
SELECT uuid, date, status, LEAD(STRUCT(status, date)) OVER(PARTITION BY uuid ORDER BY date ) next_status
FROM `project.dataset.users` u
)
WHERE status = 'active'
)
SELECT day, COUNT(DISTINCT uuid) actives
FROM dates d JOIN active_users u
ON day BETWEEN first AND IFNULL(last, day)
GROUP BY day
-- ORDER BY day
with result
Row day actives
1 2018-05-04 1
2 2018-05-05 1
3 2018-05-06 1
4 2018-05-07 1
5 2018-05-08 2
6 2018-05-09 2
7 2018-05-10 3
8 2018-05-11 3
9 2018-05-12 2
I think this -- or something similar -- will do what you want:
SELECT day,
coalesce(running_actives, 0) - coalesce(running_inactives, 0)
FROM UNNEST(GENERATE_DATE_ARRAY(DATE('2015-05-11'), DATE('2018-06-29'), INTERVAL 1 DAY)
) AS day left join
(select date, sum(countif(status = 'active')) over (order by date) as running_actives,
sum(countif(status = 'active')) over (order by date) as running_inactives
from t
group by date
) a
on a.date = day
order by day;
The exact solution depends on whether the "inactive" is inclusive of the day (as above) or takes effect the next day. Either is handled the same way, by using cumulative sums of actives and inactives and then taking the difference.
In order to get data on all days, this generates the days using arrays and unnest(). If you have data on all days, that step may be unnecessary

Grouping Timestamps based on the interval between them

I have a table in Hive (SQL) with a bunch of timestamps that need to be grouped in order to create separate sessions based on the time difference between the timestamps.
Example:
Consider the following timestamps(Given in HH:MM for simplicity):
9.00
9.10
9.20
9.40
9.43
10.30
10.45
11.25
12.30
12.33
and so on..
So now, all timestamps that fall within 30 mins of the next timestamp come under the same session,
i.e. 9.00,9.10,9.20,9.40,9.43 form 1 session.
But since the difference between 9.43 and 10.30 is more than 30 mins, the time stamp 10.30 falls under a different session. Again, 10.30 and 10.45 fall under one session.
After we have created these sessions, we have to obtain the minimum timestamp for that session and the max timestamp.
I tried to subtract the current timestamp with its LEAD and place a flag if it is greater than 30 mins, but I'm having difficulty with this.
Any suggestion from you guys would be greatly appreciated. Please let me know if the question isn't clear enough.
Expected Output for this sample data:
Session_start Session_end
9.00 9.43
10.30 10.45
11.25 11.25 (same because the next time is not within 30 mins)
12.30 12.33
Hope this helps.
So it's not MySQL but Hive. I don't know Hive, but if it supports LAG, as you say, try this PostgreSQL query. You will probably have to change the time difference calculation, that's usually different from one dbms to another.
select min(thetime) as start_time, max(thetime) as end_time
from
(
select thetime, count(gap) over (rows between unbounded preceding and current row) as groupid
from
(
select thetime, case when thetime - lag(thetime) over (order by thetime) > interval '30 minutes' then 1 end as gap
from mytable
) times
) groups
group by groupid
order by min(thetime);
The query finds gaps, then uses a running total of gap counts to build group IDs, and the rest is aggregation.
SQL fiddle: http://www.sqlfiddle.com/#!17/8bc4a/6.
With MySQL lacking LAG and LEAD functions, getting the previous or next record is some work already. Here is how:
select
thetime,
(select max(thetime) from mytable afore where afore.thetime < mytable.thetime) as afore_time,
(select min(thetime) from mytable after where after.thetime > mytable.thetime) as after_time
from mytable;
Based on this we can build the whole query where we are looking for gaps (i.e. the time difference to the previous or next record is more than 30 minutes = 1800 seconds).
select
startrec.thetime as start_time,
(
select min(endrec.thetime)
from
(
select
thetime,
coalesce(time_to_sec(timediff((select min(thetime) from mytable after where after.thetime > mytable.thetime), thetime)), 1801) > 1800 as gap
from mytable
) endrec
where gap
and endrec.thetime >= startrec.thetime
) as end_time
from
(
select
thetime,
coalesce(time_to_sec(timediff(thetime, (select max(thetime) from mytable afore where afore.thetime < mytable.thetime))), 1801) > 1800 as gap
from mytable
) startrec
where gap;
SQL fiddle: http://www.sqlfiddle.com/#!2/d307b/20.
Try this..
SELECT MIN(session_time_tmp) session_start, MAX(session_time_tmp) session_end FROM
(
SELECT IF((TIME_TO_SEC(TIMEDIFF(your_time_field, COALESCE(#previousValue, your_time_field))) / 60) > 30 ,
#sessionCount := #sessionCount + 1, #sessionCount ) sessCount,
( #previousValue := your_time_field ) session_time_tmp FROM
(
SELECT your_time_field, #previousValue:= NULL, #sessionCount := 1 FROM yourtable ORDER BY your_time_field
) a
) b
GROUP BY sessCount
Just replace yourtable and your_time_field
Try this:
SELECT DATE_FORMAT(MIN(STR_TO_DATE(B.column1, '%H.%i')), '%H.%i') AS Session_start,
DATE_FORMAT(MAX(STR_TO_DATE(B.column1, '%H.%i')), '%H.%i') AS Session_end
FROM tableA A
LEFT JOIN ( SELECT A.column1, diff, IF(#diff:=diff < 30, #id, #id:=#id+1) AS rnk
FROM (SELECT B.column1, TIME_TO_SEC(TIMEDIFF(STR_TO_DATE(B.column1, '%H.%i'), STR_TO_DATE(A.column1, '%H.%i'))) / 60 AS diff
FROM tableA A
INNER JOIN tableA B ON STR_TO_DATE(A.column1, '%H.%i') < STR_TO_DATE(B.column1, '%H.%i')
GROUP BY STR_TO_DATE(A.column1, '%H.%i')
) AS A, (SELECT #diff:=0, #id:= 1) AS B
) AS B ON A.column1 = B.column1
GROUP BY IFNULL(B.rnk, 1);
Check the SQL FIDDLE DEMO
OUTPUT
| SESSION_START | SESSION_END |
|---------------|-------------|
| 9.00 | 9.43 |
| 10.30 | 10.45 |
| 11.25 | 11.25 |
| 12.30 | 12.33 |

Find date ranges between large gaps and ignore smaller gaps

I have a column of a mostly continous unique dates in ascending order. Although the dates are mostly continuos, there are some gaps in the dates of less than 3 days, others have more than 3 days.
I need to create a table where each record has a start date and an end date of the range that includes a gap of 3 days or less. But a new record has to be generated if the gap is longer than 3 days.
so if dates are:
1/2/2012
1/3/2012
1/4/2012
1/15/2012
1/16/2012
1/18/2012
1/19/2012
I need:
1/2/2012 1/4/2012
1/15/2012 1/19/2012
You can do something like this:
WITH CTE_Source AS
(
SELECT *, ROW_NUMBER() OVER (ORDER BY DT) RN
FROM dbo.Table1
)
,CTE_Recursion AS
(
SELECT *, 1 AS Grp
FROM CTE_Source
WHERE RN = 1
UNION ALL
SELECT src.*, CASE WHEN DATEADD(DD,3,rec.DT) < src.DT THEN rec.Grp + 1 ELSE Grp END AS Grp
FROM CTE_Source src
INNER JOIN CTE_Recursion rec ON src.RN = rec.RN +1
)
SELECT
MIN(DT) AS StartDT, MAX(DT) AS EndDT
FROM CTE_Recursion
GROUP BY Grp
First CTE is just to assign continuous numbers for all rows in order to join them later. Then using recursive CTE you can join on each next row assigning groups if date difference is larger than 3 days. In the end just group by grouping column and select desired results.
SQLFiddle DEMO