Grouping rows based on consecutive time values - sql

I'm trying to create a separate group with the rows that have consecutive time values within the same day.
For example, my current dataset is as follows:
Date StartTime EndTime StudentID Type Class Work Group *
2020-01-30 09:00:00 11:00:00 20789 A 178 56 1
2020-01-30 11:00:00 13:00:00 20789 A 789 67 1
2021-01-08 09:00:00 10:00:00 78945 D 195 13 2
2021-01-08 10:00:00 12:00:00 78945 D 789 12 2
2021-01-08 13:00:00 14:00:00 78945 D 398 13 3
2021-01-08 14:00:00 16:00:00 78945 D 543 13 3
If the rows have same Student ID and Type and the Start/End Time are consecutive within the same day,
I'd like to assign the same, unique Group ID number like the "Group" column in the data set. I tried to create the group column using Lag/Lead and Partition by but my current code is not working.
Could anyone please help me with this?
Thank you.

You may use window functions:
Table:
-- Sample data for the grouping query: one row per class session.
-- SELECT ... INTO creates the table Data from the inline VALUES list.
SELECT
    v.Date,
    v.StartTime,
    v.EndTime,
    v.StudentID,
    v.Type,
    v.Class,
    v.Work
INTO Data
FROM (
    VALUES
        (CAST('2020-01-30' AS date), CAST('09:00:00' AS time), CAST('11:00:00' AS time), 20789, 'A', 178, 56),
        (CAST('2020-01-30' AS date), CAST('11:00:00' AS time), CAST('13:00:00' AS time), 20789, 'A', 789, 67),
        (CAST('2021-01-08' AS date), CAST('09:00:00' AS time), CAST('10:00:00' AS time), 78945, 'D', 195, 13),
        (CAST('2021-01-08' AS date), CAST('10:00:00' AS time), CAST('12:00:00' AS time), 78945, 'D', 789, 12),
        (CAST('2021-01-08' AS date), CAST('13:00:00' AS time), CAST('14:00:00' AS time), 78945, 'D', 398, 13),
        (CAST('2021-01-08' AS date), CAST('14:00:00' AS time), CAST('16:00:00' AS time), 78945, 'D', 543, 13)
) AS v (Date, StartTime, EndTime, StudentID, Type, Class, Work)
Statement:
-- Assign one GroupID to every run of back-to-back sessions that share the
-- same day, student and type.
--
-- Inner query: Change = 0 when a row starts exactly where the previous row of
-- the same (Date, StudentID, Type) ended, otherwise 1 (a new run begins).
-- The first row of each partition has no predecessor, so LAG returns NULL,
-- the comparison is not true, and the row falls through to ELSE 1.
--
-- Outer query: a running total of Change numbers the runs. It must walk the
-- rows in the same order the inner LAG used (partition keys first, then
-- StartTime); otherwise rows of different students on the same date
-- interleave and receive each other's group numbers. ROWS UNBOUNDED PRECEDING
-- keeps tied rows from sharing one running total under the default RANGE frame.
SELECT
    Date, StartTime, EndTime, StudentID, Type, Class, Work,
    SUM(Change) OVER (
        ORDER BY Date, StudentID, Type, StartTime
        ROWS UNBOUNDED PRECEDING
    ) AS GroupID
FROM (
    SELECT
        Date, StartTime, EndTime, StudentID, Type, Class, Work,
        CASE
            WHEN StartTime = LAG(EndTime) OVER (
                     PARTITION BY Date, StudentID, Type
                     ORDER BY StartTime
                 ) THEN 0
            ELSE 1
        END AS Change
    FROM Data
) t
ORDER BY Date, StudentID, Type, StartTime
Result:
Date StartTime EndTime StudentID Type Class Work GroupID
2020-01-30 09:00:00.0000000 11:00:00.0000000 20789 A 178 56 1
2020-01-30 11:00:00.0000000 13:00:00.0000000 20789 A 789 67 1
2021-01-08 09:00:00.0000000 10:00:00.0000000 78945 D 195 13 2
2021-01-08 10:00:00.0000000 12:00:00.0000000 78945 D 789 12 2
2021-01-08 13:00:00.0000000 14:00:00.0000000 78945 D 398 13 3
2021-01-08 14:00:00.0000000 16:00:00.0000000 78945 D 543 13 3

Related

SQL: Split time interval into 1 hour with overlapping minutes split (Bigquery)

This is the data that I have:
date
event_type
interval_start
interval_end
duration_in_min
2022-06-06
s1
09:05:00
11:45:00
160
2022-06-01
s2
08:00:00
08:17:00
17
2022-05-31
c1
17:55:00
18:08:00
13
2022-04-05
s3
07:58:00
08:46:00
48
...
and this is what I would like to achieve:
interval represents a 1 hour interval (or maybe 59 min and 59 sec to be accurate, in case an event starts/ends at exactly 10:00:00 but it should not occur very often).
date
interval
event_type
interval_start
interval_end
duration_in_min
2022-06-06
09:00:00
s1
09:05:00
11:45:00
55
2022-06-06
10:00:00
s1
09:05:00
11:45:00
60
2022-06-06
11:00:00
s1
09:05:00
11:45:00
45
2022-06-01
08:00:00
s2
08:00:00
08:17:00
17
2022-05-31
17:00:00
c1
17:55:00
18:08:00
5
2022-05-31
18:00:00
c1
17:55:00
18:08:00
8
2022-04-05
07:00:00
s3
07:58:00
08:46:00
2
2022-04-05
08:00:00
s3
07:58:00
08:46:00
46
...
I am struggling to break the data down per hour, i.e. to split the minutes of an event that spans several hours into one row per hourly interval.
Any help would be greatly appreciated :)
Consider below approach
-- Expand each event into one row per clock hour it touches and compute how
-- many minutes of the event fall inside that hour.
-- The hour candidates 0..23 are cross-joined to every event; the filter keeps
-- only the hours between the event's start hour and end hour (inclusive).
-- An event that ends exactly on the hour still matches that hour and yields a
-- row with duration_in_min = 0.
-- NOTE(review): for an event ending in the 23:00 hour, time(hour + 1, 0, 0) is
-- evaluated as time(24, 0, 0) - confirm the engine accepts that value.
select
  date,
  time(hour, 0, 0) as `interval`,
  event_type,
  interval_start,
  interval_end,
  time_diff(
    least(time(hour + 1, 0, 0), interval_end),      -- end of the slice inside this hour
    greatest(time(hour, 0, 0), interval_start),     -- start of the slice inside this hour
    minute
  ) as duration_in_min
from your_table
cross join unnest(generate_array(0, 23)) as hour
where hour >= extract(hour from time(interval_start))
  and hour <= extract(hour from time(interval_end))
if applied to sample data in your question - output is

Get quarter start/end dates for more than a year (start year to current year)

I've been trying to get start and end dates range for each quarter given a specific date/year, like this:
-- Produce the start and end date of each of the four quarters of the year
-- beginning at year_date. The end date is the first day of the following
-- quarter (start + 3 months), not the last day of the quarter itself.
SELECT
    DATEADD(mm, (q.quarter - 1) * 3, s.year_date) AS StartDate,
    DATEADD(dd, 0, DATEADD(mm, q.quarter * 3, s.year_date)) AS EndDate
FROM (
    SELECT '2012-01-01' AS year_date
) AS s
CROSS JOIN (
    -- quarter numbers 1..4
    SELECT 1 AS quarter
    UNION ALL SELECT 2
    UNION ALL SELECT 3
    UNION ALL SELECT 4
) AS q
which produces the following output:
2012-01-01 00:00:00 2012-04-01 00:00:00
2012-04-01 00:00:00 2012-07-01 00:00:00
2012-07-01 00:00:00 2012-10-01 00:00:00
2012-10-01 00:00:00 2013-01-01 00:00:00
Problem: I need to do this for a given start_date and end_date, the problem being the end_date=current_day, so how can I achieve this:
2012-01-01 00:00:00 2012-04-01 00:00:00
2012-04-01 00:00:00 2012-07-01 00:00:00
2012-07-01 00:00:00 2012-10-01 00:00:00
2012-10-01 00:00:00 2013-01-01 00:00:00
... ...
2021-01-01 00:00:00 2021-01-06 00:00:00
I think here is what you want to do :
-- Walk forward one quarter at a time from startdatevar, capping the last
-- end date at the current date.
-- NOTE(review): this sketch mixes idioms from different engines
-- (SET ... AS DATEtime, GETDATE(), CURRENT_DATE(), WITH RECURSIVE) - confirm
-- the target dialect and adjust the variable syntax before running it.
SET startdatevar AS DATEtime = '2020-01-10'
;WITH RECURSIVE cte AS (
-- Anchor row: first range is [startdatevar, startdatevar + 1 quarter), counter = 1.
SELECT startdatevar AS startdate , DATEADD(QUARTER, 1 , startdatevar) enddate , 1 quarter
UNION ALL
-- Recursive step: the next range starts where the previous one ended; its end
-- is one quarter later, or "now" when that would lie in the future.
SELECT enddate , CASE WHEN DATEADD(QUARTER, 1 , enddate) > CURRENT_DATE() THEN GETDATE() ELSE DATEADD(QUARTER, 1 , enddate) END enddate, quarter + 1
FROM cte
WHERE
-- Stop once the previous range already reaches past today ...
cte.enddate <= CURRENT_DATE()
-- ... or once four rows have been produced.
and quarter < 4
)
SELECT * FROM cte
to use your code , if you want to have more than 4 quarters :
-- Template: <startdate> and <enddate> are placeholders that must be replaced
-- with real date expressions before this can run.
-- quarter_limit = number of quarter boundaries between the two dates; it
-- bounds how many recursive steps are taken.
SET quarter_limit = DATEDIFF(quarter , <startdate>,<enddate>)
;WITH RECURSIVE cte(q, qDate,enddate) as
(
-- Anchor row: counter 1, Jan 1 of the start year, end of the 3-month slice
-- that contains the start date.
select 1,
DATEFROMPARTS(year('2012-01-01'::date), 1, 1) -- First quarter date
,time_slice('2012-01-01'::date, 3, 'MONTH', 'END')
UNION ALL
-- Recursive step. In DATEADD(q, 1, qdate) the first argument is the date part
-- (q = quarter); it merely shares its spelling with the CTE column q.
select q+1,
DATEADD(q, 1, qdate) -- next quarter start date
-- NOTE(review): the slice length here is (q+1)*3 months, so it widens on
-- every step instead of staying at one quarter, and it is applied to the
-- previous row's qdate rather than the new start date - verify the end dates.
,time_slice(qdate::date, (q+1)*3, 'MONTH', 'END')
from cte
where q < quarter_limit -- limiting the number of next quarters
AND cte.endDate <= <enddate>
)
SELECT * FROM cte
After @eshirvana's answer, I came up with this slightly changed version:
-- Generates at most four quarter ranges for the year of the literal start
-- date, stopping early once a range ends after the current date.
WITH RECURSIVE cte(q, qDate,enddate) as
(
-- Anchor row: counter 1, Jan 1 of the start year, end of the 3-month slice
-- containing the start date.
select 1,
DATEFROMPARTS(year('2012-01-01'::date), 1, 1) -- First quarter date
,time_slice('2012-01-01'::date, 3, 'MONTH', 'END')
UNION ALL
-- Recursive step. In DATEADD(q, 1, qdate) the first argument is the date part
-- (q = quarter), not the CTE column of the same name.
select q+1,
DATEADD(q, 1, qdate) -- next quarter start date
-- NOTE(review): slice length is (q+1)*3 months (6, 9, 12, ...), i.e. it grows
-- with each step; the thread reports correct results only while q < 4.
,time_slice(qdate::date, (q+1)*3, 'MONTH', 'END')
from cte
where q <4 -- limiting the number of next quarters
AND cte.endDate <= CURRENT_DATE()
)
SELECT * FROM cte
Which works fine for whatever year I pass there (2012 will produce 4 records, 2021 just one, since we're still on the first quarter right now).
[EDIT]: it still doesn't work as expected after your second code suggestion:
-- Attempt to list every quarter from 2012-01-01 up to 2021-01-06.
-- This is the version whose output the thread reports as wrong: start dates
-- advance correctly, but end dates drift from the fifth row onward.
WITH RECURSIVE cte(q, qDate,enddate) as
(
-- Anchor row: counter 1, Jan 1 of the start year, and the end of the first
-- 3-month slice, capped at the current date.
select 1,
DATEFROMPARTS(year('2012-01-01'::date), 1, 1) -- First quarter date
,CASE WHEN time_slice('2012-01-01'::date, 3, 'MONTH', 'END') > CURRENT_DATE
THEN current_date
ELSE time_slice('2012-01-01'::date, 3, 'MONTH', 'END')
END
UNION ALL
-- Recursive step. DATEADD(q, ...) uses q as the quarter date part, not the
-- CTE column q.
select q+1,
DATEADD(q, 1, qdate) -- next quarter start date
-- The slice length (q+1)*3 grows with the counter (6, 9, 12, 15, ... months)
-- and is applied to the previous row's qdate, so the computed end is the end
-- of an ever wider window rather than of the next quarter. With a 15-month
-- slice this is consistent with the reported row "5 2013-01-01 2013-10-01".
-- No cap at the end date is applied here, unlike in the anchor row.
,time_slice(qdate::date, (q+1)*3, 'MONTH', 'END')
from cte
where q < DATEDIFF(quarter , '2012-01-01'::date,'2021-01-06'::date)
-- Recursion also stops as soon as a (possibly inflated) end date passes the
-- cap, which is why fewer rows than expected come back.
AND cte.endDate <= '2021-01-06'::date
)
SELECT * FROM cte
It is outputting this:
Sorry @eshirvana, it doesn't work as expected though. It all goes well up to a point, but it does not return all the records. Instead, it produces fewer records, and some of them are wrong, like this:
1 2012-01-01 2012-04-01
2 2012-04-01 2012-07-01
3 2012-07-01 2012-10-01
4 2012-10-01 2013-01-01
5 2013-01-01 2013-10-01
6 2013-04-01 2013-07-01
7 2013-07-01 2013-10-01
8 2013-10-01 2014-01-01
9 2014-01-01 2015-01-01
10 2014-04-01 2015-01-01
11 2014-07-01 2016-10-01
12 2014-10-01 2015-01-01
13 2015-01-01 2015-07-01
14 2015-04-01 2015-07-01
15 2015-07-01 2018-10-01
16 2015-10-01 2018-01-01
17 2016-01-01 2016-10-01
18 2016-04-01 2019-07-01
19 2016-07-01 2017-07-01
20 2016-10-01 2020-01-01
21 2017-01-01 2017-04-01
22 2017-04-01 2019-07-01
23 2017-07-01 2021-10-01
My logic is also still not right, because it should print only the Q1 dates for 2021. Could these output issues be related to the date format or something similar?
Now, it seems to be working, at least for 2012-01-01 till today (2021-01-06).
The code :
-- Lists consecutive quarter ranges from the start year (2012) up to the cap
-- date 2021-01-06. Each row is (counter, quarter start, quarter end), where
-- the end is the first day of the next quarter, or the cap date for the last,
-- partial quarter.
WITH RECURSIVE cte(q, qDate,enddate) as
(
select
-- it might not be the first quarter, so better to protect that:
-- NOTE(review): the counter starts at the quarter number of the literal date,
-- while qDate below is always Jan 1 of that year - for a start date outside
-- Q1 the two would disagree; confirm the intended behavior.
quarter('2012-01-01'::date)::numeric
, DATEFROMPARTS(year('2012-01-01'::date), 1, 1) -- First quarter date
-- End of the 3-month slice containing the start date, capped at the end date.
, CASE WHEN time_slice('2012-01-01'::date, 3, 'MONTH', 'END') > '2021-01-06'::date
THEN '2021-01-06'::date
ELSE time_slice('2012-01-01'::date, 3, 'MONTH', 'END')
END
UNION ALL
-- Recursive step. DATEADD(q, 1, qdate): the first argument is the date part
-- (q = quarter), not the CTE column q.
select q+1
, DATEADD(q, 1, qdate) -- next quarter start date
-- The slice is a fixed 3 months and is taken from the NEW start date, so the
-- end always belongs to the quarter being emitted; it is capped at the end date.
,CASE WHEN time_slice(DATEADD(q, 1, qdate), 3, 'MONTH', 'END')> '2021-01-06'::date
THEN '2021-01-06'::date
ELSE time_slice(DATEADD(q, 1, qdate), 3, 'MONTH', 'END')
END
from cte
-- The counter bound is what ends the recursion after the capped final row:
-- that row's enddate equals the cap and would still pass the enddate test.
where q <= DATEDIFF(quarter , '2012-01-01'::date,'2021-01-06'::date)
AND cte.endDate <= '2021-01-06'::date
)
SELECT * FROM cte
The output:
1 2012-01-01 2012-04-01
2 2012-04-01 2012-07-01
3 2012-07-01 2012-10-01
4 2012-10-01 2013-01-01
5 2013-01-01 2013-04-01
6 2013-04-01 2013-07-01
7 2013-07-01 2013-10-01
8 2013-10-01 2014-01-01
9 2014-01-01 2014-04-01
10 2014-04-01 2014-07-01
11 2014-07-01 2014-10-01
12 2014-10-01 2015-01-01
13 2015-01-01 2015-04-01
14 2015-04-01 2015-07-01
15 2015-07-01 2015-10-01
16 2015-10-01 2016-01-01
17 2016-01-01 2016-04-01
18 2016-04-01 2016-07-01
19 2016-07-01 2016-10-01
20 2016-10-01 2017-01-01
21 2017-01-01 2017-04-01
22 2017-04-01 2017-07-01
23 2017-07-01 2017-10-01
24 2017-10-01 2018-01-01
25 2018-01-01 2018-04-01
26 2018-04-01 2018-07-01
27 2018-07-01 2018-10-01
28 2018-10-01 2019-01-01
29 2019-01-01 2019-04-01
30 2019-04-01 2019-07-01
31 2019-07-01 2019-10-01
32 2019-10-01 2020-01-01
33 2020-01-01 2020-04-01
34 2020-04-01 2020-07-01
35 2020-07-01 2020-10-01
36 2020-10-01 2021-01-01
37 2021-01-01 2021-01-06
In case you're wondering: yes, the idea is to present the end_date as last_day of the month+one. But it could easily be adapted.
It's not pretty, but I think it's somehow easy to understand.

View Historical Attendance data for a day every 10 min

I have a table which contains Students Attendance, the schema is
StudentId ClassId EventType EventTime
1 1 I 2018-10-31 07:00:00 AM
2 1 I 2018-10-31 07:02:00 AM
1 1 O 2018-10-31 07:31:00 AM
3 1 I 2018-10-31 07:45:00 AM
OutPut
ClassId StudentCount StartTime EndTime
1 2 2018-10-31 07:00:00 AM 2018-10-31 07:10:00 AM
1 2 2018-10-31 07:10:01 AM 2018-10-31 07:20:00 AM
1 2 2018-10-31 07:20:01 AM 2018-10-31 07:30:00 AM
1 1 2018-10-31 07:30:01 AM 2018-10-31 07:40:00 AM
1 2 2018-10-31 07:40:01 AM 2018-10-31 07:50:00 AM
You need to generate the times. One way uses a recursive CTE. Then there are various ways to get the count.
-- Running head-count at every 10-minute mark between 07:00 and 08:00.
-- times: recursive CTE that generates the marks 07:00, 07:10, ... 07:50.
-- For each mark, every check-in ('I') up to that moment counts +1 and every
-- other event counts -1, giving the number of students still present.
with times as (
    select cast('2018-10-31 07:00:00' as datetime) as dt
    union all
    select dateadd(minute, 10, dt)
    from times
    where dateadd(minute, 10, dt) < '2018-10-31 08:00:00'
)
select
    t.dt,
    -- SUM over zero rows is NULL; report 0 when nothing has happened yet.
    coalesce(
        (select sum(case when a.EventType = 'I' then 1 else -1 end)
         from attendance a
         where a.EventTime <= t.dt),
        0
    ) as attendance
-- The alias t is required: the select list and the correlated subquery both
-- refer to t.dt.
from times t;

Redshift SQL: Get date difference based on start and end dates

My table has start_date and end_date from which I need to find the hour difference. The issue is that both of these date times are not on the same day.
user start_date end_date difference
Alex 7/25/2016 16:00 7/26/2016 0:30 8.5
Alex 7/24/2016 16:00 7/25/2016 0:30 8.5
Alex 7/21/2016 16:00 7/22/2016 0:30 8.5
Alex 7/20/2016 16:00 7/21/2016 0:30 8.5
Alex 7/19/2016 16:00 7/20/2016 0:30 8.5
Alex 7/18/2016 16:00 7/19/2016 0:30 8.5
Alex 7/17/2016 16:00 7/18/2016 0:30 8.5
Alex 7/14/2016 16:00 7/15/2016 0:30 8.5
Alex 7/13/2016 16:00 7/14/2016 0:30 8.5
Alex 7/12/2016 16:00 7/13/2016 0:30 8.5
Alex 7/11/2016 16:00 7/12/2016 0:30 8.5
Alex 7/10/2016 16:00 7/11/2016 0:30 8.5
Usually it is 5 working days and I get the answer if I group them by start_date. But I need a new date column, with the output as shown below. Please note that 7/15/2016 and 7/22/2016 were not present as start dates in the above table. I need the additional 0.5 hour and its date for the 6th day to be included in my derived table.
User Date difference
Alex 7/25/2016 8.5
Alex 7/24/2016 8.5
Alex 7/22/2016 0.5
Alex 7/21/2016 8.0
Alex 7/20/2016 8.5
Alex 7/19/2016 8.5
Alex 7/18/2016 8.5
Alex 7/17/2016 8.5
Alex 7/15/2016 0.5
Alex 7/14/2016 8.0
Alex 7/13/2016 8.5
Alex 7/12/2016 8.5
Alex 7/11/2016 8.5
Alex 7/10/2016 8.5
I calculate the difference as
round(cast(datediff(seconds, start_date, end_date) as decimal)/3600,2)
Whenever there is sophisticated logic, I'd suggest to use union queries and split the logic into a select query (or even table) each. Then you'd be able to calculate this in two steps. The main difference seems to be whether the 0.5 between 00:00:00 and 00:30:00 should be counted to the previous workday or whether it should stand alone. The latter seems to be determined based on whether the end_date is also a workday itself. I see two cases, giving three reporting rules in total:
Next day is a workday:
Report all hours on start_date
Next day is not a workday:
Report hours from start_date to midnight on start_date
Report hours from midnight to end_date on end_date
I used the following example data based on your description:
-- Example shifts used by the queries that follow: one row per shift, each
-- starting at 16:00 and ending at 00:30 on the next calendar day.
-- The name column is user_name because the follow-up queries read user_name
-- (and USER is a reserved word).
create temporary table _test (user_name varchar(20), start_date timestamp, end_date timestamp);
insert into _test (user_name, start_date, end_date) values
    ('Alex', '7/25/2016 16:00', '7/26/2016 0:30'),
    ('Alex', '7/24/2016 16:00', '7/25/2016 0:30'),
    ('Alex', '7/21/2016 16:00', '7/22/2016 0:30'),
    ('Alex', '7/20/2016 16:00', '7/21/2016 0:30'),
    ('Alex', '7/19/2016 16:00', '7/20/2016 0:30'),
    ('Alex', '7/18/2016 16:00', '7/19/2016 0:30'),
    ('Alex', '7/17/2016 16:00', '7/18/2016 0:30'),
    ('Alex', '7/14/2016 16:00', '7/15/2016 0:30'),
    ('Alex', '7/13/2016 16:00', '7/14/2016 0:30'),
    ('Alex', '7/12/2016 16:00', '7/13/2016 0:30'),
    ('Alex', '7/11/2016 16:00', '7/12/2016 0:30'),
    ('Alex', '7/10/2016 16:00', '7/11/2016 0:30');
We will need to know whether the next day is a workday, so I suggest using the lead() window function (see documentation) which will give you the start_date from the next row.
-- Helper table with one row per shift in _test: the calendar dates of the
-- shift plus the hour figures and day gap that the report query picks from.
CREATE TEMPORARY TABLE _differences AS (
    SELECT
        user_name,
        start_date::date AS start_date,
        end_date::date AS end_date,
        -- hours across the whole shift (start_date to end_date)
        ROUND(CAST(DATEDIFF(seconds, start_date, end_date) AS decimal) / 3600, 2) AS hours_start_to_end,
        -- hours from start_date to the following midnight
        ROUND(CAST(DATEDIFF(seconds, start_date, DATEADD(day, 1, start_date::date)) AS decimal) / 3600, 2) AS hours_start_to_midnight,
        -- hours from midnight on end_date to end_date
        ROUND(CAST(DATEDIFF(seconds, end_date::date, end_date) AS decimal) / 3600, 2) AS hours_midnight_to_end,
        -- days between this shift's end date and the same user's next start
        -- date; NULL for the user's last shift, which has no following row
        DATEDIFF(
            day,
            end_date::date,
            LEAD(start_date::date) OVER (PARTITION BY user_name ORDER BY start_date::date)
        ) AS days_until_next_workday
    FROM _test
);
Then the following query:
-- Report worked hours per user and reference date from _differences.
-- Rows whose days_until_next_workday is NULL match none of the three
-- branches and are therefore not reported.

-- Branch 1: the next shift starts on this shift's end date, so the whole
-- shift is booked on its start date.
SELECT
    user_name,
    start_date AS ref_date,
    hours_start_to_end AS difference
FROM _differences
WHERE days_until_next_workday = 0

UNION

-- Branch 2: no shift starts on the end date, so only the part before
-- midnight is booked on the start date ...
SELECT
    user_name,
    start_date AS ref_date,
    hours_start_to_midnight AS difference
FROM _differences
WHERE days_until_next_workday > 0

UNION

-- Branch 3: ... and the part after midnight is booked on the end date.
SELECT
    user_name,
    end_date AS ref_date,
    hours_midnight_to_end AS difference
FROM _differences
WHERE days_until_next_workday > 0

ORDER BY
    user_name,
    ref_date DESC
;
Would yield the following result:
user_name | ref_date | difference
-----------+------------+------------
Alex | 2016-07-24 | 8.50
Alex | 2016-07-22 | 0.50
Alex | 2016-07-21 | 8.00
Alex | 2016-07-20 | 8.50
Alex | 2016-07-19 | 8.50
Alex | 2016-07-18 | 8.50
Alex | 2016-07-17 | 8.50
Alex | 2016-07-15 | 0.50
Alex | 2016-07-14 | 8.00
Alex | 2016-07-13 | 8.50
Alex | 2016-07-12 | 8.50
Alex | 2016-07-11 | 8.50
Alex | 2016-07-10 | 8.50
(13 rows)
You can see that 7/25/2016 is missing because there is no start_date on or after 7/26/2016, so you'll need to figure out how to account for that special case.
here is how I have done the calc and it works perfectly
-- Split each id = 1 shift into the hours that fall on its start day and the
-- hours that fall on its end day, then total the hours per user and day.
-- NOTE(review): the name column is written as user_name throughout; the
-- original mixed "user" and "user_name" - confirm the real column name in table1.

-- Part 1: hours booked on the start day. For a shift that crosses midnight
-- the end is clamped to 23:59:59 of the start day; otherwise the real end
-- time is used.
select user_name, trunc(start_time) as date1,
       -- no "case when id = 1" needed here: the derived table already keeps
       -- only id = 1 rows and does not expose id
       sum(round(cast(datediff(seconds, start_time, st_t1) as decimal)/3600,2)) as SCHEDULE
from
(
    select user_name, start_time,
           case when trunc(start_time) <> trunc(end_time) then cast(to_char(start_time,'yyyy-mm-dd 23:59:59') as timestamp) else cast(to_char(end_time,'yyyy-mm-dd hh24:mi:ss') as timestamp) end as st_t1
    from table1 a
    where id = 1
) start_day
group by user_name, trunc(start_time)
union
-- Part 2: hours booked on the end day. For a shift that crosses midnight the
-- start is clamped to 00:00:00 of the end day; otherwise start and end
-- coincide and the row contributes 0 hours.
select user_name, trunc(end_time) as date1,
       sum(round(cast(datediff(seconds, st_t2, end_time) as decimal)/3600,2)) as SCHEDULE
from
(
    select user_name, end_time,
           case when trunc(start_time) <> trunc(end_time) then cast(to_char(end_time,'yyyy-mm-dd 00:00:00') as timestamp) else cast(to_char(end_time,'yyyy-mm-dd hh24:mi:ss') as timestamp) end as st_t2
    from table1 a
    where id = 1
) end_day
group by user_name, trunc(end_time)

Group by values from 4 a.m to 4 a.m SQL Server

I have data from 04/01/2012 00:00 to 05/01/2012 05:00
let say
StartDate Value
04/01/2012 00:00 10
04/01/2012 05:00 10
04/01/2012 08:00 10
05/01/2012 01:00 10
05/01/2012 04:00 10
05/01/2012 05:00 10
if I do a group by date
SUM(Value)...
GROUP BY YEAR(StartDate), MONTH(StartDate),DAY(StartDate)
This groups the data from 04/01/2012 00:00 to 05/01/2012 00:00 and sums the full-day value, i.e. 30,
but I need to group by a different time window,
i.e. 04/01/2012 04:00 to 05/01/2012 04:00, so that the result will be 40.
How can I achieve this? Has anyone done this before?
-- Sketch (the "..." stands for the rest of the query): shifting each
-- timestamp back 4 hours before truncating it to a date puts everything from
-- 04:00 up to the next day's 04:00 into the same group.
SUM(Value)
...
GROUP BY CAST(DATEADD(hour, -4, MyDateCol) AS Date)
-- Total value per 24-hour window that starts at 04:00.
-- Inner query: move each startDate back 4 hours and truncate it to midnight,
-- so all rows of one 04:00-to-04:00 window share the same bucket, then sum.
-- Outer query: move the bucket forward 4 hours again so the reported date is
-- the actual start of the window (04:00).
select
    dateadd(hour, 4, bucket.date) as date,
    bucket.total
from (
    select
        dateadd(day, datediff(day, 0, dateadd(hour, -4, startDate)), 0) as date,
        sum(value) as total
    from yourTable
    group by
        dateadd(day, datediff(day, 0, dateadd(hour, -4, startDate)), 0)
) bucket
Result:
date total
2012-01-03 04:00:00.000 10
2012-01-04 04:00:00.000 30
2012-01-05 04:00:00.000 20