Bigquery extracting sequences from timeseries data - sql

I have a timeseries in BQ, with additional data, and based on some of the data I want to extract sequences from the timeseries for further processing.
The following demonstrates the source table:
-- Sample time series: one row per observation of a vehicle, carrying the
-- mode and activity that were in effect at last_seen.
with dataset as (
  select *
  from unnest([
    -- the first element names the fields; the remaining tuples are positional
    struct(timestamp('2023-01-25 00:00:00') as last_seen, 1 as vehicle_id, 1 as mode, 0 as activity),
    (timestamp('2023-01-25 00:00:02'), 1, 1, 0),
    (timestamp('2023-01-25 00:00:04'), 1, 1, 0),
    (timestamp('2023-01-25 00:00:00'), 2, 1, 0),
    (timestamp('2023-01-25 00:00:02'), 2, 1, 0),
    (timestamp('2023-01-25 00:00:04'), 2, 1, 0),
    (timestamp('2023-01-25 00:00:06'), 1, 2, 1),
    (timestamp('2023-01-25 00:00:08'), 1, 2, 1),
    (timestamp('2023-01-25 00:00:10'), 1, 2, 1),
    (timestamp('2023-01-25 00:00:12'), 1, 1, 0),
    (timestamp('2023-01-25 00:00:14'), 1, 1, 0),
    (timestamp('2023-01-25 00:00:16'), 1, 1, 0),
    (timestamp('2023-01-25 00:00:12'), 2, 1, 1),
    (timestamp('2023-01-25 00:00:14'), 2, 1, 1),
    (timestamp('2023-01-25 00:00:17'), 2, 1, 1)
  ])
)
What I want is a result with one row per vehicle_id for every run during which mode and activity stay the same, including the start and end timestamps of that run. E.g. like this:
vehicle_id
mode
activity
start
end
1
1
0
2023-01-25 00:00:00
2023-01-25 00:00:04
1
2
1
2023-01-25 00:00:06
2023-01-25 00:00:10
1
1
0
2023-01-25 00:00:12
2023-01-25 00:00:16
2
1
0
2023-01-25 00:00:00
2023-01-25 00:00:04
2
1
1
2023-01-25 00:00:12
2023-01-25 00:00:17
I have tried:
-- Keep only the boundary rows of each run: rows whose mode or activity
-- differs from the previous or the next row of the same vehicle. When the
-- combined comparison is NULL the row is kept as well.
select *
from dataset
where true
qualify coalesce(
  lag(mode) over win != mode
    or lag(activity) over win != activity
    or lead(mode) over win != mode
    or lead(activity) over win != activity,
  true
)
window win as (partition by vehicle_id order by last_seen)
But that gives start and end on separate rows, so it feels like a dead end as it might cause issues if a sequence does not have an end.
Thanks

You might consider below.
-- Gaps-and-islands: collapse each unbroken run of identical (mode, activity)
-- per vehicle into one row with the run's first and last timestamp.
SELECT
  vehicle_id,
  ANY_VALUE(mode) AS mode,
  ANY_VALUE(activity) AS activity,
  MIN(last_seen) AS start,
  MAX(last_seen) AS `end`
FROM (
  SELECT
    *,
    -- running count of change points = ordinal of the run the row belongs to
    COUNTIF(flag) OVER (PARTITION BY vehicle_id ORDER BY last_seen) AS part
  FROM (
    SELECT
      *,
      -- TRUE when mode or activity differs from the vehicle's previous row;
      -- NULL on a vehicle's first row, which COUNTIF does not count
      mode != LAG(mode) OVER (PARTITION BY vehicle_id ORDER BY last_seen)
        OR activity != LAG(activity) OVER (PARTITION BY vehicle_id ORDER BY last_seen) AS flag
    FROM dataset
  )
)
GROUP BY vehicle_id, part;
Query results

Related

How to find the row with the highest value cell based on another column from within a group of values?

I have this table:
Site_ID
Volume
RPT_Date
RPT_Hour
1
10
01/01/2021
1
1
7
01/01/2021
2
1
13
01/01/2021
3
1
11
01/16/2021
1
1
3
01/16/2021
2
1
5
01/16/2021
3
2
9
01/01/2021
1
2
24
01/01/2021
2
2
16
01/01/2021
3
2
18
01/16/2021
1
2
7
01/16/2021
2
2
1
01/16/2021
3
I need to select the RPT_Hour with the highest Volume for each set of dates
Needed Output:
Site_ID
Volume
RPT_Date
RPT_Hour
1
13
01/01/2021
1
1
11
01/16/2021
1
2
24
01/01/2021
2
2
18
01/16/2021
1
-- Number the rows of every (site_id, rpt_date) group from highest to lowest
-- volume, then keep only the top row of each group.
WITH ranked AS (
    SELECT
        src.*,
        ROW_NUMBER() OVER (
            PARTITION BY src.site_id, src.rpt_date
            ORDER BY src.volume DESC
        ) AS rn
    FROM MyTable src
)
SELECT
    ranked.site_id,
    ranked.volume,
    ranked.rpt_date,
    ranked.rpt_hour
FROM ranked
WHERE ranked.rn = 1;
I cannot figure out how to group the table into like date groups. If I could do that, I think the rn = 1 will return the highest volume row for each date.
The way I see it, your query is OK (but rpt_hour in desired output is not).
SQL> with test (site_id, volume, rpt_date, rpt_hour) as
2 (select 1, 10, date '2021-01-01', 1 from dual union all
3 select 1, 7, date '2021-01-01', 2 from dual union all
4 select 1, 13, date '2021-01-01', 3 from dual union all
5 select 1, 11, date '2021-01-16', 1 from dual union all
6 select 1, 3, date '2021-01-16', 2 from dual union all
7 select 1, 5, date '2021-01-16', 3 from dual union all
8 -- site 2: volume 24 belongs to hour 2, as in the question's table
9 select 2, 9, date '2021-01-01', 1 from dual union all
10 select 2, 24, date '2021-01-01', 2 from dual union all
11 select 2, 16, date '2021-01-01', 3 from dual union all
12 select 2, 18, date '2021-01-16', 1 from dual union all
13 select 2, 7, date '2021-01-16', 2 from dual union all
14 select 2, 1, date '2021-01-16', 3 from dual
15 ),
16 temp as
17 (select t.*,
18 row_number() over (partition by site_id, rpt_date order by volume desc) rn
19 from test t
20 )
21 select site_id, volume, rpt_date, rpt_hour
22 from temp
23 where rn = 1
24 /
SITE_ID VOLUME RPT_DATE RPT_HOUR
---------- ---------- ---------- ----------
1 13 01/01/2021 3
1 11 01/16/2021 1
2 24 01/01/2021 2
2 18 01/16/2021 1
SQL>
One option is to use the MAX(..) KEEP (DENSE_RANK ..) OVER (PARTITION BY ..) analytic function, which needs no subquery, for example:
-- For every (site_id, rpt_date) return the highest volume and the hour it
-- was recorded in. The analytic functions yield the same value on every row
-- of a partition, so DISTINCT alone collapses each partition to one row; no
-- GROUP BY over all selected columns is needed.
-- If several hours tie for the highest volume, MAX(rpt_hour) reports the
-- latest of them.
SELECT DISTINCT
       site_id,
       MAX(volume) KEEP (DENSE_RANK FIRST ORDER BY volume DESC) OVER
         (PARTITION BY site_id, rpt_date) AS volume,
       rpt_date,
       MAX(rpt_hour) KEEP (DENSE_RANK FIRST ORDER BY volume DESC) OVER
         (PARTITION BY site_id, rpt_date) AS rpt_hour
  FROM t
 ORDER BY site_id, rpt_date
Demo

Valid_from Valid_to from a full loaded table

There is a source table that receives a full load of the data every month. The table looks like the example below.
Source table:
pk
code_paym
code_terms
etl_id
1
2
3
2020-08-01
1
2
3
2020-09-01
1
2
4
2020-10-01
1
2
4
2020-11-01
1
2
4
2020-12-01
1
2
4
2021-01-01
1
2
3
2021-02-01
1
2
3
2021-03-01
1
2
3
2021-04-01
1
2
3
2021-05-01
I would like to create valid_from valid_to columns from the source table like below example.
Desired Output:
pk
code_paym
code_terms
valid_from
valid_to
1
2
3
2020-08-01
2020-09-01
1
2
4
2020-10-01
2021-01-01
1
2
3
2021-02-01
2021-05-01
As can be seen, attributes can revert to values they had earlier.
How can I make this output happen by sql code?
Thank you very much,
Regards
Using CONDITIONAL_TRUE_EVENT windowed function to determine continuous subgroups:
-- Sample data: monthly full loads for one key; etl_id is the load date.
-- The date strings are converted by the DATE column definition.
CREATE OR REPLACE TABLE t (pk INT, code_paym INT, code_terms INT, etl_id DATE)
AS
SELECT column1, column2, column3, column4
FROM VALUES
    (1, 2, 3, '2020-08-01'),
    (1, 2, 3, '2020-09-01'),
    (1, 2, 4, '2020-10-01'),
    (1, 2, 4, '2020-11-01'),
    (1, 2, 4, '2020-12-01'),
    (1, 2, 4, '2021-01-01'),
    (1, 2, 3, '2021-02-01'),
    (1, 2, 3, '2021-03-01'),
    (1, 2, 3, '2021-04-01'),
    (1, 2, 3, '2021-05-01');
Query:
-- Build validity intervals from the monthly full loads.
-- grp is a running counter per pk that increases every time code_paym or
-- code_terms differs from the previous load, so each unbroken run of
-- identical attributes gets its own grp even when the same values return
-- later. LAG defaults to the current value, so the first load of a pk never
-- counts as a change.
-- NOTE(review): a change to or from NULL compares as NULL with != and is not
-- counted; switch to IS DISTINCT FROM if these columns are nullable.
WITH cte AS (
    SELECT t.*,
           CONDITIONAL_TRUE_EVENT(
               code_paym != LAG(code_paym, 1, code_paym)
                                OVER (PARTITION BY pk ORDER BY etl_id)
               OR code_terms != LAG(code_terms, 1, code_terms)
                                OVER (PARTITION BY pk ORDER BY etl_id)
           ) OVER (PARTITION BY pk ORDER BY etl_id) AS grp
    FROM t
)
-- One row per run: the attribute values plus the first and last load date.
SELECT pk,
       code_paym,
       code_terms,
       MIN(etl_id) AS valid_from,
       MAX(etl_id) AS valid_to
FROM cte
GROUP BY pk, code_paym, code_terms, grp
ORDER BY pk, valid_from;
Output:

SQL: Create multiple rows for a record based on months between two dates

My table has records as below for different Id's and different start and end dates
ID, Startdate, Enddate
1, 2017-02-14, 2018-11-05
I want to write a SQL query, without using a date dimension table, that gives the output below: basically one record for each month between the start and end date.
1, 2017, 02
1, 2017, 03
1, 2017, 04
1, 2017, 05
1, 2017, 06
1, 2017, 07
1, 2017, 08
1, 2017, 09
1, 2017, 10
1, 2017, 11
1, 2017, 12
1, 2018, 01
1, 2018, 02
1, 2018, 03
1, 2018, 04
1, 2018, 05
1, 2018, 06
1, 2018, 07
1, 2018, 08
1, 2018, 09
1, 2018, 10
1, 2018, 11
Please use below query example:
-- Range to expand. User variables are written @name in MySQL; a leading #
-- would start a comment. The end is pushed to the last day of its month so
-- that the final month is always reached by the recursion.
set @start_date = '2017-02-14';
set @end_date = LAST_DAY('2018-11-05');
-- Each row holds one (month, year) of the range plus the date one month
-- later, which drives the next recursion step.
WITH RECURSIVE date_range AS
(
    select MONTH(@start_date) as month_,
           YEAR(@start_date) as year_,
           DATE_ADD(@start_date, INTERVAL 1 MONTH) as next_month_date
    UNION ALL
    SELECT MONTH(dr.next_month_date) as month_,
           YEAR(dr.next_month_date) as year_,
           DATE_ADD(dr.next_month_date, INTERVAL 1 MONTH) as next_month_date
    FROM date_range dr
    where dr.next_month_date <= @end_date
)
select month_, year_
from date_range
order by year_, month_;
This is what I did and it worked like a charm:
-- sample data
WITH table_data
AS (
    SELECT 1 AS id
        ,cast('2017-08-14' AS DATE) AS start_dt
        ,cast('2018-12-16' AS DATE) AS end_dt
    UNION ALL
    SELECT 2 AS id
        ,cast('2017-09-14' AS DATE) AS start_dt
        ,cast('2019-01-16' AS DATE) AS end_dt
    )
-- find minimum date from the data
,starting_date (start_date)
AS (
    SELECT min(start_dt)
    FROM table_data
    )
-- month-end date of every month from the earliest start date up to one
-- month past the span between the earliest start and the latest end
-- NOTE(review): _v_vector_idx is presumably a system view exposing
-- consecutive integers in idx -- confirm on the target system
,all_dates
AS (
    SELECT last_day(add_months(date_trunc('month', start_date), idx * 1)) month_date
    FROM starting_date
    CROSS JOIN _v_vector_idx
    WHERE month_date <= add_months(start_date, abs(months_between((
        SELECT min(start_dt) FROM table_data), (SELECT max(end_dt) FROM table_data))) + 1)
    )
-- one row per id and month covered by [start_dt, end_dt]
SELECT id
    ,extract(year FROM month_date) AS year_num
    ,extract(month FROM month_date) AS month_num
    ,td.start_dt
    ,td.end_dt
FROM table_data td
INNER JOIN all_dates ad
    -- >= rather than >: month_date is a month-end, so a start_dt that falls
    -- on the last day of its month must still match that month
    ON ad.month_date >= td.start_dt
    AND ad.month_date <= last_day(td.end_dt)
ORDER BY id
    ,year_num
    ,month_num
You have to generate the dates and then pick the year and month from them:
-- List every (year, month) touched by the range 2017-02-14 .. 2018-11-05.
-- Five digit tables (0-9) are cross joined to produce day offsets
-- 0 .. 99999 from the start date. Both bounds are inclusive, so a range that
-- ends on the first day of a month still reports that month.
select distinct
    year(days.day_date) as year_num,
    month(days.day_date) as month_num
from (
    select
        date_add(date '2017-02-14',
                 interval n5.num * 10000 + n4.num * 1000 + n3.num * 100 + n2.num * 10 + n1.num day) as day_date
    from
        (select 0 as num
         union all select 1
         union all select 2
         union all select 3
         union all select 4
         union all select 5
         union all select 6
         union all select 7
         union all select 8
         union all select 9) as n1
    cross join
        (select 0 as num
         union all select 1
         union all select 2
         union all select 3
         union all select 4
         union all select 5
         union all select 6
         union all select 7
         union all select 8
         union all select 9) as n2
    cross join
        (select 0 as num
         union all select 1
         union all select 2
         union all select 3
         union all select 4
         union all select 5
         union all select 6
         union all select 7
         union all select 8
         union all select 9) as n3
    cross join
        (select 0 as num
         union all select 1
         union all select 2
         union all select 3
         union all select 4
         union all select 5
         union all select 6
         union all select 7
         union all select 8
         union all select 9) as n4
    cross join
        (select 0 as num
         union all select 1
         union all select 2
         union all select 3
         union all select 4
         union all select 5
         union all select 6
         union all select 7
         union all select 8
         union all select 9) as n5
) as days
where days.day_date >= date '2017-02-14'
  and days.day_date <= date '2018-11-05'
order by year_num, month_num

Oracle aggregate specific groups based on sequence

I have a table T:
Entity type starttime sequence duration
1 A 2017010101 1 12
1 A 2017010102 2 11
1 A 2017010103 3 3
1 A 2017010104 4 1
1 A 2017010105 1 19
1 A 2017010106 2 18
2 A 2017010101 1 18
2 A 2017010102 1 100
3 A 2017010101 1 120
I need to aggregate the data so that each run of sequence has a total duration and the first starttime:
Entity type starttime sequence duration
1 A 2017010101 1 27
1 A 2017010105 1 37
2 A 2017010101 1 18
2 A 2017010102 1 100
3 A 2017010101 1 120
I believe this is a gaps-and-islands problem, but I can't quite figure it out...
I have tried to use a lead() over (partition by entity order by sequence) but this keeps grabbing the next run of sequence.
If sequence has no gaps then you can use row_number() and subtract sequence to create temporary column grp used next for aggregation:
-- Within an entity, the row position by starttime minus the sequence value
-- stays constant while sequence keeps counting up by one, so it identifies
-- each run.
with numbered as (
    select src.*,
           row_number() over (partition by src.entity order by src.starttime)
             - src.sequence as grp
      from t src
)
-- one row per run: first starttime, first sequence value, total duration
select entity,
       type,
       min(starttime) as starttime,
       min(sequence) as sequence,
       sum(duration) as duration
  from numbered
 group by entity, type, grp
 order by entity, grp
Test:
-- inline copy of the question's sample rows
with t(entity, type, starttime, sequence, duration) as (
    select 1, 'A', 2017010101, 1, 12 from dual union all
    select 1, 'A', 2017010102, 2, 11 from dual union all
    select 1, 'A', 2017010103, 3, 3 from dual union all
    select 1, 'A', 2017010104, 4, 1 from dual union all
    select 1, 'A', 2017010105, 1, 19 from dual union all
    select 1, 'A', 2017010106, 2, 18 from dual union all
    select 2, 'A', 2017010101, 1, 18 from dual union all
    select 2, 'A', 2017010102, 1, 100 from dual union all
    select 3, 'A', 2017010101, 1, 120 from dual
),
-- row position by starttime minus sequence is constant within a run
numbered as (
    select src.*,
           row_number() over (partition by src.entity order by src.starttime)
             - src.sequence as grp
      from t src
)
-- one row per run: first starttime, first sequence value, total duration
select entity,
       type,
       min(starttime) as starttime,
       min(sequence) as sequence,
       sum(duration) as duration
  from numbered
 group by entity, type, grp
 order by entity, grp
ENTITY TYPE STARTTIME SEQUENCE DURATION
---------- ---- ---------- ---------- ----------
1 A 2017010101 1 27
1 A 2017010105 1 37
2 A 2017010101 1 18
2 A 2017010102 1 100
3 A 2017010101 1 120
You don't need row_number() for this. You can just subtract the sequence from the starttime -- assuming starttime is a date. The difference is constant for each group of sequential values:
-- starttime - sequence is constant within a run of consecutive sequence
-- values, so it can serve directly as the grouping key.
-- This query defines no grp column, so the runs are ordered by their first
-- starttime instead.
select entity, type, min(starttime) as starttime,
       min(sequence) as sequence, sum(duration) as duration
from t
group by entity, type, (starttime - sequence)
order by entity, min(starttime);
If starttime is a string then you need row_number() as Ponder suggests. If starttime is a number, then this works within a single month, but you probably want row_number().

Flag the records for the next 3 days using analytic/join functions Oracle

For each id, I want to flag records for the next 3 days. For any record not within 3 days, it starts with 1 again.
I don't want to use loops as it might slow down the performance.
My table is as below
Id Date Flag
-------------------------------------------
1 Jan 1st 1 (starting record for id 1. Any record with Jan 2nd - Jan 4th will be set to 0)
1 Jan 3rd 0 (From Jan 1st, it is within 3 days)
1 Jan 5th 1 (From Jan 1st, it is NOT within 3 days. So flag as 1.
Any record with Jan 6th - Jan 8th will be set to 0)
1 Jan 6th 0 (From Jan 5th, it is within 3 days)
2 Jan 15th 1 (Starting record for id 2)
2 Jan 17th 0 (From Jan 15th, it is within 3 days)
2 Jan 19th 1 (From Jan 15th, it is NOT within 3 days. So flag as 1)
EDIT: this answer is not correct
Test Case 1:
-- Test case 1: the rows from the question; "test" holds the expected flag.
with src as (
select 1 as id, date '2014-01-01' as d, 1 as test from dual
union all select 1, date '2014-01-03', 0 from dual
union all select 1, date '2014-01-05', 1 from dual
union all select 1, date '2014-01-06', 0 from dual
union all select 2, date '2014-01-15', 1 from dual
union all select 2, date '2014-01-17', 0 from dual
union all select 2, date '2014-01-19', 1 from dual)
-- d1 = earliest date of the same id that lies within the 3 days up to and
-- including the current row's date (a window that slides with each row).
,q as (
select src.*
,first_value(d)
over (partition by id
order by d
range numtodsinterval(3, 'day') preceding
) as d1
from src)
-- flag = 0 when d1 equals the previous row's d1 for the same id, else 1.
-- The first row of an id has no previous row, so it falls into the else
-- branch and gets 1.
select q.id, to_char(q.d,'DD/MM/YYYY') as d
,case when q.d1 =
lag(q.d1)
over (partition by id order by d)
then 0
else 1
end as flag
,test
from q
order by id, d;
Result (test passed):
id d flag test
== ========== ==== ====
1 01/01/2014 1 1
1 03/01/2014 0 0
1 05/01/2014 1 1
1 06/01/2014 0 0
2 15/01/2014 1 1
2 17/01/2014 0 0
2 19/01/2014 1 1
Test Case 2:
-- Test case 2: a longer series for one id; "test" holds the expected flag.
with src as (
select 1 as id, date '2014-01-12' as d, 1 as test from dual
union all select 1, date '2014-01-13', 0 from dual
union all select 1, date '2014-01-15', 0 from dual
union all select 1, date '2014-01-18', 1 from dual
union all select 1, date '2014-01-21', 0 from dual
union all select 1, date '2014-02-02', 1 from dual
union all select 1, date '2014-02-03', 0 from dual
union all select 1, date '2014-02-09', 1 from dual
union all select 1, date '2014-02-10', 0 from dual
union all select 1, date '2014-02-11', 0 from dual
union all select 1, date '2014-02-18', 1 from dual
union all select 1, date '2014-02-21', 0 from dual)
-- d1 = earliest date of the same id that lies within the 3 days up to and
-- including the current row's date. The window slides with every row; it is
-- not anchored at the last row that was flagged 1.
,q as (
select src.*
,first_value(d)
over (partition by id
order by d
range numtodsinterval(3, 'day') preceding
) as d1
from src)
-- flag = 0 when d1 equals the previous row's d1 for the same id, else 1.
-- Why 21/01 comes out wrong: its window (18/01..21/01) gives d1 = 18/01,
-- while the previous row 18/01 has window 15/01..18/01 and d1 = 15/01. The
-- two d1 values differ, so the flag is 1 although 21/01 is within 3 days of
-- the flagged row 18/01 and 0 is expected.
select q.id, to_char(q.d,'DD/MM/YYYY') as d
,case when q.d1 =
lag(q.d1)
over (partition by id order by d)
then 0
else 1
end as flag
,test
from q
order by id, d;
Result (test failed):
id d flag test
== ========== ==== ====
1 12/01/2014 1 1
1 13/01/2014 0 0
1 15/01/2014 0 0
1 18/01/2014 1 1
1 21/01/2014 1 0 <--- test failed
1 02/02/2014 1 1
1 03/02/2014 0 0
1 09/02/2014 1 1
1 10/02/2014 0 0
1 11/02/2014 0 0
1 18/02/2014 1 1
1 21/02/2014 0 0