How to use DATEDIFF with a Window Function in SQL - sql

When I run the following query:
select
post_visid_high || ':' || post_visid_low as visitor_id
, datediff(minute, lag(date_time), date_time) over (partition by visitor_id order by date_time asc)
from adobe_data
I get
Invalid function type [DATEDIFF] for window function.
Invalid function type [TIMEDIFF] for window function.
I can rewrite the query as
select
post_visid_high || ':' || post_visid_low as visitor_id
, lag(date_time) over (partition by visitor_id order by date_time asc) as previous_date
, datediff(minute, previous_date, date_time) as difference_in_minutes
from adobe_data
But I am wondering if there is a better way to do this?

The issue is placement of ():
select
post_visid_high || ':' || post_visid_low as visitor_id
, datediff(minute, lag(date_time), date_time) over (partition by visitor_id
order by date_time asc)
from adobe_data
=>
select
post_visid_high || ':' || post_visid_low as visitor_id
, datediff(minute, lag(date_time) over (partition by visitor_id
order by date_time asc), date_time)
from adobe_data

You've put the window outside of the datediff, but it should be outside of the lag.
datediff(minute, lag(date_time), date_time) over (partition by visitor_id order by date_time asc)
Becomes...
datediff(minute, lag(date_time) over (partition by visitor_id order by date_time asc), date_time)
Also, long narrow code is easier to read and validate than short wide code...
datediff(
minute,
lag(date_time) over (partition by visitor_id order by date_time asc),
date_time
)
Or even...
datediff(
minute,
lag(date_time) over (
partition by visitor_id
order by date_time asc
),
date_time
)
It's also more friendly to diff tools, such as used by git.

Related

Mariadb: Use result of window function LAG in WHERE clause

I am using the following query to get the difference between two timestamps:
SELECT tracker_id,
TIMESTAMP,
LAG(TIMESTAMP) OVER(ORDER BY TIMESTAMP DESC),
TIMESTAMPDIFF(MINUTE,
TIMESTAMP,
LAG(TIMESTAMP) OVER(ORDER BY TIMESTAMP DESC)) AS diff_in_minutes
FROM comm_telemetry
WHERE comm_telemetry.tracker_id = "123456789"
ORDER BY comm_telemetry.timestamp DESC;
I want to filter the result to only show rows where diff_in_minutes > 0. The problem is that window functions are not allowed in WHERE clauses.
Any suggestions on how to solve this?
You will need to first compute the lag in a subquery and then query that again to use it to filter.
-- Window functions cannot be referenced in WHERE, so the minute gap is
-- computed inside a CTE and filtered by the outer query.
WITH gaps AS (
    SELECT
        tracker_id,
        TIMESTAMP,
        TIMESTAMPDIFF(
            MINUTE,
            TIMESTAMP,
            LAG(TIMESTAMP) OVER (ORDER BY TIMESTAMP DESC)
        ) AS diff_in_minutes
    FROM comm_telemetry
    WHERE tracker_id = '123456789'
)
SELECT
    tracker_id,
    TIMESTAMP,
    diff_in_minutes
FROM gaps
WHERE diff_in_minutes > 0
ORDER BY TIMESTAMP DESC;
found a solution meanwhile:
WITH tbl_diff_in_minutes AS (SELECT
tracker_id,
`timestamp` as ts,
LAG( `timestamp` ) OVER ( ORDER BY `timestamp` DESC ) prev_ts,
TIMESTAMPDIFF(
MINUTE,
`timestamp`,
LAG( `timestamp` ) OVER ( ORDER BY `timestamp` DESC )) AS diff_in_minutes
FROM
comm_telemetry
WHERE
comm_telemetry.tracker_id = "123456789"
ORDER BY
comm_telemetry.`timestamp` DESC)
SELECT tracker_id, ts, prev_ts, diff_in_minutes FROM tbl_diff_in_minutes WHERE diff_in_minutes > 0;

Optimizing SQL query - finding a group with in a group

I have a working query and looking for ideas to optimize it.
Query explanation: Within each ID group (visitor_id), look for row where c_id != 0. From that row, show all consecutive rows within that ID group.
select t2.*
from (select *, row_number() OVER (PARTITION BY visitor_id ORDER BY date) as row_number
from "DB"."schema"."table"
where visitor_id in
(select distinct visitor_id
from (select * from "DB"."schema"."table" where date >= '2021-08-01' and date <= '2021-08-30')
where c_id in ('101')
)
) as t2
inner join
(select visitor_id, min(rn) as row_number
from
(select *, row_number() OVER (PARTITION BY visitor_id ORDER BY date) as rn
from "DB"."schema"."table"
where visitor_id in
(select distinct visitor_id
from (select * from "DB"."schema"."table" where date >= '2021-08-01' and date <= '2021-08-30')
where c_id in ('101')
)
) as filtered_table
where c_id != 0
group by visitor_id) as t1
on t2.visitor_id = t1.visitor_id
and t2.row_number >= t1.row_number
so you have a common sub expression
select distinct visitor_id
from (select * from "DB"."schema"."table" where date >= '2021-08-01' and date <= '2021-08-30')
where c_id in ('101')
so that can be moved to a CTE and run just once. like
WITH distinct_visitors AS (
SELECT DISTINCT visitor_id
FROM (SELECT * FROM "DB"."schema"."table" WHERE date >= '2021-08-01' and date <= '2021-08-30')
where c_id in ('101')
)
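but the sub clause filter is equally valid as a top level filter, and given it's an inclusive range filter, BETWEEN expresses it more concisely (it is equivalent to the >= / <= pair, so do not expect a performance difference from this change alone).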
WITH distinct_visitors AS (
SELECT DISTINCT visitor_id
FROM "DB"."schema"."table"
WHERE date BETWEEN '2021-08-01' AND'2021-08-30'
AND c_id IN ('101')
)
then both uses of that CTE do the same ROW_NUMBER operation so that can be a CTE
and simplified as such
-- Number every row per visitor (ordered by date), restricted to visitors
-- that have a c_id '101' row inside the 2021-08-01..2021-08-30 window.
WITH rw_rows AS (
    SELECT *,
        ROW_NUMBER() OVER (PARTITION BY visitor_id ORDER BY date) AS row_number
    FROM "DB"."schema"."table"
    WHERE visitor_id IN (
        SELECT DISTINCT visitor_id
        FROM "DB"."schema"."table"
        WHERE date BETWEEN '2021-08-01' AND '2021-08-30'
            AND c_id IN ('101')
    )
)
SELECT t2.*
FROM rw_rows AS t2
INNER JOIN (
    -- Position of the first row per visitor whose c_id is non-zero.
    -- The CTE exposes the numbering as row_number; there is no rn column,
    -- so the aggregate must read row_number.
    SELECT visitor_id,
        MIN(row_number) AS row_number
    FROM rw_rows AS filtered_table
    WHERE c_id != 0
    GROUP BY visitor_id
) AS t1
    ON t2.visitor_id = t1.visitor_id
    -- keep that first non-zero row and everything after it
    AND t2.row_number >= t1.row_number
So we want to keep all rows from the first non-zero c_id onwards, which a QUALIFY should be able to solve like:
WITH rw_rows AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY visitor_id ORDER BY date) AS row_number
FROM "DB"."schema"."table"
WHERE visitor_id IN (
SELECT DISTINCT visitor_id
FROM "DB"."schema"."table"
WHERE date BETWEEN '2021-08-01' AND '2021-08-30'
AND c_id in ('101')
)
)
SELECT t2.*,
MIN(IFF(c_id != 0, row_number, NULL )) OVER (PARTITION BY visitor_id) as min_rn
FROM rw_rows AS t2
QUALIFY t2.row_number >= min_rn
which, without having run it, feels like the MIN should also be able to be moved into the QUALIFY like:
WITH rw_rows AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY visitor_id ORDER BY date) AS row_number
FROM "DB"."schema"."table"
WHERE visitor_id IN (
SELECT DISTINCT visitor_id
FROM "DB"."schema"."table"
WHERE date BETWEEN '2021-08-01' AND '2021-08-30'
AND c_id in ('101')
)
)
SELECT t2.*
FROM rw_rows AS t2
QUALIFY t2.row_number >= MIN(IFF(c_id != 0, row_number, NULL )) OVER (PARTITION BY visitor_id)
At which point the CTE is not needed, as it's just used once, so could be moved back in, or not as they are the same.

Is there any function that can subtract a bigint from a date in Athena?

select user_id,first_event_date,(date(max(event_date)) - rnk) date_on
from
(select event_date,user_id,first_event_date,row_number() over(partition by user_id order by first_event_date desc) rnk
from retention_user_selected)
GROUP BY user_id, first_event_date
enter image description here
This raises the error: '-' cannot be applied to date, bigint
Try using intervals:
-- Athena (Presto/Trino) interval literals put the unit outside the quotes: interval '1' day
date(max(event_date)) - rnk * interval '1' day

Get the no of consecutive days a Field value is Stale

I want to get the number of consecutive records for which a certain field value is stale, based on the rates table.
In the data below, records 3, 4 and 5 have the same rate, 0.770827, so the rate has been stale for 3 days, and the prior rate before it went stale is 0.770886. I would like help writing a query that returns the number of records with a stale rate and also the prior rate. In the sample below I am only showing CAD to USD, but we need the same across different currencies.
Any assistance would be highly helpful.
Expected Output
When value changes mark row with 1, otherwise 0. Then sum this column (flg), you have now consecutive groups (grp). Use grp to aggregate, count, show min and max dates:
dbfiddle demo
select to_cur, from_cur, min(dt) dt_from, max(dt) dt_to, rate, count(1) cnt
from (
select dt, to_cur, from_cur, rate,
sum(flg) over (partition by to_cur, from_cur order by dt) grp
from (
select dt, to_cur, from_cur, rate,
case lag(rate) over (partition by to_cur, from_cur order by dt)
when rate then 0 else 1 end flg
from t))
group by grp, to_cur, from_cur, rate
order by from_cur, to_cur, min(dt)
If you want any specific group after group by add:
having count(1) >= 3
This is a gaps and islands problem.
You can use lag() to retrieve the previous rate for the same currencies tuple, and then do a window sum to define groups of consecutive records with the same rate. Then, you can aggregate the groups, and recover the previous rate using lag() again. The last step is to filter on groups that have at least 3 records.
select *
from (
    -- One row per island of consecutive equal rates. lag() over the
    -- aggregated rows recovers the rate of the island just before it.
    select
        from_cur,
        to_cur,
        rate,
        max(date) max_date,
        lag(rate) over(partition by from_cur, to_cur order by max(date)) lag_rate_grp,
        count(*) cnt
    from (
        select
            t.*,
            -- Running count of rate changes acts as the island id. It must be
            -- partitioned by the currency pair, like every other window here.
            sum(case when rate = lag_rate then 0 else 1 end) over(partition by from_cur, to_cur order by date) grp
        from (
            select
                t.*,
                -- previous rate for the same currency pair (NULL on the first row)
                lag(rate) over(partition by from_cur, to_cur order by date) lag_rate
            from mytable t
        ) t
    ) t
    group by from_cur, to_cur, rate, grp
) t
-- only islands where the rate stayed unchanged for at least 3 records
where cnt >= 3
order by from_cur, to_cur, max_date
Actually, using the difference between row numbers can save one level of nesting:
select *
from (
select
from_cur,
to_cur,
rate,
max(date) max_date,
lag(rate) over(partition by from_cur, to_cur order by max(date)) lag_rate_grp,
count(*) cnt
from (
select
t.*,
row_number() over(partition by from_cur, to_cur order by date) rn1,
row_number() over(partition by from_cur, to_cur, rate order by date) rn2
from mytable t
) t
group by from_cur, to_cur, rate, rn1 - rn2
) t
where cnt >= 3
order by from_cur, to_cur, max_date
If you want only the earliest record per currency tuple, then you can use row_number():
select *
from (
select
from_cur,
to_cur,
rate,
max(date) max_date,
lag(rate) over(partition by from_cur, to_cur order by max(date)) lag_rate_grp,
count(*) cnt,
row_number() over(partition by from_cur, to_cur, case when count(*) >= 3 then 0 else 1 end order by max(date)) rn
from (
select
t.*,
row_number() over(partition by from_cur, to_cur order by date) rn1,
row_number() over(partition by from_cur, to_cur, rate order by date) rn2
from mytable t
) t
group by from_cur, to_cur, rate, rn1 - rn2
) t
where cnt >= 3 and rn = 1
order by from_cur, to_cur
This is a gap-and-islands problem, but I would solve it just by subtracting a sequence from the date. And then aggregating:
-- Rows of one currency pair and rate on consecutive days share the same
-- (date - seqnum days) value, so that difference identifies an island.
-- NOTE(review): this relies on one row per calendar day per pair; confirm against the data.
select to_cur, from_cur, rate, min(date), max(date),
       count(*) as days_stale
from (select r.*,
             row_number() over (partition by to_cur, from_cur, rate order by date) as seqnum
      from rates r
     ) r
-- The selected non-aggregated columns must be grouping keys; without them the
-- query is invalid and islands of different pairs/rates would be merged.
group by to_cur, from_cur, rate, (date - seqnum * interval '1' day)

Merge Overlapping Time Intervals based on Hierarchy in SQL

I am trying to solve a problem where i want to merge overlapping intervals for a given column id, but i also want to merge them based on hierarchy/priority. I have start_time and stop_time for each interval and each interval has a hierarchy/priority associated with it.
These are the following columns in the table:
id, start_time, stop_time, priority
I was able to solve the problem when I do not have to take the priority into account, but I am struggling with this one.
Red colour: p1 (priority 1)
Blue Colour: p2 (priority 2)
Green colour: p3 (priority 3)
Note that in the example input below we will have 9 rows with the same id, and the output will have 6 rows. Please note that some ids might have only some of the priority values, or just one; the solution should take care of that.
expected input and output:
Below is for BigQuery Standard SQL
#standardSQL
-- Merges overlapping intervals per id, letting the smallest priority value
-- win wherever intervals of different priorities overlap.
-- check_times: every distinct boundary (start or stop) per id.
WITH check_times AS (
SELECT id, start_time AS time FROM `project.dataset.table` UNION DISTINCT
SELECT id, stop_time AS time FROM `project.dataset.table`
-- distinct_intervals: each boundary paired with the next boundary of the same
-- id; the last boundary per id gets a NULL stop_time from LEAD.
), distinct_intervals AS (
SELECT id, time AS start_time, LEAD(time) OVER(PARTITION BY id ORDER BY time) stop_time
FROM check_times
-- deduped_intervals: keeps elementary intervals that lie entirely inside at
-- least one source interval (rows with NULL stop_time fail the BETWEEN and
-- drop out) and assigns the minimum priority among the covering intervals.
-- NOTE(review): presumably the smallest number is the highest priority - confirm.
), deduped_intervals AS (
SELECT a.id, a.start_time, a.stop_time, MIN(priority) priority
FROM distinct_intervals a
JOIN `project.dataset.table` b
ON a.id = b.id
AND a.start_time BETWEEN b.start_time AND b.stop_time
AND a.stop_time BETWEEN b.start_time AND b.stop_time
GROUP BY a.id, a.start_time, a.stop_time
-- combined_intervals: glues adjacent elementary intervals of equal priority.
), combined_intervals AS (
-- ANY_VALUE is safe here: a priority change always starts a new grp.
SELECT id, MIN(start_time) start_time, MAX(stop_time) stop_time, ANY_VALUE(priority) priority
FROM (
-- running count of flagged rows = group number within the id
SELECT id, start_time, stop_time, priority, COUNTIF(flag) OVER(PARTITION BY id ORDER BY start_time) grp
FROM (
SELECT id, start_time, stop_time, priority,
-- flag is TRUE when the row does not start where the previous one stopped,
-- or when its priority differs from the previous row's (-1 stands in for
-- "no previous row").
start_time != IFNULL(LAG(stop_time) OVER(PARTITION BY id ORDER BY start_time), start_time) OR
priority != IFNULL(LAG(priority) OVER(PARTITION BY id ORDER BY start_time), -1) flag
FROM deduped_intervals
)
)
GROUP BY id, grp
)
SELECT *
FROM combined_intervals
-- ORDER BY id, start_time
If applied to the sample data from your question, the result is
Can you also share a solution where we merge intervals based on just id and no priority column
I just simply slightly adjusted above query to ignore priority
#standardSQL
-- Merges overlapping intervals per id, ignoring priority.
-- check_times: every distinct boundary (start or stop) per id.
WITH check_times AS (
SELECT id, start_time AS TIME FROM `project.dataset.table` UNION DISTINCT
SELECT id, stop_time AS TIME FROM `project.dataset.table`
-- distinct_intervals: each boundary paired with the next boundary of the same
-- id; the last boundary per id gets a NULL stop_time from LEAD.
), distinct_intervals AS (
SELECT id, TIME AS start_time, LEAD(TIME) OVER(PARTITION BY id ORDER BY TIME) stop_time
FROM check_times
-- deduped_intervals: elementary intervals that lie entirely inside at least
-- one source interval; the GROUP BY collapses the duplicates produced when
-- several source intervals cover the same elementary interval.
), deduped_intervals AS (
SELECT a.id, a.start_time, a.stop_time
FROM distinct_intervals a
JOIN `project.dataset.table` b
ON a.id = b.id
AND a.start_time BETWEEN b.start_time AND b.stop_time
AND a.stop_time BETWEEN b.start_time AND b.stop_time
GROUP BY a.id, a.start_time, a.stop_time
-- combined_intervals: glues elementary intervals that touch end-to-start.
), combined_intervals AS (
SELECT id, MIN(start_time) start_time, MAX(stop_time) stop_time
FROM (
-- running count of flagged rows = group number within the id
SELECT id, start_time, stop_time, COUNTIF(flag) OVER(PARTITION BY id ORDER BY start_time) grp
FROM (
SELECT id, start_time, stop_time,
-- flag is TRUE when the row does not start exactly where the previous one
-- stopped; the first row per id compares start_time with itself, so FALSE.
start_time != IFNULL(LAG(stop_time) OVER(PARTITION BY id ORDER BY start_time), start_time) flag
FROM deduped_intervals
)
)
GROUP BY id, grp
)
SELECT *
FROM combined_intervals
-- ORDER BY id, start_time
with result
Row id start_time stop_time
1 1 0 36
2 1 41 47
This is a "combining" islands problem. One solution is to find where the islands begin and do a cumulative sum of the beginnings. You can determine the beginning by seeing where there are no overlaps:
-- Merge overlapping intervals per (id, priority): an island starts at a row
-- whose start_time lies after every earlier stop_time in the partition.
select id, priority, min(start_time), max(stop_time)
from (select t.*,
             -- A new island begins only when there is a real gap, i.e. the largest
             -- earlier stop is before this row's START. Comparing with stop_time
             -- would split overlapping intervals that merely extend further.
             -- On the first row prev_stop_time is NULL, so the comparison is NULL
             -- and COUNTIF does not count it.
             countif(prev_stop_time < start_time) over (partition by id, priority order by start_time) as grp
      from (select t.*,
                   -- largest stop_time among all earlier rows of the same id/priority
                   max(stop_time) over (partition by id, priority order by start_time rows between unbounded preceding and 1 preceding) as prev_stop_time
            from t
           ) t
     ) t
group by id, priority, grp;