Mariadb: Use result of window function LAG in WHERE clause - sql

I am using the following query to get the difference between two timestamps:
-- For one tracker: each reading, the LAG row under a newest-first sort
-- (i.e. the next-newer reading), and the difference between the two in minutes.
SELECT tracker_id,
TIMESTAMP,
LAG(TIMESTAMP) OVER(ORDER BY TIMESTAMP DESC),
TIMESTAMPDIFF(MINUTE,
TIMESTAMP,
LAG(TIMESTAMP) OVER(ORDER BY TIMESTAMP DESC)) AS diff_in_minutes
FROM comm_telemetry
WHERE comm_telemetry.tracker_id = "123456789"
ORDER BY comm_telemetry.timestamp DESC;
I want to filter the result to only show rows where diff_in_minutes > 0. The problem is that window functions are not allowed in WHERE clauses.
Any suggestions on how to solve this?

You will need to first compute the lag in a subquery and then query that again to use it to filter.
-- Window functions cannot appear in WHERE, so the LAG-based gap is computed
-- in a CTE first and the outer query filters on the finished column.
WITH lagged AS (
    SELECT
        tracker_id,
        TIMESTAMP,
        TIMESTAMPDIFF(
            MINUTE,
            TIMESTAMP,
            LAG(TIMESTAMP) OVER (ORDER BY TIMESTAMP DESC)
        ) AS diff_in_minutes
    FROM comm_telemetry
    WHERE tracker_id = '123456789'
)
SELECT
    tracker_id,
    TIMESTAMP,
    diff_in_minutes
FROM lagged
WHERE diff_in_minutes > 0
ORDER BY TIMESTAMP DESC;

found a solution meanwhile:
-- Gap in minutes between each reading of one tracker and the reading that
-- precedes it in newest-first order; only rows with a positive gap are kept.
WITH tbl_diff_in_minutes AS (
    SELECT
        tracker_id,
        `timestamp` AS ts,
        LAG(`timestamp`) OVER (ORDER BY `timestamp` DESC) AS prev_ts,
        TIMESTAMPDIFF(
            MINUTE,
            `timestamp`,
            LAG(`timestamp`) OVER (ORDER BY `timestamp` DESC)
        ) AS diff_in_minutes
    FROM comm_telemetry
    -- Single quotes: a standard string literal, also valid under ANSI_QUOTES.
    WHERE comm_telemetry.tracker_id = '123456789'
)
SELECT
    tracker_id,
    ts,
    prev_ts,
    diff_in_minutes
FROM tbl_diff_in_minutes
WHERE diff_in_minutes > 0
-- The sort belongs on the outermost query: an ORDER BY inside the CTE does
-- not guarantee the order of the final result set.
ORDER BY ts DESC;

Related

How to use DATEDIFF with a Window Function in SQL

When I run the following query:
-- Fails as written: OVER is attached to DATEDIFF, which is rejected as a
-- window function, while LAG is left without a window.
select
post_visid_high || ':' || post_visid_low as visitor_id
, datediff(minute, lag(date_time), date_time) over (partition by visitor_id order by date_time asc)
from adobe_data
I get
Invalid function type [DATEDIFF] for window function.
Invalid function type [TIMEDIFF] for window function.
I can rewrite the query as
-- Two-step form: expose the previous row's timestamp under an alias, then
-- reuse that alias in DATEDIFF within the same select list.
SELECT
    post_visid_high || ':' || post_visid_low AS visitor_id,
    LAG(date_time) OVER (
        PARTITION BY visitor_id
        ORDER BY date_time ASC
    ) AS previous_date,
    DATEDIFF(minute, previous_date, date_time) AS difference_in_minutes
FROM adobe_data
But I am wondering if there is a better way to do this?
The issue is placement of ():
select
post_visid_high || ':' || post_visid_low as visitor_id
, datediff(minute, lag(date_time), date_time) over (partition by visitor_id
order by date_time asc)
from adobe_data
=>
-- OVER now follows LAG directly, so LAG is the window function and DATEDIFF
-- simply consumes its result as an ordinary argument.
select
post_visid_high || ':' || post_visid_low as visitor_id
, datediff(minute, lag(date_time) over (partition by visitor_id
order by date_time asc), date_time)
from adobe_data
You've put the window outside of the datediff, but it should be outside of the lag.
datediff(minute, lag(date_time), date_time) over (partition by visitor_id order by date_time asc)
Becomes...
datediff(minute, lag(date_time) over (partition by visitor_id order by date_time asc), date_time)
Also, long narrow code is easier to read and validate than short wide code...
datediff(
minute,
lag(date_time) over (partition by visitor_id order by date_time asc),
date_time
)
Or even...
datediff(
minute,
lag(date_time) over (
partition by visitor_id
order by date_time asc
),
date_time
)
It's also more friendly to diff tools, such as used by git.

Optimizing SQL query - finding a group with in a group

I have a working query and looking for ideas to optimize it.
Query explanation: Within each ID group (visitor_id), look for row where c_id != 0. From that row, show all consecutive rows within that ID group.
-- Returns, per visitor, every row from the first row with c_id != 0 onward.
-- t2: all rows of the qualifying visitors, numbered per visitor by date.
select t2.*
from (select *, row_number() OVER (PARTITION BY visitor_id ORDER BY date) as row_number
from "DB"."schema"."table"
where visitor_id in
-- qualifying visitors: those with a c_id '101' row in the date window
(select distinct visitor_id
from (select * from "DB"."schema"."table" where date >= '2021-08-01' and date <= '2021-08-30')
where c_id in ('101')
)
) as t2
inner join
-- t1: per visitor, the smallest row number among rows with c_id != 0
(select visitor_id, min(rn) as row_number
from
-- same numbering and same visitor filter as t2, repeated verbatim
(select *, row_number() OVER (PARTITION BY visitor_id ORDER BY date) as rn
from "DB"."schema"."table"
where visitor_id in
(select distinct visitor_id
from (select * from "DB"."schema"."table" where date >= '2021-08-01' and date <= '2021-08-30')
where c_id in ('101')
)
) as filtered_table
where c_id != 0
group by visitor_id) as t1
on t2.visitor_id = t1.visitor_id
-- keep only rows at or after that first non-zero row
and t2.row_number >= t1.row_number
so you have a common sub expression
select distinct visitor_id
from (select * from "DB"."schema"."table" where date >= '2021-08-01' and date <= '2021-08-30')
where c_id in ('101')
so that can be moved to a CTE and run just once. like
WITH distinct_visitors AS (
SELECT DISTINCT visitor_id
FROM (SELECT * FROM "DB"."schema"."table" WHERE date >= '2021-08-01' and date <= '2021-08-30')
where c_id in ('101')
)
But the subquery's filter is equally valid as a top-level filter, and since it is an inclusive range, BETWEEN states it more compactly (it is equivalent to the >= / <= pair, so it should perform the same).
-- Visitors that have a c_id '101' row dated 2021-08-01 through 2021-08-30
-- (both ends inclusive).
WITH distinct_visitors AS (
    SELECT DISTINCT visitor_id
    FROM "DB"."schema"."table"
    WHERE c_id IN ('101')
      AND date BETWEEN '2021-08-01' AND '2021-08-30'
)
then both uses of that CTE do the same ROW_NUMBER operation so that can be a CTE
and simplified as such
-- Number each qualifying visitor's rows once, then reuse that numbering on
-- both sides of the self-join.
WITH rw_rows AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY visitor_id ORDER BY date) AS row_number
    FROM "DB"."schema"."table"
    WHERE visitor_id IN (
        SELECT DISTINCT visitor_id
        FROM "DB"."schema"."table"
        WHERE date BETWEEN '2021-08-01' AND '2021-08-30'
          AND c_id IN ('101')
    )
)
SELECT t2.*
FROM rw_rows AS t2
JOIN (
    -- First row per visitor whose c_id is non-zero. rw_rows exposes the
    -- numbering as row_number; there is no rn column in this version.
    SELECT
        visitor_id,
        MIN(row_number) AS row_number
    FROM rw_rows AS filtered_table
    WHERE c_id != 0
    GROUP BY visitor_id
) AS t1
    ON t2.visitor_id = t1.visitor_id
    -- keep only rows at or after that first non-zero row
    AND t2.row_number >= t1.row_number
So we want to keep all rows from the first non-zero c_id onward, which a QUALIFY should be able to solve, like:
-- Keep each qualifying visitor's rows from the first non-zero c_id onward,
-- using a windowed MIN instead of a self-join.
WITH rw_rows AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY visitor_id ORDER BY date) AS row_number
    FROM "DB"."schema"."table"
    WHERE visitor_id IN (
        SELECT DISTINCT visitor_id
        FROM "DB"."schema"."table"
        WHERE date BETWEEN '2021-08-01' AND '2021-08-30'
          AND c_id IN ('101')
    )
)
SELECT
    t2.*,
    -- Smallest row_number per visitor among rows with a non-zero c_id;
    -- rows with c_id = 0 contribute NULL and are ignored by MIN.
    MIN(CASE WHEN c_id != 0 THEN row_number END)
        OVER (PARTITION BY visitor_id) AS min_rn
FROM rw_rows AS t2
QUALIFY t2.row_number >= min_rn
Without having run it, it feels like the MIN should also be able to move into the QUALIFY, like:
-- Same filter as the min_rn variant, but the windowed MIN lives directly in
-- QUALIFY, so no helper column is added to the output.
WITH rw_rows AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY visitor_id ORDER BY date) AS row_number
    FROM "DB"."schema"."table"
    WHERE visitor_id IN (
        SELECT DISTINCT visitor_id
        FROM "DB"."schema"."table"
        WHERE date BETWEEN '2021-08-01' AND '2021-08-30'
          AND c_id IN ('101')
    )
)
SELECT t2.*
FROM rw_rows AS t2
QUALIFY t2.row_number >= MIN(CASE WHEN c_id != 0 THEN row_number END)
                             OVER (PARTITION BY visitor_id)
At which point the CTE is not needed, as it's just used once, so could be moved back in, or not as they are the same.

Datediff , coalesce function throwing error in redshift

-- Latest row per a.number (rn = 1) with the seconds between a fixed epoch
-- timestamp and the detection time; COALESCE falls back to the epoch itself.
-- NOTE(review): the derived table below has no alias — possibly the cause of
-- the reported error; confirm against the engine's message.
select * from
(select
convert (timestamp, '1970-01-01 00:00:00') as Fixed_Date,
DATEDIFF (Second,
Fixed_Date::timestamp,
COALESCE(detection::timestamp, Fixed_date::timestamp)
)as TTD_seconds,
row_number() over(partition by a.number order by a.sys_updated_on desc,
a.record_processed_datetime desc ) as rn
from Table a
)
where rn = 1
The detection column has NULL values; in that case I want my DATEDIFF to return 0.
Entering 0 directly throws an error as well.
I appreciate all the help.
error screenshot :
I think you need an alias for your subquery.
-- Latest row per a.number with the seconds elapsed between the fixed epoch
-- and the detection time. The derived table carries an alias (AA).
SELECT *
FROM (
    SELECT
        CONVERT(timestamp, '1970-01-01 00:00:00') AS Fixed_Date,
        -- When detection is NULL, COALESCE substitutes Fixed_Date, so the
        -- difference is 0. Fixed_Date is never NULL, so no further fallback
        -- argument is needed.
        DATEDIFF(
            second,
            Fixed_Date::timestamp,
            COALESCE(detection::timestamp, Fixed_Date::timestamp)
        ) AS TTD_seconds,
        ROW_NUMBER() OVER (
            PARTITION BY a.number
            ORDER BY a.sys_updated_on DESC, a.record_processed_datetime DESC
        ) AS rn
    FROM Table a
) AS AA
WHERE rn = 1

Merge Overlapping Time Intervals based on Hierarchy in SQL

I am trying to solve a problem where i want to merge overlapping intervals for a given column id, but i also want to merge them based on hierarchy/priority. I have start_time and stop_time for each interval and each interval has a hierarchy/priority associated with it.
These are the following columns in the table:
id, start_time, stop_time, priority
I was able to solve the problem where I do not have to take the priority into account, but I am struggling with this one.
Red colour: p1 (priority 1)
Blue Colour: p2 (priority 2)
Green colour: p3 (priority 3)
Note that in the example input below we have 9 rows with the same id, and the output has 6 rows. Please note that some ids might have only some of the priority values, or just one; the solution should handle that.
expected input and output:
Below is for BigQuery Standard SQL
#standardSQL
-- check_times: every distinct boundary (start or stop) per id.
WITH check_times AS (
SELECT id, start_time AS time FROM `project.dataset.table` UNION DISTINCT
SELECT id, stop_time AS time FROM `project.dataset.table`
-- distinct_intervals: consecutive boundary pairs per id; the last boundary
-- gets a NULL stop_time from LEAD.
), distinct_intervals AS (
SELECT id, time AS start_time, LEAD(time) OVER(PARTITION BY id ORDER BY time) stop_time
FROM check_times
-- deduped_intervals: keep elementary intervals that lie entirely inside at
-- least one source row and take the smallest priority value covering them.
), deduped_intervals AS (
SELECT a.id, a.start_time, a.stop_time, MIN(priority) priority
FROM distinct_intervals a
JOIN `project.dataset.table` b
ON a.id = b.id
AND a.start_time BETWEEN b.start_time AND b.stop_time
AND a.stop_time BETWEEN b.start_time AND b.stop_time
GROUP BY a.id, a.start_time, a.stop_time
-- combined_intervals: glue adjacent elementary intervals back together.
), combined_intervals AS (
SELECT id, MIN(start_time) start_time, MAX(stop_time) stop_time, ANY_VALUE(priority) priority
FROM (
-- running count of flagged rows = group number within the id
SELECT id, start_time, stop_time, priority, COUNTIF(flag) OVER(PARTITION BY id ORDER BY start_time) grp
FROM (
-- flag is TRUE when the row does not continue the previous one: its start
-- differs from the previous stop, or its priority differs from the previous
-- priority (-1 stands in for "no previous row").
SELECT id, start_time, stop_time, priority,
start_time != IFNULL(LAG(stop_time) OVER(PARTITION BY id ORDER BY start_time), start_time) OR
priority != IFNULL(LAG(priority) OVER(PARTITION BY id ORDER BY start_time), -1) flag
FROM deduped_intervals
)
)
GROUP BY id, grp
)
SELECT *
FROM combined_intervals
-- ORDER BY id, start_time
If applied to the sample data from your question, the result is
Can you also share a solution where we merge intervals based on just id and no priority column
I just simply slightly adjusted above query to ignore priority
#standardSQL
-- check_times: every distinct boundary (start or stop) per id.
WITH check_times AS (
SELECT id, start_time AS TIME FROM `project.dataset.table` UNION DISTINCT
SELECT id, stop_time AS TIME FROM `project.dataset.table`
-- distinct_intervals: consecutive boundary pairs per id; the last boundary
-- gets a NULL stop_time from LEAD.
), distinct_intervals AS (
SELECT id, TIME AS start_time, LEAD(TIME) OVER(PARTITION BY id ORDER BY TIME) stop_time
FROM check_times
-- deduped_intervals: keep elementary intervals that lie entirely inside at
-- least one source row; the GROUP BY (no aggregates) only removes duplicates
-- produced by several covering rows.
), deduped_intervals AS (
SELECT a.id, a.start_time, a.stop_time
FROM distinct_intervals a
JOIN `project.dataset.table` b
ON a.id = b.id
AND a.start_time BETWEEN b.start_time AND b.stop_time
AND a.stop_time BETWEEN b.start_time AND b.stop_time
GROUP BY a.id, a.start_time, a.stop_time
-- combined_intervals: glue adjacent elementary intervals back together.
), combined_intervals AS (
SELECT id, MIN(start_time) start_time, MAX(stop_time) stop_time
FROM (
-- running count of flagged rows = group number within the id
SELECT id, start_time, stop_time, COUNTIF(flag) OVER(PARTITION BY id ORDER BY start_time) grp
FROM (
-- flag is TRUE when the row's start differs from the previous row's stop,
-- i.e. there is a gap; the first row per id is never flagged.
SELECT id, start_time, stop_time,
start_time != IFNULL(LAG(stop_time) OVER(PARTITION BY id ORDER BY start_time), start_time) flag
FROM deduped_intervals
)
)
GROUP BY id, grp
)
SELECT *
FROM combined_intervals
-- ORDER BY id, start_time
with result
Row id start_time stop_time
1 1 0 36
2 1 41 47
This is a "combining" islands problem. One solution is to find where the islands begin and do a cumulative sum of the beginnings. You can determine the beginning by seeing where there are no overlaps:
-- Merge overlapping (or touching) intervals per id and priority.
-- An island starts where a row begins after every earlier row has ended.
select
    id,
    priority,
    min(start_time) as start_time,
    max(stop_time) as stop_time
from (
    select
        t.*,
        -- Running count of island starts = island number. A row starts a new
        -- island only when the latest stop seen so far is before its own
        -- START; comparing against its stop would split overlapping rows.
        -- The first row per partition has no predecessor and is not counted.
        countif(coalesce(prev_stop_time, start_time) < start_time)
            over (partition by id, priority order by start_time) as grp
    from (
        select
            t.*,
            -- Latest stop_time among all earlier rows of the partition.
            max(stop_time) over (
                partition by id, priority
                order by start_time
                rows between unbounded preceding and 1 preceding
            ) as prev_stop_time
        from t
    ) t
) t
group by id, priority, grp;

For each row in query select top 20 from other query

I'm trying to do something and I'm not sure how to do it.
I have some data like this:
-- Row count per name and hour bucket, largest counts first.
-- (The unmatched closing parenthesis at the end of the statement is removed.)
WITH a AS (SELECT theid, thename, thetimestamp FROM mytable)
SELECT thename, TRUNC (thetimestamp, 'HH24'), COUNT (theid) FROM a
group by thename,trunc(thetimestamp,'HH24') ORDER BY COUNT (theid) desc
which returns me the count grouped by the hour and the name.
I would like it to just be
for each hour, top X counts
Is that possible?
I ended with:
-- Top :X names per hour by row count. RANK gives ties the same rank, so an
-- hour can return more than :X rows.
SELECT thename, hour, cnt
FROM (
    SELECT
        thename,
        hour,
        cnt,
        -- Rank within each hour bucket; the inner query names the column
        -- "hour" (singular).
        RANK() OVER (PARTITION BY hour ORDER BY cnt DESC) rnk
    FROM (
        SELECT
            thename,
            TRUNC(thetimestamp, 'HH24') hour,
            COUNT(theid) cnt
        FROM mytable
        GROUP BY thename, TRUNC(thetimestamp, 'HH24')
    )
)
WHERE rnk <= :X
Try:
-- For each hour, the names with the top :X counts (ties share a rank).
SELECT thename, hour, cnt
FROM (
    SELECT
        thename,
        hour,
        cnt,
        -- Partition by hour so the ranking restarts for every hour bucket;
        -- partitioning by thename would rank hours per name instead.
        RANK() OVER (PARTITION BY hour ORDER BY cnt DESC) rnk
    FROM (
        SELECT
            thename,
            TRUNC(thetimestamp, 'HH24') hour,
            COUNT(theid) cnt
        FROM mytable
        GROUP BY thename, TRUNC(thetimestamp, 'HH24')
    )
)
WHERE rnk <= :X
(I didn't see the purpose of the WITH clause so I removed it from mine).
You could do that with row_number(), but it requires another subquery or another CTE. Here's the double-CTE version, since Tony Andrews already posted the subquery approach:
-- a: row count per name and hour bucket.
WITH a AS (
    SELECT
        thename,
        TRUNC(thetimestamp, 'HH24') AS hour,
        COUNT(*) cnt
    FROM mytable
    GROUP BY thename, TRUNC(thetimestamp, 'HH24')
),
-- b: position of each name within its hour, highest count first.
-- ROW_NUMBER breaks ties arbitrarily, so each hour yields at most 20 rows.
b AS (
    SELECT
        ROW_NUMBER() OVER (PARTITION BY hour ORDER BY cnt DESC) rn,
        thename,
        hour,
        cnt
    FROM a
)
SELECT *
FROM b
-- <= 20 keeps the top 20 per hour (< 20 would stop at 19).
WHERE rn <= 20