I have a working query and looking for ideas to optimize it.
Query explanation: Within each ID group (visitor_id), look for row where c_id != 0. From that row, show all consecutive rows within that ID group.
select t2.*
from (select *, row_number() OVER (PARTITION BY visitor_id ORDER BY date) as row_number
from "DB"."schema"."table"
where visitor_id in
(select distinct visitor_id
from (select * from "DB"."schema"."table" where date >= '2021-08-01' and date <= '2021-08-30')
where c_id in ('101')
)
) as t2
inner join
(select visitor_id, min(rn) as row_number
from
(select *, row_number() OVER (PARTITION BY visitor_id ORDER BY date) as rn
from "DB"."schema"."table"
where visitor_id in
(select distinct visitor_id
from (select * from "DB"."schema"."table" where date >= '2021-08-01' and date <= '2021-08-30')
where c_id in ('101')
)
) as filtered_table
where c_id != 0
group by visitor_id) as t1
on t2.visitor_id = t1.visitor_id
and t2.row_number >= t1.row_number
so you have a common sub expression
select distinct visitor_id
from (select * from "DB"."schema"."table" where date >= '2021-08-01' and date <= '2021-08-30')
where c_id in ('101')
so that can be moved to a CTE and run just once. like
WITH distinct_visitors AS (
SELECT DISTINCT visitor_id
FROM (SELECT * FROM "DB"."schema"."table" WHERE date >= '2021-08-01' and date <= '2021-08-30')
where c_id in ('101')
)
but the sub clause filter is equally valid as a top level filter, and given it's a value inclusive range filter BETWEEN will give better performance.
WITH distinct_visitors AS (
SELECT DISTINCT visitor_id
FROM "DB"."schema"."table"
WHERE date BETWEEN '2021-08-01' AND'2021-08-30'
AND c_id IN ('101')
)
then both uses of that CTE do the same ROW_NUMBER operation so that can be a CTE
and simplified as such
WITH rw_rows AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY visitor_id ORDER BY date) AS row_number
FROM "DB"."schema"."table"
WHERE visitor_id IN (
SELECT DISTINCT visitor_id
FROM "DB"."schema"."table"
WHERE date BETWEEN '2021-08-01' AND '2021-08-30'
AND c_id in ('101')
)
)
SELECT t2.*
FROM rw_rows AS t2
JOIN (
SELECT visitor_id,
min(rn) AS row_number
FROM rw_rows AS filtered_table
WHERE c_id != 0
GROUP BY visitor_id
) AS t1
ON t2.visitor_id = t1.visitor_id
AND t2.row_number >= t1.row_number
So we are want to keep all rows that come after the first non-zero c_id which a QUALIFY should be able to solve like:
WITH rw_rows AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY visitor_id ORDER BY date) AS row_number
FROM "DB"."schema"."table"
WHERE visitor_id IN (
SELECT DISTINCT visitor_id
FROM "DB"."schema"."table"
WHERE date BETWEEN '2021-08-01' AND '2021-08-30'
AND c_id in ('101')
)
)
SELECT t2.*,
MIN(IFF(c_id != 0, row_number, NULL )) OVER (PARTITION BY visitor_id) as min_rn
FROM rw_rows AS t2
QUALIFY t2.row_number >= min_rn
which without have run feels like the MIN also should be able to be moved to the QUALIFY like:
WITH rw_rows AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY visitor_id ORDER BY date) AS row_number
FROM "DB"."schema"."table"
WHERE visitor_id IN (
SELECT DISTINCT visitor_id
FROM "DB"."schema"."table"
WHERE date BETWEEN '2021-08-01' AND '2021-08-30'
AND c_id in ('101')
)
)
SELECT t2.*
FROM rw_rows AS t2
QUALIFY t2.row_number >= MIN(IFF(c_id != 0, row_number, NULL )) OVER (PARTITION BY visitor_id)
At which point the CTE is not needed, as it's just used once, so could be moved back in, or not as they are the same.
Related
I need to get max date for each row over other ids. Of course I can do this with CROSS JOIN and JOIN .
Like this
WITH t AS (
SELECT 1 AS id, rep_date FROM UNNEST(GENERATE_DATE_ARRAY('2021-09-01','2021-09-09', INTERVAL 1 DAY)) rep_date
UNION ALL
SELECT 2 AS id, rep_date FROM UNNEST(GENERATE_DATE_ARRAY('2021-08-20','2021-09-03', INTERVAL 1 DAY)) rep_date
UNION ALL
SELECT 3 AS id, rep_date FROM UNNEST(GENERATE_DATE_ARRAY('2021-08-25','2021-09-05', INTERVAL 1 DAY)) rep_date
)
SELECT id, rep_date, MAX(rep_date) OVER (PARTITION BY id) max_date, max_date_over_others FROM t
JOIN (
SELECT t.id, MAX(max_date) max_date_over_others FROM t
CROSS JOIN (
SELECT id, MAX(rep_date) max_date FROM t
GROUP BY 1
) t1
WHERE t1.id <> t.id
GROUP BY 1
) USING (id)
But it's too wired for huge tables. So I'm looking for the some simpler way to do this. Any ideas?
Your version is good enough I think. But if you want to try other options - consider below approach. It might looks more verbose from first look - but should be more optimal and cheaper to compare with your version with cross join
temp as (
select id,
greatest(
ifnull(max(max_date_for_id) over preceding_ids, '1970-01-01'),
ifnull(max(max_date_for_id) over following_ids, '1970-01-01')
) as max_date_for_rest_ids
from (
select id, max(rep_date) max_date_for_id
from t
group by id
)
window
preceding_ids as (order by id rows between unbounded preceding and 1 preceding),
following_ids as (order by id rows between 1 following and unbounded following)
)
select *
from t
join temp
using (id)
Assuming your original table data just has columns id and dt - wouldn't this solve it? I'm using the fact that if an id has the max dt of everything, then it gets the second-highest over the other id values.
WITH max_dates AS
(
SELECT
id,
MAX(dt) AS max_dt
FROM
data
GROUP BY
id
),
with_top1_value AS
(
SELECT
*,
MAX(dt) OVER () AS max_overall_dt_1,
MIN(dt) OVER () AS min_overall_dt
FROM
max_dates
),
with_top2_values AS
(
SELECT
*,
MAX(CASE WHEN dt = max_overall_dt_1 THEN min_overall_dt ELSE dt END) AS max_overall_dt2
FROM
with_top1_value
),
SELECT
*,
CASE WHEN dt = max_overall_dt1 THEN max_overall_dt2 ELSE max_overall_dt1 END AS max_dt_of_others
FROM
with_top2_values
I have a big query program below;
WITH cte AS(
SELECT *
FROM (
SELECT project_name,
SUM(reward_value) AS total_reward_value,
DATE_TRUNC(date_signing, MONTH) as month,
date_signing,
Row_number() over (partition by DATE_TRUNC(date_signing, MONTH)
order by SUM(reward_value) desc) AS rank
FROM `deals`
WHERE CAST(date_signing as DATE) > '2019-12-31'
AND CAST(date_signing as DATE) < '2020-02-01'
AND target_category = 'achieved'
AND project_name IS NOT NULL
GROUP BY project_name, month, date_signing
)
)
SELECT * FROM cte WHERE rank <= 5
that returns the following result:
While I expect to have each unique project to be SUM within each month and then I filter only the top 5.
Something like this:
I got the following error if the date_signing grouping is removed
PARTITION BY expression references column date_signing which is neither grouped nor aggregated at [16:48]
Any hints what should be corrected will be appreciated!
One more subquery maybe then?
WITH cte AS(
SELECT project_name,
SUM(reward_value) as reward_sum,
DATE_TRUNC(date_signing, MONTH) as month
FROM `deals`
WHERE CAST(date_signing as DATE) > '2019-12-31'
AND CAST(date_signing as DATE) < '2020-02-01'
AND target_category = 'achieved'
AND project_name IS NOT NULL
GROUP BY project_name, month
),
ranks AS (
SELECT
project_name,
reward_sum,
month,
ROW_NUMBER() over (PARTITION BY month ORDER BY reward_sum DESC) AS rank
)
SELECT *
FROM ranks
WHERE rank <= 5
yeah you can't do that , yo can show the last signing date instead:
WITH cte AS(
SELECT project_name,
SUM(reward_value),
DATE_TRUNC(date_signing, MONTH) as month,
MAX(date_signing) as last_signing_date,
Row_number() over (partition by DATE_TRUNC(date_signing, MONTH)
order by SUM(reward_value) desc) AS rank
FROM `deals`
WHERE CAST(date_signing as DATE) > '2019-12-31'
AND CAST(date_signing as DATE) < '2020-02-01'
AND target_category = 'achieved'
AND project_name IS NOT NULL
GROUP BY project_name, month
)
SELECT * FROM cte WHERE rank <= 5
I am trying to solve a problem where i want to merge overlapping intervals for a given column id, but i also want to track the maximum value for each overlapped interval. I have start_time and stop_time for each interval and each interval has a hierarchy/priority associated with it.
These are the following columns in the table:
id, start_time, stop_time, some_value
example input:
example output:
Below is for BigQuery Standard SQL and I assume you stll working on the same use-case as in previous question, so I wanted to keep it inline with that solution - and you can extend it for when you also want to account for priorities for example
So, anyway:
#standardSQL
WITH check_times AS (
SELECT id, start_time AS TIME FROM `project.dataset.table` UNION DISTINCT
SELECT id, stop_time AS TIME FROM `project.dataset.table`
), distinct_intervals AS (
SELECT id, TIME AS start_time, LEAD(TIME) OVER(PARTITION BY id ORDER BY TIME) stop_time
FROM check_times
), deduped_intervals AS (
SELECT a.id, a.start_time, a.stop_time, MAX(some_value) some_value
FROM distinct_intervals a
JOIN `project.dataset.table` b
ON a.id = b.id
AND a.start_time BETWEEN b.start_time AND b.stop_time
AND a.stop_time BETWEEN b.start_time AND b.stop_time
GROUP BY a.id, a.start_time, a.stop_time
), combined_intervals AS (
SELECT id, MIN(start_time) start_time, MAX(stop_time) stop_time, MAX(some_value) some_value
FROM (
SELECT id, start_time, stop_time, some_value, COUNTIF(flag) OVER(PARTITION BY id ORDER BY start_time) grp
FROM (
SELECT id, start_time, stop_time, some_value,
start_time != IFNULL(LAG(stop_time) OVER(PARTITION BY id ORDER BY start_time), start_time) flag
FROM deduped_intervals
)
)
GROUP BY id, grp
)
SELECT *
FROM combined_intervals
-- ORDER BY id, start_time
If to apply to your sample data - result is
Row id start_time stop_time some_value
1 1 0 36 50
2 1 41 47 23
Is it possible to add one more column to the result which will show number of events during that time period
#standardSQL
WITH check_times AS (
SELECT id, start_time AS TIME FROM `project.dataset.table` UNION DISTINCT
SELECT id, stop_time AS TIME FROM `project.dataset.table`
), distinct_intervals AS (
SELECT id, TIME AS start_time, LEAD(TIME) OVER(PARTITION BY id ORDER BY TIME) stop_time
FROM check_times
), deduped_intervals AS (
SELECT a.id, a.start_time, a.stop_time, MAX(some_value) some_value, ANY_VALUE(To_JSON_STRING(b)) event_hash
FROM distinct_intervals a
JOIN `project.dataset.table` b
ON a.id = b.id
AND a.start_time BETWEEN b.start_time AND b.stop_time
AND a.stop_time BETWEEN b.start_time AND b.stop_time
GROUP BY a.id, a.start_time, a.stop_time
), combined_intervals AS (
SELECT id, MIN(start_time) start_time, MAX(stop_time) stop_time, MAX(some_value) some_value, COUNT(DISTINCT event_hash) events
FROM (
SELECT *, COUNTIF(flag) OVER(PARTITION BY id ORDER BY start_time) grp
FROM (
SELECT *,
start_time != IFNULL(LAG(stop_time) OVER(PARTITION BY id ORDER BY start_time), start_time) flag
FROM deduped_intervals
)
)
GROUP BY id, grp
)
SELECT *
FROM combined_intervals
-- ORDER BY id, start_time
with result
Row id start_time stop_time some_value events
1 1 0 36 50 8
2 1 41 47 23 1
You can determine when a new grouping starts using a cumulative max(). Then a cumulative conditional count() to identify the groups . . . and finally aggregation:
select min(start_time), max(stop_time), max(some_value)
from (select t.*,
countif(prev_stop_time is null or prev_stop_time < start_time) over (partition by id order by start_time) as grp
from (select t.*,
max(stop_time) over (partition by id order by start_time rows between unbounded preceding and 1 preceding) as prev_stop_time
from t
) t
) t
group by item_id, grp;
I am trying to solve a problem where i want to merge overlapping intervals for a given column id, but i also want to merge them based on hierarchy/priority. I have start_time and stop_time for each interval and each interval has a hierarchy/priority associated with it.
These are the following columns in the table:
id, start_time, stop_time, priority
I was able to solve the problem where i do not have take into account the priority, but i am struggling with this one.
Red colour: p1 (priority 1)
Blue Colour: p2 (priority 2)
Green colour: p3 (priority 3)
Note that in the example input below we will have 9 rows having same id, and the output will have 6 rows. Please note that for some id's might have only some of the priority values or just one, the solution should take care of that.
expected input and output:
Below is for BigQuery Standard SQL
#standardSQL
WITH check_times AS (
SELECT id, start_time AS time FROM `project.dataset.table` UNION DISTINCT
SELECT id, stop_time AS time FROM `project.dataset.table`
), distinct_intervals AS (
SELECT id, time AS start_time, LEAD(time) OVER(PARTITION BY id ORDER BY time) stop_time
FROM check_times
), deduped_intervals AS (
SELECT a.id, a.start_time, a.stop_time, MIN(priority) priority
FROM distinct_intervals a
JOIN `project.dataset.table` b
ON a.id = b.id
AND a.start_time BETWEEN b.start_time AND b.stop_time
AND a.stop_time BETWEEN b.start_time AND b.stop_time
GROUP BY a.id, a.start_time, a.stop_time
), combined_intervals AS (
SELECT id, MIN(start_time) start_time, MAX(stop_time) stop_time, ANY_VALUE(priority) priority
FROM (
SELECT id, start_time, stop_time, priority, COUNTIF(flag) OVER(PARTITION BY id ORDER BY start_time) grp
FROM (
SELECT id, start_time, stop_time, priority,
start_time != IFNULL(LAG(stop_time) OVER(PARTITION BY id ORDER BY start_time), start_time) OR
priority != IFNULL(LAG(priority) OVER(PARTITION BY id ORDER BY start_time), -1) flag
FROM deduped_intervals
)
)
GROUP BY id, grp
)
SELECT *
FROM combined_intervals
-- ORDER BY id, start_time
If to apply to sample data from your question - result is
Can you also share a solution where we merge intervals based on just id and no priority column
I just simply slightly adjusted above query to ignore priority
#standardSQL
WITH check_times AS (
SELECT id, start_time AS TIME FROM `project.dataset.table` UNION DISTINCT
SELECT id, stop_time AS TIME FROM `project.dataset.table`
), distinct_intervals AS (
SELECT id, TIME AS start_time, LEAD(TIME) OVER(PARTITION BY id ORDER BY TIME) stop_time
FROM check_times
), deduped_intervals AS (
SELECT a.id, a.start_time, a.stop_time
FROM distinct_intervals a
JOIN `project.dataset.table` b
ON a.id = b.id
AND a.start_time BETWEEN b.start_time AND b.stop_time
AND a.stop_time BETWEEN b.start_time AND b.stop_time
GROUP BY a.id, a.start_time, a.stop_time
), combined_intervals AS (
SELECT id, MIN(start_time) start_time, MAX(stop_time) stop_time
FROM (
SELECT id, start_time, stop_time, COUNTIF(flag) OVER(PARTITION BY id ORDER BY start_time) grp
FROM (
SELECT id, start_time, stop_time,
start_time != IFNULL(LAG(stop_time) OVER(PARTITION BY id ORDER BY start_time), start_time) flag
FROM deduped_intervals
)
)
GROUP BY id, grp
)
SELECT *
FROM combined_intervals
-- ORDER BY id, start_time
with result
Row id start_time stop_time
1 1 0 36
2 1 41 47
This is a "combining" islands problem. One solution is to find where the islands begin and do a cumulative sum of the beginnings. You can determine the beginning by seeing where there are no overlaps:
select id, priority, min(start_time), max(stop_time)
from (select t.*,
countif(coalesce(prev_stop_time, stop_time) < stop_time) over (partition by id, priority order by start_time) as grp
from (select t.*,
max(stop_time) over (partition by id, priority order by start_time rows between unbounded preceding and 1 preceding) as prev_stop_time
from t
) t
) t
group by id, priority, grp;
I'm trying to do something and I'm not sure how to do it.
I have some data like this:
WITH a AS (SELECT theid, thename, thetimestamp FROM mytable)
SELECT thename, TRUNC (thetimestamp, 'HH24'), COUNT (theid) FROM a
group by thename,trunc(thetimestamp,'HH24') ORDER BY COUNT (theid) desc)
which returns me the count grouped by the hour and the name.
I would like it to just be
for each hour, top X counts
Is that possible?
I ended with:
SELECT thename, hour, cnt
FROM
( SELECT thename, hour, cnt,
rank() over (partition by hours order by cnt desc) rnk
FROM
( SELECT thename, TRUNC (thetimestamp, 'HH24') hour, COUNT (theid) cnt
FROM mytable
group by thename,trunc(thetimestamp,'HH24')
)
)
WHERE rnk <= :X
Try:
SELECT thename, hour, cnt
FROM
( SELECT thename, hour, cnt,
rank() over (partition by thename order by cnt desc) rnk
FROM
( SELECT thename, TRUNC (thetimestamp, 'HH24') hour, COUNT (theid) cnt
FROM mytable
group by thename,trunc(thetimestamp,'HH24')
)
)
WHERE rnk <= :X
(I didn't see the purpose of the WITH clause so I removed it from mine).
You could do that with row_number(), but it requires another subquery or another CTE. Here's the double CTE, since Tony Adrews already posted the subquery approach:
WITH a AS (
SELECT thename, TRUNC(thetimestamp, 'HH24') as hour, COUNT(*) cnt
FROM mytable
GROUP BY thename, TRUNC(thetimestamp, 'HH24')
), b AS (
SELECT
ROW_NUMBER() OVER (PARTITION BY hour ORDER BY ctn DESC) rn,
thename, hour, cnt
FROM a
)
SELECT *
FROM b
WHERE rn < 20