Variable in dateadd function snowflake - sql

I have a query like this:
With cte as(
Select min(date1) as min_date,
max(date1) as max_date,
id,
city,
time_id
From some_table
Group by id, city, time_id
),
range as
(select dateadd('month', row_number()over(order by null), (select min_date from cte)) as date_expand
From table(generator (row_count => 12*36))))
Select * from range;
It gives this error:
Single row subquery returns more than one row.
Is there a way to pass a variable in the 3rd argument of dateadd function? Because my cte will return many min_date based on the group by clause. TIA

Yes, sub-select in SELECT need to only return one row, you have many rows in your CTE.
You query makes more sense to me if you did something like this:
With some_table as (
SELECT * FROM VALUES
(1, 'new york', 10, '2020-01-01'::date),
(1, 'new york', 10, '2020-02-01'::date),
(2, 'christchurch', 20, '2021-01-01'::date)
v(id, city, time_id, date1)
), cte as (
Select
min(date1) as min_date,
max(date1) as max_date,
id,
city,
time_id
FROM some_table
GROUP BY 3,4,5
), range as (
SELECT
id, city, time_id,
dateadd('month', row_number()over(partition by id, city, time_id ORDER BY null), min_date) as date_expand
FROM table(generator(rowcount =>12*36))
CROSS JOIN cte
)
Select * from range;
but if your CTE was changing to be like this:
With cte as (
Select
min(date1) as min_date,
max(date1) as max_date
FROM some_table
), range as (
SELECT
dateadd('month', row_number()over(ORDER BY null), (select min_date from cte)) as date_expand
FROM table(generator(rowcount =>2*4))
)
Select * from range;
this would work, as there is only one min_date value returned.
OR you could find the smallest of the min_dates like:
WITH cte as (
Select
min(date1) as min_date,
max(date1) as max_date,
id,
city,
time_id
FROM some_table
GROUP BY 3,4,5
), range as (
SELECT
dateadd('month', row_number()over(ORDER BY null), (select min(min_date) from cte)) as date_expand
FROM table(generator(rowcount =>2*3))
)
Select * from range;

Related

Optimizing SQL query - finding a group with in a group

I have a working query and looking for ideas to optimize it.
Query explanation: Within each ID group (visitor_id), look for row where c_id != 0. From that row, show all consecutive rows within that ID group.
select t2.*
from (select *, row_number() OVER (PARTITION BY visitor_id ORDER BY date) as row_number
from "DB"."schema"."table"
where visitor_id in
(select distinct visitor_id
from (select * from "DB"."schema"."table" where date >= '2021-08-01' and date <= '2021-08-30')
where c_id in ('101')
)
) as t2
inner join
(select visitor_id, min(rn) as row_number
from
(select *, row_number() OVER (PARTITION BY visitor_id ORDER BY date) as rn
from "DB"."schema"."table"
where visitor_id in
(select distinct visitor_id
from (select * from "DB"."schema"."table" where date >= '2021-08-01' and date <= '2021-08-30')
where c_id in ('101')
)
) as filtered_table
where c_id != 0
group by visitor_id) as t1
on t2.visitor_id = t1.visitor_id
and t2.row_number >= t1.row_number
so you have a common sub expression
select distinct visitor_id
from (select * from "DB"."schema"."table" where date >= '2021-08-01' and date <= '2021-08-30')
where c_id in ('101')
so that can be moved to a CTE and run just once. like
WITH distinct_visitors AS (
SELECT DISTINCT visitor_id
FROM (SELECT * FROM "DB"."schema"."table" WHERE date >= '2021-08-01' and date <= '2021-08-30')
where c_id in ('101')
)
but the sub clause filter is equally valid as a top level filter, and given it's a value inclusive range filter BETWEEN will give better performance.
WITH distinct_visitors AS (
SELECT DISTINCT visitor_id
FROM "DB"."schema"."table"
WHERE date BETWEEN '2021-08-01' AND'2021-08-30'
AND c_id IN ('101')
)
then both uses of that CTE do the same ROW_NUMBER operation so that can be a CTE
and simplified as such
WITH rw_rows AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY visitor_id ORDER BY date) AS row_number
FROM "DB"."schema"."table"
WHERE visitor_id IN (
SELECT DISTINCT visitor_id
FROM "DB"."schema"."table"
WHERE date BETWEEN '2021-08-01' AND '2021-08-30'
AND c_id in ('101')
)
)
SELECT t2.*
FROM rw_rows AS t2
JOIN (
SELECT visitor_id,
min(rn) AS row_number
FROM rw_rows AS filtered_table
WHERE c_id != 0
GROUP BY visitor_id
) AS t1
ON t2.visitor_id = t1.visitor_id
AND t2.row_number >= t1.row_number
So we are want to keep all rows that come after the first non-zero c_id which a QUALIFY should be able to solve like:
WITH rw_rows AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY visitor_id ORDER BY date) AS row_number
FROM "DB"."schema"."table"
WHERE visitor_id IN (
SELECT DISTINCT visitor_id
FROM "DB"."schema"."table"
WHERE date BETWEEN '2021-08-01' AND '2021-08-30'
AND c_id in ('101')
)
)
SELECT t2.*,
MIN(IFF(c_id != 0, row_number, NULL )) OVER (PARTITION BY visitor_id) as min_rn
FROM rw_rows AS t2
QUALIFY t2.row_number >= min_rn
which without have run feels like the MIN also should be able to be moved to the QUALIFY like:
WITH rw_rows AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY visitor_id ORDER BY date) AS row_number
FROM "DB"."schema"."table"
WHERE visitor_id IN (
SELECT DISTINCT visitor_id
FROM "DB"."schema"."table"
WHERE date BETWEEN '2021-08-01' AND '2021-08-30'
AND c_id in ('101')
)
)
SELECT t2.*
FROM rw_rows AS t2
QUALIFY t2.row_number >= MIN(IFF(c_id != 0, row_number, NULL )) OVER (PARTITION BY visitor_id)
At which point the CTE is not needed, as it's just used once, so could be moved back in, or not as they are the same.

How to get max date among others ids for current id using BigQuery?

I need to get max date for each row over other ids. Of course I can do this with CROSS JOIN and JOIN .
Like this
WITH t AS (
SELECT 1 AS id, rep_date FROM UNNEST(GENERATE_DATE_ARRAY('2021-09-01','2021-09-09', INTERVAL 1 DAY)) rep_date
UNION ALL
SELECT 2 AS id, rep_date FROM UNNEST(GENERATE_DATE_ARRAY('2021-08-20','2021-09-03', INTERVAL 1 DAY)) rep_date
UNION ALL
SELECT 3 AS id, rep_date FROM UNNEST(GENERATE_DATE_ARRAY('2021-08-25','2021-09-05', INTERVAL 1 DAY)) rep_date
)
SELECT id, rep_date, MAX(rep_date) OVER (PARTITION BY id) max_date, max_date_over_others FROM t
JOIN (
SELECT t.id, MAX(max_date) max_date_over_others FROM t
CROSS JOIN (
SELECT id, MAX(rep_date) max_date FROM t
GROUP BY 1
) t1
WHERE t1.id <> t.id
GROUP BY 1
) USING (id)
But it's too wired for huge tables. So I'm looking for the some simpler way to do this. Any ideas?
Your version is good enough I think. But if you want to try other options - consider below approach. It might looks more verbose from first look - but should be more optimal and cheaper to compare with your version with cross join
temp as (
select id,
greatest(
ifnull(max(max_date_for_id) over preceding_ids, '1970-01-01'),
ifnull(max(max_date_for_id) over following_ids, '1970-01-01')
) as max_date_for_rest_ids
from (
select id, max(rep_date) max_date_for_id
from t
group by id
)
window
preceding_ids as (order by id rows between unbounded preceding and 1 preceding),
following_ids as (order by id rows between 1 following and unbounded following)
)
select *
from t
join temp
using (id)
Assuming your original table data just has columns id and dt - wouldn't this solve it? I'm using the fact that if an id has the max dt of everything, then it gets the second-highest over the other id values.
WITH max_dates AS
(
SELECT
id,
MAX(dt) AS max_dt
FROM
data
GROUP BY
id
),
with_top1_value AS
(
SELECT
*,
MAX(dt) OVER () AS max_overall_dt_1,
MIN(dt) OVER () AS min_overall_dt
FROM
max_dates
),
with_top2_values AS
(
SELECT
*,
MAX(CASE WHEN dt = max_overall_dt_1 THEN min_overall_dt ELSE dt END) AS max_overall_dt2
FROM
with_top1_value
),
SELECT
*,
CASE WHEN dt = max_overall_dt1 THEN max_overall_dt2 ELSE max_overall_dt1 END AS max_dt_of_others
FROM
with_top2_values

Merge Overlapping Time Intervals based on Hierarchy in SQL

I am trying to solve a problem where i want to merge overlapping intervals for a given column id, but i also want to merge them based on hierarchy/priority. I have start_time and stop_time for each interval and each interval has a hierarchy/priority associated with it.
These are the following columns in the table:
id, start_time, stop_time, priority
I was able to solve the problem where i do not have take into account the priority, but i am struggling with this one.
Red colour: p1 (priority 1)
Blue Colour: p2 (priority 2)
Green colour: p3 (priority 3)
Note that in the example input below we will have 9 rows having same id, and the output will have 6 rows. Please note that for some id's might have only some of the priority values or just one, the solution should take care of that.
expected input and output:
Below is for BigQuery Standard SQL
#standardSQL
WITH check_times AS (
SELECT id, start_time AS time FROM `project.dataset.table` UNION DISTINCT
SELECT id, stop_time AS time FROM `project.dataset.table`
), distinct_intervals AS (
SELECT id, time AS start_time, LEAD(time) OVER(PARTITION BY id ORDER BY time) stop_time
FROM check_times
), deduped_intervals AS (
SELECT a.id, a.start_time, a.stop_time, MIN(priority) priority
FROM distinct_intervals a
JOIN `project.dataset.table` b
ON a.id = b.id
AND a.start_time BETWEEN b.start_time AND b.stop_time
AND a.stop_time BETWEEN b.start_time AND b.stop_time
GROUP BY a.id, a.start_time, a.stop_time
), combined_intervals AS (
SELECT id, MIN(start_time) start_time, MAX(stop_time) stop_time, ANY_VALUE(priority) priority
FROM (
SELECT id, start_time, stop_time, priority, COUNTIF(flag) OVER(PARTITION BY id ORDER BY start_time) grp
FROM (
SELECT id, start_time, stop_time, priority,
start_time != IFNULL(LAG(stop_time) OVER(PARTITION BY id ORDER BY start_time), start_time) OR
priority != IFNULL(LAG(priority) OVER(PARTITION BY id ORDER BY start_time), -1) flag
FROM deduped_intervals
)
)
GROUP BY id, grp
)
SELECT *
FROM combined_intervals
-- ORDER BY id, start_time
If to apply to sample data from your question - result is
Can you also share a solution where we merge intervals based on just id and no priority column
I just simply slightly adjusted above query to ignore priority
#standardSQL
WITH check_times AS (
SELECT id, start_time AS TIME FROM `project.dataset.table` UNION DISTINCT
SELECT id, stop_time AS TIME FROM `project.dataset.table`
), distinct_intervals AS (
SELECT id, TIME AS start_time, LEAD(TIME) OVER(PARTITION BY id ORDER BY TIME) stop_time
FROM check_times
), deduped_intervals AS (
SELECT a.id, a.start_time, a.stop_time
FROM distinct_intervals a
JOIN `project.dataset.table` b
ON a.id = b.id
AND a.start_time BETWEEN b.start_time AND b.stop_time
AND a.stop_time BETWEEN b.start_time AND b.stop_time
GROUP BY a.id, a.start_time, a.stop_time
), combined_intervals AS (
SELECT id, MIN(start_time) start_time, MAX(stop_time) stop_time
FROM (
SELECT id, start_time, stop_time, COUNTIF(flag) OVER(PARTITION BY id ORDER BY start_time) grp
FROM (
SELECT id, start_time, stop_time,
start_time != IFNULL(LAG(stop_time) OVER(PARTITION BY id ORDER BY start_time), start_time) flag
FROM deduped_intervals
)
)
GROUP BY id, grp
)
SELECT *
FROM combined_intervals
-- ORDER BY id, start_time
with result
Row id start_time stop_time
1 1 0 36
2 1 41 47
This is a "combining" islands problem. One solution is to find where the islands begin and do a cumulative sum of the beginnings. You can determine the beginning by seeing where there are no overlaps:
select id, priority, min(start_time), max(stop_time)
from (select t.*,
countif(coalesce(prev_stop_time, stop_time) < stop_time) over (partition by id, priority order by start_time) as grp
from (select t.*,
max(stop_time) over (partition by id, priority order by start_time rows between unbounded preceding and 1 preceding) as prev_stop_time
from t
) t
) t
group by id, priority, grp;

SQL - Group rows by contiguous date

I have a table:
Value Date
100 01/01/2000
110 01/05/2002
100 01/10/2003
100 01/12/2004
I want to group the data in this way
Value StartDate EndDate
100 01/01/2000 30/04/2002
110 01/05/2002 30/09/2003
100 01/10/2003 NULL --> or value like '01/01/2099'
How can I accomplish this?
Can a CTE be useful and how?
For RDBMS supported window functions (example on MS SQL database):
with Test(value, dt) as(
select 100, cast('2000-01-01' as date) union all
select 110, cast('2002-05-01' as date) union all
select 100, cast('2003-10-01' as date) union all
select 100, cast('2004-12-01' as date)
)
select max(value) value, min(dt) startDate, max(end_dt) endDate
from (
select a.*, sum(brk) over(order by dt) grp
from (
select t.*,
case when value!=lag(value) over(order by dt) then 1 else 0 end brk,
DATEADD(DAY,-1,lead(dt,1,cast('2099-01-02' as date)) over(order by dt)) end_dt
from Test t
) a
) b
group by grp
order by startDate
I think the difference of row numbers is simpler in this case:
select value, min(date) as endDate,
dateadd(day, -1, lead(min(date)) over (order by min(date))) as endDate
from (select t.*,
row_number() over (order by date) as seqnum,
row_number() over (partition by value order by date) as seqnum_v
from t
) t
group by value, (seqnum - seqnum_v);
The difference of the row numbers defines the groups you want. This is a bit hard to see at first . . . if you stare at the results of the subquery, you'll see how it works.

For each row in query select top 20 from other query

I'm trying to do something and I'm not sure how to do it.
I have some data like this:
WITH a AS (SELECT theid, thename, thetimestamp FROM mytable)
SELECT thename, TRUNC (thetimestamp, 'HH24'), COUNT (theid) FROM a
group by thename,trunc(thetimestamp,'HH24') ORDER BY COUNT (theid) desc)
which returns me the count grouped by the hour and the name.
I would like it to just be
for each hour, top X counts
Is that possible?
I ended with:
SELECT thename, hour, cnt
FROM
( SELECT thename, hour, cnt,
rank() over (partition by hours order by cnt desc) rnk
FROM
( SELECT thename, TRUNC (thetimestamp, 'HH24') hour, COUNT (theid) cnt
FROM mytable
group by thename,trunc(thetimestamp,'HH24')
)
)
WHERE rnk <= :X
Try:
SELECT thename, hour, cnt
FROM
( SELECT thename, hour, cnt,
rank() over (partition by thename order by cnt desc) rnk
FROM
( SELECT thename, TRUNC (thetimestamp, 'HH24') hour, COUNT (theid) cnt
FROM mytable
group by thename,trunc(thetimestamp,'HH24')
)
)
WHERE rnk <= :X
(I didn't see the purpose of the WITH clause so I removed it from mine).
You could do that with row_number(), but it requires another subquery or another CTE. Here's the double CTE, since Tony Adrews already posted the subquery approach:
WITH a AS (
SELECT thename, TRUNC(thetimestamp, 'HH24') as hour, COUNT(*) cnt
FROM mytable
GROUP BY thename, TRUNC(thetimestamp, 'HH24')
), b AS (
SELECT
ROW_NUMBER() OVER (PARTITION BY hour ORDER BY ctn DESC) rn,
thename, hour, cnt
FROM a
)
SELECT *
FROM b
WHERE rn < 20