PostgreSQL: gaps between tsranges, empty result set for users without reservations - sql

I have a table of reservations for each user:
reservations_development=# \d reservations
Table "public.reservations"
Column | Type | Modifiers
------------+---------+-----------------------------------------------------------
id | integer | not null default nextval('reservations_id_seq'::regclass)
user_id | integer |
occurrence | tsrange |
Indexes:
"reservations_pkey" PRIMARY KEY, btree (id)
"reservations_occurrence_user_id_excl" EXCLUDE USING gist (occurrence WITH &&, user_id WITH =)
I am trying to create a view of the gaps/opening between reservations for each user, and I currently have the following query:
-- View of the open time slots ("gaps") between each user's reservations,
-- from now (LOCALTIMESTAMP) out to one year ahead.
-- NOTE(review): users with no reservations at all produce no rows here --
-- that is exactly the problem described in the question.
CREATE OR REPLACE VIEW reservation_gaps AS (
-- user_mins: for each user with upcoming reservations, the gap from "now"
-- until the start of their first upcoming reservation
with user_mins as (select tsrange(LOCALTIMESTAMP, min(lower(occurrence))), user_id
FROM (
SELECT user_id, occurrence
FROM reservations
WHERE lower(occurrence) >= LOCALTIMESTAMP
) as y
GROUP BY user_id
),
-- gaps: for every reservation, the range between its end and the start of the
-- same user's next reservation; the last gap is capped at one year from now
-- via the third argument of lead()
gaps as (select
tsrange(upper(occurrence), lead(lower(occurrence),1, LOCALTIMESTAMP + interval '1 year') over (win_user_gaps)),
user_id
from (
select user_id, occurrence
from reservations
) as x
WINDOW win_user_gaps AS (PARTITION BY user_id ORDER BY occurrence)
UNION ALL SELECT * FROM user_mins
)
select *
FROM gaps
-- "tsrange" is the implicit column name Postgres gives the tsrange(...) expression
ORDER BY user_id, tsrange
);
It currently gives the expected results as long as the user has at least one reservation, but if the user is new and has no reservations yet, I get an empty result.
I need to in some way append a {tsrange(LOCALTIMESTAMP, LOCALTIMESTAMP + interval '1 year'), user_id} row to the view for each user without a reservation, but I'm currently stumped as to how to do that.
Thanks

You should change the CTE to be a UNION ALL with the artificial rows and then use DISTINCT ON to select one row per user.
-- user_mins: one "initial gap" row per user, preferring a real row built from
-- the user's first upcoming reservation (priority 1) and falling back to an
-- artificial one-year-wide row (priority 0) for users with no reservations.
with user_mins as (
SELECT DISTINCT ON (user_id) user_id, tsrange
FROM (
    -- gap from now until the user's first upcoming reservation
    select tsrange(LOCALTIMESTAMP, min(lower(occurrence))) as tsrange,
           user_id,
           1 as priority  -- fixed typo: was "priotity", which broke ORDER BY priority
    FROM (
        SELECT user_id, occurrence
        FROM reservations
        WHERE lower(occurrence) >= LOCALTIMESTAMP
    ) as y
    GROUP BY user_id
    UNION ALL
    -- artificial fallback row for every user; columns are now aligned
    -- positionally with the first branch (tsrange, user_id, priority) --
    -- the original listed user_id first, which would fail the UNION ALL
    SELECT tsrange(LOCALTIMESTAMP, LOCALTIMESTAMP + interval '1 year'),
           user_id,
           0
    FROM users
) as ranked  -- Postgres requires an alias on a derived table
-- DISTINCT ON keeps the first row per user_id, i.e. the highest-priority one
ORDER BY user_id, priority DESC
)

SQL Fiddle
-- SQL Fiddle approach: clip every reservation against the current calendar
-- year, then intersect "the part of the year after this reservation" with
-- "the part of the year before the next one" to obtain each gap.
with this_year as (
-- half-open range covering the whole current calendar year
select tsrange(
date_trunc('year', current_date)::timestamp,
date_trunc('year', current_date)::timestamp + interval '1' year, '[)'
) as this_year
), gaps as (
select
user_id,
-- portion of the year strictly before this reservation starts
this_year - tsrange(lower(occurrence), 'infinity', '[]') lower_range,
-- portion of the year strictly after this reservation ends
this_year - tsrange('-infinity', upper(occurrence), '[]') upper_range,
this_year
from
reservations
cross join
this_year
)
select *
from (
select
user_id,
-- gap = after-current-reservation intersected with before-next-reservation;
-- lead() defaults to the whole year when there is no next reservation
upper_range *
lead (lower_range, 1, this_year)
over (partition by user_id order by lower_range, upper_range)
as gap
from gaps
union (
-- leading gap: from the start of the year up to the first reservation
select distinct on (user_id)
user_id,
tsrange(
lower(this_year),
coalesce(upper(lower_range), upper(this_year)),
'[)'
) as gap
from gaps
order by user_id, lower_range
)
) s
-- discard zero-width intersections
where gap != 'empty'
order by user_id, gap

Related

Grouping rows by ID and timestamp into sessions using BigQuery

I have a dataset like the one below and I'm looking to add the last column to this data.
The logic behind a session, is that it groups all rows by user_id into one session if they are within 5 days of the first event in a session.
In the example below, the users first event is 2023-01-01 which kicks off the first session. That is, although there is less than 5 days between 2023-01-04 and 2023-01-06, this is a new session as the 5 day counter resets when it's reached.
user_id timestamp session
user_1 2023-01-01 session_1
user_1 2023-01-01 session_1
user_1 2023-01-04 session_1
user_1 2023-01-06 session_2
user_1 2023-01-16 session_3
user_1 2023-01-16 session_3
user_1 2023-01-17 session_3
My data contains several users. How do I efficiently add this session column in BigQuery?
It seems to be kind of cumulative capped sum problem. If I understood your requirements correctly, you might consider below.
I've answered similar problem here with some explanation about below cumsumbin user defined function.
-- JS UDF: walks the ordered array of day-gaps, keeping a running sum that is
-- reset (and the bin counter incremented) whenever it would exceed 4 days;
-- the returned bin number is the session index for the current row.
CREATE TEMP FUNCTION cumsumbin(a ARRAY<INT64>) RETURNS INT64
LANGUAGE js AS """
bin = 1;
a.reduce((c, v) => {
if (c + Number(v) > 4) { bin += 1; return 0; }
else return c += Number(v);
}, 0);
return bin;
""";
WITH sample_table AS (
-- NOTE(review): the first branch uses a DATE literal while the rest are plain
-- strings; BigQuery coerces the string literals to DATE for the UNION ALL
SELECT 'user_1' user_id, DATE '2023-01-01' timestamp UNION ALL
SELECT 'user_1' user_id, '2023-01-01' timestamp UNION ALL
SELECT 'user_1' user_id, '2023-01-04' timestamp UNION ALL
SELECT 'user_1' user_id, '2023-01-06' timestamp UNION ALL
SELECT 'user_1' user_id, '2023-01-16' timestamp UNION ALL
SELECT 'user_1' user_id, '2023-01-16' timestamp UNION ALL
SELECT 'user_1' user_id, '2023-01-17' timestamp
)
-- diff = days since the user's previous event (NULL for the first event);
-- the ordered prefix of diffs is fed to cumsumbin to label each row
SELECT * EXCEPT(diff), 'session_' || cumsumbin(ARRAY_AGG(diff) OVER w1) session FROM (
SELECT *,
DATE_DIFF(timestamp, LAG(timestamp) OVER w0, DAY) AS diff
FROM sample_table
WINDOW w0 AS (PARTITION BY user_id ORDER BY timestamp)
) WINDOW w1 AS (PARTITION BY user_id ORDER BY timestamp);
Query results
Try the following:
with mydata as
(
select 'user_1' as user_id ,cast('2023-01-01' as date) as timestamp_
union all
select 'user_1' ,cast('2023-01-01' as date)
union all
select 'user_1' ,cast('2023-01-04' as date)
union all
select 'user_1' ,cast('2023-01-06' as date)
union all
select 'user_1' ,cast('2023-01-16' as date)
union all
select 'user_1' ,cast('2023-01-16' as date)
union all
select 'user_1' ,cast('2023-01-17' as date)
)
-- df = days since the user's first event; div(df, 5) buckets events into fixed
-- 5-day windows anchored at that first event, and dense_rank renumbers the
-- buckets 1..n without gaps.
-- NOTE(review): dense_rank() returns INT64 and BigQuery's || expects STRING --
-- a CAST may be required; verify before use.
select user_id, timestamp_,
'session_' || dense_rank() over (partition by user_id order by div(df, 5)) as session
from
(
select *,
date_diff(timestamp_, min(timestamp_) over (partition by user_id), day) df
from mydata
) T
order by user_id, timestamp_
Output according to your input data:
The logic here is to find the date difference between each date and the minimum date for each user, then perform an integer division by 5 on that date difference to create groups for the dates.
The use of dense_rank is to remove gaps that may occur from the grouping, if it's not important to have sessions ordered with no gaps you could remove it and use div(df, 5) instead.

How to get max date among others ids for current id using BigQuery?

I need to get max date for each row over other ids. Of course I can do this with CROSS JOIN and JOIN .
Like this
WITH t AS (
SELECT 1 AS id, rep_date FROM UNNEST(GENERATE_DATE_ARRAY('2021-09-01','2021-09-09', INTERVAL 1 DAY)) rep_date
UNION ALL
SELECT 2 AS id, rep_date FROM UNNEST(GENERATE_DATE_ARRAY('2021-08-20','2021-09-03', INTERVAL 1 DAY)) rep_date
UNION ALL
SELECT 3 AS id, rep_date FROM UNNEST(GENERATE_DATE_ARRAY('2021-08-25','2021-09-05', INTERVAL 1 DAY)) rep_date
)
-- max_date = this id's own maximum; max_date_over_others comes from the
-- joined subquery below (maximum of the per-id maxima across all OTHER ids)
SELECT id, rep_date, MAX(rep_date) OVER (PARTITION BY id) max_date, max_date_over_others FROM t
JOIN (
-- pair every id with every other id's per-id maximum, then take the greatest
SELECT t.id, MAX(max_date) max_date_over_others FROM t
CROSS JOIN (
SELECT id, MAX(rep_date) max_date FROM t
GROUP BY 1
) t1
WHERE t1.id <> t.id
GROUP BY 1
) USING (id)
But it's too unwieldy for huge tables. So I'm looking for some simpler way to do this. Any ideas?
Your version is good enough, I think. But if you want to try other options, consider the approach below. It might look more verbose at first glance, but it should be more efficient and cheaper compared with your cross-join version.
-- NOTE(review): this snippet appears to continue a WITH clause -- it assumes a
-- preceding "with t as (...)," defining t; "temp as (" is not valid standalone.
temp as (
select id,
-- max over all OTHER ids = greatest of the window maxima taken over the ids
-- strictly before and strictly after the current id; either side may be empty,
-- hence the '1970-01-01' sentinel via ifnull
greatest(
ifnull(max(max_date_for_id) over preceding_ids, '1970-01-01'),
ifnull(max(max_date_for_id) over following_ids, '1970-01-01')
) as max_date_for_rest_ids
from (
-- one row per id with its own maximum date
select id, max(rep_date) max_date_for_id
from t
group by id
)
window
preceding_ids as (order by id rows between unbounded preceding and 1 preceding),
following_ids as (order by id rows between 1 following and unbounded following)
)
select *
from t
join temp
using (id)
Assuming your original table data just has columns id and dt - wouldn't this solve it? I'm using the fact that if an id has the max dt of everything, then it gets the second-highest over the other id values.
-- For each id, the max date among all OTHER ids: if this id holds the overall
-- maximum, it receives the second-highest per-id maximum; otherwise it
-- receives the overall maximum.
WITH max_dates AS
(
    -- per-id maximum date
    SELECT
        id,
        MAX(dt) AS max_dt
    FROM
        data
    GROUP BY
        id
),
with_top1_value AS
(
    -- attach the global max/min of the per-id maxima; these must be window
    -- functions (OVER ()), not plain aggregates, since every row is kept.
    -- Fixed: original referenced a nonexistent column "dt" (CTE exposes max_dt)
    -- and used bare MAX/MIN without OVER ().
    SELECT
        *,
        MAX(max_dt) OVER () AS max_overall_dt_1,
        MIN(max_dt) OVER () AS min_overall_dt
    FROM
        max_dates
),
with_top2_values AS
(
    -- second-highest per-id maximum: mask the row holding the global max with
    -- the global min so it cannot win again
    SELECT
        *,
        MAX(CASE WHEN max_dt = max_overall_dt_1 THEN min_overall_dt ELSE max_dt END)
            OVER () AS max_overall_dt_2
    FROM
        with_top1_value
)
-- Fixed: removed the stray comma before this final SELECT and the
-- max_overall_dt_1 / max_overall_dt1 alias mismatch
SELECT
    *,
    CASE WHEN max_dt = max_overall_dt_1 THEN max_overall_dt_2 ELSE max_overall_dt_1 END AS max_dt_of_others
FROM
    with_top2_values

Window functions ordered by date when some dates don't exist

Suppose this example query:
-- Rolling sum of var over the current row and the 30 preceding rows,
-- computed independently for each id (note: row-based, not day-based).
select t.id,
       t.date,
       sum(t.var) over (
           partition by t.id
           order by t.date
           rows between 30 preceding and current row
       ) as roll_sum
from tab t
When some dates are not present in the date column, the window will not consider the missing dates. How could I make this window aggregation include these missing dates?
Many thanks!
You can join a sequence containing all dates from a desired interval.
-- Presto/Trino: generate every calendar date over a wide interval, left join
-- the original query onto it so missing dates show up with NULLs, then trim
-- the result to the actual date span of the data.
select
*
from (
select
d.date,
q.id,
q.roll_sum
from unnest(sequence(date '2000-01-01', date '2030-12-31')) d
left join ( your_query ) q on q.date = d.date
) v
-- keep only dates inside the observed range of the source table
where v.date > (select min(my_date) from tab2)
and v.date < (select max(my_date) from tab2)
In standard SQL, you would typically use a window range specification, like:
-- ANSI range frame: sums var over all rows of the same id whose date falls
-- within the 30 days up to and including the current row's date.
select t.id,
       t.date,
       sum(t.var) over (
           partition by t.id
           order by t.date
           range between interval '30' day preceding and current row
       ) as roll_sum
from tab t
However I am unsure that Presto supports this syntax. You can resort to a correlated subquery instead:
-- Portable fallback: a correlated scalar subquery that re-sums the trailing
-- 30-day window for every outer row (quadratic in the worst case, but it
-- avoids interval window frames entirely).
select
id,
date,
(
select sum(var)
from tab t1
where
t1.id = t.id
-- half-open-ish 30-day window ending at (and including) the current date
and t1.date >= t.date - interval '30' day
and t1.date <= t.date
) roll_sum
from tab t
I don't think Presto supports window functions with interval ranges, alas. There is an old-fashioned way of doing this, by counting "ins" and "outs" of values:
-- "Ins and outs" rolling sum without interval window frames: each original row
-- contributes +var at its date and a companion row contributes -var 30 days
-- later, so a running sum over the combined stream equals the 30-day total.
-- Fixed: the CTE was named t, shadowing the base table t it reads from;
-- sum(is_org) was a typo for is_orig; and inside a GROUP BY query the window
-- must run over the aggregated value, i.e. sum(sum(var)).
with ins_outs as (
    select id, date, var, 1 as is_orig
    from t
    union all
    -- expiry rows: cancel each value once it leaves the 30-day window
    select id, date + interval '30' day, -var, 0
    from t
)
select s.*
from (
    select id,
           date,
           -- aggregate to one row per id/date, then take the cumulative sum
           sum(sum(var)) over (partition by id order by date) as running_30,
           sum(is_orig) as is_orig
    from ins_outs
    group by id, date
) s
-- keep only dates that actually occur in the source data
where is_orig > 0

Active customers for each day who were active in last 30 days

I have a BQ table, user_events that looks like the following:
event_date | user_id | event_type
Data is for Millions of users, for different event dates.
I want to write a query that will give me a list of users for every day who were active in last 30 days.
This gives me total unique users on only that day; I can't get it to give me the last 30 for each date. Help is appreciated.
-- Distinct (user_id, event_date) pairs from the last 30 days,
-- newest dates first (BigQuery legacy SQL).
SELECT user_id, event_date
FROM [TableA]
WHERE user_id IS NOT NULL
  AND event_date >= DATE_ADD(CURRENT_TIMESTAMP(), -30, 'DAY')
GROUP BY user_id, event_date
ORDER BY event_date DESC
Below is for BigQuery Standard SQL and has few assumption about your case:
there is only one row per date per user
user is considered active in last 30 days if user has at least 5 (sure can be any number - even just 1) entries/rows within those 30 days
If above make sense - see below
#standardSQL
-- A user is "active" on event_date if they have >= 5 rows in the 30 calendar
-- days strictly before that date (frame: 30 PRECEDING .. 1 PRECEDING over
-- UNIX_DATE, so the RANGE measures days, not row positions).
SELECT
user_id, event_date
FROM (
SELECT
user_id, event_date,
(COUNT(1)
OVER(PARTITION BY user_id
ORDER BY UNIX_DATE(event_date)
RANGE BETWEEN 30 PRECEDING AND 1 PRECEDING)
) >= 5 AS activity
FROM `yourTable`
)
WHERE activity
GROUP BY user_id, event_date
-- ORDER BY event_date
If above assumption #1 is not correct - you can just simple add pre-grouping as a sub-select
#standardSQL
-- Same as the previous query, but with a pre-grouping sub-select so the
-- count is over distinct (user_id, event_date) days rather than raw rows,
-- for the case where the table has multiple rows per user per date.
SELECT
user_id, event_date
FROM (
SELECT
user_id, event_date,
(COUNT(1)
OVER(PARTITION BY user_id
ORDER BY UNIX_DATE(event_date)
RANGE BETWEEN 30 PRECEDING AND 1 PRECEDING)
) >= 5 AS activity
FROM (
-- collapse to one row per user per date before counting
SELECT user_id, event_date
FROM `yourTable`
GROUP BY user_id, event_date
)
)
WHERE activity
GROUP BY user_id, event_date
-- ORDER BY event_date
UPDATE
From comments: If user have any of the event_type IN ('view', 'conversion', 'productDetail', 'search') , they will be considered active. That means any kind of event triggered within the app
So, you can go with below, I think
#standardSQL
-- Same as the previous query, additionally restricted to the event types
-- that count toward "activity" per the comment thread above.
SELECT
user_id, event_date
FROM (
SELECT
user_id, event_date,
(COUNT(1)
OVER(PARTITION BY user_id
ORDER BY UNIX_DATE(event_date)
RANGE BETWEEN 30 PRECEDING AND 1 PRECEDING)
) >= 5 AS activity
FROM (
-- only qualifying event types, collapsed to one row per user per date
SELECT user_id, event_date
FROM `yourTable`
WHERE event_type IN ('view', 'conversion', 'productDetail', 'search')
GROUP BY user_id, event_date
)
)
WHERE activity
GROUP BY user_id, event_date
-- ORDER BY event_date

Google BigQuery: Rolling Count Distinct

I have a table with is simply a list of dates and user IDs (not aggregated).
We define a metric called active users for a given date by counting the distinct number of IDs that appear in the previous 45 days.
I am trying to run a query in BigQuery that, for each day, returns the day plus the number of active users for that day (count distinct user from 45 days ago until today).
I have experimented with window functions, but can't figure out how to define a range based on the date values in a column. Instead, I believe the following query would work in a database like MySQL, but does not in BigQuery.
-- Original attempt (fails in BigQuery legacy SQL: correlated subqueries are
-- not allowed in the SELECT list -- "Subselect not allowed in SELECT clause").
SELECT
day,
(SELECT
COUNT(DISTINCT visid)
FROM daily_users
WHERE day BETWEEN DATE_ADD(t.day, -45, "DAY") AND t.day
) AS active_users
FROM daily_users AS t
GROUP BY 1
This doesn't work in BigQuery: "Subselect not allowed in SELECT clause."
How to do this in BigQuery?
BigQuery documentation claims that count(distinct) works as a window function. However, that doesn't help you, because you are not looking for a traditional window frame.
One method adds a record for each date after a visit:
-- Project each visit forward onto the next 45 days, then count distinct
-- visitors landing on each day -- a join-based stand-in for a 45-day window.
select theday, count(distinct visid)
from (select date_add(u.day, n.n, "day") as theday, u.visid
from daily_users u cross join
-- ". . ." is a placeholder: spell out select 3 ... select 44 in real use
(select 1 as n union all select 2 union all . . .
select 45
) n
) u
-- NOTE(review): n starts at 1, so a visit's own day is excluded from its
-- window; add "select 0 as n" if "today" should count toward itself
group by theday;
Note: there may be simpler ways to generate a series of 45 integers in BigQuery.
Below should work with BigQuery
#legacySQL
-- Rolling 45-day distinct-user count: convert day to epoch seconds so the
-- RANGE frame can express "45 days" as 45*24*3600 seconds preceding.
SELECT day, active_users FROM (
SELECT
day,
COUNT(DISTINCT id)
OVER (ORDER BY ts RANGE BETWEEN 45*24*3600 PRECEDING AND CURRENT ROW) AS active_users
FROM (
SELECT day, id, TIMESTAMP_TO_SEC(TIMESTAMP(day)) AS ts
FROM daily_users
)
-- collapse the per-row window results down to one row per day
) GROUP BY 1, 2 ORDER BY 1
Above assumes that day field is represented as '2016-01-10' format.
If it is not a case , you should adjust TIMESTAMP_TO_SEC(TIMESTAMP(day)) in most inner select
Also please take a look at COUNT(DISTINC) specifics in BigQuery
Update for BigQuery Standard SQL
#standardSQL
-- Standard SQL version: COUNT(DISTINCT ...) cannot take a RANGE frame, so
-- collect the window's ids with ARRAY_AGG and count distinct afterwards.
SELECT
day,
(SELECT COUNT(DISTINCT id) FROM UNNEST(active_users) id) AS active_users
FROM (
SELECT
day,
-- 3888000 s = 45 days * 86400 s/day
ARRAY_AGG(id)
OVER (ORDER BY ts RANGE BETWEEN 3888000 PRECEDING AND CURRENT ROW) AS active_users
FROM (
SELECT day, id, UNIX_DATE(PARSE_DATE('%Y-%m-%d', day)) * 24 * 3600 AS ts
FROM daily_users
)
)
GROUP BY 1, 2
ORDER BY 1
You can test / play with it using below dummy sample
#standardSQL
-- Self-contained demo of the query above; the frame here is 86400 s (one day)
-- so the expected counts are easy to verify by hand.
WITH daily_users AS (
SELECT 1 AS id, '2016-01-10' AS day UNION ALL
SELECT 2 AS id, '2016-01-10' AS day UNION ALL
SELECT 1 AS id, '2016-01-11' AS day UNION ALL
SELECT 3 AS id, '2016-01-11' AS day UNION ALL
SELECT 1 AS id, '2016-01-12' AS day UNION ALL
SELECT 1 AS id, '2016-01-12' AS day UNION ALL
SELECT 1 AS id, '2016-01-12' AS day UNION ALL
SELECT 1 AS id, '2016-01-13' AS day
)
SELECT
day,
(SELECT COUNT(DISTINCT id) FROM UNNEST(active_users) id) AS active_users
FROM (
SELECT
day,
-- 86400 s = 1-day lookback (current day plus the previous day)
ARRAY_AGG(id)
OVER (ORDER BY ts RANGE BETWEEN 86400 PRECEDING AND CURRENT ROW) AS active_users
FROM (
SELECT day, id, UNIX_DATE(PARSE_DATE('%Y-%m-%d', day)) * 24 * 3600 AS ts
FROM daily_users
)
)
GROUP BY 1, 2
ORDER BY 1