Billing tier issues with 30-day active user query within BigQuery - SQL

Is there a way, using BigQuery, that I can run this query without having to use such a high billing tier? It ranges anywhere from 11 to 20 on the billing tier. Is my only option to crank up the billing tier and let the charges flow?
WITH allTables AS (SELECT
app,
date,
SUM(CASE WHEN period = 1 THEN users END) as days_1
FROM (
SELECT
CONCAT(user_dim.app_info.app_id, ':', user_dim.app_info.app_platform) as app,
dates.date as date,
periods.period as period,
COUNT(DISTINCT user_dim.app_info.app_instance_id) as users
FROM `table.*` as activity
CROSS JOIN
UNNEST(event_dim) AS event
CROSS JOIN (
SELECT DISTINCT
TIMESTAMP_TRUNC(TIMESTAMP_MICROS(event.timestamp_micros), DAY, 'UTC') as date
FROM `table.*`
CROSS JOIN
UNNEST(event_dim) as event) as dates
CROSS JOIN (
SELECT
period
FROM (
SELECT 1 as period
)
) as periods
WHERE
dates.date >= TIMESTAMP_TRUNC(TIMESTAMP_MICROS(event.timestamp_micros), DAY, 'UTC')
AND
FLOOR(TIMESTAMP_DIFF(dates.date, TIMESTAMP_MICROS(event.timestamp_micros), DAY)/periods.period) = 0
GROUP BY 1,2,3
)
GROUP BY 1,2)
SELECT
app as target,
UNIX_SECONDS(date) as datapoint_time,
SUM(days_1) as datapoint_value
FROM allTables
WHERE
date >= TIMESTAMP_ADD(TIMESTAMP_TRUNC(CURRENT_TIMESTAMP, Day, 'UTC'), INTERVAL -1 DAY)
GROUP BY date,1
ORDER BY date ASC

Related

In Postgres, how do I write a SQL query to select distinct values overall, but aggregated over a set time period?

What I mean by this is: if I have a table called payments with a created_at column and a user_id column, I want to select the count of purchases aggregated weekly (the interval can be anything I want), but only counting first-time purchases. For example, if a user purchased for the first time in week 1 they would be counted, but if they purchased again in week 2 they would not be counted.
created_at | user_id
-----------+---------
timestamp  | 1
timestamp  | 1
This is the query I came up with. The issue is that if a user purchases multiple times, all of their purchases are included. How can I improve this?
WITH dates AS
(
SELECT *
FROM generate_series(
'2022-07-22T15:30:06.687Z'::DATE,
'2022-11-21T17:04:59.457Z'::DATE,
'1 week'
) date
)
SELECT
dates.date::DATE AS date,
COALESCE(COUNT(DISTINCT(user_id)), 0) AS registrations
FROM
dates
LEFT JOIN
payment ON created_at::DATE BETWEEN dates.date AND dates.date::date + '1 ${dateUnit}'::INTERVAL
GROUP BY
dates.date
ORDER BY
dates.date DESC;
You want to count only first purchases, so get those first purchases in a first step and work with them.
WITH dates AS
(
SELECT *
FROM generate_series(
'2022-07-22T15:30:06.687Z'::DATE,
'2022-11-21T17:04:59.457Z'::DATE,
'1 week'
) date
)
, first_purchases AS
(
SELECT user_id, MIN(created_at::DATE) AS purchase_date
FROM payment
GROUP BY user_id
)
SELECT
d.date,
COALESCE(COUNT(p.purchase_date), 0) AS registrations
FROM
dates d
LEFT JOIN
first_purchases p ON p.purchase_date >= d.date
AND p.purchase_date < d.date + '1 ${dateUnit}'::INTERVAL
GROUP BY
d.date
ORDER BY
d.date DESC;

How to limit datasets using _TABLE_SUFFIX in a complex query?

I understand how _TABLE_SUFFIX works and have successfully used it before on simpler queries. I'm currently trying to build an application that will get active users from 100+ datasets but have been running into resource limits. In order to bypass these resource limits I'm going to loop and run the query multiple times and limit how much it selects at once using _TABLE_SUFFIX.
Here is my current query:
WITH allTables AS (SELECT
app,
date,
SUM(CASE WHEN period = 30 THEN users END) as days_30
FROM (
SELECT
CONCAT(user_dim.app_info.app_id, ':', user_dim.app_info.app_platform) as app,
dates.date as date,
periods.period as period,
COUNT(DISTINCT user_dim.app_info.app_instance_id) as users
FROM `table.app_events_*` as activity
WHERE _TABLE_SUFFIX BETWEEN '20170101' AND '20170502'
OR _TABLE_SUFFIX BETWEEN 'intraday_20170101' AND 'intraday_20170502'
CROSS JOIN
UNNEST(event_dim) AS event
CROSS JOIN (
SELECT DISTINCT
TIMESTAMP_TRUNC(TIMESTAMP_MICROS(event.timestamp_micros), DAY, 'UTC') as date
FROM `table.app_events_*`
WHERE _TABLE_SUFFIX BETWEEN '20170101' AND '20170502'
OR _TABLE_SUFFIX BETWEEN 'intraday_20170101' AND 'intraday_20170502'
CROSS JOIN
UNNEST(event_dim) as event) as dates
CROSS JOIN (
SELECT
period
FROM (
SELECT 30 as period
)
) as periods
WHERE
dates.date >= TIMESTAMP_TRUNC(TIMESTAMP_MICROS(event.timestamp_micros), DAY, 'UTC')
AND
FLOOR(TIMESTAMP_DIFF(dates.date, TIMESTAMP_MICROS(event.timestamp_micros), DAY)/periods.period) = 0
GROUP BY 1,2,3
)
GROUP BY 1,2)
SELECT
app as target,
UNIX_SECONDS(date) as datapoint_time,
SUM(days_30) as datapoint_value
FROM allTables
WHERE date >= TIMESTAMP_ADD(TIMESTAMP_TRUNC(CURRENT_TIMESTAMP, Day, 'UTC'), INTERVAL -30 DAY)
GROUP BY date,1
ORDER BY date ASC
This currently gives me:
Error: Syntax error: Expected ")" but got keyword CROSS at [14:3]
So my question is, how can I limit the amount of data I pull in using this query and _TABLE_SUFFIX? I feel like I'm missing something very simple here. Any help would be great, thanks!
The CROSS JOIN UNNEST(event_dim) AS event (and the cross join following it) needs to come before the WHERE clause. You can read more in the query syntax documentation.
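For illustration, here is a rough, untested sketch of how the inner query's FROM/WHERE could be reordered, using the same wildcard table and suffix ranges from the question (the periods sub-select is condensed, and the rest of the query is unchanged):

-- joins come first; the _TABLE_SUFFIX filter lives in the WHERE clause
FROM `table.app_events_*` AS activity
CROSS JOIN UNNEST(event_dim) AS event
CROSS JOIN (
  SELECT DISTINCT
    TIMESTAMP_TRUNC(TIMESTAMP_MICROS(event.timestamp_micros), DAY, 'UTC') AS date
  FROM `table.app_events_*`
  CROSS JOIN UNNEST(event_dim) AS event
  -- the suffix filter also goes after the joins inside the sub-select
  WHERE _TABLE_SUFFIX BETWEEN '20170101' AND '20170502'
     OR _TABLE_SUFFIX BETWEEN 'intraday_20170101' AND 'intraday_20170502'
) AS dates
CROSS JOIN (SELECT 30 AS period) AS periods
WHERE (_TABLE_SUFFIX BETWEEN '20170101' AND '20170502'
       OR _TABLE_SUFFIX BETWEEN 'intraday_20170101' AND 'intraday_20170502')
  AND dates.date >= TIMESTAMP_TRUNC(TIMESTAMP_MICROS(event.timestamp_micros), DAY, 'UTC')
  AND FLOOR(TIMESTAMP_DIFF(dates.date, TIMESTAMP_MICROS(event.timestamp_micros), DAY) / periods.period) = 0

Note the parentheses around the two _TABLE_SUFFIX ranges: once the filter is combined with the other conditions, the OR needs to be grouped explicitly.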

BigQuery Tier 20 or higher required

I'm attempting to run the following query within BigQuery:
SELECT
FORMAT_TIMESTAMP('%Y-%m-%d', TIMESTAMP_MICROS(date)) as target,
SUM(CASE WHEN period = 7 THEN users END) as days_07,
SUM(CASE WHEN period = 14 THEN users END) as days_14,
SUM(CASE WHEN period = 30 THEN users END) as days_30
FROM (
SELECT
activity.date as date,
periods.period as period,
COUNT(DISTINCT user) as users
FROM (
SELECT
event.timestamp_micros as date,
user_dim.app_info.app_instance_id as user
FROM `table.*`
CROSS JOIN
UNNEST(event_dim) as event
) as activity
CROSS JOIN (
SELECT
event.timestamp_micros as date
FROM `table.*`
CROSS JOIN
UNNEST(event_dim) as event
GROUP BY event.timestamp_micros
) as dates
CROSS JOIN (
SELECT period
FROM
(
SELECT 7 as period
UNION ALL
SELECT 14 as period
UNION ALL
SELECT 30 as period
)
) as periods
WHERE
dates.date >= activity.date
AND
SAFE_CAST(FLOOR(TIMESTAMP_DIFF(TIMESTAMP_MICROS(dates.date), TIMESTAMP_MICROS(activity.date), DAY)/periods.period) AS INT64) = 0
GROUP BY 1,2
)
GROUP BY date
ORDER BY date DESC
It works and will select the active users for specific time frames if I run it on a single table, but within my actual application I'm going to be running this on all of my datasets (40+). When I attempt to run it on a single dataset with all tables (dataset.*) I get this error:
Query exceeded resource limits for tier 1. Tier 20 or higher required.
I'm unsure what I can do now. I'm thinking I might have to end up moving this into code instead of SQL for performance's sake.
I think I see the reason this query is so CPU-expensive that it gets "promoted" to such a high billing tier.
The reason is that the sub-selects dates and activity have a huge number of rows, because each row represents a timestamp in microseconds, so no pre-grouping is happening at all.
So, I recommend transforming the below
FROM (
SELECT
event.timestamp_micros as date,
user_dim.app_info.app_instance_id as user
FROM `table.*`
CROSS JOIN
UNNEST(event_dim) as event
) as activity
into
FROM (
SELECT DISTINCT
DATE(TIMESTAMP_MICROS(event.timestamp_micros)) AS DATE,
user_dim.app_info.app_instance_id AS user
FROM `firebase-analytics-sample-data.android_dataset.app_events_20160607`
CROSS JOIN UNNEST(event_dim) AS event
) AS activity
and, respectively, the below
CROSS JOIN (
SELECT
event.timestamp_micros as date
FROM `table.*`
CROSS JOIN
UNNEST(event_dim) as event
GROUP BY event.timestamp_micros
) as dates
into
CROSS JOIN (
SELECT DATE(TIMESTAMP_MICROS(event.timestamp_micros)) AS DATE
FROM `firebase-analytics-sample-data.android_dataset.app_events_20160607`
CROSS JOIN UNNEST(event_dim) AS event
GROUP BY 1
) AS dates
The above change will make the number of rows much lower, so the CROSS JOIN will not be that expensive.
Of course, you will then need to modify the other pieces of your query accordingly, to accommodate the fact that the date fields are now of DATE type and not microseconds anymore.
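As a rough, untested sketch of that adjustment, assuming the DATE-typed fields produced by the rewritten sub-selects above, the date filter would become something like:

-- compare DATE values with DATE_DIFF instead of TIMESTAMP_DIFF on microseconds
WHERE dates.date >= activity.date
  AND SAFE_CAST(FLOOR(DATE_DIFF(dates.date, activity.date, DAY) / periods.period) AS INT64) = 0
-- the outer SELECT can then group by the DATE value directly,
-- instead of formatting TIMESTAMP_MICROS(date)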
Hope this helps!

BigQuery Monthly Active Users?

I'm currently working off a query from this post. That query is written in Legacy SQL and will not work in my environment. I've modified the query to use the modern SQL functions and updated the SELECT date as date to use timestamp_micros.
I should also mention that the rows I'm trying to select are coming in from Firebase Analytics.
My Query:
SELECT
FORMAT_TIMESTAMP('%Y-%m-%d', TIMESTAMP_MICROS(event.timestamp_micros)) as date,
SUM(CASE WHEN period = 7 THEN users END) as days_07,
SUM(CASE WHEN period = 14 THEN users END) as days_14,
SUM(CASE WHEN period = 30 THEN users END) as days_30
FROM (
SELECT
FORMAT_TIMESTAMP('%Y-%m-%d', TIMESTAMP_MICROS(event.timestamp_micros)) as date,
periods.period as period,
COUNT(DISTINCT user_dim.app_info.app_instance_id) as users
FROM `com_sidearm_fanapp_uiowa_IOS.*` as activity
CROSS JOIN
UNNEST(event_dim) as event
CROSS JOIN (
SELECT
FORMAT_TIMESTAMP('%Y-%m-%d', TIMESTAMP_MICROS(event.timestamp_micros)) as date
FROM `com_sidearm_fanapp_uiowa_IOS.*`
CROSS JOIN
UNNEST(event_dim) as event
GROUP BY event.timestamp_micros
) as dates
CROSS JOIN (
SELECT
period
FROM
(
SELECT 7 as period
UNION ALL
SELECT 14 as period
UNION ALL
SELECT 30 as period
)
) as periods
WHERE
dates.date >= activity.date
AND
SAFE_CAST(FLOOR(TIMESTAMP_DIFF(dates.date, activity.date, DAY)/periods.period) AS INT64) = 0
GROUP BY 1,2
)
CROSS JOIN
UNNEST(event_dim) as event
GROUP BY date
ORDER BY date DESC
Running it gives me a "Column name period is ambiguous at [24:13]" error.
To fix this particular error, you should fix the below:
CROSS JOIN (
SELECT
period
FROM
(SELECT 7 as period),
(SELECT 14 as period),
(SELECT 30 as period)
) as periods
so it should look like:
CROSS JOIN (
SELECT
period
FROM
(SELECT 7 as period UNION ALL
SELECT 14 as period UNION ALL
SELECT 30 as period)
) as periods
Answer to your updated question
Try the below. I didn't have a chance to test it, but I hope it can help you fix your query:
SELECT
date,
SUM(CASE WHEN period = 7 THEN users END) as days_07,
SUM(CASE WHEN period = 14 THEN users END) as days_14,
SUM(CASE WHEN period = 30 THEN users END) as days_30
FROM (
SELECT
activity.date as date,
periods.period as period,
COUNT(DISTINCT user) as users
FROM (
SELECT
event.timestamp_micros as date,
user_dim.app_info.app_instance_id as user
FROM `yourTable` CROSS JOIN UNNEST(event_dim) as event
) as activity
CROSS JOIN (
SELECT
event.timestamp_micros as date
FROM `yourTable` CROSS JOIN UNNEST(event_dim) as event
GROUP BY event.timestamp_micros
) as dates
CROSS JOIN (
SELECT period
FROM
(SELECT 7 as period UNION ALL
SELECT 14 as period UNION ALL
SELECT 30 as period)
) as periods
WHERE dates.date >= activity.date
AND SAFE_CAST(FLOOR(TIMESTAMP_DIFF(TIMESTAMP_MICROS(dates.date), TIMESTAMP_MICROS(activity.date), DAY)/periods.period) AS INT64) = 0
GROUP BY 1,2
)
GROUP BY date
ORDER BY date DESC

How to get the monthly 7-day active users?

In my database I have two fields that are used to identify a user: timestamp and instance_id. I want to be able to get the monthly 7-day active users from this data. I have tried the following query, but it just returns the same timestamp and 1 for every row.
SELECT
FORMAT_TIMESTAMP('%Y-%m-%d', TIMESTAMP_MICROS(date)) as target,
SUM(CASE WHEN period = 7 THEN users END) as days_07
# SUM(CASE WHEN period = 14 THEN users END) as days_14,
# SUM(CASE WHEN period = 30 THEN users END) as days_30
FROM (
SELECT
activity.date as date,
periods.period as period,
COUNT(DISTINCT user) as users
FROM (
SELECT
event.timestamp_micros as date,
user_dim.app_info.app_instance_id as user
FROM `hidden.*`
CROSS JOIN
UNNEST(event_dim) as event
) as activity
CROSS JOIN (
SELECT
event.timestamp_micros as date
FROM `hidden.*`
CROSS JOIN
UNNEST(event_dim) as event
GROUP BY event.timestamp_micros
) as dates
CROSS JOIN (
SELECT period
FROM
(
SELECT 7 as period
# UNION ALL
# SELECT 14 as period
# UNION ALL
# SELECT 30 as period
)
) as periods
WHERE
dates.date >= activity.date
AND
SAFE_CAST(FLOOR(TIMESTAMP_DIFF(TIMESTAMP_MICROS(dates.date), TIMESTAMP_MICROS(activity.date), DAY)/periods.period) AS INT64) = 0
GROUP BY 1,2
)
GROUP BY date
ORDER BY date DESC
I'm not too sure where to go from here, and it's quite challenging for me because I'm not the best with SQL. Any assistance at all would be great. Thanks!
I should also mention that these queries are going to be run within BigQuery and the data is being exported to BigQuery from Firebase.
Try the below:
SELECT
DATE,
SUM(CASE WHEN period = 7 THEN users END) AS days_07,
SUM(CASE WHEN period = 14 THEN users END) AS days_14,
SUM(CASE WHEN period = 30 THEN users END) AS days_30
FROM (
SELECT
activity.date AS DATE,
periods.period AS period,
COUNT(DISTINCT user) AS users
FROM (
SELECT DISTINCT
DATE(TIMESTAMP_MICROS(event.timestamp_micros)) AS DATE,
user_dim.app_info.app_instance_id AS user
FROM `firebase-analytics-sample-data.android_dataset.app_events_20160607`
CROSS JOIN UNNEST(event_dim) AS event
) AS activity
CROSS JOIN (
SELECT DATE(TIMESTAMP_MICROS(event.timestamp_micros)) AS DATE
FROM `firebase-analytics-sample-data.android_dataset.app_events_20160607`
CROSS JOIN UNNEST(event_dim) AS event
GROUP BY 1
) AS dates
CROSS JOIN (
SELECT period FROM
(SELECT 7 AS period UNION ALL
SELECT 14 AS period UNION ALL
SELECT 30 AS period)
) AS periods
WHERE dates.date >= activity.date
AND SAFE_CAST(FLOOR(DATE_DIFF(dates.date, activity.date, DAY)/periods.period) AS INT64) = 0
GROUP BY 1,2
)
GROUP BY DATE
ORDER BY DATE DESC
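If you later need to run this over many daily tables rather than the single sample table, the same _TABLE_SUFFIX pattern from the earlier answer applies. As a rough sketch of the activity sub-select only (the wildcard table name and date range here are placeholders, not part of the sample dataset):

FROM (
  SELECT DISTINCT
    DATE(TIMESTAMP_MICROS(event.timestamp_micros)) AS DATE,
    user_dim.app_info.app_instance_id AS user
  FROM `yourDataset.app_events_*`
  CROSS JOIN UNNEST(event_dim) AS event
  -- placeholder range; remember the filter belongs after the CROSS JOIN
  WHERE _TABLE_SUFFIX BETWEEN '20170101' AND '20170502'
) AS activity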