BigQuery Tier 20 or higher required - sql

I'm attempting to run the following query within BigQuery:
-- Rolling 7/14/30-day active users: for every reporting timestamp in `dates`,
-- count the distinct app instances that were active within the preceding
-- `period` days, then pivot the three periods into columns.
SELECT
  FORMAT_TIMESTAMP('%Y-%m-%d', TIMESTAMP_MICROS(date)) AS target,
  SUM(CASE WHEN period = 7 THEN users END) AS days_07,
  SUM(CASE WHEN period = 14 THEN users END) AS days_14,
  SUM(CASE WHEN period = 30 THEN users END) AS days_30
FROM (
  SELECT
    -- Key the count on the reporting timestamp (dates.date). Keying on
    -- activity.date made every period count only the users active at that one
    -- activity timestamp, so days_07, days_14 and days_30 were always equal.
    dates.date AS date,
    periods.period AS period,
    COUNT(DISTINCT user) AS users
  FROM (
    -- One row per unnested event: microsecond timestamp + app instance id.
    SELECT
      event.timestamp_micros AS date,
      user_dim.app_info.app_instance_id AS user
    FROM `table.*`
    CROSS JOIN UNNEST(event_dim) AS event
  ) AS activity
  CROSS JOIN (
    -- Distinct event timestamps, used as the reporting points.
    SELECT event.timestamp_micros AS date
    FROM `table.*`
    CROSS JOIN UNNEST(event_dim) AS event
    GROUP BY event.timestamp_micros
  ) AS dates
  CROSS JOIN (
    SELECT period
    FROM (
      SELECT 7 AS period
      UNION ALL
      SELECT 14 AS period
      UNION ALL
      SELECT 30 AS period
    )
  ) AS periods
  WHERE
    -- Activity must not be later than the reporting point ...
    dates.date >= activity.date
    -- ... and must fall fewer than `period` whole days before it.
    AND SAFE_CAST(FLOOR(TIMESTAMP_DIFF(TIMESTAMP_MICROS(dates.date), TIMESTAMP_MICROS(activity.date), DAY) / periods.period) AS INT64) = 0
  GROUP BY dates.date, periods.period
)
GROUP BY date
ORDER BY date DESC
It is working and will select the active users for specific time frames if I run it on a single table but within my actual application I'm going to be running this on all my datasets (40+). When I attempt to run it on a single dataset with all tables dataset.* I get this error:
Query exceeded resource limits for tier 1. Tier 20 or higher required.
I'm unsure what I can do now. I'm thinking that possibly I might have to end up moving this to code instead of SQL for performance sake.

I think I see the reason for this query to be CPU expensive so it gets "promoted" to that high billing tier
The reason is that the sub-selects dates and activity have a huge number of rows: each row carries a timestamp in microseconds, so no pre-grouping happens at all
So, I recommend to transform below
FROM (
-- Raw activity rows: one per unnested event, carrying the microsecond
-- timestamp (aliased date) and the app instance id (aliased user).
SELECT
event.timestamp_micros as date,
user_dim.app_info.app_instance_id as user
FROM `table.*`
CROSS JOIN
UNNEST(event_dim) as event
) as activity
into
FROM (
  -- Collapse raw events to one row per (calendar day, app instance) so the
  -- cross join that follows works on far fewer rows.
  -- Reads the same `table.*` source as the fragment this replaces.
  SELECT DISTINCT
    DATE(TIMESTAMP_MICROS(event.timestamp_micros)) AS date,
    user_dim.app_info.app_instance_id AS user
  FROM `table.*`
  CROSS JOIN UNNEST(event_dim) AS event
) AS activity
and respectively below
CROSS JOIN (
-- Distinct event timestamps; the grouping key is the microsecond value itself.
SELECT
event.timestamp_micros as date
FROM `table.*`
CROSS JOIN
UNNEST(event_dim) as event
GROUP BY event.timestamp_micros
) as dates
into
CROSS JOIN (
  -- One row per calendar day that has at least one event.
  -- Reads the same `table.*` source as the fragment this replaces.
  SELECT DATE(TIMESTAMP_MICROS(event.timestamp_micros)) AS date
  FROM `table.*`
  CROSS JOIN UNNEST(event_dim) AS event
  GROUP BY 1
) AS dates
The above change will make the number of rows much lower, so the CROSS JOIN will not be that expensive.
Of course, you then need to modify the other pieces of your query accordingly, to accommodate the fact that the date fields are now actually of DATE type and not microseconds anymore (for example, compare them with DATE_DIFF instead of TIMESTAMP_DIFF).
Hope this helps!

Related

SQL - Unequal left join BigQuery

New here. I am trying to get the daily and weekly active users over time. Users have 30 days after their last activity before they are considered inactive. My goal is to create graphs that can be split by user_id to show cohorts, regions, categories, etc.
I have created a date table to get every day for the time period and I have the simplified orders table with the base info that I need to calculate this.
I am trying to do a Left Join to get the status by date using the following SQL Query:
-- Attempt to compute, for every calendar day and user, the number of days since
-- the user's most recent activity inside a 30-day window.
WITH daily_use AS (
-- One row per (__key__.id aliased as user_id, order day), restricted to
-- buying-group orders that are not test orders.
SELECT
__key__.id AS user_id
, DATE_TRUNC(date(placeOrderDate), day) AS activity_date
FROM `analysis.Order`
where isBuyingGroupOrder = TRUE
AND testOrder = FALSE
GROUP BY 1, 2
),
dates AS (
-- Calendar table: 2016-01-01 plus a day offset d. The offsets 0, 1, 2, ... come
-- from numbering rows of analysis.Order, which serves only as a row source;
-- LIMIT 1096 caps the calendar at 1096 days (fewer if the table has fewer rows).
SELECT DATE_ADD(DATE "2016-01-01", INTERVAL d.d DAY) AS date
FROM
(
SELECT ROW_NUMBER() OVER(ORDER BY __key__.id) -1 AS d
FROM `analysis.Order`
ORDER BY __key__.id
LIMIT 1096
) AS d
ORDER BY 1 DESC
)
SELECT
daily_use.user_id
, wd.date AS date
, MIN(DATE_DIFF(wd.date, daily_use.activity_date, DAY)) AS days_since_last_action
FROM dates AS wd
-- The ON clause below contains only range predicates (no equality between the
-- two sides); this is the join the reported LEFT OUTER JOIN error refers to.
LEFT JOIN daily_use
ON wd.date >= daily_use.activity_date
AND wd.date < DATE_ADD(daily_use.activity_date, INTERVAL 30 DAY)
GROUP BY 1,2
I am getting this error in BigQuery: "LEFT OUTER JOIN cannot be used without a condition that is an equality of fields from both sides of the join." I was wondering how I can get around this. I am using Standard SQL within BigQuery.
Thank you
Below is for BigQuery Standard SQL. It mostly reproduces the logic of your query, with the exception that days on which no activity at all is found are not included
#standardSQL
-- Days since each user's most recent activity, for every calendar day that
-- falls inside a 30-day window after some activity of that user.
-- Expects the `dates` and `daily_use` CTEs from the question's WITH clause.
SELECT
  daily_use.user_id,
  wd.date AS date,
  MIN(DATE_DIFF(wd.date, daily_use.activity_date, DAY)) AS days_since_last_action
FROM dates AS wd
CROSS JOIN daily_use
-- Half-open window [activity_date, activity_date + 30 days), matching the
-- question's `>= ... AND ... <` condition. BETWEEN is closed on both ends and
-- would also admit day 30.
WHERE wd.date >= daily_use.activity_date
  AND wd.date < DATE_ADD(daily_use.activity_date, INTERVAL 30 DAY)
GROUP BY daily_use.user_id, wd.date
-- ORDER BY 1,2
If for whatever reason you still need to reproduce your logic exactly, you can wrap the above in a final LEFT JOIN, as below:
#standardSQL
-- Same result as the CROSS JOIN form, but every calendar day is kept: days
-- without any qualifying activity appear once with NULL user_id and NULL
-- days_since_last_action.
-- Expects the `dates` and `daily_use` CTEs from the question's WITH clause.
SELECT
  wd.date,
  activity.user_id,
  activity.days_since_last_action
FROM dates AS wd
LEFT JOIN (
  SELECT
    du.user_id,
    d.date,
    MIN(DATE_DIFF(d.date, du.activity_date, DAY)) AS days_since_last_action
  FROM dates AS d
  CROSS JOIN daily_use AS du
  -- Half-open window [activity_date, activity_date + 30 days), matching the
  -- question's `>= ... AND ... <` condition. BETWEEN is closed on both ends
  -- and would also admit day 30.
  WHERE d.date >= du.activity_date
    AND d.date < DATE_ADD(du.activity_date, INTERVAL 30 DAY)
  GROUP BY du.user_id, d.date
) AS activity
  -- The equality condition is what makes this LEFT JOIN acceptable to BigQuery.
  ON activity.date = wd.date
-- ORDER BY 1,2

Billing tier issues with 30 day active user query within bigquery

Is there a way using bigquery that I can run this query and not have to use such a huge billing tier? It ranges anywhere from 11 - 20 on the billing tier. Is my only option to crank up the billing tier and let the charges flow?
-- 1-day active users per app and UTC day, reported from yesterday (UTC) onwards.
-- Output: target (app_id:platform), datapoint_time (unix seconds of the day
-- start), datapoint_value (distinct app instances).
WITH daily_activity AS (
  -- One row per (app, app instance, UTC day). De-duplicating before the cross
  -- join keeps the joined row count small; COUNT(DISTINCT ...) is unaffected.
  SELECT DISTINCT
    CONCAT(user_dim.app_info.app_id, ':', user_dim.app_info.app_platform) AS app,
    user_dim.app_info.app_instance_id AS app_instance_id,
    TIMESTAMP_TRUNC(TIMESTAMP_MICROS(event.timestamp_micros), DAY, 'UTC') AS activity_day
  FROM `table.*`
  CROSS JOIN UNNEST(event_dim) AS event
),
dates AS (
  -- Every UTC day that has at least one event.
  SELECT DISTINCT activity_day AS date
  FROM daily_activity
),
periods AS (
  SELECT 1 AS period
),
allTables AS (
  SELECT
    app,
    date,
    SUM(CASE WHEN period = 1 THEN users END) AS days_1
  FROM (
    SELECT
      activity.app AS app,
      dates.date AS date,
      periods.period AS period,
      COUNT(DISTINCT activity.app_instance_id) AS users
    FROM daily_activity AS activity
    CROSS JOIN dates
    CROSS JOIN periods
    WHERE
      dates.date >= activity.activity_day
      -- Compare day starts on both sides. Diffing against the raw event
      -- timestamp truncates to 0 whole days for anything in the previous
      -- 24 hours, which pulled the entire previous day into each window.
      AND TIMESTAMP_DIFF(dates.date, activity.activity_day, DAY) < periods.period
    GROUP BY 1, 2, 3
  )
  GROUP BY app, date
)
SELECT
  app AS target,
  UNIX_SECONDS(date) AS datapoint_time,
  SUM(days_1) AS datapoint_value
FROM allTables
WHERE date >= TIMESTAMP_ADD(TIMESTAMP_TRUNC(CURRENT_TIMESTAMP(), DAY, 'UTC'), INTERVAL -1 DAY)
GROUP BY date, app
ORDER BY date ASC

How to limit datasets using _table_suffix on complex query?

I understand how _TABLE_SUFFIX works and have successfully used it before on simpler queries. I'm currently trying to build an application that will get active users from 100+ datasets but have been running into resource limits. In order to bypass these resource limits I'm going to loop and run the query multiple times and limit how much it selects at once using _TABLE_SUFFIX.
Here is my current query:
WITH allTables AS (SELECT
app,
date,
SUM(CASE WHEN period = 30 THEN users END) as days_30
FROM (
SELECT
CONCAT(user_dim.app_info.app_id, ':', user_dim.app_info.app_platform) as app,
dates.date as date,
periods.period as period,
COUNT(DISTINCT user_dim.app_info.app_instance_id) as users
FROM `table.app_events_*` as activity
WHERE _TABLE_SUFFIX BETWEEN '20170101' AND '20170502' -- NOTE(review): this WHERE sits before the CROSS JOINs that belong to the same FROM clause
OR _TABLE_SUFFIX BETWEEN 'intraday_20170101' AND 'intraday_20170502'
CROSS JOIN -- line 14 of the query: where the reported error (Expected ")" but got keyword CROSS) is raised
UNNEST(event_dim) AS event
CROSS JOIN (
SELECT DISTINCT
TIMESTAMP_TRUNC(TIMESTAMP_MICROS(event.timestamp_micros), DAY, 'UTC') as date
FROM `table.app_events_*`
WHERE _TABLE_SUFFIX BETWEEN '20170101' AND '20170502' -- NOTE(review): same ordering as above, WHERE precedes the CROSS JOIN UNNEST
OR _TABLE_SUFFIX BETWEEN 'intraday_20170101' AND 'intraday_20170502'
CROSS JOIN
UNNEST(event_dim) as event) as dates
CROSS JOIN (
SELECT
period
FROM (
SELECT 30 as period
)
) as periods
WHERE -- second WHERE of the same SELECT; the suffix filter above is a separate clause
dates.date >= TIMESTAMP_TRUNC(TIMESTAMP_MICROS(event.timestamp_micros), DAY, 'UTC')
AND
FLOOR(TIMESTAMP_DIFF(dates.date, TIMESTAMP_MICROS(event.timestamp_micros), DAY)/periods.period) = 0
GROUP BY 1,2,3
)
GROUP BY 1,2)
SELECT
app as target,
UNIX_SECONDS(date) as datapoint_time,
SUM(days_30) as datapoint_value
FROM allTables
WHERE date >= TIMESTAMP_ADD(TIMESTAMP_TRUNC(CURRENT_TIMESTAMP, Day, 'UTC'), INTERVAL -30 DAY) -- keeps only the last 30 UTC days of results
GROUP BY date,1
ORDER BY date ASC
This currently gives me:
Error: Syntax error: Expected ")" but got keyword CROSS at [14:3]
So my question is, how can I limit the amount of data I pull in using this query and _TABLE_SUFFIX? I feel like I'm missing something very simple here. Any help would be great, thanks!
The CROSS JOIN UNNEST(event_dim) AS event (and the cross join following it) needs to come before the WHERE clause. You can read more in the query syntax documentation.

BigQuery Monthly Active Users?

I'm currently working off a query from this post. That query is written in Legacy SQL and will not work in my environment. I've modified the query to use the modern SQL functions and updated the SELECT date as date to use timestamp_micros.
I should also mention that the rows I'm trying to select are coming in from Firebase Analytics.
My Query:
SELECT
FORMAT_TIMESTAMP('%Y-%m-%d', TIMESTAMP_MICROS(event.timestamp_micros)) as date, -- NOTE(review): reads event.*, which only the trailing CROSS JOIN UNNEST at the end could supply
SUM(CASE WHEN period = 7 THEN users END) as days_07,
SUM(CASE WHEN period = 14 THEN users END) as days_14,
SUM(CASE WHEN period = 30 THEN users END) as days_30
FROM (
SELECT
FORMAT_TIMESTAMP('%Y-%m-%d', TIMESTAMP_MICROS(event.timestamp_micros)) as date,
periods.period as period,
COUNT(DISTINCT user_dim.app_info.app_instance_id) as users
FROM `com_sidearm_fanapp_uiowa_IOS.*` as activity -- activity aliases the raw export table itself
CROSS JOIN
UNNEST(event_dim) as event
CROSS JOIN (
SELECT
FORMAT_TIMESTAMP('%Y-%m-%d', TIMESTAMP_MICROS(event.timestamp_micros)) as date -- dates.date is a formatted string here, not a timestamp
FROM `com_sidearm_fanapp_uiowa_IOS.*`
CROSS JOIN
UNNEST(event_dim) as event
GROUP BY event.timestamp_micros -- grouped by the raw microsecond value, so the same day string can repeat
) as dates
CROSS JOIN (
SELECT
period
FROM
(
SELECT 7 as period
UNION ALL
SELECT 14 as period
UNION ALL
SELECT 30 as period
)
) as periods
WHERE
dates.date >= activity.date -- NOTE(review): no date column is defined on activity in this query; confirm the table has one
AND
SAFE_CAST(FLOOR(TIMESTAMP_DIFF(dates.date, activity.date, DAY)/periods.period) AS INT64) = 0 -- NOTE(review): TIMESTAMP_DIFF receives dates.date, which is produced by FORMAT_TIMESTAMP above
GROUP BY 1,2
)
CROSS JOIN -- NOTE(review): event_dim is not among the columns the subquery above returns (date, period, users)
UNNEST(event_dim) as event
GROUP BY date
ORDER BY date DESC
Column name period is ambiguous at [24:13] error.
to fix this particular error - you should fix below
CROSS JOIN (
SELECT
period -- unqualified, while each of the three subqueries below exposes its own period column
FROM
(SELECT 7 as period), -- comma-separated subqueries are joined side by side, not stacked into one column
(SELECT 14 as period),
(SELECT 30 as period)
) as periods
so it should look like:
CROSS JOIN (
  -- Stack the three look-back windows into a single `period` column.
  SELECT period
  FROM (
    SELECT 7 AS period
    UNION ALL
    SELECT 14 AS period
    UNION ALL
    SELECT 30 AS period
  )
) AS periods
Answer on your updated question
Try below. I didn't have chance to test it but hope it can help you fix your query
-- Rolling 7/14/30-day active users: for every reporting timestamp in `dates`,
-- count the distinct app instances active within the preceding `period` days,
-- then pivot the periods into columns. `date` stays in raw microseconds.
SELECT
  date,
  SUM(CASE WHEN period = 7 THEN users END) AS days_07,
  SUM(CASE WHEN period = 14 THEN users END) AS days_14,
  SUM(CASE WHEN period = 30 THEN users END) AS days_30
FROM (
  SELECT
    -- Key the count on the reporting timestamp (dates.date). Keying on
    -- activity.date counts only the users active at that single activity
    -- timestamp, which makes all three period columns identical.
    dates.date AS date,
    periods.period AS period,
    COUNT(DISTINCT user) AS users
  FROM (
    -- One row per unnested event: microsecond timestamp + app instance id.
    SELECT
      event.timestamp_micros AS date,
      user_dim.app_info.app_instance_id AS user
    FROM `yourTable`
    CROSS JOIN UNNEST(event_dim) AS event
  ) AS activity
  CROSS JOIN (
    -- Distinct event timestamps, used as the reporting points.
    SELECT event.timestamp_micros AS date
    FROM `yourTable`
    CROSS JOIN UNNEST(event_dim) AS event
    GROUP BY event.timestamp_micros
  ) AS dates
  CROSS JOIN (
    SELECT period
    FROM (
      SELECT 7 AS period
      UNION ALL
      SELECT 14 AS period
      UNION ALL
      SELECT 30 AS period
    )
  ) AS periods
  WHERE
    dates.date >= activity.date
    -- Activity must fall fewer than `period` whole days before the reporting point.
    AND SAFE_CAST(FLOOR(TIMESTAMP_DIFF(TIMESTAMP_MICROS(dates.date), TIMESTAMP_MICROS(activity.date), DAY) / periods.period) AS INT64) = 0
  GROUP BY dates.date, periods.period
)
GROUP BY date
ORDER BY date DESC

How to get the monthly 7-day active users?

In my database I have two fields that are used to identify a user, timestamp and instance_id. I want to be able to get the monthly 7-day active users from this data. I have tried the following query but it just returns the same timestamp and 1 for every row.
# Attempt at 7-day active users; the 14- and 30-day variants are commented out below.
SELECT
FORMAT_TIMESTAMP('%Y-%m-%d', TIMESTAMP_MICROS(date)) as target,
SUM(CASE WHEN period = 7 THEN users END) as days_07
# SUM(CASE WHEN period = 14 THEN users END) as days_14,
# SUM(CASE WHEN period = 30 THEN users END) as days_30
FROM (
SELECT
# NOTE(review): results are keyed on activity.date, the raw microsecond event
# timestamp, so every output row describes a single event timestamp. That is
# consistent with the reported output of a repeated day and a count of 1 per row.
activity.date as date,
periods.period as period,
COUNT(DISTINCT user) as users
FROM (
# One row per unnested event: microsecond timestamp and app instance id.
SELECT
event.timestamp_micros as date,
user_dim.app_info.app_instance_id as user
FROM `hidden.*`
CROSS JOIN
UNNEST(event_dim) as event
) as activity
CROSS JOIN (
# Distinct event timestamps, still at microsecond granularity.
SELECT
event.timestamp_micros as date
FROM `hidden.*`
CROSS JOIN
UNNEST(event_dim) as event
GROUP BY event.timestamp_micros
) as dates
CROSS JOIN (
SELECT period
FROM
(
SELECT 7 as period
# UNION ALL
# SELECT 14 as period
# UNION ALL
# SELECT 30 as period
)
) as periods
WHERE
dates.date >= activity.date
AND
SAFE_CAST(FLOOR(TIMESTAMP_DIFF(TIMESTAMP_MICROS(dates.date), TIMESTAMP_MICROS(activity.date), DAY)/periods.period) AS INT64) = 0
GROUP BY 1,2
)
# The outer grouping key is the microsecond value, not the formatted day in target.
GROUP BY date
ORDER BY date DESC
I'm not too sure where to go from here and it's quite challenging to me because I'm not the best with SQL. Any assistance at all would be great. Thanks!
I should also mention that these queries are going to be run within BigQuery and the data is being exported to BigQuery from Firebase.
Try below
-- Rolling 7/14/30-day active users per calendar day: for every day in `dates`,
-- count the distinct app instances active on that day or within the preceding
-- (period - 1) days, then pivot the periods into columns.
SELECT
  date,
  SUM(CASE WHEN period = 7 THEN users END) AS days_07,
  SUM(CASE WHEN period = 14 THEN users END) AS days_14,
  SUM(CASE WHEN period = 30 THEN users END) AS days_30
FROM (
  SELECT
    -- Key the count on the reporting day (dates.date). Keying on activity.date
    -- counts only the users active on that single day, so all three columns
    -- would equal the daily active users.
    dates.date AS date,
    periods.period AS period,
    COUNT(DISTINCT user) AS users
  FROM (
    -- One row per (calendar day, app instance).
    SELECT DISTINCT
      DATE(TIMESTAMP_MICROS(event.timestamp_micros)) AS date,
      user_dim.app_info.app_instance_id AS user
    FROM `firebase-analytics-sample-data.android_dataset.app_events_20160607`
    CROSS JOIN UNNEST(event_dim) AS event
  ) AS activity
  CROSS JOIN (
    -- One row per calendar day that has at least one event.
    SELECT DATE(TIMESTAMP_MICROS(event.timestamp_micros)) AS date
    FROM `firebase-analytics-sample-data.android_dataset.app_events_20160607`
    CROSS JOIN UNNEST(event_dim) AS event
    GROUP BY 1
  ) AS dates
  CROSS JOIN (
    SELECT period
    FROM (
      SELECT 7 AS period
      UNION ALL
      SELECT 14 AS period
      UNION ALL
      SELECT 30 AS period
    )
  ) AS periods
  WHERE
    dates.date >= activity.date
    -- With a non-negative day difference, FLOOR(diff / period) = 0 is the
    -- same condition as diff < period.
    AND DATE_DIFF(dates.date, activity.date, DAY) < periods.period
  GROUP BY dates.date, periods.period
)
GROUP BY date
ORDER BY date DESC