COUNT / MAX (COUNT) not working in BigQuery - google-bigquery

I'm not very experienced with SQL, but on my own I've been able to run this query:
SELECT
  event_name,
  COUNT(event_name) AS count,
  COUNT(event_name) / SUM(COUNT(event_name)) OVER () * 100 AS event_percent
FROM `table_1`
WHERE
  event_name IN ('session_start', 'view_item', 'select_item', 'add_to_cart', 'remove_from_cart', 'begin_checkout', 'purchase')
GROUP BY
  event_name
ORDER BY
  count DESC
What I'd like to achieve is the percentage of each COUNT divided by the MAX COUNT. Example: purchase / session_start (22 / 1258).
I've tried a few things, but none of them worked. If anyone can help, I'd appreciate it.

I guess a CTE would work (note the ORDER BY belongs in the outer query; inside the CTE the column is aliased cnt, not count):
WITH prep AS (
  SELECT
    event_name,
    COUNT(event_name) AS cnt,
    COUNT(event_name) / SUM(COUNT(event_name)) OVER () * 100 AS event_percent
  FROM `table_1`
  WHERE
    event_name IN ('session_start', 'view_item', 'select_item', 'add_to_cart', 'remove_from_cart', 'begin_checkout', 'purchase')
  GROUP BY
    event_name
)
SELECT
  *,
  cnt * 100 / MAX(cnt) OVER () AS pct_of_max
FROM
  prep
ORDER BY
  cnt DESC
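
The CTE isn't strictly required, though. BigQuery lets an analytic function take an aggregate expression as its argument (your query already does this with SUM(COUNT(event_name)) OVER ()), so a minimal single-statement sketch against the same table would be:

SELECT
  event_name,
  COUNT(event_name) AS cnt,
  -- percent of the largest group's count (session_start in your example)
  COUNT(event_name) * 100 / MAX(COUNT(event_name)) OVER () AS pct_of_max
FROM `table_1`
WHERE
  event_name IN ('session_start', 'view_item', 'select_item', 'add_to_cart', 'remove_from_cart', 'begin_checkout', 'purchase')
GROUP BY
  event_name
ORDER BY
  cnt DESC

Here MAX(COUNT(event_name)) OVER () evaluates the per-group counts first and then takes the maximum across all groups, so purchase would come out as 22 / 1258 * 100.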

Related

Calculating % of COUNT with groupby function in bigquery

Running into some issues figuring out how to add an extra column that will give me each group's percentage of the total row count. The query I have looks like this:
SELECT
  COUNT(*) AS num_rides,
  member_casual
FROM `2020_bikeshare_data`
GROUP BY member_casual
ORDER BY num_rides DESC
And returns me this result:
num_rides  member_casual
2134988    member
1341217    casual
And what I'd like to do is add a 3rd column that lists the percent of the total each membership makes up:
num_rides  member_casual  perc_tot
2134988    member         61.4%
1341217    casual         38.6%
thoughts?
You can use window functions:
SELECT member_casual,
       COUNT(*) AS num_rides,
       COUNT(*) * 1.0 / SUM(COUNT(*)) OVER () AS perc_tot
FROM `2020_bikeshare_data`
GROUP BY member_casual
ORDER BY num_rides DESC;
No subquery is needed.
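If you want the column formatted as in your expected output, a small variation of the same query (scaling to a percentage and rounding to one decimal) works:

SELECT member_casual,
       COUNT(*) AS num_rides,
       -- scale the share to a percentage and round to one decimal place
       ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 1) AS perc_tot
FROM `2020_bikeshare_data`
GROUP BY member_casual
ORDER BY num_rides DESC;

With the counts in your question (2134988 and 1341217), this returns 61.4 and 38.6.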
Consider below approach
select distinct member_casual,
count(num_rides) over type as num_rides,
round(count(num_rides) over type * 100.0 / count(num_rides) over(), 2) as perc_tot
from `2020_bikeshare_data`
window type as (partition by member_casual)
# order by num_rides desc
If applied to the sample data in your question, the output matches the expected result above.
The simplest way is to use a subquery as part of the column expression to calculate your percentage:
select
  count(1) as num_rides,
  member_casual,
  -- sum(100) adds 100 per row (i.e. 100 * count(*)), so dividing by the total row count returns a percentage
  sum(100) / (select sum(1.0) from `2020_bikeshare_data`) as perc_tot
from
  `2020_bikeshare_data`
group by
  member_casual
Using the subquery, get the total number of rows and calculate the percentage accordingly.
SELECT
  COUNT(*) AS num_rides,
  member_casual,
  -- COUNT(*) OVER () attaches the total row count to every row;
  -- MAX() is needed because totalRecord isn't in the GROUP BY
  CONCAT(CAST(ROUND(COUNT(*) * 100 / MAX(totalRecord), 1) AS STRING), ' %') AS perc_tot
FROM (SELECT *, COUNT(*) OVER () AS totalRecord FROM `2020_bikeshare_data`)
GROUP BY member_casual
or
SELECT
  COUNT(*) AS num_rides,
  member_casual,
  CONCAT(CAST(ROUND(COUNT(*) * 100 / (SELECT COUNT(*) FROM `2020_bikeshare_data`), 1) AS STRING), ' %') AS perc_tot
FROM `2020_bikeshare_data`
GROUP BY member_casual
In addition to the other answers, you can also break this down into simple SQL (without window functions) by organizing with CTEs.
with
data as (select * from `2020_bikeshare_data`),
total as (select count(*) as ride_count from data),
by_type as (select member_casual, count(*) as ride_count from data group by 1)
select
member_casual,
by_type.ride_count as num_rides,
by_type.ride_count / total.ride_count as perc_tot
from by_type
cross join total
In my opinion, this makes the perc_tot calculation much easier to see.

Why is my BigQuery retention not lining up with Firebase?

I found this article, which mentions how to get retention into a shape that's easier to manipulate, but I can't even get it to line up with basic retention. I have messed with the dates all I can, but I don't understand why I am still getting anywhere from 2-12% off from Firebase. If you have any thoughts, I'll take them!
Here is the query:
WITH analytics_data AS (
  SELECT
    user_pseudo_id,
    event_timestamp,
    event_name,
    app_info.version AS app_version, -- This is new!
    UNIX_MICROS(TIMESTAMP("2018-08-01 00:00:00", "-7:00")) AS start_day,
    3600*1000*1000*24*7 AS one_week_micros
  FROM `firebase-public-project.analytics_153293282.events_*`
  WHERE _table_suffix BETWEEN '20180731' AND '20180829'
)
SELECT
  week_0_cohort / week_0_cohort AS week_0_pct,
  week_1_cohort / week_0_cohort AS week_1_pct,
  week_2_cohort / week_0_cohort AS week_2_pct,
  week_3_cohort / week_0_cohort AS week_3_pct
FROM (
  WITH week_3_users AS (
    SELECT DISTINCT user_pseudo_id
    FROM analytics_data
    WHERE event_timestamp BETWEEN start_day+(3*one_week_micros) AND start_day+(4*one_week_micros)
  ),
  week_2_users AS (
    SELECT DISTINCT user_pseudo_id
    FROM analytics_data
    WHERE event_timestamp BETWEEN start_day+(2*one_week_micros) AND start_day+(3*one_week_micros)
  ),
  week_1_users AS (
    SELECT DISTINCT user_pseudo_id
    FROM analytics_data
    WHERE event_timestamp BETWEEN start_day+(1*one_week_micros) AND start_day+(2*one_week_micros)
  ),
  week_0_users AS (
    SELECT DISTINCT user_pseudo_id
    FROM analytics_data
    WHERE event_name = 'first_open'
      AND app_version = "2.62" -- This bit is new, too!
      AND event_timestamp BETWEEN start_day AND start_day+(1*one_week_micros)
  )
  SELECT
    (SELECT COUNT(*)
     FROM week_0_users) AS week_0_cohort,
    (SELECT COUNT(*)
     FROM week_1_users
     JOIN week_0_users USING (user_pseudo_id)) AS week_1_cohort,
    (SELECT COUNT(*)
     FROM week_2_users
     JOIN week_0_users USING (user_pseudo_id)) AS week_2_cohort,
    (SELECT COUNT(*)
     FROM week_3_users
     JOIN week_0_users USING (user_pseudo_id)) AS week_3_cohort
)
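
One thing worth checking (an assumption about a possible source of the drift, not a confirmed cause): BETWEEN is inclusive on both ends, so an event whose timestamp lands exactly on a week boundary is counted in two adjacent weeks, and fixed 7*24h microsecond windows won't necessarily match however Firebase buckets its days. A half-open window at least removes the double counting, e.g.:

-- sketch: half-open week windows so a boundary event falls into exactly one week
week_1_users AS (
  SELECT DISTINCT user_pseudo_id
  FROM analytics_data
  WHERE event_timestamp >= start_day + (1 * one_week_micros)
    AND event_timestamp < start_day + (2 * one_week_micros)
)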

BigQuery SQL: filter on event sequence

I want to count, for each app_id, how many times the event_type: store_app_view was followed by the event_type: store_app_download for the same user ("followed" meaning the event_time_utc of store_app_view is older than event_time_utc of store_app_download).
Sample data:
WITH
`project.dataset.dummy_data_init` AS (
  SELECT event_id FROM UNNEST(GENERATE_ARRAY(1, 10000)) event_id
),
`project.dataset.dummy_data_completed` AS (
  SELECT
    event_id,
    user_id[OFFSET(CAST(20 * RAND() - 0.5 AS INT64))] user_id,
    app_id[OFFSET(CAST(100 * RAND() - 0.5 AS INT64))] app_id,
    event_type[OFFSET(CAST(6 * RAND() - 0.5 AS INT64))] event_type,
    event_time_utc[OFFSET(CAST(26 * RAND() - 0.5 AS INT64))] event_time_utc
  FROM `project.dataset.dummy_data_init`,
    (SELECT GENERATE_ARRAY(1, 20) user_id),
    (SELECT GENERATE_ARRAY(1, 100) app_id),
    (SELECT ['store_app_view', 'store_app_view', 'store_app_download', 'store_app_install', 'store_app_update', 'store_fetch_manifest'] event_type),
    (SELECT GENERATE_TIMESTAMP_ARRAY('2020-01-01 00:00:00', '2020-01-26 00:00:00', INTERVAL 1 DAY) AS event_time_utc)
)
SELECT * FROM `project.dataset.dummy_data_completed`
Thanks!
I want to count, for each app_id, how many times the event_type: store_app_view was followed by the event_type: store_app_download.
Your provided query seems to have almost no connection to this question, so I'll ignore it.
For each user/app pair, you can get the rows matching your conditions using GROUP BY and HAVING (t below stands for your events table):
select user_id, app_id
from t
group by user_id, app_id
having min(case when event_type = 'store_app_view' then event_time_utc end) <
       max(case when event_type = 'store_app_download' then event_time_utc end);
To get the total for each app, use a subquery or CTE:
select app_id, count(*)
from (select user_id, app_id
      from t
      group by user_id, app_id
      having min(case when event_type = 'store_app_view' then event_time_utc end) <
             max(case when event_type = 'store_app_download' then event_time_utc end)
     ) ua
group by app_id;
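
Note that this counts each qualifying user/app pair once, even if the same user viewed and downloaded repeatedly. If "how many times" should instead count every download preceded by at least one view (my interpretation, using the columns from your sample data), here is a sketch with an analytic function:

select app_id, count(*) as downloads_after_view
from (
  select
    app_id,
    event_type,
    event_time_utc,
    -- earliest store_app_view per user/app pair; MIN ignores the NULLs
    min(if(event_type = 'store_app_view', event_time_utc, null))
      over (partition by user_id, app_id) as first_view_time
  from `project.dataset.dummy_data_completed`
)
where event_type = 'store_app_download'
  and event_time_utc > first_view_time
group by app_id;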

Query for both daily aggregate, and then monthly aggregates in the same query?

I would like to count the number of daily unique active users by subreddit and day, and then aggregate these counts into monthly unique active users by subreddit and month. Doing each one individually is simple enough, but when I try to do them in one combined query, it tells me that I need to group by date_month_day in my second-level subquery, which would make monthly_unique_authors the same as daily_unique_authors (Error: Expression 'date_month_day' is not present in the GROUP BY list [invalidQuery]).
Here is the query I have so far:
SELECT * FROM
(
  SELECT *,
    (daily_unique_authors/monthly_unique_authors) * 1.0 AS ratio,
    ROW_NUMBER() OVER (PARTITION BY date_month_day ORDER BY ratio DESC) rank
  FROM
  (
    SELECT subreddit,
      date_month_day,
      daily_unique_authors,
      SUM(daily_unique_authors) AS monthly_unique_authors,
      LEFT(date_month_day, 7) AS date_month
    FROM
    (
      SELECT subreddit,
        LEFT(DATE(SEC_TO_TIMESTAMP(created_utc)), 10) AS date_month_day,
        COUNT(UNIQUE(author)) AS daily_unique_authors
      FROM TABLE_QUERY([fh-bigquery:reddit_comments], "table_id CONTAINS \'20\' AND LENGTH(table_id)<8")
      GROUP EACH BY subreddit, date_month_day
    )
    GROUP EACH BY subreddit, date_month
  )
)
WHERE rank <= 100
ORDER BY date_month ASC
The final output should ideally be something like:
   subreddit  date_month  date_month_day  daily_unique_users  monthly_unique_users  ratio
1  google     2005-12     2005-12-29      77                  600                   0.128
2  google     2005-12     2005-12-31      52                  600                   0.087
3  google     2005-12     2005-12-28      81                  600                   0.135
4  google     2005-12     2005-12-27      73                  600                   0.121
Below is for BigQuery Standard SQL
#standardSQL
SELECT * FROM (
SELECT *,
ROW_NUMBER() OVER(PARTITION BY date_month_day ORDER BY ratio DESC) rank
FROM (
SELECT
daily.subreddit subreddit,
daily.date_month date_month,
date_month_day,
daily_unique_authors,
monthly_unique_authors,
1.0 * daily_unique_authors / monthly_unique_authors AS ratio
FROM (
SELECT subreddit,
DATE(TIMESTAMP_SECONDS(created_utc)) AS date_month_day,
FORMAT_DATE('%Y-%m', DATE(TIMESTAMP_SECONDS(created_utc))) AS date_month,
COUNT(DISTINCT author) AS daily_unique_authors
FROM `fh-bigquery.reddit_comments.2018*`
GROUP BY subreddit, date_month_day, date_month
) daily
JOIN (
SELECT subreddit,
FORMAT_DATE('%Y-%m', DATE(TIMESTAMP_SECONDS(created_utc))) AS date_month,
COUNT(DISTINCT author) AS monthly_unique_authors
FROM `fh-bigquery.reddit_comments.2018*`
GROUP BY subreddit, date_month
) monthly
ON daily.subreddit = monthly.subreddit
AND daily.date_month = monthly.date_month
)
)
WHERE rank <= 100
ORDER BY date_month
Note: I tried to preserve the original logic and structure as much as possible as it is in the question, so the OP will be able to correlate the answer with the question and make further adjustments if needed :o)
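
One structural point worth spelling out: monthly_unique_authors cannot be derived by summing the daily counts (as the original query attempts with SUM(daily_unique_authors)), because an author who comments on several days of a month would be counted once per day. Distinct counts don't add up across periods, which is why the monthly COUNT(DISTINCT author) gets its own subquery and is joined back on subreddit and date_month.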

NTILE() in BigQuery for non-uniform buckets

I'm trying to perform RFM segmentation on the Google Merchandise Store sample dataset on BigQuery. In my SQL query, NTILE(5) divides the rows into 5 buckets based on row ordering and returns the bucket number that is assigned to each row, so each bucket is of equal size. I'd like to find out how to create buckets of different sizes instead. For example, bucket 1 contains the bottom 10%, bucket 2 contains the next 20% of records, etc. Thank you!
#standardSQL
SELECT
fullVisitorId,
NTILE(5) OVER (ORDER BY last_order_date) AS rfm_recency,
NTILE(5) OVER (ORDER BY count_order) AS rfm_frequency,
NTILE(5) OVER (ORDER BY avg_amount) AS rfm_monetary
FROM (
SELECT
fullVisitorId,
MAX(date) AS last_order_date,
COUNT(*) AS count_order,
AVG(totals.totalTransactionRevenue)/1000000 AS avg_amount
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_20170*`
WHERE
_table_suffix BETWEEN "101"
AND "801"
AND totals.totalTransactionRevenue IS NOT NULL
GROUP BY
fullVisitorId )
You can use row_number() and count(*) to define your own buckets:
SELECT fullVisitorId,
(CASE WHEN seqnum_r <= 0.1 * cnt THEN 1
WHEN seqnum_r <= 0.3 * cnt THEN 2
ELSE 3
END) as bin_r,
. . .
FROM (SELECT fullVisitorId,
MAX(date) AS last_order_date,
COUNT(*) AS count_order,
(AVG(totals.totalTransactionRevenue) / 1000000) AS avg_amount,
COUNT(*) OVER () as cnt,
ROW_NUMBER() OVER (ORDER BY MAX(date)) as seqnum_r,
ROW_NUMBER() OVER (ORDER BY COUNT(*)) as seqnum_f,
ROW_NUMBER() OVER (ORDER BY AVG(totals.totalTransactionRevenue)) as seqnum_m
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20170*`
WHERE _table_suffix BETWEEN "101" AND "801" AND
totals.totalTransactionRevenue IS NOT NULL
GROUP BY fullVisitorId
) rfm
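The ". . ." stands for the remaining columns: bin_f and bin_m follow the same CASE pattern, comparing seqnum_f and seqnum_m (already computed in the subquery) against whatever fractions of cnt define your bucket boundaries.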
Below is for BigQuery Standard SQL and assumes your initial query works for you. The SQL UDF NON_UNIFORM_BUCKET() does the trick:
#standardSQL
CREATE TEMP FUNCTION NON_UNIFORM_BUCKET(i INT64) AS (
CASE
WHEN i = 1 THEN 1
WHEN i IN (2, 3) THEN 2
WHEN i IN (4, 5, 6) THEN 3
WHEN i = 7 THEN 4
ELSE 5
END
);
SELECT
fullVisitorId,
NON_UNIFORM_BUCKET(NTILE(10) OVER (ORDER BY last_order_date)) AS rfm_recency,
NON_UNIFORM_BUCKET(NTILE(10) OVER (ORDER BY count_order)) AS rfm_frequency,
NON_UNIFORM_BUCKET(NTILE(10) OVER (ORDER BY avg_amount)) AS rfm_monetary
FROM (
SELECT
fullVisitorId,
MAX(date) AS last_order_date,
COUNT(*) AS count_order,
AVG(totals.totalTransactionRevenue)/1000000 AS avg_amount
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_20170*`
WHERE
_table_suffix BETWEEN "101"
AND "801"
AND totals.totalTransactionRevenue IS NOT NULL
GROUP BY
fullVisitorId )
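
Since NTILE(10) puts 10% of rows in each decile, the CASE arms in NON_UNIFORM_BUCKET() yield buckets of 10% (decile 1), 20% (deciles 2-3), 30% (deciles 4-6), 10% (decile 7), and 30% (deciles 8-10). Adjust which deciles map to which bucket to get the split you need; the 10% / 20% example from the question comes from mapping decile 1 alone to bucket 1 and deciles 2-3 to bucket 2.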