Why is my BigQuery retention not lining up with Firebase? - sql

I found this article which it mentions how to get retention so it is easier to manipulate, but I can't even get it to line up with just basic retention. I have messed with the dates all I can, but I don't understand why I am still getting anywhere from 2-12% off from Firebase. If you have any thoughts I will take it!
Here is the function:
WITH analytics_data AS (
SELECT user_pseudo_id, event_timestamp, event_name,
app_info.version AS app_version, -- This is new!
UNIX_MICROS(TIMESTAMP("2018-08-01 00:00:00", "-7:00")) AS start_day,
3600*1000*1000*24*7 AS one_week_micros
FROM `firebase-public-project.analytics_153293282.events_*`
WHERE _table_suffix BETWEEN '20180731' AND '20180829'
)
SELECT week_0_cohort / week_0_cohort AS week_0_pct,
week_1_cohort / week_0_cohort AS week_1_pct,
week_2_cohort / week_0_cohort AS week_2_pct,
week_3_cohort / week_0_cohort AS week_3_pct
FROM (
WITH week_3_users AS (
SELECT DISTINCT user_pseudo_id
FROM analytics_data
WHERE event_timestamp BETWEEN start_day+(3*one_week_micros) AND start_day+(4*one_week_micros)
),
week_2_users AS (
SELECT DISTINCT user_pseudo_id
FROM analytics_data
WHERE event_timestamp BETWEEN start_day+(2*one_week_micros) AND start_day+(3*one_week_micros)
),
week_1_users AS (
SELECT DISTINCT user_pseudo_id
FROM analytics_data
WHERE event_timestamp BETWEEN start_day+(1*one_week_micros) AND start_day+(2*one_week_micros)
),
week_0_users AS (
SELECT DISTINCT user_pseudo_id
FROM analytics_data
WHERE event_name = 'first_open'
AND app_version = "2.62" -- This bit is new, too!
AND event_timestamp BETWEEN start_day AND start_day+(1*one_week_micros)
)
SELECT
(SELECT count(*)
FROM week_0_users) AS week_0_cohort,
(SELECT count(*)
FROM week_1_users
JOIN week_0_users USING (user_pseudo_id)) AS week_1_cohort,
(SELECT count(*)
FROM week_2_users
JOIN week_0_users USING (user_pseudo_id)) AS week_2_cohort,
(SELECT count(*)
FROM week_3_users
JOIN week_0_users USING (user_pseudo_id)) AS week_3_cohort
)

Related

Double counting problem in Rolling weekly / monthly active endpoints

Here is my current code to calculate DAE,WAE,MAE:
select event_timestamp as day, Section, users as DAE, SUM(users)
OVER (PARTITION BY Section
ORDER BY event_timestamp
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) as WAE,
SUM(users)
OVER (PARTITION BY Section
ORDER BY event_timestamp
ROWS BETWEEN 29 PRECEDING AND CURRENT ROW) as MAE
from (
select count(distinct user_pseudo_id) as users, Section, event_timestamp
from
(select distinct *
from
(
select *,
CASE
WHEN param_value = "Names" or param_value = "SingleName" THEN 'Names'
ELSE param_value
END AS Section
from(
select user_pseudo_id, DATE_TRUNC(EXTRACT(DATE from TIMESTAMP_MICROS(event_timestamp)), DAY) as event_timestamp, event_name, params.value.string_value as param_value
from `rayn-deen-app.analytics_317927526.events_*`, unnest(event_params) as params
where (event_name = 'screen_view' and params.key = 'firebase_screen' and (
# Promises
params.value.string_value = "Promises"
# Favourites
or params.value.string_value = "Favourites"
))
group by user_pseudo_id, event_timestamp, event_name, param_value
order by event_timestamp, user_pseudo_id) raw
) base
order by event_timestamp) as events_table
group by Section, event_timestamp
)
The problem is that for WAE,MAE there is repeat counts of the same users happening. So for example user A was a "daily active user" for 4 days that week. Then in the WAE count, it will consider that as 4 users instead of one. So there is a problem of repeat counts which I need to remove somehow.

I want sql / Teradata query for below R-code Any help appreciated

I want write sql/ Teradata query for below R-code. I attached R code for your refrence.
SELECT `ID`, `Date`, `Agent`, `group`
FROM (
SELECT
*,
RANK() OVER (PARTITION BY `ID`, `group`, `Agent` ORDER BY `Date`) AS `q01`
FROM (
SELECT
*,
CEIL(CAST(difftime(`Date`, MIN(`Date`) OVER (PARTITION BY `ID`), 'week' AS `units`) / 4.0 AS INTEGER)) AS `group`
FROM (
SELECT `ID`, CAST(`Date` AS DATE) AS `Date`, `Agent`
FROM `dbplyr_003`
)
)
)
WHERE (`q01` <= 1)
Thanks

COUNT / MAX (COUNT) not working in BigQuery

I'm not much used to SQL, but on my own I've been able to run this code:
SELECT
event_name,
COUNT(event_name) AS count,
COUNT(event_name) / SUM(COUNT(event_name)) OVER () * 100 AS event_percent
FROM `table_1`
WHERE
event_name IN ('session_start', 'view_item', 'select_item', 'add_to_cart', 'remove_from_cart', 'begin_checkout', 'purchase' )
GROUP BY
event_name
ORDER BY
count DESC
enter image description here
What I'd like to achive is the percentatge of each COUNT divided by the MAX COUNT. Example: purchase / session_start (22 / 1258)
If anyone can help.. I've tried some things but none worked
I guess a CTE would work
WITH prep AS (
SELECT
event_name,
COUNT(event_name) AS cnt,
COUNT(event_name) / SUM(COUNT(event_name)) OVER () * 100 AS event_percent
FROM `table_1`
WHERE
event_name IN ('session_start', 'view_item', 'select_item', 'add_to_cart', 'remove_from_cart', 'begin_checkout', 'purchase' )
GROUP BY
event_name
ORDER BY
count DESC
)
SELECT
*,
cnt / max(cnt) over()
FROM
prep

BigQuery SQL: filter on event sequence

I want to count, for each app_id, how many times the event_type: store_app_view was followed by the event_type: store_app_download for the same user ("followed" meaning the event_time_utc of store_app_view is older than event_time_utc of store_app_download).
Sample data:
WITH
`project.dataset.dummy_data_init` AS (SELECT event_id FROM UNNEST(GENERATE_ARRAY(1, 10000)) event_id),
`project.dataset.dummy_data_completed` AS (SELECT event_id,
user_id[OFFSET(CAST(20 * RAND() - 0.5 AS INT64))] user_id,
app_id[OFFSET(CAST(100 * RAND() - 0.5 AS INT64))] app_id,
event_type[OFFSET(CAST(6 * RAND() - 0.5 AS INT64))] event_type,
event_time_utc[OFFSET(CAST(26 * RAND() - 0.5 AS INT64))] event_time_utc
FROM `project.dataset.dummy_data_init`,
(SELECT GENERATE_ARRAY(1, 20) user_id),
(SELECT GENERATE_ARRAY(1, 100) app_id),
(SELECT ['store_app_view', 'store_app_view', 'store_app_download','store_app_install','store_app_update','store_fetch_manifest'] event_type),
(SELECT GENERATE_TIMESTAMP_ARRAY('2020-01-01 00:00:00', '2020-01-26 00:00:00',
INTERVAL 1 DAY) AS event_time_utc))
Select * FROM `project.dataset.dummy_data_completed`
Thanks!
I want to count, for each app_id, how many times the event_type: store_app_view was followed by the event_type: store_app_download.
Your provided query seems to have almost no connection to this question, so I'll ignore it.
For each user/app pair, you can get the rows that matching your conditions using GROUP BY:
select user_id, app_id
from t
group by user_id, app_id
having min(case when event_type = 'store_app_view' then event_time end) <
max(case when event_type = 'store_app_download' then event_time end);
To get the total for each app, use a subquery or CTE:
select app_id, count(*)
from (select user_id, app_id
from t
group by user_id, app_id
having min(case when event_type = 'store_app_view' then event_time end) <
max(case when event_type = 'store_app_download' then event_time end)
) ua
group by app_id;

Google big query - Firebase analytics - Closed funnel for screen views (parameters)

I would to like get a closed funnel for my X screen views, which are parameters of event screen_view
I have found this very good tutorial - https://medium.com/firebase-developers/how-do-i-create-a-closed-funnel-in-google-analytics-for-firebase-using-bigquery-6eb2645917e1 but it is only for a closed funnel with events.
I would like to get this:
event_name event_param count_users
screen_view screen_name_1 100
screen_view screen_name_2 50
screen_view screen_name_3 20
screen_view screen_name_4 5
What I have tried is to change the provided code in the tutorial to event params, but I got to the point where I have no idea what to do next.
SELECT *,
IF (value.string_value = "screen_name1", user_pseudo_id, NULL) as funnel_1,
IF (value.string_value = "screen_name1" AND next_event = "screen_name2", user_pseudo_id, NULL) AS funnel_2
FROM (
SELECT p.value.string_value, user_pseudo_id , event_timestamp,
LEAD(p.value.string_value, 1) OVER (PARTITION BY user_pseudo_id ORDER BY event_timestamp) AS next_event
FROM `ProjectName.analytics_XX.events_20190119` as t1, UNNEST(event_params) as p
WHERE (p.value.string_value = "screen_name1" OR p.value.string_value = "screen_name2")
ORDER BY 2,3
LIMIT 100
)
Thanks for any help!
I have found the solution:
SELECT COUNT(DISTINCT funnel_1) as f1_users, COUNT(DISTINCT funnel_2) as f2_users FROM (
SELECT *,
IF (param.value.string_value = "screen_name1", user_pseudo_id, NULL) AS funnel_1,
IF (param.value.string_value = "screen_name1" AND next_screen = "screen_name2", user_pseudo_id, NULL) AS funnel_2
FROM (
SELECT TIMESTAMP_MICROS(event_timestamp), param, user_pseudo_id,
LEAD(param.value.string_value, 1) OVER (PARTITION BY user_pseudo_id ORDER BY event_timestamp) as next_screen
FROM `ProjectName.analytics_XX.events_20190119`, unnest(event_params) as param
WHERE
event_name = "screen_view" and
param.value.string_value IN ("screen_name1", "screen_name2")
AND _TABLE_SUFFIX BETWEEN '20190205' AND '20190312'
)
)