Google BigQuery: Get New Visitor Count using Custom Dimension

select PARSE_DATE('%Y%m%d', t.date) as Date
,count(distinct(fullvisitorid)) as User
,SUM( totals.newVisits ) AS New_Visitors
,(if(customDimensions.index=1, customDimensions.value,null)) as Orig
FROM `table` as t
CROSS JOIN UNNEST(hits) AS hit
CROSS JOIN UNNEST(hit.customDimensions ) AS customDimensions
group by Date, orig
Is there a way to get the new visitor count and use the customDimension at the same time? The SUM(totals.newVisits) doesn't work.
Thanks

Below is for BigQuery Standard SQL
SELECT DATE
  ,COUNT(DISTINCT fullvisitorid) AS User
  ,SUM(newVisits) AS New_Visitors
  ,Orig
FROM (
  SELECT PARSE_DATE('%Y%m%d', t.date) AS DATE
    ,fullvisitorid
    ,totals.newVisits AS newVisits
    ,IF(customDimensions.index=1, customDimensions.value, NULL) AS Orig
  FROM `table` AS t
  CROSS JOIN UNNEST(hits) AS hit
  CROSS JOIN UNNEST(hit.customDimensions) AS customDimensions
  GROUP BY DATE, Orig, fullvisitorid, newVisits
)
GROUP BY DATE, Orig

The best way in your case is to remove the cross-joins and use sub-selects instead:
SELECT
PARSE_DATE('%Y%m%d', t.date) AS Date
,(SELECT value FROM UNNEST(customDimensions) WHERE index=1) Orig
,COUNT(DISTINCT(fullvisitorid)) AS User
,SUM( totals.newVisits ) AS New_Visitors
FROM
`table` t
GROUP BY Orig, Date
In case you have a dimension on hit scope and really need to flatten the table, you need to build a session ID you can count distinctly. That is because the cross join repeats all session-scoped fields at hit scope:
SELECT
PARSE_DATE('%Y%m%d', t.date) AS Date
,(SELECT value FROM h.customDimensions WHERE index=2) justAHitCd
,h.page.pagePathLevel1
,COUNT(DISTINCT(fullvisitorid)) AS User
-- create session id and count distinct
,COUNT(DISTINCT CONCAT(fullvisitorid, CAST(visitstarttime AS STRING)) ) AS all_sessions
-- only count distinct session id of sessions where totals.newVisits = 1
,COUNT(DISTINCT
IF(totals.newVisits=1,
CONCAT(fullvisitorid, CAST(visitstarttime AS STRING)),
NULL )
) AS New_Visitors
FROM
-- flatten table to hit scope (the comma means cross join in standard SQL)
`table` t, t.hits h
GROUP BY 1,2,3
So for new visitors I only provide a session ID if totals.newVisits = 1; otherwise the IF statement returns NULL, which COUNT(DISTINCT ...) simply ignores.
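A quick self-contained illustration of that NULL-gating trick (made-up ids and flags, nothing GA-specific):
-- COUNT(DISTINCT ...) skips NULLs, so gating the ID with IF() counts only the qualifying rows
SELECT
  COUNT(DISTINCT id) AS all_ids                           -- 3
  ,COUNT(DISTINCT IF(flag = 1, id, NULL)) AS flagged_ids  -- 2
FROM UNNEST([
  STRUCT('a' AS id, 1 AS flag),
  STRUCT('b', 1),
  STRUCT('c', 0)
])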
If you have something similar on product-scope, you'd need to create an ID that takes into account session and hit.
E.g. counting pages for productSku:
SELECT
PARSE_DATE('%Y%m%d', t.date) AS Date
,(SELECT value FROM h.customDimensions WHERE index=2) justAHitCd
,p.productSku
,COUNT(DISTINCT fullvisitorid) AS users
,COUNT(DISTINCT CONCAT(fullvisitorid, CAST(visitstarttime AS STRING))) AS sessions
,COUNT(DISTINCT
IF(h.type='PAGE',
CONCAT(fullvisitorid, cast(visitstarttime AS STRING),CAST(hitNumber AS STRING)),
NULL)
) as pageviews
,COUNT(1) AS products
FROM
`table` t, t.hits h LEFT JOIN h.product p
GROUP BY 1,2,3
Note that I'm left joining the product array. Since it is sometimes empty, a cross join would destroy all hit information: a cross join with an empty array results in no rows at all.
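Here is a minimal, self-contained sketch of that behavior (toy arrays, not GA data): the comma join loses the row whose array is empty, while the left join keeps it with a NULL value.
WITH t AS (
  SELECT 'hit_with_products' AS id, [1, 2] AS product UNION ALL
  SELECT 'hit_without_products', ARRAY<INT64>[]
)
SELECT 'comma_join' AS variant, id, p FROM t, UNNEST(t.product) AS p
UNION ALL
SELECT 'left_join', id, p FROM t LEFT JOIN UNNEST(t.product) AS p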
Hope that helps!

Related

How to calculate weekly retention in BigQuery using SQL

I have the following table with the week number and the retention rate.
|creation_week |num_engaged_users |num_users_in_cohort |retention_rate|
|:------------:|:-----------------:|:------------------:|:------------:|
|37| 3114 |4604 |67.637|
|38| 1860 |4604 |40.4|
|39| 1233 |4604 |26.781|
|40| 668 |4604 |14.509|
|41| 450 |4604 |9.774|
|42| 463| 4604|10.056|
What I need is to make it look something like this
|week |week0 |week1 |week2|week3|week4|week5|week6|
|:---:|:----:|:----:|:---:|:---:|:---:|:---:|:---:|
|week37|100|ret.rate|ret.rate|ret.rate|ret.rate|ret.rate|ret.rate|
|week38|100|ret.rate|ret.rate|ret.rate|ret.rate|ret.rate|
|week39|100|ret.rate|ret.rate|ret.rate|ret.rate|
|week40|100|ret.rate|ret.rate|ret.rate|
|week41|100|ret.rate|ret.rate|
|week42|100|ret.rate|
How can I do that using BigQuery SQL?
For some reason Stack Overflow doesn't allow posting this question unless all the tables are marked as code...
I will provide the SQL code I used in the first answer, because it doesn't let me post it either.
WITH
new_user_cohort AS (
WITH
#table with cookie and user_ids for the later matching
table_1 AS (
SELECT
DISTINCT props.value.string_value AS cookie_id,
user_id
FROM
`stockduel.analytics.events`,
UNNEST(event_properties) AS props
WHERE
props.key = 'cookie_id'
AND user_id>0),
#second table which gives access to the sample with the users who performed the event
table_2 AS (
SELECT
DISTINCT props.value.string_value AS cookie_id,
EXTRACT(WEEK
FROM
creation_date) AS first_week
FROM
`stockduel.analytics.events`,
UNNEST(event_properties) AS props
WHERE
props.key = 'cookie_id'
AND event_type = 'launch_first_time'
#set the date from when starting cohort analysis
AND EXTRACT(WEEK
FROM
creation_date) = EXTRACT(WEEK
FROM
DATE '2021-09-15'))
#join user id with cookie_id and group the elements to remove the duplicates
SELECT
user_id,
first_week
FROM
table_2
JOIN
table_1
ON
table_1.cookie_id = table_2.cookie_id
#group the results to avoid duplicates
GROUP BY
user_id,
first_week ),
num_new_users AS (
SELECT
COUNT(*) AS num_users_in_cohort,
first_week
FROM
new_user_cohort
GROUP BY
first_week ),
engaged_users_by_day AS (
SELECT
COUNT(DISTINCT `stockduel.analytics.ws_raw_sessions_v2`.user_id) AS num_engaged_users,
EXTRACT(WEEK
FROM
started_at) AS creation_week,
FROM
`stockduel.analytics.ws_raw_sessions_v2`
JOIN
new_user_cohort
ON
new_user_cohort.user_id = `stockduel.analytics.ws_raw_sessions_v2`.user_id
WHERE
EXTRACT(WEEK
FROM
started_at) BETWEEN EXTRACT(WEEK
FROM
DATE '2021-09-15')
AND EXTRACT(WEEK
FROM
DATE '2021-10-22')
GROUP BY
creation_week )
SELECT
creation_week,
num_engaged_users,
num_users_in_cohort,
ROUND((100*(num_engaged_users / num_users_in_cohort)), 3) AS retention_rate
FROM
engaged_users_by_day
CROSS JOIN
num_new_users
ORDER BY
creation_week

How to calculate running sums with append-only rows

I have a table where rows are never mutated but only inserted; they are immutable records. It has the following fields:
id: int
user_id: int
created: datetime
is_cool: boolean
likes_fruits: boolean
An object is tied to a user, and the "current" object for a given user is the one that has the latest created date. E.g. if I want to update is_cool for a user, I'd append a record with a new created timestamp and is_cool=true.
I want to calculate how many users are is_cool at the end of each day. I.e. I'd like the output table to have the columns:
day: some kind of date_trunc('day', created)
cool_users_count: number of users that have is_cool at the end of this day.
What SQL query can I write that does this? FWIW I'm using Presto (or Redshift if need be).
Note that there are other columns, e.g. likes_fruits, so a record where is_cool is false does not mean is_cool was just changed to false - it could have been false for a while.
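For concreteness, a minimal sketch of the table as described, with hypothetical values (Presto syntax):
-- user 7's latest record on 2021-01-02 has is_cool = true and user 9's has is_cool = false,
-- so cool_users_count for that day would be 1
SELECT * FROM (
  VALUES
    (1, 7, TIMESTAMP '2021-01-01 10:00:00', false, true),
    (2, 7, TIMESTAMP '2021-01-02 09:00:00', true,  true),
    (3, 9, TIMESTAMP '2021-01-02 12:00:00', false, false)
) AS t (id, user_id, created, is_cool, likes_fruits)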
This is what procedural pseudo-code would look like to represent what I'd want to do in SQL:
# rows = ...
min_date = min(row.created for row in rows)
max_date = max(row.created for row in rows)
counts_by_day = {}
for date in range(min_date, max_date):  # one iteration per day
    rows_up_until_date = [row for row in rows if row.created <= date]
    latest_row_by_user = {}
    for row in sorted(rows_up_until_date, key=lambda r: r.created):
        latest_row_by_user[row.user_id] = row  # later records overwrite earlier ones
    counts_by_day[date] = sum(1 for row in latest_row_by_user.values() if row.is_cool)
You can do this with just a query. Try summing the boolean (via a CASE expression, since Presto and Redshift won't SUM a boolean directly) and grouping by day:
select date(created), sum(case when is_cool then 1 else 0 end)
from my_table
group by date(created)
or if you need the number of users
select t.date_created, count(*) num_user
from (
select distinct date(created) date_created, user_id
from my_table
where is_cool = TRUE
) t
group by t.date_created
or if you need the last value of is_cool
select date(max_date), count(*)
from (
  select t.user_id, t.max_date, m.is_cool
  from my_table m
  inner join (
    select max(created) as max_date, user_id
    from my_table
    group by user_id, date(created)
  ) t on t.max_date = m.created
     and t.user_id = m.user_id
  where m.is_cool = TRUE
) t2
group by date(max_date)
A correlated subquery might be the simplest solution. The following gets the value of is_cool for each user on each date:
select u.user_id, d.date,
(select t.is_cool
from t
where t.user_id = u.user_id and
t.created < dateadd(day, 1, d.date)
order by t.created desc
limit 1
) as is_cool
from (select distinct date(created) as date
from t
) d cross join
(select distinct user_id
from t
) u ;
Then aggregate:
select date, sum(case when is_cool then 1 else 0 end)
from (select u.user_id, d.date,
(select t.is_cool
from t
where t.user_id = u.user_id and
t.created < dateadd(day, 1, d.date)
order by t.created desc
limit 1
) as is_cool
from (select distinct date(created) as date
from t
) d cross join
(select distinct user_id
from t
) u
) ud
group by date;

How can I find the previous page with BigQuery

I want to find out the previous page where the current page is a product page.
For example, I have this page 'https://www.emag.ro/telefon-mobil-apple-iphone-x-64gb-4g-space-grey-mqac2rm-a/pd/DN094NBBM' and my previous page is this page 'https://www.emag.ro/search/telefoane-mobile/IPHONE/c?ref=srcql'.
In terms of hitnumber, how can I return how many users had this behavior?
I tried with these 2 queries and I want to do a JOIN, but I don't know which way is better.
Also, I tried the LAG function, but I don't know for sure whether it catches all the users.
Thank you in advance.
with
view_product as (
SELECT
ga.fullVisitorId AS GA_USER_ID,
date as date,
h.hitnumber as hitnumber,
CONCAT(ga.fullVisitorId, cast(ga.visitId AS string)) AS SessionID,
(SELECT VALUE FROM h.customDimensions WHERE INDEX = 10) AS PAGETYPE,
(SELECT VALUE FROM h.customDimensions WHERE index =8) as ref_parameter,
visitid as visitid,
h.page.pagePath as page_path
FROM
`emagbigquery.0` ga,
UNNEST(hits) AS h
WHERE h.type='PAGE'
AND _TABLE_SUFFIX = '20190115'
AND (SELECT VALUE FROM h.customDimensions WHERE INDEX = 10) = 'viewproduct'
)
,
SEARCH_page_WITH_REF_SRCQL as (
select
date as date,
ga.fullVisitorId AS GA_USER_ID,
h.hitnumber as hitnumber,
CONCAT(ga.fullVisitorId, cast(ga.visitId AS string)) AS SessionID,
(SELECT VALUE FROM h.customDimensions WHERE INDEX = 10) AS PAGETYPE,
(SELECT VALUE FROM h.customDimensions WHERE index =8) as ref_parameter,
visitid as visitid,
h.page.pagePath as page_path
FROM
`emagbigquery.0` ga,
UNNEST(hits) AS h
WHERE h.type='PAGE'
AND _TABLE_SUFFIX = '20190115'
AND (SELECT VALUE FROM h.customDimensions WHERE INDEX = 10) = 'search'
AND (SELECT VALUE FROM h.customDimensions WHERE index =8) LIKE 'srcql'
)
select
COUNT(DISTINCT GA_USER_ID) AS USERS,
COUNT(DISTINCT SessionID) AS SESSIONS,
previous_page_from_srcql
from (
select
t1.ga_user_id,
t1.sessionid,
t2.hitnumber > t1.hitnumber as previous_page_from_srcql
from SEARCH_page_WITH_REF_SRCQL as t1
inner join view_product as t2
on t1.ga_user_id = t2.ga_user_id
)
group by
previous_page_from_srcql
Try UNNEST WITH OFFSET. It can give you an easy way to later determine that one row came after the other:
WITH path_and_prev AS (
SELECT ARRAY(
SELECT AS STRUCT session.page.pagePath
, LAG(session.page.pagePath) OVER(ORDER BY i) prevPagePath
FROM UNNEST(hits) session WITH OFFSET i
) x
FROM `google.com:analytics-bigquery.LondonCycleHelmet.ga_sessions_20130910`
)
SELECT COUNT(*) c, pagePath, prevPagePath
FROM path_and_prev, UNNEST(x)
WHERE pagePath='/vests/yellow.html'
AND prevPagePath='/vests/'
GROUP BY 2,3
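To get at the "how many users had this behavior" part of the question, one hedged adaptation of the same pattern (same public sample table, counting distinct visitors and sessions instead of rows) could look like this:
WITH path_and_prev AS (
  SELECT fullVisitorId, visitId, ARRAY(
    SELECT AS STRUCT h.page.pagePath
      , LAG(h.page.pagePath) OVER(ORDER BY i) prevPagePath
    FROM UNNEST(hits) h WITH OFFSET i
  ) x
  FROM `google.com:analytics-bigquery.LondonCycleHelmet.ga_sessions_20130910`
)
SELECT
  COUNT(DISTINCT fullVisitorId) AS users
  ,COUNT(DISTINCT CONCAT(fullVisitorId, CAST(visitId AS STRING))) AS sessions
FROM path_and_prev, UNNEST(x)
WHERE pagePath='/vests/yellow.html'
AND prevPagePath='/vests/'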

Calculating last order date by UserID from GA data

I would like to calculate the last order date of an individual, by their UserID - my UserID is derived from a custom dimension from the automatically imported Google Analytics data.
I'm not sure how to go about this. I'm quite new to SQL; I think I might be looking for a window function, but I'm not entirely sure!
Here is my code so far, but this returns the most recent order data against ALL IDs:
SELECT * FROM
(SELECT MAX(date) AS lastorddate, customDimension.value AS UserID
FROM `PROJECTNAME.ga_sessions_20*` AS t
CROSS JOIN UNNEST(t.customdimensions) AS customDimension
WHERE customDimension.index = 2
AND totals.transactions > 0
GROUP BY Date, UserID)
GROUP BY UserID, lastorddate
ORDER BY lastorddate DESC
LIMIT 500
Below should work. The key difference is grouping by UserID only (not by date as well), so MAX(date) is taken across all of that user's sessions:
#standardSQL
SELECT MAX(date) AS lastorddate, customDimension.value AS UserID
FROM `PROJECTNAME.ga_sessions_20*` AS t
CROSS JOIN UNNEST(t.customdimensions) AS customDimension
WHERE customDimension.index = 2
AND totals.transactions > 0
GROUP BY UserID
ORDER BY lastorddate DESC
LIMIT 500

How to use multiple custom dimensions in Google BigQuery

Is there a way to use multiple custom dimensions in GBQ without using the MAX function? My problem with using the MAX function is that it only keeps the max pax_num, but I would like to have the count of visitors for every combination of (Date, product.v2ProductCategory, eCommerceAction.action_type, product.v2ProductName). Note that pax_num is the number of pax on that ticket. I need every combination of dest + pax_num, not dest + max(pax_num).
SELECT
Date
,count(distinct( concat(FULLVISITORID,cast(visitID as string)))) as visitor
, product.v2ProductCategory as product_category
,max(if(customDimensions.index=2, customDimensions.value,null)) as dest
,max((if(customDimensions.index=21, customDimensions.value,null)) ) as pax_num
,eCommerceAction.action_type as Action_type
,product.v2ProductName as product_name
FROM `table` as t
CROSS JOIN UNNEST(hits) AS hit
CROSS JOIN UNNEST(hit.customDimensions) AS customDimensions
CROSS JOIN UNNEST(hit.product) AS product
GROUP BY
Date
,product.v2ProductCategory
,eCommerceAction.action_type
,product.v2ProductName
Not sure if this is what you are looking for, but if you include the field pax_num in the group by you might already find what you need, like so:
select
date,
count(distinct( concat(FULLVISITORID,cast(visitID as string)))) as sessions,
product.v2ProductCategory category,
max(if(customDimensions.index=2, customDimensions.value, null)) as dest,
if(customDimensions.index=21, customDimensions.value,null) as pax_num,
eCommerceAction.action_type as act_type,
product.v2ProductName as product_name
from `table` as t,
unnest(hits) as hit,
unnest(hit.customDimensions) customDimensions,
unnest(hit.product) as product
group by
date,
category,
act_type,
pax_num,
product_name
having pax_num is not null
You gave as an example the pax_num values "paxnum_5" and "paxnum_6". If you include pax_num in the group by operation, the count aggregation happens at the level of pax_num, which preserves the values (rather than collapsing everything into the max value as before).
Also, notice that if you count the distinct combination of fullVisitorId and visitId you are actually computing the total number of sessions, not visitors (their definitions are not the same).
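A tiny self-contained illustration of that difference (made-up pax_num values only):
WITH t AS (
  SELECT 'paxnum_5' AS pax_num UNION ALL
  SELECT 'paxnum_6'
)
-- MAX() collapses everything into a single value per group...
SELECT 'max_collapses' AS variant, MAX(pax_num) AS pax_num, COUNT(*) AS rows_counted FROM t
UNION ALL
-- ...while grouping by the column keeps one output row per distinct value
SELECT 'group_by_preserves', pax_num, COUNT(*) FROM t GROUP BY pax_num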
Adding the fullVisitorID solved the problem:
SELECT
Date
,concat(fullVisitorID,cast(visitID as string)) as visitorID
,count(distinct( concat(FULLVISITORID,cast(visitID as string)))) as visitor
, product.v2ProductCategory as product_category
,max(if(customDimensions.index=2, customDimensions.value,null)) as dest
,max((if(customDimensions.index=21, customDimensions.value,null)) ) as pax_num
,eCommerceAction.action_type as Action_type
,product.v2ProductName as product_name
FROM `table` as t
CROSS JOIN UNNEST(hits) AS hit
CROSS JOIN UNNEST(hit.customDimensions) AS customDimensions
CROSS JOIN UNNEST(hit.product) AS product
GROUP BY
Date
,product.v2ProductCategory
,eCommerceAction.action_type
,product.v2ProductName
,visitorID