Get most recent record for each id - sql

I am trying to get a list of all users in a database. I also have a second table that contains only the users who are members.
The issue is that some of today's members could earlier have been customers, members, or neither, so the members table can contain several rows for the same user.
What I want is to pick only the most recent record for each user, based on the date column that is present in the data.
Here are the 2 tables output:
User table:
Users table
Members table:
Members table
I want to left join the two tables, keeping every distinct record from the users table and, from the members table, only the matching record that carries the most recent cd.value.
-- Visitors seen in the last 10 full days, left-joined to their
-- custom-dimension-6 values.
-- NOTE: MAX(date) is computed per (Clientid, CD_value, CD_index), so a
-- visitor with several distinct cd.value entries still produces one row
-- per value; this query does not reduce the result to the newest value.
WITH users AS (
    SELECT
        fullVisitorId AS Clientid
    FROM `records`
    WHERE _TABLE_SUFFIX
              BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 10 DAY))
                  AND FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY))
      AND totals.visits = 1
),
members AS (
    SELECT
        MAX(date)     AS date,
        fullVisitorId AS Clientid,
        cd.value      AS CD_value,
        cd.index      AS CD_index
    FROM `records`
    CROSS JOIN UNNEST(customDimensions) AS cd
    WHERE _TABLE_SUFFIX
              BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 10 DAY))
                  AND FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY))
      AND totals.visits = 1
      AND cd.index = 6
    GROUP BY
        Clientid,
        CD_value,
        CD_index
)
SELECT
    users.ClientId AS clientId,
    members.CD_value
FROM users
LEFT JOIN members
    ON users.ClientId = members.Clientid
-- Grouping on both output columns only collapses duplicate pairs.
GROUP BY
    members.CD_value,
    clientId
ORDER BY
    clientId ASC

Try using row_number() to rank the member rows by date (newest first) and keep only the top-ranked row:
-- Every visitor seen in the last 10 full days, with that visitor's most
-- recent custom-dimension-6 value (NULL when the visitor has none).
WITH users AS (
    -- One row per visitor; a visitor has one source row per session.
    SELECT DISTINCT
        fullVisitorId AS Clientid
    FROM `records`
    WHERE _TABLE_SUFFIX
              BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 10 DAY))
                  AND FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY))
      AND totals.visits = 1
),
members AS (
    SELECT
        date,
        fullVisitorId AS Clientid,
        cd.value      AS CD_value,
        cd.index      AS CD_index,
        -- Rank per visitor ONLY. Partitioning by the value as well would
        -- make every distinct value rank 1 and keep the duplicates.
        -- The partition uses the source column because a SELECT alias
        -- cannot be referenced inside OVER() of the same SELECT list.
        ROW_NUMBER() OVER (
            PARTITION BY fullVisitorId
            -- date is day-granular; visitStartTime breaks same-day ties.
            -- Assumes the standard GA export column visitStartTime exists
            -- in `records` -- TODO confirm.
            ORDER BY date DESC, visitStartTime DESC
        ) AS rn
    FROM `records`
    CROSS JOIN UNNEST(customDimensions) AS cd
    WHERE _TABLE_SUFFIX
              BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 10 DAY))
                  AND FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY))
      AND totals.visits = 1
      AND cd.index = 6
),
m2 AS (
    -- Newest row per visitor.
    SELECT
        Clientid,
        CD_value
    FROM members
    WHERE rn = 1
)
SELECT
    users.Clientid AS clientId,
    m2.CD_value
FROM users
LEFT JOIN m2
    ON users.Clientid = m2.Clientid
ORDER BY
    clientId ASC

Related

Split credit for transactions and Revenue between clicks on events (UA GA)

In this task, the idea is to assign the sales credit (transactions and revenues) equally to the events that were clicked on during the user's session. The output table would look like this, except that the revenue and transaction are split if the user had two events, three events, etc.
The three scenarios below (see "three scenarios") show how transactions and revenue should be shared between events. Does anyone have an idea how to adapt the code?
I have included code that assigns sales to events but does not yet divide the revenue and transaction credit between them; this is the code that needs to be modified.
three scenarios
output table
Grateful in advance for any help
-- Yesterday's homepage click events joined to the revenue and transaction
-- totals of the session they occurred in. Each event row receives the
-- session's full totals; nothing is divided between events here.
WITH event_home_page AS (
    SELECT
        q.* EXCEPT (isEntrance),
        -- Expose the flag as the string 'true' / 'false' (NULL -> 'false').
        CASE WHEN isEntrance = true THEN 'true' ELSE 'false' END AS isEntrance
    FROM (
        SELECT
            PARSE_DATE('%Y%m%d', CAST(date AS STRING)) AS true_date,
            hits.isEntrance,
            hits.eventInfo.eventCategory,
            hits.eventInfo.eventAction,
            hits.eventInfo.eventLabel,
            -- Session key: visitor id concatenated with session start time.
            CONCAT(fullvisitorid, CAST(visitstarttime AS STRING)) AS ID,
            COUNT(*) AS click
        FROM `ga360.123456.ga_sessions_*`
        CROSS JOIN UNNEST(hits) AS hits
        WHERE _table_suffix = FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY))
          AND hits.page.pagePath IN ('www.example.com/')
          AND REGEXP_CONTAINS(hits.eventInfo.eventCategory, '^clickable_element.*')
        GROUP BY 1, 2, 3, 4, 5, 6
    ) AS q
),
transactions AS (
    -- Revenue (stored in micros, hence / 1000000) and transaction count
    -- per day and session key.
    SELECT
        PARSE_DATE('%Y%m%d', CAST(date AS STRING)) AS true_date,
        CONCAT(fullvisitorid, CAST(visitstarttime AS STRING)) AS ID,
        SUM(totals.totalTransactionRevenue/1000000) AS all_revenue,
        SUM(totals.transactions) AS all_transactions
    FROM `ga360.123456.ga_sessions_*`
    WHERE _table_suffix = FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY))
    GROUP BY 1, 2
)
SELECT
    hp.true_date,
    hp.isEntrance,
    hp.eventCategory,
    hp.eventAction,
    hp.eventLabel,
    hp.click,
    t.all_revenue AS revenue,
    t.all_transactions AS transactions
FROM event_home_page AS hp
LEFT JOIN transactions AS t
    ON hp.true_date = t.true_date
   AND hp.id = t.id
ORDER BY revenue DESC

Filtering rows which only relate to a most recent order (Google Analytics data)

I am working on a BigQuery query for an attribution project, and I would like to return results that show all the sessions relating to a customer's most recent order, excluding any previous orders and the sessions which led up to them.
Here is my current SQL:
-- Distinct (user, session, start time, transaction) rows over the last
-- 60 full days, numbered per user from the oldest session to the newest.
WITH user_sessions AS (
    SELECT
        -- The filter below already restricts cd.index to 2.
        CASE WHEN cd.index = 2 THEN cd.value END AS userId,
        CONCAT(fullvisitorid, CAST(visitid AS STRING)) AS sessionid,
        visitstarttime,
        hits.transaction.transactionid AS transactionid
    FROM `my_project.dataset.ga_sessions_20*`
    CROSS JOIN UNNEST(customDimensions) AS cd
    CROSS JOIN UNNEST(hits) AS hits
    WHERE PARSE_DATE('%y%m%d', _table_suffix)
              BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 60 DAY)
                  AND DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)
      AND cd.index = 2
      -- Drop placeholder values stored in the user-id dimension.
      AND cd.value NOT LIKE "true"
      AND cd.value NOT LIKE "false"
      AND cd.value NOT LIKE "undefined"
      AND cd.value IS NOT NULL
    -- Grouping on every selected column removes the per-hit duplicates.
    GROUP BY 1, 2, 3, 4
)
SELECT
    *,
    ROW_NUMBER() OVER (PARTITION BY userid ORDER BY visitstarttime ASC) AS row_num
FROM user_sessions
This code is returning all sessions with associated orders, but I only want to see those related to the most recent order. For example, here I only want to see from row 11:
And here I only want to see from row 3 to row 16:
How could I adjust my code to flag these rows accordingly?
You can count the number of transactions in reverse order and take the rows where the count is 1:
-- Template: keeps, for each userid, only the rows whose reverse-chronological
-- running transaction count equals 1.
-- Replace the placeholder line inside the CTE with the original query.
with q as (
your query here
)
select q.*
from (select q.*,
-- COUNT(transactionid) counts non-NULL transaction ids only. With
-- ORDER BY visitstarttime DESC the window accumulates from each user's
-- newest row backwards, so the value becomes 1 at the newest row that
-- has a transaction and becomes 2 at the next older one.
count(transactionid) over (partition by userid order by visitstarttime desc) as grp
from q
) q
-- grp = 1 selects the newest transaction row plus the older rows before the
-- previous transaction. Rows newer than the latest transaction have grp = 0
-- and are filtered out.
where grp = 1;

Query across multiple datasets and a dynamic date range in BigQuery

I have a query that collects data for a dynamic date range (the last 7 days) from one dataset in BigQuery. My data source is Google Analytics, so I have other datasets connected with an identical schema, and I'd like the query to return data from those datasets as well. Usually I would use a UNION ALL for this, but my query contains a complex categorization expression that needs to be updated regularly, and I'd rather not maintain it separately for each dataset.
Could you advise on how to query across datasets, or suggest a more elegant way to handle the UNION ALL approach?
-- Entrance-hit counts, orders and channel per day and hostname for the
-- last 7 full days of one GA dataset.
SELECT
    Date,
    -- NOTE(review): this counts distinct visitId values, not
    -- fullVisitorId -- confirm that is the intended meaning of "users".
    COUNT(DISTINCT VisitId) AS users,
    COUNT(VisitId) AS sessions,
    SUM(totals.transactions) AS orders,
    CASE
        -- Organic Search - Google
        WHEN ( channelGrouping LIKE "Organic Search"
               OR trafficSource.source LIKE "com.google.android.googlequicksearchbox")
             AND trafficSource.source LIKE "%google%" THEN "Organic Search - Google"
        ELSE "Other"
    END AS Channel,
    hits.page.hostname AS site
FROM `xxx.dataset1.ga_sessions_20*`
CROSS JOIN UNNEST(hits) AS hits
WHERE PARSE_DATE('%y%m%d', _table_suffix)
          BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
              AND DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)
  AND totals.visits = 1
  AND hits.isEntrance IS TRUE
-- Every non-aggregated SELECT expression must be grouped, so `site` is
-- listed here. hits.isEntrance is fixed to TRUE by the WHERE clause and
-- does not need to be a grouping key.
GROUP BY
    Date,
    Channel,
    site
ORDER BY
    users DESC
UPDATE: Thanks to the responses below I have got as far as the following. The query reads all datasets in the UNION, but the date range is not being applied; instead, all data is being queried. Any ideas why the filter is not being picked up?
-- Sessions, orders and conversion rate per day, hostname and channel
-- across three GA datasets.
SELECT
    Date,
    LOWER(hits.page.hostname) AS site,
    COALESCE(COUNT(VisitId), 0) AS sessions,
    COALESCE(SUM(totals.transactions), 0) AS orders,
    COALESCE(ROUND(SUM(totals.transactions)/COUNT(VisitId), 4), 0) AS conv_rate,
    -- Channel definition starts here
    CASE
        -- Organic Search - Google
        WHEN ( channelGrouping LIKE "Organic Search"
               OR trafficSource.source LIKE "com.google.android.googlequicksearchbox")
             AND trafficSource.source LIKE "%google%" THEN "Organic Search - Google"
        ELSE "Other"
    END AS Channel
FROM (
    SELECT * FROM `xxx.43786551.ga_sessions_20*`
    UNION ALL
    SELECT * FROM `xxx.43786097.ga_sessions_20*`
    UNION ALL
    SELECT * FROM `xxx.43786092.ga_sessions_20*`
        -- NOTE: this WHERE is part of the last SELECT only; the first two
        -- branches of the UNION ALL are read without any date filter.
        WHERE PARSE_DATE('%Y%m%d', _TABLE_SUFFIX)
                  BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 3 DAY)
                      AND DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)
)
CROSS JOIN UNNEST(hits) AS hits
WHERE totals.visits = 1
  AND hits.isEntrance IS TRUE
GROUP BY
    Date,
    channel,
    hits.isEntrance,
    site
-- Repeats the entrance filter already applied in WHERE.
HAVING hits.isEntrance IS TRUE
#standardSQL
-- Users, sessions, orders and channel per day and hostname across three
-- GA datasets. The date filter is repeated on every UNION ALL branch
-- because each branch's WHERE applies to that branch alone.
WITH all_sessions AS (
    SELECT *
    FROM `xxx.dataset1.ga_sessions_20*`
    WHERE PARSE_DATE('%y%m%d', _TABLE_SUFFIX)
              BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
                  AND DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)

    UNION ALL

    SELECT *
    FROM `xxx.dataset2.ga_sessions_20*`
    WHERE PARSE_DATE('%y%m%d', _TABLE_SUFFIX)
              BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
                  AND DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)

    UNION ALL

    SELECT *
    FROM `xxx.dataset3.ga_sessions_20*`
    WHERE PARSE_DATE('%y%m%d', _TABLE_SUFFIX)
              BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)
                  AND DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)
)
SELECT
    DATE,
    COUNT(DISTINCT VisitId) AS users,
    COUNT(VisitId) AS sessions,
    SUM(totals.transactions) AS orders,
    CASE
        -- Organic Search - Google
        WHEN ( channelGrouping LIKE "Organic Search"
               OR trafficSource.source LIKE "com.google.android.googlequicksearchbox")
             AND trafficSource.source LIKE "%google%" THEN "Organic Search - Google"
        ELSE "Other"
    END AS Channel,
    hits.page.hostname AS site
FROM all_sessions
CROSS JOIN UNNEST(hits) AS hits
WHERE totals.visits = 1
  AND hits.isEntrance IS TRUE
GROUP BY
    DATE,
    Channel,
    site
ORDER BY
    Users DESC

Active customers for each day who were active in last 30 days

I have a BQ table, user_events that looks like the following:
event_date | user_id | event_type
Data is for Millions of users, for different event dates.
I want to write a query that will give me a list of users for every day who were active in last 30 days.
This gives me total unique users on only that day; I can't get it to give me the last 30 for each date. Help is appreciated.
-- Legacy SQL: distinct (user_id, event_date) pairs whose event_date falls
-- within 30 days before the current timestamp, newest dates first.
SELECT
  user_id,
  event_date
FROM [TableA]
WHERE user_id IS NOT NULL
  AND event_date >= DATE_ADD(CURRENT_TIMESTAMP(), -30, 'DAY')
GROUP BY
  user_id,
  event_date
ORDER BY
  event_date DESC
The query below is for BigQuery Standard SQL and makes a few assumptions about your case:
there is only one row per date per user
a user is considered active in the last 30 days if the user has at least 5 entries/rows within those 30 days (the threshold can be any number, even just 1)
If the above makes sense, see below.
#standardSQL
-- (user_id, event_date) pairs for which the user has at least 5 rows in
-- the 30 days before event_date (the current day itself is not counted).
WITH flagged AS (
  SELECT
    user_id,
    event_date,
    (COUNT(*) OVER prior_30_days) >= 5 AS activity
  FROM `yourTable`
  -- UNIX_DATE turns the date into a day number so the RANGE frame can
  -- address "30 days back" through "1 day back".
  WINDOW prior_30_days AS (
    PARTITION BY user_id
    ORDER BY UNIX_DATE(event_date)
    RANGE BETWEEN 30 PRECEDING AND 1 PRECEDING
  )
)
SELECT
  user_id,
  event_date
FROM flagged
WHERE activity
GROUP BY
  user_id,
  event_date
-- Append "ORDER BY event_date" if sorted output is wanted.
If assumption #1 above does not hold, you can simply add a pre-grouping step as a sub-select:
#standardSQL
-- Same activity test as a plain windowed count, but the input is first
-- reduced to one row per user and day, so the count is the number of
-- distinct active days in the 30 days before event_date.
WITH daily_users AS (
  SELECT DISTINCT
    user_id,
    event_date
  FROM `yourTable`
),
flagged AS (
  SELECT
    user_id,
    event_date,
    (COUNT(*) OVER prior_30_days) >= 5 AS activity
  FROM daily_users
  WINDOW prior_30_days AS (
    PARTITION BY user_id
    ORDER BY UNIX_DATE(event_date)
    RANGE BETWEEN 30 PRECEDING AND 1 PRECEDING
  )
)
SELECT
  user_id,
  event_date
FROM flagged
WHERE activity
GROUP BY
  user_id,
  event_date
-- Append "ORDER BY event_date" if sorted output is wanted.
UPDATE
From the comments: if a user has any event with event_type IN ('view', 'conversion', 'productDetail', 'search'), they are considered active. That means any kind of event triggered within the app.
So I think you can go with the query below:
#standardSQL
-- Activity test restricted to the qualifying event types: a (user, day)
-- pair is returned when the user has at least 5 qualifying days in the
-- 30 days before event_date.
WITH qualifying_days AS (
  -- One row per user and day that has at least one qualifying event.
  SELECT DISTINCT
    user_id,
    event_date
  FROM `yourTable`
  WHERE event_type IN ('view', 'conversion', 'productDetail', 'search')
),
flagged AS (
  SELECT
    user_id,
    event_date,
    (COUNT(*) OVER prior_30_days) >= 5 AS activity
  FROM qualifying_days
  WINDOW prior_30_days AS (
    PARTITION BY user_id
    ORDER BY UNIX_DATE(event_date)
    RANGE BETWEEN 30 PRECEDING AND 1 PRECEDING
  )
)
SELECT
  user_id,
  event_date
FROM flagged
WHERE activity
GROUP BY
  user_id,
  event_date
-- Append "ORDER BY event_date" if sorted output is wanted.

Billing tier issues with 30 day active user query within bigquery

Is there a way in BigQuery to run this query without needing such a high billing tier? It currently ranges anywhere from tier 11 to tier 20. Is my only option to raise the billing tier and accept the charges?
-- Distinct app-instance counts per app and UTC day, emitted as
-- (target, datapoint_time, datapoint_value) rows for days from the start
-- of yesterday (UTC) onwards.
-- allTables: pivots the per-period user counts into one column per period
-- (only period 1 is defined here).
WITH allTables AS (SELECT
app,
date,
-- users for period = 1; rows of any other period contribute NULL.
SUM(CASE WHEN period = 1 THEN users END) as days_1
FROM (
SELECT
-- App key in the form "<app_id>:<app_platform>".
CONCAT(user_dim.app_info.app_id, ':', user_dim.app_info.app_platform) as app,
dates.date as date,
periods.period as period,
COUNT(DISTINCT user_dim.app_info.app_instance_id) as users
FROM `table.*` as activity
CROSS JOIN
UNNEST(event_dim) AS event
-- dates: every distinct UTC day on which any event occurred, taken from a
-- second scan of the same wildcard table. Each unnested event is
-- cross-joined with each of these days and the pairs are filtered in the
-- WHERE clause below.
-- NOTE(review): presumably this event x day cross join is what makes the
-- query expensive -- confirm against the query plan.
CROSS JOIN (
SELECT DISTINCT
TIMESTAMP_TRUNC(TIMESTAMP_MICROS(event.timestamp_micros), DAY, 'UTC') as date
FROM `table.*`
CROSS JOIN
UNNEST(event_dim) as event) as dates
-- periods: a single row with period = 1.
CROSS JOIN (
SELECT
period
FROM (
SELECT 1 as period
)
) as periods
WHERE
-- Keep a pair only when the day is on or after the event's own UTC day ...
dates.date >= TIMESTAMP_TRUNC(TIMESTAMP_MICROS(event.timestamp_micros), DAY, 'UTC')
AND
-- ... and the whole-day difference divided by the period floors to 0.
FLOOR(TIMESTAMP_DIFF(dates.date, TIMESTAMP_MICROS(event.timestamp_micros), DAY)/periods.period) = 0
-- Positions 1,2,3 are app, date, period.
GROUP BY 1,2,3
)
-- Positions 1,2 are app, date.
GROUP BY 1,2) SELECT
app as target,
UNIX_SECONDS(date) as datapoint_time,
SUM(days_1) as datapoint_value
FROM allTables
WHERE
-- Midnight UTC of the current day minus one day.
date >= TIMESTAMP_ADD(TIMESTAMP_TRUNC(CURRENT_TIMESTAMP, Day, 'UTC'), INTERVAL -1 DAY)
-- Position 1 is the first SELECT column (app as target).
GROUP BY date,1
ORDER BY date ASC