How to run SUM() OVER PARTITION BY for COUNT DISTINCT - sql

I'm trying to get the number of distinct users for each event at a daily level while maintaining a running sum for every hour.
I'm using Athena/Presto as the query engine.
I tried the following query:
SELECT
eventname,
date(from_unixtime(time_bucket)) AS date,
(time_bucket % 86400)/3600 as hour,
count,
SUM(count) OVER (PARTITION BY eventname, date(from_unixtime(time_bucket)) ORDER BY eventname, time_bucket) AS running_sum_count
FROM (
SELECT
eventname,
CAST(eventtimestamp AS bigint) - CAST(eventtimestamp AS bigint) % 3600 AS time_bucket,
COUNT(DISTINCT moengageuserid) as count
FROM clickstream.moengage
WHERE date = '2020-08-20'
AND eventname IN ('e1', 'e2', 'e3', 'e4')
GROUP BY 1,2
ORDER BY 1,2
);
But on seeing the results I realized that taking SUM of COUNT DISTINCT is not correct as it's not additive.
So, I tried the below query
SELECT
eventname,
date(from_unixtime(time_bucket)) AS date,
(time_bucket % 86400)/3600 as hour,
SUM(COUNT(DISTINCT moengageuserid)) OVER (PARTITION BY eventname, date(from_unixtime(time_bucket)) ORDER BY eventname, time_bucket) AS running_sum
FROM (
SELECT
eventname,
CAST(eventtimestamp AS bigint) - CAST(eventtimestamp AS bigint) % 3600 AS time_bucket,
moengageuserid
FROM clickstream.moengage
WHERE date = '2020-08-20'
AND eventname IN ('e1', 'e2', 'e3', 'e4')
);
But this query fails with the following error:
SYNTAX_ERROR: line 5:99: ORDER BY expression '"time_bucket"' must be an aggregate expression or appear in GROUP BY clause

Count the first time a user appears for the running distinct count:
-- Running distinct count: each user contributes 1 only in the hour of their
-- first event per eventname (seqnum = 1), so a running SUM of those flags is
-- additive across hours -- unlike summing COUNT(DISTINCT ...).
-- Note SUM(SUM(...)) OVER: the inner SUM aggregates per group; the outer SUM
-- is the window running total over the already-aggregated rows.
SELECT eventname, date(from_unixtime(time_bucket)) AS date,
       (time_bucket % 86400) / 3600 AS hour,
       COUNT(DISTINCT moengageuserid) AS hour_count,
       SUM(SUM(CASE WHEN seqnum = 1 THEN 1 ELSE 0 END)) OVER (
           PARTITION BY eventname, date(from_unixtime(time_bucket))
           ORDER BY time_bucket
       ) AS running_distinct_count
FROM (SELECT eventname,
             -- truncate the epoch timestamp down to the start of its hour
             CAST(eventtimestamp AS bigint) - CAST(eventtimestamp AS bigint) % 3600 AS time_bucket,
             moengageuserid,
             -- 1 for the first row of each (eventname, user) pair
             ROW_NUMBER() OVER (PARTITION BY eventname, moengageuserid ORDER BY eventtimestamp) AS seqnum
      FROM clickstream.moengage
      WHERE date = '2020-08-20' AND
            eventname IN ('e1', 'e2', 'e3', 'e4')
     ) m
-- Group by the raw time_bucket so it stays legal inside the window ORDER BY;
-- date and hour above are deterministic expressions of time_bucket.
GROUP BY eventname, time_bucket
ORDER BY 1, 2;

To calculate running distinct count you can collect user IDs into set (distinct array) and get the size:
cardinality(set_agg(moengageuserid)) OVER (PARTITION BY eventname, date(from_unixtime(time_bucket)) ORDER BY eventname, time_bucket) AS running_sum
This is analytic function and will assign the same value to the whole partition (eventname, date), you can aggregate records in upper subquery using max(), etc.

Related

Lag functions and SUM

I need to get the list of users that have been offline for at least 20 min every day. Here's my data
I have this starting query but am stuck on how to sum the difference in offline_mins i.e. need to add "and sum(offline_mins)>=20" to the where clause
SELECT
userid,
connected,
LAG(recordeddt) OVER(PARTITION BY userid
ORDER BY userid,
recordeddt) AS offline_period,
DATEDIFF(minute, LAG(recordeddt) OVER(PARTITION BY userid
ORDER BY userid,
recordeddt),recordeddt) offline_mins
FROM device_data where connected=0;
My expected results :
Thanks in advance.
This reads like a gaps-and-island problem, where you want to group together adjacent rows having the same userid and status.
As a starter, here is a query that computes the islands:
-- Gaps-and-islands: rn1 - rn2 is constant within each run of consecutive rows
-- sharing the same (userid, connected) status, so grouping on it isolates each
-- island. lead_recordeddt extends an island's end to the first row after it.
select userid, connected, min(recordeddt) startdt, max(lead_recordeddt) enddt,
       -- DATEDIFF needs a datepart in T-SQL, and start before end so the
       -- duration comes out positive
       datediff(minute, min(recordeddt), max(lead_recordeddt)) duration
from (
    select dd.*,
           row_number() over(partition by userid order by recordeddt) rn1,
           row_number() over(partition by userid, connected order by recordeddt) rn2,
           lead(recordeddt) over(partition by userid order by recordeddt) lead_recordeddt
    from device_data dd
) dd
group by userid, connected, rn1 - rn2
Now, say you want users that were offline for at least 20 minutes every day. You can breakdown the islands per day, and use a having clause for filtering:
-- Users offline >= 20 consecutive minutes on EVERY recorded day:
-- break the islands down per calendar day, then require that the count of
-- qualifying days equals the count of all recorded days.
select userid
from (
    select recordedday, userid, connected,
           -- T-SQL DATEDIFF takes a datepart; start before end for a positive duration
           datediff(minute, min(recordeddt), max(lead_recordeddt)) duration
    from (
        select dd.*, v.*,
               row_number() over(partition by v.recordedday, userid order by recordeddt) rn1,
               row_number() over(partition by v.recordedday, userid, connected order by recordeddt) rn2,
               lead(recordeddt) over(partition by v.recordedday, userid order by recordeddt) lead_recordeddt
        from device_data dd
        -- name the per-row calendar day once instead of repeating convert(date, ...)
        cross apply (values (convert(date, recordeddt))) v(recordedday)
    ) dd
    -- group by the exposed recordedday column so the SELECT list is covered
    group by recordedday, userid, connected, rn1 - rn2
) dd
group by userid
having count(distinct case when connected = 0 and duration >= 20 then recordedday end) = count(distinct recordedday)
As noted this is a gaps and island problem. This is my take on it using a simple lag function to create groups, filter out the connected rows and then work on the date ranges.
CREATE TABLE #tmp(ID int, UserID int, dt datetime, connected int)
INSERT INTO #tmp VALUES
(1,1,'11/2/20 10:00:00',1),
(2,1,'11/2/20 10:05:00',0),
(3,1,'11/2/20 10:10:00',0),
(4,1,'11/2/20 10:15:00',0),
(5,1,'11/2/20 10:20:00',0),
(6,2,'11/2/20 10:00:00',1),
(7,2,'11/2/20 10:05:00',1),
(8,2,'11/2/20 10:10:00',0),
(9,2,'11/2/20 10:15:00',0),
(10,2,'11/2/20 10:20:00',0),
(11,2,'11/2/20 10:25:00',0),
(12,2,'11/2/20 10:30:00',0)
-- Outer level: one row per offline island, keeping only islands lasting >= 20 min.
SELECT UserID, connected,DATEDIFF(minute,MIN(DT), MAX(DT)) OFFLINE_MINUTES
FROM
(
-- grp: running count of status changes; rows between two changes share a grp
-- value, so (UserID, grp) identifies one island of identical status.
-- NOTE(review): the window is ordered over all users at once, but grp is only
-- consumed inside GROUP BY UserID, grp below, so cross-user increments are harmless.
SELECT *, SUM(CASE WHEN connected <> LG THEN 1 ELSE 0 END) OVER (ORDER BY UserID,dt) grp
FROM
(
-- LG: previous row's connected status per user; the 3rd argument makes the
-- first row default to its own status (no spurious change at the start).
select *, LAG(connected,1,connected) OVER(PARTITION BY UserID ORDER BY UserID,dt) LG
from #tmp
) x
) y
-- keep only the offline islands
WHERE connected <> 1
GROUP BY UserID,grp,connected
HAVING DATEDIFF(minute,MIN(DT), MAX(DT)) >= 20

How could i find out the day of the week that has the lowest number of single rider trips on average?

Below is my code, but it only returns the total number of rides that are single rides. I tried to apply two aggregate functions together (min and count) but it didn't work.
select count(VendorID) as num_1
from Yellow_trip_data
where passenger_count=1 and RatecodeID = 1
GROUP BY dayofweek(tpep_pickup_datetime);
8569039
10840901
10470560
10965369
10697947
9850593
9591603
-- Pick the weekday with the fewest single-rider trips (by total count).
WITH YourQuery as (
    -- closing paren after the column was missing in the original
    select dayofweek(tpep_pickup_datetime) as day_of_week, count(VendorID) as num_1
    from Yellow_trip_data
    where passenger_count=1 and RatecodeID = 1
    GROUP BY dayofweek(tpep_pickup_datetime))
SELECT day_of_week from YourQuery ORDER BY num_1 ASC LIMIT 1
I just used your query and selected the minimum number.
If you want to show the average:
WITH SingleRiderCountsPerDay AS (
    -- one row per calendar day: count of single-rider trips that day
    select date_trunc(tpep_pickup_datetime,'DD') as pickup_date, count(VendorID) as number_of_rides
    from Yellow_trip_data
    where passenger_count=1 and RatecodeID = 1
    GROUP BY date_trunc(tpep_pickup_datetime,'DD')
), AverageSingleRidesPerWeekday AS (
    -- average the daily counts within each weekday
    select dayofweek(pickup_date) as day_of_week, AVG(number_of_rides) as average_number_of_rides
    from SingleRiderCountsPerDay
    GROUP BY dayofweek(pickup_date)
)
-- order by the column this CTE actually defines (num_1 does not exist here)
SELECT day_of_week, average_number_of_rides
FROM AverageSingleRidesPerWeekday
ORDER BY average_number_of_rides ASC LIMIT 1

BigQuery SQL: filter on event sequence

I want to count, for each app_id, how many times the event_type: store_app_view was followed by the event_type: store_app_download for the same user ("followed" meaning the event_time_utc of store_app_view is older than event_time_utc of store_app_download).
Sample data:
WITH
`project.dataset.dummy_data_init` AS (SELECT event_id FROM UNNEST(GENERATE_ARRAY(1, 10000)) event_id),
`project.dataset.dummy_data_completed` AS (SELECT event_id,
user_id[OFFSET(CAST(20 * RAND() - 0.5 AS INT64))] user_id,
app_id[OFFSET(CAST(100 * RAND() - 0.5 AS INT64))] app_id,
event_type[OFFSET(CAST(6 * RAND() - 0.5 AS INT64))] event_type,
event_time_utc[OFFSET(CAST(26 * RAND() - 0.5 AS INT64))] event_time_utc
FROM `project.dataset.dummy_data_init`,
(SELECT GENERATE_ARRAY(1, 20) user_id),
(SELECT GENERATE_ARRAY(1, 100) app_id),
(SELECT ['store_app_view', 'store_app_view', 'store_app_download','store_app_install','store_app_update','store_fetch_manifest'] event_type),
(SELECT GENERATE_TIMESTAMP_ARRAY('2020-01-01 00:00:00', '2020-01-26 00:00:00',
INTERVAL 1 DAY) AS event_time_utc))
Select * FROM `project.dataset.dummy_data_completed`
Thanks!
I want to count, for each app_id, how many times the event_type: store_app_view was followed by the event_type: store_app_download.
Your provided query seems to have almost no connection to this question, so I'll ignore it.
For each user/app pair, you can get the rows that matching your conditions using GROUP BY:
-- User/app pairs where the earliest view precedes the latest download.
-- Column is event_time_utc per the question's schema, not event_time.
select user_id, app_id
from t
group by user_id, app_id
having min(case when event_type = 'store_app_view' then event_time_utc end) <
       max(case when event_type = 'store_app_download' then event_time_utc end);
To get the total for each app, use a subquery or CTE:
-- Per-app total of user/app pairs where a view preceded a download.
-- Column is event_time_utc per the question's schema, not event_time.
select app_id, count(*)
from (select user_id, app_id
      from t
      group by user_id, app_id
      having min(case when event_type = 'store_app_view' then event_time_utc end) <
             max(case when event_type = 'store_app_download' then event_time_utc end)
     ) ua
group by app_id;

NTILE() in BigQuery for non-uniform buckets

I'm trying to perform RFM segmentation on the Google Merchandise Store sample dataset on BigQuery. In my SQL query, NTILE(5) divides the rows into 5 buckets based on row ordering and returns the bucket number that is assigned to each row. In this case, each bucket are of equal size. Would like to find out how to create buckets of different sizes instead. For example, bucket 1 contains the bottom 10%, bucket 2 contains the next 20% of records etc. Thank you!
#standard SQL
SELECT
fullVisitorId,
NTILE(5) OVER (ORDER BY last_order_date) AS rfm_recency,
NTILE(5) OVER (ORDER BY count_order) AS rfm_frequency,
NTILE(5) OVER (ORDER BY avg_amount) AS rfm_monetary
FROM (
SELECT
fullVisitorId,
MAX(date) AS last_order_date,
COUNT(*) AS count_order,
AVG(totals.totalTransactionRevenue)/1000000 AS avg_amount
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_20170*`
WHERE
_table_suffix BETWEEN "101"
AND "801"
AND totals.totalTransactionRevenue IS NOT NULL
GROUP BY
fullVisitorId )
You can use row_number() and count(*) to define your own buckets:
SELECT fullVisitorId,
(CASE WHEN seqnum_r <= 0.1 * cnt THEN 1
WHEN seqnum_r <= 0.3 * cnt THEN 2
ELSE 3
END) as bin_r,
. . .
FROM (SELECT fullVisitorId,
MAX(date) AS last_order_date,
COUNT(*) AS count_order,
(AVG(totals.totalTransactionRevenue) / 1000000) AS avg_amount,
COUNT(*) OVER () as cnt,
ROW_NUMBER() OVER (ORDER BY MAX(date)) as seqnum_r,
ROW_NUMBER() OVER (ORDER BY COUNT(*)) as seqnum_f,
ROW_NUMBER() OVER (ORDER BY AVG(totals.totalTransactionRevenue)) as seqnum_m
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20170*`
WHERE _table_suffix BETWEEN "101" AND "801" AND
totals.totalTransactionRevenue IS NOT NULL
GROUP BY fullVisitorId
) rfm
Below is for BigQuery Standard SQL and assumes your initial query works for for you, SQL UDF NON_UNIFORM_BUCKET() does the trick for you
#standard SQL
CREATE TEMP FUNCTION NON_UNIFORM_BUCKET(i INT64) AS (
CASE
WHEN i = 1 THEN 1
WHEN i IN (2, 3) THEN 2
WHEN i IN (4, 5, 6) THEN 3
WHEN i = 7 THEN 4
ELSE 5
END
);
SELECT
fullVisitorId,
NON_UNIFORM_BUCKET(NTILE(10) OVER (ORDER BY last_order_date)) AS rfm_recency,
NON_UNIFORM_BUCKET(NTILE(10) OVER (ORDER BY count_order)) AS rfm_frequency,
NON_UNIFORM_BUCKET(NTILE(10) OVER (ORDER BY avg_amount)) AS rfm_monetary
FROM (
SELECT
fullVisitorId,
MAX(date) AS last_order_date,
COUNT(*) AS count_order,
AVG(totals.totalTransactionRevenue)/1000000 AS avg_amount
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_20170*`
WHERE
_table_suffix BETWEEN "101"
AND "801"
AND totals.totalTransactionRevenue IS NOT NULL
GROUP BY
fullVisitorId )

Return max value from a SQL selection

I do have a table license_Usage where which works like a log of the usage of licenses in a day
ID User license date
1 1 A 22/2/2015
2 1 A 23/2/2015
3 1 B 22/2/2015
4 2 A 22/2/2015
Where I want to count how many licenses per user in a day, the result should look like:
QuantityOfLicenses User date
2 1 22/2/2015
1 2 22/2/2015
For that I did the following query :
select count(license) as [Quantity of licenses],[user],[date]
From license_Usage
where date = '22/2/2015'
Group by [date], [user]
which works, but now I want to know which user has used the most licenses, for which I ran the following query:
select MAX(result.[Quantity of licenses])
From (
select count(license) as [Quantity of licenses],[user],[date]
From license_Usage
Group by [date], [user]
) as result
And it returns the max value of 2, but when I want to know which user have used 2 licenses,I try this query with no success :
select result.user, MAX(result.[Quantity of licenses])
From (
select count(license) as [Quantity of licenses],[user],[date]
From license_Usage
Group by [date], [user]
) as result
Group by result.user
You can use something like this:
select top 1 *
From (
select count(license) as Quantity,[user],[date]
From license_Usage
Group by [date], [user]
) as result
order by Quantity desc
If you need to have a fetch that fetches all the rows that have max in case there's several, then you'll have to use rank() window function
Use RANK to rank the users by the number of licenses per day.
-- Rank users within each day by license count; rank 1 = most licenses.
-- RANK (not ROW_NUMBER) so ties all get rank 1.
SELECT
LicPerDay.*,
RANK() OVER (PARTITION BY [date] ORDER BY Qty DESC) AS User_Rank
FROM (
-- per user per day license counts
SELECT
COUNT(license) AS Qty,
User,
[date]
FROM license_usage
GROUP BY User, [date]
) LicPerDay
Any user with User_Rank = 1 will have the most licenses for that day.
If you only want the top user for each day, wrap the query above as a subquery and filter on User_Rank = 1:
SELECT * FROM (
    SELECT
        LicPerDay.*,
        -- DESC so rank 1 = MOST licenses that day (matches the ranked query
        -- above; ascending order would select the fewest instead)
        RANK() OVER (PARTITION BY [date] ORDER BY Qty DESC) AS User_Rank
    FROM (
        -- per user per day license counts
        SELECT
            COUNT(license) AS Qty,
            User,
            [date]
        FROM license_usage
        GROUP BY User, [date]
    ) LicPerDay
) LicPerDayRanks
WHERE User_Rank = 1
Use a Windowed Aggregate Function, RANK, to get the highest count:
SELECT * FROM (
    SELECT
        User,
        -- comma after [date] was missing in the original (syntax error)
        [date],
        COUNT(license) AS Qty,
        -- rank by descending number for each day ??
        --RANK() OVER (PARTITION BY [date] ORDER BY COUNT(license) DESC) AS rnk
        -- rank by descending number
        RANK() OVER (ORDER BY COUNT(license) DESC) AS rnk
    FROM license_usage
    GROUP BY User, [date]
) dt
WHERE rnk = 1