NTILE() in BigQuery for non-uniform buckets - sql

I'm trying to perform RFM segmentation on the Google Merchandise Store sample dataset on BigQuery. In my SQL query, NTILE(5) divides the rows into 5 buckets based on row ordering and returns the bucket number that is assigned to each row. In this case, the buckets are all of equal size. I would like to find out how to create buckets of different sizes instead. For example, bucket 1 contains the bottom 10%, bucket 2 contains the next 20% of records, etc. Thank you!
#standard SQL
-- RFM scoring: one row per visitor, with each metric split into five tiles by NTILE(5).
WITH visitor_orders AS (
  -- Per-visitor aggregates over sessions that have a non-NULL transaction revenue.
  SELECT
    fullVisitorId,
    MAX(date) AS last_order_date,
    COUNT(*) AS count_order,
    AVG(totals.totalTransactionRevenue) / 1000000 AS avg_amount
  FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20170*`
  WHERE totals.totalTransactionRevenue IS NOT NULL
    AND _table_suffix BETWEEN "101" AND "801"
  GROUP BY fullVisitorId
)
SELECT
  fullVisitorId,
  NTILE(5) OVER (ORDER BY last_order_date) AS rfm_recency,
  NTILE(5) OVER (ORDER BY count_order) AS rfm_frequency,
  NTILE(5) OVER (ORDER BY avg_amount) AS rfm_monetary
FROM visitor_orders

You can use row_number() and count(*) to define your own buckets:
-- Custom-width buckets: number every visitor per metric with ROW_NUMBER(), then
-- compare that position against fractions of the total visitor count (cnt).
-- Thresholds used for each metric: first 10% -> 1, next 20% -> 2, remainder -> 3.
-- Add further WHEN branches to define more buckets.
SELECT fullVisitorId,
       (CASE WHEN seqnum_r <= 0.1 * cnt THEN 1
             WHEN seqnum_r <= 0.3 * cnt THEN 2
             ELSE 3
        END) AS bin_r,
       (CASE WHEN seqnum_f <= 0.1 * cnt THEN 1
             WHEN seqnum_f <= 0.3 * cnt THEN 2
             ELSE 3
        END) AS bin_f,
       (CASE WHEN seqnum_m <= 0.1 * cnt THEN 1
             WHEN seqnum_m <= 0.3 * cnt THEN 2
             ELSE 3
        END) AS bin_m
FROM (SELECT fullVisitorId,
             MAX(date) AS last_order_date,
             COUNT(*) AS count_order,
             (AVG(totals.totalTransactionRevenue) / 1000000) AS avg_amount,
             -- Window functions run after GROUP BY, so cnt is the number of visitors.
             COUNT(*) OVER () AS cnt,
             ROW_NUMBER() OVER (ORDER BY MAX(date)) AS seqnum_r,
             ROW_NUMBER() OVER (ORDER BY COUNT(*)) AS seqnum_f,
             ROW_NUMBER() OVER (ORDER BY AVG(totals.totalTransactionRevenue)) AS seqnum_m
      FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20170*`
      WHERE _table_suffix BETWEEN "101" AND "801" AND
            totals.totalTransactionRevenue IS NOT NULL
      GROUP BY fullVisitorId
     ) rfm

Below is for BigQuery Standard SQL and assumes your initial query works for you; the SQL UDF NON_UNIFORM_BUCKET() does the trick:
#standard SQL
-- Maps a tile number to one of five unequal buckets:
-- 1 -> 1, 2-3 -> 2, 4-6 -> 3, 7 -> 4, any other input -> 5.
CREATE TEMP FUNCTION NON_UNIFORM_BUCKET(i INT64) AS (
  CASE i
    WHEN 1 THEN 1
    WHEN 2 THEN 2
    WHEN 3 THEN 2
    WHEN 4 THEN 3
    WHEN 5 THEN 3
    WHEN 6 THEN 3
    WHEN 7 THEN 4
    ELSE 5
  END
);
WITH visitor_orders AS (
  -- Per-visitor aggregates over sessions that have a non-NULL transaction revenue.
  SELECT
    fullVisitorId,
    MAX(date) AS last_order_date,
    COUNT(*) AS count_order,
    AVG(totals.totalTransactionRevenue) / 1000000 AS avg_amount
  FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20170*`
  WHERE totals.totalTransactionRevenue IS NOT NULL
    AND _table_suffix BETWEEN "101" AND "801"
  GROUP BY fullVisitorId
)
-- Each metric is first cut into ten tiles; the UDF then merges tiles into buckets.
SELECT
  fullVisitorId,
  NON_UNIFORM_BUCKET(NTILE(10) OVER (ORDER BY last_order_date)) AS rfm_recency,
  NON_UNIFORM_BUCKET(NTILE(10) OVER (ORDER BY count_order)) AS rfm_frequency,
  NON_UNIFORM_BUCKET(NTILE(10) OVER (ORDER BY avg_amount)) AS rfm_monetary
FROM visitor_orders

Related

Lag functions and SUM

I need to get the list of users that have been offline for at least 20 min every day. Here's my data
I have this starting query but am stuck on how to sum the difference in offline_mins i.e. need to add "and sum(offline_mins)>=20" to the where clause
-- For each reading with connected = 0, show the same user's previous such
-- reading and the number of minutes between the two.
SELECT
    r.userid,
    r.connected,
    r.prev_recordeddt AS offline_period,
    DATEDIFF(minute, r.prev_recordeddt, r.recordeddt) AS offline_mins
FROM (
    -- The filter is applied before LAG, so only connected = 0 rows are compared.
    SELECT
        userid,
        connected,
        recordeddt,
        LAG(recordeddt) OVER (PARTITION BY userid ORDER BY recordeddt) AS prev_recordeddt
    FROM device_data
    WHERE connected = 0
) AS r;
My expected results :
Thanks in advance.
This reads like a gaps-and-island problem, where you want to group together adjacent rows having the same userid and status.
As a starter, here is a query that computes the islands:
-- Gaps-and-islands: consecutive rows of one user with the same connected value
-- form an island, identified by the difference of the two row numbers (rn1 - rn2).
-- Each island runs from its first reading to the reading that follows its last row.
select
    userid,
    connected,
    min(recordeddt) as startdt,
    max(lead_recordeddt) as enddt,
    -- SQL Server's DATEDIFF needs the datepart first; duration is in minutes.
    datediff(minute, min(recordeddt), max(lead_recordeddt)) as duration
from (
    select dd.*,
        row_number() over(partition by userid order by recordeddt) rn1,
        row_number() over(partition by userid, connected order by recordeddt) rn2,
        lead(recordeddt) over(partition by userid order by recordeddt) lead_recordeddt
    from device_data dd
) dd
group by userid, connected, rn1 - rn2
Now, say you want users that were offline for at least 20 minutes every day. You can break down the islands per day, and use a having clause for filtering:
-- Users that have, on every day they appear in device_data, at least one
-- offline island (connected = 0) lasting 20 minutes or more.
select userid
from (
    -- One row per (day, user, island) with its duration in minutes.
    select recordedday, userid, connected,
        -- SQL Server's DATEDIFF needs the datepart as its first argument.
        datediff(minute, min(recordeddt), max(lead_recordeddt)) as duration
    from (
        select dd.*, v.recordedday,
            row_number() over(partition by v.recordedday, userid order by recordeddt) rn1,
            row_number() over(partition by v.recordedday, userid, connected order by recordeddt) rn2,
            lead(recordeddt) over(partition by v.recordedday, userid order by recordeddt) lead_recordeddt
        from device_data dd
        cross apply (values (convert(date, recordeddt))) v(recordedday)
    ) readings
    -- Group by the same recordedday column that is selected above.
    group by recordedday, userid, connected, rn1 - rn2
) islands
group by userid
-- Days with a qualifying offline island must equal all days seen for the user.
having count(distinct case when connected = 0 and duration >= 20 then recordedday end) = count(distinct recordedday)
As noted this is a gaps and island problem. This is my take on it using a simple lag function to create groups, filter out the connected rows and then work on the date ranges.
-- Demo data: two users with readings five minutes apart.
CREATE TABLE #tmp(ID int, UserID int, dt datetime, connected int);
INSERT INTO #tmp (ID, UserID, dt, connected) VALUES
(1,1,'11/2/20 10:00:00',1),
(2,1,'11/2/20 10:05:00',0),
(3,1,'11/2/20 10:10:00',0),
(4,1,'11/2/20 10:15:00',0),
(5,1,'11/2/20 10:20:00',0),
(6,2,'11/2/20 10:00:00',1),
(7,2,'11/2/20 10:05:00',1),
(8,2,'11/2/20 10:10:00',0),
(9,2,'11/2/20 10:15:00',0),
(10,2,'11/2/20 10:20:00',0),
(11,2,'11/2/20 10:25:00',0),
(12,2,'11/2/20 10:30:00',0);

WITH with_previous AS (
    -- LG is the user's previous connected value (the row's own value on the first row).
    SELECT *, LAG(connected, 1, connected) OVER (PARTITION BY UserID ORDER BY dt) AS LG
    FROM #tmp
),
with_group AS (
    -- grp increases by one each time connected differs from the previous value.
    SELECT *, SUM(IIF(connected <> LG, 1, 0)) OVER (ORDER BY UserID, dt) AS grp
    FROM with_previous
)
-- Keep only the not-connected groups that span 20 minutes or more.
SELECT UserID, connected, DATEDIFF(minute, MIN(dt), MAX(dt)) AS OFFLINE_MINUTES
FROM with_group
WHERE connected <> 1
GROUP BY UserID, grp, connected
HAVING DATEDIFF(minute, MIN(dt), MAX(dt)) >= 20;

conditional running sum

I'm trying to return the number of unique users that converted over time.
So I have the following query:
-- Inline sample data, then the number of distinct users with goals >= 1 per date.
WITH CTE AS (
  SELECT *
  FROM UNNEST([
    STRUCT('2020-04-01' AS date, 'userA' AS user, 1 AS goals),
    ('2020-04-01', 'userB', 0),
    ('2020-04-01', 'userC', 0),
    ('2020-04-03', 'userA', 1),
    ('2020-04-05', 'userC', 1),
    ('2020-04-06', 'userC', 0),
    ('2020-04-06', 'userB', 0)
  ])
)
SELECT
  date,
  -- Rows with goals < 1 yield NULL, which COUNT(DISTINCT ...) ignores.
  COUNT(DISTINCT CASE WHEN goals >= 1 THEN user END) AS cad_converters
FROM CTE
GROUP BY date
I'm trying to count distinct users, but I need to find a way to apply the distinct count over the whole date range so far. I probably need to do something like a cumulative sum...
expected result would be something like this
date, goals, total_unique_converted_users
'2020-04-01',1,1
'2020-04-01',0,1
'2020-04-01',0,1
'2020-04-03',1,2
'2020-04-05',1,2
'2020-04-06',0,2
'2020-04-06',0,2
Below is for BigQuery Standard SQL
#standardSQL
-- For every source row: the number of distinct users with goals >= 1 on or
-- before that row's date.
WITH converters_to_date AS (
  -- Pair each date with all rows dated on or before it, then count converters.
  SELECT
    anchor.date,
    COUNT(DISTINCT CASE WHEN earlier.goals >= 1 THEN earlier.user END) AS total_unique_converted_users
  FROM `project.dataset.table` AS anchor
  CROSS JOIN `project.dataset.table` AS earlier
  WHERE earlier.date <= anchor.date
  GROUP BY anchor.date
)
SELECT
  t.date,
  t.goals,
  c.total_unique_converted_users
FROM `project.dataset.table` AS t
LEFT JOIN converters_to_date AS c
  ON c.date = t.date
I would approach this by tagging when the first goal is scored for each name. Then simply do a cumulative sum:
-- Number each user's goals = 1 rows by date (seqnum), then keep a running count,
-- ordered by date, of the rows where seqnum = 1.
SELECT
  tagged.* EXCEPT (seqnum),
  COUNTIF(seqnum = 1) OVER (ORDER BY date)
FROM (
  SELECT
    src.*,
    -- seqnum stays NULL for rows whose goals is not 1.
    IF(goals = 1, ROW_NUMBER() OVER (PARTITION BY user, goals ORDER BY date), NULL) AS seqnum
  FROM cte AS src
) AS tagged;
I realize this can be expressed without the case in the subquery:
-- Number rows within each (user, goals) pair by date, then keep a running count,
-- ordered by date, of the rows that are both goals = 1 and first in their pair.
SELECT
  numbered.* EXCEPT (seqnum),
  COUNTIF(goals = 1 AND seqnum = 1) OVER (ORDER BY date)
FROM (
  SELECT
    src.*,
    ROW_NUMBER() OVER (PARTITION BY user, goals ORDER BY date) AS seqnum
  FROM cte AS src
) AS numbered;

Query to get the sum of values for the maximum number of months

This query in Oracle 11 gets the sum of value for the last 1 year, and it works when there is at least 1 year of data.
When there is less than 1 year of data, this query returns 0, instead of the sum of values going back to the oldest available month.
For example, if there are only 6 months of data, the query should return the sum of values until the 6th month.
-- Returns the running total (rt) found at rnk = 11 for entity '1'.
-- Every row whose rnk is not 11 contributes 0 to the SUM, so the result is 0
-- when no row reaches rnk = 11.
-- NOTE(review): the unquoted alias 1Y begins with a digit; confirm it is accepted
-- as written or double-quote it.
SELECT SUM (DECODE (rnk, 11, rt, 0)) 1Y
FROM (SELECT entity_id,rnk,
-- rt: running sum of ABS(NVL(value, 0)) in rnk order (rnk 1 = latest period_end_date).
SUM (ABS(NVL (value, 0))) OVER (PARTITION BY TRIM (entity_id) ORDER BY rnk) rt
FROM (SELECT psm.*,RANK () OVER (PARTITION BY entity_id ORDER BY period_end_date DESC) AS rnk
FROM myTable psm
WHERE psm.entity_id = '1'
ORDER BY period_end_date DESC
) rank_tab
-- Only ranks 1 through 11 are kept.
WHERE rnk < 12
);
If the biggest rank is 6, the result from the above query is 0
I attempted this, but got the error "ORA-00978: nested group function without GROUP BY"
-- Attempt that is reported to fail with ORA-00978: Max(rnk) is an aggregate nested
-- inside SUM(...), and the query has no GROUP BY.
-- NOTE(review): rnk in the CASE condition is also referenced outside any aggregate
-- while the branches are aggregates; confirm before reusing this shape.
SELECT case when rnk < 11
then SUM (DECODE (rnk, Max(rnk), rt, 0))
else SUM (DECODE (rnk, 11, rt, 0))
end as Y
FROM (SELECT entity_id,rnk,
-- rt: running sum of ABS(NVL(value, 0)) in rnk order (rnk 1 = latest period_end_date).
SUM (ABS(NVL (value, 0))) OVER (PARTITION BY TRIM (entity_id) ORDER BY rnk) rt
FROM (SELECT psm.*,RANK () OVER (PARTITION BY entity_id ORDER BY period_end_date DESC) AS rnk
FROM myTable psm
WHERE psm.entity_id = '1'
ORDER BY period_end_date DESC
) rank_tab
-- Only ranks 1 through 11 are kept.
WHERE rnk < 12
);
Sample data:
entity_id value period_end_date
1 1 9/30/19
1 2 8/31/19
1 3 7/31/19
1 4 6/30/19
1 5 5/31/19
1 6 4/30/19
In the above example, 1Y should return 1+2+3+4+5+6 = 21.
Instead my query returns 0 because it is looking for rnk = 11, which doesn't exist.
SUM (DECODE (rnk, 11, rt, 0)) 1Y
Thank you.
EDIT:
This works. But, if you know of a better way to do it, please let me know. Thank you.
-- Running total of ABS(value) over the most recent ranked periods (rnk < 12):
-- uses the total at rank 11 when the highest rank is 11 or more, otherwise the
-- total at the highest rank that exists.
SELECT
    CASE WHEN MRank < 11 THEN maxY ELSE OneY END AS lc_incearned_1Y
FROM (
    WITH R AS (
        -- Highest rank available for the entity before the cut-off date.
        -- NOTE(review): this reads myTbl while the main query below reads
        -- CREF.PORTFOLIO_SUMM_MTHEND; confirm both name the same table.
        SELECT MAX(rnk) MaxRank
        FROM (
            SELECT RANK () OVER (PARTITION BY TRIM (entity_id) ORDER BY period_end_date DESC) AS rnk
            FROM myTbl psm
            WHERE TRIM (psm.entity_id) = '1'
              AND period_end_date < to_date('9/30/2019','MM/DD/YYYY')
            ORDER BY period_end_date DESC
        )
    )
    SELECT MAX(MaxRank) MRank,
           SUM (DECODE (rnk, MaxRank, rt, 0)) maxY,
           -- The comma that followed OneY was removed: a select list cannot end with one.
           SUM (DECODE (rnk, 11, rt, 0)) OneY --13051.97
    FROM (
        SELECT entity_id, rnk,
               -- rt: running sum of ABS(NVL(value, 0)) in rnk order (rnk 1 = latest period).
               SUM (ABS (NVL (value, 0))) OVER (PARTITION BY TRIM (entity_id) ORDER BY rnk) rt
        FROM (
            SELECT psm.*,
                   RANK () OVER (PARTITION BY TRIM (entity_id) ORDER BY period_end_date DESC) AS rnk
            FROM CREF.PORTFOLIO_SUMM_MTHEND psm
            WHERE TRIM (psm.entity_id) = '1'
              AND period_end_date < to_date('9/30/2019','MM/DD/YYYY')
            ORDER BY period_end_date DESC
        ) rank_tab
        WHERE rnk < 12
    ) T, R
)
It seems you need to sum your values starting from the latest period_end_date back to the earliest date within the eleven-month range. It would be suitable to use the max(period_end_date) over (partition by entity_id order by period_end_date desc) analytic function along with your current rank() function, and then apply months_between(<max_period_end_date>,period_end_date). If you need to look back from the current date instead, get rid of the max() analytic function and replace <max_period_end_date> with trunc(sysdate) in the months_between() function. So, use:
-- Sums value over the rows whose period_end_date is at most 11 months
-- (per months_between) before mx, the latest period_end_date of the same entity.
with ranked_periods as (
    select
        src.*,
        rank() over (partition by src.entity_id order by src.period_end_date desc) as rnk,
        -- Running max in descending date order.
        max(src.period_end_date) over (partition by src.entity_id order by src.period_end_date desc) as mx
    from myTable src
)
select sum(nvl(rp.value, 0)) as sum_value
from ranked_periods rp
where months_between(rp.mx, rp.period_end_date) <= 11
Demo

Max dates for each sequence within partitions

I would like to see if somebody has an idea how to get the max and min dates within each 'id' using the 'row_num' column as an indicator when the sequence starts/ends in SQL Server 2016.
The screenshot below shows the desired output in columns 'min_date' and 'max_date'.
Any help would be appreciated.
You could use windowed MIN/MAX:
-- Every row whose row_num is not greater than 1 opens a new group (grp) within
-- its (id, cat); min_date and max_date are then taken per (id, cat, grp).
SELECT
    g.*,
    MIN(g.date_col) OVER (PARTITION BY g.id, g.cat, g.grp) AS min_date,
    MAX(g.date_col) OVER (PARTITION BY g.id, g.cat, g.grp) AS max_date
FROM (
    SELECT
        t.*,
        -- Running count of group-opening rows in date order.
        SUM(IIF(t.row_num > 1, 0, 1)) OVER (PARTITION BY t.id, t.cat ORDER BY t.date_col) AS grp
    FROM tab AS t
) AS g
ORDER BY g.id, g.date_col, g.cat;
Rextester Demo
Try something like
-- Gaps-and-islands: within one id, consecutive rows (by date) that share the same
-- cat have a constant r2 - r1, so grouping on it gives one row per sequence with
-- its first and last date.
SELECT
    Q1.id,
    Q1.cat,
    MIN(Q1.[date]) AS min_dat,
    MAX(Q1.[date]) AS max_dat
FROM
    (SELECT
        id,
        cat,
        [date],
        ROW_NUMBER() OVER (PARTITION BY id, cat ORDER BY [date]) AS r1,
        ROW_NUMBER() OVER (PARTITION BY id ORDER BY [date]) AS r2
     -- The original snippet had no FROM clause; "tab" is a placeholder table name.
     FROM tab
    ) AS Q1
GROUP BY
    -- cat is selected, so it must be part of the grouping key as well.
    Q1.id, Q1.cat, Q1.r2 - Q1.r1

Tree / Group UNION SQL-Query for a Report

I have a query for a dynamicreport as a datasource.
The result till now is:
There are 3 queries connected with UNION. Line 1 all data accumulated for the company. Line 2 all Data for the location and line 3 the detail data.
It is like a tree. But my problem is that the accumulation is not correct (AnzahlMinuten). Is there another way to display this data in a dynamicreport? These 3 queries can be very time-intensive. I also use the RANK() function because I got multiple entries for the time a license is used.
If there are no other easier solutions, where is my fault in the connected queries with union, so that the accumulation is not correct?
-- Report query: three SELECT branches combined with UNION. The trailing literal
-- (1, 2 or 3) tags which branch a row came from, and the final ORDER BY 2 sorts the
-- combined result by its second column.
-- In every branch, SUM(DISTINCT ...) adds each distinct value only once, so rows
-- with equal minute values are not all counted.
-- Branch 1: one row per Standortname / Lizenzname / Abteilungsname / Kostenstelle /
-- Gesellschaftsname combination.
SELECT Gesellschaftsname,Standortname,Lizenzname,Abteilungsname,Kostenstelle,
COUNT(DISTINCT username) AS AnzahlUser,
SUM(DISTINCT RuntimeMinute) AS AnzahlMinuten,
1 FROM (SELECT * FROM(SELECT DISTINCT Standortname,
DATEPART(YEAR,PK_Date) AS Jahr,
DATEPART(month,PK_Date) AS Monat,
Lizenzname,COUNT(DISTINCT username) AS AnzUser,
SUM(DISTINCT DATEDIFF(minute,starttime ,pk_date)) AS RuntimeMinute,
starttime,
username,
pk_date,
Abteilungsname,
Gesellschaftsname,
Kostenstelle,
-- Rank 1 is the row with the latest pk_date for a given starttime.
RANK() Over (PARTITION BY starttime ORDER BY pk_date DESC) As Rank
FROM BenutzerLizenz,Benutzer,Abteilung,Lizenz,Standort,Gesellschaft,Kostenstelle
WHERE BenutzerLizenz.PK_ID_user=Benutzer.PK_ID_user AND BenutzerLizenz.PK_ID_lic=Lizenz.PK_ID_lic AND PK_ID_standort=FK_ID_standort AND PK_ID_Abteilung = FK_ID_Abteilung AND PK_ID_Gesellschaft = FK_ID_Gesellschaft AND PK_ID_Kostenstelle = FK_ID_Kostenstelle AND
DATEPART(month,PK_Date) IN ('06','07') AND
DATEPART(YEAR,PK_Date) = '2013' AND
Lizenzname IN ('DESIGNER','iman_nth') AND
Standortname IN ('Unterlüß','Neuenburg')
GROUP BY Standortname, Lizenzname, starttime, pk_date, username ,Abteilungsname, Kostenstelle, Gesellschaftsname) tmp
-- Keep only the latest pk_date row per starttime.
WHERE Rank = 1)tmp2 GROUP BY Standortname,Lizenzname,Abteilungsname, Kostenstelle, Gesellschaftsname
UNION
-- Branch 2: one row per Gesellschaftsname; the other name columns are empty strings.
SELECT Gesellschaftsname,'','','','',
COUNT(DISTINCT username) AS AnzahlUser,
SUM(DISTINCT RuntimeMinute) AS AnzahlMinuten,2
FROM (SELECT * FROM(SELECT DISTINCT Gesellschaftsname,
DATEPART(YEAR,PK_Date) AS Jahr,
DATEPART(month,PK_Date) AS Monat,
COUNT(DISTINCT username) AS AnzUser,
SUM(DISTINCT DATEDIFF(minute,starttime ,pk_date)) AS RuntimeMinute,
starttime,
username,
pk_date,
RANK() Over (PARTITION BY starttime ORDER BY pk_date DESC) As Rank
-- NOTE(review): Standort is listed here, but this WHERE clause has no
-- PK_ID_standort=FK_ID_standort predicate like the other two branches; confirm
-- this is intended.
FROM BenutzerLizenz,Benutzer,Lizenz,Standort,Gesellschaft
WHERE BenutzerLizenz.PK_ID_user=Benutzer.PK_ID_user AND BenutzerLizenz.PK_ID_lic=Lizenz.PK_ID_lic AND PK_ID_Gesellschaft = FK_ID_Gesellschaft AND
DATEPART(month,PK_Date) IN ('06','07') AND
DATEPART(YEAR,PK_Date) = '2013' AND
Lizenzname IN ('DESIGNER','iman_nth') AND
Standortname IN ('Unterlüß','Neuenburg')
GROUP BY Gesellschaftsname,starttime, pk_date, username) tmp
WHERE Rank = 1)tmp2 GROUP BY Gesellschaftsname
UNION
-- Branch 3: one row per Standortname; the other name columns are empty strings.
SELECT '',Standortname,'','','',
COUNT(DISTINCT username) AS AnzahlUser,
SUM(DISTINCT RuntimeMinute) AS AnzahlMinuten,3
FROM (SELECT * FROM(SELECT DISTINCT Standortname,
DATEPART(YEAR,PK_Date) AS Jahr,
DATEPART(month,PK_Date) AS Monat,
COUNT(DISTINCT username) AS AnzUser,
SUM(DISTINCT DATEDIFF(minute,starttime ,pk_date)) AS RuntimeMinute,
starttime,
username,
pk_date,
RANK() Over (PARTITION BY starttime ORDER BY pk_date DESC) As Rank
FROM BenutzerLizenz,Benutzer,Abteilung,Lizenz,Standort
WHERE BenutzerLizenz.PK_ID_user=Benutzer.PK_ID_user AND BenutzerLizenz.PK_ID_lic=Lizenz.PK_ID_lic AND PK_ID_standort=FK_ID_standort AND PK_ID_Abteilung = FK_ID_Abteilung AND
DATEPART(month,PK_Date) IN ('06','07') AND
DATEPART(YEAR,PK_Date) = '2013' AND
Lizenzname IN ('DESIGNER','iman_nth') AND
Standortname IN ('Unterlüß','Neuenburg')
GROUP BY Standortname, starttime, pk_date, username) tmp
WHERE Rank = 1)tmp2 GROUP BY Standortname
-- Sorts the combined result by the second output column.
ORDER BY 2
I think the main issue is with the use of "distinct." This is not a coding problem. When summing distinct on multiple grouping levels, the totals of sub-groups may be greater than the total of the top group. For example:
GroupId Value
1 1
1 2
1 3
2 2
2 4
2 5
Sum(distinct value) on group 1 = 6
sum(distinct value) on group 2 = 11
sum(distinct value) on both groups = 15
Also, in general, it sounds like you are asking for a neater way to solve this problem of multiple grouping levels in a single recordset. I did something like this at a previous job:
sql fiddle
The idea is that you build the list of possible groups first in a CTE as
Level1 Level2 Level3
A NULL NULL
A AA NULL
A AB NULL
A AA AAA
A AA AAB
A AB ABA
A AB ABB
then join that to your data on the three levels and group by Level1, Level2, Level3. It's a lot cleaner.