BigQuery: Querying with standard sql - sql

I have this table:
client_id session_id time action transaction_id
------------------------------------------------------
1 1 15:01 view NULL
1 1 15:02 basket NULL
1 1 15:03 basket NULL
1 1 15:04 purchase 1
1 2 15:05 basket NULL
1 2 15:06 purchase 2
1 2 15:07 view NULL
Within each session, I want every action that precedes a purchase to carry that purchase's transaction_id, but only the first time the action occurs (which is why the second basket, at 15:03, gets transaction_id = NULL):
session_id time transaction_id
------------------------------------
1 15:01 1
1 15:02 1
1 15:03 NULL
1 15:04 1
2 15:05 2
2 15:06 2
2 15:07 NULL

Hmmm . . . assuming that there is only one transaction id per session, then you can use window functions:
-- For the first occurrence of each action within a session, report the
-- session's transaction id; later repeats of the same action get NULL
-- (the CASE has no ELSE branch).
-- MAX() picks the largest id, so this relies on there being only one
-- transaction id per session.
select t.*,
       (case when row_number() over (partition by client_id, session_id, action
                                     order by time) = 1
             then max(transaction_id) over (partition by client_id, session_id)
        end) as new_transaction_id
from t

Below is for BigQuery Standard SQL
#standardSQL
-- Step 1: split every session into purchase groups. grp counts the rows with a
-- transaction id that come strictly before the current row, so a purchase row
-- still belongs to the group it closes and the row after it opens a new one.
WITH grouped AS (
  SELECT
    *,
    COUNTIF(transaction_id IS NOT NULL) OVER (
      PARTITION BY client_id, session_id
      ORDER BY time
      ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
    ) AS grp
  FROM YourTable
)
-- Step 2: the first occurrence of each action inside a group gets the group's
-- transaction id; repeated actions get NULL.
SELECT
  client_id,
  session_id,
  time,
  action,
  IF(
    ROW_NUMBER() OVER (PARTITION BY client_id, session_id, grp, action ORDER BY time) = 1,
    MAX(transaction_id) OVER (PARTITION BY client_id, session_id, grp),
    NULL
  ) AS transaction_id
FROM grouped
-- ORDER BY client_id, session_id, time
You can test and play with the above using dummy data, as below:
#standardSQL
-- Dummy rows: session 1 holds two purchases (transaction ids 1 and 3),
-- session 2 holds one (transaction id 2).
WITH YourTable AS (
  SELECT 1 AS client_id, 1 AS session_id, '15:01' AS time, 'view' AS action, NULL AS transaction_id
  UNION ALL SELECT 1, 1, '15:02', 'basket', NULL
  UNION ALL SELECT 1, 1, '15:03', 'basket', NULL
  UNION ALL SELECT 1, 1, '15:04', 'purchase', 1
  UNION ALL SELECT 1, 1, '15:05', 'basket', NULL
  UNION ALL SELECT 1, 1, '15:06', 'basket', NULL
  UNION ALL SELECT 1, 1, '15:07', 'purchase', 3
  UNION ALL SELECT 1, 2, '15:08', 'basket', NULL
  UNION ALL SELECT 1, 2, '15:09', 'purchase', 2
  UNION ALL SELECT 1, 2, '15:10', 'view', NULL
),
-- grp = number of rows carrying a transaction id strictly before the current
-- row of the session, i.e. the index of the purchase group the row belongs to.
events_with_grp AS (
  SELECT
    *,
    COUNTIF(transaction_id IS NOT NULL) OVER (
      PARTITION BY client_id, session_id
      ORDER BY time
      ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
    ) AS grp
  FROM YourTable
)
-- Only the first occurrence of an action within its group receives the
-- group's transaction id; every other row gets NULL.
SELECT
  client_id,
  session_id,
  time,
  action,
  IF(
    ROW_NUMBER() OVER (PARTITION BY client_id, session_id, grp, action ORDER BY time) = 1,
    MAX(transaction_id) OVER (PARTITION BY client_id, session_id, grp),
    NULL
  ) AS transaction_id
FROM events_with_grp
-- ORDER BY client_id, session_id, time
Output is as expected
client_id session_id time action transaction_id
1 1 15:01 view 1
1 1 15:02 basket 1
1 1 15:03 basket null
1 1 15:04 purchase 1
1 1 15:05 basket 3
1 1 15:06 basket null
1 1 15:07 purchase 3
1 2 15:08 basket 2
1 2 15:09 purchase 2
1 2 15:10 view null

Related

Flag rows that appear between rows with specific strings

Let's say I have a table like this:
user_id
order
action
1
1
start
1
2
other
1
3
other
1
4
end
1
5
other
2
1
start
2
2
other
2
3
end
2
4
other
2
5
start
2
6
other
2
7
end
And I want to create a new column that flags the rows that appear between "start" and "end" events for each user (ordering by "order"):
user_id
order
action
is_between_start_and_end
1
1
start
NULL
1
2
other
1
1
3
other
1
1
4
end
NULL
1
5
other
NULL
2
1
start
NULL
2
2
other
1
2
3
end
NULL
2
4
other
NULL
2
5
start
NULL
2
6
other
1
2
7
end
NULL
How can I achieve this?
Consider below approach
-- Flags (1) the rows that come after a 'start' and before the next 'end' of
-- the same user; every other row gets NULL.
select * except(grp),
  if(
    -- the row must follow at least one 'start'; without this check, rows that
    -- precede a user's first 'start' (grp = 0) would be flagged as well
    grp > 0
    -- no 'end' seen so far (current row included) since the latest 'start'
    and countif(action = 'end') over (partition by user_id, grp order by `order`) = 0
    -- the 'start' row itself is not flagged
    and action != 'start', 1, null
  ) as is_between_start_and_end
from (
  select *,
    -- running number of 'start' rows per user: each 'start' opens a new group
    countif(action = 'start') over (partition by user_id order by `order`) as grp
  from your_table
)
If applied to the sample data in your question, the output is:
This can be solved with windows functions.
-- Sample rows: (user_id, order_it, action).
with tbl as (
  select *
  from unnest([
    struct(1 as user_id, 1 as order_it, "start" as action),
    (1, 2, "other"),
    (1, 3, "other"),
    (1, 4, "end"),
    (1, 5, "other"),
    (2, 1, "start"),
    (2, 2, "other"),
    (2, 3, "end"),
    (2, 4, "other"),
    (2, 5, "start"),
    (2, 6, "other"),
    (2, 7, "end")
  ])
),
-- Per row: how many "end" / "start" markers the user has had so far (current
-- row included), and which marker comes first from the current row onwards.
helper as (
  select
    *,
    countif(action = "end") over up_to_here as ends,
    countif(action = "start") over up_to_here as starts,
    first_value(if(action in ("end", "start"), action, null) ignore nulls) over from_here as end_to_come
  from tbl
  window
    up_to_here as (partition by user_id order by order_it rows between unbounded preceding and current row),
    from_here as (partition by user_id order by order_it rows between current row and unbounded following)
)
-- A row is flagged when exactly one "start" is still open and the next marker
-- is an "end".
select
  *,
  case when end_to_come = "end" and starts - ends = 1 then 1 end as is_between_start_and_end
from helper
order by user_id, order_it
This should work but could surely be more optimized
-- Sample rows: (user_id, order_, action).
with input as (
select 1 user_id, 1 as order_, 'start' action union all
select 1, 2, 'other' union all
select 1, 3, 'other' union all
select 1 , 4 , 'end' union all
select 1 , 5 , 'other' union all
select 2 , 1 , 'start' union all
select 2 , 2 , 'other' union all
select 2 , 3 , 'end' union all
select 2 , 4 , 'other' union all
select 2 , 5 , 'start' union all
select 2 , 6 , 'other' union all
select 2 , 7 , 'end'
)
-- flag = 1 when all three conditions below hold, otherwise NULL.
select
*,
if (
-- (1) the latest 'start' up to the current row lies strictly before it
--     (a 'start' row compares against its own order_, so it fails this test)
order_ > max(if(action = 'start', order_, null))
over(partition by user_id order by order_ range between unbounded preceding and current row) and
-- (2) the earliest 'end' from the current row onwards lies strictly after it
--     (an 'end' row compares against its own order_, so it fails this test)
order_ < min(if(action = 'end', order_, null))
over(partition by user_id order by order_ range between current row and unbounded following) and
-- (3) the row is not located between the latest earlier 'end' and the earliest
--     later 'start'; when either bound is missing the comparison is NULL and
--     coalesce turns it into true
coalesce(order_ not between
max(if(action = 'end', order_, null))
over(partition by user_id order by order_ range between unbounded preceding and 1 preceding)
and min(if(action = 'start', order_, null))
over(partition by user_id order by order_ range between 1 following and unbounded following), true)
, 1, null) as flag
from input
-- positional ordering: 1 = user_id, 2 = order_
order by 1,2
Edit: It should also take into account weird cases where for instance a 3rd user has end > other > start > other > end > other in that order. The flag should only apply to the 4th item. If you have start > other > start > other > end however, it's unclear if items 2,3,4 or 4 or 2,4 should be flagged. I think it would only flag 4 here
Edit2: This version should flag 2,3,4
if (
order_ > max(if(action = 'start', order_, null))
over(partition by user_id order by order_ range between unbounded preceding and 1 preceding) and
order_ < min(if(action = 'end', order_, null))
over(partition by user_id order by order_ range between current row and unbounded following) and
coalesce(max(if(action = 'start', order_, null))
over(partition by user_id order by order_ range between unbounded preceding and 1 preceding) >
max(if(action = 'end', order_, null))
over(partition by user_id order by order_ range between unbounded preceding and current row),true)
, 1, null) as flag

SQL: Running total count of distinct values

I'm trying to obtain rolling number of unique values in a window.
Here's how my table looks like:
-- Orders of user 1, oldest first.
SELECT
    et.user_id,
    et.order_date,
    et.product
FROM example_table AS et
WHERE et.user_id = 1
ORDER BY et.order_date
user_id
order_date
product
1
2021-01-01
A
1
2021-01-02
B
1
2021-01-04
A
1
2021-01-07
C
1
2021-01-09
C
1
2021-01-20
A
Here's what I'm trying to achieve:
user_id
order_date
product
cum_dist_count
1
2021-01-01
A
1
1
2021-01-02
B
2
1
2021-01-04
A
2
1
2021-01-07
C
3
1
2021-01-09
C
3
1
2021-01-20
A
3
In other words, I want to be able to see how many unique items a customer has bought so far, and be able to see that for a particular date (so for the example above: on 2021-01-04 they have bought 2 unique items, and on 2021-01-07 that number was 3).
I've tried selecting user_id, product and min(order_date) grouped in a CTE, then doing ROW_NUMBER over user_id and product on that CTE, and that worked partially: I'm able to see the dates on which the count of unique products changed (for this example: 2021-01-01, 2021-01-02 and 2021-01-07), but then I lose the rows "between", which I still want to be able to access.
-- cte: earliest order date of every product, per user.
-- (The original ORDER BY order_date inside the CTE was dropped: order_date is
-- neither grouped nor aggregated here, and ordering a CTE has no effect on the
-- final result anyway.)
with cte as (
    SELECT
        user_id
        , product
        , min(order_date) as first_order
    FROM example_table
    GROUP BY user_id, product
)
SELECT
    user_id
    , first_order
    , product
    -- NOTE(review): cte holds one row per (user_id, product), so with this
    -- partition ROW_NUMBER() is 1 on every row; partitioning by user_id alone
    -- would number a user's products 1, 2, 3, ... in order of first purchase.
    , ROW_NUMBER() OVER (PARTITION BY user_id, product ORDER BY first_order) AS number_of_unique_products
FROM cte  -- the FROM clause was missing; without it the column references cannot resolve
WHERE user_id = 1
With the above, I would get:
user_id
order_date
product
cum_dist_count
1
2021-01-01
A
1
1
2021-01-02
B
2
1
2021-01-07
C
3
The DB is in BigQuery StandardSQL.
Any help is much appreciated!
For each item, you can record the earliest date it appears. Then add those up:
-- Step 1: seqnum = 1 marks the first time a user orders a given product.
with ranked as (
  select
    src.*,
    row_number() over (partition by user_id, product order by order_date) as seqnum
  from example_table src
)
-- Step 2: running count of those "first time" rows per user, by order_date.
select
  ranked.* except (seqnum),
  countif(seqnum = 1) over (partition by user_id order by order_date) as running_distinct_count
from ranked
Below is for BigQuery
-- For every row, collect the user's products up to and including that
-- order_date into an array, then count the distinct values in the array.
select
  t.* except(cum_products),
  (select count(distinct product) from unnest(t.cum_products) as product) as cum_dist_count
from (
  select
    *,
    array_agg(product) over (partition by user_id order by order_date) as cum_products
  from example_table
) t
if applied to sample data in your question
with example_table as (
select 1 user_id, '2021-01-01' order_date, 'A' product union all
select 1, '2021-01-02', 'B' union all
select 1, '2021-01-04', 'A' union all
select 1, '2021-01-07', 'C' union all
select 1, '2021-01-09', 'C' union all
select 1, '2021-01-20', 'A'
)
output is

To pivot a table based on a specific event value using Query

I want to make Table A like Table B.
I'd like to see what events the User caused before the Purchase event.
I've used row_number() over (partition by client_id, event_type order by time), but that only gives me a plain pivot. How do I write the logic for this?
Table A
client_id event_type count time
A cart 1 AM 12:00:00
A view 4 AM 12:01:00
A purchase 2 AM 12:05:00
A view 2 AM 12:10:00
B view 3 AM 12:03:00
B purchase 1 AM 12:05:00
B view 2 AM 12:10:00
Table B
client_id view cart purchase
A 4 1 2
A 2 0 0
B 3 0 1
B 2 0 0
Here is a way of doing this, i define a group of events as belonging to a single "session/activity" before purchase using the block grp_split.
Then i get this grouping correctly done in the block x, by replacing null values with the previously not null value using the max(grp) over(partition by client_id order by time1) as grp2.
After that its a matter of pivoting the columns for view,cart and purchase
-- Sample rows of Table A.
with data
as (
    select 'A' as client_id,'cart' as event_type , 1 as count1, cast('AM 12:00:00' as time) as time1 union all
    select 'A' as client_id,'view' as event_type , 4 as count1, cast('AM 12:01:00' as time) as time1 union all
    select 'A' as client_id,'purchase' as event_type , 2 as count1, cast('AM 12:05:00' as time) as time1 union all
    select 'A' as client_id,'view' as event_type , 2 as count1, cast('AM 12:10:00' as time) as time1 union all
    select 'B' as client_id,'view' as event_type , 3 as count1, cast('AM 12:03:00' as time) as time1 union all
    select 'B' as client_id,'purchase' as event_type , 1 as count1, cast('AM 12:05:00' as time) as time1 union all
    select 'B' as client_id,'view' as event_type , 2 as count1, cast('AM 12:10:00' as time) as time1
)
-- grp is set (to the row number) only on the first row of each block, i.e. the
-- client's first event or the event right after a 'purchase'; NULL elsewhere.
,grp_split
as(
    select case when lag(event_type) over(partition by client_id order by time1)='purchase'
                  or lag(event_type) over(partition by client_id order by time1) is null
                then
                     row_number() over(partition by client_id order by time1)
           end as grp
          ,*
    from data
)
-- One output row per (client, block) with the counts pivoted into columns.
select x.client_id
      ,max(case when event_type='view' then count1 else 0 end) as view
      ,max(case when event_type='cart' then count1 else 0 end) as cart
      ,max(case when event_type='purchase' then count1 else 0 end) as purchase
from (
    select *
          -- running max carries the block number forward onto the NULL rows
          ,max(grp) over(partition by client_id order by time1) as grp2
    from grp_split
)x
group by client_id
        ,grp2
-- grp2 is part of the sort key so that a client's blocks come out in
-- chronological order; ordering by client_id alone leaves that order arbitrary
order by client_id
        ,grp2
output
+-----------+------+------+----------+
| client_id | view | cart | purchase |
+-----------+------+------+----------+
| A | 4 | 1 | 2 |
| A | 2 | 0 | 0 |
| B | 3 | 0 | 1 |
| B | 2 | 0 | 0 |
+-----------+------+------+----------+
working example
https://dbfiddle.uk/?rdbms=postgres_12&fiddle=aeeb0878b9094e061c469bb0efb7a024

How to select RANK over condition (depending previous row)

What we have
I have table like this
id PlayerId Amount
----------- ----------- -----------
1 1 10
2 1 20
3 1 30
4 1 40
5 1 40
11 1 20
13 1 20
15 1 40
14 2 19
12 2 10
6 2 1
7 2 5
8 2 10
9 2 20
10 2 30
I have to select only rows where amount greater than previous row amount (per player).
So here is a query
-- Pair every bet with the previous and the next amount of the same player
-- (in id order).
WITH BetNeighbours AS (
    SELECT b.id,
           b.PlayerId,
           b.Amount,
           LAG(b.Amount)  OVER (PARTITION BY b.PlayerId ORDER BY b.id) AS PreVval,
           LEAD(b.Amount) OVER (PARTITION BY b.PlayerId ORDER BY b.id) AS NextVal
    FROM dbo.Bets AS b
)
-- Keep the bets that are higher than the previous one or lower than the next one.
SELECT n.id,
       n.PlayerId,
       n.Amount,
       n.PreVval,
       n.NextVal
FROM BetNeighbours AS n
WHERE n.Amount > n.PreVval
   OR n.Amount < n.NextVal
   OR (n.PreVval IS NULL AND n.Amount < n.NextVal)
ORDER BY n.PlayerId, n.id
id PlayerId Amount PreVval NextVal
----------- ----------- ----------- ----------- -----------
1 1 10 NULL 20
2 1 20 10 30
3 1 30 20 40
4 1 40 30 40
13 1 20 20 40
15 1 40 20 NULL
6 2 1 NULL 5
7 2 5 1 10
8 2 10 5 20
9 2 20 10 30
10 2 30 20 10
12 2 10 30 19
14 2 19 10 NULL
Question
So now I need to select only the runs that contain at least 4 consecutively increasing rows, i.e. ids 1,2,3,4 for player 1 and 6,7,8,9,10 for player 2.
Query should run over 15m rows
The following query is an example where you can set the "step_count".
-- Sample data.
WITH Bets(id,PlayerId,Amount)
AS
(
    SELECT 1,1,10 UNION ALL
    SELECT 2,1,20 UNION ALL
    SELECT 3,1,30 UNION ALL
    SELECT 4,1,40 UNION ALL
    SELECT 5,1,40 UNION ALL
    SELECT 11,1,20 UNION ALL
    SELECT 13,1,20 UNION ALL
    SELECT 15,1,40 UNION ALL
    SELECT 14,2,19 UNION ALL
    SELECT 12,2,10 UNION ALL
    SELECT 6,2,1 UNION ALL
    SELECT 7,2,5 UNION ALL
    SELECT 8,2,10 UNION ALL
    SELECT 9,2,20 UNION ALL
    SELECT 10,2,30
)
-- rnk_val is set (to the row number) on the first row of every run: the
-- player's first bet, or a bet that is NOT greater than the previous one.
-- The comparison is >= so that an equal amount also starts a new run; with a
-- plain > the bet id 5 (40 after 40) would be glued onto the run 1,2,3,4.
,split_ranges
as(
    select *,case when lag(amount) over(partition by playerid order by id) >= amount
                    or lag(amount) over(partition by playerid order by id) is null
                  then row_number() over(partition by playerid order by id)
             end as rnk_val
    from bets
)
-- The running max copies the run marker onto the remaining rows of the run.
,groups_data
as(
    select *
          ,max(rnk_val) over(partition by playerid order by id) as fill_ranges
    from split_ranges
)
-- cnt = length of the run a row belongs to; keep runs of at least 4 rows
-- (4 is the adjustable "step_count").
select * from (
    select *,count(*) over(partition by playerid,fill_ranges) as cnt
    from groups_data
)x
where x.cnt>=4
https://dbfiddle.uk/?rdbms=sqlserver_2019&fiddle=6bd815da2cbfa8f65bc999e5736f2041
The following logic is bit tricky, but you can check this out-
DEMO HERE
-- CTE: bets that are higher than the previous bet or lower than the next one,
-- each annotated with the ids of the neighbouring rows of the same player.
WITH CTE
AS
(
SELECT * FROM
(
SELECT id,
PlayerId,
Amount,
-- previous / next amount of the same player, in id order
LAG(Amount) OVER (PARTITION BY PlayerId ORDER BY id) PreVval,
lead(Amount) OVER (PARTITION BY PlayerId ORDER BY id) NextVal,
-- ids of the rows 3, 2 and 1 positions before and 1, 2 and 3 positions after
-- the current row (same player, id order); 0 when there is no such row.
-- These are evaluated in this inner query, i.e. before the WHERE filter below.
ISNULL(LAG(ID,3) OVER (PARTITION BY PlayerId ORDER BY id),0) LAG3,
ISNULL(LAG(ID,2) OVER (PARTITION BY PlayerId ORDER BY id),0) LAG2,
ISNULL(LAG(ID,1) OVER (PARTITION BY PlayerId ORDER BY id),0) LAG1,
ISNULL(LEAD(ID,1) OVER (PARTITION BY PlayerId ORDER BY id),0) LEAD1,
ISNULL(LEAD(ID,2) OVER (PARTITION BY PlayerId ORDER BY id),0) LEAD2,
ISNULL(LEAD(ID,3) OVER (PARTITION BY PlayerId ORDER BY id),0) LEAD3
FROM Bets
) a
WHERE a.Amount > a.PreVval
OR a.Amount < a.NextVal
OR (a.PreVval IS NULL AND a.Amount < a.NextVal)
)
-- Keep a row when one of the four 4-row windows that contain it (positions
-- -3..0, -2..+1, -1..+2 or 0..+3) spans an id difference of exactly 3.
-- NOTE(review): this appears to rely on the ids inside a run being consecutive
-- integers, and it does not re-check that the amounts in that window are
-- increasing; confirm against the real data.
SELECT id,PlayerId,Amount,PreVval,NextVal
FROM CTE A
WHERE ID-LAG3 = 3
OR LEAD1 - LAG2 = 3
OR LEAD2 - LAG1 = 3
OR LEAD3 - ID = 3

Oracle SQL loop LEAD() through partition

I have a set that looks something like this
ID date_IN date_out
1 1/1/18 1/2/18
1 1/3/18 1/4/18
1 1/5/18 1/8/18
2 1/1/18 1/5/18
2 1/7/18 1/9/18
I began by
-- next_out = date_out of the following row of the same ID.
-- The window is ordered by date_IN: ordering by ID (the partition key) makes
-- all rows of a partition tie, so which row counts as "next" would be arbitrary.
SELECT ID, date_IN, Date_out,
       lead(date_out) over (partition by ID order by date_IN) as next_out
From table
And get something like this...
ID date_IN date_out next_out
1 1/1/18 1/2/18 1/4/18
1 1/3/18 1/4/18 1/8/18
1 1/5/18 1/8/18 Null
2 1/1/18 1/5/18 1/9/18
2 1/7/18 1/9/18 Null
The problem I’m going to have is that in my actual data many of the IDs have A LOT of entries. The goal is to have all of the date_outs appear on one row per ID:
ID date_IN date_out next_out next_out1 etc. etc.
1 1/1/18 1/2/18 1/4/18 1/8/18 X X
2 1/1/18 1/5/18 1/7/18 X Null Null
Is there a way to loop the lead() through the entire partition, order by ID drop everything but the first row then move on to the next ID?
Here is one approach, which assumes that you only expect to have a maximum of three date pairs per ID. You may assign a row number and then aggregate by ID:
-- Pivot up to three (date_IN, date_out) pairs per ID onto a single row.
WITH cte AS (
    SELECT ID, date_IN, date_out,
           -- rn numbers the rows of each ID in date_IN order
           ROW_NUMBER() OVER (PARTITION BY ID ORDER BY date_IN) rn
    FROM yourTable
)
SELECT
    ID,
    -- MAX() just picks the single non-NULL value produced for the wanted rn
    MAX(CASE WHEN rn = 1 THEN date_IN END) AS date_IN,
    MAX(CASE WHEN rn = 1 THEN date_out END) AS date_out,
    -- the second pair reads date_IN (cte has no next_IN column) and gets its
    -- own alias next_out_1, so next_out_2 is no longer defined twice
    MAX(CASE WHEN rn = 2 THEN date_IN END) AS next_in_1,
    MAX(CASE WHEN rn = 2 THEN date_out END) AS next_out_1,
    MAX(CASE WHEN rn = 3 THEN date_IN END) AS next_in_2,
    MAX(CASE WHEN rn = 3 THEN date_out END) AS next_out_2
FROM cte
GROUP BY ID
No need to do a loop but use the offset option. Below is lifted from the documentation.
offset
Optional. It is the physical offset from the current row in the table.
If this parameter is omitted, the default is 1.
example; lead(date_out) means next value
lead(date_out, 2) means 2nd row after current row
lead(date_out, 3) 3rd row after current row and so on.
in your code; use below snippet;
-- date_out of the 1st, 2nd and 3rd following row of the same ID.
-- Ordered by date_IN: ordering by ID inside a partition by ID ties every row,
-- which would leave the meaning of "following" arbitrary.
lead(date_out) over (partition by ID order by date_IN) as next_out,
lead(date_out, 2) over (partition by ID order by date_IN) as next_out2,
lead(date_out, 3) over (partition by ID order by date_IN) as next_out3
-- TAB: inline sample data, five date pairs for ID 1 and two for ID 2.
-- NOTE(review): CAST('yyyy/mm/dd' AS DATE) presumably depends on the session's
-- date format setting; confirm, or use an explicit format.
WITH TAB AS(
SELECT 1 ID, CAST('2018/01/01' AS DATE) DATE_IN, CAST('2018/01/02' AS DATE) DATE_OUT FROM DUAL
UNION
SELECT 1, CAST('2018/01/03' AS DATE) , CAST('2018/01/04' AS DATE) FROM DUAL
UNION
SELECT 1, CAST('2018/01/05' AS DATE) , CAST('2018/01/08' AS DATE) FROM DUAL
UNION
SELECT 1, CAST('2018/01/09' AS DATE) , CAST('2018/01/10' AS DATE) FROM DUAL
UNION
SELECT 1, CAST('2018/01/11' AS DATE) , CAST('2018/01/12' AS DATE) FROM DUAL
UNION
SELECT 2, CAST('2018/01/01' AS DATE) , CAST('2018/01/05' AS DATE) FROM DUAL
UNION
SELECT 2, CAST('2018/01/07' AS DATE) , CAST('2018/01/09' AS DATE) FROM DUAL
) --select * from tab;
-- LEAF_CALC: hierarchical walk in which a row is linked to any row whose
-- DATE_IN is later than the parent's DATE_OUT; HRCHY is the '$'-separated path
-- of DATE_OUT values and LVL the depth of the row in that walk.
-- NOTE(review): the CONNECT BY condition does not compare ID and START WITH
-- only seeds the rows of ID = 1; confirm that paths cannot mix rows of
-- different IDs and that other IDs are handled as intended.
, LEAF_CALC AS( --CONNECTING THE DATE_OUTS
SELECT
ID
,SYS_CONNECT_BY_PATH(DATE_OUT, '$') HRCHY
, LEVEL LVL
, CONNECT_BY_ISLEAF ISLEAF
FROM TAB
CONNECT BY PRIOR DATE_OUT < DATE_IN
START WITH ID = 1
) --SELECT * FROM LEAF_CALC;
-- DATA_SORT: per ID, keep the path(s) found at the greatest depth.
, DATA_SORT AS( --ADDING ALL DATE_OUTS IN 1 ROW
SELECT
P.ID, P.HRCHY
FROM LEAF_CALC P,
(SELECT ID, MAX(LVL) MAXLVL FROM
LEAF_CALC
GROUP BY ID) C
WHERE P.ID = C.ID
AND P.LVL = C.MAXLVL
)--SELECT * FROM DATA_SORT
--SEGREGATING ALL DATES USING REGEXP_SUBSTR
-- Each output column is the n-th '$'-separated token of the path; the 4th and
-- 5th fall back to 'NA' when the path is shorter.
-- NOTE(review): HRCHY is built from DATE_OUT values only, so the first token is
-- a DATE_OUT even though the column is labelled DATE_IN.
SELECT
ID
, REGEXP_SUBSTR(HRCHY, '[^$]+', 1, 1) DATE_IN
, REGEXP_SUBSTR(HRCHY, '[^$]+', 1, 2) NEXT_OUT
, REGEXP_SUBSTR(HRCHY, '[^$]+', 1, 3) NEXT_OUT2
, COALESCE(REGEXP_SUBSTR(HRCHY, '[^$]+', 1, 4), 'NA') NEXT_OUT3
, COALESCE(REGEXP_SUBSTR(HRCHY, '[^$]+', 1, 5), 'NA') NEXT_OUT4
FROM DATA_SORT;