Flag rows that appear between rows with specific strings - sql

Let's say I have a table like this:
user_id
order
action
1
1
start
1
2
other
1
3
other
1
4
end
1
5
other
2
1
start
2
2
other
2
3
end
2
4
other
2
5
start
2
6
other
2
7
end
And I want to create a new column that flags the rows that appear between "start" and "end" events for each user (ordering by "order"):
user_id
order
action
is_between_start_and_end
1
1
start
NULL
1
2
other
1
1
3
other
1
1
4
end
NULL
1
5
other
NULL
2
1
start
NULL
2
2
other
1
2
3
end
NULL
2
4
other
NULL
2
5
start
NULL
2
6
other
1
2
7
end
NULL
How can I achieve this?

Consider below approach
-- Flags rows that lie strictly between a 'start' and its closing 'end' for each user.
-- The inner query numbers each user's rows by how many 'start' rows have been seen
-- so far (in `order` sequence), so every 'start' opens a new group `grp`.
select * except(grp),
  if(
    -- grp = 0 means no 'start' has occurred yet for this user: never flag those rows
    grp > 0
    -- no 'end' has been seen so far inside the current group (current row included)
    and countif(action = 'end') over (partition by user_id, grp order by `order`) = 0
    -- the group must actually contain an 'end'; otherwise the row follows an unterminated 'start'
    and countif(action = 'end') over (partition by user_id, grp) > 0
    -- the 'start' row itself is not "between"
    and action != 'start',
    1, null
  ) as is_between_start_and_end
from (
  select *,
    countif(action = 'start') over (partition by user_id order by `order`) as grp
  from your_table
)
If applied to the sample data in your question, the output is

This can be solved with window functions.
-- Sample data: one row per (user_id, order_it) event.
WITH tbl AS (
  SELECT 1 AS user_id, 1 AS order_it, 'start' AS action
  UNION ALL SELECT 1, 2, 'other'
  UNION ALL SELECT 1, 3, 'other'
  UNION ALL SELECT 1, 4, 'end'
  UNION ALL SELECT 1, 5, 'other'
  UNION ALL SELECT 2, 1, 'start'
  UNION ALL SELECT 2, 2, 'other'
  UNION ALL SELECT 2, 3, 'end'
  UNION ALL SELECT 2, 4, 'other'
  UNION ALL SELECT 2, 5, 'start'
  UNION ALL SELECT 2, 6, 'other'
  UNION ALL SELECT 2, 7, 'end'
),
-- Per row: running counts of 'end' and 'start' events up to and including the row,
-- plus the first 'start'/'end' marker found from the row onwards.
helper AS (
  SELECT
    *,
    COUNTIF(action = 'end') OVER running AS ends,
    COUNTIF(action = 'start') OVER running AS starts,
    FIRST_VALUE(CASE WHEN action IN ('end', 'start') THEN action END IGNORE NULLS)
      OVER (
        PARTITION BY user_id
        ORDER BY order_it
        ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
      ) AS end_to_come
  FROM tbl
  WINDOW running AS (
    PARTITION BY user_id
    ORDER BY order_it
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
  )
  ORDER BY user_id, order_it
)
-- A row is "between" when exactly one more 'start' than 'end' has been seen so far
-- and the next marker at or after the row is an 'end'.
SELECT
  *,
  CASE
    WHEN end_to_come = 'end' AND starts - ends = 1 THEN 1
  END AS is_between_start_and_end
FROM helper
ORDER BY user_id, order_it

This should work but could surely be more optimized
-- Flags rows lying strictly between a 'start' and the following 'end' of the same user.
with input as (
  select 1 user_id, 1 as order_, 'start' action union all
  select 1, 2, 'other' union all
  select 1, 3, 'other' union all
  select 1, 4, 'end' union all
  select 1, 5, 'other' union all
  select 2, 1, 'start' union all
  select 2, 2, 'other' union all
  select 2, 3, 'end' union all
  select 2, 4, 'other' union all
  select 2, 5, 'start' union all
  select 2, 6, 'other' union all
  select 2, 7, 'end'
),
-- Per row: position of the latest 'start' and latest 'end' at or before the row,
-- and of the earliest 'end' at or after the row (NULL when there is none).
bounds as (
  select
    *,
    max(if(action = 'start', order_, null))
      over(partition by user_id order by order_ range between unbounded preceding and current row) as last_start,
    max(if(action = 'end', order_, null))
      over(partition by user_id order by order_ range between unbounded preceding and current row) as last_end,
    min(if(action = 'end', order_, null))
      over(partition by user_id order by order_ range between current row and unbounded following) as next_end
  from input
)
select
  * except(last_start, last_end, next_end),
  if(
    -- a 'start' exists strictly before this row (false on the 'start' row itself)
    order_ > last_start
    -- an 'end' exists strictly after this row (false on the 'end' row itself)
    and order_ < next_end
    -- the latest 'start' is more recent than the latest 'end', i.e. the row is not in the
    -- gap between an 'end' and the next 'start'. Comparing the two preceding markers
    -- (rather than testing "not between previous end and next start") keeps the flag
    -- correct when a user has three or more start/end cycles.
    and coalesce(last_start > last_end, true),
    1, null
  ) as flag
from bounds
order by user_id, order_
Edit: It should also take into account weird cases where for instance a 3rd user has end > other > start > other > end > other in that order. The flag should only apply to the 4th item. If you have start > other > start > other > end however, it's unclear if items 2,3,4 or 4 or 2,4 should be flagged. I think it would only flag 4 here
Edit2: This version should flag 2,3,4
-- Variant of the flag expression (select-list item): a row is flagged when
--   * some 'start' lies strictly before it,
--   * some 'end' lies strictly after it, and
--   * the latest earlier 'start' is more recent than the latest 'end' seen so far
--     (or one of the two does not exist).
case
  when order_ > max(case when action = 'start' then order_ end)
                  over (partition by user_id
                        order by order_
                        range between unbounded preceding and 1 preceding)
   and order_ < min(case when action = 'end' then order_ end)
                  over (partition by user_id
                        order by order_
                        range between current row and unbounded following)
   and coalesce(
         max(case when action = 'start' then order_ end)
           over (partition by user_id
                 order by order_
                 range between unbounded preceding and 1 preceding)
         >
         max(case when action = 'end' then order_ end)
           over (partition by user_id
                 order by order_
                 range between unbounded preceding and current row),
         true)
  then 1
end as flag

Related

return row where column value changed from last change

I have a table and i want to know the minimum date since the last change grouped by 2 columns
In the data, I want to know the latest PartNumberID by location, with the min date since the last change.
*Expected row it's not part of the table
DATA:
Location
RecordAddedDate
PartNumberID
ExpectedRow
7
2022-06-23
1
I want this row
8
2022-06-23
1
I want this row
8
2022-06-24
1
8
2022-06-25
1
9
2022-06-23
1
I want this row
15
2022-06-23
1
15
2022-06-24
1
15
2022-06-25
2
15
2022-06-26
1
I want this row
15
2022-06-27
1
Expected output:
Location
RecordAddedDate
PartNumberID
7
2022-06-23
1
8
2022-06-23
1
9
2022-06-23
1
15
2022-06-26
1
I'm on sql
I have tried the query below, but I don't know how to stop when the value changes:
-- Asker's attempt. The cte collects, per location, every row that carries the same
-- PartNumberID as that location's most recent row and is dated on or before it.
with cte as (
select t.LocationID, t.RecordAddedDate, t.PartNumberID
FROM mytable t
-- tt: the row(s) of each location whose RecordAddedDate equals that location's latest date
INNER JOIN (select PL.LocationID, PL.RecordAddedDate, PL.PartNumberID
FROM mytable PL INNER JOIN
-- PSCc: latest RecordAddedDate per location, considering only rows with RecordDeleted = 0
(SELECT PSCc.LocationID, MAX(PSCc.RecordAddedDate) AS DateSetup
FROM mytable PSCc
WHERE PSCc.RecordDeleted = 0
GROUP BY PSCc.LocationID) AS PSCc ON PSCc.LocationID = PL.LocationID AND PSCc.DateSetup = RecordAddedDate) as tt on t.RecordAddedDate<=tt.RecordAddedDate and t.LocationID= tt.LocationID and t.PartNumberID= tt.PartNumberID
)
-- Keep, per (LocationID, PartNumberID), only the earliest row of the cte.
-- NOTE(review): nothing here stops at the last change of PartNumberID, so an older run of
-- the same part number (before an intermediate change) is also considered.
select *
from cte c
where not exists(
select 1 from cte
where cte.LocationID = c.LocationID
and cte.PartNumberID=c.PartNumberID
and cte.RecordAddedDate<c.RecordAddedDate
)
order by LocationID,RecordAddedDate
Thank you
use lag() to find the last change (order by RecordAddedDate desc) in PartNumberID.
Use a cumulative sum, sum(isChange), to put the related rows under the same group number; grp = 0 will be the rows since the last change.
To get the min - RecordAddedDate, use row_number()
-- Step 1: walking each Location from newest to oldest, mark a row with isChange = 1
-- when its PartNumberID differs from the next-newer row (the newest row compares to itself).
WITH change_flags AS (
    SELECT
        *,
        CASE
            WHEN PartNumberID = ISNULL(
                     LAG(PartNumberID) OVER (PARTITION BY Location
                                             ORDER BY RecordAddedDate DESC),
                     PartNumberID)
                THEN 0
            ELSE 1
        END AS isChange
    FROM mytable
),
-- Step 2: running sum of the change marks, newest to oldest; grp = 0 covers the rows
-- from the newest one back to (not including) the most recent change.
change_groups AS (
    SELECT
        *,
        SUM(isChange) OVER (PARTITION BY Location
                            ORDER BY RecordAddedDate DESC) AS grp
    FROM change_flags
),
-- Step 3: number the grp = 0 rows of each Location from oldest to newest.
latest_run AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY Location
                           ORDER BY RecordAddedDate) AS rn
    FROM change_groups AS g
    WHERE g.grp = 0
)
-- The oldest row of the latest run carries the min date since the last change.
SELECT *
FROM latest_run AS r
WHERE r.rn = 1
db<>fiddle demo

Running Total of all Previous Rows BigQuery

I have a BigQuery Table which looks like Below:
ID SessionNumber CountOfAction Category
1 1 1 B
1 2 3 A
1 3 1 A
1 4 4 B
1 5 5 B
I am trying to get the running total of all previous rows for CountofAction where category = A. The final Output should be
ID SessionNumber CountOfAction
1 1 0 --no previous rows have countofAction for category = A
1 2 0 --no previous rows have countofAction for category = A
1 3 3 --previous row (Row 2) has countofAction = 3 for category = A
1 4 4 --previous rows (Row 2 and 3) have countofAction = 3 and 1 for category = A
1 5 4 --previous rows (Row 2 and 3) have countofAction = 3 and 1 for category = A
Below is the query I have written but it doesn't give me desired output
-- Asker's attempt: running total of CountofAction over the previous sessions of each ID.
select
    ID,
    SessionNumber,
    -- partition column corrected to ID: the table shown has no `clieIDntid` column
    SUM(CountofAction) OVER (
        PARTITION BY ID
        ORDER BY SessionNumber
        ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
    ) as CumulativeCountofAction
From TAble1
-- This filter runs before the window function, so sessions whose category is not 'A'
-- are absent from the result; that is why the desired output is not produced.
where category = 'A'
I would really appreciate any help on this! Thanks in advance
Filtering on category in the where clause evicts (id, sessionNumber) tuples where category 'A' does not appear, which is not what you want.
Instead, you can use aggregation and a conditional sum():
-- Running total, per id, of the category 'A' actions from all earlier sessions.
select
    id,
    sessionNumber,
    -- inner sum(): 'A' actions within one (id, sessionNumber) group;
    -- outer windowed sum(): adds those per-session totals over the preceding sessions.
    -- The frame is empty for the first session of an id, so the window sum is NULL there;
    -- coalesce turns that into the 0 the expected output asks for.
    coalesce(
        sum(sum(if(category = 'A', countOfAction, 0))) over(
            partition by id
            order by sessionNumber
            rows between unbounded preceding and 1 preceding
        ),
        0
    ) CumulativeCountofAction
from mytable t
group by id, sessionNumber
order by id, sessionNumber
Below is for BigQuery Standard SQL
#standardSQL
-- Running total of category 'A' actions over all previous sessions of the same ID.
-- IFNULL covers the first session, whose window frame is empty (SUM returns NULL).
SELECT ID, SessionNumber,
  IFNULL(SUM(IF(category = 'A', CountOfAction, 0)) OVER(win), 0) AS CountOfAction
FROM `project.dataset.table`
-- PARTITION BY ID keeps one ID's sessions from leaking into another ID's total
WINDOW win AS (
  PARTITION BY ID
  ORDER BY SessionNumber
  ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
)
If to apply to sample data from your question as in below example
#standardSQL
-- Sample data from the question, followed by the running-total query.
WITH `project.dataset.table` AS (
  SELECT 1 ID, 1 SessionNumber, 1 CountOfAction, 'B' Category UNION ALL
  SELECT 1, 2, 3, 'A' UNION ALL
  SELECT 1, 3, 1, 'A' UNION ALL
  SELECT 1, 4, 4, 'B' UNION ALL
  SELECT 1, 5, 5, 'B'
)
-- Running total of category 'A' actions over all previous sessions of the same ID.
-- IFNULL covers the first session, whose window frame is empty (SUM returns NULL).
SELECT ID, SessionNumber,
  IFNULL(SUM(IF(category = 'A', CountOfAction, 0)) OVER(win), 0) AS CountOfAction
FROM `project.dataset.table`
-- PARTITION BY ID keeps one ID's sessions from leaking into another ID's total
WINDOW win AS (
  PARTITION BY ID
  ORDER BY SessionNumber
  ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
)
result is
Row ID SessionNumber CountOfAction
1 1 1 0
2 1 2 0
3 1 3 3
4 1 4 4
5 1 5 4

Oracle SQL query to get the difference value with two where clause

I'm trying to make a query to get the difference of avg(score1+score2/2) between 'current' and the most recent 'archived', to build a chart in Oracle Apex.
Table name: myTable
id | score1 | score2 | status | date
------------------------------------------
1 | 10 | 20 | current| 07/09/19
2 | 20 | 30 |archived| 04/09/19
3 | 15 | 35 |archived| 02/09/19
wanted the result: (avg(score1 + score2/2) where status = 'current') - (avg(score1 + score2/2) where status = 'archived' only the most recent)
Im tried
Hmmm . . . one method is conditional aggregation:
-- Difference between the latest 'current' score average and the latest 'archived' one.
-- The inner query ranks rows per status from newest to oldest; seqnum = 1 keeps the
-- most recent row of each status, and the outer query pivots those rows into columns.
select max(case when status = 'current' then score_avg end) as current_score,
       -- status literal must match the data, which stores 'archived'
       max(case when status = 'archived' then score_avg end) as last_archive_score,
       (max(case when status = 'current' then score_avg end) -
        max(case when status = 'archived' then score_avg end)
       ) as diff
from (select t.*,
             -- DATE is a reserved word in Oracle, so the column has to be quoted
             -- (assumes it was created as "date" -- TODO confirm the exact case)
             row_number() over (partition by status order by "date" desc) as seqnum,
             (score1 + score2) / 2 as score_avg
      from t
     ) t
where seqnum = 1;
I am guessing that you really want (score1 + score2) / 2. However, if you want score1 + score2 / 2, then use that expression instead.
u want this?
-- Average of score1 + score2/2 for each status value.
SELECT
    status,
    AVG(score1 + score2/2)
FROM you_table
GROUP BY status
or
-- Difference between the 'current' average and the average of the most recent
-- 'archived' row(s).
select (select avg(score1 + score2/2)
        from you_table
        where status = 'current')
     - (select avg(score1 + score2/2)
        from you_table
        where status = 'archived'
          -- restrict to the latest archived date; without this every archived row
          -- would be averaged, not only the most recent one
          -- (assumes the date column was created as "date" -- TODO confirm)
          and "date" = (select max("date")
                        from you_table
                        where status = 'archived')) diff
from dual
One option would be using
min/max(score1) keep (dense_rank first order by "date" desc) over (partition by status)
to compute the archived case, and an ordinary arithmetic average computation for current case (depending on the sample data, there exists only one row for current case )
-- Sample rows from the question.
WITH myTable (id, score1, score2, status, "date") AS (
    SELECT 1, 10, 20, 'current',  DATE '2019-09-07' FROM dual UNION ALL
    SELECT 2, 20, 30, 'archived', DATE '2019-09-04' FROM dual UNION ALL
    SELECT 3, 15, 35, 'archived', DATE '2019-09-02' FROM dual
),
-- curr: plain average of the two scores on 'current' rows.
-- arch: on 'archived' rows, the average of score1/score2 taken from the row(s) with the
--       newest "date" within the status partition.
scored AS (
    SELECT
        CASE
            WHEN status = 'current' THEN (score1 + score2) / 2
        END AS curr,
        CASE
            WHEN status = 'archived' THEN
                (
                    MIN(score1) KEEP (DENSE_RANK FIRST ORDER BY "date" DESC)
                        OVER (PARTITION BY status)
                  + MIN(score2) KEEP (DENSE_RANK FIRST ORDER BY "date" DESC)
                        OVER (PARTITION BY status)
                ) / 2
        END AS arch
    FROM myTable
)
-- Collapse to a single row: current average minus latest archived average.
SELECT MAX(curr) - MAX(arch) AS "Avg.Result"
FROM scored;
Demo

Oracle SQL loop LEAD() through partition

I have a set that looks something like this
ID date_IN date_out
1 1/1/18 1/2/18
1 1/3/18 1/4/18
1 1/5/18 1/8/18
2 1/1/18 1/5/18
2 1/7/18 1/9/18
I began by
-- For each row, fetch the date_out of the same ID's next stay.
-- Ordering by date_IN fixes the sequence inside each ID; ordering by ID (the
-- partition key itself) would leave the row order, and so next_out, undefined.
SELECT ID, date_IN, Date_out,
       lead(date_out) over (partition by ID order by date_IN) as next_out
From table
And get something like this...
ID date_IN date_out next_out
1 1/1/18 1/2/18 1/4/18
1 1/3/18 1/4/18 1/8/18
1 1/5/18 1/8/18 Null
2 1/1/18 1/5/18 1/9/18
2 1/7/18 1/9/18 Null
The problem I’m going to have is that in my actual data many of the IDs have A LOT of entries. The goal is to have all of the date_out’s appear on one row per ID....
ID date_IN date_out next_out next_out1 etc. etc.
1 1/1/18 1/2/18 1/4/18 1/8/18 X X
2 1/1/18 1/5/18 1/7/18 X Null Null
Is there a way to loop the lead() through the entire partition, order by ID drop everything but the first row then move on to the next ID?
Here is one approach, which assumes that you only expect to have a maximum of three date pairs per ID. You may assign a row number and then aggregate by ID:
-- Number each ID's date pairs chronologically.
WITH cte AS (
    SELECT ID, date_IN, date_out,
           ROW_NUMBER() OVER (PARTITION BY ID ORDER BY date_IN) rn
    FROM yourTable
)
-- Pivot the first three pairs of every ID onto one row; an ID with fewer than
-- three pairs gets NULL in the unused columns.
SELECT
    ID,
    MAX(CASE WHEN rn = 1 THEN date_IN END)  AS date_IN,
    MAX(CASE WHEN rn = 1 THEN date_out END) AS date_out,
    -- second pair: reads date_IN (the cte exposes no next_IN column)
    MAX(CASE WHEN rn = 2 THEN date_IN END)  AS next_in_1,
    MAX(CASE WHEN rn = 2 THEN date_out END) AS next_out_1,
    -- third pair
    MAX(CASE WHEN rn = 3 THEN date_IN END)  AS next_in_2,
    MAX(CASE WHEN rn = 3 THEN date_out END) AS next_out_2
FROM cte
GROUP BY ID
No need to do a loop but use the offset option. Below is lifted from the documentation.
offset
Optional. It is the physical offset from the current row in the table.
If this parameter is omitted, the default is 1.
example; lead(date_out) means next value
lead(date_out, 2) means 2nd row after current row
lead(date_out, 3) 3rd row after current row and so on.
in your code; use below snippet;
-- Select-list items: date_out of the 1st, 2nd and 3rd following row of the same ID.
-- Rows are ordered by date_IN so that "following" is well defined; ordering by the
-- partition key ID would not fix any order within the partition.
lead(date_out)    over (partition by ID order by date_IN) as next_out,
lead(date_out, 2) over (partition by ID order by date_IN) as next_out2,
lead(date_out, 3) over (partition by ID order by date_IN) as next_out3
-- Builds one row per ID holding a chain of DATE_OUT values, using a hierarchical query.
-- TAB: inline sample rows (ID, DATE_IN, DATE_OUT).
-- NOTE(review): CAST('yyyy/mm/dd' AS DATE) relies on the session date format for the
-- string conversion -- confirm NLS_DATE_FORMAT or use DATE literals.
WITH TAB AS(
SELECT 1 ID, CAST('2018/01/01' AS DATE) DATE_IN, CAST('2018/01/02' AS DATE) DATE_OUT FROM DUAL
UNION
SELECT 1, CAST('2018/01/03' AS DATE) , CAST('2018/01/04' AS DATE) FROM DUAL
UNION
SELECT 1, CAST('2018/01/05' AS DATE) , CAST('2018/01/08' AS DATE) FROM DUAL
UNION
SELECT 1, CAST('2018/01/09' AS DATE) , CAST('2018/01/10' AS DATE) FROM DUAL
UNION
SELECT 1, CAST('2018/01/11' AS DATE) , CAST('2018/01/12' AS DATE) FROM DUAL
UNION
SELECT 2, CAST('2018/01/01' AS DATE) , CAST('2018/01/05' AS DATE) FROM DUAL
UNION
SELECT 2, CAST('2018/01/07' AS DATE) , CAST('2018/01/09' AS DATE) FROM DUAL
) --select * from tab;
-- LEAF_CALC: walks from each starting row to rows whose DATE_IN is later than the parent's
-- DATE_OUT; HRCHY is the '$'-separated path of DATE_OUT values, LVL the depth in the chain.
-- NOTE(review): the CONNECT BY condition does not compare IDs, so a chain can step onto
-- rows of another ID, and START WITH ID = 1 roots chains only at ID 1 rows -- verify that
-- this is intended before using it on real data.
, LEAF_CALC AS( --CONNECTING THE DATE_OUTS
SELECT
ID
,SYS_CONNECT_BY_PATH(DATE_OUT, '$') HRCHY
, LEVEL LVL
, CONNECT_BY_ISLEAF ISLEAF
FROM TAB
CONNECT BY PRIOR DATE_OUT < DATE_IN
START WITH ID = 1
) --SELECT * FROM LEAF_CALC;
-- DATA_SORT: for every ID keep the row(s) with the greatest LVL, i.e. the longest path.
, DATA_SORT AS( --ADDING ALL DATE_OUTS IN 1 ROW
SELECT
P.ID, P.HRCHY
FROM LEAF_CALC P,
(SELECT ID, MAX(LVL) MAXLVL FROM
LEAF_CALC
GROUP BY ID) C
WHERE P.ID = C.ID
AND P.LVL = C.MAXLVL
)--SELECT * FROM DATA_SORT
--SEGREGATING ALL DATES USING REGEXP_SUBSTR
-- Each REGEXP_SUBSTR call extracts the n-th '$'-separated token of the path; the 4th and
-- 5th tokens fall back to 'NA' when the path is shorter.
-- NOTE(review): the first token is labelled DATE_IN, but the path is built from DATE_OUT values.
SELECT
ID
, REGEXP_SUBSTR(HRCHY, '[^$]+', 1, 1) DATE_IN
, REGEXP_SUBSTR(HRCHY, '[^$]+', 1, 2) NEXT_OUT
, REGEXP_SUBSTR(HRCHY, '[^$]+', 1, 3) NEXT_OUT2
, COALESCE(REGEXP_SUBSTR(HRCHY, '[^$]+', 1, 4), 'NA') NEXT_OUT3
, COALESCE(REGEXP_SUBSTR(HRCHY, '[^$]+', 1, 5), 'NA') NEXT_OUT4
FROM DATA_SORT;

BigQuery: Querying with standard sql

I have this table:
client_id session_id time action transaction_id
------------------------------------------------------
1 1 15:01 view NULL
1 1 15:02 basket NULL
1 1 15:03 basket NULL
1 1 15:04 purchase 1
1 2 15:05 basket NULL
1 2 15:06 purchase 2
1 2 15:07 view NULL
And I want inside the session, for all the previous actions to register the transaction_id that occur for the first time (therefore at 15:03 transaction_id = NULL)
session_id time transaction_id
------------------------------------
1 15:01 1
1 15:02 1
1 15:03 NULL
1 15:04 1
2 15:05 2
2 15:06 2
2 15:07 NULL
Hmmm . . . assuming that there is only one transaction id per session, then you can use window functions:
-- Within each (client_id, session_id), the first occurrence of every action (by time)
-- receives the session's transaction id; later repeats of the same action get NULL.
-- Assumes a single transaction id per session, hence max() over the whole session.
select t.*,
       (case when row_number() over (partition by client_id, session_id, action
                                     order by time) = 1
             then max(transaction_id) over (partition by client_id, session_id)
        end) as new_transaction_id
from t
Below is for BigQuery Standard SQL
#standardSQL
-- grp counts the non-NULL transaction ids seen strictly before each row of a session,
-- so a purchase row stays in the same group as the actions that led up to it.
WITH grouped AS (
  SELECT
    *,
    COUNTIF(transaction_id IS NOT NULL) OVER (
      PARTITION BY client_id, session_id
      ORDER BY time
      ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
    ) AS grp
  FROM YourTable
)
-- The first occurrence of each action inside a group receives the group's
-- transaction id; repeated actions get NULL.
SELECT
  client_id,
  session_id,
  time,
  action,
  IF(
    ROW_NUMBER() OVER (
      PARTITION BY client_id, session_id, grp, action
      ORDER BY time
    ) = 1,
    MAX(transaction_id) OVER (PARTITION BY client_id, session_id, grp),
    NULL
  ) AS transaction_id
FROM grouped
-- ORDER BY client_id, session_id, time
You can test play with dummy data as below
#standardSQL
-- Dummy data: session 1 contains two purchases, session 2 contains one.
WITH YourTable AS (
  SELECT 1 AS client_id, 1 AS session_id, '15:01' AS time, 'view' AS action, NULL AS transaction_id
  UNION ALL SELECT 1, 1, '15:02', 'basket', NULL
  UNION ALL SELECT 1, 1, '15:03', 'basket', NULL
  UNION ALL SELECT 1, 1, '15:04', 'purchase', 1
  UNION ALL SELECT 1, 1, '15:05', 'basket', NULL
  UNION ALL SELECT 1, 1, '15:06', 'basket', NULL
  UNION ALL SELECT 1, 1, '15:07', 'purchase', 3
  UNION ALL SELECT 1, 2, '15:08', 'basket', NULL
  UNION ALL SELECT 1, 2, '15:09', 'purchase', 2
  UNION ALL SELECT 1, 2, '15:10', 'view', NULL
),
-- grp counts the non-NULL transaction ids seen strictly before each row of a session,
-- so a purchase row stays in the same group as the actions that led up to it.
grouped AS (
  SELECT
    *,
    COUNTIF(transaction_id IS NOT NULL) OVER (
      PARTITION BY client_id, session_id
      ORDER BY time
      ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
    ) AS grp
  FROM YourTable
)
-- The first occurrence of each action inside a group receives the group's
-- transaction id; repeated actions get NULL.
SELECT
  client_id,
  session_id,
  time,
  action,
  IF(
    ROW_NUMBER() OVER (
      PARTITION BY client_id, session_id, grp, action
      ORDER BY time
    ) = 1,
    MAX(transaction_id) OVER (PARTITION BY client_id, session_id, grp),
    NULL
  ) AS transaction_id
FROM grouped
-- ORDER BY client_id, session_id, time
Output is as expected
client_id session_id time action transaction_id
1 1 15:01 view 1
1 1 15:02 basket 1
1 1 15:03 basket null
1 1 15:04 purchase 1
1 1 15:05 basket 3
1 1 15:06 basket null
1 1 15:07 purchase 3
1 2 15:08 basket 2
1 2 15:09 purchase 2
1 2 15:10 view null