skip consecutive rows after specific value - sql

Note: I have a working query, but am looking for optimisations to use it on large tables.
Suppose I have a table like this:
id session_id value
1 5 7
2 5 1
3 5 1
4 5 12
5 5 1
6 5 1
7 5 1
8 6 7
9 6 1
10 6 3
11 6 1
12 7 7
13 8 1
14 8 2
15 8 3
I want the id's of all rows with value 1 with one exception:
skip groups with value 1 that directly follow a value 7 within the same session_id.
Basically I would look for groups of value 1 that directly follow a value 7, limited by the session_id, and ignore those groups. I then show all the remaining value 1 rows.
The desired output showing the id's:
5
6
7
11
13
I took some inspiration from this post and ended up with this code:
-- Sample data for the "skip the 1-rows that directly follow a 7" question.
-- A '#'-prefixed name denotes a temp table, which T-SQL creates with
-- CREATE TABLE (DECLARE ... TABLE only accepts '@' table variables).
create table #req_data (
    id int identity(1, 1) primary key,
    session_id int,
    value int
);
-- One INSERT per row so the identity ids follow the statement order.
insert into #req_data (session_id, value) values (5, 7);
insert into #req_data (session_id, value) values (5, 1);  -- preceded by value 7 in same session, should be ignored
insert into #req_data (session_id, value) values (5, 1);  -- ignore this one too
insert into #req_data (session_id, value) values (5, 12);
insert into #req_data (session_id, value) values (5, 1);  -- preceded by value != 7, show this
insert into #req_data (session_id, value) values (5, 1);  -- show this too
insert into #req_data (session_id, value) values (5, 1);  -- show this too
insert into #req_data (session_id, value) values (6, 7);
insert into #req_data (session_id, value) values (6, 1);  -- preceded by value 7 in same session, should be ignored
insert into #req_data (session_id, value) values (6, 3);
insert into #req_data (session_id, value) values (6, 1);  -- preceded by value != 7, show this
insert into #req_data (session_id, value) values (7, 7);
insert into #req_data (session_id, value) values (8, 1);  -- new session_id, show this
insert into #req_data (session_id, value) values (8, 2);
insert into #req_data (session_id, value) values (8, 3);
-- Returns the ids of all value = 1 rows, except runs of 1s that directly
-- follow a value 7 within the same session_id.
select id
from (
    -- Spread the skip flag over each island of consecutive 1-rows. The island
    -- number is only unique inside a session, so partition by both columns.
    select session_id, id,
           max(skip) over (partition by session_id, grp) as skip
    from (
        -- Gaps-and-islands: the difference of the two row numbers is constant
        -- for consecutive rows that share the same value within a session.
        select flagged.*,
               row_number() over (partition by session_id order by id)
             - row_number() over (partition by session_id, value order by id) as grp
        from (
            select session_id, id, value,
                   case
                       -- Order by id: ordering by the partition key itself
                       -- leaves "previous row" undefined.
                       when lag(value) over (partition by session_id order by id) = 7
                       then 1
                       else 0
                   end as skip
            from #req_data
        ) as flagged
    ) as islands
    where islands.value = 1
) as one_rows
where skip != 1
order by id
This gives the desired result, but with 4 select blocks I think it's way too inefficient to use on large tables.
Is there a cleaner, faster way to do this?

The following should work well for this.
-- For every row, carry forward the most recent non-1 value seen in its session
-- (999 when there is none yet), then keep the 1-rows whose carried value is not 7.
SELECT
    src.id,
    src.session_id,
    src.value
FROM (
    SELECT
        r.id,
        r.session_id,
        r.value,
        ISNULL(
            CAST(SUBSTRING(MAX(packed.BinVal) OVER (PARTITION BY r.session_id ORDER BY r.id), 5, 4) AS INT),
            999
        ) AS ControlValue
    FROM #req_data AS r
    CROSS APPLY (
        -- id in the first 4 bytes, value in the last 4; NULL when value = 1
        VALUES (CAST(r.id AS BINARY(4)) + CAST(NULLIF(r.value, 1) AS BINARY(4)))
    ) AS packed (BinVal)
) AS src
WHERE src.value = 1
  AND src.ControlValue <> 7;
Results...
id session_id value
----------- ----------- -----------
5 5 1
6 5 1
7 5 1
11 6 1
13 8 1
Edit: How and why it works...
The basic premise is taken from Itzik Ben-Gan's "The Last non NULL Puzzle".
Essentially, we are relying on 2 different behaviors that most people don't usually think about...
1) NULL + anything = NULL.
2) You can CAST or CONVERT an INT into a fixed length BINARY data type and it will continue to sort as an INT (as opposed to sorting like a text string).
This is easier to see when the intermediate steps are added to the query in the CTE...
-- Same computation as the CTE, with each intermediate step exposed as a column.
SELECT
    r.id,
    r.session_id,
    r.value,
    packed.BinVal,
    -- running MAX carries the latest non-NULL packed value forward
    MAX(packed.BinVal) OVER (PARTITION BY r.session_id ORDER BY r.id) AS SmearedBinVal,
    -- last 4 bytes hold the original value
    CAST(SUBSTRING(MAX(packed.BinVal) OVER (PARTITION BY r.session_id ORDER BY r.id), 5, 4) AS INT) AS SecondHalfAsINT,
    ISNULL(
        CAST(SUBSTRING(MAX(packed.BinVal) OVER (PARTITION BY r.session_id ORDER BY r.id), 5, 4) AS INT),
        999
    ) AS ControlValue
FROM #req_data AS r
CROSS APPLY (
    VALUES (CAST(r.id AS BINARY(4)) + CAST(NULLIF(r.value, 1) AS BINARY(4)))
) AS packed (BinVal)
Results...
id session_id value BinVal SmearedBinVal SecondHalfAsINT ControlValue
----------- ----------- ----------- ------------------ ------------------ --------------- ------------
1 5 7 0x0000000100000007 0x0000000100000007 7 7
2 5 1 NULL 0x0000000100000007 7 7
3 5 1 NULL 0x0000000100000007 7 7
4 5 12 0x000000040000000C 0x000000040000000C 12 12
5 5 1 NULL 0x000000040000000C 12 12
6 5 1 NULL 0x000000040000000C 12 12
7 5 1 NULL 0x000000040000000C 12 12
8 6 7 0x0000000800000007 0x0000000800000007 7 7
9 6 1 NULL 0x0000000800000007 7 7
10 6 3 0x0000000A00000003 0x0000000A00000003 3 3
11 6 1 NULL 0x0000000A00000003 3 3
12 7 7 0x0000000C00000007 0x0000000C00000007 7 7
13 8 1 NULL NULL NULL 999
14 8 2 0x0000000E00000002 0x0000000E00000002 2 2
15 8 3 0x0000000F00000003 0x0000000F00000003 3 3
Looking at the BinVal column, we see an 8-byte hex value for all rows where [value] <> 1 and NULLs where [value] = 1... The first 4 bytes are the id (used for ordering) and the last 4 bytes are [value] (used to set the "previous non-1 value", or to set the whole thing to NULL).
The 2nd step is to "smear" the non-NULL values into the NULLs using the window framed MAX function, partitioned by session_id and ordered by id.
The 3rd step is to parse out the last 4 bytes and convert them back to an INT data type (SecondHalfAsINT) and deal with any nulls that result from not having any non-1 preceding value (ControlValue).
Since we can't reference a windowed function in the WHERE clause, we have to throw the query into a CTE (a derived table would work just as well) so that we can use the new ControlValue in the where clause.

-- For each 1-row, look up the latest earlier row of the same session whose
-- value is not 1, and keep the 1-row unless that earlier value is 7.
SELECT cur.id
FROM #req_data AS cur
CROSS APPLY (
    SELECT MAX(earlier.id) AS id
    FROM #req_data AS earlier
    WHERE earlier.session_id = cur.session_id
      AND earlier.id < cur.id
      AND earlier.value <> 1
) AS last_non_one
LEFT JOIN #req_data AS prev
    ON prev.id = last_non_one.id
WHERE cur.value = 1
  AND ISNULL(prev.value, 1) <> 7

You can use the following query:
-- grp = number of non-1 rows seen so far in the session (0 before the first one)
select
    id,
    session_id,
    value,
    sum(case when value <> 1 then 1 else 0 end)
        over (partition by session_id order by id) as grp
from #req_data
to get:
id session_id value grp
----------------------------
1 5 7 1
2 5 1 1
3 5 1 1
4 5 12 2
5 5 1 2
6 5 1 2
7 5 1 2
8 6 7 1
9 6 1 1
10 6 3 2
11 6 1 2
12 7 7 1
13 8 1 0
14 8 2 1
15 8 3 2
So, this query detects islands of consecutive 1 records that belong to the same group, as specified by the first preceding row with value <> 1.
You can use a window function once more to detect all 7 islands. If you wrap this in a second cte, then you can finally get the desired result by filtering out all 7 islands:
-- Inner query numbers the islands per session; the middle one counts the 7s in
-- each island; the outer one keeps the 1-rows of islands that contain no 7.
select id
from (
    select
        id,
        grp,
        value,
        sum(case when value = 7 then 1 else 0 end)
            over (partition by session_id, grp) as cnt_7
    from (
        select
            id,
            session_id,
            value,
            sum(case when value <> 1 then 1 else 0 end)
                over (partition by session_id order by id) as grp
        from #req_data
    ) as session_islands
) as islands_with_7
where value = 1
  and cnt_7 = 0

Related

SQL Server Pivoting - No middle column, no aggregation

--EDIT: original table sample, requested in comments
job_id
change_id
change
1
1
5□6□
1
2
7□8□
1
3
9□10□
2
4
1□3□
This is a C# reflection of an object to serialise the data in the Change field.
The desired result is the following:
Job ID
Change ID
Change from
Change to
1
1
5
6
1
2
7
8
1
3
9
10
2
4
1
3
I managed to identify the character as CHAR(1), in order to be able to split it using the following query (which led to the unpivoted table, which might or might not be useful — apparently not, as per the comments below, since the order is uncertain):
-- One output row per CHAR(1)-delimited piece of change_table.change
SELECT
    ct.job_id,
    ct.change_id,
    parts.value AS change
FROM change_table AS ct
CROSS APPLY STRING_SPLIT(ct.change, CHAR(1)) AS parts
Job ID
Change ID
Changes
1
1
5
1
1
6
1
1
1
2
7
1
2
8
1
2
1
3
9
1
3
10
1
3
2
4
1
2
4
3
2
4
It's kind of painful when delimited data has a trailing delimiter. Here is a simple solution to this using PARSENAME. I had to add an extra space back on the end here because the PARSENAME function gets confused when the last character is a period.
-- Demo data: each change holds two numbers, each followed by a delimiter
-- (a space stands in for the real delimiter here).
-- '#' names are temp tables, so use CREATE TABLE (DECLARE ... TABLE only
-- accepts '@' table variables).
create table #Changes
(
    job_id int
    , change_id int
    , change varchar(20)
);

insert into #Changes (job_id, change_id, change)
values
    (1, 1, '5 6 ')
    , (1, 2, '7 8 ')
    , (1, 3, '9 10 ')
    , (2, 4, '1 3 ');

-- Turn the delimiters into periods and append a space so the string does not
-- end in a period; PARSENAME counts parts from the right, so part 3 is the
-- "from" value and part 2 is the "to" value.
select c.job_id
    , c.change_id
    , parsename(replace(c.change, ' ', '.') + ' ', 3) as ChangeFrom
    , parsename(replace(c.change, ' ', '.') + ' ', 2) as ChangeTo
from #Changes as c
Assuming, the Changes value of the last of three rows is ''.
Does this work for you?
-- Pairs each split value with the next one of the same (job_id, change_id)
-- and keeps the pairs where neither side is the empty string.
SELECT
*,
'' blank
FROM (
SELECT
job_id,
change_id,
changes AS changes_from,
-- NOTE(review): the window orders by job_id, which is also a partition key,
-- so every row in a partition ties and "next row" is not defined by this
-- ORDER BY; the result relies on whatever order the engine happens to use.
LEAD(changes) OVER (PARTITION BY job_id, change_id ORDER BY job_id) AS changes_to
FROM jobs
) j
-- A NULL changes_to (last row of a partition) also fails this comparison
WHERE changes_from != '' AND changes_to != ''
Output
job_id
change_id
changes_from
changes_to
blank
1
1
5
6
1
1
7
8
1
2
9
10
2
3
1
3
db<>fiddle here

What is the most efficient SQL query to find the max N values for every entities in a table

I wrote these 2 queries, the first one is keeping duplicates and the second one is dropping them
Does anyone know a more efficient way to achieve this?
Queries are for MSSQL, returning the top 3 values
1-
-- Top 3 values per entity, duplicates kept: a row qualifies when the row
-- 3 positions ahead (ordered by entity_id, value) belongs to another entity.
SELECT
    ahead.entity_id,
    ahead.value
FROM (
    SELECT
        src.entity_id,
        src.value,
        -- 0 is returned past the end of the data
        LEAD(src.entity_id, 3, 0) OVER (ORDER BY src.entity_id, src.value) AS next_id
    FROM mytable AS src
) AS ahead
WHERE ahead.entity_id <> ahead.next_id
2-
-- Top 3 distinct values per entity.
-- NOTE(review): the LEAD default of 0 presumably assumes entity_id is never 0.
SELECT TMP.entity_id, TMP.value
FROM (
    -- Step 2: on the de-duplicated rows, look 3 rows ahead; a different
    -- entity there means the current row is among the entity's top 3.
    SELECT
        TMX.entity_id,
        LEAD(TMX.entity_id, 3, 0) OVER (ORDER BY TMX.entity_id, TMX.value) AS next_id,
        TMX.value
    FROM (
        -- Step 1: expose the next row's entity and value.
        SELECT
            TAB.entity_id,
            LEAD(TAB.entity_id, 1, 0) OVER (ORDER BY TAB.entity_id, TAB.value) AS next_id,
            TAB.value,
            LEAD(TAB.value, 1, 0) OVER (ORDER BY TAB.entity_id, TAB.value) AS next_value
        FROM mytable TAB
    ) TMX
    -- Drop a row when the next row is an exact duplicate of it. Only TMX is
    -- in scope at this level; TMP is the alias of the enclosing derived table.
    WHERE TMX.entity_id <> TMX.next_id OR TMX.value <> TMX.next_value
) TMP
WHERE TMP.entity_id <> TMP.next_id
Example:
Table:
entity_id value
--------- -----
1 9
1 11
1 12
1 3
2 25
2 25
2 5
2 37
3 24
3 9
3 2
3 15
Result Query 1 (25 appears twice for entity_id 2):
entity_id value
--------- -----
1 9
1 11
1 12
2 25
2 25
2 37
3 9
3 15
3 24
Result Query 2 (25 appears only once for entity_id 2):
entity_id value
--------- -----
1 9
1 11
1 12
2 5
2 25
2 37
3 9
3 15
3 24
You can use the ROW_NUMBER which will allow duplicates as follows:
-- Top 3 rows per entity by value, duplicates kept.
-- The inner table needs the alias t used by t.*, and SQL Server requires an
-- alias on the derived table itself.
select ranked.entity_id, ranked.value
from (
    select t.*,
           row_number() over (partition by t.entity_id order by t.value desc) as rn
    from your_Table t
) ranked
where ranked.rn <= 3
You can use the rank to remove the duplicate as follows:
-- Top 3 distinct values per entity.
-- DENSE_RANK leaves no gap after ties, so values 37, 25, 25, 5 rank 1, 2, 2, 3
-- and all three distinct values qualify; RANK would give 1, 2, 2, 4 and drop
-- the 5. DISTINCT then collapses the tied rows.
select distinct ranked.entity_id, ranked.value
from (
    select t.*,
           dense_rank() over (partition by t.entity_id order by t.value desc) as rn
    from your_Table t
) ranked
where ranked.rn <= 3

Distance from maximum value for each distance

I would like to calculate the distance to maximum value for each possible distance. As an example:
Row Distance Value
1 1 2 --> 1 (Distance from Row 1)
2 2 3 --> 2 (Distance from Row 2)
3 3 3 --> 2 (Distance from Row 2)
4 4 1 --> 2 (Distance from Row 2)
5 5 5 --> 5 (Distance from Row 5)
6 6 1 --> 5 (Distance from Row 5)
Explanation: Row 6 has value of 5 because the first occurrence of maximum value between rows 1 through 6 was at distance 5.
I have tried to use some windows functions but cannot figure out how to put it together.
Sample data:
--drop table tmp_maxval;
-- Sample data: dst is the distance, val the value observed at that distance.
create table tmp_maxval (
    dst number,
    val number
);
insert into tmp_maxval (dst, val) values (1, 3);
insert into tmp_maxval (dst, val) values (2, 2);
insert into tmp_maxval (dst, val) values (3, 1);
insert into tmp_maxval (dst, val) values (4, 2);
insert into tmp_maxval (dst, val) values (5, 4);
insert into tmp_maxval (dst, val) values (6, 2);
insert into tmp_maxval (dst, val) values (7, 2);
insert into tmp_maxval (dst, val) values (8, 5);
insert into tmp_maxval (dst, val) values (9, 5);
insert into tmp_maxval (dst, val) values (10, 1);
commit;
Functions I think can be useful in solving this:
-- Exploratory query: candidate window expressions for locating the running maximum.
select t.*,
-- running maximum of val over rows ordered by dst
max(val) over(order by dst),
-- 1 when the current val is at least the running maximum, else 0
case when val >= max(val) over(order by dst) then 1 else 0 end ,
-- 1 on the first row (by dst) that carries a given val, else 0
case when row_number() over(partition by val order by dst) = 1 then 1 else 0 end as first_occurence
from
ap_risk.tmp_maxval t
-- flag = 1 when val does not exceed the maximum of all strictly earlier rows;
-- unflagged rows start a new maximum, and the latest such dst is carried forward.
with flagged as (
  select dst,
         val,
         case
           when val <= max(val) over (order by dst
                                      rows between unbounded preceding and 1 preceding)
           then 1
         end as flag
  from   tmp_maxval
)
select   dst,
         val,
         max(case when flag is null then dst end) over (order by dst) as first_occurrence
from     flagged
order by dst
;
DST VAL FIRST_OCCURRENCE
---------- ---------- ----------------
1 3 1
2 2 1
3 1 1
4 2 1
5 4 5
6 2 5
7 2 5
8 5 8
9 5 8
10 1 8
Or, if you are on Oracle version 12.1 or higher, MATCH_RECOGNIZE can do quick work of this assignment:
-- Each match starts at one row (peak) and extends over the following rows
-- (rest) whose val does not exceed the peak's val; every row of the match
-- reports the peak's dst.
select dst, val, first_occurrence
from   tmp_maxval t
match_recognize(
  order by dst
  measures peak.dst as first_occurrence
  all rows per match
  pattern (peak rest*)
  define rest as val <= peak.val
)
order by dst
;
You can get the maximum value using a cumulative max:
-- Running maximum of val in dst order.
-- Column names follow the tmp_maxval definition (dst, val).
select mv.*,
       max(mv.val) over (order by mv.dst) as max_value
from ap_risk.tmp_maxval mv;
I think this answers your question. If you want the distance itself:
-- For each row, the distance at which the running maximum was first reached.
-- Column names follow the tmp_maxval definition (dst, val).
select mv.*,
       -- Partition by the running maximum: without it the first row (which
       -- always equals its own running maximum) would win for every row.
       min(case when mv.max_value = mv.val then mv.dst end)
           over (partition by mv.max_value order by mv.dst) as first_distance_at_max_value
from (select t.*,
             max(t.val) over (order by t.dst) as max_value
      from ap_risk.tmp_maxval t
     ) mv;
You could use either max() or min() combined with case when:
-- mv is the running maximum; within each run of equal mv, v1 is the first dst
-- that reached it and v2 the latest dst so far that equals it.
with running as (
  select src.*,
         max(val) over (order by dst) mv
  from   tmp_maxval src
)
select   t.*,
         min(case when val = mv then dst end) over (partition by mv order by dst) v1,
         max(case when val = mv then dst end) over (partition by mv order by dst) v2
from     running t
order by dst
Result:
DST VAL MV V1 V2
---------- ---------- ---------- ---------- ----------
1 3 3 1 1
2 2 3 1 1
3 1 3 1 1
4 2 3 1 1
5 4 4 5 5
6 2 4 5 5
7 2 4 5 5
8 5 5 8 8
9 5 5 8 9
10 1 5 8 9
The logic you explained and the words "first occurrence" suggest that you need min(), but the third row in your example suggests max() ;-) In the data you provided, you can observe the difference in rows 9-10. Choose whichever you want.

Assign rownumber in SQL grouped on value and n rows per rownumber

I am trying to generate a report with 3 rows per page for each order number using the following SQL.
As you can see from the results the fields Actual & Expected do not match up.
Any help would be appreciated.
set nocount on;

-- Sample data. Expected is the page number the asker wants for each row.
-- '#' names are temp tables, so use CREATE TABLE (DECLARE ... TABLE only
-- accepts '@' table variables); the name is spelled #Orders throughout.
create table #Orders (Expected int, OrderNumber int, OrderDetailsNumber int);

insert into #Orders (Expected, OrderNumber, OrderDetailsNumber) values (0, 1, 1);
insert into #Orders (Expected, OrderNumber, OrderDetailsNumber) values (0, 1, 2);
insert into #Orders (Expected, OrderNumber, OrderDetailsNumber) values (0, 1, 3);
insert into #Orders (Expected, OrderNumber, OrderDetailsNumber) values (1, 1, 4);
insert into #Orders (Expected, OrderNumber, OrderDetailsNumber) values (2, 2, 5);
insert into #Orders (Expected, OrderNumber, OrderDetailsNumber) values (2, 2, 6);
insert into #Orders (Expected, OrderNumber, OrderDetailsNumber) values (2, 2, 7);
insert into #Orders (Expected, OrderNumber, OrderDetailsNumber) values (3, 2, 8);
insert into #Orders (Expected, OrderNumber, OrderDetailsNumber) values (3, 2, 9);

-- The asker's attempt, kept as-is: rows are numbered across all orders, so
-- a page can span two order numbers and Actual drifts away from Expected.
select cast(((row_number() over (order by OrderNumber)) - 1) / 3 as int) as [Actual]
     , *
from #Orders
Actual Expected OrderNumber OrderDetailsNumber
----------- ----------- ----------- ------------------
0 0 1 1
0 0 1 2
0 0 1 3
1 1 1 4
1 2 2 5
1 2 2 6
2 2 2 7
2 3 2 8
2 3 2 9
Right, after a couple of edits I have the final answer:
-- Rows are numbered from 0 inside each order; integer division by 3 gives the
-- page within the order, and DENSE_RANK over (order, page) numbers the pages
-- across all orders starting at 0.
SELECT
    DENSE_RANK() OVER (ORDER BY numbered.OrderNumber, numbered.RowNumber / 3) - 1 AS Actual,
    numbered.Expected,
    numbered.OrderNumber,
    numbered.OrderDetailsNumber
FROM (
    SELECT
        o.Expected,
        o.OrderNumber,
        o.OrderDetailsNumber,
        ROW_NUMBER() OVER (PARTITION BY o.OrderNumber ORDER BY o.OrderDetailsNumber) - 1 AS RowNumber
    FROM #Orders AS o
) AS numbered
Gives the result (with extra rows for testing):
Actual Expected OrderNumber OrderDetailsNumber
-------------------- ----------- ----------- ------------------
0 0 1 1
0 0 1 2
0 0 1 3
1 1 1 4
1 1 1 12
2 2 2 5
2 2 2 6
2 2 2 7
3 3 2 8
3 3 2 9
3 4 2 11
4 3 2 27
5 5 3 10
This only works where OrderDetailsNumber is unique such that the result is deterministic.
Edit
I've now got the complete code working, however the dependence on OrderDetailsNumber being in order is very iffy, hopefully you can test and edit as required.
Edit 2
I've put the 'golfed' version in the main answer.
-- Long form of the paging query, exposing the intermediate values.
WITH numbered AS
(
    -- 0-based position of each detail row inside its order
    SELECT
        o.OrderNumber,
        o.OrderDetailsNumber,
        o.Expected,
        ROW_NUMBER() OVER (PARTITION BY o.OrderNumber ORDER BY o.OrderDetailsNumber) - 1 AS RowNumber
    FROM #Orders AS o
)
, paged AS
(
    -- page within the order, and the page number across all orders
    SELECT
        n.OrderDetailsNumber AS odn,
        n.RowNumber / 3 AS page_for_order_number,
        DENSE_RANK() OVER (ORDER BY n.OrderNumber, n.RowNumber / 3) - 1 AS Actual
    FROM numbered AS n
)
SELECT
    p.page_for_order_number,
    n.RowNumber,
    p.Actual,
    n.Expected,
    n.OrderNumber,
    n.OrderDetailsNumber
FROM numbered AS n
INNER JOIN paged AS p
    ON p.odn = n.OrderDetailsNumber
This strikes me as a bit of a hack, but it works...
Divide the row_number() by 3, and use CEILING to get the smallest integer greater than or equal to the result of that division.
-- Row number across all orders, that number divided by 3, and the division
-- rounded up (GRPR), followed by every column of the table.
select
    row_number() over (order by OrderNumber) as [Actual],
    cast(row_number() over (order by OrderNumber) as decimal(5,1)) / 3,
    ceiling(cast(row_number() over (order by OrderNumber) as decimal(5,1)) / 3) as GRPR,
    *
from #orders
EDIT: Dang it, can never get results to line up. The 3rd column in the result set is your "page number".
Which yields:
Actual (No column name) PG_NBR Expected OrderNumber OrderDetailsNumber
1 0.333333 1 0 1 1
2 0.666666 1 0 1 2
3 1.000000 1 0 1 3
4 1.333333 2 1 1 4
5 1.666666 2 2 2 5
6 2.000000 2 2 2 6
7 2.333333 3 2 2 7
8 2.666666 3 3 2 8
9 3.000000 3 3 2 9

TSQL OVER clause: COUNT(*) OVER (ORDER BY a)

This is my code:
-- Repro script: rebuild dbo.t in tempdb, load 12 rows, run the query in question.
USE [tempdb];
GO

IF OBJECT_ID(N'dbo.t') IS NOT NULL
    DROP TABLE dbo.t;
GO

CREATE TABLE dbo.t
(
    a NVARCHAR(8),
    b NVARCHAR(8)
);
GO

INSERT INTO t (a, b)
VALUES
    ('a', 'b'),
    ('a', 'b'),
    ('a', 'b'),
    ('c', 'd'),
    ('c', 'd'),
    ('c', 'd'),
    ('c', 'd'),
    ('e', NULL),
    (NULL, NULL),
    (NULL, NULL),
    (NULL, NULL),
    (NULL, NULL);
GO

-- The query being asked about: COUNT(*) with ORDER BY but no PARTITION BY.
SELECT
    a,
    b,
    COUNT(*) OVER (ORDER BY a)
FROM t;
On this page of BOL, Microsoft says that:
If PARTITION BY is not specified, the function treats all rows of the
query result set as a single group.
So based on my understanding, the last SELECT statement will give me the following result. Since all records are considered as in one single group, right?
a b
-------- -------- -----------
NULL NULL 12
NULL NULL 12
NULL NULL 12
NULL NULL 12
a b 12
a b 12
a b 12
c d 12
c d 12
c d 12
c d 12
e NULL 12
But the actual result is:
a b
-------- -------- -----------
NULL NULL 4
NULL NULL 4
NULL NULL 4
NULL NULL 4
a b 7
a b 7
a b 7
c d 11
c d 11
c d 11
c d 11
e NULL 12
Anyone can help to explain why? Thanks.
It gives a running total (this functionality was not implemented in SQL Server until version 2012.)
The ORDER BY defines the window to be aggregated with UNBOUNDED PRECEDING and CURRENT ROW as the default when not specified. SQL Server defaults to the less well performing RANGE option rather than ROWS.
They have different semantics in the case of ties in that the window for the RANGE version includes not just the current row (and preceding rows) but also any additional tied rows with the same value of a as the current row. This can be seen in the number of rows counted by each in the results below.
-- Side-by-side comparison of the ROWS frame, the RANGE frame and an empty OVER().
SELECT
    a,
    b,
    -- counts rows one at a time, even among ties on a
    COUNT(*) OVER (
        ORDER BY a
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS [Rows],
    -- includes every row tied with the current value of a
    COUNT(*) OVER (
        ORDER BY a
        RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS [Range],
    -- whole result set
    COUNT(*) OVER () AS [Over()]
FROM t;
Returns
a b Rows Range Over()
-------- -------- ----------- ----------- -----------
NULL NULL 1 4 12
NULL NULL 2 4 12
NULL NULL 3 4 12
NULL NULL 4 4 12
a b 5 7 12
a b 6 7 12
a b 7 7 12
c d 8 11 12
c d 9 11 12
c d 10 11 12
c d 11 11 12
e NULL 12 12 12
To achieve the result that you were expecting to get omit both the PARTITION BY and ORDER BY and use an empty OVER() clause (also shown above).
If ROWS/RANGE is not specified but ORDER BY is specified, RANGE UNBOUNDED PRECEDING AND CURRENT ROW is used as the default for window frame
So what does that mean, let's focus on "UNBOUNDED PRECEDING AND CURRENT ROW". This gives a running total from the starting row to the current row.
But in case if you want to have an overall count then you can also specify
"UNBOUNDED PRECEDING AND UNBOUNDED Following"
This considers entire data set and Over() is just a shortcut of this
-- COUNT(*) under the default frame and under each explicit ROWS/RANGE frame.
select
    a,
    b,
    count(*) over (order by a) as [count],
    count(*) over (
        order by a
        range between unbounded preceding and current row
    ) as [Range],
    count(*) over (
        order by a
        rows between unbounded preceding and current row
    ) as [Rows],
    count(*) over (
        order by a
        range between unbounded preceding and unbounded following
    ) as [Range_Unbounded_following],
    count(*) over (
        order by a
        rows between unbounded preceding and unbounded following
    ) as [Row_Unbounded_following],
    count(*) over () as [Plain_over]
from t
order by [count]
Result is
a b count Range Rows Range_Unbounded_following Row_Unbounded_following Plain_over
-------- -------- ----------- ----------- ----------- ------------------------- ----------------------- -----------
NULL NULL 4 4 1 12 12 12
NULL NULL 4 4 2 12 12 12
NULL NULL 4 4 3 12 12 12
NULL NULL 4 4 4 12 12 12
a b 7 7 5 12 12 12
a b 7 7 6 12 12 12
a b 7 7 7 12 12 12
c d 11 11 8 12 12 12
c d 11 11 9 12 12 12
c d 11 11 10 12 12 12
c d 11 11 11 12 12 12
e NULL 12 12 12 12 12 12