Rows inside the greatest streak? - sql

Given the Rows
symbol_id profit date
1 100 2009-08-18 01:01:00
1 100 2009-08-18 01:01:01
1 156 2009-08-18 01:01:04
1 -56 2009-08-18 01:01:06
1 18 2009-08-18 01:01:07
How would I most efficiently select the rows that are involved in the greatest streak (of profit).
The greatest streak would be the first 3 rows, and I would want those rows. The query I came up with is just a bunch of nested queries and derived tables. I am looking for an efficient way to do this using common table expressions or something more advanced.

You haven't defined how 0 profit should be treated or what happens if there is a tie for longest streak. But something like...
;WITH T1 AS
(
SELECT *,
ROW_NUMBER() OVER (PARTITION BY symbol_id ORDER BY date) -
ROW_NUMBER() OVER (PARTITION BY symbol_id, SIGN(profit)
ORDER BY date) AS Grp
FROM Data
), T2 AS
(
SELECT *,
COUNT(*) OVER (PARTITION BY symbol_id,Grp) AS StreakLen
FROM T1
)
SELECT TOP 1 WITH TIES *
FROM T2
ORDER BY StreakLen DESC
Or - if you are looking for most profitable streak
;WITH T1 AS
(
SELECT *,
ROW_NUMBER() OVER (PARTITION BY symbol_id ORDER BY date) -
ROW_NUMBER() OVER (PARTITION BY symbol_id, CASE WHEN profit >= 0 THEN 1 END
ORDER BY date) AS Grp
FROM Data
), T2 AS
(
SELECT *,
SUM(profit) OVER (PARTITION BY symbol_id,Grp) AS StreakProfit
FROM T1
)
SELECT TOP 1 WITH TIES *
FROM T2
ORDER BY StreakProfit DESC

declare #T table
(
symbol_id int,
profit int,
[date] datetime
)
insert into #T values
(1, 100, '2009-08-18 01:01:00'),
(1, 100, '2009-08-18 01:01:01'),
(1, 156, '2009-08-18 01:01:04'),
(1, -56, '2009-08-18 01:01:06'),
(1, 18 , '2009-08-18 01:01:07')
;with C1 as
(
select *,
row_number() over(order by [date]) as rn
from #T
),
C2 as
(
select *,
rn - row_number() over(order by rn) as grp
from C1
where profit >= 0
)
select top 1 with ties *
from C2
order by sum(profit) over(partition by grp) desc
Result:
symbol_id profit date rn grp
----------- ----------- ----------------------- -------------------- --------------------
1 100 2009-08-18 01:01:00.000 1 0
1 100 2009-08-18 01:01:01.000 2 0
1 156 2009-08-18 01:01:04.000 3 0

If that's a MSSQL server then you want to consider using TOP 3 in your select clause
and ORDER BY PROFIT DESC.
If mysql/postgres you might want to consider using limit in your select clause with
the same order by too.
hope this helps.

Related

How to get longest consecutive same value?

How to get the rows of the longest consecutive same value?
Table Learning:
rowID
values
1
1
2
1
3
0
4
0
5
0
6
1
7
0
8
1
9
1
10
1
Longest consecutive value is 1 (rowID 8-10 as rowID 1-2 is 2 and rowID 6-6 is 1). How to query to get the actual rows of consecutive values (not just rowStart and rowEnd values) like :
rowID
values
8
1
9
1
10
1
And for longest consecutive values of both 1 and 0?
DB Fiddle
I think that the simplest approach is to use a window count to define the islands. Then to get the "longest" island, we just need to aggregate, sort and limit:
select min(valueid) grp_start, max(valueid) grp_end
from (select t.*, sum(value = 0) over(order by valueid) grp from testing t) t
where value = 1
group by grp
order by count(*) desc limit 1
In the DB Fiddle that you provided, the query returns:
grp_start
grp_end
8
10
This is a gaps and islands problem, and one approach is to use the difference in row numbers method:
WITH cte AS (
SELECT *, ROW_NUMBER() OVER (ORDER BY rowID) rn1,
ROW_NUMBER() OVER (PARTITION BY values ORDER BY rowID) rn2
FROM yourTable
),
cte2 AS (
SELECT *,
MIN(rowID) OVER (PARTITION BY values, rn1 - rn2) AS minRowID,
MAX(rowID) OVER (PARTITION BY values, rn1 - rn2) AS maxRowID
FROM cte1
),
cte3 AS (
SELECT *, RANK() OVER (PARTITION BY values ORDER BY maxRowID - minRowID DESC) rnk
FROM cte2
)
SELECT rowID, values
FROM cte3
WHERE rnk = 1
ORDER BY values, rowID;

SQL Server : create group of N rows each and give group number for each group

I want to create a SQL query that SELECT a ID column and adds an extra column to the query which is a group number as shown in the output below.
Each group consists of 3 rows and should have the MIN(ID) as a GroupID for each group. The order by should be ASC on the ID column.
ID GroupNr
------------
100 100
101 100
102 100
103 103
104 103
105 103
106 106
107 106
108 106
I've tried solutions with ROW_NUMBER() and DENSE_RANK(). And also this query:
SELECT
*, MIN(ID) OVER (ORDER BY ID ASC ROWS 2 PRECEDING) AS Groupnr
FROM
Table
ORDER BY
ID ASC
Use row_number() to enumerate the rows, arithmetic to assign the group and then take the minimum of the id:
SELECT t.*, MIN(ID) OVER (PARTITION BY grp) as groupnumber
FROM (SELECT t.*,
( (ROW_NUMBER() OVER (ORDER BY ID) - 1) / 3) as grp
FROM Table
) t
ORDER BY ID ASC;
It is possible to do this without a subquery, but the logic is rather messy:
select t.*,
(case when row_number() over (order by id) % 3 = 0
then lag(id, 2) over (order by id)
when row_number() over (order by id) % 3 = 2
then lag(id, 1) over (order by id)
else id
end) as groupnumber
from table t
order by id;
Assuming you want the lowest value in the group, and they are always groups of 3, rather than the NTILE (as Saravantn suggests, which splits the data into that many even(ish) groups), you could use a couple of window functions:
WITH Grps AS(
SELECT V.ID,
(ROW_NUMBER() OVER (ORDER BY V.ID) -1) / 3 AS Grp
FROM (VALUES(100),
(101),
(102),
(103),
(104),
(105),
(106),
(107),
(108))V(ID))
SELECT G.ID,
MIN(G.ID) OVER (PARTITION BY G.Grp) AS GroupNr
FROM Grps G;
SELECT T2.ID, T1.ID
FROM (
SELECT MIN(ID) AS ID, GroupNr
FROM
(
SELECT ID, ( Row_number()OVER(ORDER BY ID) - 1 ) / 3 + 1 AS GroupNr
FROM Table
) AS T1
GROUP BY GroupNr
) AS T1
INNER JOIN (
SELECT ID, ( Row_number()OVER(ORDER BY ID) - 1 ) / 3 + 1 AS GroupNr
FROM Table
) T2 ON T1.GroupNr = T2.GroupNr

Get consecutive days with condition

There is a table with three columns:
CREATE TABLE #t1 ( Id INT
,VisitDate DATE
,Counter INT)
AND test data:
INSERT INTO #t1 VALUES (1,'2019-01-01', 50)
INSERT INTO #t1 VALUES (2,'2019-01-02', 15)
INSERT INTO #t1 VALUES (3,'2019-01-03', 7)
INSERT INTO #t1 VALUES (4,'2019-01-04', 7)
INSERT INTO #t1 VALUES (5,'2019-01-05', 18)
INSERT INTO #t1 VALUES (6,'2019-01-06', 19)
INSERT INTO #t1 VALUES (7,'2019-01-07', 11)
INSERT INTO #t1 VALUES (8,'2019-01-08', 1)
INSERT INTO #t1 VALUES (9,'2019-01-09', 19)
Need to find three and more consecutive days where Counter more or equal ten:
Id VisitDate Counter
5 2019-01-05 18
6 2019-01-06 19
7 2019-01-07 11
My SELECT statement is
;WITH cte AS
(
SELECT *
,IIF(Counter > 10, 1,0) AS MoreThanTen
FROM #t1
), lag_lead_cte AS
(
SELECT *
,LAG(MoreThanTen) OVER (ORDER BY VisitDate) AS LagShift
,(LAG(MoreThanTen) OVER (ORDER BY VisitDate) + MoreThanTen ) AS LagMoreThanTen
,LEAD(MoreThanTen) OVER (ORDER BY VisitDate) AS LeadShift
,(LEAD(MoreThanTen) OVER (ORDER BY VisitDate) + MoreThanTen ) AS LeadMoreThanTen
FROM cte
)
SELECT *
FROM lag_lead_cte
WHERE LagMoreThanTen = 2 OR LeadMoreThanTen = 2
But the result is not fully consistent
Id VisitDate Counter
1 2019-01-01 50
2 2019-01-02 15
5 2019-01-05 18
6 2019-01-06 19
7 2019-01-07 11
It looks like a gaps-and-islands problem.
Here is one way to do it.
I'm assuming SQL Server based on the T-SQL tag.
Run this query CTE-by-CTE and examine intermediate results to understand how it works.
Query
WITH
CTE_rn
AS
(
SELECT *
,CASE WHEN Counter>10 THEN 1 ELSE 0 END AS MoreThanTen
,ROW_NUMBER() OVER (ORDER BY VisitDate) AS rn1
,ROW_NUMBER() OVER (PARTITION BY CASE WHEN Counter>10 THEN 1 ELSE 0 END ORDER BY VisitDate) AS rn2
FROM #t1
)
,CTE_Groups
AS
(
SELECT
*
,rn1-rn2 AS Diff
,COUNT(*) OVER (PARTITION BY MoreThanTen, rn1-rn2) AS GroupLength
FROM CTE_rn
)
SELECT
ID
,VisitDate
,Counter
FROM CTE_Groups
WHERE
GroupLength >= 3
AND Counter > 10
ORDER BY VisitDate
;
Result
+----+------------+---------+
| ID | VisitDate | Counter |
+----+------------+---------+
| 5 | 2019-01-05 | 18 |
| 6 | 2019-01-06 | 19 |
| 7 | 2019-01-07 | 11 |
+----+------------+---------+
Try this:
select Id, VisitDate, Counter from (
select Id, VisitDate, Counter, count(*) over (partition by grp) cnt from (
select *,
-- here I used difference between row number and day to group consecutive days
row_number() over (order by visitDate) - day(visitDate) grp
from #t1
where [Counter] > 10
) a
) a where cnt >= 3 --where group count is greater or equal to three
Based on the comment that days does not need to be consecutive, just rows have to be consecutive, here is updated query, which uses similair technique:
select id, visitdate, counter from (
select id, visitdate, counter, count(*) over (partition by grp) cnt from (
select *, rn - row_number() over (order by visitDate) grp from (
select *,
case when (Counter > 10) or (lag(Counter) over (order by visitDate) > 10 and Counter > 10) then
row_number() over (order by visitdate) end rn
from #t1
) a where rn is not null
) a
) a where cnt >= 3
I think this might be most simply handled by just looking at the sequences using lead() and lag():
select id, visitdate, counter
from (select t1.*,
lag(counter, 2) over (order by visitdate) as counter_2p,
lag(counter, 1) over (order by visitdate) as counter_1p,
lead(counter, 1) over (order by visitdate) as counter_1l,
lead(counter, 2) over (order by visitdate) as counter_2l
from t1
) t1
where counter >= 10 and
((counter_2p >= 10 and counter_1p >= 10) or
(counter_1p >= 10 and counter_1l >= 10) or
(counter_1l >= 10 and counter_2l >= 10)
);
Cross apply also works for this Question
with result as (
select
t.Id as Id1,t.VisitDate as VisitDate1,t.Counter as Counter1
,tt.Id as Id2,tt.VisitDate as VisitDate2,tt.Counter as Counter2
from #t1 t cross join #t1 tt where DATEDIFF(Day,t.VisitDate,tt.visitDate)=1
and t.Counter>10 and tt.Counter>10
)
select Id1 as Id,VisitDate1 as VisitDate ,Counter1 as [Counter] from result
union
select Id2 as Id,VisitDate2 as VisitDate,Counter2 as [Counter] from result

Efficient way to cluster a timeline OR reconstruct a batch number

I'm working on a large dataset (150k / day) of a tester database. Each row contains data about a specific test of the product. Each tester inserts the results of his test.
I want to do some measurements like pass-fail-rate over a shift per product and tester. The problem is there are no batch numbers assigned so I can't select this easy.
Considering the given subselect of the whole table:
id tBegin orderId
------------------------------------
1 2018-10-20 00:00:05 1
2 2018-10-20 00:05:15 1
3 2018-10-20 01:00:05 1
10 2018-10-20 10:03:05 3
12 2018-10-20 11:04:05 8
20 2018-10-20 14:15:05 3
37 2018-10-20 18:12:05 1
My goal is it to cluster the data to the following
id tBegin orderId pCount
--------------------------------------------
1 2018-10-20 00:00:05 1 3
10 2018-10-20 10:03:05 3 1
12 2018-10-20 11:04:05 8 1
20 2018-10-20 14:15:05 3 1
37 2018-10-20 18:12:05 1 1
A simple GROUP BY orderID won't do the trick, so I came upwith the following
SELECT
MIN(c.id) AS id,
MIN(c.tBegin) AS tBegin,
c.orderId,
COUNT(*) AS pCount
FROM (
SELECT t2.id, t2.tBegin, t2.orderId,
( SELECT TOP 1 t.id
FROM history t
WHERE t.tBegin > t2.tBegin
AND t.orderID <> t2.orderID
AND <restrict date here further>
ORDER BY t.tBegin
) AS nextId
FROM history t2
) AS c
WHERE <restrict date here>
GROUP BY c.orderID, c.nextId
I left out the WHEREs that select the correct date and tester.
This works, but it seams very inefficient. I have worked with small databases, but I'm new to SQL Server 2017.
I appreciate your help very much!
You can use window functions for this:
DECLARE #t TABLE (id INT, tBegin DATETIME, orderId INT);
INSERT INTO #t VALUES
(1 , '2018-10-20 00:00:05', 1),
(2 , '2018-10-20 00:05:15', 1),
(3 , '2018-10-20 01:00:05', 1),
(10, '2018-10-20 10:03:05', 3),
(12, '2018-10-20 11:04:05', 8),
(20, '2018-10-20 14:15:05', 3),
(37, '2018-10-20 18:12:05', 1);
WITH cte1 AS (
SELECT *, CASE WHEN orderId = LAG(orderId) OVER (ORDER BY tBegin) THEN 0 ELSE 1 END AS chg
FROM #t
), cte2 AS (
SELECT *, SUM(chg) OVER(ORDER BY tBegin) AS grp
FROM cte1
), cte3 AS (
SELECT *, ROW_NUMBER() OVER (PARTITION BY grp ORDER BY tBegin) AS rn
FROM cte2
)
SELECT *
FROM cte3
WHERE rn = 1
The first cte assigns a "change flag" to each row where the value changed
The second cte uses a running sum to convert 1s and 0s to a number which can be used to group rows
Finally you number rows within each group and select first row per group
Demo on DB Fiddle
You can use cumulative approach :
select min(id) as id, max(tBegin), orderid, count(*)
from (select h.*,
row_number() over (order by id) as seq1,
row_number() over (partition by orderid order by id) as seq2
from history h
) h
group by orderid, (seq1 - seq2)
order by id;

Get specific row from a subquery using aggregate function

I am trying to get a specific row from a subquery, but I cannot use an aggregate function in a WHERE clause and I have read that I should be using a HAVING clause but I have no idea where to start.
This is my current sql statement:
SELECT *
FROM
(
select ID, SUM(BALANCE) AS Balance FROM bankacc GROUP BY ID
)A
I will get :
ID | Balance
1 | 30
2 | 40
3 | 50
4 | 50
I need the rows with the MAX(Balance), but I have no idea where to start, please help.
With window function:
DECLARE #t TABLE ( ID INT, Amount MONEY )
INSERT INTO #t
VALUES ( 1, 10 ),
( 1, 10 ),
( 1, 10 ),
( 2, 5 ),
( 2, 20 ),
( 3, 50 )
SELECT ID ,
Amount
FROM ( SELECT ID ,
SUM(Amount) AS Amount ,
RANK() OVER ( ORDER BY SUM(Amount) DESC ) AS rn
FROM #t
GROUP BY ID
) t
WHERE rn = 1
With TOP and TIES:
SELECT TOP 1 WITH TIES
ID ,
SUM(Amount) AS Amount
FROM #t
GROUP BY ID
ORDER BY Amount desc
These versions will return rows where sum will be max, not just top 1 row.
Output:
ID Amount
3 50.00
you can wrap it in a subquery:
SELECT q.id, max(q.b)
FROM
(
select ID, SUM(BALANCE) b FROM bankacc GROUP BY ID
) q
group by q.id
or order them in dessending order and get first record:
select top 1 ID, SUM(BALANCE) b FROM bankacc GROUP BY ID order by b desc
in MySQL you need to use limit 1 instead of top 1
I think this should be simple.
-- This will return only 1 record, even if there are 2 records for MAX same amount
SELECT top 1 ID ,
Amount
FROM ( SELECT ID ,
SUM(Amount) AS Amount
FROM Table
GROUP BY ID
) t
Order by Amount desc,ID asc
Using Window function : This will return what you want.
SELECT ID ,
Amount
FROM ( SELECT ID ,
SUM(Amount) AS Amount ,
RANK() OVER ( ORDER BY SUM(Amount) DESC ) AS rnk
FROM Table
GROUP BY ID
) t
WHERE rnk = 1