Get consecutive days with condition - sql

There is a table with three columns:
-- Sample table: one row per visit date together with that day's counter.
CREATE TABLE #t1
(
    Id        INT,
    VisitDate DATE,
    Counter   INT
)
And the test data:
-- Load the nine sample days with a single multi-row insert.
INSERT INTO #t1 (Id, VisitDate, Counter)
VALUES
    (1, '2019-01-01', 50),
    (2, '2019-01-02', 15),
    (3, '2019-01-03', 7),
    (4, '2019-01-04', 7),
    (5, '2019-01-05', 18),
    (6, '2019-01-06', 19),
    (7, '2019-01-07', 11),
    (8, '2019-01-08', 1),
    (9, '2019-01-09', 19)
I need to find runs of three or more consecutive days where Counter is greater than or equal to ten:
Id VisitDate Counter
5 2019-01-05 18
6 2019-01-06 19
7 2019-01-07 11
My SELECT statement is
-- Attempt from the question: flag each row, then compare it with its neighbours.
-- cte: MoreThanTen is 1 when Counter > 10, otherwise 0.
;WITH cte AS
(
SELECT *
,IIF(Counter > 10, 1,0) AS MoreThanTen
FROM #t1
), lag_lead_cte AS
(
-- LagShift / LeadShift: the flag of the previous / next row by VisitDate
-- (NULL on the first / last row).
-- LagMoreThanTen / LeadMoreThanTen: that neighbour's flag plus the row's own flag,
-- so a value of 2 means "this row and that one neighbour are both flagged".
SELECT *
,LAG(MoreThanTen) OVER (ORDER BY VisitDate) AS LagShift
,(LAG(MoreThanTen) OVER (ORDER BY VisitDate) + MoreThanTen ) AS LagMoreThanTen
,LEAD(MoreThanTen) OVER (ORDER BY VisitDate) AS LeadShift
,(LEAD(MoreThanTen) OVER (ORDER BY VisitDate) + MoreThanTen ) AS LeadMoreThanTen
FROM cte
)
-- The filter only requires ONE flagged neighbour, i.e. a run of two rows is enough.
-- That is why a two-row run is returned as well as the three-row run.
SELECT *
FROM lag_lead_cte
WHERE LagMoreThanTen = 2 OR LeadMoreThanTen = 2
But the result is not fully consistent
Id VisitDate Counter
1 2019-01-01 50
2 2019-01-02 15
5 2019-01-05 18
6 2019-01-06 19
7 2019-01-07 11

It looks like a gaps-and-islands problem.
Here is one way to do it.
I'm assuming SQL Server based on the T-SQL tag.
Run this query CTE-by-CTE and examine intermediate results to understand how it works.
Query
-- Gaps-and-islands: return every row that belongs to a run of three or more
-- consecutive rows (ordered by VisitDate) whose Counter is at least ten.
WITH
CTE_flag
AS
(
    -- Evaluate the threshold once. The stated requirement is "more or equal ten",
    -- hence >= rather than >. A NULL Counter yields 0.
    SELECT
        Id
        ,VisitDate
        ,Counter
        ,CASE WHEN Counter >= 10 THEN 1 ELSE 0 END AS AtLeastTen
    FROM #t1
)
,CTE_rn
AS
(
    -- rn1 numbers all rows; rn2 numbers rows inside their own flag class.
    SELECT
        Id
        ,VisitDate
        ,Counter
        ,AtLeastTen
        ,ROW_NUMBER() OVER (ORDER BY VisitDate) AS rn1
        ,ROW_NUMBER() OVER (PARTITION BY AtLeastTen ORDER BY VisitDate) AS rn2
    FROM CTE_flag
)
,CTE_Groups
AS
(
    -- Within one flag class rn1 - rn2 stays constant while rows are adjacent,
    -- so (AtLeastTen, Diff) identifies an island; GroupLength is its size.
    SELECT
        Id
        ,VisitDate
        ,Counter
        ,AtLeastTen
        ,rn1 - rn2 AS Diff
        ,COUNT(*) OVER (PARTITION BY AtLeastTen, rn1 - rn2) AS GroupLength
    FROM CTE_rn
)
SELECT
    ID
    ,VisitDate
    ,Counter
FROM CTE_Groups
WHERE
    GroupLength >= 3
    AND AtLeastTen = 1
ORDER BY VisitDate
;
Result
+----+------------+---------+
| ID | VisitDate | Counter |
+----+------------+---------+
| 5 | 2019-01-05 | 18 |
| 6 | 2019-01-06 | 19 |
| 7 | 2019-01-07 | 11 |
+----+------------+---------+

Try this:
-- Rows with Counter >= 10 that form a run of three or more consecutive calendar days.
select Id, VisitDate, Counter from (
    select Id, VisitDate, Counter, count(*) over (partition by grp) cnt from (
        select Id, VisitDate, Counter,
            -- Absolute day number minus row number is constant inside a run of
            -- consecutive dates. day(VisitDate) must not be used here: it restarts
            -- at 1 every month, which splits runs that cross a month boundary.
            datediff(day, '19000101', VisitDate) - row_number() over (order by VisitDate) grp
        from #t1
        where [Counter] >= 10   -- requirement: "more or equal ten"
    ) a
) a where cnt >= 3 -- keep only runs of at least three days
Based on the comment that the days do not need to be consecutive, only the rows, here is an updated query that uses a similar technique:
-- Rows with Counter >= 10 that form a run of three or more consecutive ROWS
-- (ordered by visitDate); the dates themselves may have gaps.
select id, visitdate, counter from (
    select id, visitdate, counter, count(*) over (partition by grp) cnt from (
        -- Position among all rows minus position among qualifying rows is
        -- constant for qualifying rows that are adjacent in the full ordering.
        select id, visitdate, counter, rn - row_number() over (order by visitDate) grp from (
            select id, visitdate, counter,
                -- rn = position among all rows, NULL for non-qualifying rows.
                -- No LAG() test is needed: a row qualifies on its own Counter alone.
                case when Counter >= 10 then
                    row_number() over (order by visitdate) end rn
            from #t1
        ) a where rn is not null
    ) a
) a where cnt >= 3

I think this might be most simply handled by just looking at the sequences using lead() and lag():
-- A row is part of a run of three when it qualifies itself and is the last,
-- middle or first member of a qualifying triple of adjacent rows.
select id, visitdate, counter
from (select t.id, t.visitdate, t.counter,
             lag(t.counter, 2) over (order by t.visitdate) as counter_2p,
             lag(t.counter, 1) over (order by t.visitdate) as counter_1p,
             lead(t.counter, 1) over (order by t.visitdate) as counter_1l,
             lead(t.counter, 2) over (order by t.visitdate) as counter_2l
      from #t1 t   -- the sample data lives in the temp table #t1
     ) t1
where counter >= 10 and
      ((counter_2p >= 10 and counter_1p >= 10) or   -- row is the 3rd of the triple
       (counter_1p >= 10 and counter_1l >= 10) or   -- row is the 2nd of the triple
       (counter_1l >= 10 and counter_2l >= 10)      -- row is the 1st of the triple
      );

Cross apply also works for this Question
-- Return every row that lies inside at least one window of three consecutive
-- calendar days on each of which Counter >= 10. Matching adjacent pairs only
-- (DATEDIFF(...) = 1 on a self-join) would also return two-day runs.
select distinct t.Id, t.VisitDate, t.[Counter]
from #t1 t
-- The 3-day windows that contain t.VisitDate start 2, 1 or 0 days before it.
cross join (values (-2), (-1), (0)) as w (shift)
cross apply (
    -- Number of distinct qualifying days inside the current window.
    select count(distinct x.VisitDate) as qualifying_days
    from #t1 x
    where x.[Counter] >= 10
      and x.VisitDate >= dateadd(day, w.shift, t.VisitDate)
      and x.VisitDate <= dateadd(day, w.shift + 2, t.VisitDate)
) as c
where t.[Counter] >= 10
  and c.qualifying_days = 3   -- all three days of the window qualify
order by t.VisitDate

Related

Efficient way to cluster a timeline OR reconstruct a batch number

I'm working on a large dataset (150k / day) of a tester database. Each row contains data about a specific test of the product. Each tester inserts the results of his test.
I want to do some measurements like pass-fail-rate over a shift per product and tester. The problem is there are no batch numbers assigned so I can't select this easy.
Considering the given subselect of the whole table:
id tBegin orderId
------------------------------------
1 2018-10-20 00:00:05 1
2 2018-10-20 00:05:15 1
3 2018-10-20 01:00:05 1
10 2018-10-20 10:03:05 3
12 2018-10-20 11:04:05 8
20 2018-10-20 14:15:05 3
37 2018-10-20 18:12:05 1
My goal is it to cluster the data to the following
id tBegin orderId pCount
--------------------------------------------
1 2018-10-20 00:00:05 1 3
10 2018-10-20 10:03:05 3 1
12 2018-10-20 11:04:05 8 1
20 2018-10-20 14:15:05 3 1
37 2018-10-20 18:12:05 1 1
A simple GROUP BY orderID won't do the trick, so I came up with the following:
-- Attempt from the question: one output row per batch with its first id,
-- first tBegin, orderId and row count.
-- The <...> markers are placeholders for the date/tester filters, not SQL.
SELECT
MIN(c.id) AS id,
MIN(c.tBegin) AS tBegin,
c.orderId,
COUNT(*) AS pCount
FROM (
SELECT t2.id, t2.tBegin, t2.orderId,
-- nextId: id of the earliest later row (by tBegin) that has a different orderId.
-- It is NULL when no such row exists.
-- This subquery is correlated with t2, i.e. it is evaluated per outer row.
( SELECT TOP 1 t.id
FROM history t
WHERE t.tBegin > t2.tBegin
AND t.orderID <> t2.orderID
AND <restrict date here further>
ORDER BY t.tBegin
) AS nextId
FROM history t2
) AS c
WHERE <restrict date here>
-- Rows that share both orderId and nextId are treated as one batch.
GROUP BY c.orderID, c.nextId
I left out the WHEREs that select the correct date and tester.
This works, but it seems very inefficient. I have worked with small databases, but I'm new to SQL Server 2017.
I appreciate your help very much!
You can use window functions for this:
-- Table variables are declared with @ (a # prefix is only valid for temp tables).
DECLARE @t TABLE (id INT, tBegin DATETIME, orderId INT);
INSERT INTO @t (id, tBegin, orderId) VALUES
(1 , '2018-10-20 00:00:05', 1),
(2 , '2018-10-20 00:05:15', 1),
(3 , '2018-10-20 01:00:05', 1),
(10, '2018-10-20 10:03:05', 3),
(12, '2018-10-20 11:04:05', 8),
(20, '2018-10-20 14:15:05', 3),
(37, '2018-10-20 18:12:05', 1);
-- id is used as a tie-breaker everywhere so rows with equal tBegin are
-- processed in one well-defined order by all three window functions.
WITH cte1 AS (
    -- chg = 1 on the first row and whenever orderId differs from the previous row
    SELECT id, tBegin, orderId,
           CASE WHEN orderId = LAG(orderId) OVER (ORDER BY tBegin, id) THEN 0 ELSE 1 END AS chg
    FROM @t
), cte2 AS (
    -- running sum of the change flags = batch number
    SELECT id, tBegin, orderId, chg,
           SUM(chg) OVER (ORDER BY tBegin, id ROWS UNBOUNDED PRECEDING) AS grp
    FROM cte1
), cte3 AS (
    -- rn numbers the rows of a batch; pCount is the batch size asked for in the question
    SELECT id, tBegin, orderId, chg, grp,
           ROW_NUMBER() OVER (PARTITION BY grp ORDER BY tBegin, id) AS rn,
           COUNT(*) OVER (PARTITION BY grp) AS pCount
    FROM cte2
)
SELECT id, tBegin, orderId, chg, grp, rn, pCount
FROM cte3
WHERE rn = 1          -- first row of every batch
ORDER BY tBegin, id;
The first cte assigns a "change flag" to each row where the value changed
The second cte uses a running sum to convert 1s and 0s to a number which can be used to group rows
Finally you number rows within each group and select first row per group
Demo on DB Fiddle
You can use cumulative approach :
-- One row per batch of adjacent rows (ordered by id) sharing the same orderid.
-- seq1 - seq2 is constant inside such a batch.
-- The expected output shows the FIRST tBegin of each batch, hence min(), not max().
select min(h.id) as id,
       min(h.tBegin) as tBegin,
       h.orderid,
       count(*) as pCount
from (select h.*,
             row_number() over (order by id) as seq1,
             row_number() over (partition by orderid order by id) as seq2
      from history h
     ) h
group by h.orderid, (h.seq1 - h.seq2)
order by min(h.id);

Create episode for each value with new Begin and End Dates

This is in reference to below Question
Loop through each value to the seq num
But now Client want to see the data differently and started a new thread for this question.
below is the requirement.
This is the data .
ID seqNum DOS Service End Date
1 1 1/1/2017 1/15/2017
1 2 1/16/2017 1/16/2017
1 3 1/17/2017 1/21/2017
1 4 1/22/2017 2/13/2017
1 5 2/14/2017 3/21/2017
1 6 2/16/2017 3/21/2017
Expected outPut:
ID SeqNum DOSBeg DOSEnd
1 1 1/1/2017 1/30/2017
1 2 1/31/2017 3/1/2017
1 3 3/2/2017 3/31/2017
For each DOSBeg, add 29 and that is DOSEnd. then Add 1 to DOSEnd (1/31/2017) is new DOSBeg.
Now add 29 to 1/31/2017, which gives 3/1/2017 as the next DOSEnd. Repeat this until DOSEnd >= Max End Date, i.e. 3/21/2017.
Basically, we need episode of 29 days for each ID.
I tried with this code and it is giving me duplicates.
-- Attempt from the question (reported to return duplicates).
with cte as (
-- Anchor: emits one starting episode for EVERY row of #temp; nothing here
-- reduces the rows to one per ID.
select ID, minDate as DOSBeg,dateadd(day,29,mindate) as DOSEnd
from #temp
union all
-- Recursive step: next episode starts the day after the previous DOSEnd and
-- lasts 29 more days. There is no WHERE clause, so this step itself never
-- stops producing rows.
select ID,dateadd(day,1,DOSEnd) as DOSBeg,dateadd(day,29,dateadd(day,1,DOSEnd)) as DOSEnd
from cte
)
select ID,DOSBeg,DOSEnd
from cte
-- MAXRECURSION 0 removes the recursion-depth limit.
-- NOTE(review): with no stop condition above this looks unbounded; confirm.
OPTION (MAXRECURSION 0)
Here mindate is Minimum DOS for this ID i.e. 1/1/2017
I came up with below logic and this is working fine for me. Is there any better way than this ?
-- Sample data. Table variables use the @ prefix. seqNum is an identity column
-- and is therefore left out of the insert column list.
declare @table table (id int, seqNum int identity(1,1), DOS date, ServiceEndDate date)
insert into @table (id, DOS, ServiceEndDate)
values
(1,'20170101','20170115'),
(1,'20170116','20170116'),
(1,'20170117','20170121'),
(1,'20170122','20170213'),
(1,'20170214','20170321'),
(1,'20170216','20170321'),
(2,'20170101','20170103'),
(2,'20170104','20170118')
select * into #temp from @table
--drop table #data
-- #data: every source row plus, per ID, the earliest DOS (minDate) and the
-- latest ServiceEndDate (maxDate).
select distinct ID, cast(min(DOS) over (partition by ID) as date) as minDate
,row_Number() over (partition by ID order by ID, DOS) as SeqNum,
DOS,
max(ServiceEndDate) over (partition by ID) as maxDate
into #data
from #temp;   -- the statement before a WITH must be terminated with a semicolon
--drop table #StartDateLogic
-- Generate the episode start dates: minDate, minDate + 30, ... while <= maxDate.
with cte as
(
    -- Anchor: one row per ID. DISTINCT keeps the recursion from being repeated
    -- once per source row.
    select distinct ID, mindate as startdate, maxdate
    from #data
    union all
    select ID, dateadd(day,30,startdate) as startdate, maxdate
    from cte
    where maxdate >= dateadd(day,30,startdate)   -- stop once the next start passes maxDate
)
select distinct ID,startdate
into #StartDateLogic
from cte
OPTION (MAXRECURSION 0)
--final Result set
-- Each episode covers 30 days: StartDate .. StartDate + 29.
select ID
,ROW_NUMBER() over (Partition by ID order by StartDate) as SeqNum
,StartDate
,dateadd(day,29,startdate) as EndDate
from #StartDateLogic
order by ID, StartDate
You were on the right track with the recursive CTE, but you forgot the anchor.
-- Sample data. Table variables use the @ prefix. seqNum is an identity column
-- and is therefore left out of the insert column list.
declare @table table (id int, seqNum int identity(1,1), DOS date, ServiceEndDate date)
insert into @table (id, DOS, ServiceEndDate)
values
(1,'20170101','20170115'),
(1,'20170116','20170116'),
(1,'20170117','20170121'),
(1,'20170122','20170213'),
(1,'20170214','20170321'),
(1,'20170216','20170321'),
(2,'20170101','20170103'),
(2,'20170104','20170118')
;with bounds as (
    -- per id: first day of service and the last service end date
    select id, min(DOS) as minDOS, max(ServiceEndDate) as maxEnd
    from @table
    group by id
), dates as (
    -- anchor: the first 30-day episode of every id
    select id,
           cast(1 as int) as seqNum,
           minDOS as DOSBeg,
           dateadd(day,29,minDOS) as DOSEnd,
           maxEnd
    from bounds
    union all
    -- next episode starts the day after the previous one ends. The recursion is
    -- driven by the dates only, so the number of episodes does not depend on
    -- how many source rows an id happens to have.
    select d.id,
           d.seqNum + 1,
           dateadd(day,1,d.DOSEnd),
           dateadd(day,30,d.DOSEnd),
           d.maxEnd
    from dates d
    where d.DOSEnd < d.maxEnd   -- repeat until DOSEnd >= max end date
)
select id, seqNum, DOSBeg, DOSEnd
from dates
order by id, seqNum
option (maxrecursion 0)

sql : get consecutive group 'n' rows (could be inbetween)

Below is my theater table:
-- Seats of the theater: srno orders the seats, seatno is the printed seat
-- number, available marks a free seat.
CREATE TABLE theater (
    srno      integer,
    seatno    integer,
    available boolean
);

-- Sample rows: seat 400 is the only occupied one.
INSERT INTO theater (srno, seatno, available)
VALUES
    (1, 100, true),
    (2, 200, true),
    (3, 300, true),
    (4, 400, false),
    (5, 500, true),
    (6, 600, true),
    (7, 700, true),
    (8, 800, true);
I want a sql which should take input as 'n' and returns me the first 'n' consecutive available seats, like
if n = 2 output should be 100,200
if n = 4 output should be 500,600,700,800
NOTE: I am trying to build an query for postgres 9.3
In SQL Server you can do it as follows:
-- Number of adjacent free seats wanted (T-SQL variables use the @ prefix).
DECLARE @num INT = 4
;WITH cte AS
(
    -- pt = number of free seats that share the same cnt, i.e. the island size
    SELECT t1.srno, t1.seatno, t1.available,
           COUNT(1) OVER (PARTITION BY t1.cnt) AS pt
    FROM
    (
        -- cnt = how many occupied seats precede this free seat; free seats
        -- with the same cnt have no occupied seat between them
        SELECT tt.srno, tt.seatno, tt.available
              ,(SELECT COUNT(t.srno)
                FROM theater t
                WHERE t.available <> 'true' AND t.srno < tt.srno) AS cnt
        FROM theater tt
        WHERE tt.available = 'true'
    ) t1
)
SELECT TOP (@num) srno, seatno, available
FROM cte
WHERE pt >= @num
ORDER BY srno   -- without this TOP may return any @num rows, not the first ones
OUTPUT
srno seatno available
5 500 true
6 600 true
7 700 true
8 800 true
This will find the available seats. written for sqlserver 2008+:
-- Number of adjacent free seats wanted (T-SQL variables use the @ prefix).
DECLARE @num INT = 4
;WITH CTE as
(
    -- srno minus the position inside the same availability class is constant
    -- for seats that are adjacent and share that class
    SELECT
        srno - row_number() over (partition by available order by srno) grp,
        srno, seatno, available
    FROM theater
), CTE2 as
(
    -- only free seats are kept, so cnt is the size of each free island
    SELECT grp, count(*) over (partition by grp) cnt,
        srno, seatno, available
    FROM CTE
    WHERE available = 'true'
)
-- first @num seats, in seat order, taken from islands that are large enough
SELECT top(@num)
    srno, seatno, available
FROM CTE2
WHERE cnt >= @num
ORDER BY srno
Result:
srno seatno available
5 500 1
6 600 1
7 700 1
8 800 1
-- Island detection with plain anti-joins only (no window functions).
-- Neighbouring seats differ by 100 in seatno, which is where the
-- +100 / -100 probes and the final division by 100 come from.
-- ---------------------------------------------------------------
WITH island_start AS (
    -- free seat with no free seat directly below it
    SELECT seat.seatno
    FROM theater seat
    WHERE seat.available
      AND NOT EXISTS (
            SELECT *
            FROM theater below
            WHERE below.available
              AND below.seatno = seat.seatno - 100)
)
, island_end AS (
    -- free seat with no free seat directly above it
    SELECT seat.seatno
    FROM theater seat
    WHERE seat.available
      AND NOT EXISTS (
            SELECT *
            FROM theater above
            WHERE above.available
              AND above.seatno = seat.seatno + 100)
)
, island AS (
    -- pair a start with every end at or after it, provided no occupied
    -- seat lies inside the [start, end] range
    SELECT s.seatno AS bot, e.seatno AS top
    FROM island_start s
    JOIN island_end e
      ON e.seatno >= s.seatno
     AND NOT EXISTS (
            SELECT *
            FROM theater blocked
            WHERE NOT blocked.available
              AND blocked.seatno BETWEEN s.seatno AND e.seatno)
)
-- Every consecutive range with its seat count; a caller looking for n
-- seats filters this result on "cnt >= n".
SELECT bot, top
     , 1 + (top - bot) / 100 AS cnt
FROM island;
Result:
bot | top | cnt
-----+-----+-----
100 | 300 | 3
500 | 800 | 4
(2 rows)
Thanks, everyone, but I achieved it as shown below:
-- First n consecutive free seats; n is 2 here (change both literals together).
select srno, seatno
from (
    -- grp_size = number of rows in the island. This must be PARTITION BY grp:
    -- "over (order by grp)" would be a running count over all earlier islands.
    select tx.srno, tx.seatno, tx.available, tx.grp,
           count(*) over (partition by tx.grp) as grp_size
    from (
        -- running sum of the change flags = island number
        select t1.srno, t1.seatno, t1.available,
               sum(t1.group_flag) over (order by t1.srno) as grp
        from (
            -- group_flag is 1 on the first row and wherever availability
            -- differs from the previous seat, NULL otherwise
            select srno, seatno, available,
                   case
                       when lag(available) over (order by srno) = available then null
                       else 1
                   end as group_flag
            from theater
        ) t1
    ) tx
) tr
where tr.available = true
  and tr.grp_size >= 2
order by tr.srno   -- makes LIMIT return the first matching seats
limit 2

Rows inside the greatest streak?

Given the Rows
symbol_id profit date
1 100 2009-08-18 01:01:00
1 100 2009-08-18 01:01:01
1 156 2009-08-18 01:01:04
1 -56 2009-08-18 01:01:06
1 18 2009-08-18 01:01:07
How would I most efficiently select the rows that are involved in the greatest streak (of profit).
The greatest streak would be the first 3 rows, and I would want those rows. The query I came up with is just a bunch of nested queries and derived tables. I am looking for an efficient way to do this using common table expressions or something more advanced.
You haven't defined how 0 profit should be treated or what happens if there is a tie for longest streak. But something like...
-- Longest run of consecutive rows (per symbol, ordered by date) whose profit
-- has the same sign; all rows of the longest run(s) are returned.
;WITH T1 AS
(
    -- Overall position minus position inside the same sign class is constant
    -- while rows of that class are adjacent.
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY symbol_id ORDER BY date) -
           ROW_NUMBER() OVER (PARTITION BY symbol_id, SIGN(profit)
                              ORDER BY date) AS Grp
    FROM Data
), T2 AS
(
    -- Grp is only unique WITHIN one sign class (e.g. for profits +,-,+ the
    -- 2nd and 3rd row both get Grp = 1), so the sign must be part of the
    -- partition or unrelated rows are counted as one streak.
    SELECT *,
           COUNT(*) OVER (PARTITION BY symbol_id, SIGN(profit), Grp) AS StreakLen
    FROM T1
)
SELECT TOP 1 WITH TIES *
FROM T2
ORDER BY StreakLen DESC
Or - if you are looking for most profitable streak
-- Most profitable run of consecutive rows (per symbol, ordered by date).
-- Rows are split into two classes: profit >= 0 (class 1) and the rest (NULL).
;WITH T1 AS
(
    -- Overall position minus position inside the same class is constant
    -- while rows of that class are adjacent.
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY symbol_id ORDER BY date) -
           ROW_NUMBER() OVER (PARTITION BY symbol_id, CASE WHEN profit >= 0 THEN 1 END
                              ORDER BY date) AS Grp
    FROM Data
), T2 AS
(
    -- Grp is only unique WITHIN one class, so the class must be part of the
    -- partition; otherwise a loss row can be summed into a neighbouring
    -- profit streak that happens to have the same Grp value.
    SELECT *,
           SUM(profit) OVER (PARTITION BY symbol_id,
                                          CASE WHEN profit >= 0 THEN 1 END,
                                          Grp) AS StreakProfit
    FROM T1
)
SELECT TOP 1 WITH TIES *
FROM T2
ORDER BY StreakProfit DESC
-- Sample data (table variables use the @ prefix).
declare @T table
(
  symbol_id int,
  profit int,
  [date] datetime
)
insert into @T (symbol_id, profit, [date]) values
(1, 100, '2009-08-18 01:01:00'),
(1, 100, '2009-08-18 01:01:01'),
(1, 156, '2009-08-18 01:01:04'),
(1, -56, '2009-08-18 01:01:06'),
(1, 18 , '2009-08-18 01:01:07')
;with C1 as
(
  -- rn: position of the row in its symbol's timeline
  select *,
         row_number() over(partition by symbol_id order by [date]) as rn
  from @T
),
C2 as
(
  -- Only non-negative rows remain. rn minus the position among them is
  -- constant for rows that were adjacent in the timeline.
  select *,
         rn - row_number() over(partition by symbol_id order by rn) as grp
  from C1
  where profit >= 0
)
-- all rows of the streak(s) with the highest total profit
select top 1 with ties *
from C2
order by sum(profit) over(partition by symbol_id, grp) desc
Result:
symbol_id profit date rn grp
----------- ----------- ----------------------- -------------------- --------------------
1 100 2009-08-18 01:01:00.000 1 0
1 100 2009-08-18 01:01:01.000 2 0
1 156 2009-08-18 01:01:04.000 3 0
If that's a MSSQL server then you want to consider using TOP 3 in your select clause
and ORDER BY PROFIT DESC.
If mysql/postgres you might want to consider using limit in your select clause with
the same order by too.
hope this helps.

How to limit the selection in SQL Server by sum of a column?

Can I limit rows by sum of a column in a SQL Server database?
For example:
Type | Time (in minutes)
-------------------------
A | 50
B | 10
C | 30
D | 20
E | 70
...
And I want to limit the selection by sum of time. For example maximum of 100 minutes. Table must look like this:
Type | Time (in minutes)
-------------------------
A | 50
B | 10
C | 30
Any ideas? Thanks.
-- Sample data (table variables use the @ prefix).
DECLARE @T TABLE
(
[Type] CHAR(1) PRIMARY KEY,
[Time] INT
)
INSERT INTO @T ([Type], [Time])
SELECT 'A',50 UNION ALL
SELECT 'B',10 UNION ALL
SELECT 'C',30 UNION ALL
SELECT 'D',20 UNION ALL
SELECT 'E',70;
-- Upper bound for the running total, in minutes.
DECLARE @MaxTotal BIGINT = 100;
WITH RecursiveCTE
AS (
    -- Anchor: the first row in [Type] order starts the running total.
    SELECT TOP 1 [Type], [Time], CAST([Time] AS BIGINT) AS Total
    FROM @T
    ORDER BY [Type]
    UNION ALL
    -- Step: from all rows after the current one keep only the next one
    -- (rn = 1) and continue while the running total stays within the limit.
    SELECT R.[Type], R.[Time], R.Total
    FROM (
        SELECT T.*,
               T.[Time] + Total AS Total,
               rn = ROW_NUMBER() OVER (ORDER BY T.[Type])
        FROM @T T
        JOIN RecursiveCTE R
          ON R.[Type] < T.[Type]
    ) R
    WHERE R.rn = 1 AND Total <= @MaxTotal
)
SELECT [Type], [Time], Total
FROM RecursiveCTE
-- The anchor row is produced unconditionally; drop it if it alone exceeds the limit.
WHERE Total <= @MaxTotal
OPTION (MAXRECURSION 0);
Or if your table is small
-- Running total via a triangular self-join: every row is joined to itself and
-- to all rows before it in [Type] order. The work grows quadratically with the
-- row count, so this is meant for small tables only.
-- @T is the table variable holding the sample data.
SELECT t1.[Type],
       t1.[Time],
       SUM(t2.[Time]) AS Total
FROM @T t1
JOIN @T t2
  ON t2.[Type] <= t1.[Type]
GROUP BY t1.[Type], t1.[Time]
HAVING SUM(t2.[Time]) <= 100   -- keep rows while the running total is within 100 minutes
ORDER BY t1.[Type]