How to query the V shaped data? - sql

TxnID RunningAmount MemberID
==================================
1 80000 20
2 90000 20
3 70000 20 //<==== Falls but previously never below 100k, hence ignore
4 90000 20
5 110000 20
6 60000 20 //<==== Falls below 100k, hence we want ID 8
7 80000 20
8 120000 20
9 85000 28
...
....
How can I construct a query that groups by member and returns the first transaction ID that completes the "V" shape? Even pseudocode is fine; I can't share an attempt because I am totally clueless about how to do it.
UPDATES:
Sorry for the lack of explanations on the conditions. The base amount we looking is 100k. ID is random, definitely we need to have rownumber
We ignore all transactions before ID = 5 because their runningAmount never exceeded 100k.
Now when ID=5, exceeded 100k, we check if transactions after ID=5 if there is a down trend in runningAmount that falls below 100k.
Immediately we see ID=6 falls below 100k, so we want to find the first transaction that exceed 100k again(if there is).
From the data sample above, the expected result is only one record, which is ID=8.
For every member, there will only be either one or zero record found based on the conditions I've mentioned

Try this query:
-- Sample data. A local temp table is created (a name starting with # cannot be
-- DECLAREd as a table variable), so any later query in the same session can
-- read from #tbl.
create table #tbl (TxnID int, RunningAmount int, MemberID int);
insert into #tbl (TxnID, RunningAmount, MemberID) values
(1, 80000, 20),
(2, 90000, 20),
(3, 70000, 20),
(4, 90000, 20),
(5, 110000, 20),
(6, 60000, 20),
(7, 120000, 20),
(8, 85000, 28);

-- Flags, per member, the row that immediately follows a local minimum of
-- RunningAmount (i.e. the row that "completes" a V).
select TxnID, RunningAmount, MemberID,
       -- shift the local-minimum flag down one row, onto the row after the dip
       LAG(VShape) over (partition by MemberID order by TxnID) as VShape
from (
    select TxnID, RunningAmount, MemberID,
           -- local minimum: ranked lower than both neighbours; the first and
           -- last rows of a member have a NULL neighbour and therefore get 0
           case when rn < lagRn and rn < leadRn then 1 else 0 end as VShape
    from (
        select ranked.*,
               LAG(rn)  over (partition by MemberID order by TxnID) as lagRn,
               LEAD(rn) over (partition by MemberID order by TxnID) as leadRn
        from (
            select TxnID,
                   RunningAmount,
                   MemberID,
                   -- rank of the amount within the member, smallest first
                   ROW_NUMBER() over (partition by MemberID order by RunningAmount) as rn
            from #tbl
        ) ranked
    ) neighbours
) flagged
The last column, VShape, indicates whether the value in RunningAmount completes a V shape (although the question could be clearer about what that means, instead of leaving everybody to figure it out). Now you can filter rows based on RunningAmount (whether it falls below or above 100k).
Here is version for earlier versions of SQL Server that don't have LAG and LEAD functions:
-- Variant without LAG/LEAD: neighbouring rows are found by self-joining on
-- TxnID + 1 / TxnID - 1, so it relies on TxnID being consecutive within a member.
;with cte as (
    -- rank of each amount within its member, smallest first
    select TxnID, RunningAmount, MemberID,
           ROW_NUMBER() over (partition by MemberID order by RunningAmount) rn
    from #tbl
), cte2 as (
    -- attach the rank of the previous (c2) and next (c3) transaction
    select c1.TxnID, c1.RunningAmount, c1.MemberID, c1.rn, c2.rn [lagRn], c3.rn [leadRn]
    from cte c1
    left join cte c2 on c1.TxnID = c2.TxnID + 1 and c1.MemberID = c2.MemberID
    left join cte c3 on c1.TxnID = c3.TxnID - 1 and c1.MemberID = c3.MemberID
), cte3 as (
    -- 1 when the row is a local minimum (ranked below both neighbours)
    select TxnID, RunningAmount, MemberID,
           case when rn < lagRn and rn < leadRn then 1 else 0 end VShape
    from cte2
), FinalResult as (
    -- move the flag from the dip onto the row that follows it
    select c1.TxnID, c1.RunningAmount, c1.MemberID, c2.VShape
    from cte3 c1
    left join cte3 c2 on c1.TxnID = c2.TxnID + 1 and c1.MemberID = c2.MemberID
)
select fr.*, fr2.RunningAmount RunningAmountLagBy2
from FinalResult fr
-- fr2 is the row two transactions earlier (the one before the dip). It must
-- belong to the same member, otherwise rows of different members can pair up.
-- The WHERE clause requires fr2 to exist, so this is an inner join.
inner join FinalResult fr2
    on fr.TxnID = fr2.TxnID + 2
   and fr.MemberID = fr2.MemberID
where fr.RunningAmount > 100000 and fr2.RunningAmount > 100000 and fr.VShape = 1
UPDATE
After question update, here's solution:
-- Per member, returns the first transaction that reaches 100k again after the
-- running amount has (1) reached 100k at least once and (2) then dropped below
-- 100k. At most one row per member.
;with peaks as (
    -- PeaksSoFar: how many rows at or above 100k the member has had up to this row
    select TxnID,
           RunningAmount,
           MemberID,
           sum(case when RunningAmount >= 100000 then 1 else 0 end)
               over (partition by MemberID order by TxnID
                     rows between unbounded preceding and current row) as PeaksSoFar
    from #tbl
), dips as (
    -- DipsSoFar: rows below 100k that occur after the first peak; drops that
    -- happen before the member ever reached 100k are deliberately not counted
    select TxnID,
           RunningAmount,
           MemberID,
           sum(case when PeaksSoFar >= 1 and RunningAmount < 100000 then 1 else 0 end)
               over (partition by MemberID order by TxnID
                     rows between unbounded preceding and current row) as DipsSoFar
    from peaks
), recoveries as (
    -- rows back at or above 100k once a dip has been seen, numbered per member
    select TxnID,
           MemberID,
           ROW_NUMBER() over (partition by MemberID order by TxnID) as CompletesVShape
    from dips
    where DipsSoFar >= 1 and RunningAmount >= 100000
)
select TxnID
from recoveries
where CompletesVShape = 1

Based on your question update, and assuming that the necessary condition for a V shape is that the rows before and after the dip have running amounts > 100000 and the middle row is smaller than both of them, below is a query showing how to do it in SQL Server 2008.
also see live demo
-- Finds local minima (rows smaller than both neighbours) per member and returns
-- the row after each one, restricted to dips that occur after the member's
-- first transaction above 100000.
; with firstlargeamount as
(
    -- first transaction per member whose running amount is above the threshold
    select MemberId, minTrxid=min(TxnID)
    from t
    where RunningAmount>100000
    group by MemberId
)
,tbl as
(
    -- gap-free sequence per member so neighbours can be addressed as rn +/- 1
    select *,
           rn=row_number() over( partition by MemberId order by TxnId)
    from t
)
select t3.*,f.*
from tbl t1                 -- t1: the dip
join tbl t2                 -- t2: the row just before the dip
  on t1.memberId=t2.memberid and t1.rn=t2.rn +1
 and t1.RunningAmount<t2.RunningAmount
join tbl t3                 -- t3: the row just after the dip (returned)
  on t1.memberId=t3.memberid and t1.rn=t3.rn -1
 and t1.RunningAmount<t3.RunningAmount
join firstlargeamount f
  -- The row before the dip must be at or after the first >100000 transaction.
  -- The previous comparison (f.minTrxid >= t1.TxnID) selected only dips that
  -- happen BEFORE the threshold was first exceeded, which are the ones to ignore.
  on f.Memberid=t2.memberid and f.minTrxid<=t2.TxnID
Explanation:
First step is to generate a row number sequence at member level as cte tbl and min limiting transaction in cte firstlargeamount
Second step is double self join to find above and below records per row which satisfy the V shape criteria as well join with firstlargeamount to find rows which satisfy the 100000 criteria
Note that the above and below records are simply found using +1/-1 from the current records's row number computed in the step 1

Related

Finding last and second last date and corresponding values

Consider the following schema (fiddle):
-- Fixture: meters and their dated readings (meter 5 has no readings).
CREATE TABLE meters (
    id          int,
    description varchar(10)
);

INSERT INTO meters (id, description) VALUES (1, 'this');
INSERT INTO meters (id, description) VALUES (2, 'is');
INSERT INTO meters (id, description) VALUES (3, 'not');
INSERT INTO meters (id, description) VALUES (4, 'really');
INSERT INTO meters (id, description) VALUES (5, 'relevant');

CREATE TABLE readings (
    id      int,
    meterid int,
    date    date,
    value   int
);

INSERT INTO readings (id, meterid, date, value) VALUES (1, 4, '20081231', 500);
INSERT INTO readings (id, meterid, date, value) VALUES (2, 4, '20090203', 550);
INSERT INTO readings (id, meterid, date, value) VALUES (3, 1, '20090303', 300);
INSERT INTO readings (id, meterid, date, value) VALUES (4, 2, '20090303', 244);
INSERT INTO readings (id, meterid, date, value) VALUES (5, 4, '20090303', 600);
INSERT INTO readings (id, meterid, date, value) VALUES (6, 1, '20090403', 399);
INSERT INTO readings (id, meterid, date, value) VALUES (7, 2, '20090403', 288);
INSERT INTO readings (id, meterid, date, value) VALUES (8, 3, '20090403', 555);
For each meter.id I need to find the latest reading date, value, and the value difference vs previous reading.
For the sample data my output would look like this (plus some other columns from meters):
meterid
latest
value
delta value
1
20090403
399
99
2
20090403
288
44
3
20090403
555
null
4
20090303
600
50
5
null
null
null
I figured I could first create a query with the relevant info and then join with that, but I struggle with achieving that
I've tried to adapt this method but for each id I get 2 rows instead of one
-- Asker's attempt: pivot the two most recent reading dates of each meter into
-- the columns LastDate / BeforeLastDate.
SELECT
p.meterid,
[1] AS [LastDate],
[2] AS [BeforeLastDate]
FROM
-- The ORDER BY expression (RowN - 1) / 2 + 1 evaluates to 1 for RowN 1 and 2,
-- so TOP (2) WITH TIES is meant to keep the two newest rows of every meter.
(SELECT TOP (2) WITH ties
*,
RowN = ROW_NUMBER() OVER (PARTITION BY r.meterid ORDER BY date DESC)
FROM
readings AS r
ORDER BY
(ROW_NUMBER() OVER (PARTITION BY r.meterid ORDER BY date DESC) - 1) / 2 + 1) a
-- NOTE(review): because the derived table selects *, it also carries id and
-- value; PIVOT groups by every column that is not aggregated or pivoted, which
-- presumably explains the two rows per meter -- confirm against the fiddle.
PIVOT
(MAX(date) FOR RowN IN ([1], [2])) p
ORDER BY
p.meterId
I'm looking for ideas how to solve the double row issue, or if that's a dead end how to get my desired output
If I understand correctly, you can just use window functions:
-- One row per meter: its most recent reading and the change versus the reading
-- before it. Meters without readings are kept, with NULLs.
with ranked_readings as (
    select rd.*,
           -- value of the chronologically previous reading of the same meter
           lag(rd.value) over (partition by rd.meterid order by rd.date) as prev_value,
           -- 1 = newest reading of the meter
           row_number() over (partition by rd.meterid order by rd.date desc) as seqnum
    from readings rd
)
select m.id, r.date, r.value, r.value - r.prev_value
from meters m
left join ranked_readings r
       on r.meterid = m.id
      and r.seqnum = 1
order by m.id;
No aggregation is necessary. Here is a db<>fiddle.
Use LEAD to get the next value going backwards, and ROW_NUMBER to get the first row.
-- Newest reading per meter plus the difference to the reading before it.
SELECT *
FROM
    (SELECT *,
            -- with a descending order, LEAD looks at the chronologically previous reading
            delta_value = value - LEAD(value) OVER (PARTITION BY r.meterid ORDER BY date DESC),
            -- 1 = newest reading of the meter
            RowN = ROW_NUMBER() OVER (PARTITION BY r.meterid ORDER BY date DESC)
     FROM readings AS r
    ) a
WHERE RowN = 1
-- the derived table is aliased "a"; the old "p" alias does not exist in this query
ORDER BY a.meterid

Select the matching/sum up total in SQL Server

I want to get all the id's that matches/sum up my total qty.
Example if my total qty is 40 then my query will stops until it sums up all the qty at exactly or greater than 40.
See screenshots
If you are using sql server 2012 and above, you can use this script.
-- Returns, per FK_iwItems, the rows (newest PK_TRXNO first) needed for the
-- running quantity to reach 40: every row while the total is still below 40,
-- plus the single row on which it first becomes 40 or more.
-- Assumes qty is never negative, so the running total never decreases.
;WITH CTE AS (
    SELECT PK_TRXNO, FK_iwItems, qty,
           total = SUM(qty) OVER (PARTITION BY FK_iwItems ORDER BY PK_TRXNO DESC ROWS UNBOUNDED PRECEDING)
    FROM #MyTable
)
, CTE2 AS (
    SELECT *,
           -- Numbers rows separately below / at-or-above the target, per item.
           -- Using >= (not >) stops an exact hit from also pulling in the next
           -- row; partitioning by FK_iwItems keeps items independent.
           RN = ROW_NUMBER() OVER (PARTITION BY FK_iwItems, (CASE WHEN total >= 40 THEN 1 ELSE 0 END)
                                   ORDER BY PK_TRXNO DESC)
    FROM CTE
)
SELECT * FROM CTE2
WHERE total < 40 OR (total >= 40 AND RN = 1)

How do I get N records before given one?

How do I get N records before given one?
I have the following table structure:
Id, Message
1, John Doe
2, Jane Smith
3, Error
4, Jane Smith
5, Michael Pirs
7, Gabriel Angelos
8, Error
Is there a way to get the N records before each Error and join all such records?
So the expected result for the N =2 will be
1, John Doe
2, Jane Smith
5, Michael Pirs
7, Gabriel Angelos
Fiddle
You need to create a row number column if your Ids do not increment without gaps. Then you can use a simple join to find the previous N. Your previous N could overlap... so you have to add distinct if you do not want duplicates.
-- Returns the N rows that precede each 'Error' row, without duplicates and
-- without other 'Error' rows.
declare @N as integer   -- T-SQL variables are prefixed with @
set @N = 2
;with cte_tbl (Id, Message, rownum) AS
(
    -- gap-free sequence, because Id values may have holes
    select Id, Message, ROW_NUMBER() over (order by Id) as rownum from test
)
select distinct Prev.Id, Prev.Message
from cte_tbl
join cte_tbl Prev
  on Prev.rownum between cte_tbl.rownum - @N and cte_tbl.rownum - 1
where cte_tbl.Message = 'Error'
  and Prev.Message <> 'Error'
order by Prev.Id
If one of the previous N records is itself an error, that 'Error' record will NOT show up. If you do want those included, simply remove the line "and Prev.Message <> 'Error'".
You can do this using cross apply. The logic is a bit different from typical applications, because you only want the records from the cross apply subquery:
-- For every 'Error' row, fetch the (up to) two rows with the largest ids below it.
-- [table] is a placeholder name (bracketed because TABLE is a reserved word);
-- substitute the real table.
select t2.*
from [table] t cross apply
     (select top 2 t2.*          -- columns of the earlier rows, not of the outer row
      from [table] t2
      where t2.id < t.id
      order by t2.id desc
     ) t2
where t.message = 'Error';       -- the OUTER row is the error; t2 holds its predecessors
For those inclined, there is also a method using window functions, but it is a little more cumbersome. Do a reverse cumulative sum of Error records to identify values before a given error. Then enumerate these and choose the ones you want:
-- Window-function variant. [table] is a placeholder name (bracketed because
-- TABLE is a reserved word); substitute the real table.
select t.id, t.message
from (select t.*,
             -- position inside the group, counted backwards; the Error row itself is 1
             row_number() over (partition by grp order by id desc) as seqnum
      from (select t.*,
                   -- reverse running count of Error rows: each Error row and the
                   -- rows leading up to it share one grp value
                   sum(case when message = 'Error' then 1 else 0 end) over
                       (order by id desc) as grp
            from [table] t
           ) t
     ) t
-- grp = 0 holds rows that come after the last Error; they precede no error
where grp > 0
  and seqnum between 2 and 3;
Note that the filter is between 2 and 3, because 'Error' has a value of 1.
Get all the rows that are 'Error' and join them with the ids just before them, assuming your IDs are consecutive. If they aren't, generate a consecutive id with the help of ROW_NUMBER().
You can try this:
-- Rows whose id lies 1 or 2 below the id of an 'Error' row (error rows themselves excluded).
select T.*
from myTable as T
inner join (
        select id as iderror
        from myTable
        where Message = 'Error'
     ) as errorRows
    on T.Message <> 'Error'
   and T.id between errorRows.iderror - 2 and errorRows.iderror - 1
This would be a little bit easier if you were using an identity field for ID, then you would have continuous numbers, but you can use this method. I am Ranking the rows and then returning the ones prior to the error.
-- Ranks rows by id to get a gap-free position, then returns the two positions
-- preceding each 'Error' row together with the id of that error.
;with ranked as (
    select rank() over (order by id) as Rank_ID, id, message
    from tbl_test
)
select t1.Rank_ID, t1.id, t1.message, te.id
from ranked as te
inner join ranked as t1
    on t1.Rank_ID >= te.Rank_ID - 2
   and t1.Rank_ID <= te.Rank_ID - 1
where te.message = 'Error'

SQL query for column threaded relationship

This is a simplified view of a table. I apologize, but I could not save a picture of the table so I hope this is ok.
c1___c2
1____a
1____b
2____a
2____b
2____c
2____d
3____e
3____a
4____z
5____d
The result is that due to the relationships of column C2,
Group 1 would include, 1,2,3,5 (because they have overlapping c2 values basically stating a=b=c=d=e)
Group 2 would include 4
I have millions of rows with this kind of data and currently there is a cursor job that runs x number of times to build these groups. I am able to visualize how this should work, but I have not been able to build a query that can pull out this relationship.
Any suggestions?
Thank you
Tested on SQL Server 2012:
-- Numbers the c1 values so that c1 values linked through shared c2 values are
-- intended to receive the same group_number.
-- t: every (c1, c2) row of Test plus c1_min, the smallest c1 that has the same c2.
WITH t AS (
SELECT
t.c1,
t.c2,
tm.c1_min
FROM
Test t
JOIN
(
SELECT
c2,
MIN(c1) AS c1_min
FROM
Test
GROUP BY
c2
) AS tm
ON
t.c2 = tm.c2
),
-- rt: recursive expansion. The anchor takes every (c1_min, c1) pair from t with
-- cnt = 1; each recursive step follows rt.c1 to rows of t whose c1_min equals it
-- and whose c1 is strictly larger, keeping the starting c1_min and adding 1 to cnt.
rt AS (
SELECT
c1_min,
c1,
1 AS cnt
FROM
t
UNION ALL
SELECT
rt.c1_min,
t.c1,
rt.cnt + 1 AS cnt
FROM
rt
JOIN
t
ON
rt.c1 = t.c1_min
AND
rt.c1 < t.c1
)
SELECT
-- running total of the "new group starts here" flag, in ord order
SUM(t.rst) OVER (ORDER BY t.ord ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS group_number,
t.c1
FROM
(
SELECT
t.c1,
t.rst,
t.ord
FROM
(
SELECT
rt.c1,
-- rst = 0 when this row's c1_min equals that of the preceding row in
-- (c1_min, c1) order, otherwise 1 (including the first row, which has no
-- preceding row)
CASE
WHEN rt.c1_min = MIN(rt.c1_min) OVER (ORDER BY rt.c1_min, rt.c1 ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) THEN 0
ELSE 1
END AS rst,
-- ord: position of the row in (c1_min, c1) order
ROW_NUMBER() OVER (ORDER BY rt.c1_min, rt.c1) AS ord,
-- qfy = 1 marks, for each c1, the row with the smallest c1_min (then smallest cnt)
ROW_NUMBER() OVER (PARTITION BY rt.c1 ORDER BY rt.c1_min, rt.cnt) AS qfy
FROM
rt
) AS t
-- keep one row per c1; rst and ord were computed over all rt rows before this filter
WHERE
t.qfy = 1
) AS t;

Calculating percentile rankings in MS SQL

What's the best way to calculate percentile rankings (e.g. the 90th percentile or the median score) in MSSQL 2005?
I'd like to be able to select the 25th, median, and 75th percentiles for a single column of scores (preferably in a single record so I can combine with average, max, and min). So for example, table output of the results might be:
Group MinScore MaxScore AvgScore pct25 median pct75
----- -------- -------- -------- ----- ------ -----
T1 52 96 74 68 76 84
T2 48 98 74 68 75 85
I would think that this would be the simplest solution:
SELECT TOP N PERCENT * FROM TheTable ORDER BY TheScore DESC -- N is a placeholder for the percentage; a select list (*) is required
Where N = (100 - desired percentile). So if you wanted all rows in the 90th percentile, you'd select the top 10%.
I'm not sure what you mean by "preferably in a single record". Do you mean calculate which percentile a given score for a single record would fall into? e.g. do you want to be able to make statements like "your score is 83, which puts you in the 91st percentile." ?
EDIT: OK, I thought some more about your question and came up with this interpretation. Are you asking how to calculate the cutoff score for a particular percentile? e.g. something like this: to be in the 90th percentile you must have a score greater than 78.
If so, this query works. I dislike sub-queries though, so depending on what it was for, I'd probably try to find a more elegant solution. It does, however, return a single record with a single score.
-- Cutoff for the 90th percentile: the lowest score among the top 10 percent of scores
SELECT MIN(top_decile.TheScore)
FROM (
    SELECT TOP 10 PERCENT TheScore
    FROM TheTable
    ORDER BY TheScore DESC
) AS top_decile
Check out the NTILE command -- it will give you percentiles pretty easily!
-- Side-by-side comparison of the ranking functions over OrderQty for two orders.
SELECT
    SalesOrderID,
    OrderQty,
    ROW_NUMBER() OVER (ORDER BY OrderQty) AS RowNum,    -- unique sequence
    RANK()       OVER (ORDER BY OrderQty) AS Rnk,       -- ties share a rank, gaps follow
    DENSE_RANK() OVER (ORDER BY OrderQty) AS DenseRnk,  -- ties share a rank, no gaps
    NTILE(4)     OVER (ORDER BY OrderQty) AS NTile4     -- quartile bucket 1..4
FROM Sales.SalesOrderDetail
WHERE SalesOrderID = 43689
   OR SalesOrderID = 63181
How about this:
-- Approximate 75th / 90th percentile score per group using NTILE buckets.
-- Window functions cannot be nested inside an aggregate, so the buckets are
-- computed in a derived table first. [Group] and the digit-leading column
-- names are bracketed because they are not valid bare identifiers.
SELECT
    [Group],
    [75_percentile] = MAX(CASE WHEN quartile = 3 THEN score ELSE 0 END),
    [90_percentile] = MAX(CASE WHEN decile = 9 THEN score ELSE 0 END)
FROM (
    SELECT
        [Group],
        score,
        -- buckets are assigned within each group so every group gets its own percentiles
        NTILE(4)  OVER (PARTITION BY [Group] ORDER BY score ASC) AS quartile,
        NTILE(10) OVER (PARTITION BY [Group] ORDER BY score ASC) AS decile
    FROM TheScore
) AS bucketed
GROUP BY [Group]
I've been working on this a little more, and here's what I've come up with so far:
-- Computes the score at a given percentile of TestScores.dblScore, linearly
-- interpolating between the two neighbouring scores when no row sits exactly
-- on the requested percentile.
--   @percentile : requested percentile as a fraction (e.g. .25)
--   @resultval  : OUTPUT, the interpolated score (NULL when no pair of rows brackets it)
CREATE PROCEDURE [dbo].[TestGetPercentile]
    @percentile AS float,
    @resultval AS float OUTPUT
AS
BEGIN
    -- curr_rank = row position / (row count + 1); prev_rank / next_rank are the
    -- same fraction for the neighbouring positions
    WITH scores(score, prev_rank, curr_rank, next_rank) AS (
        SELECT dblScore,
               (ROW_NUMBER() OVER ( ORDER BY dblScore ) - 1.0) / ((SELECT COUNT(*) FROM TestScores) + 1) [prev_rank],
               (ROW_NUMBER() OVER ( ORDER BY dblScore ) + 0.0) / ((SELECT COUNT(*) FROM TestScores) + 1) [curr_rank],
               (ROW_NUMBER() OVER ( ORDER BY dblScore ) + 1.0) / ((SELECT COUNT(*) FROM TestScores) + 1) [next_rank]
        FROM TestScores
    )
    SELECT @resultval = (
        SELECT TOP 1
               CASE WHEN t1.score = t2.score
                    THEN t1.score
                    ELSE
                        -- linear interpolation between the lower (t1) and upper (t2) score
                        t1.score + (t2.score - t1.score) * ((@percentile - t1.curr_rank) / (t2.curr_rank - t1.curr_rank))
               END
        FROM scores t1
        CROSS JOIN scores t2
        -- t1: row at, or immediately below, the requested percentile
        WHERE (t1.curr_rank = @percentile OR (t1.curr_rank < @percentile AND t1.next_rank > @percentile))
        -- t2: row at, or immediately above, the requested percentile
          AND (t2.curr_rank = @percentile OR (t2.curr_rank > @percentile AND t2.prev_rank < @percentile))
    )
END
Then in another stored procedure I do this:
-- Collects the 25th / 50th / 75th percentiles through the percentile procedure
-- and returns them next to min / max / avg in a single row.
DECLARE @pct25 float;
DECLARE @pct50 float;
DECLARE @pct75 float;
-- calls the procedure under the name it is created with (dbo.TestGetPercentile)
EXEC dbo.TestGetPercentile .25, @pct25 OUTPUT
EXEC dbo.TestGetPercentile .50, @pct50 OUTPUT
EXEC dbo.TestGetPercentile .75, @pct75 OUTPUT
SELECT
    MIN(dblScore) AS minScore,
    MAX(dblScore) AS maxScore,
    AVG(dblScore) AS avgScore,
    @pct25 AS percentile25,
    @pct50 AS percentile50,
    @pct75 AS percentile75
FROM TestScores
It still doesn't do quite what I'm looking for. This will get the stats for all tests; whereas I would like to be able to select from a TestScores table that has multiple different tests in it and get back the same stats for each different test (like I have in my example table in my question).
The 50th percentile is same as the median. When computing other percentile, say the 80th, sort the data for the 80 percent of data in ascending order and the other 20 percent in descending order, and take the avg of the two middle value.
NB: The median query has been around for a long time, but cannot remember where exactly I got it from, I have only amended it to compute other percentiles.
-- Fixture for the percentile queries. A local temp table is created because a
-- name starting with # cannot be DECLAREd as a table variable; queries that
-- read FROM #Temp work against it unchanged.
CREATE TABLE #Temp (Id INT IDENTITY(1,1), DATA DECIMAL(10,5))
INSERT INTO #Temp (DATA) VALUES(0)
INSERT INTO #Temp (DATA) VALUES(2)
INSERT INTO #Temp (DATA) VALUES(8)
INSERT INTO #Temp (DATA) VALUES(4)
INSERT INTO #Temp (DATA) VALUES(3)
INSERT INTO #Temp (DATA) VALUES(6)
INSERT INTO #Temp (DATA) VALUES(6)
INSERT INTO #Temp (DATA) VALUES(6)
INSERT INTO #Temp (DATA) VALUES(7)
INSERT INTO #Temp (DATA) VALUES(0)
INSERT INTO #Temp (DATA) VALUES(1)
-- NULL row: the percentile queries are expected to skip it
INSERT INTO #Temp (DATA) VALUES(NULL)
-- Each query averages two values: the largest value of the lower P percent and
-- the smallest value of the upper (100 - P) percent of the non-NULL data.

-- 50th percentile (median)
SELECT (
        (SELECT MAX(lower_part.DATA)
         FROM (SELECT TOP 50 PERCENT DATA
               FROM #Temp
               WHERE DATA IS NOT NULL
               ORDER BY DATA ASC) AS lower_part)
      + (SELECT MIN(upper_part.DATA)
         FROM (SELECT TOP 50 PERCENT DATA
               FROM #Temp
               WHERE DATA IS NOT NULL
               ORDER BY DATA DESC) AS upper_part)
       ) / 2.0

-- 90th percentile
SELECT (
        (SELECT MAX(lower_part.DATA)
         FROM (SELECT TOP 90 PERCENT DATA
               FROM #Temp
               WHERE DATA IS NOT NULL
               ORDER BY DATA ASC) AS lower_part)
      + (SELECT MIN(upper_part.DATA)
         FROM (SELECT TOP 10 PERCENT DATA
               FROM #Temp
               WHERE DATA IS NOT NULL
               ORDER BY DATA DESC) AS upper_part)
       ) / 2.0

-- 75th percentile
SELECT (
        (SELECT MAX(lower_part.DATA)
         FROM (SELECT TOP 75 PERCENT DATA
               FROM #Temp
               WHERE DATA IS NOT NULL
               ORDER BY DATA ASC) AS lower_part)
      + (SELECT MIN(upper_part.DATA)
         FROM (SELECT TOP 25 PERCENT DATA
               FROM #Temp
               WHERE DATA IS NOT NULL
               ORDER BY DATA DESC) AS upper_part)
       ) / 2.0
i'd do something like:
-- Row positions of the median / 75th / 90th percentile, derived from the row count.
declare @n int, @median int, @p75 int, @p90 int
select @n = count(*) from tbl1
select @median = @n / 2
select @p75 = @n * 3 / 4
select @p90 = @n * 9 / 10
-- the median is the largest score among the lowest @median rows;
-- TOP with a variable needs parentheses and the derived table needs an alias
select top 1 score
from (select top (@median) score from tbl1 order by score asc) as lower_half
order by score desc
is this right?
I'd probably use the SQL Server 2005 window function:
row_number() over (order by score) * 1.0 / (select count(*) from scores) -- * 1.0 forces decimal division; integer division would yield 0 for every row but the last
or something along those lines.
Percentile is calculated by
(Rank -1) /(total_rows -1) when you sort values in ascending order.
The below query will give you percentile value between 0 and 1. Person with lowest marks will have 0 percentile.
-- Percentile rank in the range 0..1: (rank - 1) / (total_rows - 1), lowest marks = 0.
-- [table] is a placeholder name (bracketed because TABLE is a reserved word).
SELECT Name, marks,
       -- * 1.0 forces decimal division (integer division would truncate to 0);
       -- NULLIF avoids a divide-by-zero error when the table has a single row
       (rank_1 - 1) * 1.0 / NULLIF((SELECT COUNT(*) AS total_1 FROM [table]) - 1, 0) AS percentile_rank
FROM
(
    SELECT Name,
           Marks,
           RANK() OVER (ORDER BY Marks) AS rank_1
    FROM [table]
) AS A