MS SQL LAST_VALUE() Order by - sql

I need to pick the last value for the group_id ordered by id from the following example table:
-- Rebuild the sample table from scratch so the script can be re-run.
DROP TABLE IF EXISTS #temp1;

CREATE TABLE #temp1 (
    group_id int,
    id       int,
    val      varchar(10)
);

-- Same six rows, in the same order, loaded with one multi-row INSERT.
INSERT INTO #temp1 (group_id, id, val)
VALUES
    (1111, 1, 'Yes'),
    (1111, 2, 'No'),
    (1111, 3, NULL),
    (2222, 5, 'No'),
    (2222, 3, NULL),
    (2222, 1, 'No');
The expected result is 1111 - Yes and 2222 - No.
If I write the following query, it seems to pick the last value based on how the rows are ordered in the table and not by id column.
-- First attempt: LAST_VALUE of val per group_id, collapsed to one row per group with MAX.
-- The window is ordered by group_id, the same column it is partitioned by, so all rows
-- of a partition tie in the ordering and the id column has no influence on which
-- value is treated as "last".
SELECT group_id, MAX(last_val)
FROM
(
SELECT a.group_id, LAST_VALUE(a.val) OVER (PARTITION BY a.group_id ORDER BY a.group_id) AS last_val FROM #temp1 a
) a
GROUP BY group_id
If I write the following, it seems to do a Max of val alphabetically:
-- Second attempt: the window is ordered by id, but no frame is specified.
-- With ORDER BY and no frame clause the frame ends at the current row, so LAST_VALUE
-- returns each row's own val; MAX then picks the greatest non-NULL val of the group.
SELECT group_id, MAX(last_val)
FROM
(
SELECT a.group_id, LAST_VALUE(a.val) OVER (PARTITION BY a.group_id ORDER BY a.id) AS last_val FROM #temp1 a
) a
GROUP BY group_id
In both cases, the results are different from what I need. Can someone please suggest how to get the val for the last id?

First, I recommend FIRST_VALUE() with a descending sort. Then, you need to use the right ORDER BY column:
-- val of the highest-id row in each group_id.
-- Sorting by id descending puts the wanted row first, so FIRST_VALUE yields the same
-- value on every row of the group; MAX + GROUP BY then collapses it to one row.
SELECT
    ranked.group_id,
    MAX(ranked.last_val)
FROM (
    SELECT
        src.group_id,
        FIRST_VALUE(src.val) OVER (
            PARTITION BY src.group_id
            ORDER BY src.id DESC
        ) AS last_val
    FROM #temp1 AS src
) AS ranked
GROUP BY ranked.group_id;
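Why do I prefer FIRST_VALUE() for this? The issue is the default window frame, which is RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW when the window has an ORDER BY. Because that frame ends at the current row, LAST_VALUE() returns the current row's value rather than the last value in the partition, which is rarely what you expect.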

Why do you group twice ? couldn't you just alter the window ?
-- val of the highest-id row in each group_id, one row per group.
-- The table must carry the alias "a": every column reference below is qualified
-- with it, and without the alias the query fails to bind.
-- The explicit full-partition frame makes the result independent of the default frame.
SELECT DISTINCT
    a.group_id,
    FIRST_VALUE(a.val) OVER (
        PARTITION BY a.group_id
        ORDER BY a.id DESC
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS last_val
FROM #temp1 AS a;

Related

Latest entry in a group SQL Server

Given the sample data below, I need a list of the ids whose latest entry is Rejected. Thus, I need to see id 2 because its latest is 6/4/2020 and that is Rejected. I do not want to see id 1 as its latest entry is Requested.
-- Sample status history: one row per (id, mydate).
CREATE TABLE #temp (
    id     int,
    mydate datetime,
    status VARCHAR(20)
);

-- Same eight rows, in the same order, loaded with one multi-row INSERT.
INSERT INTO #temp (id, mydate, status)
VALUES
    (1, '6/1/2020', 'Rejected'),
    (1, '6/2/2020', 'Requested'),
    (1, '6/3/2020', 'Rejected'),
    (1, '6/4/2020', 'Requested'),
    (2, '6/1/2020', 'Requested'),
    (2, '6/2/2020', 'Requested'),
    (2, '6/3/2020', 'Requested'),
    (2, '6/4/2020', 'Rejected');
-- Show every sample row.
SELECT * FROM #temp
-- Latest 'Rejected' date per id. The WHERE clause removes all non-Rejected rows
-- before grouping, so this cannot tell whether an id's overall latest entry is Rejected.
SELECT id, MAX(mydate)
FROM #temp
WHERE status = 'Rejected'
GROUP BY id
This is my pathetic attempt so far
-- Latest 'Rejected' date per id. The WHERE clause removes all non-Rejected rows
-- before grouping, so this cannot tell whether an id's overall latest entry is Rejected.
SELECT id, MAX(mydate)
FROM #temp
WHERE status = 'Rejected'
GROUP BY id
But this will only bring back the latest date in each group. I need a list where the latest entry was Rejected. I expect the answer to be embarrassingly simple but I'm having a heck of a time with this.
Thanks
Carl
You can get this using row_number() function as shown below.
-- Ids whose most recent entry (by mydate) is 'Rejected'.
-- Rows are numbered per Id only. Partitioning by status as well would number each
-- status separately, so the latest 'Rejected' row of every Id would get row_num = 1
-- even when a later row with a different status exists.
;WITH cte
AS (
    SELECT Id
        ,mydate
        ,STATUS
        ,ROW_NUMBER() OVER (
            PARTITION BY Id ORDER BY mydate DESC
            ) row_num
    FROM #temp
    )
SELECT *
FROM cte
WHERE row_num = 1
    AND STATUS = 'Rejected'
Here is the live db<>fiddle demo.
One method uses aggregation and having:
-- Ids whose newest 'Rejected' date is also their newest date overall.
SELECT src.id
FROM #temp AS src
GROUP BY src.id
HAVING MAX(src.mydate) = MAX(CASE WHEN src.status = 'Rejected' THEN src.mydate END);
This is almost a direct translation of your question: the latest date for 'Rejected' is the latest date for a given id.
More traditional methods use a correlated subquery:
-- Rows that are 'Rejected' and carry the newest mydate of their id.
SELECT cur.*
FROM #temp AS cur
WHERE cur.status = 'Rejected'
  AND cur.mydate = (
        SELECT MAX(peer.mydate)
        FROM #temp AS peer
        WHERE peer.id = cur.id
      );
Or window functions:
-- Number each id's rows newest-first, then keep the newest one if it is 'Rejected'.
SELECT ranked.*
FROM (
    SELECT
        src.*,
        ROW_NUMBER() OVER (PARTITION BY src.id ORDER BY src.mydate DESC) AS seqnum
    FROM #temp AS src
) AS ranked
WHERE ranked.status = 'Rejected'
  AND ranked.seqnum = 1;

Finding a random sample of unique data across multiple columns - SQL Server

Given a set of data in a SQL Server database with the following columns
AccountID, UserID_Salesperson, UserID_Servicer1, UserID_Servicer2
All three columns are primary keys from the same users table. I need to find a random sample that will include every UserID available in all three columns no matter the position while guaranteeing the fewest unique AccountID's possible.
--SET UP TEST DATA
-- One row per account with the three user ids attached to it.
CREATE TABLE MY_TABLE (
    AccountID          int,
    UserID_Salesperson int,
    UserID_Servicer1   int,
    UserID_Servicer2   int
);

-- Same four accounts, in the same order, loaded with one multi-row INSERT.
INSERT INTO MY_TABLE (AccountID, UserID_Salesperson, UserID_Servicer1, UserID_Servicer2)
VALUES
    (12345, 1, 1, 2),
    (12346, 3, 2, 1),
    (12347, 4, 3, 1),
    (12348, 1, 2, 3);
--VIEW THE NEW TABLE
SELECT * FROM MY_TABLE
--NORMALIZE DATA (Unique List of UserID's)
-- Stacks the three UserID columns into a single column, each tagged with its Position.
-- The outer DISTINCT reduces the result to one row per user id, regardless of Position.
SELECT DISTINCT MyDistinctUserIDList
FROM
(SELECT UserID_Salesperson as MyDistinctUserIDList, 'Sales' as Position
FROM MY_TABLE
UNION
SELECT UserID_Servicer1, 'Service1' as Position
FROM MY_TABLE
UNION
SELECT UserID_Servicer2, 'Service2' as Position
FROM MY_TABLE) MyDerivedTable
--NORMALIZED DATA
-- Same stacking, but AccountID is kept, so each row is (account, user id, position).
SELECT *
FROM
(SELECT AccountID, UserID_Salesperson as MyDistinctUserIDList, 'Sales' as Position
FROM MY_TABLE
UNION
SELECT AccountID, UserID_Servicer1, 'Service1' as Position
FROM MY_TABLE
UNION
SELECT AccountID, UserID_Servicer2, 'Service2' as Position
FROM MY_TABLE) MyDerivedTable
-- Remove the test table.
DROP TABLE MY_TABLE
For this example table, I could select AccountID (12347 and 12348) OR (12347 and 12346) to get the least accounts with all users.
My current solution is inefficient and can make mistakes. I currently select a random AccountID, insert the data into a temp table, and then try to find the next row to insert from something I have not already put in the temp table. I loop through the records until it finds something not used before… and after a few thousand loops it gives up and selects any record.
I don't know how you guarantee the fewest account ids, but you can get one row per user id using:
-- One randomly chosen account row for each user id found in any of the three columns.
-- CROSS APPLY unpivots the three user columns; NEWID() randomizes the per-user order.
SELECT picked.*
FROM (
    SELECT
        acct.*,
        ROW_NUMBER() OVER (PARTITION BY u.UserID ORDER BY NEWID()) AS seqnum
    FROM my_table AS acct
    CROSS APPLY (
        VALUES
            (acct.UserID_Salesperson),
            (acct.UserID_Servicer1),
            (acct.UserID_Servicer2)
    ) AS u (UserID)
) AS picked
WHERE picked.seqnum = 1;
Your original table doesn't have a primary key. Assuming that there is one row per account, you can dedup this so it doesn't have duplicate accounts:
-- Same random pick per user id, but each account is returned at most once.
-- TOP (1) WITH TIES keeps every row that ties for the first ORDER BY value, and that
-- value is 1 for exactly one row per accountID.
SELECT TOP (1) WITH TIES picked.*
FROM (
    SELECT
        acct.*,
        ROW_NUMBER() OVER (PARTITION BY u.UserID ORDER BY NEWID()) AS seqnum
    FROM my_table AS acct
    CROSS APPLY (
        VALUES
            (acct.UserID_Salesperson),
            (acct.UserID_Servicer1),
            (acct.UserID_Servicer2)
    ) AS u (UserID)
) AS picked
WHERE picked.seqnum = 1
ORDER BY ROW_NUMBER() OVER (PARTITION BY picked.accountID ORDER BY picked.accountID);

How can I query only the latest iteration?

I'm wondering how to query the latest iteration of a field in my results.
For example, I write a query that'll return me this list of IDs:
132GBD00
132GBD01
59RTW900
59RTW901
59RTW902
376BH200
376BH201
376BH202
376BH203
5789DD00
I'd like the query to return this result:
132GBD01
59RTW902
376BH203
5789DD00
Notice that the similar IDs differ in only the last two characters. 00 being the original and 01, 02, etc coming after. If I write a query like:
SELECT memid
FROM MEMBERID
WHERE MEMBERID = ???
The table has dates, but I cannot search for distinct memid and filter by a max(date) because sometimes the latest iteration date is NULL. I'm trying to see if it's possible to look at a list of IDs and filter by the last two characters in the ID to see which is greater and return that.
Apparently, the last two numbers are sequence numbers. You can get the most recent one with a group by:
-- Greatest memid within each group of ids that share everything but the last two characters.
SELECT MAX(m.memid) AS memid
FROM members AS m
GROUP BY LEFT(m.memid, LEN(m.memid) - 2);
If you wanted other columns, then you would use row_number() instead.
Try this
-- Number the ids of each prefix group from highest to lowest and keep the first one.
SELECT numbered.Memid
FROM (
    SELECT
        src.Memid,
        ROW_NUMBER() OVER (
            PARTITION BY LEFT(src.Memid, LEN(src.Memid) - 2)
            ORDER BY src.memid DESC
        ) AS Rownum
    FROM MEMBERID AS src
) AS numbered
WHERE numbered.Rownum = 1;
you can use row_number as below:
-- Full row of the highest-sequence id in each prefix group.
-- The last two characters are compared as an integer; TOP (1) WITH TIES keeps every
-- row numbered 1, i.e. one row per prefix.
SELECT TOP (1) WITH TIES *
FROM Members
ORDER BY ROW_NUMBER() OVER (
    PARTITION BY SUBSTRING(memid, 1, LEN(memid) - 2)
    ORDER BY CONVERT(int, SUBSTRING(memid, LEN(memid) - 1, 2)) DESC
)
Or outer query as below:
-- Same ranking as a derived table: number each prefix group by its numeric suffix,
-- highest first, and return the id numbered 1.
SELECT a.MemId
FROM (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY SUBSTRING(memid, 1, LEN(memid) - 2)
            ORDER BY CONVERT(int, SUBSTRING(memid, LEN(memid) - 1, 2)) DESC
        ) AS RowN
    FROM Members
) AS a
WHERE a.RowN = 1
With other columns as well
I created a temp table using your data. Here is a pretty simple way to do it:
-- Sample ids from the question.
CREATE TABLE #Values (
    SomeValue varchar(20)
);

-- Same ten ids, in the same order, loaded with one multi-row INSERT.
INSERT INTO #Values (SomeValue)
VALUES
    ('132GBD00'),
    ('132GBD01'),
    ('59RTW900'),
    ('59RTW901'),
    ('59RTW902'),
    ('376BH200'),
    ('376BH201'),
    ('376BH202'),
    ('376BH203'),
    ('5789DD00');
-- Highest id of each prefix group (the prefix is everything but the last two characters).
-- LAST_VALUE is applied to the full id, not to the prefix, so the complete id is returned.
-- The frame is widened to the whole partition: with the default frame LAST_VALUE would
-- return each row's own value instead of the last one in the group.
-- The prefix is derived from the value's length rather than a fixed width of 6.
SELECT DISTINCT
    LAST_VALUE(SomeValue) OVER (
        PARTITION BY LEFT(SomeValue, LEN(SomeValue) - 2)
        ORDER BY SomeValue
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS LasID
FROM #Values

How to select top 3 values from each group in a table with SQL which have duplicates [duplicate]

This question already has answers here:
Select top 10 records for each category
(14 answers)
Closed 5 years ago.
Assume we have a table which has two columns, one column contains the names of some people and the other column contains some values related to each person. One person can have more than one value. Each value has a numeric type. The question is we want to select the top 3 values for each person from the table. If one person has less than 3 values, we select all the values for that person.
The issue can be solved if there are no duplicates in the table by the query provided in this article Select top 3 values from each group in a table with SQL . But if there are duplicates, what is the solution?
For example, if for one name John, he has 5 values related to him. They are 20,7,7,7,4. I need to return the name/value pairs as below order by value descending for each name:
-----------+-------+
| name | value |
-----------+-------+
| John | 20 |
| John | 7 |
| John | 7 |
-----------+-------+
Only 3 rows should be returned for John even though there are three 7s for John.
In many modern DBMS (e.g. Postgres, Oracle, SQL-Server, DB2 and many others), the following will work just fine. It uses CTEs and ranking function ROW_NUMBER() which is part of the latest SQL standard:
-- Up to three largest values per name; duplicates each take their own row number.
WITH ranked_values AS (
    SELECT
        src.name,
        src.value,
        ROW_NUMBER() OVER (PARTITION BY src.name ORDER BY src.value DESC) AS rn
    FROM t AS src
)
SELECT
    rv.name,
    rv.value,
    rv.rn
FROM ranked_values AS rv
WHERE rv.rn <= 3
ORDER BY rv.name, rv.rn;
Without CTE, only ROW_NUMBER():
-- Same top-3-per-name query, written with a derived table instead of a CTE.
SELECT
    ranked.name,
    ranked.value,
    ranked.rn
FROM (
    SELECT
        src.name,
        src.value,
        ROW_NUMBER() OVER (PARTITION BY src.name ORDER BY src.value DESC) AS rn
    FROM t AS src
) AS ranked
WHERE ranked.rn <= 3
ORDER BY ranked.name, ranked.rn;
Tested in:
Postgres
Oracle
SQL-Server
In MySQL and other DBMS that do not have ranking functions, one has to use either derived tables, correlated subqueries or self-joins with GROUP BY.
The (tid) is assumed to be the primary key of the table:
-- Self join + GROUP BY: a row's rn is the number of same-name rows that sort at or
-- before it (higher value first, ties broken by lower tid). Rows with rn <= 3 are kept.
SELECT
    cur.tid,
    cur.name,
    cur.value,
    COUNT(*) AS rn
FROM t AS cur
INNER JOIN t AS peer
    ON  peer.name = cur.name
    AND (
            peer.value > cur.value
         OR (peer.value = cur.value AND peer.tid <= cur.tid)
        )
GROUP BY cur.tid, cur.name, cur.value
HAVING COUNT(*) <= 3
ORDER BY name, rn;
-- Correlated-subquery variant: rn counts the same-name rows that sort at or before
-- the current row (higher value first, ties broken by lower tid).
SELECT
    numbered.tid,
    numbered.name,
    numbered.value,
    numbered.rn
FROM (
    SELECT
        cur.tid,
        cur.name,
        cur.value,
        (
            SELECT COUNT(*)
            FROM t AS peer
            WHERE peer.name = cur.name
              AND (
                      peer.value > cur.value
                   OR (peer.value = cur.value AND peer.tid <= cur.tid)
                  )
        ) AS rn
    FROM t AS cur
) AS numbered
WHERE numbered.rn <= 3
ORDER BY numbered.name, numbered.rn;
Tested in MySQL
I was going to downvote the question. However, I realized that it might really be asking for a cross-database solution.
Assuming you are looking for a database independent way to do this, the only way I can think of uses correlated subqueries (or non-equijoins). Here is an example:
-- rank = number of distinct values of the same person that are >= the row's value,
-- so equal values share a rank; DISTINCT then returns each (personid, val, rank) once.
SELECT DISTINCT
    ranked.personid,
    ranked.val,
    ranked.rank
FROM (
    SELECT
        cur.*,
        (
            SELECT COUNT(DISTINCT peer.val)
            FROM t AS peer
            WHERE peer.personid = cur.personid
              AND peer.val >= cur.val
        ) AS rank
    FROM t AS cur
) AS ranked
WHERE ranked.rank IN (1, 2, 3)
However, each database that you mention (and I note, Hadoop is not a database) has a better way of doing this. Unfortunately, none of them are standard SQL.
Here is an example of it working in SQL Server:
-- Self-contained demo: inline sample rows for one person, then the distinct-value ranking.
WITH t AS (
    SELECT 1 AS personid, 5 AS val
    UNION ALL
    SELECT 1 AS personid, 6 AS val
    UNION ALL
    SELECT 1 AS personid, 6 AS val
    UNION ALL
    SELECT 1 AS personid, 7 AS val
    UNION ALL
    SELECT 1 AS personid, 8 AS val
)
SELECT DISTINCT
    ranked.personid,
    ranked.val,
    ranked.rank
FROM (
    SELECT
        cur.*,
        (
            -- number of distinct values of this person that are >= the current value
            SELECT COUNT(DISTINCT peer.val)
            FROM t AS peer
            WHERE peer.personid = cur.personid
              AND peer.val >= cur.val
        ) AS rank
    FROM t AS cur
) AS ranked
WHERE ranked.rank IN (1, 2, 3);
Using GROUP_CONCAT and FIND_IN_SET you can do that.Check SQLFIDDLE.
-- MySQL: keeps rows whose value appears in the comma-separated list of the three
-- largest values (GROUP_CONCAT ordered descending, cut after the third item) of the same name.
-- NOTE(review): the match is by value, so every row sharing a listed value is kept;
-- for values 20,7,7,7,4 this looks like it returns four rows, not three -- confirm.
SELECT *
FROM tbl t
WHERE FIND_IN_SET(t.value,(SELECT
SUBSTRING_INDEX(GROUP_CONCAT(t1.value ORDER BY VALUE DESC),',',3)
FROM tbl t1
WHERE t1.name = t.name
GROUP BY t1.name)) > 0
ORDER BY t.name,t.value desc
If your result set is not too large, you can write a stored procedure (or an anonymous PL/SQL block) that iterates over the result set and finds the biggest three values with a simple comparison algorithm.
Try this -
-- Sample data from the question: John has three rows with value 7.
CREATE TABLE #list (
    [name]  [varchar](100) NOT NULL,
    [value] [int]          NOT NULL
)

INSERT INTO #list
VALUES
    ('John', 20),
    ('John', 7),
    ('John', 7),
    ('John', 7),
    ('John', 4);

-- Number each name's rows by value, highest first, and keep the first three.
WITH cte AS (
    SELECT
        NAME,
        value,
        ROW_NUMBER() OVER (PARTITION BY NAME ORDER BY (value) DESC) RN
    FROM #list
)
SELECT
    NAME,
    value
FROM cte
WHERE RN < 4
ORDER BY value DESC
This works for MS SQL. It should be workable in any other SQL dialect that can assign row numbers within a group via an OVER clause (or equivalent).
-- Drop a leftover copy of the sample table from an earlier run, then recreate it.
IF OBJECT_ID('tempdb..#Data') IS NOT NULL
    DROP TABLE #Data;
GO
CREATE TABLE #data (
    name  varchar(25),
    value integer
);
GO
-- Same twelve rows, in the same order, loaded with one multi-row INSERT.
SET NOCOUNT ON;
INSERT INTO #data
VALUES
    ('John', 20),
    ('John', 7),
    ('John', 7),
    ('John', 7),
    ('John', 5),
    ('Jack', 5),
    ('Jane', 30),
    ('Jane', 21),
    ('John', 5),
    ('John', -1),
    ('John', -1),
    ('Jane', 18);
SET NOCOUNT OFF;
GO
-- Three largest values per name; duplicate values each count as their own row.
WITH D AS (
    SELECT
        name,
        Value,
        ROW_NUMBER() OVER (PARTITION BY name ORDER BY value DESC) rn
    FROM #Data
)
SELECT
    Name,
    Value
FROM D
WHERE RN <= 3
ORDER BY Name, Value DESC
Name Value
Jack 5
Jane 30
Jane 21
Jane 18
John 20
John 7
John 7

Finding a better way to do a top 1 per group

I am attempting to find the most recent value per id that is older than 1/1/2013
-- Sample value history per id.
CREATE TABLE #foo (
    id         int,
    value      money,
    entry_date datetime
);

-- Same six rows, in the same order, loaded with one multi-row INSERT.
INSERT INTO #foo
VALUES
    (1, 1.00, '1/1/2012'),
    (1, 2.00, '2/1/2012'),
    (1, 7.00, '1/1/2013'),
    (2, 1.00, '1/1/2013'),
    (2, 1.00, '2/1/2013'),
    (3, 5.00, '3/1/2012');
The following gives me the solution but I know I am doing this the wrong way.
-- Most recent value per id among rows dated before 1/1/2013.
-- Rows are filtered first, then numbered newest-first within each id.
SELECT
    ranked.id,
    ranked.value
FROM (
    SELECT
        src.id,
        src.value,
        ROW_NUMBER() OVER (PARTITION BY src.id ORDER BY src.entry_date DESC) AS ind
    FROM #foo AS src
    WHERE src.entry_date < '1/1/2013'
) AS ranked
WHERE ranked.ind = 1
--Results:
--id value
------------- ---------------------
--1 2.00
--3 5.00
Id 2 is not returned due to not having any records older than 1/1/2013.
What is the correct way to accomplish what I am attempting to do?
You could also use a subquery to get the result:
-- Find each id's latest entry_date before the cutoff, then join back to fetch its value.
SELECT
    cur.id,
    cur.value
FROM #foo AS cur
INNER JOIN (
    SELECT
        src.id,
        MAX(src.entry_date) AS entry_date
    FROM #foo AS src
    WHERE src.entry_date < '1/1/2013'
    GROUP BY src.id
) AS latest
    ON  latest.id = cur.id
    AND latest.entry_date = cur.entry_date;
See SQL Fiddle with Demo
This is along the same lines, but you can also use a TOP 1 WITH TIES in combination with the ROW_NUMBER() to eliminate the need for a subquery:
-- TOP 1 WITH TIES keeps every row whose ORDER BY value equals the first one;
-- the row number is 1 for the newest pre-cutoff row of each id.
SELECT TOP 1 WITH TIES
    src.id,
    src.value
FROM #foo AS src
WHERE src.entry_date < '1/1/2013'
ORDER BY ROW_NUMBER() OVER (PARTITION BY src.id ORDER BY src.entry_date DESC)
It's a little cleaner, in my opinion. Unfortunately, it can also perform slightly slower. Still, it's always good to know different uses for SQL functions.
With SQL-Server 2005 you have ranking functions and common-table-expressions(CTE).
-- Number each id's pre-cutoff rows newest-first and return the first of each.
WITH CTE AS (
    SELECT
        src.id,
        src.value,
        src.entry_date,
        ROW_NUMBER() OVER (PARTITION BY src.id ORDER BY src.entry_date DESC) AS RN
    FROM dbo.TableName AS src
    WHERE src.entry_date < '1/1/2013'
)
SELECT
    CTE.id,
    CTE.value,
    CTE.entry_date
FROM CTE
WHERE CTE.RN = 1
returns the most recent record per id, so ORDER BY entry_date DESC instead of value.
If you want all "max-recent" values in case there are multiple, replace ROW_NUMBER with DENSE_RANK.
Use option with EXISTS operator
-- Keep a pre-cutoff row only when its entry_date is the maximum pre-cutoff entry_date
-- of its id. The HAVING without GROUP BY aggregates over all correlated rows.
SELECT
    cur.id,
    cur.value
FROM #foo AS cur
WHERE cur.entry_date < '1/1/2013'
  AND EXISTS (
        SELECT 1
        FROM #foo AS peer
        WHERE peer.id = cur.id
          AND peer.entry_date < '1/1/2013'
        HAVING MAX(peer.entry_date) = cur.entry_date
      )
Demo on SQLFiddle
For improving performance use this index:
-- Index keyed on (id, entry_date) that also carries value, so the queries above
-- can be answered from the index alone.
CREATE INDEX x
    ON #foo (id, entry_date)
    INCLUDE (value)