Latest entry in a group SQL Server - sql

Given the sample data below, I need a list of the ids whose latest entry is Rejected. Thus, I need to see id 2 because its latest is 6/4/2020 and that is Rejected. I do not want to see id 1 as its latest entry is Requested.
CREATE TABLE #temp
(
id int,
mydate datetime,
status VARCHAR(20)
)
INSERT INTO #temp VALUES (1, '6/1/2020', 'Rejected')
INSERT INTO #temp VALUES (1, '6/2/2020', 'Requested')
INSERT INTO #temp VALUES (1, '6/3/2020', 'Rejected')
INSERT INTO #temp VALUES (1, '6/4/2020', 'Requested')
INSERT INTO #temp VALUES (2, '6/1/2020', 'Requested')
INSERT INTO #temp VALUES (2, '6/2/2020', 'Requested')
INSERT INTO #temp VALUES (2, '6/3/2020', 'Requested')
INSERT INTO #temp VALUES (2, '6/4/2020', 'Rejected')
SELECT * FROM #temp
SELECT id, MAX(mydate)
FROM #temp
WHERE status = 'Rejected'
GROUP BY id
This is my pathetic attempt so far
SELECT id, MAX(mydate)
FROM #temp
WHERE status = 'Rejected'
GROUP BY id
But this will only bring back the latest date in each group. I need a list where the latest entry was Rejected. I expect the answer to be embarrassingly simple but I'm having a heck of a time with this.
Thanks
Carl

You can get this using row_number() function as shown below.
;WITH cte
AS (
SELECT Id
,mydate
,STATUS
,ROW_NUMBER() OVER (
PARTITION BY Id, status ORDER BY mydate desc
) row_num
FROM #temp
)
SELECT *
FROM cte
WHERE row_num = 1
AND STATUS = 'Rejected'
Here is the live db<>fiddle demo.

One method uses aggregation and having:
select id
from #temp
group by id
having max(case when status = 'Rejected' then mydate end) = max(mydate);
This is almost a direct translation of your question: the latest date for 'Rejected' is the latest date for a given id.
More traditional methods use a correlated subquery:
select t.*
from #temp t
where t.mydate = (select max(t2.mydate)
from #temp t2
where t2.id = t.id
) and
t.status = 'Rejected';
Or window functions:
select t.*
from (select t.*,
row_number() over (partition by id order by mydate desc) as seqnum
from #temp t
) t
where t.seqnum = 1 and t.status = 'Rejected';

Related

Group by with gap in date sequence ("gaps and islands")

I am trying to solve a "gaps and islands" by date issue I'm facing (kudos to Gordon Linoff helping me identify this issue). I want to group the below table by person, office and job while respecting order by person,from_date. consider the table below:
declare #temp table(person varchar(25),office varchar(25),job varchar(25),from_date date,to_date date)
insert into #temp values ('jon','ny','programmer','1/1/2020','1/3/2020');
insert into #temp values ('jon','ny','programmer','1/4/2020','1/5/2020');
insert into #temp values ('jon','dc','programmer','1/6/2020','1/7/2020');
insert into #temp values ('jon','ny','programmer','1/8/2020','1/9/2020');
insert into #temp values ('lou','ny','programmer','1/1/2020','1/3/2020');
insert into #temp values ('lou','ny','programmer','1/4/2020','1/5/2020');
insert into #temp values ('lou','dc','programmer','1/6/2020','1/7/2020');
insert into #temp values ('lou','ny','programmer','1/8/2020','1/9/2020');
the intended output is
This is a type of gaps-and-islands problem. If there are no gaps in the dates, the simplest solution is the difference of row numbers:
select person, office, job, min(from_date), max(to_date)
from (select t.*,
row_number() over (partition by person, office, job order by from_date) as seqnum,
row_number() over (partition by person, office order by from_date) as seqnum_2
from t
) t
group by person, office, job, (seqnum - seqnum_2)
This is a general solution:
WITH preceders_and_followers AS (
SELECT
b.person,
b.office,
b.job,
b.from_date,
b.to_date,
CASE
WHEN EXISTS (
SELECT
c.*
FROM
ora$ptt_tmp c
WHERE
b.person = c.person
AND b.office = c.office
AND b.job = c.job
AND ( b.from_date - 1 BETWEEN c.from_date AND c.to_date )
) THEN
1
END AS has_preceder,
CASE
WHEN EXISTS (
SELECT
c.*
FROM
ora$ptt_tmp c
WHERE
b.person = c.person
AND b.office = c.office
AND b.job = c.job
AND ( b.to_date + 1 BETWEEN c.from_date AND c.to_date )
) THEN
1
END AS has_follower
FROM
ora$ptt_tmp b
ORDER BY
1,
2,
3
)
SELECT DISTINCT
pf1.person,
pf1.office,
pf1.job,
pf1.from_date,
(
SELECT
MIN(pf2.to_date)
FROM
preceders_and_followers pf2
WHERE
pf1.person = pf2.person
AND pf1.office = pf2.office
AND pf1.job = pf2.job
AND pf2.to_date >= pf1.from_date
AND has_follower IS NULL
) to_date
FROM
preceders_and_followers pf1
WHERE
pf1.has_preceder IS NULL
ORDER BY
1,
4,
2,
3;

MS SQL LAST_VALUE() Order by

I need to pick the last value for the group_id ordered by id from the following example table:
drop table if exists #temp1
create table #temp1 (group_id int, id int, val varchar(10))
insert into #temp1 values (1111, 1, 'Yes')
insert into #temp1 values (1111, 2, 'No')
insert into #temp1 values (1111, 3, NULL)
insert into #temp1 values (2222, 5, 'No')
insert into #temp1 values (2222, 3, NULL)
insert into #temp1 values (2222, 1, 'No')
The expected result is 1111 - Yes and 2222 - No.
If I write the following query, it seems to pick the last value based on how the rows are ordered in the table and not by id column.
SELECT group_id, MAX(last_val)
FROM
(
SELECT a.group_id, LAST_VALUE(a.val) OVER (PARTITION BY a.group_id ORDER BY a.group_id) AS last_val FROM #temp1 a
) a
GROUP BY group_id
If I write the following, it seems to do a Max of val alphabetically:
SELECT group_id, MAX(last_val)
FROM
(
SELECT a.group_id, LAST_VALUE(a.val) OVER (PARTITION BY a.group_id ORDER BY a.id) AS last_val FROM #temp1 a
) a
GROUP BY group_id
In both cases, the results are different from what I need. Can someone please suggest how to get the val for the last id?
First, I recommend FIRST_VALUE() with a descending sort. Then, you need to use the right ORDER BY column:
SELECT group_id, MAX(last_val)
FROM (SELECT a.group_id,
FIRST_VALUE(a.val) OVER (PARTITION BY a.group_id ORDER BY a.id DESC) AS last_val
FROM #temp1 a
) a
GROUP BY group_id;
Why do I prefer FIRST_VALUE() for this? The issue is the default window frame, which is BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW. This can interact unexpected with LAST_VALUE().
Why do you group twice ? couldn't you just alter the window ?
SELECT DISTINCT
a.group_id
,FIRST_VALUE(a.val) OVER (PARTITION BY a.group_id
ORDER BY a.id DESC
ROWS BETWEEN
UNBOUNDED PRECEDING AND
UNBOUNDED FOLLOWING) AS last_val
FROM #temp1

how to insert many records excluding some

I want to create a table with a subset of records from a master table.
for example, i have:
id name code
1 peter 73
2 carl 84
3 jack 73
I want to store peter and carl but not jack because has same peter's code.
I need hight performance because i have 20M records.
I try this:
SELECT id, name, DISTINCT(code) INTO new_tab
FROM old_tab
WHERE (conditions)
but don't work.
Assuming you want to pick the row with the maximum id per code, then this should do it:
insert into new_tab (id, name, code)
(SELECT id, name, code
FROM
(
SELECT id, name, code, rank() as rnk OVER (PARTITION BY code ORDER BY id DESC)
FROM old_tab WHERE rnk = 1
)
)
and for the minimum id per code, just change the sort order in the rank from DESC to ASC:
insert into new_tab (id, name, code)
(SELECT id, name, code
FROM
(
SELECT id, name, code, rank() as rnk OVER (PARTITION BY code ORDER BY id ASC)
FROM old_tab WHERE rnk = 1
)
)
Using a derived table, you can find the minID for each code, then join back to that in the outer to get the rest of the columns for that ID from oldTab.
select id,name,code
insert into newTabFROM
from old_tab t inner join
(SELECT min(id) as minId, code
from old_tab group by code) x
on t.id = x.minId
WHERE (conditions)
Try this:
CREATE TABLE #Temp
(
ID INT,
Name VARCHAR(50),
Code INT
)
INSERT #Temp VALUES (1, 'Peter', 73)
INSERT #Temp VALUES (2, 'Carl', 84)
INSERT #Temp VALUES (3, 'Jack', 73)
SELECT t2.ID, t2.Name, t2.Code
FROM #Temp t2
JOIN (
SELECT t.Code, MIN(t.ID) ID
FROM #temp t
JOIN (
SELECT DISTINCT Code
FROM #Temp
) d
ON t.Code = d.Code
GROUP BY t.Code
) b
ON t2.ID = b.ID

How to select top 3 values from each group in a table with SQL which have duplicates [duplicate]

This question already has answers here:
Select top 10 records for each category
(14 answers)
Closed 5 years ago.
Assume we have a table which has two columns, one column contains the names of some people and the other column contains some values related to each person. One person can have more than one value. Each value has a numeric type. The question is we want to select the top 3 values for each person from the table. If one person has less than 3 values, we select all the values for that person.
The issue can be solved if there are no duplicates in the table by the query provided in this article Select top 3 values from each group in a table with SQL . But if there are duplicates, what is the solution?
For example, if for one name John, he has 5 values related to him. They are 20,7,7,7,4. I need to return the name/value pairs as below order by value descending for each name:
-----------+-------+
| name | value |
-----------+-------+
| John | 20 |
| John | 7 |
| John | 7 |
-----------+-------+
Only 3 rows should be returned for John even though there are three 7s for John.
In many modern DBMS (e.g. Postgres, Oracle, SQL-Server, DB2 and many others), the following will work just fine. It uses CTEs and ranking function ROW_NUMBER() which is part of the latest SQL standard:
WITH cte AS
( SELECT name, value,
ROW_NUMBER() OVER (PARTITION BY name
ORDER BY value DESC
)
AS rn
FROM t
)
SELECT name, value, rn
FROM cte
WHERE rn <= 3
ORDER BY name, rn ;
Without CTE, only ROW_NUMBER():
SELECT name, value, rn
FROM
( SELECT name, value,
ROW_NUMBER() OVER (PARTITION BY name
ORDER BY value DESC
)
AS rn
FROM t
) tmp
WHERE rn <= 3
ORDER BY name, rn ;
Tested in:
Postgres
Oracle
SQL-Server
In MySQL and other DBMS that do not have ranking functions, one has to use either derived tables, correlated subqueries or self-joins with GROUP BY.
The (tid) is assumed to be the primary key of the table:
SELECT t.tid, t.name, t.value, -- self join and GROUP BY
COUNT(*) AS rn
FROM t
JOIN t AS t2
ON t2.name = t.name
AND ( t2.value > t.value
OR t2.value = t.value
AND t2.tid <= t.tid
)
GROUP BY t.tid, t.name, t.value
HAVING COUNT(*) <= 3
ORDER BY name, rn ;
SELECT t.tid, t.name, t.value, rn
FROM
( SELECT t.tid, t.name, t.value,
( SELECT COUNT(*) -- inline, correlated subquery
FROM t AS t2
WHERE t2.name = t.name
AND ( t2.value > t.value
OR t2.value = t.value
AND t2.tid <= t.tid
)
) AS rn
FROM t
) AS t
WHERE rn <= 3
ORDER BY name, rn ;
Tested in MySQL
I was going to downvote the question. However, I realized that it might really be asking for a cross-database solution.
Assuming you are looking for a database independent way to do this, the only way I can think of uses correlated subqueries (or non-equijoins). Here is an example:
select distinct t.personid, val, rank
from (select t.*,
(select COUNT(distinct val) from t t2 where t2.personid = t.personid and t2.val >= t.val
) as rank
from t
) t
where rank in (1, 2, 3)
However, each database that you mention (and I note, Hadoop is not a database) has a better way of doing this. Unfortunately, none of them are standard SQL.
Here is an example of it working in SQL Server:
with t as (
select 1 as personid, 5 as val union all
select 1 as personid, 6 as val union all
select 1 as personid, 6 as val union all
select 1 as personid, 7 as val union all
select 1 as personid, 8 as val
)
select distinct t.personid, val, rank
from (select t.*,
(select COUNT(distinct val) from t t2 where t2.personid = t.personid and t2.val >= t.val
) as rank
from t
) t
where rank in (1, 2, 3);
Using GROUP_CONCAT and FIND_IN_SET you can do that.Check SQLFIDDLE.
SELECT *
FROM tbl t
WHERE FIND_IN_SET(t.value,(SELECT
SUBSTRING_INDEX(GROUP_CONCAT(t1.value ORDER BY VALUE DESC),',',3)
FROM tbl t1
WHERE t1.name = t.name
GROUP BY t1.name)) > 0
ORDER BY t.name,t.value desc
If your result set is not so heavy, you can write a stored procedure (or an anonymous PL/SQL-block) for that problem which iterates the result set and finds the bigges three by a simple comparing algorithm.
Try this -
CREATE TABLE #list ([name] [varchar](100) NOT NULL, [value] [int] NOT NULL)
INSERT INTO #list VALUES ('John', 20), ('John', 7), ('John', 7), ('John', 7), ('John', 4);
WITH cte
AS (
SELECT NAME
,value
,ROW_NUMBER() OVER (
PARTITION BY NAME ORDER BY (value) DESC
) RN
FROM #list
)
SELECT NAME
,value
FROM cte
WHERE RN < 4
ORDER BY value DESC
This works for MS SQL. Should be workable in any other SQL dialect that has the ability to assign row numbers in a group by or over clause (or equivelant)
if object_id('tempdb..#Data') is not null drop table #Data;
GO
create table #data (name varchar(25), value integer);
GO
set nocount on;
insert into #data values ('John', 20);
insert into #data values ('John', 7);
insert into #data values ('John', 7);
insert into #data values ('John', 7);
insert into #data values ('John', 5);
insert into #data values ('Jack', 5);
insert into #data values ('Jane', 30);
insert into #data values ('Jane', 21);
insert into #data values ('John', 5);
insert into #data values ('John', -1);
insert into #data values ('John', -1);
insert into #data values ('Jane', 18);
set nocount off;
GO
with D as (
SELECT
name
,Value
,row_number() over (partition by name order by value desc) rn
From
#Data
)
SELECT Name, Value
FROM D
WHERE RN <= 3
order by Name, Value Desc
Name Value
Jack 5
Jane 30
Jane 21
Jane 18
John 20
John 7
John 7

Select latest revision of each row in a table

I have table structures that include a composite primary key of id & revision where both are integers.
I need a query that will return the latest revision of each row. If I understood this answer correctly then the following would have worked on an Oracle DB.
SELECT Id, Title
FROM ( SELECT Id, Revision, MAX(Revision) OVER (PARTITION BY Id) LatestRevision FROM Task )
WHERE Revision = LatestRevision
I am using SQL Server (2005) and need a performant query to do the same.
I think this should work (I didn't test it)...
SELECT ID,
Title
FROM Task AS T
INNER JOIN
(
SELECT ID,
Max(Revision)
FROM Task
GROUP BY ID
) AS sub
ON T.ID = sub.ID
AND T.Revision = sub.Revision
See this post by ayende for an ealuation of the Best strategies.
I would try to create a subquery like this:
SELECT Id, Title
FROM Task T, (Select ID, Max(Revision) MaxRev from Task group by ID) LatestT
WHERE T.Revision = LatestT.MaxRev and T.ID = LatestT.ID
Another option is to "cheat" and create a trigger that will flag the revision as latest revision if one item is added.
Then add that field to the index. (I would link the table to insert only)
Also an index on ID, Revision desc could help the performance.
The query you posted will work in SQL 2005 (in compatibility mode 90) with the syntax errors corrected:
SELECT t1.Id, t1.Title
FROM ( SELECT Id, Revision, MAX(Revision) OVER (PARTITION BY Id) LatestRevision FROM Task ) AS x
JOIN Task as t1
ON t1.Revision = x.LatestRevision
AND t1.id = x.id
try this:
DECLARE #YourTable table(RowID int, Revision int, Title varchar(10))
INSERT INTO #YourTable VALUES (1,1,'A')
INSERT INTO #YourTable VALUES (2,1,'B')
INSERT INTO #YourTable VALUES (2,2,'BB')
INSERT INTO #YourTable VALUES (3,1,'C')
INSERT INTO #YourTable VALUES (4,1,'D')
INSERT INTO #YourTable VALUES (1,2,'AA')
INSERT INTO #YourTable VALUES (2,3,'BBB')
INSERT INTO #YourTable VALUES (5,1,'E')
INSERT INTO #YourTable VALUES (5,2,'EE')
INSERT INTO #YourTable VALUES (4,2,'DD')
INSERT INTO #YourTable VALUES (4,3,'DDD')
INSERT INTO #YourTable VALUES (6,1,'F')
;WITH YourTableRank AS
(
SELECT
RowID,Revision,Title, ROW_NUMBER() OVER(PARTITION BY RowID ORDER BY RowID,Revision DESC) AS Rank
FROM #YourTable
)
SELECT
RowID, Revision, Title
FROM YourTableRank
WHERE Rank=1
OUTPUT:
RowID Revision Title
----------- ----------- ----------
1 2 AA
2 3 BBB
3 1 C
4 3 DDD
5 2 EE
6 1 F
(6 row(s) affected)