Grouping records - ROW NUMBER and PARTITION BY - sql

Please see the DDL below:
create table #Test (ID int identity not null, name varchar(100), primary key (id))
insert into #Test (name) values ('Ian')
insert into #Test (name) values ('Ian')
insert into #Test (name) values ('Ian')
insert into #Test (name) values ('Mark')
insert into #Test (name) values ('James')
insert into #Test (name) values ('James')
insert into #Test (name) values ('Henry')
I am looking for the output below:
Ian 1
Ian 1
Ian 1
Mark 2
James 3
James 3
Henry 4
All the Ians' have the same number. All the James' have the same number. I have been experimenting with ROW NUMBER and PARTITION BY but I have been unsuccessful so far.

You can do this in a few ways, but row_number() per se is not one of them.
Here is a method:
select t.name, dense_rank() over (order by nameid)
from (select t.*, min(id) over (partition by name) as nameid
from #test t
) t;
This calculates the min id for each name and then uses that for dense_rank().
If you don't care about the particular ordering, you can use dense_rank() on the name:
select name, dense_rank() over (order by name)
from #test t;

Use DENSE_RANK instead.
Select Name, DENSE_RANK() Over (Order By Name)
From #Test

You can get the desired output with:
with prep as
(
select name
, DENSE_RANK() over (order by c) [rank]
from (
select distinct name, min(id) c
from #Test
group by name
) a
)
select T.name
, prep.[rank]
from #Test T
inner join prep on prep.name = T.name
The first (sub)query selects the lowest possible ID per record, the DENSE_RANK then ensures the numbering is seqential, and the final query uses those prepped results against the original #Test table to duplicate the data per row.
You can find out more about the ranking functions here: https://msdn.microsoft.com/en-us/library/ms189798.aspx

Related

Finding a random sample of unique data across multiple columns - SQL Server

Given a set of data in a SQL Server database with the following columns
AccountID, UserID_Salesperson, UserID_Servicer1, UserID_Servicer2
All three columns are primary keys from the same users table. I need to find a random sample that will include every UserID available in all three columns no matter the position while guaranteeing the fewest unique AccountID's possible.
--SET UP TEST DATA
CREATE TABLE MY_TABLE
(
AccountID int,
UserID_Salesperson int,
UserID_Servicer1 int,
UserID_Servicer2 int
)
INSERT INTO MY_TABLE (AccountID, UserID_Salesperson, UserID_Servicer1, UserID_Servicer2)
VALUES (12345, 1, 1, 2)
INSERT INTO MY_TABLE (AccountID, UserID_Salesperson, UserID_Servicer1, UserID_Servicer2)
VALUES (12346, 3, 2, 1)
INSERT INTO MY_TABLE (AccountID, UserID_Salesperson, UserID_Servicer1, UserID_Servicer2)
VALUES (12347, 4, 3, 1)
INSERT INTO MY_TABLE (AccountID, UserID_Salesperson, UserID_Servicer1, UserID_Servicer2)
VALUES (12348, 1, 2, 3)
--VIEW THE NEW TABLE
SELECT * FROM MY_TABLE
--NORMALIZE DATA (Unique List of UserID's)
SELECT DISTINCT MyDistinctUserIDList
FROM
(SELECT UserID_Salesperson as MyDistinctUserIDList, 'Sales' as Position
FROM MY_TABLE
UNION
SELECT UserID_Servicer1, 'Service1' as Position
FROM MY_TABLE
UNION
SELECT UserID_Servicer2, 'Service2' as Position
FROM MY_TABLE) MyDerivedTable
--NORMALIZED DATA
SELECT *
FROM
(SELECT AccountID, UserID_Salesperson as MyDistinctUserIDList, 'Sales' as Position
FROM MY_TABLE
UNION
SELECT AccountID, UserID_Servicer1, 'Service1' as Position
FROM MY_TABLE
UNION
SELECT AccountID, UserID_Servicer2, 'Service2' as Position
FROM MY_TABLE) MyDerivedTable
DROP TABLE MY_TABLE
For this example table, I could select AccountID (12347 and 12348) OR (12347 and 12346) to get the least accounts with all users.
My current solution is inefficient and can make mistakes. I currently select a random AccountID, insert the data into a temp table and try to find the next insert from something I have not already put in the temp table. I loop through the records until it finds something not used beforeā€¦ and after a few thousand loops it will give up and select any record.
I don't know how you guarantee the fewest account ids, but you can get one row per user id using:
select t.*
from (select t.*,
row_number() over (partition by UserId order by newid()) as seqnum
from my_table t cross apply
(values (t.UserID_Salesperson), (t.UserID_Servicer1), (t.UserID_Servicer2)
) v(UserID)
) t
where seqnum = 1;
Your original table doesn't have a primary key. Assuming that there is one row per account, you can dedup this so it doesn't have duplicate accounts:
select top (1) with ties t.*
from (select t.*,
row_number() over (partition by UserId order by newid()) as seqnum
from my_table t cross apply
(values (t.UserID_Salesperson), (t.UserID_Servicer1), (t.UserID_Servicer2)
) v(UserID)
) t
where seqnum = 1
order by row_number() over (partition by accountID order by accountID);

SQL:selecting top record for a group of column

I need to select top record with max value for a group of other columns. In the sample data below, i need to select top record with max 'Weightage' for each group of 'id', 'ItemType'
create table sampleTable(id int, ItemType varchar(10), ItemCode varchar(10), Weightage int)
insert into sampleTable values(1, 'A','aaa',2)
insert into sampleTable values(1, 'A','bbb',3)
insert into sampleTable values(1, 'A','ccc',3)
insert into sampleTable values(1, 'B','ddd',1)
insert into sampleTable values(1, 'B','eee',2)
insert into sampleTable values(2, 'A','aaa',1)
The expected output should be
id ItemType ItemCode
--------------------------------
1 A bbb
1 B eee
2 A aaa
I tried as follows
SELECT top 1 id, ItemType,ItemCode
FROM sampleTable WITH(NOLOCK)
GROUP BY id,ItemType,ItemCode,Weightage
ORDER BY Weightage desc
But it is not giving expected result. Thanks
Here is one way using ROW_NUMBER
SELECT TOP 1 WITH ties id,
itemtype,
itemcode
FROM sampletable WITH(nolock)
GROUP BY id,
itemtype,
itemcode,
weightage
ORDER BY Row_number()OVER(partition BY id, itemtype ORDER BY weightage DESC)
TOP 1 with TIES will return the records with Tie based on Order by
Hope you know the meaning of using WITH(NOLOCK) hint. It will pull uncommitted data as well
Here's one option using row_number():
select *
from (
select *, row_number() over (partition by id, itemtype order by Weightage desc) rn
from sampletable
) t
where rn = 1
SQL Fiddle Demo

how to insert many records excluding some

I want to create a table with a subset of records from a master table.
for example, i have:
id name code
1 peter 73
2 carl 84
3 jack 73
I want to store peter and carl but not jack because has same peter's code.
I need hight performance because i have 20M records.
I try this:
SELECT id, name, DISTINCT(code) INTO new_tab
FROM old_tab
WHERE (conditions)
but don't work.
Assuming you want to pick the row with the maximum id per code, then this should do it:
insert into new_tab (id, name, code)
(SELECT id, name, code
FROM
(
SELECT id, name, code, rank() as rnk OVER (PARTITION BY code ORDER BY id DESC)
FROM old_tab WHERE rnk = 1
)
)
and for the minimum id per code, just change the sort order in the rank from DESC to ASC:
insert into new_tab (id, name, code)
(SELECT id, name, code
FROM
(
SELECT id, name, code, rank() as rnk OVER (PARTITION BY code ORDER BY id ASC)
FROM old_tab WHERE rnk = 1
)
)
Using a derived table, you can find the minID for each code, then join back to that in the outer to get the rest of the columns for that ID from oldTab.
select id,name,code
insert into newTabFROM
from old_tab t inner join
(SELECT min(id) as minId, code
from old_tab group by code) x
on t.id = x.minId
WHERE (conditions)
Try this:
CREATE TABLE #Temp
(
ID INT,
Name VARCHAR(50),
Code INT
)
INSERT #Temp VALUES (1, 'Peter', 73)
INSERT #Temp VALUES (2, 'Carl', 84)
INSERT #Temp VALUES (3, 'Jack', 73)
SELECT t2.ID, t2.Name, t2.Code
FROM #Temp t2
JOIN (
SELECT t.Code, MIN(t.ID) ID
FROM #temp t
JOIN (
SELECT DISTINCT Code
FROM #Temp
) d
ON t.Code = d.Code
GROUP BY t.Code
) b
ON t2.ID = b.ID

How to select top 3 values from each group in a table with SQL which have duplicates [duplicate]

This question already has answers here:
Select top 10 records for each category
(14 answers)
Closed 5 years ago.
Assume we have a table which has two columns, one column contains the names of some people and the other column contains some values related to each person. One person can have more than one value. Each value has a numeric type. The question is we want to select the top 3 values for each person from the table. If one person has less than 3 values, we select all the values for that person.
The issue can be solved if there are no duplicates in the table by the query provided in this article Select top 3 values from each group in a table with SQL . But if there are duplicates, what is the solution?
For example, if for one name John, he has 5 values related to him. They are 20,7,7,7,4. I need to return the name/value pairs as below order by value descending for each name:
-----------+-------+
| name | value |
-----------+-------+
| John | 20 |
| John | 7 |
| John | 7 |
-----------+-------+
Only 3 rows should be returned for John even though there are three 7s for John.
In many modern DBMS (e.g. Postgres, Oracle, SQL-Server, DB2 and many others), the following will work just fine. It uses CTEs and ranking function ROW_NUMBER() which is part of the latest SQL standard:
WITH cte AS
( SELECT name, value,
ROW_NUMBER() OVER (PARTITION BY name
ORDER BY value DESC
)
AS rn
FROM t
)
SELECT name, value, rn
FROM cte
WHERE rn <= 3
ORDER BY name, rn ;
Without CTE, only ROW_NUMBER():
SELECT name, value, rn
FROM
( SELECT name, value,
ROW_NUMBER() OVER (PARTITION BY name
ORDER BY value DESC
)
AS rn
FROM t
) tmp
WHERE rn <= 3
ORDER BY name, rn ;
Tested in:
Postgres
Oracle
SQL-Server
In MySQL and other DBMS that do not have ranking functions, one has to use either derived tables, correlated subqueries or self-joins with GROUP BY.
The (tid) is assumed to be the primary key of the table:
SELECT t.tid, t.name, t.value, -- self join and GROUP BY
COUNT(*) AS rn
FROM t
JOIN t AS t2
ON t2.name = t.name
AND ( t2.value > t.value
OR t2.value = t.value
AND t2.tid <= t.tid
)
GROUP BY t.tid, t.name, t.value
HAVING COUNT(*) <= 3
ORDER BY name, rn ;
SELECT t.tid, t.name, t.value, rn
FROM
( SELECT t.tid, t.name, t.value,
( SELECT COUNT(*) -- inline, correlated subquery
FROM t AS t2
WHERE t2.name = t.name
AND ( t2.value > t.value
OR t2.value = t.value
AND t2.tid <= t.tid
)
) AS rn
FROM t
) AS t
WHERE rn <= 3
ORDER BY name, rn ;
Tested in MySQL
I was going to downvote the question. However, I realized that it might really be asking for a cross-database solution.
Assuming you are looking for a database independent way to do this, the only way I can think of uses correlated subqueries (or non-equijoins). Here is an example:
select distinct t.personid, val, rank
from (select t.*,
(select COUNT(distinct val) from t t2 where t2.personid = t.personid and t2.val >= t.val
) as rank
from t
) t
where rank in (1, 2, 3)
However, each database that you mention (and I note, Hadoop is not a database) has a better way of doing this. Unfortunately, none of them are standard SQL.
Here is an example of it working in SQL Server:
with t as (
select 1 as personid, 5 as val union all
select 1 as personid, 6 as val union all
select 1 as personid, 6 as val union all
select 1 as personid, 7 as val union all
select 1 as personid, 8 as val
)
select distinct t.personid, val, rank
from (select t.*,
(select COUNT(distinct val) from t t2 where t2.personid = t.personid and t2.val >= t.val
) as rank
from t
) t
where rank in (1, 2, 3);
Using GROUP_CONCAT and FIND_IN_SET you can do that.Check SQLFIDDLE.
SELECT *
FROM tbl t
WHERE FIND_IN_SET(t.value,(SELECT
SUBSTRING_INDEX(GROUP_CONCAT(t1.value ORDER BY VALUE DESC),',',3)
FROM tbl t1
WHERE t1.name = t.name
GROUP BY t1.name)) > 0
ORDER BY t.name,t.value desc
If your result set is not so heavy, you can write a stored procedure (or an anonymous PL/SQL-block) for that problem which iterates the result set and finds the bigges three by a simple comparing algorithm.
Try this -
CREATE TABLE #list ([name] [varchar](100) NOT NULL, [value] [int] NOT NULL)
INSERT INTO #list VALUES ('John', 20), ('John', 7), ('John', 7), ('John', 7), ('John', 4);
WITH cte
AS (
SELECT NAME
,value
,ROW_NUMBER() OVER (
PARTITION BY NAME ORDER BY (value) DESC
) RN
FROM #list
)
SELECT NAME
,value
FROM cte
WHERE RN < 4
ORDER BY value DESC
This works for MS SQL. Should be workable in any other SQL dialect that has the ability to assign row numbers in a group by or over clause (or equivelant)
if object_id('tempdb..#Data') is not null drop table #Data;
GO
create table #data (name varchar(25), value integer);
GO
set nocount on;
insert into #data values ('John', 20);
insert into #data values ('John', 7);
insert into #data values ('John', 7);
insert into #data values ('John', 7);
insert into #data values ('John', 5);
insert into #data values ('Jack', 5);
insert into #data values ('Jane', 30);
insert into #data values ('Jane', 21);
insert into #data values ('John', 5);
insert into #data values ('John', -1);
insert into #data values ('John', -1);
insert into #data values ('Jane', 18);
set nocount off;
GO
with D as (
SELECT
name
,Value
,row_number() over (partition by name order by value desc) rn
From
#Data
)
SELECT Name, Value
FROM D
WHERE RN <= 3
order by Name, Value Desc
Name Value
Jack 5
Jane 30
Jane 21
Jane 18
John 20
John 7
John 7

Select latest revision of each row in a table

I have table structures that include a composite primary key of id & revision where both are integers.
I need a query that will return the latest revision of each row. If I understood this answer correctly then the following would have worked on an Oracle DB.
SELECT Id, Title
FROM ( SELECT Id, Revision, MAX(Revision) OVER (PARTITION BY Id) LatestRevision FROM Task )
WHERE Revision = LatestRevision
I am using SQL Server (2005) and need a performant query to do the same.
I think this should work (I didn't test it)...
SELECT ID,
Title
FROM Task AS T
INNER JOIN
(
SELECT ID,
Max(Revision)
FROM Task
GROUP BY ID
) AS sub
ON T.ID = sub.ID
AND T.Revision = sub.Revision
See this post by ayende for an ealuation of the Best strategies.
I would try to create a subquery like this:
SELECT Id, Title
FROM Task T, (Select ID, Max(Revision) MaxRev from Task group by ID) LatestT
WHERE T.Revision = LatestT.MaxRev and T.ID = LatestT.ID
Another option is to "cheat" and create a trigger that will flag the revision as latest revision if one item is added.
Then add that field to the index. (I would link the table to insert only)
Also an index on ID, Revision desc could help the performance.
The query you posted will work in SQL 2005 (in compatibility mode 90) with the syntax errors corrected:
SELECT t1.Id, t1.Title
FROM ( SELECT Id, Revision, MAX(Revision) OVER (PARTITION BY Id) LatestRevision FROM Task ) AS x
JOIN Task as t1
ON t1.Revision = x.LatestRevision
AND t1.id = x.id
try this:
DECLARE #YourTable table(RowID int, Revision int, Title varchar(10))
INSERT INTO #YourTable VALUES (1,1,'A')
INSERT INTO #YourTable VALUES (2,1,'B')
INSERT INTO #YourTable VALUES (2,2,'BB')
INSERT INTO #YourTable VALUES (3,1,'C')
INSERT INTO #YourTable VALUES (4,1,'D')
INSERT INTO #YourTable VALUES (1,2,'AA')
INSERT INTO #YourTable VALUES (2,3,'BBB')
INSERT INTO #YourTable VALUES (5,1,'E')
INSERT INTO #YourTable VALUES (5,2,'EE')
INSERT INTO #YourTable VALUES (4,2,'DD')
INSERT INTO #YourTable VALUES (4,3,'DDD')
INSERT INTO #YourTable VALUES (6,1,'F')
;WITH YourTableRank AS
(
SELECT
RowID,Revision,Title, ROW_NUMBER() OVER(PARTITION BY RowID ORDER BY RowID,Revision DESC) AS Rank
FROM #YourTable
)
SELECT
RowID, Revision, Title
FROM YourTableRank
WHERE Rank=1
OUTPUT:
RowID Revision Title
----------- ----------- ----------
1 2 AA
2 3 BBB
3 1 C
4 3 DDD
5 2 EE
6 1 F
(6 row(s) affected)