How to select both row_number and count over partition? - sql

I need to find duplicate records (with the master record ID and the duplicate record IDs):
select ciid, name from (
select ciid, name, row_number() over (
partition by related_id, name order by updatedate desc) rn
) where rn = 1;
This gives me the master record IDs, but it also includes records without duplicates.
If I use
select ciid, name from (
select ciid, name, row_number() over (
partition by related_id, name order by updatedate desc) rn
) where rn > 1;
This gets me all the duplicate records, but not the master record.
I was hoping I could do something like:
select ciid, name from (
select ciid, name, row_number() over (
partition by related_id, name order by updatedate desc
) rn, count(*) over (
partition by related_id, name order by updatedate desc
) cnt
) where rn = 1 and cnt > 1;
But I am worried about the performance, and I am not even sure it actually does what I want.
How do I get the master record only for the ones with duplicates? Please note that name is not unique column. Only ciid is unique.

I ended up using similar query in my question:
-- Master record (most recently updated row) of every (related_id, name)
-- group that contains more than one row.
select ciid, name
from (
    select
        ciid,
        name,
        -- rn = 1 marks the most recently updated row of each group
        row_number() over (
            partition by related_id, name
            order by updatedate desc
        ) rn,
        -- no ORDER BY here: with one, count(*) becomes a running count
        -- instead of the size of the whole partition
        count(*) over (
            partition by related_id, name
        ) cnt
    from yourtable -- placeholder: the original snippet omitted the source table
)
where rn = 1 and cnt > 1;
Works surprisingly well. The master record is where rn = 1 and duplicates are where rn > 1. Make sure the count(*) over (partition ..) does not have an order by clause.

I haven't tested this (because I don't have real data and am too lazy to create some), but it seems something along these lines might work:
-- Keys that occur more than once, i.e. the groups that actually have duplicates.
with has_duplicates as (
    select related_id, name
    from yourtable
    group by related_id, name
    having count(*) > 1
),
-- Rows of the duplicated groups, numbered newest-first within each group.
with_dupes as (
    select
        y.ciid, -- was "ccid"; the column is called ciid in the question
        y.name,
        row_number() over (
            partition by y.related_id, y.name
            order by y.updatedate desc
        ) rn
    from yourtable y
    inner join has_duplicates d
        on y.related_id = d.related_id
        and y.name = d.name
)
-- rn = 1 is the most recently updated row, i.e. the master record.
select ciid, name
from with_dupes
where rn = 1

-- Ranks the rows of each (related_id, name) partition newest-first, then
-- groups the ranked rows by (ciid, name) and keeps groups with more than one
-- distinct rank.
-- NOTE(review): the question states that ciid is unique, so each (ciid, name)
-- group would hold a single row and count(distinct rn) would be 1 — confirm
-- that this query can return any rows.
select ciid, name
from (
select ciid, name,
dense_rank() over (partition by related_id, name order by updatedate desc) rn
from tablename) t
group by ciid,name
having count(distinct rn) > 1;
Edit: To find duplicates, why not just do this.
-- Latest row of every (name, related_id) group that has more than one row.
select x.ciid, x.name, x.updatedate
from tablename x
inner join (
    select
        name,
        related_id,
        max(updatedate) as mxdt,
        count(*) as cnt -- aliased: some engines reject unnamed derived-table columns
    from tablename
    group by name, related_id
    having count(*) > 1
) t
    on x.name = t.name
    -- groups are keyed by (name, related_id); without this predicate a row
    -- could match the max date of a different related_id with the same name
    and x.related_id = t.related_id
    and x.updatedate = t.mxdt
You can do a group by with having to select only those id's having more than one row with the same row number.

Related

how to update all rows except first?

I have this sql query :
update CCUSTOMERINFO set VALIDTO=sysdate where (
select * from (
select row_number() over (order by created desc) rn, customer_id, CCUSTOMERINFO.VALIDTO
from CCUSTOMERINFO
where customer_id=100309772 order by created DESC) where rn > 1);
But it says there is a mistake in it.
This query returns all i want to update :
select * from (
select row_number() over (order by created desc) rn, customer_id, CCUSTOMERINFO.VALIDTO
from CCUSTOMERINFO
where customer_id=100309772 order by created DESC) where rn > 1)
Any suggestion how can i do that?
Use it like the below
-- Set VALIDTO to sysdate on every CCUSTOMERINFO row of customer 100309772
-- except the one numbered first when ordered by created descending.
update CCUSTOMERINFO
set VALIDTO = sysdate
where rowid in (
    select ranked.row_id
    from (
        select
            row_number() over (order by created desc) rn,
            customer_id,
            rowid row_id, -- carried out of the subquery so the outer IN can match on it
            CCUSTOMERINFO.VALIDTO
        from CCUSTOMERINFO
        where customer_id = 100309772
        order by created desc
    ) ranked
    where ranked.rn > 1
);

Select most recent status for each ID and department code

I have the following table:
I want to get the most recent status for each dept_code that a CL_ID has. So the desired output would be this:
I have tried the following, but this gives me just the most recent status for each client and not for each of their dept_codes.
SELECT *
FROM [CIMSHR6_MERGED].[dbo].[C3CLSTAT] C
INNER JOIN
(SELECT CLIENT_NUMBER, MAX(STATUS_DATE) AS SDATE
FROM [CIMSHR6_MERGED].[dbo].[C3CLSTAT]
GROUP BY CLIENT_NUMBER) X
ON X.CLIENT_NUMBER = C.CLIENT_NUMBER
AND X.SDATE = C.STATUS_DATE
ORDER BY C.CLIENT_NUMBER
Any help would be much appreciated. Thanks.
A convenient method that works in SQL Server is:
-- Latest status row for each (cl_id, dept_code) pair.
-- WITH TIES is what makes this work: every row whose row_number() is 1 ties
-- for first place in the ORDER BY, so TOP (1) WITH TIES returns one row per
-- partition; a plain TOP (1) would return a single row for the whole table.
select top (1) with ties cl.*
from [CIMSHR6_MERGED].[dbo].[C3CLSTAT] cl
order by row_number() over (partition by cl_id, dept_code order by status_date desc);
A method that is efficient with the right indexes in almost any database is:
-- Keep a row only when its status_date equals the newest status_date found
-- for the same (cl_id, dept_code) pair.
SELECT cl.*
FROM [CIMSHR6_MERGED].[dbo].[C3CLSTAT] AS cl
WHERE cl.status_date = (
    SELECT MAX(newest.status_date)
    FROM [CIMSHR6_MERGED].[dbo].[C3CLSTAT] AS newest
    WHERE newest.cl_id = cl.cl_id
      AND newest.dept_code = cl.dept_code
);
The right index is on (cl_id, dept_code, status_date).
I would also use ROW_NUMBER, but with a subquery:
-- Latest status row per (CL_ID, Dept_code): number the rows of each pair
-- newest-first and keep the first.
SELECT CL_ID, Status_date, Status, Dept_code
FROM
(
    SELECT *,
        ROW_NUMBER() OVER (PARTITION BY CL_ID, Dept_code ORDER BY Status_date DESC) rn
    FROM [CIMSHR6_MERGED].[dbo].[C3CLSTAT] -- opening bracket was missing
) t
WHERE rn = 1;
1) First, partition the rows by Dept_Code and CL_ID and assign a rank to each row within its partition, ordered by Status_Date descending.
2) Select all the rows with rnk=1 which would display your desired result.
-- Latest status row per (Dept_Code, CL_ID): rank each pair's rows by
-- Status_Date, newest first, and keep rank 1. RANK() gives tied rows the same
-- rank, so rows sharing the newest Status_Date are all returned.
SELECT Z.CL_ID,
       Z.Status_Date,
       Z.Status,
       Z.Dept_Code
FROM
(
    SELECT *,
        -- the stray comma before ORDER BY was a syntax error
        RANK() OVER (PARTITION BY Dept_Code, CL_ID ORDER BY Status_Date DESC) AS rnk
    FROM [CIMSHR6_MERGED].[dbo].[C3CLSTAT]
) Z
WHERE Z.rnk = 1;
This would work for almost all databases
-- Keep each row whose status_date equals the maximum status_date among the
-- rows sharing its cl_id and dept_code.
SELECT *
FROM c3clstat c
WHERE EXISTS (
    SELECT 1
    FROM c3clstat c1
    WHERE c1.cl_id = c.cl_id
      AND c1.dept_code = c.dept_code
    GROUP BY cl_id, dept_code
    -- compares the outer row's date with the aggregate of the matching rows
    HAVING c.status_date = MAX(c1.status_date)
)

T-SQL : return latest value of each ID

I have a table that looks like this:
ID, name, Likes, Login_time
select * from mytbl
I want to filter this table:
distinct ID, name like, login_time(last login time)
I tried this query, but it didn't work.
select *
from
(select
name, likes, login_time
rank() over (partition by id order by login_time desc) as rank
from
mytbl) t
where
t.rank = 1
use row_number instead of rank
-- One row per id: number each id's rows by login_time, newest first, and keep
-- the first. The numbering column keeps the alias "rank" used by the outer filter.
SELECT *
FROM (
    SELECT
        id,
        name,
        likes,
        login_time,
        ROW_NUMBER() OVER (PARTITION BY id ORDER BY login_time DESC) AS rank
    FROM mytbl
) t
WHERE t.rank = 1
Try this
-- Latest login row per Id: number each Id's rows newest-first, keep the first.
;WITH temps AS
(
    SELECT Id, name, likes, login_time,
        ROW_NUMBER() OVER (PARTITION BY Id ORDER BY login_time DESC) AS RowIndex
    FROM mytbl -- the CTE had no FROM clause; mytbl is the table named in the question
)
SELECT Id, name, likes, login_time
FROM temps
WHERE RowIndex = 1

T-SQL select into temp table does not have correct result

-- This Result Is Right and work correctly:
SELECT AutoId, Name,[Group],[Priority], SUMCalculatedPercent
FROM
(SELECT DISTINCT *,
ROW_NUMBER() OVER
(
PARTITION BY [Group] ORDER BY SUMCalculatedPercent DESC,[Priority]
)
AS ranker
FROM #GroupMasterNameChoose
)Z
WHERE ranker = 1
ORDER BY Z.SUMCalculatedPercent DESC,Z.[Priority]
-- This Result Is Wrong :
SELECT AutoId, Name,[Group],[Priority], SUMCalculatedPercent
INTO #GroupOwner
FROM
(SELECT DISTINCT *,
ROW_NUMBER() OVER
(
PARTITION BY [Group] ORDER BY SUMCalculatedPercent DESC,[Priority]
)
AS ranker
FROM #GroupMasterNameChoose
)Z
WHERE ranker = 1
ORDER BY Z.SUMCalculatedPercent DESC,Z.[Priority]
--
Problem : I Need Store My Right Result To Temp Table
Remove ORDER BY Z.SUMCalculatedPercent DESC,Z.[Priority]

How to perform reference a window function inside current table?

I have this part in a larger query which consume lot of RAM:
TopPerPost as
(
select Id,
CloseReasonTypeId,
Name,
ReasonsPerPost.TotalByCloseReason,
row_number() over(partition by Id order by TotalByCloseReason desc) seq -- Get the most common Id (The most common close Reason)
from ReasonsPerPost
where Name is NOT NULL and TopPerPost.seq=1 -- Remove useless results here, instead of doing it later
)
but I got The multi-part identifier "TopPerPost.seq" could not be bound.
Last detail... I only use the Name column in a later INNER JOIN of that table.
You can't reference a window function in the where of the same query. Just create a second cte.
WITH TopPerPost AS (
    -- Number the rows of each Id, highest TotalByCloseReason first.
    SELECT
        Id,
        CloseReasonTypeId,
        Name,
        ReasonsPerPost.TotalByCloseReason,
        ROW_NUMBER() OVER (PARTITION BY Id ORDER BY TotalByCloseReason DESC) AS seq
    FROM ReasonsPerPost
    WHERE Name IS NOT NULL
),
OnlyTheTop AS (
    -- seq is an ordinary column of TopPerPost here, so it can be filtered on.
    SELECT *
    FROM TopPerPost
    WHERE seq = 1
)
Or you can do it like this.
-- Same filter using a derived table: compute seq inside, filter on it outside.
SELECT *
FROM (
    SELECT
        Id,
        CloseReasonTypeId,
        Name,
        ReasonsPerPost.TotalByCloseReason,
        -- seq = 1 is the row with the highest TotalByCloseReason for its Id
        ROW_NUMBER() OVER (PARTITION BY Id ORDER BY TotalByCloseReason DESC) AS seq
    FROM ReasonsPerPost
    WHERE Name IS NOT NULL
) s
WHERE seq = 1
Here is another option that should eliminate the need for so many rows being returned.
-- For every named row, look up the highest TotalByCloseReason recorded for
-- the same Id via a correlated CROSS APPLY.
SELECT
    rpp.Id,
    rpp.CloseReasonTypeId,
    rpp.Name,
    s.TotalByCloseReason
FROM ReasonsPerPost rpp
CROSS APPLY (
    SELECT TOP 1 rpp2.TotalByCloseReason
    FROM ReasonsPerPost rpp2
    WHERE rpp2.Id = rpp.Id
    ORDER BY rpp2.TotalByCloseReason DESC
) s
WHERE rpp.Name IS NOT NULL
Attempt #4...this would be a LOT easier with a sql fiddle to work with.
-- For every named row, attach the highest TotalByCloseReason among the named
-- rows of the same Id.
-- The original derived table used TOP 1 with a reference to rpp.Id, which a
-- plain joined derived table cannot see, and it did not expose the Id column
-- used in the ON clause. Grouping by Id and taking MAX() yields the same
-- per-Id value while keeping the subquery self-contained.
select rpp.Id,
       rpp.CloseReasonTypeId,
       rpp.Name,
       s.TotalByCloseReason
from ReasonsPerPost rpp
inner join
(
    select Id,
           max(TotalByCloseReason) as TotalByCloseReason
    from ReasonsPerPost
    where Name is NOT NULL
    group by Id
) s on s.Id = rpp.Id
where rpp.Name is NOT NULL
The below might work for your need.
But without looking at the data, it is hard to tell whether it will or not.
-- t: highest totalbyclosereason per id among rows that have a name.
;WITH t AS
(
    SELECT
        Id,
        MAX(totalbyclosereason) TC
    FROM reasonsperpost
    WHERE name IS NOT NULL
    GROUP BY id
)
-- Join back to the base table to pick up the columns of the row(s) holding that maximum.
SELECT
    T.id,
    t.tc,
    c.closereasontypeid,
    c.name
FROM t
JOIN reasonsperpost c
    ON t.id = c.id
    AND t.tc = c.totalbyclosereason