Need to delete duplicate records from the table using row_number() - sql

I have a table test with the data shown below, and I want to delete the row with TrsId 124. This is just a sample scenario; my real database has millions of entries. The idea is to delete the duplicate entries from the table.
--------------------------------------------
TrsId | ID | Name |
--------------------------------------------
123 | 1 | ABC |
124 | 1 | ABC |
I am trying something like
delete from test
select T.* from
(
select ROW_NUMBER() over (partition by ID order by name) as r,
Trsid,
ID,
name
from test
) t
where r = 2
Even if I update the query which is Ok for me
update test set id=NULL
select T.* from
(
select ROW_NUMBER() over (partition by ID order by name) as r,
Trsid,
ID,
name
from test
) t
where r = 2
But when I run either of these queries, the delete removes all the records from table test, and the update changes both records.
I don't know what I am doing wrong here.

-- Number the rows inside each ID group by TrsId so the numbering is
-- deterministic: the lowest TrsId gets rn = 1 and is the row that survives.
-- Ordering by name cannot break ties between duplicates (they share the same
-- name), so which copy was kept would be arbitrary.
WITH cte AS
(
    SELECT ROW_NUMBER() OVER (PARTITION BY ID ORDER BY TrsId) AS rn
    FROM test
)
-- Deleting through the CTE removes the underlying rows of test (SQL Server).
DELETE FROM cte
WHERE rn > 1;

Use the below query.
-- Number the rows of each (ID, NAME) group by TrsId, then remove every row after the first.
;WITH cte_1 AS
(
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY ID, NAME ORDER BY TrsId) AS Rno
    FROM YourTable
)
DELETE FROM cte_1
WHERE Rno > 1

-- Template: replace every <...> placeholder with real column/table names before running.
WITH cte_DUP AS (
SELECT * FROM (
select <col1,col2,col3..coln>, row_number()
over(partition by <col1,col2,col3..coln>
order by <col1,col2,col3..coln> ) rownumber
from <your table> ) AB WHERE rownumber > 1)
-- The inner query already keeps only rownumber > 1, so this filter repeats that condition.
DELETE FROM cte_DUP WHERE ROWNUMBER > 1

To find duplicate records we can write like below query,
-- List every row that is a second-or-later occurrence of its (a, b) pair.
;WITH dup_val AS
(
    SELECT
        a,
        b,
        ROW_NUMBER() OVER (PARTITION BY a, b ORDER BY b, NAME) AS [RANK]
    FROM table_name
)
SELECT *
FROM dup_val
WHERE [RANK] > 1;

Related

Distinct particular field in select query

I have table with below sample values.
|Id|Keyword|insertedon|
|:-|:------|:---------|
|1 | abcd | 13/12/20 |
|2 | cdef | 14/12/20 |
|3 | abcd | 14/12/20 |
|4 | defg | 14/12/20 |
From the above table I need the distinct keyword values, ordered by insertedon in descending order.
I need the 5 most recent results.
Expected Result:
defg
abcd
cdef
Please let me know how to achieve this.
You get the top 5 results with TOP(5) in SQL Server. You'd order the keywords by their last insertedon date:
-- Up to five keywords, ordered by each keyword's most recent insertedon, newest first.
select top(5) keyword
from mytable
group by keyword
order by max(insertedon) desc;
If you are looking for latest entries based on insertedon column, you can find using the group by clause, something like this:
-- Latest insertedon per keyword, newest first.
-- "table" is a reserved word and cannot be used as a bare table name, so the
-- query reads from mytable; the sort key is named instead of given by position.
select keyword, max(insertedon) as last_insertedon
from mytable
group by keyword
order by last_insertedon desc
You can just use select distinct:
-- One row per distinct keyword value in t.
select distinct keyword
from t;
If you wanted a full row, you could use row_number():
-- One full row per keyword: rows are numbered within each keyword in newid()
-- order, and only the row numbered 1 is kept.
select t.*
from (select t.*,
row_number() over (partition by keyword order by newid()) as seqnum
from t
) t
where seqnum = 1;
EDIT:
For the edited version, you can use:
-- Takes the 5 newest rows first and then removes repeated keywords,
-- so fewer than 5 keywords can come back when those rows share a keyword.
select distinct keyword
from (select top (5) keyword
from t
order by insertedon desc
) k
Assign a row number within each keyword based on the descending order of the date column, and then select the rows with row number 1.
Query
-- Number each keyword's rows newest-first (id breaks ties on the same date),
-- then keep only the newest row per keyword.
-- The CTE must expose [keyword] and read from a table for the outer query to run.
;with cte as(
    select [keyword],
           [rn] = row_number() over(
               partition by [keyword]
               order by [insertedon] desc, [id] desc
           )
    from [mytable] -- replace with the actual table name
)
select [keyword] from cte
where [rn] = 1;
You can use the analytical functions as follows:
-- rn numbers each keyword's rows newest-first; dr ranks insertedon values newest-first
-- across the whole table. Keep the newest row per keyword within the first 5 ranks.
with ranked as (
    select
        src.*,
        row_number() over (partition by keyword order by insertedon desc) as rn,
        dense_rank() over (order by insertedon desc) as dr
    from t as src
)
select ranked.*
from ranked
where dr <= 5
  and rn = 1;

Group sequential repeated values sqlite

I have data that repeated sequentially..
A
A
A
B
B
B
A
A
A
I need to group them like this
A
B
A
What is the best approach to do so using sqlite?
Assuming that you have a column that defines the ordering of the rows, say id, you can address this gaps-and-island problem with window functions:
-- rn1 numbers all rows by id; rn2 numbers them by id within each col value.
-- Rows are grouped on col together with the difference rn1 - rn2.
with numbered as (
    select
        src.*,
        row_number() over (order by id) as rn1,
        row_number() over (partition by col order by id) as rn2
    from mytable as src
)
select
    col,
    count(*) as cnt,
    min(id) as first_id,
    max(id) as last_id
from numbered
group by col, rn1 - rn2
order by min(id)
I added a few columns to the resultset that give more information about the content of each group.
If you have defined a column that defines the order of the rows, like an id, you can use window function LEAD():
-- Keep a row only when the next row (by id) holds a different value;
-- the '' default is what the last row is compared against.
with paired as (
    select
        col,
        lead(col, 1, '') over (order by id) as next_col
    from tablename
)
select col
from paired
where next_col <> col
See the demo.
Results:
| col |
| --- |
| A |
| B |
| A |

Delete duplicated record

I have a table which contains a lot of duplicated rows like this:
id_emp id date ch_in ch_out
1 34103 2019-09-01
1 34193 2019-09-01 17:00
1 34194 2019-09-02 07:03:21 16:59:26
1 34104 2019-09-02 07:03:21 16:59:26
1 33361 2019-09-02 NULL NULL
I want to keep just one row for each date and delete the others based on a condition, so that the output is:
id_emp id date ch_in ch_out
1 34193 2019-09-01 17:00
1 34104 2019-09-02 07:03:21 16:59:26
I tried to use distinct but nothing working:
select distinct id_emp, id, date_1, ch_in,ch_out
from ch_inout
where id_emp=1 order by date_1 asc
And I tried too using this query to delete:
select *
from (
select *, rn=row_number() over (partition by date_1 order by id)
from ch_inout
) x
where rn > 1;
But nothing is working the result is empty.
You can use aggregation:
-- One row per id_emp and date: the largest id, the smallest ch_in and the largest ch_out.
select id_emp, max(id) as id, date, min(ch_in), max(ch_out)
from ch_inout
group by id_emp, date;
This returns the maximum id for each group of rows. That is not exactly what is returned in the question, but you don't specify the logic.
EDIT:
If you want to delete all but the largest id for each id_emp/date combination, you can use:
-- Delete every row whose id is lower than the largest id
-- found for the same id_emp and date.
delete c from ch_inout c
where id < (select max(c2.id)
from ch_inout c2
where c2.id_emp = c.id_emp and c2.date = c.date
);
You can use ROW_NUMBER() to identify the records you want to delete. Assuming that you want to keep the record with the lowest id on each date:
-- Rows to remove: every row after the first (lowest id) for each employee and date.
-- Partitioning by id_emp as well as date keeps one row per employee per day;
-- partitioning by date alone would also flag other employees' rows for that day.
SELECT *
FROM (
    SELECT
        t.*,
        ROW_NUMBER() OVER (PARTITION BY id_emp, date ORDER BY id) AS rn
    FROM ch_inout t
) x
WHERE rn > 1
You can easily turn this into a DELETE statement:
-- Same numbering as the preview query; deleting through the CTE removes the rows from ch_inout.
WITH cte AS (
    SELECT
        t.*,
        ROW_NUMBER() OVER (PARTITION BY id_emp, date ORDER BY id) AS rn
    FROM ch_inout t
)
DELETE FROM cte WHERE rn > 1

Modify record using stored procedure

I am new to stored procedures.
I have 4 million records, so I cannot do this manually and want to use a stored procedure.
I have a table like:
Id Name
-----------------
1 abc
2 xyz
3 abc
4 pqr
5 abc
6 pqr
In that table there is a field called Name. Some records in the Name column share the same name, so I want to modify the records to look like this:
Id Name
---------------------
1 abc
2 xyz
3 abc-1
4 pqr
5 abc-2
6 pqr-1
& Insert it into another table which have same schema.
You can do this using an updatable CTE:
-- Number the rows of each name by id; every row after the first gets "-<seqnum - 1>" appended.
with toupdate as (
    select
        src.*,
        row_number() over (partition by name order by id) as seqnum
    from onetable as src
)
update toupdate
set name = name + '-' + cast(seqnum - 1 as varchar(255))
where seqnum >= 2;
Actually, that updates it in place. To put this into another table:
-- Write id plus the (possibly suffixed) name into newtable; the first row of
-- each name keeps its name, later rows get "-<seqnum - 1>" appended.
with toinsert as (
    select
        src.*,
        row_number() over (partition by name order by id) as seqnum
    from onetable as src
)
select
    id,
    case
        when seqnum > 1 then name + '-' + cast(seqnum - 1 as varchar(255))
        else name
    end as name
into newtable
from toinsert;
This will update the table
-- Append "-<n>" to every repeated Name: the first occurrence (lowest Id) is left
-- unchanged, the second becomes Name-1, the third Name-2, and so on.
;WITH cte AS
(
    SELECT id,
           ROW_NUMBER() OVER (PARTITION BY Name ORDER BY Id) AS rno
    FROM table1
)
UPDATE t
-- rno is a bigint, so it is cast to a string before concatenation;
-- rno - 1 makes the first duplicate "-1" rather than "-2".
SET t.Name = t.Name + '-' + CAST(c.rno - 1 AS varchar(20))
FROM table1 t
INNER JOIN cte c ON c.id = t.id
WHERE c.rno > 1
To insert simply use select with charindex
-- Copies into Table2 only the rows of table1 whose name has its first '-'
-- somewhere after the first character.
select * into Table2 from table1
where CHARINDEX('-',name) > 1

Any other alternative to write this SQL query

I need to select data based upon three conditions:
Find the latest date (StorageDate column) in the table for each record.
See if there is more than one entry for the date (StorageDate column) found in the first step for the same ID (ID column),
and then see if DuplicateTypeID = 2.
So if table has following data:
ID |StorageDate | DuplicateTypeID
1 |2014-10-22 | 1
1 |2014-10-22 | 2
1 |2014-10-18 | 1
2 |2014-10-12 | 1
3 |2014-10-11 | 1
4 |2014-09-02 | 1
4 |2014-09-02 | 2
Then I should get following results
ID
1
4
I have written following query but it is really slow, I was wondering if anyone has better way to write it.
SELECT DISTINCT(TD.RecordID)
FROM dbo.MyTable TD
JOIN (
SELECT T1.RecordID, T2.MaxDate,COUNT(*) AS RecordCount
FROM MyTable T1 WITH (nolock)
JOIN (
SELECT RecordID, MAX(StorageDate) AS MaxDate
FROM MyTable WITH (nolock)
GROUP BY RecordID)T2
ON T1.RecordID = T2.RecordID AND T1.StorageDate = T2.MaxDate
GROUP BY T1.RecordID, T2.MaxDate
HAVING COUNT(*) > 1
)PT ON TD.RecordID = PT.RecordID AND TD.StorageDate = PT.MaxDate
WHERE TD.DuplicateTypeID = 2
Try this and see how the performance goes:
-- For every row, record its rank by StorageDate (newest first) within its ID and
-- how many rows share its ID and StorageDate; then report the IDs that qualify.
;WITH
tmp AS
(
    SELECT
        ID,
        DuplicateTypeID,
        RANK() OVER (PARTITION BY ID ORDER BY StorageDate DESC) AS StorageDateRank,
        COUNT(ID) OVER (PARTITION BY ID, StorageDate) AS StorageDateCount
    FROM MyTable
)
SELECT ID
FROM tmp
WHERE DuplicateTypeID = 2     -- the row itself has DuplicateTypeID 2
  AND StorageDateRank = 1     -- the row is on the latest date for its ID
  AND StorageDateCount > 1    -- that date has more than one entry
GROUP BY ID
You can use analytic function rank , can you try this query ?
-- Keep only the rows on each recordId's latest StorageDate, then report the
-- recordIds that have more than one such row and at least one with duplicatetypeid 2.
with latest as (
    select
        recordId,
        duplicatetypeid,
        rank() over (partition by recordId order by [StorageDate] desc) as rn
    from mytable
)
select recordId
from latest
where rn = 1
group by recordId
having count(*) > 1
   and max(case when duplicatetypeid = 2 then 1 else 0 end) = 1