How to remove duplicate rows using CTE? - sql

I would like to remove duplicate rows from my table, so I used the ROW_NUMBER() function to find the duplicate values. After that I wanted to add a WHERE clause, so I modified my query to use a CTE, but it gives me an error:
ORA-00928: missing SELECT keyword
This is the query which runs successfully for my use case :
-- Number the rows within each group of identical
-- (parcelid, propertyaddress, saledate, saleprice, legalreference) values,
-- lowest id first, and return every row together with its number.
WITH RowNumCTE AS (
    SELECT
        hd.id,
        hd.parcelid,
        hd.propertyaddress,
        hd.saledate,
        hd.saleprice,
        hd.legalreference,
        ROW_NUMBER() OVER (
            PARTITION BY
                hd.parcelid,
                hd.propertyaddress,
                hd.saledate,
                hd.saleprice,
                hd.legalreference
            ORDER BY hd.id
        ) AS rn
    FROM housedata hd
)
SELECT *
FROM RowNumCTE

To delete duplicates:
-- LEAD(rowid) yields, for each row, the rowid of the next row in its
-- partition (ordered by id). Collected over the whole table this is every
-- row except the first of each partition, i.e. the duplicates.
DELETE FROM housedata
WHERE rowid IN (
    SELECT
        LEAD(rowid) OVER (
            PARTITION BY parcelid, propertyaddress, saledate, saleprice, legalreference
            ORDER BY id
        )
    FROM housedata
);
To delete duplicates using a CTE:
-- Delete every row whose id is ranked 2nd or later inside its group of
-- identical (parcelid, propertyaddress, saledate, saleprice, legalreference).
DELETE FROM housedata
WHERE id IN (
    WITH numbered AS (
        SELECT
            id,
            ROW_NUMBER() OVER (
                PARTITION BY parcelid, propertyaddress, saledate, saleprice, legalreference
                ORDER BY id
            ) AS rn
        FROM housedata
    )
    SELECT id
    FROM numbered
    WHERE rn > 1
);

I think you need
BEGIN
    -- Remove duplicates with one set-based DELETE instead of a cursor loop
    -- that issues a separate DELETE per duplicate row.
    -- Rows are grouped by the five columns below; within each group the row
    -- with the lowest id gets rn = 1 and is kept, all others are deleted.
    -- Assumes id uniquely identifies a row (primary key).
    DELETE FROM housedata
    WHERE id IN (
        SELECT id
        FROM (
            SELECT
                h.id,
                ROW_NUMBER() OVER (
                    PARTITION BY h.parcelid, h.propertyaddress, h.saledate, h.saleprice, h.legalreference
                    ORDER BY h.id
                ) AS rn
            FROM housedata h
        )
        WHERE rn > 1
    );
    COMMIT;
END;
/
considering those five columns compose the grouping criteria for the desired deletion and id is a primary key column.

Related

Finding duplicate values in a table where all the columns are not the same

I am working with a set of data in a table.
For simplicity i have the table like below with some sample data:
Some of the data in this table came from a different source, such data are the ones that have cqmRecordID != null
I need to find duplicate values in this table and delete the duplicate ones that came over from the other source (ones with a cqmRecordID)
A record is considered duplicate if they have the same values for these cols:
[Name]
Cast([CreatedDate] as Date)
[CreatedBy]
So in the sample data i have above, record #5 and record #6 would be considered duplicates.
As solutions I came up with these two queries:
Query #1:
-- List imported rows (cqmrecordid is not null) that duplicate another row
-- with the same name, creation date and creator.
-- Ordering by cqmrecordid puts rows with a NULL cqmrecordid first in
-- SQL Server, so a native row (if any) gets rownum = 1 and imported
-- duplicates get rownum > 1. Without the rownum filter every imported row
-- would be returned, duplicate or not.
select *
from (
    select
        recordid,
        cqmrecordid,
        row_number() over (
            partition by name, cast(createddate as date), createdby
            order by cqmrecordid, recordid
        ) as rownum
    from vmsNCR
) A
where cqmrecordid is not null
  and rownum > 1
order by recordid
Query #2:
-- Pair every imported row (cqmRecordID is not null) with each other row
-- that shares its name, creation date and creator.
SELECT
    imported.recordID,
    imported.cqmRecordID,
    other.RecordID,
    other.cqmRecordID
FROM vmsNCR AS imported
INNER JOIN vmsNCR AS other
    ON  imported.Name = other.Name
    AND CAST(imported.CreatedDate AS date) = CAST(other.CreatedDate AS date)
    AND imported.CreatedBy = other.CreatedBy
    AND imported.RecordID <> other.RecordID
    AND imported.cqmRecordID IS NOT NULL
ORDER BY imported.RecordID
Is there a better approach to this? Is one better than the other performance wise?
If you want to fetch all the rows without duplicates, then:
-- Keep one row per (name, creation date, createdby); rows that have a
-- cqmRecordId are preferred over rows that do not.
with ranked as (
    select
        t.*,
        row_number() over (
            partition by name, cast(createddate as date), createdby
            order by (case when cqmRecordId is not null then 1 else 2 end)
        ) as seqnum
    from t
)
select ranked.* -- or all columns except seqnum
from ranked
where seqnum = 1;
If you want performance, create two persisted computed columns and then an index on them:
-- 0 when cqmRecordId is null, 1 otherwise; persisted so it can be indexed.
alter table t
    add cqmRecordId_flag as (case when cqmRecordId is null then 0 else 1 end) persisted;

-- Date part of createddate; persisted so it can be indexed.
alter table t
    add createddate_date as (cast(createddate as date)) persisted;
And then an index:
-- Index on the duplicate-grouping columns, with the flag descending so rows
-- that have a cqmRecordId come first inside each group.
create index idx_t_4
    on t (
        name,
        createddate_date,
        createdby,
        cqmRecordId_flag desc
    );
EDIT:
If you actually just want to delete the NULL values from the table, you can use:
-- Delete rows without a cqmRecordId when a row with a cqmRecordId exists
-- for the same name, date and creator.
delete t
from t
where t.cqmRecordId is null
  and exists (
        select 1
        from t keeper
        where keeper.cqmRecordId is not null
          and keeper.name = t.name
          and convert(date, keeper.createddate_date) = convert(date, t.createddate_date)
          and keeper.createdby = t.createdby
      );
You can use the same logic with select to just select the duplicates.
Try below Query it might work for You
-- Number rows within each (Name, creation date, CreatedBy) group by RecordId
-- and delete all but the first one. The CTE needs a FROM clause naming the
-- table being de-duplicated (vmsNCR); without it the statement does not parse.
;WITH TestCTE
AS
(
    SELECT *,
           ROW_NUMBER() OVER(
               PARTITION BY [Name], Cast([CreatedDate] as Date), [CreatedBy]
               ORDER BY RecordId
           ) AS RowNumber
    FROM vmsNCR
)
DELETE FROM TestCTE
WHERE RowNumber > 1
Use the below code to eliminate duplicates
-- Rank rows within each (Name, creation date, CreatedBy) group by cqmRecordId
-- and delete everything except rank 1. In SQL Server NULLs sort first in
-- ascending order, so a row without a cqmRecordId is kept when one exists.
-- The CTE needs a FROM clause naming the table being de-duplicated (vmsNCR);
-- without it the statement does not parse.
;WITH CTE
AS
(
    SELECT ROW_NUMBER() OVER(
               PARTITION BY [Name], Cast([CreatedDate] as Date), [CreatedBy]
               ORDER BY cqmRecordId
           ) AS Rnk
           ,*
    FROM vmsNCR
)
DELETE FROM CTE
WHERE Rnk <> 1

get only row that meet condition if such row exist and if not get the row that meet another condition

This sounds like a simple question, but I just can't find the right way.
given the simplified table
-- Sample data: two orders; is_restore is 1 for order types 5 and 21,
-- rn numbers the rows by orderdate.
WITH t AS (
    SELECT
        ordernumber,
        orderdate,
        CASE WHEN ordertype IN (5,21) THEN 1 ELSE 0 END is_restore,
        ordertype,
        ROW_NUMBER() OVER (ORDER BY orderdate) rn
    FROM (
        SELECT to_date('29.08.08','DD.MM.YY') orderdate, '313' ordernumber, 1 AS ordertype FROM dual
        UNION ALL
        SELECT to_date('13.03.15','DD.MM.YY') orderdate, '90/4/2' ordernumber, 5 AS ordertype FROM dual
    )
)
SELECT * FROM t -- where clause should be here
for every row is_restore guaranteed to be 1 or 0.
if table has a row where is_restore=1 then select ordernumber,orderdate of that row and nothing else.
If a table does not have a row where is_restore=1 then select ordernumber,orderdate of the row where rn=1(row where rn=1 is guaranteed to exist in a table)
Given the requirements above what do I need to put in where clause to get the following?
You could use ROW_NUMBER:
-- Materialise the sample data: is_restore is 1 for order types 5 and 21,
-- rn numbers the rows by orderdate.
CREATE TABLE t AS
SELECT
    ordernumber,
    orderdate,
    CASE WHEN ordertype IN (5,21) THEN 1 ELSE 0 END is_restore,
    ordertype,
    ROW_NUMBER() OVER (ORDER BY orderdate) rn
FROM (
    SELECT to_date('29.08.08','DD.MM.YY') orderdate, '313' ordernumber, 1 AS ordertype
    FROM dual
    UNION ALL
    SELECT to_date('13.03.15','DD.MM.YY') orderdate, '90/4/2' ordernumber, 5 AS ordertype
    FROM dual
);
-------------------
-- Rank rows so that is_restore = 1 rows come first and ties are broken by rn;
-- the top-ranked row is the restore row if one exists, otherwise the rn = 1 row.
WITH ranked AS (
    SELECT
        t.*,
        ROW_NUMBER() OVER (
            /*PARTITION BY ...*/
            ORDER BY is_restore DESC, rn
        ) AS rnk
    FROM t
)
SELECT *
FROM ranked
WHERE rnk = 1;
db<>fiddle demo
Here is SQL that doesn't use window functions; it may be useful for those whose databases don't support OVER ( ... ), or when the query is based on indexed fields.
-- Return the is_restore = 1 rows; if the table has none, return the rn = 1 row.
-- The subquery is uncorrelated: it only checks whether any restore row exists.
SELECT *
FROM t cur
WHERE cur.is_restore = 1
   OR (
        cur.rn = 1
        AND NOT EXISTS (
            SELECT 1
            FROM t any_row
            WHERE any_row.is_restore = 1
        )
      )

Oracle: select values only from row with min(id)

-- Template from the question: fetch the amount of the assessment with the
-- smallest assessmentId. The select-list items are separated by a comma and
-- the scalar subquery is closed, so the statement parses once the join
-- placeholders are filled in.
SELECT
    ass.assessmentAmount, -- want to fetch assessmentAmount of min(ass.assessmentId)
    ass.assessmentId
FROM
    --bunch of joins
WHERE
    ass.assessmentId = (
        SELECT min(ass2.assessmentId)
        FROM Assessment ass2
        --same bunch of joins
    )
It looks very confusing because I have 6 joins with conditions and I don't want to repeat it two times. Is there another way of doing this?
Use the MIN( ass.assessmentId ) OVER () analytic function:
-- MIN(...) OVER () attaches the overall minimum assessmentId to every row,
-- so the outer filter keeps only the row(s) holding that minimum.
SELECT *
FROM (
    SELECT
        ass.assessmentAmount,
        ass.assessmentId,
        MIN(ass.assessmentId) OVER () AS min_assessmentId
    FROM --bunch of joins
)
WHERE assessmentId = min_assessmentId;
You can also use RANK():
-- RANK() gives rank 1 to every row tied for the smallest assessmentId.
SELECT *
FROM (
    SELECT
        ass.assessmentAmount,
        ass.assessmentId,
        RANK() OVER (ORDER BY ass.assessmentId) AS rnk
    FROM --bunch of joins
)
WHERE rnk = 1;
If assessmentId is UNIQUE and can only have a single row as a minimum then you could replace RANK with ROW_NUMBER; however, you could also then get the desired result using the ROWNUM pseudocolumn:
-- The inline view is sorted first; ROWNUM is assigned to its rows afterwards,
-- so ROWNUM = 1 picks the row with the smallest assessmentId.
SELECT *
FROM (
    SELECT
        ass.assessmentAmount,
        ass.assessmentId
    FROM --bunch of joins
    ORDER BY ass.assessmentId ASC
)
WHERE ROWNUM = 1;
Use a CTE with a row_number
-- Number the joined rows by ascending assessmentId and keep the first one.
WITH CTE AS (
    SELECT
        assessmentId,
        assessmentAmount,
        ROW_NUMBER() OVER (ORDER BY assessmentid ASC) AS rn
    FROM --bunch of joins
)
SELECT *
FROM CTE
WHERE rn = 1

Insert Result Query Into Table

I have problem with an insert query in Postgresql.
I have query like this :
-- One row per id from lookup_temp: rows are numbered inside each id group
-- and only number 1 is returned (plus the helper row_number column).
SELECT *
FROM (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY id ORDER BY id) AS row_number
    FROM lookup_temp
) AS numbered
WHERE row_number = 1
and I want to insert the result to table lookup_temp.
How can I do this?
I am assuming that you want lookup_temp to end up with only one row per id (because your select uses `select *, row_number() over (partition by id order by id) as row_number from lookup_temp`) and that the target is the same table, lookup_temp. If so, the query below is enough for you.
-- Remove every row of lookup_temp except one per id, identifying the
-- surplus rows by their physical row location (ctid).
DELETE FROM lookup_temp
WHERE ctid IN (
    SELECT ctid
    FROM (
        SELECT
            ctid,
            ROW_NUMBER() OVER (PARTITION BY id ORDER BY id) AS row_number
        FROM lookup_temp
    ) AS numbered
    WHERE row_number <> 1
)
ctid
The physical location of the row version within its table. Note that
although the ctid can be used to locate the row version very quickly,
a row's ctid will change if it is updated or moved by VACUUM FULL.
Therefore ctid is useless as a long-term row identifier. The OID, or
even better a user-defined serial number, should be used to identify
logical rows.
You can perform an INSERT from SELECT to get the result into the lookup_temp table
-- INSERT ... SELECT: the query result is inserted directly; a row-returning
-- query cannot be wrapped in VALUES ( ... ).
-- id, c1, c2 are placeholders for the real column list of lookup_temp. The
-- columns are listed explicitly so that the helper row_number column is not
-- part of the inserted rows.
INSERT INTO lookup_temp (id, c1, c2)
SELECT id, c1, c2
FROM (
    SELECT *,
           row_number() OVER (PARTITION BY id ORDER BY id) AS row_number
    FROM lookup_temp
) AS rows
WHERE row_number = 1
Your query can be simpler with distinct on
-- Insert one row per id (as chosen by DISTINCT ON) back into lookup_temp.
INSERT INTO lookup_temp
SELECT DISTINCT ON (id)
    *
FROM lookup_temp
If you are inserting into another table specify the columns
-- Template: copy one row per id into another table, naming the columns on
-- both sides (c1, c2... stand for the remaining columns).
INSERT INTO another_table (id, c1, c2...)
SELECT DISTINCT ON (id)
    id, c1, c2...
FROM lookup_temp
http://www.postgresql.org/docs/current/static/sql-select.html#SQL-DISTINCT
-- SELECT ... INTO creates its target table, so the target must not already
-- exist; selecting into lookup_temp itself fails because that table is the
-- source. Write the de-duplicated rows to a new table instead.
-- id, c1, c2 are placeholders for the real columns of lookup_temp; listing
-- them keeps the helper row_number column out of the new table.
select id, c1, c2
INTO lookup_temp_dedup
from (
    select *, row_number() over (partition by id order by id) as row_number
    from lookup_temp
) as rows
where row_number = 1

SQL query: how to distinct count of a column group by another column

In my table I need to know if each ID has one and only one ID_name. How can I write such query?
I tried:
-- IDs that have more than one distinct ID_name. The aggregate is repeated in
-- HAVING because most engines do not allow a select-list alias there.
-- `table` is a placeholder for the real table name.
select ID, count(distinct ID_name) as count_name
from table
group by ID
having count(distinct ID_name) > 1
But it takes forever to run.
Any thoughts?
-- IDs associated with more than one distinct ID_name.
SELECT ID
FROM YourTable
GROUP BY ID
HAVING COUNT(DISTINCT ID_name) > 1
or
-- Rows for which another row with the same ID but a different ID_Name exists.
SELECT *
FROM YourTable yt1
WHERE EXISTS (
    SELECT *
    FROM YourTable yt2
    WHERE yt2.ID = yt1.ID
      AND yt2.ID_Name <> yt1.ID_Name
)
Now, most ID columns are defined as primary key and are unique. So in a regular database you'd expect both queries to return an empty set.
-- Number of distinct ID_name values per ID.
-- DENSE_RANK over the names of one ID gives equal names the same rank, so
-- the highest rank equals the number of distinct names. ROW_NUMBER
-- partitioned by (ID, ID_name) would instead count rows per name.
-- Note: a NULL ID_name is ranked as a value of its own.
select tt.ID, max(tt.myRank)
from
(
    select
        ip.ID, ip.ID_name,
        dense_rank() over (partition by ip.ID order by ip.ID_name) as myRank
    from YourTable ip
) tt
group by tt.ID
This gives you every ID with its total number of ID_Name values.
If you want only those ID's which have more than one name associated just add a where clause
e.g.
-- IDs that have more than one distinct ID_name, with the number of names.
-- DENSE_RANK over the names of one ID gives equal names the same rank, so a
-- rank above 1 only occurs when the ID has a second distinct name.
-- Note: a NULL ID_name is ranked as a value of its own.
select tt.ID, max(tt.myRank)
from
(
    select
        ip.ID, ip.ID_name,
        dense_rank() over (partition by ip.ID order by ip.ID_name) as myRank
    from YourTable ip
) tt
where tt.myRank > 1
group by tt.ID