Remove duplicates before insert sql - sql

I have temporary table #tempRD and I am trying to insert the resultset into a table as follows:
-- Inserts every staged row from #tempRD into Routing, resolving RoutingKeyID by
-- matching RoutingKey.Name against rd.[Key]. Nothing here checks for rows that
-- already exist in Routing or for repeated keys inside #tempRD.
-- NOTE(review): `#ID` is presumably the T-SQL variable @ID — confirm.
insert into Routing (RoutingKeyID, LocationID, Data, ServiceID, CountryID)
select
rk.ID, rd.LocationID, rd.Data, rd.service, rd.CountryID
from
#tempRD rd
inner join
RoutingKey rk on rk.serviceID = #ID and rk.Name=rd.[Key]
Now when this happens I get duplicate key errors
Cannot insert duplicate key row in object 'dbo.Routing' with unique index 'UIX_Routing_RoutingKeyID_CountryID'. The duplicate key value is (51, 433)
How can I check if a row by routingkeyid and countryid already exists before I do an insert?
I have used a cursor to do this but it takes a long long time.

Try this?
-- Insert only the staged rows whose (RoutingKeyID, CountryID) pair is not yet
-- present in Routing. @ID is the service id variable (T-SQL variables start with @).
-- Note: this guards against rows already in Routing; it does not remove
-- duplicate pairs that occur more than once inside #tempRD itself.
INSERT INTO Routing (
    RoutingKeyID,
    LocationID,
    Data,
    ServiceID,
    CountryID
)
SELECT
    rk.ID,
    rd.LocationID,
    rd.Data,
    rd.[service],
    rd.CountryID
FROM #tempRD AS rd
INNER JOIN RoutingKey AS rk
    ON rk.serviceID = @ID
   AND rk.Name = rd.[Key]
WHERE NOT EXISTS (
    -- anti-join on the columns of the unique index
    SELECT 1
    FROM Routing AS r
    WHERE r.RoutingKeyID = rk.ID
      AND r.CountryID = rd.CountryID
);

-- Insert exactly one staged row per (RoutingKeyID, CountryID).
-- ROW_NUMBER() is used instead of RANK(): rows that tie on the ORDER BY columns
-- would all receive rank 1 and would still violate the unique index.
INSERT INTO Routing (RoutingKeyID, LocationID, Data, ServiceID, CountryID)
SELECT
    a.RoutingKeyID,
    a.LocationID,
    a.Data,
    a.ServiceID,
    a.CountryID          -- explicit list: the helper column rn must not reach the INSERT
FROM (
    SELECT
        rk.ID         AS RoutingKeyID,
        rd.LocationID AS LocationID,
        rd.Data       AS Data,
        rd.service    AS ServiceID,
        rd.CountryID  AS CountryID,
        ROW_NUMBER() OVER (
            PARTITION BY rk.ID, rd.CountryID
            ORDER BY rd.LocationID, rd.Data, rd.service
        ) AS rn
    FROM #tempRD AS rd
    INNER JOIN RoutingKey AS rk
        ON rk.serviceID = @ID      -- @ID: service id variable
       AND rk.Name = rd.[Key]
) AS a
WHERE a.rn = 1;
The ranking window function with over(partition by ... order by ...) numbers the records that share the same combination of rk.ID and rd.CountryID. The WHERE clause at the end then keeps only the first record of each combination.
You could also use SELECT DISTINCT if the other fields are duplicated as well. I didn't run the code, so there may be typos ;-)

Using the ROW_NUMBER window function you can find the duplicates. Note that I have ordered by id DESC so that the latest record among the duplicates is the one kept.
-- Number the staged rows per (RoutingKeyID, CountryID), newest first, and insert
-- only row number 1 of each pair.
;WITH cte AS (
    SELECT
        ROW_NUMBER() OVER (
            PARTITION BY rk.ID, rd.CountryID
            ORDER BY rd.id DESC        -- assumes #tempRD has an id column; TODO confirm
        ) AS rn,
        rk.ID         AS RoutingKeyID,
        rd.LocationID AS LocationID,
        rd.Data       AS Data,
        rd.service    AS ServiceID,
        rd.CountryID  AS CountryID
    FROM #tempRD AS rd
    -- RoutingKey must be joined here: rk.ID is not available from #tempRD alone
    INNER JOIN RoutingKey AS rk
        ON rk.serviceID = @ID
       AND rk.Name = rd.[Key]
)
INSERT INTO Routing (RoutingKeyID, LocationID, Data, ServiceID, CountryID)
SELECT
    RoutingKeyID,     -- CTE output names; the rk/rd aliases are not visible outside the CTE
    LocationID,
    Data,
    ServiceID,
    CountryID
FROM cte
WHERE rn = 1;

Can use something like
-- Insert one row (supplied through T-SQL variables) unless Routing already holds
-- a row with the same (RoutingKeyID, CountryID). Both columns are compared
-- because the unique index UIX_Routing_RoutingKeyID_CountryID covers both.
MERGE INTO Routing AS tgt
USING (
    SELECT @RoutingKeyID, @LocationID, @Data, @ServiceID, @CountryID
) AS source (RoutingKeyID, LocationID, Data, ServiceID, CountryID)
    ON tgt.RoutingKeyID = source.RoutingKeyID
   AND tgt.CountryID = source.CountryID
WHEN NOT MATCHED THEN
    INSERT (RoutingKeyID, LocationID, Data, ServiceID, CountryID)
    VALUES (source.RoutingKeyID, source.LocationID, source.Data, source.ServiceID, source.CountryID);  -- MERGE must end with a semicolon

Related

Select the newest record

I would like to run a select statement that returns only the newest record, by the Recored_timestamp field, for the keys teacher_id and student_id. So any time it runs, it needs to provide only one record. How could I do it, please? The output could omit the Recored_timestamp field. Thanks
Using the window function,partitioned by teacher_id and student_id and sorting it by recorded_timestamp will give you the desired result.
-- Newest row per (teacher_id, student_id): number the rows of each pair by
-- recorded_timestamp, newest first, and keep row number 1.
select
    teacher_id,
    student_id,
    teacher_name,
    comment,
    recorded_timestamp,
    rownum
from (
    select
        teacher_id,
        student_id,
        teacher_name,
        comment,
        recorded_timestamp,
        row_number() over (
            partition by teacher_id, student_id
            order by recorded_timestamp desc
        ) as rownum
    from temp0607
) out1
where rownum = 1
Also you may have to look at the way recorded_timestamp is stored. If it is stored as string, you can convert it into timestamp using from_unixtime(unix_timestamp(recorded_timestamp,'dd/MM/yyyy HH:mm'),'dd/MM/yyyy HH:mm')
First, arrange the record by datetime
-- Rank each teacher's rows newest-first; ranking = 1 marks the newest row(s).
-- Partition by teacher_id: the newest record is wanted per teacher.
SELECT
    *,
    RANK() OVER (PARTITION BY teacher_id ORDER BY Recored_timestamp DESC) AS ranking
FROM #temp
Then, if you want to know the newest record with student_id which is not null, then you can use OUTER APPLY to add a column which is non-NULL student_id.
-- Fragment (continues a FROM clause that exposes a row source named ranktable).
-- For each ranktable row, looks up the non-NULL student_id values stored in
-- #temp for the same teacher_id; one output row per matching #temp row.
-- OUTER APPLY keeps the ranktable row (with NULL student_id) when none match.
OUTER APPLY (SELECT student_id
FROM #temp
WHERE #temp.teacher_id = ranktable.teacher_id
AND student_id IS NOT NULL
) AS jointable
Here is an example:
-- Worked example: build a small fixture, show the newest row per teacher
-- together with that teacher's non-NULL student_id, then clean up.
CREATE TABLE #temp
(
    teacher_id        int,
    student_id        int,
    teacher_name      varchar(40),
    comment           varchar(100),
    Recored_timestamp datetime
);

INSERT INTO #temp (teacher_id, student_id, teacher_name, comment, Recored_timestamp)
VALUES
    (449, 111,  'lucy', 'Could be better',       '2021-05-04 07:00:00.000'),
    (449, null, 'lucy', 'smily',                 '2021-05-11 07:00:00.000'),
    (449, 111,  'lucy', 'not listening',         '2021-05-08 07:00:00.000'),
    (448, null, 'Toni', 'Good',                  '2021-06-04 09:00:00.000'),
    (448, 222,  'Toni', 'not doing as expected', '2021-06-04 08:00:00.000');

-- DISTINCT collapses the repeats produced when a teacher has several rows with
-- a non-NULL student_id (the APPLY returns one row per such record).
SELECT DISTINCT
    newest.teacher_id,
    known.student_id,
    newest.teacher_name,
    newest.comment,
    newest.Recored_timestamp,
    newest.ranking
FROM
(
    -- rank every teacher's rows, newest first
    SELECT
        teacher_id,
        student_id,
        teacher_name,
        comment,
        Recored_timestamp,
        RANK() OVER (PARTITION BY teacher_id ORDER BY Recored_timestamp DESC) AS ranking
    FROM #temp
) AS newest
OUTER APPLY
(
    -- non-NULL student ids recorded for the same teacher
    SELECT src.student_id
    FROM #temp AS src
    WHERE src.student_id IS NOT NULL
      AND src.teacher_id = newest.teacher_id
) AS known
WHERE newest.ranking = 1; --only newest record will be extracted

DROP TABLE #temp;
You can base your query on this one to get the newest data.
-- Return the single newest row. T2 holds each teacher's latest timestamp, so the
-- join keeps only rows that are the newest for their teacher; the ORDER BY then
-- makes TOP 1 pick the newest of those instead of an arbitrary row.
SELECT TOP 1 *
FROM tablename AS T1
INNER JOIN (
    SELECT teacher_id, MAX(Recored_timestamp) AS MaxDate
    FROM tablename
    GROUP BY teacher_id
) AS T2
    ON T2.teacher_id = T1.teacher_id
   AND T1.Recored_timestamp = T2.MaxDate
ORDER BY T1.Recored_timestamp DESC

How to use distinct when you select multiple column in SQL

I have used a simple inner join statement and put the result into a CTE. I want to select distinct 'ServiceId' values from the CTE. I have the following query:
-- DISTINCT applies to the whole select list; the parentheses around ServicesId
-- do not restrict it to that column, so rows are unique only as full tuples.
SELECT DISTINCT(ServicesId), ServiceNo, ServiceDate, DealerCode FROM CTE_Temp
Suppose there are duplicate entries of ServiceId in CTE then I want to select first entry only and ignore rest of them.
You can use ROW_NUMBER() OVER() for this. Just replace the column in the ORDER BY to define what's first.
-- One row per ServicesId: number each service's rows by ServiceDate, newest
-- first, and keep the row numbered 1.
;WITH AnotherCTE AS (
    SELECT
        ServicesId,
        ServiceNo,
        ServiceDate,
        DealerCode,
        ROW_NUMBER() OVER (PARTITION BY ServicesID ORDER BY ServiceDate DESC) AS RN
    FROM CTE_Temp
)
SELECT
    ServicesId,
    ServiceNo,
    ServiceDate,
    DealerCode
FROM AnotherCTE
WHERE RN = 1

How to find duplicate records in PostgreSQL

I have a PostgreSQL database table called "user_links" which currently allows the following duplicate fields:
year, user_id, sid, cid
The unique constraint is currently the first field called "id", however I am now looking to add a constraint to make sure the year, user_id, sid and cid are all unique but I cannot apply the constraint because duplicate values already exist which violate this constraint.
Is there a way to find all duplicates?
The basic idea will be using a nested query with count aggregation:
-- Rows whose sid occurs more than once: for each outer row, count the rows
-- sharing its sid and keep the row when that count exceeds one.
select *
from yourTable as ou
where 1 < (
    select count(*)
    from yourTable as inr
    where inr.sid = ou.sid
)
You can adjust the where clause in the inner query to narrow the search.
There is another good solution for that mentioned in the comments, (but not everyone reads them):
-- Each (Column1, Column2) combination that appears more than once, with its count.
select
    Column1,
    Column2,
    count(*)
from yourTable
group by
    Column1,
    Column2
having count(*) > 1
Or shorter:
-- Groups on the whole row (yourTable.*) and reports, as text, every row value
-- that occurs more than once together with its occurrence count.
SELECT (yourTable.*)::text, count(*)
FROM yourTable
GROUP BY yourTable.*
HAVING count(*) > 1
From "Find duplicate rows with PostgreSQL" here's smart solution:
-- Number the rows of every (column1, column2) group by ascending id; anything
-- numbered above 1 is a duplicate of the group's first row.
with dups as (
    select
        id,
        row_number() over (partition by column1, column2 order by id asc) as Row
    from tbl
)
select *
from dups
where dups.Row > 1
In order to make it easier I assume that you wish to apply a unique constraint only for column year and the primary key is a column named id.
In order to find duplicate values you should run,
-- Years that occur on more than one row, least frequent first.
SELECT
    year,
    COUNT(id)
FROM YOUR_TABLE
GROUP BY year
HAVING COUNT(id) > 1
ORDER BY COUNT(id);
Using the SQL statement above you get a table which contains all the duplicate years in your table. In order to delete all the duplicates except the latest duplicate entry, you should use the SQL statement below.
-- Delete every row A for which a row B with the same year and a higher id
-- exists, so only the highest-id row of each year remains.
DELETE FROM YOUR_TABLE AS A
USING YOUR_TABLE_AGAIN AS B
WHERE A.id < B.id
  AND A.year = B.year;
You can join the table to itself on the fields that would be duplicated and then anti-join on the id field. Select the id field from the first table alias (tn1) and then use the array_agg function on the id field of the second table alias. Finally, for the array_agg function to work properly, group the results by the tn1.id field. This produces a result set that contains the id of a record and an array of all the ids that fit the join conditions.
-- For every row that has at least one duplicate on (year, sid, user_id, cid),
-- list its id together with the ids of all its duplicates.
select
    tn1.id,
    array_agg(tn2.id) as duplicate_entries   -- no trailing comma before FROM
from table_name tn1
join table_name tn2
    on tn1.year = tn2.year
   and tn1.sid = tn2.sid
   and tn1.user_id = tn2.user_id
   and tn1.cid = tn2.cid
   and tn1.id <> tn2.id                      -- never pair a row with itself
group by tn1.id;
Obviously, id's that will be in the duplicate_entries array for one id, will also have their own entries in the result set. You will have to use this result set to decide which id you want to become the source of 'truth.' The one record that shouldn't get deleted. Maybe you could do something like this:
-- dupe_set: each duplicated row's id plus the ids of its duplicates.
-- The final select keeps the ids that have no smaller id among their
-- duplicates, i.e. the lowest id of every duplicate group.
with dupe_set as (
    select
        tn1.id,
        array_agg(tn2.id) as duplicate_entries   -- no trailing comma before FROM
    from table_name tn1
    join table_name tn2
        on tn1.year = tn2.year
       and tn1.sid = tn2.sid
       and tn1.user_id = tn2.user_id
       and tn1.cid = tn2.cid
       and tn1.id <> tn2.id
    group by tn1.id
    order by tn1.id asc
)
select ds.id
from dupe_set ds
where not exists (
    select de
    from unnest(ds.duplicate_entries) as de
    where de < ds.id
)
Selects the lowest number ID's that have duplicates (assuming the ID is increasing int PK). These would be the ID's that you would keep around.
Inspired by Sandro Wiggers, I did something similar to
-- Delete every user_links row except the lowest-id row of each
-- (year, user_id, sid, cid) group.
WITH ordered AS (
    -- rank rows inside each duplicate group by id; the first one is kept
    SELECT
        id, year, user_id, sid, cid,
        rank() OVER (PARTITION BY year, user_id, sid, cid ORDER BY id) AS rnk
    FROM user_links
),
to_delete AS (
    SELECT id
    FROM ordered
    WHERE rnk > 1
)
DELETE
FROM user_links
USING to_delete
WHERE user_links.id = to_delete.id;   -- table is user_links (plural)
If you want to test it, change it slightly:
-- Dry run: list the rows that rank after the first (by id) within their
-- (year, user_id, sid, cid) group.
WITH ordered AS (
    SELECT
        id,
        year,
        user_id,
        sid,
        cid,
        rank() OVER (
            PARTITION BY year, user_id, sid, cid
            ORDER BY id
        ) AS rnk
    FROM user_links
),
to_delete AS (
    SELECT
        id,
        year,
        user_id,
        sid,
        cid
    FROM ordered
    WHERE rnk > 1
)
SELECT *
FROM to_delete;
This will give an overview of what is going to be deleted (there is no problem to keep year,user_id,sid,cid in the to_delete query when running the deletion, but then they are not needed)
In your case, because of the constraint you need to delete the duplicated records.
Find the duplicated rows
Organize them by created_at date - in this case I'm keeping the oldest
Delete the records with USING to filter the right rows
-- Remove duplicated products, keeping the oldest row (by created_at) per id.
WITH duplicated AS (
    -- ids that occur more than once
    SELECT
        id,
        count(*)
    FROM products
    GROUP BY id
    HAVING count(*) > 1
),
ordered AS (
    -- rank the rows of each duplicated id, oldest first
    SELECT
        p.id,
        p.created_at,
        rank() OVER (PARTITION BY p.id ORDER BY p.created_at) AS rnk
    FROM products p                      -- alias must be p: every reference uses p
    JOIN duplicated d ON d.id = p.id
),
products_to_delete AS (
    SELECT
        id,
        created_at
    FROM ordered
    WHERE rnk > 1                        -- all later rows, not only the second one
)
-- Rows are matched on (id, created_at); rows tied on the oldest created_at all
-- have rank 1 and are therefore all kept.
DELETE
FROM products
USING products_to_delete
WHERE products.id = products_to_delete.id
  AND products.created_at = products_to_delete.created_at;
Following SQL syntax provides better performance while checking for duplicate rows.
-- Every id that occurs more than once, with its number of occurrences.
SELECT
    id,
    count(id)
FROM table1
GROUP BY id
HAVING count(id) > 1
-- Fixture: user_links with all-NULL duplicates, fully populated duplicates,
-- unique rows and partially NULL duplicates, loaded in one transaction.
begin;

create table user_links (
    id      serial,
    year    bigint,
    user_id bigint,
    sid     bigint,
    cid     bigint
);

insert into user_links (year, user_id, sid, cid)
values
    (null, null, null, null),
    (null, null, null, null),
    (null, null, null, null),
    (1, 2, 3, 4),
    (1, 2, 3, 4),
    (1, 2, 3, 4),
    (1, 1, 3, 8),
    (1, 1, 3, 9),
    (1, null, null, null),
    (1, null, null, null);

commit;
set operation with distinct and except.
-- All rows minus one representative per (year, user_id, sid, cid) group,
-- i.e. the surplus duplicates.
-- The DISTINCT ON query is parenthesised with its own ORDER BY so that the
-- representative is always the lowest id; an ORDER BY after the EXCEPT would
-- sort only the final result and leave the kept row unpredictable.
select id, year, user_id, sid, cid
from user_links
except
(
    select distinct on (year, user_id, sid, cid)
        id, year, user_id, sid, cid
    from user_links
    order by year, user_id, sid, cid, id
)
order by id;
EXCEPT ALL also works, since the serial id makes all rows unique.
-- Same result using EXCEPT ALL: all rows minus one representative per
-- (year, user_id, sid, cid) group.
-- The DISTINCT ON query carries its own ORDER BY so the representative is
-- always the lowest id; an ORDER BY after the set operation would only sort
-- the final output.
select id, year, user_id, sid, cid
from user_links
except all
(
    select distinct on (year, user_id, sid, cid)
        id, year, user_id, sid, cid
    from user_links
    order by year, user_id, sid, cid, id
)
order by id;
So far this works for both NULL and non-NULL values.
delete:
-- Delete the surplus duplicates of each (year, user_id, sid, cid) group and
-- return the deleted rows.
-- The DISTINCT ON query is ordered by the group columns and then id, so the
-- row that survives is always the lowest id of its group.
with a as (
    select id, year, user_id, sid, cid
    from user_links
    except all
    (
        select distinct on (year, user_id, sid, cid)
            id, year, user_id, sid, cid
        from user_links
        order by year, user_id, sid, cid, id
    )
)
delete from user_links
using a
where user_links.id = a.id
returning *;

select the latest result based on DateTime field

I have a simple table with only 4 fields.
http://sqlfiddle.com/#!3/06d7d/1
-- Fixture: two assessments for each of two people, on different dates.
CREATE TABLE Assessment (
    id           INTEGER IDENTITY(1,1) PRIMARY KEY,
    personId     INTEGER NOT NULL,
    dateTaken    DATETIME,
    outcomeLevel VARCHAR(2)
);

INSERT INTO Assessment (personId, dateTaken, outcomeLevel)
    VALUES (1, '2014-04-01', 'L1');

INSERT INTO Assessment (personId, dateTaken, outcomeLevel)
    VALUES (1, '2014-04-05', 'L2');

INSERT INTO Assessment (personId, dateTaken, outcomeLevel)
    VALUES (2, '2014-04-03', 'E3');

INSERT INTO Assessment (personId, dateTaken, outcomeLevel)
    VALUES (2, '2014-04-07', 'L1');
I am trying to select for each "personId" their latest assessment result based on the dateTaken.
So my desired output for the following data would be.
[personId, outcomeLevel]
[1, L2]
[2, L1]
Thanks,
Danny
Try this:
-- Latest assessment per person: compute each person's most recent dateTaken,
-- then keep the assessment rows that match both the person and that date.
;with cte as (
    select
        personId       as pid,
        max(dateTaken) as maxdate
    from assessment
    group by personId
)
select
    personId,
    outcomeLevel
from assessment as a
inner join cte as c
    on a.personId = c.pid
   and a.dateTaken = c.maxdate
order by a.personId
-- Latest assessment per person: number each person's rows by dateTaken, newest
-- first, and return the row numbered 1.
;WITH Cte AS (
    SELECT
        personId,
        outcomeLevel,
        ROW_NUMBER() OVER (PARTITION BY personId ORDER BY dateTaken DESC) AS C
    FROM Assessment          -- the table defined for this question
)
SELECT
    personId,
    outcomeLevel             -- helper column C is not part of the requested output
FROM Cte
WHERE C = 1
Sample here
-- Latest assessment per person: keep a row only when its dateTaken equals the
-- maximum dateTaken found for the same person.
SELECT
    asst.personId,
    asst.outcomeLevel
FROM dbo.Assessment AS asst
WHERE asst.dateTaken = (
    SELECT MAX(ast.dateTaken)
    FROM assessment AS ast
    WHERE ast.personId = asst.personid
)
ORDER BY asst.personId
Result will be like this
personId outcomeLevel
1 L2
2 L1
Here is a possible solution using common table expression:
-- Latest assessment per person: rn = 1 marks each person's most recent row.
WITH cte AS (
    SELECT
        ROW_NUMBER() OVER (PARTITION BY personId ORDER BY dateTaken DESC) AS rn,
        personId,
        outcomeLevel
    FROM [dbo].[Assessment]
)
SELECT
    personId,
    outcomeLevel
FROM cte
WHERE rn = 1
About CTEs
A common table expression (CTE) can be thought of as a temporary result set that is defined within the execution scope of a single SELECT, INSERT, UPDATE, DELETE, or CREATE VIEW statement. A CTE is similar to a derived table in that it is not stored as an object and lasts only for the duration of the query. Unlike a derived table, a CTE can be self-referencing and can be referenced multiple times in the same query. From MSDN: Using Common Table Expressions
try this:
-- Latest assessment per person: derive each person's most recent dateTaken and
-- join it back to Assessment.
SELECT
    a.personId,
    a.outcomeLevel
FROM Assessment a
INNER JOIN
(
    SELECT
        max(dateTaken) AS datetaken1,
        personId
    FROM Assessment
    GROUP BY personId
) b
    ON a.personId = b.personId      -- without this, a date shared with another person's maximum would match too
   AND a.dateTaken = b.datetaken1
demo: http://sqlfiddle.com/#!3/06d7d/9
Idea is to first derive a table with the max dates per person and then join that with the original table on the date field so you can get the outcome level for this maxed date...
This should work perfectly without cte :
-- Latest assessment per person: for every distinct personId, fetch that
-- person's single most recent row, then list results newest first.
SELECT
    [latest].[personId],
    [latest].[outcomeLevel]
FROM (
    SELECT DISTINCT [src].[personId]
    FROM [Assessment] AS [src]
) AS [people]
CROSS APPLY (
    SELECT TOP (1)
        [cand].[personId],
        [cand].[outcomeLevel],
        [cand].[dateTaken]
    FROM [Assessment] AS [cand]
    WHERE [cand].[personId] = [people].[personId]
    ORDER BY [cand].[dateTaken] DESC
) AS [latest]
ORDER BY [latest].[dateTaken] DESC

How do I delete duplicate rows in SQL Server using the OVER clause?

Here are the columns in my table:
Id
EmployeeId
IncidentRecordedById
DateOfIncident
Comments
TypeId
Description
IsAttenIncident
I would like to delete duplicate rows where EmployeeId, DateOfIncident, TypeId and Description are the same - just to clarify - I do want to keep one of them. I think I should be using the OVER clause with PARTITION, but I am not sure.
Thanks
If you want to keep one row of the duplicate-groups you can use ROW_NUMBER. In this example i keep the row with the lowest Id:
-- Number the rows of each (employeeid, dateofincident, typeid, description)
-- group by ascending Id and delete everything after the first, so the
-- lowest-Id row of every group survives.
WITH CTE AS
(
    SELECT
        ROW_NUMBER() OVER (
            PARTITION BY employeeid, dateofincident, typeid, description
            ORDER BY Id ASC
        ) AS rn,
        *
    FROM dbo.TableName
)
DELETE FROM CTE
WHERE rn > 1
use this query without using CTE....
-- Same idea without a CTE: delete through a derived table, removing every row
-- numbered above 1 within its (id, name, place) group.
delete d
from (
    select
        id,
        name,
        place,
        row_number() over (partition by id, name, place order by id) as rn
    from dup_table
) as d
where d.rn > 1
You can use the following query. This has an assumption that you want to keep the latest row and delete the other duplicates.
-- Keep the highest-ID row of each (EmployeeId, DateOfIncident, TypeId,
-- Description) group and delete every row whose ID is not one of those.
DELETE FROM [YourTable]
WHERE NOT EXISTS (
    SELECT 1
    FROM (
        SELECT MAX(ID) AS RowId
        FROM [YourTable]
        GROUP BY EmployeeId, DateOfIncident, TypeId, Description
    ) AS KeepRows
    WHERE KeepRows.RowId = [YourTable].ID
)