Identify duplicates in a group with one value being different

Identify duplicates in a group with one value being different - sql

USE TEMPDB;
go

-- Start from a clean slate so the script can be re-run in the same session.
IF OBJECT_ID(N'#TEST', N'U') IS NOT NULL
    DROP TABLE dbo.#TEST;

CREATE TABLE #TEST
(
    NAME  VARCHAR(50),
    line  INT,
    RANKS INT
);

-- Sample data; the trailing comment on each row is the expected outcome
-- of the clean-up (keep or delete).
INSERT INTO #TEST (name, line, RANKS)
VALUES
    ('Tom',  1, 1), --keep
    ('Tom',  2, 1), --keep
    ('Toms', 1, 0), --keep
    ('Toms', 2, 0), --keep
    ('Dave', 1, 0), --delete
    ('Dave', 2, 0), --keep
    ('Dave', 1, 1), --keep
    ('TIm',  1, 1), --keep
    ('TIm',  1, 0), --delete
    ('Matt', 1, 0), --delete
    ('Matt', 1, 1)  --keep
If the same name and line fall under different ranks, then I need to delete the one falling under rank '0'. In other words, if the same person with the same line appears under different ranks, the row with rank 0 should be deleted.

Something like this should work:
-- Remove every rank-0 row that has a sibling row (same NAME and line)
-- carrying a different rank.
DELETE doomed
FROM #TEST AS doomed
WHERE doomed.RANKS = 0
  AND EXISTS (
        SELECT 1
        FROM #TEST AS sibling
        WHERE sibling.NAME = doomed.NAME
          AND sibling.line = doomed.line
          AND sibling.RANKS <> doomed.RANKS
      );

Related

How to optimize a trigger?

-- After an INSERT on TABLE_2, increment the matching TABLE_1.bought_t by one,
-- or roll the transaction back when the counter is not below 100.
-- NOTE: the scalar assignment from "inserted" captures the name of only one
-- row, so a multi-row INSERT is not handled correctly by this version.
CREATE TRIGGER T
ON TABLE_2
AFTER INSERT
AS
    -- Local variables take the @ sigil ("#" denotes a temp table and does not parse here).
    DECLARE @bought_t int,
            @name_t varchar(20)

    -- Name carried by the inserted row.
    SELECT @name_t = name_t
    FROM inserted

    -- Current counter value for that name.
    SELECT @bought_t = bought_t
    FROM TABLE_1
    WHERE name_t = @name_t

    IF @bought_t < 100
    BEGIN
        UPDATE TABLE_1
        SET bought_t = @bought_t + 1
        WHERE TABLE_1.name_t = @name_t
    END
    ELSE
        -- Also reached when no TABLE_1 row matched, because @bought_t is then NULL.
        ROLLBACK TRANSACTION
The column in TABLE_1 that I update after an insert into TABLE_2 is supposed to hold values between 50 and 100. So I'm asking: is this trigger as professional and optimized as it could be, or does it have flaws that could lead to bugs or security issues?

Basically, you need to completely rewrite your trigger to be set-based and to be able to work with multiple rows in the Inserted pseudo table.
Fortunately, that also makes it easier - in my opinion - try something like this:
-- Set-based trigger: a single UPDATE covers every row of the Inserted
-- pseudo table and leaves counters that already reached 100 untouched.
CREATE TRIGGER T
    ON TABLE_2
    AFTER INSERT
AS
    UPDATE tgt
       SET bought_t = bought_t + 1
      FROM TABLE_1 AS tgt
     INNER JOIN Inserted AS ins
        ON ins.name_t = tgt.name_t
     WHERE tgt.bought_t < 100
UPDATE: demo to prove this works:
-- Demo schema: TABLE_2 receives the inserts, TABLE_1 holds a counter per product.
CREATE TABLE TABLE_2
(
    ID       INT NOT NULL IDENTITY(1,1),
    ProdName VARCHAR(50)
)
CREATE TABLE TABLE_1
(
    ProdName VARCHAR(50),
    Bought   INT
)
GO
-- Trigger on TABLE_2 that increments the matching TABLE_1 counter while it is below 100.
CREATE TRIGGER T2Insert
    ON TABLE_2
    AFTER INSERT
AS
    UPDATE T1
       SET Bought = Bought + 1
      FROM TABLE_1 AS T1
     INNER JOIN Inserted AS i
        ON T1.ProdName = i.ProdName
     WHERE T1.Bought < 100
GO
-- Seed TABLE_1, including one product that already sits at the limit.
INSERT INTO dbo.TABLE_1 (ProdName, Bought)
VALUES
    ('Prod1', 0),
    ('Prod2', 20),
    ('Prod3', 40),
    ('Prod4', 40),
    ('Prod100', 100)
-- Fire the trigger with a multi-row insert.
INSERT INTO dbo.TABLE_2 (ProdName)
VALUES
    ('Prod1'),
    ('Prod100'),
    ('Prod2'),
    ('Prod4')
-- Inspect the counters afterwards.
SELECT * FROM dbo.TABLE_1
This renders output:
As you can easily see:
Prod1, Prod2, Prod4 that were inserted caused an update of the value Bought
Prod100 which was also inserted did not cause an update of Bought
UPDATE #2: if you need to be able to insert multiple identical values at once, you need to slightly enhance your trigger like this:
-- Variant of T2Insert for INSERTs that contain the same ProdName several
-- times: each product's counter is raised by the number of rows inserted for it.
CREATE TRIGGER T2Insert
ON TABLE_2
AFTER INSERT
AS
    -- Table variable (note the @ sigil) holding one row per inserted name
    -- together with how many times it was inserted.
    DECLARE @UpdateCount TABLE (Name VARCHAR(50), UpdCount INT)

    -- Collapse the "Inserted" pseudo table to one row per name.
    INSERT INTO @UpdateCount (Name, UpdCount)
    SELECT ProdName, COUNT(*)
    FROM Inserted
    GROUP BY ProdName

    -- Join to the table variable and add the per-name count (instead of +1).
    -- NOTE(review): the filter checks only the value before the update, so
    -- Bought + UpdCount can end above 100 -- confirm whether a hard cap is required.
    UPDATE T1
    SET Bought = Bought + uc.UpdCount
    FROM TABLE_1 T1
    INNER JOIN @UpdateCount uc ON uc.Name = T1.ProdName
    WHERE T1.Bought < 100
GO

Merge search multiple condition - SQL Server

I am trying to understand the merge search condition and have come across the following problem.
Table1
id groupid description
-------------------------
1 10 Good
2 20 Better
Table2
id groupid description
-------------------------
1 10 Very Good
1 20 Much Better
I intend to merge the source (table1) into the target (table2), matching on the id present in both, but only for target rows where groupid = 20.
Here is what I am writing
-- Attempted MERGE from the question.
-- NOTE: as written, table1 is the MERGE target (aliased "source") and table2 is
-- the USING row source (aliased "target"), the reverse of the stated intent.
-- NOTE: T-SQL requires a MERGE statement to be terminated with a semicolon.
Merge table1 source
Using table2 target ON (target.id = source.id AND target.groupid = 20)
When Matched
Then update
set target.description = source.description
The output I am expecting is
Table2
id groupid description
-------------------------
1 10 Very Good
1 20 Good
But I am not 100% sure of the ON clause (merge search condition) with multiple conditions of checking target.id = source.id and target.groupid = 20. Is the result always predictable and matching the expectation above in these multiple conditions ? Or is predictability a question here and should I be adding target.groupId = 20 in the "when matched AND" condition ?

It looks like your join is wrong. You are either needing to join on the GROUPID or your data is incorrect.
JOINING ON GROUP
-- Scratch copies of the two tables from the question.
CREATE TABLE #table1 (id INT, groupid INT, description VARCHAR(64))
CREATE TABLE #table2 (id INT, groupid INT, description VARCHAR(64))

INSERT INTO #table1 (id, groupid, description)
VALUES (1, 10, 'Good'),
       (2, 20, 'Better')

INSERT INTO #table2 (id, groupid, description)
VALUES (1, 10, 'Very Good'),
       (1, 20, 'Much Better')

-- Pair rows on groupid, restricted to group 20, and copy the description across.
MERGE #table2 AS t
USING #table1 AS s
   ON (t.groupid = s.groupid AND t.groupid = 20)
WHEN MATCHED THEN
    UPDATE SET t.description = s.description;

SELECT * FROM #table2

DROP TABLE #table2
DROP TABLE #table1
Otherwise, there isn't any way to correlate "better" from ID = 2 to a row where ID = 1. This goes against your original join condition on the ID column.
BASED OFF EDITED EXPECTED OUTPUT
-- Scratch copies of the two tables from the question.
CREATE TABLE #table1 (id INT, groupid INT, description VARCHAR(64))
CREATE TABLE #table2 (id INT, groupid INT, description VARCHAR(64))

INSERT INTO #table1 (id, groupid, description)
VALUES (1, 10, 'Good'),
       (2, 20, 'Better')

INSERT INTO #table2 (id, groupid, description)
VALUES (1, 10, 'Very Good'),
       (1, 20, 'Much Better')

-- Pair rows on id; only target rows in group 20 receive the new description.
MERGE #table2 AS t
USING #table1 AS s
   ON (t.id = s.id) --you could also put the and t.groupid = 20 here...
WHEN MATCHED AND t.groupid = 20 THEN
    UPDATE SET t.description = s.description;

SELECT * FROM #table2

DROP TABLE #table2
DROP TABLE #table1

SQL query takes more than an hour to execute for 200k rows

I have two tables each with around 200,000 rows. I have run the query below and it still hasn't completed after running for more than an hour. What could be the explanation for this?
-- Pair rows of [new] and [old] that agree on all four key columns
-- and report those whose Value differs.
SELECT
    n.[colom1],
    n.[colom2],
    n.[colom3],
    n.[colom4],
    n.[Value] AS 'nieuwe Value',
    o.[Value] AS 'oude Value'
FROM dbo.[new] AS n
INNER JOIN dbo.[old] AS o
    ON  n.[colom1] = o.[colom1]
    AND n.[colom2] = o.[colom2]
    AND n.[colom3] = o.[colom3]
    AND n.[colom4] = o.[colom4]
WHERE n.[Value] <> o.[Value]
from comment;

It seems that for an equality join on a single column, the rows with NULL value in the join key are being filtered out, but this is not the case for joins on multiple columns.
As a result, the hash join complexity is changed from O(N) to O(N^2).
======================================================================
In that context I would like to recommend a great article written by Paul White on similar issues -
Hash Joins on Nullable Columns
======================================================================
I have generated a small simulation of this use-case and I encourage you to test your solutions.
-- Two tables whose join keys are both nullable.
CREATE TABLE mytab1 (c1 INT NULL, c2 INT NULL)
CREATE TABLE mytab2 (c1 INT NULL, c2 INT NULL)

-- t yields 1..10; crossing it with itself five times produces 100,000 all-NULL rows.
;WITH t(n) AS (
    SELECT 1
    UNION ALL
    SELECT n + 1 FROM t WHERE n < 10
)
INSERT INTO mytab1
SELECT NULL, NULL
FROM t AS t0
CROSS JOIN t AS t1
CROSS JOIN t AS t2
CROSS JOIN t AS t3
CROSS JOIN t AS t4

-- Mirror the NULL rows into the second table.
INSERT INTO mytab2
SELECT NULL, NULL
FROM mytab1

-- One genuinely matching key pair on each side.
INSERT INTO mytab1 VALUES (111, 222);
INSERT INTO mytab2 VALUES (111, 222);

-- Two-column equality join under test.
SELECT *
FROM mytab1 AS t1
INNER JOIN mytab2 AS t2
    ON  t1.c1 = t2.c1
    AND t1.c2 = t2.c2
For the OP query we should remove rows with NULL values in any of the join key columns.
-- Same comparison as the original query, with explicit NOT NULL filters on
-- every join-key column of both tables.
SELECT
    n.[colom1],
    n.[colom2],
    n.[colom3],
    n.[colom4],
    n.[Value] AS 'nieuwe Value',
    o.[Value] AS 'oude Value'
FROM dbo.[new] AS n
INNER JOIN dbo.[old] AS o
    ON  n.[colom1] = o.[colom1]
    AND n.[colom2] = o.[colom2]
    AND n.[colom3] = o.[colom3]
    AND n.[colom4] = o.[colom4]
WHERE n.[Value] <> o.[Value]
  AND n.[colom1] IS NOT NULL
  AND n.[colom2] IS NOT NULL
  AND n.[colom3] IS NOT NULL
  AND n.[colom4] IS NOT NULL
  AND o.[colom1] IS NOT NULL
  AND o.[colom2] IS NOT NULL
  AND o.[colom3] IS NOT NULL
  AND o.[colom4] IS NOT NULL

Using EXCEPT join, you only have to make the larger HASH join on those values that have changed, so much faster:
/*
create table [new] ( colom1 int, colom2 int, colom3 int, colom4 int, [value] int)
create table [old] ( colom1 int, colom2 int, colom3 int, colom4 int, [value] int)
insert old values (1,2,3,4,10)
insert old values (1,2,3,5,10)
insert old values (1,2,3,6,10)
insert old values (1,2,3,7,10)
insert old values (1,2,3,8,10)
insert old values (1,2,3,9,10)
insert new values (1,2,3,4,11)
insert new values (1,2,3,5,10)
insert new values (1,2,3,6,11)
insert new values (1,2,3,7,10)
insert new values (1,2,3,8,10)
insert new values (1,2,3,9,11)
*/
-- "changed" holds the rows of new that have no identical row in old;
-- joining to it restricts the new/old pairing to those keys.
SELECT
    n.colom1,
    n.colom2,
    n.colom3,
    n.colom4,
    n.[value] AS newvalue,
    o.value   AS oldvalue
FROM new AS n
INNER JOIN [old] AS o
    ON  n.colom1 = o.colom1
    AND n.colom2 = o.colom2
    AND n.colom3 = o.colom3
    AND n.colom4 = o.colom4
INNER JOIN
(
    SELECT colom1, colom2, colom3, colom4, [value] FROM new
    EXCEPT
    SELECT colom1, colom2, colom3, colom4, [value] FROM old
) AS changed
    ON  n.colom1 = changed.colom1
    AND n.colom2 = changed.colom2
    AND n.colom3 = changed.colom3
    AND n.colom4 = changed.colom4

Select records with order of IN clause

I have
SELECT * FROM Table1 WHERE Col1 IN(4,2,6)
I want to select and return the records with the specified order which i indicate in the IN clause
(first display record with Col1=4, Col1=2, ...)
I can use
SELECT * FROM Table1 WHERE Col1 = 4
UNION ALL
SELECT * FROM Table1 WHERE Col1 = 6 , .....
but I don't want to use that, cause I want to use it as a stored procedure and not auto generated.

I know it's a bit late but the best way would be
-- Return the rows in the order the values are listed (4, then 2, then 6).
-- Both the search key and the list are wrapped in commas so a value can only
-- match a whole list entry (for example, 2 cannot match inside 12 or 26).
-- Assumes Col1 is an integer column -- TODO confirm; VARCHAR(12) holds any INT.
SELECT *
FROM Table1
WHERE Col1 IN (4, 2, 6)
ORDER BY CHARINDEX(',' + CAST(Col1 AS VARCHAR(12)) + ',', ',4,2,6,')
Or
-- Same ordering, with the computed list position exposed as column s_order.
-- The search key and the list are comma-delimited so only whole entries match.
-- Assumes Col1 is an integer column -- TODO confirm; VARCHAR(12) holds any INT.
SELECT CHARINDEX(',' + CAST(Col1 AS VARCHAR(12)) + ',', ',4,2,6,') AS s_order,
       *
FROM Table1
WHERE Col1 IN (4, 2, 6)
ORDER BY s_order

You have a couple of options. Simplest may be to put the IN parameters (they are parameters, right) in a separate table in the order you receive them, and ORDER BY that table.

The solution is along this line:
-- Map each requested value to its position in the desired output order.
SELECT *
FROM Table1
WHERE Col1 IN (4, 2, 6)
ORDER BY
    CASE
        WHEN Col1 = 4 THEN 1
        WHEN Col1 = 2 THEN 2
        WHEN Col1 = 6 THEN 3
    END

-- Build an empty two-column temp table: [in] is the wanted value,
-- [order] is its position in the output.
select top 0 0 'in', 0 'order' into #i
insert into #i values(4,1)
insert into #i values(2,2)
insert into #i values(6,3)
-- Join Table1 to the lookup on the value (#i.[in] = Table1.col1) and sort by
-- the stored position; the join also restricts the result to the listed values.
select t.*
from Table1 t
inner join #i i on i.[in] = t.[col1]
order by i.[order]

Replace the IN values with a table that includes a column for the sort order to be used in the query (and be sure to expose the sort order to the calling application):
-- OtherTable pairs every wanted Col1 value with its position in the output.
WITH OtherTable (Col1, sort_seq) AS
(
    SELECT v.Col1, v.sort_seq
    FROM (VALUES (4, 1), (2, 2), (6, 3)) AS v (Col1, sort_seq)
)
SELECT T1.Col1, O1.sort_seq
FROM Table1 AS T1
INNER JOIN OtherTable AS O1
    ON O1.Col1 = T1.Col1
ORDER BY sort_seq;
In your stored proc, rather than a CTE, split the values into a table (a scratch base table, temp table, function that returns a table, etc.) with the sort column populated as appropriate.

I have found another solution. It's similar to the answer from onedaywhen, but it's a little shorter.
-- Each wanted value carries an explicit ordinal (seq). Row order is only
-- guaranteed by ORDER BY, so the listing order of VALUES cannot be relied on.
SELECT sort.n, Table1.Col1
FROM (VALUES (4, 1), (2, 2), (6, 3)) AS sort(n, seq)
JOIN Table1
ON Table1.Col1 = sort.n
ORDER BY sort.seq

I am thinking about this problem two different ways because I can't decide if this is a programming problem or a data architecture problem. Check out the code below incorporating "famous" TV animals. Let's say that we are tracking dolphins, horses, bears, dogs and orangutans. We want to return only the horses, bears, and dogs in our query and we want bears to sort ahead of horses to sort ahead of dogs. I have a personal preference to look at this as an architecture problem, but can wrap my head around looking at it as a programming problem. Let me know if you have questions.
-- Lookup of animal types; SortOrder is the display priority of each type.
CREATE TABLE #AnimalType
(
    AnimalTypeId INT         NOT NULL PRIMARY KEY,
    AnimalType   VARCHAR(50) NOT NULL,
    SortOrder    INT         NOT NULL
)

INSERT INTO #AnimalType (AnimalTypeId, AnimalType, SortOrder)
VALUES
    (1, 'Dolphin',   5),
    (2, 'Horse',     2),
    (3, 'Bear',      1),
    (4, 'Dog',       4),
    (5, 'Orangutan', 3)

-- Individual animals, each pointing at its type.
CREATE TABLE #Actor
(
    ActorId      INT         NOT NULL PRIMARY KEY,
    ActorName    VARCHAR(50) NOT NULL,
    AnimalTypeId INT         NOT NULL
)

INSERT INTO #Actor (ActorId, ActorName, AnimalTypeId)
VALUES
    (1, 'Benji',       4),
    (2, 'Lassie',      4),
    (3, 'Rin Tin Tin', 4),
    (4, 'Gentle Ben',  3),
    (5, 'Trigger',     2),
    (6, 'Flipper',     1),
    (7, 'CJ',          5),
    (8, 'Mr. Ed',      2),
    (9, 'Tiger',       4)

/* Programming approach: the sort order is spelled out in the query itself */
SELECT *
FROM #Actor AS a
WHERE a.AnimalTypeId IN (2, 3, 4)
ORDER BY
    CASE a.AnimalTypeId
        WHEN 3 THEN 1
        WHEN 2 THEN 2
        WHEN 4 THEN 3
    END

/* Data-architecture approach: the sort order is read from the lookup table */
SELECT *
FROM #Actor AS a
INNER JOIN #AnimalType AS ty
    ON a.AnimalTypeId = ty.AnimalTypeId
WHERE a.AnimalTypeId IN (2, 3, 4)
ORDER BY ty.SortOrder

DROP TABLE #Actor
DROP TABLE #AnimalType

-- Custom sort clause: orders rows by the position at which ",<status>," first
-- occurs in the comma-delimited list of statuses.
-- NOTE(review): CHARINDEX returns 0 when the value is not in the list, so such rows sort first.
ORDER BY CHARINDEX(','+convert(varchar,status)+',' ,
',rejected,active,submitted,approved,')
Put a comma before and after the string being searched (the second parameter of CHARINDEX).
The first parameter of CHARINDEX, the value being looked up, is surrounded by commas in the same way, so that only whole entries match.

Use APPLY with an UPDATE statement as the row source

I tried the following in order to update a pseudo-identity value at the same time as using that value to create new rows, but APPLY does not accept an UPDATE statement as its right-hand table source. What's the most elegant alternative, short of simply using an identity column?
-- Source rows; id is generated by an identity column.
create table Temp1(
id int not null identity(1,1) primary key
,data nvarchar(max) null)
-- Target rows; id is supplied by the caller and fkTemp1 points back at Temp1.
create table Temp2(
id int not null primary key
,fkTemp1 int not null references Temp1(id)
,data nvarchar(max) null)
-- Hand-rolled key counter: one AutoKey value per ObjectCode.
create table Numbering(
ObjectCode int not null primary key
,AutoKey int)
insert into Temp1(data) values('test string')
insert into Temp1(data) values('another test string')
insert into Numbering(ObjectCode, AutoKey) values(4, 1)
-- Attempted statement: take a fresh AutoKey for each Temp1 row lacking a Temp2 row.
-- NOTE: this is the statement the question reports as rejected, because the
-- right side of CROSS APPLY is an UPDATE ... OUTPUT.
insert into Temp2(id, fkTemp1, data)
select n.AutoKey, t1.id, t1.data
from Temp1 t1
left join Temp2 t2 on t2.fkTemp1 = t1.id
cross apply (update Numbering set AutoKey = AutoKey + 1 output inserted.AutoKey where ObjectCode = 4) n
where t2.id is null -- only insert where a target row does not already exist

You cannot do an INSERT and UPDATE on two different tables in one statement in SQL Server 2005.
SQL Server 2008 has the MERGE construct; however, it works on only a single target table.
Just run two statements in a transaction:
-- Allocate keys for every Temp1 row that has no Temp2 row yet, then advance
-- the counter by the number of rows inserted, all in one transaction.
BEGIN TRANSACTION
    DECLARE @AutoKey INT

    -- Read the current key under an update lock, which is held until the
    -- transaction completes.
    SELECT @AutoKey = AutoKey
    FROM Numbering WITH (UPDLOCK)
    WHERE ObjectCode = 4

    -- New ids continue after the current key: @AutoKey + 1, @AutoKey + 2, ...
    INSERT INTO temp2 (id, fkTemp1, data)
    SELECT @AutoKey + ROW_NUMBER() OVER (ORDER BY id), id, data
    FROM temp1
    WHERE id NOT IN
          (
              SELECT fkTemp1
              FROM temp2
          )

    -- @@ROWCOUNT is the number of rows just inserted; it must be read by the
    -- statement that directly follows the INSERT.
    UPDATE Numbering
    SET AutoKey = AutoKey + @@ROWCOUNT
    WHERE ObjectCode = 4
COMMIT
Update:
As @Remus Rusanu pointed out, you actually can pipeline the UPDATE output clause into a table in SQL Server 2005.
However, it seems you can neither JOIN nor CROSS APPLY the OUTPUT resultset to the result of other queries.

This will do it, but you'll have to fix the "T2.ID IS NULL" problem...
-- Reserve a block of keys in Numbering, then insert the Temp1 rows into Temp2
-- using ids taken from that block.
Declare @Key as int
Declare @cnt as int
Begin Transaction
    -- Number of rows that will be inserted, and therefore keys to reserve.
    Set @cnt = (Select count(*)
                from Temp1 t1 left join Temp2 t2 on t2.fkTemp1 = t1.id
                --where t2.id is null -- note: does not work, not sure what is intended
               )
    -- @Key receives the AutoKey value from BEFORE the update, while AutoKey
    -- advances by @cnt. The ids generated below (@Key + 1 .. @Key + @cnt) are
    -- therefore exactly the reserved block, and AutoKey ends on the last id used.
    update Numbering
    set @Key = AutoKey,
        AutoKey = AutoKey + @cnt
    where ObjectCode = 4
    insert into Temp2(id, fkTemp1, data)
    select @Key+ROW_NUMBER() over (Order By t1.id)
         , t1.id, t1.data
    from Temp1 t1
    left join Temp2 t2 on t2.fkTemp1 = t1.id
    --where t2.id is null -- note: does not work,
Commit Transaction

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

Identify duplicates in a group with one value being different - sql

Something like this should work: delete t1 from #TEST t1 where exists(select * from #TEST t2 where t2.NAME = t1.NAME and t2.line = t1.line and t2.RANKS <> t1.RANKS) and t1.RANKS = 0;

Related

How to optimize a trigger?

Merge search multiple condition - SQL Server

SQL query takes more than an hour to execute for 200k rows

Select records with order of IN clause

Use APPLY with an UPDATE statement as the row source

Categories

Resources