Duplicate in same row in different columns - sql

I have a table like this:
CREATE TABLE #my_table (
intID int IDENTITY (1, 1),
num_1 varchar(100) NOT NULL,
num_2 varchar(100) NOT NULL,
num_3 varchar(100) NOT NULL,
num_4 varchar(100),
num_5 varchar(100),
isDuplicate char(1) DEFAULT 'N'
)
INSERT INTO #my_table (num_1, num_2, num_3, num_4, num_5)
VALUES ('a', 'b', 'c', 'd', 'e')
INSERT INTO #my_table (num_1, num_2, num_3, num_4, num_5)
VALUES ('a', 'b', 'c', 'd', 'e')
INSERT INTO #my_table (num_1, num_2, num_3, num_4, num_5)
VALUES ('a', 'b', 'c', 'd', 'e')
INSERT INTO #my_table (num_1, num_2, num_3, num_4, num_5)
VALUES ('a', 'b', 'a', 'd', 'e')
INSERT INTO #my_table (num_1, num_2, num_3, num_4, num_5)
VALUES ('a', 'b', 'a', 'd', 'e')
INSERT INTO #my_table (num_1, num_2, num_3, num_4, num_5)
VALUES ('a', 'b', 'c', 'd', 'c')
I need to find duplicate values across the columns of a row and get the row numbers that contain such duplicates.
My result should be:
the last 3 rows contain duplicates, and their isDuplicate flag should be updated to 'Y'.

This could also do the trick:
UPDATE #my_table
SET isDuplicate = 'Y'
WHERE intID IN (
SELECT intID FROM #my_table
WHERE EXISTS
(SELECT 1
FROM (VALUES
(num_1)
,(num_2)
,(num_3)
,(num_4)
,(num_5)) AS X (n)
WHERE NULLIF(n, '') IS NOT NULL
GROUP BY n
HAVING COUNT(*)>1
)
)
You can find more information about table value constructors here.
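As a quick illustration of the same idea, the table value constructor can also be used with CROSS APPLY to unpivot each row's columns and see which values repeat within a row. This is only a sketch, assuming the #my_table sample from the question:
-- Sketch: unpivot each row's columns and list values that repeat within the row.
SELECT t.intID, x.n AS repeated_value, COUNT(*) AS occurrences
FROM #my_table AS t
CROSS APPLY (VALUES (t.num_1), (t.num_2), (t.num_3), (t.num_4), (t.num_5)) AS x(n)
WHERE NULLIF(x.n, '') IS NOT NULL
GROUP BY t.intID, x.n
HAVING COUNT(*) > 1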

This should do the trick:
select num_1, num_2, num_3, count(*)
from #my_table
group by num_1, num_2, num_3
having count(*) > 1
Regards

This would set the isDuplicate column in the table to 'Y' if the row is a duplicate; you can build your query from that.
UPDATE #my_table
SET isDuplicate = 'Y'
WHERE intID IN
(
SELECT intID
FROM
(
SELECT intID, num_1, num_2, num_3,num_4, num_5,
RANK() OVER(PARTITION BY num_1, num_2, num_3, num_4, num_5 ORDER BY intID ASC) AS [rank]
FROM #my_table
) a
WHERE [rank] > 1
);
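If you want to inspect which rows the UPDATE would flag before running it, the inner ranked query can be run on its own. A minimal sketch, assuming the same #my_table:
-- Rows with [rank] > 1 are the ones the UPDATE above would mark as duplicates.
SELECT intID, num_1, num_2, num_3, num_4, num_5,
RANK() OVER (PARTITION BY num_1, num_2, num_3, num_4, num_5 ORDER BY intID ASC) AS [rank]
FROM #my_table
ORDER BY intID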

Try this:
UPDATE #my_table
SET isDuplicate =
CASE
WHEN
(select count(*)
from #my_table t2
where t2.intID <> #my_table.intID
and t2.num_1 = #my_table.num_1
and t2.num_2 = #my_table.num_2
and t2.num_3 = #my_table.num_3
and t2.num_4 = #my_table.num_4
and t2.num_5 = #my_table.num_5
) > 0 then 'Y'
ELSE 'N'
END

Related

Find duplicate sets of data grouped by foreign key

How can I check whether the table below contains duplicate groups of rows based on id? For example, here the first two rows of id 1 match the next two rows of id 2, but id 2 also has a third row that does not match any row of id 1, so it is not a duplicate. There can be any number of ids.
I tried to do it with GROUP BY and STRING_AGG, but it didn't work.
Here is what I tried:
declare @t2 Table( m1 int, m2 int,n varchar(50),n2 varchar(50), id int)
insert into @t2 values(3,1,'c','',1),(2,1,'s','o',1),(2,1,'s','o',2),(3,1,'c','',2),(3,1,'f','',2)
if exists( SELECT *
FROM @t2
GROUP BY m1,m2,n,n2
HAVING COUNT(*) > 1)
begin
select 'Same.'
end
else
begin
select 'not found'
end
Any help here will be great.
Thanks
Thanks Iptr. As per your solution in the comments, I am posting the same here:
declare @t2 table(m1 int, m2 int, n varchar(5), n2 varchar(5), id int);
insert into @t2(m1, m2, n, n2, id)
values
(3, 1, 'c', '', 1),
(2, 1, 's', 'o', 1),
(2, 1, 's', 'o', 2),
(3, 1, 'c', '', 2),
(3, 1, 'f', '', 2),
(3, 1, 'c', '', 4),
(2, 1, 's', 'o', 4),
(3, 1, 'c', '', 10),
(2, 1, 's', 'o', 10),
(3, 1, 'c', '', 5);
--if exists(select a.id from(.. having count(*) = a.idcnt)
select a.id, b.id
from
(
select *, count(*) over (partition by id) as idcnt
from @t2
) as a
join
(
select *, count(*) over (partition by id) as idcnt
from @t2
) as b on a.id </*>*/ b.id and a.m1 = b.m1 and a.m2 = b.m2 and a.n = b.n and a.n2 = b.n2 and a.idcnt = b.idcnt
group by a.id, b.id, a.idcnt
having count(*) = a.idcnt;
--if exists(select j.j from (.. having count(*) > 1;)
select string_agg(i.id, ',')
from
(
select distinct id
from @t2
) as i
cross apply
(
select r.m1, r.m2, r.n, r.n2
from @t2 as r
where r.id = i.id
order by r.m1, r.m2, r.n, r.n2
for json path
) as j(j)
group by j.j
having count(*) > 1;
You can count how many different ids there are for each set of values. If the count is more than one, then there are duplicates. For example:
select m1, m2, n, n2, count(distinct id) as cnt
from t
group by m1, m2, n, n2
having count(distinct id) > 1
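Applied to the @t2 sample data above, the same query would look like this (a sketch; simply substitute the table variable for t):
select m1, m2, n, n2, count(distinct id) as cnt
from @t2
group by m1, m2, n, n2
having count(distinct id) > 1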

T-SQL how to filter by multiple criteria but prioritize what is returned?

I have the data below.
create table #results (
id int
, result_attr char(1)
, result varchar(100)
)
insert into
#results (id, result_attr, result)
values
(1, 'E', '***ERROR')
, (2, 'E', '***CORRECTED')
, (3, 'E', '***RESULTED')
, (4, 'E', '***AMENDED')
, (4, 'E', 'FOO')
, (5, 'E', 'ERROR***')
, (5, 'E', 'CORPOREAL')
, (6, 'E', '***CORRECTED')
, (7, 'E', '***RESULTED')
, (7, 'E', 'ABUNDANT')
, (7, 'E', 'PLENTITUDE')
, (8, 'E', 'INCORRECT')
, (9, 'A', 'HIGH')
, (10, 'A', 'LOW')
select *
from #results
drop table #results
The complete result set is:
My desired result set is:
This doesn't quite work:
select
res.id
, res.result_attr
, res.result
from #results as res
where
(charindex('***', res.result) > 0 or res.result_attr = 'E')
The tricky part is that I want to exclude ID #4 with result "FOO", as well as ID #5 with result "CORPOREAL" and ID #7 with results "ABUNDANT" and "PLENTITUDE", but I want to keep ID #8 with result "INCORRECT". All in all, I want to exclude the following:
I've tried some windowing functions and other things, but am a bit stuck on this one. I would appreciate any assistance!
You can use the row_number analytical function with conditional ordering as follows:
Select * from
(select
res.id
, res.result_attr
, res.result
, row_number() over (partition by res.id
order by case when charindex('***', res.result) > 0
then 1 else 2 end) as rn
from #results as res
where
(charindex('***', res.result) > 0 or res.result_attr = 'E')
) t
Where rn = 1
Order by id
Using:
select
res.id
, res.result_attr
, res.result
from #results as res
where res.result LIKE '%***%' OR res.result = 'INCORRECT';
db<>fiddle demo

Remove duplicates by multiple column criteria

I have the following table:
CREATE TABLE Test (
ID INT NOT NULL IDENTITY(1,1) PRIMARY KEY,
FIRST VARCHAR(10) NOT NULL,
SECOND VARCHAR(10) NOT NULL
)
The table is filled with some duplicate data. The TestTarget table has the same structure and is filled using the following procedural algorithm:
DECLARE @first varchar(10), @second varchar(10)
DECLARE c CURSOR FAST_FORWARD
FOR
SELECT first, second FROM Test ORDER BY id
OPEN c
FETCH NEXT FROM c INTO @first, @second
WHILE @@fetch_status = 0
BEGIN
IF NOT EXISTS(SELECT 1 FROM TestTarget WHERE first=@first OR second=@second)
INSERT INTO TestTarget (first, second) VALUES(@first, @second)
FETCH NEXT FROM c INTO @first, @second
END
CLOSE c
DEALLOCATE c
Briefly: before each insert we check whether the target table already contains the same 'first' OR 'second' value.
Example:
Source table
ID FIRST SECOND
1 A 2
2 A 1
3 A 3
4 B 2
5 B 1
6 B 3
7 B 2
8 B 4
9 C 2
10 C 3
INSERT INTO Test (first, second)
VALUES ('A', '2'),
('A', '1'),
('A', '3'),
('B', '2'),
('B', '1'),
('B', '3'),
('B', '2'),
('B', '4'),
('C', '2'),
('C', '3')
Target table
ID FIRST SECOND
1 A 2
5 B 1
10 C 3
The real source table has x*100k rows and at least 2 rows for the same 'first' or 'second' value.
I'm looking for a set-based solution if that is at all possible, or at least something faster than this loop, because it takes hours in my real case.
NOTE: Classic duplicate removal via partition/join/etc. does not apply here because it would produce different results, even with a different final number of rows.
INSERT INTO TestTarget (first, second)
SELECT first,second
FROM Test t
WHERE NOT EXISTS
(
SELECT 1
FROM Test t2
WHERE t2.id>t.id and (t2.first=t.first or t2.second=t.second)
)
I cannot think of any simple set based solution to your problem, I am afraid, but I would hope that something along the following lines would be much faster than your existing cursor:
declare @test table
(id int,
first varchar(1),
second varchar(1))
declare @target table
(id int,
first varchar(1),
second varchar(1))
declare @temp table
(id int,
first varchar(1),
second varchar(1))
INSERT INTO @Test (id, first, second)
VALUES (1, 'A', '2'),
(2, 'A', '1'),
(3, 'A', '3'),
(4, 'B', '2'),
(5, 'B', '1'),
(6, 'B', '3'),
(7, 'B', '2'),
(8, 'B', '4'),
(9, 'C', '2'),
(10, 'C', '3')
declare @firsts table
(first varchar(1))
declare @seconds table
(second varchar(1))
INSERT INTO @firsts
SELECT DISTINCT first FROM @test
INSERT INTO @seconds
SELECT DISTINCT second FROM @test
declare @firstcnt int = (SELECT count(*) FROM @firsts)
declare @secondcnt int = (SELECT count(*) FROM @seconds)
WHILE (@firstcnt > 0 AND @secondcnt > 0)
BEGIN
DELETE FROM @temp
INSERT INTO @temp
SELECT TOP 1 t.id, t.first, t.second FROM @test t
INNER JOIN @firsts f On t.first = f.first
INNER JOIN @seconds s On t.second = s.second
ORDER BY id
INSERT INTO @target
SELECT * FROM @temp
DELETE FROM @firsts WHERE first = (SELECT first FROM @temp)
SET @firstcnt = @firstcnt - 1
DELETE FROM @seconds WHERE second = (SELECT second FROM @temp)
SET @secondcnt = @secondcnt - 1
END
SELECT * FROM @target
This does produce the desired values and I would expect it to be faster because the while loop only needs to run for the total number of unique value pairs, rather than having to step through the entire table.
It also gives 10 C 3 as the last row, which I take to be correct, despite @Gordon's comment. If I understand the question correctly, the ID order takes precedence: that is to say, although 'A' and 'B' have entries with '3' as the second value, these entries have a greater id than another second value that can legitimately be inserted.
HTH
Using a recursive CTE:
declare @Target table(col1 varchar(20),col2 int)
declare @Test table(col1 varchar(20),col2 int)
INSERT INTO @Test (col1, col2)
VALUES ('A', '2'),
('A', '1'),
('A', '3'),
('B', '1'),
('B', '2'),
('B', '3'),
('B', '2'),
('B', '4'),
('C', '2'),
('C', '3')
 
;With CTE as
(
select col1 ,col2
,DENSE_RANK()over( ORDER by col1)rn1
from @Test
)
,cte1 AS(
select top 1 c.col1,c.col2,rn1 from cte c where rn1=1
union ALL
select c.col1,c.col2,c.rn1 from cte c
inner join cte1 c1
on c.rn1>c1.rn1
where c.col2!=c1.col2
)
insert into @Target
select col1,col2 FROM(
select *,ROW_NUMBER()over(partition by col1 order by (select null)) rn2 from cte1
)t4
where rn2=1
select * from @Target

Multiple SQL MAX when items are not in order

I have some data as below:
DECLARE @MyTable AS TABLE
(productName varchar(13), test1 int,test2 int)
INSERT INTO @MyTable
(productName, test1,test2)
VALUES
('a', 1,1),
('a', 2,2),
('a', 3,3),
('b', 1,4),
('b', 2,5),
('b', 3,6),
('a', 1,7),
('a', 4,8),
('a', 5,9)
;
SELECT productname,MAX(test1) from @MyTable group BY productname
A MAX query on the test1 column gives:
a,5
b,3
but I need the result to be:
a,3
b,3
a,5
when ordered by test2.
You can solve this by using a trick with row numbers: assign two different row numbers, one over the whole data set and one partitioned by productname. If you compare the difference between these numbers, you can figure out where the product name has changed, and use that to determine the max values for each group.
select productname, max(test1) from (
SELECT *,
row_number() over (order by test2 asc) -
row_number() over (partition by productname order by test2 asc) as GRP
from @MyTable
) X
group by productname, GRP
You can test this in SQL Fiddle
If the test2 column is always a row number without gaps, you can use that instead of the first row number. If you need ordering in the result, you'll have to use, for example, the max of test1 to do that.
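To see how the GRP value isolates each consecutive run of the same product, the two row numbers and their difference can be listed side by side. A sketch, assuming the @MyTable data above:
-- Each consecutive run of the same productName shares one (productName, GRP) pair.
SELECT productName, test1, test2,
row_number() over (order by test2 asc) as rn_all,
row_number() over (partition by productName order by test2 asc) as rn_prod,
row_number() over (order by test2 asc)
- row_number() over (partition by productName order by test2 asc) as GRP
FROM @MyTable
ORDER BY test2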
Please check the following SQL Select statement
DECLARE @MyTable AS TABLE (productName varchar(13), test1 int,test2 int)
INSERT INTO @MyTable
(productName, test1,test2)
VALUES
('a', 1,1),
('a', 2,2),
('a', 3,3),
('b', 1,4),
('b', 2,5),
('b', 3,6),
('a', 1,7),
('a', 4,8),
('a', 5,9)
DECLARE @MyTableNew AS TABLE (id int identity(1,1), productName varchar(13), test1 int,test2 int)
insert into @MyTableNew select * from @MyTable
--select * from @MyTableNew
;with cte as (
SELECT
id, productName, test1, test2,
case when (lag(productName,1,'') over (order by id)) = productName then 0 else 1 end ischange
from @MyTableNew
), cte2 as (
select t.*,(select sum(ischange) from cte where id <= t.id) grp from cte t
)
select distinct grp, productName, max(test1) over (partition by grp) from cte2
This is implemented following the approach in a SQL Server LAG() function tutorial.
The LAG() function is used here to identify and order the groups in the table data.
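On SQL Server 2012 and later, the running total in cte2 can also be written as a windowed SUM, which avoids the correlated subquery. A sketch under that assumption, reusing the same @MyTableNew table variable:
;with cte as (
-- Flag the first row of each consecutive run of the same productName.
select id, productName, test1, test2,
case when (lag(productName,1,'') over (order by id)) = productName then 0 else 1 end ischange
from @MyTableNew
), cte2 as (
-- Running sum of the change flags numbers each run; equivalent to the correlated subquery above.
select *, sum(ischange) over (order by id rows unbounded preceding) grp from cte
)
select distinct grp, productName, max(test1) over (partition by grp) from cte2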
Please try this query
DECLARE @MyTable AS TABLE
(productName varchar(13), test1 int,test2 int)
INSERT INTO @MyTable
(productName, test1,test2)
VALUES
('a', 1,1),
('a', 2,2),
('a', 3,3),
('b', 1,4),
('b', 2,5),
('b', 3,6),
('a', 1,7),
('a', 4,8),
('a', 5,9)
;
SELECT productname,MAX(test1)
from @MyTable
where test1 = test2
group BY productname
union all
SELECT productname,MAX(test1)
from @MyTable
where test1 != test2
group BY productname

How to improve the efficiency and performance of this SQL select query

This script is very slow. How can I make it faster? It is being run against a large set of data.
I need to find instances where a group of fields are expected to have the same values but don't.
Here's a succinct example of the query:
CREATE TABLE #Example ( ID int, UserID int, ColA char(1), ColB char(1), ColC char(1), ColD char(1))
INSERT INTO #Example VALUES (1, 1, 'A', 'B', 'C', 'D');
INSERT INTO #Example VALUES (2, 1, 'A', 'B', 'C', 'D');
INSERT INTO #Example VALUES (3, 1, 'A', 'B', 'C', 'D');
INSERT INTO #Example VALUES (4, 1, 'A', 'B', 'C', 'D');
INSERT INTO #Example VALUES (5, 1, 'A', 'B', 'C', 'X');
SELECT UserID, ColA, ColB, ColC
FROM ( SELECT DISTINCT a.UserID, a.ColA, a.ColB, a.ColC, a.ColD FROM #Example a ) x
GROUP BY UserID, ColA, ColB, ColC
HAVING COUNT(ColD) > 1
As is, this returns 1 row, which is what I want; it's just slow. If the 'X' in row 5 were a 'D', then 0 rows would be returned.
Not sure if it's faster, but here is the same query as yours written more succinctly. I don't think you need the subquery.
SELECT UserID, ColA, ColB, ColC
FROM #Example a
GROUP BY UserID, ColA, ColB, ColC
HAVING COUNT(DISTINCT ColD) > 1;
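If the real table is large, an index whose key covers the grouping columns plus ColD may help the aggregate. A sketch, assuming the #Example table from the question; the index name is illustrative only, and the real table would need its own equivalent:
-- Hypothetical covering index to support the GROUP BY and COUNT(DISTINCT ColD).
CREATE INDEX IX_Example_Grouping
ON #Example (UserID, ColA, ColB, ColC, ColD)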