Find duplicate sets of data grouped by foreign key

Find duplicate sets of data grouped by foreign key - sql

How can I check whether the above table contains duplicate groups of rows, grouped by id? For example, the two rows of id 1 match two of the rows of id 2, but id 2 also has a third row that matches no row of id 1, so the two groups are not duplicates. There can be any number of ids.
I tried it to do with the group by and string_agg but it didn't work.
Here what I tried:
-- Asker's attempt. It reports 'Same.' as soon as ANY (m1, m2, n, n2) combination
-- occurs in more than one row. That detects duplicate rows, not duplicate *sets*
-- of rows per id, which is why it does not answer the question.
DECLARE @t2 TABLE (m1 int, m2 int, n varchar(50), n2 varchar(50), id int);

INSERT INTO @t2 (m1, m2, n, n2, id)
VALUES
    (3, 1, 'c', '',  1),
    (2, 1, 's', 'o', 1),
    (2, 1, 's', 'o', 2),
    (3, 1, 'c', '',  2),
    (3, 1, 'f', '',  2);

IF EXISTS (
    SELECT 1
    FROM @t2
    GROUP BY m1, m2, n, n2
    HAVING COUNT(*) > 1
)
BEGIN
    SELECT 'Same.';
END
ELSE
BEGIN
    SELECT 'not found';
END;
Any help here will be great.
Thanks

Thanks Iptr As per your solution in comment I am posting the same here:
-- Fixture: ids 1, 4 and 10 contain the same two rows; id 2 contains those two
-- rows plus a third one; id 5 contains a single row.
DECLARE @t2 TABLE (m1 int, m2 int, n varchar(5), n2 varchar(5), id int);

INSERT INTO @t2 (m1, m2, n, n2, id)
VALUES
    (3, 1, 'c', '',  1),
    (2, 1, 's', 'o', 1),
    (2, 1, 's', 'o', 2),
    (3, 1, 'c', '',  2),
    (3, 1, 'f', '',  2),
    (3, 1, 'c', '',  4),
    (2, 1, 's', 'o', 4),
    (3, 1, 'c', '',  10),
    (2, 1, 's', 'o', 10),
    (3, 1, 'c', '',  5);

-- Approach 1: pairwise comparison.
-- Two ids hold the same set when they have the same number of rows (idcnt) and
-- every row of one id finds an equal row in the other, i.e. the number of
-- matched rows equals idcnt.
-- Wrap the query in IF EXISTS (...) when only a yes/no answer is needed.
SELECT
    a.id,
    b.id
FROM
(
    SELECT *, COUNT(*) OVER (PARTITION BY id) AS idcnt
    FROM @t2
) AS a
INNER JOIN
(
    SELECT *, COUNT(*) OVER (PARTITION BY id) AS idcnt
    FROM @t2
) AS b
    ON  a.id < b.id            -- use <> instead of < to list each pair in both directions
    AND a.m1 = b.m1
    AND a.m2 = b.m2
    AND a.n  = b.n
    AND a.n2 = b.n2
    AND a.idcnt = b.idcnt
GROUP BY a.id, b.id, a.idcnt
HAVING COUNT(*) = a.idcnt;

-- Approach 2: serialise each id's rows (in a fixed order) to one JSON string
-- and group the ids by that string; every group with more than one id is a set
-- of duplicates. Requires FOR JSON and STRING_AGG support.
-- Wrap the query in IF EXISTS (...) when only a yes/no answer is needed.
SELECT STRING_AGG(CAST(i.id AS varchar(11)), ',')
FROM
(
    SELECT DISTINCT id
    FROM @t2
) AS i
CROSS APPLY
(
    SELECT r.m1, r.m2, r.n, r.n2
    FROM @t2 AS r
    WHERE r.id = i.id
    ORDER BY r.m1, r.m2, r.n, r.n2   -- fixed order so equal sets serialise identically
    FOR JSON PATH
) AS j(j)
GROUP BY j.j
HAVING COUNT(*) > 1;

You can count how many different ids for each set of rows. If the count is more than one, then there are duplicates. For example:
-- For every distinct (m1, m2, n, n2) combination, count the distinct ids it
-- appears under and keep the combinations found under more than one id.
SELECT
    m1,
    m2,
    n,
    n2,
    COUNT(DISTINCT id) AS cnt
FROM t
GROUP BY
    m1,
    m2,
    n,
    n2
HAVING
    COUNT(DISTINCT id) > 1

Related

How to write a SQL statement to return all children of each root,extend to each root

I have a table.
the value of [contents] can insert any word(Below are just examples)
-- Sample tree. ParentId = 0 marks a root row, Floors is the depth of the row
-- and Sort orders siblings under the same parent.
-- The key columns are INT: the sample rows and the queries (ParentId = 0) use
-- integer values, which a uniqueidentifier column cannot hold.
CREATE TABLE #TreeTable (
    Id       INT,
    Contents NVARCHAR(200),
    ParentId INT,
    Floors   INT,
    Sort     INT
);

-- One INSERT for all ten rows (no statement terminator in the middle of the list).
INSERT INTO #TreeTable ([Id], [ParentId], [Contents], [Floors], [Sort])
VALUES
    (1,  0, N'1',       1, 1),
    (2,  0, N'2',       1, 2),
    (3,  1, N'1-1',     2, 1),
    (4,  1, N'1-2',     2, 2),
    (5,  2, N'2-1',     2, 1),
    (6,  3, N'1-1-1',   3, 1),
    (7,  4, N'1-2-1',   3, 1),
    (8,  4, N'1-2-2',   3, 2),
    (9,  6, N'1-1-1-1', 4, 1),
    (10, 6, N'1-1-1-2', 4, 2);
I want to write a T-SQL statement that returns all children of each root along with the root ID. Below is the result I expect:
[Id][ParentId][Contents] [Levels][Sort]
(1, 0, '1', 1, 1)
(3, 1, '1-1', 2, 1)
(6, 3, '1-1-1', 3, 1)
(9, 6, '1-1-1-1', 4, 1)
(10, 6, '1-1-1-2', 4, 2)
(4, 1, '1-2', 2, 2)
(7, 4, '1-2-1', 3, 1)
(8, 4, '1-2-2', 3, 2)
(2, 0, '2', 1, 2)
(5, 2, '2-1', 2, 1)
The depth of the tree may be arbitrarily long
I tried this, but the result is not what I expected:
-- Walk the tree from the root rows (ParentId = 0) down to every descendant.
WITH Descendants AS (
    -- anchor member: the root rows
    SELECT root.*
    FROM TreeTable AS root
    WHERE root.ParentId = 0

    UNION ALL

    -- recursive member: children of rows that were already collected
    SELECT child.*
    FROM TreeTable AS child
    INNER JOIN Descendants AS parent
        ON parent.Id = child.ParentId
)
-- There is no ORDER BY, so the order of the returned rows is not guaranteed.
SELECT *
FROM Descendants

If we may believe [Contents] values, then simply
-- Derives the level from [Contents] itself: the number of '-' separators plus
-- one. Rows are returned in string order of [Contents].
SELECT
    [Id],
    [ParentId],
    [Contents],
    1 + LEN([Contents]) - LEN(REPLACE([Contents], '-', '')) AS [Levels],
    [Sort]
FROM #TreeTable
ORDER BY
    [Contents];
fiddle
If not then
-- Builds the path ([Contents]) and the depth ([Levels]) from the hierarchy
-- itself instead of trusting the stored [Contents] values.
-- Both members of the CTE cast the path to the same type (NVARCHAR(4000));
-- a recursive CTE requires matching column types in anchor and recursive part.
WITH cte AS
(
    -- anchor member: root rows; the path starts with the row's own Sort value
    SELECT
        [Id],
        [ParentId],
        CAST([Sort] AS NVARCHAR(4000)) AS [Contents],
        [Sort],
        1 AS [Levels]
    FROM #TreeTable
    WHERE [ParentId] = 0

    UNION ALL

    -- recursive member: append '-<Sort>' of the child to the parent's path
    SELECT
        t.[Id],
        t.[ParentId],
        CAST(cte.[Contents] + N'-' + CAST(t.[Sort] AS NVARCHAR(10)) AS NVARCHAR(4000)),
        t.[Sort],
        cte.[Levels] + 1
    FROM #TreeTable AS t
    INNER JOIN cte
        ON t.[ParentId] = cte.[Id]
)
SELECT
    [Id],
    [ParentId],
    [Contents],
    [Levels],
    [Sort]
FROM cte
-- String ordering: with more than 9 siblings under one parent, '10' sorts
-- before '2'; zero-pad the Sort value in the path if that matters.
ORDER BY [Contents];
fiddle (optimize types adjusting for their matching in CTE by yourself).

U can try:
-- Returns every row of the tree in depth-first order.
-- [ORDER] is a sort path such as '00001.00002.00001': one zero-padded,
-- five-digit sibling number per level, so plain string ordering lists each
-- parent directly before its children.
-- The path is carried as VARCHAR(1000) in both members; converting it with a
-- bare VARCHAR (default length 30) would silently cut it off after five levels.
WITH RECURSIVETABLE (ID, [PARENTID], [CONTENTS], [LEVELS], [SORT], [ORDER]) AS
(
    -- anchor member: root rows, numbered by SORT
    SELECT
        A.ID,
        A.[PARENTID],
        A.[CONTENTS],
        A.[FLOORS],
        A.[SORT],
        CONVERT(VARCHAR(1000),
                RIGHT('00000' + CONVERT(VARCHAR(50), ROW_NUMBER() OVER (ORDER BY A.SORT)), 5))
    FROM #TREETABLE A
    WHERE A.PARENTID = 0

    UNION ALL

    -- recursive member: parent's path + '.' + the child's padded number
    SELECT
        B.ID,
        B.[PARENTID],
        B.[CONTENTS],
        B.[FLOORS],
        B.[SORT],
        CONVERT(VARCHAR(1000),
                A.[ORDER] + '.'
                + RIGHT('00000' + CONVERT(VARCHAR(50), ROW_NUMBER() OVER (ORDER BY B.[FLOORS], B.SORT)), 5))
    FROM #TREETABLE B
    INNER JOIN RECURSIVETABLE A
        ON A.ID = B.PARENTID
)
SELECT *
FROM RECURSIVETABLE
ORDER BY [ORDER];

How to use ROW_NUMBER when grouping records?

I have the following:
-- Fixture: nine items; Description 'A' occurs 4 times, 'B' 3 times, 'C' twice.
DECLARE @items TABLE
(
    ItemId        int          NOT NULL,
    [Description] varchar(255) NOT NULL,
    Amount        money        NOT NULL
);

INSERT INTO @items (ItemId, [Description], Amount)
VALUES
    (1, 'A', 10),
    (2, 'A', 10),
    (3, 'B', 11),
    (4, 'B', 11),
    (5, 'B', 11),
    (6, 'C', 12),
    (7, 'C', 12),
    (8, 'A', 10),
    (9, 'A', 10);

-- Asker's attempt. Every window is partitioned by ItemId, and each ItemId in
-- the fixture is unique, so every row forms its own partition: nine rows come
-- back, each with row number 1, count 1 and its own Amount.
SELECT
    ROW_NUMBER() OVER (PARTITION BY b.ItemId ORDER BY b.[Description]),
    b.[Description],
    COUNT(b.ItemId) OVER (PARTITION BY b.ItemId),
    SUM(b.Amount) OVER (PARTITION BY b.ItemId)
FROM @items AS b;
The result should be:
1, A, 4, 40
2, B, 3, 33
3, C, 2, 24
However the items are not being grouped.
So how to I need to use ROW_NUMBER to group records?

Is this what you want?
-- One output row per Description: a running row number (assigned after the
-- grouping), the number of items and the total amount of the group.
SELECT
    ROW_NUMBER() OVER (ORDER BY i.[Description]) AS RowNo,
    i.[Description],
    COUNT(*)      AS ItemCount,
    SUM(i.Amount) AS TotalAmount
FROM @items AS i
GROUP BY i.[Description]
ORDER BY i.[Description];
Here is a rextester.

If you don't want use GROUP BY by itself you may do a subquery with two row_number(), something like this:
-- Same result without GROUP BY: compute count and sum per Description with
-- window functions, keep one row per Description (rn = 1), then number the
-- surviving rows.
SELECT
    ROW_NUMBER() OVER (ORDER BY t.[Description]) AS RowNo,
    t.[Description],
    t.cnt,
    t.summ
FROM (
    SELECT
        ROW_NUMBER() OVER (PARTITION BY b.[Description] ORDER BY b.[Description]) AS rn,
        b.[Description],
        COUNT(b.ItemId) OVER (PARTITION BY b.[Description]) AS cnt,
        SUM(b.Amount)   OVER (PARTITION BY b.[Description]) AS summ
    FROM @items AS b
) AS t
WHERE t.rn = 1;
In any case, you shouldn't partition the data by ItemId: every ItemId is unique, so that cannot produce the per-Description totals you are after.

Duplicate in same row in different columns

i have a table like this:
-- Sample table: five value columns per row plus a flag that the answers below
-- set to 'Y' for rows they consider duplicates.
CREATE TABLE #my_table (
-- identity key, assigned in insert order starting at 1
intID int IDENTITY (1, 1),
num_1 varchar(100) NOT NULL,
num_2 varchar(100) NOT NULL,
num_3 varchar(100) NOT NULL,
-- num_4 and num_5 are nullable
num_4 varchar(100),
num_5 varchar(100),
-- defaults to 'N' because the inserts below do not supply it
isDuplicate char(1) DEFAULT 'N'
)
-- First three rows: all five values differ within the row.
INSERT INTO #my_table (num_1, num_2, num_3, num_4, num_5)
VALUES ('a', 'b', 'c', 'd', 'e')
INSERT INTO #my_table (num_1, num_2, num_3, num_4, num_5)
VALUES ('a', 'b', 'c', 'd', 'e')
INSERT INTO #my_table (num_1, num_2, num_3, num_4, num_5)
VALUES ('a', 'b', 'c', 'd', 'e')
-- Fourth and fifth rows: 'a' appears in both num_1 and num_3.
INSERT INTO #my_table (num_1, num_2, num_3, num_4, num_5)
VALUES ('a', 'b', 'a', 'd', 'e')
INSERT INTO #my_table (num_1, num_2, num_3, num_4, num_5)
VALUES ('a', 'b', 'a', 'd', 'e')
-- Sixth row: 'c' appears in both num_3 and num_5.
INSERT INTO #my_table (num_1, num_2, num_3, num_4, num_5)
VALUES ('a', 'b', 'c', 'd', 'c')
I need to find duplicates in columns and get the row number which is duplicate.
my result should be
The duplicate rows are the last 3 rows, and their isDuplicate flag should be updated to 'Y'.

This could also do the trick:
-- Flags every row in which the same non-empty value occurs in more than one of
-- the num_1..num_5 columns.
UPDATE target
SET isDuplicate = 'Y'
FROM #my_table AS target
WHERE EXISTS (
    -- turn the five columns of the current row into five rows of one column,
    -- ignore NULL and empty values, and look for a value that repeats
    SELECT vals.n
    FROM (VALUES
            (target.num_1),
            (target.num_2),
            (target.num_3),
            (target.num_4),
            (target.num_5)
         ) AS vals (n)
    WHERE NULLIF(vals.n, '') IS NOT NULL
    GROUP BY vals.n
    HAVING COUNT(*) > 1
);
More information about table value constructors you can find here.

This should do the trick :
-- Lists each (num_1, num_2, num_3) combination that occurs in more than one
-- row, together with the number of rows it occurs in.
SELECT
    num_1,
    num_2,
    num_3,
    COUNT(*)
FROM #my_table
GROUP BY
    num_1,
    num_2,
    num_3
HAVING
    COUNT(*) > 1
Regards

This sets the isDuplicate column to 'Y' when a row is a duplicate of an earlier row; you can then query on that column.
-- Ranks the rows inside each group of identical (num_1..num_5) values by intID
-- and flags every row except the first one of its group.
WITH ranked AS (
    SELECT
        isDuplicate,
        RANK() OVER (
            PARTITION BY num_1, num_2, num_3, num_4, num_5
            ORDER BY intID ASC
        ) AS occurrence
    FROM #my_table
)
UPDATE ranked
SET isDuplicate = 'Y'
WHERE occurrence > 1;

Try this:
-- Sets isDuplicate for EVERY row (deliberately no WHERE clause): 'Y' when
-- another row with the same five values exists, otherwise 'N'.
-- EXISTS stops at the first matching row instead of counting all of them.
-- num_4 and num_5 are nullable, so they are compared NULL-safely: two NULLs
-- count as equal, which a plain '=' would never report.
UPDATE t
SET isDuplicate =
    CASE
        WHEN EXISTS (
            SELECT 1
            FROM #my_table AS other
            WHERE other.intID <> t.intID
              AND other.num_1 = t.num_1
              AND other.num_2 = t.num_2
              AND other.num_3 = t.num_3
              AND (other.num_4 = t.num_4 OR (other.num_4 IS NULL AND t.num_4 IS NULL))
              AND (other.num_5 = t.num_5 OR (other.num_5 IS NULL AND t.num_5 IS NULL))
        ) THEN 'Y'
        ELSE 'N'
    END
FROM #my_table AS t;

Identifying/comparing sets of rows within groups

I have a matter which seemed simple to solve but now I find it troublesome.
In simplification - I need to find a way to identify unique sets of rows within groups defined by another column. In basic example the source table contains only two columns:
routeID nodeID nodeName
1 1 a
1 2 b
2 1 a
2 2 b
3 1 a
3 2 b
4 1 a
4 2 c
5 1 a
5 2 c
6 1 a
6 2 b
6 3 d
7 1 a
7 2 b
7 3 d
So, the routeID column refers to set of nodes which define a route.
What I need to do is to somehow group the routes, so that there will be only one unique sequence of nodes for one routeID.
In my actual case I tried to use window function to add columns which help to identify nodes sequence, but I still have no idea how to get those unique sequences and group routes.
As a final effect I want to get only unique routes - for example routes 1,2 and 3 aggregated to one route.
Do you have any idea how to help me ?
EDIT:
The other table which I would like to join with the one from the example may look like that:
journeyID nodeID nodeName routeID
1 1 a 1
1 2 b 1
2 1 a 1
2 2 b 1
3 1 a 4
3 2 c 4
...........................
...........................

You can try this idea:
-- Fixture: seven routes; routes 1-3 share nodes a,b; routes 4-5 share a,c;
-- routes 6-7 share a,b,d.
DECLARE @DataSource TABLE
(
    [routeID]  TINYINT
   ,[nodeID]   TINYINT
   ,[nodeName] CHAR(1)
);

INSERT INTO @DataSource ([routeID], [nodeID], [nodeName])
VALUES (1, 1, 'a')
      ,(1, 2, 'b')
      ,(2, 1, 'a')
      ,(2, 2, 'b')
      ,(3, 1, 'a')
      ,(3, 2, 'b')
      ,(4, 1, 'a')
      ,(4, 2, 'c')
      ,(5, 1, 'a')
      ,(5, 2, 'c')
      ,(6, 1, 'a')
      ,(6, 2, 'b')
      ,(6, 3, 'd')
      ,(7, 1, 'a')
      ,(7, 2, 'b')
      ,(7, 3, 'd');

-- One row per route: the route id, its node names as a CSV string (ordered by
-- nodeID) and a running number among the routes that share that CSV string.
SELECT DS.[routeID]
      ,nodes.[value]
      ,ROW_NUMBER() OVER (PARTITION BY nodes.[value] ORDER BY DS.[routeID]) AS [rowID]
FROM
(
    -- getting unique route ids
    SELECT DISTINCT [routeID]
    FROM @DataSource
) DS ([routeID])
CROSS APPLY
(
    -- for each route id, build the CSV list of its node names;
    -- STUFF removes the leading comma
    SELECT STUFF
           (
               (
                   SELECT ',' + DSI.[nodeName]
                   FROM @DataSource DSI
                   WHERE DSI.[routeID] = DS.[routeID]
                   ORDER BY DSI.[nodeID]
                   FOR XML PATH(''), TYPE
               ).value('.', 'VARCHAR(MAX)')
              ,1
              ,1
              ,''
           )
) nodes ([value]);
The code will give you this output:
So, you simply need to filter by rowID = 1. Of course, you can change the code as you like to satisfy your business criteria (for example, showing not the first route ID with the same nodes, but the last).
Also, ROW_NUMBER function cannot be used directly in the WHERE clause, so you need to wrap the code before filtering:
-- Returns the node rows of one representative route per distinct node list.
-- The CTE computes, per route, the CSV list of node names and a running number
-- among routes with the same list; rowID = 1 keeps the lowest routeID of each
-- group, and the join brings back that route's rows.
WITH DataSource AS
(
    SELECT DS.[routeID]
          ,nodes.[value]
          ,ROW_NUMBER() OVER (PARTITION BY nodes.[value] ORDER BY DS.[routeID]) AS [rowID]
    FROM
    (
        -- getting unique route ids
        SELECT DISTINCT [routeID]
        FROM @DataSource
    ) DS ([routeID])
    CROSS APPLY
    (
        -- for each route id, build the CSV list of its node names;
        -- STUFF removes the leading comma
        SELECT STUFF
               (
                   (
                       SELECT ',' + DSI.[nodeName]
                       FROM @DataSource DSI
                       WHERE DSI.[routeID] = DS.[routeID]
                       ORDER BY DSI.[nodeID]
                       FOR XML PATH(''), TYPE
                   ).value('.', 'VARCHAR(MAX)')
                  ,1
                  ,1
                  ,''
               )
    ) nodes ([value])
)
SELECT DS2.*
FROM DataSource DS1
INNER JOIN @DataSource DS2
    ON DS1.[routeID] = DS2.[routeID]
WHERE DS1.[rowID] = 1;

ok, let's use some recursion to create a complete node list for each routeID
First of all, let's populate the source table and the journeys table:
-- your source
-- your source: one row per (route, node)
DECLARE @r AS TABLE (routeID int, nodeID int, nodeName char(1));
-- your other table: the nodes of each journey and the route the journey uses
DECLARE @j AS TABLE (journeyID int, nodeID int, nodeName char(1), routeID int);
-- temp results table: one row per route with its node names as a CSV string
DECLARE @routes AS TABLE (routeID int PRIMARY KEY, nodeNames varchar(1000));

INSERT INTO @r (routeID, nodeID, nodeName)
VALUES
    (1, 1, 'a'),
    (1, 2, 'b'),
    (2, 1, 'a'),
    (2, 2, 'b'),
    (3, 1, 'a'),
    (3, 2, 'b'),
    (4, 1, 'a'),
    (4, 2, 'c'),
    (5, 1, 'a'),
    (5, 2, 'c'),
    (6, 1, 'a'),
    (6, 2, 'b'),
    (6, 3, 'd'),
    (7, 1, 'a'),
    (7, 2, 'b'),
    (7, 3, 'd');

-- Column list matches the table definition: the last value of each tuple is
-- the routeID (journeys 1 and 2 use route 1, journey 3 uses route 4).
INSERT INTO @j (journeyID, nodeID, nodeName, routeID)
VALUES
    (1, 1, 'a', 1),
    (1, 2, 'b', 1),
    (2, 1, 'a', 1),
    (2, 2, 'b', 1),
    (3, 1, 'a', 4),
    (3, 2, 'c', 4);
Now let's extract the routes:
-- Builds, for every route, the comma-separated list of its node names and
-- stores it in @routes.
-- The walk starts at nodeID = 1 and follows nodeID + 1, so it relies on the
-- node ids of a route being consecutive and starting at 1.
;WITH
numbered AS (
    -- n2 numbers the nodes of a route from the last node backwards,
    -- so n2 = 1 marks the final node of the route
    SELECT
        src.routeID,
        src.nodeID,
        src.nodeName,
        ROW_NUMBER() OVER (PARTITION BY src.routeID ORDER BY src.nodeID DESC) AS n2
    FROM @r AS src
),
walk AS (
    -- anchor member: first node of every route
    SELECT
        n.routeID,
        n.nodeID,
        n.nodeName,
        n.n2,
        CAST(n.nodeName AS varchar(1000)) AS Names
    FROM numbered AS n
    WHERE n.nodeID = 1

    UNION ALL

    -- recursive member: append the next node's name to the list built so far
    SELECT
        n.routeID,
        n.nodeID,
        n.nodeName,
        n.n2,
        CAST(walk.Names + ',' + n.nodeName AS varchar(1000))
    FROM numbered AS n
    INNER JOIN walk
        ON  walk.routeID = n.routeID
        AND walk.nodeID  = n.nodeID - 1
)
INSERT INTO @routes (routeID, nodeNames)
SELECT routeID, Names
FROM walk
WHERE n2 = 1;   -- only the row reached at the last node holds the complete list
table #routes will be like this:
routeID nodeNames
1 'a,b'
2 'a,b'
3 'a,b'
4 'a,c'
5 'a,c'
6 'a,b,d'
7 'a,b,d'
And now the final output:
-- the unique routes
-- the unique routes: the lowest routeID for every distinct node list
SELECT
    MIN(r.routeID) AS routeID,
    r.nodeNames
FROM @routes AS r
GROUP BY r.nodeNames;

-- the unique journeys: the lowest journeyID for every distinct node list of
-- the route the journey uses
SELECT
    MIN(j.journeyID) AS journeyID,
    r.nodeNames
FROM @j AS j
INNER JOIN @routes AS r
    ON j.routeID = r.routeID
GROUP BY r.nodeNames;
output:
routeID nodeNames
1 'a,b'
4 'a,c'
6 'a,b,d'
and
journeyID nodeNames
1 'a,b'
3 'a,c'

SQL Server function to get top level parent in hierarchy

I have following table (master_group) structure :
code name under
1 National Sales Manager 1
2 regional sales manager 1
3 area sales manager 2
4 sales manager 3
How do I get the ultimate parent of a particular row like :
code name under ultimateparent
1 National Sales Manager 1 1
2 regional sales manager 1 1
3 area sales manager 2 1
4 sales manager 3 1

With recursive cte going from top to childs:
-- Top-down walk: start at the rows that are their own parent (code = under)
-- and hand their code down to every descendant as "ultimate".
WITH hierarchy AS (
    -- anchor member: top-level rows
    SELECT top_row.*, top_row.code AS ultimate
    FROM t AS top_row
    WHERE top_row.code = top_row.under

    UNION ALL

    -- recursive member: children inherit the ultimate value of their parent;
    -- top-level rows are skipped so they do not join to themselves again
    SELECT child.*, parent.ultimate
    FROM t AS child
    INNER JOIN hierarchy AS parent
        ON parent.code = child.under
    WHERE child.code <> child.under
)
SELECT *
FROM hierarchy
For data:
-- Fixture: two trees; a top-level row points to itself (code = under).
CREATE TABLE t (
    code  int,
    name  varchar(100),
    under int
)
INSERT INTO t (code, name, under)
VALUES
    (1,  'National Sales Manager', 1),
    (2,  'regional sales manager', 1),
    (3,  'area sales manager',     2),
    (4,  'sales manager',          3),
    (5,  'a', 5),
    (6,  'b', 5),
    (7,  'c', 5),
    (8,  'd', 7),
    (9,  'e', 7),
    (10, 'f', 9),
    (11, 'g', 9)
it generates the output:
code name under ultimate
1 National Sales Manager 1 1
5 a 5 5
6 b 5 5
7 c 5 5
8 d 7 5
9 e 7 5
10 f 9 5
11 g 9 5
2 regional sales manager 1 1
3 area sales manager 2 1
4 sales manager 3 1
Fiddle http://sqlfiddle.com/#!6/17c12e/1

You can use a recursive CTE to walk the tree and then choose the highest level for each code:
-- For every row, climbs the hierarchy one level per recursion step and keeps
-- the ancestor found at the highest step as the ultimate parent.
WITH cte AS (
    -- anchor member: every row with its direct parent as the first candidate
    SELECT
        mg.code,
        mg.name  AS name,
        mg.under AS under,
        mg.under AS parent,
        1        AS lev
    FROM master_group AS mg

    UNION ALL

    -- recursive member: replace the candidate by the candidate of the parent
    -- row. It must be cte.parent (the ancestor found so far); cte.under is
    -- always the parent row's direct parent and would stop at the grandparent.
    SELECT
        mg.code,
        mg.name,
        mg.under,
        cte.parent AS parent,
        cte.lev + 1
    FROM master_group AS mg
    INNER JOIN cte
        ON mg.under = cte.code
    -- skip rows without a parent and the self-reference of a top-level row
    WHERE cte.under IS NOT NULL
      AND cte.under <> mg.code
)
SELECT
    code,
    name,
    under,
    parent AS ultimateparent
FROM (
    SELECT cte.*, MAX(lev) OVER (PARTITION BY cte.code) AS maxlev
    FROM cte
) AS t
WHERE lev = maxlev;
Here is a SQL Fiddle.

I would put NULL as under (in my example ParentId) when it's the top record. With this assumption here's a solution
-- Top-down walk for a table where top-level rows have ParentId = NULL:
-- each root's Id is handed down to all of its descendants as [Top].
;
WITH Result AS
(
    -- anchor member: the top-level rows
    SELECT s.Id, s.ParentId, s.Name, s.Id AS [Top]
    FROM sample AS s
    WHERE s.ParentId IS NULL

    UNION ALL

    -- recursive member: children inherit [Top] from their parent
    SELECT s.Id, s.ParentId, s.Name, R.[Top]
    FROM sample AS s
    INNER JOIN Result AS R
        ON s.ParentId = R.Id
)
-- A CTE must be followed by the statement that consumes it.
SELECT Id, ParentId, Name, [Top]
FROM Result;
http://sqlfiddle.com/#!6/13b9d/14

I suggest you to use a recursive function like this:
-- Returns the top-level ancestor of @code in master_group.
-- A top-level row is one whose "under" equals its own code.
-- @code: the code of the row to start from.
-- Returns NULL when @code does not exist or when an "under" value is NULL
-- (the comparison with NULL is not true, so the walk stops there).
-- The function calls itself once per level, so the depth it can handle is
-- bounded by SQL Server's nesting limit for nested calls.
CREATE FUNCTION dbo.parentID (@code int)
RETURNS int AS
BEGIN
    DECLARE @ResultVar int

    -- direct parent of the current row
    SELECT @ResultVar = (SELECT under FROM master_group WHERE code = @code)

    -- not yet at a self-referencing row: continue from the parent
    IF @ResultVar <> @code
    BEGIN
        SELECT @ResultVar = dbo.parentID(@ResultVar)
    END

    RETURN @ResultVar
END
GO
And use it like this:
-- Every row of master_group plus the top-level ancestor computed by the function.
SELECT
    mg.*,
    dbo.parentId(mg.code) AS ultimateparent
FROM master_group AS mg

I'm going to shamelessly steal the data setup from another answer and demonstrate how you'd do this with hierarchyid:
-- Fixture: two trees; a top-level row has under = NULL.
create table t (code int, name varchar(100), under int);

insert into t (code, name, under)
values
    (1,  'National Sales Manager', null),
    (2,  'regional sales manager', 1),
    (3,  'area sales manager',     2),
    (4,  'sales manager',          3),
    (5,  'a', null),
    (6,  'b', 5),
    (7,  'c', 5),
    (8,  'd', 7),
    (9,  'e', 7),
    (10, 'f', 9),
    (11, 'g', 9);

-- cte walks the tree top-down and builds, per row, the path of codes from the
-- root ('/1/2/3/') plus the code of the root (ultimateParentCode).
-- hier converts the path string to hierarchyid so its methods can be used.
with cte as (
    -- anchor member: the roots; the alias must be ultimateParentCode because
    -- the recursive member and the outer queries refer to it by that name
    select
        code,
        name,
        under as parentCode,
        code  as ultimateParentCode,
        cast('/' + cast(code as varchar(11)) + '/' as nvarchar(max)) as h
    from t
    where under is null

    union all

    -- recursive member: extend the parent's path by the child's code
    select
        child.code,
        child.name,
        child.under as parentCode,
        parent.ultimateParentCode,
        cast(parent.h + cast(child.code as varchar(11)) + '/' as nvarchar(max))
    from t as child
    join cte as parent
        on child.under = parent.code
), hier as (
    select code, name, parentCode, ultimateParentCode, cast(h as hierarchyid) as h
    from cte
)
select
    code,
    name,
    parentCode,
    ultimateParentCode,
    h.ToString() as hPath,
    -- the ancestor (GetLevel() - 1) levels up is the first element of the path
    h.GetAncestor(h.GetLevel() - 1).ToString() as rootPath
from hier;
Keep in mind, the recursive CTE need only be done once (or on data changes). The point that I'm making is that once you have a hierarchyid calculated (which you can store in row, btw), it's easy to answer the question you're posing with method calls on the hierarchyid (and possibly a join if you want to get back the progenitor's info).

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

Find duplicate sets of data grouped by foreign key - sql

You can count how many different ids for each set of rows. If the count is more than one, then there are duplicates. For example: select m1, m2, n, n2, count(distinct id) as cnt from t group by m1, m2, n, n2 having count(distinct id) > 1

Related

How to write a SQL statement to return all children of each root,extend to each root

How to use ROW_NUMBER when grouping records?

Duplicate in same row in different columns

Identifying/comparing sets of rows within groups

SQL Server function to get top level parent in hierarchy

Categories

Resources