Identifying duplicate GROUPS of data in SQL

Identifying duplicate GROUPS of data in SQL - sql

My question is how to identify duplicate (repeating) 'groups' of data within an SQL table. I am using SQL Server 2005 at the moment so prefer solutions based on that or ansi-sql.
Here is a sample table and expected result (below) to base this question on:
declare #data table (id nvarchar(10), fund nvarchar(1), xtype nvarchar(1))
insert into #data select 'Switch_1', 'A', 'S'
insert into #data select 'Switch_1', 'X', 'B'
insert into #data select 'Switch_1', 'Y', 'B'
insert into #data select 'Switch_1', 'Z', 'B'
insert into #data select 'Switch_2', 'A', 'S'
insert into #data select 'Switch_2', 'X', 'B'
insert into #data select 'Switch_2', 'Y', 'B'
insert into #data select 'Switch_2', 'Z', 'B'
insert into #data select 'Switch_3', 'C', 'S'
insert into #data select 'Switch_3', 'D', 'B'
insert into #data select 'Switch_4', 'C', 'S'
insert into #data select 'Switch_4', 'F', 'B'
(new data)
insert into #data select 'Switch_5', 'A', 'S'
insert into #data select 'Switch_5', 'X', 'B'
insert into #data select 'Switch_5', 'Y', 'B'
insert into #data select 'Switch_5', 'Z', 'B'
-- id fund xtype match
-- ---------- ---- ----- ---------
-- Switch_1 A S Match_1
-- Switch_1 X B Match_1
-- Switch_1 Y B Match_1
-- Switch_1 Z B Match_1
-- Switch_2 A S Match_1
-- Switch_2 X B Match_1
-- Switch_2 Y B Match_1
-- Switch_2 Z B Match_1
-- Switch_3 C S
-- Switch_3 D B
-- Switch_4 C S
-- Switch_4 F B
(new results)
-- Switch_5 A S Match_1
-- Switch_5 X B Match_1
-- Switch_5 Y B Match_1
-- Switch_5 Z B Match_1
I only want matches on an ALL or NOTHING basis (i.e. All records in the group match all records in another group - not a part match). Any match id can be used (I have used Match_1 above but can be numeric etc.)
Thanks for any help here.
(EDIT: I guess I should add that there could be any number of rows per group, not just the 2 or 4 shown in the sample above - and I'm also trying to avoid cursors)
(EDIT 2: I seem to have an issue if there are more than one matches found. The output from the SQL supplied is returning duplicate records for Switch_1 when there are more than one matches found. I have updated the sample data accordingly. Not sure if Lieven is still following this - I'm also looking at the solution and will post here if found.)

The flow of execution is as follows
q: Combine all funds and xtypes of one id into one string using an XML PATH construction
r: Select a ROW_NUMBER and the respective id's for matching groups
Select the results by LEFT JOINING #data and r
SQL Statement
;WITH q AS (
SELECT DISTINCT d.id
, DuplicateData = STUFF((SELECT ', ' + fund + xtype FROM #data WHERE id = d.id FOR XML PATH('')), 1, 2, '')
FROM #data d
)
, r AS (
SELECT id1 = q1.id
, id2 = q2.id
, rn = ROW_NUMBER() OVER (ORDER BY q1.ID)
FROM q q1
INNER JOIN q q2 ON q1.DuplicateData = q2.DuplicateData AND q1.id < q2.id
)
SELECT id
, fund
, xtype
, match = 'Match_' + CAST(r.rn AS VARCHAR(32))
FROM #data d
LEFT OUTER JOIN r ON d.id IN (r.id1, r.id2)
Results
id fund xtype match
---------- ---- ----- --------------------------------------
Switch_1 A S Match_1
Switch_1 X B Match_1
Switch_1 Y B Match_1
Switch_1 Z B Match_1
Switch_2 A S Match_1
Switch_2 X B Match_1
Switch_2 Y B Match_1
Switch_2 Z B Match_1
Switch_3 C S NULL
Switch_3 D B NULL
Switch_4 C S NULL
Switch_4 F B NULL

Here it is another query for it:
create table #temp1 (
id varchar(10),
fund nvarchar(1),
xtype nvarchar(1)
)
insert into #temp1 select 'Switch_1', 'A', 'S'
insert into #temp1 select 'Switch_1', 'X', 'B'
insert into #temp1 select 'Switch_1', 'Y', 'B'
insert into #temp1 select 'Switch_1', 'Z', 'B'
insert into #temp1 select 'Switch_2', 'A', 'S'
insert into #temp1 select 'Switch_2', 'X', 'B'
insert into #temp1 select 'Switch_2', 'Y', 'B'
insert into #temp1 select 'Switch_2', 'Z', 'B'
insert into #temp1 select 'Switch_3', 'C', 'S'
insert into #temp1 select 'Switch_3', 'D', 'B'
insert into #temp1 select 'Switch_4', 'C', 'S'
insert into #temp1 select 'Switch_4', 'F', 'B'
select t1.*, case when t2.equal = t3.total then 'True' else 'False' end as 'Match' from #temp1 t1
left outer join (select m.id, count(m2.id) as 'equal' from #temp1 m
inner join #temp1 m2 on m.Id <> m2.Id and m.fund = m2.fund and m.xtype = m2.xtype
group by m.id) t2 on t1.id = t2.id
inner join (select m3.id, count(m3.fund) as 'total' from #temp1 m3 group by m3.id) t3 on t3.id = t1.id
drop table #temp1

Related

How to get previously changed records?

How to get previously changed records for particular data?
From the below records, I have tried but didn't got idea to make it
DECLARE #l_MSISDN AS TABLE
(
ID INT IDENTITY(1,1),
Old NVARCHAR(50),
New NVARCHAR(50),
AuthDate DATETIME
)
INSERT INTO #l_MSISDN VALUES
('A','B',GETDATE()),
('B','C',GETDATE()),
('C','D',GETDATE()),
('R','T',GETDATE()),
('R','Q',GETDATE())
;WITH CTE AS
(
select New,OLD,AuthDate
from #l_MSISDN nolock
UNION ALL
SELECT * from cte
where New = OLD
)
select * from cte
order by AuthDate
Old New AuthDate
A B 2018-04-04 11:06:51.953
B C 2018-04-04 10:39:03.563
C D 2014-12-20 06:25:20.397
R T 2016-02-10 15:25:20.123
Q R 2015-09-21 15:25:20.330
I expect output as
old new Authdate
A B 2014-12-20 06:25:20.397
B C 2015-09-21 15:25:20.330
C D 2016-02-10 15:25:20.123
I'll give input as D

You weren't too far off with your attempt. You need to reference your original table in the UNION ALL as well by a JOIN:
DECLARE #Start char(1) = 'D';
SELECT *
INTO #Temp
FROM (VALUES ('A', 'B', CONVERT(datetime,'2018-04-04T11:06:51.953')),
('B', 'C', CONVERT(datetime,'2018-04-04T10:39:03.563')),
('C', 'D', CONVERT(datetime,'2014-12-20T06:25:20.397')),
('R', 'T', CONVERT(datetime,'2016-02-10T15:25:20.123')),
('Q', 'R', CONVERT(datetime,'2015-09-21T15:25:20.330'))) V (Old, New, AuthDate);
WITH rCTe AS(
SELECT T.Old,
T.New,
T.AuthDate
FROM #Temp T
WHERE T.New = #Start
UNION ALL
SELECT T.Old,
T.New,
T.AuthDate
FROM #Temp T
JOIN rCTE r ON r.Old = T.New)
SELECT r.Old,
r.New,
r.AuthDate
FROM rCTe r;
DROP TABLE #Temp;

SQL logic for the Vlookup function in excel/ How to do a Vlookup in SQL

I have a table with 2 columns OLD_VALUE and NEW_VALUE and 5 rows. 1st row has values (A,B). Other row values can be (B,C),(C,D),(E,D),(D,F). I want to update all the old values with the new value (how a vlookup in excel would work) The Final Result Required: The newest value in the above example would be D,F. i.e. D points to F. E and C point to D. B points to C and A points to B. D pointing to F is the last and newest and there are no more successions after D,F. So (OLD_VALUE,NEW_VALUE)->(A,F), (B,F), (C,F), (D,F), (E,F). I want 5 rows with the NEW_VALUE as 'F'. The level of successions can be ranging from 1 to x.

This is the table I have used for the script:
declare #t as table(old_value char(1), new_value char(1));
insert into #t values('A','B')
insert into #t values('B','C')
insert into #t values('C','D')
insert into #t values('E','D')
insert into #t values('D','F')
This needs to be done with a recursive CTE. First, you will need to define an anchor for the CTE. The anchor in this case should be the record with the latest value. This is how I define the anchor:
select old_value, new_value, 1 as level
from #t
where new_value NOT IN (select old_value from #t)
And here is the recursive CTE I used to locate the latest value for each row:
;with a as(
select old_value, new_value, 1 as level
from #t
where new_value NOT IN (select old_value from #t)
union all
select b.old_value, a.new_value, a.level + 1
from a INNER JOIN #t b ON a.old_value = b.new_value
)
select * from a
Results:
old_value new_value level
--------- --------- -----------
D F 1
C F 2
E F 2
B F 3
A F 4
(5 row(s) affected)

I think a recursive CTE like the following is what you're looking for (where the parent is the row whose second value does not exist as a first value elsewhere). If there's no parent(s) to anchor to, this would fail (e.g. if you had A->B, B->C, C->A, you'd get no result), but it should work for your case:
DECLARE #T TABLE (val1 CHAR(1), val2 CHAR(2));
INSERT #T VALUES ('A', 'B'), ('B', 'C'), ('C', 'D'), ('E', 'D'), ('D', 'F');
WITH CTE AS
(
SELECT val1, val2
FROM #T AS T
WHERE NOT EXISTS (SELECT 1 FROM #T WHERE val1 = T.val2)
UNION ALL
SELECT T.val1, CTE.val2
FROM #T AS T
JOIN CTE
ON CTE.val1 = T.val2
)
SELECT *
FROM CTE;

Matching multiple key/value pairs in SQL

I have metadata stored in a key/value table in SQL Server. (I know key/value is bad, but this is free-form metadata supplied by users, so I can't turn the keys into columns.) Users need to be able to give me an arbitrary set of key/value pairs and have me return all DB objects that match all of those criteria.
For example:
Metadata:
Id Key Value
1 a p
1 b q
1 c r
2 a p
2 b p
3 c r
If the user says a=p and b=q, I should return object 1. (Not object 2, even though it also has a=p, because it has b=p.)
The metadata to match is in a table-valued sproc parameter with a simple key/value schema. The closest I have got is:
select * from [Objects] as o
where not exists (
select * from [Metadata] as m
join #data as n on (n.[Key] = m.[Key])
and n.[Value] != m.[Value]
and m.[Id] = o.[Id]
)
My "no rows exist that don't match" is an attempt to implement "all rows match" by forming its contrapositive. This does eliminate objects with mismatching metadata, but it also returns objects with no metadata at all, so no good.
Can anyone point me in the right direction? (Bonus points for performance as well as correctness.)

; WITH Metadata (Id, [Key], Value) AS -- Create sample data
(
SELECT 1, 'a', 'p' UNION ALL
SELECT 1, 'b', 'q' UNION ALL
SELECT 1, 'c', 'r' UNION ALL
SELECT 2, 'a', 'p' UNION ALL
SELECT 2, 'b', 'p' UNION ALL
SELECT 3, 'c', 'r'
),
data ([Key], Value) AS -- sample input
(
SELECT 'a', 'p' UNION ALL
SELECT 'b', 'q'
),
-- here onwards is the actual query
data2 AS
(
-- cnt is to count no of input rows
SELECT [Key], Value, cnt = COUNT(*) OVER()
FROM data
)
SELECT m.Id
FROM Metadata m
INNER JOIN data2 d ON m.[Key] = d.[Key] AND m.Value= d.Value
GROUP BY m.Id
HAVING COUNT(*) = MAX(d.cnt)

The following SQL query produces the result that you require.
SELECT *
FROM #Objects m
WHERE Id IN
(
-- Include objects that match the conditions:
SELECT m.Id
FROM #Metadata m
JOIN #data d ON m.[Key] = d.[Key] AND m.Value = d.Value
-- And discount those where there is other metadata not matching the conditions:
EXCEPT
SELECT m.Id
FROM #Metadata m
JOIN #data d ON m.[Key] = d.[Key] AND m.Value <> d.Value
)
Test schema and data I used:
-- Schema
DECLARE #Objects TABLE (Id int);
DECLARE #Metadata TABLE (Id int, [Key] char(1), Value char(2));
DECLARE #data TABLE ([Key] char(1), Value char(1));
-- Data
INSERT INTO #Metadata VALUES
(1, 'a', 'p'),
(1, 'b', 'q'),
(1, 'c', 'r'),
(2, 'a', 'p'),
(2, 'b', 'p'),
(3, 'c', 'r');
INSERT INTO #Objects VALUES
(1),
(2),
(3),
(4); -- Object with no metadata
INSERT INTO #data VALUES
('a','p'),
('b','q');

Select only distinct values from two columns from a table

If I have a table such as
1 A
1 B
1 A
1 B
2 C
2 C
And I want to select distinct from the two columns so that I would get
1
2
A
B
C
How can I word my query? Is the only way to concatenate the columns and wrap them around a distinct function operator?

You could use a union to create a table of all values from both columns:
select col1 as BothColumns
from YourTable
union
select col2
from YourTable
Unlike union all, union removes duplicates, even if they come from the same side of the union.

SQL Fiddle
Why even distinct in Union, try this :
select cast(id as char(1)) from test
union
select val from test

Please try:
Select Col1 from YourTable
union
Select Col2 from YourTable
UNION removes duplicate records (where all columns in the results are the same), UNION ALL does not.
Please check What is the difference between UNION and UNION ALL
For multiple columns, you can go for UNPIVOT.
SELECT distinct DistValues
FROM
(SELECT Col1, Col2, Col3
FROM YourTable) p
UNPIVOT
(DistValues FOR Dist IN
(Col1, Col2, Col3)
)AS unpvt;

Try this one -
DECLARE #temp TABLE
(
Col1 INT
, Col2 NVARCHAR(50)
)
INSERT INTO #temp (Col1, Col2)
VALUES (1, 'ab5defg'), (2, 'ae4eii')
SELECT disword = (
SELECT DISTINCT dt.ch
FROM (
SELECT ch = SUBSTRING(t.mtxt, n.number + 1, 1)
FROM [master].dbo.spt_values n
CROSS JOIN (
SELECT mtxt = (
SELECT CAST(Col1 AS VARCHAR(10)) + Col2
FROM #temp
FOR XML PATH(''), TYPE).value('.', 'VARCHAR(MAX)'
)
) t
WHERE [type] = N'p'
AND number <= LEN(mtxt) - 1
) dt
FOR XML PATH(''), TYPE).value('.', 'VARCHAR(MAX)'
)
Or try this -
DECLARE #temp TABLE
(
a CHAR(1), b CHAR(1)
)
INSERT INTO #temp (a, b)
VALUES
('1', 'A'), ('1', 'B'), ('1', 'A'),
('1', 'B'), ('2', 'C'), ('2', 'C')
SELECT a
FROM #temp
UNION
SELECT b
FROM #temp

Because what you want select is in different columns, you can use union like below:
select distinct tarCol from
(select distinct column1 as tarCol from table
union
select distinct column2 from table) as tarTab

You can use like this to get multiple distinct column values
(SELECT DISTINCT `enodeb` as res,
"enodeb" as columnname
FROM `raw_metrics`)
UNION
(SELECT DISTINCT `interval` as res,
"interval" as columnname
FROM `raw_metrics`)

How can I improve this SQL query?

I ran into an interesting SQL problem today and while I came up with a solution that works I doubt it's the best or most efficient answer. I defer to the experts here - help me learn something and improve my query! RDBMS is SQL Server 2008 R2, query is part of an SSRS report that will run against about 100,000 rows.
Essentially I have a list of IDs that could have multiple values associated with them, the values being Yes, No, or some other string. For ID x, if any of the values are a Yes, x should be Yes, if they are all No, it should be No, if they contain any other values but yes and no, display that value. I only want to return 1 row per ID, no duplicates.
The simplified version and test case:
DECLARE #tempTable table ( ID int, Val varchar(1) )
INSERT INTO #tempTable ( ID, Val ) VALUES ( 10, 'Y')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 11, 'N')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 11, 'N')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 12, 'Y')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 12, 'Y')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 12, 'Y')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 13, 'N')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 14, 'Y')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 14, 'N')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 15, 'Y')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 16, 'Y')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 17, 'F')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 18, 'P')
SELECT DISTINCT t.ID, COALESCE(t2.Val, t3.Val, t4.Val)
FROM #tempTable t
LEFT JOIN
(
SELECT ID, Val
FROM #tempTable
WHERE Val = 'Y'
) t2 ON t.ID = t2.ID
LEFT JOIN
(
SELECT
ID, Val FROM #tempTable
WHERE Val = 'N'
) t3 ON t.ID = t3.ID
LEFT JOIN
(
SELECT ID, Val
FROM #tempTable
WHERE Val <> 'Y' AND Val <> 'N'
) t4 ON t.ID = t4.ID
Thanks in advance.

Let's answer an easier problem: for each id, get the Val which is last in the alphabet. This will work if Y and N are the only values. And the query is much simpler:
SELECT t.ID, MAX(t.Val) FROM t GROUP BY t.ID;
So, reduce your case to the simple case. Use an enum (if your DB supports it) or break the value codes into another table with a collation column (in this case, you could have 1 for Y, 2 for N, and 999 for all other possible values, and you want the smallest). Then
SELECT ID, c.Val FROM
(SELECT t.ID, MIN(codes.collation) AS mx
FROM t join codes on t.Val = codes.Val GROUP BY t.ID) AS q
JOIN codes c ON mx=c.collation;
Here codes has two columns, Val and Collation.
You can also do this with a CTE type query, as long as you have the Values ordered as you want them. This approach has one join to a small lookup table and should be much, much faster than 3 self-joins.
WITH q AS (SELECT t.id, t.Val, ROW_NUMBER() AS r FROM t JOIN codes ON t.Val=codes.Val
PARTITION BY t.id ORDER BY codes.collation)
SELECT q.id, q.Val WHERE r=1;

I'd change it to this just to make it easier to read:
SELECT DISTINCT t.ID, COALESCE(t2.Val, t3.Val, t4.Val)
FROM #tempTable t
LEFT JOIN #tempTable t2 ON t.ID = t2.ID and t2.Val = 'Y'
LEFT JOIN #tempTable t3 ON t.ID = t3.ID and t3.Val = 'N'
LEFT JOIN #tempTable t4 ON t.ID = t4.ID and t4.Val <> 'Y' AND t4.Val <> 'N'
Gives the same results as your example.
I also looked at the execution plans for both and they looked exactly the same, I doubt you'd see any performance difference.

Try this:
;WITH a AS (
SELECT
ID,
SUM(CASE Val WHEN 'Y' THEN 1 ELSE 0 END) AS y,
SUM(CASE Val WHEN 'N' THEN 0 ELSE 1 END) AS n,
MIN(CASE WHEN Val IN ('Y','N') THEN NULL ELSE Val END) AS first_other
FROM #tempTable
GROUP BY ID
)
SELECT
ID,
CASE WHEN y > 0 THEN 'Y' WHEN n = 0 THEN 'N' ELSE first_other END AS Val
FROM a
If there are any 'Y' values then the sum of y will be greater than 0
If all values are 'N' then the sum of n will be zero
Get the first non 'Y' or 'N' character available if needed
In this case the result can be determined with only one pass through
the table

I'm reading your spec like this:
if any ID is Y then Y
if all IDs are N then N
else display value (other than Y or N)
eliminate rows per (1)
delete from #tempTable
where not Val='Y' and ID in (
select distinct ID
from #tempTable
where Val='Y'
)
select distinct to eliminate multiple N's per (2).
select distinct * from #tempTable
group up various "other" values to get a single row per ID.
SELECT A.Id, AllVals =
SubString(
(SELECT ', ' + B.Val
FROM C as B
WHERE A.Id = B.Id
FOR XML PATH ( '' ) ), 3, 1000)
FROM C as A
GROUP BY Id
Entire runnable query:
declare #tempTable table (ID int, Val char(1))
INSERT INTO #tempTable ( ID, Val ) VALUES ( 10, 'Y')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 11, 'N')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 11, 'N')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 12, 'Y')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 12, 'Y')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 12, 'Y')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 13, 'N')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 14, 'Y')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 14, 'N')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 15, 'Y')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 16, 'Y')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 17, 'F')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 18, 'P')
INSERT INTO #tempTable ( ID, Val ) VALUES ( 18, 'F')
delete from #tempTable
where not Val='Y' and ID in (
select distinct ID
from #tempTable
where Val='Y'
);
WITH C as (select distinct * from #tempTable)
SELECT A.Id, AllVals =
SubString(
(SELECT ', ' + B.Val
FROM C as B
WHERE A.Id = B.Id
FOR XML PATH ( '' ) ), 3, 1000)
FROM C as A
GROUP BY Id
OUTPUT:
Id AllVals
10 Y
11 N
12 Y
13 N
14 Y
15 Y
16 Y
17 F
18 F, P

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

Identifying duplicate GROUPS of data in SQL - sql

Related

How to get previously changed records?

SQL logic for the Vlookup function in excel/ How to do a Vlookup in SQL

Matching multiple key/value pairs in SQL

Select only distinct values from two columns from a table

How can I improve this SQL query?

Categories

Resources