SQL Server : find duplicate record by column value

SQL Server : find duplicate record by column value - sql

I have a dataset like the one below. How can I find the duplicate DataSetID values? For example, 201 and 401 are duplicate records.

Use PIVOT and ROW_Number
For Non Duplicates
FIDDLE DEMO
-- Returns all Tbl rows for the first DateSetID (lowest id) of every distinct
-- (Name, Age, Gender) combination.
WITH Pivoted AS (
    -- Turn the ColumnA keys Name/Age/Gender into columns holding MAX(ColumnB).
    SELECT
        DateSetID,
        [Name],
        [Age],
        [Gender]
    FROM (SELECT * FROM Tbl) AS SourceTable
    PIVOT (
        MAX(ColumnB) FOR ColumnA IN ([Name], [Age], [Gender])
    ) AS PivotTable
),
Ranked AS (
    -- RN = 1 marks the lowest DateSetID within each identical combination.
    SELECT
        DateSetID,
        ROW_NUMBER() OVER (
            PARTITION BY [Name], [Age], [Gender]
            ORDER BY DateSetID
        ) AS RN
    FROM Pivoted
)
SELECT *
FROM Tbl
WHERE DateSetID IN (
    SELECT DateSetID
    FROM Ranked
    WHERE RN = 1
);
For Duplicates alone
FIDDLE DEMO
-- Returns the Tbl rows of repeated data sets only: a DatasetID whose
-- (Name, Age, Gender) combination already appeared on a lower DatasetID (RN > 1).
SELECT T.*
FROM Tbl AS T
INNER JOIN (
    SELECT DatasetID, ColumnA, ColumnB
    FROM (
        -- Pivot each data set into one row and number identical combinations.
        SELECT
            DatasetID,
            [Name],
            [Age],
            [Gender],
            ROW_NUMBER() OVER (
                PARTITION BY [Name], [Age], [Gender]
                ORDER BY DatasetID
            ) AS RN
        FROM (SELECT * FROM Tbl) AS SourceTable
        PIVOT (
            MAX(ColumnB) FOR ColumnA IN ([Name], [Age], [Gender])
        ) AS PivotTable
    ) AS Tmp
    -- Turn the pivoted columns back into (ColumnA, ColumnB) rows.
    UNPIVOT (
        ColumnB FOR ColumnA IN ([Name], [Age], [Gender])
    ) AS UnpivotOp
    WHERE RN > 1
) AS X
    -- DatasetID must be part of the join: matching on ColumnA/ColumnB alone
    -- also returns rows of other data sets that merely share one value.
    ON  T.DatasetID = X.DatasetID
    AND T.ColumnA = X.ColumnA
    AND T.ColumnB = X.ColumnB;

You need to count the pivoted rows over an unbounded window (the default when OVER has no ORDER BY):
-- Returns all Tbl rows of every data set whose (Name, Age, Gender)
-- combination occurs more than once.
WITH Counted AS (
    -- cnt = number of pivoted rows sharing the same combination.
    SELECT
        DatasetID,
        [Name],
        [Age],
        [Gender],
        COUNT(*) OVER (PARTITION BY [Name], [Age], [Gender]) AS cnt
    FROM Tbl
    PIVOT (
        MAX(ColumnB) FOR ColumnA IN ([Name], [Age], [Gender])
    ) AS PivotTable
)
SELECT *
FROM Tbl
WHERE DatasetID IN (
    SELECT DatasetID
    FROM Counted
    WHERE cnt > 1
);
Fiddle

concat the columns, and apply group by.
-- Lists the Datesetid values that own a (columnA, columnB) pair occurring
-- more than once in tableA.
-- The '|' separator keeps different pairs from concatenating to the same
-- string (e.g. 'ab' + 'c' versus 'a' + 'bc').
-- NOTE(review): this flags any data set sharing a single pair with another
-- row, not only data sets that match on every pair -- confirm that is intended.
select distinct Datesetid
from tableA
where concat(columnA, '|', columnB) in (
    select concat(columnA, '|', columnB)
    from tableA
    group by concat(columnA, '|', columnB)
    having count(1) > 1
)
see dbfiddle.

Use the query below to find duplicate values in the table:
-- SELECT DISTINCT only removes repeats; to find them, group on the
-- identifying columns and keep the groups that occur more than once.
SELECT
    FirstName,
    LastName,
    MobileNo,
    COUNT(*) AS DuplicateCount
FROM CUSTOMER
GROUP BY FirstName, LastName, MobileNo
HAVING COUNT(*) > 1;

Related

How to retrieve Count of all records and other records in single query

I have the following query
select name,trip_id from main order by name
I want to retrieve the count of all the records along with all the columns in the table.
For example, if I have 200 rows in the table, I want output like this:
select name,trip_id,count(*) from main
Is it possible in a single query?

Use analytic count:
-- COUNT(*) with an empty OVER () repeats the total row count on every row.
SELECT
    name,
    trip_id,
    COUNT(*) OVER () AS cnt
FROM main
ORDER BY name;

-- Self-contained demo: five equivalent ways to return every row of @Main
-- together with the total row count of @Main.
-- A table variable name must start with @ (DECLARE #Main TABLE does not parse).
DECLARE @Main TABLE
(
    [name]  VARCHAR(50),
    trip_id INT
);

INSERT INTO @Main ([name], trip_id)
VALUES
    ('Jim', 1),
    ('Ian', 2),
    ('Susan', 2);

-- Option 1: window aggregate over the whole result set.
SELECT
    [name],
    trip_id,
    COUNT(*) OVER () AS ct
FROM @Main;

-- Option 2: scalar subquery in the select list.
SELECT
    [name],
    trip_id,
    (
        SELECT COUNT(*)
        FROM @Main
    ) AS ct
FROM @Main;

-- Option 3: CROSS APPLY of a single-row aggregate.
SELECT
    [name],
    trip_id,
    v.ct
FROM @Main
CROSS APPLY
(
    SELECT COUNT(*)
    FROM @Main
) AS v (ct);

-- Option 4: inner join to a single-row aggregate on an always-true condition.
SELECT
    t1.[name],
    t1.trip_id,
    t2.ct
FROM @Main AS t1
INNER JOIN
(
    SELECT COUNT(*) AS ct
    FROM @Main
) AS t2
    ON 1 = 1;

-- Option 5: explicit CROSS JOIN to the single-row aggregate.
SELECT
    t1.[name],
    t1.trip_id,
    t2.ct
FROM @Main AS t1
CROSS JOIN
(
    SELECT COUNT(*) AS ct
    FROM @Main
) AS t2;

Select distinct on 2 columns, but return all columns from SQL Server

Is there a way to select distinct on 2 columns but return all columns?
For example
select distinct name, type
from dbo.Table
but return all columns from dbo.Table?
I found this solution that works for a single column,
-- Keeps, for each [type], only the row with the most recent dateAdded.
WITH ranked AS (
    SELECT
        [name],
        [type],
        [col1],
        [col2],
        [col3],
        [etc],
        [dateAdded],
        [ID],
        ROW_NUMBER() OVER (PARTITION BY type ORDER BY dateAdded DESC) AS rownumber
    FROM [dbo].[Table]
)
SELECT *
FROM ranked
WHERE rownumber = 1;
is it possible to do this for multiple columns?

You can use partition by name and type as below
-- Keeps, for each (name, type) pair, only the row with the most recent dateAdded.
WITH ranked AS (
    SELECT
        [name],
        [type],
        [col1],
        [col2],
        [col3],
        [etc],
        [dateAdded],
        [ID],
        ROW_NUMBER() OVER (
            PARTITION BY name, type
            ORDER BY dateAdded DESC
        ) AS rownumber
    FROM [dbo].[Table]
)
SELECT *
FROM ranked
WHERE rownumber = 1;

Return all rows/columns where multiple columns are non distinct

I have a query where I want to find rows where both ActivityDate and TaskId have multiple entries at the same time:
-- (ActivityDate, taskId) pairs that occur on more than one row.
SELECT ActivityDate, taskId
FROM [DailyTaskHours]
GROUP BY ActivityDate, taskId
HAVING COUNT(*) > 1
The above query appears to work. However, I want all of the columns to be returned, not just the two (ActivityDate, taskId). This doesn't work:
SELECT *
FROM
[DailyTaskHours]
GROUP BY
ActivityDate, taskId
HAVING
COUNT(*) > 1
because many of the columns are not in the group by clause. I don't want any columns to be affected by the HAVING COUNT(*) > 1 other than ActivityDate, taskId.
How do I achieve this?

-- sel: the (ActivityDate, taskId) pairs that occur more than once.
WITH sel AS (
    SELECT
        ActivityDate,
        taskId
    FROM [DailyTaskHours]
    GROUP BY ActivityDate, taskId
    HAVING COUNT(*) > 1
)
-- Return only the table's own columns; a bare SELECT * would also repeat
-- ActivityDate and taskId from sel.
SELECT d.*
FROM [DailyTaskHours] AS d
INNER JOIN sel
    ON  d.ActivityDate = sel.ActivityDate
    AND d.taskId = sel.taskId

-- All columns of every row whose (ActivityDate, taskId) pair occurs more than once.
SELECT t1.*
FROM [DailyTaskHours] AS t1
INNER JOIN (
    -- One row per repeated (ActivityDate, taskId) pair.
    SELECT ActivityDate, taskId
    FROM [DailyTaskHours]
    GROUP BY ActivityDate, taskId
    HAVING COUNT(*) > 1
) AS dup
    ON  t1.ActivityDate = dup.ActivityDate
    AND t1.taskId = dup.taskId

-- Fully functional example.
-- A table variable name must start with @ (DECLARE #table TABLE does not parse).
DECLARE @table TABLE (ActivityDate DATE, TaskID INT);

SET NOCOUNT ON;
INSERT INTO @table (ActivityDate, TaskID)
VALUES
    ('01/01/2013', 1),
    ('01/02/2013', 1),
    ('01/02/2013', 2),
    ('01/03/2013', 1),
    ('01/03/2013', 2),
    ('01/03/2013', 5), -- duplicate date,taskid
    ('01/03/2013', 5); -- duplicate date,taskid
SET NOCOUNT OFF;

-- Return every row whose (ActivityDate, TaskId) pair occurs more than once.
SELECT A.*
FROM @table AS A
INNER JOIN (
    SELECT [ActivityDate], TaskId
    FROM @table
    GROUP BY [ActivityDate], TaskId
    HAVING COUNT(*) > 1
) AS B
    ON  B.[ActivityDate] = A.ActivityDate
    AND B.TaskId = A.TaskId;

Here is the SQL Fiddle that shows the following query in action:
-- All columns of every row whose (ActivityDate, taskId) pair occurs more than once.
-- No DISTINCT: sub holds each pair once, so the join cannot multiply rows, and
-- DISTINCT would collapse rows that are identical in every column -- the very
-- duplicates this query is meant to show.
SELECT m.*
FROM (
    SELECT s.ActivityDate, s.taskId
    FROM DailyTaskHours AS s
    GROUP BY s.ActivityDate, s.taskId
    HAVING COUNT(*) > 1
) AS sub
INNER JOIN DailyTaskHours AS m
    ON  m.taskId = sub.taskId
    AND m.ActivityDate = sub.ActivityDate

SQL cross apply

I have a SQL table which contains audit information:
GroupId AuditDate ID FirstName LastName
1 01/06/2011 123 Michael Jackson
1 01/09/2010 123 M J
1 01/06/2009 123 Mike J
and I am trying to show the differences between the audit records:
GroupId AuditDate ID Attribute From To
1 01/06/2011 123 FirstName M Michael
1 01/06/2011 123 LastName J Jackson
1 01/09/2010 123 FirstName Mike M
1 01/06/2009 123 FirstName NULL Mike
1 01/06/2009 123 LastName NULL J
I am using the following SQL query:
-- Pairs each audit row with the previous one (by AuditDate within GroupId)
-- and emits one output row per attribute whose value changed.
-- @ID is expected to be declared or supplied by the surrounding batch.
WITH result AS (
    SELECT
        [Current].Id,
        [Current].GroupId,
        [Current].AuditDate,
        [Current].FirstName,
        [Current].LastName,
        Previous.FirstName AS PFirstName,
        Previous.LastName AS PLastName
    FROM (
        SELECT
            *,
            ROW_NUMBER() OVER (PARTITION BY GroupId ORDER BY AuditDate ASC) AS RowNumber
        FROM AuditTable
        WHERE Id = @ID
    ) AS [Current]
    LEFT JOIN (
        SELECT
            *,
            ROW_NUMBER() OVER (PARTITION BY GroupId ORDER BY AuditDate ASC) AS RowNumber
        FROM AuditTable
        WHERE Id = @ID
    ) AS [Previous]
        -- Row n is matched with row n - 1; the first row has no match (NULLs).
        ON [Current].RowNumber = [Previous].RowNumber + 1
)
SELECT
    r.Id,
    r.GroupId,
    r.AuditDate,
    x.Attribute,
    x.[From],
    x.[To]
FROM result AS r
CROSS APPLY (
    -- Unpivot the two audited columns into (Attribute, To, From) rows.
    VALUES
        ('FirstName', r.FirstName, r.PFirstName),
        ('LastName', r.LastName, r.PLastName)
) AS x (Attribute, [To], [From])
-- Keep only attributes that changed; NULL is compared as an empty string.
WHERE ISNULL(x.[From], '') <> ISNULL(x.[To], '')
ORDER BY r.AuditDate ASC;
Is it possible to merge two select queries to improve the performance?

Try this query
-- Numbers the audit rows once, then self-joins the CTE so each row is paired
-- with its predecessor; emits one row per attribute whose value changed.
-- @ID is expected to be declared or supplied by the surrounding batch.
WITH result AS (
    SELECT
        Id,
        GroupId,
        AuditDate,
        FirstName,
        LastName,
        ROW_NUMBER() OVER (PARTITION BY GroupId ORDER BY AuditDate ASC) AS RowNumber
    FROM AuditTable
    WHERE Id = @ID
)
SELECT
    r.Id,
    r.GroupId,
    r.AuditDate,
    x.Attribute,
    x.[From],
    x.[To]
FROM result AS r
LEFT JOIN result AS r2
    -- Row numbers restart for every GroupId, so the predecessor must come
    -- from the same group.
    ON  r.GroupId = r2.GroupId
    AND r.RowNumber = r2.RowNumber + 1
CROSS APPLY (
    VALUES
        ('FirstName', r.FirstName, r2.FirstName),
        ('LastName', r.LastName, r2.LastName)
) AS x (Attribute, [To], [From])
-- Keep only attributes that changed; NULL is compared as an empty string.
WHERE ISNULL(x.[From], '') <> ISNULL(x.[To], '')
ORDER BY r.AuditDate ASC;
Demo on SQLFiddle

You can eliminate both subqueries entirely by using lag():
-- LAG() reads the previous row's value (by AuditDate within GroupId) directly,
-- so no self join is needed.
-- @ID is expected to be declared or supplied by the surrounding batch.
WITH result AS (
    SELECT
        Id,
        GroupId,
        AuditDate,
        FirstName,
        LastName,
        LAG(FirstName) OVER (PARTITION BY GroupId ORDER BY AuditDate ASC) AS PFirstName,
        LAG(LastName) OVER (PARTITION BY GroupId ORDER BY AuditDate ASC) AS PLastName
    FROM AuditTable
    WHERE Id = @ID
)
...
Here is the relevant documentation.
Update: However, this is only available in SQL Server 2012, unfortunately. If you have an earlier version, you will need some sort of self join.
If you can't use lag(), you should at least be able to reduce your code from 3 queries to 2: include the row number in your first select statement, and left join one subquery to it rather than having two subqueries. I'm not sure whether this way or Chris Moutray's way would be faster.
-- A row number cannot be referenced in the ON clause of the SELECT that
-- computes it, so it is produced in its own CTE first and that CTE is then
-- joined to itself.
-- @ID is expected to be declared or supplied by the surrounding batch.
WITH numbered AS (
    SELECT
        Id,
        GroupId,
        AuditDate,
        FirstName,
        LastName,
        ROW_NUMBER() OVER (PARTITION BY GroupId ORDER BY AuditDate ASC) AS RowNumber
    FROM AuditTable
    WHERE Id = @ID
),
result AS (
    SELECT
        [Current].RowNumber,
        [Current].Id,
        [Current].GroupId,
        [Current].AuditDate,
        [Current].FirstName,
        [Current].LastName,
        [Previous].FirstName AS PFirstName,
        [Previous].LastName AS PLastName
    FROM numbered AS [Current]
    LEFT JOIN numbered AS [Previous]
        -- Row numbers restart for every GroupId, so match within the group.
        ON  [Current].GroupId = [Previous].GroupId
        AND [Current].RowNumber = [Previous].RowNumber + 1
)

You can use LAG in SQL Server 2012. I've used UNION ALL here to unpivot columns into rows.
Depending on how you filter and what your group level is, add/modify the PARTITION BY
-- Self-contained demo (LAG needs SQL Server 2012 or later).
-- A table variable name must start with @ (DECLARE #foo TABLE does not parse).
DECLARE @foo TABLE (
    GroupId   tinyint,
    AuditDate date,
    ID        tinyint,
    FirstName varchar(100),
    LastName  varchar(100)
);

INSERT INTO @foo (GroupId, AuditDate, ID, FirstName, LastName)
VALUES
    (1, '20110601', 123, 'Michael', 'Jackson'),
    (1, '20100901', 123, 'M', 'J'),
    (1, '20090601', 123, 'Mike', 'J');

-- One branch per audited column; UNION ALL stacks them into
-- (Attribute, From, To) rows. Attribute is returned as well so each
-- change can be told apart in the output.
SELECT
    X.GroupId,
    X.AuditDate,
    X.ID,
    X.Attribute,
    X.[From],
    X.[To]
FROM (
    SELECT
        F.GroupId,
        F.AuditDate,
        F.ID,
        'FirstName' AS Attribute,
        LAG(F.FirstName) OVER (/*PARTITION BY GroupId, ID*/ ORDER BY AuditDate) AS [From],
        F.FirstName AS [To]
    FROM @foo AS F
    UNION ALL
    SELECT
        F.GroupId,
        F.AuditDate,
        F.ID,
        'LastName' AS Attribute,
        LAG(F.LastName) OVER (/*PARTITION BY GroupId, ID*/ ORDER BY AuditDate) AS [From],
        F.LastName AS [To]
    FROM @foo AS F
) AS X
-- Keep only attributes that changed; NULL is compared as an empty string.
WHERE ISNULL(X.[From], '') <> ISNULL(X.[To], '')
ORDER BY
    X.AuditDate DESC,
    X.Attribute

SELECT COUNT(DISTINCT [name]) from several tables

I can perform the following SQL Server selection of distinct (or non-repeating names) from a column in one table like so:
SELECT COUNT(DISTINCT [Name]) FROM [MyTable]
But what if I have more than one table (all these tables contain the name field called [Name]) and I need to know the count of non-repeating names in two or more tables.
If I run something like this:
SELECT COUNT(DISTINCT [Name]) FROM [MyTable1], [MyTable2], [MyTable3]
I get an error, "Ambiguous column name 'Name'".
PS. All three tables [MyTable1], [MyTable2], [MyTable3] are a product of a previous selection.

After the clarification, use:
-- Occurrences of each name across the three tables
-- (UNION ALL keeps repeated names so they can be counted).
WITH combined AS (
    SELECT [name]
    FROM [MyTable]
    UNION ALL
    SELECT [name]
    FROM [MyTable2]
    UNION ALL
    SELECT [name]
    FROM [MyTable3]
)
SELECT combined.name, COUNT(combined.[name])
FROM combined
GROUP BY combined.name
If I understand correctly, use:
-- Number of distinct names across the three tables.
-- No GROUP BY: grouping by the same column that is counted DISTINCT would
-- return 1 for every name instead of a single overall count.
SELECT COUNT(DISTINCT x.[name])
FROM (
    SELECT [name]
    FROM [MyTable]
    UNION ALL
    SELECT [name]
    FROM [MyTable2]
    UNION ALL
    SELECT [name]
    FROM [MyTable3]
) AS x
UNION will remove duplicates; UNION ALL will not, and is faster for it.

EDIT: Had to change after seeing recent comment.
Does this give you what you want? This gives a count for each person after combining the rows from all tables.
-- Row count per name after stacking the rows of all three tables.
WITH [TheNames] AS (
    SELECT [Name] FROM [MyTable1]
    UNION ALL
    SELECT [Name] FROM [MyTable2]
    UNION ALL
    SELECT [Name] FROM [MyTable3]
)
SELECT
    [NAME],
    COUNT(*) AS TheCount
FROM [TheNames]
GROUP BY [NAME]

Here's another way:
-- Counts each name inside every table first, then adds the per-table counts.
WITH per_table AS (
    SELECT [name], COUNT(*) AS cnt
    FROM [MyTable]
    GROUP BY [name]
    UNION ALL
    SELECT [name], COUNT(*) AS cnt
    FROM [MyTable2]
    GROUP BY [name]
    UNION ALL
    SELECT [name], COUNT(*) AS cnt
    FROM [MyTable3]
    GROUP BY [name]
)
SELECT per_table.name, SUM(per_table.cnt)
FROM per_table
GROUP BY per_table.name

In case you have different amounts of columns per table, like:
table1 has 3 columns,
table2 has 2 columns,
table3 has 1 column
And you want to count the number of distinct values in differently named columns, what was useful to me in AthenaSQL was to use CROSS JOIN: since each subquery outputs only one row, the join produces just one combination:
-- Each subquery aggregates to a single row, so the CROSS JOINs produce one
-- combined row holding all the distinct counts.
SELECT *
FROM (
    SELECT
        COUNT(DISTINCT name1) AS amt_name1,
        COUNT(DISTINCT name2) AS amt_name2,
        COUNT(DISTINCT name3) AS amt_name3  -- no trailing comma before FROM
    FROM table1
) t1
CROSS JOIN (
    SELECT
        COUNT(DISTINCT name4) AS amt_name4,
        COUNT(DISTINCT name5) AS amt_name5,
        -- t3 has one row; MAX() just carries its value through the aggregate.
        MAX(t3.amt_name6) AS amt_name6
    FROM table2
    CROSS JOIN (
        SELECT COUNT(DISTINCT name6) AS amt_name6
        FROM table3
    ) t3
) t2
Would return a table with one row and their counts:
amt_name1 | amt_name2 | amt_name3 | amt_name4 | amt_name5 | amt_name6
4123 | 675 | 564 | 2346 | 18667 | 74567

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

SQL Server : find duplicate record by column value - sql

I have a value like below dataset. Now how I can find the duplicate DataSetID like: 201 & 401 is duplicate record.

concat the columns, and apply group by. select distinct Datesetid from tableA where concat(columnA, columnB) in ( select concat(columnA, columnB) from tableA group by concat(columnA, columnB) having count(1) > 1) see dbfiddle.

write give below query & find duplicate value in table SELECT DISTINCT FirstName, LastName, MobileNo FROM CUSTOMER;

Related

How to retrieve Count of all records and other records in single query

Select distinct on 2 columns, but return all columns from SQL Server

Return all rows/columns where multiple columns are non distinct

SQL cross apply

SELECT COUNT(DISTINCT [name]) from several tables

Categories

Resources