Remove duplicates with fewer null values - SQL

I have a table of employees which contains about 25 columns. Right now there are a lot of duplicates, and I would like to get rid of some of them.
First, I want to find the duplicates by looking for multiple records that have the same values in first name, last name, employee number, company number and status.
SELECT
firstname,lastname,employeenumber, companynumber, statusflag
FROM
employeemaster
GROUP BY
firstname,lastname,employeenumber,companynumber, statusflag
HAVING
(COUNT(*) > 1)
This gives me the duplicates, but my goal is to find and keep the best single record and delete the others. The "best single record" is the one with the fewest NULL values in all of the other columns. How can I do this?
I am using Microsoft SQL Server 2012 Management Studio.
EXAMPLE (from a screenshot): the rows marked red should be deleted, the row marked green kept.
NOTE: There are a lot more columns in the table than the example shows.

You can use the sys.columns table to get a list of columns and build a dynamic query. This query returns a 'KeepThese' ranking for every record; the rows with KeepThese = 1 are the ones to keep based on your criteria.
-- insert test data
create table EmployeeMaster
(
Record int identity(1,1),
FirstName varchar(50),
LastName varchar(50),
EmployeeNumber int,
CompanyNumber int,
StatusFlag int,
UserName varchar(50),
Branch varchar(50)
);
insert into EmployeeMaster
(
FirstName,
LastName,
EmployeeNumber,
CompanyNumber,
StatusFlag,
UserName,
Branch
)
values
('Jake','Jones',1234,1,1,'JJONES','PHX'),
('Jake','Jones',1234,1,1,NULL,'PHX'),
('Jake','Jones',1234,1,1,NULL,NULL),
('Jane','Jones',5678,1,1,'JJONES2',NULL);
-- get records with most non-null values with dynamic sys.column query
declare @sql varchar(max)
select @sql = '
select e.*,
row_number() over(partition by
e.FirstName,
e.LastName,
e.EmployeeNumber,
e.CompanyNumber,
e.StatusFlag
order by n.NonNullCnt desc) as KeepThese
from EmployeeMaster e
cross apply (select count(n.value) as NonNullCnt from (select ' +
replace((
select 'cast(' + c.name + ' as varchar(50)) as value union all select '
from sys.columns c
where c.object_id = t.object_id
for xml path('')
) + '#',' union all select #','') + ')n)n'
from sys.tables t
where t.name = 'EmployeeMaster'
exec(@sql)

Try this.
;WITH cte
AS (SELECT Row_number()
OVER(
partition BY firstname, lastname, employeenumber, companynumber, statusflag
ORDER BY (SELECT NULL)) rn,
firstname,
lastname,
employeenumber,
companynumber,
statusflag,
username,
branch
FROM employeemaster),
cte1
AS (SELECT a.firstname,
a.lastname,
a.employeenumber,
a.companynumber,
a.statusflag,
Row_number()
OVER(
partition BY a.firstname, a.lastname, a.employeenumber, a.companynumber, a.statusflag
ORDER BY (CASE WHEN a.username IS NULL THEN 1 ELSE 0 END +CASE WHEN a.branch IS NULL THEN 1 ELSE 0 END) )rn
-- add the remaining columns in case statement
FROM cte a
JOIN employeemaster b
ON a.firstname = b.firstname
AND a.lastname = b.lastname
AND a.employeenumber = b.employeenumber
AND a.companynumber = b.companynumber
AND a.statusflag = b.statusflag)
SELECT *
FROM cte1
WHERE rn = 1
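Since your goal is to delete the extra rows rather than just select the keepers, the same ROW_NUMBER ranking can drive a DELETE directly through a CTE. A minimal sketch, assuming username and branch are the only other columns being counted (extend the CASE list for the rest of your 25 columns):
;WITH ranked AS (
    SELECT *,
           ROW_NUMBER() OVER (
               PARTITION BY firstname, lastname, employeenumber, companynumber, statusflag
               ORDER BY (CASE WHEN username IS NULL THEN 1 ELSE 0 END
                       + CASE WHEN branch IS NULL THEN 1 ELSE 0 END)) AS rn
    FROM employeemaster
)
DELETE FROM ranked   -- deleting through the CTE removes the underlying rows
WHERE rn > 1;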

I tested with MySQL and used string concatenation with NULL to find the best record: LENGTH(NULL || 'data') yields NULL, so a length is only produced when none of the concatenated columns is NULL. Maybe this is not perfect.
create table EmployeeMaster
(
Record int auto_increment,
FirstName varchar(50),
LastName varchar(50),
EmployeeNumber int,
CompanyNumber int,
StatusFlag int,
UserName varchar(50),
Branch varchar(50),
PRIMARY KEY(record)
);
INSERT INTO EmployeeMaster
(
FirstName, LastName, EmployeeNumber, CompanyNumber, StatusFlag, UserName, Branch
) VALUES ('Jake', 'Jones', 1234, 1, 1, 'JJONES', 'PHX'), ('Jake', 'Jones', 1234, 1, 1, NULL, 'PHX'), ('Jake', 'Jones', 1234, 1, 1, NULL, NULL), ('Jane', 'Jones', 5678, 1, 1, 'JJONES2', NULL);
My query idea looks like this:
SELECT e.*
FROM employeemaster e
JOIN ( SELECT firstname,
              lastname,
              employeenumber,
              companynumber,
              statusflag,
              MAX( LENGTH ( username || branch ) ) data_quality
       FROM employeemaster
       GROUP BY firstname, lastname, employeenumber, companynumber, statusflag
       HAVING count(*) > 1
     ) g
  ON g.firstname = e.firstname
 AND g.lastname = e.lastname
 AND g.employeenumber = e.employeenumber
 AND g.companynumber = e.companynumber
 AND g.statusflag = e.statusflag
 AND LENGTH ( e.username || e.branch ) = g.data_quality
-- joining on the grouping columns ensures rows are only compared within their own duplicate group


Convert Table to Specific Column Wise

I have a table like this. How can I convert it to the format below?
DECLARE @A TABLE (KeyValue INT, Name VARCHAR(50), Value VARCHAR(512))
INSERT INTO @A
VALUES (0, 'AccountID', '192507'), (0, 'member_id', '999159'),
(0, 'firstname', 'Test1'), (0, 'lastname', 'Last1'),
(1, 'AccountID', '192508'), (1, 'member_id', '999160'),
(1, 'firstname', 'Test2'), (1, 'lastname', 'Last2')
SELECT * FROM @A
I have table rows for this model:
KeyValue Name Value
-----------------------------------
0 AccountID 192507
0 member_id 999159
0 firstname Test1
0 lastname Last1
1 AccountID 192508
1 member_id 999160
1 firstname Test2
1 lastname Last2
My expected output is:
AccountID member_id firstname lastname
--------------------------------------------
192507 999159 Test1 Last1
192508 999160 Test2 Last2
I tried this code, but it didn't work:
select *
from
(
select Name,value
from @A
) d
pivot
(
MAX(value)
for Name in (AccountID,member_id,firstname,lastname)
) piv;
Try the logic below:
DEMO HERE
SELECT
MAX(CASE WHEN Name = 'AccountID' THEN Value ELSE NULL END) AccountID,
MAX(CASE WHEN Name = 'member_id' THEN Value ELSE NULL END) member_id ,
MAX(CASE WHEN Name = 'firstname' THEN Value ELSE NULL END) firstname ,
MAX(CASE WHEN Name = 'lastname' THEN Value ELSE NULL END) lastname
FROM @A
GROUP BY KeyValue
You can get the desired result by using PIVOT. In your query you just need to select all the columns, like below.
SELECT AccountID, member_id, firstname, lastname
FROM
(
select * from @A
) d
PIVOT
(
MAX(value)
FOR Name IN (AccountID, member_id, firstname, lastname)
) piv;
You can run the test here.
In the derived table, you should select all the needed columns, like this:
select AccountID, member_id, firstname, lastname
from
(
select * from @A -- instead of `select Name,value`
) d
pivot
(
MAX(value)
for Name in (AccountID,member_id,firstname,lastname)
) piv;
Result here
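If the list of Name values is not fixed, the PIVOT column list can also be built dynamically. A sketch, assuming the rows live in a temp table #A rather than a table variable, because dynamic SQL cannot see a table variable declared in the calling batch:
DECLARE @cols nvarchar(max), @sql nvarchar(max);
-- build a comma-separated, quoted column list from the distinct Name values
SELECT @cols = STUFF((
    SELECT DISTINCT ',' + QUOTENAME(Name)
    FROM #A
    FOR XML PATH('')), 1, 1, '');
SET @sql = N'SELECT ' + @cols + N'
FROM (SELECT KeyValue, Name, Value FROM #A) d
PIVOT (MAX(Value) FOR Name IN (' + @cols + N')) piv;';
EXEC sp_executesql @sql;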

Looping through groups of records

SQL Server 2014. I have a table with a number of rows, for example 15: 5 have a groupid of 736881 and 10 have a groupid of 3084235. What I want to do is process each group of records in turn and load the results into a table.
I have written the code to do this, but I think I am setting the loop counter incorrectly, as I keep getting the records with groupid 736881 loaded twice.
I can't currently post the test data because it contains personal information, but if the mistake is not obvious I will try to create some dummy data.
SELECT @LoopCounter = min(rowfilter) , @maxrowfilter = max(rowfilter)
FROM peops6
WHILE ( @LoopCounter IS NOT NULL
AND @LoopCounter <= @maxrowfilter)
begin
declare @customer_dist as Table (
[id] [int] NOT NULL,
[First_Name] [varchar](50) NULL,
[Last_Name] [varchar](50) NULL,
[DoB] [date] NULL,
[post_code] [varchar](50) NULL,
[mobile] [varchar](50) NULL,
[Email] [varchar](100) NULL );
INSERT INTO @customer_dist (id, First_Name, Last_Name, DoB, post_code, mobile, Email)
select id, first_name, last_name, dob, postcode, mobile_phone, email from peops6 where rowfilter = @LoopCounter
insert into results
SELECT result.* ,
[dbo].GetPercentageOfTwoStringMatching(result.DoB, d.DoB) [DOB%match] ,
[dbo].GetPercentageOfTwoStringMatching(result.post_code, d.post_code) [post_code%match] ,
[dbo].GetPercentageOfTwoStringMatching(result.mobile, d.mobile) [mobile%match] ,
[dbo].GetPercentageOfTwoStringMatching(result.Email, d.Email) [email%match]
FROM ( SELECT ( SELECT MIN(id)
FROM @customer_dist AS sq
WHERE sq.First_Name = cd.First_Name
AND sq.Last_Name = cd.Last_Name
AND ( sq.DoB = cd.DoB
OR sq.mobile = cd.mobile
OR sq.Email = cd.Email
OR sq.post_code = cd.post_code )) nid ,
*
FROM @customer_dist AS cd ) AS result
INNER JOIN @customer_dist d ON result.nid = d.id order by 1, 2 asc;
SELECT @LoopCounter = min(rowfilter) FROM peops6
WHERE rowfilter > @LoopCounter
end
You need to clear out your table variable (@customer_dist) at the end of the loop (a table variable cannot be truncated, so use DELETE):
....
-- Add this
DELETE FROM @customer_dist
SELECT @LoopCounter = min(rowfilter) FROM peops6
WHERE rowfilter > @LoopCounter
end
See: https://social.msdn.microsoft.com/Forums/sqlserver/en-US/42ef20dc-7ad8-44f7-b676-a4596fc0d593/declaring-a-table-variable-inside-a-loop-does-not-delete-the-previous-data?forum=transactsql
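For reference, this is the overall shape of such a loop, shown as a self-contained sketch with a dummy table variable rather than your real peops6 data:
DECLARE @demo TABLE (id int);
DECLARE @LoopCounter int = 1;
WHILE @LoopCounter <= 3
BEGIN
    INSERT INTO @demo (id) VALUES (@LoopCounter);
    -- ... process the rows currently in @demo here ...
    DELETE FROM @demo;   -- without this, rows accumulate across iterations
    SET @LoopCounter = @LoopCounter + 1;
END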
I am not sure you need a loop (or a SQL cursor) to accomplish this task.
Please check the following SQL statement, where I used multiple CTE expressions:
with customer_dist as (
select
rowfilter,
id, first_name, last_name, dob, postcode, mobile_phone, email
from peops6
), result as (
SELECT
(
SELECT
MIN(id)
FROM customer_dist AS sq
WHERE
sq.rowfilter = cd.rowfilter
AND sq.First_Name = cd.First_Name
AND sq.Last_Name = cd.Last_Name
AND (sq.DoB = cd.DoB OR sq.mobile_phone = cd.mobile_phone OR sq.Email = cd.Email OR sq.postcode = cd.postcode )
) nid,
*
FROM customer_dist AS cd
)
SELECT
result.* ,
[dbo].edit_distance(result.DoB, d.DoB) [DOB%match] ,
[dbo].edit_distance(result.postcode, d.postcode) [post_code%match] ,
[dbo].edit_distance(result.mobile_phone, d.mobile_phone) [mobile%match] ,
[dbo].edit_distance(result.Email, d.Email) [email%match]
FROM result
INNER JOIN customer_dist d
ON result.nid = d.id
order by 1, 2 asc;
Please note, I used my own fuzzy string matching function (a Levenshtein distance algorithm) in this sample instead of your function.
You only need to add the INSERT statement just before the last SELECT.
Hope it is useful.
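For completeness: if the [dbo].GetPercentageOfTwoStringMatching function from the question is not available, a deliberately naive stand-in with the same name and two-argument shape could look like the sketch below. It only returns the percentage of character positions that match, padded to the longer string; it is a placeholder to make the examples runnable, not a real similarity metric:
CREATE FUNCTION dbo.GetPercentageOfTwoStringMatching
(
    @s1 varchar(100),
    @s2 varchar(100)
)
RETURNS int
AS
BEGIN
    -- compare up to the length of the longer input
    DECLARE @len int = (SELECT MAX(v) FROM (VALUES (LEN(ISNULL(@s1, ''))), (LEN(ISNULL(@s2, '')))) AS x(v));
    IF @len = 0 RETURN 100;   -- two empty/NULL strings: treat as a full match

    DECLARE @i int = 1, @hits int = 0;
    WHILE @i <= @len
    BEGIN
        IF SUBSTRING(ISNULL(@s1, ''), @i, 1) = SUBSTRING(ISNULL(@s2, ''), @i, 1)
            SET @hits = @hits + 1;
        SET @i = @i + 1;
    END
    RETURN (100 * @hits) / @len;
END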

dynamically select column name that changed

I have a table as shown below.
ID NAME ADDRESS CITY ROLE Date_Modified
1 Tom something austin manager X
2 Tom nothing austin principal Y
3 Tom anything dallas VP Z
How do I write a query to select the column names that have changed between entries 1, 2 and 3? I am currently building a report that needs to identify changes. This is what I have so far and need to work with.
I need to be able to detect this via a stored proc and produce the output below.
Id ColumnName DateChanged
2 Address Y
2 Role Y
3 Address Z
3 Role Z
If I understood your question correctly, what you need is to detect changes from one row to another and unpivot the data. Usage of LAG requires SQL Server 2012 or later.
;with cte as (
-- LAG for id is used to skip first row from selection
select id, LAG(id, 1) OVER (ORDER BY id) AS OldId,
address, LAG(address, 1) OVER (ORDER BY id) AS OldAddress,
role, LAG(role, 1) OVER (ORDER BY id) AS OldRole,
Date_Modified
from audit_data
)
SELECT id, ColName, data_col, Date_Modified
FROM
(
select id, address, role, Date_Modified
from cte
-- detect any change in monitored data
where ((OldAddress IS NULL OR address <> OldAddress)
OR (OldRole IS NULL OR role <> OldRole))
AND OldId IS NOT NULL
) AS cp
-- unpivot address and role into data_col column
UNPIVOT
(
data_col FOR ColName IN (address, role)
) AS up;
Data used for setup:
-- drop table audit_data
create table audit_data (
id int,
name VARCHAR(100),
address VARCHAR(100),
city varchar(100),
role VARCHAR(100),
Date_Modified DATETIME2
)
insert into audit_data values (1, 'Tom', 'something', 'austin', 'manager', '20150103'),
(2, 'Tom', 'nothing', 'austin', 'principal', '20150205'),
(3, 'Tom', 'anything', 'dallas', 'VP', '20150314')
go
[Edit] SQL 2008R2 version:
;with ad_cte as (
select id, address, role, Date_Modified, ROW_NUMBER() OVER (ORDER BY id) RowNo
from audit_data
),
cte as (
select ad.id,
ad.address, ad_old.address AS OldAddress,
ad.role, ad_old.role AS OldRole,
ad.Date_Modified
from ad_cte ad
join ad_cte ad_old on ad_old.RowNo + 1 = ad.RowNo
)
SELECT id, ColName, data_col, Date_Modified
FROM
(
select id, address, role, Date_Modified
from cte
-- detect any change in monitored data
where ((OldAddress IS NULL OR address <> OldAddress)
OR (OldRole IS NULL OR role <> OldRole))
-- this should be changed for generality
AND cte.id > 1
) AS cp
-- unpivot address and role into data_col column
UNPIVOT
(
data_col FOR ColName IN (address, role)
) AS up;
This is very similar to Alexei's answer:
CREATE TABLE #temp( ID INT IDENTITY(1, 1),
NAME VARCHAR(30),
ADDRESS VARCHAR(30),
CITY VARCHAR(30),
ROLE VARCHAR(30),
Date_Modified DATETIME );
INSERT INTO #temp
SELECT 'Tom',
'something',
'austin',
'manager',
DATEADD(day, -3, GETDATE())
UNION
SELECT 'Tom',
'nothing',
'austin',
'principal',
DATEADD(day, -2, GETDATE())
UNION
SELECT 'Tom',
'anything',
'dallas',
'VP',
DATEADD(day, -1, GETDATE());
INSERT INTO #temp
SELECT 'Jon',
'something',
'san antonio',
'assistant manager',
DATEADD(day, -3, GETDATE())
UNION
SELECT 'Jon',
'something',
'austin',
'assistant manager',
DATEADD(day, -2, GETDATE())
UNION
SELECT 'Jon',
'anything',
'dallas',
'manager',
DATEADD(day, -1, GETDATE());
SELECT id,
ColName,
Date_Modified
FROM(
SELECT DISTINCT B.ID,
B.Name,
CASE
WHEN A.ADDRESS <> B.ADDRESS
THEN B.ADDRESS
END AS ADDRESS,
CASE
WHEN A.CITY <> B.CITY
THEN B.CITY
END AS CITY,
CASE
WHEN A.ROLE <> B.ROLE
THEN B.ROLE
END AS ROLE,
B.Date_Modified
FROM(
SELECT *,
ROW_NUMBER() OVER(PARTITION BY NAME ORDER BY Date_Modified DESC) AS ROWNUM
FROM #temp ) AS A
INNER JOIN(
SELECT *,
ROW_NUMBER() OVER(PARTITION BY NAME ORDER BY Date_Modified DESC) AS ROWNUM
FROM #temp ) AS B ON A.NAME = B.NAME
AND CHECKSUM(A.NAME, A.ADDRESS, A.CITY, A.ROLE) <> CHECKSUM(B.NAME, B.ADDRESS, B.CITY, B.ROLE)
AND A.ROWNUM = B.ROWNUM - 1 ) AS cp
UNPIVOT( data FOR ColName IN( address,
role )) AS up;
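One caveat on this approach: CHECKSUM can return the same value for different rows (collisions), and direct comparisons like A.ADDRESS <> B.ADDRESS evaluate to unknown when either side is NULL. If either is a concern, INTERSECT gives a NULL-safe way to compare whole rows; a small self-contained illustration of the idea, using throwaway tables rather than the ones above:
-- INTERSECT returns a row only when every column matches, treating NULL = NULL
-- as a match, so EXISTS(... INTERSECT ...) is a NULL-safe test that two rows are identical
DECLARE @x TABLE (address varchar(30), role varchar(30));
DECLARE @y TABLE (address varchar(30), role varchar(30));
INSERT INTO @x VALUES ('anything', NULL);
INSERT INTO @y VALUES ('anything', NULL);
SELECT CASE WHEN EXISTS (SELECT address, role FROM @x INTERSECT SELECT address, role FROM @y)
            THEN 'rows match' ELSE 'rows differ' END AS comparison;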

Is it possible to simplify the SQL query used to produce the checksum-based difference between a table and a given TVP?

I need to write many entities into the database. I want to optimize it by:
Issuing a preflight request to compute the difference between the current data in the table and the new data in the process memory.
Updating, deleting, or inserting only the relevant records.
All the data is checksummed, so I am only going to compare the checksums.
Here is my preflight request:
;WITH src AS (
SELECT cs.AdmClientSiteId, src.Id ClientId, Checksum, AuxId
FROM @src src
JOIN AdmClientSite cs ON cs.AdmClientMasterId = src.Id
WHERE cs.AdmSiteId = @AdmSiteId
), dst AS (
SELECT dst.AdmClientSiteId, cs.AdmClientMasterId ClientId, Checksum, LegalAuxId AuxId
FROM AdmCustomerInfoLegal dst
JOIN AdmClientSite cs ON cs.AdmClientSiteId = dst.AdmClientSiteId
WHERE cs.AdmSiteId = @AdmSiteId
)
SELECT ISNULL(src.AdmClientSiteId, dst.AdmClientSiteId) AdmClientSiteId, ISNULL(src.ClientId, dst.ClientId) ClientId, ISNULL(src.AuxId, dst.AuxId) AuxId,
CASE
WHEN src.Checksum IS NULL THEN 0 -- DBAction.DELETE
WHEN dst.Checksum IS NOT NULL THEN 1 -- DBAction.UPDATE
ELSE 2 -- DBAction.INSERT
END Action
FROM src
FULL JOIN dst ON src.AdmClientSiteId = dst.AdmClientSiteId AND src.AuxId = dst.AuxId
WHERE src.Checksum IS NULL OR dst.Checksum IS NULL OR src.Checksum <> dst.Checksum
ORDER BY Action, ClientId
In this code:
@src is a TVP
AdmCustomerInfoLegal is the table to be updated
The schema of #src is slightly different from that of AdmCustomerInfoLegal.
My question - can it be simplified/improved?
Yes, you can use a MERGE statement for this.
I don't have much time, so please check this example and modify it at your end.
create table temptable (id int, firstname varchar(50), lastname varchar(50), email varchar(50), homephone varchar(50))
insert into temptable values
(1,'aaa' , 'bbb', 'xxx@yyy.com', 1234444),
(2,'aaa' , 'bbb', null, null),
(3,'ccc' , 'ddd', 'abc@ddey.com', null),
(4,'ccc' , 'ddd', null, 34343322 )
select * from temptable
;with cte as
(
select firstname, lastname
,(select top 1 id from temptable b where b.firstname = a.firstname and b.lastname = a.lastname and ( b.email is not null or b.homephone is not null)) tid
,(select top 1 email from temptable b where b.firstname = a.firstname and b.lastname = a.lastname and b.email is not null ) email
,(select top 1 homephone from temptable b where b.firstname = a.firstname and b.lastname = a.lastname and b.homephone is not null ) homephone
from temptable a
group by firstname , lastname
)
--select * from cte
merge temptable as a
using cte as b
on ( a.id = b.tid )
when matched
then
update set a.email = b.email , a.homephone = b.homephone
when not matched by source then
delete ;
select * from temptable
drop table temptable
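For your preflight scenario specifically, the three Action values map directly onto MERGE branches. A minimal, self-contained sketch with throwaway tables rather than your real @src/AdmCustomerInfoLegal schema:
-- demo source and target keyed by Id, each row carrying a checksum
DECLARE @target TABLE (Id int PRIMARY KEY, Checksum int);
DECLARE @source TABLE (Id int PRIMARY KEY, Checksum int);
INSERT INTO @target VALUES (1, 100), (2, 200), (3, 300);
INSERT INTO @source VALUES (1, 100), (2, 250), (4, 400);

MERGE @target AS t
USING @source AS s
    ON t.Id = s.Id
WHEN MATCHED AND t.Checksum <> s.Checksum THEN   -- DBAction.UPDATE
    UPDATE SET t.Checksum = s.Checksum
WHEN NOT MATCHED BY TARGET THEN                  -- DBAction.INSERT
    INSERT (Id, Checksum) VALUES (s.Id, s.Checksum)
WHEN NOT MATCHED BY SOURCE THEN                  -- DBAction.DELETE
    DELETE;

SELECT * FROM @target;   -- Id 2 updated, Id 4 inserted, Id 3 deleted, Id 1 untouched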

SQL Recursive Coalesce

I'm trying to create a column that contains all cities of the referenced addresses.
DECLARE @AddressList nvarchar(max)
SELECT @AddressList = COALESCE(@AddressList + ' ', '') + City FROM [Address]
SELECT
Employee.*,
(SELECT @AddressList) AS AddressCities
FROM Employee
But I don't know where to put the WHERE clause.
...
(SELECT @AddressList WHERE EmployeeId = Employee.EmployeeId) AS AddressCities
...
The above test doesn't work.
Table schemas are:
Employee
EmployeeId
Name
Address
Street
City
EmployeeId
If I understand you correctly, you wish to show all cities in a single column for each employee, so you want to GROUP BY and concatenate.
Using SQL Server 2005, try this (working example):
DECLARE @Employee TABLE(
EmployeeId INT,
NAME VARCHAR(100)
)
INSERT INTO @Employee (EmployeeId,[NAME]) SELECT 1, 'A'
INSERT INTO @Employee (EmployeeId,[NAME]) SELECT 2, 'B'
DECLARE @Address TABLE(
Street VARCHAR(50),
City VARCHAR(50),
EmployeeId INT
)
INSERT INTO @Address (Street,City, EmployeeId) SELECT 'A','A', 1
INSERT INTO @Address (Street,City, EmployeeId) SELECT 'B','B', 1
INSERT INTO @Address (Street,City, EmployeeId) SELECT 'C','C', 1
INSERT INTO @Address (Street,City, EmployeeId) SELECT 'D','D', 2
INSERT INTO @Address (Street,City, EmployeeId) SELECT 'E','E', 2
INSERT INTO @Address (Street,City, EmployeeId) SELECT 'F','F', 2
SELECT e.EmployeeId,
e.[NAME],
(
SELECT al.City + ','
FROM @Address al
WHERE al.EmployeeId = e.EmployeeId
FOR XML PATH('')
)
FROM @Employee e
GROUP BY e.EmployeeId,
e.[NAME]
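If you want the list without the trailing comma, the usual pattern is to build a leading comma instead and strip it with STUFF; a sketch against the same table variables (on SQL Server 2017 and later, STRING_AGG(City, ',') achieves the same without the XML trick):
SELECT e.EmployeeId,
       e.[NAME],
       STUFF((
           SELECT ',' + al.City
           FROM @Address al
           WHERE al.EmployeeId = e.EmployeeId
           FOR XML PATH('')
       ), 1, 1, '') AS Cities
FROM @Employee e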
I need more information about what you mean by 'column that contains all cities'. Explaining how what you want differs from the following might help you phrase the question:
SELECT e.EmployeeId,e.Name,a.City
FROM Employee e
INNER JOIN Address a ON a.EmployeeId = e.EmployeeId
GROUP BY e.EmployeeId,e.Name
-- update
I think I see what you mean. Do you want something like this:
EmployeeID | Name | Address
1 | John | 'London','Paris','Rome'
2 | Jane | 'New York','Miami'
?