MSSQL: procedure to remove duplicates - sql

Consider a table which does not have any primary or foreign keys. I would like to write a procedure which will remove all the duplicate rows, given the table name.
A row should be considered a duplicate of another if all of its fields are the same.
Can you tell me whether this is possible? One thing I tried is to group by every field, but this approach is not universal.

You could achieve it using Dynamic-SQL
Quick-and-dirty solution (plenty of room for improvement):
-- Demo fixture: a keyless table holding three identical rows and one unique row.
CREATE TABLE tab1
(
    a INT,
    b INT
);

INSERT INTO tab1 (a, b)
VALUES
    (1, 1),
    (1, 1),
    (1, 1),
    (2, 3);
GO
Procedure:
-- Removes fully duplicated rows from a table, keeping one row of each distinct
-- combination of column values.
--
-- Parameters:
--   @tab_name  unqualified name of the table to clean
--   @debug     1 = also return the generated statement as a result set
--   @schema    schema that owns the table (defaults to dbo)
--
-- Raises error 50000 (severity 16) when the table does not exist.
--
-- TODO: wrap in BEGIN TRY; skip or CAST columns whose types cannot be used in
--       PARTITION BY (TEXT/NTEXT/IMAGE/XML ...).
CREATE PROCEDURE dbo.remove_duplicates
     @tab_name SYSNAME
    ,@debug    BIT     = 0
    ,@schema   SYSNAME = N'dbo'
AS
BEGIN
    SET NOCOUNT ON;

    -- Quoted two-part name; QUOTENAME keeps odd or hostile names from altering the statement.
    DECLARE @full_name NVARCHAR(517) = QUOTENAME(@schema) + N'.' + QUOTENAME(@tab_name);

    IF OBJECT_ID(@full_name, N'U') IS NULL
    BEGIN
        RAISERROR(N'Table %s.%s does not exist.', 16, 1, @schema, @tab_name);
        RETURN;
    END;

    -- Comma-separated, quoted column list in ordinal order.
    -- TYPE + .value() avoids XML entitization of names containing &, < or >.
    DECLARE @cols NVARCHAR(MAX) = STUFF((
        SELECT N',' + QUOTENAME(c.COLUMN_NAME)
        FROM INFORMATION_SCHEMA.COLUMNS AS c
        WHERE c.TABLE_NAME = @tab_name
          AND c.TABLE_SCHEMA = @schema
        ORDER BY c.ORDINAL_POSITION
        FOR XML PATH(''), TYPE).value('.', 'NVARCHAR(MAX)'), 1, 1, N'');

    -- Rows are numbered within each group of identical values; everything after
    -- the first row of a group is deleted through the CTE. The CTE exposes only
    -- rn so it cannot collide with a table column that is itself called rn.
    DECLARE @sql NVARCHAR(MAX) =
          N'WITH cte AS' + NCHAR(13) + NCHAR(10)
        + N'(' + NCHAR(13) + NCHAR(10)
        + N'    SELECT rn = ROW_NUMBER() OVER(PARTITION BY ' + @cols + N' ORDER BY (SELECT 1))' + NCHAR(13) + NCHAR(10)
        + N'    FROM ' + @full_name + NCHAR(13) + NCHAR(10)
        + N')' + NCHAR(13) + NCHAR(10)
        + N'DELETE FROM cte' + NCHAR(13) + NCHAR(10)
        + N'WHERE rn > 1;';

    IF @debug = 1 SELECT @sql;

    EXEC dbo.sp_executesql @sql;
END
GO
Execution:
-- Run the procedure against the demo table with @debug = 1 so the generated
-- statement is returned as well, then show what is left in the table.
EXEC [dbo].[remove_duplicates] @tab_name = 'tab1', @debug = 1;
SELECT * FROM tab1;
LiveDemo

This will remove duplicates from a table. Your partition by must contain the fields that you wish to group by to determine what a duplicate is. In your case, all of them.
-- Start from a clean slate if a previous run left the temp table behind.
IF OBJECT_ID('tempdb..#TABLE') IS NOT NULL
    DROP TABLE #TABLE;

CREATE TABLE #TABLE
(
    SOMEINT   INT,
    SOMEVALUE VARCHAR(255)
);

-- Sample data: ten distinct values, several of them repeated.
INSERT INTO #TABLE (SOMEINT, SOMEVALUE)
VALUES
    (1, 'VALUE1'),
    (1, 'VALUE2'), (1, 'VALUE2'),
    (1, 'VALUE3'),
    (1, 'VALUE4'), (1, 'VALUE4'), (1, 'VALUE4'), (1, 'VALUE4'),
    (1, 'VALUE5'),
    (1, 'VALUE6'), (1, 'VALUE6'), (1, 'VALUE6'),
    (1, 'VALUE7'),
    (1, 'VALUE8'), (1, 'VALUE8'),
    (1, 'VALUE9'),
    (1, 'VALUE10');

-- Number the rows inside each (SOMEINT, SOMEVALUE) group and delete every
-- row except the first of its group.
WITH numbered AS
(
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY SOMEINT, SOMEVALUE ORDER BY SOMEINT ASC) AS SEQUENCE
    FROM #TABLE
)
DELETE FROM numbered
WHERE SEQUENCE >= 2;
GO

SELECT * FROM #TABLE;

There are a number of ways.
First,
Create a temp table, and copy distinct data to that temp table. Delete or truncate the data from your actual table. And copy the temp table to your actual table. Drop the temp table
-- Replace the contents of TABLE1 with its distinct rows via a temp copy.
-- The delete and re-insert run in one transaction so that a failure between
-- them cannot leave TABLE1 empty; XACT_ABORT rolls back on any run-time error.
-- NOTE(review): an IDENTITY column in TABLE1 would need IDENTITY_INSERT and an
-- explicit column list -- confirm against the real table.
SET XACT_ABORT ON;

SELECT DISTINCT * INTO #table1 FROM TABLE1;

BEGIN TRANSACTION;
    -- Intentionally unfiltered: the whole table is rebuilt from the distinct copy.
    DELETE FROM TABLE1;

    INSERT INTO TABLE1
    SELECT * FROM #table1;
COMMIT TRANSACTION;

DROP TABLE #table1;
or
Second,
Add one column to the table, update that column using ROW_NUMBER PARTITION, then remove the rows where COLUMN <> 1 . Drop the newly created column.

Related

Cross-database subquery on using database name stored in record

Is the script structure below possible?
-- Sketch from the question, not a runnable query: the intent is for the database
-- part of the three-part name in the subquery to come from A.DatabaseName of
-- each outer row.
-- NOTE(review): as written, [A.DatabaseName] is a bracket-delimited identifier,
-- i.e. a database literally named "A.DatabaseName", not a reference to the column.
Select
*,
(Select Count(*)
from [A.DatabaseName].dbo.TableA
where SomeID = A.SomeID) As Total
From
[Database1].dbo.Table1 A
The subquery above is dependent on the database name from [Database1].
Is this doable? If yes, how can this be implemented?
The Dynamic Query will help you.
-- Builds and runs a query whose subquery targets the database named in
-- Table1.DatabaseName.
DECLARE @DBName   SYSNAME,
        @SQLQuery NVARCHAR(MAX);

-- If Table1 holds several rows, the variable ends up with the value of whichever
-- row is read last; add a WHERE clause to pick a specific database.
SELECT @DBName = A.DatabaseName
FROM [Database1].dbo.Table1 AS A;

-- QUOTENAME delimits the database name so it cannot change the statement's shape.
SET @SQLQuery =
      N'SELECT A.*,' + NCHAR(13) + NCHAR(10)
    + N'       (SELECT COUNT(*)' + NCHAR(13) + NCHAR(10)
    + N'        FROM ' + QUOTENAME(@DBName) + N'.dbo.TableA AS T' + NCHAR(13) + NCHAR(10)
    + N'        WHERE T.SomeID = A.SomeID) AS Total' + NCHAR(13) + NCHAR(10)
    + N'FROM [Database1].dbo.Table1 AS A;';

EXEC (@SQLQuery);
---> Edit
I think I understand the unusual thing you are trying to do.
You store a database name in a table and then want to use it from a subquery.
You have to try something like this :
-- Self-contained demo: a table (allTable) stores a database name and a table
-- name, and dynamic SQL selects from the table those values point to.
CREATE DATABASE test;
GO
USE test;
GO

CREATE TABLE client
(
    ID     INT IDENTITY(1,1),   -- IDENTITY needs an explicit integer type
    [name] VARCHAR(20)
);

INSERT INTO client ([name])
VALUES ('Jean'), ('Paul'), ('Mark'), ('Pierre');

-- NomSchema holds the database part of the target name; NomTable the table part.
CREATE TABLE allTable
(
    NomSchema VARCHAR(200),
    NomTable  VARCHAR(200)
);

INSERT INTO allTable (NomSchema, NomTable)
VALUES ('TEST', 'client');

IF OBJECT_ID('tempdb..#ResultA') IS NOT NULL
    DROP TABLE #ResultA;

BEGIN TRAN;

-- MAX length: two QUOTENAME results plus the fixed text can exceed 200 characters.
DECLARE @sql NVARCHAR(MAX);

SELECT TOP (1) NomSchema, NomTable
INTO #ResultA
FROM allTable;

SET @sql = N'SELECT * FROM '
         + (SELECT QUOTENAME(r.NomSchema) + N'.dbo.' + QUOTENAME(r.NomTable)
            FROM #ResultA AS r);

SELECT @sql;   -- show the generated statement
EXEC (@sql);

DROP TABLE #ResultA;
COMMIT;
GO
-- DROP DATABASE TEST

drop all SQL tables that appear in a query

I am attempting to develop a script to compare two databases to determine extra tables in one, then delete those tables. Here's my current script to locate the extraneous tables:
-- Lists the tables that exist in TARGET but have no same-named table in MODEL.
-- TARGET: substitute the database that is to be modified.
-- MODEL:  substitute the database that serves as the reference for comparison.
SELECT
    'TARGET' AS dbname,
    tgt.table_name
FROM TARGET.[INFORMATION_SCHEMA].[tables] AS tgt
WHERE tgt.table_name NOT IN
(
    SELECT mdl.table_name
    FROM MODEL.[INFORMATION_SCHEMA].[tables] AS mdl
)
That gives me the results I need, but now I need to figure out how to drop the tables. I'm afraid I'm utterly lost at this point. I wouldn't mind a way to declare variables instead of typing in the database name repeatedly either, but I'm not sure I can in this instance.
You could use dynamic SQL:
-- Generates one DROP TABLE statement per base table that exists in TARGET but
-- not (by name) in MODEL. Names are bracket-quoted and fully qualified with the
-- database and schema; views are excluded because DROP TABLE cannot remove them.
DECLARE @sql NVARCHAR(MAX);

SET @sql = COALESCE(
    (
        SELECT N'DROP TABLE TARGET.' + QUOTENAME(t1.table_schema) + N'.' + QUOTENAME(t1.table_name) + N';' + NCHAR(10)
        FROM TARGET.[INFORMATION_SCHEMA].[tables] AS t1
        WHERE t1.table_type = 'BASE TABLE'
          AND t1.table_name NOT IN (SELECT t2.table_name
                                    FROM MODEL.[INFORMATION_SCHEMA].[tables] AS t2)
        ORDER BY t1.table_schema, t1.table_name
        FOR XML PATH(''), TYPE
    ).value('.', 'NVARCHAR(MAX)'),
    N'');   -- empty script when there is nothing to drop

SELECT @sql; -- debug
-- Left disabled on purpose: review the generated script before running it.
--EXEC(@sql);
EDIT:
MySQL(may need some nitpicking):
-- MySQL variant: builds ONE statement, "DROP TABLE `TARGET`.`a`, `TARGET`.`b`, ...",
-- because PREPARE accepts a single statement only.
-- Replace the 'TARGET' / 'MODEL' string literals and the `TARGET` identifier with
-- the real database names. In MySQL there is a single information_schema, and the
-- owning database is the table_schema column.
-- NOTE: GROUP_CONCAT output is limited by group_concat_max_len; raise it when many
-- tables are involved. If there is nothing to drop, @s is NULL and PREPARE fails.
SET @s = (
    SELECT CONCAT(
               'DROP TABLE ',
               GROUP_CONCAT(
                   CONCAT('`TARGET`.`', REPLACE(t1.table_name, '`', '``'), '`')
                   SEPARATOR ', '))
    FROM information_schema.tables AS t1
    WHERE t1.table_schema = 'TARGET'
      AND t1.table_type = 'BASE TABLE'
      AND t1.table_name NOT IN (SELECT t2.table_name
                                FROM information_schema.tables AS t2
                                WHERE t2.table_schema = 'MODEL'));
SELECT @s; -- debug
PREPARE stmt FROM @s;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
My immediate thought is to assign each result in your query a Row Number and place your results into a temp table. Use a while loop starting at 1 and loop through the maximum number you have within that temp table getting the name of the table that Row Number is assigned to each loop. Use that name to delete from the database.
-- Collect the base tables that exist in TARGET but not (by name) in MODEL and
-- give each one a sequential number so they can be visited one at a time.
SELECT
    'TARGET' AS dbname,
    t1.table_schema,
    t1.table_name,
    ROW_NUMBER() OVER (ORDER BY t1.table_schema, t1.table_name) AS RowNumber
INTO #temp
FROM TARGET.[INFORMATION_SCHEMA].[tables] AS t1
WHERE t1.table_type = 'BASE TABLE'
  AND t1.table_name NOT IN (SELECT t2.table_name
                            FROM MODEL.[INFORMATION_SCHEMA].[tables] AS t2);

DECLARE @counter INT = 1;
DECLARE @maxNum  INT;
DECLARE @dropSql NVARCHAR(MAX);

-- NULL when #temp is empty, in which case the loop body never runs.
SELECT @maxNum = MAX(RowNumber) FROM #temp;

WHILE @counter <= @maxNum
BEGIN
    -- A table name cannot be supplied through a variable, so each DROP is
    -- built as text (with quoted identifiers) and executed dynamically.
    SELECT @dropSql = N'DROP TABLE TARGET.' + QUOTENAME(table_schema) + N'.' + QUOTENAME(table_name) + N';'
    FROM #temp
    WHERE RowNumber = @counter;

    EXEC (@dropSql);

    SET @counter += 1;
END;

DROP TABLE #temp;
I am not sure if "DELETE TABLE #tableName" is a proper command but there is probably a very similar solution using what I have given you. I assume this is T-SQL..

Reorder columns in final stored procedure SELECT / OUTPUT

I need to reorder columns in the final SELECT statement in a stored procedure. Column orders needs to be fetched from another table.
I have a solution based on dynamic SQL. Is there any better way to do it? I have around 100 columns to return with millions of rows for an Excel export. Is there any other performance optimized solution other than a dynamic query?
Please find sample code below for my current solution.
-- Remove leftovers from an earlier run.
IF OBJECT_ID('tempdb..#TempColumns') IS NOT NULL
    DROP TABLE #TempColumns;

IF OBJECT_ID('tempdb..#TempColumnsOrder') IS NOT NULL
    DROP TABLE #TempColumnsOrder;

-- Data to export.
CREATE TABLE #TempColumns
(
    ID        INT IDENTITY,
    FirstName VARCHAR(MAX),
    LastName  VARCHAR(MAX),
    Gender    VARCHAR(MAX)
);

INSERT INTO #TempColumns (FirstName, LastName, Gender)
VALUES
    ('ABC', 'DEF', 'MALE'),
    ('PR',  'ZA',  'FEMALE'),
    ('ERT', 'GFG', 'MALE');

-- Desired output position of each column of #TempColumns.
CREATE TABLE #TempColumnsOrder
(
    ID          INT IDENTITY,
    ColumnName  VARCHAR(MAX),
    ColumnOrder INT
);

INSERT INTO #TempColumnsOrder (ColumnName, ColumnOrder)
VALUES
    ('FirstName', 3),
    ('LastName',  2),
    ('Gender',    1);

SELECT * FROM #TempColumns;
SELECT * FROM #TempColumnsOrder;
-- Builds "SELECT <columns in ColumnOrder sequence> FROM #TempColumns" and runs it.
-- The column list is assembled with FOR XML PATH, whose ORDER BY is honoured;
-- appending to a variable inside an ordered SELECT is not guaranteed to visit
-- every row. QUOTENAME delimits each column name.
DECLARE @script VARCHAR(MAX);

SELECT @script =
      'SELECT '
    + STUFF((SELECT ',' + QUOTENAME(ColumnName)
             FROM #TempColumnsOrder
             ORDER BY ColumnOrder
             FOR XML PATH(''), TYPE).value('.', 'VARCHAR(MAX)'), 1, 1, '')
    + ' FROM #TempColumns';

PRINT @script;
EXEC (@script);

-- Cleanup.
IF OBJECT_ID('tempdb..#TempColumns') IS NOT NULL
BEGIN
    DROP TABLE #TempColumns
END
IF OBJECT_ID('tempdb..#TempColumnsOrder') IS NOT NULL
BEGIN
    DROP TABLE #TempColumnsOrder
END
Thanks for reply, Is there any better way in Dynamic SQL other than what i did?
You can eliminate the unsupported string concatenation you are using, and modernize and simplify the code:
-- Reorders the output columns of #TempColumns according to #TempColumnsOrder
-- using STRING_AGG (SQL Server 2017+ / Azure SQL Database).
DROP TABLE IF EXISTS #TempColumns;
DROP TABLE IF EXISTS #TempColumnsOrder;

CREATE TABLE #TempColumns
(
    ID        INT IDENTITY,
    FirstName VARCHAR(MAX),
    LastName  VARCHAR(MAX),
    Gender    VARCHAR(MAX)
);

INSERT INTO #TempColumns (FirstName, LastName, Gender)
VALUES ('ABC', 'DEF', 'MALE'), ('PR', 'ZA', 'FEMALE'), ('ERT', 'GFG', 'MALE');

CREATE TABLE #TempColumnsOrder
(
    ID          INT IDENTITY,
    ColumnName  VARCHAR(MAX),
    ColumnOrder INT
);

INSERT INTO #TempColumnsOrder (ColumnName, ColumnOrder)
VALUES ('FirstName', 3), ('LastName', 2), ('Gender', 1);

SELECT * FROM #TempColumns;
SELECT * FROM #TempColumnsOrder;

-- QUOTENAME returns a bounded NVARCHAR; casting to NVARCHAR(MAX) lifts
-- STRING_AGG's 8000-byte result limit, which a long column list would hit.
DECLARE @script NVARCHAR(MAX) = CONCAT(
    N'SELECT ',
    (SELECT STRING_AGG(CAST(QUOTENAME(ColumnName) AS NVARCHAR(MAX)), N', ')
                WITHIN GROUP (ORDER BY ColumnOrder)
     FROM #TempColumnsOrder),
    N' FROM #TempColumns');

PRINT @script;
EXEC (@script);

DROP TABLE IF EXISTS #TempColumns;
DROP TABLE IF EXISTS #TempColumnsOrder;
-- Quoted from the question to show the pattern under discussion: appending to a
-- variable across the rows of an ordered SELECT.
SELECT @script = @script + ColumnName + ',' FROM #TempColumnsOrder
ORDER BY ColumnOrder
The behavior of aggregate string concatenation with the above technique is not guaranteed. The actual behavior is plan-dependent so you may not get the desired results.
In SQL Server 2017 and Azure SQL Database, STRING_AGG is the proper method:
-- Comma-separated column names, concatenated in ColumnOrder sequence.
SELECT
    STRING_AGG(col_order.ColumnName, ',')
        WITHIN GROUP (ORDER BY col_order.ColumnOrder)
FROM #TempColumnsOrder AS col_order;
In older SQL Versions like SQL Server 2012, the best method is with XML PATH():
-- Appends the comma-separated, ordered column list to @script.
-- FOR XML PATH('') concatenates the rows in ORDER BY sequence; TYPE + .value()
-- returns plain text without XML entities; STUFF removes the leading comma.
-- @script must already be non-NULL, otherwise the concatenation yields NULL.
SELECT @script = @script +
    STUFF((SELECT ',' + ColumnName
           FROM #TempColumnsOrder
           ORDER BY ColumnOrder
           FOR XML PATH(''), TYPE).value('.', 'NVARCHAR(MAX)'), 1, 1, '');
See this answer for details about how the above query works.

How do I get a collection of every value in every column of a table?

I have two tables, Values and SpecialValues.
Values has two columns, RecordID and ValueName.
SpecialValues is a table which contains a single row, and thirty columns named SpecialValueName1, SpecialValueName2, SpecialValueName3, etc.
There are obvious database design problems with this system.
That aside, can someone explain to me how to query SpecialValues so that I can get a collection of all the values of every row from the table, and exclude them from a Select from Values?
There's probably some easy way to do this or create a View for it or something, but I think looking at this code might have broken me for the moment...
EDIT: I'd like a query to get all the individual values from every row and column of a given table (in this case the SpecialValues table) so that the query does not need to be updated the next time someone adds another column to the SpecialValues table.
This creates a #SpecialValuesColumns temporary table to store all the column names from SpecialValues.
It then uses a cursor to insert all the values from each of those columns into another temporary table #ProtectedValues.
It then uses a NOT IN query to exclude all of those values from a query to Values.
This code is bad and I feel bad for writing it, but it seems like the least-worst option open to me right now.
-- Copies every value of every VARCHAR(255) column of SpecialValues into
-- #ProtectedValues, then returns the rows of [Values] whose ValueName is not
-- among them. The column list is read from the catalog, so columns added to
-- SpecialValues later are picked up automatically.
DECLARE @SpecialColumnsCount INT;
DECLARE @Counter             INT = 1;
DECLARE @CurrentColumnName   SYSNAME;
DECLARE @ExecSQL             NVARCHAR(MAX);

-- Temp table rather than a table variable: the dynamic INSERT below runs in a
-- child scope, where a temp table is visible and a table variable is not.
CREATE TABLE #ProtectedValues
(
    RecordID INT IDENTITY(1,1) PRIMARY KEY NOT NULL,
    Value    VARCHAR(255)
);

DECLARE @SpecialValuesColumns TABLE
(
    RecordID   INT IDENTITY(1,1) PRIMARY KEY NOT NULL,
    ColumnName SYSNAME
);

INSERT INTO @SpecialValuesColumns (ColumnName)
SELECT COLUMN_NAME
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = 'SpecialValues'
  AND DATA_TYPE = 'varchar'
  AND CHARACTER_MAXIMUM_LENGTH = 255
ORDER BY ORDINAL_POSITION;

SELECT @SpecialColumnsCount = COUNT(*) FROM @SpecialValuesColumns;

WHILE @Counter <= @SpecialColumnsCount
BEGIN
    SELECT @CurrentColumnName = ColumnName
    FROM @SpecialValuesColumns
    WHERE RecordID = @Counter;

    -- QUOTENAME delimits the column name; NULLs are skipped because they can
    -- never match a ValueName.
    SET @ExecSQL = N'INSERT INTO #ProtectedValues (Value) SELECT ' + QUOTENAME(@CurrentColumnName)
                 + N' FROM SpecialValues WHERE ' + QUOTENAME(@CurrentColumnName) + N' IS NOT NULL;';
    EXEC (@ExecSQL);

    SET @Counter = @Counter + 1;
END;

-- The comparison uses #ProtectedValues.Value. The earlier subquery selected
-- ValueName, which does not exist in #ProtectedValues and therefore bound to the
-- outer row, so every row was compared with itself.
-- [Values] is bracketed because VALUES is a reserved word.
SELECT v.*
FROM [Values] AS v
WHERE NOT EXISTS (SELECT 1
                  FROM #ProtectedValues AS pv
                  WHERE pv.Value COLLATE DATABASE_DEFAULT = v.ValueName);

DROP TABLE #ProtectedValues;
I might have misunderstood but doesn't this do it?
-- Rows of [Values] whose ValueName is not one of the special values.
-- [Values] is bracketed because VALUES is a reserved word. NULLs are filtered out
-- of the list because a single NULL inside NOT IN makes the predicate match nothing.
SELECT * FROM [Values]
WHERE ValueName NOT IN (
    SELECT SpecialValueName1 FROM SpecialValues WHERE SpecialValueName1 IS NOT NULL
    UNION SELECT SpecialValueName2 FROM SpecialValues WHERE SpecialValueName2 IS NOT NULL
    UNION SELECT SpecialValueName3 FROM SpecialValues WHERE SpecialValueName3 IS NOT NULL
    -- ...one UNION SELECT per remaining SpecialValueNameN column
)
You could of course make the subquery into a view instead.
*Edit:
This is quite ugly but should solve your problem:
First Create procedure #1
-- Returns the ValueName of every row in [Values] that does not appear in any
-- column of SpecialValues. The columns are discovered from the catalog and read
-- one at a time through dbo.SP2.
CREATE PROCEDURE [dbo].[SP1]
AS
BEGIN
    DECLARE
        @Table   SYSNAME = N'SpecialValues',
        @Columns SYSNAME;   -- holds ONE column name per cursor iteration

    -- Collects the values returned by SP2 for each column.
    CREATE TABLE #TempTable (Value NVARCHAR(255) COLLATE DATABASE_DEFAULT);

    DECLARE Table_Cursor CURSOR LOCAL FAST_FORWARD FOR
        SELECT COLUMN_NAME
        FROM [INFORMATION_SCHEMA].[COLUMNS]
        WHERE [TABLE_NAME] = @Table
        ORDER BY ORDINAL_POSITION;

    OPEN Table_Cursor;
    FETCH NEXT FROM Table_Cursor INTO @Columns;

    WHILE @@FETCH_STATUS = 0
    BEGIN
        INSERT INTO #TempTable (Value)
        EXEC SP2 @Columns = @Columns, @Table = @Table;

        FETCH NEXT FROM Table_Cursor INTO @Columns;
    END;

    CLOSE Table_Cursor;
    DEALLOCATE Table_Cursor;

    -- [Values] is bracketed because VALUES is a reserved word. NOT EXISTS is
    -- used so a NULL collected from SpecialValues cannot empty the result.
    SELECT v.ValueName
    FROM [Values] AS v
    WHERE NOT EXISTS (SELECT 1 FROM #TempTable AS t WHERE t.Value = v.ValueName);

    DROP TABLE #TempTable;
END
Then Create procedure #2
-- Returns, as NVARCHAR(255), the value of one column from one row of a table.
--
-- Parameters:
--   @Columns  name of the single column to read
--   @Table    table name, optionally schema-qualified (e.g. dbo.SpecialValues)
CREATE PROCEDURE [dbo].[SP2]
    @Columns nvarchar(255) = '',
    @Table   nvarchar(255)
AS
BEGIN
    DECLARE @Query nvarchar(MAX);

    -- Each name part is delimited with QUOTENAME so the parameters cannot inject
    -- SQL; PARSENAME splits an optional schema prefix from the table name.
    DECLARE @QualifiedTable nvarchar(600) =
        COALESCE(QUOTENAME(PARSENAME(@Table, 2)) + N'.', N'') + QUOTENAME(PARSENAME(@Table, 1));

    -- An explicit length is required: CONVERT(nvarchar, x) defaults to 30
    -- characters and would silently truncate longer values.
    SET @Query = N'SELECT TOP 1 CONVERT(nvarchar(255), ' + QUOTENAME(@Columns) + N') FROM ' + @QualifiedTable;

    EXEC (@Query);
END
Then lastly execute the procedure
-- Run the driver procedure.
EXECUTE SP1;
You need to unpivot the values in specialvalues. A pretty easy way to do that is with cross apply syntax:
-- Unpivots the columns of specialvalues into one value per row.
-- The derived VALUES table has its own alias (v); reusing the table's alias sv
-- for it is rejected as a duplicate correlation name.
select v.value
from specialvalues sv cross apply
     (values (sv.SpecialValueName1),
             (sv.SpecialValueName2),
             (sv.SpecialValueName3)
             -- ...one entry per remaining SpecialValueNameN column
     ) v(value)
where v.value is not null;
You can exclude these from the list using not in, not exists or a left join.
Whichever way you cut it, you have to specify the columns in SpecialValues. You can do this with a long set of UNION queries, or use UNPIVOT:
-- Turns the listed SpecialValueNameN columns into rows, one SpecialValue per row.
SELECT SpecialValue
FROM
(
    SELECT
        SpecialValueName1,
        SpecialValueName2,
        SpecialValueName3
    FROM #SpecialValues
) AS source_row
UNPIVOT
(
    SpecialValue FOR ROW IN (SpecialValueName1, SpecialValueName2, SpecialValueName3)
) AS unpivoted
You can then incorporate this into a query on Values using NOT IN
-- Rows of [Values] whose ValueName is not among the unpivoted special values.
SELECT *
FROM [Values]
WHERE ValueName NOT IN
(
    SELECT SpecialValue
    FROM
    (
        SELECT
            SpecialValueName1,
            SpecialValueName2,
            SpecialValueName3
        FROM #SpecialValues
    ) AS source_row
    UNPIVOT
    (
        SpecialValue FOR ROW IN (SpecialValueName1, SpecialValueName2, SpecialValueName3)
    ) AS unpivoted
)

SQL Combine two statements from old tabels into a new one

I want to create a new table in a stored procedure. In the new table I need a row with a combined value from the old tables.
It should look like this:
Table_old1 Table_old2 Table_new
----------------------------------------
Edward Mary EdwardMary
Daniel John DanielJohn
George Sam GeorgeSam
Steven Alaina StevenAlaina
Paul Edward PaulEdward
For the stored procedure I use some parameters for dynamic SQL.
I tried the following code my result was no success at all ;(
-- Creates table all_together from every combination of a row of
-- dbo.tbl_all_possible with a row of the table named by @tblname.
--
-- Parameters:
--   @tblname  unqualified name of the second source table (in schema dbo)
--
-- T-SQL has no CREATE TABLE ... AS; SELECT ... INTO creates and fills the table.
-- NOTE(review): the column name [Values] is taken from the original snippet --
-- confirm against the real tables.
CREATE PROCEDURE build_together
    @tblname sysname
AS
BEGIN
    DECLARE @sql nvarchar(max);

    -- Distinct output names are required by SELECT ... INTO; ALL and VALUES are
    -- reserved words, hence the different aliases and the bracketed column.
    SET @sql = N'SELECT a.[Values] AS all_value, c.[Values] AS chosen_value'
             + N' INTO all_together'
             + N' FROM dbo.tbl_all_possible AS a'
             + N' CROSS JOIN dbo.' + QUOTENAME(@tblname) + N' AS c;';

    EXEC sp_executesql @sql;
END
Any insight would be greatly appreciated.
I am guessing you want to generate some sample data. This should be useful and if you want to modify and add Joins, that should be easy as well.
SCRIPT:
IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[usp_BuildTogether]') AND type in (N'P', N'PC'))
DROP PROCEDURE dbo.usp_BuildTogether
GO
-- Builds a table containing every pairing of a value from one table with a
-- value from another, plus the two values joined by a single space.
--
-- Parameters (all unqualified object names):
--   @OneTableName / @OneColumnName   first source table and its column
--   @TwoTableName / @TwoColumnName   second source table and its column
--   @CombinedTableName               table to (re)create with the result
--
-- Result columns: Table_old1, Table_old2, Table_new.
CREATE PROCEDURE usp_BuildTogether
 @OneTableName sysname = 'tbl_all_possible'
,@OneColumnName sysname = 'Value'
,@TwoTableName sysname = 'tbl_all_possible2'
,@TwoColumnName sysname = 'Value'
,@CombinedTableName sysname = 'all_together'
AS
BEGIN
    DECLARE @sql nvarchar(max) = NULL;
    DECLARE @crlf nchar(2) = NCHAR(13) + NCHAR(10);

    -- Every identifier is delimited once, up front, so no parameter can inject SQL.
    DECLARE @combined nvarchar(258) = QUOTENAME(@CombinedTableName);
    DECLARE @oneCol   nvarchar(258) = QUOTENAME(@OneColumnName);
    DECLARE @twoCol   nvarchar(258) = QUOTENAME(@TwoColumnName);

    -- Drop a previous result table; IS NOT NULL is the reliable existence test
    -- for OBJECT_ID.
    IF OBJECT_ID(@combined, N'U') IS NOT NULL
    BEGIN
        SET @sql = N'DROP TABLE ' + @combined + N';';
        EXEC (@sql);
    END;

    SET @sql = N'SELECT one.' + @oneCol + N' AS Table_old1, two.' + @twoCol + N' AS Table_old2, one.' + @oneCol + N' + '' '' + two.' + @twoCol + N' AS Table_new' + @crlf
             + N'INTO ' + @combined + @crlf
             + N'FROM ' + QUOTENAME(@OneTableName) + N' AS one CROSS JOIN ' + QUOTENAME(@TwoTableName) + N' AS two;';
    EXEC (@sql);
END
TEST DATA:
-- End-to-end check for usp_BuildTogether: create two source tables, run the
-- procedure with its default arguments, show the result, then clean up.
-- Existence checks use IS NOT NULL, the reliable test for OBJECT_ID.

--Cleanup Old test tables
IF OBJECT_ID(N'all_together') IS NOT NULL
    DROP TABLE all_together;

IF OBJECT_ID(N'tbl_all_possible') IS NOT NULL
    DROP TABLE tbl_all_possible;
CREATE TABLE tbl_all_possible (ID INT IDENTITY(1,1), Value VARCHAR(250));

IF OBJECT_ID(N'tbl_all_possible2') IS NOT NULL
    DROP TABLE tbl_all_possible2;
CREATE TABLE tbl_all_possible2 (ID INT IDENTITY(1,1), Value VARCHAR(250));

-- Insert Test Data
INSERT INTO tbl_all_possible (Value)
VALUES ('Edward'), ('Daniel'), ('George'), ('Steven'), ('Paul');

INSERT INTO tbl_all_possible2 (Value)
VALUES ('Mary'), ('John'), ('Sam'), ('Alaina'), ('Edward');

--Execute SP (called directly; wrapping it in sp_executesql is unnecessary)
EXEC usp_BuildTogether;

--Verify Result
SELECT * FROM all_together;

--Cleanup
IF OBJECT_ID(N'tbl_all_possible') IS NOT NULL
    DROP TABLE tbl_all_possible;

IF OBJECT_ID(N'tbl_all_possible2') IS NOT NULL
    DROP TABLE tbl_all_possible2;

IF OBJECT_ID(N'all_together') IS NOT NULL
    DROP TABLE all_together;
Just try this out,may not be correct as you didn't mention any key columns, just gave it a spin.
-- Pairs the n-th row of old_table1 with the n-th row of old_table2 and
-- concatenates the two names.
-- ORDER BY (SELECT NULL) numbers the rows in no particular order; use a real
-- ordering column if a specific pairing is required.
SELECT
    old1.name AS oldname1,
    old2.name AS oldname2,
    old1.name + old2.name AS newname
FROM
    (SELECT name, ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) AS id FROM old_table1) AS old1
INNER JOIN
    (SELECT name, ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) AS id FROM old_table2) AS old2
    ON old1.id = old2.id
You can use simple JOIN and CONCATENATE to achieve this.
Try the following:
-- Template: creates Table_New holding the concatenation of a column from
-- Table_old1 with the matching row's column from Table_old2.
-- ColumnName and Primary_Key are placeholders -- substitute the real column names.
-- SELECT ... INTO creates Table_New with the column type derived from the
-- expression, so no separate CREATE TABLE is needed.
CREATE PROCEDURE build_together
AS
BEGIN
    SELECT old1.ColumnName + old2.ColumnName AS New_ColumnName
    INTO Table_New
    FROM Table_old1 AS old1
    INNER JOIN Table_old2 AS old2
        -- Rows are matched on the shared key; change this condition if the
        -- tables are related differently.
        ON old1.Primary_Key = old2.Primary_Key;
END