Issue using CHARINDEX function in SQL Server - sql

could someone help me? I'm trying to get a specific value in my delimited column.
Column_A is my data
Column_B is what I could get
Column_C is what I want
Basically I'm trying to get the values between the 3rd ":" and the 4th ":"
I'm using this piece of code here:
-- NOTE(review): the nested CHARINDEX calls only walk out to the 2nd/3rd
-- delimiter, and the hard-coded -6 in the length argument only happens to
-- work for one specific data shape — it is not a general "between the Nth
-- and (N+1)th ':'" extraction. See the cascading APPLY answer below.
select SourceID
, SUBSTRING(SourceID,CHARINDEX(':', SourceID, CHARINDEX(':', SourceID) + 1) + 1,
CHARINDEX(':', SourceID, CHARINDEX(':', SourceID, CHARINDEX(':', SourceID) + 1) + 1) -6)
from temp.table
Thanks in advance

You may try with a recursive CTE to retrieve any part of the string as you wish. Something like this
-- Sample data: ':'-delimited strings to be split.
CREATE TABLE #Temp
(
MyString NVARCHAR(2000)
)
INSERT INTO #TEMP
VALUES('42:45:ABCD:GGRFG34:SADSAD'),('65:213:5435423:234234')
-- Recursive CTE splitter: each recursion peels the first ':'-delimited token
-- off MyString into Part and increments Seq (the token's 1-based position).
;WITH CTE AS
(
SELECT
ParentSTring = MyString,
-- Remainder after the first delimiter; NULL once no delimiter is left
MyString = CASE CHARINDEX(':',MyString) WHEN 0 THEN NULL ELSE SUBSTRING(MyString,CHARINDEX(':',MyString)+1,LEN(MyString)) END,
-- First token (text before the first delimiter; whole string if none)
Part = CASE CHARINDEX(':',MyString) WHEN 0 THEN MyString ELSE SUBSTRING(MyString,1,CHARINDEX(':',MyString)-1) END,
Seq = 1
FROM
#Temp
UNION ALL
SELECT
ParentSTring,
MyString = CASE CHARINDEX(':',MyString) WHEN 0 THEN NULL ELSE SUBSTRING(MyString,CHARINDEX(':',MyString)+1,LEN(MyString)) END,
Part = CASE CHARINDEX(':',MyString) WHEN 0 THEN MyString ELSE SUBSTRING(MyString,1,CHARINDEX(':',MyString)-1) END,
Seq = ISNULL(Seq,0)+1
FROM
CTE
WHERE
ISNULL(MyString, '') <> ''
)
SELECT
*
FROM
CTE
WHERE
Seq = 3 -- for retrieving the 3rd string, change this accordingly
Result

First, if performance is important then a recursive CTE is NOT what you want. I demonstrate why in a moment.
I have a simple solution here, called SubstringBetween8K, but it's overkill for what you are doing. For this a simple Cascading APPLY will do the trick and perform the best. First the sample data:
-- Sample data for the cascading APPLY demo (idempotent drop/recreate).
IF OBJECT_ID('tempdb..#temp') IS NOT NULL DROP TABLE #temp;
GO
CREATE TABLE #temp (SourceId VARCHAR(1000));
INSERT #temp VALUES('42:45:10856x2019035x1200:GGRFG34:SADSAD.9999999999999999'),
('65:213:999555x2019035x9444:5435423:234234,123123.111'),
('999:12344:5555511056x35x9111:5435423:234234,555555555555'),
('225:0:11056x2019035x9444:5435423:ABAFLHG.882');
Next for the Cascading APPLY solution.
-- Cascading APPLY: f1/f2/f3 locate the 1st/2nd/3rd ':' — each CHARINDEX
-- starts one character past the previous hit — then SUBSTRING extracts the
-- text between f2.Pos and f3.Pos (the 3rd colon-separated field).
SELECT Item = SUBSTRING(t.SourceId, f2.Pos+1, f3.Pos-f2.Pos-1)
FROM #temp AS t
CROSS APPLY (VALUES(CHARINDEX(':',t.SourceId))) AS f1(Pos)
CROSS APPLY (VALUES(CHARINDEX(':',t.SourceId,f1.Pos+1))) AS f2(Pos)
CROSS APPLY (VALUES(CHARINDEX(':',t.SourceId,f2.Pos+1))) AS f3(Pos);
Results:
Item
------------------------
10856x2019035x1200
999555x2019035x9444
5555511056x35x9111
11056x2019035x9444
Now a quick performance test which will demonstrate why not to use a recursive CTE.
--==== Sample data
IF OBJECT_ID('tempdb..#temp') IS NOT NULL DROP TABLE #temp;
GO
CREATE TABLE #temp (SourceId VARCHAR(1000));
INSERT #temp VALUES('42:45:10856x2019035x1200:GGRFG34:SADSAD.9999999999999999'),
('65:213:999555x2019035x9444:5435423:234234,123123.111'),
('999:12344:5555511056x35x9111:5435423:234234,555555555555'),
('225:0:11056x2019035x9444:5435423:ABAFLHG.882');
--==== Add 100K rows for performance testing (TOP (100000) below)
INSERT #temp
SELECT TOP (100000) sourceId
FROM #temp
CROSS JOIN sys.all_columns, sys.all_columns AS b
GO
--==== Performance Test
IF OBJECT_ID('tempdb..#t1') IS NOT NULL DROP TABLE #t1;
IF OBJECT_ID('tempdb..#t2') IS NOT NULL DROP TABLE #t2;
GO
SET STATISTICS TIME, IO ON;
-- NOTE(review): label reads 'Cascading CTE' but this is the cascading APPLY query
PRINT CHAR(10)+'Cascading CTE'+CHAR(10)+REPLICATE('-',90);
SELECT Item = SUBSTRING(t.SourceId, f2.Pos+1, f3.Pos-f2.Pos-1)
INTO #t1
FROM #temp AS t
CROSS APPLY (VALUES(CHARINDEX(':',t.SourceId))) AS f1(Pos)
CROSS APPLY (VALUES(CHARINDEX(':',t.SourceId,f1.Pos+1))) AS f2(Pos)
CROSS APPLY (VALUES(CHARINDEX(':',t.SourceId,f2.Pos+1))) AS f3(Pos);
PRINT CHAR(10)+'Recursive CTE'+CHAR(10)+REPLICATE('-',90);
;WITH CTE AS
(
SELECT
ParentSTring = SourceId,
SourceId = CASE CHARINDEX(':',SourceId) WHEN 0 THEN NULL ELSE SUBSTRING(SourceId,CHARINDEX(':',SourceId)+1,LEN(SourceId)) END,
Part = CASE CHARINDEX(':',SourceId) WHEN 0 THEN SourceId ELSE SUBSTRING(SourceId,1,CHARINDEX(':',SourceId)-1) END,
Seq = 1
FROM #temp
UNION ALL
SELECT
ParentSTring,
-- NOTE(review): alias below is ignored — recursive-member column names come from the anchor
MyString = CASE CHARINDEX(':',SourceId) WHEN 0 THEN NULL ELSE SUBSTRING(SourceId,CHARINDEX(':',SourceId)+1,LEN(SourceId)) END,
Part = CASE CHARINDEX(':',SourceId) WHEN 0 THEN SourceId ELSE SUBSTRING(SourceId,1,CHARINDEX(':',SourceId)-1) END,
Seq = ISNULL(Seq,0)+1
FROM CTE
WHERE ISNULL(SourceId, '') <> ''
)
SELECT Part
INTO #t2
FROM CTE
WHERE Seq = 3
SET STATISTICS TIME, IO OFF;
Test Results:
Cascading CTE
------------------------------------------------------------------------------------------
Table '#temp'. Scan count 9, logical reads 807, physical reads 0...
SQL Server Execution Times: CPU time = 327 ms, elapsed time = 111 ms.
Recursive CTE
------------------------------------------------------------------------------------------
Table 'Worktable'. Scan count 2, logical reads 4221845, physical reads 0...
Table '#temp'. Scan count 1, logical reads 807, physical reads 0...
SQL Server Execution Times: CPU time = 8781 ms, elapsed time = 9370 ms.
Down from 10 seconds to roughly 1/10th of a second — about a 100X performance improvement. Part of the issue with the recursive CTE is the excessive IO (reads): note the 4.3 million reads for roughly 100K rows.

Related

SQL - STRING_SPLIT string position

I have a table with two columns of comma-separated strings. The way the data is formatted, the number of comma-separated items in both columns is equal, and the first value in colA is related to the first value in colB, and so on. (It's obviously not a very good data format, but it's what I'm working with.)
If I have the following row (PrimaryKeyID | column1 | column2):
1 | a,b,c | A,B,C
then in this data format, a & 1 are logically related, b & 2, etc.
I want to use STRING_SPLIT to split these columns, but using it twice obviously crosses them with each other, resulting in a total of 9 rows.
1 | a | A
1 | b | A
1 | c | A
1 | a | B
1 | b | B
1 | c | B
1 | a | C
1 | b | C
1 | c | C
What I want is just the 3 "logically-related" columns
1 | a | A
1 | b | B
1 | c | C
However, STRING_SPLIT(myCol,',') doesn't appear to save the String Position anywhere.
I have done the following:
-- Pair the Nth value of column1 with the Nth value of column2 per row.
-- NOTE(review): relies on STRING_SPLIT returning values in string order —
-- not guaranteed by the engine; ROW_NUMBER ordered by the partition key is
-- an arbitrary ordering.
SELECT tbl.ID,
t1.Column1Value,
t2.Column2Value
FROM myTable tbl
INNER JOIN (
SELECT t.ID,
ss.value AS Column1Value,
ROW_NUMBER() OVER (PARTITION BY t.ID ORDER BY t.ID) as StringOrder
FROM myTable t
CROSS APPLY STRING_SPLIT(t.column1,',') ss
) t1 ON tbl.ID = t1.ID
INNER JOIN (
SELECT t.ID,
ss.value AS Column2Value,
ROW_NUMBER() OVER (PARTITION BY t.ID ORDER BY t.ID) as StringOrder
FROM myTable t
CROSS APPLY STRING_SPLIT(t.column2,',') ss
) t2 ON tbl.ID = t2.ID AND t1.StringOrder = t2.StringOrder -- fixed: derived table was aliased t1 but referenced as t2
This appears to work on my small test set, but in my opinion there is no reason to expect it to work guaranteed every time. The ROW_NUMBER() OVER (PARTITION BY ID ORDER BY ID) is obviously a meaningless ordering, but it appears that, in absence of any real ordering, STRING_SPLIT is returning the values in the "default" order that they were already in. Is this "expected" behaviour? Can I count on this? Is there any other way of accomplishing what I'm attempting to do?
Thanks.
======================
EDIT
I got what I wanted (I think) with the following UDF. However it's pretty slow. Any suggestions?
-- Multi-statement TVF: splits @string on @delimiter, returning each part
-- with its 1-based Position. (Fixed: variable markers were mangled to '#';
-- T-SQL local variables and TVF return tables require '@'.)
-- NOTE(review): WHILE-loop splitters are slow on large inputs; an inline
-- tally-based splitter or STRING_SPLIT(..., 1) performs far better.
CREATE FUNCTION fn.f_StringSplit(@string VARCHAR(MAX),@delimiter VARCHAR(1))
RETURNS @r TABLE
(
Position INT,
String VARCHAR(255)
)
AS
BEGIN
DECLARE @current_position INT
SET @current_position = 1
-- Peel one delimited token off the front per iteration
WHILE CHARINDEX(@delimiter,@string) > 0 BEGIN
INSERT INTO @r (Position,String) VALUES (@current_position, SUBSTRING(@string,1,CHARINDEX(@delimiter,@string) - 1))
SET @current_position = @current_position + 1
SET @string = SUBSTRING(@string,CHARINDEX(@delimiter,@string) + 1, LEN(@string) - CHARINDEX(@delimiter,@string))
END
--add the trailing token (text after the last delimiter)
INSERT INTO @r (Position, String) VALUES(@current_position,@string)
RETURN
END
The only way I've discovered to explicitly maintain the order of the String_Split() function is to use the Row_Number() function with a literal value in the "order by".
For example:
-- Split a dotted version string into Major/Minor/Revision, relying on
-- ROW_NUMBER() over a constant to preserve String_Split's output order.
-- (Fixed: variable marker was mangled to '#'; T-SQL variables use '@'.)
-- NOTE(review): that ordering is implementation-specific, not guaranteed.
declare @Version nvarchar(128)
set @Version = '1.2.3';
with V as (select value v, Row_Number() over (order by (select 0)) n from String_Split(@Version, '.'))
select
(select v from V where n = 1) Major,
(select v from V where n = 2) Minor,
(select v from V where n = 3) Revision
Returns:
Major Minor Revision
----- ----- ---------
1 2 3
Update: if you are using a newer version of SQL Server, you can now provide an optional third bit argument which indicates that an ordinal column should also be included in the result. See my other answer here for more details.
Fortunately in newer SQL Server (Azure and 2022) an optional flag has been added to String_Split to include an "ordinal" column. If you are using a newer version of SQL Server, this finally provides a solution that is logically correct rather than implementation specific.
New definition:
String_Split(string, separator [, enable_ordinal])
e.g. String_Split('1.2.3', '.', 1)
Example:
-- Uses String_Split's enable_ordinal flag (third argument = 1; SQL Server
-- 2022 / Azure SQL): Ordinal is the part's guaranteed 1-based position.
with V as (select Value v, Ordinal n from String_Split('1.2.3', '.', 1))
select
(select v from V where n = 1) Major,
(select v from V where n = 2) Minor,
(select v from V where n = 3) Revision
Returns:
Major Minor Revision
----- ----- ---------
1 2 3
Your idea is fine, but your order by is not using a stable sort. I think it is safer to do:
-- Pair split values positionally by ordering ROW_NUMBER on the value's
-- CHARINDEX position within the delimited string (',' padding avoids
-- partial-token matches).
-- NOTE(review): as the author warns below, non-adjacent duplicate values
-- all match the first occurrence, breaking the ordering.
SELECT tbl.ID, t1.Column1Value, t2.Column2Value
FROM myTable tbl INNER JOIN
(SELECT t.ID, ss.value AS Column1Value,
ROW_NUMBER() OVER (PARTITION BY t.ID
ORDER BY CHARINDEX(',' + ss.value + ',', ',' + t.column1 + ',')
) as StringOrder
FROM myTable t CROSS APPLY
STRING_SPLIT(t.column1,',') ss
) t1
ON tbl.ID = t1.ID INNER JOIN
(SELECT t.ID, ss.value AS Column2Value,
ROW_NUMBER() OVER (PARTITION BY t.ID
ORDER BY CHARINDEX(',' + ss.value + ',', ',' + t.column2 + ',')
) as StringOrder
FROM myTable t CROSS APPLY
STRING_SPLIT(t.column2, ',') ss
) t2
ON tbl.ID = t2.ID AND t1.StringOrder = t2.StringOrder;
Note: This may not work as desired if the strings have non-adjacent duplicates.
I'm a little late to this question, but I was just attempting the same thing with string_split since I've run into a performance problem of late. My experience with string splitters in T-SQL has led me to use recursive CTE's for most things containing fewer than 1,000 delimited values. Ideally, a CLR procedure would be used if you need ordinal in your string split.
That said, I've come to a similar conclusion as you on getting ordinal from string_split. You can see the queries and statistics below which, in order, are the bare string_split function, a CTE RowNumber of string_split, and then my personal string split CTE function I derived from this awesome write-up. The main difference between my CTE-based function and the one in the write-up is I made it an Inline-TVF instead of their implementation of a MultiStatement-TVF, which you can read about the differences here.
In my experiments I haven't seen a deviation using ROW_NUMBER on a constant returning the internal order of the delimited string, so I will be using it until such time as I find a problem with it, but if order is imperative in a business setting, I would probably recommend the Moden splitter featured in the first link above, which links to the author's article here since it is right in-line with the performance seen by the less safe string_split with RowNumber approach.
-- Benchmark four splitters over a ~1MB CSV string.
-- (Fixed: variable markers were mangled to '#' and '@@rowcount' to
-- '##rowcount'; T-SQL variables use '@', system functions '@@'.)
set nocount on;
-- Build the test string: blocks of "<iter>," repeated 8000 times per pass
declare
@iter int = 0,
@rowcount int,
@val varchar(max) = '';
while len(@val) < 1e6
select
@val += replicate(concat(@iter, ','), 8e3),
@iter += 1;
raiserror('Begin string_split Built-In', 0, 0) with nowait;
set statistics time, io on;
select
*
from
string_split(@val, ',')
where
[value] > '';
select
@rowcount = @@rowcount;
set statistics time, io off;
print '';
raiserror('End string_split Built-In | Return %d Rows', 0, 0, @rowcount) with nowait;
print '';
raiserror('Begin string_split Built-In with RowNumber', 0, 0) with nowait;
set statistics time, io on;
-- Number string_split output via ROW_NUMBER over a constant [group] column
with cte
as (
select
*,
[group] = 1
from
string_split(@val, ',')
where
[value] > ''
),
cteCount
as (
select
*,
[id] = row_number() over (order by [group])
from
cte
)
select
*
from
cteCount;
select
@rowcount = @@rowcount;
set statistics time, io off;
print '';
raiserror('End string_split Built-In with RowNumber | Return %d Rows', 0, 0, @rowcount) with nowait;
print '';
raiserror('Begin Moden String Splitter', 0, 0) with nowait;
set statistics time, io on;
select
*
from
dbo.SplitStrings_Moden(@val, ',')
where
item > '';
select
@rowcount = @@rowcount;
set statistics time, io off;
print '';
raiserror('End Moden String Splitter | Return %d Rows', 0, 0, @rowcount) with nowait;
print '';
raiserror('Begin Recursive CTE String Splitter', 0, 0) with nowait;
set statistics time, io on;
select
*
from
dbo.fn_splitByDelim(@val, ',')
where
strValue > ''
option
(maxrecursion 0);
select
@rowcount = @@rowcount;
set statistics time, io off;
Statistics being
Begin string_split Built-In
SQL Server Execution Times:
CPU time = 2000 ms, elapsed time = 5325 ms.
SQL Server Execution Times:
CPU time = 0 ms, elapsed time = 0 ms.
End string_split Built-In | Return 331940 Rows
Begin string_split Built-In with RowNumber
SQL Server Execution Times:
CPU time = 2094 ms, elapsed time = 8119 ms.
SQL Server Execution Times:
CPU time = 0 ms, elapsed time = 0 ms.
End string_split Built-In with RowNumber | Return 331940 Rows
Begin Moden String Splitter
SQL Server parse and compile time:
CPU time = 0 ms, elapsed time = 6 ms.
SQL Server Execution Times:
CPU time = 8734 ms, elapsed time = 9009 ms.
SQL Server Execution Times:
CPU time = 0 ms, elapsed time = 0 ms.
End Moden String Splitter | Return 331940 Rows
Begin Recursive CTE String Splitter
Table 'Worktable'. Scan count 2, logical reads 1991648, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
SQL Server Execution Times:
CPU time = 147188 ms, elapsed time = 147480 ms.
SQL Server Execution Times:
CPU time = 0 ms, elapsed time = 0 ms.
End Recursive CTE String Splitter | Return 331940 Rows
-- NOTE(review): two independent CROSS APPLY splits produce a full cross
-- product per row (m*n output rows) — this does NOT pair values by position.
SELECT
PrimaryKeyID ,t2.items as column1, t1.items as column2 from [YourTableName]
cross Apply [dbo].[Split](column2) as t1
cross Apply [dbo].[Split](column1) as t2
Mark, here is a solution I would use. Assuming that [column 1] in your table has the "key" values that are more or less stable, and [column2] has corresponding "field" values that can sometimes be omitted or NULL:
There will be two extractions, one for [column 1] - which I assume is the Key, another for [column 2] - which I assume is the sort of "values" for the "key", they will be auto parsed then by STRING_SPLIT function.
These two INDEPENDENT result-sets will be then re-numbered based on the time of operation (which is always sequential). Take note, we renumber not by the field content or position of the comma etc, BUT by the timestamp.
Then they will get joined back together by LEFT OUTER JOIN; note not by INNER JOIN due to the fact that our "field values" could get omitted, while "keys" will always be there
Below is the TSQL code, as this is my first post to this site, hope it looks ok:
-- Pair key/value splits by row-numbering each side, joined on ID + row number.
-- NOTE(review): current_timestamp is constant for the whole statement, so
-- ORDER BY current_timestamp is an arbitrary ordering — the positional
-- pairing is not guaranteed, despite the author's claim.
SELECT T1.ID, T1.KeyValue, T2.FieldValue
from (select t1.ID, row_number() OVER (PARTITION BY t1.ID ORDER BY current_timestamp) AS KeyRow, t2.value AS KeyValue
from myTable t1
CROSS APPLY STRING_SPLIT(t1.column1,',') as t2) T1
LEFT OUTER JOIN
(select t1.ID, row_number() OVER (PARTITION BY t1.ID ORDER BY current_timestamp) AS FieldRow, t3.value AS FieldValue
from myTable t1
CROSS APPLY STRING_SPLIT(t1.column2,',') as t3) T2 ON T1.ID = T2.ID AND T1.KeyRow = T2.FieldRow
This is very simple
-- Load split values into a temp table with an IDENTITY id.
-- NOTE(review): without an ORDER BY on the INSERT...SELECT, the identity
-- assignment order from STRING_SPLIT is not guaranteed.
CREATE TABLE #a(
id [INT] IDENTITY(1,1) NOT NULL,
OrgId INT )
INSERT INTO #a
(
OrgId
)
SELECT value FROM STRING_SPLIT('18,44,45,46,47,48,49,50,51,52,53', ',')
Select * from #a
Here is a t-sql function that uses string_split and adds the ordinal column:
-- TVF wrapper over string_split that adds an Ordinal column via IDENTITY.
-- (Fixed: parameter/variable markers were mangled to '#'; T-SQL requires '@'.)
-- NOTE(review): identity assignment order from string_split without an
-- ORDER BY is implementation-dependent — Ordinal is not guaranteed correct
-- on versions lacking the enable_ordinal flag.
drop function if exists [dbo].[varchar_split2];
go
create function [dbo].[varchar_split2]
(
@text varchar(max),
@delimiter char(1) = ','
)
returns @result table ([Ordinal] int not null identity(1, 1) primary key, [Value] varchar(128) not null)
as
begin
-- Empty tokens are skipped
insert @result ([Value])
select
[Value]
from
string_split(@text, @delimiter)
where
0 != len([Value])
;
return;
end;
go
go

Performance gap using sub query with STGeomFromText

I'm using a geometric table, with polygons inside.
The problem is the point I try to match is stored in a table, and I can't get the same performance using one query instead of two :
-- Three ways to pass the point to STContains.
-- (Fixed: variable marker was mangled to '#'; T-SQL variables use '@'.)
-- this is the base / best time
SELECT *
FROM dbo.table1
WHERE geomField.STContains(
GEOMETRY::STGeomFromText(
'POINT(6.82 7.21)'
,0)
) = 1
-- this is more or less the same as the previous
-- (point is built once into a variable before the query runs)
DECLARE @g GEOMETRY = GEOMETRY::STGeomFromText(
(select top 1 'POINT(6.82 7.21)')
,0);
SELECT *
FROM dbo.table1
WHERE geomField.STContains(@g) = 1
-- this is slow as hell
-- (the scalar subquery makes the argument non-constant at compile time,
-- which can prevent efficient use of the spatial index)
SELECT *
FROM dbo.table1
WHERE geomField.STContains(
GEOMETRY::STGeomFromText(
(select top 1 'POINT(6.82 7.21)')
,0)
) = 1
Is there any way to improve the last one? (I'm using EXEC sp_executesql in the backend, and the 2nd option would mean using a stored procedure)

Invert The Sequence

This is a question that was asked to me during an interview. For a Table like this (Which may have hundreds or thousands of records of this kind)
what is the best way to revert the Seq so that A will be seq 4 and B will be seq 1 like this :
I gave him the below query (I used CTE Just to create the sample scenario. In the original case it was a table ) :
-- Sample rows built inline; in the real case CTE would be a base table.
WITH CTE
AS
(
SELECT
Seq = 1,Nm = 'A'
UNION
SELECT
Seq = 2,Nm = 'A'
UNION
SELECT
Seq = 3,Nm = 'B'
UNION
SELECT
Seq = 4,Nm = 'B'
)
-- NOTE(review): the result has two columns both named Seq (the reversed
-- ROW_NUMBER and the original) — legal in a plain SELECT, but rename one
-- for clarity.
SELECT
Seq = ROW_NUMBER() OVER(ORDER BY Seq DESC),
Seq,
Nm
FROM CTE;
are there any alternative Dynamic queries of achieving the same in a more efficient way?
How about?
-- NOTE(review): hard-coded to the sample data — it swaps the 'A'/'B' labels
-- rather than reversing seq, so it only works for this exact table content.
select seq, (case when nm = 'B' then 'A' else 'B' end) as nm
from t;
If I save the count of the table to a declared variable, then I can subquery to get a reversed 2nd column. The SQL Server Execution plan indicates the batch cost is less than with ROWNUMBER().
-- Reverse Nm against seq by joining each row to its mirror (count+1-seq).
-- (Fixed: table-variable/variable markers were mangled to '#'; T-SQL table
-- variables and locals require '@'.)
DECLARE @tablename TABLE(seq int PRIMARY KEY, Nm char(1))
INSERT INTO @tablename VALUES (1,'A'), (2,'A'), (3,'B'), (4,'B')
DECLARE @count int = (SELECT COUNT(*) FROM @tablename)
SELECT seq
,(SELECT Nm
FROM @tablename T2
WHERE T2.seq = @count +1 - T1.seq) AS [Nm]
FROM @tablename T1
An alternative query, not using an OVER-clause
-- Reverse by mirroring: MAX(Seq) - Seq + 1; no window function needed.
SELECT
(SELECT MAX(mytable.Seq) FROM mytable) - mytable.Seq + 1 As Seq,
Nm
FROM mytable
ORDER BY mytable.Seq DESC
I don't think it's appropriate to help with interview questions here, but since this one is so unrealistic, and there are so many complicated answers here, I pose this much simpler solution:
-- Reverse the sequence in place: new Seq = max - old + 1.
-- (Fixed: variable marker was mangled to '#'; T-SQL variables use '@'.)
DECLARE @MaxSeq INT;
SELECT @MaxSeq = MAX(Seq) FROM Table;
UPDATE Table
Set Seq = @MaxSeq - Seq + 1

Stored procedure becomes slow every couple of days

I am facing an issue on SQL Server in which my stored procedure becomes slow after couple of days.
Below is the sample of my stored procedure.
Could this be a caching issue on the server side? Can I increase the server's cache size to resolve the problem?
Normally the stored procedure returns data in one second.
-- NOTE(review): reconstructed from a partial sample — the CREATE PROCEDURE
-- header is not shown. Fixed here: parameter-list commas, an unbalanced
-- parenthesis in the WHERE clause, the undeclared @count variable, and
-- '@' markers (mangled to '#' in the paste).
@START_VALUE INT = NULL,
@END_VALUE INT = NULL,
@UID NVARCHAR(MAX) = NULL
AS
BEGIN
DECLARE @count INT;
SELECT
dbo.TABLE1.ID,
ROW_NUMBER() OVER (ORDER BY TABLE1.UPDATED_ON desc) AS RN,
CONVERT(VARCHAR(10), dbo.TABLE1.DATE, 101) AS TDATE,
CATEGORY = (
SELECT TOP 1 COLUMN1
FROM TABLE5 CT1
WHERE TABLE1.CATEGORY = CT1.CATEGORY_ID
),
TYPETEXT = (
SELECT TOP 1 COLUMN1
FROM TABLE6 CT1
WHERE TABLE1.TYPE = CT1.TYPE_ID
),
-- Comma-separated list of related C1 values via FOR XML PATH
IMAGE = STUFF(( SELECT DISTINCT ',' + CAST(pm.C1 AS varchar(12))
FROM TABLE2 pm
WHERE pm.ID = TABLE1.ID AND pm.C1 IS NOT NULL AND pm.C1 <> ''
FOR XML PATH('')),
1, 1, '' ) INTO #tempRecords
FROM dbo.TABLE1
-- NOTE(review): the (@UID IS NULL OR ...) pattern yields one shared plan
-- for both cases; consider OPTION (RECOMPILE) or dynamic SQL (see answer).
WHERE (@UID is null OR dbo.TABLE1.ID = @UID)
ORDER BY TABLE1.UPDATED DESC
SELECT @count = COUNT(*) FROM #tempRecords;
-- Page the numbered rows and append the total row count to every row
SELECT *, CONVERT([int],@count) AS 'TOTAL_RECORDS'
FROM #tempRecords
WHERE #tempRecords.RN BETWEEN CONVERT([bigint], @START_VALUE) AND CONVERT([bigint], @END_VALUE)
END
GO
'
A few performance tips:
1) #UID is null OR dbo.TABLE1.ID = #UID --> this is bad because you'll have one execution plan when UID is null and when it's not. Build a dynamic sql query and you'll get 2 execution plans.
2) Update stats in a maintenance plan.
3) Check index fragmentation.
4) Try to do the same thing without using a temp table.
5) Try to avoid castings.

Loop through sql result set and remove [n] duplicates

I've got a SQL Server db with quite a few dupes in it. Removing the dupes manually is just not going to be fun, so I was wondering if there is any sort of sql programming or scripting I can do to automate it.
Below is my query that returns the ID and the Code of the duplicates.
-- List every row whose Code appears more than once in Table1
-- (ID plus the duplicated Code value).
SELECT t.ID, t.Code
FROM Table1 AS t
INNER JOIN (
    SELECT Code
    FROM Table1
    GROUP BY Code
    HAVING COUNT(Code) > 1
) AS dup
    ON dup.Code = t.Code
I'll get a return like this, for example:
5163 51727
5164 51727
5165 51727
5166 51728
5167 51728
5168 51728
This snippet shows three returns for each ID/Code (so a primary "good" record and two dupes). However this isnt always the case. There can be up to [n] dupes, although 2-3 seems to be the norm.
I just want to somehow loop through this result set and delete everything but one record. THE RECORDS TO DELETE ARE ARBITRARY, as any of them can be "kept".
You can use row_number to drive your delete.
ie
-- Demo schema (empty; illustrates the pattern).
CREATE TABLE #table1
(id INT,
code int
);
-- Number rows within each code group, then delete everything past the first —
-- keeps one arbitrary row (lowest id) per code.
WITH cte AS
(select a.ID, a.Code, ROW_NUMBER() OVER(PARTITION by COdE ORDER BY ID) AS rn
from #Table1 a
)
DELETE x
FROM #table1 x
JOIN cte ON x.id = cte.id
WHERE cte.rn > 1
But...
If you are going to be doing a lot of deletes from a very large table you might be better off to select out the rows you need into a temp table & then truncate your table and re-insert the rows you need.
Keeps the Transaction log from getting hammered, your CI getting Fragged and should be quicker too!
It is actually very simple:
-- Keep only the highest-ID row per CODE; delete the rest.
-- NOTE(review): NOT IN is safe here only because MAX(ID) per group is never
-- NULL for rows being tested; NOT EXISTS is the more defensive form.
DELETE FROM Table1
WHERE ID NOT IN
(SELECT MAX(ID)
FROM Table1
GROUP BY CODE)
Self join solution with a performance test VS cte.
-- Performance comparison: duplicate removal via self LEFT JOIN vs ROW_NUMBER
-- CTE over a 1M-row table of random codes.
-- (Fixed: variable markers were mangled to '#'; T-SQL variables use '@'.)
create table codes(
id int IDENTITY(1,1) NOT NULL,
code int null,
CONSTRAINT [PK_codes_id] PRIMARY KEY CLUSTERED
(
id ASC
))
-- Seed 1,000,000 rows of random codes in [0, 999]
declare @counter int, @code int
set @counter = 1
set @code = 1
while (@counter <= 1000000)
begin
print ABS(Checksum(NewID()) % 1000)
insert into codes(code) select ABS(Checksum(NewID()) % 1000)
set @counter = @counter + 1
end
GO
set statistics time on;
-- Keep MIN(id) per code; delete every row not matched to its group's minimum
delete a
from codes a left join(
select MIN(id) as id from codes
group by code) b
on a.id = b.id
where b.id is null
set statistics time off;
--set statistics time on;
-- WITH cte AS
-- (select a.id, a.code, ROW_NUMBER() OVER(PARTITION by code ORDER BY id) AS rn
-- from codes a
-- )
-- delete x
-- FROM codes x
-- JOIN cte ON x.id = cte.id
-- WHERE cte.rn > 1
--set statistics time off;
Performance test results:
With Join:
SQL Server Execution Times:
CPU time = 3198 ms, elapsed time = 3200 ms.
(999000 row(s) affected)
With CTE:
SQL Server Execution Times:
CPU time = 4197 ms, elapsed time = 4229 ms.
(999000 row(s) affected)
It's basically done like this:
-- Delete duplicate (SalesOrderno, ItemNo) rows, keeping one per pair;
-- deleting from the CTE deletes the underlying base-table rows.
-- (Fixed: keywords were fused in the paste — 'SELECT*', 'PARTITIONBY',
-- 'DELETEFROM' are invalid/garbled T-SQL.)
WITH CTE_Dup AS
(
SELECT *,
ROW_NUMBER() OVER (PARTITION BY SalesOrderno, ItemNo ORDER BY SalesOrderno, ItemNo)
AS ROW_NO
from dbo.SalesOrderDetails
)
DELETE FROM CTE_Dup WHERE ROW_NO > 1;
NOTICE: MUST INCLUDE ALL FIELDS!!
Here is another example:
-- Demo: remove exact-duplicate rows by numbering within (C1, C2) groups
-- and deleting every row numbered past 1.
CREATE TABLE #Table (C1 INT,C2 VARCHAR(10))
INSERT INTO #Table VALUES (1,'SQL Server')
INSERT INTO #Table VALUES (1,'SQL Server')
INSERT INTO #Table VALUES (2,'Oracle')
SELECT * FROM #Table
;WITH Delete_Duplicate_Row_cte
AS (SELECT ROW_NUMBER()OVER(PARTITION BY C1, C2 ORDER BY C1,C2) ROW_NUM,*
FROM #Table )
DELETE FROM Delete_Duplicate_Row_cte WHERE ROW_NUM > 1
SELECT * FROM #Table