SQL: efficiently append incremental number to string, avoiding duplicates

I have a set of records (table [#tmp_origin]) containing duplicate entries in a string field ([Names]). I would like to insert the whole content of [#tmp_origin] into the destination table [#tmp_destination], which does NOT allow duplicates and may already contain items.
If a string in the origin table does not exist in the destination table, it is simply inserted into the destination table as is.
If an entry with the same value already exists in the destination table, a string-ified incremental number must be appended to the string before it is inserted into the destination table.
The process of moving data in this way has been implemented with a cursor, in this sample script:
-- create initial situation (origin and destination table, both containing items)
-- Begin
CREATE TABLE [#tmp_origin] ([Names] VARCHAR(10))
CREATE TABLE [#tmp_destination] ([Names] VARCHAR(10))
CREATE UNIQUE INDEX [IX_UniqueName] ON [#tmp_destination]([Names] ASC)
INSERT INTO [#tmp_origin]([Names]) VALUES ('a')
INSERT INTO [#tmp_origin]([Names]) VALUES ('a')
INSERT INTO [#tmp_origin]([Names]) VALUES ('b')
INSERT INTO [#tmp_origin]([Names]) VALUES ('c')
INSERT INTO [#tmp_destination]([Names]) VALUES ('a')
INSERT INTO [#tmp_destination]([Names]) VALUES ('a_1')
INSERT INTO [#tmp_destination]([Names]) VALUES ('b')
-- create initial situation - End
DECLARE @Name VARCHAR(10)
DECLARE NamesCursor CURSOR LOCAL FORWARD_ONLY FAST_FORWARD READ_ONLY FOR
SELECT [Names]
FROM [#tmp_origin];
OPEN NamesCursor;
FETCH NEXT FROM NamesCursor INTO @Name;
WHILE @@FETCH_STATUS = 0
BEGIN
    DECLARE @finalName VARCHAR(10)
    SET @finalName = @Name
    DECLARE @counter INT
    SET @counter = 1
    -- keep appending '_<n>' until the candidate name is free in the destination
    WHILE (1 = 1)
    BEGIN
        IF NOT EXISTS (SELECT * FROM [#tmp_destination] WHERE [Names] = @finalName)
            BREAK;
        SET @finalName = @Name + '_' + CAST(@counter AS VARCHAR)
        SET @counter = @counter + 1
    END
    INSERT INTO [#tmp_destination] ([Names]) VALUES (@finalName)
    FETCH NEXT FROM NamesCursor INTO @Name;
END
CLOSE NamesCursor;
DEALLOCATE NamesCursor;
SELECT *
FROM [#tmp_destination]
/*
Expected result:
a
a_1
a_2
a_3
b
b_1
c
*/
DROP TABLE [#tmp_origin]
DROP TABLE [#tmp_destination]
This works correctly, but performance degrades drastically as the number of items to insert increases.
Any ideas on how to speed it up?
Thanks.

Using a windowing function allows the duplicates to be numbered. You can also get the count from the destination table (it will need a WHERE condition to strip off the suffix you've added):
select orig.names,
       row_number() over (partition by orig.names order by orig.names) as rowNo,
       dest.cnt
from #tmp_origin orig
cross apply (select count(1) as cnt from #tmp_destination where names = orig.names) as dest
An insert can be built from the above (the new suffix is rowNo + dest.cnt - 1, applied when that value is greater than zero).
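Such an insert might look like the sketch below. It assumes every underscore-suffixed name in the destination was generated by this process, so a LIKE pattern can count the suffixed copies together with the exact match:
-- sketch: suffix = rowNo + cnt - 1 when > 0, otherwise insert the name as-is
INSERT INTO [#tmp_destination] ([Names])
SELECT CASE WHEN x.rowNo + x.cnt - 1 > 0
            THEN x.names + '_' + CAST(x.rowNo + x.cnt - 1 AS VARCHAR(10))
            ELSE x.names END
FROM (
    SELECT orig.names,
           ROW_NUMBER() OVER (PARTITION BY orig.names ORDER BY orig.names) AS rowNo,
           dest.cnt
    FROM [#tmp_origin] orig
    CROSS APPLY (SELECT COUNT(1) AS cnt
                 FROM [#tmp_destination]
                 WHERE names = orig.names
                    OR names LIKE orig.names + '[_]%') AS dest
) x
With the sample data this yields a_2, a_3, b_1 and c, matching the expected result.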
Suggest you refactor the destination temporary table to include the name and suffix as separate columns – this might mean having a new intermediate stage – because this will make the matching logic much simpler.

Something like this:
insert [#tmp_destination] ([Names])
select case when row_number() over (partition by Names order by Names) > 1
            then Names + '_' + convert(varchar(10), row_number() over (partition by Names order by Names))
            else Names
       end
from [#tmp_origin]

I wouldn't use a cursor in that case. Instead, I would build the query using ROW_NUMBER(). This way you add a counter to your original table, and then use this counter to append to your [Names]:
SELECT [Names], ROW_NUMBER() OVER (PARTITION BY [Names] ORDER BY [Names]) - 1 AS [counter]
INTO #tmp_origin_with_counter
FROM #tmp_origin
SELECT CONCAT([Names], IIF([counter] = 0, '', '_'+ CAST([counter] AS NVARCHAR)))
INTO #tmp_destination
FROM #tmp_origin_with_counter

Related

SQL: copying into a table in chunks?

My task is to create an index on a large table in SQL Server (~370 GB). The plan is to:
1. create a new table with the same columns,
2. create a clustered index on three columns of the new table, and
3. copy the original data into the new table in small chunks (grouped by the three columns).
I can do 1) and 2) in SQL with the following script:
SELECT TOP 0 *
INTO js_sample_indexed
FROM dbo.js_sample
CREATE CLUSTERED INDEX domain_event_platform_idx
ON dbo.js_sample_indexed (domain ASC, event_type ASC, platform ASC)
GO
But I am stuck on the third step. Presumably there are thousands of values in the index; for example, one value might be ('Amazon', 'search', 'mobile').
So I might need to put a WHERE clause in a loop, updating the selection condition each time.
But I'm stuck on how to store and retrieve the values of each column (e.g. 'domain') using SQL.
I don't know whether I've phrased this question clearly, but any comments would be helpful. Thanks!
I am assuming that there is an identity field of some sort (a sequentially numbered field used as an index) on the table. For this example, I will call this field ID. If this is true, then a simple looping construct will do what you need.
DECLARE @MinID int, @MaxID int, @Step int = 10000 -- Move 10k records per loop
SELECT @MinID = MIN(ID), @MaxID = MAX(ID)
FROM MyTableToCopyFrom
WHILE @MinID <= @MaxID
BEGIN
    INSERT INTO MyTableToCopyTo (Field1, Field2, Field3, Field4)
    SELECT Field1, Field2, Field3, Field4
    FROM MyTableToCopyFrom
    WHERE ID >= @MinID
      AND ID < @MinID + @Step
    SET @MinID = @MinID + @Step
END
So I came up with an answer after some reading and asking. Here is the code:
USE jumpshot_data
GO
DROP TABLE dbo.js_indexed
-- create a new table with existing structure
SELECT TOP 0 *
INTO dbo.js_full_indexed_1
FROM dbo.js_test
CREATE CLUSTERED INDEX domain_event_platform_idx
ON dbo.js_full_indexed_1 (domain ASC, event_type ASC, platform ASC)
GO
CREATE NONCLUSTERED INDEX device_id_idx
ON js_full_indexed_1 (device_id ASC);
-- using cursor to loop through meta-data table, and insert by chunk into the new table
DECLARE @event_type varchar(50)
DECLARE @platform varchar(50)
DECLARE @domain varchar(50)
DECLARE SelectionCursor CURSOR LOCAL FOR
SELECT * FROM dbo.js_index_info
OPEN SelectionCursor
FETCH NEXT FROM SelectionCursor INTO @event_type, @platform, @domain
WHILE (@@FETCH_STATUS = 0)
BEGIN
    -- operation at each row
    INSERT INTO dbo.js_full_indexed_1
    SELECT *
    FROM dbo.js_test
    WHERE event_type = @event_type AND domain = @domain AND platform = @platform
    -- loop condition
    FETCH NEXT FROM SelectionCursor INTO @event_type, @platform, @domain
END
CLOSE SelectionCursor
DEALLOCATE SelectionCursor
GO
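The cursor reads from a meta-data table dbo.js_index_info that isn't shown in the post; presumably it holds one row per distinct key combination, in (event_type, platform, domain) column order to match the FETCH. A minimal sketch of building it, assuming that reading:
-- hypothetical: build the meta-data table of distinct key triples for the cursor to iterate over
SELECT DISTINCT event_type, platform, domain
INTO dbo.js_index_info
FROM dbo.js_test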

SQL Loop through tables and columns to find which columns are NOT empty

I created a temp table #test containing 3 fields: ColumnName, TableName, and Id.
I would like to see which rows in the #test table (i.e., which columns in their respective tables) are not empty: for every column name in the ColumnName field, and the corresponding table in the TableName field, I would like to see whether that column is empty or not. I tried some things (see below) but didn't get anywhere. Help, please.
declare @LoopCounter INT = 1, @maxloopcounter int, @test varchar(100),
        @test2 varchar(100), @check int
set @maxloopcounter = (select count(TableName) from #test)
while @LoopCounter <= @maxloopcounter
begin
    DECLARE @PropIDs TABLE (tablename varchar(max), id int)
    Insert into @PropIDs (tablename, id)
    SELECT [tableName], id FROM #test
    where id = @LoopCounter
    set @test2 = (select columnname from #test where id = @LoopCounter)
    declare @sss varchar(max)
    set @sss = (select tablename from @PropIDs where id = @LoopCounter)
    set @check = (select count(@test2)
                  from (select tablename
                        from @PropIDs
                        where id = @LoopCounter) A
                 )
    print @test2
    print @sss
    print @check
    set @LoopCounter = @LoopCounter + 1
end
In order to use variables as column names and table names in your @check query, you will need to use dynamic SQL.
There is most likely a better way to do this, but I can't think of one offhand. Here is what I would do.
Use a SELECT to declare a cursor rather than the WHILE loop you have; that way you don't have to count on sequential ids. The cursor would fetch the fields columnname, id and tablename.
In the loop, build a dynamic SQL statement:
Set @Sql = 'Select Count(*) Cnt Into #Temp2 From ' + @TableName + ' Where ' + @ColumnName + ' Is Not Null And ' + @ColumnName + ' <> '''''
Exec(@Sql)
Then check #Temp2 for a value greater than 0; if there is one, you can use the @id that was fetched to update your #test table. Putting the result into a scalar variable rather than a temp table would be preferred, but I can't remember the best way to do that, and using a temp table lets you use an UPDATE with a JOIN, so it would work well in my opinion.
https://www.mssqltips.com/sqlservertip/1599/sql-server-cursor-example/
http://www.sommarskog.se/dynamic_sql.html
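For what it's worth, here is a minimal sketch of the scalar-variable variant the answer alludes to, using sp_executesql with an OUTPUT parameter instead of a temp table (the cursor shape and #test columns are taken from the question; treat it as illustrative, not tested):
DECLARE @tablename sysname, @columnname sysname, @id int
DECLARE @sql nvarchar(max), @check int
DECLARE cur CURSOR LOCAL FAST_FORWARD FOR
    SELECT TableName, ColumnName, Id FROM #test
OPEN cur
FETCH NEXT FROM cur INTO @tablename, @columnname, @id
WHILE @@FETCH_STATUS = 0
BEGIN
    -- QUOTENAME guards against odd identifiers; the <> '' test assumes string columns
    SET @sql = N'SELECT @cnt = COUNT(*) FROM ' + QUOTENAME(@tablename)
             + N' WHERE ' + QUOTENAME(@columnname) + N' IS NOT NULL AND '
             + QUOTENAME(@columnname) + N' <> '''''
    EXEC sp_executesql @sql, N'@cnt int OUTPUT', @cnt = @check OUTPUT
    PRINT CONCAT(@tablename, '.', @columnname, ': ', @check)
    FETCH NEXT FROM cur INTO @tablename, @columnname, @id
END
CLOSE cur
DEALLOCATE cur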
I found a way to extract all non-empty tables from the schema, then just joined with the initial temp table that I had created.
select A.tablename, B.[row_count]
from (select * from #test) A
left join
(SELECT r.table_name, r.row_count, r.[object_id]
FROM sys.tables t
INNER JOIN (
SELECT OBJECT_NAME(s.[object_id]) table_name, SUM(s.row_count) row_count, s.[object_id]
FROM sys.dm_db_partition_stats s
WHERE s.index_id in (0,1)
GROUP BY s.[object_id]
) r on t.[object_id] = r.[object_id]
WHERE r.row_count > 0 ) B
on A.[TableName] = B.[table_name]
WHERE ROW_COUNT > 0
order by b.row_count desc
How about this one: a bitmask computed column that checks for NULLability. The value of the bitmask tells you which columns are NULL, counting in base 2.
CREATE TABLE FindNullComputedMask
(ID int
,val int
,valstr varchar(3)
,NotEmpty as
CASE WHEN ID IS NULL THEN 0 ELSE 1 END
|
CASE WHEN val IS NULL THEN 0 ELSE 2 END
|
CASE WHEN valstr IS NULL THEN 0 ELSE 4 END
)
INSERT FindNullComputedMask
SELECT 1,1,NULL
INSERT FindNullComputedMask
SELECT NULL,2,NULL
INSERT FindNullComputedMask
SELECT 2,NULL, NULL
INSERT FindNullComputedMask
SELECT 3,3,3
SELECT *
FROM FindNullComputedMask
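Querying the mask is then a bitwise AND. For example, to find rows where valstr (bit value 4) is populated:
-- rows whose valstr column is non-NULL
SELECT *
FROM FindNullComputedMask
WHERE NotEmpty & 4 = 4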

Conditional SQL Insert

I have to write an insert statement that looks at a table and inserts a record if the conditions are met. This is a one-time thing, so I am not overly concerned about it being efficient.
The table contains a work breakdown structure for a project (each project having a project level (wbs1), a phase level (wbs2) and a task level (wbs3)).
The table looks like this:
Wbs1   wbs2   wbs3   name
262                  ProjectA
262    01            Data Analysis
262    01     01     Data cleansing
262    01     02     Data Transforming
I need to insert a phase (wbs2) into each project (wbs1) with an insert statement, for example adding a wbs2 "02" to each project (wbs1).
Writing the insert statement is no problem, and I select the data from the project level since most of it is redundant, so no issue there. I'm just not sure how to have it loop through and add the phase to each project, since there are multiple rows with the same project (wbs1) number.
Insert statement sample:
Insert into dbo.pr ([WBS1],[WBS2],[WBS3],[Name])
(Select [WBS1],'999',[WBS3],'In-House Expenses'
from dbo.pr where wbs1 = @ProjectID
and wbs2 = '')
How do I run this statement so it inserts a row for every project (wbs1)?
Hopefully this makes sense.
You can use a temporary table with an added RowNumber field and then a WHILE loop to handle looping over each row. You can then run an IF EXISTS as a criteria check before running the insert. See below for an example.
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
DECLARE @ProjectId NVARCHAR(50) = '262'
CREATE TABLE #Temp (RowNumber INT, wbs1 NVARCHAR(255), wbs2 NVARCHAR(255), wbs3 NVARCHAR(255), name NVARCHAR(255))
INSERT INTO #Temp
SELECT ROW_NUMBER() OVER (ORDER BY wbs1, wbs2, wbs3, name)
     , pr.*
FROM pr
SELECT *
FROM #Temp
-- Create loop variables to handle incrementing
DECLARE @Counter INT = 1;
DECLARE @MaxLoop INT = (SELECT COUNT(wbs1) FROM #Temp)
WHILE @Counter <= @MaxLoop
BEGIN
    -- Use IF EXISTS to check whether the current looped row meets whatever criteria you have
    IF EXISTS (SELECT 'true'
               FROM #Temp
               WHERE RowNumber = @Counter
                 AND wbs1 = @ProjectId
                 AND wbs2 = ''
              )
    BEGIN
        INSERT INTO pr (wbs1, wbs2, wbs3, name)
        (SELECT [WBS1], '999', [WBS3], 'In-House Expenses'
         FROM #Temp WHERE RowNumber = @Counter)
    END
    -- Remember to increment the counter
    SET @Counter = @Counter + 1;
END
SELECT *
FROM pr
DROP TABLE #Temp
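If the loop isn't required, the same thing can be done set-based in a single statement (a sketch using the question's column values; the NOT EXISTS guard is an assumption to keep the insert re-runnable):
-- one '999' phase row per project; the project-level row is the one with wbs2 = ''
INSERT INTO dbo.pr ([WBS1],[WBS2],[WBS3],[Name])
SELECT p.[WBS1], '999', p.[WBS3], 'In-House Expenses'
FROM dbo.pr p
WHERE p.wbs2 = ''
  AND NOT EXISTS (SELECT 1 FROM dbo.pr x
                  WHERE x.wbs1 = p.wbs1 AND x.wbs2 = '999')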

Slow MSSQL stored procedure processing Excel files with only 30,000 rows

I have a web application with an interface that users can upload files on. The data from the Excel file is collected, concatenated and passed to a stored procedure which processes it and returns data.
A brief explanation of the stored procedure:
The stored procedure receives the string, breaks it down using a delimiter and stores it in a temp table variable.
Another process runs through that temp table: for each row, a count is done to find the exact match count and the approximate match count by comparing each string against a view which contains all the names to compare against.
An exact match is where the exact string is found in the view, for example 'Bobby Bolonski'.
An approximate match is done using a Damerau-Levenshtein distance database function with a frequency of 2.
The results (name, exact match count and approximate match count) are stored in the final temp table @TEMP1.
A select statement is run on that last temp table to return all the data to the application.
My problem is that when I pass huge files, like an Excel file with 27,000 names, it takes about 2 hours to process and return data from the database.
I have checked both the server the application is on and the server the database is on:
On the application server, both memory and CPU usage are less than 15%.
On the database server, both memory and CPU usage are also less than 15%.
I am looking for advice on what improvements I can make to speed the process up.
Below is a copy of the stored procedure, as it is doing all the work and returning the results to the web application.
CREATE PROCEDURE [dbo].[FindMatch]
    @fullname varchar(max), @frequency int,
    @delimeter varchar(max) AS

set @frequency = 2

declare @transID bigint
SELECT @transID = ABS(CAST(CAST(NEWID() AS VARBINARY(5)) AS Bigint))

DECLARE @exactMatch int = 99
DECLARE @approximateMatch int = 99
declare @name varchar(50)
DECLARE @TEMP1 TABLE (fullname varchar(max), approxMatch varchar(max), exactmatch varchar(max))
DECLARE @ID varchar(max)

--declare a temp table
DECLARE @TEMP TABLE (ID int, fullname varchar(max), approxMatch varchar(max), exactmatch varchar(max))

--split and store the result in the @TEMP table
insert into @TEMP (ID, fullname) select * from fnSplitTest(@fullname, @delimeter)

--loop through the @TEMP table
WHILE EXISTS (SELECT ID FROM @TEMP)
BEGIN
    SELECT TOP 1 @ID = ID FROM @TEMP
    select @name = fullname from @TEMP where id = @ID

    --get the exact match count of the current row from the @TEMP table, and so on until the loop ends
    select @exactMatch = count(1) from getalldata where replace(name,',','') COLLATE Latin1_general_CI_AI = @name COLLATE Latin1_general_CI_AI

    --declare temp table @TEMP3
    DECLARE @TEMP3 TABLE (name varchar(max))

    --insert into @TEMP3 only the data that sounds similar to our search name, so as not to loop over all the data in the view
    INSERT INTO @TEMP3 (name)
    select name from getalldata where SOUNDEX(name) LIKE SOUNDEX(@name)

    --get the approximate count using the [DamLev] function.
    --this function uses the Damerau-Levenshtein distance algorithm to calculate the distance between the search string
    --and the names inserted into @TEMP3 above. Uses frequency 2 so as to eliminate all the others
    select @approximateMatch = count(1) from @TEMP3 where
        dbo.[DamLev](replace(name,',',''), @name, @frequency) <= @frequency and
        dbo.[DamLev](replace(name,',',''), @name, @frequency) > 0 and name != @name

    --insert the results into @TEMP1 at the end of every loop iteration
    insert into @TEMP1 (fullname, approxMatch, exactmatch) values (@name, @approximateMatch, @exactMatch)
    insert into FileUploadNameInsert (name) values (@name + ' ' + cast(@approximateMatch as varchar) + ' ' + cast(@exactMatch as varchar) + ', ' + cast(@transID as varchar))

    DELETE FROM @TEMP WHERE ID = @ID
    delete from @TEMP3
END

--return all the data stored in @TEMP1
select fullname, exactmatch, approxMatch, @transID as transactionID from @TEMP1
GO
In my opinion:
Use OPENROWSET to read the records directly into a pre-defined, properly indexed table in your database.
Then perform your operations using this table at the back end with pre-defined stored procedures.
It should take around 15 minutes for 30,000 rows.
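A sketch of the OPENROWSET idea (the ACE OLE DB provider, file path, sheet name and staging table are all assumptions; the provider must be installed and ad hoc distributed queries enabled on the server):
-- hypothetical staging insert; a header column named 'fullname' is assumed
INSERT INTO dbo.UploadedNames (fullname)
SELECT src.fullname
FROM OPENROWSET('Microsoft.ACE.OLEDB.12.0',
                'Excel 12.0;Database=C:\uploads\names.xlsx;HDR=YES',
                'SELECT * FROM [Sheet1$]') AS src
The matching logic can then run set-based against the staging table instead of row by row over a concatenated string.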

Update specific columns in a table iteratively (Do a bulk update)

My Table Schema is as follows:
Gender: char(1), not null
Last Name: varchar(25), null
First Name: varchar(35), not null
The data in the table looks like:
Gender | Last Name | First Name
M      | Doe       | John
F      | Marie     | Jane
M      | Jones     | Jameson
F      | Simpson   | Alice
I am now trying to update all the names in the table with the names present in the txt files.
My query is as follows:
-- Sort out the forenames we'll be using for the data; we make a #Name2 table because I have yet to figure out
-- inserting specific columns using BULK INSERT and without using a format file.
CREATE TABLE #Name (Name VARCHAR(50))
CREATE TABLE #ForeNames (FirstName VARCHAR(50), Gender VARCHAR(1))
-- Move data in the #Name2 table
BULK INSERT #Name FROM "c:\girlsforenames.txt" WITH (ROWTERMINATOR='\n')
-- Now move it to the forename table and add the gender
INSERT INTO #ForeNames SELECT [Name], 'F' FROM #Name
-- Delete the names from temporary table
TRUNCATE TABLE #Name
-- Same for the boys
BULK INSERT #Name FROM "c:\boysforenames.txt" WITH (ROWTERMINATOR='\n')
INSERT INTO #ForeNames SELECT [Name], 'M' FROM #Name
-- Now do the surnames
TRUNCATE TABLE #Name
BULK INSERT #Name FROM "c:\surnames.txt" WITH (ROWTERMINATOR='\n')
DECLARE @Counter BIGINT
SET @Counter = 4
WHILE (@Counter > 0)
BEGIN
    UPDATE TableName
    SET [last_name] = (SELECT TOP 1 FirstName FROM #ForeNames),
        [first_name] = (SELECT TOP 1 Name FROM #Name ORDER BY NEWID()),
        [gender] = (SELECT TOP 1 Gender FROM #ForeNames ORDER BY NEWID());
    SET @Counter = @Counter - 1
END
DROP TABLE #Name
DROP TABLE #ForeNames
SELECT * FROM TableName
What happens is that all the rows in the table are updated with the same values, and each time I execute the query they are updated with a new set of values.
What I want is to loop through each row, update it, and then update the next row with a different set of random names. But here it is updating the same random name across all the rows of the table.
Any help would be appreciated.
Each SELECT statement is only executed once in your example (and thus returns one result), and since your UPDATE isn't limited, you're applying the same value to every row.
If you want to update each row with different values, you can use a CTE and the ROW_NUMBER() function to number the rows and join on that.
There's no need to loop; you can do it in one fell swoop:
WITH cte AS (SELECT *,ROW_NUMBER() OVER (ORDER BY (SELECT 1)) AS n1
FROM TableName
)
UPDATE cte
SET FirstName = names.Name
FROM cte
JOIN (SELECT *,ROW_NUMBER() OVER (ORDER BY NEWID()) AS n2
FROM #name
)names
on cte.n1 = names.n2
Demo: SQL Fiddle
This example is just for the FirstName.
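The same pattern extends to the other columns by joining a second randomized row number. A sketch reusing the question's temp tables (it assumes #ForeNames and #Name each have at least as many rows as TableName):
WITH cte AS (
    SELECT *, ROW_NUMBER() OVER (ORDER BY (SELECT 1)) AS n1
    FROM TableName
)
UPDATE cte
SET first_name = f.FirstName,
    gender     = f.Gender,
    last_name  = s.Name
FROM cte
JOIN (SELECT *, ROW_NUMBER() OVER (ORDER BY NEWID()) AS n2
      FROM #ForeNames) f ON cte.n1 = f.n2
JOIN (SELECT *, ROW_NUMBER() OVER (ORDER BY NEWID()) AS n3
      FROM #Name) s ON cte.n1 = s.n3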