How to remove similar text from another value - SQL

I have a large dataset, millions of rows, containing various people and the items they're associated with. In many cases, these people's names are present in the item name as well. I would like to find the shortest substring of the item name in which the owner's name, or parts of it, is no longer present.
A sample of the data is as follows:
CREATE TABLE test ([ID] nvarchar(255), [OWNER] nvarchar(255), [ITEM] nvarchar(255))
INSERT INTO test
SELECT '1','A B C','A B X X X'
UNION ALL
SELECT '2','ABC DEF','XABCD XX X'
UNION ALL
SELECT '2','ABC DEF','YABCD X X'
UNION ALL
SELECT '3','X X X X','YPD X X'
UNION ALL
SELECT '4','XYZ','X X X'
UNION ALL
SELECT '5','A B C','OOO PPP QQQ'
With ideal output being:
ID | OWNER | ITEM | SHORT ITEM
1 | A B C | A B X X X | X X X
2 | ABC DEF | XABCD XX X | XX X
2 | ABC DEF | YABCD X X | X X
3 | X X X X | YPD X X | X X
4 | XYZ | X X X | X X X
5 | A B C | OOO PPP QQQ | PPP QQQ
This output includes a couple of cases in which I wanted to remove something from the item name that was not the owner's name, so I have hardcoded those cases into the query. I've written the following query:
;WITH p1 as( --Retrieving first word of ITEM and ITEM minus first word
SELECT SUBSTRING([ITEM],1,
case when CHARINDEX(' ',[ITEM])=0 then LEN([ITEM]) --When no space in ITEM, return ITEM
else CHARINDEX(' ', [ITEM]) -1 end) as w1p --Return the first word separated by space
,SUBSTRING([ITEM],CHARINDEX(' ',[ITEM])+1,100) as m1p --Return everything minus the first word
,[ITEM]
,[ID]
,[OWNER]
FROM test
),p2 as( --Retrieving second word of ITEM and ITEM minus second word
SELECT SUBSTRING(m1p,1,
case when CHARINDEX(' ',m1p)=0 then LEN(m1p)
else CHARINDEX(' ',m1p) -1 end) as w2p
,SUBSTRING(m1p,CHARINDEX(' ',m1p)+1,100) as m2p
,[ITEM]
,[ID]
,[w1p]
,[m1p]
FROM p1
),p3 as( --Retrieving third word of ITEM and ITEM minus third word
SELECT SUBSTRING(m2p,1,
case when CHARINDEX(' ',m2p)=0 then LEN(m2p)
else CHARINDEX(' ',m2p) -1 end) as w3p
,SUBSTRING(m2p,CHARINDEX(' ',m2p)+1,100) as m3p
,*
FROM p2
),p4 as( --Retrieving fourth word of ITEM and ITEM minus fourth word
SELECT SUBSTRING(m3p,1,
case when CHARINDEX(' ',m3p)=0 then LEN(m3p)
else CHARINDEX(' ',m3p) -1 end) as w4p
,SUBSTRING(m3p,CHARINDEX(' ',m3p)+1,100) as m4p
,*
FROM p3
),m1 as( --Retrieving first word of OWNER and OWNER minus first word
SELECT SUBSTRING([OWNER],1,
CASE WHEN CHARINDEX(' ',[OWNER])=0 THEN LEN([OWNER])
ELSE CHARINDEX(' ',[OWNER])-1 end) as w1m
,SUBSTRING([OWNER],CHARINDEX(' ',[OWNER])+1,100) as m1m
,[OWNER]
,[ID]
FROM p1
GROUP BY [OWNER], [ID]
),m2 as( --Retrieving second word of OWNER and OWNER minus second word
SELECT SUBSTRING(m1m,1,
case when CHARINDEX(' ', m1m) = 0 then LEN(m1m)
else CHARINDEX(' ', m1m) -1 end) as w2m
,SUBSTRING(m1m,CHARINDEX(' ',m1m)+1,100) as m2m
,*
FROM m1
),m3 as( --Retrieving third word of OWNER and OWNER minus third word
SELECT SUBSTRING(m2m,1,
case when CHARINDEX(' ', m2m) = 0 then LEN(m2m)
else CHARINDEX(' ', m2m) -1 end) as w3m
,SUBSTRING(m2m,CHARINDEX(' ',m2m)+1,100) as m3m
,*
FROM m2
),m4 as( --Retrieving fourth word of OWNER
SELECT SUBSTRING(m3m,1,
case when CHARINDEX(' ', m3m) = 0 then LEN(m3m)
else CHARINDEX(' ', m3m) -1 end) as w4m
,*
FROM m3
),ms as( --Adding special cases not caught by regular query
SELECT CASE WHEN [ID] IN ('3','5') THEN
CASE WHEN [ID] = '3' THEN 'YPD'
WHEN [ID] = '5' THEN 'OOO'
ELSE NULL END
ELSE NULL END as SPEC
,*
FROM m4
)
SELECT m.[ID] --Finding closest shortname
,m.[OWNER]
,p.[ITEM]
,CASE WHEN SUBSTRING(p.[ITEM],1,LEN(m.SPEC)) = SPEC AND SPEC IS NOT NULL THEN LTRIM(SUBSTRING(p.[ITEM],LEN(m.SPEC)+1,100)) --If hardcoded phrase in ITEM, return ITEM minus that phrase
WHEN p.w1p LIKE '%'+m.w1m+'%' AND p.w2p NOT LIKE '%'+m.w2m+'%' AND p.w3p NOT LIKE '%'+m.w3m+'%' AND p.w4p NOT LIKE '%'+m.w4m+'%' THEN p.m1p --If first word of ITEM match first of OWNER, return ITEM minus first
WHEN p.w1p LIKE '%'+m.w1m+'%' AND p.w2p LIKE '%'+m.w2m+'%' AND p.w3p NOT LIKE '%'+m.w3m+'%' AND p.w4p NOT LIKE '%'+m.w4m+'%' THEN p.m2p --If first two words of ITEM match first of OWNER, return ITEM minus two words
WHEN p.w1p LIKE '%'+m.w1m+'%' AND p.w2p LIKE '%'+m.w2m+'%' AND p.w3p LIKE '%'+m.w3m+'%' AND p.w4p NOT LIKE '%'+m.w4m+'%' THEN p.m3p --If first three words of ITEM match first of OWNER, return ITEM minus three words
WHEN p.w1p LIKE '%'+m.w1m+'%' AND p.w2p LIKE '%'+m.w2m+'%' AND p.w3p LIKE '%'+m.w3m+'%' AND p.w4p LIKE '%'+m.w4m+'%' THEN p.m4p --If first four words of ITEM match first of OWNER, return ITEM minus four words
ELSE p.[ITEM]
END AS [SHORT ITEM]
FROM p4 p
LEFT JOIN ms m ON p.[ID] = m.[ID]
While this achieves my goal, it does not look very nice and feels like it could be optimized. It needs a WHERE clause to run at any reasonable speed. Although I would likely not run it against the full dataset anyway, I am looking for ways to improve it. I do not have permission to view execution plans, so I cannot share one.
Thank you for any help or advice you can offer.

OK, I developed this using only the first three rows of the table. Replace #test with your table name. Let me know if this works for you.
select *, row_number() over(order by id) rowid
into #a
from #test
select *, item short_item, row_number() over(order by id) rowid
into #b
from #test
declare @iterator int=1
declare @owner varchar(max)
declare @owneroriginal varchar(max)
declare @item varchar(max)
declare @itemoriginal varchar(max)
declare @itemactualoriginal varchar(max)
while @iterator<=(select max(rowid) from #a)
begin
select @owner=[owner] from #a where rowid=@iterator
select @owneroriginal=[owner] from #a where rowid=@iterator
select @item=[item] from #a where rowid=@iterator
select @itemoriginal=[item] from #a where rowid=@iterator
select @itemactualoriginal=[item] from #a where rowid=@iterator
while @owner<>''
begin
select @owner=left(@owneroriginal, charindex(' ',@owneroriginal))
select @owneroriginal= ltrim(replace(@owneroriginal,@owner,''))
select @item=left(@itemoriginal, charindex(' ',@itemoriginal))
select @itemoriginal= ltrim(replace(@itemoriginal,@item,''))
--select @owner, @owneroriginal, @item, @itemoriginal
if @itemactualoriginal
like '%'+rtrim(@owner)+'%' and @owner<>''
begin
--select 1
update #b
set short_item=replace(short_item, rtrim(@item),'')
where rowid=@iterator
end
else if @@rowcount=0
update #b
set short_item=
case
when @owner = ''
then ltrim(replace(short_item, @owneroriginal,''))
else ltrim(replace(short_item, @owner,''))
end
where rowid=@iterator
end
set @iterator=@iterator+1
end
select id, owner, item, short_item from #b
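If you want to try this against the question's sample data, one way to create #test (my assumption, not part of the original answer) is simply:
-- Assumption: copy the sample rows from the question's test table into #test
select [ID], [OWNER], [ITEM]
into #test
from test;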

Related

SQL Create Row Number, skip certain row numbers

I am trying to create row numbers in SQL but do not want to use certain numbers.
Example Illustration:
Bob, 100
Matt, 120
Dan, 150
Bill, 156
Tim, 175
Normally in SQL, if I were to use the ROW_NUMBER() function, it would count 1, 2, 3, 4, 5.
I would instead like to skip certain numbers so that it counts 1, 2, 5, 6, 7.
Below is my code
SELECT
CASE
WHEN clean_client_nm = 'Total Opportunities'
THEN 0
ELSE ROW_NUMBER() OVER (ORDER BY clean_ft_ee DESC)
END AS row_id,
prod,
client_nm,
clean_ft_ee,
ISNULL(Med, ''),
ISNULL(Stop, ''),
ISNULL(Den, ''),
ISNULL(VIS, ''),
ISNULL(Life, ''),
ISNULL(STD, ''),
ISNULL(LTD, ''),
ISNULL(Worksite, ''),
client_sic
FROM
(SELECT * FROM TOTAL
UNION ALL
SELECT * FROM BASE) AS X
ORDER BY
CASE
WHEN clean_client_nm = 'Total Opportunities'
THEN 0
ELSE 1
END,
clean_ft_ee DESC, client_nm
The answer is to create a temp/variable table, insert the specific values the user needs, and then use the FirstCursor logic below to sequentially update the row_id fields in the target table (a minimal setup for the temp table and variables is sketched after the cursor).
declare FirstCursor cursor global for select row_num from #row_num
open FirstCursor
while @count > 0
begin
fetch FirstCursor into @row_id;
with top1 as (
select top 1 lives, row_id
from #singlecovclient
where row_id is null order by lives desc)
update top1 set row_id = @row_id
set @count = @count - 1
end
close FirstCursor
deallocate FirstCursor
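The snippet assumes #row_num, #singlecovclient, @row_id and @count already exist; a minimal setup (my sketch, not part of the original answer) using the allowed numbers 1, 2, 5, 6, 7 from the question could be:
-- Hypothetical setup for the cursor above; #singlecovclient is assumed to exist already
declare @row_id int
declare @count int
create table #row_num (row_num int)
insert into #row_num (row_num) values (1), (2), (5), (6), (7)
select @count = count(*) from #row_num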

SQL Server: Split string value with single quotations

I am trying to modify some code so that, given a string, the string is split and the pieces are passed on to be used.
This is the code that I have right now.
DECLARE @xml xml,
@str varchar(100),
@delimiter varchar(10)
SET @str = '100'
SET @delimiter = ','
SET @xml = cast(('<X>'+replace(@str, @delimiter, '</X><X>')+'</X>') as xml)
SELECT C.value('.', 'varchar(10)') as value
FROM @xml.nodes('X') as X(C)
For a single-valued string this works just fine, but I need to use more than one value, like ('100', '100A', '100B', ...).
The string values will only ever contain 3-digit numbers, 3-digit numbers followed by a letter, or 3 letters.
I also tried something else, but this is too slow.
declare @values table
(
Value varchar(1000)
)
insert into @values values ('100'),('100A'),('100B'),('100C')
Select *
from table
where myField in (select value from @values)
How can I modify the code for this requirement?
You need to create a table-valued function that you can pass your string into to split it, and then use it with a CROSS APPLY:
Function
create function [dbo].[fn_StringSplit4k]
(
@str nvarchar(4000) = ' ' -- String to split.
,@delimiter as nvarchar(20) = ',' -- Delimiting value to split on.
,@num as int = null -- Which value to return.
)
returns table
as
return
-- Start tally table with 10 rows.
with n(n) as (select 1 union all select 1 union all select 1 union all select 1 union all select 1 union all select 1 union all select 1 union all select 1 union all select 1 union all select 1)
-- Select the same number of rows as characters in @str as incremental row numbers.
-- Cross joins increase exponentially to a max possible 10,000 rows to cover largest @str length.
,t(t) as (select top (select len(isnull(@str,'')) a) row_number() over (order by (select null)) from n n1,n n2,n n3,n n4)
-- Return the position of every value that follows the specified delimiter.
--,s(s) as (select 1 union all select t+len(replace(@delimiter,' ','.')) from t where substring(isnull(@str,''),t,len(replace(@delimiter,' ','.'))) = @delimiter)
,s(s) as (select 1 union all select t+1 from t where case when @delimiter = '' and t < len(@str) then 1 else case when substring(isnull(@str,''),t,1) = @delimiter then 1 else 0 end end = 1)
-- Return the start and length of every value, to use in the SUBSTRING function.
-- ISNULL/NULLIF combo handles the last value where there is no delimiter at the end of the string.
,l(s,l) as (select s,case when @delimiter = '' then 1 else isnull(nullif(charindex(@delimiter,isnull(@str,''),s),0)-s,4000) end from s)
select rn
,item
from(select row_number() over(order by s) as rn
,substring(@str,s,l) as item
from l
) a
where rn = @num
or @num is null;
Usage
select s.item
from YourTable as t
cross apply dbo.fn_StringSplit4k(t.YourString,',',null) as s;
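For the specific case in the question, you could then filter directly against the split list; a sketch, where YourTable and myField stand in for your real table and column names:
-- Sketch: filter a column against a comma-separated list of values
select t.*
from YourTable as t
where t.myField in (select s.item from dbo.fn_StringSplit4k('100,100A,100B,100C', ',', null) as s);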

Two dimensional rank using T-SQL

This is the data I'm dealing with:
I would like to find a way, in SQL, of adding numbers to the yellow column which will rank the Names in such a way that I get the following.
Note: this is the final pivoted result; in the SQL table there is no need to pivot the data.
This ranking is decided via these rules:
The most recent week (ie Wk5 column) is the most important.
The next most recent week is next most important.
...so on to the left with the oldest week column "WK1" being the least important.
A data value that is small, e.g. 1, is best. A data value that is high, e.g. 7, is not good. A blank space is the worst and, if at all possible, should be located near the bottom of the page - but rules 1/2/3 always take precedence.
This is the data with a placeholder of 0 in the column Idx:
CREATE TABLE #values
(
Name varchar(5),
Idx int,
"Week" varchar(5),
Amount int
);
INSERT INTO #values
VALUES
('A',0,'WK1',3),
('T',0,'WK1',2),
('H',0,'WK1',1),
('P',0,'WK1',4),
('V',0,'WK1',6),
('N',0,'WK1',5),
('A',0,'WK2',2),
('F',0,'WK2',1),
('K',0,'WK2',3),
('P',0,'WK2',4),
('W',0,'WK2',7),
('V',0,'WK2',5),
('B',0,'WK2',6),
('A',0,'WK3',1),
('F',0,'WK3',2),
('T',0,'WK3',3),
('K',0,'WK3',4),
('W',0,'WK3',5),
('V',0,'WK3',6),
('N',0,'WK3',7),
('A',0,'WK4',2),
('F',0,'WK4',1),
('T',0,'WK4',5),
('K',0,'WK4',4),
('B',0,'WK4',6),
('A',0,'WK5',1),
('F',0,'WK5',2),
('T',0,'WK5',3),
('H',0,'WK5',4),
('K',0,'WK5',5);
This is my current attempt:
WITH
allData AS
(
SELECT Name,
"Week",
newRank = RANK() OVER (ORDER BY "Week" DESC,Amount)
FROM #values
)
,allData2 AS
(
SELECT *,
newRank2 = 1 / CONVERT(NUMERIC(18,10),newRank)
FROM allData
)
,allData3 AS
(
SELECT Name,
smRank = SUM(newRank2)
FROM allData2
GROUP BY Name
)
SELECT Name,
smRank,
rnk = RANK() OVER (ORDER BY smRank DESC)
INTO #RankA
FROM allData3;
UPDATE X
SET X.Idx = Y.rnk
FROM #values X
INNER JOIN #RankA Y ON
X.Name = Y.Name;
Unfortunately, if I pivot the results and then order by the Idx column, it is not in the order I am aiming for.
This is based on two nested ROW_NUMBERs:
select *,
row_number()
over (order by "Week" desc, amount)
from
(
select *,
row_number()
over (partition by name
order by "Week" desc, amount) as rn
from #values
) as dt
where rn = 1 -- for each name, find the latest week and its lowest amount
What if two names share the same week/amount? You might consider RANK or DENSE_RANK instead.
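As an illustration of that suggestion, here is the same query with the outer ROW_NUMBER swapped for RANK, so that ties on the latest week/amount share a position; this is my variation, not part of the original answer:
select *,
rank() over (order by "Week" desc, amount) as rnk -- ties now share the same rank
from
(
select *,
row_number()
over (partition by name
order by "Week" desc, amount) as rn
from #values
) as dt
where rn = 1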
Using your #values table, here is how to pivot it (since the data you provided was not in the same table format) and then assign a value to the index based on your requirements.
select *
, ROW_NUMBER() OVER(ORDER BY
CASE WHEN wk5 IS NULL THEN 1 ELSE 0 END, wk5,
CASE WHEN wk4 IS NULL THEN 1 ELSE 0 END, wk4,
CASE WHEN wk3 IS NULL THEN 1 ELSE 0 END, wk3,
CASE WHEN wk2 IS NULL THEN 1 ELSE 0 END, wk2,
CASE WHEN wk1 IS NULL THEN 1 ELSE 0 END, wk1) AS new_index
from (
select * from #values
) p
PIVOT (
MAX(Amount)
FOR [week] IN (wk1, wk2, wk3, wk4, wk5)) AS pvt
Using dynamic SQL for 52 weeks:
DECLARE @COLS AS NVARCHAR(MAX),
@QUERY AS NVARCHAR(MAX)
SELECT @COLS = STUFF(( SELECT distinct ','+QUOTENAME(C.[week])
FROM #values AS C
FOR XML PATH('')), 1, 1, '')
SET @QUERY = '
select *
, ROW_NUMBER() OVER(ORDER BY
CASE WHEN wk5 IS NULL THEN 1 ELSE 0 END, wk5,
CASE WHEN wk4 IS NULL THEN 1 ELSE 0 END, wk4,
CASE WHEN wk3 IS NULL THEN 1 ELSE 0 END, wk3,
CASE WHEN wk2 IS NULL THEN 1 ELSE 0 END, wk2,
CASE WHEN wk1 IS NULL THEN 1 ELSE 0 END, wk1) AS new_index
from (
select * from #values
) p
PIVOT (
MAX(Amount)
FOR [week] IN (' + @cols + ')) AS pvt'
EXEC(@QUERY)

replace value in varchar(max) field with join

I have a table that contains text field with placeholders. Something like this:
Row Notes
1. This is some notes ##placeholder130## this ##myPlaceholder##, #oneMore#. End.
2. Second row...just a ##test#.
(This table contains about 1-5k rows on average. Average number of placeholders in one row is 5-15).
Now, I have a lookup table that looks like this:
Name Value
placeholder130 Dog
myPlaceholder Cat
oneMore Cow
test Horse
(Lookup table will contain anywhere from 10k to 100k records)
I need to find the fastest way to join those placeholders from strings to a lookup table and replace with value. So, my result should look like this (1st row):
This is some notes Dog this Cat, Cow. End.
What I came up with was to split each row into multiple rows, one per placeholder, join those to the lookup table, and then concatenate the records back into the original rows with the new values, but it takes around 10-30 seconds on average.
You could try to split the string using a numbers table and rebuild it with for xml path.
select (
select coalesce(L.Value, T.Value)
from Numbers as N
cross apply (select substring(Notes.notes, N.Number, charindex('##', Notes.notes + '##', N.Number) - N.Number)) as T(Value)
left outer join Lookup as L
on L.Name = T.Value
where N.Number <= len(notes) and
substring('##' + notes, Number, 2) = '##'
order by N.Number
for xml path(''), type
).value('text()[1]', 'varchar(max)')
from Notes
SQL Fiddle
I borrowed the string splitting from this blog post by Aaron Bertrand
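The query above assumes a Numbers table with sequential integers covering at least the longest note; if you don't have one, a minimal sketch for building it (my addition, not part of the original answer) is:
-- Sketch: build a Numbers table with values 1..10000 from a cross join of system views
select top (10000) Number = row_number() over (order by (select null))
into Numbers
from sys.all_objects as a
cross join sys.all_objects as b;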
SQL Server is not very fast with string manipulation, so this is probably best done client-side. Have the client load the entire lookup table and replace the notes as they arrive.
Having said that, it can of course be done in SQL. Here's a solution with a recursive CTE. It performs one lookup per recursion step:
; with Repl as
(
select row_number() over (order by l.name) rn
, Name
, Value
from Lookup l
)
, Recurse as
(
select Notes
, 0 as rn
from Notes
union all
select replace(Notes, '##' + l.name + '##', l.value)
, r.rn + 1
from Recurse r
join Repl l
on l.rn = r.rn + 1
)
select *
from Recurse
where rn =
(
select count(*)
from Lookup
)
option (maxrecursion 0)
Example at SQL Fiddle.
Another option is a while loop to keep replacing lookups until no more are found:
declare @notes table (notes varchar(max))
insert @notes
select Notes
from Notes
while 1=1
begin
update n
set Notes = replace(n.Notes, '##' + l.name + '##', l.value)
from @notes n
outer apply
(
select top 1 Name
, Value
from Lookup l
where n.Notes like '%##' + l.name + '##%'
) l
where l.name is not null
if @@rowcount = 0
break
end
select *
from @notes
Example at SQL Fiddle.
I second the comment that T-SQL is just not suited for this operation, but if you must do it in the database, here is an example using a function to manage the multiple replace statements.
Since you have a relatively small number of tokens in each note (5-15) and a very large number of tokens overall (10k-100k), my function first extracts the potential tokens from the input and uses that set to join to your lookup (dbo.Lookup below). It was far too much work to look for an occurrence of every one of your tokens in each note.
I did a bit of perf testing using 50k tokens and 5k notes and this function runs really well, completing in <2 seconds (on my laptop). Please report back how this strategy performs for you.
note: In your example data the token format was not consistent (##_#, ##_##, #_#), I am guessing this was simply a typo and assume all tokens take the form of ##TokenName##.
--setup
if object_id('dbo.[Lookup]') is not null
drop table dbo.[Lookup];
go
if object_id('dbo.fn_ReplaceLookups') is not null
drop function dbo.fn_ReplaceLookups;
go
create table dbo.[Lookup] (LookupName varchar(100) primary key, LookupValue varchar(100));
insert into dbo.[Lookup]
select '##placeholder130##','Dog' union all
select '##myPlaceholder##','Cat' union all
select '##oneMore##','Cow' union all
select '##test##','Horse';
go
create function [dbo].[fn_ReplaceLookups](@input varchar(max))
returns varchar(max)
as
begin
declare @xml xml;
select @xml = cast(('<r><i>'+replace(@input,'##' ,'</i><i>')+'</i></r>') as xml);
--extract the potential tokens
declare @LookupsInString table (LookupName varchar(100) primary key);
insert into @LookupsInString
select distinct '##'+v+'##'
from ( select [v] = r.n.value('(./text())[1]', 'varchar(100)'),
[r] = row_number() over (order by n)
from @xml.nodes('r/i') r(n)
)d(v,r)
where r%2=0;
--tokenize the input
select @input = replace(@input, l.LookupName, l.LookupValue)
from dbo.[Lookup] l
join @LookupsInString lis on
l.LookupName = lis.LookupName;
return @input;
end
go
--usage
declare @Notes table ([Id] int primary key, notes varchar(100));
insert into @Notes
select 1, 'This is some notes ##placeholder130## this ##myPlaceholder##, ##oneMore##. End.' union all
select 2, 'Second row...just a ##test##.';
select *,
dbo.fn_ReplaceLookups(notes)
from @Notes;
Returns:
Tokenized
--------------------------------------------------------
This is some notes Dog this Cat, Cow. End.
Second row...just a Horse.
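To persist the result rather than just select it, a sketch of the corresponding update (assuming your real table is named Notes with a notes column, as in the question):
-- Sketch: replace the placeholders in place on the real table
update n
set notes = dbo.fn_ReplaceLookups(n.notes)
from Notes as n;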
Try this
;WITH CTE (org, calc, [Notes], [level]) AS
(
SELECT [Notes], [Notes], CONVERT(varchar(MAX),[Notes]), 0 FROM PlaceholderTable
UNION ALL
SELECT CTE.org, CTE.[Notes],
CONVERT(varchar(MAX), REPLACE(CTE.[Notes],'##' + T.[Name] + '##', T.[Value])), CTE.[level] + 1
FROM CTE
INNER JOIN LookupTable T ON CTE.[Notes] LIKE '%##' + T.[Name] + '##%'
)
SELECT DISTINCT org, [Notes], level FROM CTE
WHERE [level] = (SELECT MAX(level) FROM CTE c WHERE CTE.org = c.org)
SQL FIDDLE DEMO
Check this devioblog post for reference.
To get speed, you can preprocess the note templates into a more efficient form. This will be a sequence of fragments, with each ending in a substitution. The substitution might be NULL for the last fragment.
Notes
Id | FragSeq | Text | SubsId
1 | 1 | 'This is some notes ' | 1
1 | 2 | ' this ' | 2
1 | 3 | ', ' | 3
1 | 4 | '. End.' | null
2 | 1 | 'Second row...just a ' | 4
2 | 2 | '.' | null
Subs
Id | Name | Value
1 | 'placeholder130' | 'Dog'
2 | 'myPlaceholder' | 'Cat'
3 | 'oneMore' | 'Cow'
4 | 'test' | 'Horse'
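A sketch of table definitions matching that layout (my assumption of the shapes; note the answer reuses the name Notes for the fragment table):
-- Hypothetical DDL for the preprocessed structures shown above
create table Subs (Id int primary key, Name varchar(100), Value varchar(100));
create table Notes (Id int, FragSeq int, Text varchar(max), SubsId int null);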
Now we can do the substitutions with a simple join.
SELECT Notes.Text + COALESCE(Subs.Value, '')
FROM Notes LEFT JOIN Subs
ON SubsId = Subs.Id WHERE Notes.Id = ?
ORDER BY FragSeq
This produces a list of fragments with substitutions complete. I am not an MS SQL user, but in most dialects of SQL you can concatenate these fragments in a variable quite easily:
DECLARE @Note VARCHAR(8000)
SELECT @Note = COALESCE(@Note, '') + Notes.Text + COALESCE(Subs.Value, '')
FROM Notes LEFT JOIN Subs
ON SubsId = Subs.Id WHERE Notes.Id = ?
ORDER BY FragSeq
Pre-processing a note template into fragments will be straightforward using the string splitting techniques of other posts.
Unfortunately I'm not at a location where I can test this, but it ought to work fine.
I really don't know how it will perform with 10k+ lookups.
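Since ordered variable concatenation is not guaranteed in SQL Server, here is a sketch of the same concatenation using FOR XML PATH instead (my variation, not the author's code; 42 is a placeholder note id):
-- Sketch: ordered concatenation of one note's fragments without the variable trick
SELECT (
SELECT Notes.Text + COALESCE(Subs.Value, '')
FROM Notes LEFT JOIN Subs ON Notes.SubsId = Subs.Id
WHERE Notes.Id = 42
ORDER BY Notes.FragSeq
FOR XML PATH(''), TYPE
).value('.', 'varchar(max)') AS Note;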
How does the good old dynamic SQL perform?
DECLARE @sqlCommand NVARCHAR(MAX)
SELECT @sqlCommand = N'PlaceholderTable.[Notes]'
SELECT @sqlCommand = 'REPLACE( ' + @sqlCommand +
', ''##' + LookupTable.[Name] + '##'', ''' +
LookupTable.[Value] + ''')'
FROM LookupTable
SELECT @sqlCommand = 'SELECT *, ' + @sqlCommand + ' FROM PlaceholderTable'
EXECUTE sp_executesql @sqlCommand
Fiddle demo
And now for some recursive CTE.
If your indexes are correctly set up, this one should be very fast or very slow. SQL Server always surprises me with performance extremes when it comes to the r-CTE...
;WITH T AS (
SELECT
Row,
StartIdx = 1, -- 1 as first starting index
EndIdx = CAST(patindex('%##%', Notes) as int), -- first ending index
Result = substring(Notes, 1, patindex('%##%', Notes) - 1)
-- (first) temp result bounded by indexes
FROM PlaceholderTable -- **this is your source table**
UNION ALL
SELECT
pt.Row,
StartIdx = newstartidx, -- starting index (calculated in calc1)
EndIdx = EndIdx + CAST(newendidx as int) + 1, -- ending index (calculated in calc4 + total offset)
Result = Result + CAST(ISNULL(newtokensub, newtoken) as nvarchar(max))
-- temp result taken from subquery or original
FROM
T
JOIN PlaceholderTable pt -- **this is your source table**
ON pt.Row = T.Row
CROSS APPLY(
SELECT newstartidx = EndIdx + 2 -- new starting index moved by 2 from last end ('##')
) calc1
CROSS APPLY(
SELECT newtxt = substring(pt.Notes, newstartidx, len(pt.Notes))
-- current piece of txt we work on
) calc2
CROSS APPLY(
SELECT patidx = patindex('%##%', newtxt) -- current index of '##'
) calc3
CROSS APPLY(
SELECT newendidx = CASE
WHEN patidx = 0 THEN len(newtxt) + 1
ELSE patidx END -- if last piece of txt, end with its length
) calc4
CROSS APPLY(
SELECT newtoken = substring(pt.Notes, newstartidx, newendidx - 1)
-- get the new token
) calc5
OUTER APPLY(
SELECT newtokensub = Value
FROM LookupTable
WHERE Name = newtoken -- substitute the token if you can find it in **your lookup table**
) calc6
WHERE newstartidx + len(newtxt) - 1 <= len(pt.Notes)
-- recurse while {new starting index} + {length of txt we work on} does not exceed the total length
)
,lastProcessed AS (
SELECT
Row,
Result,
rn = row_number() over(partition by Row order by StartIdx desc)
FROM T
) -- enumerate all (including intermediate) results
SELECT *
FROM lastProcessed
WHERE rn = 1 -- filter out intermediate results (display only last ones)

Grouping runs of data

SQL Experts,
Is there an efficient way to group runs of data together using SQL?
Or is it going to be more efficient to process the data in code?
For example if I have the following data:
ID|Name
01|Harry Johns
02|Adam Taylor
03|John Smith
04|John Smith
05|Bill Manning
06|John Smith
I need to display this:
Harry Johns
Adam Taylor
John Smith (2)
Bill Manning
John Smith
@Matt: Sorry, I had trouble formatting the data using an embedded HTML table; it worked in the preview but not in the final display.
Try this:
select n.name,
(select count(*)
from myTable n1
where n1.name = n.name and n1.id >= n.id and (n1.id <=
(
select isnull(min(nn.id), (select max(id) + 1 from myTable))
from myTable nn
where nn.id > n.id and nn.name <> n.name
)
))
from myTable n
where not exists (
select 1
from myTable n3
where n3.name = n.name and n3.id < n.id and n3.id > (
select isnull(max(n4.id), (select min(id) - 1 from myTable))
from myTable n4
where n4.id < n.id and n4.name <> n.name
)
)
I think that'll do what you want. Bit of a kludge though.
Phew! After a few edits I think I have all the edge cases sorted out.
I hate cursors with a passion... but here's a dodgy cursor version...
Declare @NewName Varchar(50)
Declare @OldName Varchar(50)
Declare @CountNum int
Set @CountNum = 0
DECLARE nameCursor CURSOR FOR
SELECT Name
FROM NameTest
OPEN nameCursor
FETCH NEXT FROM nameCursor INTO @NewName
WHILE @@FETCH_STATUS = 0
BEGIN
if @OldName <> @NewName
BEGIN
Print @OldName + ' (' + Cast(@CountNum as Varchar(50)) + ')'
Set @CountNum = 0
END
SELECT @OldName = @NewName
FETCH NEXT FROM nameCursor INTO @NewName
Set @CountNum = @CountNum + 1
END
Print @OldName + ' (' + Cast(@CountNum as Varchar(50)) + ')'
CLOSE nameCursor
DEALLOCATE nameCursor
My solution, just for kicks (this was a fun exercise): no cursors, no iterations, but I do have a helper field.
-- Setup test table
DECLARE @names TABLE (
id INT IDENTITY(1,1),
name NVARCHAR(25) NOT NULL,
grp UNIQUEIDENTIFIER NULL
)
INSERT @names (name)
SELECT 'Harry Johns' UNION ALL
SELECT 'Adam Taylor' UNION ALL
SELECT 'John Smith' UNION ALL
SELECT 'John Smith' UNION ALL
SELECT 'Bill Manning' UNION ALL
SELECT 'Bill Manning' UNION ALL
SELECT 'Bill Manning' UNION ALL
SELECT 'John Smith' UNION ALL
SELECT 'Bill Manning'
-- Set the first id's group to a newid()
UPDATE n
SET grp = newid()
FROM @names n
WHERE n.id = (SELECT MIN(id) FROM @names)
-- Set the group to a newid() if the name does not equal the previous
UPDATE n
SET grp = newid()
FROM @names n
INNER JOIN @names b
ON (n.ID - 1) = b.ID
AND ISNULL(b.Name, '') <> n.Name
-- Set groups that are null to the previous group
-- Keep on doing this until all groups have been set
WHILE (EXISTS(SELECT 1 FROM @names WHERE grp IS NULL))
BEGIN
UPDATE n
SET grp = b.grp
FROM @names n
INNER JOIN @names b
ON (n.ID - 1) = b.ID
AND n.grp IS NULL
END
-- Final output
SELECT MIN(id) AS id_start,
MAX(id) AS id_end,
name,
count(1) AS consecutive
FROM @names
GROUP BY grp,
name
ORDER BY id_start
/*
Results:
id_start id_end name consecutive
1 1 Harry Johns 1
2 2 Adam Taylor 1
3 4 John Smith 2
5 7 Bill Manning 3
8 8 John Smith 1
9 9 Bill Manning 1
*/
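For comparison, the same run-grouping can be written without the helper column using the ROW_NUMBER difference (gaps-and-islands) trick; this is my addition rather than part of the original answer, and assumes the same @names data on SQL Server 2005 or later:
-- Sketch: group consecutive runs via the difference of two row numbers
SELECT MIN(id) AS id_start,
MAX(id) AS id_end,
name,
COUNT(*) AS consecutive
FROM (
SELECT id, name,
ROW_NUMBER() OVER (ORDER BY id)
- ROW_NUMBER() OVER (PARTITION BY name ORDER BY id) AS grp
FROM @names
) AS runs
GROUP BY name, grp
ORDER BY id_start;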
Well, this:
select Name, count(Id)
from MyTable
group by Name
will give you this:
Harry Johns, 1
Adam Taylor, 1
John Smith, 2
Bill Manning, 1
and this (MS SQL syntax):
select Name +
case when ( count(Id) > 1 )
then ' ('+cast(count(Id) as varchar)+')'
else ''
end
from MyTable
group by Name
will give you this:
Harry Johns
Adam Taylor
John Smith (2)
Bill Manning
Did you actually want that other John Smith on the end of your results?
EDIT: Oh I see, you want consecutive runs grouped. In that case, I'd say you need a cursor or to do it in your program code.
How about this:
declare @tmp table (Id int, Nm varchar(50));
insert @tmp select 1, 'Harry Johns';
insert @tmp select 2, 'Adam Taylor';
insert @tmp select 3, 'John Smith';
insert @tmp select 4, 'John Smith';
insert @tmp select 5, 'Bill Manning';
insert @tmp select 6, 'John Smith';
select * from @tmp order by Id;
select Nm, count(1) from
(
select Id, Nm,
case when exists (
select 1 from @tmp t2
where t2.Nm=t1.Nm
and (t2.Id = t1.Id + 1 or t2.Id = t1.Id - 1))
then 1 else 0 end as Run
from @tmp t1
) truns group by Nm, Run
[Edit] That can be shortened a bit
select Nm, count(1) from (select Id, Nm, case when exists (
select 1 from @tmp t2 where t2.Nm=t1.Nm
and abs(t2.Id-t1.Id)=1) then 1 else 0 end as Run
from @tmp t1) t group by Nm, Run
For this particular case, all you need to do is group by the name and ask for the count, like this:
select Name, count(*)
from MyTable
group by Name
That'll get you the count for each name as a second column.
You can get it all as one column by concatenating like this:
select Name + ' (' + cast(count(*) as varchar) + ')'
from MyTable
group by Name