Join [one word per row] to rows of phrases with [multiple words per row] - sql

Please excuse the length of the question. I included a test script to demo the situation and my best attempt at a solution.
There are two tables:
test_WORDS = Words extracted in order from several sources. The OBJ_FK column is the ID of the source. WORD_ID is an identifier for the word itself that is unique within the source. Each row contains one word.
test_PHRASE = a list of phrases to be searched for in test_WORDS. The PHRASE_TEXT column is a space separated phrase like 'foo bar' (see below) so that each row contains multiple words.
Requirement:
Return the first word from test_WORDS that is the start of a match for a phrase from test_PHRASE.
I would prefer something set based to avoid RBAR approach below. Also my solution is limited to 5 word phrases. I need to support up to 20 word phrases. Is it possible to match the words from a row in test_PHRASE to contiguous rows in the test_WORD without cursors?
After breaking the phrase words out into a temporary table, the problem boils down to matching portions of two sets together in row order.
-- Create test data
-- Source words, one per row; (OBJ_FK, WORD_ID) identifies a word within its source.
CREATE TABLE dbo.test_WORDS
(
    OBJ_FK    bigint       NOT NULL,  -- FK to the source object
    WORD_ID   int          NOT NULL,  -- the word order in the source object
    WORD_TEXT nvarchar(50) NOT NULL,
    CONSTRAINT PK_test_WORDS
        PRIMARY KEY CLUSTERED (OBJ_FK ASC, WORD_ID ASC)
)
ON [PRIMARY]
GO
-- Phrases to look for; PHRASE_TEXT holds the words of one phrase separated by single spaces.
CREATE TABLE [dbo].[test_PHRASE](
    [ID] [int] NOT NULL, --PHRASE ID (nullability now explicit; it is the primary key)
    [PHRASE_TEXT] [nvarchar](150) NOT NULL, --Space-separated phrase
    -- The comma above makes this an explicit table-level constraint instead of
    -- relying on the parser to accept a constraint glued onto the PHRASE_TEXT column.
    CONSTRAINT [PK_test_PHRASE] PRIMARY KEY CLUSTERED
    (
        [ID] ASC
    )
)
GO
-- Sample words: source 1 holds aaa..hhh, source 2 holds zzz yyy xxx www.
-- Column lists are spelled out so the inserts keep working if the tables gain columns.
INSERT INTO dbo.test_WORDS (OBJ_FK, WORD_ID, WORD_TEXT)
SELECT 1,1,'aaa' UNION ALL
SELECT 1,2,'bbb' UNION ALL
SELECT 1,3,'ccc' UNION ALL
SELECT 1,4,'ddd' UNION ALL
SELECT 1,5,'eee' UNION ALL
SELECT 1,6,'fff' UNION ALL
SELECT 1,7,'ggg' UNION ALL
SELECT 1,8,'hhh' UNION ALL
SELECT 2,1,'zzz' UNION ALL
SELECT 2,2,'yyy' UNION ALL
SELECT 2,3,'xxx' UNION ALL
SELECT 2,4,'www'
-- Sample phrases; the trailing comments state the expected outcome of the search.
INSERT INTO dbo.test_PHRASE (ID, PHRASE_TEXT)
SELECT 1, 'bbb ccc ddd' UNION ALL --should match
SELECT 2, 'ddd eee fff' UNION ALL --should match
SELECT 3, 'xxx xxx xxx' UNION ALL --should NOT match
SELECT 4, 'zzz yyy xxx' UNION ALL --should match
SELECT 5, 'xxx www ppp' UNION ALL --should NOT match
SELECT 6, 'zzz yyy xxx www' --should match
-- Create variables
-- Row-by-row matcher: for each selected phrase, split it into words and look for
-- those words on consecutive rows of test_WORDS. Supports phrases of up to 5 words.
-- (Variable sigils restored to @ / @@; the # forms do not parse as T-SQL variables.)
DECLARE @maxRow AS INTEGER      -- number of phrases to process
DECLARE @currentRow AS INTEGER  -- [ROW] value of the phrase being processed
DECLARE @wordCt AS int          -- number of words in the current phrase
DECLARE @phraseSubsetTable AS TABLE(
    [ROW] int IDENTITY(1,1) NOT NULL, -- the loop below walks this from 1 to @maxRow
    [ID] int NOT NULL, --PHRASE ID
    [PHRASE_TEXT] nvarchar(150) NOT NULL
)
--used to split the phrase into words
--note: No permissions to sys.dm_fts_parser
DECLARE @WordList table
(
    ID int,            -- 1-based position of the word inside the phrase
    WORD nvarchar(50)
)
--Records to be returned to caller
DECLARE @returnTable AS TABLE(
    OBJECT_FK INT NOT NULL,
    WORD_ID INT NOT NULL,
    PHRASE_ID INT NOT NULL
)
DECLARE @phrase AS NVARCHAR(150)
DECLARE @phraseID AS INTEGER
-- Get subset of phrases to simulate a join that would occur in production
INSERT INTO @phraseSubsetTable (ID, PHRASE_TEXT)
SELECT ID, PHRASE_TEXT
FROM dbo.test_PHRASE
--represent subset of phrases caused by join in production
WHERE ID IN (2,3,4)
-- Loop each phrase in the subset, split into rows of words and return matches to the test_WORDS table
-- @@ROWCOUNT must be read immediately after the INSERT above; any statement in between resets it.
SET @maxRow = @@ROWCOUNT
SET @currentRow = 1
WHILE @currentRow <= @maxRow
BEGIN
    SELECT @phrase = PHRASE_TEXT, @phraseID = ID
    FROM @phraseSubsetTable
    WHERE [ROW] = @currentRow
    --clear previous phrase that was split into rows
    DELETE FROM @WordList
    --Recursive CTE to create a recordset of words, one per row:
    --pn = word number, start = first character of the word, stop = position of the following space (0 for the last word)
    ;WITH Pieces(pn, start, stop) AS (
        SELECT 1, 1, CHARINDEX(' ', @phrase)
        UNION ALL
        SELECT pn + 1, stop + 1, CHARINDEX(' ', @phrase, stop + 1)
        FROM Pieces
        WHERE stop > 0)
    --Create the List of words with the CTE above
    INSERT INTO @WordList (ID, WORD)
    SELECT pn,
           -- 1056 is just "longer than any phrase": take the rest of the string for the last word
           SUBSTRING(@phrase, start, CASE WHEN stop > 0 THEN stop - start ELSE 1056 END) AS WORD
    FROM Pieces
    SELECT @wordCt = COUNT(ID) FROM @WordList;
    -- Do the actual query using a CTE with a rownumber that restarts for every SOURCE OBJECT,
    -- so that "next word" means rownum + 1 even when WORD_ID values have gaps.
    ;WITH WordOrder_CTE AS (
        SELECT OBJ_FK, WORD_ID, WORD_TEXT,
               ROW_NUMBER() OVER (PARTITION BY OBJ_FK ORDER BY WORD_ID) AS rownum
        FROM test_WORDS)
    --Flatten: r1 is a candidate first word, r2..r5 are the rows that follow it;
    --wN is non-NULL only when row rN carries the Nth word of the phrase.
    INSERT INTO @returnTable (OBJECT_FK, WORD_ID, PHRASE_ID)
    SELECT r1.OBJ_FK, r1.WORD_ID, @phraseID AS PHRASE_ID
    FROM WordOrder_CTE r1
    INNER JOIN @WordList w1 ON r1.WORD_TEXT = w1.WORD AND w1.ID = 1
    LEFT JOIN WordOrder_CTE r2
        ON r1.rownum = r2.rownum - 1 AND r1.OBJ_FK = r2.OBJ_FK
    LEFT JOIN @WordList w2 ON r2.WORD_TEXT = w2.WORD AND w2.ID = 2
    LEFT JOIN WordOrder_CTE r3
        ON r1.rownum = r3.rownum - 2 AND r1.OBJ_FK = r3.OBJ_FK
    LEFT JOIN @WordList w3 ON r3.WORD_TEXT = w3.WORD AND w3.ID = 3
    LEFT JOIN WordOrder_CTE r4
        ON r1.rownum = r4.rownum - 3 AND r1.OBJ_FK = r4.OBJ_FK
    LEFT JOIN @WordList w4 ON r4.WORD_TEXT = w4.WORD AND w4.ID = 4
    LEFT JOIN WordOrder_CTE r5
        ON r1.rownum = r5.rownum - 4 AND r1.OBJ_FK = r5.OBJ_FK
    LEFT JOIN @WordList w5 ON r5.WORD_TEXT = w5.WORD AND w5.ID = 5
    -- Positions beyond the phrase length are ignored; positions within it must have matched.
    WHERE (@wordCt < 2 OR w2.ID IS NOT NULL) AND
          (@wordCt < 3 OR w3.ID IS NOT NULL) AND
          (@wordCt < 4 OR w4.ID IS NOT NULL) AND
          (@wordCt < 5 OR w5.ID IS NOT NULL)
    --loop
    SET @currentRow = @currentRow + 1
END
--Return the first words of each matching phrase
SELECT OBJECT_FK, WORD_ID, PHRASE_ID FROM @returnTable
GO
--Clean up
-- Remove both test tables created at the top of the script.
DROP TABLE dbo.test_WORDS;
DROP TABLE dbo.test_PHRASE;
Edited solution:
This is an edit of the correct solution provided below to account for non-contiguous word IDs. Hope this helps someone as much as it did me.
-- Set-based match that tolerates gaps in WORD_ID: number the words per source,
-- build for every word the text running from that word to the end of its source,
-- then keep the rows whose text starts with a searched phrase.
;WITH
numberedwords AS (
    SELECT
        OBJ_FK,
        WORD_ID,
        WORD_TEXT,
        -- Ascending position of the word inside its source. It has to ascend:
        -- numbering in descending order makes the BETWEEN below select the words
        -- *before* the current one, so no phrase starting mid-source could match.
        rowcnt = ROW_NUMBER() OVER
            (PARTITION BY OBJ_FK ORDER BY WORD_ID ASC),
        totalInSrc = COUNT(WORD_ID) OVER (PARTITION BY OBJ_FK)
    FROM dbo.test_WORDS
),
phrasedwords AS (
    SELECT
        nw1.OBJ_FK,
        nw1.WORD_ID,
        nw1.WORD_TEXT,
        -- Words from the current position through the last word of the source,
        -- space separated; RTRIM drops the separator left after the final word.
        PHRASE_TEXT = RTRIM((
            SELECT [text()] = nw2.WORD_TEXT + ' '
            FROM numberedwords nw2
            WHERE nw1.OBJ_FK = nw2.OBJ_FK
              AND nw2.rowcnt BETWEEN nw1.rowcnt AND nw1.totalInSrc
            ORDER BY nw2.OBJ_FK, nw2.WORD_ID
            FOR XML PATH ('')
        ))
    FROM numberedwords nw1
    GROUP BY nw1.OBJ_FK, nw1.WORD_ID, nw1.WORD_TEXT, nw1.rowcnt, nw1.totalInSrc
)
SELECT *
FROM phrasedwords pw
INNER JOIN test_PHRASE tp
    -- A phrase matches when it is a prefix of the text that starts at this word.
    ON LEFT(pw.PHRASE_TEXT, LEN(tp.PHRASE_TEXT)) = tp.PHRASE_TEXT
ORDER BY pw.OBJ_FK, pw.WORD_ID
Note: The final query I used in production uses indexed temp tables instead of CTEs. I also limited the length of the PHRASE_TEXT column to my needs. With these improvements, I was able to reduce my query time from over 3 minutes to 3 seconds!

Here's a solution that uses a different approach: instead of splitting the phrases into words it combines the words into phrases.
Edited: changed the rowcnt expression to use COUNT(*) OVER …, as suggested by @ErikE in the comments.
-- Combine words into phrases instead of splitting phrases into words:
-- every word row gets the text of all words from itself up to WORD_ID = (word count of its source).
;WITH
word_totals AS (
    SELECT
        OBJ_FK,
        WORD_ID,
        WORD_TEXT,
        COUNT(*) OVER (PARTITION BY OBJ_FK) AS rowcnt  -- words in this source
    FROM dbo.test_WORDS
),
phrasedwords AS (
    SELECT
        head.OBJ_FK,
        head.WORD_ID,
        head.WORD_TEXT,
        RTRIM((
            SELECT tail.WORD_TEXT + ' ' AS [text()]
            FROM word_totals AS tail
            WHERE tail.OBJ_FK = head.OBJ_FK
              AND tail.WORD_ID >= head.WORD_ID
              AND tail.WORD_ID <= head.rowcnt
            ORDER BY tail.OBJ_FK, tail.WORD_ID
            FOR XML PATH ('')
        )) AS PHRASE_TEXT
    FROM word_totals AS head
    GROUP BY head.OBJ_FK, head.WORD_ID, head.WORD_TEXT, head.rowcnt
)
SELECT *
FROM phrasedwords pw
INNER JOIN test_PHRASE tp
    -- keep rows whose running text begins with the searched phrase
    ON tp.PHRASE_TEXT = LEFT(pw.PHRASE_TEXT, LEN(tp.PHRASE_TEXT))
ORDER BY pw.OBJ_FK, pw.WORD_ID

Using a Split function should work.
Split Function
-- Splits @RowData on every occurrence of @SplitOn.
-- Returns one row per piece: Id is the 1-based piece number, Data is the piece
-- with leading/trailing blanks trimmed. Input without the delimiter (or a NULL /
-- empty delimiter) yields a single row holding the whole trimmed input.
CREATE FUNCTION dbo.Split
(
    @RowData nvarchar(2000),
    @SplitOn nvarchar(5)
)
RETURNS @RtnValue table
(
    Id int identity(1,1),
    Data nvarchar(100)
)
AS
BEGIN
    DECLARE @DelimLen int
    DECLARE @Pos int
    -- LEN() ignores trailing blanks (LEN(' ') = 0), so measure the delimiter with a
    -- sentinel appended. Skipping the full delimiter length, rather than a fixed 1,
    -- makes multi-character delimiters work.
    SET @DelimLen = LEN(@SplitOn + N'x') - 1
    SET @Pos = CHARINDEX(@SplitOn, @RowData)
    WHILE (@Pos > 0)
    BEGIN
        INSERT INTO @RtnValue (Data)
        SELECT LTRIM(RTRIM(SUBSTRING(@RowData, 1, @Pos - 1)))
        -- Drop the piece just emitted together with its delimiter; 2000 is the
        -- declared maximum length of @RowData, i.e. "the rest of the string".
        SET @RowData = SUBSTRING(@RowData, @Pos + @DelimLen, 2000)
        SET @Pos = CHARINDEX(@SplitOn, @RowData)
    END
    -- Whatever remains after the last delimiter is the final piece.
    INSERT INTO @RtnValue (Data)
    SELECT LTRIM(RTRIM(@RowData))
    RETURN
END
SQL Statement
-- Phrases for which every word (as produced by dbo.Split) occurs somewhere in
-- test_WORDS: keep a phrase unless one of its words has no matching WORD_TEXT.
SELECT DISTINCT p.*
FROM dbo.test_PHRASE p
WHERE NOT EXISTS (
    SELECT 1
    FROM dbo.Split(p.PHRASE_TEXT, ' ') sp
    WHERE NOT EXISTS (
        SELECT 1
        FROM dbo.test_WORDS w
        WHERE w.WORD_TEXT = sp.Data
    )
)

This performs a little better than other solutions given. if you don't need WORD_ID, just WORD_TEXT, you can remove a whole column. I know this was over a year ago, but I wonder if you can get 3 seconds down to 30 ms? :)
If this query seems good, then my biggest speed advice is to put the entire phrases into a separate table (using your example data, it would have only 2 rows with phrases of length 8 words and 4 words).
-- Builds one space-separated string per source object (X.Phrase) and joins it to
-- every phrase that occurs inside that string. One output row per (source, phrase) pair.
SELECT
W.OBJ_FK,
X.Phrase,
P.*,
-- First word of the phrase: the text before its first space, or the whole phrase
-- when there is no space (NullIf turns CharIndex's 0 into NULL, IsNull then
-- substitutes a length larger than any string).
Left(P.PHRASE_TEXT,
IsNull(NullIf(CharIndex(' ', P.PHRASE_TEXT), 0) - 1, 2147483647)
) WORD_TEXT,
-- Ordinal position of the phrase's first word inside X.Phrase, computed as
-- (characters up to and including the first matched character)
--   - (non-space characters before the match) = spaces before the match + 1.
-- The first PatIndex searches ' ' + X.Phrase, so its result minus 1 is the match position itself.
-- NOTE(review): this equals test_WORDS.WORD_ID only when the IDs run 1, 2, 3, ... without gaps.
Len(Left(X.Phrase, PatIndex('%' + P.PHRASE_TEXT + '%', ' ' + X.Phrase) - 1))
- Len(Replace(
Left(X.Phrase, PatIndex('%' + P.PHRASE_TEXT + '%', X.Phrase) - 1), ' ', '')
)
WORD_ID
FROM
(SELECT DISTINCT OBJ_FK FROM dbo.test_WORDS) W
-- All words of the source in WORD_ID order, joined with single spaces;
-- RTrim removes the separator left after the last word.
CROSS APPLY (
SELECT RTrim((SELECT WORD_TEXT + ' '
FROM dbo.test_WORDS W2
WHERE W.OBJ_FK = W2.OBJ_FK
ORDER BY W2.WORD_ID
FOR XML PATH (''))) Phrase
) X
-- NOTE(review): LIKE / PatIndex find the phrase anywhere in the string, including
-- in the middle of a word, and treat %, _ and [ inside PHRASE_TEXT as wildcards.
INNER JOIN dbo.test_PHRASE P
ON X.Phrase LIKE '%' + P.PHRASE_TEXT + '%';
Here's another version for curiosity's sake. It doesn't perform quite as well.
-- Recursive CTE that grows a match one word at a time: start at every word that
-- equals the first word of a phrase, then keep extending while the next row of
-- the same source carries the next word of the phrase.
WITH Calc AS (
-- Anchor: pair each phrase with every word row whose text is the phrase's first word.
SELECT
P.ID,
P.PHRASE_TEXT,
W.OBJ_FK,
W.WORD_ID StartID,
W.WORD_TEXT StartText,
W.WORD_ID,
-- position in PHRASE_TEXT where the next word begins (past this word and one space)
Len(W.WORD_TEXT) + 2 NextPos,
-- text matched so far
Convert(varchar(150), W.WORD_TEXT) MatchingPhrase
FROM
dbo.test_PHRASE P
INNER JOIN dbo.test_WORDS W
-- the appended space makes a single-word phrase match too
ON P.PHRASE_TEXT + ' ' LIKE W.WORD_TEXT + ' %'
UNION ALL
-- Recursive step: move to the row with the next WORD_ID and require that the
-- unmatched remainder of the phrase starts with that row's word.
SELECT
C.ID,
C.PHRASE_TEXT,
C.OBJ_FK,
C.StartID,
C.StartText,
W.WORD_ID,
C.NextPos + Len(W.WORD_TEXT) + 1,
Convert(varchar(150), C.MatchingPhrase + Coalesce(' ' + W.WORD_TEXT, ''))
FROM
Calc C
INNER JOIN dbo.test_WORDS W
ON C.OBJ_FK = W.OBJ_FK
-- NOTE(review): "next word" is WORD_ID + 1, so gaps in WORD_ID end the match.
AND C.WORD_ID + 1 = W.WORD_ID
AND Substring(C.PHRASE_TEXT, C.NextPos, 2147483647) + ' ' LIKE W.WORD_TEXT + ' %'
)
-- A phrase is found where the accumulated words equal the complete phrase text;
-- StartID / StartText identify the first word of the match.
SELECT C.OBJ_FK, C.PHRASE_TEXT, C.StartID, C.StartText, C.ID
FROM Calc C
WHERE C.PHRASE_TEXT = C.MatchingPhrase;

Related

replace value in varchar(max) field with join

I have a table that contains text field with placeholders. Something like this:
Row Notes
1. This is some notes ##placeholder130## this ##myPlaceholder##, #oneMore#. End.
2. Second row...just a ##test#.
(This table contains about 1-5k rows on average. Average number of placeholders in one row is 5-15).
Now, I have a lookup table that looks like this:
Name Value
placeholder130 Dog
myPlaceholder Cat
oneMore Cow
test Horse
(Lookup table will contain anywhere from 10k to 100k records)
I need to find the fastest way to join those placeholders from strings to a lookup table and replace with value. So, my result should look like this (1st row):
This is some notes Dog this Cat, Cow. End.
What I came up with was to split each row into multiple for each placeholder and then join it to lookup table and then concat records back to original row with new values, but it takes around 10-30 seconds on average.
You could try to split the string using a numbers table and rebuild it with for xml path.
-- Splits each note on '##' with a Numbers (tally) table, replaces every piece that
-- equals a Lookup.Name by its Lookup.Value, and concatenates the pieces again.
select (
select coalesce(L.Value, T.Value)
from Numbers as N
-- T.Value = the piece starting at N.Number and ending before the next '##'
-- ('##' is appended to the note so the last piece also has a terminator)
cross apply (select substring(Notes.notes, N.Number, charindex('##', Notes.notes + '##', N.Number) - N.Number)) as T(Value)
-- NOTE(review): any piece equal to a Name is replaced, including plain text between two placeholders
left outer join Lookup as L
on L.Name = T.Value
-- keep only the positions where a piece starts: position 1 or right after a '##'
where N.Number <= len(notes) and
substring('##' + notes, Number, 2) = '##'
order by N.Number
-- empty element name concatenates the values; TYPE + .value() reads the result back as plain text
for xml path(''), type
).value('text()[1]', 'varchar(max)')
from Notes
SQL Fiddle
I borrowed the string splitting from this blog post by Aaron Bertrand
SQL Server is not very fast with string manipulation, so this is probably best done client-side. Have the client load the entire lookup table, and replace the notes as they arrived.
Having said that, it can of course be done in SQL. Here's a solution with a recursive CTE. It performs one lookup per recursion step:
-- One replacement per recursion level: lookup #1 is applied at level 1, lookup #2
-- at level 2, and so on; the rows at the last level have had every lookup applied.
;with Repl as
(
    -- number the lookups so each recursion level can pick exactly one
    select Name
         , Value
         , row_number() over (order by lk.name) as rn
    from Lookup lk
)
, Recurse as
(
    -- level 0: the untouched notes
    select Notes
         , 0 as rn
    from Notes
    union all
    -- level n: the previous level's text with lookup n substituted
    select replace(prev.Notes, '##' + nxt.name + '##', nxt.value)
         , prev.rn + 1
    from Recurse prev
    join Repl nxt
        on nxt.rn = prev.rn + 1
)
select *
from Recurse
where rn = (select count(*) from Lookup)
-- one level per lookup row, so the default limit of 100 levels must be lifted
option (maxrecursion 0)
Example at SQL Fiddle.
Another option is a while loop to keep replacing lookups until no more are found:
-- Works on a copy of the notes in a table variable and keeps replacing one
-- placeholder per note per pass until a pass changes nothing.
-- (Variable sigils restored: @notes is a table variable, @@rowcount a system function.)
declare @notes table (notes varchar(max))
insert @notes (notes)
select Notes
from Notes
while 1=1
begin
    update n
    set Notes = replace(n.Notes, '##' + l.name + '##', l.value)
    from @notes n
    outer apply
    (
        -- any one lookup whose placeholder still occurs in this note
        select top 1 Name
             , Value
        from Lookup l
        where n.Notes like '%##' + l.name + '##%'
    ) l
    where l.name is not null
    -- no note contained a known placeholder any more: done
    -- NOTE(review): never terminates if a Value itself contains its own '##Name##'.
    if @@rowcount = 0
        break
end
select *
from @notes
Example at SQL Fiddle.
I second the comment that tsql is just not suited for this operation, but if you must do it in the db here is an example using a function to manage the multiple replace statements.
Since you have a relatively small number of tokens in each note (5-15) and a very large number of tokens (10k-100k) my function first extracts tokens from the input as potential tokens and uses that set to join to your lookup (dbo.Token below). It was far too much work to look for an occurrence of any of your tokens in each note.
I did a bit of perf testing using 50k tokens and 5k notes and this function runs really well, completing in <2 seconds (on my laptop). Please report back how this strategy performs for you.
note: In your example data the token format was not consistent (##_#, ##_##, #_#), I am guessing this was simply a typo and assume all tokens take the form of ##TokenName##.
--setup
-- start from a clean slate: remove the lookup table and the function if they exist
if object_id('dbo.[Lookup]') is not null
begin
    drop table dbo.[Lookup];
end
go
if object_id('dbo.fn_ReplaceLookups') is not null
begin
    drop function dbo.fn_ReplaceLookups;
end
go
-- lookup names are stored including their ## delimiters
create table dbo.[Lookup]
(
    LookupName  varchar(100) primary key,
    LookupValue varchar(100)
);
insert into dbo.[Lookup]
          select '##placeholder130##', 'Dog'
union all select '##myPlaceholder##',  'Cat'
union all select '##oneMore##',        'Cow'
union all select '##test##',           'Horse';
go
-- Replaces every ##Token## in @input that has a row in dbo.[Lookup] by its LookupValue.
-- Strategy: pull the candidate tokens out of @input first (a handful per note) and
-- join only those to the lookup table, instead of probing the note for every lookup row.
-- Returns the substituted text; unknown tokens are left untouched.
create function [dbo].[fn_ReplaceLookups](@input varchar(max))
returns varchar(max)
as
begin
    declare @xml xml;
    -- Turn 'a##b##c' into <r><i>a</i><i>b</i><i>c</i></r>. The XML special characters are
    -- escaped first so notes containing &, < or > do not break the cast; .value() below
    -- un-escapes them again, so the extracted token text is unchanged.
    select @xml = cast(('<r><i>'
                        + replace(replace(replace(replace(@input, '&', '&amp;'), '<', '&lt;'), '>', '&gt;'), '##', '</i><i>')
                        + '</i></r>') as xml);
    --extract the potential tokens
    declare @LookupsInString table (LookupName varchar(100) primary key);
    insert into @LookupsInString (LookupName)
    select distinct '##'+v+'##'
    from ( select [v] = r.n.value('(./text())[1]', 'varchar(100)'),
                  [r] = row_number() over (order by n)
           from @xml.nodes('r/i') r(n)
         )d(v,r)
    where r%2=0          -- every second piece sits between a pair of ## delimiters
      and v is not null; -- '####' gives an empty piece; NULL cannot go into the primary key
    --tokenize the input
    -- NOTE(review): relies on the variable being re-assigned once per joined row.
    select @input = replace(@input, l.LookupName, l.LookupValue)
    from dbo.[Lookup] l
    join @LookupsInString lis on
        l.LookupName = lis.LookupName;
    return @input;
end
go
return
--usage
-- Sample notes in a table variable (sigil restored to @), run through the function.
declare @Notes table ([Id] int primary key, notes varchar(100));
insert into @Notes ([Id], notes)
select 1, 'This is some notes ##placeholder130## this ##myPlaceholder##, ##oneMore##. End.' union all
select 2, 'Second row...just a ##test##.';
select *,
       dbo.fn_ReplaceLookups(notes) as Tokenized -- column header shown in the sample output
from @Notes;
Returns:
Tokenized
--------------------------------------------------------
This is some notes Dog this Cat, Cow. End.
Second row...just a Horse.
Try this
-- Recursive replace: org = original note, calc = text before the latest replacement,
-- [Notes] = working copy, [level] = number of replacements applied so far.
;WITH CTE (org, calc, [Notes], [level]) AS
(
-- Anchor: level 0, nothing replaced yet.
SELECT [Notes], [Notes], CONVERT(varchar(MAX),[Notes]), 0 FROM PlaceholderTable
UNION ALL
-- Recursive step: one output row for every lookup whose placeholder is still in the
-- working copy, with that placeholder replaced. A note with several placeholders
-- therefore fans out into every replacement order before converging.
SELECT CTE.org, CTE.[Notes],
CONVERT(varchar(MAX), REPLACE(CTE.[Notes],'##' + T.[Name] + '##', T.[Value])), CTE.[level] + 1
FROM CTE
INNER JOIN LookupTable T ON CTE.[Notes] LIKE '%##' + T.[Name] + '##%'
)
-- The deepest level per original note has every known placeholder replaced;
-- DISTINCT collapses the identical rows produced by the different replacement orders.
SELECT DISTINCT org, [Notes], level FROM CTE
WHERE [level] = (SELECT MAX(level) FROM CTE c WHERE CTE.org = c.org)
SQL FIDDLE DEMO
Check the below devioblog post for reference
devioblog post
To get speed, you can preprocess the note templates into a more efficient form. This will be a sequence of fragments, with each ending in a substitution. The substitution might be NULL for the last fragment.
Notes
Id FragSeq Text SubsId
1 1 'This is some notes ' 1
1 2 ' this ' 2
1 3 ', ' 3
1 4 '. End.' null
2 1 'Second row...just a ' 4
2 2 '.' null
Subs
Id Name Value
1 'placeholder130' 'Dog'
2 'myPlaceholder' 'Cat'
3 'oneMore' 'Cow'
4 'test' 'Horse'
Now we can do the substitutions with a simple join.
SELECT Notes.Text + COALESCE(Subs.Value, '')
FROM Notes LEFT JOIN Subs
ON SubsId = Subs.Id WHERE Notes.Id = ?
ORDER BY FragSeq
This produces a list of fragments with substitutions complete. I am not an MSQL user, but in most dialects of SQL you can concatenate these fragments in a variable quite easily:
-- Concatenate the substituted fragments of one note into a single variable.
-- Fixes: variable sigil restored to @, and COALSCE corrected to COALESCE.
-- '?' stands for the id of the note being rendered.
-- NOTE(review): the result depends on rows being appended in FragSeq order — confirm
-- the target engine honours ORDER BY for this kind of variable assignment.
DECLARE @Note VARCHAR(8000)
SELECT @Note = COALESCE(@Note, '') + Notes.Text + COALESCE(Subs.Value, '')
FROM Notes LEFT JOIN Subs
    ON SubsId = Subs.Id WHERE Notes.Id = ?
ORDER BY FragSeq
Pre-processing a note template into fragments will be straightforward using the string splitting techniques of other posts.
Unfortunately I'm not at a location where I can test this, but it ought to work fine.
I really don't know how it will perform with 10k+ of lookups.
how does the old dynamic SQL performs?
-- Builds one SELECT whose last column is a REPLACE(REPLACE(...)) chain with one
-- level per lookup row, then executes it.
-- Fixes: variable sigil restored to @; single quotes in names/values are doubled so
-- they can neither break nor inject into the generated statement; rows with a NULL
-- name or value are skipped because concatenating NULL would blank the whole command.
DECLARE @sqlCommand NVARCHAR(MAX)
SELECT @sqlCommand = N'PlaceholderTable.[Notes]'
SELECT @sqlCommand = N'REPLACE( ' + @sqlCommand +
       N', ''##' + REPLACE(LookupTable.[Name], '''', '''''') + N'##'', ''' +
       REPLACE(LookupTable.[Value], '''', '''''') + N''')'
FROM LookupTable
WHERE LookupTable.[Name] IS NOT NULL
  AND LookupTable.[Value] IS NOT NULL
SELECT @sqlCommand = N'SELECT *, ' + @sqlCommand + N' FROM PlaceholderTable'
-- NOTE(review): with thousands of lookup rows the generated expression becomes very
-- deeply nested — verify the server accepts it.
EXECUTE sp_executesql @sqlCommand
Fiddle demo
And now for some recursive CTE.
If your indexes are correctly set up, this one should be very fast or very slow. SQL Server always surprises me with performance extremes when it comes to the r-CTE...
-- Recursive CTE that walks each note from one '##' delimiter to the next, appending
-- every piece to Result and substituting the piece when LookupTable has a matching Name.
-- The final SELECT keeps, per Row, the entry with the highest StartIdx.
-- NOTE(review): the anchor calls substring(..., patindex(...) - 1); for a note without
-- any '##' patindex returns 0 and the length becomes -1 — confirm such notes cannot occur.
-- NOTE(review): the recursive member casts to nvarchar(max); the anchor's Result takes the
-- type of Notes — confirm the two types agree.
;WITH T AS (
SELECT
Row,
StartIdx = 1, -- 1 as first starting index
EndIdx = CAST(patindex('%##%', Notes) as int), -- first ending index
Result = substring(Notes, 1, patindex('%##%', Notes) - 1)
-- (first) temp result bounded by indexes
FROM PlaceholderTable -- **this is your source table**
UNION ALL
SELECT
pt.Row,
StartIdx = newstartidx, -- starting index (calculated in calc1)
EndIdx = EndIdx + CAST(newendidx as int) + 1, -- ending index (calculated in calc4 + total offset)
Result = Result + CAST(ISNULL(newtokensub, newtoken) as nvarchar(max))
-- temp result taken from subquery or original
FROM
T
JOIN PlaceholderTable pt -- **this is your source table**
ON pt.Row = T.Row
-- calc1..calc6 are single-row APPLYs used only to name intermediate values
-- so that later steps can refer to them
CROSS APPLY(
SELECT newstartidx = EndIdx + 2 -- new starting index moved by 2 from last end ('##')
) calc1
CROSS APPLY(
SELECT newtxt = substring(pt.Notes, newstartidx, len(pt.Notes))
-- current piece of txt we work on
) calc2
CROSS APPLY(
SELECT patidx = patindex('%##%', newtxt) -- current index of '##'
) calc3
CROSS APPLY(
SELECT newendidx = CASE
WHEN patidx = 0 THEN len(newtxt) + 1
ELSE patidx END -- if last piece of txt, end with its length
) calc4
CROSS APPLY(
SELECT newtoken = substring(pt.Notes, newstartidx, newendidx - 1)
-- get the new token
) calc5
OUTER APPLY(
SELECT newtokensub = Value
FROM LookupTable
WHERE Name = newtoken -- substitute the token if you can find it in **your lookup table**
) calc6
WHERE newstartidx + len(newtxt) - 1 <= len(pt.Notes)
-- keep recursing only while {new starting index} + {length of txt we work on} - 1 stays within the total length
)
,lastProcessed AS (
SELECT
Row,
Result,
rn = row_number() over(partition by Row order by StartIdx desc)
FROM T
) -- enumerate all (including intermediate) results
SELECT *
FROM lastProcessed
WHERE rn = 1 -- filter out intermediate results (display only last ones)

initialize and increment variable inside cte query sqlserver 2008

I am using SQL Server 2008. I want to initialize and increment a variable (@NUMTwo) at the same time, in the second part of my query (marked "Problem line").
I am creating a cte query.
Is this possible , if yes then please let me know.
following is a sample example.I hope i am clear.
CREATE table #TempTable
(
childProductID INT,parentProductID INT,productModel varchar(50),[Num2] VARCHAR(100)
)
DECLARE #NUMTwo INT = 0
WITH tableR AS
(
-- First Part
SELECT childProductID = null,parentProductID=null,productModel from Products where productid in (#a),[Num2] = convert(varchar(100), '')
UNION ALL
--Second Part
SELECT e.childProductID,e.parentProductID,prd.productModel FROM ProductIncludes AS e
,[Num2] = convert(varchar(100),'1.' + #NUMTwo+=1 ) -- Problem line
INNER JOIN Products AS PRD ON e.childProductID = PRD.productID
WHERE parentProductID in (#a)
)
INSERT INTO #TempTable(childProductID,parentProductID,productModel,[Num2])
SELECT childProductID,parentProductID,productModel,[Num2]
END
SELECT * FROM #TempTable
You need to "initialize" a column in the anchor part of the query, and then "increment" this column in the recursive part.
Something like
-- Carry the counter as a column: the anchor seeds [N] from the variable and every
-- recursion level adds 1 to it (variable sigil restored to @).
DECLARE @NUMTwo INT = 0
;WITH Test AS (
    -- anchor: empty label, counter at its initial value
    SELECT [Num2] = convert(varchar(MAX), ''),
           @NUMTwo [N]
    UNION ALL
    -- recursive part: label built from the incremented counter
    SELECT [Num2] = '1.' + convert(varchar(MAX),[N]+1),
           [N]+1
    FROM Test
    WHERE [N] < 10   -- stop condition; without it the recursion would hit the level limit
)
SELECT *
FROM Test
SQL Fiddle DEMO
If the parameter #NUMTwo is just for numbering rows you can use the ROW_NUMBER() OVER(...) instead of it like so:
-- Numbers the child rows with ROW_NUMBER() instead of incrementing a variable.
-- Fixes: removed the stray ", [Num2] = ..." that followed the first WHERE (a column
-- expression cannot appear there), restored the @a sigil, and added the statement
-- that consumes the CTE (a WITH clause cannot stand alone).
WITH tableR AS
(
    -- the parent product itself, labelled '0'
    SELECT childProductID = NULL, parentProductID = NULL,
           productModel, NUMTwo = CAST('0' AS VARCHAR(10))
    FROM Products
    WHERE productid IN (@a)
    UNION ALL
    -- its included products, labelled '1.1', '1.2', ...
    SELECT e.childProductID, e.parentProductID,
           prd.productModel,
           NUMTwo = '1.' +
               -- ORDER BY (SELECT 0): numbering order is arbitrary
               CAST( ROW_NUMBER() OVER(ORDER BY (SELECT 0)) AS VARCHAR(10))
    FROM ProductIncludes AS e
    INNER JOIN Products AS PRD ON e.childProductID = PRD.productID
    WHERE e.parentProductID IN (@a)
)
SELECT childProductID, parentProductID, productModel, NUMTwo
FROM tableR

Parsing text to multiple columns

I have a feed that is populating a single text field in a table with statistics.
I need to pull this data into multiple fields in another table
but the strange format makes importing automatically difficult.
The file format is flat text but an example is below:
08:34:52 Checksum=180957248,TicketType=6,InitialUserType=G,InitialUserID=520,CommunicationType=Incoming,Date=26-03-2012,Time=08:35:00,Service=ST,Duration=00:00:14,Cost=0.12
Effectively it's made up of:
[timestamp] [Field1 name]=[Field1 value],[Field2 name]=[Field2 value],[Field4 name]=[Field4 value]...[CR]
All fields are always in the same order but not always present.
Total columns could be anywhere from 5 to 30.
I've tried the below function to translate it which seems to work mostly but seems to randomly skip fields:
Parsing the data:
(SELECT [Data].[dbo].[GetFromTextString] ( 'Checksum=' ,',' ,RAWTEXT)) AS RowCheckSum,
(SELECT [Data].[dbo].[GetFromTextString] ( 'TicketType=' ,',' ,RAWTEXT)) AS TicketType,
And the Function:
-- Returns the text found between @uniqueprefix (e.g. 'Checksum=') and the next
-- @commonsuffix (e.g. ',') inside @datastring.
-- Returns NULL when the prefix does not occur, or when the prefix is directly
-- followed by the suffix (empty value) anywhere in the string.
-- Fixes: variable sigils restored to @; the prefix is located with CHARINDEX instead
-- of PATINDEX so that %, _ and [ in a prefix are taken literally; the suffix search
-- starts at the value rather than at the key, so a prefix that itself contains the
-- suffix no longer produces a negative length.
-- NOTE(review): the prefix is matched as a plain substring, so a prefix such as 'Type='
-- would also hit 'TicketType=' — pass prefixes that are unique in the record.
CREATE FUNCTION [dbo].[GetFromTextString]
-- Input start and end and return value.
(@uniqueprefix VARCHAR(100),
 @commonsuffix VARCHAR(100),
 @datastring VARCHAR(MAX) )
RETURNS VARCHAR(MAX) -- Picked Value.
AS
BEGIN
    DECLARE @ADJLEN INT = LEN(@uniqueprefix)
    DECLARE @keypos BIGINT     -- where the prefix starts
    DECLARE @valstart BIGINT   -- first character after the prefix
    -- Append the suffix so the last field of the record is terminated like the others.
    SET @datastring = @datastring + @commonsuffix
    SET @keypos = CHARINDEX(@uniqueprefix, @datastring)
    SET @valstart = @keypos + @ADJLEN
    RETURN (
        CASE WHEN (@keypos > 0)
              AND (CHARINDEX(@uniqueprefix + @commonsuffix, @datastring) = 0)
             THEN SUBSTRING(@datastring,
                            @valstart,
                            CHARINDEX(@commonsuffix, @datastring, @valstart) - @valstart)
             ELSE NULL END
    )
END
Could anyone suggest a better/cleaner way to strip out the data or could someone work out why this formula skips rows?
Any help really appreciated.
NOTE - THE FIRST SOLUTION IS RUBBISH. I HAVE LEFT IN IT FOR HISTORICAL REASONS, BUT A BETTER SOLUTION IS CONTAINED BELOW
I am not even sure if this will be faster than your current method, but it is the way I would approach the issue (If i was forced into an SQL only solution). The first thing that is required is a table valued function that will perform a split function:
-- Splits @TextToSplit on @Delimiter.
-- Returns one row per piece: Position is the 1-based piece number, TextValues the piece.
-- Text without the delimiter comes back as a single row.
-- Fixes: variable sigils restored to @; the delimiter's real length is skipped
-- instead of a fixed 1, so multi-character delimiters work.
CREATE FUNCTION dbo.Split (@TextToSplit VARCHAR(MAX), @Delimiter VARCHAR(MAX))
RETURNS @Values TABLE (Position INT IDENTITY(1, 1) NOT NULL, TextValues VARCHAR(MAX) NOT NULL)
AS
BEGIN
    -- LEN() ignores trailing blanks (LEN(' ') = 0), hence the appended sentinel.
    DECLARE @DelimiterLength BIGINT = LEN(@Delimiter + 'x') - 1
    DECLARE @DelimiterPos BIGINT = CHARINDEX(@Delimiter, @TextToSplit)
    WHILE @DelimiterPos > 0
    BEGIN
        INSERT @Values (TextValues)
        SELECT LEFT(@TextToSplit, @DelimiterPos - 1)
        -- Keep everything after the delimiter; DATALENGTH is simply an upper bound
        -- for "the rest of the string".
        SET @TextToSplit = SUBSTRING(@TextToSplit, @DelimiterPos + @DelimiterLength, DATALENGTH(@TextToSplit))
        SET @DelimiterPos = CHARINDEX(@Delimiter, @TextToSplit)
    END
    -- The remainder after the last delimiter is the final piece.
    INSERT @Values (TextValues) VALUES (@TextToSplit)
    RETURN
END
For my example I am working from a table variable, @WorkList; you may need to adapt yours accordingly, or you could just insert the relevant data into @WorkList where I have used dummy data:
-- Two sample feed records in a table variable (sigil restored to @).
-- UNION (not UNION ALL) is kept from the original; the two rows are distinct anyway.
DECLARE @WorkList TABLE (ID INT IDENTITY(1, 1) NOT NULL, TextField VARCHAR(MAX))
INSERT @WorkList (TextField)
SELECT '08:34:52 Checksum=180957248,TicketType=6,InitialUserType=G,InitialUserID=520,CommunicationType=Incoming,Date=26-03-2012,Time=08:35:00,Service=ST,Duration=00:00:14,Cost=0.12'
UNION
SELECT '08:34:52 Checksum=180957249,TicketType=5,InitialUserType=H,InitialUserID=521,CommunicationType=Outgoing,Date=27-03-2012,Time=14:27:00,Service=ST,Duration=00:15:12,Cost=0.37'
The main bit of the query is done here. It is quite long, so I have tried to comment it as well as possible. If further clarification is required I can add more comments.
-- Turns every work-list record into key/value rows (one row per field) with two
-- nested loops, then pivots those rows into one column per field.
-- (Variable sigils restored to @; INSERT column lists made explicit.)
DECLARE @Output TABLE (ID INT IDENTITY(1, 1) NOT NULL, TextField VARCHAR(MAX))
DECLARE @KeyPairs TABLE (WorkListID INT NOT NULL, KeyField VARCHAR(MAX), ValueField VARCHAR(MAX))
-- STORE TIMESTAMP DATA - THIS ASSUMES THE FIRST SPACE IS THE END OF THE TIMESTAMP
INSERT @KeyPairs (WorkListID, KeyField, ValueField)
SELECT ID, 'TimeStamp', LEFT(TextField, CHARINDEX(' ', TextField))
FROM @WorkList
-- CLEAR THE TIMESTAMP FROM THE WORKLIST
UPDATE @WorkList
SET TextField = SUBSTRING(TextField, CHARINDEX(' ', TextField) + 1, LEN(TextField))
-- Outer loop: take the lowest remaining work-list ID, process it, delete it, repeat.
DECLARE @ID INT = (SELECT MIN(ID) FROM @WorkList)
WHILE @ID IS NOT NULL
BEGIN
    -- SPLIT THE STRING FIRST INTO ALL THE PAIRS (e.g. Checksum=180957248)
    INSERT @Output (TextField)
    SELECT TextValues
    FROM dbo.Split((SELECT TextField FROM @WorkList WHERE ID = @ID), ',')
    DECLARE @ID2 INT = (SELECT MIN(ID) FROM @Output)
    -- FOR ALL THE PAIRS SPLIT THEM INTO A KEY AND A VALUE (USING THE POSITION OF THE SPLIT FUNCTION)
    WHILE @ID2 IS NOT NULL
    BEGIN
        -- Position 1 of the '=' split is the key, position 2 the value.
        INSERT @KeyPairs (WorkListID, KeyField, ValueField)
        SELECT @ID,
               MAX(CASE WHEN Position = 1 THEN TextValues ELSE '' END),
               MAX(CASE WHEN Position = 2 THEN TextValues ELSE '' END)
        FROM dbo.Split((SELECT TextField FROM @Output WHERE ID = @ID2), '=')
        DELETE @Output
        WHERE ID = @ID2
        SET @ID2 = (SELECT MIN(ID) FROM @Output)
    END
    DELETE @WorkList
    WHERE ID = @ID
    SET @ID = (SELECT MIN(ID) FROM @WorkList)
END
-- WE NOW HAVE A TABLE CONTAINING EAV MODEL STYLE DATA. THIS NEEDS TO BE PIVOTED INTO THE CORRECT FORMAT
-- ENSURE COLUMNS ARE LISTED IN THE ORDER YOU WANT THEM TO APPEAR
SELECT *
FROM @KeyPairs p
PIVOT
( MAX(ValueField)
  FOR KeyField IN
  ( [TimeStamp], [Checksum], [TicketType], [InitialUserType],
    [InitialUserID], [CommunicationType], [Date], [Time],
    [Service], [Duration], [Cost]
  )
) AS PivotTable;
EDIT (4 YEARS LATER)
A recent upvote brought this to my attention, and I hate myself a little bit for ever posting this answer in its current form.
A much better split function would be:
-- Inline (set-based) splitter.
-- Returns one row per piece of @List:
--   Item       - the piece,
--   Position   - character position in @List where the piece starts,
--   ItemNumber - 1-based ordinal of the piece.
-- Fixes: variable sigils restored to @; the fallback length for the last piece is no
-- longer capped at 8000 characters, which silently truncated a long final item.
-- NOTE(review): piece starts are detected by comparing single characters with
-- @Delimiter, so only one-character delimiters are recognised.
CREATE FUNCTION dbo.Split
(
    @List NVARCHAR(MAX),
    @Delimiter NVARCHAR(255)
)
RETURNS TABLE
WITH SCHEMABINDING AS
RETURN
(   -- N1..N4: row generators (10, 100, 10,000, 100,000,000 rows) built by cross joins
    WITH N1 AS (SELECT N FROM (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1), (1)) n (N)),
    N2(N) AS (SELECT 1 FROM N1 a CROSS JOIN N1 b),
    N3(N) AS (SELECT 1 FROM N2 a CROSS JOIN N2 b),
    N4(N) AS (SELECT 1 FROM N3 a CROSS JOIN N3 b),
    -- 0 plus 1..n, where n (the byte length of @List) is at least its character count
    cteTally(N) AS
    (   SELECT 0 UNION ALL
        SELECT TOP (DATALENGTH(ISNULL(@List,1))) ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
        FROM N4
    ),
    -- start of every piece: position 1 (from N = 0) and the position after each delimiter
    cteStart(N1) AS
    (   SELECT t.N+1
        FROM cteTally t
        WHERE (SUBSTRING(@List,t.N,1) = @Delimiter OR t.N = 0)
    )
    -- a piece runs to the next delimiter, or to the end of the list when there is none
    SELECT Item = SUBSTRING(@List, s.N1, ISNULL(NULLIF(CHARINDEX(@Delimiter,@List,s.N1),0)-s.N1,2147483647)),
           Position = s.N1,
           ItemNumber = ROW_NUMBER() OVER(ORDER BY s.N1)
    FROM cteStart s
);
Then there is no need for looping at all, you just have a proper set based solution by calling the split function twice to get your EAV style data set:
-- Set-based version: split each record on ',' into pairs, split every pair on '='
-- into key (ItemNumber 1) and value (ItemNumber 2), then pivot keys into columns.
-- (Variable sigils restored to @.)
DECLARE @WorkList TABLE (ID INT IDENTITY(1, 1) NOT NULL, TextField VARCHAR(MAX))
INSERT @WorkList (TextField)
SELECT '08:34:52 Checksum=180957248,TicketType=6,InitialUserType=G,InitialUserID=520,CommunicationType=Incoming,Date=26-03-2012,Time=08:35:00,Service=ST,Duration=00:00:14,Cost=0.12'
UNION
SELECT '08:34:52 Checksum=180957249,TicketType=5,InitialUserType=H,InitialUserID=521,CommunicationType=Outgoing,Date=27-03-2012,Time=14:27:00,Service=ST,Duration=00:15:12,Cost=0.37';
WITH KeyPairs AS
(   SELECT w.ID,
           -- everything up to and including the first space is the timestamp
           [Timestamp] = LEFT(w.TextField, CHARINDEX(' ', w.TextField)),
           KeyField = MAX(CASE WHEN v.ItemNumber = 1 THEN v.Item END),
           ValueField = MAX(CASE WHEN v.ItemNumber = 2 THEN v.Item END)
    FROM @WorkList AS w
    -- kp: one row per 'key=value' pair of the record (timestamp removed first)
    CROSS APPLY dbo.Split(SUBSTRING(TextField, CHARINDEX(' ', TextField) + 1, LEN(TextField)), ',') AS kp
    -- v: the two halves of the pair
    CROSS APPLY dbo.Split(kp.Item, '=') AS v
    -- one group per pair of each record
    GROUP BY w.ID, kp.ItemNumber, w.TextField
)
SELECT *
FROM KeyPairs AS kp
PIVOT
( MAX(ValueField)
  FOR KeyField IN
  ( [Checksum], [TicketType], [InitialUserType],
    [InitialUserID], [CommunicationType], [Date], [Time],
    [Service], [Duration], [Cost]
  )
) AS pvt;

SQL Server 2005 Find the all the characters in between ##----## of column row

I have a table Template with following columns:
ID, [Text]
Suppose I have two records in a Template table,
1, 'His full name is ##Full_Name##,he was born on ##Date_of_Birth## in city of ##Place_of_Birth##'
2, 'I am ##First_Name##, I was born on ##Date_of_Birth##'
I want to find ##Full_Name##, ##Date_of_Birth## and ##Place_of_Birth## in a row.
In the other row there can be two match result which are ##First_Name## and ##Date_of_Birth## .
The output of the two records would be like that:
1, Full_Name, Date_of_Birth, Place_of_Birth
2, First_Name, Date_of_Birth
Can anyone help me?
Here it is:
-- Extracts every token enclosed in a pair of @Delimiter markers from @Text.
-- Fixes: variable sigils restored to @; the unused "numbers" CTE was removed.
DECLARE @Text NVARCHAR(MAX)
SET @Text ='His full name is ##Full_Name##,he was born on ##Date_of_Birth## in city of ##Place_of_Birth##'
DECLARE @Delimiter VARCHAR(1000)
SET @Delimiter = '##';
WITH
-- counts: position (began) of every delimiter occurrence, numbered 1, 2, 3, ...
counts ( began, number )
AS ( SELECT CHARINDEX(@Delimiter, @Text, 0)
          , 1
     UNION ALL
     SELECT CHARINDEX(@Delimiter, @Text, began + 1)
          , number + 1
     FROM counts
     WHERE CHARINDEX(@Delimiter, @Text, began + 1) > 0
   ) ,
-- subs: pair each odd-numbered (opening) delimiter with the following (closing) one;
-- began = first character after the opening delimiter, ends = start of the closing one
subs ( began, ends, number )
AS ( SELECT C.began + LEN(@Delimiter)
          , C2.began
          , C.number
     FROM counts C
     INNER JOIN counts C2 ON C2.number = C.number + 1
     WHERE C.number % 2 = 1
   )
SELECT SUBSTRING(@Text, began, ( ends - began )) Word
FROM subs
You could implement CLR Regex function.
Here is an example: Regular Expressions Make Pattern Matching And Data Extraction Easier

lucene, or sql fulltext?

I want to create a search website to search docs (all kinds of formats including pdf), images, videos, and audio. I also want to be able to filter my search results based on some criteria like author name, date, etc.
I'm doing this in .NET, so what's the easiest way to get up and running? SQL fulltext searching seems tempting because I'm familiar with sql, and plus since I want to filter my search results, it will be easy to store the filter fields for each item.
If your primary concern is getting it up and running quickly and easily, then SQL fulltext search is definitely the way to go.
Lucene.NET has its advantages, but it is by no means a walk in the park to set up correctly. The documentation is a bit lacking and there are a very limited number of examples on the web.
Stored procedure for snippets:
-- Full-text search over Commentary.Commentary that returns, per matching commentary
-- and primary title, a text snippet with the matched terms wrapped in a styled <span>.
--   @SearchTerm - free-text search expression
--   @Style      - CSS placed in the style attribute of the highlighting <span>
-- Fixes: variable/parameter sigils restored to @ (#match_docs is a real temp table and
-- keeps its #); three lines that were cut off in the source were completed:
-- "PRIMARY KEY" and the two OBJECT_ID(...) calls.
-- NOTE(review): N'dbo.Commentary' as the OBJECT_ID argument is reconstructed from the
-- table the procedure queries — confirm against the original.
CREATE PROCEDURE SimpleCommentar
    @SearchTerm nvarchar(100),
    @Style nvarchar(200)
AS
BEGIN
    -- Ids of the commentaries that match the search at all.
    CREATE TABLE #match_docs
    (
        doc_id bigint NOT NULL PRIMARY KEY
    );
    INSERT INTO #match_docs
    (
        doc_id
    )
    SELECT DISTINCT
        Commentary_ID
    FROM Commentary
    WHERE FREETEXT
    (
        Commentary,
        @SearchTerm,
        LANGUAGE N'English'
    );
    -- Identifiers needed to read the full-text index keywords of the Commentary column.
    DECLARE @db_id int = DB_ID(),
            @table_id int = OBJECT_ID(N'dbo.Commentary'),
            @column_id int =
            (
                SELECT
                    column_id
                FROM sys.columns
                WHERE object_id = OBJECT_ID(N'dbo.Commentary')
                  AND name = N'Commentary'
            );
    SELECT
        s.Commentary_ID,
        t.Title,
        -- MIN picks one snippet per commentary/title when several terms matched.
        MIN
        (
            N'...' + SUBSTRING
            (
                -- highlight the term, then cut a window around its position
                REPLACE
                (
                    c.Commentary,
                    s.Display_Term,
                    N'<span style="' + @Style + '">' + s.Display_Term + '</span>'
                ),
                s.Pos - 512,
                s.Length + 1024
            ) + N'...'
        ) AS Snippet
    FROM
    (
        -- s: one row per (commentary, indexed term) where the term is one of the
        -- word forms the full-text parser derives from the search term.
        SELECT DISTINCT
            c.Commentary_ID,
            w.Display_Term,
            -- position of the term as a whole word (non-letter on both sides)
            PATINDEX
            (
                N'%[^a-z]' + w.Display_Term + N'[^a-z]%',
                c.Commentary
            ) AS Pos,
            LEN(w.Display_Term) AS Length
        FROM sys.dm_fts_index_keywords_by_document
        (
            @db_id,
            @table_id
        ) w
        INNER JOIN dbo.Commentary c
            ON w.document_id = c.Commentary_ID
        WHERE w.column_id = @column_id
          AND EXISTS
          (
              SELECT 1
              FROM #match_docs m
              WHERE m.doc_id = w.document_id
          )
          AND EXISTS
          (
              -- NOTE(review): @SearchTerm is concatenated into the parser query;
              -- a double quote inside it would break the FORMSOF expression.
              SELECT 1
              FROM sys.dm_fts_parser
              (
                  N'FORMSOF(FREETEXT, "' + @SearchTerm + N'")',
                  1033,
                  0,
                  1
              ) p
              WHERE p.Display_Term = w.Display_Term
          )
    ) s
    INNER JOIN dbo.Commentary c
        ON s.Commentary_ID = c.Commentary_ID
    INNER JOIN dbo.Book_Commentary bc
        ON c.Commentary_ID = bc.Commentary_ID
    INNER JOIN dbo.Book_Title bt
        ON bc.Book_ID = bt.Book_ID
    INNER JOIN dbo.Title t
        ON bt.Title_ID = t.Title_ID
    WHERE t.Is_Primary_Title = 1
    GROUP BY
        s.Commentary_ID,
        t.Title;
    DROP TABLE #match_docs;
END;