Concatenate Rows into String with Many to Many Joins - sql

I have a query that is working for almost all my data scenarios...except one. I have a master table, a detail table (with one to many line items), and many description tables that join to the detail table (Code Descriptions, etc). I need to concatenate all detail records into one string value so that I have one record per master record. So if there's 3 details for the master record, all values need to be concatenated into one record to represent the master.
My issue comes when the Master record has multiple detail records but the detail records don't have a match to the description tables 1 to 1. For instance, in my scenario the master record has 3 detail records which only one of the detail records has a description record. So 1 to 3 to 1, Master to Detail to Description. This is causing an issue. When trying to concatenate the records the code is not working because of the NULL value created from the Detail to Description join. The only way I can seem to get this to work is to do a Distinct sub-query and then do my concatenation logic on the outside of that. I feel like there has to be a better way or that I am simply missing something. I provided example code below to show my issue. There are 3 selects that get run. The first is the flat result with all records from the joins. The second is my original logic showing the flaw. The third is a working version that I am hoping someone knows how to do better. I greatly appreciate any help with this issue.
DECLARE #Notification table(
SystemID int NOT NULL,
NotificationID int
);
DECLARE #NotificationItems table(
SystemID int NOT NULL,
NotificationID VARCHAR(100),
LineItem VARCHAR(100)
);
DECLARE #NotificationCauses table(
SystemID int NOT NULL,
NotificationID VARCHAR(100),
LineItem VARCHAR(100),
TestValue VARCHAR(100)
);
INSERT INTO #Notification
SELECT 40,1 UNION
SELECT 40,2 UNION
SELECT 40,3 UNION
SELECT 40,4
INSERT INTO #NotificationItems
SELECT 40,1,1 UNION
SELECT 40,1,2 UNION
SELECT 40,1,3 UNION
SELECT 40,2,1 UNION
SELECT 40,2,2 UNION
SELECT 40,3,1
INSERT INTO #NotificationCauses
SELECT 40,1,1,'Code_A' UNION
SELECT 40,2,1,'Code_B' UNION
SELECT 40,2,2,'Code_C' UNION
SELECT 40,3,1,'Code_D'
--SELECT *
--FROM #Notification
--SELECT *
--FROM #NotificationItems
SELECT *
FROM #Notification AS n
LEFT OUTER JOIN #NotificationItems AS ni
ON n.NotificationID = ni.NotificationID
AND n.SystemID = ni.SystemID
LEFT OUTER JOIN #NotificationCauses AS nc
ON ni.NotificationID = nc.NotificationID
AND ni.SystemID = nc.SystemID
AND ni.LineItem = nc.LineItem
SELECT DISTINCT n.SystemID, n.NotificationID
,SUBSTRING(
(
SELECT DISTINCT
CASE WHEN LTRIM(RTRIM(ni1.LineItem)) <> ISNULL('','') THEN ', '+ni1.LineItem ELSE '' END AS [text()]
FROM #NotificationItems AS ni1
WHERE ni1.SystemID = ni.SystemID AND ni1.NotificationID = ni.NotificationID --AND a1.LineItem = a.LineItem
ORDER BY 1
FOR XML PATH ('')
), 2, 1000) AS [LineItem]
,SUBSTRING(
(
SELECT DISTINCT
CASE WHEN LTRIM(RTRIM(nc1.TestValue)) <> ISNULL('','') THEN ', '+nc1.TestValue ELSE '' END AS [text()]
FROM #NotificationCauses AS nc1
WHERE nc1.SystemID = nc.SystemID AND nc1.NotificationID = nc.NotificationID --AND nc1.LineItem = nc.LineItem
ORDER BY 1
FOR XML PATH ('')
), 2, 1000) AS [TestValues]
FROM #Notification AS n
LEFT OUTER JOIN #NotificationItems AS ni
ON n.SystemID = ni.SystemID
AND n.NotificationID = ni.NotificationID
LEFT OUTER JOIN #NotificationCauses AS nc
ON ni.SystemID = nc.SystemID
AND ni.NotificationID = nc.NotificationID
AND ni.LineItem = nc.LineItem
SELECT DISTINCT SystemID, NotificationID
,SUBSTRING(
(
SELECT DISTINCT
CASE WHEN LTRIM(RTRIM(a1.LineItem)) <> ISNULL('','') THEN ', '+a1.LineItem ELSE '' END AS [text()]
FROM #NotificationItems AS a1
WHERE a1.SystemID = a.SystemID AND a1.NotificationID = a.NotificationID --AND a1.LineItem = a.LineItem
ORDER BY 1
FOR XML PATH ('')
), 2, 1000) AS [LineItem]
,SUBSTRING(
(
SELECT DISTINCT
CASE WHEN LTRIM(RTRIM(a1.TestValue)) <> ISNULL('','') THEN ', '+a1.TestValue ELSE '' END AS [text()]
FROM #NotificationCauses AS a1
WHERE a1.SystemID = a.SystemID AND a1.NotificationID = a.NotificationID --AND a1.LineItem = a.LineItem
ORDER BY 1
FOR XML PATH ('')
), 2, 1000) AS [TestValues]
FROM
(
SELECT DISTINCT n.NotificationID, n.SystemID, ni.LineItem, nc.TestValue
FROM #Notification AS n
LEFT OUTER JOIN #NotificationItems AS ni
ON n.SystemID = ni.SystemID
AND n.NotificationID = ni.NotificationID
LEFT OUTER JOIN #NotificationCauses AS nc
ON ni.SystemID = nc.SystemID
AND ni.NotificationID = nc.NotificationID
AND ni.LineItem = nc.LineItem
) AS a

I have cleared up the query and that's the result
SELECT n.SystemID, n.NotificationID
, SUBSTRING((SELECT COALESCE(', ' + ni.LineItem, '') [text()]
FROM NotificationItems AS ni
WHERE ni.SystemID = n.SystemID
AND ni.NotificationID = n.NotificationID
ORDER BY 1
FOR XML PATH ('')
), 2, 1000) AS [LineItem]
, SUBSTRING((SELECT COALESCE(', ' + nc.TestValue, '') [text()]
FROM NotificationItems AS ni
INNER JOIN NotificationCauses nc
ON ni.SystemID = nc.SystemID
AND ni.NotificationID = nc.NotificationID
AND ni.LineItem = nc.LineItem
WHERE ni.SystemID = n.SystemID
AND ni.NotificationID = n.NotificationID
ORDER BY 1
FOR XML PATH ('')
), 2, 1000) AS [TestValues]
FROM Notification n
Those are the "smell" founded:
the ISNULL('', ''), it really does nothing, replaced with blank string '' (more to come)
the FROM of the main select, there are three table when only one is really used, the unused two are removed, that needed an update of the FROM and WHERE condition subqueries
the DISTINCT of the subqueries, again it does nothing, stripped away
the CASE in the subqueries add a comma before a value, if the trimmed value is not null, that is exactly what COALESCE(', ' + value, '') does (if the trim is needed re-add it)
Everything else is only formatting the query my way, 'cause it's easier to read if it's formatted your way (something like review by refactoring)
Here is a SQLFiddle demo of the cleared query with the data provided

Related

How can I use two select statements in one view

This first select statement gives me a result which i want.
SELECT
b.num,
DATEPART(yyyy, b.dateofstatus) AS yearofinsert,
b.yearbegin,
ac.code,
ac.store,
FROM table1 b LEFT JOIN class ac
ON b.idclass = ac.id LEFT JOIN admin.dbo.DboObj s
ON s.id = b.idobj LEFT JOIN
But when I include this another one statement it gives me an error "problem near WITH", i know that problem is temporary table can't be included in view, but how can i pass that problem, is there any other solution instead of using tmp table?
;WITH tmp(idsubj,DataItem, subject) AS
(
SELECT
idsubj,
LEFT(subject, CHARINDEX(CHAR(10), subject + CHAR(10)) - 1),
STUFF(subject, 1, CHARINDEX(CHAR(10), subject + CHAR(10)), '')
FROM table1 WHERE idobj = 2
UNION all
SELECT
idsubj,
LEFT(subject, CHARINDEX(CHAR(10), subject + CHAR(10)) - 1),
STUFF(subject, 1, CHARINDEX(CHAR(10), subject + CHAR(10)), '')
FROM tmp
WHERE
subject > ''
)
SELECT
idsubj,
DataItem
INTO #a FROM tmp
ORDER BY idsubj
OPTION (maxrecursion 0)
SELECT replace(#a.DataItem,' ',' ') as allbooks, #a.idsubj
FROM #a
LEFT JOIN table1 ON #a.idsubj=table1.idsubj
LEFT JOIN table2 po ON #a.idsubj = po.idsubj
ORDER BY idsubj

Join table based on column value in SQL Server

Item table
id key code description
------------------------------
1 1 misc miscellaneous
2 1 med medicine
Miscellaneous table:
id code description
------------------------
1 misc1 miscellaneous
2 misc1 miscellaneous
Medicine table:
id code description
---------------------------
1 medicine1 medicine
2 medicine1 medicine
I have this table structure; my main table is the Item table and I want to JOIN the main table with other table based on the column value in main table. The column that determines the table to be joined is code. If code is misc join with misc table if value is med join with medicine table.
I know the basic JOIN of table like
SELECT *
FROM item
INNER JOIN miscellaneous ON item.key = miscellaneous.id
But I don't know how to join when there is a condition that will point to which table to JOIN
You can use left join. Something like this:
select i.*,
coalesce(mi.code, me.code) as code_1,
coalesce(mi.description, me.description) as description_1
from item i left join
miscellaneous mi
on mi.code = i.key and i.code = 'misc' left join
medicine me
on me.code = i.key and i.code = 'med';
You can try a LEFT JOIN, that will be the easiest method to implement this.
But if you want both the table result to come under one column name, use UNION ALL
Using UNION ALL
SELECT *
FROM item i
INNER JOIN miscellaneous m on m.code=i.code
UNION ALL
SELECT *
FROM item i
INNER JOIN medicine me on me.code=i.code
Using LEFT JOIN
SELECT *
FROM item i
LEFT JOIN miscellaneous m on m.code=i.code
LEFT JOIN medicine me on me.code=i.code
Since Item is your 'base' / main table, you could match other tables with LEFT JOIN so that all rows from Item tables are present.
SELECT *
FROM Item AS i
LEFT JOIN Miscellaneous AS mi ON (mi.[id] = i.[key] AND i.code = 'misc')
LEFT JOIN Medicine AS me ON (me.[id] = i.[key] AND i.code = 'med');
You could also combine Miscellaneous & Medicine tables columns using ISNULL() function with some pre-condition
SELECT i.*
, ISNULL(mi.id, me.id) AS m_id
, ISNULL(mi.code, me.code) AS m_code
, ISNULL(mi.description, me.description) AS m_description
FROM Item AS i
LEFT JOIN Miscellaneous AS mi ON (mi.id = i.[key] AND i.code = 'misc')
LEFT JOIN Medicine AS me ON (me.id = i.[key] AND i.code = 'med');
SqlFiddle Demo
You can do this in a dynamic way
preparation of the data set for testing
IF (OBJECT_ID('item') IS NULL)
BEGIN
CREATE TABLE item (id int, [key] nvarchar(128), code nvarchar(128),
description nvarchar(512))
INSERT INTO item (id, [key] , code, description)
SELECT 1, 1, 'misc', 'miscellaneous' UNION ALL
SELECT 2, 1, 'med', 'medicine'UNION ALL
SELECT 3, 2, 'test not existing table', 'not_existing_table';
END
IF (OBJECT_ID('miscellaneous') IS NULL)
BEGIN
CREATE TABLEe miscellaneous (id int, code nvarchar(128), description
nvarchar(512))
INSERT INTO miscellaneous (id, code, description)
SELECT 1, 'misc1', 'miscellaneous' UNION ALL
SELECT 2, 'misc2', 'miscellaneous_xxx';
END
IF (OBJECT_ID('medicine') IS NULL)
BEGIN
CREATE TABLE medicine (id int, code nvarchar(128), description
nvarchar(512))
INSERT INTO medicine (id, code, description)
SELECT 1, 'medicine1', 'medicine' UNION ALL
SELECT 2, 'medicine2', 'medicine_xxx';
END
SQL Script
DECLARE #joins nvarchar(max) ='',#sql NVARCHAR(MAX) = ' SELECT * FROM item';
SELECT #joins = STUFF((SELECT ( CHAR(13)+CHAR(10) + ' LEFT JOIN ' +
description + ' ' + description + ' ON ' + description +'.id = item.[key] AND item.description = ''' + description + '''' )
FROM item i WHERE ISNULL(description,'') != '' AND
(select OBJECT_ID(description) ) IS NOT NULL
GROUP BY description
ORDER BY description
FOR XML PATH(''), TYPE ).value('.', 'NVARCHAR(MAX)') ,1,1,'');
IF(ISNULL(#joins,'') != '' )
BEGIN
SET #sql = #sql + #joins;
EXECUTE sp_executesql #sql;
PRINT #sql
END;

replace value in varchar(max) field with join

I have a table that contains text field with placeholders. Something like this:
Row Notes
1. This is some notes ##placeholder130## this ##myPlaceholder##, #oneMore#. End.
2. Second row...just a ##test#.
(This table contains about 1-5k rows on average. Average number of placeholders in one row is 5-15).
Now, I have a lookup table that looks like this:
Name Value
placeholder130 Dog
myPlaceholder Cat
oneMore Cow
test Horse
(Lookup table will contain anywhere from 10k to 100k records)
I need to find the fastest way to join those placeholders from strings to a lookup table and replace with value. So, my result should look like this (1st row):
This is some notes Dog this Cat, Cow. End.
What I came up with was to split each row into multiple for each placeholder and then join it to lookup table and then concat records back to original row with new values, but it takes around 10-30 seconds on average.
You could try to split the string using a numbers table and rebuild it with for xml path.
select (
select coalesce(L.Value, T.Value)
from Numbers as N
cross apply (select substring(Notes.notes, N.Number, charindex('##', Notes.notes + '##', N.Number) - N.Number)) as T(Value)
left outer join Lookup as L
on L.Name = T.Value
where N.Number <= len(notes) and
substring('##' + notes, Number, 2) = '##'
order by N.Number
for xml path(''), type
).value('text()[1]', 'varchar(max)')
from Notes
SQL Fiddle
I borrowed the string splitting from this blog post by Aaron Bertrand
SQL Server is not very fast with string manipulation, so this is probably best done client-side. Have the client load the entire lookup table, and replace the notes as they arrived.
Having said that, it can of course be done in SQL. Here's a solution with a recursive CTE. It performs one lookup per recursion step:
; with Repl as
(
select row_number() over (order by l.name) rn
, Name
, Value
from Lookup l
)
, Recurse as
(
select Notes
, 0 as rn
from Notes
union all
select replace(Notes, '##' + l.name + '##', l.value)
, r.rn + 1
from Recurse r
join Repl l
on l.rn = r.rn + 1
)
select *
from Recurse
where rn =
(
select count(*)
from Lookup
)
option (maxrecursion 0)
Example at SQL Fiddle.
Another option is a while loop to keep replacing lookups until no more are found:
declare #notes table (notes varchar(max))
insert #notes
select Notes
from Notes
while 1=1
begin
update n
set Notes = replace(n.Notes, '##' + l.name + '##', l.value)
from #notes n
outer apply
(
select top 1 Name
, Value
from Lookup l
where n.Notes like '%##' + l.name + '##%'
) l
where l.name is not null
if ##rowcount = 0
break
end
select *
from #notes
Example at SQL Fiddle.
I second the comment that tsql is just not suited for this operation, but if you must do it in the db here is an example using a function to manage the multiple replace statements.
Since you have a relatively small number of tokens in each note (5-15) and a very large number of tokens (10k-100k) my function first extracts tokens from the input as potential tokens and uses that set to join to your lookup (dbo.Token below). It was far too much work to look for an occurrence of any of your tokens in each note.
I did a bit of perf testing using 50k tokens and 5k notes and this function runs really well, completing in <2 seconds (on my laptop). Please report back how this strategy performs for you.
note: In your example data the token format was not consistent (##_#, ##_##, #_#), I am guessing this was simply a typo and assume all tokens take the form of ##TokenName##.
--setup
if object_id('dbo.[Lookup]') is not null
drop table dbo.[Lookup];
go
if object_id('dbo.fn_ReplaceLookups') is not null
drop function dbo.fn_ReplaceLookups;
go
create table dbo.[Lookup] (LookupName varchar(100) primary key, LookupValue varchar(100));
insert into dbo.[Lookup]
select '##placeholder130##','Dog' union all
select '##myPlaceholder##','Cat' union all
select '##oneMore##','Cow' union all
select '##test##','Horse';
go
create function [dbo].[fn_ReplaceLookups](#input varchar(max))
returns varchar(max)
as
begin
declare #xml xml;
select #xml = cast(('<r><i>'+replace(#input,'##' ,'</i><i>')+'</i></r>') as xml);
--extract the potential tokens
declare #LookupsInString table (LookupName varchar(100) primary key);
insert into #LookupsInString
select distinct '##'+v+'##'
from ( select [v] = r.n.value('(./text())[1]', 'varchar(100)'),
[r] = row_number() over (order by n)
from #xml.nodes('r/i') r(n)
)d(v,r)
where r%2=0;
--tokenize the input
select #input = replace(#input, l.LookupName, l.LookupValue)
from dbo.[Lookup] l
join #LookupsInString lis on
l.LookupName = lis.LookupName;
return #input;
end
go
return
--usage
declare #Notes table ([Id] int primary key, notes varchar(100));
insert into #Notes
select 1, 'This is some notes ##placeholder130## this ##myPlaceholder##, ##oneMore##. End.' union all
select 2, 'Second row...just a ##test##.';
select *,
dbo.fn_ReplaceLookups(notes)
from #Notes;
Returns:
Tokenized
--------------------------------------------------------
This is some notes Dog this Cat, Cow. End.
Second row...just a Horse.
Try this
;WITH CTE (org, calc, [Notes], [level]) AS
(
SELECT [Notes], [Notes], CONVERT(varchar(MAX),[Notes]), 0 FROM PlaceholderTable
UNION ALL
SELECT CTE.org, CTE.[Notes],
CONVERT(varchar(MAX), REPLACE(CTE.[Notes],'##' + T.[Name] + '##', T.[Value])), CTE.[level] + 1
FROM CTE
INNER JOIN LookupTable T ON CTE.[Notes] LIKE '%##' + T.[Name] + '##%'
)
SELECT DISTINCT org, [Notes], level FROM CTE
WHERE [level] = (SELECT MAX(level) FROM CTE c WHERE CTE.org = c.org)
SQL FIDDLE DEMO
Check the below devioblog post for reference
devioblog post
To get speed, you can preprocess the note templates into a more efficient form. This will be a sequence of fragments, with each ending in a substitution. The substitution might be NULL for the last fragment.
Notes
Id FragSeq Text SubsId
1 1 'This is some notes ' 1
1 2 ' this ' 2
1 3 ', ' 3
1 4 '. End.' null
2 1 'Second row...just a ' 4
2 2 '.' null
Subs
Id Name Value
1 'placeholder130' 'Dog'
2 'myPlaceholder' 'Cat'
3 'oneMore' 'Cow'
4 'test' 'Horse'
Now we can do the substitutions with a simple join.
SELECT Notes.Text + COALESCE(Subs.Value, '')
FROM Notes LEFT JOIN Subs
ON SubsId = Subs.Id WHERE Notes.Id = ?
ORDER BY FragSeq
This produces a list of fragments with substitutions complete. I am not an MSQL user, but in most dialects of SQL you can concatenate these fragments in a variable quite easily:
DECLARE #Note VARCHAR(8000)
SELECT #Note = COALESCE(#Note, '') + Notes.Text + COALSCE(Subs.Value, '')
FROM Notes LEFT JOIN Subs
ON SubsId = Subs.Id WHERE Notes.Id = ?
ORDER BY FragSeq
Pre-processing a note template into fragments will be straightforward using the string splitting techniques of other posts.
Unfortunately I'm not at a location where I can test this, but it ought to work fine.
I really don't know how it will perform with 10k+ of lookups.
how does the old dynamic SQL performs?
DECLARE #sqlCommand NVARCHAR(MAX)
SELECT #sqlCommand = N'PlaceholderTable.[Notes]'
SELECT #sqlCommand = 'REPLACE( ' + #sqlCommand +
', ''##' + LookupTable.[Name] + '##'', ''' +
LookupTable.[Value] + ''')'
FROM LookupTable
SELECT #sqlCommand = 'SELECT *, ' + #sqlCommand + ' FROM PlaceholderTable'
EXECUTE sp_executesql #sqlCommand
Fiddle demo
And now for some recursive CTE.
If your indexes are correctly set up, this one should be very fast or very slow. SQL Server always surprises me with performance extremes when it comes to the r-CTE...
;WITH T AS (
SELECT
Row,
StartIdx = 1, -- 1 as first starting index
EndIdx = CAST(patindex('%##%', Notes) as int), -- first ending index
Result = substring(Notes, 1, patindex('%##%', Notes) - 1)
-- (first) temp result bounded by indexes
FROM PlaceholderTable -- **this is your source table**
UNION ALL
SELECT
pt.Row,
StartIdx = newstartidx, -- starting index (calculated in calc1)
EndIdx = EndIdx + CAST(newendidx as int) + 1, -- ending index (calculated in calc4 + total offset)
Result = Result + CAST(ISNULL(newtokensub, newtoken) as nvarchar(max))
-- temp result taken from subquery or original
FROM
T
JOIN PlaceholderTable pt -- **this is your source table**
ON pt.Row = T.Row
CROSS APPLY(
SELECT newstartidx = EndIdx + 2 -- new starting index moved by 2 from last end ('##')
) calc1
CROSS APPLY(
SELECT newtxt = substring(pt.Notes, newstartidx, len(pt.Notes))
-- current piece of txt we work on
) calc2
CROSS APPLY(
SELECT patidx = patindex('%##%', newtxt) -- current index of '##'
) calc3
CROSS APPLY(
SELECT newendidx = CASE
WHEN patidx = 0 THEN len(newtxt) + 1
ELSE patidx END -- if last piece of txt, end with its length
) calc4
CROSS APPLY(
SELECT newtoken = substring(pt.Notes, newstartidx, newendidx - 1)
-- get the new token
) calc5
OUTER APPLY(
SELECT newtokensub = Value
FROM LookupTable
WHERE Name = newtoken -- substitute the token if you can find it in **your lookup table**
) calc6
WHERE newstartidx + len(newtxt) - 1 <= len(pt.Notes)
-- do this while {new starting index} + {length of txt we work on} exceeds total length
)
,lastProcessed AS (
SELECT
Row,
Result,
rn = row_number() over(partition by Row order by StartIdx desc)
FROM T
) -- enumerate all (including intermediate) results
SELECT *
FROM lastProcessed
WHERE rn = 1 -- filter out intermediate results (display only last ones)

initialize and increment variable inside cte query sqlserver 2008

I am using sqlserver 2008 ,I want to initialize and increment variable (#NUMTwo) both at the same time, in my second part(Problem Line).
I am creating a cte query.
Is this possible , if yes then please let me know.
following is a sample example.I hope i am clear.
CREATE table #TempTable
(
childProductID INT,parentProductID INT,productModel varchar(50),[Num2] VARCHAR(100)
)
DECLARE #NUMTwo INT = 0
WITH tableR AS
(
-- First Part
SELECT childProductID = null,parentProductID=null,productModel from Products where productid in (#a),[Num2] = convert(varchar(100), '')
UNION ALL
--Second Part
SELECT e.childProductID,e.parentProductID,prd.productModel FROM ProductIncludes AS e
,[Num2] = convert(varchar(100),'1.' + #NUMTwo+=1 ) -- Problem line
INNER JOIN Products AS PRD ON e.childProductID = PRD.productID
WHERE parentProductID in (#a)
)
INSERT INTO #TempTable(childProductID,parentProductID,productModel,[Num2])
SELECT childProductID,parentProductID,productModel,[Num2]
END
SELECT * FROM #TempTable
You need to "Initialize" a column in the acnhor part of the query, and then "oncrement" this column in the recursive parts.
Something like
DECLARE #NUMTwo INT = 0
;WITH Test AS (
SELECT [Num2] = convert(varchar(MAX), ''),
#NUMTwo [N]
UNION ALL
SELECT [Num2] = '1.' + convert(varchar(MAX),[N]+1),
[N]+1
FROM TEst
WHERE [N] < 10
)
SELECT *
FROM Test
SQL Fiddle DEMO
If the parameter #NUMTwo is just for numbering rows you can use the ROW_NUMBER() OVER(...) instead of it like so:
WITH tableR AS
(
SELECT childProductID = NULL, parentProductID = NULL,
productModel, NUMTwo = CAST('0' AS VARCHAR(10))
FROM Products
WHERE
productid in (#a),
[Num2] = convert(varchar(100), '')
UNION ALL
SELECT e.childProductID, e.parentProductID,
prd.productModel,
NUMTwo = '1.' +
CAST( ROW_NUMBER() OVER(ORDER BY (SELECT 0)) AS VARCHAR(10))
FROM ProductIncludes AS e
INNER JOIN Products AS PRD ON e.childProductID = PRD.productID
WHERE parentProductID in (#a)
)

Join [one word per row] to rows of phrases with [multiple words per row]

Please excuse the length of the question. I included a test script to demo the situation and my best attempt at a solution.
There are two tables:
test_WORDS = Words extracted in order from several sources. The OBJ_FK column is the ID of the source. WORD_ID is an identifier for the word itself that is unique within the source. Each row contains one word.
test_PHRASE = a list of phrases to be searched for in test_WORDS. The PHRASE_TEXT column is a space separated phrase like 'foo bar' (see below) so that each row contains multiple words.
Requirement:
Return the first word from test_WORDS that is the start of a matching a phrase from test_PHRASE.
I would prefer something set based to avoid RBAR approach below. Also my solution is limited to 5 word phrases. I need to support up to 20 word phrases. Is it possible to match the words from a row in test_PHRASE to contiguous rows in the test_WORD without cursors?
After breaking the phrase words out into a temporary table, the problem boils down to matching portions of two sets together in row order.
-- Create test data
CREATE TABLE [dbo].[test_WORDS](
[OBJ_FK] [bigint] NOT NULL, --FK to the source object
[WORD_ID] [int] NOT NULL, --The word order in the source object
[WORD_TEXT] [nvarchar](50) NOT NULL,
CONSTRAINT [PK_test_WORDS] PRIMARY KEY CLUSTERED
(
[OBJ_FK] ASC,
[WORD_ID] ASC
)
) ON [PRIMARY]
GO
CREATE TABLE [dbo].[test_PHRASE](
[ID] [int], --PHRASE ID
[PHRASE_TEXT] [nvarchar](150) NOT NULL --Space-separated phrase
CONSTRAINT [PK_test_PHRASE] PRIMARY KEY CLUSTERED
(
[ID] ASC
)
)
GO
INSERT INTO dbo.test_WORDS
SELECT 1,1,'aaa' UNION ALL
SELECT 1,2,'bbb' UNION ALL
SELECT 1,3,'ccc' UNION ALL
SELECT 1,4,'ddd' UNION ALL
SELECT 1,5,'eee' UNION ALL
SELECT 1,6,'fff' UNION ALL
SELECT 1,7,'ggg' UNION ALL
SELECT 1,8,'hhh' UNION ALL
SELECT 2,1,'zzz' UNION ALL
SELECT 2,2,'yyy' UNION ALL
SELECT 2,3,'xxx' UNION ALL
SELECT 2,4,'www'
INSERT INTO dbo.test_PHRASE
SELECT 1, 'bbb ccc ddd' UNION ALL --should match
SELECT 2, 'ddd eee fff' UNION ALL --should match
SELECT 3, 'xxx xxx xxx' UNION ALL --should NOT match
SELECT 4, 'zzz yyy xxx' UNION ALL --should match
SELECT 5, 'xxx www ppp' UNION ALL --should NOT match
SELECT 6, 'zzz yyy xxx www' --should match
-- Create variables
DECLARE #maxRow AS INTEGER
DECLARE #currentRow AS INTEGER
DECLARE #phraseSubsetTable AS TABLE(
[ROW] int IDENTITY(1,1) NOT NULL,
[ID] int NOT NULL, --PHRASE ID
[PHRASE_TEXT] nvarchar(150) NOT NULL
)
--used to split the phrase into words
--note: No permissions to sys.dm_fts_parser
DECLARE #WordList table
(
ID int,
WORD nvarchar(50)
)
--Records to be returned to caller
DECLARE #returnTable AS TABLE(
OBJECT_FK INT NOT NULL,
WORD_ID INT NOT NULL,
PHRASE_ID INT NOT NULL
)
DECLARE #phrase AS NVARCHAR(150)
DECLARE #phraseID AS INTEGER
-- Get subset of phrases to simulate a join that would occur in production
INSERT INTO #phraseSubsetTable
SELECT ID, PHRASE_TEXT
FROM dbo.test_PHRASE
--represent subset of phrases caused by join in production
WHERE ID IN (2,3,4)
-- Loop each phrase in the subset, split into rows of words and return matches to the test_WORDS table
SET #maxRow = ##ROWCOUNT
SET #currentRow = 1
WHILE #currentRow <= #maxRow
BEGIN
SELECT #phrase=PHRASE_TEXT, #phraseID=ID FROM #phraseSubsetTable WHERE row = #currentRow
--clear previous phrase that was split into rows
DELETE FROM #WordList
--Recursive Function with CTE to create recordset of words, one per row
;WITH Pieces(pn, start, stop) AS (
SELECT 1, 1, CHARINDEX(' ', #phrase)
UNION ALL
SELECT pn + 1, stop + 1, CHARINDEX(' ', #phrase, stop + 1)
FROM Pieces
WHERE stop > 0)
--Create the List of words with the CTE above
insert into #WordList
SELECT pn,
SUBSTRING(#phrase, start, CASE WHEN stop > 0 THEN stop-start ELSE 1056 END) AS WORD
FROM Pieces
DECLARE #wordCt as int
select #wordCt=count(ID) from #WordList;
-- Do the actual query using a CTE with a rownumber that repeats for every SOURCE OBJECT
;WITH WordOrder_CTE AS (
SELECT OBJ_FK, WORD_ID, WORD_TEXT,
ROW_NUMBER() OVER (Partition BY OBJ_FK ORDER BY WORD_ID) AS rownum
FROM test_WORDS)
--CREATE a flattened record of the first word in the phrase and join it to the rest of the words.
INSERT INTO #returnTable
SELECT r1.OBJ_FK, r1.WORD_ID, #phraseID AS PHRASE_ID
FROM WordOrder_CTE r1
INNER JOIN #WordList w1 ON r1.WORD_TEXT = w1.WORD and w1.ID=1
LEFT JOIN WordOrder_CTE r2
ON r1.rownum = r2.rownum - 1 and r1.OBJ_FK = r2.OBJ_FK
LEFT JOIN #WordList w2 ON r2.WORD_TEXT = w2.WORD and w2.ID=2
LEFT JOIN WordOrder_CTE r3
ON r1.rownum = r3.rownum - 2 and r1.OBJ_FK = r3.OBJ_FK
LEFT JOIN #WordList w3 ON r3.WORD_TEXT = w3.WORD and w3.ID=3
LEFT JOIN WordOrder_CTE r4
ON r1.rownum = r4.rownum - 3 and r1.OBJ_FK = r4.OBJ_FK
LEFT JOIN #WordList w4 ON r4.WORD_TEXT = w4.WORD and w4.ID=4
LEFT JOIN WordOrder_CTE r5
ON r1.rownum = r5.rownum - 4 and r1.OBJ_FK = r5.OBJ_FK
LEFT JOIN #WordList w5 ON r5.WORD_TEXT = w5.WORD and w5.ID=5
WHERE (#wordCt < 2 OR w2.ID is not null) and
(#wordCt < 3 OR w3.ID is not null) and
(#wordCt < 4 OR w4.ID is not null) and
(#wordCt < 5 OR w5.ID is not null)
--loop
SET #currentRow = #currentRow+1
END
--Return the first words of each matching phrase
SELECT OBJECT_FK, WORD_ID, PHRASE_ID FROM #returnTable
GO
--Clean up
DROP TABLE [dbo].[test_WORDS]
DROP TABLE [dbo].[test_PHRASE]
Edited solution:
This is an edit of the correct solution provided below to account for non-contiguous word IDs. Hope this helps someone as much as it did me.
;WITH
numberedwords AS (
SELECT
OBJ_FK,
WORD_ID,
WORD_TEXT,
rowcnt = ROW_NUMBER() OVER
(PARTITION BY OBJ_FK ORDER BY WORD_ID DESC),
totalInSrc = COUNT(WORD_ID) OVER (PARTITION BY OBJ_FK)
FROM dbo.test_WORDS
),
phrasedwords AS (
SELECT
nw1.OBJ_FK,
nw1.WORD_ID,
nw1.WORD_TEXT,
PHRASE_TEXT = RTRIM((
SELECT [text()] = nw2.WORD_TEXT + ' '
FROM numberedwords nw2
WHERE nw1.OBJ_FK = nw2.OBJ_FK
AND nw2.rowcnt BETWEEN nw1.rowcnt AND nw1.totalInSrc
ORDER BY nw2.OBJ_FK, nw2.WORD_ID
FOR XML PATH ('')
))
FROM numberedwords nw1
GROUP BY nw1.OBJ_FK, nw1.WORD_ID, nw1.WORD_TEXT, nw1.rowcnt, nw1.totalInSrc
)
SELECT *
FROM phrasedwords pw
INNER JOIN test_PHRASE tp
ON LEFT(pw.PHRASE_TEXT, LEN(tp.PHRASE_TEXT)) = tp.PHRASE_TEXT
ORDER BY pw.OBJ_FK, pw.WORD_ID
Note: The final query I used in production uses indexed temp tables instead of CTEs. I also limited the length of the PHRASE_TEXT column to my needs. With these improvements, I was able to reduce my query time from over 3 minutes to 3 seconds!
Here's a solution that uses a different approach: instead of splitting the phrases into words it combines the words into phrases.
Edited: changed the rowcnt expression to using COUNT(*) OVER …, as suggested by #ErikE in the comments.
;WITH
numberedwords AS (
SELECT
OBJ_FK,
WORD_ID,
WORD_TEXT,
rowcnt = COUNT(*) OVER (PARTITION BY OBJ_FK)
FROM dbo.test_WORDS
),
phrasedwords AS (
SELECT
nw1.OBJ_FK,
nw1.WORD_ID,
nw1.WORD_TEXT,
PHRASE_TEXT = RTRIM((
SELECT [text()] = nw2.WORD_TEXT + ' '
FROM numberedwords nw2
WHERE nw1.OBJ_FK = nw2.OBJ_FK
AND nw2.WORD_ID BETWEEN nw1.WORD_ID AND nw1.rowcnt
ORDER BY nw2.OBJ_FK, nw2.WORD_ID
FOR XML PATH ('')
))
FROM numberedwords nw1
GROUP BY nw1.OBJ_FK, nw1.WORD_ID, nw1.WORD_TEXT, nw1.rowcnt
)
SELECT *
FROM phrasedwords pw
INNER JOIN test_PHRASE tp
ON LEFT(pw.PHRASE_TEXT, LEN(tp.PHRASE_TEXT)) = tp.PHRASE_TEXT
ORDER BY pw.OBJ_FK, pw.WORD_ID
Using a Split function should work.
Split Function
CREATE FUNCTION dbo.Split
(
#RowData nvarchar(2000),
#SplitOn nvarchar(5)
)
RETURNS #RtnValue table
(
Id int identity(1,1),
Data nvarchar(100)
)
AS
BEGIN
Declare #Cnt int
Set #Cnt = 1
While (Charindex(#SplitOn,#RowData)>0)
Begin
Insert Into #RtnValue (data)
Select
Data = ltrim(rtrim(Substring(#RowData,1,Charindex(#SplitOn,#RowData)-1)))
Set #RowData = Substring(#RowData,Charindex(#SplitOn,#RowData)+1,len(#RowData))
Set #Cnt = #Cnt + 1
End
Insert Into #RtnValue (data)
Select Data = ltrim(rtrim(#RowData))
Return
END
SQL Statement
SELECT DISTINCT p.*
FROM dbo.test_PHRASE p
LEFT OUTER JOIN (
SELECT p.ID
FROM dbo.test_PHRASE p
CROSS APPLY dbo.Split(p.PHRASE_TEXT, ' ') sp
LEFT OUTER JOIN dbo.test_WORDS w ON w.WORD_TEXT = sp.Data
WHERE w.OBJ_FK IS NULL
) ignore ON ignore.ID = p.ID
WHERE ignore.ID IS NULL
This performs a little better than other solutions given. if you don't need WORD_ID, just WORD_TEXT, you can remove a whole column. I know this was over a year ago, but I wonder if you can get 3 seconds down to 30 ms? :)
If this query seems good, then my biggest speed advice is to put the entire phrases into a separate table (using your example data, it would have only 2 rows with phrases of length 8 words and 4 words).
SELECT
W.OBJ_FK,
X.Phrase,
P.*,
Left(P.PHRASE_TEXT,
IsNull(NullIf(CharIndex(' ', P.PHRASE_TEXT), 0) - 1, 2147483647)
) WORD_TEXT,
Len(Left(X.Phrase, PatIndex('%' + P.PHRASE_TEXT + '%', ' ' + X.Phrase) - 1))
- Len(Replace(
Left(X.Phrase, PatIndex('%' + P.PHRASE_TEXT + '%', X.Phrase) - 1), ' ', '')
)
WORD_ID
FROM
(SELECT DISTINCT OBJ_FK FROM dbo.test_WORDS) W
CROSS APPLY (
SELECT RTrim((SELECT WORD_TEXT + ' '
FROM dbo.test_WORDS W2
WHERE W.OBJ_FK = W2.OBJ_FK
ORDER BY W2.WORD_ID
FOR XML PATH (''))) Phrase
) X
INNER JOIN dbo.test_PHRASE P
ON X.Phrase LIKE '%' + P.PHRASE_TEXT + '%';
Here's another version for curiosity's sake. It doesn't perform quite as well.
WITH Calc AS (
SELECT
P.ID,
P.PHRASE_TEXT,
W.OBJ_FK,
W.WORD_ID StartID,
W.WORD_TEXT StartText,
W.WORD_ID,
Len(W.WORD_TEXT) + 2 NextPos,
Convert(varchar(150), W.WORD_TEXT) MatchingPhrase
FROM
dbo.test_PHRASE P
INNER JOIN dbo.test_WORDS W
ON P.PHRASE_TEXT + ' ' LIKE W.WORD_TEXT + ' %'
UNION ALL
SELECT
C.ID,
C.PHRASE_TEXT,
C.OBJ_FK,
C.StartID,
C.StartText,
W.WORD_ID,
C.NextPos + Len(W.WORD_TEXT) + 1,
Convert(varchar(150), C.MatchingPhrase + Coalesce(' ' + W.WORD_TEXT, ''))
FROM
Calc C
INNER JOIN dbo.test_WORDS W
ON C.OBJ_FK = W.OBJ_FK
AND C.WORD_ID + 1 = W.WORD_ID
AND Substring(C.PHRASE_TEXT, C.NextPos, 2147483647) + ' ' LIKE W.WORD_TEXT + ' %'
)
SELECT C.OBJ_FK, C.PHRASE_TEXT, C.StartID, C.StartText, C.ID
FROM Calc C
WHERE C.PHRASE_TEXT = C.MatchingPhrase;