How to split value to another column? - sql

I have data input and want output in picture. How do I to write it?
i can run in sql server for this but i get only width column( w_pacakaging) , but i dont know to write get value length or high from column text (db name : [Packing list text])
my code is
select [Packing list text],[Grade],[sales doc no],
case when [Packing list text] like'%:%' then
cast(Ltrim(Rtrim(SUBSTRING([Packing list text], charindex(':', [Packing list text]) + 1, charindex('"', [Packing list text])
- (charindex(':', [Packing list text]) + 1)))) as [nvarchar] )
END AS [W_packaging]
from [TPC_CRSYS].[dbo].[TotalOrder_Export]
the result from sql server
enter image description here
How could you advice me for write sql to get data for length and heigh from table [TotalOrder_Export] because it has more than one x in each record
could anyone advice me

the easy way using CTE and CHARINDEX:
;with
T AS ( -- select the base table, adding an index for performances pourpose
SELECT *, ROW_NUMBER() OVER (ORDER BY [sales doc no],[Packing list text]) ROW_IDX
FROM [TPC_CRSYS].[dbo].[TotalOrder_Export]
),
W AS (-- SELECT ONLY ROWS WITH VALID DIMENSIONS AND CALC WIDTH POSITION
SELECT *, CHARINDEX('(WXLXH): ', [Packing list text])+9 W_POS
FROM T
WHERE CHARINDEX('(WXLXH): ', [Packing list text])>0
),
L AS (-- CALC LENGTH POSITION
SELECT *, CHARINDEX('"', [Packing list text], W_POS)+2 L_POS
FROM W
),
H AS (-- CALC HEIGHT POSITION
SELECT *, CHARINDEX('"', [Packing list text], L_POS)+2 H_POS
FROM L
),
X AS (-- CALC
SELECT *, CHARINDEX('"', [Packing list text], H_POS) END_POS
FROM H
)
SELECT T.[Packing list text], T.grade, T.[sales doc no]
, SUBSTRING(T.[Packing list text], W_POS, L_POS-2-W_POS) W
, SUBSTRING(T.[Packing list text], L_POS, H_POS-2-L_POS) L
, SUBSTRING(T.[Packing list text], H_POS, END_POS-H_POS) H
FROM T
LEFT JOIN X ON T.ROW_IDX = X.ROW_IDX
alternatively, you can use OUTER APPLY and a split (FN_SPLIT) function to get w/h/l in rows for each [Packing list text], and then PIVOT them back to one single line:
;with
T AS ( -- select the base table, adding an index for performances pourpose
SELECT *, ROW_NUMBER() OVER (ORDER BY [sales doc no], [Packing list text]) ROW_IDX
FROM [TPC_CRSYS].[dbo].[TotalOrder_Export]
),
W AS (-- SELECT ONLY ROWS WITH VALID DIMENSIONS AND CALC WIDTH POSITION
SELECT *, CHARINDEX('(WXLXH): ', [Packing list text])+9 W_POS
FROM T
WHERE CHARINDEX('(WXLXH): ', [Packing list text])>0
),
spl as (
select W.ROW_IDX, d.*
from w
outer apply (
select ID, SUBSTRING(data,2,100) VAL
from FN_SPLIT(SUBSTRING(w.[Packing list text], w_pos-1, 100),'"') s
where s.id<=3
) d
),
X AS (
SELECT *
FROM spl
PIVOT (MIN(VAL) FOR ID IN ([1],[2],[3])) P
)
SELECT T.[Packing list text], T.grade, T.[sales doc no], X.[1] W, X.[2] L, X.[3] H
FROM T
LEFT JOIN X ON T.ROW_IDX = X.ROW_IDX
this second version is faster if you have many rows in your [TotalOrder_Export] table
for the split function, you will find many versions around, I made this:
CREATE FUNCTION [dbo].[FN_SPLIT]
(
#Line varchar(8000),
#SplitOn varchar(10) = ','
)
RETURNS #RtnValue table
(
Id INT NOT NULL IDENTITY(1,1) PRIMARY KEY CLUSTERED, -- VERY USEFUL TO MAKE JOINS AND TO CREATE THE UNIQUE INDEX ON SPLITTED STRINGS
Data varchar(800) NOT NULL,
UNIQUE (DATA, ID) -- THIS WILL MAKE REALLY FASTER THE QUERIES USING THIS FUNCTION
)
AS
BEGIN
IF #Line IS NULL RETURN
DECLARE #split_on_len INT = LEN(#SplitOn+'X')-1 -- TO CATCH TRAILING SPACES PROBLEM WITH LEN()
DECLARE #line_len INT = LEN(#line+'X')-1
DECLARE #start_at INT = 1
DECLARE #end_at INT
DECLARE #data_len INT
WHILE 1=1
BEGIN
IF #start_at > #line_len BREAK;
SET #end_at = CHARINDEX(#SplitOn, #Line, #start_at)
SET #data_len = CASE #end_at WHEN 0 THEN #line_len ELSE #end_at-#start_at END
INSERT INTO #RtnValue (data) VALUES( SUBSTRING(#Line,#start_at,#data_len));
SET #start_at = #start_at + #data_len + #split_on_len
END
RETURN
END
I hope this helps

Related

How to replace anything between 2 specific characters in SQL Server

I'm trying to replace anything between 2 specific characters in a string that contains multiples of those 2 caracters. Take it as a csv format.
Here an example of what i got as data in that field:
0001, ABCD1234;0002, EFGH432562;0003, IJKL1345hsth;...
What I need to retreive from it is all parts before the ',' but not what are between ',' and ';'
I tried with those formula but no success
SELECT REPLACE(fieldname, ',[A-Z];', ' ') FROM ...
or
SELECT REPLACE(fieldname, ',*;', ' ') FROM ...
I need to get
0001 0002 0003
Is there a way to achieve that?
You can CROSS APPLY to a STRING_SPLIT that uses STRING_AGG (since Sql Server 2017) to stick the numbers back together.
select id, codes
from your_table
cross apply (
select string_agg(left(value, patindex('%_,%', value)), ' ') as codes
from string_split(fieldname, ';') s
where value like '%_,%'
) ca;
GO
id
codes
1
0001 0002 0003
Demo on db<>fiddle here
Extra
Here is a version that also works in Sql Server 2014.
Inspired by the research from #AaronBertrand
The UDF uses a recursive CTE to split the string.
And the FOR XML trick is used to stick the numbers back together.
CREATE FUNCTION dbo.fnString_Split
(
#str nvarchar(4000),
#delim nchar(1)
)
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN
(
WITH RCTE AS (
SELECT
1 AS ordinal
, ISNULL(NULLIF(CHARINDEX(#delim, #str),0), LEN(#str)) AS pos
, LEFT(#str, ISNULL(NULLIF(CHARINDEX(#delim, #str),0)-1, LEN(#str))) AS value
UNION ALL
SELECT
ordinal+1
, ISNULL(NULLIF(CHARINDEX(#delim, #str, pos+1), 0), LEN(#str))
, SUBSTRING(#str, pos+1, ISNULL(NULLIF(CHARINDEX(#delim, #str, pos+1),0)-pos-1, LEN(#str)-pos ))
FROM RCTE
WHERE pos < LEN(#str)
)
SELECT ordinal, value
FROM RCTE
);
SELECT id, codes
FROM your_table
CROSS APPLY (
SELECT RTRIM((
SELECT LEFT(value, PATINDEX('%_,%', value))+' '
FROM dbo.fnString_Split(fieldname, ';') AS spl
WHERE value LIKE '%_,%'
ORDER BY ordinal
FOR XML PATH(''), TYPE).value(N'./text()[1]', N'nvarchar(max)')
) AS codes
) ca
OPTION (MAXRECURSION 250);
id
codes
1
0001 0002 0003
Demo on db<>fiddle here
Alternative version of the UDF (no recursion)
CREATE FUNCTION dbo.fnString_Split
(
#str NVARCHAR(4000),
#delim NCHAR(1)
)
RETURNS #tbl TABLE (ordinal INT, value NVARCHAR(4000))
WITH SCHEMABINDING
AS
BEGIN
DECLARE #value NVARCHAR(4000)
, #pos INT = 0
, #ordinal INT = 0;
WHILE (LEN(#str) > 0)
BEGIN
SET #ordinal += 1;
SET #pos = ISNULL(NULLIF(CHARINDEX(#delim, #str),0), LEN(#str)+1);
SET #value = LEFT(#str, #pos-1);
SET #str = SUBSTRING(#str, #pos+1, LEN(#str));
INSERT INTO #tbl (ordinal, value)
VALUES (#ordinal, #value);
END;
RETURN;
END;
If you're on SQL Server 2017 and don't need a guarantee that the order will be maintained, then LukStorms' answer is perfectly adequate.
However, if you:
care about an order guarantee; or,
are on an older version than 2017 (and can't use STRING_AGG); or,
are on an even older version than 2016 or are in an older compatibility level (and can't use STRING_SPLIT):
Here's an ordered split function that can help (it's long and ugly but you only have to create it once):
CREATE FUNCTION dbo.SplitOrdered
(
#list nvarchar(max),
#delim nvarchar(10)
)
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN
(
WITH w(n) AS (SELECT 0 FROM (VALUES (0),(0),(0),(0)) w(n)),
k(n) AS (SELECT 0 FROM w a, w b),
r(n) AS (SELECT 0 FROM k a, k b, k c, k d, k e, k f, k g, k h),
p(n) AS (SELECT TOP (COALESCE(LEN(#list), 0))
ROW_NUMBER() OVER (ORDER BY ##SPID) -1 FROM r),
spots(p) AS
(
SELECT n FROM p
WHERE (SUBSTRING(#list, n, LEN(#delim + 'x') - 1) LIKE #delim OR n = 0)
),
parts(p,val) AS
(
SELECT p, SUBSTRING(#list, p + LEN(#delim + 'x') - 1,
LEAD(p, 1, 2147483647) OVER (ORDER BY p) - p - LEN(#delim))
FROM spots AS s
)
SELECT listpos = ROW_NUMBER() OVER (ORDER BY p),
Item = LTRIM(RTRIM(val))
FROM parts
);
Then the query can become:
;WITH x AS
(
SELECT id, listpos,
codes = LEFT(Item, COALESCE(NULLIF(CHARINDEX(',', Item),0),1)-1)
FROM dbo.your_table
CROSS APPLY dbo.SplitOrdered(fieldname, ';') AS c
)
SELECT id, codes = (
(SELECT x2.codes + ' '
FROM x AS x2
WHERE x2.id = x.id
ORDER BY x2.listpos
FOR XML PATH(''), TYPE).value(N'./text()[1]', N'nvarchar(max)')
)
FROM x GROUP BY id;
Example borrowing from LukStorms' db<>fiddle
Note that, in addition to guaranteeing order and being backward compatible (well, only back so many versions), it also ignores garbage data, e.g. try:
0001, ABCD1234;0002 but no comma

Compare strings and Highlight mismatch wherever found in T-SQL

I have below Material table which contains data like this:
[PO Number] [Actual Material] [Ideal Material]
----------------------------------------------------
1000000 Milk-Sugar-tea Milk-Sugar-Coffee
1000001 Milk-Water Milk-Water-Ice-tea
I have the requirement where I need to compare two columns Actual Material and Ideal material and highlight the mismatch materials in SQL.
Mismatch would be
[PO Number] [Actual Material] [Ideal Material] [Mismatch]
----------------------------------------------------------------
1000000 Milk-Sugar-tea Milk-Sugar-Coffee tea-coffee
1000001 Milk-Water Milk-Water-Ice-tea Ice-tea
How to achieve this in a SQL query?
As many others have very sensibly suggested, your first port of call should be to restructure your database so you are actually storing normalised data against your PO Numbers.
That said, something we are dealt a rubbish hand and have to play the cards we get. To answer your question exactly as it is asked, you can do the following:
If you are not on SQL Server 2016 and therefore cannot use the built in string_split function, start by creating your own:
create function [dbo].[StringSplit]
(
#str nvarchar(4000) = ' ' -- String to split.
,#delimiter as nvarchar(1) = ',' -- Delimiting value to split on.
,#num as int = null -- Which value to return.
)
returns #results table(ItemNumber int, Item nvarchar(4000))
as
begin
declare #return nvarchar(4000);
-- Handle null #str values
select #str = case when len(isnull(#str,'')) = 0 then '' else #str end;
-- Start tally table with 10 rows.
with n(n) as (select n from (values(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) n(n))
-- Select the same number of rows as characters in #str as incremental row numbers.
-- Cross joins increase exponentially to a max possible 10,000 rows to cover largest #str length.
,t(t) as (select top (select len(#str) a) row_number() over (order by (select null)) from n n1,n n2,n n3,n n4)
-- Return the position of every value that follows the specified delimiter.
,s(s) as (select 1 union all select t+1 from t where substring(#str,t,1) = #delimiter)
-- Return the start and length of every value, to use in the SUBSTRING function.
-- ISNULL/NULLIF combo handles the last value where there is no delimiter at the end of the string.
,l(s,l) as (select s,isnull(nullif(charindex(#delimiter,#str,s),0)-s,4000) from s)
insert into #results
select rn as ItemNumber
,Item
from(select row_number() over(order by s) as rn
,substring(#str,s,l) as item
from l
) a
where rn = #num
or #num is null;
return;
end
Using this function you can then create a set each for your Actual Material and Ideal Material columns, combine them to find the differences using a full join and then concatenate the results using stuff and for xml into one string value:
declare #t table(PONumber int, ActualMaterial nvarchar(50), IdealMaterial nvarchar(50));
insert into #t values (1000000,'Milk-Sugar-tea','Milk-Sugar-Coffee'),(1000001,'Milk-Water','Milk-Water-Ice-tea');
with a as
(
select t.PONumber
,a.Item
from #t t
outer apply dbo.StringSplit(t.ActualMaterial,'-',null) a
), i as
(
select t.PONumber
,i.Item
from #t t
outer apply dbo.StringSplit(t.IdealMaterial,'-',null) i
), m as
(
select isnull(a.PONumber,i.PONumber) as PONumber
,isnull(a.Item,i.Item) as Item
from a
full join i
on(a.PONumber = i.PONumber
and a.Item = i.Item
)
where a.Item is null
or i.Item is null
)
select t.PONumber
,t.ActualMaterial
,t.IdealMaterial
,stuff((select '-' + m.Item
from m
where t.PONumber = m.PONumber
order by m.Item
for xml path('')
)
,1,1,'') as Mismatch
from #t t
order by PONumber;
Output:
+----------+----------------+--------------------+------------+
| PONumber | ActualMaterial | IdealMaterial | Mismatch |
+----------+----------------+--------------------+------------+
| 1000000 | Milk-Sugar-tea | Milk-Sugar-Coffee | Coffee-tea |
| 1000001 | Milk-Water | Milk-Water-Ice-tea | Ice-tea |
+----------+----------------+--------------------+------------+
I use a table value functions for split actual-material an ideal-material values.
Split Function detail isALTER FUNCTION [dbo].[Split]
(
#String NVARCHAR(4000),
#Delimiter NCHAR(1)
)
RETURNS TABLE
AS
RETURN
(
WITH Split(stpos,endpos)
AS(
SELECT 0 AS stpos, CHARINDEX(#Delimiter,#String) AS endpos
UNION ALL
SELECT endpos+1, CHARINDEX(#Delimiter,#String,endpos+1)
FROM Split
WHERE endpos > 0
)
SELECT 'Id' = ROW_NUMBER() OVER (ORDER BY (SELECT 1)),
'Value' = LTRIM(SUBSTRING(#String,stpos,COALESCE(NULLIF(endpos,0),LEN(#String)+1)-stpos))
FROM Split
)
For result table is
declare #result table (
[PO Number] int , [Actual Material] varchar(100),[Ideal Material] varchar(100),Mismatch varchar(200)
)
And query for result table insert is :
;with CTE AS (
select distinct s.* ,x1.Value x1value,x2.Value x2value
from dbo.material s
outer apply (select *from Split(s.[Actual Material],'-')) x1
outer apply (select *from Split(s.[Ideal Material],'-')) x2
),
CTE2 AS (
SELECT distinct c.[PO Number],c.[Actual Material],c.[Ideal Material]
,case when not exists (select *from CTE c2 where c2.[PO Number] = c.[PO Number] and c2.x2value = c.x1value ) then c.x1value else '' end [ActualMismatch]
,case when not exists (select *from CTE c2 where c2.[PO Number] = c.[PO Number] and c2.x1value = c.x2value ) then c.x2value else '' end [IdealMismatch]
FROM CTE c
)
insert into #result
SELECt c.[PO Number],c.[Actual Material],c.[Ideal Material],c.ActualMismatch Mismatch from CTE2 c
union
SELECt c.[PO Number],c.[Actual Material],c.[Ideal Material] ,c.IdealMismatch Mismatch from CTE2 c
where
(c.ActualMismatch !='' or
c.[IdealMismatch] !='')
order by 1
select [PO Number],[Actual Material],[Ideal Material],
STUFF((
SELECT '-' + mismatch
FROM #result
WHERE ([PO Number] = c.[PO Number])
FOR XML PATH(''),TYPE).value('(./text())[1]','VARCHAR(MAX)')
,1,2,'') AS mismatch
from #result c
where Mismatch !=''
Group by [PO Number],[Actual Material],[Ideal Material]
how to coalesce mismatch values ! => with xml stuff

initialize and increment variable inside cte query sqlserver 2008

I am using sqlserver 2008 ,I want to initialize and increment variable (#NUMTwo) both at the same time, in my second part(Problem Line).
I am creating a cte query.
Is this possible , if yes then please let me know.
following is a sample example.I hope i am clear.
CREATE table #TempTable
(
childProductID INT,parentProductID INT,productModel varchar(50),[Num2] VARCHAR(100)
)
DECLARE #NUMTwo INT = 0
WITH tableR AS
(
-- First Part
SELECT childProductID = null,parentProductID=null,productModel from Products where productid in (#a),[Num2] = convert(varchar(100), '')
UNION ALL
--Second Part
SELECT e.childProductID,e.parentProductID,prd.productModel FROM ProductIncludes AS e
,[Num2] = convert(varchar(100),'1.' + #NUMTwo+=1 ) -- Problem line
INNER JOIN Products AS PRD ON e.childProductID = PRD.productID
WHERE parentProductID in (#a)
)
INSERT INTO #TempTable(childProductID,parentProductID,productModel,[Num2])
SELECT childProductID,parentProductID,productModel,[Num2]
END
SELECT * FROM #TempTable
You need to "Initialize" a column in the acnhor part of the query, and then "oncrement" this column in the recursive parts.
Something like
DECLARE #NUMTwo INT = 0
;WITH Test AS (
SELECT [Num2] = convert(varchar(MAX), ''),
#NUMTwo [N]
UNION ALL
SELECT [Num2] = '1.' + convert(varchar(MAX),[N]+1),
[N]+1
FROM TEst
WHERE [N] < 10
)
SELECT *
FROM Test
SQL Fiddle DEMO
If the parameter #NUMTwo is just for numbering rows you can use the ROW_NUMBER() OVER(...) instead of it like so:
WITH tableR AS
(
SELECT childProductID = NULL, parentProductID = NULL,
productModel, NUMTwo = CAST('0' AS VARCHAR(10))
FROM Products
WHERE
productid in (#a),
[Num2] = convert(varchar(100), '')
UNION ALL
SELECT e.childProductID, e.parentProductID,
prd.productModel,
NUMTwo = '1.' +
CAST( ROW_NUMBER() OVER(ORDER BY (SELECT 0)) AS VARCHAR(10))
FROM ProductIncludes AS e
INNER JOIN Products AS PRD ON e.childProductID = PRD.productID
WHERE parentProductID in (#a)
)

Parsing text to multiple columns

I have a feed that is populating a single text field in a table with statistics.
I need to pull this data into multiple fields in another table
but the strange format makes importing automatically difficult.
The file format is flat text but an example is below:
08:34:52 Checksum=180957248,TicketType=6,InitialUserType=G,InitialUserID=520,CommunicationType=Incoming,Date=26-03-2012,Time=08:35:00,Service=ST,Duration=00:00:14,Cost=0.12
Effectively it's made up of:
[timestamp] [Field1 name]=[Field1 value],[Field2 name]=[Field2 value],[Field4 name]=[Field4 value]...[CR]
All fields are always in the same order but not always present.
Total columns could be anywhere from 5 to 30.
I've tried the below function to translate it which seems to work mostly but seems to randomly skip fields:
Parsing the data:
(SELECT [Data].[dbo].[GetFromTextString] ( 'Checksum=' ,',' ,RAWTEXT)) AS RowCheckSum,
(SELECT [Data].[dbo].[GetFromTextString] ( 'TicketType=' ,',' ,RAWTEXT)) AS TicketType,
And the Function:
CREATE FUNCTION [dbo].[GetFromTextString]
-- Input start and end and return value.
(#uniqueprefix VARCHAR(100),
#commonsuffix VARCHAR(100),
#datastring VARCHAR(MAX) )
RETURNS VARCHAR(MAX) -- Picked Value.
AS
BEGIN
DECLARE #ADJLEN INT = LEN(#uniqueprefix)
SET #datastring = #datastring + #commonsuffix
RETURN (
CASE WHEN (CHARINDEX(#uniqueprefix,#datastring) > 0)
AND (CHARINDEX(#uniqueprefix + #commonsuffix,#datastring) = 0)
THEN SUBSTRING(#datastring, PATINDEX('%' + #uniqueprefix + '%',#datastring)+#ADJLEN, CHARINDEX(#commonsuffix,#datastring,PATINDEX('%' + #uniqueprefix + '%',#datastring))- PATINDEX('%' + #uniqueprefix + '%',#datastring)-#ADJLEN) ELSE NULL END
)
END
Could anyone suggest a better/cleaner way to strip out the data or could someone work out why this formula skips rows?
Any help really appreciated.
NOTE - THE FIRST SOLUTION IS RUBBISH. I HAVE LEFT IN IT FOR HISTORICAL REASONS, BUT A BETTER SOLUTION IS CONTAINED BELOW
I am not even sure if this will be faster than your current method, but it is the way I would approach the issue (If i was forced into an SQL only solution). The first thing that is required is a table valued function that will perform a split function:
CREATE FUNCTION dbo.Split (#TextToSplit VARCHAR(MAX), #Delimiter VARCHAR(MAX))
RETURNS #Values TABLE (Position INT IDENTITY(1, 1) NOT NULL, TextValues VARCHAR(MAX) NOT NULL)
AS
BEGIN
WHILE CHARINDEX(#Delimiter, #TextToSplit) > 0
BEGIN
INSERT #Values
SELECT LEFT(#TextToSplit, CHARINDEX(#Delimiter, #TextToSplit) - 1)
SET #TextToSplit = SUBSTRING(#TextToSplit, CHARINDEX(#Delimiter, #TextToSplit) + 1, LEN(#TextToSplit))
END
INSERT #Values VALUES (#TextToSplit)
RETURN
END
For my example I am working from a temp table #Worklist, you may need to adapt yours accordingly, or you could just insert the relevant data into #Worklist where I have used dummy data:
DECLARE #WorkList TABLE (ID INT IDENTITY(1, 1) NOT NULL, TextField VARCHAR(MAX))
INSERT #WorkList
SELECT '08:34:52 Checksum=180957248,TicketType=6,InitialUserType=G,InitialUserID=520,CommunicationType=Incoming,Date=26-03-2012,Time=08:35:00,Service=ST,Duration=00:00:14,Cost=0.12'
UNION
SELECT '08:34:52 Checksum=180957249,TicketType=5,InitialUserType=H,InitialUserID=521,CommunicationType=Outgoing,Date=27-03-2012,Time=14:27:00,Service=ST,Duration=00:15:12,Cost=0.37'
The main bit of the query is done here. It is quite long, so I have tried to comment it as well as possible. If further clarification is required I can add more comments.
DECLARE #Output TABLE (ID INT IDENTITY(1, 1) NOT NULL, TextField VARCHAR(MAX))
DECLARE #KeyPairs TABLE (WorkListID INT NOT NULL, KeyField VARCHAR(MAX), ValueField VARCHAR(MAX))
-- STORE TIMESTAMP DATA - THIS ASSUMES THE FIRST SPACE IS THE END OF THE TIMESTAMP
INSERT #KeyPairs
SELECT ID, 'TimeStamp', LEFT(TextField, CHARINDEX(' ', TextField))
FROM #WorkList
-- CLEAR THE TIMESTAMP FROM THE WORKLIST
UPDATE #WorkList
SET TextField = SUBSTRING(TextField, CHARINDEX(' ', TextField) + 1, LEN(TextField))
DECLARE #ID INT = (SELECT MIN(ID) FROM #WorkList)
WHILE #ID IS NOT NULL
BEGIN
-- SPLIT THE STRING FIRST INTO ALL THE PAIRS (e.g. Checksum=180957248)
INSERT #Output
SELECT TextValues
FROM dbo.Split((SELECT TextField FROM #WorkList WHERE ID = #ID), ',')
DECLARE #ID2 INT = (SELECT MIN(ID) FROM #Output)
-- FOR ALL THE PAIRS SPLIT THEM INTO A KEY AND A VALUE (USING THE POSITION OF THE SPLIT FUNCTION)
WHILE #ID2 IS NOT NULL
BEGIN
INSERT #KeyPairs
SELECT #ID,
MAX(CASE WHEN Position = 1 THEN TextValues ELSE '' END),
MAX(CASE WHEN Position = 2 THEN TextValues ELSE '' END)
FROM dbo.Split((SELECT TextField FROM #Output WHERE ID = #ID2), '=')
DELETE #Output
WHERE ID = #ID2
SET #ID2 = (SELECT MIN(ID) FROM #Output)
END
DELETE #WorkList
WHERE ID = #ID
SET #ID = (SELECT MIN(ID) FROM #WorkList)
END
-- WE NOW HAVE A TABLE CONTAINING EAV MODEL STYLE DATA. THIS NEEDS TO BE PIVOTED INTO THE CORRECT FORMAT
-- ENSURE COLUMNS ARE LISTED IN THE ORDER YOU WANT THEM TO APPEAR
SELECT *
FROM #KeyPairs p
PIVOT
( MAX(ValueField)
FOR KeyField IN
( [TimeStamp], [Checksum], [TicketType], [InitialUserType],
[InitialUserID], [CommunicationType], [Date], [Time],
[Service], [Duration], [Cost]
)
) AS PivotTable;
EDIT (4 YEARS LATER)
A recent upvote brought this to my attention and the I hate myself a little bit for ever posting this answer in its current form.
A much better split function would be:
CREATE FUNCTION dbo.Split
(
#List NVARCHAR(MAX),
#Delimiter NVARCHAR(255)
)
RETURNS TABLE
WITH SCHEMABINDING AS
RETURN
( WITH N1 AS (SELECT N FROM (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1), (1)) n (N)),
N2(N) AS (SELECT 1 FROM N1 a CROSS JOIN N1 b),
N3(N) AS (SELECT 1 FROM N2 a CROSS JOIN N2 b),
N4(N) AS (SELECT 1 FROM N3 a CROSS JOIN N3 b),
cteTally(N) AS
( SELECT 0 UNION ALL
SELECT TOP (DATALENGTH(ISNULL(#List,1))) ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
FROM n4
),
cteStart(N1) AS
( SELECT t.N+1
FROM cteTally t
WHERE (SUBSTRING(#List,t.N,1) = #Delimiter OR t.N = 0)
)
SELECT Item = SUBSTRING(#List, s.N1, ISNULL(NULLIF(CHARINDEX(#Delimiter,#List,s.N1),0)-s.N1,8000)),
Position = s.N1,
ItemNumber = ROW_NUMBER() OVER(ORDER BY s.N1)
FROM cteStart s
);
Then there is no need for looping at all, you just have a proper set based solution by calling the split function twice to get your EAV style data set:
DECLARE #WorkList TABLE (ID INT IDENTITY(1, 1) NOT NULL, TextField VARCHAR(MAX))
INSERT #WorkList
SELECT '08:34:52 Checksum=180957248,TicketType=6,InitialUserType=G,InitialUserID=520,CommunicationType=Incoming,Date=26-03-2012,Time=08:35:00,Service=ST,Duration=00:00:14,Cost=0.12'
UNION
SELECT '08:34:52 Checksum=180957249,TicketType=5,InitialUserType=H,InitialUserID=521,CommunicationType=Outgoing,Date=27-03-2012,Time=14:27:00,Service=ST,Duration=00:15:12,Cost=0.37';
WITH KeyPairs AS
( SELECT w.ID,
[Timestamp] = LEFT(w.TextField, CHARINDEX(' ', w.TextField)),
KeyField = MAX(CASE WHEN v.ItemNumber = 1 THEN v.Item END),
ValueField = MAX(CASE WHEN v.ItemNumber = 2 THEN v.Item END)
FROM #WorkList AS w
CROSS APPLY dbo.Split(SUBSTRING(TextField, CHARINDEX(' ', TextField) + 1, LEN(TextField)), ',') AS kp
CROSS APPLY dbo.Split(kp.Item, '=') AS v
GROUP BY w.ID, kp.ItemNumber,w.TextField
)
SELECT *
FROM KeyPairs AS kp
PIVOT
( MAX(ValueField)
FOR KeyField IN
( [Checksum], [TicketType], [InitialUserType],
[InitialUserID], [CommunicationType], [Date], [Time],
[Service], [Duration], [Cost]
)
) AS pvt;

Join [one word per row] to rows of phrases with [multiple words per row]

Please excuse the length of the question. I included a test script to demo the situation and my best attempt at a solution.
There are two tables:
test_WORDS = Words extracted in order from several sources. The OBJ_FK column is the ID of the source. WORD_ID is an identifier for the word itself that is unique within the source. Each row contains one word.
test_PHRASE = a list of phrases to be searched for in test_WORDS. The PHRASE_TEXT column is a space separated phrase like 'foo bar' (see below) so that each row contains multiple words.
Requirement:
Return the first word from test_WORDS that is the start of a matching a phrase from test_PHRASE.
I would prefer something set based to avoid RBAR approach below. Also my solution is limited to 5 word phrases. I need to support up to 20 word phrases. Is it possible to match the words from a row in test_PHRASE to contiguous rows in the test_WORD without cursors?
After breaking the phrase words out into a temporary table, the problem boils down to matching portions of two sets together in row order.
-- Create test data
CREATE TABLE [dbo].[test_WORDS](
[OBJ_FK] [bigint] NOT NULL, --FK to the source object
[WORD_ID] [int] NOT NULL, --The word order in the source object
[WORD_TEXT] [nvarchar](50) NOT NULL,
CONSTRAINT [PK_test_WORDS] PRIMARY KEY CLUSTERED
(
[OBJ_FK] ASC,
[WORD_ID] ASC
)
) ON [PRIMARY]
GO
CREATE TABLE [dbo].[test_PHRASE](
[ID] [int], --PHRASE ID
[PHRASE_TEXT] [nvarchar](150) NOT NULL --Space-separated phrase
CONSTRAINT [PK_test_PHRASE] PRIMARY KEY CLUSTERED
(
[ID] ASC
)
)
GO
INSERT INTO dbo.test_WORDS
SELECT 1,1,'aaa' UNION ALL
SELECT 1,2,'bbb' UNION ALL
SELECT 1,3,'ccc' UNION ALL
SELECT 1,4,'ddd' UNION ALL
SELECT 1,5,'eee' UNION ALL
SELECT 1,6,'fff' UNION ALL
SELECT 1,7,'ggg' UNION ALL
SELECT 1,8,'hhh' UNION ALL
SELECT 2,1,'zzz' UNION ALL
SELECT 2,2,'yyy' UNION ALL
SELECT 2,3,'xxx' UNION ALL
SELECT 2,4,'www'
INSERT INTO dbo.test_PHRASE
SELECT 1, 'bbb ccc ddd' UNION ALL --should match
SELECT 2, 'ddd eee fff' UNION ALL --should match
SELECT 3, 'xxx xxx xxx' UNION ALL --should NOT match
SELECT 4, 'zzz yyy xxx' UNION ALL --should match
SELECT 5, 'xxx www ppp' UNION ALL --should NOT match
SELECT 6, 'zzz yyy xxx www' --should match
-- Create variables
DECLARE #maxRow AS INTEGER
DECLARE #currentRow AS INTEGER
DECLARE #phraseSubsetTable AS TABLE(
[ROW] int IDENTITY(1,1) NOT NULL,
[ID] int NOT NULL, --PHRASE ID
[PHRASE_TEXT] nvarchar(150) NOT NULL
)
--used to split the phrase into words
--note: No permissions to sys.dm_fts_parser
DECLARE #WordList table
(
ID int,
WORD nvarchar(50)
)
--Records to be returned to caller
DECLARE #returnTable AS TABLE(
OBJECT_FK INT NOT NULL,
WORD_ID INT NOT NULL,
PHRASE_ID INT NOT NULL
)
DECLARE #phrase AS NVARCHAR(150)
DECLARE #phraseID AS INTEGER
-- Get subset of phrases to simulate a join that would occur in production
INSERT INTO #phraseSubsetTable
SELECT ID, PHRASE_TEXT
FROM dbo.test_PHRASE
--represent subset of phrases caused by join in production
WHERE ID IN (2,3,4)
-- Loop each phrase in the subset, split into rows of words and return matches to the test_WORDS table
SET #maxRow = ##ROWCOUNT
SET #currentRow = 1
WHILE #currentRow <= #maxRow
BEGIN
SELECT #phrase=PHRASE_TEXT, #phraseID=ID FROM #phraseSubsetTable WHERE row = #currentRow
--clear previous phrase that was split into rows
DELETE FROM #WordList
--Recursive Function with CTE to create recordset of words, one per row
;WITH Pieces(pn, start, stop) AS (
SELECT 1, 1, CHARINDEX(' ', #phrase)
UNION ALL
SELECT pn + 1, stop + 1, CHARINDEX(' ', #phrase, stop + 1)
FROM Pieces
WHERE stop > 0)
--Create the List of words with the CTE above
insert into #WordList
SELECT pn,
SUBSTRING(#phrase, start, CASE WHEN stop > 0 THEN stop-start ELSE 1056 END) AS WORD
FROM Pieces
DECLARE #wordCt as int
select #wordCt=count(ID) from #WordList;
-- Do the actual query using a CTE with a rownumber that repeats for every SOURCE OBJECT
;WITH WordOrder_CTE AS (
SELECT OBJ_FK, WORD_ID, WORD_TEXT,
ROW_NUMBER() OVER (Partition BY OBJ_FK ORDER BY WORD_ID) AS rownum
FROM test_WORDS)
--CREATE a flattened record of the first word in the phrase and join it to the rest of the words.
INSERT INTO #returnTable
SELECT r1.OBJ_FK, r1.WORD_ID, #phraseID AS PHRASE_ID
FROM WordOrder_CTE r1
INNER JOIN #WordList w1 ON r1.WORD_TEXT = w1.WORD and w1.ID=1
LEFT JOIN WordOrder_CTE r2
ON r1.rownum = r2.rownum - 1 and r1.OBJ_FK = r2.OBJ_FK
LEFT JOIN #WordList w2 ON r2.WORD_TEXT = w2.WORD and w2.ID=2
LEFT JOIN WordOrder_CTE r3
ON r1.rownum = r3.rownum - 2 and r1.OBJ_FK = r3.OBJ_FK
LEFT JOIN #WordList w3 ON r3.WORD_TEXT = w3.WORD and w3.ID=3
LEFT JOIN WordOrder_CTE r4
ON r1.rownum = r4.rownum - 3 and r1.OBJ_FK = r4.OBJ_FK
LEFT JOIN #WordList w4 ON r4.WORD_TEXT = w4.WORD and w4.ID=4
LEFT JOIN WordOrder_CTE r5
ON r1.rownum = r5.rownum - 4 and r1.OBJ_FK = r5.OBJ_FK
LEFT JOIN #WordList w5 ON r5.WORD_TEXT = w5.WORD and w5.ID=5
WHERE (#wordCt < 2 OR w2.ID is not null) and
(#wordCt < 3 OR w3.ID is not null) and
(#wordCt < 4 OR w4.ID is not null) and
(#wordCt < 5 OR w5.ID is not null)
--loop
SET #currentRow = #currentRow+1
END
--Return the first words of each matching phrase
SELECT OBJECT_FK, WORD_ID, PHRASE_ID FROM #returnTable
GO
--Clean up
DROP TABLE [dbo].[test_WORDS]
DROP TABLE [dbo].[test_PHRASE]
Edited solution:
This is an edit of the correct solution provided below to account for non-contiguous word IDs. Hope this helps someone as much as it did me.
;WITH
numberedwords AS (
SELECT
OBJ_FK,
WORD_ID,
WORD_TEXT,
rowcnt = ROW_NUMBER() OVER
(PARTITION BY OBJ_FK ORDER BY WORD_ID DESC),
totalInSrc = COUNT(WORD_ID) OVER (PARTITION BY OBJ_FK)
FROM dbo.test_WORDS
),
phrasedwords AS (
SELECT
nw1.OBJ_FK,
nw1.WORD_ID,
nw1.WORD_TEXT,
PHRASE_TEXT = RTRIM((
SELECT [text()] = nw2.WORD_TEXT + ' '
FROM numberedwords nw2
WHERE nw1.OBJ_FK = nw2.OBJ_FK
AND nw2.rowcnt BETWEEN nw1.rowcnt AND nw1.totalInSrc
ORDER BY nw2.OBJ_FK, nw2.WORD_ID
FOR XML PATH ('')
))
FROM numberedwords nw1
GROUP BY nw1.OBJ_FK, nw1.WORD_ID, nw1.WORD_TEXT, nw1.rowcnt, nw1.totalInSrc
)
SELECT *
FROM phrasedwords pw
INNER JOIN test_PHRASE tp
ON LEFT(pw.PHRASE_TEXT, LEN(tp.PHRASE_TEXT)) = tp.PHRASE_TEXT
ORDER BY pw.OBJ_FK, pw.WORD_ID
Note: The final query I used in production uses indexed temp tables instead of CTEs. I also limited the length of the PHRASE_TEXT column to my needs. With these improvements, I was able to reduce my query time from over 3 minutes to 3 seconds!
Here's a solution that uses a different approach: instead of splitting the phrases into words it combines the words into phrases.
Edited: changed the rowcnt expression to using COUNT(*) OVER …, as suggested by #ErikE in the comments.
;WITH
numberedwords AS (
SELECT
OBJ_FK,
WORD_ID,
WORD_TEXT,
rowcnt = COUNT(*) OVER (PARTITION BY OBJ_FK)
FROM dbo.test_WORDS
),
phrasedwords AS (
SELECT
nw1.OBJ_FK,
nw1.WORD_ID,
nw1.WORD_TEXT,
PHRASE_TEXT = RTRIM((
SELECT [text()] = nw2.WORD_TEXT + ' '
FROM numberedwords nw2
WHERE nw1.OBJ_FK = nw2.OBJ_FK
AND nw2.WORD_ID BETWEEN nw1.WORD_ID AND nw1.rowcnt
ORDER BY nw2.OBJ_FK, nw2.WORD_ID
FOR XML PATH ('')
))
FROM numberedwords nw1
GROUP BY nw1.OBJ_FK, nw1.WORD_ID, nw1.WORD_TEXT, nw1.rowcnt
)
SELECT *
FROM phrasedwords pw
INNER JOIN test_PHRASE tp
ON LEFT(pw.PHRASE_TEXT, LEN(tp.PHRASE_TEXT)) = tp.PHRASE_TEXT
ORDER BY pw.OBJ_FK, pw.WORD_ID
Using a Split function should work.
Split Function
CREATE FUNCTION dbo.Split
(
#RowData nvarchar(2000),
#SplitOn nvarchar(5)
)
RETURNS #RtnValue table
(
Id int identity(1,1),
Data nvarchar(100)
)
AS
BEGIN
Declare #Cnt int
Set #Cnt = 1
While (Charindex(#SplitOn,#RowData)>0)
Begin
Insert Into #RtnValue (data)
Select
Data = ltrim(rtrim(Substring(#RowData,1,Charindex(#SplitOn,#RowData)-1)))
Set #RowData = Substring(#RowData,Charindex(#SplitOn,#RowData)+1,len(#RowData))
Set #Cnt = #Cnt + 1
End
Insert Into #RtnValue (data)
Select Data = ltrim(rtrim(#RowData))
Return
END
SQL Statement
SELECT DISTINCT p.*
FROM dbo.test_PHRASE p
LEFT OUTER JOIN (
SELECT p.ID
FROM dbo.test_PHRASE p
CROSS APPLY dbo.Split(p.PHRASE_TEXT, ' ') sp
LEFT OUTER JOIN dbo.test_WORDS w ON w.WORD_TEXT = sp.Data
WHERE w.OBJ_FK IS NULL
) ignore ON ignore.ID = p.ID
WHERE ignore.ID IS NULL
This performs a little better than other solutions given. if you don't need WORD_ID, just WORD_TEXT, you can remove a whole column. I know this was over a year ago, but I wonder if you can get 3 seconds down to 30 ms? :)
If this query seems good, then my biggest speed advice is to put the entire phrases into a separate table (using your example data, it would have only 2 rows with phrases of length 8 words and 4 words).
SELECT
W.OBJ_FK,
X.Phrase,
P.*,
Left(P.PHRASE_TEXT,
IsNull(NullIf(CharIndex(' ', P.PHRASE_TEXT), 0) - 1, 2147483647)
) WORD_TEXT,
Len(Left(X.Phrase, PatIndex('%' + P.PHRASE_TEXT + '%', ' ' + X.Phrase) - 1))
- Len(Replace(
Left(X.Phrase, PatIndex('%' + P.PHRASE_TEXT + '%', X.Phrase) - 1), ' ', '')
)
WORD_ID
FROM
(SELECT DISTINCT OBJ_FK FROM dbo.test_WORDS) W
CROSS APPLY (
SELECT RTrim((SELECT WORD_TEXT + ' '
FROM dbo.test_WORDS W2
WHERE W.OBJ_FK = W2.OBJ_FK
ORDER BY W2.WORD_ID
FOR XML PATH (''))) Phrase
) X
INNER JOIN dbo.test_PHRASE P
ON X.Phrase LIKE '%' + P.PHRASE_TEXT + '%';
Here's another version for curiosity's sake. It doesn't perform quite as well.
WITH Calc AS (
SELECT
P.ID,
P.PHRASE_TEXT,
W.OBJ_FK,
W.WORD_ID StartID,
W.WORD_TEXT StartText,
W.WORD_ID,
Len(W.WORD_TEXT) + 2 NextPos,
Convert(varchar(150), W.WORD_TEXT) MatchingPhrase
FROM
dbo.test_PHRASE P
INNER JOIN dbo.test_WORDS W
ON P.PHRASE_TEXT + ' ' LIKE W.WORD_TEXT + ' %'
UNION ALL
SELECT
C.ID,
C.PHRASE_TEXT,
C.OBJ_FK,
C.StartID,
C.StartText,
W.WORD_ID,
C.NextPos + Len(W.WORD_TEXT) + 1,
Convert(varchar(150), C.MatchingPhrase + Coalesce(' ' + W.WORD_TEXT, ''))
FROM
Calc C
INNER JOIN dbo.test_WORDS W
ON C.OBJ_FK = W.OBJ_FK
AND C.WORD_ID + 1 = W.WORD_ID
AND Substring(C.PHRASE_TEXT, C.NextPos, 2147483647) + ' ' LIKE W.WORD_TEXT + ' %'
)
SELECT C.OBJ_FK, C.PHRASE_TEXT, C.StartID, C.StartText, C.ID
FROM Calc C
WHERE C.PHRASE_TEXT = C.MatchingPhrase;