SQL Server split a single column multiple times

I have a database table with a column containing data stacked at two levels that I want to break apart. Here is an example of the data (data changed to protect the innocent :) ):
Table
ID = varchar(100)
CarData = varchar(1000)
ID CarData
1 Nissan:blue:20000,Ford:green:10000
2 Nissan:steel:20001,Ford:blue:10001,Chevy:blue:10000,Ford:olive:10000
** Note that CarData is not fixed and can have many cars in it
Output Desired:
ID Manufacture Color Cost
1 Nissan Blue 20000
1 Ford green 10000
2 Nissan steel 20001
... and on
So to say it plainly: I need to split on the first delimiter, the comma, and create a row for each piece, then split each piece on the second delimiter, the colon, into columns.
Any help would be greatly appreciated.

-- Sample data
declare @T table(ID int, CarData varchar(100))

insert into @T values
(1, 'Nissan:blue:20000,Ford:green:10000'),
(2, 'Nissan:steel:20001,Ford:blue:10001,Chevy:blue:10000,Ford:olive:10000')

-- Recursive CTE to get one row for each car
;with cte(ID, Car, CarData) as
(
  select ID,
         cast(substring(CarData+',', 1, charindex(',', CarData+',')-1) as varchar(100)),
         stuff(CarData, 1, charindex(',', CarData), '')+','
  from @T
  where len(CarData) > 0
  union all
  select ID,
         cast(substring(CarData, 1, charindex(',', CarData)-1) as varchar(100)),
         stuff(CarData, 1, charindex(',', CarData), '')
  from cte
  where len(CarData) > 0
)
-- Use parsename to split the car data
select ID,
       parsename(replace(Car, ':', '.'), 3) as Manufacture,
       parsename(replace(Car, ':', '.'), 2) as Color,
       parsename(replace(Car, ':', '.'), 1) as Cost
from cte
order by ID
Result:
ID Manufacture Color Cost
-- ----------- ------ -----
1 Nissan blue 20000
1 Ford green 10000
2 Nissan steel 20001
2 Ford blue 10001
2 Chevy blue 10000
2 Ford olive 10000
Edit 1
You will have trouble with parsename if the color, the cost, or a manufacturer name contains a period (.). If that is the case you should try this instead.
-- Sample data
declare @T table(ID int, CarData varchar(100))

insert into @T values
(1, 'Nissan:blue:20000,Ford:green:10000'),
(2, 'Nissan:steel:20001,Ford:blue:10001,Chevy:blue:10000,Ford:olive:10000')

-- Recursive CTE to get one row for each car
;with cte(ID, Car, CarData) as
(
  select ID,
         cast(substring(CarData+',', 1, charindex(',', CarData+',')-1) as varchar(100)),
         stuff(CarData, 1, charindex(',', CarData), '')+','
  from @T
  where len(CarData) > 0
  union all
  select ID,
         cast(substring(CarData, 1, charindex(',', CarData)-1) as varchar(100)),
         stuff(CarData, 1, charindex(',', CarData), '')
  from cte
  where len(CarData) > 0
)
-- Split the car data with substring
select ID,
       substring(Car, 1, P1.Pos-1) as Manufacture,
       substring(Car, P1.Pos+1, P2.Pos-P1.Pos-1) as Color,
       substring(Car, P2.Pos+1, len(Car)-P2.Pos) as Cost
from cte
cross apply (select charindex(':', Car)) as P1(Pos)
cross apply (select charindex(':', Car, P1.Pos+1)) as P2(Pos)
order by ID

Use this string splitting function to produce a table of results.
I would first call dbo.split() using a , as the separator character. Then you'll have a list of items like:
Nissan:blue:20000
Ford:green:10000
Nissan:steel:20001
Ford:blue:10001
Chevy:blue:10000
Ford:olive:10000
From there you can call dbo.split() again using : as your separator. Each call will result in exactly three records (assuming your design is at least that "normal").
As @JNK mentioned in his comment, hopefully this is not something you'd want to be running regularly.
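The linked splitter isn't reproduced in this answer, so here is a minimal sketch of a dbo.Split table-valued function with the shape the sample code below assumes (a single items column); treat it as an assumption, not the original function:
-- Hypothetical minimal splitter, assumed to match the dbo.Split(..., ',') calls used below
CREATE FUNCTION dbo.Split (@String VARCHAR(8000), @Delimiter CHAR(1))
RETURNS @Result TABLE (items VARCHAR(8000))
AS
BEGIN
    DECLARE @Pos INT;
    WHILE LEN(@String) > 0
    BEGIN
        -- find the next delimiter (append one so the last piece is also found)
        SET @Pos = CHARINDEX(@Delimiter, @String + @Delimiter);
        INSERT INTO @Result (items) VALUES (LEFT(@String, @Pos - 1));
        -- drop the piece just emitted, plus its delimiter
        SET @String = STUFF(@String, 1, @Pos, '');
    END;
    RETURN;
END;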
EDIT:
Some sample code to get you started:
SELECT *
INTO #YuckyCar
FROM (
SELECT 1 ID, 'Nissan:blue:20000,Ford:green:10000' CarData
UNION
SELECT 2, 'Nissan:steel:20001,Ford:blue:10001,Chevy:blue:10000,Ford:olive:10000'
) T;
-- Shows logical step #1
SELECT ID, X.items MoreCarData
FROM #YuckyCar CROSS APPLY dbo.Split(CarData, ',') X;
-- Shows logical step #2
SELECT Q.ID, Y.items
FROM (
SELECT ID, X.items MoreCarData
FROM #YuckyCar CROSS APPLY dbo.Split(CarData, ',') X) Q CROSS APPLY dbo.Split(Q.MoreCarData, ':') Y
DROP TABLE #YuckyCar;
The problem in the last part is that you can't guarantee row 1 = Manufacturer, row 2 = Color, row 3 = Cost.
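If your splitter also returns an ordinal (for example the ItemNumber column of the DelimitedSplit8K function shown further down this page), a sketch like the following could pivot the three pieces back into columns. This assumes such a splitter exists and uses the #YuckyCar sample table from above (before the DROP); it is not part of the original answer:
-- Sketch only: assumes a splitter returning (ItemNumber, Item), e.g. dbo.DelimitedSplit8K
SELECT Q.ID,
       MAX(CASE WHEN Y.ItemNumber = 1 THEN Y.Item END) AS Manufacture,
       MAX(CASE WHEN Y.ItemNumber = 2 THEN Y.Item END) AS Color,
       MAX(CASE WHEN Y.ItemNumber = 3 THEN Y.Item END) AS Cost
FROM (
    SELECT ID, X.Item AS MoreCarData,
           ROW_NUMBER() OVER (PARTITION BY ID ORDER BY X.ItemNumber) AS CarNo
    FROM #YuckyCar
    CROSS APPLY dbo.DelimitedSplit8K(CarData, ',') X
) Q
CROSS APPLY dbo.DelimitedSplit8K(Q.MoreCarData, ':') Y
GROUP BY Q.ID, Q.CarNo;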

This should solve your problem:
[EDIT] Your ID is a varchar(100) and you do not specify if it is a primary key, so I made some changes ... ID doesn't have to be a primary key in this case.
declare @T table(ID varchar(100), CarData varchar(1000))
declare @OUT table(pk INT IDENTITY(1,1), ID varchar(100), Manufacture varchar(100), Color VARCHAR(100), Cost INT)

insert into @T (ID, CarData) values
('1', 'Nissan:blue:20000,Ford:green:10000'),
('2', 'Nissan:steel:20001,Ford:blue:10001,Chevy:blue:10000,Ford:olive:10000')

DECLARE @x XML, @i INT, @ID VARCHAR(100), @maxi INT;

;WITH list AS (SELECT pk = ROW_NUMBER() OVER(ORDER BY ID), * FROM @T)
SELECT @i = 1, @maxi = MAX(pk) FROM list;

WHILE @i <= @maxi
BEGIN
    ;WITH list AS (SELECT pk = ROW_NUMBER() OVER(ORDER BY ID), * FROM @T)
    SELECT
        @x = CAST( '<root><car><prop>' +
             REPLACE(
                 REPLACE(
                     CarData
                     ,':'
                     ,'</prop><prop>'
                 )
                 ,','
                 ,'</prop></car><car><prop>'
             ) +
             '</prop></car></root>'
             AS XML)
        , @ID = ID
    FROM list
    WHERE pk = @i

    INSERT INTO @OUT (ID, Manufacture, Color, Cost)
    SELECT
        ID          = @ID
       ,Manufacture = x.value('./prop[1]','VARCHAR(100)')
       ,Color       = x.value('./prop[2]','VARCHAR(100)')
       ,Cost        = x.value('./prop[3]','INT')
    FROM @x.nodes('/root/car') AS T(x)

    SET @i = @i + 1;
END

SELECT * FROM @OUT
/* -- OUTPUT
ID Manufacture Color Cost
--------------------------------
1 Nissan blue 20000
1 Ford green 10000
2 Nissan steel 20001
2 Ford blue 10001
2 Chevy blue 10000
2 Ford olive 10000
*/

Related

Extracting last number within string

I have the below sample data and I'm trying to extract the last number within the string. I have the following, which gets me part of the way there, but I'm not sure how to get the value where it isn't the last word.
right(TextDescription, patindex('%[^0-9]%',reverse(TextDescription)) - 1)
The result should be:
ID  code
1   10015662
2   100040344
3   10015370
4   NULL
5   400337
Sample data
Create Table #TestData
(
ID int,
TextDescription varchar(100)
)
insert into #TestData Values (1,'Data From JOE BLOGGS 10015662 tree 10015662')
insert into #TestData Values (2,'Fast Data From JOHN SMITH 10004034 MARY SMITH 100040344 plant')
insert into #TestData Values (3,'Data In 10015370 pot JONES')
insert into #TestData Values (4,'Fast Data From LEE tree')
insert into #TestData Values (5,'Direct Data 106600 JANE GREEN 400337')
Just another option using a bit of JSON, which will convert the string into an array; [key] will maintain the sequence.
Select A.ID
,B.Value
From #TestData A
Outer Apply (
Select top 1 Value
From OpenJSON( '["'+replace(string_escape(TextDescription,'json'),' ','","')+'"]' )
Where try_convert(int,Value) is not null
Order by [key] Desc
) B
Results
ID Value
1 10015662
2 100040344
3 10015370
4 NULL
5 400337
The following will locate the last digit, trim the string at that point, find the preceding non-digit, and then perform another trim to get the final result. As in your original post, calculations are done on a reversed string to accommodate the lack of a LASTPATINDEX() function. CROSS APPLY is used to build up intermediate results and to avoid duplication of subexpressions.
select T.*, P1.Pos1, P2.Pos2, N.Result
from #TestData T
cross apply (select reverse(TextDescription) AS Reversed) R
cross apply (select nullif(patindex('%[0-9]%', R.Reversed), 0) AS Pos1) P1
cross apply (select stuff(R.Reversed, 1, P1.Pos1 - 1, '') AS Trim1) T1
cross apply (select patindex('%[^0-9]%', T1.Trim1 + 'X') AS Pos2) P2
cross apply (select reverse(left(T1.Trim1, P2.Pos2 - 1)) AS Result) N
-- Partly reduced
select T.*, reverse(left(T1.Trim1, patindex('%[^0-9]%', T1.Trim1 + 'X') - 1)) AS Result
from #TestData T
cross apply (select reverse(TextDescription) AS Reversed) R
cross apply (select stuff(R.Reversed, 1, nullif(patindex('%[0-9]%', R.Reversed), 0) - 1, '') AS Trim1) T1
This will handle a variety of forms, not just space-delimited values.
See this db<>fiddle.
If your interesting values are always found at the end, and are always preceded by a non-digit, you can use SUBSTRING with:
the lower boundary being the last non-digit before the last number's location
the length being the difference between the last non-digit and the first position of the last number
WITH cte AS (
SELECT ID, TextDescription,
PATINDEX('%[0-9][^0-9]%', REVERSE(TextDescription) + ' ') AS first_space,
PATINDEX('%[0-9]%' , REVERSE(TextDescription) ) AS last_digit
FROM #TestData
)
SELECT ID,
SUBSTRING(TextDescription,
LEN(TextDescription) -first_space +1,
first_space+1 -last_digit) AS code
FROM cte
Check the demo here.
Please try the following solution leveraging XML and XQuery.
Notable points:
CROSS APPLY is tokenizing the TextDescription column as XML.
An XQuery FLWOR expression checks whether each token is castable as the INTEGER data type, and filters out those that are not.
The XPath predicate [last()] gives us the last INTEGER value.
SQL
-- DDL and sample data population, start
DECLARE @tbl Table (ID INT IDENTITY PRIMARY KEY, TextDescription varchar(100));
INSERT INTO @tbl (TextDescription) VALUES
('Data From JOE BLOGGS 10015662 tree 10015662'),
('Fast Data From JOHN SMITH 10004034 MARY SMITH 100040344 plant'),
('Data In 10015370 pot JONES'),
('Fast Data From LEE tree'),
('Direct Data 106600 JANE GREEN 400337');
-- DDL and sample data population, end

DECLARE @separator CHAR(1) = SPACE(1);

SELECT t.*
     , c.query('for $x in /root/r[not(empty(xs:int(.)))]
                return $x
       ').value('(/r[last()]/text())[1]','INT') AS [code]
FROM @tbl AS t
CROSS APPLY (SELECT TRY_CAST('<root><r><![CDATA[' +
             REPLACE(TextDescription, @separator, ']]></r><r><![CDATA[') +
             ']]></r></root>' AS XML)) AS t1(c)
ORDER BY t.ID;
Output
ID | TextDescription                                                | code
---|----------------------------------------------------------------|----------
1  | Data From JOE BLOGGS 10015662 tree 10015662                    | 10015662
2  | Fast Data From JOHN SMITH 10004034 MARY SMITH 100040344 plant  | 100040344
3  | Data In 10015370 pot JONES                                     | 10015370
4  | Fast Data From LEE tree                                        | NULL
5  | Direct Data 106600 JANE GREEN 400337                           | 400337
CREATE FUNCTION [ExtractInteger](@String VARCHAR(2000))
RETURNS VARCHAR(1000)
AS
BEGIN
    DECLARE @Count INT
    DECLARE @IntNumbers VARCHAR(1000)
    SET @Count = 0
    SET @IntNumbers = ''

    WHILE @Count <= LEN(@String)
    BEGIN
        IF SUBSTRING(@String, @Count, 1) = ' '
        BEGIN
            SET @IntNumbers = @IntNumbers + ' '
        END
        IF SUBSTRING(@String, @Count, 1) >= '0'
           AND SUBSTRING(@String, @Count, 1) <= '9'
        BEGIN
            SET @IntNumbers = @IntNumbers + SUBSTRING(@String, @Count, 1)
        END
        SET @Count = @Count + 1
    END

    RETURN LTRIM(RTRIM(@IntNumbers))
END
The ExtractInteger function will fetch only numbers and spaces, and the below select will take the last word as number:
select right(dbo.ExtractInteger('My 3rd Phone Number is 323-111-CALL'), charindex(' ', reverse(dbo.ExtractInteger('My 3rd Phone Number is 323-111-CALL')) + ' ') - 1)

Adding 'and' at the end of a comma separated list

Currently, I'm using the stuff function to create a comma separated list per each row.
x,y,z
What I want is to add commas for n-1 items in the list, with the final item being preceded by 'and'
x,y, and z.
For these purposes, just checking row number won't work because this list is being generated per unique Id, therefore I can't just iterate to the end of the table. Code below:
SELECT DISTINCT (sw.OwnerID)
,stuff((
SELECT DISTINCT ', ' + e.pn
FROM fct.enrtablev e
WHERE sw.OwnerID = e.OwnerId
FOR XML PATH('')), 1, 1, '') AS [Pet(s)]
A bit of a hack... string_agg() would be a better fit if you're on 2017+ (a sketch follows the Returns below).
Here we test the row_number() of the item against the item count sum(1) over(); when they are equal, this is the last item in the list.
Example
Declare @YourTable table (OwnerID int, pn varchar(50))
Insert Into @YourTable values
 (1,'X')
,(1,'Y')
,(1,'Z')
,(1,'Z')
,(2,'Apples')

Select Distinct
       OwnerID
      ,stuff( ( Select case when row_number() over(order by pn) = nullif(sum(1) over(), 1)
                             then ', and '
                             else ', '
                        end + pn
                  FROM (Select distinct pn
                          From @YourTable
                         Where OwnerID = A.OwnerId
                       ) e
                 Order By PN
                 For XML Path('')), 1, 2, '') AS [Pet(s)]
 From @YourTable A
Returns
OwnerID Pet(s)
1 X, Y, and Z
2 Apples
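For SQL Server 2017+, here is a sketch of the string_agg() approach mentioned above; this is my own sketch against the same @YourTable sample data, not part of the original answer:
-- Requires SQL Server 2017+ (string_agg); dedupe first, then tag the last item with 'and'
Select OwnerID
      ,string_agg(case when rn = cnt and cnt > 1 then 'and ' + pn else pn end, ', ')
           within group (order by rn) as [Pet(s)]
From (
      Select OwnerID, pn
            ,row_number() over(partition by OwnerID order by pn) as rn
            ,count(*)     over(partition by OwnerID)             as cnt
      From (Select distinct OwnerID, pn From @YourTable) d
     ) s
Group By OwnerID;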
The XQuery and XML data model is based on ordered sequences, which is exactly what we need.
Here is a simple solution based on XQuery and its FLWOR expression.
SQL
-- DDL and sample data population, start
DECLARE @tbl TABLE (OwnerID int, pn VARCHAR(50));
INSERT INTO @tbl (OwnerID, pn) VALUES
(1,'X'),
(1,'Y'),
(1,'Z'),
(2,'Apples');
-- DDL and sample data population, end

SELECT p.OwnerID
     , (SELECT *
        FROM @tbl AS c
        WHERE c.OwnerID = p.OwnerID
        FOR XML PATH('r'), TYPE, ROOT('root')
       ).query('
            for $x in /root/r/pn/text()
            return if ($x is (/root/r[last()]/pn/text())[1]) then
                   if (count(/root/r) gt 1) then concat("and ", $x) else string($x)
                   else concat($x, ",")
       ').value('.', 'VARCHAR(MAX)') AS Result
FROM @tbl AS p
GROUP BY p.OwnerID;
Output
+---------+----------------+
| OwnerID | Result |
+---------+----------------+
| 1 | X, Y, and Z |
| 2 | Apples |
+---------+----------------+
You can achieve this using ORDER BY and count.
Declare @YourTable table (OwnerID int, pn varchar(50))
Insert Into @YourTable values
 (1,'X')
,(1,'Y')
,(1,'Z')
,(2,'Apples')

;WITH CTE_OwnerIdRank as
(
    SELECT ownerid, pn, row_number() over (order by ownerId) as totalrn,
           count(*) over(partition by ownerid order by ownerid) as ownercnt
    from @YourTable
)
SELECT distinct OwnerId,
       stuff((
             SELECT ', ' + CASE WHEN c.totalrn = c.ownercnt then CONCAT(' and ', c.pn) else c.pn end
             FROM CTE_OwnerIdRank as c
             WHERE c.OwnerID = o.OwnerId
             order by c.totalrn
             FOR XML PATH('')), 1, 1, '') AS [Pet(s)]
from @YourTable as o
OwnerId  Pet(s)
1        X, Y, and Z
2        Apples

SQL taking a one-to-many delimited list in a table column and transforming it into a one-to-one relationship table [duplicate]

I have a SQL Table like this:
| SomeID | OtherID | Data
+----------------+-------------+-------------------
| abcdef-..... | cdef123-... | 18,20,22
| abcdef-..... | 4554a24-... | 17,19
| 987654-..... | 12324a2-... | 13,19,20
Is there a query I can run, something like SELECT OtherID, SplitData WHERE SomeID = 'abcdef-.......', that returns individual rows, like this:
| OtherID | SplitData
+-------------+-------------------
| cdef123-... | 18
| cdef123-... | 20
| cdef123-... | 22
| 4554a24-... | 17
| 4554a24-... | 19
Basically split my data at the comma into individual rows?
I am aware that storing a comma-separated string into a relational database sounds dumb, but the normal use case in the consumer application makes that really helpful.
I don't want to do the split in the application as I need paging, so I wanted to explore options before refactoring the whole app.
It's SQL Server 2008 (non-R2).
You can use the wonderful recursive functions from SQL Server:
Sample table:
CREATE TABLE Testdata
(
SomeID INT,
OtherID INT,
String VARCHAR(MAX)
);
INSERT Testdata SELECT 1, 9, '18,20,22';
INSERT Testdata SELECT 2, 8, '17,19';
INSERT Testdata SELECT 3, 7, '13,19,20';
INSERT Testdata SELECT 4, 6, '';
INSERT Testdata SELECT 9, 11, '1,2,3,4';
The query
WITH tmp(SomeID, OtherID, DataItem, String) AS
(
SELECT
SomeID,
OtherID,
LEFT(String, CHARINDEX(',', String + ',') - 1),
STUFF(String, 1, CHARINDEX(',', String + ','), '')
FROM Testdata
UNION all
SELECT
SomeID,
OtherID,
LEFT(String, CHARINDEX(',', String + ',') - 1),
STUFF(String, 1, CHARINDEX(',', String + ','), '')
FROM tmp
WHERE
String > ''
)
SELECT
SomeID,
OtherID,
DataItem
FROM tmp
ORDER BY SomeID;
-- OPTION (maxrecursion 0)
-- normally recursion is limited to 100. If you know you have very long
-- strings, uncomment the option
Output
SomeID | OtherID | DataItem
--------+---------+----------
1 | 9 | 18
1 | 9 | 20
1 | 9 | 22
2 | 8 | 17
2 | 8 | 19
3 | 7 | 13
3 | 7 | 19
3 | 7 | 20
4 | 6 |
9 | 11 | 1
9 | 11 | 2
9 | 11 | 3
9 | 11 | 4
Finally, the wait is over with SQL Server 2016. They have introduced the Split string function, STRING_SPLIT:
select OtherID, cs.Value --SplitData
from yourtable
cross apply STRING_SPLIT (Data, ',') cs
All the other methods to split strings, like XML, tally table, while loop, etc., have been blown away by this STRING_SPLIT function.
Here is an excellent article with performance comparison: Performance Surprises and Assumptions: STRING_SPLIT.
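One caveat worth adding (not from the answer above): STRING_SPLIT does not guarantee output order, and the optional enable_ordinal argument only exists on SQL Server 2022 and Azure SQL Database. A sketch:
-- SQL Server 2022+ / Azure SQL only: the third argument adds an ordinal column
select OtherID, cs.Value, cs.ordinal
from yourtable
cross apply STRING_SPLIT(Data, ',', 1) cs;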
For older versions, here is one split string function using a tally table (the best possible approach):
CREATE FUNCTION [dbo].[DelimitedSplit8K]
        (@pString VARCHAR(8000), @pDelimiter CHAR(1))
RETURNS TABLE WITH SCHEMABINDING AS
RETURN
--===== "Inline" CTE Driven "Tally Table" produces values from 0 up to 10,000...
     -- enough to cover NVARCHAR(4000)
  WITH E1(N) AS (
                 SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
                 SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
                 SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
                ),                          --10E+1 or 10 rows
       E2(N) AS (SELECT 1 FROM E1 a, E1 b), --10E+2 or 100 rows
       E4(N) AS (SELECT 1 FROM E2 a, E2 b), --10E+4 or 10,000 rows max
 cteTally(N) AS (--==== This provides the "base" CTE and limits the number of rows right up front
                     -- for both a performance gain and prevention of accidental "overruns"
                 SELECT TOP (ISNULL(DATALENGTH(@pString),0)) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
                ),
cteStart(N1) AS (--==== This returns N+1 (starting position of each "element" just once for each delimiter)
                 SELECT 1 UNION ALL
                 SELECT t.N+1 FROM cteTally t WHERE SUBSTRING(@pString,t.N,1) = @pDelimiter
                ),
cteLen(N1,L1) AS (--==== Return start and length (for use in substring)
                 SELECT s.N1,
                        ISNULL(NULLIF(CHARINDEX(@pDelimiter,@pString,s.N1),0)-s.N1,8000)
                 FROM cteStart s
                )
--===== Do the actual split. The ISNULL/NULLIF combo handles the length for the final element when no delimiter is found.
 SELECT ItemNumber = ROW_NUMBER() OVER(ORDER BY l.N1),
        Item       = SUBSTRING(@pString, l.N1, l.L1)
   FROM cteLen l
;
Referred from Tally OH! An Improved SQL 8K “CSV Splitter” Function
Check this
SELECT A.OtherID,
Split.a.value('.', 'VARCHAR(100)') AS Data
FROM
(
SELECT OtherID,
CAST ('<M>' + REPLACE(Data, ',', '</M><M>') + '</M>' AS XML) AS Data
FROM Table1
) AS A CROSS APPLY Data.nodes ('/M') AS Split(a);
Very late but try this out:
SELECT ColumnID, Column1, value --Do not change 'value' name. Leave it as it is.
FROM tbl_Sample
CROSS APPLY STRING_SPLIT(Tags, ','); --'Tags' is the name of column containing comma separated values
So we were having this:
tbl_Sample :
ColumnID| Column1 | Tags
--------|-----------|-------------
1 | ABC | 10,11,12
2 | PQR | 20,21,22
After running this query:
ColumnID| Column1 | value
--------|-----------|-----------
1 | ABC | 10
1 | ABC | 11
1 | ABC | 12
2 | PQR | 20
2 | PQR | 21
2 | PQR | 22
Thanks!
select t.OtherID, x.Code
from testData t
cross apply (select Code from dbo.Split(t.Data, ',')) x
As of Feb 2016 - see the TALLY Table Example - very likely to outperform my TVF below, from Feb 2014. Keeping original post below for posterity:
Too much repeated code for my liking in the above examples. And I dislike the performance of CTEs and XML. Also, it returns an explicit Id so that order-specific consumers can specify an ORDER BY clause.
CREATE FUNCTION dbo.Split
(
    @Line nvarchar(MAX),
    @SplitOn nvarchar(5) = ','
)
RETURNS @RtnValue table
(
    Id INT NOT NULL IDENTITY(1,1) PRIMARY KEY CLUSTERED,
    Data nvarchar(100) NOT NULL
)
AS
BEGIN
    IF @Line IS NULL RETURN;

    DECLARE @split_on_len INT = LEN(@SplitOn);
    DECLARE @start_at INT = 1;
    DECLARE @end_at INT;
    DECLARE @data_len INT;

    WHILE 1=1
    BEGIN
        SET @end_at = CHARINDEX(@SplitOn, @Line, @start_at);
        SET @data_len = CASE @end_at WHEN 0 THEN LEN(@Line) ELSE @end_at - @start_at END;
        INSERT INTO @RtnValue (data) VALUES( SUBSTRING(@Line, @start_at, @data_len) );
        IF @end_at = 0 BREAK;
        SET @start_at = @end_at + @split_on_len;
    END;

    RETURN;
END;
Nice to see that it has been solved in the 2016 version, but for all of those who are not on that, here are two generalized and simplified versions of the methods above.
The XML method is shorter, but of course requires the string to allow for the XML trick (no 'bad' characters).
XML-Method:
create function dbo.splitString(@input Varchar(max), @Splitter VarChar(99)) returns table as
Return
    SELECT Split.a.value('.', 'VARCHAR(max)') AS Data FROM
    ( SELECT CAST ('<M>' + REPLACE(@input, @Splitter, '</M><M>') + '</M>' AS XML) AS Data
    ) AS A CROSS APPLY Data.nodes ('/M') AS Split(a);
Recursive method:
create function dbo.splitString(@input Varchar(max), @Splitter Varchar(99)) returns table as
Return
    with tmp (DataItem, ix) as
    ( select @input, CHARINDEX('', @Input)  --Recu. start, ignored val to get the types right
      union all
      select Substring(@input, ix+1, ix2-ix-1), ix2
      from (Select *, CHARINDEX(@Splitter, @Input+@Splitter, ix+1) ix2 from tmp) x where ix2 <> 0
    ) select DataItem from tmp where ix <> 0
Function in action
Create table TEST_X (A int, CSV Varchar(100));
Insert into test_x select 1, 'A,B';
Insert into test_x select 2, 'C,D';
Select A,data from TEST_X x cross apply dbo.splitString(x.CSV,',') Y;
Drop table TEST_X
XML-METHOD 2: Unicode Friendly 😀 (Addition courtesy of Max Hodges)
create function dbo.splitString(@input nVarchar(max), @Splitter nVarchar(99)) returns table as
Return
    SELECT Split.a.value('.', 'NVARCHAR(max)') AS Data FROM
    ( SELECT CAST ('<M>' + REPLACE(@input, @Splitter, '</M><M>') + '</M>' AS XML) AS Data
    ) AS A CROSS APPLY Data.nodes ('/M') AS Split(a);
Please refer to the T-SQL below. The STRING_SPLIT function is available only under compatibility level 130 and above (a compatibility-level check is sketched after the result).
TSQL:
DECLARE @stringValue NVARCHAR(400) = 'red,blue,green,yellow,black';
DECLARE @separator CHAR = ',';

SELECT [value] As Colour
FROM STRING_SPLIT(@stringValue, @separator);
RESULT:
Colour
red
blue
green
yellow
black
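As a side note (not part of the answer above), you can check the database compatibility level before relying on STRING_SPLIT, and raise it if needed:
-- 130 corresponds to SQL Server 2016; STRING_SPLIT needs 130 or higher
SELECT name, compatibility_level
FROM sys.databases
WHERE name = DB_NAME();

-- Raise it if necessary (test the impact before changing a production database)
ALTER DATABASE CURRENT SET COMPATIBILITY_LEVEL = 130;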
I know this has a lot of answers already, but I want to write my version of a split function, like the others and like the native string_split function of SQL Server 2016.
create function [dbo].[Split]
(
    @Value nvarchar(max),
    @Delimiter nvarchar(50)
)
returns @tbl table
(
    Seq int primary key identity(1, 1),
    Value nvarchar(max)
)
as begin
    declare @Xml xml = cast('<d>' + replace(@Value, @Delimiter, '</d><d>') + '</d>' as xml);

    insert into @tbl (Value)
    select a.split.value('.', 'nvarchar(max)') as Value
    from @Xml.nodes('/d') a(split);

    return;
end;
The Seq column is the primary key, to support fast joins with another real table or with another table returned by the Split function.
The XML function is used to support large data (a looping version slows down significantly when you have large data).
Here's an answer to the question.
CREATE TABLE Testdata
(
SomeID INT,
OtherID INT,
String VARCHAR(MAX)
);
INSERT Testdata SELECT 1, 9, '18,20,22';
INSERT Testdata SELECT 2, 8, '17,19';
INSERT Testdata SELECT 3, 7, '13,19,20';
INSERT Testdata SELECT 4, 6, '';
INSERT Testdata SELECT 9, 11, '1,2,3,4';
select t.SomeID, t.OtherID, s.Value
from Testdata t
cross apply dbo.Split(t.String, ',') s;
--Output
SomeID OtherID Value
1 9 18
1 9 20
1 9 22
2 8 17
2 8 19
3 7 13
3 7 19
3 7 20
4 6
9 11 1
9 11 2
9 11 3
9 11 4
Joining Split with other split
declare @Names nvarchar(max) = 'a,b,c,d';
declare @Codes nvarchar(max) = '10,20,30,40';

select n.Seq, n.Value Name, c.Value Code
from dbo.Split(@Names, ',') n
inner join dbo.Split(@Codes, ',') c on n.Seq = c.Seq;
--Output
Seq Name Code
1 a 10
2 b 20
3 c 30
4 d 40
Split two times
declare @NationLocSex nvarchar(max) = 'Korea,Seoul,1;Vietnam,Kiengiang,0;China,Xian,0';

with rows as
(
    select Value
    from dbo.Split(@NationLocSex, ';')
)
select rw.Value r, cl.Value c
from rows rw
cross apply dbo.Split(rw.Value, ',') cl;
--Output
r c
Korea,Seoul,1 Korea
Korea,Seoul,1 Seoul
Korea,Seoul,1 1
Vietnam,Kiengiang,0 Vietnam
Vietnam,Kiengiang,0 Kiengiang
Vietnam,Kiengiang,0 0
China,Xian,0 China
China,Xian,0 Xian
China,Xian,0 0
Split to columns
declare @Numbers nvarchar(50) = 'First,Second,Third';

with t as
(
    select case when Seq = 1 then Value end f1,
           case when Seq = 2 then Value end f2,
           case when Seq = 3 then Value end f3
    from dbo.Split(@Numbers, ',')
)
select min(f1) f1, min(f2) f2, min(f3) f3
from t;
--Output
f1 f2 f3
First Second Third
Generate rows by range
declare @Ranges nvarchar(50) = '1-2,4-6';
declare @Numbers table (Num int);
insert into @Numbers values (1),(2),(3),(4),(5),(6),(7),(8);

with t as
(
    select r.Seq, r.Value,
           min(case when ft.Seq = 1 then ft.Value end) ValueFrom,
           min(case when ft.Seq = 2 then ft.Value end) ValueTo
    from dbo.Split(@Ranges, ',') r
    cross apply dbo.Split(r.Value, '-') ft
    group by r.Seq, r.Value
)
select t.Seq, t.Value, t.ValueFrom, t.ValueTo, n.Num
from t
inner join @Numbers n on n.Num between t.ValueFrom and t.ValueTo;
--Output
Seq Value ValueFrom ValueTo Num
1 1-2 1 2 1
1 1-2 1 2 2
2 4-6 4 6 4
2 4-6 4 6 5
2 4-6 4 6 6
DECLARE @id_list VARCHAR(MAX) = '1234,23,56,576,1231,567,122,87876,57553,1216';
DECLARE @table TABLE ( id VARCHAR(50) );
DECLARE @x INT = 0;
DECLARE @firstcomma INT = 0;
DECLARE @nextcomma INT = 0;

SET @x = LEN(@id_list) - LEN(REPLACE(@id_list, ',', '')) + 1; -- number of ids in id_list

WHILE @x > 0
BEGIN
    SET @nextcomma = CASE WHEN CHARINDEX(',', @id_list, @firstcomma + 1) = 0
                          THEN LEN(@id_list) + 1
                          ELSE CHARINDEX(',', @id_list, @firstcomma + 1)
                     END;

    INSERT INTO @table
    VALUES ( SUBSTRING(@id_list, @firstcomma + 1, (@nextcomma - @firstcomma) - 1) );

    SET @firstcomma = CHARINDEX(',', @id_list, @firstcomma + 1);
    SET @x = @x - 1;
END;

SELECT *
FROM @table;
;WITH tmp(SomeID, OtherID, DataItem, Data) as (
SELECT SomeID, OtherID, LEFT(Data, CHARINDEX(',',Data+',')-1),
STUFF(Data, 1, CHARINDEX(',',Data+','), '')
FROM Testdata
WHERE Data > ''
)
SELECT SomeID, OtherID, Data
FROM tmp
ORDER BY SomeID
with only a tiny little modification to the above query...
By creating this function ([DelimitedSplit]), which splits a string, you can do an OUTER APPLY in your SELECT.
CREATE FUNCTION [dbo].[DelimitedSplit]
--===== Define I/O parameters
        (@pString VARCHAR(8000), @pDelimiter CHAR(1))
--WARNING!!! DO NOT USE MAX DATA-TYPES HERE! IT WILL KILL PERFORMANCE!
RETURNS TABLE WITH SCHEMABINDING AS
RETURN
--===== "Inline" CTE Driven "Tally Table" produces values from 1 up to 10,000...
     -- enough to cover VARCHAR(8000)
  WITH E1(N) AS (
                 SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
                 SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
                 SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
                ),                                               --10E+1 or 10 rows
       E2(N) AS (SELECT 1 FROM E1 a INNER JOIN E1 b ON b.N = a.N), --10E+2 or 100 rows
       E4(N) AS (SELECT 1 FROM E2 a INNER JOIN E2 b ON b.N = a.N), --10E+4 or 10,000 rows max
 cteTally(N) AS (--==== This provides the "base" CTE and limits the number of rows right up front
                     -- for both a performance gain and prevention of accidental "overruns"
                 SELECT TOP (ISNULL(DATALENGTH(@pString),0)) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
                ),
cteStart(N1) AS (--==== This returns N+1 (starting position of each "element" just once for each delimiter)
                 SELECT 1 UNION ALL
                 SELECT t.N+1 FROM cteTally t WHERE SUBSTRING(@pString,t.N,1) = @pDelimiter
                ),
cteLen(N1,L1) AS (--==== Return start and length (for use in substring)
                 SELECT s.N1,
                        ISNULL(NULLIF(CHARINDEX(@pDelimiter,@pString,s.N1),0)-s.N1,8000)
                 FROM cteStart s
                )
--===== Do the actual split. The ISNULL/NULLIF combo handles the length for the final element when no delimiter is found.
 SELECT ItemNumber = ROW_NUMBER() OVER(ORDER BY l.N1),
        Item       = SUBSTRING(@pString, l.N1, l.L1)
   FROM cteLen l
;
TEST
CREATE TABLE #Testdata
(
SomeID INT,
OtherID INT,
String VARCHAR(MAX)
);
INSERT #Testdata SELECT 1, 9, '18,20,22';
INSERT #Testdata SELECT 2, 8, '17,19';
INSERT #Testdata SELECT 3, 7, '13,19,20';
INSERT #Testdata SELECT 4, 6, '';
INSERT #Testdata SELECT 9, 11, '1,2,3,4';
SELECT
*
FROM #Testdata
OUTER APPLY [dbo].[DelimitedSplit](String,',');
DROP TABLE #Testdata;
RESULT
SomeID OtherID String ItemNumber Item
1 9 18,20,22 1 18
1 9 18,20,22 2 20
1 9 18,20,22 3 22
2 8 17,19 1 17
2 8 17,19 2 19
3 7 13,19,20 1 13
3 7 13,19,20 2 19
3 7 13,19,20 3 20
4 6 1
9 11 1,2,3,4 1 1
9 11 1,2,3,4 2 2
9 11 1,2,3,4 3 3
9 11 1,2,3,4 4 4
Function
CREATE FUNCTION dbo.SplitToRows (@column varchar(100), @separator varchar(10))
RETURNS @rtnTable TABLE
(
    ID int identity(1,1),
    ColumnA varchar(max)
)
AS
BEGIN
    DECLARE @position int = 0;
    DECLARE @endAt int = 0;
    DECLARE @tempString varchar(100);

    set @column = ltrim(rtrim(@column));

    WHILE @position <= len(@column)
    BEGIN
        set @endAt = CHARINDEX(@separator, @column, @position);
        if(@endAt = 0)
        begin
            Insert into @rtnTable(ColumnA) Select substring(@column, @position, len(@column) - @position);
            break;
        end;
        set @tempString = substring(ltrim(rtrim(@column)), @position, @endAt - @position);
        Insert into @rtnTable(ColumnA) select @tempString;
        set @position = @endAt + 1;
    END;
    return;
END;
Use case
select * from dbo.SplitToRows('T14; p226.0001; eee; 3554;', ';');
Or just a SELECT with multiple result sets
DECLARE @column varchar(max) = '1234; 4748;abcde; 324432';
DECLARE @separator varchar(10) = ';';
DECLARE @position int = 0;
DECLARE @endAt int = 0;
DECLARE @tempString varchar(100);

set @column = ltrim(rtrim(@column));

WHILE @position <= len(@column)
BEGIN
    set @endAt = CHARINDEX(@separator, @column, @position);
    if(@endAt = 0)
    begin
        Select substring(@column, @position, len(@column) - @position);
        break;
    end;
    set @tempString = substring(ltrim(rtrim(@column)), @position, @endAt - @position);
    select @tempString;
    set @position = @endAt + 1;
END;
When using this approach you have to make sure that none of your values contains something that would be illegal XML – user1151923
I always use the XML method. Make sure you use VALID XML. I have two functions to convert between valid XML and text (I tend to strip out the carriage returns as I don't usually need them). A usage sketch follows the two functions.
CREATE FUNCTION dbo.udf_ConvertTextToXML (@Text varchar(MAX))
RETURNS varchar(MAX)
AS
BEGIN
    SET @Text = REPLACE(@Text, CHAR(10), '');
    SET @Text = REPLACE(@Text, CHAR(13), '');
    -- escape & first so the other entities are not double-escaped
    SET @Text = REPLACE(@Text, '&', '&amp;');
    SET @Text = REPLACE(@Text, '<', '&lt;');
    SET @Text = REPLACE(@Text, '>', '&gt;');
    SET @Text = REPLACE(@Text, '''', '&apos;');
    SET @Text = REPLACE(@Text, '"', '&quot;');
    RETURN @Text;
END;

CREATE FUNCTION dbo.udf_ConvertTextFromXML (@Text VARCHAR(MAX))
RETURNS VARCHAR(max)
AS
BEGIN
    SET @Text = REPLACE(@Text, '&lt;', '<');
    SET @Text = REPLACE(@Text, '&gt;', '>');
    SET @Text = REPLACE(@Text, '&apos;', '''');
    SET @Text = REPLACE(@Text, '&quot;', '"');
    -- unescape & last, for the same reason
    SET @Text = REPLACE(@Text, '&amp;', '&');
    RETURN @Text;
END;
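A sketch of how the escaping function might be combined with the XML split trick shown earlier in this thread (assumed usage, not shown in the original answer); note that value('.') already decodes the entities on the way out, so udf_ConvertTextFromXML is mainly useful when you store the escaped text:
-- Hypothetical usage: escape the raw text so characters such as & or < do not break the CAST to XML
DECLARE @raw VARCHAR(MAX) = 'a & b,c < d,plain';

SELECT Split.a.value('.', 'VARCHAR(MAX)') AS Data
FROM ( SELECT CAST('<M>' + REPLACE(dbo.udf_ConvertTextToXML(@raw), ',', '</M><M>') + '</M>' AS XML) AS Data
     ) AS A
CROSS APPLY Data.nodes('/M') AS Split(a);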
The below works on SQL Server 2008
select *, ROW_NUMBER() OVER(order by items) as row#
from
( select 134 myColumn1, 34 myColumn2, 'd,c,k,e,f,g,h,a' comaSeperatedColumn) myTable
cross apply
SPLIT (rtrim(comaSeperatedColumn), ',') splitedTable -- gives 'items' column
This will get the full Cartesian product: the origin table columns plus the "items" column of the split table.
You can use the following function to extract data
CREATE FUNCTION [dbo].[SplitString]
(
    @RowData NVARCHAR(MAX),
    @Delimeter NVARCHAR(MAX)
)
RETURNS @RtnValue TABLE
(
    ID INT IDENTITY(1,1),
    Data NVARCHAR(MAX)
)
AS
BEGIN
    DECLARE @Iterator INT;
    SET @Iterator = 1;

    DECLARE @FoundIndex INT;
    SET @FoundIndex = CHARINDEX(@Delimeter, @RowData);

    WHILE (@FoundIndex > 0)
    BEGIN
        INSERT INTO @RtnValue (data)
        SELECT
            Data = LTRIM(RTRIM(SUBSTRING(@RowData, 1, @FoundIndex - 1)));

        SET @RowData = SUBSTRING(@RowData,
                                 @FoundIndex + DATALENGTH(@Delimeter) / 2,
                                 LEN(@RowData));
        SET @Iterator = @Iterator + 1;
        SET @FoundIndex = CHARINDEX(@Delimeter, @RowData);
    END;

    INSERT INTO @RtnValue (Data)
    SELECT Data = LTRIM(RTRIM(@RowData));

    RETURN;
END;

Split string and number columns

Let's say I have a table such as
ItemID ClassID
------------------------
1 10, 13, 12
2 5, 7
and would like to copy the data to another table like so
ItemID Numbering ClassID
----------------------------------
1 1 10
1 2 13
1 3 12
2 1 5
2 2 7
Separating the comma-delimited ClassID field into individual rows, retaining the order they had in the first table
Populating the Numbering row on insert. The Numbering column has sequential integers for each batch of ClassID and is why ClassID needs to be kept in order.
I have attempted this with the following function:
CREATE FUNCTION dbo.Split
(
    @String NVARCHAR(MAX)
)
RETURNS @SplittedValues TABLE(
    Value INT
)
AS
BEGIN
    DECLARE @SplitLength INT
    DECLARE @Delimiter VARCHAR(10)

    SET @Delimiter = ','

    WHILE len(@String) > 0
    BEGIN
        SELECT @SplitLength = (CASE charindex(@Delimiter, @String)
                                   WHEN 0 THEN
                                       datalength(@String) / 2
                                   ELSE
                                       charindex(@Delimiter, @String) - 1
                               END)

        INSERT INTO @SplittedValues
        SELECT cast(substring(@String, 1, @SplitLength) AS INTEGER)
        WHERE
            ltrim(rtrim(isnull(substring(@String, 1, @SplitLength), ''))) <> '';

        SELECT @String = (CASE ((datalength(@String) / 2) - @SplitLength)
                              WHEN 0 THEN
                                  ''
                              ELSE
                                  right(@String, (datalength(@String) / 2) - @SplitLength - 1)
                          END)
    END

    RETURN
END
but it only partly works. It copies the rows the correct amount of times (i.e. three times for ItemID=1, and twice for ItemID=2 in the above example), but they are exact copies of the row (all saying '10, 13, 12') and the comma-delimited parts are not split up. There is also nothing in the function to add to the Numbering column.
So, I have two questions: How do I modify the above function to split up the ClassID string, and what do I add to correctly increment the Numbering column?
Thanks!
I'd use a recursive CTE to do it.
WITH SplitCTE AS
(
SELECT
itemid,
LEFT(ClassID,CHARINDEX(',',ClassID)-1) AS ClassID
,RIGHT(ClassID,LEN(ClassID)-CHARINDEX(',',ClassID)) AS remaining
FROM table1
WHERE ClassID IS NOT NULL AND CHARINDEX(',',ClassID)>0
UNION ALL
SELECT
itemid,
LEFT(remaining,CHARINDEX(',',remaining)-1)
,RIGHT(remaining,LEN(remaining)-CHARINDEX(',',remaining))
FROM SplitCTE
WHERE remaining IS NOT NULL AND CHARINDEX(',',remaining)>0
UNION ALL
SELECT
itemid,remaining,null
FROM SplitCTE
WHERE remaining IS NOT NULL AND CHARINDEX(',',remaining)=0
)
SELECT
itemid,
row_number() over (partition by itemid order by cast(classid as int) asc) as Numbering,
cast (ClassID as int) as ClassID
FROM
SplitCTE
UNION ALL
select
ItemId,
1,
cast(classid as int)
FROM table1
WHERE ClassID IS NOT NULL AND CHARINDEX(',',ClassID) = 0
SQL Fiddle
DECLARE @t TABLE( ID INT IDENTITY, data VARCHAR(50))
INSERT INTO @t(data) SELECT '10, 13, 12'
INSERT INTO @t(data) SELECT '5, 7'

select F1.id, O.splitdata, ROW_NUMBER() OVER(PARTITION BY ID ORDER BY (SELECT 1))
from (
    select *, cast('<X>' + replace(F.data, ',', '</X><X>') + '</X>' as XML) as xmlfilter from @t F
) F1
cross apply
(
    select fdata.D.value('.','varchar(50)') as splitdata from F1.xmlfilter.nodes('X') as fdata(D)
) O

Calculate TF-IDF using Sql

I have a table in my DB containing a free text field column.
I would like to know the frequency with which each word appears over all the rows, or maybe even calculate a TF-IDF for all words, where my documents are that field's values per row.
Is it possible to calculate this using a SQL query? If not, or if there's a simpler way, could you please direct me to it?
Many Thanks,
Jon
In SQL Server 2008 depending on your needs you could apply full text indexing to the column then query the sys.dm_fts_index_keywords and sys.dm_fts_index_keywords_by_document table valued functions to get the occurrence count.
Edit: Actually even without creating a persistent full text index you can still leverage the parser
WITH testTable AS
(
SELECT 1 AS Id, N'how now brown cow' AS txt UNION ALL
SELECT 2, N'she sells sea shells upon the sea shore' UNION ALL
SELECT 3, N'red lorry yellow lorry' UNION ALL
SELECT 4, N'the quick brown fox jumped over the lazy dog'
)
SELECT display_term, COUNT(*) As Cnt
FROM testTable
CROSS APPLY sys.dm_fts_parser('"' + REPLACE(txt,'"','""') + '"', 1033, 0,0)
WHERE TXT IS NOT NULL
GROUP BY display_term
HAVING COUNT(*) > 1
ORDER BY Cnt DESC
Returns
display_term Cnt
------------------------------ -----------
the 3
brown 2
lorry 2
sea 2
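To push this toward an actual TF-IDF rather than raw counts, here is a sketch of my own (not from the answer above) that reuses sys.dm_fts_parser for tokenizing; the weighting tf * log(N / df) is one common variant among several:
-- Sketch: tf-idf per (document, term) using sys.dm_fts_parser to tokenize.
-- tf = term count within the document; idf = log(total docs / docs containing the term).
WITH testTable AS
(
    SELECT 1 AS Id, N'how now brown cow' AS txt UNION ALL
    SELECT 2, N'she sells sea shells upon the sea shore' UNION ALL
    SELECT 3, N'red lorry yellow lorry' UNION ALL
    SELECT 4, N'the quick brown fox jumped over the lazy dog'
),
tokens AS
(
    SELECT t.Id, p.display_term
    FROM testTable t
    CROSS APPLY sys.dm_fts_parser('"' + REPLACE(t.txt, '"', '""') + '"', 1033, 0, 0) p
    WHERE t.txt IS NOT NULL
),
tf AS
(
    SELECT Id, display_term, COUNT(*) AS term_count
    FROM tokens
    GROUP BY Id, display_term
)
SELECT tf.Id,
       tf.display_term,
       tf.term_count,
       LOG( CAST((SELECT COUNT(*) FROM testTable) AS FLOAT)
            / COUNT(*) OVER (PARTITION BY tf.display_term) ) AS idf,
       tf.term_count *
       LOG( CAST((SELECT COUNT(*) FROM testTable) AS FLOAT)
            / COUNT(*) OVER (PARTITION BY tf.display_term) ) AS tf_idf
FROM tf
ORDER BY tf_idf DESC;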
Solution for SQL Server 2008:
here is the table:
CREATE TABLE MyTable (id INT, txt VARCHAR(MAX));
here is the SQL query:
SELECT sum(case when TSplitted.txt_word = 'searched' then 1 else 0 end) as cnt_searched
     , count(*) as cnt_all
FROM MyTable MYT
CROSS APPLY Fn_Split(MYT.id, ' ', MYT.txt) TSplitted
here is the table-valued function Fn_Split(@id int, @separator VARCHAR(32), @string VARCHAR(MAX)) (taken from here):
CREATE FUNCTION Fn_Split (@id int, @separator VARCHAR(32), @string VARCHAR(MAX))
RETURNS @t TABLE
(
    ret_id INT
   ,txt_word VARCHAR(MAX)
)
AS
BEGIN
    DECLARE @xml XML
    SET @xml = N'<root><r>' + REPLACE(@string, @separator, '</r><r>') + '</r></root>'

    INSERT INTO @t(ret_id, txt_word)
    SELECT @id, r.value('.','VARCHAR(MAX)') as Item
    FROM @xml.nodes('//root/r') AS RECORDS(r)

    RETURN
END