Count Of Distinct Characters In Column - sql

Say I have the following data set
Column1 (VarChar(50 or something))
Elias
Sails
Pails
Plane
Games
What I'd like to produce from this column is the following set:
LETTER COUNT
E 3
L 4
I 3
A 5
S 5
And So On...
One solution I thought of was combining all strings into a single string, and then count each instance of the letter in that string, but that feels sloppy.
This is more an exercise of curiosity than anything else, but, is there a way to get a count of all distinct letters in a dataset with SQL?

I would do this by creating a table of your letters similar to:
CREATE TABLE tblLetter
(
letter varchar(1)
);
INSERT INTO tblLetter ([letter])
VALUES
('a'),
('b'),
('c'),
('d'); -- etc
Then you could join the letters to your table where your data is like the letter:
select l.letter, count(n.col) Total
from tblLetter l
inner join names n
on n.col like '%'+l.letter+'%'
group by l.letter;
See SQL Fiddle with Demo. This would give a result:
| LETTER | TOTAL |
|--------|-------|
| a | 5 |
| e | 3 |
| g | 1 |
| i | 3 |
| l | 4 |
| m | 1 |
| p | 2 |
| s | 4 |

If you create a table of letters, like this:
create table letter (ch char(1));
insert into letter(ch) values ('A'),('B'),('C'),('D'),('E'),('F'),('G'),('H')
,('I'),('J'),('K'),('L'),('M'),('N'),('O'),('P')
,('Q'),('R'),('S'),('T'),('U'),('V'),('W'),('X'),('Y'),('Z');
you could do it with a cross join, like this:
select ch, SUM(len(str) - len(replace(str,ch,'')))
from letter
cross join test -- <<== test is the name of the table with the string
group by ch
having SUM(len(str) - len(replace(str,ch,''))) <> 0
Here is a running demo on sqlfiddle.
You can do it without defining a table by embedding a list of letters into a query itself, but the idea of cross-joining and grouping by the letter would remain the same.
Note: see this answer for the explanation of the expression inside the SUM.

To me, this is a problem almost tailored for a CTE (Thanks, Nicholas Carey, for the original, my fiddle here: http://sqlfiddle.com/#!3/44f77/8):
WITH cteLetters
AS
(
SELECT
1 AS CharPos,
str,
MAX(LEN(str)) AS MaxLen,
SUBSTRING(str, 1, 1) AS Letter
FROM
test
GROUP BY
str,
SUBSTRING(str, 1, 1)
UNION ALL
SELECT
CharPos + 1,
str,
MaxLen,
SUBSTRING(str, CharPos + 1, 1) AS Letter
FROM
cteLetters
WHERE
CharPos + 1 <= MaxLen
)
SELECT
UPPER(Letter) AS Letter,
COUNT(*) CountOfLetters
FROM
cteLetters
GROUP BY
Letter
ORDER BY
Letter;
Use the CTE to calculate character positions and deconstruct each string. Then you can just aggregate from the CTE itself. No need for additional tables or anything.

This should work even if you have case sensitivity turned on.
The setup:
CREATE TABLE _test ( Column1 VARCHAR (50) )
INSERT _test (Column1) VALUES ('Elias'),('Sails'),('Pails'),('Plane'),('Games')
The work:
DECLARE #counter AS INT
DECLARE #results TABLE (LETTER VARCHAR(1),[COUNT] INT)
SET #counter=65 --ascii value for 'A'
WHILE ( #counter <=90 ) -- ascii value for 'Z'
BEGIN
INSERT #results (LETTER,[COUNT])
SELECT CHAR(#counter),SUM(LEN(UPPER(Column1)) - LEN(REPLACE(UPPER(Column1), CHAR(#counter),''))) FROM _test
SET #counter=#counter+1
END
SELECT * FROM #results WHERE [Count]>0

It's often useful to have a range or sequence table that gives you a source of large runs of contiguous sequential numbers, like this one covering the range -100,000–+100,000.
drop table dbo.range
go
create table dbo.range
(
id int not null primary key clustered ,
)
go
set nocount on
go
declare #i int = -100000
while ( #i <= +100000 )
begin
if ( #i > 0 and #i % 1000 = 0 ) print convert(varchar,#i) + ' rows'
insert dbo.range values ( #i )
set #i = #i + 1
end
go
set nocount off
go
Once you have such a table, you can do something like this:
select character = substring( t.some_column , r.id , 1 ) ,
frequency = count(*)
from dbo.some_table t
join dbo.range r on r.id between 1 and len( t.some_column )
group by substring( t.some_column , r.id , 1 )
order by 1
If you want to ensure case-insensitivity, just mix in the desired upper() or lower():
select character = upper( substring( t.some_column , r.id , 1 ) ) ,
frequency = count(*)
from dbo.some_table t
join dbo.range r on r.id between 1 and len( t.some_column )
group by upper( substring( t.some_column , r.id , 1 ) )
order by 1
Given your sample data:
create table dbo.some_table
(
some_column varchar(50) not null
)
go
insert dbo.some_table values ( 'Elias' )
insert dbo.some_table values ( 'Sails' )
insert dbo.some_table values ( 'Pails' )
insert dbo.some_table values ( 'Plane' )
insert dbo.some_table values ( 'Games' )
go
The latter query above produces the following results:
character frequency
A 5
E 3
G 1
I 3
L 4
M 1
N 1
P 2
S 5

Related

Count number of repeated character in a given string

How do I count the number of occurrences of repeated $ character in the given strings.
For ex:
String = '$$$$ABC$$$DE$$$' --> Answer is 4,3,3
String = '###$$%%ANE$$$$$' --> Answer is 2,5
I have no idea how to do it so did not do any attempts.
Thanks for your help.
For Reproducing:
DDL and Inserts:
Create table xyz(text varchar(200));
Insert into xyz values('$$$$ABC$$$DE$$$');
Insert into xyz values('###$$%%ANE$$$$$');
What I need to do: Count the repeated number of '$'
Desired output, based on the sample data in #1 above.
text = '$$$$ABC$$$DE$$$' --> Answer is 4,3,3
text = '###$$%%ANE$$$$$' --> Answer is 2,5
SQL Server version: Microsoft SQL Server 2019 (RTM) - 15.0.2000.5
Please try the following solution. It will work starting from SQL Server 2017 onwards.
It is based on use of the TRANSLATE() function, and XML and XQuery.
SQL
-- DDL and sample data population, start
DECLARE #tbl TABLE (ID INT IDENTITY PRIMARY KEY, tokens VARCHAR(30));
INSERT INTO #tbl (tokens) VALUES
('$$$$ABC$$$DE$$$'), --> Answer is 4,3,3
('###$$%%ANE$$$$$'); --> Answer is 2,5
-- DDL and sample data population, end
DECLARE #separator CHAR(1) = SPACE(1);
;WITH cte AS
(
SELECT *
, REPLACE(TRANSLATE(tokens, '$', SPACE(1)),' ','') AS JunkCharacters
FROM #tbl
)
SELECT *
, REPLACE(TRY_CAST('<root><r><![CDATA[' +
REPLACE(TRANSLATE(tokens, TRIM(JunkCharacters), SPACE(LEN(TRIM(JunkCharacters)))), #separator, ']]></r><r><![CDATA[') +
']]></r></root>' AS XML)
.query('
for $x in /root/r[text()]
return data(string-length($x))
').value('.', 'VARCHAR(20)'), SPACE(1), ',') AS CleansedTokensCounter
FROM cte;
Output
+----+-----------------+----------------+-----------------------+
| ID | tokens | JunkCharacters | CleansedTokensCounter |
+----+-----------------+----------------+-----------------------+
| 1 | $$$$ABC$$$DE$$$ | ABCDE | 4,3,3 |
| 2 | ###$$%%ANE$$$$$ | ###%%ANE | 2,5 |
+----+-----------------+----------------+-----------------------+
We can do this with a number of steps:
We use a tally/numbers table to shred the string into individual characters. The tally is calculated on the fly with a couple of cross-joins and ROW_NUMBER
We then calculate a grouping ID for each group of characters, using a standard gaps-and-islands technique: a windowed sum of each starting row
Filter down to the character we want, group it by ID and return a count of rows in each group.
This returns a new row for every group of $ characters
Create table xyz(text varchar(200));
Insert into xyz values('$$$$ABC$$$DE$$$');
Insert into xyz values('###$$%%ANE$$$$$');
WITH
L0 AS ( SELECT 1 AS c
FROM (VALUES(1),(1),(1),(1),(1),(1),(1),(1),
(1),(1),(1),(1),(1),(1),(1),(1)) AS D(c) ),
L1 AS ( SELECT 1 AS c FROM L0 AS A CROSS JOIN L0 AS B ),
-- you can allow for larger strings with more cross-joins
Nums AS ( SELECT ROW_NUMBER() OVER(ORDER BY (SELECT NULL)) AS rownum
FROM L1 )
SELECT
xyz.[text],
r.numRepetitions
FROM xyz
CROSS APPLY (
SELECT numRepetitions = COUNT(*)
FROM (
SELECT TOP(LEN(xyz.[text]))
thisChar = SUBSTRING(xyz.[text], rownum, 1),
groupId = SUM(CASE WHEN rownum = 1 OR SUBSTRING(xyz.[text], rownum, 1) <> SUBSTRING(xyz.[text], rownum - 1, 1) THEN 1 ELSE 0 END)
OVER (ORDER BY rownum ROWS UNBOUNDED PRECEDING)
FROM Nums
ORDER BY rownum
) AS chars
WHERE thisChar = '$'
GROUP BY groupId
) AS r;
If you want a single comma-separated list of row-counts, you need to subquery again
CROSS APPLY (
SELECT numRepetitions = STRING_AGG(CAST(numRepetitions AS varchar(10)), ',')
FROM (
SELECT numRepetitions = COUNT(*)
FROM (
SELECT TOP(LEN(xyz.[text]))
thisChar = SUBSTRING(xyz.[text], rownum, 1),
groupId = SUM(CASE WHEN rownum = 1 OR SUBSTRING(xyz.[text], rownum, 1) <> SUBSTRING(xyz.[text], rownum - 1, 1) THEN 1 ELSE 0 END)
OVER (ORDER BY rownum ROWS UNBOUNDED PRECEDING)
FROM Nums
ORDER BY rownum
) AS chars
WHERE thisChar = '$'
GROUP BY groupId
) AS groups
) AS r;

Split a coma separated string in a particular format

I have a table with a column which represent hierarchy path, so when i execute the SQL query
select hierachypath from mytable where id=10
for a particular row i will get the result like this
hieracheypath
--------------
1,2,3,4,5,6,7,8,9,10
select hierachypath from mytable where id=10
I want to get a result like
1,2,3,4,5,6,7,8,9,10
1,1,2,3,4,5,6,7,8,9
1,2,3,4,5,6,7,8
1,2,3,4,5,6,7
1,2,3,4,5,6
1,2,3,4,5
1,2,3,4
1,2,3
1,2
1
OR
1
1,2
1,2,3
1,2,3,4
1,2,3,4,5
1,2,3,4,5,6
1,2,3,4,5,6,7
1,2,3,4,5,6,7,8
1,2,3,4,5,6,7,8,9
1,2,3,4,5,6,7,8,9,10
I had try this way
Declare #heiracheypath nvarchar(4000) ='1,2,3,4,5,6,7,8,9,10'
declare #Result TABLE (Column1 VARCHAR(100))
Declare #tcount int
SELECT #tcount=(len(#heiracheypath) - LEN(REPLACE(#heiracheypath,',','')) + 1)
DECLARE #IntLocation INT
WHILE (CHARINDEX(',', #heiracheypath, 0) > 0)
BEGIN
SET #IntLocation = CHARINDEX(',', #heiracheypath, 0)
INSERT INTO #Result (Column1)
--LTRIM and RTRIM to ensure blank spaces are removed
SELECT RTRIM(LTRIM(SUBSTRING(#heiracheypath, 0, #IntLocation)))
SET #heiracheypath = STUFF(#heiracheypath, 1, #IntLocation, '')
END
INSERT INTO #Result (Column1)
SELECT RTRIM(LTRIM(#heiracheypath))--LTRIM and RTRIM to ensure blank spaces are removed
select * from #Result
but the result was
Column1
-------
1
2
3
4
5
6
7
8
9
10
The code in the question Looks like T-SQL - so here's a simple solution without common table expressions:
DECLARE #heiracheypath nvarchar(4000) ='1,2,3,4,5,6,7,8,9,10';
SELECT SUBSTRING(#heiracheypath, 1, ci-1) As Paths
FROM
(
SELECT CHARINDEX(',',#heiracheypath, N) As ci
FROM
(
SELECT TOP(LEN(#heiracheypath)) ROW_NUMBER() OVER(ORDER BY ##SPID) As N
FROM sys.objects A
) AS Tally
UNION
SELECT LEN(#heiracheypath) + 1
) As CommaIndexes
WHERE ci > 0
ORDER BY ci
The Tally derived table contains numbers from 1 to the length of the value,
the CommaIndexes table contains the distinct indexes of each comma in the value,
the union part is to also return the full string,
and the outer most select statement simply use substring to return the relevant parts of the string.
This could be simplified further by combining the tally derived table with the commaIndexs derived table:
SELECT SUBSTRING(#heiracheypath, 1, ci-1) As Paths
FROM
(
SELECT TOP(LEN(#heiracheypath)) CHARINDEX(',',#heiracheypath, ROW_NUMBER() OVER(ORDER BY ##SPID)) As ci
FROM sys.objects A
UNION SELECT LEN(#heiracheypath) + 1
) As CommaIndexes
WHERE ci > 0
ORDER BY ci
Result:
Paths
1
1,2
1,2,3
1,2,3,4
1,2,3,4,5
1,2,3,4,5,6
1,2,3,4,5,6,7
1,2,3,4,5,6,7,8
1,2,3,4,5,6,7,8,9
1,2,3,4,5,6,7,8,9,10

Identify distinct codes present in one column but not present in another column

I have data like the following table. First two columns are list of country codes with pipe separator. There are two group of rows with RANK as 1 and 2.
I am trying to identify the country codes which are present in CountryList1 but not present in the column CountryList1 over a give RANK. For Rank 1 rows, HN JP SK and KY is present in CountryList1 but not present in CountryList2. Likewise, for Rank 2 rows. HN is present in CountryList1 but not present in CountryList2.
I am expecting Output like second table. I do not want to use a function or a procedure but trying to accomplish it using select statement.
Input
CountryList1 || CountryList2 || RANK
================||==============||=======
HN|IN|US || GB|CA|CH|CA || 1
JP|CH || IN|US|LU || 1
HN|SK|KY || GB|CA || 1
FI || IN|MO || 1
HN|IN|US || HN || 2
JP|CH || CH|IN|US || 2
HN || NO || 2
Output
DistinctCountry1 || RAN
====================||========
HN || 1
JP || 1
SK || 1
KY || 1
JP || 2
You have an abominable data structure. You should be storing elements of a list as separate values on rows. But you can do something by splitting the values. SQL Server 2016 has string_split(). For earlier versions you can find one on the web.
with tc as (
select t.*, s.country1
from t cross apply
(string_split(t.countrylist1, '|') s(country1)
)
select distinct t.country1, t.rnk
from tc
where not exists (select 1
from t t2
where tc.rnk = t2.rnk and
tc.country in (select value from string_split(t2.country_list))
);
This will not be efficient. And with the data structure you have, there is little scope for improving performance.
Here's a nice loop you can use for this:
declare #holding table (country1 varchar(max), country2 varchar(max), rank int)
declare #iterator int=1
declare #countrylistoriginal1 varchar(max)
declare #countrylistoriginal2 varchar(max)
declare #countrylist1 varchar(max)
declare #countrylist2 varchar(max)
declare #rank int
while #iterator<=(select max(rowid) from #temp2)
begin
select #countrylistoriginal1=countrylist1+'|', #rank=[rank]
from yourtable where rowid=#iterator
while #countrylistoriginal1<>''
begin
set #countrylist1=left(#countrylistoriginal1,(charindex('|',#countrylistoriginal1)))
set #countrylistoriginal1=replace(#countrylistoriginal1, #countrylist1,'')
select #countrylistoriginal2=countrylist2+'|'
from yourtable where rowid=#iterator
while #countrylistoriginal2<>''
begin
set #countrylist2=left(#countrylistoriginal2,(charindex('|',#countrylistoriginal2)))
set #countrylistoriginal2=replace(#countrylistoriginal2, #countrylist2,'')
insert #holding
select replace(#countrylist1,'|',''), replace(#countrylist2,'|',''), #rank
end
end
set #iterator=#iterator+1
end
select distinct a.country1, a.rank from #holding a
left join #holding b on a.country1=b.country2 and a.rank=b.rank where b.country2 is null
Try this...
Table Schema and data
CREATE TABLE [tableName](
[CountryList1] [nvarchar](50) NULL,
[CountryList2] [nvarchar](50) NULL,
[RANK] [int] NULL
)
INSERT [tableName] ([CountryList1], [CountryList2], [RANK]) VALUES (N'HN|IN|US', N'GB|CA|CH|CA', 1)
INSERT [tableName] ([CountryList1], [CountryList2], [RANK]) VALUES (N'JP|CH ', N'IN|US|LU', 1)
INSERT [tableName] ([CountryList1], [CountryList2], [RANK]) VALUES (N'HN|SK|KY', N'GB|CA', 1)
INSERT [tableName] ([CountryList1], [CountryList2], [RANK]) VALUES (N'FI', N'IN|MO', 1)
INSERT [tableName] ([CountryList1], [CountryList2], [RANK]) VALUES (N'HN|IN|US', N'HN ', 2)
INSERT [tableName] ([CountryList1], [CountryList2], [RANK]) VALUES (N'JP|CH', N'CH|IN|US', 2)
INSERT [tableName] ([CountryList1], [CountryList2], [RANK]) VALUES (N'HN', N'NO', 2)
SQL Query
;WITH cte AS
( SELECT DISTINCT *
FROM (SELECT [value] AS DistinctCountry1,
[rank],
Rtrim(Ltrim([value])) + Cast([rank] AS NVARCHAR(max)) AS colX
FROM tablename
CROSS apply String_split([countrylist1], '|')) tmp
WHERE colx NOT IN (SELECT Rtrim(Ltrim([value])) + Cast([rank] AS NVARCHAR(max)) AS colX
FROM tablename
CROSS apply String_split([countrylist2], '|'))
)
SELECT [distinctcountry1], [rank]
FROM cte
ORDER BY [rank]
Output
+------------------+------+
| distinctcountry1 | rank |
+------------------+------+
| FI | 1 |
| HN | 1 |
| JP | 1 |
| KY | 1 |
| SK | 1 |
| JP | 2 |
+------------------+------+
Demo: http://www.sqlfiddle.com/#!18/19acb/2/0
Note: As others already suggested, you should really consider fixing your table or you'll have to put extra hours when manipulating data.

SQL taking a one-to-many delimited list in a table column and transforming it into a one-to-one relationship table [duplicate]

I have a SQL Table like this:
| SomeID | OtherID | Data
+----------------+-------------+-------------------
| abcdef-..... | cdef123-... | 18,20,22
| abcdef-..... | 4554a24-... | 17,19
| 987654-..... | 12324a2-... | 13,19,20
is there a query where I can perform a query like SELECT OtherID, SplitData WHERE SomeID = 'abcdef-.......' that returns individual rows, like this:
| OtherID | SplitData
+-------------+-------------------
| cdef123-... | 18
| cdef123-... | 20
| cdef123-... | 22
| 4554a24-... | 17
| 4554a24-... | 19
Basically split my data at the comma into individual rows?
I am aware that storing a comma-separated string into a relational database sounds dumb, but the normal use case in the consumer application makes that really helpful.
I don't want to do the split in the application as I need paging, so I wanted to explore options before refactoring the whole app.
It's SQL Server 2008 (non-R2).
You can use the wonderful recursive functions from SQL Server:
Sample table:
CREATE TABLE Testdata
(
SomeID INT,
OtherID INT,
String VARCHAR(MAX)
);
INSERT Testdata SELECT 1, 9, '18,20,22';
INSERT Testdata SELECT 2, 8, '17,19';
INSERT Testdata SELECT 3, 7, '13,19,20';
INSERT Testdata SELECT 4, 6, '';
INSERT Testdata SELECT 9, 11, '1,2,3,4';
The query
WITH tmp(SomeID, OtherID, DataItem, String) AS
(
SELECT
SomeID,
OtherID,
LEFT(String, CHARINDEX(',', String + ',') - 1),
STUFF(String, 1, CHARINDEX(',', String + ','), '')
FROM Testdata
UNION all
SELECT
SomeID,
OtherID,
LEFT(String, CHARINDEX(',', String + ',') - 1),
STUFF(String, 1, CHARINDEX(',', String + ','), '')
FROM tmp
WHERE
String > ''
)
SELECT
SomeID,
OtherID,
DataItem
FROM tmp
ORDER BY SomeID;
-- OPTION (maxrecursion 0)
-- normally recursion is limited to 100. If you know you have very long
-- strings, uncomment the option
Output
SomeID | OtherID | DataItem
--------+---------+----------
1 | 9 | 18
1 | 9 | 20
1 | 9 | 22
2 | 8 | 17
2 | 8 | 19
3 | 7 | 13
3 | 7 | 19
3 | 7 | 20
4 | 6 |
9 | 11 | 1
9 | 11 | 2
9 | 11 | 3
9 | 11 | 4
Finally, the wait is over with SQL Server 2016. They have introduced the Split string function, STRING_SPLIT:
select OtherID, cs.Value --SplitData
from yourtable
cross apply STRING_SPLIT (Data, ',') cs
All the other methods to split string like XML, Tally table, while loop, etc.. have been blown away by this STRING_SPLIT function.
Here is an excellent article with performance comparison: Performance Surprises and Assumptions: STRING_SPLIT.
For older versions, using tally table here is one split string function(best possible approach)
CREATE FUNCTION [dbo].[DelimitedSplit8K]
(#pString VARCHAR(8000), #pDelimiter CHAR(1))
RETURNS TABLE WITH SCHEMABINDING AS
RETURN
--===== "Inline" CTE Driven "Tally Table" produces values from 0 up to 10,000...
-- enough to cover NVARCHAR(4000)
WITH E1(N) AS (
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
), --10E+1 or 10 rows
E2(N) AS (SELECT 1 FROM E1 a, E1 b), --10E+2 or 100 rows
E4(N) AS (SELECT 1 FROM E2 a, E2 b), --10E+4 or 10,000 rows max
cteTally(N) AS (--==== This provides the "base" CTE and limits the number of rows right up front
-- for both a performance gain and prevention of accidental "overruns"
SELECT TOP (ISNULL(DATALENGTH(#pString),0)) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
),
cteStart(N1) AS (--==== This returns N+1 (starting position of each "element" just once for each delimiter)
SELECT 1 UNION ALL
SELECT t.N+1 FROM cteTally t WHERE SUBSTRING(#pString,t.N,1) = #pDelimiter
),
cteLen(N1,L1) AS(--==== Return start and length (for use in substring)
SELECT s.N1,
ISNULL(NULLIF(CHARINDEX(#pDelimiter,#pString,s.N1),0)-s.N1,8000)
FROM cteStart s
)
--===== Do the actual split. The ISNULL/NULLIF combo handles the length for the final element when no delimiter is found.
SELECT ItemNumber = ROW_NUMBER() OVER(ORDER BY l.N1),
Item = SUBSTRING(#pString, l.N1, l.L1)
FROM cteLen l
;
Referred from Tally OH! An Improved SQL 8K “CSV Splitter” Function
Check this
SELECT A.OtherID,
Split.a.value('.', 'VARCHAR(100)') AS Data
FROM
(
SELECT OtherID,
CAST ('<M>' + REPLACE(Data, ',', '</M><M>') + '</M>' AS XML) AS Data
FROM Table1
) AS A CROSS APPLY Data.nodes ('/M') AS Split(a);
Very late but try this out:
SELECT ColumnID, Column1, value --Do not change 'value' name. Leave it as it is.
FROM tbl_Sample
CROSS APPLY STRING_SPLIT(Tags, ','); --'Tags' is the name of column containing comma separated values
So we were having this:
tbl_Sample :
ColumnID| Column1 | Tags
--------|-----------|-------------
1 | ABC | 10,11,12
2 | PQR | 20,21,22
After running this query:
ColumnID| Column1 | value
--------|-----------|-----------
1 | ABC | 10
1 | ABC | 11
1 | ABC | 12
2 | PQR | 20
2 | PQR | 21
2 | PQR | 22
Thanks!
select t.OtherID,x.Kod
from testData t
cross apply (select Code from dbo.Split(t.Data,',') ) x
As of Feb 2016 - see the TALLY Table Example - very likely to outperform my TVF below, from Feb 2014. Keeping original post below for posterity:
Too much repeated code for my liking in the above examples. And I dislike the performance of CTEs and XML. Also, an explicit Id so that consumers that are order specific can specify an ORDER BY clause.
CREATE FUNCTION dbo.Split
(
#Line nvarchar(MAX),
#SplitOn nvarchar(5) = ','
)
RETURNS #RtnValue table
(
Id INT NOT NULL IDENTITY(1,1) PRIMARY KEY CLUSTERED,
Data nvarchar(100) NOT NULL
)
AS
BEGIN
IF #Line IS NULL RETURN;
DECLARE #split_on_len INT = LEN(#SplitOn);
DECLARE #start_at INT = 1;
DECLARE #end_at INT;
DECLARE #data_len INT;
WHILE 1=1
BEGIN
SET #end_at = CHARINDEX(#SplitOn,#Line,#start_at);
SET #data_len = CASE #end_at WHEN 0 THEN LEN(#Line) ELSE #end_at-#start_at END;
INSERT INTO #RtnValue (data) VALUES( SUBSTRING(#Line,#start_at,#data_len) );
IF #end_at = 0 BREAK;
SET #start_at = #end_at + #split_on_len;
END;
RETURN;
END;
Nice to see that it have been solved in the 2016 version, but for all of those that is not on that, here are two generalized and simplified versions of the methods above.
The XML-method is shorter, but of course requires the string to allow for the xml-trick (no 'bad' chars.)
XML-Method:
create function dbo.splitString(#input Varchar(max), #Splitter VarChar(99)) returns table as
Return
SELECT Split.a.value('.', 'VARCHAR(max)') AS Data FROM
( SELECT CAST ('<M>' + REPLACE(#input, #Splitter, '</M><M>') + '</M>' AS XML) AS Data
) AS A CROSS APPLY Data.nodes ('/M') AS Split(a);
Recursive method:
create function dbo.splitString(#input Varchar(max), #Splitter Varchar(99)) returns table as
Return
with tmp (DataItem, ix) as
( select #input , CHARINDEX('',#Input) --Recu. start, ignored val to get the types right
union all
select Substring(#input, ix+1,ix2-ix-1), ix2
from (Select *, CHARINDEX(#Splitter,#Input+#Splitter,ix+1) ix2 from tmp) x where ix2<>0
) select DataItem from tmp where ix<>0
Function in action
Create table TEST_X (A int, CSV Varchar(100));
Insert into test_x select 1, 'A,B';
Insert into test_x select 2, 'C,D';
Select A,data from TEST_X x cross apply dbo.splitString(x.CSV,',') Y;
Drop table TEST_X
XML-METHOD 2: Unicode Friendly 😀 (Addition courtesy of Max Hodges)
create function dbo.splitString(#input nVarchar(max), #Splitter nVarchar(99)) returns table as
Return
SELECT Split.a.value('.', 'NVARCHAR(max)') AS Data FROM
( SELECT CAST ('<M>' + REPLACE(#input, #Splitter, '</M><M>') + '</M>' AS XML) AS Data
) AS A CROSS APPLY Data.nodes ('/M') AS Split(a);
Please refer below TSQL. STRING_SPLIT function is available only under compatibility level 130 and above.
TSQL:
DECLARE #stringValue NVARCHAR(400) = 'red,blue,green,yellow,black';
DECLARE #separator CHAR = ',';
SELECT [value] As Colour
FROM STRING_SPLIT(#stringValue, #separator);
RESULT:
Colour
red
blue
green
yellow
black
I know it has a lot of answers, but I want to write my version of split function like others and like string_split SQL Server 2016 native function.
create function [dbo].[Split]
(
#Value nvarchar(max),
#Delimiter nvarchar(50)
)
returns #tbl table
(
Seq int primary key identity(1, 1),
Value nvarchar(max)
)
as begin
declare #Xml xml = cast('<d>' + replace(#Value, #Delimiter, '</d><d>') + '</d>' as xml);
insert into #tbl
(Value)
select a.split.value('.', 'nvarchar(max)') as Value
from #Xml.nodes('/d') a(split);
return;
end;
Seq column is primary key to support fast join with other real table or Split function returned table.
Used XML function to support large data (looping version will slow down significantly when you have large data)
Here's a answer to question.
CREATE TABLE Testdata
(
SomeID INT,
OtherID INT,
String VARCHAR(MAX)
);
INSERT Testdata SELECT 1, 9, '18,20,22';
INSERT Testdata SELECT 2, 8, '17,19';
INSERT Testdata SELECT 3, 7, '13,19,20';
INSERT Testdata SELECT 4, 6, '';
INSERT Testdata SELECT 9, 11, '1,2,3,4';
select t.SomeID, t.OtherID, s.Value
from Testdata t
cross apply dbo.Split(t.String, ',') s;
--Output
SomeID OtherID Value
1 9 18
1 9 20
1 9 22
2 8 17
2 8 19
3 7 13
3 7 19
3 7 20
4 6
9 11 1
9 11 2
9 11 3
9 11 4
Joining Split with other split
declare #Names nvarchar(max) = 'a,b,c,d';
declare #Codes nvarchar(max) = '10,20,30,40';
select n.Seq, n.Value Name, c.Value Code
from dbo.Split(#Names, ',') n
inner join dbo.Split(#Codes, ',') c on n.Seq = c.Seq;
--Output
Seq Name Code
1 a 10
2 b 20
3 c 30
4 d 40
Split two times
declare #NationLocSex nvarchar(max) = 'Korea,Seoul,1;Vietnam,Kiengiang,0;China,Xian,0';
with rows as
(
select Value
from dbo.Split(#NationLocSex, ';')
)
select rw.Value r, cl.Value c
from rows rw
cross apply dbo.Split(rw.Value, ',') cl;
--Output
r c
Korea,Seoul,1 Korea
Korea,Seoul,1 Seoul
Korea,Seoul,1 1
Vietnam,Kiengiang,0 Vietnam
Vietnam,Kiengiang,0 Kiengiang
Vietnam,Kiengiang,0 0
China,Xian,0 China
China,Xian,0 Xian
China,Xian,0 0
Split to columns
declare #Numbers nvarchar(50) = 'First,Second,Third';
with t as
(
select case when Seq = 1 then Value end f1,
case when Seq = 2 then Value end f2,
case when Seq = 3 then Value end f3
from dbo.Split(#Numbers, ',')
)
select min(f1) f1, min(f2) f2, min(f3) f3
from t;
--Output
f1 f2 f3
First Second Third
Generate rows by range
declare #Ranges nvarchar(50) = '1-2,4-6';
declare #Numbers table (Num int);
insert into #Numbers values (1),(2),(3),(4),(5),(6),(7),(8);
with t as
(
select r.Seq, r.Value,
min(case when ft.Seq = 1 then ft.Value end) ValueFrom,
min(case when ft.Seq = 2 then ft.Value end) ValueTo
from dbo.Split(#Ranges, ',') r
cross apply dbo.Split(r.Value, '-') ft
group by r.Seq, r.Value
)
select t.Seq, t.Value, t.ValueFrom, t.ValueTo, n.Num
from t
inner join #Numbers n on n.Num between t.ValueFrom and t.ValueTo;
--Output
Seq Value ValueFrom ValueTo Num
1 1-2 1 2 1
1 1-2 1 2 2
2 4-6 4 6 4
2 4-6 4 6 5
2 4-6 4 6 6
DECLARE #id_list VARCHAR(MAX) = '1234,23,56,576,1231,567,122,87876,57553,1216';
DECLARE #table TABLE ( id VARCHAR(50) );
DECLARE #x INT = 0;
DECLARE #firstcomma INT = 0;
DECLARE #nextcomma INT = 0;
SET #x = LEN(#id_list) - LEN(REPLACE(#id_list, ',', '')) + 1; -- number of ids in id_list
WHILE #x > 0
BEGIN
SET #nextcomma = CASE WHEN CHARINDEX(',', #id_list, #firstcomma + 1) = 0
THEN LEN(#id_list) + 1
ELSE CHARINDEX(',', #id_list, #firstcomma + 1)
END;
INSERT INTO #table
VALUES ( SUBSTRING(#id_list, #firstcomma + 1, (#nextcomma - #firstcomma) - 1) );
SET #firstcomma = CHARINDEX(',', #id_list, #firstcomma + 1);
SET #x = #x - 1;
END;
SELECT *
FROM #table;
;WITH tmp(SomeID, OtherID, DataItem, Data) as (
SELECT SomeID, OtherID, LEFT(Data, CHARINDEX(',',Data+',')-1),
STUFF(Data, 1, CHARINDEX(',',Data+','), '')
FROM Testdata
WHERE Data > ''
)
SELECT SomeID, OtherID, Data
FROM tmp
ORDER BY SomeID
with only tiny little modification to above query...
By creating this function ([DelimitedSplit]) which splits a string, you could do an OUTER APPLY to your SELECT.
CREATE FUNCTION [dbo].[DelimitedSplit]
--===== Define I/O parameters
(#pString VARCHAR(8000), #pDelimiter CHAR(1))
--WARNING!!! DO NOT USE MAX DATA-TYPES HERE! IT WILL KILL PERFORMANCE!
RETURNS TABLE WITH SCHEMABINDING AS
RETURN
--===== "Inline" CTE Driven "Tally Table" produces values from 1 up to 10,000...
-- enough to cover VARCHAR(8000)
WITH E1(N) AS (
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
), --10E+1 or 10 rows
E2(N) AS (SELECT 1 FROM E1 a INNER JOIN E1 b ON b.N = a.N), --10E+2 or 100 rows
E4(N) AS (SELECT 1 FROM E2 a INNER JOIN E2 b ON b.N = a.N), --10E+4 or 10,000 rows max
cteTally(N) AS (--==== This provides the "base" CTE and limits the number of rows right up front
-- for both a performance gain and prevention of accidental "overruns"
SELECT TOP (ISNULL(DATALENGTH(#pString),0)) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
),
cteStart(N1) AS (--==== This returns N+1 (starting position of each "element" just once for each delimiter)
SELECT 1 UNION ALL
SELECT t.N+1 FROM cteTally t WHERE SUBSTRING(#pString,t.N,1) = #pDelimiter
),
cteLen(N1,L1) AS(--==== Return start and length (for use in substring)
SELECT s.N1,
ISNULL(NULLIF(CHARINDEX(#pDelimiter,#pString,s.N1),0)-s.N1,8000)
FROM cteStart s
)
--===== Do the actual split. The ISNULL/NULLIF combo handles the length for the final element when no delimiter is found.
SELECT ItemNumber = ROW_NUMBER() OVER(ORDER BY l.N1),
Item = SUBSTRING(#pString, l.N1, l.L1)
FROM cteLen l
;
TEST
CREATE TABLE #Testdata
(
SomeID INT,
OtherID INT,
String VARCHAR(MAX)
);
INSERT #Testdata SELECT 1, 9, '18,20,22';
INSERT #Testdata SELECT 2, 8, '17,19';
INSERT #Testdata SELECT 3, 7, '13,19,20';
INSERT #Testdata SELECT 4, 6, '';
INSERT #Testdata SELECT 9, 11, '1,2,3,4';
SELECT
*
FROM #Testdata
OUTER APPLY [dbo].[DelimitedSplit](String,',');
DROP TABLE #Testdata;
RESULT
SomeID OtherID String ItemNumber Item
1 9 18,20,22 1 18
1 9 18,20,22 2 20
1 9 18,20,22 3 22
2 8 17,19 1 17
2 8 17,19 2 19
3 7 13,19,20 1 13
3 7 13,19,20 2 19
3 7 13,19,20 3 20
4 6 1
9 11 1,2,3,4 1 1
9 11 1,2,3,4 2 2
9 11 1,2,3,4 3 3
9 11 1,2,3,4 4 4
Function
CREATE FUNCTION dbo.SplitToRows (#column varchar(100), #separator varchar(10))
RETURNS #rtnTable TABLE
(
ID int identity(1,1),
ColumnA varchar(max)
)
AS
BEGIN
DECLARE #position int = 0;
DECLARE #endAt int = 0;
DECLARE #tempString varchar(100);
set #column = ltrim(rtrim(#column));
WHILE #position<=len(#column)
BEGIN
set #endAt = CHARINDEX(#separator,#column,#position);
if(#endAt=0)
begin
Insert into #rtnTable(ColumnA) Select substring(#column,#position,len(#column)-#position);
break;
end;
set #tempString = substring(ltrim(rtrim(#column)),#position,#endAt-#position);
Insert into #rtnTable(ColumnA) select #tempString;
set #position=#endAt+1;
END;
return;
END;
Use case
select * from dbo.SplitToRows('T14; p226.0001; eee; 3554;', ';');
Or just a select with multiple result set
DECLARE #column varchar(max)= '1234; 4748;abcde; 324432';
DECLARE #separator varchar(10) = ';';
DECLARE #position int = 0;
DECLARE #endAt int = 0;
DECLARE #tempString varchar(100);
set #column = ltrim(rtrim(#column));
WHILE #position<=len(#column)
BEGIN
set #endAt = CHARINDEX(#separator,#column,#position);
if(#endAt=0)
begin
Select substring(#column,#position,len(#column)-#position);
break;
end;
set #tempString = substring(ltrim(rtrim(#column)),#position,#endAt-#position);
select #tempString;
set #position=#endAt+1;
END;
When using this approach you have to make sure that none of your values contains something that would be illegal XML – user1151923
I always use the XML method. Make sure you use VALID XML. I have two functions to convert between valid XML and Text. (I tend to strip out the carriage returns as I don't usually need them.
CREATE FUNCTION dbo.udf_ConvertTextToXML (#Text varchar(MAX))
RETURNS varchar(MAX)
AS
BEGIN
SET #Text = REPLACE(#Text,CHAR(10),'');
SET #Text = REPLACE(#Text,CHAR(13),'');
SET #Text = REPLACE(#Text,'<','<');
SET #Text = REPLACE(#Text,'&','&');
SET #Text = REPLACE(#Text,'>','>');
SET #Text = REPLACE(#Text,'''','&apos;');
SET #Text = REPLACE(#Text,'"','"');
RETURN #Text;
END;
CREATE FUNCTION dbo.udf_ConvertTextFromXML (#Text VARCHAR(MAX))
RETURNS VARCHAR(max)
AS
BEGIN
SET #Text = REPLACE(#Text,'<','<');
SET #Text = REPLACE(#Text,'&','&');
SET #Text = REPLACE(#Text,'>','>');
SET #Text = REPLACE(#Text,'&apos;','''');
SET #Text = REPLACE(#Text,'"','"');
RETURN #Text;
END;
Below works on sql server 2008
select *, ROW_NUMBER() OVER(order by items) as row#
from
( select 134 myColumn1, 34 myColumn2, 'd,c,k,e,f,g,h,a' comaSeperatedColumn) myTable
cross apply
SPLIT (rtrim(comaSeperatedColumn), ',') splitedTable -- gives 'items' column
Will get all Cartesian product with the origin table columns plus "items" of split table.
You can use the following function to extract data
CREATE FUNCTION [dbo].[SplitString]
(
#RowData NVARCHAR(MAX),
#Delimeter NVARCHAR(MAX)
)
RETURNS #RtnValue TABLE
(
ID INT IDENTITY(1,1),
Data NVARCHAR(MAX)
)
AS
BEGIN
DECLARE #Iterator INT;
SET #Iterator = 1;
DECLARE #FoundIndex INT;
SET #FoundIndex = CHARINDEX(#Delimeter,#RowData);
WHILE (#FoundIndex>0)
BEGIN
INSERT INTO #RtnValue (data)
SELECT
Data = LTRIM(RTRIM(SUBSTRING(#RowData, 1, #FoundIndex - 1)));
SET #RowData = SUBSTRING(#RowData,
#FoundIndex + DATALENGTH(#Delimeter) / 2,
LEN(#RowData));
SET #Iterator = #Iterator + 1;
SET #FoundIndex = CHARINDEX(#Delimeter, #RowData);
END;
INSERT INTO #RtnValue (Data)
SELECT Data = LTRIM(RTRIM(#RowData));
RETURN;
END;

Split string and divide value

I got a table Test with columns A and B.
The A column contains different values in one entry, e.g. abc;def;ghi, all separated by ;. And the B column contains numeric values, but only one.
What I want is to seperate the values from column A into multiple rows.
So:
abc;def;ghi;jkl
-->
abc
def
ghi
jkl
In column B is one value, e.g. 20 and I want that value split to the amount of rows,
So the final result shut be:
abc 5
def 5
ghi 5
jkl 5
The issue is that the amount of values in column A must be variable.
First you need to create this function
REATE FUNCTION Split
(
#delimited nvarchar(max),
#delimiter nvarchar(100)
) RETURNS #t TABLE
(
-- Id column can be commented out, not required for sql splitting string
id int identity(1,1), -- I use this column for numbering splitted parts
val nvarchar(max),
origVal nvarchar(max)
)
AS
BEGIN
declare #xml xml
set #xml = N'<root><r>' + replace(#delimited,#delimiter,'</r><r>') + '</r></root>'
insert into #t(val,origval)
select
r.value('.','varchar(max)') as item, #delimited
from #xml.nodes('//root/r') as records(r)
RETURN
END
GO
then this query might help
Select x.Val, test.B / (len(test.A) - len(replace(Test.A, ';', '')) + 1) from Test
inner join dbo.Split(Test.A,';') x on x.origVal = Test.A
this part len(test.A) - len(replace(Test.A, ';', '')) will count the number of ; in string
Be aware this query might have some malfunctioning if there will be duplicate strings in A column, in this situation you need to pass the unique value (for example ID) to split function and return it in the result table, then join it by this value (ie. x.origVal = Test.A => x.origID = Test.ID)
You can use some tricks with CTE, STUFF and windows functions
DECLARE #t TABLE
(
ID INT ,
A NVARCHAR(MAX) ,
B INT
)
INSERT INTO #t
VALUES ( 1, 'a;b;c;d;', 20 ),
( 2, 'x;y;z;', 40 );
WITH cte ( ID, B, D, A )
AS ( SELECT ID ,
B ,
LEFT(A, CHARINDEX(';', A + ';') - 1) ,
STUFF(A, 1, CHARINDEX(';', A + ';'), '')
FROM #t
UNION ALL
SELECT ID ,
B ,
LEFT(A, CHARINDEX(';', A + ';') - 1) ,
STUFF(A, 1, CHARINDEX(';', A + ';'), '')
FROM cte
WHERE A > ''
)
SELECT ID ,
B ,
D,
CAST(B AS DECIMAL) / COUNT(*) OVER (PARTITION BY ID) AS Portion
FROM cte
Output:
ID B D Portion
1 20 a 5.00000000000
1 20 b 5.00000000000
1 20 c 5.00000000000
1 20 d 5.00000000000
2 40 x 13.33333333333
2 40 y 13.33333333333
2 40 z 13.33333333333
this an example how you can achieve required result
DECLARE #table AS TABLE
(
ColumnA VARCHAR(100) ,
ColumnB FLOAT
)
INSERT INTO #table
( ColumnA, ColumnB )
VALUES ( 'abc;def;ghi;jkl', 20 ),
( 'asf;ret;gsd;jas', 30 ),
( 'dfa;aef;gffhi;fjfkl', 40 );
WITH C AS ( SELECT n = 1
UNION ALL
SELECT n + 1
FROM C
WHERE n <= 100
),
SetForSplit
AS ( SELECT T.ColumnA ,
T.ColumnB ,
C.n ,
( CASE WHEN LEFT(SUBSTRING(T.ColumnA, n, 100), 1) = ';'
THEN SUBSTRING(T.ColumnA, n + 1, 100) + ';'
ELSE SUBSTRING(T.ColumnA, n, 100) + ';'
END ) AS SomeText
FROM #table AS T
JOIN C ON C.n <= LEN(T.ColumnA)
WHERE SUBSTRING(T.ColumnA, n, 1) = ';'
OR n = 1
)
SELECT ROW_NUMBER() OVER ( PARTITION BY columnA ORDER BY LEFT(SomeText,
CHARINDEX(';',
SomeText) - 1) ) AS RowN,
LEFT(SomeText, CHARINDEX(';', SomeText) - 1) AS ColA ,
ColumnB / COUNT(*) OVER ( PARTITION BY ColumnA ) AS ColB
FROM SetForSplit
ORDER BY ColumnA
This is full working exmaple:
DECLARE #DataSource TABLE
(
[A] VARCHAR(MAX)
,[B] INT
);
INSERT INTO #DataSource ([A], [B])
VALUES ('a;b;c;d', 20 ),
('x;y;z', 40 );
SELECT T.c.value('.', 'VARCHAR(100)')
,[B] / COUNT([B]) OVER (PARTITION BY [B])
FROM #DataSource
CROSS APPLY
(
SELECT CONVERT(XML, '<t>' + REPLACE([A], ';', '</t><t>') + '</t>')
) DS([Bxml])
CROSS APPLY [Bxml].nodes('/t') AS T(c)
and of couse you can ROUND the devision as you like.