Split a string by a delimiter using SQL - sql

My table in DB has a column which stores values in following format.
1234#2345#6780
Four digit numbers are stored using delimiter "#".
Due to a data corruption, there are some records with five digit numbers. There may be one or more than one five digit numbers in a given row.
1234#12345#67895
I'm trying to write a script to get only those corrupted records But cannot find a way to split and check values.
Any help is appreciated.
I'm using SQL server 12.0 version

you can use this function to split values:
CREATE FUNCTION [dbo].[fnSplit]
(#sInputList VARCHAR(8000) -- List of delimited items
, #sDelimiter VARCHAR(8000) = '#' -- delimiter that separates items
)
RETURNS #List TABLE (item VARCHAR(8000))
BEGIN
DECLARE #sItem VARCHAR(8000)
WHILE CHARINDEX(#sDelimiter,#sInputList,0) <> 0
BEGIN
SELECT
#sItem=RTRIM(LTRIM(SUBSTRING(#sInputList,1,CHARINDEX(#sDelimiter,#sInputList,0)-1))),
#sInputList=RTRIM(LTRIM(SUBSTRING(#sInputList,CHARINDEX(#sDelimiter,#sInputList,0)+LEN(#sDelimiter),LEN(#sInputList))))
IF LEN(#sItem) > 0
INSERT INTO #List SELECT #sItem as item
END
IF LEN(#sInputList) > 0
INSERT INTO #List SELECT #sInputList as item -- Put the last item in
RETURN
END
and then ask about the result

You can use XML nodes to split the string before 2016 version.
Create table Xmltest(ID int, numbers nvarchar(max))
insert into Xmltest values (1, '1234#12345#67895')
select ID, N.value('.', 'varchar(255)') as xmlValue
from (
select ID ,
cast(('<w>' + replace(numbers,'#','</w><w>') + '</w>') as xml) as xmlValue
from Xmltest
) as z
cross apply xmlValue.nodes ('//w') as split(N)
Output you get, I added this ID column to identify which row may have more than 4 Characters.
ID xmlValue
1 1234
1 12345
1 67895
To check where you have more than 4 characters you can do:
select ID, N.value('.', 'varchar(255)') as xmlValue
from (
select ID ,
cast(('<w>' + replace(numbers,'#','</w><w>') + '</w>') as xml) as xmlValue
from Xmltest
) as z
cross apply xmlValue.nodes ('//w') as split(N)
where len(N.value('.', 'varchar(255)')) > 4
Output you get:
ID xmlValue
1 12345
1 67895

You can use this. it returns any numbers row which length greater than 4.
SELECT * FROM SampleData
WHERE data LIKE '%[0-9][0-9][0-9][0-9][0-9]%'

this will work patindex is orcale's equivalent of regexp_like():
select * from table_name where not PATINDEX ('^[0-9]{4}(#){1}[0-9]{4}(#){1}[0-9]
{4}$',col_name) !=0;

for SQL Server (starting with 2016)
you can use the built in function of SQL to split a string.
sample:
DECLARE #Text VARCHAR(100) = '1234#12345#67895'
SELECT * FROM STRING_SPLIT(#Text,'#')
result:
value
----
123
4456
78902
you can now easily manipulate the values after

Related

How to get all the two characters long substrings separated by dot (.) from an email address in SQL server. I want Scalar function

I have one email column that is having values like this 'claudio.passerini#uni.re.dit.mn.us'. I want to take two characters strings between dot (to check for the countries and states codes).
i want result like this
col1=re,mn,us
Solution
To do exactly what you've asked; i.e. pull back just the 2 char codes from within the email address's domain, you could use a function such as this:
create function dbo.fn_Get2AlphaCharCodesFromEmail
(
#email nvarchar(254) --max length of an email is 254: http://stackoverflow.com/questions/386294/what-is-the-maximum-length-of-a-valid-email-address
) returns nvarchar(254)
as
begin
declare #result nvarchar(254) = null
, #maxLen int = 254
;with cte(i, remainder,result) as
(
select cast(0 as int)
, cast('.' + substring(#email,charindex('#',#email)+1,#maxLen) + '.' as nvarchar(254))
, cast(null as nvarchar(254))
union all
select cast(i+1 as int)
, cast(substring(remainder,patindex('%.[A-Z][A-Z].%',remainder)+3,#maxLen)as nvarchar(254))
, cast(coalesce(result + ',','') + substring(remainder,patindex('%.[A-Z][A-Z].%',remainder)+1,2) as nvarchar(254))
from cte
where patindex('%.[A-Z][A-Z].%',remainder) > 0
)
select top 1 #result = result from cte order by i desc;
Return #result;
end
go
--demo
select dbo.fn_Get2AlphaCharCodesFromEmail ('claudio.passerini#uni.re.dit.mn.us')
--returns: re,mn,us
select dbo.fn_Get2AlphaCharCodesFromEmail ('claudio.passerini#uni.123.dit.mnx.usx')
--returns: NULL
Explanation
Create a function called fn_Get2AlphaCharCodesFromEmail in the schema dbo which takes a single parameter, #email which is a string of up to 254 characters, and returns a string of up to 254 characters.
create function dbo.fn_Get2AlphaCharCodesFromEmail
(
#email nvarchar(254)
) returns nvarchar(254)
as
begin
--... code that does the work goes here
end
declare the variables we'll be using later on.
#result holds the value we'll be returning from the function
#maxLen records the maximum length of an email; this makes it slightly easier should this length ever need to change; though not entirely simple since we have to specify the 254 length in our column & variable definitions later on anyway.
declare #result nvarchar(254) = null
, #maxLen int = 254
Now comes the interesting bit. We create a common table expression with 3 columns:
i is used to record which iteration each record was produced in; the highest value of i is the last record to be created.
remainder is used to hold the yet-to-be processed characters from the email.
result is used to record the 2 char codes; each new row adds another value to this column's comma separated values.
;with cte(i, remainder,result) as
(
--code to iterate through the email string, breaking it down, goes here
)
this gives us our first row in the cte "table".
The cast statements throughout this part are to ensure we have a consistent data type, as data types in a CTE are implicit, and not always correct
we initialise i (i.e. the first column) with value 0 to say that this is our first row (we could choose pretty much any value here; it doesn't matter
we initialise remainder (i.e. 2nd column) as the part of the email address which follows the # character; i.e. the email's domain.
we initialise result (i.e. 3rd column) as null; as we've not yet found a result (i.e. a 2 char string within the email's domain)
there is no from component as we're just getting data from the #email variable; no tables/views/etc are required.
select cast(0 as int)
, cast('.' + substring(#email,charindex('#',#email)+1,#maxLen) + '.' as nvarchar(254))
, cast(null as nvarchar(254))
union all is used to combing the first result(s) with the results of the next (recurring) statement. NB: The CTE code before this statement is run once to give initial values; the code after is run once for each new set of rows generated.
union all
The recurring code in the CTE is applied to new rows in the CTE until no new rows are generated.
i takes the value of the previous iteration's row's i incremented by 1.
select cast(i+1 as int)
remainder takes the previous iteration's remainder, and removes everything before (and including) the next 2 character code (result).
patindex('%.[A-Z][A-Z].%',remainder) returns a number giving the location of the a string containing a dot followed by 2 letters followed by a dot, occurring anywhere in the input string
, cast(substring(remainder,patindex('%.[A-Z][A-Z].%',remainder)+3,#maxLen)as nvarchar(254))
result uses the same logic as remainder, only it takes the 2 characters found, rather than everything after them. These characters are added on to the end of the previous iteartion's row's result value, separated by a comma.
, cast(coalesce(result + ',','') + substring(remainder,patindex('%.[A-Z][A-Z].%',remainder)+1,2) as nvarchar(254))
the from cte part just says that we're referencing the same "table" we're creating; i.e. this is how the recursion occurs
from cte
the where statement is used to prevent infinite recursion; i.e. once there are no more 2 char codes left in the remainder, stop looking.
where patindex('%.[A-Z][A-Z].%',remainder) > 0
Once we've found all the 2 char codes in the string, we know that the last row's result will contain the complete set; as such we assign this single row's value to the #result variable.
select top 1 #result = result
the from statement shows we're referencing the data we created in our with cte statement
from cte
the order by is used to determine which record comes first (i.e. which record is the top 1 record). We want it to be the last row generated by the CTE. Since we've been incrementing i by 1 each time, this last record will have the highest value of i, so by sorting by i desc (descending) that last generated row will be the row we get.
order by i desc;
Finally, we return the result generated above.
Return #result;
Alternative Approach
However, if you're trying to extract information from your emails, I'd recommend an alternate approach... have a list of values that you're looking for, and compare your email with that, without having to break apart the email address (beyond splitting on the # to ensure you're only checking the email's domain).
declare #countryCodes table (code nchar(2), name nvarchar(64)) --you'd use a real table for this; I'm just using a table variable so this demo's throwaway code
insert into #countryCodes (code, name)
values
('es','Spain')
,('fr','France')
,('uk','United Kingdom')
,('us','USA')
--etc.
--check a single mail
declare #mail nvarchar(256) = 'claudio.passerini#uni.re.dit.mn.us'
if exists (select top 1 1 from #countryCodes where '.' + substring(#mail,charindex('#',#mail)+1,256) + '.' like '%.' + code + '.%')
begin
select name from #countryCodes where '.' + substring(#mail,charindex('#',#mail)+1,256) + '.' like '%.' + code + '.%'
end
else
begin
select 'no results found'
end
--check a bunch of mails
declare #emailsToCheck table (email nvarchar(256))
insert into #emailsToCheck (email)
values
('claudio.passerini#uni.re.dit.mn.us')
,('someone#someplace.co.uk')
,('cant.see.me#never.never.land')
,('some.fr.address.hidden#france.not.in.this.bit')
select e.email, c.name
from #emailsToCheck e
left outer join #countryCodes c
on '.' + substring(email,charindex('#',email)+1,256) + '.' like '%.' + code + '.%'
order by e.email, c.name
If yo want individual columns you will need to pivot your data after splitting out your strings with a table valued function as per Marc's answer. If you are happy having them in rows, you can just use the select statement inside the brackets.
Query to get the data
declare #t table (Email nvarchar(50));
insert into #t values('claudio.passerini#uni.re.dit.mn.us'),('claudio.passerini#uni.ry.dit.mn.urg'),('claudio.passerini#uni.rn.dit.mn.uk');
select Email
,[1]
,[2]
,[3]
,[4]
,[5]
,[6]
from(
select t.Email
,s.Item
,row_number() over (partition by t.Email order by s.Item) as rn
from #t t
cross apply dbo.DelimitedSplit8K(t.Email,'.') s
where len(s.Item) = 2
) a
pivot
(
max(Item) for rn in([1],[2],[3],[4],[5],[6])
) pvt
Table valued function to split out the strings, courtesy of Jeff Moden
http://www.sqlservercentral.com/articles/Tally+Table/72993/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
ALTER FUNCTION [dbo].[DelimitedSplit8K]
--===== Define I/O parameters
(#pString VARCHAR(8000), #pDelimiter CHAR(1))
--WARNING!!! DO NOT USE MAX DATA-TYPES HERE! IT WILL KILL PERFORMANCE!
RETURNS TABLE WITH SCHEMABINDING AS
RETURN
--===== "Inline" CTE Driven "Tally Table" produces values from 1 up to 10,000...
-- enough to cover VARCHAR(8000)
WITH E1(N) AS (
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
), --10E+1 or 10 rows
E2(N) AS (SELECT 1 FROM E1 a, E1 b), --10E+2 or 100 rows
E4(N) AS (SELECT 1 FROM E2 a, E2 b), --10E+4 or 10,000 rows max
cteTally(N) AS (--==== This provides the "base" CTE and limits the number of rows right up front
-- for both a performance gain and prevention of accidental "overruns"
SELECT TOP (ISNULL(DATALENGTH(#pString),0)) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
),
cteStart(N1) AS (--==== This returns N+1 (starting position of each "element" just once for each delimiter)
SELECT 1 UNION ALL
SELECT t.N+1 FROM cteTally t WHERE SUBSTRING(#pString,t.N,1) = #pDelimiter
),
cteLen(N1,L1) AS(--==== Return start and length (for use in substring)
SELECT s.N1,
ISNULL(NULLIF(CHARINDEX(#pDelimiter,#pString,s.N1),0)-s.N1,8000)
FROM cteStart s
)
--===== Do the actual split. The ISNULL/NULLIF combo handles the length for the final element when no delimiter is found.
SELECT ItemNumber = ROW_NUMBER() OVER(ORDER BY l.N1),
Item = SUBSTRING(#pString, l.N1, l.L1)
FROM cteLen l
You can create your own function to split strings.
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE FUNCTION [dbo].[fnSplitString]
(
#string NVARCHAR(MAX),
#delimiter CHAR(1)
)
RETURNS #output TABLE(splitdata NVARCHAR(MAX)
)
BEGIN
set #delimiter = coalesce(#delimiter, dbo.cSeparador());
DECLARE #start INT, #end INT
SELECT #start = 1, #end = CHARINDEX(#delimiter, #string)
WHILE #start < LEN(#string) + 1 BEGIN
IF #end = 0
SET #end = LEN(#string) + 1
INSERT INTO #output (splitdata)
VALUES(SUBSTRING(#string, #start, #end - #start))
SET #start = #end + 1
SET #end = CHARINDEX(#delimiter, #string, #start)
END
RETURN
END
Using this function you can get all your country&state codes :
select splitdata from dbo.fnSplitString('claudio.passerini#uni.re.dit.mn.us', '.')
where len(splitdata) = 2
You can modify that query to concatenate the result on a single string :
SELECT
STUFF((SELECT ',' + splitdata
FROM dbo.fnSplitString('claudio.passerini#uni.re.dit.mn.us', '.')
WHERE len(splitdata) = 2
FOR XML PATH('')), 1, 1, '')
Here is how you put it into an scalar function :
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE FUNCTION [dbo].[fnCountryCodes](#email nvarchar(max)) returns nvarchar(max)
AS
BEGIN
RETURN (SELECT
STUFF((SELECT ',' + splitdata
FROM dbo.fnSplitString(#email, '.')
WHERE len(splitdata) = 2
FOR XML PATH('')), 1, 1, ''));
END
You call it like this :
select dbo.fnCountryCodes('claudio.passerini#uni.re.dit.mn.us')
Alternatively you can create a table-valued function that returns all the 2 characters long substrings from the domain of a mail address :
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE FUNCTION [dbo].[fnCountryCodes] (#email NVARCHAR(MAX))
RETURNS #output TABLE(subdomain1 nvarchar(2), subdomain2 nvarchar(2), subdomain3 nvarchar(2), subdomain4 nvarchar(2), subdomain5 nvarchar(2))
as
BEGIN
DECLARE #subdomain1 nvarchar(2);
DECLARE #subdomain2 nvarchar(2);
DECLARE #subdomain3 nvarchar(2);
DECLARE #subdomain4 nvarchar(2);
DECLARE #subdomain5 nvarchar(2);
DECLARE CURSOR_SUBDOMAINS CURSOR FOR select splitdata from dbo.fnSplitString(#email, '.') where len(splitdata) = 2;
OPEN CURSOR_SUBDOMAINS;
FETCH NEXT FROM CURSOR_SUBDOMAINS INTO #subdomain1;
FETCH NEXT FROM CURSOR_SUBDOMAINS INTO #subdomain2;
FETCH NEXT FROM CURSOR_SUBDOMAINS INTO #subdomain3;
FETCH NEXT FROM CURSOR_SUBDOMAINS INTO #subdomain4;
FETCH NEXT FROM CURSOR_SUBDOMAINS INTO #subdomain5;
CLOSE CURSOR_SUBDOMAINS;
DEALLOCATE CURSOR_SUBDOMAINS;
INSERT INTO #output (subdomain1, subdomain2, subdomain3, subdomain4, subdomain5)
values (#subdomain1, #subdomain2, #subdomain3, #subdomain4, #subdomain5)
RETURN
END
You use it like that :
select * from dbo.fnCountryCodes('claudio.passerini#uni.re.dit.mn.us')

Parse from string specific values in SQL Server

I have a column with a very long string, and I need to be able to parse out specific values from the string (i.e. values 67-70 for the state name). Below is the (long) string I am working with. I am assuming I can use the Parsename function but I'm unsure of the syntax.
H0100343107000100000000000151750A P+++++++++++++++++1016 STANLEY YOUNG 17 SPRAYPOINT DRIVE POINT COOK FO000006140949525A N WEB SITE S 3030 00010VICTORIA 61409495255
You should use substring
SELECT SUBSTRING('w3resource',4,3);
will out put eso 4,3 means start from 4th position till next 3 characters
so in your case it will be
SELECT SUBSTRING(column_name,67,4);
This is all about MYSQL but MS SQL has the same function
SUBSTRING( string, start_position, length )
Please check this link
http://social.technet.microsoft.com/wiki/contents/articles/17948.t-sql-right-left-substring-and-charindex-functions.aspx
If you want to extract something from string you have two solutions within t-sql (no CLR):
By position
Splitting string using delimiter
1 - String functions which can be used by position are: SUBSTRING, LEFT, RIGHT
2 - There is no build in function for splitting string in t-sql based on delimiter. You can write your function to split it. Below is some splitting function:
CREATE FUNCTION [dbo].[Split]
(
#Text VARCHAR(MAX),
#Delimiter VARCHAR(100),
#Index INT
)
RETURNS VARCHAR(MAX)
AS BEGIN
DECLARE #A TABLE (ID INT IDENTITY, V VARCHAR(MAX));
DECLARE #R VARCHAR(MAX);
WITH CTE AS
(
SELECT 0 A, 1 B
UNION ALL
SELECT B, CONVERT(INT,CHARINDEX(#Delimiter, #Text, B) + LEN(#Delimiter))
FROM CTE
WHERE B > A
)
INSERT #A(V)
SELECT SUBSTRING(#Text,A,CASE WHEN B > LEN(#Delimiter) THEN B-A-LEN(#Delimiter) ELSE LEN(#Text) - A + 1 END) VALUE
FROM CTE WHERE A >0
SELECT #R
= V
FROM #A
WHERE ID = #Index + 1
RETURN #R
END

How to split single cell into multiple columns in sql server 2008R2?

I want to split each name for individual columns
create table split_test(value integer,Allnames varchar(40))
insert into split_test values(1,'Vinoth,Kumar,Raja,Manoj,Jamal,Bala');
select * from split_test;
Value Allnames
-------------------
1 Vinoth,Kumar,Raja,Manoj,Jamal,Bala
Expected output
values N1 N2 N3 N4 N5 N6 N7.......N20
1 Vinoth Kumar Raja Manoj Jamal Bala
using this example you can get an idea.
declare #str varchar(max)
set #str = 'Hello world'
declare #separator varchar(max)
set #separator = ' '
declare #Splited table(id int identity(1,1), item varchar(max))
set #str = REPLACE(#str,#separator,'''),(''')
set #str = 'select * from (values('''+#str+''')) as V(A)'
insert into #Splited
exec(#str)
select * from #Splited
Here is an sql statement using recursive CTE to split names into rows, then pivot rows into columns.
SqlFiddle
with names as
(select
value,
1 as name_id,
substring(Allnames,1,charindex(',',Allnames+',', 0)-1) as name,
substring(Allnames,charindex(',',Allnames, 0)+1, 40) as left_names
from split_test
union all
select
value,
name_id +1,
case when charindex(',',left_names, 0)> 0 then
substring(left_names,1,charindex(',',left_names, 0)-1)
else left_names end as name,
case when charindex(',',left_names, 0)> 0 then
substring(left_names,charindex(',',left_names, 0)+1, 40)
else '' end as left_names
from names
where ltrim(left_names)<>'')
select value,
[1],[2],[3],[4],[5],[6],[7],[8],[9]
from (select value,name_id,name from names) as t1
PIVOT (MAX(name) FOR name_id IN ( [1],[2],[3],[4],[5],[6],[7],[8],[9] ) ) AS t2
UPDATE
#KM.'s answer might be a better way to split data into rows without recursive CTE table. It should be more efficient than this one. So I follow that example and simplified the part of null value process logic. Here is the result:
Step 1:
Create a table includes all numbers from 1 to a number grater than max length of Allnames column.
CREATE TABLE Numbers( Number int not null primary key);
with n as
(select 1 as num
union all
select num +1
from n
where num<100)
insert into numbers
select num from n;
Step 2:
Join data of split_test table with numbers table, we can get all the parts start from ,.
Then take the first part between 2 , form every row. If there are null values exists, add them with union.
select value ,
ltrim(rtrim(substring(allnames,number+1,charindex(',',substring(allnames,number,40),2)-2))) as name
from
(select value, ','+allnames+',' as allnames
from split_test) as t1
left join numbers
on number<= len(allnames)
where substring(allnames,number,1)=','
and substring(allnames,number,40)<>','
union
select value, Allnames
from split_test
where Allnames is null
Step 3: Pivot names from rows to columns like my first attempt above, omitted here.
SQLFiddle

SQL Splitting data in a column into a new table

I am being given data constantly in the following format:
Items Instrument
------- ---------
1|2|3 201400001
2|3 201400002
3 201400003
1|4 201400004
and need to output:
Item Instrument
------- ---------
1 201400001
2 201400001
3 201400001
2 201400002
3 201400002
3 201400003
1 201400004
4 201400004
My answer is a stored function or procedure I know, but which? further, do i write the function or procedure to accept columns from any database i have or will i need to write this on a case by case basis? essentially these are reports being turned in often and i am asked to get them into sql so the data can be analyzed further.
i hope this makes sense. thank you in advance for your time.
If you have a list of all possible items, then you can do:
select i.itemId, t.instrument
from table t join
items i
on '|' + t.items + '|' like '%|'+cast(i.itemId as varchar(255))+'|%';
Original Code: http://dotnetbites.com/split-strings-into-table
Modified for your case:
CREATE FUNCTION [dbo].[SplitStringToTable]
(
#InputString VARCHAR(MAX) = ''
, #Delimiter CHAR(1) = ','
, #Instrument char(8)
)
RETURNS #RESULT TABLE(ID INT IDENTITY, Items VARCHAR(1000), Instrument char(8))
AS
BEGIN
DECLARE #XML XML
SELECT #XML = CONVERT(XML, SQL_TEXT)
FROM (
SELECT '<root><item>'
+ REPLACE(#InputString, #Delimiter, '</item><item>')
+ '</item></root>' AS SQL_TEXT
) dt
INSERT INTO #RESULT(Items, Instrument)
SELECT t.col.query('.').value('.', 'VARCHAR(1000)') AS Items , #Instrument As Instrument
FROM #XML.nodes('root/item') t(col)
RETURN
END

How to split values from braces using SQL query

I need a query to split values between braces separately.
My varchar value is
16(8),14(10)
I need to split as
16,14
I need only 16 and 14 value but not the values which is inside the (8) (10) braces
I have tried this query
select
case
when charindex('(0-9)', OtherProduct) > 0
then rtrim(left(OtherProduct, charindex('(0-9)', OtherProduct)-1))
else OtherProduct end as OtherProduct
from dbo.rxnreactions where rsd='P=15(61),16(8),14(10)R=1,7S=9012'
is anyone can help me to split it.
Use a function to split by comma, then split by brackets, and at the end join into a single string
SELECT SplitByBrackets.val
FROM dbo.StringSplit(N'16(8),14(10)',N',') SplitByComma
CROSS APPLY StringSplit(SplitByComma.val,N'(') SplitByBrackets
WHERE SplitByBrackets.id % 2 = 1
sample of the StringSplit is
CREATE FUNCTION [dbo].[StringSplit]
(
#delimited nvarchar(max),
#delimiter nvarchar(100)
) RETURNS #t TABLE
(
-- Id column can be commented out, not required for sql splitting string
id int identity(1,1), -- I use this column for numbering splitted parts
val nvarchar(max)
)
AS
BEGIN
declare #xml xml
set #xml = N'<root><r>' + replace(#delimited,#delimiter,'</r><r>') + '</r></root>'
insert into #t(val)
select
r.value('.','varchar(max)') as item
from #xml.nodes('//root/r') as records(r)
RETURN
END
fiddle it