How to get 3rd string part with CharIndex/SubString [duplicate]

How to get 3rd string part with CharIndex/SubString [duplicate] - sql

This question already has answers here:
Using T-SQL, return nth delimited element from a string
(14 answers)
Closed 3 years ago.
Hi everyone, I'm trying to separate a name column into 4 different parts. Right now all the name parts are separated by spaces ' '. I am having trouble with my #thirdString also picking up the fourth part of the name (usually a suffix), which I want to end up in #fourthString instead. I will be running this against names of different lengths; I'm just using Robert Dobson Bud jr as an example. Other names could have two or more parts.
-- Code for parsing a name with multiple parts
-- You should be able to copy and paste this into any MS-SQL Environment it doesn't use a certain table.
DECLARE #nameString as varchar(max),
#firstSpaceLoc as smallint,
#secondSpaceLoc as smallint,
#thirdSpaceLoc as smallint,
#forthSpaceLoc as smallint,
#firstString as varchar(max),
#secondString as varchar(max),
#thirdString as varchar(max),
#fourthString as varchar(max)
-- Create some type of loop or case statement to run through the entire table.
SET #nameString = 'Robert Dobson Bud jr'
SET #firstSpaceLoc = CHARINDEX(' ',#namestring,1)
SET #secondSpaceLoc = CHARINDEX(' ', #namestring, CHARINDEX(' ',#nameString,1)+1)
SET #thirdSpaceLoc =
CASE
WHEN CHARINDEX(' ',
#namestring,
CHARINDEX(' ',#nameString,1)+1) = 0 THEN 0
WHEN CHARINDEX(' ',
#namestring,
CHARINDEX(' ',#nameString,1)+1) > 0 THEN
CHARINDEX(' ', #namestring,
CHARINDEX(' ', #namestring,
CHARINDEX(' ',#nameString,1)+1)+1)
END
SET #forthSpaceLoc =
CASE
WHEN CHARINDEX(' ',
#namestring,
CHARINDEX(' ',#nameString,1)+1) = 0 THEN 0
WHEN CHARINDEX(' ',
#namestring,
CHARINDEX(' ',#nameString,1)+1) > 0 THEN 0
WHEN CHARINDEX(' ',
#namestring,
CHARINDEX(' ',#nameString,1)+1) > 0 THEN
CHARINDEX(' ',
#namestring,
CHARINDEX(' ', #namestring,
CHARINDEX(' ', #nameString,
CHARINDEX(' ',#nameString,1)+1)+1)+1)
END
SELECT
#firstString =
CASE
WHEN #firstSpaceLoc > 0 THEN LEFT(#nameString,CHARINDEX(' ',#namestring,1)-1)
ELSE #nameString
END,
#secondString =
CASE
WHEN #firstSpaceLoc = 0 THEN ''
WHEN #secondSpaceLoc = 0 THEN
RIGHT(#namestring, LEN(#namestring)- CHARINDEX(' ',#namestring,1))
WHEN #secondSpaceLoc > 0 THEN
REPLACE (
SUBSTRING (
#nameString, CHARINDEX(' ',#namestring,1)+1, CHARINDEX(' ', #namestring, CHARINDEX(' ',#nameString,1)+1)
- CHARINDEX(' ',#namestring,1)),' ',''
)
ELSE ''
END,
#thirdString =
CASE
WHEN #firstSpaceLoc = 0 OR #secondSpaceLoc = 0 THEN ''
WHEN #secondSpaceLoc > 0 THEN
SUBSTRING (
#nameString,
CHARINDEX(' ', #namestring,
CHARINDEX(' ',#nameString,1)+1),
LEN(#nameString)
)
END,
#fourthString =
CASE
WHEN #firstSpaceLoc = 0 OR #secondSpaceLoc = 0 OR #thirdSpaceLoc = 0 THEN ''
WHEN #secondSpaceLoc > 0 AND #thirdSpaceLoc = 0 THEN ''
WHEN #thirdSpaceLoc > 0 THEN
SUBSTRING(
#nameString,
CHARINDEX(' ', #namestring,
CHARINDEX(' ', #namestring,
CHARINDEX(' ',#nameString,1)+1)+1),
LEN(#nameString)
)
END
-- Report names
SELECT
#nameString sourceString,
#firstString [First string],
#secondString [Second string],
#thirdString [Third string],
#fourthString [Fourth String]
I would like to get rid of the jr in the 3rd column. The intention is to have 4 different columns with 4 different parts of the name.

This script will do the job
DECLARE #namestring as varchar(max)
SET #namestring = 'Robert Dobson Bud jr'
--SET #namestring = 'Robert Dobson'
;with cte as (
select cast(0 as int) [start],CHARINDEX(' ',#namestring,0) [end] ,#namestring namestring
union all
select cast(cte.[end] as int) [start],CHARINDEX(' ',#namestring,cte.[end]+1) [end] ,#namestring namestring from cte where [end]>0
),cte2 as (
select * ,ROW_NUMBER() over (order by cte.[start]) seq
,substring(#namestring,cte.[start]+1,(case when cte.[end]=0 then len(#namestring)+1 else cte.[end] end)-cte.[start]-1) part from cte
)
select
(select part from cte2 where seq=1) [First String]
,(select part from cte2 where seq=2) [Second String]
,(select part from cte2 where seq=3) [Third String]
,(select part from cte2 where seq=4) [Fourt String]
for 4 part name result will be as below
First String Second String Third String Fourt String
Robert Dobson Bud jr
for 2 part name result will be as below
First String Second String Third String Fourt String
Robert Dobson NULL NULL

The reason why you're getting "jr" in the third string is somewhat mystifying. It's in this part of the code:
#thirdString = CASE
WHEN #firstSpaceLoc = 0 OR #secondSpaceLoc = 0 THEN ''
WHEN #secondSpaceLoc > 0 THEN
SUBSTRING (
#nameString,
CHARINDEX(' ', #namestring,
CHARINDEX(' ',#nameString,1)+1),
LEN(#nameString)
)
Why are you using LEN(#nameString) for the third parameter of the SUBSTRING? Of course that will return the rest of the string, including the "Jr". You clearly knew not to do it that way when getting the #secondString value, how could you not know to do it that way when getting the #thirdString?
To get the #thirdString you need to use the same technique that you used for getting the #secondString.

Does this do what you want?
-- Split a space-delimited name into the columns Str1..Str4 by numbering the
-- pieces returned by STRING_SPLIT and pivoting on that number.
DECLARE #Str VARCHAR(45) = 'Robert Dobson Bud jr';
WITH CTE AS
(
SELECT Value V,
-- Label each piece 'Str1', 'Str2', ... so the labels match the PIVOT column list.
-- NOTE(review): ORDER BY (SELECT NULL) imposes no ordering of its own, so the
-- numbering relies on STRING_SPLIT emitting the pieces in input order —
-- confirm that this holds on the target server version.
'Str' + CAST(ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) AS VARCHAR(10)) RN
FROM STRING_SPLIT(#Str, ' ')
)
SELECT *
FROM
(
SELECT *
FROM CTE
) X
PIVOT
(
-- Only the four labels listed here become columns; any further pieces are not output.
MAX(V) FOR RN IN ([Str1], [Str2], [Str3], [Str4])
) P;
Returns:
+--------+--------+------+------+
| Str1 | Str2 | Str3 | Str4 |
+--------+--------+------+------+
| Robert | Dobson | Bud | jr |
+--------+--------+------+------+
Live Demo

Using a splitting function, this can be arranged very simply.
SELECT firstString = MAX(CASE WHEN ItemNumber = 1 THEN Item END),
secondString = MAX(CASE WHEN ItemNumber = 2 THEN Item END),
thirdString = MAX(CASE WHEN ItemNumber = 3 THEN Item END),
fourthString = MAX(CASE WHEN ItemNumber = 4 THEN Item END)
FROM dbo.DelimitedSplit8K_LEAD( #nameString, ' ');
The code of the function was initially published and explained here. But I'm copying the definition.
CREATE FUNCTION [dbo].[DelimitedSplit8K_LEAD]
--===== Define I/O parameters
(#pString VARCHAR(8000), #pDelimiter CHAR(1))
RETURNS TABLE WITH SCHEMABINDING AS
RETURN
--===== "Inline" CTE Driven "Tally Table” produces values from 0 up to 10,000...
-- enough to cover VARCHAR(8000)
WITH E1(N) AS (
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
), --10E+1 or 10 rows
E2(N) AS (SELECT 1 FROM E1 a, E1 b), --10E+2 or 100 rows
E4(N) AS (SELECT 1 FROM E2 a, E2 b), --10E+4 or 10,000 rows max
cteTally(N) AS (--==== This provides the "zero base" and limits the number of rows right up front
-- for both a performance gain and prevention of accidental "overruns"
SELECT 0 UNION ALL
SELECT TOP (DATALENGTH(ISNULL(#pString,1))) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
),
cteStart(N1) AS (--==== This returns N+1 (starting position of each "element" just once for each delimiter)
SELECT t.N+1
FROM cteTally t
WHERE (SUBSTRING(#pString,t.N,1) = #pDelimiter OR t.N = 0)
)
--===== Do the actual split. The ISNULL/NULLIF combo handles the length for the final element when no delimiter is found.
SELECT ItemNumber = ROW_NUMBER() OVER(ORDER BY s.N1),
Item = SUBSTRING(#pString,s.N1,ISNULL(NULLIF((LEAD(s.N1,1,1) OVER (ORDER BY s.N1) - 1),0)-s.N1,8000))
FROM cteStart s
;

Related

How can I retrieve first second and third word of a String in SQL?

I need a query which would extract the first second and third word of a string.
I have approximately 5 words in each row and I need only the first three words out of 5 in the same row (1 row). Example "ATV BDSG 232 continue with other words".
I need only the first three words together in one row (in the same row) like "ATV BDSG 232" as a first row. The table has about 1000 rows and at the end of it I should have 1000 rows again but each row should contain only the first three words of the string.
I found a query which works fine for extracting first two like "ATV BDSG" discussed in stack overflow. The query is
"SELECT SUBSTRING(field1, 0, CHARINDEX(' ', field1, CHARINDEX(' ', field1, 0)+1))
FROM Table"
Can we derive this for extracting first three words?
Thanks in advance

If you don't want to create a dedicated function, you can use successive CROSS APPLYs:
-- Return the first three space-separated words of t.s (or the whole string
-- when it has fewer than three spaces), together with the positions of the
-- first, second and third space (0 = not found).
SELECT
    t.s,
    FirstSpace.i,
    SecondSpace.j,
    ThirdSpace.k,
    CASE
        WHEN ThirdSpace.k > 0 THEN LEFT(t.s, ThirdSpace.k - 1)
        ELSE t.s
    END AS Phrase
FROM t
CROSS APPLY (SELECT CHARINDEX(' ', t.s, 1)) AS FirstSpace(i)
-- Each later search only runs when the previous space was found. Without the
-- guard, a "not found" 0 makes the next CHARINDEX start again at position 1
-- and rediscover the first space, so 'a b' would be cut down to 'a'.
CROSS APPLY (SELECT CASE WHEN FirstSpace.i > 0
                         THEN CHARINDEX(' ', t.s, FirstSpace.i + 1)
                         ELSE 0
                    END) AS SecondSpace(j)
CROSS APPLY (SELECT CASE WHEN SecondSpace.j > 0
                         THEN CHARINDEX(' ', t.s, SecondSpace.j + 1)
                         ELSE 0
                    END) AS ThirdSpace(k)
gives you the results you need:
| s | i | j | k | phrase |
|----------------------------------------|---|---|----|------------------|
| ATV BDSG 232 Continue with other words | 4 | 9 | 13 | ATV BDSG 232 |

This is easy: SQL Server provides the STRING_SPLIT() function, which makes it simple
DECLARE #Var VARCHAR(100) = 'ATV BDSG 232 Continue with other words';
SELECT Word
FROM
(
SELECT Value AS Word,
ROW_NUMBER()OVER(ORDER BY (SELECT NULL)) RN
FROM STRING_SPLIT(#Var, ' ')
) T
WHERE RN <= 3;
But since you are working on the 2012 version, you need to define your own function.
You can also do it the hard way: first get the first word, then replace it with '' and get the second word, then do the same for the third word, as follows:
DECLARE #Var VARCHAR(100) = 'ATV BDSG 232 Continue with other words';
WITH FW AS
(
SELECT LEFT(#Var, CHARINDEX(' ', #Var)) FirstWord
),
SW AS
(
SELECT LEFT(REPLACE(#Var, FirstWord, ''),
CHARINDEX(' ', REPLACE(#Var, FirstWord, ''))) SecondWord
FROM FW
)
SELECT FirstWord,
SecondWord,
LEFT(REPLACE(REPLACE(V, FirstWord, ''), SecondWord, ''),
CHARINDEX(' ', REPLACE(REPLACE(V, FirstWord, ''), SecondWord, ''))
) ThirdWord
FROM
(
SELECT *, #Var V
FROM FW CROSS APPLY SW
) T
Demo
UPDATE
If you want to select the three first words then simply
-- First three space-separated words of Str.
-- The searched string is padded with three spaces so that a third "space" is
-- always found: without the padding, a value with three words or fewer has no
-- third space, the length becomes 0 and the result is an empty string.
-- SUBSTRING still reads from the unpadded Str, so no padding leaks into Words.
SELECT SUBSTRING(Str, 0,
           CHARINDEX(' ', Str + '   ',
               CHARINDEX(' ', Str + '   ',
                   CHARINDEX(' ', Str + '   ', 0) + 1) + 1)) Words
FROM Strings
Demo

--make some test data
declare #test as nvarchar(100) = 'my test string for words';
select 1 id, cast('my test string for words' as nvarchar(max)) word into #test;
insert #test (id,word) values (2,'a b c d e f g hhh yyyyyy') ;
insert #test (id,word) values (3,' a required test string d e f g hhh yyyyyy') ;
insert #test (id,word) values (4,'a quick test') ;
insert #test (id,word) values (5,'a test') ;
insert #test (id,word) values (6,'last') ;
--break up letters, count the first 3 words
;WITH CTE AS (SELECT 1 x, substring(#test,1,1) charx
UNION ALL
SELECT X + 1, substring(#test,x + 1,1) from CTE WHERE x < len(#test)
)
select * from cte c3 where (SELECT count(0) cnt FROM CTE c1 JOIN CTE c2 on c1.x <= c3.x and c1.x + 1 = c2.x and c1.charx =' ' and c2.charx != ' ') < 3
;WITH tabx as (select id, cast(ltrim(word) as nvarchar(max)) 'word' from #test), --do some ltrim
CTE AS (
SELECT id, 1 x, substring(word,1,1) charx from tabx
UNION ALL
SELECT t.id, c.X + 1, substring(t.word,x + 1,1)
from tabx t
JOIN CTE c on c.id = t.id and x < len(t.word)
),
disj as
(select * from cte c3 where
(SELECT count(0) cnt
FROM CTE c1
JOIN CTE c2 on c1.id = c3.id and c1.id = c2.id and c1.x <= c3.x and c1.x + 1 = c2.x and c1.charx =' ' and c2.charx != ' '
) < 3
),
rj as
(select disj.id,disj.x, disj.charx z
from disj
where disj.x = 1
UNION ALL
select d.id, d.x, r.z + d.charx
FROM rj r
join disj d on r.id = d.id and r.x + 1 = d.x
)
select *
from rj r1
cross apply (select max(r2.x) TheRow from rj r2 where r1.id = r2.id) dq
where r1.x = dq.TheRow
order by r1.id;
--delete test data
drop table #test

/* This is not perfect - but interesting */
declare #t table (fullname varchar(100))
insert #t values('Mr Jones'),('Mrs Amy smith'),('Jim Smith'),('Dr Harry Web '),('Paul Fred andrew jones')
select fullname,
a.value as a ,
b.Value as b,
c.Value as c,
d.Value as d,
e.Value as e,
f.value as f
from #t
outer apply (select top 1 value from STRING_SPLIT(fullname, ' ')) a
outer apply (select top 1 value from STRING_SPLIT(fullname, ' ') where value not in (a.value )) b
outer apply (select top 1 value from STRING_SPLIT(fullname, ' ') where value not in (a.value,b.value ) ) c
outer apply (select top 1 value from STRING_SPLIT(fullname, ' ') where value not in (a.value,b.value,c.value )) d
outer apply (select top 1 value from STRING_SPLIT(fullname, ' ') where value not in (a.value,b.value,c.value,d.value) ) e
outer apply (select top 1 value from STRING_SPLIT(fullname, ' ') where value not in (a.value,b.value ,c.value,d.value,e.value) ) f

To Select First Word -
Select top 1 Ltrim(Rtrim(value)) FROM STRING_SPLIT(#input,' ')
To Select Only Second Word -
Select Ltrim(Rtrim(value)) from STRING_SPLIT(#input,' ') Order by (Select NULL) OFFSET 1 ROWS FETCH NEXT 1 ROWS ONLY

Converting multiple delimited fields into rows in SQL Server

I have a data source which contains data in delimited fields which exist in a staging area in SQL Server. I'd like to transform this data into many rows so it is easier to work with. This differs from the numerous other questions and answers on similar topics in that I have multiple fields where this delimited data exists. Here is an example of what my data looks like:
ID | Field | Value
---+-------+------
1 | a,b,c | 1,2,3
2 | a,c | 5,2
And this is the desired output:
ID | Field | Value
---+-------+------
1 | a | 1
1 | b | 2
1 | c | 3
2 | a | 5
2 | c | 2
My code so far uses the XML parsing method like the one mentioned here: Turning a Comma Separated string into individual rows I needed to extend it to join each field to its corresponding value which I have done by generating a row_number for each ID and then matching based on the ID and this row_number.
My issue is that it is painfully slow so I wondered if anyone has any more performant methods?
select
[Value].ID, [Field], [Value]
from
(select
A.ID, Split.a.value('.', 'varchar(100)') as [Value],
row_number() over (partition by ID order by Split.a) as RowNumber
from
(select
ID, cast('<M>' + replace([Value], ',', '</M><M>') + '</M>' as xml) as [Value]
from
#source_table
where
[Field] not like '%[<>&%]%' and [Value] not like '%[<>&%]%') as A
cross apply
[Value].nodes ('/M') as Split(a)
) [Value]
inner join
(
select
A.ID, Split.a.value('.', 'varchar(100)') as [Field],
row_number() over (partition by A.ID order by Split.a) as RowNumber
from
(select
ID, cast('<M>' + replace([Field], ',', '</M><M>') + '</M>' as xml) as [Field]
from
#source_table
where
[Field] not like '%[<>&%]%' and [Value] not like '%[<>&%]%') as A
cross apply
[Field].nodes ('/M') as Split(a)
) [Field] on [Value].ID = [Field].ID and [Value].RowNumber = [Field].RowNumber

Here is an approach using the splitter from Jeff Moden. http://www.sqlservercentral.com/articles/Tally+Table/72993/ One nice feature of that splitter is that it returns the ordinal position of each element so you can use it for joins and such.
Starting with some data.
declare #Something table
(
ID int
, Field varchar(50)
, Value varchar(50)
)
insert #Something values
(1, 'a,b,c', '1,2,3')
, (2, 'a,c', '5,2')
;
Since you have two sets of delimited data you will be forced to split this for each set of delimited values. Here is how you can leverage this splitter to accomplish this.
with Fields as
(
select *
from #Something s
cross apply dbo.DelimitedSplit8K(s.Field, ',') f
)
, Value as
(
select *
from #Something s
cross apply dbo.DelimitedSplit8K(s.Value, ',') v
)
select f.ID
, Field = f.Item
, Value = v.Item
from Fields f
join Value v on v.ItemNumber = f.ItemNumber and v.ID = f.ID
If at all possible it would be best to see if you can change whatever process it is that is populating your source data so it is normalized and not delimited because it is a pain to work with.

Based on @Gordon Linoff's query, here is another recursive CTE:
DECLARE #t TABLE(
ID int
,Field VARCHAR(MAX)
,Value VARCHAR(MAX)
)
INSERT INTO #t VALUES
(1, 'a,b,c', '1,2,3')
,(2, 'a,c', '5,2')
,(3, 'x', '7');
with cte as (
select ID
,SUBSTRING(Field, 1, CASE WHEN CHARINDEX(',', Field) > 0 THEN CHARINDEX(',', Field)-1 ELSE LEN(Field) END) AS Field
,SUBSTRING(Value, 1, CASE WHEN CHARINDEX(',', Value) > 0 THEN CHARINDEX(',', Value)-1 ELSE LEN(Value) END) AS Value
,SUBSTRING(Field, CASE WHEN CHARINDEX(',', Field) > 0 THEN CHARINDEX(',', Field)+1 ELSE 1 END, LEN(Field)-CASE WHEN CHARINDEX(',', Field) > 0 THEN CHARINDEX(',', Field) ELSE 0 END) as field_list
,SUBSTRING(Value, CASE WHEN CHARINDEX(',', Value) > 0 THEN CHARINDEX(',', Value)+1 ELSE 1 END, LEN(Value)-CASE WHEN CHARINDEX(',', Value) > 0 THEN CHARINDEX(',', Value) ELSE 0 END) as value_list
,0 as lev
from #t
WHERE CHARINDEX(',', Field) > 0
UNION ALL
select ID
,SUBSTRING(field_list, 1, CASE WHEN CHARINDEX(',', field_list) > 0 THEN CHARINDEX(',', field_list)-1 ELSE LEN(field_list) END) AS Field
,SUBSTRING(value_list, 1, CASE WHEN CHARINDEX(',', value_list) > 0 THEN CHARINDEX(',', value_list)-1 ELSE LEN(value_list) END) AS Value
,CASE WHEN CHARINDEX(',', field_list) > 0 THEN SUBSTRING(field_list, CHARINDEX(',', field_list)+1, LEN(field_list)-CHARINDEX(',', field_list)) ELSE '' END as field_list
,CASE WHEN CHARINDEX(',', value_list) > 0 THEN SUBSTRING(value_list, CHARINDEX(',', value_list)+1, LEN(value_list)-CHARINDEX(',', value_list)) ELSE '' END as value_list
,lev + 1
from cte
WHERE LEN(field_list) > 0
)
select ID, Field, Value
from cte
UNION ALL
SELECT ID, Field, Value
FROM #t
WHERE CHARINDEX(',', Field) = 0
ORDER BY ID, Field
OPTION (MAXRECURSION 0)

One method is a recursive CTE:
with cte as (
select id, cast(NULL as varchar(max)) as field, cast(NULL as varchar(max)) as value, field as field_list, value as value_list, 0 as lev
from t
union all
select id, left(field_list, charindex(',', field_list + ',') - 1),
left(value_list, charindex(',', value_list + ',') - 1),
substring(field_list, charindex(',', field_list + ',') + 1, len(field_list)),
substring(value_list, charindex(',', value_list + ',') + 1, len(value_list)),
1 + lev
from cte
where field_list <> '' and value_list <> ''
)
select *
from cte
where lev > 0;
Here is an example of how it works.

Split Strings into columns in SQL Server

I have a name field in the Students table which is a comma-separated string in the format "LastName, FirstName, Middle Name". In a select statement I need to break this up into separate fields. How can I achieve this in SQL? Sometimes the middle initial won't be available.
SUBSTRING(Name,CHARINDEX(',',Name,1)+2,LEN(Name)) AS FirstName,
SUBSTRING(Name,1,CHARINDEX(',',Name,1)-1) AS LastName,
Above code works fine when there is no Middle name.

This should give you what you need:
declare #tmp table (fullname varchar(100));
insert #tmp values('James, Billy, L'), ('John, Snow');
select
fullname
, [Last Name]
, case
when charindex(',', Remainder, 0) > 0
then ltrim(substring(Remainder, 0, charindex(',', Remainder, 0)))
else ltrim(Remainder)
end [First Name]
, case
when charindex(',', Remainder, 0) = 0
then NULL
else ltrim(substring(Remainder, charindex(',', Remainder, 0) + 1, len(Remainder)))
end [Middle Name]
from
(select
fullname
, substring(fullname, 0, charindex(',', fullname, 0)) [Last Name]
, substring(fullname, charindex(',', fullname, 0) + 1, len(fullname)) [Remainder]
from #tmp) result;

First, count the occurrences of the comma (,) in the string, then use a CASE expression on that count. If there are 2 commas, we can assume that the middle name is also present; if there is 1, only the first name and last name are. Then use combinations of the LEFT, RIGHT, SUBSTRING and CHARINDEX string functions.
Query
select t.name,
left(
t.name,
charindex(',', t.name, 1) - 1
) last_name,
case t.comma_num
when 2
then substring(
t.name,
charindex(',', t.name, 1) + 1,
len(name) -
(charindex(',', t.name, 1) + 1) - charindex(',', reverse(t.name), 1) + 1
)
when 1
then right(
t.name,
charindex(',', reverse(t.name), 1) - 1
)
else null end as first_name,
case t.comma_num
when 2
then right(
t.name, charindex(',', reverse(t.name), 1) - 1
)
else null end as middle_name
from (
select name,
len(name) - len(replace(name, ',', '')) comma_num
from [your_table_name]
)t;
Find demo here

Use a CTE with the SUBSTRING and CHARINDEX functions
DECLARE #Name VARCHAR(100) = 'James, Billy, L'
--DECLARE #Name VARCHAR(100) = 'James, '', L'
;WITH _CTE ( SplitedNames ,RemainStr) AS
(
SELECT SUBSTRING(#Name,0,CHARINDEX(',',#Name)),
SUBSTRING(#Name,CHARINDEX(',',#Name)+1,LEN(#Name))
UNION ALL
SELECT CASE WHEN CHARINDEX(',',RemainStr) = 0 THEN RemainStr ELSE
SUBSTRING(RemainStr,0,CHARINDEX(',',RemainStr)) END,
CASE WHEN CHARINDEX(',',RemainStr) = 0 THEN '' ELSE
SUBSTRING(RemainStr,CHARINDEX(',',RemainStr)+1,LEN(RemainStr))
END
FROM _CTE
WHERE RemainStr <> ''
)
SELECT SplitedNames FROM _CTE

Extract email address from string using tsql

I'm trying to extract email addresses from an existing comments field and put it into its own column. The string may be something like this "this is an example comment with an email address of someemail#domain.org" or just literally the email itself "someemail#domain.org".
I figure the best thing to do would be to find the index of the '#' symbol and search in both directions until either the end of the string was hit or there was a space. Can anyone help me out with this implementation?

I know wewesthemenace already answered the question, but his/her solution seems over complicated. Why concatenate the left and right sides of the email address together? I'd rather just find the beginning and the end of the email address and then use substring to return the email address like so:
My Table
DECLARE #Table TABLE (comment NVARCHAR(50));
INSERT INTO #Table
VALUES ('blah MyEmailAddress#domain.org'), --At the end
('blah MyEmailAddress#domain.org blah blah'), --In the middle
('MyEmailAddress#domain.org blah'), --At the beginning
('no email');
Actual Query:
SELECT comment,
CASE
WHEN CHARINDEX('#',comment) = 0 THEN NULL
ELSE SUBSTRING(comment,beginningOfEmail,endOfEmail-beginningOfEmail)
END email
FROM #Table
CROSS APPLY (SELECT CHARINDEX(' ',comment + ' ',CHARINDEX('#',comment))) AS A(endOfEmail)
CROSS APPLY (SELECT DATALENGTH(comment)/2 - CHARINDEX(' ',REVERSE(' ' + comment),CHARINDEX('#',REVERSE(' ' + comment))) + 2) AS B(beginningOfEmail)
Results:
comment email
-------------------------------------------------- --------------------------------------------------
blah MyEmailAddress#domain.org MyEmailAddress#domain.org
blah MyEmailAddress#domain.org blah blah MyEmailAddress#domain.org
MyEmailAddress#domain.org blah MyEmailAddress#domain.org
no email NULL

You can search for '#' in the string. Then you get the string at the LEFT and RIGHT side of '#'. You then want to REVERSE the LEFT side and get first occurrence of ' ' then get the SUBSTRING from there. Then REVERSE it to get the original form. Same principle apply to the RIGHT side without doing REVERSE.
Example string: 'some text someemail#domain.org some text'
1. LEFT = 'some text someemail'
2. RIGHT = '#domain.org some text'
3. Reverse LEFT = 'liameemos txet emos'
4. SUBSTRING of (3) up to the first space = 'liameemos'
5. REVERSE of (4) = 'someemail'
6. SUBSTRING of (2) up to the first space = '#domain.org'
7. Combine (5) and (6) = 'someemail#domain.org'
Your query would be:
-- Extract the address from each row that contains the '#' separator:
-- the text left of the separator back to the previous space, plus the text
-- from the separator forward to the next space.
;WITH CteEmail(email) AS(
    SELECT 'someemail#domain.org' UNION ALL
    SELECT 'some text someemail#domain.org some text' UNION ALL
    SELECT 'no email'
)
,CteStrings AS(
    SELECT
        [Left]       = LEFT(e.email, p.SepPos - 1),
        Reverse_Left = REVERSE(LEFT(e.email, p.SepPos - 1)),
        -- Everything from the separator to the end of the string.
        -- RIGHT(email, SepPos + 1) would use the position counted from the
        -- left as a character count from the right, which is only correct
        -- when the separator happens to sit in the middle of the string.
        [Right]      = SUBSTRING(e.email, p.SepPos, LEN(e.email))
    FROM CteEmail AS e
    -- NULLIF turns "not found" (0) into NULL so LEFT never receives -1,
    -- whatever order the filter and the expressions are evaluated in.
    CROSS APPLY (SELECT NULLIF(CHARINDEX('#', e.email, 0), 0)) AS p(SepPos)
    WHERE e.email LIKE '%#%'
)
SELECT *,
    -- Local part: cut the reversed left side at its first space, then flip it back.
    -- SUBSTRING(x, 0, n) returns the first n - 1 characters.
    REVERSE(
        SUBSTRING(Reverse_Left, 0,
            CASE
                WHEN CHARINDEX(' ', Reverse_Left, 0) = 0 THEN LEN(Reverse_Left) + 1
                ELSE CHARINDEX(' ', Reverse_Left, 0)
            END
        )
    )
    +
    -- Domain part: the right side up to (not including) its first space.
    SUBSTRING([Right], 0,
        CASE
            WHEN CHARINDEX(' ', [Right], 0) = 0 THEN LEN([Right]) + 1
            ELSE CHARINDEX(' ', [Right], 0)
        END
    )
FROM CteStrings
Sample Data:
email
----------------------------------------
someemail#domain.org
some text someemail#domain.org some text
no email
Result
---------------------
someemail#domain.org
someemail#domain.org

Stephan's answer is great when looking for a single email address in each row.
However, I was running into this error when trying to get multiple email addresses in each row:
Invalid length parameter passed to the LEFT or SUBSTRING function
I used this answer from DBA Stack Exchange to get all of the positions of # inside the string. It relies on a table-valued function that returns one position for each occurrence of a given pattern inside the string. I also had to modify the CROSS APPLY expressions to handle multiple email addresses.
My Table:
DECLARE #Table TABLE (comment VARCHAR(500));
INSERT INTO #Table (comment)
VALUES ('blah blah My.EmailAddress#domain.org more blah someemailaddress#domain.com even more blah asdf#gmail.com'),
('blah hello.world#domain.org more'),
('no email')
Table-valued Function:
CREATE FUNCTION dbo.fnFindPatternLocation
(
#string NVARCHAR(MAX),
#term NVARCHAR(255)
)
RETURNS TABLE
AS
RETURN
(
SELECT pos = Number - LEN(#term)
FROM (SELECT Number, Item = LTRIM(RTRIM(SUBSTRING(#string, Number,
CHARINDEX(#term, #string + #term, Number) - Number)))
FROM (SELECT ROW_NUMBER() OVER (ORDER BY [object_id])
FROM sys.all_objects) AS n(Number)
WHERE Number > 1 AND Number <= CONVERT(INT, LEN(#string))
AND SUBSTRING(#term + #string, Number, LEN(#term)) = #term
) AS y);
GO
Query:
SELECT comment, pos, SUBSTRING(comment,beginningOfEmail,endOfEmail-beginningOfEmail) AS email
FROM #Table
CROSS APPLY (SELECT pos FROM dbo.fnFindPatternLocation(comment, '#')) AS A(pos)
CROSS APPLY (SELECT CHARINDEX(' ',comment + ' ', pos)) AS B(endOfEmail)
CROSS APPLY (SELECT pos - CHARINDEX(' ', REVERSE(SUBSTRING(comment, 1, pos))) + 2) AS C(beginningOfEmail)
Results:
comment
---------------------------------------------------------------------------------------------------------
blah blah My.EmailAddress#domain.org more blah someemailaddress#domain.com even more blah asdf#gmail.com
blah blah My.EmailAddress#domain.org more blah someemailaddress#domain.com even more blah asdf#gmail.com
blah blah My.EmailAddress#domain.org more blah someemailaddress#domain.com even more blah asdf#gmail.com
blah hello.world#domain.org more
pos email
--- ------------------------------
26 My.EmailAddress#domain.org
64 someemailaddress#domain.com
95 asdf#gmail.com
17 hello.world#domain.org

DECLARE #t TABLE (row_id INT, email VARCHAR(100))
INSERT #t (row_id, email)
VALUES (1, 'drgkls<ivan#gvi.ru>, info#gvi.com, # dgh507-16-65#'),
(2, 'hjshfkjshfj#kjs.kjsehf herwfjewr#kjsd.com adjfhja#.com u3483dhj#hhb#.dfj'),
(3, 'kjsdghfjs4254.23detygh#jhjdfg.dgb лдоврывплдоо isgfsi# klsdfksdl#,dd.com')
DECLARE #pat VARCHAR(100) = '%[^a-z0-9#._ ]%';
WITH f AS (
SELECT row_id,
CAST(' ' + email + ' ' AS VARCHAR(102)) email,
SUBSTRING(email, PATINDEX(#pat, email), 1) bad,
PATINDEX(#pat, email) pat
FROM #t
UNION ALL
SELECT row_id,
CAST(REPLACE(email, bad, ' ') AS VARCHAR(102)),
SUBSTRING(REPLACE(email, bad, ' '), PATINDEX(#pat, REPLACE(email, bad, ' ')), 1) bad,
PATINDEX(#pat, REPLACE(email, bad, ' '))
FROM f
WHERE PATINDEX(#pat, email) > 0
),
s AS
(
SELECT row_id,
email, PATINDEX('%#%', email) pos
FROM f
WHERE pat = 0
AND PATINDEX('%#%', email) > 0
UNION ALL
SELECT row_id,
SUBSTRING(email, pos + 1, 102),
PATINDEX('%#%', SUBSTRING(email, pos + 1, 102))
FROM s
WHERE PATINDEX('%#%', SUBSTRING(email, pos + 1, 102)) > 0
)
SELECT row_id, o1 + pp
FROM s
CROSS APPLY (SELECT REVERSE(LEFT(email, pos -1)) s1) x
CROSS APPLY (SELECT CHARINDEX(' ', s1) i1) y
CROSS APPLY (SELECT REVERSE(LEFT(s1, i1 -1)) o1 WHERE i1 > 0) z
CROSS APPLY (SELECT CHARINDEX(' ', email, pos) i2) e
CROSS APPLY (SELECT SUBSTRING(email, pos, i2 -pos) pp WHERE i2 > pos + 1) q
WHERE LEN(o1) > 1
AND CHARINDEX('.', pp) > 0
AND PATINDEX('%#%#%', pp) = 0
AND PATINDEX('%#.%', pp) = 0
AND PATINDEX('%.', pp) = 0

This one line would also work (a bit long line though lol):
--declare #a varchar(100)
--set #a = 'a asfd saasd asdfgh#asd.com wqe z zx cxzc '
select substring(substring(#a,0,charindex('#',#a)),len(substring(#a,0,charindex('#',#a)))-charindex(' ',reverse(substring(#a,0,charindex('#',#a))))+2,len(substring(#a,0,charindex('#',#a)))) + substring(substring(#a,charindex('#',#a),len(#a)),0,charindex(' ',substring(#a,charindex('#',#a),len(#a))))

For strings that contain new line characters I modified Felix's answer using PATINDEX to search for the first control character rather than white space.
I also had to modify the Right field to subtract the correct amount of text.
WITH CteEmail(email) AS(
SELECT 'example string with new lines
Email: some.example#email.address.com
(first email address - should be returned)
Email: another#test.co.uk
(other email addresses should be ignored
more example text' UNION ALL
SELECT 'Email: some.example#email.address.com' UNION ALL
SELECT 'someemail#domain.org' UNION ALL
SELECT 'some text someemail#domain.org some text' UNION ALL
SELECT 'no email'
)
,CteStrings AS(
SELECT
[Left] = LEFT(email, CHARINDEX('#', email, 0) - 1),
Reverse_Left = REVERSE(LEFT(email, CHARINDEX('#', email, 0) - 1)),
[Right] = RIGHT(email, LEN(email) - CHARINDEX('#', email, 0) + 1 )
FROM CteEmail
WHERE email LIKE '%#%'
)
SELECT *,
REVERSE(
SUBSTRING(Reverse_Left, 0,
CASE
WHEN PATINDEX('%[' + CHAR(10)+'- ]%', Reverse_Left) = 0 THEN LEN(Reverse_Left) + 1
ELSE PATINDEX('%[' + CHAR(0)+'- ]%', Reverse_Left)
END
)
)
+
SUBSTRING([Right], 0,
CASE
WHEN PATINDEX('%[' + CHAR(0)+'- ]%', [Right]) = 0 THEN LEN([Right]) + 1
ELSE PATINDEX('%[' + CHAR(0)+'- ]%', [Right])
END
)
FROM CteStrings

If you need it in a function then this works for me...
-- Returns the first whitespace-delimited token of @input that contains '@'
-- (the text around the first '@', bounded by the nearest space on each side).
-- Returns '' when @input is NULL or contains no '@'.
CREATE FUNCTION [dbo].[extractEmail]
(
    @input nvarchar(500)
)
RETURNS nvarchar(100)
AS
BEGIN
    -- One character wider than @input so the leading sentinel space added
    -- below cannot push the last character of a full-length input out.
    DECLARE @text nvarchar(501)
    DECLARE @atPosition int
    DECLARE @firstRelevantSpace int
    DECLARE @name nvarchar(100)
    DECLARE @secondRelevantSpace int
    DECLARE @everythingAfterAt nvarchar(500)
    DECLARE @domain nvarchar(100)
    DECLARE @email nvarchar(100) = ''

    IF CHARINDEX('@', @input, 0) > 0
    BEGIN
        -- Treat CR, LF and TAB as word boundaries, and prefix a space so the
        -- backwards search below always finds a boundary.
        SET @text = ' ' + REPLACE(REPLACE(REPLACE(@input, CHAR(13), ' '), CHAR(10), ' '), CHAR(9), ' ')
        SET @atPosition = CHARINDEX('@', @text, 0)

        -- Local part: walk back from the '@' to the nearest space.
        SET @firstRelevantSpace = CHARINDEX(' ', REVERSE(LEFT(@text, @atPosition - 1)))
        SET @name = REVERSE(LEFT(REVERSE(LEFT(@text, @atPosition - 1)), @firstRelevantSpace - 1))

        -- Domain part: from the '@' forward to the next space (or end of string).
        SET @everythingAfterAt = SUBSTRING(@text, @atPosition, LEN(@text) - @atPosition + 1)
        SET @secondRelevantSpace = CHARINDEX(' ', @everythingAfterAt)
        IF @secondRelevantSpace = 0
            SET @domain = @everythingAfterAt
        ELSE
            -- "- 1" excludes the delimiting space itself from the result.
            SET @domain = LEFT(@everythingAfterAt, @secondRelevantSpace - 1)

        SET @email = @name + @domain
    END
    RETURN @email
END

Using Cymorg's Function: I ran into an issue where my data included CR/LF and it prevented the Function from working 100%. It was tough to figure out because, when using the function in a select statement, it would return occasionally incorrect results. If I copied the offending text from my query results and invoked the function using sql print with the text in quotes it would work fine. Inconceivable!
After much trial and error, I used sql replace to replace the CR/LF with spaces and huzza! I am an excellent guesser.
-- CR and LF are replaced with spaces first so that line breaks act as word boundaries.
-- A scalar user-defined function has to be called with at least a two-part name, hence dbo.
select dbo.extractEmail(replace(replace(MyColumn,CHAR(10),' '),CHAR(13),' ')) as AsYouWish from FacilityContacts

T-SQL split string based on delimiter

I have some data that I would like to split based on a delimiter that may or may not exist.
Example data:
John/Smith
Jane/Doe
Steve
Bob/Johnson
I am using the following code to split this data into First and Last names:
SELECT SUBSTRING(myColumn, 1, CHARINDEX('/', myColumn)-1) AS FirstName,
SUBSTRING(myColumn, CHARINDEX('/', myColumn) + 1, 1000) AS LastName
FROM MyTable
The results I would like:
FirstName---LastName
John--------Smith
Jane--------Doe
Steve-------NULL
Bob---------Johnson
This code works just fine as long as all the rows have the anticipated delimiter, but errors out when a row does not:
"Invalid length parameter passed to the LEFT or SUBSTRING function."
How can I re-write this to work properly?

Maybe this will help you.
SELECT SUBSTRING(myColumn, 1, CASE CHARINDEX('/', myColumn)
WHEN 0
THEN LEN(myColumn)
ELSE CHARINDEX('/', myColumn) - 1
END) AS FirstName
,SUBSTRING(myColumn, CASE CHARINDEX('/', myColumn)
WHEN 0
THEN LEN(myColumn) + 1
ELSE CHARINDEX('/', myColumn) + 1
END, 1000) AS LastName
FROM MyTable

For those looking for answers for SQL Server 2016+. Use the built-in STRING_SPLIT function
Eg:
DECLARE #tags NVARCHAR(400) = 'clothing,road,,touring,bike'
SELECT value
FROM STRING_SPLIT(#tags, ',')
WHERE RTRIM(value) <> '';
Reference: https://msdn.microsoft.com/en-nz/library/mt684588.aspx

Try filtering out the rows that contain strings with the delimiter and work on those only like:
SELECT SUBSTRING(myColumn, 1, CHARINDEX('/', myColumn)-1) AS FirstName,
SUBSTRING(myColumn, CHARINDEX('/', myColumn) + 1, 1000) AS LastName
FROM MyTable
WHERE CHARINDEX('/', myColumn) > 0
Or
SELECT SUBSTRING(myColumn, 1, CHARINDEX('/', myColumn)-1) AS FirstName,
SUBSTRING(myColumn, CHARINDEX('/', myColumn) + 1, 1000) AS LastName
FROM MyTable
WHERE myColumn LIKE '%/%'

-- Split myColumn at the first '/': FirstName is the text before it and
-- LastName is everything after it. Rows without a '/' keep the whole value as
-- FirstName and get a NULL LastName (the CASE has no ELSE branch).
SELECT CASE
           WHEN d.SlashPos = 0 THEN myColumn
           ELSE LEFT(myColumn, d.SlashPos - 1)
       END AS FirstName
      ,CASE
           -- Taking the rest of the string from the first '/' keeps any middle
           -- segments when a value contains several delimiters.
           WHEN d.SlashPos > 0 THEN SUBSTRING(myColumn, d.SlashPos + 1, LEN(myColumn))
       END AS LastName
FROM MyTable
-- Locate the delimiter once per row; 0 means "not present".
CROSS APPLY (SELECT CHARINDEX('/', myColumn, 0)) AS d(SlashPos)

-- Splits @delimited on every occurrence of @delimiter and returns one row per
-- piece: id numbers the rows as they are inserted, val holds the piece text.
-- The split is done by turning the input into <t>piece</t> elements and
-- shredding that XML.
-- Limitation: when the input contains &, < or > and the delimiter overlaps the
-- text of an XML entity (for example ';' or 'amp'), the XML conversion still fails.
ALTER FUNCTION [dbo].[split_string](
    @delimited NVARCHAR(MAX),
    @delimiter NVARCHAR(100)
) RETURNS @t TABLE (id INT IDENTITY(1,1), val NVARCHAR(MAX))
AS
BEGIN
    -- Escape the XML special characters in the data, otherwise input such as
    -- 'a & b' makes the XML conversion raise a parse error. The delimiter gets
    -- the same escaping so that it still matches inside the escaped text.
    DECLARE @escapedText NVARCHAR(MAX) =
        REPLACE(REPLACE(REPLACE(@delimited, N'&', N'&amp;'), N'<', N'&lt;'), N'>', N'&gt;')
    -- Worst case every delimiter character grows to five characters ('&amp;').
    DECLARE @escapedDelimiter NVARCHAR(500) =
        REPLACE(REPLACE(REPLACE(@delimiter, N'&', N'&amp;'), N'<', N'&lt;'), N'>', N'&gt;')

    DECLARE @xml XML
    SET @xml = N'<t>' + REPLACE(@escapedText, @escapedDelimiter, N'</t><t>') + N'</t>'

    INSERT INTO @t(val)
    -- Read as nvarchar so Unicode text survives; val is NVARCHAR(MAX).
    SELECT r.value('.', 'nvarchar(MAX)') AS item
    FROM @xml.nodes('/t') AS records(r)
    RETURN
END

I just wanted to give an alternative way to split a string with multiple delimiters, in case you are using a SQL Server version under 2016.
The general idea is to split out all of the characters in the string, determine the position of the delimiters, then obtain substrings relative to the delimiters. Here is a sample:
-- Sample data (T-SQL table variables and scalar variables are prefixed with '@')
DECLARE @testTable TABLE (
    TestString VARCHAR(50)
);
INSERT INTO @testTable (TestString) VALUES
    ('Teststring,1,2,3')
  , ('Test');
DECLARE @delimiter VARCHAR(1) = ',';

-- Generate the numbers 1..255 with which we can enumerate character positions
WITH Numbers AS (
    SELECT 1 AS N
    UNION ALL
    SELECT N + 1
    FROM Numbers
    WHERE N < 255
),
-- Enumerate letters in the string and select only the delimiters,
-- numbering them 1, 2, 3, ... per string in left-to-right order
Letters AS (
    SELECT n.N
         , SUBSTRING(t.TestString, n.N, 1) AS Letter
         , t.TestString
         , ROW_NUMBER() OVER ( PARTITION BY t.TestString
                               ORDER BY n.N
                             ) AS Delimiter_Number
    FROM Numbers n
    INNER JOIN @testTable t
        ON n.N <= LEN(t.TestString)
    WHERE SUBSTRING(t.TestString, n.N, 1) = @delimiter
    UNION
    -- Include 0th position to "delimit" the start of the string
    SELECT 0
         , NULL
         , t.TestString
         , 0
    FROM @testTable t
)
-- Obtain substrings based on delimiter positions: ds is the delimiter that
-- starts a part, de the next delimiter (absent for the last part, in which
-- case the part runs to the end of the string)
SELECT t.TestString
     , ds.Delimiter_Number + 1 AS Position
     , SUBSTRING(t.TestString, ds.N + 1, ISNULL(de.N, LEN(t.TestString) + 1) - ds.N - 1) AS Delimited_Substring
FROM @testTable t
LEFT JOIN Letters ds
    ON t.TestString = ds.TestString
LEFT JOIN Letters de
    ON t.TestString = de.TestString
    AND ds.Delimiter_Number + 1 = de.Delimiter_Number
-- The Numbers CTE recurses more than the default limit of 100 levels
OPTION (MAXRECURSION 0)

The examples above work fine when there is only one delimiter, but they don't scale well to multiple delimiters. Note that the approach below will only work on SQL Server 2016 and above.
/* Some sample data (T-SQL table variables are prefixed with '@') */
DECLARE @mytable TABLE ([id] VARCHAR(10), [name] VARCHAR(1000));
INSERT INTO @mytable ([id], [name])
VALUES ('1','John/Smith'), ('2','Jane/Doe'), ('3','Steve'), ('4','Bob/Johnson');

/* Split based on delimiter, then pivot the parts of each row into columns */
SELECT P.id, P.[1] AS FirstName, P.[2] AS LastName, P.[3] AS Col3, P.[4] AS Col4
FROM (
    -- NOTE(review): every row of a partition has the same A.id, so the order in
    -- which RN is handed out is whatever order STRING_SPLIT happens to return
    -- the parts in; that order is not documented as guaranteed.
    SELECT A.id, X1.value, ROW_NUMBER() OVER (PARTITION BY A.id ORDER BY A.id) AS RN
    FROM @mytable A
    CROSS APPLY STRING_SPLIT(A.name, '/') X1
) A
PIVOT (MAX([value]) FOR RN IN ([1],[2],[3],[4])) P;

These all helped me get to this. I am still on 2012 but now have something quick that will allow me to split a string, even if string has varying numbers of delimiters, and grab the nth substring from that string. It's quick too. I know this post is old, but it took me forever to find something so hopefully this will help someone else.
-- Returns the @position-th (1-based) part of @string when split on @separator.
--   @separator : text that separates the parts (defaults to a single space)
--   @string    : text to split
--   @position  : 1-based index of the part to return
-- Returns NULL when @string is NULL or empty, or when there is no such part.
-- Parts that compare equal to @separator are skipped before numbering; with a
-- space separator this also drops the empty parts produced by repeated spaces.
-- NOTE: the separator is matched with LIKE, so '%', '_' and '[' inside
-- @separator act as wildcards.
CREATE FUNCTION [dbo].[SplitsByIndex]
(@separator VARCHAR(20) = ' ',
 @string VARCHAR(MAX),
 @position INT
)
RETURNS VARCHAR(MAX)
AS
BEGIN
    -- LEN() ignores trailing spaces (LEN(' ') = 0), so measure the separator
    -- with a sentinel character appended.
    DECLARE @sepLen INT = LEN(@separator + 'x') - 1;
    DECLARE @results TABLE
    (id INT IDENTITY(1, 1),
     chrs VARCHAR(MAX)          -- as wide as the return type, so long parts fit
    );
    DECLARE @outResult VARCHAR(MAX);

    WITH X(N)
         AS (SELECT 'Table1'
             FROM(VALUES(0), (0), (0), (0), (0), (0), (0), (0), (0), (0), (0), (0), (0), (0), (0), (0)) T(C)),
         Y(N)
         AS (SELECT 'Table2'
             FROM X A1,
                  X A2,
                  X A3,
                  X A4,
                  X A5,
                  X A6,
                  X A7,
                  X A8), -- Up to 16^8 = 4 billion
         -- One row per character position 1..LEN(@string)
         T(N)
         AS (SELECT TOP (ISNULL(LEN(@string), 0)) ROW_NUMBER() OVER(
                    ORDER BY (SELECT NULL)) N
             FROM Y),
         -- Start position of every separator, plus a virtual separator that
         -- ends just before character 1 so the first part is handled uniformly
         Delim(Pos)
         AS (SELECT CAST(1 - @sepLen AS BIGINT)
             WHERE LEN(@string) > 0
             UNION ALL
             SELECT t.N
             FROM T
             WHERE SUBSTRING(@string, t.N, @sepLen) LIKE @separator),
         -- Text between each separator and the next one (or the end of the string)
         Separated(Pos, value)
         AS (SELECT d.Pos,
                    SUBSTRING(@string, d.Pos + @sepLen,
                              LEAD(d.Pos, 1, 2147483647) OVER(ORDER BY d.Pos) - d.Pos - @sepLen)
             FROM Delim d
             WHERE @string IS NOT NULL)
         INSERT INTO @results(chrs)
                SELECT s.value
                FROM Separated s
                WHERE s.value <> @separator
                -- ORDER BY makes the identity values follow the left-to-right order
                ORDER BY s.Pos;

    SELECT @outResult =
    (
        SELECT chrs
        FROM @results
        WHERE id = @position
    );
    RETURN @outResult;
END;
This can be used like this:
-- Return the 2nd space-separated part of fieldname for every row of tablename
SELECT [dbo].[SplitsByIndex](' ',fieldname,2)
from tablename

I would protect the substring operation by always appending a delimiter to the test strings. This makes the parsing much simpler. Your code may now rely on finding the right pattern, and not need to cope with special cases.
-- Split myColumn at the first '/'. The delimiter is appended to the value that
-- CHARINDEX searches, so a position is always found, even for rows that have
-- no '/' of their own.
SELECT SUBSTRING(myColumn, 1, CHARINDEX('/', myColumn + '/') - 1) AS FirstName,
       -- Extract from the original value so the appended '/' never shows up in
       -- the result; NULLIF reports a missing last name as NULL.
       NULLIF(SUBSTRING(myColumn, CHARINDEX('/', myColumn + '/') + 1, 1000), '') AS LastName
FROM MyTable
It eliminates edge cases and conditionals and cases.
Always add an extra delimiter at the end, then the challenge case is no problem.

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

How to get 3rd string part with CharIndex/SubString [duplicate] - sql

Related

How can I retrieve first second and third word of a String in SQL?

Converting multiple delimited fields into rows in SQL Server

Split Strings into columns in SQL Server

Extract email address from string using tsql

T-SQL split string based on delimiter

Categories

Resources