Extract Emails Lying Between Special Characters Using Regex in SQL - sql

How do I extract only email from this certain pattern of string using regex in SQL?
What I have :
tb_1
Logmessage
Alan Robert <alan.robert#gmail.com> was assigned to <richard#yahoo.com> and <nelson#gmail.com>
Alan Robert <alan.robert#gmail.com> was unassigned to <khanjoyty#gmail.com> and <katy#gmail.com>
What I want: tb_2
email_1
email_2
email_3
alan.robert#gmail.com
richard#yahoo.com
nelson#gmail.com
alan.robert#gmail.com
khanjoyty#gmail.com
katy#gmail.com
I already have a solution for this but the tb_1 table has a huge amount of rows so my query output takes too much time. That's why I thought maybe regex would be more time-saving.
My query:
-- Splits each log message of the form
--   'Name <email_1> was [un]assigned to <email_2> and <email_3>'
-- into three e-mail columns using only built-in string functions.
--
-- Step 1 (cte): keep the text from the first '<' onwards, replace the two
--   connecting phrases with '.', and turn '<' / '>' into '[' / ']'. The result
--   looks like a three-part object name: [email_1].[email_2].[email_3]
--   (the brackets protect the dots that occur inside the addresses).
-- Step 2: PARSENAME reads the parts back. PARSENAME counts parts from the
--   RIGHT, so part 3 is the first address and part 1 is the last one.
--
-- Fixes relative to the posted query:
--   * right() now reads the logmessage column (the sigil-prefixed name was not
--     a column of tb_1);
--   * email_2 read part 3 a second time instead of part 2;
--   * email_1 / email_3 were swapped relative to the desired output.
-- Messages containing neither phrase yield NULLs (the CASE has no ELSE branch).
with cte as (
    Select replace(replace(replace(replace(
               right(logmessage, len(logmessage) - charindex('<', logmessage) + 1),
               Case when logmessage like '%unassigned%' Then ' was unassigned to '
                    When logmessage like '%assigned%'   then ' was assigned to '
               End, '.'),
               ' and ', '.'),
               '<', '['),
               '>', ']') logmessage
    From tb_1
)
Select
    PARSENAME(logmessage, 3) AS email_1,
    PARSENAME(logmessage, 2) AS email_2,
    PARSENAME(logmessage, 1) AS email_3
From cte

With the use of a helper function
Example or dbFiddle
-- Demo: returns one row per value found between '<' and '>' in each log message.
-- Requires the helper function [dbo].[tvf-Str-Extract-JSON] to exist.
-- Table variables must be named with a leading '@'.
Declare @YourTable Table (LogID int,[Logmessage] varchar(500));

Insert Into @YourTable (LogID,[Logmessage]) Values
 (1,'Alan Robert <alan.robert#gmail.com> was assigned to <richard#yahoo.com> and <nelson#gmail.com>')
,(2,'Alan Robert <alan.robert#gmail.com> was unassigned to <khanjoyty#gmail.com> and <katy#gmail.com>');

Select A.LogID
      ,B.RetSeq   -- position of the value within its message (1-based)
      ,B.RetVal   -- text found between the delimiters
From  @YourTable A
Cross Apply [dbo].[tvf-Str-Extract-JSON](A.Logmessage,'<','>') B;
Results
LogID RetSeq RetVal
1 1 alan.robert#gmail.com
1 2 richard#yahoo.com
1 3 nelson#gmail.com
2 1 alan.robert#gmail.com
2 2 khanjoyty#gmail.com
2 3 katy#gmail.com
It would then be a small matter to pivot the results
The TVF if interested
-- Returns every value of @String that sits between @Delim1 (opening delimiter)
-- and @Delim2 (closing delimiter).
--   @String : text to search
--   @Delim1 : opening delimiter
--   @Delim2 : closing delimiter
-- Result columns:
--   RetSeq  : 1-based sequence of the returned values, in order of appearance
--   RetVal  : trimmed text up to (not including) the first @Delim2
-- How it works: the string is JSON-escaped, every @Delim1 is replaced by '","'
-- so the whole thing becomes a JSON array of segments, and OpenJSON returns one
-- row per segment. Segments that contain @Delim2 after at least one character
-- are kept and cut at @Delim2 (empty values such as '<>' are skipped).
-- Notes:
--   * @Delim1 is JSON-escaped as well, so delimiters that string_escape rewrites
--     (for example '/', '\' or '"') still match inside the escaped string.
--   * Segment 0 is the text BEFORE the first @Delim1; it is not enclosed by the
--     delimiters and is therefore excluded.
CREATE FUNCTION [dbo].[tvf-Str-Extract-JSON] (@String varchar(max),@Delim1 varchar(100),@Delim2 varchar(100))
Returns Table
As
Return (
    Select RetSeq = row_number() over (order by RetSeq)
          ,RetVal = left(RetVal,charindex(@Delim2,RetVal)-1)
    From  (
            Select RetSeq = [Key]+1
                  ,RetVal = trim(Value)
            From  OpenJSON( '["'+replace(string_escape(@String,'json'),string_escape(@Delim1,'json'),'","')+'"]' )
            Where [Key] > 0
          ) C1
    Where charindex(@Delim2,RetVal)>1
)

Related

How do I extract data within parentheses from a table with different values?

Im trying to extract data within a column that contains IDs and characters that contain IDs within parentheses. It looks somewhat like this (Btw, there will only be one set of parentheses if they happen to exist within a row):
Col1
Mark(1234)
6789
VZ(X678)
ASD(5677)qwe
Ideal Result
1234
6789
X678
5677
This is what I have so far but its returning an error: 'Invalid length parameter passed to the LEFT or SUBSTRING function.'
-- Attempt to return the text inside the parentheses of col1, or col1 itself
-- when it contains no '('.
SELECT DISTINCT col1,
CASE WHEN col1 like '%(%' then
SUBSTRING (col1,
-- CHARINDEX searches for its first argument literally: '%(%' is looked up as
-- the three characters % ( %, not as a LIKE pattern, so it returns 0 for
-- values such as 'Mark(1234)'.
CHARINDEX('%(%', col1) + 1,
-- With both CHARINDEX calls returning 0 the length evaluates to 0 - 0 - 1 = -1,
-- which raises 'Invalid length parameter passed to the LEFT or SUBSTRING function.'
CHARINDEX('%)%', col1) - CHARINDEX('%(%', col1) - 1)
else col1 end
from MyTable B;
If interested in a helper Table-Valued Function which will support multiple observations. If you don't want the function, it would be a small matter to migrate the logic into the CROSS APPLY
Example
-- Demo: replaces each value by the text inside its parentheses, falling back
-- to the original value when nothing is found (Outer Apply keeps such rows and
-- RetVal is NULL for them).
-- Requires the helper function [dbo].[tvf-Str-Extract] to exist.
-- Table variables must be named with a leading '@'.
Declare @YourTable Table ([Col1] varchar(50));

Insert Into @YourTable ([Col1]) Values
 ('Mark(1234)')
,('6789')
,('VZ(X678)')
,('ASD(5677)qwe');

Select A.Col1
      ,NewValue = coalesce(B.RetVal,A.Col1)
from  @YourTable A
Outer Apply [dbo].[tvf-Str-Extract](A.Col1,'(',')') B;
Results
Col1 NewValue
Mark(1234) 1234
6789 6789
VZ(X678) X678
ASD(5677)qwe 5677
The TVF if interested
-- Unicode version: returns every value of @String that sits between @Delim1
-- (opening delimiter) and @Delim2 (closing delimiter).
--   @String : text to search
--   @Delim1 : opening delimiter
--   @Delim2 : closing delimiter
-- Result columns:
--   RetSeq  : 1-based sequence of the returned values, in order of appearance
--   RetVal  : trimmed text up to (not including) the first @Delim2
-- How it works: the string is JSON-escaped, every @Delim1 is replaced by '","'
-- so the whole thing becomes a JSON array of segments, and OpenJSON returns one
-- row per segment. Segments that contain @Delim2 after at least one character
-- are kept and cut at @Delim2 (empty values such as '()' are skipped).
-- Notes:
--   * @Delim1 is JSON-escaped as well, so delimiters that string_escape rewrites
--     (for example '/', '\' or '"') still match inside the escaped string.
--   * Segment 0 is the text BEFORE the first @Delim1; it is not enclosed by the
--     delimiters and is therefore excluded.
CREATE FUNCTION [dbo].[tvf-Str-Extract-JSON] (@String nvarchar(max),@Delim1 nvarchar(100),@Delim2 nvarchar(100))
Returns Table
As
Return (
    Select RetSeq = row_number() over (order by RetSeq)
          ,RetVal = left(RetVal,charindex(@Delim2,RetVal)-1)
    From  (
            Select RetSeq = [Key]+1
                  ,RetVal = trim(Value)
            From  OpenJSON( N'["'+replace(string_escape(@String,'json'),string_escape(@Delim1,'json'),'","')+N'"]' )
            Where [Key] > 0
          ) C1
    Where charindex(@Delim2,RetVal)>1
)
EDIT - Sans TVF
-- Same extraction without a helper function: the OpenJSON split on '(' is
-- inlined into the Outer Apply.
-- NewValue is the text inside the parentheses, or Col1 when none is found.
Select A.Col1
      ,NewValue = coalesce(B.RetVal,A.Col1)
from  @YourTable A   -- table variable: '@' prefix
Outer Apply (
              Select RetSeq = row_number() over (order by RetSeq)
                    ,RetVal = left(RetVal,charindex(')',RetVal)-1)
              From  (
                      Select RetSeq = [Key]+1
                            ,RetVal = trim(Value)
                      From  OpenJSON( N'["'+replace(string_escape(A.Col1,'json'),'(','","')+N'"]' )
                      -- element 0 is the text before the first '(' and is never inside parentheses
                      Where [Key] > 0
                    ) C1
              Where charindex(')',RetVal)>1
            ) B
@Martin Smith
Thanks for pointing out the use of the wildcards. I changed my code to this and it's doing what I needed it to do! I kept the % wildcards in the CASE WHEN expression, since LIKE needs them to find the parenthesis regardless of its location, but took them out of the CHARINDEX calls as you mentioned:
-- Returns each distinct col1 together with the text inside its parentheses;
-- values without a complete '(...)' pair are returned unchanged.
--   * LIKE '%(%)%' only matches when a ')' occurs somewhere after a '(' so the
--     SUBSTRING length can never become negative (a lone '(' used to raise
--     'Invalid length parameter passed to the LEFT or SUBSTRING function').
--   * The closing parenthesis is searched starting at the opening one, so a
--     stray ')' earlier in the value is ignored.
SELECT DISTINCT
    col1,
    CASE
        WHEN col1 LIKE '%(%)%' THEN
            SUBSTRING(col1,
                      CHARINDEX('(', col1) + 1,
                      CHARINDEX(')', col1, CHARINDEX('(', col1)) - CHARINDEX('(', col1) - 1)
        ELSE col1
    END AS extracted_value
FROM MyTable B;

Adding 'and' at the end of a comma separated list

Currently, I'm using the stuff function to create a comma separated list per each row.
x,y,z
What I want is to add commas for n-1 items in the list, with the final item being preceded by 'and'
x,y, and z.
For these purposes, just checking row number won't work because this list is being generated per unique Id, therefore I can't just iterate to the end of the table. Code below:
-- Builds one comma separated list of pet names (e.pn) per owner.
-- NOTE(review): the outer FROM clause that defines the alias sw is not part of
-- the posted fragment.
SELECT DISTINCT (sw.OwnerID)
,stuff((
-- every name is prefixed with ', ' and the rows are concatenated by FOR XML PATH('')
SELECT DISTINCT ', ' + e.pn
FROM fct.enrtablev e
WHERE sw.OwnerID = e.OwnerId
-- STUFF(..., 1, 1, '') deletes only the first character (the comma), so the
-- list keeps the space that followed it
FOR XML PATH('')), 1, 1, '') AS [Pet(s)]
A bit of a hack... and string_agg() would be a better fit on SQL Server 2017+.
Here we compare each item's row_number() with the total item count, sum(1) over(); when the two are equal, the item is the last one in the list.
Example
-- Builds one list of distinct pet names per owner: 'X, Y, and Z' for several
-- names, just the name for a single one.
-- Table variables must be named with a leading '@'.
Declare @YourTable table (OwnerID int,pn varchar(50));

Insert Into @YourTable (OwnerID,pn) values
 (1,'X')
,(1,'Y')
,(1,'Z')
,(1,'Z')
,(2,'Apples');

Select Distinct
       A.OwnerID
      ,stuff( ( -- The last name of a list gets ', and ' as prefix. nullif(...,1)
                -- turns a count of 1 into NULL, so a single name never does.
                Select case when row_number() over(order by pn) = nullif(sum(1) over() ,1)
                            then ', and '
                            else ', '
                       end + pn
                FROM  (Select distinct pn
                       From  @YourTable
                       Where OwnerID = A.OwnerId
                      ) e
                Order By PN
                -- TYPE + .value() returns the text un-entitized, so names
                -- containing &, < or > are not rendered as &amp; &lt; &gt;
                For XML Path(''), TYPE ).value('.', 'varchar(max)'), 1, 2, '') AS [Pet(s)]
From  @YourTable A;
Returns
OwnerID Pet(s)
1 X, Y, and Z
2 Apples
XQuery and the XML data model are based on ordered sequences, which is exactly what we need.
Here is a simple solution based on XQuery and its FLWOR expression.
SQL
-- DDL and sample data population, start
-- Builds 'X, Y, and Z' per owner with an XQuery FLWOR expression.
-- The names of one owner are serialized as <root><r><pn>..</pn></r>...</root>;
-- the XQuery appends ',' to every name except the last one, which is prefixed
-- with 'and ' when the owner has more than one row.
-- Table variables must be named with a leading '@'.
DECLARE @tbl TABLE (OwnerID int, pn VARCHAR(50));
INSERT INTO @tbl (OwnerID, pn) VALUES
(1,'X'),
(1,'Y'),
(1,'Z'),
(2,'Apples');

SELECT p.OwnerID
, (SELECT c.pn            -- only pn is read by the XQuery below
FROM @tbl AS c
WHERE c.OwnerID = p.OwnerID
ORDER BY c.pn             -- fixes which row is the last() one
FOR XML PATH('r'), TYPE, ROOT('root')
).query('
for $x in /root/r/pn/text()
return if ($x is (/root/r[last()]/pn/text())[1]) then
if (count(/root/r) gt 1) then concat("and ", $x) else string($x)
else concat($x, ",")
').value('.', 'VARCHAR(MAX)') AS Result
FROM @tbl AS p
GROUP BY p.OwnerID;
Output
+---------+----------------+
| OwnerID | Result |
+---------+----------------+
| 1 | X, Y, and Z |
| 2 | Apples |
+---------+----------------+
You can achieve this using ORDER BY and count.
-- Builds 'X, Y, and Z' per owner by comparing each name's position inside its
-- owner with the number of names that owner has.
-- Fixes relative to the posted version:
--   * the row number is now computed PER OWNER; numbering the whole table only
--     matched the per-owner count for the first owner;
--   * 'and' is only added when the owner has more than one name;
--   * STUFF removes the full ', ' prefix and 'and ' carries no extra leading
--     space, so the result has no leading or doubled blanks;
--   * TYPE + .value() keeps &, < and > from being XML-entitized;
--   * table variables must be named with a leading '@'.
Declare @YourTable table (OwnerID int,pn varchar(50));

Insert Into @YourTable (OwnerID,pn) values
 (1,'X')
,(1,'Y')
,(1,'Z')
,(2,'Apples');

WITH CTE_OwnerIdRank as
(
    SELECT ownerid
         , pn
         , row_number() over (partition by ownerid order by pn) as totalrn   -- position within the owner
         , count(*)     over (partition by ownerid)             as ownercnt  -- names per owner
    from @YourTable
)
SELECT distinct o.OwnerId,
       stuff((
              SELECT ', ' + CASE WHEN c.totalrn = c.ownercnt and c.ownercnt > 1
                                 then CONCAT('and ',c.pn)
                                 else c.pn
                            end
              FROM  CTE_OwnerIdRank as c
              WHERE c.OwnerID = o.OwnerId
              order by c.totalrn
              FOR XML PATH(''), TYPE).value('.', 'varchar(max)'), 1, 2, '') AS [Pet(s)]
from @YourTable as o;
OwnerId
Pet(s)
1
X, Y, and Z
2
Apples

Extract string between two characters in a string

I have a set of strings that has datetime values and I would like to extract them. I am not sure if this is even possible using T-SQL.
-- Sample data: three blob names that each end in a 14-digit timestamp
-- followed by a file extension.
CREATE TABLE #Temp (
    BLOB_NM VARCHAR(100)
);

INSERT INTO #Temp (BLOB_NM)
VALUES ('products_country_20200528102030.txt'),
       ('products_territory_20190528102030.txt'),
       ('products_country_2020-05-20_20200528102030.txt');
Expected Results:
20200528102030
20190528102030
20200528102030
For this dataset, string functions should do it:
-- Counts back from the end of the name: skips the 4-character extension and
-- returns the 14 characters in front of it.
SELECT t.blob_nm,
       SUBSTRING(t.blob_nm, LEN(t.blob_nm) - 17, 14) AS res
FROM #temp AS t
The idea is to count backwards from the end of the string and capture the 14 characters that precede the extension (represented by the last 4 characters of the string).
Demo on DB Fiddle:
blob_nm | res
:--------------------------------------------- | :-------------
products_country_20200528102030.txt | 20200528102030
products_territory_20190528102030.txt | 20190528102030
products_country_2020-05-20_20200528102030.txt | 20200528102030
If interested in a helper function... I created this TVF because I was tired of extracting portions of strings (left, right, charindex, reverse, substring, etc.)
Example
-- One row per value found between '_' and '.' in every blob name.
-- Requires the helper function [dbo].[tvf-Str-Extract] to exist.
Select A.BLOB_NM
      ,B.RetSeq
      ,B.RetVal
From  #Temp A
Cross Apply [dbo].[tvf-Str-Extract](A.Blob_NM,'_','.') B
Returns
BLOB_NM RetSeq RetVal
products_country_20200528102030.txt 1 20200528102030
products_territory_20190528102030.txt 1 20190528102030
products_country_2020-05-20_20200528102030.txt 1 20200528102030
The Function if Interested
-- Returns every value of @String that is followed by @Delim2 within a segment
-- produced by splitting @String on @Delim1.
--   @String : text to search
--   @Delim1 : opening delimiter (the string is split on it)
--   @Delim2 : closing delimiter (each segment is cut at its first occurrence)
-- Result columns:
--   RetSeq  : 1-based sequence of the returned values
--   RetVal  : trimmed text of the segment up to (not including) @Delim2
-- How it works: @Delim1 is first swapped for the marker '§§Split§§', the string
-- is XML-encoded through FOR XML PATH(''), and the marker is then turned into
-- '</x><x>' so that every segment becomes its own <x> node. Segments are kept
-- only when @Delim2 occurs after at least one character.
-- T-SQL parameters must be named with a leading '@'.
CREATE FUNCTION [dbo].[tvf-Str-Extract] (@String varchar(max),@Delim1 varchar(100),@Delim2 varchar(100))
Returns Table
As
Return (
    Select RetSeq = row_number() over (order by RetSeq)
          ,RetVal = left(RetVal,charindex(@Delim2,RetVal)-1)
    From  (
            -- "order by 1/0" is only a placeholder ordering; it is never evaluated
            Select RetSeq = row_number() over (order by 1/0)
                  ,RetVal = ltrim(rtrim(B.i.value('(./text())[1]', 'varchar(max)')))
            From  ( values (convert(xml,'<x>' + replace((Select replace(@String,@Delim1,'§§Split§§') as [*] For XML Path('')),'§§Split§§','</x><x>')+'</x>').query('.'))) as A(XMLData)
            Cross Apply XMLData.nodes('x') AS B(i)
          ) C1
    Where charindex(@Delim2,RetVal)>1
)
I assume that:
File extensions are not always 3 characters long
Your date/time values are always 14 characters long
Try this :
-- Takes the part of the name after the last '_', keeps its first 14 characters
-- (yyyymmddhhmmss) and inserts ' ' and ':' separators so that it converts to
-- DATETIME ('yyyymmdd hh:mm:ss').
SELECT CONVERT(DATETIME,
               STUFF(STUFF(STUFF(stamp.digits, 13, 0, ':'), 11, 0, ':'), 9, 0, ' ')) AS Result
FROM #Temp AS t
CROSS APPLY (VALUES (LEFT(RIGHT(t.BLOB_NM, CHARINDEX('_', REVERSE(t.BLOB_NM) + '_') - 1), 14))) AS stamp(digits)

SQL Server Loop thru rows to form Groups

I am using SQL Server 2008 R2 / 2014. I wish to find a SQL query that can do the following:
Rules:
Each [Group] must have [Number] 1 to 6 to be complete group.
[Name] in each [Group] must be unique.
Each row can be used only once.
Table before sorting is...
Name Number Group
---- ------ -----
A 1
B 6
A 123
C 3
B 4
C 23
D 45
D 4
C 56
A 12
D 56
After sorting, result I want is below or similar....
Name Number Group
---- ------ -----
A 1 1
C 23 1
D 45 1
B 6 1
A 123 2
D 4 2
C 56 2
A 12 3
C 3 3
B 4 3
D 56 3
What I tried before is to find a subgroup that have [Number] consist of 1-6 with below concatenate method...
-- Attempt: return the rows of Table1 when the concatenated [Number] values
-- spell '123456'.
SELECT *
FROM [Table1] ST2
WHERE
-- The subquery concatenates the [Number] values into one string
-- (FOR XML PATH('') with a [text()] alias), ordered by their first character;
-- SUBSTRING keeps at most the first 1000 characters of it.
SUBSTRING((SELECT ST1.[Number] AS [text()]
FROM [Table1] ST1
-- With the correlation below commented out, the subquery reads every row of
-- Table1, so the predicate has the same outcome for all rows of ST2.
-- WHERE ST1.[Group] = ST2.[Group]
ORDER BY LEFT(ST1.[Number],1)
FOR XML PATH ('')), 1, 1000) = '123456'
Maybe you should check ROW_NUMBER function.
-- Numbers the rows of every Name by ascending Number; the n-th row of each
-- Name is assigned to group n, so a Name never appears twice in one group.
-- GROUP is a reserved keyword and must be bracket-quoted when used as an alias.
select Name
     , Number
     , ROW_NUMBER() OVER (PARTITION BY Name ORDER BY Number) as [Group]
from [Table1]
If you have more than 6 rows with same NAME value then it will return more groups. You can filter additional groups out since you are interested in only 6 groups with unique values of NAME column.
I'm not sure if this can be done more simply or not, but here's my go at it...
Advanced warning, this requires some means of splitting strings. Since you're not on 2016, I've included a function at the beginning of the script.
The bulk of the work is a recursive CTE that builds the Name and Number columns into comma delimited groups. We then reduce our working set to only the groups where the numbers would create 123456, split the groups and use ROW_NUMBER() OVER... to identify them, and then select based on the new data.
Demo: http://rextester.com/NEXG53500
-- Splits @List on @Delimiter and returns one row per element.
--   @List      : delimited text
--   @Delimiter : separator to split on
-- Result column:
--   Item       : element text (nvarchar(4000)); NULL for an empty element
-- How it works: every delimiter is replaced by '</i><i>' so the list becomes a
-- sequence of <i> nodes, which nodes()/value() then read back.
-- NOTE(review): @List is placed into the XML text as-is, so input containing
-- XML special characters such as '&' or '<' will make CONVERT(XML, ...) fail.
-- T-SQL parameters must be named with a leading '@'.
CREATE FUNCTION [dbo].[SplitStrings]
(
    @List NVARCHAR(MAX),
    @Delimiter NVARCHAR(255)
)
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN
(
    SELECT Item = y.i.value('(./text())[1]', 'nvarchar(4000)')
    FROM
    (
        SELECT x = CONVERT(XML, '<i>'
            + REPLACE(@List, @Delimiter, '</i><i>')
            + '</i>').query('.')
    ) AS a CROSS APPLY x.nodes('i') AS y(i)
);
GO
-- Sample data from the question: each row is a name plus the digits it covers.
CREATE TABLE #temp
(
    name   VARCHAR(MAX),
    number INT
)

INSERT INTO #temp (name, number)
VALUES ('a',   1),
       ('b',   6),
       ('a', 123),
       ('c',   3),
       ('b',   4),
       ('c',  23),
       ('d',  45),
       ('d',   4),
       ('c',  56),
       ('a',  12),
       ('d',  56);
/*** Recursively build groups based on information from #temp ***/
-- Anchor: every #temp row starts a group of its own.
-- Recursive step: a #temp row is prepended to an existing group when its name
-- is not yet contained in groupNames and none of number/100, number/10 and
-- number%10 (integer arithmetic, number is INT) occurs in groupNumbers.
-- NOTE(review): for a three-digit number, number/10 is its first two digits,
-- so the middle digit is not tested on its own - confirm this is intended.
WITH groupFinder AS
(
SELECT CAST(name AS VARCHAR(MAX)) AS [groupNames], CAST(number AS VARCHAR(max)) AS [groupNumbers] FROM #temp
UNION ALL
SELECT
cast(CONCAT(t.[Name],',',g.[groupNames]) as VARCHAR(MAX)),
CAST(CONCAT(CAST(t.[Number] AS VARCHAR(max)),',',CAST(g.[groupNumbers] AS VARCHAR(max))) AS VARCHAR(max))
FROM #temp t
JOIN groupFinder g
ON
g.groupNames NOT LIKE '%' + t.name+'%'
AND g.[groupNumbers] NOT LIKE '%' + CAST(t.number/100 AS VARCHAR(10)) +'%'
AND g.[groupNumbers] NOT LIKE '%' + CAST(t.number/10 AS VARCHAR(10)) +'%'
AND g.[groupNumbers] NOT LIKE '%' + CAST(t.number%10 AS VARCHAR(10)) +'%'
)
/*** only get groups where the numbers form 123456 ***/
-- rn numbers the surviving groups (ordered by groupNames); it becomes the
-- [Group] value of the final result.
, groupPruner AS
(
SELECT *, ROW_NUMBER() OVER (ORDER BY [groupNames]) AS [rn] FROM groupFinder WHERE REPLACE([groupNumbers],',','') = '123456'
)
/*** split the name group and give it identifiers ***/
-- rn1 = position of the name inside its group.
, nameIdentifier AS
(
SELECT g.*, c1.[item] AS [Name], ROW_NUMBER() OVER (PARTITION BY [rn] ORDER BY (SELECT NULL)) AS [rn1]
FROM groupPruner g
CROSS APPLY splitstrings(g.groupnames,',') c1
)
/*** split the number group and give it identifiers ***/
-- rn2 = position of the number inside its group, restarted for every name.
-- NOTE(review): rn1 and rn2 are ordered by (SELECT NULL), so they presumably
-- rely on splitstrings returning the items in list order - verify.
, numberIdentifier AS
(
SELECT g.*, c1.[item] AS [Number], ROW_NUMBER() OVER (PARTITION BY [rn], [rn1] ORDER BY (SELECT NULL)) AS [rn2]
FROM nameIdentifier g
CROSS APPLY splitstrings(g.groupNumbers,',') c1
)
-- rn1 = rn2 keeps only the pairing of the n-th name with the n-th number.
SELECT [Name], [Number], [rn] AS [Group]
--,groupnames, groupNumbers /*uncomment this line to see the groups that were built*/
FROM numberIdentifier
WHERE rn1 = rn2
ORDER BY rn, rn1
DROP TABLE #temp

SQL How can I optimize splitting a string and inserting the words into a new table?

Is there any way to do this in less time? I am taking the summary column from my case table and splitting the data word by word into my words table using the following loop:
Example case table
CaseID | CaseNumber | Summary
1 111111 This is a summary
2 111112 This is Summary 2
-- Splits the summary of cases 1..1000 into words, one case per iteration, and
-- stores the words in the words table.
-- Local variables must be named with a leading '@'. The summary is read into
-- a variable first so that dbo.Split6 receives a plain variable argument.
DECLARE @n       int = 1
      , @summary nvarchar(4000);

WHILE @n <= 1000
BEGIN
    -- SET with a scalar subquery yields NULL when the case does not exist
    SET @summary = (SELECT summary FROM cases WHERE caseID = @n);

    INSERT INTO words (caseID, caseNumber, pn, word)
    SELECT caseID, caseNumber, pn, word
    FROM dbo.Split6(' ', @summary)
    WHERE caseID = @n              -- Split6 returns the words once per case row
    GROUP BY caseID, caseNumber, pn, word
    OPTION (MAXRECURSION 0);       -- Split6 is recursive; lift the default limit of 100

    SET @n = @n + 1;
END
GO
It works, but it is slow. Took 3 hours to break down 1000 cases. I have 100,000 cases. Is there a way I can do this more efficiently? Here is the split function I'm using:
Split6 function:
-- Splits @s on the single-character separator @sep with a recursive CTE.
--   @sep : separator character
--   @s   : text to split
-- Result columns:
--   caseID, caseNumber : taken from the cases table
--   pn                 : 1-based position of the word
--   word               : the word; the last word is read with a length of 512
-- The anchor member selects from cases without any filter, so the words of @s
-- are produced once for EVERY row of cases; callers filter on caseID afterwards.
-- T-SQL parameters must be named with a leading '@', and the batch separator
-- GO has to stand on a line of its own.
CREATE FUNCTION [dbo].[Split6] (
     @sep CHAR(1)
    ,@s nVARCHAR(4000)
)
RETURNS TABLE
AS
RETURN (
    WITH Pieces(caseID, caseNumber, pn, start, stop) AS (
        -- first word: starts at 1 and ends at the first separator (0 = none)
        SELECT cs.caseID
              ,cs.caseNumber
              ,1
              ,1
              ,CHARINDEX(@sep, @s)
        FROM cases cs
        UNION ALL
        -- next word: starts right after the previous separator
        SELECT caseID
              ,caseNumber
              ,pn + 1
              ,stop + 1
              ,CHARINDEX(@sep, @s, stop + 1)
        FROM Pieces
        WHERE stop > 0
    )
    SELECT caseID
          ,caseNumber
          ,pn
          ,SUBSTRING(@s, start, CASE
                                    WHEN stop > 0
                                        THEN stop - start
                                    ELSE 512
                                END) AS word
    FROM Pieces
)
GO
You should avoid loops whenever possible.
The following uses a Parse/Split function in concert with a Cross Apply (use Outer Apply to show null values).
As far as performance goes... using a test sample of 100,000 records with an average of 5 words each, the execution time is 2.2 seconds.
Example
-- Demo: one output row per word of every case summary, without a loop.
-- Requires the helper function [dbo].[udf-Str-Parse] to exist.
-- Table variables must be named with a leading '@'.
Declare @YourTable Table ([CaseID] varchar(50),[CaseNumber] varchar(50),[Summary] varchar(50));

Insert Into @YourTable ([CaseID],[CaseNumber],[Summary]) Values
 (1,111111,'This is a summary')
,(2,111112,'This is Summary 2');

Select A.CaseID
      ,A.CaseNumber
      ,B.RetSeq   -- position of the word within the summary (1-based)
      ,B.RetVal   -- the word
From  @YourTable A
Cross Apply [dbo].[udf-Str-Parse](A.Summary,' ') B;
Returns
CaseID CaseNumber RetSeq RetVal
1 111111 1 This
1 111111 2 is
1 111111 3 a
1 111111 4 summary
2 111112 1 This
2 111112 2 is
2 111112 3 Summary
2 111112 4 2
The UDF if Interested
-- Splits @String on @Delimiter and returns one row per element.
--   @String    : text to split
--   @Delimiter : separator to split on
-- Result columns:
--   RetSeq     : 1-based sequence number of the element
--   RetVal     : trimmed element text
-- How it works: the delimiter is first swapped for the marker '§§Split§§', the
-- string is XML-encoded through FOR XML PATH(''), and the marker is then turned
-- into '</x><x>' so that every element becomes its own <x> node. Encoding the
-- text before building the XML is what keeps characters like '<' and '&' safe.
-- T-SQL parameters must be named with a leading '@'.
CREATE FUNCTION [dbo].[udf-Str-Parse] (@String varchar(max),@Delimiter varchar(10))
Returns Table
As
Return (
    Select RetSeq = Row_Number() over (Order By (Select null))
          ,RetVal = LTrim(RTrim(B.i.value('(./text())[1]', 'varchar(max)')))
    From  (Select x = Cast('<x>' + replace((Select replace(@String,@Delimiter,'§§Split§§') as [*] For XML Path('')),'§§Split§§','</x><x>')+'</x>' as xml).query('.')) as A
    Cross Apply x.nodes('x') AS B(i)
);
--Thanks Shnugo for making this XML safe
--Select * from [dbo].[udf-Str-Parse]('Dog,Cat,House,Car',',')
--Select * from [dbo].[udf-Str-Parse]('John Cappelletti was here',' ')
--Select * from [dbo].[udf-Str-Parse]('this,is,<test>,for,< & >',',')
EDIT - Another Parse/Split Function
The following TVF is slightly faster than the XML version, but limited to 8K. For example, on 5,000 sample records, with an average of 36 "words", it was 20ms faster than the XML version.
-- Tally-table splitter: splits @String on @Delimiter (which may be longer than
-- one character) and returns one row per element.
--   @String    : text to split
--   @Delimiter : separator to split on
-- Result columns:
--   RetSeq     : 1-based sequence number of the element, in order of appearance
--   RetVal     : trimmed element text
-- CTEs:
--   cte1 : 10 constant rows
--   cte2 : numbers 1..DataLength(@String), drawn from cte1 cross joined four
--          times (at most 10,000 numbers)
--   cte3 : start position of every element (1, plus each position that follows
--          a delimiter)
--   cte4 : start position and length of every element; the last element has
--          no delimiter after it and gets length 8000
-- T-SQL parameters must be named with a leading '@'.
CREATE FUNCTION [dbo].[udf-Str-Parse-8K] (@String varchar(max),@Delimiter varchar(25))
Returns Table
As
Return (
    with cte1(N) As (Select 1 From (Values(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) N(N)),
         cte2(N) As (Select Top (IsNull(DataLength(@String),0)) Row_Number() over (Order By (Select NULL))
                     From (Select N=1 From cte1 a Cross Join cte1 b Cross Join cte1 c Cross Join cte1 d) A ),
         cte3(N) As (Select 1 Union All Select t.N+DataLength(@Delimiter) From cte2 t Where Substring(@String,t.N,DataLength(@Delimiter)) = @Delimiter),
         cte4(N,L) As (Select S.N,IsNull(NullIf(CharIndex(@Delimiter,@String,s.N),0)-S.N,8000) From cte3 S)
    Select RetSeq = Row_Number() over (Order By A.N)
          ,RetVal = LTrim(RTrim(Substring(@String, A.N, A.L)))
    From  cte4 A
);
--Orginal Source http://www.sqlservercentral.com/articles/Tally+Table/72993/
--Select * from [dbo].[udf-Str-Parse-8K]('Dog,Cat,House,Car',',')
--Select * from [dbo].[udf-Str-Parse-8K]('John||Cappelletti||was||here','||')