SSMS replace all commas outside of quotation marks in string - sql

I've written the following function in SSMS to replace any commas that are outside of quotation marks with ||||:
CREATE FUNCTION dbo.fixqualifier (#string nvarchar(max))
returns nvarchar(max)
as begin
DECLARE #STRINGTOPAD NVARCHAR(MAX)
DECLARE #position int = 1,#newstring nvarchar(max) ='',#QUOTATIONMODE INT = 0
WHILE(LEN(#string)>0)
BEGIN
SET #STRINGTOPAD = SUBSTRING(#string,0,IIF(#STRING LIKE '%"%',CHARINDEX('"',#string),LEN(#STRING)))
SET #newstring = #newstring + IIF(#QUOTATIONMODE = 1, REPLACE(#STRINGTOPAD,',','||||'),#STRINGTOPAD)
SET #QUOTATIONMODE = IIF(#QUOTATIONMODE = 1,0,1)
set #string = SUBSTRING(#string,1+IIF(#STRING LIKE '%"%',CHARINDEX('"',#string),LEN(#STRING)),LEN(#string))
END
return #newstring
end
The idea is for the function to find the first ", replace all ',' before that then switch to quotation mode 1 so it knows to not replace the , until it changes back to quotation mode 0 when it hits the 2nd " and so on.
so for example the string:
qwer,tyu,io,asd,"edffs,asdfgh","jjkzx",kl
would become:
qwer||||tyu||||io||||asd||||"edffs,asdfgh"||||"jjkzx"||||kl
It works as expected but it's really inefficient when it comes to doing this for several thousand rows.
Is there a better way or doing this or at least speeding the function up.

Do a simple trick by Modulus
DECLARE #VAR VARCHAR(100) = 'qwer,tyu,io,asd,"edffs,asdfgh","jjkzx",kl'
,#OUTPUT VARCHAR(100) = '';
SELECT #OUTPUT = #OUTPUT + CASE WHEN (LEN(#OUTPUT) - LEN(REPLACE(#OUTPUT, '"', ''))) % 2 = 0
THEN REPLACE(VAL, ',', '||||') ELSE VAL END
FROM (
SELECT SUBSTRING(#VAR, NUMBER, 1) VAL
FROM master.dbo.spt_values
WHERE type = 'P'
AND NUMBER BETWEEN 1 AND LEN(#VAR)
) A
PRINT #OUTPUT
Result:
qwer||||tyu||||io||||asd||||"edffs,asdfgh"||||"jjkzx"||||kl
By this LEN(#OUTPUT) - LEN(REPLACE(#OUTPUT, '"', '')) expression, you will get count of ". By taking Modulus of the count %2, if it is zero its even then you can replace commas, otherwise you will keep them.

This uses DelimitedSplit8k and completely avoids any RBAR methods (such as a WHILE or #Variable = #Variable +... (which is a hidden form of RBAR)).
It firstly splits on the quotation, and then on the commas, where the string isn't quoted. Finally it then puts the strings back together again, using the "old" STUFF and FOR XML PATH method:
USE Sandbox;
DECLARE #String varchar(8000) = 'qwer,tyu,io,asd,"edffs,asdfgh","jjkzx",kl';
WITH Splits AS(
SELECT QS.ItemNumber AS QuoteNumber, CS.ItemNumber AS CommaNumber, ISNULL(CS.Item, '"' + QS.Item + '"') AS DelimitedItem
FROM dbo.DelimitedSplit8K(#string,'"') QS
OUTER APPLY (SELECT *
FROM dbo.DelimitedSplit8K(QS.Item,',')
WHERE QS.ItemNumber % 2 = 1) CS
WHERE QS.Item <> ',')
SELECT STUFF((SELECT '||||' + S.DelimitedItem
FROM Splits S
ORDER BY S.QuoteNumber, S.CommaNumber
FOR XML PATH('')),1,1,'') AS DelimitedList;
(Note, DelimitedSplit8K does not accept more than 8,000 characters. If you have more than that, SQL Server is really not the right tool. STRING_SPLIT does not provide the ordinal position, so you would be unable to guarantee the rebuild order with it.)

Related

Function to replace all non alpha-numeric and multiple whitespace characters with a single space

I am trying to write an efficient function to use in a calculated field which has the following characteristics
Replace all non alpha numeric characters with space
Replace multiple white spaces with a space
Trim and lower the results
Example input
A B##%$$C &^%D
Example output
a b c d
A normal regex pattern would match like so
[\W_]+
The following works, however I am not sure if there is a more efficient approach than using 2 loops ( O(n2) complexity at least) with PatIndex and Stuff, charindex and replace
Create Function [dbo].[Clean](#Temp nvarchar(1000))
Returns nvarchar(1000)
AS
Begin
Declare #Pattern as varchar(50) = '%[^a-z0-9 ]%'
While PatIndex(#Pattern, #Temp) > 0
Set #Temp = Stuff(#Temp, PatIndex(#Pattern, #Temp), 1, ' ')
while charindex(' ',#Temp ) > 0
set #Temp = replace(#Temp, ' ', ' ')
Return LOWER(TRIM(#Temp))
End
Usage
Select dbo.Clean(' A B##%$$C &^%D ')
Result
a b c d
Is there potentially a single pass approach I can use, or a sneaky method I am not aware of?
I'm not able to test the performance, but the following approach (without loops and based on some string manipulations) is an additional option.
Note, that you'll need at least SQL Server 2017 (for the TRANSLATE() call).
-- Input text and patterns
DECLARE #text varchar(1000) = ' A B##%$$C &^%D'
DECLARE #alphanumericpattern varchar(36) = 'abcdefghijklmnopqrstuvwxyz0123456789'
DECLARE #notalphanumericpattern varchar(1000)
-- Trim and lower the input text
SELECT #text = RTRIM(LTRIM(LOWER(#text)))
-- Get not alpha-numeric characters
SELECT #notalphanumericpattern =
REPLACE(
TRANSLATE(#text, #alphanumericpattern, REPLICATE('a', LEN(#alphanumericpattern))),
'a',
''
)
-- Replace all not alpha-numeric characters with a space
SELECT #text =
REPLACE(
TRANSLATE(#text, #notalphanumericpattern, REPLICATE('$', LEN(#notalphanumericpattern))),
'$',
' '
)
-- Replace multiple spaces with a single space
SELECT #text =
REPLACE(
REPLACE(
REPLACE(
#text,
' ',
'<>'
),
'><',
''
),
'<>',
' '
)
Result:
a b c d

How to identify and redact all instances of a matching pattern in T-SQL

I have a requirement to run a function over certain fields to identify and redact any numbers which are 5 digits or longer, ensuring all but the last 4 digits are replaced with *
For example: "Some text with 12345 and 1234 and 12345678" would become "Some text with *2345 and 1234 and ****5678"
I've used PATINDEX to identify the the starting character of the pattern:
PATINDEX('%[0-9][0-9][0-9][0-9][0-9]%', TEST_TEXT)
I can recursively call that to get the starting character of all the occurrences, but I'm struggling with the actual redaction.
Does anyone have any pointers on how this can be done? I know to use REPLACE to insert the *s where they need to be, it's just the identification of what I should actually be replacing I'm struggling with.
Could do it on a program, but I need it to be T-SQL (can be a function if needed).
Any tips greatly appreciated!
You can do this using the built in functions of SQL Server. All of which used in this example are present in SQL Server 2008 and higher.
DECLARE #String VARCHAR(500) = 'Example Input: 1234567890, 1234, 12345, 123456, 1234567, 123asd456'
DECLARE #StartPos INT = 1, #EndPos INT = 1;
DECLARE #Input VARCHAR(500) = ISNULL(#String, '') + ' '; --Sets input field and adds a control character at the end to make the loop easier.
DECLARE #OutputString VARCHAR(500) = ''; --Initalize an empty string to avoid string null errors
WHILE (#StartPOS <> 0)
BEGIN
SET #StartPOS = PATINDEX('%[0-9][0-9][0-9][0-9][0-9]%', #Input);
IF #StartPOS <> 0
BEGIN
SET #OutputString += SUBSTRING(#Input, 1, #StartPOS - 1); --Seperate all contents before the first occurance of our filter
SET #Input = SUBSTRING(#Input, #StartPOS, 500); --Cut the entire string to the end. Last value must be greater than the original string length to simply cut it all.
SET #EndPos = (PATINDEX('%[0-9][0-9][0-9][0-9][^0-9]%', #Input)); --First occurance of 4 numbers with a not number behind it.
SET #Input = STUFF(#Input, 1, (#EndPos - 1), REPLICATE('*', (#EndPos - 1))); --#EndPos - 1 gives us the amount of chars we want to replace.
END
END
SET #OutputString += #Input; --Append the last element
SET #OutputString = LEFT(#OutputString, LEN(#OutputString))
SELECT #OutputString;
Which outputs the following:
Example Input: ******7890, 1234, *2345, **3456, ***4567, 123asd456
This entire code could also be made as a function since it only requires an input text.
A dirty solution with recursive CTE
DECLARE
#tags nvarchar(max) = N'Some text with 12345 and 1234 and 12345678',
#c nchar(1) = N' ';
;
WITH Process (s, i)
as
(
SELECT #tags, PATINDEX('%[0-9][0-9][0-9][0-9][0-9]%', #tags)
UNION ALL
SELECT value, PATINDEX('%[0-9][0-9][0-9][0-9][0-9]%', value)
FROM
(SELECT SUBSTRING(s,0,i)+'*'+SUBSTRING(s,i+4,len(s)) value
FROM Process
WHERE i >0) calc
-- we surround the value and the string with leading/trailing ,
-- so that cloth isn't a false positive for clothing
)
SELECT * FROM Process
WHERE i=0
I think a better solution it's to add clr function in Ms SQL Server to manage regexp.
sql-clr/RegEx
Here is an option using the DelimitedSplit8K_LEAD which can be found here. https://www.sqlservercentral.com/articles/reaping-the-benefits-of-the-window-functions-in-t-sql-2 This is an extension of Jeff Moden's splitter that is even a little bit faster than the original. The big advantage this splitter has over most of the others is that it returns the ordinal position of each element. One caveat to this is that I am using a space to split on based on your sample data. If you had numbers crammed in the middle of other characters this will ignore them. That may be good or bad depending on you specific requirements.
declare #Something varchar(100) = 'Some text with 12345 and 1234 and 12345678';
with MyCTE as
(
select x.ItemNumber
, Result = isnull(case when TRY_CONVERT(bigint, x.Item) is not null then isnull(replicate('*', len(convert(varchar(20), TRY_CONVERT(bigint, x.Item))) - 4), '') + right(convert(varchar(20), TRY_CONVERT(bigint, x.Item)), 4) end, x.Item)
from dbo.DelimitedSplit8K_LEAD(#Something, ' ') x
)
select Output = stuff((select ' ' + Result
from MyCTE
order by ItemNumber
FOR XML PATH('')), 1, 1, '')
This produces: Some text with *2345 and 1234 and ****5678

T-SQL How to create function that compares string, checks difference, and do special function

First - sorry for my english. Second - i'm learning t-SQL.
Goal:
I want to get difference between two strings, then check in which column is this difference. If the difference is in first column, do something, if in second column - do something else.
What I'm actually doing
Column 'messages' is a string which contains list of ID. So i am replacing all '#' with ',' and deleting last ',' what gives to me ActualID and BeforeID column. See below:
DECLARE #string VARCHAR(512);
DECLARE #string2 VARCHAR(512);
DECLARE #string3 VARCHAR(512);
SET #string = '41#42#43#44#45#46#47#48#49#50#51#52#53#54#55#56#57#58#59#';
SET #string2 = REPLACE((SELECT messages FROM USERS WHERE userid = 4), '#', ', ' )
SET #string3 = left(#string2, len(#string2) - 1);
SET #string2 = REPLACE(#string, '#', ', ' )
SET #string = left(#string2, len(#string2) - 1);
SELECT #string3 as ActualID, #string as BeforeID
So now, I want compare BeforeID with ActualID. For example:
In BeforeID we have 1, 2, 3 / In ActualID 1, 2, 3, 4
In example above 4 was added. So, if it was added I want to add it to #AddedElements.
If 4, 5, 7 were added then SELECT #AddedElements as AddedElements should return 4, 5, 7 (With comas)
But, that's not all.
If BeforeID = 1, 5, 10, 14 and ActualID = 1, 5, 14 I want, that element which is in BeforeID, but not in AcutalID will be added to #DeletedElements.
So SELECT #DeletedElements as DeletedElements should return 10
Added elements/Deleted elements should be returned once. I mean, full result what I want to Earn should be
SELECT #AddedElements as AddedElements, #DeletedElements as DeletedElements
Is it possible? If, then how to do it?
First of all, I have to start by saying that this is just poor design; but having said that, I've also found myself in all kinds of situations where I couldn't change the way things worked, only try to make them work better in the current configuration. Therefore, I recommend something like this:
1: Create a UDF (User-Defined Function) that can handle splitting the strings and returning them in table-formed data that you can work with:
CREATE FUNCTION [dbo].[UDF_StringDelimiter]
/*********************************************************
** Takes Parameter "LIST" and transforms it for use **
** to select individual values or ranges of values. **
** **
** EX: 'This,is,a,test' = 'This' 'Is' 'A' 'Test' **
*********************************************************/
(
#LIST VARCHAR(8000)
,#DELIMITER VARCHAR(255)
)
RETURNS #TABLE TABLE
(
[RowID] INT IDENTITY
,[Value] VARCHAR(255)
)
WITH SCHEMABINDING
AS
BEGIN
DECLARE
#LISTLENGTH AS SMALLINT
,#LISTCURSOR AS SMALLINT
,#VALUE AS VARCHAR(255)
;
SELECT
#LISTLENGTH = LEN(#LIST) - LEN(REPLACE(#LIST,#DELIMITER,'')) + 1
,#LISTCURSOR = 1
,#VALUE = ''
;
WHILE #LISTCURSOR <= #LISTLENGTH
BEGIN
INSERT INTO #TABLE (Value)
SELECT
CASE
WHEN #LISTCURSOR < #LISTLENGTH
THEN SUBSTRING(#LIST,1,PATINDEX('%' + #DELIMITER + '%',#LIST) - 1)
ELSE SUBSTRING(#LIST,1,LEN(#LIST))
END
;
SET #LIST = STUFF(#LIST,1,PATINDEX('%' + #DELIMITER + '%',#LIST),'')
;
SET #LISTCURSOR = #LISTCURSOR + 1
;
END
;
RETURN
;
END
;
2: Consider dropping the whole "Switching out commas" thing, because it's pointless - the function I've written here takes two arguments: The string itself, and the delimiter (the mini-string that separates the individual strings within the big string, in your case '#') Then you just have to do a couple of quick comparisons to find out what was added and what was deleted.
DECLARE
#AddedElements VARCHAR(255) = ''
,#DeletedElements VARCHAR(255) = ''
,#ActualID VARCHAR(255) = '41#42#43#44#45#46#47#48#49#50#51#52#53#54#55#56#57#58#59#'
,#BeforeID VARCHAR(255) = '41#42#43#44#45#46#47#48#50#51#52#53#54#55#56#57#58#59#60#'
;
SET #AddedElements = #AddedElements +
SUBSTRING(
(
SELECT ', ' + Value
FROM dbo.UDF_StringDelimiter(#ActualID,'#')
WHERE Value NOT IN
(
SELECT Value
FROM dbo.UDF_StringDelimiter(#BeforeID,'#')
)
GROUP BY ', ' + Value
FOR XML PATH('')
)
,3,255)
;
SET #DeletedElements = #DeletedElements +
SUBSTRING(
(
SELECT ', ' + Value
FROM dbo.UDF_StringDelimiter(#BeforeID,'#')
WHERE Value NOT IN
(
SELECT Value
FROM dbo.UDF_StringDelimiter(#ActualID,'#')
)
GROUP BY ', ' + Value
FOR XML PATH('')
)
,3,255)
;
SELECT #AddedElements AS AddedElements,#DeletedElements AS DeletedElements
;
Using this method, if you add a value to #ActualID that does not exist in #BeforeID, it will show up in #AddedElements.
Likewise , if you remove an element from #ActualID that had previously existed in #BeforeID, it will show up in #DeletedElements.
All of this is, of course, assuming that the dynamic string (the one really being compared here) is the #ActualID. I operated with the understanding that #BeforeID is actually a stored value in the DB, and #ActualID is a dynamic string being passed in from...somewhere. If this is wrong, update me and I'll change the tactic appropriately.
Quick note: It's important to me to point out that this is just one way of dealing with a situation like this, and I'm sure there are better ways; but with the information I have, it's the best I could come up with without spending too much time and energy on it.

Remove only leading or trailing carriage returns

I'm dumbfounded that this question has not been asked meaningfully already. How does one go about creating an equivalent function in SQL like LTRIM or RTRIM for carriage returns and line feeds ONLY at the start or end of a string.
Obviously REPLACE(REPLACE(#MyString,char(10),''),char(13),'') removes ALL carriage returns and new line feeds. Which is NOT what I'm looking for. I just want to remove leading or trailing ones.
Find the first character that is not CHAR(13) or CHAR(10) and subtract its position from the string's length.
LTRIM()
SELECT RIGHT(#MyString,LEN(#MyString)-PATINDEX('%[^'+CHAR(13)+CHAR(10)+']%',#MyString)+1)
RTRIM()
SELECT LEFT(#MyString,LEN(#MyString)-PATINDEX('%[^'+CHAR(13)+CHAR(10)+']%',REVERSE(#MyString))+1)
Following functions are enhanced types of trim functions you can use. Copied from sqlauthority.com
These functions remove trailing spaces, leading spaces, white space, tabs, carriage returns, line feeds etc.
Trim Left
CREATE FUNCTION dbo.LTrimX(#str VARCHAR(MAX)) RETURNS VARCHAR(MAX)
AS
BEGIN
DECLARE #trimchars VARCHAR(10)
SET #trimchars = CHAR(9)+CHAR(10)+CHAR(13)+CHAR(32)
IF #str LIKE '[' + #trimchars + ']%' SET #str = SUBSTRING(#str, PATINDEX('%[^' + #trimchars + ']%', #str), LEN(#str))
RETURN #str
END
Trim Right
CREATE FUNCTION dbo.RTrimX(#str VARCHAR(MAX)) RETURNS VARCHAR(MAX)
AS
BEGIN
DECLARE #trimchars VARCHAR(10)
SET #trimchars = CHAR(9)+CHAR(10)+CHAR(13)+CHAR(32)
IF #str LIKE '%[' + #trimchars + ']'
SET #str = REVERSE(dbo.LTrimX(REVERSE(#str)))
RETURN #str
END
Trim both Left and Right
CREATE FUNCTION dbo.TrimX(#str VARCHAR(MAX)) RETURNS VARCHAR(MAX)
AS
BEGIN
RETURN dbo.LTrimX(dbo.RTrimX(#str))
END
Using function
SELECT dbo.TRIMX(#MyString)
If you do use these functions you might also consider changing from varchar to nvarchar to support more encodings.
In SQL Server 2017 you can use the TRIM function to remove specific characters from beginning and end, in one go:
WITH testdata(str) AS (
SELECT CHAR(13) + CHAR(10) + ' test ' + CHAR(13) + CHAR(10)
)
SELECT
str,
TRIM(CHAR(13) + CHAR(10) + CHAR(9) + ' ' FROM str) AS [trim cr/lf/tab/space],
TRIM(CHAR(13) + CHAR(10) FROM str) AS [trim cr/lf],
TRIM(' ' FROM str) AS [trim space]
FROM testdata
Result:
Note that the last example (trim space) does nothing as expected since the spaces are in the middle.
Here's an example you may run:
I decided to cast the results as an Xml value, so when you click on it, you will be able to view the Carriage Returns.
DECLARE #CRLF Char(2) = (CHAR(0x0D) + CHAR(0x0A))
DECLARE #String VarChar(MAX) = #CRLF + #CRLF + ' Hello' + #CRLF + 'World ' + #CRLF + #CRLF
--Unmodified String:
SELECT CAST(#String as Xml)[Unmodified]
--Remove Trailing Whitespace (including Spaces).
SELECT CAST(LEFT(#String, LEN(REPLACE(#String, #CRLF, ' '))) as Xml)[RemoveTrailingWhitespace]
--Remove Leading Whitespace (including Spaces).
SELECT CAST(RIGHT(#String, LEN(REVERSE(REPLACE(#String, #CRLF, ' ')))) as Xml)[RemoveLeadingWhitespace]
--Remove Leading & Trailing Whitespace (including Spaces).
SELECT CAST(SUBSTRING(#String, LEN(REPLACE(#String, ' ', '_')) - LEN(REVERSE(REPLACE(#String, #CRLF, ' '))) + 1, LEN(LTRIM(RTRIM(REPLACE(#String, #CRLF, ' '))))) as Xml)[RemoveAllWhitespace]
--Remove Only Leading and Trailing CR/LF's (while still preserving all other Whitespace - including Spaces). - 04/06/2016 - MCR.
SELECT CAST(SUBSTRING(#String, PATINDEX('%[^'+CHAR(13)+CHAR(10)+']%',#String), LEN(REPLACE(#String, ' ', '_')) - PATINDEX('%[^'+CHAR(13)+CHAR(10)+']%',#String) + 1 - PATINDEX('%[^'+CHAR(13)+CHAR(10)+']%', REVERSE(#String)) + 1) as Xml)[RemoveLeadingAndTrailingCRLFsOnly]
Remember to remove the Cast-to-Xml, as this was done just as a Proof-of-Concept to show it works.
How is this better than the currently Accepted Answer?
At first glance this may appear to use more Functions than the Accepted Answer.
However, this is not the case.
If you combine both approaches listed in the Accepted Answer (to remove both Trailing and Leading whitespace), you will either have to make two passes updating the Record, or copy all of one Logic into the other (everywhere #String is listed), which would cause way more function calls and become even more difficult to read.
I was stuck using Microsoft SQL Server 2008 R2 and so basing my functions on #sqluser's answer I came up with the below. This will return an empty string if the string only contains the characters to be trimmed.
The bit that threw me was the pattern for PATINDEX must be included between % characters, which for a while I was thinking of as the same wildcard in a LIKE statement but which I now believe is just the syntax to denote a pattern, though I may be wrong!
CREATE FUNCTION [dbo].[ExtendedLTRIM](#string_to_trim VARCHAR(MAX))
RETURNS VARCHAR(MAX)
AS
BEGIN
DECLARE #tab CHAR(1) = CHAR(9);
DECLARE #line_feed CHAR(1) = CHAR(10);
DECLARE #carriage_return CHAR(1) = CHAR(13);
DECLARE #space CHAR(1) = CHAR(32);
DECLARE #characters_to_trim VARCHAR(10)
SET #characters_to_trim = #tab + #line_feed + #carriage_return + #space
IF #string_to_trim LIKE '[' + #characters_to_trim + ']%'
BEGIN
DECLARE #first_non_trim_character INT = PATINDEX('%[^' + #characters_to_trim + ']%', #string_to_trim);
IF #first_non_trim_character = 0 RETURN '';
RETURN SUBSTRING(#string_to_trim, #first_non_trim_character, 8000)
END
RETURN #string_to_trim
END
GO
To trim characters from a pre-defined list you'll want to create the following UDF (should work in 2008R2 and above).
Handles both sides in a single pass and doesn't care if it's a CRLF, LFCR (yep, seen that abomination more than once), bare LF or a bunch of spaces.
is easy to extend to e.g. add additional parameters to do LTRIM/RTRIM only, or a full purge (that last bit is simpler to do in 2017 by incorporating STRING_AGG, but perfectly doable in 2008R2); as a matter of fact this is a simplified version of something I use to do all those things. If anybody is interested then let me know and I'll update:
CREATE FUNCTION fnTrimHarder
(
#String VARCHAR(MAX)
)
RETURNS VARCHAR(MAX)
AS
BEGIN
DECLARE
#Start INT,
#Len INT,
#Chars CHAR(5) = CONCAT(
CHAR(9), -- TAB
CHAR(10), -- LF
CHAR(13), -- CR
' '
), -- List of invalid characters
#Return VARCHAR(MAX) = '';
IF #String NOT LIKE '%[^' + #Chars + ']%' -- If string contains only invalid characters
OR COALESCE(#String, '') = '' -- Optional addition for NULL handling
RETURN #Return
ELSE
BEGIN -- Create a "table" of characters with ordinals, calculate the start of string and its length, then return the substring
WITH CTE AS (
SELECT 1 AS n
UNION ALL
SELECT n + 1
FROM CTE
WHERE n < LEN(#String)
)
SELECT
#Start = MIN(n),
#Len = 1 + MAX(n) - MIN(n)
FROM CTE
WHERE SUBSTRING(#String, n, 1) NOT LIKE '[' + #Chars + ']';
SET #Return = SUBSTRING(#String, #Start, #Len)
END
RETURN #Return
END
GO

Extract a number from String in SQL

I have the following string:
"FLEETWOOD DESIGNS 535353110XXXXX" (The X's are actually numbers I just wanted to hide them here)
Does anyone know how can I search through Strings in SQL and extract numbers that are greater then lets say 10 characters long?
This a quite old post but might help anyone else. I was searching for an user defined function in SQL Server to extract only the numbers of a given string, and, surprisingly I could not find exactly what I was looking for.
Let me put here the code of a function to "Extract a number from string in SQL" (valid for SQL Server). This is taken from the fantastic blog of Pinal Dave, I've modified it just to return NULL is a NULL value is passed to the function.
CREATE FUNCTION [dbo].[ExtractInteger](#String VARCHAR(2000))
RETURNS VARCHAR(1000)
AS
BEGIN
DECLARE #Count INT
DECLARE #IntNumbers VARCHAR(1000)
SET #Count = 0
SET #IntNumbers = ''
IF #String IS NULL
RETURN NULL;
WHILE #Count <= LEN(#String)
BEGIN
IF SUBSTRING(#String,#Count,1) >= '0' AND SUBSTRING(#String,#Count,1) <= '9'
BEGIN
SET #IntNumbers = #IntNumbers + SUBSTRING(#String,#Count,1)
END
SET #Count = #Count + 1
END
RETURN #IntNumbers
END
Tests
select '"' + dbo.ExtractInteger('1a2b3c4d5e6f7g8h9i') + '"'
GO
select '"' + dbo.ExtractInteger('abcdefghi') + '"'
GO
select '"' + dbo.ExtractInteger(NULL) + '"'
GO
select '"' + dbo.ExtractInteger('') + '"'
GO
Results
"123456789"
""
NULL
""
You don't mention the DB engine, so we don't know what features are available...
If regexpressions are available then pattern like \d{10,} would match numbers with 10 or more digit.
In mySQL REGEXP can only return true or false (0 or 1) so you'd have to use some ugly hack like
SELECT
LEAST(
INSTR(field,'0'),
INSTR(field,'1'),
INSTR(field,'2'),
INSTR(field,'3'),
INSTR(field,'4'),
INSTR(field,'5'),
INSTR(field,'6'),
INSTR(field,'7'),
INSTR(field,'8'),
INSTR(field,'9')
) AS startPos,
REVERSE(field) AS backward,
LEAST(
INSTR(backward,'0'),
INSTR(backward,'1'),
INSTR(backward,'2'),
INSTR(backward,'3'),
INSTR(backward,'4'),
INSTR(backward,'5'),
INSTR(backward,'6'),
INSTR(backward,'7'),
INSTR(backward,'8'),
INSTR(backward,'9')
) AS endPos,
SUBSTRING(field, startPos, endPos - startPos + 1)
FROM tab
WHERE(field REGEXP '[0-9]{10,}')
but this isn't perfect - it would extract false substring for string like "ABC 9 A 1234567891", not to mention that it is probably so slooooow that it is faster to go througt data by hand.
SUBSTRING('FLEETWOOD DESIGNS 535353110XXXXX', 18, 32)
You could also use LEN() to get the length of the string itself. If you know the serial number length, you can just subtract that from the end index to get your start index of the substring.
It could be done like this
Declare #X varchar(100)
Select #X= 'Here is where15234Numbers'
--
Select #X= SubString(#X,PATINDEX('%[0-9]%',#X),Len(#X))
Select #X= SubString(#X,0,PATINDEX('%[^0-9]%',#X))
--// show result
Select #X