Generating an n-gram table with an SQL query - sql

I'm trying to implement a fuzzy search with JavaScript client side, to search a largish db (300 items roughly) of records contained in an SQL database. My constraint is that it is not possible to perform a live query on the database- I must generate "indexes" as flat files during a nightly batch job. And so, starting with a db that looks like this:
ID. NAME
1. The Rain Man
2. The Electric Slide
3. Transformers
I need to create within a single query something like this:
Trigram ID
the 1
the 2
he_ 1
he_ 2
e_r 1
_ra 1
rai 1
ain 1
in_ 1
n_m 1
_ma 1
man 1
e_e 2
_el 2
ele 2
lec 2
Etc etc, typos not withstanding. The rules here are that ''n' is the length of the strings in the first column, that only a-z and _ are valid characters, any other character being normalized to Lower case, or mapped to _, that a group by n-gram clause may be applied to the table. Thus, I would hope to gain a table that would allow me to quickly look up a particular n-gram and get a list of all the Ids of rows which contain that sequence. I'm not a clever enough SQL cookie to figure this problem out. Can you?

I created an T-SQL NGrams that works quite nicely; note the comments section for examples of how to use
CREATE FUNCTION dbo.nGrams8K
(
#string VARCHAR(8000),
#n TINYINT,
#pad BIT
)
/*
Created by: Alan Burstein
Created on: 3/10/2014
Updated on: 5/20/2014 changed the logic to use an "inline tally table"
9/10/2014 Added some more code examples in the comment section
9/30/2014 Added more code examples
10/27/2014 Small bug fix regarding padding
Use: Outputs a stream of tokens based on an input string.
Works just like mdq.nGrams; see http://msdn.microsoft.com/en-us/library/ff487027(v=sql.105).aspx.
n-gram defined:
In the fields of computational linguistics and probability,
an n-gram is a contiguous sequence of n items from a given
sequence of text or speech. The items can be phonemes, syllables,
letters, words or base pairs according to the application.
To better understand N-Grams see: http://en.wikipedia.org/wiki/N-gram
*/
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN
WITH
E1(n) AS (SELECT 1 FROM (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t(n)),
E2(n) AS (SELECT 1 FROM E1 a CROSS JOIN E1 b),
iTally(n) AS
(
SELECT TOP (LEN(#string)+#n) ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
FROM E2 a CROSS JOIN E2 b
),
NewString(NewString) AS
(
SELECT REPLICATE(CASE #pad WHEN 0 THEN '' ELSE ' ' END,#n-1)+#string+
REPLICATE(CASE #pad WHEN 0 THEN '' ELSE ' ' END,#n-1)
)
SELECT TOP ((#n)+LEN(#string))
n AS [sequence],
SUBSTRING(NewString,n,#n) AS token
FROM iTally
CROSS APPLY NewString
WHERE n < ((#n)+LEN(#string));
/*
------------------------------------------------------------
-- (1) Basic Use
-------------------------------------------------------------
;-- (A)basic "string to table":
SELECT [sequence], token
FROM dbo.nGrams8K('abcdefg',1,1);
-- (b) create "bi-grams" (pad bit off)
SELECT [sequence], token
FROM dbo.nGrams8K('abcdefg',2,0);
-- (c) create "tri-grams" (pad bit on)
SELECT [sequence], token
FROM dbo.nGrams8K('abcdefg',3,1);
-- (d) filter for only "tri-grams"
SELECT [sequence], token
FROM dbo.nGrams8K('abcdefg',3,1)
WHERE len(ltrim(token)) = 3;
-- note the query plan for each. The power is coming from an index
-- also note how many rows are produced: len(#string+(#n-1))
-- lastly, you can trim as needed when padding=1
------------------------------------------------------------
-- (2) With a variable
------------------------------------------------------------
-- note, in this example I am getting only the stuff that has three letters
DECLARE #string varchar(20) = 'abcdefg',
#tokenLen tinyint = 3;
SELECT [sequence], token
FROM dbo.nGrams8K('abcdefg',3,1)
WHERE len(ltrim(token)) = 3;
GO
------------------------------------------------------------
-- (3) An on-the-fly alphabet (this will come in handy in a moment)
------------------------------------------------------------
DECLARE #alphabet VARCHAR(26)='ABCDEFGHIJKLMNOPQRSTUVWXYZ';
SELECT [sequence], token
FROM dbo.nGrams8K(#alphabet,1,0);
GO
------------------------------------------------------------
-- (4) Character Count
------------------------------------------------------------
DECLARE #string VARCHAR(100)='The quick green fox jumps over the lazy dog and the lazy dog just laid there.',
#alphabet VARCHAR(26)='ABCDEFGHIJKLMNOPQRSTUVWXYZ';
SELECT a.token, COUNT(b.token) ttl
FROM dbo.nGrams8K(#alphabet,1,0) a
LEFT JOIN dbo.nGrams8K(#string,1,0) b ON a.token=b.token
GROUP BY a.token
ORDER BY a.token;
GO
------------------------------------------------------------
-- (5) Locate the start position of a search pattern
------------------------------------------------------------
;-- (A) note these queries:
DECLARE #string varchar(100)='THE QUICK Green FOX JUMPED OVER THE LAZY DOGS BACK';
-- (i)
SELECT * FROM dbo.nGrams8K(#string,1,0) a;
-- (ii) note this query:
SELECT * FROM dbo.nGrams8K(#string,1,0) a WHERE [token]=' ';
-- (B) and now the word count (#string included for presentation)
SELECT #string AS string,
count(*)+1 AS words
FROM dbo.nGrams8K(#string,1,0) a
WHERE [token]=' '
GO
------------------------------------------------------------
-- (6) search for the number of occurances of a word
------------------------------------------------------------
DECLARE #string VARCHAR(100)='The quick green fox jumps over the lazy dog and the lazy dog just laid there.',
#alphabet VARCHAR(26)='ABCDEFGHIJKLMNOPQRSTUVWXYZ',
#searchString VARCHAR(100)='The';
-- (5a) by location
SELECT sequence-(LEN(#searchstring)) AS location,
token AS searchString
FROM dbo.nGrams8K(#string,LEN(#searchstring+' ')+1,0) b
WHERE token=#searchString;
-- (2b) get total
SELECT #string AS string,
#searchString AS searchString,
COUNT(*) AS ttl
FROM dbo.nGrams8K(#string,LEN(#searchstring+' ')+1,0) b
WHERE token=#searchString;
------------------------------------------------------------
-- (7) Special SubstringBefore and SubstringAfter
------------------------------------------------------------
-- (7a) SubstringBeforeSSI (note: SSI = substringIndex)
ALTER FUNCTION dbo.SubstringBeforeSSI
(
#string varchar(1000),
#substring varchar(100),
#substring_index tinyint
)
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN
WITH get_pos AS
(
SELECT rn = row_number() over (order by sequence), substring_index = sequence
FROM dbo.nGrams8K(#string,len(#substring),1)
WHERE token=#substring
)
SELECT newstring = substring(#string,1,substring_index-len(#substring))
FROM get_pos
WHERE rn=#substring_index;
GO
DECLARE #string varchar(1000)='10.0.1600.22',
#searchPattern varchar(100)='.',
#substring_index tinyint = 3;
SELECT * FROM dbo.SubstringBeforeSSI(#string,#searchPattern,#substring_index);
GO
-- (7b) SubstringBeforeSSI (note: SSI = substringIndex)
ALTER FUNCTION dbo.SubstringAfterSSI
(
#string varchar(1000),
#substring varchar(100),
#substring_index tinyint
)
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN
WITH get_pos AS
(
SELECT rn = row_number() over (order by sequence), substring_index = sequence
FROM dbo.nGrams8K(#string,len(#substring),1)
WHERE token=#substring
)
SELECT newstring = substring(#string,substring_index+1,8000)
FROM get_pos
WHERE rn=#substring_index;
GO
DECLARE #string varchar(1000)='<notes id="1">blah, blah, blah</notes><notes id="2">More Notes</notes>',
#searchPattern varchar(100)='</notes>',
#substring_index tinyint = 1;
SELECT #string, *
FROM dbo.SubstringAfterSSI(#string,#searchPattern,#substring_index);
------------------------------------------------------------
-- (8) Strip non-numeric characters from a string
------------------------------------------------------------
-- (8a) create the function
ALTER FUNCTION StripNonNumeric_itvf(#OriginalText VARCHAR(8000))
RETURNS TABLE
--WITH SCHEMABINDING
AS
return
WITH ngrams AS
(
SELECT n = [sequence], c = token
FROM dbo.nGrams8K(#OriginalText,1,1)
),
clean_txt(CleanedText) AS
(
SELECT c+''
FROM ngrams
WHERE ascii(substring(#OriginalText,n,1)) BETWEEN 48 AND 57
FOR XML PATH('')
)
SELECT CleanedText
FROM clean_txt;
GO
-- (8b) use against a value or variable
SELECT CleanedText
FROM dbo.StripNonNumeric_itvf('value123');
-- (8c) use against a table
-- test harness:
IF OBJECT_ID('tempdb..#strings') IS NOT NULL DROP TABLE #strings;
WITH strings AS
(
SELECT TOP (100000) string = newid()
FROM sys.all_columns a CROSS JOIN sys.all_columns b
)
SELECT *
INTO #strings
FROM strings;
GO
-- query (returns 100K rows every 3 seconds on my pc):
SELECT CleanedText
FROM #strings
CROSS APPLY dbo.StripNonNumeric_itvf(string);
------------------------------------------------------------
-- (9) A couple complex String Algorithms
------------------------------------------------------------
-- (9a) hamming distance between two strings:
DECLARE #string1 varchar(8000) = 'xxxxyyyzzz',
#string2 varchar(8000) = 'xxxxyyzzzz';
SELECT string1 = #string1,
string2 = #string2,
hamming_distance = count(*)
FROM dbo.nGrams8K(#string1,1,0) s1
CROSS APPLY dbo.nGrams8K(#string2,1,0) s2
WHERE s1.sequence = s2.sequence
AND s1.token <> s2.token
GO
-- (9b) inner join between 2 strings
--(can be used to speed up other string metrics such as the longest common subsequence)
DECLARE #string1 varchar(100)='xxxx123yyyy456zzzz',
#string2 varchar(100)='xx789yy000zz';
WITH
s1(string1) AS
(
SELECT [token]+''
FROM dbo.nGrams8K(#string1,1,0)
WHERE charindex([token],#string2)<>0
ORDER BY [sequence]
FOR XML PATH('')
),
s2(string2) AS
(
SELECT [token]+''
FROM dbo.nGrams8K(#string2,1,0)
WHERE charindex([token],#string1)<>0
ORDER BY [sequence]
FOR XML PATH('')
)
SELECT string1, string2
FROM s1
CROSS APPLY s2;
------------------------------------------------------------
-- (10) Advanced Substring Metrics
------------------------------------------------------------
-- (10a) Identify common substrings and their location
DECLARE #string1 varchar(100) = 'xxx yyy zzz',
#string2 varchar(100) = 'xx yyy zz';
-- (i) review the two strings
SELECT str1 = #string1,
str2 = #string2;
-- (ii) the results
WITH
iTally AS
(
SELECT n
FROM dbo.tally t
WHERE n<= len(#string1)
),
distinct_tokens AS
(
SELECT ng1 = ng1.token, ng2 = ng2.token --= ltrim(ng1.token), ng2 = ltrim(ng2.token)
FROM itally
CROSS APPLY dbo.nGrams8K(#string1,n,1) ng1
CROSS APPLY dbo.nGrams8K(#string2,n,1) ng2
WHERE ng1.token=ng2.token
)
SELECT ss_txt = ng1,
ss_len = len(ng1),
str1_loc = charindex(ng1,#string1),
str2_loc = charindex(ng2,#string2)
FROM distinct_tokens
WHERE ng1<>'' AND charindex(ng1,#string1)+charindex(ng2,#string2)<>0
GROUP BY ng1, ng2
ORDER BY charindex(ng1,#string1), charindex(ng2,#string2), len(ng1);
-- (10b) Longest common substring function
-- (i) function
IF EXISTS
( SELECT * FROM INFORMATION_SCHEMA.ROUTINES
WHERE ROUTINE_SCHEMA='dbo' AND ROUTINE_NAME = 'lcss')
DROP FUNCTION dbo.lcss;
GO
CREATE FUNCTION dbo.lcss(#string1 varchar(100), #string2 varchar(100))
RETURNS TABLE
AS
RETURN
SELECT TOP (1) with ties token
FROM dbo.tally
CROSS APPLY dbo.nGrams8K(#string1,n,1)
WHERE n <= len(#string1)
AND charindex(token, #string2) > 0
ORDER BY len(token) DESC;
GO
-- (ii) example of use
DECLARE #string1 varchar(100) = '000xxxyyyzzz',
#string2 varchar(100) = '999xxyyyzaa';
SELECT string1 = #string1,
string2 = #string2,
token
FROM dbo.lcss(#string1, #string2);
*/
GO

You'd have to repeat this statement:
insert into trigram_table ( Trigram, ID )
select substr( translate( lower( Name ), ' ', '_' ), :X, :N ),
ID
from db_table
for all :X from 1 to Len(Name) + 1 - :N
You will also have to extend the translate function for all the other special characters you'd want to convert to an underscore. Right now it's just translating a blank into an underscore.
For performance you could do the translate and lower functions on the Trigram column in a last pass on the trigram_table so you're not doing those functions for each :X.

Related

T-SQL - Count unique characters in a variable

Goal: To count # of distinct characters in a variable the fastest way possible.
DECLARE #String1 NVARCHAR(4000) = N'1A^' ; --> output = 3
DECLARE #String2 NVARCHAR(4000) = N'11' ; --> output = 1
DECLARE #String3 NVARCHAR(4000) = N'*' ; --> output = 1
DECLARE #String4 NVARCHAR(4000) = N'*A-zz' ; --> output = 4
I've found some posts in regards to distinct characters in a column, grouped by characters, and etc, but not one for this scenario.
Using NGrams8K as a base, you can change the input parameter to a nvarchar(4000) and tweak the DATALENGTH, making NGramsN4K. Then you can use that to split the string into individual characters and count them:
SELECT COUNT(DISTINCT NG.token) AS DistinctCharacters
FROM dbo.NGramsN4k(#String1,1) NG;
Altered NGrams8K:
IF OBJECT_ID('dbo.NGramsN4k','IF') IS NOT NULL DROP FUNCTION dbo.NGramsN4k;
GO
CREATE FUNCTION dbo.NGramsN4k
(
#string nvarchar(4000), -- Input string
#N int -- requested token size
)
/****************************************************************************************
Purpose:
A character-level N-Grams function that outputs a contiguous stream of #N-sized tokens
based on an input string (#string). Accepts strings up to 8000 varchar characters long.
For more information about N-Grams see: http://en.wikipedia.org/wiki/N-gram.
Compatibility:
SQL Server 2008+, Azure SQL Database
Syntax:
--===== Autonomous
SELECT position, token FROM dbo.NGrams8k(#string,#N);
--===== Against a table using APPLY
SELECT s.SomeID, ng.position, ng.token
FROM dbo.SomeTable s
CROSS APPLY dbo.NGrams8K(s.SomeValue,#N) ng;
Parameters:
#string = The input string to split into tokens.
#N = The size of each token returned.
Returns:
Position = bigint; the position of the token in the input string
token = varchar(8000); a #N-sized character-level N-Gram token
Developer Notes:
1. NGrams8k is not case sensitive
2. Many functions that use NGrams8k will see a huge performance gain when the optimizer
creates a parallel execution plan. One way to get a parallel query plan (if the
optimizer does not chose one) is to use make_parallel by Adam Machanic which can be
found here:
sqlblog.com/blogs/adam_machanic/archive/2013/07/11/next-level-parallel-plan-porcing.aspx
3. When #N is less than 1 or greater than the datalength of the input string then no
tokens (rows) are returned. If either #string or #N are NULL no rows are returned.
This is a debatable topic but the thinking behind this decision is that: because you
can't split 'xxx' into 4-grams, you can't split a NULL value into unigrams and you
can't turn anything into NULL-grams, no rows should be returned.
For people who would prefer that a NULL input forces the function to return a single
NULL output you could add this code to the end of the function:
UNION ALL
SELECT 1, NULL
WHERE NOT(#N > 0 AND #N <= DATALENGTH(#string)) OR (#N IS NULL OR #string IS NULL)
4. NGrams8k can also be used as a tally table with the position column being your "N"
row. To do so use REPLICATE to create an imaginary string, then use NGrams8k to split
it into unigrams then only return the position column. NGrams8k will get you up to
8000 numbers. There will be no performance penalty for sorting by position in
ascending order but there is for sorting in descending order. To get the numbers in
descending order without forcing a sort in the query plan use the following formula:
N = <highest number>-position+1.
Pseudo Tally Table Examples:
--===== (1) Get the numbers 1 to 100 in ascending order:
SELECT N = position
FROM dbo.NGrams8k(REPLICATE(0,100),1);
--===== (2) Get the numbers 1 to 100 in descending order:
DECLARE #maxN int = 100;
SELECT N = #maxN-position+1
FROM dbo.NGrams8k(REPLICATE(0,#maxN),1)
ORDER BY position;
5. NGrams8k is deterministic. For more about deterministic functions see:
https://msdn.microsoft.com/en-us/library/ms178091.aspx
Usage Examples:
--===== Turn the string, 'abcd' into unigrams, bigrams and trigrams
SELECT position, token FROM dbo.NGrams8k('abcd',1); -- unigrams (#N=1)
SELECT position, token FROM dbo.NGrams8k('abcd',2); -- bigrams (#N=2)
SELECT position, token FROM dbo.NGrams8k('abcd',3); -- trigrams (#N=3)
--===== How many times the substring "AB" appears in each record
DECLARE #table TABLE(stringID int identity primary key, string varchar(100));
INSERT #table(string) VALUES ('AB123AB'),('123ABABAB'),('!AB!AB!'),('AB-AB-AB-AB-AB');
SELECT string, occurances = COUNT(*)
FROM #table t
CROSS APPLY dbo.NGrams8k(t.string,2) ng
WHERE ng.token = 'AB'
GROUP BY string;
----------------------------------------------------------------------------------------
Revision History:
Rev 00 - 20140310 - Initial Development - Alan Burstein
Rev 01 - 20150522 - Removed DQS N-Grams functionality, improved iTally logic. Also Added
conversion to bigint in the TOP logic to remove implicit conversion
to bigint - Alan Burstein
Rev 03 - 20150909 - Added logic to only return values if #N is greater than 0 and less
than the length of #string. Updated comment section. - Alan Burstein
Rev 04 - 20151029 - Added ISNULL logic to the TOP clause for the #string and #N
parameters to prevent a NULL string or NULL #N from causing "an
improper value" being passed to the TOP clause. - Alan Burstein
****************************************************************************************/
RETURNS TABLE WITH SCHEMABINDING AS RETURN
WITH
L1(N) AS
(
SELECT 1
FROM (VALUES -- 90 NULL values used to create the CTE Tally table
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL)
) t(N)
),
iTally(N) AS -- my cte Tally table
(
SELECT TOP(ABS(CONVERT(BIGINT,((DATALENGTH(ISNULL(#string,N''))/2)-(ISNULL(#N,1)-1)),0)))
ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) -- Order by a constant to avoid a sort
FROM L1 a CROSS JOIN L1 b -- cartesian product for 8100 rows (90^2)
)
SELECT
position = N, -- position of the token in the string(s)
token = SUBSTRING(#string,CAST(N AS int),#N) -- the #N-Sized token
FROM iTally
WHERE #N > 0 AND #N <= (DATALENGTH(#string)/2); -- Protection against bad parameter values
Here is another alternative using the power of the tally table. It has been called the "Swiss Army Knife of T-SQL". I keep a tally table as a view on my system which makes it insanely fast.
create View [dbo].[cteTally] as
WITH
E1(N) AS (select 1 from (values (1),(1),(1),(1),(1),(1),(1),(1),(1),(1))dt(n)),
E2(N) AS (SELECT 1 FROM E1 a, E1 b), --10E+2 or 100 rows
E4(N) AS (SELECT 1 FROM E2 a, E2 b), --10E+4 or 10,000 rows max
cteTally(N) AS
(
SELECT ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
)
select N from cteTally
Now we can use that tally anytime we need it, like for this exercise.
declare #Something table
(
String1 nvarchar(4000)
)
insert #Something values
(N'1A^')
, (N'11')
, (N'*')
, (N'*A-zz')
select count(distinct substring(s.String1, t.N, 1))
, s.String1
from #Something s
join cteTally t on t.N <= len(s.String1)
group by s.String1
To be honest I don't know this would be any faster than Larnu's usage of NGrams but testing on a large table would be fun to see.
----- EDIT -----
Thanks to Shnugo for the idea. Using a cross apply to a correlated subquery here is actually quite an improvement.
select count(distinct substring(s.String1, A.N, 1))
, s.String1
from #Something s
CROSS APPLY (SELECT TOP(LEN(s.String1)) t.N FROM cteTally t) A(N)
group by s.String1
The reason this is so much faster is that this is no longer using a triangular join which can really be painfully slow. I did also switch out the view with an indexed physical tally table. The improvement there was noticeable on larger datasets but not nearly as big as using the cross apply.
If you want to read more about triangular joins and why we should avoid them Jeff Moden has a great article on the topic. https://www.sqlservercentral.com/articles/hidden-rbar-triangular-joins
Grab a copy of NGrams8k and you can do this:
DECLARE #String1 NVARCHAR(4000) = N'1A^' ; --> output = 3
DECLARE #String2 NVARCHAR(4000) = N'11' ; --> output = 1
DECLARE #String3 NVARCHAR(4000) = N'*' ; --> output = 1
DECLARE #String4 NVARCHAR(4000) = N'*A-zz' ; --> output = 4
SELECT s.String, Total = COUNT(DISTINCT ng.token)
FROM (VALUES(#String1),(#String2),(#String3),(#String4)) AS s(String)
CROSS APPLY dbo.NGrams8k(s.String,1) AS ng
GROUP BY s.String;
Returns:
String Total
-------- -----------
* 1
*A-zz 4
11 1
1A^ 3
UPDATED
Just a quick update based on #Larnu's post and comments. I did not notice that the OP was dealing with Unicode e.g. NVARCHAR. I created an NVARCHAR(4000) version here - similar to what #Larnu posted above. I just updated the return token to use Latin1_General_BIN collation.
SUBSTRING(#string COLLATE Latin1_General_BIN,CAST(N AS int),#N)
This returns the correct answer:
DECLARE #String5 NVARCHAR(4000) = N'ᡣᓡ'; --> output = 2
SELECT COUNT(DISTINCT ng.token)
FROM dbo.NGramsN4k(#String5,1) AS ng;
Without the collation in place you can use the what Larnu posted and get the right answer like this:
DECLARE #String5 NVARCHAR(4000) = N'ᡣᓡ'; --> output = 2
SELECT COUNT(DISTINCT UNICODE(ng.token))
FROM dbo.NGramsN4k(#String5,1) AS ng;
Here's my updated NGramsN4K function:
ALTER FUNCTION dbo.NGramsN4K
(
#string nvarchar(4000), -- Input string
#N int -- requested token size
)
/****************************************************************************************
Purpose:
A character-level N-Grams function that outputs a contiguous stream of #N-sized tokens
based on an input string (#string). Accepts strings up to 4000 nvarchar characters long.
For more information about N-Grams see: http://en.wikipedia.org/wiki/N-gram.
Compatibility:
SQL Server 2008+, Azure SQL Database
Syntax:
--===== Autonomous
SELECT position, token FROM dbo.NGramsN4K(#string,#N);
--===== Against a table using APPLY
SELECT s.SomeID, ng.position, ng.token
FROM dbo.SomeTable s
CROSS APPLY dbo.NGramsN4K(s.SomeValue,#N) ng;
Parameters:
#string = The input string to split into tokens.
#N = The size of each token returned.
Returns:
Position = bigint; the position of the token in the input string
token = nvarchar(4000); a #N-sized character-level N-Gram token
Developer Notes:
1. NGramsN4K is not case sensitive
2. Many functions that use NGramsN4K will see a huge performance gain when the optimizer
creates a parallel execution plan. One way to get a parallel query plan (if the
optimizer does not chose one) is to use make_parallel by Adam Machanic which can be
found here:
sqlblog.com/blogs/adam_machanic/archive/2013/07/11/next-level-parallel-plan-porcing.aspx
3. When #N is less than 1 or greater than the datalength of the input string then no
tokens (rows) are returned. If either #string or #N are NULL no rows are returned.
This is a debatable topic but the thinking behind this decision is that: because you
can't split 'xxx' into 4-grams, you can't split a NULL value into unigrams and you
can't turn anything into NULL-grams, no rows should be returned.
For people who would prefer that a NULL input forces the function to return a single
NULL output you could add this code to the end of the function:
UNION ALL
SELECT 1, NULL
WHERE NOT(#N > 0 AND #N <= DATALENGTH(#string)) OR (#N IS NULL OR #string IS NULL);
4. NGramsN4K is deterministic. For more about deterministic functions see:
https://msdn.microsoft.com/en-us/library/ms178091.aspx
Usage Examples:
--===== Turn the string, 'abcd' into unigrams, bigrams and trigrams
SELECT position, token FROM dbo.NGramsN4K('abcd',1); -- unigrams (#N=1)
SELECT position, token FROM dbo.NGramsN4K('abcd',2); -- bigrams (#N=2)
SELECT position, token FROM dbo.NGramsN4K('abcd',3); -- trigrams (#N=3)
--===== How many times the substring "AB" appears in each record
DECLARE #table TABLE(stringID int identity primary key, string nvarchar(100));
INSERT #table(string) VALUES ('AB123AB'),('123ABABAB'),('!AB!AB!'),('AB-AB-AB-AB-AB');
SELECT string, occurances = COUNT(*)
FROM #table t
CROSS APPLY dbo.NGramsN4K(t.string,2) ng
WHERE ng.token = 'AB'
GROUP BY string;
------------------------------------------------------------------------------------------
Revision History:
Rev 00 - 20170324 - Initial Development - Alan Burstein
Rev 01 - 20191108 - Added Latin1_General_BIN collation to token output - Alan Burstein
*****************************************************************************************/
RETURNS TABLE WITH SCHEMABINDING AS RETURN
WITH
L1(N) AS
(
SELECT 1 FROM (VALUES -- 64 dummy values to CROSS join for 4096 rows
($),($),($),($),($),($),($),($),($),($),($),($),($),($),($),($),
($),($),($),($),($),($),($),($),($),($),($),($),($),($),($),($),
($),($),($),($),($),($),($),($),($),($),($),($),($),($),($),($),
($),($),($),($),($),($),($),($),($),($),($),($),($),($),($),($)) t(N)
),
iTally(N) AS
(
SELECT
TOP (ABS(CONVERT(BIGINT,((DATALENGTH(ISNULL(#string,''))/2)-(ISNULL(#N,1)-1)),0)))
ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) -- Order by a constant to avoid a sort
FROM L1 a CROSS JOIN L1 b -- cartesian product for 4096 rows (16^2)
)
SELECT
position = N, -- position of the token in the string(s)
token = SUBSTRING(#string COLLATE Latin1_General_BIN,CAST(N AS int),#N) -- the #N-Sized token
FROM iTally
WHERE #N > 0 -- Protection against bad parameter values:
AND #N <= (ABS(CONVERT(BIGINT,((DATALENGTH(ISNULL(#string,''))/2)-(ISNULL(#N,1)-1)),0)));
You can do this natively in SQL Server using CTE and some string manipuation:
DECLARE #TestString NVARCHAR(4000);
SET #TestString = N'*A-zz';
WITH letters AS
(
SELECT 1 AS Pos,
#TestString AS Stri,
MAX(LEN(#TestString)) AS MaxPos,
SUBSTRING(#TestString, 1, 1) AS [Char]
UNION ALL
SELECT Pos + 1,
#TestString,
MaxPos,
SUBSTRING(#TestString, Pos + 1, 1) AS [Char]
FROM letters
WHERE Pos + 1 <= MaxPos
)
SELECT COUNT(*) AS LetterCount
FROM (
SELECT UPPER([Char]) AS [Char]
FROM letters
GROUP BY [Char]
) a
Example outputs:
SET #TestString = N'*A-zz';
{execute code}
LetterCount = 4
SET #TestString = N'1A^';
{execute code}
LetterCount = 3
SET #TestString = N'1';
{execute code}
LetterCount = 1
SET #TestString = N'*';
{execute code}
LetterCount = 1
CREATE TABLE #STRINGS(
STRING1 NVARCHAR(4000)
)
INSERT INTO #STRINGS (
STRING1
)
VALUES
(N'1A^'),(N'11'),(N'*'),(N'*A-zz')
;WITH CTE_T AS (
SELECT DISTINCT
S.STRING1
,SUBSTRING(S.STRING1, V.number + 1, 1) AS Val
FROM
#STRINGS S
INNER JOIN
[master]..spt_values V
ON V.number < LEN(S.STRING1)
WHERE
V.[type] = 'P'
)
SELECT
T.STRING1
,COUNT(1) AS CNT
FROM
CTE_T T
GROUP BY
T.STRING1

Sql Multiple Replace based on query

I have been trying to set up a SQL function to build descriptions with "tags". For example, I would want to start with a description:
"This is [length] ft. long and [height] ft. high"
And modify the description with data from a related table, to end up with:
"This is 75 ft. long and 20 ft. high"
I could do this easily with REPLACE functions if we had a set number of tags, but I want these tags to be user defined, and each description may or may not have specific tags in it. Would there be any better way to get this other than using a cursor to go through the string once for each available tag? Does SQL have any built in functionality to do a multiple replace? something like:
Replace(description,(select tag, replacement from tags))
I actually recommend doing this in application code. But, you can do it using a recursive CTE:
with t as (
select t.*, row_number() over (order by t.tag) as seqnum
from tags t
),
cte as (
select replace(#description, t.tag, t.replacement) as d, t.seqnum
from t
where seqnum = 1
union all
select replace(d, t.tag, t.replacement), t.seqnum
from cte join
t
on t.seqnum = cte.seqnum + 1
)
select top 1 cte.*
from cte
order by seqnum desc;
Try below query :
SELECT REPLACE(DESCRIPTION,'[length]',( SELECT replacement FROM tags WHERE tag
= '[length]') )
I agree with Gordon that this is best handled in your application code.
If for whatever reason that option is not available however, and if you don't want to use recursion as per Gordon's answer, you could use a tally table approach to swap out your values.
You will need to test the performance of the for xml being executed for each value though...
Assuming you have a table of Tag replacement values:
create table TagReplacementTable(Tag nvarchar(50), Replacement nvarchar(50));
insert into TagReplacementTable values('[test]',999)
,('[length]',75)
,('[height]',20)
,('[other length]',40)
,('[other height]',50);
You can create an inline table function that will work through your Descriptions and drop replace the necessary parts using TagReplacementTable as reference:
create function dbo.Tag_Replace(#str nvarchar(4000)
,#tagstart nvarchar(1)
,#tagend nvarchar(1)
)
returns table
as
return
(
with n(n) as (select n from (values(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) n(n))
-- Select the same number of rows as characters in #str as incremental row numbers.
-- Cross joins increase exponentially to a max possible 10,000 rows to cover largest #str length.
,t(t) as (select top (select len(#str) a) row_number() over (order by (select null)) from n n1,n n2,n n3,n n4)
-- Return the position of every value that starts or ends a part of the description.
-- This will be the first character (t='f'), the start of any tag (t='s') and the end of any tag (t='e').
,s(s,t) as (select 1, 'f'
union all select t+1, 's' from t where substring(#str,t,1) = #tagstart
union all select t+1, 'e' from t where substring(#str,t,1) = #tagend
)
-- Return the start and length of every value, to use in the SUBSTRING function.
-- ISNULL/NULLIF combo handles the last value where there is no delimiter at the end of the string.
-- Using the t value we can determine which CHARINDEX to look for.
,l(t,s,l) as (select t,s,isnull(nullif(charindex(case t when 'f' then #tagstart when 's' then #tagend when 'e' then #tagstart end,#str,s),0)-s,4000) from s)
-- Each element of the string is returned in an ordered list along with its t value.
-- Where this t value is 's' this means the value is a tag, so append the start and end identifiers and join to the TagReplacementTable.
-- Where no replacement is found, simply return the part of the Description.
-- Finally, concatenate into one string value.
select (select isnull(r.Replacement,k.Item)
from(select row_number() over(order by s) as ItemNumber
,case when l.t = 's' then '[' else '' end
+ substring(#str,s,l)
+ case when l.t = 's' then ']' else '' end as Item
,t
from l
) k
left join TagReplacementTable r
on(k.Item = r.Tag)
order by k.ItemNumber
for xml path('')
) as NewString
);
And then outer apply to the results of the function to do replacements on all your Description values:
declare #t table (Descr nvarchar(100));
insert into #t values('This is [length] ft. long and [height] ft. high'),('[test] This is [other length] ft. long and [other height] ft. high');
select *
from #t t
outer apply dbo.Tag_Replace(t.Descr,'[',']') r;
Output:
+--------------------------------------------------------------------+-----------------------------------------+
| Descr | NewString |
+--------------------------------------------------------------------+-----------------------------------------+
| This is [length] ft. long and [height] ft. high | This is 75 ft. long and 20 ft. high |
| [test] This is [other length] ft. long and [other height] ft. high | 999 This is 40 ft. long and 50 ft. high |
+--------------------------------------------------------------------+-----------------------------------------+
I would not iterate through an individual string, but instead run the update on the entire column of strings. I'm not sure if that was your intent but this would be much quicker than one string at a time.
Test Data:
Create TABLE #strs ( mystr VARCHAR(MAX) )
Create TABLE #rpls (i INT IDENTITY(1,1) NOT NULL, src VARCHAR(MAX) , Trg VARCHAR(MAX) )
INSERT INTO #strs
( mystr )
SELECT 'hello ##color## world'
UNION ALL SELECT 'see jack ##verboftheday##! ##verboftheday## Jack, ##verboftheday##!'
UNION ALL SELECT 'on ##Date##, the ##color## StockMarket was ##MarketDirection##!'
INSERT INTO #rpls ( src ,Trg )
SELECT '##Color##', 'Blue'
UNION SELECT ALL '##verboftheday##' , 'run'
UNION SELECT ALL '##Date##' , CONVERT(VARCHAR(MAX), GETDATE(), 9)
UNION SELECT ALL '##MarketDirection##' , 'UP'
then a loop like this:
DECLARE #i INTEGER = 0
DECLARE #count INTEGER
SELECT #count = COUNT(*)
FROM #rpls R
WHILE #i < #count
BEGIN
SELECT #i += 1
UPDATE #strs
SET mystr = REPLACE(mystr, ( SELECT R.src
FROM #rpls R
WHERE i = #i ), ( SELECT R.Trg
FROM #rpls R
WHERE i = #i ))
END
SELECT *
FROM #strs S
Yielding the following
hello Blue world
see jack run! run Jack, run!
on May 19 2017 9:48:02:390AM, the Blue StockMarket was UP!
I found someone wanting to do something similar here with a set number of options:
SELECT #target = REPLACE(#target, invalidChar, '-')
FROM (VALUES ('~'),(''''),('!'),('#'),('#')) AS T(invalidChar)
I could modify it as such:
declare #target as varchar(max) = 'This is [length] ft. long and [height] ft. high'
select #target = REPLACE(#target,'[' + tag + ']',replacement)
from tags
It then runs the replace once for every record returned in the select statement.
(I originally had added this to my question, but it sounds like it is better protocol to add it as a answer.)

How to get all the two characters long substrings separated by dot (.) from an email address in SQL server. I want Scalar function

I have one email column that is having values like this 'claudio.passerini#uni.re.dit.mn.us'. I want to take two characters strings between dot (to check for the countries and states codes).
i want result like this
col1=re,mn,us
Solution
To do exactly what you've asked; i.e. pull back just the 2 char codes from within the email address's domain, you could use a function such as this:
create function dbo.fn_Get2AlphaCharCodesFromEmail
(
#email nvarchar(254) --max length of an email is 254: http://stackoverflow.com/questions/386294/what-is-the-maximum-length-of-a-valid-email-address
) returns nvarchar(254)
as
begin
declare #result nvarchar(254) = null
, #maxLen int = 254
;with cte(i, remainder,result) as
(
select cast(0 as int)
, cast('.' + substring(#email,charindex('#',#email)+1,#maxLen) + '.' as nvarchar(254))
, cast(null as nvarchar(254))
union all
select cast(i+1 as int)
, cast(substring(remainder,patindex('%.[A-Z][A-Z].%',remainder)+3,#maxLen)as nvarchar(254))
, cast(coalesce(result + ',','') + substring(remainder,patindex('%.[A-Z][A-Z].%',remainder)+1,2) as nvarchar(254))
from cte
where patindex('%.[A-Z][A-Z].%',remainder) > 0
)
select top 1 #result = result from cte order by i desc;
Return #result;
end
go
--demo
select dbo.fn_Get2AlphaCharCodesFromEmail ('claudio.passerini#uni.re.dit.mn.us')
--returns: re,mn,us
select dbo.fn_Get2AlphaCharCodesFromEmail ('claudio.passerini#uni.123.dit.mnx.usx')
--returns: NULL
Explanation
Create a function called fn_Get2AlphaCharCodesFromEmail in the schema dbo which takes a single parameter, #email which is a string of up to 254 characters, and returns a string of up to 254 characters.
create function dbo.fn_Get2AlphaCharCodesFromEmail
(
#email nvarchar(254)
) returns nvarchar(254)
as
begin
--... code that does the work goes here
end
declare the variables we'll be using later on.
#result holds the value we'll be returning from the function
#maxLen records the maximum length of an email; this makes it slightly easier should this length ever need to change; though not entirely simple since we have to specify the 254 length in our column & variable definitions later on anyway.
declare #result nvarchar(254) = null
, #maxLen int = 254
Now comes the interesting bit. We create a common table expression with 3 columns:
i is used to record which iteration each record was produced in; the highest value of i is the last record to be created.
remainder is used to hold the yet-to-be processed characters from the email.
result is used to record the 2 char codes; each new row adds another value to this column's comma separated values.
;with cte(i, remainder,result) as
(
--code to iterate through the email string, breaking it down, goes here
)
this gives us our first row in the cte "table".
The cast statements throughout this part are to ensure we have a consistent data type, as data types in a CTE are implicit, and not always correct
we initialise i (i.e. the first column) with value 0 to say that this is our first row (we could choose pretty much any value here; it doesn't matter
we initialise remainder (i.e. 2nd column) as the part of the email address which follows the # character; i.e. the email's domain.
we initialise result (i.e. 3rd column) as null; as we've not yet found a result (i.e. a 2 char string within the email's domain)
there is no from component as we're just getting data from the #email variable; no tables/views/etc are required.
select cast(0 as int)
, cast('.' + substring(#email,charindex('#',#email)+1,#maxLen) + '.' as nvarchar(254))
, cast(null as nvarchar(254))
union all is used to combing the first result(s) with the results of the next (recurring) statement. NB: The CTE code before this statement is run once to give initial values; the code after is run once for each new set of rows generated.
union all
The recurring code in the CTE is applied to new rows in the CTE until no new rows are generated.
i takes the value of the previous iteration's row's i incremented by 1.
select cast(i+1 as int)
remainder takes the previous iteration's remainder, and removes everything before (and including) the next 2 character code (result).
patindex('%.[A-Z][A-Z].%',remainder) returns a number giving the location of the a string containing a dot followed by 2 letters followed by a dot, occurring anywhere in the input string
, cast(substring(remainder,patindex('%.[A-Z][A-Z].%',remainder)+3,#maxLen)as nvarchar(254))
result uses the same logic as remainder, only it takes the 2 characters found, rather than everything after them. These characters are added on to the end of the previous iteartion's row's result value, separated by a comma.
, cast(coalesce(result + ',','') + substring(remainder,patindex('%.[A-Z][A-Z].%',remainder)+1,2) as nvarchar(254))
the from cte part just says that we're referencing the same "table" we're creating; i.e. this is how the recursion occurs
from cte
the where statement is used to prevent infinite recursion; i.e. once there are no more 2 char codes left in the remainder, stop looking.
where patindex('%.[A-Z][A-Z].%',remainder) > 0
Once we've found all the 2 char codes in the string, we know that the last row's result will contain the complete set; as such we assign this single row's value to the #result variable.
select top 1 #result = result
the from statement shows we're referencing the data we created in our with cte statement
from cte
the order by is used to determine which record comes first (i.e. which record is the top 1 record). We want it to be the last row generated by the CTE. Since we've been incrementing i by 1 each time, this last record will have the highest value of i, so by sorting by i desc (descending) that last generated row will be the row we get.
order by i desc;
Finally, we return the result generated above.
Return #result;
Alternative Approach
However, if you're trying to extract information from your emails, I'd recommend an alternate approach... have a list of values that you're looking for, and compare your email with that, without having to break apart the email address (beyond splitting on the # to ensure you're only checking the email's domain).
declare #countryCodes table (code nchar(2), name nvarchar(64)) --you'd use a real table for this; I'm just using a table variable so this demo's throwaway code
insert into #countryCodes (code, name)
values
('es','Spain')
,('fr','France')
,('uk','United Kingdom')
,('us','USA')
--etc.
--check a single mail
declare #mail nvarchar(256) = 'claudio.passerini#uni.re.dit.mn.us'
if exists (select top 1 1 from #countryCodes where '.' + substring(#mail,charindex('#',#mail)+1,256) + '.' like '%.' + code + '.%')
begin
select name from #countryCodes where '.' + substring(#mail,charindex('#',#mail)+1,256) + '.' like '%.' + code + '.%'
end
else
begin
select 'no results found'
end
--check a bunch of mails
declare #emailsToCheck table (email nvarchar(256))
insert into #emailsToCheck (email)
values
('claudio.passerini#uni.re.dit.mn.us')
,('someone#someplace.co.uk')
,('cant.see.me#never.never.land')
,('some.fr.address.hidden#france.not.in.this.bit')
select e.email, c.name
from #emailsToCheck e
left outer join #countryCodes c
on '.' + substring(email,charindex('#',email)+1,256) + '.' like '%.' + code + '.%'
order by e.email, c.name
If yo want individual columns you will need to pivot your data after splitting out your strings with a table valued function as per Marc's answer. If you are happy having them in rows, you can just use the select statement inside the brackets.
Query to get the data
declare #t table (Email nvarchar(50));
insert into #t values('claudio.passerini#uni.re.dit.mn.us'),('claudio.passerini#uni.ry.dit.mn.urg'),('claudio.passerini#uni.rn.dit.mn.uk');
select Email
,[1]
,[2]
,[3]
,[4]
,[5]
,[6]
from(
select t.Email
,s.Item
,row_number() over (partition by t.Email order by s.Item) as rn
from #t t
cross apply dbo.DelimitedSplit8K(t.Email,'.') s
where len(s.Item) = 2
) a
pivot
(
max(Item) for rn in([1],[2],[3],[4],[5],[6])
) pvt
Table valued function to split out the strings, courtesy of Jeff Moden
http://www.sqlservercentral.com/articles/Tally+Table/72993/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
ALTER FUNCTION [dbo].[DelimitedSplit8K]
--===== Define I/O parameters
(#pString VARCHAR(8000), #pDelimiter CHAR(1))
--WARNING!!! DO NOT USE MAX DATA-TYPES HERE! IT WILL KILL PERFORMANCE!
RETURNS TABLE WITH SCHEMABINDING AS
RETURN
--===== "Inline" CTE Driven "Tally Table" produces values from 1 up to 10,000...
-- enough to cover VARCHAR(8000)
WITH E1(N) AS (
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
), --10E+1 or 10 rows
E2(N) AS (SELECT 1 FROM E1 a, E1 b), --10E+2 or 100 rows
E4(N) AS (SELECT 1 FROM E2 a, E2 b), --10E+4 or 10,000 rows max
cteTally(N) AS (--==== This provides the "base" CTE and limits the number of rows right up front
-- for both a performance gain and prevention of accidental "overruns"
SELECT TOP (ISNULL(DATALENGTH(#pString),0)) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
),
cteStart(N1) AS (--==== This returns N+1 (starting position of each "element" just once for each delimiter)
SELECT 1 UNION ALL
SELECT t.N+1 FROM cteTally t WHERE SUBSTRING(#pString,t.N,1) = #pDelimiter
),
cteLen(N1,L1) AS(--==== Return start and length (for use in substring)
SELECT s.N1,
ISNULL(NULLIF(CHARINDEX(#pDelimiter,#pString,s.N1),0)-s.N1,8000)
FROM cteStart s
)
--===== Do the actual split. The ISNULL/NULLIF combo handles the length for the final element when no delimiter is found.
SELECT ItemNumber = ROW_NUMBER() OVER(ORDER BY l.N1),
Item = SUBSTRING(#pString, l.N1, l.L1)
FROM cteLen l
You can create your own function to split strings.
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE FUNCTION [dbo].[fnSplitString]
(
#string NVARCHAR(MAX),
#delimiter CHAR(1)
)
RETURNS #output TABLE(splitdata NVARCHAR(MAX)
)
BEGIN
set #delimiter = coalesce(#delimiter, dbo.cSeparador());
DECLARE #start INT, #end INT
SELECT #start = 1, #end = CHARINDEX(#delimiter, #string)
WHILE #start < LEN(#string) + 1 BEGIN
IF #end = 0
SET #end = LEN(#string) + 1
INSERT INTO #output (splitdata)
VALUES(SUBSTRING(#string, #start, #end - #start))
SET #start = #end + 1
SET #end = CHARINDEX(#delimiter, #string, #start)
END
RETURN
END
Using this function you can get all your country&state codes :
select splitdata from dbo.fnSplitString('claudio.passerini#uni.re.dit.mn.us', '.')
where len(splitdata) = 2
You can modify that query to concatenate the result on a single string :
SELECT
STUFF((SELECT ',' + splitdata
FROM dbo.fnSplitString('claudio.passerini#uni.re.dit.mn.us', '.')
WHERE len(splitdata) = 2
FOR XML PATH('')), 1, 1, '')
Here is how you put it into an scalar function :
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE FUNCTION [dbo].[fnCountryCodes](#email nvarchar(max)) returns nvarchar(max)
AS
BEGIN
RETURN (SELECT
STUFF((SELECT ',' + splitdata
FROM dbo.fnSplitString(#email, '.')
WHERE len(splitdata) = 2
FOR XML PATH('')), 1, 1, ''));
END
You call it like this :
select dbo.fnCountryCodes('claudio.passerini#uni.re.dit.mn.us')
Alternatively you can create a table-valued function that returns all the 2 characters long substrings from the domain of a mail address :
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE FUNCTION [dbo].[fnCountryCodes] (#email NVARCHAR(MAX))
RETURNS #output TABLE(subdomain1 nvarchar(2), subdomain2 nvarchar(2), subdomain3 nvarchar(2), subdomain4 nvarchar(2), subdomain5 nvarchar(2))
as
BEGIN
DECLARE #subdomain1 nvarchar(2);
DECLARE #subdomain2 nvarchar(2);
DECLARE #subdomain3 nvarchar(2);
DECLARE #subdomain4 nvarchar(2);
DECLARE #subdomain5 nvarchar(2);
DECLARE CURSOR_SUBDOMAINS CURSOR FOR select splitdata from dbo.fnSplitString(#email, '.') where len(splitdata) = 2;
OPEN CURSOR_SUBDOMAINS;
FETCH NEXT FROM CURSOR_SUBDOMAINS INTO #subdomain1;
FETCH NEXT FROM CURSOR_SUBDOMAINS INTO #subdomain2;
FETCH NEXT FROM CURSOR_SUBDOMAINS INTO #subdomain3;
FETCH NEXT FROM CURSOR_SUBDOMAINS INTO #subdomain4;
FETCH NEXT FROM CURSOR_SUBDOMAINS INTO #subdomain5;
CLOSE CURSOR_SUBDOMAINS;
DEALLOCATE CURSOR_SUBDOMAINS;
INSERT INTO #output (subdomain1, subdomain2, subdomain3, subdomain4, subdomain5)
values (#subdomain1, #subdomain2, #subdomain3, #subdomain4, #subdomain5)
RETURN
END
You use it like that :
select * from dbo.fnCountryCodes('claudio.passerini#uni.re.dit.mn.us')

SQL query to match keywords?

I have a table with a column as nvarchar(max) with text extracted from word documents in it. How can I create a select query that I'll pass another a list of keywords as parameter and return the rows ordered by the number of matches?
Maybe it is possible with full text search?
Yes, possible with full text search, and likely the best answer. For a straight T-SQL solution, you could use a split function and join, e.g. assuming a table of numbers called dbo.Numbers (you may need to decide on a different upper limit):
SET NOCOUNT ON;
DECLARE #UpperLimit INT;
SET #UpperLimit = 200000;
WITH n AS
(
SELECT
rn = ROW_NUMBER() OVER
(ORDER BY s1.[object_id])
FROM sys.objects AS s1
CROSS JOIN sys.objects AS s2
CROSS JOIN sys.objects AS s3
)
SELECT [Number] = rn - 1
INTO dbo.Numbers
FROM n
WHERE rn <= #UpperLimit + 1;
CREATE UNIQUE CLUSTERED INDEX n ON dbo.Numbers([Number]);
And a splitting function that uses that table of numbers:
CREATE FUNCTION dbo.SplitStrings
(
#List NVARCHAR(MAX)
)
RETURNS TABLE
AS
RETURN
(
SELECT DISTINCT
[Value] = LTRIM(RTRIM(
SUBSTRING(#List, [Number],
CHARINDEX(N',', #List + N',', [Number]) - [Number])))
FROM
dbo.Numbers
WHERE
Number <= LEN(#List)
AND SUBSTRING(N',' + #List, [Number], 1) = N','
);
GO
Then you can simply say:
SELECT key, NvarcharColumn /*, other cols */
FROM dbo.table AS outerT
WHERE EXISTS
(
SELECT 1
FROM dbo.table AS t
INNER JOIN dbo.SplitStrings(N'list,of,words') AS s
ON t.NvarcharColumn LIKE '%' + s.Item + '%'
WHERE t.key = outerT.key
);
As a procedure:
CREATE PROCEDURE dbo.Search
#List NVARCHAR(MAX)
AS
BEGIN
SET NOCOUNT ON;
SELECT key, NvarcharColumn /*, other cols */
FROM dbo.table AS outerT
WHERE EXISTS
(
SELECT 1
FROM dbo.table AS t
INNER JOIN dbo.SplitStrings(#List) AS s
ON t.NvarcharColumn LIKE '%' + s.Item + '%'
WHERE t.key = outerT.key
);
END
GO
Then you can just pass in #List (e.g. EXEC dbo.Search #List = N'foo,bar,splunge') from C#.
This won't be super fast, but I'm sure it will be quicker than pulling all the data out into C# and double-nested loop it manually.
how to ... return the rows ordered by the number of [full-text] matches
I have not used it myself but believe SQL Server 2008 supports weighting the CONTAINSTABLE matches which might be of help to you:
http://msdn.microsoft.com/en-us/library/ms189760.aspx
If you don't have an engine that returns results weighted by the number of hits ...
You could write a UDF that takes two inputs and returns an integer: the big textvalue is the first input and the words you're looking for as a comma-delimited string is the second. The function returns an integer representing either the number of distinct looked-for words that were actually found at least once in the text, or the total number of times the looked-for words were found. Implementation --how to weight-- is up to you. Maybe, for example, you'd want to arrange the looked-for words in most-important to least-important order, and give an important word hit more weight than a less important word hit.
You could then use your full text search engine to find all records that contain at least one of the words (you'd OR them), and you'd run this result set through your UDF scalar function:
pseudo code
select title, weightfunction(summary, 'word1,word2,word3....wordN')
from docs
where summary contains ( word1 or word2 or word3 ... or wordN)
order by weightfunction(summary, 'word1,word2,word3....wordN') desc

The most elegant way to generate permutations in SQL server

Given a the following table:
Index | Element
---------------
1 | A
2 | B
3 | C
4 | D
We want to generate all the possible permutations (without repetitions) using the elements.
the final result (skipping some rows) will look like this:
Results
----------
ABCD
ABDC
ACBD
ACDB
ADAC
ADCA
...
DABC
DACB
DBCA
DBAC
DCAB
DCBA
(24 Rows)
How would you do it?
After making some perhaps snarky comments, this problem stuck in my brain all evening, and I eventually came up with the following set-based approach. I believe it definitely qualifies as "elegant", but then I also think it qualifies as "kinda dumb". You make the call.
First, set up some tables:
-- For testing purposes
DROP TABLE Source
DROP TABLE Numbers
DROP TABLE Results
-- Add as many rows as need be processed--though note that you get N! (number of rows, factorial) results,
-- and that gets big fast. The Identity column must start at 1, or the algorithm will have to be adjusted.
-- Element could be more than char(1), though the algorithm would have to be adjusted again, and each element
-- must be the same length.
CREATE TABLE Source
(
SourceId int not null identity(1,1)
,Element char(1) not null
)
INSERT Source (Element) values ('A')
INSERT Source (Element) values ('B')
INSERT Source (Element) values ('C')
INSERT Source (Element) values ('D')
--INSERT Source (Element) values ('E')
--INSERT Source (Element) values ('F')
-- This is a standard Tally table (or "table of numbers")
-- It only needs to be as long as there are elements in table Source
CREATE TABLE Numbers (Number int not null)
INSERT Numbers (Number) values (1)
INSERT Numbers (Number) values (2)
INSERT Numbers (Number) values (3)
INSERT Numbers (Number) values (4)
INSERT Numbers (Number) values (5)
INSERT Numbers (Number) values (6)
INSERT Numbers (Number) values (7)
INSERT Numbers (Number) values (8)
INSERT Numbers (Number) values (9)
INSERT Numbers (Number) values (10)
-- Results are iteratively built here. This could be a temp table. An index on "Length" might make runs
-- faster for large sets. Combo must be at least as long as there are characters to be permuted.
CREATE TABLE Results
(
Combo varchar(10) not null
,Length int not null
)
Here's the routine:
SET NOCOUNT on
DECLARE
#Loop int
,#MaxLoop int
-- How many elements there are to process
SELECT #MaxLoop = max(SourceId)
from Source
-- Initialize first value
TRUNCATE TABLE Results
INSERT Results (Combo, Length)
select Element, 1
from Source
where SourceId = 1
SET #Loop = 2
-- Iterate to add each element after the first
WHILE #Loop <= #MaxLoop
BEGIN
-- See comments below. Note that the "distinct" remove duplicates, if a given value
-- is to be included more than once
INSERT Results (Combo, Length)
select distinct
left(re.Combo, #Loop - nm.Number)
+ so.Element
+ right(re.Combo, nm.Number - 1)
,#Loop
from Results re
inner join Numbers nm
on nm.Number <= #Loop
inner join Source so
on so.SourceId = #Loop
where re.Length = #Loop - 1
-- For performance, add this in if sets will be large
--DELETE Results
-- where Length <> #Loop
SET #Loop = #Loop + 1
END
-- Show results
SELECT *
from Results
where Length = #MaxLoop
order by Combo
The general idea is: when adding a new element (say "B") to any string (say, "A"), to catch all permutations you would add B
to all possible positions (Ba, aB), resulting in a new set of strings. Then iterate: Add a new element (C) to each position in a string
(AB becomes Cab, aCb, abC), for all strings (Cba, bCa, baC), and you have the set of permutations. Iterate over each result set with
the next character until you run out of characters... or resources. 10 elements is 3.6 million permutations, roughly 48MB with the above algorithm, and 14 (unique) elements would hit 87 billion permutations and 1.163 terabytes.
I'm sure it could eventually be wedged into a CTE, but in the end all that would be is a glorified loop. The logic
is clearer this way, and I can't help but think the CTE execution plan would be a nightmare.
DECLARE #s VARCHAR(5);
SET #s = 'ABCDE';
WITH Subsets AS (
SELECT CAST(SUBSTRING(#s, Number, 1) AS VARCHAR(5)) AS Token,
CAST('.'+CAST(Number AS CHAR(1))+'.' AS VARCHAR(11)) AS Permutation,
CAST(1 AS INT) AS Iteration
FROM dbo.Numbers WHERE Number BETWEEN 1 AND 5
UNION ALL
SELECT CAST(Token+SUBSTRING(#s, Number, 1) AS VARCHAR(5)) AS Token,
CAST(Permutation+CAST(Number AS CHAR(1))+'.' AS VARCHAR(11)) AS
Permutation,
s.Iteration + 1 AS Iteration
FROM Subsets s JOIN dbo.Numbers n ON s.Permutation NOT LIKE
'%.'+CAST(Number AS CHAR(1))+'.%' AND s.Iteration < 5 AND Number
BETWEEN 1 AND 5
--AND s.Iteration = (SELECT MAX(Iteration) FROM Subsets)
)
SELECT * FROM Subsets
WHERE Iteration = 5
ORDER BY Permutation
Token Permutation Iteration
----- ----------- -----------
ABCDE .1.2.3.4.5. 5
ABCED .1.2.3.5.4. 5
ABDCE .1.2.4.3.5. 5
(snip)
EDBCA .5.4.2.3.1. 5
EDCAB .5.4.3.1.2. 5
EDCBA .5.4.3.2.1. 5
first posted a while ago here
However, it would be better to do it in a better language such as C# or C++.
Just using SQL, without any code, you could do it if you can crowbar yourself another column into the table. Clearly you need to have one joined table for each of the values to be permuted.
with llb as (
select 'A' as col,1 as cnt union
select 'B' as col,3 as cnt union
select 'C' as col,9 as cnt union
select 'D' as col,27 as cnt
)
select a1.col,a2.col,a3.col,a4.col
from llb a1
cross join llb a2
cross join llb a3
cross join llb a4
where a1.cnt + a2.cnt + a3.cnt + a4.cnt = 40
Am I correctly understanding that you built Cartesian product n x n x n x n, and then filter out unwanted stuff? The alternative would be generating all the numbers up to n! and then using factorial number system to map them via element encoding.
Simpler than a recursive CTE:
declare #Number Table( Element varchar(MAX), Id varchar(MAX) )
Insert Into #Number Values ( 'A', '01')
Insert Into #Number Values ( 'B', '02')
Insert Into #Number Values ( 'C', '03')
Insert Into #Number Values ( 'D', '04')
select a.Element, b.Element, c.Element, d.Element
from #Number a
join #Number b on b.Element not in (a.Element)
join #Number c on c.Element not in (a.Element, b.Element)
join #Number d on d.Element not in (a.Element, b.Element, c.Element)
order by 1, 2, 3, 4
For an arbitrary number of elements, script it out:
if object_id('tempdb..#number') is not null drop table #number
create table #number (Element char(1), Id int, Alias as '_'+convert(varchar,Id))
insert #number values ('A', 1)
insert #number values ('B', 2)
insert #number values ('C', 3)
insert #number values ('D', 4)
insert #number values ('E', 5)
declare #sql nvarchar(max)
set #sql = '
select '+stuff((
select char(13)+char(10)+'+'+Alias+'.Element'
from #number order by Id for xml path (''), type
).value('.','NVARCHAR(MAX)'),3,1,' ')
set #sql += '
from #number '+(select top 1 Alias from #number order by Id)
set #sql += (
select char(13)+char(10)+'join #number '+Alias+' on '+Alias+'.Id not in ('
+stuff((
select ', '+Alias+'.Id'
from #number b where a.Id > b.Id
order by Id for xml path ('')
),1,2,'')
+ ')'
from #number a where Id > (select min(Id) from #number)
order by Element for xml path (''), type
).value('.','NVARCHAR(MAX)')
set #sql += '
order by 1'
print #sql
exec (#sql)
To generate this:
select
_1.Element
+_2.Element
+_3.Element
+_4.Element
+_5.Element
from #number _1
join #number _2 on _2.Id not in (_1.Id)
join #number _3 on _3.Id not in (_1.Id, _2.Id)
join #number _4 on _4.Id not in (_1.Id, _2.Id, _3.Id)
join #number _5 on _5.Id not in (_1.Id, _2.Id, _3.Id, _4.Id)
order by 1
This method uses a binary mask to select the correct rows:
;with src(t,n,p) as (
select element, index, power(2,index-1)
from table
)
select s1.t+s2.t+s3.t+s4.t
from src s1, src s2, src s3, src s4
where s1.p+s2.p+s3.p+s4.p=power(2,4)-1
My original post:
declare #t varchar(4) = 'ABCD'
;with src(t,n,p) as (
select substring(#t,1,1),1,power(2,0)
union all
select substring(#t,n+1,1),n+1,power(2,n)
from src
where n < len(#t)
)
select s1.t+s2.t+s3.t+s4.t
from src s1, src s2, src s3, src s4
where s1.p+s2.p+s3.p+s4.p=power(2,len(#t))-1
This is one of those problems that haunts you. I liked the simplicity of my original answer but there was this issue where I was still building all the possible solutions and then selecting the correct ones. One more try to make this process more efficient by only building the solutions that were correct yielded this answer. Add a character to the string only if that character didn't exist in the string. Patindex seemed like the perfect companion for a CTE solution. Here it is.
declare #t varchar(10) = 'ABCDEFGHIJ'
;with s(t,n) as (
select substring(#t,1,1),1
union all
select substring(#t,n+1,1),n+1
from s where n<len(#t)
)
,j(t) as (
select cast(t as varchar(10)) from s
union all
select cast(j.t+s.t as varchar(10))
from j,s where patindex('%'+s.t+'%',j.t)=0
)
select t from j where len(t)=len(#t)
I was able to build all 3.6 million solutions in 3 minutes and 2 seconds. Hopefully this solution will not get missed just because it's not the first.
Current solution using a recursive CTE.
-- The base elements
Declare #Number Table( Element varchar(MAX), Id varchar(MAX) )
Insert Into #Number Values ( 'A', '01')
Insert Into #Number Values ( 'B', '02')
Insert Into #Number Values ( 'C', '03')
Insert Into #Number Values ( 'D', '04')
-- Number of elements
Declare #ElementsNumber int
Select #ElementsNumber = COUNT(*)
From #Number;
-- Permute!
With Permutations( Permutation, -- The permutation generated
Ids, -- Which elements where used in the permutation
Depth ) -- The permutation length
As
(
Select Element,
Id + ';',
Depth = 1
From #Number
Union All
Select Permutation + ' ' + Element,
Ids + Id + ';',
Depth = Depth + 1
From Permutations,
#Number
Where Depth < #ElementsNumber And -- Generate only the required permutation number
Ids Not like '%' + Id + ';%' -- Do not repeat elements in the permutation (this is the reason why we need the 'Ids' column)
)
Select Permutation
From Permutations
Where Depth = #ElementsNumber
Assuming your table is named Elements and has 4 rows, this is as simple as:
select e1.Element + e2.Element + e3.Element + e4.Element
from Elements e1
join Elements e2 on e2.Element != e1.Element
join Elements e3 on e3.Element != e2.Element AND e3.Element != e1.Element
join Elements e4 on e4.Element != e3.Element AND e4.Element != e2.Element AND e4.Element != e1.Element
Way too much rust on my SQL skills, but i took a different tack for a similar problem and thought it worth sharing.
Table1 - X strings in a single field Uno
Table2 - Y strings in a single field Dos
(SELECT Uno, Dos
FROM Table1
CROSS JOIN Table2 ON 1=1)
UNION
(SELECT Dos, Uno
FROM Table1
CROSS JOIN Table2 ON 1=1)
Same principle for 3 tables with an added CROSS JOIN
(SELECT Tres, Uno, Dos
FROM Table1
CROSS JOIN Table2 ON 1=1
CROSS JOIN Table3 ON 1=1)
although it takes 6 cross-join sets in the union.
--Hopefully this is a quick solution, just change the values going into #X
IF OBJECT_ID('tempdb.dbo.#X', 'U') IS NOT NULL DROP TABLE #X; CREATE table #X([Opt] [nvarchar](10) NOT NULL)
Insert into #X values('a'),('b'),('c'),('d')
declare #pSQL NVarChar(max)='select * from #X X1 ', #pN int =(select count(*) from #X), #pC int = 0;
while #pC<#pN begin
if #pC>0 set #pSQL = concat(#pSQL,' cross join #X X', #pC+1);
set #pC = #pC +1;
end
execute(#pSQL)
--or as single column result
IF OBJECT_ID('tempdb.dbo.#X', 'U') IS NOT NULL DROP TABLE #X; CREATE table #X([Opt] [nvarchar](10) NOT NULL)
Insert into #X values('a'),('b'),('c'),('d')
declare #pSQL NVarChar(max)=' as R from #X X1 ',#pSelect NVarChar(Max)=' ',#pJoin NVarChar(Max)='', #pN int =(select count(*) from #X), #pC int = 0;
while #pC<#pN begin
if #pC>0 set #pJoin = concat(#pJoin ,' cross join #X X', #pC+1) set #pSelect = concat(#pSelect ,'+ X', #pC+1,'.Opt ')
set #pC = #pC +1;
end
set #pSQL = concat ('select X1.Opt', #pSelect,#pSQL ,#pJoin)
exec(#pSQL)
create function GeneratePermutations (#string nvarchar(4000))
RETURNS #Permutations
TABLE(
name nVARCHAR(500)
)
AS
begin
declare #SplitedString table(name nvarchar(500))
insert into #SplitedString
select *
from string_split(#string,' ')
declare #CountOfWords as int
set #CountOfWords = (select count(*) from #SplitedString)
;with cte_Permutations (name, level) as (
select convert(nvarchar(500), name), 1 as level from #SplitedString
union all
select convert(nvarchar(500),splited.name+','+cte_Permutations.name),level+1
from #SplitedString splited ,cte_Permutations
where level < #CountOfWords
)
insert into #Permutations
select name
from cte_Permutations
where level = #CountOfWords
order by name
return
end
select *
From (
select 1 id,'a b c' msg
union all
select 2 id,'d e' msg
) p
cross apply dbo.GeneratePermutations(p.msg)