SQL Server 2008 R2 get common text between a group of text strings - sql

I sure hope someone can help me out with this issue. I have been searching for hours to find it but I am coming up empty.
In this example I have two columns in my table
GRP_ID Desc
My group ID is the way I will identify that these products are of the same type, and desc is what I want to find all the common words.
So here is my table
GRP_ID Desc
-------------------------------
2 Red Hat
2 Green Hat
2 Yellow Hat
3 Boots Large Brown
3 Boots Medium Red
3 Boots Medium Brown
What I want as a result of the query would be the following
GRP_ID Desc
-----------------------
2 Hat
3 Boots
So what I want is all the words that appear in every string in the group or the common words in the group.

I think you'd need to create a mapping table for GRP_ID and products - e.g. Hat and Boots.
CREATE TABLE GroupProductMapping (
GRP_ID INT NOT NULL, -- I'm assuming its an Int
ProductDesc VARCHAR(50) NOT NULL
)
SELECT a.GRP_ID,
b.ProductDesc Desc
FROM {Table_Name} a
INNER JOIN GroupProductMapping b ON a.GRP_ID = b.GRP_ID
Alternatively, if you don't have too many products. You could use CASE in your SELECT clause.
e.g.
SELECT
GRP_ID,
CASE GRP_ID
WHEN 1 THEN 'Hat'
WHEN 2 THEN 'Boots'
END AS Desc
FROM {Table_Name}
{Table_Name} is the name of your original table.

Ideally you would normalise your data and store the words in a separate table.
However for your immediate requirements, you first need to provide a UDF to split 'desc' into words. I poached this function:
-- this function splits the provided strings on a delimiter
-- similar to .Net string.Split.
-- I'm sure there are alternatives (such as calling string.Split through
-- a CLR function).
CREATE FUNCTION [dbo].[Split]
(
#RowData NVARCHAR(MAX),
#Delimeter NVARCHAR(MAX)
)
RETURNS #RtnValue TABLE
(
ID INT IDENTITY(1,1),
Data NVARCHAR(MAX)
)
AS
BEGIN
DECLARE #Iterator INT
SET #Iterator = 1
DECLARE #FoundIndex INT
SET #FoundIndex = CHARINDEX(#Delimeter,#RowData)
WHILE (#FoundIndex>0)
BEGIN
INSERT INTO #RtnValue (data)
SELECT
Data = LTRIM(RTRIM(SUBSTRING(#RowData, 1, #FoundIndex - 1)))
SET #RowData = SUBSTRING(#RowData,
#FoundIndex + DATALENGTH(#Delimeter) / 2,
LEN(#RowData))
SET #Iterator = #Iterator + 1
SET #FoundIndex = CHARINDEX(#Delimeter, #RowData)
END
INSERT INTO #RtnValue (Data)
SELECT Data = LTRIM(RTRIM(#RowData))
RETURN
END
Then you need to split the descriptions and do some grouping (which you would also do if the data was normalised)
-- get the count of each grp_id
with group_count as
(
select grp_id, count(*) cnt from [Group]
group by grp_id
),
-- get the count of each word in each grp_id
group_word_count as
(
select count(*) cnt, grp_id, data from
(
select * from [group] g
cross apply dbo.Split(g.[Desc], ' ')
)
t
group by grp_id, data
)
-- return rows where number of grp_id = number of words in grp_id
select gwc.GRP_ID, gwc.Data [Desc] from group_word_count gwc
inner join group_count gc on gwc.GRP_ID = gc.GRP_ID and gwc.cnt = gc.cnt
Where [Group] is your table.

Related

Sql Multiple Replace based on query

I have been trying to set up a SQL function to build descriptions with "tags". For example, I would want to start with a description:
"This is [length] ft. long and [height] ft. high"
And modify the description with data from a related table, to end up with:
"This is 75 ft. long and 20 ft. high"
I could do this easily with REPLACE functions if we had a set number of tags, but I want these tags to be user defined, and each description may or may not have specific tags in it. Would there be any better way to get this other than using a cursor to go through the string once for each available tag? Does SQL have any built in functionality to do a multiple replace? something like:
Replace(description,(select tag, replacement from tags))
I actually recommend doing this in application code. But, you can do it using a recursive CTE:
with t as (
select t.*, row_number() over (order by t.tag) as seqnum
from tags t
),
cte as (
select replace(#description, t.tag, t.replacement) as d, t.seqnum
from t
where seqnum = 1
union all
select replace(d, t.tag, t.replacement), t.seqnum
from cte join
t
on t.seqnum = cte.seqnum + 1
)
select top 1 cte.*
from cte
order by seqnum desc;
Try below query :
SELECT REPLACE(DESCRIPTION,'[length]',( SELECT replacement FROM tags WHERE tag
= '[length]') )
I agree with Gordon that this is best handled in your application code.
If for whatever reason that option is not available however, and if you don't want to use recursion as per Gordon's answer, you could use a tally table approach to swap out your values.
You will need to test the performance of the for xml being executed for each value though...
Assuming you have a table of Tag replacement values:
create table TagReplacementTable(Tag nvarchar(50), Replacement nvarchar(50));
insert into TagReplacementTable values('[test]',999)
,('[length]',75)
,('[height]',20)
,('[other length]',40)
,('[other height]',50);
You can create an inline table function that will work through your Descriptions and drop replace the necessary parts using TagReplacementTable as reference:
create function dbo.Tag_Replace(#str nvarchar(4000)
,#tagstart nvarchar(1)
,#tagend nvarchar(1)
)
returns table
as
return
(
with n(n) as (select n from (values(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) n(n))
-- Select the same number of rows as characters in #str as incremental row numbers.
-- Cross joins increase exponentially to a max possible 10,000 rows to cover largest #str length.
,t(t) as (select top (select len(#str) a) row_number() over (order by (select null)) from n n1,n n2,n n3,n n4)
-- Return the position of every value that starts or ends a part of the description.
-- This will be the first character (t='f'), the start of any tag (t='s') and the end of any tag (t='e').
,s(s,t) as (select 1, 'f'
union all select t+1, 's' from t where substring(#str,t,1) = #tagstart
union all select t+1, 'e' from t where substring(#str,t,1) = #tagend
)
-- Return the start and length of every value, to use in the SUBSTRING function.
-- ISNULL/NULLIF combo handles the last value where there is no delimiter at the end of the string.
-- Using the t value we can determine which CHARINDEX to look for.
,l(t,s,l) as (select t,s,isnull(nullif(charindex(case t when 'f' then #tagstart when 's' then #tagend when 'e' then #tagstart end,#str,s),0)-s,4000) from s)
-- Each element of the string is returned in an ordered list along with its t value.
-- Where this t value is 's' this means the value is a tag, so append the start and end identifiers and join to the TagReplacementTable.
-- Where no replacement is found, simply return the part of the Description.
-- Finally, concatenate into one string value.
select (select isnull(r.Replacement,k.Item)
from(select row_number() over(order by s) as ItemNumber
,case when l.t = 's' then '[' else '' end
+ substring(#str,s,l)
+ case when l.t = 's' then ']' else '' end as Item
,t
from l
) k
left join TagReplacementTable r
on(k.Item = r.Tag)
order by k.ItemNumber
for xml path('')
) as NewString
);
And then outer apply to the results of the function to do replacements on all your Description values:
declare #t table (Descr nvarchar(100));
insert into #t values('This is [length] ft. long and [height] ft. high'),('[test] This is [other length] ft. long and [other height] ft. high');
select *
from #t t
outer apply dbo.Tag_Replace(t.Descr,'[',']') r;
Output:
+--------------------------------------------------------------------+-----------------------------------------+
| Descr | NewString |
+--------------------------------------------------------------------+-----------------------------------------+
| This is [length] ft. long and [height] ft. high | This is 75 ft. long and 20 ft. high |
| [test] This is [other length] ft. long and [other height] ft. high | 999 This is 40 ft. long and 50 ft. high |
+--------------------------------------------------------------------+-----------------------------------------+
I would not iterate through an individual string, but instead run the update on the entire column of strings. I'm not sure if that was your intent but this would be much quicker than one string at a time.
Test Data:
Create TABLE #strs ( mystr VARCHAR(MAX) )
Create TABLE #rpls (i INT IDENTITY(1,1) NOT NULL, src VARCHAR(MAX) , Trg VARCHAR(MAX) )
INSERT INTO #strs
( mystr )
SELECT 'hello ##color## world'
UNION ALL SELECT 'see jack ##verboftheday##! ##verboftheday## Jack, ##verboftheday##!'
UNION ALL SELECT 'on ##Date##, the ##color## StockMarket was ##MarketDirection##!'
INSERT INTO #rpls ( src ,Trg )
SELECT '##Color##', 'Blue'
UNION SELECT ALL '##verboftheday##' , 'run'
UNION SELECT ALL '##Date##' , CONVERT(VARCHAR(MAX), GETDATE(), 9)
UNION SELECT ALL '##MarketDirection##' , 'UP'
then a loop like this:
DECLARE #i INTEGER = 0
DECLARE #count INTEGER
SELECT #count = COUNT(*)
FROM #rpls R
WHILE #i < #count
BEGIN
SELECT #i += 1
UPDATE #strs
SET mystr = REPLACE(mystr, ( SELECT R.src
FROM #rpls R
WHERE i = #i ), ( SELECT R.Trg
FROM #rpls R
WHERE i = #i ))
END
SELECT *
FROM #strs S
Yielding the following
hello Blue world
see jack run! run Jack, run!
on May 19 2017 9:48:02:390AM, the Blue StockMarket was UP!
I found someone wanting to do something similar here with a set number of options:
SELECT #target = REPLACE(#target, invalidChar, '-')
FROM (VALUES ('~'),(''''),('!'),('#'),('#')) AS T(invalidChar)
I could modify it as such:
declare #target as varchar(max) = 'This is [length] ft. long and [height] ft. high'
select #target = REPLACE(#target,'[' + tag + ']',replacement)
from tags
It then runs the replace once for every record returned in the select statement.
(I originally had added this to my question, but it sounds like it is better protocol to add it as a answer.)

Fastest method to insert numbers into sql server table?

SELECT TOP 1000000 row_number() over(ORDER by sv.number) AS num
INTO numbertest
from master..spt_values sv CROSS JOIN master..spt_values sv2
SELECT TOP 1000000 IDENTITY(int,1,1) AS Number
INTO NumberTest
FROM master..spt_values sv1
CROSS JOIN master..spt_values s2
I've come across two methods to insert 1 to 1000000 numbers in a table which works perfectly but doesn't insert 1 to 1000000 sequentially? how can i insert sequentially with fast insertion rate?
I have table Numbers in my database that i fill with the following query.
CREATE TABLE [dbo].[NUMBERS] (
[number] INT IDENTITY (1, 1) NOT NULL
);
set Identity_insert dbo.Numbers oN
declare
#row_count int,
#target_rows int
set #target_rows = 1048576
set #row_count = null
while ( 1 = 1 ) begin
if ( #row_count is null ) begin
insert into Numbers ( [number] ) values ( 1 )
end
else begin
insert into Numbers ( [number] )
select [number] = [number] + #row_count
from Numbers
end
set #row_count = isnull( #row_count, 0 ) + ##rowcount
if ( #row_count >= #target_rows ) begin
break
end
end
what I understood that you need to add/insert rows with serial number 1 to 10,00,000 (ten lac rows)
your two command seems select command instead of insert command
do really required to add serial number field in one of your filed
or
you can run query to any existing table to get result with serial number like
for eg, table name : employee
filed name : name
Note : no fileds existing like SerialNumber
you can run a command to get output with serial number
e.g. select ROW_NUMBER() over (order by employee.name) as SerialNumber, Employee.name from employee
your result will be like
SerialNumber name
1 abc
2 xyz

Remove a sentence from a paragraph that has a specific pattern with T-SQL

I have a large number of descriptions that can be anywhere from 5 to 20 sentences each. I am trying to put a script together that will locate and remove a sentence that contains a word with numbers before or after it.
before example: Hello world. Todays department has 345 employees. Have a good day.
after example: Hello world. Have a good day.
My main problem right now is identifying the violation.
Here "345 employees" is what causes the sentence to be removed. However, each description will have a different number and possibly a different variation of the word employee.
I would like to avoid having to create a table of all the different variations of employee.
JTB
This would make a good SQL Puzzle.
Disclaimer: there are probably TONS of edge cases that would blow this up
This would take a string, split it out into a table with a row for each sentence, then remove the rows that matched a condition, and then finally join them all back into a string.
CREATE FUNCTION dbo.fn_SplitRemoveJoin(#Val VARCHAR(2000), #FilterCond VARCHAR(100))
RETURNS VARCHAR(2000)
AS
BEGIN
DECLARE #tbl TABLE (rid INT IDENTITY(1,1), val VARCHAR(2000))
DECLARE #t VARCHAR(2000)
-- Split into table #tbl
WHILE CHARINDEX('.',#Val) > 0
BEGIN
SET #t = LEFT(#Val, CHARINDEX('.', #Val))
INSERT #tbl (val) VALUES (#t)
SET #Val = RIGHT(#Val, LEN(#Val) - LEN(#t))
END
IF (LEN(#Val) > 0)
INSERT #tbl VALUES (#Val)
-- Filter out condition
DELETE FROM #tbl WHERE val LIKE #FilterCond
-- Join back into 1 string
DECLARE #i INT, #rv VARCHAR(2000)
SET #i = 1
WHILE #i <= (SELECT MAX(rid) FROM #tbl)
BEGIN
SELECT #rv = IsNull(#rv,'') + IsNull(val,'') FROM #tbl WHERE rid = #i
SET #i = #i + 1
END
RETURN #rv
END
go
CREATE TABLE #TMP (rid INT IDENTITY(1,1), sentence VARCHAR(2000))
INSERT #tmp (sentence) VALUES ('Hello world. Todays department has 345 employees. Have a good day.')
INSERT #tmp (sentence) VALUES ('Hello world. Todays department has 15 emps. Have a good day. Oh and by the way there are 12 employees somewhere else')
SELECT
rid, sentence, dbo.fn_SplitRemoveJoin(sentence, '%[0-9] Emp%')
FROM #tmp t
returns
rid | sentence | |
1 | Hello world. Todays department has 345 employees. Have a good day. | Hello world. Have a good day.|
2 | Hello world. Todays department has 15 emps. Have a good day. Oh and by the way there are 12 employees somewhere else | Hello world. Have a good day. |
I've used the split/remove/join technique as well.
The main points are:
This uses a pair of recursive CTEs, rather than a UDF.
This will work with all English sentence endings: . or ! or ?
This removes whitespace to make the comparison for "digit then employee" so you don't have to worry about multiple spaces and such.
Here's the SqlFiddle demo, and the code:
-- Split descriptions into sentences (could use period, exclamation point, or question mark)
-- Delete any sentences that, without whitespace, are like '%[0-9]employ%'
-- Join sentences back into descriptions
;with Splitter as (
select ID
, ltrim(rtrim(Data)) as Data
, cast(null as varchar(max)) as Sentence
, 0 as SentenceNumber
from Descriptions -- Your table here
union all
select ID
, case when Data like '%[.!?]%' then right(Data, len(Data) - patindex('%[.!?]%', Data)) else null end
, case when Data like '%[.!?]%' then left(Data, patindex('%[.!?]%', Data)) else Data end
, SentenceNumber + 1
from Splitter
where Data is not null
), Joiner as (
select ID
, cast('' as varchar(max)) as Data
, 0 as SentenceNumber
from Splitter
group by ID
union all
select j.ID
, j.Data +
-- Don't want "digit+employ" sentences, remove whitespace to search
case when replace(replace(replace(replace(s.Sentence, char(9), ''), char(10), ''), char(13), ''), char(32), '') like '%[0-9]employ%' then '' else s.Sentence end
, s.SentenceNumber
from Joiner j
join Splitter s on j.ID = s.ID and s.SentenceNumber = j.SentenceNumber + 1
)
-- Final Select
select a.ID, a.Data
from Joiner a
join (
-- Only get max SentenceNumber
select ID, max(SentenceNumber) as SentenceNumber
from Joiner
group by ID
) b on a.ID = b.ID and a.SentenceNumber = b.SentenceNumber
order by a.ID, a.SentenceNumber
One way to do this. Please note that it only works if you have one number in all sentences.
declare #d VARCHAR(1000) = 'Hello world. Todays department has 345 employees. Have a good day.'
declare #dr VARCHAR(1000)
set #dr = REVERSE(#d)
SELECT REVERSE(RIGHT(#dr,LEN(#dr) - CHARINDEX('.',#dr,PATINDEX('%[0-9]%',#dr))))
+ RIGHT(#d,LEN(#d) - CHARINDEX('.',#d,PATINDEX('%[0-9]%',#d)) + 1)

Generating an n-gram table with an SQL query

I'm trying to implement a fuzzy search with JavaScript client side, to search a largish db (300 items roughly) of records contained in an SQL database. My constraint is that it is not possible to perform a live query on the database- I must generate "indexes" as flat files during a nightly batch job. And so, starting with a db that looks like this:
ID. NAME
1. The Rain Man
2. The Electric Slide
3. Transformers
I need to create within a single query something like this:
Trigram ID
the 1
the 2
he_ 1
he_ 2
e_r 1
_ra 1
rai 1
ain 1
in_ 1
n_m 1
_ma 1
man 1
e_e 2
_el 2
ele 2
lec 2
Etc etc, typos not withstanding. The rules here are that ''n' is the length of the strings in the first column, that only a-z and _ are valid characters, any other character being normalized to Lower case, or mapped to _, that a group by n-gram clause may be applied to the table. Thus, I would hope to gain a table that would allow me to quickly look up a particular n-gram and get a list of all the Ids of rows which contain that sequence. I'm not a clever enough SQL cookie to figure this problem out. Can you?
I created an T-SQL NGrams that works quite nicely; note the comments section for examples of how to use
CREATE FUNCTION dbo.nGrams8K
(
#string VARCHAR(8000),
#n TINYINT,
#pad BIT
)
/*
Created by: Alan Burstein
Created on: 3/10/2014
Updated on: 5/20/2014 changed the logic to use an "inline tally table"
9/10/2014 Added some more code examples in the comment section
9/30/2014 Added more code examples
10/27/2014 Small bug fix regarding padding
Use: Outputs a stream of tokens based on an input string.
Works just like mdq.nGrams; see http://msdn.microsoft.com/en-us/library/ff487027(v=sql.105).aspx.
n-gram defined:
In the fields of computational linguistics and probability,
an n-gram is a contiguous sequence of n items from a given
sequence of text or speech. The items can be phonemes, syllables,
letters, words or base pairs according to the application.
To better understand N-Grams see: http://en.wikipedia.org/wiki/N-gram
*/
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN
WITH
E1(n) AS (SELECT 1 FROM (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t(n)),
E2(n) AS (SELECT 1 FROM E1 a CROSS JOIN E1 b),
iTally(n) AS
(
SELECT TOP (LEN(#string)+#n) ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
FROM E2 a CROSS JOIN E2 b
),
NewString(NewString) AS
(
SELECT REPLICATE(CASE #pad WHEN 0 THEN '' ELSE ' ' END,#n-1)+#string+
REPLICATE(CASE #pad WHEN 0 THEN '' ELSE ' ' END,#n-1)
)
SELECT TOP ((#n)+LEN(#string))
n AS [sequence],
SUBSTRING(NewString,n,#n) AS token
FROM iTally
CROSS APPLY NewString
WHERE n < ((#n)+LEN(#string));
/*
------------------------------------------------------------
-- (1) Basic Use
-------------------------------------------------------------
;-- (A)basic "string to table":
SELECT [sequence], token
FROM dbo.nGrams8K('abcdefg',1,1);
-- (b) create "bi-grams" (pad bit off)
SELECT [sequence], token
FROM dbo.nGrams8K('abcdefg',2,0);
-- (c) create "tri-grams" (pad bit on)
SELECT [sequence], token
FROM dbo.nGrams8K('abcdefg',3,1);
-- (d) filter for only "tri-grams"
SELECT [sequence], token
FROM dbo.nGrams8K('abcdefg',3,1)
WHERE len(ltrim(token)) = 3;
-- note the query plan for each. The power is coming from an index
-- also note how many rows are produced: len(#string+(#n-1))
-- lastly, you can trim as needed when padding=1
------------------------------------------------------------
-- (2) With a variable
------------------------------------------------------------
-- note, in this example I am getting only the stuff that has three letters
DECLARE #string varchar(20) = 'abcdefg',
#tokenLen tinyint = 3;
SELECT [sequence], token
FROM dbo.nGrams8K('abcdefg',3,1)
WHERE len(ltrim(token)) = 3;
GO
------------------------------------------------------------
-- (3) An on-the-fly alphabet (this will come in handy in a moment)
------------------------------------------------------------
DECLARE #alphabet VARCHAR(26)='ABCDEFGHIJKLMNOPQRSTUVWXYZ';
SELECT [sequence], token
FROM dbo.nGrams8K(#alphabet,1,0);
GO
------------------------------------------------------------
-- (4) Character Count
------------------------------------------------------------
DECLARE #string VARCHAR(100)='The quick green fox jumps over the lazy dog and the lazy dog just laid there.',
#alphabet VARCHAR(26)='ABCDEFGHIJKLMNOPQRSTUVWXYZ';
SELECT a.token, COUNT(b.token) ttl
FROM dbo.nGrams8K(#alphabet,1,0) a
LEFT JOIN dbo.nGrams8K(#string,1,0) b ON a.token=b.token
GROUP BY a.token
ORDER BY a.token;
GO
------------------------------------------------------------
-- (5) Locate the start position of a search pattern
------------------------------------------------------------
;-- (A) note these queries:
DECLARE #string varchar(100)='THE QUICK Green FOX JUMPED OVER THE LAZY DOGS BACK';
-- (i)
SELECT * FROM dbo.nGrams8K(#string,1,0) a;
-- (ii) note this query:
SELECT * FROM dbo.nGrams8K(#string,1,0) a WHERE [token]=' ';
-- (B) and now the word count (#string included for presentation)
SELECT #string AS string,
count(*)+1 AS words
FROM dbo.nGrams8K(#string,1,0) a
WHERE [token]=' '
GO
------------------------------------------------------------
-- (6) search for the number of occurances of a word
------------------------------------------------------------
DECLARE #string VARCHAR(100)='The quick green fox jumps over the lazy dog and the lazy dog just laid there.',
#alphabet VARCHAR(26)='ABCDEFGHIJKLMNOPQRSTUVWXYZ',
#searchString VARCHAR(100)='The';
-- (5a) by location
SELECT sequence-(LEN(#searchstring)) AS location,
token AS searchString
FROM dbo.nGrams8K(#string,LEN(#searchstring+' ')+1,0) b
WHERE token=#searchString;
-- (2b) get total
SELECT #string AS string,
#searchString AS searchString,
COUNT(*) AS ttl
FROM dbo.nGrams8K(#string,LEN(#searchstring+' ')+1,0) b
WHERE token=#searchString;
------------------------------------------------------------
-- (7) Special SubstringBefore and SubstringAfter
------------------------------------------------------------
-- (7a) SubstringBeforeSSI (note: SSI = substringIndex)
ALTER FUNCTION dbo.SubstringBeforeSSI
(
#string varchar(1000),
#substring varchar(100),
#substring_index tinyint
)
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN
WITH get_pos AS
(
SELECT rn = row_number() over (order by sequence), substring_index = sequence
FROM dbo.nGrams8K(#string,len(#substring),1)
WHERE token=#substring
)
SELECT newstring = substring(#string,1,substring_index-len(#substring))
FROM get_pos
WHERE rn=#substring_index;
GO
DECLARE #string varchar(1000)='10.0.1600.22',
#searchPattern varchar(100)='.',
#substring_index tinyint = 3;
SELECT * FROM dbo.SubstringBeforeSSI(#string,#searchPattern,#substring_index);
GO
-- (7b) SubstringBeforeSSI (note: SSI = substringIndex)
ALTER FUNCTION dbo.SubstringAfterSSI
(
#string varchar(1000),
#substring varchar(100),
#substring_index tinyint
)
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN
WITH get_pos AS
(
SELECT rn = row_number() over (order by sequence), substring_index = sequence
FROM dbo.nGrams8K(#string,len(#substring),1)
WHERE token=#substring
)
SELECT newstring = substring(#string,substring_index+1,8000)
FROM get_pos
WHERE rn=#substring_index;
GO
DECLARE #string varchar(1000)='<notes id="1">blah, blah, blah</notes><notes id="2">More Notes</notes>',
#searchPattern varchar(100)='</notes>',
#substring_index tinyint = 1;
SELECT #string, *
FROM dbo.SubstringAfterSSI(#string,#searchPattern,#substring_index);
------------------------------------------------------------
-- (8) Strip non-numeric characters from a string
------------------------------------------------------------
-- (8a) create the function
ALTER FUNCTION StripNonNumeric_itvf(#OriginalText VARCHAR(8000))
RETURNS TABLE
--WITH SCHEMABINDING
AS
return
WITH ngrams AS
(
SELECT n = [sequence], c = token
FROM dbo.nGrams8K(#OriginalText,1,1)
),
clean_txt(CleanedText) AS
(
SELECT c+''
FROM ngrams
WHERE ascii(substring(#OriginalText,n,1)) BETWEEN 48 AND 57
FOR XML PATH('')
)
SELECT CleanedText
FROM clean_txt;
GO
-- (8b) use against a value or variable
SELECT CleanedText
FROM dbo.StripNonNumeric_itvf('value123');
-- (8c) use against a table
-- test harness:
IF OBJECT_ID('tempdb..#strings') IS NOT NULL DROP TABLE #strings;
WITH strings AS
(
SELECT TOP (100000) string = newid()
FROM sys.all_columns a CROSS JOIN sys.all_columns b
)
SELECT *
INTO #strings
FROM strings;
GO
-- query (returns 100K rows every 3 seconds on my pc):
SELECT CleanedText
FROM #strings
CROSS APPLY dbo.StripNonNumeric_itvf(string);
------------------------------------------------------------
-- (9) A couple complex String Algorithms
------------------------------------------------------------
-- (9a) hamming distance between two strings:
DECLARE #string1 varchar(8000) = 'xxxxyyyzzz',
#string2 varchar(8000) = 'xxxxyyzzzz';
SELECT string1 = #string1,
string2 = #string2,
hamming_distance = count(*)
FROM dbo.nGrams8K(#string1,1,0) s1
CROSS APPLY dbo.nGrams8K(#string2,1,0) s2
WHERE s1.sequence = s2.sequence
AND s1.token <> s2.token
GO
-- (9b) inner join between 2 strings
--(can be used to speed up other string metrics such as the longest common subsequence)
DECLARE #string1 varchar(100)='xxxx123yyyy456zzzz',
#string2 varchar(100)='xx789yy000zz';
WITH
s1(string1) AS
(
SELECT [token]+''
FROM dbo.nGrams8K(#string1,1,0)
WHERE charindex([token],#string2)<>0
ORDER BY [sequence]
FOR XML PATH('')
),
s2(string2) AS
(
SELECT [token]+''
FROM dbo.nGrams8K(#string2,1,0)
WHERE charindex([token],#string1)<>0
ORDER BY [sequence]
FOR XML PATH('')
)
SELECT string1, string2
FROM s1
CROSS APPLY s2;
------------------------------------------------------------
-- (10) Advanced Substring Metrics
------------------------------------------------------------
-- (10a) Identify common substrings and their location
DECLARE #string1 varchar(100) = 'xxx yyy zzz',
#string2 varchar(100) = 'xx yyy zz';
-- (i) review the two strings
SELECT str1 = #string1,
str2 = #string2;
-- (ii) the results
WITH
iTally AS
(
SELECT n
FROM dbo.tally t
WHERE n<= len(#string1)
),
distinct_tokens AS
(
SELECT ng1 = ng1.token, ng2 = ng2.token --= ltrim(ng1.token), ng2 = ltrim(ng2.token)
FROM itally
CROSS APPLY dbo.nGrams8K(#string1,n,1) ng1
CROSS APPLY dbo.nGrams8K(#string2,n,1) ng2
WHERE ng1.token=ng2.token
)
SELECT ss_txt = ng1,
ss_len = len(ng1),
str1_loc = charindex(ng1,#string1),
str2_loc = charindex(ng2,#string2)
FROM distinct_tokens
WHERE ng1<>'' AND charindex(ng1,#string1)+charindex(ng2,#string2)<>0
GROUP BY ng1, ng2
ORDER BY charindex(ng1,#string1), charindex(ng2,#string2), len(ng1);
-- (10b) Longest common substring function
-- (i) function
IF EXISTS
( SELECT * FROM INFORMATION_SCHEMA.ROUTINES
WHERE ROUTINE_SCHEMA='dbo' AND ROUTINE_NAME = 'lcss')
DROP FUNCTION dbo.lcss;
GO
CREATE FUNCTION dbo.lcss(#string1 varchar(100), #string2 varchar(100))
RETURNS TABLE
AS
RETURN
SELECT TOP (1) with ties token
FROM dbo.tally
CROSS APPLY dbo.nGrams8K(#string1,n,1)
WHERE n <= len(#string1)
AND charindex(token, #string2) > 0
ORDER BY len(token) DESC;
GO
-- (ii) example of use
DECLARE #string1 varchar(100) = '000xxxyyyzzz',
#string2 varchar(100) = '999xxyyyzaa';
SELECT string1 = #string1,
string2 = #string2,
token
FROM dbo.lcss(#string1, #string2);
*/
GO
You'd have to repeat this statement:
insert into trigram_table ( Trigram, ID )
select substr( translate( lower( Name ), ' ', '_' ), :X, :N ),
ID
from db_table
for all :X from 1 to Len(Name) + 1 - :N
You will also have to extend the translate function for all the other special characters you'd want to convert to an underscore. Right now it's just translating a blank into an underscore.
For performance you could do the translate and lower functions on the Trigram column in a last pass on the trigram_table so you're not doing those functions for each :X.

The most elegant way to generate permutations in SQL server

Given a the following table:
Index | Element
---------------
1 | A
2 | B
3 | C
4 | D
We want to generate all the possible permutations (without repetitions) using the elements.
the final result (skipping some rows) will look like this:
Results
----------
ABCD
ABDC
ACBD
ACDB
ADAC
ADCA
...
DABC
DACB
DBCA
DBAC
DCAB
DCBA
(24 Rows)
How would you do it?
After making some perhaps snarky comments, this problem stuck in my brain all evening, and I eventually came up with the following set-based approach. I believe it definitely qualifies as "elegant", but then I also think it qualifies as "kinda dumb". You make the call.
First, set up some tables:
-- For testing purposes
DROP TABLE Source
DROP TABLE Numbers
DROP TABLE Results
-- Add as many rows as need be processed--though note that you get N! (number of rows, factorial) results,
-- and that gets big fast. The Identity column must start at 1, or the algorithm will have to be adjusted.
-- Element could be more than char(1), though the algorithm would have to be adjusted again, and each element
-- must be the same length.
CREATE TABLE Source
(
SourceId int not null identity(1,1)
,Element char(1) not null
)
INSERT Source (Element) values ('A')
INSERT Source (Element) values ('B')
INSERT Source (Element) values ('C')
INSERT Source (Element) values ('D')
--INSERT Source (Element) values ('E')
--INSERT Source (Element) values ('F')
-- This is a standard Tally table (or "table of numbers")
-- It only needs to be as long as there are elements in table Source
CREATE TABLE Numbers (Number int not null)
INSERT Numbers (Number) values (1)
INSERT Numbers (Number) values (2)
INSERT Numbers (Number) values (3)
INSERT Numbers (Number) values (4)
INSERT Numbers (Number) values (5)
INSERT Numbers (Number) values (6)
INSERT Numbers (Number) values (7)
INSERT Numbers (Number) values (8)
INSERT Numbers (Number) values (9)
INSERT Numbers (Number) values (10)
-- Results are iteratively built here. This could be a temp table. An index on "Length" might make runs
-- faster for large sets. Combo must be at least as long as there are characters to be permuted.
CREATE TABLE Results
(
Combo varchar(10) not null
,Length int not null
)
Here's the routine:
SET NOCOUNT on
DECLARE
#Loop int
,#MaxLoop int
-- How many elements there are to process
SELECT #MaxLoop = max(SourceId)
from Source
-- Initialize first value
TRUNCATE TABLE Results
INSERT Results (Combo, Length)
select Element, 1
from Source
where SourceId = 1
SET #Loop = 2
-- Iterate to add each element after the first
WHILE #Loop <= #MaxLoop
BEGIN
-- See comments below. Note that the "distinct" remove duplicates, if a given value
-- is to be included more than once
INSERT Results (Combo, Length)
select distinct
left(re.Combo, #Loop - nm.Number)
+ so.Element
+ right(re.Combo, nm.Number - 1)
,#Loop
from Results re
inner join Numbers nm
on nm.Number <= #Loop
inner join Source so
on so.SourceId = #Loop
where re.Length = #Loop - 1
-- For performance, add this in if sets will be large
--DELETE Results
-- where Length <> #Loop
SET #Loop = #Loop + 1
END
-- Show results
SELECT *
from Results
where Length = #MaxLoop
order by Combo
The general idea is: when adding a new element (say "B") to any string (say, "A"), to catch all permutations you would add B
to all possible positions (Ba, aB), resulting in a new set of strings. Then iterate: Add a new element (C) to each position in a string
(AB becomes Cab, aCb, abC), for all strings (Cba, bCa, baC), and you have the set of permutations. Iterate over each result set with
the next character until you run out of characters... or resources. 10 elements is 3.6 million permutations, roughly 48MB with the above algorithm, and 14 (unique) elements would hit 87 billion permutations and 1.163 terabytes.
I'm sure it could eventually be wedged into a CTE, but in the end all that would be is a glorified loop. The logic
is clearer this way, and I can't help but think the CTE execution plan would be a nightmare.
DECLARE #s VARCHAR(5);
SET #s = 'ABCDE';
WITH Subsets AS (
SELECT CAST(SUBSTRING(#s, Number, 1) AS VARCHAR(5)) AS Token,
CAST('.'+CAST(Number AS CHAR(1))+'.' AS VARCHAR(11)) AS Permutation,
CAST(1 AS INT) AS Iteration
FROM dbo.Numbers WHERE Number BETWEEN 1 AND 5
UNION ALL
SELECT CAST(Token+SUBSTRING(#s, Number, 1) AS VARCHAR(5)) AS Token,
CAST(Permutation+CAST(Number AS CHAR(1))+'.' AS VARCHAR(11)) AS
Permutation,
s.Iteration + 1 AS Iteration
FROM Subsets s JOIN dbo.Numbers n ON s.Permutation NOT LIKE
'%.'+CAST(Number AS CHAR(1))+'.%' AND s.Iteration < 5 AND Number
BETWEEN 1 AND 5
--AND s.Iteration = (SELECT MAX(Iteration) FROM Subsets)
)
SELECT * FROM Subsets
WHERE Iteration = 5
ORDER BY Permutation
Token Permutation Iteration
----- ----------- -----------
ABCDE .1.2.3.4.5. 5
ABCED .1.2.3.5.4. 5
ABDCE .1.2.4.3.5. 5
(snip)
EDBCA .5.4.2.3.1. 5
EDCAB .5.4.3.1.2. 5
EDCBA .5.4.3.2.1. 5
first posted a while ago here
However, it would be better to do it in a better language such as C# or C++.
Just using SQL, without any code, you could do it if you can crowbar yourself another column into the table. Clearly you need to have one joined table for each of the values to be permuted.
with llb as (
select 'A' as col,1 as cnt union
select 'B' as col,3 as cnt union
select 'C' as col,9 as cnt union
select 'D' as col,27 as cnt
)
select a1.col,a2.col,a3.col,a4.col
from llb a1
cross join llb a2
cross join llb a3
cross join llb a4
where a1.cnt + a2.cnt + a3.cnt + a4.cnt = 40
Am I correctly understanding that you built Cartesian product n x n x n x n, and then filter out unwanted stuff? The alternative would be generating all the numbers up to n! and then using factorial number system to map them via element encoding.
Simpler than a recursive CTE:
declare #Number Table( Element varchar(MAX), Id varchar(MAX) )
Insert Into #Number Values ( 'A', '01')
Insert Into #Number Values ( 'B', '02')
Insert Into #Number Values ( 'C', '03')
Insert Into #Number Values ( 'D', '04')
select a.Element, b.Element, c.Element, d.Element
from #Number a
join #Number b on b.Element not in (a.Element)
join #Number c on c.Element not in (a.Element, b.Element)
join #Number d on d.Element not in (a.Element, b.Element, c.Element)
order by 1, 2, 3, 4
For an arbitrary number of elements, script it out:
if object_id('tempdb..#number') is not null drop table #number
create table #number (Element char(1), Id int, Alias as '_'+convert(varchar,Id))
insert #number values ('A', 1)
insert #number values ('B', 2)
insert #number values ('C', 3)
insert #number values ('D', 4)
insert #number values ('E', 5)
declare #sql nvarchar(max)
set #sql = '
select '+stuff((
select char(13)+char(10)+'+'+Alias+'.Element'
from #number order by Id for xml path (''), type
).value('.','NVARCHAR(MAX)'),3,1,' ')
set #sql += '
from #number '+(select top 1 Alias from #number order by Id)
set #sql += (
select char(13)+char(10)+'join #number '+Alias+' on '+Alias+'.Id not in ('
+stuff((
select ', '+Alias+'.Id'
from #number b where a.Id > b.Id
order by Id for xml path ('')
),1,2,'')
+ ')'
from #number a where Id > (select min(Id) from #number)
order by Element for xml path (''), type
).value('.','NVARCHAR(MAX)')
set #sql += '
order by 1'
print #sql
exec (#sql)
To generate this:
select
_1.Element
+_2.Element
+_3.Element
+_4.Element
+_5.Element
from #number _1
join #number _2 on _2.Id not in (_1.Id)
join #number _3 on _3.Id not in (_1.Id, _2.Id)
join #number _4 on _4.Id not in (_1.Id, _2.Id, _3.Id)
join #number _5 on _5.Id not in (_1.Id, _2.Id, _3.Id, _4.Id)
order by 1
This method uses a binary mask to select the correct rows:
;with src(t,n,p) as (
select element, index, power(2,index-1)
from table
)
select s1.t+s2.t+s3.t+s4.t
from src s1, src s2, src s3, src s4
where s1.p+s2.p+s3.p+s4.p=power(2,4)-1
My original post:
declare #t varchar(4) = 'ABCD'
;with src(t,n,p) as (
select substring(#t,1,1),1,power(2,0)
union all
select substring(#t,n+1,1),n+1,power(2,n)
from src
where n < len(#t)
)
select s1.t+s2.t+s3.t+s4.t
from src s1, src s2, src s3, src s4
where s1.p+s2.p+s3.p+s4.p=power(2,len(#t))-1
This is one of those problems that haunts you. I liked the simplicity of my original answer but there was this issue where I was still building all the possible solutions and then selecting the correct ones. One more try to make this process more efficient by only building the solutions that were correct yielded this answer. Add a character to the string only if that character didn't exist in the string. Patindex seemed like the perfect companion for a CTE solution. Here it is.
declare #t varchar(10) = 'ABCDEFGHIJ'
;with s(t,n) as (
select substring(#t,1,1),1
union all
select substring(#t,n+1,1),n+1
from s where n<len(#t)
)
,j(t) as (
select cast(t as varchar(10)) from s
union all
select cast(j.t+s.t as varchar(10))
from j,s where patindex('%'+s.t+'%',j.t)=0
)
select t from j where len(t)=len(#t)
I was able to build all 3.6 million solutions in 3 minutes and 2 seconds. Hopefully this solution will not get missed just because it's not the first.
Current solution using a recursive CTE.
-- The base elements
Declare #Number Table( Element varchar(MAX), Id varchar(MAX) )
Insert Into #Number Values ( 'A', '01')
Insert Into #Number Values ( 'B', '02')
Insert Into #Number Values ( 'C', '03')
Insert Into #Number Values ( 'D', '04')
-- Number of elements
Declare #ElementsNumber int
Select #ElementsNumber = COUNT(*)
From #Number;
-- Permute!
With Permutations( Permutation, -- The permutation generated
Ids, -- Which elements where used in the permutation
Depth ) -- The permutation length
As
(
Select Element,
Id + ';',
Depth = 1
From #Number
Union All
Select Permutation + ' ' + Element,
Ids + Id + ';',
Depth = Depth + 1
From Permutations,
#Number
Where Depth < #ElementsNumber And -- Generate only the required permutation number
Ids Not like '%' + Id + ';%' -- Do not repeat elements in the permutation (this is the reason why we need the 'Ids' column)
)
Select Permutation
From Permutations
Where Depth = #ElementsNumber
Assuming your table is named Elements and has 4 rows, this is as simple as:
select e1.Element + e2.Element + e3.Element + e4.Element
from Elements e1
join Elements e2 on e2.Element != e1.Element
join Elements e3 on e3.Element != e2.Element AND e3.Element != e1.Element
join Elements e4 on e4.Element != e3.Element AND e4.Element != e2.Element AND e4.Element != e1.Element
Way too much rust on my SQL skills, but i took a different tack for a similar problem and thought it worth sharing.
Table1 - X strings in a single field Uno
Table2 - Y strings in a single field Dos
(SELECT Uno, Dos
FROM Table1
CROSS JOIN Table2 ON 1=1)
UNION
(SELECT Dos, Uno
FROM Table1
CROSS JOIN Table2 ON 1=1)
Same principle for 3 tables with an added CROSS JOIN
(SELECT Tres, Uno, Dos
FROM Table1
CROSS JOIN Table2 ON 1=1
CROSS JOIN Table3 ON 1=1)
although it takes 6 cross-join sets in the union.
--Hopefully this is a quick solution, just change the values going into #X
IF OBJECT_ID('tempdb.dbo.#X', 'U') IS NOT NULL DROP TABLE #X; CREATE table #X([Opt] [nvarchar](10) NOT NULL)
Insert into #X values('a'),('b'),('c'),('d')
declare #pSQL NVarChar(max)='select * from #X X1 ', #pN int =(select count(*) from #X), #pC int = 0;
while #pC<#pN begin
if #pC>0 set #pSQL = concat(#pSQL,' cross join #X X', #pC+1);
set #pC = #pC +1;
end
execute(#pSQL)
--or as single column result
IF OBJECT_ID('tempdb.dbo.#X', 'U') IS NOT NULL DROP TABLE #X; CREATE table #X([Opt] [nvarchar](10) NOT NULL)
Insert into #X values('a'),('b'),('c'),('d')
declare #pSQL NVarChar(max)=' as R from #X X1 ',#pSelect NVarChar(Max)=' ',#pJoin NVarChar(Max)='', #pN int =(select count(*) from #X), #pC int = 0;
while #pC<#pN begin
if #pC>0 set #pJoin = concat(#pJoin ,' cross join #X X', #pC+1) set #pSelect = concat(#pSelect ,'+ X', #pC+1,'.Opt ')
set #pC = #pC +1;
end
set #pSQL = concat ('select X1.Opt', #pSelect,#pSQL ,#pJoin)
exec(#pSQL)
create function GeneratePermutations (#string nvarchar(4000))
RETURNS #Permutations
TABLE(
name nVARCHAR(500)
)
AS
begin
declare #SplitedString table(name nvarchar(500))
insert into #SplitedString
select *
from string_split(#string,' ')
declare #CountOfWords as int
set #CountOfWords = (select count(*) from #SplitedString)
;with cte_Permutations (name, level) as (
select convert(nvarchar(500), name), 1 as level from #SplitedString
union all
select convert(nvarchar(500),splited.name+','+cte_Permutations.name),level+1
from #SplitedString splited ,cte_Permutations
where level < #CountOfWords
)
insert into #Permutations
select name
from cte_Permutations
where level = #CountOfWords
order by name
return
end
select *
From (
select 1 id,'a b c' msg
union all
select 2 id,'d e' msg
) p
cross apply dbo.GeneratePermutations(p.msg)