SQL: Find rows where Column contains all of the given words

I have a column EntityName, and I want users to be able to search names by entering words separated by spaces. The space is implicitly treated as an 'AND' operator, meaning that the returned rows must contain all of the specified words, though not necessarily in the given order.
For example, if we have rows like these:
abba nina pretty balerina
acdc you shook me all night long
sth you are me
dream theater it's all about you
when the user enters: me you, or you me (the results must be equivalent), the result has rows 2 and 3.
I know I can go like:
WHERE Col1 LIKE '%' + word1 + '%'
AND Col1 LIKE '%' + word2 + '%'
but I wanted to know if there's some more optimal solution.
CONTAINS would require a full-text index, which (for various reasons) is not an option.
Maybe SQL Server 2008 has some built-in, semi-hidden solution for these cases?

The only thing I can think of is to write a CLR function that does the LIKE comparisons. This should be many times faster.
Update: Now that I think about it, it makes sense that CLR would not help. Two other ideas:
1 - Try indexing Col1 and do this:
WHERE (Col1 LIKE word1 + '%' or Col1 LIKE '%' + word1 + '%')
AND (Col1 LIKE word2 + '%' or Col1 LIKE '%' + word2 + '%')
Depending on the most common searches (starts with vs. substring), this may offer an improvement.
2 - Add your own full text indexing table where each word is a row in the table. Then you can index properly.
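A minimal sketch of idea 2, assuming a hypothetical EntityWords table (all names here are illustrative, not from the original):
CREATE TABLE EntityWords
(
    EntityId INT NOT NULL,          -- points back at the row that owns the word
    Word VARCHAR(100) NOT NULL,
    PRIMARY KEY (Word, EntityId)    -- word-first key turns word lookups into index seeks
);
-- all entities containing both 'me' and 'you'
SELECT EntityId
FROM EntityWords
WHERE Word IN ('me', 'you')
GROUP BY EntityId
HAVING COUNT(DISTINCT Word) = 2;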

Function
CREATE FUNCTION [dbo].[fnSplit] ( @sep CHAR(1), @str VARCHAR(512) )
RETURNS TABLE AS
RETURN (
    WITH Pieces(pn, start, stop) AS (
        SELECT 1, 1, CHARINDEX(@sep, @str)
        UNION ALL
        SELECT pn + 1, stop + 1, CHARINDEX(@sep, @str, stop + 1)
        FROM Pieces
        WHERE stop > 0
    )
    SELECT
        pn AS Id,
        SUBSTRING(@str, start, CASE WHEN stop > 0 THEN stop - start ELSE 512 END) AS Data
    FROM
        Pieces
)
Query
DECLARE @FilterTable TABLE (Data VARCHAR(512))

INSERT INTO @FilterTable (Data)
SELECT DISTINCT S.Data
FROM dbo.fnSplit(' ', 'word1 word2 word3') S -- contains the search words

SELECT DISTINCT
    T.*
FROM
    MyTable T
    INNER JOIN @FilterTable F1 ON T.Col1 LIKE '%' + F1.Data + '%'
    LEFT JOIN @FilterTable F2 ON T.Col1 NOT LIKE '%' + F2.Data + '%'
WHERE
    F2.Data IS NULL
Source: SQL SELECT WHERE field contains words

http://msdn.microsoft.com/en-us/magazine/cc163473.aspx

You're going to end up with a full table scan anyway.
The collation can apparently make a big difference. Kalen Delaney, in the book "Microsoft SQL Server 2008 Internals", says:
"Collation can make a huge difference when SQL Server has to look at almost all characters in the strings. For instance, look at the following:
SELECT COUNT(*) FROM tbl WHERE longcol LIKE '%abc%'
This may execute 10 times faster or more with a binary collation than a nonbinary Windows collation. And with varchar data, this executes up to seven or eight times faster with a SQL collation than with a Windows collation."
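You can force a binary collation on just the predicate rather than changing the column; a sketch (Latin1_General_BIN is one common choice, and note that a binary collation makes the match case-sensitive, so search terms may need normalizing):
SELECT COUNT(*)
FROM tbl
WHERE longcol COLLATE Latin1_General_BIN LIKE '%abc%'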

WITH Tokens AS (SELECT 'you' AS Token UNION ALL SELECT 'me')
SELECT ...
FROM YourTable AS t
WHERE (SELECT COUNT(*) FROM Tokens WHERE t.Col1 LIKE '%' + Tokens.Token + '%')
      =
      (SELECT COUNT(*) FROM Tokens);

This should ideally be done with the help of full text search, as mentioned above.
BUT,
if you don't have full text configured for your DB, here is a performance-intensive solution for doing a prioritized string search.
-- table to search in
drop table if exists dbo.myTable;
go
CREATE TABLE dbo.myTable
(
myTableId int NOT NULL IDENTITY (1, 1),
code varchar(200) NOT NULL,
description varchar(200) NOT NULL -- this column contains the values we are going to search in
) ON [PRIMARY]
GO
-- function to split space separated search string into individual words
drop function if exists [dbo].[fnSplit];
go
CREATE FUNCTION [dbo].[fnSplit] (@StringInput nvarchar(max),
@Delimiter nvarchar(1))
RETURNS @OutputTable TABLE (
id nvarchar(1000)
)
AS
BEGIN
    DECLARE @String nvarchar(100);
    WHILE LEN(@StringInput) > 0
    BEGIN
        SET @String = LEFT(@StringInput, ISNULL(NULLIF(CHARINDEX(@Delimiter, @StringInput) - 1, -1),
                           LEN(@StringInput)));
        SET @StringInput = SUBSTRING(@StringInput,
                           ISNULL(NULLIF(CHARINDEX(@Delimiter, @StringInput), 0), LEN(@StringInput)) + 1,
                           LEN(@StringInput));
        INSERT INTO @OutputTable (id)
        VALUES (@String);
    END;
    RETURN;
END;
GO
-- this is the search script which can be optionally converted to a stored procedure /function
declare @search varchar(max) = 'infection upper acute genito'; -- enter your search string here
-- the searched string above should give rows containing the following
-- infection in upper side with acute genitointestinal tract
-- acute infection in upper teeth
-- acute genitointestinal pain
if (len(trim(@search)) = 0) -- if search string is empty, just return records ordered alphabetically
begin
select 1 as Priority, myTableid, code, Description from myTable order by Description
return;
end
declare @splitTable Table(
    wordRank int Identity(1,1), -- individual words are assigned a priority (in order of occurrence/position)
    word varchar(200)
)
declare @nonWordTable Table( -- table to trim out auxiliary verbs, prepositions etc. from the search
    id varchar(200)
)
insert into @nonWordTable values
('of'),
('with'),
('at'),
('in'),
('for'),
('on'),
('by'),
('like'),
('up'),
('off'),
('near'),
('is'),
('are'),
(','),
(':'),
(';')
insert into @splitTable
select id from dbo.fnSplit(@search, ' '); -- this function returns a table with one row per space-separated word of the search; for this example the output will be -
-- id
-------------
-- infection
-- upper
-- acute
-- genito
delete s from @splitTable s join @nonWordTable n on s.word = n.id; -- trimming out non-words here
declare @countOfSearchStrings int = (select count(word) from @splitTable); -- count of space-separated words in the search
declare @highestPriority int = POWER(@countOfSearchStrings, 3);
with plainMatches as
(
select myTableid, @highestPriority as Priority from myTable where Description like @search -- exact matches have highest priority
union
select myTableid, @highestPriority-1 as Priority from myTable where Description like @search + '%' -- then with something at the end
union
select myTableid, @highestPriority-2 as Priority from myTable where Description like '%' + @search -- then with something at the beginning
union
select myTableid, @highestPriority-3 as Priority from myTable where Description like '%' + @search + '%' -- then if the search string falls somewhere in between
),
splitWordMatches as( -- give each searched word a rank based on its position in the searched string
-- and calculate its char index in the field to search
select myTable.myTableid, (@countOfSearchStrings - s.wordRank) as Priority, s.word,
wordIndex = CHARINDEX(s.word, myTable.Description) from myTable join @splitTable s on myTable.Description like '%' + s.word + '%'
-- and not exists(select myTableid from plainMatches p where p.myTableId = myTable.myTableId) -- need not look into myTables that have already been found in plainmatches as they are highest ranked
-- this one takes a long time though, so commenting it, will have no impact on the result
),
matchingRowsWithAllWords as (
select myTableid, count(myTableid) as myTableCount from splitWordMatches group by(myTableid) having count(myTableid) = @countOfSearchStrings
)
, -- trim off the CTE here if you don't care about the ordering of words to be considered for priority
wordIndexRatings as( -- reverse the char indexes retrieved above so that words occurring earlier have higher weight
-- and then normalize them to sequential values
select s.myTableid, Priority, word, ROW_NUMBER() over (partition by s.myTableid order by wordindex desc) as comparativeWordIndex
from splitWordMatches s join matchingRowsWithAllWords m on s.myTableId = m.myTableId
)
,
wordIndexSequenceRatings as ( -- need to do this to ensure that if the same set of words from search string is found in two rows,
-- their sequence in the field value is taken into account for higher priority
select w.myTableid, w.word, (w.Priority + w.comparativeWordIndex + coalesce(sequencedPriority, 0)) as Priority
from wordIndexRatings w left join
(
select w1.myTableid, w1.priority, w1.word, w1.comparativeWordIndex, count(w1.myTableid) as sequencedPriority
from wordIndexRatings w1 join wordIndexRatings w2 on w1.myTableId = w2.myTableId and w1.Priority > w2.Priority and w1.comparativeWordIndex>w2.comparativeWordIndex
group by w1.myTableid, w1.priority,w1.word, w1.comparativeWordIndex
)
sequencedPriority on w.myTableId = sequencedPriority.myTableId and w.Priority = sequencedPriority.Priority
),
prioritizedSplitWordMatches as ( -- this calculates the cumulative priority for a field value
select w1.myTableId, sum(w1.Priority) as OverallPriority from wordIndexSequenceRatings w1 join wordIndexSequenceRatings w2 on w1.myTableId = w2.myTableId
where w1.word <> w2.word group by w1.myTableid
),
completeSet as (
select myTableid, priority from plainMatches -- get plain matches which should be highest ranked
union
select myTableid, OverallPriority as priority from prioritizedSplitWordMatches -- get ranked split word matches (which are ordered based on word rank in search string and sequence)
),
maximizedCompleteSet as( -- set the priority of a field value = maximum priority for that field value
select myTableid, max(priority) as Priority from completeSet group by myTableId
)
select priority, myTable.myTableid , code, Description from maximizedCompleteSet m join myTable on m.myTableId = myTable.myTableId
order by Priority desc, Description -- order by priority desc to get highest rated items on top
--offset 0 rows fetch next 50 rows only -- optional paging

Related

output of replace SQL as a where clause in another SQL

Sorry for the rubbish title. I couldn't quite articulate my problem in a few words.
I have an SQL query that gives a list of ids separated by a pipe (|). I want to pass these ids into another SQL query as a where clause. I can use replace to convert the values from pipe-separated into comma-separated.
As an example the list of IDs might be
1|2|3|4
and using replace I get
1,2,3,4
select replace(value, '|', ',') from my_table;
If I try and pass this into another SQL where I want to look up these IDs I get an error
ORA-01722: invalid number
select * from my_table2 where id in (
select replace(value, '|', ',') from my_table);
Now I presume I need to cast the output to a number, but I don't want to cast the entire string to a number, just the numeric values within it.
How can I do this easily?
Thanks
This is a complicated expression, but you can do it with like and exists:
select *
from my_table2
where exists (select 1
from my_table t1
where '|' || value || '|' like '%|' || id || '|%'
);
However, you have a fundamental problem with the data structure in my_table. You should not be storing lists of anything -- and especially integer ids -- in a string. The proper SQL approach is to use a junction table, with one row per id. Oracle has other data structures such as nested tables, which can help with this.
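A junction-table layout might look like this (a sketch with hypothetical names, since the real tables aren't shown):
CREATE TABLE my_table_ids (
    parent_id NUMBER NOT NULL,  -- the row the list belonged to
    id        NUMBER NOT NULL,  -- one id per row instead of '1|2|3|4'
    PRIMARY KEY (parent_id, id)
);
-- the lookup then becomes a plain semi-join
SELECT t2.*
FROM my_table2 t2
WHERE t2.id IN (SELECT id FROM my_table_ids);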
There may be two cases: good and bad.
Bad case is your pipe-separated string is stored somewhere in the database and you cannot change this design to something meaningful. If so, you'll need to use like operator, something like this:
select t2.*
from my_table2 t2, my_table t
where '|' || t.value || '|' like '%|' || t2.id || '|%'
Good case is when this pipe-separated list isn't persistent and is produced by the first SQL. If so, you should just remove the garbage: skip the pipe-concatenation and the collapsing into one row. Make the inner SQL return a result set of the required IDs, one per row, and use something like
select t2.*
from my_table2 t2
where t2.id in (select id from ...)
An additional case is when this list is a parameter value transferred from the client. Some developers use this approach to implement filters etc. If so, you should change the client to transfer something better, say, a table of numbers. The SQL would be like
select t2.*
from my_table2 t2
where t2.id in (select column_value from table(cast(:param as NumberTable)))
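For that to work, a collection type has to exist in the schema; something like this (a sketch, assuming Oracle):
CREATE OR REPLACE TYPE NumberTable AS TABLE OF NUMBER;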
We do it the following way. We have one function that splits a string and returns a table. The code is T-SQL, but I think you can easily change it to Oracle SQL.
CREATE FUNCTION [dbo].[fStringToTable]
(
    @List NVARCHAR(MAX) ,
    @Splitter NVARCHAR(MAX)
)
RETURNS @ParsedList TABLE ( ID INT )
AS
BEGIN
    DECLARE @ID NVARCHAR(MAX) ,
            @Pos INT
    DECLARE @tbl TABLE ( ID INT )
    SET @List = LTRIM(RTRIM(@List)) + @Splitter
    SET @Pos = CHARINDEX(@Splitter, @List, 1)
    IF REPLACE(@List, @Splitter, '') <> ''
    BEGIN
        WHILE @Pos > 0
        BEGIN
            SET @ID = LTRIM(RTRIM(LEFT(@List, @Pos - 1)))
            IF @ID <> ''
            BEGIN
                INSERT INTO @tbl ( ID )
                SELECT ( @ID )
            END
            SET @List = RIGHT(@List, LEN(@List) - @Pos)
            SET @Pos = CHARINDEX(@Splitter, @List, 1)
        END
    END
    INSERT INTO @ParsedList
    SELECT ID
    FROM @tbl
    GROUP BY ID
    RETURN
END
And your select will be
select * from my_table2 where id in (
SELECT ID FROM [dbo].[fStringToTable]('1,2', ','));
See this http://www.adp-gmbh.ch/ora/plsql/coll/return_table.html

SQL Server 2012 T-SQL count number of words between elements of two sets

I have two sets of elements, let's say they are these words:
set 1: "nuclear", "fission", "dirty" and
set 2: "device", "explosive"
In my database, I have a text column (Description) which contains a sentence or two. I would like to find any records where Description contains both an element from set 1 followed by an element from set 2, where the two elements are separated by four words or less. For simplicity, counting (spaces-1) will count words between the two elements.
I'd prefer it if a solution didn't require the installation of anything like CLR functions for regular expression. Rather, if this could be done with a user-defined table function, it would make deployment simpler.
Does this sound possible?
It is possible, but I do not think it will perform well with millions of rows.
I have a solution here that handles about 10,000 rows in 2 sec and 100,000 rows in about 20 sec on our server. It also requires the famous DelimitedSplit8K SQL table function from SQLServerCentral:
DECLARE @set1 VARCHAR(MAX) = 'nuclear, fission, dirty';
DECLARE @set2 VARCHAR(MAX) = 'device, explosive';
WITH GetDistances AS
(
    SELECT
        DubID = ROW_NUMBER() OVER (PARTITION BY ID ORDER BY ID)
        , Distance = dbo.[cf_ValueSetDistance](s.Description, @set1, @set2)
        , s.ID
        , s.Description
    FROM #sentences s
    JOIN dbo.cf_DelimitedSplit8K(@set1, ',') s1 ON s.Description LIKE '%' + RTRIM(LTRIM(s1.Item)) + '%'
    JOIN dbo.cf_DelimitedSplit8K(@set2, ',') s2 ON s.Description LIKE '%' + RTRIM(LTRIM(s2.Item)) + '%'
) SELECT Distance, ID, Description FROM GetDistances WHERE DubID = 1 AND Distance BETWEEN 1 AND 4;
--10 000 rows: 2sec
--100 000 rows: 20sec
Test data generator
--DROP TABLE #sentences
CREATE TABLE #sentences
(
ID INT IDENTITY(1,1) PRIMARY KEY
, Description VARCHAR(100)
);
GO
--CREATE 10000 random sentences that are 100 chars long
SET NOCOUNT ON;
WHILE((SELECT COUNT(*) FROM #sentences) < 10000)
BEGIN
DECLARE @randomWord VARCHAR(100) = '';
SELECT TOP 100 @randomWord = @randomWord + ' ' + Item FROM dbo.cf_DelimitedSplit8K('nuclear fission dirty device explosive On the other hand, we denounce with righteous indignation and dislike men who are so beguiled and demoralized by the charms of pleasure of the moment, so blinded by desire, that they cannot foresee the pain and trouble that are bound to ensue; and equal blame belongs to those who fail in their duty through weakness of will, which is the same as saying through shrinking from toil and pain. These cases are perfectly simple and easy to distinguish. In a free hour, when our power of choice is untrammelled and when nothing prevents our being able to do what we like best, every pleasure is to be welcomed and every pain avoided. But in certain circumstances and owing to the claims of duty or the obligations of business it will frequently occur that pleasures have to be repudiated and annoyances accepted. The wise man therefore always holds in these matters to this principle of selection: he rejects pleasures to secure other greater pleasures, or else he endures pains to avoid worse pains', ' ') ORDER BY NEWID();
INSERT INTO #sentences
SELECT @randomWord
END
SET NOCOUNT OFF;
Function 1 - cf_ValueSetDistance
CREATE FUNCTION [dbo].[cf_ValueSetDistance]
(
    @value VARCHAR(MAX)
    , @compareSet1 VARCHAR(MAX)
    , @compareSet2 VARCHAR(MAX)
)
RETURNS INT
AS
BEGIN
    SET @value = REPLACE(REPLACE(REPLACE(@value, '.', ''), ',', ''), '?', '');
    DECLARE @distance INT;
    DECLARE @sentence TABLE( WordIndex INT, Word VARCHAR(MAX) );
    DECLARE @set1 TABLE( Word VARCHAR(MAX) );
    DECLARE @set2 TABLE( Word VARCHAR(MAX) );
    INSERT INTO @sentence
    SELECT ItemNumber, RTRIM(LTRIM(Item)) FROM dbo.cf_DelimitedSplit8K(@value, ' ')
    INSERT INTO @set1
    SELECT RTRIM(LTRIM(Item)) FROM dbo.cf_DelimitedSplit8K(@compareSet1, ',')
    IF(EXISTS(SELECT 1 FROM @sentence s JOIN @set1 s1 ON s.Word = s1.Word))
    BEGIN
        INSERT INTO @set2
        SELECT RTRIM(LTRIM(Item)) FROM dbo.cf_DelimitedSplit8K(@compareSet2, ',');
        IF(EXISTS(SELECT 1 FROM @sentence s JOIN @set2 s2 ON s.Word = s2.Word))
        BEGIN
            WITH Set1 AS (
                SELECT s.WordIndex, s.Word FROM @sentence s
                JOIN @set1 s1 ON s1.Word = s.Word
            ), Set2 AS
            (
                SELECT s.WordIndex, s.Word FROM @sentence s
                JOIN @set2 s2 ON s2.Word = s.Word
            )
            SELECT @distance = MIN(ABS(s2.WordIndex - s1.WordIndex)) FROM Set1 s1, Set2 s2
        END
    END
    RETURN @distance;
END
Function 2 - DelimitedSplit8K
(No need to even try to understand this code, this is an extremely fast function for splitting a string to a table, written by several talented people):
CREATE FUNCTION [dbo].[cf_DelimitedSplit8K]
(@pString VARCHAR(8000), @pDelimiter CHAR(1))
RETURNS TABLE WITH SCHEMABINDING AS
RETURN
--===== "Inline" CTE Driven "Tally Table" produces values from 0 up to 10,000...
-- enough to cover NVARCHAR(4000)
WITH E1(N) AS (
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
), --10E+1 or 10 rows
E2(N) AS (SELECT 1 FROM E1 a, E1 b), --10E+2 or 100 rows
E4(N) AS (SELECT 1 FROM E2 a, E2 b), --10E+4 or 10,000 rows max
cteTally(N) AS (--==== This provides the "base" CTE and limits the number of rows right up front
-- for both a performance gain and prevention of accidental "overruns"
SELECT TOP (ISNULL(DATALENGTH(@pString),0)) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
),
cteStart(N1) AS (--==== This returns N+1 (starting position of each "element" just once for each delimiter)
SELECT 1 UNION ALL
SELECT t.N+1 FROM cteTally t WHERE SUBSTRING(@pString,t.N,1) = @pDelimiter
),
cteLen(N1,L1) AS(--==== Return start and length (for use in substring)
SELECT s.N1,
ISNULL(NULLIF(CHARINDEX(@pDelimiter,@pString,s.N1),0)-s.N1,8000)
FROM cteStart s
)
--===== Do the actual split. The ISNULL/NULLIF combo handles the length for the final element when no delimiter is found.
SELECT ItemNumber = ROW_NUMBER() OVER(ORDER BY l.N1),
Item = SUBSTRING(#pString, l.N1, l.L1)
FROM cteLen l;
I don't know anything about performance, but this could be done with cross apply and two table variables.
--initialize word set data
DECLARE @set1 TABLE (wordFromSet VARCHAR(50))
DECLARE @set2 TABLE (wordFromSet VARCHAR(50))
INSERT INTO @set1 SELECT 'nuclear' UNION SELECT 'fission' UNION SELECT 'dirty'
INSERT INTO @set2 SELECT 'device' UNION SELECT 'explosive'
SELECT *
FROM MyTable m
CROSS APPLY
(
SELECT wordFromSet
,LEN(SUBSTRING(m.Description, 1, CHARINDEX(wordFromSet, m.Description))) - LEN(REPLACE(SUBSTRING(m.Description, 1, CHARINDEX(wordFromSet, m.Description)),' ', '')) AS WordPosition
FROM @set1
WHERE m.Description LIKE '%' + wordFromSet + '%'
) w1
CROSS APPLY
(
SELECT wordFromSet
,LEN(SUBSTRING(m.Description, 1, CHARINDEX(wordFromSet, m.Description))) - LEN(REPLACE(SUBSTRING(m.Description, 1, CHARINDEX(wordFromSet, m.Description)),' ', '')) AS WordPosition
FROM @set2
WHERE m.Description LIKE '%' + wordFromSet + '%'
) w2
WHERE w2.WordPosition - w1.WordPosition <= 4 -- the four-word threshold from the question
Essentially it will only return rows from MyTable that have at least one word from both sets, and for these rows it calculates each word's position by computing the difference in length between the substring that ends at the word's position and the same substring with spaces removed.
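To see the position trick in isolation (a standalone sketch, not part of the answer above):
DECLARE @d VARCHAR(100) = 'a dirty nuclear device';
-- the substring up to 'device' is 'a dirty nuclear d', which contains 3 spaces,
-- so 'device' is reported as word position 3 (counting from 0)
SELECT LEN(SUBSTRING(@d, 1, CHARINDEX('device', @d)))
     - LEN(REPLACE(SUBSTRING(@d, 1, CHARINDEX('device', @d)), ' ', '')) AS WordPosition;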
I am adding a new answer, even if my old one has been accepted and I can see you went for the "FULL TEXT INDEX".
I have looked at the answer @Louis gave, and I think it was clever to use "CROSS APPLY". His answer beats the performance of mine. The only problem is that his code will only compare from the first instance of a word. This made me want to try to combine his answer with the split function I used (DelimitedSplit8K from SQLServerCentral).
This results in a remarkable performance boost, I have tested this on 1 million rows, and the result was almost instant:
My old answer: 5 min
@Louis' answer: 2 min
New answer: 3 sec
This does not beat the "FULL TEXT INDEX" performance-wise, but it at least supports the word search combination you specified in a relatively effective way.
DECLARE @set1 TABLE (Word VARCHAR(50))
DECLARE @set2 TABLE (Word VARCHAR(50))
INSERT INTO @set1 SELECT 'nuclear' UNION SELECT 'fission' UNION SELECT 'dirty'
INSERT INTO @set2 SELECT 'device' UNION SELECT 'explosive'
SELECT * FROM #sentences s
CROSS APPLY
(
SELECT * FROM @set1 s1
JOIN dbo.cf_DelimitedSplit8K(s.Description, ' ') split ON split.Item = s1.Word
) s1
CROSS APPLY
(
SELECT * FROM @set2 s2
JOIN dbo.cf_DelimitedSplit8K(s.Description, ' ') split ON split.Item = s2.Word
) s2
WHERE ABS(s1.ItemNumber - s2.ItemNumber) <= 4;
Look at my old answer for the code for the dbo.cf_DelimitedSplit8K function.

Select statement that concatenates the first character after every '/' character in a column

So I am trying to write a query which, among other things, brings back the first character in a Varchar field, then returns the first character which appears after each / character throughout the rest of the field.
The field I am referring to will contain a group of last names, separated by a '/'. For example: Fischer-Costello/Korbell/Morrison/Pearson
For the above example, I would want my select statement to return: FKMP.
So far, I have only been able to get my code to return the first character + the first character after the FIRST (and only the first) '/' character.
So for the above example input, my select statement would return: FK
Here is the code that I have written so far:
select rp.CONTACT_ID, ra.TRADE_REP, c.FIRST_NAME, c.LAST_NAME,
UPPER(LEFT(FIRST_NAME, 1)) + SUBSTRING(c.first_name,CHARINDEX('/',c.first_name)+1,1) as al_1,
UPPER(LEFT(LAST_NAME, 1)) + SUBSTRING(c.LAST_name,CHARINDEX('/',c.LAST_name)+1,1) as al_2
from dbo.REP_ALIAS ra
inner join dbo.REP_PROFILE rp on rp.CONTACT_ID = ra.CONTACT_ID
inner join dbo.CONTACT c on rp.CONTACT_ID = c.CONTACT_ID
where
rp.CRD_NUMBER is null and
ra.TRADE_REP like '%DNK%' and
(c.LAST_NAME like '%/%' or c.FIRST_NAME like '%/%') and
ra.TRADE_FIRM in
(
'xxxxxxx',
'xxxxxxx'
)
If you read the code, it's obvious that I am attempting to perform the same concatenation on the first_name column as well. However, I realize that a solution which will work for the Last_name column (used in my example), will also work for the first_name column.
Thank you.
Some default values
DECLARE @List VARCHAR(50) = 'Fischer-Costello/Korbell/Morrison/Pearson'
DECLARE @SplitOn CHAR(1) = '/'
This area just splits the string into a list
DECLARE @RtnValue table
(
    Id int identity(1,1),
    Value nvarchar(4000)
)
While (Charindex(@SplitOn, @List) > 0)
Begin
    Insert Into @RtnValue (value)
    Select
        Value = ltrim(rtrim(Substring(@List, 1, Charindex(@SplitOn, @List) - 1)))
    Set @List = Substring(@List, Charindex(@SplitOn, @List) + len(@SplitOn + ',') - 1, len(@List))
End
Insert Into @RtnValue (Value)
Select Value = ltrim(rtrim(@List))
Now lets grab the first character of each name and stuff it back into a single variable
SELECT STUFF((SELECT SUBSTRING(VALUE,1,1) FROM @RtnValue FOR XML PATH('')),1,0,'') AS Value
Outputs:
Value
FKMP
Here is another way to do this that would be a lot faster than looping. What you need is a set-based splitter. Jeff Moden at SQL Server Central has one that is awesome. Here is a link to the article: http://www.sqlservercentral.com/articles/Tally+Table/72993/
Now I know you have to sign up for an account to view it, but it is free, and the logic in that article will change the way you look at data. You might also be able to find his code posted if you search for DelimitedSplit8K.
At any rate, here is how you could implement this type of splitter.
declare @Table table(ID int identity, SomeValue varchar(50))

insert @Table
select 'Fischer-Costello/Korbell/Morrison/Pearson'

select ID, STUFF((select '' + left(x.Item, 1)
                  from @Table t2
                  cross apply dbo.DelimitedSplit8K(SomeValue, '/') x
                  where t2.ID = t1.ID
                  for xml path('')), 1, 0, '') as MyResult
from @Table t1
group by t1.ID

replace value in varchar(max) field with join

I have a table that contains text field with placeholders. Something like this:
Row Notes
1. This is some notes ##placeholder130## this ##myPlaceholder##, #oneMore#. End.
2. Second row...just a ##test#.
(This table contains about 1-5k rows on average. Average number of placeholders in one row is 5-15).
Now, I have a lookup table that looks like this:
Name Value
placeholder130 Dog
myPlaceholder Cat
oneMore Cow
test Horse
(Lookup table will contain anywhere from 10k to 100k records)
I need to find the fastest way to join those placeholders from strings to a lookup table and replace with value. So, my result should look like this (1st row):
This is some notes Dog this Cat, Cow. End.
What I came up with was to split each row into multiple rows, one per placeholder, join those to the lookup table, and then concatenate the records back into the original row with the new values, but it takes around 10-30 seconds on average.
You could try to split the string using a numbers table and rebuild it with for xml path.
select (
select coalesce(L.Value, T.Value)
from Numbers as N
cross apply (select substring(Notes.notes, N.Number, charindex('##', Notes.notes + '##', N.Number) - N.Number)) as T(Value)
left outer join Lookup as L
on L.Name = T.Value
where N.Number <= len(notes) and
substring('##' + notes, Number, 2) = '##'
order by N.Number
for xml path(''), type
).value('text()[1]', 'varchar(max)')
from Notes
SQL Fiddle
I borrowed the string splitting from this blog post by Aaron Bertrand
SQL Server is not very fast with string manipulation, so this is probably best done client-side. Have the client load the entire lookup table, and replace the notes as they arrive.
Having said that, it can of course be done in SQL. Here's a solution with a recursive CTE. It performs one lookup per recursion step:
; with Repl as
(
select row_number() over (order by l.name) rn
, Name
, Value
from Lookup l
)
, Recurse as
(
select Notes
, 0 as rn
from Notes
union all
select replace(Notes, '##' + l.name + '##', l.value)
, r.rn + 1
from Recurse r
join Repl l
on l.rn = r.rn + 1
)
select *
from Recurse
where rn =
(
select count(*)
from Lookup
)
option (maxrecursion 0)
Example at SQL Fiddle.
Another option is a while loop to keep replacing lookups until no more are found:
declare @notes table (notes varchar(max))

insert @notes
select Notes
from Notes

while 1 = 1
begin
    update n
    set Notes = replace(n.Notes, '##' + l.name + '##', l.value)
    from @notes n
    outer apply
    (
        select top 1 Name
             , Value
        from Lookup l
        where n.Notes like '%##' + l.name + '##%'
    ) l
    where l.name is not null

    if @@rowcount = 0
        break
end

select *
from @notes
Example at SQL Fiddle.
I second the comment that T-SQL is just not suited for this operation, but if you must do it in the db, here is an example using a function to manage the multiple replace statements.
Since you have a relatively small number of tokens in each note (5-15) and a very large number of tokens overall (10k-100k), my function first extracts the potential tokens from the input and uses that set to join to your lookup (dbo.[Lookup] below). It was far too much work to look for an occurrence of any of your tokens in each note.
I did a bit of perf testing using 50k tokens and 5k notes and this function runs really well, completing in <2 seconds (on my laptop). Please report back how this strategy performs for you.
note: In your example data the token format was not consistent (##_#, ##_##, #_#), I am guessing this was simply a typo and assume all tokens take the form of ##TokenName##.
--setup
if object_id('dbo.[Lookup]') is not null
drop table dbo.[Lookup];
go
if object_id('dbo.fn_ReplaceLookups') is not null
drop function dbo.fn_ReplaceLookups;
go
create table dbo.[Lookup] (LookupName varchar(100) primary key, LookupValue varchar(100));
insert into dbo.[Lookup]
select '##placeholder130##','Dog' union all
select '##myPlaceholder##','Cat' union all
select '##oneMore##','Cow' union all
select '##test##','Horse';
go
create function [dbo].[fn_ReplaceLookups](@input varchar(max))
returns varchar(max)
as
begin
    declare @xml xml;
    select @xml = cast(('<r><i>' + replace(@input, '##', '</i><i>') + '</i></r>') as xml);
    --extract the potential tokens
    declare @LookupsInString table (LookupName varchar(100) primary key);
    insert into @LookupsInString
    select distinct '##' + v + '##'
    from ( select [v] = r.n.value('(./text())[1]', 'varchar(100)'),
                  [r] = row_number() over (order by (select null)) -- nodes() returns the pieces in document order
           from @xml.nodes('r/i') r(n)
         ) d(v, r)
    where r % 2 = 0; -- token names sit at the even positions between '##' markers
    --tokenize the input
    select @input = replace(@input, l.LookupName, l.LookupValue)
    from dbo.[Lookup] l
    join @LookupsInString lis on
        l.LookupName = lis.LookupName;
    return @input;
end
go
--usage
declare @Notes table ([Id] int primary key, notes varchar(100));
insert into @Notes
select 1, 'This is some notes ##placeholder130## this ##myPlaceholder##, ##oneMore##. End.' union all
select 2, 'Second row...just a ##test##.';
select *,
       dbo.fn_ReplaceLookups(notes) as Tokenized
from @Notes;
Returns:
Tokenized
--------------------------------------------------------
This is some notes Dog this Cat, Cow. End.
Second row...just a Horse.
Try this
;WITH CTE (org, calc, [Notes], [level]) AS
(
SELECT [Notes], [Notes], CONVERT(varchar(MAX),[Notes]), 0 FROM PlaceholderTable
UNION ALL
SELECT CTE.org, CTE.[Notes],
CONVERT(varchar(MAX), REPLACE(CTE.[Notes],'##' + T.[Name] + '##', T.[Value])), CTE.[level] + 1
FROM CTE
INNER JOIN LookupTable T ON CTE.[Notes] LIKE '%##' + T.[Name] + '##%'
)
SELECT DISTINCT org, [Notes], level FROM CTE
WHERE [level] = (SELECT MAX(level) FROM CTE c WHERE CTE.org = c.org)
SQL FIDDLE DEMO
Check the below devioblog post for reference
devioblog post
To get speed, you can preprocess the note templates into a more efficient form. This will be a sequence of fragments, with each ending in a substitution. The substitution might be NULL for the last fragment.
Notes
Id FragSeq Text SubsId
1 1 'This is some notes ' 1
1 2 ' this ' 2
1 3 ', ' 3
1 4 '. End.' null
2 1 'Second row...just a ' 4
2 2 '.' null
Subs
Id Name Value
1 'placeholder130' 'Dog'
2 'myPlaceholder' 'Cat'
3 'oneMore' 'Cow'
4 'test' 'Horse'
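In T-SQL the two tables might be declared like this (a sketch; the answer only gives the data layout):
CREATE TABLE Subs
(
    Id INT NOT NULL PRIMARY KEY,
    Name VARCHAR(100) NOT NULL,
    Value VARCHAR(100) NOT NULL
);
CREATE TABLE Notes
(
    Id INT NOT NULL,
    FragSeq INT NOT NULL,
    Text VARCHAR(MAX) NOT NULL,
    SubsId INT NULL REFERENCES Subs(Id),  -- NULL on the trailing fragment
    PRIMARY KEY (Id, FragSeq)
);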
Now we can do the substitutions with a simple join.
SELECT Notes.Text + COALESCE(Subs.Value, '')
FROM Notes LEFT JOIN Subs
    ON Notes.SubsId = Subs.Id
WHERE Notes.Id = ?
ORDER BY FragSeq
This produces a list of fragments with substitutions complete. I am not an MSSQL user, but in most dialects of SQL you can concatenate these fragments in a variable quite easily:
DECLARE @Note VARCHAR(8000)
SELECT @Note = COALESCE(@Note, '') + Notes.Text + COALESCE(Subs.Value, '')
FROM Notes LEFT JOIN Subs
    ON Notes.SubsId = Subs.Id
WHERE Notes.Id = ?
ORDER BY FragSeq
Pre-processing a note template into fragments will be straightforward using the string splitting techniques of other posts.
Unfortunately I'm not at a location where I can test this, but it ought to work fine.
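One way to build the fragments, sketched with the DelimitedSplit8K function from the other answers and a hypothetical NoteTemplates(Id, Template) source table; it assumes '|' never occurs in the note text:
-- collapse each '##' marker to a single sentinel and split on it:
-- odd-numbered pieces are literal text, even-numbered pieces are token names
;WITH Pieces AS
(
    SELECT t.Id, s.ItemNumber, s.Item
    FROM NoteTemplates t
    CROSS APPLY dbo.DelimitedSplit8K(REPLACE(t.Template, '##', '|'), '|') s
)
INSERT INTO Notes (Id, FragSeq, Text, SubsId)
SELECT p.Id,
       (p.ItemNumber + 1) / 2,      -- fragment sequence
       p.Item,                      -- the literal text piece
       sub.Id                       -- the token that follows it, NULL for the last piece
FROM Pieces p
LEFT JOIN Pieces tok ON tok.Id = p.Id AND tok.ItemNumber = p.ItemNumber + 1
LEFT JOIN Subs sub ON sub.Name = tok.Item
WHERE p.ItemNumber % 2 = 1;         -- keep only the literal text pieces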
I really don't know how it will perform with 10k+ lookups.
How does good old dynamic SQL perform?
DECLARE @sqlCommand NVARCHAR(MAX)
SELECT @sqlCommand = N'PlaceholderTable.[Notes]'

SELECT @sqlCommand = 'REPLACE( ' + @sqlCommand +
    ', ''##' + LookupTable.[Name] + '##'', ''' +
    LookupTable.[Value] + ''')'
FROM LookupTable

SELECT @sqlCommand = 'SELECT *, ' + @sqlCommand + ' FROM PlaceholderTable'
EXECUTE sp_executesql @sqlCommand
Fiddle demo
And now for some recursive CTE.
If your indexes are correctly set up, this one should be very fast or very slow. SQL Server always surprises me with performance extremes when it comes to the r-CTE...
;WITH T AS (
SELECT
Row,
StartIdx = 1, -- 1 as first starting index
EndIdx = CAST(patindex('%##%', Notes) as int), -- first ending index
Result = substring(Notes, 1, patindex('%##%', Notes) - 1)
-- (first) temp result bounded by indexes
FROM PlaceholderTable -- **this is your source table**
UNION ALL
SELECT
pt.Row,
StartIdx = newstartidx, -- starting index (calculated in calc1)
EndIdx = EndIdx + CAST(newendidx as int) + 1, -- ending index (calculated in calc4 + total offset)
Result = Result + CAST(ISNULL(newtokensub, newtoken) as nvarchar(max))
-- temp result taken from subquery or original
FROM
T
JOIN PlaceholderTable pt -- **this is your source table**
ON pt.Row = T.Row
CROSS APPLY(
SELECT newstartidx = EndIdx + 2 -- new starting index moved by 2 from last end ('##')
) calc1
CROSS APPLY(
SELECT newtxt = substring(pt.Notes, newstartidx, len(pt.Notes))
-- current piece of txt we work on
) calc2
CROSS APPLY(
SELECT patidx = patindex('%##%', newtxt) -- current index of '##'
) calc3
CROSS APPLY(
SELECT newendidx = CASE
WHEN patidx = 0 THEN len(newtxt) + 1
ELSE patidx END -- if last piece of txt, end with its length
) calc4
CROSS APPLY(
SELECT newtoken = substring(pt.Notes, newstartidx, newendidx - 1)
-- get the new token
) calc5
OUTER APPLY(
SELECT newtokensub = Value
FROM LookupTable
WHERE Name = newtoken -- substitute the token if you can find it in **your lookup table**
) calc6
WHERE newstartidx + len(newtxt) - 1 <= len(pt.Notes)
-- continue while {new starting index} + {length of txt we work on} does not exceed the total length
)
,lastProcessed AS (
SELECT
Row,
Result,
rn = row_number() over(partition by Row order by StartIdx desc)
FROM T
) -- enumerate all (including intermediate) results
SELECT *
FROM lastProcessed
WHERE rn = 1 -- filter out intermediate results (display only last ones)

TSQL - Querying a table column to pull out popular words for a tag cloud

Just an exploratory question to see if anyone has done this or if, in fact it is at all possible.
We all know what a tag cloud is, and usually a tag cloud is created by someone assigning tags. Is it possible, within the current features of SQL Server, to create one automatically, maybe via a trigger when a table has a record added or updated, by looking at the data within a certain column and extracting popular words?
It is similar to this question: How can I get the most popular words in a table via mysql?. But, that is MySQL not MSSQL.
Thanks in advance.
James
Here is a good bit on parsing delimited string into rows:
http://anyrest.wordpress.com/2010/08/13/converting-parsing-delimited-string-column-in-sql-to-rows/
http://www.sqlteam.com/article/parsing-csv-values-into-multiple-rows
http://www.sqlteam.com/forums/topic.asp?TOPIC_ID=50648
T-SQL: Opposite to string concatenation - how to split string into multiple records
If you want to parse all words, you can use the space ' ' as your delimiter; then you get a row for each word.
Next you would simply select the result set, GROUPing by the word and aggregating the COUNT.
Order your results and you're there.
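Putting those steps together, a sketch that leans on one of the split functions shown elsewhere on this page (the table and column names are hypothetical):
SELECT s.Item AS Word, COUNT(*) AS Frequency
FROM dbo.MyTaggedTable t
CROSS APPLY dbo.DelimitedSplit8K(t.TagColumn, ' ') s
WHERE s.Item <> ''                 -- double spaces produce empty pieces
GROUP BY s.Item
ORDER BY COUNT(*) DESC;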
IMO, the design approach is what makes this difficult. Just because you allow users to assign tags does not mean the tags must be stored as a single delimited list of words. You can normalize the structure into something like:
Create Table Posts ( Id ... not null primary key )
Create Table Tags( Id ... not null primary key, Name ... not null Unique )
Create Table PostTags
( PostId ... not null References Posts( Id )
, TagId ... not null References Tags( Id ) )
Now your question becomes trivial:
Select T.Id, T.Name, Count(*) As TagCount
From PostTags As PT
Join Tags As T
On T.Id = PT.TagId
Group By T.Id, T.Name
Order By Count(*) Desc
If you insist on storing tags as delimited values, then the only solution is to split the values on their delimiter by writing a custom Split function and then do your count. At the bottom is an example of a Split function. With it your query would look something like (using a comma delimiter):
Select Tag.Value, Count(*) As TagCount
From Posts As P
Cross Apply dbo.Split( P.Tags, ',' ) As Tag
Group By Tag.Value
Order By Count(*) Desc
Split Function:
Create Function [dbo].[Split]
(
    @DelimitedList nvarchar(max)
    , @Delimiter nvarchar(2) = ','
)
RETURNS TABLE
AS
RETURN
(
    With CorrectedList As
    (
        Select Case When Left(@DelimitedList, DataLength(@Delimiter)/2) <> @Delimiter Then @Delimiter Else '' End
            + @DelimitedList
            + Case When Right(@DelimitedList, DataLength(@Delimiter)/2) <> @Delimiter Then @Delimiter Else '' End
            As List
            , DataLength(@Delimiter)/2 As DelimiterLen
    )
    , Numbers As
    (
        Select TOP (Coalesce(Len(@DelimitedList),1)) Row_Number() Over ( Order By c1.object_id ) As Value
        From sys.objects As c1
        Cross Join sys.columns As c2
    )
    Select CharIndex(@Delimiter, CL.list, N.Value) + CL.DelimiterLen As Position
        , Substring (
            CL.List
            , CharIndex(@Delimiter, CL.list, N.Value) + CL.DelimiterLen
            , Case
                When CharIndex(@Delimiter, CL.list, N.Value + 1)
                    - CharIndex(@Delimiter, CL.list, N.Value)
                    - CL.DelimiterLen < 0 Then Len(CL.List)
                Else CharIndex(@Delimiter, CL.list, N.Value + 1)
                    - CharIndex(@Delimiter, CL.list, N.Value)
                    - CL.DelimiterLen
              End
          ) As Value
    From CorrectedList As CL
    Cross Join Numbers As N
    Where N.Value < Len(CL.List)
        And Substring(CL.List, N.Value, CL.DelimiterLen) = @Delimiter
)
)
Word or Tag clouds need two fields: a string and a value of how many times that word or string appeared in your collection. You can then pass the results into a tag cloud tool that will display the data as you require.
Not to take away from the previous answers, as they do answer the original challenge. However, I have a simpler solution using two functions (similar to @Thomas' answer), one of which uses regex to "clean" the words.
The two functions are:
dbo.fnStripChars(a, b) --use regex 'b' to cleanse a string 'a'
dbo.fnMakeTableFromList(a, b) --convert a single field 'a' into a tabled list, delimited by 'b'
I then apply them into a single SQL statement, using the TOP n feature to give me the top 10 words I want to pass onto PowerBI or some other graphical tool, for actually displaying a word or tag cloud.
SELECT TOP 10 b.[words], b.[total]
FROM
    (SELECT a.[words], count(*) AS [total]
     FROM
        (SELECT upper(l.item) AS [words]
         FROM dbo.MyTableWithWords AS c
         CROSS APPLY dbo.fnMakeTableFromList(dbo.fnStripChars(c.myColumnThatHasTheWords, '[^a-zA-Z ]'), ' ') AS l) AS a
     GROUP BY a.[words]) AS b
ORDER BY 2 DESC
As you can see, the regex is [^a-zA-Z ], which is to give me only alphabetical characters and spaces. The space is then used as a delimiter to the make table function to separate each word individually. I apply a count(*), to give me the number of times that word appears, hence then I have everything I need to give me the TOP 10 results.
Note that CROSS APPLY is important here so I get only data with actual "words" in each record found. Otherwise it will go through every record with or without words to extract from the column I want.
fnStripChars()
CREATE FUNCTION [dbo].[fnStripChars]
(
    @String NVARCHAR(4000),
    @MatchExpression VARCHAR(255)
)
RETURNS NVARCHAR(MAX)
AS
BEGIN
    SET @MatchExpression = '%' + @MatchExpression + '%'
    WHILE PatIndex(@MatchExpression, @String) > 0
        SET @String = Stuff(@String, PatIndex(@MatchExpression, @String), 1, '')
    RETURN @String
END
fnMakeTableFromList()
CREATE FUNCTION [dbo].[fnMakeTableFromList](
    @List VARCHAR(MAX),
    @Delimiter CHAR(1))
RETURNS TABLE
AS
RETURN (SELECT Item = CONVERT(VARCHAR, Item)
        FROM (SELECT Item = x.i.value('(./text())[1]', 'varchar(max)')
              FROM (SELECT [XML] = CONVERT(XML, '<i>' + REPLACE(@List, @Delimiter, '</i><i>') + '</i>').query('.')) AS a
              CROSS APPLY [XML].nodes('i') AS x(i)) AS y
        WHERE Item IS NOT NULL);
I've tested this with over 400K records and it's able to come back with my results in under 60 seconds. I think that's reasonable.