T-SQL query: find most frequent pair of values - sql

I have a text column with numeric values separated by semicolons. I'm trying to figure out how to get the most frequent pair of values that appeared together in the same row. I've found a solution for a very similar problem in Python (Finding the most frequent occurrences of pairs in a list of lists), but I don't know how to rewrite it using SQL. In the example below it returns 2 and 3 because this pair appeared 3 times in the input set:
Input rows Output
---------- -------
';1;2;3;5' | '2;3'
';2;3' | '1;2'
';3;4;5;1;2' | '1;3'
';1;5;2' | '1;5'
Original data:

You may try with the following approach. First, using OPENJSON(), get all possible combinations. When OPENJSON parses a JSON array the indexes of the elements in the JSON text are returned as keys (0-based). Then, count the most frequent pair with DENSE_RANK().
Input:
-- Sample data: one semicolon-separated list of values per row.
CREATE TABLE #Items
(
    Id         int,
    ItemValues varchar(max)
);

INSERT INTO #Items (Id, ItemValues)
VALUES (1, '1;2;3;5'),
       (2, '2;3'),
       (3, '3;4;5;1;2'),
       (4, '1;5;2');
Statement:
-- Returns the pair(s) of values that appear together in the same row most often.
--
-- Each list is rewritten as a JSON array so OPENJSON can return every element
-- together with its position ([key]). Joining the element set to itself on
-- s1.[key] < s2.[key] yields every unordered pair of positions exactly once.
;WITH combinationsCTE AS (
    SELECT
        -- Canonical form: the smaller string goes first, so 'a;b' and 'b;a'
        -- are counted as the same pair. The comparison is textual.
        CASE
            WHEN s1.[value] <= s2.[value] THEN CONCAT(s1.[value], ';', s2.[value])
            ELSE CONCAT(s2.[value], ';', s1.[value])
        END AS PairValue
    FROM #Items i
    -- STRING_ESCAPE keeps quotes/backslashes in the data from breaking the JSON text.
    CROSS APPLY (
        SELECT [key], [value]
        FROM OPENJSON('["' + REPLACE(STRING_ESCAPE(i.ItemValues, 'json'), ';', '","') + '"]')
    ) s1
    CROSS APPLY (
        SELECT [key], [value]
        FROM OPENJSON('["' + REPLACE(STRING_ESCAPE(i.ItemValues, 'json'), ';', '","') + '"]')
    ) s2
    WHERE (s1.[key] < s2.[key])
      -- A leading, trailing or doubled ';' produces empty elements; without this
      -- filter they would form pairs such as ';2' and distort the ranking.
      AND s1.[value] <> ''
      AND s2.[value] <> ''
), rankingCTE AS (
    SELECT
        PairValue,
        -- DENSE_RANK keeps every pair tied for the highest count at rank 1.
        DENSE_RANK() OVER (ORDER BY COUNT(PairValue) DESC) AS PairRank
    FROM combinationsCTE
    GROUP BY PairValue
)
SELECT PairValue
FROM rankingCTE
WHERE PairRank = 1
Output:
PairValue
1;2
1;5
2;3
2;5

First have a split function
-- Splits @String on a single-character @Delimiter.
--
-- Parameters:
--   @String    - delimited text; NULL or blank input returns no rows.
--   @Delimiter - one-character separator. A NULL delimiter returns the whole
--                string as a single row.
-- Returns: one row per token in column [items]. Tokens that are empty or
--          consist only of spaces are skipped.
CREATE FUNCTION Splitfn(@String varchar(8000), @Delimiter char(1))
RETURNS @temptable TABLE (items varchar(8000))
AS
BEGIN
    DECLARE @start int = 1;   -- position where the current token begins
    DECLARE @idx int;         -- position of the next delimiter (0 = none left)
    DECLARE @slice varchar(8000);

    IF @String IS NULL OR LEN(@String) < 1 RETURN;

    -- Scan by position instead of repeatedly chopping the string with
    -- RIGHT(@String, LEN(@String) - @idx): LEN ignores trailing spaces, so that
    -- form cut off too much and lost the last token of input such as 'a;b '.
    WHILE 1 = 1
    BEGIN
        SET @idx = CHARINDEX(@Delimiter, @String, @start);

        IF @idx > 0
            SET @slice = SUBSTRING(@String, @start, @idx - @start);
        ELSE
            -- DATALENGTH is always >= the remaining character count.
            SET @slice = SUBSTRING(@String, @start, DATALENGTH(@String));

        IF LEN(@slice) > 0
            INSERT INTO @temptable (items) VALUES (@slice);

        IF @idx > 0
            SET @start = @idx + 1;
        ELSE
            BREAK;
    END

    RETURN;
END
Second Step To get all rows in a single string
-- Concatenate every row's list into one ';'-separated string.
-- The separator is exactly ';' (no space) so the tokens produced by a later
-- split on ';' carry no leading blank.
DECLARE @val varchar(MAX);

SELECT @val = COALESCE(@val + ';' + YourColumn, YourColumn)
FROM YourTable
-- A NULL row would make both COALESCE arguments NULL and reset the accumulator.
WHERE YourColumn IS NOT NULL;
Third step,
-- Most frequent single token in the combined string.
-- Grouping uses the trimmed token so ' 2' and '2' are counted together.
SELECT TOP 1
    LTRIM(RTRIM(items)) AS items,
    COUNT(*) AS occurrences
FROM dbo.Splitfn(@val, ';')
WHERE LTRIM(RTRIM(items)) <> ''
GROUP BY LTRIM(RTRIM(items))
ORDER BY COUNT(*) DESC;

Related

Delimiter with a condition

I have a column (MarketID) in a table.
I have to derive a value out of it.
I have to check for occurrence of delimiter(.) in the second position and see if there are consecutive three numbers after the delimiter then get that value. If not check for occurrence of delimiter(.) in the fourth position and see if there are consecutive three numbers after the delimiter then get that value
else get 0.
1) In first record: '3.001.1.16', at the second position there is a delimiter(.) and consecutive 3 number exists (001), so my output would be 001..
2)In the second record '3.1.006.4.7',there is a delimiter at second position but we don't have three consecutive numbers so we check for the 4th position and there is a delimiter and consecutive three numbers exist so the output is 006 ..
3) no (.) delimiter so output=0.
-- Sample data for the MarketID parsing question.
CREATE TABLE dbo.SampleList
(
    MarketID varchar(100)
);

INSERT INTO dbo.SampleList (MarketID)
VALUES ('3.001.1.16'),
       ('3.1.006.4.7'),
       ('D16B000000:21109:4');

SELECT MarketID
FROM dbo.SampleList;
Assuming SQL Server from dbo, you could use a CASE statement:
-- Extracts the 3-digit group that follows a '.' at position 2, or failing that
-- at position 4; returns '0' when neither position qualifies.
SELECT
    MarketID,
    CASE
        WHEN SUBSTRING(MarketID, 2, 1) = '.'
             AND SUBSTRING(MarketID, 3, 3) LIKE '[0-9][0-9][0-9]'
            THEN SUBSTRING(MarketID, 3, 3)
        WHEN SUBSTRING(MarketID, 4, 1) = '.'
             AND SUBSTRING(MarketID, 5, 3) LIKE '[0-9][0-9][0-9]'
            THEN SUBSTRING(MarketID, 5, 3)
        ELSE '0'
    END AS Result
FROM dbo.SampleList;
-- The LIKE '[0-9][0-9][0-9]' test verifies that the 3 characters are digits.
-- TRY_CONVERT(int, ...) alone is too lenient: it also accepts '', '1', ' 7' and '-12'.
Here's a solution using a function I've created a few years ago.
It allows you to split a string and get a table as a result.
-- Splits @List on @Separator and returns the tokens as integers.
--
-- Parameters:
--   @List      - delimited text; NULL returns no rows.
--   @Separator - separator text (may be longer than one character); NULL or
--                empty returns no rows.
-- Returns: one row per non-empty token in column [ID]. Each token is implicitly
--          converted to INT, so a non-numeric token raises a conversion error
--          and leading zeros are not preserved.
CREATE FUNCTION [dbo].[splitStringToTable]
(
    @List VARCHAR(MAX),
    @Separator VARCHAR(MAX)
)
RETURNS @Results TABLE
(
    ID INT
)
AS
BEGIN
    -- DATALENGTH, not LEN: LEN(' ') is 0, which would stop the scan from advancing.
    DECLARE @SepLen INT = DATALENGTH(@Separator);
    DECLARE @POS INT;
    DECLARE @TEMP VARCHAR(MAX);

    IF @List IS NULL OR @SepLen IS NULL OR @SepLen = 0 RETURN;

    -- Terminate the list with the actual separator (not a hard-coded ',') so the
    -- final token is also followed by a separator and gets emitted.
    SET @List = @List + @Separator;

    SET @POS = CHARINDEX(@Separator, @List);
    WHILE @POS > 0
    BEGIN
        SET @TEMP = LEFT(@List, @POS - 1);
        IF @TEMP <> ''
            INSERT INTO @Results (ID) VALUES (@TEMP);

        -- Drop the consumed token and its separator.
        SET @List = SUBSTRING(@List, @POS + @SepLen, DATALENGTH(@List));
        SET @POS = CHARINDEX(@Separator, @List);
    END
    RETURN;
END
GO
Usage:
-- For each row, pick a token of length 3 from the '.'-split MarketID, or 0 when
-- there is none. TOP 1 has no ORDER BY, so which qualifying token is returned
-- is not guaranteed when several exist.
SELECT
    sl.*,
    ISNULL(
        (
            SELECT TOP 1 parts.ID
            FROM dbo.[splitStringToStringTable](sl.MarketID, '.') AS parts
            WHERE LEN(parts.ID) = 3
        ),
        0
    ) AS Result
FROM SampleList AS sl
-- Purely positional variant: a 3-character group is recognised by the dots that
-- surround it (positions 2 and 6, or 4 and 8 when position 2 is also a dot).
-- Every other shape yields '0', including shapes that previously fell through
-- to NULL, to match the stated requirement "else get 0".
SELECT
    MarketID,
    CASE
        WHEN SUBSTRING(MarketID, 2, 1) = '.'
             AND SUBSTRING(MarketID, 6, 1) = '.'
            THEN SUBSTRING(MarketID, 3, 3)
        WHEN SUBSTRING(MarketID, 2, 1) = '.'
             AND SUBSTRING(MarketID, 4, 1) = '.'
             AND SUBSTRING(MarketID, 8, 1) = '.'
            THEN SUBSTRING(MarketID, 5, 3)
        ELSE '0'
    END AS [Output]
FROM dbo.SampleList

Reverse Concat - Split function

I have a table and it has a value column that lists data as:
Row 1: '00','01','02','03'
Row 2: '03','02','09','08'
I have a couple of split functions
-- Splits @String on a single-character @Delimiter.
--
-- Parameters:
--   @String    - delimited text; NULL or blank input returns no rows.
--   @Delimiter - one-character separator. A NULL delimiter returns the whole
--                string as a single row.
-- Returns: one row per token in column [Item]. Tokens that are empty or
--          consist only of spaces are skipped.
CREATE FUNCTION [dbo].[udf_Split](@String varchar(MAX), @Delimiter char(1))
RETURNS @temptable TABLE (Item varchar(MAX))
AS
BEGIN
    DECLARE @start int = 1;       -- position where the current token begins
    DECLARE @idx int;             -- position of the next delimiter (0 = none left)
    DECLARE @slice varchar(MAX);  -- MAX so long tokens are not cut at 8000 characters

    IF @String IS NULL OR LEN(@String) < 1 RETURN;

    -- Position-based scan: chopping with RIGHT(@String, LEN(@String) - @idx)
    -- loses the last token when the input ends in spaces, because LEN ignores
    -- trailing spaces.
    WHILE 1 = 1
    BEGIN
        SET @idx = CHARINDEX(@Delimiter, @String, @start);

        IF @idx > 0
            SET @slice = SUBSTRING(@String, @start, @idx - @start);
        ELSE
            SET @slice = SUBSTRING(@String, @start, DATALENGTH(@String));

        IF LEN(@slice) > 0
            INSERT INTO @temptable (Item) VALUES (@slice);

        IF @idx > 0
            SET @start = @idx + 1;
        ELSE
            BREAK;
    END

    RETURN;
END;
I'm trying to create a view of the table, with that column and then I'd like my view results to be a list of rows that have each value broken to its own row (and distinct) So would look like: (the tics can stay or go, don't care about them right now)
Row 1: 00
Row 2: 01
Row 3: 02
Row 4: 03
My view is pretty much a:
-- Asker's non-working attempt: it selects the original delimited VALUE column;
-- the rows produced by the split (aliased Item) are never projected.
SELECT DISTINCT VALUE FROM TABLE
cross apply dbo.split(Value, ',') as Item
But it's not working. Can someone lend me some direction on how I should work this?
This is because you're SELECTing the field VALUE instead of Item.Item. You should do this:
-- Project the split function's output column rather than the delimited source column.
SELECT DISTINCT
    parts.Item
FROM TABLE
CROSS APPLY dbo.split(Value, ',') AS parts
Additionally, your dbo.split function is not optimal. There are number of ways to split a string in a set-based fashion, instead of RBAR. Here is one way using XML:
-- Set-based string splitter built on the XML type.
--
-- Parameters:
--   @List      - delimited text.
--   @Delimiter - separator text.
-- Returns: one row per element in column [Item] (nvarchar(4000)); an empty
--          element yields NULL because it has no text() node.
-- NOTE(review): @List is embedded in XML markup without entity escaping, so
-- input containing '&' or '<' will presumably fail the CONVERT(XML, ...) step -
-- confirm the data is free of XML special characters before relying on this.
CREATE FUNCTION dbo.SplitStrings_XML
(
    @List      NVARCHAR(MAX),
    @Delimiter NVARCHAR(255)
)
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN
(
    SELECT Item = y.i.value('(./text())[1]', 'nvarchar(4000)')
    FROM
    (
        -- Wrap every element in <i>...</i> by turning each delimiter into a
        -- closing tag followed by an opening tag.
        SELECT x = CONVERT(XML, '<i>'
            + REPLACE(@List, @Delimiter, '</i><i>')
            + '</i>').query('.')
    ) AS a
    CROSS APPLY x.nodes('i') AS y(i)
);
GO
Sample usage:
-- Demo: split two quoted, comma-separated lists and list the distinct elements.
;WITH CteData (Value) AS
(
    SELECT '''00'',''01'',''02'',''03'''
    UNION ALL
    SELECT '''03'',''02'',''09'',''08'''
)
SELECT DISTINCT
    parts.Item
FROM CteData AS src
CROSS APPLY dbo.SplitStrings_XML(src.Value, ',') AS parts
Result:
Item
--------
'00'
'01'
'02'
'03'
'08'
'09'
For other string splitters, you can read this article by Aaron Bertrand.

Table from comma separated Column at two level

I have one old db in which there are two columns which contain comma separated values like this,
SQL FIDDLE LINK for SCHEMA
Now my problem is that I am trying to import those values into another database which is normalized. So instead of comma separated values, I need to convert those values into a tabular format .
So my output should be look like this,
You need to define what those columns mean. In your example are you discarding the original ID column, in which case what does "1,2,3" & "A,B" actually mean?
I'd probably approach this by cursoring through each row and using a split function to convert each field to a table of values.
CREATE FUNCTION dbo.fn_Split1 (@sep nchar(1), @s nvarchar(4000))
RETURNS table
/**************************************************************************************************
* Author: http://stackoverflow.com/questions/314824/
* Description: splits a string into a table of values, with single-char delimiter.
*              Returns one row per piece: pn = 1-based piece number, s = piece text.
*              Empty pieces (adjacent delimiters) are returned as empty strings.
* Note: the pieces are produced by a recursive CTE, one recursion level per delimiter.
*       NOTE(review): an inline function cannot carry OPTION (MAXRECURSION), so the
*       default limit of 100 levels presumably applies unless the calling statement
*       raises it - confirm for inputs with many delimiters.
* Example Usage:
select * from dbo.fn_split1(',', '1,2,5,2,,dggsfdsg,456,df,1,2,5,2,,dggsfdsg,456,df,1,2,5,2,,')
**************************************************************************************************/
AS
RETURN (
    WITH Pieces(pn, start, stop) AS (
        -- First piece starts at 1 and ends at the first delimiter (0 if none).
        SELECT 1, 1, CHARINDEX(@sep, @s)
        UNION ALL
        -- Each further piece starts right after the previous delimiter.
        SELECT pn + 1, stop + 1, CHARINDEX(@sep, @s, stop + 1)
        FROM Pieces
        WHERE stop > 0
    )
    SELECT pn,
        -- stop = 0 marks the last piece: take everything that is left.
        SUBSTRING(@s, start, CASE WHEN stop > 0 THEN stop-start ELSE 4000 END) AS s
    FROM Pieces
)
GO
-- Sample data: both RegionId and Zone hold comma-separated lists.
CREATE TABLE #RegionDetail
(
    Id       int IDENTITY(1,1) NOT NULL,
    RegionId nvarchar(50),
    Zone     nvarchar(50)
);

INSERT INTO #RegionDetail (RegionId, Zone)
VALUES ('1,2,3', 'A,B');

INSERT INTO #RegionDetail (RegionId, Zone)
VALUES ('1,2,3', 'X,Y');

INSERT INTO #RegionDetail (RegionId, Zone)
VALUES ('4,3,5', 'A,B');
GO
-- Splits @String on a single-character @Delimiter.
--
-- Parameters:
--   @String    - delimited text; NULL or blank input returns no rows.
--   @Delimiter - one-character separator. A NULL delimiter returns the whole
--                string as a single row.
-- Returns: one row per token in column [items]. Tokens that are empty or
--          consist only of spaces are skipped.
CREATE FUNCTION [dbo].[Split](@String varchar(MAX), @Delimiter char(1))
RETURNS @temptable TABLE (items varchar(MAX))
AS
BEGIN
    DECLARE @start int = 1;       -- position where the current token begins
    DECLARE @idx int;             -- position of the next delimiter (0 = none left)
    DECLARE @slice varchar(MAX);  -- MAX so long tokens are not cut at 8000 characters

    IF @String IS NULL OR LEN(@String) < 1 RETURN;

    -- Position-based scan: chopping with RIGHT(@String, LEN(@String) - @idx)
    -- loses the last token when the input ends in spaces, because LEN ignores
    -- trailing spaces.
    WHILE 1 = 1
    BEGIN
        SET @idx = CHARINDEX(@Delimiter, @String, @start);

        IF @idx > 0
            SET @slice = SUBSTRING(@String, @start, @idx - @start);
        ELSE
            SET @slice = SUBSTRING(@String, @start, DATALENGTH(@String));

        IF LEN(@slice) > 0
            INSERT INTO @temptable (items) VALUES (@slice);

        IF @idx > 0
            SET @start = @idx + 1;
        ELSE
            BREAK;
    END

    RETURN;
END;
GO
-- Rows as stored.
SELECT
    Id,
    RegionId,
    Zone
FROM #RegionDetail

-- One output row per (RegionId token, Zone token) combination of each source row.
SELECT
    src.Id,
    regionPart.items AS RegionId,
    zonePart.items   AS Zone
FROM #RegionDetail AS src
CROSS APPLY [dbo].[Split](src.RegionId, ',') AS regionPart
CROSS APPLY [dbo].[Split](src.Zone, ',') AS zonePart
ORDER BY Id, RegionId, Zone

Compare two list items

I am trying to compare a database field which stores list items (comma separated) with unfortunately a variable which is also a list item.
Example:
In this case, a user can belong to multiple groups, and content access is also allocated to multiple groups.
contentid | group
(1) (c,d)
(2) (a,c)
(3) (b)
So, I need to select all content where user is in group (a,c). In this case, contentid 1,2 should be returned.
Here's a safe but slow solution for SQL 2008
BEGIN
    -- setup: a table variable standing in for the content table
    DECLARE @tbl TABLE (
        [contentid] INT
        ,[group] VARCHAR(MAX)
    )
    INSERT INTO @tbl VALUES
        (1, 'c,d')
        ,(2, 'a,c')
        ,(3, 'd')

    -- send your request as simple xml: one <g> element per requested group
    DECLARE @param XML
    SET @param = '<g>a</g><g>c</g>'

    -- query: a row matches when any requested group appears as a whole
    -- comma-delimited element of [group]. Wrapping both sides in commas stops
    -- 'GroupA' from matching inside 'GroupAB'.
    SELECT DISTINCT contentid
    FROM @tbl t
    INNER JOIN @param.nodes('/g') AS t2(g)
        ON ',' + t.[group] + ',' LIKE '%,' + t2.g.value('.', 'varchar(max)') + ',%'
END
You just pass your query in as an XML snippet instead of a comma separated list.
If your group names are single characters or you can be sure the names are not character-subsets of each other (ie: GroupA, GroupAB), then the query can be optimized to.
ON t.[group] LIKE '%' + t2.g.value('.', 'varchar(max)') + '%'
If you're using an RDBMS without XML parsing capability, you'll have to split your query string into a temp table and work with it that way.
You really should not be using comma separated values inside your columns. It would be much better if the [group] column only contained one value and you had repeated entries with a UNIQUE constraint on the composite (contentid, group).
You might find this question and answer useful : How do I split a string so I can access item x?
Or you could always use something like this :
-- Splits @string on a single-character @delimiter.
--
-- Parameters:
--   @string    - delimited text.
--   @delimiter - one-character separator; NULL defaults to ','.
-- Returns: one row per item in column [item]. Empty items are kept, so
--          'a,,c' yields 'a', '', 'c' and 'a,' yields 'a', ''. A NULL @string
--          yields a single NULL row.
CREATE FUNCTION SplitString(
    @string varchar(max),
    @delimiter char(1)
)
RETURNS @items TABLE (item varchar(max))
AS
BEGIN
    DECLARE @start int = 1;  -- first character of the current item
    DECLARE @next int;       -- position of the next delimiter (0 = none left)

    IF (@delimiter IS NULL) SET @delimiter = ',';

    -- CHARINDEX finds every delimiter, including one in the last position. A
    -- character-by-character loop bounded by LEN(@string) missed that case and
    -- returned 'a,' as a single item.
    SET @next = CHARINDEX(@delimiter, @string, @start);
    WHILE (@next > 0)
    BEGIN
        INSERT INTO @items (item)
        SELECT SUBSTRING(@string, @start, @next - @start);

        SET @start = @next + 1;
        SET @next = CHARINDEX(@delimiter, @string, @start);
    END

    -- last item (or the only one if there were no delimiters)
    INSERT INTO @items (item)
    SELECT SUBSTRING(@string, @start, DATALENGTH(@string));

    RETURN;
END
GO
-- Sample content table: each row lists the groups allowed to see it.
DECLARE @content TABLE (contentid int, [group] varchar(max));

INSERT INTO @content (contentid, [group])
SELECT 1, 'c,d'
UNION ALL
SELECT 2, 'a,c'
UNION ALL
SELECT 3, 'b';

-- Groups the current user belongs to, split once into a table variable.
DECLARE @groups varchar(max) = 'a,c';
DECLARE @grouptable TABLE (item varchar(max));

INSERT INTO @grouptable (item)
SELECT item
FROM dbo.SplitString(@groups, ',');

-- Content is visible when at least one of its groups is one of the user's
-- groups. EXISTS stops at the first match instead of counting all matches.
SELECT c.contentid, c.[group]
FROM @content AS c
WHERE EXISTS (
    SELECT 1
    FROM @grouptable AS g1
    INNER JOIN dbo.SplitString(c.[group], ',') AS g2
        ON g1.item = g2.item
);

SQL Server - Compare Varchar values using IN

In my table, I have a varchar column whereby multi-values are stored. An example of my table:
RecNum | Title | Category
-----------------------------------------
wja-2012-000001 | abcdef | 4,6
wja-2012-000002 | qwerty | 1,3,7
wja-2012-000003 | asdffg |
wja-2012-000004 | zxcvbb | 2,7
wja-2012-000005 | ploiuh | 3,4,12
The values in the Category column points to another table.
How can I return the relevant rows if I want to retrieve the rows with value 1,3,5,6,8 in the Category column?
When I tried using IN, I get the 'Conversion failed when converting the varchar value '1,3,5,6,8' to data type int' error.
Breaking the Categories out into a separate table would be a better design if that's a change you can make... otherwise, you could create a function to split the values into a table of integers like this:
-- Splits @String on a single-character @Delimiter and returns the tokens as integers.
--
-- Parameters:
--   @String    - delimited list of integers; NULL or blank input returns no rows.
--   @Delimiter - one-character separator.
-- Returns: one row per non-blank token in column [id].
-- Raises: a conversion error when a token is not a valid int.
CREATE FUNCTION dbo.Split(@String varchar(8000), @Delimiter char(1))
RETURNS @temptable TABLE (id int)
AS
BEGIN
    DECLARE @start int = 1;   -- position where the current token begins
    DECLARE @idx int;         -- position of the next delimiter (0 = none left)
    DECLARE @slice varchar(8000);

    IF @String IS NULL OR LEN(@String) < 1 RETURN;

    -- Position-based scan: chopping with RIGHT(@String, LEN(@String) - @idx)
    -- loses the last token when the input ends in spaces, because LEN ignores
    -- trailing spaces.
    WHILE 1 = 1
    BEGIN
        SET @idx = CHARINDEX(@Delimiter, @String, @start);

        IF @idx > 0
            SET @slice = SUBSTRING(@String, @start, @idx - @start);
        ELSE
            SET @slice = SUBSTRING(@String, @start, DATALENGTH(@String));

        IF LEN(@slice) > 0
            INSERT INTO @temptable (id) VALUES (CONVERT(int, @slice));

        IF @idx > 0
            SET @start = @idx + 1;
        ELSE
            BREAK;
    END

    RETURN;
END
Then call it from your query:
-- Template: rows whose Category list contains the single id @SomeID.
SELECT ...
FROM ...
WHERE @SomeID IN (SELECT id FROM dbo.Split(Category, ','))
Or if you're looking to provide a list of categories as an input parameter (such as '1,3,5,6,8'), and return all records in your table that contain at least one of these values, you could use a query like this:
-- Template: rows whose Category list shares at least one id with the
-- comma-separated list in @SearchValues.
SELECT ...
FROM ...
WHERE
    EXISTS (
        SELECT 1
        FROM dbo.Split(Category, ',') s1
        INNER JOIN dbo.Split(@SearchValues, ',') s2 ON s1.id = s2.id
    )
you can do like this
-- WARNING: @var is concatenated straight into the statement text. That is only
-- safe when @var is built by trusted code; a value supplied by a user allows SQL
-- injection. Prefer splitting the list into a table and joining to it.
DECLARE @var varchar(30); SET @var = '2,3';
EXEC('select * from category where Category_Id in (' + @var + ')');
Try this solution:
-- Sample data.
CREATE TABLE test4 (RecNum varchar(20), Title varchar(10), Category varchar(15));

INSERT INTO test4 (RecNum, Title, Category)
VALUES ('wja-2012-000001', 'abcdef', '4,6'),
       ('wja-2012-000002', 'qwerty', '1,3,7'),
       ('wja-2012-000003', 'asdffg', NULL),
       ('wja-2012-000004', 'zxcvbb', '2,7'),
       ('wja-2012-000005', 'ploiuh', '3,4,12');

SELECT RecNum, Title, Category
FROM test4;

-- Comma-separated category ids to search for.
DECLARE @str varchar(25) = '1,3,5,6,8';

-- search_values peels one whole token off the front of the list per recursion
-- level (token = text before the first comma, rem = text after it). Every
-- value is tested, including the last one and values longer than one character.
;WITH search_values AS (
    SELECT
        CAST(LEFT(@str, CHARINDEX(',', @str + ',') - 1) AS varchar(25)) AS token,
        CAST(SUBSTRING(@str, CHARINDEX(',', @str + ',') + 1, 25) AS varchar(25)) AS rem
    UNION ALL
    SELECT
        CAST(LEFT(rem, CHARINDEX(',', rem + ',') - 1) AS varchar(25)),
        CAST(SUBSTRING(rem, CHARINDEX(',', rem + ',') + 1, 25) AS varchar(25))
    FROM search_values
    WHERE rem <> ''
)
-- A row qualifies when any search token equals a whole element of Category.
-- Wrapping both sides in commas stops '1' from matching inside '12', and EXISTS
-- returns each row once however many tokens match. A NULL Category never matches.
SELECT t.RecNum, t.Title
FROM test4 AS t
WHERE EXISTS (
    SELECT 1
    FROM search_values AS sv
    WHERE sv.token <> ''
      AND ',' + t.Category + ',' LIKE '%,' + sv.token + ',%'
);
As mentioned by others, your table design violates basic database design principles and if there is no way around it, you could normalize the table with little code (example below) and then join away with the other table. Here you go:
Data:
-- Sample data: Category holds a comma-separated list of category ids (or NULL).
CREATE TABLE data
(
    RecNum   varchar(20),
    Title    varchar(10),
    Category varchar(15)
);

INSERT INTO data (RecNum, Title, Category)
VALUES ('wja-2012-000001', 'abcdef', '4,6'),
       ('wja-2012-000002', 'qwerty', '1,3,7'),
       ('wja-2012-000003', 'asdffg', NULL),
       ('wja-2012-000004', 'zxcvbb', '2,7'),
       ('wja-2012-000005', 'ploiuh', '3,4,12');
This function takes a comma separated string and returns a table:
-- Converts a comma-separated list of integers into a table.
--
-- Parameters:
--   @list - comma-separated integers; NULL returns no rows.
-- Returns: one row per list element in column [number].
-- Raises: a conversion error when an element is not a valid int.
CREATE FUNCTION listToTable (@list nvarchar(MAX))
RETURNS @tbl TABLE (number int NOT NULL) AS
BEGIN
    DECLARE @pos      int,  -- position of the delimiter before the current element
            @nextpos  int,  -- position of the delimiter after it (0 = last element)
            @valuelen int;  -- length of the current element

    -- Without this guard a NULL list reaches CONVERT(int, NULL) and fails the
    -- NOT NULL constraint on [number].
    IF @list IS NULL RETURN;

    SELECT @pos = 0, @nextpos = 1;

    WHILE @nextpos > 0
    BEGIN
        SELECT @nextpos = CHARINDEX(',', @list, @pos + 1);

        -- The element runs up to the next delimiter, or to the end of the list.
        SELECT @valuelen = CASE WHEN @nextpos > 0
                                THEN @nextpos
                                ELSE LEN(@list) + 1
                           END - @pos - 1;

        INSERT @tbl (number)
        VALUES (CONVERT(int, SUBSTRING(@list, @pos + 1, @valuelen)));

        SELECT @pos = @nextpos;
    END
    RETURN;
END
Then, you can do something like this to "normalize" the table:
-- One output row per (data row, category id) combination; rows without a
-- Category list are excluded.
SELECT *
FROM data AS src
CROSS APPLY listToTable(src.Category) AS nums
WHERE src.Category IS NOT NULL
And then use the result of the above query to join with the "other" table. For example (i did not test this query):
-- Joins otherTable to the requested category ids and to the normalized data rows.
SELECT *
FROM otherTable AS ot
INNER JOIN listToTable('1,3,5,6,8') AS wanted
    ON ot.Category = wanted.number
INNER JOIN (
    -- data expanded to one row per category id
    SELECT *
    FROM data AS src
    CROSS APPLY listToTable(src.Category) AS nums
    WHERE src.Category IS NOT NULL
) AS normalized
    ON ot.category = normalized.number