SQL - How to find a value in a tree level data structure

SQL - How to find a value in a tree level data structure - sql

I have two SQL Server tables:
Invoice (invoice)
Invoice Relations (invoice_relation)
invoice table stores all invoice records with a transaction folio.
invoice_relation table stores any relation between invoices.
This is an example of how invoices can be related between each other:
So the goal is to find the "folio" under invoice table given an invoicenumber and a folio but the folio sometimes won't be the folio that the invoice has, so I need to do a search on all the tree relation in order to find if any invoice match the invoice number but also the folio is part of the relation.
For example I have to find the folio and match invoice number of:
Folio: 1003
Invoice Number: A1122
In my query I would need to first find by folio because it's my invoice table primary key. Then, will try to match A1122 with D1122 that won't match, so then I have to search all the tree structure to find if there is a A1122. The result will be that the invoice A1122 was found in folio 1000.
Any clue on how to do this?
Here is a script of how to create the above example tables with data:
CREATE TABLE [dbo].[invoice](
[folio] [int] NOT NULL,
[invoicenumber] [nvarchar](20) NOT NULL,
[isactive] [bit] NOT NULL,
CONSTRAINT [PK_invoice] PRIMARY KEY CLUSTERED
(
[folio] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]
GO
CREATE TABLE [dbo].[invoice_relation](
[relationid] [int] NOT NULL,
[invoice] [nvarchar](20) NOT NULL,
[parentinvoice] [nvarchar](20) NOT NULL,
CONSTRAINT [PK_invoice_relation_1] PRIMARY KEY CLUSTERED
(
[relationid] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]
GO
INSERT [dbo].[invoice] ([folio], [invoicenumber], [isactive]) VALUES (1000, N'A1122', 1)
GO
INSERT [dbo].[invoice] ([folio], [invoicenumber], [isactive]) VALUES (1001, N'B1122', 1)
GO
INSERT [dbo].[invoice] ([folio], [invoicenumber], [isactive]) VALUES (1002, N'C1122', 1)
GO
INSERT [dbo].[invoice] ([folio], [invoicenumber], [isactive]) VALUES (1003, N'D1122', 1)
GO
INSERT [dbo].[invoice] ([folio], [invoicenumber], [isactive]) VALUES (1004, N'F1122', 1)
GO
INSERT [dbo].[invoice] ([folio], [invoicenumber], [isactive]) VALUES (1005, N'G1122', 1)
GO
INSERT [dbo].[invoice_relation] ([relationid], [invoice], [parentinvoice]) VALUES (1, N'A1122', N'B1122')
GO
INSERT [dbo].[invoice_relation] ([relationid], [invoice], [parentinvoice]) VALUES (2, N'C1122', N'A1122')
GO
INSERT [dbo].[invoice_relation] ([relationid], [invoice], [parentinvoice]) VALUES (3, N'D1122', N'A1122')
GO
INSERT [dbo].[invoice_relation] ([relationid], [invoice], [parentinvoice]) VALUES (4, N'F1122', N'B1122')
GO
INSERT [dbo].[invoice_relation] ([relationid], [invoice], [parentinvoice]) VALUES (5, N'G1122', N'F1122')
GO

I am still not sure what you really want, I had written something similar to JamieD77 which is to find top parent and then walk back down tree but then you get children and granchildren that are not directly related to A1122.....
Here is a way to walk up and down the tree and return all children and parents directly related to an invoicenumber
DECLARE #InvoiceNumber NVARCHAR(20) = 'A1122'
DECLARE #Folio INT = 1003
;WITH cteFindParents AS (
SELECT
i.folio
,i.invoicenumber
,CAST(NULL AS NVARCHAR(20)) as ChildInvoiceNumber
,CAST(NULL AS NVARCHAR(20)) as ParentInvoiceNumber
,0 as Level
FROM
dbo.invoice i
WHERE
i.invoicenumber = #InvoiceNumber
UNION ALL
SELECT
i.folio
,i.invoicenumber
,c.invoicenumber as ChildInvoiceNumber
,i.invoicenumber as ParentInvoiceNumber
,c.Level - 1 as Level
FROM
cteFindParents c
INNER JOIN dbo.invoice_relation r
ON c.invoicenumber = r.invoice
INNER JOIN dbo.invoice i
ON r.parentinvoice = i.invoicenumber
)
, cteFindChildren as (
SELECT *
FROM
cteFindParents
UNION ALL
SELECT
i.folio
,i.invoicenumber
,i.invoicenumber AS ChildInvoiceNumber
,c.invoicenumber AS ParentInvoiceNumber
,Level + 1 as Level
FROM
cteFindChildren c
INNER JOIN dbo.invoice_relation r
ON c.invoicenumber = r.parentinvoice
INNER JOIN dbo.invoice i
ON r.invoice = i.invoicenumber
WHERE
c.Level = 0
)
SELECT *
FROM
cteFindChildren
But Depending on what exactly you are looking for you may actually get a couple of cousins that are not desired.....
--------------Here was a method to find top parent and get the whole tree
DECLARE #InvoiceNumber NVARCHAR(20) = 'A1122'
DECLARE #Folio INT = 1003
;WITH cteFindParents AS (
SELECT
i.folio
,i.invoicenumber
,CAST(NULL AS NVARCHAR(20)) as ChildInvoiceNumber
,0 as Level
FROM
dbo.invoice i
WHERE
i.invoicenumber = #InvoiceNumber
UNION ALL
SELECT
i.folio
,i.invoicenumber
,c.invoicenumber as ChildInvoiceNumber
,c.Level + 1 as Level
FROM
cteFindParents c
INNER JOIN dbo.invoice_relation r
ON c.invoicenumber = r.invoice
INNER JOIN dbo.invoice i
ON r.parentinvoice = i.invoicenumber
)
, cteGetTopParent AS (
SELECT *, ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY LEVEL DESC) as RowNum
FROM
cteFindParents
)
, cteGetWholeTree AS (
SELECT
p.folio
,p.invoicenumber
,p.invoicenumber as TopParent
,p.invoicenumber as Parent
,CAST(p.invoicenumber AS NVARCHAR(1000)) as Hierarchy
,0 as Level
FROM
cteGetTopParent p
WHERE
RowNum = 1
UNION ALL
SELECT
i.folio
,i.invoicenumber
,c.TopParent
,c.invoicenumber AS Parent
,CAST(c.TopParent + '|' + (CASE WHEN Level > 0 THEN c.invoicenumber + '|' ELSE '' END) + i.invoicenumber AS NVARCHAR(1000)) as Hierarchy
,Level + 1 as Level
FROM
cteGetWholeTree c
INNER JOIN dbo.invoice_relation r
ON c.invoicenumber = r.parentinvoice
INNER JOIN dbo.invoice i
ON r.invoice = i.invoicenumber
)
SELECT *
FROM
cteGetWholeTree

Your model is broken to begin with. parentinvoice should be in the invoice table. It's a recursive database model....so make the table schema recursive. Have a nullable foreign key in a column that references it's own table. Any time that field (the parent invoice field) is null indicates that it is the primary invoice. Any row having a parent is a piece of invoice.
When you want to find a value in a tree level structure you wrap your initial sql query into a 'SELECT(.....)' statement (creating your own custom selectable table) that filters out what you want. Let me know if you have any questions!

I was a little unclear as to your actual requirements, so I figured a Table Valued Function may be appropriate here. I added a few optional items, and if unwanted, they are easy enough to remove (i.e. TITLE, Nesting, TopInvoice, TopFolio). Also, you may notice the Range Keys (R1/R2). These serve many functions: Presentation Sequence, Selection Criteria, Parent/Leaf Indicators, and perhaps most importantly Non-Recursive Aggregation.
To Return the Entire Hierarchy
Select * from [dbo].[udf_SomeFunction](NULL,NULL)
To Return an Invoice and ALL of its descendants
Select * from [dbo].[udf_SomeFunction]('A1122',NULL)
To Return the PATH of a Folio
Select * from [dbo].[udf_SomeFunction](NULL,'1003')
To Return a Folio Limited to an Invoice
Select * from [dbo].[udf_SomeFunction]('A1122','1003')
The following code requires SQL 2012+
CREATE FUNCTION [dbo].[udf_SomeFunction](#Invoice nvarchar(25),#Folio nvarchar(25))
Returns Table
As
Return (
with cteBld as (
Select Seq = cast(1000+Row_Number() over (Order By Invoice) as nvarchar(500)),I.Invoice,I.ParentInvoice,Lvl=1,Title = I.Invoice,F.Folio
From (
Select Distinct
Invoice=ParentInvoice
,ParentInvoice=cast(NULL as nvarchar(20))
From [Invoice_Relation]
Where #Invoice is NULL and ParentInvoice Not In (Select Invoice from [Invoice_Relation])
Union All
Select Invoice
,ParentInvoice
From [Invoice_Relation]
Where Invoice=#Invoice
) I
Join Invoice F on I.Invoice=F.InvoiceNumber
Union All
Select Seq = cast(concat(A.Seq,'.',1000+Row_Number() over (Order by I.Invoice)) as nvarchar(500))
,I.Invoice
,I.ParentInvoice
,A.Lvl+1
,I.Invoice,F.folio
From [Invoice_Relation] I
Join cteBld A on I.ParentInvoice = A.Invoice
Join Invoice F on I.Invoice=F.InvoiceNumber )
,cteR1 as (Select Seq,Invoice,Folio,R1=Row_Number() over (Order By Seq) From cteBld)
,cteR2 as (Select A.Seq,A.Invoice,R2=Max(B.R1) From cteR1 A Join cteR1 B on (B.Seq like A.Seq+'%') Group By A.Seq,A.Invoice )
Select Top 100 Percent
B.R1
,C.R2
,A.Invoice
,A.ParentInvoice
,A.Lvl
,Title = Replicate('|-----',A.Lvl-1)+A.Title -- Optional: Added for Readability
,A.Folio
,TopInvoice = First_Value(A.Invoice) over (Order By R1)
,TopFolio = First_Value(A.Folio) over (Order By R1)
From cteBld A
Join cteR1 B on A.Invoice=B.Invoice
Join cteR2 C on A.Invoice=C.Invoice
Where (#Folio is NULL)
or (#Folio is Not NULL and (Select R1 from cteR1 Where Folio=#Folio) between R1 and R2)
Order By R1
)
Final Thoughts:
This certainly may be more than what you were looking, and there is good chance that I COMPLETELY misunderstood your requirements. That said, being a TVF you can expand with additional WHERE and/or ORDER clauses or even incorporate into a CROSS APPLY.

This uses an approach of using a hierarchyid, first generating hierarchyid for every row, then selecting the row where folio is 1003, then finding all ancestors who have an invoicenumber of 'A1122'. It's not very efficient but may give you some different ideas:
;WITH
Allfolios
AS
(
Select i.folio, i.InvoiceNumber,
hierarchyid::Parse('/' +
CAST(ROW_NUMBER()
OVER (ORDER BY InvoiceNumber) AS VARCHAR(30)
) + '/') AS hierarchy, 1 as level
from invoice i
WHERE NOT EXISTS
(SELECT * FROM invoice_relation ir WHERE ir.invoice = i. invoicenumber)
UNION ALL
SELECT i.folio, i.invoiceNumber,
hierarchyid::Parse(CAST(a.hierarchy as VARCHAR(30)) +
CAST(ROW_NUMBER()
OVER (ORDER BY a.InvoiceNumber)
AS VARCHAR(30)) + '/') AS hierarchy,
level + 1
FROM Allfolios A
INNER JOIN invoice_relation ir
on a.InvoiceNumber = ir.ParentInvoice
INNER JOIN invoice i
on ir.Invoice = i.invoicenumber
),
Ancestors
AS
(
SELECT folio, invoiceNumber, hierarchy, hierarchy.GetAncestor(1) as AncestorId
from Allfolios
WHERE folio = 1003
UNION ALL
SELECT af.folio, af.invoiceNumber, af.hierarchy, af.hierarchy.GetAncestor(1)
FROM Allfolios AF
INNER JOIN
Ancestors a ON Af.hierarchy= a.AncestorId
)
SELECT *
FROM Ancestors
WHERE InvoiceNumber = 'A1122'
Edited for the case highlighted by #jj32 where you wish to find the root element in the hierarchy which folio 1003 is in, then find any descendant of that root which has an invoice number of 'A1122'. See below:
;WITH
Allfolios -- Convert all rows to a hierarchy
AS
(
Select i.folio, i.InvoiceNumber,
hierarchyid::Parse('/' +
CAST(ROW_NUMBER()
OVER (ORDER BY InvoiceNumber) AS VARCHAR(30)
) + '/') AS hierarchy, 1 as level
from invoice i
WHERE NOT EXISTS
(SELECT * FROM invoice_relation ir WHERE ir.invoice = i. invoicenumber)
UNION ALL
SELECT i.folio, i.invoiceNumber,
hierarchyid::Parse(CAST(a.hierarchy as VARCHAR(30)) +
CAST(ROW_NUMBER()
OVER (ORDER BY a.InvoiceNumber)
AS VARCHAR(30)) + '/') AS hierarchy,
level + 1
FROM Allfolios A
INNER JOIN invoice_relation ir
on a.InvoiceNumber = ir.ParentInvoice
INNER JOIN invoice i
on ir.Invoice = i.invoicenumber
),
Root -- Find Root
AS
(
SELECT *
FROM AllFolios AF
WHERE Level = 1 AND
(SELECT hierarchy.IsDescendantOf(AF.hierarchy) from AllFolios AF2 WHERE folio = 1003) = 1
)
-- Find all descendants of the root element which have an invoicenumber = 'A1122'
SELECT *
FROM ALLFolios
WHERE hierarchy.IsDescendantOf((SELECT TOP 1 hierarchy FROM Root)) = 1 AND
invoicenumber = 'A1122'

this was tricky since you have a separate relation table and the root invoice is not in it.
DECLARE #folio INT = 1003,
#invoice NVARCHAR(20) = 'A1122'
-- find highest level of relationship
;WITH cte AS (
SELECT i.folio,
i.invoicenumber,
ir.parentinvoice,
0 AS [level]
FROM invoice i
LEFT JOIN invoice_relation ir ON ir.invoice = i.invoicenumber
WHERE i.folio = #folio
UNION ALL
SELECT i.folio,
i.invoicenumber,
ir.parentinvoice,
[level] + 1
FROM invoice i
JOIN invoice_relation ir ON ir.invoice = i.invoicenumber
JOIN cte r ON r.parentinvoice = i.invoicenumber
),
-- make sure you get the root folio
rootCte AS (
SELECT COALESCE(oa.folio, c.folio) AS rootFolio
FROM (SELECT *,
ROW_NUMBER() OVER (ORDER BY [level] DESC) Rn
FROM cte ) c
OUTER APPLY (SELECT folio FROM invoice i WHERE i.invoicenumber = c.parentinvoice) oa
WHERE c.Rn = 1
),
-- get all children of root folio
fullTree AS (
SELECT i.folio,
i.invoicenumber
FROM rootCte r
JOIN invoice i ON r.rootFolio = i.folio
UNION ALL
SELECT i.folio,
i.invoicenumber
FROM fullTree ft
JOIN invoice_relation ir ON ir.parentinvoice = ft.invoicenumber
JOIN invoice i ON ir.invoice = i.invoicenumber
)
-- search for invoice
SELECT *
FROM fullTree
WHERE invoicenumber = #invoice

Here's an attempt that flattens out the relations first so you can travel in any direction. Then it does the recursive CTE to work through the levels:
WITH invoicerelation AS
(
select relationid, invoice, parentinvoice AS relatedinvoice
from invoice_relation
union
select relationid, parentinvoice AS invoice, invoice AS relatedinvoice
from invoice_relation
),
cteLevels AS
(
select 0 AS relationid, invoice.folio,
invoicenumber AS invoice, invoicenumber AS relatedinvoice,
0 AS Level
from invoice
UNION ALL
select invoicerelation.relationid, invoice.folio,
invoicerelation.invoice, cteLevels.relatedinvoice,
Level + 1 AS Level
from invoice INNER JOIN
invoicerelation ON invoice.invoicenumber = invoicerelation.invoice INNER JOIN
cteLevels ON invoicerelation.relatedinvoice = cteLevels.invoice
and (ctelevels.relationid <> invoicerelation.relationid)
)
SELECT cteLevels.folio, relatedinvoice, invoice.folio AS invoicefolio, cteLevels.level
from cteLevels INNER JOIN
invoice ON cteLevels.relatedinvoice = invoice.invoicenumber
WHERE cteLevels.folio = 1003 AND cteLevels.relatedinvoice = 'a1122'
I agree with SwampDev's comment that the parentinvoice should really be in the invoice table. This could also be done without a recursive CTE if you know the maximum number of levels of separation between invoices.

Related

MS SQL recursively get child id with parent ids

I have created a search SQL query, I'm almost finished I only need to match the WHERE clause with an id and it's parent ids. Currently I only match with an id and not it's parents. I'm not sure how to solve this.
This is my query, the code where I need the change is where the comment "This part needs to match the tileid and it's parent tileids" is. Based on the [TileId] I need to get the [TileId] and it's parents.
WITH [TileSearch_CTE] ([TileId], [TypeId], [TypeName], [Title], [Info]) AS
(
SELECT TOP 10 [TileId], [TypeId], [TypeName], [Title], [Info]
FROM [Priox].[TileFullTextSearchNL]
INNER JOIN CONTAINSTABLE([Priox].[TileFullTextSearchNL], *, '"searchText*"') AS [CT]
ON [Priox].[TileFullTextSearchNL].[TileId] = [CT].[Key]
WHERE
(
NOT EXISTS
(
SELECT [intPortalTileFilterId]
FROM [Priox].[tblPortalTileFilter]
WHERE [intPortalTileIdFk] = [TileId] -- This part needs to match the tileid and it's parent tileids
)
OR EXISTS
(
SELECT [intPortalTileFilterId]
FROM [Priox].[tblPortalTileFilter]
WHERE [intPortalTileIdFk] = [TileId] -- This part needs to match the tileid and it's parent tileids
AND [intPortalFilterIdFk] IN (56)
)
)
ORDER BY [CT].[Rank] DESC
)
SELECT [TileId], [TypeName], [Title], [Info], [TP].[strName] AS [ParamName], [TPV].[strParamValue] AS [ParamValue]
FROM [TileSearch_CTE]
LEFT JOIN [Priox].[tblPortalTileParam] AS [TP] ON [TP].[intTileType] = [TileSearch_CTE].[TypeId]
LEFT JOIN [Priox].[tblPortalTileParamValue] AS [TPV] ON [TPV].[intTileParamIdFk] = [TP].[intPortalTileParamId] AND [TileSearch_CTE].[TileId] = [TPV].[intTileIdFk]
This is the database structure of the tiles.
So the query should do something like this.
SELECT [intPortalTileFilterId]
FROM [Priox].[tblPortalTileFilter]
WHERE [intPortalTileIdFk] IN (438,1317)
I fixed it myself, created a function that returns the parents and the child.
CREATE FUNCTION [Priox].[GetTileIdHierarchy]
(
#tileId int
)
RETURNS TABLE
AS
RETURN
(
WITH parents AS
(
SELECT [intPortalTileId], [intPortalTileId] AS [intParentIdFk]
FROM [Priox].[tblPortalTile] WHERE [intPortalTileId] = #tileId
UNION ALL
SELECT p.[intPortalTileId], [Priox].[tblPortalTile].[intParentIdFk]
FROM parents p
INNER JOIN [Priox].[tblPortalTile] on p.[intParentIdFk] = [Priox].[tblPortalTile].[intPortalTileId]
AND [Priox].[tblPortalTile].[intParentIdFk] IS NOT NULL
AND [Priox].[tblPortalTile].[intPortalTileId] <> [Priox].[tblPortalTile].[intParentIdFk]
)
SELECT [intPortalTileId], [intParentIdFk]
FROM parents
WHERE [intPortalTileId] = #tileId
)
GO
And changed my search query accordingly.
WITH [TileSearch_CTE] ([TileId], [TypeId], [TypeName], [Title], [Info], [Rank]) AS
(
SELECT TOP 10 [TileId], [TypeId], [TypeName], [Title], [Info],[CT].[Rank] AS [Rank]
FROM [Priox].[TileFullTextSearchNL]
INNER JOIN CONTAINSTABLE([Priox].[TileFullTextSearchNL], *, '"searchTerm*"') AS [CT]
ON [Priox].[TileFullTextSearchNL].[TileId] = [CT].[Key]
WHERE
(
NOT EXISTS
(
SELECT [intPortalTileFilterId]
FROM [Priox].[tblPortalTileFilter]
WHERE [intPortalTileIdFk] IN
(
SELECT [intParentIdFk] FROM [Priox].[GetTileIdHierarchy] ([TileId])
)
)
OR
EXISTS
(
SELECT [intPortalTileFilterId]
FROM [Priox].[tblPortalTileFilter]
WHERE [intPortalTileIdFk] IN
(
SELECT [intParentIdFk] FROM [Priox].[GetTileIdHierarchy] ([TileId])
)
AND [intPortalFilterIdFk] IN (51)
)
)
ORDER BY [CT].[Rank] DESC
)
SELECT [TileId], [TypeName], [Title], [Info], [TP].[strName] AS [ParamName], [TPV].[strParamValue] AS [ParamValue] , [Rank]
FROM [TileSearch_CTE]
LEFT JOIN [Priox].[tblPortalTileParam] AS [TP] ON [TP].[intTileType] = [TileSearch_CTE].[TypeId]
LEFT JOIN [Priox].[tblPortalTileParamValue] AS [TPV] ON [TPV].[intTileParamIdFk] = [TP].[intPortalTileParamId] AND [TileSearch_CTE].[TileId] = [TPV].[intTileIdFk]

I edited the starting post with the solution.

Troubles isolating target cell in recursive sql query

I have a table, let's say it looks like this:
c | p
=====
|1|3|
|2|1|
|7|5|
c stands for current and p stands for parent
Given a c value of 2 I would return its top most ancestor (which has no parent) this value is 3. Since this is a self referencing table, I figured using CTE would be the best method however I am very new to using it. Nevertheless, I gave it a shot:
WITH Tree(this, parent) AS
( SELECT c ,p
FROM myTable
WHERE c = '2'
UNION ALL
SELECT M.c ,M.p
FROM myTable M
JOIN Tree T ON T.parent = M.c )
SELECT parent
FROM Tree
However this returns:
1
3
I only want 3 though. I have tried putting WHERE T.parent <> M.c but that doesn't entirely make sense. Neadless to say, I am a little confused for how to isolate the grandparent.

DECLARE #Table AS TABLE (Child INT, Parent INT)
INSERT INTO #Table VALUES (1,3),(2,1),(7,5)
;WITH cteRecursive AS (
SELECT
OriginalChild = Child
,Child
,Parent
,Level = 0
FROM
#Table
WHERE
Child = 2
UNION ALL
SELECT
c.OriginalChild
,t.Child
,t.Parent
,Level + 1
FROM
cteRecursive c
INNER JOIN #Table t
ON c.Parent = t.Child
)
SELECT TOP 1 TopAncestor = Parent
FROM
cteRecursive
ORDER BY
Level DESC
Use a recursive cte to Recuse up the tree until you cannot. Keep track of the Level of recursion, then take the last level of recursions parent and you have the top ancestor.
And just because I wrote it I will add in if you wanted to find the top ancestor of every child. The concept is still the same but you would need to introduce a row_number() to find the last level that was recursed.
DECLARE #Table AS TABLE (Child INT, Parent INT)
INSERT INTO #Table VALUES (1,3),(2,1),(7,5),(5,9)
;WITH cteRecursive AS (
SELECT
OriginalChild = Child
,Child
,Parent
,Level = 0
FROM
#Table
UNION ALL
SELECT
c.OriginalChild
,t.Child
,t.Parent
,Level + 1
FROM
cteRecursive c
INNER JOIN #Table t
ON c.Parent = t.Child
)
, cteTopAncestorRowNum AS (
SELECT
*
,TopAncestorRowNum = ROW_NUMBER() OVER (PARTITION BY OriginalChild ORDER BY Level DESC)
FROM
cteRecursive
)
SELECT
Child = OriginalChild
,TopMostAncestor = Parent
FROM
cteTopAncestorRowNum
WHERE
TopAncestorRowNum = 1

How to join three tables with distinct

I'm trying to join three tables to pull back a list of distinct blog posts with associated assets (images etc) but I keep coming up a cropper. The three tablets are tblBlog, tblAssetLink and tblAssets. The Blog tablet hold the blog, the asset table holds the assets and the Assetlink table links the two together.
tblBlog.BID is the PK in blog, tblAssets.AID is the PK in Assets.
This query works but pulls back multiple posts for the same record. I've tried to use select distinct and group by and even union but as my knowledge is pretty poor with SQL - they all error.
I'd like to also discount any assets that are marked as deleted (tblAssets.Deleted = true) but not hide the associated Blog post (if that's not marked as deleted). If anyone can help - it would be much appreciated! Thanks.
Here's my query so far....
SELECT dbo.tblBlog.BID,
dbo.tblBlog.DateAdded,
dbo.tblBlog.PMonthName,
dbo.tblBlog.PDay,
dbo.tblBlog.Header,
dbo.tblBlog.AddedBy,
dbo.tblBlog.PContent,
dbo.tblBlog.Category,
dbo.tblBlog.Deleted,
dbo.tblBlog.Intro,
dbo.tblBlog.Tags,
dbo.tblAssets.Name,
dbo.tblAssets.Description,
dbo.tblAssets.Location,
dbo.tblAssets.Deleted AS Expr1,
dbo.tblAssetLink.Priority
FROM dbo.tblBlog
LEFT OUTER JOIN dbo.tblAssetLink
ON dbo.tblBlog.BID = dbo.tblAssetLink.BID
LEFT OUTER JOIN dbo.tblAssets
ON dbo.tblAssetLink.AID = dbo.tblAssets.AID
WHERE ( dbo.tblBlog.Deleted = 'False' )
ORDER BY dbo.tblAssetLink.Priority, tblBlog.DateAdded DESC
EDIT
Changed the Where and the order by....
Expected output:
tblBlog.BID = 123
tblBlog.DateAdded = 12/04/2015
tblBlog.Header = This is a header
tblBlog.AddedBy = Persons name
tblBlog.PContent = *text*
tblBlog.Category = Category name
tblBlog.Deleted = False
tblBlog.Intro = *text*
tblBlog.Tags = Tag, Tag, Tag
tblAssets.Name = some.jpg
tblAssets.Description = Asset desc
tblAssets.Location = Location name
tblAssets.Priority = True

Use OUTER APPLY:
DECLARE #b TABLE ( BID INT )
DECLARE #a TABLE ( AID INT )
DECLARE #ba TABLE
(
BID INT ,
AID INT ,
Priority INT
)
INSERT INTO #b
VALUES ( 1 ),
( 2 )
INSERT INTO #a
VALUES ( 1 ),
( 2 ),
( 3 ),
( 4 )
INSERT INTO #ba
VALUES ( 1, 1, 1 ),
( 1, 2, 2 ),
( 2, 1, 1 ),
( 2, 2, 2 )
SELECT *
FROM #b b
OUTER APPLY ( SELECT TOP 1
a.*
FROM #ba ba
JOIN #a a ON a.AID = ba.AID
WHERE ba.BID = b.BID
ORDER BY Priority
) o
Output:
BID AID
1 1
2 1
Something like:
SELECT b.BID ,
b.DateAdded ,
b.PMonthName ,
b.PDay ,
b.Header ,
b.AddedBy ,
b.PContent ,
b.Category ,
b.Deleted ,
b.Intro ,
b.Tags ,
o.Name ,
o.Description ,
o.Location ,
o.Deleted AS Expr1 ,
o.Priority
FROM dbo.tblBlog b
OUTER APPLY ( SELECT TOP 1
a.* ,
al.Priority
FROM dbo.tblAssetLink al
JOIN dbo.tblAssets a ON al.AID = a.AID
WHERE b.BID = al.BID
ORDER BY al.Priority
) o
WHERE b.Deleted = 'False'

You cannot join three tables unless they all have the same attribute. It would work if all tables had BID, but the second join is trying to join AID. Which wont work. They all have to have BID.

Based on your comments
i would like to get is just one asset per blog post (top one ordered
by Priority)
You can change your query as following. I suggest changing the join with dbo.tblAssetLink to filtered one, which contains only one (highest priority) link for every blog.
SELECT dbo.tblBlog.BID,
dbo.tblBlog.DateAdded,
dbo.tblBlog.PMonthName,
dbo.tblBlog.PDay,
dbo.tblBlog.Header,
dbo.tblBlog.AddedBy,
dbo.tblBlog.PContent,
dbo.tblBlog.Category,
dbo.tblBlog.Deleted,
dbo.tblBlog.Intro,
dbo.tblBlog.Tags,
dbo.tblAssets.Name,
dbo.tblAssets.Description,
dbo.tblAssets.Location,
dbo.tblAssets.Deleted AS Expr1,
dbo.tblAssetLink.Priority
FROM dbo.tblBlog
LEFT OUTER JOIN
(SELECT BID, AID,
ROW_NUMBER() OVER (PARTITION BY BID ORDER BY [Priority] DESC) as N
FROM dbo.tblAssetLink) AS filteredAssetLink
ON dbo.tblBlog.BID = filteredAssetLink.BID
LEFT OUTER JOIN dbo.tblAssets
ON filteredAssetLink.AID = dbo.tblAssets.AID
WHERE dbo.tblBlog.Deleted = 'False' AND filteredAssetLink.N = 1
ORDER BY tblBlog.DateAdded DESC

SQL Sort table with number of iteration from other table

i have bank Table and payment table something like these:
Bank Table:
BankID/Name
1/bank1Name
2/bank2Name
3/bank3Name
4/bank4Name
Payment Table:
PaymentID/ProductID/BankID
1/100/2
2/102/2
3/98/3
4/100/2
5/102/1
as it shows the number of iterated of bank id 2,3,1 are 3,1,1 respectively. I want to sort bank table with most iterated on table bank. the result something like this:
2/bank2Name
3/bank3Name
1/bank1Name
4/bank4Name

Not sure what most iterated means but if it is most paymentid something like:
select b.bankid, b.name
from bank b
join payment p
on b.bankid = p.bankid
group by b.bankid, b.name
order by count(1) desc
Another option is to order by a sub-select
select b.bankid, b.name
from bank b
order by (select count(1) from payment p where b.bankid = p.bankid) desc

This is full working example where I am guessing that bank ids for each there is no record in the Payment table are shown at the end sorted by its id:
DECLARE #Bank TABLE
(
[ID] TINYINT
,[Name] NVARCHAR(32)
)
INSERT INTO #Bank ([ID], [Name])
VALUES (1,'bank1Name')
,(2,'bank2Name')
,(3,'bank3Name')
,(4,'bank4Name')
,(5,'bank5Name')
DECLARE #Payment TABLE
(
[PaymentID] TINYINT
,[ProductID] TINYINT
,[BankID] TINYINT
)
INSERT INTO #Payment ([PaymentID], [ProductID], [BankID])
VALUES (1,100,2)
,(2,102,2)
,(3,98,3)
,(4,100,2)
,(5,102,1)
;WITH HelpValue AS
(
SELECT COUNT([PaymentID]) AS [RoCount]
FROM #Payment
)
SELECT B.[ID]
,B.[Name]
FROM #Bank B
OUTER APPLY
(
SELECT TOP 1 P.[PaymentID]
FROM #Payment P
WHERE B.[ID] = P.[BankID]
) DS
CROSS APPLY HelpValue
ORDER BY COALESCE(DS.[PaymentID], [RoCount] + B.[ID])

Group All Related Records in Many to Many Relationship, SQL graph connected components

Hopefully I'm missing a simple solution to this.
I have two tables. One contains a list of companies. The second contains a list of publishers. The mapping between the two is many to many. What I would like to do is bundle or group all of the companies in table A which have any relationship to a publisher in table B and vise versa.
The final result would look something like this (GROUPID is the key field). Row 1 and 2 are in the same group because they share the same company. Row 3 is in the same group because the publisher Y was already mapped over to company A. Row 4 is in the group because Company B was already mapped to group 1 through Publisher Y.
Said simply, any time there is any kind of shared relationship across Company and Publisher, that pair should be assigned to the same group.
ROW GROUPID Company Publisher
1 1 A Y
2 1 A X
3 1 B Y
4 1 B Z
5 2 C W
6 2 C P
7 2 D W
Fiddle
Update:
My bounty version: Given the table in the fiddle above of simply Company and Publisher pairs, populate the GROUPID field above. Think of it as creating a Family ID that encompasses all related parents/children.
SQL Server 2012

I thought about using recursive CTE, but, as far as I know, it's not possible in SQL Server to use UNION to connect anchor member and a recursive member of recursive CTE (I think it's possible to do in PostgreSQL), so it's not possible to eliminate duplicates.
declare #i int
with cte as (
select
GroupID,
row_number() over(order by Company) as rn
from Table1
)
update cte set GroupID = rn
select #i = ##rowcount
-- while some rows updated
while #i > 0
begin
update T1 set
GroupID = T2.GroupID
from Table1 as T1
inner join (
select T2.Company, min(T2.GroupID) as GroupID
from Table1 as T2
group by T2.Company
) as T2 on T2.Company = T1.Company
where T1.GroupID > T2.GroupID
select #i = ##rowcount
update T1 set
GroupID = T2.GroupID
from Table1 as T1
inner join (
select T2.Publisher, min(T2.GroupID) as GroupID
from Table1 as T2
group by T2.Publisher
) as T2 on T2.Publisher = T1.Publisher
where T1.GroupID > T2.GroupID
-- will be > 0 if any rows updated
select #i = #i + ##rowcount
end
;with cte as (
select
GroupID,
dense_rank() over(order by GroupID) as rn
from Table1
)
update cte set GroupID = rn
sql fiddle demo
I've also tried a breadth first search algorithm. I thought it could be faster (it's better in terms of complexity), so I'll provide a solution here. I've found that it's not faster than SQL approach, though:
declare #Company nvarchar(2), #Publisher nvarchar(2), #GroupID int
declare #Queue table (
Company nvarchar(2), Publisher nvarchar(2), ID int identity(1, 1),
primary key(Company, Publisher)
)
select #GroupID = 0
while 1 = 1
begin
select top 1 #Company = Company, #Publisher = Publisher
from Table1
where GroupID is null
if ##rowcount = 0 break
select #GroupID = #GroupID + 1
insert into #Queue(Company, Publisher)
select #Company, #Publisher
while 1 = 1
begin
select top 1 #Company = Company, #Publisher = Publisher
from #Queue
order by ID asc
if ##rowcount = 0 break
update Table1 set
GroupID = #GroupID
where Company = #Company and Publisher = #Publisher
delete from #Queue where Company = #Company and Publisher = #Publisher
;with cte as (
select Company, Publisher from Table1 where Company = #Company and GroupID is null
union all
select Company, Publisher from Table1 where Publisher = #Publisher and GroupID is null
)
insert into #Queue(Company, Publisher)
select distinct c.Company, c.Publisher
from cte as c
where not exists (select * from #Queue as q where q.Company = c.Company and q.Publisher = c.Publisher)
end
end
sql fiddle demo
I've tested my version and Gordon Linoff's to check how it's perform. It looks like CTE is much worse, I couldn't wait while it's complete on more than 1000 rows.
Here's sql fiddle demo with random data. My results were:
128 rows:
my RBAR solution: 190ms
my SQL solution: 27ms
Gordon Linoff's solution: 958ms
256 rows:
my RBAR solution: 560ms
my SQL solution: 1226ms
Gordon Linoff's solution: 45371ms
It's random data, so results may be not very consistent. I think timing could be changed by indexes, but don't think it could change a whole picture.
old version - using temporary table, just calculating GroupID without touching initial table:
declare #i int
-- creating table to gather all possible GroupID for each row
create table #Temp
(
Company varchar(1), Publisher varchar(1), GroupID varchar(1),
primary key (Company, Publisher, GroupID)
)
-- initializing it with data
insert into #Temp (Company, Publisher, GroupID)
select Company, Publisher, Company
from Table1
select #i = ##rowcount
-- while some rows inserted into #Temp
while #i > 0
begin
-- expand #Temp in both directions
;with cte as (
select
T2.Company, T1.Publisher,
T1.GroupID as GroupID1, T2.GroupID as GroupID2
from #Temp as T1
inner join #Temp as T2 on T2.Company = T1.Company
union
select
T1.Company, T2.Publisher,
T1.GroupID as GroupID1, T2.GroupID as GroupID2
from #Temp as T1
inner join #Temp as T2 on T2.Publisher = T1.Publisher
), cte2 as (
select
Company, Publisher,
case when GroupID1 < GroupID2 then GroupID1 else GroupID2 end as GroupID
from cte
)
insert into #Temp
select Company, Publisher, GroupID
from cte2
-- don't insert duplicates
except
select Company, Publisher, GroupID
from #Temp
-- will be > 0 if any row inserted
select #i = ##rowcount
end
select
Company, Publisher,
dense_rank() over(order by min(GroupID)) as GroupID
from #Temp
group by Company, Publisher
=> sql fiddle example

Your problem is a graph-walking problem of finding connected subgraphs. It is a little more challenging because your data structure has two types of nodes ("companies" and "pubishers") rather than one type.
You can solve this with a single recursive CTE. The logic is as follows.
First, convert the problem into a graph with only one type of node. I do this by making the nodes companies and the edges linkes between companies, using the publisher information. This is just a join:
select t1.company as node1, t2.company as node2
from table1 t1 join
table1 t2
on t1.publisher = t2.publisher
)
(For efficiency sake, you could also add t1.company <> t2.company but that is not strictly necessary.)
Now, this is a "simple" graph walking problem, where a recursive CTE is used to create all connections between two nodes. The recursive CTE walks through the graph using join. Along the way, it keeps a list of all nodes visited. In SQL Server, this needs to be stored in a string.
The code needs to ensure that it doesn't visit a node twice for a given path, because this can result in infinite recursion (and an error). If the above is called edges, the CTE that generates all pairs of connected nodes looks like:
cte as (
select e.node1, e.node2, cast('|'+e.node1+'|'+e.node2+'|' as varchar(max)) as nodes,
1 as level
from edges e
union all
select c.node1, e.node2, c.nodes+e.node2+'|', 1+c.level
from cte c join
edges e
on c.node2 = e.node1 and
c.nodes not like '|%'+e.node2+'%|'
)
Now, with this list of connected nodes, assign each node the minimum of all the nodes it is connected to, including itself. This serves as an identifier of connected subgraphs. That is, all companies connected to each other via the publishers will have the same minimum.
The final two steps are to enumerate this minimum (as the GroupId) and to join the GroupId back to the original data.
The full (and I might add tested) query looks like:
with edges as (
select t1.company as node1, t2.company as node2
from table1 t1 join
table1 t2
on t1.publisher = t2.publisher
),
cte as (
select e.node1, e.node2,
cast('|'+e.node1+'|'+e.node2+'|' as varchar(max)) as nodes,
1 as level
from edges e
union all
select c.node1, e.node2,
c.nodes+e.node2+'|',
1+c.level
from cte c join
edges e
on c.node2 = e.node1 and
c.nodes not like '|%'+e.node2+'%|'
),
nodes as (
select node1,
(case when min(node2) < node1 then min(node2) else node1 end
) as grp
from cte
group by node1
)
select t.company, t.publisher, grp.GroupId
from table1 t join
(select n.node1, dense_rank() over (order by grp) as GroupId
from nodes n
) grp
on t.company = grp.node1;
Note that this works on finding any connected subgraphs. It does not assume that any particular number of levels.
EDIT:
The question of performance for this is vexing. At a minimum, the above query will run better with an index on Publisher. Better yet is to take #MikaelEriksson's suggestion, and put the edges in a separate table.
Another question is whether you look for equivalency classes among the Companies or the Publishers. I took the approach of using Companies, because I think that has better "explanability" (my inclination to respond was based on numerous comments that this could not be done with CTEs).
I am guessing that you could get reasonable performance from this, although that requires more knowledge of your data and system than provided in the OP. It is quite likely, though, that the best performance will come from a multiple query approach.

Here is my solution SQL Fiddle
The nature of the relationships require looping as I figure.
Here is the SQL:
--drop TABLE Table1
CREATE TABLE Table1
([row] int identity (1,1),GroupID INT NULL,[Company] varchar(2), [Publisher] varchar(2))
;
INSERT INTO Table1
(Company, Publisher)
select
left(newid(), 2), left(newid(), 2)
declare #i int = 1
while #i < 8
begin
;with cte(Company, Publisher) as (
select
left(newid(), 2), left(newid(), 2)
from Table1
)
insert into Table1(Company, Publisher)
select distinct c.Company, c.Publisher
from cte as c
where not exists (select * from Table1 as t where t.Company = c.Company and t.Publisher = c.Publisher)
set #i = #i + 1
end;
CREATE NONCLUSTERED INDEX IX_Temp1 on Table1 (Company)
CREATE NONCLUSTERED INDEX IX_Temp2 on Table1 (Publisher)
declare #counter int=0
declare #row int=0
declare #lastnullcount int=0
declare #currentnullcount int=0
WHILE EXISTS (
SELECT *
FROM Table1
where GroupID is null
)
BEGIN
SET #counter=#counter+1
SET #lastnullcount =0
SELECT TOP 1
#row=[row]
FROM Table1
where GroupID is null
order by [row] asc
SELECT #currentnullcount=count(*) from table1 where groupid is null
WHILE #lastnullcount <> #currentnullcount
BEGIN
SELECT #lastnullcount=count(*)
from table1
where groupid is null
UPDATE Table1
SET GroupID=#counter
WHERE [row]=#row
UPDATE t2
SET t2.GroupID=#counter
FROM Table1 t1
INNER JOIN Table1 t2 on t1.Company=t2.Company
WHERE t1.GroupID=#counter
AND t2.GroupID IS NULL
UPDATE t2
SET t2.GroupID=#counter
FROM Table1 t1
INNER JOIN Table1 t2 on t1.publisher=t2.publisher
WHERE t1.GroupID=#counter
AND t2.GroupID IS NULL
SELECT #currentnullcount=count(*)
from table1
where groupid is null
END
END
SELECT * FROM Table1
Edit:
Added indexes as I would expect on the real table and be more in line with the other data sets Roman is using.

You are trying to find all of the connected components of your graph, which can only be done iteratively. If you know the maximum width of any connected component (i.e. the maximum number of links you will have to take from one company/publisher to another), you could in principle do it something like this:
SELECT
MIN(x2.groupID) AS groupID,
x1.Company,
x1.Publisher
FROM Table1 AS x1
INNER JOIN (
SELECT
MIN(x2.Company) AS groupID,
x1.Company,
x1.Publisher
FROM Table1 AS x1
INNER JOIN Table1 AS x2
ON x1.Publisher = x2.Publisher
GROUP BY
x1.Publisher,
x1.Company
) AS x2
ON x1.Company = x2.Company
GROUP BY
x1.Publisher,
x1.Company;
You have to keep nesting the subquery (alternating joins on Company and Publisher, and with the deepest subquery saying MIN(Company) rather than MIN(groupID)) to the maximum iteration depth.
I don't really recommend this, though; it would be cleaner to do this outside of SQL.
Disclaimer: I don't know anything about SQL Server 2012 (or any other version); it may have some kind of additional scripting ability to let you do this iteration dynamically.

This is a recursive solution, using XML:
with a as ( -- recursive result, containing shorter subsets and duplicates
select cast('<c>' + company + '</c>' as xml) as companies
,cast('<p>' + publisher + '</p>' as xml) as publishers
from Table1
union all
select a.companies.query('for $c in distinct-values((for $i in /c return string($i),
sql:column("t.company")))
order by $c
return <c>{$c}</c>')
,a.publishers.query('for $p in distinct-values((for $i in /p return string($i),
sql:column("t.publisher")))
order by $p
return <p>{$p}</p>')
from a join Table1 t
on ( a.companies.exist('/c[text() = sql:column("t.company")]') = 0
or a.publishers.exist('/p[text() = sql:column("t.publisher")]') = 0)
and ( a.companies.exist('/c[text() = sql:column("t.company")]') = 1
or a.publishers.exist('/p[text() = sql:column("t.publisher")]') = 1)
), b as ( -- remove the shorter versions from earlier steps of the recursion and the duplicates
select distinct -- distinct cannot work on xml types, hence cast to nvarchar
cast(companies as nvarchar) as companies
,cast(publishers as nvarchar) as publishers
,DENSE_RANK() over(order by cast(companies as nvarchar), cast(publishers as nvarchar)) as groupid
from a
where not exists (select 1 from a as s -- s is a proper subset of a
where (cast('<s>' + cast(s.companies as varchar)
+ '</s><a>' + cast(a.companies as varchar) + '</a>' as xml)
).value('if((count(/s/c) > count(/a/c))
and (some $s in /s/c/text() satisfies
(some $a in /a/c/text() satisfies $s = $a))
) then 1 else 0', 'int') = 1
)
and not exists (select 1 from a as s -- s is a proper subset of a
where (cast('<s>' + cast(s.publishers as nvarchar)
+ '</s><a>' + cast(a.publishers as nvarchar) + '</a>' as xml)
).value('if((count(/s/p) > count(/a/p))
and (some $s in /s/p/text() satisfies
(some $a in /a/p/text() satisfies $s = $a))
) then 1 else 0', 'int') = 1
)
), c as ( -- cast back to xml
select cast(companies as xml) as companies
,cast(publishers as xml) as publishers
,groupid
from b
)
select Co.company.value('(./text())[1]', 'varchar') as company
,Pu.publisher.value('(./text())[1]', 'varchar') as publisher
,c.groupid
from c
cross apply companies.nodes('/c') as Co(company)
cross apply publishers.nodes('/p') as Pu(publisher)
where exists(select 1 from Table1 t -- restrict to only the combinations that exist in the source
where t.company = Co.company.value('(./text())[1]', 'varchar')
and t.publisher = Pu.publisher.value('(./text())[1]', 'varchar')
)
The set of companies and the set of publishers are kept in XML fields in the intermediate steps, and there is some casting between xml and nvarchar necessary due to some limitations of SQL Server (like not being able to group or use distinct on XML columns.

Bit late to the challenge, and since SQLFiddle seems to be down ATM I'll have to guess your data-structures. Nevertheless, it seemed like a fun challenge (and it was =) so here's what I made from it :
Setup:
IF OBJECT_ID('t_link') IS NOT NULL DROP TABLE t_link
IF OBJECT_ID('t_company') IS NOT NULL DROP TABLE t_company
IF OBJECT_ID('t_publisher') IS NOT NULL DROP TABLE t_publisher
IF OBJECT_ID('tempdb..#link_A') IS NOT NULL DROP TABLE #link_A
IF OBJECT_ID('tempdb..#link_B') IS NOT NULL DROP TABLE #link_B
GO
CREATE TABLE t_company ( company_id int IDENTITY(1, 1) NOT NULL PRIMARY KEY,
company_name varchar(100) NOT NULL)
GO
CREATE TABLE t_publisher (publisher_id int IDENTITY(1, 1) NOT NULL PRIMARY KEY,
publisher_name varchar(100) NOT NULL)
CREATE TABLE t_link (company_id int NOT NULL FOREIGN KEY (company_id) REFERENCES t_company (company_id),
publisher_id int NOT NULL FOREIGN KEY (publisher_id) REFERENCES t_publisher (publisher_id),
PRIMARY KEY (company_id, publisher_id),
group_id int NULL
)
GO
-- example content
-- ROW GROUPID Company Publisher
--1 1 A Y
--2 1 A X
--3 1 B Y
--4 1 B Z
--5 2 C W
--6 2 C P
--7 2 D W
INSERT t_company (company_name) VALUES ('A'), ('B'), ('C'), ('D')
INSERT t_publisher (publisher_name) VALUES ('X'), ('Y'), ('Z'), ('W'), ('P')
INSERT t_link (company_id, publisher_id)
SELECT company_id, publisher_id
FROM t_company, t_publisher
WHERE (company_name = 'A' AND publisher_name = 'Y')
OR (company_name = 'A' AND publisher_name = 'X')
OR (company_name = 'B' AND publisher_name = 'Y')
OR (company_name = 'B' AND publisher_name = 'Z')
OR (company_name = 'C' AND publisher_name = 'W')
OR (company_name = 'C' AND publisher_name = 'P')
OR (company_name = 'D' AND publisher_name = 'W')
GO
/*
-- volume testing
TRUNCATE TABLE t_link
DELETE t_company
DELETE t_publisher
DECLARE #company_count int = 1000,
#publisher_count int = 450,
#links_count int = 800
INSERT t_company (company_name)
SELECT company_name = Convert(varchar(100), NewID())
FROM master.dbo.fn_int_list(1, #company_count)
UPDATE STATISTICS t_company
INSERT t_publisher (publisher_name)
SELECT publisher_name = Convert(varchar(100), NewID())
FROM master.dbo.fn_int_list(1, #publisher_count)
UPDATE STATISTICS t_publisher
-- Random links between the companies & publishers
DECLARE #count int
SELECT #count = 0
WHILE #count < #links_count
BEGIN
SELECT TOP 30 PERCENT row_id = IDENTITY(int, 1, 1), company_id = company_id + 0
INTO #link_A
FROM t_company
ORDER BY NewID()
SELECT TOP 30 PERCENT row_id = IDENTITY(int, 1, 1), publisher_id = publisher_id + 0
INTO #link_B
FROM t_publisher
ORDER BY NewID()
INSERT TOP (#links_count - #count) t_link (company_id, publisher_id)
SELECT A.company_id,
B.publisher_id
FROM #link_A A
JOIN #link_B B
ON A.row_id = B.row_id
WHERE NOT EXISTS ( SELECT *
FROM t_link old
WHERE old.company_id = A.company_id
AND old.publisher_id = B.publisher_id)
SELECT #count = #count + ##ROWCOUNT
DROP TABLE #link_A
DROP TABLE #link_B
END
*/
Actual grouping:
IF OBJECT_ID('tempdb..#links') IS NOT NULL DROP TABLE #links
GO
-- apply grouping
-- init
SELECT row_id = IDENTITY(int, 1, 1),
company_id,
publisher_id,
group_id = 0
INTO #links
FROM t_link
-- don't see an index that would be actually helpful here right-away, using row_id to avoid HEAP
CREATE CLUSTERED INDEX idx0 ON #links (row_id)
--CREATE INDEX idx1 ON #links (company_id)
--CREATE INDEX idx2 ON #links (publisher_id)
UPDATE #links
SET group_id = row_id
-- start grouping
WHILE ##ROWCOUNT > 0
BEGIN
UPDATE #links
SET group_id = new_group_id
FROM #links upd
CROSS APPLY (SELECT new_group_id = Min(group_id)
FROM #links new
WHERE new.company_id = upd.company_id
OR new.publisher_id = upd.publisher_id
) x
WHERE upd.group_id > new_group_id
-- select * from #links
END
-- remove 'holes'
UPDATE #links
SET group_id = (SELECT COUNT(DISTINCT o.group_id)
FROM #links o
WHERE o.group_id <= upd.group_id)
FROM #links upd
GO
UPDATE t_link
SET group_id = new.group_id
FROM t_link upd
LEFT OUTER JOIN #links new
ON new.company_id = upd.company_id
AND new.publisher_id = upd.publisher_id
GO
SELECT row = ROW_NUMBER() OVER (ORDER BY group_id, company_name, publisher_name),
l.group_id,
c.company_name, -- c.company_id,
p.publisher_name -- , p.publisher_id
from t_link l
JOIN t_company c
ON l.company_id = c.company_id
JOIN t_publisher p
ON p.publisher_id = l.publisher_id
ORDER BY 1
At first sight this approach hasn't been tried yet by anyone else, interesting to see how this can be done in a variety of ways... (preferred not to read them upfront as it would spoil the puzzle =)
Results look as expected (as far as I understand the requirements and the example) and performance isn't too shabby either although there is no real indication on the amount of records this should work on; not sure how it would scale but don't expect too many problems either...

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

SQL - How to find a value in a tree level data structure - sql

Related

MS SQL recursively get child id with parent ids

Troubles isolating target cell in recursive sql query

How to join three tables with distinct

SQL Sort table with number of iteration from other table

Group All Related Records in Many to Many Relationship, SQL graph connected components

Categories

Resources