Azure Synapse recursive CTE alternative for flatten hierarchy - sql

I am facing the following challenge: I would like to flatten a parent-child hierarchy so that each row holds a child together with one of its ancestors (a parent from any of the upper levels).
Source data
Child | Parent
------|-------
A     | B
B     | C
D     | E
X     | Y
Y     | Z
Wished result
Child | Any Parent
------|-----------
A     | B
A     | C
A     | D
A     | E
B     | C
B     | D
B     | E
C     | D
C     | E
D     | E
X     | Y
X     | Z
Y     | Z
There is no information about levels and there is no single distinct "top parent". It is an unbalanced tree, and there might be multiple top-level parents, each with different children.
I know this should normally be done via a recursive CTE, but since this is not supported in Azure Synapse I am searching for a smart loop statement to solve this puzzle. Performance is not the priority since it is in a DWH context with not that much data.
Really appreciate any smart hints or solutions
Tried several recursive CTE solutions for that which are unfortunately all not supported in Synapse

I'm not sure what the limitations are with Azure Synapse, but try this.
It won't win any awards for elegance, but it does the job. I also corrected what I assumed to be a missing record in your sample data (C, D), based on your expected output.
-- Flattens a child -> parent hierarchy into (child, any ancestor) pairs without a
-- recursive CTE: for every item, walk up the parent chain one step at a time.
-- Assumes each child has at most one parent and the data contains no cycles
-- (a cycle would keep the inner loop running forever).
-- NOTE(review): table variables may not be available on every Synapse pool type;
-- swap them for temp tables if needed - confirm against the target pool.
DECLARE @t TABLE (Child varchar(10), Parent varchar(10));
INSERT INTO @t (Child, Parent)
VALUES ('A', 'B'), ('B', 'C'), ('C', 'D'), ('D', 'E'), ('X', 'Y'), ('Y', 'Z');

DECLARE @Items TABLE (Item varchar(10), Done int);   -- work list; Done IS NULL = not processed yet
DECLARE @Results TABLE (Child varchar(10), Parent varchar(10));
DECLARE @p varchar(10);         -- ancestor currently being recorded
DECLARE @curItem varchar(10);   -- item whose ancestors are being collected

-- Every distinct node, whether it shows up as a child or as a parent (UNION
-- de-duplicates). NULLs are skipped: a NULL item would end the outer loop early.
INSERT INTO @Items (Item)
SELECT Child FROM @t WHERE Child IS NOT NULL
UNION
SELECT Parent FROM @t WHERE Parent IS NOT NULL;

SET @curItem = (SELECT TOP 1 Item FROM @Items WHERE Done IS NULL);

WHILE @curItem IS NOT NULL
BEGIN
    -- SET with a scalar subquery yields NULL when the item has no parent row.
    SET @p = (SELECT Parent FROM @t WHERE Child = @curItem);

    WHILE @p IS NOT NULL
    BEGIN
        -- Exactly one row per (item, ancestor). Selecting from @t WHERE Parent = @p
        -- would emit one copy per sibling that shares that parent.
        INSERT INTO @Results (Child, Parent) VALUES (@curItem, @p);
        SET @p = (SELECT Parent FROM @t WHERE Child = @p);
    END;

    UPDATE @Items SET Done = 1 WHERE Item = @curItem;
    SET @curItem = (SELECT TOP 1 Item FROM @Items WHERE Done IS NULL);
END;

SELECT Child, Parent FROM @Results;
I tested this and it does work. I tried to put it on SQL Fiddle, but it would not run there for some reason.
Anyway,
Here's the results

Related

SQL Server - find a letter in a chain which doesn't belong to anyone

I have a query which takes itemnum and search the table to find the itemnum which doesn't have any replacementItemNum in a chain.
It is like a chain where a belong to b and b belongs c , need to find the letter in a chain which doesn't belong to anyone.
Table has 72k records.
If I pass A or B or C, I will get letter D because it doesn't have any replacement item.
If I pass D, then result would be NULL
This is how the data (chain) looks like:
ItemNum - ReplacementItemNum
----------------------------
A - B
B - C
C - D
D -
This query is taking too long; can this be re written so it doesn't take that long?
-- Follows the replacement chain that starts at @ItemNum (within @CountryOID) and
-- returns the last item of that chain, or NULL when @ItemNum has no replacement.
DECLARE @CountryOID INT = 250,
        @ItemNum VARCHAR(50) = 'A';
DECLARE @LastItem VARCHAR(50);   -- link still to be followed
DECLARE @NextItem VARCHAR(50);   -- most recent non-empty link; the answer once the loop ends

-- SET with a scalar subquery assigns NULL when no row matches. SELECT @var = col
-- leaves the variable untouched in that case, which makes the loop below spin
-- forever as soon as a replacement item has no row of its own.
-- TOP (1) keeps the subquery scalar even if (ItemNum, CountryOID) is not unique.
SET @LastItem = (
    SELECT TOP (1) ReplaceItemNum
    FROM dbo.MacInventory
    WHERE ItemNum = @ItemNum
      AND CountryOID = @CountryOID
);

-- NULL <> '' is not true, so both '' and NULL end the walk.
-- NOTE(review): a circular chain (A -> B -> A) would still loop forever.
WHILE @LastItem <> ''
BEGIN
    SET @NextItem = @LastItem;
    SET @LastItem = (
        SELECT TOP (1) ReplaceItemNum
        FROM dbo.MacInventory
        WHERE ItemNum = @LastItem
          AND CountryOID = @CountryOID
    );
END;

SELECT @NextItem;
Recursive cte to the rescue!
First, create and populate sample table (Please save us this step in your future questions):
-- Sample data: each row maps an item to its replacement; NULL marks the end of a chain.
DECLARE @T AS TABLE
(
    ItemNum char(1),
    ReplacementItemNum char(1)
);

INSERT INTO @T (ItemNum, ReplacementItemNum) VALUES
('A', 'B'),
('B', 'C'),
('C', 'D'),
('D', NULL),
('E', 'F'), -- added some more data to make sure we don't get the wrong result...
('F', NULL);
Set your starting point:
-- Item whose replacement chain will be followed.
DECLARE @StartFrom char(1) = 'A';
The recursive cte:
-- Recursive CTE: the anchor picks the @StartFrom row, and each recursive step
-- pulls the row of the item named as the previous row's replacement.
-- A CTE must be followed directly by the statement that consumes it.
WITH CTE AS
(
    SELECT ItemNum, ReplacementItemNum
    FROM @T
    WHERE ItemNum = @StartFrom
    UNION ALL
    SELECT T.ItemNum, T.ReplacementItemNum
    FROM @T AS T
    INNER JOIN CTE
        ON T.ItemNum = CTE.ReplacementItemNum
)
The query:
-- The row with no replacement is the end of the chain; report NULL when that
-- row is the starting item itself (it has nothing replacing it).
SELECT IIF(ItemNum = @StartFrom, NULL, ItemNum) AS ItemNum
FROM CTE
WHERE ReplacementItemNum IS NULL
And finally, the result:
ItemNum
D
You can see a live demo on rextester.
Do you have proper indexes on the column (CountryOID, ItemNum) in the table? Try the following recursive cte solution:
-- Walks the replacement chain for @ItemNum within @CountryOID and returns the
-- row that ends it (blank or NULL replacement).
-- The recursive member must be attached with UNION ALL; SQL Server rejects a
-- recursive CTE whose top-level operator is a plain UNION.
;WITH cte_item (ItemNum, ReplaceItemNum)
AS
(
    -- anchor: the starting item
    SELECT ItemNum, ReplaceItemNum
    FROM dbo.MacInventory
    WHERE ItemNum = @ItemNum
      AND CountryOID = @CountryOID
    UNION ALL
    -- recursive step: the row of whatever replaced the previous item
    SELECT m.ItemNum, m.ReplaceItemNum
    FROM dbo.MacInventory AS m
    INNER JOIN cte_item AS i
        ON m.ItemNum = i.ReplaceItemNum
       AND m.CountryOID = @CountryOID
)
SELECT ItemNum, ReplaceItemNum
FROM cte_item
WHERE ReplaceItemNum = ''
   OR ReplaceItemNum IS NULL;   -- treat a NULL replacement as end-of-chain too

Need to get descendent items list in stored procedure

Description:
There is a table which consists of two columns (ParentId and ChildId) and represents a hierarchy of some entities. Each Id can appear in the ParentId column only once, which means that each entity has only one child entity.
Problem: I need to check whether entity(its Id) is in descendants list of parent entity.
-- Walks down from @parentId one child at a time and prints @targetChildId if it
-- is reached, or 0 when the chain ends (or loops) without meeting it.
DECLARE @parentId INT;
DECLARE @childId INT;
DECLARE @targetChildId INT;
DECLARE @stepsLeft INT;
SET @targetChildId=<put id of a child you want to find>
SET @parentId=<put id of a parent you are looking child for>
SET @childId = 0;
-- Each ParentId occurs at most once, so a walk that never repeats a node follows
-- at most one edge per row. Capping the steps at the row count stops the loop
-- on cyclic data instead of letting it run forever.
SET @stepsLeft = (SELECT COUNT(*) FROM Hierarchies);

WHILE (@childId <> @targetChildId)
BEGIN
    -- Single lookup: NULL means this parent has no child row (or a NULL child).
    SET @childId = (SELECT ChildId FROM Hierarchies WHERE ParentId = @parentId);

    IF (@childId IS NULL OR @stepsLeft <= 0)
    BEGIN
        SET @childId = 0;   -- "not found" marker
        BREAK;
    END;

    SET @parentId = @childId;
    SET @stepsLeft = @stepsLeft - 1;
END;

PRINT @childId;
It returns 0 if target child not found in target parent.
Sample data:
-- Sample hierarchy of (entity, child) edges; the last edge (4 -> 1) points back
-- to entity 1 and so closes a loop.
CREATE TABLE [dbo].[EntityHierarchy]
(
    [EntityId]      INT,
    [ChildEntityId] INT
);

INSERT INTO [dbo].[EntityHierarchy] ([EntityId], [ChildEntityId])
VALUES
    (1, 2),
    (2, 3),
    (3, 4),
    (4, 1); -- Cycle
Find circular relationships:
-- Lists the descendants of @SearchEntityId in walk order, then returns the
-- outgoing edge of the last descendant reached. If that edge points back to
-- @SearchEntityId, the hierarchy contains a cycle through it.
DECLARE @SearchEntityId INT = 1;

;WITH [cteRursive] AS
(
    -- anchor: direct child of the searched entity
    SELECT 1 AS [ROW_NUMBER],
           [ChildEntityId] AS [EntityId]
    FROM [dbo].[EntityHierarchy]
    WHERE [EntityId] = @SearchEntityId
    UNION ALL
    -- step: child of the previous descendant; stop before re-entering the start
    SELECT r.[ROW_NUMBER] + 1,
           h.[ChildEntityId]
    FROM [cteRursive] AS r
    INNER JOIN [dbo].[EntityHierarchy] AS h
        ON r.[EntityId] = h.[EntityId]
    WHERE h.[ChildEntityId] <> @SearchEntityId
)
SELECT h.[EntityId],
       h.[ChildEntityId]
FROM [cteRursive] AS r
INNER JOIN [dbo].[EntityHierarchy] AS h
    ON r.[EntityId] = h.[EntityId]
WHERE r.[ROW_NUMBER] = (SELECT MAX([ROW_NUMBER]) FROM [cteRursive]);
I'm using a recursive CTE to list the descendants. The child of the last descendant either creates a cycle or not.

Algorithm to select nodes and their parents in a tree

Let's say you have a tree structure as follows:
a [Level 0]
/ | \
b c d [Level 1]
/ \ |
e f g [Level 2]
| / \
h i j [Level 3]
I have represented this in a database like so:
node parent
------------
a null
b a
c a
d a
[...]
h f
i g
I'd like to write a function that, given a level, it will return all nodes at that level and their parents.
For example:
f(0) => { a }
f(1) => { a, b, c, d }
f(2) => { a, b, c, d, e, f, g }
Any thoughts?
Here I iterate through the levels, adding each one to the table with the level it is on.
-- Adjacency list for the example tree: one row per node, pointing at its parent
-- (NULL for the root).
CREATE TABLE mytable (
    node   varchar(80),
    parent varchar(80),
    CONSTRAINT PK_mytable PRIMARY KEY NONCLUSTERED (node)
);

-- index for speed selecting on parent
CREATE INDEX IDX_mytable_parent ON mytable (parent, node);

INSERT INTO mytable (node, parent)
VALUES
    ('a', NULL),
    ('b', 'a'),
    ('c', 'a'),
    ('d', 'a'),
    ('e', 'b'),
    ('f', 'b'),
    ('g', 'd'),
    ('h', 'f'),
    ('i', 'g'),
    ('j', 'g');
GO
-- fn_level(@level): returns every node from the root level (0) down to @level,
-- together with the level it sits on.
-- The result is built level by level: level 0 holds the nodes without a parent,
-- and each later level holds the children of the nodes found one level earlier.
-- CREATE FUNCTION has to be the first statement of its batch, hence the GO
-- separators around it.
CREATE FUNCTION fn_level (@level int)
RETURNS @nodes TABLE (Node varchar(80), TreeLevel int)
AS
BEGIN
    DECLARE @current int;
    SET @current = 0;

    WHILE @current <= @level
    BEGIN
        IF (@current = 0)
        BEGIN
            -- roots
            INSERT @nodes (Node, TreeLevel)
            SELECT node, @current
            FROM mytable
            WHERE parent IS NULL;
        END
        ELSE
        BEGIN
            -- children of the previous level
            INSERT @nodes (Node, TreeLevel)
            SELECT mt.node, @current
            FROM @nodes AS n
            INNER JOIN mytable AS mt
                ON mt.parent = n.Node
            WHERE n.TreeLevel = (@current - 1);
        END;

        SET @current = @current + 1;
    END;

    RETURN;
END;
GO
SELECT Node, TreeLevel FROM fn_level(2);
The usual way to do this, unless your flavour of SQL has a special non-standard function for it, is to build a path table that has these columns:
parent_key
child_key
path_length
To fill this table, you use a recursive or procedural loop to find all of the parents, grand-parents, great-grand-parents, etc for each item in your list of items. The recursion or looping needs to continue until you stop finding longer paths which return new pairs.
At the end, you'll have a list of records that tell you things like (a,b,1), (a,f,2), (a,h,3) etc. Then, to get everything that is at level x and above, you do a distinct select on all of the children with a path_length <= x (unioned with the root, unless you included a record of (null, root, 0) when you started your recursion/looping.
It would be nice if SQL were better at handling directed graphs (trees) but unfortunately you have to cheat it with extra tables like this.
A solution for MySQL is less than ideal.
Assuming that the maximum depth of the tree is known:
-- Self-joins the node table once per level (aliases a..e), so it only covers
-- trees that are at most five levels deep; a is restricted to root rows.
--   item  = the deepest non-NULL node on the joined path
--   depth = how many levels that path spans (a root on its own counts as 1)
-- NOTE(review): because item always takes the deepest alias, each output row
-- describes a whole root-to-deepest path; a node that has children shows up
-- only as part of its descendants' paths, not as an item of its own.
-- NOTE(review): `table t` looks like a placeholder for the real table name, and
-- NVL/NVL2 are Oracle-style functions - adapt both to the target database.
SELECT
nvl(e.node, nvl(d.node, nvl(c.node, nvl(b.node, a.node)))) item
, nvl2(e.node, 5, nvl2(d.node, 4, nvl2(c.node, 3, nvl2(b.node, 2, 1)))) depth
FROM table t AS a
LEFT JOIN table t AS b ON (a.node = b.parent)
LEFT JOIN table t AS c ON (b.node = c.parent)
LEFT JOIN table t AS d ON (c.node = d.parent)
LEFT JOIN table t AS e ON (d.node = e.parent)
WHERE a.parent IS NULL
This will give you every node and its depth. After that it's trivial to select every item that has a depth less than X.
If the depth of the tree is not known, or is significantly large then the solution is iterative as another poster has said.
Shamelessly copying from Jason, I made a function-less solution which I tested with postgresql (which has functions - maybe it would have worked out of the box).
-- Adjacency list for the example tree: one row per node with its parent
-- (NULL for the root).
CREATE TABLE tree (
    node   char(1),
    parent char(1)
);

INSERT INTO tree (node, parent) VALUES
    ('a', NULL),
    ('b', 'a'),
    ('c', 'a'),
    ('d', 'a'),
    ('e', 'b'),
    ('f', 'b'),
    ('g', 'd'),
    ('h', 'f'),
    ('i', 'g'),
    ('j', 'g');

ALTER TABLE tree ADD COLUMN level int2;

--
-- Idea: a node's level is its parent's level + 1
--   node  parent  level
--   a     -       (seed)
--   b     a       a.level + 1
--   c     a       a.level + 1
--   e     b       b.level + 1

-- Seed the root, which is a:
UPDATE tree
SET level = 0
WHERE node = 'a';

-- One pass: every node that has no level yet takes its parent's level + 1,
-- provided the parent already has one (otherwise it stays NULL for now).
UPDATE tree AS tout  -- outer
SET level = (
    SELECT ti.level + 1
    FROM tree AS ti  -- inner
    WHERE ti.node = tout.parent
      AND ti.level IS NOT NULL
)
WHERE tout.level IS NULL;
The update statement is pure sql, and has to be repeated for every level, to fill the table up.
kram=# select * from tree;
node | parent | level
------+--------+-------
a | | 1
b | a | 2
c | a | 2
d | a | 2
e | b | 3
f | b | 3
g | d | 3
h | f | 4
i | g | 4
j | g | 4
(10 Zeilen)
I started with 'level=1', not '0' for a, therefore the difference.
SQL doesn't always handle these recursive problems very well.
Some DBMS platforms allow you to use Common Table Expressions which are effectively queries that call themselves, allowing you to recurse through a data structure. There's no support for this in MySQL, so I'd recommend you use multiple queries constructed and managed by a script written in another language.
I do not know much about databases, or their terminology, but would it work if you performed a joint product of a table with itself N times in order to find all elements at level N?
In other words, perform a query in which you search for all entries that have parent A. That will return to you a list of all its children. Then, repeat the query to find the children of each of these children. Repeat this procedure until you find all children at level N.
In this way you would not have to pre-compute the depth of each item.

Multiple parents tree (or digraph) implementation sql server 2005

I need to implement a multi-parented tree (or digraph) onto SQL Server 2005.
I've read several articles, but most of them uses single-parented trees with a unique root like the following one.
-My PC
-Drive C
-Documents and Settings
-Program Files
-Adobe
-Microsoft
-Folder X
-Drive D
-Folder Y
-Folder Z
In this one, everything derives from a root element (My PC).
In my case, a child could have more than 1 parent, like the following:
G A
\ /
B
/ \
X C
/ \
D E
\ /
F
So I have the following code:
-- Directed graph stored as (Id -> NextId) edges; a node may have several
-- predecessors (G and A both lead to B).
CREATE TABLE #ObjectRelations
(
    Id     varchar(20),
    NextId varchar(20)
);

INSERT INTO #ObjectRelations (Id, NextId) VALUES ('G', 'B');
INSERT INTO #ObjectRelations (Id, NextId) VALUES ('A', 'B');
INSERT INTO #ObjectRelations (Id, NextId) VALUES ('B', 'C');
INSERT INTO #ObjectRelations (Id, NextId) VALUES ('B', 'X');
INSERT INTO #ObjectRelations (Id, NextId) VALUES ('C', 'E');
INSERT INTO #ObjectRelations (Id, NextId) VALUES ('C', 'D');
INSERT INTO #ObjectRelations (Id, NextId) VALUES ('E', 'F');
INSERT INTO #ObjectRelations (Id, NextId) VALUES ('D', 'F');

-- The traversal needs an explicit starting object.
DECLARE @id varchar(20);
SET @id = 'A';

WITH Objects (Id, NextId) AS
( -- This is the 'Anchor' or starting point of the recursive query
    SELECT rel.Id,
           rel.NextId
    FROM #ObjectRelations AS rel
    WHERE rel.Id = @id
    UNION ALL -- This is the recursive portion of the query
    SELECT rel.Id,
           rel.NextId
    FROM #ObjectRelations AS rel
    INNER JOIN Objects -- Note the reference to CTE table name (Recursive Join)
        ON rel.Id = Objects.NextId
)
SELECT o.Id,
       o.NextId
FROM Objects AS o;

DROP TABLE #ObjectRelations;
Which returns the following SET:
Id NextId
-------------------- --------------------
A B
B C
B X
C E
C D
D F
E F
Expected result SET:
Id NextId
-------------------- --------------------
G B
A B
B C
B X
C E
C D
D F
E F
Note that the relation G->B is missing, because the query asks for a starting object (which doesn't work for me either, because I don't know the root object from the start), and using A as the starting point ignores the G->B relationship.
So, this code doesn't work in my case because it asks for a starting object, which is obvious in a SINGLE-parent tree (will always be the root object). But in multi-parent tree, you could have more than 1 "root" object (like in the example, G and A are the "root" objects, where root is an object which doesn't have a parent (ancestor)).
So I'm kind of stuck here... I need to modify the query so that it does NOT ask for a starting object and recursively traverses the entire tree.
I don't know if that's possible with the (Id, NextId) implementation... may be I need to store it like a graph using some kind of Incidence matrix, adjacency matrix or whatever (see http://willets.org/sqlgraphs.html).
Any help? What do you think guys?
Thank you very much for your time =)
Cheers!
Sources:
Source 1
Source 2
Source 3
Well, I finally came up with the following solution.
It's the way I found to support multi-root trees and also cycling digraphs.
-- Traverses a directed graph stored as (Id -> NextId) edges, starting from
-- every root (an Id that never occurs as a NextId). Supports several roots and
-- cyclic graphs: each path remembers the Ids it has visited and stops before
-- revisiting one.
CREATE TABLE #ObjectRelations
(
    Id     varchar(20),
    NextId varchar(20)
);

/* Cycle */
/*
insert into #ObjectRelations values ('A', 'B')
insert into #ObjectRelations values ('B', 'C')
insert into #ObjectRelations values ('C', 'A')
*/
/* Multi root */
INSERT INTO #ObjectRelations (Id, NextId) VALUES ('G', 'B');
INSERT INTO #ObjectRelations (Id, NextId) VALUES ('A', 'B');
INSERT INTO #ObjectRelations (Id, NextId) VALUES ('B', 'C');
INSERT INTO #ObjectRelations (Id, NextId) VALUES ('B', 'X');
INSERT INTO #ObjectRelations (Id, NextId) VALUES ('C', 'E');
INSERT INTO #ObjectRelations (Id, NextId) VALUES ('C', 'D');
INSERT INTO #ObjectRelations (Id, NextId) VALUES ('E', 'F');
INSERT INTO #ObjectRelations (Id, NextId) VALUES ('D', 'F');

DECLARE @startIds TABLE
(
    Id varchar(20) PRIMARY KEY
);

WITH
Ids (Id) AS
(
    SELECT Id
    FROM #ObjectRelations
),
NextIds (Id) AS
(
    SELECT NextId
    FROM #ObjectRelations
),
Roots (Id) AS
(
    -- objects without a predecessor; empty when the graph is purely cyclic
    SELECT DISTINCT Ids.Id
    FROM Ids
    WHERE NOT EXISTS (SELECT 1 FROM NextIds WHERE NextIds.Id = Ids.Id)
)
INSERT INTO @startIds (Id)
SELECT Id
FROM Roots
UNION
/* Purely cyclic graph: there is no root, so just pick any object. This fallback
   only applies when no root exists, so it cannot add a stray extra start. */
SELECT TOP 1 Id
FROM Ids
WHERE NOT EXISTS (SELECT 1 FROM Roots);

WITH Objects (Id, NextId, [Level], Way) AS
( -- This is the 'Anchor' or starting point of the recursive query
    SELECT rel.Id,
           rel.NextId,
           1,
           CAST(rel.Id AS varchar(max))
    FROM #ObjectRelations AS rel
    WHERE rel.Id IN (SELECT Id FROM @startIds)
    UNION ALL -- This is the recursive portion of the query
    SELECT rel.Id,
           rel.NextId,
           RecObjects.[Level] + 1,
           RecObjects.Way + ', ' + rel.Id
    FROM #ObjectRelations AS rel
    INNER JOIN Objects AS RecObjects -- Note the reference to CTE table name (Recursive Join)
        ON rel.Id = RecObjects.NextId
    -- Skip Ids already on this path. Both sides are wrapped in the ', '
    -- delimiter so that a short Id (e.g. 'A') is not mistaken for part of a
    -- longer one (e.g. 'AB'), which a bare LIKE '%' + Id + '%' would do.
    WHERE CHARINDEX(', ' + rel.Id + ', ', ', ' + RecObjects.Way + ', ') = 0
)
SELECT DISTINCT
       Id,
       NextId,
       [Level]
FROM Objects
ORDER BY [Level];

DROP TABLE #ObjectRelations;
Could be useful for somebody. It is for me =P
Thanks
If you want to use all root objects as starting objects, you should first update your data to include information about the root objects (and the leaves). You should add the following inserts:
-- Sentinel edges: a NULL Id marks a root (G, A), a NULL NextId marks a leaf (X, F).
INSERT INTO #ObjectRelations (Id, NextId) VALUES (NULL, 'G');
INSERT INTO #ObjectRelations (Id, NextId) VALUES (NULL, 'A');
INSERT INTO #ObjectRelations (Id, NextId) VALUES ('X', NULL);
INSERT INTO #ObjectRelations (Id, NextId) VALUES ('F', NULL);
Of course you could also write your anchor query in such a way that you select as root nodes the records that have an Id that does not occur as a NextId, but this is easier.
Next, modify your anchor query to look like this:
-- Anchor member: start from the rows whose Id is NULL.
SELECT
    rel.Id,
    rel.NextId
FROM #ObjectRelations AS rel
WHERE rel.Id IS NULL
If you run this query, you'll see that you get a lot of duplicates, a lot of arcs occur multiple times. This is because you now have two results from your anchor query and therefore the tree is traversed two times.
This can be fixed by changing your select statement to this (note the DISTINCT):
-- DISTINCT collapses arcs that were reached from more than one anchor row.
SELECT DISTINCT
    o.*
FROM Objects AS o
If you dont want to do the inserts suggested by Ronald,this would do!.
-- Starts from every root (an Id that never occurs as a NextId) and follows the
-- Id -> NextId edges, so no sentinel NULL rows are needed.
-- Uses the table's real column names (Id, NextId) and NOT EXISTS, which - unlike
-- NOT IN - still works if some NextId is NULL.
-- NOTE(review): there is no cycle protection; a cyclic graph would hit the
-- recursion limit.
;WITH CTE_MultiParent (Id, NextId)
AS
(
    -- anchor: edges leaving a root
    SELECT rel.Id, rel.NextId
    FROM #ObjectRelations AS rel
    WHERE NOT EXISTS
    (
        SELECT 1
        FROM #ObjectRelations AS pred
        WHERE pred.NextId = rel.Id
    )
    UNION ALL
    -- step: edges leaving a node that has already been reached
    SELECT rel.Id, rel.NextId
    FROM #ObjectRelations AS rel
    INNER JOIN CTE_MultiParent AS walked
        ON walked.NextId = rel.Id
)
SELECT DISTINCT Id, NextId
FROM CTE_MultiParent;

Find lowest common parent in recursive SQL table

Suppose I have a recursive table (e.g. employees with managers) and a list of size 0..n of ids. How can I find the lowest common parent for these ids?
For example, if my table looks like this:
Id | ParentId
---|---------
1 | NULL
2 | 1
3 | 1
4 | 2
5 | 2
6 | 3
7 | 3
8 | 7
Then the following sets of ids lead to the following results (the first one is a corner case):
[] => 1 (or NULL, doesn't really matter)
[1] => 1
[2] => 2
[1,8] => 1
[4,5] => 2
[4,6] => 1
[6,7,8] => 3
How to do this?
EDIT: Note that parent isn't the correct term in all cases. It's the lowest common node in all paths up the tree. The lowest common node can also be a node itself (for example in the case [1,8] => 1, node 1 is not a parent of node 1 but node 1 itself).
Kind regards,
Ronald
Here's one way of doing it; it uses a recursive CTE to find the ancestry of a node, and uses "CROSS APPLY" over the input values to get the common ancestry; you just change the values in #ids (table variable):
----------------------------------------- SETUP
-- Parent/child table for the example: ParentId is NULL for the root (Id 1).
CREATE TABLE MyData
(
    Id       int NOT NULL,
    ParentId int NULL
);

INSERT INTO MyData (Id, ParentId) VALUES (1, NULL);
INSERT INTO MyData (Id, ParentId) VALUES (2, 1);
INSERT INTO MyData (Id, ParentId) VALUES (3, 1);
INSERT INTO MyData (Id, ParentId) VALUES (4, 2);
INSERT INTO MyData (Id, ParentId) VALUES (5, 2);
INSERT INTO MyData (Id, ParentId) VALUES (6, 3);
INSERT INTO MyData (Id, ParentId) VALUES (7, 3);
INSERT INTO MyData (Id, ParentId) VALUES (8, 7);
GO
-- AncestorsUdf(@Id): inline table-valued function returning the Id of the given
-- node and of every node above it (parent, grandparent, ... up to the root).
CREATE FUNCTION AncestorsUdf (@Id int)
RETURNS TABLE
AS
RETURN (
    WITH Ancestors (Id, ParentId)
    AS (
        -- anchor: the node itself
        SELECT Id, ParentId
        FROM MyData
        WHERE Id = @Id
        UNION ALL
        -- step: the parent of the row found last
        SELECT md.Id, md.ParentId
        FROM MyData AS md
        INNER JOIN Ancestors AS a
            ON md.Id = a.ParentId
    )
    SELECT Id FROM Ancestors
);
GO
----------------------------------------- ACTUAL QUERY
-- Input: the set of node ids whose lowest common ancestor is wanted.
DECLARE @ids TABLE (Id int NOT NULL);
DECLARE @Count int;
-- your data (perhaps via a "split" udf)
INSERT @ids (Id) VALUES (6);
INSERT @ids (Id) VALUES (7);
INSERT @ids (Id) VALUES (8);
SELECT @Count = COUNT(1) FROM @ids;

-- An ancestor is common when it appears in the ancestry of every input id.
-- TOP 1 ... ORDER BY a.Id DESC picks the common ancestor with the highest Id,
-- which is the lowest one only if Ids grow with depth.
SELECT TOP 1 a.Id
FROM @ids AS i
CROSS APPLY AncestorsUdf(i.Id) AS a
GROUP BY a.Id
HAVING COUNT(1) = @Count
ORDER BY a.Id DESC;
Update if the nodes aren't strictly ascending:
-- AncestorsUdf(@Id), level-aware version: returns the node, every node above
-- it, and each one's level counted from the top-most ancestor reached (0).
-- It reuses the name of the earlier inline version, so that one is dropped
-- first; otherwise CREATE FUNCTION fails because the object already exists.
IF OBJECT_ID(N'AncestorsUdf') IS NOT NULL
    DROP FUNCTION AncestorsUdf;
GO
CREATE FUNCTION AncestorsUdf (@Id int)
RETURNS @result TABLE (Id int, [Level] int)
AS
BEGIN
    WITH Ancestors (Id, ParentId, RelLevel)
    AS (
        -- anchor: the node itself at relative level 0
        SELECT Id, ParentId, 0
        FROM MyData
        WHERE Id = @Id
        UNION ALL
        -- step: each parent sits one relative level higher (more negative)
        SELECT md.Id, md.ParentId, a.RelLevel - 1
        FROM MyData AS md
        INNER JOIN Ancestors AS a
            ON md.Id = a.ParentId
    )
    INSERT @result (Id, [Level])
    SELECT Id, RelLevel FROM Ancestors;

    -- Shift all levels so the smallest (top-most) becomes 0.
    -- The UPDATE intentionally touches every row of @result.
    DECLARE @Min int;
    SELECT @Min = MIN([Level]) FROM @result;
    UPDATE @result SET [Level] = [Level] - @Min;
    RETURN;
END
GO
and
-- Same idea as the Id-ordered query, but the lowest common ancestor is chosen
-- by its level rather than by its Id, so Ids need not grow with depth.
SELECT TOP 1 a.Id
FROM @ids AS i
CROSS APPLY AncestorsUdf(i.Id) AS a
GROUP BY a.Id, a.[Level]
HAVING COUNT(1) = @Count
ORDER BY a.[Level] DESC;
After doing some thinking and some hints in the right direction from Marc's answer (thanks), I came up with another solution myself:
-- Self-contained variant: finds the nodes that lie on the upward path of every
-- id in @ids, ordered from the lowest common node up to the root.
DECLARE @parentChild TABLE (Id INT NOT NULL, ParentId INT NULL);
INSERT INTO @parentChild (Id, ParentId) VALUES (1, NULL);
INSERT INTO @parentChild (Id, ParentId) VALUES (2, 1);
INSERT INTO @parentChild (Id, ParentId) VALUES (3, 1);
INSERT INTO @parentChild (Id, ParentId) VALUES (4, 2);
INSERT INTO @parentChild (Id, ParentId) VALUES (5, 2);
INSERT INTO @parentChild (Id, ParentId) VALUES (6, 3);
INSERT INTO @parentChild (Id, ParentId) VALUES (7, 3);
INSERT INTO @parentChild (Id, ParentId) VALUES (8, 7);

DECLARE @ids TABLE (Id INT NOT NULL);
INSERT INTO @ids (Id) VALUES (6);
INSERT INTO @ids (Id) VALUES (7);
INSERT INTO @ids (Id) VALUES (8);

DECLARE @count INT;
SELECT @count = COUNT(1) FROM @ids;

WITH Nodes (Id, ParentId, Depth) AS
(
    -- Start from every node in the @ids collection.
    SELECT pc.Id, pc.ParentId, 0 AS Depth
    FROM @parentChild AS pc
    INNER JOIN @ids AS i
        ON pc.Id = i.Id
    UNION ALL
    -- Recursively find parent nodes for each starting node.
    SELECT pc.Id, pc.ParentId, n.Depth - 1
    FROM @parentChild AS pc
    INNER JOIN Nodes AS n
        ON pc.Id = n.ParentId
)
-- A node reached from all starting nodes occurs @count times. Depth is
-- relative to each start (0, -1, -2, ...), so ordering by the smallest depth
-- descending lists the common nodes nearest the starting nodes first.
SELECT n.Id
FROM Nodes AS n
GROUP BY n.Id
HAVING COUNT(n.Id) = @count
ORDER BY MIN(n.Depth) DESC;
It now returns the entire path from the lowest common parent to the root node but that is a matter of adding a TOP 1 to the select.