Is the following query possible with SQL Pivot? - sql

Let's say I have the following tables:
create table student(
id number not null,
name varchar2(80),
primary key(id)
);
create table class(
id number not null,
subject varchar2(80),
primary key(id)
);
create table class_meeting(
id number not null,
class_id number not null,
meeting_sequence number,
primary key(id),
foreign key(class_id) references class(id)
);
create table meeting_attendance(
id number not null,
student_id number not null,
meeting_id number not null,
present number not null,
primary key(id),
foreign key(student_id) references student(id),
foreign key(meeting_id) references class_meeting(id),
constraint meeting_attendance_uq unique(student_id, meeting_id),
constraint present_ck check(present in(0,1))
);
I want a query for each class, which has a column for the student name, one column for every class_meeting for this class and for every class meeting the cells would show the present attribute, which should be 1 if the student was present at that meeting and 0 if the student was absent in that meeting. Here is a picture from excel for reference:
Is it possible to make an apex report like that?
From googling I figured I must use Pivot, however I'm having a hard time understanding how it could be used here. Here is the query I have so far:
select * from(
select s.name, m.present
from student s, meeting_attendance m
where s.id = m.student_id
)
pivot(
present
for class_meeting in ( select a.meeting_sequence
from class_meeting a, class b
where b.id = a.class_id )
)
However I'm sure it's way off. Is it even possible to do this with one query, or should I use pl sql htp and htf packages to create an html table?
Pretty inexperienced oracle developer here, so any help is very appreciated.

It took a while to answer, but I had to write this all up and test it!
Data I've worked with:
begin
insert into student(id, name) values (1, 'Tom');
insert into student(id, name) values (2, 'Odysseas');
insert into class(id, subject) values (1, 'Programming');
insert into class(id, subject) values (2, 'Databases');
insert into class_meeting (id, class_id, meeting_sequence) values (1, 1, 10);
insert into class_meeting (id, class_id, meeting_sequence) values (2, 1, 20);
insert into class_meeting (id, class_id, meeting_sequence) values (3, 2, 10);
insert into class_meeting (id, class_id, meeting_sequence) values (4, 2, 20);
insert into meeting_attendance (id, student_id, meeting_id, present) values (1, 1, 1, 1); -- Tom was at meeting 10 about programming
insert into meeting_attendance (id, student_id, meeting_id, present) values (2, 1, 2, 1); -- Tom was at meeting 20 about programming
insert into meeting_attendance (id, student_id, meeting_id, present) values (3, 1, 3, 0); -- Tom was NOT at meeting 10 about databases
insert into meeting_attendance (id, student_id, meeting_id, present) values (4, 1, 4, 0); -- Tom was NOT at meeting 20 about databases
insert into meeting_attendance (id, student_id, meeting_id, present) values (5, 2, 1, 0); -- Odysseas was NOT at meeting 10 about programming
insert into meeting_attendance (id, student_id, meeting_id, present) values (6, 2, 2, 1); -- Odysseas was at meeting 20 about programming
insert into meeting_attendance (id, student_id, meeting_id, present) values (7, 2, 3, 0); -- Odysseas was NOT at meeting 10 about databases
insert into meeting_attendance (id, student_id, meeting_id, present) values (8, 2, 4, 1); -- Odysseas was at meeting 20 about databases
end;
PIVOT , as it stands right now, does not allow a dynamic number of columns in a simple way. It only allows this with the XML keyword, resulting in an xmltype column.
Here are some excellent docs. http://www.oracle-base.com/articles/11g/pivot-and-unpivot-operators-11gr1.php
It always pays off to read those first.
How to, then?
You'll literally find tons of questions about the same thing once you start searching.
Dynamic SQL
https://asktom.oracle.com/pls/asktom/f?p=100:11:0::::P11_QUESTION_ID:4471013000346257238
Dynamically pivoting a table Oracle
Dynamic Oracle Pivot_In_Clause
A classic report can take a function body returning a sql statement as return. An interactive report can not. As it stands, an IR is out of the question as it is too metadata dependent.
For example, with these queries/plsql in a classic report region source:
static pivot
-- Static pivot: one row per student, one column per hard-coded meeting label.
-- Each label is "<meeting_sequence>-<subject>"; the aliases become the column headings.
select *
from (
    select
        st.name as student_name,
        att.present present,
        mtg.meeting_sequence||'-'|| cls.subject meeting
    from student st
    join meeting_attendance att
        on st.id = att.student_id
    join class_meeting mtg
        on mtg.id = att.meeting_id
    join class cls
        on cls.id = mtg.class_id
)
pivot (
    max(present)
    for meeting in (
        '10-Databases'   as "10-DB",
        '20-Databases'   as "20-DB",
        '10-Programming' as "10-PRM",
        '20-Programming' as "20-PRM"
    )
);
-- Results
STUDENT_NAME 10-DB 20-DB 10-PRM 20-PRM
Tom 0 0 1 1
Odysseas 0 1 0 1
function body returning statement
-- Function body returning the SQL statement for a classic report region.
-- The pivot IN list is built from the meetings that exist at run time, so new
-- meetings show up as new pivot columns without editing the query text.
DECLARE
  l_pivot_cols VARCHAR2(4000);
  -- Sized to the PL/SQL maximum: the fixed query text plus a long IN list
  -- does not fit in 4000 bytes.
  l_pivot_qry  VARCHAR2(32767);
BEGIN
  -- Produces a comma-separated list of quoted literals, e.g. '10-Databases','20-Databases'.
  -- Single quotes inside a subject are doubled so the value stays inside its
  -- literal and cannot break (or inject into) the generated statement.
  -- NOTE(review): "order by 1" sorts by the constant 1, so the column order is
  -- presumably not guaranteed; any headings function must use the same ordering.
  SELECT ''''
         || listagg(replace(cm.meeting_sequence||'-'||c.subject, '''', ''''''), ''',''')
              within group(order by 1)
         || ''''
    INTO l_pivot_cols
    FROM class_meeting cm
    JOIN "CLASS" c
      ON c.id = cm.class_id;

  l_pivot_qry :=
       'select * from ( '
    || 'select s.name as student_name, m.present present, cm.meeting_sequence||''-''||c.subject meeting '
    || 'from student s '
    || 'join meeting_attendance m '
    || 'on s.id = m.student_id '
    || 'join class_meeting cm '
    || 'on cm.id = m.meeting_id '
    || 'join class c '
    || 'on c.id = cm.class_id '
    || ') '
    || 'pivot ( max(present) for meeting in ('||l_pivot_cols||') )' ;

  RETURN l_pivot_qry;
END;
Take note however of the settings in the region source.
Use Query-Specific Column Names and Validate Query
This is the standard setting. It will parse your query and then store the columns found in the query in the report metadata. If you go ahead and create a report with the above plsql code, you can see that apex has parsed the query and has assigned the correct columns. What is wrong with this approach is that the metadata is static. The report's metadata is not refreshed every time the report is run.
This can be proven quite simply by adding another class to the data.
begin
insert into class(id, subject) values (3, 'Watch YouTube');
insert into class_meeting (id, class_id, meeting_sequence) values (5, 3, 10);
insert into meeting_attendance (id, student_id, meeting_id, present) values (10, 1, 5, 1); -- Tom was at meeting 10 about watching youtube
end;
Run the page without editing the report! Editing and saving will regenerate the metadata, which is clearly not a viable method. The data will change anyway, and you cannot go in and save the report metadata every time.
--cleanup
begin
delete from class where id = 3;
delete from class_meeting where id = 5;
delete from meeting_attendance where id = 10;
end;
Use Generic Column Names (parse query at runtime only)
Setting the source to this type will allow you to use a more dynamic approach. By changing the settings of the report to this type of parsing, apex will just generate an amount of columns in its metadata without being directly associated with the actual query. There'll just be columns with 'COL1', 'COL2', 'COL3',...
Run the report. Works fine. Now insert some data again.
begin
insert into class(id, subject) values (3, 'Watch YouTube');
insert into class_meeting (id, class_id, meeting_sequence) values (5, 3, 10);
insert into meeting_attendance (id, student_id, meeting_id, present) values (10, 1, 5, 1); -- Tom was at meeting 10 about watching youtube
end;
Run the report. Works fine.
However, the kink here are the column names. They're not really all that dynamic, with their ugly names. You can edit the columns, surely, but they're not dynamic. There is no class being displayed or anything, nor can you reliably set their headers to one. Again this makes sense: the metadata is there, but it is static. It could work for you if you're happy with this approach.
You can however deal with this. In the "Report Attributes" of the report, you can select a "Headings Type". They're all static, except for "PL/SQL" of course! Here you can write a function body (or just call a function) which'll return the column headers!
-- Function body for the "PL/SQL" headings type: returns one colon-separated
-- string with a "<meeting_sequence>-<subject>" label for every class meeting.
-- NOTE(review): "order by 1" sorts by the constant 1, so the label order is
-- presumably not guaranteed; it has to match the column order of the pivot query.
-- NOTE(review): no heading is emitted for the student name column - confirm
-- whether the report expects one as the first entry.
DECLARE
-- Holds the concatenated headings; limited to 400 characters.
l_return VARCHAR2(400);
BEGIN
SELECT listagg(cm.meeting_sequence||'-'||c.subject, ':') within group(order by 1)
INTO l_return
FROM class_meeting cm
JOIN "CLASS" c
ON c.id = cm.class_id;
RETURN l_return;
END;
Third party solution
https://asktom.oracle.com/pls/apex/f?p=100:11:0::::P11_QUESTION_ID:4843682300346852395#5394721000346803830
https://stackoverflow.com/a/16702401/814048
http://technology.amis.nl/2006/05/24/dynamic-sql-pivoting-stealing-antons-thunder/
In APEX: though the dynamic pivot is more straightforward after installing, the setup in apex remains the same as if you'd want to use dynamic SQL. Use a classic report with generic column names.
I'm not going to go into much detail here. I don't have this package installed atm. It's nice to have, but in this scenario it may not be that helpful. It purely allows you to write a dynamic pivot in a more concise way, but doesn't help much on the apex side of things. As I've demonstrated above, the dynamic columns and the static metadata of the apex reports are the limiting factor here.
Use XML
I myself have opted to use the XML keyword before. I use pivot to make sure I have values for all rows and columns, then read it out again with XMLTABLE, and then creating one XMLTYPE column, serializing it to a CLOB.
This may be a bit advanced, but it's a technique I've used a couple of times so far, with good results. It's fast, provided the base data is not too big, and it's just one sql call, so not a lot of context switches. I've used it with CUBE'd data as well, and it works great.
(note: the classes I've added on the elements correspond with classes used on classic reports in theme 1, simple red)
-- Builds the complete attendance matrix as an HTML table in one SQL statement
-- and prints it with htp.prn (intended for a PL/SQL region).
DECLARE
  l_return CLOB;
BEGIN
  -- Subqueries:
  --   SRC
  --     source data query
  --   SRC_PIVOT
  --     pivoted source data with XML clause to allow variable columns.
  --     Mainly used for convenience because pivot fills in 'gaps' in the data.
  --     an example would be that 'Odysseas' does not have a relevant record for the 'Watch Youtube' class
  --   PIVOT_HTML
  --     Pulls the data from the pivot xml into columns again, and collates the data
  --     together with xmlelements.
  --   HTML_HEADERS
  --     Creates a row with just header elements based on the source data
  --   HTML_SRC
  --     Creates row elements with the student name and the collated data from pivot_html
  -- Finally:
  --   serializes the xmltype column for easier-on-the-eye markup
  WITH src AS (
    SELECT s.name as student_name, m.present present, cm.meeting_sequence||'-'||c.subject meeting
      FROM student s
      JOIN meeting_attendance m
        ON s.id = m.student_id
      JOIN class_meeting cm
        ON cm.id = m.meeting_id
      JOIN class c
        ON c.id = cm.class_id
  ),
  src_pivot AS (
    SELECT student_name, meeting_xml
      FROM src pivot xml(MAX(NVL(present, 0)) AS is_present_max for (meeting) IN (SELECT distinct meeting FROM src) )
  ),
  pivot_html AS (
    SELECT student_name
         , xmlagg(
             xmlelement("td", xmlattributes('data' as "class"), is_present_max)
             ORDER BY meeting
           ) is_present_html
      FROM src_pivot
         -- Each PivotSet/item holds <column name="..."> children; the value is
         -- selected by the "name" attribute, hence the @name predicate.
         , xmltable('PivotSet/item'
             passing meeting_xml
             COLUMNS "MEETING" VARCHAR2(400) PATH 'column[@name="MEETING"]'
                   , "IS_PRESENT_MAX" NUMBER PATH 'column[@name="IS_PRESENT_MAX"]')
     GROUP BY (student_name)
  ),
  html_headers AS (
    SELECT xmlelement("tr",
             xmlelement("th", xmlattributes('header' as "class"), 'Student Name')
           , xmlagg(xmlelement("th", xmlattributes('header' as "class"), meeting) order by meeting)
           ) headers
      FROM (SELECT DISTINCT meeting FROM src)
  ),
  html_src as (
    SELECT
           xmlagg(
             xmlelement("tr",
               xmlelement("td", xmlattributes('data' as "class"), student_name)
             , ah.is_present_html
             )
           ) data
      FROM pivot_html ah
  )
  SELECT
         xmlserialize( content
           xmlelement("table"
           , xmlattributes('report-standard' as "class", '0' as "cellpadding", '0' as "cellspacing", '0' as "border")
           , xmlelement("thead", headers )
           , xmlelement("tbody", data )
           )
           AS CLOB INDENT SIZE = 2
         )
    INTO l_return
    FROM html_headers, html_src ;

  -- NOTE(review): htp.prn is given a CLOB here; presumably this only works while
  -- the markup fits in a VARCHAR2 - confirm, or print in chunks for large reports.
  htp.prn(l_return);
END;
In APEX: well, since the HTML has been constructed, this can only be a PLSQL region which calls the package function and prints it using HTP.PRN.
(edit) There's also this post on the OTN forum which does the same in a large part, but does not generate headings etc, rather using the apex functionalities:
OTN: Matrix report
PLSQL
Alternatively, you can just opt to go the good ol' plsql route. You could take the body from the dynamic sql above, loop over it, and put out a table structure by using htp.prn calls. Put out headers, and put out whatever else you want. For good effect, add classes on the elements which correspond with the theme you're using.

Disclaimer: I don't know apex specifically.
Here's a correct pivot query, assuming the class you want has an ID = 1, and that the meeting_id's for that class are 1,2,3.
-- Attendance matrix for class 1: one row per student name, one column per
-- listed meeting id, each cell holding the summed "present" flag.
select *
from (
    select s.name, a.present, m.id meeting_id
    from student s
    join meeting_attendance a
        on a.student_id = s.id
    join class_meeting m
        on m.id = a.meeting_id
    join class c
        on c.id = m.class_id
    where c.id = 1
)
pivot (
    sum(present)
    for meeting_id in (1, 2, 3)
);
I don't believe you can use a sub-query to return the values for the "for in" of the pivot.

Related

How to I get distinct combinations of one XRef column related to any value in the other XRef column

I need to select the count of unique value combinations of column B in an XRef table which is grouped by column A.
Consider the following schema and data, which represents a simple family structure. Each child has a father and mother:
TABLE Father
FatherID
Name
1
Alex
2
Bob
TABLE Mother
MotherID
Name
1
Alice
2
Barbara
TABLE Child
ChildID
FatherID
MotherID
Name
1
1 (Alex)
1 (Alice)
Adam
2
1 (Alex)
1 (Alice)
Billy
3
1 (Alex)
2 (Barbara)
Celine
4
2 (Bob)
2 (Barbara)
Derek
The distinct combinations of mothers for each father are:
Alex (Alice, Barbara)
Bob (Barbara)
In all there are two distinct combinations of mothers:
Alice, Barbara
Barbara
The query I want to write would return the count of those distinct combinations of mother, regardless of which father they are associated with:
UniqueMotherGroups
2
I was able to do this successfully using the STRING_AGG function, but it feels clunky. It also needs to operate over millions of rows and is quite slow at the moment. Is there a more idiomatic way to do this with set operations instead?
Here is my working example:
-- Drop pre-existing tables
DROP TABLE IF EXISTS dbo.Child;
DROP TABLE IF EXISTS dbo.Father;
DROP TABLE IF EXISTS dbo.Mother;
-- Create family tables.
CREATE TABLE dbo.Father
(
FatherID INT NOT NULL
, Name VARCHAR(50) NOT NULL
);
ALTER TABLE dbo.Father
ADD CONSTRAINT PK_Father
PRIMARY KEY CLUSTERED (FatherID);
ALTER TABLE dbo.Father SET (LOCK_ESCALATION = TABLE);
CREATE TABLE dbo.Mother
(
MotherID INT NOT NULL
, Name VARCHAR(50) NOT NULL
);
ALTER TABLE dbo.Mother
ADD CONSTRAINT PK_Mother
PRIMARY KEY CLUSTERED (MotherID);
ALTER TABLE dbo.Mother SET (LOCK_ESCALATION = TABLE);
CREATE TABLE dbo.Child
(
ChildID INT NOT NULL
, FatherID INT NOT NULL
, MotherID INT NOT NULL
, Name VARCHAR(50) NOT NULL
);
ALTER TABLE dbo.Child
ADD CONSTRAINT PK_Child
PRIMARY KEY CLUSTERED (ChildID);
CREATE NONCLUSTERED INDEX IX_Parents ON dbo.Child (FatherID, MotherID);
ALTER TABLE dbo.Child
ADD CONSTRAINT FK_Child_Father
FOREIGN KEY (FatherID)
REFERENCES dbo.Father (FatherID);
ALTER TABLE dbo.Child
ADD CONSTRAINT FK_Child_Mother
FOREIGN KEY (MotherID)
REFERENCES dbo.Mother (MotherID);
-- Insert two children with the same parents
INSERT INTO dbo.Father
(
FatherID
, Name
)
VALUES
(1, 'Alex')
, (2, 'Bob')
, (3, 'Charlie')
INSERT INTO dbo.Mother
(
MotherID
, Name
)
VALUES
(1, 'Alice')
, (2, 'Barbara');
INSERT INTO dbo.Child
(
ChildID
, FatherID
, MotherID
, Name
)
VALUES
(1, 1, 1, 'Adam')
, (2, 1, 1, 'Billy')
, (3, 1, 2, 'Celine')
, (4, 2, 2, 'Derek')
, (5, 3, 1, 'Eric');
-- CTE Gets distinct combinations of parents
WITH distinctParentCombinations (FatherID, MotherID)
AS (SELECT children.FatherID
, children.MotherID
FROM dbo.Child as children
GROUP BY children.FatherID
, children.MotherID
)
-- CTE Gets uses STRING_AGG to get unique combinations of mothers.
, motherGroups (Mothers)
AS (SELECT STRING_AGG(CONVERT(VARCHAR(MAX), distinctParentCombinations.MotherID), '-') WITHIN GROUP (ORDER BY distinctParentCombinations.MotherID) AS Mothers
FROM distinctParentCombinations
GROUP BY distinctParentCombinations.FatherID
)
-- Remove the COUNT function to see the actual combinations
SELECT COUNT(motherGroups.Mothers) AS UniqueMotherGroups
FROM motherGroups
-- Clean up the example
DROP TABLE IF EXISTS dbo.Child;
DROP TABLE IF EXISTS dbo.Father;
DROP TABLE IF EXISTS dbo.Mother;
You have a great explanation and setup of your "problem case".
Your setup runs great in (for example) tempdb.
You have solved the problem in a nice way, and I don't think you can optimize it much further if you are going to calculate the mother groups every time you run the query.
There is one small mistake though; You must do a COUNT(DISTINCT motherGroups.Mothers) in your final count.
Since you mention millions of rows, I would suggest a slightly different approach.
If you aggregate the mother groups as soon as there is a change in the Child table, your query can run fast every time - even with millions of rows.
The kind of queries you want to run is seldom run only once, so it would be nice if the heavy work is already done.
Usually I prefer not to use triggers, because you get extra logic in a place where it could be hard to find and debug.
But sometimes triggers are nice to have, especially when you are not able to change the source code running on the clients.
So, my solution is to add a new column to the Father table and to create a trigger which (re)generates the mother group each time there is a change in the Child table.
This way, the hard aggregation work for each father is done as soon there is a change, and you don't have to aggregate when you run your query.
Since you already have millions of rows, we also have to update these existing rows.
I have used SQL Server 2019 for this solution.
*** The solution ***
Add 1 or 2 new columns to the Father table.
If you should add 1 or 2, it depends on what your preferences are:
"Do I want to see the aggregated mother groups for debugging purpose, or do I just trust the hashed values?"
Column 1: Hashed value of the aggregated mother group for each Father row.
The hashed value is VARBINARY and is at least 32 bytes, but we will use VARBINARY(1600):
1600 is less than 1700 which is the max nonclustered index size, so we will not have any problems indexing the column.
Since the hash value is in blocks of 32 bytes, a value of 1600 will cover a really, really, really long aggregated mother group.
-- Column 1: Hashed value of the aggregated mother group for each Father row.
alter table Father add MotherHash varbinary(1600)
create index IX_MotherHash on Father(MotherHash)
Column 2: This column is more optional, and depends on your preferences.
The column could be nice to have for debugging purpose if any questions are made about the result.
Which VARCHAR-length you should use depends on your real data.
MAX? Then you have no problems storing the mother groups, but you might have problems indexing it, since 1700 is the max for an unclustered index. But maybe you don't need to index it?
1700? Then you are able to index the column, but depending on your real data, will this cover the biggest mother group?
Why indexing? If you want to list the aggregated mother groups, it could be faster to read the index than the whole table.
As said; this depends on you (and your data). If we have no need to see the aggregated mother groups, then we don't need this column at all.
For this demo/solution we will add the column for debugging purpose, without any indexing.
-- Column 2: This column is more optional, and depends on your preferences.
alter table Father add MotherGroup varchar(MAX)
go
Create a trigger on the Child table.
It will handle all inserts, updates and deletes in the Child table.
-- Keeps Father.MotherGroup / Father.MotherHash in sync with the Child table.
-- Fires after every insert, update and delete on Child and recalculates the
-- mother group of each father referenced by the affected rows.
create or alter trigger trIUD_Child on Child
after insert, update, delete
as
begin
    set nocount on

    -- Get all FatherIDs from the Inserted and Deleted table.
    -- An ordinary Temp table is created with a clustered index to get SEEK performance later.
    -- The table might also have more than 100 rows, where table variables are not recommended.
    declare @numRowsInInsertedDeleted int

    create table #rowsInInsertedDeleted(rowId int identity(1, 1), FatherID int)
    create unique clustered index ix on #rowsInInsertedDeleted(rowId)

    insert #rowsInInsertedDeleted(FatherID)
    select distinct f.FatherID
    from
    (
        select i.FatherID from inserted i
        union all
        select i.FatherID from deleted i
    ) f

    -- NULL when no rows were affected, in which case the loop below is skipped.
    select @numRowsInInsertedDeleted = max(rowId) from #rowsInInsertedDeleted

    -- We have to loop each of the FatherIDs, since we might have several rows in the Inserted and Deleted tables.
    declare @rowId int = 0
    while (@rowId < @numRowsInInsertedDeleted)
    begin
        -- Get the father for the next row.
        select @rowId += 1

        declare @fatherId int
        select @fatherId = r.FatherID
        from #rowsInInsertedDeleted r
        where r.rowId = @rowId

        -- Aggregate the distinct mothers for this father as ',<id>,<id>,...' in MotherID order.
        -- STRING_AGG ... WITHIN GROUP gives a guaranteed order, unlike concatenating
        -- into a variable inside a SELECT with ORDER BY.
        -- A father without children ends up with the empty string.
        declare @motherGroup varchar(max) = ''
        select @motherGroup = isnull(
                   ',' + string_agg(cast(m.MotherID as varchar(max)), ',')
                             within group (order by m.MotherID)
                 , '')
        from
        (
            select distinct c.MotherID
            from Child c
            where c.FatherID = @fatherId
        ) m

        -- Update the father record.
        -- Any empty strings are handled automatically, skip the leading ','.
        update Father
        set MotherGroup = substring(@motherGroup, 2, 2147483647),
            MotherHash = HASHBYTES('SHA2_256', @motherGroup)
        where FatherID = @fatherId
    end
end
go
Updating existing rows
Since you already have millions of rows, we must aggregate the mother groups for these existing rows.
If you don't have the disk space for logging the update of the whole table, maybe you should take your database out of AG and switch to Simple recovery model for this task?
In that case you should also modify the update with a WHERE clause to update only parts of the table, and run the update for each part until the whole table is updated.
Example: update Child set FatherID = FatherID where FatherID between 1 and 1000000
Note: This update statement could block access to the Child table for other users.
-- Aggregate the mother groups for the existing rows.
-- This could takes minutes to complete, depending on the number of rows.
-- NOTE: This update statement could block access to the Child table for other users.
update Child set FatherID = FatherID
That's it!
You should now be able to quickly get the mother groups on existing rows, and also after future changes in the Child table.
-- Voila - now you can get the unique mother groups any time at a fast speed.
select count(distinct MotherHash) from Father
Thank you for posting such a comprehensive setup for the test data. However, I'm not running any CREATE/DROP statements against my DB so I converted those tables into table variables. Using your data, I came up with the following query. Just change the table names back to your dbo. names and you should be able to test in your environment. I basically concatenate every father/mother combo into a text string using FOR XML PATH. Then I count up all the distinct combos. If you find error in my logic, let me know. I'm just tossing this in the ring of possible solutions.
-- Counts the distinct mother combinations across fathers.
-- @Child and @Father are table variables holding the sample data.
WITH distinctCombos AS (
    -- One row per (father, mother) pair, regardless of the number of children.
    SELECT DISTINCT
        c.FatherID, c.MotherID
    FROM @Child as c
) , motherComboCount AS (
    -- Per father: comma-separated list of his mothers in MotherID order.
    -- STUFF removes the leading comma; a father without children gets NULL.
    SELECT
        f.FatherID
        , f.[Name]
        , STUFF((
            SELECT
                ',' + CAST(dc.MotherID as nvarchar)
            FROM distinctCombos as dc
            WHERE dc.FatherID = f.FatherID
            ORDER BY dc.MotherID ASC
            FOR XML PATH('')
        ),1,1,'') as motherList
    FROM @Father as f
)
-- COUNT(DISTINCT ...) ignores the NULL lists of fathers without children.
SELECT
    COUNT(DISTINCT motherList) as UniqueMotherGroups
FROM motherComboCount as mcc
To save a bit of compute power, remove the STUFF function as it's not necessary for the comparison... it just makes the list nicer to look at if displaying... and I'm in the habit of using it.
It looks like the main differences between our methods is the use of FOR XML PATH vs STRING_AGG (I'm still on older SQL.) And I use DISTINCT twice instead of GROUP BY. If you have a larger dataset to test against, let me know how the 2 methods compare. I'm trying to think of a completely set-based method but I can't see it at the moment.
Update: Method 2.
Here's an idea I had using recursive CTEs to build the distinct mother combinations. In your example data, there are only 2 mothers per father. So there would be a total of 4 set-based queries performed (first CTE, 2 queries in the recursive CTE and the final SELECT).
-- Counts distinct mother combinations by building each father's mother list
-- step by step with a recursive CTE. @Child is a table variable with the sample data.
WITH uniqueCombo as (
    -- One row per (father, mother) pair, numbered per father in MotherID order.
    -- DENSE_RANK gives duplicate pairs the same number so DISTINCT can collapse
    -- them; ROW_NUMBER would number every child row and keep the duplicates.
    SELECT DISTINCT
        c.FatherID
        , c.MotherID
        , DENSE_RANK() OVER(PARTITION BY c.FatherID ORDER BY c.MotherID) as row_num
    FROM @Child as c
), combos as (
    -- Anchor: the first mother of every father starts the path.
    SELECT
        uc.FatherID
        , uc.MotherID
        , CAST(uc.MotherID as nvarchar(max)) as [path]
        , uc.row_num
        , 0 as hierarchy_num
    FROM uniqueCombo as uc
    WHERE uc.row_num = 1
    UNION ALL
    -- Recursive step: append the next mother of the same father to the path.
    -- NOTE(review): the default recursion limit is 100 levels; a father with more
    -- mothers than that would need OPTION (MAXRECURSION ...) on the final SELECT.
    SELECT
        uc.FatherID
        , uc.MotherID
        , co.[path] + ',' + CAST(uc.MotherID as nvarchar(max))
        , uc.row_num
        , co.hierarchy_num + 1 as hierarchy_num
    FROM uniqueCombo as uc
    INNER JOIN combos as co
        ON co.FatherID = uc.FatherID
        AND co.row_num + 1 = uc.row_num
), rankedCombos as (
    -- The deepest path per father is his complete mother list.
    SELECT
        c.[path]
        , ROW_NUMBER() OVER(PARTITION BY c.FatherID ORDER BY c.hierarchy_num DESC) as row_num
    FROM combos as c
)
SELECT COUNT(DISTINCT rc.[path]) as UniqueMotherGroups
FROM rankedCombos as rc
WHERE rc.row_num = 1
Update 2:
I had another idea to use a PIVOT to transpose the records so that the FatherID would be in the left-most column with the MotherIDs as the column headers. To make that work with a dynamic list of MotherIDs, you have to use a dynamic PIVOT/dynamic SQL. (FatherID isn't really needed in the PIVOT so it's not included in the PIVOT query. I just had to describe what the goal is.) After the pivot, you can SELECT DISTINCT to get the unique mother combinations. Then the last SELECT is to get the COUNT. This one I ran in SQL Fiddle:
SQL Fiddle
MS SQL Server 2017 Schema Setup:
-- Create family tables.
CREATE TABLE dbo.Father
(
FatherID INT NOT NULL
, Name VARCHAR(50) NOT NULL
);
ALTER TABLE dbo.Father
ADD CONSTRAINT PK_Father
PRIMARY KEY CLUSTERED (FatherID);
ALTER TABLE dbo.Father SET (LOCK_ESCALATION = TABLE);
CREATE TABLE dbo.Mother
(
MotherID INT NOT NULL
, Name VARCHAR(50) NOT NULL
);
ALTER TABLE dbo.Mother
ADD CONSTRAINT PK_Mother
PRIMARY KEY CLUSTERED (MotherID);
ALTER TABLE dbo.Mother SET (LOCK_ESCALATION = TABLE);
CREATE TABLE dbo.Child
(
ChildID INT NOT NULL
, FatherID INT NOT NULL
, MotherID INT NOT NULL
, Name VARCHAR(50) NOT NULL
);
ALTER TABLE dbo.Child
ADD CONSTRAINT PK_Child
PRIMARY KEY CLUSTERED (ChildID);
CREATE NONCLUSTERED INDEX IX_Parents ON dbo.Child (FatherID, MotherID);
ALTER TABLE dbo.Child
ADD CONSTRAINT FK_Child_Father
FOREIGN KEY (FatherID)
REFERENCES dbo.Father (FatherID);
ALTER TABLE dbo.Child
ADD CONSTRAINT FK_Child_Mother
FOREIGN KEY (MotherID)
REFERENCES dbo.Mother (MotherID);
-- Insert two children with the same parents
INSERT INTO dbo.Father
(
FatherID
, Name
)
VALUES
(1, 'Alex')
, (2, 'Bob')
, (3, 'Charlie')
INSERT INTO dbo.Mother
(
MotherID
, Name
)
VALUES
(1, 'Alice')
, (2, 'Barbara');
INSERT INTO dbo.Child
(
ChildID
, FatherID
, MotherID
, Name
)
VALUES
(1, 1, 1, 'Adam')
, (2, 1, 1, 'Billy')
, (3, 1, 2, 'Celine')
, (4, 2, 2, 'Derek')
, (5, 3, 1, 'Eric');
Query 1:
-- Dynamic pivot: one column per MotherID, one row per father, then count the
-- distinct rows (= distinct mother combinations).
DECLARE @cols AS nvarchar(MAX)
DECLARE @query AS nvarchar(MAX)

-- Comma-separated, bracket-quoted list of all MotherIDs, e.g. [1],[2].
-- STUFF strips the leading comma.
SET @cols = STUFF((
    SELECT DISTINCT ',' + QUOTENAME(m.MotherID)
    FROM Mother as m
    FOR XML PATH(''))
    ,1,1,'')

-- The column list is spliced in three times: the outer DISTINCT, the pivot
-- projection (FatherID is left out on purpose) and the pivot IN list.
SET @query = 'SELECT COUNT(mCount) as UniqueMotherGroups FROM (
SELECT DISTINCT ' + @cols + ', 1 as mCount FROM (
SELECT ' + @cols + '
FROM (
SELECT
c.FatherID
, c.MotherID
, 1 as mID
FROM child as c
) x
PIVOT
(
MAX(mID)
FOR MotherID in (' + @cols + ')
) p
) as m
) as mg'

--SELECT @query
Exec(@query)
Results:
| UniqueMotherGroups |
|--------------------|
| 3 |
UPDATE 3: Here's one other idea... create a results table with a unique constraint and with IGNORE_DUP_KEY=ON. You could use this in a function or stored procedure, or, setup a trigger to put the mother combinations into a unique-combo-holding-table. With IGNORE_DUP_KEY=ON, you can insert every combo and only the unique combos will remain. Then just do a count of all the rows.
--Create a table to hold the results:
-- UniqueCombos is the primary key with IGNORE_DUP_KEY = ON, so inserting a
-- combination that is already present is skipped instead of raising an error.
-- FatherID records one father that has the combination.
-- nvarchar(450) is 900 bytes, sized to stay within the clustered index key limit.
CREATE TABLE results (
    FatherID int not null
    , UniqueCombos nvarchar(450) not null
        PRIMARY KEY WITH (IGNORE_DUP_KEY = ON)
);

--Insert all combos into the results table. The unique constraint will cause only unique entries to remain.
-- The combination is built per father (not per child: a child has exactly one
-- mother) from his distinct mothers, in MotherID order.
INSERT INTO results (FatherID, UniqueCombos)
SELECT
    f.FatherID
    , (
        SELECT ',' + CAST(dc.MotherID as nvarchar(11))
        FROM (
            SELECT DISTINCT c2.MotherID
            FROM Child as c2
            WHERE c2.FatherID = f.FatherID
        ) as dc
        ORDER BY dc.MotherID
        FOR XML PATH('')
    ) as mother_combos
FROM (
    SELECT DISTINCT c.FatherID
    FROM Child as c
) as f
;

--Count up all the rows in the results table. Since these are all unique combinations, it should be fast to sum.
SELECT COUNT(*)
FROM results;
If you accept to define a maximum number of mothers per father (here 7) you may try:
select count(*) as UniqueMotherGroups from (
select distinct m1, m2, m3, m4, m5, m6, m7 from (
select FatherID, row_number() over(partition by FatherID order by motherid) as rn, motherid
from (
select distinct FatherID, MotherID
from t_Child
)
)
pivot (
max(motherid) for rn in (1 as m1,2 as m2,3 as m3,4 as m4,5 as m5,6 as m6,7 as m7)
)
)
;
UNIQUEMOTHERGROUPS
------------------
3
Here is one idea. Instead of using precise STRING_AGG you can calculate a hash / checksum of the group. You don't need to know the exact composition of the group, you just need to distinguish between different groups. Calculating of the hash may be faster than concatenating strings.
SQL Server has a function CHECKSUM_AGG
You can write your own hashing function with CLR.
Sample data
CREATE TABLE #Child
(
ChildID INT NOT NULL IDENTITY PRIMARY KEY
,FatherID INT NOT NULL
,MotherID INT NOT NULL
,Name VARCHAR(50) NOT NULL
);
INSERT INTO #Child
(
FatherID
,MotherID
,Name
)
VALUES
(1, 1, 'Adam')
,(1, 1, 'Billy')
,(1, 2, 'Celine')
,(2, 2, 'Derek')
,(3, 1, 'Eric')
,(4, 1, 'A')
,(4, 1, 'B')
,(4, 2, 'C')
,(4, 2, 'D')
,(4, 2, 'E')
,(5, 2, 'F')
,(6, 2, 'G')
;
Query
WITH
distinctParentCombinations
AS
(
SELECT
FatherID
,MotherID
FROM #Child
GROUP BY
FatherID
,MotherID
)
,motherGroups
AS
(
SELECT
FatherID
,CHECKSUM_AGG(MotherID) AS MotherGroup
FROM distinctParentCombinations
GROUP BY
FatherID
)
SELECT COUNT(DISTINCT MotherGroup) AS UniqueMotherGroups
FROM motherGroups
;
Result
+--------------------+
| UniqueMotherGroups |
+--------------------+
| 3 |
+--------------------+
You need to compare performance of all methods on your actual data.
Obviously, with CHECKSUM_AGG it is possible that some of the groups will be missed. There is a chance that two different groups will generate the same checksum.
You know better if this is acceptable.
General way to speed up calculations is to have some of the results already pre-calculated. In your case, for the first part you can create indexed view as follows:
CREATE OR ALTER VIEW vw_distinctParentCombinations WITH SCHEMABINDING AS
SELECT children.FatherID
, children.MotherID
,COUNT_BIG(*) AS [wifes_count]
FROM dbo.Child as children
GROUP BY children.FatherID
, children.MotherID
GO
CREATE UNIQUE CLUSTERED INDEX IX_vw_distinctParentCombinations ON vw_distinctParentCombinations
(
FatherID,MotherID
);
Then in your initial query, you can avoid the first CTE:
-- CTE aggregates each father's mothers into one string, reading the distinct
-- (father, mother) pairs from the indexed view instead of the Child table.
WITH motherGroups (Mothers)
AS
(SELECT STRING_AGG(CONVERT(VARCHAR(MAX), distinctParentCombinations.MotherID), '-') WITHIN GROUP (ORDER BY distinctParentCombinations.MotherID) AS Mothers
FROM vw_distinctParentCombinations distinctParentCombinations WITH(NOEXPAND)
GROUP BY distinctParentCombinations.FatherID
)
-- DISTINCT is required: without it every father is counted, not every unique group.
-- Select motherGroups.Mothers without the COUNT to see the actual combinations.
SELECT COUNT(DISTINCT motherGroups.Mothers) AS UniqueMotherGroups
FROM motherGroups;
This will avoid the initial read of the large table and depending the distinct combinations of the pairs (father - mother) it can reduce the view size significantly.
Unfortunately, there are a lot of limitations in order to create an indexed view, and you are not able to create such for the second CTE.
If we change our mind and look this issue in different view, simply we can get the count of mothers with this query:
-- Counts the distinct "<FatherID>-<sum of MotherID>" strings, one per father.
-- NOTE(review): the string starts with FatherID, so it differs for every father
-- and the result equals the number of fathers that have children.
-- NOTE(review): SUM(MotherID) adds a mother once per child row, and different
-- sets of mothers can add up to the same total.
SELECT Count(distinct ConcatMothers) UniqueMothersCount from(
SELECT FatherID, concat(FatherID,'-',SUM(MotherID)) ConcatMothers
FROM dbo.Child
GROUP BY FatherID) t;
Or even you can use Dense_Rank() like this:
-- Ranks the "<FatherID>-<sum of MotherID>" strings and returns the highest rank,
-- i.e. the number of distinct strings.
-- NOTE(review): the string starts with FatherID, so every father gets his own
-- rank and the result equals the number of fathers that have children.
SELECT Max(RankMothers) UniqueMothersCount from(
SELECT FatherID, DENSE_RANK() over (order by concat(FatherID,'-',SUM(MotherID))) RankMothers
FROM dbo.Child
GROUP BY FatherID) t;
For the performance it is hard to measure because dataset is small but since we have one column in the group by and the motherId is in the select maybe we can change index as below:
-- Redefine the existing IX_Parents index: key on FatherID, MotherID as an included column.
-- DROP_EXISTING = ON replaces the current definition in one step; a plain CREATE
-- would fail because an index named IX_Parents already exists on dbo.Child.
CREATE NONCLUSTERED INDEX IX_Parents ON dbo.Child (FatherID) INCLUDE (MotherID)
WITH (DROP_EXISTING = ON);
but you need to check it on your dataset.

Insert into table from select only when select returns valid rows

I want to insert into table from select statement but it is required that insert only happens when select returns valid rows. If no rows return from select, then no insertion happens.
insert into products (name, type) select 'product_name', type from prototype where id = 1
However, the above sql does insertion even when select returns no rows.
It tries to insert NULL values.
I know the following sql can check if row exists
select exists (select true from prototype where id = 1)
How to write a single SQL to add the above condition to insert to exclude the case ?
You are inserting the wrong way. See the example below, that doesn't insert any row since none matches id = 1:
create table products (
name varchar(10),
type varchar(10)
);
create table prototype (
id int,
name varchar(10),
type varchar(10)
);
insert into prototype (id, name, type) values (5, 'product5', 'type5');
insert into prototype (id, name, type) values (7, 'product7', 'type7');
-- INSERT ... SELECT inserts exactly the rows the SELECT returns; prototype has
-- no row with id = 1, so the SELECT is empty and nothing is inserted.
insert into products (name, type) select name, type from prototype where id = 1;
-- no rows were inserted.

Need an automatism to insert in a table the missing rows of a select statement

I don't have much experience with databases, so I am asking for help with this situation:
This select statement retrieves all the documents that are not yet in the table I use to map documents that will be integrated into other systems; it's a mapping table (TP_DOC_MAP). I want to create some kind of automation in my database that runs this statement every X amount of time and inserts the results into my mapping table (TP_DOC_MAP).
Right now this is done manually: I run this statement and insert each record by hand, which of course doesn't make sense.
-- Lists the document types in domains 2 and 10 that have no active mapping row
-- in TP_DOC_MAP for LS_LOCAL_PK = 8 / LS_SYSTEM_PK = 3.
(
SELECT DOM_DOCUMENT.DOMAIN_DOC,
       TYPE_DOCS.TYPE_DOC_PK,
       TYPE_DOCS.TIPO_DOCUMENTO,
       TYPE_DOCS.USERCODE,
       TYPE_DOCS.CODE_RESULT
FROM TYPE_DOCS
JOIN DOM_TDOC_SIS
  ON TYPE_DOCS.TYPE_DOC_PK = DOM_TDOC_SIS.TYPE_DOC_PK
JOIN DOM_DOCUMENT
  ON DOM_TDOC_SIS.DOMAIN_DOC_PK = DOM_DOCUMENT.DOMAIN_DOC_PK
WHERE DOM_DOCUMENT.DOMAIN_DOC_PK IN (2, 10) -- (QLD = | PRD = 2) "motive docs"
                                            -- (QLD = 63 | PRD = 10) "Consent. docs"
AND NOT EXISTS
(
  SELECT 1
  FROM TP_DOC_MAP
  WHERE TP_DOC_MAP.LS_LOCAL_PK = 8
  AND TP_DOC_MAP.LS_SYSTEM_PK = 3 -- system type
  AND TP_DOC_MAP.ACTIVE = 1 -- Mapping all the active documents
  AND TP_DOC_MAP.CODE = TYPE_DOCS.TYPE_DOC_PK -- "not exists" junction
))
INSERT INTO TP_DOC(CODIGO, NAME, ACTIVE, CREATEDAT, UPDATEDAT) VALUES
('TPDOC_23', 'Report for intrusive function', 1, SYSDATE, NULL);
INSERT INTO TP_DOC_MAP (TP_DOCUMENT_PK, LS_LOCAL_PK, LS_SYSTEM_PK, CODE,
ACTIVE, CREATEDAT, UPDATEDAT) VALUES ((SELECT TP_DOCUMENT_PK FROM TP_DOC
WHERE CODE = 'TPDOC_23'), (SELECT LS_LOCAL_PK FROM LS_LOCAL WHERE CODE =
'IPE'), (SELECT LS_SYSTEM_PK FROM LS_SYSTEM WHERE CODIGO = 'MDS'), '21',
1, SYSDATE, NULL);
So I want to create a database routine to do this automatically.
Convert that code into a procedure, e.g.
create or replace procedure p_insert as
begin
insert into target_table (col1, col2, ...)
select rows_that_are_missing
from some_other_table
where ...;
end;
Then schedule it using either DBMS_JOB or DBMS_SCHEDULER packages. For example:
-- Schedules the procedure p_insert with DBMS_JOB: first run now, then re-run
-- according to the interval expression 'sysdate + 1' (i.e. one day later).
declare
  l_job number;  -- receives the job number assigned by dbms_job.submit
begin
  dbms_job.submit
    (job       => l_job,          -- OUT parameter: must be the declared variable
     what      => 'p_insert;',
     next_date => sysdate,
     interval  => 'sysdate + 1');
  -- The submitted job only becomes visible to the job queue after a commit.
  commit;
end;

Oracle SQL invalid identifier error in nested WITH subquery

Below you will find three sample tables and data along with a query. This example might seem contrived, but it is part of much larger (nearly 1500 lines) SQL query. The original query works great, but I've run into a problem while adding some new functionality.
CREATE TABLE rule_table (
id_rule_table NUMBER (10),
name VARCHAR2 (24),
goal NUMBER (10),
amount NUMBER (10)
);
INSERT INTO rule_table (id_rule_table, name, goal, amount) VALUES(1, 'lorem', 2, 3);
INSERT INTO rule_table (id_rule_table, name, goal, amount) VALUES(2, 'ipsum', 3, 3);
INSERT INTO rule_table (id_rule_table, name, goal, amount) VALUES(3, 'dolor', 4, 3);
CREATE TABLE content_table (
id_content_table NUMBER (10),
name VARCHAR2 (24),
show_flag NUMBER (10)
);
INSERT INTO content_table (id_content_table, name, show_flag) VALUES(1, 'lorem', 0);
INSERT INTO content_table (id_content_table, name, show_flag) VALUES(2, 'ipsum', 1);
INSERT INTO content_table (id_content_table, name, show_flag) VALUES(3, 'dolor', 1);
CREATE TABLE module_table (
id_module_table NUMBER (10),
id_content_table NUMBER (10),
name VARCHAR2 (24),
amount NUMBER (10)
);
INSERT INTO module_table (id_module_table, id_content_table, name, amount) VALUES(1, 2, 'lorem', 10);
INSERT INTO module_table (id_module_table, id_content_table, name, amount) VALUES(2, 2, 'ipsum', 11);
INSERT INTO module_table (id_module_table, id_content_table, name, amount) VALUES(3, 2, 'dolor', 12);
SELECT RULE.id_rule_table
FROM rule_table RULE
WHERE (
CASE
WHEN RULE.goal <= (
WITH contentTbl (id_content_table)
AS (
SELECT id_content_table
FROM content_table
WHERE show_flag = 1
),
modulesTbl (id_content_table, id_module_table)
AS (
SELECT C.id_content_table, M.id_module_table
FROM contentTbl C
JOIN module_table M ON M.id_content_table = C.id_content_table
WHERE 4 < M.amount - RULE.amount
)
SELECT SUM(M.id_module_table)
FROM contentTbl C
JOIN modulesTbl M ON C.id_content_table = M.id_content_table
)
THEN 1
ELSE 0
END
) = 1;
DROP TABLE rule_table;
DROP TABLE content_table;
DROP TABLE module_table;
If you try this you will receive the error ORA-00904: "RULE"."AMOUNT": invalid identifier. The problem lies with the line "WHERE 4 < M.amount - RULE.amount".
If you replace RULE.amount, in that line, with some number (e.g., WHERE 4 < M.amount - 3) then the query will run just fine.
As mentioned above, this is a snippet test case from a much larger query, so the structure of the query can't be (or hopefully doesn't need to be) changed too much. That is, ideally I'm looking for a solution that will allow me to use RULE.amount in the sub-query without changing anything other than the SQL inside of the "WHEN RULE.goal <= ()" block.
I'm trying to run this on Oracle 11g.
One last thing, I tried searching google and stackoverflow for solutions, but I couldn't figure out the correct terminology to describe my issue. The closest thing seemed to be nested correlated subquery, but that doesn't seem to be exactly right.
Taking into account that this is only part of a much larger query, here are the surgical changes required to make this work:
Move the WHERE 4 < M.amount - RULE.amount condition out of the CTE and into the main query so that RULE is in scope.
Modify the modulesTbl CTE to return an additional column amount so that M.amount is now available to the main query.
With these 2 changes, the query would look like this:
SELECT RULE.id_rule_table
FROM rule_table RULE
WHERE (
CASE
WHEN RULE.goal <= (
WITH contentTbl (id_content_table)
AS (
SELECT id_content_table
FROM content_table
WHERE show_flag = 1
),
modulesTbl (id_content_table, id_module_table, amount) -- add amount
AS (
SELECT C.id_content_table, M.id_module_table, M.amount -- add amount
FROM contentTbl C
JOIN module_table M ON M.id_content_table = C.id_content_table
)
SELECT SUM(M.id_module_table)
FROM contentTbl C
JOIN modulesTbl M ON C.id_content_table = M.id_content_table
AND 4 < M.amount - RULE.amount -- moved from CTE to here
)
THEN 1
ELSE 0
END
) = 1;

SQL very tricky procedure

I am a bit obsessed with this quiz now; it seems I can't find a way out... hehe.
Let's see - we have two tables:
actor (id, salary, bonus)
info (id, name, surname)
The question is, to create a procedure so that it shows salary, bonus, name, surname of the actor and additionally his final paycheck = (bonus + salary) .
To begin with I tried this: (just to show the salary, bonus, name, surname and not the total paycheck, but it failed big time).
create or replace procedure show_things AS
BEGIN
Select a.id, a.salary, a.bonus
From actor a
where a.id in
(select i_id, i.name, i.surname
from info i
where i_id = a_id);
END;
I mean, is it possible to show things using a procedure ? I really can't understand this question.
I don't really know which version of SQL that you're using, but essentially, you're going to want to join two tables and query from them, like so:
-- Returns salary, bonus, name and surname for each actor, plus the total pay
-- (salary + bonus). Each actor row is matched to its info row on the shared id;
-- without the ON condition the join is either rejected or degenerates into a
-- cross join, depending on the engine.
SELECT
    actor.salary,
    actor.bonus,
    info.name,
    info.surname,
    actor.salary + actor.bonus AS total_pay
FROM actor
INNER JOIN info
    ON actor.id = info.id;
Done :-)
It's a simple join:
SELECT
a.id,
a.salary,
a.bonus,
i.name,
i.surname,
a.salary + a.bonus AS final_paycheck
FROM
actor a INNER JOIN info i
ON
a.id= i.id
Your procedure won't compile as-is. You'll (at least) need to define some local variables, and use the INTO clause to select into them.
You can then output the result of your procedure using dbms_output.put_line(), assuming that you have SERVEROUTPUT turned on.
In addition, you can't use the IN operator with a different number of columns (1) to that in the subquery it references (3). You should use a simple JOIN as others have suggested.
Please see the sample in the PL/SQL documentation.
You can use a pipelined function to achieve this.
-- drop type t_actor_tab;
-- drop type t_actor_obj;
-- drop function get_actor_ptf;
-- drop table ACTOR;
-- drop table info;
CREATE TABLE ACTOR(ID NUMBER, SAL NUMBER, BONUS NUMBER);
INSERT INTO ACTOR(ID, SAL, BONUS) VALUES(1, 1200, 120);
INSERT INTO ACTOR(ID, SAL, BONUS) VALUES(2, 1300, 240);
INSERT INTO ACTOR(ID, SAL, BONUS) VALUES(3, 1500, 120);
CREATE TABLE INFO(ID NUMBER, NAME VARCHAR2(30), SURNAME VARCHAR2(30));
INSERT INTO INFO(ID, NAME, SURNAME) VALUES(1, 'ABC', 'abc');
INSERT INTO INFO(ID, NAME, SURNAME) VALUES(2, 'xyz', 'xyz');
INSERT INTO INFO(ID, NAME, SURNAME) VALUES(3, 'MNO', 'mno');
CREATE TYPE t_actor_obj AS OBJECT (
id NUMBER
,sal NUMBER
,bonus NUMBER
,final_paycheck NUMBER
);
CREATE TYPE t_actor_tab IS TABLE OF t_actor_obj;
-- Pipelined table function: emits one t_actor_obj per ACTOR row that has a
-- matching INFO row (same id), carrying id, sal, bonus and sal + bonus.
CREATE OR REPLACE FUNCTION get_actor_ptf
RETURN t_actor_tab PIPELINED AS
BEGIN
  -- The cursor FOR loop opens, fetches from and closes the query implicitly.
  FOR actor_rec IN (
    SELECT a.id
          ,a.sal
          ,a.bonus
          ,(a.sal+a.bonus) final_paycheck
    FROM actor a
    WHERE EXISTS(SELECT 1 FROM info i WHERE i.id = a.id)
  )
  LOOP
    PIPE ROW(t_actor_obj(actor_rec.id,
                         actor_rec.sal,
                         actor_rec.bonus,
                         actor_rec.final_paycheck));
  END LOOP;
  RETURN;
END;
/
SELECT * FROM TABLE(get_actor_ptf);
ID SAL BONUS FINAL_PAYCHECK
---------- ---------- ---------- --------------
1 1200 120 1320
2 1300 240 1540
3 1500 120 1620
CREATE TABLE ACTOR(ID BIGINT, SAL BIGINT, BONUS BIGINT);
INSERT INTO ACTOR(ID, SAL, BONUS) VALUES(1, 1200, 120), (2, 1300, 240),(3, 1500, 120);
CREATE TABLE INFO(ID BIGINT, NAME VARCHAR(30), SURNAME VARCHAR(30));
INSERT INTO INFO(ID, NAME, SURNAME) VALUES(1, 'ABC', 'abc'), (2, 'xyz', 'xyz'), (3, 'MNO', 'mno')
SELECT * FROM ACTOR
SELECT * FROM INFO
CREATE OR REPLACE PROCEDURE SP1()
LANGUAGE SQL
DYNAMIC RESULT SETS 1
BEGIN
DECLARE DISPLAY CURSOR WITH RETURN FOR
SELECT X.ID, Y.NAME, Y.SURNAME, X.SAL, X.BONUS, X.SAL + X. BONUS AS TOTAL FROM ACTOR X JOIN INFO Y ON X.ID = Y. ID ;
OPEN DISPLAY;
END#
CALL SP1()