Related
I have a complicated stored procedure that worked great until the client wanted to change it.
I am not great with complicated TSQL so I have no idea what is wrong with my code.
Here is the situation. I have three temp tables, Cost, Adjustments, and Payments.
In the end I merge all these tables together in a report table. The problem I am having is even if one or even two of these tables are null, as long as one table has data I need that data to show. I currently have it set up with full outer joins but I'm still not getting the full list, I'm missing probably....50 ish records that should be there.
Can anyone look at this code and tell me what the heck I'm doing wrong? I'm bringing all the data together on #ThisReportAll
UPDATE: So I removed the having clause to see what was going on, and the data for the overdue balance is returning null. So the math isn't...mathing correctly, any ideas?
CODE
CREATE TABLE #BalanceAdjustmentsAll (CustomerId int, Amount decimal(20,2));
CREATE TABLE #AnimalCostsAll (thisIndex int IDENTITY(1,1), AnimalTypeId int, Cost decimal(20,2));
CREATE TABLE #TotalAnimalCostAll (thisIndex int IDENTITY(1,1), YearSetupId int, AnimalTypeId int, AnimalType varchar(max), OwnerId int, CustomerId int, AnimalCount int, TtlSpeciesCost decimal(20,2));
CREATE TABLE #CustomerPaymentsAll (thisIndex int IDENTITY(1,1), CustomerID nvarchar(max), TtlPayments decimal(20,2));
CREATE TABLE #CustomerInfoAll (thisIndex int IDENTITY(1,1), OwnerId int, CustomerId int, FName nvarchar(200), LName nvarchar(200),BName nvarchar(200));
CREATE TABLE #ThisReportAll (thisIndex int IDENTITY(1,1), CustomerID nvarchar(max), Year char(4), OverdueBalance decimal(20,2), YearSetupId int);
INSERT INTO #BalanceAdjustmentsAll (CustomerId, Amount)
SELECT CustomerId, SUM(Amount)
FROM BalanceAdjustment
WHERE YearSetupId = 3
GROUP BY CustomerId;
/* GET Costs per Animal for 'This' yearID */
INSERT INTO #AnimalCostsAll (AnimalTypeId, Cost)
SELECT AnimalTypeId, Cost
FROM PerCapitaFee
WHERE YearSetupId = 3;
/* GET animal type totals for owner per year */
INSERT INTO #TotalAnimalCostAll (yearSetupId,AnimalTypeId,AnimalType,OwnerId,CustomerId,AnimalCount,TtlSpeciesCost)
SELECT YearSetup.YearSetupId,AnimalCount.AnimalTypeId,AnimalType.ShortDescription,Owner.OwnerId,Report.CustomerId,AnimalCount.Count,(ac.Cost * AnimalCount.Count)
FROM AnimalCount
INNER JOIN #AnimalCostsAll as ac
ON ac.AnimalTypeId = AnimalCount.AnimalTypeId
INNER JOIN AnimalType
ON AnimalCount.AnimalTypeId=AnimalType.AnimalTypeId
INNER JOIN AnimalLocation
ON AnimalLocation.AnimalLocationid=AnimalCount.AnimalLocationId
INNER JOIN Owner
ON Owner.OwnerId=AnimalLocation.OwnerId
AND Owner.OwnerType = 'P'
INNER JOIN Report
ON Report.ReportId=Owner.ReportId
INNER JOIN YearSetup
ON Report.YearSetupId=YearSetup.YearSetupId
INNER JOIN County
ON County.CountyId=AnimalLocation.CountyId
WHERE YearSetup.YearSetupId = 3 AND Report.Completed IS NOT NULL AND Report.CustomerId IS NOT NULL
/* Get The total payments a customer has made */
INSERT INTO #CustomerPaymentsAll (CustomerID,TtlPayments)
SELECT BPS.CustomerId,SUM(BPS.Amount)
FROM BatchPaymentSplit BPS
LEFT JOIN BatchPayment bp ON BPS.BatchPaymentId=bp.BatchPaymentId
LEFT JOIN Batch b ON bp.BatchId=b.BatchId
WHERE BPS.CustomerId IS NOT NULL
AND
(
((b.BatchTypeId = 'M' OR b.BatchTypeId = 'C' OR b.BatchTypeId = 'E') AND (b.BatchStatusId = 'S'))
OR
((b.BatchTypeId = 'B' OR b.BatchTypeId = 'N' OR b.BatchTypeId = 'R' OR b.BatchTypeId = 'T') AND (b.BatchStatusId = 'S' OR b.BatchStatusId='C'))
)
AND
BPS.YearSetupId = 3
GROUP BY BPS.CustomerId;
/* Deal with the name/id stuff */
INSERT INTO #CustomerInfoAll(FName, LName, BName, OwnerId, CustomerId)
SELECT
o.FirstName AS FName,
o.LastName AS LName,
o.BusinessName AS BName,
o.OwnerId AS OwnerId,
r.CustomerId AS CustomerId
FROM Owner o
INNER JOIN Report r
ON o.ReportId = r.ReportId
AND o.OwnerType = 'P'
WHERE r.CustomerId IN (SELECT CustomerId FROM #TotalAnimalCostAll)
AND r.Completed IS NOT NULL
AND r.YearSetupId = 3
AND NOT EXISTS(
SELECT 1 FROM Report
WHERE r.CustomerId = Report.CustomerId
AND Report.Completed IS NOT NULL
AND r.ReportId != Report.ReportId
AND r.YearSetupId = Report.YearSetupId
AND (
r.Completed < Report.Completed
OR (
r.Completed = Report.Completed
AND r.ReportId < Report.ReportId
)
)
)
ORDER BY CustomerId;
/** MAKE IT SO #1 **************************************************/
/* Simply Joining The Customer Info to the calculated totals to avoid any aggregation shenanigans... */
INSERT INTO #ThisReportAll (CustomerID,Year,OverdueBalance,YearSetupId)
SELECT COALESCE(t.CustomerId,cp.CustomerId,ba.CustomerID), ys.Name AS Year,
CASE
WHEN (SUM(t.TtlSpeciesCost) < 5 AND SUM(t.TtlSpeciesCost) > 0) AND (ys.Name='2015' OR ys.Name='2016')
THEN (5) - Isnull(cp.TtlPayments,0) + Isnull(ba.Amount,0)
ELSE SUM(t.TtlSpeciesCost) - Isnull(cp.TtlPayments,0) + Isnull(ba.Amount,0)
END
AS TtlOwnerCost, t.YearSetupId AS YearSetupId
FROM #TotalAnimalCostAll t
FULL OUTER JOIN #CustomerPaymentsAll cp ON t.CustomerId=cp.CustomerID
FULL OUTER JOIN #BalanceAdjustmentsAll ba ON COALESCE(t.CustomerId,cp.CustomerId)=ba.CustomerID
LEFT JOIN YearSetup ys ON COALESCE(t.CustomerId,cp.CustomerId,ba.CustomerID) = ys.YearSetupId
GROUP BY COALESCE(t.CustomerId,cp.CustomerId,ba.CustomerID),ys.Name,cp.TtlPayments, ba.Amount, t.YearSetupId
HAVING
CASE WHEN (SUM(t.TtlSpeciesCost) < 5 AND SUM(t.TtlSpeciesCost) > 0) AND (ys.Name='2015' OR ys.Name='2016')
THEN SUM(5) - Isnull(cp.TtlPayments,0) + Isnull(ba.Amount,0)
ELSE SUM(t.TtlSpeciesCost) - Isnull(cp.TtlPayments,0) + Isnull(ba.Amount,0)
END < 0;
/* Return some meaningful report data */
SELECT r.Year AS [YearName],r.CustomerID,left(ci.FName,20) AS [FirstName], left(ci.LName,40) AS [LastName], left(ci.BName,40) AS [BusinessName],r.OverdueBalance AS [Balance],r.YearSetupId
FROM #ThisReportAll r
LEFT JOIN #CustomerInfoAll ci ON r.CustomerID = ci.CustomerId
ORDER BY CAST(r.CustomerID as int) ASC;
DROP TABLE #BalanceAdjustmentsAll;
DROP TABLE #AnimalCostsAll;
DROP TABLE #TotalAnimalCostAll;
DROP TABLE #CustomerPaymentsAll;
DROP TABLE #CustomerInfoAll;
DROP TABLE #ThisReportAll;
Found it. I didn't have a default value for t.TtlSpeciesCost if it was null
SUM(t.TtlSpeciesCost) - Isnull(cp.TtlPayments,0) + Isnull(ba.Amount,0)
to
SUM(ISNULL(t.TtlSpeciesCost,0)) - Isnull(cp.TtlPayments,0) + Isnull(ba.Amount,0)
Some missing records may be found here:
by adjusting /* Get The total payments a customer has made */
INSERT INTO #CustomerPaymentsAll (CustomerID,TtlPayments)
SELECT BPS.CustomerId,SUM(BPS.Amount)
FROM BatchPaymentSplit BPS
LEFT JOIN BatchPayment bp
ON BPS.BatchPaymentId=bp.BatchPaymentId
LEFT JOIN Batch b
ON bp.BatchId=b.BatchId
AND ((b.BatchTypeId IN ('M', 'C', 'E') AND b.BatchStatusId = 'S')
OR (b.BatchTypeId IN ('B','N','R','T') AND (b.BatchStatusId IN ('S','C')))
WHERE BPS.CustomerId IS NOT NULL
AND BPS.YearSetupId = 3
GROUP BY BPS.CustomerId;
The WHERE on B would have negated the left join causing null records to be omitted. or made the left join to behave like an inner join.
To know for certain we need sample data from your tables showing which records are being omitted that you need to retain.
I also refactored the OR's and made them "IN"s to improve readability.
Just a question regards to temp tables and declaring table. If I change the temp tables 'ChangedData' and 'PackageDatatoProcess' to their own variables '#ChangedData' and '#PackageDatatoProcess', can I ask how I am suppose to change the select into statement as I have not quite done this before. Virtually I told that we can declare tables rather than using the select into but just need a bit of help with this:
select distinct * into #PackageDataToProcess from #ChangedData pp
outer apply (
select pk.Reference, pjl.PackageToJournalLinkId, j.CreatedDate, pccl.PackageCostChangeLogId from Jet2Holidays.dbo.Package pk
inner join Jet2Holidays.dbo.PackageToJournalLink pjl on pk.PackageId = pjl.PackageId
inner join Jet2Holidays.dbo.Journal j on pjl.JournalId = j.JournalId
and j.PrincipalName= iif(#AllowNonSupportChanges = 0, 'HolidaysSupport', j.PrincipalName)
inner join Jet2Holidays.dbo.BusinessProcess bp on pjl.BusinessProcessId = bp.BusinessProcessId
and bp.[Description] = iif(#AllowNonSupportChanges = 0, 'CallCentreAction', bp.[Description])
left outer join Jet2Holidays.dbo.PackageCostChangeLog pccl on pccl.PackageToJournalLinkId = pjl.PackageToJournalLinkId
where pk.Reference = pp.PackageReference
and pp.JournalID = pjl.JournalId
) as packageData
First, you declare your variable tables like so:
DECLARE #PackageDataToProcess TABLE
(
Reference UNIQUEIDENTIFIER
, PackageToJournalLinkId INT
, CreatedDate DATETIME
, PackageCostChangeLogId INT
, {other columns here}
)
DECLARE #ChangedData TABLE
(
Reference UNIQUEIDENTIFIER
, PackageToJournalLinkId INT
, CreatedDate DATETIME
, PackageCostChangeLogId INT
, {other columns here}
)
At this point you can populate your #ChangedData table like so:
INSERT #ChangedData ( Reference, PackageToJournalLinkId, CreatedDate, PackageCostChangeLogId, {other columns})
SELECT Reference, PackageToJournalLinkId, CreatedDate, PackageCostChangeLogId, {other columns}
FROM ChangedDataSource -- Table, Procedure, Function
And then you can run your code by substituting #PackageDataToProcess for #PackageDataToProcess. Here it is again with a slight re-write (from the original):
INSERT #PackageDataToProcess
select distinct pp.* from #ChangedData pp
outer apply (
select pk.Reference, pjl.PackageToJournalLinkId, j.CreatedDate, pccl.PackageCostChangeLogId from Jet2Holidays.dbo.Package pk
inner join Jet2Holidays.dbo.PackageToJournalLink pjl on pk.PackageId = pjl.PackageId
inner join Jet2Holidays.dbo.Journal j on pjl.JournalId = j.JournalId
and j.PrincipalName= iif(#AllowNonSupportChanges = 0, 'HolidaysSupport', j.PrincipalName)
inner join Jet2Holidays.dbo.BusinessProcess bp on pjl.BusinessProcessId = bp.BusinessProcessId
and bp.[Description] = iif(#AllowNonSupportChanges = 0, 'CallCentreAction', bp.[Description])
left outer join Jet2Holidays.dbo.PackageCostChangeLog pccl on pccl.PackageToJournalLinkId = pjl.PackageToJournalLinkId
where pk.Reference = pp.PackageReference
and pp.JournalID = pjl.JournalId
) as packageData
Your declared tabled will go out of scope in a similar manner to the way your non-global temporary table do.
Hello I have following SQL query
SELECT K.name AS Name, K.surname AS Surname, U1.akce AS Event,
U2.[text] AS [Scheme], U1.[text] AS [Registered under>],
( U1.x - (
SELECT Count(K.ubytov)
FROM klient
WHERE ubytov = U2.[text]) ) AS [Free space]
FROM klient K
INNER JOIN ubytov U1 ON U1.[text] = K.ubytov
LEFT OUTER JOIN ubytov U2 ON U1.z = U2.id WHERE U1.akce = '140012-02'
ORDER BY U1.[text]
I'm trying to achieve that in column Free space would be (value from ubytov.x that matches U1.z = U2.id) - (total number of rows from table klient that has the same value in U1.[text]=K.ubytov)
In table klient column ubytov I have values that matches ubytov.text and in ubytov.z I have value that matches ubytov.x in different row.
Would somebody help me solve this out please?
Thank you for your time.
An example:
Table klient
ID_K ubytov
1 RoomOwner
2 RoomOwner
table ubytov
id text x z
1 roomType1 2 NULL
2 RoomOwner NULL 1
Desired Output:
Name Surname Event Scheme Registered under: Free space
Nam1 Surname1 Even1 Scheme1 RoomOwnerName 0 // (because 2 counts from klient) - (roomType1 x)
Although, it wasn't much clear because of missing columns.
I tried to build the required query using a CTE expression.
Here is the sqlfiddle code.
Let me know, if this is what you are looking for.
create table klient
(
id_k int,
ubytov varchar(25)
)
go
create table ubytov
(
id int,
text varchar(25),
x int null,
z int null
)
go
insert into klient(id_k, ubytov)
select 1, 'RoomOwner'
union select 2, 'RoomOwner'
go
insert into ubytov(id, text, x, z)
select 1, 'roomType1', 2, null
union select 2, 'RoomOwner', null, 1
go
;WITH cte_klint_counts_by_ubytov
AS
(
SELECT
ubytov,
Count(ubytov) AS ubytovCount
FROM klient
GROUP BY ubytov
)
SELECT
U2.[text] AS [Scheme],
U1.[text] AS [Registered under>],
(isnull(U1.x, 0) - isnull(c.ubytovCount, 0)) AS [Free space]
FROM
klient K
INNER JOIN ubytov U1 ON U1.[text] = K.ubytov
LEFT OUTER JOIN ubytov U2 ON U1.z = U2.id
LEFT OUTER JOIN cte_klint_counts_by_ubytov c ON c.ubytov = U2.[text]
ORDER BY u1.[text]
Suppose I have a table, in Database name 'Old', as below:
TABLE A
(
SeniorVehicle varchar(255),
SeniorVehicleAllowance int,
JuniorVehicle varchar(255),
JuniorVehicleAllowance int
ManagerVehicle varchar(255),
ManagerVehicleAllowance int
);
And another table, in Database name 'New' as below:
TABLE B
(
SeniorVehicle int,
SeniorVehicleAllowance int,
JuniorVehicle int,
JuniorVehicleAllowance int,
ManagerVehicle int,
ManagerVehicleAllowance int
);
I want to bring the data from TABLE A of Database 'Old' to TABLE B of Database 'New'.
The thing is that, there is a table named Vehicle in both databases as bellow:
TABLE Vehicle
(
VehicleID int pk,
VehicleName varchar(255)
)
Values in SeniorVehicle, JuniorVehicle and ManagerVehicle columns in TABLE A are the VehicleName value in TABLE Vehicle. But the value of SeniorVehicle, JuniorVehicle and ManagerVehicle that must be stored in TABLE B must be the value of VehicleID column in the Vehicle Table.
How to achieve this without error????
I have tried the following:
INSERT INTO B
(SeniorVehicle, SeniorVehicleAllowance, JuniorVehicle, JuniorniorVehicleAllowance, ManagerVehicle, ManagerVehicleAllowance)
SELECT Vehicle.VehicleID, c.SeniorVehicleAllowance, c.VehicleID, c.JuniorVehicleAllowance, c.VehicleID, c.ManagerVehicleAllowance
FROM (SELECT b.SeniorVehicle, b.SeniorVehicleAllowance, Vehicle.VehicleID, b.JuniorVehicleAllowance, b.VehicleID, b.ManagerVehicleAllowance
FROM (SELECT a.SeniorVehicle, a.SeniorVehicleAllowance, a.JuniorVehicle, a.JuniorVehicleAllowance, Vehicle.VehicleID, a.ManagerVehicleAllowance
FROM (SELECT SeniorVehicle, SeniorVehicleAllowance, JuniorVehicle, JuniorVehicleAllowance, ManagerVehicle, ManagerVehicleAllowance FROM A) as a
Inner join
Vehicle
ON a.ManagerVehicle = Vehicle.VehicleName) as b
Inner join
Vehicle
ON b.JuniorVehicle = Vehicle.VehicleName) as c
Inner join
Vehicle
ON c.SeniorVehicle = Vehicle.VehicleName
I get the following error:
The column 'VehicleID' was specified multiple times for 'c'
My Databse is MSSQL Server 2008 R2
Reformatting your current query gives:
SELECT
Vehicle.VehicleID,
c.SeniorVehicleAllowance,
c.VehicleID,
c.JuniorVehicleAllowance,
c.VehicleID,
c.ManagerVehicleAllowance
FROM (
SELECT b.SeniorVehicle,
b.SeniorVehicleAllowance,
Vehicle.VehicleID,
b.JuniorVehicleAllowance,
b.VehicleID,
b.ManagerVehicleAllowance
FROM (
SELECT a.SeniorVehicle,
a.SeniorVehicleAllowance,
a.JuniorVehicle,
a.JuniorVehicleAllowance,
Vehicle.VehicleID,
a.ManagerVehicleAllowance
FROM (
SELECT SeniorVehicle,
SeniorVehicleAllowance,
JuniorVehicle,
JuniorVehicleAllowance,
ManagerVehicle,
ManagerVehicleAllowance
FROM A
) as a
Inner join Vehicle
ON a.ManagerVehicle = Vehicle.VehicleName
) as b
Inner join Vehicle
ON b.JuniorVehicle = Vehicle.VehicleName
) as c
Inner join Vehicle
ON c.SeniorVehicle = Vehicle.VehicleName
In this query, the sub query aliased c has two columns called VehicleID (which is what your error message is telling you.
The smallest change to fix the issue is to alias the columns in the sub query, e.g:
SELECT
Vehicle.VehicleID AS SeniorVehicleId,
c.SeniorVehicleAllowance,
c.JuniorVehicleId,
c.JuniorVehicleAllowance,
c.ManagerVehicleID,
c.ManagerVehicleAllowance
FROM (
SELECT b.SeniorVehicle,
b.SeniorVehicleAllowance,
Vehicle.VehicleID AS JuniorVehicleId,
b.JuniorVehicleAllowance,
b.ManagerVehicleID,
b.ManagerVehicleAllowance
FROM (
SELECT a.SeniorVehicle,
a.SeniorVehicleAllowance,
a.JuniorVehicle,
a.JuniorVehicleAllowance,
Vehicle.VehicleID AS ManagerVehicleID,
a.ManagerVehicleAllowance
-- Rest ommited for brevity
It would be also possible to re-write the query with more joins, and omit the need for the subqueries altogether also:
SELECT srmgr.VehicleId AS SeniorVehicleId,
A.SeniorVehicleAllowance,
jrmgr.VehicleId AS JuniorVehicleId,
A.JuniorVehicleAllowance,
mgr.VehicleId AS ManagerVehicleId,
A.ManagerVehicleAllowance
FROM A
INNER JOIN Vehicle AS mgr
ON a.ManagerVehicle = mgr.VehicleName
INNER JOIN Vehicle AS jrmgr
ON a.ManagerVehicle = jrmgr.VehicleName
INNER JOIN Vehicle AS srmgr
ON a.ManagerVehicle = srmgr.VehicleName
Hopefully I'm missing a simple solution to this.
I have two tables. One contains a list of companies. The second contains a list of publishers. The mapping between the two is many to many. What I would like to do is bundle or group all of the companies in table A which have any relationship to a publisher in table B and vise versa.
The final result would look something like this (GROUPID is the key field). Row 1 and 2 are in the same group because they share the same company. Row 3 is in the same group because the publisher Y was already mapped over to company A. Row 4 is in the group because Company B was already mapped to group 1 through Publisher Y.
Said simply, any time there is any kind of shared relationship across Company and Publisher, that pair should be assigned to the same group.
ROW GROUPID Company Publisher
1 1 A Y
2 1 A X
3 1 B Y
4 1 B Z
5 2 C W
6 2 C P
7 2 D W
Fiddle
Update:
My bounty version: Given the table in the fiddle above of simply Company and Publisher pairs, populate the GROUPID field above. Think of it as creating a Family ID that encompasses all related parents/children.
SQL Server 2012
I thought about using recursive CTE, but, as far as I know, it's not possible in SQL Server to use UNION to connect anchor member and a recursive member of recursive CTE (I think it's possible to do in PostgreSQL), so it's not possible to eliminate duplicates.
declare #i int
with cte as (
select
GroupID,
row_number() over(order by Company) as rn
from Table1
)
update cte set GroupID = rn
select #i = ##rowcount
-- while some rows updated
while #i > 0
begin
update T1 set
GroupID = T2.GroupID
from Table1 as T1
inner join (
select T2.Company, min(T2.GroupID) as GroupID
from Table1 as T2
group by T2.Company
) as T2 on T2.Company = T1.Company
where T1.GroupID > T2.GroupID
select #i = ##rowcount
update T1 set
GroupID = T2.GroupID
from Table1 as T1
inner join (
select T2.Publisher, min(T2.GroupID) as GroupID
from Table1 as T2
group by T2.Publisher
) as T2 on T2.Publisher = T1.Publisher
where T1.GroupID > T2.GroupID
-- will be > 0 if any rows updated
select #i = #i + ##rowcount
end
;with cte as (
select
GroupID,
dense_rank() over(order by GroupID) as rn
from Table1
)
update cte set GroupID = rn
sql fiddle demo
I've also tried a breadth first search algorithm. I thought it could be faster (it's better in terms of complexity), so I'll provide a solution here. I've found that it's not faster than SQL approach, though:
declare #Company nvarchar(2), #Publisher nvarchar(2), #GroupID int
declare #Queue table (
Company nvarchar(2), Publisher nvarchar(2), ID int identity(1, 1),
primary key(Company, Publisher)
)
select #GroupID = 0
while 1 = 1
begin
select top 1 #Company = Company, #Publisher = Publisher
from Table1
where GroupID is null
if ##rowcount = 0 break
select #GroupID = #GroupID + 1
insert into #Queue(Company, Publisher)
select #Company, #Publisher
while 1 = 1
begin
select top 1 #Company = Company, #Publisher = Publisher
from #Queue
order by ID asc
if ##rowcount = 0 break
update Table1 set
GroupID = #GroupID
where Company = #Company and Publisher = #Publisher
delete from #Queue where Company = #Company and Publisher = #Publisher
;with cte as (
select Company, Publisher from Table1 where Company = #Company and GroupID is null
union all
select Company, Publisher from Table1 where Publisher = #Publisher and GroupID is null
)
insert into #Queue(Company, Publisher)
select distinct c.Company, c.Publisher
from cte as c
where not exists (select * from #Queue as q where q.Company = c.Company and q.Publisher = c.Publisher)
end
end
sql fiddle demo
I've tested my version and Gordon Linoff's to check how it's perform. It looks like CTE is much worse, I couldn't wait while it's complete on more than 1000 rows.
Here's sql fiddle demo with random data. My results were:
128 rows:
my RBAR solution: 190ms
my SQL solution: 27ms
Gordon Linoff's solution: 958ms
256 rows:
my RBAR solution: 560ms
my SQL solution: 1226ms
Gordon Linoff's solution: 45371ms
It's random data, so results may be not very consistent. I think timing could be changed by indexes, but don't think it could change a whole picture.
old version - using temporary table, just calculating GroupID without touching initial table:
declare #i int
-- creating table to gather all possible GroupID for each row
create table #Temp
(
Company varchar(1), Publisher varchar(1), GroupID varchar(1),
primary key (Company, Publisher, GroupID)
)
-- initializing it with data
insert into #Temp (Company, Publisher, GroupID)
select Company, Publisher, Company
from Table1
select #i = ##rowcount
-- while some rows inserted into #Temp
while #i > 0
begin
-- expand #Temp in both directions
;with cte as (
select
T2.Company, T1.Publisher,
T1.GroupID as GroupID1, T2.GroupID as GroupID2
from #Temp as T1
inner join #Temp as T2 on T2.Company = T1.Company
union
select
T1.Company, T2.Publisher,
T1.GroupID as GroupID1, T2.GroupID as GroupID2
from #Temp as T1
inner join #Temp as T2 on T2.Publisher = T1.Publisher
), cte2 as (
select
Company, Publisher,
case when GroupID1 < GroupID2 then GroupID1 else GroupID2 end as GroupID
from cte
)
insert into #Temp
select Company, Publisher, GroupID
from cte2
-- don't insert duplicates
except
select Company, Publisher, GroupID
from #Temp
-- will be > 0 if any row inserted
select #i = ##rowcount
end
select
Company, Publisher,
dense_rank() over(order by min(GroupID)) as GroupID
from #Temp
group by Company, Publisher
=> sql fiddle example
Your problem is a graph-walking problem of finding connected subgraphs. It is a little more challenging because your data structure has two types of nodes ("companies" and "pubishers") rather than one type.
You can solve this with a single recursive CTE. The logic is as follows.
First, convert the problem into a graph with only one type of node. I do this by making the nodes companies and the edges linkes between companies, using the publisher information. This is just a join:
select t1.company as node1, t2.company as node2
from table1 t1 join
table1 t2
on t1.publisher = t2.publisher
)
(For efficiency sake, you could also add t1.company <> t2.company but that is not strictly necessary.)
Now, this is a "simple" graph walking problem, where a recursive CTE is used to create all connections between two nodes. The recursive CTE walks through the graph using join. Along the way, it keeps a list of all nodes visited. In SQL Server, this needs to be stored in a string.
The code needs to ensure that it doesn't visit a node twice for a given path, because this can result in infinite recursion (and an error). If the above is called edges, the CTE that generates all pairs of connected nodes looks like:
cte as (
select e.node1, e.node2, cast('|'+e.node1+'|'+e.node2+'|' as varchar(max)) as nodes,
1 as level
from edges e
union all
select c.node1, e.node2, c.nodes+e.node2+'|', 1+c.level
from cte c join
edges e
on c.node2 = e.node1 and
c.nodes not like '|%'+e.node2+'%|'
)
Now, with this list of connected nodes, assign each node the minimum of all the nodes it is connected to, including itself. This serves as an identifier of connected subgraphs. That is, all companies connected to each other via the publishers will have the same minimum.
The final two steps are to enumerate this minimum (as the GroupId) and to join the GroupId back to the original data.
The full (and I might add tested) query looks like:
with edges as (
select t1.company as node1, t2.company as node2
from table1 t1 join
table1 t2
on t1.publisher = t2.publisher
),
cte as (
select e.node1, e.node2,
cast('|'+e.node1+'|'+e.node2+'|' as varchar(max)) as nodes,
1 as level
from edges e
union all
select c.node1, e.node2,
c.nodes+e.node2+'|',
1+c.level
from cte c join
edges e
on c.node2 = e.node1 and
c.nodes not like '|%'+e.node2+'%|'
),
nodes as (
select node1,
(case when min(node2) < node1 then min(node2) else node1 end
) as grp
from cte
group by node1
)
select t.company, t.publisher, grp.GroupId
from table1 t join
(select n.node1, dense_rank() over (order by grp) as GroupId
from nodes n
) grp
on t.company = grp.node1;
Note that this works on finding any connected subgraphs. It does not assume that any particular number of levels.
EDIT:
The question of performance for this is vexing. At a minimum, the above query will run better with an index on Publisher. Better yet is to take #MikaelEriksson's suggestion, and put the edges in a separate table.
Another question is whether you look for equivalency classes among the Companies or the Publishers. I took the approach of using Companies, because I think that has better "explanability" (my inclination to respond was based on numerous comments that this could not be done with CTEs).
I am guessing that you could get reasonable performance from this, although that requires more knowledge of your data and system than provided in the OP. It is quite likely, though, that the best performance will come from a multiple query approach.
Here is my solution SQL Fiddle
The nature of the relationships require looping as I figure.
Here is the SQL:
--drop TABLE Table1
CREATE TABLE Table1
([row] int identity (1,1),GroupID INT NULL,[Company] varchar(2), [Publisher] varchar(2))
;
INSERT INTO Table1
(Company, Publisher)
select
left(newid(), 2), left(newid(), 2)
declare #i int = 1
while #i < 8
begin
;with cte(Company, Publisher) as (
select
left(newid(), 2), left(newid(), 2)
from Table1
)
insert into Table1(Company, Publisher)
select distinct c.Company, c.Publisher
from cte as c
where not exists (select * from Table1 as t where t.Company = c.Company and t.Publisher = c.Publisher)
set #i = #i + 1
end;
CREATE NONCLUSTERED INDEX IX_Temp1 on Table1 (Company)
CREATE NONCLUSTERED INDEX IX_Temp2 on Table1 (Publisher)
declare #counter int=0
declare #row int=0
declare #lastnullcount int=0
declare #currentnullcount int=0
WHILE EXISTS (
SELECT *
FROM Table1
where GroupID is null
)
BEGIN
SET #counter=#counter+1
SET #lastnullcount =0
SELECT TOP 1
#row=[row]
FROM Table1
where GroupID is null
order by [row] asc
SELECT #currentnullcount=count(*) from table1 where groupid is null
WHILE #lastnullcount <> #currentnullcount
BEGIN
SELECT #lastnullcount=count(*)
from table1
where groupid is null
UPDATE Table1
SET GroupID=#counter
WHERE [row]=#row
UPDATE t2
SET t2.GroupID=#counter
FROM Table1 t1
INNER JOIN Table1 t2 on t1.Company=t2.Company
WHERE t1.GroupID=#counter
AND t2.GroupID IS NULL
UPDATE t2
SET t2.GroupID=#counter
FROM Table1 t1
INNER JOIN Table1 t2 on t1.publisher=t2.publisher
WHERE t1.GroupID=#counter
AND t2.GroupID IS NULL
SELECT #currentnullcount=count(*)
from table1
where groupid is null
END
END
SELECT * FROM Table1
Edit:
Added indexes as I would expect on the real table and be more in line with the other data sets Roman is using.
You are trying to find all of the connected components of your graph, which can only be done iteratively. If you know the maximum width of any connected component (i.e. the maximum number of links you will have to take from one company/publisher to another), you could in principle do it something like this:
SELECT
MIN(x2.groupID) AS groupID,
x1.Company,
x1.Publisher
FROM Table1 AS x1
INNER JOIN (
SELECT
MIN(x2.Company) AS groupID,
x1.Company,
x1.Publisher
FROM Table1 AS x1
INNER JOIN Table1 AS x2
ON x1.Publisher = x2.Publisher
GROUP BY
x1.Publisher,
x1.Company
) AS x2
ON x1.Company = x2.Company
GROUP BY
x1.Publisher,
x1.Company;
You have to keep nesting the subquery (alternating joins on Company and Publisher, and with the deepest subquery saying MIN(Company) rather than MIN(groupID)) to the maximum iteration depth.
I don't really recommend this, though; it would be cleaner to do this outside of SQL.
Disclaimer: I don't know anything about SQL Server 2012 (or any other version); it may have some kind of additional scripting ability to let you do this iteration dynamically.
This is a recursive solution, using XML:
with a as ( -- recursive result, containing shorter subsets and duplicates
select cast('<c>' + company + '</c>' as xml) as companies
,cast('<p>' + publisher + '</p>' as xml) as publishers
from Table1
union all
select a.companies.query('for $c in distinct-values((for $i in /c return string($i),
sql:column("t.company")))
order by $c
return <c>{$c}</c>')
,a.publishers.query('for $p in distinct-values((for $i in /p return string($i),
sql:column("t.publisher")))
order by $p
return <p>{$p}</p>')
from a join Table1 t
on ( a.companies.exist('/c[text() = sql:column("t.company")]') = 0
or a.publishers.exist('/p[text() = sql:column("t.publisher")]') = 0)
and ( a.companies.exist('/c[text() = sql:column("t.company")]') = 1
or a.publishers.exist('/p[text() = sql:column("t.publisher")]') = 1)
), b as ( -- remove the shorter versions from earlier steps of the recursion and the duplicates
select distinct -- distinct cannot work on xml types, hence cast to nvarchar
cast(companies as nvarchar) as companies
,cast(publishers as nvarchar) as publishers
,DENSE_RANK() over(order by cast(companies as nvarchar), cast(publishers as nvarchar)) as groupid
from a
where not exists (select 1 from a as s -- s is a proper subset of a
where (cast('<s>' + cast(s.companies as varchar)
+ '</s><a>' + cast(a.companies as varchar) + '</a>' as xml)
).value('if((count(/s/c) > count(/a/c))
and (some $s in /s/c/text() satisfies
(some $a in /a/c/text() satisfies $s = $a))
) then 1 else 0', 'int') = 1
)
and not exists (select 1 from a as s -- s is a proper subset of a
where (cast('<s>' + cast(s.publishers as nvarchar)
+ '</s><a>' + cast(a.publishers as nvarchar) + '</a>' as xml)
).value('if((count(/s/p) > count(/a/p))
and (some $s in /s/p/text() satisfies
(some $a in /a/p/text() satisfies $s = $a))
) then 1 else 0', 'int') = 1
)
), c as ( -- cast back to xml
select cast(companies as xml) as companies
,cast(publishers as xml) as publishers
,groupid
from b
)
select Co.company.value('(./text())[1]', 'varchar') as company
,Pu.publisher.value('(./text())[1]', 'varchar') as publisher
,c.groupid
from c
cross apply companies.nodes('/c') as Co(company)
cross apply publishers.nodes('/p') as Pu(publisher)
where exists(select 1 from Table1 t -- restrict to only the combinations that exist in the source
where t.company = Co.company.value('(./text())[1]', 'varchar')
and t.publisher = Pu.publisher.value('(./text())[1]', 'varchar')
)
The set of companies and the set of publishers are kept in XML fields in the intermediate steps, and there is some casting between xml and nvarchar necessary due to some limitations of SQL Server (like not being able to group or use distinct on XML columns.
Bit late to the challenge, and since SQLFiddle seems to be down ATM I'll have to guess your data-structures. Nevertheless, it seemed like a fun challenge (and it was =) so here's what I made from it :
Setup:
IF OBJECT_ID('t_link') IS NOT NULL DROP TABLE t_link
IF OBJECT_ID('t_company') IS NOT NULL DROP TABLE t_company
IF OBJECT_ID('t_publisher') IS NOT NULL DROP TABLE t_publisher
IF OBJECT_ID('tempdb..#link_A') IS NOT NULL DROP TABLE #link_A
IF OBJECT_ID('tempdb..#link_B') IS NOT NULL DROP TABLE #link_B
GO
CREATE TABLE t_company ( company_id int IDENTITY(1, 1) NOT NULL PRIMARY KEY,
company_name varchar(100) NOT NULL)
GO
CREATE TABLE t_publisher (publisher_id int IDENTITY(1, 1) NOT NULL PRIMARY KEY,
publisher_name varchar(100) NOT NULL)
CREATE TABLE t_link (company_id int NOT NULL FOREIGN KEY (company_id) REFERENCES t_company (company_id),
publisher_id int NOT NULL FOREIGN KEY (publisher_id) REFERENCES t_publisher (publisher_id),
PRIMARY KEY (company_id, publisher_id),
group_id int NULL
)
GO
-- example content
-- ROW GROUPID Company Publisher
--1 1 A Y
--2 1 A X
--3 1 B Y
--4 1 B Z
--5 2 C W
--6 2 C P
--7 2 D W
INSERT t_company (company_name) VALUES ('A'), ('B'), ('C'), ('D')
INSERT t_publisher (publisher_name) VALUES ('X'), ('Y'), ('Z'), ('W'), ('P')
INSERT t_link (company_id, publisher_id)
SELECT company_id, publisher_id
FROM t_company, t_publisher
WHERE (company_name = 'A' AND publisher_name = 'Y')
OR (company_name = 'A' AND publisher_name = 'X')
OR (company_name = 'B' AND publisher_name = 'Y')
OR (company_name = 'B' AND publisher_name = 'Z')
OR (company_name = 'C' AND publisher_name = 'W')
OR (company_name = 'C' AND publisher_name = 'P')
OR (company_name = 'D' AND publisher_name = 'W')
GO
/*
-- volume testing
TRUNCATE TABLE t_link
DELETE t_company
DELETE t_publisher
DECLARE #company_count int = 1000,
#publisher_count int = 450,
#links_count int = 800
INSERT t_company (company_name)
SELECT company_name = Convert(varchar(100), NewID())
FROM master.dbo.fn_int_list(1, #company_count)
UPDATE STATISTICS t_company
INSERT t_publisher (publisher_name)
SELECT publisher_name = Convert(varchar(100), NewID())
FROM master.dbo.fn_int_list(1, #publisher_count)
UPDATE STATISTICS t_publisher
-- Random links between the companies & publishers
DECLARE #count int
SELECT #count = 0
WHILE #count < #links_count
BEGIN
SELECT TOP 30 PERCENT row_id = IDENTITY(int, 1, 1), company_id = company_id + 0
INTO #link_A
FROM t_company
ORDER BY NewID()
SELECT TOP 30 PERCENT row_id = IDENTITY(int, 1, 1), publisher_id = publisher_id + 0
INTO #link_B
FROM t_publisher
ORDER BY NewID()
INSERT TOP (#links_count - #count) t_link (company_id, publisher_id)
SELECT A.company_id,
B.publisher_id
FROM #link_A A
JOIN #link_B B
ON A.row_id = B.row_id
WHERE NOT EXISTS ( SELECT *
FROM t_link old
WHERE old.company_id = A.company_id
AND old.publisher_id = B.publisher_id)
SELECT #count = #count + ##ROWCOUNT
DROP TABLE #link_A
DROP TABLE #link_B
END
*/
Actual grouping:
IF OBJECT_ID('tempdb..#links') IS NOT NULL DROP TABLE #links
GO
-- apply grouping
-- init
SELECT row_id = IDENTITY(int, 1, 1),
company_id,
publisher_id,
group_id = 0
INTO #links
FROM t_link
-- don't see an index that would be actually helpful here right-away, using row_id to avoid HEAP
CREATE CLUSTERED INDEX idx0 ON #links (row_id)
--CREATE INDEX idx1 ON #links (company_id)
--CREATE INDEX idx2 ON #links (publisher_id)
UPDATE #links
SET group_id = row_id
-- start grouping
WHILE ##ROWCOUNT > 0
BEGIN
UPDATE #links
SET group_id = new_group_id
FROM #links upd
CROSS APPLY (SELECT new_group_id = Min(group_id)
FROM #links new
WHERE new.company_id = upd.company_id
OR new.publisher_id = upd.publisher_id
) x
WHERE upd.group_id > new_group_id
-- select * from #links
END
-- remove 'holes'
UPDATE #links
SET group_id = (SELECT COUNT(DISTINCT o.group_id)
FROM #links o
WHERE o.group_id <= upd.group_id)
FROM #links upd
GO
UPDATE t_link
SET group_id = new.group_id
FROM t_link upd
LEFT OUTER JOIN #links new
ON new.company_id = upd.company_id
AND new.publisher_id = upd.publisher_id
GO
SELECT row = ROW_NUMBER() OVER (ORDER BY group_id, company_name, publisher_name),
l.group_id,
c.company_name, -- c.company_id,
p.publisher_name -- , p.publisher_id
from t_link l
JOIN t_company c
ON l.company_id = c.company_id
JOIN t_publisher p
ON p.publisher_id = l.publisher_id
ORDER BY 1
At first sight this approach hasn't been tried yet by anyone else, interesting to see how this can be done in a variety of ways... (preferred not to read them upfront as it would spoil the puzzle =)
Results look as expected (as far as I understand the requirements and the example) and performance isn't too shabby either although there is no real indication on the amount of records this should work on; not sure how it would scale but don't expect too many problems either...