Subquery using multiple IN operator - sql

I am trying to fetch all the ids that have values from List 1 and, using those ids, fetch all the List 2 values along with a count of the List 2 values per id.
DECLARE @Table1 AS TABLE (
id int,
l1 varchar(20)
);
INSERT INTO @Table1 VALUES
(1,'sun'),
(2,'shine'),
(3,'moon'),
(4,'light'),
(5,'earth'),
(6,'revolves'),
(7,'flow'),
(8,'fire'),
(9,'fighter'),
(10,'sun'),
(10,'shine'),
(11,'shine'),
(12,'moon'),
(1,'revolves'),
(10,'revolves'),
(2,'air'),
(3,'shine'),
(4,'fire'),
(5,'love'),
(6,'sun'),
(7,'rises');
/*
OPERATION 1
fetch all distinct ID's that have values from List 1
List1
sun
moon
earth
Initial OUTPUT1:
distinct_id list1_value
1 sun
3 moon
5 earth
10 sun
12 moon
6 sun
OPERATION2
fetch all the id, count_of_list2_values, list2_values
based on the id's that we received from OPERATION1
List2
shine
revolves
Expected Output:
id list1_value count_of_list2_values list2_values
1 sun 1 revolves
3 moon 1 shine
5 earth 0 NULL
10 sun 2 shine,revolves
12 moon 0 NULL
6 sun 1 revolves
*/
My query:
Here is what I tried
select id, count(l1),l1
from @table1
where id in ('shine','revolves') and id in ('sun','moon','earth')
How can I achieve this?
I know this should involve a subquery with multiple IN clauses. How can it be done?
SQL fiddle Link:
https://dbfiddle.uk/?rdbms=sqlserver_2017&fiddle=7a85dbf51ca5b5d35e87d968c46300bb

There are several ways this could be done. Here's how I'd do it:
First set up the data:
DECLARE @Table1 AS TABLE (
id int,
l1 varchar(20)
) ;
INSERT INTO @Table1 VALUES
(1,'sun'),
(2,'shine'),
(3,'moon'),
(4,'light'),
(5,'earth'),
(6,'revolves'),
(7,'flow'),
(8,'fire'),
(9,'fighter'),
(10,'sun'),
(10,'shine'),
(11,'shine'),
(12,'moon'),
(1,'revolves'),
(10,'revolves'),
(2,'air'),
(3,'shine'),
(4,'fire'),
(5,'love'),
(6,'sun'),
(7,'rises') ;
Since this is a known list, set the "target" data up as its own set. (In SQL, tables are almost invariably better to work with than demented lists. Oops, typo! I meant delimited lists.)
DECLARE @Targets AS TABLE (
l2 varchar(20)
) ;
INSERT INTO @Targets VALUES
('sun'),
('moon'),
('earth') ;
OPERATION 1
fetch all distinct ID's that have values from List 1
(sun, moon, earth)
Easy enough with a join:
SELECT Id
from @Table1 t1
inner join @Targets tg
on tg.l2 = t1.l1
OPERATION 2
fetch all the id, count_of_list2_values, list2_values
based on the id's that we received from OPERATION1
If I'm following the desired logic correctly, then (read the "join" comments first):
SELECT
tt.Id
-- This next counts how many items in the Operation 1 list are not in the target list
-- (Spaced out, to make it easier to compare with the next line)
,sum( case when tg2.l2 is null then 1 else 0 end)
-- And this concatenates them together in a string (in SQL Server 2017 and later)
,string_agg(case when tg2.l2 is null then tt.l1 else null end, ', ')
from @Table1 tt
inner join (-- Operation 1 as a subquery, produce list of the Ids to work with
select t1.id
from @Table1 t1
inner join @Targets tg
on tg.l2 = t1.l1
) xx
on xx.id = tt.id
-- This is used to identify the target values vs. the non-target values
left outer join @Targets tg2
on tg2.l2 = tt.l1
-- Aggregate, because that's what we need to do
group by tt.Id
-- Order it, because why not?
order by tt.Id
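As a side note (my own sketch, not part of the original answer): if the count should cover only the List 2 values ('shine', 'revolves') rather than everything outside the target list, List 2 can be set up as its own table variable and joined the same way; against the sample data this matches the expected output in the question.
DECLARE @List2 AS TABLE (
l2 varchar(20)
) ;
INSERT INTO @List2 VALUES
('shine'),
('revolves') ;
SELECT
tt.Id
-- Count only the List 2 values for each qualifying id
,sum(case when l2.l2 is null then 0 else 1 end) as count_of_list2_values
-- STRING_AGG ignores NULLs, so non-List-2 values simply drop out
,string_agg(case when l2.l2 is not null then tt.l1 end, ', ') as list2_values
from @Table1 tt
inner join (-- Operation 1: ids that have a List 1 value
select distinct t1.id
from @Table1 t1
inner join @Targets tg
on tg.l2 = t1.l1
) xx
on xx.id = tt.id
left outer join @List2 l2
on l2.l2 = tt.l1
group by tt.Id
order by tt.Id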

If you're using SQL Server 2017, then you can use the STRING_AGG function and the OUTER APPLY operator:
select
l1.id,
l1.l1,
l2.cnt as count_of_list2_values,
l2.l1 as list2_values
from @Table1 as l1
outer apply (
select
count(*) as cnt,
string_agg(tt.l1, ',') as l1
from @Table1 as tt
where
tt.l1 in ('shine','revolves') and
tt.id = l1.id
) as l2
where
l1.l1 in ('sun','moon','earth')
db fiddle demo
In previous versions, I'm not sure it's possible to aggregate and count in one pass without creating a special function for it. You can, of course, do it like this with XQuery, but it might be overkill (I wouldn't do this in production code, at least):
select
l1.id,
l1.l1,
l2.data.value('count(l1)', 'int'),
stuff(l2.data.query('for $i in l1 return concat(",",$i/text()[1])').value('.','nvarchar(max)'),1,1,'')
from @Table1 as l1
outer apply (
select
tt.l1
from @Table1 as tt
where
tt.l1 in ('shine','revolves') and
tt.id = l1.id
for xml path(''), type
) as l2(data)
where
l1.l1 in ('sun','moon','earth')
db fiddle demo
If you don't mind doing a double scan/seek of the table, then you can either use @forpas' answer or do something like this:
with cte_list2 as (
select tt.l1, tt.id
from @Table1 as tt
where
tt.l1 in ('shine','revolves')
)
select
l1.id,
l1.l1,
l22.cnt as count_of_list2_values,
stuff(l21.data.value('.', 'nvarchar(max)'),1,1,'') as list2_values
from @Table1 as l1
outer apply (
select
',' + tt.l1
from cte_list2 as tt
where
tt.id = l1.id
for xml path(''), type
) as l21(data)
outer apply (
select count(*) as cnt
from cte_list2 as tt
where
tt.id = l1.id
) as l22(cnt)
where
l1.l1 in ('sun','moon','earth')

With this:
with
cte as(
select t1.id, t2.l1
from table1 t1 left join (
select * from table1 where l1 in ('shine','revolves')
) t2 on t2.id = t1.id
where t1.l1 in ('sun','moon','earth')
),
cte1 as(
select
c.id,
stuff(( select ',' + cte.l1 from cte where id = c.id for xml path(''), type).value('.', 'NVARCHAR(MAX)'), 1, 1, '') col
from cte c
)
select
id,
count(col) count_of_list2_values,
max(col) list2_values
from cte1
group by id
The 1st CTE gives these results:
id | l1
-: | :-------
1 | revolves
3 | shine
5 | null
10 | shine
10 | revolves
12 | null
6 | revolves
and the 2nd operates on these results to concatenate the grouped values of l1.
Finally, I group by id and aggregate the results of the 2nd CTE.
See the demo
Results:
id | count_of_list2_values | list2_values
-: | --------------------: | :-------------
1 | 1 | revolves
3 | 1 | shine
5 | 0 | null
6 | 1 | revolves
10 | 2 | shine,revolves
12 | 0 | null

Related

Redshift split single dynamic column into multiple rows in new table

With a table like:
uid | segmentids
-------------------------+----------------------------------------
f9b6d54b-c646-4bbb-b0ec | 4454918|4455158|4455638|4455878|4455998
asd7a0s9-c646-asd7-b0ec | 1265899|1265923|1265935|1266826|1266596
gd3355ff-cjr8-assa-fke0 | 2237557|2237581|2237593
laksnfo3-kgi5-fke0-b0ec | 4454918|4455158|4455638|4455878
How to create a new table with:
uid | segmentids
-------------------------+---------------------------
f9b6d54b-c646-4bbb-b0ec | 4454918
f9b6d54b-c646-4bbb-b0ec | 1265899
f9b6d54b-c646-4bbb-b0ec | 2237557
f9b6d54b-c646-4bbb-b0ec | 4454918
f9b6d54b-c646-4bbb-b0ec | 4454918
asd7a0s9-c646-asd7-b0ec | 1265899
asd7a0s9-c646-asd7-b0ec | 1265923
asd7a0s9-c646-asd7-b0ec | 1265935
asd7a0s9-c646-asd7-b0ec | 1266826
asd7a0s9-c646-asd7-b0ec | 1266596
The number of segments is dynamic and can vary with each record.
I tried the split function with the delimiter, but it requires the index into the string, which is dynamic here.
Any suggestions?
Here is the Redshift answer; it will work with up to 10,000 segment id values per row.
test data
create table test_split (uid varchar(50),segmentids varchar(max));
insert into test_split
values
('f9b6d54b-c646-4bbb-b0ec','4454918|4455158|4455638|4455878|4455998'),
('asd7a0s9-c646-asd7-b0ec','1265899|1265923|1265935|1266826|1266596'),
('asd7345s9-c646-asd7-b0ec','1235935|1263456|1265675696'),
('as345a0s9-c646-asd7-b0ec','12765899|12658883|12777935|144466826|1266226|12345')
;
code
with ten_numbers as (select 1 as num union select 2 union select 3 union select 4 union select 5 union select 6 union select 7 union select 8 union select 9 union select 0)
, generated_numbers AS
(
SELECT (1000 * t1.num) + (100 * t2.num) + (10 * t3.num) + t4.num AS gen_num
FROM ten_numbers AS t1
JOIN ten_numbers AS t2 ON 1 = 1
JOIN ten_numbers AS t3 ON 1 = 1
JOIN ten_numbers AS t4 ON 1 = 1
)
, splitter AS
(
SELECT *
FROM generated_numbers
WHERE gen_num BETWEEN 1 AND (SELECT max(REGEXP_COUNT(segmentids, '\\|') + 1)
FROM test_split)
)
--select * from splitter;
, expanded_input AS
(
SELECT
uid,
split_part(segmentids, '|', s.gen_num) AS segment
FROM test_split AS ts
JOIN splitter AS s ON 1 = 1
WHERE split_part(segmentids, '|', s.gen_num) <> ''
)
SELECT * FROM expanded_input;
The first 2 CTE steps (ten_numbers and generated_numbers) are used to generate rows of numbers; this is needed because generate_series is not supported.
The next step (splitter) just takes a number of rows equal to the max number of delimiters + 1 (which is the max number of segments).
Finally, we cross join splitter with the input data, take the related value using split_part, and then exclude blank parts (which occur when a row has fewer than the max number of segments).
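For reference, a quick illustration (mine, not part of the answer) of how split_part behaves here: the index is 1-based, and an index past the last element returns an empty string, which is why the blank parts are excluded above.
select
split_part('4454918|4455158|4455638', '|', 2) as second_segment, -- returns '4455158'
split_part('4454918|4455158|4455638', '|', 9) as past_the_end;   -- returns '' (empty string)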
You can iterate over the SUPER array returned by split_to_array -- see the "Unnesting and flattening" section of this post. Using the same test_split table as the previous answer:
WITH seg_array AS
(SELECT uid,
split_to_array(segmentids, '|') segs
FROM test_split)
SELECT uid,
segmentid::int
FROM seg_array a,
a.segs AS segmentid;
Redshift now has the SUPER data type and the split_to_array function, which is similar to PostgreSQL's string_to_array.
Redshift now also supports unnesting arrays through a syntax similar to a LATERAL JOIN in PostgreSQL.
Using these techniques, we may write the same transformation in 2022 as:
WITH split_up AS (
SELECT
uid
, split_to_array(segmentids, '|') segment_array
FROM test_split
)
SELECT
su.uid
, CAST(sid AS VARCHAR) segmentid
FROM split_up su
JOIN su.segment_array sid ON TRUE

SQL Server 2008 - Get the bottom most linked record

I have the following data in my SQL Server database:
Id | Name | LinkedId
---+------+----------
1 | A | 1
2 | B | 2
3 | C | 1
4 | D | 3
5 | E | 4
Now I want to write a stored procedure in which the following records should be shown:
Note: LinkedId has the Id that is associated with that name.
For example: "C" is associated with "A"
Id | Name | LinkedId
---+------+---------
1 | A | 1
2 | B | 2
3 | C | 1
4 | D | 1 // here, instead of 3, it shows 1, which is the bottom-most value in the tree
5 | E | 1 // same case as the above
PROBLEM:
For this scenario, with my limited knowledge, I can only think of using JOINs (LEFT, INNER), but that won't be enough in this case to get the bottom-most linked id.
EDIT (OUTPUT):
I want all the items associated (directly and indirectly) with item "C"
Id | Name |
---+------+
3 | C |
4 | D |
5 | E |
You could use a recursive CTE.
A simple explanation: a recursive CTE is a common table expression that references itself while calculating. It includes the following parts (see the minimal sketch after the list):
Invocation of the routine.
The first invocation of the recursive CTE consists of one or more CTE_query_definitions joined by UNION ALL, UNION, EXCEPT, or INTERSECT operators. Because these query definitions form the base result set of the CTE structure, they are referred to as anchor members. CTE_query_definitions are considered anchor members unless they reference the CTE itself. All anchor-member query definitions must be positioned before the first recursive member definition, and a UNION ALL operator must be used to join the last anchor member with the first recursive member.
Recursive invocation of the routine.
The recursive invocation includes one or more CTE_query_definitions joined by UNION ALL operators that reference the CTE itself. These query definitions are referred to as recursive members.
Termination check.
The termination check is implicit; recursion stops when no rows are returned from the previous invocation.
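As a minimal sketch of those three parts (generic, not specific to this question), here is a recursive CTE that simply counts from 1 to 5:
WITH Counter AS
(
    SELECT 1 AS n       -- anchor member
    UNION ALL
    SELECT n + 1        -- recursive member: references Counter itself
    FROM Counter
    WHERE n < 5         -- implicit termination: no rows are returned once n reaches 5
)
SELECT n FROM Counter;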
Reference link: Recursive query using CTE
Simple Example of Recursive CTE
Cte sql server
DECLARE @SampleData AS TABLE (Id int, Name varchar(10), LinkedId int)
INSERT INTO @SampleData
VALUES (1, 'A', 1), (2, 'B', 2),
(3, 'C', 1),(4, 'D', 3),(5, 'E', 4)
;WITH temp AS
(
SELECT sd.Id, sd.Name, sd.Id AS RootId
FROM @SampleData sd WHERE sd.LinkedId = sd.Id -- Invocation of the routine, in this case it's the root node of the tree.
UNION ALL
-- Recursive invocation of the routine
SELECT sd.Id, sd.Name, t.RootId AS RootId
FROM temp t
INNER JOIN @SampleData sd ON sd.LinkedId = t.Id AND sd.LinkedId <> sd.Id
-- Termination check: sd.LinkedId = t.Id AND sd.LinkedId <> sd.Id.
-- It keeps the recursive query from becoming an infinite loop
)
SELECT t.Id, t.Name, t.RootId AS LinkedId
FROM temp t
OPTION (MAXRECURSION 0) -- this option removes the recursion depth limit; the default is 100.
Demo link: Rextester
For the new output, you could change the first invocation (anchor member) of the recursive CTE:
;WITH temp AS
(
SELECT sd.Id, sd.Name, sd.Id AS RootId
FROM @SampleData sd WHERE sd.Id = 3
UNION ALL
-- Recursive invocation of the routine
SELECT sd.Id, sd.Name, t.RootId AS RootId
FROM temp t
INNER JOIN @SampleData sd ON sd.LinkedId = t.Id AND sd.LinkedId <> sd.Id
-- Termination check: sd.LinkedId = t.Id AND sd.LinkedId <> sd.Id.
-- It keeps the recursive query from becoming an infinite loop
)
SELECT t.Id, t.Name, t.RootId AS LinkedId
FROM temp t
OPTION (MAXRECURSION 0) -- this option removes the recursion depth limit; the default is 100.
You want a recursive CTE:
with cte as (
select t.id, t.name, t.linkedid, 1 as lev
from t
where t.linkedid = t.id
union all
select t.id, t.name, cte.linkedid, cte.lev + 1
from t join
cte
on cte.id = t.linkedid and t.linkedid <> t.id
)
select id, name, linkedid
from cte;
Here it is working in practice.

How to synthesize attribute for joined tables

I have a view defined like this:
CREATE VIEW [dbo].[PossiblyMatchingContracts] AS
SELECT
C.UniqueID,
CC.UniqueID AS PossiblyMatchingContracts
FROM [dbo].AllContracts AS C
INNER JOIN [dbo].AllContracts AS CC
ON C.SecondaryMatchCodeFB = CC.SecondaryMatchCodeFB
OR C.SecondaryMatchCodeLB = CC.SecondaryMatchCodeLB
OR C.SecondaryMatchCodeBB = CC.SecondaryMatchCodeBB
OR C.SecondaryMatchCodeLB = CC.SecondaryMatchCodeBB
OR C.SecondaryMatchCodeBB = CC.SecondaryMatchCodeLB
WHERE C.UniqueID NOT IN
(
SELECT UniqueID FROM [dbo].DefinitiveMatches
)
AND C.AssociatedUser IS NULL
AND C.UniqueID <> CC.UniqueID
This basically finds contracts where, for example, the first name and the birthday match. This works great. Now I want to add a synthetic attribute to each row with the value from only one source row.
Let me give you an example to make it clearer. Suppose I have the following table:
UniqueID | FirstName | LastName | Birthday
1 | Peter | Smith | 1980-11-04
2 | Peter | Gray | 1980-11-04
3 | Peter | Gray-Smith| 1980-11-04
4 | Frank | May | 1985-06-09
5 | Frank-Paul| May | 1985-06-09
6 | Gina | Ericson | 1950-11-04
The resulting view should look like this:
UniqueID | PossiblyMatchingContracts | SyntheticID
1 | 2 | PeterSmith1980-11-04
1 | 3 | PeterSmith1980-11-04
2 | 1 | PeterSmith1980-11-04
2 | 3 | PeterSmith1980-11-04
3 | 1 | PeterSmith1980-11-04
3 | 2 | PeterSmith1980-11-04
4 | 5 | FrankMay1985-06-09
5 | 4 | FrankMay1985-06-09
6 | NULL | NULL [or] GinaEricson1950-11-04
Notice that the SyntheticID column uses ONLY values from one of the matching source rows. It doesn't matter which one. I am exporting this view to another application and need to be able to identify each "match group" afterwards.
Is it clear what I mean? Any ideas how this could be done in SQL?
Maybe it helps to elaborate a bit on the actual use case:
I am importing contracts from different systems. To account for the possibility of typos or people that have married but the last name was only updated in one system, I need to find so called 'possible matches'. Two or more contracts are considered a possible match if they contain the same birthday plus the same first, last or birth name. That implies, that if contract A matches contract B, contract B also matches contract A.
The target system uses multivalue reference attributes to store these relationships. The ultimate goal is to create user objects for these contracts. The first catch is that there shall be only one user object for multiple matching contracts; thus I'm creating these matches in the view. The second catch is that the creation of user objects happens via workflows, which run in parallel for each contract. To avoid creating multiple user objects for matching contracts, each workflow needs to check if there is already a matching user object, or another workflow that is about to create said user object. Because the workflow engine is extremely slow compared to SQL, the workflows should not repeat the whole matching test. So the idea is to let the workflow check only the 'SyntheticID'.
I have solved it with a multi step approach:
Create the list of possible 1st level matches for each contract
Create the base groups list, assigning a different group for each contract (as if they were not related to anybody)
Iterate the matches list updating the group list when more contracts need to
be added to a group
Recursively build up the SyntheticID from final group list
Output results
First of all, let me explain what I have understood, so you can tell if my approach is correct or not.
1) matching propagates in "cascade"
I mean, if "Peter Smith" is grouped with "Peter Gray", it means that all Smiths and all Grays are related (if they have the same birth date), so Luke Smith can be in the same group as John Gray.
2) I have not understood what you mean by "birth name"
You say contracts match on "first, last or birth name". Sorry, I'm Italian; I thought birth name and first name were the same, and there is no such column in your data. Maybe it is related to the dash symbol between names?
When FirstName is Frank-Paul, does it mean it should match both Frank and Paul?
When LastName is Gray-Smith, does it mean it should match both Gray and Smith?
In the following code I have simply ignored this problem, but it could be handled if needed (I already gave it a try, breaking the names apart, unpivoting them and treating them as a double match; see the sketch below).
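A sketch of that idea (mine, and it assumes SQL Server 2016+ is available for STRING_SPLIT; it uses the @cli table variable declared in Step Zero below):
-- Break hyphenated first/last names into parts and unpivot them into one
-- (UniqueID, Birthday, NamePart) list that could then be matched on as a "double match".
select c.UniqueID, c.Birthday, parts.value as NamePart
from @cli as c
cross apply (select c.FirstName as n union all select c.LastName) as u
cross apply string_split(u.n, '-') as parts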
Step Zero: some declaration and prepare base data
declare @cli as table (UniqueID int primary key, FirstName varchar(20), LastName varchar(20), Birthday varchar(20))
declare @comb as table (id1 int, id2 int, done bit)
declare @grp as table (ix int identity primary key, grp int, id int, unique (grp,ix))
declare @str_id as table (grp int primary key, SyntheticID varchar(1000))
declare @id1 as int, @g int
;with
t as (
select *
from (values
(1 , 'Peter' , 'Smith' , '1980-11-04'),
(2 , 'Peter' , 'Gray' , '1980-11-04'),
(3 , 'Peter' , 'Gray-Smith', '1980-11-04'),
(4 , 'Frank' , 'May' , '1985-06-09'),
(5 , 'Frank-Paul', 'May' , '1985-06-09'),
(6 , 'Gina' , 'Ericson' , '1950-11-04')
) x (UniqueID , FirstName , LastName , Birthday)
)
insert into @cli
select * from t
Step One: Create the list of possible 1st level matches for each contract
;with
p as(select UniqueID, Birthday, FirstName, LastName from @cli),
m as (
select p.UniqueID UniqueID1, p.FirstName FirstName1, p.LastName LastName1, p.Birthday Birthday1, pp.UniqueID UniqueID2, pp.FirstName FirstName2, pp.LastName LastName2, pp.Birthday Birthday2
from p
join p pp on (pp.Birthday=p.Birthday) and (pp.FirstName = p.FirstName or pp.LastName = p.LastName)
where p.UniqueID<=pp.UniqueID
)
insert into @comb
select UniqueID1,UniqueID2,0
from m
Step Two: Create the base groups list
insert into @grp
select ROW_NUMBER() over(order by id1), id1 from @comb where id1=id2
Step Three: Iterate the matches list updating the group list
Only loop on contracts that have possible matches and updates only if needed
set @id1 = 0
while not(@id1 is null) begin
set @id1 = (select top 1 id1 from @comb where id1<>id2 and done=0)
if not(@id1 is null) begin
set @g = (select grp from @grp where id=@id1)
update g set grp= @g
from @grp g
inner join @comb c on g.id = c.id2
where c.id2<>@id1 and c.id1=@id1
and grp<>@g
update @comb set done=1 where id1=@id1
end
end
Step Four: Build up the SyntheticID
Recursively add ALL (distinct) first and last names of the group to the SyntheticID.
I used '_' as separator for birth date, first names and last names, and ',' as separator for the list of names to avoid conflicts.
;with
c as(
select c.*, g.grp
from @cli c
join @grp g on g.id = c.UniqueID
),
d as (
select *, row_number() over (partition by g order by t,s) n1, row_number() over (partition by g order by t desc,s desc) n2
from (
select distinct c.grp g, 1 t, FirstName s from c
union
select distinct c.grp, 2, LastName from c
) l
),
r as (
select d.*, cast(CONVERT(VARCHAR(10), t.Birthday, 112) + '_' + s as varchar(1000)) Names, cast(0 as bigint) i1, cast(0 as bigint) i2
from d
join @cli t on t.UniqueID=d.g
where n1=1
union all
select d.*, cast(r.names + IIF(r.t<>d.t,'_',',') + d.s as varchar(1000)), r.n1, r.n2
from d
join r on r.g = d.g and r.n1=d.n1-1
)
insert into @str_id
select g, Names
from r
where n2=1
Step Five: Output results
select c.UniqueID, case when id2=UniqueID then id1 else id2 end PossibleMatchingContract, s.SyntheticID
from @cli c
left join @comb cb on c.UniqueID in(id1,id2) and id1<>id2
left join @grp g on c.UniqueID = g.id
left join @str_id s on s.grp = g.grp
Here are the results:
UniqueID PossibleMatchingContract SyntheticID
1 2 1980-11-04_Peter_Gray,Gray-Smith,Smith
1 3 1980-11-04_Peter_Gray,Gray-Smith,Smith
2 1 1980-11-04_Peter_Gray,Gray-Smith,Smith
2 3 1980-11-04_Peter_Gray,Gray-Smith,Smith
3 1 1980-11-04_Peter_Gray,Gray-Smith,Smith
3 2 1980-11-04_Peter_Gray,Gray-Smith,Smith
4 5 1985-06-09_Frank,Frank-Paul_May
5 4 1985-06-09_Frank,Frank-Paul_May
6 NULL 1950-11-04_Gina_Ericson
I think that this way the resulting SyntheticID should also be "unique" for each group.
This creates a synthetic value and is easy to change to suit your needs.
DECLARE @T TABLE (
UniqueID INT
,FirstName VARCHAR(200)
,LastName VARCHAR(200)
,Birthday DATE
)
INSERT INTO @T(UniqueID,FirstName,LastName,Birthday) SELECT 1,'Peter','Smith','1980-11-04'
INSERT INTO @T(UniqueID,FirstName,LastName,Birthday) SELECT 2,'Peter','Gray','1980-11-04'
INSERT INTO @T(UniqueID,FirstName,LastName,Birthday) SELECT 3,'Peter','Gray-Smith','1980-11-04'
INSERT INTO @T(UniqueID,FirstName,LastName,Birthday) SELECT 4,'Frank','May','1985-06-09'
INSERT INTO @T(UniqueID,FirstName,LastName,Birthday) SELECT 5,'Frank-Paul','May','1985-06-09'
INSERT INTO @T(UniqueID,FirstName,LastName,Birthday) SELECT 6,'Gina','Ericson','1950-11-04'
DECLARE @PossibleMatches TABLE (UniqueID INT,[PossibleMatch] INT,SynKey VARCHAR(2000)
)
INSERT INTO @PossibleMatches
SELECT t1.UniqueID [UniqueID],t2.UniqueID [Possible Matches],'Ln=' + t1.LastName + ' Fn=' + t1.FirstName + ' DoB=' + CONVERT(VARCHAR,t1.Birthday,102) [SynKey]
FROM @T t1
INNER JOIN @T t2 ON t1.Birthday=t2.Birthday
AND t1.FirstName=t2.FirstName
AND t1.LastName=t2.LastName
AND t1.UniqueID<>t2.UniqueID
INSERT INTO @PossibleMatches
SELECT t1.UniqueID [UniqueID],t2.UniqueID [Possible Matches],'Fn=' + t1.FirstName + ' DoB=' + CONVERT(VARCHAR,t1.Birthday,102) [SynKey]
FROM @T t1
INNER JOIN @T t2 ON t1.Birthday=t2.Birthday
AND t1.FirstName=t2.FirstName
AND t1.UniqueID<>t2.UniqueID
INSERT INTO @PossibleMatches
SELECT t1.UniqueID,t2.UniqueID,'Ln=' + t1.LastName + ' DoB=' + CONVERT(VARCHAR,t1.Birthday,102) [SynKey]
FROM @T t1
INNER JOIN @T t2 ON t1.Birthday=t2.Birthday
AND t1.LastName=t2.LastName
AND t1.UniqueID<>t2.UniqueID
INSERT INTO @PossibleMatches
SELECT t1.UniqueID,pm.UniqueID,'Ln=' + t1.LastName + ' Fn=' + t1.FirstName + ' DoB=' + CONVERT(VARCHAR,t1.Birthday,102) [SynKey]
FROM @T t1
LEFT JOIN @PossibleMatches pm on pm.UniqueID=t1.UniqueID
WHERE pm.UniqueID IS NULL
SELECT *
FROM @PossibleMatches
ORDER BY UniqueID,[PossibleMatch]
I think this will work for you
SELECT
C.UniqueID,
CC.UniqueID AS PossiblyMatchingContracts,
FIRST_VALUE(CC.FirstName+CC.LastName+CC.Birthday)
OVER (PARTITION BY C.UniqueID ORDER BY CC.UniqueID) as SyntheticID
FROM
[dbo].AllContracts AS C INNER JOIN
[dbo].AllContracts AS CC ON
C.SecondaryMatchCodeFB = CC.SecondaryMatchCodeFB OR
C.SecondaryMatchCodeLB = CC.SecondaryMatchCodeLB OR
C.SecondaryMatchCodeBB = CC.SecondaryMatchCodeBB OR
C.SecondaryMatchCodeLB = CC.SecondaryMatchCodeBB OR
C.SecondaryMatchCodeBB = CC.SecondaryMatchCodeLB
WHERE
C.UniqueID NOT IN(
SELECT UniqueID FROM [dbo].DefinitiveMatches)
AND C.AssociatedUser IS NULL
You can try this:
SELECT
C.UniqueID,
CC.UniqueID AS PossiblyMatchingContracts,
FIRST_VALUE(CC.FirstName+CC.LastName+CC.Birthday)
OVER (PARTITION BY C.UniqueID ORDER BY CC.UniqueID) as SyntheticID
FROM
[dbo].AllContracts AS C
INNER JOIN
[dbo].AllContracts AS CC
ON
C.SecondaryMatchCodeFB = CC.SecondaryMatchCodeFB
OR
C.SecondaryMatchCodeLB = CC.SecondaryMatchCodeLB
OR
C.SecondaryMatchCodeBB = CC.SecondaryMatchCodeBB
OR
C.SecondaryMatchCodeLB = CC.SecondaryMatchCodeBB
OR
C.SecondaryMatchCodeBB = CC.SecondaryMatchCodeLB
WHERE
C.UniqueID NOT IN
(
SELECT UniqueID FROM [dbo].DefinitiveMatches
)
AND
C.AssociatedUser IS NULL
This will generate one extra row (because we left out C.UniqueID <> CC.UniqueID) but will give you the right solution.
Following is an example with some sample data extracted from your original post. The idea: generate all SyntheticIDs in a CTE, query all records with a "PossibleMatch" and union them with all records which are not yet included:
DECLARE @t TABLE(
UniqueID int
,FirstName nvarchar(20)
,LastName nvarchar(20)
,Birthday datetime
)
INSERT INTO @t VALUES (1, 'Peter', 'Smith', '1980-11-04');
INSERT INTO @t VALUES (2, 'Peter', 'Gray', '1980-11-04');
INSERT INTO @t VALUES (3, 'Peter', 'Gray-Smith', '1980-11-04');
INSERT INTO @t VALUES (4, 'Frank', 'May', '1985-06-09');
INSERT INTO @t VALUES (5, 'Frank-Paul', 'May', '1985-06-09');
INSERT INTO @t VALUES (6, 'Gina', 'Ericson', '1950-11-04');
WITH ctePrep AS(
SELECT UniqueID, FirstName, LastName, BirthDay,
ROW_NUMBER() OVER (PARTITION BY FirstName, BirthDay ORDER BY FirstName, BirthDay) AS k,
FirstName+LastName+CONVERT(nvarchar(10), Birthday, 126) AS SyntheticID
FROM @t
),
cteKeys AS(
SELECT FirstName, BirthDay, SyntheticID
FROM ctePrep
WHERE k = 1
),
cteFiltered AS(
SELECT
C.UniqueID,
CC.UniqueID AS PossiblyMatchingContracts,
keys.SyntheticID
FROM @t AS C
JOIN @t AS CC ON C.FirstName = CC.FirstName
AND C.Birthday = CC.Birthday
JOIN cteKeys AS keys ON keys.FirstName = c.FirstName
AND keys.Birthday = c.Birthday
WHERE C.UniqueID <> CC.UniqueID
)
SELECT UniqueID, PossiblyMatchingContracts, SyntheticID
FROM cteFiltered
UNION ALL
SELECT UniqueID, NULL, FirstName+LastName+CONVERT(nvarchar(10), Birthday, 126) AS SyntheticID
FROM @t
WHERE UniqueID NOT IN (SELECT UniqueID FROM cteFiltered)
Hope this helps. The result looked OK to me:
UniqueID PossiblyMatchingContracts SyntheticID
---------------------------------------------------------------
2 1 PeterSmith1980-11-04
3 1 PeterSmith1980-11-04
1 2 PeterSmith1980-11-04
3 2 PeterSmith1980-11-04
1 3 PeterSmith1980-11-04
2 3 PeterSmith1980-11-04
4 NULL FrankMay1985-06-09
5 NULL Frank-PaulMay1985-06-09
6 NULL GinaEricson1950-11-04
Tested in SSMS, it works perfectly. :)
--create table structure
create table #temp
(
uniqueID int,
firstname varchar(15),
lastname varchar(15),
birthday date
)
--insert data into the table
insert #temp
select 1, 'peter','smith','1980-11-04'
union all
select 2, 'peter','gray','1980-11-04'
union all
select 3, 'peter','gray-smith','1980-11-04'
union all
select 4, 'frank','may','1985-06-09'
union all
select 5, 'frank-paul','may','1985-06-09'
union all
select 6, 'gina','ericson','1950-11-04'
select * from #temp
--solution is as below
select ab.uniqueID
, PossiblyMatchingContracts
, c.firstname+c.lastname+cast(c.birthday as varchar) as synID
from
(
select a.uniqueID
, case
when a.uniqueID < min(b.uniqueID)over(partition by a.uniqueid)
then a.uniqueID
else min(b.uniqueID)over(partition by a.uniqueid)
end as SmallestID
, b.uniqueID as PossiblyMatchingContracts
from #temp a
left join #temp b
on (a.firstname = b.firstname OR a.lastname = b.lastname) AND a.birthday = b.birthday AND a.uniqueid <> b.uniqueID
) as ab
left join #temp c
on ab.SmallestID = c.uniqueID
Result capture is attached below:
Say we have the following table (a VIEW in your case):
UniqueID PossiblyMatchingContracts SyntheticID
1 2 G1
1 3 G2
2 1 G3
2 3 G4
3 1 G4
3 4 G6
4 5 G7
5 4 G8
6 NULL G9
In your case you can set the initial SyntheticID as a string like PeterSmith1980-11-04, using the UniqueID for each line. Here is a recursive CTE query: it divides all lines into unconnected groups and selects MAX(SyntheticID) in the current group as the new SyntheticID for all lines in that group.
WITH CTE AS
(
SELECT CAST(','+CAST(UniqueID AS Varchar(100)) +','+ CAST(PossiblyMatchingContracts as Varchar(100))+',' as Varchar(MAX)) as GroupCont,
SyntheticID
FROM PossiblyMatchingContracts
UNION ALL
SELECT CAST(GroupCont+CAST(UniqueID AS Varchar(100)) +','+ CAST(PossiblyMatchingContracts as Varchar(100))+',' AS Varchar(MAX)) as GroupCont,
pm.SyntheticID
FROM CTE
JOIN PossiblyMatchingContracts as pm
ON
(
CTE.GroupCont LIKE '%,'+CAST(pm.UniqueID AS Varchar(100))+',%'
OR
CTE.GroupCont LIKE '%,'+CAST(pm.PossiblyMatchingContracts AS Varchar(100))+',%'
)
AND NOT
(
CTE.GroupCont LIKE '%,'+CAST(pm.UniqueID AS Varchar(100))+',%'
AND
CTE.GroupCont LIKE '%,'+CAST(pm.PossiblyMatchingContracts AS Varchar(100))+',%'
)
)
SELECT pm.UniqueID,
pm.PossiblyMatchingContracts,
ISNULL(
(SELECT MAX(SyntheticID) FROM CTE WHERE
(
CTE.GroupCont LIKE '%,'+CAST(pm.UniqueID AS Varchar(100))+',%'
OR
CTE.GroupCont LIKE '%,'+CAST(pm.PossiblyMatchingContracts AS Varchar(100))+',%'
))
,pm.SyntheticID) as SyntheticID
FROM PossiblyMatchingContracts pm

Change an iterative query to a relational set-based query

SQL Fiddle
I'm trying, without success, to change an iterative/cursor query (that is working fine) into a relational set-based query to achieve better performance.
What I have:
table1
| ID | NAME |
|----|------|
| 1 | A |
| 2 | B |
| 3 | C |
Using a function, I want to insert my data into another table. The following function is a simplified example:
Function
CREATE FUNCTION fn_myExampleFunction
(
@input nvarchar(50)
)
RETURNS @ret_table TABLE
(
output nvarchar(50)
)
AS
BEGIN
IF @input = 'A'
INSERT INTO @ret_table VALUES ('Alice')
ELSE IF @input = 'B'
INSERT INTO @ret_table VALUES ('Bob')
ELSE
INSERT INTO @ret_table VALUES ('Foo'), ('Bar')
RETURN
END;
My expected result is to insert data in table2 like the following:
table2
| ID | NAME |
|----|-------|
| 1 | Alice |
| 2 | Bob |
| 3 | Foo |
| 3 | Bar |
To achieve this, I've tried some CTEs (common table expressions) and relational queries, but none worked as desired. The only working solution I've got so far is an iterative one that performs poorly.
My current working solution:
BEGIN
DECLARE
@ID int,
@i int = 0,
@max int = (SELECT COUNT(name) FROM table1)
WHILE ( @i < @max ) -- In this example, it will iterate 3 times
BEGIN
SET @i += 1
-- Select table1.ID where row_number() = @i
SET @ID =
(SELECT
id
FROM
(SELECT
id,
ROW_NUMBER() OVER (ORDER BY id) as rn
FROM
table1) rows
WHERE
rows.rn = @i
)
-- Insert into table2 one or more rows related with table1.ID
INSERT INTO table2
(id, name)
SELECT
@ID,
fn_result.output
FROM
fn_myExampleFunction (
(SELECT name FROM table1 WHERE id = @ID)
) fn_result
END
END
The objective is to achieve the same without iterating through the IDs.
If the question is about how to apply a function in a set-oriented way, then CROSS APPLY (or OUTER APPLY) is your friend:
insert into table2 (
id, name
) select
t1.id,
t2.output
from
table1 t1
cross apply
fn_myExampleFunction(t1.name) t2
Example SQLFiddle
If the non-simplified version of your function is amenable to rewriting, the other solutions will likely be faster.
A query like this will do what you want:
insert into table2(id, name)
select id, (case when name = 'A' then 'Alice'
when name = 'B' then 'Bob'
when name = 'C' then 'Foo'
end)
from table1
union all
select id, 'Bar'
from table1
where name = 'C';
Why wouldn't you store this data as a table? It's relational. Coding it in a function or stored procedure seems less than ideal.
In any case, I hope the following gives you ideas about how to improve your code. I realize that you said your function is more complicated than your example, but you can still use this idea even inside of the function as necessary.
INSERT dbo.table2 (ID, Name)
SELECT
T1.ID,
N.FullName
FROM
dbo.table1 T1
INNER JOIN (VALUES -- A "derived table" made up of only constants
('A', 'Alice'),
('B', 'Bob'),
('C', 'Foo'),
('C', 'Bar')
) N (ShortName, FullName)
ON T1.Name = N.ShortName
;
But of course, that could just be rendered INNER JOIN dbo.NameTranslation N if it were in a real table (and then updating it would be so much easier!).
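For illustration, a minimal sketch of that real-table version (the dbo.NameTranslation table and its contents are assumptions that simply mirror the derived table above):
CREATE TABLE dbo.NameTranslation (
    ShortName VARCHAR(200) NOT NULL,
    FullName  VARCHAR(200) NOT NULL,
    CONSTRAINT PK_NameTranslation PRIMARY KEY (ShortName, FullName)
);
INSERT dbo.NameTranslation (ShortName, FullName)
VALUES ('A', 'Alice'), ('B', 'Bob'), ('C', 'Foo'), ('C', 'Bar');
-- The insert then becomes a plain table join:
INSERT dbo.table2 (ID, Name)
SELECT T1.ID, N.FullName
FROM dbo.table1 T1
INNER JOIN dbo.NameTranslation N
    ON T1.Name = N.ShortName;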
If your function absolutely can't be rewritten to be relational (it must take a single name at a time) then you would use CROSS APPLY:
INSERT dbo.table2 (ID, Name)
SELECT
T1.ID,
N.OutputName
FROM
dbo.table1 T1
CROSS APPLY dbo.YourFunction(T1.Name) F
;
However, this will not perform very well for large rowsets. Rewriting the function as an inline table-valued function (RETURNS TABLE) instead of a multi-statement one (RETURNS @variable TABLE (definition)) is a step in the right direction.
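A hedged sketch of that rewrite for the simplified example function (the fn_myExampleFunction_inline name is mine, and it assumes the real logic can be expressed as a single query):
CREATE FUNCTION dbo.fn_myExampleFunction_inline
(
    @input nvarchar(50)
)
RETURNS TABLE
AS
RETURN
(
    -- Inline (single-statement) TVF: the optimizer can expand this into the
    -- calling query, unlike the multi-statement version above.
    SELECT v.output
    FROM (VALUES ('A', N'Alice'), ('B', N'Bob')) AS v(input, output)
    WHERE v.input = @input
    UNION ALL
    SELECT x.output
    FROM (VALUES (N'Foo'), (N'Bar')) AS x(output)
    WHERE @input NOT IN ('A', 'B')
);
-- Used the same way: CROSS APPLY dbo.fn_myExampleFunction_inline(T1.Name) F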

Ungrouping effect?

I have a dynamic set of data X of the form:
----------------------------------
x.id | x.allocated | x.unallocated
----------------------------------
foo | 2 | 0
bar | 1 | 2
----------------------------------
And I need to get to a result of Y (order is unimportant):
----------------------------------
y.id | y.state
----------------------------------
foo | allocated
foo | allocated
bar | allocated
bar | unallocated
bar | unallocated
----------------------------------
I have a UDF-based solution, but I'm looking for hyper-efficiency, so I'm idly wondering if there's a statement-based, non-procedural way to get this kind of "ungroup by" effect?
It feels like an unpivot, but my brain can't get there right now.
If you have a numbers table in your database, you could use that to help get your results. In my database, I have a table named Numbers with a Num column.
Declare @Temp Table(id VarChar(10), Allocated Int, UnAllocated Int)
Insert Into @Temp Values('foo', 2, 0)
Insert Into @Temp Values('bar',1, 2)
Select T.id,'Allocated'
From @Temp T
Inner Join Numbers
On T.Allocated >= Numbers.Num
Union All
Select T.id,'Unallocated'
From @Temp T
Inner Join Numbers
On T.unAllocated >= Numbers.Num
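If you don't already have one, here is a minimal sketch of building such a Numbers table (the dbo.Numbers name and the 10,000-row size are assumptions; any tally-table approach works):
CREATE TABLE dbo.Numbers (Num INT NOT NULL PRIMARY KEY);
-- Fill it with 1..10000 using a system catalog view as a row source.
INSERT INTO dbo.Numbers (Num)
SELECT TOP (10000) ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
FROM sys.all_objects AS a
CROSS JOIN sys.all_objects AS b;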
Using SQL Server 2005, UNPIVOT, and a CTE, you can try something like:
DECLARE @Table TABLE(
id VARCHAR(20),
allocated INT,
unallocated INT
)
INSERT INTO @Table SELECT 'foo', 2, 0
INSERT INTO @Table SELECT 'bar', 1, 2
;WITH vals AS (
SELECT *
FROM
(
SELECT id,
allocated,
unallocated
FROM @Table
) p
UNPIVOT (Cnt FOR Action IN (allocated, unallocated)) unpvt
WHERE Cnt > 0
)
, Recurs AS (
SELECT id,
Action,
Cnt - 1 Cnt
FROM vals
UNION ALL
SELECT id,
Action,
Cnt - 1 Cnt
FROM Recurs
WHERE Cnt > 0
)
SELECT id,
Action
FROM Recurs
ORDER BY id, action
This answer is just to ping back to G Mastros and doesn't need any upvotes. I thought he would appreciate a performance boost to his already superior query.
SELECT
T.id,
CASE X.Which WHEN 1 THEN 'Allocated' ELSE 'Unallocated' END
FROM
@Temp T
CROSS JOIN (SELECT 1 UNION ALL SELECT 2) X (Which)
INNER JOIN Numbers N
On N.Num <= CASE X.Which WHEN 1 THEN T.Allocated ELSE T.Unallocated END