Using Scoring to find Best Match in SQL - sql

Suppose I have a DATA table like:
ID | Col1 | Col2 | Col3
1 a b 23
2 a c 14
3 f g 11
Suppose I have a POSSIBLE_MATCHES table like:
MatchID | Col1 | Col2 | Col3
101 a a 11
102 a b 11
103 a b 14
104 a c 23
105 f a 1
Suppose I have a WEIGHTS table like (if you want for the sake of this discussion and simplicity assume all weights to be 1 - I can improvise my solution later to incorporate the weights):
Col | Weight
Col1 1
Col2 1.5
Col3 2
So for each possible match we would calculate a SCORE on each matching column.
Score = Col1 Weight * (CASE WHEN DATA.COL1 = POSSIBLE_MATCHES.Col1 THEN 1 ELSE 0 END) +
Col2 Weight * (CASE WHEN DATA.COL2 = POSSIBLE_MATCHES.Col2 THEN 1 ELSE 0 END) +
Col3 Weight * (CASE WHEN DATA.COL3 = POSSIBLE_MATCHES.Col3 THEN 1 ELSE 0 END)
So for example the BEST MATCH for the first row: Col1 = a, Col2 = b, Col3 = 23:
MatchID | Col1 | Col2 | Col3 | Score
101 a a 11 1*1 + 1.5*0 + 2*0 = 1
102 a b 11 1*1 + 1.5*1 + 2*0 = 2.5
103 a b 14 1*1 + 1.5*1 + 2*0 = 2.5
104 a c 23 1*1 + 1.5*0 + 2*1 = 3
105 f a 1 1*0 + 1.5*0 + 2*0 = 0
So in this case the best match for ID:1 is MatchID:104. If the scores are the same then take the lowest MatchID.
Here's a sql fiddle if you wish to play around with this:
http://sqlfiddle.com/#!6/9df45/1
For each ID in DATA how would I find the BEST match in POSSIBLE MATCHES?

In this solution, we do a cross join to pair every DATA row with every POSSIBLE_MATCHES row and evaluate the score of each pair. Then, for each ID, we number the pairs from best to worst with ROW_NUMBER. Finally, we exclude all those that aren't the best one with "WHERE Rank = 1".
-- Best match per DATA row (T-SQL).
-- Every DATA row is paired with every POSSIBLE_MATCHES row; a pair earns the
-- column weight (1, 1.5, 2) for each column whose values are equal.
-- The score is computed once in the inner query and reused for ranking.
-- ROW_NUMBER ranks the pairs of each ID by score, highest first; ties on score
-- are broken by the lowest MatchID so the chosen row is deterministic.
SELECT *
FROM
    (SELECT Scored.ID,
            Scored.MatchID,
            Scored.Score,
            [Rank] = ROW_NUMBER() OVER (PARTITION BY Scored.ID
                                        ORDER BY Scored.Score DESC,
                                                 Scored.MatchID ASC)
     FROM
         (SELECT data.ID,
                 possible_matches.MatchID,
                 Score = (CASE WHEN data.Col1 = possible_matches.Col1 THEN 1 ELSE 0 END) * 1 +
                         (CASE WHEN data.Col2 = possible_matches.Col2 THEN 1 ELSE 0 END) * 1.5 +
                         (CASE WHEN data.Col3 = possible_matches.Col3 THEN 1 ELSE 0 END) * 2
          FROM data
          CROSS JOIN possible_matches) AS Scored) AS AllScore
WHERE AllScore.[Rank] = 1

Try this:
-- Self-contained T-SQL demo: @d holds the DATA rows, @m the POSSIBLE_MATCHES rows.
-- (Table variables are declared with an @ prefix; # names a temp table and is
-- not valid in DECLARE ... TABLE.)
DECLARE @d TABLE(ID INT, Col1 CHAR(1), Col2 CHAR(1), Col3 INT)
DECLARE @m TABLE(ID INT, Col1 CHAR(1), Col2 CHAR(1), Col3 INT)
INSERT INTO @d (ID, Col1, Col2, Col3) VALUES
(1, 'a', 'b', 23),
(2, 'a', 'c', 14),
(3, 'f', 'g', 11)
INSERT INTO @m (ID, Col1, Col2, Col3) VALUES
(101, 'a', 'a', 11),
(102, 'a', 'b', 11),
(103, 'a', 'b', 14),
(104, 'a', 'c', 23),
(105, 'f', 'a', 1)
-- Rank every (data row, candidate) pair per data ID by weighted score
-- (weights 1, 1.5, 2), best first. Equal scores fall back to the lowest
-- candidate ID; with the rows above, DataID 3 ties 101 and 102 at score 2
-- and therefore resolves to 101.
SELECT DataID, MatchID FROM
(
    SELECT d.ID AS DataID,
           m.ID AS MatchID,
           ROW_NUMBER() OVER(PARTITION BY d.ID ORDER BY
               CASE WHEN d.Col1 = m.Col1 THEN 1 ELSE 0 END * 1 +
               CASE WHEN d.Col2 = m.Col2 THEN 1 ELSE 0 END * 1.5 +
               CASE WHEN d.Col3 = m.Col3 THEN 1 ELSE 0 END * 2 DESC,
               m.ID ASC) AS rn
    FROM @d d
    CROSS JOIN @m m
) t WHERE rn = 1
Output:
DataID MatchID
1 104
2 103
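3 101
(For DataID 3, MatchIDs 101 and 102 tie on score 2; the lowest-MatchID rule selects 101, which requires MatchID as a secondary ORDER BY key in the ROW_NUMBER.)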

Related

Turn one column into multiple based on index ranges

I have the following table in SQL Server:
| idx | value |
| --- | ----- |
| 1 | N |
| 2 | C |
| 3 | C |
| 4 | P |
| 5 | N |
| 6 | N |
| 7 | C |
| 8 | N |
| 9 | P |
I would like to turn it to this:
| idx 1-3 | idx 4-6 | idx 7-9 |
| ------- | ------- | ------- |
| N | P | C |
| C | N | N |
| C | N | P |
How can I do this?
If you want to split the data into three columns, with the data in order by id -- and assuming that the ids start at 1 and have no gaps -- then on your particular data, you can use:
-- Pivot `value` into three columns. The inner query labels each row with the
-- output column it belongs to (col_no, by integer division) and the output row
-- it lands on (row_key, by modulo); the outer query folds each output row.
select max(case when col_no = 0 then value end) as grp_1,
       max(case when col_no = 1 then value end) as grp_2,
       max(case when col_no = 2 then value end) as grp_3
from (select idx,
             value,
             (idx - 1) / 3 as col_no,
             idx % 3 as row_key
      from t
     ) as s
group by row_key
order by min(idx);
The above doesn't hard-code the ranges, but the "3" means different things in different contexts -- sometimes the number of columns, sometimes the number of rows in the result set.
However, the following generalizes so it adds additional rows as needed:
-- Pivot `value` into three columns, sizing the number of output rows from the
-- table itself: num_rows = ceiling(total row count / 3).
-- (idx - 1) / num_rows picks the output column; idx % num_rows picks the output row.
-- Relies on idx starting at 1 with no gaps, as stated for the query above.
select max(case when (idx - 1) / num_rows = 0 then value end) as grp_1,
       max(case when (idx - 1) / num_rows = 1 then value end) as grp_2,
       max(case when (idx - 1) / num_rows = 2 then value end) as grp_3
from (select t.*, convert(int, ceiling(count(*) over () / 3.0)) as num_rows
      from t
     ) t
group by idx % num_rows
order by min(idx);
Here is a db<>fiddle.
You can compute the category of each row with a lateral join, then enumerate the rows within each category, and finally pivot with conditional aggregation:
-- Pivot by explicit idx ranges.
-- 1. cross apply labels each row with its range category (cat).
-- 2. row_number() enumerates rows inside each category in idx order; SQL Server
--    requires an ORDER BY in the OVER clause of row_number().
-- 3. Conditional aggregation folds rows sharing the same position (rn) into one
--    output row. cat must be projected from the derived table because the
--    outer select reads it.
select
    max(case when cat = 'idx_1_3' then value end) as idx_1_3,
    max(case when cat = 'idx_4_6' then value end) as idx_4_6,
    max(case when cat = 'idx_7_9' then value end) as idx_7_9
from (
    select t.*,
           v.cat,
           row_number() over(partition by v.cat order by t.idx) as rn
    from mytable t
    cross apply (values (
        case
            when idx between 1 and 3 then 'idx_1_3'
            when idx between 4 and 6 then 'idx_4_6'
            when idx between 7 and 9 then 'idx_7_9'
        end
    )) v(cat)
) t
group by rn
order by rn
Another solution with union all operator and row_number function
-- Pivot by stacking three slices of the table with UNION ALL.
-- Each slice fills only its own output column and numbers its rows 1..n (rnb);
-- grouping on rnb then merges the n-th row of every slice into one output row.
-- The third output column reads idx_7_9.
select max(idx_1_3) as idx_1_3, max(idx_4_6) as idx_4_6, max(idx_7_9) as idx_7_9
from (
    select
        case when idx in (1, 2, 3) then value end as idx_1_3
        , null as idx_4_6
        , null as idx_7_9
        , row_number()over(order by idx) as rnb
    from Your_table where idx in (1, 2, 3)
    union all
    select null as idx_1_3
        , case when idx in (4, 5, 6) then value end as idx_4_6
        , null as idx_7_9
        , row_number()over(order by idx) as rnb
    from Your_table where idx in (4, 5, 6)
    union all
    select null as idx_1_3
        , null as idx_4_6
        , case when idx in (7, 8, 9) then value end as idx_7_9
        , row_number()over(order by idx) as rnb
    from Your_table where idx in (7, 8, 9)
) t
group by rnb
;
-- Demo data: 2002 rows of the form '<n> - <random uppercase letter>'.
-- sys.all_objects serves only as a row source. Ordering by @@spid (constant
-- within a session) gives row_number() the ORDER BY it requires without
-- imposing a particular order.
drop table if exists #t;
create table #t (id int identity(1,1) primary key clustered, val varchar(20));
insert into #t(val)
select top (2002) concat(row_number() over(order by @@spid), ' - ', char(65 + abs(checksum(newid()))%26))
from sys.all_objects
order by row_number() over(order by @@spid);
-- c3 cycles 1,1,1,2,2,2,3,3,3,1,... as id grows: the target column of each row.
-- r is the position of the row within its target column. PIVOT groups on r,
-- the only column of src that is neither aggregated nor spread.
select p.r, 1+(p.r-1)/3 grp3id, p.[1] as [idx 1-3], p.[2] as [idx 4-6], p.[3] as [idx 7-9]
from
(
    select
        val,
        1+((1+(id-1)/3)-1)%3 as c3,
        row_number() over(partition by 1+((1+(id-1)/3)-1)%3 order by id) as r
    from #t
) as src
pivot
(
    max(val) for c3 in ([1], [2], [3])
) as p
order by p.r;
You can use the mod as follows:
-- Pivot `value` into three range columns; rows that share (idx - 1) % 3 land
-- on the same output row.
select max(case when idx >= 1 and idx <= 3 then value end) as idx_1_3,
       max(case when idx >= 4 and idx <= 6 then value end) as idx_4_6,
       max(case when idx >= 7 and idx <= 9 then value end) as idx_7_9
from (select idx,
             value,
             (idx - 1) % 3 as row_key
      from t
     ) as s
group by row_key;
If your idx is not continuous numbers then instead of from t use the following
-- Replacement FROM clause: row_number() renumbers the rows 1..n in idx order
-- and exposes that number under the name idx, so the range and modulo
-- arithmetic of the outer query sees a gap-free sequence.
from (select value, row_number() over(order by idx) as idx
from your_table t) t

how do we have count of a specific values for multiple columns with table having a unique column

If I have a table like :
u_id A B C D
----------------------------------
jud 1 1 0 1
bud 0 0 1 0
cud 1 1 0 1
nud 0 0 1 0
dud 1 0 0 1
aud 0 1 1 0
fud 1 0 1 1
which sql is useful to get output like:
count 0 count 1
-----------------------
A 3 4
B 4 3
C 3 4
D 3 4
Doesn't matter row or columns just need count of a specific value count for multiple columns in a table.
Instead of 0's and 1's it can be specific string values as well as 'yes' or 'no'
Thank you
Use UNION ALL and aggregation. Assuming that the only possible values in the columns are 0 and 1:
-- One result row per source column. Assuming the column holds only 0 and 1,
-- SUM(x) is the number of 1s and COUNT(*) - SUM(x) the number of 0s.
SELECT 'A' AS col,
       COUNT(*) - SUM(A) AS count0,
       SUM(A) AS count1
FROM mytable
UNION ALL
SELECT 'B',
       COUNT(*) - SUM(B),
       SUM(B)
FROM mytable
UNION ALL
SELECT 'C',
       COUNT(*) - SUM(C),
       SUM(C)
FROM mytable
UNION ALL
SELECT 'D',
       COUNT(*) - SUM(D),
       SUM(D)
FROM mytable
Demo on DB Fiddle:
| col | count0 | count1 |
| --- | ------ | ------ |
| A | 3 | 4 |
| B | 4 | 3 |
| C | 3 | 4 |
| D | 3 | 4 |
If values other than 0/1 are possible (e.g. 'yes'/'no'), you can change the SELECTs to use conditional aggregation:
-- One result row per source column, counting 'no' and 'yes' occurrences with
-- conditional aggregation. Every ELSE branch is the digit 0, so non-matching
-- rows add nothing to the sum.
-- GROUP BY col groups on the constant label of each SELECT.
SELECT
    'A' col,
    SUM(CASE WHEN A = 'no' THEN 1 ELSE 0 END) count_no,
    SUM(CASE WHEN A = 'yes' THEN 1 ELSE 0 END) count_yes
FROM mytable
GROUP BY col
UNION ALL SELECT
    'B' col,
    SUM(CASE WHEN B = 'no' THEN 1 ELSE 0 END),
    SUM(CASE WHEN B = 'yes' THEN 1 ELSE 0 END)
FROM mytable
GROUP BY col
UNION ALL SELECT
    'C' col,
    SUM(CASE WHEN C = 'no' THEN 1 ELSE 0 END),
    SUM(CASE WHEN C = 'yes' THEN 1 ELSE 0 END)
FROM mytable
GROUP BY col
UNION ALL SELECT
    'D' col,
    SUM(CASE WHEN D = 'no' THEN 1 ELSE 0 END),
    SUM(CASE WHEN D = 'yes' THEN 1 ELSE 0 END)
FROM mytable
GROUP BY col
If you are okay with a single row, you can do:
-- Single-row summary: for each column, the number of 1s (sum(x)) followed by
-- the number of 0s (sum(1-x)).
select
    sum(a), sum(1-a),
    sum(b), sum(1-b),
    sum(c), sum(1-c),
    sum(d), sum(1-d)
from t;
The advantage of this approach is that t is read only once. This is even more true if it is a complex view.
With that in mind, you can unpivot this result:
-- Unpivot the single-row summary: the totals are computed once (s), then
-- repeated for each column label (v). Each output row shows only the pair of
-- counts that belongs to its label; the other pairs are NULL.
with s as (
    select sum(a) as a_1, sum(1-a) as a_0,
           sum(b) as b_1, sum(1-b) as b_0,
           sum(c) as c_1, sum(1-c) as c_0,
           sum(d) as d_1, sum(1-d) as d_0
    from t
)
select v.x,
       (case when v.x = 'a' then s.a_0 end) as a_0,
       (case when v.x = 'a' then s.a_1 end) as a_1,
       (case when v.x = 'b' then s.b_0 end) as b_0,
       (case when v.x = 'b' then s.b_1 end) as b_1,
       (case when v.x = 'c' then s.c_0 end) as c_0,
       (case when v.x = 'c' then s.c_1 end) as c_1,
       (case when v.x = 'd' then s.d_0 end) as d_0,
       (case when v.x = 'd' then s.d_1 end) as d_1
from s
cross join
     (values ('a'), ('b'), ('c'), ('d')) v(x) -- may require a subquery
You don't mention the database you're using, but in Oracle you can use DECODE and COUNT together to make this reasonably clean:
-- Oracle: COUNT ignores NULLs, so DECODE turns the rows that should not be
-- counted into NULL.
--   ZERO_COUNT     : DECODE(x, 0, 0, NULL) is non-NULL only when x = 0
--   NON_ZERO_COUNT : DECODE(x, 0, NULL, x) is non-NULL only when x is non-zero
-- Each branch decodes and returns its own column, so a NULL in another column
-- cannot change its count.
SELECT 'A' AS FIELD_NAME,
       COUNT(DECODE(A, 0, 0, NULL)) AS ZERO_COUNT,
       COUNT(DECODE(A, 0, NULL, A)) AS NON_ZERO_COUNT
FROM TEST_TABLE UNION ALL
SELECT 'B', COUNT(DECODE(B, 0, 0, NULL)),
       COUNT(DECODE(B, 0, NULL, B))
FROM TEST_TABLE UNION ALL
SELECT 'C', COUNT(DECODE(C, 0, 0, NULL)),
       COUNT(DECODE(C, 0, NULL, C))
FROM TEST_TABLE UNION ALL
SELECT 'D', COUNT(DECODE(D, 0, 0, NULL)),
       COUNT(DECODE(D, 0, NULL, D))
FROM TEST_TABLE
dbfiddle here

How to Compare Multiple Columns and Rows of a Table

I have a big data table that looks something like this
ID Marker Value1 Value2
================================
1 A 10 11
1 B 12 13
1 C 14 15
2 A 10 11
2 B 13 12
2 C
3 A 10 11
3 C 12 13
I want to search this data by the following data, which is user input and not stored in a table:
Marker Value1 Value2
==========================
A 10 11
B 12 13
C 14 14
The result should be something like this:
ID Marker Value1 Value2 Match?
==========================================
1 A 10 11 true
1 B 12 13 true
1 C 14 15 false
2 A 10 11 true
2 B 13 12 true
2 C false
3 A 10 11 true
3 C 12 13 false
And ultimately this (the above table is not necessary, it should demonstrate how these values came to be):
ID Matches Percent
========================
1 2 66%
2 2 66%
3 1 33%
I'm searching for the most promising approach to get this to work in SQL (PostgreSQL to be exact).
My ideas:
Create a temporary table, join it with the above one and group the result
Use CASE WHEN or a temporary PROCEDURE to only use a single (probably bloated) query
I'm not satisfied with either approach, hence the question. How can I compare two tables like these efficiently?
The user input can be supplied using a VALUES clause in a common table expression and that can then be used in a left join with the actual table.
-- PostgreSQL: the user input is supplied inline and left-joined to the data.
-- paired keeps one row per joined data row with a flag telling whether its
-- (marker, value1, value2) triple is not distinct from the joined input triple.
-- pct divides the flagged rows by the number of joined rows for that id.
with user_input (marker, value1, value2) as (
    values
        ('A', 10, 11),
        ('B', 12, 13),
        ('C', 14, 14)
),
paired as (
    select d.id,
           (d.marker, d.value1, d.value2) is not distinct from (u.marker, u.value1, u.value2) as is_hit
    from data d
    left join user_input u
           on d.marker = u.marker
          and d.value1 = u.value1
          and d.value2 = u.value2
)
select id,
       count(*) filter (where is_hit),
       100 * count(*) filter (where is_hit) / cast(count(*) as numeric) as pct
from paired
group by id
order by id;
Returns:
id | count | pct
---+-------+------
1 | 2 | 66.67
2 | 2 | 66.67
3 | 1 | 50.00
Online example: https://rextester.com/OBOOD9042
Edit
If the order of the values isn't relevant (so (12,13) is considered the same as (13,12)), then the comparison gets a bit more complicated.
-- PostgreSQL: order-insensitive variant. Both sides are normalized to
-- (marker, smaller value, larger value) before they are compared, so
-- (12, 13) and (13, 12) are treated as the same pair.
with user_input (marker, value1, value2) as (
    values
        ('A', 10, 11),
        ('B', 12, 13),
        ('C', 14, 14)
),
paired as (
    select d.id,
           (d.marker, least(d.value1, d.value2), greatest(d.value1, d.value2))
               is not distinct from
           (u.marker, least(u.value1, u.value2), greatest(u.value1, u.value2)) as is_hit
    from data d
    left join user_input u
           on d.marker = u.marker
          and least(d.value1, d.value2) = least(u.value1, u.value2)
          and greatest(d.value1, d.value2) = greatest(u.value1, u.value2)
)
select id,
       count(*) filter (where is_hit)
from paired
group by id
order by id;
You can use a CTE to pre-compute the matches. Then a simple aggregation will do the trick. Assuming your parameters are:
Marker Value1 Value2
==========================
m1 x1 y1
m2 x2 y2
m3 x3 y3
You can do:
-- Per-id match count against three (marker, value, value) parameter triples.
-- A row matches a triple when the marker is equal and the two values are
-- equal in either order. matches is 1 for a matching row and 0 otherwise.
select
    id,
    sum(matches) as matches,
    100.0 * sum(matches) / count(*) as percent
from (
    select
        id,
        case
            when marker = :m1 and ((value1 = :x1 and value2 = :y1) or (value1 = :y1 and value2 = :x1)) then 1
            when marker = :m2 and ((value1 = :x2 and value2 = :y2) or (value1 = :y2 and value2 = :x2)) then 1
            when marker = :m3 and ((value1 = :x3 and value2 = :y3) or (value1 = :y3 and value2 = :x3)) then 1
            else 0
        end as matches
    from t
) x
group by id
Try this:
-- T-SQL: stage the user input in a temp table, then flag every Markers row
-- with 'True' when an identical (Marker, Value1, Value2) row exists in the
-- staged input and 'False' otherwise.
CREATE TABLE #Temp
(
    Marker nvarchar(50),
    Value1 nvarchar(50),
    Value2 nvarchar(50)
)
INSERT INTO #Temp (Marker, Value1, Value2) Values ('A', '10', '11')
INSERT INTO #Temp (Marker, Value1, Value2) Values ('B', '12', '13')
INSERT INTO #Temp (Marker, Value1, Value2) Values ('C', '14', '14')
SELECT m.Id, m.Marker, m.Value1, m.Value2,
       CASE
           WHEN EXISTS (SELECT 1
                        FROM #Temp t
                        WHERE t.Marker = m.Marker
                          AND t.Value1 = m.Value1
                          AND t.Value2 = m.Value2)
           THEN 'True'
           ELSE 'False'
       END as Matches
FROM [Test].[dbo].[Markers] m
ORDER BY Matches DESC
Drop TABLE #Temp
If this is exactly what you want, I will try to solve the second part of it as well.

Select only the "most complete" record

I need to solve the following problem.
Let's suppose I have a table with 4 fields called a, b, c, d.
I have the following records:
-------------------------------------
a | b | c | d
-------------------------------------
1 | 2 | | row 1
1 | 2 | 3 | 4 row 2
1 | 2 | | 4 row 3
1 | 2 | 3 | row 4
As it's possible to observe, rows 1,3,4 are "sub-records" of row 2.
What I would like to do is, to extract only 2nd row.
Could you help me please?
Thanks in advance for the answer
EDIT: I need to be more specific.
I could have also the cases:
-------------------------------------
a | b | c | d
-------------------------------------
1 | 2 | | row 1
1 | 2 | | 4 row 2
1 | | | 4 row 3
where I need to extract the 2nd row,
-------------------------------------
a | b | c | d
-------------------------------------
1 | 2 | | row 1
1 | 2 | 3 | row 2
1 | | 3 | row 3
and again I need to extract the 2nd row.
Same for couples,
a | b | c | d
-------------------------------------
1 | | | row 1
1 | | 3 | row 2
| | 3 | row 3
and so on for the other examples.
(Of course, it's not always the 2nd row)
Using a NOT EXISTS the records that have a better duplicate can be filtered out.
-- Sample table and rows, covering the quadruple, triple and pair cases.
create table abcd (
    a int,
    b int,
    c int,
    d int
);
insert into abcd (a, b, c, d) values
 (1, 2, null, null)
,(1, 2, 3, 4)
,(1, 2, null, 4)
,(1, 2, 3, null)
,(2, 3, null,null)
,(2, 3, null, 5)
,(2, null, null, 5)
,(3, null, null, null)
,(3, null, 5, null)
,(null, null, 5, null);
-- Keep a row t only when no "better" row d exists. d is better than t when
--   * d agrees with t on every column t has filled in, and
--   * d has strictly more non-null columns than t.
SELECT *
FROM abcd AS t
WHERE NOT EXISTS
(
    select 1
    from abcd as d
    where (t.a is null or d.a = t.a)
      and (t.b is null or d.b = t.b)
      and (t.c is null or d.c = t.c)
      and (t.d is null or d.d = t.d)
      -- non-null column count of t < non-null column count of d
      and (case when t.a is null then 0 else 1 end +
           case when t.b is null then 0 else 1 end +
           case when t.c is null then 0 else 1 end +
           case when t.d is null then 0 else 1 end) <
          (case when d.a is null then 0 else 1 end +
           case when d.b is null then 0 else 1 end +
           case when d.c is null then 0 else 1 end +
           case when d.d is null then 0 else 1 end)
);
a | b | c | d
-: | ---: | ---: | ---:
1 | 2 | 3 | 4
2 | 3 | null | 5
3 | null | 5 | null
db<>fiddle here
You will need to compute a "completion index" for each row. In the example you provided, you might use something along the lines of:
-- Select-list expression: number of non-NULL columns among a, b, c, d.
(CASE WHEN a IS NULL THEN 0 ELSE 1 END) +
(CASE WHEN b IS NULL THEN 0 ELSE 1 END) +
(CASE WHEN c IS NULL THEN 0 ELSE 1 END) +
(CASE WHEN d IS NULL THEN 0 ELSE 1 END) AS CompletionIndex
Then SELECT the top 1 ordered by CompletionIndex in descending order.
This is obviously not very scalable across a large number of columns. But if you have a large number of sparsely populated columns you might consider a row-based rather than column-based structure for your data. That design would make it much easier to count the number of non-NULL values for each entity.
Most complete rows, by your definition, are the ones with the fewest NULL columns:
-- Return the rows whose count of non-NULL columns (a, b, c, d) is at least as
-- large as that of every row in the table, i.e. the maximum.
SELECT *
FROM tablename
WHERE (CASE WHEN a IS NOT NULL THEN 1 ELSE 0 END)
    + (CASE WHEN b IS NOT NULL THEN 1 ELSE 0 END)
    + (CASE WHEN c IS NOT NULL THEN 1 ELSE 0 END)
    + (CASE WHEN d IS NOT NULL THEN 1 ELSE 0 END)
    >= ALL (SELECT (CASE WHEN a IS NOT NULL THEN 1 ELSE 0 END)
                 + (CASE WHEN b IS NOT NULL THEN 1 ELSE 0 END)
                 + (CASE WHEN c IS NOT NULL THEN 1 ELSE 0 END)
                 + (CASE WHEN d IS NOT NULL THEN 1 ELSE 0 END)
            FROM tablename)
Hmmm . . . I think you can use not exists:
-- Keep only rows for which no other, at-least-as-specific row exists.
-- The CTE is named t like the table it reads and adds a row number (id) so a
-- row can be told apart from itself in the self-comparison below.
-- NOTE(review): row_number() is ordered by a alone, so numbering among rows
-- with equal a is arbitrary — confirm the CTE is evaluated only once.
with t as (
select t.*, row_number() over (order by a) as id
from t
)
select t.*
from t
-- t2 "covers" t when, for every column, t2 holds the same value (NULLs compare
-- as equal via IS NOT DISTINCT FROM) or t2 is filled where t is NULL.
where not exists (select 1
from t t2
where ((t2.a is not distinct from t.a or t2.a is not null and t.a is null) and
(t2.b is not distinct from t.b or t2.b is not null and t.b is null) and
(t2.c is not distinct from t.c or t2.c is not null and t.c is null) and
(t2.d is not distinct from t.d or t2.d is not null and t.d is null)
) and
-- never compare a row with itself
t2.id <> t.id
);
The logic is that no more specific row exists, where the values match
Here is a db<>fiddle.
As mentioned by Gordon Linoff, we do have to use something like not exists too,
Edit Using EXCEPT helps
This might work...
-- Remove every row t1 that is a "subset" of some other row t2.
-- COALESCE(t1.x, t2.x, -1) = COALESCE(t2.x, -1) holds when t1.x is NULL or
-- equals t2.x (-1 is a sentinel assumed never to occur in the data).
-- Those conditions alone also hold for t1 = t2, which would subtract every
-- row; the last condition therefore requires t2 to have strictly more
-- non-NULL columns than t1.
SELECT * from table1
EXCEPT
(
    SELECT t1.*
    FROM table1 t1
    JOIN table1 t2
      ON COALESCE(t1.a, t2.a, -1) = COALESCE(t2.a, -1)
     AND COALESCE(t1.b, t2.b, -1) = COALESCE(t2.b, -1)
     AND COALESCE(t1.c, t2.c, -1) = COALESCE(t2.c, -1)
     AND COALESCE(t1.d, t2.d, -1) = COALESCE(t2.d, -1)
     AND (CASE WHEN t1.a IS NULL THEN 0 ELSE 1 END +
          CASE WHEN t1.b IS NULL THEN 0 ELSE 1 END +
          CASE WHEN t1.c IS NULL THEN 0 ELSE 1 END +
          CASE WHEN t1.d IS NULL THEN 0 ELSE 1 END) <
         (CASE WHEN t2.a IS NULL THEN 0 ELSE 1 END +
          CASE WHEN t2.b IS NULL THEN 0 ELSE 1 END +
          CASE WHEN t2.c IS NULL THEN 0 ELSE 1 END +
          CASE WHEN t2.d IS NULL THEN 0 ELSE 1 END)
)
Here, t1 is every subset row.
Note: We are assuming value -1 as sentinel value and it does not occur in any column.

To 'flag' a specific condition in SQL query

I have a result set in the below format and I need to flag "GroupColumn"
-------------------------------------------------------------------
ID GroupColumn ConditionCol1 ConditionCol2
-------------------------------------------------------------------
1 101 ABC 99
2 101 DEF 99
3 102 ABC 01
4 102 DEF 01
5 103 ABC 02
6 103 DEF 99
7 104 DEF 02
8 104 DEF 99
First of all, I need to flag the data based on "GroupColumn": within each "GroupColumn" I am looking to satisfy the condition "ABC" from one row and "99" from another row, but not necessarily from the same row.
I am looking to get a final result set something like this for the "Output" column
-------------------------------------------------------------------
ID GroupColumn ConditionCol1 ConditionCol2 Output
-------------------------------------------------------------------
1 101 ABC 99 Satisfied
2 101 DEF 99 Satisfied
3 102 ABC 01
4 102 DEF 01
5 103 ABC 02 Satisfied
6 103 DEF 99 Satisfied
7 104 DEF 02
8 104 DEF 99
You can do this using window functions:
-- Flag every row of a group as 'Satisfied' when the group contains at least
-- one row with conditioncol1 = 'ABC' and at least one row with
-- conditioncol2 = 99 (possibly the same row). Other rows get a NULL flag.
select t.*,
       (case
            when max(case when conditioncol1 = 'ABC' then 1 else 0 end)
                     over (partition by groupcolumn) = 1
             and max(case when conditioncol2 = 99 then 1 else 0 end)
                     over (partition by groupcolumn) = 1
            then 'Satisfied'
        end) as flag
from t;
An alternative is to use group by:
-- tt lists the groups that contain both an 'ABC' row and a 99 row.
-- The left join keeps every row of t; rows of groups missing from tt get a
-- NULL flag instead of being dropped from the result.
select t.*, tt.flag
from t left join
     (select groupcolumn, 'Satisfied' as flag
      from t
      where conditioncol1 = 'ABC' or conditioncol2 = 99
      group by groupcolumn
      having sum(case when conditioncol1 = 'ABC' then 1 else 0 end) > 0 and
             sum(case when conditioncol2 = 99 then 1 else 0 end) > 0
     ) tt
     on tt.groupcolumn = t.groupcolumn;
Assuming you are using SQL Server and you need to add Output column to your original table, you can try the following:
-- T-SQL demo: build a sample table, add the [Output] column, then flag groups.
create table #temp
(GroupColumn int,ConditionCol1 varchar(20),ConditionCol2 int)
insert into #temp (GroupColumn, ConditionCol1, ConditionCol2) values (100,'ABC',99)
insert into #temp (GroupColumn, ConditionCol1, ConditionCol2) values (100,'DEF',99)
insert into #temp (GroupColumn, ConditionCol1, ConditionCol2) values (101,'ABC',02)
insert into #temp (GroupColumn, ConditionCol1, ConditionCol2) values (101,'DEF',99)
insert into #temp (GroupColumn, ConditionCol1, ConditionCol2) values (102,'DEF',99)
insert into #temp (GroupColumn, ConditionCol1, ConditionCol2) values (102,'DEF',99)
ALTER TABLE #temp
ADD [Output] varchar(10)
GO
-- A group is satisfied when it holds at least one 'ABC' row and at least one
-- 99 row; the two may be the same row. Every row of such a group is flagged
-- in a single pass, and rows of other groups keep a NULL [Output].
UPDATE t
SET [Output] = 'Satisfied'
FROM #temp AS t
WHERE EXISTS (SELECT 1
              FROM #temp AS a
              WHERE a.GroupColumn = t.GroupColumn
                AND a.ConditionCol1 = 'ABC')
  AND EXISTS (SELECT 1
              FROM #temp AS n
              WHERE n.GroupColumn = t.GroupColumn
                AND n.ConditionCol2 = 99)
select * from #temp