SQL: JOIN with 'near' match - sql

I need to do a JOIN with a 'near match'. The best way to explain this is with an example:
CREATE TABLE Car
(
Vin int,
Make nvarchar(50),
ColorID int,
)
CREATE TABLE Color
(
ColorID int,
ColorCode nvarchar(10)
)
CREATE TABLE ColorName
(
ColorID int,
Languagecode varchar(12),
ColorName nvarchar(50)
)
INSERT INTO Color Values (1, 'RED CODE')
INSERT INTO Color Values (2, 'GREEN CODE')
INSERT INTO Color Values (3, 'BLUE CODE')
INSERT INTO ColorName Values (1, 'en', 'Red')
INSERT INTO ColorName Values (1, 'en-US', 'Red, my friend')
INSERT INTO ColorName Values (1, 'en-GB', 'Red, my dear')
INSERT INTO ColorName Values (1, 'en-AU', 'Red, mate')
INSERT INTO ColorName Values (1, 'fr', 'Rouge')
INSERT INTO ColorName Values (1, 'fr-BE', 'Rouge, mon ami')
INSERT INTO ColorName Values (1, 'fr-CA', 'Rouge, mon chum')
INSERT INTO Car Values (123, 'Honda', 1)
The SPROC would look like this:
DECLARE #LanguageCode varchar(12) = 'en-US'
SELECT * FROM Car A
JOIN Color B ON (A.ColorID = B.ColorID)
LEFT JOIN ColorName C ON (B.ColorID = C.ColorID AND C.LanguageCode = #LanguageCode)
See http://sqlfiddle.com/#!6/ac24d/24 (thanks to Jake!)
Here is the challenge:
When the SPROC parameter #LanguageCode is an exact match, all is well.
I would like for it to also work for partial matches; more specifically: say for example that #LanguageCode would be 'en-NZ' then I would like the SPROC to return the value for language code 'en' (since there is no value for 'en-NZ').
As an extra challenge: if there is no match at all I would like to return the 'en' value; for example if #LanguageCode would be 'es' then the SPROC would return the 'en' value (since there is no value for 'es').

Try left(#LanguageCode, 2) + '%'
http://sqlfiddle.com/#!6/ac24d/26
About second part - you have to query table two times anyway (you can do it in one statement, but if will be like two statements in one). You also can insert data into temporary (or variable) table, check if there's no rows and then make another query
I've made a query with table function
http://sqlfiddle.com/#!6/b7be3/5
So you can write
DECLARE #LanguageCode varchar(12) = 'es'
if not exists (select * from sf_test(#LanguageCode))
select * from sf_test('en')
else
select * from sf_test(#LanguageCode)
you also can write
declare #temp table
(
Vin int,
Make nvarchar(50),
ColorCode nvarchar(10)
)
insert into #temp
select * from sf_test(#LanguageCode)
if not exists (select * from #temp)
select * from sf_test('en')
else
select * from #temp

As #Roman Pekar has said in his comment, this can indeed be done, including your additional request about falling back to en, in one statement with the help of a ranking function. Here's how you could go about it:
WITH FilteredAndRanked AS (
SELECT
*,
rnk = ROW_NUMBER() OVER (
PARTITION BY ColorID
ORDER BY CASE LanguageCode
WHEN #LanguageCode THEN 1
WHEN LEFT(#LanguageCode, 2) THEN 2
WHEN 'en' THEN 3
END
)
FROM ColorName
WHERE LanguageCode IN (
#LanguageCode,
LEFT(#LanguageCode, 2),
'en'
)
)
SELECT
...
FROM Car A
INNER JOIN Color B ON (A.ColorID = B.ColorID)
LEFT JOIN FilteredAndRanked C ON (B.ColorID = C.ColorID AND C.rnk = 1)
;
That is, the ColorName table is filtered and ranked before being used in the query, and then only the rows with the rankings of 1 are joined:
The filter for ColorName includes only rows with LanguageCode values of #LanguageCode, LEFT(#LanguageCode, 2) and 'en'.
The ranking values are assigned based on which language code each row contains: rows with LEFT(#LanguageCode, 2) are ranked after those with #LanguageCode but before the 'en' ones.

Related

Updating thousands of records with different values

I've been given a spreadsheet in the format of :
Id | Val
1 57
2 99
There's approximately 10,000 records - Any ideas to handle the query below for 10,000 records without manually writing each case statement, tediously. Thanks.
update person
SET val = (
case
when Id = 1 then 57
when Id = 2 then 99
end),
where Id in (1, 2)
Quick and dirty? here you go
Add a new spredsheet call the old one datatable
In the first row first column you write
"Update person set val = ("
in the second column you link to the value on datatable spreadsheet
third column ") where ID = ("
fourth column you link to the ID of the datatable spreadsheet
fifth column ")"
Then you mark the whole row and pull it downwards to row 10000
Copy past into query escecute
I think this example can be help you :
CREATE TABLE #Person
(PrimaryKey int PRIMARY KEY,
ValueSome varchar(50)
);
GO
CREATE TABLE #MySpreadSheet
(PrimaryKey int PRIMARY KEY,
ValueSpread varchar(50)
);
GO
INSERT INTO #Person
SELECT 1, 'someValue'
INSERT INTO #Person
SELECT 2, 'someValueBeforeUpdate'
INSERT INTO #Person
SELECT 3, ''
INSERT INTO #MySpreadSheet
SELECT 1, '45'
INSERT INTO #MySpreadSheet
SELECT 2, '56'
INSERT INTO #MySpreadSheet
SELECT 3, '34'
SELECT * FROM #Person
SELECT * FROM #MySpreadSheet
UPDATE P SET P.ValueSome = SS.ValueSpread FROM #Person P JOIN #MySpreadSheet SS ON P.PrimaryKey = SS.PrimaryKey
SELECT * FROM #Person
DROP TABLE #Person
DROP TABLE #MySpreadSheet
If anyones interested, I went with this :
CREATE TABLE #TempTable(
Id int,
val int
)
INSERT INTO #TempTable (Id, val)
Values (1, 57),
(2, 99)
Update Person
Set Id = tp.Id,
val = tp.val
FROM Person p
INNER JOIN #TempTable as tp on tp.Id = p.Id
create table #example (id int , value int)
insert into #example (id, value) values (1, 10)
insert into #example (id, value) values (2, 20)
select * from #example
id value
1 10
2 20
update #example
set value = case when id = 1 then 100
when id = 2 then 200 end
where id in (1,2)
select * from #example
id value
1 100
2 200

SQL return only distinct IDs from LEFT JOIN

I've inherited some fun SQL and am trying to figure out how to how to eliminate rows with duplicate IDs. Our indexes are stored in a somewhat columnar format and then we pivot all the rows into one with the values as different columns.
The below sample returns three rows of unique data, but the IDs are duplicated. I need just two rows with unique IDs (and the other columns that go along with it). I know I'll be losing some data, but I just need one matching row per ID to the query (first, top, oldest, newest, whatever).
I've tried using DISTINCT, GROUP BY, and ROW_NUMBER, but I keep getting the syntax wrong, or using them in the wrong place.
I'm also open to rewriting the query completely in a way that is reusable as I currently have to generate this on the fly (cardtypes and cardindexes are user defined) and would love to be able to create a stored procedure. Thanks in advance!
declare #cardtypes table ([ID] int, [Name] nvarchar(50))
declare #cards table ([ID] int, [CardTypeID] int, [Name] nvarchar(50))
declare #cardindexes table ([ID] int, [CardID] int, [IndexType] int, [StringVal] nvarchar(255), [DateVal] datetime)
INSERT INTO #cardtypes VALUES (1, 'Funny Cards')
INSERT INTO #cardtypes VALUES (2, 'Sad Cards')
INSERT INTO #cards VALUES (1, 1, 'Bunnies')
INSERT INTO #cards VALUES (2, 1, 'Dogs')
INSERT INTO #cards VALUES (3, 1, 'Cat')
INSERT INTO #cards VALUES (4, 1, 'Cat2')
INSERT INTO #cardindexes VALUES (1, 1, 1, 'Bunnies', null)
INSERT INTO #cardindexes VALUES (2, 1, 1, 'playing', null)
INSERT INTO #cardindexes VALUES (3, 1, 2, null, '2014-09-21')
INSERT INTO #cardindexes VALUES (4, 2, 1, 'Dogs', null)
INSERT INTO #cardindexes VALUES (5, 2, 1, 'playing', null)
INSERT INTO #cardindexes VALUES (6, 2, 1, 'poker', null)
INSERT INTO #cardindexes VALUES (7, 2, 2, null, '2014-09-22')
SELECT TOP(100)
[ID] = c.[ID],
[Name] = c.[Name],
[Keyword] = [colKeyword].[StringVal],
[DateAdded] = [colDateAdded].[DateVal]
FROM #cards AS c
LEFT JOIN #cardindexes AS [colKeyword] ON [colKeyword].[CardID] = c.ID AND [colKeyword].[IndexType] = 1
LEFT JOIN #cardindexes AS [colDateAdded] ON [colDateAdded].[CardID] = c.ID AND [colDateAdded].[IndexType] = 2
WHERE [colKeyword].[StringVal] LIKE 'p%' AND c.[CardTypeID] = 1
ORDER BY [DateAdded]
Edit:
While both solutions are valid, I ended up using the MAX() solution from #popovitsj as it was easier to implement. The issue of data coming from multiple rows doesn't really factor in for me as all rows are essentially part of the same record. I will most likely use both solutions depending on my needs.
Here's my updated query (as it didn't quite match the answer):
SELECT TOP(100)
[ID] = c.[ID],
[Name] = MAX(c.[Name]),
[Keyword] = MAX([colKeyword].[StringVal]),
[DateAdded] = MAX([colDateAdded].[DateVal])
FROM #cards AS c
LEFT JOIN #cardindexes AS [colKeyword] ON [colKeyword].[CardID] = c.ID AND [colKeyword].[IndexType] = 1
LEFT JOIN #cardindexes AS [colDateAdded] ON [colDateAdded].[CardID] = c.ID AND [colDateAdded].[IndexType] = 2
WHERE [colKeyword].[StringVal] LIKE 'p%' AND c.[CardTypeID] = 1
GROUP BY c.ID
ORDER BY [DateAdded]
You could use MAX or MIN to 'decide' on what to display for the other columns in the rows that are duplicate.
SELECT ID, MAX(Name), MAX(Keyword), MAX(DateAdded)
(...)
GROUP BY ID;
using row number windowed function along with a CTE will do this pretty well. For example:
;With preResult AS (
SELECT TOP(100)
[ID] = c.[ID],
[Name] = c.[Name],
[Keyword] = [colKeyword].[StringVal],
[DateAdded] = [colDateAdded].[DateVal],
ROW_NUMBER()OVER(PARTITION BY c.ID ORDER BY [colDateAdded].[DateVal]) rn
FROM #cards AS c
LEFT JOIN #cardindexes AS [colKeyword] ON [colKeyword].[CardID] = c.ID AND [colKeyword].[IndexType] = 1
LEFT JOIN #cardindexes AS [colDateAdded] ON [colDateAdded].[CardID] = c.ID AND [colDateAdded].[IndexType] = 2
WHERE [colKeyword].[StringVal] LIKE 'p%' AND c.[CardTypeID] = 1
ORDER BY [DateAdded]
)
SELECT * from preResult WHERE rn = 1

project a sparse result at some level

I don't really know what to call this but it's not that hard to explain
Basically what I have is a result like this
Similarity ColumnA ColumnB ColumnC
1 SomeValue NULL SomeValue
2 NULL SomeB NULL
3 SomeValue NULL SomeC
4 SomeA NULL NULL
This result is created by matching a set of strings against another table. Each string also contains some values for these ColumnA..C which are the values I wan't to aggregate in some way.
Something like min/max works very well but I can't figure out how to get it to account for the highest similarity not just the min/max value. I don't really want the min/max, I want the first non-null value with the highest similarity.
Ideally the result would look like this
ColumnA ColumnB ColumnC
SomeA SomeB SomeC
I'd like be able to efficiently join in the temporary result to compute the rest and I've been exploring different options. Something which I've been considering is creating a SQL Server CLR aggregate the yields the "first" non-null value but I'm unsure if there's even such a thing as a first or last when running an aggregate on a result.
Okay, so I figured it out, I originally had trouble with the UPDATE FROM and JOIN not playing well together. I was counting on that the UPDATE would just occur multiple times and that would give me the correct results, however, there's no such guarantee from SQL Server (it's actually undefined behavior and alltough it appeared to work we'll have none of that) but since you can run UPDATE against a CTE I combined that with the OUTER APPLY to select the exactly 1 row to complement a missing value if possible.
Here's the whole thing with test data as well.
DECLARE #cost TABLE (
make nvarchar(100) not null,
model nvarchar(100),
a numeric(18,2),
b numeric(18,2)
);
INSERT #cost VALUES ('a%', null, 100, 2);
INSERT #cost VALUES ('a%', 'a%', 149, null);
INSERT #cost VALUES ('a%', 'ab', 349, null);
INSERT #cost VALUES ('b', null, null, 2.5);
INSERT #cost VALUES ('b', 'b%', 249, null);
INSERT #cost VALUES ('b', 'b', null, 3);
DECLARE #unit TABLE (
id int,
make nvarchar(100) not null,
model nvarchar(100)
);
INSERT #unit VALUES (1, 'a', null);
INSERT #unit VALUES (2, 'a', 'a');
INSERT #unit VALUES (3, 'a', 'ab');
INSERT #unit VALUES (4, 'b', null);
INSERT #unit VALUES (5, 'b', 'b');
DECLARE #tmp TABLE (
id int,
specificity int,
a numeric(18,2),
b numeric(18,2),
primary key(id, specificity)
);
INSERT #tmp
OUTPUT inserted.* --FOR DEBUGGING
SELECT
unit.id
, ROW_NUMBER() OVER (
PARTITION BY unit.id
ORDER BY cost.make DESC, cost.model DESC
) AS specificity
, cost.a
, cost.b
FROM #unit unit
INNER JOIN #cost cost ON unit.make LIKE cost.make
AND (cost.model IS NULL OR unit.model LIKE cost.model)
;
--fix the holes
WITH tmp AS (
SELECT *
FROM #tmp
WHERE specificity = 1
AND (a IS NULL OR b IS NULL) --where necessary
)
UPDATE tmp
SET
tmp.a = COALESCE(tmp.a, a.a)
, tmp.b = COALESCE(tmp.b, b.b)
OUTPUT inserted.* --FOR DEBUGGING
FROM tmp
OUTER APPLY (
SELECT TOP 1 a
FROM #tmp a
WHERE a.id = tmp.id
AND a.specificity > 1
AND a.a IS NOT NULL
ORDER BY a.specificity
) a
OUTER APPLY (
SELECT TOP 1 b
FROM #tmp b
WHERE b.id = tmp.id
AND b.specificity > 1
AND b.b IS NOT NULL
ORDER BY b.specificity
) b
;

SQL Server Simple Group by query

I have a simple problem , Although i believe its simple , am not able to figure out the same.
Consider i have the below table with exactly same data as given below :
CREATE TABLE #temp
(
link varchar(255),
number INT,
fname varchar(255)
)
insert into #temp VALUES ('abc',1,'f1')
insert into #temp VALUES ('abc',2,'f2')
insert into #temp VALUES ('abc',3,'f3')
insert into #temp VALUES ('abc',4,'f6')
insert into #temp VALUES ('abc',10,'f100')
insert into #temp VALUES ('abe',-1,'f0')
insert into #temp VALUES ('abe',1,'f1')
insert into #temp VALUES ('abe',2,'f2')
insert into #temp VALUES ('abe',3,'f3')
insert into #temp VALUES ('abe',4,'f6')
insert into #temp VALUES ('abe',20,'f200')
insert into #temp VALUES ('cbe',-1,'f0')
insert into #temp VALUES ('cbe',1,'f1')
insert into #temp VALUES ('cbe',2,'f2')
insert into #temp VALUES ('cbe',3,'f3')
Now for a given link , i need to get the max 'number' and the corresponding 'fname' which has the max 'number' for the given 'link'.
1)Ex : if link is 'abc' , output should be
abc, 10, f100
2)Ex : if link if 'abe' , Output should be
abe, 20, f200
3)Now link can be also given as a pattern , like (link like 'ab%') , so output should be
abc, 10, f100
abe, 20, f200
4)if (link like 'cb%') , so output should be
cbe, 3, f3
Any help in writing this group by query. I have a solution using CAST and string concat like below , but that seems to be in-efficient.
select link,number,fname from #temp
where link like 'ab%' and link+'_'+CAST(number AS varchar(255))
in (select link+'_'+CAST(MAX(number) AS varchar(255)) from #temp
group by link)
Thanks..
Using a self join:
SELECT x.link,
x.number,
x.fname
FROM #temp x
JOIN (SELECT t.link,
MAX(t.number) AS max_number
FROM #temp t
GROUP BY t.link) y ON y.link = x.link
AND y.max_number = x.number
Using a CTE and ROW_NUMBER (SQL Server 2005+):
WITH cte AS (
SELECT x.link,
x.number,
x.fname,
ROW_NUMBER() OVER(PARTITION BY x.link
ORDER BY x.number DESC) rank
FROM #temp x)
SELECT c.link,
c.number,
c.fname
FROM cte c
WHERE c.rank = 1

Select records with order of IN clause

I have
SELECT * FROM Table1 WHERE Col1 IN(4,2,6)
I want to select and return the records with the specified order which i indicate in the IN clause
(first display record with Col1=4, Col1=2, ...)
I can use
SELECT * FROM Table1 WHERE Col1 = 4
UNION ALL
SELECT * FROM Table1 WHERE Col1 = 6 , .....
but I don't want to use that, cause I want to use it as a stored procedure and not auto generated.
I know it's a bit late but the best way would be
SELECT *
FROM Table1
WHERE Col1 IN( 4, 2, 6 )
ORDER BY CHARINDEX(CAST(Col1 AS VARCHAR), '4,2,67')
Or
SELECT CHARINDEX(CAST(Col1 AS VARCHAR), '4,2,67')s_order,
*
FROM Table1
WHERE Col1 IN( 4, 2, 6 )
ORDER BY s_order
You have a couple of options. Simplest may be to put the IN parameters (they are parameters, right) in a separate table in the order you receive them, and ORDER BY that table.
The solution is along this line:
SELECT * FROM Table1
WHERE Col1 IN(4,2,6)
ORDER BY
CASE Col1
WHEN 4 THEN 1
WHEN 2 THEN 2
WHEN 6 THEN 3
END
select top 0 0 'in', 0 'order' into #i
insert into #i values(4,1)
insert into #i values(2,2)
insert into #i values(6,3)
select t.* from Table1 t inner join #i i on t.[in]=t.[col1] order by i.[order]
Replace the IN values with a table, including a column for sort order to used in the query (and be sure to expose the sort order to the calling application):
WITH OtherTable (Col1, sort_seq)
AS
(
SELECT Col1, sort_seq
FROM (
VALUES (4, 1),
(2, 2),
(6, 3)
) AS OtherTable (Col1, sort_seq)
)
SELECT T1.Col1, O1.sort_seq
FROM Table1 AS T1
INNER JOIN OtherTable AS O1
ON T1.Col1 = O1.Col1
ORDER
BY sort_seq;
In your stored proc, rather than a CTE, split the values into table (a scratch base table, temp table, function that returns a table, etc) with the sort column populated as appropriate.
I have found another solution. It's similar to the answer from onedaywhen, but it's a little shorter.
SELECT sort.n, Table1.Col1
FROM (VALUES (4), (2), (6)) AS sort(n)
JOIN Table1
ON Table1.Col1 = sort.n
I am thinking about this problem two different ways because I can't decide if this is a programming problem or a data architecture problem. Check out the code below incorporating "famous" TV animals. Let's say that we are tracking dolphins, horses, bears, dogs and orangutans. We want to return only the horses, bears, and dogs in our query and we want bears to sort ahead of horses to sort ahead of dogs. I have a personal preference to look at this as an architecture problem, but can wrap my head around looking at it as a programming problem. Let me know if you have questions.
CREATE TABLE #AnimalType (
AnimalTypeId INT NOT NULL PRIMARY KEY
, AnimalType VARCHAR(50) NOT NULL
, SortOrder INT NOT NULL)
INSERT INTO #AnimalType VALUES (1,'Dolphin',5)
INSERT INTO #AnimalType VALUES (2,'Horse',2)
INSERT INTO #AnimalType VALUES (3,'Bear',1)
INSERT INTO #AnimalType VALUES (4,'Dog',4)
INSERT INTO #AnimalType VALUES (5,'Orangutan',3)
CREATE TABLE #Actor (
ActorId INT NOT NULL PRIMARY KEY
, ActorName VARCHAR(50) NOT NULL
, AnimalTypeId INT NOT NULL)
INSERT INTO #Actor VALUES (1,'Benji',4)
INSERT INTO #Actor VALUES (2,'Lassie',4)
INSERT INTO #Actor VALUES (3,'Rin Tin Tin',4)
INSERT INTO #Actor VALUES (4,'Gentle Ben',3)
INSERT INTO #Actor VALUES (5,'Trigger',2)
INSERT INTO #Actor VALUES (6,'Flipper',1)
INSERT INTO #Actor VALUES (7,'CJ',5)
INSERT INTO #Actor VALUES (8,'Mr. Ed',2)
INSERT INTO #Actor VALUES (9,'Tiger',4)
/* If you believe this is a programming problem then this code works */
SELECT *
FROM #Actor a
WHERE a.AnimalTypeId IN (2,3,4)
ORDER BY case when a.AnimalTypeId = 3 then 1
when a.AnimalTypeId = 2 then 2
when a.AnimalTypeId = 4 then 3 end
/* If you believe that this is a data architecture problem then this code works */
SELECT *
FROM #Actor a
JOIN #AnimalType at ON a.AnimalTypeId = at.AnimalTypeId
WHERE a.AnimalTypeId IN (2,3,4)
ORDER BY at.SortOrder
DROP TABLE #Actor
DROP TABLE #AnimalType
ORDER BY CHARINDEX(','+convert(varchar,status)+',' ,
',rejected,active,submitted,approved,')
Just put a comma before and after a string in which you are finding the substring index or you can say that second parameter.
And first parameter of CHARINDEX is also surrounded by , (comma).