I need to write a T-SQL query in the most efficient way possible.
Here is an example of my table:
A    B       C      D      E
1    x, y    z      NULL   NULL
2    x       NULL   NULL   y
3    y       z      NULL   NULL
4    a       NULL   b      x
Now, I need a query to rank my best-matching records. Let's say I need to take the top 3 records that match the most of the given values, here 'x' and 'y' (there could be more than 2 values), across the columns B, C, D, E:
A    NumberOfMatches    Comment
1    2                  Because Column B contains x, y
2    2                  Because Column B contains x & Column E contains y
3    1                  Because Column B contains y
4    1                  Because Column E contains x
Could you help me to find a good way to do this query?
I would highly recommend that you change how your data is stored; storing delimited strings to record multiple values is a recipe for disaster. If you have a one-to-many relationship, use a child table with a foreign key to the main table. There is very rarely a reason to store delimited data like this, and when you have to query and manipulate it you realise why it is not advised.
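For illustration, a minimal sketch of the normalized design (the table and column names here are hypothetical, not from the question):

CREATE TABLE dbo.Main (A INT PRIMARY KEY);

CREATE TABLE dbo.MainValue (
    A INT NOT NULL REFERENCES dbo.Main (A), -- foreign key to the main table
    Col CHAR(1) NOT NULL,                   -- which original column the value came from (B/C/D/E)
    Value VARCHAR(4) NOT NULL               -- exactly one value per row, no delimiters
);

With that shape, counting matches is a plain filtered GROUP BY and needs no string splitting at all.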
Assuming that each of your columns is treated the same and can hold multiple delimited values, you'll need to split all of them, which you can do using this:
SELECT t.A, upvt.Col, Value = TRIM(ss.value)
FROM #T AS t
CROSS APPLY (VALUES ('B', t.B), ('C', t.C), ('D', t.D), ('E', t.E)) upvt (Col, Value)
CROSS APPLY STRING_SPLIT(upvt.Value, ',') AS ss;
Which gives:
A    Col    Value
1    B      x
1    B      y
1    C      z
2    B      x
2    E      y
3    B      y
3    C      z
4    B      a
4    D      b
4    E      x
With this normalised data, you can then just do a simple WHERE Value IN ('x', 'y') along with GROUP BY and COUNT(*):
IF OBJECT_ID(N'tempdb..#T', 'U') IS NOT NULL
DROP TABLE #T;
CREATE TABLE #T (A INT, B VARCHAR(4), C VARCHAR(4), D VARCHAR(4), E VARCHAR(4));
INSERT #T(A, B, C, D, E)
VALUES
(1, 'x, y', 'z', NULL, NULL),
(2, 'x', NULL, NULL, 'y'),
(3, 'y', 'z', NULL, NULL),
(4, 'a', NULL, 'b', 'x');
SELECT t.A, NumberOfMatches = COUNT(*)
FROM #T AS t
CROSS APPLY (VALUES (t.B), (t.C), (t.D), (t.E)) upvt (Value)
CROSS APPLY STRING_SPLIT(upvt.Value, ',') AS ss
WHERE TRIM(ss.value) IN ('x', 'y')
GROUP BY t.A
ORDER BY COUNT(*) DESC, t.A;
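The question asks for only the top 3 records; if that limit is needed, the same query just takes a TOP clause:

SELECT TOP (3) t.A, NumberOfMatches = COUNT(*)
FROM #T AS t
CROSS APPLY (VALUES (t.B), (t.C), (t.D), (t.E)) upvt (Value)
CROSS APPLY STRING_SPLIT(upvt.Value, ',') AS ss
WHERE TRIM(ss.value) IN ('x', 'y')
GROUP BY t.A
ORDER BY COUNT(*) DESC, t.A;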
If you can't normalize the model, you can use this query to get your expected result:
select a, count(*) numberOfMatches,
concat('Because column ', string_agg(columnName, ', ')) AS comment
from (
select a, columnName, trim(value) targetValue from (
select a, columnName, value result
from tbl unpivot (value for columnName in ([B], [C], [D], [E])) up
) t
outer apply string_split(result,',')
) r
where targetValue in ('x','y')
group by a
-- Result
/*
a numberOfMatches comment
1 2 Because column B, B
2 2 Because column B, E
3 1 Because column B
4 1 Because column E
*/
Update
You can use this custom function to implement STRING_SPLIT on SQL Server versions prior to 2016.
IF OBJECT_ID('[dbo].[STRING_SPLIT]','IF') IS NULL BEGIN
EXEC ('CREATE FUNCTION [dbo].[STRING_SPLIT] () RETURNS TABLE AS RETURN SELECT 1 X')
END
GO
ALTER FUNCTION [dbo].[STRING_SPLIT]
(
@string nvarchar(MAX),
@separator nvarchar(MAX)
)
RETURNS TABLE WITH SCHEMABINDING
AS RETURN
WITH X(N) AS (SELECT 'Table1' FROM (VALUES (0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0)) T(C)),
Y(N) AS (SELECT 'Table2' FROM X A1, X A2, X A3, X A4, X A5, X A6, X A7, X A8) , -- Up to 16^8 = 4 billion
T(N) AS (SELECT TOP(ISNULL(LEN(@string),0)) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) -1 N FROM Y),
Delim(Pos) AS (SELECT t.N FROM T WHERE (SUBSTRING(@string, t.N, LEN(@separator+'x')-1) LIKE @separator OR t.N = 0)),
Separated(value) AS (SELECT SUBSTRING(@string, d.Pos + LEN(@separator+'x')-1, LEAD(d.Pos,1,2147483647) OVER (ORDER BY (SELECT NULL)) - d.Pos - LEN(@separator))
FROM Delim d
WHERE @string IS NOT NULL)
SELECT s.value
FROM Separated s
WHERE s.value <> @separator
GO
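A quick sanity check of the function (the expected output in the comment is my reading of the code, not a verified result):

SELECT value FROM dbo.STRING_SPLIT(N'x, y', N',');
-- Expected: two rows, 'x' and ' y' (the queries above still apply TRIM to each value)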
Update 2
To count just 1 when the same field contains x and y at the same time:
select a, count(distinct columnName) numberOfMatches,
concat('Because column ', string_agg(columnName, ', ')) AS comment
from (
select a, columnName, trim(value) targetValue from (
select a, columnName, value result
from tbl unpivot (value for columnName in ([B], [C], [D], [E])) up
) t
outer apply string_split(result,',')
) r
where targetValue in ('x','y')
group by a
/* another alternative */
select a, count(*) numberOfMatches,
concat('Because column ', string_agg(columnName, ', ')) AS comment
from (
select a, columnName, value targetValue
from tbl unpivot (value for columnName in ([B], [C], [D], [E])) up
) t
where targetValue like '%x%' or targetValue like '%y%'
group by a
-- Result
/*
a numberOfMatches comment
1 1 Because column B
2 2 Because column B, E
3 1 Because column B
4 1 Because column E
*/
Related
I have three columns X, Y & Z. I want to do a select statement that returns only one column.
For example for the following rows:
X Y Z
1 0 0
0 1 0
0 0 1
I want to return one column A:
A
X
Y
Z
So, wherever there is a one, the column should return a string corresponding to the column name where it is one.
I don't have rights to create a new column in the database and then update it using a WHERE condition, so I was wondering if it could be done inside a SELECT statement.
Without dealing with multiple 1 values in one row:
select case
when x = 1 then 'X'
when y = 1 then 'Y'
when z = 1 then 'Z'
end as A
from the_table;
If you are using Postgres and have a primary key column on that table, you can use JSON functions to make this dynamic and not hardcode the column names in the query.
Test data setup:
create table the_table (id integer, x int, y int, z int);
insert into the_table
(id,x,y,z)
values
(1, 1, 0, 0),
(2, 0, 1, 0),
(3, 0, 0, 1),
(4, 0, 1, 1),
(5, 0, 0, 0);
Then using this query:
select t.id, string_agg(k.col,'' order by k.col) as non_zero_columns
from the_table t,
jsonb_each(to_jsonb(t) - 'id') as k (col, val)
where k.val = '1'
group by id
order by id;
Will return this result:
id | non_zero_columns
---+-----------------
1 | x
2 | y
3 | z
4 | yz
Note that the row with ID=5 is not returned because all columns are zero.
If you have multiple columns with 1s in one row:
select ((case when x = 1 then 'X' else '' end) ||
(case when y = 1 then 'Y' else '' end) ||
(case when z = 1 then 'Z' else '' end)
) as A
from the_table;
Note that || is the ANSI standard operator for string concatenation. Some databases use other methods for concatenating strings.
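For example, in SQL Server the same idea can be written with + (a direct translation of the query above):

SELECT ((CASE WHEN x = 1 THEN 'X' ELSE '' END) +
        (CASE WHEN y = 1 THEN 'Y' ELSE '' END) +
        (CASE WHEN z = 1 THEN 'Z' ELSE '' END)
       ) AS A
FROM the_table;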
If you are using MS SQL Server, you can use UNPIVOT:
DECLARE @MyTable TABLE (X INT, Y INT, Z INT)
INSERT INTO @MyTable VALUES
(1 ,0, 0 ),
(0 ,1, 0 ),
(0 ,0, 1 )
SELECT DISTINCT A FROM
(select * from @MyTable UNPIVOT( V FOR A IN ([X], [Y], [Z])) UNPVT) UP
WHERE V = 1
Result:
A
---------
X
Y
Z
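If you prefer to avoid UNPIVOT, an equivalent CROSS APPLY (VALUES ...) sketch against the same table variable gives the same result:

SELECT DISTINCT v.A
FROM @MyTable AS t
CROSS APPLY (VALUES ('X', t.X), ('Y', t.Y), ('Z', t.Z)) AS v (A, V)
WHERE v.V = 1;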
You can use INFORMATION_SCHEMA with the find_in_set(str, strlist) function (MySQL):
-- the schema
create table `docs` (
`X` int not null,
`Y` int not null,
`Z` int not null
);
insert into `docs` (`X`, `Y`, `Z`) values
(1, 0, 0),
(0, 1, 0),
(0, 0, 1);
-- the query
select column_name as A, concat(x, y, z)
from docs
join INFORMATION_SCHEMA.COLUMNS
on (ordinal_position = find_in_set('1', concat(x, ',', y, ',', z)))
where table_name like 'docs';
I need a suggestion to split the string in Table 1, match its Ids with Table 2, and concatenate the values.
Table - 1
Id Tbl1Col
1 2
2 2,4
3
4 6
5 3
Table - 2
Id Tbl2Col
1 E
2 F
3 M
4 U
5 P
6 C
7 N
8 G
Query -
SELECT T2.Tbl2Col
FROM Table1 AS T1
LEFT JOIN Table2 AS T2 ON T1.Tbl1Col = T2.Id
WHERE T1.Id = @Id
Now if @Id = 1, the output is F -- works fine
Now if @Id = 2, the output should be FU -- not F,U
Yuck! But you can use LIKE:
SELECT T2.Tbl2Col
FROM Table1 T1 LEFT JOIN
Table2 T2
ON ',' + T1.Tbl1Col + ',' LIKE '%,' + CAST(T2.Id as VARCHAR(255)) + ',%'
WHERE T1.Id = @Id;
You have a lousy data format, so this cannot make use of indexes. You should really have a separate table, with one row per Table1.id and Table2.id. Such a table is called a junction table or an association table.
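For illustration, such a junction table might look like this (the names are hypothetical):

CREATE TABLE dbo.Table1Table2 (
    Table1Id INT NOT NULL, -- references Table1.Id
    Table2Id INT NOT NULL, -- references Table2.Id
    PRIMARY KEY (Table1Id, Table2Id)
);

-- The lookup then becomes a plain, indexable join:
SELECT T2.Tbl2Col
FROM dbo.Table1Table2 AS j
JOIN Table2 AS T2 ON T2.Id = j.Table2Id
WHERE j.Table1Id = @Id;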
create table dbo.Table01 (
Id int
, Col varchar(100)
);
create table dbo.Table02 (
Id int
, Col varchar(100)
);
insert into dbo.Table01 (Id, Col)
values (1, '2'), (2, '2, 4');
insert into dbo.Table02 (Id, Col)
values (1, 'E'), (2, 'F'), (4, 'U');
select
t.Id
, replace(STRING_AGG (t02.Col, ','), ',', '') as StringAgg
from dbo.Table01 t
cross apply string_split (t.Col, ',') as ss
inner join dbo.Table02 t02 on ss.value = t02.Id
group by t.id
Follow this approach:
1) Turn the comma-separated string into individual rows using CROSS APPLY with XML.
2) Join the two tables with a left join.
3) Concatenate the rows with the same id using STUFF & FOR XML.
4) Use the REPLACE function to remove the commas.
Demo:
declare @MyTable table (id int , Tbl1Col varchar(10))
insert into @MyTable values (1,'2'),(2,'2,4'),(3,''),(4,'6'),(5,'3')
declare @MyTable2 table (id int , Tbl2Col varchar(10))
insert into @MyTable2 values (1,'E'),(2,'F'),(3,'M'),(4,'U'),(5,'P'),(6,'C'),(7,'N'),(8,'G')
select a.id , Tbl2Col
into #TestTable
from
(
SELECT A.id,
Split.a.value('.', 'VARCHAR(100)') AS Tbl1Col
FROM
(
SELECT id,
CAST ('<M>' + REPLACE(Tbl1Col, ',', '</M><M>') + '</M>' AS XML) AS Data
FROM @MyTable
) AS A CROSS APPLY Data.nodes ('/M') AS Split(a) ) a
left join @MyTable2 b
on a.Tbl1Col = b.id
order by a.id
SELECT id, Tbl2Col =
Replace(STUFF((SELECT DISTINCT ', ' + Tbl2Col
FROM #TestTable b
WHERE b.id = a.id
FOR XML PATH('')), 1, 2, ''),',','')
FROM #TestTable a
GROUP BY id
Output:
1 F
2 F U
3 NULL
4 C
5 M
References:
Turning a Comma Separated string into individual rows
How to concatenate many rows with same id in sql?
Finally:
Don't use this approach; normalize your database instead. Use the code above only for fun/training/experimentation.
I have a table with 2 columns, OLD_VALUE and NEW_VALUE, and 5 rows. The 1st row has values (A,B); the other rows are (B,C), (C,D), (E,D), (D,F). I want to update all the old values with the newest value (the way a VLOOKUP in Excel would work).
The Final Result Required: the newest value in the above example would be (D,F), i.e. D points to F. E and C point to D, B points to C, and A points to B. (D,F) is the last and newest succession; there are no more successions after it.
So (OLD_VALUE, NEW_VALUE) -> (A,F), (B,F), (C,F), (D,F), (E,F). I want 5 rows with the NEW_VALUE as 'F'. The level of successions can range from 1 to x.
This is the table I have used for the script:
declare @t as table(old_value char(1), new_value char(1));
insert into @t values('A','B')
insert into @t values('B','C')
insert into @t values('C','D')
insert into @t values('E','D')
insert into @t values('D','F')
This needs to be done with a recursive CTE. First, you will need to define an anchor for the CTE. The anchor in this case should be the record with the latest value. This is how I define the anchor:
select old_value, new_value, 1 as level
from @t
where new_value NOT IN (select old_value from @t)
And here is the recursive CTE I used to locate the latest value for each row:
;with a as(
select old_value, new_value, 1 as level
from @t
where new_value NOT IN (select old_value from @t)
union all
select b.old_value, a.new_value, a.level + 1
from a INNER JOIN @t b ON a.old_value = b.new_value
)
select * from a
Results:
old_value new_value level
--------- --------- -----------
D F 1
C F 2
E F 2
B F 3
A F 4
(5 row(s) affected)
I think a recursive CTE like the following is what you're looking for (where the parent is the row whose second value does not exist as a first value elsewhere). If there's no parent(s) to anchor to, this would fail (e.g. if you had A->B, B->C, C->A, you'd get no result), but it should work for your case:
DECLARE @T TABLE (val1 CHAR(1), val2 CHAR(1));
INSERT @T VALUES ('A', 'B'), ('B', 'C'), ('C', 'D'), ('E', 'D'), ('D', 'F');
WITH CTE AS
(
    SELECT val1, val2
    FROM @T AS T
    WHERE NOT EXISTS (SELECT 1 FROM @T WHERE val1 = T.val2)
    UNION ALL
    SELECT T.val1, CTE.val2
    FROM @T AS T
    JOIN CTE
        ON CTE.val1 = T.val2
)
SELECT *
FROM CTE;
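The question asks to update the old values rather than just select the resolved pairs; assuming that is the end goal, the same CTE can drive an UPDATE (a sketch against the table variable above):

WITH CTE AS
(
    SELECT val1, val2
    FROM @T AS T
    WHERE NOT EXISTS (SELECT 1 FROM @T WHERE val1 = T.val2)
    UNION ALL
    SELECT T.val1, CTE.val2
    FROM @T AS T
    JOIN CTE
        ON CTE.val1 = T.val2
)
UPDATE T
SET T.val2 = CTE.val2 -- point every old value at the newest value
FROM @T AS T
JOIN CTE ON CTE.val1 = T.val1;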
Currently I have a system that is dumping data into a table with the format:
Table1
Id#, row#, row_dump
222, 1, "set1 = aaaa set2 =aaaaaa aaaa dd set4=1111"
I want to take the row dump and transpose it into rows and insert it into another table of the format:
Table2
Id#, setting, value
222, 'set1', 'aaaa'
222, 'set2', 'aaaaaa aaaa dd'
222, 'set4', '1111'
Is there a way to make a trigger in MSSQL that will parse this string on insert in Table1 and insert it into Table2 properly?
All of the examples I’ve found required a common delimiter. ‘=’ separates the setting from the value, space(s) separate a value from a setting but a value could have spaces in it (settings do not have spaces in them so the last word before the equal sign is the setting name but there could be spaces between the setting name and equal sign).
There could be 1-5 settings and values in any given row. The values can have spaces. There may or may not be space between the setting name and the ‘=’ sign.
I have no control over the original insert process or format as it is used for other purposes.
You could use 'set' as a delimiter. This is a simple sample. It obviously may have to be molded to your environment.
use tempdb
GO
IF OBJECT_ID('dbo.fn_TVF_Split') IS NOT NULL
DROP FUNCTION dbo.fn_TVF_Split;
GO
CREATE FUNCTION dbo.fn_TVF_Split(@arr AS NVARCHAR(2000), @sep AS NCHAR(3))
RETURNS TABLE
AS
RETURN
WITH
L0 AS (SELECT 1 AS C UNION ALL SELECT 1) --2 rows
,L1 AS (SELECT 1 AS C FROM L0 AS A, L0 AS B) --4 rows (2x2)
,L2 AS (SELECT 1 AS C FROM L1 AS A, L1 AS B) --16 rows (4x4)
,L3 AS (SELECT 1 AS C FROM L2 AS A, L2 AS B) --256 rows (16x16)
,L4 AS (SELECT 1 AS C FROM L3 AS A, L3 AS B) --65536 rows (256x256)
,L5 AS (SELECT 1 AS C FROM L4 AS A, L4 AS B) --4,294,967,296 rows (65536x65536)
,Nums AS (SELECT row_number() OVER (ORDER BY (SELECT 0)) AS N FROM L5)
SELECT
(n - 1) - LEN(REPLACE(LEFT(@arr, n-1), @sep, N'')) + 1 AS pos,
SUBSTRING(@arr, n, CHARINDEX(@sep, @arr + @sep, n) - n) AS element
FROM Nums
WHERE
n <= LEN(@arr) + 3
AND SUBSTRING(@sep + @arr, n, 3) = @sep
AND N<=100000
GO
declare @t table(
Id int,
row int,
row_dump varchar(Max)
);
insert into @t values(222, 1, 'set1 = aaaa set2 =aaaaaa aaaa dd set4=1111')
insert into @t values(111, 2, ' set1 =cx set2 =4444set4=124')
DECLARE @t2 TABLE(
Id int,
Setting VARCHAR(6),
[Value] VARCHAR(50)
)
insert into @t2 (Id,Setting,Value)
select
Id,
[Setting]='set' + left(LTRIM(element),1),
[Value]=RIGHT(element,charindex('=',reverse(element))-1)
from @t t
cross apply dbo.fn_TVF_Split(row_dump,'set')
where pos > 1
order by
id asc,
'set' + left(LTRIM(element),1) asc
select *
from @t2
Update
You could do something like this. It is not optimal and could probably be better handled in the transformation tool or application. Anyway here we go.
Note: You will need the split function I posted before.
declare @t table(
Id int,
row int,
row_dump varchar(Max)
);
insert into @t values(222, 1, 'set1 = aaaa set2 =aaaaaa aaaa dd set3=abc set4=1111 set5=7373')
insert into @t values(111, 2, 'set1 =cx set2 = 4444 set4=124')
DECLARE @t2 TABLE(
Id int,
Setting VARCHAR(6),
[Value] VARCHAR(50)
)
if OBJECT_ID('tempdb.dbo.#Vals') IS NOT NULL
BEGIN
DROP TABLE #Vals;
END
CREATE TABLE #Vals(
Id INT,
Row INT,
Element VARCHAR(MAX),
pos int,
value VARCHAR(MAX)
);
insert into #Vals
select
Id,
row,
element,
pos,
Value=STUFF(LEFT(element,len(element) - CHARINDEX(' ',reverse(element))),1,1,'')
from(
select
Id,
row,
row_dump = REPLACE(REPLACE(REPLACE(row_dump,'= ','='),' =','='),'=','=|')
from @t
) AS t
cross apply dbo.fn_TVF_Split(row_dump,'=')
where pos >=1 and pos < 10
insert into @t2 (Id,Setting,Value)
select
t1.Id,
Setting =
(
SELECT TOP 1
CASE WHEN t2.pos = 1
THEN LTRIM(RTRIM(t2.element))
ELSE LTRIM(RTRIM(RIGHT(t2.element,CHARINDEX(' ',REVERSE(t2.element)))))
END
FROM #Vals t2
where
t2.Id = t1.id
and t2.row = t1.row
and t2.pos < t1.pos
ORDER BY t2.pos DESC
),
t1.Value
from #Vals t1
where t1.pos > 1 and t1.pos < 10
order by t1.id,t1.row,t1.pos
select * from @t2
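To wrap this in the trigger the question actually asks for, a sketch along these lines could work, assuming permanent dbo.Table1 and dbo.Table2 tables shaped as described (their exact definitions are an assumption here) and reusing the expressions from the first query:

CREATE TRIGGER dbo.trg_Table1_ParseDump
ON dbo.Table1
AFTER INSERT
AS
BEGIN
    SET NOCOUNT ON;
    INSERT INTO dbo.Table2 (Id, setting, [value])
    SELECT i.Id,
           'set' + LEFT(LTRIM(s.element), 1),                       -- setting name, derived as above
           RIGHT(s.element, CHARINDEX('=', REVERSE(s.element)) - 1) -- text after the '='
    FROM inserted AS i
    CROSS APPLY dbo.fn_TVF_Split(i.row_dump, 'set') AS s
    WHERE s.pos > 1;
END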
My question is how to identify duplicate (repeating) 'groups' of data within an SQL table. I am using SQL Server 2005 at the moment so prefer solutions based on that or ansi-sql.
Here is a sample table and expected result (below) to base this question on:
declare @data table (id nvarchar(10), fund nvarchar(1), xtype nvarchar(1))
insert into @data select 'Switch_1', 'A', 'S'
insert into @data select 'Switch_1', 'X', 'B'
insert into @data select 'Switch_1', 'Y', 'B'
insert into @data select 'Switch_1', 'Z', 'B'
insert into @data select 'Switch_2', 'A', 'S'
insert into @data select 'Switch_2', 'X', 'B'
insert into @data select 'Switch_2', 'Y', 'B'
insert into @data select 'Switch_2', 'Z', 'B'
insert into @data select 'Switch_3', 'C', 'S'
insert into @data select 'Switch_3', 'D', 'B'
insert into @data select 'Switch_4', 'C', 'S'
insert into @data select 'Switch_4', 'F', 'B'
(new data)
insert into @data select 'Switch_5', 'A', 'S'
insert into @data select 'Switch_5', 'X', 'B'
insert into @data select 'Switch_5', 'Y', 'B'
insert into @data select 'Switch_5', 'Z', 'B'
-- id fund xtype match
-- ---------- ---- ----- ---------
-- Switch_1 A S Match_1
-- Switch_1 X B Match_1
-- Switch_1 Y B Match_1
-- Switch_1 Z B Match_1
-- Switch_2 A S Match_1
-- Switch_2 X B Match_1
-- Switch_2 Y B Match_1
-- Switch_2 Z B Match_1
-- Switch_3 C S
-- Switch_3 D B
-- Switch_4 C S
-- Switch_4 F B
(new results)
-- Switch_5 A S Match_1
-- Switch_5 X B Match_1
-- Switch_5 Y B Match_1
-- Switch_5 Z B Match_1
I only want matches on an ALL or NOTHING basis (i.e. all records in the group match all records in another group - not a partial match). Any match id can be used (I have used Match_1 above, but it can be numeric etc.)
Thanks for any help here.
(EDIT: I guess I should add that there could be any number of rows per group, not just the 2 or 4 shown in the sample above - and I'm also trying to avoid cursors)
(EDIT 2: I seem to have an issue if there are more than one matches found. The output from the SQL supplied is returning duplicate records for Switch_1 when there are more than one matches found. I have updated the sample data accordingly. Not sure if Lieven is still following this - I'm also looking at the solution and will post here if found.)
The flow of execution is as follows:
q: Combine all funds and xtypes of one id into one string using an XML PATH construction
r: Select a ROW_NUMBER and the respective id's for matching groups
Select the results by LEFT JOINING #data and r
SQL Statement
;WITH q AS (
SELECT DISTINCT d.id
, DuplicateData = STUFF((SELECT ', ' + fund + xtype FROM @data WHERE id = d.id FOR XML PATH('')), 1, 2, '')
FROM @data d
)
, r AS (
SELECT id1 = q1.id
, id2 = q2.id
, rn = ROW_NUMBER() OVER (ORDER BY q1.ID)
FROM q q1
INNER JOIN q q2 ON q1.DuplicateData = q2.DuplicateData AND q1.id < q2.id
)
SELECT id
, fund
, xtype
, match = 'Match_' + CAST(r.rn AS VARCHAR(32))
FROM @data d
LEFT OUTER JOIN r ON d.id IN (r.id1, r.id2)
Results
id fund xtype match
---------- ---- ----- --------------------------------------
Switch_1 A S Match_1
Switch_1 X B Match_1
Switch_1 Y B Match_1
Switch_1 Z B Match_1
Switch_2 A S Match_1
Switch_2 X B Match_1
Switch_2 Y B Match_1
Switch_2 Z B Match_1
Switch_3 C S NULL
Switch_3 D B NULL
Switch_4 C S NULL
Switch_4 F B NULL
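Side note: the question targets SQL Server 2005, which is why the fingerprint uses FOR XML PATH. On SQL Server 2017+ the q CTE could build the same fingerprint with STRING_AGG, with an explicit WITHIN GROUP ordering so the comparison is deterministic (a sketch):

SELECT id
, DuplicateData = STRING_AGG(fund + xtype, ', ') WITHIN GROUP (ORDER BY fund, xtype)
FROM @data
GROUP BY id;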
Here is another query for it:
create table #temp1 (
id varchar(10),
fund nvarchar(1),
xtype nvarchar(1)
)
insert into #temp1 select 'Switch_1', 'A', 'S'
insert into #temp1 select 'Switch_1', 'X', 'B'
insert into #temp1 select 'Switch_1', 'Y', 'B'
insert into #temp1 select 'Switch_1', 'Z', 'B'
insert into #temp1 select 'Switch_2', 'A', 'S'
insert into #temp1 select 'Switch_2', 'X', 'B'
insert into #temp1 select 'Switch_2', 'Y', 'B'
insert into #temp1 select 'Switch_2', 'Z', 'B'
insert into #temp1 select 'Switch_3', 'C', 'S'
insert into #temp1 select 'Switch_3', 'D', 'B'
insert into #temp1 select 'Switch_4', 'C', 'S'
insert into #temp1 select 'Switch_4', 'F', 'B'
select t1.*, case when t2.equal = t3.total then 'True' else 'False' end as 'Match' from #temp1 t1
left outer join (select m.id, count(m2.id) as 'equal' from #temp1 m
inner join #temp1 m2 on m.Id <> m2.Id and m.fund = m2.fund and m.xtype = m2.xtype
group by m.id) t2 on t1.id = t2.id
inner join (select m3.id, count(m3.fund) as 'total' from #temp1 m3 group by m3.id) t3 on t3.id = t1.id
drop table #temp1