SQL Update First record of Duplicate row in table - sql

I am looking to update the first record when a duplicate is found in a table.
-- Sample schema for the question: an author list and a book list.
-- In the sample rows below, Col2 holds the author name in both tables.
CREATE TABLE tblauthor (
    Col1 VARCHAR(20),
    Col2 VARCHAR(30)
);

CREATE TABLE tblbook (
    Col1 VARCHAR(20),
    Col2 VARCHAR(30),
    Col3 VARCHAR(30)
);

-- Four authors.
INSERT INTO tblAuthor (Col1, Col2)
VALUES
    ('1', 'John'),
    ('2', 'Jane'),
    ('3', 'Jack'),
    ('4', 'Joe');

-- Seven books; John, Joe and Jane each appear twice, Jack once.
INSERT INTO tblbook (Col1, Col2, Col3)
VALUES
    ('1', 'John', 'Book 1'),
    ('2', 'John', 'Book 2'),
    ('3', 'Jack', 'Book 1'),
    ('4', 'Joe',  'Book 1'),
    ('5', 'Joe',  'Book 2'),
    ('6', 'Jane', 'Book 1'),
    ('7', 'Jane', 'Book 2');
The update result I want to accomplish should update the records as follows. I would like tblbook.col3 = 1st.
select * from tblbook
('1', 'John','1st'),
('3', 'Jack','1st'),
('4', 'Joe','1st'),
('6', 'Jane','1st');
Can't seem to even get this done with distinct.

Use ROW_NUMBER to assign a number to each row grouped by the Author's name (col2) and then update the ones that have a number of 1
-- Stamp '1st' on the first book row of every author name.
-- "First" means the lowest col1 within each col2 group; col1 is a varchar column,
-- so the ordering is a string comparison.
-- Only tblbook is needed to number its own rows, so no join to tblauthor is made.
UPDATE tblbook
SET col3 = '1st'
WHERE col1 IN (
    SELECT ranked.col1
    FROM (
        SELECT
            b.col1,
            ROW_NUMBER() OVER (PARTITION BY b.col2 ORDER BY b.col1) AS rownum
        FROM tblbook AS b
    ) AS ranked
    WHERE ranked.rownum = 1
);
Fiddle: http://sqlfiddle.com/#!3/4b6c8/20/0

If you want to update tblbook so the third column is '1st' on duplicates, then you can easily do so with an updatable CTE:
-- Updatable CTE: number each author's rows in tblbook by col1, then update the
-- row numbered 1 through the CTE (the change is applied to tblbook itself).
WITH toupdate AS (
    SELECT
        b.col3,
        ROW_NUMBER() OVER (PARTITION BY b.col2 ORDER BY b.col1) AS seqnum
    FROM tblbook AS b
)
UPDATE toupdate
SET col3 = '1st'
WHERE seqnum = 1;
This is the closest that I can come to understanding what you really want.

Related

Combine subqueries without views

I work with languages where I can assign intermediate outputs to a variable and then work with those variables to create a final output. I know SQL doesn't work this way as much. Currently I have queries that require me to make subsets of tables, and then I want to join those subsets together. I can mimic the variable assignment I do in my native languages using a VIEW, but I want to know how to do this using a single query (otherwise the database will quickly get cluttered with views).
Below is a MWE to make 2 initial tables DeleteMe1 and DeleteMe2 (at the end). Then I'd use these two views to get current snapshots of each table. Last I'd use LEFT JOIN with the views to merge the 2 data sets.
Is there a way to see the SQL that is actually executed for the query under the "Join snapshoted views" header below, i.e. with the view definitions expanded?
How could I eliminate the views intermediate step and combine into a single SQL query?
Create views for current snapshot:
-- Current snapshot of DeleteMe1: for every OppId, the distinct row(s) carrying the
-- most recent LastModifiedDate that is not later than GETDATE().
CREATE VIEW [dbo].[CurrentSnapshotDeleteMe1]
AS
WITH latest_per_opp AS (
    -- newest non-future modification date for each OppId
    SELECT
        src.[OppId],
        MAX(src.[LastModifiedDate]) AS MaxLastModifiedDate
    FROM [dbo].DeleteMe1 AS src
    WHERE src.[LastModifiedDate] <= GETDATE()
    GROUP BY src.[OppId]
)
SELECT DISTINCT
    cur.[Id],
    cur.[OppId],
    cur.[LastModifiedDate],
    cur.[Stage]
FROM [dbo].DeleteMe1 AS cur
INNER JOIN latest_per_opp AS lpo
    ON  cur.[OppId] = lpo.[OppId]
    AND cur.[LastModifiedDate] = lpo.MaxLastModifiedDate
GO
-- Current snapshot of DeleteMe2: for every OppId, the distinct row(s) carrying the
-- most recent LastModifiedDate that is not later than GETDATE().
CREATE VIEW [dbo].[CurrentSnapshotDeleteMe2]
AS
WITH latest_per_opp AS (
    -- newest non-future modification date for each OppId
    SELECT
        src.[OppId],
        MAX(src.[LastModifiedDate]) AS MaxLastModifiedDate
    FROM [dbo].DeleteMe2 AS src
    WHERE src.[LastModifiedDate] <= GETDATE()
    GROUP BY src.[OppId]
)
SELECT DISTINCT
    cur.[Id],
    cur.[OppId],
    cur.[LastModifiedDate],
    cur.[State]
FROM [dbo].DeleteMe2 AS cur
INNER JOIN latest_per_opp AS lpo
    ON  cur.[OppId] = lpo.[OppId]
    AND cur.[LastModifiedDate] = lpo.MaxLastModifiedDate
GO
Join snapshoted views:
-- Every row of the DeleteMe1 snapshot, paired with the DeleteMe2 snapshot rows
-- for the same OppId (DeleteMe2 columns are NULL when there is no match).
SELECT
    s1.[Id]               AS IdDM1,
    s1.[OppId],
    s1.[LastModifiedDate] AS LastModifiedDateDM1,
    s1.[Stage],
    s2.[Id]               AS IdDM2,
    s2.[LastModifiedDate] AS LastModifiedDateDM2,
    s2.[State]
FROM [dbo].[CurrentSnapshotDeleteMe1] AS s1
LEFT OUTER JOIN [dbo].[CurrentSnapshotDeleteMe2] AS s2
    ON s1.OppId = s2.OppId
Create original tables:
-- Sample table 1: a dated history of Stage values per OppId.
CREATE TABLE DeleteMe1
(
    [Id]               INT,
    [OppId]            INT,
    [LastModifiedDate] DATE,
    [Stage]            VARCHAR(250)
);

-- Explicit column list and integer literals, so the insert does not depend on
-- column order or on implicit string-to-int conversion.
INSERT INTO DeleteMe1 ([Id], [OppId], [LastModifiedDate], [Stage])
VALUES
    (1, 1, '2019-04-01', 'A'),
    (2, 1, '2019-05-01', 'E'),
    (3, 1, '2019-06-01', 'B'),
    (4, 2, '2019-07-01', 'A'),
    (5, 2, '2019-08-01', 'B'),
    (6, 3, '2019-09-01', 'C'),
    (7, 4, '2019-10-01', 'B'),
    (8, 4, '2019-11-01', 'C');
-- Sample table 2: a dated history of State values per OppId.
CREATE TABLE DeleteMe2
(
    [Id]               INT,
    [OppId]            INT,
    [LastModifiedDate] DATE,
    [State]            VARCHAR(250)
);

-- Explicit column list and integer literals, so the insert does not depend on
-- column order or on implicit conversion of space-padded strings to int.
INSERT INTO DeleteMe2 ([Id], [OppId], [LastModifiedDate], [State])
VALUES
    ( 1, 1, '2018-07-01', 'California'),
    ( 2, 1, '2017-11-01', 'Delaware'),
    ( 3, 4, '2017-12-01', 'California'),
    ( 4, 2, '2018-01-01', 'Alaska'),
    ( 5, 4, '2018-02-01', 'Delaware'),
    ( 6, 2, '2018-09-01', 'Delaware'),
    ( 7, 3, '2018-04-01', 'Alaska'),
    ( 8, 1, '2018-05-01', 'Hawaii'),
    ( 9, 4, '2018-06-01', 'California'),
    (10, 1, '2018-07-01', 'Connecticut'),
    (11, 2, '2018-08-01', 'Delaware'),
    (12, 2, '2018-09-01', 'California');
I work with languages where I can assign intermediate outputs to a variable and then work with those variables to create a final output. I know SQL doesn't work this way as much.
Well, that's not true, sql does work this way, or at least sql-server does. You have temp tables and table variables.
Although you named your tables DeleteMe, from your statements it seems like it's the views you wish to treat as variables. So I'll focus on this.
Here's how to do it for your first view. It puts the results into a temporary table called #snapshot1:
-- Optional: drop a leftover copy in case this is re-run on the same connection.
-- The guard has to name the same temp table that is dropped and re-created below.
IF OBJECT_ID('tempdb..#snapshot1') IS NOT NULL
    DROP TABLE #snapshot1;

-- Current snapshot of DeleteMe1 (latest non-future row per OppId), materialised
-- into the session-scoped temp table #snapshot1.
SELECT DISTINCT
    t.Id,
    t.OppId,
    t.LastModifiedDate,
    t.Stage
INTO #snapshot1
FROM dbo.DeleteMe1 AS t
INNER JOIN (
    SELECT
        OppId,
        MAX(LastModifiedDate) AS MaxLastModifiedDate
    FROM dbo.DeleteMe1
    WHERE LastModifiedDate <= GETDATE()
    GROUP BY OppId
) AS referenceGroup
    ON  t.OppId = referenceGroup.OppId
    AND t.LastModifiedDate = referenceGroup.MaxLastModifiedDate;
The leading # tells SQL Server that the table is to be stored temporarily. #snapshot1 will not survive when your connection closes.
Alternatively, you can create a table variable.
-- A table variable is declared with an @ prefix (a # prefix names a temp table).
DECLARE @snapshot1 TABLE (
    id               INT,
    oppId            INT,
    lastModifiedDate DATE,
    stage            VARCHAR(250)  -- same width as DeleteMe1.Stage, so every source value fits
);

-- Fill it with the current snapshot of DeleteMe1 (latest non-future row per OppId).
INSERT INTO @snapshot1 (id, oppId, lastModifiedDate, stage)
SELECT DISTINCT
    t.Id,
    t.OppId,
    t.LastModifiedDate,
    t.Stage
FROM dbo.DeleteMe1 AS t
INNER JOIN (
    SELECT
        OppId,
        MAX(LastModifiedDate) AS MaxLastModifiedDate
    FROM dbo.DeleteMe1
    WHERE LastModifiedDate <= GETDATE()
    GROUP BY OppId
) AS referenceGroup
    ON  t.OppId = referenceGroup.OppId
    AND t.LastModifiedDate = referenceGroup.MaxLastModifiedDate;
This table is discarded as soon as the query has finished executing.
From there, you can join on your temp tables:
-- Join the two temp-table snapshots on OppId, keeping every #snapshot1 row.
SELECT
    s1.[Id]               AS IdDM1,
    s1.[OppId],
    s1.[LastModifiedDate] AS LastModifiedDateDM1,
    s1.[Stage],
    s2.[Id]               AS IdDM2,
    s2.[LastModifiedDate] AS LastModifiedDateDM2,
    s2.[State]
FROM #snapshot1 AS s1
LEFT OUTER JOIN #snapshot2 AS s2
    ON s1.OppId = s2.OppId
Or your table variables:
From there, you can join on your temp tables:
-- Same join, reading from the table variables (@ prefix) rather than the temp tables.
-- Table variables are only visible inside the batch that declares them, so this
-- statement has to run in the same batch as the DECLARE/INSERT statements.
SELECT
    dm1.[Id]               AS IdDM1,
    dm1.[OppId],
    dm1.[LastModifiedDate] AS LastModifiedDateDM1,
    dm1.[Stage],
    dm2.[Id]               AS IdDM2,
    dm2.[LastModifiedDate] AS LastModifiedDateDM2,
    dm2.[State]
FROM @snapshot1 AS dm1
LEFT JOIN @snapshot2 AS dm2
    ON dm1.OppId = dm2.OppId

Remove duplicates by multiple column criteria

I have following table
-- Source table: auto-numbered rows holding a (first, second) value pair.
CREATE TABLE Test
(
    [ID]     INT IDENTITY(1, 1) NOT NULL PRIMARY KEY,
    [FIRST]  VARCHAR(10)        NOT NULL,
    [SECOND] VARCHAR(10)        NOT NULL
)
The table is filled with some duplicate data. The TestTarget table has the same structure and is filled using the following procedural algorithm:
-- Walk Test in id order and copy a row into TestTarget only when neither its
-- first nor its second value is already present in TestTarget.
DECLARE @first VARCHAR(10), @second VARCHAR(10);

DECLARE c CURSOR FAST_FORWARD
FOR
    SELECT first, second FROM Test ORDER BY id;

OPEN c;
FETCH NEXT FROM c INTO @first, @second;

-- @@FETCH_STATUS is 0 while the last FETCH returned a row.
WHILE @@FETCH_STATUS = 0
BEGIN
    IF NOT EXISTS (SELECT 1 FROM TestTarget WHERE first = @first OR second = @second)
        INSERT INTO TestTarget (first, second) VALUES (@first, @second);

    FETCH NEXT FROM c INTO @first, @second;
END

CLOSE c;
DEALLOCATE c;
In brief, before each insert we check whether the target table already contains that 'first' OR 'second' value.
Example:
Source table
ID FIRST SECOND
1 A 2
2 A 1
3 A 3
4 B 2
5 B 1
6 B 3
7 B 2
8 B 4
9 C 2
10 C 3
-- Sample rows, in the order of ids 1..10 shown in the source table listing.
INSERT INTO Test ([first], [second])
VALUES
    ('A', '2'),
    ('A', '1'),
    ('A', '3'),
    ('B', '2'),
    ('B', '1'),
    ('B', '3'),
    ('B', '2'),
    ('B', '4'),
    ('C', '2'),
    ('C', '3')
Target table
ID FIRST SECOND
1 A 2
5 B 1
10 C 3
The real source table has several hundred thousand rows (x*100k), and every 'first' or 'second' value occurs in at least 2 rows.
I'm looking for a set-based solution, if that is possible at all, or at least something faster than this loop, because it takes hours in my real case.
NOTE Classic duplicate removals via partition/join/etc. is not the case here because it will produce different results even with different final number of rows.
-- Inserts every Test row for which no LATER row (higher id) shares its first
-- or its second value.
-- NOTE(review): each row is compared against all of Test, not against the rows
-- already accepted into TestTarget, so this does not reproduce the cursor's
-- result. On the question's sample data it keeps ids 8 and 10, i.e. (B,4) and
-- (C,3), instead of the expected ids 1, 5 and 10.
INSERT INTO TestTarget (first, second)
SELECT first,second
FROM Test t
WHERE NOT EXISTS
(
SELECT 1
FROM Test t2
WHERE t2.id>t.id and (t2.first=t.first or t2.second=t.second)
)
I cannot think of any simple set based solution to your problem, I am afraid, but I would hope that something along the following lines would be much faster than your existing cursor:
-- Stand-ins for the real tables, loaded with the question's sample rows.
DECLARE @test TABLE
    (id INT,
     first VARCHAR(1),
     second VARCHAR(1));
DECLARE @target TABLE
    (id INT,
     first VARCHAR(1),
     second VARCHAR(1));
-- Holds the single row picked in the current loop iteration.
DECLARE @temp TABLE
    (id INT,
     first VARCHAR(1),
     second VARCHAR(1));

INSERT INTO @test (id, first, second)
VALUES (1, 'A', '2'),
       (2, 'A', '1'),
       (3, 'A', '3'),
       (4, 'B', '2'),
       (5, 'B', '1'),
       (6, 'B', '3'),
       (7, 'B', '2'),
       (8, 'B', '4'),
       (9, 'C', '2'),
       (10, 'C', '3');

-- Values of each column that no accepted row has used yet.
DECLARE @firsts TABLE
    (first VARCHAR(1));
DECLARE @seconds TABLE
    (second VARCHAR(1));

INSERT INTO @firsts (first)
SELECT DISTINCT first FROM @test;
INSERT INTO @seconds (second)
SELECT DISTINCT second FROM @test;

-- Each counter tracks its own list; the loop stops when either list is used up.
DECLARE @firstcnt INT = (SELECT COUNT(*) FROM @firsts);
DECLARE @secondcnt INT = (SELECT COUNT(*) FROM @seconds);

WHILE (@firstcnt > 0 AND @secondcnt > 0)
BEGIN
    DELETE FROM @temp;

    -- Lowest-id row whose first AND second values are both still unused.
    INSERT INTO @temp (id, first, second)
    SELECT TOP 1 t.id, t.first, t.second
    FROM @test AS t
    INNER JOIN @firsts AS f ON t.first = f.first
    INNER JOIN @seconds AS s ON t.second = s.second
    ORDER BY t.id;

    -- No qualifying row now means none can qualify later either, because the
    -- unused-value lists only ever shrink.
    IF @@ROWCOUNT = 0 BREAK;

    INSERT INTO @target (id, first, second)
    SELECT id, first, second FROM @temp;

    -- Retire the two values the accepted row has just used.
    DELETE FROM @firsts WHERE first = (SELECT first FROM @temp);
    SET @firstcnt = @firstcnt - 1;
    DELETE FROM @seconds WHERE second = (SELECT second FROM @temp);
    SET @secondcnt = @secondcnt - 1;
END

SELECT id, first, second
FROM @target
ORDER BY id;
This does produce the desired values and I would expect it to be faster because the while loop only needs to run for the total number of unique value pairs, rather than having to step through the entire table.
It also gives 10 C 3 as the last row, which I take to be correct, despite @Gordon's comment. If I understand the question correctly, the ID order takes precedence: that is to say, although 'A' and 'B' have entries with '3' as the second value, those entries have a greater id than another entry with a different second value that can legitimately be inserted.
HTH
using Recursive CTE,
-- Recursive-CTE attempt at the de-duplication.
-- NOTE(review): the anchor uses TOP 1 without ORDER BY and the final ROW_NUMBER
-- is ordered by (SELECT NULL), so which row is kept for each col1 is not
-- deterministic. Confirm the output against the expected target before use.
DECLARE @Target TABLE (col1 VARCHAR(20), col2 INT);
DECLARE @Test TABLE (col1 VARCHAR(20), col2 INT);

INSERT INTO @Test (col1, col2)
VALUES ('A', 2),
       ('A', 1),
       ('A', 3),
       ('B', 1),
       ('B', 2),
       ('B', 3),
       ('B', 2),
       ('B', 4),
       ('C', 2),
       ('C', 3);

;WITH CTE AS
(
    -- rn1 numbers the distinct col1 values in ascending order.
    SELECT col1, col2,
           DENSE_RANK() OVER (ORDER BY col1) AS rn1
    FROM @Test
)
, cte1 AS
(
    SELECT TOP 1 c.col1, c.col2, rn1 FROM cte c WHERE rn1 = 1
    UNION ALL
    -- Step to rows of a later col1 group whose col2 differs from the current row's.
    SELECT c.col1, c.col2, c.rn1
    FROM cte c
    INNER JOIN cte1 c1
        ON c.rn1 > c1.rn1
    WHERE c.col2 != c1.col2
)
INSERT INTO @Target
SELECT col1, col2
FROM (
    SELECT *, ROW_NUMBER() OVER (PARTITION BY col1 ORDER BY (SELECT NULL)) rn2
    FROM cte1
) t4
WHERE rn2 = 1;

SELECT * FROM @Target;

Multiple SQL MAX when items are not in order

I have some data as below:
-- Sample data: productName runs appear in test2 order as a, b, a.
DECLARE @MyTable AS TABLE
    (productName VARCHAR(13), test1 INT, test2 INT);

INSERT INTO @MyTable
    (productName, test1, test2)
VALUES
    ('a', 1, 1),
    ('a', 2, 2),
    ('a', 3, 3),
    ('b', 1, 4),
    ('b', 2, 5),
    ('b', 3, 6),
    ('a', 1, 7),
    ('a', 4, 8),
    ('a', 5, 9)
;

-- Plain GROUP BY merges both 'a' runs into a single group.
SELECT productName, MAX(test1)
FROM @MyTable
GROUP BY productName
a MAX query on test1 column gives
a,5
b,3
but I need to have result as
a,3
b,3
a,5
when I have order by test2
You can solve this by using a trick with row_numbers, so that you assign 2 different row numbers, one for the whole data and one that is partitioned by productname. If you compare the difference between these numbers, you can figure out when product name has changed, and use that to determine the max values for each group.
-- Gaps-and-islands: within one unbroken run of the same productName, the overall
-- row number and the per-product row number grow together, so their difference
-- (grp) is constant for the run and changes when the product name changes.
SELECT
    islands.productName,
    MAX(islands.test1) AS max_test1
FROM (
    SELECT
        productName,
        test1,
        test2,
        ROW_NUMBER() OVER (ORDER BY test2 ASC)
          - ROW_NUMBER() OVER (PARTITION BY productName ORDER BY test2 ASC) AS grp
    FROM @MyTable
) AS islands
GROUP BY islands.productName, islands.grp
-- Return the runs in the order they occur along test2.
ORDER BY MIN(islands.test2);
You can test this in SQL Fiddle
If the test2 column is always a row number without gaps, you can use it directly in place of the first row-number column. If you need the result in a particular order, you will have to add an ORDER BY on something such as the max of test1.
Please check the following SQL Select statement
DECLARE @MyTable AS TABLE (productName VARCHAR(13), test1 INT, test2 INT);

INSERT INTO @MyTable
    (productName, test1, test2)
VALUES
    ('a', 1, 1),
    ('a', 2, 2),
    ('a', 3, 3),
    ('b', 1, 4),
    ('b', 2, 5),
    ('b', 3, 6),
    ('a', 1, 7),
    ('a', 4, 8),
    ('a', 5, 9);

-- Copy with an identity column, so every row carries an explicit sequence number.
DECLARE @MyTableNew AS TABLE (id INT IDENTITY(1, 1), productName VARCHAR(13), test1 INT, test2 INT);

-- The ORDER BY makes the identity values follow test2 instead of an unspecified scan order.
INSERT INTO @MyTableNew (productName, test1, test2)
SELECT productName, test1, test2
FROM @MyTable
ORDER BY test2;

;WITH cte AS (
    -- ischange = 1 on the first row of every run of equal productName values.
    SELECT
        id, productName, test1, test2,
        CASE WHEN LAG(productName, 1, '') OVER (ORDER BY id) = productName THEN 0 ELSE 1 END AS ischange
    FROM @MyTableNew
), cte2 AS (
    -- The running total of ischange numbers the runs 1, 2, 3, ...
    SELECT
        t.*,
        SUM(t.ischange) OVER (ORDER BY t.id ROWS UNBOUNDED PRECEDING) AS grp
    FROM cte AS t
)
SELECT DISTINCT
    grp,
    productName,
    MAX(test1) OVER (PARTITION BY grp) AS max_test1
FROM cte2
ORDER BY grp;
This is implemented according to the following SQL Server Lag() function tutorial
The Lag() function is used to identify and order the groups in table data
Please try this query
DECLARE @MyTable AS TABLE
    (productName VARCHAR(13), test1 INT, test2 INT);

INSERT INTO @MyTable
    (productName, test1, test2)
VALUES
    ('a', 1, 1),
    ('a', 2, 2),
    ('a', 3, 3),
    ('b', 1, 4),
    ('b', 2, 5),
    ('b', 3, 6),
    ('a', 1, 7),
    ('a', 4, 8),
    ('a', 5, 9)
;

-- NOTE(review): the split on test1 = test2 separates the first 'a' run only
-- because, in this sample, exactly the first three rows have test1 equal to
-- test2. It is tied to this data and is not a general solution.
SELECT productName, MAX(test1)
FROM @MyTable
WHERE test1 = test2
GROUP BY productName
UNION ALL
SELECT productName, MAX(test1)
FROM @MyTable
WHERE test1 != test2
GROUP BY productName

Finding duplicate members in a table

I have come across a problem in writing a query to find duplicate members in a table. I have tried to simplify the problem with a sample table and data.
-- Attribute/value layout: each S_ID is described by one row per PARAM.
CREATE TABLE MYTABLE
(
    S_ID  VARCHAR2(10),
    PARAM VARCHAR2(10),
    VALUE VARCHAR2(10)
);

-- Three members with three attributes each, loaded with one multi-table insert.
INSERT ALL
    INTO MYTABLE (S_ID, PARAM, VALUE) VALUES ('1', 'NAME', 'A')
    INTO MYTABLE (S_ID, PARAM, VALUE) VALUES ('1', 'AGE', '15')
    INTO MYTABLE (S_ID, PARAM, VALUE) VALUES ('1', 'SEX', 'M')
    INTO MYTABLE (S_ID, PARAM, VALUE) VALUES ('2', 'NAME', 'B')
    INTO MYTABLE (S_ID, PARAM, VALUE) VALUES ('2', 'AGE', '16')
    INTO MYTABLE (S_ID, PARAM, VALUE) VALUES ('2', 'SEX', 'M')
    INTO MYTABLE (S_ID, PARAM, VALUE) VALUES ('3', 'NAME', 'A')
    INTO MYTABLE (S_ID, PARAM, VALUE) VALUES ('3', 'AGE', '15')
    INTO MYTABLE (S_ID, PARAM, VALUE) VALUES ('3', 'SEX', 'M')
SELECT * FROM dual;
Here items with S_ID 1 and 3 are same.
Here's an easy way to find duplicate values in one field:
-- Values of one column that occur in more than one row.
SELECT
    fieldName
FROM
    TableName
GROUP BY
    FieldName
HAVING
    COUNT(*) >= 2
You mean this? :
-- Each repeated colname value together with how many times it occurs.
SELECT
    colname,
    COUNT(colname)
FROM TableName
GROUP BY colname
HAVING COUNT(colname) > 1

Multiple columns within the same SQL query

I have been working on this problem for a while now and I can't figure out what to do.
I have a huge SQL query with multiple joins and it gives me hundreds of thousands of records, which is perfect.
I then realized that I had 200 odd some records in another table that needs to be added to the first.
First table:
Field1 Field2 Field3
john smith 23 Boston
Mohammed Ali 45 New York
Stephanie Johnson 15 Los Angeles
New Table
Field1 OtherField1 OtherField2
Mark Khoury Null null
So I really only need to add the Field1 values from table two to the "bottom" of the first. All of the joins I made in the first query should also work for the values found in table two.
A union won't work because I only have one column to add. I would have to copy-paste the same query from the first table to get "Field2" and "Field3" for the values of the second.
What I want it to look like is:
Field1 Field2 Field3
John Smith 23 Boston
Mohammed Ali 45 New York
Stephanie Johnson 15 Los Angeles
Mark Khoury 65 Houston
How can I go about doing this? I don't want to "JOIN" the tables, I want to unite them but only with one column.
Here is an example of what I mean:
Table one was created by doing something like the following:
-- Base query that produces "table one": some_table enriched by two lookups.
-- The first table is aliased "base", because TABLE is a reserved word and
-- cannot be used as an unquoted alias.
SELECT
    base.value1   AS Field1,
    table2.value2 AS Field2,
    table3.value3 AS Field3
FROM some_table AS base
LEFT OUTER JOIN some_other_table AS table2 ON base.field = table2.field5
LEFT OUTER JOIN a_third_table AS table3 ON table2.field2 = table3.field4
but now I have newTable with more Field1 values and I need to add those values to the first table.
I tried this:
-- The trailing ", newTable" on the last line is an implicit cross join: every
-- row produced by the LEFT JOINs is paired with every row of newTable, which is
-- what multiplies the result size.
-- COALESCE only falls back to NewTable.value on rows where table.value1 is NULL.
SELECT COALESCE(table.value1, NewTable.value) as Field1, table2.value2 as Field2,
table3.value3 as Field3
FROM some_table as table
LEFT OUTER JOIN some_other_table as Table2 ON table.field = table2.field5
LEFT OUTER JOIN a_third_table as table3 ON table2.field2 = table3.field4, newTable
but this is giving me an exponential amount of results, where it should be giving me a few hundred more.
You could add a union with the first column and set the other columns as '' something like,
-- Append table2's Field1 values under table1's rows. The second branch pads
-- Field2 with an empty string so both branches return the same number of columns.
SELECT
    Field1,
    Field2
FROM table1

UNION

SELECT
    Field1,
    '' AS Field2
FROM table2
Would that get you what you needed?
Taking your example, would the below get you what you need?
-- Union the Field1 values of both sources first, then run the lookups once over
-- the combined set.
-- The derived table exposes a single column, Field1, so the outer query has to
-- reference that name; it is aliased "combined" because TABLE is a reserved word.
-- NOTE(review): the first lookup matches on Field1, whereas the question's own
-- query joined on some_table.field. If that column is the real join key, add it
-- to both UNION branches and join on it instead.
SELECT
    combined.Field1 AS Field1,
    table2.value2   AS Field2,
    table3.value3   AS Field3
FROM (
    SELECT t.value1 AS Field1 FROM some_table AS t
    UNION
    SELECT t2.value1 AS Field1 FROM some_table2 AS t2
) AS combined
LEFT OUTER JOIN some_other_table AS table2 ON combined.Field1 = table2.field5
LEFT OUTER JOIN a_third_table AS table3 ON table2.field2 = table3.field4
Having the same idea as @rhollyer, here is a script that may produce what you're asking for.
-- Self-contained demo: people from two sources are unioned first, then the
-- detail lookups are applied once to the combined list.
-- Each table variable is declared exactly once; declaring the same name twice
-- in one batch is an error.
DECLARE @Person TABLE
(
    PersonID int,
    DisplayName nvarchar(50)
);
DECLARE @PersonDetail TABLE
(
    PersonID int,
    Age int
);
DECLARE @PersonGeoInfo TABLE
(
    PersonID int,
    City nvarchar(200)
);
-- The "new table": only its id and name matter to the final query.
DECLARE @LostPersons TABLE
(
    LostPersonID int,
    DisplayName nvarchar(50),
    Irrelevant1 decimal,
    Irrelevant2 bit
);

INSERT INTO @Person (PersonID, DisplayName)
VALUES
    (1, 'John Smith'),
    (2, 'Mohammed Ali'),
    (3, 'Stephanie Johnson');

INSERT INTO @PersonDetail (PersonID, Age)
VALUES
    (1, 23),
    (2, 45),
    (3, 15),
    (4, 65);

INSERT INTO @PersonGeoInfo (PersonID, City)
VALUES
    (1, 'Boston'),
    (2, 'New York'),
    (3, 'Los Angeles'),
    (4, 'Houston');

INSERT INTO @LostPersons (LostPersonID, DisplayName, Irrelevant1, Irrelevant2)
VALUES
    (4, 'Mark Khoury', 9.5, 1);

SELECT
    P.DisplayName AS Field1,
    PD.Age        AS Field2,
    PGI.City      AS Field3
FROM (
    SELECT P.PersonID AS PersonID, P.DisplayName AS DisplayName FROM @Person P
    UNION
    SELECT LP.LostPersonID AS PersonID, LP.DisplayName AS DisplayName FROM @LostPersons LP
) AS P
LEFT OUTER JOIN @PersonDetail PD ON P.PersonID = PD.PersonID
-- Joined on the person id itself, so a missing @PersonDetail row does not also
-- hide the city.
LEFT OUTER JOIN @PersonGeoInfo PGI ON P.PersonID = PGI.PersonID