SQL Duplicates optimization

SQL Duplicates optimization - sql

I have the following query:
Original query:
SELECT
cd1.cust_number_id, cd1.cust_number_id, cd1.First_Name, cd1.Last_Name
FROM #Customer_Data cd1
inner join #Customer_Data cd2 on
cd1.Cd_Id <> cd2.Cd_Id
and cd2.cust_number_id <> cd1.cust_number_id
and cd2.First_Name = cd1.First_Name
and cd2.Last_Name = cd1.Last_Name
inner join #Customer c1 on c1.Cust_id = cd1.cust_number_id
inner join #Customer c2 on c2.cust_id = cd2.cust_number_id
WHERE c1.cust_number <> c2.cust_number
I optimized it as follows, but there is an error in my optimization and I can't find it:
Optimized query:
SELECT cd1.cust_number_id, cd1.cust_number_id, cd1.First_Name,cd1.Last_Name
FROM (
SELECT cdResult.cust_number_id, cdResult.First_Name,cdResult.Last_Name, COUNT(*) OVER (PARTITION BY cdResult.First_Name, cdResult.Last_Name) as cnt_name_bday
FROM #Customer_Data cdResult
WHERE cdResult.First_Name IS NOT NULL
AND cdResult.Last_Name IS NOT NULL) AS cd1
WHERE cd1.cnt_name_bday > 1;
Test data:
DECLARE #Customer_Data TABLE
(
Cd_Id INT,
cust_number_id INT,
First_Name NVARCHAR(30),
Last_Name NVARCHAR(30)
)
INSERT #Customer_Data (Cd_Id,cust_number_id,First_Name,Last_Name)
VALUES (1, 22, N'Alex', N'Bor'),
(2, 22, N'Alex', N'Bor'),
(3, 23, N'Alex', N'Bor'),
(4, 24, N'Tom', N'Cruse'),
(5, 25, N'Tom', N'Cruse')
DECLARE #Customer TABLE
(
Cust_id INT,
Cust_number INT
)
INSERT #Customer (Cust_id, Cust_number)
VALUES (22, 022),
(23, 023),
(24, 024),
(25, 025)
The problem is that the original query returns 6 rows (duplicating the row). And optimized returns just duplicates, how to make the optimized query also duplicated the row?

I would suggest just using window functions:
SELECT CD.cud_customer_id
FROM (SELECT cd.*, COUNT(*) OVER (PARTITION BY cud_name, cud_birthday) as cnt_name_bday FROM dbo.customer_data cd
) cd
WHERE cnt_name_bday > 1;
Your query is finding duplicates for either name or birthday. You want duplicates with both at the same time.

You can use only one exists :
SELECT cd.cud_customer_id
FROM dbo.customer_data AS cd
WHERE EXISTS (SELECT 1
FROM dbo.customer_data AS c
WHERE c.cud_name = cd.cud_name AND c.cud_birthday = cd.cud_birthday AND c.cust_id <> cd.cud_customer_id
);

Related

How to Select rows in a table with all matching conditions in joining table

I have two tables (table A) and (table B) and one table variable (#country) that will build the where condition for finding out the list of employees matching the countries.
TableA(empId, name)
(1,John),(2,Mary),(3,Harry)
TableB(empId, country)
(1,Australia),(1,US),(1,UK),(2,US)
For example, I need to select only those employees from TableA who have resides in both Australia and US. i.e. emp 1 (John). The query should be able to handle more countries in where clause if require. This depends on the number of countries in table variable #country.
I have tried many option including the following query but nothing seems to work.
DECLARE #country TABLE (
[country] [nvarchar](255) NOT NULL
);
insert into #country (country) values('Australia'),('US')
Select E.empID, EC.empID,EC.country from TableA E
INNER JOIN TableB EC on E.empID= EC.empID
Where EC.country = ALL(Select country from #country)
Could you please advise on how to write the best query to achieve this task? Please note that #country can have one or more countries.

Try:
SELECT E.empID
,EC.empID
,EC.country
FROM TableA E
INNER JOIN TableB EC
ON E.empID = EC.empID
WHERE EXISTS (
SELECT 1
FROM TableB EC_US
WHERE EC_US.empID = EC.empID
and EC_US.Country = 'US'
)
AND EXISTS (
SELECT 1
FROM TableB EC_Aus
WHERE EC_Aus.empID = EC.empID
and EC_Aus.Country = 'Australia'
)
Or:
SELECT E.empID
,EC.empID
,EC.country
FROM TableA E
INNER JOIN TableB EC
ON E.empID = EC.empID
WHERE EC.empID IN (
SELECT EC_Sub.empID
FROM TableB EC_Sub
WHERE EC_Sub.Country IN ('Australia','US')
GROUP BY EC_Sub.empID
HAVING COUNT(*) = 2
)

Try Now, added where clause. Change any value of #country and execute the query:
DECLARE #TableA TABLE (empId INT, [Name] VARCHAR(100))
INSERT INTO #TableA VALUES (1, 'John')
INSERT INTO #TableA VALUES (2, 'Mary')
INSERT INTO #TableA VALUES (3, 'Harry')
DECLARE #TableB TABLE (empID INT, country VARCHAR(100))
INSERT INTO #TableB VALUES (1, 'Australia')
INSERT INTO #TableB VALUES (1, 'UK')
INSERT INTO #TableB VALUES (2, 'US')
DECLARE #country TABLE ([country] [nvarchar](255) NOT NULL);
INSERT INTO #country (country) VALUES('Australia'),('US')
SELECT a.* , tb.country
FROM #TableA AS a
INNER JOIN (
SELECT b.empid,
COUNT(*) AS empInMultipleCountry
FROM #TableB b
GROUP BY
empid
) b
ON a.empId = b.empid
INNER JOIN #TableB AS tb
ON tb.empId = a.empId
WHERE empInMultipleCountry > 1
AND EXISTS (SELECT 1 FROM #country AS c WHERE c.country = tb.country)

Select E.empID, EC.empID,EC.country
from TableA E INNER JOIN TableB EC on E.empID= EC.empID
Where EC.country IN ('Australia','US');

SQL Update Or Insert By Comparing Dates

I am trying to do the UPDATE or INSERT, but I am not sure if this is possible without using loop. Here is the example:
Says, I have this SQL below in which I joined two tables: tblCompany and tblOrders.
SELECT CompanyID, CompanyName, c.LastSaleDate, o.SalesOrderID, o.SalesPrice
, DATEADD(m, -6, GETDATE()) AS DateLast6MonthFromToday
FROM dbo.tblCompany c
CROSS APPLY (
SELECT TOP 1 SalesOrderID, SalesPrice
FROM dbo.tblOrders o
WHERE c.CompanyID = o.CompanyID
ORDER BY SalesOrderID DESC
) AS a
WHERE Type = 'End-User'
Sample Result:
CompanyID, SalesOrderID, SalesPrice, LastSalesDate, DateLast6MonthFromToday
101 10001 50 2/01/2016 10/20/2016
102 10002 80 12/01/2016 10/20/2016
103 10003 80 5/01/2016 10/20/2016
What I am trying to do is comparing the LastSalesDate and the DateLast6MonthFromToday. Condition is below:
If the LastSalesDate is lesser (earlier), then do the INSERT INTO tblOrders (CompanyID, Column1, Column2...) VALUES (CompanyIDFromQuery, Column1Value, Column2Value)
Else, do UPDATE tblOrders SET SalesPrice = 1111 WHERE SalesOrderID = a.SalesOrderID
As the above sample result, the query will only update SalesOrderID 10001 and 10003. And For Company 102, NO insert since the LastSaleDate is greater, then just do the UPDATE for the SalesOrderID.
I know it is probably can be done if I create a Cursor to loop through every record and do the comparison then Update or Insert, but I wonder if there is another way perform this without the loop since I have around 20K records.
Sorry for the confusion,

I don't know your tables structure and your data types. Also I know nothing
about duplicates and join ralationships between this 2 tables.
But I want only show how it works on next example:
use [your test db];
go
create table dbo.tblCompany
(
companyid int,
companyname varchar(max),
lastsaledate datetime,
[type] varchar(max)
);
create table dbo.tblOrders
(
CompanyID int,
SalesOrderID int,
SalesPrice float
);
insert into dbo.tblCompany
values
(1, 'Avito', '2016-01-01', 'End-User'),
(2, 'BMW', '2016-05-01', 'End-User'),
(3, 'PornHub', '2017-01-01', 'End-User')
insert into dbo.tblOrders
values
(1, 1, 500),
(1, 2, 700),
(1, 3, 900),
(2, 1, 500),
(2, 2, 700),
(2, 3, 900),
(3, 1, 500),
(3, 2, 700),
(3, 3, 900)
declare #column_1_value int = 5;
declare #column_2_value int = 777;
with cte as (
select
CompanyID,
SalesOrderID,
SalesPrice
from (
select
CompanyID,
SalesOrderID,
SalesPrice,
row_number() over(partition by CompanyID order by SalesOrderId desc) as rn
from
dbo.tblOrders
) t
where rn = 1
)
merge cte as target
using (select * from dbo.tblCompany where [type] = 'End-User') as source
on target.companyid = source.companyid
and source.lastsaledate >= dateadd(month, -6, getdate())
when matched
then update set target.salesprice = 1111
when not matched
then insert (
CompanyID,
SalesOrderID,
SalesPrice
)
values (
source.CompanyId,
#column_1_value,
#column_2_value
);
select * from dbo.tblOrders
If you will give me an information, then I can prepare target and source tables properly.

How to get the ID from the table which is not mention EndDate

How to get the name from the table which is not having EndDate
in the above pic i need to get D and G details from the table ,
( To understand mOre:
A, C,D,G are having end date, and A, C are again started, but D and G is not started, so from the query i need to get the name D and G
the code i used is not works for it
DECLARE #T AS TABLE
(
SubInventoryID int ,
SubInventoryName varchar(20),
RolesName varchar(20),
StartDate date,
EndDate date
)
INSERT INTO #T VALUES
(30,'RIF-Teller','Teller', '2016-12-27', '2017-01-23'),
(30,'RIF-Teller','Teller', '2016-12-08', NULL),
(30,'RIF-Teller','Teller', '2017-01-02', '2017-01-05'),
(31,'RIF-Teller','Teller', '2017-01-05', NULL),
(24,'MHQ-Teller','Teller', '2016-09-20', '2017-01-23'),
(24,'MHQ-Teller','Teller', '2016-08-01', '2017-01-05'),
(24,'MHQ-Teller','Teller', '2017-01-05', NULL)
Query
SELECT UP.SubInventoryID,S.SubInventoryName SubInventoryName,RolesName,UP.StartDate StartDate,
UP.EndDate EndDate , case when UP.EndDate IS null then 'Occupied' else 'Closed' End As Vacancy
FROM [View_Alx_UserPosition] UP
Inner join ALX_Branches B ON B.BranchID= UP.BranchID
Inner join ALX_SubInventories S ON S.SubInventoryID=UP.SubInventoryID WHERE UP.RolesName Like '%Teller%'
union
SELECT distinct(UP.SubInventoryID),S.SubInventoryName SubInventoryName, '' FullName, '' RolesName,NUll StartDate,
NUll EndDate,'Free' as vacancy
FROM [View_Alx_UserPosition] UP
Inner join ALX_Branches B ON B.BranchID= UP.BranchID
Inner join ALX_SubInventories S ON S.SubInventoryID=UP.SubInventoryID
WHERE UP.EndDate IS NOT NULL ANd UP.RolesName Like '%Teller%'
AND NOT EXISTS
(
SELECT 1
FROM [View_Alx_UserPosition] UP1
WHERE UP1.SubInventoryID = UP.SubInventoryID
AND UP1.StartDate >= UP.EndDate
-- AND UP1.EndDate IS NOT NULL
)

Update
Create and populate sample table (Please save us this step in your future questions)
DECLARE #T AS TABLE
(
ID int identity(1,1),
Name char(1),
StartDate date,
EndDate date
)
INSERT INTO #T VALUES
('A', '2016-04-04', '2017-04-03'),
('B', '2016-04-04', NULL),
('C', '2016-04-04', '2017-04-03'),
('D', '2016-04-04', '2017-04-03'),
('E', '2016-04-04', NULL),
('F', '2016-04-04', NULL),
('G', '2016-04-04', '2017-04-03'),
('C', '2017-04-03', NULL),
('A', '2017-04-03', NULL)
The query:
SELECT Name
FROM #T vu1
WHERE EndDate IS NOT NULL
AND NOT EXISTS
(
SELECT 1
FROM #T vu2
WHERE vu2.Name = vu1.Name
AND vu2.StartDate >= vu1.EndDate
)
Results:
Name
D
G
First version
Assuming I understand the question, this should do the trick:
SELECT Name
FROM View_User vu1
WHERE EndDate IS NOT NULL
AND NOT EXISTS
(
SELECT 1
FROM View_User vu2
WHERE vu2.Name = vu1.Name
AND vu2.StartDate >= vu1.EndDate
AND vu2.EndDate IS NOT NULL
)

SQL update one single row table with data from another

I have two tables. The first one with all movements in twelve months and the second one with claims registered in the same period of time. When I run the following query from the first table I've got 10 records. Of course, there are other records with a different number of movements (e.g.: 7, 23, 2 movements):
select t.cod_suc
,t.cod_ramo_comercial
,t.Poliza
,t.Item
,t.id_pv
from temp_portafolio_personal_accidents as t
where t.cod_suc = 2
and t.cod_ramo_comercial = 46
and t.Poliza = 50283
and t.Item = 1
and t.id_pv = 788383;
With the second query, for the second table, I have the following results:
select c.cod_suc
,c.cod_ramo_comercial
,c.[No. Policy]
,c.Item
,c.[ID Incident]
,max(c.id_pv) as id_pv
,count(distinct [No. Incident]) as 'Conteo R12'
from #claims as c
where c.[ID Incident] = 343632
group by c.cod_suc
,c.cod_ramo_comercial
,c.[No. Policy]
,c.Item
,c.[ID Incident];
Now, I need to update the first table but only one record. I'm using the following query, but all records are being updated. When I sum results I have 10 but is just one claim, as the second query shows.
update p
set [No. Siniestros R12] = b.[Conteo R12]
from temp_portafolio_personal_accidents p
left join
(select c.cod_suc
,c.cod_ramo_comercial
,c.[No. Policy]
,c.Item
,c.[ID Incident]
,max(c.id_pv) as id_pv
,count(distinct [No. Incident]) as 'Conteo R12'
from
#claims as c
where c.[ID Incident] = 343632
group by c.cod_suc
,c.cod_ramo_comercial
,c.[No. Policy]
,c.Item
,c.[ID Incident]
) b
on p.id_pv = b.id_pv
and p.cod_suc = b.cod_suc
and p.cod_ramo_comercial = b.cod_ramo_comercial
and p.Poliza = b.[No. Policy]
and p.Item = b.Item
where p.id_pv = 788383;

You can use a CTE with a ROW_NUMBER() function to do this. Simple example:
DECLARE #TABLE AS TABLE (Testing INT, Testing2 VARCHAR(55), Testing3 BIT);
INSERT INTO #TABLE VALUES (1, '1', 1);
INSERT INTO #TABLE VALUES (1, '1', 1);
INSERT INTO #TABLE VALUES (1, '1', 1);
INSERT INTO #TABLE VALUES (1, '1', 1);
INSERT INTO #TABLE VALUES (1, '1', 1);
INSERT INTO #TABLE VALUES (1, '1', 1);
INSERT INTO #TABLE VALUES (1, '1', 1);
INSERT INTO #TABLE VALUES (1, '1', 1);
WITH CTE AS
(
SELECT
ROW_NUMBER() OVER (ORDER BY Testing) AS RowID
,Testing
,Testing2
,Testing3
FROM #TABLE
)
UPDATE CTE
SET Testing = 2, Testing2 = '2', Testing3 = 0
WHERE RowID = 1
;
SELECT * FROM #TABLE
;

SQL return only distinct IDs from LEFT JOIN

I've inherited some fun SQL and am trying to figure out how to how to eliminate rows with duplicate IDs. Our indexes are stored in a somewhat columnar format and then we pivot all the rows into one with the values as different columns.
The below sample returns three rows of unique data, but the IDs are duplicated. I need just two rows with unique IDs (and the other columns that go along with it). I know I'll be losing some data, but I just need one matching row per ID to the query (first, top, oldest, newest, whatever).
I've tried using DISTINCT, GROUP BY, and ROW_NUMBER, but I keep getting the syntax wrong, or using them in the wrong place.
I'm also open to rewriting the query completely in a way that is reusable as I currently have to generate this on the fly (cardtypes and cardindexes are user defined) and would love to be able to create a stored procedure. Thanks in advance!
declare #cardtypes table ([ID] int, [Name] nvarchar(50))
declare #cards table ([ID] int, [CardTypeID] int, [Name] nvarchar(50))
declare #cardindexes table ([ID] int, [CardID] int, [IndexType] int, [StringVal] nvarchar(255), [DateVal] datetime)
INSERT INTO #cardtypes VALUES (1, 'Funny Cards')
INSERT INTO #cardtypes VALUES (2, 'Sad Cards')
INSERT INTO #cards VALUES (1, 1, 'Bunnies')
INSERT INTO #cards VALUES (2, 1, 'Dogs')
INSERT INTO #cards VALUES (3, 1, 'Cat')
INSERT INTO #cards VALUES (4, 1, 'Cat2')
INSERT INTO #cardindexes VALUES (1, 1, 1, 'Bunnies', null)
INSERT INTO #cardindexes VALUES (2, 1, 1, 'playing', null)
INSERT INTO #cardindexes VALUES (3, 1, 2, null, '2014-09-21')
INSERT INTO #cardindexes VALUES (4, 2, 1, 'Dogs', null)
INSERT INTO #cardindexes VALUES (5, 2, 1, 'playing', null)
INSERT INTO #cardindexes VALUES (6, 2, 1, 'poker', null)
INSERT INTO #cardindexes VALUES (7, 2, 2, null, '2014-09-22')
SELECT TOP(100)
[ID] = c.[ID],
[Name] = c.[Name],
[Keyword] = [colKeyword].[StringVal],
[DateAdded] = [colDateAdded].[DateVal]
FROM #cards AS c
LEFT JOIN #cardindexes AS [colKeyword] ON [colKeyword].[CardID] = c.ID AND [colKeyword].[IndexType] = 1
LEFT JOIN #cardindexes AS [colDateAdded] ON [colDateAdded].[CardID] = c.ID AND [colDateAdded].[IndexType] = 2
WHERE [colKeyword].[StringVal] LIKE 'p%' AND c.[CardTypeID] = 1
ORDER BY [DateAdded]
Edit:
While both solutions are valid, I ended up using the MAX() solution from #popovitsj as it was easier to implement. The issue of data coming from multiple rows doesn't really factor in for me as all rows are essentially part of the same record. I will most likely use both solutions depending on my needs.
Here's my updated query (as it didn't quite match the answer):
SELECT TOP(100)
[ID] = c.[ID],
[Name] = MAX(c.[Name]),
[Keyword] = MAX([colKeyword].[StringVal]),
[DateAdded] = MAX([colDateAdded].[DateVal])
FROM #cards AS c
LEFT JOIN #cardindexes AS [colKeyword] ON [colKeyword].[CardID] = c.ID AND [colKeyword].[IndexType] = 1
LEFT JOIN #cardindexes AS [colDateAdded] ON [colDateAdded].[CardID] = c.ID AND [colDateAdded].[IndexType] = 2
WHERE [colKeyword].[StringVal] LIKE 'p%' AND c.[CardTypeID] = 1
GROUP BY c.ID
ORDER BY [DateAdded]

You could use MAX or MIN to 'decide' on what to display for the other columns in the rows that are duplicate.
SELECT ID, MAX(Name), MAX(Keyword), MAX(DateAdded)
(...)
GROUP BY ID;

using row number windowed function along with a CTE will do this pretty well. For example:
;With preResult AS (
SELECT TOP(100)
[ID] = c.[ID],
[Name] = c.[Name],
[Keyword] = [colKeyword].[StringVal],
[DateAdded] = [colDateAdded].[DateVal],
ROW_NUMBER()OVER(PARTITION BY c.ID ORDER BY [colDateAdded].[DateVal]) rn
FROM #cards AS c
LEFT JOIN #cardindexes AS [colKeyword] ON [colKeyword].[CardID] = c.ID AND [colKeyword].[IndexType] = 1
LEFT JOIN #cardindexes AS [colDateAdded] ON [colDateAdded].[CardID] = c.ID AND [colDateAdded].[IndexType] = 2
WHERE [colKeyword].[StringVal] LIKE 'p%' AND c.[CardTypeID] = 1
ORDER BY [DateAdded]
)
SELECT * from preResult WHERE rn = 1

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

SQL Duplicates optimization - sql

You can use only one exists : SELECT cd.cud_customer_id FROM dbo.customer_data AS cd WHERE EXISTS (SELECT 1 FROM dbo.customer_data AS c WHERE c.cud_name = cd.cud_name AND c.cud_birthday = cd.cud_birthday AND c.cust_id <> cd.cud_customer_id );

Related

How to Select rows in a table with all matching conditions in joining table

SQL Update Or Insert By Comparing Dates

How to get the ID from the table which is not mention EndDate

SQL update one single row table with data from another

SQL return only distinct IDs from LEFT JOIN

Categories

Resources