SQL - Finding Duplicate Records based on certain criteria

SQL - Finding Duplicate Records based certain criteria - sql

I have these records in the table - employee_projects
id
employee_id
project_id
status
1
emp1
proj1
VERIFIED
2
emp2
proj2
REJECTED
3
emp1
proj1
VERIFIED
4
emp1
proj3
REJECTED
5
emp2
proj2
REQUIRED
6
emp3
proj4
SUBMITTED
7
emp4
proj5
VERIFIED
8
emp4
proj6
VERIFIED
9
emp3
proj4
REQUIRED
Here are the criteria for determining duplicates:
Same employee ID, same project ID under the same status (Example: rows 1 and 3 are duplicates)
Same employee ID, same project ID but in different status (Example: rows 6 and 9 are duplicates).
An exception to duplication criteria#2 is if one project is REQUIRED and the same project is also REJECTED under the same employee, this is NOT considered a duplicate. For example, rows 2 and 5 are NOT duplicates.
I have a query for the first criterion:
select
emp_id,
proj_id,
status,
COUNT(*)
from
employee_projects
group by
emp_id,
proj_id,
status
having
COUNT(*) > 1
What I'm struggling to construct is the SQL for the second criterion.

Maybe a self-join can help you.
-- Sample data standing in for employee_projects.
WITH t (employee_id, project_id, status) AS (
    SELECT 'emp1', 'proj1', 'VERIFIED'
    UNION ALL SELECT 'emp2', 'proj2', 'REJECTED'
    UNION ALL SELECT 'emp1', 'proj1', 'VERIFIED'
    UNION ALL SELECT 'emp1', 'proj3', 'REJECTED'
    UNION ALL SELECT 'emp2', 'proj2', 'REQUIRED'
    UNION ALL SELECT 'emp3', 'proj4', 'SUBMITTED'
    UNION ALL SELECT 'emp4', 'proj5', 'VERIFIED'
    UNION ALL SELECT 'emp4', 'proj6', 'VERIFIED'
    UNION ALL SELECT 'emp3', 'proj4', 'REQUIRED'
)
-- Criterion 1: same employee, same project, same status more than once.
SELECT
    t.employee_id,
    t.project_id,
    t.status,
    '' AS other_status,
    'criteria#1' AS criteria
FROM t
GROUP BY
    t.employee_id,
    t.project_id,
    t.status
HAVING COUNT(*) > 1
UNION ALL
-- Criterion 2: same employee and project under two different statuses,
-- except the REJECTED/REQUIRED combination, which is not a duplicate.
-- DISTINCT collapses the repeats produced when a status occurs several times.
SELECT DISTINCT
    t.employee_id,
    t.project_id,
    t.status,
    a.status AS other_status,
    'criteria#2' AS criteria
FROM t
INNER JOIN t AS a
    ON a.employee_id = t.employee_id
    AND a.project_id = t.project_id
    -- Strict ordering keeps each pair of statuses once instead of in both directions.
    AND a.status > t.status
-- With t.status < a.status the excluded pair can only appear in this order.
WHERE NOT (t.status = 'REJECTED' AND a.status = 'REQUIRED')

Try the following:
select T.emp_id, T.proj_id, T.status, D.dup_cnt
from employee_projects T join
(
select emp_id, proj_id, count(*) as dup_cnt
from employee_projects
group by emp_id, proj_id
having count(*) > 1 and
count(distinct case when status in ('REQUIRED', 'REJECTED') then status end) < 2
) D
on T.emp_id = D.emp_id and T.proj_id = D.proj_id
order by T.emp_id, T.proj_id
If you want to consider an employee with statuses ('REQUIRED', 'REJECTED', any other statuses) as duplicate, modify the having clause as the following:
-- Lists every row of each (emp_id, proj_id) group that contains duplicates,
-- together with the number of rows in that group.
SELECT
    T.emp_id,
    T.proj_id,
    T.status,
    D.dup_cnt
FROM employee_projects T
INNER JOIN (
    SELECT
        emp_id,
        proj_id,
        COUNT(*) AS dup_cnt
    FROM employee_projects
    GROUP BY emp_id, proj_id
    HAVING
        -- Some status occurs more than once (criterion 1), even inside a
        -- REQUIRED/REJECTED group.
        COUNT(*) > COUNT(DISTINCT status)
        -- Three or more different statuses always contain a duplicate pair.
        OR COUNT(DISTINCT status) > 2
        -- Exactly two different statuses: a duplicate unless they are the
        -- exempt REQUIRED + REJECTED combination.
        OR (
            COUNT(DISTINCT status) = 2
            AND COUNT(DISTINCT CASE WHEN status IN ('REQUIRED', 'REJECTED') THEN status END) < 2
        )
) D
    ON T.emp_id = D.emp_id
    AND T.proj_id = D.proj_id
ORDER BY T.emp_id, T.proj_id
See a demo.

Related

Avoid Unions to get TOP count

Here are two tables:
LocationId Address City State Zip
1 2100, 1st St Austin TX 76819
2 2200, 2nd St Austin TX 76829
3 2300, 3rd St Austin TX 76839
4 2400, 4th St Austin TX 76849
5 2500, 5th St Austin TX 76859
6 2600, 6th St Austin TX 76869
TripId PassengerId FromLocationId ToLocationId
1 746896 1 2
2 746896 2 1
3 234456 1 3
4 234456 3 1
5 234456 1 4
6 234456 4 1
7 234456 1 6
8 234456 6 1
9 746896 1 2
10 746896 2 1
11 746896 1 2
12 746896 2 1
I want TOP 5 locations which each passenger has traveled to (does not matter if its from or to location). I can get it using a UNION, but was wondering if there was a better way to do this.
My Solution:
select top 5 *
from
(select count(l.LocationId) as cnt, l.LocationId, l.Address1, l.Address2, l.City, St.State , l.Zip
from
Trip t
join LOCATION l on t.FromLocationId = l.LocationId
where t.PassengerId = 746896
group by count(l.LocationId) as cnt, l.LocationId, l.Address1, l.Address2, l.City, St.State , l.Zip
UNION
select count(l.LocationId) as cnt, l.LocationId, l.Address1, l.Address2, l.City, St.State , l.Zip
from
Trip t
join LOCATION l on t.ToLocationId = l.LocationId
where t.PassengerId = 746896
group by count(l.LocationId) as cnt, l.LocationId, l.Address1, l.Address2, l.City, St.State , l.Zip
) as tbl
order by cnt desc

This will give you the top 5 locations overall (across all passengers).
-- Top 5 most-visited locations across all trips; a location is counted once
-- for every trip that starts there and once for every trip that ends there.
SELECT TOP 5
    tmp.fromlocationid AS locationid,
    COUNT(tmp.fromlocationid) AS Times
FROM (
    SELECT fromlocationid
    FROM trip
    UNION ALL
    SELECT tolocationid
    FROM trip
) tmp
GROUP BY tmp.fromlocationid
-- Without ORDER BY, TOP 5 returns five arbitrary groups rather than the
-- five highest counts; locationid breaks ties deterministically.
ORDER BY Times DESC, locationid
Method 1: This will give you the top 5 locations of each passenger.
-- Top 5 most-visited locations per passenger, counting a location whether the
-- trip started or ended there.
WITH cte AS (
    SELECT
        passengerid,
        locationid,
        COUNT(locationid) AS Times,
        -- Rank by visit count; ordering by the partition key (passengerid)
        -- would number the rows arbitrarily and keep five random locations.
        ROW_NUMBER() OVER (
            PARTITION BY passengerid
            ORDER BY COUNT(locationid) DESC, locationid
        ) AS RowNum
    FROM (
        SELECT tripid, passengerid, fromlocationid AS locationid
        FROM trip
        UNION ALL
        SELECT tripid, passengerid, tolocationid AS locationid
        FROM trip
    ) tmp
    GROUP BY passengerid, locationid
)
SELECT
    passengerid,
    locationid,
    Times,
    RowNum
FROM cte
WHERE RowNum <= 5
ORDER BY passengerid, Times DESC
Method 2: Same result without Union Operator (Top 5 location of each passenger)
-- Top 5 most-visited locations per passenger, using UNPIVOT instead of
-- UNION ALL to fold the from/to columns into a single locationid column.
WITH cte AS (
    SELECT
        passengerid,
        locationid,
        COUNT(locationid) AS Times,
        -- Rank by visit count; ordering by the partition key (passengerid)
        -- would number the rows arbitrarily and keep five random locations.
        ROW_NUMBER() OVER (
            PARTITION BY passengerid
            ORDER BY COUNT(locationid) DESC, locationid
        ) AS RowNum
    FROM trip
    UNPIVOT (
        locationid
        FOR subject IN (fromlocationid, tolocationid)
    ) u
    GROUP BY passengerid, locationid
)
SELECT
    passengerid,
    locationid,
    Times,
    RowNum
FROM cte
WHERE RowNum <= 5
ORDER BY passengerid, Times DESC
If you also want to get the location details, you can simply join the location table.
SELECT cte.* , location.*
FROM cte
INNER JOIN location ON location.locationid = cte.locationid
WHERE rownum <= 5
ORDER BY passengerid, times DESC
Reference
- https://stackoverflow.com/a/19056083/6327676

You'll need to replace the SELECT *'s with the columns you need; however, something like this should work (note that it only counts visits through FromLocationId):
WITH Visits AS (
SELECT *,
COUNT(*) OVER (PARTITION BY t.PassengerID, L.LocationID) AS Visits
FROM Trip T
JOIN [Location] L ON T.FromLocationId = L.LocationId),
Rankings AS (
SELECT *,
DENSE_RANK() OVER (PARTITION BY V.PassengerID ORDER BY Visits DESC) AS Ranking
FROM Visits V)
SELECT *
FROM Rankings
WHERE Ranking <= 5;

Further simplified solution
select top 3 * from
(
Select distinct count(locationId) as cnt, locationId from trip
unpivot
(
locationId
for direction in (fromLocationId, toLocationId)
)u
where passengerId IN (746896, 234456)
group by direction, locationId
)as tbl2
order by cnt desc;
Solution combining columns
The main issue for me is avoiding union to combine the two columns.
The UNPIVOT command can do this.
select top 3 * from (
select count(locationId) cnt, locationId
from
(
Select valu as locationId, passengerId from trip
unpivot
(
valu
for loc in (fromLocationId, toLocationId)
)u
)united
where passengerId IN (746896, 234456)
group by locationId
) as tbl
order by cnt desc;
http://sqlfiddle.com/#!18/cec8b/136
If you want to get the counts by direction:
select top 3 * from (
select count(locationId) cnt, locationId, direction
from
(
Select valu as locationId, direction, passengerId from trip
unpivot
(
valu
for direction in (fromLocationId, toLocationId)
)u
)united
where passengerId IN (746896, 234456)
group by locationId, direction
) as tbl
order by cnt desc;
http://sqlfiddle.com/#!18/cec8b/139
Same Results as you ( minus some minor descriptions )
select top 3 * from
(
select distinct * from (
select count(locationId) cnt, locationId
from
(
Select valu as locationId, direction, passengerId from trip
unpivot
(
valu
for direction in (fromLocationId, toLocationId)
)u
)united
where passengerId IN (746896, 234456)
group by locationId, direction
) as tbl
)as tbl2
order by cnt desc;

You can do this without union all:
select top (5) t.passengerid, v.locationid, count(*)
from trip t cross apply
(values (fromlocationid), (tolocationid)) v(locationid) join
location l
on v.locationid = l.locationid
where t.PassengerId = 746896
group by t.passengerid, v.locationid
order by count(*) desc;
If you want an answer for all passengers, it would be a similar idea, using row_number(), but your query suggests you want the answer only for one customer at a time.
You can include additional fields from location as well.
Here is a SQL Fiddle.

Join Tables and Return data in single row different columns with count of join

We have 3 tables in Oracle 11g. We need to left join them and return the data in a single row, in different columns, together with the count of joined rows. Is there any way we can achieve this?
Example:
Table1: (Employee_Data)
Table2: (Employee_Address)
Table3: (Employee_Role)
Expected Result:
Mack has 2 addresses and 2 roles so Emp_Addr_Count is 2, Emp_Role_Count is 2 and the related data is in same row different column.
Kindly note that EMP_ID is unique in Employee_Data table and Employee_Address and Employee_Role could be multiple or zero for a Employee.
Thanks in Advance.

Try this:
-- One row per employee with up to two addresses and two roles pivoted into
-- columns, plus the number of addresses and roles found for that employee.
SELECT
    E.Emp_Id,
    E.Emp_Name,
    E.Emp_Age,
    -- RN numbers an employee's addresses 1..n, so the highest RN is the count.
    NVL(MAX(EA.RN), 0) AS Addr_Count,
    NVL(MAX(CASE WHEN EA.RN = 1 THEN EA.Emp_Address END), ' ') AS Emp_Address_1,
    NVL(MAX(CASE WHEN EA.RN = 1 THEN EA.Emp_City END), ' ') AS Emp_City_1,
    NVL(MAX(CASE WHEN EA.RN = 2 THEN EA.Emp_Address END), ' ') AS Emp_Address_2,
    NVL(MAX(CASE WHEN EA.RN = 2 THEN EA.Emp_City END), ' ') AS Emp_City_2,
    -- RN1 numbers an employee's roles 1..n, so the highest RN1 is the count.
    NVL(MAX(ER.RN1), 0) AS Role_Count,
    NVL(MAX(CASE WHEN ER.RN1 = 1 THEN ER.Emp_task END), ' ') AS Emp_task_1,
    NVL(MAX(CASE WHEN ER.RN1 = 2 THEN ER.Emp_task END), ' ') AS Emp_task_2
FROM Employee_Data E
-- LEFT JOIN rather than an inner join: an employee may have zero addresses
-- and must still be returned (with Addr_Count = 0).
LEFT JOIN (
    SELECT
        Emp_Id,
        ROW_NUMBER() OVER (PARTITION BY Emp_Id ORDER BY Emp_City DESC) AS RN,
        Emp_City,
        Emp_Address
    FROM Employee_Address
) EA
    ON EA.Emp_Id = E.Emp_Id
LEFT JOIN (
    SELECT
        Emp_Id,
        ROW_NUMBER() OVER (PARTITION BY Emp_Id ORDER BY Emp_Task) AS RN1,
        Emp_task
    FROM Employee_Role
) ER
    ON ER.Emp_Id = E.Emp_Id
GROUP BY E.Emp_Id, E.Emp_Name, E.Emp_Age
Output:
EMP_ID EMP_NAME EMP_AGE ADDR_COUNT EMP_ADDRESS_1 EMP_CITY_1 EMP_ADDRESS_2 EMP_CITY_2 ROLE_COUNT EMP_TASK_1 EMP_TASK_2
1 MACK 45 2 HOME PARADISE MUM TINDER ONCLAVE DEL 2 Manage Task Resource Manage
2 JACK 30 1 BLUE PLAZA MUM 1 Code
3 ANGEL 27 1 HOME PARADISE MUM 0

You can join them as in the following statement :
-- One row per employee with the first and last address (alphabetically), the
-- city belonging to each of them, and the first and last task.
WITH t AS (
    SELECT
        d.emp_id,
        d.emp_name,
        d.emp_age,
        a.emp_address,
        a.emp_city,
        r.emp_task
    FROM employee_data d
    -- LEFT JOINs keep employees that have no address and/or no role; a FULL
    -- OUTER JOIN would also add role rows that belong to no employee.
    LEFT JOIN employee_address a ON a.emp_id = d.emp_id
    LEFT JOIN employee_role r ON r.emp_id = d.emp_id
)
SELECT
    emp_id,
    emp_name,
    emp_age,
    COUNT(DISTINCT emp_address) AS emp_addr_count,
    MIN(emp_address) AS emp_address_1,
    -- Take the city from the same row as emp_address_1 instead of an
    -- independent MIN/MAX over cities, which can pair the wrong city.
    MIN(emp_city) KEEP (DENSE_RANK FIRST ORDER BY emp_address) AS emp_city_1,
    CASE
        WHEN MIN(emp_address) <> MAX(emp_address) THEN MAX(emp_address)
    END AS emp_address_2,
    -- City of the same row as emp_address_2; NULL when there is no second address.
    CASE
        WHEN MIN(emp_address) <> MAX(emp_address)
            THEN MAX(emp_city) KEEP (DENSE_RANK LAST ORDER BY emp_address NULLS FIRST)
    END AS emp_city_2,
    COUNT(DISTINCT emp_task) AS emp_role_count,
    MIN(emp_task) AS emp_task_1,
    CASE
        WHEN MIN(emp_task) <> MAX(emp_task) THEN MAX(emp_task)
    END AS emp_task_2
FROM t
GROUP BY emp_id, emp_name, emp_age
ORDER BY emp_id;
SQL Fiddle Demo

Oracle SQL query count distinct branches

What I want to do is count, per branch, the number of employees that have a manager and the number that have no manager. The table is like this:
Emp Branch Mgr
EmpA Branch1 Mgr1
EmpB Branch2 Mgr2
EmpC Branch1 Mgr2
EmpD Branch1
EmpE Branch2 Mgr2
EmpF Branch1 Mgr2
And the output that I wanted to get is like this:
Branch HasMgr HasNoMgr
Branch1 3 1
Branch2 2 0
already tried this code but the result is wrong
SELECT branches,
(SELECT COUNT(*) FROM tbl WHERE mgr IS NULL),
(SELECT COUNT(*) FROM tbl WHERE mgr IS NOT NULL )
FROM tbl GROUP BY branches

Use conditional aggregation to count the employees with and without a manager in each branch. Hope this helps. Thanks.
SELECT branch,
SUM(case when Mgr is not null then 1 else 0 end) hasmgr,
SUM(case when Mgr is not null then 0 else 1 end) hasnomgr
FROM tbl
GROUP by branch;

-- Sample rows from the question. The header line (Emp / Branch / Mgr) is not
-- data, so it is not loaded; otherwise it shows up as a bogus 'Branch' group.
-- FROM dual is added because the question targets Oracle, where a SELECT
-- without FROM is rejected (before 23c).
WITH dat AS (
    SELECT 'EmpA' AS emp, 'Branch1' AS branch, 'Mgr1' AS manager FROM dual UNION ALL
    SELECT 'EmpB', 'Branch2', 'Mgr2' FROM dual UNION ALL
    SELECT 'EmpC', 'Branch1', 'Mgr2' FROM dual UNION ALL
    SELECT 'EmpD', 'Branch1', NULL FROM dual UNION ALL
    SELECT 'EmpE', 'Branch2', 'Mgr2' FROM dual UNION ALL
    SELECT 'EmpF', 'Branch1', 'Mgr2' FROM dual
)
SELECT
    branch,
    -- COUNT(column) skips NULLs, so this counts employees that have a manager.
    COUNT(manager) AS hasMgr,
    SUM(CASE WHEN manager IS NULL THEN 1 ELSE 0 END) AS hasNoMgr
FROM dat
GROUP BY branch

select branch,
sum(decode(mgr,null,0,1)) as "hasmgr",
sum(decode(mgr,null,1,0)) as "hasnomgr"
FROM TAB1
GROUP BY BRANCH

Tuning oracle subquery in select statement

I have a master table and a reference table as below.
WITH MAS as (
SELECT 10 as CUSTOMER_ID, 1 PROCESS_ID, 44 PROCESS_TYPE, 200 as AMOUNT FROM DUAL UNION ALL
SELECT 10 as CUSTOMER_ID, 1 PROCESS_ID, 44 PROCESS_TYPE, 250 as AMOUNT FROM DUAL UNION ALL
SELECT 10 as CUSTOMER_ID, 2 PROCESS_ID, 45 PROCESS_TYPE, 300 as AMOUNT FROM DUAL UNION ALL
SELECT 10 as CUSTOMER_ID, 2 PROCESS_ID, 45 PROCESS_TYPE, 350 as AMOUNT FROM DUAL
), REFTAB as (
SELECT 44 PROCESS_TYPE, 'A' GROUP_ID FROM DUAL UNION ALL
SELECT 44 PROCESS_TYPE, 'B' GROUP_ID FROM DUAL UNION ALL
SELECT 45 PROCESS_TYPE, 'C' GROUP_ID FROM DUAL UNION ALL
SELECT 45 PROCESS_TYPE, 'D' GROUP_ID FROM DUAL
) SELECT ...
My first select statement which works correctly is this one:
SELECT CUSTOMER_ID,
SUM(AMOUNT) as AMOUNT1,
SUM(CASE WHEN PROCESS_TYPE IN (SELECT PROCESS_TYPE FROM REFTAB WHERE GROUP_ID = 'A')
THEN AMOUNT ELSE NULL END) as AMOUNT2,
COUNT(CASE WHEN PROCESS_TYPE IN (SELECT PROCESS_TYPE FROM REFTAB WHERE GROUP_ID = 'D')
THEN 1 ELSE NULL END) as COUNT1
FROM MAS
GROUP BY CUSTOMER_ID
However, to address a performance issue, I changed it to this select statement:
SELECT CUSTOMER_ID,
SUM(AMOUNT) as AMOUNT1,
SUM(CASE WHEN GROUP_ID = 'A' THEN AMOUNT ELSE NULL END) as AMOUNT2,
COUNT(CASE WHEN GROUP_ID = 'D' THEN 1 ELSE NULL END) as COUNT1
FROM MAS A
LEFT JOIN REFTAB B ON A.PROCESS_TYPE = B.PROCESS_TYPE
GROUP BY CUSTOMER_ID
For the AMOUNT2 and COUNT1 columns, the values stay the same. But for AMOUNT1, the value is multiplied because of the join with the reference table.
I know I can add 1 more left join with an additional join condition on GROUP_ID. But that won't be any different from using a subquery.
Any idea how to make the query work with just 1 left join while not multiplying the AMOUNT1 value?

I know I can add 1 more left join with an additional GROUP_ID clause, but it won't be different from a subquery.
You'd be surprised. Having 2 left joins instead of subqueries in the SELECT gives the optimizer more ways of optimizing the query. I would still try it:
select m.customer_id,
sum(m.amount) as amount1,
sum(case when grpA.group_id is not null then m.amount end) as amount2,
count(grpD.group_id) as count1
from mas m
left join reftab grpA
on grpA.process_type = m.process_type
and grpA.group_id = 'A'
left join reftab grpD
on grpD.process_type = m.process_type
and grpD.group_id = 'D'
group by m.customer_id
You can also try this query, which uses the SUM() analytic function to calculate the amount1 value before the join to avoid the duplicate value problem:
select m.customer_id,
m.customer_sum as amount1,
sum(case when r.group_id = 'A' then m.amount end) as amount2,
count(case when r.group_id = 'D' then 'X' end) as count1
from (select customer_id,
process_type,
amount,
sum(amount) over (partition by customer_id) as customer_sum
from mas) m
left join reftab r
on r.process_type = m.process_type
group by m.customer_id,
m.customer_sum
You can test both options, and see which one performs better.

Starting off with your original query, simply replacing your IN queries with EXISTS statements should provide a significant boost. Also, be wary of summing NULLs: perhaps the ELSE of the SUM should be 0. (The ELSE of the COUNT must stay NULL, because COUNT counts every non-NULL value, including 0.)
SELECT CUSTOMER_ID,
SUM(AMOUNT) as AMOUNT1,
SUM(CASE WHEN EXISTS(SELECT 1 FROM REFTAB WHERE REFTAB.GROUP_ID = 'A' AND REFTAB.PROCESS_TYPE = MAS.PROCESS_TYPE)
THEN AMOUNT ELSE NULL END) as AMOUNT2,
COUNT(CASE WHEN EXISTS(SELECT 1 FROM REFTAB WHERE REFTAB.GROUP_ID = 'D' AND REFTAB.PROCESS_TYPE = MAS.PROCESS_TYPE)
THEN 1 ELSE NULL END) as COUNT1
FROM MAS
GROUP BY CUSTOMER_ID

The normal way is to aggregate the values before the join. You can also use conditional aggregation, if the rest of the query is correct:
-- Single LEFT JOIN version: the join repeats each MAS row once per matching
-- REFTAB row, so AMOUNT1 only adds the amount on the first copy of each row.
SELECT
    A.CUSTOMER_ID,
    -- seqnum IS NULL covers MAS rows with no REFTAB match; the LEFT JOIN keeps
    -- them once, and testing only seqnum = 1 would drop them from AMOUNT1.
    SUM(CASE WHEN B.seqnum = 1 OR B.seqnum IS NULL THEN A.AMOUNT END) AS AMOUNT1,
    SUM(CASE WHEN B.GROUP_ID = 'A' THEN A.AMOUNT END) AS AMOUNT2,
    COUNT(CASE WHEN B.GROUP_ID = 'D' THEN 1 END) AS COUNT1
FROM MAS A
LEFT JOIN (
    SELECT
        R.PROCESS_TYPE,
        R.GROUP_ID,
        -- Numbers the REFTAB rows within each PROCESS_TYPE; which row gets 1
        -- does not matter, only that exactly one does.
        ROW_NUMBER() OVER (PARTITION BY R.PROCESS_TYPE ORDER BY R.GROUP_ID) AS seqnum
    FROM REFTAB R
) B
    ON A.PROCESS_TYPE = B.PROCESS_TYPE
GROUP BY A.CUSTOMER_ID;
This ignores the duplicates created by the joins.

Finding duplicate differences between two tables in sql

I try to find duplicate rows between two tables. This code works only if records are not duplicated:
(select [Name], [Age] from PeopleA
except
select [Name], [Age] from PeopleB)
union all
(select [Name], [Age] from PeopleB
except
select [Name], [Age] from PeopleA)
How to find missing, duplicate records. Robert 34 in PersonA table for example below:
PersonA:
Name | Age
-------------
John | 45
Robert | 34
Adam | 26
Robert | 34
PersonB:
Name | Age
-------------
John | 45
Robert | 34
Adam | 26

You can use UNION ALL to concat both tables and Group By with Having clause to find duplicates:
SELECT x.Name, x.Age, Cnt = Count(*)
FROM (
SELECT a.Name, a.Age
FROM PersonA a
UNION ALL
SELECT b.Name, b.Age
FROM PersonB b
) x
GROUP BY x.Name, x.Age
HAVING COUNT(*) > 1
According to your clarification in the comment, you could use following query to find all name-age combinations in PersonA which are different in PersonB:
WITH A AS(
SELECT a.Name, a.Age, cnt = count(*)
FROM PersonA a
GROUP BY a.Name, a.Age
),
B AS(
SELECT b.Name, b.Age, cnt = count(*)
FROM PersonB b
GROUP BY b.Name, b.Age
)
SELECT a.Name, a.Age
FROM A a LEFT OUTER JOIN B b
ON a.Name = b.Name AND a.Age = b.Age
WHERE a.cnt <> ISNULL(b.cnt, 0)
Demo
If you also want to find persons which are in PersonB but not in PersonA you should use a FULL OUTER JOIN as Gordon Linoff has commented:
WITH A AS(
SELECT a.Name, a.Age, cnt = count(*)
FROM PersonA a
GROUP BY a.Name, a.Age
),
B AS(
SELECT b.Name, b.Age, cnt = count(*)
FROM PersonB b
GROUP BY b.Name, b.Age
)
SELECT Name = ISNULL(a.Name, b.Name), Age = ISNULL(a.Age, b.Age)
FROM A a FULL OUTER JOIN B b
ON a.Name = b.Name AND a.Age = b.Age
WHERE ISNULL(a.cnt, 0) <> ISNULL(b.cnt, 0)
Demo

I like Tim's answer but you need to check in both tables if the records are missing. He is only checking if the records are missing in table A. Try this to check if records are missing in either of the tables and how many times.
Select *, 'PersonB' MissingInTable, a.cnt - isnull(b.cnt,0) TimesMissing From
(
Select *, count(1) cnt from PersonA group by Name, Age) A Left join
(Select *, count(1) cnt from PersonB group by Name, Age) B
On a.age=b.age and a.name=b.name
where a.cnt>isnull(b.cnt,0)
Union All
Select *, 'PersonA' MissingInTable, b.cnt - isnull(a.cnt,0) TimesMissing From
(
Select *, count(1) cnt from PersonA group by Name, Age) A Right join
(Select *, count(1) cnt from PersonB group by Name, Age) B
On a.age=b.age and a.name=b.name
where b.cnt>isnull(a.cnt,0)
See demo here : http://sqlfiddle.com/#!6/06020/13

Add another UNION ALL!
Code:
-- Name/age combinations that exist in PeopleA but not in PeopleB.
(SELECT [Name], [Age], 'Missing from B' AS [Type] FROM PeopleA
EXCEPT
SELECT [Name], [Age], 'Missing from B' AS [Type] FROM PeopleB)
UNION ALL
-- Name/age combinations that exist in PeopleB but not in PeopleA.
(SELECT [Name], [Age], 'Missing from A' AS [Type] FROM PeopleB
EXCEPT
SELECT [Name], [Age], 'Missing from A' AS [Type] FROM PeopleA)
UNION ALL
-- Rows present in both tables. The columns must be qualified: both tables
-- expose Name and Age, so unqualified references are ambiguous.
SELECT a.[Name], a.[Age], 'Duplicate' AS [Type]
FROM PeopleA AS a
INNER JOIN PeopleB AS b
    ON a.[Name] = b.[Name]
    AND a.[Age] = b.[Age]

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

SQL - Finding Duplicate Records based certain criteria - sql

Related

Avoid Unions to get TOP count

Join Tables and Return data in single row different columns with count of join

Oracle SQL query count distinct branches

Tuning oracle subquery in select statement

Finding duplicate differences between two tables in sql

Categories

Resources