SQL OVER (PARTITION BY) - Handle nulls - sql

I have the following scenario:
Table Employees:
First Name | Last Name | Department | Salary
-----------|-----------|------------|---------
John | Doe | Finance | 20
John | Doe | R&D | 20
John | null | Finance | 20
John | long | Finance | 20
I want one row for each (First Name, Last Name) pair,
unless some row for that first name has a null last name, in which case I want just one row: (First Name, null).
For the above example the result is:
First Name | Last Name | Department | Salary
-----------|-----------|------------|---------
John | null | Finance | 20
but if i didn't have that record then the result should have been:
First Name | Last Name | Department | Salary
-----------|-----------|------------|---------
John | Doe | R&D | 20
John | long | Finance | 20
I guess the answer involves some Partition By-s, but I'm not sure where.
Right now I came to this:
-- Asker's attempt: number the rows inside each (FirstName, LastName) group
-- and keep the first row of every group.
-- NOTE: the inner "select * from SELECT ..." is not wrapped in parentheses,
-- so the statement does not parse as written.
-- NOTE: partitioning by LastName puts NULL surnames in a group of their own,
-- so nothing here suppresses the non-NULL surnames of the same first name.
SELECT FirstName,LastName, DEPARTMENT,Salary,RK FROM
(
select * from
SELECT EXT.*,
ROW_NUMBER() OVER(PARTITION BY EXT.FirstName,EXT.LastName
ORDER BY rownum ASC) AS RK
FROM Employees EXT
)
WHERE RK = 1 ;
Thanks !

Your problem is in the PARTITION clause. You want every first name where there is a surname unless at least one surname with that first name is NULL, in which case you want only those first names that have a NULL surname.
The answer here is to use RANK() instead of ROW_NUMBER(). RANK() does not create a consecutive list; instead rows with equal values get the same rank.
-- One row per (firstname, lastname); but when any row of a first name has a
-- NULL surname, only the NULL-surname row of that first name is returned.
select firstname, lastname, department, salary, rnk
from ( select a.*
            -- NULL surnames order first (0). If one exists it alone gets
            -- rank 1; otherwise every row of the first name ties at rank 1.
            , rank() over ( partition by firstname
                            order by case when lastname is null then 0
                                          else 1
                                     end
                          ) as rnk
            -- RANK() keeps ties, so repeats of the same full name must be
            -- collapsed separately. Which department survives is arbitrary.
            , row_number() over ( partition by firstname, lastname
                                  order by department, salary
                                ) as rn
       from employees a
     )
where rnk = 1   -- the alias is rnk; the original outer query referenced "rk"
  and rn = 1
This works by making the existence of a surname relevant rather than the surname itself.
Two more points:
You had a nested select without parentheses. This won't work.
There's no point ordering by ROWNUM. By definition, ROWNUM numbers the rows in the order in which the statement returns them, so ordering by it leaves the rows in the order they already had.

something like this:
SQL> create table person
2 (
3 fname varchar2(10),
4 lname varchar2(10),
5 dept varchar2(10),
6 sal number
7 );
Table created.
SQL> insert into person values ('John', 'Doe', 'Finance', 20);
1 row created.
SQL> insert into person values ('John', 'Doe', 'R&D', 20);
1 row created.
SQL> insert into person values ('John', '', 'Finance', 20);
1 row created.
SQL> insert into person values ('John', 'Long', 'Finance', 20);
1 row created.
SQL> insert into person values ('Paul', 'Doe', 'R&D', 30);
1 row created.
SQL> insert into person values ('Paul', 'Doe', 'Finance', 30);
1 row created.
SQL> insert into person values ('Paul', 'Long', 'Finance', 30);
1 row created.
SQL> select fname, lname, dept, sal
2 from (select fname, lname, dept, sal,has_null,
3 row_number() over(partition by fname,
4 case when has_null = 'N' then lname else null end
5 order by lname desc nulls first) rn
6 from (select fname, lname,
7 nvl(max(case when lname is null then 'Y'
8 end) over(partition by fname), 'N') has_null, dept, sal
9 from person))
10 where rn = 1;
FNAME LNAME DEPT SAL
---------- ---------- ---------- ----------
John Finance 20
Paul Doe R&D 30
Paul Long Finance 30

This query does the same trick, but performs better.
-- One row per (fname, lname); when a first name has at least one NULL
-- surname, only its NULL-surname row is kept.
WITH ranked AS (
    SELECT fname,
           lname,
           dept,
           sal,
           -- NULLs are ordered first, so this is NULL exactly when the
           -- first name has a NULL surname somewhere.
           FIRST_VALUE(lname) OVER (PARTITION BY fname
                                    ORDER BY lname NULLS FIRST) AS first_lname,
           -- Numbers the repeats of one full name; the order among them is
           -- not specified because fname is constant inside the partition.
           ROW_NUMBER() OVER (PARTITION BY fname, lname
                              ORDER BY fname) AS seq
    FROM person
)
SELECT fname,
       lname,
       dept,
       sal
FROM ranked
WHERE seq = 1
  AND (first_lname IS NOT NULL OR lname IS NULL);

Related

Remove duplicate rows in Postgres

I have two tables:
Employee:
ID
Name
Surname
143
Amy
Flowers
245
Natasha
Smith
365
John
Alexander
445
Natasha
Smith
565
Monica
Withhouse
644
Amy
Flowers
1023
Amy
Alexander
And employee_details:
ID
Employee_id
Document_numer
1
644
XXXXXXXXX
2
245
XXXXXX
3
365
XXXXXX
I need to remove duplicate records from the Employee table that are not referenced by the employee_details table. In the example data, I would like to delete the duplicate employees with the ids 143 and 445.
I must admit that I have no idea how to do it. Could you give me a hint?
The database is Postgres.
-- Delete employees that (a) have no employee_details row and (b) share their
-- full name with at least one other employee.
-- Duplicates are matched on (name, surname); matching on name alone would
-- also delete "Amy Alexander" (1023) just because other Amys exist.
-- NOT EXISTS is used instead of NOT IN so that a NULL Employee_id in
-- employee_details cannot make the predicate match nothing.
-- Note: if no copy of a duplicated employee has a details row, every copy
-- is deleted.
DELETE FROM Employee AS e
WHERE NOT EXISTS (
        SELECT 1
        FROM employee_details AS d
        WHERE d.Employee_id = e.id
      )
  AND EXISTS (
        SELECT 1
        FROM Employee AS dup
        WHERE dup.name = e.name
          AND dup.surname = e.surname
          AND dup.id <> e.id
      );
Though the question is already answered I am adding two different answers here using cte.
-- Sample schema and data used by the delete queries below.
create table Employee (
    ID      int,
    Name    varchar(50),
    Surname varchar(50)
);

insert into Employee (ID, Name, Surname) values
    (143,  'Amy',     'Flowers'),
    (245,  'Natasha', 'Smith'),
    (365,  'John',    'Alexander'),
    (445,  'Natasha', 'Smith'),
    (565,  'Monica',  'Withhouse'),
    (644,  'Amy',     'Flowers'),
    (1023, 'Amy',     'Alexander');

create table employee_details (
    ID             int,
    Employee_id    int,
    Document_numer varchar(50)
);

insert into employee_details (ID, Employee_id, Document_numer) values
    (1, 644, 'XXXXXXXXX'),
    (2, 245, 'XXXXXX'),
    (3, 365, 'XXXXXX');
Delete query 1:
-- Delete query 1: remove employees whose (name, surname) occurs more than
-- once and who have no row in employee_details.
with duplicate_employees as
(
    -- Only the id and the size of its (name, surname) group are needed.
    select id,
           count(id) over (partition by name, surname) as duplicate_count
    from Employee
)
delete from Employee
where id in (
    select de.id
    from duplicate_employees de
    where de.duplicate_count > 1
      and not exists
      (
          select 1 from employee_details e where e.Employee_id = de.id
      )
);  -- terminator needed: the DELETE and the SELECT are separate statements

-- Show what is left; ordered so the listing is repeatable.
select id, name, surname
from Employee
order by id;
Output:
id
name
surname
245
Natasha
Smith
365
John
Alexander
565
Monica
Withhouse
644
Amy
Flowers
1023
Amy
Alexander
db<>fiddle here
Delete query 2:
-- Delete query 2: flag each employee with the size of its (name, surname)
-- group and whether employee_details references it, then delete the
-- unreferenced members of groups larger than one.
with cte as
(
    select e.id,
           count(*) over (partition by e.name, e.surname) as duplicate_count,
           case when exists
                (
                    select 1 from employee_details ed where ed.Employee_id = e.id
                )
                then 1 else 0
           end as exist_in_details
    from Employee e
)
delete from Employee
where id in (
    select id
    from cte
    where duplicate_count > 1
      and exist_in_details = 0
);  -- terminator needed: the DELETE and the SELECT are separate statements

-- Show what is left; ordered so the listing is repeatable.
select id, name, surname
from Employee
order by id;
Output:
id
name
surname
245
Natasha
Smith
365
John
Alexander
565
Monica
Withhouse
644
Amy
Flowers
1023
Amy
Alexander
db<>fiddle here

How do I return both rows if two rows share the maximum value?

I need to find the persons with the maximum salaries in each department. I've got the code and found out the persons with the maximum salaries for each department. But then, when I looked at my data, there is another person that has the equal max value in the same department. Is there a way to return both persons' name?
example table:
Department Salary Name
Admin $1000 Amy
Admin $900 Ben
HR $1500 Cassy
HR $1500 Dan
I have tried this code:
-- Asker's attempt: one output row per department.
-- NOTE: "name" is neither in GROUP BY nor aggregated, so at most one name
-- per department can come back -- the asker reports getting only Cassy for HR.
SELECT department, Max(salary), name
FROM table
GROUP BY department
ORDER BY salary desc;
I've been getting Admin's person's details OK. But HR I can only get Cassy's name. Is there a way to get Dan's name in my output as well? Can anyone give me an example? Thank you
Hope this can help
-- Every person whose salary equals the highest salary of their own
-- department; ties are all returned.
SELECT t.department,
       t.salary,
       t.name
FROM table t
WHERE t.salary = (
        SELECT MAX(peer.salary)
        FROM table peer
        WHERE peer.department = t.department
      )
You didn't mention the DBMS you are using.
With standard SQL, you can use window functions for this (which are supported by all modern DBMS):
-- Rank salaries inside each department (highest = 1) and keep rank 1.
-- DENSE_RANK gives tied salaries the same rank, so ties are all returned.
with ranked as (
    select department,
           salary,
           name,
           dense_rank() over (partition by department
                              order by salary desc) as salary_rank
    from department
)
select department,
       salary,
       name
from ranked
where salary_rank = 1;
With NOT EXISTS:
-- Keep a row only if nobody in the same department earns strictly more.
SELECT t.department,
       t.salary,
       t.name
FROM tablename t
WHERE NOT EXISTS (
        SELECT 1
        FROM tablename higher
        WHERE higher.department = t.department
          AND higher.salary > t.salary
      )
ORDER BY t.salary DESC,
         t.name;
See the demo.
Results:
| Department | Salary | Name |
| ---------- | ------ | ----- |
| HR | 1500 | Cassy |
| HR | 1500 | Dan |
| Admin | 1000 | Amy |
You can use two levels of aggregation if you want one row per department with all the names listed on that row:
-- First collapse each (dept, salary) to one row with its names concatenated,
-- then keep the highest-salary row of every department.
with per_salary as (
    select dept,
           salary,
           group_concat(name) as names,
           row_number() over (partition by dept
                              order by salary desc) as salary_pos
    from example
    group by dept, salary
)
select dept,
       salary,
       names
from per_salary
where salary_pos = 1;

SQL Server - How to make partially duplicate rows inherit values from original row

In order to link records across datasets I first deleted the records down to non-duplicates based on key linking variables (partitioning over names, dob, sex etc. and deleting where row_number > 1). After the linking was done I'm left with a new variable "unique_id" however this will only be attributed to the original record (since I removed the partial duplicates). I now want to reattach this "unique_id" back to all of the partial duplicates. How could I go about doing this? Is there perhaps a better method I could have used from the start?
Data currently looks like this:
row_number unique_id id first_name last_name activity_date
1 10 2 Davy Jones 1726-11-25
2 -- 12 Davy Jones 1751-02-11
3 -- 43 Davy Jones 1811-06-15
1 100 12114 John Smith 2018-06-01
2 -- 123123 John Smith 2022-07-05
1 90 2591 Mary Sue 2013-05-18
And I want the "unique_id" to inherit the originals like this:
row_number unique_id id first_name last_name activity_date
1 10 2 Davy Jones 1726-11-25
2 10 12 Davy Jones 1751-02-11
3 10 43 Davy Jones 1811-06-15
1 100 12114 John Smith 2018-06-01
2 100 123123 John Smith 2022-07-05
1 90 2591 Mary Sue 2013-05-18
Code to produce this table is as follows:
-- Temp table holding the linked records: only the first record of each
-- person carries a unique_id, the partial duplicates have NULL.
create table #test (
    unique_id     int,
    id            int,
    first_name    varchar(255),
    last_name     varchar(255),
    activity_date date
);

insert into #test (unique_id, id, first_name, last_name, activity_date)
values (100,  12114,  'John', 'Smith', '2018-06-01'),
       (NULL, 123123, 'John', 'Smith', '2022-07-05'),
       (90,   2591,   'Mary', 'Sue',   '2013-05-18'),
       (10,   2,      'Davy', 'Jones', '1726-11-25'),
       (NULL, 12,     'Davy', 'Jones', '1751-02-11'),
       (NULL, 43,     'Davy', 'Jones', '1811-06-15');

-- Number the records of each person; the ORDER BY columns are constant
-- inside a partition, so the numbering order is not specified.
select row_number() over (partition by first_name, last_name
                          order by first_name, last_name) as row_number,
       unique_id,
       id,
       first_name,
       last_name,
       activity_date
from #test
A simple method -- assuming one value per first_name/last_name pair -- is to use window functions:
-- MAX ignores NULLs, so every row of a person sees that person's one
-- non-NULL unique_id.
select t.*,
       max(t.unique_id) over (partition by t.first_name,
                                           t.last_name) as new_unique_id
from #test as t;
This can be put into an update:
-- Updatable CTE: compute the per-person unique_id next to each row, then
-- write it back through the CTE.
with toupdate as (
    select src.unique_id,
           max(src.unique_id) over (partition by src.first_name,
                                                 src.last_name) as new_unique_id
    from #test as src
)
update toupdate
set unique_id = new_unique_id;
Here is a rextester illustrating the syntax.
Try this:
-- Compute each person's unique_id once per row, then join back on id and
-- copy it onto every record of that person.
with group_ids as (
    select id,
           max(unique_id) over (partition by first_name,
                                             last_name) as GroupUniqueID
    from #test
)
update t
set unique_id = g.GroupUniqueID
from #test as t
inner join group_ids as g
    on g.id = t.id;

select * from #test;
Result
unique_id id first_name
----------- ----------- ------------
100 12114 John
100 123123 John
90 2591 Mary
10 2 Davy
10 12 Davy
10 43 Davy
Looks like you should join a subset of the records that has the linking id with the records that don't have the linking id using whatever fields you think appropriate and then update the id in the unlinked set from the id in the linked set.

SQL Employees Searching for Others in Same Department

I have a business rule that employees cannot purchase items from employees of the same department. I have two tables. one is the list of employees and their IDs:
Emp_ID Emp_Name Dept_ID
1 John 1
2 Bob 1
3 Susie 2
4 Jack 3
5 Jill 3
And a table of the employee ID and the employee ID they purchased from:
Emp_ID Bought_From_Emp_ID
1 2
2 3
4 5
5 1
My expected output would be the employee ID (or name) of both employees whenever one purchased an item from someone in the same department:
Emp_ID Bought_From_Emp_ID Same_Dept_ID
1 2 1 --John and Bob are in Same Department (1)
4 5 3 --Jack and Jill are in Same Department (3)
How would I do this for millions of records? I have a feeling that this is very simple in the long run, but my mind hasn't shifted towards the solution yet.
I am using Teradata, but can use MSSQL if there are any SQL-specific answers
-- Table variables take the @ sigil in T-SQL; "DECLARE #emp TABLE" does not
-- parse (# is only for temp tables made with CREATE TABLE).
DECLARE @emp TABLE
(
    emp_id INT,
    emp_name VARCHAR(20),
    dept_id INT
);
INSERT INTO @emp
(
    emp_id,
    emp_name,
    dept_id
)
VALUES
--Emp_ID Emp_Name Dept_ID
(1, 'John ', 1),
(2, 'Bob', 1),
(3, 'Susie', 2),
(4, 'Jack', 3),
(5, 'Jill', 3);

DECLARE @purch TABLE
(
    emp_id INT,
    Bought_From_Emp_ID INT
);
INSERT INTO @purch
(
    emp_id,
    Bought_From_Emp_ID
)
VALUES
(1, 2),
(2, 3),
(4, 5),
(5, 1);

-- Purchases where buyer (e) and seller (e1) belong to the same department.
SELECT e.emp_id,
       e1.emp_id AS Bought_From_Emp_ID,
       e.dept_id AS Same_Dept_ID
FROM @purch p
JOIN @emp e
    ON p.emp_id = e.emp_id
JOIN @emp e1
    ON p.Bought_From_Emp_ID = e1.emp_id
   AND e.dept_id = e1.dept_id
-- Ignore a purchase recorded against oneself.
WHERE e1.emp_id <> e.emp_id;
Try this query
-- Look up buyer and seller in emp and keep the purchases where both sit in
-- the same department.
SELECT p.emp_id             AS EmpID,
       p.bought_from_emp_id AS BoughtFrom,
       buyer.dept_id        AS department
FROM purch AS p
INNER JOIN emp AS buyer
    ON buyer.emp_id = p.emp_id
INNER JOIN emp AS seller
    ON seller.emp_id = p.bought_from_emp_id
   AND seller.dept_id = buyer.dept_id
Output
+-------+------------+------------+
| EmpID | BoughtFrom | department |
+-------+------------+------------+
| 1 | 2 | 1 |
| 4 | 5 | 3 |
+-------+------------+------------+
Demo: http://www.sqlfiddle.com/#!18/22746/1/0

How do I select a row from nearly duplicate rows based on a field value?

If I have rows with this data:
ID |Name |ContractType|
---|------------|------------|
1 |Aaron Shatz | 6-month |
2 |Jim Smith |12-month |
3 |Jim Smith | 6-month |
4 |Mark Johnson|12-month |
I can't use Id to determine which record to use: I have to use ContractType. I want to select all records from a table, but if there are records with the same Name value, I want to pick the 12-month contract record.
The result of the query should be:
ID |Name |ContractType|
---|------------|------------|
1 |Aaron Shatz | 6-month |
2 |Jim Smith |12-month |
4 |Mark Johnson|12-month |
Hard coded version
This solution assumes that there are only two contract types namely 6-month and 12-month. Please scroll to the bottom for dynamic version.
Click here to view the demo in SQL Fiddle.
Script:
CREATE TABLE contracts
(
    id INT NOT NULL IDENTITY
  , name VARCHAR(30) NOT NULL
  , contracttype VARCHAR(30) NOT NULL
);

INSERT INTO contracts (name, contracttype) VALUES
('Aaron Shatz', '6-month'),
('Jim Smith', '12-month'),
('Jim Smith', '12-month'),
('Mark Johnson', '12-month'),
('John Doe', '6-month'),
('Mark Johnson', '6-month'),
('Aaron Shatz', '6-month');

-- One row per name, preferring the 12-month contract.
SELECT id
     , name
     , contracttype
FROM
(
    SELECT id
         , name
         , contracttype
           -- State the preference explicitly instead of relying on the
           -- string '12-month' happening to sort before '6-month'.
           -- id breaks ties, so the same row wins on every run.
         , ROW_NUMBER() OVER(PARTITION BY name
                             ORDER BY CASE contracttype
                                          WHEN '12-month' THEN 0
                                          ELSE 1
                                      END
                                    , id) AS rownum
    FROM contracts
) T1
WHERE rownum = 1
ORDER BY id;
Output:
id name contracttype
-- ------------ ------------
1 Aaron Shatz 6-month
2 Jim Smith 12-month
4 Mark Johnson 12-month
5 John Doe 6-month
Dynamic version
This moves the contract type data into a table of its own with a sequence column. Based on how the contract types are ordered, the query will fetch the appropriate records.
Click here to view the demo in SQL Fiddle.
Script:
CREATE TABLE contracts
(
    id INT NOT NULL IDENTITY
  , name VARCHAR(30) NOT NULL
  , contracttypeid INT NOT NULL
);

-- sequence holds the preference order: the lowest value wins.
CREATE TABLE contracttypes
(
    id INT NOT NULL IDENTITY
  , contracttype VARCHAR(30) NOT NULL
  , sequence INT NOT NULL
);

INSERT INTO contracttypes (contracttype, sequence) VALUES
('12-month', 1),
('6-month', 3),
('15-month', 2);

INSERT INTO contracts (name, contracttypeid) VALUES
('Aaron Shatz', 2),
('Jim Smith', 2),
('Jim Smith', 3),
('Mark Johnson', 1),
('John Doe', 2),
('Mark Johnson', 2),
('Aaron Shatz', 2);

-- One row per name: the contract whose type has the lowest sequence.
SELECT id
     , name
     , contracttype
FROM
(
    SELECT c.id
         , c.name
         , ct.contracttype
           -- The LEFT JOIN yields a NULL sequence for an unknown
           -- contracttypeid, and SQL Server sorts NULL first; push those
           -- rows last so a real contract type wins.
           -- c.id breaks ties, so the same row wins on every run.
         , ROW_NUMBER() OVER(PARTITION BY c.name
                             ORDER BY CASE WHEN ct.sequence IS NULL THEN 1 ELSE 0 END
                                    , ct.sequence
                                    , c.id) AS rownum
    FROM contracts c
    LEFT OUTER JOIN contracttypes ct
        ON c.contracttypeid = ct.id
) T1
WHERE rownum = 1
ORDER BY id;
Output:
id name contracttype
-- ------------ ------------
1 Aaron Shatz 6-month
3 Jim Smith 15-month
4 Mark Johnson 12-month
5 John Doe 6-month
This works only because the OP has confirmed that only two contract types are possible, and the one he wants (for each contractor) happens to be the one that orders first alphabetically. So a couple of coincidences make this solution straight-forward.
-- One row per Name: the row whose ContractType sorts first as a string.
SELECT ranked.ID,
       ranked.Name,
       ranked.ContractType
FROM
(
    SELECT ID,
           Name,
           ContractType,
           ROW_NUMBER() OVER (PARTITION BY Name
                              ORDER BY ContractType) AS rn
    FROM dbo.some_table
) AS ranked
WHERE ranked.rn = 1
ORDER BY ranked.ID;
If you need to make this more dynamic, I suppose you could say:
-- Local variables take the @ sigil in T-SQL; "DECLARE #Preferred..." does
-- not parse.
DECLARE @PreferredContractType VARCHAR(32);
SET @PreferredContractType = '12-month';

-- One row per Name: the preferred contract type when the name has one,
-- otherwise one of its other rows.
;WITH x AS
(
    SELECT ID, Name, ContractType, rn = ROW_NUMBER() OVER
        (PARTITION BY Name
         ORDER BY CASE ContractType
                      WHEN @PreferredContractType THEN 1 ELSE 2 END
                , ID  -- tie-breaker so the same row wins on every run
        )
    FROM dbo.some_table
)
SELECT ID, Name, ContractType
FROM x
WHERE rn = 1
ORDER BY ID;