Sql query to fetch minimum date start against each employee - sql

I have two tables, asg and work rship.
In the work rship table there is a date_start for each employee. A few employees have more than one date_start row, so I have to choose the MIN(date_start) for each employee from the work rship table.
For this I wrote a query:
-- Lists each assignment with the digits extracted from its name (as PERSON_NUMBER) and the
-- start date, worker type and legal employer taken from the joined work_table row.
SELECT assignment_name,
REGEXP_SUBSTR(ASSIGNMENT_NAME, '[0-9]+') PERSON_NUMBER,
-- NVL supplies a fallback whenever the wrk value is NULL, which includes the case
-- where the outer join found no wrk row at all.
NVL( wrk.date_start,t.effective_start_date) ,
NVL( WRK.WORKER_TYPE,'E'),
NVL( WRK.LEGAL_EMPLOYER_NAME, 'N/A')
FROM
-- c = number of assignment rows sharing the same assignment_name, effective_start_date,
-- effective_end_date and effective_latest_change; the outer WHERE keeps only c = 1.
(SELECT apps.assignment_table.*,
COUNT(*) OVER (PARTITION BY assignment_name, effective_start_date, effective_end_date, effective_latest_change) AS c
FROM apps.assignment_table
) T
LEFT OUTER JOIN
-- The scalar subquery below is not correlated to any person: MIN(date_start) is taken
-- over every row of apps.work_table, so wrk only contains rows carrying that single
-- table-wide minimum date.
(SELECT *
FROM apps.work_table
WHERE date_start =
(SELECT MIN(date_start) FROM apps.work_table
)
) wrk
-- Rows are matched on the first run of digits found in assignment_name.
ON regexp_substr(t.assignment_name, '[0-9]+')=wrk.person_number
WHERE C =1;
But in the above query I made a mistake: selecting MIN(date_start) from apps.work_table returns the minimum date_start of the entire table, not the minimum for each employee.
Instead, I should have put the condition regexp_substr(t.assignment_name, '[0-9]+') = wrk.person_number inside the inline query itself.
But when I include it there, the query does not work.

LEFT OUTER JOIN
(SELECT *
FROM apps.work_table
WHERE date_start =
-- Uncorrelated scalar subquery: MIN(date_start) is computed over every row of apps.work_table.
(SELECT MIN(date_start) FROM apps.work_table
)
) wrk
ON
Unless the date_start for the person happens to be the minimum value across all people in the entire table (and not just the minimum date for this person), the code above will not find any rows that match the person you want.
What you probably meant to do is something like:
LEFT OUTER JOIN
  -- For every person_number keep only the work_table row numbered 1 when the
  -- person's rows are ordered by date_start (earliest first).
  ( SELECT ranked.*
    FROM ( SELECT w.*,
                  ROW_NUMBER() OVER ( PARTITION BY w.person_number
                                      ORDER BY w.date_start ) AS rn
           FROM apps.work_table w
         ) ranked
    WHERE ranked.rn = 1
  ) wrk
ON

Related

SAME code with changes in WHERE clause inside a sub query gives different result SQL

I am trying to find the difference between 2 code snippets that are similar since it is giving different results.
In the code below, The count is 15506
-- Variant 1: the event_type / ts_est / product filters sit INSIDE the subquery.
select
application_ID,
userid,
product,
sub_product,
ts_est,
event_type from
(
Select
fa.ApplicationId as application_ID,
ro.userid,
ro.product,
ro.sub_product,
ro.ts_est,
ro.event_type,
from_utc_timestamp(fa.bornondate, 'America/New_York') as app_bornondate,
-- Numbers the joined rows of each (userid, product, event_type) group, newest fa.bornondate first.
row_number() over(
partition by ro.userid, ro.product, ro.event_type
order by
fa.bornondate desc
) as rowid
from
T1 ro
left join T2 fa on ro.userid = fa.userid
-- WHERE is evaluated before row_number(), so only rows that already pass these
-- filters are numbered; rowid = 1 is therefore always a row inside the date range.
where event_type = "OS"
and cast(ts_est as date) >= '2021-10-05' and cast(ts_est as date) <= '2022-03-07' and product = "pt"
)
where rowid = 1
Here is the 2nd code snippet where I have used the conditions outside the subquery. The count is 15096
-- Variant 2: the same filters are applied OUTSIDE the subquery, after the rows are numbered.
select
application_ID,
userid,
product,
sub_product,
ts_est,
event_type from
(
Select
fa.ApplicationId as application_ID,
ro.userid,
ro.product,
ro.sub_product,
ro.ts_est,
ro.event_type,
from_utc_timestamp(fa.bornondate, 'America/New_York') as app_bornondate,
-- No WHERE in this subquery: row_number() runs over every joined row of each
-- (userid, product, event_type) group, regardless of ts_est.
row_number() over(
partition by ro.userid, ro.product, ro.event_type
order by
fa.bornondate desc
) as rowid
from
T1 ro
left join T2 fa on ro.userid = fa.userid
)
-- The filters now test only the row that received rowid = 1. If that row's ts_est is
-- outside the date range, its whole group returns nothing, even when another row of
-- the group lies inside the range.
where rowid = 1 and cast(ts_est as date) >= '2021-10-05'
and cast(ts_est as date) <= '2022-03-07' and product = "pt" and event_type = "OS"
When I am using a minus operation for the two codes. It is showing 410 differences but the userid is present in both the tables which is strange.
I am new to sql and I'm having trouble taking this further though.
Thanks!

Pulling rows with SQL MAXDATE

I'm trying to run a query against an HR table. I'm bringing in all employees and their dependents who share the same 'Primary_Key_Value'. My statement works, but I'm getting duplicates because some dependents have multiple MED_COV_EFFECTIVE_DATEs. I need to bring in only the latest (MAX) date. When I try to use the MAX(MED_COV_EFFECTIVE_DATE) function, I get errors. Can someone please help me?
-- Returns the distinct coverage rows of every PRIMARY_KEY_VALUE that has at least one
-- row with a non-NULL HIRE_DATE.
-- MED_COV_EFFECTIVE_DATE is part of the select list, so DISTINCT keeps one row per
-- distinct effective date rather than one row per person.
SELECT DISTINCT PRIMARY_KEY_VALUE, RECORD_ID, LAST_NAME, FIRST_NAME, DATE_OF_BIRTH, HIRE_DATE,
RELATIONSHIP_CODE, MED_COV_EFFECTIVE_DATE, SOCIAL_SECURITY_NUMBER
FROM COVERAGE_TABLE T1
WHERE T1.PRIMARY_KEY_VALUE IN
(
SELECT T2.PRIMARY_KEY_VALUE
FROM COVERAGE_TABLE T2
WHERE T2.HIRE_DATE IS NOT NULL
)
ORDER BY PRIMARY_KEY_VALUE, RECORD_ID
Dang, earlier I wasn't thinking about the fact that the inner select can only return one column. Try something like this:
-- Builds a temp table holding, for each PRIMARY_KEY_VALUE that has rows with a HIRE_DATE,
-- the newest Med_Cov_Effective_Date among those rows.
SELECT T2.PRIMARY_KEY_VALUE,
       -- SELECT ... INTO needs a name for every output column, so the aggregate is aliased.
       MAX(T2.Med_Cov_Effective_Date) AS Max_Med_Cov_Effective_Date
INTO #MostRecentCoveredKeys
FROM COVERAGE_TABLE T2
WHERE T2.HIRE_DATE IS NOT NULL
GROUP BY T2.Primary_Key_Value
This should give you a unique set of Primary_Key_Values.
Or CTE Version:
-- CTE holding, for each PRIMARY_KEY_VALUE that has rows with a HIRE_DATE, the newest
-- Med_Cov_Effective_Date among those rows.
-- The CTE body must be wrapped in parentheses and every column needs a name,
-- hence the alias on the aggregate.
; WITH MostRecentCoveredKeys
AS
(
    SELECT T2.PRIMARY_KEY_VALUE,
           MAX(T2.Med_Cov_Effective_Date) AS Max_Med_Cov_Effective_Date
    FROM COVERAGE_TABLE T2
    WHERE T2.HIRE_DATE IS NOT NULL
    GROUP BY T2.Primary_Key_Value
)
Then JOIN the original table and cte (or temp table) like so:
-- Coverage rows for every key present in MostRecentCoveredKeys.
-- Every column is qualified with T1 because PRIMARY_KEY_VALUE also exists in mrck;
-- an unqualified reference to it would be ambiguous.
-- NOTE(review): the join matches on the key only, so older coverage rows of a key are
-- still returned. To keep just the latest row, also match T1.MED_COV_EFFECTIVE_DATE to
-- the MAX date column of mrck (that column must be given an alias in the temp table / CTE).
SELECT T1.PRIMARY_KEY_VALUE, T1.RECORD_ID, T1.LAST_NAME, T1.FIRST_NAME, T1.DATE_OF_BIRTH,
       T1.HIRE_DATE, T1.RELATIONSHIP_CODE, T1.MED_COV_EFFECTIVE_DATE,
       T1.SOCIAL_SECURITY_NUMBER
FROM COVERAGE_TABLE T1
INNER JOIN MostRecentCoveredKeys mrck
        ON mrck.Primary_Key_Value = T1.Primary_Key_Value
ORDER BY T1.PRIMARY_KEY_VALUE, T1.RECORD_ID
--you need to include the '#' in front of table name
--on join if using the temp table version
--DROP TABLE #MostRecentCoveredKeys
This query will fetch the latest data:
-- Coverage rows (with a HIRE_DATE) whose MED_COV_EFFECTIVE_DATE equals the newest such
-- date recorded for their PRIMARY_KEY_VALUE.
SELECT cov.PRIMARY_KEY_VALUE,
       cov.RECORD_ID,
       cov.LAST_NAME,
       cov.FIRST_NAME,
       cov.DATE_OF_BIRTH,
       cov.HIRE_DATE,
       cov.RELATIONSHIP_CODE,
       cov.MED_COV_EFFECTIVE_DATE,
       cov.SOCIAL_SECURITY_NUMBER
FROM (
        -- candidate rows: only those carrying a hire date
        SELECT *
        FROM COVERAGE_TABLE
        WHERE HIRE_DATE IS NOT NULL
     ) cov
INNER JOIN (
        -- newest effective date per key, computed over the same candidate rows
        SELECT PRIMARY_KEY_VALUE,
               MAX(MED_COV_EFFECTIVE_DATE) MAX_DATE
        FROM COVERAGE_TABLE
        WHERE HIRE_DATE IS NOT NULL
        GROUP BY PRIMARY_KEY_VALUE
     ) newest
   ON newest.PRIMARY_KEY_VALUE = cov.PRIMARY_KEY_VALUE
  AND newest.MAX_DATE = cov.MED_COV_EFFECTIVE_DATE;
-- Numbers the rows of each PRIMARY_KEY_VALUE from newest to oldest Med_Cov_Effective_Date
-- (rows with a HIRE_DATE only) and keeps the row numbered 1.
-- NOTE(review): partitioning by PRIMARY_KEY_VALUE alone yields one row per key; if one
-- row per person is wanted, the person's identifying column must be added to PARTITION BY.
;with a as
(
select
*
,row_number() over(partition by PRIMARY_KEY_VALUE order by Med_Cov_Effective_Date desc) rn
FROM COVERAGE_TABLE T1
WHERE HIRE_DATE IS NOT NULL
)
select *
from a
where rn=1
ORDER BY PRIMARY_KEY_VALUE, RECORD_ID

Get the rows with first phone number

I need to display 100 members per page. Because of multiple phone numbers of a member, I have to pick the first phone number for each member.
Here is the query which one gets every phone numbers of a member:
-- Pages through members whose NAME starts with 'U'; a member joined to several
-- MEMBER_PHONE rows appears once per phone number.
SELECT * FROM
(
SELECT
-- ORDER BY (1) orders by a constant, so the numbering is not tied to NAME.
row_number() over(order by(1)) rn,
NAME, PHONE
FROM MEMBERS t0
LEFT OUTER JOIN MEMBER_IDENTITY ON MEMBER_IDENTITY.ID=t0.ID
LEFT JOIN MEMBER_PHONE ON MEMBER_PHONE.MEMBER_ID=t0.ID
WHERE
NAME LIKE 'U%'
-- This sorts the subquery output; rn above is assigned independently of it.
ORDER BY NAME ASC
)
-- row_number() starts at 1, so the lower bound rn >= 0 never excludes a row.
WHERE rn >= 0
AND rn <= 100
How can I pick first -or MAX, etc- phone number?
You could make a sub query for retrieving the phone numbers together with their row number per member, and then filter out the first of them:
-- First page (rows 1-100, ordered by name) of members whose name starts with 'U',
-- each with at most one phone number.
SELECT *
FROM (
    SELECT row_number() OVER (ORDER BY name ASC) rn,
           name,
           phone
    FROM members t0
    LEFT JOIN member_identity
           ON member_identity.id = t0.id
    LEFT JOIN (
        -- Number each member's phones; ORDER BY (1) is a constant, so which phone
        -- receives ph_rn = 1 is arbitrary.
        SELECT member_id,
               phone,
               row_number() OVER (PARTITION BY member_id ORDER BY (1)) ph_rn
        FROM member_phone
    ) member_phone
           ON member_phone.member_id = t0.id
          -- Kept in the ON clause so members without any phone are still returned.
          AND ph_rn = 1
    WHERE name LIKE 'U%'
    ORDER BY name ASC
)
WHERE rn BETWEEN 0 AND 100
I would:
use the same order for determining the rn value as the result set (ORDER BY name ASC), otherwise the order will not be consistent across pages;
use BETWEEN in the outer WHERE condition, although the lower bound condition (0) is not necessary for the first page.
When you switch to MAX you can apply the ROW_NUMBER directly on the name (windowed aggregate functions are calculated after GROUP BY/HAVING):
-- One row per NAME starting with 'U', carrying the highest PHONE value of that name,
-- numbered by NAME for paging (rows 1-100).
SELECT * FROM
(
SELECT
-- The window function is evaluated after GROUP BY, so rn numbers the grouped rows.
row_number() over (ORDER BY NAME ASC) rn,
NAME,
-- Aliased so the derived table exposes a properly named column to the outer SELECT *.
MAX(PHONE) AS PHONE
FROM MEMBERS t0
LEFT OUTER JOIN MEMBER_IDENTITY ON MEMBER_IDENTITY.ID=t0.ID
LEFT JOIN MEMBER_PHONE ON MEMBER_PHONE.MEMBER_ID=t0.ID
WHERE NAME LIKE 'U%'
GROUP BY NAME
)
WHERE rn >= 0
AND rn <= 100
or move the aggregation into a Derived Table:
-- Members whose NAME starts with 'U', each joined to the highest PHONE value recorded
-- for that member, numbered by NAME for paging (rows 1-100).
SELECT * FROM
(
SELECT
row_number() over (ORDER BY NAME ASC) rn,
NAME, PHONE
FROM MEMBERS t0
LEFT OUTER JOIN MEMBER_IDENTITY ON MEMBER_IDENTITY.ID=t0.ID
LEFT JOIN
-- Collapse MEMBER_PHONE to one row per member before joining. The grouping column
-- must be MEMBER_ID: it is the non-aggregated column selected here and the join key.
( SELECT MEMBER_ID, MAX(PHONE) AS PHONE
FROM MEMBER_PHONE
GROUP BY MEMBER_ID
) MEMBER_PHONE
ON MEMBER_PHONE.MEMBER_ID=t0.ID
WHERE NAME LIKE 'U%'
)
WHERE rn >= 0
AND rn <= 100

Use MIN() where you cannot GROUP?

I feel pretty dumb, but I get stuck with an apparently very easy query. I have something like this, where every row is a user that watched a movie:
user_id date duration
1 01-01-01 62m
1 03-01-01 95m
2 02-01-01 58m
2 06-01-01 25m
2 08-01-01 95m
3 03-01-01 96m
Now, what I would like to have is a table where I have the first movie watched by each user and its duration. The problem is if I use MIN() then I have to GROUP both user_id and duration. But if I GROUP for duration as well, then I am basically going to have the same table back. How can I solve the problem?
You can use a ranking function like ROW_NUMBER:
-- Number each user's rows from earliest to latest date, then keep the first one.
WITH CTE AS
(
    SELECT ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY date) AS rn,
           user_id,
           date,
           duration
    FROM dbo.TableName
)
SELECT user_id,
       date,
       duration
FROM CTE
WHERE rn = 1
The advantage of ROW_NUMBER is that you can change the logic easily. For example, if you want to reverse the logic and get the row of the last watched film per user, you just have to change ORDER BY date ASC to ORDER BY date DESC.
The advantage of the CTE (common table expression) is that you can also use it to delete or update these records. It is often used to delete or identify duplicates, so you can first run a SELECT to see what will be deleted or updated before you execute the change.
Try this query. I haven't tested it.
-- First viewing per user: keep a row only when the same user has no row with an
-- earlier date. user_id is returned so each result row can be attributed to its user.
SELECT n.user_id, n.date, n.duration
FROM tablename n
WHERE NOT EXISTS (
    -- EXISTS only tests for the presence of a row, so the select list is irrelevant.
    SELECT 1
    FROM tablename g
    WHERE g.user_id = n.user_id
      AND g.date < n.date
);
Assuming there can only be a single record per user per date, it'd be something like this:
-- Rows whose date equals the earliest date of their user.
-- "table" stands in for the real table name.
-- The select list uses t.* because t is the alias of the outer table; no alias y exists.
select t.*
from table t
inner join (
    -- earliest date per user
    select user_id, min(date) mindate
    from table
    group by user_id
) t1
on t.user_id = t1.user_id
and t.date = t1.mindate
You can use ROW_NUMBER() which is a ranking function that generates sequential number for every group based on the column that you want to sort. In this case, if there is a tie, only one record for every user is selected but if you want to select all of them, you need to use DENSE_RANK() rather than ROW_NUMBER()
-- Earliest row per user: rank each user's rows by date and keep rank 1.
SELECT ranked.user_id,
       ranked.date,
       ranked.duration
FROM (
        SELECT src.user_id,
               src.date,
               src.duration,
               ROW_NUMBER() OVER (PARTITION BY src.user_id ORDER BY src.date ASC) AS rn
        FROM tableName src
     ) ranked
WHERE ranked.rn = 1
this also assumes that the data type of column date is DATE
If you are using SQL Server 2005 or later, you can use windowing functions.
-- Attach each user's earliest date to every one of that user's rows, then keep the
-- rows whose own date matches it.
SELECT user_id,
       date,
       duration,
       MIN_DATE
FROM (
        SELECT user_id,
               date,
               duration,
               MIN(date) OVER (PARTITION BY user_id) AS MIN_DATE
        FROM MY_TABLE
     ) AS per_user
WHERE date = MIN_DATE
The OVER clause with PARTITION BY will "group by" the user_id and select the min date per user_id without eliminating any rows. Then you select from the table where the date is equal to the min date, and you are left with the first date per user_id. This is a common trick once you know about windowing functions.
If you want the first watch_date per user, there should be no date before this date for this user:
-- A row is a user's first viewing when that user has no row with an earlier watch_date.
SELECT *
FROM watched_movies wm
WHERE NOT EXISTS (
        SELECT 1
        FROM watched_movies earlier
        WHERE earlier.watch_date < wm.watch_date
          AND earlier.user_id = wm.user_id
      );
Note: I replaced the date column by watch_date, since date is a reserved word (type name).
This should give you the duration of the first movie watched on the earliest date:
-- Duration of each user's first viewing: earliest date per user (b), and within that
-- date the lowest session_id (c). "table" stands in for the real table name.
SELECT a.user_id, b.date, a.duration
FROM table a
INNER JOIN (SELECT user_id, min(date) AS date FROM table GROUP BY user_id) b
        ON a.user_id = b.user_id AND a.date = b.date
-- The aggregate is aliased as session_id because the join condition refers to c.session_id.
INNER JOIN (SELECT user_id, date, min(session_id) AS session_id FROM table GROUP BY user_id, date) c
        ON b.user_id = c.user_id AND b.date = c.date AND a.session_id = c.session_id
Try this:
-- Self-contained demo: TABLE1 is built inline from DUAL, then each user's rows are
-- ranked by DT and the rank-1 rows are returned.
-- Column names come from the first UNION ALL branch.
-- DT is a string literal here, so the ranking compares the values as text.
WITH TABLE1 AS (
    SELECT '1' AS USER_ID, '01-01-01' AS DT, 62 AS DURATION FROM DUAL
    UNION ALL
    SELECT '1', '03-01-01', 95 FROM DUAL
    UNION ALL
    SELECT '2', '02-01-01', 58 FROM DUAL
    UNION ALL
    SELECT '2', '06-01-01', 25 FROM DUAL
    UNION ALL
    SELECT '2', '08-01-01', 95 FROM DUAL
    UNION ALL
    SELECT '3', '03-01-01', 96 FROM DUAL
)
SELECT *
FROM (
        SELECT USER_ID,
               DT,
               DURATION,
               RANK() OVER (PARTITION BY USER_ID ORDER BY DT) AS ROW_RANK
        FROM TABLE1
     )
WHERE ROW_RANK = 1
Use a sub-query to get the min date then join that back to the table to get all other relevant columns.
-- Rows whose date equals the earliest date of their user.
SELECT T2.user_id
,T2.date
,T2.duration
FROM YourTable T2
INNER JOIN
(
-- Earliest date per user. GROUP BY is required because user_id is selected next to
-- the aggregate; without it the minimum would not be computed per user.
SELECT T1.user_id
,MIN(T1.date) as first_date
FROM YourTable T1
GROUP BY T1.user_id
) SQ
ON T2.user_id = sq.user_id
AND T2.date = sq.first_date

SQL: How to find duplicates based on two fields?

I have rows in an Oracle database table which should be unique for a combination of two fields, but the unique constraint is not set up on the table, so I need to find all rows which violate the constraint myself using SQL. Unfortunately my meager SQL skills aren't up to the task.
My table has three columns which are relevant: entity_id, station_id, and obs_year. For each row the combination of station_id and obs_year should be unique, and I want to find out if there are rows which violate this by flushing them out with an SQL query.
I have tried the following SQL (suggested by this previous question) but it doesn't work for me (I get ORA-00918 column ambiguously defined):
-- Asker's attempt, which raises ORA-00918.
-- entity_id, station_id and obs_year are unqualified in the select list although both
-- t1 and dupes expose columns with those names.
SELECT
entity_id, station_id, obs_year
FROM
mytable t1
INNER JOIN (
SELECT entity_id, station_id, obs_year FROM mytable
GROUP BY entity_id, station_id, obs_year HAVING COUNT(*) > 1) dupes
ON
t1.station_id = dupes.station_id AND
t1.obs_year = dupes.obs_year
Can someone suggest what I'm doing wrong, and/or how to solve this?
-- Number the rows of every (station_id, obs_year) pair in entity_id order;
-- anything numbered above 1 is an extra row for a pair that already has one.
SELECT *
FROM (
        SELECT m.*,
               ROW_NUMBER() OVER (PARTITION BY m.station_id, m.obs_year
                                  ORDER BY m.entity_id) AS rn
        FROM mytable m
     )
WHERE rn > 1
-- A row is a duplicate when a different physical row (different RowId) carries the
-- same station_id and obs_year.
SELECT t1.entity_id,
       t1.station_id,
       t1.obs_year
FROM mytable t1
WHERE EXISTS (
        SELECT 1
        FROM mytable t2
        WHERE t2.station_id = t1.station_id
          AND t2.obs_year = t1.obs_year
          AND t2.RowId <> t1.RowId
      )
Change the 3 fields in the initial select to be
SELECT
t1.entity_id, t1.station_id, t1.obs_year
Re-write of your query
-- Every row whose (station_id, obs_year) pair occurs more than once.
-- The select list is qualified with t1, which removes the ORA-00918 ambiguity.
SELECT
t1.entity_id, t1.station_id, t1.obs_year
FROM
mytable t1
INNER JOIN (
-- Group only by the pair that is supposed to be unique. With entity_id in the grouping
-- a group reaches COUNT(*) > 1 only when all three columns repeat, which misses
-- station_id / obs_year duplicates that carry different entity_id values.
SELECT station_id, obs_year FROM mytable
GROUP BY station_id, obs_year HAVING COUNT(*) > 1) dupes
ON
t1.station_id = dupes.station_id AND
t1.obs_year = dupes.obs_year
I think the ambiguous column error (ORA-00918) occurred because you were selecting columns whose names appear in both the table and the subquery, but you did not specify whether you wanted them from dupes or from mytable (aliased as t1).
Could you not create a new table that includes the unique constraint, and then copy across the data row by row, ignoring failures?
You need to specify the table for the columns in the main select. Also, assuming entity_id is the unique key for mytable and is irrelevant to finding duplicates, you should not be grouping on it in the dupes subquery.
Try:
-- Every row whose (station_id, obs_year) pair occurs more than once in mytable.
SELECT t1.entity_id,
       t1.station_id,
       t1.obs_year
FROM mytable t1
INNER JOIN (
        -- pairs that appear on more than one row
        SELECT station_id,
               obs_year
        FROM mytable
        GROUP BY station_id, obs_year
        HAVING COUNT(*) > 1
     ) dupes
        ON dupes.station_id = t1.station_id
       AND dupes.obs_year = t1.obs_year
-- Rows are numbered within each (station_id, obs_year) group in entity_id order;
-- rn > 1 keeps every row after the first one of its group.
SELECT *
FROM (
SELECT t.*, ROW_NUMBER() OVER (PARTITION BY station_id, obs_year ORDER BY entity_id) AS rn
FROM mytable t
)
WHERE rn > 1
by Quassnoi is the most efficient for large tables.
I had this analysis of cost :
-- Rows for which another physical row (different RowId) has the same
-- dist_code, book_date and book_no.
SELECT a.dist_code,
       a.book_date,
       a.book_no
FROM trn_refil_book a
WHERE EXISTS (
        SELECT 1
        FROM trn_refil_book b
        WHERE b.dist_code = a.dist_code
          AND b.book_date = a.book_date
          AND b.book_no = a.book_no
          AND b.RowId <> a.RowId
      )
;
gave a cost of 1322341
-- Rows whose (dist_code, book_date, book_no) combination occurs more than once.
SELECT a.dist_code,
       a.book_date,
       a.book_no
FROM trn_refil_book a
INNER JOIN (
        -- combinations that appear on more than one row
        SELECT b.dist_code,
               b.book_date,
               b.book_no
        FROM trn_refil_book b
        GROUP BY b.dist_code, b.book_date, b.book_no
        HAVING COUNT(*) > 1
     ) c
        ON c.dist_code = a.dist_code
       AND c.book_date = a.book_date
       AND c.book_no = a.book_no
;
gave a cost of 1271699
while
-- Numbers the rows of each (book_date, book_no) group in dist_code order and returns
-- every row after the first of its group.
-- NOTE(review): this partitions on (book_date, book_no) only, whereas the two queries
-- above match on dist_code, book_date and book_no; the three statements therefore do
-- not test the same duplicate key. Confirm before comparing their costs.
SELECT dist_code, book_date, book_no
FROM (
SELECT t.dist_code, t.book_date, t.book_no, ROW_NUMBER() OVER (PARTITION BY t.book_date, t.book_no
ORDER BY t.dist_code) AS rn
FROM trn_refil_book t
) p
WHERE p.rn > 1
;
gave a cost of 1021984
The table was not indexed....
-- Each (station_id, obs_year) pair that occurs on more than one row.
-- Only the columns that are supposed to be unique together are selected and grouped;
-- adding entity_id to the grouping would report a pair only when entity_id repeats too.
SELECT station_id, obs_year
FROM mytable
GROUP BY station_id, obs_year
HAVING COUNT(*) > 1
Specify the fields to find duplicates on both the SELECT and the GROUP BY.
It works by using GROUP BY to find any rows that match any other rows based on the specified Columns.
The HAVING COUNT(*) > 1 says that we are only interested in seeing any rows that occur more than 1 time (and are therefore duplicates)
I thought a lot of the solutions here were cumbersome and tough to understand since I had a 3 column primary key constraint and needed to find the duplicates. So here's an option
-- Every (id, name, value) combination that occurs more than once, with its row count.
SELECT id,
       name,
       value,
       COUNT(*)
FROM db_name.table_name
GROUP BY id,
         name,
         value
HAVING COUNT(*) > 1
I'm surprised there aren't any answers here that use a CTE (Common Table Expression)
-- For every employment row: its position among the rows sharing the same last and
-- first name (ordered by birth date) and the total number of rows in that name group.
-- Only rows belonging to a group of more than one are returned.
WITH cte AS (
    SELECT ROW_NUMBER() OVER (PARTITION BY Last_Name, First_Name
                              ORDER BY BIRTHDATE) AS RN,
           Employee_number,
           First_Name,
           Last_Name,
           BirthDate,
           -- size of the whole name group, repeated on each of its rows
           COUNT(*) OVER (PARTITION BY Last_Name, First_Name) AS CNT
    FROM employment
)
SELECT *
FROM cte
WHERE cnt > 1
Not only will this find duplicates (on first and last name only), it will tell you how many there are.