Partition By over Two Columns in Row_Number function - sql

I am trying to RANK the records using the following query:
-- Number the rows of every (EMPL_ID, HR_DEPT_ID, Transfer_Startdate) group
-- in Effective_Bdate order, restricted to the 'SGP' HR domain.
select
    row_number() over (
        partition by TW.EMPL_ID, TW.HR_DEPT_ID, TW.Transfer_Startdate
        order by TW.EMPL_ID, TW.Effective_Bdate
    ) as RN,
    TW.EMPL_ID,
    TW.HR_DEPT_ID,
    TW.Transfer_Startdate,
    Effective_BDate
from TT_EMPLOYEE_WORKDAY TW
where TW.HR_DOMAIN_CODE = 'SGP'
However, the resulting Row_Number column appears to be partitioned only by the first column. I expected rows whose PARTITION BY column values are identical to receive the same Row_Number value.
Any clue where I might be going wrong?
Using RANK or DENSE_RANK isn't an option, as I want to identify all such rows, across multiple employees, where EMPL_ID, HR_DEPT_ID and Transfer_StartDate are the same (RN=1).
Sample data:
RN AON_EMPL_ID HR_DEPT_ID Transfer_Startdate Effective_BDate
1 0100690 69895 01/01/2017 2017-01-01
2 0100690 69895 01/01/2017 2017-01-03
3 0100690 69895 01/01/2017 2017-01-04

expanding sample data to:
-- Sample table used by the queries below: one row per employee / department /
-- transfer start date / effective date.
create table t (
    aon_empl_id        varchar(16)
  , hr_dept_id         varchar(16)
  , Transfer_Startdate date
  , Effective_bdate    date
);

-- Explicit column list so the insert does not depend on column order.
-- Dates are written as ISO yyyy-mm-dd so they do not depend on the session's
-- DATEFORMAT / language setting.
insert into t (aon_empl_id, hr_dept_id, Transfer_Startdate, Effective_bdate) values
    ('0100690','69895','2017-01-01','2017-01-01')
  , ('0100690','69895','2017-01-01','2017-01-03')
  , ('0100690','69895','2017-01-01','2017-01-04')
  , ('0200700','69895','2016-01-01','2016-01-01')
  , ('0200700','69895','2016-01-01','2016-01-03')
  , ('0200700','69896','2017-01-01','2017-01-04')
  , ('0200700','69896','2017-01-01','2017-01-04');
using top with ties
-- The ORDER BY expression is 1 for the earliest Effective_bdate row of every
-- (aon_empl_id, hr_dept_id, Transfer_Startdate) group; TOP (1) WITH TIES then
-- returns every row that shares that lowest value, i.e. one row per group.
SELECT TOP (1) WITH TIES
    aon_empl_id,
    hr_dept_id,
    CONVERT(char(10), Transfer_Startdate, 120) AS Transfer_Startdate,
    CONVERT(char(10), Effective_bdate, 120)    AS Effective_bdate
FROM t
ORDER BY
    ROW_NUMBER() OVER (
        PARTITION BY aon_empl_id, hr_dept_id, Transfer_Startdate
        ORDER BY Effective_bdate
    )
rextester demo: http://rextester.com/KOIZ42069
returns:
+-------------+------------+--------------------+-----------------+
| aon_empl_id | hr_dept_id | Transfer_Startdate | Effective_bdate |
+-------------+------------+--------------------+-----------------+
| 0100690 | 69895 | 2017-01-01 | 2017-01-01 |
| 0200700 | 69895 | 2016-01-01 | 2016-01-01 |
| 0200700 | 69896 | 2017-01-01 | 2017-01-04 |
+-------------+------------+--------------------+-----------------+
Alternative using a common table expression with row_number():
-- Number the rows inside every (aon_empl_id, hr_dept_id, Transfer_Startdate)
-- group by Effective_bdate, then keep only the first row of each group.
;WITH cte AS (
    SELECT
        ROW_NUMBER() OVER (
            PARTITION BY aon_empl_id, hr_dept_id, Transfer_Startdate
            ORDER BY Effective_bdate
        ) AS rn,
        aon_empl_id,
        hr_dept_id,
        CONVERT(char(10), Transfer_Startdate, 120) AS Transfer_Startdate,
        CONVERT(char(10), Effective_bdate, 120)    AS Effective_bdate
    FROM t
)
SELECT
    rn,
    aon_empl_id,
    hr_dept_id,
    Transfer_Startdate,
    Effective_bdate
FROM cte
WHERE rn = 1
returns:
+----+-------------+------------+--------------------+-----------------+
| rn | aon_empl_id | hr_dept_id | Transfer_Startdate | Effective_bdate |
+----+-------------+------------+--------------------+-----------------+
| 1 | 0100690 | 69895 | 2017-01-01 | 2017-01-01 |
| 1 | 0200700 | 69895 | 2016-01-01 | 2016-01-01 |
| 1 | 0200700 | 69896 | 2017-01-01 | 2017-01-04 |
+----+-------------+------------+--------------------+-----------------+

-- Rank rows inside every (EMPL_ID, HR_DEPT_ID, Transfer_Startdate) group by
-- Effective_Bdate; rows with the same Effective_Bdate share a rank.
-- DENSE_RANK() can be used in place of RANK().
select
    rank() over (
        partition by TW.EMPL_ID, TW.HR_DEPT_ID, TW.Transfer_Startdate
        order by TW.EMPL_ID, TW.Effective_Bdate
    ) as RN,
    TW.EMPL_ID,
    TW.HR_DEPT_ID,
    TW.Transfer_Startdate,
    Effective_BDate
from TT_EMPLOYEE_WORKDAY TW
where TW.HR_DOMAIN_CODE = 'SGP'
UPDATE
-- The window is ordered only by EMPL_ID, which is also a partition key, so
-- every row of a partition ties and receives the same rank.
-- DENSE_RANK() can be used in place of RANK().
select
    rank() over (
        partition by TW.EMPL_ID, TW.HR_DEPT_ID, TW.Transfer_Startdate
        order by TW.EMPL_ID
    ) as RN,
    TW.EMPL_ID,
    TW.HR_DEPT_ID,
    TW.Transfer_Startdate,
    Effective_BDate
from TT_EMPLOYEE_WORKDAY TW
where TW.HR_DOMAIN_CODE = 'SGP'
order by
    RN,
    TW.Effective_Bdate

This bit of code appears to be working:
-- Within each employee, rows that share HR_DEPT_ID and Transfer_StartDate get
-- the same dense rank; each new (department, transfer date) pair gets the
-- next consecutive number.
select
    dense_rank() over (
        partition by AON_EMPL_ID
        order by AON_EMPL_ID, HR_DEPT_ID, Transfer_StartDate
    ) as RN,
    TW.AON_EMPL_ID,
    TW.HR_DEPT_ID,
    TW.Transfer_Startdate,
    Effective_BDate
from TT_AON_EMPLOYEE_WORKDAY TW
where TW.HR_DOMAIN_CODE = 'SGP'
Apparently, I just need to partition by AON_EMPL_ID and everything else should go to Order By clause.

Related

Moving average on other window function redshift

I am stuck on a problem and need some help with it.
I have a table like this:
created_time_id | txn_src
1-1-2017 | A
1-1-2017 | A
1-1-2017 | B
1-1-2017 | A
1-1-2017 | C
2-1-2017 | A
2-1-2017 | C
2-1-2017 | B
2-1-2017 | A
3-1-2017 | A
3-1-2017 | A
3-1-2017 | C
In Redshift, I need to add a moving-average column to the above data, alongside the per-source count partitioned by date.
So far I have written the query below:
-- Payment count per source per day, keeping at most the 10 highest-count
-- sources of each created_time_id.
select
    txn_src,
    created_time_id::char(8)::date as "time",
    count_payment
from
    (
        select
            txn_src,
            created_time_id,
            count(1) as count_payment,
            -- position of this source within its day, highest count first
            row_number() over (
                partition by created_time_id
                order by count(1) desc
            ) as seqnum
        from
            my_table
        where
            created_time_id >= '1-1-2017'
            -- filter on the same column that is selected and grouped
            and txn_src is not null
        group by
            txn_src,
            created_time_id
    ) x
where
    seqnum <= 10
order by
    "time",
    count_payment desc
This gives me the correct output like
1-1-2017 | A | 3
1-1-2017 | B | 1
and so on
I need moving average like this
time |src|cnt|mvng_avg
1-1-2017 | A | 3 |3
1-1-2017 | B | 1 |1
1-1-2017 | C | 1 |1
2-1-2017 | A | 2 |2.5
and so on ..
Can anybody suggest some good solution for this.
After some struggle, I was able to resolve this using below query.
-- One row per (txn_source, created_time_id) with its payment count.
with txn_source_by_date as (
    select
        txn_source,
        created_time_id,
        count(1) as count_payment
    from
        my_table
    where
        created_time_id >= 20220801
        and txn_source is not null
    group by
        txn_source,
        created_time_id
)
-- The CTE is already unique per (txn_source, created_time_id), so no further
-- grouping is needed before applying the window.
select
    txn_source,
    created_time_id::char(8)::date as "time",
    count_payment,
    -- Average over the current row and up to 29 preceding rows of the same
    -- source. The frame counts rows, not calendar days.
    avg(count_payment) over (
        partition by txn_source
        order by created_time_id
        rows between 29 preceding and current row
    ) mvng_avg
from
    txn_source_by_date
order by
    "time",
    txn_source

Select Top 1 based on distinct columns

I need to select the top 1 record from each group defined by the columns UnitID and CompanyCode, ordered by the column CreatedDate.
Here's an example of my table
ID | UnitID | CompanyCode | CreatedDate |
----------------------------------------|
1 | A1 | G100 | 2020-03-12 |
2 | A1 | G100 | 2020-03-13 |
3 | A1 | G100 | 2020-03-14 |
4 | B2 | G100 | 2020-03-12 |
5 | B2 | F200 | 2020-03-13 |
6 | B2 | E300 | 2020-03-14 |
My expected results would be these rows
ID | UnitID | CompanyCode | CreatedDate |
----------------------------------------|
3 | A1 | G100 | 2020-03-14 |
4 | B2 | G100 | 2020-03-12 |
5 | B2 | F200 | 2020-03-13 |
6 | B2 | E300 | 2020-03-14 |
We look at UnitID first, then check CompanyCode: records with a different CompanyCode are each displayed, but where several records share the same UnitID and CompanyCode, only the top 1 ordered by CreatedDate is selected.
SIMPLE QUERY: SELECT ID, UnitID, CompanyCode, CreatedDate FROM Tbl_Unit ORDER BY CreatedDate
Anyone know how this can be achieved?
If I understand correctly, the next statement may help:
-- Latest row of every (UnitID, CompanyCode) group.
SELECT ID, UnitID, CompanyCode, CreatedDate
FROM (
    SELECT
        ID, UnitID, CompanyCode, CreatedDate,
        -- DESC so that Rn = 1 is the most recent CreatedDate; the expected
        -- output keeps ID 3 (2020-03-14) for A1/G100, not the earliest row.
        ROW_NUMBER() OVER (PARTITION BY UnitID, CompanyCode ORDER BY CreatedDate DESC) AS Rn
    FROM Tbl_Unit
) t
WHERE Rn = 1
I like using TOP 1 WITH TIES for handling this type of query on SQL Server:
-- The latest row of every (UnitID, CompanyCode) group sorts at row number 1;
-- WITH TIES returns all rows sharing that value.
select top (1) with ties
    u.*
from Tbl_Unit u
order by
    row_number() over (
        partition by u.UnitID, u.CompanyCode
        order by u.CreatedDate desc
    );
Using ROW_NUMBER() Function.
-- Number each (UnitId, CompanyCode) group's rows, newest first, and keep
-- only the first row of every group.
WITH Ranked AS (
    SELECT
        TAB.*,
        ROW_NUMBER() OVER (
            PARTITION BY UnitId, CompanyCode
            ORDER BY createddate DESC
        ) AS RNK
    FROM TAB
)
SELECT
    ID,
    UnitID,
    CompanyCode,
    CreatedDate
FROM Ranked
WHERE RNK = 1;
Demo
ROW_NUMBER()                             -- function that generates the row number
OVER (PARTITION BY UnitId , CompanyCode  -- partition range
      order by createddate desc          -- sorting order
)

applying window function to big data set (how to optimize?)

I have to do some data analysis on a table with 400+ million rows. I got this to work on a small sample but I'm sure it will run out of memory in production.
The table structure is like this (for millions of serial numbers):
+------------+---------------+------------+----------+
| date | serial_number | status_1 | status_2 |
+------------+---------------+------------+----------+
| 10/1/2018 | 123 | warehouse | v |
| 10/10/2018 | 123 | warehouse | w |
| 10/20/2018 | 123 | warehouse | x |
| 11/2/2018 | 123 | in transit | y |
+------------+---------------+------------+----------+
I need to get the dates where status_1 = 'in transit' currently and status_2 = 'x' on a previous date. That should look like this:
+-----------+---------------+------------+----------+------------+
| date_1 | serial_number | status_1 | status_2 | date_2 |
+-----------+---------------+------------+----------+------------+
| 11/2/2018 | 123 | in transit | x | 10/20/2018 |
+-----------+---------------+------------+----------+------------+
I got it using two rank functions, but this will probably choke on a big table.
-- Rank every serial number's rows once, newest first. Both CTEs below reuse
-- this ranking instead of each recomputing it. The ORDER BY inside the
-- original derived tables is dropped: it had no guaranteed effect on the result.
with ranked as (
    select
        date,
        serial_number,
        status_1,
        status_2,
        rank() over (partition by serial_number order by date desc) as rnk
    from sample_t
),
-- Serial numbers whose most recent row is 'in transit'.
transit as (
    select
        date,
        serial_number,
        status_1
    from ranked
    where rnk = 1 and status_1 = 'in transit'
),
-- Any earlier (non-latest) row of a serial number with status_2 = 'x'.
x_type as (
    select
        date,
        serial_number,
        status_2
    from ranked
    where rnk > 1 and status_2 = 'x'
)
select tr.date date_1,
    tr.serial_number,
    tr.status_1,
    x.status_2,
    x.date date_2
-- left join: in-transit serial numbers without an earlier 'x' row are kept,
-- with NULL status_2 / date_2
from transit tr left join x_type x on tr.serial_number = x.serial_number
I can't see how to do this with one rank function. Is there a better, more efficient way?
You can use lag to do this.
-- For every row, fetch status_2 and date from the immediately preceding row
-- (by date) of the same serial number, then keep the 'in transit' rows whose
-- previous row had status_2 = 'x'.
select *
from (select s.*
            -- column is serial_number in the table, not serial_no
            ,lag(status_2) over(partition by serial_number order by date) as prev_status_2
            ,lag(date) over(partition by serial_number order by date) as prev_date
      from sample_t s
     ) t
-- the stored value is 'in transit' (with a space), as in the sample data
where status_1 = 'in transit' and prev_status_2 = 'x'

Select top 1 Student Fee From List In SQL Server

In my SQL Server table, I have this data:
+------+-----+------------+
| Name | Fee | Date_Time |
+------+-----+------------+
| AA | 50 | 2018-03-27 |
| AA | 30 | 2018-04-10 |
| BB | 40 | 2018-01-10 |
| BB | 10 | 2018-04-10 |
| CC | 10 | 2018-04-10 |
| DD | 10 | 2018-04-10 |
+------+-----+------------+
How can I get data using SQL query like TOP 1 for (AA, BB, CC, DD) ORDER BY Date_Time DESC into a list?
+------+-----+------------+
| Name | Fee | Date_Time |
+------+-----+------------+
| AA | 30 | 2018-04-10 |
| BB | 10 | 2018-04-10 |
| CC | 10 | 2018-04-10 |
| DD | 10 | 2018-04-10 |
+------+-----+------------+
Use row_number() function to get the top most Fee
-- The latest Date_Time row of every Name sorts at row number 1; WITH TIES
-- returns all rows sharing that value, i.e. one row per Name.
-- [TABLE_NAME] is a placeholder for your table. The bare word TABLE is a
-- reserved keyword and cannot be used unquoted as a table name.
select top(1) with ties Name, Fee, Date_Time
from [TABLE_NAME] t
order by row_number() over (partition by Name order by Date_Time desc)
Another approach can be
-- Number each Name's rows newest-first and keep the first one.
select
    T.Name,
    T.Fee,
    T.Date_Time
from (
    select
        Name,
        Fee,
        Date_Time,
        row_number() over (partition by Name order by Date_Time desc) as RN
    from [TABLE_NAME]
) T
where T.RN = 1
If you have multiple entries on the same day for a particular name and you want all of them to appear, you can use DENSE_RANK() instead of ROW_NUMBER(), as follows.
-- Rank each Name's rows newest-first. Rows sharing the latest Date_Time all
-- receive rank 1, so all of them are returned.
select
    T.Name,
    T.Fee,
    T.Date_Time
from (
    select
        Name,
        Fee,
        Date_Time,
        dense_rank() over (partition by Name order by Date_Time desc) as RN
    from [TABLE_NAME]
) T
where T.RN = 1
DEMO
Assign a row_number partitioned by Name and ordered by Date_Time descending, then select the rows whose row_number is 1.
Query
-- Number each Name's rows newest-first inside the CTE, then keep row 1.
;WITH cte AS (
    SELECT
        ROW_NUMBER() OVER (PARTITION BY [Name] ORDER BY [Date_Time] DESC) AS [rn],
        *
    FROM [your_table_name]
)
SELECT
    [Name],
    [Fee],
    [Date_Time]
FROM cte
WHERE [rn] = 1;

How to select multiple max rows from a table date-wise in SQL Server

I need to select rows with max(batchid) for each day.
Sample table:
Id | BatchId |Date |KeyValue
-- | --------|---------------------|-----------
1 | 1 | 2016-12-13 12:30:66 |1234
2 | 1 | 2016-12-13 12:30:66 |5654
3 | 2 | 2016-12-13 08:30:66 |1234
4 | 2 | 2016-12-13 08:30:66 |5654
5 | 1 | 2016-12-12 12:10:45 |1234
6 | 1 | 2016-12-12 12:10:45 |5634
7 | 2 | 2016-12-12 08:10:45 |1234
8 | 2 | 2016-12-12 08:10:45 |5634
9 | 3 | 2016-12-12 04:10:45 |9628
Expected output:
Id | BatchId |Date |KeyValue
-- | --------|---------------------|-----------
3 | 2 | 2016-12-13 08:30:66 |1234
4 | 2 | 2016-12-13 08:30:66 |5654
9 | 3 | 2016-12-12 04:10:45 |9628
Thanks in advance !
Use max() over . . .:
-- Keep the rows carrying the highest BatchId of their calendar day.
-- The maximum is taken over BatchId (not over the timestamp), because the
-- requirement is max(batchid) for each day.
select t.*
from (select t.*, max(BatchId) over (partition by cast(date as date)) as maxBatchId
      from sample t
     ) t
where maxBatchId = BatchId;
Or, rank()/dense_rank():
-- Rank each calendar day's rows by BatchId, highest first, and keep rank 1.
-- Ordering by BatchId (not by the timestamp) matches the requirement of
-- max(batchid) for each day; ties on BatchId are all returned.
select t.*
from (select t.*,
             dense_rank() over (partition by cast(date as date) order by BatchId desc) as seqnum
      from sample t
     ) t
where seqnum = 1;
Correction and expansion of Gordon Linoff's:
rextester: http://rextester.com/GECAZ46515
-- Sample data: batches loaded on two calendar days.
CREATE TABLE BatchKeys (
    Id       int,
    BatchId  int,
    Date     datetime,
    KeyValue int
);

INSERT INTO BatchKeys (Id, BatchId, Date, KeyValue)
VALUES
    (1, 1, '2016-12-13T12:30:06', 1234),
    (2, 1, '2016-12-13T12:30:06', 5654),
    (3, 2, '2016-12-13T08:30:06', 1234),
    (4, 2, '2016-12-13T08:30:06', 5654),
    (5, 1, '2016-12-12T12:10:45', 1234),
    (6, 1, '2016-12-12T12:10:45', 5634),
    (7, 2, '2016-12-12T08:10:45', 1234),
    (8, 2, '2016-12-12T08:10:45', 5634),
    (9, 3, '2016-12-12T04:10:45', 9628);
-- Attach each calendar day's highest BatchId to every row of that day and
-- keep the rows whose own BatchId equals it.
SELECT per_day.*
FROM (
    SELECT
        bk.*,
        MAX(bk.BatchId) OVER (PARTITION BY CAST(bk.Date AS date)) AS maxBatchId
    FROM BatchKeys bk
) per_day
WHERE per_day.BatchId = per_day.maxBatchId;
-- Rank each calendar day's rows by BatchId, highest first; rows tied on the
-- top BatchId all get rank 1 and are all returned.
SELECT per_day.*
FROM (
    SELECT
        bk.*,
        DENSE_RANK() OVER (
            PARTITION BY CAST(bk.Date AS date)
            ORDER BY bk.BatchId DESC
        ) AS seqnum
    FROM BatchKeys bk
) per_day
WHERE per_day.seqnum = 1;
-- Rows holding the highest BatchId of their calendar day sort at dense rank 1;
-- WITH TIES returns every row at that rank.
SELECT TOP (1) WITH TIES
    bk.*
FROM BatchKeys bk
ORDER BY
    DENSE_RANK() OVER (
        PARTITION BY CAST(bk.Date AS date)
        ORDER BY bk.BatchId DESC
    )