using min and max in group by clause - sql

I want the output below in Oracle SQL.
I have data in a table as below:
id start_date end_date assignment_number
1 2.02.2014 15.02.2014 10
2 25.02.2014 30.02.2014 20
3 26.03.2014 04.05.2014 30
4 06.06.2014 31.12.4712 10
I need output using group by
assignment_number start_date end_date
10 02.02.2014 15.02.2014
10 06.06.2014 31.12.4712
20 25.02.2014 30.02.2014
30 26.03.2014 04.05.2014
I tried using min(start_date) and max(end_date); for assignment 10 I was getting the output as
assignment_number start_date end_date
10 02.02.2014 31.12.4712
But I want as :-
assignment_number start_date end_date
10 02.02.2014 15.02.2014
10 06.06.2014 31.12.4712
Please help

I think you'd have to pick the row with the MIN(start_date) and the row with the MAX(end_date) separately for each assignment, then union them. Try something like this:
SELECT
assignment_number
, start_date
, end_date
FROM
(SELECT
assignment_number
, start_date
, end_date
FROM your_table t
WHERE start_date = (SELECT MIN(start_date) FROM your_table WHERE assignment_number = t.assignment_number)
UNION
SELECT
assignment_number
, start_date
, end_date
FROM your_table t
WHERE end_date = (SELECT MAX(end_date) FROM your_table WHERE assignment_number = t.assignment_number)
)
ORDER BY
1 ASC
, 2 ASC
, 3 ASC
;

sql fiddle
select id, to_char(start_date,'dd.mm.yyyy') start_date, to_char(end_date,'dd.mm.yyyy') end_date,ASSIGNMENT_NUMBER from sof1 s
where not exists
(select 1 from sof1 s2
where s2.assignment_number=s.assignment_number
and s2.start_date<s.start_date
)
or not exists
(select 1 from sof1 s2
where s2.assignment_number=s.assignment_number
and s2.end_date>s.end_date
)
order by ASSIGNMENT_NUMBER
With analytic function:
sql fiddle
select id, to_char(start_date,'dd.mm.yyyy') start_date, to_char(end_date,'dd.mm.yyyy') end_date,ASSIGNMENT_NUMBER from
(select s.*
, min (start_date) over (partition by ASSIGNMENT_NUMBER) sd
, max (end_date) over (partition by ASSIGNMENT_NUMBER) ed
from sof1 s
)
where start_date=sd or end_date=ed
order by ASSIGNMENT_NUMBER, start_date
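For reference, the sample data behind the fiddle could be set up like this (a sketch; the table name sof1 and its columns are taken from the queries above, and the impossible 30.02.2014 end date is written as 28.02.2014 so the insert is valid):
create table sof1 (id number, start_date date, end_date date, assignment_number number);
insert into sof1 values (1, date '2014-02-02', date '2014-02-15', 10);
insert into sof1 values (2, date '2014-02-25', date '2014-02-28', 20); -- 30.02.2014 in the question is not a real date
insert into sof1 values (3, date '2014-03-26', date '2014-05-04', 30);
insert into sof1 values (4, date '2014-06-06', date '4712-12-31', 10);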


SQL Between date column field not null

I would like to count all unique customers that were active on 2019-01-01 with the condition that they also were active in the subsequent 3 days.
Main table
date customer_id time_spent_online_min
2019-01-01 1 5
2019-01-01 2 6
2019-01-01 3 4
2019-01-02 1 7
2019-01-02 2 5
2019-01-03 3 3
2019-01-04 1 4
2019-01-04 2 6
Output table
date total_active_customers
2019-01-01 2
This is what I have tried so far:
with cte as(
select customer_id
,date
,time_spent_online_min
from main_table
where date between date '2019-01-01' and date '2019-01-04'
and customer_id is not null)
select date
,count(distinct(customer_id)) as total_active_customers
from cte
where date = date '2019-01-01'
group by 1
If you have only one record per day, you can use lead():
select date, count(*)
from (select t.*, lead(date, 3) over (partition by customer_id order by date) as date_3
from main_table t
) t
where date = '2019-01-01' and
date_3 = '2019-01-04'
group by date;
If you can have more than one record per day, then aggregate and then use lead():
select date, count(*)
from (select t.*, lead(date, 3) over (partition by customer_id order by date) as date_3
from (select customer_id, date, sum(time_spent_online_min) as time_spent_online_min
from main_table t
group by customer_id, date
) t
) t
where date = '2019-01-01' and
date_3 = '2019-01-04'
group by date;
You can also easily expand this to any dates:
select date, count(*)
from (select t.*, lead(date, 3) over (partition by customer_id order by date) as date_3
from main_table t
) t
where date_3 = date + interval '3' day
group by date;
I would use exists logic here:
SELECT COUNT(*)
FROM main_table t1
WHERE
date = '2019-01-01' AND
EXISTS (SELECT 1 FROM main_table t2
WHERE t2.customer_id = t1.customer_id AND t2.date = '2019-01-02') AND
EXISTS (SELECT 1 FROM main_table t2
WHERE t2.customer_id = t1.customer_id AND t2.date = '2019-01-03') AND
EXISTS (SELECT 1 FROM main_table t2
WHERE t2.customer_id = t1.customer_id AND t2.date = '2019-01-04');
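A set-based alternative for the same "active on every day from 2019-01-01 through 2019-01-04" check, which also tolerates duplicate rows per customer and day, might look like this (a sketch, reusing the question's table and column names):
SELECT COUNT(*) AS total_active_customers
FROM (SELECT customer_id
      FROM main_table
      WHERE date BETWEEN DATE '2019-01-01' AND DATE '2019-01-04'
      GROUP BY customer_id
      HAVING COUNT(DISTINCT date) = 4   -- present on all four days
     ) t;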
This answer assumes that a given customer would only have one record for one date of activity.
WITH
-- your input
input(dt,customer_id,time_spent_online_min) AS (
SELECT DATE '2019-01-01',1,5
UNION ALL SELECT DATE '2019-01-01',2,6
UNION ALL SELECT DATE '2019-01-01',3,4
UNION ALL SELECT DATE '2019-01-02',1,7
UNION ALL SELECT DATE '2019-01-02',2,5
UNION ALL SELECT DATE '2019-01-03',3,3
UNION ALL SELECT DATE '2019-01-04',1,4
UNION ALL SELECT DATE '2019-01-04',2,6
)
,
-- count the active days in this row and the following 3 days
count_activity AS (
SELECT
*
, COUNT(customer_id) OVER(
PARTITION BY customer_id ORDER BY dt
RANGE BETWEEN CURRENT ROW AND INTERVAL '3 DAY' FOLLOWING
) AS act_count
FROM input
)
SELECT
dt
, COUNT(*) AS total_active_customers
FROM count_activity
WHERE dt='2019-01-01'
AND act_count > 2
GROUP BY dt
;
-- out dt | total_active_customers
-- out ------------+------------------------
-- out 2019-01-01 | 2

Skip specific rows using LAG in sql

I have a table that looks like this:
Using the LAG function in SQL, I would like to perform the LAG only on rows where start_date=end_date, getting the previous start_date from another row where start_date=end_date.
So that my end table will have an extra column like this:
I hope my question is clear, any help is appreciated.
You can assign a group to these values and use that:
select t.*,
(case when start_date = end_date
then lag(start_date) over (partition by (case when start_date = end_date then 1 else 0 end) order by start_date)
end) as prev_eq_start_date
from t;
Or:
select t.*,
(case when start_date = end_date
then lag(start_date) over (partition by start_date = end_date order by start_date)
end) as prev_eq_start_date
from t;
Note: if your data is big and most rows have different dates, then you might have a resources issue. In this case, an additional, unused partition by key can help:
select t.*,
(case when start_date = end_date
then lag(start_date) over (partition by (case when start_date = end_date then 1 else 2 end), (case when start_date <> end_date then start_date end) order by start_date)
end) as prev_eq_start_date
from t;
This has no impact on the result but it can avoid a resources error caused by too many rows with different values.
Below is for BigQuery Standard SQL
#standardSQL
SELECT *, NULL AS lag_result
FROM `project.dataset.table` WHERE start_date != end_date
UNION ALL
SELECT *, LAG(start_date) OVER(ORDER BY start_date)
FROM `project.dataset.table` WHERE start_date = end_date
Applied to the sample data in your question, the result is
Row user_id start_date end_date lag_result
1 1 2019-01-01 2019-02-28 null
2 3 2019-02-27 2019-02-28 null
3 4 2019-08-04 2019-09-01 null
4 2 2019-02-01 2019-02-01 null
5 5 2019-08-07 2019-08-07 2019-02-01
6 6 2019-08-27 2019-08-27 2019-08-07
By the way, in case your start_date and end_date are of STRING data type ('27/02/2019') rather than DATE type ('2019-02-27', as assumed in the above query), you should use the one below
#standardSQL
SELECT *, NULL AS lag_result
FROM `project.dataset.table` WHERE start_date != end_date
UNION ALL
SELECT *, LAG(start_date) OVER(ORDER BY PARSE_DATE('%d/%m/%Y', start_date))
FROM `project.dataset.table` WHERE start_date = end_date
with result
Row user_id start_date end_date lag_result
1 1 01/01/2019 28/02/2019 null
2 3 27/02/2019 28/02/2019 null
3 4 04/08/2019 01/09/2019 null
4 2 01/02/2019 01/02/2019 null
5 5 07/08/2019 07/08/2019 01/02/2019
6 6 27/08/2019 27/08/2019 07/08/2019
Use JOIN
SQL FIDDLE
SELECT T.*,T1.LAG_Result
FROM your_table T LEFT JOIN
(
SELECT User_Id,LAG(start_date) OVER(ORDER BY start_date) LAG_Result
FROM your_table S
WHERE start_date = end_date
) T1 ON T.User_Id = T1.User_Id
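For a quick check, the rows from the result table above can be stubbed into a CTE and fed to the same LEFT JOIN (a sketch in BigQuery standard SQL; the real table and column names may differ):
WITH t AS (
  SELECT 1 AS user_id, DATE '2019-01-01' AS start_date, DATE '2019-02-28' AS end_date UNION ALL
  SELECT 3, DATE '2019-02-27', DATE '2019-02-28' UNION ALL
  SELECT 4, DATE '2019-08-04', DATE '2019-09-01' UNION ALL
  SELECT 2, DATE '2019-02-01', DATE '2019-02-01' UNION ALL
  SELECT 5, DATE '2019-08-07', DATE '2019-08-07' UNION ALL
  SELECT 6, DATE '2019-08-27', DATE '2019-08-27'
)
SELECT t.*, t1.lag_result
FROM t LEFT JOIN
(
  SELECT user_id, LAG(start_date) OVER(ORDER BY start_date) AS lag_result
  FROM t
  WHERE start_date = end_date
) t1 ON t.user_id = t1.user_id
ORDER BY t.user_id;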

Insert the table data based on grouping of two columns

I have an Oracle table with the following format.
For eg:
JLID Dcode SID TDT QTY
8295783 3119255 9842 3/5/2018 14
8269771 3119255 9842 3/6/2018 11
8302211 3119255 1126 3/1/2018 19
Here I have different SIDs for the same Dcode, and I need to get the SID with the maximum total Qty, i.e. for SID 9842 it is (14+11)=25 and for SID 1126 it is 19, so the result should be based on SID 9842. So our query should return the following results:
JLID Dcode START_DT END_DT SID
111 3119255 3/1/2018 3/31/2018 12:00 9842
START_DT and END_DT should be calculated from TDT, i.e. the start date is the first day of the month and the end date is the last day of the month.
Can anyone please suggest me some ideas to do it.
It might be as simple as this:
SELECT Dcode, start_date, end_date, SID FROM (
SELECT Dcode, SID, TRUNC(start_date, 'MONTH') AS start_date
, LAST_DAY(end_date) AS end_date
, ROW_NUMBER() OVER ( PARTITION BY Dcode ORDER BY total_qty DESC ) AS rn
FROM (
SELECT Dcode, SID, MIN(TDT) AS start_date, MAX(TDT) AS end_date
, SUM(QTY) AS total_qty
FROM mytable
GROUP BY Dcode, SID
)
) WHERE rn = 1
In the innermost subquery I aggregate to get the range of dates and the total quantity for particular values of Dcode and SID. Then I use an analytic (window) function to get the row for which the total quantity is the greatest. (You would want to use RANK() in place of ROW_NUMBER() in the event you want to return more than one value of SID with the same quantity.)
Here's one option which doesn't contain JLID = 111 in the final result as I have no idea where you took it from.
SQL> with test (jlid, dcode, sid, tdt, qty) as
2 (select 8295783, 3119255, 9842, date '2018-03-05', 14 from dual union
3 select 8269771, 3119255, 9842, date '2018-08-22', 11 from dual union
4 select 8302211, 3119255, 1126, date '2018-03-01', 19 from dual union
5 --
6 select 1234567, 1112223, 1000, date '2018-06-16', 88 from dual
7 )
8 select dcode,
9 min (trunc (tdt, 'mm')) start_dt, --> MIN
10 max (last_day (tdt)) end_dt, --> MAX
11 sid
12 from (select dcode,
13 sid,
14 tdt,
15 sqty,
16 rank () over (partition by dcode order by sqty desc) rnk
17 from (select dcode,
18 sid,
19 tdt,
20 sum (qty) over (partition by dcode, sid) sqty
21 from test))
22 where rnk = 1
23 group by dcode, sid; --> GROUP BY
DCODE START_DT END_DT SID
---------- ---------------- ---------------- ----------
1112223 01.06.2018 00:00 30.06.2018 00:00 1000
3119255 01.03.2018 00:00 31.08.2018 00:00 9842
SQL>

Oracle SQL overlap between begin date and end date in 2 or more records

Database my_table:
id seq start_date end_date
1 1 01-01-2017 02-01-2017
1 2 07-01-2017 09-01-2017
1 3 11-01-2017 11-01-2017
2 1 20-01-2017 20-01-2017
3 1 01-02-2017 02-02-2017
3 2 03-02-2017 04-02-2017
3 3 08-02-2017 09-02-2017
3 4 09-02-2017 10-02-2017
3 5 10-02-2017 12-02-2017
My requirement is to get the first date (normally the seq 1 start date), the last date (normally the last seq's end date), and the number of distinct dates covered by all seqs for each unique id.
Dates occurred:
id 1          id 2          id 3
01-01-2017    20-01-2017    01-02-2017
02-01-2017                  02-02-2017
07-01-2017                  03-02-2017
08-01-2017                  04-02-2017
09-01-2017                  08-02-2017
11-01-2017                  09-02-2017
                            10-02-2017
                            11-02-2017
                            12-02-2017
total: 6      total: 1      total: 9
Here is the result I want:
id start_date end_date num_date
1 01-01-2017 11-01-2017 6
2 20-01-2017 20-01-2017 1
3 01-02-2017 12-02-2017 9
I have tried
SELECT id
, MIN(start_date)
, MAX(end_date)
, SUM(end_date - start_date + 1)
FROM my_table
GROUP BY id
and this SQL statement works fine for id 1 and 2, since there are no overlapping dates between the begin and end dates. But for id 3 the result num_date is 11 (2+2+2+2+3, which double-counts the overlapping days 09-02-2017 and 10-02-2017) instead of 9. Could you please suggest a SQL statement to solve this problem? Thank you.
One more question: the dates in the database are in datetime format. How do I convert them to dates? I tried to use the TRUNC function, but it sometimes converts the date to the previous day instead.
You need to count how many times an end_date equals the following start_date. For this you need to use the lag() or the lead() analytic function. You can use a case expression for the comparison, but alas you can't wrap the case expression within a COUNT or SUM in the same query; you need a subquery and an outer query.
Something like this; not tested, since you didn't provide CREATE TABLE and INSERT statements to recreate your sample data.
select id, min(start_date) as start_date, max(end_date) as end_date,
sum(end_date - start_date + 1 - flag) as num_days
from ( select id, start_date, end_date,
case when start_date = lag(end_date)
over (partition by id order by end_date) then 1
else 0 end as flag
from my_table
)
group by id;
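For reference, CREATE TABLE and INSERT statements matching the sample data would look something like this (a sketch; the dd-mm-yyyy values in the question are read as day-month-year):
create table my_table (id number, seq number, start_date date, end_date date);
insert into my_table values (1, 1, date '2017-01-01', date '2017-01-02');
insert into my_table values (1, 2, date '2017-01-07', date '2017-01-09');
insert into my_table values (1, 3, date '2017-01-11', date '2017-01-11');
insert into my_table values (2, 1, date '2017-01-20', date '2017-01-20');
insert into my_table values (3, 1, date '2017-02-01', date '2017-02-02');
insert into my_table values (3, 2, date '2017-02-03', date '2017-02-04');
insert into my_table values (3, 3, date '2017-02-08', date '2017-02-09');
insert into my_table values (3, 4, date '2017-02-09', date '2017-02-10');
insert into my_table values (3, 5, date '2017-02-10', date '2017-02-12');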
SELECT id,
MIN( start_date ) AS start_date,
MAX( end_date ) AS end_date,
SUM( end_date - start_date + 1 ) AS num_days
FROM (
SELECT id,
GREATEST(
start_date,
COALESCE(
LAG( end_date ) OVER ( PARTITION BY id ORDER BY seq ) + 1,
start_date
)
) AS start_date,
end_date
FROM your_table
)
WHERE start_date <= end_date
GROUP BY id;
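As for the side question about the time portion: TRUNC(date_col) only zeroes the time of day and never moves a value to the previous day by itself; when it appears to return "yesterday", that is usually because a time-zone conversion happened before the TRUNC (for example, data stored in UTC or a TIMESTAMP WITH TIME ZONE column). A minimal sketch, assuming a column named start_date and an example zone:
-- plain DATE / TIMESTAMP: TRUNC just strips the time portion
SELECT TRUNC(start_date) AS start_day FROM my_table;
-- TIMESTAMP WITH TIME ZONE: convert to the zone you report in first,
-- otherwise the value can shift across midnight before it is truncated
SELECT TRUNC(CAST(start_date AT TIME ZONE 'Asia/Bangkok' AS DATE)) AS start_day FROM my_table;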

Number of unique dates

There is a table:
CREATE TABLE my_table
(gr_id NUMBER,
start_date DATE,
end_date DATE);
All dates always have a zero time portion. I need to know the fastest way to compute the number of unique dates within each gr_id.
For example, if there are rows (dd.mm.rrrr):
1 | 01.01.2000 | 07.01.2000
1 | 01.01.2000 | 07.01.2000
2 | 01.01.2000 | 03.01.2000
2 | 05.01.2000 | 07.01.2000
3 | 01.01.2000 | 04.01.2000
3 | 03.01.2000 | 05.01.2000
then the right answer will be (for gr_id 3, the ranges 01.01.2000-04.01.2000 and 03.01.2000-05.01.2000 overlap, so together they cover the 5 distinct dates 01.01.2000 through 05.01.2000):
1 | 7
2 | 6
3 | 5
Right now I use an additional table
CREATE TABLE mfr_date_list
(MFR_DATE DATE);
with every date between 01.01.2000 and 31.12.2020, and a query like this:
SELECT COUNT(DISTINCT mfr_date_list.mfr_date) cnt,
dt.gr_id
FROM dwh_mfr.mfr_date_list,
(SELECT gr_id,
start_date AS sd,
end_date AS ed
FROM my_table
) dt
WHERE mfr_date_list.mfr_date BETWEEN dt.sd AND dt.ed
AND dt.ed IS NOT NULL
GROUP BY dt.gr_id
This query returns the correct result set, but I think it's not the fastest way. I think there is some way to build the query without the mfr_date_list table at all.
Oracle 11.2 64-bit.
I would expect what you're doing to be the fastest way (as always, test). Your query can be simplified, though this only aids understanding and not necessarily speed:
select t.gr_id, count(distinct dl.mfr_date) as cnt
from my_table t
join mfr_date_list dl
on dl.mfr_date between t.start_date and t.end_date
where t.end_date is not null
group by t.gr_id
Whatever you do, you have to generate the dates between the two dates somehow, as you need to remove the overlap. One way would be to use CAST(MULTISET()), as Lalit Kumar explains:
select gr_id, count(distinct end_date - column_value + 1)
from my_table m
cross join table(cast(multiset(select level
from dual
connect by level <= m.end_date - m.start_date + 1
) as sys.odcinumberlist))
group by gr_id;
GR_ID COUNT(DISTINCTEND_DATE-COLUMN_VALUE+1)
---------- --------------------------------------
1 7
2 6
3 5
This is very Oracle-specific, but it should perform substantially better than most other row-generators, as you're only accessing the table once and you're generating the minimal number of rows required, due to the condition linking MY_TABLE and your generated rows.
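For contrast, the generic row-generator approach compared against above would build the whole 2000-2020 calendar once and join it back, essentially your original query with mfr_date_list generated on the fly (a sketch using CONNECT BY):
select t.gr_id, count(distinct d.cal_date) as cnt
from my_table t
join (select date '2000-01-01' + level - 1 as cal_date
      from dual
      connect by level <= date '2020-12-31' - date '2000-01-01' + 1) d
  on d.cal_date between t.start_date and t.end_date
where t.end_date is not null
group by t.gr_id;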
What you really need to do is combine the ranges and then count the lengths. This can be quite challenging because of duplicate dates. The following is one way to approach this.
First, enumerate the dates and determine whether the date is "in" or "out". When the cumulative sum is 0 then it is "out":
select t.gr_id, dt,
sum(inc) over (partition by t.gr_id order by dt) as cume_inc
from (select t.gr_id, t.start_date as dt, 1 as inc
from my_table t
union all
select t.gr_id, t.end_date + 1, -1 as inc
from my_table t
) t
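To see what this produces, here is a quick check for gr_id 3 from the sample data, with the two rows inlined (a sketch):
with sample_rows as (
  select 3 as gr_id, date '2000-01-01' as start_date, date '2000-01-04' as end_date from dual
  union all
  select 3, date '2000-01-03', date '2000-01-05' from dual
)
select t.gr_id, dt,
       sum(inc) over (partition by t.gr_id order by dt) as cume_inc
from (select gr_id, start_date as dt, 1 as inc from sample_rows
      union all
      select gr_id, end_date + 1, -1 from sample_rows
     ) t
order by dt;
-- cume_inc is 1 on 01.01, 2 on 03.01 (the overlap), 1 on 05.01 and 0 on 06.01 ("out")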
Then, use lead() to determine how long the period is:
with inc as (
select t.gr_id, dt,
sum(inc) over (partition by t.gr_id order by dt) as cume_inc
from (select t.gr_id, t.start_date as dt, 1 as inc
from my_table t
union all
select t.gr_id, t.end_date + 1, -1 as inc
from my_table t
) t
)
select t.gr_id,
sum(nextdt - dt) as daysInUse
from (select inc.*, lead(dt) over (partition by inc.gr_id order by dt) as nextdt
from inc
) t
group by t.gr_id;
This is close to what you want. The following are two challenges: (1) putting in the limits and (2) handling ties. The following should work (although there might be off-by-one and boundary issues):
with inc as (
select t.gr_id, dt, priority,
sum(inc) over (partition by t.gr_id order by dt) as cume_inc
from ((select t.gr_id, t.start_date as dt, count(*) as inc, 1 as priority
from my_table t
group by t.gr_id, t.start_date
)
union all
(select t.gr_id, t.end_date + 1, - count(*) as inc, -1
from my_table t
group by t.gr_id, t.end_date
)
) t
)
select t.gr_id,
sum(least(nextdt, date '2020-12-31') - greatest(dt, date '2000-01-01')) as daysInUse
from (select inc.*, lead(dt) over (partition by inc.gr_id order by dt, priority) as nextdt
from inc
) t
where cume_inc > 0
group by t.gr_id;