Partitioning by specific value and then by range in Oracle - sql

I have a table with the following columns:
CREATE TABLE CUST_HISTORY (
ID NUMBER,
PRD_CNT NUMBER,
DATE_TO DATE
)
Now, I would like to apply the following partitioning strategy:
all values where DATE_TO = '9999-12-31' should be assigned to one partition called "p_max"
all remaining values of DATE_TO should be partitioned by monthly intervals (from DATE_TO)
Any hints?

From this answer:
CREATE TABLE CUST_HISTORY (
ID NUMBER,
PRD_CNT NUMBER,
DATE_TO DATE
)
PARTITION BY RANGE (date_to)
INTERVAL (INTERVAL '1' MONTH)
(PARTITION p_first VALUES LESS THAN ( DATE '2019-01-01' ) );
db<>fiddle
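With plain interval partitioning, the DATE '9999-12-31' rows simply land in an automatically created, system-named partition for that month. A quick way to check which partitions Oracle has created (a sketch only; the inserted rows are made-up sample data):
-- made-up sample rows
INSERT INTO CUST_HISTORY VALUES (1, 10, DATE '2021-03-15');
INSERT INTO CUST_HISTORY VALUES (2, 20, DATE '9999-12-31');
-- list the partitions, including the system-named interval partitions
SELECT partition_name, high_value
FROM user_tab_partitions
WHERE table_name = 'CUST_HISTORY'
ORDER BY partition_position;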
If you particularly want the partition to be named p_max, then you can use a virtual column to remap the DATE_TO value from a high value to a low value, so that you can name the partition and still use range intervals:
CREATE TABLE CUST_HISTORY (
ID NUMBER,
PRD_CNT NUMBER,
DATE_TO DATE CHECK ( DATE_TO >= DATE '1900-01-01' ),
remapped_date_to DATE
GENERATED ALWAYS AS
( CASE WHEN date_to = DATE '9999-12-31' THEN DATE '0001-01-01' ELSE date_to END )
VIRTUAL
)
PARTITION BY RANGE (remapped_date_to)
INTERVAL (INTERVAL '1' MONTH)
(PARTITION p_max VALUES LESS THAN ( DATE '1900-01-01' ) );
db<>fiddle
or use AUTOMATIC LIST partitioning (Oracle 12c or later) with a virtual column:
CREATE TABLE CUST_HISTORY (
ID NUMBER,
PRD_CNT NUMBER,
DATE_TO DATE,
month_to DATE
-- INVISIBLE
GENERATED ALWAYS AS
( CASE WHEN date_to = DATE '9999-12-31' THEN date_to ELSE TRUNC( date_to, 'MM' ) END )
VIRTUAL
)
PARTITION BY LIST ( month_to ) AUTOMATIC
( PARTITION p_max VALUES ( DATE '9999-12-31' ) );
db<>fiddle
(If you want, you can also make the virtual column INVISIBLE.)
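For example, a sketch of the same AUTOMATIC LIST table with the INVISIBLE keyword in place of the commented-out hint above:
CREATE TABLE CUST_HISTORY (
ID NUMBER,
PRD_CNT NUMBER,
DATE_TO DATE,
month_to DATE INVISIBLE
GENERATED ALWAYS AS
( CASE WHEN date_to = DATE '9999-12-31' THEN date_to ELSE TRUNC( date_to, 'MM' ) END )
VIRTUAL
)
PARTITION BY LIST ( month_to ) AUTOMATIC
( PARTITION p_max VALUES ( DATE '9999-12-31' ) );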

The solution with monthly partitioning (note the use of the LAST_DAY function):
CREATE TABLE CUST_HISTORY (
ID NUMBER,
PRD_CNT NUMBER,
DATE_TO DATE,
month_to DATE
GENERATED ALWAYS AS
( CASE WHEN date_to = DATE '9999-12-31' THEN date_to ELSE TRUNC( last_day(date_to) ) END )
VIRTUAL
)
PARTITION BY LIST ( month_to ) AUTOMATIC
( PARTITION p_max VALUES ( DATE '9999-12-31' ) );
DB<>FIDDLE
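A hypothetical quick check (not part of the original answer): with AUTOMATIC list partitioning each distinct month_to value gets its own system-generated partition, while the open-ended rows stay in p_max.
-- made-up sample rows (column list needed because month_to is virtual)
INSERT INTO CUST_HISTORY (ID, PRD_CNT, DATE_TO) VALUES (1, 10, DATE '2021-03-15');
INSERT INTO CUST_HISTORY (ID, PRD_CNT, DATE_TO) VALUES (2, 20, DATE '9999-12-31');
-- read only the open-ended rows
SELECT * FROM CUST_HISTORY PARTITION (p_max);
-- see which partitions were created automatically
SELECT partition_name, high_value
FROM user_tab_partitions
WHERE table_name = 'CUST_HISTORY';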
Thank you for the inspiration, @mt0!

Snowflake SQL: How do I insert and build an array: all dates between a specific start and a specific end date

Given two dates per customer, activation_date and termination_date.
Imagine there are 20 days between the activation_date and the termination_date.
How do I get, per customer, an array which starts with the activation_date and continues day by day to the termination_date?
I tried using:
WITH test AS (
SELECT
customer_id,
first_day,
last_day,
ARRAY_AGG(TO_DATE(MY_DATE)) WITHIN GROUP (ORDER BY MY_DATE asc)
FROM (
SELECT
customer_id,
activation_day as first_day,
termination_date_clean_formatted as last_day,
TO_DATE(dateadd(day, SEQ4(), first_day)) AS MY_DATE
FROM v
,TABLE(GENERATOR(ROWCOUNT=>(20000)))
WHERE MY_DATE <= last_day
)
group by customer_id, first_day, last_day
)
select *
from test
But unfortunately it doesn't work at all: the results are completely random per customer_id, MY_DATE doesn't even start at the same date as first_day, and I get at most 8 results per customer_id, which is impossible.
The result I'm expecting is:
customer_id  array
===========  ===================================
546464654    [ "2022-01-02", "2022-01-03", ... ]
116541165    [ "2022-05-06", "2022-05-07", ... ]
Thanks for helping :)
You can generate an array with all dates between two dates with a JavaScript UDF:
create or replace function gen_dates_array(SINCE date, UNTIL date)
returns variant
language javascript
as $$
dates = [];
currentDate = SINCE;
while (currentDate < UNTIL) {
dates.push(new Date(currentDate));
currentDate.setDate(currentDate.getDate() + 1);
}
dates.push(UNTIL);
return dates.map(x => x.toISOString().split('T')[0]);
$$;
For example:
select gen_dates_array('2020-01-10'::date, '2020-01-13'::date)
-- [ "2020-01-10", "2020-01-11", "2020-01-12", "2020-01-13" ]
An alternative SQL-only approach using ARRAY_AGG(), DATEDIFF(), FLATTEN() and REPEAT():
WITH SOURCE as (
SELECT
DATEADD(day, uniform(1, 365, random(12)), '2020-01-01')::DATE AS start_date
, DATEADD(day, uniform(5,20, random(120)), start_date)::DATE end_date
, DATEDIFF(DAY,START_DATE,END_DATE) CUSTOMER_LENGTH_DAYS
, uniform(1, 10, random(12)) customer_id
FROM TABLE(GENERATOR(rowcount => 10)) v )
SELECT CUSTOMER_ID
,START_DATE
,END_DATE
,ARRAY_AGG( DATEADD(day,INDEX,START_DATE)::DATE) VOILA
FROM SOURCE
,LATERAL FLATTEN(INPUT=>STRTOK_TO_ARRAY(REPEAT('1~',CUSTOMER_LENGTH_DAYS),'~'))
GROUP BY 1,2,3
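Roughly the same pattern applied to your own table is sketched below (v, customer_id, activation_date and termination_date come from the question, so adjust the names; the +1 makes the termination date itself part of the array):
-- sketch only
SELECT CUSTOMER_ID
     , ACTIVATION_DATE
     , TERMINATION_DATE
     , ARRAY_AGG(DATEADD(day, INDEX, ACTIVATION_DATE)::DATE)
         WITHIN GROUP (ORDER BY INDEX) AS ALL_DAYS
FROM V
   , LATERAL FLATTEN(INPUT => STRTOK_TO_ARRAY(
         REPEAT('1~', DATEDIFF(day, ACTIVATION_DATE, TERMINATION_DATE) + 1), '~'))
GROUP BY 1, 2, 3;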

Generate Rows Between Two Dates, Copying Down The Values in the Remaining Columns

I'm trying to write a script that will look at the issue date and termination date for each policy in a table. I want to be able to take those two dates, create a row for each year in between those two dates, and then fill in the values in the remaining columns.
I've been working with a recursive CTE approach in Redshift and I've got to the point where I can create the annual records. The part I'm stuck on is how to include the other columns in the table and fill each of the created rows with the same information as the row above.
For example, if I start with a record that looks something like
policy_number  issue_date  termination_date  issue_state  product  plan_code
=============  ==========  ================  ===========  =======  =========
001            1985-05-26  2005-03-02        CT           ROP      123456
I want to build a table that would look like this
policy_number  issue_date  termination_date  issue_state  product  plan_code  start_date
=============  ==========  ================  ===========  =======  =========  ==========
001            1985-05-26  2005-03-02        CT           ROP      123456     1985-05-26
001            1985-05-26  2005-03-02        CT           ROP      123456     1986-05-26
001            1985-05-26  2005-03-02        CT           ROP      123456     1987-05-26
...            ...         ...               ...          ...      ...        ...
001            1985-05-26  2005-03-02        CT           ROP      123456     2004-05-26
001            1985-05-26  2005-03-02        CT           ROP      123456     2005-03-02
Here's the code I've got so far:
WITH RECURSIVE start_dt AS
(
SELECT MIN(issue_date) AS s_dt -- step 1: grab start date
FROM myTable
WHERE policy_number = '001'
GROUP BY policy_number
),
end_dt AS
(
SELECT MAX(termination_date) AS e_dt -- step 2: grab the termination date
FROM myTable
WHERE policy_number = '001'
GROUP BY policy_number
),
dates (dt) AS
(
-- start at the start date
SELECT s_dt dt -- selecting start date from step 1
FROM start_dt
UNION ALL
-- recursive lines
SELECT dateadd(YEAR,1,dt)::DATE dt -- converted to date to avoid type mismatch -- adding annual records until the termination date
FROM dates
WHERE dt <= (SELECT e_dt FROM end_dt)
-- stop at the end date
)
SELECT *
FROM dates
which yields
dt
1985-05-26
1986-05-26
1987-05-26
...
How can I include the rest of the columns in my table? I'm also open to using a cross join if that would be a better approach. I'm expecting this to generate around 10,000,000 rows, so any optimization would be much appreciated.
If I understand correctly you have a table with begin/end dates and you have a process for generating all the needed dates to span the min / max of these. You want to apply this list of dates to the starting table to get all rows replicated between begin and end.
You have a good start - the list of dates. The usual process is to join the dates with the table using inequality conditions. (ON dt >= begin and dt <= end)
You will need to deal with some edge conditions around the unique dates for each input row. If you need to maintain these unique dates you will need to fudge the join condition. All doable.
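A rough sketch of that inequality join, reusing the dates CTE you already have and myTable from your question (it glosses over the anniversary-alignment point above, so treat it as a starting point rather than a finished query):
-- replace "SELECT * FROM dates" at the end of your WITH RECURSIVE block with:
SELECT p.policy_number, p.issue_date, p.termination_date,
       p.issue_state, p.product, p.plan_code,
       d.dt AS start_date
FROM myTable p
JOIN dates d
  ON d.dt >= p.issue_date
 AND d.dt <= p.termination_date
ORDER BY p.policy_number, d.dt;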
==============================================================
Back from biz trip and can give more concrete guidance.
There are two ways to do this. The first is the CTE approach you are driving down, but this will pass all the data through each loop of the CTE, which could be slow. It would look like this (including data setup):
create table mytable (
policy_number varchar(8),
issue_date timestamp,
termination_date timestamp,
issue_state varchar(4),
product varchar(16),
plan_code int);
insert into mytable values
('001', '1985-05-26', '2005-03-02', 'CT', 'ROP', 123456),
('002', '1988-07-25', '2005-08-07', 'CT', 'ROP', 654321)
;
with recursive pdata(policy_number, issue_date, termination_date,
issue_state, product, plan_code, start_date,
yr) as (
select policy_number, issue_date, termination_date, issue_state,
product, plan_code, issue_date as start_date, 0 as yr
from mytable
union all
select policy_number, issue_date, termination_date, issue_state,
product, plan_code,
issue_date + (yr + 1) * (interval '1 year') as start_date, -- (yr + 1) so the anchor row's issue_date is not repeated
yr + 1 as yr
from pdata
where start_date < termination_date
)
select policy_number, issue_date, termination_date,
issue_state, product, plan_code,
case when start_date > termination_date
then termination_date
else start_date
end as start_date
from pdata
order by start_date, policy_number;
The other way to do this is to generate the length of years in the recursive CTE but apply the data expansion in a loop join. This has the benefit of not carrying all the data through the recursive calls but has the expense of the loop join. It should be faster with large amounts of data but you can decide which is right for you.
Since each input row has its own date I left things in year intervals as this is cleaner. This looks like:
create table mytable (
policy_number varchar(8),
issue_date timestamp,
termination_date timestamp,
issue_state varchar(4),
product varchar(16),
plan_code int);
insert into mytable values
('001', '1985-05-26', '2005-03-02', 'CT', 'ROP', 123456),
('002', '1988-07-25', '2005-08-07', 'CT', 'ROP', 654321)
;
with recursive nums(yr, maxnum) as (
select 0::int as yr,
date_part('year', max(termination_date)) -
date_part('year', min(issue_date)) as maxnum
from mytable
union all
select yr + 1 as yr, maxnum
from nums
where yr <= maxnum
)
select policy_number, issue_date, termination_date,
issue_state, product, plan_code,
case when issue_date + yr * interval '1 year' > termination_date
then termination_date
else issue_date + yr * interval '1 year'
end as start_date
from mytable p
left join nums n
on termination_date + interval '1 year'
> issue_date + yr * interval '1 year'
order by start_date, policy_number;

Big Query - User Defined Function - Scalar Subquery Error

I am trying to re-create a user-defined function based on the query below; however, when I run it, it returns this error instead. I'd appreciate it if anyone here knows a workaround.
Scalar subquery produced more than one element:
create or replace function `dataset.list_of_days`
(user_id string, start_date date, end_date date) AS
((
with temp as (
select user_id, day from
unnest(generate_date_array(start_date, end_date)) day)
select as struct row_number() over (partition by user_id order by day asc) as row_num,
user_id, day
from temp
));
with temp as (
select '100110' as user_id, date('2020-01-31') as start_date,
date('2020-02-28') as end_date )
select dataset.list_of_days(user_id, start_date, end_date)
from temp;
A BigQuery UDF can only return a single (scalar) value. Since it seems you don't have too huge an output from inside the UDF, you may consider rewriting it as:
create or replace function `dataset.list_of_days`
(user_id string, start_date date, end_date date) AS
(ARRAY( -- NOTE: ARRAY() was added to your original query
with temp as (
select user_id, day from
unnest(generate_date_array(start_date, end_date)) day)
select as struct row_number() over (partition by user_id order by day asc) as row_num,
user_id, day
from temp
));
with temp as (
select '100110' as user_id, date('2020-01-31') as start_date,
date('2020-02-28') as end_date )
select dataset.list_of_days(user_id, start_date, end_date)
from temp;
Note that the return type of the UDF is now ARRAY<STRUCT<INT64, STRING, DATE>>; you may unnest the array if you need it as a table with multiple rows.
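For example, a sketch of unnesting it back into rows, reusing the temp CTE from above:
with temp as (
select '100110' as user_id, date('2020-01-31') as start_date,
date('2020-02-28') as end_date )
select d.row_num, d.user_id, d.day
from temp, unnest(dataset.list_of_days(user_id, start_date, end_date)) as d;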

how to calculate difference between dates in BigQuery

I have a table named Employees with columns PersonID, Name, and StartDate. I want to calculate 1) the difference in days between the newest and oldest employee and 2) the longest period of time (in days) without any new hires. I have tried to use DATEDIFF; however, the dates are all in a single column and I'm not sure what other method I should use. Any help would be greatly appreciated.
Below is for BigQuery Standard SQL
#standardSQL
SELECT
SUM(days_before_next_hire) AS days_between_newest_and_oldest_employee,
MAX(days_before_next_hire) - 1 AS longest_period_without_new_hire
FROM (
SELECT
DATE_DIFF(
StartDate,
LAG(StartDate) OVER(ORDER BY StartDate),
DAY
) days_before_next_hire
FROM `project.dataset.your_table`
)
You can test and play with the above using dummy data as in the example below:
#standardSQL
WITH `project.dataset.your_table` AS (
SELECT DATE '2019-01-01' StartDate UNION ALL
SELECT '2019-01-03' StartDate UNION ALL
SELECT '2019-01-13' StartDate
)
SELECT
SUM(days_before_next_hire) AS days_between_newest_and_oldest_employee,
MAX(days_before_next_hire) - 1 AS longest_period_without_new_hire
FROM (
SELECT
DATE_DIFF(
StartDate,
LAG(StartDate) OVER(ORDER BY StartDate),
DAY
) days_before_next_hire
FROM `project.dataset.your_table`
)
with result
Row  days_between_newest_and_oldest_employee  longest_period_without_new_hire
1    12                                        9
Note the use of -1 in calculating longest_period_without_new_hire; whether to apply this adjustment is really up to you, depending on how you prefer to count gaps (for example, between 2019-01-03 and 2019-01-13 DATE_DIFF returns 10, but only 9 whole days passed with no new hire).
1) difference in days between the newest and oldest record
WITH table AS (
SELECT DATE(created_at) date, *
FROM `githubarchive.day.201901*`
WHERE _table_suffix<'2'
AND repo.name = 'google/bazel-common'
AND type='ForkEvent'
)
SELECT DATE_DIFF(MAX(date), MIN(date), DAY) max_minus_min
FROM table
2) the longest period of time (in days) without any new records
WITH table AS (
SELECT DATE(created_at) date, *
FROM `githubarchive.day.201901*`
WHERE _table_suffix<'2'
AND repo.name = 'google/bazel-common'
AND type='ForkEvent'
)
SELECT MAX(diff) max_diff
FROM (
SELECT DATE_DIFF(date, LAG(date) OVER(ORDER BY date), DAY) diff
FROM table
)
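Roughly the same two queries against the Employees table from the original question are sketched below (the project.dataset path is a placeholder, and StartDate is assumed to be a DATE -- wrap it in DATE() if it is a TIMESTAMP):
-- 1) days between the newest and oldest hire
SELECT DATE_DIFF(MAX(StartDate), MIN(StartDate), DAY) AS days_newest_minus_oldest
FROM `project.dataset.Employees`
-- 2) longest stretch (in days) without any new hires
SELECT MAX(diff) AS longest_gap_days
FROM (
  SELECT DATE_DIFF(StartDate, LAG(StartDate) OVER(ORDER BY StartDate), DAY) AS diff
  FROM `project.dataset.Employees`
)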

Merge the records for overlapping dates

I have data as below and want to merge the records for overlapping dates. The MIN start date and MAX end date of the overlapping records should become the start and end dates of the merged record.
Before merge:
Item Code Start_date End_date
============== =========== ===========
111 15-May-2004 20-Jun-2004
111 22-May-2004 07-Jun-2004
111 20-Jun-2004 13-Aug-2004
111 27-May-2004 30-Aug-2004
111 02-Sep-2004 23-Dec-2004
222 21-May-2004 19-Aug-2004
Required output:
Item Code Start_date End_date
============== =========== ===========
111 15-May-2004 30-Aug-2004
111 02-Sep-2004 23-Dec-2004
222 21-May-2004 19-Aug-2004
You can create the sample data using:
create table item(item_code number, start_date date, end_date date);
insert into item values (111,to_date('15-May-2004','DD-Mon-YYYY'),to_date('20-Jun-2004','DD-Mon-YYYY'));
insert into item values (111,to_date('22-May-2004','DD-Mon-YYYY'),to_date('07-Jun-2004','DD-Mon-YYYY'));
insert into item values (111,to_date('20-Jun-2004','DD-Mon-YYYY'),to_date('13-Aug-2004','DD-Mon-YYYY'));
insert into item values (111,to_date('27-May-2004','DD-Mon-YYYY'),to_date('30-Aug-2004','DD-Mon-YYYY'));
insert into item values (111,to_date('02-Sep-2004','DD-Mon-YYYY'),to_date('23-Dec-2004','DD-Mon-YYYY'));
insert into item values (222,to_date('21-May-2004','DD-Mon-YYYY'),to_date('19-Aug-2004','DD-Mon-YYYY'));
commit;
The code for this type of problem is rather tricky. Here is one approach that works pretty well:
with item (item_code, start_date, end_date) as (
select 111,to_date('15-05-2004','DD-MM-YYYY'),to_date('20-06-2004','DD-MM-YYYY') from dual union all
select 111,to_date('22-05-2004','DD-MM-YYYY'),to_date('07-06-2004','DD-MM-YYYY') from dual union all
select 111,to_date('20-06-2004','DD-MM-YYYY'),to_date('13-08-2004','DD-MM-YYYY') from dual union all
select 111,to_date('27-05-2004','DD-MM-YYYY'),to_date('30-08-2004','DD-MM-YYYY') from dual union all
select 111,to_date('02-09-2004','DD-MM-YYYY'),to_date('23-12-2004','DD-MM-YYYY') from dual union all
select 222,to_date('21-05-2004','DD-MM-YYYY'),to_date('19-08-2004','DD-MM-YYYY') from dual
),
id as (
select item_code, start_date as dte, count(*) as inc
from item
group by item_code, start_date
union all
select item_code, end_date, - count(*) as inc
from item
group by item_code, end_date
),
id2 as (
select id.*, sum(inc) over (partition by item_code order by dte) as running_inc
from id
),
id3 as (
select id2.*, sum(case when running_inc = 0 then 1 else 0 end) over (partition by item_code order by dte desc) as grp
from id2
)
select item_code, min(dte) as start_date, max(dte) as end_date
from id3
group by item_code, grp;
And a rextester to validate it.
What is this doing? Good question. The idea in these problems is to define the adjacent groups. This method does so by counting the number of "starts" and "ends" up to a given date. When the value is 0, a group ends.
The specific steps are as follows:
(1) Break out all the dates onto separate rows along with an indicator of whether the date is a start date or end date. This indicator is key to defining the ranges -- +1 to "enter" and -1 to "exit".
(2) Calculate the running total of the indicators. The 0s in this total are the ends of overlapping ranges.
(3) Do a reverse cumulative sum of the 0s to identify the groups.
(4) Aggregate to get the final results.
You can look at each of the CTEs to see what is happening in the data.
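For example, a quick sketch: keep the WITH clauses above unchanged and swap the final SELECT for one that shows the running totals driving the grouping:
select item_code, dte, inc, running_inc
from id2
order by item_code, dte;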
It's a variation of a gaps-and-islands problem. First calculate the maximum previous end date for each row. Then filter the rows where the current row's start date is greater than that max date; this is the start of a new group, and the group's end date is found in the next row.
WITH max_dates AS
(
SELECT
item_code
,start_date
,Max(end_date) -- get the maximum prevous end_date
Over (PARTITION BY item_code
ORDER BY start_date
ROWS BETWEEN Unbounded Preceding AND 1 Preceding) AS max_prev_date
,Max(end_date) -- get the maximum overall date (only needed for the last group)
Over (PARTITION BY item_code) AS max_date
FROM item
)
SELECT
item_code
,start_date
,Coalesce(Lead(max_prev_date) -- next row got the end date for the current row
Over (PARTITION BY item_code
ORDER BY start_date)
,max_date ) AS end_date -- no next row for the last row --> overall maximum end_date
FROM max_dates
WHERE max_prev_date < start_date -- maximum previous end date is less than current start date --> start of a new group
OR max_prev_date IS NULL -- first row
In SQL Server you can try this. It will give your desired output, but from a performance point of view the query might slow down when there is a large amount of data to be checked.
DECLARE @item TABLE(item_code int, start_date date, end_date date);
insert into @item values (111,'15-May-2004','20-Jun-2004');
insert into @item values (111,'22-May-2004','07-Jun-2004');
insert into @item values (111,'20-Jun-2004','13-Aug-2004');
insert into @item values (111,'27-May-2004','30-Aug-2004');
insert into @item values (111,'02-Sep-2004','23-Dec-2004');
insert into @item values (222,'21-May-2004','19-Aug-2004');
SELECT * FROM @item WHERE item_code IN (SELECT item_code FROM @item GROUP BY item_code) AND
(start_date IN (SELECT max(start_date) FROM @item GROUP BY item_code) or start_date IN (SELECT min(start_date) FROM @item GROUP BY item_code))
With the help of the above answers, I was able to simplify this as below:
WITH max_dates AS
(
SELECT
item_code
,start_date
,end_date
,Max(end_date)
Over (PARTITION BY item_code
ORDER BY start_date
) AS max_date
FROM item
) ,
max_dates1 as
(
select max_dates.* , lag(max_date) over(partition by item_code order by start_date) as MPD from max_dates
)
select ITEM_CODE,start_date,end_date from max_dates1
WHERE MPD < start_date
OR MPD IS NULL