Create Historical Table from Dates with Ranked Contracts (Gaps and Islands?) - sql

I have an issue in Teradata where I am trying to build a historical contract table that lists a system, it's corresponding contracts and the start and end dates of each contract. This table would then be queried for reporting as a point in time table. Here is some code to better explain.
CREATE TABLE TMP_WORK_DB.SOLD_SYSTEMS
(
SYSTEM_ID varchar(5),
CONTRACT_TYPE varchar(10),
CONTRACT_RANK int,
CONTRACT_STRT_DT date,
CONTRACT_END_DT date
);
INSERT INTO TMP_WORK_DB.SOLD_SYSTEMS VALUES ('AAA', 'BEST', 10, '2012-01-01', '2012-06-30');
INSERT INTO TMP_WORK_DB.SOLD_SYSTEMS VALUES ('AAA', 'BEST', 9, '2012-01-01', '2012-06-30');
INSERT INTO TMP_WORK_DB.SOLD_SYSTEMS VALUES ('AAA', 'OK', 1, '2012-08-01', '2012-12-30');
INSERT INTO TMP_WORK_DB.SOLD_SYSTEMS VALUES ('BBB', 'BEST', 10, '2013-12-01', '2014-03-02');
INSERT INTO TMP_WORK_DB.SOLD_SYSTEMS VALUES ('BBB', 'BETTER', 7, '2013-12-01', '2017-03-02');
INSERT INTO TMP_WORK_DB.SOLD_SYSTEMS VALUES ('BBB', 'GOOD', 4, '2016-12-02', '2017-12-02');
INSERT INTO TMP_WORK_DB.SOLD_SYSTEMS VALUES ('CCC', 'BEST', 10, '2009-10-13', '2014-10-14');
INSERT INTO TMP_WORK_DB.SOLD_SYSTEMS VALUES ('CCC', 'BETTER', 7, '2009-10-13', '2016-10-14');
INSERT INTO TMP_WORK_DB.SOLD_SYSTEMS VALUES ('CCC', 'OK', 2, '2008-10-13', '2017-10-14');
The required output would be:
SYSTEM_ID CONTRACT_TYPE CONTRACT_STRT_DT CONTARCT_END_DT CONTRACT_RANK
AAA BEST 01/01/2012 06/30/2012 10
AAA OK 08/01/2012 12/30/2012 1
BBB BEST 12/01/2013 03/02/2014 10
BBB BETTER 03/03/2014 03/02/2017 7
BBB GOOD 03/03/2017 12/02/2017 4
CCC OK 10/13/2008 10/12/2009 2
CCC BEST 10/13/2009 10/14/2014 10
CCC BETTER 10/15/2014 10/14/2016 7
CCC OK 10/15/2016 10/14/2017 2
I'm not necessarily looking to reduce rows but am looking to get the correct state of the system_id at any given point in time. Note that when a higher ranked contract ends and a lower ranked contract is still active the lower ranked picks up where the higher one left off.
We are using TD 14 and I have been able to get the easy records where the dates flow sequentially and are of higher rank but am having trouble with the overlaps where two different ranked contracts cover multiple date spans.
I found this blog post (Sharpening Stones) and got it working for the most part but am still having trouble setting the new start dates for the overlapping contracts.
Any help would be appreciated. Thanks.
*UPDATE 04/04/2014 *
I came up with the following code which gives me exactly what I want but I'm not sure of the performance. It works on smaller data sets of a few hundred rows but I havent tested it on several million:
*UPDATE 04/07/2014 *
Updated the date subquery due to spool issues. This query explodes all days where the contract is possibly active and then uses the ROW_NUMBER function to get the highest ranked CONTRACT_TYPE per day. The MIN/MAX functions are then partitioned over the system and contract type to pick up when the highest ranked contract type changes.
*UPDATE - 2 - 04/07/2014 *
I cleaned up the query and it seems to be perform a little better.
SELECT
SYSTEM_ID
, CONTRACT_TYPE
, MIN(CALENDAR_DATE) NEW_START_DATE
, MAX(CALENDAR_DATE) NEW_END_DATE
, CONTRACT_RANK
FROM (
SELECT
CALENDAR_DATE
, SYSTEM_ID
, CONTRACT_TYPE
, CONTRACT_RANK
, ROW_NUMBER() OVER (PARTITION BY SYSTEM_ID, CALENDAR_DATE ORDER BY CONTRACT_RANK DESC, CONTRACT_STRT_DT DESC, CONTRACT_END_DT DESC) AS RNK
FROM SOLD_SYSTEMS t1
JOIN (
SELECT CALENDAR_DATE
FROM FULL_CALENDAR_TABLE ia
WHERE CALENDAR_DATE > DATE'2013-01-01'
)dt
ON CALENDAR_DATE BETWEEN CONTRACT_STRT_DT AND CONTRACT_END_DT
QUALIFY RNK = 1
)z1
GROUP BY 1,2,5

Following approach uses the new PERIOD functions in TD13.10.
-- 1. TD_SEQUENCED_COUNT can't be used in joins, so create a Volatile Table
-- 2. TD_SEQUENCED_COUNT can't use additional columns (e.g. CONTRACT_RANK),
-- so simply create a new row whenever a period starts or ends without
-- considering CONTRACT_RANK
CREATE VOLATILE TABLE vt AS
(
WITH cte
(
SYSTEM_ID
,pd
)
AS
(
SELECT
SYSTEM_ID
-- PERIODs can easily be constructed on-the-fly, but the end date is not inclusive,
-- so I had to adjust to your implementation, CONTRACT_END_DT +/- 1:
,PERIOD(CONTRACT_STRT_DT, CONTRACT_END_DT + 1) AS pd
FROM SOLD_SYSTEMS
)
SELECT
SYSTEM_ID
,BEGIN(pd) AS CONTRACT_STRT_DT
,END(pd) - 1 AS CONTRACT_END_DT
FROM
TABLE (TD_SEQUENCED_COUNT
(NEW VARIANT_TYPE(cte.SYSTEM_ID)
,cte.pd)
RETURNS (SYSTEM_ID VARCHAR(5)
,Policy_Count INTEGER
,pd PERIOD(DATE))
HASH BY SYSTEM_ID
LOCAL ORDER BY SYSTEM_ID ,pd) AS dt
)
WITH DATA
PRIMARY INDEX (SYSTEM_ID)
ON COMMIT PRESERVE ROWS
;
-- Find the matching CONTRACT_RANK
SELECT
vt.SYSTEM_ID
,t.CONTRACT_TYPE
,vt.CONTRACT_STRT_DT
,vt.CONTRACT_END_DT
,t.CONTRACT_RANK
FROM vt
-- If both vt and SOLD_SYSTEMS have a NUPI on SYSTEM_ID this join should be
-- quite efficient
JOIN SOLD_SYSTEMS AS t
ON vt.SYSTEM_ID = t.SYSTEM_ID
AND ( t.CONTRACT_STRT_DT, t.CONTRACT_END_DT)
OVERLAPS (vt.CONTRACT_STRT_DT, vt.CONTRACT_END_DT)
QUALIFY
-- As multiple contracts for the same period are possible:
-- find the row with the highest rank
ROW_NUMBER()
OVER (PARTITION BY vt.SYSTEM_ID,vt.CONTRACT_STRT_DT
ORDER BY t.CONTRACT_RANK DESC, vt.CONTRACT_END_DT DESC) = 1
ORDER BY 1,3
;
-- Previous query might return consecutive rows with the same CONTRACT_RANK, e.g.
-- BBB BETTER 2014-03-03 2016-12-01 7
-- BBB BETTER 2016-12-02 2017-03-02 7
-- If you don't want that you have to normalize the data:
WITH cte
(
SYSTEM_ID
,CONTRACT_STRT_DT
,CONTRACT_END_DT
,CONTRACT_RANK
,CONTRACT_TYPE
,pd
)
AS
(
SELECT
vt.SYSTEM_ID
,vt.CONTRACT_STRT_DT
,vt.CONTRACT_END_DT
,t.CONTRACT_RANK
,t.CONTRACT_TYPE
,PERIOD(vt.CONTRACT_STRT_DT, vt.CONTRACT_END_DT + 1) AS pd
FROM vt
JOIN SOLD_SYSTEMS AS t
ON vt.SYSTEM_ID = t.SYSTEM_ID
AND ( t.CONTRACT_STRT_DT, t.CONTRACT_END_DT)
OVERLAPS (vt.CONTRACT_STRT_DT, vt.CONTRACT_END_DT)
QUALIFY
ROW_NUMBER()
OVER (PARTITION BY vt.SYSTEM_ID,vt.CONTRACT_STRT_DT
ORDER BY t.CONTRACT_RANK DESC, vt.CONTRACT_END_DT DESC) = 1
)
SELECT
SYSTEM_ID
,CONTRACT_TYPE
,BEGIN(pd) AS CONTRACT_STRT_DT
,END(pd) - 1 AS CONTRACT_END_DT
,CONTRACT_RANK
FROM
TABLE (TD_NORMALIZE_MEET
(NEW VARIANT_TYPE(cte.SYSTEM_ID
,cte.CONTRACT_RANK
,cte.CONTRACT_TYPE)
,cte.pd)
RETURNS (SYSTEM_ID VARCHAR(5)
,CONTRACT_RANK INT
,CONTRACT_TYPE VARCHAR(10)
,pd PERIOD(DATE))
HASH BY SYSTEM_ID
LOCAL ORDER BY SYSTEM_ID, CONTRACT_RANK, CONTRACT_TYPE, pd ) A
ORDER BY 1, 3;
Edit: This is another way to get the result of the 2nd query without Volatile Table and TD_SEQUENCED_COUNT:
SELECT
t.SYSTEM_ID
,t.CONTRACT_TYPE
,BEGIN(CONTRACT_PERIOD) AS CONTRACT_STRT_DT
,END(CONTRACT_PERIOD)- 1 AS CONTRACT_END_DT
,t.CONTRACT_RANK
,dt.p P_INTERSECT PERIOD(t.CONTRACT_STRT_DT,t.CONTRACT_END_DT + 1) AS CONTRACT_PERIOD
FROM
(
SELECT
dt.SYSTEM_ID
,PERIOD(d, MIN(d)
OVER (PARTITION BY dt.SYSTEM_ID
ORDER BY d
ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING)) AS p
FROM
(
SELECT
SYSTEM_ID
,CONTRACT_STRT_DT AS d
FROM SOLD_SYSTEMS
UNION
SELECT
SYSTEM_ID
,CONTRACT_END_DT + 1 AS d
FROM SOLD_SYSTEMS
) AS dt
QUALIFY p IS NOT NULL
) AS dt
JOIN SOLD_SYSTEMS AS t
ON dt.SYSTEM_ID = t.SYSTEM_ID
WHERE CONTRACT_PERIOD IS NOT NULL
QUALIFY
ROW_NUMBER()
OVER (PARTITION BY dt.SYSTEM_ID,p
ORDER BY t.CONTRACT_RANK DESC, t.CONTRACT_END_DT DESC) = 1
ORDER BY 1,3
And based on that you can also include the normalization in a single query:
WITH cte
(
SYSTEM_ID
,CONTRACT_TYPE
,CONTRACT_STRT_DT
,CONTRACT_END_DT
,CONTRACT_RANK
,pd
)
AS
(
SELECT
t.SYSTEM_ID
,t.CONTRACT_TYPE
,BEGIN(CONTRACT_PERIOD) AS CONTRACT_STRT_DT
,END(CONTRACT_PERIOD)- 1 AS CONTRACT_END_DT
,t.CONTRACT_RANK
,dt.p P_INTERSECT PERIOD(t.CONTRACT_STRT_DT,t.CONTRACT_END_DT + 1) AS CONTRACT_PERIOD
FROM
(
SELECT
dt.SYSTEM_ID
,PERIOD(d, MIN(d)
OVER (PARTITION BY dt.SYSTEM_ID
ORDER BY d
ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING)) AS p
FROM
(
SELECT
SYSTEM_ID
,CONTRACT_STRT_DT AS d
FROM SOLD_SYSTEMS
UNION
SELECT
SYSTEM_ID
,CONTRACT_END_DT + 1 AS d
FROM SOLD_SYSTEMS
) AS dt
QUALIFY p IS NOT NULL
) AS dt
JOIN SOLD_SYSTEMS AS t
ON dt.SYSTEM_ID = t.SYSTEM_ID
WHERE CONTRACT_PERIOD IS NOT NULL
QUALIFY
ROW_NUMBER()
OVER (PARTITION BY dt.SYSTEM_ID,p
ORDER BY t.CONTRACT_RANK DESC, t.CONTRACT_END_DT DESC) = 1
)
SELECT
SYSTEM_ID
,CONTRACT_TYPE
,BEGIN(pd) AS CONTRACT_STRT_DT
,END(pd) - 1 AS CONTRACT_END_DT
,CONTRACT_RANK
FROM
TABLE (TD_NORMALIZE_MEET
(NEW VARIANT_TYPE(cte.SYSTEM_ID
,cte.CONTRACT_RANK
,cte.CONTRACT_TYPE)
,cte.pd)
RETURNS (SYSTEM_ID VARCHAR(5)
,CONTRACT_RANK INT
,CONTRACT_TYPE VARCHAR(10)
,pd PERIOD(DATE))
HASH BY SYSTEM_ID
LOCAL ORDER BY SYSTEM_ID, CONTRACT_RANK, CONTRACT_TYPE, pd ) A
ORDER BY 1, 3;

SEL system_id,contract_type,MAX(contract_rank),
CASE WHEN contract_strt_dt<prev_end_dt THEN prev_end_dt+1
ELSE contract_strt_dt
END AS new_start ,contract_strt_dt,contract_end_dt,
MIN(contract_end_dt) OVER (PARTITION BY system_id
ORDER BY contract_strt_dt,contract_end_dt ROWS BETWEEN 1 PRECEDING
AND 1 PRECEDING) prev_end_dt
FROM sold_systems
GROUP BY system_id,contract_type,contract_strt_dt,contract_end_dt
ORDER BY contract_strt_dt,contract_end_dt,prev_end_dt

I think I'v got it....
try this
select SYSTEM_ID, CONTRACT_TYPE,CONTRACT_RANK,
case
when CONTRACT_STRT_DT<NEW_START_DATE then NEW_START_DATE /*if new_star_date overlap startdate then get new_Start_date */
else CONTRACT_STRT_DT
end as new_contract_str_dt,
CONTRACT_END_DT
from
(select t1.SYSTEM_ID,t1.CONTRACT_TYPE,t1.CONTRACT_RANK,t1.CONTRACT_STRT_DT,t1.CONTRACT_END_DT,
coalesce(max(t1.CONTRACT_END_DT) over (partition by t1.SYSTEM_ID order by t1.CONTRACT_RANK desc rows between UNBOUNDED PRECEDING and 1 preceding ), t1.CONTRACT_STRT_DT) NEW_START_DATE
from SOLD_SYSTEMS t1
) as temp1
/*you may remove fully overlapped contracts*/
where NEW_START_DATE<=CONTRACT_END_DT
It's simpler and have a good execution plan... You can work with large tables (don't forget to collect statistics )

Related

Select an excess quantity(rows) based on a total quantity from another table

I'm trying to select an excess quantity (per row grouped by same Groupletter and Date) based on a total quantity of the another table. I am able to get the excess but I think it's a long code and kinda hard to understand. So I'm wondering if someone can make my code shorter and precise.
Here is my code:
IF (SELECT OBJECT_ID('tempdb..#TableA')) IS NOT NULL
DROP TABLE #TableA
GO
CREATE TABLE #TableA
(
GroupLetter CHAR(2),
Quantity INT,
ValueDate DATE
)
INSERT INTO #TableA VALUES
('A',1,'01-02-2000'),('A',1,'01-02-2000'),('A',1,'01-02-2000'),('A',2,'01-03-2000'),('A',2,'01-03-2000'),
('B',2,'01-04-2002'),('B',2,'01-05-2002'),
('C',1,'01-02-2003'),('C',1,'01-02-2003'),('C',1,'02-02-2003'),
('D',1,'02-02-2004'),('D',1,'02-02-2004'),
('E',30,'01-02-2005'),('E',3,'01-07-2005'),('E',1,'01-02-2005'),
('F',30,'01-06-2006'),('F',15,'01-06-2006'),('F',2,'01-08-2006'),
('G',1,'01-02-2007')
;
IF (SELECT OBJECT_ID('tempdb..#TableB')) IS NOT NULL
DROP TABLE #TableB
GO
CREATE TABLE #TableB
(
GroupLetter CHAR(2),
Limit INT
)
INSERT INTO #TableB VALUES
('A',3),
('B',3),
('c',1),
('D',2),
('E',29),
('F',32),
('G',3)
;
------------------------------------------------
WITH Step1 AS
(
SELECT a.GroupLetter,
a.Quantity,
a.ValueDate,
SUM(a.Quantity) OVER (PARTITION BY a.GroupLetter,
a.ValueDate
ORDER BY a.Quantity DESC
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS RunningQty,
b.Limit
FROM #TableA a
JOIN #TableB b ON a.GroupLetter = b.GroupLetter
)
,Step2 AS
(
SELECT
*,
CASE
WHEN RunningQty <= Limit THEN Limit
ELSE Limit - LAG(RunningQty,1,0) OVER (PARTITION BY GroupLetter,
ValueDate
ORDER BY Quantity DESC)
END AS Qty
FROM Step1
)
,Step3 AS
(
SELECT
*,
CASE
WHEN qty <= 0
THEN Quantity
WHEN Quantity - qty > 0
THEN Quantity - qty
END AS DeniedQty
FROM Step2
)
SELECT dg.GroupLetter,
dg.DeniedQty,
dg.ValueDate
FROM Step3 dg
WHERE (dg.DeniedQty > 0 AND dg.DeniedQty IS NOT NULL)
And the result is:
GroupLetter
DeniedQty
ValueDate
A
1
2000-01-03
C
1
2003-01-02
E
1
2005-01-02
E
1
2005-01-02
F
13
2006-01-06

Identify Consecutive Chunks in SQL Server Table

I have this table:
ValueId bigint // (identity) item ID
ListId bigint // group ID
ValueDelta int // item value
ValueCreated datetime2 // item created
What I need is to find consecutive Values within the same Group ordered by Created, not ID. Created and ID are not guaranteed to be in the same order.
So the output should be:
ListID bigint
FirstId bigint // from this ID (first in LID with Value ordered by Date)
LastId bigint // to this ID (last in LID with Value ordered by Date)
ValueDelta int // all share this value
ValueCount // and this many occurrences (number of items between FirstId and LastId)
I can do this with Cursors but I'm sure that's not the best idea so I'm wondering if this can be done in a query.
Please, for the answer (if any), explain it a bit.
UPDATE: SQLfiddle basic data set
It does look like a gaps-and-island problem.
Here is one way to do it. It would likely work faster than your variant.
The standard idea for gaps-and-islands is to generate two sets of row numbers partitioning them in two ways. The difference between such row numbers (rn1-rn2) would remain the same within each consecutive chunk. Run the query below CTE-by-CTE and examine intermediate results to see what is going on.
WITH
CTE_RN
AS
(
SELECT
[ValueId]
,[ListId]
,[ValueDelta]
,[ValueCreated]
,ROW_NUMBER() OVER (PARTITION BY ListID ORDER BY ValueCreated) AS rn1
,ROW_NUMBER() OVER (PARTITION BY ListID, [ValueDelta] ORDER BY ValueCreated) AS rn2
FROM [Value]
)
SELECT
ListID
,MIN(ValueID) AS FirstID
,MAX(ValueID) AS LastID
,MIN(ValueCreated) AS FirstCreated
,MAX(ValueCreated) AS LastCreated
,ValueDelta
,COUNT(*) AS ValueCount
FROM CTE_RN
GROUP BY
ListID
,ValueDelta
,rn1-rn2
ORDER BY
FirstCreated
;
This query produces the same result as yours on your sample data set.
It is not quite clear whether FirstID and LastID can be MIN and MAX, or they indeed must be from the first and last rows (when ordered by ValueCreated). If you need really first and last, the query would become a bit more complicated.
In your original sample data set the "first" and "min" for the FirstID are the same. Let's change the sample data set a little to highlight this difference:
insert into [Value]
([ListId], [ValueDelta], [ValueCreated])
values
(1, 1, '2019-01-01 01:01:02'), -- 1.1
(1, 0, '2019-01-01 01:02:01'), -- 2.1
(1, 0, '2019-01-01 01:03:01'), -- 2.2
(1, 0, '2019-01-01 01:04:01'), -- 2.3
(1, -1, '2019-01-01 01:05:01'), -- 3.1
(1, -1, '2019-01-01 01:06:01'), -- 3.2
(1, 1, '2019-01-01 01:01:01'), -- 1.2
(1, 1, '2019-01-01 01:08:01'), -- 4.2
(2, 1, '2019-01-01 01:08:01') -- 5.1
;
All I did is swapped the ValueCreated between the first and seventh rows, so now the FirstID of the first group is 7 and LastID is 1. Your query returns correct result. My simple query above doesn't.
Here is the variant that produces correct result. I decided to use FIRST_VALUE and LAST_VALUE functions to get the appropriate IDs. Again, run the query CTE-by-CTE and examine intermediate results to see what is going on.
This variant produces the same result as your query even with the adjusted sample data set.
WITH
CTE_RN
AS
(
SELECT
[ValueId]
,[ListId]
,[ValueDelta]
,[ValueCreated]
,ROW_NUMBER() OVER (PARTITION BY ListID ORDER BY ValueCreated) AS rn1
,ROW_NUMBER() OVER (PARTITION BY ListID, ValueDelta ORDER BY ValueCreated) AS rn2
FROM [Value]
)
,CTE2
AS
(
SELECT
ValueId
,ListId
,ValueDelta
,ValueCreated
,rn1
,rn2
,rn1-rn2 AS Diff
,FIRST_VALUE(ValueID) OVER(
PARTITION BY ListID, ValueDelta, rn1-rn2 ORDER BY ValueCreated
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS FirstID
,LAST_VALUE(ValueID) OVER(
PARTITION BY ListID, ValueDelta, rn1-rn2 ORDER BY ValueCreated
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS LastID
FROM CTE_RN
)
SELECT
ListID
,FirstID
,LastID
,MIN(ValueCreated) AS FirstCreated
,MAX(ValueCreated) AS LastCreated
,ValueDelta
,COUNT(*) AS ValueCount
FROM CTE2
GROUP BY
ListID
,ValueDelta
,rn1-rn2
,FirstID
,LastID
ORDER BY FirstCreated;
Use a CTE that adds a Row_Number column, partitioned by GroupId and Value and ordered by Created.
Then select from the CTE, GROUP BY GroupId and Value; use COUNT(*) to get the Count, and use correlated subqueries to select the ValueId with the MIN(RowNumber) (which will always be 1, so you can just use that instead of MIN) and the MAX(RowNumber) to get FirstId and LastId.
Although, now that I've noticed you're using SQL Server 2017, you should be able to use First_Value() and Last_Value() instead of correlated subqueries.
After many iterations I think I have a working solution. I'm absolutely sure it's far from optimal but it works.
Link is here: http://sqlfiddle.com/#!18/4ee9f/3
Sample data:
create table [Value]
(
[ValueId] bigint not null identity(1,1),
[ListId] bigint not null,
[ValueDelta] int not null,
[ValueCreated] datetime2 not null,
constraint [PK_Value] primary key clustered ([ValueId])
);
insert into [Value]
([ListId], [ValueDelta], [ValueCreated])
values
(1, 1, '2019-01-01 01:01:01'), -- 1.1
(1, 0, '2019-01-01 01:02:01'), -- 2.1
(1, 0, '2019-01-01 01:03:01'), -- 2.2
(1, 0, '2019-01-01 01:04:01'), -- 2.3
(1, -1, '2019-01-01 01:05:01'), -- 3.1
(1, -1, '2019-01-01 01:06:01'), -- 3.2
(1, 1, '2019-01-01 01:01:02'), -- 1.2
(1, 1, '2019-01-01 01:08:01'), -- 4.2
(2, 1, '2019-01-01 01:08:01') -- 5.1
The Query that seems to work:
-- this is the actual order of data
select *
from [Value]
order by [ListId] asc, [ValueCreated] asc;
-- there are 4 sets here
-- set 1 GroupId=1, Id=1&7, Value=1
-- set 2 GroupId=1, Id=2-4, Value=0
-- set 3 GroupId=1, Id=5-6, Value=-1
-- set 4 GroupId=1, Id=8-8, Value=1
-- set 5 GroupId=2, Id=9-9, Value=1
with [cte1] as
(
select [v1].[ListId]
,[v2].[ValueId] as [FirstId], [v2].[ValueCreated] as [FirstCreated]
,[v1].[ValueId] as [LastId], [v1].[ValueCreated] as [LastCreated]
,isnull([v1].[ValueDelta], 0) as [ValueDelta]
from [dbo].[Value] [v1]
join [dbo].[Value] [v2] on [v2].[ListId] = [v1].[ListId]
and isnull([v2].[ValueDeltaPrev], 0) = isnull([v1].[ValueDeltaPrev], 0)
and [v2].[ValueCreated] <= [v1].[ValueCreated] and not exists (
select 1
from [dbo].[Value] [v3]
where 1=1
and ([v3].[ListId] = [v1].[ListId])
and ([v3].[ValueCreated] between [v2].[ValueCreated] and [v1].[ValueCreated])
and [v3].[ValueDelta] != [v1].[ValueDelta]
)
), [cte2] as
(
select [t1].*
from [cte1] [t1]
where not exists (select 1 from [cte1] [t2] where [t2].[ListId] = [t1].[ListId]
and ([t1].[FirstId] != [t2].[FirstId] or [t1].[LastId] != [t2].[LastId])
and [t1].[FirstCreated] between [t2].[FirstCreated] and [t2].[LastCreated]
and [t1].[LastCreated] between [t2].[FirstCreated] and [t2].[LastCreated]
)
)
select [ListId], [FirstId], [LastId], [FirstCreated], [LastCreated], [ValueDelta] as [ValueDelta]
,(select count(*) from [dbo].[Value] where [ListId] = [t].[ListId] and [ValueCreated] between [t].[FirstCreated] and [t].[LastCreated]) as [ValueCount]
from [cte2] [t];
How it works:
join table to self on same list but only on older (or equal date to handle single sets) values
join again on self and exclude any overlaps keeping only largest date set
once we identify largest sets, we then count entries in set dates
If anyone can find a better / friendlier solution, you get the answer.
PS: The dumb straightforward Cursor approach seems a lot faster than this. Still testing.

Determining consecutive and independent PTO days

Based on feedback, I am restructuring my question.
I am working with SQL on a Presto database.
My objective is to report on employees that take consecutive days of PTO or Sick Time since the beginning of 2018. My desired output would have the individual islands of time taken by employee with the start and end dates, along the lines of:
The main table I am using is d_employee_time_off
There are only two time_off_type_name: PTO and Sick Leave.
The ds is a datestamp and I use the latest ds (usually the current date)
I have access to a date table named d_date
I can join the tables on d_employee_time_off.time_off_date = d_date.full_date
I hope that I have structured this question in a fashion that is understandable.
I believe the need here is to join the day off material to a calendar table.
In the example solution below I am generating this "on the fly" but I think you do have your own solution for this. Also in my example I have used the string 'Monday' and moved backward from that (or, you could use 'Friday' and move forward). I'm, not keen on language dependent solutions but as I'm not a Presto user wasn't able to test anything on Presto. So the example below uses some of your own logic, but using SQL Server syntax which I trust you can translate to Presto:
Query:
;WITH
Digits AS (
SELECT 0 AS digit UNION ALL
SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL
SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL
SELECT 9
)
, cal AS (
SELECT
ca.number
, dateadd(day,ca.number,'20180101') as cal_date
, datename(weekday,dateadd(day,ca.number,'20180101')) weekday
FROM Digits [1s]
CROSS JOIN Digits [10s]
CROSS JOIN Digits [100s] /* add more like this as needed */
cross apply (
SELECT
[1s].digit
+ [10s].digit * 10
+ [100s].digit * 100 /* add more like this as needed */
AS number
) ca
)
, time_off AS (
select
*
from cal
inner join mytable t on (cal.cal_date = t.time_off_date and cal.weekday <> 'Monday')
or (cal.cal_date between dateadd(day,-2,t.time_off_date)
and t.time_off_date and datename(weekday,t.time_off_date) = 'Monday')
)
, starting_points AS (
SELECT
employee_id,
cal_date,
dense_rank() OVER(partition by employee_id
ORDER BY
time_off_date
) AS rownum
FROM
time_off A
WHERE
NOT EXISTS (
SELECT
*
FROM
time_off B
WHERE
B.employee_id = A.employee_id
AND B.cal_date = DATEADD(day, -1, A.cal_date)
)
)
, ending_points AS (
SELECT
employee_id,
cal_date,
dense_rank() OVER(partition by employee_id
ORDER BY
time_off_date
) AS rownum
FROM
time_off A
WHERE
NOT EXISTS (
SELECT
*
FROM
time_off B
WHERE
B.employee_id = A.employee_id
AND B.cal_date = DATEADD(day, 1, A.cal_date)
)
)
SELECT
S.employee_id,
S.cal_date AS start_range,
E.cal_date AS end_range
FROM
starting_points S
JOIN
ending_points E
ON E.employee_id = S.employee_id
AND E.rownum = S.rownum
order by employee_id
, start_range
Result:
employee_id start_range end_range
1 200035 02.01.2018 02.01.2018
2 200035 20.04.2018 27.04.2018
3 200037 27.01.2018 29.01.2018
4 200037 31.03.2018 02.04.2018
see: http://rextester.com/MISZ50793
CREATE TABLE mytable(
ID INT NOT NULL
,employee_id INTEGER NOT NULL
,type VARCHAR(3) NOT NULL
,time_off_date DATE NOT NULL
,time_off_in_days INT NOT NULL
);
INSERT INTO mytable(id,employee_id,type,time_off_date,time_off_in_days) VALUES (1,200035,'PTO','2018-01-02',1);
INSERT INTO mytable(id,employee_id,type,time_off_date,time_off_in_days) VALUES (2,200035,'PTO','2018-04-20',1);
INSERT INTO mytable(id,employee_id,type,time_off_date,time_off_in_days) VALUES (3,200035,'PTO','2018-04-23',1);
INSERT INTO mytable(id,employee_id,type,time_off_date,time_off_in_days) VALUES (4,200035,'PTO','2018-04-24',1);
INSERT INTO mytable(id,employee_id,type,time_off_date,time_off_in_days) VALUES (5,200035,'PTO','2018-04-25',1);
INSERT INTO mytable(id,employee_id,type,time_off_date,time_off_in_days) VALUES (6,200035,'PTO','2018-04-26',1);
INSERT INTO mytable(id,employee_id,type,time_off_date,time_off_in_days) VALUES (7,200035,'PTO','2018-04-27',1);
INSERT INTO mytable(id,employee_id,type,time_off_date,time_off_in_days) VALUES (8,200037,'PTO','2018-01-29',1);
INSERT INTO mytable(id,employee_id,type,time_off_date,time_off_in_days) VALUES (9,200037,'PTO','2018-04-02',1);

Drop rows identified within moving time window

I have a dataset of hospitalisations ('spells') - 1 row per spell. I want to drop any spells recorded within a week after another (there could be multiple) - the rationale being is that they're likely symptomatic of the same underlying cause. Here is some play data:
create table hif_user.rzb_recurse_src (
patid integer not null,
eventdate integer not null,
type smallint not null
);
insert into hif_user.rzb_recurse_src values (1,1,1);
insert into hif_user.rzb_recurse_src values (1,3,2);
insert into hif_user.rzb_recurse_src values (1,5,2);
insert into hif_user.rzb_recurse_src values (1,9,2);
insert into hif_user.rzb_recurse_src values (1,14,2);
insert into hif_user.rzb_recurse_src values (2,1,1);
insert into hif_user.rzb_recurse_src values (2,5,1);
insert into hif_user.rzb_recurse_src values (2,19,2);
Only spells of type 2 - within a week after any other - are to be dropped. Type 1 spells are to remain.
For patient 1, dates 1 & 9 should be kept. For patient 2, all rows should remain.
The issue is with patient 1. Spell date 9 is identified for dropping as it is close to spell date 5; however, as spell date 5 is close to spell date 1 is should be dropped therefore allowing spell date 9 to live...
So, it seems a recursive problem. However, I've not used recursive programming in SQL before and I'm struggling to really picture how to do it. Can anyone help? I should add that I'm using Teradata which has more restrictions than most with recursive SQL (only UNION ALL sets allowed I believe).
It's a cursor logic, check one row after the other if it fits your rules, so recursion is the easiest (maybe the only) way to solve your problem.
To get a decent performance you need a Volatile Table to facilitate this row-by-row processing:
CREATE VOLATILE TABLE vt (patid, eventdate, exac_type, rn, startdate) AS
(
SELECT r.*
,ROW_NUMBER() -- needed to facilitate the join
OVER (PARTITION BY patid ORDER BY eventdate) AS rn
FROM hif_user.rzb_recurse_src AS r
) WITH DATA ON COMMIT PRESERVE ROWS;
WITH RECURSIVE cte (patid, eventdate, exac_type, rn, startdate) AS
(
SELECT vt.*
,eventdate AS startdate
FROM vt
WHERE rn = 1 -- start with the first row
UNION ALL
SELECT vt.*
-- check if type = 1 or more than 7 days from the last eventdate
,CASE WHEN vt.eventdate > cte.startdate + 7
OR vt.exac_type = 1
THEN vt.eventdate -- new start date
ELSE cte.startdate -- keep old date
END
FROM vt JOIN cte
ON vt.patid = cte.patid
AND vt.rn = cte.rn + 1 -- proceed to next row
)
SELECT *
FROM cte
WHERE eventdate - startdate = 0 -- only new start days
order by patid, eventdate
I think the key to solving this is getting the first date more than 7 days from the current date and then doing a recursive subquery:
with rrs as (
select rrs.*,
(select min(rrs2.eventdate)
from hif_user.rzb_recurse_src rrs2
where rrs2.patid = rrs.patid and
rrs2.eventdate > rrs.eventdate + 7
) as eventdate7
from hif_user.rzb_recurse_src rrs
),
recursive cte as (
select patid, min(eventdate) as eventdate, min(eventdate7) as eventdate7
from hif_user.rzb_recurse_src rrs
group by patid
union all
select cte.patid, cte.eventdate7, rrs.eventdate7
from cte join
hif_user.rzb_recurse_src rrs
on rrs.patid = cte.patid and
rrs.eventdate = cte.eventdate7
)
select cte.patid, cte.eventdate
from cte;
If you want additional columns, then join in the original table at the last step.

how to get query value from 1 row to use to another row?

This is example query:
payment_Type payment_value cost_type cost value
Cost I 100 Registration 40
Cost I 100 books 40
Cost I 100 Lab 40
The COST I has 3 elements Cost_Type that have their own Cost_value.
I want to manipulate like this:
payment_Type payment_value cost_type cost value Payment_by_cost_type
Cost I 100 Registration 40 40
Cost I 100 books 40 40
Cost I 100 Lab 40 20
The point is the I want to divided the payment_value into each cost_value. In the example the payment_by_cost becomes 40, 40, 20 = 100.
The lab cost_value is 40 but it can assign value is 20 because remains from the divided 2 cost type above.
Is it possible that I can use the value from Payment_by_cost_type in the next row record? I have been trying to insert the value Payment_by_cost_type to a temporary table but select cannot have insert statement.
Does anyone have any ideas on how to solve this? I've been consulting to DWH he said it must using Store procedure it cannot done by query.
I guess your table contains not only "Cost I" but other values so here is a query to output results for all groups (by Payment_type) in the table:
;with table1 as
(select
t.*,
row_number()
OVER
(PARTITION BY payment_Type order by cost_type) rn
from t
)
,table2 as
( select t4.*,isnull((select sum(cost_value) from table1
where table1.payment_type=t4.payment_type and rn<t4.rn),0) CumSum
from table1 t4
)
select payment_type,payment_value,cost_type,cost_value,
case when cost_value+CumSum<=payment_value then cost_value
else
payment_value-Cumsum
end
from table2
order by Payment_type,rn;
SQLFIDDLE demo
You need to define some kind of order for your records to define in which order the payments should be applied
Once you have done that (i'm using ID in this example)...
select *
, case
when payment_value-(select isnull(SUM(cost_value),0) from yourtable t2 where t2.id<t1.id)<cost_value
then payment_value-(select isnull(SUM(cost_value),0) from yourtable t2 where t2.id<t1.id)
else cost_value
end
from yourtable t1
Doing it step by step using common table expressions.
declare #t table (
payment_type varchar(20),
payment_value int,
cost_type varchar(20),
cost_value int,
cost_id int --for the ordering
)
insert #t values
('Cost I',100,'Registration',40,1),
('Cost I',100,'books',40,2),
('Cost I',100,'Lab',40,3),
('Cost 2',100,'Registration',40,4),
('Cost 2',100,'books',50,5),
('Cost 2',100,'Lab',40,6)
--get count for each payment_type to determine last row
;with payment_value_cte(payment_type,payment_value,count) as
(
select payment_type,payment_value,COUNT(*) from #t group by payment_type,payment_value
),
--use sequential index for each row in payment type
payment_value_index_cte(
payment_type ,
payment_value,
cost_type,
cost_value,
cost_id,
row) as
(
select *,ROW_NUMBER() OVER(PARTITION BY payment_type ORDER BY cost_id) from #t --assumes order is by an id
),
--get sum of each row for payment type except last row
payment_value_sum_except_last_cte(
payment_type,
payment_value,
current_sum) as
(
select pi.payment_type,pi.payment_value,SUM(cost_value)
from payment_value_index_cte pi
inner join payment_value_cte pt on pt.payment_type = pi.payment_type
where pi.row < pt.count
group by pi.payment_type,pi.payment_value
)
select
pi.payment_type,pi.payment_value,pi.cost_type,pi.cost_value,
--if last row calculate difference, else use the cost_value
case when pi.row = pt.count then pt.payment_value - pe.current_sum else pi.cost_value end [Payment_by_cost_type]
from payment_value_index_cte pi
inner join payment_value_cte pt on pt.payment_type = pi.payment_type
inner join payment_value_sum_except_last_cte pe on pe.payment_type = pi.payment_type
SELECT payment_Type, payment_value, cost_type, cost_value,
CASE WHEN ROW_NUMBER() OVER (ORDER BY(SELECT 1)) = COUNT(*) OVER (PARTITION BY payment_Type)
THEN SUM(cost_value) OVER (PARTITION BY payment_Type) - payment_value
ELSE cost_value END AS Payment_by_cost_type
FROM dbo.your_table
Demo on SQLFiddle