SQL - Decaying Time Since Event Then Starting Over At the Next Event

There are many similar questions and answers already posted, but I could not find one with these differences: 1) the count of NULLs starts over, and 2) a math function is applied to the replaced value.
An event either takes place or not (1 or NULL), by date, by customer. You can assume that a customer has one and only one row for every date.
I want to replace the NULLs with a decay function based on the number of consecutive NULLs (time since the event). A customer can have the event every day, skip a day, or skip multiple days. But once the event takes place, the decay starts over. Currently my decay is divide-by-2, but that is just an example.
DT          CUSTOMER  EVENT  DESIRED
-------------------------------------
2022-01-01  a         1      1
2022-01-02  a         1      1
2022-01-03  a         1      1
2022-01-04  a         1      1
2022-01-05  a         1      1
2022-01-01  b         1      1
2022-01-02  b         NULL   0.5
2022-01-03  b         NULL   0.25
2022-01-04  b         1      1
2022-01-05  b         NULL   0.5
I can produce the desired result, but it is very unwieldy. I'm looking for a better way. This will also need to be extended to multiple event columns.
create or replace temporary table the_data (
dt date,
customer char(10),
event int,
desired float)
;
insert into the_data values ('2022-01-01', 'a', 1, 1);
insert into the_data values ('2022-01-02', 'a', 1, 1);
insert into the_data values ('2022-01-03', 'a', 1, 1);
insert into the_data values ('2022-01-04', 'a', 1, 1);
insert into the_data values ('2022-01-05', 'a', 1, 1);
insert into the_data values ('2022-01-01', 'b', 1, 1);
insert into the_data values ('2022-01-02', 'b', NULL, 0.5);
insert into the_data values ('2022-01-03', 'b', NULL, 0.25);
insert into the_data values ('2022-01-04', 'b', 1, 1);
insert into the_data values ('2022-01-05', 'b', NULL, 0.5);
with
base as (
select * from the_data
),
find_nan as (
select *, case when event is null then 1 else 0 end as event_is_nan from base
),
find_nan_diff as (
select *, event_is_nan - coalesce(lag(event_is_nan) over (partition by customer order by dt), 0) as event_is_nan_diff from find_nan
),
find_nan_group as (
select *, sum(case when event_is_nan_diff = -1 then 1 else 0 end) over (partition by customer order by dt) as nan_group from find_nan_diff
),
consec_nans as (
select *, sum(event_is_nan) over (partition by customer, nan_group order by dt) as n_consec_nans from find_nan_group
),
decay as (
select *, case when n_consec_nans > 0 then 0.5 / n_consec_nans else 1 end as decay_factor from consec_nans
),
ffill as (
select *, first_value(event) over (partition by customer order by dt) as ffill_value from decay
),
final as (
select *, ffill_value * decay_factor as the_answer from ffill
)
select * from final
order by customer, dt
;
Thanks

The query could be simplified by using CONDITIONAL_CHANGE_EVENT to generate subgrp helper column:
WITH cte AS (
SELECT *, CONDITIONAL_CHANGE_EVENT(event IS NULL) OVER(PARTITION BY CUSTOMER
ORDER BY DT) AS subgrp
FROM the_data
)
SELECT *, COALESCE(EVENT, 0.5 / ROW_NUMBER() OVER(PARTITION BY CUSTOMER, SUBGRP
ORDER BY DT)) AS computed_decay
FROM cte
ORDER BY CUSTOMER, DT;
Output:
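(The original output screenshot is not reproduced here; the rows below for customer b are worked out by hand from the sample data, so treat them as illustrative.)

DT          CUSTOMER  EVENT  SUBGRP  COMPUTED_DECAY
----------------------------------------------------
2022-01-01  b         1      0       1
2022-01-02  b         NULL   1       0.5
2022-01-03  b         NULL   1       0.25
2022-01-04  b         1      2       1
2022-01-05  b         NULL   3       0.5

Customer a never has a NULL event, so SUBGRP stays 0 and COMPUTED_DECAY is 1 on every row.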
EDIT:
Without using CONDITIONAL_CHANGE_EVENT:
WITH cte AS (
SELECT *,
CASE WHEN
event = LAG(event,1, event) OVER(PARTITION BY customer ORDER BY dt)
OR (event IS NULL AND LAG(event) OVER(PARTITION BY customer ORDER BY dt) IS NULL)
THEN 0 ELSE 1 END AS l
FROM the_data
), cte2 AS (
SELECT *, SUM(l) OVER(PARTITION BY customer ORDER BY dt) AS SUBGRP
FROM cte
)
SELECT *, COALESCE(EVENT, 0.5 / ROW_NUMBER() OVER(PARTITION BY CUSTOMER, SUBGRP
ORDER BY DT)) AS computed_decay
FROM cte2
ORDER BY CUSTOMER, DT;
db<>fiddle demo
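The question notes that this will need to be extended to multiple event columns. One possible sketch (untested; event2 is a hypothetical second column, not part of the posted table) is to compute one change-event grouping per column and reuse the same COALESCE pattern:

WITH cte AS (
    SELECT *,
           CONDITIONAL_CHANGE_EVENT(event  IS NULL) OVER (PARTITION BY customer ORDER BY dt) AS subgrp1,
           CONDITIONAL_CHANGE_EVENT(event2 IS NULL) OVER (PARTITION BY customer ORDER BY dt) AS subgrp2  -- event2 is hypothetical
    FROM the_data
)
SELECT *,
       COALESCE(event,  0.5 / ROW_NUMBER() OVER (PARTITION BY customer, subgrp1 ORDER BY dt)) AS decay1,
       COALESCE(event2, 0.5 / ROW_NUMBER() OVER (PARTITION BY customer, subgrp2 ORDER BY dt)) AS decay2
FROM cte
ORDER BY customer, dt;

Each event column gets its own restart points, so the decays remain independent of one another.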

Related

Get range of dates from dates record in MS SQL

I have a record of dates:
with DateTable (dateItem) as
(
select '2022-07-03' union all
select '2022-07-05' union all
select '2022-07-04' union all
select '2022-07-09' union all
select '2022-07-12' union all
select '2022-07-13' union all
select '2022-07-18'
)
select dateItem
from DateTable
order by 1 asc
I want to get ranges of consecutive dates from this record, like this:
with DateTableRange (dateItemStart, dateItemend) as
(
select '2022-07-03','2022-07-05' union all
select '2022-07-09','2022-07-09' union all
select '2022-07-12','2022-07-13' union all
select '2022-07-18','2022-07-18'
)
select dateItemStart, dateItemend
from DateTableRange
I am able to do it in SQL by looping with WHILE: take the first date, check the following dates, and while each is one day later extend the end date, then repeat for the next range. But I don't know what the best or most optimized way is, as there is a lot of looping and temp tables involved.
Edited:
As in the data we have 3, 4, 5 and then 6, 7, 8 are missing, so the range is 3-5.
9 exists and 10 is missing, so the range is 9-9.
So the ranges depend purely on the consecutive dates in DateTable.
Any suggestion will be appreciated.
With some additional clarity, this requires a gaps-and-islands approach: first identify adjacent rows as groups, from which you can then use a window to identify the first and last value of each group.
I'm sure this could be refined further, but it should give your desired results:
with DateTable (dateItem) as
(
select '2022-07-03' union all
select '2022-07-05' union all
select '2022-07-04' union all
select '2022-07-09' union all
select '2022-07-12' union all
select '2022-07-13' union all
select '2022-07-18'
), valid as (
select *,
case when exists (
select * from DateTable d2 where Abs(DateDiff(day, d.dateitem, d2.dateitem)) = 1
) then 1 else 0 end v
from DateTable d
), grp as (
select *,
Row_Number() over(order by dateitem) - Row_Number()
over (partition by v order by dateitem) g
from Valid v
)
select distinct
Iif(v = 0, dateitem, First_Value(dateitem) over(partition by g order by dateitem)) DateItemStart,
Iif(v = 0, dateitem, First_Value(dateitem) over(partition by g order by dateitem desc)) DateItemEnd
from grp
order by dateItemStart;
See Demo Fiddle
After clarification, this is definitely a 'gaps and islands' problem.
The solution can look like this:
WITH DateTable(dateItem) AS
(
SELECT * FROM (
VALUES
('2022-07-03'),
('2022-07-05'),
('2022-07-04'),
('2022-07-09'),
('2022-07-12'),
('2022-07-13'),
('2022-07-18')
) t(v)
)
SELECT
MIN(dateItem) AS range_from,
MAX(dateItem) AS range_to
FROM (
SELECT
*,
SUM(CASE WHEN DATEADD(day, 1, prev_dateItem) >= dateItem THEN 0 ELSE 1 END) OVER (ORDER BY rn) AS range_id
FROM (
SELECT
ROW_NUMBER() OVER (ORDER BY dateItem) AS rn,
CAST(dateItem AS date) AS dateItem,
CAST(LAG(dateItem) OVER (ORDER BY dateItem) AS date) AS prev_dateItem
FROM DateTable
) groups
) islands
GROUP BY range_id
You can check a working demo

Get last date of modification in database by value

How is it possible to get the date of the last change in this table:
id  date        value
------------------------
1   01.01.2021  0.0
1   02.01.2021  10.0
1   03.01.2021  15.0
1   04.01.2021  25.0
1   05.01.2021  25.0
1   06.01.2021  25.0
Of course I could use a WHERE clause and it would work, but I have a lot of rows and for some I don't know exactly which day this happened.
The result should be:
id  date        value
------------------------
1   04.01.2021  25.0
Try this one:
with mytable as (
select 1 as id, date '2021-01-01' as date, 0 as value union all
select 1, date '2021-01-02', 10 union all
select 1, date '2021-01-03', 15 union all
select 1, date '2021-01-04', 25 union all
select 1, date '2021-01-05', 25 union all
select 1, date '2021-01-06', 25
)
select id, array_agg(struct(date, value) order by last_change_date desc limit 1)[offset(0)].*
from (
select *, if(value != lag(value) over (partition by id order by date), date, null) as last_change_date
from mytable
)
group by id
In this scenario I would use two fields in the database, "created_at" and "updated_at", with the type "timestamp". You can then simply fetch your records ordering by the "updated_at" field.
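A minimal sketch of that idea (my_table and updated_at are placeholder names, not from the question; the row-limiting syntax depends on the database):

SELECT id, value, updated_at
FROM my_table                 -- hypothetical table with an updated_at timestamp column
WHERE id = 1
ORDER BY updated_at DESC      -- most recently modified row first
LIMIT 1;                      -- or TOP (1) / FETCH FIRST 1 ROW ONLY, depending on the dialect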
see what this gives you:
SELECT MAX(date) OVER (PARTITION BY(value)) AS lastChange
FROM Table
WHERE id = 1
The following query (with a reproducible example on db-fiddle) works. I've also included some additional test records.
CREATE TABLE my_data (
`id` INTEGER,
`date` date,
`value` INTEGER
);
INSERT INTO my_data
(`id`, `date`, `value`)
VALUES
('1', '01.01.2021', '0.0'),
('1', '02.01.2021', '10.0'),
('1', '03.01.2021', '15.0'),
('1', '04.01.2021', '25.0'),
('1', '05.01.2021', '25.0'),
('1', '06.01.2021', '25.0'),
('2', '05.01.2021', '25.0'),
('2', '06.01.2021', '23.0'),
('3', '03.01.2021', '15.0'),
('3', '04.01.2021', '25.0'),
('3', '05.01.2021', '17.0'),
('3', '06.01.2021', '17.0');
Query #1
SELECT
id,
date,
value
FROM (
SELECT
*,
row_number() over (partition by id order by date desc) as id_rank
FROM (
SELECT
id,
m1.date,
m1.value,
rank() over (partition by id,m1.value order by date asc) as id_value_rank,
CASE
WHEN (m1.date = (max(m1.date) over (partition by id,m1.value ))) THEN 1
ELSE 0
END AS is_max_date_for_group,
CASE
WHEN (m1.date = (max(m1.date) over (partition by id ))) THEN 1
ELSE 0
END AS is_max_date_for_id
from
my_data m1
) m2
WHERE (m2.is_max_date_for_group = m2.is_max_date_for_id and is_max_date_for_group <> 0 and id_value_rank=1) or (id_value_rank=1 and is_max_date_for_id=0)
) t
where t.id_rank=1
order by id, date, value;
id  date        value
------------------------
1   04.01.2021  25
2   06.01.2021  23
3   05.01.2021  17
View on DB Fiddle
I actually find that the simplest method is to enumerate the rows by id/date and by id/date/value in descending order. These are the same for the last group . . . and the rest is aggregation:
select id, min(date), value
from (select t.*,
row_number() over (partition by id order by date desc) as seqnum,
row_number() over (partition by id, value order by date desc) as seqnum_2
from t
) t
where seqnum = seqnum_2
group by id;
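Traced by hand against the sample data for id = 1 (worth double-checking), the two enumerations only line up on the trailing run of 25s, which is why seqnum = seqnum_2 isolates the last group:

date        value  seqnum  seqnum_2
06.01.2021  25     1       1
05.01.2021  25     2       2
04.01.2021  25     3       3
03.01.2021  15     4       1
02.01.2021  10     5       1
01.01.2021  0      6       1

Aggregating the three matching rows gives min(date) = 04.01.2021 with value = 25, as desired.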
If you use lag(), I would recommend using qualify for performance:
select t.*
from (select t.*
from t
qualify lag(value) over (partition by id order by date) <> value or
lag(value) over (partition by id order by date) is null
) t
qualify row_number() over (partition by id order by date desc) = 1;
Note: Both of these work if the value is the same for all rows. Other methods may not work in that situation.

Sum last two records including last record of a group

In SQL Server 2017, how do I sum the last two records and show the last record in a single query?
CREATE TABLE Billing
(
Customer CHAR(12),
Month INT,
Amount INT
)
GO
INSERT INTO Billing VALUES ('AAAA', 3, 5)
INSERT INTO Billing VALUES ('AAAA', 2, 0)
INSERT INTO Billing VALUES ('AAAA', 1, 2)
INSERT INTO Billing VALUES ('BBBB', 10, 0)
INSERT INTO Billing VALUES ('BBBB', 12, 1)
INSERT INTO Billing VALUES ('BBBB', 11, 0)
INSERT INTO Billing VALUES ('BBBB', 13, 6)
Expected output:
Customer    Total Last 2 Bills    Last Bill
-------------------------------------------
AAAA        5                     5
BBBB        7                     6
I tried using SUM with LAST_VALUE and an ORDER BY.
You can filter out rows by using the ROW_NUMBER() window function, as in:
select
customer,
sum(amount) as total_last_2_bills,
sum(case when rn = 1 then amount else 0 end) as last_bill
from (
select
*,
row_number() over (partition by customer order by month desc) as rn
from billing
) x
where rn <= 2
group by customer
See SQL Fiddle.
You can use window functions:
select customer, (prev_amount + amount), amount
from (select b.*,
lag(amount) over (partition by customer order by month) as prev_amount,
lead(month) over (partition by customer order by month) as next_month
from billing b
) b
where next_month is null;
If you want to ignore values of 0, then filter:
select customer, (coalesce(prev_amount, 0) + amount), amount
from (select b.*,
lag(amount) over (partition by customer order by month) as prev_amount,
lead(month) over (partition by customer order by month) as next_month
from billing b
where amount <> 0
) b
where next_month is null;

Calculating average by using the previous row's value and following row's value

I have calculated average values for each month. Some months are NULL, and my manager wants me to fill those NULL months using the previous row's value and the following row's value.
The current result and the expected result were shown as screenshots (not reproduced here). Here is my query:
DECLARE #DATE DATE = '2017-01-01';
WITH DATEDIM AS
(
SELECT DISTINCT DTM.FirstDayOfMonth
FROM DATEDIM DTM
WHERE Date >= '01/01/2017'
AND Date <= DATEADD(mm,-1,Getdate())
),
Tab1 AS
(
SELECT
T1.FirstDayOfMonth AS MONTH_START,
AVG1,
ROW_NUMBER() OVER (
ORDER BY DATEADD(MM,DATEDIFF(MM, 0, T1.FirstDayOfMonth),0) DESC
) AS RNK
FROM DATEDIM T1
LEFT OUTER JOIN (
SELECT
DATEADD(MM,DATEDIFF(MM, 0, StartDate),0) MONTH_START,
AVG(CAST(DATEDIFF(dd, StartDate, EndDate) AS FLOAT)) AS AVG1
FROM DATATable
WHERE EndDate >= StartDate
AND StartDate >= #DATE
AND EndDate >= #DATE
GROUP BY DATEADD(MM,DATEDIFF(MM, 0, StartDate),0)
) T2 ON T1.FirstDayOfMonth = T2.MONTH_START
)
SELECT *
FROM Tab1
Using your CTEs
select MONTH_START,
case when AVG1 is null then
(select top(1) t2.AVG1
from Tab1 t2
where t1.RNK > t2.RNK and t2.AVG1 is not null
order by t2.RNK desc)
else AVG1 end AVG1,
RNK
from Tab1 t1
Edit
Version for an average of the nearest preceding and nearest following non-NULLs. Both must exist, otherwise NULL is returned.
select MONTH_START,
case when AVG1 is null then
( (select top(1) t2.AVG1
from Tab1 t2
where t1.RNK > t2.RNK and t2.AVG1 is not null
order by t2.RNK desc)
+(select top(1) t2.AVG1
from Tab1 t2
where t1.RNK < t2.RNK and t2.AVG1 is not null
order by t2.RNK)
) / 2
else AVG1 end AVG1,
RNK
from Tab1 t1
I can't quite tell what you are trying to calculate the average of, but this is quite simple with window functions:
select t.*,
avg(val) over (order by month_start rows between 1 preceding and 1 following)
from t;
In your case, I think this translates as:
select datefromparts(year(startdate), month(startdate), 1) as month_start,
avg(val) as monthaverage,
avg(avg(val)) over (order by min(startdate) rows between 1 preceding and 1 following)
from datatable d
where . . .
group by datefromparts(year(startdate), month(startdate), 1)
You can manipulate previous and following row values using window functions:
SELECT MAX(row_value) OVER(
ORDER BY ... ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS Previous_Value,
MAX(row_value) OVER(
ORDER BY ... ROWS BETWEEN 1 FOLLOWING AND 1 FOLLOWING) AS Next_Value
Alternatively you can use LAG/LEAD functions and modify your sub-query where you get the AVG:
SELECT
src.MONTH_START,
CASE
WHEN src.prev_val IS NULL OR src.next_val IS NULL
THEN COALESCE(src.prev_val, src.next_val) -- Return non-NULL value (if exists)
ELSE (src.prev_val + src.next_val ) / 2
END AS AVG_new
FROM (
SELECT
DATEADD(MM,DATEDIFF(MM, 0, StartDate),0) MONTH_START,
LAG(CAST(DATEDIFF(dd, StartDate, EndDate) AS FLOAT)) OVER(ORDER BY ...) AS prev_val,
LEAD(CAST(DATEDIFF(dd, StartDate, EndDate) AS FLOAT)) OVER(ORDER BY ...) AS next_val
-- AVG(CAST(DATEDIFF(dd, StartDate, EndDate) AS FLOAT)) AS AVG1
FROM DATATable
WHERE EndDate >= StartDate
AND StartDate >= #DATE
AND EndDate >= #DATE
GROUP BY DATEADD(MM,DATEDIFF(MM, 0, StartDate),0)
) AS src
I haven't tested it, but give it a shot and see how it works. You may need to put at least one column in the ORDER BY portion of the window function.
You could try this query (in my sample data I reflected only the relevant parts and omitted the date column):
declare #tbl table (rank int, value int);
insert into #tbl values
(1, null),
(2, 20),
(3, 30),
(4, null),
(5, null),
(6, null),
(7, 40),
(8, null),
(9, null),
(10, 36),
(11, 22);
;with cte as (
select *,
DENSE_RANK() over (order by case when value is null then rank else value end) drank,
case when value is null then lag(value) over (order by rank) end lag,
case when value is null then lead(value) over (order by rank) end lead
from #tbl
)
select rank, value, case when value is null then
max(lag) over (partition by grp) / 2 +
max(lead) over (partition by grp) / 2
else value end valueWithAvg
from (
select *,
rank - drank grp from cte
) a order by rank

SQL server to do like Group By task

I have a table with SQL server as below,
Date Value
---------------------------------------------------
08-01-2016 1
08-02-2016 1
08-03-2016 1
08-04-2016 1
08-05-2016 1
08-06-2016 2
08-07-2016 2
08-08-2016 2
08-09-2016 2.5
08-10-2016 1
08-11-2016 1
Since the original table is too large, even when I used 'Results to file' it still raised the exception 'System.OutOfMemoryException'. That's why I want to reorganize the table, but I don't have good logic to deal with it. Therefore, I want to change the table into the form below.
Date_from Date_to Value
-------------------------------------------------
08-01-2016 08-05-2016 1
08-06-2016 08-08-2016 2
08-09-2016 08-09-2016 2.5
08-10-2016 08-11-2016 1
I appreciate your ideas!
This is commonly called a gaps-and-islands problem. Here is one trick to do it:
;WITH data
AS (SELECT *,Lag(Value, 1)OVER(ORDER BY Dates) [pVal]
FROM (VALUES ('08-01-2016',1 ),
('08-02-2016',1 ),
('08-03-2016',1 ),
('08-04-2016',1 ),
('08-05-2016',1 ),
('08-06-2016',2 ),
('08-07-2016',2 ),
('08-08-2016',2 ),
('08-09-2016',2.5 ),
('08-10-2016',1 ),
('08-11-2016',1 )) tc (Dates, Value)),
intr
AS (SELECT Dates,
Value,
Sum(Iif(pVal = Value, 0, 1)) OVER(ORDER BY Dates) AS [Counter]
FROM data)
SELECT Min(Dates) AS Dates_from,
Max(Dates) AS Dates_to,
Value
FROM intr
GROUP BY [Counter],
Value
The cumulative sum/lag approach is one method. In this case, a simpler method is:
select min(date) as date_from, max(date) as date_to, value
from (select t.*,
dateadd(day, - row_number() over (partition by value order by date),date) as grp
from t
) t
group by value, grp;
This uses the observation that the dates are consecutive with no gaps. Hence, subtracting a sequence from the date will yield a constant -- when the values are the same.
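To see why the subtraction works, here is the grp computation traced by hand for the sample data (each island collapses to a single constant per value):

Date        Value  rn (per value)  grp = Date - rn days
08-01-2016  1      1               07-31-2016
08-02-2016  1      2               07-31-2016
08-03-2016  1      3               07-31-2016
08-04-2016  1      4               07-31-2016
08-05-2016  1      5               07-31-2016
08-06-2016  2      1               08-05-2016
08-07-2016  2      2               08-05-2016
08-08-2016  2      3               08-05-2016
08-09-2016  2.5    1               08-08-2016
08-10-2016  1      6               08-04-2016
08-11-2016  1      7               08-04-2016

Grouping by (value, grp) then yields exactly the four ranges in the expected output.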
Here is an example:
DECLARE #T TABLE (
[Date] DATE,
[Value] DECIMAL(9,2)
)
INSERT #T VALUES
( '08-01-2016', 1 ),
( '08-02-2016', 1 ),
( '08-03-2016', 1 ),
( '08-04-2016', 1 ),
( '08-05-2016', 1 ),
( '08-06-2016', 2 ),
( '08-07-2016', 2 ),
( '08-08-2016', 2 ),
( '08-09-2016', 2.5 ),
( '08-10-2016', 1 ),
( '08-11-2016', 1 )
SELECT * FROM #T
SELECT A.[Date] StartDate, B.[Date] EndDate, A.[Value] FROM (
SELECT A.*, ROW_NUMBER() OVER (ORDER BY A.[Date], A.[Value]) O FROM #T A
LEFT JOIN #T B ON B.[Value] = A.[Value] AND B.[Date] = DATEADD(d, -1, A.[Date])
WHERE B.[Date] IS NULL
) A
JOIN (
SELECT A.*, ROW_NUMBER() OVER (ORDER BY A.[Date], A.[Value]) O FROM #T A
LEFT JOIN #T B ON B.[Value] = A.[Value] AND B.[Date] = DATEADD(d, 1, A.[Date])
WHERE B.[Date] IS NULL
) B ON B.O = A.O
Prdp's solution is great, but just in case anyone is still using SQL Server 2008, where LAG() and the Parallel Data Warehouse (PDW) features are not available, here is an alternative:
SAMPLE DATA:
IF OBJECT_ID('tempdb..#Temp') IS NOT NULL
DROP TABLE #Temp;
CREATE TABLE #Temp([Dates] DATE
, [Value] FLOAT);
INSERT INTO #Temp([Dates]
, [Value])
VALUES
('08-01-2016'
, 1),
('08-02-2016'
, 1),
('08-03-2016'
, 1),
('08-04-2016'
, 1),
('08-05-2016'
, 1),
('08-06-2016'
, 2),
('08-07-2016'
, 2),
('08-08-2016'
, 2),
('08-09-2016'
, 2.5),
('08-10-2016'
, 1),
('08-11-2016'
, 1);
QUERY:
;WITH Seq
AS (SELECT SeqNo = ROW_NUMBER() OVER(ORDER BY [Dates]
, [Value])
, t.Dates
, t.[Value]
FROM #Temp t)
SELECT StartDate = MIN([Dates])
, EndDate = MAX([Dates])
, [Value]
FROM
(SELECT [Value]
, [Dates]
, SeqNo
, rn = SeqNo - ROW_NUMBER() OVER(PARTITION BY [Value] ORDER BY SeqNo)
FROM Seq s) a
GROUP BY [Value]
, rn
ORDER BY StartDate;
RESULTS:
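(The results grid from the fiddle is not reproduced here; based on the sample data, the query should return the four islands from the question's expected output, roughly:)

StartDate   EndDate     Value
2016-08-01  2016-08-05  1
2016-08-06  2016-08-08  2
2016-08-09  2016-08-09  2.5
2016-08-10  2016-08-11  1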