SQL Table Join with Weighting - sql

I'm trying to create a table that will calculate the weights of spend for a customer in the months they shop for a period. For example, the following customer (faux data) has this spend profile:
/* Customer spend */
create or replace temp table ts_all_transactions
(
inferred_customer_id varchar(128)
,nw_date date
,spend number(21,2)
);
insert into ts_all_transactions
values
('52f5','2019-06-01',17.35)
,('52f5','2018-11-01',24.85)
,('52f5','2019-12-01',1.40)
,('52f5','2019-01-01',2.45)
,('52f5','2019-03-01',3.90)
,('52f5','2020-01-01',37.55)
,('52f5','2019-10-01',13.20)
,('52f5','2019-09-01',5.70)
;
A calendar containing the months in which a period falls in, along with a weighting is then created:
-- Calculate weights for each period of the time series
-- Create a staging table
create or replace temp table period_dimension as
select abs(seq4()-12) as period,
dateadd(month, seq4(), dateadd(month, -23, date_trunc('Month', current_date()))) as start_date,
dateadd(month, 12, start_date) as end_date
from table(generator(rowcount => 12)) -- number of months after reference date in previous line
;
select * from period_dimension;
create or replace temp table my_date_dimension
(
my_date date not null
,year smallint not null
,month smallint not null
,month_name char(3) not null
,day_of_mon smallint not null
,day_of_week varchar(9) not null
,week_of_year smallint not null
,day_of_year smallint not null
)
as
with my_date as (
select
seq4(),
dateadd(month, seq4(), dateadd(month, -23, date_trunc('Month', current_date()))) as my_date
from table(generator(rowcount=>23))
)
select my_date
,year(my_date)
,month(my_date)
,monthname(my_date)
,day(my_date)
,dayofweek(my_date)
,weekofyear(my_date)
,dayofyear(my_date)
from my_date
;
create or replace table weight_lookup as
select
a.period
,b.my_date
,rank() over (partition by a.period order by b.my_date) as weight
from period_dimension a
inner join my_date_dimension b
where b.my_date >= a.start_date
and b.my_date < a.end_date
order by 1,2
;
-- Create a staging table
create or replace temp table period_dimension2 as
select abs(seq4()-12) as period,
dateadd(month, seq4(), dateadd(month, -23, date_trunc('Month', current_date()))) as start_date,
last_day(dateadd(month, 11, start_date)) as end_date
from table(generator(rowcount => 12)) -- number of months after reference date in previous line
;
The above is then used to calculate an average spend based on the months the customer shops in the period, however, I'm not getting the output I expect:
-- For each month of each period, group all together by period here so we have 12 periods
-- so each period represents 12 rolling months with period 12 being the oldest period
create or replace temp table ts_spend_time as
select
a.inferred_customer_id
,b.period
,max(a.nw_date) as max_mnth /* Month in period where most spend was made */
,sum(a.spend * b.weight) / 78 as avg_spend /* Sum of weights 12,11,10...1 to give 78 */
from ts_all_transactions a
inner join weight_lookup b on a.nw_date = b.my_date
inner join period_dimension2 c on b.my_date = c.start_date and b.period = c.period
where b.my_date >= c.start_date
and b.my_date <= c.end_date
group by 1,2
order by 1 desc, 2,3
;
The output I get from the above code is this:
create or replace temp table ts_spend_time_wrong_out
(
inferred_customer_id varchar(128)
,period number(11)
,max_mnth date
,avg_spend number(38,8)
);
insert into ts_spend_time_wrong_out
values
('52f5',3,'2019-03-01',0.05000000)
,('52f5',5,'2019-01-01',0.03141026)
,('52f5',7,'2018-11-01',0.31858974)
;
I would like to get an output like this:
create or replace temp table ts_spend_time_should_be
(
inferred_customer_id varchar(128)
,period number(11)
,max_mnth date
,avg_spend number(38,8)
);
insert into ts_spend_time_should_be
values
('52f5',1,'01JAN2020',6.301923077)
,('52f5',2,'01JAN2020',7.266025641)
,('52f5',3,'01JAN2020',8.280128205)
,('52f5',4,'01JAN2020',9.294230769)
,('52f5',5,'01DEC2019',4.081410256)
,('52f5',6,'01OCT2019',4.412179487)
,('52f5',7,'01OCT2019',5.276923077)
,('52f5',8,'01SEP2019',3.941666667)
,('52f5',9,'01JUN2019',3.687179487)
,('52f5',10,'01JUN2019',4.309615385)
,('52f5',11,'01JUN2019',4.932051282)
,('52f5',12,'01MAR2019',2.662820513)
;
In the correct solution example, the average spend is calculated by period as follows: ((17.35*2)+(5.7*5)+(13.20*6)+(1.4*8)+(37.55*9)) / 78
How can I resolve this? TIA

firstly you should use row_number() over(order by seq4()) as there can be gaps in a seq()
so working half way through you question
with ts_all_transactions as (
select id, nw_date::date as nw_date, spend from values
('52f5','2019-06-01',17.35)
,('52f5','2018-11-01',24.85)
,('52f5','2019-12-01',1.40)
,('52f5','2019-01-01',2.45)
,('52f5','2019-03-01',3.90)
,('52f5','2020-01-01',37.55)
,('52f5','2019-10-01',13.20)
,('52f5','2019-09-01',5.70)
v(id,nw_date, spend)
), period_dimension as (
select
row_number() over(order by seq4())-1 as rn0,
abs(rn0-12) as period,
dateadd('month', rn0, dateadd(month, -23, date_trunc('Month', current_date()))) as start_date,
dateadd('month', 12, start_date) as end_date
from table(generator(rowcount => 12)) -- number of months after reference date in previous line
), weight_periods as (
select p.period
,p.start_date
,p.end_date
,row_number() over(partition by p.period order by seq4())-1 as rn1
,dateadd('month',-rn1, p.end_date ) as weight_month
,12 - rn1 + 1 as weight
from period_dimension p,
table(generator(rowcount => 12))
), monthly_spends as (
select id
,date_trunc('month', nw_date) as m_date
,sum(spend) as s_spend
from ts_all_transactions
group by 1,2
)
select m.id
,w.period
,w.end_date
,w.weight_month
,m.s_spend
,w.weight
,m.s_spend * w.weight as w_spend
from monthly_spends m
join weight_periods w on m.m_date = w.weight_month
order by 1,2,3,4;
gives:
ID PERIOD END_DATE WEIGHT_MONTH S_SPEND WEIGHT W_SPEND
52f5 1 2020-05-01 2019-06-01 17.35 2 34.70
52f5 1 2020-05-01 2019-09-01 5.70 5 28.50
52f5 1 2020-05-01 2019-10-01 13.20 6 79.20
52f5 1 2020-05-01 2019-12-01 1.40 8 11.20
52f5 1 2020-05-01 2020-01-01 37.55 9 337.95
52f5 2 2020-04-01 2019-06-01 17.35 3 52.05
...
This shows up to this point we can see the inputs to the values you are expecting for the "weighted average" which can be done via :
select m.id
,w.period
,sum(m.s_spend * w.weight) as t_w_spend
,round(t_w_spend / 78,3) as weighted_avg_spend
from monthly_spends m
join weight_periods w on m.m_date = w.weight_month
group by 1,2
order by 1,2;
which gives:
ID PERIOD T_W_SPEND WEIGHTED_AVG_SPEND
52f5 1 491.55 6.302
52f5 2 566.75 7.266
52f5 3 641.95 8.230
52f5 4 724.95 9.294
52f5 5 804.05 10.308
52f5 6 362.35 4.646
52f5 7 386.75 4.958
52f5 8 479.05 6.142
52f5 9 361.70 4.637
52f5 10 336.15 4.310
52f5 11 384.70 4.932
52f5 12 433.25 5.554
which starts the same, but diverge as I think your date periods are done "wrong"
but the next point is you have this line
,max(a.nw_date) as max_mnth /* Month in period where most spend was made */
which does not do what you comment it as doing..
what it does is find the max date value in the aggregate.
to do that you need to go back to the monthly results sql and put a first_value() into the mix then select the results via:
id, period, max_spend_month, sum(w_spend)/78 as weighted_avg_spend
from (
select m.id
,w.period
,w.end_date
,w.weight_month
,m.s_spend
,w.weight
,m.s_spend * w.weight as w_spend
,first_value(w.weight_month) over (partition by m.id, w.period order by m.s_spend desc) as max_spend_month
from monthly_spends m
join weight_periods w on m.m_date = w.weight_month
)
group by 1,2,3
order by 1,2;
which now matches your expectations:
ID PERIOD MAX_SPEND_MONTH WEIGHTED_AVG_SPEND
52f5 1 2020-01-01 6.30192308
52f5 2 2020-01-01 7.26602564
52f5 3 2020-01-01 8.23012821
52f5 4 2020-01-01 9.29423077
52f5 5 2020-01-01 10.30833333
52f5 6 2019-06-01 4.64551282
52f5 7 2019-06-01 4.95833333
52f5 8 2018-11-01 6.14166667
52f5 9 2018-11-01 4.63717949
52f5 10 2018-11-01 4.30961538
52f5 11 2018-11-01 4.93205128
52f5 12 2018-11-01 5.55448718

Related

How to create a start and end date with no gaps from one date column and to sum a value within the dates

I am new SQL coding using in SQL developer.
I have a table that has 4 columns: Patient ID (ptid), service date (dt), insurance payment amount (insr_amt), out of pocket payment amount (op_amt). (see table 1 below)
What I would like to do is (1) create two columns "start_dt" and "end_dt" using the "dt" column where if there are no gaps in the date by the patient ID then populate the start and end date with the first and last date by patient ID, however if there is a gap in service date within the patient ID then to create the separate start and end date rows per patient ID, along with (2) summing the two payment amounts by patient ID with in the one set of start and end date visits (see table 2 below).
What would be the way to run this using SQL code in SQL developer?
Thank you!
Table 1:
Ptid
dt
insr_amt
op_amt
A
1/1/2021
30
20
A
1/2/2021
30
10
A
1/3/2021
30
10
A
1/4/2021
30
30
B
1/6/2021
10
10
B
1/7/2021
20
10
C
2/1/2021
15
30
C
2/2/2021
15
30
C
2/6/2021
60
30
Table 2:
Ptid
start_dt
end_dt
total_insr_amt
total_op_amt
A
1/1/2021
1/4/2021
120
70
B
1/6/2021
1/7/2021
30
20
C
2/1/2021
2/2/2021
30
60
C
2/6/2021
2/6/2021
60
30
You didn't mention the specific database so this solution works in PostgreSQL. You can do:
select
ptid,
min(dt) as start_dt,
max(dt) as end_dt,
sum(insr_amt) as total_insr_amt,
sum(op_amt) as total_op_amt
from (
select *,
sum(inc) over(partition by ptid order by dt) as grp
from (
select *,
case when dt - interval '1 day' = lag(dt) over(partition by ptid order by dt)
then 0 else 1 end as inc
from t
) x
) y
group by ptid, grp
order by ptid, grp
Result:
ptid start_dt end_dt total_insr_amt total_op_amt
----- ---------- ---------- -------------- -----------
A 2021-01-01 2021-01-04 120 70
B 2021-01-06 2021-01-07 30 20
C 2021-02-01 2021-02-02 30 60
C 2021-02-06 2021-02-06 60 30
See running example at DB Fiddle 1.
EDIT for Oracle
As requested, the modified query that works in Oracle is:
select
ptid,
min(dt) as start_dt,
max(dt) as end_dt,
sum(insr_amt) as total_insr_amt,
sum(op_amt) as total_op_amt
from (
select x.*,
sum(inc) over(partition by ptid order by dt) as grp
from (
select t.*,
case when dt - 1 = lag(dt) over(partition by ptid order by dt)
then 0 else 1 end as inc
from t
) x
) y
group by ptid, grp
order by ptid, grp
See running example at db<>fiddle 2.

SQL : create intermediate data from date range

I have a table as shown here:
USER
ROI
DATE
1
5
2021-11-24
1
4
2021-11-26
1
6
2021-11-29
I want to get the ROI for the dates in between the other dates, expected result will be as below
From 2021-11-24 to 2021-11-30
USER
ROI
DATE
1
5
2021-11-24
1
5
2021-11-25
1
4
2021-11-26
1
4
2021-11-27
1
4
2021-11-28
1
6
2021-11-29
1
6
2021-11-30
You may use a calendar table approach here. Create a table containing all dates and then join with it. Sans an actual table, you may use an inline CTE:
WITH dates AS (
SELECT '2021-11-24' AS dt UNION ALL
SELECT '2021-11-25' UNION ALL
SELECT '2021-11-26' UNION ALL
SELECT '2021-11-27' UNION ALL
SELECT '2021-11-28' UNION ALL
SELECT '2021-11-29' UNION ALL
SELECT '2021-11-30'
),
cte AS (
SELECT USER, ROI, DATE, LEAD(DATE) OVER (ORDER BY DATE) AS NEXT_DATE
FROM yourTable
)
SELECT t.USER, t.ROI, d.dt
FROM dates d
INNER JOIN cte t
ON d.dt >= t.DATE AND (d.dt < t.NEXT_DATE OR t.NEXT_DATE IS NULL)
ORDER BY d.dt;

How to return id users buy several months consecutive?

How can I get all user_id values from the data below, for all rows containing the same user_id value over consecutive months from a given start date in the date column.
For example, given the below table....
date
user_id
2018-11-01
13
2018-11-01
13
2018-11-01
14
2018-11-01
15
2018-12-01
13
2019-01-01
13
2019-01-01
14
...supposing I want to get the user_id values for consecutive months prior to (but not including) 2019-01-01 then I'd have this as my output:
user_id
m_year
13
2018-11
13
2018-12
13
2019-01
probably can be applied windows function
If you want to aggregate on a user and the year-months
select
t.user_id,
to_char(date_trunc('month',t.date),'YYYY-MM') as m_year
from yourtable t
where t.date < '2019-02-01'::date
group by t.user_id, date_trunc('month',t.date)
order by t.user_id, m_year
But if you only want those with consecutive months, then a little extra is needed.
select
user_id,
to_char(ym,'YYYY-MM') as m_year
from
(
select t.user_id
, date_trunc('month',t.date) as ym
, lag(date_trunc('month',t.date))
over (partition by t.user_id order by date_trunc('month',t.date)) as prev_ym
, lead(date_trunc('month',t.date))
over (partition by t.user_id order by date_trunc('month',t.date)) as next_ym
from yourtable t
where t.date < '2019-02-01'::date
group by t.user_id, date_trunc('month',t.date)
) q
where (ym - prev_ym <= '31 days'::interval or
next_ym - ym <= '31 days'::interval)
order by user_id, ym
user_id | m_year
------: | :------
13 | 2018-11
13 | 2018-12
13 | 2019-01
db<>fiddle here
you don't need a window function in this specific query. Just try :
SELECT DISTINCT ON (user_id) user_id, date_trunc('month', date :: date) AS m_year
FROM your_table

Find From/To Dates across multiple rows - SQL Postgres

I want to be able to "book" within range of dates, but you can't book across gaps of days. So booking across multiple rates is fine as long as they are contiguous.
I am happy to change data structure/index, if there are better ways of storing start/end ranges.
So far I have a "rates" table which contains Start/End Periods of time with a daily rate.
e.g. Rates Table.
ID Price From To
1 75.00 2015-04-12 2016-04-15
2 100.00 2016-04-16 2016-04-17
3 50.00 2016-04-18 2016-04-30
For the above data I would want to return:
From To
2015-04-12 2016-4-30
For simplicity sake it is safe to assume that dates are safely consecutive. For contiguous dates To is always 1 day before from.
For the case there is only 1 row, I would want it to return the From/To of that single row.
Also to clarify if I had the following data:
ID Price From To
1 75.00 2015-04-12 2016-04-15
2 100.00 2016-04-17 2016-04-18
3 50.00 2016-04-19 2016-04-30
4 50.00 2016-05-01 2016-05-21
Meaning where there is a gap >= 1 day it would count as a separate range.
In which case I would expect the following:
From To
2015-04-12 2016-04-15
2015-04-17 2016-05-21
Edit 1
After playing around I have come up with the following SQL which seems to work. Although I'm not sure if there are better ways/issues with it?
WITH grouped_rates AS
(SELECT
from_date,
to_date,
SUM(grp_start) OVER (ORDER BY from_date, to_date) group
FROM (SELECT
gite_id,
from_date,
to_date,
CASE WHEN (from_date - INTERVAL '1 DAY') = lag(to_date)
OVER (ORDER BY from_date, to_date)
THEN 0
ELSE 1
END grp_start
FROM rates
GROUP BY from_date, to_date) AS start_groups)
SELECT
min(from_date) from_date,
max(to_date) to_date
FROM grouped_rates
GROUP BY grp;
This is identifying contiguous overlapping groups in the data. One approach is to find where each group begins and then do a cumulative sum. The following query adds a flag indicating if a row starts a group:
select r.*,
(case when not exists (select 1
from rates r2
where r2.from < r.from and r2.to >= r.to or
(r2.from = r.from and r2.id < r.id)
)
then 1 else 0 end) as StartFlag
from rate r;
The or in the correlation condition is to handle the situation where intervals that define a group overlap on the start date for the interval.
You can then do a cumulative sum on this flag and aggregate by that sum:
with r as (
select r.*,
(case when not exists (select 1
from rates r2
where (r2.from < r.from and r2.to >= r.to) or
(r2.from = r.from and r2.id < r.id)
)
then 1 else 0 end) as StartFlag
from rate r
)
select min(from), max(to)
from (select r.*,
sum(r.StartFlag) over (order by r.from) as grp
from r
) r
group by grp;
CREATE TABLE prices( id INTEGER NOT NULL PRIMARY KEY
, price MONEY
, date_from DATE NOT NULL
, date_upto DATE NOT NULL
);
-- some data (upper limit is EXCLUSIVE)
INSERT INTO prices(id, price, date_from, date_upto) VALUES
( 1, 75.00, '2015-04-12', '2016-04-16' )
,( 2, 100.00, '2016-04-17', '2016-04-19' )
,( 3, 50.00, '2016-04-19', '2016-05-01' )
,( 4, 50.00, '2016-05-01', '2016-05-22' )
;
-- SELECT * FROM prices;
-- Recursive query to "connect the dots"
WITH RECURSIVE rrr AS (
SELECT date_from, date_upto
, 1 AS nperiod
FROM prices p0
WHERE NOT EXISTS (SELECT * FROM prices nx WHERE nx.date_upto = p0.date_from) -- no preceding segment
UNION ALL
SELECT r.date_from, p1.date_upto
, 1+r.nperiod AS nperiod
FROM prices p1
JOIN rrr r ON p1.date_from = r.date_upto
)
SELECT * FROM rrr r
WHERE NOT EXISTS (SELECT * FROM prices nx WHERE nx.date_from = r.date_upto) -- no following segment
;
Result:
date_from | date_upto | nperiod
------------+------------+---------
2015-04-12 | 2016-04-16 | 1
2016-04-17 | 2016-05-22 | 3
(2 rows)

Find gaps in time not covered by records with start date and end date

I have a table of fee records (f_fee_item) as follows:
Fee_Item_ID int
Fee_Basis_ID int
Start_Date date
End_Date date
(irrelevant columns removed)
Assume that records for the same Fee_Basis_ID won't overlap.
I need to find the Start_Date and End_Date of each gap in the fee records for each Fee_Basis_ID between a supplied #Query_Start_Date and #Query_End_Date. I need this data to calculate fee accruals for all periods where fees have not been charged.
I also need the query to return a record if there are no fee records at all for a given Fee_Basis_ID (Fee_Basis_ID is a foreign key to D_Fee_Basis.Fee_Basis_ID if that helps).
For example:
#Query_Start_Date = '2011-01-01'
#Query_Start_Date = '2011-09-30'
D_Fee_Basis:
F_Fee_Item
1
2
3
F_Fee_Item:
Fee_Item_ID Fee_Basis_ID Start_Date End_Date
1 1 2011-01-01 2011-03-31
2 1 2011-04-01 2011-06-30
3 2 2011-01-01 2011-03-31
4 2 2011-05-01 2011-06-30
Required Results:
Fee_Basis_ID Start_Date End_Date
1 2011-07-01 2011-09-30
2 2011-04-01 2011-04-30
2 2011-07-01 2011-09-30
3 2011-01-01 2011-09-30
I've bee trying different self-joins for days trying to get it working but with no luck.
Please help!!
Here is a solution:
declare #Query_Start_Date date= '2011-01-01'
declare #Query_End_Date date = '2011-09-30'
declare #D_Fee_Basis table(F_Fee_Item int)
insert #D_Fee_Basis values(1)
insert #D_Fee_Basis values(2)
insert #D_Fee_Basis values(3)
declare #F_Fee_Item table(Fee_Item_ID int, Fee_Basis_ID int,Start_Date date,End_Date date)
insert #F_Fee_Item values(1,1,'2011-01-01','2011-03-31')
insert #F_Fee_Item values(2,1,'2011-04-01','2011-06-30')
insert #F_Fee_Item values(3,2,'2011-01-01','2011-03-31')
insert #F_Fee_Item values(4,2,'2011-05-01','2011-06-30')
;with a as
(-- find all days between Start_Date and End_Date
select #Query_Start_Date d
union all
select dateadd(day, 1, d)
from a
where d < #Query_end_Date
), b as
(--find all unused days
select a.d, F_Fee_Item Fee
from a, #D_Fee_Basis Fee
where not exists(select 1 from #F_Fee_Item where a.d between Start_Date and End_Date and Fee.F_Fee_Item = Fee_Basis_ID)
),
c as
(--find all start dates
select d, Fee, rn = row_number() over (order by fee, d) from b
where not exists (select 1 from b b2 where dateadd(day,1, b2.d) = b.d and b2.Fee= b.Fee)
),
e as
(--find all end dates
select d, Fee, rn = row_number() over (order by fee, d) from b
where not exists (select 1 from b b2 where dateadd(day,-1, b2.d) = b.d and b2.Fee= b.Fee)
)
--join start dates with end dates
select c.Fee Fee_Basis_ID, c.d Start_Date, e.d End_Date from c join e on c.Fee = e.Fee and c.rn = e.rn
option (maxrecursion 0)
Link for result:
https://data.stackexchange.com/stackoverflow/q/114193/