SQL - cumulative sum in postgres - sql

I have my data like this:
item - initial_value - amount - dateofpurchase
A 100 -3 2018-11-22
A 100 -2 2018-11-22
B 200 -5 2018-11-22
B 200 6 2018-11-22
B 200 -1 2018-11-22
(everything is ordered by date and time)
I want to calculate this column, which shows how much stock you have after each step, taking into account the previous amounts:
item - initial_value - amount - dateofpurchase - cumulative
A 100 -3 2018-11-22 97
A 100 -2 2018-11-22 95
B 200 -5 2018-11-22 195
B 200 6 2018-11-22 201
B 200 -1 2018-11-22 200
I've been trying a sum function with unbounded preceding and current row with no luck

You can use a window function and add the running sum of amount to the initial value:
-- Running stock per item: the item's initial value plus every movement so far.
-- Column names follow the question (initial_value, dateofpurchase).
-- An explicit ROWS frame is used because the default RANGE frame treats rows
-- with the same dateofpurchase as peers and would give all of them the same
-- combined total.
-- NOTE(review): if dateofpurchase can repeat within an item, add a unique
-- tiebreaker column to ORDER BY so the running order is deterministic.
select t.*,
       (t.initial_value
        + sum(t.amount) over (
              partition by t.item
              order by t.dateofpurchase
              rows between unbounded preceding and current row
          )
       ) as cumulative
from t;

use window function
-- Sample rows from the question.
with purchases as (
    select 'A' as item, 100 as initial_value, -3 as amount, '2018-11-22'::date as dateofpurchase
    union all
    select 'A', 100, -2, '2018-11-22'
    union all
    select 'B', 200, -5, '2018-11-22'
    union all
    select 'B', 200, 6, '2018-11-22'
    union all
    select 'B', 200, -1, '2018-11-22'
),
-- Number each item's rows in purchase-date order.
numbered as (
    select
        p.*,
        row_number() over (partition by item order by dateofpurchase) as rn
    from purchases p
),
-- Carry the initial value on the first row of each item only, 0 elsewhere.
seeded as (
    select
        *,
        case when rn = 1 then initial_value else 0 end as val
    from numbered
)
-- Running sum of (seed + movement) per item gives the stock after each step.
select
    item,
    initial_value,
    amount,
    dateofpurchase,
    sum(val + amount) over (partition by item order by rn) as cumulative
from seeded
Sample output
item initial_value amount dateofpurchase cumulative
A 100 -3 2018-11-22 97
A 100 -2 2018-11-22 95
B 200 -5 2018-11-22 195
B 200 6 2018-11-22 201
B 200 -1 2018-11-22 200
demo link

Related

Select the first record after the last but one X

I'm trying to get the first BEG_PERIOD date immediately after the last but one record of X (DEF_ENDING) of each user (USER_ID).
So I have this:
USER_ID
BEG_PERIOD
END_PERIOD
DEF_ENDING
159
01-07-2022
31-07-2022
X
159
25-09-2022
15-10-2022
X
159
01-11-2022
13-11-2022
159
14-11-2022
21-12-2022
X
159
01-01-2023
30-01-2023
X
414
01-04-2022
31-05-2022
X
414
01-07-2022
30-09-2022
414
01-10-2022
01-12-2022
X
480
01-07-2022
30-06-2022
480
01-07-2022
30-08-2022
X
480
02-09-2022
01-11-2022
X
503
15-03-2022
16-06-2022
X
503
19-07-2022
23-07-2022
503
24-07-2022
31-10-2022
503
01-11-2022
21-12-2022
X
The dates I need are the ones in bold
Can you help me?
I tried this but I only get the latest dates :(
-- Asker's attempt. For each user it keeps only the row whose BEG_PERIOD equals
-- that user's MAX(BEG_PERIOD), which is why only the latest dates come back.
SELECT
p.USER_ID,
p.BEG_PERIOD
FROM
PERIODS p
-- Self-join on USER_ID; no p2 column is selected, so the matching p row is
-- returned once per p2 row of the same user.
INNER JOIN PERIODS p2 ON
p.USER_ID = p2.USER_ID
AND
p.BEG_PERIOD = (
-- Latest BEG_PERIOD of the current user.
SELECT
MAX( BEG_PERIOD )
FROM
PERIODS
WHERE
PERIODS.USER_ID = p.USER_ID
)
WHERE
p.USER_ID > 10
This should work based on the sample data:
-- Walking each user's periods from newest to oldest, grp counts the 'X'
-- endings seen so far. grp = 1 therefore marks the last 'X' row plus every
-- row between it and the last-but-one 'X'.
with flagged as (
    select
        *,
        sum(case when DEF_ENDING = 'X' then 1 end) over (
            partition by USER_ID
            order by BEG_PERIOD desc
        ) as grp
    from PERIODS
)
-- The earliest values inside that group belong to the first period after the
-- last-but-one 'X' (each column is minimised independently).
select
    USER_ID,
    min(BEG_PERIOD) as BEG_PERIOD,
    min(END_PERIOD) as END_PERIOD,
    min(DEF_ENDING) as DEF_ENDING
from flagged
where grp = 1
group by USER_ID;
If you can't rely on the two dates being minimums then:
-- grp counts the 'X' endings seen so far, newest period first, per user.
with flagged as (
    select
        *,
        sum(case when DEF_ENDING = 'X' then 1 end) over (
            partition by USER_ID
            order by BEG_PERIOD desc
        ) as grp
    from PERIODS
),
-- Within the grp = 1 rows, rank each user's periods from the earliest.
ranked as (
    select
        *,
        row_number() over (
            partition by USER_ID
            order by BEG_PERIOD
        ) as rn
    from flagged
    where grp = 1
)
-- Return the whole earliest row instead of per-column minimums.
select *
from ranked
where rn = 1;
This can also be done entirely via subqueries if that's more appropriate at the level of your class:
-- Subquery-only version: keep the rows that start after the last-but-one 'X'
-- period of the same user, then take the minimums per user.
select
    USER_ID,
    min(BEG_PERIOD),
    min(END_PERIOD),
    min(DEF_ENDING)
from periods cur
where cur.BEG_PERIOD > (
    -- Latest 'X' period that still has a later 'X' period, i.e. the
    -- last-but-one 'X' of this user.
    select max(BEG_PERIOD)
    from periods prev_x
    where prev_x.USER_ID = cur.USER_ID
      and prev_x.DEF_ENDING = 'X'
      and exists (
          select 1
          from periods later_x
          where later_x.USER_ID = prev_x.USER_ID
            and later_x.DEF_ENDING = 'X'
            and later_x.BEG_PERIOD > prev_x.BEG_PERIOD
      )
)
group by USER_ID;
Try the following, using the `ROW_NUMBER` and `LAG` window functions:
/* Finds, per user, the row that directly follows the last-but-one 'X' row
   when rows are ordered by end_period. */
/* order_def_ending: assigns row numbers only to rows where def_ending = 'X';
   rn = 1 is the 'X' row with the latest end_period, rn = 2 the one before it,
   and rows without 'X' get rn = null */
with order_def_ending as
(
select *,
case def_ending when 'X' then
row_number() over (partition by user_id order by
case def_ending when 'X' then 1 else 2 end,
end_period desc)
else null end rn,
lag(def_ending, 1, def_ending) over (partition by user_id order by end_period) pde /* def_ending of the previous row (by end_period); falls back to the row's own def_ending when there is no previous row */
from yourTbl
),
/* lag_rn: adds the rn of the previous row (by end_period) to every row */
lag_rn as
(
select *,
lag(rn) over (partition by user_id order by end_period) prn /* previous row_number value */
from order_def_ending
)
select user_id, beg_period, end_period, def_ending
from lag_rn
where (
prn = 2 or /* when there are multiple rows with def_ending = 'X' */
(prn = 1 and rn is null) /* when there is only one row with def_ending = 'X' */
) and pde = 'X' /* ensure that the previous value of def_ending is = 'X' */
order by user_id, end_period
See demo
I think this works on SQL Server 2008:
-- Version without window aggregates: correlated subqueries do the counting.
-- periods: inline sample data from the question, with the date strings cast to date.
-- NOTE(review): casting 'dd-mm-yyyy' strings depends on the session date format -- confirm it is day-month-year.
with periods as(
select USER_ID, cast(BEG_PERIOD as date)BEG_PERIOD,cast(END_PERIOD as date)END_PERIOD,DEF_ENDING
from (values
(159,'01-07-2022','31-07-2022','X')
,(159,'25-09-2022','15-10-2022','X')
,(159,'01-11-2022','13-11-2022',null)
,(159,'14-11-2022','21-12-2022','X')
,(159,'01-01-2023','30-01-2023','X')
,(414,'01-04-2022','31-05-2022','X')
,(414,'01-07-2022','30-09-2022',null)
,(414,'01-10-2022','01-12-2022','X')
,(480,'01-07-2022','30-06-2022',null)
,(480,'01-07-2022','30-08-2022','X')
,(480,'02-09-2022','01-11-2022','X')
,(503,'15-03-2022','16-06-2022','X')
,(503,'19-07-2022','23-07-2022',null)
,(503,'24-07-2022','31-10-2022',null)
,(503,'01-11-2022','21-12-2022','X')
)t(USER_ID, BEG_PERIOD, END_PERIOD, DEF_ENDING)
)
-- cte: for every row, N = number of 'X' rows of the same user whose BEG_PERIOD
-- is on or after this row's BEG_PERIOD.
,cte as (
select *
,(select sum(case when def_ending='X' then 1 else 0 end)
from periods t2 where t2.user_id=t1.USER_ID and t2.BEG_PERIOD>=t1.BEG_PERIOD
) N -- last but one has N=2, all next N=1 (reverse order of counts)
from periods t1
)
-- Every row is returned, with the earliest BEG_PERIOD among the user's N=1 rows
-- repeated in LastButOne.
select *
,(select min(t2.BEG_PERIOD)
from cte t2 where t2.user_id=t1.USER_ID and t2.N=1
) LastButOne -- first after last but one with N=1
from cte t1
Result
USER_ID
BEG_PERIOD
END_PERIOD
DEF_ENDING
N
LastButOne
159
2022-07-01
2022-07-31
X
4
2023-01-01
159
2022-09-25
2022-10-15
X
3
2023-01-01
159
2022-11-01
2022-11-13
NULL
2
2023-01-01
159
2022-11-14
2022-12-21
X
2
2023-01-01
159
2023-01-01
2023-01-30
X
1
2023-01-01
414
2022-04-01
2022-05-31
X
2
2022-07-01
414
2022-07-01
2022-09-30
NULL
1
2022-07-01
414
2022-10-01
2022-12-01
X
1
2022-07-01
480
2022-07-01
2022-06-30
NULL
2
2022-09-02
480
2022-07-01
2022-08-30
X
2
2022-09-02
480
2022-09-02
2022-11-01
X
1
2022-09-02
503
2022-03-15
2022-06-16
X
2
2022-07-19
503
2022-07-19
2022-07-23
NULL
1
2022-07-19
503
2022-07-24
2022-10-31
NULL
1
2022-07-19
503
2022-11-01
2022-12-21
X
1
2022-07-19
About Parallel Data Warehouse,
as mentioned here, Non-PDW versions of SQL Server before 2012 do not support the ORDER BY clause with aggregate functions like MIN.
Windowing function support was considerably extended in 2012, compared with the basic implementation available starting with SQL Server 2005. The extensions were made available in Parallel Data Warehouse before being incorporated in the box product.

Filling missing weekend rows with previous working day values

I have a data table like the below. For each customer, missing days(weekends or holidays) should be inserted with the balance of previous working day. And this should only be done between the dates that customer has in the table. Balance should be added as 0 for dates outside the customer date range in the table. So for customer with id 1 should be filled between 2022-07-01 and 2022-07-31. Customer with id 2 should be filled between 2022-07-07 and 2022-07-19. Also for the dates 2022-07-01 to 2022-07-07 and 2022-07-19 to 2022-07-31 balance should be added as 0.
Data Table
date customer_id balance
2022-07-01 1 100
2022-07-04 1 150
2022-07-05 1 200
. 1 .
. 1 .
2022-07-31 1 650
2022-07-07 2 200
2022-07-08 2 300
2022-07-11 2 400
. 2 .
. 2 .
2022-07-19 2 750
Output table should look like this:
date customer_id balance
2022-07-01 1 100
2022-07-02 1 100
2022-07-03 1 100
2022-07-04 1 150
2022-07-05 1 200
. 1 .
. 1 .
2022-07-31 1 650
2022-07-01 2 0
2022-07-02 2 0
. 2 .
. 2 .
2022-07-07 2 200
2022-07-08 2 300
2022-07-09 2 300
2022-07-10 2 300
2022-07-11 2 400
. 2 .
. 2 .
2022-07-19 2 750
2022-07-20 2 0
. 2 .
. 2 .
2022-07-31 2 0
There are some solutions that use cross join with calendar table to similar questions on the site. But i couldn't implement them for my case.
Any help is much appreciated.
The below is a solution that uses recursion instead of a calendar table.
It essentially works by 'extending' your original data to create some extra rows with 0 balances for every customer at:
The min date in the table (if the customer didn't already have a record at the min date)
The max date in the table (if the customer didn't already have a record at the max date)
The day after the last record for the customer (as long as this doesn't go over the max date in the table)
It then uses recursion to plug the gaps between the dates for each customer.
With balances as (
    -- This is a simplified version of the data already in your table.
    -- dt is cast to DATE once here so that every later step (min/max, lead,
    -- dateadd) works on real dates, and so that dt has the same type in the
    -- anchor and the recursive member of recursive_query.
    SELECT cast(dt as date) as dt, customer_id, balance
    FROM (
        SELECT '2022-07-01' as dt, 1 as customer_id, 100 as balance
        UNION ALL SELECT '2022-07-04' as dt, 1 as customer_id, 150 as balance
        UNION ALL SELECT '2022-07-05' as dt, 1 as customer_id, 200 as balance
        UNION ALL SELECT '2022-07-31' as dt, 1 as customer_id, 650 as balance
        UNION ALL SELECT '2022-07-07' as dt, 2 as customer_id, 200 as balance
        UNION ALL SELECT '2022-07-08' as dt, 2 as customer_id, 300 as balance
        UNION ALL SELECT '2022-07-11' as dt, 2 as customer_id, 400 as balance
        UNION ALL SELECT '2022-07-19' as dt, 2 as customer_id, 750 as balance
    ) as raw_balances
)
, min_records as (
    -- This creates a 0 balance record for each customer at the min date in the table
    SELECT dt, customer_id, 0 as balance
    FROM (
        SELECT min(dt) as dt
        FROM balances
    ) as min_dt
    CROSS JOIN (
        SELECT DISTINCT customer_id
        FROM balances
    ) as customers
)
, max_records as (
    -- This creates a 0 balance record for each customer at the max date in the table
    SELECT dt, customer_id, 0 as balance
    FROM (
        SELECT max(dt) as dt
        FROM balances
    ) as max_dt
    CROSS JOIN (
        SELECT DISTINCT customer_id
        FROM balances
    ) as customers
)
, max_customer_records as (
    -- This creates a 0 balance record for each customer for the day after their last record,
    -- so long as that date does not go beyond the max date in the table
    SELECT dateadd(day, 1, max(dt)) as dt, customer_id, 0 as balance
    FROM balances as a
    CROSS JOIN (
        SELECT max(dt) as max_dt
        FROM balances
    ) as m
    GROUP BY customer_id, max_dt
    HAVING max(dt) < max_dt
)
, extended_balances as (
    -- We then combine all of the tables above with the original balances table.
    -- Grouping to the dt + customer level and sum(balance) won't cause issues for customers
    -- who already had a record on the min(dt) or max(dt) because x + 0 still = x
    SELECT dt, customer_id, sum(balance) as balance
    FROM (
        SELECT dt, customer_id, balance
        FROM balances
        UNION
        SELECT dt, customer_id, balance
        FROM min_records
        UNION
        SELECT dt, customer_id, balance
        FROM max_records
        UNION
        SELECT dt, customer_id, balance
        FROM max_customer_records
    ) AS A
    GROUP BY dt, customer_id
)
, recursive_query as (
    -- Now we use recursion to fill in the gaps between the dates.
    -- Anchor member: one row per known (dt, customer) with the date of the next known row.
    SELECT dt as original_dt
         , dt
         , customer_id
         , balance
         -- We use lead() to find the date when a new balance exists;
         -- the last row of a customer has no successor, so it stops after one day
         , coalesce(lead(dt) over(partition by customer_id order by dt asc), dateadd(day, 1, dt)) as next_dt
    FROM extended_balances
    UNION ALL
    -- Recursive member: repeat the balance one day later until the next known date is reached
    SELECT original_dt
         , dateadd(day, 1, dt)
         , customer_id
         , balance
         , next_dt
    FROM recursive_query
    WHERE dateadd(day, 1, dt) < next_dt
)
-- NOTE(review): if this runs on SQL Server, gaps longer than the default
-- recursion limit need an OPTION (MAXRECURSION n) hint -- confirm the target engine.
SELECT dt, customer_id, balance
FROM recursive_query
ORDER BY customer_id, dt
To help illustrate the steps, I've included examples of key tables:
Balances:
dt
customer_id
balance
2022-07-01
1
100
2022-07-04
1
150
2022-07-05
1
200
2022-07-31
1
650
2022-07-07
2
200
2022-07-08
2
300
2022-07-11
2
400
2022-07-19
2
750
Extended Balances:
dt
customer_id
balance
2022-07-01
1
100
2022-07-04
1
150
2022-07-05
1
200
2022-07-31
1
650
2022-07-01
2
0
2022-07-07
2
200
2022-07-08
2
300
2022-07-11
2
400
2022-07-19
2
750
2022-07-20
2
0
2022-07-31
2
0
First 10 records of the recursive query:
original_dt
dt
customer_id
balance
next_dt
2022-07-01
2022-07-01
1
100
2022-07-04
2022-07-01
2022-07-02
1
100
2022-07-04
2022-07-01
2022-07-03
1
100
2022-07-04
2022-07-04
2022-07-04
1
150
2022-07-05
2022-07-05
2022-07-05
1
200
2022-07-31
2022-07-05
2022-07-06
1
200
2022-07-31
2022-07-05
2022-07-07
1
200
2022-07-31
2022-07-05
2022-07-08
1
200
2022-07-31
2022-07-05
2022-07-09
1
200
2022-07-31
2022-07-05
2022-07-10
1
200
2022-07-31

Snowflake SQL - Count Distinct Users within descending time interval

I want to count the distinct amount of users over the last 60 days, and then, count the distinct amount of users over the last 59 days, and so on and so forth.
Ideally, the output would look like this (TARGET OUTPUT)
Day Distinct Users
60 200
59 200
58 188
57 185
56 180
[...] [...]
where 60 days is the max total possible distinct users, and then 59 would have a little less and so on and so forth.
my query looks like this.
-- Asker's attempt: one conditional COUNT(DISTINCT ...) per look-back window,
-- so each window becomes its own column and has to be written out by hand.
select
count(distinct (case when datediff(day,DATE,current_date) <= 60 then USER_ID end)) as day_60,
count(distinct (case when datediff(day,DATE,current_date) <= 59 then USER_ID end)) as day_59,
count(distinct (case when datediff(day,DATE,current_date) <= 58 then USER_ID end)) as day_58
FROM Table
The issue with my query is that it outputs the data as columns instead of rows (as shown below) and, most importantly, I have to write out this logic 60 times, once for each of the 60 days.
Current Output:
Day_60 Day_59 Day_58
209 207 207
Is it possible to write the SQL in a way that creates the target as shown initially above?
Using below data in CTE format -
-- Sample data used by the queries that follow: one row per (date, user) event.
-- UID1 appears twice on 2022-05-04, so count() and count(distinct) differ for that day.
-- This is only the WITH clause; each query below is meant to be appended to it.
with data_cte(dates,userid) as
(select * from values
('2022-05-01'::date,'UID1'),
('2022-05-01'::date,'UID2'),
('2022-05-02'::date,'UID1'),
('2022-05-02'::date,'UID2'),
('2022-05-03'::date,'UID1'),
('2022-05-03'::date,'UID2'),
('2022-05-03'::date,'UID3'),
('2022-05-04'::date,'UID1'),
('2022-05-04'::date,'UID1'),
('2022-05-04'::date,'UID2'),
('2022-05-04'::date,'UID3'),
('2022-05-04'::date,'UID4'),
('2022-05-05'::date,'UID1'),
('2022-05-06'::date,'UID1'),
('2022-05-07'::date,'UID1'),
('2022-05-07'::date,'UID2'),
('2022-05-08'::date,'UID1')
)
Query to get all dates and count and distinct counts -
-- Events and distinct users per calendar date.
select
    dates,
    count(userid) as cnt,
    count(distinct userid) as cnt_d
from data_cte
group by dates;
DATES
CNT
CNT_D
2022-05-01
2
2
2022-05-02
2
2
2022-05-03
3
3
2022-05-04
5
4
2022-05-05
1
1
2022-05-06
1
1
2022-05-08
1
1
2022-05-07
2
2
Query to get difference of date from current date
-- Same counts per date, plus how many days ago each date was.
select
    dates,
    datediff(day, dates, current_date()) as ddiff,
    count(userid) as cnt,
    count(distinct userid) as cnt_d
from data_cte
group by dates;
DATES
DDIFF
CNT
CNT_D
2022-05-01
45
2
2
2022-05-02
44
2
2
2022-05-03
43
3
3
2022-05-04
42
5
4
2022-05-05
41
1
1
2022-05-06
40
1
1
2022-05-08
38
1
1
2022-05-07
39
2
2
To keep only the records whose date difference falls within a certain range,
add a HAVING clause:
-- Counts per date, keeping only dates at most 43 days before today.
select
    datediff(day, dates, current_date()) as ddiff,
    count(userid) as cnt,
    count(distinct userid) as cnt_d
from data_cte
group by dates
having ddiff <= 43;
DDIFF
CNT
CNT_D
43
3
3
42
5
4
41
1
1
39
2
2
38
1
1
40
1
1
If you need to prefix 'day_' to each date-difference value, you can
wrap the previous result in an outer query and add the prefix to the date-difference column, as follows.
I am using CTE syntax here, but you can use a subquery instead when selecting from a real table:
-- Continuation of the WITH clause that defines data_cte:
-- per_day holds the counts per date for dates at most 43 days old.
, per_day as (
    select
        datediff(day, dates, current_date()) as ddiff,
        count(userid) as cnt,
        count(distinct userid) as cnt_d
    from data_cte
    group by dates
    having ddiff <= 43
)
-- Label each row as day_<n>.
select
    'day_' || to_char(ddiff) as days,
    cnt,
    cnt_d
from per_day;
DAYS
CNT
CNT_D
day_43
3
3
day_42
5
4
day_41
1
1
day_39
2
2
day_38
1
1
day_40
1
1
Updated the answer to get distinct user count for number of days range.
A clause can be included in the final query to limit to number of days needed.
-- Sample data: one row per (date, user) event.
with data_cte (dates, userid) as (
    select *
    from values
        ('2022-05-01'::date, 'UID1'),
        ('2022-05-01'::date, 'UID2'),
        ('2022-05-02'::date, 'UID1'),
        ('2022-05-02'::date, 'UID2'),
        ('2022-05-03'::date, 'UID5'),
        ('2022-05-03'::date, 'UID2'),
        ('2022-05-03'::date, 'UID3'),
        ('2022-05-04'::date, 'UID1'),
        ('2022-05-04'::date, 'UID6'),
        ('2022-05-04'::date, 'UID2'),
        ('2022-05-04'::date, 'UID3'),
        ('2022-05-04'::date, 'UID4'),
        ('2022-05-05'::date, 'UID7'),
        ('2022-05-06'::date, 'UID1'),
        ('2022-05-07'::date, 'UID8'),
        ('2022-05-07'::date, 'UID2'),
        ('2022-05-08'::date, 'UID9')
),
-- Age of every event in days, relative to today.
event_age as (
    select
        datediff(day, dates, current_date()) as ddiff,
        userid
    from data_cte
),
-- One row per distinct age: these are the look-back windows to report.
day_window as (
    select distinct ddiff
    from event_age
)
-- For each window, count the distinct users seen within that many days.
select
    day_window.ddiff,
    (
        select count(distinct userid)
        from event_age
        where event_age.ddiff <= day_window.ddiff
    ) as cnt
from day_window
order by day_window.ddiff desc
DDIFF
CNT
47
9
46
9
45
9
44
8
43
5
42
4
41
3
40
1
You can do unpivot after getting your current output.
sample one.
-- Turn the one-row, column-per-day result into one row per day.
select *
from (
    -- Stand-in for the asker's current single-row output.
    select
        209 as Day_60,
        207 as Day_59,
        207 as Day_58
) unpivot (cnt for days in (Day_60, Day_59, Day_58));

Running assignment of values with break T-SQL

With the below table of data
Customer
Amount Billed
Amount Paid
Date
1
100
60
01/01/2000
1
100
40
01/02/2000
2
200
150
01/01/2000
2
200
30
01/02/2000
2
200
10
01/03/2000
2
200
15
01/04/2000
I would like to create the next two columns
Customer
Amount Billed
Amount Paid
Assigned
Remainder
Date
1
100
60
60
40
01/01/2000
1
100
40
40
0
01/02/2000
2
200
150
150
50
01/01/2000
2
200
30
30
20
01/02/2000
2
200
10
10
10
01/03/2000
2
200
15
10
-5
01/04/2000
The amount paid on each line should be removed from the amount billed and pushed onto the next line for the same customer. The process should continue until there are no more records or the remainder is < 0.
Is there a way of doing this without a cursor? Maybe a recursive CTE?
Thanks
As I mentioned in the comments, this is just a cumulative SUM:
-- Sample data from the question: one row per payment.
WITH YourTable AS (
    SELECT V.Customer,
           V.AmountBilled,
           V.AmountPaid,
           V.[Date]
    FROM (VALUES (1, 100, 60,  CONVERT(date, '01/01/2000')),
                 (1, 100, 40,  CONVERT(date, '01/02/2000')),
                 (2, 200, 150, CONVERT(date, ' 01/01/2000')),
                 (2, 200, 30,  CONVERT(date, '01/02/2000')),
                 (2, 200, 10,  CONVERT(date, '01/03/2000')),
                 (2, 200, 15,  CONVERT(date, '01/04/2000'))
         ) AS V (Customer, AmountBilled, AmountPaid, [Date])
)
-- Remainder = amount billed minus everything the customer has paid up to and
-- including the current row.
SELECT YT.Customer,
       YT.AmountBilled,
       YT.AmountPaid,
       YT.AmountBilled
           - SUM(YT.AmountPaid) OVER (PARTITION BY YT.Customer
                                      ORDER BY YT.[Date] ASC
                                      ROWS UNBOUNDED PRECEDING) AS Remainder,
       YT.[Date]
FROM YourTable AS YT
ORDER BY YT.Customer,
         YT.[Date];
Note this returns -5 for the last row, not 5, as 200 - 205 = -5. If you want 5 wrap the whole expression in an absolute function.
You can achieve this using recursive CTE as well.
-- Sample payments held in a table variable. Table variables are declared with
-- an @ name; "DECLARE #customer table" is not valid T-SQL.
DECLARE @customer table (Customer int, AmountBilled int, AmountPaid int, PaidDate date)
insert into @customer (Customer, AmountBilled, AmountPaid, PaidDate)
values
(1 ,100, 60 ,'01/01/2000')
,(1 ,100, 40 ,'01/02/2000')
,(2 ,200, 150 ,'01/01/2000')
,(2 ,200, 30 ,'01/02/2000')
,(2 ,200, 10 ,'01/03/2000')
,(2 ,200, 15 ,'01/04/2000');
-- CTE_CustomerRNK: number each customer's payments in date order.
WITH CTE_CustomerRNK as
(
SELECT *, ROW_NUMBER() OVER(PARTITION BY customer order by paiddate) AS RNK
from @customer),
-- CTE_Customer: walk the payments in order, carrying the remainder forward.
CTE_Customer as
(
-- Anchor: the first payment is taken off the billed amount.
SELECT customer, AmountBilled, AmountPaid, (amountbilled-amountpaid) as remainder, paiddate ,RNK FROM CTE_CustomerRNK where rnk = 1
union all
-- Recursive step: each later payment is taken off the previous remainder.
SELECT r.customer, r.AmountBilled, r.AmountPaid, (c.remainder - r.AmountPaid) as remainder, r.PaidDate, r.rnk
FROM CTE_CustomerRNK as r
inner join CTE_Customer as c
on c.Customer = r.Customer
and r.rnk = c.rnk + 1
)
-- Order by payment date within each customer so the output order is deterministic.
SELECT customer, AmountBilled, AmountPaid, remainder, paiddate
FROM CTE_Customer order by Customer, paiddate
customer
AmountBilled
AmountPaid
remainder
paiddate
1
100
60
40
2000-01-01
1
100
40
0
2000-01-02
2
200
150
50
2000-01-01
2
200
30
20
2000-01-02
2
200
10
10
2000-01-03
2
200
15
-5
2000-01-04

sql date difference with multiple variables

I'm trying to get the number of days between the EFFDATE of the most recent status 1 row and the EFFDATE of the status 0 row that follows it.
the code below yields the following results
-- Complete status history per FILEKEY: rows from ASTATUSHIST combined with the
-- status row from USERS, newest first within each FILEKEY.
SELECT *
FROM (
    SELECT FILEKEY, STATUS, EFFDATE
    FROM ASTATUSHIST
    UNION
    SELECT FILEKEY, ASTATUS, ASTATUSEFFDATE
    FROM USERS
) AS A
ORDER BY FILEKEY, EFFDATE DESC
130 0 2019-10-25 00:00:00.000
130 0 2017-03-01 00:00:00.000
130 0 2017-01-01 00:00:00.000
130 1 2005-02-01 00:00:00.000
130 0 2001-03-03 00:00:00.000
130 0 2000-01-30 00:00:00.000
130 0 2000-01-01 00:00:00.000
this code combines 2 tables to get the complete history for a given user.
Ideally I could produce something that looks like this:
130 4352
or
125 null
where the null is filekey without a status 1 or a filekey with a status 1 but without a following status 0
Thanks
In all supported versions of SQL Server, you can use window functions:
-- t wraps the combined history query from the question, whose columns are
-- filekey, status and effdate (there is no column called "date").
with t as (
<your query here>
)
select t.*,
       -- days from this row's effdate to the effdate of the next history row
       datediff(day, effdate, next_date) as days_diff
from (select t.*,
             -- seqnum = 1 marks the most recent row of each (filekey, status)
             row_number() over (partition by filekey, status order by effdate desc) as seqnum,
             -- effdate of the row that follows in the same filekey's history
             lead(effdate) over (partition by filekey order by effdate) as next_date
      from t
     ) t
-- NOTE(review): this keeps the latest row of every status, not only status 1;
-- restrict or aggregate on status = 1 if only that row is wanted.
where seqnum = 1;