How to maintain cumulative sum for each User in SQL server - sql

I had a table like
ID UserID rupees time
1 1 200 2014-01-05
---------------------------------
2 1 500 2014-04-06
----------------------------------
3 2 10 2014-05-05
----------------------------------
4 2 20 2014-05-06
----------------------------------
I want the output lie
ID UserID Rupees time CumulativeSum
1 1 200 2014-01-05 200
-------------------------------------------------
2 1 500 2014-04-06 700
-------------------------------------------------
3 2 10 2014-05-06 10
-------------------------------------------------
4 2 20 2014-05-06 30
---------------------------------------------------
How can i get this table as purput

Please try using CTE:
;With T as(
select
*,
ROW_NUMBER() over(partition by UserId order by [time]) RN
from tbl
)
select
UserID,
rupees,
[time],
(select SUM(rupees)
from T b
where b.UserID=a.UserID and b.RN<=a.RN) CumulativeSum
from T a
For records with column value time increasing, try the below query:
select
UserID,
rupees,
[time],
(select SUM(rupees)
from tbl b
where b.UserID=a.UserID and b.[time]<=a.[time]) CumulativeSum
from tbl a

For SQL Server 2012 or later, you can use SUM() with an OVER clause that specifies a ROW clause:
declare #t table (ID int,UserID int,rupees int,[time] date)
insert into #t(ID,UserID,rupees,[time]) values
(1,1,200,'20140105'),
(2,1,500,'20140406'),
(3,2, 10,'20140505'),
(4,2, 20,'20140506')
select
*,
SUM(rupees) OVER (
PARTITION BY UserID
ORDER BY id /* or time? */
ROWS BETWEEN
UNBOUNDED PRECEDING AND
CURRENT ROW)
as total
from #t
Result:
ID UserID rupees time total
----------- ----------- ----------- ---------- -----------
1 1 200 2014-01-05 200
2 1 500 2014-04-06 700
3 2 10 2014-05-05 10
4 2 20 2014-05-06 30

DECLARE #t table (UserID INT,rupees INT,DateKey Date )
INSERT INTO #t VALUES
(1,200,'2014-01-05'),
(2,300,'2014-01-06'),
(2,800,'2014-03-06')
select UserID,
rupees,
DateKey,
(SELECT SUM(rupees)from #t t
where t.rupees <= tt.rupees) from #t tt
GROUP BY UserID,rupees,DateKey

Hope this too helps you.
DECLARE #tab TABLE (id INT,userId INT,rupees INT,[time] Date)
INSERT INTO #tab VALUES
(1,1,200 ,'2014-01-05'),
(2,1,500 ,'2014-04-06'),
(3,2,10 ,'2014-05-05'),
(4,2,20 ,'2014-05-06')
SELECT LU.id,LU.userId,LU.rupees,LU.time,SUM(b.rupees) CumulativeSum
FROM (SELECT *,ROW_NUMBER() OVER (PARTITION BY userId ORDER BY [time]) R FROM #tab) B
JOIN (SELECT *,ROW_NUMBER() OVER (PARTITION BY userId ORDER BY [time]) R FROM #tab) LU
ON B.userId = LU.userId AND B.R <= LU.R
GROUP BY LU.id,LU.userId,LU.rupees,LU.time
Result

I am assuming that you are not using SQL Server 2012, which provides the cumulative sum function. The other answers use some form of the row_number() function, but these seems totally unnecessary. I usually approach cumulative sums using correlated subqueries:
select ID, UserID, rupees, [time],
(select sum(rupees)
from table t2
where t2.UserId = t.UserId and
t2.ID <= t.ID
) as CumulativeSum
from table t;
This requires having a column that uniquely identifies each row, and that seems to be the purpose of id. For performance, I would want to have an index on table(UserId, ID, rupees).

select *, SUM(rupees) OVER (
PARTITION BY UserID
ORDER BY id) as CumSum from #tbl

Related

Rolling Average in SQL with Partition [duplicate]

declare #t table
(
id int,
SomeNumt int
)
insert into #t
select 1,10
union
select 2,12
union
select 3,3
union
select 4,15
union
select 5,23
select * from #t
the above select returns me the following.
id SomeNumt
1 10
2 12
3 3
4 15
5 23
How do I get the following:
id srome CumSrome
1 10 10
2 12 22
3 3 25
4 15 40
5 23 63
select t1.id, t1.SomeNumt, SUM(t2.SomeNumt) as sum
from #t t1
inner join #t t2 on t1.id >= t2.id
group by t1.id, t1.SomeNumt
order by t1.id
SQL Fiddle example
Output
| ID | SOMENUMT | SUM |
-----------------------
| 1 | 10 | 10 |
| 2 | 12 | 22 |
| 3 | 3 | 25 |
| 4 | 15 | 40 |
| 5 | 23 | 63 |
Edit: this is a generalized solution that will work across most db platforms. When there is a better solution available for your specific platform (e.g., gareth's), use it!
The latest version of SQL Server (2012) permits the following.
SELECT
RowID,
Col1,
SUM(Col1) OVER(ORDER BY RowId ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS Col2
FROM tablehh
ORDER BY RowId
or
SELECT
GroupID,
RowID,
Col1,
SUM(Col1) OVER(PARTITION BY GroupID ORDER BY RowId ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS Col2
FROM tablehh
ORDER BY RowId
This is even faster. Partitioned version completes in 34 seconds over 5 million rows for me.
Thanks to Peso, who commented on the SQL Team thread referred to in another answer.
For SQL Server 2012 onwards it could be easy:
SELECT id, SomeNumt, sum(SomeNumt) OVER (ORDER BY id) as CumSrome FROM #t
because ORDER BY clause for SUM by default means RANGE UNBOUNDED PRECEDING AND CURRENT ROW for window frame ("General Remarks" at https://msdn.microsoft.com/en-us/library/ms189461.aspx)
Let's first create a table with dummy data:
Create Table CUMULATIVESUM (id tinyint , SomeValue tinyint)
Now let's insert some data into the table;
Insert Into CUMULATIVESUM
Select 1, 10 union
Select 2, 2 union
Select 3, 6 union
Select 4, 10
Here I am joining same table (self joining)
Select c1.ID, c1.SomeValue, c2.SomeValue
From CumulativeSum c1, CumulativeSum c2
Where c1.id >= c2.ID
Order By c1.id Asc
Result:
ID SomeValue SomeValue
-------------------------
1 10 10
2 2 10
2 2 2
3 6 10
3 6 2
3 6 6
4 10 10
4 10 2
4 10 6
4 10 10
Here we go now just sum the Somevalue of t2 and we`ll get the answer:
Select c1.ID, c1.SomeValue, Sum(c2.SomeValue) CumulativeSumValue
From CumulativeSum c1, CumulativeSum c2
Where c1.id >= c2.ID
Group By c1.ID, c1.SomeValue
Order By c1.id Asc
For SQL Server 2012 and above (much better performance):
Select
c1.ID, c1.SomeValue,
Sum (SomeValue) Over (Order By c1.ID )
From CumulativeSum c1
Order By c1.id Asc
Desired result:
ID SomeValue CumlativeSumValue
---------------------------------
1 10 10
2 2 12
3 6 18
4 10 28
Drop Table CumulativeSum
A CTE version, just for fun:
;
WITH abcd
AS ( SELECT id
,SomeNumt
,SomeNumt AS MySum
FROM #t
WHERE id = 1
UNION ALL
SELECT t.id
,t.SomeNumt
,t.SomeNumt + a.MySum AS MySum
FROM #t AS t
JOIN abcd AS a ON a.id = t.id - 1
)
SELECT * FROM abcd
OPTION ( MAXRECURSION 1000 ) -- limit recursion here, or 0 for no limit.
Returns:
id SomeNumt MySum
----------- ----------- -----------
1 10 10
2 12 22
3 3 25
4 15 40
5 23 63
Late answer but showing one more possibility...
Cumulative Sum generation can be more optimized with the CROSS APPLY logic.
Works better than the INNER JOIN & OVER Clause when analyzed the actual query plan ...
/* Create table & populate data */
IF OBJECT_ID('tempdb..#TMP') IS NOT NULL
DROP TABLE #TMP
SELECT * INTO #TMP
FROM (
SELECT 1 AS id
UNION
SELECT 2 AS id
UNION
SELECT 3 AS id
UNION
SELECT 4 AS id
UNION
SELECT 5 AS id
) Tab
/* Using CROSS APPLY
Query cost relative to the batch 17%
*/
SELECT T1.id,
T2.CumSum
FROM #TMP T1
CROSS APPLY (
SELECT SUM(T2.id) AS CumSum
FROM #TMP T2
WHERE T1.id >= T2.id
) T2
/* Using INNER JOIN
Query cost relative to the batch 46%
*/
SELECT T1.id,
SUM(T2.id) CumSum
FROM #TMP T1
INNER JOIN #TMP T2
ON T1.id > = T2.id
GROUP BY T1.id
/* Using OVER clause
Query cost relative to the batch 37%
*/
SELECT T1.id,
SUM(T1.id) OVER( PARTITION BY id)
FROM #TMP T1
Output:-
id CumSum
------- -------
1 1
2 3
3 6
4 10
5 15
Select
*,
(Select Sum(SOMENUMT)
From #t S
Where S.id <= M.id)
From #t M
You can use this simple query for progressive calculation :
select
id
,SomeNumt
,sum(SomeNumt) over(order by id ROWS between UNBOUNDED PRECEDING and CURRENT ROW) as CumSrome
from #t
There is a much faster CTE implementation available in this excellent post:
http://weblogs.sqlteam.com/mladenp/archive/2009/07/28/SQL-Server-2005-Fast-Running-Totals.aspx
The problem in this thread can be expressed like this:
DECLARE #RT INT
SELECT #RT = 0
;
WITH abcd
AS ( SELECT TOP 100 percent
id
,SomeNumt
,MySum
order by id
)
update abcd
set #RT = MySum = #RT + SomeNumt
output inserted.*
For Ex: IF you have a table with two columns one is ID and second is number and wants to find out the cumulative sum.
SELECT ID,Number,SUM(Number)OVER(ORDER BY ID) FROM T
Once the table is created -
select
A.id, A.SomeNumt, SUM(B.SomeNumt) as sum
from #t A, #t B where A.id >= B.id
group by A.id, A.SomeNumt
order by A.id
The SQL solution wich combines "ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW" and "SUM" did exactly what i wanted to achieve.
Thank you so much!
If it can help anyone, here was my case. I wanted to cumulate +1 in a column whenever a maker is found as "Some Maker" (example). If not, no increment but show previous increment result.
So this piece of SQL:
SUM( CASE [rmaker] WHEN 'Some Maker' THEN 1 ELSE 0 END)
OVER
(PARTITION BY UserID ORDER BY UserID,[rrank] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS Cumul_CNT
Allowed me to get something like this:
User 1 Rank1 MakerA 0
User 1 Rank2 MakerB 0
User 1 Rank3 Some Maker 1
User 1 Rank4 Some Maker 2
User 1 Rank5 MakerC 2
User 1 Rank6 Some Maker 3
User 2 Rank1 MakerA 0
User 2 Rank2 SomeMaker 1
Explanation of above: It starts the count of "some maker" with 0, Some Maker is found and we do +1. For User 1, MakerC is found so we dont do +1 but instead vertical count of Some Maker is stuck to 2 until next row.
Partitioning is by User so when we change user, cumulative count is back to zero.
I am at work, I dont want any merit on this answer, just say thank you and show my example in case someone is in the same situation. I was trying to combine SUM and PARTITION but the amazing syntax "ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW" completed the task.
Thanks!
Groaker
Above (Pre-SQL12) we see examples like this:-
SELECT
T1.id, SUM(T2.id) AS CumSum
FROM
#TMP T1
JOIN #TMP T2 ON T2.id < = T1.id
GROUP BY
T1.id
More efficient...
SELECT
T1.id, SUM(T2.id) + T1.id AS CumSum
FROM
#TMP T1
JOIN #TMP T2 ON T2.id < T1.id
GROUP BY
T1.id
Try this
select
t.id,
t.SomeNumt,
sum(t.SomeNumt) Over (Order by t.id asc Rows Between Unbounded Preceding and Current Row) as cum
from
#t t
group by
t.id,
t.SomeNumt
order by
t.id asc;
Try this:
CREATE TABLE #t(
[name] varchar NULL,
[val] [int] NULL,
[ID] [int] NULL
) ON [PRIMARY]
insert into #t (id,name,val) values
(1,'A',10), (2,'B',20), (3,'C',30)
select t1.id, t1.val, SUM(t2.val) as cumSum
from #t t1 inner join #t t2 on t1.id >= t2.id
group by t1.id, t1.val order by t1.id
Without using any type of JOIN cumulative salary for a person fetch by using follow query:
SELECT * , (
SELECT SUM( salary )
FROM `abc` AS table1
WHERE table1.ID <= `abc`.ID
AND table1.name = `abc`.Name
) AS cum
FROM `abc`
ORDER BY Name

Create episode for each value with new Begin and End Dates

This is in reference to below Question
Loop through each value to the seq num
But now Client want to see the data differently and started a new thread for this question.
below is the requirement.
This is the data .
ID seqNum DOS Service End Date
1 1 1/1/2017 1/15/2017
1 2 1/16/2017 1/16/2017
1 3 1/17/2017 1/21/2017
1 4 1/22/2017 2/13/2017
1 5 2/14/2017 3/21/2017
1 6 2/16/2017 3/21/2017
Expected outPut:
ID SeqNum DOSBeg DOSEnd
1 1 1/1/2017 1/30/2017
1 2 1/31/2017 3/1/2017
1 3 3/2/2017 3/31/2017
For each DOSBeg, add 29 and that is DOSEnd. then Add 1 to DOSEnd (1/31/2017) is new DOSBeg.
Now add 29 to (1/31/2017) and that is 3/1/2017 which is DOSEnd . Repeat this untill DOSend >=Max End Date i.e 3/21/2017.
Basically, we need episode of 29 days for each ID.
I tried with this code and it is giving me duplicates.
with cte as (
select ID, minDate as DOSBeg,dateadd(day,29,mindate) as DOSEnd
from #temp
union all
select ID,dateadd(day,1,DOSEnd) as DOSBeg,dateadd(day,29,dateadd(day,1,DOSEnd)) as DOSEnd
from cte
)
select ID,DOSBeg,DOSEnd
from cte
OPTION (MAXRECURSION 0)
Here mindate is Minimum DOS for this ID i.e. 1/1/2017
I came up with below logic and this is working fine for me. Is there any better way than this ?
declare #table table (id int, seqNum int identity(1,1), DOS date, ServiceEndDate date)
insert into #table
values
(1,'20170101','20170115'),
(1,'20170116','20170116'),
(1,'20170117','20170121'),
(1,'20170122','20170213'),
(1,'20170214','20170321'),
(1,'20170216','20170321'),
(2,'20170101','20170103'),
(2,'20170104','20170118')
select * into #temp from #table
--drop table #data
select distinct ID, cast(min(DOS) over (partition by ID) as date) as minDate
,row_Number() over (partition by ID order by ID, DOS) as SeqNum,
DOS,
max(ServiceEndDate) over (partition by ID)as maxDate
into #data
from #temp
--drop table #StartDateLogic
with cte as
(select ID,mindate as startdate,maxdate
from #data
union all
select ID,dateadd(day,30,startdate) as startdate,maxdate
from cte
where maxdate >= dateadd(day,30,startdate))
select distinct ID,startdate
into #StartDateLogic
from cte
OPTION (MAXRECURSION 0)
--final Result set
select ID
,ROW_NUMBER() over (Partition by ID order by ID,StartDate) as SeqNum
,StartDate
,dateadd(day,29,startdate) as EndDate
from #StartDateLogic
You were on the right track wit the recursive cte, but you forgot the anchor.
declare #table table (id int, seqNum int identity(1,1), DOS date, ServiceEndDate date)
insert into #table
values
(1,'20170101','20170115'),
(1,'20170116','20170116'),
(1,'20170117','20170121'),
(1,'20170122','20170213'),
(1,'20170214','20170321'),
(1,'20170216','20170321'),
(2,'20170101','20170103'),
(2,'20170104','20170118')
;with dates as(
select top 1 with ties id, seqnum, DOSBeg = DOS, DOSEnd = dateadd(day,29,DOS)
from #table
order by row_number() over (partition by id order by seqnum)
union all
select t.id, t.seqNum, DOSBeg = dateadd(day,1,d.DOSEnd), DOSEnd = dateadd(day,29,dateadd(day,1,d.DOSEnd))
from dates d
inner join #table t on
d.id = t.id and t.seqNum = d.seqNum + 1
)
select *
from dates d
where d.DOSEnd <= (select max(dateadd(month,1,ServiceEndDate)) from #table where id = d.id)
order by id, seqNum

How to remove duplicate ID rows. While removing, use the rows that has NULL value in another column

While removing duplicate rows with same ID value, how to remove the rows that has null value in one particular column.
Note: there are other non-duplicate rows (below e.g., 12) that has NULL value and should still get selected in the result set.
Input table:
Id | sale_date | price
-----------------------------
11 20051020 22.1
11 NULL 20.1
12 NULL 20.1
13 20051020 20.1
Expected result:
Id | sale_date | price
-----------------------------
11 20051020 22.1
12 NULL 20.1
13 20051020 20.1
Assuming you have SQL Server 2008 or above, this will work for you. I use row_number and assign the values by ID starting at the max date. So any value higher than 1 is lower than the max date for that particular ID so I delete row_num greater than 1.
Check it out:
DECLARE #yourTable TABLE (ID INT,Sale_date DATE, Price FLOAT);
INSERT INTO #yourTable
VALUES (11,'20051020',22.1),
(11,NULL,20.1),
(12,NULL,20.1),
(13,'20051020',20.1);
WITH CTE
AS
(
SELECT *,
ROW_NUMBER() OVER (PARTITION BY ID ORDER BY sale_date DESC) AS row_num
FROM #yourTable
)
DELETE
FROM CTE
WHERE row_num > 1
SELECT *
FROM #yourTable
Try this
SELECT * FROM (
SELECT *, ROW_NUMBER() OVER (PARTITION BY ID ORDER BY Sale_Date desc) AS ROW_NUM FROM AA1) A
WHERE ROW_NUM<2
If you have duplicate Ids, and not every sale_date is NULL, you can keep the latest date, and delete the other ones:
DELETE #MyTable
FROM #MyTable AS T1
INNER JOIN (SELECT Id, MAX(sale_date) AS sale_date FROM #MyTable GROUP BY Id HAVING COUNT(*) > 1)
AS T2 ON T1.ID = T2.ID AND (T1.sale_date is null OR T1.sale_date < T2.sale_date)

SUM() data in a column based on another column data

I have a sql table
Project ID Employee ID Total Days
1 100 1
1 100 1
1 100 2
1 100 6
1 200 8
1 200 2
Now i need this table to look like
Project ID Employee ID Total Days
1 100 10
1 200 10
As iam new to sql,i am little confuse to use SUM() based on above condition.
This query below produces two columns: EmployeeID, totalDays.
SELECT EmployeeID, SUM(totalDays) totalDays
FROM tableName
GROUP BY EmployeeID
follow-up question: why is in your desired result the projectId is 1 and 2?
Here are two approaches
Declare #t Table(ProjectId Int, EmployeeId Int,TotalDays Int)
Insert Into #t Values(1,100,1),(1,100,1),(1,100,2),(1,100,6),(1,200,8),(1,200,2)
Approach1:
Select ProjectId,EmployeeId,TotalDays = Sum(TotalDays)
From #t
Group By ProjectId,EmployeeId
Approach2:
;With Cte As(
Select
ProjectId
,EmployeeId
,TotalDays = Sum(TotalDays) Over(Partition By EmployeeId)
,Rn = Row_Number() Over(Partition By EmployeeId Order By EmployeeId)
From #t )
Select ProjectId,EmployeeId,TotalDays
From Cte Where Rn = 1
Result
ProjectId EmployeeId TotalDays
1 100 10
1 200 10
select min("Project ID")as 'Project ID',"Employee ID"
, SUM("Total Days") as 'Total Days'
from table1
group by "Employee ID"

Rows inside the greatest streak?

Given the Rows
symbol_id profit date
1 100 2009-08-18 01:01:00
1 100 2009-08-18 01:01:01
1 156 2009-08-18 01:01:04
1 -56 2009-08-18 01:01:06
1 18 2009-08-18 01:01:07
How would I most efficiently select the rows that are involved in the greatest streak (of profit).
The greatest streak would be the first 3 rows, and I would want those rows. The query I came up with is just a bunch of nested queries and derived tables. I am looking for an efficient way to do this using common table expressions or something more advanced.
You haven't defined how 0 profit should be treated or what happens if there is a tie for longest streak. But something like...
;WITH T1 AS
(
SELECT *,
ROW_NUMBER() OVER (PARTITION BY symbol_id ORDER BY date) -
ROW_NUMBER() OVER (PARTITION BY symbol_id, SIGN(profit)
ORDER BY date) AS Grp
FROM Data
), T2 AS
(
SELECT *,
COUNT(*) OVER (PARTITION BY symbol_id,Grp) AS StreakLen
FROM T1
)
SELECT TOP 1 WITH TIES *
FROM T2
ORDER BY StreakLen DESC
Or - if you are looking for most profitable streak
;WITH T1 AS
(
SELECT *,
ROW_NUMBER() OVER (PARTITION BY symbol_id ORDER BY date) -
ROW_NUMBER() OVER (PARTITION BY symbol_id, CASE WHEN profit >= 0 THEN 1 END
ORDER BY date) AS Grp
FROM Data
), T2 AS
(
SELECT *,
SUM(profit) OVER (PARTITION BY symbol_id,Grp) AS StreakProfit
FROM T1
)
SELECT TOP 1 WITH TIES *
FROM T2
ORDER BY StreakProfit DESC
declare #T table
(
symbol_id int,
profit int,
[date] datetime
)
insert into #T values
(1, 100, '2009-08-18 01:01:00'),
(1, 100, '2009-08-18 01:01:01'),
(1, 156, '2009-08-18 01:01:04'),
(1, -56, '2009-08-18 01:01:06'),
(1, 18 , '2009-08-18 01:01:07')
;with C1 as
(
select *,
row_number() over(order by [date]) as rn
from #T
),
C2 as
(
select *,
rn - row_number() over(order by rn) as grp
from C1
where profit >= 0
)
select top 1 with ties *
from C2
order by sum(profit) over(partition by grp) desc
Result:
symbol_id profit date rn grp
----------- ----------- ----------------------- -------------------- --------------------
1 100 2009-08-18 01:01:00.000 1 0
1 100 2009-08-18 01:01:01.000 2 0
1 156 2009-08-18 01:01:04.000 3 0
If that's a MSSQL server then you want to consider using TOP 3 in your select clause
and ORDER BY PROFIT DESC.
If mysql/postgres you might want to consider using limit in your select clause with
the same order by too.
hope this helps.