SQL server to do like Group By task - sql

I have a table with SQL server as below,
Date Value
---------------------------------------------------
08-01-2016 1
08-02-2016 1
08-03-2016 1
08-04-2016 1
08-05-2016 1
08-06-2016 2
08-07-2016 2
08-08-2016 2
08-09-2016 2.5
08-10-2016 1
08-11-2016 1
Since the original table is too large, even I used 'Results to file', it still raise the exception 'System.OutOfMemoryException'. That's why I want to organize the table into this kind.
But I don't have a good logic to deal with. Therefore, I want to change the table into this kind as below.
Date_from Date_to Value
-------------------------------------------------
08-01-2016 08-05-2016 1
08-06-2016 08-08-2016 2
08-09-2016 08-09-2016 2.5
08-10-2016 08-11-2016 1
I appreciate your ideas!

Commonly called as Groups and Island problem. Here is one trick to do this
;WITH data
AS (SELECT *,Lag(Value, 1)OVER(ORDER BY Dates) [pVal]
FROM (VALUES ('08-01-2016',1 ),
('08-02-2016',1 ),
('08-03-2016',1 ),
('08-04-2016',1 ),
('08-05-2016',1 ),
('08-06-2016',2 ),
('08-07-2016',2 ),
('08-08-2016',2 ),
('08-09-2016',2.5 ),
('08-10-2016',1 ),
('08-11-2016',1 )) tc (Dates, Value)),
intr
AS (SELECT Dates,
Value,
Sum(Iif(pVal = Value, 0, 1)) OVER(ORDER BY Dates) AS [Counter]
FROM data)
SELECT Min(Dates) AS Dates_from,
Max(Dates) AS Dates_to,
Value
FROM intr
GROUP BY [Counter],
Value

The cumulative sum/lag approach is one method. In this case, a simpler method is:
select min(date) as date_from, max(date) as date_to, value
from (select t.*,
dateadd(day, - row_number() over (partition by value order by date),date) as grp
from t
) t
group by value, grp;
This uses the observation that the dates are consecutive with no gaps. Hence, subtracting a sequence from the date will yield a constant -- when the values are the same.

Here is an example:
DECLARE #T TABLE (
[Date] DATE,
[Value] DECIMAL(9,2)
)
INSERT #T VALUES
( '08-01-2016', 1 ),
( '08-02-2016', 1 ),
( '08-03-2016', 1 ),
( '08-04-2016', 1 ),
( '08-05-2016', 1 ),
( '08-06-2016', 2 ),
( '08-07-2016', 2 ),
( '08-08-2016', 2 ),
( '08-09-2016', 2.5 ),
( '08-10-2016', 1 ),
( '08-11-2016', 1 )
SELECT * FROM #T
SELECT A.[Date] StartDate, B.[Date] EndDate, A.[Value] FROM (
SELECT A.*, ROW_NUMBER() OVER (ORDER BY A.[Date], A.[Value]) O FROM #T A
LEFT JOIN #T B ON B.[Value] = A.[Value] AND B.[Date] = DATEADD(d, -1, A.[Date])
WHERE B.[Date] IS NULL
) A
JOIN (
SELECT A.*, ROW_NUMBER() OVER (ORDER BY A.[Date], A.[Value]) O FROM #T A
LEFT JOIN #T B ON B.[Value] = A.[Value] AND B.[Date] = DATEADD(d, 1, A.[Date])
WHERE B.[Date] IS NULL
) B ON B.O = A.O

Prdp's solution is great but just in case if anyone is still using SQL Server 2008 where LAG() and The Parallel Data Warehouse (PDW) features are not available here is an alternative:
SAMPLE DATA:
IF OBJECT_ID('tempdb..#Temp') IS NOT NULL
DROP TABLE #Temp;
CREATE TABLE #Temp([Dates] DATE
, [Value] FLOAT);
INSERT INTO #Temp([Dates]
, [Value])
VALUES
('08-01-2016'
, 1),
('08-02-2016'
, 1),
('08-03-2016'
, 1),
('08-04-2016'
, 1),
('08-05-2016'
, 1),
('08-06-2016'
, 2),
('08-07-2016'
, 2),
('08-08-2016'
, 2),
('08-09-2016'
, 2.5),
('08-10-2016'
, 1),
('08-11-2016'
, 1);
QUERY:
;WITH Seq
AS (SELECT SeqNo = ROW_NUMBER() OVER(ORDER BY [Dates]
, [Value])
, t.Dates
, t.[Value]
FROM #Temp t)
SELECT StartDate = MIN([Dates])
, EndDate = MAX([Dates])
, [Value]
FROM
(SELECT [Value]
, [Dates]
, SeqNo
, rn = SeqNo - ROW_NUMBER() OVER(PARTITION BY [Value] ORDER BY SeqNo)
FROM Seq s) a
GROUP BY [Value]
, rn
ORDER BY StartDate;
RESULTS:

Related

Calculating average by using the previous row's value and following row's value

I have calculated average values for each month. Some months are NULL and my manager wants me to use the previous row's value and following month's value and fill the months which are having NULL values.
Current result (see below pic):
Expected Result
DECLARE #DATE DATE = '2017-01-01';
WITH DATEDIM AS
(
SELECT DISTINCT DTM.FirstDayOfMonth
FROM DATEDIM DTM
WHERE Date >= '01/01/2017'
AND Date <= DATEADD(mm,-1,Getdate())
),
Tab1 AS
(
SELECT
T1.FirstDayOfMonth AS MONTH_START,
AVG1,
ROW_NUMBER() OVER (
ORDER BY DATEADD(MM,DATEDIFF(MM, 0, T1.FirstDayOfMonth),0) DESC
) AS RNK
FROM DATEDIM T1
LEFT OUTER JOIN (
SELECT
DATEADD(MM,DATEDIFF(MM, 0, StartDate),0) MONTH_START,
AVG(CAST(DATEDIFF(dd, StartDate, EndDate) AS FLOAT)) AS AVG1
FROM DATATable
WHERE EndDate >= StartDate
AND StartDate >= #DATE
AND EndDate >= #DATE
GROUP BY DATEADD(MM,DATEDIFF(MM, 0, StartDate),0)
) T2 ON T1.FirstDayOfMonth = T2.MONTH_START
)
SELECT *
FROM Tab1
Using your CTEs
select MONTH_START,
case when AVG1 is null then
(select top(1) t2.AVG1
from Tab1 t2
where t1.RNK > t2.RNK and t2.AVG1 is not null
order by t2.RNK desc)
else AVG1 end AVG1,
RNK
from Tab1 t1
Edit
Version for an average of nearest peceding and nearest following non-nulls. Both must exist otherwise NULL is returned.
select MONTH_START,
case when AVG1 is null then
( (select top(1) t2.AVG1
from Tab1 t2
where t1.RNK > t2.RNK and t2.AVG1 is not null
order by t2.RNK desc)
+(select top(1) t2.AVG1
from Tab1 t2
where t1.RNK < t2.RNK and t2.AVG1 is not null
order by t2.RNK)
) / 2
else AVG1 end AVG1,
RNK
from Tab1 t1
I can't quite tell what you are trying to calculate the average of, but this is quite simple with window functions:
select t.*,
avg(val) over (order by month_start rows between 1 preceding and 1 rollowing)
from t;
In your case, I think this translates as:
select datefromparts(year(startdate), month(startdate), 1) as float,
avg(val) as monthaverage,
avg(avg(val)) over (order by min(startdate) rows between 1 preceding and 1 following)
from datatable d
where . . .
group by datefromparts(year(startdate), month(startdate), 1)
You can manipulate previous and following row values using window functions:
SELECT MAX(row_value) OVER(
ORDER BY ... ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS Previous_Value,
MAX(row_value) OVER(
ORDER BY ... ROWS BETWEEN 1 FOLLOWING AND 1 FOLLOWING) AS Next_Value
Alternatively you can use LAG/LEAD functions and modify your sub-query where you get the AVG:
SELECT
src.MONTH_START,
CASE
WHEN src.prev_val IS NULL OR src.next_val IS NULL
THEN COALESCE(src.prev_val, src.next_val) -- Return non-NULL value (if exists)
ELSE (src.prev_val + src.next_val ) / 2
END AS AVG_new
FROM (
SELECT
DATEADD(MM,DATEDIFF(MM, 0, StartDate),0) MONTH_START,
LEAD(CAST(DATEDIFF(dd, StartDate, EndDate) AS FLOAT)) OVER(ORDER BY ...) AS prev_val,
LAG(CAST(DATEDIFF(dd, StartDate, EndDate) AS FLOAT)) OVER(ORDER BY ...) AS next_val
-- AVG(CAST(DATEDIFF(dd, StartDate, EndDate) AS FLOAT)) AS AVG1
FROM DATATable
WHERE EndDate >= StartDate
AND StartDate >= #DATE
AND EndDate >= #DATE
GROUP BY DATEADD(MM,DATEDIFF(MM, 0, StartDate),0)
) AS src
I haven't tested it, but give it a shot and see how it works. You may need to put at least one column in the ORDER BY portion of the window function.
You could try this query (I just reflected in my sample data relevant parts, I omitted date column):
declare #tbl table (rank int, value int);
insert into #tbl values
(1, null),
(2, 20),
(3, 30),
(4, null),
(5, null),
(6, null),
(7, 40),
(8, null),
(9, null),
(10, 36),
(11, 22);
;with cte as (
select *,
DENSE_RANK() over (order by case when value is null then rank else value end) drank,
case when value is null then lag(value) over (order by rank) end lag,
case when value is null then lead(value) over (order by rank) end lead
from #tbl
)
select rank, value, case when value is null then
max(lag) over (partition by grp) / 2 +
max(lead) over (partition by grp) / 2
else value end valueWithAvg
from (
select *,
rank - drank grp from cte
) a order by rank

Find all missing Numbers using one SQL query

The data looks like
[Month] [Date]
---------------
201306 1
201306 2
201306 5
201306 6
201306 7
201307 1
201307 4
201307 6
201309 1
201309 2
How to find all missing Dates by Month?
Here is the expected results
[Month] [Date]
---------------
201306 3
201306 4
201307 2
201307 3
201307 5
I think does it and pretty efficiently
declare #T table (yy int, dd int);
insert into #T values
(201306, 1)
, (201306, 2)
, (201306, 5)
, (201306, 6)
, (201306, 7)
, (201307, 1)
, (201307, 4)
, (201307, 6)
, (201309, 1)
, (201309, 2);
with cte as
( select yy, min(dd) + 1 as mn, max(dd) as mx
from #T
group by yy
having min(dd) + 1 < max(dd)
union all
select c.yy, c.mn + 1, c.mx
from cte c
where c.mn + 1 < c.mx
)
select yy, mn as dd
from cte
except
select yy, dd
from #T t
order by yy, mn;
yy dd
----------- -----------
201306 3
201306 4
201307 2
201307 3
201307 5
You would need some kind of lookup tables which could has intermediate dates and use cross join with left join to find missing dates
First thought
;with cte as (
select min(date) mdate, max(date) mxdate from table
union all
select mdate+1 as mdate, mxdate
from cte c
where c.mdate < c.mxdate
)
select distinct t.Month, c.mdate
from table t cross join (select mdate from cte) c
left join table t1 on t1.month = t.Month and t1.date = c.mdate
where t1.date is null
Second thought
;with cte as (
select month, min(date) over (partition by month) mdate, max(date) over (partition by month) mxdate
from sample t union all
select month, mdate+1 as mdate, mxdate
from cte c
where c.month = month and c.mdate < c.mxdate
)
select c.month, c.mdate
from cte c left join sample t1
on t1.month = c.Month and t1.date = c.mdate
where t1.date is null
group by c.month, c.mdate
Demo
Consider using a recursive query
with rndata as
(
select row_number() over (partition by mon order by d) rn, * from data
), rcte as
(
select mon, d, (select max(d) from data where data.mon = rndata.mon) max_d
from rndata where rn = 1
union all
select rcte.mon, rcte.d + 1, rcte.max_d
from rcte
where rcte.d + 1 < max_d
)
select mon, d
from rcte
where not exists (
select 1
from data
where rcte.mon = data.mon and
rcte.d = data.d
)
dbfiddle demo
Consider using below approach.
CREATE TABLE #Date([Month] int, [Date] int)
INSERT INTO #Date
VALUES(201306, 1)
,(201306, 2)
,(201306, 5)
,(201306, 6)
,(201306, 7)
,(201307, 1)
,(201307, 4)
,(201307, 6)
,(201309, 1)
,(201309, 2)
;WITH CTE AS
(
SELECT
*,LEAD([Date]) OVER(ORDER BY [Month],[Date]) AS NextDate
FROM #Date d
)
SELECT
d.[Month], m.Dt AS [Date]
FROM CTE d
CROSS APPLY( SELECT v.Dt
FROM
(VALUES(1),(2),(3),(4),(5),(6),(7),(8),(9),(10),(11)
,(11),(12),(13),(14),(15),(16),(17),(18),(19),(20)
,(21),(22),(23),(24),(25),(26),(27),(28),(29),(30),(31)
) AS v(Dt)
WHERE v.Dt > d.Date AND v.Dt < d.NextDate
) m
The only limitation this approach has is, it is not able to find missing days that falls before first date.
This is a fast but still simple solution:
1) use sys.sysobjects as tally table to gett all dates in each month.
2) calc min/max range for each month to keep only gaps inside the range.
3) join tally and range to get expected dates for each month, and left join your table to math existing dates.
4) filter in WHERE condition only missing dates
declare #T table ([month] int, [date] int);
insert into #T values
(201306, 1)
, (201306, 2)
, (201306, 5)
, (201306, 6)
, (201306, 7)
, (201307, 1)
, (201307, 4)
, (201307, 6)
, (201309, 1)
, (201309, 2);
with
n as (select top 31 ROW_NUMBER() over (order by id) n from sys.sysobjects),
r as (select [month], MIN([date]) dd1, MAX([date]) dd2 from #t group by [month])
select r.[month], n [date]
from r
join n on n between dd1 and dd2
left join #T t on n.N = t.[date] and r.[month] = t.[month]
where dd2<>dd1 and t.[date] is null
order by r.[month], n
You can use numbers/Tally table approach like below:
See live demo
create table sample ([Month] int, [Date] int)
insert into sample values
(201306, 1)
,(201306, 2)
,(201306, 5)
,(201306, 6)
,(201306, 7)
,(201307, 1)
,(201307, 4)
,(201307, 6)
,(201309, 1)
,(201309, 2);
; with daysinmonth as
(
select * from
(
values
(1,31),(2,28),(3,31),(4,30),(5,31),(6,30),
(7,31),(8,31),(9,30),(10,31),(11,30),(12,31)
) v(m,d)
)
select [month], dd
from sample
cross apply
(
select top
(
select d from
daysinmonth
where m=cast( right(cast([Month] as varchar(6)),2) as int)
)
row_number() over ( order by (select null)) dd
from
sys.tables t1 cross join
sys.tables t2
) c
where [date]<>dd

SQL Last activity of given type

So I have a Visitor table, and a Visitor_activity table. Say:
Visitor
Visitor_ID Int
Visitor_name varchar(20)
Visitor_Activity
ID Int
Visitor_ID Int
Activity_Type char(3) -- values IN or OUT
Activity_Time datetime
Visitors might sign in and out multiple times in a day.
I'd like a nice query to tell me all visitors who are in: i.e. the last activity for today (on activity_time) was an "IN" not an "OUT". Any advice much appreciated.
It's T-SQL by the way, but I think it's more of an in-principle question.
One way to solve this is to use a correlated not exists predicate:
select Activity_Time, Visitor_ID
from Visitor_Activity t1
where Activity_Type = 'IN'
and not exists (
select 1
from Visitor_Activity
where Activity_Type = 'OUT'
and Visitor_ID = t1.Visitor_ID
and Activity_Time > t1.Activity_Time
and cast(Activity_Time as date) = cast(t1.Activity_Time as date)
)
This basically says get all visitor_id that have type = IN for which there doesn't exists any type = OUT record with a later time (on the same date).
Sample SQL Fiddle
SELECT
v.*
FROM
Visitors v
JOIN Visitor_Activity va ON va.Visitor_ID = v.Visitor_ID
WHERE
va.Activity_Type = 'IN'
AND NOT EXISTS ( SELECT
*
FROM
Visitor_Activity va_out
WHERE
va_out.Visitor_ID = va.Visitor_ID
AND va_out.Activity_Type = 'OUT'
AND va_out.Activity_Time > va.Activity_Time )
with visitorsInOut as (
select Visitor_id,
max(case when Activity_Type = 'in' then Activity_Time else null end) inTime,
max(case when Activity_Type = 'out' then Activity_Time else null end) outTime
from Visitor_Activity
where datediff(dd, Activity_Time, getdate()) = 0
group by Visitor_id)
select Visitor_id
from visitorsInOut
where inTime > outTime or outTime is null
This uses a CTE to find the activity record with the greatest Activity_Time where the Activity_Type = 'IN' and assigns it RowNum 1. Then you can INNER JOIN the CTE to the Visitor table, filtering by the CTE results where RowNum = 1.
; WITH VisAct AS(
SELECT act.Visitor_ID
, ROW_NUMBER() OVER(PARTITION BY Visitor_ID ORDER BY Activity_Time DESC) AS RowNum
FROM Visitor_Activity act
WHERE act.Activity_Type = 'IN'
AND act.Activity_Time >= CAST(GETDATE() AS DATE)
)
SELECT vis.Visitor_ID, vis.Visitor_name
FROM Visitor vis
INNER JOIN VisAct act
ON act.Visitor_ID = vis.Visitor_ID
WHERE act.Row_Num = 1
You can pull the most recent action for each visitor, and then only return those where the last action for today was to check in.
SELECT v.Visitor_ID, v.Visitor_Name, va.Activity_Type, va.Activity_Time
FROM Visitor AS v
INNER JOIN (SELECT Visitor_ID, Activity_Type, Activity_Time, RANK() OVER (PARTITION BY Visitor_ID ORDER BY Activity_Time DESC) AS LastAction
FROM Visitor_Activity
-- checks for today, can be omitted if you still want
-- to see someone checked in from yesterday
WHERE DATEDIFF(d, 0, Activity_Time) = DATEDIFF(d, 0, getdate())
) AS va ON va.Visitor_ID = v.Visitor_ID
WHERE LastAction = 1
AND Activity_Type = 'IN'
With CROSS APPLY:
DECLARE #d DATE = '20150320'
DECLARE #v TABLE
(
visitor_id INT ,
visitor_name NVARCHAR(MAX)
)
DECLARE #a TABLE
(
visitor_id INT ,
type CHAR(3) ,
time DATETIME
)
INSERT INTO #v
VALUES ( 1, 'A' ),
( 2, 'B' ),
( 3, 'C' )
INSERT INTO #a
VALUES ( 1, 'in', '2015-03-20 19:32:27.513' ),
( 1, 'out', '2015-03-20 19:32:27.514' ),
( 1, 'in', '2015-03-20 19:32:27.515' ),
( 2, 'in', '2015-03-20 19:32:27.516' ),
( 2, 'out', '2015-03-20 19:32:27.517' ),
( 3, 'in', '2015-03-20 19:32:27.518' ),
( 3, 'out', '2015-03-20 19:32:27.519' ),
( 3, 'in', '2015-03-20 19:32:27.523' )
SELECT *
FROM #v v
CROSS APPLY ( SELECT *
FROM ( SELECT TOP 1
type
FROM #a a
WHERE a.visitor_id = v.visitor_id
AND a.time >= #d
AND a.time < DATEADD(dd, 1, #d)
ORDER BY time DESC
) i
WHERE type = 'in'
) c
Output:
visitor_id visitor_name type
1 A in
3 C in
The principle:
First you are selecting all visitors.
Then you are applying to visitor last activity
SELECT TOP 1
type
FROM #a a
WHERE a.visitor_id = v.visitor_id
AND a.time >= #d
AND a.time < DATEADD(dd, 1, #d)
ORDER BY time DESC
Then you are selecting from previous step in order to get empty set which will filter out visitors whose last activity was not 'in'. If last activity was 'in' you get one row in result and thus applying works. If last activity is 'out' then outer query will result in empty set, and by design CROSS APPLY will eliminate such visitor.

FIFO Balance at every month end with sql

I am trying to get the balance quantity at the end of every month i.e. running total at every month end with FIFO only but its not showing any result. Pls help
Here is my query.
declare #Stock table (Item char(3) not null,Date date not null,TxnType varchar(3) not null,Qty int not null,Price decimal(10,2) null)
insert into #Stock(Item , [Date] , TxnType, Qty, Price) values
('ABC','20120401','IN', 200, 750.00),
('ABC','20120402','OUT', 100 ,null ),
('ABC','20120403','IN', 50, 700.00),
('ABC','20120404','IN', 75, 800.00),
('ABC','20120405','OUT', 175, null ),
('XYZ','20120406','IN', 150, 350.00),
('XYZ','20120407','OUT', 120 ,null ),
('XYZ','20120408','OUT', 10 ,null ),
('XYZ','20120409','IN', 90, 340.00),
('ABC','20120510','IN', 200, 750.00),
('ABC','20120511','OUT', 100 ,null ),
('ABC','20120512','IN', 50, 700.00),
('ABC','20120513','IN', 75, 800.00),
('ABC','20120514','OUT', 175, null ),
('XYZ','20120515','IN', 150, 350.00),
('XYZ','20120516','OUT', 120 ,null ),
('XYZ','20120517','OUT', 10 ,null ),
('XYZ','20120518','IN', 90, 340.00);
;WITH OrderedIn as (
select *,ROW_NUMBER() OVER (PARTITION BY month(date) ORDER BY DATE) as rn
from #Stock
where TxnType = 'IN'
), RunningTotals as (
select Item,Qty,Price,Qty as Total,0 as PrevTotal,rn from OrderedIn where rn = 1
union all
select rt.Item,oi.Qty,oi.Price,rt.Total + oi.Qty,rt.Total,oi.rn
from
RunningTotals rt
inner join
OrderedIn oi
on
rt.Item = oi.Item and
rt.rn = oi.rn - 1
), TotalOut as (
select Item,SUM(Qty) as Qty from #Stock where TxnType='OUT' group by Item
)
select
rt.Item,SUM(CASE WHEN PrevTotal > out.Qty THEN rt.Qty ELSE rt.Total - out.Qty END * Price)
from
RunningTotals rt
inner join
TotalOut out
on
rt.Item = out.Item
where
rt.Total > out.Qty
group by rt.Item
Output
Month Item (No column name(qty*price))
4 ABC 40000
4 XYZ 37600
5 ABC 77500
5 XYZ 76100
With you sample data.Are you looking for this,else clearly sketch your desired output .
;WITH CTE AS
(
select Item ,[Date] ,TxnType ,s.Qty
,Price ,ROW_NUMBER() OVER (PARTITION BY month(date),TxnType ORDER BY DATE) as rn
from #Stock S
)
,
CTE1 AS
(
SELECT ITEM ,[DATE] ,TXNTYPE ,QTY,MONTH(DATE) MONTHDT,RN FROM CTE WHERE RN=1
UNION ALL
SELECT S.ITEM ,S.[DATE] ,S.TXNTYPE ,S.QTY+A.QTY,A.MONTHDT ,S.RN
FROM CTE S INNER JOIN CTE1 A
ON S.TXNTYPE=A.TXNTYPE AND MONTH(S.[DATE])=A.MONTHDT AND S.RN-A.RN=1
)
,CTE2 AS
(
SELECT *
,ROW_NUMBER() OVER (PARTITION BY month(date),TxnType ORDER BY QTY DESC )RN1
FROM CTE1 S
)
SELECT A.DATE,A.QTY-(SELECT B.Qty from CTE2 B
where a.MONTHDT=b.MONTHDT and b.RN1=1 AND b.TxnType='OUT')
from CTE2 A
WHERE A.RN1=1 AND a.TxnType='IN'
Try this.
;WITH cte
AS (SELECT Row_number()OVER(partition BY item, Datepart(mm, date) ORDER BY date DESC) rn,
item,
Month(date) AS [month],
TxnType,
Price,
date
FROM #Stock
WHERE TxnType = 'IN'),
cte1
AS (SELECT Row_number() OVER(partition BY item, Datepart(mm, date) ORDER BY date DESC) rn,
item,
Month(date) [month],
(SELECT Sum(CASE
WHEN TxnType = 'out' THEN -1 * qty
ELSE qty
END) AS qty
FROM #Stock b
WHERE a.item = b.item
AND a.Date >= b.Date) qty
FROM #Stock a)
SELECT a.[month],
a.Item,
( a.Price * b.qty ) running_tot
FROM cte a
JOIN cte1 b
ON a.Item = b.Item
AND a.[month] = b.[month]
WHERE a.rn = 1
AND b.rn = 1

How to merge time intervals in SQL Server

Suppose I have the following an event table with personId, startDate and endDate.
I want to know how much time the person X spent doing an event (the events can override each other).
If the person just has 1 event, its easy: datediff(dd, startDate, endDate)
If the person has 2 events it gets tricky.
I'll set some scenarios for the expected results.
Scenario 1
startDate endDate
1 4
3 5
This means he the results should be the datediff from 1 to 5
Scenario 2
startDate endDate
1 3
6 9
this means he the results should be the some of datediff(dd,1,3) and datediff(dd,6,9)
How can I get this result on an sql query? I can only think of a bunch of if statements, but the same person can have n events so the query will be really confusing.
Shredder Edit: I'd like to add a 3rd scenario:
startDate endDate
1 5
4 8
11 15
Desired result to Shredder scenario:
(1,5) and (4,8) merge in (1,8) since they overlap then we need to datediff(1,8) + datediff(11,15) => 7 + 4 => 11
You can use a recursive CTE to build a list of dates and then count the distinct dates.
declare #T table
(
startDate date,
endDate date
);
insert into #T values
('2011-01-01', '2011-01-05'),
('2011-01-04', '2011-01-08'),
('2011-01-11', '2011-01-15');
with C as
(
select startDate,
endDate
from #T
union all
select dateadd(day, 1, startDate),
endDate
from C
where dateadd(day, 1, startDate) < endDate
)
select count(distinct startDate) as DayCount
from C
option (MAXRECURSION 0)
Result:
DayCount
-----------
11
Or you can use a numbers table. Here I use master..spt_values:
declare #MinStartDate date
select #MinStartDate = min(startDate)
from #T
select count(distinct N.number)
from #T as T
inner join master..spt_values as N
on dateadd(day, N.Number, #MinStartDate) between T.startDate and dateadd(day, -1, T.endDate)
where N.type = 'P'
Here's a solution that uses the Tally table idea (which I first heard of in an article by Itzk Ben-Gan -- I still cut and paste his code whenver the subject comes up). The idea is to generate a list of ascending integers, join the source data by range against the numbers, and then count the number of distinct numbers, as follows. (This code uses syntax from SQL Server 2008, but with minor modifications would work in SQL 2005.)
First set up some testing data:
CREATE TABLE #EventTable
(
PersonId int not null
,startDate datetime not null
,endDate datetime not null
)
INSERT #EventTable
values (1, 'Jan 1, 2011', 'Jan 4, 2011')
,(1, 'Jan 3, 2011', 'Jan 5, 2011')
,(2, 'Jan 1, 2011', 'Jan 3, 2011')
,(2, 'Jan 6, 2011', 'Jan 9, 2011')
Determine some initial values
DECLARE
#Interval bigint
,#FirstDay datetime
,#PersonId int = 1 -- (or whatever)
Get the first day and the maximum possible number of dates (to keep the cte from generating extra values):
SELECT
#Interval = datediff(dd, min(startDate), max(endDate)) + 1
,#FirstDay = min(startDate)
from #EventTable
where PersonId = #PersonId
Cut and paste over the one routine and modify and test it to only return as many integers as we'll need:
/*
;WITH
Pass0 as (select 1 as C union all select 1), --2 rows
Pass1 as (select 1 as C from Pass0 as A, Pass0 as B),--4 rows
Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows
Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows
Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows
Pass5 as (select 1 as C from Pass4 as A, Pass4 as B),--4,294,967,296 rows
Tally as (select row_number() over(order by C) as Number from Pass5)
select Number from Tally where Number <= #Interval
*/
And now revise it by first joining to the intervals defined in each source row, and then count each distinct value found:
;WITH
Pass0 as (select 1 as C union all select 1), --2 rows
Pass1 as (select 1 as C from Pass0 as A, Pass0 as B),--4 rows
Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows
Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows
Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows
Pass5 as (select 1 as C from Pass4 as A, Pass4 as B),--4,294,967,296 rows
Tally as (select row_number() over(order by C) as Number from Pass5)
SELECT PersonId, count(distinct Number) EventDays
from #EventTable et
inner join Tally
on dateadd(dd, Tally.Number - 1, #FirstDay) between et.startDate and et.endDate
where et.PersonId = #PersonId
and Number <= #Interval
group by PersonId
Take out the #PersonId filter and you'd get it for all persons. And with minor modification you can do it for any time interval, not just days (which is why I set the Tally table to generate severely large numbers.)
The following SQL is for the three scenarios you've described
with sampleData
AS (
SELECT 1 personid,1 startDate,4 endDate
UNION SELECT 1,3,5
UNION SELECT 2,1,3
UNION SELECT 2,6,9
UNION SELECT 3,1,5
UNION SELECT 3,4,8
UNION SELECT 3,11, 15
),
cte
AS (SELECT personid,
startdate,
enddate,
Row_number() OVER(ORDER BY personid, startdate) AS rn
FROM sampledata),
overlaps
AS (SELECT a.personid,
a.startdate,
b.enddate,
a.rn id1,
b.rn id2
FROM cte a
INNER JOIN cte b
ON a.personid = b.personid
AND a.enddate > b.startdate
AND a.rn = b.rn - 1),
nooverlaps
AS (SELECT a.personid,
a.startdate,
a.enddate
FROM cte a
LEFT JOIN overlaps b
ON a.rn = b.id1
OR a.rn = b.id2
WHERE b.id1 IS NULL)
SELECT personid,
SUM(timespent) timespent
FROM (SELECT personid,
enddate - startdate timespent
FROM nooverlaps
UNION
SELECT personid,
enddate - startdate
FROM overlaps) t
GROUP BY personid
Produces this result
Personid timeSpent
----------- -----------
1 4
2 5
3 11
Notes: I used the simple integers but the DateDiffs should work too
Correctness issue There is a correctness issue if your data is allowed to have multiple overlaps as Cheran S noted, the results won't be correct and you should use one of the other answers instead. His example used [1,5],[4,8],[7,11] for the same person ID
Algebra. If B-n is the ending time of the nth event, and A-n is the starting time of the nth event, then the sum of the differences is the difference of the sums. So you can write
select everything else, sum(cast(endDate as int)) - sum(cast(startDate as int)) as daysSpent
If your dates have no time component, this works. Otherwise, you could use a real.
Try something like this
select
personId,
sum(DateDuration) as TotalDuration
from
(
select personId, datediff(dd, startDate, endDate) as DateDuration
from yourEventTable
) a
group by personId
;WITH cte(gap)
AS
(
SELECT sum(b-a) from xxx GROUP BY uid
)
SELECT * FROM cte
Edit 1: I have modified both solutions to get correct results.
Edit 2: I have done comparative tests using the solutions proposed by Mikael Eriksson, Conrad Frix, Philip Kelley and me. All tests use an EventTable with the following structure:
CREATE TABLE EventTable
(
EventID INT IDENTITY PRIMARY KEY
,PersonId INT NOT NULL
,StartDate DATETIME NOT NULL
,EndDate DATETIME NOT NULL
,CONSTRAINT CK_StartDate_Before_EndDate CHECK(StartDate < EndDate)
);
Also, all tests use warm buffer (no DBCC DROPCLEANBUFFERS) and cold [plan] cache (I have executed DBCC FREEPROCCACHE before every test). Because some solutions use a filter(PersonId = 1) and others not, I have inserted into EventTable rows for only one person (INSERT ...(PersonId,...) VALUES (1,...)).
These are the results:
My solutions use recursive CTEs.
Solution 1:
WITH BaseCTE
AS
(
SELECT e.StartDate
,e.EndDate
,e.PersonId
,ROW_NUMBER() OVER(PARTITION BY e.PersonId ORDER BY e.StartDate, e.EndDate) RowNumber
FROM EventTable e
), RecursiveCTE
AS
(
SELECT b.PersonId
,b.RowNumber
,b.StartDate
,b.EndDate
,b.EndDate AS MaxEndDate
,1 AS PseudoDenseRank
FROM BaseCTE b
WHERE b.RowNumber = 1
UNION ALL
SELECT crt.PersonId
,crt.RowNumber
,crt.StartDate
,crt.EndDate
,CASE WHEN crt.EndDate > prev.MaxEndDate THEN crt.EndDate ELSE prev.MaxEndDate END
,CASE WHEN crt.StartDate <= prev.MaxEndDate THEN prev.PseudoDenseRank ELSE prev.PseudoDenseRank + 1 END
FROM RecursiveCTE prev
INNER JOIN BaseCTE crt ON prev.PersonId = crt.PersonId
AND prev.RowNumber + 1 = crt.RowNumber
), SumDaysPerPersonAndInterval
AS
(
SELECT src.PersonId
,src.PseudoDenseRank --Interval ID
,DATEDIFF(DAY, MIN(src.StartDate), MAX(src.EndDate)) Days
FROM RecursiveCTE src
GROUP BY src.PersonId, src.PseudoDenseRank
)
SELECT x.PersonId, SUM( x.Days ) DaysPerPerson
FROM SumDaysPerPersonAndInterval x
GROUP BY x.PersonId
OPTION(MAXRECURSION 32767);
Solution 2:
DECLARE #Base TABLE --or a temporary table: CREATE TABLE #Base (...)
(
PersonID INT NOT NULL
,StartDate DATETIME NOT NULL
,EndDate DATETIME NOT NULL
,RowNumber INT NOT NULL
,PRIMARY KEY(PersonID, RowNumber)
);
INSERT #Base (PersonID, StartDate, EndDate, RowNumber)
SELECT e.PersonId
,e.StartDate
,e.EndDate
,ROW_NUMBER() OVER(PARTITION BY e.PersonID ORDER BY e.StartDate, e.EndDate) RowNumber
FROM EventTable e;
WITH RecursiveCTE
AS
(
SELECT b.PersonId
,b.RowNumber
,b.StartDate
,b.EndDate
,b.EndDate AS MaxEndDate
,1 AS PseudoDenseRank
FROM #Base b
WHERE b.RowNumber = 1
UNION ALL
SELECT crt.PersonId
,crt.RowNumber
,crt.StartDate
,crt.EndDate
,CASE WHEN crt.EndDate > prev.MaxEndDate THEN crt.EndDate ELSE prev.MaxEndDate END
,CASE WHEN crt.StartDate <= prev.MaxEndDate THEN prev.PseudoDenseRank ELSE prev.PseudoDenseRank + 1 END
FROM RecursiveCTE prev
INNER JOIN #Base crt ON prev.PersonId = crt.PersonId
AND prev.RowNumber + 1 = crt.RowNumber
), SumDaysPerPersonAndInterval
AS
(
SELECT src.PersonId
,src.PseudoDenseRank --Interval ID
,DATEDIFF(DAY, MIN(src.StartDate), MAX(src.EndDate)) Days
FROM RecursiveCTE src
GROUP BY src.PersonId, src.PseudoDenseRank
)
SELECT x.PersonId, SUM( x.Days ) DaysPerPerson
FROM SumDaysPerPersonAndInterval x
GROUP BY x.PersonId
OPTION(MAXRECURSION 32767);