SQL group by - How to get new records that did not exist in any previous month - sql

I have the following data.
DECLARE #Table TABLE(
RequestID varchar(100)
, PersonID varchar(100)
, StartDate DATE
, EndDate DATE
, StatusType VARCHAR(150)
)
INSERT INTO #Table
SELECT '445566', 'Person2', '9/13/2022', '9/16/2022', 'TransactionStarted'
UNION ALL
SELECT '445566', 'Person2', '9/13/2022', '9/16/2022', 'TransactionEnded'
UNION ALL
SELECT '112233', 'Person2', '8/13/2022', '8/16/2022', 'TransactionStarted'
UNION ALL
SELECT '112233', 'Person2', '8/13/2022', '8/16/2022', 'TransactionEnded'
UNION ALL
SELECT '556677', 'Person1', '8/10/2022', '8/12/2022', 'TransactionStarted'
UNION ALL
SELECT '556677', 'Person1', '8/10/2022', '8/12/2022', 'TransactionEnded'
I get the data out like this, and it works well
SELECT COUNT(Distinct RequestID) TotalCountPerPerson, DATENAME(mm, StartDate) [SRMonthName], MONTH(StartDate) [SRMonthNumber], YEAR(StartDate) [SRYear]
FROM #Table
GROUP BY MONTH(StartDate), DATENAME(mm,StartDate), YEAR(StartDate)
ORDER BY YEAR(StartDate), MONTH(StartDate)
But, I need an additional point of data that I am having trouble figuring out how to get. In the sample data, You can see that "Person 2" and "Person1" were both "new" to the system in August. I need a way to add them to the query so that I can get Total Count and New Count per month and year. Anyone help with the query?
SELECT 2 TotalCountPerPerson, 2 NewThisMonth, 'August'[SRMonthName], 8 [SRMonthNumber], 2022 [SRYear]
UNION ALL
SELECT 1, 0 NewThisMonth,'September', 9, 2022

You may use a self left join as the following:
SELECT COUNT(Distinct T.PersonID) TotalCountPerPerson,
COUNT(Distinct CASE WHEN D.PersonID IS NULL THEN T.PersonID END) NewThisMonth,
DATENAME(mm, T.StartDate) [SRMonthName],
MONTH(T.StartDate) [SRMonthNumber], YEAR(T.StartDate) [SRYear]
FROM #Table T LEFT JOIN #Table D
ON T.PersonID = D.personID AND T.StartDate > D.StartDate
GROUP BY MONTH(T.StartDate), DATENAME(mm,T.StartDate), YEAR(T.StartDate)
ORDER BY YEAR(T.StartDate), MONTH(T.StartDate)
Also, you may use EXISTS operator as the following:
WITH CTE AS
(
SELECT *, CASE WHEN NOT EXISTS
(SELECT 1 FROM #Table D WHERE D.PersonID = T.PersonID AND D.StartDate<T.StartDate)
THEN T.PersonID
END AS ISNEW
FROM #Table T
)
SELECT COUNT(Distinct PersonID) TotalCountPerPerson,
COUNT(Distinct ISNEW) NewThisMonth,
DATENAME(mm, StartDate) [SRMonthName],
MONTH(StartDate) [SRMonthNumber], YEAR(StartDate) [SRYear]
FROM CTE
GROUP BY MONTH(StartDate), DATENAME(mm,StartDate), YEAR(StartDate)
ORDER BY YEAR(StartDate), MONTH(StartDate);
See a demo.

Related

Taking most recent values in sum over date range

I have a table which has the following columns: DeskID *, ProductID *, Date *, Amount (where the columns marked with * make the primary key). The products in use vary over time, as represented in the image below.
Table format on the left, and a (hopefully) intuitive representation of the data on the right for one desk
The objective is to have the sum of the latest amounts of products by desk and date, including products which are no longer in use, over a date range.
e.g. using the data above the desired table is:
So on the 1st Jan, the sum is 1 of Product A
On the 2nd Jan, the sum is 2 of A and 5 of B, so 7
On the 4th Jan, the sum is 1 of A (out of use, so take the value from the 3rd), 5 of B, and 2 of C, so 8 in total
etc.
I have tried using a partition on the desk and product ordered by date to get the most recent value and turned the following code into a function (Function1 below) with #date Date parameter
select #date 'Date', t.DeskID, SUM(t.Amount) 'Sum' from (
select #date 'Date', t.DeskID, t.ProductID, t.Amount
, row_number() over (partition by t.DeskID, t.ProductID order by t.Date desc) as roworder
from Table1 t
where 1 = 1
and t.Date <= #date
) t
where t.roworder = 1
group by t.DeskID
And then using a utility calendar table and cross apply to get the required values over a time range, as below
select * from Calendar c
cross apply Function1(c.CalendarDate)
where c.CalendarDate >= '20190101' and c.CalendarDate <= '20191009'
This has the expected results, but is far too slow. Currently each desk uses around 50 products, and the products roll every month, so after just 5 years each desk has a history of ~3000 products, which causes the whole thing to grind to a halt. (Roughly 30 seconds for a range of a single month)
Is there a better approach?
Change your function to the following should be faster:
select #date 'Date', t.DeskID, SUM(t.Amount) 'Sum'
FROM (SELECT m.DeskID, m.ProductID, MAX(m.[Date) AS MaxDate
FROM Table1 m
where m.[Date] <= #date) d
INNER JOIN Table1 t
ON d.DeskID=t.DeskID
AND d.ProductID=t.ProductID
and t.[Date] = d.MaxDate
group by t.DeskID
The performance of TVF usually suffers. The following removes the TVF completely:
-- DROP TABLE Table1;
CREATE TABLE Table1 (DeskID int not null, ProductID nvarchar(32) not null, [Date] Date not null, Amount int not null, PRIMARY KEY ([Date],DeskID,ProductID));
INSERT Table1(DeskID,ProductID,[Date],Amount)
VALUES (1,'A','2019-01-01',1),(1,'A','2019-01-02',2),(1,'B','2019-01-02',5),(1,'A','2019-01-03',1)
,(1,'B','2019-01-03',4),(1,'C','2019-01-03',3),(1,'B','2019-01-04',5),(1,'C','2019-01-04',2),(1,'C','2019-01-05',2)
GO
DECLARE #StartDate date=N'2019-01-01';
DECLARE #EndDate date=N'2019-01-05';
;WITH cte_p
AS
(
SELECT DISTINCT DeskID,ProductID
FROM Table1
WHERE [Date] <= #EndDate
),
cte_a
AS
(
SELECT #StartDate AS [Date], p.DeskID, p.ProductID, ISNULL(a.Amount,0) AS Amount
FROM (
SELECT t.DeskID, t.ProductID
, MAX(t.Date) AS FirstDate
FROM Table1 t
WHERE t.Date <= #StartDate
GROUP BY t.DeskID, t.ProductID) f
INNER JOIN Table1 a
ON f.DeskID=a.DeskID
AND f.ProductID=a.ProductID
AND f.[FirstDate]=a.[Date]
RIGHT JOIN cte_p p
ON p.DeskID=a.DeskID
AND p.ProductID=a.ProductID
UNION ALL
SELECT DATEADD(DAY,1,a.[Date]) AS [Date], t.DeskID, t.ProductID, t.Amount
FROM Table1 t
INNER JOIN cte_a a
ON t.DeskID=a.DeskID
AND t.ProductID=a.ProductID
AND t.[Date] > a.[Date]
AND t.[Date] <= DATEADD(DAY,1,a.[Date])
WHERE a.[Date]<#EndDate
UNION ALL
SELECT DATEADD(DAY,1,a.[Date]) AS [Date], a.DeskID, a.ProductID, a.Amount
FROM cte_a a
WHERE NOT EXISTS(SELECT 1 FROM Table1 t
WHERE t.DeskID=a.DeskID
AND t.ProductID=a.ProductID
AND t.[Date] > a.[Date]
AND t.[Date] <= DATEADD(DAY,1,a.[Date]))
AND a.[Date]<#EndDate
)
SELECT [Date], DeskID, SUM(Amount)
FROM cte_a
GROUP BY [Date], DeskID;

Display Month Gaps for Each location

I have the following query which takes in the opps and calculates the duration, and revenue for each month. However, for some locations, where there is no data, it is missing some months. Essentially, I would like all months to appear for each of the location and record type. I tried a left outer join on the calendar but that didn't seem to work either.
Here is the query:
;With DateSequence( [Date] ) as
(
Select CAST(#fromdate as DATE) as [Date]
union all
Select CAST(dateadd(day, 1, [Date]) as Date)
from DateSequence
where Date < #todate
)
INSERT INTO CalendarTemp (Date, Day, DayOfWeek, DayOfYear, WeekOfYear, Month, MonthName, Year)
Select
[Date] as [Date],
DATEPART(DAY,[Date]) as [Day],
DATENAME(dw, [Date]) as [DayOfWeek],
DATEPART(DAYOFYEAR,[Date]) as [DayOfYear],
DATEPART(WEEK,[Date]) as [WeekOfYear],
DATEPART(MONTH,[Date]) as [Month],
DATENAME(MONTH,[Date]) as [MonthName],
DATEPART(YEAR,[Date]) as [Year]
from DateSequence option (MaxRecursion 10000)
;
DELETE FROM CalendarTemp WHERE DayOfWeek IN ('Saturday', 'Sunday');
SELECT
AccountId
,AccountName
,Office
,Stage = (CASE WHEN StageName = 'Closed Won' THEN 'Closed Won'
ELSE 'Open'
END)
,Id
,Name
,RecordType= (CASE
WHEN recordtypeid = 'LAS1' THEN 'S'
END)
,Start_Date
,End_Date
,Probability
,Estimated_Revenue_Won = ISNULL(Amount, 0)
,ROW_NUMBER() OVER(PARTITION BY Name ORDER BY Name) AS Row
--,Revenue_Per_Day = CAST(ISNULL(Amount/NULLIF(dbo.CalculateNumberOFWorkDays(Start_Date, End_Date),0),0) as money)
,YEAR(c.Date) as year
,MONTH(c.Date) as Month
,c.MonthName
--, ISNULL(CAST(Sum((Amount)/NULLIF(dbo.CalculateNumberOFWorkDays(Start_Date, End_Date),0)) as money),0) As RevenuePerMonth
FROM SF_Extracted_Opps o
LEFT OUTER JOIN CalendarTemp c on o.Start_Date <= c.Date AND o.End_Date >= c.Date
WHERE
Start_Date <= #todate AND End_Date >= #fromdate
AND Office IN (#Location)
AND recordtypeid IN ('LAS1')
GROUP BY
AccountId
,AccountName
,Office
,(CASE WHEN StageName = 'Closed Won' THEN 'Closed Won'
ELSE 'Open'
END)
,Id
,Name
,(CASE
WHEN recordtypeid = 'LAS1' THEN 'S'
END)
,Amount
--, CAST(ISNULL(Amount/NULLIF(dbo.CalculateNumberOFWorkDays(Start_Date, End_Date),0),0) as money)
,Start_Date
,End_Date
,Probability
,YEAR(c.Date)
,Month(c.Date)
,c.MonthName
,dbo.CalculateNumberOFWorkDays(Start_Date, End_Date)
ORDER BY Office
, (CASE
WHEN recordtypeid = 'LAS1' THEN 'S'
END)
,(CASE WHEN StageName = 'Closed Won' THEN 'Closed Won'
ELSE 'Open'
END)
, [Start_Date], Month(c.Date), AccountName, Row;
I tried adding another left outer join to this and using this a sub query and the join essentially on the calendar based on the year and month, but that did not seem to work either. Suggestions would be extremely appreciated.
--Date Calendar for each location:
;With DateSequence( [Date], Locatio) as
(
Select CAST(#fromdate as DATE) as [Date], oo.Office as location
union all
Select CAST(dateadd(day, 1, [Date]) as Date), oo.Office as location
from DateSequence dts
join Opportunity_offices oo on 1 = 1
where Date < #todate
)
--select result
INSERT INTO CalendarTemp (Location,Date, Day, DayOfWeek, DayOfYear, WeekOfYear, Month, MonthName, Year)
Select
location,
[Date] as [Date],
DATEPART(DAY,[Date]) as [Day],
DATENAME(dw, [Date]) as [DayOfWeek],
DATEPART(DAYOFYEAR,[Date]) as [DayOfYear],
DATEPART(WEEK,[Date]) as [WeekOfYear],
DATEPART(MONTH,[Date]) as [Month],
DATENAME(MONTH,[Date]) as [MonthName],
DATEPART(YEAR,[Date]) as [Year]
from DateSequence option (MaxRecursion 10000)
;
you have your LEFT JOIN backwards if you want all records from CalendarTemp and only those that match from SF_Extracted_Opps then you the CalendarTemp should be the table on the LEFT. You can however switch LEFT JOIN to RIGHT JOIN and it should be fixed. The other issue will be your WHERE statement is using columns from your SF_Extracted_Opps table which will just make that an INNER JOIN again.
here is one way to fix.
SELECT
.....
FROM
CalendarTemp c
LEFT JOIN SF_Extracted_Opps o
ON o.Start_Date <= c.Date AND o.End_Date >= c.Date
AND o.Start_Date <= #todate AND End_Date >= #fromdate
AND o.Office IN (#Location)
AND o.recordtypeid IN ('LAS1')
The other issue you might run into is because you remove weekends from your CalendarTemp Table not all dates are represented I would test with the weekends still in and out and see if you get different results.
this line:
AND o.Start_Date <= #todate AND End_Date >= #fromdate
should not be needed either because you are already limiting the dates from the line before and values in your CalendarTempTable
A note about your CalendarDate table you don't have to go back and delete those records simply add the day of week as a WHERE statement on the select that populates that table.
Edit for All Offices you can use a cross join of your offices table with your CalendarTemp table to do this do it in your final query not the cte that builds the calendar. The problem with doing it in the CTE calendar definition is that it is recursive so you would have to do it in both the anchor and the recursive member definition.
SELECT
.....
FROM
CalendarTemp c
CROSS JOIN Opportunity_offices oo
LEFT JOIN SF_Extracted_Opps o
ON o.Start_Date <= c.Date AND o.End_Date >= c.Date
AND o.Start_Date <= #todate AND End_Date >= #fromdate
AND oo.office = o.Office
AND o.recordtypeid IN ('LAS1')

Normalization of Year bringing nulls back

I have the following query:
SELECT DISTINCT
YEAR(DateRegistered) as Years,
Months.[MonthName],
COUNT(UserID)as totalReg
FROM
Months WITH(NOLOCK)
LEFT OUTER JOIN
UserProfile WITH(NOLOCK)
ON
Months.MonthID = MONTH(DateRegistered)
AND
DateRegistered > DATEADD(MONTH, -12,GETDATE())
GROUP BY YEAR(DateRegistered), Months.[MonthName]
ORDER BY Months.[MonthName]
As you can tell this will always bring back 12 months worth of data. As such it is working, although there is a bug with this method.
It creates Null values in months where there is no data, now the record should exist(whole point of the query) but Year field is bringing Nulls which is something I dont want.
Now I understand the problem is because there is no data, how is it supposed to know what year?
So my question is - is there any way to sort this out and replace the nulls? I suspect I will have to completely change my methodology.
**YEAR** **MONTH** **TOTAL**
2013 April 1
2013 August 1
NULL December 0
2013 February 8
2013 January 1
2013 July 1
NULL June 0
2013 March 4
NULL May 0
NULL November 0
NULL October 0
2012 September 3
If you want 12 months of data, then construct a list of numbers from 1 to 12 and use these as offsets with getdate():
with nums as (
select 12 as level union all
select level - 1
from nums
where level > 1
)
select YEAR(thedate) as Years,
Months.[MonthName],
COUNT(UserID) as totalReg
FROM (select DATEADD(MONTH, - nums.level, GETDATE()) as thedate
from nums
) mon12 left outer join
Months WITH (NOLOCK)
on month(mon12.thedate) = months.monthid left outer join
UserProfile WITH (NOLOCK)
ON Months.MonthID = MONTH(DateRegistered) and
DateRegistered > DATEADD(MONTH, -12, GETDATE())
GROUP BY YEAR(thedate), Months.[MonthName]
ORDER BY Months.[MonthName];
I find something strange about the query though. You are defining the span from the current date. However, you seem to be splitting the months themselves on calendar boundaries. I also find the table months to be awkward. Why aren't you just using the datename() and month() functions?
Try this out:
;With dates as (
Select DateName(Month, getdate()) as [Month],
DatePart(Year, getdate()) as [Year],
1 as Iteration
Union All
Select DateName(Month,DATEADD(MONTH, -Iteration, getdate())),
DatePart(Year,DATEADD(MONTH, -Iteration, getdate())),
Iteration + 1
from dates
where Iteration < 12
)
SELECT DISTINCT
d.Year,
d.Month as [MonthName],
COUNT(up.UserID)as totalReg
FROM dates d
LEFT OUTER JOIN UserProfile up ON d.Month = DateName(DateRegistered)
And d.Year = DatePart(Year, DateRegistered)
GROUP BY d.Year, d.Month
ORDER BY d.Year, d.Month
Here's my attempt at a solution:
declare #UserProfile table
(
id bigint not null identity(1,1) primary key clustered
, name nvarchar(32) not null
, dateRegistered datetime not null default(getutcdate())
)
insert #UserProfile
select 'person 1', '2011-01-23'
union select 'person 2', '2013-01-01'
union select 'person 3', '2013-05-27'
declare #yearMin int, #yearMax int
select #yearMin = year(MIN(dateRegistered))
, #yearMax= year(MAX(dateRegistered))
from #UserProfile
;with monthCte as
(
select 1 monthNo, DATENAME(month, '1900-01-01') Name
union all
select monthNo + 1, DATENAME(month, dateadd(month,monthNo,'1900-01-01'))
from monthCte
where monthNo < 12
)
, yearCte as
(
select #yearMin yearNo
union all
select yearNo + 1
from yearCte
where yearNo < #yearMax
)
select y.yearNo, m.Name, COUNT(up.id) UsersRegisteredThisPeriod
from yearCte y
cross join monthCte m
left outer join #UserProfile up
on year(up.dateRegistered) = y.yearNo
and month(up.dateRegistered) = m.monthNo
group by y.yearNo, m.monthNo, m.Name
order by y.yearNo, m.monthNo
SQL Fiddle Version: http://sqlfiddle.com/#!6/d41d8/6640
You have to calculate the counts in a Derived Table (or a CTE) first and then join
untested:
SELECT
COALESCE(dt.Years, YEAR(DATEADD(MONTH, -Months.MonthID, GETDATE()))),
Months.[MonthName],
COALESCE(dt.totalReg, 0)
FROM
Months WITH(NOLOCK)
LEFT OUTER JOIN
(
SELECT
YEAR(DateRegistered) AS Years,
MONTH(DateRegistered) AS Mon,
COUNT(UserID)AS totalReg
FROM UserProfile WITH(NOLOCK)
WHERE DateRegistered > DATEADD(MONTH, -12,GETDATE())
GROUP BY
YEAR(DateRegistered),
MONTH(DateRegistered)
) AS dt
ON Months.MonthID = dt.mon
ORDER BY 1, Months.MonthID
I changed the order to Months.MonthID instead of MonthName and i added year because you might have august 2012 and 2013 in your result.

How to count open records, grouped by hour and day in SQL-server-2008-r2

I have hospital patient admission data in Microsoft SQL Server r2 that looks something like this:
PatientID, AdmitDate, DischargeDate
Jones. 1-jan-13 01:37. 1-jan-13 17:45
Smith 1-jan-13 02:12. 2-jan-13 02:14
Brooks. 4-jan-13 13:54. 5-jan-13 06:14
I would like count the number of patients in the hospital day by day and hour by hour (ie at
1-jan-13 00:00. 0
1-jan-13 01:00. 0
1-jan-13 02:00. 1
1-jan-13 03:00. 2
And I need to include the hours when there are no patients admitted in the result.
I can't create tables so making a reference table listing all the hours and days is out, though.
Any suggestions?
To solve this problem, you need a list of date-hours. The following gets this from the admit date cross joined to a table with 24 hours. The table of 24 hours is calculating from information_schema.columns -- a trick for getting small sequences of numbers in SQL Server.
The rest is just a join between this table and the hours. This version counts the patients at the hour, so someone admitted and discharged in the same hour, for instance is not counted. And in general someone is not counted until the next hour after they are admitted:
with dh as (
select DATEADD(hour, seqnum - 1, thedatehour ) as DateHour
from (select distinct cast(cast(AdmitDate as DATE) as datetime) as thedatehour
from Admission a
) a cross join
(select ROW_NUMBER() over (order by (select NULL)) as seqnum
from INFORMATION_SCHEMA.COLUMNS
) hours
where hours <= 24
)
select dh.DateHour, COUNT(*) as NumPatients
from dh join
Admissions a
on dh.DateHour between a.AdmitDate and a.DischargeDate
group by dh.DateHour
order by 1
This also assumes that there are admissions on every day. That seems like a reasonable assumption. If not, a calendar table would be a big help.
Here is one (ugly) way:
;WITH DayHours AS
(
SELECT 0 DayHour
UNION ALL
SELECT DayHour+1
FROM DayHours
WHERE DayHour+1 <= 23
)
SELECT B.AdmitDate, A.DayHour, COUNT(DISTINCT PatientID) Patients
FROM DayHours A
CROSS JOIN (SELECT DISTINCT CONVERT(DATE,AdmitDate) AdmitDate
FROM YourTable) B
LEFT JOIN YourTable C
ON B.AdmitDate = CONVERT(DATE,C.AdmitDate)
AND A.DayHour = DATEPART(HOUR,C.AdmitDate)
GROUP BY B.AdmitDate, A.DayHour
This is a bit messy and includes a temp table with the test data you provided but
CREATE TABLE #HospitalPatientData (PatientId NVARCHAR(MAX), AdmitDate DATETIME, DischargeDate DATETIME)
INSERT INTO #HospitalPatientData
SELECT 'Jones.', '1-jan-13 01:37:00.000', '1-jan-13 17:45:00.000' UNION
SELECT 'Smith', '1-jan-13 02:12:00.000', '2-jan-13 02:14:00.000' UNION
SELECT 'Brooks.', '4-jan-13 13:54:00.000', '5-jan-13 06:14:00.000'
;WITH DayHours AS
(
SELECT 0 DayHour
UNION ALL
SELECT DayHour+1
FROM DayHours
WHERE DayHour+1 <= 23
),
HospitalPatientData AS
(
SELECT CONVERT(nvarchar(max),AdmitDate,103) as AdmitDate ,DATEPART(hour,(AdmitDate)) as AdmitHour, COUNT(PatientID) as CountOfPatients
FROM #HospitalPatientData
GROUP BY CONVERT(nvarchar(max),AdmitDate,103), DATEPART(hour,(AdmitDate))
),
Results AS
(
SELECT MAX(h.AdmitDate) as Date, d.DayHour
FROM HospitalPatientData h
INNER JOIN DayHours d ON d.DayHour=d.DayHour
GROUP BY AdmitDate, CountOfPatients, DayHour
)
SELECT r.*, COUNT(h.PatientId) as CountOfPatients
FROM Results r
LEFT JOIN #HospitalPatientData h ON CONVERT(nvarchar(max),AdmitDate,103)=r.Date AND DATEPART(HOUR,h.AdmitDate)=r.DayHour
GROUP BY r.Date, r.DayHour
ORDER BY r.Date, r.DayHour
DROP TABLE #HospitalPatientData
This may get you started:
BEGIN TRAN
DECLARE #pt TABLE
(
PatientID VARCHAR(10)
, AdmitDate DATETIME
, DischargeDate DATETIME
)
INSERT INTO #pt
( PatientID, AdmitDate, DischargeDate )
VALUES ( 'Jones', '1-jan-13 01:37', '1-jan-13 17:45' ),
( 'Smith', '1-jan-13 02:12', '2-jan-13 02:14' )
, ( 'Brooks', '4-jan-13 13:54', '5-jan-13 06:14' )
DECLARE #StartDate DATETIME = '20130101'
, #FutureDays INT = 7
;
WITH dy
AS ( SELECT TOP (#FutureDays)
ROW_NUMBER() OVER ( ORDER BY name ) dy
FROM sys.columns c
) ,
hr
AS ( SELECT TOP 24
ROW_NUMBER() OVER ( ORDER BY name ) hr
FROM sys.columns c
)
SELECT refDate, COUNT(p.PatientID) AS PtCount
FROM ( SELECT DATEADD(HOUR, hr.hr - 1,
DATEADD(DAY, dy.dy - 1, #StartDate)) AS refDate
FROM dy
CROSS JOIN hr
) ref
LEFT JOIN #pt p ON ref.refDate BETWEEN p.AdmitDate AND p.DischargeDate
GROUP BY refDate
ORDER BY refDate
ROLLBACK

How to merge time intervals in SQL Server

Suppose I have the following an event table with personId, startDate and endDate.
I want to know how much time the person X spent doing an event (the events can override each other).
If the person just has 1 event, its easy: datediff(dd, startDate, endDate)
If the person has 2 events it gets tricky.
I'll set some scenarios for the expected results.
Scenario 1
startDate endDate
1 4
3 5
This means he the results should be the datediff from 1 to 5
Scenario 2
startDate endDate
1 3
6 9
this means he the results should be the some of datediff(dd,1,3) and datediff(dd,6,9)
How can I get this result on an sql query? I can only think of a bunch of if statements, but the same person can have n events so the query will be really confusing.
Shredder Edit: I'd like to add a 3rd scenario:
startDate endDate
1 5
4 8
11 15
Desired result to Shredder scenario:
(1,5) and (4,8) merge in (1,8) since they overlap then we need to datediff(1,8) + datediff(11,15) => 7 + 4 => 11
You can use a recursive CTE to build a list of dates and then count the distinct dates.
declare #T table
(
startDate date,
endDate date
);
insert into #T values
('2011-01-01', '2011-01-05'),
('2011-01-04', '2011-01-08'),
('2011-01-11', '2011-01-15');
with C as
(
select startDate,
endDate
from #T
union all
select dateadd(day, 1, startDate),
endDate
from C
where dateadd(day, 1, startDate) < endDate
)
select count(distinct startDate) as DayCount
from C
option (MAXRECURSION 0)
Result:
DayCount
-----------
11
Or you can use a numbers table. Here I use master..spt_values:
declare #MinStartDate date
select #MinStartDate = min(startDate)
from #T
select count(distinct N.number)
from #T as T
inner join master..spt_values as N
on dateadd(day, N.Number, #MinStartDate) between T.startDate and dateadd(day, -1, T.endDate)
where N.type = 'P'
Here's a solution that uses the Tally table idea (which I first heard of in an article by Itzk Ben-Gan -- I still cut and paste his code whenver the subject comes up). The idea is to generate a list of ascending integers, join the source data by range against the numbers, and then count the number of distinct numbers, as follows. (This code uses syntax from SQL Server 2008, but with minor modifications would work in SQL 2005.)
First set up some testing data:
CREATE TABLE #EventTable
(
PersonId int not null
,startDate datetime not null
,endDate datetime not null
)
INSERT #EventTable
values (1, 'Jan 1, 2011', 'Jan 4, 2011')
,(1, 'Jan 3, 2011', 'Jan 5, 2011')
,(2, 'Jan 1, 2011', 'Jan 3, 2011')
,(2, 'Jan 6, 2011', 'Jan 9, 2011')
Determine some initial values
DECLARE
#Interval bigint
,#FirstDay datetime
,#PersonId int = 1 -- (or whatever)
Get the first day and the maximum possible number of dates (to keep the cte from generating extra values):
SELECT
#Interval = datediff(dd, min(startDate), max(endDate)) + 1
,#FirstDay = min(startDate)
from #EventTable
where PersonId = #PersonId
Cut and paste over the one routine and modify and test it to only return as many integers as we'll need:
/*
;WITH
Pass0 as (select 1 as C union all select 1), --2 rows
Pass1 as (select 1 as C from Pass0 as A, Pass0 as B),--4 rows
Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows
Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows
Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows
Pass5 as (select 1 as C from Pass4 as A, Pass4 as B),--4,294,967,296 rows
Tally as (select row_number() over(order by C) as Number from Pass5)
select Number from Tally where Number <= #Interval
*/
And now revise it by first joining to the intervals defined in each source row, and then count each distinct value found:
;WITH
Pass0 as (select 1 as C union all select 1), --2 rows
Pass1 as (select 1 as C from Pass0 as A, Pass0 as B),--4 rows
Pass2 as (select 1 as C from Pass1 as A, Pass1 as B),--16 rows
Pass3 as (select 1 as C from Pass2 as A, Pass2 as B),--256 rows
Pass4 as (select 1 as C from Pass3 as A, Pass3 as B),--65536 rows
Pass5 as (select 1 as C from Pass4 as A, Pass4 as B),--4,294,967,296 rows
Tally as (select row_number() over(order by C) as Number from Pass5)
SELECT PersonId, count(distinct Number) EventDays
from #EventTable et
inner join Tally
on dateadd(dd, Tally.Number - 1, #FirstDay) between et.startDate and et.endDate
where et.PersonId = #PersonId
and Number <= #Interval
group by PersonId
Take out the #PersonId filter and you'd get it for all persons. And with minor modification you can do it for any time interval, not just days (which is why I set the Tally table to generate severely large numbers.)
The following SQL is for the three scenarios you've described
with sampleData
AS (
SELECT 1 personid,1 startDate,4 endDate
UNION SELECT 1,3,5
UNION SELECT 2,1,3
UNION SELECT 2,6,9
UNION SELECT 3,1,5
UNION SELECT 3,4,8
UNION SELECT 3,11, 15
),
cte
AS (SELECT personid,
startdate,
enddate,
Row_number() OVER(ORDER BY personid, startdate) AS rn
FROM sampledata),
overlaps
AS (SELECT a.personid,
a.startdate,
b.enddate,
a.rn id1,
b.rn id2
FROM cte a
INNER JOIN cte b
ON a.personid = b.personid
AND a.enddate > b.startdate
AND a.rn = b.rn - 1),
nooverlaps
AS (SELECT a.personid,
a.startdate,
a.enddate
FROM cte a
LEFT JOIN overlaps b
ON a.rn = b.id1
OR a.rn = b.id2
WHERE b.id1 IS NULL)
SELECT personid,
SUM(timespent) timespent
FROM (SELECT personid,
enddate - startdate timespent
FROM nooverlaps
UNION
SELECT personid,
enddate - startdate
FROM overlaps) t
GROUP BY personid
Produces this result
Personid timeSpent
----------- -----------
1 4
2 5
3 11
Notes: I used the simple integers but the DateDiffs should work too
Correctness issue There is a correctness issue if your data is allowed to have multiple overlaps as Cheran S noted, the results won't be correct and you should use one of the other answers instead. His example used [1,5],[4,8],[7,11] for the same person ID
Algebra. If B-n is the ending time of the nth event, and A-n is the starting time of the nth event, then the sum of the differences is the difference of the sums. So you can write
select everything else, sum(cast(endDate as int)) - sum(cast(startDate as int)) as daysSpent
If your dates have no time component, this works. Otherwise, you could use a real.
Try something like this
select
personId,
sum(DateDuration) as TotalDuration
from
(
select personId, datediff(dd, startDate, endDate) as DateDuration
from yourEventTable
) a
group by personId
;WITH cte(gap)
AS
(
SELECT sum(b-a) from xxx GROUP BY uid
)
SELECT * FROM cte
Edit 1: I have modified both solutions to get correct results.
Edit 2: I have done comparative tests using the solutions proposed by Mikael Eriksson, Conrad Frix, Philip Kelley and me. All tests use an EventTable with the following structure:
CREATE TABLE EventTable
(
EventID INT IDENTITY PRIMARY KEY
,PersonId INT NOT NULL
,StartDate DATETIME NOT NULL
,EndDate DATETIME NOT NULL
,CONSTRAINT CK_StartDate_Before_EndDate CHECK(StartDate < EndDate)
);
Also, all tests use warm buffer (no DBCC DROPCLEANBUFFERS) and cold [plan] cache (I have executed DBCC FREEPROCCACHE before every test). Because some solutions use a filter(PersonId = 1) and others not, I have inserted into EventTable rows for only one person (INSERT ...(PersonId,...) VALUES (1,...)).
These are the results:
My solutions use recursive CTEs.
Solution 1:
WITH BaseCTE
AS
(
SELECT e.StartDate
,e.EndDate
,e.PersonId
,ROW_NUMBER() OVER(PARTITION BY e.PersonId ORDER BY e.StartDate, e.EndDate) RowNumber
FROM EventTable e
), RecursiveCTE
AS
(
SELECT b.PersonId
,b.RowNumber
,b.StartDate
,b.EndDate
,b.EndDate AS MaxEndDate
,1 AS PseudoDenseRank
FROM BaseCTE b
WHERE b.RowNumber = 1
UNION ALL
SELECT crt.PersonId
,crt.RowNumber
,crt.StartDate
,crt.EndDate
,CASE WHEN crt.EndDate > prev.MaxEndDate THEN crt.EndDate ELSE prev.MaxEndDate END
,CASE WHEN crt.StartDate <= prev.MaxEndDate THEN prev.PseudoDenseRank ELSE prev.PseudoDenseRank + 1 END
FROM RecursiveCTE prev
INNER JOIN BaseCTE crt ON prev.PersonId = crt.PersonId
AND prev.RowNumber + 1 = crt.RowNumber
), SumDaysPerPersonAndInterval
AS
(
SELECT src.PersonId
,src.PseudoDenseRank --Interval ID
,DATEDIFF(DAY, MIN(src.StartDate), MAX(src.EndDate)) Days
FROM RecursiveCTE src
GROUP BY src.PersonId, src.PseudoDenseRank
)
SELECT x.PersonId, SUM( x.Days ) DaysPerPerson
FROM SumDaysPerPersonAndInterval x
GROUP BY x.PersonId
OPTION(MAXRECURSION 32767);
Solution 2:
DECLARE #Base TABLE --or a temporary table: CREATE TABLE #Base (...)
(
PersonID INT NOT NULL
,StartDate DATETIME NOT NULL
,EndDate DATETIME NOT NULL
,RowNumber INT NOT NULL
,PRIMARY KEY(PersonID, RowNumber)
);
INSERT #Base (PersonID, StartDate, EndDate, RowNumber)
SELECT e.PersonId
,e.StartDate
,e.EndDate
,ROW_NUMBER() OVER(PARTITION BY e.PersonID ORDER BY e.StartDate, e.EndDate) RowNumber
FROM EventTable e;
WITH RecursiveCTE
AS
(
SELECT b.PersonId
,b.RowNumber
,b.StartDate
,b.EndDate
,b.EndDate AS MaxEndDate
,1 AS PseudoDenseRank
FROM #Base b
WHERE b.RowNumber = 1
UNION ALL
SELECT crt.PersonId
,crt.RowNumber
,crt.StartDate
,crt.EndDate
,CASE WHEN crt.EndDate > prev.MaxEndDate THEN crt.EndDate ELSE prev.MaxEndDate END
,CASE WHEN crt.StartDate <= prev.MaxEndDate THEN prev.PseudoDenseRank ELSE prev.PseudoDenseRank + 1 END
FROM RecursiveCTE prev
INNER JOIN #Base crt ON prev.PersonId = crt.PersonId
AND prev.RowNumber + 1 = crt.RowNumber
), SumDaysPerPersonAndInterval
AS
(
SELECT src.PersonId
,src.PseudoDenseRank --Interval ID
,DATEDIFF(DAY, MIN(src.StartDate), MAX(src.EndDate)) Days
FROM RecursiveCTE src
GROUP BY src.PersonId, src.PseudoDenseRank
)
SELECT x.PersonId, SUM( x.Days ) DaysPerPerson
FROM SumDaysPerPersonAndInterval x
GROUP BY x.PersonId
OPTION(MAXRECURSION 32767);