I have tried browsing the questions & answers in this forum, but none of them fits my case sufficiently.
I have some people reporting their status for 2 categories, which looks like this:
TimeStamp | PersonID | Category | Value
2015-07-02 01:25:00 | 2303 | CatA | 8.2
2015-07-02 01:25:00 | 2303 | CatB | 10.1
2015-07-02 03:35:00 | 2303 | CatA | 8.0
2015-07-02 03:35:00 | 2303 | CatB | 9.9
2015-07-02 02:30:00 | 4307 | CatA | 8.7
2015-07-02 02:30:00 | 4307 | CatB | 12.7
.
.
.
2015-07-31 22:15:00 | 9011 | CatA | 7.9
2015-07-31 22:15:00 | 9011 | CatB | 8.9
Some people report status several times per hour, but others only a couple of times per day.
I need to produce an output which shows the latest known status for each day, for each hour of the day, for each person and category. It should look like this:
Date |Hour| Person | Category | Value
2015-07-02 | 1 | 2303 | CatA | Null
2015-07-02 | 1 | 2303 | CatB | Null
2015-07-02 | 2 | 2303 | CatA | 8.2
2015-07-02 | 2 | 2303 | CatB | 10.1
2015-07-02 | 3 | 2303 | CatA | 8.2
2015-07-02 | 3 | 2303 | CatB | 10.1
2015-07-02 | 4 | 2303 | CatA | 8.0
2015-07-02 | 4 | 2303 | CatB | 9.9
.
.
.
2015-07-31 | 23 | 9011 | CatA | 7.9
2015-07-31 | 23 | 9011 | CatB | 8.9
The first row(s) for each person and category will probably be null, as there are no known values yet; this is the "beginning of time".
I have tried using a sub query like this:
SELECT Date
,hour
,Person
,Category
,(SELECT TOP 1 status FROM readings WHERE (readings.Date<=structure.Date) AND readings.Hour<=structure.hour)....and so forth.... order by TimeStamp DESC
FROM structure
This works - except in terms of performance: I need to do this for a month, for 2,000 persons and 2 categories, which means the sub query must run (30*24*2000*2 = 2,880,000) times. Given that the table containing the readings also holds hundreds of thousands of rows, this doesn't work.
I have also tried messing around with row_number(), but have not succeeded with it.
Any suggestions?
Edit (19-10-2015 15:34): In my query example above I am referring to a "structure" table. This is actually just (for the time being) a view, with the following SQL:
SELECT Calendar.CalendarDay, Hours.Hour, Persons.Person, Categories.Category
FROM Calendar CROSS JOIN Hours CROSS JOIN Persons CROSS JOIN Categories
This is in order to produce a table containing a row for each day, for each hour, for each person and each category. This table then contains (30*24*2000*2 = 2,880,000) rows.
For each of these rows, I need to locate the latest status from the readings table. So for each Day, for each hour, for each person and each category I need to read the latest available status from the readings table.
Let me guess.
Based on the task "to produce an output which shows the latest known status for each day, for each hour of the day, for each person and category" you need to take three steps:
(1) Find latest records for every hour;
(2) Get a table of all date and hours to show;
(3) Multiply that date-hours-table by persons and categories and left join the result with latest-records-for-every-hour.
-- Test data
declare @t table ([Timestamp] datetime2(0), PersonId int, Category varchar(4), Value decimal(3,1));
insert into @t values
('2015-07-02 01:25:00', 2303, 'CatA', 8.2 ),
('2015-07-02 01:45:00', 2303, 'CatA', 9.9 ),
('2015-07-02 01:25:00', 2303, 'CatB', 10.1 ),
('2015-07-02 03:35:00', 2303, 'CatA', 8.0 ),
('2015-07-02 03:35:00', 2303, 'CatB', 9.9 ),
('2015-07-02 02:30:00', 4307, 'CatA', 8.7 ),
('2015-07-02 02:30:00', 4307, 'CatB', 12.7 );
-- Latest records for every hour
declare @Latest table (
[Date] date,
[Hour] tinyint,
PersonId int,
Category varchar(4),
Value decimal(3,1)
primary key ([Date], [Hour], PersonId, Category)
);
insert into @Latest
select top 1 with ties
[Date] = cast([Timestamp] as date),
[Hour] = datepart(hour, [Timestamp]),
PersonId ,
Category ,
Value
from
@t
order by
row_number() over(partition by cast([Timestamp] as date), datepart(hour, [Timestamp]), PersonId, Category order by [Timestamp] desc);
-- Date-hours table
declare @FromDateTime datetime2(0);
declare @ToDateTime datetime2(0);
select @FromDateTime = min([Timestamp]), @ToDateTime = max([Timestamp]) from @t;
declare @DateDiff int = datediff(day, @FromDateTime, @ToDateTime);
declare @FromDate date = cast(@FromDateTime as date);
declare @FromHour int = datepart(hour, @FromDateTime);
declare @ToHour int = datepart(hour, @ToDateTime);
declare @DayHours table ([Date] date, [Hour] tinyint, primary key clustered ([Date], [Hour]) );
with N as
(
select n from (values (1),(2),(3),(4),(5),(6),(7),(8),(9),(10)) t(n)
),
D as (
select
row_number() over(order by (select 1))-1 as d
from
N n1, N n2, N n3
),
H as (
select top 24
row_number() over(order by (select 1)) - 1 as h
from
N n1, N n2
)
insert into @DayHours
select dateadd(day, d, @FromDate), h
from
D, H
where
@FromHour <= (d * 100 + h)
and (d * 100 + h) <= (@DateDiff * 100 + @ToHour);
-- @PersonsIds & @Categories tables (just an imitation of the real tables)
declare @PersonsIds table (Id int primary key);
declare @Categories table (Category varchar(4) primary key);
insert into @PersonsIds select distinct PersonId from @t;
insert into @Categories select distinct Category from @t;
-- The result
select
dh.[Date],
dh.[Hour],
PersonId = p.Id,
c.Category,
l.Value
from
@PersonsIds p cross join @Categories c cross join @DayHours dh
left join @Latest l on l.[Date] = dh.[Date] and l.[Hour] = dh.[Hour] and l.PersonId = p.Id and l.Category = c.Category
order by
[Date], [Hour], PersonId, Category;
Edit (1):
OK.
In order to carry the previous values over into the empty slots,
let's replace the last select statement with this one:
select top 1 with ties
dh.[Date],
dh.[Hour],
PersonId = p.Id,
c.Category,
l.Value
from
@PersonsIds p cross join @Categories c cross join @DayHours dh
left join @Latest l
on (l.[Date] = dh.[Date] and l.[Hour] <= dh.[Hour] or l.[Date] < dh.[Date])
and l.PersonId = p.Id and l.Category = c.Category
order by
row_number()
over (partition by dh.[Date], dh.[Hour], p.Id, c.Category
order by l.[Date] desc, l.[Hour] desc);
Edit (2):
Let's try collecting the Cartesian product in a temporary table with a clustered index on PersonId, Category, [Date], [Hour],
and then update that table, dragging the unchanged values forward:
declare @Result table (
[Date] date,
[Hour] tinyint,
PersonId int,
Category varchar(4),
Value decimal(3,1)
primary key (PersonId, Category, [Date], [Hour]) -- Important !!!
)
insert into @Result
select
dh.[Date],
dh.[Hour],
PersonId = p.Id,
c.Category,
l.Value
from
@PersonsIds p cross join @Categories c cross join @DayHours dh
left join @Latest l on l.[Date] = dh.[Date] and l.[Hour] = dh.[Hour] and l.PersonId = p.Id and l.Category = c.Category
order by
[Date], [Hour], PersonId, Category;
declare @PersonId int;
declare @Category varchar(4);
declare @Value decimal(3,1);
update @Result set
@Value = Value = isnull(Value, case when @PersonId = PersonId and @Category = Category then @Value end),
@PersonId = PersonId,
@Category = Category;
For yet better performance, consider replacing the table variables with temporary tables and applying indexes in accordance with query plan recommendations.
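For instance, a temp-table version of @Result might look like the sketch below (the nonclustered index is only an assumption about what the plan might recommend; adjust it to your actual plan):
-- A minimal sketch assuming the same columns as @Result above;
-- temp tables (unlike table variables) get statistics and can carry extra indexes.
create table #Result (
[Date] date not null,
[Hour] tinyint not null,
PersonId int not null,
Category varchar(4) not null,
Value decimal(3,1) null,
primary key clustered (PersonId, Category, [Date], [Hour])
);
-- Hypothetical supporting index for the left join on date/hour:
create nonclustered index IX_Result_Date_Hour on #Result ([Date], [Hour]) include (Value);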
If I got it correctly, it should give you the desired result.
select st.Date,
case when hour =1 then NULL
else hour
end as hour
,st.Person,st.Category,
(select status from reading qualify row_number() over (partition by personid
order by status desc)=1)
from structure st;
You can achieve this in SQL, but it will be quite slow, because for every person, category, day and hour you will have to look for the latest entry for the person and category until then. Just think of the process: pick a record in your big table, find all statuses until then, order them, find the latest, and pick its value. And this will be done for every record in your big table.
You might be better off simply retrieving all the data with a program written in a programming language and collecting the data with a control-break algorithm.
However, let's see how it's done in SQL.
One problem is SQL Server's poor date/time functions. We want to compare date plus hour, which would be easiest with strings in 'yyyymmddhh' format, e.g. '2015101923' < '2015102001'. In your big table you have date and hour and in your status table you have datetimes. Let's see how we can get the desired strings:
convert(varchar(8), bigtable.calendarday, 112) +
right('0' + convert(varchar(2), bigtable.hour), 2)
and
convert(varchar(8), status.timestamp, 112) +
right('0' + convert(varchar(2), datepart(hour, status.timestamp)), 2)
As this is - along with person and category - our key criterion to find records, you may want to have it as computed columns and add indexes (person + category + dayhourkey) in both tables.
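This could look roughly like the sketch below (names are assumptions; I'm using the status table and its timestamp/personid/category/value columns from the query further down):
-- Hypothetical computed column plus index on the status (readings) table:
alter table status add dayhourkey as
convert(varchar(8), [timestamp], 112) +
right('0' + convert(varchar(2), datepart(hour, [timestamp])), 2)
persisted;
create index ix_status_person_category_dayhourkey
on status (personid, category, dayhourkey) include (value);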
You'd select from your big table and get the status value in a subquery. In order to get the latest matching record, you'd order by timestamp and limit to 1 record.
select
personid,
calendarday,
hour,
category,
(
select top 1 value
from status s
where s.personid = b.personid
and s.category = b.category
and convert(varchar(8), s.timestamp, 112) + right('0' + convert(varchar(2), datepart(hour, s.timestamp)), 2) <=
convert(varchar(8), b.calendarday, 112) + right('0' + convert(varchar(2), b.hour), 2)
order by s.timestamp desc
) as value
from bigtable b;
Related
Hi, I am trying to run a query to return a row for each month between 2 dates for each project that I have. See example data:
Project Start End
1 1/1/2015 3/1/2015
2 2/1/2015 4/1/2015
End Data needed:
Project Month
1 1/1/2015
1 2/1/2015
1 3/1/2015
2 2/1/2015
2 3/1/2015
2 4/1/2015
I have several projects and will need a query to do this for all of them at the same time. How can I do this in SQL Server?
Another option is a CROSS APPLY with an ad-hoc tally table
Select A.Project
,Month = B.D
From YourTable A
Cross Apply (
Select Top (DateDiff(MONTH,A.Start,A.[End])+1) D=DateAdd(Month,-1+Row_Number() Over (Order By(Select null)),A.Start)
From master..spt_values
) B
Returns
Project Month
1 2015-01-01
1 2015-02-01
1 2015-03-01
2 2015-02-01
2 2015-03-01
2 2015-04-01
This is simple if you have or create a table for Months:
create table dbo.Months([Month] date primary key);
declare #StartDate date = '20100101'
,#NumberOfYears int = 30;
insert dbo.Months([Month])
select top (12*#NumberOfYears)
[Month] = dateadd(month, row_number() over (order by number) -1, #StartDate)
from master.dbo.spt_values;
If you really do not want to have a Months table, you can use a cte like this:
declare #StartDate date = '20100101'
,#NumberOfYears int = 10;
;with Months as (
select top (12*#NumberOfYears)
[Month] = dateadd(month, row_number() over (order by number) -1, #StartDate)
from master.dbo.spt_values
)
Then query it like so (if you use the cte version, this select must immediately follow the cte definition, with Months in place of dbo.Months):
select
t.Project
, m.Month
from t
inner join dbo.Months m
on m.Month >= t.Start
and m.Month <= t.[End]
rextester demo: http://rextester.com/SXPX26360
returns:
+---------+------------+
| Project | Month |
+---------+------------+
| 1 | 2015-01-01 |
| 1 | 2015-02-01 |
| 1 | 2015-03-01 |
| 2 | 2015-02-01 |
| 2 | 2015-03-01 |
| 2 | 2015-04-01 |
+---------+------------+
calendar and numbers tables reference:
Generate a set or sequence without loops (part 2) - Aaron Bertrand
Creating a Date Table/Dimension in SQL Server 2008 - David Stein
Calendar Tables - Why You Need One - David Stein
Creating a date dimension or calendar table in SQL Server - Aaron Bertrand
TSQL Function to Determine Holidays in SQL Server - Tim Cullen
F_TABLE_DATE - Michael Valentine Jones
I personally like a tally table for this kind of thing. It is the Swiss Army knife of T-SQL.
I create a view on my system for this. If you don't want to create a view, you can easily use these CTEs any time you need a tally table.
create View [dbo].[cteTally] as
WITH
E1(N) AS (select 1 from (values (1),(1),(1),(1),(1),(1),(1),(1),(1),(1))dt(n)),
E2(N) AS (SELECT 1 FROM E1 a, E1 b), --10E+2 or 100 rows
E4(N) AS (SELECT 1 FROM E2 a, E2 b), --10E+4 or 10,000 rows max
cteTally(N) AS
(
SELECT ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E4
)
select N from cteTally
GO
Now we just need your sample data in a table.
create table #Projects
(
Project int
, Start datetime
, EndDate datetime
)
insert #Projects
select 1, '1/1/2015', '3/1/2015' union all
select 2, '2/1/2015', '4/1/2015'
At this point we get to the real issue, which is retrieving your information. With the sample data and the view, this becomes pretty simple.
select p.*
, NewMonth = DATEADD(MONTH, t.N - 1, p.Start)
from #Projects p
join cteTally t on t.N <= DATEDIFF(MONTH, p.Start, p.EndDate) + 1
order by p.Project
, t.N
Generate a time series (take help from the link), then join on time using BETWEEN:
SELECT --something
FROM table1 a
/*type of*/ JOIN table2 b ON b.field2 BETWEEN a.field2 AND a.field3
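A concrete version of that pattern against the question's sample columns (a sketch; I'm assuming the table is called Projects, and the month boundaries are hard-coded for illustration):
;with Months as (
select cast('20150101' as date) as [Month]
union all
select dateadd(month, 1, [Month]) from Months where [Month] < '20151201'
)
select p.Project, m.[Month]
from Projects p
join Months m on m.[Month] between p.Start and p.[End]
order by p.Project, m.[Month];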
Assume that I have a table (MyTable) as follows:
item_id | date
----------------
1 | 2016-06-08
1 | 2016-06-07
1 | 2016-06-05
1 | 2016-06-04
1 | 2016-05-31
...
2 | 2016-06-08
2 | 2016-06-06
2 | 2016-06-04
2 | 2016-05-31
...
3 | 2016-05-31
...
I would like to build a weekly summary table that reports on a running 7 day window. The window would basically say "How many unique item_ids were reported in the preceding 7 days"?
So, in this case, the output table would look something like:
date | weekly_ids
----------------------
2016-05-31| 3 # All 3 were present on the 31st
2016-06-01| 3 # All 3 were present on the 31st which is < 7 days before the 1st
2016-06-02| 3 # Same
2016-06-03| 3 # Same
2016-06-04| 3 # Same
2016-06-05| 3 # Same
2016-06-06| 3 # Same
2016-06-07| 3 # Same
2016-06-08| 2 # item 3 was not present for the entire last week so it does not add to the count.
I've tried:
SELECT
item_id,
date,
MAX(present) OVER (
PARTITION BY item_id
ORDER BY date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS is_present
FROM (
# Inner query
SELECT
item_id,
date,
1 AS present,
FROM MyTable
)
GROUP BY date
ORDER BY date DESC
This feels like it is going in the right direction. But as it is, the window runs over the wrong time-frame when dates aren't present (too many dates) and it also doesn't output records for dates when the item_id wasn't present (even if it was present on the previous date). Is there a simple resolution to this problem?
If it's helpful and necessary:
I can hard-code an oldest date
I also can get a table of all of the item_ids in existence.
This query will only be run on BigQuery, so BQ-specific functions/syntax are fair game, and SQL functions/syntax that don't run on BigQuery unfortunately don't help me ...
I have created a temp table to hold dates; however, you would probably benefit from adding a permanent table to your database for these joins. Trust me, it will cause fewer headaches.
DECLARE @my_table TABLE
(
item_id int,
date DATETIME
)
INSERT @my_table SELECT 1,'2016-06-08'
INSERT @my_table SELECT 1,'2016-06-07'
INSERT @my_table SELECT 1,'2016-06-05'
INSERT @my_table SELECT 1,'2016-06-04'
INSERT @my_table SELECT 1,'2016-05-31'
INSERT @my_table SELECT 2,'2016-06-08'
INSERT @my_table SELECT 2,'2016-06-06'
INSERT @my_table SELECT 2,'2016-06-04'
INSERT @my_table SELECT 2,'2016-05-31'
INSERT @my_table SELECT 3,'2016-05-31'
DECLARE @TrailingDays INT=7
DECLARE @LowDate DATETIME='01/01/2016'
DECLARE @HighDate DATETIME='12/31/2016'
DECLARE @Calendar TABLE(CalendarDate DATETIME)
DECLARE @LoopDate DATETIME=@LowDate
WHILE(@LoopDate<=@HighDate) BEGIN
INSERT @Calendar SELECT @LoopDate
SET @LoopDate=DATEADD(DAY,1,@LoopDate)
END
SELECT
date=HighDate,
weekly_ids=COUNT(DISTINCT item_id)
FROM
(
SELECT
HighDate=C.CalendarDate,
LowDate=LAG(C.CalendarDate, @TrailingDays,0) OVER (ORDER BY C.CalendarDate)
FROM
@Calendar C
WHERE
CalendarDate BETWEEN @LowDate AND @HighDate
)AS X
LEFT OUTER JOIN @my_table MT ON MT.date BETWEEN LowDate AND HighDate
GROUP BY
LowDate,
HighDate
Try the example below. It can give you a direction to explore.
Purely GBQ - Legacy SQL
SELECT date, items FROM (
SELECT
date, COUNT(DISTINCT item_id) OVER(ORDER BY sec RANGE BETWEEN 60*60*24*2 PRECEDING AND CURRENT ROW) AS items
FROM (
SELECT
item_id, date, timestamp_to_sec(timestamp(date)) AS sec
FROM (
SELECT calendar.day AS date, MyTable.item_id AS item_id
FROM (
SELECT DATE(DATE_ADD(TIMESTAMP('2016-05-28'), pos - 1, "DAY")) AS day
FROM (
SELECT ROW_NUMBER() OVER() AS pos, *
FROM (FLATTEN((
SELECT SPLIT(RPAD('', 1 + DATEDIFF(TIMESTAMP(CURRENT_DATE()), TIMESTAMP('2016-05-28')), '.'),'') AS h
FROM (SELECT NULL)),h
)))
) AS calendar
LEFT JOIN (
SELECT date, item_id
FROM
(SELECT 1 AS item_id, '2016-06-08' AS date),
(SELECT 1 AS item_id, '2016-06-07' AS date),
(SELECT 1 AS item_id, '2016-06-05' AS date),
(SELECT 1 AS item_id, '2016-06-04' AS date),
(SELECT 1 AS item_id, '2016-05-28' AS date),
(SELECT 2 AS item_id, '2016-06-08' AS date),
(SELECT 2 AS item_id, '2016-06-06' AS date),
(SELECT 2 AS item_id, '2016-06-04' AS date),
(SELECT 2 AS item_id, '2016-05-31' AS date),
(SELECT 3 AS item_id, '2016-05-31' AS date),
(SELECT 3 AS item_id, '2016-06-05' AS date)
) AS MyTable
ON calendar.day = MyTable.date
)
)
)
GROUP BY date, items
ORDER BY date
Please note:
the oldest date - 2016-05-28 - is hardcoded in the calendar subquery;
the window size is controlled by RANGE BETWEEN 60*60*24*2 PRECEDING AND CURRENT ROW; if you need 7 days, the expression should be 60*60*24*6;
keep in mind the specifics of COUNT(DISTINCT) in BigQuery Legacy SQL (it is a statistical approximation above the default threshold).
I have a table which contains datetime rows like below.
ID | DateTime
1 | 12:00
2 | 12:02
3 | 12:03
4 | 12:04
5 | 12:05
6 | 12:10
I want to identify those rows where there is a 'gap' of 5 minutes between rows (for example, row 5 and 6).
I know that we need to use DATEDIFF, but how can I only get those rows which are consecutive with each other?
You can use the LAG and LEAD window functions for this:
SELECT ID
FROM (
SELECT ID, [DateTime],
DATEDIFF(mi, LAG([DateTime]) OVER (ORDER BY ID), [DateTime]) AS prev_diff,
DATEDIFF(mi, [DateTime], LEAD([DateTime]) OVER (ORDER BY ID)) AS next_diff
FROM mytable) AS t
WHERE prev_diff >= 5 OR next_diff >= 5
Output:
ID
==
5
6
Note: The above query assumes that order is defined by ID field. You can easily substitute this field with any other field that specifies order in your table.
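For example, to use the DateTime column itself as the ordering instead of ID, the same query becomes (a minimal sketch of that substitution):
SELECT ID, [DateTime],
DATEDIFF(mi, LAG([DateTime]) OVER (ORDER BY [DateTime]), [DateTime]) AS prev_diff,
DATEDIFF(mi, [DateTime], LEAD([DateTime]) OVER (ORDER BY [DateTime])) AS next_diff
FROM mytable;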
You might try this (I'm not sure if it's really fast)
SELECT cur.datetime AS current_datetime,
prev.datetime AS previous_datetime,
DATEDIFF(minute, prev.datetime, cur.datetime) AS gap
FROM my_table cur
JOIN my_table prev
ON prev.datetime < cur.datetime
AND NOT EXISTS (SELECT *
FROM my_table others
WHERE others.datetime < cur.datetime
AND others.datetime > prev.datetime);
Update (SQL Server 2012+): use LAG
DECLARE @tbl TABLE(ID INT, T TIME)
INSERT INTO @tbl VALUES
(1,'12:00')
,(2,'12:02')
,(3,'12:03')
,(4,'12:04')
,(5,'12:05')
,(6,'12:10');
WITH TimesWithDifferenceToPrevious AS
(
SELECT ID
,T
,LAG(T) OVER(ORDER BY T) AS prev
,DATEDIFF(MI,LAG(T) OVER(ORDER BY T),T) AS MinuteDiff
FROM @tbl
)
SELECT *
FROM TimesWithDifferenceToPrevious
WHERE ABS(MinuteDiff) >=5
The result:
ID | T | prev | MinuteDiff
6 | 12:10:00.0000000 | 12:05:00.0000000 | 5
I am trying to take given date ranges found in a data set and divide them into unique rows for each day in the range (example below). Doing the opposite in SQL is pretty straightforward, but I am struggling to achieve the desired query output.
Beginning data:
ITEM START_DATE END_DATE
A 1/1/2015 1/5/2015
B 2/5/2015 2/7/2015
Desired query output:
ITEM DATE_COVERED
A 1/1/2015
A 1/2/2015
A 1/3/2015
A 1/4/2015
A 1/5/2015
B 2/5/2015
B 2/6/2015
B 2/7/2015
The fastest way will be to use a tally table:
DECLARE @t TABLE
(
ITEM CHAR(1) ,
START_DATE DATE ,
END_DATE DATE
)
INSERT INTO @t
VALUES ( 'A', '1/1/2015', '1/5/2015' ),
( 'B', '2/5/2015', '2/7/2015' )
;WITH cte AS(SELECT -1 + ROW_NUMBER() OVER(ORDER BY (SELECT NULL)) d FROM
(VALUES(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t1(n) CROSS JOIN
(VALUES(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t2(n) CROSS JOIN
(VALUES(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t3(n) CROSS JOIN
(VALUES(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t4(n))
SELECT t.ITEM, ca.DATE_COVERED FROM @t t
CROSS APPLY(SELECT DATEADD(dd, d, t.START_DATE) AS DATE_COVERED
FROM cte
WHERE DATEADD(dd, d, t.START_DATE) BETWEEN t.START_DATE AND t.END_DATE) ca
ORDER BY t.ITEM, ca.DATE_COVERED
Query:
SQLFiddleExample
SELECT t.ITEM,
DATEADD(day,n.number, t.START_DATE) AS DATE_COVERED
FROM Table1 t,
(SELECT number
FROM master..spt_values
WHERE [type] = 'P') n
WHERE START_DATE <= DATEADD(day,n.number, t.START_DATE)
AND END_DATE >= DATEADD(day,n.number, t.START_DATE)
Result:
| ITEM | DATE_COVERED |
|------|--------------|
| A | 2015-01-01 |
| A | 2015-01-02 |
| A | 2015-01-03 |
| A | 2015-01-04 |
| A | 2015-01-05 |
| B | 2015-02-05 |
| B | 2015-02-06 |
| B | 2015-02-07 |
NOTE: this only works if the difference between your startdate and enddate is a maximum of 2047 days (master..spt_values only allows 0..2047 range of values)
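If your ranges can exceed that, one workaround (just a sketch) is to build a larger number sequence by cross joining spt_values with itself:
SELECT t.ITEM,
DATEADD(day, n.number, t.START_DATE) AS DATE_COVERED
FROM Table1 t
JOIN (SELECT TOP (100000) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) - 1 AS number
FROM master..spt_values a CROSS JOIN master..spt_values b) n
ON n.number <= DATEDIFF(day, t.START_DATE, t.END_DATE)
ORDER BY t.ITEM, DATE_COVERED;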
select item, dateadd(d,v.number,d.start_date) adate
from begindata d
join master..spt_values v on v.type='P'
and v.number between 0 and datediff(d, start_date, end_date)
order by adate;
I'd like to say I did this myself but I got the code from this
Here is a fiddle with your expected result
TRY THIS...
CREATE TABLE Table1
([ITEM] varchar(1), [START_DATE] date, [END_DATE] date)
;
INSERT INTO Table1
([ITEM], [START_DATE], [END_DATE])
VALUES ('A', '2015-01-01', '2015-01-05'), ('B', '2015-02-05', '2015-02-07');
WITH Days
AS ( SELECT ITEM, START_DATE AS [Date], 1 AS [level] from Table1
UNION ALL
SELECT TABLE1.ITEM, DATEADD(DAY, 1, [Date]), [level] + 1
FROM Days,Table1
WHERE DAYS.ITEM=TABLE1.ITEM AND [Date] < END_DATE )
SELECT distinct ITEM, [Date]
FROM Days
DEMO
I'm stuck on a SQL query. I'm using SQL Server.
Given a table that contains jobs with a start and end date; these jobs can span days or months. I need to get the total combined number of days worked each month for all jobs that intersected those months.
Jobs
-----------------------------------
JobId | Start | End | DayRate |
-----------------------------------
1 | 1.1.13 | 2.2.13 | 2500 |
2 | 5.1.13 | 5.2.13 | 2000 |
3 | 3.3.13 | 2.4.13 | 3000 |
The results I need are:
Month | Days
--------------
Jan | 57
Feb | 7
Mar | 28
Apr | 2
Any idea how I would write such a query?
I would also like to work out the SUM for each month, multiplying the day rate by the number of days worked for each job. How would I add this to the results?
Thanks
You can use a recursive CTE to extract all days from start to end for each JobID and then just group by month (and year, I guess).
;WITH CTE_TotalDays AS
(
SELECT [Start] AS DT, JobID FROM dbo.Jobs
UNION ALL
SELECT DATEADD(DD,1,c.DT), c.JobID FROM CTE_TotalDays c
WHERE c.DT < (SELECT [End] FROM Jobs j2 WHERE j2.JobId = c.JobID)
)
SELECT
MONTH(DT) AS [Month]
,YEAR(DT) AS [Year]
,COUNT(*) AS [Days]
FROM CTE_TotalDays
GROUP BY MONTH(DT),YEAR(DT)
OPTION (MAXRECURSION 0)
SQLFiddle DEMO
PS: There are 58 days in Jan in your example and not 57 ;)
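To also cover the SUM part of the question, you can join the expanded days back to Jobs for the DayRate; a sketch extending the CTE above (each expanded day then contributes one DayRate to its month):
;WITH CTE_TotalDays AS
(
SELECT [Start] AS DT, JobID FROM dbo.Jobs
UNION ALL
SELECT DATEADD(DD,1,c.DT), c.JobID FROM CTE_TotalDays c
WHERE c.DT < (SELECT [End] FROM Jobs j2 WHERE j2.JobId = c.JobID)
)
SELECT
MONTH(c.DT) AS [Month]
,YEAR(c.DT) AS [Year]
,COUNT(*) AS [Days]
,SUM(j.DayRate) AS MonthRate
FROM CTE_TotalDays c
JOIN dbo.Jobs j ON j.JobId = c.JobID
GROUP BY MONTH(c.DT),YEAR(c.DT)
OPTION (MAXRECURSION 0)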
You can do it using the following approach:
/* Your table with periods */
declare @table table(JobId int, Start date, [End] date, DayRate money)
INSERT INTO @table (JobId , Start, [End], DayRate)
VALUES
(1, '20130101','20130202', 2500),
(2,'20130105','20130205', 2000),
(3,'20130303','20130402' , 3000 )
/* create a table where all possible dates are stored;
if this code is supposed to be executed often, you can create
the dates table once to avoid the overhead of filling it */
declare @dates table(d date)
declare @d date='20000101'
WHILE @d<'20500101'
BEGIN
INSERT INTO @dates (d) VALUES (@d)
SET @d=DATEADD(DAY,1,@d)
END;
/* and at last get desired output */
SELECT YEAR(d.d) [YEAR], DATENAME(month,d.d) [MONTH], COUNT(*) [Days]
FROM @dates d
CROSS JOIN @table t
WHERE d.d BETWEEN t.Start AND t.[End]
GROUP BY YEAR(d.d), DATENAME(month,d.d)
This only has 1 recursive call instead of 1 for each row. I imagine this will perform better than the accepted answer when you have a large amount of data.
declare @t table(JobId int, Start date, [End] date, DayRate int)
insert @t values
(1,'2013-01-01','2013-02-02', 2500),(2,'2013-01-05','2013-02-05', 2000),(3,'2013-03-03', '2013-04-02',3000)
;WITH a AS
(
SELECT min(Start) s, max([End]) e
FROM @t
), b AS
(
SELECT s, e from a
UNION ALL
SELECT dateadd(day, 1, s), e
FROM b WHERE s <> e
)
SELECT
MONTH(b.s) AS [Month]
,YEAR(b.s) AS [Year]
,COUNT(*) AS [Days]
,SUM(DayRate) MonthDayRate
FROM b
join @t t
on b.s between t.Start and t.[End]
GROUP BY MONTH(b.s),YEAR(b.s)
OPTION (MAXRECURSION 0)
Result:
Month Year Days MonthDayRate
1 2013 58 131500
2 2013 7 15000
3 2013 29 87000
4 2013 2 6000