SQL to get an daily average from month total - sql

I have a table that lists month totals (targets)
person total month
----------- --------------------- -----------
1001 114.00 201005
1001 120.00 201006
1001 120.00 201007
1001 120.00 201008
.
1002 114.00 201005
1002 222.00 201006
1002 333.00 201007
1002 111.00 201008
.
.
but month is an integer(!)
I also have another table that has a list of working days (calendar)
tran_date day_type
----------------------- ---------------------------------
1999-05-01 00:00:00.000 WEEKEND
1999-05-02 00:00:00.000 WEEKEND
1999-05-03 00:00:00.000 WORKING_DAY
1999-05-04 00:00:00.000 WORKING_DAY
1999-06-01 00:00:00.000 .....
.
.
.
What I want to do is get a list of dates with the average for that day based on the number of days in the month where day_type is 'WORKING_DAY' / the month's total.
so if I had say 20 working days in 201005 then I'd get an average of 114/20 on each working day, while the other days would be 0.
somthing like
person tran_date day_avg
------- ----------------------- ---------------------------------
1001 2010-05-01 00:00:00.000 0
1001 2010-05-02 00:00:00.000 0
1001 2010-05-03 00:00:00.000 114/2 (as there are two working days)
1001 2010-05-04 00:00:00.000 114/2 (as there are two working days)
.
.
.
It has to be done as a CTE as this is a limitation of the target system (I can only do one statement)
I can start off with (Dates to
WITH
Dates AS
(
SELECT CAST('19990501' as datetime) TRAN_DATE
UNION ALL
SELECT TRAN_DATE + 1
FROM Dates
WHERE TRAN_DATE + 1 <= CAST('20120430' as datetime)
),
Targets as
(
select CAST(cast(month as nvarchar) + '01' as dateTime) mon_start,
DATEADD(MONTH, 1, CAST(cast(month as nvarchar) + '01' as dateTime)) mon_end,
total
from targets
)
select ????

Sample data (may vary):
select * into #totals from (
select '1001' as person, 114.00 as total, 199905 as month union
select '1001', 120.00, 199906 union
select '1001', 120.00, 199907 union
select '1001', 120.00, 199908
) t
select * into #calendar from (
select cast('19990501' as datetime) as tran_date, 'WEEKEND' as day_type union
select '19990502', 'WEEKEND' union
select '19990503', 'WORKING_DAY' union
select '19990504', 'WORKING_DAY' union
select '19990505', 'WORKING_DAY' union
select '19990601', 'WEEKEND' union
select '19990602', 'WORKING_DAY' union
select '19990603', 'WORKING_DAY' union
select '19990604', 'WORKING_DAY' union
select '19990605', 'WORKING_DAY' union
select '19990606', 'WORKING_DAY' union
select '19990701', 'WORKING_DAY' union
select '19990702', 'WEEKEND' union
select '19990703', 'WEEKEND' union
select '19990704', 'WORKING_DAY' union
select '19990801', 'WORKING_DAY' union
select '19990802', 'WORKING_DAY' union
select '19990803', 'WEEKEND' union
select '19990804', 'WEEKEND' union
select '19990805', 'WORKING_DAY' union
select '19990901', 'WORKING_DAY'
) t
Select statement, it returns 0 if the day is 'weekend' or not exists in calendar table. Please keep in mind that MAXRECURSION is a value between 0 and 32,767.
;with dates as (
select cast('19990501' as datetime) as tran_date
union all
select dateadd(dd, 1, tran_date)
from dates where dateadd(dd, 1, tran_date) <= cast('20010101' as datetime)
)
select t.person , d.tran_date, (case when wd.tran_date is not null then t.total / w_days else 0 end) as day_avg
from dates d
left join #totals t on
datepart(yy, d.tran_date) * 100 + datepart(mm, d.tran_date) = t.month
left join (
select datepart(yy, tran_date) * 100 + datepart(mm, tran_date) as month, count(*) as w_days
from #calendar
where day_type = 'WORKING_DAY'
group by datepart(yy, tran_date) * 100 + datepart(mm, tran_date)
) c on t.month = c.month
left join #calendar wd on d.tran_date = wd.tran_date and wd.day_type = 'WORKING_DAY'
where t.person is not null
option(maxrecursion 20000)

You could calculate the number of working days per month in a subquery. Only the subquery would have to use group by. For example:
select t.person
, wd.tran_date
, t.total / m.WorkingDays as day_avg
from #Targets t
join #WorkingDays wd
on t.month = convert(varchar(6), wd.tran_date, 112)
left join
(
select convert(varchar(6), tran_date, 112) as Month
, sum(case when day_type = 'WORKING_DAY' then 1 end) as WorkingDays
from #WorkingDays
group by
convert(varchar(6), tran_date, 112)
) as m
on m.Month = t.month
Working example at SE Data.
For the "magic number" 112 in convert, see the MSDN page.

If I understood your question correctly, the following query should do it:
SELECT
*,
ISNULL(
(
SELECT total
FROM targets
WHERE
MONTH(tran_date) = month - ROUND(month, -2)
AND c1.day_type = 'WORKING_DAY'
) /
(
SELECT COUNT(*)
FROM calendar c2
WHERE
MONTH(c1.tran_date) = MONTH(c2.tran_date)
AND c2.day_type = 'WORKING_DAY'
),
0
) day_avg
FROM
calendar c1
In plain English:
For each row in calendar,
get the total of the corresponding month if this row is a working day (otherwise get NULL),
get the number of working days in the same month
and divide them.
Finally, convert the NULL (of non-working days) into 0.

Related

SQL Select only missing months

Notice the 2017-04-01, 2018-02-01, 2018-07-01, and 2019-01-01 months are missing in the output. I want to show only those months which are missing. Does anyone know how to go about this?
Query:
SELECT TO_DATE("Month", 'mon''yy') as dates FROM sample_sheet
group by dates
order by dates asc;
Output:
2017-01-01
2017-02-01
2017-03-01
2017-05-01
2017-06-01
2017-07-01
2017-08-01
2017-09-01
2017-10-01
2017-11-01
2017-12-01
2018-01-01
2018-03-01
2018-04-01
2018-05-01
2018-06-01
2018-08-01
2018-09-01
2018-10-01
2018-11-01
2018-12-01
2019-02-01
2019-03-01
2019-04-01
I don't know Vertica, so I wrote a working proof of concept in Microsoft SQL Server and tried to convert it to Vertica syntax based on the online documentation.
It should look like this:
with
months as (
select 2017 as date_year, 1 as date_month, to_date('2017-01-01', 'YYYY-MM-DD') as first_date, to_date('2017-01-31', 'yyyy-mm-dd') as last_date
union all
select
year(add_months(first_date, 1)) as date_year,
month(add_months(first_date, 1)) as date_month,
add_months(first_date, 1) as first_date,
last_day(add_months(first_date, 1)) as last_date
from months
where first_date < current_date
),
sample_dates (a_date) as (
select to_date('2017-01-15', 'YYYY-MM-DD') union all
select to_date('2017-01-22', 'YYYY-MM-DD') union all
select to_date('2017-02-01', 'YYYY-MM-DD') union all
select to_date('2017-04-15', 'YYYY-MM-DD') union all
select to_date('2017-06-15', 'YYYY-MM-DD')
)
select *
from sample_dates right join months on sample_dates.a_date between first_date and last_date
where sample_dates.a_date is null
Months is a recursive dynamic table that holds all months since 2017-01, with first and last day of the month. sample_dates is just a list of dates to test the logic - you should replace it with your own table.
Once you build that monthly calendar table all you need to do is check your dates against it using an outer query to see what dates are not between any of those periods between first_date and last_date columns.
You can build a TIMESERIES of all dates between the first input date and the last input date (The highest granularity of a TIMESERIES is the day.), and filter out only the months' first days out of that; then left join that created sequence of firsts of month with your input to find out where the join would fail, checking for NULLS from the input branch of the join:
WITH
-- your input
input(mth1st) AS (
SELECT DATE '2017-01-01'
UNION ALL SELECT DATE '2017-02-01'
UNION ALL SELECT DATE '2017-03-01'
UNION ALL SELECT DATE '2017-05-01'
UNION ALL SELECT DATE '2017-06-01'
UNION ALL SELECT DATE '2017-07-01'
UNION ALL SELECT DATE '2017-08-01'
UNION ALL SELECT DATE '2017-09-01'
UNION ALL SELECT DATE '2017-10-01'
UNION ALL SELECT DATE '2017-11-01'
UNION ALL SELECT DATE '2017-12-01'
UNION ALL SELECT DATE '2018-01-01'
UNION ALL SELECT DATE '2018-03-01'
UNION ALL SELECT DATE '2018-04-01'
UNION ALL SELECT DATE '2018-05-01'
UNION ALL SELECT DATE '2018-06-01'
UNION ALL SELECT DATE '2018-08-01'
UNION ALL SELECT DATE '2018-09-01'
UNION ALL SELECT DATE '2018-10-01'
UNION ALL SELECT DATE '2018-11-01'
UNION ALL SELECT DATE '2018-12-01'
UNION ALL SELECT DATE '2019-02-01'
UNION ALL SELECT DATE '2019-03-01'
UNION ALL SELECT DATE '2019-04-01'
)
,
-- need a series of month's firsts
-- TIMESERIES works for INTERVAL DAY TO SECOND
-- so build that timeseries, and filter out
-- the month's firsts
limits(mth1st) AS (
SELECT MIN(mth1st) FROM input
UNION ALL SELECT MAX(mth1st) FROM input
)
,
alldates AS (
SELECT dt::DATE FROM limits
TIMESERIES dt AS '1 day' OVER(ORDER BY mth1st::TIMESTAMP)
)
,
allfirsts(mth1st) AS (
SELECT dt FROM alldates WHERE DAY(dt)=1
)
SELECT
allfirsts.mth1st
FROM allfirsts
LEFT JOIN input USING(mth1st)
WHERE input.mth1st IS NULL;
-- out mth1st
-- out ------------
-- out 2017-04-01
-- out 2018-02-01
-- out 2018-07-01
-- out 2019-01-01

Compare data of current week against same week of previous years

I have this table that contains sales by stores & date.
-------------------------------------------
P_DATE - P_STORE - P_SALES
-------------------------------------------
2019-02-05 - S1 - 5000
2019-02-05 - S2 - 9850
2018-06-17 - S1 - 6980
2018-05-17 - S2 - 6590
..
..
..
-------------------------------------------
I want to compare Sum of sales for each store of last 10 weeks of this year with same week of previous years.
I want a result like this :
---------------------------------------------------
Week - Store - Sales-2019 - Sales2018
---------------------------------------------------
20 - S1 - 2580 - 2430
20 - S2 - 2580 - 2430
.
.
10 - S1 - 5905 - 5214
10 - S2 - 4789 - 6530
---------------------------------------------------
I'v tried this :
Select
[Week] = DATEPART(WEEK, E_Date),
[Store] = E_store
[Sales 2019] = Case when Year(P_date) = '2019' Then Sum (P_Sales)
[Sales 2018] = Case when Year(P_date) = '2018' Then Sum (P_Sales)
From
PIECE
Group by
DATEPART(WEEK, E_Date),
E_store
I need your help please.
This script will consider 10 weeks including current week-
WITH wk_list (COMMON,DayMinus)
AS
(
SELECT 1,0 UNION ALL
SELECT 1,1 UNION ALL
SELECT 1,2 UNION ALL
SELECT 1,3 UNION ALL
SELECT 1,4 UNION ALL
SELECT 1,5 UNION ALL
SELECT 1,6 UNION ALL
SELECT 1,7 UNION ALL
SELECT 1,8 UNION ALL
SELECT 1,9
)
SELECT
DATEPART(ISO_WEEK, P_DATE) WK,
P_STORE,
SUM(CASE WHEN YEAR(P_DATE) = 2019 THEN P_SALES ELSE 0 END) SALES_2019,
SUM(CASE WHEN YEAR(P_DATE) = 2018 THEN P_SALES ELSE 0 END) SALES_2018
FROM your_table
WHERE YEAR(P_DATE) IN (2019,2018)
AND DATEPART(ISO_WEEK, P_DATE) IN
(
SELECT A.WKNUM-wk_list.DayMinus AS [WEEK NUMBER]
FROM wk_list
INNER JOIN (
SELECT 1 AS COMMON,DATENAME(ISO_WEEK,GETDATE()) WKNUM
) A ON wk_list.COMMON = A.COMMON
)
GROUP BY DATEPART(ISO_WEEK, P_DATE),P_STORE
But if you want to exclude current week, just replace the following part in above script
, wk_list (COMMON,DayMinus)
AS
(
SELECT 1,1 UNION ALL
SELECT 1,2 UNION ALL
SELECT 1,3 UNION ALL
SELECT 1,4 UNION ALL
SELECT 1,5 UNION ALL
SELECT 1,6 UNION ALL
SELECT 1,7 UNION ALL
SELECT 1,8 UNION ALL
SELECT 1,9 UNION ALL
SELECT 1,10
)
Is this what you're looking for?
DECLARE #t TABLE (TransactionID INT, Week INT, Year INT, Amount MONEY)
INSERT INTO #t
(TransactionID, Week, Year, Amount)
VALUES
(1, 20, 2018, 50),
(2, 20, 2019, 20),
(3, 19, 2018, 35),
(4, 19, 2019, 40),
(5, 20, 2018, 70),
(6, 20, 2019, 80)
SELECT TOP 10 Week, [2018], [2019] FROM (SELECT Week, Year, SUM(Amount) As Amount FROM #t GROUP BY Week, Year) t
PIVOT
(
SUM(Amount)
FOR Year IN ([2018], [2019])
) sq
ORDER BY Week DESC

Date wise hourly (on 24 hour) coustomer count

I have a data set where customer id , customer join time and leave time available. I want to count hourly basis each date customer
Here is sample data set
My expected output
Here I going to add my code snip that i tried,where 1st created 24 hours span then tried to join and aggregate function for getting expected result and got for current date but i need for any date i.e dynamically
select logdate as date,timespan,count(customer_id)
(
SELECT userid,cast(joinTime as date) as logdate,customer_id
,starttime,endtime,timespan
FROM login_out_logs AS logTable
left join
(select '00:00:00 - 01:00:00' timespan,DATEadd(hh,0,cast(dateadd(dd,-1,getdate()))) starttime,dateadd(hh,1,cast(dateadd(dd,-1,getdate()))) endtime
union
select '01:00:00 - 02:00:00', dateadd(hh,1,cast(dateadd(dd,-1,getdate()))),dateadd(hh,2,cast(dateadd(dd,-1,getdate())))
union
select '02:00:00 - 03:00:00', dateadd(hh,2,cast(dateadd(dd,-1,getdate()))),dateadd(hh,3,cast(dateadd(dd,-1,getdate())))
union
select '03:00:00 - 04:00:00', dateadd(hh,3,cast(dateadd(dd,-1,getdate()))),dateadd(hh,4,cast(dateadd(dd,-1,getdate())))
union
select '04:00:00 - 05:00:00', dateadd(hh,4,cast(dateadd(dd,-1,getdate()))),dateadd(hh,5,cast(dateadd(dd,-1,getdate())))
union
select '05:00:00 - 06:00:00',dateadd(hh,5,cast(dateadd(dd,-1,getdate()))),dateadd(hh,6,cast(dateadd(dd,-1,getdate())))
union
select '06:00:00 - 07:00:00',dateadd(hh,6,cast(dateadd(dd,-1,getdate()))),dateadd(hh,7,cast(dateadd(dd,-1,getdate())))
union
select '07:00:00 - 08:00:00',dateadd(hh,7,cast(dateadd(dd,-1,getdate()))),dateadd(hh,8,cast(dateadd(dd,-1,getdate())))
union
select '08:00:00 - 09:00:00',dateadd(hh,8,cast(dateadd(dd,-1,getdate()))),dateadd(hh,9,cast(dateadd(dd,-1,getdate())))
union
select '09:00:00 - 10:00:00',dateadd(hh,9,cast(dateadd(dd,-1,getdate()))),dateadd(hh,10,cast(dateadd(dd,-1,getdate())))
union
select '10:00:00 - 11:00:00',dateadd(hh,10,cast(dateadd(dd,-1,getdate()))),dateadd(hh,11,cast(dateadd(dd,-1,getdate())))
union
select '11:00:00 - 12:00:00',dateadd(hh,11,cast(dateadd(dd,-1,getdate()))),dateadd(hh,12,cast(dateadd(dd,-1,getdate())))
union
select '12:00:00 - 13:00:00',dateadd(hh,12,cast(dateadd(dd,-1,getdate()))),dateadd(hh,13,cast(dateadd(dd,-1,getdate())))
union
select '13:00:00 - 14:00:00',dateadd(hh,13,cast(dateadd(dd,-1,getdate()))),dateadd(hh,14,cast(dateadd(dd,-1,getdate())))
union
select '14:00:00 - 15:00:00',dateadd(hh,14,cast(dateadd(dd,-1,getdate()))),dateadd(hh,15,cast(dateadd(dd,-1,getdate())))
union
select '15:00:00 - 16:00:00',dateadd(hh,15,cast(dateadd(dd,-1,getdate()))),dateadd(hh,16,cast(dateadd(dd,-1,getdate())))
union
select '16:00:00 - 17:00:00',dateadd(hh,16,cast(dateadd(dd,-1,getdate()))),dateadd(hh,17,cast(dateadd(dd,-1,getdate())))
union
select '17:00:00 - 18:00:00',dateadd(hh,17,cast(dateadd(dd,-1,getdate()))),dateadd(hh,18,cast(dateadd(dd,-1,getdate())))
union
select '18:00:00 - 19:00:00',dateadd(hh,18,cast(dateadd(dd,-1,getdate()))),dateadd(hh,19,cast(dateadd(dd,-1,getdate())))
union
select '19:00:00 - 20:00:00',dateadd(hh,19,cast(dateadd(dd,-1,getdate()))),dateadd(hh,20,cast(dateadd(dd,-1,getdate())))
union
select '20:00:00 - 21:00:00',dateadd(hh,20,cast(dateadd(dd,-1,getdate()))),dateadd(hh,21,cast(dateadd(dd,-1,getdate())))
union
select '21:00:00 - 22:00:00',dateadd(hh,21,cast(dateadd(dd,-1,getdate()))),dateadd(hh,22,cast(dateadd(dd,-1,getdate())))
union
select '22:00:00 - 23:00:00',dateadd(hh,22,cast(dateadd(dd,-1,getdate()))),dateadd(hh,23,cast(dateadd(dd,-1,getdate())))
union
select '24:00:00 - 00:00:00',dateadd(hh,23,cast(dateadd(dd,-1,getdate()))),dateadd(hh,23,dateadd(mi,59,cast(dateadd(dd,-1,getdate())))))a
on starttime between jointime and leaveTime
or endtime between jointime and leaveTime
or jointime>=starttime and jointime<endtime
) as T
group by leaveTime,timespan
Date Hour customer_count
2018-01-01 8-9 1
2018-01-01 9-10 1
2018-01-01 10-11 1
2018-01-01 11-12 1
2018-01-01 12-13 1
2018-01-01 13-14 1
2018-01-01 14-15 1
2018-01-01 15-16 1
2018-01-01 16-17 1
2018-01-01 17-18 1
2018-01-01 18-19 1
2018-01-01 19-20 1
2018-01-01 20-21 2
2018-01-01 21-22 3
2018-01-01 22-23 2
2018-01-01 23-00 1
Here is an approach - maybe this already solves your problem. I designed it in order to work with any day-difference between join and leave. However, I can't tell anything about the performance on larger sets since I tested with your example only and the evaluation of all relevant hours might take a bit longer if it comes to bigger data sets.
Anyways, I used a recursice cte here in order to evaluate all hours between join and leave and lateron I group by date and hour:
DECLARE #Cust TABLE(
customer_id INT,
joinTime DATETIME,
leaveTime DATETIME
)
INSERT INTO #Cust VALUES
(536, '2018-01-01 08:05:00', '2018-01-01 18:31:00'),
(344, '2018-01-01 19:37:00', '2018-01-01 20:16:00'),
(344, '2018-01-01 19:49:00', '2018-01-01 20:00:00'),
(899, '2018-01-01 20:49:00', '2018-01-01 21:14:00'),
(2336, '2018-01-01 21:02:00', '2018-01-01 21:03:00'),
(335, '2018-01-01 21:03:00', '2018-01-01 23:43:00'),
(2336, '2018-01-01 21:03:00', '2018-01-02 00:06:00'),
(899, '2018-01-01 21:18:00', '2018-01-01 22:24:00'),
(345, '2018-01-01 21:21:00', '2018-01-01 21:39:00'),
(345, '2018-01-01 21:53:00', '2018-01-02 00:13:00');
;WITH cte AS(
SELECT c.customer_id,
c.joinTime,
c.leaveTime,
c.joinTime x
FROM #Cust c
UNION ALL
SELECT c.customer_id,
c.joinTime,
c.leaveTime,
DATEADD(HOUR, 1, x) x
FROM cte c
WHERE DATEADD(HOUR, 1, x) <= CASE WHEN DATEPART(MINUTE, x) < DATEPART(MINUTE, c.leaveTime) THEN c.leaveTime ELSE DATEADD(HOUR, 1, c.leaveTime) END
)
SELECT CONVERT(DATE, x) AS cDate, DATEPART(HOUR, x) AS cHour, COUNT(*) AS cCount
FROM cte
GROUP BY CONVERT(DATE, x), DATEPART(HOUR, x)
ORDER BY 1,2
OPTION (MAXRECURSION 0)
Try this:
;WITH hourlist(starthour) AS (
SELECT 0 -- Seed Row
UNION ALL
SELECT starthour + 1 -- Recursion
FROM hourlist
where starthour+1<=23
)
SELECT
day
,convert(nvarchar,starthour)+'-'+convert(nvarchar,case when starthour+1=24 then 0 else starthour+1 end) hourtitle
,count(distinct customer_id) 'customer count'
FROM
hourlist h -- list of all hourse
cross join
(
select distinct dateadd(day,datediff(day,0, joinTime),0) from #login_out_logs
union
select distinct dateadd(day,datediff(day,0,leaveTime),0) from #login_out_logs
)q10(day) -- list of all days of jointime and leavetime
inner join #login_out_logs l on -- log considered for specific day/hour if starts before hourend and ends before hourstart
l.joinTime <dateadd(hour,starthour+1,q10.day)
and
l.leaveTime>=dateadd(hour,starthour ,q10.day)
group by day,starthour
order by day,starthour
Note: this will only work for jointimes and leavetimes that differ 0 or 1 days, not 2 or more.

Select unexist date yyyy/mm

I have a SQL query as follows:
SELECT
R.BOOK_YM, COUNT(*)
FROM
#TB_TRANSED_GUI_AMOUNT AS R
GROUP
BY R.BOOK_YM
The selected results like as follows:
BOOK_YM | count(*)
---------+---------
201411 | 51
201412 | 142
201501 | 1
201506 | 1
How do I modify the result like the follows:
BOOK_YM | count(*)
--------+-----------
201411 | 51
201412 | 142
201501 | 1
201502 | 0
201503 | 0
201504 | 0
201505 | 0
201506 | 1
You can use the following to get a table of all YYYYMM between 2 dates and use the resulting #YMList variable in a join:
declare #StartDate date = cast(dateadd(month, -12, GetDate()) as date)
declare #EndDate date = cast(GetDate() as date)
declare #YMList TABLE (YYYYMM INT)
;with dates as (
SELECT cast(#StartDate as Date) [date]
UNION ALL
SELECT DATEADD(month,1,t.date)
FROM dates t
WHERE t.[date] < #EndDate
)
insert #YMList (YYYYMM)
select
YEAR([Date]) * 100 + MONTH([Date]) AS YYYYMM
from dates
WHERE [Date] < #EndDate
OPTION (MAXRECURSION 10000)
Try this: (I think BOOK_YM is a varchar field if not CAST it)
SELECT
Y.YYYY + M.MM AS BOOK_YM,
COUNT(R.BOOK_YM) AS CNT
FROM
(SELECT SUBSTRING(BOOK_YM, 1, 4) As YYYY
FROM t
GROUP BY SUBSTRING(BOOK_YM, 1, 4)) AS Y
CROSS JOIN
(SELECT '01' As MM
UNION ALL SELECT '02'
UNION ALL SELECT '03'
UNION ALL SELECT '04'
UNION ALL SELECT '05'
UNION ALL SELECT '06'
UNION ALL SELECT '07'
UNION ALL SELECT '08'
UNION ALL SELECT '09'
UNION ALL SELECT '10'
UNION ALL SELECT '11'
UNION ALL SELECT '12') AS M
LEFT JOIN
t AS R
ON Y.YYYY + M.MM = R.BOOK_YM
WHERE
Y.YYYY + M.MM BETWEEN (SELECT MIN(BOOK_YM) FROM t) AND (SELECT MAX(BOOK_YM) FROM t)
GROUP BY
Y.YYYY + M.MM

Get records only if consecutive days per user is 30 or greater

I have the following data being returned from a query. Essentially I am putting this in a temp table so it is now in a temp table that I can query off of(Obviously a lot more data in real life, I am just showing an example):
EmpId Date
1 2011-01-01
1 2011-01-02
1 2011-01-03
2 2011-02-03
3 2011-03-01
4 2011-03-02
5 2011-01-02
I need to return only EmpId's that have 30 or more consecutive days in the date column. I also need to return the day count for these employees that have 30 or more consecutive days. There could potentially be 2 or more sets of different consecutive days that are 30 or more days. iIn this instance I would like to return multiple rows. So if an employee has a date from 2011-01-01 to 2011-02-20 then return this and the count in one row. Then if this same employee has dates of 2011-05-01 to 2011-07-01 then return this in another row. Essentially all breaks in consecutive days are treated as a seperate record.
Using DENSE_RANK should do the trick:
;WITH sampledata
AS (SELECT 1 AS id, DATEADD(day, -0, GETDATE())AS somedate
UNION ALL SELECT 1, DATEADD(day, -1, GETDATE())
UNION ALL SELECT 1, DATEADD(day, -2, GETDATE())
UNION ALL SELECT 1, DATEADD(day, -3, GETDATE())
UNION ALL SELECT 1, DATEADD(day, -4, GETDATE())
UNION ALL SELECT 1, DATEADD(day, -5, GETDATE())
UNION ALL SELECT 1, DATEADD(day, -10, GETDATE())
UNION ALL SELECT 1, '2011-01-01 00:00:00'
UNION ALL SELECT 1, '2010-12-31 00:00:00'
UNION ALL SELECT 1, '2011-02-01 00:00:00'
UNION ALL SELECT 1, DATEADD(day, -10, GETDATE())
UNION ALL SELECT 2, DATEADD(day, 0, GETDATE())
UNION ALL SELECT 2, DATEADD(day, -1, GETDATE())
UNION ALL SELECT 2, DATEADD(day, -2, GETDATE())
UNION ALL SELECT 2, DATEADD(day, -6, GETDATE())
UNION ALL SELECT 3, DATEADD(day, 0, GETDATE())
UNION ALL SELECT 4, DATEADD(day, 0, GETDATE())
UNION ALL SELECT 5, DATEADD(day, 0, GETDATE()))
, ranking
AS (SELECT *, DENSE_RANK()OVER(PARTITION BY id ORDER BY DATEDIFF(day, 0, somedate)) - DATEDIFF(day, 0, somedate)AS dategroup
FROM sampledata)
SELECT id
, MIN(somedate)AS range_start
, MAX(somedate)AS range_end
, DATEDIFF(day, MIN(somedate), MAX(somedate)) + 1 AS consecutive_days
FROM ranking
GROUP BY id, dategroup
--HAVING DATEDIFF(day, MIN(somedate), MAX(somedate)) + 1 >= 30 --change as needed
ORDER BY id, range_start
Something like this should do the trick, haven't tested it though.
SELECT
a.empid
, count(*) as consecutive_count
, min(a.mydate) as startdate
FROM (SELECT * FROM logins ORDER BY mydate) a
INNER JOIN (SELECT * FROM logins ORDER BY mydate) b
ON (a.empid = b.empid AND datediff(day,a.mydate,b.mydate) = 1
GROUP BY a.empid, startdate
HAVING consecutive_count > 30
This is a good case for a recursive CTE. I stole the data table from #Davin:
with data AS --sample data
( SELECT 1 as id ,DATEADD(DD,-0,GETDATE()) as date UNION ALL
SELECT 1 as id ,DATEADD(DD,-1,GETDATE()) as date UNION ALL
SELECT 1 as id ,DATEADD(DD,-2,GETDATE()) as date UNION ALL
SELECT 1 as id ,DATEADD(DD,-3,GETDATE()) as date UNION ALL
SELECT 1 as id ,DATEADD(DD,-4,GETDATE()) as date UNION ALL
SELECT 1 as id ,DATEADD(DD,-5,GETDATE()) as date UNION ALL
SELECT 1 as id ,DATEADD(DD,-10,GETDATE()) as date UNION ALL
SELECT 1 as id ,'2011-01-01 00:00:00.000' as date UNION ALL
SELECT 1 as id ,'2010-12-31 00:00:00.000' as date UNION ALL
SELECT 1 as id ,'2011-02-01 00:00:00.000' as date UNION ALL
SELECT 1 as id ,DATEADD(DD,-10,GETDATE()) as date UNION ALL
SELECT 2 as id ,DATEADD(DD,0,GETDATE()) as date UNION ALL
SELECT 2 as id ,DATEADD(DD,-1,GETDATE()) as date UNION ALL
SELECT 2 as id ,DATEADD(DD,-2,GETDATE()) as date UNION ALL
SELECT 2 as id ,DATEADD(DD,-6,GETDATE()) as date UNION ALL
SELECT 3 as id ,DATEADD(DD,0,GETDATE()) as date UNION ALL
SELECT 4 as id ,DATEADD(DD,0,GETDATE()) as date UNION ALL
SELECT 5 as id ,DATEADD(DD,0,GETDATE()) as date )
,CTE AS
(
SELECT id, CAST(date as date) Date, Consec = 1
FROM data
UNION ALL
SELECT t.id, CAST(t.date as DATE) Date, Consec = (c.Consec + 1)
FROM data T
INNER JOIN CTE c
ON T.id = c.id
AND CAST(t.date as date) = CAST(DATEADD(day, 1, c.date) as date)
)
SELECT id, MAX(consec)
FROM CTE
GROUP BY id
ORDER BY id
Basically this generates a lot of rows per person, and measures how many days in a row each date represents.
Assuming there are no duplicate dates for the same employee:
;WITH ranged AS (
SELECT
EmpId,
Date,
RangeId = DATEDIFF(DAY, 0, Date)
- ROW_NUMBER() OVER (PARTITION BY EmpId ORDER BY Date)
FROM atable
)
SELECT
EmpId,
StartDate = MIN(Date),
EndDate = MAX(Date),
DayCount = DATEDIFF(DAY, MIN(Date), MAX(Date)) + 1
FROM ranged
GROUP BY EmpId, RangeId
HAVING DATEDIFF(DAY, MIN(Date), MAX(Date)) + 1 >= 30
ORDER BY EmpId, MIN(Date)
DATEDIFF turns the dates into integers (the difference of days between the 0 date (1900-01-01) and Date). If the dates are consecutive, the integers are consecutive too. Using the data sample in the question as an example, the DATEDIFF results will be:
EmpId Date DATEDIFF
----- ---------- --------
1 2011-01-01 40542
1 2011-01-02 40543
1 2011-01-03 40544
2 2011-02-03 40575
3 2011-03-01 40601
4 2011-03-02 40602
5 2011-01-02 40543
Now, if you take each employee's rows, assign row numbers to them in the order of dates, and get the difference between the numeric representations and row numbers, you will find that the difference stays the same for consecutive numbers (and, therefore, consecutive dates). Using a slightly different sample for better illustration, it will look like this:
Date DATEDIFF RowNum RangeId
---------- -------- ------ -------
2011-01-01 40542 1 40541
2011-01-02 40543 2 40541
2011-01-03 40544 3 40541
2011-01-05 40546 4 40542
2011-01-07 40548 5 40543
2011-01-08 40549 6 40543
2011-01-09 40550 7 40543
The specific value of RangeId is not important, only the fact that it remains the same for consecutive dates matters. Based on that fact, you can use it as a grouping criterion to count the dates in the group and get the range bounds.
The above query uses DATEDIFF(DAY, MIN(Date), MAX(Date)) + 1 to count the days, but you could also simply use COUNT(*) instead.