Simultaneous calls - SQL

I'm trying to calculate the number of simultaneous calls at the time a particular call is made by looking at the datetime ranges. My query works, but it takes ~10 minutes to run against only 95,000 records, which is too long. Any ideas for optimization?
SELECT r.*,
rr.ChannelsActive 'ChannelsActive'
FROM #rg r
OUTER APPLY
(
SELECT SUM(1) AS ChannelsActive
FROM #rg r_inner
WHERE
(
r_inner.CallStart BETWEEN r.CallStart AND r.CallEnd
OR r_inner.CallEnd BETWEEN r.CallStart AND r.CallEnd
OR r.CallStart BETWEEN r_inner.CallStart AND r_inner.CallEnd
OR r.CallEnd BETWEEN r_inner.CallStart AND r_inner.CallEnd
)
) rr
Example Data
CREATE TABLE #rg
(
CallStart DATETIME,
CallEnd DATETIME
)
CREATE INDEX ix1
ON #rg(CallStart, CallEnd)
CREATE INDEX ix2
ON #rg(CallEnd, CallStart);
WITH T(N, R)
AS (SELECT TOP (95000) ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS RN,
ABS(120 + 30 * SQRT(-2 * LOG(ABS(CAST(CAST(CRYPT_GEN_RANDOM(8) AS BIGINT) AS FLOAT) / 9223372036854775807))) * COS(2 * PI() * ABS(CAST(CAST(CRYPT_GEN_RANDOM(8) AS BIGINT) AS FLOAT) / 9223372036854775807)))
FROM sys.all_objects o1,
sys.all_objects o2)
INSERT INTO #rg
SELECT DATEADD(SECOND, N, GETDATE()),
DATEADD(SECOND, N + R, GETDATE())
FROM T

This should do it:
;WITH cteCallEvents As
(
SELECT *, CallStart As EventTime, 1 As EventType FROM #rg r
UNION ALL
SELECT *, CallEnd As EventTime, 0 As EventType FROM #rg r
)
, cteCallCounts As
(
SELECT *,
ROW_NUMBER() OVER(Order By EventTime) as EventCount,
ROW_NUMBER() OVER(Partition By EventType Order By EventTime) as TypeCount
FROM cteCallEvents
)
SELECT *,
2*TypeCount - EventCount As OpenCalls
FROM cteCallCounts
WHERE EventType = 1
It should take a couple of seconds at most, and it works on any SQL Server 2005+. (At each start event, EventCount counts all events so far while TypeCount counts only the starts, so open calls = starts - ends = TypeCount - (EventCount - TypeCount) = 2*TypeCount - EventCount.)

Use SQL like this to get a list of start/end events...
Select CallStart, 1 As CallCount From #rg
Union All
Select CallEnd, -1 From #rg
Order By CallStart
...then treat this as a simple running-totals problem, which is solved differently depending on your SQL Server version, or which can easily be handled in application code if that's an option.
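On SQL Server 2012 or later, for example, the running total drops straight out of a windowed SUM; a minimal sketch against the #rg table from the question (only the tie-breaking is left open, since simultaneous starts and ends share an EventTime):
-- Running total of +1/-1 events = number of calls open at each event
SELECT e.EventTime,
SUM(e.CallCount) OVER (ORDER BY e.EventTime ROWS UNBOUNDED PRECEDING) AS OpenCalls
FROM
(
SELECT CallStart AS EventTime, 1 AS CallCount FROM #rg
UNION ALL
SELECT CallEnd, -1 FROM #rg
) e
ORDER BY e.EventTime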

Related

Get range of dates from dates record in MS SQL

I have date records
with DateTable (dateItem) as
(
select '2022-07-03' union all
select '2022-07-05' union all
select '2022-07-04' union all
select '2022-07-09' union all
select '2022-07-12' union all
select '2022-07-13' union all
select '2022-07-18'
)
select dateItem
from DateTable
order by 1 asc
I want to get the ranges of dates from these records, like this
with DateTableRange (dateItemStart, dateItemend) as
(
select '2022-07-03','2022-07-05' union all
select '2022-07-09','2022-07-09' union all
select '2022-07-12','2022-07-13' union all
select '2022-07-18','2022-07-18'
)
select dateItemStart, dateItemend
from DateTableRange
I am able to do it in SQL by looping with WHILE: take the first date, check whether the next date is one day later, and if so extend the end date, repeating in a loop.
But I don't know what the best or most optimized way is, as there was a lot of looping and temp tables involved.
Edited:
As in the data we have 3, 4, and 5, while 6, 7, and 8 are missing, the range is 3-5.
9 exists and 10 is missing, so the range is 9-9.
So the ranges depend purely on the consecutive data in DateTable.
Any suggestion will be appreciated
With the additional clarity, this requires a gaps-and-islands approach: first identify adjacent rows as groups, then use a window to identify the first and last value of each group.
I'm sure this could be refined further, but it should give your desired results:
with DateTable (dateItem) as
(
select '2022-07-03' union all
select '2022-07-05' union all
select '2022-07-04' union all
select '2022-07-09' union all
select '2022-07-12' union all
select '2022-07-13' union all
select '2022-07-18'
), valid as (
select *,
case when exists (
select * from DateTable d2 where Abs(DateDiff(day, d.dateitem, d2.dateitem)) = 1
) then 1 else 0 end v
from DateTable d
), grp as (
select *,
Row_Number() over(order by dateitem) - Row_Number()
over (partition by v order by dateitem) g
from Valid v
)
select distinct
Iif(v = 0, dateitem, First_Value(dateitem) over(partition by g order by dateitem)) DateItemStart,
Iif(v = 0, dateitem, First_Value(dateitem) over(partition by g order by dateitem desc)) DateItemEnd
from grp
order by dateItemStart;
After clarification, this is definitely a 'gaps and islands' problem.
The solution can look like this:
WITH DateTable(dateItem) AS
(
SELECT * FROM (
VALUES
('2022-07-03'),
('2022-07-05'),
('2022-07-04'),
('2022-07-09'),
('2022-07-12'),
('2022-07-13'),
('2022-07-18')
) t(v)
)
SELECT
MIN(dateItem) AS range_from,
MAX(dateItem) AS range_to
FROM (
SELECT
*,
SUM(CASE WHEN DATEADD(day, 1, prev_dateItem) >= dateItem THEN 0 ELSE 1 END) OVER (ORDER BY rn) AS range_id
FROM (
SELECT
ROW_NUMBER() OVER (ORDER BY dateItem) AS rn,
CAST(dateItem AS date) AS dateItem,
CAST(LAG(dateItem) OVER (ORDER BY dateItem) AS date) AS prev_dateItem
FROM DateTable
) groups
) islands
GROUP BY range_id

SQL - Combine two rows if difference is below threshold

I have a table like this in SQL Server:
id start_time end_time
1 10:00:00 10:34:00
2 10:38:00 10:52:00
3 10:53:00 11:23:00
4 11:24:00 11:56:00
5 14:20:00 14:40:00
6 14:41:00 14:59:00
7 15:30:00 15:40:00
What I would like to have is a query that outputs consolidated records based on the time difference between two consecutive records (the end_time of row n and the start_time of row n+1). All records where the time difference is less than 2 minutes should be combined into one time entry, keeping the ID of the first record. This should also combine more than two records if multiple consecutive records have a time difference of less than 2 minutes.
This would be the expected output:
id start_time end_time
1 10:00:00 10:34:00
2 10:38:00 11:56:00
5 14:20:00 14:59:00
7 15:30:00 15:40:00
Thanks in advance for any tips on how to build the query.
Edit:
I started with following code to calculate the lead_time and the time difference but do not know how to group and consolidate.
WITH rows AS
(
SELECT *, ROW_NUMBER() OVER (ORDER BY Id) AS rn
FROM #temp
)
SELECT mc.id, mc.start_time, mc.end_time, mp.start_time lead_time, DATEDIFF(MINUTE, mc.[end_time], mp.[start_time]) as DiffToNewSession
FROM rows mc
LEFT JOIN rows mp
ON mc.rn = mp.rn - 1
Window functions in T-SQL can handle a lot of data statistics, such as:
create table #temp(id int identity(1,1), start_time time, end_time time)
insert into #temp(start_time, end_time)
values ('10:00:00', '10:34:00')
, ('10:38:00', '10:52:00')
, ('10:53:00', '11:23:00')
, ('11:24:00', '11:56:00')
, ('14:20:00', '14:40:00')
, ('14:41:00', '14:59:00')
, ('15:30:00', '15:40:00')
;with c0 as(
select *, LAG(end_time,1,'00:00:00') over (order by id) as lag_time
from #temp
), c1 as(
select *, case when DATEDIFF(MI, lag_time, start_time) <= 2 then 1 else 0 end as gflag
from c0
), c2 as(
select *, SUM(case when gflag=0 then 1 else 0 end) over(order by id) as gid
from c1
)
select MIN(id) as id, MIN(start_time) as start_time, MAX(end_time) as end_time
from c2
group by gid
To better describe the process of building up the result, I simply used c0, c1, c2... to represent the levels; you can merge some of the levels and optimize.
If you can't use id as the sort condition, you will need to change the ordering parts of the statement above.
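As one example of merging levels, c0 and c1 can be collapsed by moving the LAG directly into the CASE; a minimal sketch of the same logic (behavior should be unchanged, assuming the #temp table above):
;with c1 as(
select *, case when DATEDIFF(MI, LAG(end_time, 1, '00:00:00') over (order by id), start_time) <= 2 then 1 else 0 end as gflag
from #temp
), c2 as(
select *, SUM(case when gflag=0 then 1 else 0 end) over(order by id) as gid
from c1
)
select MIN(id) as id, MIN(start_time) as start_time, MAX(end_time) as end_time
from c2
group by gid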
You can use a recursive CTE to get the result that you want. This method simply compares the current end_time with the next start_time; if the gap is below the 2-minute threshold, it carries the same start_time forward as grp_start. At the end, it simply does a GROUP BY on grp_start.
with rcte as
(
-- anchor member
select *, grp_start = start_time
from tbl
where id = 1
union all
-- recursive member
select t.id, t.start_time, t.end_time,
grp_start = case when datediff(second, r.end_time, t.start_time) <= 120
then r.grp_start
else t.start_time
end
from tbl t
inner join rcte r on t.id = r.id + 1
)
select id = min(id), grp_start as start_time, max(end_time) as end_time
from rcte
group by grp_start
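One caveat: this recursive CTE recurses once per row, and SQL Server's default recursion limit is 100, so for more than about 100 rows you would need to append OPTION (MAXRECURSION 0) (or a suitable explicit limit) to the final SELECT.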
I guess this should do the trick without recursion. Again, I used several CTEs to make the solution a bit easier to read; I guess it can be reduced a little...
-- Assumed setup (not shown in the original): CREATE TABLE T1 (id int, starttime time, endtime time)
INSERT INTO T1 VALUES
(1,'10:00:00','10:34:00')
,(2,'10:38:00','10:52:00')
,(3,'10:53:00','11:23:00')
,(4,'11:24:00','11:56:00')
,(5,'14:20:00','14:40:00')
,(6,'14:41:00','14:59:00')
,(7,'15:30:00','15:40:00')
GO
WITH cte AS(
SELECT *
,ROW_NUMBER() OVER (ORDER BY id) AS rn
,DATEDIFF(MINUTE, ISNULL(LAG(endtime) OVER (ORDER BY id), starttime), starttime) AS diffMin
,COUNT(*) OVER (PARTITION BY (SELECT 1)) as maxRn
FROM T1
),
cteFirst AS(
SELECT *
FROM cte
WHERE rn = 1 OR diffMin > 2
),
cteGrp AS(
SELECT *
,ISNULL(LEAD(rn) OVER (ORDER BY id), maxRn+1) AS nextRn
FROM cteFirst
)
SELECT f.id, f.starttime, MAX(ISNULL(n.endtime, f.endtime)) AS endtime
FROM cteGrp f
LEFT JOIN cte n ON n.rn >= f.rn AND n.rn < f.nextRn
GROUP BY f.id, f.starttime

Speeding up SQL Server cross apply to get aggregated data

In SQL Server, I am trying to put together a single query which grabs a row and includes the aggregated data from a two-hour window before that row as well as aggregated data from a one-hour window after it. How can I make this run faster?
The rows have timestamps with millisecond precision and are not evenly spaced. I have over 50 million rows in this table, and the query does not seem to be completing. There are indexes in many places, but they don't seem to help. I was also thinking about using a window function, but I am not sure that it's possible to have a sliding window with unevenly distributed rows. Also, for the future one-hour window, I am not sure how that would be done with a SQL window.
Box is a string and has 10 unique values.
Process is a string and has 30 unique values.
The average duration_ms is 200 ms.
Errors account for less than 0.1% of the data.
The 50 million rows describe a year's worth of data.
select
c1.start_time,
c1.end_time,
c1.box,
c1.process,
datediff(ms,c1.start_time,c1.end_time) as duration_ms,
datepart(dw,c1.start_time) as day_of_week,
datepart(hour,c1.start_time) as hour_of_day,
c3.*,
c5.*
from metrics_table c1
cross apply
(select
avg(cast(datediff(ms,c2.start_time,c2.end_time) as numeric)) as avg_ms,
count(1) as num_process_total,
count(distinct process) as num_process_unique,
count(distinct box) as num_box_unique
from metrics_table c2
where datediff(minute,c2.start_time,c1.start_time) <= 120
and c1.start_time> c2.start_time
and c2.error_code = 0
) c3
cross apply
(select
avg(case when datediff(ms,c4.start_time,c4.end_time)>1000 then 1.0 else 0.0 end) as percent_over_thresh
from metrics_table c4
where datediff(hour,c1.start_time,c4.start_time) <= 1
and c4.start_time> c1.start_time
and c4.error_code= 0
) c5
where
c1.error_code= 0
Edit
Version: SQL Azure 12.0
Adding execution plan: (execution plan screenshot omitted)
The following should be a step in the right direction...
Note: c2.start_time & c4.start_time are no longer wrapped in DATEDIFF functions, making the predicates SARGable...
SELECT
c1.start_time,
c1.end_time,
c1.box,
c1.process,
DATEDIFF(ms, c1.start_time, c1.end_time) AS duration_ms,
DATEPART(dw, c1.start_time) AS day_of_week,
DATEPART(HOUR, c1.start_time) AS hour_of_day,
c3.*,
c5.*
FROM
dbo.metrics_table c1
CROSS APPLY (
SELECT
AVG(CAST(DATEDIFF(ms, c2.start_time, c2.end_time) AS NUMERIC)) AS avg_ms,
COUNT(1) AS num_process_total,
COUNT(DISTINCT process) AS num_process_unique,
COUNT(DISTINCT box) AS num_box_unique
FROM
dbo.metrics_table c2
WHERE
--DATEDIFF(minute,c2.start_time,c1.start_time) <= 120
c2.start_time >= DATEADD(MINUTE, -120, c1.start_time)
AND c2.start_time < c1.start_time
AND c2.error_code = 0
) c3
CROSS APPLY (
SELECT
AVG(CASE WHEN DATEDIFF(ms, c4.start_time, c4.end_time) > 1000 THEN 1.0 ELSE 0.0 END
) AS percent_over_thresh
FROM
dbo.metrics_table c4
WHERE
--DATEDIFF(HOUR, c1.start_time, c4.start_time) <= 1
c4.start_time <= DATEADD(HOUR, 1, c1.start_time)
AND c4.start_time > c1.start_time
AND c4.error_code = 0
) c5
WHERE
c1.error_code = 0;
Of course, making a query SARGable doesn't do any good unless there's an appropriate index available. The following should be good for all 3 metrics_table references... (check what indexes are currently available; there's a chance that you may not need to create a new one)
CREATE NONCLUSTERED INDEX ixf_metricstable_errorcode_starttime ON dbo.metrics_table (
error_code,
start_time
)
INCLUDE (
end_time,
box,
process
)
WHERE
error_code = 0;
I used BETWEEN and got good performance in my simple test rig. I've also used a columnstore index, as 50 million records is DW volume:
CREATE TABLE dbo.metrics_table (
rowId INT IDENTITY,
start_time DATETIME NOT NULL,
end_time DATETIME NOT NULL,
box VARCHAR(10) NOT NULL,
process VARCHAR(10) NOT NULL,
error_code INT NOT NULL
);
-- Add records
;WITH cte AS (
SELECT TOP 3334 ROW_NUMBER() OVER ( ORDER BY ( SELECT 1 ) ) rn
FROM sys.columns c1
CROSS JOIN sys.columns c2
CROSS JOIN sys.columns c3
)
INSERT INTO dbo.metrics_table ( start_time, end_time, box, process, error_code )
SELECT
DATEADD( ms, rn, DATEADD( day, rn % 365, '1 Jan 2017' ) ) AS start_time,
DATEADD( ms, rn % 409, DATEADD( ms, rn, DATEADD( day, rn % 365, '1 Jan 2017' ) ) ) AS end_time,
'box' + CAST( boxes.box AS VARCHAR(10) ) box,
'process' + CAST( boxes.box AS VARCHAR(10) ) process,
ABS( CAST( rn % 3000 AS BIT ) -1 ) error_code
FROM cte c
CROSS JOIN ( SELECT TOP 10 rn FROM cte ) AS boxes(box)
CROSS JOIN ( SELECT TOP 30 rn FROM cte ) AS processes(process);
-- Create normal clustered index to order the data
CREATE CLUSTERED INDEX cci_metrics_table ON dbo.metrics_table ( start_time, end_time, box, process );
--CREATE CLUSTERED INDEX cci_metrics_table ON dbo.metrics_table ( box, process, start_time, end_time );
-- Convert to columnstore
CREATE CLUSTERED COLUMNSTORE INDEX cci_metrics_table ON dbo.metrics_table WITH ( MAXDOP = 1, DROP_EXISTING = ON );
IF OBJECT_ID('tempdb..#tmp1' ) IS NOT NULL DROP TABLE #tmp1
-- two hour window before, 1 hour window after
SELECT
c1.start_time,
c1.end_time,
c1.box,
c1.process,
DATEDIFF( ms, c1.start_time, c1.end_time ) AS duration_ms,
DATEPART( dw, c1.start_time ) AS day_of_week,
DATEPART( hour, c1.start_time ) AS hour_of_day,
c2.xavg,
c2.num_process_total,
c2.num_process_unique,
c2.num_box_unique,
c3.percent_over_thresh
INTO #tmp1
FROM dbo.metrics_table c1
CROSS APPLY
(
SELECT
COUNT(1) AS num_process_total,
AVG( CAST( DATEDIFF( ms, start_time, end_time ) AS NUMERIC ) ) xavg,
COUNT( DISTINCT process ) num_process_unique,
COUNT( DISTINCT box ) num_box_unique
FROM dbo.metrics_table c2
WHERE c2.error_code = 0
AND c2.start_time Between DATEADD( minute, -120, c1.start_time ) And c1.start_time
AND c1.start_time > c2.start_time
) c2
CROSS APPLY
(
SELECT
AVG( CASE WHEN DATEDIFF( ms, c4.start_time, c4.end_time ) > 1000 THEN 1.0 ELSE 0.0 END ) percent_over_thresh
FROM dbo.metrics_table c4
WHERE c4.error_code = 0
AND c4.start_time Between c1.start_time And DATEADD( minute, 60, c1.start_time )
AND c4.start_time > c1.start_time
) c3
WHERE error_code = 0

Calculate Average time spent based on a change in location zone

I have a table similar to
create table LOCHIST
(
RES_ID VARCHAR(10) NOT NULL,
LOC_DATE TIMESTAMP NOT NULL,
LOC_ZONE VARCHAR(10)
)
with values such as
insert into LOCHIST values('0911','2015-09-23 12:27:00.000000','SYLVSYLGA');
insert into LOCHIST values('5468','2013-02-15 13:13:24.000000','30726');
insert into LOCHIST values('23894','2013-02-15 13:12:13.000000','BECTFOUNC');
insert into LOCHIST values('24119','2013-02-15 13:12:09.000000','30363');
insert into LOCHIST values('7101','2013-02-15 13:11:37.000000','37711');
insert into LOCHIST values('26083','2013-02-15 13:11:36.000000','SHAWANDAL');
insert into LOCHIST values('24978','2013-02-15 13:11:36.000000','38132');
insert into LOCHIST values('26696','2013-02-15 13:11:27.000000','29583');
insert into LOCHIST values('5468','2013-02-15 13:11:00.000000','37760');
insert into LOCHIST values('5552','2013-02-15 13:10:55.000000','30090');
insert into LOCHIST values('24932','2013-02-15 13:10:48.000000','JBTTLITGA');
insert into LOCHIST values('23894','2013-02-15 13:10:42.000000','47263');
insert into LOCHIST values('26803','2013-02-15 13:10:25.000000','32534');
insert into LOCHIST values('24434','2013-02-15 13:10:03.000000','PLANSUFVA');
insert into LOCHIST values('26696','2013-02-15 13:10:00.000000','GEORALBGA');
insert into LOCHIST values('5468','2013-02-15 13:09:54.000000','19507');
insert into LOCHIST values('23894','2013-02-15 13:09:48.000000','37725');
This table literally goes on for millions of records.
Each RES_ID represents the ID of a trailer who pings their location to a LOC_ZONE which is then stored at the time in LOC_DATE.
What I am trying to find is the average amount of time spent by all trailers in a specific location zone. For example, if trailer x spent 4 hours in loc zone PLANSUFVA, and trailer y spent 6 hours in loc zone PLANSUFVA, I would want to return
Loc Zone Avg Time
PLANSUFVA 5
Is there any way to do this without cursors?
I really appreciate your help.
This needs SQL 2012:
with data
as (
select *,
(case when LOC_ZONE != PREVIOUS_LOC_ZONE or PREVIOUS_LOC_ZONE is null then ROW_ID else null end) as STAY_START,
(case when LOC_ZONE != NEXT_LOC_ZONE or NEXT_LOC_ZONE is null then ROW_ID else null end) as STAY_END
from (
select RES_ID, LOC_ZONE, LOC_DATE,
lead(LOC_DATE, 1) over (partition by RES_ID, LOC_ZONE order by LOC_DATE) as NEXT_LOC_DATE,
lag(LOC_ZONE, 1) over (partition by RES_ID order by LOC_DATE) as PREVIOUS_LOC_ZONE,
lead(LOC_ZONE, 1) over (partition by RES_ID order by LOC_DATE) as NEXT_LOC_ZONE,
ROW_NUMBER() over (order by RES_ID, LOC_ZONE, LOC_DATE) as ROW_ID
from LOCHIST
) t
), stays as (
select * from (
select RES_ID, LOC_ZONE, STAY_START, lead(STAY_END, 1) over (order by ROWID) as STAY_END
from (
select RES_ID, LOC_ZONE, STAY_START, STAY_END, ROW_NUMBER() over (order by RES_ID, LOC_ZONE, STAY_START desc) as ROWID
from data
where STAY_START is not null or STAY_END is not null
) t
) t
where STAY_START is not null and STAY_END is not null
)
select s.LOC_ZONE, avg(datediff(second, LOC_DATE, NEXT_LOC_DATE)) / 60 / 60 as AVG_IN_HOURS
from data d
inner join stays s on d.RES_ID = s.RES_ID and d.LOC_ZONE = s.LOC_ZONE and d.ROW_ID >= s.STAY_START and d.ROW_ID < s.STAY_END
group by s.LOC_ZONE
To solve this problem, you need the amount of time spent at each location.
One way to do this is with a correlated subquery. You need to group adjacent values. The idea is to find the next value in the sequence:
select lh.res_id, min(loc_zone) as loc_zone, min(loc_date) as StartTime,
max(loc_date) as EndTime,
nextdate as NextStartTime
from (select lh.*,
(select min(loc_date) from lochist lh2
where lh2.res_id = lh.res_id and lh2.loc_zone <> lh.loc_zone and
lh2.loc_date > lh.loc_date
) as nextdate
from lochist lh
) lh
group by lh.res_id, nextdate
With this data, you can then get the average that you want.
I am not clear whether the time should be based on EndTime - StartTime (last recorded time at the location minus the first recorded time) or NextStartTime - StartTime (first time at the next location minus first time at this location).
Also, this returns NULL for the last location for each res_id; you don't say what to do about the last in the sequence.
If you build an index on res_id, loc_date, loc_zone, it might run faster.
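A hedged sketch of such an index (the name is illustrative):
-- Supports the res_id equality and the loc_date range comparisons of the
-- correlated subquery, with loc_zone available from the key as well
CREATE INDEX ix_lochist_resid_locdate_loczone
ON LOCHIST (RES_ID, LOC_DATE, LOC_ZONE);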
If you had Oracle or SQL Server 2012, the right query is:
select lh.*,
lead(loc_date) over (partition by res_id order by loc_date) as nextdate
from (select lh.*,
lag(loc_zone) over (partition by res_id order by loc_date) as prevzone
from lochist lh
) lh
where prevzone is null or prevzone <> loc_zone
Now you have one row per stay and nextdate is the date at the next zone.
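To finish it off, the per-zone average can then be taken over those stay rows; a minimal sketch building on the query above (the last stay per res_id has a NULL nextdate and is simply excluded here):
with stays as (
select lh.*,
lead(loc_date) over (partition by res_id order by loc_date) as nextdate
from (select lh.*,
lag(loc_zone) over (partition by res_id order by loc_date) as prevzone
from lochist lh
) lh
where prevzone is null or prevzone <> loc_zone
)
select loc_zone,
avg(cast(datediff(second, loc_date, nextdate) as float)) / 3600 as avg_time_hours
from stays
where nextdate is not null
group by loc_zone;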
This should get you each zone ordered by the average number of minutes spent in it. The CROSS APPLY returns the next ping in a different zone.
SELECT
loc.LOC_ZONE
,AVG(DATEDIFF(mi,loc.LOC_DATE,nextPing.LOC_DATE)) AS avgMinutes
FROM LOCHIST loc
CROSS APPLY(
SELECT TOP 1 loc2.LOC_DATE
FROM LOCHIST loc2
WHERE loc2.RES_ID = loc.RES_ID
AND loc2.LOC_DATE > loc.LOC_DATE
AND loc2.LOC_ZONE <> loc.LOC_ZONE
ORDER BY loc2.LOC_DATE ASC
) AS nextPing
GROUP BY loc.LOC_ZONE
ORDER BY avgMinutes DESC
My variation of the solution:
select LOC_ZONE, avg(TOTAL_TIME) AVG_TIME from (
select RES_ID, LOC_ZONE, sum(TIME_SPENT) TOTAL_TIME
from (
select RES_ID, LOC_ZONE, datediff(mi, lag(LOC_DATE, 1) over (
partition by RES_ID order by LOC_DATE), LOC_DATE) TIME_SPENT
from LOCHIST
) t
where TIME_SPENT is not null
group by RES_ID, LOC_ZONE) f
group by LOC_ZONE
This accounts for multiple stays at the same location. The choice between LAG and LEAD depends on whether a stay should start or end with the ping (i.e., if one trailer sends a ping from A and then x hours later from B, does that interval count for A or B?).
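For comparison, a minimal sketch of the LEAD variant, which attributes each interval to the zone the ping was sent from rather than the one it arrived at:
select LOC_ZONE, avg(TOTAL_TIME) AVG_TIME from (
select RES_ID, LOC_ZONE, sum(TIME_SPENT) TOTAL_TIME
from (
select RES_ID, LOC_ZONE, datediff(mi, LOC_DATE, lead(LOC_DATE, 1) over (
partition by RES_ID order by LOC_DATE)) TIME_SPENT
from LOCHIST
) t
where TIME_SPENT is not null
group by RES_ID, LOC_ZONE) f
group by LOC_ZONE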
To do this without using either a cursor or a correlated subquery, try:
with rl as
(select l.*, rank() over (partition by res_id order by loc_date) rn
from lochist l),
fdr as
(select rc.*, coalesce(rn.loc_date, getdate()) next_date
from rl rc
left join rl rn on rc.res_id = rn.res_id and rc.rn + 1 = rn.rn)
select loc_zone, avg(datediff(second, loc_date, next_date))/3600 avg_time
from fdr
group by loc_zone
(Because of the way that SQL Server calculates time differences, it's probably better to calculate the average time in seconds and then divide by 60*60. With the exception of the getdate() and datediff clauses - which can be replaced by sysdate and next_date - loc_date - this should work in both SQL Server 2005 onwards and Oracle 10g onwards.)

Doing a comparison using the previous row?

I'm trying to work out an efficient way of comparing two rows in SQL Server 2008. I need to write a query which finds all rows in the Movement table which have Speed < 10 N consecutive times.
The structure of the table is:
EventTime
Speed
If the data were:
2012-02-05 13:56:36.980, 2
2012-02-05 13:57:36.980, 11
2012-02-05 13:57:46.980, 2
2012-02-05 13:59:36.980, 2
2012-02-05 14:06:36.980, 22
2012-02-05 15:56:36.980, 2
Then it would return rows 3/4 (13:57:46.980 / 13:59:36.980) if I looked for 2 consecutive rows, and would return nothing if I looked for three consecutive rows. The order of the data is EventTime/DateTime only.
Any help you could give me would be great. I'm considering using cursors but they're usually pretty inefficient. Also, this table is approximately 10m rows in size, so the more efficient the better! :)
Thanks!
DECLARE
@n INT,
@speed_limit INT
SELECT
@n = 5,
@speed_limit = 10
;WITH
partitioned AS
(
SELECT
*,
CASE WHEN speed < @speed_limit THEN 1 ELSE 0 END AS PartitionID
FROM
Movement
)
,
sequenced AS
(
SELECT
ROW_NUMBER() OVER ( ORDER BY EventTime) AS MasterSeqID,
ROW_NUMBER() OVER (PARTITION BY PartitionID ORDER BY EventTime) AS PartIDSeqID,
*
FROM
partitioned
)
,
filter AS
(
SELECT
MasterSeqID - PartIDSeqID AS GroupID,
MIN(MasterSeqID) AS GroupFirstMastSeqID,
MAX(MasterSeqID) AS GroupFinalMastSeqID
FROM
sequenced
WHERE
PartitionID = 1
GROUP BY
MasterSeqID - PartIDSeqID
HAVING
COUNT(*) >= @n
)
SELECT
sequenced.*
FROM
filter
INNER JOIN
sequenced
ON sequenced.MasterSeqID >= filter.GroupFirstMastSeqID
AND sequenced.MasterSeqID <= filter.GroupFinalMastSeqID
Alternative final steps (inspired by @t-clausen-dk), to avoid an additional JOIN. I would test both to see which is more performant.
,
filter AS
(
SELECT
MasterSeqID - PartIDSeqID AS GroupID,
COUNT(*) OVER (PARTITION BY MasterSeqID - PartIDSeqID) AS GroupSize,
*
FROM
sequenced
WHERE
PartitionID = 1
)
SELECT
*
FROM
filter
WHERE
GroupSize >= @n
declare @t table(EventTime datetime, Speed int)
insert @t values('2012-02-05 13:56:36.980', 2)
insert @t values('2012-02-05 13:57:36.980', 11)
insert @t values('2012-02-05 13:57:46.980', 2)
insert @t values('2012-02-05 13:59:36.980', 2)
insert @t values('2012-02-05 14:06:36.980', 22)
insert @t values('2012-02-05 15:56:36.980', 2)
declare @N int = 1
;with a as
(
select EventTime, Speed, row_number() over (order by EventTime) rn from @t
), b as
(
select EventTime, Speed, 1 grp, rn from a where rn = 1
union all
select a.EventTime, a.Speed, case when a.speed < 10 and b.speed < 10 then grp else grp + 1 end, a.rn
from a join b on a.rn = b.rn+1
), c as
(
select EventTime, Speed, count(*) over (partition by grp) cnt from b
)
select * from c
where cnt > @N
OPTION (MAXRECURSION 0) -- Thx Dems
Almost the same idea as Dems, a little bit different:
select * from (
select eventtime, speed, rnk, new_rnk,
rnk - new_rnk as grp_diff,
max(rnk) over (partition by speed, new_rnk-rnk) -
min(rnk) over (partition by speed, new_rnk-rnk) + 1 as no_consec
from (
select eventtime, rnk, speed,
row_number() over (partition by speed order by eventtime) as new_rnk
from (
select eventtime, speed,
row_number() over (order by eventtime) as rnk
from a
) a
where a.speed < 5
) b
) c
where no_consec >= 2
order by eventtime;
Here 5 is the speed limit and 2 is the minimum number of consecutive events.
I put the date as a number for simplicity when writing the setup script.
EDIT:
To answer the comments, I've added three columns in the first inner query. To get only the first row of each group, add pos_in_group = 1 to the WHERE clause; the distance is then at your fingertips.
select eventtime, speed, min_date, max_date, pos_in_group
from (
select eventtime, speed, rnk, new_rnk,
rnk - new_rnk as grp_diff,
row_number() over (partition by speed, new_rnk-rnk order by eventtime) pos_in_group,
min(eventtime) over (partition by speed, new_rnk-rnk) min_date,
max(eventtime) over (partition by speed, new_rnk-rnk) max_date,
max(rnk) over (partition by speed, new_rnk-rnk) -
min(rnk) over (partition by speed, new_rnk-rnk) + 1 as no_consec
from (
select eventtime, rnk, speed,
row_number() over (partition by speed order by eventtime) as new_rnk
from (
select eventtime, speed,
row_number() over (order by eventtime) as rnk
from a
) a
where a.speed < 5
) b
) c
where no_consec > 1
order by eventtime;