Data Aggregation in SQL Server 2005 - sql

I need a query for SQl server 2005 (SQL server management studio express).
I have data stored as 1 minute time frame (1 minute each row), for each table columns are ID, Symbol, DateTime, Open, High, Low, Close, Volume.
I need to convert (compress) to every possibile multiple time frame, so let's say 10 minutes, 13, 15, and so on.
Provide full details if somebody could help.
Thanks
Alberto

Alberto, it looks like you need a "Group By" clause in SQL statements (as Leppie stated). So, you should better look for it.
First you should filter the rows that is subject for aggregation by using begin and end date/time and then group them by the mentioned clause.
Here is the first link when i search "sql group by" keywords via Google.

;WITH cte AS
(SELECT *,
(32 * CAST([DATETIME] AS INT)) + DATEPART(HOUR,[DATETIME]) + (DATEPART(MINUTE,[DATETIME])/15)/4.0 AS Seg
FROM prices
)
,cte1 AS
(
SELECT *,
ROW_NUMBER() OVER (PARTITION BY Symbol,Seg ORDER BY [DATETIME]) AS RN_ASC ,
ROW_NUMBER() OVER (PARTITION BY Symbol,Seg ORDER BY [DATETIME] DESC) AS RN_DESC
FROM cte
)
SELECT
Symbol,
Seg,
MAX(CASE WHEN RN_ASC=1 THEN [DATETIME] END) AS OpenDateTime,
MAX(CASE WHEN RN_ASC=1 THEN [OPEN] END) AS [OPEN],
MAX(High) High,
MIN(Low) Low,
SUM(Volume) Volume,
MAX(CASE WHEN RN_DESC=1 THEN [CLOSE] END) AS [CLOSE],
MAX(CASE WHEN RN_DESC=1 THEN [DATETIME] END) AS CloseDateTime
FROM cte1
GROUP BY Symbol,Seg
ORDER BY OpenDateTime
Or another approach that may be worth testing to see if it is any faster.
DECLARE #D1 DATETIME
DECLARE #D2 DATETIME
DECLARE #Interval FLOAT
SET #D1 = '2010-10-18 09:00:00.000'
SET #D2 = '2010-10-19 18:00:00.000'
SET #Interval = 15
;WITH
L0 AS (SELECT 1 AS c UNION ALL SELECT 1),
L1 AS (SELECT 1 AS c FROM L0 A CROSS JOIN L0 B),
L2 AS (SELECT 1 AS c FROM L1 A CROSS JOIN L1 B),
L3 AS (SELECT 1 AS c FROM L2 A CROSS JOIN L2 B),
L4 AS (SELECT 1 AS c FROM L3 A CROSS JOIN L3 B),
Nums AS (SELECT ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS i FROM L4),
Ranges AS(
SELECT
DATEADD(MINUTE,#Interval*(i-1),#D1) AS StartRange,
DATEADD(MINUTE,#Interval*i,#D1) AS NextRange
FROM Nums where i <= 1+CEILING(DATEDIFF(MINUTE,#D1,#D2)/#Interval))
,cte AS (
SELECT
*
,ROW_NUMBER() OVER (PARTITION BY Symbol,r.StartRange ORDER BY [DateTime]) AS RN_ASC
,ROW_NUMBER() OVER (PARTITION BY Symbol,r.StartRange ORDER BY [DateTime] DESC) AS RN_DESC
FROM Ranges r
JOIN prices p ON p.[DateTime] >= r.StartRange and p.[DateTime] < r.NextRange )
SELECT
Symbol,
MAX(CASE WHEN RN_ASC=1 THEN [DateTime] END) AS OpenDateTime,
MAX(CASE WHEN RN_ASC=1 THEN [Open] END) AS [Open],
MAX(High) High,
MIN(Low) Low,
SUM(Volume) Volume,
MAX(CASE WHEN RN_DESC=1 THEN [Close] END) AS [Close],
MAX(CASE WHEN RN_DESC=1 THEN [DateTime] END) AS CloseDateTime
FROM cte
GROUP BY Symbol,StartRange
ORDER BY OpenDateTime

Not simple "Group By" - Open and Close values need taken for first and correspondingly last row in group. Or at least so is it for Forex data :)

Would be prettier with a stored proc to extract MIN(datetime) first, but here's a sketch:
WITH quarters(q) AS (
SELECT DISTINCT
15*CAST(DATEDIFF("n",'2000/01/01',dataora) / 15 as Int) AS primo
FROM
Prezzi
)
SELECT
simbolo, DATEADD("n",q,'2000/01/01') AS tick,
MIN(minimo) AS minimo, MAX(massimo) AS massimo,
(SELECT
TOP 1 apertura FROM Prezzi P
WHERE
P.simbolo = simbolo AND
P.dataora >= DATEADD("n",q,'2000/01/01')
ORDER BY
P.dataora ASC
) as primaapertura,
(SELECT
TOP 1 chiusura FROM Prezzi P
WHERE
P.simbolo = simbolo AND
P.dataora < DATEADD("s",14*60+59,DATEADD("n",q,'2000/01/01'))
ORDER BY
P.dataora DESC
) as ultimachiusara,
SUM(volume) / COUNT(*) AS volumemedio
FROM
quarters INNER JOIN Prezzi
ON dataora BETWEEN DATEADD("n",q,'2000/01/01')
AND DATEADD("s",14*60+59,DATEADD("n",q,'2000/01/01'))
GROUP BY
simbolo, DATEADD("n",q,'2000/01/01')
ORDER BY
1, 2
The WITH clause gets a list of 15 minute intervals, rounded down, in your dataset (let's assume nothing before 2000).
Then use those intervals to group by 14:59 interval.
For the volume, you'll have to decide if you want average or the total.
The syntax might be a tad off, but you should get the idea.
EDIT: Adjusted MIN(open), MIN(close) to pick up FIRST and LAST. In reality this won't change much, as the concept of Open and Close depend on knowing the time difference between the exchange where the quote originated and the clock of the computer collecting the data.
In addition, unless the OP has the privilege of a real-time feed from all the exchanges, all the quotes are delayed by 20 minutes anyway.
EDIT(2): Quite right, FIRST and LAST are carry-overs from my IBM days >;-)
Solution now selects first and last quotes during the interval using TOP with ASC/DESC.

Declare #tbl1MinENI Table
(ID int identity,
Simbolo char(3),
DataOra datetime,
Apertura numeric(15,4),
Massimo numeric(15,4),
Minimo numeric(15,4),
Chiusura numeric(15,4),
Volume int)
Insert Into #tbl1MinENI ( Simbolo, DataOra, Apertura, Massimo, Minimo, Chiusura, Volume)
Values
('ENI', '2010/10/18 09:00:00', 16.1100, 16.1800, 16.1100, 16.1400, 244015),
('ENI', '2010/10/18 09:01:00', 16.1400, 16.1400, 16.1300, 16.1400, 15692 ),
('ENI', '2010/10/18 09:02:00', 16.1400, 16.1500, 16.1400, 16.1500, 147035),
('ENI', '2010/10/18 09:03:00', 16.1500, 16.1600, 16.1500, 16.1600, 5181 ),
('ENI', '2010/10/18 09:04:00', 16.1600, 16.2000, 16.1600, 16.1900, 5134 ),
('ENI', '2010/10/18 09:05:00', 16.1900, 16.1900, 16.1800, 16.1800, 15040 ),
('ENI', '2010/10/18 09:06:00', 16.1900, 16.1900, 16.1600, 16.1600, 68867 ),
('ENI', '2010/10/18 09:07:00', 16.1600, 16.1600, 16.1600, 16.1600, 7606 ),
('ENI', '2010/10/18 09:08:00', 16.1500, 16.1500, 16.1500, 16.1500, 725 ),
('ENI', '2010/10/18 09:09:00', 16.1600, 16.1600, 16.1600, 16.1600, 81 ),
('ENI', '2010/10/18 09:10:00', 16.1700, 16.1800, 16.1700, 16.1700, 68594 ),
('ENI', '2010/10/18 09:11:00', 16.1800, 16.1800, 16.1800, 16.1800, 6619 )
Declare #nRowsPerGroup int = 3
;With Prepare as
(
Select datediff(minute, '2010/10/18 09:00:00', DataOra)/#nRowsPerGroup as Grp,
Row_Number() over (partition by datediff(minute, '2010/10/18 09:00:00', DataOra)/#nRowsPerGroup order by dataora) as rn,
*
From tbl1MinENI
), b as
(
Select a.Grp,
Min(a.DataOra) as GroupDataOra,
Min(ID) AperturaID,
max(a.Massimo) as Massimo,
Min(a.Minimo) as Minimo,
max(id) ChiusuraID,
sum(a.Volume) as Volume
From Prepare a
Group by Grp
)
Select b.grp,
b.GroupDataOra,
ta.Apertura,
b.Massimo,
b.Minimo,
tc.Chiusura,
b.Volume
From b
Inner Join tbl1MinENI ta on ta.ID=b.AperturaID
Inner Join tbl1MinENI tc on tc.ID=b.ChiusuraID
;

Related

SQL - Return count of consecutive days where value was unchanged

I have a table like
date
ticker
Action
'2022-03-01'
AAPL
BUY
'2022-03-02'
AAPL
SELL.
'2022-03-03'
AAPL
BUY.
'2022-03-01'
CMG
SELL.
'2022-03-02'
CMG
HOLD.
'2022-03-03'
CMG
HOLD.
'2022-03-01'
GPS
SELL.
'2022-03-02'
GPS
SELL.
'2022-03-03'
GPS
SELL.
I want to do a group by ticker then count all the times that Actions have sequentially been the value that they are as of the last date, here it's 2022-03-03. ie for this example table it'd be like;
ticker
NumSequentialDaysAction
AAPL
0
CMG
1
GPS
2
Fine to pass in 2022-03-03 as a value, don't need to figure that out on the fly.
Tried something like this
---Table Creation---
CREATE TABLE UserTable
([Date] DATETIME2, [Ticker] varchar(5), [Action] varchar(5))
;
INSERT INTO UserTable
([Date], [Ticker], [Action])
VALUES
('2022-03-01' , 'AAPL' , 'BUY'),
('2022-03-02' , 'AAPL' , 'SELL'),
('2022-03-03' , 'AAPL' , 'BUY'),
('2022-03-01' , 'CMG' , 'SELL'),
('2022-03-02' , 'CMG' , 'HOLD'),
('2022-03-03' , 'CMG' , 'HOLD'),
('2022-03-01' , 'GPS' , 'SELL'),
('2022-03-02' , 'GPS' , 'SELL'),
('2022-03-03' , 'GPS' , 'SELL')
;
---Attempted Solution---
I'm thinking that I need to do a sub query to get the last value and join on itself to get the matching values. Then apply a window function, ordered by date to see that the proceeding value is sequential.
WITH CTE AS (SELECT Date, Ticker, Action,
ROW_NUMBER() OVER (PARTITION BY Ticker, Action ORDER BY Date) as row_num
FROM UserTable)
SELECT Ticker, COUNT(DISTINCT Date) as count_of_days
FROM CTE
WHERE row_num = 1
GROUP BY Ticker;
WITH CTE AS (SELECT Date, Ticker, Action,
DENSE_RANK() OVER (PARTITION BY Ticker ORDER BY Action,Date) as rank
FROM table)
SELECT Ticker, COUNT(DISTINCT Date) as count_of_days
FROM CTE
WHERE rank = 1
GROUP BY Ticker;
You can do this with the help of the LEAD function like so. You didn't specify which RDBMS you're using. This solution works in PostgreSQL:
WITH "withSequential" AS (
SELECT
ticker,
(LEAD("Action") OVER (PARTITION BY ticker ORDER BY date ASC) = "Action") AS "nextDayIsSameAction"
FROM UserTable
)
SELECT
ticker,
SUM(
CASE
WHEN "nextDayIsSameAction" IS TRUE THEN 1
ELSE 0
END
) AS "NumSequentialDaysAction"
FROM "withSequential"
GROUP BY ticker
Here is a way to do this using gaps and islands solution.
Thanks for sharing the create and insert scripts, which helps to build the solution quickly.
dbfiddle link.
https://dbfiddle.uk/rZLDTrNR
with data
as (
select date
,ticker
,action
,case when lag(action) over(partition by ticker order by date) <> action then
1
else 0
end as marker
from usertable
)
,interim_data
as (
select *
,sum(marker) over(partition by ticker order by date) as grp_val
from data
)
,interim_data2
as (
select *
,count(*) over(partition by ticker,grp_val) as NumSequentialDaysAction
from interim_data
)
select ticker,NumSequentialDaysAction
from interim_data2
where date='2022-03-03'
Another option, you could use the difference between two row_numbers approach as the following:
select [Ticker], count(*)-1 NumSequentialDaysAction -- you could use (distinct) to remove duplicate rows
from
(
select *,
row_number() over (partition by [Ticker] order by [Date]) -
row_number() over (partition by [Ticker], [Action] order by [Date]) grp
from UserTable
where [date] <= '2022-03-03'
) RN_Groups
/* get only rows where [Action] = last date [Action] */
where [Action] = (select top 1 [Action] from UserTable T
where T.[Ticker] = RN_Groups.[Ticker] and [date] <= '2022-03-03'
order by [Date] desc)
group by [Ticker], [Action], grp
See demo

SQL - Combine two rows if difference is below threshhold

I have a table like this in SQL Server:
id start_time end_time
1 10:00:00 10:34:00
2 10:38:00 10:52:00
3 10:53:00 11:23:00
4 11:24:00 11:56:00
5 14:20:00 14:40:00
6 14:41:00 14:59:00
7 15:30:00 15:40:00
What I would like to have is a query that outputs consolidated records based on the time difference between two consecutive records (end_time of row n and start_time row n+1) . All records where the time difference is less than 2 minutes should be combined into one time entry and the ID of the first record should be kept. This should also combine more than two records if multiple consecutive records have a time difference less than 2 minutes.
This would be the expected output:
id start_time end_time
1 10:00:00 10:34:00
2 10:38:00 11:56:00
5 14:20:00 14:59:00
7 15:30:00 15:40:00
Thanks in advance for any tips how to build the query.
Edit:
I started with following code to calculate the lead_time and the time difference but do not know how to group and consolidate.
WITH rows AS
(
SELECT *, ROW_NUMBER() OVER (ORDER BY Id) AS rn
FROM #temp
)
SELECT mc.id, mc.start_time, mc.end_time, mp.start_time lead_time, DATEDIFF(MINUTE, mc.[end_time], mp.[start_time]) as DiffToNewSession
FROM rows mc
LEFT JOIN rows mp
ON mc.rn = mp.rn - 1
The window function in t-sql can realize a lot of data statistics, such as
create table #temp(id int identity(1,1), start_time time, end_time time)
insert into #temp(start_time, end_time)
values ('10:00:00', '10:34:00')
, ('10:38:00', '10:52:00')
, ('10:53:00', '11:23:00')
, ('11:24:00', '11:56:00')
, ('14:20:00', '14:40:00')
, ('14:41:00', '14:59:00')
, ('15:30:00', '15:40:00')
;with c0 as(
select *, LAG(end_time,1,'00:00:00') over (order by id) as lag_time
from #temp
), c1 as(
select *, case when DATEDIFF(MI, lag_time, start_time) <= 2 then 1 else -0 end as gflag
from c0
), c2 as(
select *, SUM(case when gflag=0 then 1 else 0 end) over(order by id) as gid
from c1
)
select MIN(id) as id, MIN(start_time) as start_time, MAX(end_time) as end_time
from c2
group by gid
In order to better describe the process of data construction, I simply use c0, c1, c2... to represent levels, you can merge some levels and optimize.
If you can’t use id as a sorting condition, then you need to change the sorting part in the above statement.
You can use a recursive cte to get the result that you want. This method just simple compare current end_time with next start_time. If it is less than the 2 mintues threshold use the same start_time as grp_start. And the end, simple do a GROUP BY on the grp_start
with rcte as
(
-- anchor member
select *, grp_start = start_time
from tbl
where id = 1
union all
-- recursive member
select t.id, t.start_time, t.end_time,
grp_start = case when datediff(second, r.end_time, t.start_time) <= 120
then r.grp_start
else t.start_time
end
from tbl t
inner join rcte r on t.id = r.id + 1
)
select id = min(id), grp_start as start_time, max(end_time) as end_time
from rcte
group by grp_start
demo
I guess this should do the trick without recursion. Again I used several ctes in order to make the solution a bit easier to read. guess it can be reduced a little...
INSERT INTO T1 VALUES
(1,'10:00:00','10:34:00')
,(2,'10:38:00','10:52:00')
,(3,'10:53:00','11:23:00')
,(4,'11:24:00','11:56:00')
,(5,'14:20:00','14:40:00')
,(6,'14:41:00','14:59:00')
,(7,'15:30:00','15:40:00')
GO
WITH cte AS(
SELECT *
,ROW_NUMBER() OVER (ORDER BY id) AS rn
,DATEDIFF(MINUTE, ISNULL(LAG(endtime) OVER (ORDER BY id), starttime), starttime) AS diffMin
,COUNT(*) OVER (PARTITION BY (SELECT 1)) as maxRn
FROM T1
),
cteFirst AS(
SELECT *
FROM cte
WHERE rn = 1 OR diffMin > 2
),
cteGrp AS(
SELECT *
,ISNULL(LEAD(rn) OVER (ORDER BY id), maxRn+1) AS nextRn
FROM cteFirst
)
SELECT f.id, f.starttime, MAX(ISNULL(n.endtime, f.endtime)) AS endtime
FROM cteGrp f
LEFT JOIN cte n ON n.rn >= f.rn AND n.rn < f.nextRn
GROUP BY f.id, f.starttime

How can I find DATEDIFF for records in the same field?

I have a reporting table that looks like this - BEFORE:
The FREQ_CALC is the number of months between EFFECTIVEDATE and EXPIRY_DATE, divided by the noMonths field, FREQ_CODE, after the M.
I need to get everything into this shape - AFTER.
I am trying to figure out how to calculate the 'FREQUENCY' as well as the fields in blue, green, and pink (pink is very easy). Basically, 'FREQ_CODE' has an 'M' character and after that I have months and days in a month. If noMonths is 3, I need to start mxDays with 90, and then find the difference in the number of days from the maturityDate field, so it's not the DATEDIFF() between two fields, but the DATEDIFF between increasing dates in the same field, grouped by Credit_Line_NO. So, the three cells in yellow start mxDays. Also, mxFactor is 1 when mxDays is 30 or 90, and it is 365/360, when mxDays is 365. Finally, the Calc is the mxDays * Amount. This is super-easy. I just can't figure out how to get the mxDays and mxFactor setup.
For additional clarity, 91 days = 6/30/2018 - 3/31/2018 and 92 days = 9/30/2018 - 6/30/2018. Also, 1.0111 = 91/90 and 1.0222 = 92/90. Similarly, 0.8111 = 73/90. Finally, 1.0139 = 365/360 because noMonths = 12.
Maybe this requires a CTE and a couple Case...When...Then statements. Not sure...
I am using SQL Server 2008.
-- Here is my DDL
-- Drop table Reporting_Table
CREATE TABLE Reporting_Table (
Credit_Line_NO Varchar(10),
noMonths INT,
EFFECTIVEDATE Date,
EXPIRY_DATE Date,
Amount Money,
mxDays INT,
mxFactor decimal(5,4),
Calc Money)
INSERT INTO Reporting_Table (Credit_Line_NO, noMonths, EFFECTIVEDATE, EXPIRY_DATE, Amount, mxDays, mxFactor, Calc)
Values('9938810','3','3/31/2018','6/12/2020','11718.75','90','1','11718.75')
INSERT INTO Reporting_Table (Credit_Line_NO, noMonths, EFFECTIVEDATE, EXPIRY_DATE, Amount, mxDays, mxFactor, Calc)
Values('2235461','1','6/30/2018','6/6/2019','12345','30','1','12345')
INSERT INTO Reporting_Table (Credit_Line_NO, noMonths, EFFECTIVEDATE, EXPIRY_DATE, Amount, mxDays, mxFactor, Calc)
Values('3365434','12','6/30/2018','6/30/2019','298523.36085','365','1.01388888888889','302669.518639583')
For SQL 2008 you need to order your table with row_number and join each row with previous one. Then make calculations
with cte as (
select
*, rn = row_number() over (partition by Credit_Line_NO order by maturityDate)
from
Reporting_Table
)
select
a.*, mxDay = isnull(q.dayDiff, q.mDay), z.mxFactor
, Calc = z.mxFactor * a.Amount
from
cte a
left join cte b on a.Credit_Line_NO = b.Credit_Line_NO and a.rn - 1 = b.rn
cross apply (select
mDay = case
when a.noMonths = 1 then 30
when a.noMonths = 3 then 90
when a.noMonths = 12 then 365
end, dayDiff = datediff(dd, b.maturityDate, a.maturityDate)) q
cross apply (select mxFactor = cast(1.0 * isnull(q.dayDiff, q.mDay) / q.mDay as decimal(10,4))) z
Edit:
This is update query:
with cte as (
select
*, rn = row_number() over (partition by Credit_Line_NO order by maturityDate)
from
Reporting_Table
)
, cte2 as (
select
a.Credit_Line_NO, a.noMonths, a.maturityDate, a.Amount, mxDay = isnull(q.dayDiff, q.mDay), z.mxFactor
, Calc = z.mxFactor * a.Amount
from
cte a
left join cte b on a.Credit_Line_NO = b.Credit_Line_NO and a.rn - 1 = b.rn
cross apply (select
mDay = case
when a.noMonths = 1 then 30
when a.noMonths = 3 then 90
when a.noMonths = 12 then 365
end, dayDiff = datediff(dd, b.maturityDate, a.maturityDate)) q
cross apply (select mxFactor = cast(1.0 * isnull(q.dayDiff, q.mDay) / q.mDay as decimal(10,4))) z
)
update r
set r.mxDay = c.mxDay, r.mxFactor = c.mxFactor, r.Calc = c.Calc
from
Reporting_Table r
join cte2 c on r.Credit_Line_NO = c.Credit_Line_NO and r.noMonths = c.noMonths and r.maturityDate = c.maturityDate

Speeding up SQL Server cross apply to get aggregated data

In SQL Server, I am trying to put together a single query which grabs a row and includes the aggregated data from a two hour window before that row as well as aggregated data from one hour window after. How can I make this run faster?
The rows have time stamps to a millisecond precision, and are not evenly spaced. I have over 50 million rows in this table, and the query does not seem to be completing. There are indexes in many places, but they don't seem to help. I was also thinking about using a window function, but I am not sure that its possible to have a sliding window with unevenly distributed rows. Also, for the future one hour window, I am not sure how that would be done with a SQL window.
Box is a string and has 10 unique values.
Process is a string and has 30 unique values.
The average duration_ms is 200 ms.
Errors account for less than 0.1% of the data.
The 50 million rows describes a years worth of data.
select
c1.start_time,
c1.end_time,
c1.box,
c1.process,
datediff(ms,c1.start_time,c1.end_time) as duration_ms,
datepart(dw,c1.start_time) as day_of_week,
datepart(hour,c1.start_time) as hour_of_day,
c3.*,
c5.*
from metrics_table c1
cross apply
(select
avg(cast(datediff(ms,c2.start_time,c2.end_time) as numeric)) as avg_ms,
count(1) as num_process_total,
count(distinct process) as num_process_unique,
count(distinct box) as num_box_unique
from metrics_table c2
where datediff(minute,c2.start_time,c1.start_time) <= 120
and c1.start_time> c2.start_time
and c2.error_code = 0
) c3
cross apply
(select
avg(case when datediff(ms,c4.start_time,c4.end_time)>1000 then 1.0 else 0.0 end) as percent_over_thresh
from metrics_table c4
where datediff(hour,c1.start_time,c4.start_time) <= 1
and c4.start_time> c1.start_time
and c4.error_code= 0
) c5
where
c1.error_code= 0
Edit
Version: SQL Azure 12.0
Adding execution plan:
The following should be a step in the right direction...
Note: c2.start_time & c4.start_time are no longer wrappen in DATEDIFF functions making them SARGable...
SELECT
c1.start_time,
c1.end_time,
c1.box,
c1.process,
DATEDIFF(ms, c1.start_time, c1.end_time) AS duration_ms,
DATEPART(dw, c1.start_time) AS day_of_week,
DATEPART(HOUR, c1.start_time) AS hour_of_day,
--c3.*,
avg_ms = CASE WHEN
c5.*
FROM
dbo.metrics_table c1
CROSS APPLY (
SELECT
AVG(CAST(DATEDIFF(ms, c2.start_time, c2.end_time) AS NUMERIC)) AS avg_ms,
COUNT(1) AS num_process_total,
COUNT(DISTINCT process) AS num_process_unique,
COUNT(DISTINCT box) AS num_box_unique
FROM
dbo.metrics_table c2
WHERE
--DATEDIFF(minute,c2.start_time,c1.start_time) <= 120
c2.start_time <= DATEADD(MINUTE, -120, c1.start_time)
--and c1.start_time> c2.start_time
AND c2.error_code = 0
) c3
CROSS APPLY (
SELECT
AVG(CASE WHEN DATEDIFF(ms, c4.start_time, c4.end_time) > 1000 THEN 1.0 ELSE 0.0 END
) AS percent_over_thresh
FROM
dbo.metrics_table c4
WHERE
--DATEDIFF(HOUR, c1.start_time, c4.start_time) <= 1
c4.start_time >= DATEADD(HOUR, 1, c1.start_time)
--and c4.start_time> c1.start_time
AND c4.error_code = 0
) c5
WHERE
c1.error_code = 0;
Of course, making a query SARGable doesn't do any good unless there's an appropriate index available. The following should be good for all 3 metrics_table references... (see what indexes are currently available, there's a chance that you may not need to create a new index)
CREATE NONCLUSTERED INDEX ixf_metricstable_errorcode_starttime ON dbo.metrics_table (
error_code,
start_time
)
INCLUDE (
end_time,
box,
process
)
WHERE
error_code = 0;
I used Between and got good performance in my simple test rig. I've also used columnstore as 50 million records is DW volumes:
CREATE TABLE dbo.metrics_table (
rowId INT IDENTITY,
start_time DATETIME NOT NULL,
end_time DATETIME NOT NULL,
box VARCHAR(10) NOT NULL,
process VARCHAR(10) NOT NULL,
error_code INT NOT NULL
);
-- Add records
;WITH cte AS (
SELECT TOP 3334 ROW_NUMBER() OVER ( ORDER BY ( SELECT 1 ) ) rn
FROM sys.columns c1
CROSS JOIN sys.columns c2
CROSS JOIN sys.columns c3
)
INSERT INTO dbo.metrics_table ( start_time, end_time, box, process, error_code )
SELECT
DATEADD( ms, rn, DATEADD( day, rn % 365, '1 Jan 2017' ) ) AS start_time,
DATEADD( ms, rn % 409, DATEADD( ms, rn, DATEADD( day, rn % 365, '1 Jan 2017' ) ) ) AS end_time,
'box' + CAST( boxes.box AS VARCHAR(10) ) box,
'process' + CAST( boxes.box AS VARCHAR(10) ) process,
ABS( CAST( rn % 3000 AS BIT ) -1 ) error_code
FROM cte c
CROSS JOIN ( SELECT TOP 10 rn FROM cte ) AS boxes(box)
CROSS JOIN ( SELECT TOP 30 rn FROM cte ) AS processes(process);
-- Create normal clustered index to order the data
CREATE CLUSTERED INDEX cci_metrics_table ON dbo.metrics_table ( start_time, end_time, box, process );
--CREATE CLUSTERED INDEX cci_metrics_table ON dbo.metrics_table ( box, process, start_time, end_time );
-- Convert to columnstore
CREATE CLUSTERED COLUMNSTORE INDEX cci_metrics_table ON dbo.metrics_table WITH ( MAXDOP = 1, DROP_EXISTING = ON );
IF OBJECT_ID('tempdb..#tmp1' ) IS NOT NULL DROP TABLE #tmp1
-- two hour window before, 1 hour window after
SELECT
c1.start_time,
c1.end_time,
c1.box,
c1.process,
DATEDIFF( ms, c1.start_time, c1.end_time ) AS duration_ms,
DATEPART( dw, c1.start_time ) AS day_of_week,
DATEPART( hour, c1.start_time ) AS hour_of_day,
c2.xavg,
c2.num_process_total,
c2.num_process_unique,
c2.num_box_unique,
c3.percent_over_thresh
INTO #tmp1
FROM dbo.metrics_table c1
CROSS APPLY
(
SELECT
COUNT(1) AS num_process_total,
AVG( CAST( DATEDIFF( ms, start_time, end_time ) AS NUMERIC ) ) xavg,
COUNT( DISTINCT process ) num_process_unique,
COUNT( DISTINCT box ) num_box_unique
FROM dbo.metrics_table c2
WHERE c2.error_code = 0
AND c2.start_time Between DATEADD( minute, -120, c1.start_time ) And c1.start_time
AND c1.start_time > c2.start_time
) c2
CROSS APPLY
(
SELECT
AVG( CASE WHEN DATEDIFF( ms, c4.start_time, c4.end_time ) > 1000 THEN 1.0 ELSE 0.0 END ) percent_over_thresh
FROM dbo.metrics_table c4
WHERE c4.error_code = 0
AND c4.start_time Between c1.start_time And DATEADD( minute, 60, c1.start_time )
AND c4.start_time > c1.start_time
) c3
WHERE error_code = 0

Doing a comparison using the previous row?

I'm trying to work out an efficient way of comparing two rows in SQL Server 2008. I need to write a query which finds all rows in the Movement table which have Speed < 10 N consecutive times.
The structure of the table is:
EventTime
Speed
If the data were:
2012-02-05 13:56:36.980, 2
2012-02-05 13:57:36.980, 11
2012-02-05 13:57:46.980, 2
2012-02-05 13:59:36.980, 2
2012-02-05 14:06:36.980, 22
2012-02-05 15:56:36.980, 2
Then it would return rows 3/4 (13:57:46.980 / 13:59:36.980) if I looked for 2 consecutive rows, and would return nothing if I looked for three consecutive rows. The order of the data is EventTime/DateTime only.
Any help you could give me would be great. I'm considering using cursors but they're usually pretty inefficient. Also, this table is approximately 10m rows in size, so the more efficient the better! :)
Thanks!
DECLARE
#n INT,
#speed_limit INT
SELECT
#n = 5,
#speed_limit = 10
;WITH
partitioned AS
(
SELECT
*,
CASE WHEN speed < #speed_limit THEN 1 ELSE 0 END AS PartitionID
FROM
Movement
)
,
sequenced AS
(
SELECT
ROW_NUMBER() OVER ( ORDER BY EventTime) AS MasterSeqID,
ROW_NUMBER() OVER (PARTITION BY PartitionID ORDER BY EventTime) AS PartIDSeqID,
*
FROM
partitioned
)
,
filter AS
(
SELECT
MasterSeqID - PartIDSeqID AS GroupID,
MIN(MasterSeqID) AS GroupFirstMastSeqID,
MAX(MasterSeqID) AS GroupFinalMastSeqID
FROM
sequenced
WHERE
PartitionID = 1
GROUP BY
MasterSeqID - PartIDSeqID
HAVING
COUNT(*) >= #n
)
SELECT
sequenced.*
FROM
filter
INNER JOIN
sequenced
ON sequenced.MasterSeqID >= filter.GroupFirstMastSeqID
AND sequenced.MasterSeqID <= filter.GroupFinalMastSeqID
Alternative final steps (inspired by #t-clausen-dk), to avoid an additional JOIN. I would test both to see which is more performant.
,
filter AS
(
SELECT
MasterSeqID - PartIDSeqID AS GroupID,
COUNT(*) OVER (PARTITION BY MasterSeqID - PartIDSeqID) AS GroupSize,
*
FROM
sequenced
WHERE
PartitionID = 1
)
SELECT
*
FROM
filter
WHERE
GroupSize >= #n
declare #t table(EventTime datetime, Speed int)
insert #t values('2012-02-05 13:56:36.980', 2)
insert #t values('2012-02-05 13:57:36.980', 11)
insert #t values('2012-02-05 13:57:46.980', 2)
insert #t values('2012-02-05 13:59:36.980', 2)
insert #t values('2012-02-05 14:06:36.980', 22)
insert #t values('2012-02-05 15:56:36.980', 2)
declare #N int = 1
;with a as
(
select EventTime, Speed, row_number() over (order by EventTime) rn from #t
), b as
(
select EventTime, Speed, 1 grp, rn from a where rn = 1
union all
select a.EventTime, a.Speed, case when a.speed < 10 and b.speed < 10 then grp else grp + 1 end, a.rn
from a join b on a.rn = b.rn+1
), c as
(
select EventTime, Speed, count(*) over (partition by grp) cnt from b
)
select * from c
where cnt > #N
OPTION (MAXRECURSION 0) -- Thx Dems
Almost the same ideea as Dems, a little bit different:
select * from (
select eventtime, speed, rnk, new_rnk,
rnk - new_rnk,
max(rnk) over (partition by speed, new_rnk-rnk) -
min(rnk) over (partition by speed, new_rnk-rnk) + 1 as no_consec
from (
select eventtime, rnk, speed,
row_number() over (partition by speed order by eventtime) as new_rnk
from (
select eventtime, speed,
row_number() over (order by eventtime) as rnk
from a
) a
where a.speed < 5
)
order by eventtime
)
where no_consec >= 2;
5 is speed limit and 2 is min number of consecutive events.
I put date as number for simplicity of writing the create database.
SQLFIDDLE
EDIT:
To answer to comments, I've added three columns in the first inner query. To get only the first row you need to add an pos_in_group = 1 to WHERE clause and the distance is at your fingers.
SQLFIDDLE
select eventtime, speed, min_date, max_date, pos_in_group
from (
select eventtime, speed, rnk, new_rnk,
rnk - new_rnk,
row_number() over (partition by speed, new_rnk-rnk order by eventtime) pos_in_group,
min(eventtime) over (partition by speed, new_rnk-rnk) min_date,
max(eventtime) over (partition by speed, new_rnk-rnk) max_date,
max(rnk) over (partition by speed, new_rnk-rnk) -
min(rnk) over (partition by speed, new_rnk-rnk) + 1 as no_consec
from (
select eventtime, rnk, speed,
row_number() over (partition by speed order by eventtime) as new_rnk
from (
select eventtime, speed,
row_number() over (order by eventtime) as rnk
from a
) a
where a.speed < 5
)
order by eventtime
)
where no_consec > 1;