SQL moving average

How do you create a moving average in SQL?
Current table:
Date Clicks
2012-05-01 2,230
2012-05-02 3,150
2012-05-03 5,520
2012-05-04 1,330
2012-05-05 2,260
2012-05-06 3,540
2012-05-07 2,330
Desired table or output:
Date Clicks 3 day Moving Average
2012-05-01 2,230
2012-05-02 3,150
2012-05-03 5,520 4,360
2012-05-04 1,330 3,330
2012-05-05 2,260 3,120
2012-05-06 3,540 3,320
2012-05-07 2,330 3,010

This is an evergreen Joe Celko question.
I don't know which DBMS platform is used, but in any case Joe was able to answer it more than 10 years ago with standard SQL.
Joe Celko SQL Puzzles and Answers citation:
"That last update attempt suggests that we could use the predicate to
construct a query that would give us a moving average:"
SELECT S1.sample_time, AVG(S2.load) AS avg_prev_hour_load
FROM Samples AS S1, Samples AS S2
WHERE S2.sample_time
BETWEEN (S1.sample_time - INTERVAL 1 HOUR)
AND S1.sample_time
GROUP BY S1.sample_time;
Is the extra column or the query approach better? The query is
technically better because the UPDATE approach will denormalize the
database. However, if the historical data being recorded is not going
to change and computing the moving average is expensive, you might
consider using the column approach.
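For illustration only, a minimal sketch of the column approach Celko alludes to, assuming an extra avg_prev_hour_load column has been added to Samples (the column name and the one-hour window are taken from the quoted query; the alias-free UPDATE form is used because correlated-UPDATE syntax varies by DBMS):
-- hypothetical sketch: materialize the moving average into the extra column
UPDATE Samples
SET avg_prev_hour_load =
(SELECT AVG(S2.load)
FROM Samples AS S2
WHERE S2.sample_time BETWEEN (Samples.sample_time - INTERVAL 1 HOUR)
AND Samples.sample_time);
This is exactly the denormalization trade-off described above: the value is cheap to read back, but it must be refreshed whenever new samples arrive.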
MS SQL Example:
CREATE TABLE #TestDW
( Date1 datetime,
LoadValue Numeric(13,6)
);
INSERT INTO #TestDW VALUES('2012-06-09' , '3.540' );
INSERT INTO #TestDW VALUES('2012-06-08' , '2.260' );
INSERT INTO #TestDW VALUES('2012-06-07' , '1.330' );
INSERT INTO #TestDW VALUES('2012-06-06' , '5.520' );
INSERT INTO #TestDW VALUES('2012-06-05' , '3.150' );
INSERT INTO #TestDW VALUES('2012-06-04' , '2.230' );
SQL Puzzle query:
SELECT S1.date1, AVG(S2.LoadValue) AS avg_prev_3_days
FROM #TestDW AS S1, #TestDW AS S2
WHERE S2.date1
BETWEEN DATEADD(d, -2, S1.date1 )
AND S1.date1
GROUP BY S1.date1
order by 1;

One way to do this is to join the table to itself a few times.
select
[Current].Date,
([Current].Clicks
+ isnull(P1.Clicks, 0)
+ isnull(P2.Clicks, 0)
+ isnull(P3.Clicks, 0)) / 4 as MovingAvg3 -- note: this is a 4-point average (today plus the 3 previous days); use 3 terms and / 3.0 for a strict 3-day average
from
MyTable as [Current]
left join MyTable as P1 on P1.Date = DateAdd(day, -1, [Current].Date)
left join MyTable as P2 on P2.Date = DateAdd(day, -2, [Current].Date)
left join MyTable as P3 on P3.Date = DateAdd(day, -3, [Current].Date)
Adjust the DateAdd component of the ON clauses to control whether your moving average runs strictly from the past through now, or from days ago through days ahead.
This works nicely when you only need a moving average over a few data points; it is not a good fit for moving averages over many more points.
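On engines that support window functions (SQL Server 2012+, and see the Hive and Oracle answers further down), the same trailing average needs no self-joins at all. A minimal sketch, assuming the MyTable/Date/Clicks names used above and one row per date:
select Date,
Clicks,
-- current row plus the 2 preceding rows = trailing 3-day window (by row, so it assumes one row per date);
-- the first 2 rows average over fewer points; see the CASE/COUNT trick in later answers to blank them
avg(Clicks * 1.0) over (order by Date rows between 2 preceding and current row) as MovingAvg3
from MyTable;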

select t2.date, round(sum(ct.clicks)/3) as avg_clicks
from
(select date from clickstable) as t2,
(select date, clicks from clickstable) as ct
where datediff(t2.date, ct.date) between 0 and 2
group by t2.date
Obviously you can change the interval to whatever you need. You could also use count() instead of a magic number to make it easier to change, but that will also slow it down.
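For instance, letting avg() do the divide-by-count instead of the magic number 3 (a sketch against the same assumed clickstable):
select t2.date, round(avg(ct.clicks)) as avg_clicks
from
(select date from clickstable) as t2,
(select date, clicks from clickstable) as ct
where datediff(t2.date, ct.date) between 0 and 2
group by t2.date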

General template for rolling averages that scales well for large data sets
WITH moving_avg AS (
SELECT 0 AS [lag] UNION ALL
SELECT 1 AS [lag] UNION ALL
SELECT 2 AS [lag] UNION ALL
SELECT 3 AS [lag] --ETC
)
SELECT
DATEADD(day,[lag],[date]) AS [reference_date],
[otherkey1],[otherkey2],[otherkey3],
AVG([value1]) AS [avg_value1],
AVG([value2]) AS [avg_value2]
FROM [data_table]
CROSS JOIN moving_avg
GROUP BY [otherkey1],[otherkey2],[otherkey3],DATEADD(day,[lag],[date])
ORDER BY [otherkey1],[otherkey2],[otherkey3],[reference_date];
And for weighted rolling averages:
WITH weighted_avg AS (
SELECT 0 AS [lag], 1.0 AS [weight] UNION ALL
SELECT 1 AS [lag], 0.6 AS [weight] UNION ALL
SELECT 2 AS [lag], 0.3 AS [weight] UNION ALL
SELECT 3 AS [lag], 0.1 AS [weight] --ETC
)
SELECT
DATEADD(day,[lag],[date]) AS [reference_date],
[otherkey1],[otherkey2],[otherkey3],
AVG([value1] * [weight]) / AVG([weight]) AS [wavg_value1],
AVG([value2] * [weight]) / AVG([weight]) AS [wavg_value2]
FROM [data_table]
CROSS JOIN weighted_avg
GROUP BY [otherkey1],[otherkey2],[otherkey3],DATEADD(day,[lag],[date])
ORDER BY [otherkey1],[otherkey2],[otherkey3],[reference_date];
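To see why AVG([value1] * [weight]) / AVG([weight]) is the weighted mean: both averages run over the same rows, so the row counts cancel and the expression reduces to SUM(value * weight) / SUM(weight). A worked example with the weights above and the question's clicks for 2012-05-01 through 2012-05-04 (the most recent day takes lag 0):
(1330*1.0 + 5520*0.6 + 3150*0.3 + 2230*0.1) / (1.0 + 0.6 + 0.3 + 0.1)
= (1330 + 3312 + 945 + 223) / 2.0
= 5810 / 2.0
= 2905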

select *
, (select avg(c2.clicks) from #clicks_table c2
where c2.date between dateadd(dd, -2, c1.date) and c1.date) mov_avg
from #clicks_table c1

Use a different join predicate:
SELECT cur.[date]
,avg(periods.clicks)
FROM clickstable AS cur
LEFT OUTER JOIN clickstable AS periods
ON periods.[date] BETWEEN dateadd(d, -2, cur.[date]) AND cur.[date]
GROUP BY cur.[date] HAVING COUNT(*) >= 3
The HAVING clause prevents any dates without at least N values from being returned.

Assuming x is the value to be averaged and xDate is the date column, correlate the window to each row:
SELECT t1.xDate, (SELECT avg(t2.x) FROM myTable t2 WHERE t2.xDate BETWEEN dateadd(d, -2, t1.xDate) AND t1.xDate) AS moving_avg
FROM myTable t1

In Hive, maybe you could try:
select date, clicks,
avg(clicks) over (order by date rows between 2 preceding and current row) as moving_avg
from clicktable;

For this purpose, I'd create an auxiliary/dimension date table like
create table date_dim(date date, date_1 date, date_2 date, date_3 date, ...)
where date is the key, date_1 is the day itself, date_2 the day before, date_3 two days before, and so on.
Then you can do the equi-join in Hive.
Using a view like:
select date, date from date_dim
union all
select date, date_add(date, -1) from date_dim
union all
select date, date_add(date, -2) from date_dim
union all
select date, date_add(date, -3) from date_dim
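Assuming the view's two columns are aliased, say key_date and window_date (those names are mine, not part of the answer), the moving average over the window the view defines is then just the equi-join plus a GROUP BY -- a sketch:
select v.key_date as date, avg(c.clicks) as moving_avg
from date_window_view v
join clickstable c on c.date = v.window_date
group by v.key_date;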

NOTE: THIS IS NOT AN ANSWER but an enhanced code sample of Diego Scaravaggi's answer. I am posting it as an answer because the comment section is insufficient. Note that I have parameterized the period for the moving average.
declare @p int = 3
declare @t table(d int, bal float)
insert into @t values
(1,94),
(2,99),
(3,76),
(4,74),
(5,48),
(6,55),
(7,90),
(8,77),
(9,16),
(10,19),
(11,66),
(12,47)
select a.d, avg(b.bal)
from
@t a
left join @t b on b.d between a.d-(@p-1) and a.d
group by a.d

--@p1 is the period of the moving average, @o1 is the offset
declare @p1 as int
declare @o1 as int
set @p1 = 5;
set @o1 = 3;
with np as(
select *, rank() over(partition by cmdty, tenor order by markdt) as r
from p_prices p1
where
1=1
)
, x1 as (
select s1.*, avg(s2.val) as avgval from np s1
inner join np s2
on s1.cmdty = s2.cmdty and s1.tenor = s2.tenor
and s2.r between s1.r - (@p1 - 1) - (@o1) and s1.r - (@o1)
group by s1.cmdty, s1.tenor, s1.markdt, s1.val, s1.r
)
select * from x1;

I'm not sure that your expected result (output) shows a classic "simple moving (rolling) average" over 3 days, because, by definition, the first triple of numbers gives:
ThreeDaysMovingAverage = (2.230 + 3.150 + 5.520) / 3 = 3.6333333
but you expect 4.360, which is confusing.
Nevertheless, I suggest the following solution, which uses the window function AVG. This approach is much more efficient (clearer and less resource-intensive) than the self-joins introduced in other answers (and I'm surprised that no one has given a better solution).
-- Oracle-SQL dialect
with
data_table as (
select date '2012-05-01' AS dt, 2.230 AS clicks from dual union all
select date '2012-05-02' AS dt, 3.150 AS clicks from dual union all
select date '2012-05-03' AS dt, 5.520 AS clicks from dual union all
select date '2012-05-04' AS dt, 1.330 AS clicks from dual union all
select date '2012-05-05' AS dt, 2.260 AS clicks from dual union all
select date '2012-05-06' AS dt, 3.540 AS clicks from dual union all
select date '2012-05-07' AS dt, 2.330 AS clicks from dual
),
param as (select 3 days from dual)
select
dt AS "Date",
clicks AS "Clicks",
case when rownum >= p.days then
avg(clicks) over (order by dt
rows between p.days - 1 preceding and current row)
end
AS "3 day Moving Average"
from data_table t, param p;
You can see that AVG is wrapped in case when rownum >= p.days then to force NULLs in the first rows, where the "3 day Moving Average" is meaningless.

We can apply Joe Celko's "dirty" left outer join method (as cited above by Diego Scaravaggi) to answer the question as it was asked.
declare @ClicksTable table ([Date] date, Clicks int)
insert into @ClicksTable
select '2012-05-01', 2230 union all
select '2012-05-02', 3150 union all
select '2012-05-03', 5520 union all
select '2012-05-04', 1330 union all
select '2012-05-05', 2260 union all
select '2012-05-06', 3540 union all
select '2012-05-07', 2330
This query:
SELECT
T1.[Date],
T1.Clicks,
-- AVG ignores NULL values so we have to explicitly NULLify
-- the days when we don't have a full 3-day sample
CASE WHEN count(T2.[Date]) < 3 THEN NULL
ELSE AVG(T2.Clicks)
END AS [3-Day Moving Average]
FROM @ClicksTable T1
LEFT OUTER JOIN @ClicksTable T2
ON T2.[Date] BETWEEN DATEADD(d, -2, T1.[Date]) AND T1.[Date]
GROUP BY T1.[Date]
Generates the requested output:
Date Clicks 3-Day Moving Average
2012-05-01 2,230
2012-05-02 3,150
2012-05-03 5,520 4,360
2012-05-04 1,330 3,330
2012-05-05 2,260 3,120
2012-05-06 3,540 3,320
2012-05-07 2,330 3,010

Related

How to retrieve a default value even when no record exists in the database

When querying the data shown below, there is no record in the database for the day 2021-10-03.
date value
2021-10-01 100
2021-10-02 90
2021-10-04 10
2021-10-05 40
I would like to execute the query using a date range, as in SELECT ... WHERE date BETWEEN '2021-10-01' AND '2021-10-05', and when no data exists for a specific day, to retrieve zero, as exemplified below:
date value
2021-10-01 100
2021-10-02 90
2021-10-03 0
2021-10-04 10
2021-10-05 40
Is this possible in BigQuery?
I tried the query below, but it returned duplicated values.
WITH `project.myproject` AS (
SELECT
DATA_VENDA AS date,
CAST(SUM(VLR_VENDA_TABELA) AS FLOAT64) AS total,
FROM `project.myproject`
WHERE
(DATA_VENDA BETWEEN '2020-10-02'
AND '2020-10-07')
AND COD_CP = '0000010232'
GROUP BY
DATA_VENDA
ORDER BY
DATA_VENDA
),
dates AS (
SELECT total, date
FROM `project.myproject`, UNNEST(GENERATE_DATE_ARRAY(date('2020-10-02'), date('2020-10-07'))) AS date
)
SELECT d.date, IFNULL(t.total, 0) total
FROM dates d
LEFT JOIN `project.myproject` t
ON d.date = t.date
AND d.total = t.total
ORDER BY d.date
I found the answer by running the command below. The difference from the previous attempt is that I removed the line AND d.total = t.total, which was responsible for creating the duplicated data. The final answer follows:
WITH `project.myproject` AS (
SELECT
DATA_VENDA AS date,
CAST(SUM(VLR_VENDA_TABELA) AS FLOAT64) AS total,
FROM `project.myproject`
WHERE
(DATA_VENDA BETWEEN '2020-10-02'
AND '2020-10-07')
AND COD_CP = '0000010232'
GROUP BY
DATA_VENDA
ORDER BY
DATA_VENDA
),
dates AS (
SELECT total, date
FROM `project.myproject`, UNNEST(GENERATE_DATE_ARRAY(date('2020-10-02'), date('2020-10-07'))) AS date
)
SELECT d.date, IFNULL(t.total, 0) total
FROM dates d
LEFT JOIN `project.myproject` t
ON d.date = t.date
ORDER BY d.date
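A slightly simpler shape of the same idea keeps the calendar and the aggregate in separately named CTEs instead of shadowing the table name (the CTE names below are illustrative, and it assumes DATA_VENDA is a DATE):
WITH calendar AS (
SELECT dt
FROM UNNEST(GENERATE_DATE_ARRAY(DATE '2020-10-02', DATE '2020-10-07')) AS dt
),
totals AS (
SELECT DATA_VENDA AS dt,
CAST(SUM(VLR_VENDA_TABELA) AS FLOAT64) AS total
FROM `project.myproject`
WHERE DATA_VENDA BETWEEN '2020-10-02' AND '2020-10-07'
AND COD_CP = '0000010232'
GROUP BY DATA_VENDA
)
SELECT c.dt AS date, IFNULL(t.total, 0) AS total
FROM calendar c
LEFT JOIN totals t ON t.dt = c.dt
ORDER BY c.dt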
You can simply do that with a common table expression (CTE), as shown below.
DECLARE @Datatemp TABLE (
Id INT IDENTITY(1,1) NOT NULL,
CDate DATETIME,
Val INT
)
INSERT INTO @Datatemp SELECT '2021-10-01',10
INSERT INTO @Datatemp SELECT '2021-10-02',50
INSERT INTO @Datatemp SELECT '2021-10-04',24
INSERT INTO @Datatemp SELECT '2021-10-05',18
;WITH DateTemp(Date) AS (
SELECT CAST('2021-10-01' AS DATETIME)
UNION ALL
SELECT [Date]+1
FROM DateTemp
WHERE [Date] < '2021-10-05'
)
SELECT DateTemp.[Date] CDat
,ISNULL(t.Val, 0) Val
FROM DateTemp
LEFT JOIN @Datatemp t ON t.CDate = DateTemp.[Date]
ORDER BY DateTemp.[Date]
--OPTION (MAXRECURSION 0)
By default, the number of iterations for a recursive CTE is limited to 100. If this limit is exceeded, the query is interrupted and an error is raised. If you want to remove the restriction, specify OPTION (MAXRECURSION 0) on the statement.
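For example, with the restriction lifted, the hint attaches to the end of the outer statement, after the ORDER BY:
SELECT DateTemp.[Date] CDat
,ISNULL(t.Val, 0) Val
FROM DateTemp
LEFT JOIN @Datatemp t ON t.CDate = DateTemp.[Date]
ORDER BY DateTemp.[Date]
OPTION (MAXRECURSION 0)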

SQL - '1' IF hour in month EXISTS, '0' IF NOT EXISTS

I have a table that has aggregations down to the hour level YYYYMMDDHH. The data is aggregated and loaded by an external process (I don't have control over). I want to test the data on a monthly basis.
The question I am looking to answer is: Does every hour in the month exist?
I'm looking to produce output that will return a 1 if the hour exists or 0 if the hour does not exist.
The aggregation table looks something like this...
YYYYMM YYYYMMDD YYYYMMDDHH DATA_AGG
201911 20191101 2019110100 100
201911 20191101 2019110101 125
201911 20191101 2019110103 135
201911 20191101 2019110105 95
… … … …
201911 20191130 2019113020 100
201911 20191130 2019113021 110
201911 20191130 2019113022 125
201911 20191130 2019113023 135
And defined as...
CREATE TABLE YYYYMMDDHH_DATA_AGG (
YYYYMM VARCHAR,
YYYYMMDD VARCHAR,
YYYYMMDDHH VARCHAR,
DATA_AGG INT
);
I'm looking to produce the following below...
YYYYMMDDHH HOUR_EXISTS
2019110100 1
2019110101 1
2019110102 0
2019110103 1
2019110104 0
2019110105 1
... ...
In the example above, two hours do not exist, 2019110102 and 2019110104.
I assume I'd have to join the aggregation table against a computed table that contains all the YYYYMMDDHH combinations?
The database is Snowflake, but assume most generic ANSI SQL queries will work.
You can get what you want with a recursive CTE.
The recursive CTE generates the list of possible hours, and then a simple left outer join gives you the flag for whether any records match that hour.
WITH RECURSIVE CTE (YYYYMMDDHH) as
(
SELECT YYYYMMDDHH
FROM YYYYMMDDHH_DATA_AGG
WHERE YYYYMMDDHH = (SELECT MIN(YYYYMMDDHH) FROM YYYYMMDDHH_DATA_AGG)
UNION ALL
SELECT TO_VARCHAR(DATEADD(HOUR, 1, TO_TIMESTAMP(C.YYYYMMDDHH, 'YYYYMMDDHH')), 'YYYYMMDDHH') YYYYMMDDHH
FROM CTE C
WHERE TO_VARCHAR(DATEADD(HOUR, 1, TO_TIMESTAMP(C.YYYYMMDDHH, 'YYYYMMDDHH')), 'YYYYMMDDHH') <= (SELECT MAX(YYYYMMDDHH) FROM YYYYMMDDHH_DATA_AGG)
)
SELECT
C.YYYYMMDDHH,
IFF(A.YYYYMMDDHH IS NOT NULL, 1, 0) HOUR_EXISTS
FROM CTE C
LEFT OUTER JOIN YYYYMMDDHH_DATA_AGG A
ON C.YYYYMMDDHH = A.YYYYMMDDHH;
If your time range is too long, you'll run into the CTE recursion limit. You can create a table or temp table with all of the possible hours instead. For example:
CREATE OR REPLACE TEMPORARY TABLE HOURS (YYYYMMDDHH VARCHAR) AS
SELECT TO_VARCHAR(DATEADD(HOUR, SEQ4(), TO_TIMESTAMP((SELECT MIN(YYYYMMDDHH) FROM YYYYMMDDHH_DATA_AGG), 'YYYYMMDDHH')), 'YYYYMMDDHH')
FROM TABLE(GENERATOR(ROWCOUNT => 10000)) V
ORDER BY 1;
SELECT
H.YYYYMMDDHH,
IFF(A.YYYYMMDDHH IS NOT NULL, 1, 0) HOUR_EXISTS
FROM HOURS H
LEFT OUTER JOIN YYYYMMDDHH_DATA_AGG A
ON H.YYYYMMDDHH = A.YYYYMMDDHH
WHERE H.YYYYMMDDHH <= (SELECT MAX(YYYYMMDDHH) FROM YYYYMMDDHH_DATA_AGG);
You can then fiddle with the generator count to make sure you have enough hours.
You can generate a table with every hour of the month and LEFT OUTER JOIN your aggregation to it:
WITH EVERY_HOUR AS (
SELECT TO_CHAR(DATEADD(HOUR, HH, TO_DATE(YYYYMM::TEXT, 'YYYYMM')),
'YYYYMMDDHH')::NUMBER YYYYMMDDHH
FROM (SELECT DISTINCT YYYYMM FROM YYYYMMDDHH_DATA_AGG) t
CROSS JOIN (
SELECT ROW_NUMBER() OVER (ORDER BY NULL) - 1 HH
FROM TABLE(GENERATOR(ROWCOUNT => 745))
) h
QUALIFY YYYYMMDDHH < (YYYYMM + 1) * 10000
)
SELECT h.YYYYMMDDHH, NVL2(a.YYYYMM, 1, 0) HOUR_EXISTS
FROM EVERY_HOUR h
LEFT OUTER JOIN YYYYMMDDHH_DATA_AGG a ON a.YYYYMMDDHH = h.YYYYMMDDHH
Here's something that might help get you started. I'm guessing you want 'synthetic' [YYYYMMDD] values? Otherwise, if the values aren't there, they shouldn't appear in the list.
DROP TABLE IF EXISTS #_hours
DROP TABLE IF EXISTS #_temp
--Populate a table with hours ranging from 00 to 23
CREATE TABLE #_hours ([hour_value] VARCHAR(2))
DECLARE @_i INT = 0
WHILE (@_i < 24)
BEGIN
INSERT INTO #_hours
SELECT FORMAT(@_i, '0#')
SET @_i += 1
END
-- Replicate OP's sample data set
CREATE TABLE #_temp (
[YYYYMM] INTEGER
, [YYYYMMDD] INTEGER
, [YYYYMMDDHH] INTEGER
, [DATA_AGG] INTEGER
)
INSERT INTO #_temp
VALUES
(201911, 20191101, 2019110100, 100),
(201911, 20191101, 2019110101, 125),
(201911, 20191101, 2019110103, 135),
(201911, 20191101, 2019110105, 95),
(201911, 20191130, 2019113020, 100),
(201911, 20191130, 2019113021, 110),
(201911, 20191130, 2019113022, 125),
(201911, 20191130, 2019113023, 135)
SELECT X.YYYYMM, X.YYYYMMDD, X.YYYYMMDDHH
-- Case: If 'target_hours' doesn't exist, then 0, else 1
, CASE WHEN X.target_hours IS NULL THEN '0' ELSE '1' END AS [HOUR_EXISTS]
FROM (
-- Select right 2 characters from converted [YYYYMMDDHH] to act as 'target values'
SELECT T.*
, RIGHT(CAST(T.[YYYYMMDDHH] AS VARCHAR(10)), 2) AS [target_hours]
FROM #_temp AS T
) AS X
-- Right join to keep all of our hours and only the target hours that match.
RIGHT JOIN #_hours AS H ON H.hour_value = X.target_hours
Sample output:
YYYYMM YYYYMMDD YYYYMMDDHH HOUR_EXISTS
201911 20191101 2019110100 1
201911 20191101 2019110101 1
NULL NULL NULL 0
201911 20191101 2019110103 1
NULL NULL NULL 0
201911 20191101 2019110105 1
NULL NULL NULL 0
With (almost) standard sql, you can do a cross join of the distinct values of YYYYMMDD to a list of all possible hours and then left join to the table:
select concat(d.YYYYMMDD, h.hour) as YYYYMMDDHH,
case when t.YYYYMMDDHH is null then 0 else 1 end as hour_exists
from (select distinct YYYYMMDD from tablename) as d
cross join (
select '00' as hour union all select '01' union all
select '02' union all select '03' union all
select '04' union all select '05' union all
select '06' union all select '07' union all
select '08' union all select '09' union all
select '10' union all select '11' union all
select '12' union all select '13' union all
select '14' union all select '15' union all
select '16' union all select '17' union all
select '18' union all select '19' union all
select '20' union all select '21' union all
select '22' union all select '23'
) as h
left join tablename as t
on concat(d.YYYYMMDD, h.hour) = t.YYYYMMDDHH
order by concat(d.YYYYMMDD, h.hour)
In Snowflake you can probably construct the list of hours with a sequence much more easily than with all those UNION ALLs.
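For instance, a sketch of that (the column alias is mine): the 24 hour strings could be generated with
select lpad(to_varchar(row_number() over (order by null) - 1), 2, '0') as hour
from table(generator(rowcount => 24));
which can replace the UNION ALL derived table h above.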
This version accounts for the full range of days, across months and years. It's a simple cross join of the set of possible days with the set of possible hours of the day -- left joined to actual dates.
set first = (select min(yyyymmdd::number) from YYYYMMDDHH_DATA_AGG);
set last = (select max(yyyymmdd::number) from YYYYMMDDHH_DATA_AGG);
with
hours as (select row_number() over (order by null) - 1 h from table(generator(rowcount=>24))),
days as (
select
row_number() over (order by null) - 1 as n,
to_date($first::text, 'YYYYMMDD')::date + n as d,
to_char(d, 'YYYYMMDD') as yyyymmdd
from table(generator(rowcount=>($last-$first+1)))
)
select days.yyyymmdd || lpad(hours.h,2,0) as YYYYMMDDHH, nvl2(t.yyyymmddhh,1,0) as HOUR_EXISTS
from days cross join hours
left join YYYYMMDDHH_DATA_AGG t on t.yyyymmddhh = days.yyyymmdd || lpad(hours.h,2,0)
order by 1
;
$first and $last can be packed in as sub-queries if you prefer.

Previous row end date as the next row start date in SQL

I need some help, please.
I have a field called hist_lastupdated that contains the date the price of a product was last modified.
Based on this field, I want to extract the start date and the end date of each modification.
In fact, I have this:
**Product_id , Price , hist_lastupdated**
284849 18.95 2015-05-29 00:53:55
284849 15.95 2015-08-14 01:04:46
284849 18.95 2016-06-11 00:50:31
284849 15.95 2016-08-24 00:45:11
And I want to get a result like this:
**Product_id , Price , hist_lastupdated ,start_date , End_date**
284849 18.95 2015-05-29 00:53:55 2014-05-01 00:00:00 2015-05-29 00:53:55
284849 15.95 2015-08-14 01:04:46 2015-05-29 00:53:55 2015-08-14 01:04:46
284849 18.95 2016-06-11 00:50:31 2015-08-14 01:04:46 2016-06-11 00:50:31
284849 15.95 2016-08-24 00:45:11 2016-06-11 00:50:31 2016-08-24 00:45:11
In short, the start date is the end date of the previous row.
I have many product IDs.
Something like this:
select Product_id,
Price,
hist_lastupdated,
lag(hist_lastupdated) over (partition by product_id order by hist_lastupdated) as start_date,
hist_lastupdated as end_date
from the_table
You didn't explain how the start_date for the first row is calculated. If it should be the beginning of the month of hist_lastupdated, you can do something like this:
lag(hist_lastupdated, 1, date_trunc('month', hist_lastupdated)) over (...)
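Putting the pieces together (a sketch; the month-start default is only a guess at the rule for the first row, as noted above):
select Product_id,
Price,
hist_lastupdated,
lag(hist_lastupdated, 1, date_trunc('month', hist_lastupdated))
over (partition by Product_id order by hist_lastupdated) as start_date,
hist_lastupdated as end_date
from the_table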
I'm not sure how you would do this with just SQL but if you're able to do a bit of scripting you can write up a quick program that goes something like this (pseudocode):
lines = execute(SELECT product_id, price, hist_lastupdated FROM ProductTable)
startDate = 00:00:00 2014-05-01
outputLines = []
for row in lines:
    outLine = []
    outLine.append(row[0])
    outLine.append(row[1])
    outLine.append(row[2])
    outLine.append(startDate)
    outLine.append(row[2])
    outputLines.append(outLine)
    startDate = row[2]
#Now do what you want with the output you have in a nice list of lists in the format you need, insert into a different table, write to a file, whatever you want.
I would use one of these solutions with MS SQL Server. Hopefully one of them will apply to your problem.
Pure SQL statement would look like this:
select
t.product_id, t.price, s.start_date, t.end_date
from
product t
outer apply
(
select top 1
end_date start_date
from
product o
where
o.end_date < t.end_date
order by
o.end_date desc
) s
The OUTER APPLY runs once for each record returned and can be a performance problem even with good indexing.
If your SQL Server version supports the LAG function:
select
t.product_id, t.price,
LAG(T.end_date) over (order by t.end_date),
t.end_date
from
product t
Or you may find a way to do the same thing with variables in an UPDATE statement to "remember" the value from the previously updated record, like this T-SQL:
-- Insert the desired output into a table variable that also has a start_date field.
-- Be sure to insert the records ordered by the date value.
declare @output table (product_id int, price numeric(10,2), [start_date] datetime, [end_date] datetime)
insert @output (product_id, price, end_date)
select 1, 10, '1/1/2015'
union all select 2, 11, '2/1/2015'
union all select 3, 15, '3/1/2015'
union all select 4, 20, '4/1/2015'
order by 3
-- Update the start date using the end date from the previous record
declare @start_date datetime, @end_date datetime
update
@output
set
@start_date = @end_date,
@end_date = end_date,
start_date = @start_date
select * from @output
I don't think this technique is recommended by Microsoft, but it has served me well and worked consistently. I only used this technique with table variables; I would be less inclined to trust the update sequence of records in an actual table. Now I would use LAG() instead.
This is the solution I found. I wanted to use the LAG function, but the result was not what I wanted.
The solution:
WITH
price_table_1 as (
select
-1 + ROW_NUMBER() over (partition by t1.product_id,t1.id ,t1.channel_id) as rownum_w1,
t1.id,
t1.product_id,
t1.channel_id,
t1.member_id,
t1.quantity,
t1.price,
t1.promo_dt_start,
t1.promo_dt_end,
t1.hist_lastupdated
FROM dwh_prod.hist_prices t1
where t1.channel_id='1004' and t1.product_id = '5896' and t1.quantity = '1' and t1.promo_dt_start is null
order by t1.product_id,t1.channel_id,t1.hist_lastupdated
),price_table_2 as (
select
ROW_NUMBER() over (partition by t2.product_id,t2.id ,t2.channel_id) as rownum_w2,
t2.id,
t2.product_id,
t2.channel_id,
t2.member_id,
t2.quantity,
t2.price,
t2.promo_dt_start,
t2.promo_dt_end,
t2.hist_lastupdated
FROM dwh_prod.hist_prices t2
where t2.channel_id='1004' and t2.product_id = '5896' and t2.quantity = '1' and t2.promo_dt_start is null
order by t2.product_id,t2.channel_id,t2.hist_lastupdated
)
select
t1.id,
t1.product_id,
t1.channel_id,
t1.member_id,
t1.quantity,
t1.price,
t1.promo_dt_start,
t1.promo_dt_end,
t2.hist_lastupdated as start_date,
t1.hist_lastupdated as end_date
FROM price_table_1 t1
inner join price_table_2 t2
on t2.product_id = t1.product_id and t2.id = t1.id and t2.channel_id = t1.channel_id
and rownum_w1 = (rownum_w2)
UNION ALL
select
t1.id,
t1.product_id,
t1.channel_id,
t1.member_id,
t1.quantity,
t1.price,
t1.promo_dt_start,
t1.promo_dt_end,
CONVERT(TIMESTAMP,'2014-01-01') as start_date,
t1.hist_lastupdated as end_date
FROM price_table_1 t1
where rownum_w1 = '0';

Getting ranges that are not in database

I want to get all times that an event is not taking place for each room. The start of the day is 9:00:00 and end is 22:00:00.
What my database looks like is this:
Event EventStart EventEnd Days Rooms DayStarts
CISC 3660 09:00:00 12:30:00 Monday 7-3 9/19/2014
MATH 2501 15:00:00 17:00:00 Monday:Wednesday 7-2 10/13/2014
CISC 1110 14:00:00 16:00:00 Monday 7-3 9/19/2014
I want to get the times that aren't in the database.
ex. For SelectedDate (9/19/2014) the table should return:
Room FreeTimeStart FreeTimeEnd
7-3 12:30:00 14:00:00
7-3 16:00:00 22:00:00
ex2. SelectedDate (10/13/2014):
Room FreeTimeStart FreeTimeEnd
7-2 9:00:00 15:00:00
7-2 17:00:00 22:00:00
What I have tried is something like this:
select * from Events where ________ NOT BETWEEN eventstart AND eventend;
But I do not know what to put in the place of the space.
This was a pretty complex request. SQL works best with sets, not with line-by-line processing. Here is what I came up with. To make it easier to follow, I wrote it as a series of CTEs so I could work through the problem a step at a time. I am not saying that this is the best possible way to do it, but it doesn't require the use of any cursors. You need the Events table and a table of the room names (otherwise, you don't see a room that doesn't have any bookings).
Here is the query and I will explain the methodology.
DECLARE @Events TABLE (Event varchar(20), EventStart Time, EventEnd Time, Days varchar(50), Rooms varchar(10), DayStarts date)
INSERT INTO @Events
SELECT 'CISC 3660', '09:00:00', '12:30:00', 'Monday', '7-3', '9/19/2014' UNION
SELECT 'MATH 2501', '15:00:00', '17:00:00', 'Monday:Wednesday', '7-2', '10/13/2014' UNION
SELECT 'CISC 1110', '14:00:00', '16:00:00', 'Monday', '7-3', '9/19/2014'
DECLARE @Rooms TABLE (RoomName varchar(10))
INSERT INTO @Rooms
SELECT '7-2' UNION
SELECT '7-3'
DECLARE @SelectedDate date = '9/19/2014'
DECLARE @MinTimeInterval int = 30 --smallest time unit room can be reserved for
;WITH
D1(N) AS (
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
),
D2(N) AS (SELECT 1 FROM D1 a, D1 b),
D4(N) AS (SELECT 1 FROM D2 a, D2 b),
Numbers AS (SELECT TOP 3600 ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) -1 AS Number FROM D4),
AllTimes AS
(SELECT CAST(DATEADD(n,Numbers.Number*@MinTimeInterval,'09:00:00') as time) AS m FROM Numbers
WHERE DATEADD(n,Numbers.Number*@MinTimeInterval,'09:00:00') <= '22:00:00'),
OccupiedTimes AS (
SELECT e.Rooms, ValidTimes.m
FROM @Events E
CROSS APPLY (SELECT m FROM AllTimes WHERE m BETWEEN CASE WHEN e.EventStart = '09:00:00' THEN e.EventStart ELSE DATEADD(n,1,e.EventStart) END and CASE WHEN e.EventEnd = '22:00:00' THEN e.EventEnd ELSE DATEADD(n,-1,e.EventEnd) END) ValidTimes
WHERE e.DayStarts = @SelectedDate
),
AllRoomsAllTimes AS (
SELECT * FROM @Rooms R CROSS JOIN AllTimes
), AllOpenTimes AS (
SELECT a.*, ROW_NUMBER() OVER( PARTITION BY (a.RoomName) ORDER BY a.m) AS pos
FROM AllRoomsAllTimes A
LEFT OUTER JOIN OccupiedTimes o ON a.RoomName = o.Rooms AND a.m = o.m
WHERE o.m IS NULL
), Finalize AS (
SELECT a1.RoomName,
CASE WHEN a3.m IS NULL OR DATEDIFF(n,a3.m, a1.m) > @MinTimeInterval THEN a1.m else NULL END AS FreeTimeStart,
CASE WHEN a2.m IS NULL OR DATEDIFF(n,a1.m,a2.m) > @MinTimeInterval THEN A1.m ELSE NULL END AS FreeTimeEnd,
ROW_NUMBER() OVER( ORDER BY a1.RoomName ) AS Pos
FROM AllOpenTimes A1
LEFT OUTER JOIN AllOpenTimes A2 ON a1.RoomName = a2.RoomName and a1.pos = a2.pos-1
LEFT OUTER JOIN AllOpenTimes A3 ON a1.RoomName = a3.RoomName and a1.pos = a3.pos+1
WHERE A2.m IS NULL OR DATEDIFF(n,a1.m,a2.m) > @MinTimeInterval
OR
A3.m IS NULL OR DATEDIFF(n,a3.m, a1.m) > @MinTimeInterval
)
SELECT F1.RoomName, f1.FreeTimeStart, f2.FreeTimeEnd FROM Finalize F1
LEFT OUTER JOIN Finalize F2 ON F1.Pos = F2.pos-1 AND f1.RoomName = f2.RoomName
WHERE f1.pos % 2 = 1
In the first several lines, I create table variables to simulate your Events and Rooms tables.
The variable @MinTimeInterval determines what time interval the room schedules can be on (every 30 min, 15 min, etc.; this number needs to divide evenly into 60).
Since SQL cannot query data that is missing, we need to create a table that holds all of the times that we want to check for. The first several lines in the WITH create a CTE called AllTimes, which holds all the possible time slots in the day.
Next, we get a list of all of the times that are occupied (OccupiedTimes), and then LEFT OUTER JOIN this to the AllTimes table, which gives us all the available times. Since we only want the start and end of each free period, the Finalize CTE self-joins each record to the previous and next record in the table. If the gap between these rows is greater than @MinTimeInterval, then we know it is either the start or the end of a free period.
Finally we self join this last table to put the start and end times in the same row and only look at every other row.
This will need to be adjusted if a single row in Events spans multiple days or multiple rooms.
Here's a solution that will return the "complete picture" including rooms that aren't booked at all for the day in question:
Declare @Date char(8) = '20141013'
;
WITH cte as
(
SELECT *
FROM -- use your table name instead of the VALUES construct
(VALUES
('09:00:00','12:30:00' ,'7-3', '20140919'),
('15:00:00','17:00:00' ,'7-2', '20141013'),
('14:00:00','16:00:00' ,'7-3', '20140919')) x(EventStart , EventEnd,Rooms, DayStarts)
), cte_Days_Rooms AS
-- get a cartesian product for the day specified and all rooms as well as the start and end time to compare against
(
SELECT y.EventStart,y.EventEnd, x.rooms,a.DayStarts FROM
(SELECT @Date DayStarts) a
CROSS JOIN
(SELECT DISTINCT Rooms FROM cte)x
CROSS JOIN
(SELECT '09:00:00' EventStart,'09:00:00' EventEnd UNION ALL
SELECT '22:00:00' EventStart,'22:00:00' EventEnd) y
), cte_1 AS
-- Merge the original data an the "base data"
(
SELECT * FROM cte WHERE DayStarts=@Date
UNION ALL
SELECT * FROM cte_Days_Rooms
), cte_2 as
-- use the ROW_NUMBER() approach to sort the data
(
SELECT *, ROW_NUMBER() OVER(PARTITION BY DayStarts, Rooms ORDER BY EventStart) as pos
FROM cte_1
)
-- final query: self join with an offset of one row, eliminating duplicate rows if a room is booked starting 9:00 or ending 22:00
SELECT c2a.DayStarts, c2a.Rooms , c2a.EventEnd, c2b.EventStart
FROM cte_2 c2a
INNER JOIN cte_2 c2b on c2a.DayStarts = c2b.DayStarts AND c2a.Rooms =c2b.Rooms AND c2a.pos = c2b.pos -1
WHERE c2a.EventEnd <> c2b.EventStart
ORDER BY c2a.DayStarts, c2a.Rooms

How to count open records, grouped by hour and day in SQL-server-2008-r2

I have hospital patient admission data in Microsoft SQL Server 2008 R2 that looks something like this:
PatientID, AdmitDate, DischargeDate
Jones. 1-jan-13 01:37. 1-jan-13 17:45
Smith 1-jan-13 02:12. 2-jan-13 02:14
Brooks. 4-jan-13 13:54. 5-jan-13 06:14
I would like to count the number of patients in the hospital day by day and hour by hour, e.g.:
1-jan-13 00:00. 0
1-jan-13 01:00. 0
1-jan-13 02:00. 1
1-jan-13 03:00. 2
And I need to include the hours when there are no patients admitted in the result.
I can't create tables so making a reference table listing all the hours and days is out, though.
Any suggestions?
To solve this problem, you need a list of date-hours. The following gets this from the admit date cross joined to a table with 24 hours. The table of 24 hours is calculated from information_schema.columns -- a trick for getting small sequences of numbers in SQL Server.
The rest is just a join between this table and the admissions. This version counts the patients as of each hour, so someone admitted and discharged within the same hour, for instance, is not counted, and in general someone is not counted until the next hour after they are admitted:
with dh as (
select DATEADD(hour, seqnum - 1, thedatehour ) as DateHour
from (select distinct cast(cast(AdmitDate as DATE) as datetime) as thedatehour
from Admissions a
) a cross join
(select ROW_NUMBER() over (order by (select NULL)) as seqnum
from INFORMATION_SCHEMA.COLUMNS
) hours
where seqnum <= 24
)
select dh.DateHour, COUNT(*) as NumPatients
from dh join
Admissions a
on dh.DateHour between a.AdmitDate and a.DischargeDate
group by dh.DateHour
order by 1
This also assumes that there are admissions on every day. That seems like a reasonable assumption. If not, a calendar table would be a big help.
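A sketch of that fallback, in case admissions can skip days entirely: derive the date-hours from the overall admit/discharge range with a recursive CTE instead of from the distinct admit dates (same assumed Admissions table as above):
with bounds as (
select min(AdmitDate) as startdt, max(DischargeDate) as enddt
from Admissions
), dh as (
select cast(cast(startdt as date) as datetime) as DateHour, enddt
from bounds
union all
select dateadd(hour, 1, DateHour), enddt
from dh
where dateadd(hour, 1, DateHour) <= enddt
)
select dh.DateHour, count(a.PatientID) as NumPatients
from dh
left join Admissions a
on dh.DateHour between a.AdmitDate and a.DischargeDate
group by dh.DateHour
order by dh.DateHour
option (maxrecursion 0);
The left join plus count(a.PatientID) also returns 0 for hours with no patients, which the question asked for.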
Here is one (ugly) way:
;WITH DayHours AS
(
SELECT 0 DayHour
UNION ALL
SELECT DayHour+1
FROM DayHours
WHERE DayHour+1 <= 23
)
SELECT B.AdmitDate, A.DayHour, COUNT(DISTINCT PatientID) Patients
FROM DayHours A
CROSS JOIN (SELECT DISTINCT CONVERT(DATE,AdmitDate) AdmitDate
FROM YourTable) B
LEFT JOIN YourTable C
ON B.AdmitDate = CONVERT(DATE,C.AdmitDate)
AND A.DayHour = DATEPART(HOUR,C.AdmitDate)
GROUP BY B.AdmitDate, A.DayHour
This is a bit messy and includes a temp table with the test data you provided, but here it is:
CREATE TABLE #HospitalPatientData (PatientId NVARCHAR(MAX), AdmitDate DATETIME, DischargeDate DATETIME)
INSERT INTO #HospitalPatientData
SELECT 'Jones.', '1-jan-13 01:37:00.000', '1-jan-13 17:45:00.000' UNION
SELECT 'Smith', '1-jan-13 02:12:00.000', '2-jan-13 02:14:00.000' UNION
SELECT 'Brooks.', '4-jan-13 13:54:00.000', '5-jan-13 06:14:00.000'
;WITH DayHours AS
(
SELECT 0 DayHour
UNION ALL
SELECT DayHour+1
FROM DayHours
WHERE DayHour+1 <= 23
),
HospitalPatientData AS
(
SELECT CONVERT(nvarchar(max),AdmitDate,103) as AdmitDate ,DATEPART(hour,(AdmitDate)) as AdmitHour, COUNT(PatientID) as CountOfPatients
FROM #HospitalPatientData
GROUP BY CONVERT(nvarchar(max),AdmitDate,103), DATEPART(hour,(AdmitDate))
),
Results AS
(
SELECT MAX(h.AdmitDate) as Date, d.DayHour
FROM HospitalPatientData h
INNER JOIN DayHours d ON d.DayHour=d.DayHour
GROUP BY AdmitDate, CountOfPatients, DayHour
)
SELECT r.*, COUNT(h.PatientId) as CountOfPatients
FROM Results r
LEFT JOIN #HospitalPatientData h ON CONVERT(nvarchar(max),AdmitDate,103)=r.Date AND DATEPART(HOUR,h.AdmitDate)=r.DayHour
GROUP BY r.Date, r.DayHour
ORDER BY r.Date, r.DayHour
DROP TABLE #HospitalPatientData
This may get you started:
BEGIN TRAN
DECLARE @pt TABLE
(
PatientID VARCHAR(10)
, AdmitDate DATETIME
, DischargeDate DATETIME
)
INSERT INTO @pt
( PatientID, AdmitDate, DischargeDate )
VALUES ( 'Jones', '1-jan-13 01:37', '1-jan-13 17:45' ),
( 'Smith', '1-jan-13 02:12', '2-jan-13 02:14' )
, ( 'Brooks', '4-jan-13 13:54', '5-jan-13 06:14' )
DECLARE @StartDate DATETIME = '20130101'
, @FutureDays INT = 7
;
WITH dy
AS ( SELECT TOP (@FutureDays)
ROW_NUMBER() OVER ( ORDER BY name ) dy
FROM sys.columns c
) ,
hr
AS ( SELECT TOP 24
ROW_NUMBER() OVER ( ORDER BY name ) hr
FROM sys.columns c
)
SELECT refDate, COUNT(p.PatientID) AS PtCount
FROM ( SELECT DATEADD(HOUR, hr.hr - 1,
DATEADD(DAY, dy.dy - 1, @StartDate)) AS refDate
FROM dy
CROSS JOIN hr
) ref
LEFT JOIN @pt p ON ref.refDate BETWEEN p.AdmitDate AND p.DischargeDate
GROUP BY refDate
ORDER BY refDate
ROLLBACK