SQL query from Oracle SQL to T-SQL

I have a subquery which is used for an Oracle database, but I want to use an equivalent query for a SQL Server database.
I can't figure out how to migrate the TO_TIMESTAMP(TO_CHAR(TO_DATE(...))) part, and I also don't know how to handle ROWNUM in T-SQL.
Is it even possible to migrate this query?
SELECT 0 run_id,
0 tran_id,
0 sort_id,
' ' tran_type,
10 prod_id,
72 type_id,
1 value,
TO_TIMESTAMP(TO_CHAR(TO_DATE('2016-03-18 00:00:00', 'YYYY.MM.DD HH24:MI:SS') + rownum -1, 'YYYY.MM.DD') || to_char(sw.end_time, 'HH24:MI:SS'), 'YYYY.MM.DD HH24:MI:SS') event_publication,
EXTRACT (YEAR
FROM (TO_DATE('2016-03-18 00:00:00', 'YYYY.MM.DD HH24:MI:SS') + rownum -1)) y,
EXTRACT (MONTH
FROM (TO_DATE('2016-03-18 00:00:00', 'YYYY.MM.DD HH24:MI:SS') + rownum -1)) mo,
EXTRACT (DAY
FROM (TO_DATE('2016-03-18 00:00:00', 'YYYY.MM.DD HH24:MI:SS') + rownum -1)) d,
to_number(to_char (sw.end_time, 'HH24')) h,
to_number(to_char (sw.end_time, 'MI')) mi,
to_number(to_char (sw.end_time, 'SS')) s,
0 ms
FROM all_objects ao,
settlement_win sw,
prod_def pd
WHERE pd.prod_id = 10
AND sw.country = pd.country
AND sw.commodity = pd.commodity
AND rownum <= TO_DATE('2016-03-18 23:59:00', 'YYYY.MM.DD HH24:MI:SS') -TO_DATE('2016-03-18 00:00:00', 'YYYY.MM.DD HH24:MI:SS')+1

The first thing to address is ROWNUM, which has no direct equivalent in T-SQL, but we can mimic it. For this particular query, recognize that the table ALL_OBJECTS is only being used to produce a number of rows; it has no other purpose in the query.
In T-SQL we can generate rows using a CTE. There are many variants of this, but here I suggest:
;WITH
cteDigits AS (
SELECT 0 AS digit UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL
SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9
)
, cteTally AS (
SELECT
d1s.digit
+ d10s.digit * 10
+ d100s.digit * 100 /* add more like this as needed */
-- + d1000s.digit * 1000 /* add more like this as needed */
+ 1 AS rownum
FROM cteDigits d1s
CROSS JOIN cteDigits d10s
CROSS JOIN cteDigits d100s /* add more like this as needed */
--CROSS JOIN cteDigits d1000s /* add more like this as needed */
)
This quickly spins up 1000 rows as is, and can be extended to produce many more rows by adding more cross joins. Note that it returns a column called rownum which starts at 1, thus mimicking the Oracle ROWNUM.
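As a quick sanity check, you can run the tally on its own (a sketch; paste the two CTEs above in front of it):
SELECT COUNT(*) AS total_rows, MIN(rownum) AS first_rownum, MAX(rownum) AS last_rownum
FROM cteTally; -- expect 1000, 1 and 1000 for the three-digit version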
So next you can just add some of the remaining query, like this:
SELECT
0 run_id
, 0 tran_id
, 0 sort_id
, ' ' tran_type
, 10 prod_id
, 72 type_id
, 1 value
, convert(varchar(23), dateadd(day, rownum - 1,'20160318'),121) event_publication
-- several missing rows here
, 0 ms
FROM cteTally
CROSS JOIN settlement_win sw
INNER JOIN prod_def pd ON sw.country = pd.country AND sw.commodity = pd.commodity
WHERE pd.prod_id = 10
AND rownum <= datediff(day,'20160318','20160318') + 1
Note that you do not really need a to_timestamp() equivalent; you just need the ability to output date and time to the maximum precision of your data, which appears to be the level of seconds.
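For example, CONVERT styles 120 and 121 render a datetime as text to seconds and to milliseconds respectively (a sketch against the current time):
SELECT CONVERT(varchar(19), GETDATE(), 120) AS to_seconds,      -- yyyy-mm-dd hh:mi:ss
       CONVERT(varchar(23), GETDATE(), 121) AS to_milliseconds; -- yyyy-mm-dd hh:mi:ss.mmm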
To progress further (I think) requires an understanding of the data held in the column sw.end_time. If it can be converted to the SQL Server datetime data type, then it is just a matter of adding a number of days to that value to arrive at the event_publication; similarly, once sw.end_time is a datetime, use datepart() to get the hours, minutes and seconds from that column, e.g.
, DATEADD(day,rownum-1,CONVERT(datetime, sw.end_time)) AS event_publication
Also, if such a calculation works, then it would be possible to use an APPLY operator to simplify the overall query, something like this:
;WITH
cteDigits AS (
SELECT 0 AS digit UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL
SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9
)
, cteTally AS (
SELECT
d1s.digit
+ d10s.digit * 10
+ d100s.digit * 100 /* add more like this as needed */
-- + d1000s.digit * 1000 /* add more like this as needed */
+ 1 AS rownum
FROM cteDigits d1s
CROSS JOIN cteDigits d10s
CROSS JOIN cteDigits d100s /* add more like this as needed */
--CROSS JOIN cteDigits d1000s /* add more like this as needed */
)
SELECT
0 run_id
, 0 tran_id
, 0 sort_id
, ' ' tran_type
, 10 prod_id
, 72 type_id
, 1 value
, convert(varchar(23), CA.Event_publication, 121) Event_publication
, datepart(day,CA.Event_publication) dd
, datepart(month,CA.Event_publication) mm
, datepart(year,CA.Event_publication) yyyy
, datepart(hour,CA.Event_publication) hh24
, datepart(minute,CA.Event_publication) mi
, datepart(second,CA.Event_publication) ss
, 0 ms
FROM cteTally
CROSS JOIN settlement_win sw
INNER JOIN prod_def pd ON sw.country = pd.country AND sw.commodity = pd.commodity
CROSS APPLY (
SELECT DATEADD(day,rownum-1,CONVERT(datetime, sw.end_time)) AS event_publication ) CA
WHERE pd.prod_id = 10
AND rownum <= datediff(day,'20160318','20160318') + 1
NB: it may be necessary to include datediff(day,'19000101','20160318') (which equals 42445) in the calculation of the event_date, e.g.
SELECT DATEADD(day,42445 + (rownum-1),CONVERT(datetime, sw.end_time)) AS event_publication
One last point: you could use datetime2 instead of datetime if you really do need a greater degree of time precision, but there is no apparent requirement for that here.
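For reference, datetime rounds to roughly 1/300th of a second while datetime2(7) keeps 100-nanosecond precision; a minimal sketch of the difference:
SELECT CAST(SYSDATETIME() AS datetime)     AS dt,  -- fractional part rounded to .000/.003/.007
       CAST(SYSDATETIME() AS datetime2(7)) AS dt2; -- seven fractional digits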

Related

Hive query takes forever on Superset

I have a query that was written in Presto SQL format (100 lines, inserting a query result into a table that already exists), and it takes about 10 minutes to get the result.
Now I am going to use Airflow, so I need to change the query to Hive SQL format to append the previous month's data. There is no error, but it has been running for 75+ minutes and still has not returned a result.
Shall I 'stop' it, or is there anything else to consider?
SET hive.limit.query.max.table.partition = 1000000;
INSERT INTO TABLE schema.temp_tbl partition(year_month_key)
Select
distinct
tbl.account_id,
tbl.theme_status,
streaming.streaming_hours,
tbl.year_month as year_month_key
From
(
Select
tbl_0.year_month,
tbl_0.account_id,
case when max(tbl_0.theme_status) = 1 then 'With Theme' else 'No Theme' end as theme_status
From
(Select
streaming.year_month,
streaming.account_id,
case when theme_events.account_id is not null then 1 else 0 end as theme_status
from
(
Select
substring(date_key, 1, 7) as year_month,
last_day(add_months(date_key, -1)) as year_month_ed,
date_key,
upper(account_id) as account_id,
play_seconds
from agg_device_streaming_metrics_daily
Where date_key between date_add(last_day(add_months(current_date, -2)),1) and last_day(add_months(current_date, -1))
and play_seconds > 0
) streaming
left join
(
Select
upper(theme.virtualuserid) as account_id,
min(theme.createddate) as min_createddate,
min(theme.date_key) as date_key
From
(
select * from theme_activate_event_history
where date_key between '2019-01-01' and '2020-01-01'
and activate = 'true' and themetype in ('ThemeBundle','ScreenSaver','Skin','Audio')
union
select * from theme_activate_event_history
where date_key between '2020-01-01' and '2021-01-01'
and activate = 'true' and themetype in ('ThemeBundle','ScreenSaver','Skin','Audio')
union
select * from theme_activate_event_history
where date_key between '2021-01-01' and '2022-01-01'
and activate = 'true' and themetype in ('ThemeBundle','ScreenSaver','Skin','Audio')
union
select * from theme_activate_event_history
where date_key between cast('2022-01-01' as date) and last_day(add_months(current_date, -1))
and activate = 'true' and themetype in ('ThemeBundle','ScreenSaver','Skin','Audio')
) theme
group by theme.virtualuserid
) theme_events
on streaming.account_id = theme_events.account_id
and date(theme_events.date_key) <= date(streaming.year_month_ed)
) tbl_0
group by tbl_0.year_month, tbl_0.account_id
) tbl
inner join
(Select
substring(date_key, 1, 7) as year_month,
upper(account_id) as account_id,
cast(sum(play_seconds) / 3600 as double) as streaming_hours
from agg_device_streaming_metrics_daily
Where date_key between date_add(last_day(add_months(current_date, -2)),1) and last_day(add_months(current_date, -1))
and play_seconds > 0
group by substring(date_key, 1, 7), upper(account_id)
) streaming
on tbl.account_id = streaming.account_id and tbl.year_month = streaming.year_month;

SQL - '1' IF hour in month EXISTS, '0' IF NOT EXISTS

I have a table that has aggregations down to the hour level YYYYMMDDHH. The data is aggregated and loaded by an external process (I don't have control over). I want to test the data on a monthly basis.
The question I am looking to answer is: Does every hour in the month exist?
I'm looking to produce output that will return a 1 if the hour exists or 0 if the hour does not exist.
The aggregation table looks something like this...
YYYYMM YYYYMMDD YYYYMMDDHH DATA_AGG
201911 20191101 2019110100 100
201911 20191101 2019110101 125
201911 20191101 2019110103 135
201911 20191101 2019110105 95
… … … …
201911 20191130 2019113020 100
201911 20191130 2019113021 110
201911 20191130 2019113022 125
201911 20191130 2019113023 135
And defined as...
CREATE TABLE YYYYMMDDHH_DATA_AGG (
YYYYMM VARCHAR,
YYYYMMDD VARCHAR,
YYYYMMDDHH VARCHAR,
DATA_AGG INT
);
I'm looking to produce the following below...
YYYYMMDDHH HOUR_EXISTS
2019110100 1
2019110101 1
2019110102 0
2019110103 1
2019110104 0
2019110105 1
... ...
In the example above, two hours do not exist, 2019110102 and 2019110104.
I assume I'd have to join the aggregation table against a computed table that contains all the YYYYMMDDHH combos?
The database is Snowflake, but assume most generic ANSI SQL queries will work.
You can get what you want with a recursive CTE.
The recursive CTE generates the list of possible hours, and then a simple left outer join gets you the flag for whether any records match that hour.
WITH RECURSIVE CTE (YYYYMMDDHH) as
(
SELECT YYYYMMDDHH
FROM YYYYMMDDHH_DATA_AGG
WHERE YYYYMMDDHH = (SELECT MIN(YYYYMMDDHH) FROM YYYYMMDDHH_DATA_AGG)
UNION ALL
SELECT TO_VARCHAR(DATEADD(HOUR, 1, TO_TIMESTAMP(C.YYYYMMDDHH, 'YYYYMMDDHH')), 'YYYYMMDDHH') YYYYMMDDHH
FROM CTE C
WHERE TO_VARCHAR(DATEADD(HOUR, 1, TO_TIMESTAMP(C.YYYYMMDDHH, 'YYYYMMDDHH')), 'YYYYMMDDHH') <= (SELECT MAX(YYYYMMDDHH) FROM YYYYMMDDHH_DATA_AGG)
)
SELECT
C.YYYYMMDDHH,
IFF(A.YYYYMMDDHH IS NOT NULL, 1, 0) HOUR_EXISTS
FROM CTE C
LEFT OUTER JOIN YYYYMMDDHH_DATA_AGG A
ON C.YYYYMMDDHH = A.YYYYMMDDHH;
If your time range is too long, you'll have issues with the CTE recursing too much. You can create a table or temp table with all of the possible hours instead. For example:
CREATE OR REPLACE TEMPORARY TABLE HOURS (YYYYMMDDHH VARCHAR) AS
SELECT TO_VARCHAR(DATEADD(HOUR, SEQ4(), TO_TIMESTAMP((SELECT MIN(YYYYMMDDHH) FROM YYYYMMDDHH_DATA_AGG), 'YYYYMMDDHH')), 'YYYYMMDDHH')
FROM TABLE(GENERATOR(ROWCOUNT => 10000)) V
ORDER BY 1;
SELECT
H.YYYYMMDDHH,
IFF(A.YYYYMMDDHH IS NOT NULL, 1, 0) HOUR_EXISTS
FROM HOURS H
LEFT OUTER JOIN YYYYMMDDHH_DATA_AGG A
ON H.YYYYMMDDHH = A.YYYYMMDDHH
WHERE H.YYYYMMDDHH <= (SELECT MAX(YYYYMMDDHH) FROM YYYYMMDDHH_DATA_AGG);
You can then fiddle with the generator count to make sure you have enough hours.
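Since GENERATOR takes a constant row count, one way to size it is to compute the span first and over-provision from there (a sketch against the same table):
SELECT DATEDIFF(HOUR,
                TO_TIMESTAMP(MIN(YYYYMMDDHH), 'YYYYMMDDHH'),
                TO_TIMESTAMP(MAX(YYYYMMDDHH), 'YYYYMMDDHH')) + 1 AS hours_needed
FROM YYYYMMDDHH_DATA_AGG;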
You can generate a table with every hour of the month and LEFT OUTER JOIN your aggregation to it:
WITH EVERY_HOUR AS (
SELECT TO_CHAR(DATEADD(HOUR, HH, TO_DATE(YYYYMM::TEXT, 'YYYYMM')),
'YYYYMMDDHH')::NUMBER YYYYMMDDHH
FROM (SELECT DISTINCT YYYYMM FROM YYYYMMDDHH_DATA_AGG) t
CROSS JOIN (
SELECT ROW_NUMBER() OVER (ORDER BY NULL) - 1 HH
FROM TABLE(GENERATOR(ROWCOUNT => 745))
) h
QUALIFY YYYYMMDDHH < (YYYYMM + 1) * 10000
)
SELECT h.YYYYMMDDHH, NVL2(a.YYYYMM, 1, 0) HOUR_EXISTS
FROM EVERY_HOUR h
LEFT OUTER JOIN YYYYMMDDHH_DATA_AGG a ON a.YYYYMMDDHH = h.YYYYMMDDHH
Here's something that might help get you started. I'm guessing you want to have 'synthetic' [YYYYMMDD] values? Otherwise, if the values aren't there, then they shouldn't appear in the list.
DROP TABLE IF EXISTS #_hours
DROP TABLE IF EXISTS #_temp
--Populate a table with hours ranging from 00 to 23
CREATE TABLE #_hours ([hour_value] VARCHAR(2))
DECLARE @_i INT = 0
WHILE (@_i < 24)
BEGIN
INSERT INTO #_hours
SELECT FORMAT(@_i, '0#')
SET @_i += 1
END
-- Replicate OP's sample data set
CREATE TABLE #_temp (
[YYYYMM] INTEGER
, [YYYYMMDD] INTEGER
, [YYYYMMDDHH] INTEGER
, [DATA_AGG] INTEGER
)
INSERT INTO #_temp
VALUES
(201911, 20191101, 2019110100, 100),
(201911, 20191101, 2019110101, 125),
(201911, 20191101, 2019110103, 135),
(201911, 20191101, 2019110105, 95),
(201911, 20191130, 2019113020, 100),
(201911, 20191130, 2019113021, 110),
(201911, 20191130, 2019113022, 125),
(201911, 20191130, 2019113023, 135)
SELECT X.YYYYMM, X.YYYYMMDD, X.YYYYMMDDHH
-- Case: If 'target_hours' doesn't exist, then 0, else 1
, CASE WHEN X.target_hours IS NULL THEN '0' ELSE '1' END AS [HOUR_EXISTS]
FROM (
-- Select right 2 characters from converted [YYYYMMDDHH] to act as 'target values'
SELECT T.*
, RIGHT(CAST(T.[YYYYMMDDHH] AS VARCHAR(10)), 2) AS [target_hours]
FROM #_temp AS T
) AS X
-- Right join to keep all of our hours and only the target hours that match.
RIGHT JOIN #_hours AS H ON H.hour_value = X.target_hours
Sample output:
YYYYMM YYYYMMDD YYYYMMDDHH HOUR_EXISTS
201911 20191101 2019110100 1
201911 20191101 2019110101 1
NULL NULL NULL 0
201911 20191101 2019110103 1
NULL NULL NULL 0
201911 20191101 2019110105 1
NULL NULL NULL 0
With (almost) standard sql, you can do a cross join of the distinct values of YYYYMMDD to a list of all possible hours and then left join to the table:
select concat(d.YYYYMMDD, h.hour) as YYYYMMDDHH,
case when t.YYYYMMDDHH is null then 0 else 1 end as hour_exists
from (select distinct YYYYMMDD from tablename) as d
cross join (
select '00' as hour union all select '01' union all
select '02' union all select '03' union all
select '04' union all select '05' union all
select '06' union all select '07' union all
select '08' union all select '09' union all
select '10' union all select '11' union all
select '12' union all select '13' union all
select '14' union all select '15' union all
select '16' union all select '17' union all
select '18' union all select '19' union all
select '20' union all select '21' union all
select '22' union all select '23'
) as h
left join tablename as t
on concat(d.YYYYMMDD, h.hour) = t.YYYYMMDDHH
order by concat(d.YYYYMMDD, h.hour)
Maybe in Snowflake you can construct the list of hours with a sequence much easier instead of all those UNION ALLs.
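For instance, a sketch of the 24-hour list in Snowflake, reusing the generator pattern from the answers above:
SELECT LPAD(TO_CHAR(ROW_NUMBER() OVER (ORDER BY NULL) - 1), 2, '0') AS hour
FROM TABLE(GENERATOR(ROWCOUNT => 24)); -- '00' through '23'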
This version accounts for the full range of days, across months and years. It's a simple cross join of the set of possible days with the set of possible hours of the day -- left joined to actual dates.
set first = (select min(yyyymmdd::number) from YYYYMMDDHH_DATA_AGG);
set last = (select max(yyyymmdd::number) from YYYYMMDDHH_DATA_AGG);
with
hours as (select row_number() over (order by null) - 1 h from table(generator(rowcount=>24))),
days as (
select
row_number() over (order by null) - 1 as n,
to_date($first::text, 'YYYYMMDD')::date + n as d,
to_char(d, 'YYYYMMDD') as yyyymmdd
from table(generator(rowcount=>($last-$first+1)))
)
select days.yyyymmdd || lpad(hours.h,2,0) as YYYYMMDDHH, nvl2(t.yyyymmddhh,1,0) as HOUR_EXISTS
from days cross join hours
left join YYYYMMDDHH_DATA_AGG t on t.yyyymmddhh = days.yyyymmdd || lpad(hours.h,2,0)
order by 1
;
$first and $last can be packed in as sub-queries if you prefer.
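A sketch of that variant (the bounds/nums names are illustrative); GENERATOR still needs a constant row count, so over-provision it and filter against the min/max subquery:
with bounds as (
    select min(yyyymmdd) as d_min, max(yyyymmdd) as d_max
    from YYYYMMDDHH_DATA_AGG
),
nums as (
    select row_number() over (order by null) - 1 as n
    from table(generator(rowcount => 100000)) -- over-provisioned
),
days as (
    select to_date(b.d_min, 'YYYYMMDD') + n as d
    from bounds b
    cross join nums
    where n <= datediff(day, to_date(b.d_min, 'YYYYMMDD'), to_date(b.d_max, 'YYYYMMDD'))
)
select to_char(d, 'YYYYMMDD') as yyyymmdd
from days
order by 1;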

SQL - Select values from a table based on dates using incrementing dates

I have a SQL table of dates (MM/DD format), targets, and levels, as such:
Date Target Level
10/2 1000 1
10/4 2000 1
10/7 2000 2
I want to use those dates as tiers, or checkpoints, for when to use the respective targets and levels. So, anything on or after those dates (until the next date) would use that target/level. Anything before the first date just uses the values from the first date.
I want to select a 5-week range of dates determined by the current day (3 weeks back from today, to 2 weeks forward from today) and fill in the targets and levels accordingly, as such:
Date Target Level
10/1 1000 1
10/2 1000 1
10/3 1000 1
10/4 2000 1
10/5 2000 1
10/6 2000 1
10/7 2000 2
10/8 2000 2
...
11/5 2000 2
How do I go about:
Selecting the range of dates (as efficiently as possible)
Filling in the range of dates with the respective target/level from the appropriate date in my table?
Thank you.
You can do this using outer apply. The following creates a list of dates using a recursive CTE:
with d as (
select cast(getdate() as date) as dte
union all
select dateadd(day, -1, dte)
from d
where dte >= getdate() - 30
)
select d.dte, t.target, t.level
from d outer apply
(select top 1 t.*
from t
where d.dte >= t.dte
order by t.dte desc
) t;
You can use a CTE to generate your 'missing' dates, then use a CROSS APPLY to obtain the target and level that was last active (by querying the TOP 1 DESC where the date is on or before the current date). Finally, I introduced the 'maximum date' as a variable:
DECLARE @MAXD as DATETIME = '20161105';
WITH DATS AS (SELECT MIN([Date]) D FROM dbo.YourTab
UNION ALL
SELECT dateadd(day,1,D) FROM DATS WHERE D < @MAXD)
select DATS.D, CA.Target, CA.Level from DATS
CROSS APPLY
(SELECT TOP 1 Y.Target, Y.Level FROM YourTab Y
WHERE
Y.[Date] <= DATS.D
ORDER BY Y.Date DESC) CA
option (maxrecursion 0);
I made a bit of a change with the dates, to go back 3 weeks and forward 2 weeks. I also switched to OUTER APPLY to handle dates with no data in force:
DECLARE @MIND as DATETIME = dateadd(week,-3,cast(getdate() as date));
DECLARE @MAXD as DATETIME = dateadd(week, 5,@MIND);
WITH DATS AS (SELECT @MIND D
UNION ALL
SELECT dateadd(day,1,D) FROM DATS WHERE D < @MAXD)
select DATS.D, CA.Target, CA.Level from DATS
OUTER APPLY
(SELECT TOP 1 Y.Target, Y.Level FROM YourTab Y WHERE Y.[Date] <= DATS.D ORDER BY Y.Date DESC) CA
ORDER BY DATS.D
option (maxrecursion 0);
Final change: if there is no earlier value for the date, take the first future row:
DECLARE @MIND as DATETIME = dateadd(week,-3,cast(getdate() as date));
DECLARE @MAXD as DATETIME = dateadd(week, 5,@MIND);
WITH DATS AS (SELECT @MIND D
UNION ALL
SELECT dateadd(day,1,D) FROM DATS WHERE D < @MAXD)
select DATS.D, COALESCE(CA.Target, MQ.Target) Target , COALESCE(CA.Level, MQ.Level) Level from DATS
OUTER APPLY
(SELECT TOP 1 Y.Target, Y.Level FROM YourTab Y WHERE Y.[Date] <= DATS.D ORDER BY Y.Date DESC) CA
OUTER APPLY
(
SELECT TOP 1 M.Target, M.Level FROM YourTab M ORDER BY M.[Date] ASC
) MQ
ORDER BY DATS.D
option (maxrecursion 0);
I don't know why you store dates as MM/DD, but you need some conversion into the right data type. This should do the trick:
;WITH YourTable AS (
SELECT *
FROM (VALUES
('10/2', 1000, 1),
('10/4', 2000, 1),
('10/7', 2000, 2)
) as t([Date], [Target], [Level])
), dates_cte AS ( --this CTE is generating dates you need
SELECT DATEADD(week,-3,GETDATE()) as d --3 weeks back
UNION ALL
SELECT dateadd(day,1,d)
FROM dates_cte
WHERE d < DATEADD(week,2,GETDATE()) --2 weeks forward
)
SELECT REPLACE(CONVERT(nvarchar(5),d,101),'/0','/') as [Date],
COALESCE(t.[Target],t1.[Target]) [Target],
COALESCE(t.[Level],t1.[Level]) [Level]
FROM dates_cte dc
OUTER APPLY ( --Here we got PREVIOUS values
SELECT TOP 1 *
FROM YourTable
WHERE CONVERT(datetime,REPLACE([Date],'/','/0')+'/2016',101) <= dc.d
ORDER BY CONVERT(datetime,REPLACE([Date],'/','/0')+'/2016',101) DESC
) t
OUTER APPLY ( --Here we got NEXT values and use them if there is no PREV
SELECT TOP 1 *
FROM YourTable
WHERE CONVERT(datetime,REPLACE([Date],'/','/0')+'/2016',101) >= dc.d
ORDER BY CONVERT(datetime,REPLACE([Date],'/','/0')+'/2016',101) ASC
) t1
Output:
Date Target Level
10/5 2000 1
10/6 2000 1
10/7 2000 2
10/8 2000 2
10/9 2000 2
10/10 2000 2
10/11 2000 2
10/12 2000 2
...
11/9 2000 2
EDIT
With Categories:
;WITH YourTable AS (
SELECT *
FROM (VALUES
('10/2', 1000, 1, 'A'),
('10/4', 3000, 1, 'B'),
('10/7', 2000, 2, 'A')
) as t([Date], [Target], [Level], [Category])
), dates_cte AS (
SELECT DATEADD(week,-3,GETDATE()) as d
UNION ALL
SELECT dateadd(day,1,d)
FROM dates_cte
WHERE d < DATEADD(week,2,GETDATE())
)
SELECT REPLACE(CONVERT(nvarchar(5),d,101),'/0','/') as [Date],
COALESCE(t.[Target],t1.[Target]) [Target],
COALESCE(t.[Level],t1.[Level]) [Level],
c.Category
FROM dates_cte dc
CROSS JOIN (
SELECT DISTINCT Category
FROM YourTable
) c
OUTER APPLY (
SELECT TOP 1 *
FROM YourTable
WHERE CONVERT(datetime,REPLACE([Date],'/','/0')+'/2016',101) <= dc.d
AND c.Category = Category
ORDER BY CONVERT(datetime,REPLACE([Date],'/','/0')+'/2016',101) DESC
) t
OUTER APPLY (
SELECT TOP 1 *
FROM YourTable
WHERE CONVERT(datetime,REPLACE([Date],'/','/0')+'/2016',101) >= dc.d
AND c.Category = Category
ORDER BY CONVERT(datetime,REPLACE([Date],'/','/0')+'/2016',101) ASC
) t1
ORDER BY c.Category, d
Not sure if I'm over simplifying this, but:
select min(X.Date) Date_Range_Start, max(X.date) Date_Range_End
, V.<value_date>
, isnull(X.Target, 'Out of range') Target
, isnull(X.Level, 'Out of range') Level
from X --replace this with your table
left join <value_table> V --table with dates to be assessed
on V.<Date> between X.Date_Range_Start and X.Date_Range_End
group by Target, Level, V.<value_date>

SQL Query Help (Advanced - for me!)

I have a question about a SQL query I am trying to write.
I need to query data from a database.
The database has, amongst others, these 3 fields:
Account_ID #, Date_Created, Time_Created
I need to write a query that tells me how many accounts were opened per hour.
I have written said query, but there are times that there were 0 accounts created, so these "hours" are not populated in the results.
For example:
Volume Date__Hour
435 12-Aug-12 03
213 12-Aug-12 04
125 12-Aug-12 06
As seen in the example above, hour 5 did not have any accounts opened.
Is there a way that the result can populate the hour and display 0 accounts opened for that hour?
Example of how I want my results to look like:
Volume Date_Hour
435 12-Aug-12 03
213 12-Aug-12 04
0 12-Aug-12 05
125 12-Aug-12 06
Thanks!
Update: This is what I have so far
SELECT count(*) as num_apps, to_date(created_ts,'DD-Mon-RR') as app_date, to_char(created_ts,'HH24') as app_hour
FROM accounts
WHERE To_Date(created_ts,'DD-Mon-RR') >= To_Date('16-Aug-12','DD-Mon-RR')
GROUP BY To_Date(created_ts,'DD-Mon-RR'), To_Char(created_ts,'HH24')
ORDER BY app_date, app_hour
To get the results you want, you will need to create a table (or use a query to generate a "temp" table) and then use a left join to your calculation query to get rows for every hour - even those with 0 volume.
For example, assume I have a table with app_date and app_hour fields. Also assume that this table has a row for every day/hour you wish to report on.
The query would be:
SELECT NVL(c.num_apps,0) as num_apps, t.app_date, t.app_hour
FROM time_table t
LEFT OUTER JOIN
(
SELECT count(*) as num_apps, to_date(created_ts,'DD-Mon-RR') as app_date, to_char(created_ts,'HH24') as app_hour
FROM accounts
WHERE To_Date(created_ts,'DD-Mon-RR') >= To_Date('16-Aug-12','DD-Mon-RR')
GROUP BY To_Date(created_ts,'DD-Mon-RR'), To_Char(created_ts,'HH24')
ORDER BY app_date, app_hour
) c ON (t.app_date = c.app_date AND t.app_hour = c.app_hour)
I believe the best solution is not to create some fancy temporary table but just use this construct:
select level
FROM Dual
CONNECT BY level <= 10
ORDER BY level;
This will give you (in ten rows):
1
2
3
4
5
6
7
8
9
10
For an hours interval, just a little modification:
select 0 as num_apps, (To_Date('16-09-12','DD-MM-RR') + level / 24) as created_ts
FROM dual
CONNECT BY level <= (sysdate - To_Date('16-09-12','DD-MM-RR')) * 24 ;
And just for the fun of it, adding a solution for you (I didn't try the syntax, so I'm sorry for any mistakes, but the idea is clear):
SELECT SUM(num_apps) as num_apps, to_date(created_ts,'DD-Mon-RR') as app_date, to_char(created_ts,'HH24') as app_hour
FROM(
SELECT 1 as num_apps, created_ts
FROM accounts
WHERE To_Date(created_ts,'DD-Mon-RR') >= To_Date('16-09-12','DD-MM-RR')
UNION ALL
select 0 as num_apps, (To_Date('16-09-12','DD-MM-RR') + level / 24) as created_ts
FROM dual
CONNECT BY level <= (sysdate - To_Date('16-09-12','DD-MM-RR')) * 24
)
GROUP BY To_Date(created_ts,'DD-Mon-RR'), To_Char(created_ts,'HH24')
ORDER BY app_date, app_hour
;
You can also use a CASE statement in the SELECT to force the value you want.
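For instance, in place of the NVL() in the first query above (a sketch):
SELECT CASE WHEN c.num_apps IS NULL THEN 0 ELSE c.num_apps END AS num_apps,
       t.app_date, t.app_hour
FROM time_table t
LEFT OUTER JOIN ( /* same grouped subquery as above */ ) c
ON (t.app_date = c.app_date AND t.app_hour = c.app_hour)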
It can be useful to have a "sequence table" kicking around, for all sorts of reasons, something that looks like this:
create table dbo.sequence
(
id int not null primary key clustered
)
Load it up with a million or so rows, covering positive and negative values.
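One way to load it, as a sketch (any sufficiently large row source will do; sys.all_columns is just a convenient seed):
insert into dbo.sequence ( id )
select top (2000001)
       row_number() over ( order by (select null) ) - 1000001
from sys.all_columns c1
cross join sys.all_columns c2 ; -- ids -1,000,000 through +1,000,000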
Then, given a table that looks like this
create table dbo.SomeTable
(
account_id int not null primary key clustered ,
date_created date not null ,
time_created time not null ,
)
Your query is then as simple as (in SQL Server):
select year_created = years.id ,
month_created = months.id ,
day_created = days.id ,
hour_created = hours.id ,
volume = t.volume
from ( select * ,
is_leap_year = case
when id % 400 = 0 then 1
when id % 100 = 0 then 0
when id % 4 = 0 then 1
else 0
end
from dbo.sequence
where id between 1980 and year(current_timestamp)
) years
cross join ( select *
from dbo.sequence
where id between 1 and 12
) months
left join ( select *
from dbo.sequence
where id between 1 and 31
) days on days.id <= case months.id
when 2 then 28 + years.is_leap_year
when 4 then 30
when 6 then 30
when 9 then 30
when 11 then 30
else 31
end
cross join ( select *
from dbo.sequence
where id between 0 and 23
) hours
left join ( select date_created ,
hour_created = datepart(hour,time_created ) ,
volume = count(*)
from dbo.SomeTable
group by date_created ,
datepart(hour,time_created)
) t on datepart( year , t.date_created ) = years.id
and datepart( month , t.date_created ) = months.id
and datepart( day , t.date_created ) = days.id
and t.hour_created = hours.id
order by 1,2,3,4
It's not clear to me if created_ts is a datetime or a varchar. If it's a datetime, you shouldn't use to_date; if it's a varchar, you shouldn't use to_char.
Assuming it's a datetime, and borrowing @jakub.petr's FROM Dual CONNECT BY level trick, I suggest:
SELECT count(a.created_ts) as num_apps, to_char(a.created_ts,'DD-Mon-RR') as app_date, h.hour as app_hour
FROM (select level-1 as hour FROM Dual CONNECT BY level <= 24) h
LEFT JOIN accounts a on h.hour = to_number(to_char(a.created_ts,'HH24'))
and a.created_ts >= To_Date('16-Aug-12','DD-Mon-RR')
GROUP BY to_char(a.created_ts,'DD-Mon-RR'), h.hour
ORDER BY app_date, app_hour

SQL moving average

How do you create a moving average in SQL?
Current table:
Date Clicks
2012-05-01 2,230
2012-05-02 3,150
2012-05-03 5,520
2012-05-04 1,330
2012-05-05 2,260
2012-05-06 3,540
2012-05-07 2,330
Desired table or output:
Date Clicks 3 day Moving Average
2012-05-01 2,230
2012-05-02 3,150
2012-05-03 5,520 4,360
2012-05-04 1,330 3,330
2012-05-05 2,260 3,120
2012-05-06 3,540 3,320
2012-05-07 2,330 3,010
This is an evergreen Joe Celko question.
It isn't stated which DBMS platform is used, but in any case Joe was able to answer it more than 10 years ago with standard SQL.
Joe Celko SQL Puzzles and Answers citation:
"That last update attempt suggests that we could use the predicate to
construct a query that would give us a moving average:"
SELECT S1.sample_time, AVG(S2.load) AS avg_prev_hour_load
FROM Samples AS S1, Samples AS S2
WHERE S2.sample_time
BETWEEN (S1.sample_time - INTERVAL 1 HOUR)
AND S1.sample_time
GROUP BY S1.sample_time;
Is the extra column or the query approach better? The query is
technically better because the UPDATE approach will denormalize the
database. However, if the historical data being recorded is not going
to change and computing the moving average is expensive, you might
consider using the column approach.
MS SQL Example:
CREATE TABLE #TestDW
( Date1 datetime,
LoadValue Numeric(13,6)
);
INSERT INTO #TestDW VALUES('2012-06-09' , '3.540' );
INSERT INTO #TestDW VALUES('2012-06-08' , '2.260' );
INSERT INTO #TestDW VALUES('2012-06-07' , '1.330' );
INSERT INTO #TestDW VALUES('2012-06-06' , '5.520' );
INSERT INTO #TestDW VALUES('2012-06-05' , '3.150' );
INSERT INTO #TestDW VALUES('2012-06-04' , '2.230' );
SQL Puzzle query:
SELECT S1.date1, AVG(S2.LoadValue) AS avg_prev_3_days
FROM #TestDW AS S1, #TestDW AS S2
WHERE S2.date1
BETWEEN DATEADD(d, -2, S1.date1 )
AND S1.date1
GROUP BY S1.date1
order by 1;
One way to do this is to join the table to itself a few times.
select
(Current.Clicks
+ isnull(P1.Clicks, 0)
+ isnull(P2.Clicks, 0)
+ isnull(P3.Clicks, 0)) / 4 as MovingAvg3
from
MyTable as Current
left join MyTable as P1 on P1.Date = DateAdd(day, -1, Current.Date)
left join MyTable as P2 on P2.Date = DateAdd(day, -2, Current.Date)
left join MyTable as P3 on P3.Date = DateAdd(day, -3, Current.Date)
Adjust the DateAdd component of the ON-Clauses to match whether you want your moving average to be strictly from the past-through-now or days-ago through days-ahead.
This works nicely for situations where you need a moving average over only a few data points.
This is not an optimal solution for moving averages with more than a few data points.
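On SQL Server 2012 and later, a window frame avoids the self-joins entirely; a sketch against the same MyTable, written as a strict 3-row trailing average:
select [Date],
       Clicks,
       avg(1.0 * Clicks) over (order by [Date]
                               rows between 2 preceding and current row) as MovingAvg3 -- 1.0 * avoids integer division
from MyTable;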
select t2.date, round(sum(ct.clicks)/3) as avg_clicks
from
(select date from clickstable) as t2,
(select date, clicks from clickstable) as ct
where datediff(t2.date, ct.date) between 0 and 2
group by t2.date
Obviously you can change the interval to whatever you need. You could also use count() instead of a magic number to make it easier to change, but that will also slow it down.
General template for rolling averages that scales well for large data sets
WITH moving_avg AS (
SELECT 0 AS [lag] UNION ALL
SELECT 1 AS [lag] UNION ALL
SELECT 2 AS [lag] UNION ALL
SELECT 3 AS [lag] --ETC
)
SELECT
DATEADD(day,[lag],[date]) AS [reference_date],
[otherkey1],[otherkey2],[otherkey3],
AVG([value1]) AS [avg_value1],
AVG([value2]) AS [avg_value2]
FROM [data_table]
CROSS JOIN moving_avg
GROUP BY [otherkey1],[otherkey2],[otherkey3],DATEADD(day,[lag],[date])
ORDER BY [otherkey1],[otherkey2],[otherkey3],[reference_date];
And for weighted rolling averages:
WITH weighted_avg AS (
SELECT 0 AS [lag], 1.0 AS [weight] UNION ALL
SELECT 1 AS [lag], 0.6 AS [weight] UNION ALL
SELECT 2 AS [lag], 0.3 AS [weight] UNION ALL
SELECT 3 AS [lag], 0.1 AS [weight] --ETC
)
SELECT
DATEADD(day,[lag],[date]) AS [reference_date],
[otherkey1],[otherkey2],[otherkey3],
AVG([value1] * [weight]) / AVG([weight]) AS [wavg_value1],
AVG([value2] * [weight]) / AVG([weight]) AS [wavg_value2]
FROM [data_table]
CROSS JOIN weighted_avg
GROUP BY [otherkey1],[otherkey2],[otherkey3],DATEADD(day,[lag],[date])
ORDER BY [otherkey1],[otherkey2],[otherkey3],[reference_date];
select *
, (select avg(c2.clicks) from #clicks_table c2
where c2.date between dateadd(dd, -2, c1.date) and c1.date) mov_avg
from #clicks_table c1
Use a different join predicate:
SELECT current.date
,avg(periods.clicks)
FROM current left outer join current as periods
ON current.date BETWEEN dateadd(d,-2, periods.date) AND periods.date
GROUP BY current.date HAVING COUNT(*) >= 3
The having statement will prevent any dates without at least N values from being returned.
Assume x is the value to be averaged, xDate is the date value, and @asOfDate is the day you want the trailing average for:
SELECT avg(x) from myTable WHERE xDate BETWEEN dateadd(d, -2, @asOfDate) and @asOfDate
In hive, maybe you could try
select date, clicks, avg(clicks) over (order by date rows between 2 preceding and current row) as moving_avg from clicktable;
For the purpose, I'd like to create an auxiliary/dimensional date table like
create table date_dim(date date, date_1 date, date_2 date, date_3 date ...)
where date is the key: date_1 is for this day, date_2 covers this day and the day before; date_3 ...
Then you can do the equi-join in Hive.
Using a view like:
select date, date as covered_date from date_dim
union all
select date, date_add(date, -1) as covered_date from date_dim
union all
select date, date_add(date, -2) as covered_date from date_dim
union all
select date, date_add(date, -3) as covered_date from date_dim
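With that view in place (call it date_dim_view; the view name and the clicktable(date, clicks) columns are assumptions), the moving average becomes a plain equi-join:
select v.date, avg(c.clicks) as moving_avg
from date_dim_view v
join clicktable c on c.date = v.covered_date
group by v.date;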
NOTE: THIS IS NOT AN ANSWER but an enhanced code sample of Diego Scaravaggi's answer. I am posting it as an answer as the comment section is insufficient. Note that I have parameterized the period for the moving average.
declare @p int = 3
declare @t table(d int, bal float)
insert into @t values
(1,94),
(2,99),
(3,76),
(4,74),
(5,48),
(6,55),
(7,90),
(8,77),
(9,16),
(10,19),
(11,66),
(12,47)
select a.d, avg(b.bal)
from
@t a
left join @t b on b.d between a.d-(@p-1) and a.d
group by a.d
--@p1 is the period of the moving average, @o1 is the offset
declare @p1 as int
declare @o1 as int
set @p1 = 5;
set @o1 = 3;
with np as(
select *, rank() over(partition by cmdty, tenor order by markdt) as r
from p_prices p1
where
1=1
)
, x1 as (
select s1.*, avg(s2.val) as avgval from np s1
inner join np s2
on s1.cmdty = s2.cmdty and s1.tenor = s2.tenor
and s2.r between s1.r - (@p1 - 1) - (@o1) and s1.r - (@o1)
group by s1.cmdty, s1.tenor, s1.markdt, s1.val, s1.r
)
select * from x1;
I'm not sure that your expected result (output) shows a classic "simple moving (rolling) average" for 3 days, because, for example, the first triple of numbers by definition gives:
ThreeDaysMovingAverage = (2.230 + 3.150 + 5.520) / 3 = 3.6333333
but you expect 4.360, which is confusing.
Nevertheless, I suggest the following solution, which uses the window function AVG. This approach is much clearer and less resource-intensive than the self-joins introduced in other answers (and I'm surprised that no one has given a better solution).
-- Oracle-SQL dialect
with
data_table as (
select date '2012-05-01' AS dt, 2.230 AS clicks from dual union all
select date '2012-05-02' AS dt, 3.150 AS clicks from dual union all
select date '2012-05-03' AS dt, 5.520 AS clicks from dual union all
select date '2012-05-04' AS dt, 1.330 AS clicks from dual union all
select date '2012-05-05' AS dt, 2.260 AS clicks from dual union all
select date '2012-05-06' AS dt, 3.540 AS clicks from dual union all
select date '2012-05-07' AS dt, 2.330 AS clicks from dual
),
param as (select 3 days from dual)
select
dt AS "Date",
clicks AS "Clicks",
case when rownum >= p.days then
avg(clicks) over (order by dt
rows between p.days - 1 preceding and current row)
end
AS "3 day Moving Average"
from data_table t, param p;
You see that AVG is wrapped with case when rownum >= p.days then to force NULLs in first rows, where "3 day Moving Average" is meaningless.
We can apply Joe Celko's "dirty" left outer join method (as cited above by Diego Scaravaggi) to answer the question as it was asked.
declare @ClicksTable table ([Date] date, Clicks int)
insert into @ClicksTable
select '2012-05-01', 2230 union all
select '2012-05-02', 3150 union all
select '2012-05-03', 5520 union all
select '2012-05-04', 1330 union all
select '2012-05-05', 2260 union all
select '2012-05-06', 3540 union all
select '2012-05-07', 2330
This query:
SELECT
T1.[Date],
T1.Clicks,
-- AVG ignores NULL values so we have to explicitly NULLify
-- the days when we don't have a full 3-day sample
CASE WHEN count(T2.[Date]) < 3 THEN NULL
ELSE AVG(T2.Clicks)
END AS [3-Day Moving Average]
FROM @ClicksTable T1
LEFT OUTER JOIN @ClicksTable T2
ON T2.[Date] BETWEEN DATEADD(d, -2, T1.[Date]) AND T1.[Date]
GROUP BY T1.[Date]
Generates the requested output:
Date Clicks 3-Day Moving Average
2012-05-01 2,230
2012-05-02 3,150
2012-05-03 5,520 4,360
2012-05-04 1,330 3,330
2012-05-05 2,260 3,120
2012-05-06 3,540 3,320
2012-05-07 2,330 3,010