Exploding time range hour-wise in BigQuery - sql

I have a table with shift start and end times. I'd like to explode it, giving each row an hour and the duration of time spent in that hour, i.e.:
Table "Shift_time":
Output Result:
I tried the solutions here (Explode time duration defined by start and end timestamp by the hour), but they gave me incorrect results.

For your requirement, you can generate an array of hours running from the starting hour to the ending hour of each row, and then calculate the minutes that fall within each of those hours. You can try the sample query below:
Query:
-- Sample shift data: one row per shift. mdate stays a string; the start and
-- end clock times are parsed into TIME values.
CREATE TEMP TABLE mdata AS (
  SELECT
    shift.mdate AS mdate,
    shift.code AS code,
    PARSE_TIME('%H:%M:%S', shift.start_text) AS start_time,
    PARSE_TIME('%H:%M:%S', shift.end_text) AS end_time
  FROM
    UNNEST([
      STRUCT('2022-12-15' AS mdate, 'abc' AS code, '2:15:00' AS start_text, '2:21:00' AS end_text),
      ('2022-12-15', 'abc', '2:45:00', '2:55:00'),
      ('2022-12-15', 'abc', '3:20:00', '7:22:00'),
      ('2022-12-15', 'xyz', '8:18:00', '9:25:00')
    ]) AS shift
);
-- Explode each shift into one row per clock hour it touches, then total the
-- minutes worked per date, code and hour.
WITH
  shift_bounds AS (
    -- Attach the clock hour in which each shift starts and ends.
    SELECT
      mdate,
      code,
      start_time,
      end_time,
      EXTRACT(HOUR FROM start_time) AS start_hr,
      EXTRACT(HOUR FROM end_time) AS end_hr
    FROM
      mdata
  ),
  shift_hours AS (
    -- One row per hour from start_hr through end_hr, inclusive.
    SELECT
      b.mdate,
      b.code,
      b.start_time,
      b.end_time,
      b.start_hr,
      b.end_hr,
      shift_hour
    FROM
      shift_bounds AS b
    CROSS JOIN
      UNNEST(GENERATE_ARRAY(b.start_hr, b.end_hr)) AS shift_hour
  ),
  hour_minutes AS (
    -- Minutes of the shift that fall inside shift_hour. The branches are
    -- mutually exclusive because shift_hour always lies in [start_hr, end_hr].
    SELECT
      mdate,
      code,
      shift_hour,
      CASE
        -- Shift starts and ends within the same hour.
        WHEN start_hr = end_hr THEN TIME_DIFF(end_time, start_time, MINUTE)
        -- First hour of a multi-hour shift: from start_time to the next full hour.
        WHEN shift_hour = start_hr THEN TIME_DIFF(TIME(shift_hour + 1, 0, 0), start_time, MINUTE)
        -- Last hour of a multi-hour shift: from the top of the hour to end_time.
        WHEN shift_hour = end_hr THEN TIME_DIFF(end_time, TIME(shift_hour, 0, 0), MINUTE)
        -- Any hour strictly between the first and the last is fully covered.
        ELSE 60
      END AS timediff
    FROM
      shift_hours
  )
SELECT
  mdate,
  code,
  shift_hour AS hour,
  SUM(timediff) AS minute
FROM
  hour_minutes
GROUP BY
  mdate,
  code,
  shift_hour
ORDER BY
  mdate,
  code,
  shift_hour
Output:

Related

Pro Rata in BigQuery [duplicate]

This question already has answers here:
SQL equally distribute value by rows
(2 answers)
Do loop in BigQuery
(1 answer)
Closed 1 year ago.
I want to pro rate a table like this:
into a table like this:
Essentially, I want to create rows for the days between date_start and date_end and then divide spend by the number of days.
I am currently using the query below to do this, using BigQuery scripting - I know this probably is a horrible way of querying this but I'm not sure how else to do it. It takes about 30 seconds to run this query for just 3 rows.
-- Pro-rate the spend of each row in pro_rata_test.data across the days from
-- date_start to date_end, handling one source row per loop iteration and
-- appending the per-day rows to pro_rata_test.pro_rata.
-- i: 1-based position of the source row handled by the current iteration.
DECLARE i INT64 DEFAULT 1;
-- n: number of source rows, i.e. the number of iterations.
DECLARE n int64;
SET n = (SELECT COUNT(*) FROM `pro_rata_test.data`);
-- Empty the target table before it is refilled by the loop.
DELETE FROM `pro_rata_test.pro_rata` WHERE TRUE;
WHILE i <= n DO
-- NOTE(review): the INSERT has no column list, so it relies on the target
-- table's column order matching the SELECT list below - confirm.
INSERT INTO
pro_rata_test.pro_rata
SELECT
day,
country,
campaign,
other,
-- Divisor: inclusive number of days between date_start and date_end for the
-- source row numbered i.
SUM(spend)/(
SELECT
DATETIME_DIFF(DATETIME(TIMESTAMP(date_end)),
DATETIME(TIMESTAMP(date_start)),
DAY) + 1
FROM (
SELECT *, ROW_NUMBER() OVER(ORDER BY date_start) AS rn FROM `pro_rata_test.data`)
WHERE
rn = i) AS spend
FROM (
-- NOTE(review): rn is computed here and, independently, in the divisor
-- subquery above; if two rows share a date_start the two numberings are not
-- guaranteed to agree - confirm date_start is unique.
SELECT *, ROW_NUMBER() OVER(ORDER BY date_start) AS rn FROM `pro_rata_test.data`),
-- One output row per day in the source row's date range.
UNNEST(GENERATE_DATE_ARRAY(date_start, date_end)) day
WHERE
rn = i
GROUP BY
day,
country,
campaign,
other
ORDER BY
day;
-- Advance to the next source row.
SET
i = i + 1;
END WHILE
Try generate_date_array and unnest:
-- Spread each row's spend evenly over every day from date_start to date_end.
with mytable as (
  -- Inline sample rows standing in for the real table.
  select *
  from unnest([
    struct(date '2021-01-01' as date_start, date '2021-01-10' as date_end, 100 as spend, 'FR' as country, 'Campaign1' as campaign, 'test1' as Other),
    (date '2021-01-11', date '2021-02-27', 150, 'UK', 'Campaign1', 'test2'),
    (date '2021-03-20', date '2021-04-20', 500, 'UK', 'Campaign2', 'test2')
  ])
)
select
  spend_day as day,
  m.country,
  m.campaign,
  m.other,
  -- date_diff excludes the first day, so add 1 to count both end points.
  m.spend / (date_diff(m.date_end, m.date_start, day) + 1) as spend
from mytable as m
cross join unnest(generate_date_array(m.date_start, m.date_end)) as spend_day
order by day

SQL Running Total By Second - TimeStamp / Value

I'm having troubles writing a query that would aggregate my results per one second. Here I'm creating an example table and make two inserts.
-- Sample table: each row carries a start timestamp, a stop timestamp and a quantity.
CREATE TABLE example (
    start TIMESTAMP,
    stop  TIMESTAMP,
    qty   INTEGER
);
-- First sample row: 09:59:59 to 10:00:04 with qty 14.
INSERT INTO example (start, stop, qty)
VALUES ('2019-06-11 09:59:59', '2019-06-11 10:00:04', 14);
-- Second sample row: 10:00:00 to 10:00:03 with qty 12.
INSERT INTO example (start, stop, qty)
VALUES ('2019-06-11 10:00:00', '2019-06-11 10:00:03', 12);
I need a query that would return me something like this:
or
Where 1,2,3,4,5 are seconds from the first 2 inserts. 09:59:59 to 10:00:04 gives 5 seconds.
and 14, 26, 26, 26, 14 is the sum of qty for the rows with the same date.
14 + 12 = 26, hence this number. This addition happens only for the seconds that occur at the same moment in both rows.
Is such a query possible?
In Oracle SQL, you could do something like this:
-- Produce one row per second between the earliest start and the latest end,
-- then total the qty of every interval that covers that second.
WITH sample_intervals AS (
    -- Inline copy of the two sample rows.
    SELECT to_date('2019-06-11 09:59:59', 'yyyy-mm-dd hh24:mi:ss') AS start_time,
           to_date('2019-06-11 10:00:04', 'yyyy-mm-dd hh24:mi:ss') AS end_time,
           14 AS qty
    FROM dual
    UNION ALL
    SELECT to_date('2019-06-11 10:00:00', 'yyyy-mm-dd hh24:mi:ss') AS start_time,
           to_date('2019-06-11 10:00:03', 'yyyy-mm-dd hh24:mi:ss') AS end_time,
           12 AS qty
    FROM dual
), overall_span AS (
    -- Number of seconds between the first start time and the last end time.
    SELECT MIN(i.start_time) AS first_start_time,
           MAX(i.end_time) AS last_end_time,
           (MAX(i.end_time) - MIN(i.start_time)) * (24*60*60) AS seconds_elapsed
    FROM sample_intervals i
), second_ticks AS (
    -- One row for each second in the overall span; LEVEL numbers them from 1.
    SELECT LEVEL AS seconds_since_start,
           s.first_start_time + ((LEVEL - 1) / (24*60*60)) AS target_time
    FROM overall_span s
    CONNECT BY LEVEL <= s.seconds_elapsed
)
SELECT k.seconds_since_start,
       COALESCE(SUM(i.qty), 0) AS total_qty_in_interval
FROM second_ticks k
-- An interval covers a tick when it has started and has not yet ended;
-- ticks covered by no interval still appear, with a total of 0.
LEFT JOIN sample_intervals i
    ON i.start_time <= k.target_time
   AND i.end_time > k.target_time
GROUP BY k.seconds_since_start
ORDER BY k.seconds_since_start
You can get the boundaries easily enough:
-- Turn every row of t into two signed events (+qty at start, -qty at stop),
-- net them per timestamp, then accumulate the nets in timestamp order.
WITH boundary_events AS (
    SELECT t.start AS ts, t.qty AS qty
    FROM t
    UNION ALL
    SELECT t.stop AS ts, -t.qty AS qty
    FROM t
),
per_timestamp AS (
    -- Net change in qty at each distinct timestamp.
    SELECT ts, SUM(qty) AS day_qty
    FROM boundary_events
    GROUP BY ts
)
SELECT
    ts,
    day_qty,
    SUM(day_qty) OVER (ORDER BY ts) AS running_qty
FROM per_timestamp;
This has all the timestamps when something starts or stops. It does not "fill in" values. The best way to do that depends on the database.
In access we have to use a workaround. See the example below.
SELECT Cur.TimeStamp,
       (SELECT SUM(Prev.Value)
        FROM Table1 AS Prev
        WHERE Prev.TimeStamp <= Cur.TimeStamp) AS Total
FROM Table1 AS Cur;

how to calculate difference between dates in BigQuery

I have a table named Employees with Columns: PersonID, Name, StartDate. I want to calculate 1) difference in days between the newest and oldest employee and 2) the longest period of time (in days) without any new hires. I have tried to use DATEDIFF, however the dates are in a single column and I'm not sure what other method I should use. Any help would be greatly appreciated
Below is for BigQuery Standard SQL
#standardSQL
-- Summarise the gaps between consecutive hire dates: their sum is the span
-- from the earliest to the latest StartDate, and the largest gap minus one is
-- reported as the longest stretch without a new hire.
WITH hire_gaps AS (
  -- Days between each StartDate and the one before it (NULL for the earliest).
  SELECT
    DATE_DIFF(
      StartDate,
      LAG(StartDate) OVER (ORDER BY StartDate),
      DAY
    ) AS days_since_previous_hire
  FROM `project.dataset.your_table`
)
SELECT
  SUM(days_since_previous_hire) AS days_between_newest_and_oldest_employee,
  MAX(days_since_previous_hire) - 1 AS longest_period_without_new_hire
FROM hire_gaps
You can test, play with above using dummy data as in the example below
#standardSQL
-- Same gap summary as the main query, run against three inline sample dates.
WITH `project.dataset.your_table` AS (
  SELECT StartDate
  FROM UNNEST([DATE '2019-01-01', DATE '2019-01-03', DATE '2019-01-13']) AS StartDate
),
hire_gaps AS (
  -- Days between each StartDate and the one before it (NULL for the earliest).
  SELECT
    DATE_DIFF(
      StartDate,
      LAG(StartDate) OVER (ORDER BY StartDate),
      DAY
    ) AS days_since_previous_hire
  FROM `project.dataset.your_table`
)
SELECT
  SUM(days_since_previous_hire) AS days_between_newest_and_oldest_employee,
  MAX(days_since_previous_hire) - 1 AS longest_period_without_new_hire
FROM hire_gaps
with result
Row days_between_newest_and_oldest_employee longest_period_without_new_hire
1 12 9
Note the use of -1 in calculating longest_period_without_new_hire - whether to apply this adjustment is really up to you, depending on how you prefer to count gaps
1) difference in days between the newest and oldest record
-- Days between the earliest and the latest ForkEvent of google/bazel-common
-- in the January 2019 daily tables whose suffix sorts before '2'.
WITH fork_events AS (
  SELECT
    DATE(created_at) AS date
  FROM
    `githubarchive.day.201901*`
  WHERE
    _table_suffix < '2'
    AND repo.name = 'google/bazel-common'
    AND type = 'ForkEvent'
)
SELECT
  DATE_DIFF(MAX(date), MIN(date), DAY) AS max_minus_min
FROM
  fork_events
2) the longest period of time (in days) without any new records
-- Largest number of days between two consecutive ForkEvent dates of
-- google/bazel-common in the January 2019 daily tables whose suffix sorts
-- before '2'.
WITH fork_events AS (
  SELECT
    DATE(created_at) AS date
  FROM
    `githubarchive.day.201901*`
  WHERE
    _table_suffix < '2'
    AND repo.name = 'google/bazel-common'
    AND type = 'ForkEvent'
),
event_gaps AS (
  -- Days since the previous event date (NULL for the first event).
  SELECT
    DATE_DIFF(date, LAG(date) OVER (ORDER BY date), DAY) AS diff
  FROM
    fork_events
)
SELECT
  MAX(diff) AS max_diff
FROM
  event_gaps

Recursive CTE in Amazon Redshift

We are trying to port some code to run on Amazon Redshift, but Redshift won't run the recursive CTE. Does any good soul know how to port this?
-- Flag each row with isValid = 1 when it opens a new group for its id (the
-- first row of the id, or a row at least 3 seconds after the row that opened
-- the current group), and 0 otherwise.
-- The RECURSIVE keyword belongs directly after WITH and applies to the whole
-- CTE list.
with recursive tt as (
    -- Number the rows of every id in time order so the recursion can step
    -- from one row to the next.
    -- NOTE(review): only id and time are carried through because they are the
    -- only columns of t this query references; add any others to this select
    -- and to both members of cte if they are needed in the output.
    select t.id, t.time,
           row_number() over (partition by t.id order by t.time) as seqnum
    from t
),
-- Redshift requires an explicit column list on a recursive CTE.
cte (id, time, seqnum, grp_start) as (
    -- Anchor: the first row of each id opens that id's first group.
    select tt.id, tt.time, tt.seqnum, tt.time as grp_start
    from tt
    where tt.seqnum = 1
    union all
    -- Step: a row within 3 seconds of the current group start stays in that
    -- group; otherwise it opens a new group starting at its own time.
    select tt.id, tt.time, tt.seqnum,
           (case when tt.time < cte.grp_start + interval '3 second'
                 then cte.grp_start
                 else tt.time
            end) as grp_start
    from cte join
         tt
         -- Match on id as well, so a row only follows the previous row of the same id.
         on tt.id = cte.id and tt.seqnum = cte.seqnum + 1
)
select cte.*,
       -- A row is valid when its group start differs from the previous row's
       -- (the first row of an id has no previous row and is therefore valid).
       (case when grp_start = lag(grp_start) over (partition by id order by time)
             then 0 else 1
        end) as isValid
from cte;
Or, a different code to reproduce the logic below.
It is a binary result that:
it is 1 if it is the first known value of an ID
it is 1 if it is 3 seconds or later than the previous "1" of that ID
it is 0 if it is less than 3 seconds after the previous "1" of that ID
Note 1: this is not the difference in seconds from the previous record
Note 2: there are many IDs in the data set
Note 3: original dataset has ID and Date
Desired output:
https://i.stack.imgur.com/k4KUQ.png
Dataset poc:
http://www.sqlfiddle.com/#!15/41d4b
As of this writing, Redshift does support recursive CTE's: see documentation here
To note when creating a recursive CTE in Redshift:
start the query: with recursive
column names must be declared for all recursive cte's
Consider the following example for creating a list of dates using recursive CTE's:
-- Generate one row per day from today (s_dt) through 1000 days from today
-- (e_dt), both inclusive.
with recursive
start_dt as (select current_date s_dt)
, end_dt as (select dateadd(day, 1000, current_date) e_dt)
-- the recursive cte, note declaration of the column `dt`
, dates (dt) as (
-- start at the start date
select s_dt dt from start_dt
union all
-- recursive lines
select dateadd(day, 1, dt)::date dt -- converted to date to avoid type mismatch
from dates
-- each step emits dt + 1, so only step while dt is still before the end date;
-- using <= here would emit one extra day past e_dt
where dt < (select e_dt from end_dt)
)
select *
from dates
The below code could help you.
-- Flag rows of hon per id using the gap to the previous row and the gap
-- before that one.
with gaps as (
    -- sec_diff: seconds part of the difference between this row's time and
    -- the previous row's time for the same id (NULL on the first row).
    -- NOTE(review): date_part('s', ...) presumably yields only the seconds
    -- field of the difference, not the total elapsed seconds - confirm for
    -- gaps of a minute or more.
    select id,
           time,
           date_part('s', time - lag(time) over (partition by id order by time asc)) as sec_diff
    from hon
),
gap_pairs as (
    -- prev_sec_diff: the sec_diff of the previous row for the same id.
    select id,
           time,
           sec_diff,
           lag(sec_diff) over (partition by id order by time asc) as prev_sec_diff
    from gaps
)
select id,
       time,
       -- 1 for the first row of an id, or when the previous gap exceeds the
       -- current gap by more than 3; 0 otherwise.
       case
           when sec_diff is null or prev_sec_diff - sec_diff > 3 then 1
           else 0
       end
from gap_pairs

Google BigQuery: Rolling Count Distinct

I have a table which is simply a list of dates and user IDs (not aggregated).
We define a metric called active users for a given date by counting the distinct number of IDs that appear in the previous 45 days.
I am trying to run a query in BigQuery that, for each day, returns the day plus the number of active users for that day (count distinct user from 45 days ago until today).
I have experimented with window functions, but can't figure out how to define a range based on the date values in a column. Instead, I believe the following query would work in a database like MySQL, but does not in BigQuery.
-- For each day in daily_users, count the distinct visid values seen from 45
-- days before that day through that day (BETWEEN includes both end points).
SELECT
day,
-- Correlated subquery: evaluated against daily_users for the window ending on
-- the outer row's day (t.day).
(SELECT
COUNT(DISTINCT visid)
FROM daily_users
WHERE day BETWEEN DATE_ADD(t.day, -45, "DAY") AND t.day
) AS active_users
FROM daily_users AS t
-- Positional GROUP BY: groups by the first select item, day.
GROUP BY 1
This doesn't work in BigQuery: "Subselect not allowed in SELECT clause."
How to do this in BigQuery?
BigQuery documentation claims that count(distinct) works as a window function. However, that doesn't help you, because you are not looking for a traditional window frame.
One method would add a record for each date after a visit:
-- Fan each visit out to the dates 1 through 45 days after it (theday), then
-- count the distinct visitors per resulting date.
select theday, count(distinct visid)
from (select date_add(u.day, n.n, "day") as theday, u.visid
from daily_users u cross join
-- n is meant to hold the integers 1 through 45. The ". . ." below is an
-- elision standing in for the omitted "select N union all" lines; it is not
-- valid SQL and must be written out (or generated) before running.
(select 1 as n union all select 2 union all . . .
select 45
) n
) u
group by theday;
Note: there may be simpler ways to generate a series of 45 integers in BigQuery.
Below should work with BigQuery
#legacySQL
# For every day, count the distinct ids whose timestamp lies within the
# 45*24*3600 seconds up to and including that day's timestamp.
SELECT
  day,
  active_users
FROM (
  SELECT
    day,
    COUNT(DISTINCT id) OVER (
      ORDER BY ts
      RANGE BETWEEN 45*24*3600 PRECEDING AND CURRENT ROW
    ) AS active_users
  FROM (
    # ts: the day converted to seconds so the RANGE frame can be given in seconds.
    SELECT
      day,
      id,
      TIMESTAMP_TO_SEC(TIMESTAMP(day)) AS ts
    FROM daily_users
  )
)
# Collapse the per-row window results to one row per (day, active_users) pair.
GROUP BY day, active_users
ORDER BY day
Above assumes that day field is represented as '2016-01-10' format.
If that is not the case, you should adjust TIMESTAMP_TO_SEC(TIMESTAMP(day)) in the innermost select
Also please take a look at the COUNT(DISTINCT) specifics in BigQuery
Update for BigQuery Standard SQL
#standardSQL
-- For every day, count the distinct ids seen within the 3888000 seconds
-- (45 * 24 * 3600) up to and including that day.
WITH user_days AS (
  -- ts: the day expressed in seconds so the RANGE frame can be given in seconds.
  SELECT
    day,
    id,
    UNIX_DATE(PARSE_DATE('%Y-%m-%d', day)) * 24 * 3600 AS ts
  FROM daily_users
),
windowed AS (
  -- Collect every id that falls inside the trailing window of each row.
  SELECT
    day,
    ARRAY_AGG(id) OVER (
      ORDER BY ts
      RANGE BETWEEN 3888000 PRECEDING AND CURRENT ROW
    ) AS ids_in_window
  FROM user_days
),
counted AS (
  -- Distinct ids per collected window.
  SELECT
    day,
    (SELECT COUNT(DISTINCT id) FROM UNNEST(ids_in_window) AS id) AS active_users
  FROM windowed
)
SELECT
  day,
  active_users
FROM counted
-- Collapse the per-row results to one row per (day, active_users) pair.
GROUP BY day, active_users
ORDER BY day
You can test / play with it using below dummy sample
#standardSQL
-- Demo of the rolling distinct count on inline sample data, using a window of
-- 86400 seconds (one day) instead of 45 days.
WITH daily_users AS (
  -- Sample visits; the repeated (1, '2016-01-12') rows are intentional duplicates.
  SELECT id, day
  FROM UNNEST([
    STRUCT(1 AS id, '2016-01-10' AS day),
    (2, '2016-01-10'),
    (1, '2016-01-11'),
    (3, '2016-01-11'),
    (1, '2016-01-12'),
    (1, '2016-01-12'),
    (1, '2016-01-12'),
    (1, '2016-01-13')
  ])
),
user_days AS (
  -- ts: the day expressed in seconds so the RANGE frame can be given in seconds.
  SELECT
    day,
    id,
    UNIX_DATE(PARSE_DATE('%Y-%m-%d', day)) * 24 * 3600 AS ts
  FROM daily_users
),
windowed AS (
  -- Collect every id that falls inside the trailing window of each row.
  SELECT
    day,
    ARRAY_AGG(id) OVER (
      ORDER BY ts
      RANGE BETWEEN 86400 PRECEDING AND CURRENT ROW
    ) AS ids_in_window
  FROM user_days
),
counted AS (
  -- Distinct ids per collected window.
  SELECT
    day,
    (SELECT COUNT(DISTINCT id) FROM UNNEST(ids_in_window) AS id) AS active_users
  FROM windowed
)
SELECT
  day,
  active_users
FROM counted
-- Collapse the per-row results to one row per (day, active_users) pair.
GROUP BY day, active_users
ORDER BY day