Bigquery with UNNEST, LEFT JOIN and WHERE statement - sql

the following query with UNNEST and LEFT JOIN adds empty "0" rows with date:
SELECT cal_day, count(e.datetime) AS cnt
FROM UNNEST(
GENERATE_DATE_ARRAY(DATE('2018-12-10'), CURRENT_DATE(), INTERVAL 1 DAY)
) AS cal_day
LEFT JOIN `eventlogs` e
ON cal_day = CAST( TIMESTAMP_MICROS( CAST(CAST(e.datetime AS NUMERIC)*1000 AS INT64)) AS DATE)
# WHERE ( CAST(datetime AS NUMERIC) > 1544375081371.431 ) AND message LIKE '%mymessage%'
GROUP BY cal_day
ORDER BY cal_day
LIMIT 10000
results to:
1 2018-12-10 00:00:00 UTC 561
2 2018-12-11 00:00:00 UTC 1473
3 2018-12-12 00:00:00 UTC 650
4 2018-12-13 00:00:00 UTC 407
5 2018-12-14 00:00:00 UTC 283
6 2018-12-15 00:00:00 UTC 1
6 2018-12-16 00:00:00 UTC 0
7 2018-12-17 00:00:00 UTC 213
8 2018-12-18 00:00:00 UTC 583
this is not the case when I add the WHERE clause. How can I add message='mymessage' to the unnest so that I get 0 count dates with my WHERE?

#standardSQL
SELECT cal_day, IFNULL(cnt, 0) AS cnt
FROM UNNEST(
GENERATE_DATE_ARRAY(DATE('2018-12-10'), CURRENT_DATE(), INTERVAL 1 DAY)
) AS cal_day
LEFT JOIN (
SELECT
CAST( TIMESTAMP_MICROS( CAST(CAST(datetime AS NUMERIC)*1000 AS INT64)) AS DATE) AS day,
COUNT(datetime) AS cnt
FROM `eventlogs`
WHERE (CAST(datetime AS NUMERIC) > 1544375081371.431 )
AND message LIKE '%mymessage%'
GROUP BY day
) e
ON cal_day = e.day
ORDER BY cal_day
LIMIT 10000
As you can see - I just moved filtering logic inside subselect

Related

How to fill the time gap after grouping date record for months in postgres

I have table records as -
date n_count
2020-02-19 00:00:00 4
2020-07-14 00:00:00 1
2020-07-17 00:00:00 1
2020-07-30 00:00:00 2
2020-08-03 00:00:00 1
2020-08-04 00:00:00 2
2020-08-25 00:00:00 2
2020-09-23 00:00:00 2
2020-09-30 00:00:00 3
2020-10-01 00:00:00 11
2020-10-05 00:00:00 12
2020-10-19 00:00:00 1
2020-10-20 00:00:00 1
2020-10-22 00:00:00 1
2020-11-02 00:00:00 376
2020-11-04 00:00:00 72
2020-11-11 00:00:00 1
I want to be grouped all the records into months for finding month total count which is working, but there is a missing of month. how to fill this gap.
time month_count
"2020-02-01" 4
"2020-07-01" 4
"2020-08-01" 5
"2020-09-01" 5
"2020-10-01" 26
"2020-11-01" 449
This is what I have tried.
SELECT (date_trunc('month', date))::date AS time,
sum(n_count) as month_count
FROM table1
group by time
order by time asc
You can use generate_series() to generate all starts of months between the earliest and latest date available in the table, then bring the table with a left join:
select d.dt, coalesce(sum(t.n_count), 0) as month_count
from (
select generate_series(date_trunc('month', min(date)), date_trunc('month', max(date)), '1 month') as dt
from table1
) as d(dt)
left join table1 t on t.date >= d.dt and t.date < d.dt + interval '1 month'
group by d.dt
order by d.dt
I would simply UNION a date series, generated from MIN and MAX date:
demo:db<>fiddle
WITH cte AS ( -- 1
SELECT
*,
date_trunc('month', date)::date AS time
FROM
t
)
SELECT
time,
SUM(n_count) as month_count --3
FROM (
SELECT
time,
n_count
FROM cte
UNION
SELECT -- 2
generate_series(
(SELECT MIN(time) FROM cte),
(SELECT MAX(time) FROM cte),
interval '1 month'
)::date,
0
) s
GROUP BY time
ORDER BY time
Use CTE to calculate date_trunc only once. Could be left out if you like to call your table twice in the UNION below
Generate monthly date series from MIN to MAX date containing your n_count value = 0. Add it to the table
Do your calculation

ORACLE SQL - How to find the number of reliefs each teacher has, each day, 2 months before the teacher resigned?

I need some help in finding the number of reliefs each teacher has, every single day, 2 months before the teacher resigns.
Join_dt - teacher's join date,
Resign_dt - teacher's resign date,
Relief_ID - Relief teacher's ID,
Start_dt - Relief's start date,
End_dt - Relief's end date,
note that there may be overlapping dates between 2 or more different reliefs and so I need to find the number of distinct reliefs each teacher has for each date.
This is what I am given:
Teacher_ID Join_dt Resign_dt Relief_ID Start_dt End_dt
12 2006-08-30 2019-08-01 20 2017-02-07 2019-07-04
12 2006-08-30 2019-08-01 20 2016-11-10 2019-01-30
12 2006-08-30 2019-08-01 103 2016-08-20 2019-07-29
12 2006-08-30 2019-08-01 17 2016-01-30 2017-12-30
23 2017-10-01 2018-11-12 44 2018-10-19 2018-11-11
23 2017-10-01 2018-11-12 29 2018-04-01 2018-12-02
23 2017-10-01 2018-11-12 06 2017-11-25 2018-05-02
05 2015-02-11 2019-10-02 38 2019-01-17 2019-07-21
05 2015-02-11 2019-10-02 11 2018-11-02 2019-02-05
05 2015-02-11 2019-10-02 15 2018-09-30 2018-10-03
Expected result:
Teacher_ID Dates No_of_reliefs
12 2019-07-31 0
12 2019-07-30 0
12 2019-07-29 1
12 2019-07-28 1
12 2019-07-27 1
... ...
12 2019-07-04 2
... ...
12 2016-05-30 2
12 2016-05-29 2
12 2016-05-28 2
12 2016-05-27 2
12 2016-05-26 1
23 2018-10-31 2
... ...
For date 2019-07-29, No_of_reliefs = 1 because of Relief_ID 103.
For date 2017-07-04, No_of_reliefs = 2 because of Relief_ID 20 & 103.
Dates are supposed to start from 1 month before the teacher resigned. For Teacher_ID 23, since she resigned on 2019-11-12, dates shall start from 2019-10-31.
I have tried using connect by but the execution time is really long since it involves a large amount of data.
Any other methods will be greatly appreciated!!
Thank you kind souls!!!
You can use
connect by level <= last_day(add_months(Resign_dt,-1)) - add_months(Resign_dt,-2) clause :
I suppose you mean 2 months before resignment for the starting date, and ending on the last day of the previous month.
with t1(Teacher_ID,Resign_dt,Relief_ID,start_dt,end_dt) as
(
select 12,date'2019-08-01',20 ,date'2017-02-07',date'2019-07-04' from dual union all
select 12,date'2019-08-01',20 ,date'2016-11-10',date'2019-01-30' from dual union all
select 12,date'2019-08-01',103,date'2016-08-20',date'2019-07-29' from dual
......
), t2 as
(
select distinct last_day(add_months(Resign_dt,-1)) - level + 1 as Resign_dt, Teacher_ID
from t1
connect by level <= last_day(add_months(Resign_dt,-1)) - add_months(Resign_dt,-2)
and prior Teacher_ID = Teacher_ID and prior sys_guid() is not null
)
select Teacher_ID, to_char(Resign_dt,'yyyy-mm-dd') as Dates,
(select count(distinct Relief_ID)
from t1
where t2.Resign_dt between start_dt and end_dt
and t2.Teacher_ID = Teacher_ID
)
from t2
order by Teacher_ID, Resign_dt desc;
Demo
select d.dt
, tr.Teacher_ID
--, tr.Join_dt
--, tr.Resign_dt
, count(tr.Relief_ID)
--, tr.Start_dt
--, tr.End_dt
from tr
right outer join (
SELECT dt
FROM (
SELECT DATE '2006-01-01' + ROWNUM - 1 dt
FROM DUAL CONNECT BY ROWNUM < 5000
) q
WHERE EXTRACT(YEAR FROM dt) < EXTRACT(YEAR FROM sysdate) + 2
--order by 1
) d on d.dt between tr.Join_dt and tr.End_dt
and d.dt between tr.Start_dt and tr.Resign_dt
group by d.dt
, tr.Teacher_ID
order by d.dt desc

Counting records and grouping them by the hour

I'm trying to count the records in my table and grouping them by hour, i'm getting results with my query but I want it to return every hour even if there are no records.
My current query is,
SELECT nvl(count(*),0) AS transactioncount, trunc(date_modified, 'HH') as TRANSACTIONDATE
FROM TABLE
WHERE date_modified between to_date('23-JAN-19 07:00:00','dd-MON-yy hh24:mi:ss') and to_date('24-Jan-19 06:59:59','dd-MON-yy hh24:mi:ss')
group by trunc(date_modified, 'HH');
This returns a result like this,
TRANSACTIONCOUNT | TRANSACTIONDATE
43 | 23-Jan-19 07:00:00
47 | 23-Jan-19 08:00:00
156 | 23-Jan-19 14:00:00
558 | 23-Jan-19 15:00:00
What I want is for it to return every hour between my 2 dates so,
TRANSACTIONCOUNT | TRANSACTIONDATE
43 | 23-Jan-19 07:00:00
47 | 23-Jan-19 08:00:00
0 | 23-Jan-19 09:00:00
0 | 23-Jan-19 10:00:00
0 | 23-Jan-19 11:00:00
0 | 23-Jan-19 12:00:00
0 | 23-Jan-19 13:00:00
156 | 23-Jan-19 14:00:00
558 | 23-Jan-19 15:00:00
--......
0 | 24-Jan-19 00:00:00
0 | 24-Jan-19 01:00:00
0 | 24-Jan-19 02:00:00
--and so on
To fill the holes in the transaction hours you create first a complete table of hours.
You may use Recursive Subquery Factoring to do it
WITH hour_table(TRANSACTIONDATE) AS (
SELECT to_date('23-JAN-19 07:00:00','dd-MON-yy hh24:mi:ss') /* init hour here */
FROM DUAL
UNION ALL
SELECT TRANSACTIONDATE + 1/24
FROM hour_table
WHERE TRANSACTIONDATE + 1/24 < to_date('24-JAN-19 06:59:59','dd-MON-yy hh24:mi:ss') /* limit here */
)
select * from hour_table;
TRANSACTIONDATE
-------------------
23.01.2019 07:00:00
23.01.2019 08:00:00
...
24.01.2019 05:00:00
24.01.2019 06:00:00
Note that you use the staring and ending date in this query, the starting date must be exact an hour.
Next step is as simple as to outer join this hour table to your aggregation and set the default value for the missing hours with NVL.
with hour_table(TRANSACTIONDATE) AS (
SELECT to_date('23-JAN-19 07:00:00','dd-MON-yy hh24:mi:ss') /* init hour here */
FROM DUAL
UNION ALL
SELECT TRANSACTIONDATE + 1/24
FROM hour_table
WHERE TRANSACTIONDATE + 1/24 < to_date('24-JAN-19 06:59:59','dd-MON-yy hh24:mi:ss') /* limit */
),
agg as (
SELECT nvl(count(*),0) AS transactioncount, trunc(date_modified, 'HH') as TRANSACTIONDATE
FROM "TABLE"
WHERE date_modified between to_date('23-JAN-19 07:00:00','dd-MON-yy hh24:mi:ss') and to_date('24-Jan-19 06:59:59','dd-MON-yy hh24:mi:ss')
group by trunc(date_modified, 'HH')
)
select t.TRANSACTIONDATE, nvl(transactioncount,0) transactioncount
from hour_table t
left outer join agg a
on t.TRANSACTIONDATE = a.TRANSACTIONDATE
order by 1;
You might consider using the following with CONNECT BY level logic :
SELECT sum(transactioncount) as transactioncount, transactiondate
FROM
(
with "TABLE"(date_modified) as
(
SELECT timestamp'2019-01-23 08:00:00' FROM dual union all
SELECT timestamp'2019-01-23 08:30:00' FROM dual union all
SELECT timestamp'2019-01-23 09:00:00' FROM dual union all
SELECT timestamp'2019-01-24 05:01:00' FROM dual
)
SELECT nvl(count(*),0) AS transactioncount, trunc(date_modified, 'hh24') as transactiondate
FROM "TABLE" t
GROUP BY trunc(date_modified, 'HH24')
UNION ALL
SELECT 0, timestamp'2019-01-23 07:00:00' + ( level - 1 )/24
FROM dual
CONNECT BY level <= 24 * extract( day from
timestamp'2019-01-24 06:59:59'-
timestamp'2019-01-23 07:00:00') +
extract( hour from
timestamp'2019-01-24 06:59:59'-
timestamp'2019-01-23 07:00:00') + 1
)
GROUP BY transactiondate
ORDER BY transactiondate
Rextester Demo

bigquery - calculate monthly outstanding values

I'm trying to solve the following problem:
a user took three loans with running times of 3,4 and 5 months.
How to calculate in BigQuery for each point in time, how much he owns?
I know to do this calculation in R or Python but would clearly prefer a BigQuery/SQL solution.
Thank you!
I have the data:
Take Date Return Date Sum
2016-01-01 2016-03-31 10
2016-02-01 2016-05-31 20
2016-03-01 2016-07-31 50
I need the output like this:
Date Sum
2016-01-01 10
2016-02-01 30
2016-03-01 80
2016-04-01 70
2016-05-01 70
2016-06-01 50
2016-07-01 50
2016-08-01 0
Below is for BigQuery Standard SQL
#standardSQL
WITH `project.dataset.table` AS (
SELECT 1 id, DATE '2016-01-01' take_date, DATE '2016-03-31' return_date, 10 amount
UNION ALL SELECT 1, DATE '2016-02-01', DATE '2016-05-31', 20
UNION ALL SELECT 1, DATE '2016-03-01', DATE '2016-07-31', 50
), dates AS (
SELECT id, day
FROM (
SELECT id, GENERATE_DATE_ARRAY(
MIN(take_date),
DATE_ADD(DATE_TRUNC(MAX(return_date), MONTH), INTERVAL 1 MONTH),
INTERVAL 1 MONTH
) days
FROM `project.dataset.table`
GROUP BY id
), UNNEST(days) day
)
SELECT d.id, d.day, SUM(IF(d.day BETWEEN t.take_date AND t.return_date, amount, 0)) amount
FROM dates d
LEFT JOIN `project.dataset.table` t
ON d.id = t.id
GROUP BY d.id, d.day
ORDER BY d.day
with result as
Row id day amount
1 1 2016-01-01 10
2 1 2016-02-01 30
3 1 2016-03-01 80
4 1 2016-04-01 70
5 1 2016-05-01 70
6 1 2016-06-01 50
7 1 2016-07-01 50
8 1 2016-08-01 0

Calculating concurrency from a set of ranges

I have a set of rows containing a start timestamp and a duration. I want to perform various summaries using the overlap or concurrency.
For example: peak daily concurrency, peak concurrency grouped on another column.
Example data:
timestamp,duration
2016-01-01 12:00:00,300
2016-01-01 12:01:00,300
2016-01-01 12:06:00,300
I would like to know that peak for the period was 12:01:00-12:05:00 at 2 concurrent.
Any ideas on how to achieve this using BigQuery or, less exciting, a Map/Reduce job?
For a per-minute resolution, with session lengths of up to 255 minutes:
SELECT session_minute, COUNT(*) c
FROM (
SELECT start, DATE_ADD(start, i, 'MINUTE') session_minute FROM (
SELECT * FROM (
SELECT TIMESTAMP("2015-04-30 10:14") start, 7 minutes
),(
SELECT TIMESTAMP("2015-04-30 10:15") start, 12 minutes
),(
SELECT TIMESTAMP("2015-04-30 10:15") start, 12 minutes
),(
SELECT TIMESTAMP("2015-04-30 10:18") start, 12 minutes
),(
SELECT TIMESTAMP("2015-04-30 10:23") start, 3 minutes
)
) a
CROSS JOIN [fh-bigquery:public_dump.numbers_255] b
WHERE a.minutes>b.i
)
GROUP BY 1
ORDER BY 1
STEP 1 - First you need find all periods (start and end) with
respective concurrent entries
SELECT ts AS start, LEAD(ts) OVER(ORDER BY ts) AS finish,
SUM(entry) OVER(ORDER BY ts) AS concurrent_entries
FROM (
SELECT ts, SUM(entry)AS entry
FROM
(SELECT ts, 1 AS entry FROM yourTable),
(SELECT DATE_ADD(ts, duration, 'second') AS ts, -1 AS entry FROM yourTable)
GROUP BY ts
HAVING entry != 0
)
ORDER BY ts
Assuming input as below
(SELECT TIMESTAMP('2016-01-01 12:00:00') AS ts, 300 AS duration),
(SELECT TIMESTAMP('2016-01-01 12:01:00') AS ts, 300 AS duration),
(SELECT TIMESTAMP('2016-01-01 12:06:00') AS ts, 300 AS duration),
(SELECT TIMESTAMP('2016-01-01 12:07:00') AS ts, 300 AS duration),
(SELECT TIMESTAMP('2016-01-01 12:10:00') AS ts, 300 AS duration),
(SELECT TIMESTAMP('2016-01-01 12:11:00') AS ts, 300 AS duration)
the output of above query will look somehow like this:
start finish concurrent_entries
2016-01-01 12:00:00 UTC 2016-01-01 12:01:00 UTC 1
2016-01-01 12:01:00 UTC 2016-01-01 12:05:00 UTC 2
2016-01-01 12:05:00 UTC 2016-01-01 12:07:00 UTC 1
2016-01-01 12:07:00 UTC 2016-01-01 12:10:00 UTC 2
2016-01-01 12:10:00 UTC 2016-01-01 12:12:00 UTC 3
2016-01-01 12:12:00 UTC 2016-01-01 12:15:00 UTC 2
2016-01-01 12:15:00 UTC 2016-01-01 12:16:00 UTC 1
2016-01-01 12:16:00 UTC null 0
You might still want to polish above query a little - but mainly it does what you need
STEP 2 - now you can do any stats off of above result
For example peak on whole period:
SELECT
start, finish, concurrent_entries, RANK() OVER(ORDER BY concurrent_entries DESC) AS peak
FROM (
SELECT ts AS start, LEAD(ts) OVER(ORDER BY ts) AS finish,
SUM(entry) OVER(ORDER BY ts) AS concurrent_entries
FROM (
SELECT ts, SUM(entry)AS entry FROM
(SELECT ts, 1 AS entry FROM yourTable),
(SELECT DATE_ADD(ts, duration, 'second') AS ts, -1 AS entry FROM yourTable)
GROUP BY ts
HAVING entry != 0
)
)
ORDER BY peak