Cross join for time series postgresql query - sql

I have a table with Items with
Item_id, Item_time, Item_numbers
1 2017-01-01 18:00:00 2
2 2017-01-01 18:10:00 2
3 2017-01-01 19:10:00 3
I want to group the items by hourly for some specific time (between 9 to 3 for each day) and in case if there is no entry for the particular hours then it should it be a 0.
Desired Output:
Item_time Item_numbers
2017-01-01 18:00:00 4
2017-01-01 19:00:00 3
2017-01-01 20:00:00 0
with hour_items as (select date_trunc('hour', item_time) "hour",
avg(item_numbers) as value from items where item_id=2 and
fact_time::date= '2017-01-01' group by hour) select hour, value from
hour_items where EXTRACT(HOUR FROM hour) >= '9' and EXTRACT(HOUR FROM
> hour) < '15'.
The above query groups them correctly but the where the hour is missing, there is no entry. Though it should be an entry with a 0 as stated in the desired output.

This should do.
We get all the distinct days (CTE dates), then we generate hours for each of those dates (CTE hours) and finally we left join our data on "per our" basis.
with sample_data as (
select 1 as item_id, '2018-01-01 12:03:15'::timestamp as item_time, 2 as item_numbers
union all
select 2 as item_id, '2018-01-01 12:41:15'::timestamp as item_time, 1 as item_numbers
union all
select 3 as item_id, '2018-01-01 17:41:15'::timestamp as item_time, 2 as item_numbers
union all
select 4 as item_id, '2018-01-01 19:41:15'::timestamp as item_time, 2 as item_numbers
),
dates as (
select distinct item_time::date
from sample_data
),
hours as (
select item_time + interval '1 hour' * a as hour
from dates
cross join generate_series(0,23) a
)
select h.hour, sum(coalesce(sd.item_numbers,0))
from hours h
left join sample_data sd on h.hour = date_trunc('hour', sd.item_time)
where extract(hour from hour) between 9 and 17
group by h.hour
order by h.hour

Related

Explode time duration defined by start and end timestamp by the hour

I have a table with work shifts (1 row per shift) that include date, start and end time.
Main goal: I want to aggregate the number of working hours per hour per store.
This is what my shift table looks like:
employee_id
store
start_timestamp
end_timestamp
1
1
2022-01-01T07:00
2022-01-01T11:30
2
1
2022-01-01T08:30
2022-01-01T12:30
...
...
...
...
I want to "explode" the information into a table something like this:
hour
employee_id
store
date
scheduled_work (h)
07:00
1
1
2022-01-01
1
08:00
1
1
2022-01-01
1
09:00
1
1
2022-01-01
1
10:00
1
1
2022-01-01
1
11:00
1
1
2022-01-01
0.5
08:00
2
1
2022-01-01
0.5
09:00
2
1
2022-01-01
1
10:00
2
1
2022-01-01
1
11:00
2
1
2022-01-01
1
12:00
2
1
2022-01-01
0.5
...
...
...
...
...
I have tried using a method using cross joins and it consumed a lot of memory and looks like this:
with test as (
select 1 as employee_id, 1 as store_id, timestamp('2022-01-01 07:00:00') as start_timestamp, timestamp('2022-01-01 11:30:00') as end_timestamp union all
select 2 as employee_id, 1 as store_id, timestamp('2022-01-01 08:30:00') as start_timestamp, timestamp('2022-01-01 12:30:00') as end_timestamp
)
, cte as (
select ts
, test.*
, safe_divide(
timestamp_diff(
least(date_add(ts, interval 1 hour), end_timestamp)
, greatest(ts, start_timestamp)
, millisecond
)
, 3600000
) as scheduled_work
from test
cross join unnest(generate_timestamp_array(timestamp('2022-01-01 07:00:00'),
timestamp('2022-01-01 12:30:00'), interval 1 hour)) as ts
order by employee_id, ts)
select * from cte
where scheduled_work >= 0;
It's working but I know this will not be good when the number of shifts starts to add up. Does anyone have another solution that is more efficient?
I'm using BigQuery.
you might want to remove order by inside cte subquery, it'll affect the query performance.
And another similar approach:
WITH test AS (
select 1 as employee_id, 1 as store_id, timestamp('2022-01-01 07:00:00') as start_timestamp, timestamp('2022-01-01 11:30:00') as end_timestamp union all
select 2 as employee_id, 1 as store_id, timestamp('2022-01-01 08:30:00') as start_timestamp, timestamp('2022-01-01 12:30:00') as end_timestamp
),
explodes AS (
SELECT employee_id, store_id, EXTRACT(DATE FROM h) date, TIME_TRUNC(EXTRACT(TIME FROM h), HOUR) hour, 1 AS scheduled_work
FROM test,
UNNEST (GENERATE_TIMESTAMP_ARRAY(
TIMESTAMP_TRUNC(start_timestamp + INTERVAL 1 HOUR, HOUR),
TIMESTAMP_TRUNC(end_timestamp - INTERVAL 1 HOUR, HOUR), INTERVAL 1 HOUR
)) h
UNION ALL
SELECT employee_id, store_id, EXTRACT(DATE FROM h), TIME_TRUNC(EXTRACT(TIME FROM h), HOUR),
CASE offset
WHEN 0 THEN 1 - (EXTRACT(MINUTE FROM h) * 60 + EXTRACT(SECOND FROM h)) / 3600
WHEN 1 THEN (EXTRACT(MINUTE FROM h) * 60 + EXTRACT(SECOND FROM h)) / 3600
END
FROM test, UNNEST([start_timestamp, end_timestamp]) h WITH OFFSET
)
SELECT * FROM explodes WHERE scheduled_work > 0;
Consider below approach
with temp as (
select * replace(
parse_time('%H:%M', start_time) as start_time,
parse_time('%H:%M', end_time) as end_time
)
from your_table
)
select * except(start_time, end_time),
case
when hour = time_trunc(start_time, hour) then (60 - time_diff(start_time, hour, minute)) / 60
when hour = time_trunc(end_time, hour) then time_diff(end_time, hour, minute) / 60
else 1
end as scheduled_work
from (
select time_add(time_trunc(start_time, hour), interval delta hour) as hour,
employee_id, store, date, start_time, end_time
from temp, unnest(generate_array(0,time_diff(end_time, start_time, hour))) delta
)
order by employee_id, hour
if applied to sample data as in your question
output is

How to fill the time gap after grouping date record for months in postgres

I have table records as -
date n_count
2020-02-19 00:00:00 4
2020-07-14 00:00:00 1
2020-07-17 00:00:00 1
2020-07-30 00:00:00 2
2020-08-03 00:00:00 1
2020-08-04 00:00:00 2
2020-08-25 00:00:00 2
2020-09-23 00:00:00 2
2020-09-30 00:00:00 3
2020-10-01 00:00:00 11
2020-10-05 00:00:00 12
2020-10-19 00:00:00 1
2020-10-20 00:00:00 1
2020-10-22 00:00:00 1
2020-11-02 00:00:00 376
2020-11-04 00:00:00 72
2020-11-11 00:00:00 1
I want to be grouped all the records into months for finding month total count which is working, but there is a missing of month. how to fill this gap.
time month_count
"2020-02-01" 4
"2020-07-01" 4
"2020-08-01" 5
"2020-09-01" 5
"2020-10-01" 26
"2020-11-01" 449
This is what I have tried.
SELECT (date_trunc('month', date))::date AS time,
sum(n_count) as month_count
FROM table1
group by time
order by time asc
You can use generate_series() to generate all starts of months between the earliest and latest date available in the table, then bring the table with a left join:
select d.dt, coalesce(sum(t.n_count), 0) as month_count
from (
select generate_series(date_trunc('month', min(date)), date_trunc('month', max(date)), '1 month') as dt
from table1
) as d(dt)
left join table1 t on t.date >= d.dt and t.date < d.dt + interval '1 month'
group by d.dt
order by d.dt
I would simply UNION a date series, generated from MIN and MAX date:
demo:db<>fiddle
WITH cte AS ( -- 1
SELECT
*,
date_trunc('month', date)::date AS time
FROM
t
)
SELECT
time,
SUM(n_count) as month_count --3
FROM (
SELECT
time,
n_count
FROM cte
UNION
SELECT -- 2
generate_series(
(SELECT MIN(time) FROM cte),
(SELECT MAX(time) FROM cte),
interval '1 month'
)::date,
0
) s
GROUP BY time
ORDER BY time
Use CTE to calculate date_trunc only once. Could be left out if you like to call your table twice in the UNION below
Generate monthly date series from MIN to MAX date containing your n_count value = 0. Add it to the table
Do your calculation

BigQuery: how to do semi left join?

I couldn't come up with a good title for this question. Sorry about that.
I have two tables A and B. Both have timestamps and shares a common ID between them. Here are schemas of both tables:
Table A:
========
a_id int,
common_id int,
ts timestamp
...
Table B:
========
b_id int,
common_id int,
ts timestamp,
temperature int
Table A is more like device data whenever it changes its status. Table B is more IoT data which contains a temperature of a device every minute or so.
What I want to do is to create a Table C from these two tables. Table C would be in essence Table A + its temperature in closest time from table B.
How can I do this purely in BigQuery SQL? The temperature info doesn't need to be precise.
Below option (for BigQuery Standard SQL) assumes that in addition of temperature from table b you still need all the rest of values from respective row
#standardSQL
SELECT
ARRAY_AGG(
STRUCT(a_id, a.common_id, a.ts, b_id, b.ts AS b_ts, temperature)
ORDER BY ABS(TIMESTAMP_DIFF(a.ts, b.ts, SECOND))
LIMIT 1
)[SAFE_OFFSET(0)].*
FROM `project.dataset.table_a` a
LEFT JOIN `project.dataset.table_b` b
ON a.common_id = b.common_id
AND ABS(TIMESTAMP_DIFF(a.ts, b.ts, MINUTE)) < 30
GROUP BY TO_JSON_STRING(a)
I smoke-tested it with below generated dummy data
#standardSQL
WITH `project.dataset.table_a` AS (
SELECT CAST(1000000 * RAND() AS INT64) a_id, common_id, ts
FROM UNNEST(GENERATE_TIMESTAMP_ARRAY('2018-01-01 00:00:00', '2018-01-01 23:59:59', INTERVAL 45*60 + 27 SECOND)) ts
CROSS JOIN UNNEST(GENERATE_ARRAY(1, 10)) common_id
), `project.dataset.table_b` AS (
SELECT CAST(1000000 * RAND() AS INT64) b_id, common_id, ts, CAST(60 + 40 * RAND() AS INT64) temperature
FROM UNNEST(GENERATE_TIMESTAMP_ARRAY('2018-01-01 00:00:00', '2018-01-01 23:59:59', INTERVAL 1 MINUTE)) ts
CROSS JOIN UNNEST(GENERATE_ARRAY(1, 10)) common_id
)
SELECT
ARRAY_AGG(
STRUCT(a_id, a.common_id, a.ts, b_id, b.ts AS b_ts, temperature)
ORDER BY ABS(TIMESTAMP_DIFF(a.ts, b.ts, SECOND))
LIMIT 1
)[SAFE_OFFSET(0)].*
FROM `project.dataset.table_a` a
LEFT JOIN `project.dataset.table_b` b
ON a.common_id = b.common_id
AND ABS(TIMESTAMP_DIFF(a.ts, b.ts, MINUTE)) < 30
GROUP BY TO_JSON_STRING(a)
with example of few rows from output:
Row a_id common_id ts b_id b_ts temperature
1 276623 1 2018-01-01 00:00:00 UTC 166995 2018-01-01 00:00:00 UTC 74
2 218354 1 2018-01-01 00:45:27 UTC 464901 2018-01-01 00:45:00 UTC 87
3 265634 1 2018-01-01 01:30:54 UTC 565385 2018-01-01 01:31:00 UTC 87
4 758075 1 2018-01-01 02:16:21 UTC 55894 2018-01-01 02:16:00 UTC 84
5 306355 1 2018-01-01 03:01:48 UTC 844429 2018-01-01 03:02:00 UTC 92
6 348502 1 2018-01-01 03:47:15 UTC 375859 2018-01-01 03:47:00 UTC 90
7 774920 1 2018-01-01 04:32:42 UTC 438164 2018-01-01 04:33:00 UTC 61
Here - I set table_b to have temperature for each minute for 10 devices during the whole day of '2018-01-01' and in table_a I set status changed each 45 min 27 sec for same 10 devices during same day. a_id and b_id - just random numbers between 0 and 999999
Note: ABS(TIMESTAMP_DIFF(a.ts, b.ts, MINUTE)) < 30 clause in JOIN controls period that you can consider ok to look for closest ts (in case if some IoT entries are absent from table_b
Measuring the closest time by TIMESTAMP_DIFF(a.ts,b.ts, SECOND) - by its absolute value to get the closest in any direction:
WITH a AS (
SELECT 1 id, TIMESTAMP('2018-01-01 11:01:00') ts
UNION ALL SELECT 1, ('2018-01-02 10:00:00')
UNION ALL SELECT 2, ('2018-01-02 10:00:00')
)
, b AS (
SELECT 1 id, TIMESTAMP('2018-01-01 12:01:00') ts, 43 temp
UNION ALL SELECT 1, TIMESTAMP('2018-01-01 12:06:00'), 47
)
SELECT *,
(SELECT temp
FROM b
WHERE a.id=b.id
ORDER BY ABS(TIMESTAMP_DIFF(a.ts,b.ts, SECOND))
LIMIT 1) temp
FROM a

Date wise hourly (on 24 hour) coustomer count

I have a data set where customer id , customer join time and leave time available. I want to count hourly basis each date customer
Here is sample data set
My expected output
Here I going to add my code snip that i tried,where 1st created 24 hours span then tried to join and aggregate function for getting expected result and got for current date but i need for any date i.e dynamically
select logdate as date,timespan,count(customer_id)
(
SELECT userid,cast(joinTime as date) as logdate,customer_id
,starttime,endtime,timespan
FROM login_out_logs AS logTable
left join
(select '00:00:00 - 01:00:00' timespan,DATEadd(hh,0,cast(dateadd(dd,-1,getdate()))) starttime,dateadd(hh,1,cast(dateadd(dd,-1,getdate()))) endtime
union
select '01:00:00 - 02:00:00', dateadd(hh,1,cast(dateadd(dd,-1,getdate()))),dateadd(hh,2,cast(dateadd(dd,-1,getdate())))
union
select '02:00:00 - 03:00:00', dateadd(hh,2,cast(dateadd(dd,-1,getdate()))),dateadd(hh,3,cast(dateadd(dd,-1,getdate())))
union
select '03:00:00 - 04:00:00', dateadd(hh,3,cast(dateadd(dd,-1,getdate()))),dateadd(hh,4,cast(dateadd(dd,-1,getdate())))
union
select '04:00:00 - 05:00:00', dateadd(hh,4,cast(dateadd(dd,-1,getdate()))),dateadd(hh,5,cast(dateadd(dd,-1,getdate())))
union
select '05:00:00 - 06:00:00',dateadd(hh,5,cast(dateadd(dd,-1,getdate()))),dateadd(hh,6,cast(dateadd(dd,-1,getdate())))
union
select '06:00:00 - 07:00:00',dateadd(hh,6,cast(dateadd(dd,-1,getdate()))),dateadd(hh,7,cast(dateadd(dd,-1,getdate())))
union
select '07:00:00 - 08:00:00',dateadd(hh,7,cast(dateadd(dd,-1,getdate()))),dateadd(hh,8,cast(dateadd(dd,-1,getdate())))
union
select '08:00:00 - 09:00:00',dateadd(hh,8,cast(dateadd(dd,-1,getdate()))),dateadd(hh,9,cast(dateadd(dd,-1,getdate())))
union
select '09:00:00 - 10:00:00',dateadd(hh,9,cast(dateadd(dd,-1,getdate()))),dateadd(hh,10,cast(dateadd(dd,-1,getdate())))
union
select '10:00:00 - 11:00:00',dateadd(hh,10,cast(dateadd(dd,-1,getdate()))),dateadd(hh,11,cast(dateadd(dd,-1,getdate())))
union
select '11:00:00 - 12:00:00',dateadd(hh,11,cast(dateadd(dd,-1,getdate()))),dateadd(hh,12,cast(dateadd(dd,-1,getdate())))
union
select '12:00:00 - 13:00:00',dateadd(hh,12,cast(dateadd(dd,-1,getdate()))),dateadd(hh,13,cast(dateadd(dd,-1,getdate())))
union
select '13:00:00 - 14:00:00',dateadd(hh,13,cast(dateadd(dd,-1,getdate()))),dateadd(hh,14,cast(dateadd(dd,-1,getdate())))
union
select '14:00:00 - 15:00:00',dateadd(hh,14,cast(dateadd(dd,-1,getdate()))),dateadd(hh,15,cast(dateadd(dd,-1,getdate())))
union
select '15:00:00 - 16:00:00',dateadd(hh,15,cast(dateadd(dd,-1,getdate()))),dateadd(hh,16,cast(dateadd(dd,-1,getdate())))
union
select '16:00:00 - 17:00:00',dateadd(hh,16,cast(dateadd(dd,-1,getdate()))),dateadd(hh,17,cast(dateadd(dd,-1,getdate())))
union
select '17:00:00 - 18:00:00',dateadd(hh,17,cast(dateadd(dd,-1,getdate()))),dateadd(hh,18,cast(dateadd(dd,-1,getdate())))
union
select '18:00:00 - 19:00:00',dateadd(hh,18,cast(dateadd(dd,-1,getdate()))),dateadd(hh,19,cast(dateadd(dd,-1,getdate())))
union
select '19:00:00 - 20:00:00',dateadd(hh,19,cast(dateadd(dd,-1,getdate()))),dateadd(hh,20,cast(dateadd(dd,-1,getdate())))
union
select '20:00:00 - 21:00:00',dateadd(hh,20,cast(dateadd(dd,-1,getdate()))),dateadd(hh,21,cast(dateadd(dd,-1,getdate())))
union
select '21:00:00 - 22:00:00',dateadd(hh,21,cast(dateadd(dd,-1,getdate()))),dateadd(hh,22,cast(dateadd(dd,-1,getdate())))
union
select '22:00:00 - 23:00:00',dateadd(hh,22,cast(dateadd(dd,-1,getdate()))),dateadd(hh,23,cast(dateadd(dd,-1,getdate())))
union
select '24:00:00 - 00:00:00',dateadd(hh,23,cast(dateadd(dd,-1,getdate()))),dateadd(hh,23,dateadd(mi,59,cast(dateadd(dd,-1,getdate())))))a
on starttime between jointime and leaveTime
or endtime between jointime and leaveTime
or jointime>=starttime and jointime<endtime
) as T
group by leaveTime,timespan
Date Hour customer_count
2018-01-01 8-9 1
2018-01-01 9-10 1
2018-01-01 10-11 1
2018-01-01 11-12 1
2018-01-01 12-13 1
2018-01-01 13-14 1
2018-01-01 14-15 1
2018-01-01 15-16 1
2018-01-01 16-17 1
2018-01-01 17-18 1
2018-01-01 18-19 1
2018-01-01 19-20 1
2018-01-01 20-21 2
2018-01-01 21-22 3
2018-01-01 22-23 2
2018-01-01 23-00 1
Here is an approach - maybe this already solves your problem. I designed it in order to work with any day-difference between join and leave. However, I can't tell anything about the performance on larger sets since I tested with your example only and the evaluation of all relevant hours might take a bit longer if it comes to bigger data sets.
Anyways, I used a recursice cte here in order to evaluate all hours between join and leave and lateron I group by date and hour:
DECLARE #Cust TABLE(
customer_id INT,
joinTime DATETIME,
leaveTime DATETIME
)
INSERT INTO #Cust VALUES
(536, '2018-01-01 08:05:00', '2018-01-01 18:31:00'),
(344, '2018-01-01 19:37:00', '2018-01-01 20:16:00'),
(344, '2018-01-01 19:49:00', '2018-01-01 20:00:00'),
(899, '2018-01-01 20:49:00', '2018-01-01 21:14:00'),
(2336, '2018-01-01 21:02:00', '2018-01-01 21:03:00'),
(335, '2018-01-01 21:03:00', '2018-01-01 23:43:00'),
(2336, '2018-01-01 21:03:00', '2018-01-02 00:06:00'),
(899, '2018-01-01 21:18:00', '2018-01-01 22:24:00'),
(345, '2018-01-01 21:21:00', '2018-01-01 21:39:00'),
(345, '2018-01-01 21:53:00', '2018-01-02 00:13:00');
;WITH cte AS(
SELECT c.customer_id,
c.joinTime,
c.leaveTime,
c.joinTime x
FROM #Cust c
UNION ALL
SELECT c.customer_id,
c.joinTime,
c.leaveTime,
DATEADD(HOUR, 1, x) x
FROM cte c
WHERE DATEADD(HOUR, 1, x) <= CASE WHEN DATEPART(MINUTE, x) < DATEPART(MINUTE, c.leaveTime) THEN c.leaveTime ELSE DATEADD(HOUR, 1, c.leaveTime) END
)
SELECT CONVERT(DATE, x) AS cDate, DATEPART(HOUR, x) AS cHour, COUNT(*) AS cCount
FROM cte
GROUP BY CONVERT(DATE, x), DATEPART(HOUR, x)
ORDER BY 1,2
OPTION (MAXRECURSION 0)
Try this:
;WITH hourlist(starthour) AS (
SELECT 0 -- Seed Row
UNION ALL
SELECT starthour + 1 -- Recursion
FROM hourlist
where starthour+1<=23
)
SELECT
day
,convert(nvarchar,starthour)+'-'+convert(nvarchar,case when starthour+1=24 then 0 else starthour+1 end) hourtitle
,count(distinct customer_id) 'customer count'
FROM
hourlist h -- list of all hourse
cross join
(
select distinct dateadd(day,datediff(day,0, joinTime),0) from #login_out_logs
union
select distinct dateadd(day,datediff(day,0,leaveTime),0) from #login_out_logs
)q10(day) -- list of all days of jointime and leavetime
inner join #login_out_logs l on -- log considered for specific day/hour if starts before hourend and ends before hourstart
l.joinTime <dateadd(hour,starthour+1,q10.day)
and
l.leaveTime>=dateadd(hour,starthour ,q10.day)
group by day,starthour
order by day,starthour
Note: this will only work for jointimes and leavetimes that differ 0 or 1 days, not 2 or more.

How to combine multiple SELECTs into a single SELECT by a common column in (BigQuery) SQL?

Given I have multiple tables in BigQuery, hence I have multiple SQL-statements that gives me "the number of X per day". For example:
SELECT FORMAT_TIMESTAMP("%F",timestamp) AS day, COUNT(*) as installs
FROM database.table1
GROUP BY day
ORDER BY day ASC
Which would give the result:
| day | installs |
-------------------------
| 2017-01-01 | 11 |
| 2017-01-02 | 22 |
etc
Another statement:
SELECT FORMAT_TIMESTAMP("%F",timestamp) AS day, COUNT(*) as uninstalls
FROM database.table2
GROUP BY day
ORDER BY day ASC
Which would give the result:
| day | uninstalls |
---------------------------
| 2017-01-02 | 22 |
| 2017-01-03 | 33 |
etc
Another statement:
SELECT FORMAT_TIMESTAMP("%F",timestamp) AS day, COUNT(*) as cases
FROM database.table3
GROUP BY day
ORDER BY day ASC
Which would give the result:
| day | cases |
----------------------
| 2017-01-01 | 11 |
| 2017-01-03 | 33 |
etc
etc
Now I need to combine all these into a single SELECT statement that gives the following results:
| day | installs | uninstalls | cases |
----------------------------------------------
| 2017-01-01 | 11 | 0 | 11 |
| 2017-01-02 | 22 | 22 | 0 |
| 2017-01-03 | 0 | 33 | 33 |
etc
Is this even possible?
Or what's the closest SQL-statement I can write that would give me a similar result?
Any feedback is appreciated!
Here is a self-contained example that might help to get you started. It uses two dummy tables, InstallEvents and UninstallEvents, which contain timestamps for the respective actions. It creates a common table expression called StartAndEnd that computes the minimum and maximum dates for these events in order to decide which dates to aggregate over, then unions the contents of the InstallEvents and UninstallEvents, counting the events for each day.
WITH InstallEvents AS (
SELECT TIMESTAMP_ADD('2017-01-01 00:00:00', INTERVAL x HOUR) AS timestamp
FROM UNNEST(GENERATE_ARRAY(0, 100)) AS x
),
UninstallEvents AS (
SELECT TIMESTAMP_ADD('2017-01-02 00:00:00', INTERVAL 2 * x HOUR) AS timestamp
FROM UNNEST(GENERATE_ARRAY(0, 50)) AS x
),
StartAndEnd AS (
SELECT MIN(DATE(timestamp)) AS min_date, MAX(DATE(timestamp)) AS max_date
FROM (
SELECT * FROM InstallEvents UNION ALL
SELECT * FROM UninstallEvents
)
)
SELECT
day,
COUNTIF(is_install AND DATE(timestamp) = day) AS installs,
COUNTIF(NOT is_install AND DATE(timestamp) = day) AS uninstalls
FROM (
SELECT *, true AS is_install
FROM InstallEvents UNION ALL
SELECT *, false
FROM UninstallEvents
)
CROSS JOIN UNNEST(GENERATE_DATE_ARRAY(
(SELECT min_date FROM StartAndEnd),
(SELECT max_date FROM StartAndEnd)
)) AS day
GROUP BY day
ORDER BY day;
If you know what the start and end dates are in advance, you can hard-code them in the query instead and then omit the StartAndEnd CTE:
WITH InstallEvents AS (
SELECT TIMESTAMP_ADD('2017-01-01 00:00:00', INTERVAL x HOUR) AS timestamp
FROM UNNEST(GENERATE_ARRAY(0, 100)) AS x
),
UninstallEvents AS (
SELECT TIMESTAMP_ADD('2017-01-02 00:00:00', INTERVAL 2 * x HOUR) AS timestamp
FROM UNNEST(GENERATE_ARRAY(0, 50)) AS x
)
SELECT
day,
COUNTIF(is_install AND DATE(timestamp) = day) AS installs,
COUNTIF(NOT is_install AND DATE(timestamp) = day) AS uninstalls
FROM (
SELECT *, true AS is_install
FROM InstallEvents UNION ALL
SELECT *, false
FROM UninstallEvents
)
CROSS JOIN UNNEST(GENERATE_DATE_ARRAY('2017-01-01', '2017-01-04')) AS day
GROUP BY day
ORDER BY day;
To see the events in the sample data, use a query that unions the contents:
WITH InstallEvents AS (
SELECT TIMESTAMP_ADD('2017-01-01 00:00:00', INTERVAL x HOUR) AS timestamp
FROM UNNEST(GENERATE_ARRAY(0, 100)) AS x
),
UninstallEvents AS (
SELECT TIMESTAMP_ADD('2017-01-02 00:00:00', INTERVAL 2 * x HOUR) AS timestamp
FROM UNNEST(GENERATE_ARRAY(0, 50)) AS x
)
SELECT timestamp, true AS is_install
FROM InstallEvents UNION ALL
SELECT timestamp, false
FROM UninstallEvents;
Below is for BigQuery Standard SQL
#standardSQL
WITH calendar AS (
SELECT day
FROM (
SELECT MIN(min_day) AS min_day, MAX(max_day) AS max_day
FROM (
SELECT MIN(DATE(timestamp)) AS min_day, MAX(DATE(timestamp)) AS max_day FROM `database.table1` UNION ALL
SELECT MIN(DATE(timestamp)) AS min_day, MAX(DATE(timestamp)) AS max_day FROM `database.table2` UNION ALL
SELECT MIN(DATE(timestamp)) AS min_day, MAX(DATE(timestamp)) AS max_day FROM `database.table3`
)
), UNNEST(GENERATE_DATE_ARRAY(min_day, max_day, INTERVAL 1 DAY)) AS day
)
SELECT
c.day AS day,
IFNULL(SUM(installs), 0) AS installs,
IFNULL(SUM(uninstalls), 0) AS uninstalls,
IFNULL(SUM(cases),0) AS cases
FROM calendar AS c
LEFT JOIN (SELECT DATE(timestamp) day, COUNT(1) installs FROM `database.table1` GROUP BY day) t1 ON t1.day = c.day
LEFT JOIN (SELECT DATE(timestamp) day, COUNT(1) uninstalls FROM `database.table2` GROUP BY day) t2 ON t2.day = c.day
LEFT JOIN (SELECT DATE(timestamp) day, COUNT(1) cases FROM `database.table3` GROUP BY day) t3 ON t3.day = c.day
GROUP BY day
HAVING installs + uninstalls + cases > 0
-- ORDER BY day
Please note: you are using timestamp as a column name which is not the best practice as it is keyword, so in my example i leave your naming but consider to change this!
You can test / play this solution with below dummy data
#standardSQL
WITH `database.table1` AS (
SELECT TIMESTAMP '2017-01-01' AS timestamp, 1 AS installs
UNION ALL SELECT TIMESTAMP '2017-01-01', 22
),
`database.table2` AS (
SELECT TIMESTAMP '2016-12-01' AS timestamp, 1 AS installs UNION ALL SELECT TIMESTAMP '2017-01-01', 22 UNION ALL SELECT TIMESTAMP '2017-01-01', 22 UNION ALL
SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22
),
`database.table3` AS (
SELECT TIMESTAMP '2017-01-01' AS timestamp, 1 AS installs UNION ALL SELECT TIMESTAMP '2017-01-01', 22 UNION ALL SELECT TIMESTAMP '2017-01-01', 22 UNION ALL
SELECT TIMESTAMP '2017-01-10', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22
),
calendar AS (
SELECT day
FROM (
SELECT MIN(min_day) AS min_day, MAX(max_day) AS max_day
FROM (
SELECT MIN(DATE(timestamp)) AS min_day, MAX(DATE(timestamp)) AS max_day FROM `database.table1` UNION ALL
SELECT MIN(DATE(timestamp)) AS min_day, MAX(DATE(timestamp)) AS max_day FROM `database.table2` UNION ALL
SELECT MIN(DATE(timestamp)) AS min_day, MAX(DATE(timestamp)) AS max_day FROM `database.table3`
)
), UNNEST(GENERATE_DATE_ARRAY(min_day, max_day, INTERVAL 1 DAY)) AS day
)
SELECT
c.day AS day,
IFNULL(SUM(installs), 0) AS installs,
IFNULL(SUM(uninstalls), 0) AS uninstalls,
IFNULL(SUM(cases),0) AS cases
FROM calendar AS c
LEFT JOIN (SELECT DATE(timestamp) day, COUNT(1) installs FROM `database.table1` GROUP BY day) t1 ON t1.day = c.day
LEFT JOIN (SELECT DATE(timestamp) day, COUNT(1) uninstalls FROM `database.table2` GROUP BY day) t2 ON t2.day = c.day
LEFT JOIN (SELECT DATE(timestamp) day, COUNT(1) cases FROM `database.table3` GROUP BY day) t3 ON t3.day = c.day
GROUP BY day
HAVING installs + uninstalls + cases > 0
ORDER BY day
I am not very familiar with bigquery, so this is probably not going to be a copy-paste answer.
You'll first have to build a calander table to make sure you have all dates. Here's an example for sql server. There are probably examples for bigquery available as well. The following assumes a Calander table with Date attribute in timestamp.
Once you have your calander table you can join all your tables to that:
SELECT FORMAT_TIMESTAMP("%F",C.Date) AS day
, COUNT(T1.DATE(T1.TIMESTAMP)) AS installs --Here you could also use your FORMAT_TIMESTAMP
, COUNT(T1.DATE(T2.TIMESTAMP)) AS uninstalls
FROM Calander C
LEFT JOIN database.table1 T1
ON DATE(T1.TIMESTAMP) = DATE(C.Date) --Convert to date to remove times, you could also use your FORMAT_TIMESTAMP
LEFT JOIN database.table2 T2
ON DATE(T2.TIMESTAMP) = DATE(C.Date)
GROUP BY day
ORDER BY day ASC