I am using Postgres 9.3.3
I have a table with multiple events, two of them are "AVAILABLE" and "UNAVAILABLE". These events are assigned to a specific object. There are also other object ids in this table (removed for clarity):
What I need is the "available" time per day, something like that:
SQL Fiddle
-- Available time per object and calendar day.
select
object_id, day,
-- total length of the day-clipped availability ranges
sum(upper(available) - lower(available)) as available
from (
select
g.object_id, date_trunc('day', d) as day,
(
-- range intersection: clip the availability range to the day [midnight, next midnight)
available *
tsrange(date_trunc('day', d), date_trunc('day', d)::date + 1, '[)')
) as available
from
(
-- one range per AVAILABLE/UNAVAILABLE event, running from that event to the
-- next event of the same object
select
object_id, event,
tsrange(
timestamp,
lead(timestamp) over(
partition by object_id order by timestamp
),
'[)'
) as available
from events
where event in ('AVAILABLE', 'UNAVAILABLE')
) s
-- right join keeps every (day, object) pair, so days without availability
-- still appear with a NULL total
right join
(
-- one row per day step between the earliest and latest event, for every object
generate_series(
(select min(timestamp) from events),
(select max(timestamp) from events),
'1 day'
) g (d)
cross join
(select distinct object_id from events) s
) g on
-- match only ranges that overlap the day, start with an AVAILABLE event,
-- and belong to the same object
tsrange(date_trunc('day', d), date_trunc('day', d)::date + 1, '[)') && available and
(event = 'AVAILABLE' or event is null) and
g.object_id = s.object_id
) s
-- positional references: 1 = object_id, 2 = day
group by 1, 2
order by 1, 2
psql output
object_id | day | available
-----------+---------------------+-----------
1 | 1970-01-02 00:00:00 | 12:00:00
1 | 1970-01-03 00:00:00 | 12:00:00
1 | 1970-01-04 00:00:00 |
1 | 1970-01-05 00:00:00 | 1 day
1 | 1970-01-06 00:00:00 | 1 day
1 | 1970-01-07 00:00:00 | 12:00:00
Table DDL
-- Sample schema: one row per availability event of an object.
CREATE TABLE events (
    object_id   INT,
    event       TEXT,
    "timestamp" TIMESTAMP
);

-- Object 1 alternates between AVAILABLE and UNAVAILABLE.
INSERT INTO events (object_id, event, "timestamp")
VALUES
    (1, 'AVAILABLE',   '1970-01-02 12:00:00'),
    (1, 'UNAVAILABLE', '1970-01-03 12:00:00'),
    (1, 'AVAILABLE',   '1970-01-05 00:00:00'),
    (1, 'UNAVAILABLE', '1970-01-07 12:00:00');
Your example output suggests that you want all your objects to be returned, but grouped. If that is the case, this query can do that
-- Available time per object and day over a fixed calendar (1970-01-01 .. 1970-01-07).
with availability as (
    -- range from each AVAILABLE event to the next event of the same object;
    -- rows for other events carry a NULL range
    select
        object_id,
        case
            when event = 'AVAILABLE' then
                tsrange(timestamp, lead(timestamp) over (partition by object_id order by timestamp))
        end as available
    from events
    where event in ('AVAILABLE', 'UNAVAILABLE')
),
calendar as (
    -- one row per day with its [midnight, next midnight) range
    select
        day_start,
        tsrange(day_start, day_start + interval '1 day') as day_range
    from generate_series(timestamp '1970-01-01', '1970-01-07', interval '1 day') as day_start
),
clipped as (
    -- keep every day; clip each overlapping availability range to that day
    select
        a.object_id,
        date(c.day_start) as day,
        a.available * c.day_range as slice
    from calendar as c
    left join availability as a
        on a.available && c.day_range
)
select
    object_id,
    day,
    sum(upper(slice) - lower(slice))
from clipped
group by object_id, day
order by day, object_id
But that will output something like that (if you have multiple object_ids):
object_id | day | sum
-----------+--------------+-----------
| '1970-01-01' |
1 | '1970-01-02' | '12:00:00'
1 | '1970-01-03' | '12:00:00'
| '1970-01-04' |
1 | '1970-01-05' | '1 day'
1 | '1970-01-06' | '1 day'
2 | '1970-01-06' | '12:00:00'
1 | '1970-01-07' | '12:00:00'
In my opinion it would make much more sense, if you would query just one object at a time:
-- Available time per day for a single object (object_id = 1).
with availability as (
    -- range from each AVAILABLE event to the object's next event; NULL otherwise
    select
        case
            when event = 'AVAILABLE' then
                tsrange(timestamp, lead(timestamp) over (partition by object_id order by timestamp))
        end as available
    from events
    where event in ('AVAILABLE', 'UNAVAILABLE')
      and object_id = 1
),
calendar as (
    -- one row per day with its [midnight, next midnight) range
    select
        day_start,
        tsrange(day_start, day_start + interval '1 day') as day_range
    from generate_series(timestamp '1970-01-01', '1970-01-07', interval '1 day') as day_start
),
clipped as (
    -- keep every day; clip each overlapping availability range to that day
    select
        date(c.day_start) as day,
        a.available * c.day_range as slice
    from calendar as c
    left join availability as a
        on a.available && c.day_range
)
select
    day,
    sum(upper(slice) - lower(slice))
from clipped
group by day
order by day
This will output something, like:
day | sum
--------------+----------
'1970-01-01' |
'1970-01-02' | '12:00:00'
'1970-01-03' | '12:00:00'
'1970-01-04' |
'1970-01-05' | '1 day'
'1970-01-06' | '1 day'
'1970-01-07' | '12:00:00'
I used this schema/data for my outputs:
-- Sample schema: one row per availability event of an object.
CREATE TABLE events (
    object_id   INT,
    event       TEXT,
    "timestamp" TIMESTAMP
);

-- Two objects; object 2 has two short availability windows on 1970-01-06.
INSERT INTO events (object_id, event, "timestamp")
VALUES
    (1, 'AVAILABLE',   '1970-01-02 12:00:00'),
    (1, 'UNAVAILABLE', '1970-01-03 12:00:00'),
    (1, 'AVAILABLE',   '1970-01-05 00:00:00'),
    (1, 'UNAVAILABLE', '1970-01-07 12:00:00'),
    (2, 'AVAILABLE',   '1970-01-06 00:00:00'),
    (2, 'UNAVAILABLE', '1970-01-06 06:00:00'),
    (2, 'AVAILABLE',   '1970-01-06 12:00:00'),
    (2, 'UNAVAILABLE', '1970-01-06 18:00:00');
This is a partial answer. If we assume that the next event after available is unavailable, then lead() comes to the rescue and the following is a start:
-- Available time per object, attributed to the day on which each AVAILABLE
-- event starts (an interval that spans several days is NOT split per day).
select object_id,
       to_char(timestamp, 'YYYY-MM-DD') as day,
       -- the interval must be aggregated: it is not part of the GROUP BY key
       to_char(sum(nextts - timestamp), 'HH24:MI') as interval
from (
    -- pair every event with the timestamp of the object's next event
    select t.*,
           lead(timestamp) over (partition by object_id order by timestamp) as nextts
    from events t  -- "table" is a reserved word and cannot be used as a table name
    where event in ('AVAILABLE', 'UNAVAILABLE')
) t
where event = 'AVAILABLE'
group by object_id, to_char(timestamp, 'YYYY-MM-DD');
I suspect, though, that when the interval spans multiple days, you want to split the days into separate parts. This becomes more of a challenge.
Related
I have a database table that represents activities and for each activity, how long it took.
It looks something like this :
activity_id | name | status | start_date | end_date
=================================================================
1 | name1 | WIP | 2019-07-24 ... | 2019-07-24 ...
start_date and end_date are timestamps. I use a view with a column total_time that is described like that:
-- total_time, expressed in days: the whole-day component of the elapsed interval
-- plus its hour component as a fraction of a day. A NULL end_date falls back to
-- CURRENT_TIMESTAMP. Minutes and seconds are not included.
date_part('day'::text,
COALESCE(sprint_activity.end_date::timestamp with time zone, CURRENT_TIMESTAMP)
- sprint_activity.start_date::timestamp with time zone
) + date_part('hour'::text,
COALESCE(sprint_activity.end_date::timestamp with time zone, CURRENT_TIMESTAMP)
- sprint_activity.start_date::timestamp with time zone
) / 24::double precision AS total_time
I would like to create a table for vacation or half day vacations that looks like:
date | work_percentage
=================================================
2019-07-24 | 0.4
2019-07-23 | 0.7
And then, I would like to calculate total_time in a way that uses this vacations table such that:
If a date is not in the column it's considered to have work_percentage==1
For every date that is in the table, reduce the relative percentage from the total_time query.
So let's take an example:
Activity - "Write report" started at 11-July-2019 14:00 and ended at 15-July-2019 19:00 - so the time diff is 4 days and 5 hours.
The 13th and 14th were weekend so I'd like to have a column in the vacations table that holds 2019-07-13 with work_percentage == 1 and the same for the 14th.
Deducting those vacations, the time diff would be 2 days and 5 hours as the 13th and 14th are not workdays.
Hope this example explains it better.
I think you can take this example and add some modifications based on your database
Just ddl statements to test script
-- Test schema: activities with a start/end timestamp, owned by a user.
CREATE TABLE activities (
    user_id     INT,
    activity_id INT,
    name        TEXT,
    status      TEXT,
    start_date  TIMESTAMP,
    end_date    TIMESTAMP
);

-- Per-user, per-date fraction of the day that counts as work time.
CREATE TABLE vacations (
    user_id         INT,
    date            DATE,
    work_percentage NUMERIC
);

INSERT INTO activities (user_id, activity_id, name, status, start_date, end_date)
VALUES
    (1,  1, 'name1',        'WIP',  TIMESTAMP '2019-07-20 10:00:00', TIMESTAMP '2019-07-25 8:00:00'),
    (2,  2, 'name2',        'DONE', TIMESTAMP '2019-07-28 19:00:00', TIMESTAMP '2019-08-01 7:00:00'),
    (1,  3, 'name3',        'DONE', TIMESTAMP '2019-07-21 12:00:00', TIMESTAMP '2019-07-21 15:00:00'),
    (-1, 4, 'Write report', 'DONE', TIMESTAMP '2019-07-11 14:00:00', TIMESTAMP '2019-07-15 19:00:00');

INSERT INTO vacations (user_id, date, work_percentage)
VALUES
    (1,  DATE '2019-07-21', 0.5),
    (1,  DATE '2019-07-22', 0),
    (1,  DATE '2019-07-23', 0.25),
    (2,  DATE '2019-07-29', 0),
    (2,  DATE '2019-07-30', 0),
    (-1, DATE '2019-07-13', 0),
    (-1, DATE '2019-07-14', 0);
sql script
-- Total duration of each activity, with every calendar day weighted by the
-- owner's work_percentage for that date (1 when no vacations row exists).
with activity_days as (
    -- expand each activity into one row per calendar day it touches
    select
        a.*,
        generate_series(
            a.start_date::date,
            a.end_date::date,
            interval '1 day'
        )::date as date_key
    from activities as a
),
weighted_days as (
    select
        ad.*,
        v.work_percentage,
        (
            case
                -- activity starts and ends on the same day
                when ad.start_date::date = ad.end_date::date
                    then ad.end_date - ad.start_date
                -- first day: from the start until midnight
                when ad.start_date::date = ad.date_key
                    then (ad.start_date::date + 1) - ad.start_date
                -- last day: from midnight until the end
                when ad.end_date::date = ad.date_key
                    then ad.end_date - ad.end_date::date
                -- any day fully inside the activity
                else interval '24 hours'
            end
        ) * coalesce(v.work_percentage, 1) as activity_coverage
    from activity_days as ad
    left join vacations as v
        on  v.user_id = ad.user_id
        and v.date = ad.date_key
)
select
    user_id,
    activity_id,
    name,
    status,
    start_date,
    end_date,
    justify_interval(sum(activity_coverage)) as total_activity_time
from weighted_days
group by
    user_id,
    activity_id,
    name,
    status,
    start_date,
    end_date
I'm a bit stuck on how to proceed with my queries.
I have two tables "A"(date, response, b_id) and "B"(id, country). I need to count hourly ratio of a number of entries where response exists to the total number of entries on a specific date. The final selection should consist of columns "hour", "ratio".
-- counting entries with response
SELECT COUNT(*) FROM A WHERE RESPONSE IS NOT NULL;
-- counting total number of entries
SELECT COUNT(*) FROM A;
How to count the ratio? Should I create a separate variable for it?
How to count for each hour on a day? Should I make something like a loop? + How can I get the "hour" part of a date?
What is the best way to select the hours and counted ratio? Should I make a separate table for it?
I'm rather new to writing complex queries, so I would be happy for any kind of help.
You can do this as:
-- Per hour of 2018-09-18: rows with a response, all rows, and their ratio.
SELECT
    TO_CHAR(datecol, 'HH24')   AS hour,
    COUNT(response)            AS has_response,  -- COUNT(col) skips NULLs
    COUNT(*)                   AS total,
    COUNT(response) / COUNT(*) AS ratio
FROM a
WHERE datecol >= DATE '2018-09-18'
  AND datecol <  DATE '2018-09-19'
GROUP BY TO_CHAR(datecol, 'HH24');
You can also do this using avg() -- which is also fun:
-- Same ratio as an average of a 0/1 flag: 1.0 when a response exists, 0 otherwise.
SELECT
    to_char(datecol, 'HH24'),
    AVG(
        CASE
            WHEN response IS NULL THEN 0
            ELSE 1.0
        END
    ) AS ratio
FROM a
WHERE datecol >= DATE '2018-09-18'
  AND datecol <  DATE '2018-09-19'
GROUP BY to_char(datecol, 'HH24')
In this case, that requires more typing, though.
SQL Fiddle
Oracle 11g R2 Schema Setup:
-- Sample data for 2018-09-18: eight rows spread over hours 00, 01, 02, 03 and 05,
-- two of them (00:00 and 02:00) without a response. Column types are taken from
-- the select expressions.
CREATE TABLE A ( dt, response, b_id ) AS
SELECT DATE '2018-09-18' + INTERVAL '00:00' HOUR TO MINUTE, NULL, 1 FROM DUAL UNION ALL
SELECT DATE '2018-09-18' + INTERVAL '00:10' HOUR TO MINUTE, 'A', 1 FROM DUAL UNION ALL
SELECT DATE '2018-09-18' + INTERVAL '00:20' HOUR TO MINUTE, 'B', 1 FROM DUAL UNION ALL
SELECT DATE '2018-09-18' + INTERVAL '01:00' HOUR TO MINUTE, 'C', 1 FROM DUAL UNION ALL
SELECT DATE '2018-09-18' + INTERVAL '01:10' HOUR TO MINUTE, 'D', 1 FROM DUAL UNION ALL
SELECT DATE '2018-09-18' + INTERVAL '02:00' HOUR TO MINUTE, NULL, 1 FROM DUAL UNION ALL
SELECT DATE '2018-09-18' + INTERVAL '03:00' HOUR TO MINUTE, 'E', 1 FROM DUAL UNION ALL
SELECT DATE '2018-09-18' + INTERVAL '05:10' HOUR TO MINUTE, 'F', 1 FROM DUAL;
Query 1:
-- Per b_id and hour: hourly counts next to the daily totals, and each hour's
-- share of the day's responses and of the day's rows.
WITH with_daily_totals AS (
    -- attach the per-day totals (per b_id) to every row
    SELECT
        A.*,
        COUNT(RESPONSE) OVER (PARTITION BY b_id, TRUNC(dt)) AS total_response_per_day,
        COUNT(*)        OVER (PARTITION BY b_id, TRUNC(dt)) AS total_per_day
    FROM A
)
SELECT
    b_id,
    TO_CHAR(TRUNC(dt, 'HH'), 'YYYY-MM-DD HH24:MI:SS') AS hour,
    COUNT(RESPONSE)                                   AS total_response_per_hour,
    COUNT(*)                                          AS total_per_hour,
    total_response_per_day,
    total_per_day,
    COUNT(response) / total_response_per_day          AS ratio_for_responses,
    COUNT(*) / total_per_day                          AS ratio
FROM with_daily_totals
GROUP BY
    b_id,
    total_per_day,
    total_response_per_day,
    TRUNC(dt, 'HH')
ORDER BY
    TRUNC(dt, 'HH')
Results:
| B_ID | HOUR | TOTAL_RESPONSE_PER_HOUR | TOTAL_PER_HOUR | TOTAL_RESPONSE_PER_DAY | TOTAL_PER_DAY | RATIO_FOR_RESPONSES | RATIO |
|------|---------------------|-------------------------|----------------|------------------------|---------------|---------------------|-------|
| 1 | 2018-09-18 00:00:00 | 2 | 3 | 6 | 8 | 0.3333333333333333 | 0.375 |
| 1 | 2018-09-18 01:00:00 | 2 | 2 | 6 | 8 | 0.3333333333333333 | 0.25 |
| 1 | 2018-09-18 02:00:00 | 0 | 1 | 6 | 8 | 0 | 0.125 |
| 1 | 2018-09-18 03:00:00 | 1 | 1 | 6 | 8 | 0.16666666666666666 | 0.125 |
| 1 | 2018-09-18 05:00:00 | 1 | 1 | 6 | 8 | 0.16666666666666666 | 0.125 |
-- Per labelled hour: rows with a response, all rows, and their ratio.
-- A single pass over A replaces the former comma join of two aggregates, which
-- silently dropped every hour that had no response at all (ratio 0).
SELECT to_char(d, 'DD-MM-YY - HH24') || ':00 to :59 ' AS hour,
       COUNT(response)            AS withResponse,  -- COUNT(col) skips NULLs
       COUNT(*)                   AS AllEntries,
       COUNT(response) / COUNT(*) AS ratio
FROM A
GROUP BY to_char(d, 'DD-MM-YY - HH24') || ':00 to :59 ';
SQLFiddle: http://sqlfiddle.com/#!4/c09b9/2
I have the following SUB QUERY as part of a SELECT statement. Which is supposed to take one calculated time, away from another calculated time.
However Postgres doesn't like having the Window function within the FROM clause.
-- Scalar subquery: counts the hourly steps h that fall on Monday-Friday
-- (ISODOW < 6) between 08:00 and 18:00 inclusive. The series bounds are built
-- from wog.endtime (divided by 1000 and added to the epoch as seconds) using
-- window functions, which Postgres rejects inside a function call in FROM.
(SELECT count(*) AS work_hours
FROM generate_series (TIMESTAMP 'epoch' + MAX(wog.endtime) OVER(PARTITION BY woas.workorderid ORDER BY wog.endtime DESC)/1000 * INTERVAL '1 second'
, TIMESTAMP 'epoch' + nth_value(wog.endtime,2) OVER(PARTITION BY woas.workorderid ORDER BY wog.endtime DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)/1000 * INTERVAL '1 second' - interval '1h'
, interval '1h') h
WHERE EXTRACT(ISODOW FROM h) < 6
AND h::time >= '08:00'
AND h::time <= '18:00') AS "Max minus Second Max",
Postgres returns the following error:
ERROR: cannot use window function in function expression in FROM
How can I reformat the above statement, so that it parses without error?
Update:
I dont think the structure of the query is the issue. If I put timestamp string in place of the functions it works fine.
-- Same subquery with literal timestamps as the series bounds: counts the hourly
-- steps h on Monday-Friday (ISODOW < 6) between 08:00 and 18:00 inclusive.
(SELECT count(*) AS work_hours
FROM generate_series (timestamp '2018-01-06 13:30'
, timestamp '2018-01-08 21:29' - interval '1h'
, interval '1h') h
WHERE EXTRACT(ISODOW FROM h) < 6
AND h::time >= '08:00'
AND h::time <= '18:00') "Time Difference" from workorder wo
After "FROM" clause a table/view to query the data from is expected.
You refactor the query like this:
-- Sketch: produce the series in the select list of a derived table, then
-- filter and count its rows in the outer query. The derived table must be
-- closed and aliased, and the filter belongs outside it so it can reference h.
select count(*)
from (
    select generate_series( ....) as h
) s
where (cond1 and cond2..)
Obviously it will not work when you put the function in the "from" clause.
If you don't already have a solution, you might want to try this. I think it does what you want.
Set-up test: similar enough to your situation, I hope.
-- Test table: end times of work orders, stored as epoch milliseconds.
CREATE TABLE work_order_times (
    work_order_id INTEGER,
    end_time      BIGINT  -- milliseconds
);

-- Work order 23: two end times, 20000 seconds apart.
INSERT INTO work_order_times (work_order_id, end_time)
VALUES (23, extract(epoch from now()) * 1000);
INSERT INTO work_order_times (work_order_id, end_time)
VALUES (23, (extract(epoch from now()) - 20000) * 1000);

-- Work order 57: three end times, 20000 seconds apart.
INSERT INTO work_order_times (work_order_id, end_time)
VALUES (57, (extract(epoch from now()) - 40000) * 1000);
INSERT INTO work_order_times (work_order_id, end_time)
VALUES (57, (extract(epoch from now()) - 60000) * 1000);
INSERT INTO work_order_times (work_order_id, end_time)
VALUES (57, (extract(epoch from now()) - 80000) * 1000);
Check set-up:
-- Show every stored end time next to its timestamp equivalent.
SELECT
    wot.work_order_id,
    wot.end_time,
    to_timestamp(wot.end_time / 1000) AS end_timestamp  -- milliseconds -> seconds
FROM work_order_times AS wot
ORDER BY
    wot.work_order_id,
    wot.end_time;
work_order_id | end_time | end_timestamp
---------------+---------------+------------------------
23 | 1516251234772 | 2018-01-18 04:53:54+00
23 | 1516271234769 | 2018-01-18 10:27:14+00
57 | 1516191234774 | 2018-01-17 12:13:54+00
57 | 1516211234773 | 2018-01-17 17:47:14+00
57 | 1516231234772 | 2018-01-17 23:20:34+00
(5 rows)
The Query:
-- For each work order, emit one row per hour from its second-latest end time
-- up to its latest end time.
with ranked as (
    -- walk each work order's end times from newest to oldest
    select
        work_order_id,
        row_number() over newest_first as position,
        max(end_time) over newest_first as newest_ms,
        lead(end_time) over newest_first as second_newest_ms
    from work_order_times
    window newest_first as (partition by work_order_id order by end_time desc)
),
bounds as (
    -- the newest row of each work order carries both series bounds
    select
        work_order_id,
        to_timestamp(second_newest_ms / 1000) as series_start,
        to_timestamp(newest_ms / 1000) as series_end
    from ranked
    where position = 1
)
select
    work_order_id,
    generate_series (series_start, series_end, interval '1 hour')
from bounds;
The results:
work_order_id | generate_series
---------------+------------------------
23 | 2018-01-18 04:53:54+00
23 | 2018-01-18 05:53:54+00
23 | 2018-01-18 06:53:54+00
23 | 2018-01-18 07:53:54+00
23 | 2018-01-18 08:53:54+00
23 | 2018-01-18 09:53:54+00
57 | 2018-01-17 17:47:14+00
57 | 2018-01-17 18:47:14+00
57 | 2018-01-17 19:47:14+00
57 | 2018-01-17 20:47:14+00
57 | 2018-01-17 21:47:14+00
57 | 2018-01-17 22:47:14+00
(12 rows)
The PostgreSQL documentation does mention some restrictions about how window functions can be nested, but it seems to work like this.
It would make things slightly simpler to store the endtime as a timestamp rather than a number of milliseconds, which is what it appears to be, but maybe you don't have the opportunity to do that.
Given I have multiple tables in BigQuery, hence I have multiple SQL-statements that gives me "the number of X per day". For example:
-- Number of rows in table1 per calendar day.
SELECT
    FORMAT_TIMESTAMP("%F", timestamp) AS day,
    COUNT(*) AS installs
FROM database.table1
GROUP BY day
ORDER BY day
Which would give the result:
| day | installs |
-------------------------
| 2017-01-01 | 11 |
| 2017-01-02 | 22 |
etc
Another statement:
-- Number of rows in table2 per calendar day.
SELECT
    FORMAT_TIMESTAMP("%F", timestamp) AS day,
    COUNT(*) AS uninstalls
FROM database.table2
GROUP BY day
ORDER BY day
Which would give the result:
| day | uninstalls |
---------------------------
| 2017-01-02 | 22 |
| 2017-01-03 | 33 |
etc
Another statement:
-- Number of rows in table3 per calendar day.
SELECT
    FORMAT_TIMESTAMP("%F", timestamp) AS day,
    COUNT(*) AS cases
FROM database.table3
GROUP BY day
ORDER BY day
Which would give the result:
| day | cases |
----------------------
| 2017-01-01 | 11 |
| 2017-01-03 | 33 |
etc
etc
Now I need to combine all these into a single SELECT statement that gives the following results:
| day | installs | uninstalls | cases |
----------------------------------------------
| 2017-01-01 | 11 | 0 | 11 |
| 2017-01-02 | 22 | 22 | 0 |
| 2017-01-03 | 0 | 33 | 33 |
etc
Is this even possible?
Or what's the closest SQL-statement I can write that would give me a similar result?
Any feedback is appreciated!
Here is a self-contained example that might help to get you started. It uses two dummy tables, InstallEvents and UninstallEvents, which contain timestamps for the respective actions. It creates a common table expression called StartAndEnd that computes the minimum and maximum dates for these events in order to decide which dates to aggregate over, then unions the contents of the InstallEvents and UninstallEvents, counting the events for each day.
-- Dummy install events: one timestamp per element of GENERATE_ARRAY(0, 100),
-- x hours after 2017-01-01 00:00:00.
WITH InstallEvents AS (
SELECT TIMESTAMP_ADD('2017-01-01 00:00:00', INTERVAL x HOUR) AS timestamp
FROM UNNEST(GENERATE_ARRAY(0, 100)) AS x
),
-- Dummy uninstall events: one timestamp per element of GENERATE_ARRAY(0, 50),
-- 2 * x hours after 2017-01-02 00:00:00.
UninstallEvents AS (
SELECT TIMESTAMP_ADD('2017-01-02 00:00:00', INTERVAL 2 * x HOUR) AS timestamp
FROM UNNEST(GENERATE_ARRAY(0, 50)) AS x
),
-- Earliest and latest event date across both event sets.
StartAndEnd AS (
SELECT MIN(DATE(timestamp)) AS min_date, MAX(DATE(timestamp)) AS max_date
FROM (
SELECT * FROM InstallEvents UNION ALL
SELECT * FROM UninstallEvents
)
)
-- Every event is paired with every day of the range; COUNTIF then counts only
-- the events whose date equals that day, so days without events yield 0.
SELECT
day,
COUNTIF(is_install AND DATE(timestamp) = day) AS installs,
COUNTIF(NOT is_install AND DATE(timestamp) = day) AS uninstalls
FROM (
-- tag each event with its kind
SELECT *, true AS is_install
FROM InstallEvents UNION ALL
SELECT *, false
FROM UninstallEvents
)
CROSS JOIN UNNEST(GENERATE_DATE_ARRAY(
(SELECT min_date FROM StartAndEnd),
(SELECT max_date FROM StartAndEnd)
)) AS day
GROUP BY day
ORDER BY day;
If you know what the start and end dates are in advance, you can hard-code them in the query instead and then omit the StartAndEnd CTE:
-- Dummy install events: x hours after 2017-01-01 00:00:00 for each x in GENERATE_ARRAY(0, 100).
WITH InstallEvents AS (
SELECT TIMESTAMP_ADD('2017-01-01 00:00:00', INTERVAL x HOUR) AS timestamp
FROM UNNEST(GENERATE_ARRAY(0, 100)) AS x
),
-- Dummy uninstall events: 2 * x hours after 2017-01-02 00:00:00 for each x in GENERATE_ARRAY(0, 50).
UninstallEvents AS (
SELECT TIMESTAMP_ADD('2017-01-02 00:00:00', INTERVAL 2 * x HOUR) AS timestamp
FROM UNNEST(GENERATE_ARRAY(0, 50)) AS x
)
-- Same per-day counts as with a computed range, but over a hard-coded date range.
SELECT
day,
COUNTIF(is_install AND DATE(timestamp) = day) AS installs,
COUNTIF(NOT is_install AND DATE(timestamp) = day) AS uninstalls
FROM (
-- tag each event with its kind
SELECT *, true AS is_install
FROM InstallEvents UNION ALL
SELECT *, false
FROM UninstallEvents
)
-- pair every event with every day; COUNTIF keeps only matching dates
CROSS JOIN UNNEST(GENERATE_DATE_ARRAY('2017-01-01', '2017-01-04')) AS day
GROUP BY day
ORDER BY day;
To see the events in the sample data, use a query that unions the contents:
-- Dummy install events: x hours after 2017-01-01 00:00:00 for each x in GENERATE_ARRAY(0, 100).
WITH InstallEvents AS (
SELECT TIMESTAMP_ADD('2017-01-01 00:00:00', INTERVAL x HOUR) AS timestamp
FROM UNNEST(GENERATE_ARRAY(0, 100)) AS x
),
-- Dummy uninstall events: 2 * x hours after 2017-01-02 00:00:00 for each x in GENERATE_ARRAY(0, 50).
UninstallEvents AS (
SELECT TIMESTAMP_ADD('2017-01-02 00:00:00', INTERVAL 2 * x HOUR) AS timestamp
FROM UNNEST(GENERATE_ARRAY(0, 50)) AS x
)
-- List every sample event together with a flag telling which set it came from.
SELECT timestamp, true AS is_install
FROM InstallEvents UNION ALL
SELECT timestamp, false
FROM UninstallEvents;
Below is for BigQuery Standard SQL
#standardSQL
-- calendar: one row per date between the earliest and the latest event date
-- found across the three tables.
WITH calendar AS (
SELECT day
FROM (
SELECT MIN(min_day) AS min_day, MAX(max_day) AS max_day
FROM (
SELECT MIN(DATE(timestamp)) AS min_day, MAX(DATE(timestamp)) AS max_day FROM `database.table1` UNION ALL
SELECT MIN(DATE(timestamp)) AS min_day, MAX(DATE(timestamp)) AS max_day FROM `database.table2` UNION ALL
SELECT MIN(DATE(timestamp)) AS min_day, MAX(DATE(timestamp)) AS max_day FROM `database.table3`
)
), UNNEST(GENERATE_DATE_ARRAY(min_day, max_day, INTERVAL 1 DAY)) AS day
)
-- Left-join the per-day row counts of each table onto the calendar; a missing
-- count becomes 0 through IFNULL.
SELECT
c.day AS day,
IFNULL(SUM(installs), 0) AS installs,
IFNULL(SUM(uninstalls), 0) AS uninstalls,
IFNULL(SUM(cases),0) AS cases
FROM calendar AS c
LEFT JOIN (SELECT DATE(timestamp) day, COUNT(1) installs FROM `database.table1` GROUP BY day) t1 ON t1.day = c.day
LEFT JOIN (SELECT DATE(timestamp) day, COUNT(1) uninstalls FROM `database.table2` GROUP BY day) t2 ON t2.day = c.day
LEFT JOIN (SELECT DATE(timestamp) day, COUNT(1) cases FROM `database.table3` GROUP BY day) t3 ON t3.day = c.day
GROUP BY day
-- drop days on which all three counts are zero
HAVING installs + uninstalls + cases > 0
-- ORDER BY day
Please note: you are using timestamp as a column name, which is not best practice because it is a keyword. In my example I kept your naming, but consider changing it!
You can test / play this solution with below dummy data
#standardSQL
-- Dummy stand-ins for the three source tables. Only the timestamp column is read
-- by the query below; each row counts as one event.
WITH `database.table1` AS (
SELECT TIMESTAMP '2017-01-01' AS timestamp, 1 AS installs
UNION ALL SELECT TIMESTAMP '2017-01-01', 22
),
`database.table2` AS (
SELECT TIMESTAMP '2016-12-01' AS timestamp, 1 AS installs UNION ALL SELECT TIMESTAMP '2017-01-01', 22 UNION ALL SELECT TIMESTAMP '2017-01-01', 22 UNION ALL
SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22
),
`database.table3` AS (
SELECT TIMESTAMP '2017-01-01' AS timestamp, 1 AS installs UNION ALL SELECT TIMESTAMP '2017-01-01', 22 UNION ALL SELECT TIMESTAMP '2017-01-01', 22 UNION ALL
SELECT TIMESTAMP '2017-01-10', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22 UNION ALL SELECT TIMESTAMP '2017-01-02', 22
),
-- calendar: one row per date between the earliest and the latest event date
-- found across the three tables.
calendar AS (
SELECT day
FROM (
SELECT MIN(min_day) AS min_day, MAX(max_day) AS max_day
FROM (
SELECT MIN(DATE(timestamp)) AS min_day, MAX(DATE(timestamp)) AS max_day FROM `database.table1` UNION ALL
SELECT MIN(DATE(timestamp)) AS min_day, MAX(DATE(timestamp)) AS max_day FROM `database.table2` UNION ALL
SELECT MIN(DATE(timestamp)) AS min_day, MAX(DATE(timestamp)) AS max_day FROM `database.table3`
)
), UNNEST(GENERATE_DATE_ARRAY(min_day, max_day, INTERVAL 1 DAY)) AS day
)
-- Left-join the per-day row counts of each table onto the calendar; a missing
-- count becomes 0 through IFNULL.
SELECT
c.day AS day,
IFNULL(SUM(installs), 0) AS installs,
IFNULL(SUM(uninstalls), 0) AS uninstalls,
IFNULL(SUM(cases),0) AS cases
FROM calendar AS c
LEFT JOIN (SELECT DATE(timestamp) day, COUNT(1) installs FROM `database.table1` GROUP BY day) t1 ON t1.day = c.day
LEFT JOIN (SELECT DATE(timestamp) day, COUNT(1) uninstalls FROM `database.table2` GROUP BY day) t2 ON t2.day = c.day
LEFT JOIN (SELECT DATE(timestamp) day, COUNT(1) cases FROM `database.table3` GROUP BY day) t3 ON t3.day = c.day
GROUP BY day
-- drop days on which all three counts are zero
HAVING installs + uninstalls + cases > 0
ORDER BY day
I am not very familiar with bigquery, so this is probably not going to be a copy-paste answer.
You'll first have to build a calendar table to make sure you have all dates. Here's an example for SQL Server. There are probably examples for BigQuery available as well. The following assumes a table named Calander with a Date attribute of type timestamp.
Once you have your calander table you can join all your tables to that:
-- Installs and uninstalls per calendar day, including days without any events.
-- Each source table is aggregated per day BEFORE it is joined: joining both raw
-- tables to the calendar pairs every install with every uninstall of the same
-- day and inflates both counts.
SELECT FORMAT_TIMESTAMP("%F",C.Date) AS day
, COALESCE(MAX(T1.installs), 0) AS installs
, COALESCE(MAX(T2.uninstalls), 0) AS uninstalls
FROM Calander C
LEFT JOIN (
    SELECT DATE(TIMESTAMP) AS day, COUNT(TIMESTAMP) AS installs
    FROM database.table1
    GROUP BY day
) T1
ON T1.day = DATE(C.Date) --Convert to date to remove times
LEFT JOIN (
    SELECT DATE(TIMESTAMP) AS day, COUNT(TIMESTAMP) AS uninstalls
    FROM database.table2
    GROUP BY day
) T2
ON T2.day = DATE(C.Date)
-- MAX keeps the per-day count intact even if Calander holds several rows per day
GROUP BY day
ORDER BY day ASC
I can use DATEDIFF to find the difference between one set of dates like this
DATEDIFF(MINUTE, @startdate, @enddate)
but how would I find the total time span between multiple sets of dates? I don't know how many sets (stops and starts) I will have.
The data is on multiple rows with start and stops.
ID TimeStamp StartOrStop TimeCode
----------------------------------------------------------------
1 2017-01-01 07:00:00 Start 1
2 2017-01-01 08:15:00 Stop 2
3 2017-01-01 10:00:00 Start 1
4 2017-01-01 11:00:00 Stop 2
5 2017-01-01 10:30:00 Start 1
6 2017-01-01 12:00:00 Stop 2
This code would work assuming that your table only store data from one person, and they should be of the order Start/Stop/Start/Stop
-- Total minutes between matching start/stop rows. The n-th start (TimeCode = 1)
-- is paired with the n-th stop (TimeCode = 2), both numbered by timestamp.
WITH StartTime AS (
    SELECT
        TimeStamp
        -- a ranking function takes its ordering through an OVER clause
        , ROW_NUMBER() OVER (ORDER BY TimeStamp) AS RowNum
    FROM
        <<table>>
    WHERE
        TimeCode = 1
), StopTime AS (
    SELECT
        TimeStamp
        , ROW_NUMBER() OVER (ORDER BY TimeStamp) AS RowNum
    FROM
        <<table>>
    WHERE
        TimeCode = 2
)
SELECT
    SUM (DATEDIFF( MINUTE, StartTime.TimeStamp, StopTime.TimeStamp )) As TotalTime
FROM
    StartTime
    JOIN StopTime ON StartTime.RowNum = StopTime.RowNum
This will work if your starts and stops are reliable. Your sample has two starts in order - 10:00 and 10:30 starts. I assume in production you will have an employee id to group on, so I added this to the sample data in place of the identity column.
Also in production, the CTE sets will be reduced by using a parameter on date. If there are overnight shifts, you would want your stops CTE to use dateadd(day, 1, @startDate) as your upper bound when retrieving end date.
Set up sample:
-- Sample data in a local temporary table. A name starting with # denotes a
-- temporary table, which is created with CREATE TABLE (DECLARE ... TABLE is
-- only valid for @-prefixed table variables).
create table #temp (
    EmpId int,
    TimeStamp datetime,
    StartOrStop varchar(55),
    TimeCode int
);
insert into #temp (EmpId, TimeStamp, StartOrStop, TimeCode)
values
(1, '2017-01-01 07:00:00', 'Start', 1),
(1, '2017-01-01 08:15:00', 'Stop', 2),
(1, '2017-01-01 10:00:00', 'Start', 1),
(1, '2017-01-01 11:00:00', 'Stop', 2),
(2, '2017-01-01 10:30:00', 'Start', 1),
(2, '2017-01-01 12:00:00', 'Stop', 2)
Query:
-- Per employee: work day, first clock-in, last clock-out and total minutes worked.
-- The n-th start of an employee is paired with that employee's n-th stop.
;with starts as (
    select t.EmpId,
           t.TimeStamp as StartTime,
           row_number() over (partition by t.EmpId order by t.TimeStamp asc) as rn
    from #temp t
    where Timecode = 1 --Start time code?
),
stops as (
    select t.EmpId,
           t.TimeStamp as EndTime,
           row_number() over (partition by t.EmpId order by t.TimeStamp asc) as rn
    from #temp t
    where Timecode = 2 --Stop time code?
)
select cast(min(sub.StartTime) as date) as WorkDay,
       sub.EmpId as Employee,
       min(sub.StartTime) as ClockIn,
       -- the clock-out is the LAST stop of the employee, not the first one
       max(sub.EndTime) as ClockOut,
       sum(sub.MinutesWorked) as MinutesWorked
from
(
    select strt.EmpId,
           strt.StartTime,
           stp.EndTime,
           datediff(minute, strt.StartTime, stp.EndTime) as MinutesWorked
    from starts strt
    inner join stops stp
        on strt.EmpId = stp.EmpId
        and strt.rn = stp.rn
)sub
group by sub.EmpId
This works assuming your table has an incremental ID and interleaving start/stop records
--Data sample as provided
-- A name starting with # denotes a temporary table, which is created with
-- CREATE TABLE (DECLARE ... TABLE is only valid for @-prefixed table variables).
create table #temp (
    Id int,
    TimeStamp datetime,
    StartOrStop varchar(55),
    TimeCode int
);
insert into #temp (Id, TimeStamp, StartOrStop, TimeCode)
values
(1, '2017-01-01 07:00:00', 'Start', 1),
(2, '2017-01-01 08:15:00', 'Stop', 2),
(3, '2017-01-01 10:00:00', 'Start', 1),
(4, '2017-01-01 11:00:00', 'Stop', 2),
(5, '2017-01-01 10:30:00', 'Start', 1),
(6, '2017-01-01 12:00:00', 'Stop', 2)
-- List every start row together with the row whose id is one higher;
-- rows that are not starts (timecode <> 1) are never used as the left side.
select
    start.timestamp as start,
    stop.timestamp as stop,
    datediff(minute, start.timestamp, stop.timestamp) as minutes
from #temp as start
inner join #temp as stop
    on stop.id = start.id + 1
where start.timecode = 1
-- Add up the minutes of all those pairs for the required result.
select
    sum(datediff(minute, start.timestamp, stop.timestamp)) as totalMinutes
from #temp as start
inner join #temp as stop
    on stop.id = start.id + 1
where start.timecode = 1
Results
+-------------------------+-------------------------+---------+
| start | stop | minutes |
+-------------------------+-------------------------+---------+
| 2017-01-01 07:00:00.000 | 2017-01-01 08:15:00.000 | 75 |
| 2017-01-01 10:00:00.000 | 2017-01-01 11:00:00.000 | 60 |
| 2017-01-01 10:30:00.000 | 2017-01-01 12:00:00.000 | 90 |
+-------------------------+-------------------------+---------+
+--------------+
| totalMinutes |
+--------------+
| 225 |
+--------------+
Maybe the tricky part is the join clause. We need to join #temp with itself, offset by one ID; this is what the condition start.id+1= stop.id does.
On the other hand, to exclude stop/start pairs we use start.timecode=1. If we don't have a column with this information, something like stop.id%2=0 works just fine.