Select dividing of two time series - sql

I have first query:
-- Per-day count (x) of tracking messages for one cid whose sid does NOT appear
-- in session_msg, over seven consecutive one-day intervals.
WITH one_day_intervals AS (
-- One row per offs in 1..7: [current_date-offs-2, current_date-offs-1).
SELECT date_trunc('day', (current_date-offs-2)) AS start_time,
date_trunc('day', (current_date-offs-1)) AS end_time
FROM generate_series(1, 7, 1) AS offs
)
SELECT start_time, end_time, x
FROM (
-- NOTE(review): count(*) also counts the NULL-extended row produced by the
-- outer join, so an interval with no matching message reports 1, not 0;
-- count(sub1.created) would report 0.
SELECT i.start_time AS start_time, i.end_time AS end_time, count(*) AS x
FROM (
SELECT created
FROM tracking_msg AS tm
-- NOTE(review): NOT IN matches nothing if session_msg.sid can be NULL — confirm it is NOT NULL.
WHERE tm.cid='ae69123c-cb29-420b-9a65-bbe6ae156f57' AND tm.sid NOT IN (SELECT sid FROM session_msg)
) AS sub1
-- RIGHT JOIN keeps every interval, including those without any message.
RIGHT JOIN one_day_intervals AS i ON sub1.created >= i.start_time AND sub1.created < i.end_time
GROUP BY i.start_time, i.end_time ORDER BY i.start_time
) AS sub2;
and at the output:
image1
And second query:
-- Per-day count (y) of tracking messages for one cid whose sid DOES appear
-- in session_msg, over seven consecutive one-day intervals.
WITH one_day_intervals AS (
-- One row per offs in 1..7: [current_date-offs-2, current_date-offs-1).
SELECT date_trunc('day', (current_date-offs-2)) AS start_time,
date_trunc('day', (current_date-offs-1)) AS end_time
FROM generate_series(1, 7, 1) AS offs
)
SELECT start_time, end_time, y
FROM (
-- NOTE(review): count(*) also counts the NULL-extended row produced by the
-- outer join, so an interval with no matching message reports 1, not 0;
-- count(sub1.created) would report 0.
SELECT i.start_time AS start_time, i.end_time AS end_time, count(*) AS y
FROM (
SELECT created
FROM tracking_msg AS tm
WHERE tm.cid='ae69123c-cb29-420b-9a65-bbe6ae156f57' AND tm.sid IN (SELECT sid FROM session_msg)
) AS sub1
-- RIGHT JOIN keeps every interval, including those without any message.
RIGHT JOIN one_day_intervals AS i ON sub1.created >= i.start_time AND sub1.created < i.end_time
GROUP BY i.start_time, i.end_time ORDER BY i.start_time
) AS sub2;
Output: image2
Both queries cover the same date periods. I want to select start_time, end_time and the ratio x/y.

So, basically the difference between x and y is that tm.sid is IN / NOT IN another table.
You can calculate both at once, with conditional aggregation:
SELECT ...,
COUNT(*) FILTER (WHERE tm.sid NOT IN (SELECT sid FROM session_msg)) AS x,
COUNT(*) FILTER (WHERE tm.sid IN (SELECT sid FROM session_msg)) AS y
Or, with older PostgreSQL (9.3 or before):
SELECT ...,
COUNT(CASE WHEN tm.sid NOT IN (SELECT sid FROM session_msg) THEN 1 END) AS x,
COUNT(CASE WHEN tm.sid IN (SELECT sid FROM session_msg) THEN 1 END) AS y

Related

Merge Overlapping Intervals and Track Maximum Value in BigQuery SQL

I am trying to solve a problem where i want to merge overlapping intervals for a given column id, but i also want to track the maximum value for each overlapped interval. I have start_time and stop_time for each interval and each interval has a hierarchy/priority associated with it.
These are the following columns in the table:
id, start_time, stop_time, some_value
example input:
example output:
Below is for BigQuery Standard SQL. I assume you are still working on the same use case as in your previous question, so I kept this in line with that solution; you can extend it if you also want to account for priorities, for example.
So, anyway:
#standardSQL
-- Merge overlapping intervals per id and keep the largest some_value seen
-- inside each merged span.
WITH boundary_points AS (
  -- Every distinct start or stop instant per id.
  SELECT id, start_time AS ts FROM `project.dataset.table`
  UNION DISTINCT
  SELECT id, stop_time AS ts FROM `project.dataset.table`
),
atomic_slices AS (
  -- Consecutive boundary pairs; the last boundary of an id gets a NULL stop_time.
  SELECT
    id,
    ts AS start_time,
    LEAD(ts) OVER (PARTITION BY id ORDER BY ts) AS stop_time
  FROM boundary_points
),
deduped_intervals AS (
  -- Keep only slices fully covered by at least one source interval.
  SELECT
    s.id,
    s.start_time,
    s.stop_time,
    MAX(src.some_value) AS some_value
  FROM atomic_slices AS s
  INNER JOIN `project.dataset.table` AS src
    ON s.id = src.id
    AND s.start_time BETWEEN src.start_time AND src.stop_time
    AND s.stop_time BETWEEN src.start_time AND src.stop_time
  GROUP BY s.id, s.start_time, s.stop_time
),
flagged_slices AS (
  -- A slice starts a new run when it does not begin where the previous one ended.
  SELECT
    id,
    start_time,
    stop_time,
    some_value,
    start_time != IFNULL(LAG(stop_time) OVER (PARTITION BY id ORDER BY start_time), start_time) AS starts_new_run
  FROM deduped_intervals
),
numbered_slices AS (
  -- Running count of run starts = run number within the id.
  SELECT
    id,
    start_time,
    stop_time,
    some_value,
    COUNTIF(starts_new_run) OVER (PARTITION BY id ORDER BY start_time) AS grp
  FROM flagged_slices
),
combined_intervals AS (
  SELECT
    id,
    MIN(start_time) AS start_time,
    MAX(stop_time) AS stop_time,
    MAX(some_value) AS some_value
  FROM numbered_slices
  GROUP BY id, grp
)
SELECT *
FROM combined_intervals
-- ORDER BY id, start_time
Applied to your sample data, the result is:
Row id start_time stop_time some_value
1 1 0 36 50
2 1 41 47 23
Is it possible to add one more column to the result showing the number of events during that time period?
#standardSQL
-- Merge overlapping intervals per id; report the largest some_value and a
-- count of distinct source rows ("events") sampled within each merged span.
WITH boundary_points AS (
  -- Every distinct start or stop instant per id.
  SELECT id, start_time AS ts FROM `project.dataset.table`
  UNION DISTINCT
  SELECT id, stop_time AS ts FROM `project.dataset.table`
),
atomic_slices AS (
  -- Consecutive boundary pairs; the last boundary of an id gets a NULL stop_time.
  SELECT
    id,
    ts AS start_time,
    LEAD(ts) OVER (PARTITION BY id ORDER BY ts) AS stop_time
  FROM boundary_points
),
deduped_intervals AS (
  -- NOTE(review): ANY_VALUE keeps the JSON of just one covering source row per
  -- slice, so a source row that is never the sampled one is not counted in
  -- `events` — confirm the count against your data.
  SELECT
    s.id,
    s.start_time,
    s.stop_time,
    MAX(src.some_value) AS some_value,
    ANY_VALUE(TO_JSON_STRING(src)) AS event_hash
  FROM atomic_slices AS s
  INNER JOIN `project.dataset.table` AS src
    ON s.id = src.id
    AND s.start_time BETWEEN src.start_time AND src.stop_time
    AND s.stop_time BETWEEN src.start_time AND src.stop_time
  GROUP BY s.id, s.start_time, s.stop_time
),
flagged_slices AS (
  -- A slice starts a new run when it does not begin where the previous one ended.
  SELECT
    id,
    start_time,
    stop_time,
    some_value,
    event_hash,
    start_time != IFNULL(LAG(stop_time) OVER (PARTITION BY id ORDER BY start_time), start_time) AS starts_new_run
  FROM deduped_intervals
),
numbered_slices AS (
  -- Running count of run starts = run number within the id.
  SELECT
    id,
    start_time,
    stop_time,
    some_value,
    event_hash,
    COUNTIF(starts_new_run) OVER (PARTITION BY id ORDER BY start_time) AS grp
  FROM flagged_slices
),
combined_intervals AS (
  SELECT
    id,
    MIN(start_time) AS start_time,
    MAX(stop_time) AS stop_time,
    MAX(some_value) AS some_value,
    COUNT(DISTINCT event_hash) AS events
  FROM numbered_slices
  GROUP BY id, grp
)
SELECT *
FROM combined_intervals
-- ORDER BY id, start_time
with result
Row id start_time stop_time some_value events
1 1 0 36 50 8
2 1 41 47 23 1
You can determine when a new grouping starts using a cumulative max(). Then a cumulative conditional count() to identify the groups . . . and finally aggregation:
-- Merge overlapping intervals per id and keep the largest some_value of each
-- merged span. `id` is both returned and grouped on so that rows of different
-- ids stay distinguishable (the partitions below are per id).
select id,
       min(start_time) as start_time,
       max(stop_time) as stop_time,
       max(some_value) as some_value
from (select t.*,
             -- An island starts when nothing seen so far reaches this row's start;
             -- the running count of such starts numbers the islands within an id.
             countif(prev_stop_time is null or prev_stop_time < start_time) over (partition by id order by start_time) as grp
      from (select t.*,
                   -- Latest stop among all earlier rows of the same id (NULL for the first row).
                   max(stop_time) over (partition by id order by start_time rows between unbounded preceding and 1 preceding) as prev_stop_time
            from t
           ) t
     ) t
group by id, grp;

Merge Overlapping Time Intervals based on Hierarchy in SQL

I am trying to solve a problem where i want to merge overlapping intervals for a given column id, but i also want to merge them based on hierarchy/priority. I have start_time and stop_time for each interval and each interval has a hierarchy/priority associated with it.
These are the following columns in the table:
id, start_time, stop_time, priority
I was able to solve the problem when I do not have to take the priority into account, but I am struggling with this one.
Red colour: p1 (priority 1)
Blue Colour: p2 (priority 2)
Green colour: p3 (priority 3)
Note that in the example input below we have 9 rows with the same id, and the output has 6 rows. Also note that some ids might have only some of the priority values, or just one; the solution should handle that.
expected input and output:
Below is for BigQuery Standard SQL
#standardSQL
-- Merge intervals per id so that each output span carries the lowest priority
-- number covering it; adjacent slices are combined only when they touch and
-- share that priority.
WITH boundary_points AS (
  -- Every distinct start or stop instant per id.
  SELECT id, start_time AS ts FROM `project.dataset.table`
  UNION DISTINCT
  SELECT id, stop_time AS ts FROM `project.dataset.table`
),
atomic_slices AS (
  -- Consecutive boundary pairs; the last boundary of an id gets a NULL stop_time.
  SELECT
    id,
    ts AS start_time,
    LEAD(ts) OVER (PARTITION BY id ORDER BY ts) AS stop_time
  FROM boundary_points
),
deduped_intervals AS (
  -- Keep slices fully covered by a source interval, tagged with the smallest priority value.
  SELECT
    s.id,
    s.start_time,
    s.stop_time,
    MIN(src.priority) AS priority
  FROM atomic_slices AS s
  INNER JOIN `project.dataset.table` AS src
    ON s.id = src.id
    AND s.start_time BETWEEN src.start_time AND src.stop_time
    AND s.stop_time BETWEEN src.start_time AND src.stop_time
  GROUP BY s.id, s.start_time, s.stop_time
),
flagged_slices AS (
  -- A new run starts on a gap or on a priority change; -1 stands in for the
  -- missing previous priority on the first slice of an id.
  SELECT
    id,
    start_time,
    stop_time,
    priority,
    (
      start_time != IFNULL(LAG(stop_time) OVER (PARTITION BY id ORDER BY start_time), start_time)
      OR priority != IFNULL(LAG(priority) OVER (PARTITION BY id ORDER BY start_time), -1)
    ) AS starts_new_run
  FROM deduped_intervals
),
numbered_slices AS (
  -- Running count of run starts = run number within the id.
  SELECT
    id,
    start_time,
    stop_time,
    priority,
    COUNTIF(starts_new_run) OVER (PARTITION BY id ORDER BY start_time) AS grp
  FROM flagged_slices
),
combined_intervals AS (
  SELECT
    id,
    MIN(start_time) AS start_time,
    MAX(stop_time) AS stop_time,
    ANY_VALUE(priority) AS priority
  FROM numbered_slices
  GROUP BY id, grp
)
SELECT *
FROM combined_intervals
-- ORDER BY id, start_time
Applied to the sample data from your question, the result is:
Can you also share a solution where we merge intervals based on just id, with no priority column?
I simply adjusted the above query slightly to ignore priority:
#standardSQL
-- Merge overlapping or touching intervals per id, ignoring any priority.
WITH boundary_points AS (
  -- Every distinct start or stop instant per id.
  SELECT id, start_time AS ts FROM `project.dataset.table`
  UNION DISTINCT
  SELECT id, stop_time AS ts FROM `project.dataset.table`
),
atomic_slices AS (
  -- Consecutive boundary pairs; the last boundary of an id gets a NULL stop_time.
  SELECT
    id,
    ts AS start_time,
    LEAD(ts) OVER (PARTITION BY id ORDER BY ts) AS stop_time
  FROM boundary_points
),
deduped_intervals AS (
  -- Keep one row per slice that is fully covered by at least one source interval.
  SELECT DISTINCT
    s.id,
    s.start_time,
    s.stop_time
  FROM atomic_slices AS s
  INNER JOIN `project.dataset.table` AS src
    ON s.id = src.id
    AND s.start_time BETWEEN src.start_time AND src.stop_time
    AND s.stop_time BETWEEN src.start_time AND src.stop_time
),
flagged_slices AS (
  -- A slice starts a new run when it does not begin where the previous one ended.
  SELECT
    id,
    start_time,
    stop_time,
    start_time != IFNULL(LAG(stop_time) OVER (PARTITION BY id ORDER BY start_time), start_time) AS starts_new_run
  FROM deduped_intervals
),
numbered_slices AS (
  -- Running count of run starts = run number within the id.
  SELECT
    id,
    start_time,
    stop_time,
    COUNTIF(starts_new_run) OVER (PARTITION BY id ORDER BY start_time) AS grp
  FROM flagged_slices
),
combined_intervals AS (
  SELECT
    id,
    MIN(start_time) AS start_time,
    MAX(stop_time) AS stop_time
  FROM numbered_slices
  GROUP BY id, grp
)
SELECT *
FROM combined_intervals
-- ORDER BY id, start_time
with result
Row id start_time stop_time
1 1 0 36
2 1 41 47
This is a "combining" islands problem. One solution is to find where the islands begin and do a cumulative sum of the beginnings. You can determine the beginning by seeing where there are no overlaps:
-- Merge overlapping intervals separately for every (id, priority) pair.
select id,
       priority,
       min(start_time) as start_time,
       max(stop_time) as stop_time
from (select t.*,
             -- An island begins when the latest stop seen so far lies before this
             -- row's START; comparing against stop_time instead would open a new
             -- island for every overlapping row that merely ends later.
             -- The coalesce makes the first row of a partition compare as "no gap".
             countif(coalesce(prev_stop_time, start_time) < start_time) over (partition by id, priority order by start_time) as grp
      from (select t.*,
                   -- Latest stop among all earlier rows of the same id and priority.
                   max(stop_time) over (partition by id, priority order by start_time rows between unbounded preceding and 1 preceding) as prev_stop_time
            from t
           ) t
     ) t
group by id, priority, grp;

CASE AND WHEN SQL

I have transactional data of customers' purchase. I tried to select customer_id from the last 1 month and calculate recency as the average day customers come to purchase (AVG(gap))
SELECT
customer_id,
(
CASE WHEN day::DATE<= '2015-05-01'::DATE AND day::DATE > '2015-05-01'::DATE - INTERVAL '1 month'
THEN
(
SELECT
AVG(gap)
FROM
(
SELECT
customer_id,
( day- LAG(day) OVER ( PARTITION BY customer_id ORDER BY day ) ) AS gap
FROM
baskets
JOIN
basket_lines
USING
( basket_id )
GROUP BY 1
) a
) b
ELSE 0
) AS A
FROM
baskets
JOIN
basket_lines
USING
(basket_id)
GROUP BY
1;
However, I get an error like this:
ERROR: syntax error at or near "b"
LINE 45: GROUP BY 1)a)b ELSE 0) AS A
^
Does it mean I can not use subquery after THEN statement?
A subquery in the THEN clause does not take an alias. Also, you must end your CASE expression with END:
-- Syntactically corrected form of the question's query: the scalar subquery
-- in THEN carries no alias and the CASE expression is closed with END.
SELECT
customer_id,
(CASE WHEN day::DATE<= '2015-05-01'::DATE AND
day::DATE > '2015-05-01'::DATE - INTERVAL '1 month'
THEN
-- The subquery below reads only its own baskets/basket_lines rows; nothing
-- ties it to the outer customer_id.
(SELECT AVG(gap) FROM (
SELECT customer_id,
(day- LAG(day) OVER (PARTITION BY customer_id ORDER BY day)) as gap
FROM baskets
JOIN basket_lines
USING (basket_id)
-- NOTE(review): `day` is neither grouped nor aggregated while grouping by
-- customer_id (here and in the outer query); PostgreSQL normally rejects
-- that — verify before relying on this form.
GROUP BY 1) a) ELSE 0 END) AS A
FROM baskets
JOIN basket_lines
USING (basket_id)
GROUP BY 1;
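But note that the scalar subquery in your select list does not reference the outer row, so it would produce the same overall average on every row rather than a per-customer value. This is probably not what you want, and we can likely rewrite your query with a CTE and a plain GROUP BY.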
I propose the following refactor:
-- Average gap between consecutive purchase days per customer, restricted to
-- the month ending 2015-05-01 (exclusive lower bound, inclusive upper bound).
WITH purchase_gaps AS (
    -- Gap to the customer's previous row inside the window; NULL for the first row.
    -- NOTE(review): the join yields one row per basket line, so several lines
    -- of one basket presumably add zero-length gaps — confirm this is intended.
    SELECT
        customer_id,
        day - LAG(day) OVER (PARTITION BY customer_id ORDER BY day) AS gap
    FROM baskets
    INNER JOIN basket_lines USING (basket_id)
    WHERE day::DATE > '2015-05-01'::DATE - INTERVAL '1 month'
      AND day::DATE <= '2015-05-01'::DATE
)
SELECT
    customer_id,
    AVG(gap) AS cust_avg
FROM purchase_gaps
GROUP BY customer_id;

Teradata - How to account for missing hours in timestamp when using extract() function

I have the following statement to extract the date, hour and number of users from a table in a Teradata DB . . .
-- Non-NULL `users` values counted per calendar date and hour of end_time.
-- Only date/hour combinations that actually occur in the table produce a row.
SELECT
CAST(end_time AS DATE) AS end_date,
EXTRACT(HOUR FROM end_time) AS end_hour,
COUNT(users) AS total_users
FROM table
-- Grouping refers to the select-list aliases defined above.
GROUP BY end_date, end_hour
When using the extract() function, my resultset contains missing hours where there is no activity by users over a 24 hour period... I'm wondering is there any technique to account for these missing hours in my resultset?
I can't create a lookup table to reference as I don't have the necessary permissions to create a table on this DB.
Any help would be appreciated!
Use sys_calendar.calendar to generate the requested dates (change the range as needed)
and WITH RECURSIVE to generate the hours:
-- Every (date, hour) pair for the last 10 days through today, with the user
-- count for that hour or 0 when the hour has no rows.
with recursive cte_hours (hr)
as
(
-- Seed row hr = 0; the dummy derived table gives the SELECT a table to
-- reference, which the UNION requires here.
select 0 from (select 1) t(c)
-- Recursive step: stops after producing hr = 23.
union all select hr + 1 from cte_hours where hr < 23
)
select c.calendar_date as dt
,h.hr as hr
-- zeroifnull turns the NULL of an unmatched left join into 0.
,zeroifnull(t.total_users) as total_users
from sys_calendar.calendar as c
cross join cte_hours as h
-- Pre-aggregated counts per date and hour that actually occur in mytable.
left join (select cast(end_time as date) as end_date
,extract(hour from end_time) as end_hour
,count(users) as total_users
from mytable t
group by end_date
,end_hour
) t
on t.end_date = c.calendar_date
and t.end_hour = h.hr
where c.calendar_date between current_date - 10 and current_date
order by dt,hr
;
For #GordonLinoff
select 0
0
select 1
1
select 0
union all
select 1
[3888] A SELECT for a UNION,INTERSECT or MINUS must reference a table.
select 0 from (select 1 as c) t
union all
select 1 from (select 1 as c) t
0
1
or
select 0 from (select 1) t(c)
union all
select 1 from (select 1) t(c)
0
1
If you want all hours from all days in the database, then you can generate the rows using cross join and then use left join to bring in results:
-- One row per (date, hour) combination present anywhere in the table, with
-- the number of non-NULL `users` values recorded for that date and hour
-- (0 when none matched).
SELECT d.end_date,
h.end_hour,
COUNT(t.users) AS total_users
FROM (select distinct CAST(end_time AS DATE) AS end_date from table) d CROSS JOIN
(select distinct EXTRACT(HOUR FROM end_time) AS end_hour from table) h LEFT JOIN
table t
-- The base table only has end_time, so date and hour are derived from it here;
-- the hour comes from h, the date from d.
ON CAST(t.end_time AS DATE) = d.end_date and EXTRACT(HOUR FROM t.end_time) = h.end_hour
GROUP BY d.end_date, h.end_hour;
If not all hours are represented, you can use an explicit list:
-- One row per date present in the table and per hour from an explicit list,
-- with the number of non-NULL `users` values recorded for that date and hour
-- (0 when none matched).
SELECT d.end_date,
h.end_hour,
COUNT(t.users) AS total_users
FROM (select distinct CAST(end_time AS DATE) AS end_date from table) d CROSS JOIN
-- Each literal is wrapped in a derived table so every UNION branch references a table.
(select * from (select 0 as end_hour) t UNION ALL
select * from (select 1 as end_hour) t UNION ALL
. . .
) h LEFT JOIN
table t
-- The base table only has end_time, so date and hour are derived from it here;
-- the hour comes from h, the date from d.
ON CAST(t.end_time AS DATE) = d.end_date and EXTRACT(HOUR FROM t.end_time) = h.end_hour
GROUP BY d.end_date, h.end_hour;

How to insert from multiple places into one table in this situation (Oracle)?

I'm trying to insert data into a table that can be used as a report to view when each database had their last successful backup. The table that is being inserted into has name, input_type, status, start_time, end_time, and duration columns, and my SQL is currently:
-- Insert one row into Backup_Report taken from the END_TIME-descending list of
-- RMAN backup jobs, restricted to completed 'DB INCR' jobs.
-- The name column is not populated by this statement.
INSERT INTO schema.Backup_Report (input_type, status, start_time, end_time, duration)
SELECT *
FROM (SELECT INPUT_TYPE,
STATUS,
START_TIME,
END_TIME,
-- elapsed_seconds divided by 60, inserted as the duration column
elapsed_seconds/60 as duration_in_min
FROM v$RMAN_BACKUP_JOB_DETAILS
-- Ordering happens inside the inline view, before ROWNUM is applied outside.
ORDER BY end_time desc)
WHERE rownum = 1
AND input_type = 'DB INCR'
AND STATUS = 'COMPLETED';
This works, but I'm having trouble figuring out how to also insert into the name column, as I have tried using a union in:
SELECT *
FROM (SELECT name from v$database union
SELECT INPUT_TYPE,
STATUS,
START_TIME,
END_TIME,
elapsed_seconds/60 as duration_in_min
FROM v$RMAN_BACKUP_JOB_DETAILS
ORDER BY end_time desc)
This gives me an ora-00904 error, as I think it doesn't like the order by end_time part.
-- One row from the END_TIME-descending list of RMAN backup jobs, restricted to
-- completed 'DB INCR' jobs, paired with the database name via a cross join.
SELECT
    latest.INPUT_TYPE,
    latest.STATUS,
    latest.START_TIME,
    latest.END_TIME,
    latest.duration_in_min,
    db.name
FROM (
    SELECT
        jobs.INPUT_TYPE,
        jobs.STATUS,
        jobs.START_TIME,
        jobs.END_TIME,
        jobs.duration_in_min
    FROM (
        -- Ordering must happen in this inline view, before ROWNUM is applied.
        SELECT
            INPUT_TYPE,
            STATUS,
            START_TIME,
            END_TIME,
            elapsed_seconds/60 AS duration_in_min
        FROM v$RMAN_BACKUP_JOB_DETAILS
        ORDER BY END_TIME DESC
    ) jobs
    WHERE ROWNUM = 1
      AND jobs.INPUT_TYPE = 'DB INCR'
      AND jobs.STATUS = 'COMPLETED'
) latest
CROSS JOIN (
    SELECT name FROM v$database
) db;