Analyse each identifier individually in a single query on PostgreSQL - sql
Description
I have a PostgreSQL table that looks like this:
identifier        state  card_presence  progress  timestamp
V000000000000123  0      true           1000      2022-12-01 12:45:02
V000000000000123  2      true           1022      2022-12-01 12:45:03
V000000000000123  3      true           1024      2022-12-01 12:48:03
V000000000000124  2      true           974       2022-12-01 12:43:00
V000000000000124  6      true           982       2022-12-01 12:55:00
I have to analyze this data quite frequently, at roughly 60-second intervals. The first stage of the analysis is a complex query that processes the data in multiple steps. At the moment I execute the query for each identifier individually.
Basically, what the query does is roughly what is described in Time intervals analysis in BigQuery (see the Related section below).
The query looks like:
with real_data as (
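-- keep two weeks of readings for one identifier; rows without a card present get state -1; next_timestamp is one second before the following reading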
SELECT
(CASE WHEN card_presence != false THEN state ELSE -1 END) as state,
progress,
lead(timestamp) over(order by timestamp) - interval '1 second' as next_timestamp,
timestamp
FROM telemetry_tacho
WHERE driver_identifier = 'V100000165676000' AND state IS NOT NULL AND timestamp >= CURRENT_TIMESTAMP - INTERVAL '2 weeks'
), sample_by_second as (
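-- expand every reading into one row per second it stays in effect (the last reading runs to the start of the next minute)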
SELECT
state,
progress,
ts,
date_trunc('minute', ts) ts_minute
FROM
real_data,
generate_series(
timestamp,
coalesce(
next_timestamp,
date_trunc('minute', timestamp + interval '60 seconds')
),
interval '1 second'
) ts
), sample_by_second_with_weight as (
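-- per minute, count the seconds spent in each state and track the progress range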
SELECT
state,
MIN(progress) as min_progress,
MAX(progress) as max_progress,
ts_minute,
count(*) weight
FROM sample_by_second
GROUP BY state, ts_minute
), sample_by_minute as (
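-- keep the state that covered most of each minute, together with its progress range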
SELECT
ts_minute,
(array_agg(state ORDER BY weight DESC))[1] as state,
MIN(min_progress) as min_progress,
MAX(max_progress) as max_progress
FROM sample_by_second_with_weight
GROUP BY ts_minute
), add_previous_state as (
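-- attach the previous minute's state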
SELECT
ts_minute,
state,
min_progress,
max_progress,
lag(state) OVER (ORDER BY ts_minute) as prev_state
FROM sample_by_minute
), add_group_indication as (
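-- running group id that increments whenever the state changes (0 and -1 count as the same state)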
SELECT
ts_minute,
state,
min_progress,
max_progress,
SUM(CASE
WHEN state = 0 AND prev_state = -1 THEN 0
WHEN state = -1 AND prev_state = 0 THEN 0
WHEN state != prev_state THEN 1
ELSE 0
END) over (order by ts_minute) as group_id
FROM add_previous_state
), computed as (
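-- collapse each group into one interval with start/end timestamps and a duration in seconds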
select
group_id,
min(ts_minute) as ts_minute_min,
max(ts_minute) as ts_minute_max,
min(state) as state,
MIN(min_progress) as min_progress,
MAX(max_progress) as max_progress,
min(ts_minute) as start_timestamp,
max(ts_minute) + interval '1 minute' end_timestamp,
60 * count(*) as duration
from add_group_indication
group by group_id
), include_surrounding_states as (
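-- look up the states of the neighbouring intervals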
select
*,
lag(state) over(order by start_timestamp) prev_state,
lead(state) over(order by start_timestamp) next_state
from computed
), filter_out_invalid_states as (
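-- drop one-minute state-2 intervals sandwiched between state-3 intervals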
select
state,
min_progress,
max_progress,
start_timestamp,
end_timestamp,
lag(state) over(order by start_timestamp) prev_state,
lead(state) over(order by start_timestamp) next_state
from include_surrounding_states
where not (state = 2 AND prev_state = 3 AND next_state = 3 AND duration = 60)
), recalculate_group_id as (
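-- renumber the groups now that some intervals have been removed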
select
SUM(CASE WHEN state != prev_state THEN 1 ELSE 0 END) over (order by start_timestamp) as group_id,
state,
min_progress,
max_progress,
COALESCE(start_timestamp, CURRENT_TIMESTAMP - INTERVAL '2 weeks') as start_timestamp, -- Add period start timestamp for the first entry
COALESCE(end_timestamp, CURRENT_TIMESTAMP) as end_timestamp
from filter_out_invalid_states
), final_data as (
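-- merge consecutive intervals that now share a state and convert timestamps to epoch seconds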
SELECT
MAX(state) AS state,
MIN(min_progress) AS min_progress,
MAX(max_progress) AS max_progress,
MAX(max_progress) - MIN(min_progress) AS progress_diff,
EXTRACT('epoch' FROM min(start_timestamp))::integer AS start_timestamp,
EXTRACT('epoch' FROM max(end_timestamp))::integer AS end_timestamp,
EXTRACT('epoch' FROM (max(end_timestamp) - min(start_timestamp))::interval)::integer AS duration
FROM recalculate_group_id
GROUP BY group_id
ORDER BY start_timestamp ASC
)
select * from final_data;
Sample data
Input
"identifier","card_presence","state","progress","timestamp"
"0000000000000123",TRUE,0,100000,"2022-12-01 00:00:00+00"
"0000000000000123",TRUE,0,100000,"2022-12-01 10:00:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-01 10:05:00+00"
"0000000000000123",TRUE,0,100000,"2022-12-01 15:00:02+00"
"0000000000000123",TRUE,3,100000,"2022-12-01 15:45:00+00"
"0000000000000123",TRUE,0,100000,"2022-12-01 20:15:00+00"
"0000000000000123",TRUE,0,100000,"2022-12-01 20:15:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 05:14:45+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 05:15:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 05:15:01+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 06:10:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 07:11:20+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 07:11:28+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 07:13:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 08:01:06+00"
"0000000000000123",TRUE,0,100000,"2022-12-02 08:30:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 08:30:10+00"
"0000000000000123",TRUE,0,100000,"2022-12-02 09:45:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 10:30:00+00"
"0000000000000123",TRUE,0,100000,"2022-12-02 15:00:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 15:45:00+00"
"0000000000000123",TRUE,0,100000,"2022-12-02 16:45:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-03 01:45:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-03 02:25:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-03 05:18:00+00"
"0000000000000123",TRUE,0,100000,"2022-12-03 06:15:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-03 07:00:00+00"
"0000000000000123",TRUE,0,100000,"2022-12-03 11:30:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-03 12:15:00+00"
"0000000000000123",TRUE,0,100000,"2022-12-03 13:15:00+00"
Output
"state","min_progress","max_progress","progress_diff","start_timestamp","end_timestamp","duration"
0,100000,100000,0,1669852800,1669889100,36300
3,100000,100000,0,1669889100,1669906800,17700
0,100000,100000,0,1669906800,1669909500,2700
3,100000,100000,0,1669909500,1669925700,16200
0,100000,100000,0,1669925700,1669958100,32400
3,100000,100000,0,1669958100,1669974300,16200
0,100000,100000,0,1669974300,1669977000,2700
3,100000,100000,0,1669977000,1669993200,16200
0,100000,100000,0,1669993200,1669995900,2700
3,100000,100000,0,1669995900,1669999500,3600
0,100000,100000,0,1669999500,1670031900,32400
3,100000,100000,0,1670031900,1670048100,16200
0,100000,100000,0,1670048100,1670050800,2700
3,100000,100000,0,1670050800,1670067000,16200
0,100000,100000,0,1670067000,1670069700,2700
3,100000,100000,0,1670069700,1670073300,3600
0,100000,100000,0,1670073300,1670073420,120
Question
The query takes some time to run for each device, and constantly querying and analysing the data for each identifier separately is quite time consuming, so I thought it might be possible to pre-process the data for all devices periodically and store the analysed results in a separate table or materialized view.
Running the query periodically and saving the results to a separate table or a materialized view isn't hard, but is it possible to do that for all identifier values that exist in the table at once?
I believe the query could be updated to do that, but I fail to grasp how.
Without delving into your analysis logic, I can suggest this:
extract the list of distinct driver_identifier values, or store it in a materialized view too;
select from this list with a lateral join to your query.
Your query needs a small change too: replace driver_identifier = 'V100000165676000' with driver_identifier = dil.drid so that it is correlated with the identifier list.
with driver_identifier_list(drid) as
(
select distinct driver_identifier from telemetry_tacho
)
select l.*
from driver_identifier_list as dil
cross join lateral
(
-- your query (where driver_identifier = dil.drid) here
) as l;
Effectively this is a loop that runs your query once for every driver_identifier value. However, the materialized view(s) would have to be refreshed on every telemetry_tacho mutation to stay current, which makes the effectiveness of the materialized view approach questionable.
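For illustration, here is a minimal sketch of how the pieces could fit together as a materialized view. It assumes the table and column names from the question (telemetry_tacho, driver_identifier); the view name telemetry_tacho_analysed is just a placeholder, and the analysis is abbreviated to the first CTE - in practice the full multi-step query goes inside the LATERAL subquery. Carrying dil.drid into the select list is what lets you filter the combined results per device later.
-- Sketch only: replace the abbreviated inner query with the full analysis query
-- from the question, keeping its WHERE clause correlated to dil.drid.
CREATE MATERIALIZED VIEW telemetry_tacho_analysed AS
WITH driver_identifier_list(drid) AS (
    SELECT DISTINCT driver_identifier FROM telemetry_tacho
)
SELECT dil.drid AS driver_identifier, l.*
FROM driver_identifier_list AS dil
CROSS JOIN LATERAL (
    SELECT
        (CASE WHEN card_presence != false THEN state ELSE -1 END) AS state,
        progress,
        timestamp
    FROM telemetry_tacho
    WHERE driver_identifier = dil.drid
      AND state IS NOT NULL
      AND timestamp >= CURRENT_TIMESTAMP - INTERVAL '2 weeks'
) AS l;

-- Refresh on whatever schedule fits, e.g. every 60 seconds from cron or pg_cron.
-- (REFRESH ... CONCURRENTLY additionally requires a unique index on the view.)
REFRESH MATERIALIZED VIEW telemetry_tacho_analysed;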
Related
Double counting problem in Rolling weekly / monthly active endpoints
Time intervals analysis in BigQuery
Introduction
I have IoT devices that are constantly sending data to the server. The data consists of these fields:
state
state_valid
progress
timestamp
There is no guarantee that data will be received in the correct time order: a device may send data captured in the past, which removes the option to analyze and enrich the data at ingestion time. Received data is stored in a BigQuery table, one table per device. The table structure looks like this:
state: INTEGER, REQUIRED
state_valid: BOOLEAN, NULLABLE
progress: INTEGER, REQUIRED
timestamp: TIMESTAMP, REQUIRED
Requirements
After data collection, I need to analyze the data adhering to these rules:
The device stays in the received state value until a different state is received;
If a record's state_valid is false, the state value should be ignored and 0 used instead;
If a record's state_valid is NULL, the last received state_valid value should be used;
In the analysis phase, data should be viewed in one-minute intervals; for example, there shouldn't be a final record that starts at 20:51:07 - its start should be 20:51:00;
The state that was on for most of a one-minute interval should be used for the whole minute. For example, if the device had state 0 from 20:51:01 to 20:51:18 and state 2 from 20:51:18 to 20:52:12, then 20:51:00 to 20:51:59 should be marked as state 2;
The resulting data should group all consecutive intervals with the same state value and represent them as one record with start and end timestamps;
The grouped intervals of the same state should have a calculated progress difference (max_progress - min_progress).
Example
Let's say I receive this data from a device:
state  state_valid  progress  timestamp
2      1            2451      20:50:00
0      1            2451      20:50:20
2      1            2451      20:52:29
3      1            2451      20:53:51
3      1            2500      20:54:20
2      0            2500      20:55:09
(The original question visualizes this data on a timeline.) The received data should be processed in one-minute intervals, assigning each minute the state the device was in for the better part of that minute, and consecutive intervals of the same state value should then be merged.
Result
So, adhering to the requirements above and given the data shown in the Example section, I need a query that provides this result:
group_id  state  progress  start_timestamp  end_timestamp  duration
0         0      0         20:50:00         20:52:00       120s
1         2      0         20:52:00         20:54:00       120s
2         3      49        20:54:00         20:55:00       60s
3         0      0         20:55:00         20:56:00       60s
Sample data
Consider these two data sets as sample data.
Sample data 1
Data:
WITH data as (
  SELECT * FROM UNNEST([
    STRUCT(NULL AS state, 0 AS state_valid, 0 as progress, CURRENT_TIMESTAMP() as timestamp),
    (2, 1, 2451, TIMESTAMP('2022-07-01 20:50:00 UTC')),
    (0, 1, 2451, TIMESTAMP('2022-07-01 20:50:20 UTC')),
    (2, 1, 2451, TIMESTAMP('2022-07-01 20:52:29 UTC')),
    (3, 1, 2451, TIMESTAMP('2022-07-01 20:53:51 UTC')),
    (3, 1, 2500, TIMESTAMP('2022-07-01 20:54:20 UTC')),
    (2, 0, 2500, TIMESTAMP('2022-07-01 20:55:09 UTC')),
  ])
  WHERE NOT state IS NULL
)
Expected outcome:
group_id  state  progress  start_timestamp  end_timestamp      duration
0         0      0         20:50:00         20:52:00           120s
1         2      0         20:52:00         20:54:00           120s
2         3      49        20:54:00         20:55:00           60s
3         0      0         20:55:00         current_timestamp  current_timestamp - 20:55:00
Sample data 2
Data:
WITH data as (
  SELECT * FROM UNNEST([
    STRUCT(NULL AS state, 0 AS state_valid, 0 as progress, CURRENT_TIMESTAMP() as timestamp),
    (2, 1, 2451, TIMESTAMP('2022-07-01 20:50:00 UTC')),
    (0, 1, 2451, TIMESTAMP('2022-07-01 20:50:20 UTC')),
    (2, 1, 2451, TIMESTAMP('2022-07-01 20:52:29 UTC')),
    (3, 1, 2451, TIMESTAMP('2022-07-01 20:53:51 UTC')),
    (3, 1, 2500, TIMESTAMP('2022-07-01 20:54:20 UTC')),
    (3, 1, 2580, TIMESTAMP('2022-07-01 20:55:09 UTC')),
    (3, 1, 2600, TIMESTAMP('2022-07-01 20:59:09 UTC')),
    (3, 1, 2700, TIMESTAMP('2022-07-01 21:20:09 UTC')),
    (2, 0, 2700, TIMESTAMP('2022-07-01 22:11:09 UTC'))
  ])
  WHERE NOT state IS NULL
)
Expected outcome:
group_id  state  progress  start_timestamp  end_timestamp      duration
0         0      0         20:50:00         20:52:00           120s
1         2      0         20:52:00         20:54:00           120s
2         3      249       20:54:00         22:11:00           4620s
3         0      0         22:11:00         current_timestamp  current_timestamp - 22:11:00
Consider below approach
with by_second as (
  select if(state_valid = 0, 0, state) state, progress, ts,
    timestamp_trunc(ts, minute) ts_minute
  from (
    select *,
      timestamp_sub(lead(timestamp) over(order by timestamp), interval 1 second) as next_timestamp
    from your_table
  ), unnest(generate_timestamp_array(
    timestamp,
    ifnull(next_timestamp, timestamp_trunc(timestamp_add(timestamp, interval 60 second), minute)),
    interval 1 second
  )) ts
), by_minute as (
  select ts_minute,
    array_agg(struct(state, progress) order by weight desc limit 1)[offset(0)].*
  from (
    select state, progress, ts_minute, count(*) weight
    from by_second
    group by state, progress, ts_minute
  )
  group by ts_minute
  having sum(weight) > 59
)
select group_id,
  any_value(state) state,
  max(progress) progress,
  min(ts_minute) start_timestamp,
  timestamp_add(max(ts_minute), interval 1 minute) end_timestamp,
  60 * count(*) duration
from (
  select countif(new_group) over(order by ts_minute) group_id, state, progress, ts_minute
  from (
    select ts_minute, state,
      progress - lag(progress) over(order by ts_minute) as progress,
      ifnull((state, progress) != lag((state, progress)) over(order by ts_minute), true) new_group,
    from by_minute
  )
)
group by group_id
If applied to the dummy data from your question, the output is as shown in the original answer's screenshot.
For some reason I feel that updating the existing answer would be confusing - so see the fixed solution here. There are two fixes, in two lines of the very final select statement - they are commented so you can easily locate them.
with by_second as (
  select if(state_valid = 0, 0, state) state, progress, ts,
    timestamp_trunc(ts, minute) ts_minute
  from (
    select *,
      timestamp_sub(lead(timestamp) over(order by timestamp), interval 1 second) as next_timestamp
    from your_table
  ), unnest(generate_timestamp_array(
    timestamp,
    ifnull(next_timestamp, timestamp_trunc(timestamp_add(timestamp, interval 60 second), minute)),
    interval 1 second
  )) ts
), by_minute as (
  select ts_minute,
    array_agg(struct(state, progress) order by weight desc limit 1)[offset(0)].*
  from (
    select state, progress, ts_minute, count(*) weight
    from by_second
    group by state, progress, ts_minute
  )
  group by ts_minute
  having sum(weight) > 59
)
select group_id,
  any_value(state) state,
  sum(progress) progress, # here changed max(progress) to sum(progress)
  min(ts_minute) start_timestamp,
  timestamp_add(max(ts_minute), interval 1 minute) end_timestamp,
  60 * count(*) duration
from (
  select countif(new_group) over(order by ts_minute) group_id, state, progress, ts_minute
  from (
    select ts_minute, state,
      progress - lag(progress) over(order by ts_minute) as progress,
      -- ifnull((state, progress) != lag((state, progress)) over(order by ts_minute), true) new_group, # fixed this line with below one
      ifnull((state) != lag(state) over(order by ts_minute), true) new_group,
    from by_minute
  )
)
group by group_id
Yet another approach:
WITH preprocessing AS (
  SELECT
    IF (LAST_VALUE(state_valid IGNORE NULLS) OVER (ORDER BY ts) = 0, 0, state) AS state,
    LAST_VALUE(state_valid IGNORE NULLS) OVER (ORDER BY ts) AS state_valid,
    progress,
    ts
  FROM sample
), intervals_added AS (
  (
    SELECT *, 0 src FROM preprocessing
    UNION ALL
    SELECT null, null, null, ts, 1
    FROM (SELECT MIN(ts) min_ts FROM sample),
         (SELECT MAX(ts) max_ts FROM sample),
         UNNEST (GENERATE_TIMESTAMP_ARRAY(min_ts, max_ts + INTERVAL 1 MINUTE, INTERVAL 1 MINUTE)) ts
  )
  EXCEPT DISTINCT
  SELECT null, null, null, ts, 1 FROM (SELECT ts FROM preprocessing)
), analysis AS (
  SELECT *, SUM(grp) OVER (ORDER BY ts) AS group_id
  FROM (
    SELECT * EXCEPT(progress),
      TIMESTAMP_TRUNC(ts, MINUTE) AS start_timestamp,
      progress - LAST_VALUE(progress IGNORE NULLS) OVER w AS progress,
      IF (LAST_VALUE(state IGNORE NULLS) OVER w <> state, 1, 0) AS grp,
      TIMESTAMP_DIFF(LEAD(ts) OVER (ORDER BY ts, src), ts, SECOND) AS diff,
    FROM intervals_added
    WINDOW w AS (ORDER BY ts ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING)
  )
  QUALIFY MAX(diff) OVER (PARTITION BY TIMESTAMP_TRUNC(ts, MINUTE)) = diff
)
SELECT
  group_id,
  MIN(state) AS state,
  SUM(progress) AS progress,
  MIN(start_timestamp) AS start_timestamp,
  MIN(start_timestamp) + INTERVAL COUNT(1) MINUTE AS end_timestamp,
  60 * COUNT(1) AS duration,
FROM analysis
GROUP BY 1
ORDER BY 1;
Computing session start and end using SQL window functions
BigQuery SQL: filter on event sequence
BigQuery (Google Analytics data): query two different 'hits.customDimensions.index' in the same 'hits.hitNumber'