Analyse each identifier individually in a single query on PostgreSQL - sql

Description
I have a PostgreSQL table that looks like this:
identifier        state  card_presence  progress  timestamp
V000000000000123  0      true           1000      2022-12-01 12:45:02
V000000000000123  2      true           1022      2022-12-01 12:45:03
V000000000000123  3      true           1024      2022-12-01 12:48:03
V000000000000124  2      true           974       2022-12-01 12:43:00
V000000000000124  6      true           982       2022-12-01 12:55:00
I have to analyze this data quite frequently (at ~60 s intervals). The first stage of the analysis is a complex query that processes the data in multiple steps. At the moment I execute the query for each identifier individually.
Basically, the query does roughly what is described in: Time intervals analysis in BigQuery
The query looks like:
with real_data as (
SELECT
(CASE WHEN card_presence != false THEN state ELSE -1 END) as state, -- when no card is present, use -1 instead of the reported state
progress,
lead(timestamp) over(order by timestamp) - interval '1 second' as next_timestamp,
timestamp
FROM telemetry_tacho
WHERE driver_identifier = 'V100000165676000' AND state IS NOT NULL AND timestamp >= CURRENT_TIMESTAMP - INTERVAL '2 weeks'
), sample_by_second as (
SELECT
state,
progress,
ts,
date_trunc('minute', ts) ts_minute
FROM
real_data,
generate_series(
timestamp,
coalesce(
next_timestamp,
date_trunc('minute', timestamp + interval '60 seconds')
),
interval '1 second'
) ts
), sample_by_second_with_weight as (
SELECT
state,
MIN(progress) as min_progress,
MAX(progress) as max_progress,
ts_minute,
count(*) weight
FROM sample_by_second
GROUP BY state, ts_minute
), sample_by_minute as (
SELECT
ts_minute,
(array_agg(state ORDER BY weight DESC))[1] as state, -- the state covering most seconds of the minute
MIN(min_progress) as min_progress,
MAX(max_progress) as max_progress
FROM sample_by_second_with_weight
GROUP BY ts_minute
), add_previous_state as (
SELECT
ts_minute,
state,
min_progress,
max_progress,
lag(state) OVER (ORDER BY ts_minute) as prev_state
FROM sample_by_minute
), add_group_indication as (
SELECT
ts_minute,
state,
min_progress,
max_progress,
SUM(CASE
WHEN state = 0 AND prev_state = -1 THEN 0
WHEN state = -1 AND prev_state = 0 THEN 0
WHEN state != prev_state THEN 1
ELSE 0
END) over (order by ts_minute) as group_id
FROM add_previous_state
), computed as (
select
group_id,
min(ts_minute) as ts_minute_min,
max(ts_minute) as ts_minute_max,
min(state) as state,
MIN(min_progress) as min_progress,
MAX(max_progress) as max_progress,
min(ts_minute) as start_timestamp,
max(ts_minute) + interval '1 minute' end_timestamp,
60 * count(*) as duration
from add_group_indication
group by group_id
), include_surrounding_states as (
select
*,
lag(state) over(order by start_timestamp) prev_state,
lead(state) over(order by start_timestamp) next_state
from computed
), filter_out_invalid_states as (
select
state,
min_progress,
max_progress,
start_timestamp,
end_timestamp,
lag(state) over(order by start_timestamp) prev_state,
lead(state) over(order by start_timestamp) next_state
from include_surrounding_states
where not (state = 2 AND prev_state = 3 AND next_state = 3 AND duration = 60) -- drop one-minute state-2 blips between state-3 periods
), recalculate_group_id as (
select
SUM(CASE WHEN state != prev_state THEN 1 ELSE 0 END) over (order by start_timestamp) as group_id,
state,
min_progress,
max_progress,
COALESCE(start_timestamp, CURRENT_TIMESTAMP - INTERVAL '2 weeks') as start_timestamp, -- Add period start timestamp for the first entry
COALESCE(end_timestamp, CURRENT_TIMESTAMP) as end_timestamp
from filter_out_invalid_states
), final_data as (
SELECT
MAX(state) AS state,
MIN(min_progress) AS min_progress,
MAX(max_progress) AS max_progress,
MAX(max_progress) - MIN(min_progress) AS progress_diff,
EXTRACT('epoch' FROM min(start_timestamp))::integer AS start_timestamp,
EXTRACT('epoch' FROM max(end_timestamp))::integer AS end_timestamp,
EXTRACT('epoch' FROM (max(end_timestamp) - min(start_timestamp))::interval)::integer AS duration
FROM recalculate_group_id
GROUP BY group_id
ORDER BY start_timestamp ASC
)
select * from final_data;
Sample data
Input
"identifier","card_presence","state","progress","timestamp"
"0000000000000123",TRUE,0,100000,"2022-12-01 00:00:00+00"
"0000000000000123",TRUE,0,100000,"2022-12-01 10:00:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-01 10:05:00+00"
"0000000000000123",TRUE,0,100000,"2022-12-01 15:00:02+00"
"0000000000000123",TRUE,3,100000,"2022-12-01 15:45:00+00"
"0000000000000123",TRUE,0,100000,"2022-12-01 20:15:00+00"
"0000000000000123",TRUE,0,100000,"2022-12-01 20:15:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 05:14:45+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 05:15:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 05:15:01+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 06:10:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 07:11:20+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 07:11:28+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 07:13:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 08:01:06+00"
"0000000000000123",TRUE,0,100000,"2022-12-02 08:30:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 08:30:10+00"
"0000000000000123",TRUE,0,100000,"2022-12-02 09:45:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 10:30:00+00"
"0000000000000123",TRUE,0,100000,"2022-12-02 15:00:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-02 15:45:00+00"
"0000000000000123",TRUE,0,100000,"2022-12-02 16:45:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-03 01:45:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-03 02:25:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-03 05:18:00+00"
"0000000000000123",TRUE,0,100000,"2022-12-03 06:15:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-03 07:00:00+00"
"0000000000000123",TRUE,0,100000,"2022-12-03 11:30:00+00"
"0000000000000123",TRUE,3,100000,"2022-12-03 12:15:00+00"
"0000000000000123",TRUE,0,100000,"2022-12-03 13:15:00+00"
Output
"state","min_progress","max_progress","progress_diff","start_timestamp","end_timestamp","duration"
0,100000,100000,0,1669852800,1669889100,36300
3,100000,100000,0,1669889100,1669906800,17700
0,100000,100000,0,1669906800,1669909500,2700
3,100000,100000,0,1669909500,1669925700,16200
0,100000,100000,0,1669925700,1669958100,32400
3,100000,100000,0,1669958100,1669974300,16200
0,100000,100000,0,1669974300,1669977000,2700
3,100000,100000,0,1669977000,1669993200,16200
0,100000,100000,0,1669993200,1669995900,2700
3,100000,100000,0,1669995900,1669999500,3600
0,100000,100000,0,1669999500,1670031900,32400
3,100000,100000,0,1670031900,1670048100,16200
0,100000,100000,0,1670048100,1670050800,2700
3,100000,100000,0,1670050800,1670067000,16200
0,100000,100000,0,1670067000,1670069700,2700
3,100000,100000,0,1670069700,1670073300,3600
0,100000,100000,0,1670073300,1670073420,120
Question
The query takes some time to run for each device, and constantly querying and analysing the data for each identifier separately is quite time consuming, so I thought it might be possible to pre-process the data for all devices periodically and store the analysed results in a separate table or materialized view.
Running the query periodically and saving the results to a separate table or a materialized view isn't hard, but is it possible to do that for all identifier values that exist in the table at once?
I believe the query could be updated to do that, but I fail to grasp how.

Without delving into your analysis logic, I may suggest this:
extract the list of distinct driver_identifier-s, or have it stored in a materialized view too;
select from this list and LATERAL join it with your query.
Your query has to be changed a bit too: replace driver_identifier = 'V100000165676000' with driver_identifier = dil.drid to correlate it with the identifier list.
with driver_identifier_list(drid) as
(
select distinct driver_identifier from telemetry_tacho
)
select l.*
from driver_identifier_list as dil
cross join lateral
(
-- your query (where driver_identifier = dil.drid) here
) as l;
Effectively this is a loop that runs your query for every driver_identifier value. However, the view(s) would have to be refreshed on every telemetry_tacho mutation, which makes the effectiveness of the materialized-view approach questionable.
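For the pre-processing idea from the question, here is a minimal sketch of how this could be materialized and refreshed on a schedule instead of on every write (the view name telemetry_tacho_analysis is illustrative, and pg_cron is just one way to schedule the refresh):
-- Sketch only: telemetry_tacho_analysis is a made-up name.
CREATE MATERIALIZED VIEW telemetry_tacho_analysis AS
WITH driver_identifier_list(drid) AS (
  SELECT DISTINCT driver_identifier FROM telemetry_tacho
)
SELECT dil.drid AS identifier, l.*  -- keep the identifier so each device's rows can be told apart
FROM driver_identifier_list AS dil
CROSS JOIN LATERAL (
  -- the full query from the question goes here, with
  --   driver_identifier = 'V100000165676000'
  -- replaced by
  --   driver_identifier = dil.drid
) AS l;

-- Then, roughly every 60 seconds (cron, pg_cron, or the application itself):
REFRESH MATERIALIZED VIEW telemetry_tacho_analysis;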

Related

Double counting problem in Rolling weekly / monthly active endpoints

Here is my current code to calculate DAE, WAE, MAE:
select event_timestamp as day, Section, users as DAE, SUM(users)
OVER (PARTITION BY Section
ORDER BY event_timestamp
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) as WAE,
SUM(users)
OVER (PARTITION BY Section
ORDER BY event_timestamp
ROWS BETWEEN 29 PRECEDING AND CURRENT ROW) as MAE
from (
select count(distinct user_pseudo_id) as users, Section, event_timestamp
from
(select distinct *
from
(
select *,
CASE
WHEN param_value = "Names" or param_value = "SingleName" THEN 'Names'
ELSE param_value
END AS Section
from(
select user_pseudo_id, DATE_TRUNC(EXTRACT(DATE from TIMESTAMP_MICROS(event_timestamp)), DAY) as event_timestamp, event_name, params.value.string_value as param_value
from `rayn-deen-app.analytics_317927526.events_*`, unnest(event_params) as params
where (event_name = 'screen_view' and params.key = 'firebase_screen' and (
# Promises
params.value.string_value = "Promises"
# Favourites
or params.value.string_value = "Favourites"
))
group by user_pseudo_id, event_timestamp, event_name, param_value
order by event_timestamp, user_pseudo_id) raw
) base
order by event_timestamp) as events_table
group by Section, event_timestamp
)
The problem is that for WAE and MAE there are repeat counts of the same users. For example, if user A was a "daily active user" on 4 days of a week, the WAE count treats that as 4 users instead of one. I need to remove this double counting somehow.
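One common workaround, shown here as a sketch only (daily_users(day, Section, user_pseudo_id) is a hypothetical name for the deduplicated per-user, per-Section, per-day rows the inner subqueries already produce), is to join each day to its trailing window and apply COUNT(DISTINCT ...) there, since BigQuery analytic functions do not accept DISTINCT:
-- Sketch: daily_users stands for one row per user, Section and day.
SELECT
  d.day,
  d.Section,
  COUNT(DISTINCT IF(u.day = d.day, u.user_pseudo_id, NULL)) AS DAE,
  COUNT(DISTINCT IF(u.day >= DATE_SUB(d.day, INTERVAL 6 DAY), u.user_pseudo_id, NULL)) AS WAE,
  COUNT(DISTINCT u.user_pseudo_id) AS MAE
FROM (SELECT DISTINCT day, Section FROM daily_users) AS d
JOIN daily_users AS u
  ON u.Section = d.Section
 AND u.day BETWEEN DATE_SUB(d.day, INTERVAL 29 DAY) AND d.day
GROUP BY d.day, d.Section
ORDER BY d.day, d.Section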

Time intervals analysis in BigQuery

Introduction
I have IoT devices that are constantly sending data to the server. The data consists of those fields:
state;
state_valid;
progress;
timestamp;
There is no guarantee that data will be received in correct time order; the device sometimes sends data captured in the past, which removes the option of analyzing and enriching the data at ingestion time.
Received data is stored in a BigQuery table. Each device has a separate table. The table structure looks like this:
state: INTEGER, REQUIRED
state_valid: BOOLEAN, NULLABLE
progress: INTEGER, REQUIRED
timestamp: TIMESTAMP, REQUIRED
Requirements
After data collection, I need to analyze the data according to these rules:
Device is in the received state value until a different state is received;
If a record's state_valid is false - the state value should be ignored and 0 should be used instead;
If a record's state_valid is NULL, the last received state_valid value should be used;
In the analysis phase, data should be viewed in one-minute intervals;
For example, there shouldn't be a final record that starts at 20:51:07. The start date should be 20:51:00.
The state that was on for most of a one-minute interval should be used for the whole minute (see the sketch after this list).
For example, if the device had state 0 from 20:51:01 to 20:51:18 and state 2 from 20:51:18 to 20:52:12, then 20:51:00 to 20:51:59 should be marked as state 2.
The resulting data should group all consecutive intervals with the same state value and represent them as one record with start and end timestamps;
The grouped intervals of the same state should have a calculated progress difference (max_progress - min_progress).
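As an illustration, the majority-per-minute rule alone could be expressed like this in BigQuery (a sketch only; readings(state, ts) is a hypothetical per-second expansion of the raw rows, and the complete queries appear in the answers below):
SELECT
  TIMESTAMP_TRUNC(ts, MINUTE) AS ts_minute,
  -- take one row from the state that covers the most seconds of the minute
  ARRAY_AGG(state ORDER BY weight DESC LIMIT 1)[OFFSET(0)] AS state
FROM (
  SELECT
    state,
    ts,
    COUNT(*) OVER (PARTITION BY TIMESTAMP_TRUNC(ts, MINUTE), state) AS weight
  FROM readings
)
GROUP BY ts_minute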
Example
Let's say I receive this data from device:
state  state_valid  progress  timestamp
2      1            2451      20:50:00
0      1            2451      20:50:20
2      1            2451      20:52:29
3      1            2451      20:53:51
3      1            2500      20:54:20
2      0            2500      20:55:09
Below I provide a visualization of that data on a timeline to make the next steps easier to follow:
So the received data should be processed in one-minute intervals, assigning each minute the state that the device was in for the better part of that minute. So the above data becomes:
Then, consecutive intervals of same state value should be merged:
Result
So, I need a query that would, adhering to the requirements described in Requirements section and given the data shown in the Example section provide me such result:
group_id  state  progress  start_timestamp  end_timestamp  duration
0         0      0         20:50:00         20:52:00       120s
1         2      0         20:52:00         20:54:00       120s
2         3      49        20:54:00         20:55:00       60s
3         0      0         20:55:00         20:56:00       60s
Sample data
Consider those two data sets as sample data
Sample data 1
Data:
WITH data as (
SELECT * FROM UNNEST([
STRUCT(NULL AS state, 0 AS state_valid, 0 as progress, CURRENT_TIMESTAMP() as timestamp),
(2, 1, 2451, TIMESTAMP('2022-07-01 20:50:00 UTC')),
(0, 1, 2451, TIMESTAMP('2022-07-01 20:50:20 UTC')),
(2, 1, 2451, TIMESTAMP('2022-07-01 20:52:29 UTC')),
(3, 1, 2451, TIMESTAMP('2022-07-01 20:53:51 UTC')),
(3, 1, 2500, TIMESTAMP('2022-07-01 20:54:20 UTC')),
(2, 0, 2500, TIMESTAMP('2022-07-01 20:55:09 UTC')),
])
WHERE NOT state IS NULL
)
Expected outcome:
group_id  state  progress  start_timestamp  end_timestamp      duration
0         0      0         20:50:00         20:52:00           120s
1         2      0         20:52:00         20:54:00           120s
2         3      49        20:54:00         20:55:00           60s
3         0      0         20:55:00         current_timestamp  current_timestamp - 20:55:00
Sample data 2
Data:
WITH data as (
SELECT * FROM UNNEST([
STRUCT(NULL AS state, 0 AS state_valid, 0 as progress, CURRENT_TIMESTAMP() as timestamp),
(2, 1, 2451, TIMESTAMP('2022-07-01 20:50:00 UTC')),
(0, 1, 2451, TIMESTAMP('2022-07-01 20:50:20 UTC')),
(2, 1, 2451, TIMESTAMP('2022-07-01 20:52:29 UTC')),
(3, 1, 2451, TIMESTAMP('2022-07-01 20:53:51 UTC')),
(3, 1, 2500, TIMESTAMP('2022-07-01 20:54:20 UTC')),
(3, 1, 2580, TIMESTAMP('2022-07-01 20:55:09 UTC')),
(3, 1, 2600, TIMESTAMP('2022-07-01 20:59:09 UTC')),
(3, 1, 2700, TIMESTAMP('2022-07-01 21:20:09 UTC')),
(2, 0, 2700, TIMESTAMP('2022-07-01 22:11:09 UTC'))
])
WHERE NOT state IS NULL
)
Expected outcome:
group_id  state  progress  start_timestamp  end_timestamp      duration
0         0      0         20:50:00         20:52:00           120s
1         2      0         20:52:00         20:54:00           120s
2         3      249       20:54:00         22:11:00           4620s
3         0      0         22:11:00         current_timestamp  current_timestamp - 22:11:00
Consider below approach
with by_second as (
select if(state_valid = 0, 0, state) state, progress, ts, timestamp_trunc(ts, minute) ts_minute
from (
select *, timestamp_sub(lead(timestamp) over(order by timestamp), interval 1 second) as next_timestamp
from your_table
), unnest(generate_timestamp_array(
timestamp, ifnull(next_timestamp, timestamp_trunc(timestamp_add(timestamp, interval 60 second), minute)), interval 1 second
)) ts
), by_minute as (
select ts_minute, array_agg(struct(state, progress) order by weight desc limit 1)[offset(0)].*
from (
select state, progress, ts_minute, count(*) weight
from by_second
group by state, progress, ts_minute
)
group by ts_minute
having sum(weight) > 59
)
select group_id, any_value(state) state, max(progress) progress,
min(ts_minute) start_timestamp,
timestamp_add(max(ts_minute), interval 1 minute) end_timestamp,
60 * count(*) duration
from (
select countif(new_group) over(order by ts_minute) group_id, state, progress, ts_minute
from (
select ts_minute, state, progress - lag(progress) over(order by ts_minute) as progress,
ifnull((state, progress) != lag((state, progress)) over(order by ts_minute), true) new_group,
from by_minute
)
)
group by group_id
If applied to the dummy data in your question, the output is:
For some reason I feel that updating the existing answer would be confusing - so see the fixed solution here - there are two fixes, in two lines, at the very final select statement - they are commented so you can easily locate them:
with by_second as (
select if(state_valid = 0, 0, state) state, progress, ts, timestamp_trunc(ts, minute) ts_minute
from (
select *, timestamp_sub(lead(timestamp) over(order by timestamp), interval 1 second) as next_timestamp
from your_table
), unnest(generate_timestamp_array(
timestamp, ifnull(next_timestamp, timestamp_trunc(timestamp_add(timestamp, interval 60 second), minute)), interval 1 second
)) ts
), by_minute as (
select ts_minute, array_agg(struct(state, progress) order by weight desc limit 1)[offset(0)].*
from (
select state, progress, ts_minute, count(*) weight
from by_second
group by state, progress, ts_minute
)
group by ts_minute
having sum(weight) > 59
)
select group_id, any_value(state) state, sum(progress) progress,
# here changed max(progress) to sum(progress)
min(ts_minute) start_timestamp,
timestamp_add(max(ts_minute), interval 1 minute) end_timestamp,
60 * count(*) duration
from (
select countif(new_group) over(order by ts_minute) group_id, state, progress, ts_minute
from (
select ts_minute, state, progress - lag(progress) over(order by ts_minute) as progress,
-- ifnull((state, progress) != lag((state, progress)) over(order by ts_minute), true) new_group,
# fixed this line with below one
ifnull((state) != lag(state) over(order by ts_minute), true) new_group,
from by_minute
)
)
group by group_id
Yet another approach:
WITH preprocessing AS (
SELECT IF (LAST_VALUE(state_valid IGNORE NULLS) OVER (ORDER BY ts) = 0, 0, state) AS state,
LAST_VALUE(state_valid IGNORE NULLS) OVER (ORDER BY ts) AS state_valid,
progress, ts
FROM sample
),
intervals_added AS (
( SELECT *, 0 src FROM preprocessing UNION ALL
SELECT null, null, null, ts, 1
FROM (SELECT MIN(ts) min_ts FROM sample), (SELECT MAX(ts) max_ts FROM sample),
UNNEST (GENERATE_TIMESTAMP_ARRAY(min_ts, max_ts + INTERVAL 1 MINUTE, INTERVAL 1 MINUTE)) ts
) EXCEPT DISTINCT
SELECT null, null, null, ts, 1 FROM (SELECT ts FROM preprocessing)
),
analysis AS (
SELECT *, SUM(grp) OVER (ORDER BY ts) AS group_id FROM (
SELECT * EXCEPT(progress),
TIMESTAMP_TRUNC(ts, MINUTE) AS start_timestamp,
progress - LAST_VALUE(progress IGNORE NULLS) OVER w AS progress,
IF (LAST_VALUE(state IGNORE NULLS) OVER w <> state, 1, 0) AS grp,
TIMESTAMP_DIFF(LEAD(ts) OVER (ORDER BY ts, src), ts, SECOND) AS diff,
FROM intervals_added
WINDOW w AS (ORDER BY ts ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING)
) QUALIFY MAX(diff) OVER (PARTITION BY TIMESTAMP_TRUNC(ts, MINUTE)) = diff
)
SELECT group_id, MIN(state) AS state, SUM(progress) AS progress,
MIN(start_timestamp) AS start_timestamp,
MIN(start_timestamp) + INTERVAL COUNT(1) MINUTE AS end_timestamp,
60 * COUNT(1) AS duration,
FROM analysis GROUP BY 1 ORDER BY 1;
output:

Computing session start and end using SQL window functions

I've a table of game logs containing a handDate, like this:
ID  handDate
1   2019-06-30 16:14:02.000
2   2019-07-12 06:18:02.000
3   ...
I'd like to compute game sessions from this table (start and end), given that:
A new session starts when there has been no activity for 1 hour.
A session can span 2 days.
So I'd like results like this:
day         session_start            session_end
2019-06-30  2019-06-15 16:14:02.000  2019-06-15 16:54:02.000
2019-07-02  2019-07-02 16:18:02.000  2019-07-02 17:18:02.000
2019-07-02  2019-07-02 23:18:02.000  2019-07-03 03:18:02.000
2019-07-03  2019-07-03 06:18:02.000  2019-07-03 08:28:02.000
Currently I'm playing with the following code, but cannot achieve what I want:
SELECT *
FROM (
SELECT *,
strftime( '%s', handDate) - strftime( '%s', prev_event) AS inactivity
FROM (
SELECT handDate,
date( handDate) as day,
FIRST_VALUE( handDate) OVER (PARTITION BY date( handDate) ORDER BY handDate) AS first_event,
MIN(handDate) OVER (PARTITION BY date( handDate) ORDER BY handDate),
MAX(handDate) OVER (PARTITION BY date( handDate) ORDER BY handDate),
LAG( handDate) OVER (PARTITION BY date( handDate) ORDER BY handDate ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ) AS prev_event,
LEAD( handDate) OVER (PARTITION BY date( handDate) ORDER BY handDate) AS next_event
FROM hands
) last
) final
I'm using SQLite.
I found the following solution:
SELECT day,
sessionId,
MIN(handDate) as sessionStart,
MAX(handDate) as sessionEnd
FROM(
SELECT day,
handDate,
sum(is_new_session) over (
order by handDate rows between unbounded preceding and current row
) as sessionId
FROM (
SELECT *,
CASE
WHEN prev_event IS NULL
OR strftime('%s', handDate) - strftime('%s', prev_event) > 3600 THEN true
ELSE false
END AS is_new_session
FROM (
SELECT handDate,
date(handDate) as day,
LAG(handDate) OVER (
PARTITION BY date(handDate)
ORDER BY handDate RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS prev_event
FROM hands
)
)
)
GROUP BY sessionId
DROP TABLE IF EXISTS hands;
CREATE TABLE hands(handDate TIMESTAMP);
INSERT INTO hands(handDate)
VALUES ('2021-10-29 10:30:00')
, ('2021-10-29 11:35:00')
, ('2021-10-29 11:36:00')
, ('2021-10-29 11:37:00')
, ('2021-10-29 12:38:00')
, ('2021-10-29 12:39:00')
, ('2021-10-29 12:39:10')
;
SELECT start_period, end_period
FROM (
SELECT is_start, handDate AS start_period
, CASE WHEN is_start AND is_end THEN handDate
ELSE LEAD(handDate) OVER (ORDER BY handDate)
END AS END_period
FROM (
SELECT *
FROM (
SELECT *
,CASE WHEN (event - prev_event) * 1440.0 > 60 OR prev_event IS NULL THEN true ELSE FALSE END AS is_start -- gap to previous hand > 60 minutes
,CASE WHEN (next_event - event) * 1440.0 > 60 OR next_event IS NULL THEN true ELSE FALSE END AS is_end   -- gap to next hand > 60 minutes
FROM (
SELECT handDate
, julianday(handDate) event
, julianday(LAG(handDate) OVER (ORDER BY handDate)) AS prev_event
, julianday(LEAD(handDate) OVER (ORDER BY handDate)) AS next_event
FROM hands
) t
) t
WHERE is_start OR is_end
)t
)t
WHERE is_start

BigQuery SQL: filter on event sequence

I want to count, for each app_id, how many times the event_type: store_app_view was followed by the event_type: store_app_download for the same user ("followed" meaning the event_time_utc of store_app_view is older than event_time_utc of store_app_download).
Sample data:
WITH
`project.dataset.dummy_data_init` AS (SELECT event_id FROM UNNEST(GENERATE_ARRAY(1, 10000)) event_id),
`project.dataset.dummy_data_completed` AS (SELECT event_id,
user_id[OFFSET(CAST(20 * RAND() - 0.5 AS INT64))] user_id,
app_id[OFFSET(CAST(100 * RAND() - 0.5 AS INT64))] app_id,
event_type[OFFSET(CAST(6 * RAND() - 0.5 AS INT64))] event_type,
event_time_utc[OFFSET(CAST(26 * RAND() - 0.5 AS INT64))] event_time_utc
FROM `project.dataset.dummy_data_init`,
(SELECT GENERATE_ARRAY(1, 20) user_id),
(SELECT GENERATE_ARRAY(1, 100) app_id),
(SELECT ['store_app_view', 'store_app_view', 'store_app_download','store_app_install','store_app_update','store_fetch_manifest'] event_type),
(SELECT GENERATE_TIMESTAMP_ARRAY('2020-01-01 00:00:00', '2020-01-26 00:00:00',
INTERVAL 1 DAY) AS event_time_utc))
Select * FROM `project.dataset.dummy_data_completed`
Thanks!
I want to count, for each app_id, how many times the event_type: store_app_view was followed by the event_type: store_app_download.
Your provided query seems to have almost no connection to this question, so I'll ignore it.
For each user/app pair, you can get the rows that match your conditions using GROUP BY and HAVING:
select user_id, app_id
from t
group by user_id, app_id
having min(case when event_type = 'store_app_view' then event_time end) <
max(case when event_type = 'store_app_download' then event_time end);
To get the total for each app, use a subquery or CTE:
select app_id, count(*)
from (select user_id, app_id
from t
group by user_id, app_id
having min(case when event_type = 'store_app_view' then event_time end) <
max(case when event_type = 'store_app_download' then event_time end)
) ua
group by app_id;

BigQuery (Google Analytics data): query two different 'hits.customDimensions.index' in the same 'hits.hitNumber'

my goal:
Count 1 for the session if the following two hits.customDimensions.index values and their associated hits.customDimensions.value appear in the same hits.hitNumber (every row is 1 session if the main query is still nested):
['hits.customDimensions.index' = 43 with associated 'hits.customDimensions.value' IN ('login', 'payment', 'order', 'thankyou')] AND ['hits.customDimensions.index' = 10 with associated 'hits.customDimensions.value' = 'checkout'], both in the same hits.hitNumber
my problem:
I don't know how I can query two different hits.customDimensions.value entries in the same hits.hitNumber in one subquery without separate WITH-tables. If it's possible, which I'm sure it is, the query would be very easy and short. Since I don't know how to query this use case in a subquery, I use a workaround that totals 5 WITH-tables. I would appreciate an easier way to query this use case.
Explanation of the workaround query:
Table 1: queries everything except the 'problem metric'
Tables 2-3: each queries one hits.customDimensions.index with its associated hits.customDimensions.value, filtered for the correct value, sessionId and hitNumber
Table 4: left joins table 2 with table 3 on date, sessionId and hitNumber. Basically, if hitNumber combined with sessionId from table 2 and table 3 match, I count 1
Table 5: left joins table 1 with table 4 to combine the data
#Table1 - complete data except session_atleast_loginCheckout
WITH
prepared_data AS (
SELECT
date,
SUM((SELECT 1 FROM UNNEST(hits) WHERE CAST(eCommerceAction.action_type AS INT64) BETWEEN 4 AND 6 LIMIT 1)) AS sessions_atleast_basket,
#insert in this row query for sessions_atleast_loginCheckout
SUM((SELECT 1 FROM UNNEST(hits) as h, UNNEST(h.customDimensions) as hcd WHERE index = 43 AND value IN ('payment', 'order', 'thankyou') LIMIT 1)) AS sessions_atleast_payment,
FROM
`big-query-221916.172008714.ga_sessions_*`
WHERE
_TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)) AND FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)) AND totals.visits = 1
GROUP BY
date
#Table2 - data for hits.customDimensions.index = 10 AND associated hits.customDimensions.value = 'checkout' with hits.hitNumber and sessionId (join later based on hitNumber and sessionId)
loginCheckout_index10_pagetype_data AS (
SELECT
date AS date,
CONCAT(fullVisitorId, '/', CAST( visitStartTime AS STRING)) AS sessionId,
h.hitNumber AS hitNumber,
IF(hcd.value IS NOT NULL, 1, NULL) AS pagetype_checkout
FROM
`big-query-221916.172008714.ga_sessions_*` AS o, UNNEST(hits) as h, UNNEST(h.customDimensions) as hcd
WHERE
_TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)) AND FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)) AND hcd.index = 10 AND VALUE = 'checkout' AND h.type = 'PAGE' AND totals.visits = 1),
#Table3 - data for hits.customDimensions.index = 43 AND associated hits.customDimensions.value IN ('login', 'register', 'payment', 'order','thankyou') with hits.hitNumber and sessionId (join later based on hitNumber and sessionId)
loginCheckout_index43_pagelevel1_data AS (
SELECT
date AS date,
CONCAT(fullVisitorId, '/', CAST( visitStartTime AS STRING)) AS sessionId,
h.hitNumber AS hitNumber,
IF(hcd.value IS NOT NULL, 1, NULL) AS pagelevel1_login_to_thankyou
FROM
`big-query-221916.172008714.ga_sessions_*` AS o, UNNEST(hits) as h, UNNEST(h.customDimensions) as hcd
WHERE
_TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)) AND FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)) AND hcd.index = 43 AND VALUE IN ('login', 'register', 'payment', 'order', 'thankyou') AND h.type = 'PAGE'
),
#table4 - left join table2 and table 3 on sessionId and hitNumber to get sessions_atleast_loginCheckout
loginChackout_output_data AS(
SELECT
a.date AS date,
COUNT(DISTINCT a.sessionId) AS sessions_atleast_loginCheckout
FROM
loginCheckout_index10_pagetype_data AS a
LEFT JOIN
loginCheckout_index43_pagelevel1_data AS b
ON
a.date = b.date AND
a.sessionId = b.sessionId AND
a.hitNumber = b.hitNumber
WHERE
pagelevel1_login_to_thankyou IS NOT NULL
GROUP BY
date
#table5 - leftjoin table1 with table4 to get all data together
SELECT
prep.date,
prep.sessions_atleast_basket,
log.sessions_atleast_loginCheckout,
prep.sessions_atleast_payment
FROM
prepared_data AS prep
LEFT JOIN
loginChackout_output_data as log
ON
prep.date = log.date AND
It's a bit like Inception, but maybe it helps to keep in mind that the input of unnest() is an array and the output is table rows ...
SELECT
SUM(totals.visits) as sessions
FROM
`big-query-221916.172008714.ga_sessions_*`
WHERE
_TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)) AND FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY))
AND -- the following two hits.customDimensions.index and associated hits.customDimensions.value appear in the same hits.hitNumber
(SELECT COUNT(1)>0 as hitsCountMoreThanZero FROM UNNEST(hits) AS h
WHERE
-- index 43, value IN ('login', 'payment', 'order', 'thankyou')
(select count(1)>0 from unnest(h.customdimensions) where index=43 and value IN ('login', 'payment', 'order', 'thankyou'))
AND
-- index 10, value = 'checkout'
(select count(1)>0 from unnest(h.customdimensions) where index=10 and value='checkout')
)
GROUP BY
date