Splitting time series with Google BigQuery by field value

I have a dataset in Google BigQuery with vehicle positions over time and the direction they are going relative to a base, like:
time | x | y | direction | vehicle_id
-----|-----|-----|-----------|-----------
0:00 | ... | ... | returning | 100
0:00 | ... | ... | returning | 200
0:00 | ... | ... | exploring | 300
0:05 | ... | ... | returning | 100
0:05 | ... | ... | exploring | 200
0:05 | ... | ... | exploring | 300
0:10 | ... | ... | exploring | 100
0:10 | ... | ... | exploring | 200
0:10 | ... | ... | exploring | 300
0:15 | ... | ... | exploring | 100
0:15 | ... | ... | exploring | 200
0:15 | ... | ... | returning | 300
I'm able to aggregate by vehicle easily, but I can't come up with a query that can break each vehicle series into 'trips', consisting of sequential occurrences of 'returning' or 'exploring'. I have read about analytic functions but none seem to fit the bill.
SELECT
  vehicle_id,
  ARRAY_AGG(
    STRUCT(direction, time, x, y)
    ORDER BY time) AS series
FROM t
GROUP BY vehicle_id;
This gives:
[
  {
    "vehicle_id": 100,
    "series": [
      {"direction": "returning", "time": "0:00", "x": ..., "y": ...},
      {"direction": "returning", "time": "0:05", "x": ..., "y": ...},
      {"direction": "exploring", "time": "0:10", "x": ..., "y": ...},
      {"direction": "exploring", "time": "0:15", "x": ..., "y": ...}
    ]
  },
  {
    "vehicle_id": 200,
    "series": [
      {"direction": "returning", "time": "0:00", "x": ..., "y": ...},
      {"direction": "exploring", "time": "0:05", "x": ..., "y": ...},
      {"direction": "exploring", "time": "0:10", "x": ..., "y": ...},
      {"direction": "exploring", "time": "0:15", "x": ..., "y": ...}
    ]
  },
  {
    "vehicle_id": 300,
    "series": [
      {"direction": "exploring", "time": "0:00", "x": ..., "y": ...},
      {"direction": "exploring", "time": "0:05", "x": ..., "y": ...},
      {"direction": "exploring", "time": "0:10", "x": ..., "y": ...},
      {"direction": "returning", "time": "0:15", "x": ..., "y": ...}
    ]
  }
]
What I really want is to have a sequence of trips by vehicle, where each trip has a direction and a series of (t, x, y) positions. Is that possible to do?

Below is for BigQuery Standard SQL; it uses pure SQL to achieve the very same result:
#standardSQL
SELECT vehicle_id, ARRAY_AGG(STRUCT(direction, trip)) trips
FROM (
  SELECT vehicle_id, direction, ARRAY_AGG(STRUCT(time, x, y) ORDER BY time) trip
  FROM dataset
  GROUP BY vehicle_id, direction
)
GROUP BY vehicle_id
If applied to the sample data from your question, as in the example below:
#standardSQL
WITH dataset AS (
SELECT
TIMESTAMP '2019-09-07 00:00:00' AS time,
0.1 AS x, 0.1 AS y, 'returning' AS direction,
100 AS vehicle_id
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:00', 0.2, 0.2, 'returning', 200
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:00', 0.3, 0.3, 'exploring', 300
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:05', 1.1, 1.1, 'returning', 100
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:05', 1.2, 1.2, 'exploring', 200
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:05', 1.3, 1.3, 'exploring', 300
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:10', 2.1, 2.1, 'exploring', 100
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:10', 2.2, 2.2, 'exploring', 200
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:10', 2.3, 2.3, 'exploring', 300
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:15', 3.1, 3.1, 'exploring', 100
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:15', 3.2, 3.2, 'exploring', 200
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:15', 3.3, 3.3, 'returning', 300
)
SELECT vehicle_id, ARRAY_AGG(STRUCT(direction, trip)) trips
FROM (
  SELECT vehicle_id, direction, ARRAY_AGG(STRUCT(time, x, y) ORDER BY time) trip
  FROM dataset
  GROUP BY vehicle_id, direction
)
GROUP BY vehicle_id
The result matches the expected output shown in the question.
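One caveat: GROUP BY vehicle_id, direction only produces correct trips when each direction forms a single contiguous run per vehicle, as it does in the sample data. If a vehicle alternates (returning, exploring, returning again), the two returning runs would be merged into one trip. A gaps-and-islands sketch that splits on every direction change instead (reusing the same dataset CTE):
#standardSQL
SELECT vehicle_id, ARRAY_AGG(STRUCT(direction, trip) ORDER BY trip_start) trips
FROM (
  SELECT vehicle_id, direction, grp,
    MIN(time) AS trip_start,
    ARRAY_AGG(STRUCT(time, x, y) ORDER BY time) trip
  FROM (
    -- grp is a trip number that increments on every direction change
    SELECT *,
      COUNTIF(direction != prev_direction) OVER (PARTITION BY vehicle_id ORDER BY time) grp
    FROM (
      SELECT *,
        LAG(direction) OVER (PARTITION BY vehicle_id ORDER BY time) prev_direction
      FROM dataset
    )
  )
  GROUP BY vehicle_id, direction, grp
)
GROUP BY vehicle_id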

I wasn't able to come up with a pure SQL solution, but BigQuery does provide a way to execute arbitrary processing within itself, in the form of user-defined functions (UDFs).
By aggregating the entire series of a vehicle into an array, we can feed it into a JavaScript function that performs the necessary logic and splits the series into a sequence of trips.
CREATE TEMPORARY FUNCTION split_trips(
    series ARRAY<STRUCT<direction STRING,
                        time TIMESTAMP,
                        x FLOAT64,
                        y FLOAT64>>)
RETURNS ARRAY<STRUCT<direction STRING,
                     trip ARRAY<STRUCT<time TIMESTAMP,
                                       x FLOAT64,
                                       y FLOAT64>>>>
LANGUAGE js AS """
if (series.length == 0) {
  return [];
}
let result = [];
let trip = [];
for (let i = 0; i < series.length - 1; i++) {
  let {direction, time, x, y} = series[i];
  trip.push({time: time, x: x, y: y});
  // Keep extending the current trip while the direction stays the same.
  if (direction == series[i + 1].direction) {
    continue;
  }
  // The direction changes at the next point: close the current trip.
  result.push({direction: direction, trip: trip});
  trip = [];
}
// The last point always belongs to (and closes) the final trip.
let lastEntry = series[series.length - 1];
trip.push({time: lastEntry.time, x: lastEntry.x, y: lastEntry.y});
result.push({direction: lastEntry.direction, trip: trip});
return result;
""";
WITH dataset AS (
SELECT
TIMESTAMP '2019-09-07 00:00:00' AS time,
0.1 AS x, 0.1 AS y, 'returning' AS direction,
100 AS vehicle_id
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:00', 0.2, 0.2, 'returning', 200
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:00', 0.3, 0.3, 'exploring', 300
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:05', 1.1, 1.1, 'returning', 100
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:05', 1.2, 1.2, 'exploring', 200
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:05', 1.3, 1.3, 'exploring', 300
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:10', 2.1, 2.1, 'exploring', 100
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:10', 2.2, 2.2, 'exploring', 200
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:10', 2.3, 2.3, 'exploring', 300
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:15', 3.1, 3.1, 'exploring', 100
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:15', 3.2, 3.2, 'exploring', 200
UNION ALL SELECT TIMESTAMP '2019-09-07 00:00:15', 3.3, 3.3, 'returning', 300
),
by_vehicle AS (
  SELECT
    vehicle_id,
    ARRAY_AGG(STRUCT(direction, time, x, y)
              ORDER BY time) AS series
  FROM dataset
  GROUP BY vehicle_id
)
SELECT
  vehicle_id,
  split_trips(series) AS trips
FROM by_vehicle
The BigQuery documentation notes that each function invocation can produce at most 5 MiB of data, so considering the timestamp (64 bits) and two floats (64 bits each), that gives us at most ~200K entries to manipulate at once for each vehicle.
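As a back-of-the-envelope check (64 bits = 8 bytes per field):
5 MiB / (3 fields × 8 bytes) = 5,242,880 B / 24 B ≈ 218,000 entries per invocation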

Related

Flatten JSON Object in Snowflake

I have a question about using the flatten function in Snowflake. I'm having trouble extracting data from the path data:performance of the following JSON object:
{
  "data": {
    "metadata": {
      "id": "001",
      "created_at": "2020-01-01"
    },
    "performance": {
      "2020-01-01": {
        "ad_performances": [{
          "ad": "XoGKkgcy7V3BDm6m",
          "ad_impressions": 1,
          "clicks": 0,
          "device": "-3",
          "total_net_amount": 0
        }, {
          "ad": "XoGKkgmFlHa3V5xj",
          "ad_impressions": 17,
          "clicks": 0,
          "device": "-4",
          "total_net_amount": 0
        }, {
          "ad": "XoGKkgmFlHa3V5xj",
          "ad_impressions": 5,
          "clicks": 0,
          "device": "-5",
          "total_net_amount": 0
        }, {
          "ad": "XoGKkgcy7V3BDm6m",
          "ad_impressions": 19,
          "clicks": 0,
          "device": "-2",
          "total_net_amount": 0
        }, {
          "ad": "XoGKkgcy7V3BDm6m",
          "ad_impressions": 5,
          "clicks": 0,
          "device": "-1",
          "total_net_amount": 0
        }]
      }
    }
  }
}
The desired result is a table with the "date" (2020-01-01), "ad", and "impressions".
I tried to achieve the desired result with:
select
key::date as date
,f.value:performances:ad as performances_array
,f.value:performances:impressions as performances_array
from <table>, lateral flatten (input => CLMN:performances) f;
but I'm not able to extract data from the performance array. Can someone help me out?
Thank you!
Can you try this one? The first FLATTEN iterates over the dates under data:performance, the second steps into each date's object, and the third iterates over the ad_performances array:
select f.KEY date,
l.VALUE:"ad" as performances_array,
l.VALUE:"impressions" as performances_array
from mydata, lateral flatten (input => CLMN:data.performance ) f,
lateral flatten (input => f.VALUE ) s,
lateral flatten (input => s.VALUE ) l
;
+------------+--------------------+--------------------+
| DATE       | PERFORMANCES_ARRAY | PERFORMANCES_ARRAY |
+------------+--------------------+--------------------+
| 2020-01-01 | "XoGKkgcy7V3BDm6m" | 1                  |
| 2020-01-01 | "XoGKkgmFlHa3V5xj" | 17                 |
| 2020-01-01 | "XoGKkgmFlHa3V5xj" | 5                  |
| 2020-01-01 | "XoGKkgcy7V3BDm6m" | 19                 |
| 2020-01-01 | "XoGKkgcy7V3BDm6m" | 5                  |
+------------+--------------------+--------------------+
Only two LATERAL FLATTENs are required to extract the rows:
select
a.key::date as ad_date,
b.value:ad::varchar as ad,
b.value:ad_impressions::int as impressions
from j
, lateral flatten(input => v:data:performance) a
, lateral flatten(input => a.value:ad_performances) b;
AD_DATE     AD                IMPRESSIONS
2020-01-01  XoGKkgcy7V3BDm6m  1
2020-01-01  XoGKkgmFlHa3V5xj  17
2020-01-01  XoGKkgmFlHa3V5xj  5
2020-01-01  XoGKkgcy7V3BDm6m  19
2020-01-01  XoGKkgcy7V3BDm6m  5
If you want to aggregate the data by ad date and ad,
with r as
(
select
a.key::date as ad_date,
b.value:ad::varchar as ad,
b.value:ad_impressions::int as impressions
from j
, lateral flatten(input => v:data:performance) a
, lateral flatten(input => a.value:ad_performances) b
)
select ad_date, ad, sum(impressions) as impressions
from r
group by ad_date, ad;
AD_DATE     AD                IMPRESSIONS
2020-01-01  XoGKkgcy7V3BDm6m  25
2020-01-01  XoGKkgmFlHa3V5xj  22

Concatenate column by condition (oracle)

source_id  source_groupid  source_nm  category_id  level_id
12345      34              ABC        7            2
67549                      GI         5            1
24751                      BL                      6
Current result:
{"id": 12345, "groupid": 34, "name": ABC, "category_id": 7, "level_id": 2}
{"id": 67549, "groupid": , "name": GI, "category_id": 5, "level_id": 1}
SELECT CONCAT('{',
              '"id": ', source_id, ', ',
              '"groupid": ', source_groupid, ', ',
              '"name": ', source_nm, ', ',
              '"category_id": ', category_id, ', ',
              '"level_id": ', level_id,
              '}') AS full_info
FROM table
I need to do the column concatenation according to this pattern. If, for example, there is no entry in source_groupid or category_id, how do I write the code so that the template changes and rows 2 and 3 look as follows?
{"id": 12345, "groupid": 34, "name": ABC, "category_id": 7, "level_id": 2}
{"id": 67549, "name": GI, "category_id": 5, "level_id": 1}
{"id": 24751, "name": BL, "level_id": 6}
Well, in Oracle (whose tag you've used), the CONCAT function accepts only two arguments, so that code won't work.
Instead, use the double-pipe || concatenation operator. As for your main problem, CASE is the answer.
SQL> with test (source_id, source_groupid, source_nm, category_id, level_id) as
2 (select 12345, 34, 'ABC', 7, 2 from dual union all
3 select 67549, null, 'GI' , 5, 1 from dual union all
4 select 24751, null, 'BL' , null, 6 from dual
5 )
6 select '{' || '"id": ' || source_id ||
7 case when source_groupid is not null then ', "groupid": ' || source_groupid end ||
8 case when source_nm is not null then ', "name": ' || source_nm end ||
9 case when category_id is not null then ', "category_id": ' || category_id end ||
10 case when level_id is not null then ', "level_id": ' || level_id end || '}'
11 as result
12 from test;
RESULT
--------------------------------------------------------------------------------
{"id": 12345, "groupid": 34, "name": ABC, "category_id": 7, "level_id": 2}
{"id": 67549, "name": GI, "category_id": 5, "level_id": 1}
{"id": 24751, "name": BL, "level_id": 6}
SQL>
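A side note on why the CASE branches can simply be concatenated: Oracle treats NULL as an empty string in concatenation, so a CASE without an ELSE contributes nothing when its condition fails. A quick illustration:
SELECT 'a' || NULL || 'b' AS result FROM dual;  -- returns 'ab'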
From Oracle 12, don't build JSON by hand; use the JSON_OBJECT function and then you can use ABSENT ON NULL:
SELECT JSON_OBJECT(
KEY 'id' VALUE source_id,
KEY 'groupid' VALUE source_groupid,
KEY 'name' VALUE source_nm,
KEY 'category_id' VALUE category_id,
KEY 'level_id' VALUE level_id
ABSENT ON NULL
) As json
FROM table_name;
Which, for the sample data:
CREATE TABLE table_name (source_id, source_groupid, source_nm, category_id, level_id) AS
SELECT 12345, 34, 'ABC', 7, 2 FROM DUAL UNION ALL
SELECT 67549, NULL, 'GI', 5, 1 FROM DUAL UNION ALL
SELECT 24751, NULL, 'BL', NULL, 6 FROM DUAL;
Outputs:
JSON
{"id":12345,"groupid":34,"name":"ABC","category_id":7,"level_id":2}
{"id":67549,"name":"GI","category_id":5,"level_id":1}
{"id":24751,"name":"BL","level_id":6}

folding over large BigQuery result

Is there any easy way for me to do something like OCaml's fold_left on the result of a BigQuery query, where each iteration corresponds to one row in the result?
Which product or approach would be the easiest way to do this? It would be great if:
all I need to do is to supply the initial state and the 'folder' function
preferably, I'd like to write the 'folder' function in a functional language
I don't need to install any GCP package
Since I don't know which product or language would work, I cannot be more specific, but pseudocode would be like:
let my_init = []
let my_folder = fun state row ->
// append for now, but it will be complicated. I need to do some set operations here. The point is that I need some way of transferring "state" across rows, when I iterate over rows in a predefined order.
row.col1 :: state
let query = "SELECT col1, col2, col3 FROM table1 ORDER BY timestamp"
query |> List.fold my_folder my_init
The result that I want to get from this simplified example is the final "state".
--- UPDATED ---
There is no bound on the number of rows; as more data comes in, we get more rows. Typically the number is more than a few million, but it can be larger than that.
Here's a simplified example that shows the major problem I'm encountering. We have a table with a few columns:
timestamp
user_id: a string id
operation_json: a stringified JSON array of operations, each of which either:
adds the user_id to a set, or
removes the user_id from a set
For example, the following are valid rows:
----------+---------+----------------------------------------------
timestamp | user_id | operation_json
----------+---------+----------------------------------------------
1 | id1 | [ { "op": "add", "set": "set1" } ]
2 | id2 | [ { "op": "add", "set": "set1" } ]
3 | id1 | [ { "op": "add", "set": "set2" } ]
4 | id3 | [ { "op": "add", "set": "set2" } ]
5 | id1 | [ { "op": "remove", "set": "set1" } ]
----------+---------+----------------------------------------------
As a result, I'd like to get sets of users; i.e.,
set1 |-> { id2 }
set2 |-> { id1, id3 }
I thought a fold_left-like operation would be convenient. The state would be a map from set name to the set of user ids it contains, and the initial state would be an empty map.
Below is a quick and simple example for BigQuery Standard SQL.
#standardSQL
CREATE TEMP FUNCTION fold(arr ARRAY<INT64>, init INT64)
RETURNS FLOAT64
LANGUAGE js AS """
const reducer = (accumulator, currentValue) => accumulator + parseInt(currentValue);
return arr.reduce(reducer, parseInt(init));
""";
WITH `project.dataset.table` AS (
SELECT 1 id, [1, 2, 3, 4] arr, 5 initial_state UNION ALL
SELECT 2, [1, 2, 3, 4, 5, 6, 7], 10
)
SELECT id, fold(arr, initial_state) result
FROM `project.dataset.table`
output is
Row  id  result
1    1   15.0
2    2   38.0
I think it is self-explanatory enough.
See the BigQuery documentation on JavaScript UDFs for more.
Folding a list of rows
Below is an extension of the above.
Here you assemble an array from the result's rows before applying the fold function (of course, keep in mind the UDF limits here, as well as the limit on how big your ARRAY of rows can grow, etc.).
#standardSQL
CREATE TEMP FUNCTION fold(arr ARRAY<INT64>, init INT64)
RETURNS FLOAT64
LANGUAGE js AS """
const reducer = (accumulator, currentValue) => accumulator + parseInt(currentValue);
return arr.reduce(reducer, parseInt(init));
""";
WITH `project.dataset.table` AS (
SELECT 1 id, 1 item UNION ALL
SELECT 1, 2 UNION ALL
SELECT 1, 3 UNION ALL
SELECT 1, 4 UNION ALL
SELECT 2, 1 UNION ALL
SELECT 2, 2 UNION ALL
SELECT 2, 3 UNION ALL
SELECT 2, 4 UNION ALL
SELECT 2, 5 UNION ALL
SELECT 2, 6 UNION ALL
SELECT 2, 7
)
SELECT id, fold(ARRAY_AGG(item), 5) result
FROM `project.dataset.table`
GROUP BY id
Note: if you need to include more than one field from each row, you can use an ARRAY of STRUCTs, as in the example below:
ARRAY_AGG(STRUCT(id, item) ORDER BY id)
Of course, you will need to adjust the signature of the fold UDF accordingly.
For example:
#standardSQL
CREATE TEMP FUNCTION fold(arr ARRAY<STRUCT<id INT64, item INT64>>, init INT64)
RETURNS FLOAT64
LANGUAGE js AS """
const reducer = (accumulator, currentValue) => accumulator + parseInt(currentValue.item);
return arr.reduce(reducer, parseInt(init));
""";
WITH `project.dataset.table` AS (
SELECT 1 id, 1 item UNION ALL
SELECT 1, 2 UNION ALL
SELECT 1, 3 UNION ALL
SELECT 1, 4 UNION ALL
SELECT 2, 1 UNION ALL
SELECT 2, 2 UNION ALL
SELECT 2, 3 UNION ALL
SELECT 2, 4 UNION ALL
SELECT 2, 5 UNION ALL
SELECT 2, 6 UNION ALL
SELECT 2, 7
)
SELECT id, fold(ARRAY_AGG(t), 5) result
FROM `project.dataset.table` t
GROUP BY id
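One thing worth noting: a fold is order-sensitive in general, and ARRAY_AGG without an ORDER BY gives no ordering guarantee. Summation happens not to care, but for the original use case (iterating rows in a predefined order) you would aggregate in that order; with this toy schema, something like:
SELECT id, fold(ARRAY_AGG(t ORDER BY item), 5) result
FROM `project.dataset.table` t
GROUP BY id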
The approach below has nothing to do with folding per se; rather, it attempts to translate your challenge into a set-based one (which is more natural when dealing with SQL) by identifying the latest op for each user per set. If that op is "remove", the user is simply eliminated from further consideration; if it is "add", the latest "add" for that user/set is used. This assumes there cannot be multiple consecutive "add" actions for the same user/set; rather, they alternate add/remove/add and so on. Of course, this can be further adjusted based on the real use case.
With the above in mind, below is an example for BigQuery Standard SQL:
#standardSQL
WITH `project.dataset.table` AS (
SELECT 1 ts, 'id1' user_id, '[ { "op": "add", "set": "set1" } ]' operation_json UNION ALL
SELECT 2, 'id2', '[ { "op": "add", "set": "set1" } ]' UNION ALL
SELECT 3, 'id1', '[ { "op": "add", "set": "set2" } ]' UNION ALL
SELECT 4, 'id3', '[ { "op": "add", "set": "set2" } ]' UNION ALL
SELECT 5, 'id1', '[ { "op": "remove", "set": "set1" } ]'
)
SELECT bin, STRING_AGG(user_id, ',' ORDER BY ts) result
FROM (
  SELECT user_id, bin, ARRAY_AGG(ts ORDER BY ts DESC LIMIT 1)[OFFSET(0)] ts
  FROM (
    SELECT ts, user_id, op, bin, LAST_VALUE(op) OVER(win) fin
    FROM (
      SELECT ts, user_id,
        JSON_EXTRACT_SCALAR(REGEXP_REPLACE(operation_json, r'^\[|\]$', ''), '$.op') op,
        JSON_EXTRACT_SCALAR(REGEXP_REPLACE(operation_json, r'^\[|\]$', ''), '$.set') bin
      FROM `project.dataset.table`
    )
    WINDOW win AS (
      PARTITION BY user_id, bin
      ORDER BY ts
      ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
  )
  WHERE fin = 'add'
  GROUP BY user_id, bin
)
GROUP BY bin
-- ORDER BY bin
output is
Row  bin   result
1    set1  id2
2    set2  id1,id3
If applied to the dummy data below,
WITH `project.dataset.table` AS (
SELECT 1 ts, 'id1' user_id, '[ { "op": "add", "set": "set1" } ]' operation_json UNION ALL
SELECT 2, 'id2', '[ { "op": "add", "set": "set1" } ]' UNION ALL
SELECT 3, 'id1', '[ { "op": "add", "set": "set2" } ]' UNION ALL
SELECT 4, 'id3', '[ { "op": "add", "set": "set2" } ]' UNION ALL
SELECT 5, 'id1', '[ { "op": "remove", "set": "set1" } ]' UNION ALL
SELECT 6, 'id1', '[ { "op": "add", "set": "set1" } ]' UNION ALL
SELECT 7, 'id1', '[ { "op": "remove", "set": "set1" } ]' UNION ALL
SELECT 8, 'id1', '[ { "op": "add", "set": "set1" } ]' UNION ALL
SELECT 9, 'id1', '[ { "op": "remove", "set": "set2" } ]' UNION ALL
SELECT 10, 'id1', '[ { "op": "add", "set": "set2" } ]'
)
result will be
Row  bin   result
1    set1  id2,id1
2    set2  id3,id1

SQL-style GROUP BY aggregate functions in jq (COUNT, SUM and etc)

Similar questions have been asked here before:
Count items for a single key: jq count the number of items in json by a specific key
Calculate the sum of object values:
How do I sum the values in an array of maps in jq?
Question
How do we emulate the COUNT aggregate function so that it behaves like its SQL original? Let's extend the question to include other common SQL functions:
COUNT
SUM / MAX/ MIN / AVG
ARRAY_AGG
The last one is not a standard SQL function - it's from PostgreSQL but is quite useful.
The input is a stream of valid JSON objects. For demonstration, let's pick a simple story of owners and their pets.
Model and data
Base relation: Owner
id name age
1 Adams 25
2 Baker 55
3 Clark 40
4 Davis 31
Base relation: Pet
id name litter owner_id
10 Bella 4 1
20 Lucy 2 1
30 Daisy 3 2
40 Molly 4 3
50 Lola 2 4
60 Sadie 4 4
70 Luna 3 4
Source
From the above we get a derived relation, Owner_Pet (the result of an SQL JOIN of the above relations), presented as a stream of JSON objects for our jq queries (the source data):
{ "owner_id": 1, "owner": "Adams", "age": 25, "pet_id": 10, "pet": "Bella", "litter": 4 }
{ "owner_id": 1, "owner": "Adams", "age": 25, "pet_id": 20, "pet": "Lucy", "litter": 2 }
{ "owner_id": 2, "owner": "Baker", "age": 55, "pet_id": 30, "pet": "Daisy", "litter": 3 }
{ "owner_id": 3, "owner": "Clark", "age": 40, "pet_id": 40, "pet": "Molly", "litter": 4 }
{ "owner_id": 4, "owner": "Davis", "age": 31, "pet_id": 50, "pet": "Lola", "litter": 2 }
{ "owner_id": 4, "owner": "Davis", "age": 31, "pet_id": 60, "pet": "Sadie", "litter": 4 }
{ "owner_id": 4, "owner": "Davis", "age": 31, "pet_id": 70, "pet": "Luna", "litter": 3 }
Requests
Here are sample requests and their expected output:
COUNT the number of pets per owner:
{ "owner_id": 1, "owner": "Adams", "age": 25, "pets_count": 2 }
{ "owner_id": 2, "owner": "Baker", "age": 55, "pets_count": 1 }
{ "owner_id": 3, "owner": "Clark", "age": 40, "pets_count": 1 }
{ "owner_id": 4, "owner": "Davis", "age": 31, "pets_count": 3 }
SUM up the number of whelps per owner and get their MAX (MIN/AVG):
{ "owner_id": 1, "owner": "Adams", "age": 25, "litter_total": 6, "litter_max": 4 }
{ "owner_id": 2, "owner": "Baker", "age": 55, "litter_total": 3, "litter_max": 3 }
{ "owner_id": 3, "owner": "Clark", "age": 40, "litter_total": 4, "litter_max": 4 }
{ "owner_id": 4, "owner": "Davis", "age": 31, "litter_total": 9, "litter_max": 4 }
ARRAY_AGG pets per owner:
{ "owner_id": 1, "owner": "Adams", "age": 25, "pets": [ "Bella", "Lucy" ] }
{ "owner_id": 2, "owner": "Baker", "age": 55, "pets": [ "Daisy" ] }
{ "owner_id": 3, "owner": "Clark", "age": 40, "pets": [ "Molly" ] }
{ "owner_id": 4, "owner": "Davis", "age": 31, "pets": [ "Lola", "Sadie", "Luna" ] }
Here's an alternative not using any custom functions, just basic jq. (I took the liberty of getting rid of redundant parts of the question.)
Count
In> jq -s 'group_by(.owner_id) | map({owner_id: .[0].owner_id, count: (map(.pet) | length)})'
Out> [{"owner_id": 1, "count": 2}, ...]
Sum
In> jq -s 'group_by(.owner_id) | map({owner_id: .[0].owner_id, sum: (map(.litter) | add)})'
Out> [{"owner_id": 1, "sum": 6}, ...]
Max
In> jq -s 'group_by(.owner_id) | map({owner_id: .[0].owner_id, max: (map(.litter) | max)})'
Out> [{"owner_id": 1, "max": 4}, ...]
Aggregate
In> jq -s 'group_by(.owner_id) | map({owner_id: .[0].owner_id, agg: map(.pet)})'
Out> [{"owner_id": 1, "agg": ["Bella","Lucy"]}, ...]
Sure, these might not be the most efficient implementations, but they show nicely how to implement such functions oneself. All that changes between the different aggregations is inside the last map and the function after the pipe | (length, add, max).
The first map iterates over the groups, taking the owner_id from the first item and using map again to iterate over the items of the same group. Not as pretty as SQL, but not terribly more complicated.
I learned jq today and already managed to do this, so this should be encouraging for anyone getting started. jq is neither like sed nor like SQL, but it is not terribly hard either.
Extended jq solution:
Custom count() function:
jq -sc 'def count($k): group_by(.[$k])[] | length as $l | .[0]
| .pets_count = $l
| del(.pet_id, .pet, .litter);
count("owner_id")' source.data
The output:
{"owner_id":1,"owner":"Adams","age":25,"pets_count":2}
{"owner_id":2,"owner":"Baker","age":55,"pets_count":1}
{"owner_id":3,"owner":"Clark","age":40,"pets_count":1}
{"owner_id":4,"owner":"Davis","age":31,"pets_count":3}
Custom sum() function:
jq -sc 'def sum($k): group_by(.[$k])[] | map(.litter) as $litters | .[0]
| . + {litter_total: ($litters | add), litter_max: ($litters | max)}
| del(.pet_id, .pet, .litter);
sum("owner_id")' source.data
The output:
{"owner_id":1,"owner":"Adams","age":25,"litter_total":6,"litter_max":4}
{"owner_id":2,"owner":"Baker","age":55,"litter_total":3,"litter_max":3}
{"owner_id":3,"owner":"Clark","age":40,"litter_total":4,"litter_max":4}
{"owner_id":4,"owner":"Davis","age":31,"litter_total":9,"litter_max":4}
Custom array_agg() function:
jq -sc 'def array_agg($k): group_by(.[$k])[] | map(.pet) as $pets | .[0]
| .pets = $pets | del(.pet_id, .pet, .litter);
array_agg("owner_id")' source.data
The output:
{"owner_id":1,"owner":"Adams","age":25,"pets":["Bella","Lucy"]}
{"owner_id":2,"owner":"Baker","age":55,"pets":["Daisy"]}
{"owner_id":3,"owner":"Clark","age":40,"pets":["Molly"]}
{"owner_id":4,"owner":"Davis","age":31,"pets":["Lola","Sadie","Luna"]}
This is a nice exercise, but SO is not a programming service, so I will focus here on some key concepts for generic solutions in jq that are efficient, even for very large collections.
GROUPS_BY
The key to efficiency here is avoiding the built-in group_by, as it requires sorting. Since jq is fundamentally stream-oriented, the following definition of GROUPS_BY is likewise stream-oriented. It takes advantage of the efficiency of key-based lookups, while avoiding calling tojson on strings:
# emit a stream of the groups defined by f
def GROUPS_BY(stream; f):
reduce stream as $x ({};
($x|f) as $s
| ($s|type) as $t
| (if $t == "string" then $s else ($s|tojson) end) as $y
| .[$t][$y] += [$x] )
| .[][] ;
distinct and count_distinct
# Emit an array of the distinct entities in `stream`, without sorting
def distinct(stream):
reduce stream as $x ({};
($x|type) as $t
| (if $t == "string" then $x else ($x|tojson) end) as $y
| if (.[$t] | has($y)) then . else .[$t][$y] += [$x] end )
| [.[][]] | add ;
# Emit the number of distinct items in the given stream
def count_distinct(stream):
def sum(s): reduce s as $x (0;.+$x);
reduce stream as $x ({};
($x|type) as $t
| (if $t == "string" then $x else ($x|tojson) end) as $y
| .[$t][$y] = 1 )
| sum( .[][] ) ;
Convenience function
def owner: {owner_id,owner,age};
Example: "COUNT the number of pets per owner"
GROUPS_BY(inputs; .owner_id)
| (.[0] | owner) + {pets_count: count_distinct(.[]|.pet_id)}
Invocation: jq -nc -f program1.jq input.json
Output:
{"owner_id":1,"owner":"Adams","age":25,"pets_count":2}
{"owner_id":2,"owner":"Baker","age":55,"pets_count":1}
{"owner_id":3,"owner":"Clark","age":40,"pets_count":1}
{"owner_id":4,"owner":"Davis","age":31,"pets_count":3}
Example: "SUM up the number of whelps per owner and get their MAX"
GROUPS_BY(inputs; .owner_id)
| (.[0] | owner)
+ {litter_total: (map(.litter) | add)}
+ {litter_max: (map(.litter) | max)}
Invocation: jq -nc -f program2.jq input.json
Output: as given.
Example: "ARRAY_AGG pets per owner"
GROUPS_BY(inputs; .owner_id)
| (.[0] | owner) + {pets: distinct(.[]|.pet)}
Invocation: jq -nc -f program3.jq input.json
Output:
{"owner_id":1,"owner":"Adams","age":25,"pets":["Bella","Lucy"]}
{"owner_id":2,"owner":"Baker","age":55,"pets":["Daisy"]}
{"owner_id":3,"owner":"Clark","age":40,"pets":["Molly"]}
{"owner_id":4,"owner":"Davis","age":31,"pets":["Lola","Sadie","Luna"]}

How to format and query PostgreSQL JSON into n-column rows in SQL

I would like to store a JSON value and be able to display it as three-column rows, as follows:
scores median sigma
------ ------ -----
BF2D 0.5 0.67
BF2DSL 0.6 0.89
First, I have a hard time formatting it.
Second, what format would be suitable so that I can display it as three-column rows? I have a feeling that I should use an array representation.
The json_to_record(json) function seems interesting for my use case, but I need help.
SELECT * from json_to_record('{"version": "1.0", "BF2D": [{"median": 0.5}, {"sigma": 0.67}], "BF2DSL": [{"median": 0.6}, {"sigma": 0.89}]}'::JSON) as x(median FLOAT, sigma FLOAT);
median | sigma
--------+--------
(NULL) | (NULL)
Not what I am looking for.
I tried to use nested arrays, but was unable to format them into valid JSON:
SELECT '{"version": "1.0", "scores": [{ "BF2D": [{"median": 0.5}, {"sigma": 0.67}] }, { "BF2DSL": [{"median": 0.6}, {"sigma": 0.89}] }]'::JSON;
Got it!
We can retrieve the base-level keys as follows:
SELECT * FROM json_object_keys('{"version": "1.0", "BF2D": {"avg": 0.5, "max": 99}, "BF2DSL": {"avg": 0.9, "max": 67}}'::JSON) scores WHERE scores<>'version';
scores
--------
BF2D
BF2DSL
We can get the key/values for a specific base-level key, e.g., for BF2D, as follows:
SELECT * FROM json_each('{"version": "1.0", "BF2D": {"avg": 0.5, "max": 99}, "BF2DSL": {"avg": 0.9, "max": 67}}'::JSON->'BF2D');
key | value
-----+-------
avg | 0.5
max | 99
But combining the above two queries does not give three-column rows; json_each comes back as a single composite column:
SELECT scores, json_each('{"version": "1.0", "BF2D": {"avg": 0.5, "max": 99}, "BF2DSL": {"avg": 0.9, "max": 67}}'::JSON->scores)
FROM (SELECT * FROM json_object_keys('{"version": "1.0", "BF2D": {"avg": 0.5, "max": 99}, "BF2DSL": {"avg": 0.9, "max": 67}}'::JSON) scores WHERE scores<>'version') t;
scores | json_each
--------+-----------
BF2D | (avg,0.5)
BF2D | (max,99)
BF2DSL | (avg,0.9)
BF2DSL | (max,67)
Finally, we can display the result in separate columns as follows:
WITH p AS
(SELECT '{"version": "1.0", "BF2D": {"avg": 0.5, "max": 99, "sigma": 0.1}, "BF2DSL": {"avg": 0.9, "max": 67}}'::JSON AS jval),
t AS
(SELECT jval,json_object_keys(jval) cod FROM p),
scores AS
(SELECT * FROM t WHERE cod<>'version')
SELECT cod,t.* FROM scores,json_each(jval->cod) t;
cod | key | value
--------+-------+-------
BF2D | avg | 0.5
BF2D | max | 99
BF2D | sigma | 0.1
BF2DSL | avg | 0.9
BF2DSL | max | 67
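To land exactly on the three-column shape from the question (one row per score, with median and sigma columns), a final sketch; it assumes the scores are stored as flat objects like {"median": 0.5, "sigma": 0.67} under each key, in the same style as the examples above:
WITH p AS
(SELECT '{"version": "1.0", "BF2D": {"median": 0.5, "sigma": 0.67}, "BF2DSL": {"median": 0.6, "sigma": 0.89}}'::JSON AS jval),
t AS
(SELECT jval, json_object_keys(jval) cod FROM p),
scores AS
(SELECT * FROM t WHERE cod <> 'version')
SELECT cod AS scores,
       (jval -> cod ->> 'median')::float AS median,
       (jval -> cod ->> 'sigma')::float  AS sigma
FROM scores;
scores | median | sigma
--------+--------+-------
BF2D   | 0.5    | 0.67
BF2DSL | 0.6    | 0.89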