How to query on fields from nested records without referring to the parent records in BigQuery? - google-bigquery

I have data structured as follows:
{
"results": {
"A": {"first": 1, "second": 2, "third": 3},
"B": {"first": 4, "second": 5, "third": 6},
"C": {"first": 7, "second": 8, "third": 9},
"D": {"first": 1, "second": 2, "third": 3},
... },
...
}
i.e. nested records, where the lowest level has the same schema for all records in the level above. The schema would be similar to this:
results RECORD NULLABLE
results.A RECORD NULLABLE
results.A.first INTEGER NULLABLE
results.A.second INTEGER NULLABLE
results.A.third INTEGER NULLABLE
results.B RECORD NULLABLE
results.B.first INTEGER NULLABLE
...
Is there a way to do (e.g. aggregate) queries in BigQuery on fields from the lowest level, without knowledge of the keys on the (direct) parent level? Put differently, can I do a query on first for all records in results without having to specify A, B, ... in my query?
I would for example want to achieve something like
SELECT SUM(results.*.first) FROM table
in order to get 1+4+7+1 = 13,
but SELECT results.*.first isn't supported.
(I've tried playing around with STRUCTs, but haven't gotten far.)

Below trick is for BigQuery Standard SQL
#standardSQL
-- Gather the child records (a, b, c, d) into one array, flatten it, and
-- aggregate the lowest-level fields across all of them in a single pass.
SELECT
  id,
  (
    SELECT AS STRUCT
      SUM(first)  AS sum_first,
      SUM(second) AS sum_second,
      SUM(third)  AS sum_third
    FROM UNNEST([a, b, c, d])
  ).*
FROM `project.dataset.table`,
UNNEST([results])
You can test, play with above using dummy/sample data from your question as in below example
#standardSQL
-- Self-contained example: a one-row table matching the question's sample
-- data, followed by the cross-child aggregation query.
WITH `project.dataset.table` AS (
  SELECT 1 AS id, STRUCT(
    STRUCT(1 AS first, 2 AS second, 3 AS third) AS A,
    STRUCT(4 AS first, 5 AS second, 6 AS third) AS B,
    STRUCT(7 AS first, 8 AS second, 9 AS third) AS C,
    STRUCT(1 AS first, 2 AS second, 3 AS third) AS D
  ) AS results
)
SELECT
  id,
  (
    SELECT AS STRUCT
      SUM(first)  AS sum_first,
      SUM(second) AS sum_second,
      SUM(third)  AS sum_third
    FROM UNNEST([a, b, c, d])
  ).*
FROM `project.dataset.table`,
UNNEST([results])
with output
Row id sum_first sum_second sum_third
1 1 13 17 21

Is there a way to do (e.g. aggregate) queries in BigQuery on fields from the lowest level, without knowledge of the keys on the (direct) parent level?
Below is for BigQuery Standard SQL and totally avoids referencing parent records (A, B, C, D, etc.)
#standardSQL
-- Nested_SUM: sums `field_name` across ALL child records of `entries`
-- without ever naming the children (A, B, C, ...).
-- Trick: serialize the struct to a JSON string, pull out each child
-- record's body with a regex, split that body into "key":value pairs,
-- and sum the values whose key matches `field_name`.
CREATE TEMP FUNCTION Nested_SUM(entries ANY TYPE, field_name STRING) AS ((
-- kv is one "key":value pair; OFFSET(1) is the value part after the colon
SELECT SUM(CAST(SPLIT(kv, ':')[OFFSET(1)] AS INT64))
-- r'":{(.*?)}' captures each nested record's body, e.g. "first":1,"second":2,"third":3
FROM UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(entries), r'":{(.*?)}')) entry,
-- SPLIT's default delimiter is a comma: one element per "key":value pair
UNNEST(SPLIT(entry)) kv
-- OFFSET(0) is the quoted key; TRIM strips the surrounding double quotes
WHERE TRIM(SPLIT(kv, ':')[OFFSET(0)], '"') = field_name
));
SELECT id,
Nested_SUM(results, 'first') AS first_sum,
Nested_SUM(results, 'second') AS second_sum,
Nested_SUM(results, 'third') AS third_sum,
-- 'forth' exists in no child record, so this intentionally yields NULL
Nested_SUM(results, 'forth') AS forth_sum
FROM `project.dataset.table`
if to apply to sample data from your question as in below example
#standardSQL
-- Nested_SUM(entries, field_name): serializes `entries` to JSON, extracts
-- every child record's body, and sums the values stored under `field_name`
-- -- no child keys (A, B, C, ...) are referenced anywhere.
CREATE TEMP FUNCTION Nested_SUM(entries ANY TYPE, field_name STRING) AS ((
  SELECT SUM(CAST(SPLIT(pair, ':')[OFFSET(1)] AS INT64))
  FROM UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(entries), r'":{(.*?)}')) rec,
       UNNEST(SPLIT(rec)) pair
  WHERE TRIM(SPLIT(pair, ':')[OFFSET(0)], '"') = field_name
));
-- Sample data mirroring the question.
WITH `project.dataset.table` AS (
  SELECT 1 AS id, STRUCT(
    STRUCT(1 AS first, 2 AS second, 3 AS third) AS A,
    STRUCT(4 AS first, 5 AS second, 6 AS third) AS B,
    STRUCT(7 AS first, 8 AS second, 9 AS third) AS C,
    STRUCT(1 AS first, 2 AS second, 3 AS third) AS D
  ) AS results
)
SELECT
  id,
  Nested_SUM(results, 'first')  AS first_sum,
  Nested_SUM(results, 'second') AS second_sum,
  Nested_SUM(results, 'third')  AS third_sum,
  Nested_SUM(results, 'forth')  AS forth_sum  -- no such field: returns NULL
FROM `project.dataset.table`
output is
Row id first_sum second_sum third_sum forth_sum
1 1 13 17 21 null

I adapted Mikhail's answer in order to support grouping on the values of the lowest-level fields:
#standardSQL
-- Nested_AGGREGATE: for one lowest-level field, returns an ARRAY of
-- STRUCT<value STRING, count INT64> -- one element per distinct value of
-- `field_name` across all child records of `entries`, with its frequency.
-- Uses the same TO_JSON_STRING / regex technique as Nested_SUM, so the
-- child keys (A, B, C, ...) never need to be named.
CREATE TEMP FUNCTION Nested_AGGREGATE(entries ANY TYPE, field_name STRING) AS ((
SELECT ARRAY(
SELECT AS STRUCT TRIM(SPLIT(kv, ':')[OFFSET(1)], '"') AS value, COUNT(SPLIT(kv, ':')[OFFSET(1)]) AS count
FROM UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(entries), r'":{(.*?)}')) entry,
UNNEST(SPLIT(entry)) kv
WHERE TRIM(SPLIT(kv, ':')[OFFSET(0)], '"') = field_name
GROUP BY TRIM(SPLIT(kv, ':')[OFFSET(1)], '"')
)
));
-- Fix: the original SELECT list ended with a dangling comma after
-- third_agg, which is a syntax error in BigQuery Standard SQL.
SELECT id,
Nested_AGGREGATE(results, 'first') AS first_agg,
Nested_AGGREGATE(results, 'second') AS second_agg,
Nested_AGGREGATE(results, 'third') AS third_agg
FROM `project.dataset.table`
Output for WITH `project.dataset.table` AS (SELECT 1 AS id, STRUCT( STRUCT(1 AS first, 2 AS second, 3 AS third) AS A, STRUCT(4 AS first, 5 AS second, 6 AS third) AS B, STRUCT(7 AS first, 8 AS second, 9 AS third) AS C, STRUCT(1 AS first, 2 AS second, 3 AS third) AS D) AS results ):
Row id first_agg.value first_agg.count second_agg.value second_agg.count third_agg.value third_agg.count
1 1 1 2 2 2 3 2
4 1 5 1 6 1
7 1 8 1 9 1

Related

BigQuery UDF define constant dictionary and match for a given function argument

What is the best way to define a map: Map(1 -> "One", 2 -> "Two") and define a function which will match to above function? I am thinking of defining a dictionary via Javascript and matching in the function body. An example would be great. Thanks
Below example for BigQuery Standard SQL
#standardSQL
-- MAP(expr, map, default): returns the `result` paired with `expr` in the
-- `map` array, or `default` when no entry matches. The NULL seed row fixes
-- the column names (search, result) for the UNION, which lets callers pass
-- plain unnamed tuples; the seed row itself can never match expr.
CREATE TEMP FUNCTION MAP(expr ANY TYPE, map ANY TYPE, `default` ANY TYPE ) AS (
  IFNULL((
    SELECT result
    FROM (SELECT NULL AS search, NULL AS result UNION ALL SELECT * FROM UNNEST(map))
    WHERE search = expr), `default`)
);
WITH `project.dataset.table` AS (
  SELECT 1 id, 4 location_id UNION ALL
  SELECT 2, 2 UNION ALL
  SELECT 3, 5
)
SELECT
  id,
  location_id,
  MAP(
    location_id,
    [ (1, 'Los Angeles'),
      (2, 'San Francisco'),
      (3, 'New York'),
      (4, 'Seattle')
    ],
    'Non US') AS `Location`
FROM `project.dataset.table`
with result
Row id location_id Location
1 1 4 Seattle
2 2 2 San Francisco
3 3 5 Non US

How to implement generic Oracle DECODE function in BigQuery?

I'm looking into implementing the Oracle DECODE function as a UDF.
The Oracle documentation describes the outward functionality:
https://docs.oracle.com/cd/B19306_01/server.102/b14200/functions040.htm
Below is the outward functionality and syntax of a decode in Oracle:
Oracle:
DECODE( <expr> , <search1> , <result1> [ , <search2> , <result2> ... ] [ , <default> ] )
SELECT product_id,
DECODE (warehouse_id, 1, 'Southlake',
2, 'San Francisco',
3, 'New Jersey',
4, 'Seattle',
'Non domestic')
"Location of inventory" FROM inventories;
With BigQuery UDFs (SQL or JavaScript), when you define the function you need to know the number of parameters you are accepting and their types. A SQL UDF can also accept an array of any type, but I am not sure whether an array-based SQL UDF can accomplish what we want here. Based on the JavaScript UDF documentation, all parameters are named and typed and must be known up front.
Is there a way to accomplish this using a BigQuery UDF, it has to be dynamic like Oracle decode and fit any scenario you put in front of it not static knowing what you are decoding
Below is for BigQuery Standard SQL
-- DECODE(expr, map, default): SQL UDF mimicking Oracle's DECODE.
-- `map` is an ARRAY of STRUCT<search, result>; returns the `result` whose
-- `search` equals `expr`. When nothing matches, the scalar subquery yields
-- NULL and IFNULL substitutes `default`.
CREATE TEMP FUNCTION DECODE(expr ANY TYPE, map ANY TYPE, `default` ANY TYPE ) AS ((
IFNULL((SELECT result FROM UNNEST(map) WHERE search = expr), `default`)
));
You can see how it works using below example
#standardSQL
-- Oracle-style DECODE as a SQL UDF: look `expr` up in `map`
-- (ARRAY<STRUCT<search, result>>), falling back to `default`.
CREATE TEMP FUNCTION DECODE(expr ANY TYPE, map ANY TYPE, `default` ANY TYPE ) AS ((
  IFNULL((SELECT result FROM UNNEST(map) WHERE search = expr), `default`)
));
WITH `project.dataset.inventories` AS (
  SELECT 1 product_id, 4 warehouse_id UNION ALL
  SELECT 2, 2 UNION ALL
  SELECT 3, 5
)
SELECT
  product_id,
  warehouse_id,
  DECODE(
    warehouse_id,
    [STRUCT<search INT64, result STRING>
      (1,'Southlake'),
      (2,'San Francisco'),
      (3,'New Jersey'),
      (4,'Seattle')
    ],
    'Non domestic') AS `Location_of_inventory`
FROM `project.dataset.inventories`
with result
Row product_id warehouse_id Location_of_inventory
1 1 4 Seattle
2 2 2 San Francisco
3 3 5 Non domestic
Another example of use is:
#standardSQL
-- Same DECODE, but the mapping lives in its own CTE and is packed into a
-- single array once (ARRAY_AGG) before being handed to the function.
CREATE TEMP FUNCTION DECODE(expr ANY TYPE, map ANY TYPE, `default` ANY TYPE ) AS ((
  IFNULL((SELECT result FROM UNNEST(map) WHERE search = expr), `default`)
));
WITH `project.dataset.inventories` AS (
  SELECT 1 product_id, 4 warehouse_id UNION ALL
  SELECT 2, 2 UNION ALL
  SELECT 3, 5
), map AS (
  SELECT 1 search, 'Southlake' result UNION ALL
  SELECT 2, 'San Francisco' UNION ALL
  SELECT 3, 'New Jersey' UNION ALL
  SELECT 4, 'Seattle'
)
SELECT
  product_id,
  warehouse_id,
  DECODE(warehouse_id, kv, 'Non domestic') AS `Location_of_inventory`
FROM `project.dataset.inventories`,
     (SELECT ARRAY_AGG(STRUCT(search, result)) AS kv FROM map) arr
with the same output
Update to address - "for a reusable UDF, not having to name the fields makes it closer to Oracle's implementation."
-- DECODE, revised so callers need not name the struct fields: the seed row
-- (NULL AS search, NULL AS result) fixes the column names for the whole
-- UNION, letting `map` be an array of plain unnamed tuples such as
-- [(1,'Southlake'), ...]. The seed row can never match, because
-- NULL = expr is never TRUE.
CREATE TEMP FUNCTION DECODE(expr ANY TYPE, map ANY TYPE, `default` ANY TYPE ) AS (
IFNULL((
SELECT result FROM (
SELECT NULL AS search, NULL AS result UNION ALL SELECT * FROM UNNEST(map)
)
WHERE search = expr
), `default`)
);
So now - previous examples can be used w/o explicit naming fields as in example below
#standardSQL
-- DECODE with implicit field naming: the NULL seed row supplies the
-- (search, result) column names, so the map below is a bare tuple array.
CREATE TEMP FUNCTION DECODE(expr ANY TYPE, map ANY TYPE, `default` ANY TYPE ) AS (
  IFNULL((
    SELECT result FROM (
      SELECT NULL AS search, NULL AS result UNION ALL SELECT * FROM UNNEST(map)
    )
    WHERE search = expr
  ), `default`)
);
WITH `project.dataset.inventories` AS (
  SELECT 1 product_id, 4 warehouse_id UNION ALL
  SELECT 2, 2 UNION ALL
  SELECT 3, 5
)
SELECT
  product_id,
  warehouse_id,
  DECODE(
    warehouse_id,
    [ (1,'Southlake'),
      (2,'San Francisco'),
      (3,'New Jersey'),
      (4,'Seattle')
    ],
    'Non domestic') AS `Location_of_inventory`
FROM `project.dataset.inventories`
still with same output as before
```
-- Plain CASE expression equivalent of the DECODE call above.
SELECT
    product_id,
    CASE warehouse_id
        WHEN 1 THEN 'Southlake'
        WHEN 2 THEN 'San Francisco'
        WHEN 3 THEN 'New Jersey'
        WHEN 4 THEN 'Seattle'
        ELSE 'Non domestic'
    END AS "Location of inventory" FROM inventories;
```

Bigquery union two arrays of different struct

I have two tables
v1
ARRAY<STRUCT<a int64>>
and
v2
ARRAY<STRUCT<a int64, b int64>>
I want to write query which unions both tables using union all and for v1 rows put nulls in place of b field. Any help is appreciated :)
I'm using standard SQL.
Below is for BigQuery Standard SQL
#standardSQL
-- Align v1's one-field structs with v2's two-field structs by adding a
-- NULL `b` to each element, then UNION ALL the two tables.
SELECT
  ARRAY(SELECT AS STRUCT item.a, NULL AS b FROM UNNEST(arr1) item) arr
FROM `project.dataset.v1`
UNION ALL
SELECT
  arr2 AS arr
FROM `project.dataset.v2`
you can test, play with above using dummy data as below
#standardSQL
-- Dummy data: v1 holds STRUCT<a>, v2 holds STRUCT<a, b>.
WITH `project.dataset.v1` AS (
  SELECT [STRUCT<a INT64>(1),STRUCT(2),STRUCT(3)] arr1
), `project.dataset.v2` AS (
  SELECT [STRUCT<a INT64, b INT64>(100, 1),STRUCT(100, 2),STRUCT(100, 3)] arr2
)
-- Pad v1's structs with NULL b so both sides of the union share one schema.
SELECT
  ARRAY(SELECT AS STRUCT item.a, NULL AS b FROM UNNEST(arr1) item) arr
FROM `project.dataset.v1`
UNION ALL
SELECT
  arr2 AS arr
FROM `project.dataset.v2`
with result as
Row arr.a arr.b
1 1 null
2 null
3 null
2 100 1
100 2
100 3

Idiomatic equivalent to map structure

My analytics involves the need to aggregate rows and to store the number of different values occurrences of a field someField in all the rows.
Sample data structure
[someField, someKey]
I'm trying to GROUP BY someKey and then be able to know for each of the results how many time there was each someField values
Example:
[someField: a, someKey: 1],
[someField: a, someKey: 1],
[someField: b, someKey: 1],
[someField: c, someKey: 2],
[someField: d, someKey: 2]
What I would like to achieve:
[someKey: 1, fields: {a: 2, b: 1}],
[someKey: 2, fields: {c: 1, d: 1}],
Does it work for you?
-- Count occurrences of each someField value per someKey, then pack the
-- (someField, freq) pairs into one array per key.
-- Fix/idiom: positional GROUP BY references (GROUP BY 1, 2 / GROUP BY 1)
-- replaced with explicit column names; explicit AS for aliases.
WITH data AS (
  SELECT 'a' AS someField, 1 AS someKey UNION ALL
  SELECT 'a', 1 UNION ALL
  SELECT 'b', 1 UNION ALL
  SELECT 'c', 2 UNION ALL
  SELECT 'd', 2
)
SELECT
  someKey,
  ARRAY_AGG(STRUCT(someField, freq)) AS fields
FROM (
  -- one row per (someKey, someField) pair with its frequency
  SELECT
    someField,
    someKey,
    COUNT(someField) AS freq
  FROM data
  GROUP BY someField, someKey
)
GROUP BY someKey
Results:
It won't give exactly the results you are looking for, but it should serve the same queries your desired result would. As you said, for each key you can retrieve how many times (column freq) each someField value occurred.
I've been looking for a way on how to aggregate structs and couldn't find one. But retrieving the results as an ARRAY of STRUCTS turned out to be quite straightforward.
There's probably a smarter way to do this (and get it in the format you want e.g. using an Array for the 2nd column), but this might be enough for you:
-- Manual pivot: one output column per known someField value; each cell is
-- the count of that value within the someKey group.
WITH sample AS (
  SELECT 'a' AS someField, 1 AS someKey UNION ALL
  SELECT 'a' AS someField, 1 AS someKey UNION ALL
  SELECT 'b' AS someField, 1 AS someKey UNION ALL
  SELECT 'c' AS someField, 2 AS someKey UNION ALL
  SELECT 'd' AS someField, 2 AS someKey
)
SELECT
  someKey,
  SUM(IF(someField = 'a', 1, 0)) AS a,
  SUM(IF(someField = 'b', 1, 0)) AS b,
  SUM(IF(someField = 'c', 1, 0)) AS c,
  SUM(IF(someField = 'd', 1, 0)) AS d
FROM sample
GROUP BY someKey
ORDER BY someKey ASC
Results:
someKey a b c d
---------------------
1 2 1 0 0
2 0 0 1 1
This is well used technique in BigQuery (see here).
I'm trying to GROUP BY someKey and then be able to know for each of the results how many time there was each someField values
#standardSQL
-- Frequency of each someField value within each someKey.
SELECT
  someKey,
  someField,
  COUNT(someField) AS freq
FROM yourTable
GROUP BY someKey, someField
-- ORDER BY someKey, someField
What I would like to achieve:
[someKey: 1, fields: {a: 2, b: 1}],
[someKey: 2, fields: {c: 1, d: 1}],
This is different from what you expressed in words - it is called pivoting and based on your comment - The a, b, c, and d keys are potentially infinite - most likely is not what you need. At the same time - pivoting is easily doable too (if you have some finite number of field values) and you can find plenty of related posts

SQL Server stored procedure: iterate through table, compare values, insert to different table on value change

I can't tell whether my particular situation has already been covered given the titles of other questions, so apologies if an answer already exists.
I have a database that records values as strings, and another table that records runs of particular types within those values.
I require a stored procedure that iterates through the values (I understand this is related to the concept of Cursors), recording each value to temporary tables to control the count for a particular run type (odd/even numbers, for example, or vowels/consonants). When a given value indicates that a particular type of run has stopped (i.e. an odd number has stopped a run of even numbers and vice versa), the run is counted, the count is inserted into a runs table with a related type value (0 = odd/even, 1 = vowel/consonant etc.), the temporary table contents are deleted, and the value that caused the table count/clear is inserted to the temp table.
As I am completely new to stored procedures, I don't know exactly how to structure this kind of procedure, and the examples I've found don't:
Describe how to implement Cursors in a straightforward, understandable manner
Provide insights into comparisons between a given value and a stored comparison value
Allow for recognition of changes to an established pattern to initiate a section of a procedure
Let me know if any of this needs clarifying.
EDIT:
Version in use: MS SQL Server 2012
table structure for the raw values:
ID: Int PK AI
DateTimeStamp: Datetime
SelectedValue: Char(2)
UserId: Int
table structure for value runs:
ID: Int PK AI
DateTimeStamp: Datetime
Type: Int
Run: Int
Sample data: [following presented as comma-delimited string for brevity, input by one user]
e, 00, 1, t, r, 2, 4, 3, 5, 7, a, i, w, q, u, o, 23, 25, 24, 36, 12, e ...
groups would be:
vowels/consonants
even numbers/odd numbers
00
numbers under/over 20
numbers/letters
From the above, the runs are:
e (vowels/consonants: vowels)
e (numbers/letters: letters)
00 (00)
1 (odd/even: odd)
1 (numbers/letters: numbers)
t, r (vowels/consonants: consonants)
t, r (numbers/letters: letters)
2, 4 (odd/even: even)
3, 5, 7 (odd/even: odd)
2, 4, 3, 5, 7 (numbers/letters: numbers)
a, i (vowels/consonants: vowels)
w, q (vowels/consonants: consonants)
a, i, w, q, u, o (numbers/letters: letters)
1, 2, 4, 3, 5, 7 (under/over 20: under 20)
23, 25 (odd/even: odd)
23, 25, 24, 36 (under/over 20: over 20)
24, 36, 12 (odd/even: even)
u, o, e (vowels/consonants: vowels)
Which would make entries to the runs table as
Type: vowels/consonants, run: 1
Type: numbers/letters, run: 1
Type: 00, run: 1
Type: odd/even, run: 1
Type: numbers/letters, run: 1
Type: odd/even, run: 2
Type: odd/even, run: 3
Type: numbers/letters, run: 5
Type: vowels/consonants, run: 2
Type: vowels/consonants, run: 2
Type: numbers/letters, run: 6
Type: under/over 20, run: 6
Type: odd/even, run: 2
Type: under/over 20, run: 4
Type: odd/even, run: 3
Type: vowels/consonants, run: 3
EDIT Updated based on clarification of the original question.
This might not be the cleanest solution, but it should get you started:
-- Classify every RawData value into (GroupNo, Type) pairs -- one row per
-- classification a value belongs to -- then detect "runs" (consecutive
-- values of the same Type within a group) and report each run's length.
-- Requires SQL Server 2012+ (TRY_PARSE, LAG).
WITH cteClassifications (ID, GroupNo, Type, Description) As
(
-- Vowels:
SELECT
ID,
1,
1,
'Vowels'
FROM
RawData
WHERE
SelectedValue In ('a', 'e', 'i', 'o', 'u')
UNION ALL
-- Consonants:
SELECT
ID,
1,
2,
'Consonants'
FROM
RawData
WHERE
SelectedValue Between 'a' And 'z'
And
SelectedValue Not In ('a', 'e', 'i', 'o', 'u')
UNION ALL
-- Even numbers:
SELECT
ID,
2,
1,
'Even numbers'
FROM
RawData
WHERE
SelectedValue != '00'
And
SelectedValue Not Between 'a' And 'z'
And
-- bitwise AND with 1 tests the low bit: 0 means even
(TRY_PARSE(SelectedValue As tinyint) & 1) = 0
UNION ALL
-- Odd numbers:
SELECT
ID,
2,
2,
'Odd numbers'
FROM
RawData
WHERE
SelectedValue != '00'
And
SelectedValue Not Between 'a' And 'z'
And
(TRY_PARSE(SelectedValue As tinyint) & 1) = 1
UNION ALL
-- "00":
SELECT
ID,
3,
1,
'"00"'
FROM
RawData
WHERE
SelectedValue = '00'
UNION ALL
-- Numbers under 20:
-- NOTE(review): a value of exactly 20 matches neither '< 20' nor '> 20'
-- below -- confirm whether 20 should belong to one of these groups.
SELECT
ID,
4,
1,
'Numbers under 20'
FROM
RawData
WHERE
SelectedValue != '00'
And
SelectedValue Not Between 'a' And 'z'
And
TRY_PARSE(SelectedValue As tinyint) < 20
UNION ALL
-- Numbers over 20:
SELECT
ID,
4,
2,
'Numbers over 20'
FROM
RawData
WHERE
SelectedValue != '00'
And
SelectedValue Not Between 'a' And 'z'
And
TRY_PARSE(SelectedValue As tinyint) > 20
UNION ALL
-- Numbers:
SELECT
ID,
5,
1,
'Numbers'
FROM
RawData
WHERE
SelectedValue != '00'
And
SelectedValue Not Between 'a' And 'z'
And
-- TRY_PARSE returns NULL for non-numeric text, so non-NULL means numeric
TRY_PARSE(SelectedValue As tinyint) Is Not Null
UNION ALL
-- Letters:
SELECT
ID,
5,
2,
'Letters'
FROM
RawData
WHERE
SelectedValue Between 'a' And 'z'
),
-- Per group, attach each row's previous Type (LAG; default 0 marks the
-- first row) and its ordinal (ROW_NUMBER) so run boundaries can be found.
cteOrderedClassifications (ID, GroupNo, Type, Description, PrevType, RN) As
(
SELECT
ID,
GroupNo,
Type,
Description,
LAG(Type, 1, 0) OVER (PARTITION BY GroupNo ORDER BY ID),
ROW_NUMBER() OVER (PARTITION BY GroupNo ORDER BY ID)
FROM
cteClassifications
),
-- Recursive CTE: the anchor selects run starts (Type changed from the
-- previous row); the recursive part extends each run forward one ordinal
-- at a time, carrying the run-start RN so rows of a run share it.
cteGroupedClassifications (ID, GroupNo, Type, Description, RN, ORN) As
(
SELECT
ID,
GroupNo,
Type,
Description,
RN,
RN
FROM
cteOrderedClassifications As C
WHERE
Type != PrevType
UNION ALL
SELECT
C.ID,
G.GroupNo,
G.Type,
G.Description,
G.RN,
C.RN
FROM
cteGroupedClassifications As G
INNER JOIN cteOrderedClassifications As C
ON C.GroupNo = G.GroupNo
And C.Type = G.Type
And C.RN = G.ORN + 1
),
-- Collapse each run (GroupNo, Type, run-start RN) into one row;
-- Count(1) is the run length.
cteRuns (ID, GroupNo, Type, Description, RN, Run) As
(
SELECT
Min(ID),
GroupNo,
Type,
MAX(Description),
RN,
Count(1)
FROM
cteGroupedClassifications
GROUP BY
GroupNo,
Type,
RN
)
-- Renumber the runs in order of first appearance and emit them.
SELECT
ROW_NUMBER() OVER (ORDER BY ID) As ID,
GroupNo,
Type,
Description,
Run
FROM
cteRuns
ORDER BY
ID
;
Once you're happy that the query is working, you can replace the final SELECT with an INSERT INTO Runs (ID, Type, Run) SELECT ID, Type, Run FROM cteRuns to populate the table in a single pass.
SQL Fiddle example