How to avoid alias shadowing in nested fields with non-unique names? - google-bigquery

Given the following table:
I'd like to rename fred to freddy.
For this, I've written the following code:
WITH foo AS ( -- one sample row: corge plus a STRUCT column named bar
SELECT
1 corge,
STRUCT(
[STRUCT(
2 AS bar, -- element field that shares its name with the outer STRUCT column bar
3 AS fred)
] AS qux, -- qux is an ARRAY of STRUCT<bar, fred>
4 AS plugh
) bar
)
SELECT
corge as corge,
(SELECT AS STRUCT ARRAY(
SELECT AS STRUCT
bar.qux.bar as bar, -- per the error quoted below, bar resolves here to an INT64 value, not the outer STRUCT
bar.qux.fred as freddy
FROM
foo.bar.qux) -- the array is used directly as the FROM source of the subquery
as qux)
as bar,
plugh as plugh -- NOTE(review): plugh is defined above as a field of bar, not as a column of foo
FROM
foo
But it results in the following error:
Cannot access field qux on a value with type INT64 at [17:17]
It seems like the inner bar is shadowing the outer bar. How can I avoid this and make it work?

How about avoiding all of those UNNESTs and array rebuilds, and instead simply forcing the new names, as in the example below:
-- Rename a nested field without unnesting: a zero-row branch whose STRUCT
-- carries the desired field names is placed first in the UNION ALL, and all
-- actual rows come from foo. The answer relies on the result taking its
-- names from that first branch.
WITH foo AS (
    SELECT
        1 AS corge,
        STRUCT(
            [STRUCT(2 AS bar, 3 AS fred)] AS qux,
            4 AS plugh
        ) AS bar
),

-- Same shape as foo, but the inner field is called freddy instead of fred.
foo_with_new_names AS (
    SELECT
        -1 AS corge,
        STRUCT(
            [STRUCT(2 AS bar, 3 AS freddy)] AS qux,
            4 AS plugh
        ) AS bar
)

SELECT *
FROM foo_with_new_names
WHERE FALSE  -- this branch contributes no rows
UNION ALL
SELECT *
FROM foo
with output

Try this one:
-- Rebuild the bar STRUCT, renaming fred to freddy in every element of qux.
WITH foo AS (
    SELECT
        1 AS corge,
        STRUCT(
            [
                STRUCT(2 AS bar, 3 AS fred),
                (22, 32)
            ] AS qux,
            4 AS plugh
        ) AS bar
)

SELECT
    corge AS corge,
    (
        SELECT AS STRUCT
            -- The unaliased UNNEST lets the subquery read the element
            -- fields bar and fred by their bare names.
            ARRAY(
                SELECT STRUCT(bar, fred AS freddy)
                FROM UNNEST(bar.qux)
            ) AS qux,
            bar.plugh
    ) AS bar
FROM foo

Based on the very good answer given by Sergey Geron, here is a version that additionally preserves the order of the elements in the qux array:
-- Rename fred to freddy inside bar.qux while keeping the elements of qux
-- in their original array order.
WITH foo AS (
    SELECT
        1 AS corge,
        STRUCT(
            [STRUCT(2 AS bar, 3 AS fred)] AS qux,
            4 AS plugh
        ) AS bar
)

SELECT
    corge AS corge,
    (
        SELECT AS STRUCT
            ARRAY(
                SELECT
                    STRUCT(
                        bar AS bar,
                        fred AS freddy
                    )
                FROM UNNEST(bar.qux) WITH OFFSET AS qux_position
                -- The offset is each element's index in qux; ordering by it
                -- rebuilds the array in the same sequence.
                ORDER BY qux_position
            ) AS qux,
            bar.plugh
    ) AS bar
FROM foo

Related

How to query on fields from nested records without referring to the parent records in BigQuery?

I have data structured as follows:
{
"results": {
"A": {"first": 1, "second": 2, "third": 3},
"B": {"first": 4, "second": 5, "third": 6},
"C": {"first": 7, "second": 8, "third": 9},
"D": {"first": 1, "second": 2, "third": 3},
... },
...
}
i.e. nested records, where the lowest level has the same schema for all records in the level above. The schema would be similar to this:
results RECORD NULLABLE
results.A RECORD NULLABLE
results.A.first INTEGER NULLABLE
results.A.second INTEGER NULLABLE
results.A.third INTEGER NULLABLE
results.B RECORD NULLABLE
results.B.first INTEGER NULLABLE
...
Is there a way to do (e.g. aggregate) queries in BigQuery on fields from the lowest level, without knowledge of the keys on the (direct) parent level? Put differently, can I do a query on first for all records in results without having to specify A, B, ... in my query?
I would for example want to achieve something like
SELECT SUM(results.*.first) FROM table
in order to get 1+4+7+1 = 13,
but SELECT results.*.first isn't supported.
(I've tried playing around with STRUCTs, but haven't gotten far.)
Below trick is for BigQuery Standard SQL
#standardSQL
-- Per row, sum first/second/third across the sub-records a, b, c and d of results.
SELECT
    id,
    (
        SELECT AS STRUCT
            SUM(first) AS sum_first,
            SUM(second) AS sum_second,
            SUM(third) AS sum_third
        -- The four sub-records are concatenated into one array so they can
        -- be aggregated like rows.
        FROM UNNEST([a] || [b] || [c] || [d])
    ).*
FROM `project.dataset.table`
-- Unnesting the one-element array [results] makes its fields (a, b, c, d)
-- addressable by bare name in the subquery above.
CROSS JOIN UNNEST([results])
You can test, play with above using dummy/sample data from your question as in below example
#standardSQL
-- Self-contained variant: the CTE stands in for the real table using the
-- sample data from the question.
WITH `project.dataset.table` AS (
    SELECT
        1 AS id,
        STRUCT(
            STRUCT(1 AS first, 2 AS second, 3 AS third) AS A,
            STRUCT(4 AS first, 5 AS second, 6 AS third) AS B,
            STRUCT(7 AS first, 8 AS second, 9 AS third) AS C,
            STRUCT(1 AS first, 2 AS second, 3 AS third) AS D
        ) AS results
)

SELECT
    id,
    (
        SELECT AS STRUCT
            SUM(first) AS sum_first,
            SUM(second) AS sum_second,
            SUM(third) AS sum_third
        -- The four sub-records are concatenated into one array so they can
        -- be aggregated like rows.
        FROM UNNEST([a] || [b] || [c] || [d])
    ).*
FROM `project.dataset.table`
-- Unnesting the one-element array [results] makes its fields (a, b, c, d)
-- addressable by bare name in the subquery above.
CROSS JOIN UNNEST([results])
with output
Row id sum_first sum_second sum_third
1 1 13 17 21
Is there a way to do (e.g. aggregate) queries in BigQuery on fields from the lowest level, without knowledge of the keys on the (direct) parent level?
Below is for BigQuery Standard SQL and totally avoids referencing parent records (A, B, C, D, etc.)
#standardSQL
-- Nested_SUM(entries, field_name): sums the values stored under field_name in
-- every sub-record of entries, without naming the sub-records themselves.
-- It works on the JSON text of entries: the regex captures the body of each
-- inner {...} object, which is split into key:value pairs and filtered by key.
-- Values are cast to INT64.
CREATE TEMP FUNCTION Nested_SUM(entries ANY TYPE, field_name STRING) AS ((
    SELECT
        SUM(CAST(SPLIT(pair, ':')[OFFSET(1)] AS INT64))
    FROM
        UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(entries), r'":{(.*?)}')) AS record_body
    CROSS JOIN
        UNNEST(SPLIT(record_body)) AS pair
    WHERE
        -- The key still carries its JSON double quotes; strip them before comparing.
        TRIM(SPLIT(pair, ':')[OFFSET(0)], '"') = field_name
));

SELECT
    id,
    Nested_SUM(results, 'first') AS first_sum,
    Nested_SUM(results, 'second') AS second_sum,
    Nested_SUM(results, 'third') AS third_sum,
    Nested_SUM(results, 'forth') AS forth_sum
FROM `project.dataset.table`
Applied to the sample data from your question, as in the example below:
#standardSQL
-- Nested_SUM(entries, field_name): sums the values stored under field_name in
-- every sub-record of entries, without naming the sub-records themselves.
-- It works on the JSON text of entries: the regex captures the body of each
-- inner {...} object, which is split into key:value pairs and filtered by key.
-- Values are cast to INT64.
CREATE TEMP FUNCTION Nested_SUM(entries ANY TYPE, field_name STRING) AS ((
    SELECT
        SUM(CAST(SPLIT(pair, ':')[OFFSET(1)] AS INT64))
    FROM
        UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(entries), r'":{(.*?)}')) AS record_body
    CROSS JOIN
        UNNEST(SPLIT(record_body)) AS pair
    WHERE
        -- The key still carries its JSON double quotes; strip them before comparing.
        TRIM(SPLIT(pair, ':')[OFFSET(0)], '"') = field_name
));

-- The CTE stands in for the real table using the sample data from the question.
WITH `project.dataset.table` AS (
    SELECT
        1 AS id,
        STRUCT(
            STRUCT(1 AS first, 2 AS second, 3 AS third) AS A,
            STRUCT(4 AS first, 5 AS second, 6 AS third) AS B,
            STRUCT(7 AS first, 8 AS second, 9 AS third) AS C,
            STRUCT(1 AS first, 2 AS second, 3 AS third) AS D
        ) AS results
)

SELECT
    id,
    Nested_SUM(results, 'first') AS first_sum,
    Nested_SUM(results, 'second') AS second_sum,
    Nested_SUM(results, 'third') AS third_sum,
    Nested_SUM(results, 'forth') AS forth_sum
FROM `project.dataset.table`
output is
Row id first_sum second_sum third_sum forth_sum
1 1 13 17 21 null
I adapted Mikhail's answer in order to support grouping on the values of the lowest-level fields:
#standardSQL
-- Nested_AGGREGATE(entries, field_name): returns an array of (value, count)
-- structs, one per distinct value found under field_name across the
-- sub-records of entries. Like the summing variant, it parses the JSON text
-- of entries instead of naming the sub-records.
CREATE TEMP FUNCTION Nested_AGGREGATE(entries ANY TYPE, field_name STRING) AS ((
    SELECT ARRAY(
        SELECT AS STRUCT
            TRIM(SPLIT(pair, ':')[OFFSET(1)], '"') AS value,
            COUNT(SPLIT(pair, ':')[OFFSET(1)]) AS count
        FROM
            UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(entries), r'":{(.*?)}')) AS record_body
        CROSS JOIN
            UNNEST(SPLIT(record_body)) AS pair
        WHERE
            -- The key still carries its JSON double quotes; strip them before comparing.
            TRIM(SPLIT(pair, ':')[OFFSET(0)], '"') = field_name
        GROUP BY
            TRIM(SPLIT(pair, ':')[OFFSET(1)], '"')
    )
));

SELECT
    id,
    Nested_AGGREGATE(results, 'first') AS first_agg,
    Nested_AGGREGATE(results, 'second') AS second_agg,
    Nested_AGGREGATE(results, 'third') AS third_agg
FROM `project.dataset.table`
Output for WITH `project.dataset.table` AS (SELECT 1 AS id, STRUCT( STRUCT(1 AS first, 2 AS second, 3 AS third) AS A, STRUCT(4 AS first, 5 AS second, 6 AS third) AS B, STRUCT(7 AS first, 8 AS second, 9 AS third) AS C, STRUCT(1 AS first, 2 AS second, 3 AS third) AS D) AS results ):
Row id first_agg.value first_agg.count second_agg.value second_agg.count third_agg.value third_agg.count
1 1 1 2 2 2 3 2
4 1 5 1 6 1
7 1 8 1 9 1

get a distinct count and repeating per row in BigQuery SQL

I haven't been able to find an answer to this, but I think it should be possible to do this with only one query in BigQuery - I'm looking for something close to this, even with approximate results.
let's say i have a table that looks like this, with one column called myValue
====
myValue
====
foo |
bar |
bar |
baz |
i would like to be able to have a single query that results in a column next to it that has the distinct counts of myValue, on every row.
=======|=====================|
myValue|myNewCounts |
=======|=====================|
foo |[foo:1, bar:2, baz:1]
bar |[foo:1, bar:2, baz:1]
bar |[foo:1, bar:2, baz:1]
baz |[foo:1, bar:2, baz:1]
I know that you can use ARRAY_AGG(distinct) to get the distinct values on every row, but I haven't been able to find a way to also get the counts as well on every row, even in an approximate fashion.
It's important that this be done in a single query - I could obviously have a separate query that calculates the distinct counts and then join that back to this table, but im trying to do this in one query.
one would think - that in a columnar database - returning myValue and myNewCounts in one pass should be doable somehow....
Below is for BigQuery Standard SQL
#standardSQL
-- Attach the same "[value:count, ...]" summary string to every row of the table.
SELECT *
FROM `project.dataset.table`
CROSS JOIN (
    -- Collapse the per-value counts into a single bracketed string (one row).
    SELECT '[' || STRING_AGG(value_count, ', ') || ']' AS myNewCounts
    FROM (
        -- One "value:count" string per distinct myValue.
        SELECT FORMAT('%s:%i', myValue, COUNT(1)) AS value_count
        FROM `project.dataset.table`
        GROUP BY myValue
    )
)
Applied to the sample data from your question, as in the example below:
#standardSQL
-- The CTE stands in for the real table using the sample data from the question.
WITH `project.dataset.table` AS (
    SELECT 'foo' AS myValue
    UNION ALL
    SELECT 'bar'
    UNION ALL
    SELECT 'bar'
    UNION ALL
    SELECT 'baz'
)

-- Attach the same "[value:count, ...]" summary string to every row.
SELECT *
FROM `project.dataset.table`
CROSS JOIN (
    -- Collapse the per-value counts into a single bracketed string (one row).
    SELECT '[' || STRING_AGG(value_count, ', ') || ']' AS myNewCounts
    FROM (
        -- One "value:count" string per distinct myValue.
        SELECT FORMAT('%s:%i', myValue, COUNT(1)) AS value_count
        FROM `project.dataset.table`
        GROUP BY myValue
    )
)
result is
Row myValue myNewCounts
1 foo [foo:1, bar:2, baz:1]
2 bar [foo:1, bar:2, baz:1]
3 bar [foo:1, bar:2, baz:1]
4 baz [foo:1, bar:2, baz:1]
In case if myNewCounts is expected to be an array - use below version instead
#standardSQL
-- Same as the string version, but myNewCounts is an ARRAY of "value:count"
-- strings repeated on every row.
SELECT *
FROM `project.dataset.table`
CROSS JOIN (
    SELECT ARRAY_AGG(value_count) AS myNewCounts
    FROM (
        -- One "value:count" string per distinct myValue.
        SELECT FORMAT('%s:%i', myValue, COUNT(1)) AS value_count
        FROM `project.dataset.table`
        GROUP BY myValue
    )
)

How to get count of matches in field of table for list of phrases from another table in bigquery?

Given an arbitrary list of phrases phrase1, phrase2*, ... phraseN (say these are in another table Phrase_Table), how would one get the count of matches for each phrase in a field F in a bigquery table?
Here, "*" means there must be some non-empty/non-blank string after the phrase.
Let's say you have a table with an ID field and two string fields, Field1 and Field2.
Output would look something like
id, CountOfPhrase1InField1, CountOfPhrase2InField1, CountOfPhrase1InField2, CountOfPhrase2InField2
or I guess instead of all of those output fields maybe there's a single json object field
id, [{"fieldName": Field1, "counts": {phrase1: m, phrase2: mm, ...},
{"fieldName": Field2, "counts": {phrase1: m2, phrase2: mm2, ...},...]
Thanks!
Below example is for BigQuery Standard SQL
#standardSQL
-- Sample inputs: the strings to search and the keywords to count in them.
WITH `project.dataset.table` AS (
    SELECT 'foo1 foo foo40' AS str
    UNION ALL
    SELECT 'test1 test test2 test'
),

`project.dataset.keywords` AS (
    SELECT 'foo' AS key
    UNION ALL
    SELECT 'test'
)

-- Every string is paired with every keyword; a match is the keyword
-- immediately followed by one non-whitespace character.
SELECT
    str,
    ARRAY_AGG(
        STRUCT(
            key,
            ARRAY_LENGTH(REGEXP_EXTRACT_ALL(str, CONCAT(key, r'[^\s]'))) AS matches
        )
    ) AS all_matches
FROM `project.dataset.table`
CROSS JOIN `project.dataset.keywords`
GROUP BY str
with result
Row str all_matches.key all_matches.matches
1 foo1 foo foo40 foo 2
test 0
2 test1 test test2 test foo 0
test 2
If you prefer output as json you can add TO_JSON_STRING() as in below example
#standardSQL
-- Sample inputs: the strings to search and the keywords to count in them.
WITH `project.dataset.table` AS (
    SELECT 'foo1 foo foo40' AS str
    UNION ALL
    SELECT 'test1 test test2 test'
),

`project.dataset.keywords` AS (
    SELECT 'foo' AS key
    UNION ALL
    SELECT 'test'
)

-- Same per-keyword match counts as the array version, serialized as a JSON
-- string per input row.
SELECT
    str,
    TO_JSON_STRING(
        ARRAY_AGG(
            STRUCT(
                key,
                ARRAY_LENGTH(REGEXP_EXTRACT_ALL(str, CONCAT(key, r'[^\s]'))) AS matches
            )
        )
    ) AS all_matches
FROM `project.dataset.table`
CROSS JOIN `project.dataset.keywords`
GROUP BY str
with output
Row str all_matches
1 foo1 foo foo40 [{"key":"foo","matches":2},{"key":"test","matches":0}]
2 test1 test test2 test [{"key":"foo","matches":0},{"key":"test","matches":2}]
there are endless ways of presenting outputs like above - hope you will adjust it to whatever exactly you need :o)

How to make an equality test on an array

I am trying to aggregate values on an ID. I return them if they are all the same, but have to create another value 'C' if both are encountered.
-- Sample table: fooid 1 has both 'A' and 'B', fooid 2 has only 'A'.
CREATE TABLE foo (
    fooid int,
    foocomm text
);

INSERT INTO foo (fooid, foocomm)
VALUES
    (1, 'A'),
    (1, 'B'),
    (2, 'A');
-- Question's attempt: map each fooid's set of foocomm values to 'A', 'B', or 'C' (both seen).
-- NOTE(review): fooid is not in the SELECT list although the expected output below shows it.
SELECT
CASE
-- The {'A'} forms below are the array text representation that the answer says needs fixing.
WHEN array_remove(array_agg(foocomm),NULL) = {'A'} THEN 'A'
WHEN array_remove(array_agg(foocomm),NULL) = {'B'} THEN 'B'
WHEN array_remove(array_agg(foocomm),NULL) = {'A','B'} THEN 'C'
END AS BAR
FROM foo
GROUP BY fooid;
It should yield
fooid,foocomm
1, 'C'
2, 'A'
t=# SELECT
    fooid,
    -- Aggregating in foocomm order makes the array comparable with a fixed
    -- literal; the arrays are written as quoted text literals such as '{A,B}'.
    CASE
        WHEN array_remove(array_agg(foocomm ORDER BY foocomm), NULL) = '{A}' THEN 'A'
        WHEN array_remove(array_agg(foocomm ORDER BY foocomm), NULL) = '{B}' THEN 'B'
        WHEN array_remove(array_agg(foocomm ORDER BY foocomm), NULL) = '{A,B}' THEN 'C'
    END AS bar
FROM foo
GROUP BY fooid;
fooid | bar
-------+-----
1 | C
2 | A
(2 rows)
your query works, just fix array text representation:
https://www.postgresql.org/docs/current/static/arrays.html#ARRAYS-INPUT

DB Query. How to capture 'mixed' status on a single row

I have a database query ...
-- One row per (foo, bar) pair with its status; bar1..bar3 stand for the bars of interest.
SELECT
    foo,
    bar,
    status
FROM mytable
WHERE bar IN (bar1, bar2, bar3);
The status is associated with the pair foo-bar. The GUI is going to display one row for every foo, and should display a checked checkbox if, for that foo, the statuses of bar1, bar2 and bar3 are all 1, and an unchecked checkbox if, for that foo, the statuses of bar1, bar2 and bar3 are all zero. If, again for a given foo, different bars have different statuses, I am required to display some other token (a question mark, say).
My knowledge of SQL isn't sufficient for this task. Can this be done in SQL? It's in Oracle, if that makes a difference. I'm thinking I may have to pull it into Perl and check for the condition there, but I'm not happy with that idea.
In T-SQL I'd do this:
CREATE TABLE mytable (
    foo nvarchar(128),
    bar nvarchar(128),
    status int
)
GO

-- With status limited to 0 or 1, MAX + MIN gives 0 when every row is 0,
-- 2 when every row is 1, and 1 when the group is mixed.
SELECT
    foo,
    (MAX(status) + MIN(status)) AS status
FROM mytable
GROUP BY foo
then in the client app the resulting status value will be 0 if all are unchecked, 1 if some checked, and 2 if all checked
With a CTE to supply sample data, different combinations of zeros and ones in the row statuses gives different output values:
-- Sample data: three foos, each with one status row for bar1, bar2 and bar3.
-- UNION ALL keeps every sample row exactly as written; plain UNION would
-- de-duplicate and could silently drop rows, skewing the SUM/COUNT comparison.
WITH tmp_tab AS (
    SELECT 'foo1' AS foo, 'bar1' AS bar, 0 AS status FROM dual
    UNION ALL
    SELECT 'foo1' AS foo, 'bar2' AS bar, 0 AS status FROM dual
    UNION ALL
    SELECT 'foo1' AS foo, 'bar3' AS bar, 0 AS status FROM dual
    UNION ALL
    SELECT 'foo2' AS foo, 'bar1' AS bar, 0 AS status FROM dual
    UNION ALL
    SELECT 'foo2' AS foo, 'bar2' AS bar, 1 AS status FROM dual
    UNION ALL
    SELECT 'foo2' AS foo, 'bar3' AS bar, 0 AS status FROM dual
    UNION ALL
    SELECT 'foo3' AS foo, 'bar1' AS bar, 1 AS status FROM dual
    UNION ALL
    SELECT 'foo3' AS foo, 'bar2' AS bar, 1 AS status FROM dual
    UNION ALL
    SELECT 'foo3' AS foo, 'bar3' AS bar, 1 AS status FROM dual
)

-- Classify each foo: all statuses 0 -> Unchecked, all 1 -> Checked,
-- anything else -> Unknown. Assumes status is only ever 0 or 1.
SELECT
    foo,
    CASE
        WHEN SUM(status) = 0 THEN 'Unchecked'
        -- Every counted row contributed 1, so all of them are set.
        WHEN SUM(status) = COUNT(bar) THEN 'Checked'
        ELSE 'Unknown'
    END AS status
FROM tmp_tab
WHERE bar IN ('bar1', 'bar2', 'bar3')
GROUP BY foo
-- GROUP BY alone does not guarantee row order; sort so the result is deterministic.
ORDER BY foo;
FOO STATUS
---- ---------
foo1 Unchecked
foo2 Unknown
foo3 Checked
Perhaps restructure your query to perform a union instead of the IN.
That way, each unioned select statement can carry an explicit value, unique to that statement, that tells you which value was matched.
If the designer were smart, bar1, bar2, and bar3 should be numeric powers of 2, so one can apply bitwise operators to them - then it would be trivial to know which of the particular bars are set.
I'm assuming bar1/2/3 can only be 0 or 1:
-- First attempt: treats bar1, bar2 and bar3 as 0/1 values that can be added together.
SELECT
    foo,
    bar,
    CASE
        WHEN (bar1 + bar2 + bar3 = 3) THEN 'checked'
        WHEN (bar1 + bar2 + bar3 = 0) THEN 'unchecked'
        ELSE 'something else'
    END
FROM mytable
WHERE bar IN (bar1, bar2, bar3);
Or am I missing something?
Edit: Looks like I originally misunderstood the problem. I think this will work.
-- One row per foo: 'unchecked' when every matching status is 0, 'checked'
-- when every matching status is 1, 'something else' when they are mixed.
-- The sum is compared with the number of status rows in the group rather
-- than a hard-coded 3, so a foo with fewer or more matching bars is still
-- classified correctly. Assumes status can only be 0 or 1.
SELECT
    foo,
    CASE
        WHEN sum_status = 0 THEN 'unchecked'
        WHEN sum_status = status_count THEN 'checked'
        ELSE 'something else'
    END AS status
FROM (
    SELECT
        foo,
        SUM(status) AS sum_status,
        COUNT(status) AS status_count
    FROM mytable
    WHERE bar IN (bar1, bar2, bar3)
    GROUP BY foo
)
Again, this assumes status can only be 0 or 1.