How to convert string representation dictionary into struct in gcp bigquery? - google-bigquery

How can I convert the string below into a STRUCT?
select '{\"ID\": \"A\", \"QualifierID\": \"XYZ\", \"text\": \"Origin\"}';
I want the above string formatted text to be Struct type in GCP Bigquery.

Using a JSON function,
-- Builds a STRUCT by pulling each field out of the JSON-formatted string
-- with JSON_VALUE, one path per struct member.
WITH sample_data AS (
  SELECT '{\"ID\": \"A\", \"QualifierID\": \"XYZ\", \"text\": \"Origin\"}' AS json
)
SELECT
  STRUCT(
    JSON_VALUE(src.json, '$.ID') AS ID,
    JSON_VALUE(src.json, '$.QualifierID') AS QualifierID,
    JSON_VALUE(src.json, '$.text') AS text
  ) AS struct_col
FROM sample_data AS src;
Or with JSON type,
-- Converts a JSON-formatted string into a STRUCT via the native JSON type.
-- The string is parsed once in its own CTE so PARSE_JSON is not repeated
-- for every struct field.
WITH sample_data AS (
  SELECT '{\"ID\": \"A\", \"QualifierID\": \"XYZ\", \"text\": \"Origin\"}' AS json
),
parsed AS (
  SELECT PARSE_JSON(json) AS doc
  FROM sample_data
)
SELECT
  STRUCT(
    -- STRING() converts a JSON string value into a SQL STRING.
    STRING(doc.ID) AS ID,
    STRING(doc.QualifierID) AS QualifierID,
    STRING(doc.text) AS text
  ) AS struct_col
FROM parsed;
you can get following result:

Consider yet another option
-- JavaScript helper: returns the top-level property names of a JSON object string.
CREATE TEMP FUNCTION keys(input STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
return Object.keys(JSON.parse(input));
""";

-- JavaScript helper: returns the top-level property values of a JSON object string.
CREATE TEMP FUNCTION values(input STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
return Object.values(JSON.parse(input));
""";

-- Pair each key with the value at the same array position, then pivot the
-- listed keys into columns. row_id (the serialized source row) keeps the
-- pairs of different rows in separate pivot groups and is dropped at the end.
WITH key_value_rows AS (
  SELECT
    TO_JSON_STRING(t) AS row_id,
    key,
    value
  FROM your_table AS t
  CROSS JOIN UNNEST(keys(json)) AS key WITH OFFSET
  INNER JOIN UNNEST(values(json)) AS value WITH OFFSET
    USING (offset)
)
SELECT * EXCEPT (row_id)
FROM key_value_rows
PIVOT (ANY_VALUE(value) FOR key IN ('ID', 'QualifierID', 'text'))
If applied to the sample data in your question, the output is:

Does something like this work for you?
-- Turns a JSON value into a STRUCT of STRING fields.
-- JSON_VALUE is used rather than JSON_EXTRACT: given a JSON-typed input,
-- JSON_EXTRACT returns JSON-typed fields (a string scalar keeps its quotes),
-- whereas JSON_VALUE returns the scalar as an unquoted SQL STRING.
WITH _cte AS (
  SELECT JSON '{\"ID\": \"A\", \"QualifierID\": \"XYZ\", \"text\": \"Origin\"}' AS tojson
)
SELECT
  STRUCT(
    JSON_VALUE(tojson, '$.ID') AS ID,
    JSON_VALUE(tojson, '$.QualifierID') AS QualifierID,
    JSON_VALUE(tojson, '$.text') AS text
  ) AS tostruct
FROM _cte

Related

BigQuery - Count how many words in array are equal

I want to count how many similar words I have in a path (which will be split at delimiter /) and return a matching array of integers.
Input data will be something like:
I want to add another column, match_count, with an array of integers. For example:
To replicate this case, this is the query I'm working with:
-- Skeleton from the question: the function body is only a placeholder
-- comment, so this script does not run as written.
CREATE TEMP FUNCTION HOW_MANY_MATCHES_IN_PATH(src_path ARRAY<STRING>, test_path ARRAY<STRING>) RETURNS ARRAY<INTEGER> AS (
-- WHAT DO I PUT HERE?
);
-- Calls the function on one sample row and appends its result as
-- dir_path_match_count.
SELECT
*,
HOW_MANY_MATCHES_IN_PATH(src_path, test_path) as dir_path_match_count
FROM (
-- Sample input: the two literal arrays are cross-joined (2 x 1 = 2 rows) and
-- aggregated back into a single row holding two array columns.
SELECT
ARRAY_AGG(x) AS src_path,
ARRAY_AGG(y) as test_path
FROM
UNNEST([
'lib/client/core.js',
'lib/server/core.js'
]) AS x, UNNEST([
'test/server/core.js'
]) as y
)
I've tried working with ARRAY and UNNEST in the HOW_MANY_MATCHES_IN_PATH function, but I either end up with an error or an array of 4 items (in this example)
Consider below approach
-- Counts the distinct '/'-separated segments of src_path that also occur
-- among the '/'-separated segments of test_path.
CREATE TEMP FUNCTION how_many_matches_in_path(src_path STRING, test_path STRING)
RETURNS INTEGER AS (
  (
    SELECT COUNT(DISTINCT src_part)
    FROM UNNEST(SPLIT(src_path, '/')) AS src_part
    WHERE src_part IN UNNEST(SPLIT(test_path, '/'))
  )
);
-- For every row, pairs src_path[i] with test_path[i] (joined on array
-- position) and collects the per-pair match counts into an array.
-- ORDER BY offset keeps the counts in input order: without it the element
-- order of ARRAY(subquery) is not guaranteed.
SELECT
  *,
  ARRAY(
    SELECT how_many_matches_in_path(src, test)
    FROM UNNEST(t.src_path) AS src WITH OFFSET
    INNER JOIN UNNEST(t.test_path) AS test WITH OFFSET
      USING (offset)
    ORDER BY offset
  ) AS dir_path_match_count
FROM your_table AS t
If applied to the sample input data in your question:
-- Sample input as a CTE named your_table: one row with two parallel arrays.
-- This fragment is not a complete statement by itself; a SELECT that reads
-- from your_table has to follow it.
with your_table as (
select
['lib/client/core.js', 'lib/server/core.js'] src_path,
['test/server/core.js', 'test/server/core.js'] test_path
)
output is

ARRAY_AGG not allowed in user-defined function (Standard SQL)

While working on a user-defined function in BigQuery to extract emails from messy data sets, I'm facing an issue: ARRAY_AGG() is not allowed in the body of a temporary user-defined function (UDF).
-- First attempt from the question (reported as NOT working): ARRAY_AGG is
-- used directly as the function-body expression, with no enclosing SELECT,
-- and BigQuery rejects ARRAY_AGG in that position.
-- Intended logic: join the input strings with commas, lower-case, strip
-- spaces, split on commas, keep the pieces containing '#', and return the
-- one at position `index`.
CREATE TEMP FUNCTION GET_EMAIL(emails ARRAY<STRING>, index INT64) AS (
ARRAY_AGG(
DISTINCT
(SELECT * FROM
UNNEST(
SPLIT(
REPLACE(
LOWER(
ARRAY_TO_STRING(emails, ",")
)," ", ""
)
)
) AS e where e like '%#%'
) IGNORE NULLS
)[SAFE_OFFSET(index)]
);
SELECT GET_EMAIL(["bob#hotmail.com,test#gmail.com", "12345", "bon#yahoo.com"],1) as email_1
I've tried to bypass the ARRAY_AGG by selecting from UNNEST with OFFSET and then WHERE the offset would be the index.
However, now there's a column limitation (no more than one column is allowed in a scalar subquery's SELECT clause), and the error suggests using SELECT AS STRUCT instead.
I gave a try to the SELECT AS STRUCT:
-- Second attempt from the question (also reported as NOT working): a
-- SELECT AS STRUCT over the split pieces, filtered by WITH OFFSET position.
-- The question reports that DISTINCT is rejected and that, even without it,
-- the references to e and o fail to parse.
CREATE TEMP FUNCTION GET_EMAIL(emails ARRAY<STRING>, index INT64) AS (
(SELECT AS STRUCT DISTINCT list.e, list.o FROM
UNNEST(
SPLIT(
REPLACE(
LOWER(
ARRAY_TO_STRING(emails, ", ")
)," ", ""
)
)
) AS list
WITH OFFSET as list.o
WHERE list.e like '%#%' AND list.o = index)
);
SELECT GET_EMAIL(["bob#hotmail.com,test#gmail.com", "12345", "bob#yahoo.com"],1) as email_1
But it doesn't like my DISTINCT and then even removing it, it will complain about parsing e and o.
So I'm out of ideas here, I probably made a knot. Can anyone suggest how to do this job inside a UDF? Thanks.
Below version works
-- Returns the email at zero-based position `index` among the email-like
-- pieces of `emails`, or NULL when `index` is out of range (SAFE_OFFSET).
-- Steps: join the inputs with commas, lower-case, strip spaces, split on
-- commas (SPLIT's default delimiter), keep the pieces containing '#'.
-- WITH OFFSET + ORDER BY pin the pieces to their input order; without an
-- ORDER BY the element order of ARRAY(subquery) is not guaranteed, so the
-- positional lookup could return a different email from run to run.
CREATE TEMP FUNCTION GET_EMAIL(emails ARRAY<STRING>, index INT64) AS ((
  SELECT ARRAY(
    SELECT e
    FROM UNNEST(
      SPLIT(
        REPLACE(
          LOWER(
            ARRAY_TO_STRING(emails, ",")
          ), " ", ""
        )
      )
    ) AS e WITH OFFSET AS pos
    WHERE e LIKE '%#%'
    ORDER BY pos
  )[SAFE_OFFSET(index)]
));
SELECT GET_EMAIL(["bob#hotmail.com,test#gmail.com", "12345", "bon#yahoo.com"], 1) AS email_1
with result
Row email_1
1 test#gmail.com
Or below version (which is just slight correction of your original query)
-- Same contract as the ARRAY(...) variant: returns the email at zero-based
-- position `index` among the pieces of `emails` that contain '#', or NULL
-- when there is no such position.
-- ARRAY_AGG sits inside a scalar subquery (that is what makes it legal in a
-- UDF body) and is ordered by the UNNEST offset so positions follow the
-- input order instead of an unspecified aggregation order.
CREATE TEMP FUNCTION GET_EMAIL(emails ARRAY<STRING>, index INT64) AS ((
  SELECT ARRAY_AGG(e ORDER BY pos)[SAFE_OFFSET(index)]
  FROM UNNEST(
    SPLIT(
      REPLACE(
        LOWER(
          ARRAY_TO_STRING(emails, ",")
        ), " ", ""
      )
    )
  ) AS e WITH OFFSET AS pos
  WHERE e LIKE '%#%'
));
SELECT GET_EMAIL(["bob#hotmail.com,test#gmail.com", "12345", "bon#yahoo.com"], 1) AS email_1
obviously with the same result

Bigquery array of STRINGs to array of INTs

I'm trying to pull an array of INT64 s in BigQuery standard SQL from a column which is a long string of numbers separated by commas (for example, 2013,1625,1297,7634). I can pull an array of strings easily with:
-- Splits the comma-separated column into an array of strings.
SELECT SPLIT(string_col, ",")
FROM table
However, I want to return an array of INT64 s, not an array of strings. How can I do that? I've tried
CAST(SPLIT(string_col,",") AS ARRAY<INT64>)
but that doesn't work.
Below is for BigQuery Standard SQL
#standardSQL
-- Two ways to turn a comma-separated string into an array of INT64:
--   num   : scalar subquery aggregating the cast pieces with ARRAY_AGG
--   num_2 : ARRAY(subquery) over the same cast pieces
WITH yourTable AS (
  SELECT 1 AS id, '2013,1625,1297,7634' AS string_col
  UNION ALL
  SELECT 2, '1,2,3,4,5'
)
SELECT
  src.id,
  (
    SELECT ARRAY_AGG(CAST(part AS INT64))
    FROM UNNEST(SPLIT(src.string_col)) AS part
  ) AS num,
  ARRAY(
    SELECT CAST(part AS INT64)
    FROM UNNEST(SPLIT(src.string_col)) AS part
  ) AS num_2
FROM yourTable AS src
Mikhail beat me to it and his answer is more extensive but adding this as a more minimal repro:
-- Minimal form: one INT64 row per comma-separated piece of the literal.
SELECT CAST(num AS INT64)
FROM UNNEST(SPLIT("2013,1625,1297,7634", ",")) AS num;

Extract a sub string from a string

In Google BigQuery, I need to pull the string that is between domain** and ** as in the example below.
The string is under the column "Site_Data"
Can someone help me? 10x!
See example below
#standardSQL
-- Extracts the text between "domain**" and the NEXT "**" in Site_Data.
-- The capture group is lazy (.*?): a greedy (.*) would run up to the LAST
-- "**" in the string and over-capture whenever more than one "**" follows
-- "domain**".
WITH yourTable AS (
  SELECT '756-1__6565656565656, tagtype**unmapped,domain**www.sport.com,userarriveddirectly**False' AS Site_Data
)
SELECT
  REGEXP_EXTRACT(Site_Data, r'domain\*\*(.*?)\*\*') AS x,
  Site_Data
FROM yourTable
Do all of the strings have that format? There are a couple of different options, assuming that you always need the third string after the ** delimiter.
1) Use SPLIT, e.g.:
#standardSQL
-- Takes the third "**"-delimited piece of site_data (ORDINAL is 1-based).
WITH SampleData AS (
  SELECT '756-1__67648582789116,tagtype**unmapped,domain**www.sport.com,userarriveddirectly**False' AS site_data
)
SELECT
  SPLIT(sd.site_data, '**')[ORDINAL(3)] AS visit_type
FROM SampleData AS sd;
2) Use REGEXP_EXTRACT, e.g.:
#standardSQL
-- Regex variant: skip two "<text>**" groups, then capture the following run
-- of non-asterisk characters.
WITH SampleData AS (
  SELECT '756-1__67648582789116,tagtype**unmapped,domain**www.sport.com,userarriveddirectly**False' AS site_data
)
SELECT
  REGEXP_EXTRACT(
    sd.site_data,
    r'[^\*]+\*\*[^\*]+\*\*([^\*]+)'
  ) AS visit_type
FROM SampleData AS sd;
Taking this a step further, if you want to split the domain and the arrival type, you can use SPLIT again:
#standardSQL
-- Two-stage split: first isolate the third "**"-delimited piece, then split
-- that piece on commas (SPLIT's default delimiter) into domain and arrival
-- type.
WITH SampleData AS (
  SELECT '756-1__67648582789116,tagtype**unmapped,domain**www.sport.com,userarriveddirectly**False' AS site_data
),
visit AS (
  SELECT SPLIT(site_data, '**')[ORDINAL(3)] AS visit_type
  FROM SampleData
)
SELECT
  SPLIT(visit_type)[ORDINAL(1)] AS domain,
  SPLIT(visit_type)[ORDINAL(2)] AS arrival_type
FROM visit;

Splitting out a field in Big Query

I have searched around and can not find much on this topic (maybe bad search terms :). I have a table, Protopayload.resource, that gets Apache logging information. As a result the field I am interested in contains multiple values that I need to search against. The field is formatted in a php URL style.
i.e.
/?id=13242134123&ver=12&os_bits=64&os_type=mac&lng=EN
This makes all searches end up with really long regexes to get data. Then join statements to combine data.
Example search to combine mac/win stats
-- Query from the question, written in BigQuery Legacy SQL (TABLE_DATE_RANGE,
-- REGEXP_MATCH, CONTAINS, INTEGER()).
-- Shape: t1 counts matching "win" rows per day, t2 counts matching "mac" rows
-- per day over the same 30-day table range, and the two are LEFT JOINed on
-- date with missing mac counts defaulted to 0.
-- NOTE(review): the first pattern, r'ver=[11,12', has an unclosed '[', and
-- both patterns use character classes where alternation looks intended;
-- the filters test 'os=' while the example URL uses 'os_type=' -- confirm.
SELECT
t1.date, t1.wincount, COALESCE(t2.maccount, 0) AS maccount
FROM (
SELECT
DATE(metadata.timestamp) AS date,
INTEGER(COUNT(protoPayload.resource)) AS wincount
FROM (TABLE_DATE_RANGE(tablename, DATE_ADD(CURRENT_TIMESTAMP(), -30, 'DAY'), CURRENT_TIMESTAMP() ))
WHERE
(REGEXP_MATCH(protoPayload.resource, r'ver=[11,12'))
AND protoPayload.resource CONTAINS 'os=win' GROUP BY date ) t1
LEFT JOIN (
SELECT
DATE(metadata.timestamp) AS date,
INTEGER(COUNT(protoPayload.resource)) AS maccount
FROM (TABLE_DATE_RANGE(tablename, DATE_ADD(CURRENT_TIMESTAMP(), -30, 'DAY'), CURRENT_TIMESTAMP() ))
WHERE
(REGEXP_MATCH(protoPayload.resource, r'cv=[p,m][17,16,15,14]'))
AND protoPayload.resource CONTAINS 'os=mac' GROUP BY date ) t2
ON
t1.date = t2.date
ORDER BY t1.date
What I was thinking was to use similar regex searches. Create a new table. Then save the data to a new table with relation fields. Then fix future logging so it logs to the table correctly.
My question is: is this a valid solution, or is there a much easier way to accomplish this in Google BigQuery? Is there a better way to transform the data?
Thanks again for any input!
You can use a SQL function to parse the key-value pairs into an array, which will generally be faster than using JavaScript. For example,
#standardSQL
-- Parses the query-string part of a resource path ("/?k1=v1&k2=v2") into an
-- array of (key, value) structs, in the order the pairs appear.
--   queryString : text containing "/?" followed by '&'-separated pairs.
--   returns     : ARRAY<STRUCT<key STRING, value STRING>>; NULL when
--                 queryString is NULL or has no "/?" part.
-- Edge cases handled:
--   * a pair with no '=' (e.g. "flag") yields value NULL instead of raising
--     an out-of-bounds array error;
--   * the value is everything after the FIRST '=', so values that themselves
--     contain '=' are kept whole;
--   * pairs are aggregated ORDER BY their UNNEST offset so the output order
--     is deterministic.
CREATE TEMPORARY FUNCTION ParseKeys(queryString STRING)
RETURNS ARRAY<STRUCT<key STRING, value STRING>> AS (
  (
    SELECT
      ARRAY_AGG(
        STRUCT(
          SPLIT(pairString, '=')[OFFSET(0)] AS key,
          REGEXP_EXTRACT(pairString, r'=(.*)') AS value
        )
        ORDER BY pos
      )
    FROM UNNEST(SPLIT(REGEXP_EXTRACT(queryString, r'/\?(.*)'), '&')) AS pairString WITH OFFSET AS pos
  )
);
SELECT ParseKeys('/?foo=bar&baz=2');
Now you can build on this with a function that pivots the keys into struct fields:
#standardSQL
-- Pivots the (key, value) pairs produced by ParseKeys into one STRUCT with
-- fixed fields. id, ver and os_bits are cast to INT64; os_type and lng stay
-- STRING. A key that is absent from the input yields NULL for its field.
CREATE TEMP FUNCTION GetAttributes(queryString STRING) AS (
  (
    SELECT AS STRUCT
      MAX(CASE WHEN kv.key = 'id' THEN CAST(kv.value AS INT64) END) AS id,
      MAX(CASE WHEN kv.key = 'ver' THEN CAST(kv.value AS INT64) END) AS ver,
      MAX(CASE WHEN kv.key = 'os_bits' THEN CAST(kv.value AS INT64) END) AS os_bits,
      MAX(CASE WHEN kv.key = 'os_type' THEN kv.value END) AS os_type,
      MAX(CASE WHEN kv.key = 'lng' THEN kv.value END) AS lng
    FROM UNNEST(ParseKeys(queryString)) AS kv
  )
);
Putting everything together, you can try out the GetAttributes function with some sample input:
#standardSQL
-- Parses the query-string part of a resource path ("/?k1=v1&k2=v2") into an
-- array of (key, value) structs, in the order the pairs appear.
-- A pair with no '=' yields value NULL (instead of an out-of-bounds array
-- error), and the value is everything after the FIRST '=', so values that
-- contain '=' are kept whole. Returns NULL when there is no "/?" part.
CREATE TEMPORARY FUNCTION ParseKeys(queryString STRING)
RETURNS ARRAY<STRUCT<key STRING, value STRING>> AS (
  (
    SELECT
      ARRAY_AGG(
        STRUCT(
          SPLIT(pairString, '=')[OFFSET(0)] AS key,
          REGEXP_EXTRACT(pairString, r'=(.*)') AS value
        )
        ORDER BY pos
      )
    FROM UNNEST(SPLIT(REGEXP_EXTRACT(queryString, r'/\?(.*)'), '&')) AS pairString WITH OFFSET AS pos
  )
);

-- Pivots the parsed pairs into one STRUCT with fixed fields. id, ver and
-- os_bits are cast to INT64; os_type and lng stay STRING. A key that is
-- absent from the input yields NULL for its field.
CREATE TEMP FUNCTION GetAttributes(queryString STRING) AS (
  (
    SELECT AS STRUCT
      MAX(IF(key = 'id', CAST(value AS INT64), NULL)) AS id,
      MAX(IF(key = 'ver', CAST(value AS INT64), NULL)) AS ver,
      MAX(IF(key = 'os_bits', CAST(value AS INT64), NULL)) AS os_bits,
      MAX(IF(key = 'os_type', value, NULL)) AS os_type,
      MAX(IF(key = 'lng', value, NULL)) AS lng
    FROM UNNEST(ParseKeys(queryString))
  )
);

-- Demo: expand the attribute struct into columns for two sample URLs.
SELECT url, GetAttributes(url).*
FROM UNNEST(['/?id=13242134123&ver=12&os_bits=64&os_type=mac&lng=EN',
             '/?id=2343645745&ver=15&os_bits=32&os_type=linux&lng=FR']) AS url;
You can always use Javascript UDFs for maximum flexibility. They will be slower than a pure SQL solution, but you'll be able to code around its limitations.
For example:
#standardSQL
-- JavaScript UDF that parses a bare query string ("k1=v1&k2=v2", without a
-- leading "/?") into a STRUCT of STRING fields.
-- A key that occurs more than once has its values joined with ','
-- (e.g. "lng=EN1&lng=EN2" gives lng = "EN1,EN2"). The join is done explicitly
-- in JavaScript so every field handed back is a plain string, matching the
-- declared STRING field types, rather than an array.
CREATE TEMPORARY FUNCTION parse(query STRING)
RETURNS STRUCT<id STRING, ver STRING, os_bits STRING, os_type STRING, lng STRING>
LANGUAGE js AS """
  function parseQueryString(query) {
    // http://codereview.stackexchange.com/a/10396
    // Collects every value seen for a key into an array under that key.
    var map = {};
    query.replace(/([^&=]+)=?([^&]*)(?:&+|$)/g, function(match, key, value) {
      (map[key] = map[key] || []).push(value);
    });
    return map;
  }
  var collected = parseQueryString(query);
  var result = {};
  Object.keys(collected).forEach(function(key) {
    result[key] = collected[key].join(',');
  });
  return result;
""";
WITH urls AS (
  SELECT 'id=13242134123&ver=12&os_bits=64&os_type=mac&lng=EN' query
  UNION ALL
  SELECT 'id=13242134124&ver=12&os_bits=64&os_type=mac&lng=EN1&lng=EN2' query
)
SELECT query, parse(query) AS parsed
FROM urls;
I see a few issues in the query in your question:
1. looks like regexp is not correct and will not capture what you expect
2. query is heavily over-engineered and can be quite simplified
Below is to address above points
-- Single-pass replacement for the question's two-subquery join, kept in
-- BigQuery Legacy SQL (TABLE_DATE_RANGE, REGEXP_MATCH, CONTAINS).
-- Per day, each SUM aggregates a boolean condition: the resource matches
-- the version pattern AND contains the given os_type marker.
-- NOTE(review): presumably relies on Legacy SQL treating TRUE as 1 inside
-- SUM -- confirm against the dialect before porting to Standard SQL.
SELECT
DATE(metadata.timestamp) AS date,
SUM(REGEXP_MATCH(protoPayload.resource, r'ver=(11|12)\b')
AND protoPayload.resource CONTAINS 'os_type=win'
) AS wincount,
SUM(REGEXP_MATCH(protoPayload.resource, r'cv=(p|m)(17|16|15|14)\b')
AND protoPayload.resource CONTAINS 'os_type=mac'
) AS maccount
FROM (TABLE_DATE_RANGE(tablename, DATE_ADD(CURRENT_TIMESTAMP(), -30, 'DAY'),
CURRENT_TIMESTAMP() ))
GROUP BY date
Please note: the query in your question is written in BigQuery Legacy SQL, so I kept my answer in the same dialect.