While working on a user-defined function in BigQuery to extract emails from messy data sets, I'm facing an issue: ARRAY_AGG() is not allowed in the body of a temp user-defined function (UDF).
-- Question's first attempt. The author reports that BigQuery rejects the
-- ARRAY_AGG() call in this temp UDF body, so this snippet does not run as posted.
-- Intent: flatten `emails` into one string, normalise it, split it back into
-- tokens, keep the tokens containing '#', and return the one at `index`.
CREATE TEMP FUNCTION GET_EMAIL(emails ARRAY<STRING>, index INT64) AS (
ARRAY_AGG(
DISTINCT
(SELECT * FROM
UNNEST(
SPLIT(
REPLACE(
LOWER(
ARRAY_TO_STRING(emails, ",") -- join every array element with commas
)," ", "" -- remove all spaces from the lower-cased string
)
) -- SPLIT is called without an explicit delimiter
) AS e where e like '%#%' -- keep only tokens that contain '#'
) IGNORE NULLS
)[SAFE_OFFSET(index)] -- SAFE_OFFSET: zero-based lookup on the aggregated array
);
-- Example call from the question.
SELECT GET_EMAIL(["bob#hotmail.com,test#gmail.com", "12345", "bon#yahoo.com"],1) as email_1
I've tried to bypass the ARRAY_AGG by selecting from UNNEST with OFFSET and then WHERE the offset would be the index.
However, now there's a column limitation (no more than one column is allowed in the SELECT clause of a scalar subquery), and the error suggests using SELECT AS STRUCT instead.
I gave a try to the SELECT AS STRUCT:
-- Question's second attempt. The author reports that BigQuery rejects the
-- DISTINCT and, once that is removed, complains about parsing `e` and `o`,
-- so this snippet does not run as posted.
-- Intent: pick the token whose position equals `index` via WITH OFFSET.
CREATE TEMP FUNCTION GET_EMAIL(emails ARRAY<STRING>, index INT64) AS (
(SELECT AS STRUCT DISTINCT list.e, list.o FROM
UNNEST(
SPLIT(
REPLACE(
LOWER(
ARRAY_TO_STRING(emails, ", ") -- note: joins with ", " here, not "," as in the first attempt
)," ", "" -- remove all spaces from the lower-cased string
)
)
) AS list
WITH OFFSET as list.o
WHERE list.e like '%#%' AND list.o = index) -- filters on '#' and on the raw offset in the same predicate
);
-- Example call from the question.
SELECT GET_EMAIL(["bob#hotmail.com,test#gmail.com", "12345", "bob#yahoo.com"],1) as email_1
But it doesn't like my DISTINCT and then even removing it, it will complain about parsing e and o.
So I'm out of ideas here, I probably made a knot. Can anyone suggest how to do this job inside a UDF? Thanks.
Below version works
-- GET_EMAIL(emails, index)
--   emails: array of free-text strings; each element may hold several
--           comma-separated addresses.
--   index:  zero-based position of the wanted address among the tokens that
--           contain '#'.
-- Returns the lower-cased, space-free token at that position, or NULL when
-- `index` is out of range (SAFE_OFFSET) or no token matches.
CREATE TEMP FUNCTION GET_EMAIL(emails ARRAY<STRING>, index INT64) AS ((
  SELECT ARRAY(
    SELECT e
    FROM UNNEST(
      SPLIT(
        REPLACE(LOWER(ARRAY_TO_STRING(emails, ",")), " ", "")
      )
    ) AS e WITH OFFSET AS pos
    WHERE e LIKE '%#%'
    -- An ARRAY subquery without ORDER BY has no guaranteed element order;
    -- sorting by the UNNEST offset keeps `index` aligned with the input order.
    ORDER BY pos
  )[SAFE_OFFSET(index)]
));
SELECT GET_EMAIL(["bob#hotmail.com,test#gmail.com", "12345", "bon#yahoo.com"], 1) AS email_1;
with result
Row email_1
1 test#gmail.com
Or below version (which is just slight correction of your original query)
-- GET_EMAIL(emails, index) -- ARRAY_AGG variant.
--   emails: array of free-text strings; each element may hold several
--           comma-separated addresses.
--   index:  zero-based position of the wanted address among the tokens that
--           contain '#'.
-- Returns the lower-cased, space-free token at that position, or NULL when
-- `index` is out of range or no token matches.
CREATE TEMP FUNCTION GET_EMAIL(emails ARRAY<STRING>, index INT64) AS ((
  -- ARRAY_AGG without ORDER BY does not guarantee element order; ordering by
  -- the UNNEST offset makes `index` refer to the position in the input.
  SELECT ARRAY_AGG(e ORDER BY pos)[SAFE_OFFSET(index)]
  FROM UNNEST(
    SPLIT(
      REPLACE(LOWER(ARRAY_TO_STRING(emails, ",")), " ", "")
    )
  ) AS e WITH OFFSET AS pos
  WHERE e LIKE '%#%'
));
SELECT GET_EMAIL(["bob#hotmail.com,test#gmail.com", "12345", "bon#yahoo.com"], 1) AS email_1;
obviously with the same result
Related
How can I actually convert the string below into a STRUCT?
-- The JSON document is held in a plain STRING literal; each \" is an escaped double quote.
select '{\"ID\": \"A\", \"QualifierID\": \"XYZ\", \"text\": \"Origin\"}';
I want the above string formatted text to be Struct type in GCP Bigquery.
Using a JSON function,
-- Turn a JSON document stored as a STRING into a STRUCT of STRING fields by
-- reading each scalar with JSON_VALUE.
WITH sample_data AS (
  SELECT '{\"ID\": \"A\", \"QualifierID\": \"XYZ\", \"text\": \"Origin\"}' AS json
)
SELECT
  (
    SELECT AS STRUCT
      JSON_VALUE(src.json, '$.ID') AS ID,
      JSON_VALUE(src.json, '$.QualifierID') AS QualifierID,
      JSON_VALUE(src.json, '$.text') AS text
  ) AS struct_col
FROM sample_data AS src;
Or with JSON type,
-- Turn a JSON document stored as a STRING into a STRUCT of STRING fields using
-- the native JSON type.
WITH sample_data AS (
  SELECT '{\"ID\": \"A\", \"QualifierID\": \"XYZ\", \"text\": \"Origin\"}' json
),
-- Parse the text once per row instead of once per extracted field.
parsed AS (
  SELECT PARSE_JSON(json) AS doc
  FROM sample_data
)
SELECT STRUCT (
  -- STRING() unwraps a JSON string value into a SQL STRING.
  STRING(doc.ID) AS ID,
  STRING(doc.QualifierID) AS QualifierID,
  STRING(doc.text) AS text
) AS struct_col
FROM parsed;
you can get following result:
Consider yet another option
-- keys(input): top-level property names of the JSON object in `input`.
CREATE TEMP FUNCTION keys(input STRING) RETURNS ARRAY<STRING> LANGUAGE js AS """
  const parsed = JSON.parse(input);
  return Object.keys(parsed);
""";
-- values(input): top-level property values of the JSON object in `input`.
CREATE TEMP FUNCTION values(input STRING) RETURNS ARRAY<STRING> LANGUAGE js AS """
  const parsed = JSON.parse(input);
  return Object.values(parsed);
""";
-- Pair each key with its value by array position, then pivot the pairs into
-- one column per expected key. row_id (the serialised source row) only keeps
-- the pairs of different rows apart during the pivot and is dropped afterwards.
SELECT * EXCEPT(row_id)
FROM (
  SELECT
    TO_JSON_STRING(t) AS row_id,
    key,
    value
  FROM your_table AS t,
    UNNEST(keys(json)) AS key WITH OFFSET
  JOIN UNNEST(values(json)) AS value WITH OFFSET
    USING (offset)
)
PIVOT (ANY_VALUE(value) FOR key IN ('ID', 'QualifierID', 'text'))
if applied to sample data in your question - output is
Does something like this work for you?
-- Build a STRUCT of STRING fields from a JSON-typed value.
WITH _cte AS (
  SELECT JSON '{\"ID\": \"A\", \"QualifierID\": \"XYZ\", \"text\": \"Origin\"}' AS tojson
)
SELECT
  STRUCT(
    -- JSON_VALUE returns the scalar as an unquoted STRING; the legacy
    -- JSON_EXTRACT would return a JSON value that still carries its quotes.
    JSON_VALUE(tojson, '$.ID') AS ID,
    JSON_VALUE(tojson, '$.QualifierID') AS QualifierID,
    JSON_VALUE(tojson, '$.text') AS text
  ) AS tostruct
FROM _cte;
I want to count how many similar words I have in a path (which will be split at delimiter /) and return a matching array of integers.
Input data will be something like:
I want to add another column, match_count, with an array of integers. For example:
To replicate this case, this is the query I'm working with:
-- Question's skeleton: the UDF body is still a placeholder, so this script does
-- not run as posted. The function is meant to take two arrays of paths and
-- return an array of integers (one match count per path pair).
CREATE TEMP FUNCTION HOW_MANY_MATCHES_IN_PATH(src_path ARRAY<STRING>, test_path ARRAY<STRING>) RETURNS ARRAY<INTEGER> AS (
-- WHAT DO I PUT HERE?
);
SELECT
*,
HOW_MANY_MATCHES_IN_PATH(src_path, test_path) as dir_path_match_count
FROM (
-- Sample input: the two UNNESTs are comma (cross) joined, so each aggregated
-- array receives one entry per (x, y) combination.
SELECT
ARRAY_AGG(x) AS src_path,
ARRAY_AGG(y) as test_path
FROM
UNNEST([
'lib/client/core.js',
'lib/server/core.js'
]) AS x, UNNEST([
'test/server/core.js'
]) as y
)
I've tried working with ARRAY and UNNEST in the HOW_MANY_MATCHES_IN_PATH function, but I either end up with an error or an array of 4 items (in this example)
Consider below approach
-- how_many_matches_in_path(src_path, test_path)
-- Splits both paths on '/' and returns how many distinct segments of
-- src_path also occur among the segments of test_path.
CREATE TEMP FUNCTION how_many_matches_in_path(src_path STRING, test_path STRING) RETURNS INTEGER AS (
  (
    SELECT COUNT(DISTINCT src_part)
    FROM UNNEST(SPLIT(src_path, '/')) AS src_part
    WHERE src_part IN UNNEST(SPLIT(test_path, '/'))
  )
);
-- For every row, pair src_path[i] with test_path[i] and collect the per-pair
-- match counts into an array.
select *,
array( select how_many_matches_in_path(src, test)
from t.src_path src with offset
join t.test_path test with offset
using(offset)
-- Without ORDER BY the ARRAY subquery has no guaranteed element order;
-- sorting by offset keeps result position i aligned with input position i.
order by offset
) dir_path_match_count
from your_table t
if to apply to sample of Input data in your question
-- Sample data: a CTE fragment to prepend to the query above so that
-- `your_table` resolves to one row holding two parallel arrays of paths.
with your_table as (
select
['lib/client/core.js', 'lib/server/core.js'] src_path,
['test/server/core.js', 'test/server/core.js'] test_path
)
output is
I am trying to install a function. I don't understand what the problem is. I can install correctly when:
I delete the pivot
I use the Table and not the unnest only (so from the table, unnest(a))
-- Question's version; the author reports that it cannot be created as posted.
-- Intent: pivot the KEY/VALUE pairs in `a` into columns X, Y, Z, W and map
-- the first non-null one to a code (X -> 1, Y/Z/W -> 2, none -> 0).
-- id_one, id_two, start_date and end_date are not referenced in the body.
CREATE OR REPLACE FUNCTION `dataset.function_naming` (a ARRAY<STRUCT<ROW_ID STRING, KEY STRING, VALUE STRING>>, id_one STRING, id_two STRING, start_date DATE, end_date DATE) RETURNS INT64
AS (
with tmp1 as (
select ROW_ID,X,Y,Z,W
from
(
select prop.ROW_ID,prop.KEY, prop.VALUE
from unnest(a) prop
where prop.KEY in ('X','Y','Z','W') -- case-sensitive filter, applied before the UPPER() in the pivot
)
PIVOT
(
MAX(VALUE)
FOR UPPER(KEY) in('X','Y','Z','W')
) as PIVOT
)
-- NOTE(review): the commas after the THEN values and the missing END keyword
-- are not valid CASE syntax.
select case when X is not null then 1,
when Y is not null then 2,
when Z is not null then 2,
when W is not null then 2
else 0
from tmp1
);
Thanks all.
There are a few minor issues I see in your code:
missing extra (...) around function body
extra commas (,) within case statement
So, try below
-- function_naming(a, id_one, id_two, start_date, end_date)
-- Pivots the KEY/VALUE pairs in `a` into columns X, Y, Z and W (MAX(VALUE)
-- per ROW_ID) and maps the first non-null column to a code:
--   X -> 1, any of Y/Z/W -> 2, none -> 0.
-- id_one, id_two, start_date and end_date are part of the signature but are
-- not referenced in the body.
CREATE OR REPLACE FUNCTION `dataset.function_naming` (
  a ARRAY<STRUCT<ROW_ID STRING, KEY STRING, VALUE STRING>>,
  id_one STRING,
  id_two STRING,
  start_date DATE,
  end_date DATE
) RETURNS INT64
AS ((
  WITH tmp1 AS (
    SELECT ROW_ID, X, Y, Z, W
    FROM (
      SELECT prop.ROW_ID, prop.KEY, prop.VALUE
      FROM UNNEST(a) AS prop
      -- Case-sensitive filter, applied before the UPPER() in the pivot.
      WHERE prop.KEY IN ('X', 'Y', 'Z', 'W')
    )
    PIVOT (
      MAX(VALUE)
      FOR UPPER(KEY) IN ('X', 'Y', 'Z', 'W')
    ) AS pivoted
  )
  SELECT
    CASE
      WHEN X IS NOT NULL THEN 1
      -- Y, Z and W all map to 2, so one COALESCE covers the three branches.
      WHEN COALESCE(Y, Z, W) IS NOT NULL THEN 2
      ELSE 0
    END
  FROM tmp1
));
It seems there is an internal issue when using PIVOT together with UNNEST on the array. You can use the following, which executes the same logic; you can also open a case on the issue tracker, as a BigQuery issue with Google Cloud Support.
-- function_naming(a, id_one, id_two, start_date, end_date)
-- PIVOT-free version of the question's logic. Looks at every KEY/VALUE pair
-- in `a` (keys compared case-insensitively) and returns
--   1 when an X entry with a non-null VALUE exists,
--   2 otherwise when a Y, Z or W entry with a non-null VALUE exists,
--   0 otherwise (including an empty or NULL array).
-- X takes precedence over Y/Z/W, matching the order of the question's CASE.
-- id_one, id_two, start_date and end_date are part of the signature but are
-- not referenced in the body.
CREATE OR REPLACE FUNCTION `<dataset>.function_naming` (
  a ARRAY<STRUCT<ROW_ID STRING, KEY STRING, VALUE STRING>>,
  id_one STRING,
  id_two STRING,
  start_date DATE,
  end_date DATE
) RETURNS INT64
AS ((
  -- An aggregate without GROUP BY always yields exactly one row, so no extra
  -- fallback branch is needed for empty input. LOGICAL_OR over zero rows is
  -- NULL, which no WHEN branch accepts, so the CASE falls through to ELSE 0.
  SELECT
    CASE
      WHEN LOGICAL_OR(UPPER(prop.KEY) = 'X' AND prop.VALUE IS NOT NULL) THEN 1
      WHEN LOGICAL_OR(UPPER(prop.KEY) IN ('Y', 'Z', 'W') AND prop.VALUE IS NOT NULL) THEN 2
      ELSE 0
    END
  FROM UNNEST(a) AS prop
));
-- Testing the function (expected result: 1, because an X entry is present):
SELECT `<project>.<dataset>.function_naming`(
  [
    STRUCT("1" AS ROW_ID, "x" AS KEY, "10" AS VALUE),
    STRUCT("1" AS ROW_ID, "x" AS KEY, "20" AS VALUE),
    STRUCT("1" AS ROW_ID, "w" AS KEY, "20" AS VALUE),
    STRUCT("1" AS ROW_ID, "y" AS KEY, "20" AS VALUE)
  ],
  "1",
  "2",
  DATE "2022-12-10",
  DATE "2022-12-10"
);
all.
I need to extract from the string, by regex, all substrings matching the pattern "TTT\d{3}".
For the string in example i would like to get:
TTT108,TTT109,TTT111,TTT110
The DB2 function i would like to use is REGEXP_REPLACE(str,'REGEX pattern', ',').
The number of matching can be 0,1,2,3... in each string.
Thank you.
The example:
TTT108(optional);TTT109(optional);TTT111(optional);TTT110optional);ENTITYLIST_2=(optional);ENTITYLIST_3=(optional);Containment_Status=(optional)
If you want to extract the valid instead of replacing the invalid characters, please check if this helps:
-- Db2: extract every TTTnnn token from s and join the tokens with ', '.
with data (s) as (values
('TTT108(optional);TTT109(optional);TTT111(optional);TTT110optional);ENTITYLIST_2=(optional);ENTITYLIST_3=(optional);Containment_Status=(optional)')
),
-- Occurrence numbers to probe; any numbers table works. Only occurrences
-- 1..5 are covered here.
nums (n) as (values (1),(2),(3),(4),(5)),
-- One row per occurrence that actually exists in s.
matches (n, sst) as (
    select nums.n,
           regexp_substr(data.s,'(TTT[0-9][0-9][0-9])', 1, nums.n)
    from data
    cross join nums
    where nums.n <= regexp_count(data.s,'(TTT[0-9][0-9][0-9])')
)
select listagg(sst,', ') within group (order by n)
from matches
For any number of tokens & Db2 versions before 11.1:
-- Db2: tokenise each string on ';' with XQuery, keep the tokens that match
-- TTT followed by three digits, reduce each to that prefix, and join the
-- results per id with ','.
SELECT
    src.id,
    LISTAGG(t.tok, ',') AS str
FROM
    (
        VALUES
            (1, 'TTT108(optional);TTT109(optional);TTT111(optional);TTT110optional);ENTITYLIST_2=(optional);ENTITYLIST_3=(optional);Containment_Status=(optional)')
    ) AS src (id, str)
    -- Comma join kept on purpose: XMLTABLE reads src.str from the row to its left.
    , XMLTABLE(
        'for $id in tokenize($s, ";") let $new := replace($id, "(TTT\d{3}).*", "$1") where matches($id, "(TTT\d{3}).*") return <i>{string($new)}</i>'
        PASSING src.str AS "s"
        COLUMNS tok VARCHAR(6) PATH '.'
    ) AS t
GROUP BY src.id;
I have a problem where Eastern Arabic numerals have entered my table in a timestamp value; BigQuery doesn't recognise the value as a timestamp and will not execute my queries.
I wish to be able to convert this:
'٢٠١٨-١٠-١١T١٦:٠١:٤١.٠٤١Z'
into this:
'2018-10-11T16:01:41.041Z'
in bigquery, Is this possible?
How about this SQL UDF:
-- arabicConvert(input)
-- Replaces every Arabic-Indic digit (٠-٩) in `input` with the matching ASCII
-- digit and leaves all other characters untouched.
CREATE TEMP FUNCTION arabicConvert(input STRING) AS ((
  -- STRING_AGG without ORDER BY may concatenate in any order; ordering by the
  -- character position keeps the output in the same order as the input.
  SELECT STRING_AGG(COALESCE(FORMAT('%i', digits.i), letter), '' ORDER BY pos)
  FROM UNNEST(SPLIT(input, '')) AS letter WITH OFFSET AS pos
  LEFT JOIN (
    -- Lookup table: the offset of each digit glyph in the literal is its value.
    SELECT letter_dict, i
    FROM UNNEST(SPLIT('٠١٢٣٤٥٦٧٨٩', '')) AS letter_dict WITH OFFSET AS i
  ) AS digits
    ON letter = digits.letter_dict
));
SELECT arabicConvert('٢٠١٨-١٠-١١T١٦:٠١:٤١.٠٤١Z') AS converted;
2018-10-11T16:01:41.041Z
There is alternative, lighter option :o)
-- arabicNumeralsConvert(input)
-- Replaces Arabic-Indic digits (U+0660-U+0669) and Extended Arabic-Indic /
-- Persian digits (U+06F0-U+06F9) with ASCII digits. Every other code point,
-- including Arabic letters and any other non-ASCII text, is passed through.
CREATE TEMP FUNCTION arabicNumeralsConvert(input STRING) AS ((
  CODE_POINTS_TO_STRING(ARRAY(
    SELECT
      CASE
        -- 1632..1641 = U+0660..U+0669; subtracting 1584 lands on '0'..'9' (48..57).
        WHEN code BETWEEN 1632 AND 1641 THEN code - 1584
        -- 1776..1785 = U+06F0..U+06F9; subtracting 1728 lands on '0'..'9'.
        WHEN code BETWEEN 1776 AND 1785 THEN code - 1728
        ELSE code
      END
    FROM UNNEST(TO_CODE_POINTS(input)) AS code WITH OFFSET AS pos
    -- Keep the characters in their original order.
    ORDER BY pos
  ))
));
-- Demo: one timestamp string written with Arabic-Indic digits and one
-- already in ASCII; both are passed through the converter.
WITH t AS (
  SELECT str
  FROM UNNEST([
    '٢٠١٨-١٠-١١T١٦:٠١:٤١.٠٤١Z',
    '2018-10-12T20:34:57.546Z'
  ]) AS str
)
SELECT
  t.str,
  arabicNumeralsConvert(t.str) AS converted
FROM t;
result is as
str converted
٢٠١٨-١٠-١١T١٦:٠١:٤١.٠٤١Z 2018-10-11T16:01:41.041Z
2018-10-12T20:34:57.546Z 2018-10-12T20:34:57.546Z