bigquery code not translating Guillemets « and » correctly - google-bigquery

The following BigQuery code does not display the guillemets « and » correctly. In the output of the code below, notice that the guillemets are 'translated' as xAB and xBB. The expected answer should preserve the current translation but replace xAB with « and xBB with ».
-- Decodes a run of hexadecimal character references (e.g. '&#x41;&#x42;') into
-- the characters they name. Input that does not start with '&#x' is returned
-- unchanged.
CREATE TEMP FUNCTION decode(word STRING) AS ((
  SELECT
    CASE
      WHEN STARTS_WITH(word, '&#x') THEN
        SAFE.CODE_POINTS_TO_STRING(ARRAY(
          -- '&#x41;' becomes '0x41;', so every ';'-separated piece is a hex
          -- literal that SAFE_CAST can turn into a code point; pieces that do
          -- not cast fall back to ASCII(piece).
          SELECT COALESCE(SAFE_CAST(piece AS INT64), ASCII(piece))
          FROM UNNEST(SPLIT(REPLACE(word, '&#', '0'), ';')) AS piece
          -- SPLIT leaves an empty piece after the trailing ';'.
          WHERE piece != ''
        ))
      ELSE word
    END
));
-- Repro query from the question: splits `title` into runs of hex character
-- references and runs of ordinary text, passes every chunk through decode(),
-- and concatenates the chunks back in their original order.
-- NOTE: '&#x.{3};' accepts exactly three characters between '&#x' and ';', so
-- two-character references such as '&#xAB;' / '&#xBB;' do not match the first
-- alternative. That is the behaviour the question asks about, so the pattern
-- is intentionally left as posted.
WITH
  DATA AS (
    SELECT
      'Arabic' AS lang,
      'https://www.elwatannews.com/news/details/5516935' AS url,
      -- Must be a plain string literal: wrapping it in backticks would turn it
      -- into a quoted identifier.
      'تطورات «مذبحة أبو حزام ».. دفن 10 جثث وضبط 19 من عائلتي المجزرة' AS title)
SELECT
  url,
  lang,
  (
    SELECT
      STRING_AGG(decode(chars), '' ORDER BY pos)
    FROM
      UNNEST(REGEXP_EXTRACT_ALL(title, r'(?:&#x.{3};)+|[^&]+')) AS chars
      WITH OFFSET AS pos
  ) AS translate
FROM
  DATA

-- Decodes a run of hexadecimal character references (e.g. '&#x62A;&#x637;')
-- into the characters they name. Input that does not start with '&#x' is
-- returned unchanged; a run containing an invalid code point yields NULL
-- because of the SAFE. prefix.
CREATE TEMP FUNCTION decode(word STRING) AS ((
  SELECT
    IF(
      STARTS_WITH(word, '&#x'),
      SAFE.CODE_POINTS_TO_STRING(ARRAY(
        -- '&#x41;' becomes '0x41;', so every ';'-separated piece is a hex
        -- literal; pieces that do not cast fall back to ASCII(value).
        SELECT COALESCE(SAFE_CAST(value AS INT64), ASCII(value))
        FROM UNNEST(SPLIT(REPLACE(word, '&#', '0'), ';')) AS value
          WITH OFFSET AS pos
        -- SPLIT leaves an empty piece after the trailing ';'.
        WHERE value != ''
        -- Without an explicit ORDER BY the element order of ARRAY(subquery)
        -- is not guaranteed, which could scramble multi-character runs.
        ORDER BY pos
      )),
      word)
));
-- Splits `title` into chunks, decodes each chunk, and reassembles them in
-- their original order. The pattern has three alternatives:
--   1. a run of hex character references with 1-6 hex digits each, so
--      '&#xAB;' (two digits) and references beyond three digits are covered;
--   2. a run of characters containing no '&';
--   3. a lone '&' that does not start a reference, so it is kept instead of
--      being silently dropped from the output.
WITH
  DATA AS (
    SELECT
      'Arabic' AS lang,
      'https://www.elwatannews.com/news/details/5516935' AS url,
      'تطورات «مذبحة أبو حزام ».. دفن 10 جثث وضبط 19 من عائلتي المجزرة' AS title)
SELECT
  lang,
  (
    SELECT
      STRING_AGG(decode(chars), '' ORDER BY pos)
    FROM
      UNNEST(REGEXP_EXTRACT_ALL(
        title, r'(?:&#x[0-9A-Fa-f]{1,6};)+|[^&]+|&')) AS chars
      WITH OFFSET AS pos
  ) AS translate
FROM
  DATA
with output

Related

BigQuery SQL JSON Returning additional rows when current row contains multiple values

I have a table that looks like this
keyA | data:{"value":false}}
keyB | data:{"value":3}}
keyC | data:{"value":{"paid":10,"unpaid":20}}}
For keyA,keyB I can easily extract a single value with JSON_EXTRACT_SCALAR, but for keyC I would like to return multiple values and change the key name, so the final output looks like this:
keyA | false
keyB | 3
keyC-paid | 10
keyC-unpaid | 20
I know I can use UNNEST and JSON_EXTRACT to return multiple values and create additional rows, but I am unsure how to combine them so that the key column name is adjusted as well.
Even more generic approach
-- Parses `input` as JSON and returns the parsed value's own property names.
create temp function extract_keys(input string) returns array<string> language js as """
  const parsed = JSON.parse(input);
  return Object.keys(parsed);
""";
-- Parses `input` as JSON and returns the parsed value's own property values,
-- in the same order Object.keys would list their names.
create temp function extract_values(input string) returns array<string> language js as """
  const parsed = JSON.parse(input);
  return Object.values(parsed);
""";
-- Flattens a JSON document into a single-level JSON object whose keys are the
-- dot-separated paths to every leaf (e.g. {"a":{"b":1}} -> {"a.b":1}).
-- Returns the flattened object serialised as a JSON string.
create temp function extract_all_leaves(input string) returns string language js as '''
  function flattenObj(obj, parent = "", res = {}){
    for(let key in obj){
      let propName = parent ? parent + "." + key : key;
      let child = obj[key];
      // typeof null is "object"; without the explicit null check a null leaf
      // would be recursed into and vanish from the result.
      if(child !== null && typeof child === "object"){
        flattenObj(child, propName, res);
      } else {
        res[propName] = child;
      }
    }
    return JSON.stringify(res);
  }
  return flattenObj(JSON.parse(input));
''';
-- Emits one row per JSON leaf of `data`. The leaf path is appended to `col`
-- after removing the text 'value' and turning '.' into '-'
-- (e.g. 'value.paid' -> '-paid'). Keys and values are paired by array position.
SELECT
  col || REPLACE(REPLACE(key, 'value', ''), '.', '-') AS col,
  value
FROM your_table
CROSS JOIN UNNEST([STRUCT(extract_all_leaves(data) AS json)])
CROSS JOIN UNNEST(extract_keys(json)) AS key WITH OFFSET AS key_pos
INNER JOIN UNNEST(extract_values(json)) AS value WITH OFFSET AS value_pos
  ON key_pos = value_pos
if applied to sample data in your question - output is
Benefit of this approach is that it is quite generic and thus can handle any level of nesting in json
For example for below data/table
the output is
Try this one:
-- Sample rows from the question.
WITH sample AS (
  SELECT 'keyA' AS col, '{"value":false}' AS data
  UNION ALL
  SELECT 'keyB' AS col, '{"value":3}' AS data
  UNION ALL
  SELECT 'keyC' AS col, '{"value":{"paid":10,"unpaid":20}}' AS data
),
-- Keys and values found under $.value, as returned by the bqutil helpers.
expanded AS (
  SELECT
    col,
    data,
    `bqutil.fn.json_extract_keys`(JSON_QUERY(data, '$.value')) AS keys,
    `bqutil.fn.json_extract_values`(JSON_QUERY(data, '$.value')) AS vals
  FROM sample
)
-- LEFT JOINs keep rows that produce no key; for those rows the suffix is
-- empty and the value falls back to the scalar at $.value.
SELECT
  CONCAT(col, COALESCE(CONCAT('-', k), '')) AS col,
  COALESCE(v, JSON_VALUE(data, '$.value')) AS data
FROM expanded
LEFT JOIN UNNEST(keys) AS k WITH OFFSET AS ki
LEFT JOIN UNNEST(vals) AS v WITH OFFSET AS vi
  ON ki = vi;

Expanding a Struct of Struct to columns in bigquery

I am working with a BQ table that has a format of a STRUCT of STRUCTs.
It looks as follows:
I would like to have a table which looks like follows:
property_hs_email_last_click_date_value
currentlyinworkflow_value
hs_first_engagement_object_id_value
hs_first_engagement_object_id_value__st
5/5/2022 23:00:00
Y
1
'Hey'
The challenge is that there are 500 fields and I would like to make this efficient instead of writing out every single line as follows:
-- Hand-written expansion of the nested STRUCT fields into flat columns.
-- Every leaf is addressed by its full path and aliased to the flat column name
-- listed in the desired output (aliases cannot contain dots).
SELECT
  property_hs_email_last_click_date.value AS property_hs_email_last_click_date_value,
  properties.currentlyinworkflow.value AS currentlyinworkflow_value,
  properties.hs_first_engagement_object_id.value AS hs_first_engagement_object_id_value,
  properties.hs_first_engagement_object_id.value__st AS hs_first_engagement_object_id_value__st
FROM `project.database.TestTable`
Any suggestions on how to make this more efficient?
Edit:
Here's a query that creates a table such as this:
-- Sample table: one top-level STRUCT column plus a STRUCT of STRUCTs.
CREATE OR REPLACE TABLE `project.database.TestTable` (
  property_hs_email_last_click_date STRUCT<value STRING>,
  properties STRUCT<
    currentlyinworkflow STRUCT<value STRING>,
    hs_first_engagement_object_id STRUCT<value NUMERIC, value__st STRING>,
    first_conversion_event_name STRUCT<value STRING>
  >
);
-- Loads the two sample rows. The target columns are named explicitly so the
-- statement does not depend on the table's column order.
INSERT INTO `project.database.TestTable`
  (property_hs_email_last_click_date, properties)
VALUES
  (STRUCT('12/2/2022 23:00:02'),
   STRUCT(STRUCT('Yes'), STRUCT(1, 'Thursday'), STRUCT('Festival'))),
  (STRUCT('14/2/2021 12:00:02'),
   STRUCT(STRUCT('No'), STRUCT(5, 'Friday'), STRUCT('Phone')));
Below is quite generic script that extracts all leaves in JSON and then presents them as columns
-- Parses `input` as JSON and returns the parsed value's own property names.
create temp function extract_keys(input string) returns array<string> language js as """
  const parsed = JSON.parse(input);
  return Object.keys(parsed);
""";
-- Parses `input` as JSON and returns the parsed value's own property values,
-- in the same order Object.keys would list their names.
create temp function extract_values(input string) returns array<string> language js as """
  const parsed = JSON.parse(input);
  return Object.values(parsed);
""";
-- Flattens a JSON document into a single-level JSON object whose keys are the
-- dot-separated paths to every leaf (e.g. {"a":{"b":1}} -> {"a.b":1}).
-- Returns the flattened object serialised as a JSON string.
create temp function extract_all_leaves(input string) returns string language js as '''
  function flattenObj(obj, parent = "", res = {}){
    for(let key in obj){
      let propName = parent ? parent + "." + key : key;
      let child = obj[key];
      // typeof null is "object"; without the explicit null check a null leaf
      // would be recursed into and vanish from the result.
      if(child !== null && typeof child === "object"){
        flattenObj(child, propName, res);
      } else {
        res[propName] = child;
      }
    }
    return JSON.stringify(res);
  }
  return flattenObj(JSON.parse(input));
''';
-- Long-format staging table: one row per (source row, JSON leaf).
--   row_id : textual rendering of the whole source row (FORMAT('%t', t))
--   key    : dot-separated path of the leaf
--   value  : the leaf's value
--   offset : position of the leaf within its row's flattened object
CREATE TEMP TABLE temp_table AS (
  SELECT
    offset,
    key,
    value,
    FORMAT('%t', t) AS row_id
  FROM your_table AS t
  CROSS JOIN UNNEST([STRUCT(TO_JSON_STRING(t) AS json)])
  CROSS JOIN UNNEST([STRUCT(extract_all_leaves(json) AS leaves)])
  CROSS JOIN UNNEST(extract_keys(leaves)) AS key WITH OFFSET
  INNER JOIN UNNEST(extract_values(leaves)) AS value WITH OFFSET
    USING (offset)
);
-- Builds and runs a PIVOT statement whose column list is only known at run time.
-- The generated SQL pivots temp_table on replace(key, '.', '__'), taking
-- any_value(value) per pivot column; row_id is carried through the pivot and
-- then removed from the final projection with except(row_id).
-- The text between the triple-quoted pieces is the generated column list, so
-- nothing may be added inside those literals without changing the query.
execute immediate (select '''
select * except(row_id) from (select * except(offset) from temp_table)
pivot (any_value(value) for replace(key, '.', '__') in (''' || keys_list || '''
))'''
-- keys_list: every distinct key, double-quoted, with '.' replaced by '__',
-- comma-separated and ordered by the smallest offset at which the key occurs.
from (select string_agg('"' || replace(key, '.', '__') || '"', ',' order by offset) keys_list from (
select key, min(offset) as offset from temp_table group by key
))
);
if applied to sample data in your question
-- Sample data from the question. The first SELECT names every STRUCT field;
-- the second relies on UNION ALL matching columns and fields by position.
CREATE TEMP TABLE your_table AS (
  SELECT
    STRUCT('12/2/2022 23:00:02' AS value) AS property_hs_email_last_click_date,
    STRUCT(
      STRUCT('Yes' AS value) AS currentlyinworkflow,
      STRUCT(1 AS value, 'Thursday' AS value__st) AS hs_first_engagement_object_id,
      STRUCT('Festival' AS value) AS first_conversion_event_name
    ) AS properties
  UNION ALL
  SELECT
    STRUCT('14/2/2021 12:00:02'),
    STRUCT(STRUCT('No'), STRUCT(5, 'Friday'), STRUCT('Phone'))
);
the output is

build multiple json hash in same json in oracle

The two queries below each return a separate JSON document; when I merge the data, the result is invalid JSON.
-- First query: aggregates the AGRI_INCOME rows with ATTR2 = 'AGRICULTURE_INCOME'
-- into one JSON object keyed 'Agriculture_Expenses' and stores it in AFL_JSON.
SELECT JSON_OBJECT('Agriculture_Expenses' value
JSON_ARRAYAGG(JSON_OBJECT('Acreage12' value acreage,
'Farmer_projected' value
sellingprice,
'Projected_Expense' value attr1,
'Pattern1' value crop
)
-- JSON_OBJECT('ITEM' VALUE .50)
))
INTO AFL_JSON
FROM AGRI_INCOME
WHERE ATTR2 = 'AGRICULTURE_INCOME';
-- Second query: builds the 'Cash_Flow_Form' object from the fields of rec
-- (rec is declared outside this excerpt) and selects it INTO the same AFL_JSON
-- variable, so the value stored by the first query is overwritten.
SELECT JSON_OBJECT('Cash_Flow_Form' value
JSON_ARRAYAGG(JSON_OBJECT('Agri_Coapp1' value rec.Coapp1,
'Agri_Coapp2' value rec.Coapp2,
'Agri_Coapp3' value rec.Coapp3,
'Lease_Land' value
rec.lease_land,
'Total_Land' value '',
'Land_under_cultivation1' value
rec.Land_under_cultivation,
'App_agr_lan' value
rec.Land_Holding)
))
INTO AFL_JSON
FROM dual;
-- NOTE(review): this prints AFL_JSON_DATA, which is not assigned anywhere in
-- the visible code; the step that combines the two objects is not shown here.
DBMS_OUTPUT.PUT_LINE(AFL_JSON_DATA);
The two queries above return data in the format below, but an extra { appears when I merge the two JSON objects. How can I get the data in the format below?
{
"Cash_Flow_Form": [
{
"Agri_Coapp1": "1",
"Agri_Coapp2": "2",
"Agri_Coapp3": null,
"Lease_Land": "2",
"Total_Land": null,
"Land_under_cultivation1": "4",
"App_agr_lan": "5"
}
],
"Agriculture_Expenses": [
{
"Acreage12": "2",
"Farmer_projected": "2500",
"Projected_Expense": "81400",
"Pattern1": "Khariff"
}
]
}
You can use JSON_MERGEPATCH() function within a SELECT statement CROSS JOINing those JSON_OBJECTs
-- Builds the two JSON objects in separate CTEs (one row each) and merges them
-- into a single object with JSON_MERGEPATCH.
WITH AGRI_INCOME_JS AS
(
  -- {"Agriculture_Expenses":[...]} from the agriculture-income rows.
  -- The column is `acreage`, as in the question's own query.
  SELECT JSON_OBJECT('Agriculture_Expenses' VALUE
           JSON_ARRAYAGG(JSON_OBJECT('Acreage12'         VALUE acreage,
                                     'Farmer_projected'  VALUE sellingprice,
                                     'Projected_Expense' VALUE attr1,
                                     'Pattern1'          VALUE crop
                                    )
                        )
         ) AS JS2
    FROM AGRI_INCOME
   WHERE ATTR2 = 'AGRICULTURE_INCOME'
), CASH_FLOW_JS AS
(
  -- {"Cash_Flow_Form":[...]} from the cash-flow rows.
  SELECT JSON_OBJECT('Cash_Flow_Form' VALUE
           JSON_ARRAYAGG(JSON_OBJECT('Agri_Coapp1'             VALUE coapp1,
                                     'Agri_Coapp2'             VALUE coapp2,
                                     'Agri_Coapp3'             VALUE coapp3,
                                     'Lease_Land'              VALUE lease_land,
                                     'Total_Land'              VALUE Total_Land,
                                     'Land_under_cultivation1' VALUE Land_under_cultivation,
                                     'App_agr_lan'             VALUE Land_Holding
                                    )
                        )
         ) AS JS1
    FROM CASH_FLOW
)
-- Each CTE yields exactly one row, so the CROSS JOIN yields one merged row.
SELECT JSON_MERGEPATCH(JS1, JS2) AS "Result JSON"
  FROM CASH_FLOW_JS
 CROSS JOIN AGRI_INCOME_JS
The above query works for Oracle 18c and later. If your DB version is 12c, then use:
-- Variant without JSON_MERGEPATCH: the two objects are stacked as rows,
-- aggregated into a JSON array of strings, and the array/string punctuation is
-- then stripped textually so the two objects read as one.
WITH AGRI_INCOME_JS AS
(
  -- {"Agriculture_Expenses":[...]} from the agriculture-income rows.
  -- The column is `acreage`, as in the question's own query.
  SELECT JSON_OBJECT('Agriculture_Expenses' VALUE
           JSON_ARRAYAGG(JSON_OBJECT('Acreage12'         VALUE acreage,
                                     'Farmer_projected'  VALUE sellingprice,
                                     'Projected_Expense' VALUE attr1,
                                     'Pattern1'          VALUE crop
                                    )
                        )
         ) AS JS2
    FROM AGRI_INCOME
   WHERE ATTR2 = 'AGRICULTURE_INCOME'
), CASH_FLOW_JS AS
(
  -- {"Cash_Flow_Form":[...]} from the cash-flow rows.
  SELECT JSON_OBJECT('Cash_Flow_Form' VALUE
           JSON_ARRAYAGG(JSON_OBJECT('Agri_Coapp1'             VALUE coapp1,
                                     'Agri_Coapp2'             VALUE coapp2,
                                     'Agri_Coapp3'             VALUE coapp3,
                                     'Lease_Land'              VALUE lease_land,
                                     'Total_Land'              VALUE Total_Land,
                                     'Land_under_cultivation1' VALUE Land_under_cultivation,
                                     'App_agr_lan'             VALUE Land_Holding
                                    )
                        )
         ) AS JS1
    FROM CASH_FLOW
), JS(JS) AS
( SELECT JS1 FROM CASH_FLOW_JS
  UNION ALL
  SELECT JS2 FROM AGRI_INCOME_JS)
-- REPLACE chain, innermost first:
--   1. remove every backslash (the escaping added when the objects are
--      embedded as strings); NOTE(review): this also removes backslashes that
--      belong to the data itself;
--   2. remove the leading '["' of the aggregated array;
--   3. remove the trailing '"]';
--   4. turn the '}","{' between the two objects into ',' so they become one.
SELECT REPLACE(REPLACE(REPLACE(REPLACE(JSON_ARRAYAGG(JS),'\'),'["'),'"]'),'}","{',',')
       AS "Result JSON"
  FROM JS
Demo

BigQuery JSON EXTRACT

[
{
"key":"expiry_date",
"type":"date",
"label":"Expiry Date",
"required":false,
"default_value":"2029-12-15"
},
{
"key":"brand",
"type":"text",
"label":"Brand",
"required":false,
"default_value":"clipsal"
}
]
Is there a way that I could extract the default_value of "expiry_date" in the nested JSON above? The data is under a column called attributes.
Have you tried any of these functions described here? Maybe it can help.
Also, if the first element of the json array will be always what you want, you could use something like:
-- Sample row holding the question's JSON array as text.
WITH test_table AS (
  SELECT
    "[{\"key\":\"expiry_date\",\"type\":\"date\",\"label\":\"Expiry Date\",\"required\":false,\"default_value\":\"2029-12-15\"},{\"key\":\"brand\",\"type\":\"text\",\"label\":\"Brand\",\"required\":false,\"default_value\":\"clipsal\"}]" AS json_text_field
)
-- Reads default_value from the first array element only.
SELECT
  JSON_EXTRACT(t.json_text_field, '$[0].default_value')
FROM test_table AS t
If the key is not always the first element, you could use this instead:
-- Sample row holding the question's JSON array as text.
WITH test_table AS (
  SELECT
    "[{\"key\":\"expiry_date\",\"type\":\"date\",\"label\":\"Expiry Date\",\"required\":false,\"default_value\":\"2029-12-15\"},{\"key\":\"brand\",\"type\":\"text\",\"label\":\"Brand\",\"required\":false,\"default_value\":\"clipsal\"}]" AS json_text_field
)
-- Unnests the array and reads both paths from each element. Reading '$.key'
-- from the whole array text yields NULL, so the filter must be applied to the
-- unnested element. JSON_EXTRACT_SCALAR returns the values without the
-- surrounding JSON quotes.
SELECT
  JSON_EXTRACT_SCALAR(el, '$.default_value') AS value
FROM test_table
CROSS JOIN UNNEST(JSON_EXTRACT_ARRAY(json_text_field, '$')) AS el
WHERE JSON_EXTRACT_SCALAR(el, '$.key') = 'expiry_date'
Below is for BigQuery Standard SQL
#standardSQL
-- One output row per array element whose "key" is 'expiry_date'.
SELECT
  JSON_EXTRACT_SCALAR(el, '$.default_value') AS default_value
FROM `project.dataset.table` AS t
CROSS JOIN UNNEST(JSON_EXTRACT_ARRAY(t.json)) AS el
WHERE JSON_EXTRACT_SCALAR(el, '$.key') = 'expiry_date'
You can test the above with sample / dummy data from your question, as in the example below
#standardSQL
-- Inline stand-in for the real table: a single row with the question's JSON.
WITH `project.dataset.table` AS (
  SELECT '''
[
{
"key":"expiry_date",
"type":"date",
"label":"Expiry Date",
"required":false,
"default_value":"2029-12-15"
},
{
"key":"brand",
"type":"text",
"label":"Brand",
"required":false,
"default_value":"clipsal"
}
]
''' AS json
)
-- One output row per array element whose "key" is 'expiry_date'.
SELECT
  JSON_EXTRACT_SCALAR(el, '$.default_value') AS default_value
FROM `project.dataset.table` AS t
CROSS JOIN UNNEST(JSON_EXTRACT_ARRAY(t.json)) AS el
WHERE JSON_EXTRACT_SCALAR(el, '$.key') = 'expiry_date'
with output
Row default_value
1 2029-12-15
Depends on your real use case - you can consider below variation
#standardSQL
-- Keeps every source row and column, and adds default_value as an extra column
-- computed by a scalar subquery over that row's JSON array.
SELECT
  t.*,
  (
    SELECT JSON_EXTRACT_SCALAR(el, '$.default_value')
    FROM UNNEST(JSON_EXTRACT_ARRAY(t.json)) AS el
    WHERE JSON_EXTRACT_SCALAR(el, '$.key') = 'expiry_date'
  ) AS default_value
FROM `project.dataset.table` AS t

How to transform path/string by collapsing repeated elements?

There is a field in my table that represents pathways like the one below:
Item1->Item1->Item2-> Item3->Item3->Item3->Item1
In most cases this is quite a long sequence with many instances of the same consecutive items.
How can I shorten the above path to something like the one below, in BigQuery?
Item1(x2)->Item2->Item3(x3)->Item1
I wanted to convince myself that this was possible just through array manipulation (using standard SQL), and I came up with a solution. An alternate way to solve the problem would be to use analytic functions, where you could detect changes in item along the path.
-- Renders the output of PathwayToParts as a collapsed pathway string.
-- Input: one (part, off) entry per run of equal items, where `off` is the
-- position at which the run starts, followed by a sentinel entry whose `off`
-- is the total number of items. The length of a run is therefore the
-- difference between the next entry's `off` and its own.
-- Output: the parts joined with '->' in path order; runs longer than one item
-- get an '(xN)' suffix. The sentinel itself is not rendered.
CREATE TEMPORARY FUNCTION PartsToString(
  parts_and_offsets ARRAY<STRUCT<part STRING, off INT64>>) AS ((
  SELECT
    STRING_AGG(
      CONCAT(
        cur.part,
        IF(parts_and_offsets[SAFE_OFFSET(idx + 1)].off - cur.off = 1,
           "",
           CONCAT("(x",
                  CAST(parts_and_offsets[SAFE_OFFSET(idx + 1)].off - cur.off AS STRING),
                  ")"))),
      -- Explicit delimiter and ordering: the default delimiter is ',' and the
      -- aggregation order is otherwise not guaranteed.
      "->" ORDER BY idx)
  -- idx is the array position; it is named differently from the struct field
  -- `off` so the two cannot be confused.
  FROM UNNEST(parts_and_offsets) AS cur WITH OFFSET AS idx
  WHERE idx + 1 < ARRAY_LENGTH(parts_and_offsets)
));
-- Splits a '->'-separated pathway and keeps only the first item of every run
-- of equal consecutive items, together with the position where the run starts.
-- A sentinel entry ("", total item count) is appended so the length of the
-- last run can be computed by the consumer.
CREATE TEMPORARY FUNCTION PathwayToParts(pathway STRING) AS ((
  SELECT
    ARRAY_CONCAT(
      -- ORDER BY keeps the entries in path order; ARRAY_AGG does not
      -- guarantee any order on its own.
      ARRAY_AGG(STRUCT(part, off) ORDER BY off),
      [STRUCT("" AS part, ARRAY_LENGTH(ANY_VALUE(parts)) AS off)]) AS parts_and_offsets
  FROM (SELECT SPLIT(pathway, "->") AS parts),
    UNNEST(parts) AS part WITH OFFSET off
  -- SAFE_OFFSET returns NULL for position -1 instead of raising an error, so
  -- the predicate does not depend on OR being short-circuited when off = 0.
  WHERE off = 0 OR part != parts[SAFE_OFFSET(off - 1)]
));
WITH YourTable AS (
SELECT "Item1->Item2->Item2->Item2->Item3->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item2->Item3->Item3->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item4" AS pathway
UNION ALL SELECT "Item1->Item2->Item2->Item3->Item1->Item1->Item1->Item2->Item3->Item3->Item2->Item2->Item2->Item1->Item4" AS pathway
UNION ALL SELECT "Item1->Item1->Item1" AS pathway
UNION ALL SELECT "Item1->Item2->Item2" AS pathway
UNION ALL SELECT "Item1->Item1->Item2" AS pathway
UNION ALL SELECT "Item1->Item2->Item3" AS pathway
)
SELECT PartsToString(PathwayToParts(pathway)) AS parts_string
FROM YourTable;
Using Scalar JS UDF (Standard SQL) <-- would be my choice
-- Collapses runs of equal consecutive items in a '->'-separated pathway,
-- e.g. 'Item1->Item1->Item2' -> 'Item1(x2)->Item2'.
-- Returns NULL for NULL input.
CREATE TEMPORARY FUNCTION collapse_repeated(pathway STRING)
RETURNS STRING LANGUAGE js AS """
  // A SQL NULL arrives as null; calling split on it would throw.
  if (pathway === null) { return null; }
  var items = pathway.split('->');
  // All working state is declared locally instead of leaking into globals.
  var collapsed = '';
  var elem = items[0];
  var count = 0;
  // Appends the current run (elem repeated count times) to the result.
  function flush() {
    if (collapsed.length > 0) { collapsed += '->'; }
    collapsed += elem;
    if (count > 1) { collapsed += '(x' + count.toString() + ')'; }
  }
  for (var i = 0; i < items.length; i++) {
    if (items[i] !== elem) {
      flush();
      elem = items[i];
      count = 1;
    } else {
      count++;
    }
  }
  // The last run is still pending when the loop ends.
  flush();
  return collapsed;
""";
WITH YourTable AS (
SELECT "Item1->Item2->Item2->Item2->Item3->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item2->Item3->Item3->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item4" AS pathway
UNION ALL SELECT "Item1->Item2->Item2->Item3->Item1->Item1->Item1->Item2->Item3->Item3->Item2->Item2->Item2->Item1->Item4" AS pathway
UNION ALL SELECT "Item1->Item1->Item1" AS pathway
UNION ALL SELECT "Item1->Item2->Item2" AS pathway
)
SELECT collapse_repeated(pathway) AS shorten_pathway, pathway
FROM YourTable
Note: Same JS can be easily “translated” to JS UDF in Legacy SQL
Using Window Functions (Legacy SQL)
SELECT GROUP_CONCAT_UNQUOTED(IF(repeats=1, item, CONCAT(item, "(x", STRING(repeats), ")")), "->"), pathway
FROM (
SELECT MIN(pos) AS ord, MIN(item) AS item, COUNT(1) AS repeats, pathway
FROM (
SELECT item, pos, IFNULL(grp, 0)AS grp, pathway FROM (
SELECT item, pos, SUM(change) OVER(PARTITION BY pathway ORDER BY pos ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS grp, pathway
FROM (
SELECT item, pos, IF(item=next_item, 0, 1) AS change, pathway FROM (
SELECT item, pos, LEAD(item) OVER(PARTITION BY pathway ORDER BY pos) AS next_item, pathway
FROM (
SELECT item, POSITION(item) AS pos, pathway FROM (
SELECT SPLIT(pathway, "->") AS item, pathway FROM
(SELECT "Item1->Item2->Item2->Item2->Item3->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item2->Item3->Item3->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item2->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item1->Item4" AS pathway),
(SELECT "Item1->Item2->Item2->Item3->Item1->Item1->Item1->Item2->Item3->Item3->Item2->Item2->Item2->Item1->Item4" AS pathway),
(SELECT "Item1->Item1->Item1" AS pathway),
(SELECT "Item1->Item2->Item2" AS pathway)
)
)
)
)
)
)
GROUP BY grp, pathway
ORDER BY ord
)
GROUP BY pathway