BigQuery SQL JSON Returning additional rows when current row contains multiple values - sql

I have a table that looks like this
keyA | {"value":false}
keyB | {"value":3}
keyC | {"value":{"paid":10,"unpaid":20}}
For keyA,keyB I can easily extract a single value with JSON_EXTRACT_SCALAR, but for keyC I would like to return multiple values and change the key name, so the final output looks like this:
keyA | false
keyB | 3
keyC-paid | 10
keyC-unpaid | 20
I know I can use UNNEST and JSON_EXTRACT to pull out multiple values and create additional rows, but I'm unsure how to combine them and adjust the key column name as well.

An even more generic approach:
create temp function extract_keys(input string) returns array<string> language js as """
  return Object.keys(JSON.parse(input));
""";
create temp function extract_values(input string) returns array<string> language js as """
  return Object.values(JSON.parse(input));
""";
create temp function extract_all_leaves(input string) returns string language js as '''
  function flattenObj(obj, parent = '', res = {}) {
    for (let key in obj) {
      let propName = parent ? parent + '.' + key : key;
      if (typeof obj[key] == 'object') {
        flattenObj(obj[key], propName, res);
      } else {
        res[propName] = obj[key];
      }
    }
    return JSON.stringify(res);
  }
  return flattenObj(JSON.parse(input));
''';
select col || replace(replace(key, 'value', ''), '.', '-') as col, value
from your_table,
unnest([struct(extract_all_leaves(data) as json)]),
unnest(extract_keys(json)) key with offset
join unnest(extract_values(json)) value with offset
using(offset)
If applied to the sample data in your question, the output matches the expected result above.
The benefit of this approach is that it is quite generic and can handle any level of nesting in the JSON.
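For reference, the sample data can be loaded like this so the script runs end to end; this is just a minimal sketch, and the column names col and data are assumed from the question (place it before the final select):
-- hypothetical sample data matching the question; column names col/data assumed
create temp table your_table as (
  select 'keyA' as col, '{"value":false}' as data union all
  select 'keyB', '{"value":3}' union all
  select 'keyC', '{"value":{"paid":10,"unpaid":20}}'
);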

Try this one:
WITH sample AS (
  SELECT 'keyA' AS col, '{"value":false}' AS data
  UNION ALL
  SELECT 'keyB' AS col, '{"value":3}' AS data
  UNION ALL
  SELECT 'keyC' AS col, '{"value":{"paid":10,"unpaid":20}}' AS data
)
SELECT col || IFNULL('-' || k, '') AS col,
       IFNULL(v, JSON_VALUE(data, '$.value')) AS data
FROM (
  SELECT col, data,
         `bqutil.fn.json_extract_keys`(JSON_QUERY(data, '$.value')) AS keys,
         `bqutil.fn.json_extract_values`(JSON_QUERY(data, '$.value')) AS vals
  FROM sample
)
LEFT JOIN UNNEST(keys) k WITH OFFSET ki
LEFT JOIN UNNEST(vals) v WITH OFFSET vi ON ki = vi;

Related

Migrating data from jsonb to integer[] SQL

I have a jsonb field (data) in PostgreSQL with a structure like:
{ "id" => { "some_key" => [1, 2, 3] } }
I need to migrate the value to a different field.
t.jsonb "data"
t.integer "portals", default: [], array: true
When I try to do it like this:
UPDATE table_name
SET portals = ARRAY[data -> '1' ->> 'portals']
WHERE id = 287766
It raises an error:
Caused by PG::DatatypeMismatch: ERROR: column "portals" is of type integer[] but expression is of type text[]
Here is one way to do it. But if you search the site, as you should have done, you will find more.
Schema
create table t (
data jsonb
);
insert into t values ('{"1" : { "k1" : [1,2,3,5]} }');
insert into t values ('{"2" : { "k2" : [4,5,6,7]} }');
create table i (
  id int,
  v int[]
);
Some tests
select data -> '1' -> 'k1'
from t
where data ? '1'
;
insert into i values(1,ARRAY[1,2,3]);
update i
set v = (select replace(replace(data -> '1' ->> 'k1', '[', '{'), ']', '}')::int[] from t where data ? '1')
where id = 1;
select * from i;
The above gets the array as text, as you did. After that, a couple of text replacements turn it into an integer array literal that can be cast to int[].
DB Fiddle
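As a side note, the string surgery can be avoided by expanding the jsonb array and aggregating it. A minimal sketch against the same t and i tables, assuming the inner array only holds integers:
-- expand the jsonb array to text elements, cast each to int, and aggregate
update i
set v = (
  select array_agg(elem::int)
  from t, jsonb_array_elements_text(data -> '1' -> 'k1') as elem
  where data ? '1'
)
where id = 1;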

Expanding a STRUCT of STRUCTs to columns in BigQuery

I am working with a BQ table that is a STRUCT of STRUCTs (the create table script in the edit below reproduces it).
I would like to have a table which looks as follows:
property_hs_email_last_click_date_value | currentlyinworkflow_value | hs_first_engagement_object_id_value | hs_first_engagement_object_id_value__st
5/5/2022 23:00:00 | Y | 1 | 'Hey'
The challenge is that there are 500 fields and I would like to make this efficient instead of writing out every single line as follows:
SELECT property_hs_email_last_click_date.value as property_hs_email_last_click_date_value,
       properties.currentlyinworkflow.value as currentlyinworkflow_value,
       properties.hs_first_engagement_object_id.value as hs_first_engagement_object_id_value,
       properties.hs_first_engagement_object_id.value__st as hs_first_engagement_object_id_value__st
Any suggestions on how to make this more efficient?
Edit:
Here's a query that creates a table such as this:
create or replace table `project.database.TestTable` (
  property_hs_email_last_click_date STRUCT<value string>,
  properties STRUCT<
    currentlyinworkflow STRUCT<value string>,
    hs_first_engagement_object_id STRUCT<value numeric, value__st string>,
    first_conversion_event_name STRUCT<value string>
  >
);
insert into `project.database.TestTable`
values (struct('12/2/2022 23:00:02'), struct(struct('Yes'), struct(1, 'Thursday'), struct('Festival')));
insert into `project.database.TestTable`
values (struct('14/2/2021 12:00:02'), struct(struct('No'), struct(5, 'Friday'), struct('Phone')));
Below is a fairly generic script that extracts all the leaves in the JSON and then presents them as columns.
create temp function extract_keys(input string) returns array<string> language js as """
  return Object.keys(JSON.parse(input));
""";
create temp function extract_values(input string) returns array<string> language js as """
  return Object.values(JSON.parse(input));
""";
create temp function extract_all_leaves(input string) returns string language js as '''
  function flattenObj(obj, parent = '', res = {}) {
    for (let key in obj) {
      let propName = parent ? parent + '.' + key : key;
      if (typeof obj[key] == 'object') {
        flattenObj(obj[key], propName, res);
      } else {
        res[propName] = obj[key];
      }
    }
    return JSON.stringify(res);
  }
  return flattenObj(JSON.parse(input));
''';
create temp table temp_table as (
  select offset, key, value, format('%t', t) row_id
  from your_table t,
  unnest([struct(to_json_string(t) as json)]),
  unnest([struct(extract_all_leaves(json) as leaves)]),
  unnest(extract_keys(leaves)) key with offset
  join unnest(extract_values(leaves)) value with offset
  using(offset)
);
execute immediate (select '''
select * except(row_id) from (select * except(offset) from temp_table)
pivot (any_value(value) for replace(key, '.', '__') in (''' || keys_list || '''
))'''
from (select string_agg('"' || replace(key, '.', '__') || '"', ',' order by offset) keys_list from (
select key, min(offset) as offset from temp_table group by key
))
);
If applied to the sample data in your question:
create temp table your_table as (
select struct('12/2/2022 23:00:02' as value) as property_hs_email_last_click_date ,
struct(
struct('Yes' as value) as currentlyinworkflow ,
struct(1 as value, 'Thursday' as value__st) as hs_first_engagement_object_id ,
struct('Festival' as value) as first_conversion_event_name
) as properties
union all
select struct('14/2/2021 12:00:02'), struct(struct('No'), struct(5, 'Friday'), struct('Phone'))
);
the output is one row per input row, with one column per JSON leaf (for example property_hs_email_last_click_date__value, properties__currentlyinworkflow__value, and so on).
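Under the hood, the execute immediate step builds and runs a pivot statement roughly like the one below; this is only a sketch, and the exact column list depends on the keys actually found in the data:
-- generated dynamically from temp_table; column names below are derived from
-- the sample data (dots in key paths replaced with '__')
select * except(row_id) from (select * except(offset) from temp_table)
pivot (any_value(value) for replace(key, '.', '__') in (
  "property_hs_email_last_click_date__value",
  "properties__currentlyinworkflow__value",
  "properties__hs_first_engagement_object_id__value",
  "properties__hs_first_engagement_object_id__value__st",
  "properties__first_conversion_event_name__value"
))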

Snowflake get_path() or flatten() array query - to find latest key:value

I have a column 'amp' in a table 'EXAMPLE'. Column 'amp' is an array which looks like this:
[{
  "list": [{
    "element": {
      "x_id": "12356789XXX",
      "y_id": "12356789XXX38998"
    }
  },
  {
    "element": {
      "x_id": "5677888356789XXX",
      "y_id": "1XXX387688"
    }
  }]
}]
How should I query using get_path() or flatten() to extract the latest x_id and y_id values (or is there another alternative)?
In this example there are only 2 elements, but there could be anywhere from 1 to 6000 elements containing x_id and y_id.
Help much appreciated!
Someone may have a more elegant way than this, but you can use a CTE. In the first table expression, grab the max index of the flattened array. In the second part, grab the values you need.
set json = '[{"list": [{"element": {"x_id": "12356789XXX","y_id": "12356789XXX38998"}},{"element": {"x_id": "5677888356789XXX","y_id": "1XXX387688"}}]}]';
create temp table foo(v variant);
insert into foo select parse_json($json);
with
MAX_INDEX(M) as
(
select max("INDEX") MAX_INDEX
from foo, lateral flatten(v, recursive => true)
),
VALS(V, P, K) as
(
select "VALUE", "PATH", "KEY"
from foo, lateral flatten(v, recursive => true)
)
select k as "KEY", V::string as VALUE from vals, max_index
where VALS.P = '[0].list[' || max_index.m || '].element.x_id' or
VALS.P = '[0].list[' || max_index.m || '].element.y_id'
;
Assuming that the outer array ALWAYS contains a single dictionary element, you could use this:
SELECT amp[0]:"list"[ARRAY_SIZE(amp[0]:"list")-1]:"element":"x_id"::VARCHAR AS x_id
,amp[0]:"list"[ARRAY_SIZE(amp[0]:"list")-1]:"element":"y_id"::VARCHAR AS y_id
FROM T
;
Or if you prefer a bit more modularity/readability, you could use this:
WITH CTE1 AS (
SELECT amp[0]:"list" AS _ARRAY
FROM T
)
,CTE2 AS (
SELECT _ARRAY[ARRAY_SIZE(_ARRAY)-1]:"element" AS _DICT
FROM CTE1
)
SELECT _DICT:"x_id"::VARCHAR AS x_id
,_DICT:"y_id"::VARCHAR AS y_id
FROM CTE2
;
Note: I have not used FLATTEN here because I did not see a good reason to use it.
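For completeness, since the question mentions FLATTEN: a sketch that keeps only the last list element per source row using FLATTEN and QUALIFY (the table name T and variant column amp are assumed, as in the answers above):
-- flatten the inner list and keep only the highest index per source row
select f."VALUE":element:x_id::varchar as x_id,
       f."VALUE":element:y_id::varchar as y_id
from T,
     lateral flatten(input => amp[0]:"list") f
qualify row_number() over (partition by f."SEQ" order by f."INDEX" desc) = 1;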

Flatten association table to multi-value column?

I have a table with just product IDs and category IDs (products can be in more than one category). How can I flatten the category IDs into a product column so I end up with this:
id | name | desc | categories
1 | test1 | lorem | 1,3,4,23
2 | test2 | ipsom | 4,6,24
It feels like I need to loop over a separate table to build the categories column. How can I do this, or is there a better way?
I created a CLR aggregate function that takes a varchar column and returns all its values separated by commas. In other words, it joins several strings into a comma-separated list. I am sure its performance is way better than any T-SQL trick.
As any aggregate function, it can be used in combination with group by. For example:
SELECT p.id, p.name, p.[desc], dbo.JoinStrings(CONVERT(VARCHAR(20), c.category_id)) AS categories
FROM product p
INNER JOIN category_products c ON c.product_id = p.id -- join column names assumed
GROUP BY p.id, p.name, p.[desc]
Here's the C# code to create the CLR assembly for SQL Server 2008:
using System;
using System.Data;
using System.Data.SqlClient;
using System.Data.SqlTypes;
using Microsoft.SqlServer.Server;

[Serializable]
[Microsoft.SqlServer.Server.SqlUserDefinedAggregate(Format.UserDefined, IsInvariantToDuplicates = false, IsInvariantToOrder = false, IsInvariantToNulls = true, MaxByteSize = -1)]
public struct JoinStrings : IBinarySerialize
{
    private char[] sb;
    private int pos;

    public void Init()
    {
        sb = new char[512000];
        pos = 0;
    }

    public void Accumulate(SqlString Value)
    {
        if (Value.IsNull) return;
        char[] src = Value.ToString().ToCharArray();
        Array.Copy(src, 0, sb, pos, src.Length);
        pos += src.Length;
        sb[pos] = ',';
        pos++;
    }

    public void Merge(JoinStrings Group)
    {
        Accumulate(Group.Terminate());
    }

    public SqlString Terminate()
    {
        if (pos <= 0)
            return new SqlString();
        else
            return new SqlString(new String(sb, 0, pos - 1));
    }

    public void Read(System.IO.BinaryReader r)
    {
        this.Init();
        pos = r.ReadInt32();
        r.Read(sb, 0, pos);
    }

    public void Write(System.IO.BinaryWriter w)
    {
        w.Write(pos);
        w.Write(sb, 0, pos);
    }
}
Here's the code to create the function (although deploying from Visual Studio should do it automatically):
CREATE AGGREGATE [dbo].[JoinStrings]
(@s [nvarchar](4000))
RETURNS [nvarchar](max)
EXTERNAL NAME [YourAssemblyName].[JoinStrings]
There's no in-built way to do it in MSSQL.
The question "Simulating group_concat MySQL function in Microsoft SQL Server 2005?" has a good description of how to go about implementing a workaround.
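As a side note, on SQL Server 2017 and later the built-in STRING_AGG aggregate covers this directly. A minimal sketch, with the table and join column names assumed from the question:
-- concatenate category ids per product; product/category_products names assumed
SELECT p.id, p.name, p.[desc],
       STRING_AGG(CONVERT(VARCHAR(20), c.category_id), ',') AS categories
FROM product p
INNER JOIN category_products c ON c.product_id = p.id
GROUP BY p.id, p.name, p.[desc];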
I would suggest using a Recursive CTE. I believe that it would be something like this:
select productid, categoryid,
       row_number() over (partition by productid order by categoryid) as rownum
into #tabletorecurse
from TABLENAME;

with finaloutput as
(
    select productid as id, name, [desc],
           cast(categoryid as varchar(max)) as categories, rownum
    from #tabletorecurse
    join PRODUCTTABLE
      on PRODUCTTABLE.id = #tabletorecurse.productid
    where rownum = 1
    union all
    select finaloutput.id, finaloutput.name, finaloutput.[desc],
           finaloutput.categories + ', ' + cast(tr.categoryid as varchar(max)), tr.rownum
    from #tabletorecurse as tr
    join finaloutput
      on finaloutput.rownum + 1 = tr.rownum
     and finaloutput.id = tr.productid
)
select finaloutput.id, name, [desc], categories
from finaloutput
join
(
    select max(rownum) as maxrow, id
    from finaloutput
    group by id
) as maxvalues
  on maxvalues.id = finaloutput.id
 and maxvalues.maxrow = finaloutput.rownum
Use a function.
This one does a lookup to text, so you will need to adapt it.
The COALESCE is just there to add the separator.
This is from a large-scale production application - it works and it is fast.
The function was questioned by JustinPony as being slow, but I am hitting tables with millions of records and only returning 100 rows, so the function is only applied to those hundred rows.
usage:
select top 5 sID, ( select [dbo].[JoinMVEnum](docSVsys.sID, '140') ) as [Flag Issue]
from docSVsys
function
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE FUNCTION [dbo].[JoinMVText]
(
    @sID int,
    @fieldID tinyint
)
RETURNS VARCHAR(MAX)
AS
BEGIN
    DECLARE @MVtextList varchar(max)
    SELECT @MVtextList = COALESCE(@MVtextList + '; ', '') + docMVtext.value
    FROM docMVtext with (nolock)
    WHERE docMVtext.sID = @sID and fieldID = @fieldID
    RETURN @MVtextList
END
GO

How To Split Pipe-Delimited Column and insert each value into new table Once?

I have an old database with a gazillion records (more or less) that have a single tags column (with tags being pipe-delimited) that looks like so:
Breakfast
Breakfast|Brunch|Buffet|Burger|Cakes|Crepes|Deli|Dessert|Dim Sum|Fast Food|Fine Wine|Spirits|Kebab|Noodles|Organic|Pizza|Salad|Seafood|Steakhouse|Sushi|Tapas|Vegetarian
Breakfast|Brunch|Buffet|Burger|Deli|Dessert|Fast Food|Fine Wine|Spirits|Noodles|Pizza|Salad|Seafood|Steakhouse|Vegetarian
Breakfast|Brunch|Buffet|Cakes|Crepes|Dessert|Fine Wine|Spirits|Salad|Seafood|Steakhouse|Tapas|Teahouse
Breakfast|Brunch|Burger|Crepes|Salad
Breakfast|Brunch|Cakes|Dessert|Dim Sum|Noodles|Pizza|Salad|Seafood|Steakhouse|Vegetarian
Breakfast|Brunch|Cakes|Dessert|Dim Sum|Noodles|Pizza|Salad|Seafood|Vegetarian
Breakfast|Brunch|Deli|Dessert|Organic|Salad
Breakfast|Brunch|Dessert|Dim Sum|Hot Pot|Seafood
Breakfast|Brunch|Dessert|Dim Sum|Seafood
Breakfast|Brunch|Dessert|Fine Wine|Spirits|Noodles|Pizza|Salad|Seafood
Breakfast|Brunch|Dessert|Fine Wine|Spirits|Salad|Vegetarian
Is there a way one could retrieve each tag and insert it into a new table tag_id | tag_nm using MySQL only?
Here is my attempt, which uses PHP... I imagine this could be more efficient with a clever MySQL query. I've included the relationship part of it too. There's no escaping or error checking.
$rs = mysql_query('SELECT `venue_id`, `tag` FROM `venue` AS a');
while ($row = mysql_fetch_array($rs)) {
    $tag_array = explode('|', $row['tag']);
    $venueid = $row['venue_id'];
    foreach ($tag_array as $tag) {
        $rs2 = mysql_query("SELECT `tag_id` FROM `tag` WHERE tag_nm = '$tag'");
        $tagid = 0;
        while ($row2 = mysql_fetch_array($rs2)) $tagid = $row2['tag_id'];
        if (!$tagid) {
            mysql_query("INSERT INTO `tag` (`tag_nm`) VALUES ('$tag')");
            $tagid = mysql_insert_id();
        }
        mysql_query("INSERT INTO `venue_tag_rel` (`venue_id`, `tag_id`) VALUES ($venueid, $tagid)");
    }
}
After finding that there is no official split function, I solved the issue using only MySQL, like so:
First, I created the function strSplit:
CREATE FUNCTION strSplit(x varchar(21845), delim varchar(255), pos int) returns varchar(255)
return replace(
replace(
substring_index(x, delim, pos),
substring_index(x, delim, pos - 1),
''
),
delim,
''
);
Second, I inserted the new tags into my new table (real names and columns changed, to keep it simple):
INSERT IGNORE INTO tag (SELECT null, strSplit(`Tag`,'|',1) AS T FROM `old_venue` GROUP BY T)
Rinse and repeat, increasing pos by one for each position (in this case I had a maximum of 8 separators).
Third, to get the relationships:
INSERT INTO `venue_tag_rel`
(Select a.`venue_id`, b.`tag_id` from `old_venue` a, `tag` b
WHERE
(
a.`Tag` LIKE CONCAT('%|',b.`tag_nm`)
OR a.`Tag` LIKE CONCAT(b.`tag_nm`,'|%')
OR a.`Tag` LIKE CONCAT(CONCAT('%|',b.`tag_nm`),'|%')
OR a.`Tag` LIKE b.`tag_nm`
)
)
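As a side note, on MySQL 8+ the per-position repetition can be replaced with a single recursive CTE that splits the pipe-delimited column in one pass. A minimal sketch using the same old_venue/tag names from the question (assumes tag_id is auto-increment and tag_nm has a unique index for INSERT IGNORE):
-- peel off the first tag on each pass and recurse on the remainder
INSERT IGNORE INTO tag (tag_nm)
WITH RECURSIVE split AS (
  SELECT SUBSTRING_INDEX(Tag, '|', 1) AS tag_nm,
         CASE WHEN Tag LIKE '%|%'
              THEN SUBSTRING(Tag, LENGTH(SUBSTRING_INDEX(Tag, '|', 1)) + 2)
              ELSE '' END AS rest
  FROM old_venue
  UNION ALL
  SELECT SUBSTRING_INDEX(rest, '|', 1),
         CASE WHEN rest LIKE '%|%'
              THEN SUBSTRING(rest, LENGTH(SUBSTRING_INDEX(rest, '|', 1)) + 2)
              ELSE '' END
  FROM split
  WHERE rest <> ''
)
SELECT DISTINCT tag_nm FROM split;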