How to map object/json array in Snowflake SQL / DBT Macro?

id|some_attribute|json_array                                |
--+--------------+------------------------------------------+
 1|"abc"         |[ { attr: 'apple' }, { attr: 'banana' } ] |
How do I get rid of attr in json_array so that the table turns into something like the table below?
id|some_attribute|string_array          |
--+--------------+----------------------+
 1|"abc"         |[ 'apple', 'banana' ] |
The use case is the cleaning stage of the data, to make further processing and analysis simpler in later stages of the pipeline.
Thanks for the help!

One option is to FLATTEN the json array, then construct the string array from the values.
For example
WITH data AS(
SELECT 1 id, 'abc' as some_attribute
, [{ 'attr': 'apple'}, { 'attr': 'banana' } ] as json_array
)
SELECT
id
, some_attribute
, ARRAY_AGG(value:attr::string) WITHIN GROUP( ORDER BY index) as string_array
FROM
data
, TABLE(FLATTEN(input => json_array))
GROUP BY
id
, some_attribute
which returns
ID|SOME_ATTRIBUTE|STRING_ARRAY |
--+--------------+------------------+
1|abc |["apple","banana"]|
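Note that FLATTEN drops rows whose json_array is empty or NULL. If such rows exist (an assumption beyond the sample data), FLATTEN's OUTER parameter keeps them, and since ARRAY_AGG ignores NULLs those ids come back with an empty array. A minimal sketch:
WITH data AS(
SELECT 2 id, 'def' as some_attribute, [] as json_array
)
SELECT
id
, some_attribute
-- OUTER => TRUE emits a single row with NULL value/index for empty input;
-- ARRAY_AGG skips that NULL, so string_array becomes []
, ARRAY_AGG(value:attr::string) WITHIN GROUP( ORDER BY index) as string_array
FROM
data
, TABLE(FLATTEN(input => json_array, outer => TRUE))
GROUP BY
id
, some_attribute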

Another option is to create a JavaScript UDF. For example
CREATE OR REPLACE FUNCTION ARRAY_JSON_VALUES("a" ARRAY, "attr" STRING)
RETURNS ARRAY
LANGUAGE JAVASCRIPT RETURNS NULL ON NULL INPUT IMMUTABLE
AS
$$
return a.map(e => e[attr]);
$$
then
WITH data AS(
SELECT 1 id, 'abc' as some_attribute, [{ 'attr': 'apple'}, { 'attr': 'banana' } ] as json_array
)
SELECT
id
, some_attribute
, ARRAY_JSON_VALUES(json_array,'attr') as string_array
FROM
data
again returns
ID|SOME_ATTRIBUTE|STRING_ARRAY |
--+--------------+------------------+
1|abc |["apple","banana"]|
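Since the question also asks about dbt, either expression can be wrapped in a macro. A minimal sketch, assuming a Snowflake edition that supports the higher-order TRANSFORM function (the macro name json_array_to_string_array is hypothetical):
{% macro json_array_to_string_array(column_name, attr) %}
    {# map each VARIANT element of the array to the named attribute, cast to string #}
    transform({{ column_name }}, e variant -> get(e, '{{ attr }}')::string)
{% endmacro %}
which a model could then call as (my_source is a placeholder)
select id, some_attribute,
    {{ json_array_to_string_array('json_array', 'attr') }} as string_array
from my_source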

Related

How can I make the result of the ARRAY_AGG() function JSON-parsable?

I have a query that selects rows from a joined table as an array using the ARRAY_AGG() function.
select
    entity_number,
    ARRAY_AGG('{"property_id":"'||property_id||'","value":"'||value||'"}') entity_properties
from entities
join entity_properties
    on entities.id = entity_properties.entity_id
where entities.id in (
    select entity_id from entity_properties
    where value = '6258006d824a25dabdb39a79.pdf'
)
group by entities.id;
what I get is:
[
{
"entity_number":"P1718238009-1",
"entity_properties":"[
\"{\"property_id\":\"006109cd-a100-437c-a683-f13413b448e6\",\"value\":\"Rozilik berildi\"}\",
\"{\"property_id\":\"010f5e23-d66f-4414-b54b-9647afc6762b\",\"value\":\"6258006d824a25dabdb39a79.pdf\"}\",
\"{\"property_id\":\"0a01904e-1ca0-40ef-bbe1-c90eaddea3fc\",\"value\":\"6260c9e9b06e4c2cc492c470_2634467.pdf\"}\"
]"
}
]
As you can see, it is not JSON-parsable.
To parse entity_properties as an array of objects, I need the data in this format:
[
{
"entity_number":"P1718238009-1",
"entity_properties":[
{"property_id":"006109cd-a100-437c-a683-f13413b448e6","value":"Rozilik berildi"},
{"property_id":"010f5e23-d66f-4414-b54b-9647afc6762b","value":"6258006d824a25dabdb39a79.pdf"},
{"property_id":"0a01904e-1ca0-40ef-bbe1-c90eaddea3fc","value":"6260c9e9b06e4c2cc492c470_2634467.pdf"}
]
}
]
Can I achieve what I want with ARRAY_AGG()? How?
If not, what approach should I take?
Try using the json_agg and json_build_object functions, like this:
select
    entity_number,
    json_agg(json_build_object('property_id', property_id, 'value', value)) entity_properties
from entities
join entity_properties
    on entities.id = entity_properties.entity_id
where entities.id in (
    select entity_id from entity_properties
    where value = '6258006d824a25dabdb39a79.pdf'
)
group by entities.id;
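As a side note (not part of the original answer): if the column type or the downstream processing uses jsonb rather than json, the jsonb_* counterparts can be swapped in directly:
select
    entity_number,
    jsonb_agg(jsonb_build_object('property_id', property_id, 'value', value)) entity_properties
from entities
join entity_properties
    on entities.id = entity_properties.entity_id
group by entities.id;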
Using simplified sample data, this query performs the first step of the aggregation:
with tab as (
select * from (values
(1,'a','x'),
(1,'b','y'),
(2,'c','z')
) tab(entity_number,property_id,value)
)
select
entity_number,
json_agg( json_build_object('property_id', property_id, 'value', value)) entity_properties
from tab
group by 1
;
entity_number|entity_properties                                                            |
-------------+-----------------------------------------------------------------------------+
            1|[{"property_id" : "a", "value" : "x"}, {"property_id" : "b", "value" : "y"}]|
            2|[{"property_id" : "c", "value" : "z"}]                                       |
An additional aggregation returns the final JSON array:
with tab as (
select * from (values
(1,'a','x'),
(1,'b','y'),
(2,'c','z')
) tab(entity_number,property_id,value)
),
tab2 as (
select
entity_number,
json_agg( json_build_object('property_id', property_id, 'value', value)) entity_properties
from tab
group by 1
)
select
json_agg(
json_build_object(
'entity_number',
entity_number,
'entity_properties',
entity_properties
)
)
from tab2;
which returns
[
{
"entity_number": 1,
"entity_properties": [
{
"value": "x",
"property_id": "a"
},
{
"value": "y",
"property_id": "b"
}
]
},
{
"entity_number": 2,
"entity_properties": [
{
"value": "z",
"property_id": "c"
}
]
}
]
Note that I used jsonb_pretty to format the output.
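For reference, the final select could apply that formatting directly by casting the json aggregate to jsonb first (a sketch reusing the tab2 CTE from above):
select jsonb_pretty(
    json_agg(
        json_build_object(
            'entity_number', entity_number,
            'entity_properties', entity_properties
        )
    )::jsonb
)
from tab2;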

How to PARSE_JSON from Snowflake Field?

I have a table that looks like:
ID|FIELD1
1|[ { "list": [ {} ] } ]
2|[ { "list": [ { "item": "" } ] } ]
3|[ { "list": [ { "item": "Tag1" }, { "item": "Tag2" } ] } ]
And I want to get all the tags associated with this specific query, such that I can just get a list:
Tag1,Tag2
I've tried
SELECT PARSE_JSON(FIELD1[0]['list'][0]['item']) FROM MY_TABLE
WHERE PARSE_JSON(FIELD1[0]['list'][0]) != '{}'
But I get
JSON: garbage in the numeric literal: 65-310 , pos 7
How can I properly unpack these values in SQL?
UPDATE: Clumsy Solution
SELECT LISTAGG(CODES,'\',\'') AS PROMO_CODES
FROM
(SELECT DISTINCT FIELD1[0]['list'][0]['item'] AS CODES FROM MY_TABLE
WHERE FIELD1[0]['list'][0] IS NOT NULL
AND FIELD1[0]['list'][0] != '{}'
AND FIELD1[0]['list'][0]['item'] != ''
)
Please have a look at the knowledge article below to see if it helps in your case:
https://community.snowflake.com/s/article/Dynamically-extracting-JSON-using-LATERAL-FLATTEN
As far as I can see, the Clumsy Solution does not provide the correct result; it shows only Tag1. So here's my solution:
select LISTAGG( v.VALUE:item, ',' ) from MY_TABLE,
lateral flatten (parse_json(FIELD1[0]):list) v
WHERE v.VALUE:item <> '';
I would recommend adding DISTINCT to prevent duplicate tags in the output:
select LISTAGG( DISTINCT v.VALUE:item, ',' ) from MY_TABLE,
lateral flatten (parse_json(FIELD1[0]):list) v
WHERE v.VALUE:item <> '';
If there are more items in the FIELD1 array (i.e. indexes 0, 1, 2), you may use this one:
select LISTAGG( DISTINCT v.VALUE:item, ',' ) from MY_TABLE,
lateral flatten(FIELD1) f,
lateral flatten (parse_json(f.VALUE):list) v
WHERE v.VALUE:item <> '';
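One small addition (an observation, not from the original answers): v.VALUE:item is a VARIANT, so an explicit ::string cast makes both the comparison and the aggregation unambiguous:
select LISTAGG( DISTINCT v.VALUE:item::string, ',' ) from MY_TABLE,
lateral flatten (parse_json(FIELD1[0]):list) v
WHERE v.VALUE:item::string <> '';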

GET last element of array in json column of my Transact SQL table

Thanks for helping.
I have my table CONVERSATIONS structured in columns like this:
[ ID , JSON_CONTENT ]
In the column ID I have a simple id in Varchar.
In the column JSON_CONTENT I have something like this:
{
"id_conversation" : "25bc8cbffa8b4223a2ed527e30d927bf",
"exchanges": [
{
"A" : "...",
"B": "..."
},
{
"A" : "...",
"B": "..."
},
{
"A" : "...",
"Z" : "..."
}
]
}
I would like to query and get the id and the last element of exchanges:
[ ID , LAST_ELT_IN_EXCHANGE_IN_JSON_CONTENT ]
I wanted to do this:
select TOP 3 ID, JSON_QUERY(JSON_CONTENT, '$.exchange[-1]')
from CONVERSATION
But of course Transact SQL is not Python.
I saw these answers, but I don't know how to apply them to my problem.
Select last value from Json array
Thanks for helping <3
If I understand you correctly, you need an additional APPLY operator and a combination of OPENJSON() and ROW_NUMBER(). The result from the OPENJSON() call is a table with columns key, value and type; when the JSON content is an array, the key column returns the index of the element in the specified array:
Table:
SELECT ID, JSON_CONTENT
INTO CONVERSATION
FROM (VALUES
(1, '{"id_conversation":"25bc8cbffa8b4223a2ed527e30d927bf","exchanges":[{"A":"...","B":"..."},{"A":"...","B":"..."},{"A":"...","Z":"..."}]}')
) v (ID, JSON_CONTENT)
Statement:
SELECT c.ID, j.[value]
FROM CONVERSATION c
OUTER APPLY (
SELECT [value], ROW_NUMBER() OVER (ORDER BY CONVERT(int, [key]) DESC) AS rn
FROM OPENJSON(c.JSON_CONTENT, '$.exchanges')
) j
WHERE j.rn = 1
Result:
ID value
------------------------
1 {
"A" : "...",
"Z" : "..."
}
Notice that -1 is not a valid array index in your path expression, but you can access an item in a JSON array by index (e.g. '$.exchanges[2]').
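For completeness, the same result can be had without the window function by ordering inside the APPLY (a sketch against the same table):
SELECT c.ID, j.[value]
FROM CONVERSATION c
OUTER APPLY (
    -- keep only the element with the highest array index
    SELECT TOP (1) [value]
    FROM OPENJSON(c.JSON_CONTENT, '$.exchanges')
    ORDER BY CONVERT(int, [key]) DESC
) j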

Postgresql search if exists in nested jsonb

I'm new to jsonb queries and I've run into a problem. Inside an 'Items' table, I have an 'id' column and a 'data' jsonb column. Here is what the data can look like:
[
{
"paramId": 3,
"value": "dog"
},
{
"paramId": 4,
"value": "cat"
},
{
"paramId": 5,
"value": "fish"
},
{
"paramId": 6,
"value": "",
"fields": [
{
"paramId": 3,
"value": "cat"
},
{
"paramId": 4,
"value": "dog"
}
]
},
{
"paramId": 6,
"value": "",
"fields": [
{
"paramId": 5,
"value": "cat"
},
{
"paramId": 3,
"value": "dog"
}
]
}
]
The value in data is always an array of objects, but sometimes an object can have a 'fields' value with objects inside. It is at most one level deep.
How can I select the id of the items which have, for example, an object containing "paramId": 3 and "value": "cat", and also an object with "paramId": 5 and "value" LIKE '%ish%'?
I have already found a way to do that when the objects are at level 0:
SELECT i.*
FROM items i
JOIN LATERAL jsonb_array_elements(i.data) obj3(val) ON obj3.val->>'paramId' = '3'
JOIN LATERAL jsonb_array_elements(i.data) obj5(val) ON obj5.val->>'paramId' = '5'
WHERE obj3.val->>'value' = 'cat'
AND obj5.val->>'value' LIKE '%ish%';
but I don't know how to search inside the fields array when fields exists.
Thank you in advance for your help.
EDIT:
It looks like my question was not clear. I will try to make it clearer.
What I want to do is find all the 'items' whose 'data' column contains objects matching my search criteria, regardless of whether those objects are at the first level or inside a 'fields' key of an object.
Again, for example, this record should be selected if I search for:
'paramId': 3 AND 'value': 'cat'
'paramId': 4 AND 'value' LIKE '%og%'
The matching objects are inside the 'fields' key of the objects with 'paramId': 6, and I don't know how to reach them.
This can be expressed using a JSON path expression, without the need to unnest everything.
To search for paramId = 3 and value = 'cat'
select *
from items
where data @? '$[*] ? ( (@.paramId == 3 && @.value == "cat") || exists( @.fields[*] ? (@.paramId == 3 && @.value == "cat")) )'
The $[*] part iterates over all elements of the first-level array. To check the elements in the fields array, the exists() operator is used to nest the expression: @.fields[*] iterates over all elements in the fields array and applies the same expression again. I don't see a way to avoid repeating the values, though.
For a "like" condition, you can use like_regex:
select *
from items
where data @? '$[*] ? ( (@.paramId == 4 && @.value like_regex ".*og.*") || exists( @.fields[*] ? (@.paramId == 4 && @.value like_regex ".*og.*")) )'
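To require both of the question's criteria at once, the two matches can simply be combined with AND (same operators as above):
select *
from items
where data @? '$[*] ? ( (@.paramId == 3 && @.value == "cat") || exists( @.fields[*] ? (@.paramId == 3 && @.value == "cat")) )'
and data @? '$[*] ? ( (@.paramId == 4 && @.value like_regex ".*og.*") || exists( @.fields[*] ? (@.paramId == 4 && @.value like_regex ".*og.*")) )'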
For now I have found a solution, but it is not really clean and I don't know how it will perform in production with 10M records.
SELECT i.id, i.data
FROM ( -- A;
select it.id, it.data, i as value
from items it,
jsonb_array_elements(it.data) i
union
select it.id, it.data, f as value
from items it,
jsonb_array_elements(it.data) i,
jsonb_array_elements(i -> 'fields') f
) as i
WHERE (i.value ->> 'paramId' = '5' -- B1;
AND i.value ->> 'value' LIKE '%ish%')
OR (i.value ->> 'paramId' = '3' -- B2;
AND i.value ->> 'value' = 'cat')
group by i.id, i.data
having COUNT(*) >= 2; -- C;
A: I "flatten" the first and second level (second level is in 'fields' key)
B1, B2: These are my search criteria
C: I make sure all the criteria match; with 3 criteria it would be COUNT(*) >= 3
It really doesn't look clean to me. It works for dev purposes, but I think there is a better way to do it.
If somebody has an idea, big thanks to them!
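One possible cleaner shape (a sketch, untested at the 10M-record scale): write each criterion as its own EXISTS over a single flattening that left-joins the optional fields level, which avoids the union and the GROUP BY/HAVING counting:
SELECT i.id, i.data
FROM items i
WHERE EXISTS (
    SELECT 1
    FROM jsonb_array_elements(i.data) e
    LEFT JOIN LATERAL jsonb_array_elements(e -> 'fields') f ON true
    WHERE (e ->> 'paramId' = '3' AND e ->> 'value' = 'cat')
        OR (f ->> 'paramId' = '3' AND f ->> 'value' = 'cat')
)
AND EXISTS (
    SELECT 1
    FROM jsonb_array_elements(i.data) e
    LEFT JOIN LATERAL jsonb_array_elements(e -> 'fields') f ON true
    WHERE (e ->> 'paramId' = '5' AND e ->> 'value' LIKE '%ish%')
        OR (f ->> 'paramId' = '5' AND f ->> 'value' LIKE '%ish%')
);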

SQL query to match and return all occurrences of search string

I have a JSON document in a column (record) within a table (TABLE), as below. I need to write a SQL query that returns all occurrences of the values of fields "a", "b", "k" within aaagroup.
The result should be:
NAME1 age1 comment1
NAME2 age2
NAME3 comment3
JSON data:
{
"reportfile": {
"aaa": {
"aaagroup": [{
"a": "NAME1",
"b": "age1",
"k": "comment1"
},
{
"a": "NAME2",
"b": "age2"
},
{
"a": "NAME3",
"k": "comment3"
}]
},
"dsa": {
"dsagroup": [{
"j": "Name"
},
{
"j": "Title"
}]
}
}
}
I used the query below for a single occurrence:
Data:
{"reportfile":{"aaa":{"aaagroup":[{"a":"NAME1","k":"age1"}]},"dsa":{"dsagroup":[{"j":"USERNAME"}],"l":"1","m":"1"}}}
Query:
select
    substr(cc.BUS_NME, 1, strpos(cc.BUS_NME, '"') - 1) as BUS_NME,
    substr(cc.AGE, 1, strpos(cc.AGE, '"') - 1) as AGE
from
    (select substr(bb.aaa, strpos(bb.aaa, '"a":"') + 5) as BUS_NME,
            substr(bb.aaa, strpos(bb.aaa, '"k":"') + 5) as AGE
     from
        (select substr(aa.G, strpos(aa.G, '"aaagroup'), strpos(aa.G, '},')) as aaa
         from
            (select substr(record, strpos(record, '"aaagroup')) as G
             from TABLE) aa) bb) cc
If I am getting your question correctly, you will have an external table like this, and you can try the query below to get the desired result from the external table.
Sample external table:
CREATE EXTERNAL TABLE Ext_JSON_data(
reportfile string
)
ROW FORMAT SERDE
'org.openx.data.jsonserde.JsonSerDe'
WITH SERDEPROPERTIES (
'serialization.format' = '1'
)
LOCATION
's3://bucket/folder/'
Query to fetch the desired result:
WITH the_table AS (
SELECT CAST(social AS MAP(VARCHAR, JSON)) AS social_data
FROM (
VALUES
(JSON '{"aaa": {"aaagroup": [{"a": "NAME1","b": "age1","k": "comment1"},{"a": "NAME2","b": "age2"},{"a": "NAME3","k": "comment3"}]},"dsa": {"dsagroup": [{"j": "Name"},{"j": "Title"}]}}')
) AS t (social)
),
cte_first_level as
(
SELECT
first_level_key
,CAST(first_level_value AS MAP(VARCHAR, JSON))As first_level_value
FROM the_table
CROSS JOIN UNNEST (social_data) AS t (first_level_key, first_level_value)
),
cte_second_level as
(
Select
first_level_key
,SECOND_level_key
,SECOND_level_value
from
cte_first_level
CROSS JOIN UNNEST (first_level_value) AS t (SECOND_level_key, SECOND_level_value)
)
SELECT
first_level_key
,SECOND_level_key
,SECOND_level_value
,items
,element_at(items, 'a') value_of_a -- element_at returns NULL when the key is absent, unlike items['a']
,element_at(items, 'b') value_of_b
,element_at(items, 'k') value_of_k
from
cte_second_level
cross join unnest(cast(json_extract(SECOND_level_value, '$') AS ARRAY<MAP<VARCHAR, VARCHAR>>)) t (items)
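A shorter variation reaches aaagroup in one step (a sketch against the same external table; element_at returns NULL for a missing key such as "b" in the NAME3 row, where map subscripting would error):
SELECT
    element_at(items, 'a') AS value_of_a,
    element_at(items, 'b') AS value_of_b,
    element_at(items, 'k') AS value_of_k
FROM Ext_JSON_data
CROSS JOIN UNNEST(
    CAST(json_extract(reportfile, '$.aaa.aaagroup') AS ARRAY(MAP(VARCHAR, VARCHAR)))
) AS t (items)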