dynamic split json string in bigquery [duplicate]

I have loaded an entire JSON file into a STRING column of a BigQuery table. Now I am trying to access the keys using the JSON_EXTRACT_SCALAR function, but I get a null result for the child keys that contain the special character period (".") in their name.
Here's the snippet of the data:
{"server_received_time":"2019-01-17 15:00:00.482000","app":161,"device_carrier":null,"$schema":12,"city":"Caro","user_id":null,"uuid":"9018","event_time":"2019-01-17 15:00:00.045000","platform":"Web","os_version":"49","vendor_id":711,"processed_time":"2019-01-17 15:00:00.817195","user_creation_time":"2018-11-01 19:16:34.971000","version_name":null,"ip_address":null,"paying":null,"dma":null,"group_properties":{},"user_properties":{"location.radio":"ca","vendor.userTier":"free","vendor.userID":"a989","user.id":"a989","user.tier":"free","location.region":"ca"},"client_upload_time":"2019-01-17 15:00:00.424000","$insert_id":"e8410","event_type":"LOADED","library":"amp\/4.5.2","vendor_attribution_ids":null,"device_type":"Mac","device_manufacturer":null,"start_version":null,"location_lng":null,"server_upload_time":"2019-01-17 15:00:00.493000","event_id":64,"location_lat":null,"os_name":"Chrome","vendor_event_type":null,"device_brand":null,"groups":{},"event_properties":{"content.authenticated":false,"content.subsection1":"regions","custom.DNT":true,"content.subsection2":"ca","referrer.url":"","content.url":"","content.type":"index","content.title":"","custom.cookiesenabled":true,"app.pillar":"feed","content.area":"news","app.name":"oc"},"data":{},"device_id":"","language":"English","device_model":"Mac","country":"","region":"","is_attribution_event":false,"adid":null,"session_id":15,"device_family":"Mac","sample_rate":null,"idfa":null,"client_event_time":"2019-01-17 14:59:59.987000"}
{"server_received_time":"2019-01-17 15:00:00.913000","app":161,"device_carrier":null,"$schema":12,"city":"Fo","user_id":null,"uuid":"9052","event_time":"2019-01-17 15:00:00.566000","platform":"Web","os_version":"71","vendor_id":797,"processed_time":"2019-01-17 15:00:01.301936","user_creation_time":"2019-01-17 15:00:00.566000","version_name":null,"ip_address":null,"paying":null,"dma":"CO","group_properties":{},"user_properties":{"user.tier":"free"},"client_upload_time":"2019-01-17 15:00:00.157000","$insert_id":"69ae","event_type":"START WEB SESSION","library":"amp\/4.5.2","vendor_attribution_ids":null,"device_type":"Android","device_manufacturer":null,"start_version":null,"location_lng":null,"server_upload_time":"2019-01-17 15:00:00.925000","event_id":1,"location_lat":null,"os_name":"Chrome Mobile","vendor_event_type":null,"device_brand":null,"groups":{},"event_properties":{"content.subsection3":"home","content.subsection2":"archives","content.title":"","content.keywords.subject":["Lifestyle\/Recreation and leisure\/Outdoor recreation\/Boating","Lifestyle\/Relationships\/Couples","General news\/Weather","Oddities"],"content.publishedtime":154687,"app.name":"oc","referrer.url":"","content.subsection1":"archives","content.url":"","content.authenticated":false,"content.keywords.location":["Ot"],"content.originaltitle":"","content.type":"story","content.authors":["Archives"],"app.pillar":"feed","content.area":"news","content.id":"1.49","content.updatedtime":1546878600538,"content.keywords.tag":["24 1","boat house","Ot","Rockcliffe","River","m"],"content.keywords.person":["Ber","Shi","Jea","Jean\u00e9tien"]},"data":{"first_event":true},"device_id":"","language":"English","device_model":"Android","country":"","region":"","is_attribution_event":false,"adid":null,"session_id":15477,"device_family":"Android","sample_rate":null,"idfa":null,"client_event_time":"2019-01-17 14:59:59.810000"}
{"server_received_time":"2019-01-17 15:00:00.913000","app":16,"device_carrier":null,"$schema":12,"city":"","user_id":null,"uuid":"905","event_time":"2019-01-17 15:00:00.574000","platform":"Web","os_version":"71","vendor_id":7973,"processed_time":"2019-01-17 15:00:01.301957","user_creation_time":"2019-01-17 15:00:00.566000","version_name":null,"ip_address":null,"paying":null,"dma":"DCO","group_properties":{},"user_properties":{"user.tier":"free"},"client_upload_time":"2019-01-17 15:00:00.157000","$insert_id":"d045","event_type":"LOADED","library":"am-js\/4.5.2","vendor_attribution_ids":null,"device_type":"Android","device_manufacturer":null,"start_version":null,"location_lng":null,"server_upload_time":"2019-01-17 15:00:00.925000","event_id":2,"location_lat":null,"os_name":"Chrome Mobile","vendor_event_type":null,"device_brand":null,"groups":{},"event_properties":{"content.subsection3":"home","content.subsection2":"archives","content.subsection1":"archives","content.keywords.subject":["Lifestyle\/Recreation and leisure\/Outdoor recreation\/Boating","Lifestyle\/Relationships\/Couples","General news\/Weather","Oddities"],"content.type":"story","content.keywords.location":["Ot"],"app.pillar":"feed","app.name":"oc","content.authenticated":false,"custom.DNT":false,"content.id":"1.4","content.keywords.person":["Ber","Shi","Jea","Je\u00e9tien"],"content.title":"","content.url":"","content.originaltitle":"","custom.cookiesenabled":true,"content.authors":["Archives"],"content.publishedtime":1546878600538,"referrer.url":"","content.area":"news","content.updatedtime":1546878600538,"content.keywords.tag":["24 1","boat house","O","Rockcliffe","River","pr"]},"data":{},"device_id":"","language":"English","device_model":"Android","country":"","region":"","is_attribution_event":false,"adid":null,"session_id":1547737199081,"device_family":"Android","sample_rate":null,"idfa":null,"client_event_time":"2019-01-17 14:59:59.818000"}
Here's the sample query against the table:
SELECT
CAST(JSON_EXTRACT_SCALAR(data,'$.uuid') AS INT64) AS uuid_id,
CAST(JSON_EXTRACT_SCALAR(data,'$.event_time') AS TIMESTAMP) AS event_time,
JSON_EXTRACT_SCALAR(data,'$[event_properties].app.name') AS app_name,
JSON_EXTRACT_SCALAR(data,'$[user_properties].user.tier') AS user_tier
FROM
mytable
The above query returns null for the app_name and user_tier columns even though data exists for them.
Following the BigQuery JSON functions documentation (JSON Functions in Standard SQL), which says:
In cases where a JSON key uses invalid JSONPath characters, you can escape those characters using single quotes and brackets, [' '].
and running the query as:
SELECT
CAST(JSON_EXTRACT_SCALAR(data,"$.uuid_id")AS INT64) AS uuid_id,
CAST(JSON_EXTRACT_SCALAR(data,"$.event_time") AS TIMESTAMP) AS event_time,
JSON_EXTRACT_SCALAR(data,"$.event_properties.['app.name']") AS app_name,
JSON_EXTRACT_SCALAR(data,"$.user_properties.['user.tier']") AS user_tier
FROM
mytable
results in the following error:
Invalid token in JSONPath at: .['app.name']
Please advise. What am I missing here?

You have an extra . before the [. Use
"$.event_properties['app.name']"

Related

How to get second last key from JSON in Amazon Athena?

I want to get the second-to-last key from my JSON. I have tried
SELECT json_extract_scalar(json_column, '$[-1]') AS last_key
FROM table_name;
but it is giving the following error
awsathena error: INVALID_FUNCTION_ARGUMENT: Invalid JSON path: '$[-1]'
Using the key name directly, however, gives me the correct output:
SELECT json_extract_scalar(json_column, '$.status') AS last_key
FROM table_name;
json_extract_scalar has limited JSONPath support (and $[-1] does not look like a valid JSONPath for accessing a JSON property anyway). Depending on your Athena version, you can work around this with a cast to map (something along these lines):
SELECT element_at(m, cardinality(m)) AS last_key -- cardinality(m) - 1 would address the second-to-last entry
FROM (SELECT map_values(cast(json_column as map(varchar, json))) AS m
FROM table_name);
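If it is the key name itself (rather than its value) that you are after, the same approach should work with map_keys instead; a sketch under that assumption, with cardinality(k) - 1 picking the second-to-last key:
SELECT element_at(k, cardinality(k) - 1) AS second_last_key
FROM (SELECT map_keys(cast(json_column as map(varchar, json))) AS k
FROM table_name);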
Also note that JSON objects are unordered by nature, so relying on the actual property order can be a questionable choice.

extract values inside an array column in amazon athena

I have a table in AWS Athena where the column 'metadata_stopinfo' is an array of rows (the full structure is visible in the error message below). I am trying to extract values that are inside that array; however, when I try
SELECT
"json_extract_scalar"(metadata_stopinfo, '$.city')
FROM "table"
I have the following problem
SYNTAX_ERROR: line 2:5: Unexpected parameters (array(row("address" row("addressline" varchar,"city" varchar,"countrycode" varchar,"countrycodeoriginal" varchar,"state" varchar,"zipcode" varchar),"carrierreference" varchar,"contacts" array(row("contacttype" varchar,"email" varchar,"fax" varchar,"mobilephone" varchar,"name" varchar,"officephone" varchar,"userid" varchar)),"containerinfo" array(row("containerid" varchar,"containeridtype" varchar,"equipmentcode" varchar,"equipmenttype" varchar)),"conveyancelinenumber" varchar,"conveyancetype" varchar,"conveyancetypeoriginal" varchar,"dateinfo" row("arrivalestimateddate" varchar,"arrivalestimateddateend" varchar,"arrivalestimatedendoffset" varchar,"arrivalestimatedoffset" varchar,"arrivalrequesteddate" varchar,"deliveryestimateddate" varchar,"deliveryestimateddateend" varchar,"deliveryestimatedendoffset" varchar,"deliveryestimatedoffset" varchar,"deliveryrequesteddate" varchar,"deliveryrequesteddateend" varchar,"deliveryrequestedendoffset" varchar,"deliveryrequestedoffset" varchar,"departureestimateddate" varchar,"departureestimateddateend" varchar,"departureestimatedendoffset" varchar,"departureestimatedoffset" varchar,"departurerequesteddate" varchar,"pickuprequesteddate" varchar,"pickuprequesteddateend" varchar,"pickuprequestedendoffset" varchar,"pickuprequestedoffset" varchar,"pickupestimateddate" varchar,"pickupestimateddateend" varchar,"pickupestimatedendoffset" varchar,"pickupestimatedoffset" varchar),"deliverynotenumber" varchar,"instructions" array(row("customerspecificsubtype" varchar,"header" boolean,"instructionsubtype" varchar,"instructiontype" varchar,"text" varchar)),"locationid" varchar,"partnercarrieraddress" row("addressline" varchar,"city" varchar,"countrycode" varchar,"countrycodeoriginal" varchar,"state" varchar,"zipcode" varchar),"partnercarriercontacts" array(row("contacttype" varchar,"email" varchar,"fax" varchar,"name" varchar,"officephone" varchar)),"partnercarrierid" varchar,"partnercarriername" varchar,"partnerid" varchar,"partnername" varchar,"partnertimezone" varchar,"partnertype" varchar,"productquantity" row("number" double,"originalunitofmeasure" varchar,"quantitytype" varchar,"unitofmeasure" varchar),"sequencenumber" bigint,"shipmentidentifier" varchar,"stoptype" varchar,"transportinfo" row("description" varchar,"transportcode" varchar,"transportoriginalcode" varchar),"vesselinfo" row("lloydsnumber" varchar,"shipsradiocallnumber" varchar,"vesselname" varchar,"vesselnumber" varchar,"voyagetripnumber" varchar))), varchar(6)) for function json_extract_scalar. Expected: json_extract_scalar(varchar(x), JsonPath) , json_extract_scalar(json, JsonPath)
My question is: how can I extract values inside the column?
json_extract_scalar, unsurprisingly, works with JSON (note that even if your data were in JSON format, json_extract_scalar(metadata_stopinfo, '$.city') still would not have worked, because your data is an array), while your column contains arrays of rows, so you need to work with it accordingly. For example, you can use indexes to access elements of the array (in Presto, array indexes start from 1):
SELECT
metadata_stopinfo[1] r
FROM "table"
And then access the fields:
The fields may be of any SQL type, and are accessed with the field reference operator .
SELECT
metadata_stopinfo[1].address.city city -- per the schema in the error message, city lives inside the nested address row
FROM "table"
You can also flatten the array with unnest:
SELECT r.address.city
FROM "table",
unnest(metadata_stopinfo) as t(r)
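Nested arrays inside each row can be unnested the same way. For example, assuming the schema from the error message, pairing each stop's city with its contacts' email addresses could look like this sketch:
SELECT r.address.city, c.email
FROM "table"
CROSS JOIN unnest(metadata_stopinfo) AS t(r) -- one output row per stop
CROSS JOIN unnest(r.contacts) AS u(c)        -- one output row per contact within each stop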

How to resolve this sql error of schema_of_json

I need to find out the schema of a given JSON file. I see Spark SQL has a schema_of_json function,
and something like this works flawlessly
> SELECT schema_of_json('[{"col":0}]');
ARRAY<STRUCT<`col`: BIGINT>>
But if I pass one of my table's columns, it gives me the following error:
>SELECT schema_of_json(Transaction) as json_data from table_name;
Error in SQL statement: AnalysisException: cannot resolve 'schemaofjson(`Transaction`)' due to data type mismatch: The input json should be a string literal and not null; however, got `Transaction`.; line 1 pos 7;
Transaction is one of the columns in my table, and after checking it manually I can attest that it is of string type (containing JSON).
The SQL statement is supposed to give me the schema of the JSON; how do I do it?
After looking further into the documentation, it is clear that the word foldable means static: the function needs a literal, so a JSON column from a table won't work.
For a minimal reproducible example, here you go:
SELECT schema_of_json(CAST('{ "a": "b" }' AS STRING))
As soon as the cast is introduced in the above statement, schema_of_json will fail. It needs a static JSON as its input.
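A pragmatic workaround (my suggestion, not from the original answer): sample one value out of the column manually, then paste it back into schema_of_json as a literal. The sampled JSON below is just the placeholder from the earlier example:
-- Step 1: fetch one sample JSON value from the column
SELECT Transaction FROM table_name LIMIT 1;
-- Step 2: paste the sampled text in as a static string literal
SELECT schema_of_json('[{"col":0}]');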

Fetching attribute from JSON string with JSON_VAL causes "<attribute> is invalid in the used context" error

A proprietary third-party application stores JSON strings in its database like this one:
{"state":"complete","timestamp":1614776473000}
I need the timestamp, and found out that DB2 offers JSON functions. Since the JSON is stored as a string in the PROF_VALUE column, I guess that converting with SYSTOOLS.JSON2BSON is required before I can use JSON_VAL to fetch the timestamp:
SELECT SYSTOOLS.JSON_VAL(SYSTOOLS.JSON2BSON(PROF_VALUE), "timestamp", "f")
FROM EMPINST.PROFILE_EXTENSIONS ext
WHERE PROF_PROPERTY_ID = 'touchpointState'
This causes an error that timestamp is invalid in the used context (SQLCODE=-206, SQLSTATE=42703, DRIVER=4.26.14). The same error is thrown when I remove the JSON2BSON call, like this:
SELECT SYSTOOLS.JSON_VAL(PROF_VALUE, "timestamp", "f")
The following are also not working, with the same error (different data types):
SELECT SYSTOOLS.JSON_VAL(SYSTOOLS.JSON2BSON(PROF_VALUE), "state", "s:1000")
SELECT SYSTOOLS.JSON_VAL(PROF_VALUE, "state", "s:1000")
I don't understand this error. My syntax follows the documented JSON_VAL(json-value, search-string, result-type) and is the same as in the examples, where they show how to fetch the name field of an object.
I also played around a bit with JSON_TABLE to use raw input data for testing (instead of the database data), but it does not seem suitable for that.
SELECT *
FROM TABLE(SYSTOOLS.JSON_TABLE( SYSTOOLS.JSON2BSON('{"state":"complete","timestamp":1614776473000}'), 'state','s:32')) DATA
This gave me a table with one row: Type = 2 and Value = complete.
I had two problems in my query. First, it seems that double quotes (") are for object references, while string arguments need single quotes ('). I wasn't aware that there is any difference, because in most databases I have used so far, single quotes ' and double quotes " are interchangeable.
The second problem is that JSON_VAL needs to be called without the SYSTOOLS qualifier, but the qualifier is still needed on SYSTOOLS.JSON2BSON(PROF_VALUE).
With those changes, the following query worked:
SELECT JSON_VAL(SYSTOOLS.JSON2BSON(PROF_VALUE), 'timestamp', 'f')
FROM EMPINST.PROFILE_EXTENSIONS ext
WHERE PROF_PROPERTY_ID = 'touchpointState'
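Applying the same two fixes to the state lookup from the question should work likewise (untested sketch):
SELECT JSON_VAL(SYSTOOLS.JSON2BSON(PROF_VALUE), 'state', 's:1000')
FROM EMPINST.PROFILE_EXTENSIONS ext
WHERE PROF_PROPERTY_ID = 'touchpointState'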

Parsing JSON file in Snowflake Database

Database: SNOWFLAKE
My table contains JSON data for example:
{
"bucket":"IN_Apps",
"bySeqno":56,
"cas":1527639206906626048,
"content":"eyJoaWdoQmluIjoiNTQ4NTA4MDkiLCJkb2N1bWVudFR5cGUiOiJJSU5ETyIsImNhcmRUeXBlIyayI6Ik1BU1RFUkNBUkQifQ==",
"event":"mutation",
"expiration":0,
"flags":33554432,
"key":"iin54850809",
"lockTime":0,
"partition":948,
"revSeqno":1,
"vBucketUuid":137987627737694
}
When I tried to parse it:
select
parse_json:bucket::string as bucket,
parse_json:bySeqno::string as bySeqno,
parse_json:cas::int as cas,
parse_json:content::string as content,
parse_json:event::string as event,
parse_json:expiration::string as expiration,
parse_json:flags::string as flags,
parse_json:key::string as key,
parse_json:lockTime::string as lockTime,
parse_json:partition::string as partition,
parse_json:revSeqno::string as revSeqno,
parse_json:vBucketUuid::string as vBucketUuid
from STG_YS_APPS v
but it throws an error like:
SQL compilation error: error line 2 at position 0 invalid identifier 'PARSE_JSON'
Can someone please help me?
Answer with a known schema
Update: Since you provided the schema, which shows a VAR column of VARIANT type, here's what you need; it couldn't be simpler:
select
var:bucket::string as bucket,
var:bySeqno::string as bySeqno,
var:cas::int as cas
...
from STG_YS_APPS v
Below is the answer from before the schema was known.
I'll assume you have a VARCHAR (or similar) column in your table that is called json, and stores the values you presented. You didn't provide the schema, so please adjust the column name as necessary.
You're not using PARSE_JSON as a function in your SQL. You should write something like
select
parse_json(json):bucket::string as bucket,
parse_json(json):bySeqno::string as bySeqno,
parse_json(json):cas::int as cas
...
from STG_YS_APPS v
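As a small refinement (my sketch, not part of the original answer), you can parse the JSON once in a subquery and reuse the resulting VARIANT, rather than calling PARSE_JSON for every column; the json column name is the same assumption as above:
select
j:bucket::string as bucket,
j:bySeqno::string as bySeqno,
j:cas::int as cas
from (select parse_json(json) as j from STG_YS_APPS) v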