Select node value from JSON column type - SQL

I have a table called raw_data with three columns: ID, timestamp, and payload. The payload column is a JSON type holding values such as:
{
"data": {
"author_id": "1461871206425108480",
"created_at": "2022-08-17T23:19:14.000Z",
"geo": {
"coordinates": {
"type": "Point",
"coordinates": [
-0.1094,
51.5141
]
},
"place_id": "3eb2c704fe8a50cb"
},
"id": "1560043605762392066",
"text": " ALWAYS # London, United Kingdom"
},
"matching_rules": [
{
"id": "1560042248007458817",
"tag": "london-paris"
}
]
}
From this I want to select rows where the coordinates are available, such as [-0.1094, 51.5141] in this case. My attempt:
SELECT *
FROM raw_data, json_each(payload)
WHERE json_extract(json_each.value, '$.data.geo.') IS NOT NULL
LIMIT 20;
Nothing was returned.
EDIT
Not all JSON objects have the coordinates node. For example, this value:
{
"data": {
"author_id": "1556031969062010881",
"created_at": "2022-08-18T01:42:21.000Z",
"geo": {
"place_id": "006c6743642cb09c"
},
"id": "1560079621017796609",
"text": "Dear Desperate sister say husband no dey oo."
},
"matching_rules": [
{
"id": "1560077018183630848",
"tag": "kaduna-kano-katsina-dutse-zaria"
}
]
}

The correct path is '$.data.geo.coordinates.coordinates' and there is no need for json_each():
SELECT *
FROM raw_data
WHERE json_extract(payload, '$.data.geo.coordinates.coordinates') IS NOT NULL;
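If you also want the longitude and latitude as separate columns, SQLite's JSON path syntax supports array indexing. A minimal sketch against the same table (the aliases are illustrative, and [0]/[1] assume the longitude-first ordering shown in the payload):
SELECT
  ID,
  -- index [0] is longitude and [1] is latitude in this payload's ordering
  json_extract(payload, '$.data.geo.coordinates.coordinates[0]') AS longitude,
  json_extract(payload, '$.data.geo.coordinates.coordinates[1]') AS latitude
FROM raw_data
WHERE json_extract(payload, '$.data.geo.coordinates.coordinates') IS NOT NULL;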

Related

Extract array from varchar in PrestoSQL

I have a VARCHAR field like this:
[
{
"config": 0,
"type": "0
},
{
"config": x,
"type": "1"
},
{
"config": "",
"type": ""
},
{
"config": [
{
"address": {},
"category": "",
"merchant": {
"data": [
10,12,23
],
"file": 0
},
"range_id": 1,
"shop_id_info": null
}
],
"type": "new"
}
]
And I need to extract the merchant data from this. The desired output is:
10
12
23
Please advise. I keep getting: Cannot cast VARCHAR to array/unnest type VARCHAR.
You can try using the json path $.*.config.*.merchant.data.*, but if it does not work for you (as it didn't for me on Athena, where arrays in json paths are not well supported), you can cast your JSON to ARRAY(JSON) and do some manipulations from there (I needed to fix your JSON a little bit):
Test data:
WITH dataset AS (
SELECT * FROM (VALUES
(JSON '[
{
"config": {},
"type": "0"
},
{
"config": "x",
"type": "1"
},
{
"config": "",
"type": ""
},
{
"config": [
{
"address": {},
"category": "",
"merchant": {
"data": [
10,12,23
],
"file": 0
},
"range_id": 1,
"shop_id_info": null
}
],
"type": "new"
}
]')
) AS t (json_value))
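For completeness, the plain json path attempt against this test data would look like the sketch below; whether it returns anything depends on how well your engine's JSONPath implementation supports wildcards (it did not on my Athena version):
-- hypothetical: relies on wildcard support in the json path
SELECT json_extract(json_value, '$.*.config.*.merchant.data.*')
FROM dataset;
And query: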
And query:
SELECT flatten(
transform(
flatten(
transform(
CAST(json_value AS ARRAY(JSON))
, json_object -> try(CAST(json_extract(json_object, '$.config') AS ARRAY(JSON))))),
json_config -> CAST(json_extract(json_config, '$.merchant.data') as ARRAY(INTEGER))))
FROM dataset
Which will give you an array of numbers:
_col0
[10, 12, 23]
And from there you can continue with unnest and so on if needed.
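For instance, a sketch of that unnest step, reusing the same expression wrapped in a subquery:
SELECT n
FROM (
  SELECT flatten(
    transform(
      flatten(
        transform(
          CAST(json_value AS ARRAY(JSON)),
          json_object -> try(CAST(json_extract(json_object, '$.config') AS ARRAY(JSON))))),
      json_config -> CAST(json_extract(json_config, '$.merchant.data') AS ARRAY(INTEGER)))) AS arr
  FROM dataset
)
CROSS JOIN UNNEST(arr) AS t (n);
This returns 10, 12 and 23 as separate rows.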

How to avoid duplicated data entries after parsing JSON in Kusto?

I have the following sample JSON data.
{
"data": {
"type": "ABC",
"id": "17495500314",
"attributes": {
[!["event": "update",
"gps_vali][1]][1]d": true,
"gps": {
"distance_diff": 6.48,
"total_distance": 848.6
},
"hdop": 79,
"fuel_level": 46.8,
"total_fuel_used": 60443.9,
"location": {
"latitude": 411.372618,
"longitude": -1.254931,
"relative_position": {
"distance": "37",
}
},
"idle_periods": []
},
"relationships": {
"assets": {
"data": [
{
"type": "ABCDFTTG",
"id": "1589799143500003",
"attributes": {
"external_id": "ABCDFTTG",
"hardware_id": "ABCDFTTG"
}
}
]
},
"devices": {
"data": [
{
"type": "ABCDFTTG",
"id": "1585231172900341",
"attributes": {
"serial": "5572016191"
}
},
{
"type": "tablet",
"id": "1587893062600175",
"attributes": {
"serial": "ABCDFTTG"
}
}
]
},
"users": {
"data": [
{
"type": "user",
"id": "ABCDFTTG",
"attributes": {
"external_id": "ABCDFTTG"
}
}
]
}
}
},
"meta": {
"message_id": "11eb-8c75-0b3f87aedbb5",
"consumer_version": "1.2.0",
"origin_version": null,
"timestamp": "2021-06-14T17:42:29Z"
}
}
I want only one row instead of these two. Here is my Kusto query, which parses the JSON data into table columns.
Test
|where messageId =="123"
//|mv-expand message=message.data.attributes
|mv-expand message
|mv-expand Value=message.data.relationships.assets.['data']
|mv-expand value_devices=message.data.relationships.devices.['data']
|mv-expand value_user=message.data.relationships.users.['data']
| project type=message.data.type,id=message.data.id,
event=tostring(message.data.attributes.event),
logged_at=tostring(message.data.attributes.logged_at),
distance=toint(message.data.attributes.location.relative_position.distance),
// Value=message.data.relationships.assets.['data'],//.['data']
type_asset=Value.type,asset_id=Value.id,
device_type=value_devices.type,device_id=value_devices.id,
device_attr_serial=value_devices.attributes.serial,
user_type=value_user.type,user_id=value_user.id,
user_external_id=value_user.attributes.external_id
The duplicate row appeared after I added the users tag; that tag is an array, so how do I handle this array when it holds a single id? After parsing the JSON I get two rows where I expect one; the difference shows up in the device_type and device_id columns.

Flatten and reconstruct JSON in Snowflake

I am still learning Snowflake, any help would be really appreciated.
I have a column, let's call it 'result'.
{
"catalog": [
{
"img_href": "https://schumacher-webassets.s3.amazonaws.com/Web%20Catalog-600/179361.jpg",
"name": "ADITI HAND BLOCKED PRINT",
"price": 16
},
{
"img_href": "https://schumacher-webassets.s3.amazonaws.com/Web%20Catalog-600/179330.jpg",
"name": "TORBAY HAND BLOCKED PRINT",
"price": 17
},
{
"img_href": "https://schumacher-webassets.s3.amazonaws.com/Web%20Catalog-600/179362.jpg",
"name": "ADITI HAND BLOCKED PRINT",
"price": 18
}
],
"datetime": 161878993658
"catalog_id": 1
}
I would like to flatten it and reconstruct as below
[
{
"datetime": 161878993658,
"url": "https://schumacher-webassets.s3.amazonaws.com/Web%20Catalog-600/179361.jpg"
},
{
"datetime": 161878993658,
"url": "https://schumacher-webassets.s3.amazonaws.com/Web%20Catalog-600/179330.jpg"
},
{
"datetime": 161878993658,
"url": "https://schumacher-webassets.s3.amazonaws.com/Web%20Catalog-600/179362.jpg"
}
]
The following will do this. You won't need the CTE, so delete it and replace uses of tbl with the name of your table and uses of json with your variant column.
/*delete this line*/ with tbl as (select parse_json($1) json from values('{"catalog":[{"img_href":"https://schumacher-webassets.s3.amazonaws.com/Web%20Catalog-600/179361.jpg","name":"ADITI HAND BLOCKED PRINT","price":16},{"img_href":"https://schumacher-webassets.s3.amazonaws.com/Web%20Catalog-600/179330.jpg","name":"TORBAY HAND BLOCKED PRINT","price":17},{"img_href":"https://schumacher-webassets.s3.amazonaws.com/Web%20Catalog-600/179362.jpg","name":"ADITI HAND BLOCKED PRINT","price":18}],"datetime":161878993658,"catalog_id":1}'))
select array_agg(new_col) reconstructed
from (
/* replace json and tbl */ select object_construct('datetime', json:datetime, 'url', obj.value:img_href) new_col, json:catalog_id catalog_id
from tbl, lateral flatten(json:catalog) obj
) group by catalog_id;
It outputs
[
{
"datetime": 161878993658,
"url": "https://schumacher-webassets.s3.amazonaws.com/Web%20Catalog-600/179361.jpg"
},
{
"datetime": 161878993658,
"url": "https://schumacher-webassets.s3.amazonaws.com/Web%20Catalog-600/179330.jpg"
},
{
"datetime": 161878993658,
"url": "https://schumacher-webassets.s3.amazonaws.com/Web%20Catalog-600/179362.jpg"
}
]
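One caveat: ARRAY_AGG does not guarantee element order. If you need the reconstructed array to follow the original catalog order, Snowflake's WITHIN GROUP clause can sort on the INDEX column that FLATTEN exposes (a sketch built on the same query):
select array_agg(new_col) within group (order by idx) reconstructed
from (
  select object_construct('datetime', json:datetime, 'url', obj.value:img_href) new_col,
         json:catalog_id catalog_id,
         obj.index idx  -- position of the element within json:catalog
  from tbl, lateral flatten(json:catalog) obj
) group by catalog_id;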

Query Druid SQL inner join with a dataSource name that has a dash

How do I write an INNER JOIN query between two data sources where one of them has a dash in its schema name?
Executing the following query on the Druid SQL binary results in a query error:
SELECT *
FROM first
INNER JOIN "second-schema" on first.device_id = "second-schema".device_id;
org.apache.druid.java.util.common.ISE: Cannot build plan for query
Is this the correct syntax when trying to reference a data source that has a dash in its name?
Schema
[
{
"dataSchema": {
"dataSource": "second-schema",
"parser": {
"type": "string",
"parseSpec": {
"format": "json",
"timestampSpec": {
"column": "ts_start"
},
"dimensionsSpec": {
"dimensions": [
"etid",
"device_id",
"device_name",
"x_1",
"x_2",
"x_3",
"vlan",
"s_x",
"d_x",
"d_p",
"msg_type"
],
"dimensionExclusions": [],
"spatialDimensions": []
}
}
},
"metricsSpec": [
{ "type": "hyperUnique", "name": "conn_id_hll", "fieldName": "conn_id"},
{
"type": "count",
"name": "event_count"
}
],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "HOUR",
"queryGranularity": "minute"
}
},
"ioConfig": {
"type": "realtime",
"firehose": {
"type": "kafka-0.8",
"consumerProps": {
"zookeeper.connect": "localhost:2181",
"zookeeper.connectiontimeout.ms": "15000",
"zookeeper.sessiontimeout.ms": "15000",
"zookeeper.synctime.ms": "5000",
"group.id": "flow-info",
"fetch.size": "1048586",
"autooffset.reset": "largest",
"autocommit.enable": "false"
},
"feed": "flow-info"
},
"plumber": {
"type": "realtime"
}
},
"tuningConfig": {
"type": "realtime",
"maxRowsInMemory": 50000,
"basePersistDirectory": "\/opt\/druid-data\/realtime\/basePersist",
"intermediatePersistPeriod": "PT10m",
"windowPeriod": "PT15m",
"rejectionPolicy": {
"type": "serverTime"
}
}
},
{
"dataSchema": {
"dataSource": "first",
"parser": {
"type": "string",
"parseSpec": {
"format": "json",
"timestampSpec": {
"column": "ts_start"
},
"dimensionsSpec": {
"dimensions": [
"etid",
"category",
"device_id",
"device_name",
"severity",
"x_2",
"x_3",
"x_4",
"x_5",
"vlan",
"s_x",
"d_x",
"s_i",
"d_i",
"d_p",
"id"
],
"dimensionExclusions": [],
"spatialDimensions": []
}
}
},
"metricsSpec": [
{ "type": "doubleSum", "name": "val_num", "fieldName": "val_num" },
{ "type": "doubleMin", "name": "val_num_min", "fieldName": "val_num" },
{ "type": "doubleMax", "name": "val_num_max", "fieldName": "val_num" },
{ "type": "doubleSum", "name": "size", "fieldName": "size" },
{ "type": "doubleMin", "name": "size_min", "fieldName": "size" },
{ "type": "doubleMax", "name": "size_max", "fieldName": "size" },
{ "type": "count", "name": "first_count" }
],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "HOUR",
"queryGranularity": "minute"
}
},
"ioConfig": {
"type": "realtime",
"firehose": {
"type": "kafka-0.8",
"consumerProps": {
"zookeeper.connect": "localhost:2181",
"zookeeper.connectiontimeout.ms": "15000",
"zookeeper.sessiontimeout.ms": "15000",
"zookeeper.synctime.ms": "5000",
"group.id": "first",
"fetch.size": "1048586",
"autooffset.reset": "largest",
"autocommit.enable": "false"
},
"feed": "first"
},
"plumber": {
"type": "realtime"
}
},
"tuningConfig": {
"type": "realtime",
"maxRowsInMemory": 50000,
"basePersistDirectory": "\/opt\/druid-data\/realtime\/basePersist",
"intermediatePersistPeriod": "PT10m",
"windowPeriod": "PT15m",
"rejectionPolicy": {
"type": "serverTime"
}
}
}
]
Based on your schema definitions there are a few observations I'll make.
When doing a join you usually have to list out columns explicitly (not use a *), otherwise you get collisions from duplicate columns. In your join, for example, you have a device_id in both "first" and "second-schema", not to mention all the other columns that are the same across both.
When using quoted identifiers, don't mix quoted and unquoted references: either quote every identifier or quote none of them.
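For instance, a minimal sketch of the join with every identifier quoted consistently (just two columns, to keep it short):
SELECT "first"."device_id", "second-schema"."msg_type"
FROM "first"
INNER JOIN "second-schema" ON "first"."device_id" = "second-schema"."device_id";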
So I think your full query will work better in a form more like this:
SELECT
"first"."etid",
"first"."category",
"first"."device_id",
"first"."device_name",
"first"."severity",
"first"."x_2",
"first"."x_3",
"first"."x_4",
"first"."x_5",
"first"."vlan",
"first"."s_x",
"first"."d_x",
"first"."s_i",
"first"."d_i",
"first"."d_p",
"first"."id",
"second-schema"."etid" as "ss_etid",
"second-schema"."device_id" as "ss_device_id",
"second-schema"."device_name" as "ss_device_name",
"second-schema"."x_1" as "ss_x_1",
"second-schema"."x_2" as "ss_x_2",
"second-schema"."x_3" as "ss_x_3",
"second-schema"."vlan" as "ss_vlan",
"second-schema"."s_x" as "ss_s_x",
"second-schema"."d_x" as "ss_d_x",
"second-schema"."d_p" as "ss_d_p",
"second-schema"."msg_type"
FROM "first"
INNER JOIN "second-schema" ON "first"."device_id" = "second-schema"."device_id";
Obviously, feel free to name columns as you see fit, or include/exclude columns as needed. SELECT * will only work when all column names across both tables are unique.

Transform JSON response with lodash

I'm new to lodash (v3.10.1) and having a hard time understanding it.
Hope someone can help.
I have an input something like this:
[
{"id":1,"name":"Matthew","company":{"id":1,"name":"abc","industry":{"id":5,"name":"Medical"}}},
{"id":2,"name":"Mark","company":{"id":1,"name":"abc","industry":{"id":5,"name":"Medical"}}},
{"id":3,"name":"Luke","company":{"id":1,"name":"abc","industry":{"id":5,"name":"Medical"}}},
{"id":4,"name":"John","company":{"id":1,"name":"abc","industry":{"id":5,"name":"Medical"}}},
{"id":5,"name":"Paul","company":{"id":1,"name":"abc","industry":{"id":5,"name":"Medical"}}}
];
I would like to output this or close to this:
{
"industries": [
{
"industry":{
"id":5,
"name":"Medical",
"companies": [
{
"company":{
"id":1,
"name":"abc",
"employees": [
{"id":1,"name":"Matthew"},
{"id":2,"name":"Mark"},
{"id":3,"name":"Luke"},
{"id":4,"name":"John"},
{"id":5,"name":"Paul"}
]
}
}
]
}
}
]
}
Here's something that gets you close to what you want. I structured the output to be an object instead of an array. You don't need the industries or industry properties in your example output. The output structure looks like this:
{
"industry name": {
"id": "id of industry",
"companies": [
{
"company name": "name of company",
"id": "id of company",
"employees": [
{
"id": "id of company",
"name": "name of employee"
}
]
}
]
}
}
I use the _.chain function to wrap the collection with a lodash wrapper object. This enables me to explicitly chain lodash functions.
From there, I use the _.groupBy function to group elements of the collection by their industry name. Since I'm chaining, I don't have to pass the array to the function again; it's implicitly passed via the lodash wrapper. The second argument of _.groupBy is the path to the value I want to group elements by; in this case, the path to the industry name: company.industry.name. _.groupBy returns an object with each employee grouped by their industry (industries are the keys of this object).
I then use _.transform to transform each industry object. _.transform is essentially _.reduce, except that the result returned from _.transform is always an object.
The function passed to _.transform gets executed against each key/value pair in the object. In that function, I use _.groupBy again to group employees by company. Based on the results of _.groupBy, I map the values to the final structure I want for each employee object.
I then call the _.value function because I want to unwrap the output collection from the lodash wrapper object.
I hope this made sense. If it doesn't, I highly recommend reading Lo-Dash Essentials. After reading the book, I finally got why lodash is so useful.
"use strict";
var _ = require('lodash');
var emps = [
{ "id": 1, "name": "Matthew", "company": { "id": 1, "name": "abc", "industry": { "id": 5, "name": "Medical" } } },
{ "id": 2, "name": "Mark", "company": { "id": 1, "name": "abc", "industry": { "id": 5, "name": "Medical" } } },
{ "id": 3, "name": "Luke", "company": { "id": 1, "name": "abc", "industry": { "id": 5, "name": "Medical" } } },
{ "id": 4, "name": "John", "company": { "id": 1, "name": "abc", "industry": { "id": 5, "name": "Medical" } } },
{ "id": 5, "name": "Paul", "company": { "id": 1, "name": "abc", "industry": { "id": 5, "name": "Medical" } } }
];
var result = _.chain(emps)
.groupBy("company.industry.name")
.transform(function(result, employees, industry) {
result[industry] = {};
result[industry].id = _.get(employees[0], "company.industry.id");
result[ industry ][ 'companies' ] = _.map(_.groupBy(employees, "company.name"), function( employees, company ) {
return {
company: company,
id: _.get(employees[ 0 ], 'company.id'),
employees: _.map(employees, _.partialRight(_.pick, [ 'id', 'name' ]))
};
});
return result;
})
.value();
Results from your example are as follows:
{
"Medical": {
"id": 5,
"companies": [
{
"company": "abc",
"id": 1,
"employees": [
{
"id": 1,
"name": "Matthew"
},
{
"id": 2,
"name": "Mark"
},
{
"id": 3,
"name": "Luke"
},
{
"id": 4,
"name": "John"
},
{
"id": 5,
"name": "Paul"
}
]
}
]
}
}
If you ever wanted the exact same structure as in the question, I solved it using the jsonata library:
(
/* lets flatten it out for ease of accessing the properties*/
$step1 := $ ~> | $ |
{
"employee_id": id,
"employee_name": name,
"company_id": company.id,
"company_name": company.name,
"industry_id": company.industry.id,
"industry_name": company.industry.name
},
["company", "id", "name"] |;
/* now the magic begins*/
$step2 := {
"industries":
[($step1{
"industry" & $string(industry_id): ${
"id": $distinct(industry_id)#$I,
"name": $distinct(industry_name),
"companies": [({
"company" & $string(company_id): {
"id": $distinct(company_id),
"name": $distinct(company_name),
"employees": [$.{
"id": $distinct(employee_id),
"name": $distinct(employee_name)
}]
}
} ~> $each(function($v){ {"company": $v} }))]
}
} ~> $each(function($v){ {"industry": $v} }))]
};
)
You can see it in action on the live demo site: https://try.jsonata.org/VvW4uTRz_