Query Druid SQL inner join with a dataSource name that has a dash - sql

How to write an INNER JOIN query between two data sources that one of them has a dash as it's schema name
Executing the following query on the Druid SQL binary results in a query error
SELECT *
FROM first
INNER JOIN "second-schema" on first.device_id = "second-schema".device_id;
org.apache.druid.java.util.common.ISE: Cannot build plan for query
Is this the correct syntax when trying to refrence a data source that has a dash in it's name?
Schema
[
{
"dataSchema": {
"dataSource": "second-schema",
"parser": {
"type": "string",
"parseSpec": {
"format": "json",
"timestampSpec": {
"column": "ts_start"
},
"dimensionsSpec": {
"dimensions": [
"etid",
"device_id",
"device_name",
"x_1",
"x_2",
"x_3",
"vlan",
"s_x",
"d_x",
"d_p",
"msg_type"
],
"dimensionExclusions": [],
"spatialDimensions": []
}
}
},
"metricsSpec": [
{ "type": "hyperUnique", "name": "conn_id_hll", "fieldName": "conn_id"},
{
"type": "count",
"name": "event_count"
}
],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "HOUR",
"queryGranularity": "minute"
}
},
"ioConfig": {
"type": "realtime",
"firehose": {
"type": "kafka-0.8",
"consumerProps": {
"zookeeper.connect": "localhost:2181",
"zookeeper.connectiontimeout.ms": "15000",
"zookeeper.sessiontimeout.ms": "15000",
"zookeeper.synctime.ms": "5000",
"group.id": "flow-info",
"fetch.size": "1048586",
"autooffset.reset": "largest",
"autocommit.enable": "false"
},
"feed": "flow-info"
},
"plumber": {
"type": "realtime"
}
},
"tuningConfig": {
"type": "realtime",
"maxRowsInMemory": 50000,
"basePersistDirectory": "\/opt\/druid-data\/realtime\/basePersist",
"intermediatePersistPeriod": "PT10m",
"windowPeriod": "PT15m",
"rejectionPolicy": {
"type": "serverTime"
}
}
},
{
"dataSchema": {
"dataSource": "first",
"parser": {
"type": "string",
"parseSpec": {
"format": "json",
"timestampSpec": {
"column": "ts_start"
},
"dimensionsSpec": {
"dimensions": [
"etid",
"category",
"device_id",
"device_name",
"severity",
"x_2",
"x_3",
"x_4",
"x_5",
"vlan",
"s_x",
"d_x",
"s_i",
"d_i",
"d_p",
"id"
],
"dimensionExclusions": [],
"spatialDimensions": []
}
}
},
"metricsSpec": [
{ "type": "doubleSum", "name": "val_num", "fieldName": "val_num" },
{ "type": "doubleMin", "name": "val_num_min", "fieldName": "val_num" },
{ "type": "doubleMax", "name": "val_num_max", "fieldName": "val_num" },
{ "type": "doubleSum", "name": "size", "fieldName": "size" },
{ "type": "doubleMin", "name": "size_min", "fieldName": "size" },
{ "type": "doubleMax", "name": "size_max", "fieldName": "size" },
{ "type": "count", "name": "first_count" }
],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "HOUR",
"queryGranularity": "minute"
}
},
"ioConfig": {
"type": "realtime",
"firehose": {
"type": "kafka-0.8",
"consumerProps": {
"zookeeper.connect": "localhost:2181",
"zookeeper.connectiontimeout.ms": "15000",
"zookeeper.sessiontimeout.ms": "15000",
"zookeeper.synctime.ms": "5000",
"group.id": "first",
"fetch.size": "1048586",
"autooffset.reset": "largest",
"autocommit.enable": "false"
},
"feed": "first"
},
"plumber": {
"type": "realtime"
}
},
"tuningConfig": {
"type": "realtime",
"maxRowsInMemory": 50000,
"basePersistDirectory": "\/opt\/druid-data\/realtime\/basePersist",
"intermediatePersistPeriod": "PT10m",
"windowPeriod": "PT15m",
"rejectionPolicy": {
"type": "serverTime"
}
}
}
]

Based on your schema definitions there are a few observations I'll make.
When doing a join you usually have to list out columns explicitly (not use a *) otherwise you get collisions from duplicate columns. In your join, for example, you have a device_id in both "first" and "second-schema", not to mention all the other columns that are the same across both.
When using a literal delimiter I don't mix them up. I either use them or I don't.
So I think your query will work better in the form of something more like this
SELECT
"first"."etid",
"first"."category",
"first"."device_id",
"first"."device_name",
"first"."severity",
"first"."x_2",
"first"."x_3",
"first"."x_4",
"first"."x_5",
"first"."vlan",
"first"."s_x",
"first"."d_x",
"first"."s_i",
"first"."d_i",
"first"."d_p",
"first"."id",
"second-schema"."etid" as "ss_etid",
"second-schema"."device_id" as "ss_device_id",
"second-schema"."device_name" as "ss_device_name",
"second-schema"."x_1" as "ss_x_1",
"second-schema"."x_2" as "ss_x_2",
"second-schema"."x_3" as "ss_x_3",
"second-schema"."vlan" as "ss_vlan",
"second-schema"."s_x" as "ss_s_x",
"second-schema"."d_x" as "ss_d_x",
"second-schema"."d_p" as "ss_d_p",
"second-schema"."msg_type"
FROM "first"
INNER JOIN "second-schema" ON "first"."device_id" = "second-schema"."device_id";
Obviously feel free to name columns as you see fit, or include exclude columns as needed. Select * will only work when all columns across both tables are unique.

Related

Not able to match same json which has mismatch in indexes of array of objects (Karate) [duplicate]

This question already has an answer here:
Is there a simple match for objects containing array where the array content order doesn't matter?
(1 answer)
Closed 1 year ago.
Trying to match two jsons, but getting test fails. Well, both jsons are the same but objects indexes inside the array are not same. I think should not make any difference. Following are two jsons:
This is the code line: And match response contains ScenarioModelResponse where
**response : **
{
"relationships": [
{
"sourceId": "36",
"targetId": "149",
"type": "Reid Enright"
}
],
"modelId": "027f93d1-ef9e-4f1e-b2c4-684436c5b18a",
"elements": [
{
"externalRefId": "36",
"attributes": {
"jsonPbject": "Reid Enright"
},
"id": "057f7b7e-11b9-4779-97c0-67485153c285",
"type": "Rocky Shore"
},
{
"externalRefId": "149",
"attributes": {
"jsonPbject": "Ben Lyon"
},
"id": "325b989e-b299-4cfc-86b5-0813106da38e",
"type": "Claire Voyance"
}
]
}
ScenarioModelResponse :
{
"relationships": [
{
"sourceId": "36",
"targetId": "149",
"type": "Reid Enright"
}
],
"modelId": "027f93d1-ef9e-4f1e-b2c4-684436c5b18a",
"elements": [
{
"externalRefId": "149",
"attributes": {
"jsonPbject": "Ben Lyon"
},
"id": "325b989e-b299-4cfc-86b5-0813106da38e",
"type": "Claire Voyance"
},
{
"externalRefId": "36",
"attributes": {
"jsonPbject": "Reid Enright"
},
"id": "057f7b7e-11b9-4779-97c0-67485153c285",
"type": "Rocky Shore"
}
]
}
This the error I am getting after execution :
$.elements[0].externalRefId | not equal (STRING:STRING)
'149'
'36'
The arrays are NOT the same. This can be solved in 2 lines:
* match response.relationships == expected.relationships
* match response.elements contains only expected.elements
For a detailed explanation, refer:
https://stackoverflow.com/a/65939070/143475
https://stackoverflow.com/a/55710769/143475

How to avoid the duplicated data entry after parsing json in kusto?

I have following sample json data.
{
"data": {
"type": "ABC",
"id": "17495500314",
"attributes": {
[!["event": "update",
"gps_vali][1]][1]d": true,
"gps": {
"distance_diff": 6.48,
"total_distance": 848.6
},
"hdop": 79,
"fuel_level": 46.8,
"total_fuel_used": 60443.9,
"location": {
"latitude": 411.372618,
"longitude": -1.254931,
"relative_position": {
"distance": "37",
}
},
"idle_periods": []
},
"relationships": {
"assets": {
"data": [
{
"type": "ABCDFTTG",
"id": "1589799143500003",
"attributes": {
"external_id": "ABCDFTTG",
"hardware_id": "ABCDFTTG"
}
}
]
},
"devices": {
"data": [
{
"type": "ABCDFTTG",
"id": "1585231172900341",
"attributes": {
"serial": "5572016191"
}
},
{
"type": "tablet",
"id": "1587893062600175",
"attributes": {
"serial": "ABCDFTTG"
}
}
]
},
"users": {
"data": [
{
"type": "user",
"id": "ABCDFTTG",
"attributes": {
"external_id": "ABCDFTTG"
}
}
]
}
}
},
"meta": {
"message_id": "11eb-8c75-0b3f87aedbb5",
"consumer_version": "1.2.0",
"origin_version": null,
"timestamp": "2021-06-14T17:42:29Z"
}
}
I want only one row instead of this two. Here is my kusto query which is used for parsing json data into table columns.
Test
|where messageId =="123"
//|mv-expand message=message.data.attributes
|mv-expand message
|mv-expand Value=message.data.relationships.assets.['data']
|mv-expand value_devices=message.data.relationships.devices.['data']
|mv-expand value_user=message.data.relationships.users.['data']
| project type=message.data.type,id=message.data.id,
event=tostring(message.data.attributes.event),
logged_at=tostring(message.data.attributes.logged_at),
distance=toint(message.data.attributes.location.relative_position.distance),
// Value=message.data.relationships.assets.['data'],//.['data']
type_asset=Value.type,asset_id=Value.id,
device_type=value_devices.type,device_id=value_devices.id,
device_attr_serial=value_devices.attributes.serial,
user_type=value_user.type,user_id=value_user.id,
user_external_id=value_user.attributes.external_id
This duplicate row appeared after adding user tag this tag is array so how to handle this array with single id.
I have parse my json data any got the following output.
Expected output should be like
check device_type and device_id columns

POSTGRESQL query to extract attributes in JSON

I have the below JSON in a particular DB column. I need a query to extract fields stored within the savings rate(to and from).
{
"data": [
{
"data": {
"intro_visited": {
"portfolio_detail_investment_journey": true,
"dashboard_investments": true,
"portfolio_list_updates": true,
"portfolio_detail_invested": true,
"portfolio_list_offering": true,
"dashboard_more_bottom_bar": true
}
},
"type": "user_properties",
"schema_version": "1"
},
{
"data": {
"savings_info": {
"remind_at": 1583475493291,
"age": 100,
"savings_rate": {
"to": "20",
"from": "4"
},
"recommendation": {
"offering_name": "Emergency Fund",
"amount": "1,11,111",
"offering_status": "not_invested",
"ideal_amount": "1,11,111",
"offering_code": "liquid"
}
}
},
"type": "savings_info",
"schema_version": "1"
}
]
}
To get the "To"
$..data.savings_info.savings_rate.to
To get the "From"
$..data.savings_info.savings_rate.from
This script works
SELECT
<column> ->'data'->2->'data'->'savings_info'->'savings_rate'->>'to' AS to_rate
from <table>

Json schema variable properties

Based some condition my IPV4 can have either 2 or 3 properties but those are required. How to define it. I tried below schema. I get error saying "JSON is valid against more than one schema from 'oneOf'. Valid schema indexes: 0, 1"
"IPv4Type": {
"type": "object",
"oneOf": [
{
"properties": {
"provider-address": {
"type": "string",
"format": "ipv4"
},
"customer-address": {
"type": "string",
"format": "ipv4"
},
"mask": {
"type": "number"
}
},
"required": [
"provider-address",
"customer-address",
"mask"
]
},
{
"properties": {
"provider-address": {
"type": "string",
"format": "ipv4"
},
"mask": {
"type": "number"
}
},
"required": [
"provider-address",
"mask"
]
}
]
}
A few ideas:
you can drop the oneOf , define your JSON in one object, with all 3 properties defined, but adding only "provider-address" and "mask" as required
defining "additionalProperties": false the the 2nd definition under oneOf
replacing oneOf with anyOf

elasticsearch Not_analyzed and analyzed

Hello For certain requirement i have made all the index not_analyzed
{
"template": "*",
"mappings": {
"_default_": {
"dynamic_templates": [
{
"my_template": {
"match_mapping_type": "string",
"mapping": {
"index": "not_analyzed"
}
}
}
]
}
}
}
But now as per our requirement i have to make certain field as analyzed . and keep rest of the field as not analyzed
My Data is of type :
{ "field1":"Value1",
"field2":"Value2",
"field3":"Value3",
"field4":"Value3",
"field5":"Value4",
"field6":"Value5",
"field7":"Value6",
"field8":"",
"field9":"ce-3sdfa773-7sdaf2-989e-5dasdsdf",
"field10":"12345678",
"field11":"ertyu12345ffd",
"field12":"A",
"field13":"Value7",
"field14":"Value8",
"field15":"Value9",
"field16":"Value10",
"field17":"Value11",
"field18":"Value12",
"field19":{
"field20":"Value13",
"field21":"Value14"
},
"field22":"Value15",
"field23":"ipaddr",
"field24":"datwithtime",
"field25":"Value6",
"field26":"0",
"field20":"0",
"field28":"0"
}
If i change my template as per recommendation to something like this
{
"template": "*",
"mappings": {
"_default_": {
"properties": {
"filed6": {
"type": "string",
"analyzer": "keyword",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}}},
"dynamic_templates": [
{
"my_template": {
"match_mapping_type": "*",
"mapping": {
"type": "string",
"index": "not_analyzed"
}
}
}
]
}
}
}
Then i get error while i insert data stating
{"error":"MapperParsingException[failed to parse [field19]]; nested: ElasticsearchIllegalArgumentException[unknown property [field20 ]]; ","status":400}
In short you want to change the mapping of your index.
If your index does not contain any data(which I suppose, is not the
case), then you can simply delete the index and create it again with
new mapping.
If your index contains data, you will have to reindex it.
Steps for reindexing:
Put all data from existing index to dummy index.
Delete existing index. Generate new mapping.
Transfer data from dummy index to newly created index.
You can also give a look to elastic search alias here
This link might also be useful.
If you want to use the same field as analysed and not analysed at the same time you have to use multifield using
"title": {
"type": "multi_field",
"fields": {
"title": { "type": "string" },
"raw": { "type": "string", "index": "not_analyzed" }
}
}
This is for your reference.
For defining multifield in dynamic_templates use:
{
"template": "*",
"mappings": {
"_default_": {
"dynamic_templates": [
{
"my_template": {
"match_mapping_type": "string",
"mapping": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
]
}
}
}
Refer this for more info on this.
You can either write multiple templates or have separate properties depending on your requirements. Both will work smoothly.
1) Multiple Templates
{
"mappings": {
"your_doctype": {
"dynamic_templates": [
{
"analyzed_values": {
"match_mapping_type": "*",
"match_pattern": "regex",
"match": "title|summary",
"mapping": {
"type": "string",
"analyzer": "keyword"
}
}
},
{
"date_values": {
"match_mapping_type": "date",
"match": "*_date",
"mapping": {
"type": "date"
}
}
},
{
"exact_values": {
"match_mapping_type": "*",
"mapping": {
"type": "string",
"index": "not_analyzed"
}
}
}
]
}
}
}
Here title and summary are analyzed by keyword analyzer. I have also added date field which is optional, it will map create_date etc as date. Last one will match anything and it will not_analyzed which will fulfill your requirements.
2) Add analyzed field as properties.
{
"mappings": {
"your_doctype": {
"properties": {
"title": {
"type": "string",
"analyzer": "keyword"
},
"summary": {
"type": "string",
"analyzer": "keyword"
}
},
"dynamic_templates": [
{
"any_values": {
"match_mapping_type": "*",
"mapping": {
"type": "string",
"index": "not_analyzed"
}
}
}
]
}
}
}
Here title and summary fields are analyzed while rest will be not_analyzed
You would have to reindex the data no matter which solution you take.
EDIT 1 : After looking at your data and mapping, there is one slight problem in it. Your data contains object structure and you are mapping everything apart from filed6 as string and filed19 is an Object and not string and hence ES is throwing the error. The solution is to let ES decide which datatype the field is with dynamic_type. Change your mapping to this
{
"template": "*",
"mappings": {
"_default_": {
"properties": {
"filed6": {
"type": "string",
"analyzer": "keyword",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
},
"dynamic_templates": [
{
"my_template": {
"match_mapping_type": "*",
"mapping": {
"type": "{dynamic_type}", <--- this will decide if field is either object or string.
"index": "not_analyzed"
}
}
}
]
}
}
}
Hope this helps!!