Schedule an Azure Data Lake pipeline that runs every Monday at 8 am UTC - azure-data-lake

Output Data Set:
"availability": {"frequency": "Day","interval": 1,"offset": "03:00:00","style": "StartOfInterval"}
Pipeline:
"scheduler": {"frequency": "Day","interval": 1,"offset": "03:00:00","style": "StartOfInterval"}

I think this should do the trick: a weekly ScheduleTrigger that fires every Monday at 08:00 UTC. Note that it is created with "runtimeState": "Stopped", so start the trigger once it is attached to the pipeline.
{
    "name": "trigger10",
    "properties": {
        "runtimeState": "Stopped",
        "type": "ScheduleTrigger",
        "typeProperties": {
            "recurrence": {
                "frequency": "Week",
                "interval": 1,
                "startTime": "2019-02-27T12:49:00.000Z",
                "timeZone": "UTC",
                "schedule": {
                    "minutes": [
                        0
                    ],
                    "hours": [
                        8
                    ],
                    "weekDays": [
                        "Monday"
                    ]
                }
            }
        }
    }
}
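For the trigger to actually run the pipeline, the trigger definition also has to reference it. Here is a minimal sketch of the extra "pipelines" section that sits under "properties", next to "typeProperties" (the name MyPipeline is only a placeholder for your own pipeline):
"pipelines": [
    {
        "pipelineReference": {
            "type": "PipelineReference",
            "referenceName": "MyPipeline"
        },
        "parameters": {}
    }
]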

Related

AWS IoT rule SQL SELECT statement

I am trying to write a SQL SELECT statement for my AWS IoT rule to extract the values 'gateway_id' and 'rssi' from the following MQTT message:
{
"end_device_ids": {
"device_id": "imd2",
"application_ids": {
"application_id": "pennal"
},
"dev_eui": "004E3A0DF76DC9E9",
"join_eui": "70B3D57ED003CBE8",
"dev_addr": "260BA9D0"
},
"correlation_ids": [
"as:up:01G30W0J4D65P6D50QH1DN3ZQP",
"gs:conn:01G2ZZ7FT9BH6J93WRYS4ATVDM",
"gs:up:host:01G2ZZ7FTN14103H90QN71Q557",
"gs:uplink:01G30W0HXWMES1Z7X7F2MCFMPF",
"ns:uplink:01G30W0HXXJM5PNGJAD0W01GGH",
"rpc:/ttn.lorawan.v3.GsNs/HandleUplink:01G30W0HXWFR3HNGBZS7XJV15E",
"rpc:/ttn.lorawan.v3.NsAs/HandleUplink:01G30W0J4D18JZW199EM8WERGR"
],
"received_at": "2022-05-14T08:47:25.837680984Z",
"uplink_message": {
"session_key_id": "AYBlRLSz9n83bW3WU3+GfQ==",
"f_port": 1,
"f_cnt": 5013,
"frm_payload": "DiAAAA==",
"decoded_payload": {
"rainmm": 0,
"voltage": 3.616
},
"rx_metadata": [
{
"gateway_ids": {
"gateway_id": "pennal-gw2",
"eui": "AC1F09FFFE057EC6"
},
"time": "2022-05-14T08:47:25.065794944Z",
"timestamp": 114306297,
"rssi": -126,
"channel_rssi": -126,
"snr": -8.25,
"uplink_token": "ChgKFgoKcGVubmFsLWd3MhIIrB8J//4FfsYQ+dnANhoMCJ3Z/ZMGEOPmv6sCIKjZvump7gYqCwid2f2TBhCA568f"
}
],
"settings": {
"data_rate": {
"lora": {
"bandwidth": 125000,
"spreading_factor": 11
}
},
"coding_rate": "4/5",
"frequency": "868100000",
"timestamp": 114306297,
"time": "2022-05-14T08:47:25.065794944Z"
},
"received_at": "2022-05-14T08:47:25.629041670Z",
"confirmed": true,
"consumed_airtime": "0.659456s",
"version_ids": {
"brand_id": "heltec",
"model_id": "cubecell-dev-board-class-a-otaa",
"hardware_version": "_unknown_hw_version_",
"firmware_version": "1.0",
"band_id": "EU_863_870"
},
"network_ids": {
"net_id": "000013",
"tenant_id": "ttn",
"cluster_id": "eu1",
"cluster_address": "eu1.cloud.thethings.network"
}
}
}
I have tried following the documentation here (AWS Documentation) but am struggling with the nested part of the message.
My SQL statement at the moment is:
SELECT received_at as datetime,
       end_device_ids.device_id as device_id,
       uplink_message.decoded_payload.rainmm as rainmm,
       uplink_message.decoded_payload.voltage as voltage,
       uplink_message.settings.data_rate.lora.spreading_factor as sprfact,
       uplink_message.consumed_airtime as time_on_air,
       uplink_message.settings.timestamp as ts,
       uplink_message.rx_metadata as rx,
       (select value gateway_ids from uplink_message.rx_metadata) as gw,
       (select value rssi from uplink_message.rx_metadata) as rssi,
       get((select gateway_id from uplink_message.rx_metadata), 0).gateway_id as gwn
FROM 'thethings/lorawan/matt-pennal-ire/uplink'
which returns
{
"datetime": "2022-05-15T12:19:11.947844474Z",
"device_id": "md4",
"rainmm": 5.842001296924288,
"voltage": 3.352,
"sprfact": 8,
"time_on_air": "0.092672s",
"ts": 3262497863,
"rx": [
{
"gateway_ids": {
"gateway_id": "pennal-gw2",
"eui": "AC1F09FFFE057EC6"
},
"time": "2022-05-15T12:19:11.178463935Z",
"timestamp": 3262497863,
"rssi": -125,
"channel_rssi": -125,
"snr": -7.5,
"uplink_token": "ChgKFgoKcGVubmFsLWd3MhIIrB8J//4FfsYQx4jXkwwaDAi/34OUBhCCy9XhAiDY6prg+ckHKgsIv9+DlAYQv8mMVQ=="
}
],
"gw": [
{
"gateway_id": "pennal-gw2",
"eui": "AC1F09FFFE057EC6"
}
],
"rssi": [
-125
]
}
but I would like it to return
{
"datetime": "2022-05-15T12:19:11.947844474Z",
"device_id": "md4",
"rainmm": 5.842001296924288,
"voltage": 3.352,
"sprfact": 8,
"time_on_air": "0.092672s",
"ts": 3262497863,
"gwn":"pennal_gw2"
"rssi":-126
}
Any help to get the values from the nested array would be greatly appreciated!
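For what it is worth, here is one possible shape of the query, as an untested sketch. It assumes the get() function from the AWS IoT SQL reference (which returns the element at a given index of an array) and that a field access can be chained onto its result; if that chaining is not supported in your SQL version, you may have to return the whole gateway_ids object and split it downstream.
SELECT received_at as datetime,
       end_device_ids.device_id as device_id,
       uplink_message.decoded_payload.rainmm as rainmm,
       uplink_message.decoded_payload.voltage as voltage,
       uplink_message.settings.data_rate.lora.spreading_factor as sprfact,
       uplink_message.consumed_airtime as time_on_air,
       uplink_message.settings.timestamp as ts,
       get((select value gateway_ids from uplink_message.rx_metadata), 0).gateway_id as gwn,
       get((select value rssi from uplink_message.rx_metadata), 0) as rssi
FROM 'thethings/lorawan/matt-pennal-ire/uplink'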

How to avoid duplicated data entries after parsing JSON in Kusto?

I have the following sample JSON data.
{
"data": {
"type": "ABC",
"id": "17495500314",
"attributes": {
[!["event": "update",
"gps_vali][1]][1]d": true,
"gps": {
"distance_diff": 6.48,
"total_distance": 848.6
},
"hdop": 79,
"fuel_level": 46.8,
"total_fuel_used": 60443.9,
"location": {
"latitude": 411.372618,
"longitude": -1.254931,
"relative_position": {
"distance": "37",
}
},
"idle_periods": []
},
"relationships": {
"assets": {
"data": [
{
"type": "ABCDFTTG",
"id": "1589799143500003",
"attributes": {
"external_id": "ABCDFTTG",
"hardware_id": "ABCDFTTG"
}
}
]
},
"devices": {
"data": [
{
"type": "ABCDFTTG",
"id": "1585231172900341",
"attributes": {
"serial": "5572016191"
}
},
{
"type": "tablet",
"id": "1587893062600175",
"attributes": {
"serial": "ABCDFTTG"
}
}
]
},
"users": {
"data": [
{
"type": "user",
"id": "ABCDFTTG",
"attributes": {
"external_id": "ABCDFTTG"
}
}
]
}
}
},
"meta": {
"message_id": "11eb-8c75-0b3f87aedbb5",
"consumer_version": "1.2.0",
"origin_version": null,
"timestamp": "2021-06-14T17:42:29Z"
}
}
I want only one row instead of these two. Here is my Kusto query, which is used to parse the JSON data into table columns.
Test
|where messageId =="123"
//|mv-expand message=message.data.attributes
|mv-expand message
|mv-expand Value=message.data.relationships.assets.['data']
|mv-expand value_devices=message.data.relationships.devices.['data']
|mv-expand value_user=message.data.relationships.users.['data']
| project type=message.data.type,id=message.data.id,
event=tostring(message.data.attributes.event),
logged_at=tostring(message.data.attributes.logged_at),
distance=toint(message.data.attributes.location.relative_position.distance),
// Value=message.data.relationships.assets.['data'],//.['data']
type_asset=Value.type,asset_id=Value.id,
device_type=value_devices.type,device_id=value_devices.id,
device_attr_serial=value_devices.attributes.serial,
user_type=value_user.type,user_id=value_user.id,
user_external_id=value_user.attributes.external_id
The duplicate row appeared after I added the users tag; that tag is an array, so how do I handle the array when it only contains a single id?
I have parsed my JSON data and got duplicated rows in the output.
The expected output should be a single row; check the device_type and device_id columns.
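A rough, untested sketch of one way to get a single row, reusing the column names from your query: index the single-element assets and users arrays directly instead of mv-expanding them, and fold the two device entries back into lists with summarize make_list().
Test
| where messageId == "123"
| mv-expand message
| extend Value = message.data.relationships.assets.['data'][0]          // single asset: index instead of expanding
| extend value_user = message.data.relationships.users.['data'][0]      // single user: index instead of expanding
| mv-expand value_devices = message.data.relationships.devices.['data'] // two devices -> two rows here
| summarize device_ids = make_list(value_devices.id),
            device_serials = make_list(value_devices.attributes.serial)
          by type = tostring(message.data.type),
             id = tostring(message.data.id),
             event = tostring(message.data.attributes.event),
             distance = toint(message.data.attributes.location.relative_position.distance),
             type_asset = tostring(Value.type),
             asset_id = tostring(Value.id),
             user_type = tostring(value_user.type),
             user_id = tostring(value_user.id),
             user_external_id = tostring(value_user.attributes.external_id)
If you would rather keep one specific device than a list, you can pick it by index the same way, e.g. ...devices.['data'][0].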

Query Druid SQL inner join with a dataSource name that has a dash

How do I write an INNER JOIN query between two data sources when one of them has a dash in its schema name?
Executing the following query on the Druid SQL binary results in a query error
SELECT *
FROM first
INNER JOIN "second-schema" on first.device_id = "second-schema".device_id;
org.apache.druid.java.util.common.ISE: Cannot build plan for query
Is this the correct syntax when trying to reference a data source that has a dash in its name?
Schema
[
{
"dataSchema": {
"dataSource": "second-schema",
"parser": {
"type": "string",
"parseSpec": {
"format": "json",
"timestampSpec": {
"column": "ts_start"
},
"dimensionsSpec": {
"dimensions": [
"etid",
"device_id",
"device_name",
"x_1",
"x_2",
"x_3",
"vlan",
"s_x",
"d_x",
"d_p",
"msg_type"
],
"dimensionExclusions": [],
"spatialDimensions": []
}
}
},
"metricsSpec": [
{ "type": "hyperUnique", "name": "conn_id_hll", "fieldName": "conn_id"},
{
"type": "count",
"name": "event_count"
}
],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "HOUR",
"queryGranularity": "minute"
}
},
"ioConfig": {
"type": "realtime",
"firehose": {
"type": "kafka-0.8",
"consumerProps": {
"zookeeper.connect": "localhost:2181",
"zookeeper.connectiontimeout.ms": "15000",
"zookeeper.sessiontimeout.ms": "15000",
"zookeeper.synctime.ms": "5000",
"group.id": "flow-info",
"fetch.size": "1048586",
"autooffset.reset": "largest",
"autocommit.enable": "false"
},
"feed": "flow-info"
},
"plumber": {
"type": "realtime"
}
},
"tuningConfig": {
"type": "realtime",
"maxRowsInMemory": 50000,
"basePersistDirectory": "\/opt\/druid-data\/realtime\/basePersist",
"intermediatePersistPeriod": "PT10m",
"windowPeriod": "PT15m",
"rejectionPolicy": {
"type": "serverTime"
}
}
},
{
"dataSchema": {
"dataSource": "first",
"parser": {
"type": "string",
"parseSpec": {
"format": "json",
"timestampSpec": {
"column": "ts_start"
},
"dimensionsSpec": {
"dimensions": [
"etid",
"category",
"device_id",
"device_name",
"severity",
"x_2",
"x_3",
"x_4",
"x_5",
"vlan",
"s_x",
"d_x",
"s_i",
"d_i",
"d_p",
"id"
],
"dimensionExclusions": [],
"spatialDimensions": []
}
}
},
"metricsSpec": [
{ "type": "doubleSum", "name": "val_num", "fieldName": "val_num" },
{ "type": "doubleMin", "name": "val_num_min", "fieldName": "val_num" },
{ "type": "doubleMax", "name": "val_num_max", "fieldName": "val_num" },
{ "type": "doubleSum", "name": "size", "fieldName": "size" },
{ "type": "doubleMin", "name": "size_min", "fieldName": "size" },
{ "type": "doubleMax", "name": "size_max", "fieldName": "size" },
{ "type": "count", "name": "first_count" }
],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "HOUR",
"queryGranularity": "minute"
}
},
"ioConfig": {
"type": "realtime",
"firehose": {
"type": "kafka-0.8",
"consumerProps": {
"zookeeper.connect": "localhost:2181",
"zookeeper.connectiontimeout.ms": "15000",
"zookeeper.sessiontimeout.ms": "15000",
"zookeeper.synctime.ms": "5000",
"group.id": "first",
"fetch.size": "1048586",
"autooffset.reset": "largest",
"autocommit.enable": "false"
},
"feed": "first"
},
"plumber": {
"type": "realtime"
}
},
"tuningConfig": {
"type": "realtime",
"maxRowsInMemory": 50000,
"basePersistDirectory": "\/opt\/druid-data\/realtime\/basePersist",
"intermediatePersistPeriod": "PT10m",
"windowPeriod": "PT15m",
"rejectionPolicy": {
"type": "serverTime"
}
}
}
]
Based on your schema definitions there are a few observations I'll make.
When doing a join you usually have to list out columns explicitly (not use a *) otherwise you get collisions from duplicate columns. In your join, for example, you have a device_id in both "first" and "second-schema", not to mention all the other columns that are the same across both.
When using delimited (quoted) identifiers, don't mix quoted and unquoted references to the same name; either use the quotes consistently or not at all.
So I think your query will work better in a form more like this:
SELECT
"first"."etid",
"first"."category",
"first"."device_id",
"first"."device_name",
"first"."severity",
"first"."x_2",
"first"."x_3",
"first"."x_4",
"first"."x_5",
"first"."vlan",
"first"."s_x",
"first"."d_x",
"first"."s_i",
"first"."d_i",
"first"."d_p",
"first"."id",
"second-schema"."etid" as "ss_etid",
"second-schema"."device_id" as "ss_device_id",
"second-schema"."device_name" as "ss_device_name",
"second-schema"."x_1" as "ss_x_1",
"second-schema"."x_2" as "ss_x_2",
"second-schema"."x_3" as "ss_x_3",
"second-schema"."vlan" as "ss_vlan",
"second-schema"."s_x" as "ss_s_x",
"second-schema"."d_x" as "ss_d_x",
"second-schema"."d_p" as "ss_d_p",
"second-schema"."msg_type"
FROM "first"
INNER JOIN "second-schema" ON "first"."device_id" = "second-schema"."device_id";
Obviously feel free to name the columns as you see fit, or include/exclude columns as needed. SELECT * will only work when all column names across both tables are unique.

Indexing policy is not being created along with collection in Cosmos DB

I'm writing a database creation script using the Java SDK, and the indexing policy is not being created as expected (and documented).
JAVA SDK used: com.microsoft.azure:azure-documentdb:2.4.0
Azure Cosmos DB emulator 2.2.2 for windows
Current Cosmos DB installation in Azure Portal with SQL account
I construct the collection creation request with the Java library, and the result (before the actual request) looks like this (DocumentCollection::toJson()):
{
"uniqueKeyPolicy": {},
"partitionKey":
{
"kind": "Hash",
"paths": ["/playerId"]
},
"indexingPolicy":
{
"indexingMode": "Consistent",
"automatic": true,
"includedPaths": [
{
"path": "/gameId/?",
"indexes": [
{
"kind": "Range",
"dataType": "String"
}
]
},
{
"path": "/playerId/?",
"indexes": [
{
"kind": "Range",
"dataType": "String"
}
]
},
{
"path": "/date/*",
"indexes": [
{
"kind": "Range",
"dataType": "String"
}
]
}
],
"excludedPaths": [
{
"path": "/*"
}
]
},
"id": "Games"
}
The request completes successfully, but if I check the actual indexing policy with Data Explorer or DocumentClient.readCollection it looks like this:
{
"indexingMode": "consistent",
"automatic": true,
"includedPaths": [
{
"path": "/gameId/?",
"indexes": []
},
{
"path": "/playerId/?",
"indexes": []
},
{
"path": "/date/*",
"indexes": []
}
],
"excludedPaths": [
{
"path": "/*"
},
{
"path": "/\"_etag\"/?"
}
]
}
As you can see, the arrays of index definitions are empty.
If I then copy the indexing policy from the SDK-generated output and manually paste it into the Emulator or Portal 'Scale and settings' window for the created collection, the outcome of the update becomes the following:
{
"indexingMode": "consistent",
"automatic": true,
"includedPaths": [
{
"path": "/gameId/?",
"indexes": [
{
"kind": "Range",
"dataType": "String",
"precision": -1
},
{
"kind": "Range",
"dataType": "Number",
"precision": -1
}
]
},
{
"path": "/playerId/?",
"indexes": [
{
"kind": "Range",
"dataType": "String",
"precision": -1
},
{
"kind": "Range",
"dataType": "Number",
"precision": -1
}
]
},
{
"path": "/date/*",
"indexes": [
{
"kind": "Range",
"dataType": "String",
"precision": -1
},
{
"kind": "Range",
"dataType": "Number",
"precision": -1
}
]
}
],
"excludedPaths": [
{
"path": "/*"
},
{
"path": "/\"_etag\"/?"
}
]
}
So the indexes can be created (although with an extra Number entry, as mentioned here).
Am I doing something wrong in my creation script?
Please follow the GitHub source code and refer to the code below, which works for me.
import com.microsoft.azure.documentdb.*;

import java.util.ArrayList;
import java.util.Collection;

public class CreateCollectionTest {

    private static String YOUR_COSMOS_DB_ENDPOINT = "https://***.documents.azure.com:443/";
    private static String YOUR_COSMOS_DB_MASTER_KEY = "***";

    public static void main(String[] args) throws DocumentClientException {
        DocumentClient client = new DocumentClient(
                YOUR_COSMOS_DB_ENDPOINT,
                YOUR_COSMOS_DB_MASTER_KEY,
                new ConnectionPolicy(),
                ConsistencyLevel.Session);

        DocumentCollection collection = new DocumentCollection();
        collection.set("id", "game");

        // Build the indexing policy explicitly: one included path with
        // both a String and a Number range index at precision -1.
        IndexingPolicy indexingPolicy = new IndexingPolicy();
        Collection<IncludedPath> includedPaths = new ArrayList<IncludedPath>();

        IncludedPath includedPath = new IncludedPath();
        includedPath.setPath("/gameId/?");

        Collection<Index> indexes = new ArrayList<Index>();
        Index stringIndex = Index.Range(DataType.String);
        stringIndex.set("precision", -1);
        indexes.add(stringIndex);

        Index numberIndex = Index.Range(DataType.Number);
        numberIndex.set("precision", -1);
        indexes.add(numberIndex);

        includedPath.setIndexes(indexes);
        includedPaths.add(includedPath);
        indexingPolicy.setIncludedPaths(includedPaths);
        collection.setIndexingPolicy(indexingPolicy);

        ResourceResponse<DocumentCollection> createColl =
                client.createCollection("dbs/db", collection, null);
    }
}
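To double-check what actually got persisted, you can read the collection back at the end of main and print the stored indexing policy. A small sketch; it assumes the collection link dbs/db/colls/game produced by the creation call above:
// Read the collection back and print the indexing policy that was actually stored.
DocumentCollection created = client.readCollection("dbs/db/colls/game", null).getResource();
System.out.println(created.getIndexingPolicy().toJson());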

Nested "for loop" searches in SQL - Azure CosmosDB

I am using Cosmos DB and have a document with the following simplified structure:
{
"id1":"123",
"stuff": [
{
"id2": "stuff",
"a": {
"b": {
"c": {
"d": [
{
"e": [
{
"id3": "things",
"name": "animals",
"classes": [
{
"name": "ostrich",
"meta": 1
},
{
"name": "big ostrich",
"meta": 1
}
]
},
{
"id3": "default",
"name": "other",
"classes": [
{
"name": "green trees",
"meta": 1
},
{
"name": "trees",
"score": 1
}
]
}
]
}
]
}
}
}
}
]
}
My issue is that I have an array of these documents and need to search the name field to see if it matches my search word. For example, I want both 'big trees' and 'trees' to be returned if a user types in 'trees'.
So currently I push every document into an array and do the following:
For each document
    for each stuff
        for each a.b.c.d[0].e
            for each classes
                var splice = name.split(' ')
                if (splice.includes(searchWord))
                    return id1, id2 and id3.
With Cosmos DB I am using SQL with the following code:
client.queryDocuments(
collection,
`SELECT * FROM root r`
).toArray((err, results) => {stuff});
This effectively brings every document in my collection into an array so that I can perform the manual search described above.
This is going to cause issues once I have thousands or millions of documents in the collection, and I believe I should be leveraging the search mechanics available within Cosmos itself. Is anyone able to help me work out what SQL query would be able to perform this type of function?
Having searched everything, is it also possible to search only the 5 latest documents?
Thanks for any insight in advance!
1. Is anyone able to help me to work out what SQL query would be able to perform this type of function?
According to your sample and description, I suggest using ARRAY_CONTAINS in Cosmos DB SQL. Please refer to my sample:
Sample documents:
[
{
"id1": "123",
"stuff": [
{
"id2": "stuff",
"a": {
"b": {
"c": {
"d": [
{
"e": [
{
"id3": "things",
"name": "animals",
"classes": [
{
"name": "ostrich",
"meta": 1
},
{
"name": "big ostrich",
"meta": 1
}
]
},
{
"id3": "default",
"name": "other",
"classes": [
{
"name": "green trees",
"meta": 1
},
{
"name": "trees",
"score": 1
}
]
}
]
}
]
}
}
}
}
]
},
{
"id1": "456",
"stuff": [
{
"id2": "stuff2",
"a": {
"b": {
"c": {
"d": [
{
"e": [
{
"id3": "things2",
"name": "animals",
"classes": [
{
"name": "ostrich",
"meta": 1
},
{
"name": "trees",
"meta": 1
}
]
},
{
"id3": "default2",
"name": "other",
"classes": [
{
"name": "green trees",
"meta": 1
},
{
"name": "trees",
"score": 1
}
]
}
]
}
]
}
}
}
}
]
},
{
"id1": "789",
"stuff": [
{
"id2": "stuff3",
"a": {
"b": {
"c": {
"d": [
{
"e": [
{
"id3": "things3",
"name": "animals",
"classes": [
{
"name": "ostrich",
"meta": 1
},
{
"name": "big",
"meta": 1
}
]
},
{
"id3": "default3",
"name": "other",
"classes": [
{
"name": "big trees",
"meta": 1
}
]
}
]
}
]
}
}
}
}
]
}
]
Query:
SELECT distinct c.id1,stuff.id2,e.id3 FROM c
join stuff in c.stuff
join d in stuff.a.b.c.d
join e in d.e
where ARRAY_CONTAINS(e.classes,{name:"trees"},true)
or ARRAY_CONTAINS(e.classes,{name:"big trees"},true)
Output: (the screenshot of the matching id1, id2, id3 rows is omitted here)
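If you want a substring match instead of enumerating the exact name values, one variation (an untested sketch that joins one level further down into classes and uses the built-in CONTAINS string function) would be:
SELECT DISTINCT c.id1, stuff.id2, e.id3
FROM c
JOIN stuff IN c.stuff
JOIN d IN stuff.a.b.c.d
JOIN e IN d.e
JOIN cls IN e.classes
WHERE CONTAINS(cls.name, "trees")
This would match "trees", "green trees" and "big trees" with a single predicate, and on the Node side you could pass the query (with the search word as a parameter) to client.queryDocuments instead of SELECT * FROM root r.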
2. Having searched everything, is it also possible to search the 5 latest documents?
Per my research, a LIMIT clause is not supported in Cosmos DB so far; however, TOP is supported. So if you can add a sort field (such as a date or id), you could use SQL like:
SELECT TOP 5 * FROM c ORDER BY c.sort DESC