How to pass column name to toscalar function? - kql

I have a Kusto table where one of the columns is of dynamic type and holds nested JSON.
If I use ColumnName inside the toscalar() function, I get the error below. Is there any way to do this, or is it impossible?
let T = datatable(ColumnName:dynamic)
[
dynamic({"OtherField": "Unknown","First": [{"Id": "","Second": [{"ConfidenceLevel": "Low","Count": 3}]},{"Id": "","Second":[{"ConfidenceLevel": "High","Count": 0}]}]}),
dynamic({"OtherField": "Unknown","First": [{"Id": "","Second": [{"ConfidenceLevel": "Low","Count": 3}]},{"Id": "","Second":[{"ConfidenceLevel": "High","Count": 2}]}]})
];
let result = T
// The following line works, but regex is not allowed during review.
// | where tostring(ColumnName) matches regex '"ConfidenceLevel":"High","Count":[^0]'
| where isnotnull(toscalar(
// print s = '{"OtherField": "Unknown","First": [{"Id": "","Second": [{"ConfidenceLevel": "Low","Count": 3}]},{"Id": "","Second":[{"ConfidenceLevel": "High","Count": 0}]}]}'
print s = tostring(ColumnName) // Error here: The name 'ColumnName' does not refer to any column, table, variable or function.
| project obj0 = parse_json(s)
| mv-expand obj1 = obj0.First
| mv-expand obj2 = obj1.Second
| where obj2.ConfidenceLevel == "High" and obj2.Count > 0)
)
;
result
I tried and confirmed the error is caused by the toscalar function.
Expected result (the second row should be selected):
ColumnName
{"OtherField":"Unknown","First":[{"Id":"","Second":[{"ConfidenceLevel":"Low","Count":3}]},{"Id":"","Second":[{"ConfidenceLevel":"High","Count":2}]}]}

The documentation is very clear about this:
"The toscalar() function can't be applied on row-level (for-each-row scenario)."
Use a nested mv-apply instead, to deal with the nested arrays:
let T = datatable(ColumnName:dynamic)
[
dynamic({"OtherField": "Unknown","First": [{"Id": "","Second": [{"ConfidenceLevel": "Low","Count": 3}]},{"Id": "","Second":[{"ConfidenceLevel": "High","Count": 0}]}]}),
dynamic({"OtherField": "Unknown","First": [{"Id": "","Second": [{"ConfidenceLevel": "Low","Count": 3}]},{"Id": "","Second":[{"ConfidenceLevel": "High","Count": 2}]}]})
];
T
| mv-apply ColumnName.First on
(
    mv-apply ColumnName_First.Second on
    (
        where ColumnName_First_Second.ConfidenceLevel == "High"
            and ColumnName_First_Second.Count > 0
    )
)
| project ColumnName
ColumnName
{"OtherField":"Unknown","First":[{"Id":"","Second":[{"ConfidenceLevel":"Low","Count":3}]},{"Id":"","Second":[{"ConfidenceLevel":"High","Count":2}]}]}
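For background, the reason toscalar() fails in the question is that it can only produce one constant, computed once for the entire query; it has no access to the current row. A minimal sketch of where it is allowed (an illustration, not part of the original answer):
let total = toscalar(T | count); // fine: evaluated once, before any per-row processing
T
| extend rowsInTable = total // the resulting constant is then available on every row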

Related

Query a single element from a dictionary-like column in SQL

I have a PostgreSQL table that looks as follows:
id           | order_id | products
[PK] integer | integer  | character varying
-------------|----------|---------------------------------
1            | 123      | {"type": "foo", "counts": 2}
2            | 456      | {"type": "foobar", "counts": 4}
3            | 789      | {"type": "foo", "counts": 1}
4            | 678      | {"type": "baz", "counts": 3}
I would like to query for only the rows where type = foo.
In another query, I've successfully used the following:
SELECT
  table_a.data::json->>'type' AS prod_type
FROM table_a
But this only works because the data column there is of JSON type.
How would I index into the products column so that I only return rows where type = foo?
1 123 {"type": "foo", "counts": 2}
3 789 {"type": "foo", "counts": 1}
Thanks!
You can try something like this:
SELECT *
FROM test
WHERE products::json->>'type' = 'foo';
Sample output:
 id | order_id | products
----+----------+------------------------------
  1 |      123 | {"type": "foo", "counts": 2}
  3 |      789 | {"type": "foo", "counts": 1}
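If this filter is frequent, a further option (a sketch, assuming you control the schema and every stored value is valid JSON) is to convert the column to jsonb once, so queries need no per-row cast and the column can be indexed natively:
ALTER TABLE test ALTER COLUMN products TYPE jsonb USING products::jsonb;
-- afterwards the cast disappears from the query:
SELECT * FROM test WHERE products->>'type' = 'foo';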

Make a property bag from a list of keys and values

I have a list containing the keys and another list containing the values (obtained by splitting a log line). How can I combine the two to make a property bag in Kusto?
let headers = pack_array("A", "B", "C");
datatable(RawData:string)
[
"1,2,3",
"4,5,6",
]
| extend fields = split(RawData, ",")
| extend dict = ???
Expected:
dict
-----
{"A": 1, "B": 2, "C": 3}
{"A": 4, "B": 5, "C": 6}
Here's one option that uses a combination of:
mv-apply: https://learn.microsoft.com/en-us/azure/data-explorer/kusto/query/mv-applyoperator
pack(): https://learn.microsoft.com/en-us/azure/data-explorer/kusto/query/packfunction
make_bag(): https://learn.microsoft.com/en-us/azure/data-explorer/kusto/query/make-bag-aggfunction
let keys = pack_array("A", "B", "C");
datatable(RawData:string)
[
"1,2,3",
"4,5,6",
]
| project values = split(RawData, ",")
| mv-apply with_itemindex = i key = keys to typeof(string) on (
summarize dict = make_bag(pack(key, values[i]))
)
values          | dict
----------------|---------------------------------
["1", "2", "3"] | {"A": "1", "B": "2", "C": "3"}
["4", "5", "6"] | {"A": "4", "B": "5", "C": "6"}
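An alternative sketch that avoids the explicit item index, assuming the zip() function is available on your cluster (it pairs up the two arrays element by element):
let keys = pack_array("A", "B", "C");
datatable(RawData:string)
[
"1,2,3",
"4,5,6",
]
| project values = split(RawData, ",")
| mv-apply pair = zip(keys, values) on (
    summarize dict = make_bag(bag_pack(tostring(pair[0]), pair[1]))
)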

Collecting distinct key-value pairs of Nested type in ClickHouse into arrays

I have data with the following schema in ClickHouse:
CREATE TABLE table AS (
key String,
…
nested Nested (
key String,
value String
)
) …
Some example data:
key | … | nested                                                    |
----|---|-----------------------------------------------------------|
k1  |   | [{"key": "a", "value": "1"}]                              |
k1  |   | [{"key": "a", "value": "2"}]                              |
k1  |   | [{"key": "a", "value": "1"}, {"key": "a", "value": "2"}]  |
k1  |   | [{"key": "b", "value": "3"}]                              |
I want to group by the key and collect all the distinct key-value pairs into two arrays:
key | nested.key | nested.value |
------|-----------------|------------------|
k1 | ["a", "a", "b"] | ["1", "2", "3"] |
What is the simplest and most efficient way to do this in ClickHouse?
I would suggest this query:
SELECT DISTINCT
key,
arrayDistinct(groupArray((nested.key, nested.value))) AS distinctNested,
arrayMap(x -> (x.1), distinctNested) AS `nested.keys`,
arrayMap(x -> (x.2), distinctNested) AS `nested.values`
FROM test.table_002
ARRAY JOIN nested
GROUP BY key
/* Result
┌─key─┬─distinctNested──────────────────┬─nested.keys───┬─nested.values─┐
│ k1 │ [('a','1'),('a','2'),('b','3')] │ ['a','a','b'] │ ['1','2','3'] │
└─────┴─────────────────────────────────┴───────────────┴───────────────┘
*/
/* Test data preparing */
CREATE TABLE test.table_002 (
key String,
nested Nested (key String, value String)
) ENGINE = Memory;
INSERT INTO test.table_002
FORMAT JSONEachRow
{"key": "k1", "nested.key":["a"], "nested.value": ["1"]}
{"key": "k1", "nested.key":["a"], "nested.value": ["2"]}
{"key": "k1", "nested.key":["a", "a"], "nested.value": ["1", "2"]}
{"key": "k1", "nested.key":["b"], "nested.value": ["3"]}
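The key step above is ARRAY JOIN, which unnests the parallel nested.key/nested.value arrays into one row per pair before they are re-aggregated; a minimal illustration against the same test table:
SELECT key, nested.key, nested.value
FROM test.table_002
ARRAY JOIN nested
/* Result: one row per key-value pair
k1 | a | 1
k1 | a | 2
k1 | a | 1
k1 | a | 2
k1 | b | 3
*/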

Lookup smallest value greater than current

I have an objects table and a lookup table. In the objects table, I'm looking to add the smallest value from the lookup table that is greater than the object's number.
I found this similar question, but it's about finding a value greater than a constant, rather than one that changes for each row.
In code:
import pandas as pd
objects = pd.DataFrame([{"id": 1, "number": 10}, {"id": 2, "number": 30}])
lookup = pd.DataFrame([{"number": 3}, {"number": 12}, {"number": 40}])
expected = pd.DataFrame(
    [
        {"id": 1, "number": 10, "smallest_greater": 12},
        {"id": 2, "number": 30, "smallest_greater": 40},
    ]
)
First compare each value of lookup['number'] against objects['number'] to build a 2D boolean mask of which lookup values are strictly greater, then take the cumulative sum along each row and compare it with 1 to mark the first match, and get its position with numpy.argmax to select the value from lookup['number'].
The output is assembled with numpy.where, which overwrites all unmatched rows with NaN.
import numpy as np
import pandas as pd

objects = pd.DataFrame([{"id": 1, "number": 10}, {"id": 2, "number": 30},
                        {"id": 3, "number": 100}, {"id": 4, "number": 1}])
print (objects)
id number
0 1 10
1 2 30
2 3 100
3 4 1
# 2D mask: for each object's number (one row per object), which lookup values are strictly greater
m1 = lookup['number'].values > objects['number'].values[:, None]
# True exactly at the first greater value in each row
m2 = np.cumsum(m1, axis=1) == 1
# rows that have at least one greater lookup value
m3 = np.any(m1, axis=1)
# position of the first match per row selects the smallest greater value
out = lookup['number'].values[m2.argmax(axis=1)]
# keep matches, fill NaN where nothing in lookup is greater
objects['smallest_greater'] = np.where(m3, out, np.nan)
print (objects)
id number smallest_greater
0 1 10 12.0
1 2 30 40.0
2 3 100 NaN
3 4 1 3.0
smallest_greater = []
for i in objects['number']:
    # note: assumes some lookup value is greater than i; raises IndexError otherwise
    smallest_greater.append(lookup['number'][lookup[lookup['number'] > i].sort_values(by='number').index[0]])
objects['smallest_greater'] = smallest_greater
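For reference, a more idiomatic sketch (not from the original answers) using pandas.merge_asof, which performs exactly this kind of nearest-key lookup; both frames must be sorted by the join key:
import pandas as pd

objects = pd.DataFrame([{"id": 1, "number": 10}, {"id": 2, "number": 30}])
lookup = pd.DataFrame([{"number": 3}, {"number": 12}, {"number": 40}])

result = pd.merge_asof(
    objects.sort_values("number"),
    lookup.rename(columns={"number": "smallest_greater"}),
    left_on="number",
    right_on="smallest_greater",
    direction="forward",        # nearest value at or after "number"...
    allow_exact_matches=False,  # ...but strictly greater than it
)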

SQL-style GROUP BY aggregate functions in jq (COUNT, SUM, etc.)

Similar questions have been asked here before:
Count items for a single key: jq count the number of items in json by a specific key
Calculate the sum of object values:
How do I sum the values in an array of maps in jq?
Question
How can one emulate the COUNT aggregate function so that it behaves like its SQL counterpart? Let's extend the question to cover other common SQL aggregate functions as well:
COUNT
SUM / MAX / MIN / AVG
ARRAY_AGG
The last one is not a standard SQL function; it comes from PostgreSQL, but it is quite useful.
The input is a stream of valid JSON objects. For demonstration, let's pick a simple story of owners and their pets.
Model and data
Base relation: Owner
id name age
1 Adams 25
2 Baker 55
3 Clark 40
4 Davis 31
Base relation: Pet
id name litter owner_id
10 Bella 4 1
20 Lucy 2 1
30 Daisy 3 2
40 Molly 4 3
50 Lola 2 4
60 Sadie 4 4
70 Luna 3 4
Source
From the above we get a derived relation, Owner_Pet (the result of an SQL JOIN of the two base relations), presented in JSON format for our jq queries (the source data):
{ "owner_id": 1, "owner": "Adams", "age": 25, "pet_id": 10, "pet": "Bella", "litter": 4 }
{ "owner_id": 1, "owner": "Adams", "age": 25, "pet_id": 20, "pet": "Lucy", "litter": 2 }
{ "owner_id": 2, "owner": "Baker", "age": 55, "pet_id": 30, "pet": "Daisy", "litter": 3 }
{ "owner_id": 3, "owner": "Clark", "age": 40, "pet_id": 40, "pet": "Molly", "litter": 4 }
{ "owner_id": 4, "owner": "Davis", "age": 31, "pet_id": 50, "pet": "Lola", "litter": 2 }
{ "owner_id": 4, "owner": "Davis", "age": 31, "pet_id": 60, "pet": "Sadie", "litter": 4 }
{ "owner_id": 4, "owner": "Davis", "age": 31, "pet_id": 70, "pet": "Luna", "litter": 3 }
Requests
Here are sample requests and their expected output:
COUNT the number of pets per owner:
{ "owner_id": 1, "owner": "Adams", "age": 25, "pets_count": 2 }
{ "owner_id": 2, "owner": "Baker", "age": 55, "pets_count": 1 }
{ "owner_id": 3, "owner": "Clark", "age": 40, "pets_count": 1 }
{ "owner_id": 4, "owner": "Davis", "age": 31, "pets_count": 3 }
SUM up the number of whelps per owner and get their MAX (MIN/AVG):
{ "owner_id": 1, "owner": "Adams", "age": 25, "litter_total": 6, "litter_max": 4 }
{ "owner_id": 2, "owner": "Baker", "age": 55, "litter_total": 3, "litter_max": 3 }
{ "owner_id": 3, "owner": "Clark", "age": 40, "litter_total": 4, "litter_max": 4 }
{ "owner_id": 4, "owner": "Davis", "age": 31, "litter_total": 9, "litter_max": 4 }
ARRAY_AGG pets per owner:
{ "owner_id": 1, "owner": "Adams", "age": 25, "pets": [ "Bella", "Lucy" ] }
{ "owner_id": 2, "owner": "Baker", "age": 55, "pets": [ "Daisy" ] }
{ "owner_id": 3, "owner": "Clark", "age": 40, "pets": [ "Molly" ] }
{ "owner_id": 4, "owner": "Davis", "age": 31, "pets": [ "Lola", "Sadie", "Luna" ] }
Here's an alternative that uses only basic jq, with no custom functions. (I took the liberty of getting rid of redundant parts of the question.)
Count
In> jq -s 'group_by(.owner_id) | map({ owner_id: .[0].owner_id, count: map(.pet) | length})'
Out> [{"owner_id":1,"count":2}, ...]
Sum
In> jq -s 'group_by(.owner_id) | map({owner_id: .[0].owner_id, sum: map(.litter) | add})'
Out> [{"owner_id":1,"sum":6}, ...]
Max
In> jq -s 'group_by(.owner_id) | map({owner_id: .[0].owner_id, max: map(.litter) | max})'
Out> [{"owner_id":1,"max":4}, ...]
Aggregate
In> jq -s 'group_by(.owner_id) | map({owner_id: .[0].owner_id, agg: map(.pet) })'
Out> [{"owner_id":1,"agg":["Bella","Lucy"]}, ...]
Sure, these might not be the most efficient implementations, but they show nicely how to implement such functions oneself. All that changes between the different functions is what's inside the last map and the function after the pipe | (length, add, max).
The first map iterates over the groups, taking the name from the first item, and uses map again to iterate over the items of the same group. Not as pretty as SQL, but not terribly more complicated.
I learned jq today and managed to do this already, so this should be encouraging for anyone getting started. jq is neither like sed nor like SQL, but it is not terribly hard either.
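For completeness, AVG (also requested in the question) follows the same pattern; a sketch:
In> jq -s 'group_by(.owner_id) | map({owner_id: .[0].owner_id, avg: (map(.litter) | add / length)})'
Out> [{"owner_id":1,"avg":3}, ...]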
Extended jq solution:
Custom count() function:
jq -sc 'def count($k): group_by(.[$k])[] | length as $l | .[0]
| .pets_count = $l
| del(.pet_id, .pet, .litter);
count("owner_id")' source.data
The output:
{"owner_id":1,"owner":"Adams","age":25,"pets_count":2}
{"owner_id":2,"owner":"Baker","age":55,"pets_count":1}
{"owner_id":3,"owner":"Clark","age":40,"pets_count":1}
{"owner_id":4,"owner":"Davis","age":31,"pets_count":3}
Custom sum() function:
jq -sc 'def sum($k): group_by(.[$k])[] | map(.litter) as $litters | .[0]
| . + {litter_total: $litters | add, litter_max: $litters | max}
| del(.pet_id, .pet, .litter);
sum("owner_id")' source.data
The output:
{"owner_id":1,"owner":"Adams","age":25,"litter_total":6,"litter_max":4}
{"owner_id":2,"owner":"Baker","age":55,"litter_total":3,"litter_max":3}
{"owner_id":3,"owner":"Clark","age":40,"litter_total":4,"litter_max":4}
{"owner_id":4,"owner":"Davis","age":31,"litter_total":9,"litter_max":4}
Custom array_agg() function:
jq -sc 'def array_agg($k): group_by(.[$k])[] | map(.pet) as $pets | .[0]
| .pets = $pets | del(.pet_id, .pet, .litter);
array_agg("owner_id")' source.data
The output:
{"owner_id":1,"owner":"Adams","age":25,"pets":["Bella","Lucy"]}
{"owner_id":2,"owner":"Baker","age":55,"pets":["Daisy"]}
{"owner_id":3,"owner":"Clark","age":40,"pets":["Molly"]}
{"owner_id":4,"owner":"Davis","age":31,"pets":["Lola","Sadie","Luna"]}
This is a nice exercise, but SO is not a programming service, so I will focus here on some key concepts for generic solutions in jq that are efficient, even for very large collections.
GROUPS_BY
The key to efficiency here is avoiding the built-in group_by, as it requires sorting. Since jq is fundamentally stream-oriented, the following definition of GROUPS_BY is likewise stream-oriented. It takes advantage of the efficiency of key-based lookups, while avoiding calling tojson on strings:
# emit a stream of the groups defined by f
def GROUPS_BY(stream; f):
  reduce stream as $x ({};
    ($x|f) as $s
    | ($s|type) as $t
    | (if $t == "string" then $s else ($s|tojson) end) as $y
    | .[$t][$y] += [$x] )
  | .[][] ;
distinct and count_distinct
# Emit an array of the distinct entities in `stream`, without sorting
def distinct(stream):
  reduce stream as $x ({};
    ($x|type) as $t
    | (if $t == "string" then $x else ($x|tojson) end) as $y
    | if (.[$t] // {} | has($y)) then . else .[$t][$y] += [$x] end )
  | [.[][]] | add ;
# Emit the number of distinct items in the given stream
def count_distinct(stream):
  def sum(s): reduce s as $x (0; .+$x);
  reduce stream as $x ({};
    ($x|type) as $t
    | (if $t == "string" then $x else ($x|tojson) end) as $y
    | .[$t][$y] = 1 )
  | sum(.[][]) ;
Convenience function
def owner: {owner_id,owner,age};
Example: "COUNT the number of pets per owner"
GROUPS_BY(inputs; .owner_id)
| (.[0] | owner) + {pets_count: count_distinct(.[]|.pet_id)}
Invocation: jq -nc -f program1.jq input.json
Output:
{"owner_id":1,"owner":"Adams","age":25,"pets_count":2}
{"owner_id":2,"owner":"Baker","age":55,"pets_count":1}
{"owner_id":3,"owner":"Clark","age":40,"pets_count":1}
{"owner_id":4,"owner":"Davis","age":31,"pets_count":3}
Example: "SUM up the number of whelps per owner and get their MAX"
GROUPS_BY(inputs; .owner_id)
| (.[0] | owner)
+ {litter_total: (map(.litter) | add)}
+ {litter_max: (map(.litter) | max)}
Invocation: jq -nc -f program2.jq input.json
Output: as given.
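MIN and AVG (requested in the question but not shown above) drop into the same template; a sketch, with a hypothetical program file name:
GROUPS_BY(inputs; .owner_id)
| (.[0] | owner)
+ {litter_min: (map(.litter) | min)}
+ {litter_avg: (map(.litter) | add / length)}
Invocation: jq -nc -f program2a.jq input.json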
Example: "ARRAY_AGG pets per owner"
GROUPS_BY(inputs; .owner_id)
| (.[0] | owner) + {pets: distinct(.[]|.pet)}
Invocation: jq -nc -f program3.jq input.json
Output:
{"owner_id":1,"owner":"Adams","age":25,"pets":["Bella","Lucy"]}
{"owner_id":2,"owner":"Baker","age":55,"pets":["Daisy"]}
{"owner_id":3,"owner":"Clark","age":40,"pets":["Molly"]}
{"owner_id":4,"owner":"Davis","age":31,"pets":["Lola","Sadie","Luna"]}