Mongodb aggregation to find outliers - pandas

In my mongodb collection documents are stored in the following format:
{ "_id" : ObjectId("62XXXXXX"), "res" : 12, ... }
{ "_id" : ObjectId("63XXXXXX"), "res" : 23, ... }
{ "_id" : ObjectId("64XXXXXX"), "res" : 78, ... }
...
I need to extract id's for the document for which the value of "res" is outlier (i.e. value < Q1 - 1.5 * IQR or value > Q3 + 1.5 * IQR (Q1, Q3 are percentiles)). I have done this using pandas functionality by retrieving all documents from the collection, which may become slow if the number of documents in collection become too big.
Is there a way to do this using mongodb aggregation pipeline (or just calculating percentiles)?

If I understand how you want to retrieve outliers, here's one way you might be able to do it.
db.collection.aggregate([
{ // partition res into quartiles
"$bucketAuto": {
"groupBy": "$res",
"buckets": 4
}
},
{ // get the max of each quartile
"$group": {
"_id": "$_id.max"
}
},
{ // sort the quartile maxs
"$sort": {
"_id": 1
}
},
{ // put sorted quartile maxs into array
"$group": {
"_id": null,
"maxs": {"$push": "$_id"}
}
},
{ // assign Q1 and Q3
"$project": {
"_id": 0,
"q1": {"$arrayElemAt": ["$maxs", 0]},
"q3": {"$arrayElemAt": ["$maxs", 2]}
}
},
{ // set IQR
"$set": {
"iqr": {
"$subtract": ["$q3", "$q1"]
}
}
},
{ // assign upper/lower outlier thresholds
"$project": {
"outlierThresholdLower": {
"$subtract": [
"$q1",
{"$multiply": ["$iqr", 1.5]}
]
},
"outlierThresholdUpper": {
"$add": [
"$q3",
{"$multiply": ["$iqr", 1.5]}
]
}
}
},
{ // get outlier _id's
"$lookup": {
"from": "collection",
"as": "outliers",
"let": {
"oTL": "$outlierThresholdLower",
"oTU": "$outlierThresholdUpper"
},
"pipeline": [
{
"$match": {
"$expr": {
"$or": [
{"$lt": ["$res", "$$oTL"]},
{"$gt": ["$res", "$$oTU"]}
]
}
}
},
{
"$project": {
"_id": 1
}
}
]
}
}
])
Try it on mongoplayground.net.

One more option based on #rickhg12hs's answer, is to use $setWindowFields:
db.collection.aggregate([
{$setWindowFields: {
sortBy: {res: 1},
output: {
totalCount: {$count: {}},
index: {$sum: 1, window: {documents: ["unbounded", "current"]}}
}
}
},
{$match: {
$expr: {$lte: [
{$abs: {$subtract: [
{$mod: [
{$multiply: [
{$add: ["$index", {$round: {$divide: ["$totalCount", 4]}}]}, 2]},
"$totalCount"
]}, 0]}
}, 1]}
}},
{$group: {_id: null, res: {$push: "$res"}}},
{$project: {_id: 0, q1: {$first: "$res"}, q3: {$last: "$res"},
iqr: {"$subtract": [{$last: "$res"}, {$first: "$res"}]}
}},
{$project: {
outlierThresholdLower: {$subtract: ["$q1", {$multiply: ["$iqr", 1.5]}]},
outlierThresholdUpper: {$add: ["$q3", {$multiply: ["$iqr", 1.5]}]}
}
},
{$lookup: {
from: "collection",
as: "outliers",
let: {oTL: "$outlierThresholdLower", oTU: "$outlierThresholdUpper"},
pipeline: [
{$match: {$expr: {$or: [{$lt: ["$res", "$$oTL"]}, {$gt: ["$res", "$$oTU"]}]}}},
{$project: {_id: 1}}
]
}
}
])
See how it works on the playground example

Related

Query item in nested array

Customer appointments with top level locationId sample data set:
[
{
"locationId": 9999,
"customerAppointments": [
{
"customerId": "1",
"appointments": [
{
"appointmentId": "cbbce566-da59-42c2-8845-53976ba63d56",
"locationName": "Sullivan St"
},
{
"appointmentId": "5f09e2af-ddae-47aa-9f7c-fd1001a9c5e6",
"locationName": "Oak St"
}
]
},
{
"customerId": "2",
"appointments": [
{
"appointmentId": "964a3c1c-ccec-4082-99e2-65795352ba79",
"locationName": "Kellet St"
}
]
},
{
"customerId": "3",
"appointments": []
}
]
},
{
...
},
{
...
}
]
I need to pull out appointment by locationId and customerId and only get the appointment for that customerId e.g
Sample response:
[
{
"appointmentId": "964a3c1c-ccec-4082-99e2-65795352ba79",
"locationName": "Kellet St"
}
]
Tried below query, but it just returns all records for all customers ids (which is kind of expected):
db.getCollection("appointments").find(
{
"locationId" : NumberInt(9999),
"customerAppointments" : {
"$elemMatch" : {
"customerId" : "2"
}
}
}
);
But how can I get just the appointment record for a specific customerId?
When asking this question I was unaware of the older version of MongoDB driver (< v5) so we cannot use the $getField operator.
However, this query seems to work well:
db.getCollection("appointments").aggregate([
{
$match: {
"locationId": NumberInt(9999)
}
},
{
$unwind: "$customerAppointments"
},
{
$match: {
"customerAppointments.customerId": "2"
}
},
{
$project: {
appointments: "$customerAppointments.appointments"
}
}
]);
Yields:
{
"_id" : ObjectId("63eebe95c7a0da54804c1db2"),
"appointments" : [
{
"appointmentId" : "964a3c1c-ccec-4082-99e2-65795352ba79",
"locationName" : "Kellet St"
}
]
}

Comparing two JSON objects with order of fields and subarrays shuffled in Karate [duplicate]

This question already has an answer here:
Asserting and using conditions for an array response in Karate
(1 answer)
Closed 2 years ago.
I want to loop through below nested json structure and want to update all the required fields, however I could achieve this through typescript but want to do this in karate JS, I do not see any examples how to nested for each works.
I want to update 26 periods data(here for readability i used 3), based on index I want to update period field, i.e. if(index == key), these 26 periods are under each car attribute.(NOTe: you again have multiple cars and multiple car attributes and each car attribute you have 26 periods data)
I cannot use this Karate - Match two dynamic responses only when you have single array list and have less data
[
{
"cars": [
{
"name": "car 1",
"periodsData": [
{
"period": "5ed73ed31a775d1ab0c9fb5c",
"index": 1
},
{
"period": "5ed73ed31a775d1ab0c9fb5d",
"index": 2
},
{
"period": "5ed73ed31a775d1ab0c9fb5e",
"index": 3
}
]
},
{
"name": "car 2",
"periodsData": [
{
"period": "5ed73ed31a775d1ab0c9fb5c",
"index": 1
},
{
"period": "5ed73ed31a775d1ab0c9fb5d",
"index": 2
},
{
"period": "5ed73ed31a775d1ab0c9fb5e",
"index": 3
}
]
},
{
"name": "car 3",
"periodsData": [
{
"period": "5ed73ed31a775d1ab0c9fb5c",
"index": 1
},
{
"period": "5ed73ed31a775d1ab0c9fb5d",
"index": 2
},
{
"period": "5ed73ed31a775d1ab0c9fb5e",
"index": 3
}
]
}
],
"totalPeriodEprps": [
{
"period": "5ed73ed31a775d1ab0c9fb5c",
"index": 1
},
{
"period": "5ed73ed31a775d1ab0c9fb5d",
"index": 2
},
{
"period": "5ed73ed31a775d1ab0c9fb5e",
"index": 3
}
]
}
carId ="dfd"
]
This above array repeats
Type script code
//periods is a map of index and values
async modifyCarsData(mid, id, periods, campaignData) {
//carData is a json file
carData.forEach(element => {
element.carId= id;
// Update all egrp periods
element.totalPeriodEGRPs.forEach(eGrpPeriod => {
// egrprd.period =
if (periods.size === element.totalPeriodEGRPs.length) {
periods.forEach((value, key) => {
if (key === eGrpPeriod.index.toString()) {
eGrpPeriod.period = value;
return true;
}
});
}
});
element.cars.forEach(carCell => {
// Logic for updating periods data
carCell .periodsData.forEach(periodAttribute => {
if (periods.size === carCell.periodsData.length) {
periods.forEach((value, key) => {
if (key === periodAttribute.index.toString()) {
periodAttribute.period = value;
return true;
}
});
}
});
});
});
Don't think of this as an update, but as a transform. I'm not using your example because it is un-necessarily complicated. Here is a simpler example that gives you all the concepts you need:
* def data = [{ name: 'one', periods: [{ index: 1, value: 'a' },{ index: 2, value: 'b' }]}, { name: 'two', periods: [{ index: 1, value: 'c' },{ index: 2, value: 'd' }]}]
* def fnPeriod = function(x){ x.value = x.value + x.index; return x }
* def fnData = function(x){ return { name: x.name, periods: karate.map(x.periods, fnPeriod) } }
* def converted = karate.map(data, fnData)
* print converted
Which prints:
[
{
"name": "one",
"periods": [
{
"index": 1,
"value": "a1"
},
{
"index": 2,
"value": "b2"
}
]
},
{
"name": "two",
"periods": [
{
"index": 1,
"value": "c1"
},
{
"index": 2,
"value": "d2"
}
]
}
]
If this doesn't work for you, please look for another tool. Karate is designed for testing and assertions, not doing what you normally do in programming languages. And I suspect that you have fallen into the trap of writing "over-smart tests", so please read this: https://stackoverflow.com/a/54126724/143475
Also refer: https://stackoverflow.com/a/53120851/143475

Karate - How to use nested for each in karate similar to javascript [duplicate]

This question already has an answer here:
Asserting and using conditions for an array response in Karate
(1 answer)
Closed 2 years ago.
I want to loop through below nested json structure and want to update all the required fields, however I could achieve this through typescript but want to do this in karate JS, I do not see any examples how to nested for each works.
I want to update 26 periods data(here for readability i used 3), based on index I want to update period field, i.e. if(index == key), these 26 periods are under each car attribute.(NOTe: you again have multiple cars and multiple car attributes and each car attribute you have 26 periods data)
I cannot use this Karate - Match two dynamic responses only when you have single array list and have less data
[
{
"cars": [
{
"name": "car 1",
"periodsData": [
{
"period": "5ed73ed31a775d1ab0c9fb5c",
"index": 1
},
{
"period": "5ed73ed31a775d1ab0c9fb5d",
"index": 2
},
{
"period": "5ed73ed31a775d1ab0c9fb5e",
"index": 3
}
]
},
{
"name": "car 2",
"periodsData": [
{
"period": "5ed73ed31a775d1ab0c9fb5c",
"index": 1
},
{
"period": "5ed73ed31a775d1ab0c9fb5d",
"index": 2
},
{
"period": "5ed73ed31a775d1ab0c9fb5e",
"index": 3
}
]
},
{
"name": "car 3",
"periodsData": [
{
"period": "5ed73ed31a775d1ab0c9fb5c",
"index": 1
},
{
"period": "5ed73ed31a775d1ab0c9fb5d",
"index": 2
},
{
"period": "5ed73ed31a775d1ab0c9fb5e",
"index": 3
}
]
}
],
"totalPeriodEprps": [
{
"period": "5ed73ed31a775d1ab0c9fb5c",
"index": 1
},
{
"period": "5ed73ed31a775d1ab0c9fb5d",
"index": 2
},
{
"period": "5ed73ed31a775d1ab0c9fb5e",
"index": 3
}
]
}
carId ="dfd"
]
This above array repeats
Type script code
//periods is a map of index and values
async modifyCarsData(mid, id, periods, campaignData) {
//carData is a json file
carData.forEach(element => {
element.carId= id;
// Update all egrp periods
element.totalPeriodEGRPs.forEach(eGrpPeriod => {
// egrprd.period =
if (periods.size === element.totalPeriodEGRPs.length) {
periods.forEach((value, key) => {
if (key === eGrpPeriod.index.toString()) {
eGrpPeriod.period = value;
return true;
}
});
}
});
element.cars.forEach(carCell => {
// Logic for updating periods data
carCell .periodsData.forEach(periodAttribute => {
if (periods.size === carCell.periodsData.length) {
periods.forEach((value, key) => {
if (key === periodAttribute.index.toString()) {
periodAttribute.period = value;
return true;
}
});
}
});
});
});
Don't think of this as an update, but as a transform. I'm not using your example because it is un-necessarily complicated. Here is a simpler example that gives you all the concepts you need:
* def data = [{ name: 'one', periods: [{ index: 1, value: 'a' },{ index: 2, value: 'b' }]}, { name: 'two', periods: [{ index: 1, value: 'c' },{ index: 2, value: 'd' }]}]
* def fnPeriod = function(x){ x.value = x.value + x.index; return x }
* def fnData = function(x){ return { name: x.name, periods: karate.map(x.periods, fnPeriod) } }
* def converted = karate.map(data, fnData)
* print converted
Which prints:
[
{
"name": "one",
"periods": [
{
"index": 1,
"value": "a1"
},
{
"index": 2,
"value": "b2"
}
]
},
{
"name": "two",
"periods": [
{
"index": 1,
"value": "c1"
},
{
"index": 2,
"value": "d2"
}
]
}
]
If this doesn't work for you, please look for another tool. Karate is designed for testing and assertions, not doing what you normally do in programming languages. And I suspect that you have fallen into the trap of writing "over-smart tests", so please read this: https://stackoverflow.com/a/54126724/143475
Also refer: https://stackoverflow.com/a/53120851/143475

How to implement group by in Dataweave based on first column in CSV

I have an incoming CSV file that looks like this (notice that the first field is common - this is the order number)
36319602,100,12458,HARVEY NORMAN,
36319602,101,12459,HARVEY NORMAN,
36319602,102,12457,HARVEY NORMAN,
36319601,110,12458,HARVEY NORMAN,
36319601,111,12459,HARVEY NORMAN,
36319601,112,12457,HARVEY NORMAN,
36319603,110,12458,HARVEY NORMAN,
36319603,121,12459,HARVEY NORMAN,
36319603,132,12457,HARVEY NORMAN,
This is my current Dataweave code
list_of_orders: {
order: payload map ((payload01 , indexOfPayload01) -> {
order_dtl:
[{
seq_nbr: payload01[1],
route_nbr: payload01[2]
}],
order_hdr: {
ord_nbr: payload01[0],
company: payload01[3],
city: payload01[4],
}
})
}
An example of the desired output would be something like this ... (this is just mocked up). Notice how I would like a single header grouped by the first column which is the order number - but with multiple detail lines
"list_of_orders": {
"order": [
{
"order_dtl": [
{
seq_nbr: 100,
route_nbr: 12458
},
{
seq_nbr: 101,
route_nbr: 12459
},
{
seq_nbr: 102,
route_nbr: 12457
}
],
"order_hdr":
{
ord_nbr: 36319602,
company: HARVEY NORMAN
}
}
]
}
It works fine except that it is repeating the order_hdr key.
What they would like is a single header key with multiple details beneath.
The grouping is to be based on "ord_nbr: payload01[0]"
Any help appreciated
Thanks
I think you're using Dataweave 1. In dw1, this groupBy gets the desired output(Note you can change the field pointers [0],1 etc to field name mappings if you have them set up as metadata etc):
%dw 1.0
%output application/json
---
list_of_orders: {
order: (payload groupBy ($[0])) map {
order_dtl: $ map {
seq_nbr: $[1],
route_nbr: $[2]
},
order_hdr:
{
ord_nbr: $[0][0],
company: $[0][3]
}
}}
UPDATE
Here is the output for the new input sample with multiple orders:
{
"list_of_orders": {
"order": [
{
"order_dtl": [
{
"seq_nbr": "110",
"route_nbr": "12458"
},
{
"seq_nbr": "121",
"route_nbr": "12459"
},
{
"seq_nbr": "132",
"route_nbr": "12457"
}
],
"order_hdr": {
"ord_nbr": "36319603",
"company": "HARVEY NORMAN"
}
},
{
"order_dtl": [
{
"seq_nbr": "100",
"route_nbr": "12458"
},
{
"seq_nbr": "101",
"route_nbr": "12459"
},
{
"seq_nbr": "102",
"route_nbr": "12457"
}
],
"order_hdr": {
"ord_nbr": "36319602",
"company": "HARVEY NORMAN"
}
},
{
"order_dtl": [
{
"seq_nbr": "110",
"route_nbr": "12458"
},
{
"seq_nbr": "111",
"route_nbr": "12459"
},
{
"seq_nbr": "112",
"route_nbr": "12457"
}
],
"order_hdr": {
"ord_nbr": "36319601",
"company": "HARVEY NORMAN"
}
}
]
}
}

hierarchical faceting with Elasticsearch

I'm using elasticsearch and need to implement facet search for hierarchical object as follow:
category 1 (10)
subcategory 1 (4)
subcategory 2 (6)
category 2 (X)
...
So I need to get facets for two related objects. Documentation says that it's possible to get such kind of facets for numeric value, but I need it for strings http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/search-facets-terms-stats-facet.html
Here is another interesting topic, unfortunately it's old: http://elasticsearch-users.115913.n3.nabble.com/Pivot-facets-td2981519.html
Does it possible with elastic search?
If so, how can I do that?
The previous solution works really well until you have no more than a multi-level tag on a single-document. In this case a simple aggregation doesn't work, because the flat structure of the lucene fields mix the results on the internal aggregation.
See the example below:
DELETE /test_category
POST /test_category
# Insert a doc with 2 hierarchical tags
POST /test_category/test/1
{
"categories": [
{
"cat_1": "1",
"cat_2": "1.1"
},
{
"cat_1": "2",
"cat_2": "2.2"
}
]
}
# Simple two-levels aggregations query
GET /test_category/test/_search?search_type=count
{
"aggs": {
"main_category": {
"terms": {
"field": "categories.cat_1"
},
"aggs": {
"sub_category": {
"terms": {
"field": "categories.cat_2"
}
}
}
}
}
}
That's the WRONG response that I have got on ES 1.4, where the fields on the internal aggregation are mixed at a document level:
{
...
"aggregations": {
"main_category": {
"buckets": [
{
"key": "1",
"doc_count": 1,
"sub_category": {
"buckets": [
{
"key": "1.1",
"doc_count": 1
},
{
"key": "2.2", <= WRONG
"doc_count": 1
}
]
}
},
{
"key": "2",
"doc_count": 1,
"sub_category": {
"buckets": [
{
"key": "1.1", <= WRONG
"doc_count": 1
},
{
"key": "2.2",
"doc_count": 1
}
]
}
}
]
}
}
}
A Solution can be to use nested objects. These are the steps to do:
1) Define a new type in the schema with nested objects
POST /test_category/test2/_mapping
{
"test2": {
"properties": {
"categories": {
"type": "nested",
"properties": {
"cat_1": {
"type": "string"
},
"cat_2": {
"type": "string"
}
}
}
}
}
}
# Insert a single document
POST /test_category/test2/1
{"categories":[{"cat_1":"1","cat_2":"1.1"},{"cat_1":"2","cat_2":"2.2"}]}
2) Run a nested aggregation query:
GET /test_category/test2/_search?search_type=count
{
"aggs": {
"categories": {
"nested": {
"path": "categories"
},
"aggs": {
"main_category": {
"terms": {
"field": "categories.cat_1"
},
"aggs": {
"sub_category": {
"terms": {
"field": "categories.cat_2"
}
}
}
}
}
}
}
}
That's the response, now correct, that I have got:
{
...
"aggregations": {
"categories": {
"doc_count": 2,
"main_category": {
"buckets": [
{
"key": "1",
"doc_count": 1,
"sub_category": {
"buckets": [
{
"key": "1.1",
"doc_count": 1
}
]
}
},
{
"key": "2",
"doc_count": 1,
"sub_category": {
"buckets": [
{
"key": "2.2",
"doc_count": 1
}
]
}
}
]
}
}
}
}
The same solution can be extended to a more than two-levels hierarchy facet.
Currently, elasticsearch does not support hierarchical facetting out-of-the-box. But the upcoming 1.0 release features a new aggregations module, that can be used to get these kind of facets (which are more like pivot-facets rather than hierarchical facets). Version 1.0 is currently in beta, you can download the second beta and test out aggregatins by yourself. Your example might look like
curl -XPOST 'localhost:9200/_search?pretty' -d '
{
"aggregations": {
"main category": {
"terms": {
"field": "cat_1",
"order": {"_term": "asc"}
},
"aggregations": {
"sub category": {
"terms": {
"field": "cat_2",
"order": {"_term": "asc"}
}
}
}
}
}
}'
The idea is, to have a different field for each level of facetting and bucket your facets based on the terms of the first level (cat_1). These aggregations then would have sub-buckets, based on the terms of the second level (cat_2). The result may look like
{
"aggregations" : {
"main category" : {
"buckets" : [ {
"key" : "category 1",
"doc_count" : 10,
"sub category" : {
"buckets" : [ {
"key" : "subcategory 1",
"doc_count" : 4
}, {
"key" : "subcategory 2",
"doc_count" : 6
} ]
}
}, {
"key" : "category 2",
"doc_count" : 7,
"sub category" : {
"buckets" : [ {
"key" : "subcategory 1",
"doc_count" : 3
}, {
"key" : "subcategory 2",
"doc_count" : 4
} ]
}
} ]
}
}
}