mongodb aggregate distinct count - mongodb-query

Realise this topic has been asked many times - but the advice hasn't helped me solve this problem.
The following query is trying to determine the presence of sales on a given weekday using ISODay. Because the query will be run at the start of the month, I need to know how many occurrences of the specific ISOday occur in the month.
var query = { eventType: 'Sale', site : 4, tank: 1, txnDate : { "$gt" : new Date('2018-08-01T00:00:00') } };
db.tankevent.aggregate([
{ $match: query },
{ $project : {
isoDay: { $isoDayOfWeek: "$txnDate" },
dayDate: { $dateToString: { format: "%d", date:"$txnDate" } }
}
},
{ $group:
{ _id : { isoday: "$isoDay", dday: "$dayDate" }, count: { "$sum" : 1 } }
},
{ $sort: { "_id.isoday": 1, "_id.dday": 1 } }
])
provides the following output
/* 1 */
{
"_id" : {
"isoday" : 1,
"dday" : "06"
},
"count" : 62.0
}
/* 2 */
{
"_id" : {
"isoday" : 1,
"dday" : "13"
},
"count" : 69.0
}
/* 3 */
{
"_id" : {
"isoday" : 1,
"dday" : "20"
},
"count" : 72.0
}
/* 4 */
{
"_id" : {
"isoday" : 2,
"dday" : "07"
},
"count" : 75.0
}
I am trying to have "count" represent the number of unique "dday" records - so using the output above, I want count to be "3" for isoDay = 1. At the moment count is reporting number of sales events that occurred for the group combination

All you need to do is have the grouping twice.
db.tankevent.aggregate([
{ $match: query },
{ $project : {
isoDay: { $isoDayOfWeek: "$txnDate" },
dayDate: { $dateToString: { format: "%d", date:"$txnDate" } }
}
},
{ $group:
{ _id : { isoday: "$isoDay", dday: "$dayDate" }, count: { "$sum" : 1 } }
},
{ $project : {
isoDay_Final: "$_id.isoday"
}
},
{ $group:
{ _id : "$isoDay_Final", count: { "$sum" : 1 } }
},
{ $sort: { "_id": 1 } }
])

Related

Mongo db query | Join | Create query on two collection

Select leadId count on two collection in Mongo DB
Collection 1 : leads
{
leadId:"abc123",
status:"OPENED",
stage:"start",
crossSell:
{
cc:
{
consent:true,
shown:[{first:true}]
}
}
}
Collection 2 : pdata
{
activeLeadId:"abc123",
status:"OPENED",
details:
[
{
rating:10
},
{
rating:9
}
]
}
Question : Find leadId count from leads collection join with pdata collection based on below conditions
leads.leadId = pdata.activeleadId and
leads.status = "OPENED" and
leads.crossSell.cc.consent = true and
leads.crossSell.cc.shown[0].first = true and
pdata.details.rating >= 5
You can try a aggregation query,
$match your conditions for leads collection
$lookup with pdata collection, pass leadId to match with pdata
match required conditions for pdata
$limit to return single document, because we don't need that data in response
$match condition to check is pdata is not empty
$count to get total number of records
db.leads.aggregate([
{
$match: {
status: "OPENED",
"crossSell.cc.consent": true,
"crossSell.cc.shown.first": true
}
},
{
"$lookup": {
"from": "pdata",
"let": { "leadId": "$leadId" },
"pipeline": [
{
$match: {
$expr: { $eq: ["$$leadId", "$activeLeadId"] },
"details.rating": { $gte: 5 }
}
},
{ $limit: 1 }
],
"as": "pdata"
}
},
{ $match: { pdata: { $ne: [] } } },
{ $count: "count" }
])
Playground

Mongodb aggregation to find outliers

In my mongodb collection documents are stored in the following format:
{ "_id" : ObjectId("62XXXXXX"), "res" : 12, ... }
{ "_id" : ObjectId("63XXXXXX"), "res" : 23, ... }
{ "_id" : ObjectId("64XXXXXX"), "res" : 78, ... }
...
I need to extract id's for the document for which the value of "res" is outlier (i.e. value < Q1 - 1.5 * IQR or value > Q3 + 1.5 * IQR (Q1, Q3 are percentiles)). I have done this using pandas functionality by retrieving all documents from the collection, which may become slow if the number of documents in collection become too big.
Is there a way to do this using mongodb aggregation pipeline (or just calculating percentiles)?
If I understand how you want to retrieve outliers, here's one way you might be able to do it.
db.collection.aggregate([
{ // partition res into quartiles
"$bucketAuto": {
"groupBy": "$res",
"buckets": 4
}
},
{ // get the max of each quartile
"$group": {
"_id": "$_id.max"
}
},
{ // sort the quartile maxs
"$sort": {
"_id": 1
}
},
{ // put sorted quartile maxs into array
"$group": {
"_id": null,
"maxs": {"$push": "$_id"}
}
},
{ // assign Q1 and Q3
"$project": {
"_id": 0,
"q1": {"$arrayElemAt": ["$maxs", 0]},
"q3": {"$arrayElemAt": ["$maxs", 2]}
}
},
{ // set IQR
"$set": {
"iqr": {
"$subtract": ["$q3", "$q1"]
}
}
},
{ // assign upper/lower outlier thresholds
"$project": {
"outlierThresholdLower": {
"$subtract": [
"$q1",
{"$multiply": ["$iqr", 1.5]}
]
},
"outlierThresholdUpper": {
"$add": [
"$q3",
{"$multiply": ["$iqr", 1.5]}
]
}
}
},
{ // get outlier _id's
"$lookup": {
"from": "collection",
"as": "outliers",
"let": {
"oTL": "$outlierThresholdLower",
"oTU": "$outlierThresholdUpper"
},
"pipeline": [
{
"$match": {
"$expr": {
"$or": [
{"$lt": ["$res", "$$oTL"]},
{"$gt": ["$res", "$$oTU"]}
]
}
}
},
{
"$project": {
"_id": 1
}
}
]
}
}
])
Try it on mongoplayground.net.
One more option based on #rickhg12hs's answer, is to use $setWindowFields:
db.collection.aggregate([
{$setWindowFields: {
sortBy: {res: 1},
output: {
totalCount: {$count: {}},
index: {$sum: 1, window: {documents: ["unbounded", "current"]}}
}
}
},
{$match: {
$expr: {$lte: [
{$abs: {$subtract: [
{$mod: [
{$multiply: [
{$add: ["$index", {$round: {$divide: ["$totalCount", 4]}}]}, 2]},
"$totalCount"
]}, 0]}
}, 1]}
}},
{$group: {_id: null, res: {$push: "$res"}}},
{$project: {_id: 0, q1: {$first: "$res"}, q3: {$last: "$res"},
iqr: {"$subtract": [{$last: "$res"}, {$first: "$res"}]}
}},
{$project: {
outlierThresholdLower: {$subtract: ["$q1", {$multiply: ["$iqr", 1.5]}]},
outlierThresholdUpper: {$add: ["$q3", {$multiply: ["$iqr", 1.5]}]}
}
},
{$lookup: {
from: "collection",
as: "outliers",
let: {oTL: "$outlierThresholdLower", oTU: "$outlierThresholdUpper"},
pipeline: [
{$match: {$expr: {$or: [{$lt: ["$res", "$$oTL"]}, {$gt: ["$res", "$$oTU"]}]}}},
{$project: {_id: 1}}
]
}
}
])
See how it works on the playground example

sql conversion to mongodb aggregate

I'm new to SQL and MongoDB. I'm trying to convert this:
SELECT accountType, ROUND(AVG(balance), 2) avgBalance
FROM customers
WHERE gender="female"
GROUP BY accountType
HAVING COUNT(*) < 140
ORDER BY avgBalance
LIMIT 1
to MongoDB but I can't get it to work. I don't quite understand how the order ($group, $match, $project, $round, $avg etc.) should be and how the "ROUND and AVG" are used together. This is how the answer should be like: { "accountType" : "account-type", "avgBalance" : NumberDecimal("9999.99") }
Here is what I have so far:
db.customers.aggregate( [ { $group: { _id: { accountType: "accountType", avgBalance: { $avg: { "balance" } } }, { $match: { count: { $lt: 140 } } }, { gender: "female" }, { $project: { "accountType": { $round: [ $agv: "balance", 2 ] } } }, { $limit: 1 } ] )
Direction is not bad, would be this one:
db.customers.aggregate([
// WHERE gender="female"
{ $match: { gender: "female" } },
// GROUP BY accountType, SELECT AVG(balance)
{
$group: {
_id: "$accountType",
avgBalance: { $avg: "$balance" },
count: {$sum: 1}
}
},
// HAVING COUNT(*) < 140
{ $match: { count: { $lt: 140 } } },
// SELECT ... AS ...
{
$project: {
accountType: "$_id",
avgBalance: { $round: ["$avgBalance", 2] }
}
},
// ORDER BY avgBalance
{ $sort: { avgBalance: 1 } },
// LIMIT 1
{ $limit: 1 }
])

Mongodb: get count of multiple values in a field grouped by another field

I have a collection as below
{"country":"US","city":"NY"}
{"country":"US","city":"AL"}
{"country":"US","city":"MA"}
{"country":"US","city":"NY"}
{"country":"US","city":"MA"}
{"country":"IN","city":"DL"}
{"country":"IN","city":"KA"}
{"country":"IN","city":"DL"}
{"country":"IN","city":"DL"}
{"country":"IN","city":"KA"}
and expecting an output
{ "data": { "US": {"NY": 2,"AL": 1,"MA": 2 },
"IN": {"DL": 3,"KA": 2 }}
}
Below is the mongodb query I tried, i was able to get to get the count at country level, but not at the state level. please help me in correcting the below query to get data at state level.
db.country_dash.aggregate([
{"$group": {
"_id":"$country",
"state": {"$addToSet": "$state"}
}},
{"$project": {
"_id":0,
"country":"$_id",
"state": {"$size": "$state"}
} }
])
db.country_dash.aggregate(
// Pipeline
[
// Stage 1
{
$group: {
_id: {
city: '$city'
},
total: {
$sum: 1
},
country: {
$addToSet: '$country'
}
}
},
// Stage 2
{
$project: {
total: 1,
country: {
$arrayElemAt: ['$country', 0]
},
city: '$_id.city',
_id: 0
}
},
// Stage 3
{
$group: {
_id: '$country',
data: {
$addToSet: {
city: '$city',
total: '$total'
}
}
}
},
]
);

MongoDB - How to make a query to get the average usage?

Here is how the documents in db look like.
/* 1 */
{
"_id" : 1,
"feat" : {
"processName": [
{
"value" : {
"value": "Process1"
}
}
],
"processUsage": [
{
"value" : {
"value": 23.21
}
}
]
}
}
/* 2 */
{
"_id" : 2,
"feat" : {
"processName": [
{
"value" : {
"value": "Process2"
}
}
],
"memoryUsage": [
{
"value" : {
"value": 2.411502e+05
}
}
]
}
}
/* 3 */
{
"_id" : 3,
"feat" : {
"processName": [
{
"value" : {
"value": "Process1"
}
}
],
"processUsage": [
{
"value" : {
"value": 67.42
}
}
]
}
}
/* 4 */
{
"_id" : 4,
"feat" : {
"processName": [
{
"value" : {
"value": "Process3"
}
}
],
"processUsage": [
{
"value" : {
"value": 39.97
}
}
]
}
}
/* 5 */
{
"_id" : 5,
"feat" : {
"processName": [
{
"value" : {
"value": "Process2"
}
}
],
"processUsage": [
{
"value" : {
"value": 21.05
}
}
]
}
}
Each process has entries with processUsage and memoryUsage. What I am interest in is the average processUsage. So, I'd like to ignore the entries with memoryUsage.
I tried $match + $group in an aggregate with $avg, but for each process I just got back as average 0.00000000.
Then I tried my luck with mapReduce using javascript, unfortunately it did not work out either.
Could someone just show me how to do that? By the way, I am using Robomongo 0.8.5
Edit:
The query looks like this:
db.database.aggregate([
{ $match : {"$feat.processUsage.value.value": {$gt : -1}
},
{
$group: {_id: "$feats.processName.value.value", average: {$avg:
"$feats.processUsage.value.value"}
}
])
You can use the following aggregate query:
db.test.aggregate(
[
{
$unwind : "$feat.processUsage"
},
{
$group: {
_id: "$feat.processName.value.value",
average: {$avg:"$feat.processUsage.value.value"}
}
}
]
)
Unwinding in the initial phase will let you filter documents that has processUsage key in document.
Result:
{ "_id" : [ "Process2" ], "average" : 21.05 }
{ "_id" : [ "Process3" ], "average" : 39.97 }
{ "_id" : [ "Process1" ], "average" : 45.315 }