Aggregations merged into hits in Elasticsearch (SQL equivalent)

Just as an example, let's say I have a database, or an Elasticsearch index, holding salespersons and all of their customer visits, past and future.
Let's also say I want to produce a list of these salespersons and show how many customer visits they have scheduled.
In SQL I would do something like this (mind: the SQL is probably not entirely correct; it is only written here to show what I intend to do):
select foo, bar, sum(baz) from table_barbaz
where appointment_date > now()
group by foo, bar
Is it possible to get the same result in Elasticsearch, i.e. a list of documents looking something like this:
{
  "foo": "Salesmen John",
  "bar": "Client visit this week",
  "sum_baz": 99
}
Not sure if this is related to nested aggregations or something else.
Below is a mapping that could have been used in this example. As the real mapping is internal intellectual property, I don't really want to share it publicly.
{
  "mappings": {
    "properties": {
      "salesman_id": {
        "type": "integer"
      },
      "salesman_name": {
        "type": "keyword"
      },
      "customer_visit": {
        "type": "integer"
      },
      "customer_visit_start_date": {
        "type": "date",
        "format": "yyyy-MM-dd||strict_date"
      },
      "customer_visit_end_date": {
        "type": "date",
        "format": "yyyy-MM-dd||strict_date"
      }
    }
  }
}

Then, an aggregation query like the following one would give you the number of customer visits for each salesman, for each day:
{
  "size": 0,
  "aggs": {
    "salesmen": {
      "terms": {
        "field": "salesman_name",
        "size": 20
      },
      "aggs": {
        "days": {
          "date_histogram": {
            "field": "customer_visit_start_date",
            "interval": "day"
          },
          "aggs": {
            "visits": {
              "sum": {
                "field": "customer_visit"
              }
            }
          }
        }
      }
    }
  }
}

Related

MongoDB multiple Lookup into same collection

I have two collections, Bill and Employee. Bill contains information about the monthly student bill, and Employee contains all types of people working in the school (accountants, teachers, maintenance, etc.).
Bill has billVerifiedBy and classteacher fields which point to records in Employee.
Bill collection
{
  "_id": ObjectId("ab12dns..."), // mongo id
  "studentname": "demoUser",
  "class": { "section": "A" },
  "billVerifiedBy": "121212",
  "classteacher": "134239"
}
Employee collection
{
  "_id": ObjectId("121212"), // random number
  "name": "Darn Morphy",
  "department": "Accounts",
  "email": "dantest@test.com"
}
{
  "_id": ObjectId("134239"),
  "name": "Derreck",
  "department": "Faculty",
  "email": "derrect145@test.com"
}
I need to retrieve the accounts and teacher information related to a particular bill. I am using MongoDB's $lookup to get the information. However, I have to look up into the same collection twice, since billVerifiedBy and classteacher both refer to the Employee collection, as shown below.
db.bill.aggregate([
  { $lookup: { from: "employee", localField: "billVerifiedBy", foreignField: "_id", as: "accounts" } },
  { $lookup: { from: "employee", localField: "classteacher", foreignField: "_id", as: "faculty" } },
  {
    $project: {
      studentname: 1,
      class: 1,
      verifiedUser: "$accounts.name",
      verifiedByEmail: "$accounts.email",
      facultyName: "$faculty.name",
      facultyEmail: "$faculty.email"
    }
  }
])
I don't know if this is a good way of arranging the accounts and faculty information in a single Employee collection. Is it the right thing to look up twice into the same collection, or should I create separate Accounts and Faculty collections and look up into those? Please suggest what would be the best approach in terms of performance.
In MongoDB, when you want to join multiple documents from the same collection, you can use $lookup with its pipeline and let options: let defines variables which the pipeline then uses to filter the documents you want to join.
db.getCollection('Bill').aggregate([
  {
    "$lookup": {
      "as": "lookupUsers",
      "from": "Employee",
      // define the variables that the pipeline needs to filter documents
      "let": {
        "verifier": "$billVerifiedBy",
        "teacher": "$classteacher"
      },
      "pipeline": [
        { // filter the employees you need
          "$match": {
            "$expr": {
              "$or": [
                { "$eq": ["$_id", "$$verifier"] },
                { "$eq": ["$_id", "$$teacher"] }
              ]
            }
          }
        },
        { // combine the 2 filtered documents into an employee array
          "$group": {
            "_id": "",
            "employee": {
              "$addToSet": {
                "_id": "$_id",
                "name": "$name",
                "department": "$department",
                "email": "$email"
              }
            }
          }
        },
        { // pick each item out of the array using the predefined variables
          "$project": {
            "_id": 0,
            "billVerifiedBy": {
              "$slice": [
                {
                  "$filter": {
                    "input": "$employee",
                    "cond": { "$eq": ["$$this._id", "$$verifier"] }
                  }
                },
                1
              ]
            },
            "classteacher": {
              "$slice": [
                {
                  "$filter": {
                    "input": "$employee",
                    "cond": { "$eq": ["$$this._id", "$$teacher"] }
                  }
                },
                1
              ]
            }
          }
        },
        { "$unwind": "$billVerifiedBy" },
        { "$unwind": "$classteacher" }
      ]
    }
  },
  { "$unwind": "$lookupUsers" }
]);
The output looks like this:
{
  "_id": ObjectId("602916dcf4450742cdebe38d"),
  "studentname": "demoUser",
  "class": {
    "section": "A"
  },
  "billVerifiedBy": ObjectId("6029172e9ea6c9d4776517ce"),
  "classteacher": ObjectId("6029172e9ea6c9d4776517cf"),
  "lookupUsers": {
    "billVerifiedBy": {
      "_id": ObjectId("6029172e9ea6c9d4776517ce"),
      "name": "Darn Morphy",
      "department": "Accounts",
      "email": "dantest@test.com"
    },
    "classteacher": {
      "_id": ObjectId("6029172e9ea6c9d4776517cf"),
      "name": "Derreck",
      "department": "Faculty",
      "email": "derrect145@test.com"
    }
  }
}

How to compute sum(field) grouped by multiple fields in Elasticsearch?

I'm using Logstash to load row data from MySQL into Elasticsearch. How do I calculate a sum over one field, grouped by two fields?
For example, here is a table named "Students" with the columns id, class_id, name, gender, and age,
and here is a SQL query:
select class_id, gender, sum(age) from Students group by class_id, gender;
How can I translate this SQL into an Elasticsearch high-level REST client API call?
Below is my attempt, but it is not correct:
public TermsAggregationBuilder constructAggregation() {
    TermsAggregationBuilder aggregation = AggregationBuilders.terms("by_classid")
            .field("classId.keyword");
    aggregation = aggregation.subAggregation(AggregationBuilders.terms("by_gender").field("gender.keyword"));
    aggregation = aggregation.subAggregation(AggregationBuilders.sum("sum_age")
            .field("age"));
    return aggregation;
}
The following is the raw query for your SQL statement:
POST aggregation_index_2/_search
{
  "size": 0,
  "aggs": {
    "gender_agg": {
      "terms": {
        "field": "gender"
      },
      "aggs": {
        "class_id_aggregator": {
          "terms": {
            "field": "class_id"
          },
          "aggs": {
            "age_sum_aggregator": {
              "sum": {
                "field": "age"
              }
            }
          }
        }
      }
    }
  }
}
Mappings
PUT aggregation_index_2
{
  "mappings": {
    "properties": {
      "gender": {
        "type": "keyword"
      },
      "age": {
        "type": "integer"
      },
      "class_id": {
        "type": "integer"
      }
    }
  }
}

How do I write an Elasticsearch query to find unique elements in columns?

For example, if I have a SQL query:
SELECT distinct emp_id, salary FROM TABLE_EMPLOYEE
what would be its Elasticsearch equivalent?
This is what I have come up with until now:
{
  "aggs": {
    "Employee": {
      "terms": {
        "field": ["emp_id", "salary"],
        "size": 1000
      }
    }
  }
}
Instead of sending a list of fields to perform distinct upon, send them as separate aggregations.
{
  "aggs": {
    "Employee": {
      "terms": {
        "field": "emp_id",
        "size": 10
      }
    },
    "Salary": {
      "terms": {
        "field": "salary",
        "size": 10
      }
    }
  },
  "size": 0
}
To answer the follow-up from our conversation: you would issue the following HTTP command using curl.
curl -XGET localhost:9200/<your index>/<type>/_search?pretty

Scope 0-count terms in an aggregation in Elasticsearch

I am aggregating on the "location" field in my documents, which also contain a "city" field. I query the documents on the city field and aggregate the matching documents on the location field.
{
  "aggs": {
    "locations": {
      "terms": {
        "field": "location",
        "min_doc_count": 0
      }
    }
  },
  "query": {
    "filtered": {
      "filter": {
        "bool": {
          "must": [
            {
              "term": {
                "city": "mumbai",
                "_cache": true
              }
            }
          ]
        }
      }
    }
  }
}
The counts and aggregations come back fine along with the hits. My problem is that I run the aggregation with min_doc_count set to 0, and the aggregation buckets then return all locations with a 0 count, even those that fall in other cities. I want to get 0-count locations only for the queried city, i.e. scope the 0-count locations to that city.
I tried to achieve this with a nested aggregation, placing location inside a nested city aggregation, and by combining a filter aggregation with the terms aggregation, but I still get the same result. Is there any way to achieve this, or is Elasticsearch inherently built to work like this?
ES Version - 1.6
My mapping looks like this:
{
  "service": {
    "_source": {
      "enabled": true
    },
    "properties": {
      "name": {
        "type": "string",
        "index": "not_analyzed"
      },
      "location": {
        "type": "string",
        "index": "not_analyzed"
      },
      "city": {
        "type": "string",
        "index": "not_analyzed"
      }
    }
  }
}
Sample docs to index
{
  "name": "a",
  "location": "x",
  "city": "mumbai"
}
{
  "name": "b",
  "location": "x",
  "city": "mumbai"
}
{
  "name": "c",
  "location": "y",
  "city": "chennai"
}
You should try sorting your terms aggregation (embedded in a filter aggregation) by ascending doc count, so that you get all the terms with a 0 doc count first. Note that by default you'll only get the first 10 terms; if you have fewer than ten 0-doc-count terms you'll see them all, otherwise you may need to increase the size parameter to something higher than 10.
{
  "aggs": {
    "city_filter": {
      "filter": {
        "term": {
          "city": "mumbai"
        }
      },
      "aggs": {
        "locations": {
          "terms": {
            "field": "location",
            "min_doc_count": 0,
            "size": 20,        <----- add this if you have more than ten 0-doc-count terms
            "order": {         <----- add this to see 0-doc-count terms first
              "_count": "asc"
            }
          }
        }
      }
    }
  },
  "query": {
    "filtered": {
      "filter": {
        "bool": {
          "must": [
            {
              "term": {
                "city": "mumbai",
                "_cache": true
              }
            }
          ]
        }
      }
    }
  }
}

Aggregations on the most recent document in a group using Elasticsearch

Suppose there are several documents per person that contain values:
{
  "name": "John",
  "value": 1,
  "timestamp": "2014-06-15"
}
{
  "name": "John",
  "value": 2,
  "timestamp": "2014-06-16"
}
{
  "name": "Sam",
  "value": 2,
  "timestamp": "2014-06-15"
}
{
  "name": "Sam",
  "value": 3,
  "timestamp": "2014-06-16"
}
How do I get a list of the most recent documents for each person?
How do I get an average of the values for the list of the most recent documents for each person? Given the sample data, this would be 2.5, not 2.
Is there some combination of buckets and metrics that could achieve this result? Will I need to implement a custom aggregator as part of a plugin, or must this sort of computation be performed in memory?
If you only need to find the persons with the most recent documents, try something like this:
"aggs": {
"personName": {
"terms": {
"field": "name",
"size": 5,
"order": {"timeCreated": "desc"}
},
"aggs": {
"timeCreated": {
"max": {"field": "timestamp"}
}
}
}
}
The second operation is just an aggregation; to get the average of the value field you could try something like:
curl -XPOST "http://DOMAIN:9200/your/data/_search" -d'
{
  "size": 0,
  "aggregations": {
    "the_name": {
      "terms": {
        "field": "name",
        "order": {
          "value_avg": "desc"
        }
      },
      "aggregations": {
        "value_avg": {
          "avg": {
            "field": "value"
          }
        }
      }
    }
  }
}'
To solve your first issue, I would recommend ordering the response by date and then, in your application, ignoring a document when you have already seen one with the same name (that is, filtering the data client-side after the response comes back from ES).