How to query distinct field values in ElasticSearch

Let’s say we have an ElasticSearch index called strings with a field pattern of {"type": "keyword"}.

Get the top N values of the column

If we want to get the top N ( 12 in our example) entries, i.e. the patterns that are present in the most documents, we can use this query:

terms_aggregation_query.json

{
    "aggs" : {
        "patterns" : {
            "terms" : {
                "field" : "pattern.keyword",
                "size": 12
            }
        }
    }
}

{
    "aggs" : {
        "patterns" : {
            "terms" : {
                "field" : "pattern.keyword",
                "size": 12
            }
        }
    }
}

Full example in Python:

es_terms_example.py

from elasticsearch import Elasticsearch

es = Elasticsearch()

result = es.search(index="strings", body={
    "aggs" : {
        "patterns" : {
            "terms" : {
                "field" : "pattern.keyword",
                "size": 12
            }
        }
    }
})
for aggregation in result["aggregations"]["patterns"]["buckets"]:
    print(aggregation) # e.g. {'key': 'mypattern, 'doc_count': 2802}

from elasticsearch import Elasticsearch

es = Elasticsearch()

result = es.search(index="strings", body={
    "aggs" : {
        "patterns" : {
            "terms" : {
                "field" : "pattern.keyword",
                "size": 12
            }
        }
    }
})
for aggregation in result["aggregations"]["patterns"]["buckets"]:
    print(aggregation) # e.g. {'key': 'mypattern, 'doc_count': 2802}

See the terms aggregation documentation for more infos.

Get all the distinct values of the column

Getting all the values is slightly more complicated since we need to use a composite aggregation that returns an after_key to paginate the query.

This Python helper function will automatically paginate the query with configurable page size:

es_iterate_distinct.py

from elasticsearch import Elasticsearch

es = Elasticsearch()

def iterate_distinct_field(es, fieldname, pagesize=250, **kwargs):
    """
    Helper to get all distinct values from ElasticSearch
    (ordered by number of occurrences)
    """
    compositeQuery = {
        "size": pagesize,
        "sources": [{
                fieldname: {
                    "terms": {
                        "field": fieldname
                    }
                }
            }
        ]
    }
    # Iterate over pages
    while True:
        result = es.search(**kwargs, body={
            "aggs": {
                "values": {
                    "composite": compositeQuery
                }
            }
        })
        # Yield each bucket
        for aggregation in result["aggregations"]["values"]["buckets"]:
            yield aggregation
        # Set "after" field
        if "after_key" in result["aggregations"]["values"]:
            compositeQuery["after"] = \
                result["aggregations"]["values"]["after_key"]
        else: # Finished!
            break

# Usage example
for result in iterate_distinct_field(es, fieldname="pattern.keyword", index="strings"):
    print(result) # e.g. {'key': {'pattern': 'mypattern'}, 'doc_count': 315}

from elasticsearch import Elasticsearch

es = Elasticsearch()

def iterate_distinct_field(es, fieldname, pagesize=250, **kwargs):
    """
    Helper to get all distinct values from ElasticSearch
    (ordered by number of occurrences)
    """
    compositeQuery = {
        "size": pagesize,
        "sources": [{
                fieldname: {
                    "terms": {
                        "field": fieldname
                    }
                }
            }
        ]
    }
    # Iterate over pages
    while True:
        result = es.search(**kwargs, body={
            "aggs": {
                "values": {
                    "composite": compositeQuery
                }
            }
        })
        # Yield each bucket
        for aggregation in result["aggregations"]["values"]["buckets"]:
            yield aggregation
        # Set "after" field
        if "after_key" in result["aggregations"]["values"]:
            compositeQuery["after"] = \
                result["aggregations"]["values"]["after_key"]
        else: # Finished!
            break

# Usage example
for result in iterate_distinct_field(es, fieldname="pattern.keyword", index="strings"):
    print(result) # e.g. {'key': {'pattern': 'mypattern'}, 'doc_count': 315}

Check out similar posts by category: Databases, ElasticSearch, Python

If this post helped you, please consider buying me a coffee or donating via PayPal to support research & publishing of new posts on TechOverflow

Buy me a coffee