Source code for pandagg.node.aggs.abstract

import json

from pandagg.node._node import Node
from typing import Optional, List, Union, Dict, Any, Tuple, Iterator, Type

from pandagg.types import (
    Meta,
    BucketKey,
    BucketDict,
    AggClauseDict,
    AggType,
    Script,
    GapPolicy,
    AggName,
    AggClauseResponseDict,
    BucketsWrapperDict,
    BucketKeyAtom,
    BucketsDict,
)


class AggClause(Node):
    """
    Wrapper around elasticsearch aggregation concept.
    https://www.elastic.co/guide/en/elasticsearch/reference/2.3/search-aggregations.html

    Each aggregation can be seen as a Node that can be encapsulated in a parent
    agg. Defines a method to build the aggregation request.
    """

    _classes: Dict[AggType, Type["AggClause"]]
    _type_name = "agg"
    KEY: str
    VALUE_ATTRS: List[str]
    WHITELISTED_MAPPING_TYPES: List[str]
    BLACKLISTED_MAPPING_TYPES: List[str]

    def __init__(
        self, meta: Optional[Meta] = None, identifier: Optional[str] = None, **body: Any
    ) -> None:
        # remove empty keys from body to make __init__ clearer
        self.body: Dict[str, Any] = {k: v for k, v in body.items() if v is not None}
        self.meta: Optional[Meta] = meta
        self._children: Dict[AggName, Any] = {}
        super(AggClause, self).__init__(identifier=identifier)
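
    # Example (an illustrative sketch, not part of the library source):
    # None-valued keyword arguments are dropped from the aggregation body, so
    # optional parameters left unset don't pollute the serialized clause.
    # Assuming a Terms subclass as exposed in pandagg.aggs:
    #
    # >>> from pandagg.aggs import Terms
    # >>> Terms(field="user", size=None).body
    # {'field': 'user'}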

    def line_repr(self, depth: int, **kwargs: Any) -> Tuple[str, str]:
        # root node
        if not self.KEY:
            return "_", ""
        repr_args = [str(self.KEY)]
        if self.body:
            repr_args.append(self._params_repr(self.body))
        unnamed = "<%s>" % ", ".join(repr_args)
        return "", unnamed

    @staticmethod
    def _params_repr(params: Dict[str, Any]) -> str:
        params = params or {}
        return ", ".join(
            "%s=%s" % (str(k), str(json.dumps(params[k], sort_keys=True)))
            for k in sorted(params.keys())
        )
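
    # Example (illustrative, not part of the library source): parameters are
    # rendered sorted by key, with values JSON-encoded:
    #
    # >>> AggClause._params_repr({"size": 10, "field": "user"})
    # 'field="user", size=10'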

    @classmethod
    def valid_on_field_type(cls, field_type: str) -> bool:
        if hasattr(cls, "WHITELISTED_MAPPING_TYPES"):
            return field_type in cls.WHITELISTED_MAPPING_TYPES
        if hasattr(cls, "BLACKLISTED_MAPPING_TYPES"):
            return field_type not in cls.BLACKLISTED_MAPPING_TYPES
        # permissive by default
        # TODO - constrain to only allowed types
        return True

    def to_dict(self) -> AggClauseDict:
        """
        ElasticSearch aggregation queries follow this formatting::

            {
                "<aggregation_name>" : {
                    "<aggregation_type>" : {
                        <aggregation_body>
                    }
                    [,"meta" : { [<meta_data_body>] } ]?
                }
            }

        to_dict() returns the following part (without aggregation name)::

            {
                "<aggregation_type>" : {
                    <aggregation_body>
                }
                [,"meta" : { [<meta_data_body>] } ]?
            }
        """
        if self.KEY is None:
            raise ValueError("For typing only")
        aggs = {self.KEY: self.body}
        if self.meta:
            aggs["meta"] = self.meta
        return aggs
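
    # Example (a hedged sketch, assuming a Terms subclass with KEY = "terms"):
    # the clause name itself is handled by the parent aggregation tree, not by
    # the clause:
    #
    # >>> from pandagg.aggs import Terms
    # >>> Terms(field="user", meta={"owner": "analytics"}).to_dict()
    # {'terms': {'field': 'user'}, 'meta': {'owner': 'analytics'}}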

    def extract_buckets(
        self, response_value: AggClauseResponseDict
    ) -> Iterator[Tuple[BucketKey, BucketDict]]:
        raise NotImplementedError()

    @classmethod
    def extract_bucket_value(
        cls,
        response: Union[BucketsWrapperDict, BucketDict],
        value_as_dict: bool = False,
    ) -> Any:
        attrs = cls.VALUE_ATTRS
        if value_as_dict:
            return {attr_: response.get(attr_) for attr_ in attrs}
        return response.get(attrs[0])
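
    # Example (illustrative, assuming a metric agg whose VALUE_ATTRS is
    # ["value"], e.g. Avg):
    #
    # >>> from pandagg.aggs import Avg
    # >>> Avg.extract_bucket_value({"value": 32.5})
    # 32.5
    # >>> Avg.extract_bucket_value({"value": 32.5}, value_as_dict=True)
    # {'value': 32.5}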

    def is_convertible_to_composite_source(self) -> bool:
        return False

    def __str__(self) -> str:
        return "<{class_}, type={type}, body={body}>".format(
            class_=str(self.__class__.__name__),
            type=str(self.KEY),
            body=json.dumps(self.body),
        )

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, AggClause):
            return other.to_dict() == self.to_dict()
        # make sure we still compare equal to a dict with the same data
        return other == self.to_dict()
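
# Example (a hedged sketch): equality also holds against the equivalent dict
# serialization, assuming an Avg subclass with KEY = "avg":
#
# >>> from pandagg.aggs import Avg
# >>> Avg(field="price") == {"avg": {"field": "price"}}
# True
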
TypeOrAgg = Union[AggType, AggClauseDict, AggClause]

def A(name: str, type_or_agg: Optional[TypeOrAgg] = None, **body: Any) -> AggClause:
    """
    Accept multiple syntaxes, return an AggClause instance.

    :param name: aggregation clause name
    :param type_or_agg: aggregation type (str), aggregation clause dict, or AggClause instance
    :param body: aggregation clause body, if provided as keyword arguments
    :return: AggClause
    """
    if isinstance(type_or_agg, str):
        # A("per_user", "terms", field="user")
        return AggClause.get_dsl_class(type_or_agg)(**body)
    if isinstance(type_or_agg, AggClause):
        # A("per_user", Terms(field='user'))
        if body:
            raise ValueError(
                'Body cannot be added using "AggClause" declaration, got %s.' % body
            )
        return type_or_agg
    if isinstance(type_or_agg, dict):
        # A("per_user", {"terms": {"field": "user"}})
        if body:
            raise ValueError(
                'Body cannot be added using "dict" agg declaration, got %s.' % body
            )
        type_or_agg = type_or_agg.copy()
        children_aggs = (
            type_or_agg.pop("aggs", None)
            or type_or_agg.pop("aggregations", None)
            or {}
        )
        if len(type_or_agg) != 1:
            raise ValueError(
                "Invalid aggregation declaration (too many keys): got <%s>"
                % type_or_agg
            )
        type_, body_ = type_or_agg.popitem()
        body_ = body_.copy()
        if children_aggs:
            body_["aggs"] = children_aggs
        return AggClause.get_dsl_class(type_)(**body_)
    if type_or_agg is None:
        # if type_or_agg is not provided, a terms aggregation on field <name>
        # is executed by default
        # A("per_user")
        return AggClause.get_dsl_class("terms")(field=name, **body)
    raise ValueError('"type_or_agg" must be among "dict", "AggClause", "str"')
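
# Example (illustrative, assuming a Terms subclass as exposed in pandagg.aggs):
# the accepted syntaxes yield equivalent clauses, and omitting "type_or_agg"
# defaults to a terms aggregation on the field named after the clause:
#
# >>> from pandagg.aggs import Terms
# >>> A("per_user", "terms", field="user") == {"terms": {"field": "user"}}
# True
# >>> A("per_user", {"terms": {"field": "user"}}) == A("per_user", Terms(field="user"))
# True
# >>> A("user") == {"terms": {"field": "user"}}
# True
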

class Root(AggClause):
    """
    Not a real aggregation. Just the initial empty dict (used as
    lighttree.Tree root).
    """

    KEY: str = "_root"

    def line_repr(self, depth: int, **kwargs: Any) -> Tuple[str, str]:
        return "_", ""

    def extract_buckets(
        self, response_value: AggClauseResponseDict
    ) -> Iterator[Tuple[BucketKey, BucketDict]]:
        # probably mypy bug in case of recursive typing
        yield None, response_value  # type: ignore

    @classmethod
    def extract_bucket_value(
        cls,
        response: Union[BucketsWrapperDict, BucketDict],
        value_as_dict: bool = False,
    ) -> Any:
        return None

class MetricAgg(AggClause):
    """
    Metric aggregations provide a single bucket, with value attributes to be
    extracted.
    """

    def extract_buckets(
        self, response_value: AggClauseResponseDict
    ) -> Iterator[Tuple[BucketKey, BucketDict]]:
        # probably mypy bug in case of recursive typing
        yield None, response_value  # type: ignore

class BucketAggClause(AggClause):
    """
    Bucket aggregations have a special ability: they can encapsulate other
    aggregations as children. Each time, the extracted value is a 'doc_count'.

    Provide methods:
    - to build aggregation request (with children aggregations)
    - to extract buckets from raw response
    - to build query to filter documents belonging to that bucket

    Note: the aggs attribute's only purpose is children initiation, with the
    following syntax:
    >>> from pandagg.aggs import Terms, Avg
    >>> agg = Terms(
    >>>     field='some_path',
    >>>     aggs={
    >>>         'avg_agg': Avg(field='some_other_path')
    >>>     }
    >>> )
    """

    def __init__(self, **body: Any) -> None:
        identifier: Optional[str] = body.pop("identifier", None)
        aggs = body.pop("aggs", None) or body.pop("aggregations", None)
        super(BucketAggClause, self).__init__(identifier=identifier, **body)
        self._children: Dict[AggName, Any] = aggs or {}  # type: ignore

    def extract_buckets(
        self, response_value: AggClauseResponseDict
    ) -> Iterator[Tuple[BucketKey, BucketDict]]:
        raise NotImplementedError()

class UniqueBucketAgg(BucketAggClause):
    """Aggregations providing a single bucket."""

    def extract_buckets(
        self, response_value: AggClauseResponseDict
    ) -> Iterator[Tuple[BucketKey, BucketDict]]:
        # probably mypy bug in case of recursive typing
        yield None, response_value  # type: ignore

class MultipleBucketAgg(BucketAggClause):

    IMPLICIT_KEYED: bool = False

    def __init__(
        self, keyed: bool = False, key_as_string: bool = False, **body: Any
    ) -> None:
        """
        Aggregation that returns either a list or a map of buckets.

        If keyed, ES buckets are expected as a dict, else as a list (in this
        case key_path is used to extract the key from each list item).
        """
        # "keyed" has another meaning in lighttree Node
        self.keyed_: bool = keyed or self.IMPLICIT_KEYED
        self.key_path: str = "key_as_string" if key_as_string else "key"
        if keyed and not self.IMPLICIT_KEYED:
            body["keyed"] = keyed
        super(MultipleBucketAgg, self).__init__(**body)

    def extract_buckets(
        self, response_value: AggClauseResponseDict
    ) -> Iterator[Tuple[BucketKey, BucketDict]]:
        buckets = response_value["buckets"]
        key: BucketKeyAtom
        if self.keyed_:
            buckets_: BucketsDict = buckets  # type: ignore
            for key in buckets_.keys():
                yield key, buckets_[key]
        else:
            buckets__: List[BucketDict] = buckets  # type: ignore
            for bucket in buckets__:
                yield self._extract_bucket_key(bucket), bucket
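
    # Example (illustrative, assuming a Terms subclass, whose buckets are
    # returned by ElasticSearch as a list):
    #
    # >>> from pandagg.aggs import Terms
    # >>> response = {"buckets": [
    # ...     {"key": "alice", "doc_count": 3},
    # ...     {"key": "bob", "doc_count": 1},
    # ... ]}
    # >>> list(Terms(field="user").extract_buckets(response))
    # [('alice', {'key': 'alice', 'doc_count': 3}), ('bob', {'key': 'bob', 'doc_count': 1})]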

    def _extract_bucket_key(self, bucket: BucketDict) -> BucketKey:
        return bucket[self.key_path]

class FieldOrScriptMetricAgg(MetricAgg):
    """
    Metric aggregation based on a single field or script.
    """

    def __init__(
        self, field: Optional[str] = None, script: Optional[Script] = None, **body: Any
    ) -> None:
        self.field: Optional[str] = field
        super(FieldOrScriptMetricAgg, self).__init__(field=field, script=script, **body)
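
# Example (a hedged sketch, assuming an Avg subclass with KEY = "avg"): the
# aggregated value can come either from a document field or from a script:
#
# >>> from pandagg.aggs import Avg
# >>> Avg(field="price").to_dict()
# {'avg': {'field': 'price'}}
# >>> Avg(script={"source": "doc['price'].value"}).to_dict()
# {'avg': {'script': {'source': "doc['price'].value"}}}
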
class Pipeline(UniqueBucketAgg):

    def __init__(
        self, buckets_path: str, gap_policy: Optional[GapPolicy] = None, **body: Any
    ) -> None:
        super(Pipeline, self).__init__(
            buckets_path=buckets_path, gap_policy=gap_policy, **body
        )

class ScriptPipeline(Pipeline):

    VALUE_ATTRS: List[str] = ["value"]

    def __init__(
        self,
        script: Script,
        buckets_path: str,
        gap_policy: Optional[GapPolicy] = None,
        **body: Any
    ) -> None:
        super(ScriptPipeline, self).__init__(
            buckets_path=buckets_path, gap_policy=gap_policy, script=script, **body
        )
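
# Example (a hedged sketch, assuming AvgBucket and BucketSelector subclasses as
# exposed in pandagg.aggs): pipeline aggregations reference sibling
# aggregations through "buckets_path":
#
# >>> from pandagg.aggs import AvgBucket, BucketSelector
# >>> AvgBucket(buckets_path="sales_per_month>sales").to_dict()
# {'avg_bucket': {'buckets_path': 'sales_per_month>sales'}}
# >>> BucketSelector(buckets_path={"avg": "avg_price"}, script="params.avg > 100").to_dict()
# {'bucket_selector': {'buckets_path': {'avg': 'avg_price'}, 'script': 'params.avg > 100'}}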