diff --git a/README.md b/README.md index 3f1061e5..9c17648f 100644 --- a/README.md +++ b/README.md @@ -21,9 +21,9 @@ and is intended to make it more convenient to deal with deeply nested queries an ## Features - flexible aggregation and search queries declaration, with ability to insert clauses at specific points (and not only below last manipulated clause) -- query validation based on provided mapping +- query validation based on provided mappings - parsing of aggregation results in convenient formats: tree with interactive navigation, csv-like tabular breakdown, pandas dataframe, and others -- cluster indices discovery module, and mapping interactive navigation +- cluster indices discovery module, and mappings interactive navigation ## Documentation @@ -56,12 +56,12 @@ Discover indices on cluster with matching pattern: ['movies', 'movies_fake'] ``` -Explore index mapping: +Explore index mappings: ```python >>> movies = indices.movies ->>> movies.mapping - +>>> movies.mappings + _ ├── directors [Nested] │ ├── director_id Keyword @@ -78,8 +78,8 @@ _ ... ``` ```python ->>> movies.mapping.roles - +>>> movies.mappings.roles + roles [Nested] ├── actor_id Keyword ├── first_name Text @@ -95,7 +95,7 @@ roles [Nested] Execute aggregation on field: ```python ->>> movies.mapping.roles.gender.a.terms() +>>> movies.mappings.roles.gender.a.terms() doc_count key M 2296792 M F 1135174 F diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst index fc48a3ff..a70503cf 100644 --- a/docs/source/introduction.rst +++ b/docs/source/introduction.rst @@ -15,7 +15,7 @@ Elasticsearch tree structures Many Elasticsearch objects have a **tree** structure, ie they are built from a hierarchy of **nodes**: -* a `mapping `_ (tree) is a hierarchy of `fields `_ (nodes) +* a `mappings `_ (tree) is a hierarchy of `fields `_ (nodes) * a `query `_ (tree) is a hierarchy of query clauses (nodes) * an `aggregation `_ (tree) is a hierarchy of aggregation clauses (nodes) * an aggregation response (tree) is a hierarchy of response buckets (nodes) diff --git a/docs/source/reference/pandagg.interactive.mappings.rst b/docs/source/reference/pandagg.interactive.mappings.rst new file mode 100644 index 00000000..ec345a57 --- /dev/null +++ b/docs/source/reference/pandagg.interactive.mappings.rst @@ -0,0 +1,7 @@ +pandagg.interactive.mappings module +=================================== + +.. automodule:: pandagg.interactive.mappings + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/reference/pandagg.interactive.rst b/docs/source/reference/pandagg.interactive.rst index 2505d683..d0d8eb40 100644 --- a/docs/source/reference/pandagg.interactive.rst +++ b/docs/source/reference/pandagg.interactive.rst @@ -7,7 +7,7 @@ Submodules .. toctree:: :maxdepth: 8 - pandagg.interactive.mapping + pandagg.interactive.mappings pandagg.interactive.response Module contents diff --git a/docs/source/reference/pandagg.mappings.rst b/docs/source/reference/pandagg.mappings.rst new file mode 100644 index 00000000..576d820f --- /dev/null +++ b/docs/source/reference/pandagg.mappings.rst @@ -0,0 +1,7 @@ +pandagg.mappings module +======================= + +.. 
automodule:: pandagg.mappings + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/reference/pandagg.node.aggs.composite.rst b/docs/source/reference/pandagg.node.aggs.composite.rst new file mode 100644 index 00000000..64030c05 --- /dev/null +++ b/docs/source/reference/pandagg.node.aggs.composite.rst @@ -0,0 +1,7 @@ +pandagg.node.aggs.composite module +================================== + +.. automodule:: pandagg.node.aggs.composite + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/reference/pandagg.node.aggs.rst b/docs/source/reference/pandagg.node.aggs.rst index 9ebac57c..ff8516c6 100644 --- a/docs/source/reference/pandagg.node.aggs.rst +++ b/docs/source/reference/pandagg.node.aggs.rst @@ -9,6 +9,7 @@ Submodules pandagg.node.aggs.abstract pandagg.node.aggs.bucket + pandagg.node.aggs.composite pandagg.node.aggs.metric pandagg.node.aggs.pipeline diff --git a/docs/source/reference/pandagg.node.mappings.abstract.rst b/docs/source/reference/pandagg.node.mappings.abstract.rst new file mode 100644 index 00000000..c98662b5 --- /dev/null +++ b/docs/source/reference/pandagg.node.mappings.abstract.rst @@ -0,0 +1,7 @@ +pandagg.node.mappings.abstract module +===================================== + +.. automodule:: pandagg.node.mappings.abstract + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/reference/pandagg.node.mappings.field_datatypes.rst b/docs/source/reference/pandagg.node.mappings.field_datatypes.rst new file mode 100644 index 00000000..1cbca125 --- /dev/null +++ b/docs/source/reference/pandagg.node.mappings.field_datatypes.rst @@ -0,0 +1,7 @@ +pandagg.node.mappings.field\_datatypes module +============================================= + +.. automodule:: pandagg.node.mappings.field_datatypes + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/reference/pandagg.node.mappings.meta_fields.rst b/docs/source/reference/pandagg.node.mappings.meta_fields.rst new file mode 100644 index 00000000..0e46b4bc --- /dev/null +++ b/docs/source/reference/pandagg.node.mappings.meta_fields.rst @@ -0,0 +1,7 @@ +pandagg.node.mappings.meta\_fields module +========================================= + +.. automodule:: pandagg.node.mappings.meta_fields + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/reference/pandagg.node.mappings.rst b/docs/source/reference/pandagg.node.mappings.rst new file mode 100644 index 00000000..ef491644 --- /dev/null +++ b/docs/source/reference/pandagg.node.mappings.rst @@ -0,0 +1,20 @@ +pandagg.node.mappings package +============================= + +Submodules +---------- + +.. toctree:: + :maxdepth: 8 + + pandagg.node.mappings.abstract + pandagg.node.mappings.field_datatypes + pandagg.node.mappings.meta_fields + +Module contents +--------------- + +.. 
automodule:: pandagg.node.mappings + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/reference/pandagg.node.rst b/docs/source/reference/pandagg.node.rst index 0d91e97c..392d44f7 100644 --- a/docs/source/reference/pandagg.node.rst +++ b/docs/source/reference/pandagg.node.rst @@ -8,7 +8,7 @@ Subpackages :maxdepth: 8 pandagg.node.aggs - pandagg.node.mapping + pandagg.node.mappings pandagg.node.query pandagg.node.response diff --git a/docs/source/reference/pandagg.rst b/docs/source/reference/pandagg.rst index c7e29248..e84ac02a 100644 --- a/docs/source/reference/pandagg.rst +++ b/docs/source/reference/pandagg.rst @@ -21,7 +21,7 @@ Submodules pandagg.connections pandagg.discovery pandagg.exceptions - pandagg.mapping + pandagg.mappings pandagg.query pandagg.response pandagg.search diff --git a/docs/source/reference/pandagg.tree.aggs.rst b/docs/source/reference/pandagg.tree.aggs.rst index fa1dd3e5..9f435ae8 100644 --- a/docs/source/reference/pandagg.tree.aggs.rst +++ b/docs/source/reference/pandagg.tree.aggs.rst @@ -1,16 +1,5 @@ -pandagg.tree.aggs package -========================= - -Submodules ----------- - -.. toctree:: - :maxdepth: 8 - - pandagg.tree.aggs.aggs - -Module contents ---------------- +pandagg.tree.aggs module +======================== .. automodule:: pandagg.tree.aggs :members: diff --git a/docs/source/reference/pandagg.tree.mappings.rst b/docs/source/reference/pandagg.tree.mappings.rst new file mode 100644 index 00000000..519afa00 --- /dev/null +++ b/docs/source/reference/pandagg.tree.mappings.rst @@ -0,0 +1,7 @@ +pandagg.tree.mappings module +============================ + +.. automodule:: pandagg.tree.mappings + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/reference/pandagg.tree.query.rst b/docs/source/reference/pandagg.tree.query.rst index 35b36324..63f93b86 100644 --- a/docs/source/reference/pandagg.tree.query.rst +++ b/docs/source/reference/pandagg.tree.query.rst @@ -1,16 +1,5 @@ -pandagg.tree.query package -========================== - -Submodules ----------- - -.. toctree:: - :maxdepth: 8 - - pandagg.tree.query.abstract - -Module contents ---------------- +pandagg.tree.query module +========================= .. automodule:: pandagg.tree.query :members: diff --git a/docs/source/reference/pandagg.tree.rst b/docs/source/reference/pandagg.tree.rst index d0415cfd..de1f9688 100644 --- a/docs/source/reference/pandagg.tree.rst +++ b/docs/source/reference/pandagg.tree.rst @@ -1,22 +1,15 @@ pandagg.tree package ==================== -Subpackages ------------ - -.. toctree:: - :maxdepth: 8 - - pandagg.tree.aggs - pandagg.tree.query - Submodules ---------- .. toctree:: :maxdepth: 8 - pandagg.tree.mapping + pandagg.tree.aggs + pandagg.tree.mappings + pandagg.tree.query pandagg.tree.response Module contents diff --git a/examples/imdb/README.md b/examples/imdb/README.md index c4a86c7f..ed694829 100644 --- a/examples/imdb/README.md +++ b/examples/imdb/README.md @@ -27,7 +27,7 @@ Relational schema is the following: ![imdb tables](ressources/imdb_ijs.svg) -## Index mapping +## Index mappings #### Overview The base unit (document) will be a movie, having a name, rank (ratings), year of release, a list of actors @@ -46,7 +46,7 @@ Movie: #### Which fields require nesting? Since genres contain a single keyword field, in no case we need it to be stored as a nested field. 
-On the contrary, actor roles and directors require a nested mapping if we consider applying multiple
+On the contrary, actor roles and directors require a nested field if we consider applying multiple
 simultaneous query clauses on their sub-fields (for instance search movies in which an actor is a woman AND
 whose role is nurse).
 More information on distinction between array and nested fields [here](
@@ -61,10 +61,10 @@ opt for a text field. Yet we might want to aggregate on exact keywords to count
 More information on distinction between text and keyword fields [here](
 https://www.elastic.co/fr/blog/strings-are-dead-long-live-strings)
 
-#### Mapping
+#### Mappings
 
 ```
-<Mapping>
+<Mappings>
 _
 ├── directors                                [Nested]
 │   ├── director_id                          Keyword
@@ -151,7 +151,7 @@ OUTPUT_FILE_NAME = 'serialized.json'
 
 # can take a while
 python examples/imdb/serialize.py
-# create index with mapping if necessary, bulk insert documents in ES
+# create index with mappings if necessary, bulk insert documents in ES
 python examples/imdb/load.py
 ```
diff --git a/examples/imdb/load.py b/examples/imdb/load.py
index 6215a4f4..e2faad71 100644
--- a/examples/imdb/load.py
+++ b/examples/imdb/load.py
@@ -2,10 +2,10 @@ from os.path import join
 
 from elasticsearch import Elasticsearch, helpers
 from examples.imdb.conf import ES_HOST, ES_USE_AUTH, ES_PASSWORD, ES_USER, DATA_DIR
-from pandagg.mapping import Mapping, Keyword, Text, Float, Nested, Integer
+from pandagg.mappings import Mappings, Keyword, Text, Float, Nested, Integer
 
 index_name = "movies"
-mapping = Mapping(
+mappings = Mappings(
     dynamic=False,
     properties={
         "movie_id": Keyword(),
@@ -68,8 +68,8 @@ def bulk_index(client, docs):
     print("CREATE INDEX\n")
     es_client.indices.create(index_name)
     print("-" * 50)
-    print("UPDATE MAPPING\n")
-    es_client.indices.put_mapping(index=index_name, body=mapping)
+    print("UPDATE MAPPINGS\n")
+    es_client.indices.put_mapping(index=index_name, body=mappings)
     print("-" * 50)
     print("WRITE DOCUMENTS\n")
diff --git a/pandagg/connections.py b/pandagg/connections.py
index fb0f419d..403e3f54 100644
--- a/pandagg/connections.py
+++ b/pandagg/connections.py
@@ -1,5 +1,4 @@
 # copied from elasticsearch-dsl/connections.py
-from six import string_types
 
 from elasticsearch import Elasticsearch
 
@@ -77,7 +76,7 @@ def get_connection(self, alias="default"):
         """
         # do not check isinstance(Elasticsearch) so that people can wrap their
         # clients
-        if not isinstance(alias, string_types):
+        if not isinstance(alias, str):
             return alias
 
         # connection already established
diff --git a/pandagg/discovery.py b/pandagg/discovery.py
index 92c5767e..704f151d 100644
--- a/pandagg/discovery.py
+++ b/pandagg/discovery.py
@@ -1,11 +1,9 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-from __future__ import unicode_literals
-from future.utils import iteritems, python_2_unicode_compatible
 
 from lighttree.interactive import Obj
 
-from pandagg.interactive.mapping import IMapping
+from pandagg.interactive.mappings import IMappings
 from pandagg.search import Search
 
 
@@ -15,11 +13,11 @@ def discover(using, index="*"):
     :param index: Comma-separated list or wildcard expression of index names used to limit the request.
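
    A usage sketch (assumes a reachable Elasticsearch client; the index pattern is illustrative):

    >>> from elasticsearch import Elasticsearch
    >>> indices = discover(Elasticsearch(), index="mov*")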
""" indices = Indices() - for index_name, index_detail in iteritems(using.indices.get(index=index)): + for index_name, index_detail in using.indices.get(index=index).items(): indices[index_name] = Index( client=using, name=index_name, - mapping=index_detail["mappings"], + mappings=index_detail["mappings"], settings=index_detail["settings"], aliases=index_detail["aliases"], ) @@ -29,21 +27,20 @@ def discover(using, index="*"): # until Proper Index class is written -@python_2_unicode_compatible class Index(object): - def __init__(self, name, settings, mapping, aliases, client=None): + def __init__(self, name, settings, mappings, aliases, client=None): super(Index, self).__init__() self.client = client self.name = name self.settings = settings - self._mapping = mapping - self.mapping = IMapping(mapping, client=client, index=name) + self._mappings = mappings + self.mappings = IMappings(mappings, client=client, index=name) self.aliases = aliases def search(self, nested_autocorrect=True, repr_auto_execute=True): return Search( using=self.client, - mapping=self._mapping, + mappings=self._mappings, index=self.name, nested_autocorrect=nested_autocorrect, repr_auto_execute=repr_auto_execute, diff --git a/pandagg/exceptions.py b/pandagg/exceptions.py index 7ac94f96..f4b58258 100644 --- a/pandagg/exceptions.py +++ b/pandagg/exceptions.py @@ -1,24 +1,21 @@ -from __future__ import unicode_literals - - class InvalidAggregation(Exception): """Wrong aggregation definition""" class MappingError(Exception): - """Basic Mapping Error""" + """Basic Mappings Error""" pass class AbsentMappingFieldError(MappingError): - """Field is not present in mapping.""" + """Field is not present in mappings.""" pass class InvalidOperationMappingFieldError(MappingError): - """Invalid aggregation type on this mapping field.""" + """Invalid aggregation type on this mappings field.""" pass diff --git a/pandagg/interactive/__init__.py b/pandagg/interactive/__init__.py index e6bf61e7..64e1c378 100644 --- a/pandagg/interactive/__init__.py +++ b/pandagg/interactive/__init__.py @@ -1,2 +1,2 @@ -from .mapping import * +from .mappings import * from .response import * diff --git a/pandagg/interactive/_field_agg_factory.py b/pandagg/interactive/_field_agg_factory.py index fc7abc96..35ed0318 100644 --- a/pandagg/interactive/_field_agg_factory.py +++ b/pandagg/interactive/_field_agg_factory.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from pandagg.node.aggs.abstract import AggClause +from pandagg.node.aggs.abstract import AggClause, BucketAggClause from pandagg.node.types import MAPPING_TYPES @@ -25,6 +25,11 @@ def field_klass_init(self, field, search): def aggregator_factory(agg_klass): def aggregator(self, **kwargs): + if issubclass(agg_klass, BucketAggClause): + return self._search.groupby( + "%s_%s" % (agg_klass.KEY, self._field), + agg_klass(field=self._field, **kwargs), + ) return self._search.agg( "%s_%s" % (agg_klass.KEY, self._field), agg_klass(field=self._field, **kwargs), diff --git a/pandagg/interactive/mapping.py b/pandagg/interactive/mappings.py similarity index 77% rename from pandagg/interactive/mapping.py rename to pandagg/interactive/mappings.py index 9079f4b8..863bb8a8 100644 --- a/pandagg/interactive/mapping.py +++ b/pandagg/interactive/mappings.py @@ -5,43 +5,43 @@ from lighttree import TreeBasedObj -from pandagg.tree.mapping import _mapping +from pandagg.tree.mappings import _mappings from pandagg.interactive._field_agg_factory import field_classes_per_name from pandagg.utils import DSLMixin -class 
IMapping(DSLMixin, TreeBasedObj):
-    """Interactive wrapper upon mapping tree, allowing field navigation and quick access to single clause aggregations
+class IMappings(DSLMixin, TreeBasedObj):
+    """Interactive wrapper upon mappings tree, allowing field navigation and quick access to single-clause aggregation
     computation.
     """
 
-    _REPR_NAME = "Mapping"
+    _REPR_NAME = "Mappings"
     _NODE_PATH_ATTR = "name"
 
     def __init__(
         self,
-        mapping,
+        mappings,
         client=None,
         index=None,
         depth=1,
         root_path=None,
         initial_tree=None,
     ):
-        if mapping is None:
-            raise ValueError("mapping cannot be None")
+        if mappings is None:
+            raise ValueError("mappings cannot be None")
         self._client = client
         self._index = index
-        super(IMapping, self).__init__(
-            tree=_mapping(mapping),
+        super(IMappings, self).__init__(
+            tree=_mappings(mappings),
             root_path=root_path,
             depth=depth,
             initial_tree=initial_tree,
         )
-        # if we reached a leave, add aggregation capabilities based on reached mapping type
+        # if we reached a leaf, add aggregation capabilities based on reached mappings type
        self._set_agg_property_if_required()
 
     def _clone(self, nid, root_path, depth):
-        return IMapping(
+        return IMappings(
             self._tree.subtree(nid)[1],
             client=self._client,
             root_path=root_path,
@@ -59,7 +59,7 @@ def _set_agg_property_if_required(self):
                 search=search_class(
                     using=self._client,
                     index=self._index,
-                    mapping=self._initial_tree,
+                    mappings=self._initial_tree,
                     repr_auto_execute=True,
                     nested_autocorrect=True,
                 ),
diff --git a/pandagg/mapping.py b/pandagg/mappings.py
similarity index 82%
rename from pandagg/mapping.py
rename to pandagg/mappings.py
index 4a48f9f4..14b93c4f 100644
--- a/pandagg/mapping.py
+++ b/pandagg/mappings.py
@@ -1,8 +1,10 @@
-from pandagg.tree.mapping import Mapping
-from pandagg.interactive.mapping import IMapping
-from pandagg.node.mapping.field_datatypes import (
+from pandagg.tree.mappings import Mappings
+from pandagg.interactive.mappings import IMappings
+from pandagg.node.mappings.field_datatypes import (
     Text,
     Keyword,
+    ConstantKeyword,
+    WildCard,
     Long,
     Integer,
     Short,
@@ -41,7 +43,7 @@
     Shape,
     Histogram,
 )
-from pandagg.node.mapping.meta_fields import (
+from pandagg.node.mappings.meta_fields import (
     Index,
     Type,
     Id,
@@ -54,10 +56,12 @@
 )
 
 __all__ = [
-    "Mapping",
-    "IMapping",
+    "Mappings",
+    "IMappings",
     "Text",
     "Keyword",
+    "ConstantKeyword",
+    "WildCard",
     "Long",
     "Integer",
     "Short",
diff --git a/pandagg/node/__init__.py b/pandagg/node/__init__.py
index f519d2ab..a9d01370 100644
--- a/pandagg/node/__init__.py
+++ b/pandagg/node/__init__.py
@@ -1,3 +1,3 @@
 from .query import *
 from .aggs import *
-from .mapping import *
+from .mappings import *
diff --git a/pandagg/node/_node.py b/pandagg/node/_node.py
index a9c4d7c5..6a671c6e 100644
--- a/pandagg/node/_node.py
+++ b/pandagg/node/_node.py
@@ -1,10 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-from __future__ import unicode_literals
-
-from future.utils import iteritems
-
 from lighttree import Node as OriginalNode
 
 from pandagg.utils import DSLMixin
 
@@ -20,7 +16,7 @@ class Node(DSLMixin, OriginalNode):
     @staticmethod
     def expand__to_dot(params):
         nparams = {}
-        for pname, pvalue in iteritems(params):
+        for pname, pvalue in params.items():
             if "__" in pname:
                 pname = pname.replace("__", ".")
             nparams[pname] = pvalue
diff --git a/pandagg/node/aggs/abstract.py b/pandagg/node/aggs/abstract.py
index 83994359..0f899e3e 100644
--- a/pandagg/node/aggs/abstract.py
+++ b/pandagg/node/aggs/abstract.py
@@ -1,22 +1,20 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-from __future__ import unicode_literals
-from builtins import str as text
-from six import text_type
-
 import json
 
 from pandagg.node._node import Node
 
 
 def A(name, type_or_agg=None, **body):
-    """Accept multiple syntaxes, return a AggNode instance.
+    """
+    Accept multiple syntaxes, return an AggNode instance.
+
     :param type_or_agg:
     :param body:
     :return: AggNode
     """
-    if isinstance(type_or_agg, text_type):
+    if isinstance(type_or_agg, str):
         # _translate_agg("per_user", "terms", field="user")
         return AggClause._get_dsl_class(type_or_agg)(**body)
     if isinstance(type_or_agg, AggClause):
@@ -54,7 +52,8 @@ def A(name, type_or_agg=None, **body):
 
 
 class AggClause(Node):
-    """Wrapper around elasticsearch aggregation concept.
+    """
+    Wrapper around elasticsearch aggregation concept.
     https://www.elastic.co/guide/en/elasticsearch/reference/2.3/search-aggregations.html
 
     Each aggregation can be seen both as a Node that can be encapsulated in a parent agg.
@@ -79,7 +78,7 @@ def line_repr(self, depth, **kwargs):
         # root node
         if self.KEY is None:
             return "_", ""
-        repr_args = [text(self.KEY)]
+        repr_args = [str(self.KEY)]
         if self.body:
             repr_args.append(self._params_repr(self.body))
         unnamed = "<%s>" % ", ".join(repr_args)
@@ -89,7 +88,7 @@ def _params_repr(params):
         params = params or {}
         return ", ".join(
-            "%s=%s" % (text(k), text(json.dumps(params[k], sort_keys=True)))
+            "%s=%s" % (str(k), str(json.dumps(params[k], sort_keys=True)))
             for k in sorted(params.keys())
         )
 
@@ -102,7 +101,8 @@ def valid_on_field_type(cls, field_type):
         return False
 
     def to_dict(self):
-        """ElasticSearch aggregation queries follow this formatting::
+        """
+        ElasticSearch aggregation queries follow this formatting::
 
            {
                "<aggregation_name>" : {
@@ -128,7 +128,9 @@ def to_dict(self):
         return aggs
 
     def get_filter(self, key):
-        """Return filter query to list documents having this aggregation key.
+        """
+        Return filter query to list documents having this aggregation key.
+
         :param key: string
         :return: elasticsearch filter query
         """
@@ -146,8 +148,8 @@ def extract_bucket_value(cls, response, value_as_dict=False):
 
     def __str__(self):
         return "<{class_}, type={type}, body={body}>".format(
-            class_=text(self.__class__.__name__),
-            type=text(self.KEY),
+            class_=str(self.__class__.__name__),
+            type=str(self.KEY),
             body=json.dumps(self.body),
         )
 
@@ -159,7 +161,9 @@ def __eq__(self, other):
 
 
 class Root(AggClause):
-    """Not a real aggregation. Just the initial empty dict."""
+    """
+    Not a real aggregation. Just the initial empty dict (used as lighttree.Tree root).
+    """
 
     KEY = "_root"
 
@@ -175,7 +179,9 @@ def extract_bucket_value(cls, response, value_as_dict=False):
 
 
 class MetricAgg(AggClause):
-    """Metric aggregation are aggregations providing a single bucket, with value attributes to be extracted."""
+    """
+    Metric aggregations are aggregations providing a single bucket, with value attributes to be extracted.
+    """
 
     VALUE_ATTRS = None
 
@@ -186,8 +192,9 @@ def get_filter(self, key):
         return None
 
 
-class BucketAggNode(AggClause):
-    """Bucket aggregation have special abilities: they can encapsulate other aggregations as children.
+class BucketAggClause(AggClause):
+    """
+    Bucket aggregations have special abilities: they can encapsulate other aggregations as children.
     Each time, the extracted value is a 'doc_count'.
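
    For instance, a bucket clause can encapsulate children aggregations at instantiation (an
    illustrative sketch; the field names are hypothetical):

    >>> from pandagg.aggs import Terms, Avg
    >>> per_genre = Terms(field="genres", aggs={"avg_rank": Avg(field="rank")})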
Provide methods:
@@ -222,7 +229,7 @@ def get_filter(self, key):
         raise NotImplementedError()
 
 
-class UniqueBucketAgg(BucketAggNode):
+class UniqueBucketAgg(BucketAggClause):
     """Aggregations providing a single bucket."""
 
     VALUE_ATTRS = None
@@ -234,13 +241,14 @@ def get_filter(self, key):
         raise NotImplementedError()
 
 
-class MultipleBucketAgg(BucketAggNode):
+class MultipleBucketAgg(BucketAggClause):
 
     VALUE_ATTRS = None
     IMPLICIT_KEYED = False
 
     def __init__(self, keyed=None, key_path="key", meta=None, **body):
-        """Aggregation that return either a list or a map of buckets.
+        """
+        Aggregation that returns either a list or a map of buckets.
 
         If keyed, ES buckets are expected as dict, else as list (in this case key_path is used to extract key
         from each list item).
@@ -273,7 +281,9 @@ def get_filter(self, key):
 
 
 class FieldOrScriptMetricAgg(MetricAgg):
-    """Metric aggregation based on single field."""
+    """
+    Metric aggregation based on a single field.
+    """
 
     VALUE_ATTRS = None
diff --git a/pandagg/node/aggs/bucket.py b/pandagg/node/aggs/bucket.py
index 35a41695..c5ff1bc8 100644
--- a/pandagg/node/aggs/bucket.py
+++ b/pandagg/node/aggs/bucket.py
@@ -9,7 +9,6 @@
 - significant terms
 """
 
-from builtins import str as text
 from pandagg.node.types import NUMERIC_TYPES
 from pandagg.node.aggs.abstract import MultipleBucketAgg, UniqueBucketAgg
 
@@ -267,7 +266,7 @@ def _extract_bucket_key(self, bucket):
         else:
             key = "*-"
         if self.to_key in bucket:
-            key += text(bucket[self.to_key])
+            key += str(bucket[self.to_key])
         else:
             key += "*"
         return key
diff --git a/pandagg/node/aggs/composite.py b/pandagg/node/aggs/composite.py
index 05c8cc21..a86ed0aa 100644
--- a/pandagg/node/aggs/composite.py
+++ b/pandagg/node/aggs/composite.py
@@ -1,7 +1,7 @@
-from .abstract import BucketAggNode
+from .abstract import BucketAggClause
 
 
-class Composite(BucketAggNode):
+class Composite(BucketAggClause):
     def __init__(self, sources, size=None, after_key=None, meta=None, **body):
         """https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-composite-aggregation.html
         :param sources:
diff --git a/pandagg/node/aggs/metric.py b/pandagg/node/aggs/metric.py
index 11278581..7e658ce9 100644
--- a/pandagg/node/aggs/metric.py
+++ b/pandagg/node/aggs/metric.py
@@ -1,8 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-from __future__ import unicode_literals
-
 from pandagg.node.types import NUMERIC_TYPES
 from pandagg.node.aggs.abstract import FieldOrScriptMetricAgg, MetricAgg
 
diff --git a/pandagg/node/mapping/__init__.py b/pandagg/node/mappings/__init__.py
similarity index 100%
rename from pandagg/node/mapping/__init__.py
rename to pandagg/node/mappings/__init__.py
diff --git a/pandagg/node/mapping/abstract.py b/pandagg/node/mappings/abstract.py
similarity index 92%
rename from pandagg/node/mapping/abstract.py
rename to pandagg/node/mappings/abstract.py
index 024b8185..6b7718a1 100644
--- a/pandagg/node/mapping/abstract.py
+++ b/pandagg/node/mappings/abstract.py
@@ -2,8 +2,6 @@
 # -*- coding: utf-8 -*-
 import json
 
-from builtins import str as text
-
 from pandagg.node._node import Node
 
 
@@ -42,8 +40,8 @@ def _display_pattern(self):
 
     def __str__(self):
         return "<%s field>:\n%s" % (
-            text(self.KEY).capitalize(),
-            text(json.dumps(self.body, indent=4)),
+            str(self.KEY).capitalize(),
+            str(json.dumps(self.body, indent=4)),
         )
 
diff --git a/pandagg/node/mapping/field_datatypes.py b/pandagg/node/mappings/field_datatypes.py
similarity index 96%
rename from pandagg/node/mapping/field_datatypes.py
rename to 
pandagg/node/mappings/field_datatypes.py index 49ea921f..a6252460 100644 --- a/pandagg/node/mapping/field_datatypes.py +++ b/pandagg/node/mappings/field_datatypes.py @@ -13,6 +13,14 @@ class Keyword(RegularField): KEY = "keyword" +class ConstantKeyword(RegularField): + KEY = "constant_keyword" + + +class WildCard(RegularField): + KEY = "wildcard" + + # numeric class Long(RegularField): KEY = "long" @@ -112,7 +120,7 @@ class GeoShape(RegularField): class IP(RegularField): """for IPv4 and IPv6 addresses""" - KEY = "IP" + KEY = "ip" class Completion(RegularField): diff --git a/pandagg/node/mapping/meta_fields.py b/pandagg/node/mappings/meta_fields.py similarity index 96% rename from pandagg/node/mapping/meta_fields.py rename to pandagg/node/mappings/meta_fields.py index 7a697dc6..7b505222 100644 --- a/pandagg/node/mapping/meta_fields.py +++ b/pandagg/node/mappings/meta_fields.py @@ -1,5 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- + from .abstract import Field @@ -11,7 +12,7 @@ class Index(Field): class Type(Field): - """The document’s mapping type.""" + """The document’s mappings type.""" KEY = "_type" diff --git a/pandagg/node/query/_parameter_clause.py b/pandagg/node/query/_parameter_clause.py index c39f9c3b..4a1bf57a 100644 --- a/pandagg/node/query/_parameter_clause.py +++ b/pandagg/node/query/_parameter_clause.py @@ -1,9 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from __future__ import unicode_literals - - from pandagg.node.query.abstract import ParentParameterClause diff --git a/pandagg/node/query/abstract.py b/pandagg/node/query/abstract.py index b0eafec4..dd458c2e 100644 --- a/pandagg/node/query/abstract.py +++ b/pandagg/node/query/abstract.py @@ -1,18 +1,15 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from __future__ import unicode_literals - -from builtins import str as text import json -from six import text_type - from pandagg.node._node import Node def Q(type_or_query=None, **body): - """Accept multiple syntaxes, return a QueryClause node. + """ + Accept multiple syntaxes, return a QueryClause node. 
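+
+    A sketch of the accepted syntaxes (the clause type and values are illustrative):
+
+    >>> q = Q("term", user="Kimchy")
+    >>> q = Q({"term": {"user": "Kimchy"}})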
+
     :param type_or_query:
     :param body:
     :return: QueryClause
@@ -40,7 +37,7 @@
         )
         type_, body_ = type_or_query.popitem()
         return QueryClause._get_dsl_class(type_)(**body_)
-    if isinstance(type_or_query, text_type):
+    if isinstance(type_or_query, str):
         return QueryClause._get_dsl_class(type_or_query)(**body)
     raise ValueError('"type_or_query" must be among "dict", "AggNode", "str"')
 
@@ -62,7 +59,7 @@ def __init__(
         self,
     )
 
     def line_repr(self, depth, **kwargs):
         repr_args = []
         if self._named:
-            repr_args.append("_name=%s" % text(self.identifier))
+            repr_args.append("_name=%s" % str(self.identifier))
         if self.body:
             repr_args.append(self._params_repr(self.body))
         return self.KEY, ", ".join(repr_args)
@@ -71,7 +68,7 @@ def _params_repr(params):
         params = params or {}
         return ", ".join(
-            "%s=%s" % (text(k), text(json.dumps(params[k], sort_keys=True)))
+            "%s=%s" % (str(k), str(json.dumps(params[k], sort_keys=True)))
             for k in sorted(params.keys())
         )
 
@@ -91,9 +88,9 @@ def to_dict(self):
 
     def __str__(self):
         return "<{class_}, id={id}, type={type}, body={body}>".format(
-            class_=text(self.__class__.__name__),
-            type=text(self.KEY),
-            id=text(self.identifier),
+            class_=str(self.__class__.__name__),
+            type=str(self.KEY),
+            id=str(self.identifier),
             body=self.body,
         )
 
@@ -123,20 +120,23 @@ def __init__(self, field, _name=None, **body):
 
 
 class FlatFieldQueryClause(AbstractSingleFieldQueryClause):
-    """Query clause applied on one single field.
+    """
+    Query clause applied on one single field.
     Example:
 
     Exists:
     {"exists": {"field": "user"}}
     -> field = "user"
     -> body = {"field": "user"}
-    q = Exists(field="user")
+    >>> from pandagg.query import Exists
+    >>> q = Exists(field="user")
 
     DistanceFeature:
     {"distance_feature": {"field": "production_date", "pivot": "7d", "origin": "now"}}
     -> field = "production_date"
     -> body = {"field": "production_date", "pivot": "7d", "origin": "now"}
-    q = DistanceFeature(field="production_date", pivot="7d", origin="now")
+    >>> from pandagg.query import DistanceFeature
+    >>> q = DistanceFeature(field="production_date", pivot="7d", origin="now")
     """
 
     _FIELD_AT_BODY_ROOT = True
 
@@ -147,20 +147,22 @@ def __init__(self, field, _name=None, **body):
 
 
 class KeyFieldQueryClause(AbstractSingleFieldQueryClause):
-    """Clause with field used as key in clause body:
+    """
+    Clause with field used as key in clause body:
 
     Term:
     {"term": {"user": {"value": "Kimchy", "boost": 1}}}
     -> field = "user"
     -> body = {"user": {"value": "Kimchy", "boost": 1}}
-    q1 = Term(user={"value": "Kimchy", "boost": 1}})
-    q2 = Term(field="user", value="Kimchy", boost=1}})
+    >>> from pandagg.query import Term
+    >>> q1 = Term(user={"value": "Kimchy", "boost": 1})
+    >>> q2 = Term(field="user", value="Kimchy", boost=1)
 
     Can accept a "_implicit_param" attribute specifying which is the equivalent key when inner body
    isn't a dict but a raw value.
For Term:
     _implicit_param = "value"
-    q = Term(user="Kimchy")
+    >>> q = Term(user="Kimchy")
     {"term": {"user": {"value": "Kimchy"}}}
     -> field = "user"
     -> body = {"term": {"user": {"value": "Kimchy"}}}
@@ -188,11 +190,11 @@ def __init__(self, field=None, _name=None, _expand__to_dot=True, **params):
 
     def line_repr(self, depth, **kwargs):
         if not self.inner_body:
-            return "", ", ".join([text(self.KEY), "field=%s" % text(self.field)])
+            return "", ", ".join([str(self.KEY), "field=%s" % str(self.field)])
         return (
             self.KEY,
             ", ".join(
                 ["field=%s" % str(self.field), self._params_repr(self.inner_body)]
             ),
         )
 
@@ -203,7 +205,7 @@ def __init__(self, fields, _name=None, **body):
         super(LeafQueryClause, self).__init__(_name=_name, fields=fields, **body)
 
     def line_repr(self, depth, **kwargs):
-        return self.KEY, "fields=%s" % (list(map(text, self.fields)))
+        return self.KEY, "fields=%s" % (list(map(str, self.fields)))
 
 
 class ParentParameterClause(QueryClause):
diff --git a/pandagg/node/query/compound.py b/pandagg/node/query/compound.py
index b678b021..fc249035 100644
--- a/pandagg/node/query/compound.py
+++ b/pandagg/node/query/compound.py
@@ -2,7 +2,8 @@
 
 
 class CompoundClause(QueryClause):
-    """Compound clauses can encapsulate other query clauses:
+    """
+    Compound clauses can encapsulate other query clauses:
 
     .. code-block::
 
diff --git a/pandagg/node/response/bucket.py b/pandagg/node/response/bucket.py
index a39bd30b..2c17b135 100644
--- a/pandagg/node/response/bucket.py
+++ b/pandagg/node/response/bucket.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-from __future__ import unicode_literals
 
 from pandagg.node._node import Node
 
@@ -20,7 +19,8 @@ def __init__(self, value, key=None, level=None):
 
     @property
     def attr_name(self):
-        """Determine under which attribute name the bucket will be available in response tree.
+        """
+        Determine under which attribute name the bucket will be available in response tree.
 
         Dots are replaced by `_` characters so that they don't prevent accessing it as an attribute.
         Resulting attribute unfit for python attribute name syntax is still possible and will be accessible through
diff --git a/pandagg/response.py b/pandagg/response.py
index cd46a499..2f97a4ae 100644
--- a/pandagg/response.py
+++ b/pandagg/response.py
@@ -1,17 +1,13 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-from __future__ import unicode_literals
-
 import copy
 
-from builtins import str as text
-
 from future.utils import iterkeys, iteritems
 
 from pandagg.interactive.response import IResponse
 from pandagg.node.aggs.abstract import UniqueBucketAgg, MetricAgg, Root
-from pandagg.node.aggs.bucket import Nested
+from pandagg.node.aggs.bucket import Nested, ReverseNested
 from pandagg.tree.response import AggsResponseTree
 
 
@@ -102,9 +98,9 @@ def to_dataframe(self, expand_source=True, source_only=True):
 
     def __repr__(self):
         if not isinstance(self.total, dict):
-            total_repr = text(self.total)
+            total_repr = str(self.total)
         elif self.total.get("relation") == "eq":
-            total_repr = text(self.total["value"])
+            total_repr = str(self.total["value"])
         elif self.total.get("relation") == "gte":
             total_repr = ">%d" % self.total["value"]
         else:
@@ -243,24 +239,25 @@ def _grouping_agg(self, name=None):
         """Return aggregation node that is used as grouping node.
         Note: in case there is only a nested aggregation below that node, group by the nested clause.
""" - # if provided if name is not None: + # override existing groupby_ptr id_ = self._aggs.id_from_key(name) if not self._aggs._is_eligible_grouping_node(id_): raise ValueError( "Cannot group by <%s>, not a valid grouping aggregation" % name ) key, node = self._aggs.get(id_) - # if parent of single nested clause and nested_autocorrect - if self._aggs.nested_autocorrect: - children = self._aggs.children(node.identifier) - if len(children) == 1: - child_key, child_node = children[0] - if isinstance(child_node, Nested): - return child_key, child_node - return key, node - # if use of groupby method in Aggs class, use groupby pointer - return self._aggs.get(self._aggs._groupby_ptr) + else: + key, node = self._aggs.get(self._aggs._groupby_ptr) + + # if parent of single nested clause and nested_autocorrect + if self._aggs.nested_autocorrect: + children = self._aggs.children(node.identifier) + if len(children) == 1: + child_key, child_node = children[0] + if isinstance(child_node, (Nested, ReverseNested)): + return child_key, child_node + return key, node def to_tabular( self, @@ -435,4 +432,4 @@ def serialize(self, output="tabular", **kwargs): def __repr__(self): if not self.keys(): return " empty" - return " %s" % list(map(text, self.keys())) + return " %s" % list(map(str, self.keys())) diff --git a/pandagg/search.py b/pandagg/search.py index a4819fe0..cdf03fbe 100644 --- a/pandagg/search.py +++ b/pandagg/search.py @@ -3,13 +3,12 @@ import json from elasticsearch.helpers import scan -from future.utils import string_types from lighttree.exceptions import NotFoundNodeError from pandagg.connections import get_connection from pandagg.query import Bool from pandagg.response import Response -from pandagg.tree.mapping import _mapping +from pandagg.tree.mappings import _mappings from pandagg.tree.query import Query, ADD from pandagg.tree.aggs import Aggs from pandagg.utils import DSLMixin @@ -63,7 +62,7 @@ def index(self, *index): else: indexes = [] for i in index: - if isinstance(i, string_types): + if isinstance(i, str): indexes.append(i) elif isinstance(i, list): indexes += i @@ -104,7 +103,7 @@ def __init__( self, using=None, index=None, - mapping=None, + mappings=None, nested_autocorrect=False, repr_auto_execute=False, ): @@ -113,12 +112,12 @@ def __init__( :arg using: `Elasticsearch` instance to use :arg index: limit the search to index - :arg mapping: mapping used for query validation + :arg mappings: mappings used for query validation :arg nested_autocorrect: in case of missing nested clause, will insert it automatically :arg repr_auto_execute: execute query and display results as dataframe, requires client to be provided All the parameters supplied (or omitted) at creation type can be later - overridden by methods (`using`, `index` and `mapping` respectively). + overridden by methods (`using`, `index` and `mappings` respectively). 
""" self._sort = [] @@ -127,12 +126,12 @@ def __init__( self._highlight_opts = {} self._suggest = {} self._script_fields = {} - mapping = _mapping(mapping) - self._mapping = mapping - self._aggs = Aggs(mapping=mapping, nested_autocorrect=nested_autocorrect) - self._query = Query(mapping=mapping, nested_autocorrect=nested_autocorrect) + mappings = _mappings(mappings) + self._mappings = mappings + self._aggs = Aggs(mappings=mappings, nested_autocorrect=nested_autocorrect) + self._query = Query(mappings=mappings, nested_autocorrect=nested_autocorrect) self._post_filter = Query( - mapping=mapping, nested_autocorrect=nested_autocorrect + mappings=mappings, nested_autocorrect=nested_autocorrect ) self._repr_auto_execute = repr_auto_execute super(Search, self).__init__(using=using, index=index) @@ -341,10 +340,10 @@ def __getitem__(self, n): return s elif isinstance(n, list): # Columns selection - if self._mapping: + if self._mappings: for key in n: try: - self._mapping.id_from_key(key) + self._mappings.id_from_key(key) except NotFoundNodeError: raise KeyError("%s not in index" % key) return self.source(includes=n) @@ -395,7 +394,9 @@ def _clone(self): of all the underlying objects. Used internally by most state modifying APIs. """ - s = self.__class__(using=self._using, index=self._index, mapping=self._mapping) + s = self.__class__( + using=self._using, index=self._index, mappings=self._mappings + ) s._params = self._params.copy() s._sort = self._sort[:] s._source = copy.copy(self._source) if self._source is not None else None @@ -406,7 +407,7 @@ def _clone(self): s._aggs = self._aggs.clone() s._query = self._query.clone() s._post_filter = self._post_filter.clone() - s._mapping = None if self._mapping is None else self._mapping.clone() + s._mappings = None if self._mappings is None else self._mappings.clone() s._repr_auto_execute = self._repr_auto_execute return s @@ -465,7 +466,7 @@ def script_fields(self, **kwargs): """ s = self._clone() for name in kwargs: - if isinstance(kwargs[name], string_types): + if isinstance(kwargs[name], str): kwargs[name] = {"script": kwargs[name]} s._script_fields.update(kwargs) return s @@ -541,7 +542,7 @@ def sort(self, *keys): s = self._clone() s._sort = [] for k in keys: - if isinstance(k, string_types) and k.startswith("-"): + if isinstance(k, str) and k.startswith("-"): if k[1:] == "_score": raise ValueError("Sorting by `-_score` is not allowed.") k = {k[1:]: {"order": "desc"}} diff --git a/pandagg/tree/__init__.py b/pandagg/tree/__init__.py index ca8046f4..73e923ee 100644 --- a/pandagg/tree/__init__.py +++ b/pandagg/tree/__init__.py @@ -1,3 +1,3 @@ from .aggs import * from .query import * -from .mapping import * +from .mappings import * diff --git a/pandagg/tree/_tree.py b/pandagg/tree/_tree.py index c81c5dc9..abf4d457 100644 --- a/pandagg/tree/_tree.py +++ b/pandagg/tree/_tree.py @@ -1,17 +1,11 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from __future__ import unicode_literals -from future.utils import python_2_unicode_compatible - -from builtins import str as text - from lighttree import Tree as OriginalTree from pandagg.utils import DSLMixin -@python_2_unicode_compatible class Tree(DSLMixin, OriginalTree): KEY = None @@ -22,7 +16,21 @@ def get_node_dsl_class(cls, name): return cls.node_class._get_dsl_class(name) def id_from_key(self, key): - """Find node identifier based on key. If multiple nodes have the same key, takes the first found one.""" + """ + Find node identifier based on key. If multiple nodes have the same key, takes the first one. 
+
+        Useful because of how pandagg implements lighttree.Tree.
+        A bit of context:
+
+        ElasticSearch allows queries to contain multiple similarly named clauses (for queries and aggregations).
+        As a consequence, clause names are not used as clause identifiers in Trees; internally, pandagg (like
+        lighttree) uses auto-generated uuids to distinguish them.
+
+        But for usability reasons, notably when declaring that an aggregation clause must be placed relatively to
+        another one, the latter is identified by its name rather than its internal id. Since it is technically
+        possible that multiple clauses share the same name (not recommended, but allowed), some pandagg features are
+        ambiguous and not recommended in such a context.
+        """
         for k, n in self.list():
             if k == key:
                 return n.identifier
@@ -30,7 +38,7 @@ def id_from_key(self, key):
 
     def __str__(self):
         return "<{class_}>\n{tree}".format(
-            class_=text(self.__class__.__name__), tree=self.show(limit=40)
+            class_=str(self.__class__.__name__), tree=self.show(limit=40)
         )
 
     def __repr__(self):
diff --git a/pandagg/tree/aggs.py b/pandagg/tree/aggs.py
index 4c8ce686..837111a4 100644
--- a/pandagg/tree/aggs.py
+++ b/pandagg/tree/aggs.py
@@ -1,30 +1,23 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-from __future__ import unicode_literals
-from builtins import str as text
-from six import text_type
-
 import json
 
-from future.utils import python_2_unicode_compatible
-
 from pandagg.tree._tree import Tree
-from pandagg.tree.mapping import _mapping
+from pandagg.tree.mappings import _mappings
 
-from pandagg.node.aggs.abstract import BucketAggNode, AggClause, Root, A
+from pandagg.node.aggs.abstract import BucketAggClause, AggClause, Root, A
 from pandagg.node.aggs.bucket import Nested, ReverseNested
 from pandagg.node.aggs.pipeline import BucketSelector, BucketSort
 
 
-@python_2_unicode_compatible
 class Aggs(Tree):
     """
     Combination of aggregation clauses. This class provides handy methods to build an aggregation (see
     :func:`~pandagg.tree.aggs.Aggs.aggs` and :func:`~pandagg.tree.aggs.Aggs.groupby`), and is also used to parse
     aggregation responses into easy-to-manipulate formats.
 
-    Mapping declaration is optional, but doing so validates aggregation validity and automatically handles missing
+    Mappings declaration is optional, but doing so enables aggregation validation and automatically handles missing
     nested clauses.
 
     Accept following syntaxes:
@@ -39,19 +32,19 @@ class Aggs(Tree):
 
     >>> from pandagg.aggs import Terms, Avg
     >>> Aggs({'per_user': Terms(field='user')})
 
-    :param mapping: ``dict`` or ``pandagg.tree.mapping.Mapping`` Mapping of requested indice(s). If provided, will
+    :param mappings: ``dict`` or ``pandagg.tree.mappings.Mappings`` Mappings of requested indice(s). If provided, will
         check aggregations validity.
     :param nested_autocorrect: ``bool`` In case of missing nested clauses in aggregation, if True, automatically
-        add missing nested clauses, else raise error. Ignored if mapping is not provided.
+        add missing nested clauses, else raise error. Ignored if mappings are not provided.
     :param _groupby_ptr: ``str`` identifier of aggregation clause used as grouping element (used by `clone` method).
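
    For instance (an illustrative sketch; the field and mappings are hypothetical):

    >>> a = Aggs(
    ...     {"per_user": {"terms": {"field": "user_id"}}},
    ...     mappings={"properties": {"user_id": {"type": "keyword"}}},
    ...     nested_autocorrect=True,
    ... )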
""" node_class = AggClause def __init__( - self, aggs=None, mapping=None, nested_autocorrect=None, _groupby_ptr=None + self, aggs=None, mappings=None, nested_autocorrect=None, _groupby_ptr=None ): - self.mapping = _mapping(mapping) + self.mappings = _mappings(mappings) self.nested_autocorrect = nested_autocorrect super(Aggs, self).__init__() @@ -64,112 +57,6 @@ def __init__( if aggs is not None: self._insert_aggs(aggs, at_root=True) - def __nonzero__(self): - return bool(self.to_dict()) - - __bool__ = __nonzero__ - - def _insert_agg( - self, name, node, insert_below_id=None, at_root=None, groupby_id=False - ): - """ - Using flat declaration: - >>> Aggs().agg("per_user", "terms", field="user") - >>> Aggs().agg("per_user", {"terms": {"field": "user"}}) - - Using DSL class: - >>> from pandagg.aggs import Terms - >>> Aggs().agg("per_user", Terms(field='user')) - - Agg node insertion, accepts following syntax: - - name="per_user", type_or_agg="terms", field="user" - - name="per_user", type_or_agg=Terms(field='user') - - name="per_user", type_or_agg={"terms": {"field": "user"}} - - insert children aggs as well: - - name="per_user", type_or_agg="terms", field="user", aggs={"avg_spent_time": {"avg": {"field": "spent_time"}}} - - name="per_user", type_or_agg=Terms(field='user', aggs={"avg_spent_time": Avg(field="spent_time")}) - - name="per_user", type_or_agg={"field": "user", aggs: {"avg_spent_time": {"avg": {"field": "spent_time"}}}} - """ - if insert_below_id and at_root: - raise ValueError('Must define at most one of "insert_below" or "at_root".') - if at_root: - insert_below_id = self.root - # based on last groupby pointer - if insert_below_id is None: - insert_below_id = self._groupby_ptr - - if not isinstance(name, text_type): - raise ValueError('Agg "name" must be a str.') - - _children_aggs = node._children or {} - - if groupby_id: - if _children_aggs: - raise ValueError("Cannot group by multiple aggs at once.") - subs = [ - self.drop_subtree(cid) for cid in self.children_ids(insert_below_id) - ] - self.insert(node, key=name, parent_id=insert_below_id) - for sub_key, sub_tree in subs: - self.insert(sub_tree, key=sub_key, parent_id=node.identifier) - # moving pointer when using groupby - self._groupby_ptr = node.identifier - return - - # in aggs mode, do not move pointer - self.insert_node(node=node, key=name, parent_id=insert_below_id) - for child_name, child in _children_aggs.items(): - child_node = A(child_name, child) - self._insert_agg( - name=child_name, node=child_node, insert_below_id=node.identifier - ) - - def _clone_init(self, deep=False): - return Aggs( - mapping=self.mapping.clone(deep=deep) if self.mapping is not None else None, - nested_autocorrect=self.nested_autocorrect, - _groupby_ptr=self._groupby_ptr, - ) - - def _is_eligible_grouping_node(self, nid): - """ - Return whether node can be used as grouping node. - """ - _, node = self.get(nid) - if not isinstance(node, BucketAggNode): - return False - # special aggregations not returning anything - if isinstance(node, (BucketSelector, BucketSort)): - return False - return True - - @property - def _deepest_linear_bucket_agg(self): - """ - Return deepest bucket aggregation node identifier (pandagg.nodes.abstract.BucketAggNode) of that aggregation - that neither has siblings, nor has an ancestor with siblings. 
- """ - if len(self._nodes_map) <= 1: - return self.root - last_bucket_agg_id = self.root - children = [ - c - for k, c in self.children(last_bucket_agg_id) - if self._is_eligible_grouping_node(c.identifier) - ] - while len(children) == 1: - last_agg_node = children[0] - if not self._is_eligible_grouping_node(last_agg_node.identifier): - break - last_bucket_agg_id = last_agg_node.identifier - children = [ - c - for k, c in self.children(last_bucket_agg_id) - if self._is_eligible_grouping_node(c.identifier) - ] - return last_bucket_agg_id - def grouped_by(self, agg_name=None, deepest=False): """ Define which aggregation will be used as grouping pointer. @@ -195,7 +82,7 @@ def grouped_by(self, agg_name=None, deepest=False): def groupby(self, name, type_or_agg=None, insert_below=None, at_root=None, **body): """ - Arrange passed aggregations in vertical/nested manner, above or below another agg clause. + Insert provided aggregation clause in copy of initial Aggs. Given the initial aggregation:: @@ -205,19 +92,16 @@ def groupby(self, name, type_or_agg=None, insert_below=None, at_root=None, **bod If `insert_below` = 'A':: A──> new──> B - └──> C + └──> C - >>> Aggs()\ - >>> .groupby('per_user_id', 'terms', field='user_id') + >>> Aggs().groupby('per_user_id', 'terms', field='user_id') {"per_user_id":{"terms":{"field":"user_id"}}} - >>> Aggs()\ - >>> .groupby('per_user_id', {'terms': {"field": "user_id"}}) + >>> Aggs().groupby('per_user_id', {'terms': {"field": "user_id"}}) {"per_user_id":{"terms":{"field":"user_id"}}} >>> from pandagg.aggs import Terms - >>> Aggs()\ - >>> .groupby('per_user_id', Terms(field="user_id")) + >>> Aggs().groupby('per_user_id', Terms(field="user_id")) {"per_user_id":{"terms":{"field":"user_id"}}} :rtype: pandagg.aggs.Aggs @@ -232,60 +116,10 @@ def groupby(self, name, type_or_agg=None, insert_below=None, at_root=None, **bod node=node, insert_below_id=insert_below_id, at_root=at_root, - groupby_id=True, + groupby=True, ) return new_agg - def aggs(self, aggs, insert_below=None, at_root=False): - """ - Insert provided aggs in copy of initial Aggs. 
- - Accept following syntaxes for provided aggs: - - python dict format: - >>> Aggs().aggs({'some_agg': {'terms': {'field': 'some_field'}}, 'other_agg': {'avg': {'field': 'age'}}}) - - Aggs instance: - >>> Aggs().aggs(Aggs({'some_agg': {'terms': {'field': 'some_field'}}, 'other_agg': {'avg': {'field': 'age'}}})) - - dict with Agg clauses values: - >>> from pandagg.aggs import Terms, Avg - >>> Aggs().aggs({'some_agg': Terms(field='some_field'), 'other_agg': Avg(field='age')}) - - :param aggs: aggregations to insert into existing aggregation - :param insert_below: name of aggregation below which provided aggs should be inserted - :param at_root: if True, aggregation is inserted at root - :return: copy of initial Aggs with provided aggs inserted - """ - new_agg = self.clone(with_nodes=True) - if insert_below is not None: - insert_below = new_agg.id_from_key(insert_below) - new_agg._insert_aggs(aggs=aggs, insert_below_id=insert_below, at_root=at_root) - return new_agg - - def _insert_aggs(self, aggs, insert_below_id=None, at_root=False): - """Insert aggs clauses in current Aggs (mutate current instance)""" - if at_root: - insert_below_id = self.root - elif not insert_below_id: - # parent based on last groupby pointer - insert_below_id = self._groupby_ptr - - if aggs is None: - return - if isinstance(aggs, Aggs): - self.merge(aggs, nid=insert_below_id) - self._groupby_ptr = self.root - return - if isinstance(aggs, dict): - for agg_name, agg_body in aggs.items(): - node = A(agg_name, agg_body) - self._insert_agg( - name=agg_name, node=node, insert_below_id=insert_below_id - ) - return - raise TypeError("Unsupported aggs type %s for Aggs" % type(aggs)) - def agg(self, name, type_or_agg=None, insert_below=None, at_root=False, **body): """ Insert provided agg clause in copy of initial Aggs. @@ -318,7 +152,42 @@ def agg(self, name, type_or_agg=None, insert_below=None, at_root=False, **body): ) return new_agg + def aggs(self, aggs, insert_below=None, at_root=False): + """ + Insert provided aggs in copy of initial Aggs. + + Accept following syntaxes for provided aggs: + + python dict format: + >>> Aggs().aggs({'some_agg': {'terms': {'field': 'some_field'}}, 'other_agg': {'avg': {'field': 'age'}}}) + + Aggs instance: + >>> Aggs().aggs(Aggs({'some_agg': {'terms': {'field': 'some_field'}}, 'other_agg': {'avg': {'field': 'age'}}})) + + dict with Agg clauses values: + >>> from pandagg.aggs import Terms, Avg + >>> Aggs().aggs({'some_agg': Terms(field='some_field'), 'other_agg': Avg(field='age')}) + + :param aggs: aggregations to insert into existing aggregation + :param insert_below: name of aggregation below which provided aggs should be inserted + :param at_root: if True, aggregation is inserted at root + :return: copy of initial Aggs with provided aggs inserted + """ + new_agg = self.clone(with_nodes=True) + if insert_below is not None: + insert_below = new_agg.id_from_key(insert_below) + new_agg._insert_aggs(aggs=aggs, insert_below_id=insert_below, at_root=at_root) + return new_agg + def to_dict(self, from_=None, depth=None): + """ + Serialize Aggs as dict. 
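+
+        For instance (an illustrative sketch):
+
+        >>> Aggs({"per_tag": {"terms": {"field": "tags"}}}).to_dict()
+        {'per_tag': {'terms': {'field': 'tags'}}}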
+ + :param from_: identifier of aggregation clause, if provided, limits serialization to this clause and its + children (used for recursion, shouldn't be useful) + :param depth: integer, if provided, limit the serialization to a given depth + :return: dict + """ if self.root is None: return None from_ = self.root if from_ is None else from_ @@ -339,31 +208,217 @@ def to_dict(self, from_=None, depth=None): return node_query_dict def applied_nested_path_at_node(self, nid): - # from current node to root + """ + Return nested path applied at a clause. + + :param nid: clause identifier + :return: None if no nested is applied, else applied path (str) + """ + # iterate from provided clause to root clause for id_ in self.ancestors_ids(nid, include_current=True): _, node = self.get(id_) if isinstance(node, (Nested, ReverseNested)): return node.path return None + def apply_reverse_nested(self, nid=None): + for k, leaf in self.leaves(nid): + if isinstance(leaf, BucketAggClause) and self.applied_nested_path_at_node( + leaf.identifier + ): + self.add_node( + ReverseNested(), + insert_below=leaf.identifier, + key="reverse_nested_%s" % leaf.identifier, + ) + + def show(self, *args, line_max_length=80, **kwargs): + """ + Return compact representation of Aggs. + + >>> Aggs({ + >>> "genres": { + >>> "terms": {"field": "genres", "size": 3}, + >>> "aggs": { + >>> "movie_decade": { + >>> "date_histogram": {"field": "year", "fixed_interval": "3650d"} + >>> } + >>> }, + >>> } + >>> }).show() + + genres + └── movie_decade + + All *args and **kwargs are propagated to `lighttree.Tree.show` method. + :return: str + """ + root_children = self.children(self.root) + if len(root_children) == 0: + return " empty" + if len(root_children) == 1: + child_id = root_children[0][1].identifier + return "\n%s" % str( + super(Tree, self).show( + child_id, *args, line_max_length=line_max_length, **kwargs + ) + ) + + return "\n%s" % str( + super(Tree, self).show(*args, line_max_length=line_max_length, **kwargs) + ) + + def __nonzero__(self): + return bool(self.to_dict()) + + __bool__ = __nonzero__ + + def _insert_agg( + self, name, node, insert_below_id=None, at_root=None, groupby=False + ): + """ + Mutate current Aggs instance (no clone), inserting named AggClause instance at asked location. 
+ + :param name: aggregation name + :param node: AggClause instance that should be inserted + :param insert_below_id: if provided, inserted clause is placed below this clause + :param at_root: boolean, if True inserted clause is placed on top of aggregation + :param groupby: boolean, if True, move all targeted clause children under inserted + clause and update groupby pointer + """ + if insert_below_id and at_root: + raise ValueError('Must define at most one of "insert_below" or "at_root".') + if at_root: + insert_below_id = self.root + # based on last groupby pointer + if insert_below_id is None: + insert_below_id = self._groupby_ptr + + if not isinstance(name, str): + raise ValueError('Agg "name" must be a str.') + + _children_aggs = node._children or {} + + if groupby: + if _children_aggs: + raise ValueError("Cannot group by multiple aggs at once.") + subs = [ + self.drop_subtree(cid) for cid in self.children_ids(insert_below_id) + ] + self.insert(node, key=name, parent_id=insert_below_id) + for sub_key, sub_tree in subs: + self.insert(sub_tree, key=sub_key, parent_id=node.identifier) + # moving pointer when using groupby + self._groupby_ptr = node.identifier + return + + # in aggs mode, do not move pointer + self.insert_node(node=node, key=name, parent_id=insert_below_id) + for child_name, child in _children_aggs.items(): + child_node = A(child_name, child) + self._insert_agg( + name=child_name, node=child_node, insert_below_id=node.identifier + ) + + def _clone_init(self, deep=False): + return Aggs( + mappings=self.mappings.clone(deep=deep) + if self.mappings is not None + else None, + nested_autocorrect=self.nested_autocorrect, + _groupby_ptr=self._groupby_ptr, + ) + + def _is_eligible_grouping_node(self, nid): + """ + Return whether node can be used as grouping node. + """ + _, node = self.get(nid) + if not isinstance(node, BucketAggClause): + return False + # special aggregations not returning anything + if isinstance(node, (BucketSelector, BucketSort)): + return False + return True + + @property + def _deepest_linear_bucket_agg(self): + """ + Return deepest bucket aggregation node identifier (pandagg.nodes.abstract.BucketAggClause) of that aggregation + that neither has siblings, nor has an ancestor with siblings. + """ + if len(self._nodes_map) <= 1: + return self.root + last_bucket_agg_id = self.root + children = [ + c + for k, c in self.children(last_bucket_agg_id) + if self._is_eligible_grouping_node(c.identifier) + ] + while len(children) == 1: + last_agg_node = children[0] + if not self._is_eligible_grouping_node(last_agg_node.identifier): + break + last_bucket_agg_id = last_agg_node.identifier + children = [ + c + for k, c in self.children(last_bucket_agg_id) + if self._is_eligible_grouping_node(c.identifier) + ] + return last_bucket_agg_id + + def _insert_aggs(self, aggs, insert_below_id=None, at_root=False): + """ + Insert multiple aggregation clauses in current Aggs (mutate current instance). + By default place them under groupby pointer if none of `insert_below_id` or `at_root` is provided. + + :param aggs: Aggs instance, or dict + :param insert_below_id: clause identifier under which inserted aggs should be placed + :param at_root: if True, place inserted aggs at root, instead of placing them under Aggs._groupby_ptr. 
+ :return: + """ + if at_root: + insert_below_id = self.root + elif not insert_below_id: + # parent based on last groupby pointer + insert_below_id = self._groupby_ptr + + if aggs is None: + return + if isinstance(aggs, Aggs): + self.merge(aggs, nid=insert_below_id) + self._groupby_ptr = self.root + return + if isinstance(aggs, dict): + for agg_name, agg_body in aggs.items(): + node = A(agg_name, agg_body) + self._insert_agg( + name=agg_name, node=node, insert_below_id=insert_below_id + ) + return + raise TypeError("Unsupported aggs type %s for Aggs" % type(aggs)) + def _insert_node_below(self, node, parent_id, key, by_path): """ - If mapping is provided, nested aggregations are automatically applied. + If mappings is provided, check if aggregation complies with it (nested / reverse nested). + + Note: overrides `lighttree.Tree._insert_node_below` method to handle automatic nested validation while inserting + a clause. """ - # if aggregation node is explicitely nested or reverse nested aggregation, do not override, but validate + # if aggregation node is explicitly nested or reverse nested aggregation, do not override if ( isinstance(node, Nested) or isinstance(node, ReverseNested) - or not self.mapping + or not self.mappings or not hasattr(node, "field") or self.root is None ): return super(Aggs, self)._insert_node_below(node, parent_id, key, by_path) - self.mapping.validate_agg_node(node) + self.mappings.validate_agg_clause(node) # from deepest to highest - required_nested_level = self.mapping.nested_at_field(node.field) + required_nested_level = self.mappings.nested_at_field(node.field) current_nested_level = self.applied_nested_path_at_node(parent_id) if current_nested_level == required_nested_level: @@ -402,7 +457,7 @@ def _insert_node_below(self, node, parent_id, key, by_path): ) # requires nested - apply all required nested fields - for nested_lvl in reversed(self.mapping.list_nesteds_at_field(node.field)): + for nested_lvl in reversed(self.mappings.list_nesteds_at_field(node.field)): if current_nested_level != nested_lvl: # check if already exists in direct children, else create it child_nested = next( @@ -430,32 +485,5 @@ def _insert_node_below(self, node, parent_id, key, by_path): parent_id = nested_node.identifier super(Aggs, self)._insert_node_below(node, parent_id, key, by_path) - def apply_reverse_nested(self, nid=None): - for k, leaf in self.leaves(nid): - if isinstance(leaf, BucketAggNode) and self.applied_nested_path_at_node( - leaf.identifier - ): - self.add_node( - ReverseNested(), - insert_below=leaf.identifier, - key="reverse_nested_%s" % leaf.identifier, - ) - def __str__(self): return json.dumps(self.to_dict(), indent=2) - - def show(self, *args, line_max_length=80, **kwargs): - root_children = self.children(self.root) - if len(root_children) == 0: - return " empty" - if len(root_children) == 1: - child_id = root_children[0][1].identifier - return "\n%s" % text( - super(Tree, self).show( - child_id, *args, line_max_length=line_max_length, **kwargs - ) - ) - - return "\n%s" % text( - super(Tree, self).show(*args, line_max_length=line_max_length, **kwargs) - ) diff --git a/pandagg/tree/mapping.py b/pandagg/tree/mapping.py deleted file mode 100644 index 0cef7fab..00000000 --- a/pandagg/tree/mapping.py +++ /dev/null @@ -1,143 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from future.utils import iteritems - -from pandagg.node.mapping.abstract import Field, RegularField, ComplexField - - -from pandagg.exceptions import ( - AbsentMappingFieldError, - 
InvalidOperationMappingFieldError, -) -from pandagg.tree._tree import Tree - - -def _mapping(m): - if m is None: - return None - if isinstance(m, dict): - return Mapping(**m) - if isinstance(m, Mapping): - return m - raise TypeError("Unsupported %s type for Mapping" % type(m)) - - -class Mapping(Tree): - - node_class = Field - KEY = None - - def __init__(self, properties=None, dynamic=False, **kwargs): - """""" - super(Mapping, self).__init__() - root_node = Field(dynamic=dynamic, **kwargs) - self.insert_node(root_node) - if properties: - self._insert(root_node.identifier, properties, False) - - def _insert(self, pid, el, is_subfield): - if not isinstance(el, dict): - raise ValueError("Wrong declaration, got %s" % el) - for name, field in iteritems(el): - if isinstance(field, dict): - field = field.copy() - field = Field._get_dsl_class(field.pop("type", "object"))( - _subfield=is_subfield, **field - ) - elif isinstance(field, Field): - field._subfield = is_subfield - pass - else: - raise ValueError("Unsupported type %s" % type(field)) - self.insert_node(field, key=name, parent_id=pid) - if isinstance(field, ComplexField) and field.properties: - self._insert(field.identifier, field.properties, False) - if isinstance(field, RegularField) and field.fields: - if is_subfield: - raise ValueError( - "Cannot insert subfields into a subfield on field %s" % name - ) - self._insert(field.identifier, field.fields, True) - - def to_dict(self, from_=None, depth=None): - if self.root is None: - return None - from_ = self.root if from_ is None else from_ - key, node = self.get(from_) - children_queries = {} - if depth is None or depth > 0: - if depth is not None: - depth -= 1 - for child_key, child_node in self.children(node.identifier): - children_queries[child_key] = self.to_dict( - from_=child_node.identifier, depth=depth - ) - serialized_node = node.body - if children_queries: - if node.KEY is None or node.KEY in ("object", "nested"): - serialized_node["properties"] = children_queries - else: - serialized_node["fields"] = children_queries - return serialized_node - - def validate_agg_node(self, agg_node, exc=True): - """Ensure if node has field or path that it exists in mapping, and that required aggregation type - if allowed on this kind of field. - :param agg_node: AggNode you want to validate on this mapping - :param exc: boolean, if set to True raise exception if invalid - :rtype: boolean - """ - if hasattr(agg_node, "path"): - if agg_node.path is None: - # reverse nested - return True - return self.resolve_path_to_id(agg_node.path) in self - - if not hasattr(agg_node, "field"): - return True - - # TODO take into account flattened data type - try: - nid = self.get_node_id_by_path(agg_node.field) - except StopIteration: - raise AbsentMappingFieldError( - u"Agg of type <%s> on non-existing field <%s>." - % (agg_node.KEY, agg_node.field) - ) - _, field = self.get(nid) - - field_type = field.KEY - if not agg_node.valid_on_field_type(field_type): - if not exc: - return False - raise InvalidOperationMappingFieldError( - u"Agg of type <%s> not possible on field of type <%s>." 
- % (agg_node.KEY, field_type) - ) - return True - - def mapping_type_of_field(self, field_path): - try: - _, node = self.get(field_path, by_path=True) - return node.KEY - except Exception: - raise AbsentMappingFieldError( - u"<%s field is not present in mapping>" % field_path - ) - - def nested_at_field(self, field_path): - nesteds = self.list_nesteds_at_field(field_path) - if nesteds: - return nesteds[0] - return None - - def list_nesteds_at_field(self, field_path): - """List nested paths that apply at a given path.""" - path_nid = self.get_node_id_by_path(field_path) - # from deepest to highest - return [ - self.get_path(nid) - for nid in self.ancestors_ids(path_nid, include_current=True) - if self.get(nid)[1].KEY == "nested" - ] diff --git a/pandagg/tree/mappings.py b/pandagg/tree/mappings.py new file mode 100644 index 00000000..d2f47001 --- /dev/null +++ b/pandagg/tree/mappings.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + + +from pandagg.node.mappings.abstract import Field, RegularField, ComplexField + + +from pandagg.exceptions import ( + AbsentMappingFieldError, + InvalidOperationMappingFieldError, +) +from pandagg.tree._tree import Tree + + +def _mappings(m): + if m is None: + return None + if isinstance(m, dict): + return Mappings(**m) + if isinstance(m, Mappings): + return m + raise TypeError("Unsupported %s type for Mappings" % type(m)) + + +class Mappings(Tree): + + node_class = Field + + def __init__(self, properties=None, dynamic=False, **kwargs): + super(Mappings, self).__init__() + root_node = Field(dynamic=dynamic, **kwargs) + self.insert_node(root_node) + if properties: + self._insert(root_node.identifier, properties, False) + + def to_dict(self, from_=None, depth=None): + """ + Serialize Mappings as dict. + + :param from_: identifier of a field, if provided, limits serialization to this field and its + children (used for recursion, shouldn't be useful) + :param depth: integer, if provided, limit the serialization to a given depth + :return: dict + """ + if self.root is None: + return None + from_ = self.root if from_ is None else from_ + key, node = self.get(from_) + children_queries = {} + if depth is None or depth > 0: + if depth is not None: + depth -= 1 + for child_key, child_node in self.children(node.identifier): + children_queries[child_key] = self.to_dict( + from_=child_node.identifier, depth=depth + ) + serialized_node = node.body + if children_queries: + if node.KEY is None or node.KEY in ("object", "nested"): + serialized_node["properties"] = children_queries + else: + serialized_node["fields"] = children_queries + return serialized_node + + def validate_agg_clause(self, agg_clause, exc=True): + """ + Ensure that if aggregation clause relates to a field (`field` or `path`) this field exists in mappings, and that + required aggregation type is allowed on this kind of field. + + :param agg_clause: AggClause you want to validate on these mappings + :param exc: boolean, if set to True raise exception if invalid + :rtype: boolean + """ + if hasattr(agg_clause, "path"): + if agg_clause.path is None: + # reverse nested + return True + return self.resolve_path_to_id(agg_clause.path) in self + + if not hasattr(agg_clause, "field"): + return True + + # TODO take into account flattened data type + try: + nid = self.get_node_id_by_path(agg_clause.field) + except StopIteration: + raise AbsentMappingFieldError( + u"Agg of type <%s> on non-existing field <%s>." 
+            % (agg_clause.KEY, agg_clause.field)
+        )
+        _, field = self.get(nid)
+
+        field_type = field.KEY
+        if not agg_clause.valid_on_field_type(field_type):
+            if not exc:
+                return False
+            raise InvalidOperationMappingFieldError(
+                u"Agg of type <%s> not possible on field of type <%s>."
+                % (agg_clause.KEY, field_type)
+            )
+        return True
+
+    def mapping_type_of_field(self, field_path):
+        """
+        Return field type of provided field path.
+
+        >>> mappings = Mappings(dynamic=False, properties={
+        >>>     'id': {'type': 'keyword'},
+        >>>     'comments': {'type': 'nested', 'properties': {
+        >>>         'comment_text': {'type': 'text'},
+        >>>         'date': {'type': 'date'}
+        >>>     }}
+        >>> })
+        >>> mappings.mapping_type_of_field('id')
+        'keyword'
+        >>> mappings.mapping_type_of_field('comments')
+        'nested'
+        >>> mappings.mapping_type_of_field('comments.comment_text')
+        'text'
+        """
+        try:
+            _, node = self.get(field_path, by_path=True)
+            return node.KEY
+        except Exception:
+            raise AbsentMappingFieldError(
+                u"<%s field is not present in mappings>" % field_path
+            )
+
+    def nested_at_field(self, field_path):
+        """
+        Return nested path applied on a given path. Return `None` if none applies.
+
+        >>> mappings = Mappings(dynamic=False, properties={
+        >>>     'id': {'type': 'keyword'},
+        >>>     'comments': {'type': 'nested', 'properties': {
+        >>>         'comment_text': {'type': 'text'},
+        >>>         'date': {'type': 'date'}
+        >>>     }}
+        >>> })
+        >>> mappings.nested_at_field('id')
+        None
+        >>> mappings.nested_at_field('comments')
+        'comments'
+        >>> mappings.nested_at_field('comments.comment_text')
+        'comments'
+        """
+        nesteds = self.list_nesteds_at_field(field_path)
+        if nesteds:
+            return nesteds[0]
+        return None
+
+    def list_nesteds_at_field(self, field_path):
+        """
+        List nested paths that apply at a given path.
+
+        >>> mappings = Mappings(dynamic=False, properties={
+        >>>     'id': {'type': 'keyword'},
+        >>>     'comments': {'type': 'nested', 'properties': {
+        >>>         'comment_text': {'type': 'text'},
+        >>>         'date': {'type': 'date'}
+        >>>     }}
+        >>> })
+        >>> mappings.list_nesteds_at_field('id')
+        []
+        >>> mappings.list_nesteds_at_field('comments')
+        ['comments']
+        >>> mappings.list_nesteds_at_field('comments.comment_text')
+        ['comments']
+        """
+        path_nid = self.get_node_id_by_path(field_path)
+        # from deepest to highest
+        return [
+            self.get_path(nid)
+            for nid in self.ancestors_ids(path_nid, include_current=True)
+            if self.get(nid)[1].KEY == "nested"
+        ]
+
+    def _insert(self, pid, properties, is_subfield):
+        """
+        Recursive method to insert properties in current mappings.
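+
+        An illustrative sketch: both dict and ``Field`` declarations are accepted,
+        including multi-fields (the ``raw`` subfield below is hypothetical):
+
+        >>> Mappings(properties={
+        >>>     'title': {'type': 'text', 'fields': {'raw': {'type': 'keyword'}}}
+        >>> })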
+
+        :param pid: parent field identifier
+        :param properties: field definitions that are inserted below pid
+        :param is_subfield: whether provided properties are a multi-fields ``fields``
+            parameter, cf
+            https://www.elastic.co/guide/en/elasticsearch/reference/current/multi-fields.html
+        """
+        if not isinstance(properties, dict):
+            raise ValueError("Wrong declaration, got %s" % properties)
+        for field_name, field in properties.items():
+            if isinstance(field, dict):
+                field = field.copy()
+                field = Field._get_dsl_class(field.pop("type", "object"))(
+                    _subfield=is_subfield, **field
+                )
+            elif isinstance(field, Field):
+                field._subfield = is_subfield
+            else:
+                raise ValueError("Unsupported type %s" % type(field))
+            self.insert_node(field, key=field_name, parent_id=pid)
+            if isinstance(field, ComplexField) and field.properties:
+                self._insert(field.identifier, field.properties, False)
+            if isinstance(field, RegularField) and field.fields:
+                if is_subfield:
+                    raise ValueError(
+                        "Cannot insert subfields into a subfield on field %s"
+                        % field_name
+                    )
+                self._insert(field.identifier, field.fields, True)
diff --git a/pandagg/tree/query.py b/pandagg/tree/query.py
index 5c03548d..51cb1b22 100644
--- a/pandagg/tree/query.py
+++ b/pandagg/tree/query.py
@@ -1,12 +1,8 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-from __future__ import unicode_literals
 
 import json
 
-from future.utils import python_2_unicode_compatible
-from builtins import str as text
-
 from pandagg._decorators import Substitution
 from pandagg.node.query._parameter_clause import ParentParameterClause
 from pandagg.node.query.abstract import QueryClause, LeafQueryClause, Q
@@ -14,27 +10,22 @@
 from pandagg.node.query.joining import Nested
 from pandagg.tree._tree import Tree
 
-from pandagg.tree.mapping import _mapping
+from pandagg.tree.mappings import _mappings
 
 ADD = "add"
 REPLACE = "replace"
 REPLACE_ALL = "replace_all"
 
 sub_insertion = Substitution(
-    insertion_doc="""
-    * *parent* (``str``) --
+    location_kwargs="""
+    * *insert_below* (``str``) --
       named query clause under which the inserted clauses should be placed.
 
-    * *parent_param* (``str`` optional parameter when using *parent* param) --
-      parameter under which inserted clauses will be placed. For instance if *parent* clause is a boolean, can be
-      'must', 'filter', 'should', 'must_not'.
-
-    * *child* (``str``) --
-      named query clause above which the inserted clauses should be placed.
+    * *compound_param* (``str``) --
+      param under which inserted clause will be placed in compound query
 
-    * *child_param* (``str`` optional parameter when using *parent* param) --
-      parameter of inserted boolean clause under which child clauses will be placed. For instance if inserted clause
-      is a boolean, can be 'must', 'filter', 'should', 'must_not'.
+    * *on* (``str``) --
+      named compound query clause on which the inserted compound clause should be merged.
 
     * *mode* (``str`` one of 'add', 'replace', 'replace_all') --
       merging strategy when inserting clauses on an existing compound clause.
@@ -47,300 +38,284 @@
 )
 
 
-@python_2_unicode_compatible
 class Query(Tree):
 
-    KEY = None
     node_class = QueryClause
 
-    def __init__(self, q=None, mapping=None, nested_autocorrect=False):
+    def __init__(self, q=None, mappings=None, nested_autocorrect=False):
         """
         Combination of query clauses.
 
-        Mapping declaration is optional, but doing so validates query consistency.
+        Mappings declaration is optional, but doing so validates query consistency.
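+
+        An illustrative sketch (the ``user`` field is hypothetical):
+
+        >>> q = Query(
+        >>>     {"term": {"user": "kimchy"}},
+        >>>     mappings={"properties": {"user": {"type": "keyword"}}},
+        >>> )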
        :param q: optional, query (dict, or Query instance)
-        :param mapping: ``dict`` or ``pandagg.tree.mapping.Mapping``
-        Mapping of requested indice(s). Providing it will add validation features.
-        :param nested_autocorrect: add required nested clauses if missing. Ignored if mapping is not provided.
+        :param mappings: ``dict`` or ``pandagg.tree.mappings.Mappings``
+        Mappings of requested indice(s). Providing it will add validation features.
+        :param nested_autocorrect: add required nested clauses if missing. Ignored if mappings is not provided.
        """
-        self.mapping = _mapping(mapping)
+        self.mappings = _mappings(mappings)
        self.nested_autocorrect = nested_autocorrect
        super(Query, self).__init__()
        if q:
            self._insert_query(q)

-    def __nonzero__(self):
-        return bool(self.to_dict())
+    @sub_insertion
+    def query(
+        self,
+        type_or_query,
+        insert_below=None,
+        on=None,
+        mode=ADD,
+        compound_param=None,
+        **body
+    ):
+        r"""
+        Insert provided clause in copy of initial Query.

-    __bool__ = __nonzero__
+        >>> from pandagg.query import Query
+        >>> Query().query('term', some_field=23)
+        {'term': {'some_field': 23}}

-    def _clone_init(self, deep=False):
-        return Query(
-            mapping=None
-            if self.mapping is None
-            else self.mapping.clone(with_nodes=True, deep=deep),
-            nested_autocorrect=self.nested_autocorrect,
+        >>> from pandagg.query import Term
+        >>> Query()\
+        >>>     .query({'term': {'some_field': 23}})\
+        >>>     .query(Term(other_field=24))
+        {'bool': {'must': [{'term': {'some_field': 23}}, {'term': {'other_field': 24}}]}}
+
+        :Keyword Arguments:
+            %(location_kwargs)s
+        """
+        q = self.clone(with_nodes=True)
+        node = self._q(type_or_query, **body)
+        q._insert_query_at(
+            node,
+            mode=mode,
+            on=on,
+            insert_below=insert_below,
+            compound_param=compound_param,
        )
+        return q

-    @classmethod
-    def _from_dict(cls, d):
-        """return Query, from dict"""
-        # {"nested": {"path": "xxx", "query": {"term": {"some_field": "234"}}}}
-        if not len(d.keys()) == 1:
-            raise ValueError("Wrong declaration, got %s" % d.keys())
-        vk, vv = d.copy().popitem()
-        return cls._get_dsl_class_from_tree_or_node(vk, **vv)
+    @sub_insertion
+    def must(
+        self,
+        type_or_query,
+        insert_below=None,
+        on=None,
+        mode=ADD,
+        bool_body=None,
+        **body
+    ):
+        r"""
+        Create copy of initial Query and insert provided clause under "bool" query "must".
- def _has_bool_root(self): - if not self.root: - return False - _, r = self.get(self.root) - return isinstance(r, Bool) + >>> Query().must('term', some_field=1) + >>> Query().must({'term': {'some_field': 1}}) + >>> from pandagg.query import Term + >>> Query().must(Term(some_field=1)) - def _compound_param_id(self, nid, key, create_if_not_exists=True): - """ - :param nid: id of compound node - :param key: param key, for instance if compound if bool, can be 'must', 'should', 'must_not' etc - :return: param node id + :Keyword Arguments: + %(location_kwargs)s """ - try: - return self.child_id(nid, key) - except ValueError: - if not create_if_not_exists: - raise - # add key - param_node = self.get_node_dsl_class(key)() - self._insert_node_below(param_node, parent_id=nid, key=key, by_path=False) - return param_node.identifier + return self._compound_param_insert( + "bool", "must", mode, type_or_query, insert_below, on, bool_body, **body + ) - @classmethod - def _q(cls, type_or_query, **body): - if isinstance(type_or_query, Query): - type_or_query = type_or_query.to_dict() - return Q(type_or_query, **body) + def should( + self, + type_or_query, + insert_below=None, + on=None, + mode=ADD, + bool_body=None, + **body + ): + return self._compound_param_insert( + "bool", "should", mode, type_or_query, insert_below, on, bool_body, **body + ) - def _insert_query_at( - self, node, mode, on=None, insert_below=None, compound_param=None + def must_not( + self, + type_or_query, + insert_below=None, + on=None, + mode=ADD, + bool_body=None, + **body ): - """Insert clause (and its children) in query. + return self._compound_param_insert( + "bool", "must_not", mode, type_or_query, insert_below, on, bool_body, **body + ) - If compound query with on specified: merge according to mode. - If insert_below is not specified, place on top (wrapped in bool-must if necessary). - If insert_below is provided (only under compound query): place under it. + def filter( + self, + type_or_query, + insert_below=None, + on=None, + mode=ADD, + bool_body=None, + **body + ): + return self._compound_param_insert( + "bool", "filter", mode, type_or_query, insert_below, on, bool_body, **body + ) - :param node: node to insert, can contain children clauses (in case of compound query). - :param mode: how compound queries merge should be treated - :param on: id of compound query on which node should be merged - :param insert_below: - :return: + # compound + def bool( + self, + must=None, + should=None, + must_not=None, + filter=None, + insert_below=None, + on=None, + mode=ADD, + **body + ): """ - if mode not in (ADD, REPLACE, REPLACE_ALL): - raise ValueError("Invalid mode %s" % mode) - - if ( - isinstance(node, Bool) - and not on - and not insert_below - and self._has_bool_root() - ): - on = self.root + >>> Query().bool(must={"term": {"some_field": "yolo"}}) + """ + return self.query( + "bool", + must=must, + should=should, + must_not=must_not, + filter=filter, + insert_below=insert_below, + on=on, + mode=mode, + **body + ) - if isinstance(node, CompoundClause) and on: - # ensure we try to merge on same type of clause - _, existing = self.get(on) - if existing.KEY != node.KEY: - raise ValueError( - "Cannot merge compound clause %s on %s. Must be the same." 
- % (node.KEY, existing.KEY) - ) - if mode == REPLACE_ALL: - pid = self.parent_id(on) - existing_k, _ = self.drop_subtree(on) - self._insert_query(node, insert_below=pid) - existing.body = node.body - return + def boosting( + self, positive=None, negative=None, insert_below=None, on=None, mode=ADD, **body + ): + if not positive and not negative: + raise ValueError('Expect at least one of "positive", "negative"') + return self.query( + "boosting", + positive=positive, + negative=negative, + insert_below=insert_below, + on=on, + mode=mode, + **body + ) - # merge - existing.body.update(node.body) - for param_key, children in node._children.items(): - if not children: - continue - param_id = self._compound_param_id(on, param_key) - # here, possible modes are either ADD, or REPLACE - if mode == REPLACE: - existing_clauses_ids = self.children_ids(param_id) - for eid in existing_clauses_ids: - self.drop_node(eid) - for child in children: - self._insert_query(child, insert_below=param_id) - return + def constant_score( + self, filter=None, boost=None, insert_below=None, on=None, mode=ADD, **body + ): + if not filter and not boost: + raise ValueError('Expect at least one of "filter", "boost"') + return self.query( + "constant_score", + filter=filter, + boost=boost, + insert_below=insert_below, + on=on, + mode=mode, + **body + ) - if insert_below: - # below node, with compound_param - _, pnode = self.get(insert_below) - if not isinstance(pnode, CompoundClause): - raise ValueError( - "Cannot insert clause below %s clause (only compound clauses can have children clauses)." - % pnode.KEY - ) - compound_param = compound_param or pnode._default_operator - if compound_param not in pnode._parent_params.keys(): - raise ValueError( - "<%s> parameter for <%s> compound clause does not accept children clauses." 
- % (compound_param, pnode.KEY) - ) - param_id = self._compound_param_id(insert_below, compound_param) - _, p = self.get(param_id) - if not p.MULTIPLE: - # inserting a clause, under a parameter allowing a single clause (for instance below nested query) - cids = self.children_ids(param_id) - if cids: - # can be at most one - cid = cids[0] - # must place existing clause, and new one under a bool -> must - _, existing_child = self.get(cid) - if isinstance(existing_child, Bool): - child_param_id = self._compound_param_id( - existing_child.identifier, "must" - ) - self._insert_query(node, insert_below=child_param_id) - return - _, existing_child = self.drop_node(cid) - self._insert_query( - Bool(must=[existing_child, node]), insert_below=param_id - ) - return - self._insert_query(node, insert_below=param_id) - return + def dis_max(self, queries, insert_below=None, on=None, mode=ADD, **body): + return self.query( + "dis_max", + queries=queries, + insert_below=insert_below, + on=on, + mode=mode, + **body + ) - # from now on: position was not provided: - if self.is_empty(): - # currently empty (-> on top) - self._insert_query(node) - return - - if self._has_bool_root(): - # top query is bool - must_id = self._compound_param_id(self.root, "must") - self._insert_query(node, insert_below=must_id) - return - - # top query is not bool - _, initial_query = self.drop_subtree(self.root) - if isinstance(node, Bool): - # if inserted node is bool, placed on top, and previous query is added under "must" parameter - self._insert_query(node) - self._insert_query_at( - initial_query.to_dict(), insert_below=node.identifier, mode=ADD - ) - else: - # we place initial + added clauses under a bool>must - self._insert_query(Bool(must=[node, initial_query.to_dict()])) - return + def function_score(self, query, insert_below=None, on=None, mode=ADD, **body): + return self.query( + "function_score", + query=query, + insert_below=insert_below, + on=on, + mode=mode, + **body + ) - def _insert_query(self, query=None, insert_below=None): - """ - Accept Node, or dict syntaxes, convert to nodes. - Insert query clause and its children. - Wraps in bool->must if parent param is not multiple. 
- - Does not handle: - - syntax conversion (Tree -> Node, or flat syntax) - - logic about where to insert it, should be handled before (dumb insert below insert_below) - - >>> Query()._insert_query({"terms": {"field": "user"}}) - >>> Query()._insert_query(Query({"terms": {"field": "user"}})) - >>> Query()._insert_query({"bool": {"must": {"term": {"some_field": "yolo"}}, "should": {}}}) - >>> Query()._insert_query({"term": {"some_field": "yolo"}) - """ - if isinstance(query, QueryClause): - node = query - elif isinstance(query, dict): - query = query.copy() - # {"term": {"some_field": 1}} - # {"bool": {"filter": [{"term": {"some_field": 1}}]}} - if len(query.keys()) != 1: - raise ValueError( - "Invalid query clause declaration (two many keys): got <%s>" - % query.keys() - ) - type_, body_ = query.popitem() - node_klass = self.get_node_dsl_class(type_) - if issubclass(node_klass, ParentParameterClause): - raise ValueError() - node = node_klass(**body_) - else: - raise ValueError('"query" must be of type "dict" or "AggNode"') + def nested(self, path, query=None, insert_below=None, on=None, mode=ADD, **body): + return self.query( + "nested", + query=query, + insert_below=insert_below, + on=on, + mode=mode, + path=path, + **body + ) - self.insert_node(node, parent_id=insert_below) + def has_child(self, query, insert_below=None, on=None, mode=ADD, **body): + return self.query( + "has_child", + query=query, + insert_below=insert_below, + on=on, + mode=mode, + **body + ) - if not isinstance(node, CompoundClause): - return + def has_parent(self, query, insert_below=None, on=None, mode=ADD, **body): + return self.query( + "has_parent", + query=query, + insert_below=insert_below, + on=on, + mode=mode, + **body + ) - _children_clauses = node._children.copy() - for param_name, child_nodes in _children_clauses.items(): - param_node = self.get_node_dsl_class(param_name)() - if not param_node.MULTIPLE and len(child_nodes) > 1: - raise ValueError( - "Cannot insert multiple query clauses under %s parameter" - % param_name - ) - self.insert_node(param_node, parent_id=node.identifier, key=param_name) - for child in child_nodes: - self._insert_query(query=child, insert_below=param_node.identifier) + def script_score(self, query, insert_below=None, on=None, mode=ADD, **body): + return self.query( + "script_score", + query=query, + insert_below=insert_below, + on=on, + mode=mode, + **body + ) - def _insert_node_below(self, node, parent_id, key, by_path): - """Override lighttree.Tree._insert_node_below method to ensure inserted query clause is consistent.""" - if parent_id is not None: - _, pnode = self.get(parent_id) - if isinstance(pnode, LeafQueryClause): - raise ValueError( - "Cannot add clause under leaf query clause <%s>" % pnode.KEY - ) - if isinstance(pnode, ParentParameterClause): - if isinstance(node, ParentParameterClause): - raise ValueError( - "Cannot add parameter clause <%s> under another parameter clause <%s>" - % (pnode.KEY, node.KEY) - ) - if isinstance(pnode, CompoundClause): - if key not in pnode._parent_params: - raise ValueError( - "Expect a parameter clause of type %s under <%s> compound clause, got <%s>" - % (pnode._parent_params.keys(), pnode.KEY, key) - ) + def pinned_query(self, organic, insert_below=None, on=None, mode=ADD, **body): + return self.query( + "pinned_query", + organic=organic, + insert_below=insert_below, + on=on, + mode=mode, + **body + ) - # automatic handling of nested clauses - if isinstance(node, Nested) or not self.mapping or not hasattr(node, "field"): - return 
super(Query, self)._insert_node_below( - node=node, parent_id=parent_id, key=key, by_path=by_path - ) - required_nested_level = self.mapping.nested_at_field(node.field) - if len(self.list()) <= 1: - # empty - current_nested_level = None - else: - current_nested_level = self.applied_nested_path_at_node(parent_id) - if current_nested_level == required_nested_level: - return super(Query, self)._insert_node_below( - node=node, parent_id=parent_id, key=key, by_path=by_path - ) - if not self.nested_autocorrect: - raise ValueError( - "Invalid %s query clause on %s field. Invalid nested: expected %s, current %s." - % (node.KEY, node.field, required_nested_level, current_nested_level) - ) - # requires nested - apply all required nested fields - to_insert = node - for nested_lvl in self.mapping.list_nesteds_at_field(node.field): - if current_nested_level != nested_lvl: - to_insert = self.get_node_dsl_class("nested")( - path=nested_lvl, query=to_insert - ) - self._insert_query(to_insert, parent_id) + def show(self, *args, line_max_length=80, **kwargs): + """ + Return compact representation of Query. + + >>> Query()\ + >>> .must({"exists": {"field": "some_field"}})\ + >>> .must({"term": {"other_field": {"value": 5}}})\ + >>> .show() + + bool + └── must + ├── exists field=some_field + └── term field=other_field, value=5 + + All *args and **kwargs are propagated to `lighttree.Tree.show` method. + :return: str + """ + return "\n%s" % str( + super(Tree, self).show(*args, line_max_length=line_max_length, **kwargs) + ) def applied_nested_path_at_node(self, nid): + """ + Return nested path applied at a clause. + + :param nid: clause identifier + :return: None if no nested is applied, else applied path (str) + """ # from current node to root for id_ in self.ancestors_ids(nid, include_current=True): _, node = self.get(id_) @@ -349,7 +324,9 @@ def applied_nested_path_at_node(self, nid): return None def to_dict(self, from_=None): - """Serialize query as dict.""" + """ + Serialize Query as dict. + """ if self.root is None: return None from_ = self.root if from_ is None else from_ @@ -382,265 +359,266 @@ def to_dict(self, from_=None): q[node.KEY].update(d) return q - @sub_insertion - def query( + # compound parameters + def _compound_param_insert( self, + compound_key, + compound_param_key, + mode, type_or_query, insert_below=None, on=None, - mode=ADD, - compound_param=None, + compound_body=None, **body ): - r"""Insert new clause(s) in current query. - - Inserted clause can accepts following syntaxes. 
- - Given an empty query: - - >>> from pandagg.query import Query - >>> q = Query() - - flat syntax: clause type, followed by query clause body as keyword arguments: - - >>> q.query('term', some_field=23) - {'term': {'some_field': 23}} + q = self.clone(with_nodes=True) + node = self._q(type_or_query, **body) + compound_body = compound_body or {} + compound_body[compound_param_key] = node + compound_node = self.get_node_dsl_class(compound_key)(**compound_body) + q._insert_query_at(compound_node, on=on, insert_below=insert_below, mode=mode) + return q - using pandagg DSL: + def __nonzero__(self): + return bool(self.to_dict()) - >>> from pandagg.query import Term - >>> q.query(Term(field=23)) - {'term': {'some_field': 23}} + __bool__ = __nonzero__ - >>> q.query({'bool': {'must': [{'term': {'some_field': 1}}]}}) - {'bool': {'must': [{'term': {'some_field': 1}}]}} + def _clone_init(self, deep=False): + return Query( + mappings=None + if self.mappings is None + else self.mappings.clone(with_nodes=True, deep=deep), + nested_autocorrect=self.nested_autocorrect, + ) - >>> from pandagg.query import Bool - >>> q.query(Bool(must=[{'term': {'some_field': 1}}], boost=1)) + def _has_bool_root(self): + if not self.root: + return False + _, r = self.get(self.root) + return isinstance(r, Bool) - :Keyword Arguments: - %(insertion_doc)s + def _compound_param_id(self, nid, key, create_if_not_exists=True): + """ + :param nid: id of compound node + :param key: param key, for instance if compound if bool, can be 'must', 'should', 'must_not' etc + :return: param node id + """ + try: + return self.child_id(nid, key) + except ValueError: + if not create_if_not_exists: + raise + # add key + param_node = self.get_node_dsl_class(key)() + self._insert_node_below(param_node, parent_id=nid, key=key, by_path=False) + return param_node.identifier + @classmethod + def _q(cls, type_or_query, **body): """ - q = self.clone(with_nodes=True) - node = self._q(type_or_query, **body) - q._insert_query_at( - node, - mode=mode, - on=on, - insert_below=insert_below, - compound_param=compound_param, - ) - return q + Convert to QueryClause instance. + """ + if isinstance(type_or_query, Query): + type_or_query = type_or_query.to_dict() + return Q(type_or_query, **body) - # compound - def bool( - self, - must=None, - should=None, - must_not=None, - filter=None, - insert_below=None, - on=None, - mode=ADD, - **body + def _insert_query_at( + self, node, mode, on=None, insert_below=None, compound_param=None ): """ - >>> Query().bool(must={"term": {"some_field": "yolo"}}) + Insert clause (and its children) in Query. + + If compound query with on specified: merge according to mode. + If insert_below is not specified, place on top (wrapped in bool-must if necessary). + If insert_below is provided (only under compound query): place under it. 
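+
+        An illustrative sketch (hypothetical fields ``f`` and ``g``): in 'add' mode,
+        merging a bool clause on a bool root appends its children below the existing
+        parameters:
+
+        >>> q = Query({'bool': {'must': [{'term': {'f': 1}}]}})
+        >>> q.query({'bool': {'must': [{'term': {'g': 2}}]}}, mode='add')
+        {'bool': {'must': [{'term': {'f': 1}}, {'term': {'g': 2}}]}}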
""" - return self.query( - "bool", - must=must, - should=should, - must_not=must_not, - filter=filter, - insert_below=insert_below, - on=on, - mode=mode, - **body - ) + if mode not in (ADD, REPLACE, REPLACE_ALL): + raise ValueError("Invalid mode %s" % mode) - def boosting( - self, positive=None, negative=None, insert_below=None, on=None, mode=ADD, **body - ): - if not positive and not negative: - raise ValueError('Expect at least one of "positive", "negative"') - return self.query( - "boosting", - positive=positive, - negative=negative, - insert_below=insert_below, - on=on, - mode=mode, - **body - ) - - def constant_score( - self, filter=None, boost=None, insert_below=None, on=None, mode=ADD, **body - ): - if not filter and not boost: - raise ValueError('Expect at least one of "filter", "boost"') - return self.query( - "constant_score", - filter=filter, - boost=boost, - insert_below=insert_below, - on=on, - mode=mode, - **body - ) - - def dis_max(self, queries, insert_below=None, on=None, mode=ADD, **body): - return self.query( - "dis_max", - queries=queries, - insert_below=insert_below, - on=on, - mode=mode, - **body - ) - - def function_score(self, query, insert_below=None, on=None, mode=ADD, **body): - return self.query( - "function_score", - query=query, - insert_below=insert_below, - on=on, - mode=mode, - **body - ) + if ( + isinstance(node, Bool) + and not on + and not insert_below + and self._has_bool_root() + ): + on = self.root - def nested(self, path, query=None, insert_below=None, on=None, mode=ADD, **body): - return self.query( - "nested", - query=query, - insert_below=insert_below, - on=on, - mode=mode, - path=path, - **body - ) + if isinstance(node, CompoundClause) and on: + # ensure we try to merge on same type of clause + _, existing = self.get(on) + if existing.KEY != node.KEY: + raise ValueError( + "Cannot merge compound clause %s on %s. Must be the same." + % (node.KEY, existing.KEY) + ) + if mode == REPLACE_ALL: + pid = self.parent_id(on) + existing_k, _ = self.drop_subtree(on) + self._insert_query(node, insert_below=pid) + existing.body = node.body + return - def has_child(self, query, insert_below=None, on=None, mode=ADD, **body): - return self.query( - "has_child", - query=query, - insert_below=insert_below, - on=on, - mode=mode, - **body - ) + # merge + existing.body.update(node.body) + for param_key, children in node._children.items(): + if not children: + continue + param_id = self._compound_param_id(on, param_key) + # here, possible modes are either ADD, or REPLACE + if mode == REPLACE: + existing_clauses_ids = self.children_ids(param_id) + for eid in existing_clauses_ids: + self.drop_node(eid) + for child in children: + self._insert_query(child, insert_below=param_id) + return - def has_parent(self, query, insert_below=None, on=None, mode=ADD, **body): - return self.query( - "has_parent", - query=query, - insert_below=insert_below, - on=on, - mode=mode, - **body - ) + if insert_below: + # below node, with compound_param + _, pnode = self.get(insert_below) + if not isinstance(pnode, CompoundClause): + raise ValueError( + "Cannot insert clause below %s clause (only compound clauses can have children clauses)." + % pnode.KEY + ) + compound_param = compound_param or pnode._default_operator + if compound_param not in pnode._parent_params.keys(): + raise ValueError( + "<%s> parameter for <%s> compound clause does not accept children clauses." 
+ % (compound_param, pnode.KEY) + ) + param_id = self._compound_param_id(insert_below, compound_param) + _, p = self.get(param_id) + if not p.MULTIPLE: + # inserting a clause, under a parameter allowing a single clause (for instance below nested query) + cids = self.children_ids(param_id) + if cids: + # can be at most one + cid = cids[0] + # must place existing clause, and new one under a bool -> must + _, existing_child = self.get(cid) + if isinstance(existing_child, Bool): + child_param_id = self._compound_param_id( + existing_child.identifier, "must" + ) + self._insert_query(node, insert_below=child_param_id) + return + _, existing_child = self.drop_node(cid) + self._insert_query( + Bool(must=[existing_child, node]), insert_below=param_id + ) + return + self._insert_query(node, insert_below=param_id) + return - def script_score(self, query, insert_below=None, on=None, mode=ADD, **body): - return self.query( - "script_score", - query=query, - insert_below=insert_below, - on=on, - mode=mode, - **body - ) + # from now on: position was not provided: + if self.is_empty(): + # currently empty (-> on top) + self._insert_query(node) + return - def pinned_query(self, organic, insert_below=None, on=None, mode=ADD, **body): - return self.query( - "pinned_query", - organic=organic, - insert_below=insert_below, - on=on, - mode=mode, - **body - ) + if self._has_bool_root(): + # top query is bool + must_id = self._compound_param_id(self.root, "must") + self._insert_query(node, insert_below=must_id) + return - # compound parameters - def _compound_param_insert( - self, - compound_key, - compound_param_key, - mode, - type_or_query, - insert_below=None, - on=None, - compound_body=None, - **body - ): - q = self.clone(with_nodes=True) - node = self._q(type_or_query, **body) - compound_body = compound_body or {} - compound_body[compound_param_key] = node - compound_node = self.get_node_dsl_class(compound_key)(**compound_body) - q._insert_query_at(compound_node, on=on, insert_below=insert_below, mode=mode) - return q + # top query is not bool + _, initial_query = self.drop_subtree(self.root) + if isinstance(node, Bool): + # if inserted node is bool, placed on top, and previous query is added under "must" parameter + self._insert_query(node) + self._insert_query_at( + initial_query.to_dict(), insert_below=node.identifier, mode=ADD + ) + else: + # we place initial + added clauses under a bool>must + self._insert_query(Bool(must=[node, initial_query.to_dict()])) + return - def must( - self, - type_or_query, - insert_below=None, - on=None, - mode=ADD, - bool_body=None, - **body - ): + def _insert_query(self, query=None, insert_below=None): """ - >>> Query().must('term', some_field=1) - >>> Query().must({'term': {'some_field': 1}}) + Insert query clause and its children (recursively). + Does not handle logic about where to insert it, should be handled before (dumb insert below insert_below). 
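+
+        For instance (illustrative, both dict and node syntaxes are accepted):
+
+        >>> Query()._insert_query({"term": {"some_field": "yolo"}})
+        >>> Query()._insert_query({"bool": {"must": {"term": {"some_field": "yolo"}}}})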
""" - return self._compound_param_insert( - "bool", "must", mode, type_or_query, insert_below, on, bool_body, **body - ) + node = self._q(query) + self.insert_node(node, parent_id=insert_below) - def should( - self, - type_or_query, - insert_below=None, - on=None, - mode=ADD, - bool_body=None, - **body - ): - return self._compound_param_insert( - "bool", "should", mode, type_or_query, insert_below, on, bool_body, **body - ) + if not isinstance(node, CompoundClause): + return - def must_not( - self, - type_or_query, - insert_below=None, - on=None, - mode=ADD, - bool_body=None, - **body - ): - return self._compound_param_insert( - "bool", "must_not", mode, type_or_query, insert_below, on, bool_body, **body - ) + _children_clauses = node._children.copy() + for param_name, child_nodes in _children_clauses.items(): + param_node = self.get_node_dsl_class(param_name)() + if not param_node.MULTIPLE and len(child_nodes) > 1: + raise ValueError( + "Cannot insert multiple query clauses under %s parameter" + % param_name + ) + self.insert_node(param_node, parent_id=node.identifier, key=param_name) + for child in child_nodes: + self._insert_query(query=child, insert_below=param_node.identifier) - def filter( - self, - type_or_query, - insert_below=None, - on=None, - mode=ADD, - bool_body=None, - **body - ): - return self._compound_param_insert( - "bool", "filter", mode, type_or_query, insert_below, on, bool_body, **body - ) + def _insert_node_below(self, node, parent_id, key, by_path): + """ + Override lighttree.Tree._insert_node_below method to ensure inserted query clause is consistent (for instance + only compounds queries can have children clauses). + If mappings are provided, ensure that nested fields are properly handled. If nested_autocorrect is set to True + at __init__, automatically add it if necessary. + + Note: automatic handling can be ambiguous in case of multiple nested clauses, ie should it operate a must clause + at root document level, or at nested level. Example: difference between "a car with a rectangular window, and a + blue window" (can be different windows), and "a car with a rectangular and blue window" (same window must hold + same characteristics). 
+ """ + if parent_id is not None: + _, pnode = self.get(parent_id) + if isinstance(pnode, LeafQueryClause): + raise ValueError( + "Cannot add clause under leaf query clause <%s>" % pnode.KEY + ) + if isinstance(pnode, ParentParameterClause): + if isinstance(node, ParentParameterClause): + raise ValueError( + "Cannot add parameter clause <%s> under another parameter clause <%s>" + % (pnode.KEY, node.KEY) + ) + if isinstance(pnode, CompoundClause): + if key not in pnode._parent_params: + raise ValueError( + "Expect a parameter clause of type %s under <%s> compound clause, got <%s>" + % (pnode._parent_params.keys(), pnode.KEY, key) + ) - def show(self, *args, line_max_length=80, **kwargs): - return "\n%s" % text( - super(Tree, self).show(*args, line_max_length=line_max_length, **kwargs) - ) + # automatic handling of nested clauses + if isinstance(node, Nested) or not self.mappings or not hasattr(node, "field"): + return super(Query, self)._insert_node_below( + node=node, parent_id=parent_id, key=key, by_path=by_path + ) + required_nested_level = self.mappings.nested_at_field(node.field) + if len(self.list()) <= 1: + # empty + current_nested_level = None + else: + current_nested_level = self.applied_nested_path_at_node(parent_id) + if current_nested_level == required_nested_level: + return super(Query, self)._insert_node_below( + node=node, parent_id=parent_id, key=key, by_path=by_path + ) + if not self.nested_autocorrect: + raise ValueError( + "Invalid %s query clause on %s field. Invalid nested: expected %s, current %s." + % (node.KEY, node.field, required_nested_level, current_nested_level) + ) + # requires nested - apply all required nested fields + to_insert = node + for nested_lvl in self.mappings.list_nesteds_at_field(node.field): + if current_nested_level != nested_lvl: + to_insert = self.get_node_dsl_class("nested")( + path=nested_lvl, query=to_insert + ) + self._insert_query(to_insert, parent_id) def __str__(self): return json.dumps(self.to_dict(), indent=2) diff --git a/pandagg/tree/response.py b/pandagg/tree/response.py index aaab74cd..04169623 100644 --- a/pandagg/tree/response.py +++ b/pandagg/tree/response.py @@ -3,8 +3,6 @@ from collections import OrderedDict, defaultdict -from future.utils import iteritems - from pandagg.node.query.joining import Nested from pandagg.tree._tree import Tree @@ -13,13 +11,15 @@ class AggsResponseTree(Tree): - """Tree representation of an ElasticSearch response.""" + """ + Tree shaped representation of an ElasticSearch aggregations response. + """ node_class = BucketNode def __init__(self, aggs, raw_response=None): """ - :param aggs: instance of pandagg.agg.Agg from which this Elasticsearch response originates. + :param aggs: instance of pandagg.agg.Aggs from which this Elasticsearch response originates. """ super(AggsResponseTree, self).__init__() self.__aggs = aggs @@ -28,11 +28,9 @@ def __init__(self, aggs, raw_response=None): if raw_response: self.parse(raw_response) - def _clone_init(self, deep=False): - return AggsResponseTree(aggs=self.__aggs.clone(deep=deep)) - def parse(self, raw_response): - """Build response tree from ElasticSearch aggregation response + """ + Build response tree from ElasticSearch aggregation response :param raw_response: ElasticSearch aggregation response :return: self @@ -44,29 +42,9 @@ def parse(self, raw_response): ) return self - def _parse_node_with_children(self, agg_name, agg_node, raw_response, pid): - """Recursive method to parse ES raw response. 
-
-        :param agg_node: current aggregation, pandagg.nodes.AggNode instance
-        :param raw_response: ES response at current level, dict
-        :param pid: parent node identifier
-        """
-        agg_raw_response = raw_response.get(agg_name)
-        for key, raw_value in agg_node.extract_buckets(agg_raw_response):
-            bucket = Bucket(
-                level=agg_name, key=key, value=agg_node.extract_bucket_value(raw_value)
-            )
-            self.insert_node(bucket, parent_id=pid)
-            for child_name, child in self.__aggs.children(agg_node.identifier):
-                self._parse_node_with_children(
-                    child_name,
-                    agg_node=child,
-                    raw_response=raw_value,
-                    pid=bucket.identifier,
-                )
-
     def bucket_properties(self, bucket, properties=None, end_level=None, depth=None):
-        """Recursive method returning a given bucket's properties in the form of an ordered dictionnary.
+        """
+        Recursive method returning a given bucket's properties in the form of an ordered dictionary.
         Travel from current bucket through all ancestors until reaching root.
 
         :param bucket: instance of pandagg.buckets.buckets.Bucket
@@ -86,30 +64,9 @@ def bucket_properties(self, bucket, properties=None, end_level=None, depth=None)
             return properties
         return self.bucket_properties(parent, properties, end_level, depth)
 
-    @classmethod
-    def _build_filter(
-        cls, nid_to_children, filters_per_nested_level, current_nested_path=None
-    ):
-        """Recursive function to build bucket filters from highest to deepest nested conditions."""
-        current_conditions = filters_per_nested_level.get(current_nested_path, [])
-        nested_children = nid_to_children[current_nested_path]
-        for nested_child in nested_children:
-            nested_child_conditions = cls._build_filter(
-                nid_to_children=nid_to_children,
-                filters_per_nested_level=filters_per_nested_level,
-                current_nested_path=nested_child,
-            )
-            if nested_child_conditions:
-                current_conditions.append(
-                    Nested(path=nested_child, query=nested_child_conditions)
-                )
-        q = Query()
-        for clause in current_conditions:
-            q = q.query(clause)
-        return q
-
     def get_bucket_filter(self, nid):
-        """Build query filtering documents belonging to that bucket.
+        """
+        Build query filtering documents belonging to that bucket.
         Suppose the following configuration::
 
             Base                        <- filter on base
             └── Nested_B                <- filter on B
         """
-        tree_mapping = self.__aggs.mapping
+        tree_mapping = self.__aggs.mappings
         b_key, selected_bucket = self.get(nid)
 
         bucket_properties = self.bucket_properties(selected_bucket)
         agg_node_key_tuples = [
             (self.__aggs.get(self.__aggs.id_from_key(level))[1], key)
-            for level, key in iteritems(bucket_properties)
+            for level, key in bucket_properties.items()
         ]
 
         filters_per_nested_level = defaultdict(list)
@@ -163,3 +120,51 @@
     def show(self, **kwargs):
         kwargs["key"] = kwargs.get("key", lambda x: x.line_repr(depth=0))
         return super(AggsResponseTree, self).show(**kwargs)
+
+    def _clone_init(self, deep=False):
+        return AggsResponseTree(aggs=self.__aggs.clone(deep=deep))
+
+    def _parse_node_with_children(self, agg_name, agg_node, raw_response, pid):
+        """
+        Recursive method to parse ES raw response.
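+
+        An illustrative excerpt of the raw ES response it consumes (aggregation
+        name and bucket values are hypothetical):
+
+        >>> raw_response = {"genres": {"buckets": [{"key": "Drama", "doc_count": 12}]}}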
+ + :param agg_node: current aggregation, pandagg.nodes.AggNode instance + :param raw_response: ES response at current level, dict + :param pid: parent node identifier + """ + agg_raw_response = raw_response.get(agg_name) + for key, raw_value in agg_node.extract_buckets(agg_raw_response): + bucket = Bucket( + level=agg_name, key=key, value=agg_node.extract_bucket_value(raw_value) + ) + self.insert_node(bucket, parent_id=pid) + for child_name, child in self.__aggs.children(agg_node.identifier): + self._parse_node_with_children( + child_name, + agg_node=child, + raw_response=raw_value, + pid=bucket.identifier, + ) + + @classmethod + def _build_filter( + cls, nid_to_children, filters_per_nested_level, current_nested_path=None + ): + """ + Recursive function to build bucket filters from highest to deepest nested conditions.""" + current_conditions = filters_per_nested_level.get(current_nested_path, []) + nested_children = nid_to_children[current_nested_path] + for nested_child in nested_children: + nested_child_conditions = cls._build_filter( + nid_to_children=nid_to_children, + filters_per_nested_level=filters_per_nested_level, + current_nested_path=nested_child, + ) + if nested_child_conditions: + current_conditions.append( + Nested(path=nested_child, query=nested_child_conditions) + ) + q = Query() + for clause in current_conditions: + q = q.query(clause) + return q diff --git a/pandagg/utils.py b/pandagg/utils.py index cd7ac8e5..1a9c0480 100644 --- a/pandagg/utils.py +++ b/pandagg/utils.py @@ -1,6 +1,5 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from __future__ import unicode_literals # adapted from https://github.com/elastic/elasticsearch-dsl-py/blob/master/elasticsearch_dsl/utils.py#L162 diff --git a/tests/interactive/test_interactive_response.py b/tests/interactive/test_interactive_response.py index 378adf0c..9f371a73 100644 --- a/tests/interactive/test_interactive_response.py +++ b/tests/interactive/test_interactive_response.py @@ -5,7 +5,7 @@ from pandagg.tree.response import AggsResponseTree from pandagg.interactive.response import IResponse from tests import PandaggTestCase -from tests.testing_samples.mapping_example import MAPPING +from tests.testing_samples.mapping_example import MAPPINGS import tests.testing_samples.data_sample as sample @@ -15,7 +15,7 @@ def test_client_bound_response(self, uuid_mock): uuid_mock.side_effect = range(1000) client_mock = Mock(spec=["search"]) - my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING) + my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mappings=MAPPINGS) response_tree = AggsResponseTree(aggs=my_agg).parse(sample.ES_AGG_RESPONSE) response = IResponse( diff --git a/tests/interactive/test_mapping.py b/tests/interactive/test_mapping.py index e45c4d71..dfb1c345 100644 --- a/tests/interactive/test_mapping.py +++ b/tests/interactive/test_mapping.py @@ -13,19 +13,19 @@ from unittest import TestCase -from pandagg.mapping import Keyword, Text, Nested, Object, Integer -from pandagg.tree.mapping import Mapping +from pandagg.mappings import Keyword, Text, Nested, Object, Integer +from pandagg.tree.mappings import Mappings from pandagg.interactive._field_agg_factory import field_classes_per_name -from pandagg.interactive.mapping import IMapping +from pandagg.interactive.mappings import IMappings -from tests.testing_samples.mapping_example import MAPPING +from tests.testing_samples.mapping_example import MAPPINGS class IMappingTestCase(TestCase): def test_mapping_aggregations(self): - mapping_tree = Mapping(**MAPPING) + mapping_tree = 
Mappings(**MAPPINGS) # check that leaves are expanded, based on 'field_name' attribute of nodes - mapping = IMapping(mapping_tree, depth=1) + mappings = IMappings(mapping_tree, depth=1) for field_name in ( "classification_type", "date", @@ -35,12 +35,12 @@ def test_mapping_aggregations(self): "local_metrics", "workflow", ): - self.assertTrue(hasattr(mapping, field_name)) + self.assertTrue(hasattr(mappings, field_name)) - dataset = mapping.global_metrics.dataset + dataset = mappings.global_metrics.dataset self.assertEqual( dataset.__repr__(), - """ + """ {Object} ├── nb_classes Integer └── support_train Integer @@ -94,18 +94,18 @@ def test_imapping_init(self): }, } - mapping_tree = Mapping(**mapping_dict) + mapping_tree = Mappings(**mapping_dict) client_mock = {} index_name = "classification_report_index_name" # from dict - im1 = IMapping(mapping_dict, client=client_mock, index=index_name) + im1 = IMappings(mapping_dict, client=client_mock, index=index_name) # from tree - im2 = IMapping(mapping_tree, client=client_mock, index=index_name) + im2 = IMappings(mapping_tree, client=client_mock, index=index_name) # from nodes - im3 = IMapping( - mapping={ + im3 = IMappings( + mappings={ "properties": { "classification_type": Keyword(fields={"raw": Text()}), "local_metrics": Nested( @@ -137,8 +137,8 @@ def test_quick_agg(self): """ client_mock = {} - mapping_tree = Mapping(**MAPPING) - client_bound_mapping = IMapping( + mapping_tree = Mappings(**MAPPINGS) + client_bound_mapping = IMappings( mapping_tree, client=client_mock, index="classification_report_index_name" ) diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 70926746..fac245a2 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -11,15 +11,15 @@ from pandagg.discovery import discover, Index from mock import patch -from pandagg.interactive.mapping import IMapping -from tests.testing_samples.mapping_example import MAPPING +from pandagg.interactive.mappings import IMappings +from tests.testing_samples.mapping_example import MAPPINGS from tests.testing_samples.settings_example import SETTINGS indices_mock = { # index name "classification_report_one": { "aliases": {}, - "mappings": MAPPING, + "mappings": MAPPINGS, "settings": SETTINGS, } } @@ -42,5 +42,5 @@ def test_pandagg_wrapper(self, indice_get_mock): self.assertEqual(report_index.__str__(), "") self.assertEqual(report_index.name, "classification_report_one") - # ensure mapping presence - self.assertIsInstance(report_index.mapping, IMapping) + # ensure mappings presence + self.assertIsInstance(report_index.mappings, IMappings) diff --git a/tests/test_response.py b/tests/test_response.py index b6e8907a..058d47cf 100644 --- a/tests/test_response.py +++ b/tests/test_response.py @@ -11,7 +11,7 @@ import tests.testing_samples.data_sample as sample from pandagg.utils import ordered -from tests.testing_samples.mapping_example import MAPPING +from tests.testing_samples.mapping_example import MAPPINGS class ResponseTestCase(PandaggTestCase): @@ -138,7 +138,7 @@ def test_response(self): class AggregationsResponseTestCase(PandaggTestCase): def test_parse_as_tree(self, *_): - my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING) + my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mappings=MAPPINGS) response = Aggregations( data=sample.ES_AGG_RESPONSE, search=Search().aggs(my_agg) ).to_tree() @@ -146,7 +146,7 @@ def test_parse_as_tree(self, *_): self.assertEqual(response.__str__(), sample.EXPECTED_RESPONSE_TREE_REPR) def test_normalize_buckets(self): - my_agg = 
Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
+        my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mappings=MAPPINGS)
         response = Aggregations(
             data=sample.ES_AGG_RESPONSE, search=Search().aggs(my_agg)
         ).to_normalized()
@@ -156,7 +156,7 @@ def test_normalize_buckets(self):

     def test_parse_as_tabular(self):
         # with single agg at root
-        my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
+        my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mappings=MAPPINGS)
         index_names, index_values = Aggregations(
             data=sample.ES_AGG_RESPONSE, search=Search().aggs(my_agg)
         ).to_tabular(index_orient=True, grouped_by="global_metrics.field.name")
@@ -281,7 +281,7 @@ def test_parse_as_tabular_multiple_roots(self):
         )

     def test_parse_as_dataframe(self):
-        my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
+        my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mappings=MAPPINGS)
         df = Aggregations(
             data=sample.ES_AGG_RESPONSE, search=Search().aggs(my_agg)
         ).to_dataframe(grouped_by="global_metrics.field.name")
@@ -320,7 +320,7 @@ def test_parse_as_dataframe(self):
         )

     def test_grouping_agg(self):
-        my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
+        my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mappings=MAPPINGS)
         agg_response = Aggregations(
             data=sample.ES_AGG_RESPONSE, search=Search().aggs(my_agg)
         )
diff --git a/tests/test_search.py b/tests/test_search.py
index 11dab7d1..70a158a2 100644
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -7,7 +7,7 @@
 from pandagg.node import Max
 from pandagg.search import Search
 from pandagg.query import Query, Bool, Match
-from pandagg.tree import Mapping
+from pandagg.tree import Mappings
 from pandagg.utils import ordered
 from tests import PandaggTestCase
@@ -38,16 +38,16 @@ def test_search_query_combines_query(self):
         )

     def test_search_column_selection(self):
-        mapping = Mapping(
+        mappings = Mappings(
             properties={"col1": {"type": "keyword"}, "col2": {"type": "integer"}}
         )
         self.assertEqual(
-            Search(mapping=mapping)[["col1", "col2"]].to_dict(),
+            Search(mappings=mappings)[["col1", "col2"]].to_dict(),
             {"_source": {"includes": ["col1", "col2"]}},
         )
         with self.assertRaises(KeyError):
-            Search(mapping=Mapping(properties={"example": {"type": "keyword"}}))[
+            Search(mappings=Mappings(properties={"example": {"type": "keyword"}}))[
                 ["col1", "col2"]
             ]
diff --git a/tests/testing_samples/data_sample.py b/tests/testing_samples/data_sample.py
index f2a14529..42a1db0d 100644
--- a/tests/testing_samples/data_sample.py
+++ b/tests/testing_samples/data_sample.py
@@ -6,7 +6,7 @@
 """
 from pandagg.aggs import Aggs, Avg, Terms
-from tests.testing_samples.mapping_example import MAPPING
+from tests.testing_samples.mapping_example import MAPPINGS


 EXPECTED_AGG_QUERY = {
@@ -33,7 +33,7 @@
 def get_wrapper_declared_agg():
     return (
-        Aggs(mapping=MAPPING)
+        Aggs(mappings=MAPPINGS)
         .groupby("classification_type")
         .groupby("global_metrics.field.name")
         .agg("avg_nb_classes", Avg(field="global_metrics.dataset.nb_classes"))
diff --git a/tests/testing_samples/mapping_example.py b/tests/testing_samples/mapping_example.py
index 258cd796..db9e901d 100644
--- a/tests/testing_samples/mapping_example.py
+++ b/tests/testing_samples/mapping_example.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

-MAPPING = {
+MAPPINGS = {
     "dynamic": False,
     "properties": {
         "classification_type": {"type": "keyword"},
@@ -136,5 +136,5 @@
 └── workflow                                              Keyword
 """

-EXPECTED_MAPPING_TREE_REPR = """\n%s""" % EXPECTED_MAPPING_REPR
-EXPECTED_CLIENT_BOUND_MAPPING_REPR = """\n%s""" % EXPECTED_MAPPING_REPR
+EXPECTED_MAPPING_TREE_REPR = """\n%s""" % EXPECTED_MAPPING_REPR
+EXPECTED_CLIENT_BOUND_MAPPING_REPR = """\n%s""" % EXPECTED_MAPPING_REPR
diff --git a/tests/tree/aggs/test_aggs.py b/tests/tree/aggs/test_aggs.py
index e32e66ff..2507f309 100644
--- a/tests/tree/aggs/test_aggs.py
+++ b/tests/tree/aggs/test_aggs.py
@@ -15,7 +15,7 @@
 import tests.testing_samples.data_sample as sample
-from tests.testing_samples.mapping_example import MAPPING
+from tests.testing_samples.mapping_example import MAPPINGS


 class AggTestCase(TestCase):
@@ -101,7 +101,7 @@ def test_deserialize_nodes_with_subaggs(self):
         )

     def test_add_node_with_mapping(self):
-        with_mapping = Aggs(mapping=MAPPING, nested_autocorrect=True)
+        with_mapping = Aggs(mappings=MAPPINGS, nested_autocorrect=True)

         # add regular node
         with_mapping = with_mapping.agg("workflow", Terms(field="workflow"))
@@ -249,7 +249,7 @@ def test_paste_tree_with_mapping(self):
                     }
                 }
             },
-            mapping=MAPPING,
+            mappings=MAPPINGS,
         )
         self.assertEqual({k for k, _ in initial_agg_1.list()}, {None, "week"})
         pasted_agg_1 = Aggs(
@@ -314,7 +314,7 @@ def test_paste_tree_with_mapping(self):
                     }
                 }
             },
-            mapping=MAPPING,
+            mappings=MAPPINGS,
             nested_autocorrect=True,
         )
         self.assertEqual({k for k, _ in initial_agg_2.list()}, {None, "week"})
@@ -460,7 +460,7 @@ def test_interpret_agg_string(self):
         # with required nested
         some_agg = Aggs(
             {"term_workflow": {"terms": {"field": "workflow", "size": 5}}},
-            mapping=MAPPING,
+            mappings=MAPPINGS,
             nested_autocorrect=True,
         )
         some_agg = some_agg.agg(
@@ -495,7 +495,7 @@ def test_aggs(self):
         # with parent with required nested
         some_agg = Aggs(
             {"term_workflow": {"terms": {"field": "workflow", "size": 5}}},
-            mapping=MAPPING,
+            mappings=MAPPINGS,
             nested_autocorrect=True,
         )
         node = Avg(field="local_metrics.performance.test.f1_score")
@@ -572,7 +572,7 @@ def test_aggs_strings(self):

     def test_init_from_node_hierarchy(self):
         node_hierarchy = sample.get_node_hierarchy()
-        agg = Aggs(node_hierarchy, mapping=MAPPING)
+        agg = Aggs(node_hierarchy, mappings=MAPPINGS)
         self.assertEqual(agg.to_dict(), sample.EXPECTED_AGG_QUERY)

         # with nested
@@ -593,7 +593,7 @@ def test_init_from_node_hierarchy(self):
                 },
             )
         }
-        agg = Aggs(node_hierarchy, mapping=MAPPING, nested_autocorrect=True)
+        agg = Aggs(node_hierarchy, mappings=MAPPINGS, nested_autocorrect=True)
         self.assertEqual(
             agg.to_dict(),
             {
@@ -796,7 +796,7 @@ def test_applied_nested_path_at_node(self):
                 },
             )
         }
-        agg = Aggs(node_hierarchy, mapping=MAPPING, nested_autocorrect=True)
+        agg = Aggs(node_hierarchy, mappings=MAPPINGS, nested_autocorrect=True)
         self.assertEqual(agg.applied_nested_path_at_node(agg.id_from_key("week")), None)
         for node_key in (
@@ -871,7 +871,7 @@ def test_deepest_linear_agg(self):
                 },
             )
         }
-        agg = Aggs(node_hierarchy, mapping=MAPPING, nested_autocorrect=True)
+        agg = Aggs(node_hierarchy, mappings=MAPPINGS, nested_autocorrect=True)
         self.assertEqual(
             agg.get_key(agg._deepest_linear_bucket_agg),
             "local_metrics.field_class.name",
@@ -896,7 +896,7 @@ def test_deepest_linear_agg(self):
                 },
             )
         }
-        agg2 = Aggs(node_hierarchy_2, mapping=MAPPING, nested_autocorrect=True)
+        agg2 = Aggs(node_hierarchy_2, mappings=MAPPINGS, nested_autocorrect=True)
         self.assertEqual(agg2.get_key(agg2._deepest_linear_bucket_agg), "week")

     def test_grouped_by(self):
diff --git a/tests/tree/mapping/test_mapping.py b/tests/tree/mapping/test_mappings.py
similarity index 88%
rename from tests/tree/mapping/test_mapping.py
rename to tests/tree/mapping/test_mappings.py
index 49169da7..c7a575a0 100644
--- a/tests/tree/mapping/test_mapping.py
+++ b/tests/tree/mapping/test_mappings.py
@@ -4,15 +4,15 @@
 from unittest import TestCase

 from pandagg.exceptions import AbsentMappingFieldError
-from pandagg.mapping import Keyword, Object, Text, Nested, Integer, Mapping
-from pandagg.node.mapping.abstract import Field
-from tests.testing_samples.mapping_example import MAPPING, EXPECTED_MAPPING_TREE_REPR
+from pandagg.mappings import Keyword, Object, Text, Nested, Integer, Mappings
+from pandagg.node.mappings.abstract import Field
+from tests.testing_samples.mapping_example import MAPPINGS, EXPECTED_MAPPING_TREE_REPR


-class MappingTreeTestCase(TestCase):
+class MappingsTreeTestCase(TestCase):
     """All tree logic is tested in utils.
     Here, check that:
-    - a dict mapping is correctly parsed into a tree,
+    - a dict mappings is correctly parsed into a tree,
     - it has the right representation.
     """
@@ -45,9 +45,9 @@ def test_deserialization(self):
             },
         }

-        m1 = Mapping(**mapping_dict)
+        m1 = Mappings(**mapping_dict)

-        m2 = Mapping(
+        m2 = Mappings(
             dynamic=False,
             properties={
                 "classification_type": Keyword(fields={"raw": Text()}),
@@ -66,7 +66,7 @@ def test_deserialization(self):
             },
         )

-        expected_repr = """
+        expected_repr = """
 _
 ├── classification_type                                   Keyword
 │   └── raw                                               ~ Text
@@ -80,12 +80,12 @@ def test_deserialization(self):
             self.assertEqual(m.to_dict(), mapping_dict, "failed at m%d" % (i + 1))

     def test_parse_tree_from_dict(self):
-        mapping_tree = Mapping(**MAPPING)
+        mapping_tree = Mappings(**MAPPINGS)

         self.assertEqual(mapping_tree.__str__(), EXPECTED_MAPPING_TREE_REPR)

     def test_nesteds_applied_at_field(self):
-        mapping_tree = Mapping(**MAPPING)
+        mapping_tree = Mappings(**MAPPINGS)

         self.assertEqual(mapping_tree.nested_at_field("classification_type"), None)
         self.assertEqual(mapping_tree.list_nesteds_at_field("classification_type"), [])
@@ -108,7 +108,7 @@ def test_nesteds_applied_at_field(self):
         )

     def test_mapping_type_of_field(self):
-        mapping_tree = Mapping(**MAPPING)
+        mapping_tree = Mappings(**MAPPINGS)

         with self.assertRaises(AbsentMappingFieldError):
             self.assertEqual(mapping_tree.mapping_type_of_field("yolo"), False)
@@ -124,7 +124,7 @@ def test_mapping_type_of_field(self):
         )

     def test_node_path(self):
-        mapping_tree = Mapping(**MAPPING)
+        mapping_tree = Mappings(**MAPPINGS)
         # get node by path syntax
         k, node = mapping_tree.get("local_metrics.dataset.support_test", by_path=True)
         self.assertIsInstance(node, Field)
diff --git a/tests/tree/query/test_query.py b/tests/tree/query/test_query.py
index bd49b448..e1e7cac4 100644
--- a/tests/tree/query/test_query.py
+++ b/tests/tree/query/test_query.py
@@ -1021,7 +1021,7 @@ def test_multiple_must_below_nested_query(self):

     def test_autonested(self):
         q = Query(
-            mapping={
+            mappings={
                 "properties": {
                     "actors": {
                         "type": "nested",
diff --git a/tests/tree/test_response.py b/tests/tree/test_response.py
index 0cc08911..a67e05a6 100644
--- a/tests/tree/test_response.py
+++ b/tests/tree/test_response.py
@@ -5,7 +5,7 @@
 from pandagg.tree.aggs import Aggs
 from pandagg.tree.response import AggsResponseTree
-from tests.testing_samples.mapping_example import MAPPING
+from tests.testing_samples.mapping_example import MAPPINGS
 import tests.testing_samples.data_sample as sample


@@ -13,7 +13,7 @@ class ResponseTestCase(TestCase):
     @patch("uuid.uuid4")
     def test_response_tree(self, uuid_mock):
         uuid_mock.side_effect = range(1000)
-        my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
+        my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mappings=MAPPINGS)
         response_tree = AggsResponseTree(aggs=my_agg).parse(sample.ES_AGG_RESPONSE)

         self.assertEqual(response_tree.__str__(), sample.EXPECTED_RESPONSE_TREE_REPR)
         self.assertEqual(len(response_tree.list()), 15)
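
Note: the snippet below is not part of the patch. It is a minimal sketch of the renamed API, reconstructed from `test_search_column_selection` above; the `to_dict()` output and the `KeyError` behaviour are taken verbatim from that test.

```python
# Sketch of the post-rename API as exercised by the tests in this patch:
# the `Mapping` tree class becomes `Mappings`, and the `mapping=` keyword
# argument (on Search, Query, Aggs) becomes `mappings=`.
from pandagg.mappings import Mappings
from pandagg.search import Search

# Build a mappings tree from a `properties` dict.
mappings = Mappings(
    properties={"col1": {"type": "keyword"}, "col2": {"type": "integer"}}
)

# Column selection is validated against the provided mappings.
search = Search(mappings=mappings)[["col1", "col2"]]
assert search.to_dict() == {"_source": {"includes": ["col1", "col2"]}}

# Selecting a field absent from the mappings raises a KeyError.
try:
    Search(mappings=Mappings(properties={"example": {"type": "keyword"}}))[
        ["col1", "col2"]
    ]
except KeyError:
    pass
```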