From ee194b0edf9e0d2528ad8cd30c311d609dfbd606 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Fri, 20 Dec 2024 15:16:07 +0100 Subject: [PATCH] refactor: decouple rule tree configuration and refactor initialization - Extracted RuleTree configuration into a dedicated `Config` class for improved modularity. - Updated unit tests and references to reflect the refactoring changes in RuleTree initialization. - Add documentation to rst docs --- README.md | 39 ----- doc/source/conf.py | 1 + doc/source/configuration/rules.rst | 1 + logprep/abc/processor.py | 4 +- logprep/framework/rule_tree/rule_tree.py | 153 ++++++++++++------ pyproject.toml | 1 + .../framework/rule_tree/test_rule_tree.py | 52 +++--- tests/unit/processor/base.py | 8 +- .../processor/clusterer/test_clusterer.py | 6 +- tests/unit/processor/labeler/test_labeler.py | 2 +- 10 files changed, 149 insertions(+), 118 deletions(-) diff --git a/README.md b/README.md index 2177db99e..4a7a01b0b 100644 --- a/README.md +++ b/README.md @@ -91,45 +91,6 @@ and secondly they specify how to process the message. For example which fields should be deleted or to which IP-address the geolocation should be retrieved. -For performance reasons on startup all rules per processor are aggregated to a rule tree. -Instead of evaluating all rules independently for each log message the message is checked against -the rule tree. -Each node in the rule tree represents a condition that has to be meet, -while the leafs represent changes that the processor should apply. -If no condition is met, the processor will just pass the log event to the next processor. - -The following chart gives an example of such a rule tree: - -```mermaid -flowchart TD -A[root] -A-->B[Condition 1] -A-->C[Condition 2] -A-->D[Condition 3] -B-->E[Condition 4] -B-->H(Rule 1) -C-->I(Rule 2) -D-->J(rule 3) -E-->G(Rule 4) -``` - -To further improve the performance, it is possible to prioritize specific nodes of the rule tree, -such that broader conditions are higher up in the tree. -And specific conditions can be moved further down. -Following json gives an example of such a rule tree configuration. -This configuration will lead to the prioritization of `tags` and `message` in the rule tree. - -```json -{ - "priority_dict": { - "category": "01", - "message": "02" - }, - "tag_map": { - "check_field_name": "check-tag" - } -} -``` ### Connectors diff --git a/doc/source/conf.py b/doc/source/conf.py index 03169988a..c0ace2bbf 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -66,6 +66,7 @@ def setup(app): "sphinx.ext.napoleon", "sphinx.ext.autosummary", "sphinxcontrib.datatemplates", + "sphinxcontrib.mermaid", "nbsphinx", "IPython.sphinxext.ipython_console_highlighting", "sphinx_copybutton", diff --git a/doc/source/configuration/rules.rst b/doc/source/configuration/rules.rst index 38b32882a..48e4e1483 100644 --- a/doc/source/configuration/rules.rst +++ b/doc/source/configuration/rules.rst @@ -7,3 +7,4 @@ Rules .. automodule:: logprep.processor.base.rule .. automodule:: logprep.filter.lucene_filter +.. automodule:: logprep.framework.rule_tree.rule_tree diff --git a/logprep/abc/processor.py b/logprep/abc/processor.py index 391b366ac..b8092941a 100644 --- a/logprep/abc/processor.py +++ b/logprep/abc/processor.py @@ -98,7 +98,7 @@ class Config(Component.Config): tree_config: Optional[str] = field( default=None, validator=[validators.optional(validators.instance_of(str))] ) - """Path to a JSON file with a valid rule tree configuration. + """Path to a JSON file with a valid :ref:`Rule Tree Configuration`. For string format see :ref:`getters`.""" apply_multiple_times: Optional[bool] = field( default=False, validator=[validators.optional(validators.instance_of(bool))] @@ -123,7 +123,7 @@ class Config(Component.Config): def __init__(self, name: str, configuration: "Processor.Config"): super().__init__(name, configuration) - self._rule_tree = RuleTree(processor_name=self.name, processor_config=self._config) + self._rule_tree = RuleTree(config=self._config.tree_config) self.load_rules(rules_targets=self._config.rules) self.result = None self._bypass_rule_tree = False diff --git a/logprep/framework/rule_tree/rule_tree.py b/logprep/framework/rule_tree/rule_tree.py index 9165a33bb..4051e741b 100644 --- a/logprep/framework/rule_tree/rule_tree.py +++ b/logprep/framework/rule_tree/rule_tree.py @@ -1,94 +1,154 @@ -"""This module contains the rule tree functionality.""" +""" +.. _Rule Tree: -from enum import Enum -from logging import Logger -from typing import TYPE_CHECKING, List, Optional, Union +RuleTree +======== + +For performance reasons on startup, all rules per processor are aggregated to a rule tree. +Instead of evaluating all rules independently for each log message, the message is checked against +the rule tree. +Each node in the rule tree represents a condition that has to be met, +while the leaves represent changes that the processor should apply. +If no condition is met, the processor will just pass the log event to the next processor. + +The following chart gives an example of such a rule tree: + +.. mermaid:: + + flowchart TD + A[root] + A-->B[Condition 1] + A-->C[Condition 2] + A-->D[Condition 3] + B-->E[Condition 4] + B-->H(Rule 1) + C-->I(Rule 2) + D-->J(rule 3) + E-->G(Rule 4) + + +.. _Rule Tree Configuration: + +Rule Tree Configuration +^^^^^^^^^^^^^^^^^^^^^^^ + +To further improve the performance, it is possible to prioritize specific nodes of the rule tree, +such that broader conditions are higher up in the tree. +And specific conditions can be moved further down. +The following json gives an example of such a rule tree configuration. +This configuration will lead to the prioritization of `category` and `message` in the rule tree. + +.. code-block:: json + :linenos: + + { + "priority_dict": { + "category": "01", + "message": "02" + }, + "tag_map": { + "check_field_name": "check-tag" + } + } + +A path to a rule tree configuration can be set in any processor configuration under the key +:code:`tree_config`. +""" + +from logging import getLogger +from typing import TYPE_CHECKING, List, Optional + +from attr import validators +from attrs import define, field from logprep.framework.rule_tree.node import Node from logprep.framework.rule_tree.rule_parser import RuleParser from logprep.util import getter if TYPE_CHECKING: # pragma: no cover - from logprep.abc.processor import Processor from logprep.processor.base.rule import Rule +logger = getLogger("RuleTree") + class RuleTree: """Represent a set of rules using a rule tree model.""" + @define(kw_only=True, slots=False) + class Config: + """Configuration for the RuleTree""" + + priority_dict: dict[str] = field( + validator=[ + validators.instance_of(dict), + validators.deep_mapping( + key_validator=validators.instance_of(str), + value_validator=validators.instance_of(str), + ), + ], + default=dict(), + ) + """Fields used in filter expressions can be prioritized by specifying + them in this priority dict. The key describes the field name used in + the filter expression and the value describes the priority in form + of string number. The higher the number, the higher the priority.""" + + tag_map: dict = field( + validator=[ + validators.instance_of(dict), + validators.deep_mapping( + key_validator=validators.instance_of(str), + value_validator=validators.instance_of(str), + ), + ], + default=dict(), + ) + """tbd""" + __slots__ = ( "rule_parser", - "priority_dict", "_rule_mapping", - "_processor_config", - "_processor_type", - "_processor_name", "_root", "__dict__", ) rule_parser: Optional[RuleParser] - priority_dict: dict _rule_mapping: dict - _processor_name: str - _processor_config: "Processor.Config" - _processor_type: str _root: Node def __init__( self, root: Node = None, - processor_name: str = None, - processor_config: "Processor.Config" = None, + config: str = None, ): """Rule tree initialization function. Initializes a new rule tree with a given root node and a path to the tree's optional config file. If no root node is specified, a new node will be created and used as root node. - Also starts the further setup. Parameters ---------- root: Node, optional Node that should be used as the new rule tree's root node. - processor_config: Processor.Config, optional - Configuration of the processor that uses the rule tree. - + config: str, optional + Path to a tree configuration. """ - self.rule_parser = None self._rule_mapping = {} - self._processor_config = processor_config - self._processor_name = processor_name if processor_name is not None else "" - self._processor_type = processor_config.type if processor_name is not None else "" - self._setup() + self.tree_config = RuleTree.Config() + if config: + config_data = getter.GetterFactory.from_string(config).get_json() + self.tree_config = RuleTree.Config(**config_data) + self.rule_parser = RuleParser(self.tree_config.tag_map) + self._root = Node(None) if root: self._root = root - else: - self._root = Node(None) @property def number_of_rules(self) -> int: return len(self._rule_mapping) - def _setup(self): - """Basic setup of rule tree. - - Initiate the rule tree's priority dict, tag map and load the configuration from file. - - """ - self.priority_dict = {} - tag_map = {} - - if self._processor_config and self._processor_config.tree_config: - config_data = getter.GetterFactory.from_string( - self._processor_config.tree_config - ).get_json() - self.priority_dict = config_data["priority_dict"] - tag_map = config_data["tag_map"] - self.rule_parser = RuleParser(tag_map) - - def add_rule(self, rule: "Rule", logger: Logger = None): + def add_rule(self, rule: "Rule"): """Add rule to rule tree. Add a new rule to the rule tree. @@ -102,12 +162,9 @@ def add_rule(self, rule: "Rule", logger: Logger = None): ---------- rule: Rule Rule to be added to the rule tree. - logger: Logger - Logger to use for logging. - """ try: - parsed_rule = self.rule_parser.parse_rule(rule, self.priority_dict) + parsed_rule = self.rule_parser.parse_rule(rule, self.tree_config.priority_dict) except Exception as error: # pylint: disable=broad-except logger.warning( f'Error parsing rule "{rule.file_name}.yml": {type(error).__name__}: {error}. ' diff --git a/pyproject.toml b/pyproject.toml index fd45c4188..8bcdf29e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -120,6 +120,7 @@ doc = [ "sphinx", "sphinx_rtd_theme", "sphinxcontrib.datatemplates", + "sphinxcontrib-mermaid", "sphinx-copybutton", "nbsphinx", "ipython", diff --git a/tests/unit/framework/rule_tree/test_rule_tree.py b/tests/unit/framework/rule_tree/test_rule_tree.py index 3b5a7a7cd..54168bef7 100644 --- a/tests/unit/framework/rule_tree/test_rule_tree.py +++ b/tests/unit/framework/rule_tree/test_rule_tree.py @@ -1,17 +1,18 @@ # pylint: disable=protected-access # pylint: disable=missing-docstring # pylint: disable=line-too-long +import json from copy import deepcopy from unittest import mock import pytest -from logprep.factory import Factory from logprep.filter.expression.filter_expression import Exists, StringFilterExpression from logprep.framework.rule_tree.node import Node from logprep.framework.rule_tree.rule_parser import RuleParser from logprep.framework.rule_tree.rule_tree import RuleTree from logprep.processor.pre_detector.rule import PreDetectorRule +from logprep.util import getter @pytest.fixture(name="rule_dict") @@ -33,29 +34,42 @@ def rule_dict_fixture(): class TestRuleTree: def test_init_without_specifying_parameters(self): rule_tree = RuleTree() - assert isinstance(rule_tree.root, Node) assert not rule_tree.rule_parser._rule_tagger._tag_map - assert not rule_tree.priority_dict + assert not rule_tree.tree_config.priority_dict assert rule_tree.root.expression is None def test_init_with_specifying_config(self): - processor = Factory.create( - { - "processor": { - "type": "dissector", - "rules": [], - "tree_config": "tests/testdata/unit/tree_config.json", - } - } - ) - rule_tree = RuleTree(processor_config=processor._config) - + rule_tree = RuleTree(config="tests/testdata/unit/tree_config.json") assert isinstance(rule_tree.root, Node) assert rule_tree.rule_parser._rule_tagger._tag_map == { "field_name_to_check_for_in_rule": "TAG-TO-CHECK-IF-IN-EVENT" } - assert rule_tree.priority_dict == {"field_name": "priority"} + assert rule_tree.tree_config.priority_dict == {"field_name": "priority"} + + @pytest.mark.parametrize( + "tree_config", + [ + { + "priority_dict": { + "winlog": 1, # should be string + }, + "tag_map": {"field_name_to_check_for_in_rule": "TAG-TO-CHECK-IF-IN-EVENT"}, + }, + { + "priority_dict": { + "winlog": "1", + }, + "tag_map": {"field_name_to_check_for_in_rule": 1}, # should be string + }, + ], + ) + def test_init_with_invalid_tree_config(self, tmp_path, tree_config): + tree_config_path = tmp_path / "tree_config.json" + tree_config_path.write_text(json.dumps(tree_config)) + config_data = getter.GetterFactory.from_string(str(tree_config_path)).get_json() + with pytest.raises(TypeError, match=r"must be \"): + self.tree_config = RuleTree.Config(**config_data) def test_add_rule(self, rule_dict): rule_tree = RuleTree() @@ -80,21 +94,21 @@ def test_add_rule(self, rule_dict): rule ] - def test_add_rule_fails(self, rule_dict): + @mock.patch("logging.Logger.warning") + def test_add_rule_fails(self, mock_warning, rule_dict): rule_tree = RuleTree() rule = PreDetectorRule._create_from_dict(rule_dict) - mocked_logger = mock.MagicMock() with mock.patch( "logprep.framework.rule_tree.rule_parser.RuleParser.parse_rule", side_effect=Exception("mocked error"), ): - rule_tree.add_rule(rule, logger=mocked_logger) + rule_tree.add_rule(rule) expected_call = mock.call.warning( 'Error parsing rule "None.yml": Exception: mocked error. ' "Ignore and continue with next rule." ) - assert expected_call in mocked_logger.mock_calls + assert expected_call in mock_warning.mock_calls def test_get_rule_id(self, rule_dict): rule_tree = RuleTree() diff --git a/tests/unit/processor/base.py b/tests/unit/processor/base.py index 25a3b43b2..03076da6e 100644 --- a/tests/unit/processor/base.py +++ b/tests/unit/processor/base.py @@ -71,7 +71,7 @@ def set_rules(rules_dirs): def _load_rule(self, rule: dict | Rule): self.object._rule_tree = RuleTree() rule = self.object.rule_class._create_from_dict(rule) if isinstance(rule, dict) else rule - self.object._rule_tree.add_rule(rule, self.logger) + self.object._rule_tree.add_rule(rule) def setup_method(self) -> None: """ @@ -224,12 +224,8 @@ def test_accepts_tree_config_from_http(self): tree_config = Path("tests/testdata/unit/tree_config.json").read_text() responses.add(responses.GET, "http://does.not.matter.bla/tree_config.yml", tree_config) processor = Factory.create({"test instance": config}) - assert ( - processor._rule_tree._processor_config.tree_config - == "http://does.not.matter.bla/tree_config.yml" - ) tree_config = json.loads(tree_config) - assert processor._rule_tree.priority_dict == tree_config.get("priority_dict") + assert processor._rule_tree.tree_config.priority_dict == tree_config.get("priority_dict") @responses.activate def test_raises_http_error(self): diff --git a/tests/unit/processor/clusterer/test_clusterer.py b/tests/unit/processor/clusterer/test_clusterer.py index fc2b42684..c4fd38686 100644 --- a/tests/unit/processor/clusterer/test_clusterer.py +++ b/tests/unit/processor/clusterer/test_clusterer.py @@ -167,7 +167,7 @@ def test_cluster(self): document = {"message": "test signature test"} rule = ClustererRule._create_from_dict(rule_definition) - self.object._rule_tree.add_rule(rule, None) + self.object._rule_tree.add_rule(rule) self.object._cluster(document, rule) assert document == expected @@ -230,7 +230,7 @@ def test_rule_dependency_one(self, tmp_path): new_rule = ClustererRule._create_from_dict(rule) new_rule.file_name = str(idx) rules.append(new_rule) - clusterer._rule_tree.add_rule(new_rule, None) + clusterer._rule_tree.add_rule(new_rule) expected = { "message": "test some signature xyz-foo", @@ -310,7 +310,7 @@ def test_rule_dependency_two(self, tmp_path): new_rule = ClustererRule._create_from_dict(rule) new_rule.file_name = str(idx) rules.append(new_rule) - clusterer._rule_tree.add_rule(new_rule, None) + clusterer._rule_tree.add_rule(new_rule) for rule in rules: clusterer._cluster(document, rule) diff --git a/tests/unit/processor/labeler/test_labeler.py b/tests/unit/processor/labeler/test_labeler.py index 2d6daff45..c17e9b075 100644 --- a/tests/unit/processor/labeler/test_labeler.py +++ b/tests/unit/processor/labeler/test_labeler.py @@ -68,7 +68,7 @@ def _load_rule(self, rule, schema=None): # pylint: disable=arguments-differ rule = LabelerRule._create_from_dict(rule) if schema: rule.add_parent_labels_from_schema(schema) - self.object._rule_tree.add_rule(rule, self.logger) + self.object._rule_tree.add_rule(rule) def test_process_adds_labels_to_event(self): rule = {"filter": "applyrule", "labeler": {"label": {"reporter": ["windows"]}}}