From 1cc688e7b9cc87b380474d1344b96f065a2a7c4b Mon Sep 17 00:00:00 2001 From: Carl Csaposs Date: Mon, 25 Mar 2024 13:01:23 +0100 Subject: [PATCH 1/6] Re-balance roles deterministically instead of randomly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (for auto-generated roles) Algorithm: - if odd # of nodes: all nodes are cluster manager eligible - if even # of nodes: node with highest unit number is data node, all other nodes are cluster manager eligible Replaces old algorithm of `random.choice` one of the nodes that should change roles This will enable in-place upgrades to be ordered from highest to lowest unit number even if roles are re-balanced (for HA [e.g. because of network cut]) while upgrade is in-progress That allows upgrade/rollback to be coordinated without leader unit setting information in peer databag, which allows partial rollback even if leader unit is in error state --- Also, this algorithm ensures that if a unit needs to be restarted for HA while upgrade in-progress, a unit (the highest number unit) on the newer version of the charm/workload will be upgraded. This reduces the likelihood that: - one of the last units on the older version restarts and - the cluster manager switches to operating with the new workload version (i.e. it stops operating in compatability mode with the old version—see https://discuss.elastic.co/t/rolling-upgrades-master-nodes-voting-config-exclusions/320463/2) which means that units on the old version cannot re-connect to the cluster --- lib/charms/opensearch/v0/helper_cluster.py | 148 +++---------- lib/charms/opensearch/v0/models.py | 1 + .../opensearch/v0/opensearch_base_charm.py | 5 +- lib/charms/opensearch/v0/opensearch_distro.py | 2 + tests/integration/ha/helpers.py | 1 + tests/unit/lib/test_helper_cluster.py | 197 ++++++++++-------- tests/unit/lib/test_opensearch_base_charm.py | 1 + .../unit/lib/test_opensearch_peer_clusters.py | 8 +- 8 files changed, 153 insertions(+), 210 deletions(-) diff --git a/lib/charms/opensearch/v0/helper_cluster.py b/lib/charms/opensearch/v0/helper_cluster.py index be51f96e4..496187e65 100644 --- a/lib/charms/opensearch/v0/helper_cluster.py +++ b/lib/charms/opensearch/v0/helper_cluster.py @@ -3,7 +3,6 @@ """Utility classes and methods for getting cluster info, configuration info and suggestions.""" import logging -from random import choice from typing import Dict, List, Optional from charms.opensearch.v0.helper_enums import BaseStrEnum @@ -36,26 +35,20 @@ class ClusterTopology: """Class for creating the best possible configuration for a Node.""" @staticmethod - def suggest_roles(nodes: List[Node], planned_units: int) -> List[str]: + def suggest_roles(nodes: List[Node], planned_units: int, unit_number: int) -> List[str]: """Get roles for a Node. - This method should be read in the context of a "rolling" start - - only 1 unit at a time will call this. - For now, we don't allow to end-user control roles. The logic here is, if number of planned units is: — odd: "all" the nodes are cm_eligible nodes. — even: "all - 1" are cm_eligible and 1 data node. """ - max_cms = ClusterTopology.max_cluster_manager_nodes(planned_units) - base_roles = ["data", "ingest", "ml", "coordinating_only"] - - nodes_by_roles = ClusterTopology.nodes_count_by_role(nodes) - if nodes_by_roles.get("cluster_manager", 0) == max_cms: + full_roles = base_roles + ["cluster_manager"] + highest_unit_number = max(node.unit_number for node in nodes) + if planned_units % 2 == 0 and unit_number == highest_unit_number: return base_roles - - return base_roles + ["cluster_manager"] + return full_roles @staticmethod def get_cluster_settings( @@ -77,115 +70,33 @@ def get_cluster_settings( @staticmethod def recompute_nodes_conf(app_name: str, nodes: List[Node]) -> Dict[str, Node]: """Recompute the configuration of all the nodes (cluster set to auto-generate roles).""" - # in case the cluster nodes' roles were previously "manually generated" - we need - # to reset the roles to their expected default values so that "roles re-balancing" - # logic (node_with_new_roles) can be safely applied: only change the last node. - # Nothing changes to the conf of the nodes if the roles were previously auto-generated. - valid_nodes = ClusterTopology.refill_node_with_default_roles(app_name, nodes) - - nodes_by_name = dict([(node.name, node) for node in nodes]) - - # compute node with new roles - only returns 1 changed node - updated_node = ClusterTopology.node_with_new_roles(app_name, valid_nodes) - if updated_node: - nodes_by_name[updated_node.name] = updated_node - - return nodes_by_name - - @staticmethod - def refill_node_with_default_roles(app_name: str, nodes: List[Node]) -> List[Node]: - """Refill the roles of a list of nodes with default values for re-computing. - - This method works hand in hand with node_with_new_roles which assumes a clean - base (regarding the auto-generation logic) and only applies changes to 1 node. - """ + nodes_by_name = {} + current_cluster_nodes = [] + for node in nodes: + if node.app_name == app_name: + current_cluster_nodes.append(node) + else: + # Leave node unchanged + nodes_by_name[node.name] = node base_roles = ["data", "ingest", "ml", "coordinating_only"] full_roles = base_roles + ["cluster_manager"] - - current_cluster_nodes = [node for node in nodes if node.app_name == app_name] - current_cm_eligible = [node for node in current_cluster_nodes if node.is_cm_eligible()] - - # we check if the roles were previously balanced in accordance with the auto-generation - # logic, in which the max difference between CMs and Non-CMs is 1 node (to keep quorum) - unbalanced = len(current_cm_eligible) < len(current_cluster_nodes) - 1 - - updated_nodes = [] + highest_unit_number = max(node.unit_number for node in current_cluster_nodes) for node in current_cluster_nodes: # we do this in order to remove any non-default role / add any missing default role - new_roles = full_roles if unbalanced or node.is_cm_eligible() else base_roles - updated = Node( + if len(current_cluster_nodes) % 2 == 0 and node.unit_number == highest_unit_number: + roles = base_roles + else: + roles = full_roles + + nodes_by_name[node.name] = Node( name=node.name, - roles=new_roles, + roles=roles, ip=node.ip, app_name=node.app_name, + unit_number=node.unit_number, temperature=node.temperature, ) - updated_nodes.append(updated) - - return updated_nodes + [node for node in nodes if node.app_name != app_name] - - @staticmethod - def node_with_new_roles(app_name: str, remaining_nodes: List[Node]) -> Optional[Node]: - """Pick and recompute the roles of the best node to re-balance the cluster. - - Args: - app_name: Name of the (current) cluster's app on which the node changes must happen - Important to have this piece of information, as in a multi-cluster - deployments the "remaining nodes" includes nodes from all the fleet. - remaining_nodes: List of nodes remaining in a cluster (sub-cluster or full-fleet) - """ - max_cms = ClusterTopology.max_cluster_manager_nodes(len(remaining_nodes)) - - nodes_by_roles = ClusterTopology.nodes_by_role(remaining_nodes) - nodes_count_by_roles = ClusterTopology.nodes_count_by_role(remaining_nodes) - current_cms = nodes_count_by_roles.get("cluster_manager", 0) - - # the nodes involved in the voting are intact, do nothing - if current_cms == max_cms: - logger.debug("Suggesting NO changes to the nodes.") - return None - - if current_cms > max_cms: - # remove cm from a node - cm = choice( - [node for node in nodes_by_roles["cluster_manager"] if node.app_name == app_name] - ) - logger.debug(f"Suggesting - removal of 'CM': {cm.name}") - return Node( - name=cm.name, - roles=[r for r in cm.roles if r != "cluster_manager"], - ip=cm.ip, - app_name=app_name, - ) - - # when cm count smaller than expected - data_only_nodes = [ - node for node in nodes_by_roles["data"] if "cluster_manager" not in node.roles - ] - - # no data-only node available to change, leave - if not data_only_nodes: - logger.debug("Suggesting NO changes to the nodes.") - return None - - # add cm to a data only (non cm) node - data = choice([node for node in data_only_nodes if node.app_name == app_name]) - logger.debug(f"Suggesting - Addition of 'CM' to data: {data.name}") - return Node( - name=data.name, - roles=data.roles + ["cluster_manager"], - ip=data.ip, - app_name=app_name, - ) - - @staticmethod - def max_cluster_manager_nodes(planned_units) -> int: - """Get the max number of CM nodes in a cluster.""" - max_managers = planned_units - if planned_units % 2 == 0: - max_managers -= 1 - - return max_managers + return nodes_by_name @staticmethod def get_cluster_managers_ips(nodes: List[Node]) -> List[str]: @@ -207,18 +118,6 @@ def get_cluster_managers_names(nodes: List[Node]) -> List[str]: return result - @staticmethod - def nodes_count_by_role(nodes: List[Node]) -> Dict[str, int]: - """Count number of nodes by role.""" - result = {} - for node in nodes: - for role in node.roles: - if role not in result: - result[role] = 0 - result[role] += 1 - - return result - @staticmethod def nodes_by_role(nodes: List[Node]) -> Dict[str, List[Node]]: """Get list of nodes by role.""" @@ -256,6 +155,7 @@ def nodes( roles=obj["roles"], ip=obj["ip"], app_name="-".join(obj["name"].split("-")[:-1]), + unit_number=int(obj["name"].split("-")[-1]), temperature=obj.get("attributes", {}).get("temp"), ) nodes.append(node) diff --git a/lib/charms/opensearch/v0/models.py b/lib/charms/opensearch/v0/models.py index 0b6bf61f0..487d92ce7 100644 --- a/lib/charms/opensearch/v0/models.py +++ b/lib/charms/opensearch/v0/models.py @@ -66,6 +66,7 @@ class Node(Model): roles: List[str] ip: str app_name: str + unit_number: int temperature: Optional[str] = None @classmethod diff --git a/lib/charms/opensearch/v0/opensearch_base_charm.py b/lib/charms/opensearch/v0/opensearch_base_charm.py index 92635dcaf..a4a75ef21 100644 --- a/lib/charms/opensearch/v0/opensearch_base_charm.py +++ b/lib/charms/opensearch/v0/opensearch_base_charm.py @@ -1063,7 +1063,7 @@ def _set_node_conf(self, nodes: List[Node]) -> None: computed_roles = ( update_conf.roles if update_conf - else ClusterTopology.suggest_roles(nodes, self.app.planned_units()) + else ClusterTopology.suggest_roles(nodes, self.app.planned_units(), self.unit_id) ) cm_names = ClusterTopology.get_cluster_managers_names(nodes) @@ -1200,6 +1200,7 @@ def _compute_and_broadcast_updated_topology(self, current_nodes: List[Node]) -> roles=roles, ip=node.ip, app_name=node.app_name, + unit_number=self.unit_id, temperature=temperature, ) @@ -1302,7 +1303,7 @@ def unit_name(self) -> str: @property def unit_id(self) -> int: """ID of the current unit.""" - return int(self.unit.name.split("/")[1]) + return int(self.unit.name.split("/")[-1]) @property def alt_hosts(self) -> Optional[List[str]]: diff --git a/lib/charms/opensearch/v0/opensearch_distro.py b/lib/charms/opensearch/v0/opensearch_distro.py index 812a00af8..02310e150 100644 --- a/lib/charms/opensearch/v0/opensearch_distro.py +++ b/lib/charms/opensearch/v0/opensearch_distro.py @@ -412,6 +412,7 @@ def current(self) -> Node: roles=current_node["roles"], ip=current_node["ip"], app_name=self._charm.app.name, + unit_number=self._charm.unit_id, temperature=current_node.get("attributes", {}).get("temp"), ) except OpenSearchHttpError: @@ -422,6 +423,7 @@ def current(self) -> Node: roles=conf_on_disk["node.roles"], ip=self._charm.unit_ip, app_name=self._charm.app.name, + unit_number=self._charm.unit_id, temperature=conf_on_disk.get("node.attr.temp"), ) diff --git a/tests/integration/ha/helpers.py b/tests/integration/ha/helpers.py index 23629d133..d50ccc12c 100644 --- a/tests/integration/ha/helpers.py +++ b/tests/integration/ha/helpers.py @@ -205,6 +205,7 @@ async def all_nodes(ops_test: OpsTest, unit_ip: str) -> List[Node]: roles=node["roles"], ip=node["ip"], app_name="-".join(node["name"].split("-")[:-1]), + unit_number=int(node["name"].split("-")[-1]), temperature=node.get("attributes", {}).get("temp"), ) ) diff --git a/tests/unit/lib/test_helper_cluster.py b/tests/unit/lib/test_helper_cluster.py index 93eca237d..98804a991 100644 --- a/tests/unit/lib/test_helper_cluster.py +++ b/tests/unit/lib/test_helper_cluster.py @@ -22,18 +22,54 @@ class TestHelperCluster(unittest.TestCase): def cluster1_5_nodes_conf(self) -> List[Node]: """Returns the expected config of a 5 "planned" nodes cluster.""" return [ - Node(name="cm1", roles=self.cm_roles, ip="0.0.0.1", app_name=self.cluster1), - Node(name="cm2", roles=self.cm_roles, ip="0.0.0.2", app_name=self.cluster1), - Node(name="cm3", roles=self.cm_roles, ip="0.0.0.3", app_name=self.cluster1), - Node(name="cm4", roles=self.cm_roles, ip="0.0.0.4", app_name=self.cluster1), - Node(name="cm5", roles=self.cm_roles, ip="0.0.0.5", app_name=self.cluster1), + Node( + name="cm1", + roles=self.cm_roles, + ip="0.0.0.1", + app_name=self.cluster1, + unit_number=0, + ), + Node( + name="cm2", + roles=self.cm_roles, + ip="0.0.0.2", + app_name=self.cluster1, + unit_number=1, + ), + Node( + name="cm3", + roles=self.cm_roles, + ip="0.0.0.3", + app_name=self.cluster1, + unit_number=3, + ), + Node( + name="cm4", + roles=self.cm_roles, + ip="0.0.0.4", + app_name=self.cluster1, + unit_number=4, + ), + Node( + name="cm5", + roles=self.cm_roles, + ip="0.0.0.5", + app_name=self.cluster1, + unit_number=5, + ), ] def cluster1_6_nodes_conf(self): """Returns the expected config of a 6 "planned" nodes cluster.""" nodes = self.cluster1_5_nodes_conf() nodes.append( - Node(name="data1", roles=self.base_roles, ip="0.0.0.6", app_name=self.cluster1) + Node( + name="data1", + roles=self.base_roles, + ip="0.0.0.6", + app_name=self.cluster1, + unit_number=6, + ) ) return nodes @@ -41,11 +77,41 @@ def cluster2_nodes_conf(self) -> List[Node]: """Returns the expected config of the sub-cluster 2.""" roles = ["cluster_manager", "data", "ml"] return [ - Node(name="cm_data_ml1", roles=roles, ip="0.0.0.11", app_name=self.cluster2), - Node(name="cm_data_ml2", roles=roles, ip="0.0.0.12", app_name=self.cluster2), - Node(name="cm_data_ml3", roles=roles, ip="0.0.0.13", app_name=self.cluster2), - Node(name="cm_data_ml4", roles=roles, ip="0.0.0.14", app_name=self.cluster2), - Node(name="cm_data_ml5", roles=roles, ip="0.0.0.15", app_name=self.cluster2), + Node( + name="cm_data_ml1", + roles=roles, + ip="0.0.0.11", + app_name=self.cluster2, + unit_number=0, + ), + Node( + name="cm_data_ml2", + roles=roles, + ip="0.0.0.12", + app_name=self.cluster2, + unit_number=1, + ), + Node( + name="cm_data_ml3", + roles=roles, + ip="0.0.0.13", + app_name=self.cluster2, + unit_number=2, + ), + Node( + name="cm_data_ml4", + roles=roles, + ip="0.0.0.14", + app_name=self.cluster2, + unit_number=3, + ), + Node( + name="cm_data_ml5", + roles=roles, + ip="0.0.0.15", + app_name=self.cluster2, + unit_number=4, + ), ] def setUp(self) -> None: @@ -61,10 +127,10 @@ def test_topology_roles_suggestion_odd_number_of_planned_units(self): planned_units = 5 cluster_5_conf = self.cluster1_5_nodes_conf() - self.assertCountEqual(ClusterTopology.suggest_roles([], planned_units), self.cm_roles) + self.assertCountEqual(ClusterTopology.suggest_roles([], planned_units, unit_number=0), self.cm_roles) for start_index in range(1, 5): self.assertCountEqual( - ClusterTopology.suggest_roles(cluster_5_conf[:start_index], planned_units), + ClusterTopology.suggest_roles(cluster_5_conf[:start_index], planned_units, unit_number=start_index), self.cm_roles, ) @@ -74,46 +140,57 @@ def test_topology_roles_suggestion_even_number_of_planned_units(self): planned_units = 6 - self.assertCountEqual(ClusterTopology.suggest_roles([], planned_units), self.cm_roles) + self.assertCountEqual(ClusterTopology.suggest_roles([], planned_units, unit_number=0), self.cm_roles) for start_index in range(1, 5): self.assertCountEqual( - ClusterTopology.suggest_roles(cluster_6_conf[:start_index], planned_units), + ClusterTopology.suggest_roles(cluster_6_conf[:start_index], planned_units, unit_number=start_index), self.cm_roles, ) self.assertCountEqual( - ClusterTopology.suggest_roles(cluster_6_conf[:-1], planned_units), self.base_roles + ClusterTopology.suggest_roles(cluster_6_conf[:-1], planned_units, unit_number=5), self.base_roles ) def test_auto_recompute_node_roles_in_cluster_6(self): """Test the automatic suggestion of new roles to an existing node.""" - cluster_conf = self.cluster1_6_nodes_conf() + cluster_conf = {node.name: node for node in self.cluster1_6_nodes_conf()} # remove a cluster manager node - computed_node_to_change = ClusterTopology.node_with_new_roles( - app_name=self.cluster1, - remaining_nodes=[node for node in cluster_conf if node.name != "cm1"], + old_cluster_conf = cluster_conf.copy() + old_cluster_conf.pop("cm1") + new_cluster_conf = ClusterTopology.recompute_nodes_conf( + app_name=self.cluster1, nodes=list(old_cluster_conf.values()) ) - self.assertEqual(computed_node_to_change.name, "data1") - self.assertCountEqual(computed_node_to_change.roles, self.cm_roles) + assert new_cluster_conf["data1"].roles == self.cm_roles + # Assert other remaining nodes unchanged + old_cluster_conf.pop("data1") + new_cluster_conf.pop("data1") + assert old_cluster_conf == new_cluster_conf # remove a data node - computed_node_to_change = ClusterTopology.node_with_new_roles( - app_name=self.cluster1, - remaining_nodes=[node for node in cluster_conf if node.name != "data1"], + old_cluster_conf = cluster_conf.copy() + old_cluster_conf.pop("data1") + new_cluster_conf = ClusterTopology.recompute_nodes_conf( + app_name=self.cluster1, nodes=list(old_cluster_conf.values()) ) - self.assertIsNone(computed_node_to_change) + # Assert all remaining nodes unchanged + assert old_cluster_conf == new_cluster_conf def test_auto_recompute_node_roles_in_cluster_5(self): """Test the automatic suggestion of new roles to an existing node.""" - cluster_conf = self.cluster1_5_nodes_conf() + cluster_conf = {node.name: node for node in self.cluster1_5_nodes_conf()} # remove a cluster manager node - computed_node_to_change = ClusterTopology.node_with_new_roles( - app_name=self.cluster1, - remaining_nodes=[node for node in cluster_conf if node.name != "cm1"], + old_cluster_conf = cluster_conf.copy() + old_cluster_conf.pop("cm1") + new_cluster_conf = ClusterTopology.recompute_nodes_conf( + app_name=self.cluster1, nodes=list(old_cluster_conf.values()) ) - self.assertCountEqual(computed_node_to_change.roles, self.base_roles) + assert new_cluster_conf["cm5"].roles == self.base_roles + # Assert other remaining nodes unchanged + old_cluster_conf.pop("cm5") + new_cluster_conf.pop("cm5") + assert old_cluster_conf == new_cluster_conf def test_auto_recompute_node_roles_in_previous_non_auto_gen_cluster(self): """Test the automatic suggestion of new roles to an existing node.""" @@ -126,7 +203,6 @@ def test_auto_recompute_node_roles_in_previous_non_auto_gen_cluster(self): new_node.roles.append("custom_role") first_cluster_nodes.append(new_node) - # remove a cluster manager node computed_node_to_change = ClusterTopology.recompute_nodes_conf( app_name=self.cluster2, nodes=cluster_conf + first_cluster_nodes, @@ -139,6 +215,7 @@ def test_auto_recompute_node_roles_in_previous_non_auto_gen_cluster(self): roles=self.cm_roles, ip=node.ip, app_name=node.app_name, + unit_number=node.unit_number, temperature=node.temperature, ) self.assertCountEqual(computed_node_to_change, expected) @@ -157,55 +234,6 @@ def test_topology_get_cluster_managers_names(self): ["cm1", "cm2", "cm3", "cm4", "cm5"], ) - def test_topology_nodes_count_by_role(self): - """Test correct mapping role / count of nodes with the role.""" - self.assertDictEqual( - ClusterTopology.nodes_count_by_role(self.cluster1_6_nodes_conf()), - { - "cluster_manager": 5, - "coordinating_only": 6, - "data": 6, - "ingest": 6, - "ml": 6, - }, - ) - - def test_refill_node_with_default_roles(self): - """Test the automatic suggestion of new roles to an existing node.""" - # First test with previously set roles in a cluster - cluster2_nodes = self.cluster2_nodes_conf() - - expected = [] - for node in cluster2_nodes: - expected.append( - Node( - name=node.name, - roles=self.cm_roles, - ip=node.ip, - app_name=node.app_name, - temperature=node.temperature, - ) - ) - expected.sort(key=lambda node: node.name) - refilled = ClusterTopology.refill_node_with_default_roles( - app_name=self.cluster2, - nodes=cluster2_nodes, - ) - refilled.sort(key=lambda node: node.name) - for index in range(len(refilled)): - self.assertEqual(refilled[index], expected[index]) - - # test on auto-gen roles: expected no changes - expected = self.cluster1_5_nodes_conf() - expected.sort(key=lambda node: node.name) - refilled = ClusterTopology.refill_node_with_default_roles( - app_name=self.cluster1, - nodes=self.cluster1_5_nodes_conf(), - ) - refilled.sort(key=lambda node: node.name) - for index in range(len(refilled)): - self.assertEqual(refilled[index], expected[index]) - @patch("charms.opensearch.v0.helper_cluster.ClusterState.shards") def test_state_busy_shards_by_unit(self, shards): """Test the busy shards filtering.""" @@ -227,7 +255,11 @@ def test_state_busy_shards_by_unit(self, shards): def test_node_obj_creation_from_json(self): """Test the creation of a Node object from a dict representation.""" raw_node = Node( - name="cm1", roles=["cluster_manager"], ip="0.0.0.11", app_name=self.cluster1 + name="cm1", + roles=["cluster_manager"], + ip="0.0.0.11", + app_name=self.cluster1, + unit_number=0, ) from_json_node = Node.from_dict( { @@ -235,6 +267,7 @@ def test_node_obj_creation_from_json(self): "roles": ["cluster_manager"], "ip": "0.0.0.11", "app_name": self.cluster1, + "unit_number": 0, } ) diff --git a/tests/unit/lib/test_opensearch_base_charm.py b/tests/unit/lib/test_opensearch_base_charm.py index 78b1f1800..917fd76e5 100644 --- a/tests/unit/lib/test_opensearch_base_charm.py +++ b/tests/unit/lib/test_opensearch_base_charm.py @@ -69,6 +69,7 @@ def setUp(self) -> None: roles=["cluster_manager", "data"], ip="1.1.1.1", app_name="opensearch-ff2z", + unit_number=3, ) self.opensearch.is_failed = MagicMock() self.opensearch.is_failed.return_value = False diff --git a/tests/unit/lib/test_opensearch_peer_clusters.py b/tests/unit/lib/test_opensearch_peer_clusters.py index 66aad09ec..18f6403b4 100644 --- a/tests/unit/lib/test_opensearch_peer_clusters.py +++ b/tests/unit/lib/test_opensearch_peer_clusters.py @@ -123,6 +123,7 @@ def test_validate_roles( roles=["cluster_manager", "data"], ip="1.1.1.1", app_name="logs", + unit_number=int(node.name.split("/")[-1]), ) for node in self.p_units[0:3] ] @@ -137,9 +138,10 @@ def test_validate_roles( roles=["cluster_manager", "data"], ip="1.1.1.1", app_name="logs", + unit_number=int(node.name.split("/")[-1]), ) for node in self.p_units[0:4] - ] + [Node(name="node", roles=["ml"], ip="0.0.0.0", app_name="logs")] + ] + [Node(name="node", roles=["ml"], ip="0.0.0.0", app_name="logs", unit_number=7)] self.peer_cm.validate_roles(nodes=nodes, on_new_unit=False) @patch("ops.model.Model.get_relation") @@ -167,9 +169,10 @@ def test_pre_validate_roles_change( roles=["data"], ip="1.1.1.1", app_name="logs", + unit_number=int(node.name.split("/")[-1]), ) for node in self.p_units - ] + [Node(name="node-5", roles=["data"], ip="2.2.2.2", app_name="logs")] + ] + [Node(name="node-5", roles=["data"], ip="2.2.2.2", app_name="logs", unit_number=5)] self.peer_cm._pre_validate_roles_change(new_roles=["ml"], prev_roles=["data", "ml"]) except OpenSearchProvidedRolesException: self.fail("_pre_validate_roles_change() failed unexpectedly.") @@ -194,6 +197,7 @@ def test_pre_validate_roles_change( roles=["data"], ip="1.1.1.1", app_name="logs", + unit_number=int(node.name.split("/")[-1]), ) for node in self.p_units ] From f4b23ae3355b787529095597badcd9720cca5955 Mon Sep 17 00:00:00 2001 From: Carl Csaposs Date: Mon, 25 Mar 2024 13:18:28 +0100 Subject: [PATCH 2/6] Add comment about omitting unit 2 --- tests/unit/lib/test_helper_cluster.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/unit/lib/test_helper_cluster.py b/tests/unit/lib/test_helper_cluster.py index 98804a991..4eba334d5 100644 --- a/tests/unit/lib/test_helper_cluster.py +++ b/tests/unit/lib/test_helper_cluster.py @@ -36,6 +36,7 @@ def cluster1_5_nodes_conf(self) -> List[Node]: app_name=self.cluster1, unit_number=1, ), + # Unit number 2 omitted on purpose (unit numbers are not guaranteed to be sequential on VM charms) Node( name="cm3", roles=self.cm_roles, @@ -127,10 +128,14 @@ def test_topology_roles_suggestion_odd_number_of_planned_units(self): planned_units = 5 cluster_5_conf = self.cluster1_5_nodes_conf() - self.assertCountEqual(ClusterTopology.suggest_roles([], planned_units, unit_number=0), self.cm_roles) + self.assertCountEqual( + ClusterTopology.suggest_roles([], planned_units, unit_number=0), self.cm_roles + ) for start_index in range(1, 5): self.assertCountEqual( - ClusterTopology.suggest_roles(cluster_5_conf[:start_index], planned_units, unit_number=start_index), + ClusterTopology.suggest_roles( + cluster_5_conf[:start_index], planned_units, unit_number=start_index + ), self.cm_roles, ) @@ -140,15 +145,20 @@ def test_topology_roles_suggestion_even_number_of_planned_units(self): planned_units = 6 - self.assertCountEqual(ClusterTopology.suggest_roles([], planned_units, unit_number=0), self.cm_roles) + self.assertCountEqual( + ClusterTopology.suggest_roles([], planned_units, unit_number=0), self.cm_roles + ) for start_index in range(1, 5): self.assertCountEqual( - ClusterTopology.suggest_roles(cluster_6_conf[:start_index], planned_units, unit_number=start_index), + ClusterTopology.suggest_roles( + cluster_6_conf[:start_index], planned_units, unit_number=start_index + ), self.cm_roles, ) self.assertCountEqual( - ClusterTopology.suggest_roles(cluster_6_conf[:-1], planned_units, unit_number=5), self.base_roles + ClusterTopology.suggest_roles(cluster_6_conf[:-1], planned_units, unit_number=5), + self.base_roles, ) def test_auto_recompute_node_roles_in_cluster_6(self): From 4579a1bc4a47dc45dc33106b02773b46be964a33 Mon Sep 17 00:00:00 2001 From: Carl Csaposs Date: Mon, 25 Mar 2024 13:45:28 +0100 Subject: [PATCH 3/6] Revert changes to `suggest_roles` Highest unit number cannot always be determined (e.g. if unit hasn't joined peer relation). Unable to use `planned_units` since unit numbers are not necessarily sequential --- lib/charms/opensearch/v0/helper_cluster.py | 32 +++++++++++++++++-- .../opensearch/v0/opensearch_base_charm.py | 2 +- tests/unit/lib/test_helper_cluster.py | 18 +++-------- 3 files changed, 35 insertions(+), 17 deletions(-) diff --git a/lib/charms/opensearch/v0/helper_cluster.py b/lib/charms/opensearch/v0/helper_cluster.py index 496187e65..7a36cdef1 100644 --- a/lib/charms/opensearch/v0/helper_cluster.py +++ b/lib/charms/opensearch/v0/helper_cluster.py @@ -35,18 +35,23 @@ class ClusterTopology: """Class for creating the best possible configuration for a Node.""" @staticmethod - def suggest_roles(nodes: List[Node], planned_units: int, unit_number: int) -> List[str]: + def suggest_roles(nodes: List[Node], planned_units: int) -> List[str]: """Get roles for a Node. + This method should be read in the context of a "rolling" start - + only 1 unit at a time will call this. + For now, we don't allow to end-user control roles. The logic here is, if number of planned units is: — odd: "all" the nodes are cm_eligible nodes. — even: "all - 1" are cm_eligible and 1 data node. """ + max_cms = ClusterTopology.max_cluster_manager_nodes(planned_units) + base_roles = ["data", "ingest", "ml", "coordinating_only"] full_roles = base_roles + ["cluster_manager"] - highest_unit_number = max(node.unit_number for node in nodes) - if planned_units % 2 == 0 and unit_number == highest_unit_number: + nodes_by_roles = ClusterTopology.nodes_count_by_role(nodes) + if nodes_by_roles.get("cluster_manager", 0) == max_cms: return base_roles return full_roles @@ -98,6 +103,15 @@ def recompute_nodes_conf(app_name: str, nodes: List[Node]) -> Dict[str, Node]: ) return nodes_by_name + @staticmethod + def max_cluster_manager_nodes(planned_units) -> int: + """Get the max number of CM nodes in a cluster.""" + max_managers = planned_units + if planned_units % 2 == 0: + max_managers -= 1 + + return max_managers + @staticmethod def get_cluster_managers_ips(nodes: List[Node]) -> List[str]: """Get the nodes of cluster manager eligible nodes.""" @@ -118,6 +132,18 @@ def get_cluster_managers_names(nodes: List[Node]) -> List[str]: return result + @staticmethod + def nodes_count_by_role(nodes: List[Node]) -> Dict[str, int]: + """Count number of nodes by role.""" + result = {} + for node in nodes: + for role in node.roles: + if role not in result: + result[role] = 0 + result[role] += 1 + + return result + @staticmethod def nodes_by_role(nodes: List[Node]) -> Dict[str, List[Node]]: """Get list of nodes by role.""" diff --git a/lib/charms/opensearch/v0/opensearch_base_charm.py b/lib/charms/opensearch/v0/opensearch_base_charm.py index a4a75ef21..62e822b50 100644 --- a/lib/charms/opensearch/v0/opensearch_base_charm.py +++ b/lib/charms/opensearch/v0/opensearch_base_charm.py @@ -1063,7 +1063,7 @@ def _set_node_conf(self, nodes: List[Node]) -> None: computed_roles = ( update_conf.roles if update_conf - else ClusterTopology.suggest_roles(nodes, self.app.planned_units(), self.unit_id) + else ClusterTopology.suggest_roles(nodes, self.app.planned_units()) ) cm_names = ClusterTopology.get_cluster_managers_names(nodes) diff --git a/tests/unit/lib/test_helper_cluster.py b/tests/unit/lib/test_helper_cluster.py index 4eba334d5..95da54dbc 100644 --- a/tests/unit/lib/test_helper_cluster.py +++ b/tests/unit/lib/test_helper_cluster.py @@ -128,14 +128,10 @@ def test_topology_roles_suggestion_odd_number_of_planned_units(self): planned_units = 5 cluster_5_conf = self.cluster1_5_nodes_conf() - self.assertCountEqual( - ClusterTopology.suggest_roles([], planned_units, unit_number=0), self.cm_roles - ) + self.assertCountEqual(ClusterTopology.suggest_roles([], planned_units), self.cm_roles) for start_index in range(1, 5): self.assertCountEqual( - ClusterTopology.suggest_roles( - cluster_5_conf[:start_index], planned_units, unit_number=start_index - ), + ClusterTopology.suggest_roles(cluster_5_conf[:start_index], planned_units), self.cm_roles, ) @@ -145,19 +141,15 @@ def test_topology_roles_suggestion_even_number_of_planned_units(self): planned_units = 6 - self.assertCountEqual( - ClusterTopology.suggest_roles([], planned_units, unit_number=0), self.cm_roles - ) + self.assertCountEqual(ClusterTopology.suggest_roles([], planned_units), self.cm_roles) for start_index in range(1, 5): self.assertCountEqual( - ClusterTopology.suggest_roles( - cluster_6_conf[:start_index], planned_units, unit_number=start_index - ), + ClusterTopology.suggest_roles(cluster_6_conf[:start_index], planned_units), self.cm_roles, ) self.assertCountEqual( - ClusterTopology.suggest_roles(cluster_6_conf[:-1], planned_units, unit_number=5), + ClusterTopology.suggest_roles(cluster_6_conf[:-1], planned_units), self.base_roles, ) From bba518dcd077289df6118b1a60417c51743d95ce Mon Sep 17 00:00:00 2001 From: Carl Csaposs Date: Mon, 25 Mar 2024 13:45:46 +0100 Subject: [PATCH 4/6] fix lint --- tests/unit/lib/test_helper_cluster.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unit/lib/test_helper_cluster.py b/tests/unit/lib/test_helper_cluster.py index 95da54dbc..6f93dc13f 100644 --- a/tests/unit/lib/test_helper_cluster.py +++ b/tests/unit/lib/test_helper_cluster.py @@ -36,7 +36,8 @@ def cluster1_5_nodes_conf(self) -> List[Node]: app_name=self.cluster1, unit_number=1, ), - # Unit number 2 omitted on purpose (unit numbers are not guaranteed to be sequential on VM charms) + # Unit number 2 omitted on purpose + # (unit numbers are not guaranteed to be sequential on VM charms) Node( name="cm3", roles=self.cm_roles, From 52321f9c3f3cf585d6757f722426e1523d63002c Mon Sep 17 00:00:00 2001 From: Carl Csaposs Date: Mon, 25 Mar 2024 13:47:38 +0100 Subject: [PATCH 5/6] Add handling for empty nodes max([]) raises ValueError --- lib/charms/opensearch/v0/helper_cluster.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/charms/opensearch/v0/helper_cluster.py b/lib/charms/opensearch/v0/helper_cluster.py index 7a36cdef1..9e5c65cd3 100644 --- a/lib/charms/opensearch/v0/helper_cluster.py +++ b/lib/charms/opensearch/v0/helper_cluster.py @@ -75,6 +75,8 @@ def get_cluster_settings( @staticmethod def recompute_nodes_conf(app_name: str, nodes: List[Node]) -> Dict[str, Node]: """Recompute the configuration of all the nodes (cluster set to auto-generate roles).""" + if not nodes: + return {} nodes_by_name = {} current_cluster_nodes = [] for node in nodes: From 0b97602979d4c39ee56c0ff081b1e7f24ea9d5ab Mon Sep 17 00:00:00 2001 From: Carl Csaposs Date: Mon, 25 Mar 2024 13:55:50 +0100 Subject: [PATCH 6/6] Add logs --- lib/charms/opensearch/v0/helper_cluster.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/charms/opensearch/v0/helper_cluster.py b/lib/charms/opensearch/v0/helper_cluster.py index 9e5c65cd3..58bf0dbab 100644 --- a/lib/charms/opensearch/v0/helper_cluster.py +++ b/lib/charms/opensearch/v0/helper_cluster.py @@ -77,6 +77,7 @@ def recompute_nodes_conf(app_name: str, nodes: List[Node]) -> Dict[str, Node]: """Recompute the configuration of all the nodes (cluster set to auto-generate roles).""" if not nodes: return {} + logger.debug(f"Roles before re-balancing {({node.name: node.roles for node in nodes})=}") nodes_by_name = {} current_cluster_nodes = [] for node in nodes: @@ -103,6 +104,9 @@ def recompute_nodes_conf(app_name: str, nodes: List[Node]) -> Dict[str, Node]: unit_number=node.unit_number, temperature=node.temperature, ) + logger.debug( + f"Roles after re-balancing {({name: node.roles for name, node in nodes_by_name.items()})=}" + ) return nodes_by_name @staticmethod