From b3ef11483894b7439e9ef7b72c06b6435c061c10 Mon Sep 17 00:00:00 2001 From: Mehdi-Bendriss Date: Fri, 13 Sep 2024 21:22:24 +0200 Subject: [PATCH 1/4] Fix edge case failover --- .../v0/opensearch_relation_peer_cluster.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py b/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py index 6c6e5e37e..885173b33 100644 --- a/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py +++ b/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py @@ -2,6 +2,7 @@ # See LICENSE file for licensing details. """Peer clusters relation related classes for OpenSearch.""" +import copy import json import logging from typing import TYPE_CHECKING, Any, Dict, List, MutableMapping, Optional, Union @@ -690,22 +691,27 @@ def _on_peer_cluster_relation_departed(self, event: RelationDepartedEvent): ) # delete the orchestrator that triggered this event - orchestrators.delete(event_src_cluster_type) + new_orchestrators = copy.deepcopy(orchestrators) + new_orchestrators.delete(event_src_cluster_type) # the 'main' cluster orchestrator is the one being removed failover_promoted = False if event_src_cluster_type == "main": + # The current is not the elected failover if not orchestrators.failover_app: self.charm.status.set( BlockedStatus( "Main-cluster-orchestrator removed, and no failover cluster related." ) ) - elif orchestrators.failover_app.id == deployment_desc.app.id: - self._promote_failover(orchestrators, cms) + elif ( + new_orchestrators.failover_app + and new_orchestrators.failover_app.id == deployment_desc.app.id + ): + self._promote_failover(new_orchestrators, cms) failover_promoted = True - self.charm.peers_data.put_object(Scope.APP, "orchestrators", orchestrators.to_dict()) + self.charm.peers_data.put_object(Scope.APP, "orchestrators", new_orchestrators.to_dict()) # clear previously set errors due to this relation self._clear_errors(f"error_from_provider-{event.relation.id}") @@ -715,7 +721,10 @@ def _on_peer_cluster_relation_departed(self, event: RelationDepartedEvent): if ( self.charm.opensearch_peer_cm.deployment_desc().typ == DeploymentType.OTHER or deployment_desc.app.id - not in [orchestrators.main_app.id, orchestrators.failover_app.id] + not in [ + getattr(orchestrators.main_app, "id", None), + getattr(orchestrators.failover_app, "id", None), + ] ): return From f34a6da132b1f68d17db76016d73a0f80fbbd543 Mon Sep 17 00:00:00 2001 From: Mehdi-Bendriss Date: Fri, 13 Sep 2024 21:46:09 +0200 Subject: [PATCH 2/4] Fix edge case failover --- .../opensearch/v0/opensearch_relation_peer_cluster.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py b/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py index 885173b33..846b6adf3 100644 --- a/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py +++ b/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py @@ -335,7 +335,10 @@ def _put_fleet_apps( cluster_fleet_apps.update({current_app.app.id: current_app.to_dict()}) if p_cluster_app: - cluster_fleet_apps.update({p_cluster_app.app.id: p_cluster_app.to_dict()}) + if p_cluster_app.planned_units == 0: # app removal + cluster_fleet_apps.pop(p_cluster_app.app.id, None) + else: + cluster_fleet_apps.update({p_cluster_app.app.id: p_cluster_app.to_dict()}) for rel_id in target_relation_ids: self.put_in_rel( @@ -350,7 +353,10 @@ def _put_fleet_apps( cluster_fleet_apps_rels = ( self.charm.peers_data.get_object(Scope.APP, "cluster_fleet_apps_rels") or {} ) - cluster_fleet_apps_rels.update({str(trigger_rel_id): p_cluster_app.to_dict()}) + if p_cluster_app.planned_units == 0: + cluster_fleet_apps_rels.pop(str(trigger_rel_id)) + else: + cluster_fleet_apps_rels.update({str(trigger_rel_id): p_cluster_app.to_dict()}) self.charm.peers_data.put_object( Scope.APP, "cluster_fleet_apps_rels", cluster_fleet_apps_rels From 213178e7d59cdfd6bb508f8c898050611b22397c Mon Sep 17 00:00:00 2001 From: Mehdi-Bendriss Date: Fri, 13 Sep 2024 22:15:23 +0200 Subject: [PATCH 3/4] Fix edge case failover --- .../v0/opensearch_relation_peer_cluster.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py b/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py index 846b6adf3..8cb11c67b 100644 --- a/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py +++ b/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py @@ -696,6 +696,9 @@ def _on_peer_cluster_relation_departed(self, event: RelationDepartedEvent): "main" if event.relation.id == orchestrators.main_rel_id else "failover" ) + # remove the departing app from the peer_cluster_apps + self._update_fleet_apps_on_departure(orchestrators, event_src_cluster_type) + # delete the orchestrator that triggered this event new_orchestrators = copy.deepcopy(orchestrators) new_orchestrators.delete(event_src_cluster_type) @@ -748,6 +751,19 @@ def _on_peer_cluster_relation_departed(self, event: RelationDepartedEvent): self.put_in_rel(data={"orchestrators": rel_orchestrators.to_str()}, rel_id=rel_id) + def _update_fleet_apps_on_departure( + self, orchestrators: PeerClusterOrchestrators, departing_orchestrator: str + ): + """Delete the current departing orchestrator from the cluster fleet apps.""" + cluster_fleet_apps = ( + self.charm.peers_data.get_object(Scope.APP, "cluster_fleet_apps") or {} + ) + if departing_orchestrator == "main": + cluster_fleet_apps.pop(orchestrators.main_app.id) + else: + cluster_fleet_apps.pop(orchestrators.failover_app.id) + self.charm.peers_data.put_object(Scope.APP, "cluster_fleet_apps", cluster_fleet_apps) + def _promote_failover(self, orchestrators: PeerClusterOrchestrators, cms: List[Node]) -> None: """Handle the departure of the main orchestrator.""" # current cluster is failover From cb5a85a3060461fc22765f5727138fdb600bffc5 Mon Sep 17 00:00:00 2001 From: Mehdi-Bendriss Date: Fri, 13 Sep 2024 22:24:36 +0200 Subject: [PATCH 4/4] Fix edge case failover --- lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py b/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py index 8cb11c67b..f40486b56 100644 --- a/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py +++ b/lib/charms/opensearch/v0/opensearch_relation_peer_cluster.py @@ -759,9 +759,9 @@ def _update_fleet_apps_on_departure( self.charm.peers_data.get_object(Scope.APP, "cluster_fleet_apps") or {} ) if departing_orchestrator == "main": - cluster_fleet_apps.pop(orchestrators.main_app.id) + cluster_fleet_apps.pop(orchestrators.main_app.id, None) else: - cluster_fleet_apps.pop(orchestrators.failover_app.id) + cluster_fleet_apps.pop(orchestrators.failover_app.id, None) self.charm.peers_data.put_object(Scope.APP, "cluster_fleet_apps", cluster_fleet_apps) def _promote_failover(self, orchestrators: PeerClusterOrchestrators, cms: List[Node]) -> None: