From c23ff0da8ccc2b26d6ce2f5f5560ae47546fd6dc Mon Sep 17 00:00:00 2001 From: Carl Csaposs Date: Thu, 2 May 2024 12:11:40 +0200 Subject: [PATCH 1/6] Prevent duplicate peer-relation-changed deferred events Follow up to #266 --- lib/charms/opensearch/v0/opensearch_base_charm.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/charms/opensearch/v0/opensearch_base_charm.py b/lib/charms/opensearch/v0/opensearch_base_charm.py index 56081f80d..d2ae8ae77 100644 --- a/lib/charms/opensearch/v0/opensearch_base_charm.py +++ b/lib/charms/opensearch/v0/opensearch_base_charm.py @@ -1287,7 +1287,13 @@ def _recompute_roles_if_needed(self, event: RelationChangedEvent): try: nodes = self._get_nodes(self.opensearch.is_node_up()) if len(nodes) < self.app.planned_units(): + if self._is_peer_rel_changed_deferred: + # We already deferred this event during this Juju event. Retry on the next + # Juju event. + return event.defer() + # If the handler is called again within this Juju hook, we will abandon the event + self._is_peer_rel_changed_deferred = True return self._compute_and_broadcast_updated_topology(nodes) From 21965085936ab55865e04427a2c16f4fb0832d2e Mon Sep 17 00:00:00 2001 From: Carl Csaposs Date: Tue, 30 Apr 2024 16:57:43 +0200 Subject: [PATCH 2/6] Wait until after peer relation joined before acquiring lock ## Issue During initial startup (i.e. scale-up), a unit will request a lock via the peer databag until it gets a peer-relation-joined event and learns that other units of OpenSearch are online ## Solution Wait until after the first peer-relation-joined event before trying to acquire lock (so that if [enough] units of opensearch are online, we request the opensearch lock instead of peer databag lock) --- .../opensearch/v0/opensearch_locking.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/lib/charms/opensearch/v0/opensearch_locking.py b/lib/charms/opensearch/v0/opensearch_locking.py index ab2763d36..450538aba 100644 --- a/lib/charms/opensearch/v0/opensearch_locking.py +++ b/lib/charms/opensearch/v0/opensearch_locking.py @@ -8,6 +8,7 @@ import typing import ops +from charms.opensearch.v0.constants_charm import PeerRelationName from charms.opensearch.v0.helper_cluster import ClusterTopology from charms.opensearch.v0.opensearch_exceptions import OpenSearchHttpError @@ -221,6 +222,26 @@ def acquired(self) -> bool: # noqa: C901 host = self._charm.unit_ip else: host = None + if ( + self._charm.app.planned_units() > 1 + and (relation := self._charm.model.get_relation(PeerRelationName)) + and not relation.units + ): + # On initial startup (e.g. scaling up, on the new unit), `self._charm.alt_hosts` will + # be empty since it uses `Relation.units` on the `PeerRelationName`. + # Initial startup event sequence (some events omitted for brevity): + # - install + # - peer-relation-created + # - start + # - peer-relation-joined (e.g. for unit 2) + # - peer-relation-changed + # - peer-relation-joined (e.g. for unit 0) + # Until the peer relation joined event, `Relation.units` will be empty + # Therefore, before the first peer relation joined event, we should avoid acquiring the + # lock since otherwise we would fall back to the peer databag lock even if OpenSearch + # nodes were online. + logger.debug("[Node lock] Waiting for peer units before acquiring lock") + return False alt_hosts = [host for host in self._charm.alt_hosts if self._opensearch.is_node_up(host)] if host or alt_hosts: logger.debug("[Node lock] 1+ opensearch nodes online") From 83052e4f28f405ce9d971831ef1c16676ac216de Mon Sep 17 00:00:00 2001 From: Carl Csaposs Date: Wed, 1 May 2024 15:11:43 +0200 Subject: [PATCH 3/6] TEST trying using lock peer relation for alt_hosts --- lib/charms/opensearch/v0/opensearch_locking.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/lib/charms/opensearch/v0/opensearch_locking.py b/lib/charms/opensearch/v0/opensearch_locking.py index 450538aba..015d8e983 100644 --- a/lib/charms/opensearch/v0/opensearch_locking.py +++ b/lib/charms/opensearch/v0/opensearch_locking.py @@ -5,11 +5,12 @@ import json import logging import os +import random import typing import ops -from charms.opensearch.v0.constants_charm import PeerRelationName from charms.opensearch.v0.helper_cluster import ClusterTopology +from charms.opensearch.v0.helper_networking import units_ips from charms.opensearch.v0.opensearch_exceptions import OpenSearchHttpError if typing.TYPE_CHECKING: @@ -224,7 +225,7 @@ def acquired(self) -> bool: # noqa: C901 host = None if ( self._charm.app.planned_units() > 1 - and (relation := self._charm.model.get_relation(PeerRelationName)) + and (relation := self._charm.model.get_relation(_PeerRelationLock._ENDPOINT_NAME)) and not relation.units ): # On initial startup (e.g. scaling up, on the new unit), `self._charm.alt_hosts` will @@ -242,7 +243,12 @@ def acquired(self) -> bool: # noqa: C901 # nodes were online. logger.debug("[Node lock] Waiting for peer units before acquiring lock") return False - alt_hosts = [host for host in self._charm.alt_hosts if self._opensearch.is_node_up(host)] + alt_hosts = [ + host + for host in units_ips(self._charm, _PeerRelationLock._ENDPOINT_NAME) + if self._opensearch.is_node_up(host) + ] + random.shuffle(alt_hosts) if host or alt_hosts: logger.debug("[Node lock] 1+ opensearch nodes online") try: From fd9169537059ef846aba341de60960d8647c170b Mon Sep 17 00:00:00 2001 From: Carl Csaposs Date: Wed, 1 May 2024 15:52:55 +0200 Subject: [PATCH 4/6] Revert "TEST trying using lock peer relation for alt_hosts" This reverts commit 34a766e17ecabf1f64371d6a5b6134045a6e1137. --- lib/charms/opensearch/v0/opensearch_locking.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/lib/charms/opensearch/v0/opensearch_locking.py b/lib/charms/opensearch/v0/opensearch_locking.py index 015d8e983..450538aba 100644 --- a/lib/charms/opensearch/v0/opensearch_locking.py +++ b/lib/charms/opensearch/v0/opensearch_locking.py @@ -5,12 +5,11 @@ import json import logging import os -import random import typing import ops +from charms.opensearch.v0.constants_charm import PeerRelationName from charms.opensearch.v0.helper_cluster import ClusterTopology -from charms.opensearch.v0.helper_networking import units_ips from charms.opensearch.v0.opensearch_exceptions import OpenSearchHttpError if typing.TYPE_CHECKING: @@ -225,7 +224,7 @@ def acquired(self) -> bool: # noqa: C901 host = None if ( self._charm.app.planned_units() > 1 - and (relation := self._charm.model.get_relation(_PeerRelationLock._ENDPOINT_NAME)) + and (relation := self._charm.model.get_relation(PeerRelationName)) and not relation.units ): # On initial startup (e.g. scaling up, on the new unit), `self._charm.alt_hosts` will @@ -243,12 +242,7 @@ def acquired(self) -> bool: # noqa: C901 # nodes were online. logger.debug("[Node lock] Waiting for peer units before acquiring lock") return False - alt_hosts = [ - host - for host in units_ips(self._charm, _PeerRelationLock._ENDPOINT_NAME) - if self._opensearch.is_node_up(host) - ] - random.shuffle(alt_hosts) + alt_hosts = [host for host in self._charm.alt_hosts if self._opensearch.is_node_up(host)] if host or alt_hosts: logger.debug("[Node lock] 1+ opensearch nodes online") try: From 04d0c44c6af0ff67747fc490de9023cf004bce13 Mon Sep 17 00:00:00 2001 From: Carl Csaposs Date: Wed, 1 May 2024 15:56:06 +0200 Subject: [PATCH 5/6] Check if `_can_service_start` before acquiring lock --- lib/charms/opensearch/v0/opensearch_base_charm.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/charms/opensearch/v0/opensearch_base_charm.py b/lib/charms/opensearch/v0/opensearch_base_charm.py index d2ae8ae77..f7e334884 100644 --- a/lib/charms/opensearch/v0/opensearch_base_charm.py +++ b/lib/charms/opensearch/v0/opensearch_base_charm.py @@ -810,6 +810,10 @@ def _handle_change_to_main_orchestrator_if_needed( def _start_opensearch(self, event: _StartOpenSearch) -> None: # noqa: C901 """Start OpenSearch, with a generated or passed conf, if all resources configured.""" + if not self._can_service_start(): + self.node_lock.release() + event.defer() + return if not self.node_lock.acquired: # (Attempt to acquire lock even if `event.ignore_lock`) if event.ignore_lock: @@ -827,11 +831,6 @@ def _start_opensearch(self, event: _StartOpenSearch) -> None: # noqa: C901 event.defer() return - if not self._can_service_start(): - self.node_lock.release() - event.defer() - return - if self.opensearch.is_failed(): self.node_lock.release() self.status.set(BlockedStatus(ServiceStartError)) From 9091273e56ab6643f76308f11cb2f836fa25e763 Mon Sep 17 00:00:00 2001 From: Carl Csaposs Date: Thu, 2 May 2024 13:03:01 +0200 Subject: [PATCH 6/6] TEMP: log relation data --- lib/charms/opensearch/v0/opensearch_base_charm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/charms/opensearch/v0/opensearch_base_charm.py b/lib/charms/opensearch/v0/opensearch_base_charm.py index f7e334884..df4a536f8 100644 --- a/lib/charms/opensearch/v0/opensearch_base_charm.py +++ b/lib/charms/opensearch/v0/opensearch_base_charm.py @@ -428,6 +428,7 @@ def _on_peer_relation_joined(self, event: RelationJoinedEvent): def _on_peer_relation_changed(self, event: RelationChangedEvent): """Handle peer relation changes.""" + logger.warning(f"FOO: {event.relation.data=}") if ( self.unit.is_leader() and self.opensearch.is_node_up()