diff --git a/lib/charms/opensearch/v0/opensearch_base_charm.py b/lib/charms/opensearch/v0/opensearch_base_charm.py
index 56081f80d..df4a536f8 100644
--- a/lib/charms/opensearch/v0/opensearch_base_charm.py
+++ b/lib/charms/opensearch/v0/opensearch_base_charm.py
@@ -428,6 +428,7 @@ def _on_peer_relation_joined(self, event: RelationJoinedEvent):
 
     def _on_peer_relation_changed(self, event: RelationChangedEvent):
         """Handle peer relation changes."""
+        logger.warning(f"FOO: {event.relation.data=}")
         if (
             self.unit.is_leader()
             and self.opensearch.is_node_up()
@@ -810,6 +811,10 @@ def _handle_change_to_main_orchestrator_if_needed(
 
     def _start_opensearch(self, event: _StartOpenSearch) -> None:  # noqa: C901
         """Start OpenSearch, with a generated or passed conf, if all resources configured."""
+        if not self._can_service_start():
+            self.node_lock.release()
+            event.defer()
+            return
         if not self.node_lock.acquired:
             # (Attempt to acquire lock even if `event.ignore_lock`)
             if event.ignore_lock:
@@ -827,11 +832,6 @@ def _start_opensearch(self, event: _StartOpenSearch) -> None:  # noqa: C901
                 event.defer()
                 return
 
-        if not self._can_service_start():
-            self.node_lock.release()
-            event.defer()
-            return
-
         if self.opensearch.is_failed():
             self.node_lock.release()
             self.status.set(BlockedStatus(ServiceStartError))
@@ -1287,7 +1287,13 @@ def _recompute_roles_if_needed(self, event: RelationChangedEvent):
         try:
             nodes = self._get_nodes(self.opensearch.is_node_up())
             if len(nodes) < self.app.planned_units():
+                if self._is_peer_rel_changed_deferred:
+                    # We already deferred this event during this Juju event. Retry on the next
+                    # Juju event.
+                    return
                 event.defer()
+                # If the handler is called again within this Juju hook, we will abandon the event
+                self._is_peer_rel_changed_deferred = True
                 return
 
             self._compute_and_broadcast_updated_topology(nodes)
diff --git a/lib/charms/opensearch/v0/opensearch_locking.py b/lib/charms/opensearch/v0/opensearch_locking.py
index ab2763d36..450538aba 100644
--- a/lib/charms/opensearch/v0/opensearch_locking.py
+++ b/lib/charms/opensearch/v0/opensearch_locking.py
@@ -8,6 +8,7 @@
 import typing
 
 import ops
+from charms.opensearch.v0.constants_charm import PeerRelationName
 from charms.opensearch.v0.helper_cluster import ClusterTopology
 from charms.opensearch.v0.opensearch_exceptions import OpenSearchHttpError
 
@@ -221,6 +222,26 @@ def acquired(self) -> bool:  # noqa: C901
             host = self._charm.unit_ip
         else:
             host = None
+        if (
+            self._charm.app.planned_units() > 1
+            and (relation := self._charm.model.get_relation(PeerRelationName))
+            and not relation.units
+        ):
+            # On initial startup (e.g. scaling up, on the new unit), `self._charm.alt_hosts` will
+            # be empty since it uses `Relation.units` on the `PeerRelationName`.
+            # Initial startup event sequence (some events omitted for brevity):
+            # - install
+            # - peer-relation-created
+            # - start
+            # - peer-relation-joined (e.g. for unit 2)
+            # - peer-relation-changed
+            # - peer-relation-joined (e.g. for unit 0)
+            # Until the peer relation joined event, `Relation.units` will be empty.
+            # Therefore, before the first peer relation joined event, we should avoid acquiring the
+            # lock since otherwise we would fall back to the peer databag lock even if OpenSearch
+            # nodes were online.
+            logger.debug("[Node lock] Waiting for peer units before acquiring lock")
+            return False
         alt_hosts = [host for host in self._charm.alt_hosts if self._opensearch.is_node_up(host)]
         if host or alt_hosts:
             logger.debug("[Node lock] 1+ opensearch nodes online")