Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wait until after peer relation joined before acquiring lock #276

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions lib/charms/opensearch/v0/opensearch_base_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -810,6 +810,10 @@ def _handle_change_to_main_orchestrator_if_needed(

def _start_opensearch(self, event: _StartOpenSearch) -> None: # noqa: C901
"""Start OpenSearch, with a generated or passed conf, if all resources configured."""
if not self._can_service_start():
self.node_lock.release()
event.defer()
return
if not self.node_lock.acquired:
# (Attempt to acquire lock even if `event.ignore_lock`)
if event.ignore_lock:
Expand All @@ -827,11 +831,6 @@ def _start_opensearch(self, event: _StartOpenSearch) -> None: # noqa: C901
event.defer()
return

if not self._can_service_start():
self.node_lock.release()
event.defer()
return

if self.opensearch.is_failed():
self.node_lock.release()
self.status.set(BlockedStatus(ServiceStartError))
Expand Down Expand Up @@ -1287,7 +1286,13 @@ def _recompute_roles_if_needed(self, event: RelationChangedEvent):
try:
nodes = self._get_nodes(self.opensearch.is_node_up())
if len(nodes) < self.app.planned_units():
if self._is_peer_rel_changed_deferred:
# We already deferred this event during this Juju event. Retry on the next
# Juju event.
return
event.defer()
# If the handler is called again within this Juju hook, we will abandon the event
self._is_peer_rel_changed_deferred = True
return

self._compute_and_broadcast_updated_topology(nodes)
Expand Down
21 changes: 21 additions & 0 deletions lib/charms/opensearch/v0/opensearch_locking.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import typing

import ops
from charms.opensearch.v0.constants_charm import PeerRelationName
from charms.opensearch.v0.helper_cluster import ClusterTopology
from charms.opensearch.v0.opensearch_exceptions import OpenSearchHttpError

Expand Down Expand Up @@ -221,6 +222,26 @@ def acquired(self) -> bool: # noqa: C901
host = self._charm.unit_ip
else:
host = None
if (
self._charm.app.planned_units() > 1
and (relation := self._charm.model.get_relation(PeerRelationName))
and not relation.units
):
# On initial startup (e.g. scaling up, on the new unit), `self._charm.alt_hosts` will
# be empty since it uses `Relation.units` on the `PeerRelationName`.
# Initial startup event sequence (some events omitted for brevity):
# - install
# - peer-relation-created
# - start
# - peer-relation-joined (e.g. for unit 2)
# - peer-relation-changed
# - peer-relation-joined (e.g. for unit 0)
# Until the peer relation joined event, `Relation.units` will be empty
# Therefore, before the first peer relation joined event, we should avoid acquiring the
# lock since otherwise we would fall back to the peer databag lock even if OpenSearch
# nodes were online.
logger.debug("[Node lock] Waiting for peer units before acquiring lock")
return False
alt_hosts = [host for host in self._charm.alt_hosts if self._opensearch.is_node_up(host)]
if host or alt_hosts:
logger.debug("[Node lock] 1+ opensearch nodes online")
Expand Down
Loading