Skip to content

Commit

Permalink
Merge pull request #35 from canonical/DPE-5236-update-tls-certs-on-no…
Browse files Browse the repository at this point in the history
…de-port-change

[DPE-5307][DPE-5236] update tls certs on node port change + fix flakey TLS tests
  • Loading branch information
MiaAltieri authored Oct 9, 2024
2 parents 14cbfa5 + 3bc8657 commit cf96460
Show file tree
Hide file tree
Showing 7 changed files with 225 additions and 81 deletions.
24 changes: 13 additions & 11 deletions lib/charms/mongodb/v1/mongodb_tls.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import logging
import re
import socket
from typing import Dict, List, Optional, Tuple
from typing import Optional, Tuple

from charms.tls_certificates_interface.v3.tls_certificates import (
CertificateAvailableEvent,
Expand Down Expand Up @@ -42,7 +42,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 4
LIBPATCH = 5

WAIT_CERT_UPDATE = "wait-cert-updated"

Expand Down Expand Up @@ -105,9 +105,6 @@ def request_certificate(
internal: bool,
):
"""Request TLS certificate."""
if not self.charm.model.get_relation(Config.TLS.TLS_PEER_RELATION):
return

if param is None:
key = generate_private_key()
else:
Expand Down Expand Up @@ -234,7 +231,7 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None:
self.charm.cluster.update_ca_secret(new_ca=event.ca)
self.charm.config_server.update_ca_secret(new_ca=event.ca)

if self.waiting_for_both_certs():
if self.is_waiting_for_both_certs():
logger.debug(
"Defer till both internal and external TLS certificates available to avoid second restart."
)
Expand All @@ -256,7 +253,7 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None:
# clear waiting status if db service is ready
self.charm.status.set_and_share_status(ActiveStatus())

def waiting_for_both_certs(self):
def is_waiting_for_both_certs(self) -> bool:
"""Returns a boolean indicating whether additional certs are needed."""
if not self.get_tls_secret(internal=True, label_name=Config.TLS.SECRET_CERT_LABEL):
logger.debug("Waiting for internal certificate.")
Expand Down Expand Up @@ -295,6 +292,10 @@ def _on_certificate_expiring(self, event: CertificateExpiringEvent) -> None:
return

logger.debug("Generating a new Certificate Signing Request.")
self.request_new_certificates(internal)

def request_new_certificates(self, internal: bool) -> None:
"""Requests the renewel of a new certificate."""
key = self.get_tls_secret(internal, Config.TLS.SECRET_KEY_LABEL).encode("utf-8")
old_csr = self.get_tls_secret(internal, Config.TLS.SECRET_CSR_LABEL).encode("utf-8")
sans = self.get_new_sans()
Expand All @@ -313,8 +314,9 @@ def _on_certificate_expiring(self, event: CertificateExpiringEvent) -> None:
)

self.set_tls_secret(internal, Config.TLS.SECRET_CSR_LABEL, new_csr.decode("utf-8"))
self.set_waiting_for_cert_to_update(waiting=True, internal=internal)

def get_new_sans(self) -> Dict:
def get_new_sans(self) -> dict[str, list[str]]:
"""Create a list of DNS names for a MongoDB unit.
Returns:
Expand All @@ -341,7 +343,7 @@ def get_new_sans(self) -> Dict:

return sans

def get_current_sans(self, internal: bool) -> List[str] | None:
def get_current_sans(self, internal: bool) -> dict[str, list[str]] | None:
"""Gets the current SANs for the unit cert."""
# if unit has no certificates do not proceed.
if not self.is_tls_enabled(internal=internal):
Expand Down Expand Up @@ -411,9 +413,9 @@ def _get_subject_name(self) -> str:

def is_set_waiting_for_cert_to_update(
self,
internal=False,
internal: bool = False,
) -> bool:
"""Returns True we are waiting for a cert to update."""
"""Returns True if we are waiting for a cert to update."""
scope = "int" if internal else "ext"
label_name = f"{scope}-{WAIT_CERT_UPDATE}"

Expand Down
135 changes: 89 additions & 46 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ def _on_config_changed(self, event: ConfigChangedEvent) -> None:
self.status.clear_status(Config.Status.INVALID_EXTERNAL_CONFIG)
self.update_external_services()

self.update_tls_sans()
# toggling of external connectivity means we have to update integrated hosts
self._update_client_related_hosts(event)

Expand Down Expand Up @@ -224,11 +225,14 @@ def _on_update_status(self, _):
)
return

# in the case external connectivity it is possible for public K8s endpoint to be updated
# in that case we must update our sans in TLS. K8s endpoints is not tracked by Juju.
self.update_tls_sans()

self.status.set_and_share_status(ActiveStatus())

# END: hook functions

# BEGIN: helper functions
def is_user_external_config_valid(self) -> bool:
"""Returns True if the user set external config is valid."""
return (
Expand Down Expand Up @@ -262,6 +266,41 @@ def update_external_services(self) -> None:

self.expose_external = self.model.config["expose-external"]

def update_tls_sans(self) -> None:
"""Emits a certificate expiring event when sans in current certificates are out of date.
This can occur for a variety of reasons:
1. Node port has been toggled on
2. Node port has been toggled off
3. The public K8s IP has changed
"""

for internal in [True, False]:
# if the certificate has already been requested, we do not want to re-request
# another one and lead to an infinite chain of certificate events.
if self.tls.is_set_waiting_for_cert_to_update(internal):
continue

current_sans = self.tls.get_current_sans(internal)
current_sans_ip = set(current_sans["sans_ips"]) if current_sans else set()
expected_sans_ip = (
set(self.tls.get_new_sans()["sans_ips"]) if current_sans else set()
)
sans_ip_changed = current_sans_ip ^ expected_sans_ip

if not sans_ip_changed:
continue

logger.info(
(
f'Mongos {self.unit.name.split("/")[1]} updating certificate SANs - '
f"OLD SANs = {current_sans_ip - expected_sans_ip}, "
f"NEW SANs = {expected_sans_ip - current_sans_ip}"
)
)

self.tls.request_new_certificates(internal)

def get_keyfile_contents(self) -> str | None:
"""Retrieves the contents of the keyfile on host machine."""
# wait for keyFile to be created by leader unit
Expand Down Expand Up @@ -338,10 +377,8 @@ def stop_mongos_service(self):
def restart_charm_services(self):
"""Restart mongos service."""
container = self.unit.get_container(Config.CONTAINER_NAME)
container.stop(Config.SERVICE_NAME)

container.add_layer(Config.CONTAINER_NAME, self._mongos_layer, combine=True)
container.replan()
container.restart(Config.SERVICE_NAME)

def set_database(self, database: str) -> None:
"""Updates the database requested for the mongos user."""
Expand Down Expand Up @@ -419,47 +456,56 @@ def get_mongos_host(self) -> str:
The host for mongos can be either the Unix Domain Socket or an IP address depending on how
the client wishes to connect to mongos (inside Juju or outside).
"""
return self.unit_host(self.unit)
unit_id = self.unit.name.split("/")[1]
return f"{self.app.name}-{unit_id}.{self.app.name}-endpoints"

def get_ext_mongos_hosts(self) -> Set:
"""Returns the K8s hosts for mongos.
"""Returns the ext hosts for mongos.
Note: for external connections it is not enough to know the external ip, but also the
port that is associated with the client.
"""
hosts = set()

if not self.is_external_client:
return hosts

try:
for unit in self.get_units():
unit_ip = self.node_port_manager.get_node_ip(unit.name)
unit_port = self.node_port_manager.get_node_port(
port_to_match=Config.MONGOS_PORT, unit_name=unit.name
)
if unit_ip and unit_port:
hosts.add(f"{unit_ip}:{unit_port}")
else:
raise NoExternalHostError(f"No external host for unit {unit.name}")
except (
ApiError,
NoExternalHostError,
FailedToFindNodePortError,
FailedToFindServiceError,
) as e:
raise FailedToGetHostsError(f"Failed to retrieve external hosts due to {e}")
for unit in self.get_units():
hosts.add(self.get_ext_mongos_host(unit))

return hosts

def get_k8s_mongos_hosts(self) -> Set:
"""Returns the K8s hosts for mongos"""
hosts = set()
for unit in self.get_units():
hosts.add(self.unit_host(unit))
unit_id = unit.name.split("/")[1]
hosts.add(f"{self.app.name}-{unit_id}.{self.app.name}-endpoints")

return hosts

def get_ext_mongos_host(self, unit: Unit, incl_port=True) -> str | None:
"""Returns the ext hosts for mongos on the provided unit."""
if not self.is_external_client:
return None

try:
unit_ip = self.node_port_manager.get_node_ip(unit.name)
if not incl_port:
return unit_ip

unit_port = self.node_port_manager.get_node_port(
port_to_match=Config.MONGOS_PORT, unit_name=unit.name
)
if unit_ip and unit_port:
return f"{unit_ip}:{unit_port}"
else:
raise NoExternalHostError(f"No external host for unit {unit.name}")
except (
NoExternalHostError,
FailedToFindNodePortError,
FailedToFindServiceError,
) as e:
raise FailedToGetHostsError(
"Failed to retrieve external hosts due to %s", e
)

def get_mongos_hosts_for_client(self) -> Set:
"""Returns the hosts for mongos as a str.
Expand Down Expand Up @@ -618,18 +664,6 @@ def delete_tls_certificate_from_workload(self) -> None:
except PathError as err:
logger.debug("Path unavailable: %s (%s)", file, str(err))

def unit_host(self, unit: Unit) -> str:
"""Create a DNS name for a MongoDB unit.
Args:
unit_name: the juju unit name, e.g. "mongodb/1".
Returns:
A string representing the hostname of the MongoDB unit.
"""
unit_id = unit.name.split("/")[1]
return f"{self.app.name}-{unit_id}.{self.app.name}-endpoints"

def get_config_server_name(self) -> Optional[str]:
"""Returns the name of the Juju Application that mongos is using as a config server."""
return self.cluster.get_config_server_name()
Expand Down Expand Up @@ -669,13 +703,22 @@ def _update_client_related_hosts(self, event) -> None:
@property
def expose_external(self) -> Optional[str]:
"""Returns mode of exposure for external connections."""
if (
self.app_peer_data.get("expose-external", Config.ExternalConnections.NONE)
== Config.ExternalConnections.NONE
):
# don't let an incorrect configuration
if not self.is_user_external_config_valid():
if (
self.app_peer_data.get(
"expose-external", Config.ExternalConnections.NONE
)
== Config.ExternalConnections.NONE
):
return None

return self.app_peer_data["expose-external"]

if self.model.config["expose-external"] == Config.ExternalConnections.NONE:
return None

return self.app_peer_data["expose-external"]
return self.model.config["expose-external"]

@expose_external.setter
def expose_external(self, expose_external: str) -> None:
Expand Down Expand Up @@ -711,7 +754,7 @@ def db_initialised(self, value):
)

@property
def peers_units(self) -> list[Unit]:
def peers_units(self) -> List[Unit]:
"""Get peers units in a safe way."""
if not self._peers:
return []
Expand Down
2 changes: 1 addition & 1 deletion src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class Config:
SUBSTRATE = "k8s"
CONTAINER_NAME = "mongos"
USER_ROLE_CREATE_USERS = "admin"
SERVICE_NAME = "mongod" # this must match the name of the service in the ROCK
SERVICE_NAME = "mongos"
MONGOD_CONF_DIR = "/etc/mongod"
UNIX_USER = "mongodb"
UNIX_GROUP = "mongodb"
Expand Down
12 changes: 12 additions & 0 deletions tests/integration/client_relations/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,18 @@ async def assert_all_unit_node_ports_available(ops_test: OpsTest):
), "client is not reachable"


async def get_external_uri(
ops_test: OpsTest, unit_id, exposed_node_port: str = None
) -> str:
exposed_node_port = exposed_node_port or get_port_from_node_port(
ops_test, node_port_name=f"{MONGOS_APP_NAME}-{unit_id}-external"
)

public_k8s_ip = get_public_k8s_ip()
username, password = await get_mongos_user_password(ops_test, MONGOS_APP_NAME)
return f"mongodb://{username}:{password}@{public_k8s_ip}:{exposed_node_port}"


async def is_external_mongos_client_reachable(
ops_test: OpsTest, exposed_node_port: str
) -> bool:
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ async def deploy_cluster_components(ops_test: OpsTest) -> None:

await ops_test.model.wait_for_idle(
apps=[MONGOS_APP_NAME, SHARD_APP_NAME, CONFIG_SERVER_APP_NAME],
idle_period=10,
idle_period=30,
raise_on_blocked=False,
raise_on_error=False, # Removed this once DPE-4996 is resolved.
)
Expand Down
Loading

0 comments on commit cf96460

Please sign in to comment.