diff --git a/lib/charms/mongodb/v1/mongodb_tls.py b/lib/charms/mongodb/v1/mongodb_tls.py index 8f00bde7..cfb0a739 100644 --- a/lib/charms/mongodb/v1/mongodb_tls.py +++ b/lib/charms/mongodb/v1/mongodb_tls.py @@ -12,7 +12,7 @@ import logging import re import socket -from typing import Dict, List, Optional, Tuple +from typing import Optional, Tuple from charms.tls_certificates_interface.v3.tls_certificates import ( CertificateAvailableEvent, @@ -42,7 +42,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 4 +LIBPATCH = 5 WAIT_CERT_UPDATE = "wait-cert-updated" @@ -105,9 +105,6 @@ def request_certificate( internal: bool, ): """Request TLS certificate.""" - if not self.charm.model.get_relation(Config.TLS.TLS_PEER_RELATION): - return - if param is None: key = generate_private_key() else: @@ -234,7 +231,7 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None: self.charm.cluster.update_ca_secret(new_ca=event.ca) self.charm.config_server.update_ca_secret(new_ca=event.ca) - if self.waiting_for_both_certs(): + if self.is_waiting_for_both_certs(): logger.debug( "Defer till both internal and external TLS certificates available to avoid second restart." ) @@ -256,7 +253,7 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None: # clear waiting status if db service is ready self.charm.status.set_and_share_status(ActiveStatus()) - def waiting_for_both_certs(self): + def is_waiting_for_both_certs(self) -> bool: """Returns a boolean indicating whether additional certs are needed.""" if not self.get_tls_secret(internal=True, label_name=Config.TLS.SECRET_CERT_LABEL): logger.debug("Waiting for internal certificate.") @@ -295,6 +292,10 @@ def _on_certificate_expiring(self, event: CertificateExpiringEvent) -> None: return logger.debug("Generating a new Certificate Signing Request.") + self.request_new_certificates(internal) + + def request_new_certificates(self, internal: bool) -> None: + """Requests the renewel of a new certificate.""" key = self.get_tls_secret(internal, Config.TLS.SECRET_KEY_LABEL).encode("utf-8") old_csr = self.get_tls_secret(internal, Config.TLS.SECRET_CSR_LABEL).encode("utf-8") sans = self.get_new_sans() @@ -313,8 +314,9 @@ def _on_certificate_expiring(self, event: CertificateExpiringEvent) -> None: ) self.set_tls_secret(internal, Config.TLS.SECRET_CSR_LABEL, new_csr.decode("utf-8")) + self.set_waiting_for_cert_to_update(waiting=True, internal=internal) - def get_new_sans(self) -> Dict: + def get_new_sans(self) -> dict[str, list[str]]: """Create a list of DNS names for a MongoDB unit. Returns: @@ -341,7 +343,7 @@ def get_new_sans(self) -> Dict: return sans - def get_current_sans(self, internal: bool) -> List[str] | None: + def get_current_sans(self, internal: bool) -> dict[str, list[str]] | None: """Gets the current SANs for the unit cert.""" # if unit has no certificates do not proceed. if not self.is_tls_enabled(internal=internal): @@ -411,9 +413,9 @@ def _get_subject_name(self) -> str: def is_set_waiting_for_cert_to_update( self, - internal=False, + internal: bool = False, ) -> bool: - """Returns True we are waiting for a cert to update.""" + """Returns True if we are waiting for a cert to update.""" scope = "int" if internal else "ext" label_name = f"{scope}-{WAIT_CERT_UPDATE}" diff --git a/src/charm.py b/src/charm.py index 2183fb54..7d28bc7a 100755 --- a/src/charm.py +++ b/src/charm.py @@ -122,6 +122,7 @@ def _on_config_changed(self, event: ConfigChangedEvent) -> None: self.status.clear_status(Config.Status.INVALID_EXTERNAL_CONFIG) self.update_external_services() + self.update_tls_sans() # toggling of external connectivity means we have to update integrated hosts self._update_client_related_hosts(event) @@ -224,11 +225,14 @@ def _on_update_status(self, _): ) return + # in the case external connectivity it is possible for public K8s endpoint to be updated + # in that case we must update our sans in TLS. K8s endpoints is not tracked by Juju. + self.update_tls_sans() + self.status.set_and_share_status(ActiveStatus()) # END: hook functions - # BEGIN: helper functions def is_user_external_config_valid(self) -> bool: """Returns True if the user set external config is valid.""" return ( @@ -262,6 +266,41 @@ def update_external_services(self) -> None: self.expose_external = self.model.config["expose-external"] + def update_tls_sans(self) -> None: + """Emits a certificate expiring event when sans in current certificates are out of date. + + This can occur for a variety of reasons: + 1. Node port has been toggled on + 2. Node port has been toggled off + 3. The public K8s IP has changed + """ + + for internal in [True, False]: + # if the certificate has already been requested, we do not want to re-request + # another one and lead to an infinite chain of certificate events. + if self.tls.is_set_waiting_for_cert_to_update(internal): + continue + + current_sans = self.tls.get_current_sans(internal) + current_sans_ip = set(current_sans["sans_ips"]) if current_sans else set() + expected_sans_ip = ( + set(self.tls.get_new_sans()["sans_ips"]) if current_sans else set() + ) + sans_ip_changed = current_sans_ip ^ expected_sans_ip + + if not sans_ip_changed: + continue + + logger.info( + ( + f'Mongos {self.unit.name.split("/")[1]} updating certificate SANs - ' + f"OLD SANs = {current_sans_ip - expected_sans_ip}, " + f"NEW SANs = {expected_sans_ip - current_sans_ip}" + ) + ) + + self.tls.request_new_certificates(internal) + def get_keyfile_contents(self) -> str | None: """Retrieves the contents of the keyfile on host machine.""" # wait for keyFile to be created by leader unit @@ -338,10 +377,8 @@ def stop_mongos_service(self): def restart_charm_services(self): """Restart mongos service.""" container = self.unit.get_container(Config.CONTAINER_NAME) - container.stop(Config.SERVICE_NAME) - container.add_layer(Config.CONTAINER_NAME, self._mongos_layer, combine=True) - container.replan() + container.restart(Config.SERVICE_NAME) def set_database(self, database: str) -> None: """Updates the database requested for the mongos user.""" @@ -419,36 +456,18 @@ def get_mongos_host(self) -> str: The host for mongos can be either the Unix Domain Socket or an IP address depending on how the client wishes to connect to mongos (inside Juju or outside). """ - return self.unit_host(self.unit) + unit_id = self.unit.name.split("/")[1] + return f"{self.app.name}-{unit_id}.{self.app.name}-endpoints" def get_ext_mongos_hosts(self) -> Set: - """Returns the K8s hosts for mongos. + """Returns the ext hosts for mongos. Note: for external connections it is not enough to know the external ip, but also the port that is associated with the client. """ hosts = set() - - if not self.is_external_client: - return hosts - - try: - for unit in self.get_units(): - unit_ip = self.node_port_manager.get_node_ip(unit.name) - unit_port = self.node_port_manager.get_node_port( - port_to_match=Config.MONGOS_PORT, unit_name=unit.name - ) - if unit_ip and unit_port: - hosts.add(f"{unit_ip}:{unit_port}") - else: - raise NoExternalHostError(f"No external host for unit {unit.name}") - except ( - ApiError, - NoExternalHostError, - FailedToFindNodePortError, - FailedToFindServiceError, - ) as e: - raise FailedToGetHostsError(f"Failed to retrieve external hosts due to {e}") + for unit in self.get_units(): + hosts.add(self.get_ext_mongos_host(unit)) return hosts @@ -456,10 +475,37 @@ def get_k8s_mongos_hosts(self) -> Set: """Returns the K8s hosts for mongos""" hosts = set() for unit in self.get_units(): - hosts.add(self.unit_host(unit)) + unit_id = unit.name.split("/")[1] + hosts.add(f"{self.app.name}-{unit_id}.{self.app.name}-endpoints") return hosts + def get_ext_mongos_host(self, unit: Unit, incl_port=True) -> str | None: + """Returns the ext hosts for mongos on the provided unit.""" + if not self.is_external_client: + return None + + try: + unit_ip = self.node_port_manager.get_node_ip(unit.name) + if not incl_port: + return unit_ip + + unit_port = self.node_port_manager.get_node_port( + port_to_match=Config.MONGOS_PORT, unit_name=unit.name + ) + if unit_ip and unit_port: + return f"{unit_ip}:{unit_port}" + else: + raise NoExternalHostError(f"No external host for unit {unit.name}") + except ( + NoExternalHostError, + FailedToFindNodePortError, + FailedToFindServiceError, + ) as e: + raise FailedToGetHostsError( + "Failed to retrieve external hosts due to %s", e + ) + def get_mongos_hosts_for_client(self) -> Set: """Returns the hosts for mongos as a str. @@ -618,18 +664,6 @@ def delete_tls_certificate_from_workload(self) -> None: except PathError as err: logger.debug("Path unavailable: %s (%s)", file, str(err)) - def unit_host(self, unit: Unit) -> str: - """Create a DNS name for a MongoDB unit. - - Args: - unit_name: the juju unit name, e.g. "mongodb/1". - - Returns: - A string representing the hostname of the MongoDB unit. - """ - unit_id = unit.name.split("/")[1] - return f"{self.app.name}-{unit_id}.{self.app.name}-endpoints" - def get_config_server_name(self) -> Optional[str]: """Returns the name of the Juju Application that mongos is using as a config server.""" return self.cluster.get_config_server_name() @@ -669,13 +703,22 @@ def _update_client_related_hosts(self, event) -> None: @property def expose_external(self) -> Optional[str]: """Returns mode of exposure for external connections.""" - if ( - self.app_peer_data.get("expose-external", Config.ExternalConnections.NONE) - == Config.ExternalConnections.NONE - ): + # don't let an incorrect configuration + if not self.is_user_external_config_valid(): + if ( + self.app_peer_data.get( + "expose-external", Config.ExternalConnections.NONE + ) + == Config.ExternalConnections.NONE + ): + return None + + return self.app_peer_data["expose-external"] + + if self.model.config["expose-external"] == Config.ExternalConnections.NONE: return None - return self.app_peer_data["expose-external"] + return self.model.config["expose-external"] @expose_external.setter def expose_external(self, expose_external: str) -> None: @@ -711,7 +754,7 @@ def db_initialised(self, value): ) @property - def peers_units(self) -> list[Unit]: + def peers_units(self) -> List[Unit]: """Get peers units in a safe way.""" if not self._peers: return [] diff --git a/src/config.py b/src/config.py index ee5f3385..8ce6e39d 100644 --- a/src/config.py +++ b/src/config.py @@ -14,7 +14,7 @@ class Config: SUBSTRATE = "k8s" CONTAINER_NAME = "mongos" USER_ROLE_CREATE_USERS = "admin" - SERVICE_NAME = "mongod" # this must match the name of the service in the ROCK + SERVICE_NAME = "mongos" MONGOD_CONF_DIR = "/etc/mongod" UNIX_USER = "mongodb" UNIX_GROUP = "mongodb" diff --git a/tests/integration/client_relations/helpers.py b/tests/integration/client_relations/helpers.py index af6905d2..dd0f31f1 100644 --- a/tests/integration/client_relations/helpers.py +++ b/tests/integration/client_relations/helpers.py @@ -121,6 +121,18 @@ async def assert_all_unit_node_ports_available(ops_test: OpsTest): ), "client is not reachable" +async def get_external_uri( + ops_test: OpsTest, unit_id, exposed_node_port: str = None +) -> str: + exposed_node_port = exposed_node_port or get_port_from_node_port( + ops_test, node_port_name=f"{MONGOS_APP_NAME}-{unit_id}-external" + ) + + public_k8s_ip = get_public_k8s_ip() + username, password = await get_mongos_user_password(ops_test, MONGOS_APP_NAME) + return f"mongodb://{username}:{password}@{public_k8s_ip}:{exposed_node_port}" + + async def is_external_mongos_client_reachable( ops_test: OpsTest, exposed_node_port: str ) -> bool: diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index b2974aeb..44ecf5d5 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -193,7 +193,7 @@ async def deploy_cluster_components(ops_test: OpsTest) -> None: await ops_test.model.wait_for_idle( apps=[MONGOS_APP_NAME, SHARD_APP_NAME, CONFIG_SERVER_APP_NAME], - idle_period=10, + idle_period=30, raise_on_blocked=False, raise_on_error=False, # Removed this once DPE-4996 is resolved. ) diff --git a/tests/integration/tls/helpers.py b/tests/integration/tls/helpers.py index f49696d3..5058c755 100644 --- a/tests/integration/tls/helpers.py +++ b/tests/integration/tls/helpers.py @@ -4,14 +4,17 @@ from pathlib import Path from pytest_operator.plugin import OpsTest -from ..helpers import get_application_relation_data, get_secret_data +from ..helpers import get_application_relation_data from tenacity import RetryError, Retrying, stop_after_attempt, wait_exponential from datetime import datetime from typing import Optional, Dict import json import ops import logging - +import time +from ops.model import Unit +from ..client_relations.helpers import get_external_uri +from ..helpers import get_mongos_uri logger = logging.getLogger(__name__) @@ -19,7 +22,7 @@ EXTERNAL_PEM_PATH = "/etc/mongod/external-cert.pem" EXTERNAL_CERT_PATH = "/etc/mongod/external-ca.crt" INTERNAL_CERT_PATH = "/etc/mongod/internal-ca.crt" -MONGO_SHELL = "charmed-mongodb.mongosh" +MONGO_SHELL = "/usr/bin/mongosh" MONGOS_APP_NAME = "mongos-k8s" CERT_REL_NAME = "certificates" CERTS_APP_NAME = "self-signed-certificates" @@ -28,6 +31,7 @@ SHARD_APP_NAME = "shard0" CLUSTER_COMPONENTS = [CONFIG_SERVER_APP_NAME, SHARD_APP_NAME] MONGOS_SERVICE = "mongos.service" +PEBBLE_TIME_UNIT = 60 class ProcessError(Exception): @@ -76,10 +80,10 @@ async def check_mongos_tls_disabled(ops_test: OpsTest) -> None: await check_tls(ops_test, unit, enabled=False) -async def check_mongos_tls_enabled(ops_test: OpsTest) -> None: +async def check_mongos_tls_enabled(ops_test: OpsTest, internal=True) -> None: # check each replica set is running with TLS enabled for unit in ops_test.model.applications[MONGOS_APP_NAME].units: - await check_tls(ops_test, unit, enabled=True) + await check_tls(ops_test, unit, enabled=True, internal=internal) async def toggle_tls_mongos( @@ -97,24 +101,25 @@ async def toggle_tls_mongos( f"{certs_app_name}:{CERT_REL_NAME}", ) + await ops_test.model.wait_for_idle(apps=[MONGOS_APP_NAME], idle_period=30) -async def mongos_tls_command(ops_test: OpsTest) -> str: - """Generates a command which verifies TLS status.""" - secret_uri = await get_application_relation_data( - ops_test, MONGOS_APP_NAME, "mongos", "secret-user" - ) - secret_data = await get_secret_data(ops_test, secret_uri) - client_uri = secret_data.get("uris") +async def mongos_tls_command(ops_test: OpsTest, unit, internal=True) -> str: + """Generates a command which verifies TLS status.""" + unit_id = unit.name.split("/")[1] + if not internal: + client_uri = await get_external_uri(ops_test, unit_id=unit_id) + else: + client_uri = await get_mongos_uri(ops_test, unit_id=unit_id) return ( - f"{MONGO_SHELL} '{client_uri}' --eval 'sh.status()'" + f"{MONGO_SHELL} '{client_uri}' --eval 'db.getUsers()'" f" --tls --tlsCAFile {EXTERNAL_CERT_PATH}" f" --tlsCertificateKeyFile {EXTERNAL_PEM_PATH}" ) -async def check_tls(ops_test, unit, enabled) -> None: +async def check_tls(ops_test, unit, enabled, internal=True) -> None: """Returns True if TLS matches the expected state "enabled".""" try: for attempt in Retrying( @@ -122,12 +127,17 @@ async def check_tls(ops_test, unit, enabled) -> None: wait=wait_exponential(multiplier=1, min=2, max=30), ): with attempt: - mongod_tls_check = await mongos_tls_command(ops_test) - check_tls_cmd = f"exec --unit {unit.name} -- {mongod_tls_check}" - return_code, _, _ = await ops_test.juju(*check_tls_cmd.split()) + mongos_tls_check = await mongos_tls_command( + ops_test, unit=unit, internal=internal + ) + complete_command = ( + f"ssh --container mongos {unit.name} {mongos_tls_check}" + ) + return_code, _, stderr = await ops_test.juju(*complete_command.split()) tls_enabled = return_code == 0 if enabled != tls_enabled: + logger.error(stderr) raise ValueError( f"TLS is{' not' if not tls_enabled else ''} enabled on {unit.name}" ) @@ -136,6 +146,17 @@ async def check_tls(ops_test, unit, enabled) -> None: return False +async def get_sans_ips(ops_test: OpsTest, unit: Unit, internal: bool) -> str: + """Retrieves the sans for the for mongos on the provided unit.""" + cert_name = "internal" if internal else "external" + get_sans_cmd = ( + f"openssl x509 -noout -ext subjectAltName -in /etc/mongod/{cert_name}-cert.pem" + ) + complete_command = f"ssh --container mongos {unit.name} {get_sans_cmd}" + _, result, _ = await ops_test.juju(*complete_command.split()) + return result + + async def time_file_created(ops_test: OpsTest, unit_name: str, path: str) -> int: """Returns the unix timestamp of when a file was created on a specified unit.""" time_cmd = f"ls -l --time-style=full-iso {path} " @@ -162,7 +183,7 @@ async def time_process_started( # find most recent start time. By parsing most recent logs (ie in reverse order) for log in reversed(logs.split("\n")): - if "Replan" in log: + if "Restart" in log: return process_pebble_time(log.split()[4]) raise Exception("Service was never started") @@ -345,6 +366,8 @@ async def rotate_and_verify_certs(ops_test: OpsTest, app: str, tmpdir: Path) -> ops_test, unit, app_name=app, tmpdir=tmpdir ) + time.sleep(PEBBLE_TIME_UNIT) + # set external and internal key using auto-generated key for each unit for unit in ops_test.model.applications[app].units: action = await unit.run_action(action_name="set-tls-private-key") @@ -395,7 +418,6 @@ async def rotate_and_verify_certs(ops_test: OpsTest, app: str, tmpdir: Path) -> # Once the certificate requests are processed and updated the .service file should be # restarted - assert ( new_mongos_service_time > original_tls_info[unit.name]["mongos_service"] ), f"mongos service for {unit.name} was not restarted." diff --git a/tests/integration/tls/test_tls.py b/tests/integration/tls/test_tls.py index 543d4702..ab383f23 100644 --- a/tests/integration/tls/test_tls.py +++ b/tests/integration/tls/test_tls.py @@ -19,8 +19,9 @@ MONGOS_APP_NAME, CERTS_APP_NAME, rotate_and_verify_certs, + get_sans_ips, ) - +from ..client_relations.helpers import get_public_k8s_ip MONGOS_SERVICE = "mongos.service" @@ -40,6 +41,8 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None: """Build and deploy a sharded cluster.""" await deploy_cluster_components(ops_test) + # we should verify that tests work with multiple routers. + await ops_test.model.applications[MONGOS_APP_NAME].scale(2) await build_cluster(ops_test) await deploy_tls(ops_test) @@ -58,9 +61,62 @@ async def test_mongos_tls_enabled(ops_test: OpsTest) -> None: ) await integrate_cluster_with_tls(ops_test) + await ops_test.model.wait_for_idle( + apps=[MONGOS_APP_NAME], + idle_period=30, + status="active", + timeout=TIMEOUT, + ) + await check_mongos_tls_enabled(ops_test) +@pytest.mark.group(1) +@pytest.mark.abort_on_fail +async def test_mongos_tls_nodeport(ops_test: OpsTest): + """Tests that TLS is stable""" + # test that charm can enable nodeport without breaking mongos or accidentally disabling TLS + await ops_test.model.applications[MONGOS_APP_NAME].set_config( + {"expose-external": "nodeport"} + ) + + await ops_test.model.wait_for_idle( + apps=[MONGOS_APP_NAME], + idle_period=60, + status="active", + timeout=TIMEOUT, + ) + for internal in [True, False]: + await check_mongos_tls_enabled(ops_test, internal) + + # check for expected IP addresses in the pem file + for unit in ops_test.model.applications[MONGOS_APP_NAME].units: + assert get_public_k8s_ip() in await get_sans_ips(ops_test, unit, internal=True) + assert get_public_k8s_ip() in await get_sans_ips(ops_test, unit, internal=False) + + # test that charm can disable nodeport without breaking mongos or accidentally disabling TLS + await ops_test.model.applications[MONGOS_APP_NAME].set_config( + {"expose-external": "none"} + ) + await ops_test.model.wait_for_idle( + apps=[MONGOS_APP_NAME], + idle_period=60, + status="active", + timeout=TIMEOUT, + ) + + await check_mongos_tls_enabled(ops_test, internal=True) + + # check for no public k8s IP address in the pem file + for unit in ops_test.model.applications[MONGOS_APP_NAME].units: + assert get_public_k8s_ip() not in await get_sans_ips( + ops_test, unit, internal=True + ) + assert get_public_k8s_ip() not in await get_sans_ips( + ops_test, unit, internal=False + ) + + @pytest.mark.group(1) @pytest.mark.abort_on_fail async def test_mongos_rotate_certs(ops_test: OpsTest, tmp_path: Path) -> None: @@ -72,7 +128,6 @@ async def test_mongos_rotate_certs(ops_test: OpsTest, tmp_path: Path) -> None: async def test_mongos_tls_disabled(ops_test: OpsTest) -> None: """Tests that mongos charm can disable TLS.""" await toggle_tls_mongos(ops_test, enable=False) - await check_mongos_tls_disabled(ops_test) await wait_for_mongos_units_blocked( ops_test, @@ -81,12 +136,15 @@ async def test_mongos_tls_disabled(ops_test: OpsTest) -> None: timeout=300, ) + await check_mongos_tls_disabled(ops_test) + @pytest.mark.group(1) @pytest.mark.abort_on_fail async def test_tls_reenabled(ops_test: OpsTest) -> None: """Test that mongos can enable TLS after being integrated to cluster .""" await toggle_tls_mongos(ops_test, enable=True) + await check_mongos_tls_enabled(ops_test) @@ -95,6 +153,14 @@ async def test_tls_reenabled(ops_test: OpsTest) -> None: async def test_mongos_tls_ca_mismatch(ops_test: OpsTest) -> None: """Tests that mongos charm can disable TLS.""" await toggle_tls_mongos(ops_test, enable=False) + + await wait_for_mongos_units_blocked( + ops_test, + MONGOS_APP_NAME, + status="mongos requires TLS to be enabled.", + timeout=300, + ) + await ops_test.model.deploy( CERTS_APP_NAME, application_name=DIFFERENT_CERTS_APP_NAME, channel="stable" ) @@ -102,7 +168,6 @@ async def test_mongos_tls_ca_mismatch(ops_test: OpsTest) -> None: apps=[DIFFERENT_CERTS_APP_NAME], idle_period=10, raise_on_blocked=False, - status="active", timeout=TIMEOUT, )