From 8cc9564854015165e8bb8730149fcfef0867156d Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Fri, 6 Sep 2024 09:39:22 +0000 Subject: [PATCH 01/22] mongos k8s updates tls certs based on external node port --- lib/charms/mongodb/v0/mongodb_tls.py | 84 ++++++++- lib/charms/mongodb/v0/set_status.py | 273 --------------------------- src/charm.py | 165 +++++++++------- 3 files changed, 177 insertions(+), 345 deletions(-) delete mode 100644 lib/charms/mongodb/v0/set_status.py diff --git a/lib/charms/mongodb/v0/mongodb_tls.py b/lib/charms/mongodb/v0/mongodb_tls.py index 6669c1c3..27be5e13 100644 --- a/lib/charms/mongodb/v0/mongodb_tls.py +++ b/lib/charms/mongodb/v0/mongodb_tls.py @@ -7,11 +7,14 @@ and expose needed information for client connection via fields in external relation. """ +import json import base64 import logging import re import socket -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Dict +import subprocess +from ops.pebble import ExecError from charms.tls_certificates_interface.v3.tls_certificates import ( CertificateAvailableEvent, @@ -28,7 +31,8 @@ UNIT_SCOPE = Config.Relations.UNIT_SCOPE Scopes = Config.Relations.Scopes - +SANS_DNS_KEY = "sans_dns" +SANS_IPS_KEY = "sans_ips" # The unique Charmhub library identifier, never change it LIBID = "e02a50f0795e4dd292f58e93b4f493dd" @@ -104,12 +108,13 @@ def request_certificate( else: key = self._parse_tls_file(param) + sans = self.get_new_sans() csr = generate_csr( private_key=key, subject=self._get_subject_name(), organization=self._get_subject_name(), - sans=self._get_sans(), - sans_ip=[str(self.charm.model.get_binding(self.peer_relation).network.bind_address)], + sans=sans[SANS_DNS_KEY], + sans_ip=sans[SANS_IPS_KEY], ) self.set_tls_secret(internal, Config.TLS.SECRET_KEY_LABEL, key.decode("utf-8")) self.set_tls_secret(internal, Config.TLS.SECRET_CSR_LABEL, csr.decode("utf-8")) @@ -281,7 +286,7 @@ def _on_certificate_expiring(self, event: CertificateExpiringEvent) -> None: private_key=key, subject=self._get_subject_name(), organization=self._get_subject_name(), - sans=self._get_sans(), + sans=self.get_new_sans(), sans_ip=[str(self.charm.model.get_binding(self.peer_relation).network.bind_address)], ) logger.debug("Requesting a certificate renewal.") @@ -293,21 +298,82 @@ def _on_certificate_expiring(self, event: CertificateExpiringEvent) -> None: self.set_tls_secret(internal, Config.TLS.SECRET_CSR_LABEL, new_csr.decode("utf-8")) - def _get_sans(self) -> List[str]: + def get_new_sans(self) -> Dict: """Create a list of DNS names for a MongoDB unit. Returns: A list representing the hostnames of the MongoDB unit. """ + sans = {} + unit_id = self.charm.unit.name.split("/")[1] - return [ + sans[SANS_DNS_KEY] = [ f"{self.charm.app.name}-{unit_id}", socket.getfqdn(), - f"{self.charm.app.name}-{unit_id}.{self.charm.app.name}-endpoints", - str(self.charm.model.get_binding(self.peer_relation).network.bind_address), "localhost", + f"{self.charm.app.name}-{unit_id}.{self.charm.app.name}-endpoints", ] + sans[SANS_IPS_KEY] = [] + + if Config.SUBSTRATE == Config.Substrate.VM: + sans[SANS_IPS_KEY].append( + str(self.charm.model.get_binding(self.peer_relation).network.bind_address) + ) + + elif ( + Config.SUBSTRATE == Config.Substrate.K8S + and self.charm.is_role(Config.Role.MONGOS) + and self.charm.is_external_client + ): + sans[SANS_IPS_KEY].append( + self.charm.get_ext_mongos_host(self.charm.unit, incl_port=False) + ) + + return sans + + def get_current_sans(self) -> List | None: + """Gets the current SANs for the unit cert.""" + # if unit has no certificates do not proceed. + if not self.is_tls_enabled(internal=True) or not self.is_tls_enabled(internal=False): + return + + command = [ + "openssl", + "x509", + "-noout", + "-ext", + "subjectAltName", + "-in", + Config.TLS.EXT_PEM_FILE, + ] + + try: + container = self.charm.unit.get_container(Config.CONTAINER_NAME) + process = container.exec(command=command, working_dir=Config.MONGOD_CONF_DIR) + + output, _ = process.wait_output() + sans_lines = output.splitlines() + except (subprocess.CalledProcessError, ExecError) as e: + logger.error(e.stdout) + raise e + + for line in sans_lines: + if "DNS" in line and "IP" in line: + break + + sans_ip = [] + sans_dns = [] + for item in line.split(", "): + san_type, san_value = item.split(":") + + if san_type == "DNS": + sans_dns.append(san_value) + if san_type == "IP Address": + sans_ip.append(san_value) + + return {SANS_IPS_KEY: sorted(sans_ip), SANS_DNS_KEY: sorted(sans_dns)} + def get_tls_files(self, internal: bool) -> Tuple[Optional[str], Optional[str]]: """Prepare TLS files in special MongoDB way. diff --git a/lib/charms/mongodb/v0/set_status.py b/lib/charms/mongodb/v0/set_status.py deleted file mode 100644 index 98d1daba..00000000 --- a/lib/charms/mongodb/v0/set_status.py +++ /dev/null @@ -1,273 +0,0 @@ -#!/usr/bin/env python3 -"""Code for handing statuses in the app and unit.""" -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. -import json -import logging -from typing import Optional, Tuple - -from charms.mongodb.v1.mongodb import MongoConfiguration, MongoDBConnection -from ops.charm import CharmBase -from ops.framework import Object -from ops.model import ActiveStatus, BlockedStatus, StatusBase, WaitingStatus -from pymongo.errors import AutoReconnect, OperationFailure, ServerSelectionTimeoutError - -from config import Config - -# The unique Charmhub library identifier, never change it -LIBID = "9b0b9fac53244229aed5ffc5e62141eb" - -# Increment this major API version when introducing breaking changes -LIBAPI = 0 - -# Increment this PATCH version before using `charmcraft publish-lib` or reset -# to 0 if you are raising the major API version -LIBPATCH = 3 - -AUTH_FAILED_CODE = 18 -UNAUTHORISED_CODE = 13 -TLS_CANNOT_FIND_PRIMARY = 133 - - -logger = logging.getLogger(__name__) - - -class MongoDBStatusHandler(Object): - """Verifies versions across multiple integrated applications.""" - - def __init__( - self, - charm: CharmBase, - ) -> None: - """Constructor for CrossAppVersionChecker. - - Args: - charm: charm to inherit from. - """ - super().__init__(charm, None) - self.charm = charm - - # TODO Future PR: handle update_status - - # BEGIN Helpers - - def set_and_share_status(self, status: StatusBase): - """Sets the charm status and shares to app status and config-server if applicable.""" - # TODO Future Feature/Epic: process other statuses, i.e. only set provided status if its - # appropriate. - self.charm.unit.status = status - - self.set_app_status() - - if self.charm.is_role(Config.Role.SHARD): - self.share_status_to_config_server() - - def set_app_status(self): - """TODO Future Feature/Epic: parse statuses and set a status for the entire app.""" - - def is_current_unit_ready(self, ignore_unhealthy_upgrade: bool = False) -> bool: - """Returns True if the current unit status shows that the unit is ready. - - Note: we allow the use of ignore_unhealthy_upgrade, to avoid infinite loops due to this - function returning False and preventing the status from being reset. - """ - if isinstance(self.charm.unit.status, ActiveStatus): - return True - - if ignore_unhealthy_upgrade and self.charm.unit.status == Config.Status.UNHEALTHY_UPGRADE: - return True - - return self.is_status_related_to_mismatched_revision( - type(self.charm.unit.status).__name__.lower() - ) - - def is_status_related_to_mismatched_revision(self, status_type: str) -> bool: - """Returns True if the current status is related to a mimsatch in revision. - - Note: A few functions calling this method receive states differently. One receives them by - "goal state" which processes data differently and the other via the ".status" property. - Hence we have to be flexible to handle each. - """ - if not self.charm.get_cluster_mismatched_revision_status(): - return False - - if "waiting" in status_type and self.charm.is_role(Config.Role.CONFIG_SERVER): - return True - - if "blocked" in status_type and self.charm.is_role(Config.Role.SHARD): - return True - - return False - - def are_all_units_ready_for_upgrade(self, unit_to_ignore: str = "") -> bool: - """Returns True if all charm units status's show that they are ready for upgrade.""" - goal_state = self.charm.model._backend._run( - "goal-state", return_output=True, use_json=True - ) - for unit_name, unit_state in goal_state["units"].items(): - if unit_name == unit_to_ignore: - continue - if unit_state["status"] == "active": - continue - if not self.is_status_related_to_mismatched_revision(unit_state["status"]): - return False - - return True - - def are_shards_status_ready_for_upgrade(self) -> bool: - """Returns True if all integrated shards status's show that they are ready for upgrade. - - A shard is ready for upgrade if it is either in the waiting for upgrade status or active - status. - """ - if not self.charm.is_role(Config.Role.CONFIG_SERVER): - return False - - for sharding_relation in self.charm.config_server.get_all_sharding_relations(): - for unit in sharding_relation.units: - unit_data = sharding_relation.data[unit] - status_ready_for_upgrade = json.loads( - unit_data.get(Config.Status.STATUS_READY_FOR_UPGRADE, None) - ) - if not status_ready_for_upgrade: - return False - - return True - - def share_status_to_config_server(self): - """Shares this shards status info to the config server.""" - if not self.charm.is_role(Config.Role.SHARD): - return - - if not (config_relation := self.charm.shard.get_config_server_relation()): - return - - config_relation.data[self.charm.unit][Config.Status.STATUS_READY_FOR_UPGRADE] = json.dumps( - self.is_unit_status_ready_for_upgrade() - ) - - def is_unit_status_ready_for_upgrade(self) -> bool: - """Returns True if the status of the current unit reflects that it is ready for upgrade.""" - current_status = self.charm.unit.status - status_message = current_status.message - if isinstance(current_status, ActiveStatus): - return True - - if not isinstance(current_status, WaitingStatus): - return False - - if status_message and "is not up-to date with config-server" in status_message: - return True - - return False - - def process_statuses(self) -> StatusBase: - """Retrieves statuses from processes inside charm and returns the highest priority status. - - When a non-fatal error occurs while processing statuses, the error is processed and - returned as a statuses. - - TODO: add more status handling here for other cases: i.e. TLS, or resetting a status that - should not be reset - """ - # retrieve statuses of different services running on Charmed MongoDB - deployment_mode = ( - "replica set" if self.charm.is_role(Config.Role.REPLICATION) else "cluster" - ) - waiting_status = None - try: - statuses = self.get_statuses() - except OperationFailure as e: - if e.code in [UNAUTHORISED_CODE, AUTH_FAILED_CODE]: - waiting_status = f"Waiting to sync passwords across the {deployment_mode}" - elif e.code == TLS_CANNOT_FIND_PRIMARY: - waiting_status = ( - f"Waiting to sync internal membership across the {deployment_mode}" - ) - else: - raise - except ServerSelectionTimeoutError: - waiting_status = f"Waiting to sync internal membership across the {deployment_mode}" - - if waiting_status: - return WaitingStatus(waiting_status) - - return self.prioritize_statuses(statuses) - - def get_statuses(self) -> Tuple: - """Retrieves statuses for the different processes running inside the unit.""" - mongodb_status = build_unit_status( - self.charm.mongodb_config, self.charm.unit_host(self.charm.unit) - ) - shard_status = self.charm.shard.get_shard_status() - config_server_status = self.charm.config_server.get_config_server_status() - pbm_status = self.charm.backups.get_pbm_status() - return (mongodb_status, shard_status, config_server_status, pbm_status) - - def prioritize_statuses(self, statuses: Tuple) -> StatusBase: - """Returns the status with the highest priority from backups, sharding, and mongod.""" - mongodb_status, shard_status, config_server_status, pbm_status = statuses - # failure in mongodb takes precedence over sharding and config server - if not isinstance(mongodb_status, ActiveStatus): - return mongodb_status - - if shard_status and not isinstance(shard_status, ActiveStatus): - return shard_status - - if config_server_status and not isinstance(config_server_status, ActiveStatus): - return config_server_status - - if pbm_status and not isinstance(pbm_status, ActiveStatus): - return pbm_status - - # if all statuses are active report mongodb status over sharding status - return mongodb_status - - def get_invalid_integration_status(self) -> Optional[StatusBase]: - """Returns a status if an invalid integration is present.""" - if not self.charm.cluster.is_valid_mongos_integration(): - return BlockedStatus( - "Relation to mongos not supported, config role must be config-server" - ) - - if not self.charm.backups.is_valid_s3_integration(): - return BlockedStatus( - "Relation to s3-integrator is not supported, config role must be config-server" - ) - - return self.charm.get_cluster_mismatched_revision_status() - - -def build_unit_status(mongodb_config: MongoConfiguration, unit_host: str) -> StatusBase: - """Generates the status of a unit based on its status reported by mongod.""" - try: - with MongoDBConnection(mongodb_config) as mongo: - replset_status = mongo.get_replset_status() - - if unit_host not in replset_status: - return WaitingStatus("Member being added..") - - replica_status = replset_status[unit_host] - - match replica_status: - case "PRIMARY": - return ActiveStatus("Primary") - case "SECONDARY": - return ActiveStatus("") - case "STARTUP" | "STARTUP2" | "ROLLBACK" | "RECOVERING": - return WaitingStatus("Member is syncing...") - case "REMOVED": - return WaitingStatus("Member is removing...") - case _: - return BlockedStatus(replica_status) - except ServerSelectionTimeoutError as e: - # ServerSelectionTimeoutError is commonly due to ReplicaSetNoPrimary - logger.debug("Got error: %s, while checking replica set status", str(e)) - return WaitingStatus("Waiting for primary re-election..") - except AutoReconnect as e: - # AutoReconnect is raised when a connection to the database is lost and an attempt to - # auto-reconnect will be made by pymongo. - logger.debug("Got error: %s, while checking replica set status", str(e)) - return WaitingStatus("Waiting to reconnect to unit..") - - # END: Helpers diff --git a/src/charm.py b/src/charm.py index 98e9379f..6289a87b 100755 --- a/src/charm.py +++ b/src/charm.py @@ -5,12 +5,13 @@ # See LICENSE file for licensing details. from ops.main import main import json +from datetime import datetime from exceptions import MissingSecretError from ops.pebble import PathError, ProtocolError, Layer from node_port import NodePortManager -from typing import Set, Optional, Dict +from typing import Set, Optional, Dict, List from charms.mongodb.v0.config_server_interface import ClusterRequirer @@ -74,9 +75,7 @@ def __init__(self, *args): # lifecycle events self.framework.observe(self.on.config_changed, self._on_config_changed) - self.framework.observe( - self.on.mongos_pebble_ready, self._on_mongos_pebble_ready - ) + self.framework.observe(self.on.mongos_pebble_ready, self._on_mongos_pebble_ready) self.framework.observe(self.on.start, self._on_start) self.framework.observe(self.on.update_status, self._on_update_status) @@ -99,6 +98,8 @@ def _on_config_changed(self, event: ConfigChangedEvent) -> None: self.update_external_services() + self.update_tls_sans() + # TODO DPE-5235 support updating data-integrator clients to have/not have public IP # depending on the result of the configuration @@ -143,9 +144,7 @@ def _on_start(self, event: StartEvent) -> None: logger.info( "Missing integration to config-server. mongos cannot run start sequence unless connected to config-server." ) - self.status.set_and_share_status( - BlockedStatus("Missing relation to config-server.") - ) + self.status.set_and_share_status(BlockedStatus("Missing relation to config-server.")) event.defer() return @@ -178,9 +177,7 @@ def _on_update_status(self, _): logger.info( "Missing integration to config-server. mongos cannot run unless connected to config-server." ) - self.status.set_and_share_status( - BlockedStatus("Missing relation to config-server.") - ) + self.status.set_and_share_status(BlockedStatus("Missing relation to config-server.")) return if tls_statuses := self.cluster.get_tls_statuses(): @@ -190,9 +187,7 @@ def _on_update_status(self, _): # restart on high loaded databases can be very slow (e.g. up to 10-20 minutes). if not self.cluster.is_mongos_running(): logger.info("mongos has not started yet") - self.status.set_and_share_status( - WaitingStatus("Waiting for mongos to start.") - ) + self.status.set_and_share_status(WaitingStatus("Waiting for mongos to start.")) return self.status.set_and_share_status(ActiveStatus()) @@ -218,21 +213,50 @@ def set_status_invalid_external_config(self) -> None: def update_external_services(self) -> None: """Update external services based on provided configuration.""" - if ( - self.model.config["expose-external"] - == Config.ExternalConnections.EXTERNAL_NODEPORT - ): + if self.model.config["expose-external"] == Config.ExternalConnections.EXTERNAL_NODEPORT: # every unit attempts to create a nodeport service - if exists, will silently continue self.node_port_manager.apply_service( - service=self.node_port_manager.build_node_port_services( - port=Config.MONGOS_PORT - ) + service=self.node_port_manager.build_node_port_services(port=Config.MONGOS_PORT) ) else: self.node_port_manager.delete_unit_service() self.expose_external = self.model.config["expose-external"] + def update_tls_sans(self) -> None: + current_sans = self.tls.get_current_sans() + current_sans_ip = set(current_sans["sans_ips"]) if current_sans else set() + expected_sans_ip = set(self.tls.get_new_sans()["sans_ips"]) if current_sans else set() + sans_ip_changed = current_sans_ip ^ expected_sans_ip + + if not sans_ip_changed: + return + + logger.info( + ( + f'Broker {self.unit.name.split("/")[1]} updating certificate SANs - ' + f"OLD SANs = {current_sans_ip - expected_sans_ip}, " + f"NEW SANs = {expected_sans_ip - current_sans_ip}" + ) + ) + + # question for Marc, why not also internal certs + self.tls.certs.on.certificate_expiring.emit( + certificate=self.tls.get_tls_secret( + internal=True, label_name=Config.TLS.SECRET_CERT_LABEL + ), + expiry=datetime.now().isoformat(), + ) + self.tls.certs.on.certificate_expiring.emit( + certificate=self.tls.get_tls_secret( + internal=False, label_name=Config.TLS.SECRET_CERT_LABEL + ), + expiry=datetime.now().isoformat(), + ) + + # TODO ensures only single requested new certs, will be replaced on new certificate-available event + # self.tls.certificate = False + def get_keyfile_contents(self) -> str | None: """Retrieves the contents of the keyfile on host machine.""" # wait for keyFile to be created by leader unit @@ -249,9 +273,7 @@ def get_keyfile_contents(self) -> str | None: def is_integrated_to_config_server(self) -> bool: """Returns True if the mongos application is integrated to a config-server.""" - return ( - self.model.get_relation(Config.Relations.CLUSTER_RELATIONS_NAME) is not None - ) + return self.model.get_relation(Config.Relations.CLUSTER_RELATIONS_NAME) is not None def get_secret(self, scope: str, key: str) -> Optional[str]: """Get secret from the secret storage.""" @@ -293,9 +315,7 @@ def remove_secret(self, scope, key) -> None: content = secret.get_content() if not content.get(key) or content[key] == Config.Secrets.SECRET_DELETED_LABEL: - logger.error( - f"Non-existing secret {scope}:{key} was attempted to be removed." - ) + logger.error(f"Non-existing secret {scope}:{key} was attempted to be removed.") return content[key] = Config.Secrets.SECRET_DELETED_LABEL @@ -322,9 +342,7 @@ def set_database(self, database: str) -> None: return # a mongos shard can only be related to one config server - config_server_rel = self.model.relations[ - Config.Relations.CLUSTER_RELATIONS_NAME - ][0] + config_server_rel = self.model.relations[Config.Relations.CLUSTER_RELATIONS_NAME][0] self.cluster.database_requires.update_relation_data( config_server_rel.id, {DATABASE_TAG: database} ) @@ -379,25 +397,64 @@ def proceed_on_broken_event(self, event) -> bool: return True - def get_mongos_host(self) -> str: - """Returns the host for mongos as a str. + def get_units(self) -> List[Unit]: + units = [self.unit] + units.extend(self.peers_units) + return units - The host for mongos can be either the Unix Domain Socket or an IP address depending on how - the client wishes to connect to mongos (inside Juju or outside). + def get_k8s_mongos_hosts(self) -> Set: + """Returns the K8s hosts for mongos""" + hosts = set() + for unit in self.get_units(): + hosts.add(self.get_k8s_mongos_host(unit)) + + return hosts + + def get_ext_mongos_host(self, unit: Unit, incl_port=True) -> str | None: + """Returns the ext hosts for mongos on the provided unit.""" + if self.is_external_client: + return None + + unit_ip = self.node_port_manager.get_node_ip(unit.name) + if not incl_port: + return unit_ip + + unit_port = self.node_port_manager.get_node_port( + port_to_match=Config.MONGOS_PORT, unit_name=unit.name + ) + return f"{unit_ip}:{unit_port}" + + def get_k8s_mongos_host(self, unit: Unit) -> str: + """Create a DNS name for a MongoDB unit. + + Args: + unit_name: the juju unit name, e.g. "mongodb/1". + + Returns: + A string representing the hostname of the MongoDB unit. + """ + unit_id = unit.name.split("/")[1] + return f"{self.app.name}-{unit_id}.{self.app.name}-endpoints" + + def get_ext_mongos_hosts(self) -> Set: + """Returns the ext hosts for mongos. + + Note: for external connections it is not enough to know the external ip, but also the + port that is associated with the client. """ - return self.unit_host(self.unit) + hosts = set() + for unit in self.get_units(): + hosts.add(self.get_ext_mongos_host(unit)) - def get_mongos_hosts(self) -> Set: + return hosts + + def get_mongos_host(self) -> str: """Returns the host for mongos as a str. The host for mongos can be either the Unix Domain Socket or an IP address depending on how the client wishes to connect to mongos (inside Juju or outside). """ - hosts = {self.unit_host(self.unit)} - for unit in self.peers_units: - hosts.add(self.unit_host(unit)) - - return hosts + return self.get_k8s_mongos_host(self.unit) @staticmethod def _generate_relation_departed_key(rel_id: int) -> str: @@ -485,9 +542,7 @@ def _pull_licenses(container: Container) -> None: for license_name in licenses: try: - license_file = container.pull( - path=Config.get_license_path(license_name) - ) + license_file = container.pull(path=Config.get_license_path(license_name)) f = open(f"LICENSE_{license_name}", "x") f.write(str(license_file.read())) f.close() @@ -504,14 +559,10 @@ def _set_data_dir_permissions(container: Container) -> None: for path in [Config.DATA_DIR]: paths = container.list_files(path, itself=True) if not len(paths) == 1: - raise ExtraDataDirError( - "list_files doesn't return only the directory itself" - ) + raise ExtraDataDirError("list_files doesn't return only the directory itself") logger.debug(f"Data directory ownership: {paths[0].user}:{paths[0].group}") if paths[0].user != Config.UNIX_USER or paths[0].group != Config.UNIX_GROUP: - container.exec( - f"chown {Config.UNIX_USER}:{Config.UNIX_GROUP} -R {path}".split() - ) + container.exec(f"chown {Config.UNIX_USER}:{Config.UNIX_GROUP} -R {path}".split()) def push_file_to_unit( self, @@ -546,18 +597,6 @@ def delete_tls_certificate_from_workload(self) -> None: except PathError as err: logger.debug("Path unavailable: %s (%s)", file, str(err)) - def unit_host(self, unit: Unit) -> str: - """Create a DNS name for a MongoDB unit. - - Args: - unit_name: the juju unit name, e.g. "mongodb/1". - - Returns: - A string representing the hostname of the MongoDB unit. - """ - unit_id = unit.name.split("/")[1] - return f"{self.app.name}-{unit_id}.{self.app.name}-endpoints" - def get_config_server_name(self) -> Optional[str]: """Returns the name of the Juju Application that mongos is using as a config server.""" return self.cluster.get_config_server_name() @@ -614,7 +653,7 @@ def db_initialised(self, value): ) @property - def peers_units(self) -> list[Unit]: + def peers_units(self) -> List[Unit]: """Get peers units in a safe way.""" if not self._peers: return [] @@ -702,7 +741,7 @@ def mongo_config(self) -> MongoConfiguration: @property def mongos_config(self) -> MongoConfiguration: """Generates a MongoDBConfiguration object for mongos in the deployment of MongoDB.""" - hosts = self.get_mongos_hosts() + hosts = self.get_k8s_mongos_hosts() external_ca, _ = self.tls.get_tls_files(internal=False) internal_ca, _ = self.tls.get_tls_files(internal=True) From 1e743748e37556d423e9420bd6e228ee2c70b585 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Fri, 6 Sep 2024 12:20:00 +0000 Subject: [PATCH 02/22] tls is properly updated for nodeport now --- lib/charms/mongodb/v0/mongodb_tls.py | 14 ++---- src/charm.py | 55 +++++++++++++++------ src/node_port.py | 74 ++++++++++++++++++++++++---- 3 files changed, 111 insertions(+), 32 deletions(-) diff --git a/lib/charms/mongodb/v0/mongodb_tls.py b/lib/charms/mongodb/v0/mongodb_tls.py index 27be5e13..ed4af13e 100644 --- a/lib/charms/mongodb/v0/mongodb_tls.py +++ b/lib/charms/mongodb/v0/mongodb_tls.py @@ -282,12 +282,13 @@ def _on_certificate_expiring(self, event: CertificateExpiringEvent) -> None: logger.debug("Generating a new Certificate Signing Request.") key = self.get_tls_secret(internal, Config.TLS.SECRET_KEY_LABEL).encode("utf-8") old_csr = self.get_tls_secret(internal, Config.TLS.SECRET_CSR_LABEL).encode("utf-8") + sans = self.get_new_sans() new_csr = generate_csr( private_key=key, subject=self._get_subject_name(), organization=self._get_subject_name(), - sans=self.get_new_sans(), - sans_ip=[str(self.charm.model.get_binding(self.peer_relation).network.bind_address)], + sans=sans[SANS_DNS_KEY], + sans_ip=sans[SANS_IPS_KEY], ) logger.debug("Requesting a certificate renewal.") @@ -316,16 +317,11 @@ def get_new_sans(self) -> Dict: sans[SANS_IPS_KEY] = [] - if Config.SUBSTRATE == Config.Substrate.VM: + if self.substrate == Config.Substrate.VM: sans[SANS_IPS_KEY].append( str(self.charm.model.get_binding(self.peer_relation).network.bind_address) ) - - elif ( - Config.SUBSTRATE == Config.Substrate.K8S - and self.charm.is_role(Config.Role.MONGOS) - and self.charm.is_external_client - ): + elif self.charm.is_role(Config.Role.MONGOS) and self.charm.is_external_client: sans[SANS_IPS_KEY].append( self.charm.get_ext_mongos_host(self.charm.unit, incl_port=False) ) diff --git a/src/charm.py b/src/charm.py index 6289a87b..ffb16f57 100755 --- a/src/charm.py +++ b/src/charm.py @@ -75,7 +75,9 @@ def __init__(self, *args): # lifecycle events self.framework.observe(self.on.config_changed, self._on_config_changed) - self.framework.observe(self.on.mongos_pebble_ready, self._on_mongos_pebble_ready) + self.framework.observe( + self.on.mongos_pebble_ready, self._on_mongos_pebble_ready + ) self.framework.observe(self.on.start, self._on_start) self.framework.observe(self.on.update_status, self._on_update_status) @@ -144,7 +146,9 @@ def _on_start(self, event: StartEvent) -> None: logger.info( "Missing integration to config-server. mongos cannot run start sequence unless connected to config-server." ) - self.status.set_and_share_status(BlockedStatus("Missing relation to config-server.")) + self.status.set_and_share_status( + BlockedStatus("Missing relation to config-server.") + ) event.defer() return @@ -177,7 +181,9 @@ def _on_update_status(self, _): logger.info( "Missing integration to config-server. mongos cannot run unless connected to config-server." ) - self.status.set_and_share_status(BlockedStatus("Missing relation to config-server.")) + self.status.set_and_share_status( + BlockedStatus("Missing relation to config-server.") + ) return if tls_statuses := self.cluster.get_tls_statuses(): @@ -187,7 +193,9 @@ def _on_update_status(self, _): # restart on high loaded databases can be very slow (e.g. up to 10-20 minutes). if not self.cluster.is_mongos_running(): logger.info("mongos has not started yet") - self.status.set_and_share_status(WaitingStatus("Waiting for mongos to start.")) + self.status.set_and_share_status( + WaitingStatus("Waiting for mongos to start.") + ) return self.status.set_and_share_status(ActiveStatus()) @@ -213,10 +221,15 @@ def set_status_invalid_external_config(self) -> None: def update_external_services(self) -> None: """Update external services based on provided configuration.""" - if self.model.config["expose-external"] == Config.ExternalConnections.EXTERNAL_NODEPORT: + if ( + self.model.config["expose-external"] + == Config.ExternalConnections.EXTERNAL_NODEPORT + ): # every unit attempts to create a nodeport service - if exists, will silently continue self.node_port_manager.apply_service( - service=self.node_port_manager.build_node_port_services(port=Config.MONGOS_PORT) + service=self.node_port_manager.build_node_port_services( + port=Config.MONGOS_PORT + ) ) else: self.node_port_manager.delete_unit_service() @@ -226,7 +239,9 @@ def update_external_services(self) -> None: def update_tls_sans(self) -> None: current_sans = self.tls.get_current_sans() current_sans_ip = set(current_sans["sans_ips"]) if current_sans else set() - expected_sans_ip = set(self.tls.get_new_sans()["sans_ips"]) if current_sans else set() + expected_sans_ip = ( + set(self.tls.get_new_sans()["sans_ips"]) if current_sans else set() + ) sans_ip_changed = current_sans_ip ^ expected_sans_ip if not sans_ip_changed: @@ -273,7 +288,9 @@ def get_keyfile_contents(self) -> str | None: def is_integrated_to_config_server(self) -> bool: """Returns True if the mongos application is integrated to a config-server.""" - return self.model.get_relation(Config.Relations.CLUSTER_RELATIONS_NAME) is not None + return ( + self.model.get_relation(Config.Relations.CLUSTER_RELATIONS_NAME) is not None + ) def get_secret(self, scope: str, key: str) -> Optional[str]: """Get secret from the secret storage.""" @@ -315,7 +332,9 @@ def remove_secret(self, scope, key) -> None: content = secret.get_content() if not content.get(key) or content[key] == Config.Secrets.SECRET_DELETED_LABEL: - logger.error(f"Non-existing secret {scope}:{key} was attempted to be removed.") + logger.error( + f"Non-existing secret {scope}:{key} was attempted to be removed." + ) return content[key] = Config.Secrets.SECRET_DELETED_LABEL @@ -342,7 +361,9 @@ def set_database(self, database: str) -> None: return # a mongos shard can only be related to one config server - config_server_rel = self.model.relations[Config.Relations.CLUSTER_RELATIONS_NAME][0] + config_server_rel = self.model.relations[ + Config.Relations.CLUSTER_RELATIONS_NAME + ][0] self.cluster.database_requires.update_relation_data( config_server_rel.id, {DATABASE_TAG: database} ) @@ -412,7 +433,7 @@ def get_k8s_mongos_hosts(self) -> Set: def get_ext_mongos_host(self, unit: Unit, incl_port=True) -> str | None: """Returns the ext hosts for mongos on the provided unit.""" - if self.is_external_client: + if not self.is_external_client: return None unit_ip = self.node_port_manager.get_node_ip(unit.name) @@ -542,7 +563,9 @@ def _pull_licenses(container: Container) -> None: for license_name in licenses: try: - license_file = container.pull(path=Config.get_license_path(license_name)) + license_file = container.pull( + path=Config.get_license_path(license_name) + ) f = open(f"LICENSE_{license_name}", "x") f.write(str(license_file.read())) f.close() @@ -559,10 +582,14 @@ def _set_data_dir_permissions(container: Container) -> None: for path in [Config.DATA_DIR]: paths = container.list_files(path, itself=True) if not len(paths) == 1: - raise ExtraDataDirError("list_files doesn't return only the directory itself") + raise ExtraDataDirError( + "list_files doesn't return only the directory itself" + ) logger.debug(f"Data directory ownership: {paths[0].user}:{paths[0].group}") if paths[0].user != Config.UNIX_USER or paths[0].group != Config.UNIX_GROUP: - container.exec(f"chown {Config.UNIX_USER}:{Config.UNIX_GROUP} -R {path}".split()) + container.exec( + f"chown {Config.UNIX_USER}:{Config.UNIX_GROUP} -R {path}".split() + ) def push_file_to_unit( self, diff --git a/src/node_port.py b/src/node_port.py index bb9333b7..84a8fffb 100644 --- a/src/node_port.py +++ b/src/node_port.py @@ -4,13 +4,14 @@ """Manager for handling mongos Kubernetes resources for a single mongos pod.""" +from typing import Optional import logging from functools import cached_property from ops.charm import CharmBase from lightkube.models.meta_v1 import ObjectMeta, OwnerReference from lightkube.core.client import Client from lightkube.core.exceptions import ApiError -from lightkube.resources.core_v1 import Pod, Service +from lightkube.resources.core_v1 import Pod, Service, Node from lightkube.models.core_v1 import ServicePort, ServiceSpec from ops.model import BlockedStatus @@ -59,14 +60,14 @@ def get_pod(self, pod_name: str = "") -> Pod: name=pod_name or self.pod_name, ) - def get_unit_service_name(self) -> str: + def get_unit_service_name(self, unit_name) -> str: """Returns the service name for the current unit.""" - unit_id = self.charm.unit.name.split("/")[1] - return f"{self.app_name}-{unit_id}-external" + unit_name = unit_name.replace("/", "-") + return f"{unit_name}-external" - def get_unit_service(self) -> Service | None: + def get_unit_service(self, unit_name) -> Service | None: """Gets the Service via the K8s API for the current unit.""" - return self.get_service(self.get_unit_service_name()) + return self.get_service(self.get_unit_service_name(unit_name)) # END: getters @@ -86,7 +87,7 @@ def build_node_port_services(self, port: str) -> Service: return Service( metadata=ObjectMeta( - name=self.get_unit_service_name(), + name=self.get_unit_service_name(self.charm.unit.name), namespace=self.namespace, # When we scale-down K8s will keep the Services for the deleted units around, # unless the Services' owner is also deleted. @@ -134,11 +135,11 @@ def apply_service(self, service: Service) -> None: def delete_unit_service(self) -> None: """Deletes a unit Service, if it exists.""" try: - service = self.get_unit_service() + service = self.get_unit_service(unit_name=self.charm.unit.name) except ApiError as e: if e.status.code == 404: logger.debug( - f"Could not find {self.get_unit_service_name()} to delete." + f"Could not find {self.get_unit_service_name(self.charm.unit.name)} to delete." ) return @@ -154,4 +155,59 @@ def delete_unit_service(self) -> None: else: raise + def _node_name(self, unit_name: str) -> str: + """Return the node name for this unit's pod ip.""" + try: + pod = self.client.get( + Pod, + name=unit_name.replace("/", "-"), + namespace=self.namespace, + ) + except ApiError as e: + if e.status.code == 403: + self.on_deployed_without_trust() + return + + return pod.spec.nodeName + + def get_node_ip(self, unit_name: str) -> Optional[str]: + """Return node IP for the provided unit.""" + try: + node = self.client.get( + Node, + name=self._node_name(unit_name), + namespace=self.namespace, + ) + except ApiError as e: + if e.status.code == 403: + logger.error("Could not delete service, application needs `juju trust`") + self.on_deployed_without_trust() + return + # [ + # NodeAddress(address='192.168.0.228', type='InternalIP'), + # NodeAddress(address='example.com', type='Hostname') + # ] + # Remember that OpenStack, for example, will return an internal hostname, which is not + # accessible from the outside. Give preference to ExternalIP, then InternalIP first + # Separated, as we want to give preference to ExternalIP, InternalIP and then Hostname + for typ in ["ExternalIP", "InternalIP", "Hostname"]: + for a in node.status.addresses: + if a.type == typ: + return a.address + + def get_node_port(self, port_to_match: int, unit_name: str) -> int: + """Return node port for the provided port to match.""" + service = self.get_unit_service(unit_name=unit_name) + + if not service or not service.spec.type == "NodePort": + raise Exception("No service found for port.") + + for svc_port in service.spec.ports: + if svc_port.port == 27018: + return svc_port.nodePort + + raise Exception( + f"Unable to find NodePort for {port_to_match} for the {service} service" + ) + # END: helpers From fee56a859dce9079488c5f29f6eea038c1410fc4 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Fri, 6 Sep 2024 12:34:04 +0000 Subject: [PATCH 03/22] re-request TLS certs on IP change --- src/charm.py | 58 +++++++++++++++------------------------------------- 1 file changed, 17 insertions(+), 41 deletions(-) diff --git a/src/charm.py b/src/charm.py index ffb16f57..85295ea6 100755 --- a/src/charm.py +++ b/src/charm.py @@ -75,9 +75,7 @@ def __init__(self, *args): # lifecycle events self.framework.observe(self.on.config_changed, self._on_config_changed) - self.framework.observe( - self.on.mongos_pebble_ready, self._on_mongos_pebble_ready - ) + self.framework.observe(self.on.mongos_pebble_ready, self._on_mongos_pebble_ready) self.framework.observe(self.on.start, self._on_start) self.framework.observe(self.on.update_status, self._on_update_status) @@ -146,9 +144,7 @@ def _on_start(self, event: StartEvent) -> None: logger.info( "Missing integration to config-server. mongos cannot run start sequence unless connected to config-server." ) - self.status.set_and_share_status( - BlockedStatus("Missing relation to config-server.") - ) + self.status.set_and_share_status(BlockedStatus("Missing relation to config-server.")) event.defer() return @@ -181,9 +177,7 @@ def _on_update_status(self, _): logger.info( "Missing integration to config-server. mongos cannot run unless connected to config-server." ) - self.status.set_and_share_status( - BlockedStatus("Missing relation to config-server.") - ) + self.status.set_and_share_status(BlockedStatus("Missing relation to config-server.")) return if tls_statuses := self.cluster.get_tls_statuses(): @@ -193,16 +187,17 @@ def _on_update_status(self, _): # restart on high loaded databases can be very slow (e.g. up to 10-20 minutes). if not self.cluster.is_mongos_running(): logger.info("mongos has not started yet") - self.status.set_and_share_status( - WaitingStatus("Waiting for mongos to start.") - ) + self.status.set_and_share_status(WaitingStatus("Waiting for mongos to start.")) return + # in the case external connectivity it is possible for public K8s endpoint to be updated + # in that case we must update our sans in TLS. K8s endpoints is not tracked by Juju. + self.update_tls_sans() + self.status.set_and_share_status(ActiveStatus()) # END: hook functions - # BEGIN: helper functions def is_user_external_config_valid(self) -> bool: """Returns True if the user set external config is valid.""" return ( @@ -221,15 +216,10 @@ def set_status_invalid_external_config(self) -> None: def update_external_services(self) -> None: """Update external services based on provided configuration.""" - if ( - self.model.config["expose-external"] - == Config.ExternalConnections.EXTERNAL_NODEPORT - ): + if self.model.config["expose-external"] == Config.ExternalConnections.EXTERNAL_NODEPORT: # every unit attempts to create a nodeport service - if exists, will silently continue self.node_port_manager.apply_service( - service=self.node_port_manager.build_node_port_services( - port=Config.MONGOS_PORT - ) + service=self.node_port_manager.build_node_port_services(port=Config.MONGOS_PORT) ) else: self.node_port_manager.delete_unit_service() @@ -239,9 +229,7 @@ def update_external_services(self) -> None: def update_tls_sans(self) -> None: current_sans = self.tls.get_current_sans() current_sans_ip = set(current_sans["sans_ips"]) if current_sans else set() - expected_sans_ip = ( - set(self.tls.get_new_sans()["sans_ips"]) if current_sans else set() - ) + expected_sans_ip = set(self.tls.get_new_sans()["sans_ips"]) if current_sans else set() sans_ip_changed = current_sans_ip ^ expected_sans_ip if not sans_ip_changed: @@ -288,9 +276,7 @@ def get_keyfile_contents(self) -> str | None: def is_integrated_to_config_server(self) -> bool: """Returns True if the mongos application is integrated to a config-server.""" - return ( - self.model.get_relation(Config.Relations.CLUSTER_RELATIONS_NAME) is not None - ) + return self.model.get_relation(Config.Relations.CLUSTER_RELATIONS_NAME) is not None def get_secret(self, scope: str, key: str) -> Optional[str]: """Get secret from the secret storage.""" @@ -332,9 +318,7 @@ def remove_secret(self, scope, key) -> None: content = secret.get_content() if not content.get(key) or content[key] == Config.Secrets.SECRET_DELETED_LABEL: - logger.error( - f"Non-existing secret {scope}:{key} was attempted to be removed." - ) + logger.error(f"Non-existing secret {scope}:{key} was attempted to be removed.") return content[key] = Config.Secrets.SECRET_DELETED_LABEL @@ -361,9 +345,7 @@ def set_database(self, database: str) -> None: return # a mongos shard can only be related to one config server - config_server_rel = self.model.relations[ - Config.Relations.CLUSTER_RELATIONS_NAME - ][0] + config_server_rel = self.model.relations[Config.Relations.CLUSTER_RELATIONS_NAME][0] self.cluster.database_requires.update_relation_data( config_server_rel.id, {DATABASE_TAG: database} ) @@ -563,9 +545,7 @@ def _pull_licenses(container: Container) -> None: for license_name in licenses: try: - license_file = container.pull( - path=Config.get_license_path(license_name) - ) + license_file = container.pull(path=Config.get_license_path(license_name)) f = open(f"LICENSE_{license_name}", "x") f.write(str(license_file.read())) f.close() @@ -582,14 +562,10 @@ def _set_data_dir_permissions(container: Container) -> None: for path in [Config.DATA_DIR]: paths = container.list_files(path, itself=True) if not len(paths) == 1: - raise ExtraDataDirError( - "list_files doesn't return only the directory itself" - ) + raise ExtraDataDirError("list_files doesn't return only the directory itself") logger.debug(f"Data directory ownership: {paths[0].user}:{paths[0].group}") if paths[0].user != Config.UNIX_USER or paths[0].group != Config.UNIX_GROUP: - container.exec( - f"chown {Config.UNIX_USER}:{Config.UNIX_GROUP} -R {path}".split() - ) + container.exec(f"chown {Config.UNIX_USER}:{Config.UNIX_GROUP} -R {path}".split()) def push_file_to_unit( self, From e1fa06b06d7e32b544df46a33e6e3b65adb3d341 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Fri, 6 Sep 2024 15:14:37 +0000 Subject: [PATCH 04/22] add int tests to verify node port with TLS is functional --- lib/charms/mongodb/v0/mongodb_tls.py | 20 +++++++-- tests/integration/tls/helpers.py | 62 +++++++++++++--------------- tests/integration/tls/test_tls.py | 44 +++++++++++++++++--- 3 files changed, 83 insertions(+), 43 deletions(-) diff --git a/lib/charms/mongodb/v0/mongodb_tls.py b/lib/charms/mongodb/v0/mongodb_tls.py index ed4af13e..fdedf113 100644 --- a/lib/charms/mongodb/v0/mongodb_tls.py +++ b/lib/charms/mongodb/v0/mongodb_tls.py @@ -1,4 +1,4 @@ -# Copyright 2023 Canonical Ltd. +# Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. """In this class we manage client database relations. @@ -27,6 +27,7 @@ from ops.framework import Object from ops.model import ActiveStatus, MaintenanceStatus, Unit, WaitingStatus + from config import Config UNIT_SCOPE = Config.Relations.UNIT_SCOPE @@ -38,11 +39,11 @@ LIBID = "e02a50f0795e4dd292f58e93b4f493dd" # Increment this major API version when introducing breaking changes -LIBAPI = 0 +LIBAPI = 1 # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 15 +LIBPATCH = 3 logger = logging.getLogger(__name__) @@ -163,12 +164,18 @@ def _on_tls_relation_joined(self, event: RelationJoinedEvent) -> None: def _on_tls_relation_broken(self, event: RelationBrokenEvent) -> None: """Disable TLS when TLS relation broken.""" - logger.debug("Disabling external and internal TLS for unit: %s", self.charm.unit.name) + if not self.charm.db_initialised: + logger.info("Deferring %s. db is not initialised.", str(type(event))) + event.defer() + return + if self.charm.upgrade_in_progress: logger.warning( "Disabling TLS is not supported during an upgrade. The charm may be in a broken, unrecoverable state." ) + logger.debug("Disabling external and internal TLS for unit: %s", self.charm.unit.name) + for internal in [True, False]: self.set_tls_secret(internal, Config.TLS.SECRET_CA_LABEL, None) self.set_tls_secret(internal, Config.TLS.SECRET_CERT_LABEL, None) @@ -193,6 +200,11 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None: event.defer() return + if not self.charm.db_initialised: + logger.info("Deferring %s. db is not initialised.", str(type(event))) + event.defer() + return + int_csr = self.get_tls_secret(internal=True, label_name=Config.TLS.SECRET_CSR_LABEL) ext_csr = self.get_tls_secret(internal=False, label_name=Config.TLS.SECRET_CSR_LABEL) diff --git a/tests/integration/tls/helpers.py b/tests/integration/tls/helpers.py index f49696d3..ffa60085 100644 --- a/tests/integration/tls/helpers.py +++ b/tests/integration/tls/helpers.py @@ -11,7 +11,7 @@ import json import ops import logging - +from ops.model import Unit logger = logging.getLogger(__name__) @@ -136,6 +136,22 @@ async def check_tls(ops_test, unit, enabled) -> None: return False +async def get_sans_ips(ops_test: OpsTest, unit: Unit, internal: bool) -> str: + """Retrieves the sans for the for mongos on the provided unit.""" + cert_name = "internal" if internal else "external" + get_sans_cmd = ( + f"openssl x509 -in /etc/mongod/{cert_name}-cert.pem -noout -text | grep Address" + ) + complete_command = f"ssh --container mongos {unit.name} {get_sans_cmd}" + _, result, _ = await ops_test.juju(*complete_command.split()) + + if "IP Address:" not in result: + return "" + + ip_addresses = result.split("IP Address:")[1] + return ip_addresses + + async def time_file_created(ops_test: OpsTest, unit_name: str, path: str) -> int: """Returns the unix timestamp of when a file was created on a specified unit.""" time_cmd = f"ls -l --time-style=full-iso {path} " @@ -154,9 +170,7 @@ async def time_file_created(ops_test: OpsTest, unit_name: str, path: str) -> int return process_ls_time(ls_output) -async def time_process_started( - ops_test: OpsTest, unit_name: str, process_name: str -) -> int: +async def time_process_started(ops_test: OpsTest, unit_name: str, process_name: str) -> int: """Retrieves the time that a given process started according to systemd.""" logs = await run_command_on_unit(ops_test, unit_name, "/charm/bin/pebble changes") @@ -226,9 +240,7 @@ async def check_certs_correctly_distributed( ][0] # Read the content of the cert file stored in the unit - cert_file_content = await get_file_content( - ops_test, unit.name, cert_path, tmpdir - ) + cert_file_content = await get_file_content(ops_test, unit.name, cert_path, tmpdir) # Get the external cert value from the relation relation_cert = "\n".join(tls_item["chain"]).strip() @@ -260,9 +272,7 @@ async def scp_file_preserve_ctime( return f"{filename}" -async def get_file_content( - ops_test: OpsTest, unit_name: str, path: str, tmpdir: Path -) -> str: +async def get_file_content(ops_test: OpsTest, unit_name: str, path: str, tmpdir: Path) -> str: filename = await scp_file_preserve_ctime(ops_test, unit_name, path, tmpdir) with open(filename, mode="r") as fd: @@ -341,9 +351,7 @@ async def rotate_and_verify_certs(ops_test: OpsTest, app: str, tmpdir: Path) -> original_tls_info[unit.name]["mongos_service"] = await time_process_started( ops_test, unit.name, MONGOS_SERVICE ) - await check_certs_correctly_distributed( - ops_test, unit, app_name=app, tmpdir=tmpdir - ) + await check_certs_correctly_distributed(ops_test, unit, app_name=app, tmpdir=tmpdir) # set external and internal key using auto-generated key for each unit for unit in ops_test.model.applications[app].units: @@ -353,32 +361,18 @@ async def rotate_and_verify_certs(ops_test: OpsTest, app: str, tmpdir: Path) -> # wait for certificate to be available and processed. Can get receive two certificate # available events and restart twice so we want to ensure we are idle for at least 1 minute - await ops_test.model.wait_for_idle( - apps=[app], status="active", timeout=1000, idle_period=60 - ) + await ops_test.model.wait_for_idle(apps=[app], status="active", timeout=1000, idle_period=60) # After updating both the external key and the internal key a new certificate request will be # made; then the certificates should be available and updated. for unit in ops_test.model.applications[app].units: - new_external_cert = await get_file_content( - ops_test, unit.name, EXTERNAL_CERT_PATH, tmpdir - ) - new_internal_cert = await get_file_content( - ops_test, unit.name, INTERNAL_CERT_PATH, tmpdir - ) - new_external_cert_time = await time_file_created( - ops_test, unit.name, EXTERNAL_CERT_PATH - ) - new_internal_cert_time = await time_file_created( - ops_test, unit.name, INTERNAL_CERT_PATH - ) - new_mongos_service_time = await time_process_started( - ops_test, unit.name, MONGOS_SERVICE - ) + new_external_cert = await get_file_content(ops_test, unit.name, EXTERNAL_CERT_PATH, tmpdir) + new_internal_cert = await get_file_content(ops_test, unit.name, INTERNAL_CERT_PATH, tmpdir) + new_external_cert_time = await time_file_created(ops_test, unit.name, EXTERNAL_CERT_PATH) + new_internal_cert_time = await time_file_created(ops_test, unit.name, INTERNAL_CERT_PATH) + new_mongos_service_time = await time_process_started(ops_test, unit.name, MONGOS_SERVICE) - await check_certs_correctly_distributed( - ops_test, unit, app_name=app, tmpdir=tmpdir - ) + await check_certs_correctly_distributed(ops_test, unit, app_name=app, tmpdir=tmpdir) assert ( new_external_cert != original_tls_info[unit.name]["external_cert_contents"] ), "external cert not rotated" diff --git a/tests/integration/tls/test_tls.py b/tests/integration/tls/test_tls.py index 543d4702..8033037f 100644 --- a/tests/integration/tls/test_tls.py +++ b/tests/integration/tls/test_tls.py @@ -19,7 +19,9 @@ MONGOS_APP_NAME, CERTS_APP_NAME, rotate_and_verify_certs, + get_sans_ips, ) +from ..client_relations.helpers import get_public_k8s_ip MONGOS_SERVICE = "mongos.service" @@ -61,6 +63,41 @@ async def test_mongos_tls_enabled(ops_test: OpsTest) -> None: await check_mongos_tls_enabled(ops_test) +@pytest.mark.group(1) +@pytest.mark.abort_on_fail +async def test_mongos_tls_nodeport(ops_test: OpsTest): + """Tests that TLS is stable""" + # test that charm can enable nodeport without breaking mongos or accidently disabling TLS + await ops_test.model.applications[MONGOS_APP_NAME].set_config({"expose-external": "nodeport"}) + await ops_test.model.wait_for_idle( + apps=[MONGOS_APP_NAME], + idle_period=30, + status="active", + timeout=TIMEOUT, + ) + await check_mongos_tls_enabled(ops_test) + + # check for expected IP addresses in the pem file + for unit in ops_test.model.applications[MONGOS_APP_NAME].units: + assert get_public_k8s_ip() in await get_sans_ips(ops_test, unit, internal=True) + assert get_public_k8s_ip() in await get_sans_ips(ops_test, unit, internal=False) + + # test that charm can disable nodeport without breaking mongos or accidently disabling TLS + await ops_test.model.applications[MONGOS_APP_NAME].set_config({"expose-external": "none"}) + await ops_test.model.wait_for_idle( + apps=[MONGOS_APP_NAME], + idle_period=30, + status="active", + timeout=TIMEOUT, + ) + await check_mongos_tls_enabled(ops_test) + + # check for no IP addresses in the pem file + for unit in ops_test.model.applications[MONGOS_APP_NAME].units: + assert get_public_k8s_ip() not in await get_sans_ips(ops_test, unit, internal=True) + assert get_public_k8s_ip() not in await get_sans_ips(ops_test, unit, internal=False) + + @pytest.mark.group(1) @pytest.mark.abort_on_fail async def test_mongos_rotate_certs(ops_test: OpsTest, tmp_path: Path) -> None: @@ -99,16 +136,13 @@ async def test_mongos_tls_ca_mismatch(ops_test: OpsTest) -> None: CERTS_APP_NAME, application_name=DIFFERENT_CERTS_APP_NAME, channel="stable" ) await ops_test.model.wait_for_idle( - apps=[DIFFERENT_CERTS_APP_NAME], + apps=[MONGOS_APP_NAME], idle_period=10, raise_on_blocked=False, - status="active", timeout=TIMEOUT, ) - await toggle_tls_mongos( - ops_test, enable=True, certs_app_name=DIFFERENT_CERTS_APP_NAME - ) + await toggle_tls_mongos(ops_test, enable=True, certs_app_name=DIFFERENT_CERTS_APP_NAME) await wait_for_mongos_units_blocked( ops_test, MONGOS_APP_NAME, From 232a68cc3a26a067320f615363e86771c96bbed2 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Sat, 7 Sep 2024 10:39:57 +0000 Subject: [PATCH 05/22] Finish TLS work for nodeport prevent unnessary hooks from firing + request only certificates that are needed --- lib/charms/mongodb/v0/mongodb_tls.py | 55 ++++++++++++++++------- src/charm.py | 67 ++++++++++++++++------------ 2 files changed, 78 insertions(+), 44 deletions(-) diff --git a/lib/charms/mongodb/v0/mongodb_tls.py b/lib/charms/mongodb/v0/mongodb_tls.py index fdedf113..e532bb01 100644 --- a/lib/charms/mongodb/v0/mongodb_tls.py +++ b/lib/charms/mongodb/v0/mongodb_tls.py @@ -39,12 +39,14 @@ LIBID = "e02a50f0795e4dd292f58e93b4f493dd" # Increment this major API version when introducing breaking changes -LIBAPI = 1 +LIBAPI = 0 # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version LIBPATCH = 3 +WAIT_CERT_UPDATE = "wait-cert-updated" + logger = logging.getLogger(__name__) @@ -104,6 +106,9 @@ def request_certificate( internal: bool, ): """Request TLS certificate.""" + if not self.charm.model.get_relation(Config.TLS.TLS_PEER_RELATION): + return + if param is None: key = generate_private_key() else: @@ -124,9 +129,8 @@ def request_certificate( label = "int" if internal else "ext" self.charm.unit_peer_data[f"{label}_certs_subject"] = self._get_subject_name() self.charm.unit_peer_data[f"{label}_certs_subject"] = self._get_subject_name() - - if self.charm.model.get_relation(Config.TLS.TLS_PEER_RELATION): - self.certs.request_certificate_creation(certificate_signing_request=csr) + self.certs.request_certificate_creation(certificate_signing_request=csr) + self.set_waiting_for_cert_to_update(internal=internal, waiting=True) @staticmethod def _parse_tls_file(raw_content: str) -> bytes: @@ -225,12 +229,13 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None: ) self.set_tls_secret(internal, Config.TLS.SECRET_CERT_LABEL, event.certificate) self.set_tls_secret(internal, Config.TLS.SECRET_CA_LABEL, event.ca) + self.set_waiting_for_cert_to_update(waiting=False, internal=internal) if self.charm.is_role(Config.Role.CONFIG_SERVER) and internal: self.charm.cluster.update_ca_secret(new_ca=event.ca) self.charm.config_server.update_ca_secret(new_ca=event.ca) - if self.waiting_for_certs(): + if self.waiting_for_both_certs(): logger.debug( "Defer till both internal and external TLS certificates available to avoid second restart." ) @@ -252,7 +257,7 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None: # clear waiting status if db service is ready self.charm.status.set_and_share_status(ActiveStatus()) - def waiting_for_certs(self): + def waiting_for_both_certs(self): """Returns a boolean indicating whether additional certs are needed.""" if not self.get_tls_secret(internal=True, label_name=Config.TLS.SECRET_CERT_LABEL): logger.debug("Waiting for internal certificate.") @@ -340,12 +345,13 @@ def get_new_sans(self) -> Dict: return sans - def get_current_sans(self) -> List | None: + def get_current_sans(self, internal: bool) -> List | None: """Gets the current SANs for the unit cert.""" # if unit has no certificates do not proceed. - if not self.is_tls_enabled(internal=True) or not self.is_tls_enabled(internal=False): + if not self.is_tls_enabled(internal=internal): return + pem_file = Config.TLS.INT_PEM_FILE if internal else Config.TLS.EXT_PEM_FILE command = [ "openssl", "x509", @@ -353,7 +359,7 @@ def get_current_sans(self) -> List | None: "-ext", "subjectAltName", "-in", - Config.TLS.EXT_PEM_FILE, + pem_file, ] try: @@ -407,13 +413,6 @@ def get_tls_files(self, internal: bool) -> Tuple[Optional[str], Optional[str]]: return ca_file, pem_file - def get_host(self, unit: Unit): - """Retrieves the hostname of the unit based on the substrate.""" - if self.substrate == "vm": - return self.charm.unit_ip(unit) - else: - return self.charm.get_hostname_for_unit(unit) - def set_tls_secret(self, internal: bool, label_name: str, contents: str) -> None: """Sets TLS secret, based on whether or not it is related to internal connections.""" scope = "int" if internal else "ext" @@ -437,3 +436,27 @@ def _get_subject_name(self) -> str: return self.charm.get_config_server_name() or self.charm.app.name return self.charm.app.name + + def is_set_waiting_for_cert_to_update( + self, + internal=False, + ) -> bool: + """Returns True we are waiting for a cert to update.""" + if internal: + json.loads(self.charm.app_peer_data.get(WAIT_CERT_UPDATE, "false")) + else: + json.loads(self.charm.unit_peer_data.get(WAIT_CERT_UPDATE, "false")) + + def set_waiting_for_cert_to_update( + self, + waiting: bool, + internal: bool, + ): + """Sets a boolean indicator, indicating whether or not we are waiting for a cert to update.""" + if internal and not self.charm.unit.is_leader(): + return + + if internal: + self.charm.app_peer_data[WAIT_CERT_UPDATE] = json.dumps(waiting) + else: + self.charm.unit_peer_data[WAIT_CERT_UPDATE] = json.dumps(waiting) diff --git a/src/charm.py b/src/charm.py index 85295ea6..fd97925b 100755 --- a/src/charm.py +++ b/src/charm.py @@ -227,38 +227,49 @@ def update_external_services(self) -> None: self.expose_external = self.model.config["expose-external"] def update_tls_sans(self) -> None: - current_sans = self.tls.get_current_sans() - current_sans_ip = set(current_sans["sans_ips"]) if current_sans else set() - expected_sans_ip = set(self.tls.get_new_sans()["sans_ips"]) if current_sans else set() - sans_ip_changed = current_sans_ip ^ expected_sans_ip + """Emits a certificate expiring event when sans in current certificates are out of date. - if not sans_ip_changed: - return + This can occur for a variety of reasons: + 1. Node port has been toggled on + 2. Node port has been toggled off + 3. The public K8s IP has changed + """ - logger.info( - ( - f'Broker {self.unit.name.split("/")[1]} updating certificate SANs - ' - f"OLD SANs = {current_sans_ip - expected_sans_ip}, " - f"NEW SANs = {expected_sans_ip - current_sans_ip}" - ) - ) + for internal in [True, False]: + if internal and not self.unit.is_leader(): + continue - # question for Marc, why not also internal certs - self.tls.certs.on.certificate_expiring.emit( - certificate=self.tls.get_tls_secret( - internal=True, label_name=Config.TLS.SECRET_CERT_LABEL - ), - expiry=datetime.now().isoformat(), - ) - self.tls.certs.on.certificate_expiring.emit( - certificate=self.tls.get_tls_secret( - internal=False, label_name=Config.TLS.SECRET_CERT_LABEL - ), - expiry=datetime.now().isoformat(), - ) + # if the certificate has already been requested, we do not want to re-request + # another one and lead to an infinite chain of certificate events. + if self.tls.is_set_waiting_for_cert_to_update(internal): + continue - # TODO ensures only single requested new certs, will be replaced on new certificate-available event - # self.tls.certificate = False + current_sans = self.tls.get_current_sans(internal) + current_sans_ip = set(current_sans["sans_ips"]) if current_sans else set() + expected_sans_ip = set(self.tls.get_new_sans()["sans_ips"]) if current_sans else set() + sans_ip_changed = current_sans_ip ^ expected_sans_ip + + if not sans_ip_changed: + continue + + logger.info( + ( + f'Broker {self.unit.name.split("/")[1]} updating certificate SANs - ' + f"OLD SANs = {current_sans_ip - expected_sans_ip}, " + f"NEW SANs = {expected_sans_ip - current_sans_ip}" + ) + ) + + self.tls.certs.on.certificate_expiring.emit( + certificate=self.tls.get_tls_secret( + internal=internal, label_name=Config.TLS.SECRET_CERT_LABEL + ), + expiry=datetime.now().isoformat(), + ) + # without this, it is possible that we constantly request new certificates + # over and over again - blocking the event queue from processing the expired + # certificate event. + self.tls.set_waiting_for_cert_to_update(waiting=True, internal=internal) def get_keyfile_contents(self) -> str | None: """Retrieves the contents of the keyfile on host machine.""" From 6f3fb2af2314adbf849c09512a90c932fa472bfa Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Sat, 7 Sep 2024 10:47:33 +0000 Subject: [PATCH 06/22] tests for tls and nodeport functional --- tests/integration/helpers.py | 2 +- tests/integration/tls/helpers.py | 48 ++++++++++++++++++++++--------- tests/integration/tls/test_tls.py | 31 +++++++++++++++----- 3 files changed, 59 insertions(+), 22 deletions(-) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index d0610576..aa356b40 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -193,7 +193,7 @@ async def deploy_cluster_components(ops_test: OpsTest) -> None: await ops_test.model.wait_for_idle( apps=[MONGOS_APP_NAME, SHARD_APP_NAME, CONFIG_SERVER_APP_NAME], - idle_period=10, + idle_period=30, raise_on_blocked=False, raise_on_error=False, # Removed this once DPE-4996 is resolved. ) diff --git a/tests/integration/tls/helpers.py b/tests/integration/tls/helpers.py index ffa60085..a5eba6af 100644 --- a/tests/integration/tls/helpers.py +++ b/tests/integration/tls/helpers.py @@ -139,9 +139,7 @@ async def check_tls(ops_test, unit, enabled) -> None: async def get_sans_ips(ops_test: OpsTest, unit: Unit, internal: bool) -> str: """Retrieves the sans for the for mongos on the provided unit.""" cert_name = "internal" if internal else "external" - get_sans_cmd = ( - f"openssl x509 -in /etc/mongod/{cert_name}-cert.pem -noout -text | grep Address" - ) + get_sans_cmd = f"openssl x509 -in /etc/mongod/{cert_name}-cert.pem -noout -text | grep Address" complete_command = f"ssh --container mongos {unit.name} {get_sans_cmd}" _, result, _ = await ops_test.juju(*complete_command.split()) @@ -170,7 +168,9 @@ async def time_file_created(ops_test: OpsTest, unit_name: str, path: str) -> int return process_ls_time(ls_output) -async def time_process_started(ops_test: OpsTest, unit_name: str, process_name: str) -> int: +async def time_process_started( + ops_test: OpsTest, unit_name: str, process_name: str +) -> int: """Retrieves the time that a given process started according to systemd.""" logs = await run_command_on_unit(ops_test, unit_name, "/charm/bin/pebble changes") @@ -240,7 +240,9 @@ async def check_certs_correctly_distributed( ][0] # Read the content of the cert file stored in the unit - cert_file_content = await get_file_content(ops_test, unit.name, cert_path, tmpdir) + cert_file_content = await get_file_content( + ops_test, unit.name, cert_path, tmpdir + ) # Get the external cert value from the relation relation_cert = "\n".join(tls_item["chain"]).strip() @@ -272,7 +274,9 @@ async def scp_file_preserve_ctime( return f"{filename}" -async def get_file_content(ops_test: OpsTest, unit_name: str, path: str, tmpdir: Path) -> str: +async def get_file_content( + ops_test: OpsTest, unit_name: str, path: str, tmpdir: Path +) -> str: filename = await scp_file_preserve_ctime(ops_test, unit_name, path, tmpdir) with open(filename, mode="r") as fd: @@ -351,7 +355,9 @@ async def rotate_and_verify_certs(ops_test: OpsTest, app: str, tmpdir: Path) -> original_tls_info[unit.name]["mongos_service"] = await time_process_started( ops_test, unit.name, MONGOS_SERVICE ) - await check_certs_correctly_distributed(ops_test, unit, app_name=app, tmpdir=tmpdir) + await check_certs_correctly_distributed( + ops_test, unit, app_name=app, tmpdir=tmpdir + ) # set external and internal key using auto-generated key for each unit for unit in ops_test.model.applications[app].units: @@ -361,18 +367,32 @@ async def rotate_and_verify_certs(ops_test: OpsTest, app: str, tmpdir: Path) -> # wait for certificate to be available and processed. Can get receive two certificate # available events and restart twice so we want to ensure we are idle for at least 1 minute - await ops_test.model.wait_for_idle(apps=[app], status="active", timeout=1000, idle_period=60) + await ops_test.model.wait_for_idle( + apps=[app], status="active", timeout=1000, idle_period=60 + ) # After updating both the external key and the internal key a new certificate request will be # made; then the certificates should be available and updated. for unit in ops_test.model.applications[app].units: - new_external_cert = await get_file_content(ops_test, unit.name, EXTERNAL_CERT_PATH, tmpdir) - new_internal_cert = await get_file_content(ops_test, unit.name, INTERNAL_CERT_PATH, tmpdir) - new_external_cert_time = await time_file_created(ops_test, unit.name, EXTERNAL_CERT_PATH) - new_internal_cert_time = await time_file_created(ops_test, unit.name, INTERNAL_CERT_PATH) - new_mongos_service_time = await time_process_started(ops_test, unit.name, MONGOS_SERVICE) + new_external_cert = await get_file_content( + ops_test, unit.name, EXTERNAL_CERT_PATH, tmpdir + ) + new_internal_cert = await get_file_content( + ops_test, unit.name, INTERNAL_CERT_PATH, tmpdir + ) + new_external_cert_time = await time_file_created( + ops_test, unit.name, EXTERNAL_CERT_PATH + ) + new_internal_cert_time = await time_file_created( + ops_test, unit.name, INTERNAL_CERT_PATH + ) + new_mongos_service_time = await time_process_started( + ops_test, unit.name, MONGOS_SERVICE + ) - await check_certs_correctly_distributed(ops_test, unit, app_name=app, tmpdir=tmpdir) + await check_certs_correctly_distributed( + ops_test, unit, app_name=app, tmpdir=tmpdir + ) assert ( new_external_cert != original_tls_info[unit.name]["external_cert_contents"] ), "external cert not rotated" diff --git a/tests/integration/tls/test_tls.py b/tests/integration/tls/test_tls.py index 8033037f..041f61d3 100644 --- a/tests/integration/tls/test_tls.py +++ b/tests/integration/tls/test_tls.py @@ -60,6 +60,13 @@ async def test_mongos_tls_enabled(ops_test: OpsTest) -> None: ) await integrate_cluster_with_tls(ops_test) + await ops_test.model.wait_for_idle( + apps=[MONGOS_APP_NAME], + idle_period=30, + status="active", + timeout=TIMEOUT, + ) + await check_mongos_tls_enabled(ops_test) @@ -67,8 +74,10 @@ async def test_mongos_tls_enabled(ops_test: OpsTest) -> None: @pytest.mark.abort_on_fail async def test_mongos_tls_nodeport(ops_test: OpsTest): """Tests that TLS is stable""" - # test that charm can enable nodeport without breaking mongos or accidently disabling TLS - await ops_test.model.applications[MONGOS_APP_NAME].set_config({"expose-external": "nodeport"}) + # test that charm can enable nodeport without breaking mongos or accidentally disabling TLS + await ops_test.model.applications[MONGOS_APP_NAME].set_config( + {"expose-external": "nodeport"} + ) await ops_test.model.wait_for_idle( apps=[MONGOS_APP_NAME], idle_period=30, @@ -82,8 +91,10 @@ async def test_mongos_tls_nodeport(ops_test: OpsTest): assert get_public_k8s_ip() in await get_sans_ips(ops_test, unit, internal=True) assert get_public_k8s_ip() in await get_sans_ips(ops_test, unit, internal=False) - # test that charm can disable nodeport without breaking mongos or accidently disabling TLS - await ops_test.model.applications[MONGOS_APP_NAME].set_config({"expose-external": "none"}) + # test that charm can disable nodeport without breaking mongos or accidentally disabling TLS + await ops_test.model.applications[MONGOS_APP_NAME].set_config( + {"expose-external": "none"} + ) await ops_test.model.wait_for_idle( apps=[MONGOS_APP_NAME], idle_period=30, @@ -94,8 +105,12 @@ async def test_mongos_tls_nodeport(ops_test: OpsTest): # check for no IP addresses in the pem file for unit in ops_test.model.applications[MONGOS_APP_NAME].units: - assert get_public_k8s_ip() not in await get_sans_ips(ops_test, unit, internal=True) - assert get_public_k8s_ip() not in await get_sans_ips(ops_test, unit, internal=False) + assert get_public_k8s_ip() not in await get_sans_ips( + ops_test, unit, internal=True + ) + assert get_public_k8s_ip() not in await get_sans_ips( + ops_test, unit, internal=False + ) @pytest.mark.group(1) @@ -142,7 +157,9 @@ async def test_mongos_tls_ca_mismatch(ops_test: OpsTest) -> None: timeout=TIMEOUT, ) - await toggle_tls_mongos(ops_test, enable=True, certs_app_name=DIFFERENT_CERTS_APP_NAME) + await toggle_tls_mongos( + ops_test, enable=True, certs_app_name=DIFFERENT_CERTS_APP_NAME + ) await wait_for_mongos_units_blocked( ops_test, MONGOS_APP_NAME, From 023cc2a82f82cdfb37f86bd700713ea9c370eb2a Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Sat, 7 Sep 2024 10:47:43 +0000 Subject: [PATCH 07/22] fmt + lint --- src/charm.py | 53 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/src/charm.py b/src/charm.py index fd97925b..b723713a 100755 --- a/src/charm.py +++ b/src/charm.py @@ -75,7 +75,9 @@ def __init__(self, *args): # lifecycle events self.framework.observe(self.on.config_changed, self._on_config_changed) - self.framework.observe(self.on.mongos_pebble_ready, self._on_mongos_pebble_ready) + self.framework.observe( + self.on.mongos_pebble_ready, self._on_mongos_pebble_ready + ) self.framework.observe(self.on.start, self._on_start) self.framework.observe(self.on.update_status, self._on_update_status) @@ -144,7 +146,9 @@ def _on_start(self, event: StartEvent) -> None: logger.info( "Missing integration to config-server. mongos cannot run start sequence unless connected to config-server." ) - self.status.set_and_share_status(BlockedStatus("Missing relation to config-server.")) + self.status.set_and_share_status( + BlockedStatus("Missing relation to config-server.") + ) event.defer() return @@ -177,7 +181,9 @@ def _on_update_status(self, _): logger.info( "Missing integration to config-server. mongos cannot run unless connected to config-server." ) - self.status.set_and_share_status(BlockedStatus("Missing relation to config-server.")) + self.status.set_and_share_status( + BlockedStatus("Missing relation to config-server.") + ) return if tls_statuses := self.cluster.get_tls_statuses(): @@ -187,7 +193,9 @@ def _on_update_status(self, _): # restart on high loaded databases can be very slow (e.g. up to 10-20 minutes). if not self.cluster.is_mongos_running(): logger.info("mongos has not started yet") - self.status.set_and_share_status(WaitingStatus("Waiting for mongos to start.")) + self.status.set_and_share_status( + WaitingStatus("Waiting for mongos to start.") + ) return # in the case external connectivity it is possible for public K8s endpoint to be updated @@ -216,10 +224,15 @@ def set_status_invalid_external_config(self) -> None: def update_external_services(self) -> None: """Update external services based on provided configuration.""" - if self.model.config["expose-external"] == Config.ExternalConnections.EXTERNAL_NODEPORT: + if ( + self.model.config["expose-external"] + == Config.ExternalConnections.EXTERNAL_NODEPORT + ): # every unit attempts to create a nodeport service - if exists, will silently continue self.node_port_manager.apply_service( - service=self.node_port_manager.build_node_port_services(port=Config.MONGOS_PORT) + service=self.node_port_manager.build_node_port_services( + port=Config.MONGOS_PORT + ) ) else: self.node_port_manager.delete_unit_service() @@ -246,7 +259,9 @@ def update_tls_sans(self) -> None: current_sans = self.tls.get_current_sans(internal) current_sans_ip = set(current_sans["sans_ips"]) if current_sans else set() - expected_sans_ip = set(self.tls.get_new_sans()["sans_ips"]) if current_sans else set() + expected_sans_ip = ( + set(self.tls.get_new_sans()["sans_ips"]) if current_sans else set() + ) sans_ip_changed = current_sans_ip ^ expected_sans_ip if not sans_ip_changed: @@ -287,7 +302,9 @@ def get_keyfile_contents(self) -> str | None: def is_integrated_to_config_server(self) -> bool: """Returns True if the mongos application is integrated to a config-server.""" - return self.model.get_relation(Config.Relations.CLUSTER_RELATIONS_NAME) is not None + return ( + self.model.get_relation(Config.Relations.CLUSTER_RELATIONS_NAME) is not None + ) def get_secret(self, scope: str, key: str) -> Optional[str]: """Get secret from the secret storage.""" @@ -329,7 +346,9 @@ def remove_secret(self, scope, key) -> None: content = secret.get_content() if not content.get(key) or content[key] == Config.Secrets.SECRET_DELETED_LABEL: - logger.error(f"Non-existing secret {scope}:{key} was attempted to be removed.") + logger.error( + f"Non-existing secret {scope}:{key} was attempted to be removed." + ) return content[key] = Config.Secrets.SECRET_DELETED_LABEL @@ -356,7 +375,9 @@ def set_database(self, database: str) -> None: return # a mongos shard can only be related to one config server - config_server_rel = self.model.relations[Config.Relations.CLUSTER_RELATIONS_NAME][0] + config_server_rel = self.model.relations[ + Config.Relations.CLUSTER_RELATIONS_NAME + ][0] self.cluster.database_requires.update_relation_data( config_server_rel.id, {DATABASE_TAG: database} ) @@ -556,7 +577,9 @@ def _pull_licenses(container: Container) -> None: for license_name in licenses: try: - license_file = container.pull(path=Config.get_license_path(license_name)) + license_file = container.pull( + path=Config.get_license_path(license_name) + ) f = open(f"LICENSE_{license_name}", "x") f.write(str(license_file.read())) f.close() @@ -573,10 +596,14 @@ def _set_data_dir_permissions(container: Container) -> None: for path in [Config.DATA_DIR]: paths = container.list_files(path, itself=True) if not len(paths) == 1: - raise ExtraDataDirError("list_files doesn't return only the directory itself") + raise ExtraDataDirError( + "list_files doesn't return only the directory itself" + ) logger.debug(f"Data directory ownership: {paths[0].user}:{paths[0].group}") if paths[0].user != Config.UNIX_USER or paths[0].group != Config.UNIX_GROUP: - container.exec(f"chown {Config.UNIX_USER}:{Config.UNIX_GROUP} -R {path}".split()) + container.exec( + f"chown {Config.UNIX_USER}:{Config.UNIX_GROUP} -R {path}".split() + ) def push_file_to_unit( self, From 2d753d8dc6ab8d84149b85f7cd2c7718a512ad52 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Mon, 9 Sep 2024 08:58:44 +0000 Subject: [PATCH 08/22] update file location --- lib/charms/mongodb/{v0 => v1}/mongodb_tls.py | 64 ++++++++++++-------- src/charm.py | 55 +++++------------ 2 files changed, 54 insertions(+), 65 deletions(-) rename lib/charms/mongodb/{v0 => v1}/mongodb_tls.py (95%) diff --git a/lib/charms/mongodb/v0/mongodb_tls.py b/lib/charms/mongodb/v1/mongodb_tls.py similarity index 95% rename from lib/charms/mongodb/v0/mongodb_tls.py rename to lib/charms/mongodb/v1/mongodb_tls.py index e532bb01..95b7657f 100644 --- a/lib/charms/mongodb/v0/mongodb_tls.py +++ b/lib/charms/mongodb/v1/mongodb_tls.py @@ -7,14 +7,13 @@ and expose needed information for client connection via fields in external relation. """ -import json import base64 +import json import logging import re import socket -from typing import List, Optional, Tuple, Dict import subprocess -from ops.pebble import ExecError +from typing import Dict, List, Optional, Tuple from charms.tls_certificates_interface.v3.tls_certificates import ( CertificateAvailableEvent, @@ -25,8 +24,8 @@ ) from ops.charm import ActionEvent, RelationBrokenEvent, RelationJoinedEvent from ops.framework import Object -from ops.model import ActiveStatus, MaintenanceStatus, Unit, WaitingStatus - +from ops.model import ActiveStatus, MaintenanceStatus, WaitingStatus +from ops.pebble import ExecError from config import Config @@ -39,11 +38,11 @@ LIBID = "e02a50f0795e4dd292f58e93b4f493dd" # Increment this major API version when introducing breaking changes -LIBAPI = 0 +LIBAPI = 1 # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 3 +LIBPATCH = 4 WAIT_CERT_UPDATE = "wait-cert-updated" @@ -351,31 +350,20 @@ def get_current_sans(self, internal: bool) -> List | None: if not self.is_tls_enabled(internal=internal): return - pem_file = Config.TLS.INT_PEM_FILE if internal else Config.TLS.EXT_PEM_FILE - command = [ - "openssl", - "x509", - "-noout", - "-ext", - "subjectAltName", - "-in", - pem_file, - ] - try: - container = self.charm.unit.get_container(Config.CONTAINER_NAME) - process = container.exec(command=command, working_dir=Config.MONGOD_CONF_DIR) - - output, _ = process.wait_output() - sans_lines = output.splitlines() + sans_lines = self.get_sans_from_host(internal) except (subprocess.CalledProcessError, ExecError) as e: logger.error(e.stdout) raise e + line = "" for line in sans_lines: if "DNS" in line and "IP" in line: break + if "DNS" not in line and "IP" not in line: + return None + sans_ip = [] sans_dns = [] for item in line.split(", "): @@ -388,6 +376,34 @@ def get_current_sans(self, internal: bool) -> List | None: return {SANS_IPS_KEY: sorted(sans_ip), SANS_DNS_KEY: sorted(sans_dns)} + def get_sans_from_host(self, internal) -> List[str]: + """Returns the sans lines from the host. + + Raises: subprocess.CalledProcessError, ExecError + """ + pem_file = Config.TLS.INT_PEM_FILE if internal else Config.TLS.EXT_PEM_FILE + + command = [ + "openssl", + "x509", + "-noout", + "-ext", + "subjectAltName", + "-in", + pem_file, + ] + + if self.substrate == Config.Substrate.K8S: + container = self.charm.unit.get_container(Config.CONTAINER_NAME) + process = container.exec(command=command, working_dir=Config.MONGOD_CONF_DIR) + output, _ = process.wait_output() + sans_lines = output.splitlines() + else: + output = subprocess.check_output(command, shell=True) + sans_lines = output.decode("utf-8").splitlines() + + return sans_lines + def get_tls_files(self, internal: bool) -> Tuple[Optional[str], Optional[str]]: """Prepare TLS files in special MongoDB way. @@ -452,7 +468,7 @@ def set_waiting_for_cert_to_update( waiting: bool, internal: bool, ): - """Sets a boolean indicator, indicating whether or not we are waiting for a cert to update.""" + """Sets a boolean indicator, for whether or not we are waiting for a cert to update.""" if internal and not self.charm.unit.is_leader(): return diff --git a/src/charm.py b/src/charm.py index b723713a..39fe8052 100755 --- a/src/charm.py +++ b/src/charm.py @@ -20,7 +20,7 @@ from charms.mongodb.v0.mongo import MongoConfiguration, MongoConnection from charms.mongos.v0.set_status import MongosStatusHandler from charms.mongodb.v1.mongodb_provider import MongoDBProvider -from charms.mongodb.v0.mongodb_tls import MongoDBTLS +from charms.mongodb.v1.mongodb_tls import MongoDBTLS from charms.mongodb.v0.mongodb_secrets import SecretCache from charms.mongodb.v0.mongodb_secrets import generate_secret_label from charms.mongodb.v1.helpers import get_mongos_args @@ -75,9 +75,7 @@ def __init__(self, *args): # lifecycle events self.framework.observe(self.on.config_changed, self._on_config_changed) - self.framework.observe( - self.on.mongos_pebble_ready, self._on_mongos_pebble_ready - ) + self.framework.observe(self.on.mongos_pebble_ready, self._on_mongos_pebble_ready) self.framework.observe(self.on.start, self._on_start) self.framework.observe(self.on.update_status, self._on_update_status) @@ -146,9 +144,7 @@ def _on_start(self, event: StartEvent) -> None: logger.info( "Missing integration to config-server. mongos cannot run start sequence unless connected to config-server." ) - self.status.set_and_share_status( - BlockedStatus("Missing relation to config-server.") - ) + self.status.set_and_share_status(BlockedStatus("Missing relation to config-server.")) event.defer() return @@ -181,9 +177,7 @@ def _on_update_status(self, _): logger.info( "Missing integration to config-server. mongos cannot run unless connected to config-server." ) - self.status.set_and_share_status( - BlockedStatus("Missing relation to config-server.") - ) + self.status.set_and_share_status(BlockedStatus("Missing relation to config-server.")) return if tls_statuses := self.cluster.get_tls_statuses(): @@ -193,9 +187,7 @@ def _on_update_status(self, _): # restart on high loaded databases can be very slow (e.g. up to 10-20 minutes). if not self.cluster.is_mongos_running(): logger.info("mongos has not started yet") - self.status.set_and_share_status( - WaitingStatus("Waiting for mongos to start.") - ) + self.status.set_and_share_status(WaitingStatus("Waiting for mongos to start.")) return # in the case external connectivity it is possible for public K8s endpoint to be updated @@ -224,15 +216,10 @@ def set_status_invalid_external_config(self) -> None: def update_external_services(self) -> None: """Update external services based on provided configuration.""" - if ( - self.model.config["expose-external"] - == Config.ExternalConnections.EXTERNAL_NODEPORT - ): + if self.model.config["expose-external"] == Config.ExternalConnections.EXTERNAL_NODEPORT: # every unit attempts to create a nodeport service - if exists, will silently continue self.node_port_manager.apply_service( - service=self.node_port_manager.build_node_port_services( - port=Config.MONGOS_PORT - ) + service=self.node_port_manager.build_node_port_services(port=Config.MONGOS_PORT) ) else: self.node_port_manager.delete_unit_service() @@ -259,9 +246,7 @@ def update_tls_sans(self) -> None: current_sans = self.tls.get_current_sans(internal) current_sans_ip = set(current_sans["sans_ips"]) if current_sans else set() - expected_sans_ip = ( - set(self.tls.get_new_sans()["sans_ips"]) if current_sans else set() - ) + expected_sans_ip = set(self.tls.get_new_sans()["sans_ips"]) if current_sans else set() sans_ip_changed = current_sans_ip ^ expected_sans_ip if not sans_ip_changed: @@ -302,9 +287,7 @@ def get_keyfile_contents(self) -> str | None: def is_integrated_to_config_server(self) -> bool: """Returns True if the mongos application is integrated to a config-server.""" - return ( - self.model.get_relation(Config.Relations.CLUSTER_RELATIONS_NAME) is not None - ) + return self.model.get_relation(Config.Relations.CLUSTER_RELATIONS_NAME) is not None def get_secret(self, scope: str, key: str) -> Optional[str]: """Get secret from the secret storage.""" @@ -346,9 +329,7 @@ def remove_secret(self, scope, key) -> None: content = secret.get_content() if not content.get(key) or content[key] == Config.Secrets.SECRET_DELETED_LABEL: - logger.error( - f"Non-existing secret {scope}:{key} was attempted to be removed." - ) + logger.error(f"Non-existing secret {scope}:{key} was attempted to be removed.") return content[key] = Config.Secrets.SECRET_DELETED_LABEL @@ -375,9 +356,7 @@ def set_database(self, database: str) -> None: return # a mongos shard can only be related to one config server - config_server_rel = self.model.relations[ - Config.Relations.CLUSTER_RELATIONS_NAME - ][0] + config_server_rel = self.model.relations[Config.Relations.CLUSTER_RELATIONS_NAME][0] self.cluster.database_requires.update_relation_data( config_server_rel.id, {DATABASE_TAG: database} ) @@ -577,9 +556,7 @@ def _pull_licenses(container: Container) -> None: for license_name in licenses: try: - license_file = container.pull( - path=Config.get_license_path(license_name) - ) + license_file = container.pull(path=Config.get_license_path(license_name)) f = open(f"LICENSE_{license_name}", "x") f.write(str(license_file.read())) f.close() @@ -596,14 +573,10 @@ def _set_data_dir_permissions(container: Container) -> None: for path in [Config.DATA_DIR]: paths = container.list_files(path, itself=True) if not len(paths) == 1: - raise ExtraDataDirError( - "list_files doesn't return only the directory itself" - ) + raise ExtraDataDirError("list_files doesn't return only the directory itself") logger.debug(f"Data directory ownership: {paths[0].user}:{paths[0].group}") if paths[0].user != Config.UNIX_USER or paths[0].group != Config.UNIX_GROUP: - container.exec( - f"chown {Config.UNIX_USER}:{Config.UNIX_GROUP} -R {path}".split() - ) + container.exec(f"chown {Config.UNIX_USER}:{Config.UNIX_GROUP} -R {path}".split()) def push_file_to_unit( self, From 258db0fec72bab419ea7440ab043c23dd102d5b2 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Mon, 9 Sep 2024 14:22:05 +0000 Subject: [PATCH 09/22] minor tweaks --- lib/charms/mongodb/v1/mongodb_tls.py | 18 ++++++++---------- src/charm.py | 10 ++++++++-- src/config.py | 6 ++---- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/lib/charms/mongodb/v1/mongodb_tls.py b/lib/charms/mongodb/v1/mongodb_tls.py index 95b7657f..a55cfe49 100644 --- a/lib/charms/mongodb/v1/mongodb_tls.py +++ b/lib/charms/mongodb/v1/mongodb_tls.py @@ -331,13 +331,11 @@ def get_new_sans(self) -> Dict: f"{self.charm.app.name}-{unit_id}.{self.charm.app.name}-endpoints", ] - sans[SANS_IPS_KEY] = [] + sans[SANS_IPS_KEY] = [ + str(self.charm.model.get_binding(self.peer_relation).network.bind_address) + ] - if self.substrate == Config.Substrate.VM: - sans[SANS_IPS_KEY].append( - str(self.charm.model.get_binding(self.peer_relation).network.bind_address) - ) - elif self.charm.is_role(Config.Role.MONGOS) and self.charm.is_external_client: + if self.charm.is_role(Config.Role.MONGOS) and self.charm.is_external_client: sans[SANS_IPS_KEY].append( self.charm.get_ext_mongos_host(self.charm.unit, incl_port=False) ) @@ -358,15 +356,15 @@ def get_current_sans(self, internal: bool) -> List | None: line = "" for line in sans_lines: - if "DNS" in line and "IP" in line: + if "DNS" in line or "IP" in line: break - if "DNS" not in line and "IP" not in line: - return None - sans_ip = [] sans_dns = [] for item in line.split(", "): + if ":" not in item: + continue + san_type, san_value = item.split(":") if san_type == "DNS": diff --git a/src/charm.py b/src/charm.py index 39fe8052..09af26f6 100755 --- a/src/charm.py +++ b/src/charm.py @@ -8,7 +8,7 @@ from datetime import datetime from exceptions import MissingSecretError -from ops.pebble import PathError, ProtocolError, Layer +from ops.pebble import PathError, ProtocolError, Layer, APIError from node_port import NodePortManager from typing import Set, Optional, Dict, List @@ -343,7 +343,13 @@ def stop_mongos_service(self): def restart_charm_services(self): """Restart mongos service.""" container = self.unit.get_container(Config.CONTAINER_NAME) - container.stop(Config.SERVICE_NAME) + try: + container.stop(Config.SERVICE_NAME) + except APIError: + # note that some libs will make a call to restart mongos service, before it has + # started (i.e. config_server.py) leading to a race condition where there is an + # attempt to stop the service before starting it. + pass container.add_layer(Config.CONTAINER_NAME, self._mongos_layer, combine=True) container.replan() diff --git a/src/config.py b/src/config.py index ee5f3385..491188ce 100644 --- a/src/config.py +++ b/src/config.py @@ -14,7 +14,7 @@ class Config: SUBSTRATE = "k8s" CONTAINER_NAME = "mongos" USER_ROLE_CREATE_USERS = "admin" - SERVICE_NAME = "mongod" # this must match the name of the service in the ROCK + SERVICE_NAME = "mongos" # this must match the name of the service in the ROCK MONGOD_CONF_DIR = "/etc/mongod" UNIX_USER = "mongodb" UNIX_GROUP = "mongodb" @@ -76,9 +76,7 @@ class Status: # TODO Future PR add more status messages here as constants UNHEALTHY_UPGRADE = BlockedStatus("Unhealthy after upgrade.") - INVALID_EXTERNAL_CONFIG = BlockedStatus( - "Config option for expose-external not valid." - ) + INVALID_EXTERNAL_CONFIG = BlockedStatus("Config option for expose-external not valid.") class Substrate: """Substrate related constants.""" From f4f864213d8e70203a379acde6afc74493942211 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Mon, 9 Sep 2024 14:22:19 +0000 Subject: [PATCH 10/22] fix tests for tls --- tests/integration/client_relations/helpers.py | 18 ++- tests/integration/tls/helpers.py | 110 +++++++----------- tests/integration/tls/test_tls.py | 40 +++---- 3 files changed, 76 insertions(+), 92 deletions(-) diff --git a/tests/integration/client_relations/helpers.py b/tests/integration/client_relations/helpers.py index 5c7fd4f4..249625a4 100644 --- a/tests/integration/client_relations/helpers.py +++ b/tests/integration/client_relations/helpers.py @@ -71,7 +71,9 @@ def is_relation_joined(ops_test: OpsTest, endpoint_one: str, endpoint_two: str) def get_node_port_info(ops_test: OpsTest, node_port_name: str) -> str: - node_port_cmd = f"kubectl get svc -n {ops_test.model.name} | grep NodePort | grep {node_port_name}" + node_port_cmd = ( + f"kubectl get svc -n {ops_test.model.name} | grep NodePort | grep {node_port_name}" + ) return subprocess.run(node_port_cmd, shell=True, capture_output=True, text=True) @@ -119,9 +121,17 @@ async def assert_all_unit_node_ports_available(ops_test: OpsTest): ), "client is not reachable" -async def is_external_mongos_client_reachable( - ops_test: OpsTest, exposed_node_port: str -) -> bool: +async def get_external_uri(ops_test: OpsTest, unit_id, exposed_node_port: str = None) -> str: + exposed_node_port = exposed_node_port or get_port_from_node_port( + ops_test, node_port_name=f"{MONGOS_APP_NAME}-{unit_id}-external" + ) + + public_k8s_ip = get_public_k8s_ip() + username, password = await get_mongos_user_password(ops_test, MONGOS_APP_NAME) + return f"mongodb://{username}:{password}@{public_k8s_ip}:{exposed_node_port}" + + +async def is_external_mongos_client_reachable(ops_test: OpsTest, exposed_node_port: str) -> bool: """Returns True if the mongos client is reachable on the provided node port via the k8s ip.""" public_k8s_ip = get_public_k8s_ip() username, password = await get_mongos_user_password(ops_test, MONGOS_APP_NAME) diff --git a/tests/integration/tls/helpers.py b/tests/integration/tls/helpers.py index a5eba6af..60309ed4 100644 --- a/tests/integration/tls/helpers.py +++ b/tests/integration/tls/helpers.py @@ -12,6 +12,8 @@ import ops import logging from ops.model import Unit +from ..client_relations.helpers import get_external_uri +from ..helpers import get_mongos_uri logger = logging.getLogger(__name__) @@ -19,7 +21,7 @@ EXTERNAL_PEM_PATH = "/etc/mongod/external-cert.pem" EXTERNAL_CERT_PATH = "/etc/mongod/external-ca.crt" INTERNAL_CERT_PATH = "/etc/mongod/internal-ca.crt" -MONGO_SHELL = "charmed-mongodb.mongosh" +MONGO_SHELL = "/usr/bin/mongosh" MONGOS_APP_NAME = "mongos-k8s" CERT_REL_NAME = "certificates" CERTS_APP_NAME = "self-signed-certificates" @@ -76,10 +78,10 @@ async def check_mongos_tls_disabled(ops_test: OpsTest) -> None: await check_tls(ops_test, unit, enabled=False) -async def check_mongos_tls_enabled(ops_test: OpsTest) -> None: +async def check_mongos_tls_enabled(ops_test: OpsTest, internal=True) -> None: # check each replica set is running with TLS enabled for unit in ops_test.model.applications[MONGOS_APP_NAME].units: - await check_tls(ops_test, unit, enabled=True) + await check_tls(ops_test, unit, enabled=True, internal=internal) async def toggle_tls_mongos( @@ -97,41 +99,44 @@ async def toggle_tls_mongos( f"{certs_app_name}:{CERT_REL_NAME}", ) + await ops_test.model.wait_for_idle(apps=[MONGOS_APP_NAME], idle_period=30) -async def mongos_tls_command(ops_test: OpsTest) -> str: + +async def mongos_tls_command(ops_test: OpsTest, unit, internal=True) -> str: """Generates a command which verifies TLS status.""" - secret_uri = await get_application_relation_data( - ops_test, MONGOS_APP_NAME, "mongos", "secret-user" - ) + unit_id = unit.name.split("/")[1] + if not internal: + client_uri = await get_external_uri(ops_test, unit_id=unit_id) + else: + client_uri = await get_mongos_uri(ops_test, unit_id=unit_id) + # secret data not added properly, wait for DPE-5215 to merge + # secret_uri = await get_application_relation_data( + # ops_test, MONGOS_APP_NAME, "mongos", "secret-user" + # ) - secret_data = await get_secret_data(ops_test, secret_uri) - client_uri = secret_data.get("uris") + # secret_data = await get_secret_data(ops_test, secret_uri) + # client_uri = secret_data.get("uris") return ( - f"{MONGO_SHELL} '{client_uri}' --eval 'sh.status()'" + f"{MONGO_SHELL} '{client_uri}' --eval 'db.getUsers()'" f" --tls --tlsCAFile {EXTERNAL_CERT_PATH}" f" --tlsCertificateKeyFile {EXTERNAL_PEM_PATH}" ) -async def check_tls(ops_test, unit, enabled) -> None: +async def check_tls(ops_test, unit, enabled, internal=True) -> None: """Returns True if TLS matches the expected state "enabled".""" try: - for attempt in Retrying( - stop=stop_after_attempt(10), - wait=wait_exponential(multiplier=1, min=2, max=30), - ): - with attempt: - mongod_tls_check = await mongos_tls_command(ops_test) - check_tls_cmd = f"exec --unit {unit.name} -- {mongod_tls_check}" - return_code, _, _ = await ops_test.juju(*check_tls_cmd.split()) - - tls_enabled = return_code == 0 - if enabled != tls_enabled: - raise ValueError( - f"TLS is{' not' if not tls_enabled else ''} enabled on {unit.name}" - ) - return True + mongos_tls_check = await mongos_tls_command(ops_test, unit=unit, internal=internal) + print(mongos_tls_check) + complete_command = f"ssh --container mongos {unit.name} {mongos_tls_check}" + return_code, _, stderr = await ops_test.juju(*complete_command.split()) + + tls_enabled = return_code == 0 + if enabled != tls_enabled: + logger.error(stderr) + raise ValueError(f"TLS is{' not' if not tls_enabled else ''} enabled on {unit.name}") + return True except RetryError: return False @@ -139,15 +144,10 @@ async def check_tls(ops_test, unit, enabled) -> None: async def get_sans_ips(ops_test: OpsTest, unit: Unit, internal: bool) -> str: """Retrieves the sans for the for mongos on the provided unit.""" cert_name = "internal" if internal else "external" - get_sans_cmd = f"openssl x509 -in /etc/mongod/{cert_name}-cert.pem -noout -text | grep Address" + get_sans_cmd = f"openssl x509 -noout -ext subjectAltName -in /etc/mongod/{cert_name}-cert.pem" complete_command = f"ssh --container mongos {unit.name} {get_sans_cmd}" _, result, _ = await ops_test.juju(*complete_command.split()) - - if "IP Address:" not in result: - return "" - - ip_addresses = result.split("IP Address:")[1] - return ip_addresses + return result async def time_file_created(ops_test: OpsTest, unit_name: str, path: str) -> int: @@ -168,9 +168,7 @@ async def time_file_created(ops_test: OpsTest, unit_name: str, path: str) -> int return process_ls_time(ls_output) -async def time_process_started( - ops_test: OpsTest, unit_name: str, process_name: str -) -> int: +async def time_process_started(ops_test: OpsTest, unit_name: str, process_name: str) -> int: """Retrieves the time that a given process started according to systemd.""" logs = await run_command_on_unit(ops_test, unit_name, "/charm/bin/pebble changes") @@ -240,9 +238,7 @@ async def check_certs_correctly_distributed( ][0] # Read the content of the cert file stored in the unit - cert_file_content = await get_file_content( - ops_test, unit.name, cert_path, tmpdir - ) + cert_file_content = await get_file_content(ops_test, unit.name, cert_path, tmpdir) # Get the external cert value from the relation relation_cert = "\n".join(tls_item["chain"]).strip() @@ -274,9 +270,7 @@ async def scp_file_preserve_ctime( return f"{filename}" -async def get_file_content( - ops_test: OpsTest, unit_name: str, path: str, tmpdir: Path -) -> str: +async def get_file_content(ops_test: OpsTest, unit_name: str, path: str, tmpdir: Path) -> str: filename = await scp_file_preserve_ctime(ops_test, unit_name, path, tmpdir) with open(filename, mode="r") as fd: @@ -355,9 +349,7 @@ async def rotate_and_verify_certs(ops_test: OpsTest, app: str, tmpdir: Path) -> original_tls_info[unit.name]["mongos_service"] = await time_process_started( ops_test, unit.name, MONGOS_SERVICE ) - await check_certs_correctly_distributed( - ops_test, unit, app_name=app, tmpdir=tmpdir - ) + await check_certs_correctly_distributed(ops_test, unit, app_name=app, tmpdir=tmpdir) # set external and internal key using auto-generated key for each unit for unit in ops_test.model.applications[app].units: @@ -367,32 +359,18 @@ async def rotate_and_verify_certs(ops_test: OpsTest, app: str, tmpdir: Path) -> # wait for certificate to be available and processed. Can get receive two certificate # available events and restart twice so we want to ensure we are idle for at least 1 minute - await ops_test.model.wait_for_idle( - apps=[app], status="active", timeout=1000, idle_period=60 - ) + await ops_test.model.wait_for_idle(apps=[app], status="active", timeout=1000, idle_period=60) # After updating both the external key and the internal key a new certificate request will be # made; then the certificates should be available and updated. for unit in ops_test.model.applications[app].units: - new_external_cert = await get_file_content( - ops_test, unit.name, EXTERNAL_CERT_PATH, tmpdir - ) - new_internal_cert = await get_file_content( - ops_test, unit.name, INTERNAL_CERT_PATH, tmpdir - ) - new_external_cert_time = await time_file_created( - ops_test, unit.name, EXTERNAL_CERT_PATH - ) - new_internal_cert_time = await time_file_created( - ops_test, unit.name, INTERNAL_CERT_PATH - ) - new_mongos_service_time = await time_process_started( - ops_test, unit.name, MONGOS_SERVICE - ) + new_external_cert = await get_file_content(ops_test, unit.name, EXTERNAL_CERT_PATH, tmpdir) + new_internal_cert = await get_file_content(ops_test, unit.name, INTERNAL_CERT_PATH, tmpdir) + new_external_cert_time = await time_file_created(ops_test, unit.name, EXTERNAL_CERT_PATH) + new_internal_cert_time = await time_file_created(ops_test, unit.name, INTERNAL_CERT_PATH) + new_mongos_service_time = await time_process_started(ops_test, unit.name, MONGOS_SERVICE) - await check_certs_correctly_distributed( - ops_test, unit, app_name=app, tmpdir=tmpdir - ) + await check_certs_correctly_distributed(ops_test, unit, app_name=app, tmpdir=tmpdir) assert ( new_external_cert != original_tls_info[unit.name]["external_cert_contents"] ), "external cert not rotated" diff --git a/tests/integration/tls/test_tls.py b/tests/integration/tls/test_tls.py index 041f61d3..de0260b0 100644 --- a/tests/integration/tls/test_tls.py +++ b/tests/integration/tls/test_tls.py @@ -42,6 +42,8 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None: """Build and deploy a sharded cluster.""" await deploy_cluster_components(ops_test) + # we should verify that tests work with multiple routers. + await ops_test.model.applications[MONGOS_APP_NAME].scale(2) await build_cluster(ops_test) await deploy_tls(ops_test) @@ -75,16 +77,15 @@ async def test_mongos_tls_enabled(ops_test: OpsTest) -> None: async def test_mongos_tls_nodeport(ops_test: OpsTest): """Tests that TLS is stable""" # test that charm can enable nodeport without breaking mongos or accidentally disabling TLS - await ops_test.model.applications[MONGOS_APP_NAME].set_config( - {"expose-external": "nodeport"} - ) + await ops_test.model.applications[MONGOS_APP_NAME].set_config({"expose-external": "nodeport"}) await ops_test.model.wait_for_idle( apps=[MONGOS_APP_NAME], - idle_period=30, + idle_period=60, status="active", timeout=TIMEOUT, ) - await check_mongos_tls_enabled(ops_test) + for internal in [True, False]: + await check_mongos_tls_enabled(ops_test, internal) # check for expected IP addresses in the pem file for unit in ops_test.model.applications[MONGOS_APP_NAME].units: @@ -92,25 +93,20 @@ async def test_mongos_tls_nodeport(ops_test: OpsTest): assert get_public_k8s_ip() in await get_sans_ips(ops_test, unit, internal=False) # test that charm can disable nodeport without breaking mongos or accidentally disabling TLS - await ops_test.model.applications[MONGOS_APP_NAME].set_config( - {"expose-external": "none"} - ) + await ops_test.model.applications[MONGOS_APP_NAME].set_config({"expose-external": "none"}) await ops_test.model.wait_for_idle( apps=[MONGOS_APP_NAME], - idle_period=30, + idle_period=60, status="active", timeout=TIMEOUT, ) - await check_mongos_tls_enabled(ops_test) - # check for no IP addresses in the pem file + await check_mongos_tls_enabled(ops_test, internal=True) + + # check for no public k8s IP address in the pem file for unit in ops_test.model.applications[MONGOS_APP_NAME].units: - assert get_public_k8s_ip() not in await get_sans_ips( - ops_test, unit, internal=True - ) - assert get_public_k8s_ip() not in await get_sans_ips( - ops_test, unit, internal=False - ) + assert get_public_k8s_ip() not in await get_sans_ips(ops_test, unit, internal=True) + assert get_public_k8s_ip() not in await get_sans_ips(ops_test, unit, internal=False) @pytest.mark.group(1) @@ -124,7 +120,6 @@ async def test_mongos_rotate_certs(ops_test: OpsTest, tmp_path: Path) -> None: async def test_mongos_tls_disabled(ops_test: OpsTest) -> None: """Tests that mongos charm can disable TLS.""" await toggle_tls_mongos(ops_test, enable=False) - await check_mongos_tls_disabled(ops_test) await wait_for_mongos_units_blocked( ops_test, @@ -133,12 +128,15 @@ async def test_mongos_tls_disabled(ops_test: OpsTest) -> None: timeout=300, ) + await check_mongos_tls_disabled(ops_test) + @pytest.mark.group(1) @pytest.mark.abort_on_fail async def test_tls_reenabled(ops_test: OpsTest) -> None: """Test that mongos can enable TLS after being integrated to cluster .""" await toggle_tls_mongos(ops_test, enable=True) + await check_mongos_tls_enabled(ops_test) @@ -151,15 +149,13 @@ async def test_mongos_tls_ca_mismatch(ops_test: OpsTest) -> None: CERTS_APP_NAME, application_name=DIFFERENT_CERTS_APP_NAME, channel="stable" ) await ops_test.model.wait_for_idle( - apps=[MONGOS_APP_NAME], + apps=[DIFFERENT_CERTS_APP_NAME], idle_period=10, raise_on_blocked=False, timeout=TIMEOUT, ) - await toggle_tls_mongos( - ops_test, enable=True, certs_app_name=DIFFERENT_CERTS_APP_NAME - ) + await toggle_tls_mongos(ops_test, enable=True, certs_app_name=DIFFERENT_CERTS_APP_NAME) await wait_for_mongos_units_blocked( ops_test, MONGOS_APP_NAME, From c33294f10128baad2f08a4b862a1f225499fe0ca Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Wed, 11 Sep 2024 09:56:17 +0000 Subject: [PATCH 11/22] external config based on model config --- src/charm.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/charm.py b/src/charm.py index 09af26f6..708b8f08 100755 --- a/src/charm.py +++ b/src/charm.py @@ -631,13 +631,20 @@ def has_config_server(self) -> bool: @property def expose_external(self) -> Optional[str]: """Returns mode of exposure for external connections.""" - if ( - self.app_peer_data.get("expose-external", Config.ExternalConnections.NONE) - == Config.ExternalConnections.NONE - ): + # don't let an incorrect configuration + if not self.is_user_external_config_valid(): + if ( + self.app_peer_data.get("expose-external", Config.ExternalConnections.NONE) + == Config.ExternalConnections.NONE + ): + return None + + return self.app_peer_data["expose-external"] + + if self.model.config["expose-external"] == Config.ExternalConnections.NONE: return None - return self.app_peer_data["expose-external"] + return self.model.config["expose-external"] @expose_external.setter def expose_external(self, expose_external: str) -> None: From cd23d7295c33fb9b425efaa973a24062276bb4c6 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Wed, 11 Sep 2024 12:18:35 +0000 Subject: [PATCH 12/22] update lib --- lib/charms/mongodb/v1/mongodb_tls.py | 87 +++++++--------------------- 1 file changed, 22 insertions(+), 65 deletions(-) diff --git a/lib/charms/mongodb/v1/mongodb_tls.py b/lib/charms/mongodb/v1/mongodb_tls.py index a55cfe49..880e7b46 100644 --- a/lib/charms/mongodb/v1/mongodb_tls.py +++ b/lib/charms/mongodb/v1/mongodb_tls.py @@ -7,6 +7,9 @@ and expose needed information for client connection via fields in external relation. """ +from cryptography import x509 +from cryptography.hazmat.backends import default_backend + import base64 import json import logging @@ -228,7 +231,7 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None: ) self.set_tls_secret(internal, Config.TLS.SECRET_CERT_LABEL, event.certificate) self.set_tls_secret(internal, Config.TLS.SECRET_CA_LABEL, event.ca) - self.set_waiting_for_cert_to_update(waiting=False, internal=internal) + self.set_waiting_for_cert_to_update(internal=internal, waiting=False) if self.charm.is_role(Config.Role.CONFIG_SERVER) and internal: self.charm.cluster.update_ca_secret(new_ca=event.ca) @@ -289,7 +292,6 @@ def _on_certificate_expiring(self, event: CertificateExpiringEvent) -> None: == self.get_tls_secret(internal=True, label_name=Config.TLS.SECRET_CERT_LABEL).rstrip() ): logger.debug("The internal TLS certificate expiring.") - internal = True else: logger.error("An unknown certificate expiring.") @@ -342,66 +344,25 @@ def get_new_sans(self) -> Dict: return sans - def get_current_sans(self, internal: bool) -> List | None: + def get_current_sans(self, internal: bool) -> List[str] | None: """Gets the current SANs for the unit cert.""" # if unit has no certificates do not proceed. if not self.is_tls_enabled(internal=internal): return - try: - sans_lines = self.get_sans_from_host(internal) - except (subprocess.CalledProcessError, ExecError) as e: - logger.error(e.stdout) - raise e - - line = "" - for line in sans_lines: - if "DNS" in line or "IP" in line: - break - - sans_ip = [] - sans_dns = [] - for item in line.split(", "): - if ":" not in item: - continue - - san_type, san_value = item.split(":") + pem_file = self.get_tls_secret(internal, Config.TLS.SECRET_CERT_LABEL) - if san_type == "DNS": - sans_dns.append(san_value) - if san_type == "IP Address": - sans_ip.append(san_value) + try: + cert = cert = x509.load_pem_x509_certificate(pem_file.encode(), default_backend()) + sans = cert.extensions.get_extension_for_class(x509.SubjectAlternativeName).value + sans_ip = [str(san) for san in sans.get_values_for_type(x509.IPAddress)] + sans_dns = [str(san) for san in sans.get_values_for_type(x509.DNSName)] + except x509.ExtensionNotFound: + sans_ip = [] + sans_dns = [] return {SANS_IPS_KEY: sorted(sans_ip), SANS_DNS_KEY: sorted(sans_dns)} - def get_sans_from_host(self, internal) -> List[str]: - """Returns the sans lines from the host. - - Raises: subprocess.CalledProcessError, ExecError - """ - pem_file = Config.TLS.INT_PEM_FILE if internal else Config.TLS.EXT_PEM_FILE - - command = [ - "openssl", - "x509", - "-noout", - "-ext", - "subjectAltName", - "-in", - pem_file, - ] - - if self.substrate == Config.Substrate.K8S: - container = self.charm.unit.get_container(Config.CONTAINER_NAME) - process = container.exec(command=command, working_dir=Config.MONGOD_CONF_DIR) - output, _ = process.wait_output() - sans_lines = output.splitlines() - else: - output = subprocess.check_output(command, shell=True) - sans_lines = output.decode("utf-8").splitlines() - - return sans_lines - def get_tls_files(self, internal: bool) -> Tuple[Optional[str], Optional[str]]: """Prepare TLS files in special MongoDB way. @@ -456,21 +417,17 @@ def is_set_waiting_for_cert_to_update( internal=False, ) -> bool: """Returns True we are waiting for a cert to update.""" - if internal: - json.loads(self.charm.app_peer_data.get(WAIT_CERT_UPDATE, "false")) - else: - json.loads(self.charm.unit_peer_data.get(WAIT_CERT_UPDATE, "false")) + scope = "int" if internal else "ext" + label_name = f"{scope}-{WAIT_CERT_UPDATE}" + + return json.loads(self.charm.unit_peer_data.get(label_name, "false")) def set_waiting_for_cert_to_update( self, waiting: bool, internal: bool, - ): + ) -> None: """Sets a boolean indicator, for whether or not we are waiting for a cert to update.""" - if internal and not self.charm.unit.is_leader(): - return - - if internal: - self.charm.app_peer_data[WAIT_CERT_UPDATE] = json.dumps(waiting) - else: - self.charm.unit_peer_data[WAIT_CERT_UPDATE] = json.dumps(waiting) + scope = "int" if internal else "ext" + label_name = f"{scope}-{WAIT_CERT_UPDATE}" + self.charm.unit_peer_data[label_name] = json.dumps(waiting) From 6e00451c1e256efd1b05d3085d6c1d03861ed4fb Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Wed, 11 Sep 2024 12:19:23 +0000 Subject: [PATCH 13/22] fmt + lint of src and update requesting of sans --- src/charm.py | 62 ++++++++++++++++++++++++++++++++++++--------------- src/config.py | 4 +++- 2 files changed, 47 insertions(+), 19 deletions(-) diff --git a/src/charm.py b/src/charm.py index 708b8f08..960f37fe 100755 --- a/src/charm.py +++ b/src/charm.py @@ -75,7 +75,9 @@ def __init__(self, *args): # lifecycle events self.framework.observe(self.on.config_changed, self._on_config_changed) - self.framework.observe(self.on.mongos_pebble_ready, self._on_mongos_pebble_ready) + self.framework.observe( + self.on.mongos_pebble_ready, self._on_mongos_pebble_ready + ) self.framework.observe(self.on.start, self._on_start) self.framework.observe(self.on.update_status, self._on_update_status) @@ -144,7 +146,9 @@ def _on_start(self, event: StartEvent) -> None: logger.info( "Missing integration to config-server. mongos cannot run start sequence unless connected to config-server." ) - self.status.set_and_share_status(BlockedStatus("Missing relation to config-server.")) + self.status.set_and_share_status( + BlockedStatus("Missing relation to config-server.") + ) event.defer() return @@ -177,7 +181,9 @@ def _on_update_status(self, _): logger.info( "Missing integration to config-server. mongos cannot run unless connected to config-server." ) - self.status.set_and_share_status(BlockedStatus("Missing relation to config-server.")) + self.status.set_and_share_status( + BlockedStatus("Missing relation to config-server.") + ) return if tls_statuses := self.cluster.get_tls_statuses(): @@ -187,7 +193,9 @@ def _on_update_status(self, _): # restart on high loaded databases can be very slow (e.g. up to 10-20 minutes). if not self.cluster.is_mongos_running(): logger.info("mongos has not started yet") - self.status.set_and_share_status(WaitingStatus("Waiting for mongos to start.")) + self.status.set_and_share_status( + WaitingStatus("Waiting for mongos to start.") + ) return # in the case external connectivity it is possible for public K8s endpoint to be updated @@ -216,10 +224,15 @@ def set_status_invalid_external_config(self) -> None: def update_external_services(self) -> None: """Update external services based on provided configuration.""" - if self.model.config["expose-external"] == Config.ExternalConnections.EXTERNAL_NODEPORT: + if ( + self.model.config["expose-external"] + == Config.ExternalConnections.EXTERNAL_NODEPORT + ): # every unit attempts to create a nodeport service - if exists, will silently continue self.node_port_manager.apply_service( - service=self.node_port_manager.build_node_port_services(port=Config.MONGOS_PORT) + service=self.node_port_manager.build_node_port_services( + port=Config.MONGOS_PORT + ) ) else: self.node_port_manager.delete_unit_service() @@ -236,9 +249,6 @@ def update_tls_sans(self) -> None: """ for internal in [True, False]: - if internal and not self.unit.is_leader(): - continue - # if the certificate has already been requested, we do not want to re-request # another one and lead to an infinite chain of certificate events. if self.tls.is_set_waiting_for_cert_to_update(internal): @@ -246,7 +256,9 @@ def update_tls_sans(self) -> None: current_sans = self.tls.get_current_sans(internal) current_sans_ip = set(current_sans["sans_ips"]) if current_sans else set() - expected_sans_ip = set(self.tls.get_new_sans()["sans_ips"]) if current_sans else set() + expected_sans_ip = ( + set(self.tls.get_new_sans()["sans_ips"]) if current_sans else set() + ) sans_ip_changed = current_sans_ip ^ expected_sans_ip if not sans_ip_changed: @@ -254,7 +266,7 @@ def update_tls_sans(self) -> None: logger.info( ( - f'Broker {self.unit.name.split("/")[1]} updating certificate SANs - ' + f'Mongos {self.unit.name.split("/")[1]} updating certificate SANs - ' f"OLD SANs = {current_sans_ip - expected_sans_ip}, " f"NEW SANs = {expected_sans_ip - current_sans_ip}" ) @@ -287,7 +299,9 @@ def get_keyfile_contents(self) -> str | None: def is_integrated_to_config_server(self) -> bool: """Returns True if the mongos application is integrated to a config-server.""" - return self.model.get_relation(Config.Relations.CLUSTER_RELATIONS_NAME) is not None + return ( + self.model.get_relation(Config.Relations.CLUSTER_RELATIONS_NAME) is not None + ) def get_secret(self, scope: str, key: str) -> Optional[str]: """Get secret from the secret storage.""" @@ -329,7 +343,9 @@ def remove_secret(self, scope, key) -> None: content = secret.get_content() if not content.get(key) or content[key] == Config.Secrets.SECRET_DELETED_LABEL: - logger.error(f"Non-existing secret {scope}:{key} was attempted to be removed.") + logger.error( + f"Non-existing secret {scope}:{key} was attempted to be removed." + ) return content[key] = Config.Secrets.SECRET_DELETED_LABEL @@ -362,7 +378,9 @@ def set_database(self, database: str) -> None: return # a mongos shard can only be related to one config server - config_server_rel = self.model.relations[Config.Relations.CLUSTER_RELATIONS_NAME][0] + config_server_rel = self.model.relations[ + Config.Relations.CLUSTER_RELATIONS_NAME + ][0] self.cluster.database_requires.update_relation_data( config_server_rel.id, {DATABASE_TAG: database} ) @@ -562,7 +580,9 @@ def _pull_licenses(container: Container) -> None: for license_name in licenses: try: - license_file = container.pull(path=Config.get_license_path(license_name)) + license_file = container.pull( + path=Config.get_license_path(license_name) + ) f = open(f"LICENSE_{license_name}", "x") f.write(str(license_file.read())) f.close() @@ -579,10 +599,14 @@ def _set_data_dir_permissions(container: Container) -> None: for path in [Config.DATA_DIR]: paths = container.list_files(path, itself=True) if not len(paths) == 1: - raise ExtraDataDirError("list_files doesn't return only the directory itself") + raise ExtraDataDirError( + "list_files doesn't return only the directory itself" + ) logger.debug(f"Data directory ownership: {paths[0].user}:{paths[0].group}") if paths[0].user != Config.UNIX_USER or paths[0].group != Config.UNIX_GROUP: - container.exec(f"chown {Config.UNIX_USER}:{Config.UNIX_GROUP} -R {path}".split()) + container.exec( + f"chown {Config.UNIX_USER}:{Config.UNIX_GROUP} -R {path}".split() + ) def push_file_to_unit( self, @@ -634,7 +658,9 @@ def expose_external(self) -> Optional[str]: # don't let an incorrect configuration if not self.is_user_external_config_valid(): if ( - self.app_peer_data.get("expose-external", Config.ExternalConnections.NONE) + self.app_peer_data.get( + "expose-external", Config.ExternalConnections.NONE + ) == Config.ExternalConnections.NONE ): return None diff --git a/src/config.py b/src/config.py index 491188ce..67862ca7 100644 --- a/src/config.py +++ b/src/config.py @@ -76,7 +76,9 @@ class Status: # TODO Future PR add more status messages here as constants UNHEALTHY_UPGRADE = BlockedStatus("Unhealthy after upgrade.") - INVALID_EXTERNAL_CONFIG = BlockedStatus("Config option for expose-external not valid.") + INVALID_EXTERNAL_CONFIG = BlockedStatus( + "Config option for expose-external not valid." + ) class Substrate: """Substrate related constants.""" From 3aee80a1f230c35cce0022d73c20bb733ee1658d Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Wed, 11 Sep 2024 12:21:25 +0000 Subject: [PATCH 14/22] fmt+lint tests + update tests now that username / password is set properly --- tests/integration/client_relations/helpers.py | 12 ++-- tests/integration/helpers.py | 19 ++---- tests/integration/tls/helpers.py | 67 ++++++++++++------- tests/integration/tls/test_tls.py | 21 ++++-- 4 files changed, 72 insertions(+), 47 deletions(-) diff --git a/tests/integration/client_relations/helpers.py b/tests/integration/client_relations/helpers.py index 249625a4..b10fec0b 100644 --- a/tests/integration/client_relations/helpers.py +++ b/tests/integration/client_relations/helpers.py @@ -71,9 +71,7 @@ def is_relation_joined(ops_test: OpsTest, endpoint_one: str, endpoint_two: str) def get_node_port_info(ops_test: OpsTest, node_port_name: str) -> str: - node_port_cmd = ( - f"kubectl get svc -n {ops_test.model.name} | grep NodePort | grep {node_port_name}" - ) + node_port_cmd = f"kubectl get svc -n {ops_test.model.name} | grep NodePort | grep {node_port_name}" return subprocess.run(node_port_cmd, shell=True, capture_output=True, text=True) @@ -121,7 +119,9 @@ async def assert_all_unit_node_ports_available(ops_test: OpsTest): ), "client is not reachable" -async def get_external_uri(ops_test: OpsTest, unit_id, exposed_node_port: str = None) -> str: +async def get_external_uri( + ops_test: OpsTest, unit_id, exposed_node_port: str = None +) -> str: exposed_node_port = exposed_node_port or get_port_from_node_port( ops_test, node_port_name=f"{MONGOS_APP_NAME}-{unit_id}-external" ) @@ -131,7 +131,9 @@ async def get_external_uri(ops_test: OpsTest, unit_id, exposed_node_port: str = return f"mongodb://{username}:{password}@{public_k8s_ip}:{exposed_node_port}" -async def is_external_mongos_client_reachable(ops_test: OpsTest, exposed_node_port: str) -> bool: +async def is_external_mongos_client_reachable( + ops_test: OpsTest, exposed_node_port: str +) -> bool: """Returns True if the mongos client is reachable on the provided node port via the k8s ip.""" public_k8s_ip = get_public_k8s_ip() username, password = await get_mongos_user_password(ops_test, MONGOS_APP_NAME) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index aa356b40..44ecf5d5 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -326,21 +326,12 @@ async def get_application_relation_data( async def get_mongos_user_password( ops_test: OpsTest, app_name=MONGOS_APP_NAME, relation_name="cluster" ) -> Tuple[str, str]: - # TODO once DPE:5215 is fixed retrieve via secret - # secret_uri = await get_application_relation_data( - # ops_test, app_name, relation_name=relation_name, key="secret-user" - # ) - - # secret_data = await get_secret_data(ops_test, secret_uri) - # return secret_data.get("username"), secret_data.get("password") - - username = await get_application_relation_data( - ops_test, app_name, relation_name=relation_name, key="username" + secret_uri = await get_application_relation_data( + ops_test, app_name, relation_name=relation_name, key="secret-user" ) - password = await get_application_relation_data( - ops_test, app_name, relation_name=relation_name, key="password" - ) - return username, password + + secret_data = await get_secret_data(ops_test, secret_uri) + return secret_data.get("username"), secret_data.get("password") async def check_mongos( diff --git a/tests/integration/tls/helpers.py b/tests/integration/tls/helpers.py index 60309ed4..73ae5a9f 100644 --- a/tests/integration/tls/helpers.py +++ b/tests/integration/tls/helpers.py @@ -4,8 +4,8 @@ from pathlib import Path from pytest_operator.plugin import OpsTest -from ..helpers import get_application_relation_data, get_secret_data -from tenacity import RetryError, Retrying, stop_after_attempt, wait_exponential +from ..helpers import get_application_relation_data +from tenacity import RetryError from datetime import datetime from typing import Optional, Dict import json @@ -109,13 +109,6 @@ async def mongos_tls_command(ops_test: OpsTest, unit, internal=True) -> str: client_uri = await get_external_uri(ops_test, unit_id=unit_id) else: client_uri = await get_mongos_uri(ops_test, unit_id=unit_id) - # secret data not added properly, wait for DPE-5215 to merge - # secret_uri = await get_application_relation_data( - # ops_test, MONGOS_APP_NAME, "mongos", "secret-user" - # ) - - # secret_data = await get_secret_data(ops_test, secret_uri) - # client_uri = secret_data.get("uris") return ( f"{MONGO_SHELL} '{client_uri}' --eval 'db.getUsers()'" @@ -127,7 +120,9 @@ async def mongos_tls_command(ops_test: OpsTest, unit, internal=True) -> str: async def check_tls(ops_test, unit, enabled, internal=True) -> None: """Returns True if TLS matches the expected state "enabled".""" try: - mongos_tls_check = await mongos_tls_command(ops_test, unit=unit, internal=internal) + mongos_tls_check = await mongos_tls_command( + ops_test, unit=unit, internal=internal + ) print(mongos_tls_check) complete_command = f"ssh --container mongos {unit.name} {mongos_tls_check}" return_code, _, stderr = await ops_test.juju(*complete_command.split()) @@ -135,7 +130,9 @@ async def check_tls(ops_test, unit, enabled, internal=True) -> None: tls_enabled = return_code == 0 if enabled != tls_enabled: logger.error(stderr) - raise ValueError(f"TLS is{' not' if not tls_enabled else ''} enabled on {unit.name}") + raise ValueError( + f"TLS is{' not' if not tls_enabled else ''} enabled on {unit.name}" + ) return True except RetryError: return False @@ -144,7 +141,9 @@ async def check_tls(ops_test, unit, enabled, internal=True) -> None: async def get_sans_ips(ops_test: OpsTest, unit: Unit, internal: bool) -> str: """Retrieves the sans for the for mongos on the provided unit.""" cert_name = "internal" if internal else "external" - get_sans_cmd = f"openssl x509 -noout -ext subjectAltName -in /etc/mongod/{cert_name}-cert.pem" + get_sans_cmd = ( + f"openssl x509 -noout -ext subjectAltName -in /etc/mongod/{cert_name}-cert.pem" + ) complete_command = f"ssh --container mongos {unit.name} {get_sans_cmd}" _, result, _ = await ops_test.juju(*complete_command.split()) return result @@ -168,7 +167,9 @@ async def time_file_created(ops_test: OpsTest, unit_name: str, path: str) -> int return process_ls_time(ls_output) -async def time_process_started(ops_test: OpsTest, unit_name: str, process_name: str) -> int: +async def time_process_started( + ops_test: OpsTest, unit_name: str, process_name: str +) -> int: """Retrieves the time that a given process started according to systemd.""" logs = await run_command_on_unit(ops_test, unit_name, "/charm/bin/pebble changes") @@ -238,7 +239,9 @@ async def check_certs_correctly_distributed( ][0] # Read the content of the cert file stored in the unit - cert_file_content = await get_file_content(ops_test, unit.name, cert_path, tmpdir) + cert_file_content = await get_file_content( + ops_test, unit.name, cert_path, tmpdir + ) # Get the external cert value from the relation relation_cert = "\n".join(tls_item["chain"]).strip() @@ -270,7 +273,9 @@ async def scp_file_preserve_ctime( return f"{filename}" -async def get_file_content(ops_test: OpsTest, unit_name: str, path: str, tmpdir: Path) -> str: +async def get_file_content( + ops_test: OpsTest, unit_name: str, path: str, tmpdir: Path +) -> str: filename = await scp_file_preserve_ctime(ops_test, unit_name, path, tmpdir) with open(filename, mode="r") as fd: @@ -349,7 +354,9 @@ async def rotate_and_verify_certs(ops_test: OpsTest, app: str, tmpdir: Path) -> original_tls_info[unit.name]["mongos_service"] = await time_process_started( ops_test, unit.name, MONGOS_SERVICE ) - await check_certs_correctly_distributed(ops_test, unit, app_name=app, tmpdir=tmpdir) + await check_certs_correctly_distributed( + ops_test, unit, app_name=app, tmpdir=tmpdir + ) # set external and internal key using auto-generated key for each unit for unit in ops_test.model.applications[app].units: @@ -359,18 +366,32 @@ async def rotate_and_verify_certs(ops_test: OpsTest, app: str, tmpdir: Path) -> # wait for certificate to be available and processed. Can get receive two certificate # available events and restart twice so we want to ensure we are idle for at least 1 minute - await ops_test.model.wait_for_idle(apps=[app], status="active", timeout=1000, idle_period=60) + await ops_test.model.wait_for_idle( + apps=[app], status="active", timeout=1000, idle_period=60 + ) # After updating both the external key and the internal key a new certificate request will be # made; then the certificates should be available and updated. for unit in ops_test.model.applications[app].units: - new_external_cert = await get_file_content(ops_test, unit.name, EXTERNAL_CERT_PATH, tmpdir) - new_internal_cert = await get_file_content(ops_test, unit.name, INTERNAL_CERT_PATH, tmpdir) - new_external_cert_time = await time_file_created(ops_test, unit.name, EXTERNAL_CERT_PATH) - new_internal_cert_time = await time_file_created(ops_test, unit.name, INTERNAL_CERT_PATH) - new_mongos_service_time = await time_process_started(ops_test, unit.name, MONGOS_SERVICE) + new_external_cert = await get_file_content( + ops_test, unit.name, EXTERNAL_CERT_PATH, tmpdir + ) + new_internal_cert = await get_file_content( + ops_test, unit.name, INTERNAL_CERT_PATH, tmpdir + ) + new_external_cert_time = await time_file_created( + ops_test, unit.name, EXTERNAL_CERT_PATH + ) + new_internal_cert_time = await time_file_created( + ops_test, unit.name, INTERNAL_CERT_PATH + ) + new_mongos_service_time = await time_process_started( + ops_test, unit.name, MONGOS_SERVICE + ) - await check_certs_correctly_distributed(ops_test, unit, app_name=app, tmpdir=tmpdir) + await check_certs_correctly_distributed( + ops_test, unit, app_name=app, tmpdir=tmpdir + ) assert ( new_external_cert != original_tls_info[unit.name]["external_cert_contents"] ), "external cert not rotated" diff --git a/tests/integration/tls/test_tls.py b/tests/integration/tls/test_tls.py index de0260b0..9890b5b3 100644 --- a/tests/integration/tls/test_tls.py +++ b/tests/integration/tls/test_tls.py @@ -77,7 +77,10 @@ async def test_mongos_tls_enabled(ops_test: OpsTest) -> None: async def test_mongos_tls_nodeport(ops_test: OpsTest): """Tests that TLS is stable""" # test that charm can enable nodeport without breaking mongos or accidentally disabling TLS - await ops_test.model.applications[MONGOS_APP_NAME].set_config({"expose-external": "nodeport"}) + await ops_test.model.applications[MONGOS_APP_NAME].set_config( + {"expose-external": "nodeport"} + ) + await ops_test.model.wait_for_idle( apps=[MONGOS_APP_NAME], idle_period=60, @@ -93,7 +96,9 @@ async def test_mongos_tls_nodeport(ops_test: OpsTest): assert get_public_k8s_ip() in await get_sans_ips(ops_test, unit, internal=False) # test that charm can disable nodeport without breaking mongos or accidentally disabling TLS - await ops_test.model.applications[MONGOS_APP_NAME].set_config({"expose-external": "none"}) + await ops_test.model.applications[MONGOS_APP_NAME].set_config( + {"expose-external": "none"} + ) await ops_test.model.wait_for_idle( apps=[MONGOS_APP_NAME], idle_period=60, @@ -105,8 +110,12 @@ async def test_mongos_tls_nodeport(ops_test: OpsTest): # check for no public k8s IP address in the pem file for unit in ops_test.model.applications[MONGOS_APP_NAME].units: - assert get_public_k8s_ip() not in await get_sans_ips(ops_test, unit, internal=True) - assert get_public_k8s_ip() not in await get_sans_ips(ops_test, unit, internal=False) + assert get_public_k8s_ip() not in await get_sans_ips( + ops_test, unit, internal=True + ) + assert get_public_k8s_ip() not in await get_sans_ips( + ops_test, unit, internal=False + ) @pytest.mark.group(1) @@ -155,7 +164,9 @@ async def test_mongos_tls_ca_mismatch(ops_test: OpsTest) -> None: timeout=TIMEOUT, ) - await toggle_tls_mongos(ops_test, enable=True, certs_app_name=DIFFERENT_CERTS_APP_NAME) + await toggle_tls_mongos( + ops_test, enable=True, certs_app_name=DIFFERENT_CERTS_APP_NAME + ) await wait_for_mongos_units_blocked( ops_test, MONGOS_APP_NAME, From 86b7310f596da081a1b4ac07ba07fb2e1a4ee516 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Thu, 12 Sep 2024 12:37:19 +0000 Subject: [PATCH 15/22] update node port --- src/charm.py | 38 +++++++++++++++++++++++++++++--------- src/node_port.py | 22 ++++++++++++++++------ 2 files changed, 45 insertions(+), 15 deletions(-) diff --git a/src/charm.py b/src/charm.py index 960f37fe..ab542803 100755 --- a/src/charm.py +++ b/src/charm.py @@ -9,7 +9,11 @@ from exceptions import MissingSecretError from ops.pebble import PathError, ProtocolError, Layer, APIError -from node_port import NodePortManager +from node_port import ( + NodePortManager, + FailedToFindNodePortError, + FailedToFindServiceError, +) from typing import Set, Optional, Dict, List from charms.mongodb.v0.config_server_interface import ClusterRequirer @@ -19,7 +23,7 @@ from charms.mongodb.v0.mongo import MongoConfiguration, MongoConnection from charms.mongos.v0.set_status import MongosStatusHandler -from charms.mongodb.v1.mongodb_provider import MongoDBProvider +from charms.mongodb.v1.mongodb_provider import MongoDBProvider, FailedToGetHostsError from charms.mongodb.v1.mongodb_tls import MongoDBTLS from charms.mongodb.v0.mongodb_secrets import SecretCache from charms.mongodb.v0.mongodb_secrets import generate_secret_label @@ -63,6 +67,10 @@ class ExtraDataDirError(Exception): """Raised when there is unexpected data in the data directory.""" +class NoExternalHostError(Exception): + """Raised when there is not an external host for a unit, when there is expected to be one.""" + + class MongosCharm(ops.CharmBase): """Charm the service.""" @@ -453,14 +461,26 @@ def get_ext_mongos_host(self, unit: Unit, incl_port=True) -> str | None: if not self.is_external_client: return None - unit_ip = self.node_port_manager.get_node_ip(unit.name) - if not incl_port: - return unit_ip + try: + unit_ip = self.node_port_manager.get_node_ip(unit.name) + if not incl_port: + return unit_ip - unit_port = self.node_port_manager.get_node_port( - port_to_match=Config.MONGOS_PORT, unit_name=unit.name - ) - return f"{unit_ip}:{unit_port}" + unit_port = self.node_port_manager.get_node_port( + port_to_match=Config.MONGOS_PORT, unit_name=unit.name + ) + if unit_ip and unit_port: + return "{unit_ip}:{unit_port}" + else: + raise NoExternalHostError(f"No external host for unit {unit.name}") + except ( + NoExternalHostError, + FailedToFindNodePortError, + FailedToFindServiceError, + ) as e: + raise FailedToGetHostsError( + "Failed to retrieve external hosts due to %s", e + ) def get_k8s_mongos_host(self, unit: Unit) -> str: """Create a DNS name for a MongoDB unit. diff --git a/src/node_port.py b/src/node_port.py index 84a8fffb..f9af738a 100644 --- a/src/node_port.py +++ b/src/node_port.py @@ -24,6 +24,14 @@ logging.getLogger("httpcore").disabled = True +class FailedToFindNodePortError(Exception): + """Raised NodePort cannot be found, but is excepted to be present.""" + + +class FailedToFindServiceError(Exception): + """Raised service cannot be found, but is excepted to be present.""" + + class NodePortManager: """Manager for handling mongos Kubernetes resources for a single mongos pod.""" @@ -129,8 +137,7 @@ def apply_service(self, service: Service) -> None: if e.status.code == 422 and "port is already allocated" in e.status.message: logger.error(e.status.message) return - else: - raise + raise def delete_unit_service(self) -> None: """Deletes a unit Service, if it exists.""" @@ -152,8 +159,7 @@ def delete_unit_service(self) -> None: if e.status.code == 403: self.on_deployed_without_trust() return - else: - raise + raise def _node_name(self, unit_name: str) -> str: """Return the node name for this unit's pod ip.""" @@ -168,6 +174,8 @@ def _node_name(self, unit_name: str) -> str: self.on_deployed_without_trust() return + raise + return pod.spec.nodeName def get_node_ip(self, unit_name: str) -> Optional[str]: @@ -183,6 +191,8 @@ def get_node_ip(self, unit_name: str) -> Optional[str]: logger.error("Could not delete service, application needs `juju trust`") self.on_deployed_without_trust() return + + raise # [ # NodeAddress(address='192.168.0.228', type='InternalIP'), # NodeAddress(address='example.com', type='Hostname') @@ -200,13 +210,13 @@ def get_node_port(self, port_to_match: int, unit_name: str) -> int: service = self.get_unit_service(unit_name=unit_name) if not service or not service.spec.type == "NodePort": - raise Exception("No service found for port.") + raise FailedToFindServiceError(f"No service found for port on {unit_name}") for svc_port in service.spec.ports: if svc_port.port == 27018: return svc_port.nodePort - raise Exception( + raise FailedToFindNodePortError( f"Unable to find NodePort for {port_to_match} for the {service} service" ) From f3a022819ff3bda55010e300b653daa4c97e0f64 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Thu, 12 Sep 2024 12:49:02 +0000 Subject: [PATCH 16/22] personal nits --- tests/integration/tls/helpers.py | 36 +++++++++++++++++++------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/tests/integration/tls/helpers.py b/tests/integration/tls/helpers.py index 73ae5a9f..7ce738eb 100644 --- a/tests/integration/tls/helpers.py +++ b/tests/integration/tls/helpers.py @@ -5,7 +5,7 @@ from pathlib import Path from pytest_operator.plugin import OpsTest from ..helpers import get_application_relation_data -from tenacity import RetryError +from tenacity import RetryError, Retrying, stop_after_attempt, wait_exponential from datetime import datetime from typing import Optional, Dict import json @@ -120,20 +120,26 @@ async def mongos_tls_command(ops_test: OpsTest, unit, internal=True) -> str: async def check_tls(ops_test, unit, enabled, internal=True) -> None: """Returns True if TLS matches the expected state "enabled".""" try: - mongos_tls_check = await mongos_tls_command( - ops_test, unit=unit, internal=internal - ) - print(mongos_tls_check) - complete_command = f"ssh --container mongos {unit.name} {mongos_tls_check}" - return_code, _, stderr = await ops_test.juju(*complete_command.split()) - - tls_enabled = return_code == 0 - if enabled != tls_enabled: - logger.error(stderr) - raise ValueError( - f"TLS is{' not' if not tls_enabled else ''} enabled on {unit.name}" - ) - return True + for attempt in Retrying( + stop=stop_after_attempt(10), + wait=wait_exponential(multiplier=1, min=2, max=30), + ): + with attempt: + mongos_tls_check = await mongos_tls_command( + ops_test, unit=unit, internal=internal + ) + complete_command = ( + f"ssh --container mongos {unit.name} {mongos_tls_check}" + ) + return_code, _, stderr = await ops_test.juju(*complete_command.split()) + + tls_enabled = return_code == 0 + if enabled != tls_enabled: + logger.error(stderr) + raise ValueError( + f"TLS is{' not' if not tls_enabled else ''} enabled on {unit.name}" + ) + return True except RetryError: return False From 8938c348360d488de27ea89394d9570e596e5d5c Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Thu, 12 Sep 2024 13:01:40 +0000 Subject: [PATCH 17/22] update lib --- lib/charms/mongodb/v1/mongodb_provider.py | 120 ++++++++++++++++++++-- src/charm.py | 11 ++ 2 files changed, 121 insertions(+), 10 deletions(-) diff --git a/lib/charms/mongodb/v1/mongodb_provider.py b/lib/charms/mongodb/v1/mongodb_provider.py index ec9de773..2fc78b25 100644 --- a/lib/charms/mongodb/v1/mongodb_provider.py +++ b/lib/charms/mongodb/v1/mongodb_provider.py @@ -31,12 +31,13 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 11 +LIBPATCH = 13 logger = logging.getLogger(__name__) REL_NAME = "database" MONGOS_RELATIONS = "cluster" MONGOS_CLIENT_RELATIONS = "mongos_proxy" +MANAGED_USERS_KEY = "managed-users-key" # We expect the MongoDB container to use the default ports @@ -48,6 +49,10 @@ deleted — key that were deleted.""" +class FailedToGetHostsError(Exception): + """Raised when charm fails to retrieve hosts.""" + + class MongoDBProvider(Object): """In this class, we manage client database relations.""" @@ -76,8 +81,8 @@ def __init__(self, charm: CharmBase, substrate="k8s", relation_name: str = REL_N self.database_provides.on.database_requested, self._on_relation_event ) - def pass_hook_checks(self, event: EventBase) -> bool: - """Runs the pre-hooks checks for MongoDBProvider, returns True if all pass.""" + def pass_sanity_hook_checks(self) -> bool: + """Runs reusable and event agnostic checks.""" # We shouldn't try to create or update users if the database is not # initialised. We will create users as part of initialisation. if not self.charm.db_initialised: @@ -92,6 +97,13 @@ def pass_hook_checks(self, event: EventBase) -> bool: if not self.charm.unit.is_leader(): return False + return True + + def pass_hook_checks(self, event: EventBase) -> bool: + """Runs the pre-hooks checks for MongoDBProvider, returns True if all pass.""" + if not self.pass_sanity_hook_checks(): + return False + if self.charm.upgrade_in_progress: logger.warning( "Adding relations is not supported during an upgrade. The charm may be in a broken, unrecoverable state." @@ -137,7 +149,11 @@ def _on_relation_event(self, event): try: self.oversee_users(departed_relation_id, event) - except PyMongoError as e: + except (PyMongoError, FailedToGetHostsError) as e: + # Failed to get hosts error is unique to mongos-k8s charm. In other charms we do not + # foresee issues to retrieve hosts. However in external mongos-k8s, the leader can + # attempt to retrieve hosts while non-leader units are still enabling node port + # resulting in an exception. logger.error("Deferring _on_relation_event since: error=%r", e) event.defer() return @@ -157,12 +173,36 @@ def oversee_users(self, departed_relation_id: Optional[int], event): When the function is executed in relation departed event, the departed relation is still on the list of all relations. Therefore, for proper work of the function, we need to exclude departed relation from the list. + + Raises: + PyMongoError """ with MongoConnection(self.charm.mongo_config) as mongo: database_users = mongo.get_users() - relation_users = self._get_users_from_relations(departed_relation_id) - for username in database_users - relation_users: + users_being_managed = database_users.intersection(self._get_relational_users_to_manage()) + expected_current_users = self._get_users_from_relations(departed_relation_id) + + self.remove_users(users_being_managed, expected_current_users) + self.add_users(users_being_managed, expected_current_users) + self.update_users(event, users_being_managed, expected_current_users) + self.auto_delete_dbs(departed_relation_id) + + def remove_users( + self, users_being_managed: Set[str], expected_current_users: Set[str] + ) -> None: + """Removes users from Charmed MongoDB. + + Note this only removes users that this application of Charmed MongoDB is responsible for + managing. It won't remove: + 1. users created from other applications + 2. users created from other mongos routers. + + Raises: + PyMongoError + """ + with MongoConnection(self.charm.mongo_config) as mongo: + for username in users_being_managed - expected_current_users: logger.info("Remove relation user: %s", username) if ( self.charm.is_role(Config.Role.MONGOS) @@ -171,8 +211,16 @@ def oversee_users(self, departed_relation_id: Optional[int], event): continue mongo.drop_user(username) + self._remove_from_relational_users_to_manage(username) + + def add_users(self, users_being_managed: Set[str], expected_current_users: Set[str]) -> None: + """Adds users to Charmed MongoDB. - for username in relation_users - database_users: + Raises: + PyMongoError + """ + with MongoConnection(self.charm.mongo_config) as mongo: + for username in expected_current_users - users_being_managed: config = self._get_config(username, None) if config.database is None: # We need to wait for the moment when the provider library @@ -181,15 +229,33 @@ def oversee_users(self, departed_relation_id: Optional[int], event): logger.info("Create relation user: %s on %s", config.username, config.database) mongo.create_user(config) + self._add_to_relational_users_to_manage(username) self._set_relation(config) - for username in relation_users.intersection(database_users): + def update_users( + self, event: EventBase, users_being_managed: Set[str], expected_current_users: Set[str] + ) -> None: + """Updates existing users in Charmed MongoDB. + + Raises: + PyMongoError + """ + with MongoConnection(self.charm.mongo_config) as mongo: + for username in expected_current_users.intersection(users_being_managed): config = self._get_config(username, None) logger.info("Update relation user: %s on %s", config.username, config.database) mongo.update_user(config) logger.info("Updating relation data according to diff") self._diff(event) + def auto_delete_dbs(self, departed_relation_id): + """Delete's unused dbs if configured to do so. + + Raises: + PyMongoError + """ + with MongoConnection(self.charm.mongo_config) as mongo: + if not self.charm.model.config["auto-delete"]: return @@ -240,7 +306,7 @@ def _diff(self, event: RelationChangedEvent) -> Diff: def update_app_relation_data(self) -> None: """Helper function to update application relation data.""" - if not self.charm.db_initialised: + if not self.pass_sanity_hook_checks(): return database_users = set() @@ -282,7 +348,9 @@ def _get_or_set_password(self, relation: Relation) -> str: self.database_provides.update_relation_data(relation.id, {"password": password}) return password - def _get_config(self, username: str, password: Optional[str]) -> MongoConfiguration: + def _get_config( + self, username: str, password: Optional[str], event=None + ) -> MongoConfiguration: """Construct the config object for future user creation.""" relation = self._get_relation_from_username(username) if not password: @@ -299,8 +367,11 @@ def _get_config(self, username: str, password: Optional[str]) -> MongoConfigurat "tls_external": False, "tls_internal": False, } + if self.charm.is_role(Config.Role.MONGOS): mongo_args["port"] = Config.MONGOS_PORT + if self.substrate == Config.Substrate.K8S: + mongo_args["hosts"] = self.charm.get_mongos_hosts_for_client() else: mongo_args["replset"] = self.charm.app.name @@ -396,6 +467,35 @@ def get_relation_name(self): else: return REL_NAME + def _get_relational_users_to_manage(self) -> Set[str]: + """Returns a set of the users to manage. + + Note json cannot serialise sets. Convert from list. + """ + return set(json.loads(self.charm.app_peer_data.get(MANAGED_USERS_KEY, "[]"))) + + def _update_relational_users_to_manage(self, new_users: Set[str]) -> None: + """Updates the set of the users to manage. + + Note json cannot serialise sets. Convert from list. + """ + if not self.charm.unit.is_leader(): + raise Exception("Cannot update relational data on non-leader unit") + + self.charm.app_peer_data[MANAGED_USERS_KEY] = json.dumps(list(new_users)) + + def _remove_from_relational_users_to_manage(self, user_to_remove: str) -> None: + """Removes the provided user from the set of the users to manage.""" + current_users = self._get_relational_users_to_manage() + updated_users = current_users - {user_to_remove} + self._update_relational_users_to_manage(updated_users) + + def _add_to_relational_users_to_manage(self, user_to_add: str) -> None: + """Adds the provided user to the set of the users to manage.""" + current_users = self._get_relational_users_to_manage() + current_users.add(user_to_add) + self._update_relational_users_to_manage(current_users) + @staticmethod def _get_database_from_relation(relation: Relation) -> Optional[str]: """Return database name from relation.""" diff --git a/src/charm.py b/src/charm.py index ab542803..283f7783 100755 --- a/src/charm.py +++ b/src/charm.py @@ -448,6 +448,17 @@ def get_units(self) -> List[Unit]: units.extend(self.peers_units) return units + def get_mongos_hosts_for_client(self) -> Set: + """Returns the hosts for mongos as a str. + + The host for mongos can be either the K8s pod name or an IP address depending on how + the app has been configured. + """ + if self.is_external_client: + return self.get_ext_mongos_hosts() + + return self.get_k8s_mongos_hosts() + def get_k8s_mongos_hosts(self) -> Set: """Returns the K8s hosts for mongos""" hosts = set() From 74f594182122a44742123903dbcc8e4d79422eac Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Mon, 16 Sep 2024 16:06:23 +0000 Subject: [PATCH 18/22] update lib --- lib/charms/mongodb/v1/mongodb_tls.py | 31 +++++++++++++--------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/lib/charms/mongodb/v1/mongodb_tls.py b/lib/charms/mongodb/v1/mongodb_tls.py index 880e7b46..8f00bde7 100644 --- a/lib/charms/mongodb/v1/mongodb_tls.py +++ b/lib/charms/mongodb/v1/mongodb_tls.py @@ -7,15 +7,11 @@ and expose needed information for client connection via fields in external relation. """ -from cryptography import x509 -from cryptography.hazmat.backends import default_backend - import base64 import json import logging import re import socket -import subprocess from typing import Dict, List, Optional, Tuple from charms.tls_certificates_interface.v3.tls_certificates import ( @@ -25,10 +21,11 @@ generate_csr, generate_private_key, ) +from cryptography import x509 +from cryptography.hazmat.backends import default_backend from ops.charm import ActionEvent, RelationBrokenEvent, RelationJoinedEvent from ops.framework import Object from ops.model import ActiveStatus, MaintenanceStatus, WaitingStatus -from ops.pebble import ExecError from config import Config @@ -323,19 +320,19 @@ def get_new_sans(self) -> Dict: Returns: A list representing the hostnames of the MongoDB unit. """ - sans = {} - unit_id = self.charm.unit.name.split("/")[1] - sans[SANS_DNS_KEY] = [ - f"{self.charm.app.name}-{unit_id}", - socket.getfqdn(), - "localhost", - f"{self.charm.app.name}-{unit_id}.{self.charm.app.name}-endpoints", - ] - sans[SANS_IPS_KEY] = [ - str(self.charm.model.get_binding(self.peer_relation).network.bind_address) - ] + sans = { + SANS_DNS_KEY: [ + f"{self.charm.app.name}-{unit_id}", + socket.getfqdn(), + "localhost", + f"{self.charm.app.name}-{unit_id}.{self.charm.app.name}-endpoints", + ], + SANS_IPS_KEY: [ + str(self.charm.model.get_binding(self.peer_relation).network.bind_address) + ], + } if self.charm.is_role(Config.Role.MONGOS) and self.charm.is_external_client: sans[SANS_IPS_KEY].append( @@ -353,7 +350,7 @@ def get_current_sans(self, internal: bool) -> List[str] | None: pem_file = self.get_tls_secret(internal, Config.TLS.SECRET_CERT_LABEL) try: - cert = cert = x509.load_pem_x509_certificate(pem_file.encode(), default_backend()) + cert = x509.load_pem_x509_certificate(pem_file.encode(), default_backend()) sans = cert.extensions.get_extension_for_class(x509.SubjectAlternativeName).value sans_ip = [str(san) for san in sans.get_values_for_type(x509.IPAddress)] sans_dns = [str(san) for san in sans.get_values_for_type(x509.DNSName)] From 4d31669000e7843da941d90fd50dcf70a79f7e33 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Mon, 16 Sep 2024 16:12:09 +0000 Subject: [PATCH 19/22] pr feedback --- src/charm.py | 31 +++++++------------------------ src/config.py | 2 +- src/node_port.py | 8 +++----- 3 files changed, 11 insertions(+), 30 deletions(-) diff --git a/src/charm.py b/src/charm.py index 283f7783..aafeeb4e 100755 --- a/src/charm.py +++ b/src/charm.py @@ -8,7 +8,7 @@ from datetime import datetime from exceptions import MissingSecretError -from ops.pebble import PathError, ProtocolError, Layer, APIError +from ops.pebble import PathError, ProtocolError, Layer from node_port import ( NodePortManager, FailedToFindNodePortError, @@ -79,7 +79,7 @@ def __init__(self, *args): self.role = Config.Role.MONGOS self.secrets = SecretCache(self) self.status = MongosStatusHandler(self) - self.node_port_manager = NodePortManager(self) + self.node_port_manager = NodePortManager(self, port=Config.MONGOS_PORT) # lifecycle events self.framework.observe(self.on.config_changed, self._on_config_changed) @@ -367,16 +367,9 @@ def stop_mongos_service(self): def restart_charm_services(self): """Restart mongos service.""" container = self.unit.get_container(Config.CONTAINER_NAME) - try: - container.stop(Config.SERVICE_NAME) - except APIError: - # note that some libs will make a call to restart mongos service, before it has - # started (i.e. config_server.py) leading to a race condition where there is an - # attempt to stop the service before starting it. - pass - container.add_layer(Config.CONTAINER_NAME, self._mongos_layer, combine=True) container.replan() + container.restart(Config.SERVICE_NAME) def set_database(self, database: str) -> None: """Updates the database requested for the mongos user.""" @@ -463,7 +456,8 @@ def get_k8s_mongos_hosts(self) -> Set: """Returns the K8s hosts for mongos""" hosts = set() for unit in self.get_units(): - hosts.add(self.get_k8s_mongos_host(unit)) + unit_id = unit.name.split("/")[1] + hosts.add(f"{self.app.name}-{unit_id}.{self.app.name}-endpoints") return hosts @@ -493,18 +487,6 @@ def get_ext_mongos_host(self, unit: Unit, incl_port=True) -> str | None: "Failed to retrieve external hosts due to %s", e ) - def get_k8s_mongos_host(self, unit: Unit) -> str: - """Create a DNS name for a MongoDB unit. - - Args: - unit_name: the juju unit name, e.g. "mongodb/1". - - Returns: - A string representing the hostname of the MongoDB unit. - """ - unit_id = unit.name.split("/")[1] - return f"{self.app.name}-{unit_id}.{self.app.name}-endpoints" - def get_ext_mongos_hosts(self) -> Set: """Returns the ext hosts for mongos. @@ -523,7 +505,8 @@ def get_mongos_host(self) -> str: The host for mongos can be either the Unix Domain Socket or an IP address depending on how the client wishes to connect to mongos (inside Juju or outside). """ - return self.get_k8s_mongos_host(self.unit) + unit_id = self.unit.name.split("/")[1] + return f"{self.app.name}-{unit_id}.{self.app.name}-endpoints" @staticmethod def _generate_relation_departed_key(rel_id: int) -> str: diff --git a/src/config.py b/src/config.py index 67862ca7..8ce6e39d 100644 --- a/src/config.py +++ b/src/config.py @@ -14,7 +14,7 @@ class Config: SUBSTRATE = "k8s" CONTAINER_NAME = "mongos" USER_ROLE_CREATE_USERS = "admin" - SERVICE_NAME = "mongos" # this must match the name of the service in the ROCK + SERVICE_NAME = "mongos" MONGOD_CONF_DIR = "/etc/mongod" UNIX_USER = "mongodb" UNIX_GROUP = "mongodb" diff --git a/src/node_port.py b/src/node_port.py index f9af738a..89d6d85c 100644 --- a/src/node_port.py +++ b/src/node_port.py @@ -35,11 +35,9 @@ class FailedToFindServiceError(Exception): class NodePortManager: """Manager for handling mongos Kubernetes resources for a single mongos pod.""" - def __init__( - self, - charm: CharmBase, - ): + def __init__(self, charm: CharmBase, port: int): self.charm = charm + self.port = port self.pod_name = self.charm.unit.name.replace("/", "-") self.app_name = self.charm.app.name self.namespace = self.charm.model.name @@ -213,7 +211,7 @@ def get_node_port(self, port_to_match: int, unit_name: str) -> int: raise FailedToFindServiceError(f"No service found for port on {unit_name}") for svc_port in service.spec.ports: - if svc_port.port == 27018: + if svc_port.port == self.port: return svc_port.nodePort raise FailedToFindNodePortError( From da65ec3623f482beaf274b33da0befe76a241e5b Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Thu, 26 Sep 2024 16:07:10 +0000 Subject: [PATCH 20/22] add gaurdrails to tests --- tests/integration/tls/test_tls.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/integration/tls/test_tls.py b/tests/integration/tls/test_tls.py index 9890b5b3..9771b58c 100644 --- a/tests/integration/tls/test_tls.py +++ b/tests/integration/tls/test_tls.py @@ -154,6 +154,14 @@ async def test_tls_reenabled(ops_test: OpsTest) -> None: async def test_mongos_tls_ca_mismatch(ops_test: OpsTest) -> None: """Tests that mongos charm can disable TLS.""" await toggle_tls_mongos(ops_test, enable=False) + + await wait_for_mongos_units_blocked( + ops_test, + MONGOS_APP_NAME, + status="mongos requires TLS to be enabled.", + timeout=300, + ) + await ops_test.model.deploy( CERTS_APP_NAME, application_name=DIFFERENT_CERTS_APP_NAME, channel="stable" ) From baef204d1c0888e141bc88a127f36529b7330f34 Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Mon, 7 Oct 2024 12:05:28 +0000 Subject: [PATCH 21/22] PR feedback --- lib/charms/mongodb/v1/mongodb_tls.py | 22 ++++++++++++---------- src/charm.py | 13 +------------ 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/lib/charms/mongodb/v1/mongodb_tls.py b/lib/charms/mongodb/v1/mongodb_tls.py index 8f00bde7..266a202d 100644 --- a/lib/charms/mongodb/v1/mongodb_tls.py +++ b/lib/charms/mongodb/v1/mongodb_tls.py @@ -42,7 +42,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 4 +LIBPATCH = 5 WAIT_CERT_UPDATE = "wait-cert-updated" @@ -105,9 +105,6 @@ def request_certificate( internal: bool, ): """Request TLS certificate.""" - if not self.charm.model.get_relation(Config.TLS.TLS_PEER_RELATION): - return - if param is None: key = generate_private_key() else: @@ -234,7 +231,7 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None: self.charm.cluster.update_ca_secret(new_ca=event.ca) self.charm.config_server.update_ca_secret(new_ca=event.ca) - if self.waiting_for_both_certs(): + if self.is_waiting_for_both_certs(): logger.debug( "Defer till both internal and external TLS certificates available to avoid second restart." ) @@ -256,7 +253,7 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None: # clear waiting status if db service is ready self.charm.status.set_and_share_status(ActiveStatus()) - def waiting_for_both_certs(self): + def is_waiting_for_both_certs(self) -> bool: """Returns a boolean indicating whether additional certs are needed.""" if not self.get_tls_secret(internal=True, label_name=Config.TLS.SECRET_CERT_LABEL): logger.debug("Waiting for internal certificate.") @@ -295,6 +292,10 @@ def _on_certificate_expiring(self, event: CertificateExpiringEvent) -> None: return logger.debug("Generating a new Certificate Signing Request.") + self.request_new_certificates(internal) + + def request_new_certificates(self, internal: bool) -> None: + """Requests the renewel of a new certificate.""" key = self.get_tls_secret(internal, Config.TLS.SECRET_KEY_LABEL).encode("utf-8") old_csr = self.get_tls_secret(internal, Config.TLS.SECRET_CSR_LABEL).encode("utf-8") sans = self.get_new_sans() @@ -313,8 +314,9 @@ def _on_certificate_expiring(self, event: CertificateExpiringEvent) -> None: ) self.set_tls_secret(internal, Config.TLS.SECRET_CSR_LABEL, new_csr.decode("utf-8")) + self.set_waiting_for_cert_to_update(waiting=True, internal=internal) - def get_new_sans(self) -> Dict: + def get_new_sans(self) -> dict[str, list[str]]: """Create a list of DNS names for a MongoDB unit. Returns: @@ -341,7 +343,7 @@ def get_new_sans(self) -> Dict: return sans - def get_current_sans(self, internal: bool) -> List[str] | None: + def get_current_sans(self, internal: bool) -> dict[str, list[str]] | None: """Gets the current SANs for the unit cert.""" # if unit has no certificates do not proceed. if not self.is_tls_enabled(internal=internal): @@ -411,9 +413,9 @@ def _get_subject_name(self) -> str: def is_set_waiting_for_cert_to_update( self, - internal=False, + internal: bool = False, ) -> bool: - """Returns True we are waiting for a cert to update.""" + """Returns True if we are waiting for a cert to update.""" scope = "int" if internal else "ext" label_name = f"{scope}-{WAIT_CERT_UPDATE}" diff --git a/src/charm.py b/src/charm.py index c16c86d5..7d28bc7a 100755 --- a/src/charm.py +++ b/src/charm.py @@ -5,7 +5,6 @@ # See LICENSE file for licensing details. from ops.main import main import json -from datetime import datetime from exceptions import MissingSecretError from ops.pebble import PathError, ProtocolError, Layer @@ -300,16 +299,7 @@ def update_tls_sans(self) -> None: ) ) - self.tls.certs.on.certificate_expiring.emit( - certificate=self.tls.get_tls_secret( - internal=internal, label_name=Config.TLS.SECRET_CERT_LABEL - ), - expiry=datetime.now().isoformat(), - ) - # without this, it is possible that we constantly request new certificates - # over and over again - blocking the event queue from processing the expired - # certificate event. - self.tls.set_waiting_for_cert_to_update(waiting=True, internal=internal) + self.tls.request_new_certificates(internal) def get_keyfile_contents(self) -> str | None: """Retrieves the contents of the keyfile on host machine.""" @@ -388,7 +378,6 @@ def restart_charm_services(self): """Restart mongos service.""" container = self.unit.get_container(Config.CONTAINER_NAME) container.add_layer(Config.CONTAINER_NAME, self._mongos_layer, combine=True) - container.replan() container.restart(Config.SERVICE_NAME) def set_database(self, database: str) -> None: From 2642998ea678624b386c270a1956323f42e5f9fd Mon Sep 17 00:00:00 2001 From: Mia Altieri Date: Mon, 7 Oct 2024 14:01:42 +0000 Subject: [PATCH 22/22] update tests for pebble replan/restart --- tests/integration/tls/helpers.py | 7 +++++-- tests/integration/tls/test_tls.py | 1 - 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/integration/tls/helpers.py b/tests/integration/tls/helpers.py index 7ce738eb..5058c755 100644 --- a/tests/integration/tls/helpers.py +++ b/tests/integration/tls/helpers.py @@ -11,6 +11,7 @@ import json import ops import logging +import time from ops.model import Unit from ..client_relations.helpers import get_external_uri from ..helpers import get_mongos_uri @@ -30,6 +31,7 @@ SHARD_APP_NAME = "shard0" CLUSTER_COMPONENTS = [CONFIG_SERVER_APP_NAME, SHARD_APP_NAME] MONGOS_SERVICE = "mongos.service" +PEBBLE_TIME_UNIT = 60 class ProcessError(Exception): @@ -181,7 +183,7 @@ async def time_process_started( # find most recent start time. By parsing most recent logs (ie in reverse order) for log in reversed(logs.split("\n")): - if "Replan" in log: + if "Restart" in log: return process_pebble_time(log.split()[4]) raise Exception("Service was never started") @@ -364,6 +366,8 @@ async def rotate_and_verify_certs(ops_test: OpsTest, app: str, tmpdir: Path) -> ops_test, unit, app_name=app, tmpdir=tmpdir ) + time.sleep(PEBBLE_TIME_UNIT) + # set external and internal key using auto-generated key for each unit for unit in ops_test.model.applications[app].units: action = await unit.run_action(action_name="set-tls-private-key") @@ -414,7 +418,6 @@ async def rotate_and_verify_certs(ops_test: OpsTest, app: str, tmpdir: Path) -> # Once the certificate requests are processed and updated the .service file should be # restarted - assert ( new_mongos_service_time > original_tls_info[unit.name]["mongos_service"] ), f"mongos service for {unit.name} was not restarted." diff --git a/tests/integration/tls/test_tls.py b/tests/integration/tls/test_tls.py index 9771b58c..ab383f23 100644 --- a/tests/integration/tls/test_tls.py +++ b/tests/integration/tls/test_tls.py @@ -23,7 +23,6 @@ ) from ..client_relations.helpers import get_public_k8s_ip - MONGOS_SERVICE = "mongos.service" MONGOS_SERVICE = "snap.charmed-mongodb.mongos.service"