[DPE-4115] Performance Profile Support (#466)

This PR extends the current charm to support the following profiles:
* testing:
  * focused on the integration tests and our CI -> 1G RAM dedicated to the heap and no automation
* staging:
  * HA capabilities must be available: for that, we enforce an index template that encompasses all the indices and sets replica: 1-all
  * Extends heap to: `max(1G, 10% of RAM)`
  * `indices.memory.index_buffer_size` extends to `25%`
  * Adds three component templates, which will be described later
* production:
  * Same features as staging, but heap is set instead to: `max(1G, 50% of RAM)`

The options above are set based on the following documents:
https://opensearch.org/docs/latest/tuning-your-cluster/performance/
https://opensearch.org/docs/latest/search-plugins/knn/performance-tuning/

The user can switch between the three options above, and depending on the selected value, the templates are created or destroyed.
canonical · Oct 21, 2024 · cd1c034 · cd1c034
1 parent 58d208c
commit cd1c034
Show file tree

Hide file tree

Showing 44 changed files with 844 additions and 167 deletions.
diff --git a/config.yaml b/config.yaml
@@ -36,3 +36,13 @@ options:
     default: true
     type: boolean
     description: Enable opensearch-knn
+
+  profile:
+    type: string
+    default: "production"
+    description: |
+      Profile representing the scope of deployment, and used to tune resource allocation.
+      Allowed values are: "production", "staging" or "testing"
+      Production will tune opensearch for maximum performance while testing will tune for
+      minimal running performance.
+      Performance tuning is described on: https://opensearch.org/docs/latest/tuning-your-cluster/performance/
diff --git a/lib/charms/opensearch/v0/constants_charm.py b/lib/charms/opensearch/v0/constants_charm.py
@@ -118,3 +118,5 @@
 
 # User-face Backup ID format
 OPENSEARCH_BACKUP_ID_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
+
+PERFORMANCE_PROFILE = "profile"
diff --git a/lib/charms/opensearch/v0/helper_conf_setter.py b/lib/charms/opensearch/v0/helper_conf_setter.py
@@ -272,7 +272,6 @@ def replace(
             output_file: Target file for the result config, by default same as config_file
         """
         path = f"{self.base_path}{config_file}"
-
         if not exists(path):
             raise FileNotFoundError(f"{path} not found.")
 
@@ -290,7 +289,7 @@ def replace(
             logger.info(data)
 
         if output_file is None:
-            output_file = config_file
+            output_file = path
 
         with open(output_file, "w") as f:
             f.write(data)

diff --git a/lib/charms/opensearch/v0/models.py b/lib/charms/opensearch/v0/models.py
@@ -23,6 +23,10 @@
 LIBPATCH = 1
 
 
+MIN_HEAP_SIZE = 1024 * 1024  # 1GB in KB
+MAX_HEAP_SIZE = 32 * MIN_HEAP_SIZE  # 32GB in KB
+
+
 class Model(ABC, BaseModel):
     """Base model class."""
 
@@ -153,6 +157,14 @@ class DeploymentType(BaseStrEnum):
     OTHER = "other"
 
 
+class PerformanceType(BaseStrEnum):
+    """Performance profiles selectable through the charm's `profile` config option.
+
+    The values match the strings allowed for that option: "production", "staging"
+    and "testing".
+    """
+
+    PRODUCTION = "production"
+    STAGING = "staging"
+    TESTING = "testing"
+
+
 class StartMode(BaseStrEnum):
     """Mode of start of units in this deployment."""
 
@@ -204,6 +216,10 @@ class PeerClusterConfig(Model):
     cluster_name: str
     init_hold: bool
     roles: List[str]
+    # We have a breaking change in the model
+    # For older charms, this field will not exist and they will be set in the
+    # profile called "testing".
+    profile: Optional[PerformanceType] = PerformanceType.TESTING
     data_temperature: Optional[str] = None
 
     @root_validator
@@ -346,3 +362,60 @@ def promote_failover(self) -> None:
         self.main_app = self.failover_app
         self.main_rel_id = self.failover_rel_id
         self.delete("failover")
+
+
+class OpenSearchPerfProfile(Model):
+    """Description of the tuning derived from a performance profile type.
+
+    Only `typ` needs to be provided by the caller; the other fields are
+    overwritten by `set_options` according to the profile type:
+      * testing: heap fixed at MIN_HEAP_SIZE, no extra settings or templates.
+      * staging: heap is 25% of the host's MemTotal.
+      * production: heap is 50% of the host's MemTotal.
+    For staging and production the heap is kept within
+    [MIN_HEAP_SIZE, MAX_HEAP_SIZE], `indices.memory.index_buffer_size` is set
+    to 25% and an index template enforcing one replica on all indices is defined.
+    """
+
+    typ: PerformanceType
+    heap_size_in_kb: int = MIN_HEAP_SIZE
+    opensearch_yml: Dict[str, str] = {}
+    charmed_index_template: Dict[str, str] = {}
+    charmed_component_templates: Dict[str, str] = {}
+
+    @root_validator
+    def set_options(cls, values):  # noqa: N805
+        """Generate the attributes depending on the profile type.
+
+        Args:
+            values: field values collected by the model validation.
+
+        Returns:
+            The same mapping, with the derived fields filled in.
+
+        Raises:
+            AttributeError: if `typ` is absent from `values`.
+        """
+        # `typ` is missing when a user creates the OpenSearchPerfProfile without a
+        # type that could be rendered as a PerformanceType.
+        if "typ" not in values:
+            raise AttributeError("Missing 'typ' attribute.")
+
+        if values["typ"] == PerformanceType.TESTING:
+            values["heap_size_in_kb"] = MIN_HEAP_SIZE
+            return values
+
+        mem_total = OpenSearchPerfProfile.meminfo()["MemTotal"]
+        mem_percent = 0.50 if values["typ"] == PerformanceType.PRODUCTION else 0.25
+
+        # Clamp on both sides: never below MIN_HEAP_SIZE (small hosts would otherwise
+        # get a heap smaller than the testing profile) and never above MAX_HEAP_SIZE.
+        values["heap_size_in_kb"] = max(
+            MIN_HEAP_SIZE,
+            min(int(mem_percent * mem_total), MAX_HEAP_SIZE),
+        )
+
+        # From here on the profile is either staging or production.
+        values["opensearch_yml"] = {"indices.memory.index_buffer_size": "25%"}
+
+        values["charmed_index_template"] = {
+            "charmed-index-tpl": {
+                "index_patterns": ["*"],
+                "template": {
+                    "settings": {
+                        "number_of_replicas": "1",
+                    },
+                },
+            },
+        }
+
+        return values
+
+    @staticmethod
+    def meminfo() -> dict[str, float]:
+        """Read the /proc/meminfo file and return the values.
+
+        According to the kernel source code, the values are always in kB:
+            https://github.com/torvalds/linux/blob/
+                2a130b7e1fcdd83633c4aa70998c314d7c38b476/fs/proc/meminfo.c#L31
+
+        Returns:
+            Mapping of each entry name (without the trailing colon) to its value.
+        """
+        with open("/proc/meminfo") as f:
+            entries = [line.split() for line in f if line.strip()]
+
+        # Skip anything that does not look like "<Name>: <value> [unit]".
+        return {
+            fields[0].rstrip(":"): float(fields[1]) for fields in entries if len(fields) >= 2
+        }
diff --git a/lib/charms/opensearch/v0/opensearch_base_charm.py b/lib/charms/opensearch/v0/opensearch_base_charm.py
@@ -11,6 +11,7 @@
 
 from charms.grafana_agent.v0.cos_agent import COSAgentProvider
 from charms.opensearch.v0.constants_charm import (
+    PERFORMANCE_PROFILE,
     AdminUser,
     AdminUserInitProgress,
     AdminUserNotConfigured,
@@ -49,7 +50,11 @@
     generate_hashed_password,
     generate_password,
 )
-from charms.opensearch.v0.models import DeploymentDescription, DeploymentType
+from charms.opensearch.v0.models import (
+    DeploymentDescription,
+    DeploymentType,
+    PerformanceType,
+)
 from charms.opensearch.v0.opensearch_backups import backup
 from charms.opensearch.v0.opensearch_config import OpenSearchConfig
 from charms.opensearch.v0.opensearch_distro import OpenSearchDistribution
@@ -74,6 +79,7 @@
     OpenSearchProvidedRolesException,
     StartMode,
 )
+from charms.opensearch.v0.opensearch_performance_profile import OpenSearchPerformance
 from charms.opensearch.v0.opensearch_plugin_manager import OpenSearchPluginManager
 from charms.opensearch.v0.opensearch_plugins import OpenSearchPluginError
 from charms.opensearch.v0.opensearch_relation_peer_cluster import (
@@ -246,6 +252,8 @@ def __init__(self, *args, distro: Type[OpenSearchDistribution] = None):
             metrics_rules_dir="./src/alert_rules/prometheus",
             log_slots=["opensearch:logs"],
         )
+
+        self.performance_profile = OpenSearchPerformance(self)
         # Ensure that only one instance of the `_on_peer_relation_changed` handler exists
         # in the deferred event queue
         self._is_peer_rel_changed_deferred = False
@@ -665,8 +673,19 @@ def _on_update_status(self, event: UpdateStatusEvent):  # noqa: C901
         # handle when/if certificates are expired
         self._check_certs_expiration(event)
 
+    def trigger_restart(self):
+        """Trigger a restart of the service by emitting the charm's restart event."""
+        self._restart_opensearch_event.emit()
+
     def _on_config_changed(self, event: ConfigChangedEvent):  # noqa C901
         """On config changed event. Useful for IP changes or for user provided config changes."""
+        if not self.performance_profile.current:
+            # We are running (1) install or (2) an upgrade on instance that pre-dates profile
+            # First, we set this unit's effective profile -> 1G heap and no index templates.
+            # Our goal is to make sure this value exists once the refresh is finished
+            # and it represents the accurate value for this unit.
+            self.performance_profile.current = PerformanceType.TESTING
+
         if self.opensearch_config.update_host_if_needed():
             self.status.set(MaintenanceStatus(TLSNewCertsRequested))
             self.tls.delete_stored_tls_resources()
@@ -688,27 +707,27 @@ def _on_config_changed(self, event: ConfigChangedEvent):  # noqa C901
             # handle cluster change to main-orchestrator (i.e: init_hold: true -> false)
             self._handle_change_to_main_orchestrator_if_needed(event, previous_deployment_desc)
 
-        # todo: handle gracefully configuration setting at start of the charm
-        if not self.plugin_manager.check_plugin_manager_ready():
+        if self.upgrade_in_progress:
+            # The following changes in _on_config_changed are not supported during an upgrade
+            # Therefore, we leave now
+            logger.warning(
+                "Changing config during an upgrade is not supported. The charm may be in a broken, "
+                "unrecoverable state"
+            )
+            event.defer()
             return
 
+        perf_profile_needs_restart = False
+        plugin_needs_restart = False
+
         try:
             if not self.plugin_manager.check_plugin_manager_ready():
                 raise OpenSearchNotFullyReadyError()
 
             if self.unit.is_leader():
                 self.status.set(MaintenanceStatus(PluginConfigCheck), app=True)
 
-            if self.plugin_manager.run():
-                if self.upgrade_in_progress:
-                    logger.warning(
-                        "Changing config during an upgrade is not supported. The charm may be in a broken, "
-                        "unrecoverable state"
-                    )
-                    event.defer()
-                    return
-
-                self._restart_opensearch_event.emit()
+            plugin_needs_restart = self.plugin_manager.run()
         except (OpenSearchNotFullyReadyError, OpenSearchPluginError) as e:
             if isinstance(e, OpenSearchNotFullyReadyError):
                 logger.warning("Plugin management: cluster not ready yet at config changed")
@@ -719,11 +738,16 @@ def _on_config_changed(self, event: ConfigChangedEvent):  # noqa C901
             # config-changed is called again.
             if self.unit.is_leader():
                 self.status.clear(PluginConfigCheck, app=True)
-            return
+        else:
+            if self.unit.is_leader():
+                self.status.clear(PluginConfigCheck, app=True)
+                self.status.clear(PluginConfigChangeError, app=True)
 
-        if self.unit.is_leader():
-            self.status.clear(PluginConfigCheck, app=True)
-            self.status.clear(PluginConfigChangeError, app=True)
+        perf_profile_needs_restart = self.performance_profile.apply(
+            self.config.get(PERFORMANCE_PROFILE)
+        )
+        if plugin_needs_restart or perf_profile_needs_restart:
+            self._restart_opensearch_event.emit()
 
     def _on_set_password_action(self, event: ActionEvent):
         """Set new admin password from user input or generate if not passed."""

diff --git a/lib/charms/opensearch/v0/opensearch_config.py b/lib/charms/opensearch/v0/opensearch_config.py
@@ -8,7 +8,7 @@
 
 from charms.opensearch.v0.constants_tls import CertType
 from charms.opensearch.v0.helper_security import normalized_tls_subject
-from charms.opensearch.v0.models import App
+from charms.opensearch.v0.models import App, OpenSearchPerfProfile
 from charms.opensearch.v0.opensearch_distro import OpenSearchDistribution
 
 # The unique Charmhub library identifier, never change it
@@ -69,6 +69,25 @@ def set_client_auth(self):
             "-Djdk.tls.client.protocols=TLSv1.2",
         )
 
+    def apply_performance_profile(self, profile: OpenSearchPerfProfile):
+        """Write the given performance profile into the opensearch configuration.
+
+        The initial (-Xms) and maximum (-Xmx) heap flags in the file referenced by
+        `self.JVM_OPTIONS` are both set to the profile's heap size, expressed in
+        kilobytes, and every entry of `profile.opensearch_yml` is put into the file
+        referenced by `self.CONFIG_YML`.
+        """
+        heap_value = f"{profile.heap_size_in_kb}k"
+
+        # Same substitution for both flags, initial heap first, then maximum heap.
+        for jvm_flag in ("-Xms", "-Xmx"):
+            self._opensearch.config.replace(
+                self.JVM_OPTIONS,
+                f"{jvm_flag}[0-9]+[kmgKMG]",
+                f"{jvm_flag}{heap_value}",
+                regex=True,
+            )
+
+        for setting, setting_value in profile.opensearch_yml.items():
+            self._opensearch.config.put(self.CONFIG_YML, setting, setting_value)
+
     def set_admin_tls_conf(self, secrets: Dict[str, any]):
         """Configures the admin certificate."""
         self._opensearch.config.put(

diff --git a/lib/charms/opensearch/v0/opensearch_peer_clusters.py b/lib/charms/opensearch/v0/opensearch_peer_clusters.py
@@ -165,6 +165,7 @@ def _user_config(self):
                 for option in self._charm.config.get("roles", "").split(",")
                 if option
             ],
+            profile=self._charm.performance_profile.current.typ.value,
         )
 
     def _new_cluster_setup(self, config: PeerClusterConfig) -> DeploymentDescription:
@@ -222,6 +223,7 @@ def _new_cluster_setup(self, config: PeerClusterConfig) -> DeploymentDescription
                 init_hold=config.init_hold,
                 roles=config.roles,
                 data_temperature=config.data_temperature,
+                profile=self._charm.performance_profile.current.typ.value,
             ),
             start=start_mode,
             pending_directives=directives,
@@ -270,6 +272,7 @@ def _existing_cluster_setup(
                 init_hold=prev_deployment.config.init_hold,
                 roles=config.roles,
                 data_temperature=config.data_temperature,
+                profile=self._charm.performance_profile.current.typ.value,
             ),
             start=start_mode,
             state=deployment_state,
Original file line number	Diff line number	Diff line change
Expand Up		@@ -118,3 +118,5 @@

		# User-face Backup ID format
		OPENSEARCH_BACKUP_ID_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

		PERFORMANCE_PROFILE = "profile"