Skip to content

Commit

Permalink
[DPE-4115] Performance Profile Support (#466)
Browse files Browse the repository at this point in the history
This PR extends the current charm to support the following profiles:
* testing:
* focused on the integration tests and our CI -> 1G RAM dedicated to the
heap and no automation
* staging:
* HA capabilities must be available: for that, we will enforce an index
template that encompasses all the indices and sets replica: 1-all
  * Extends heap to: `max(1G, 10% of RAM)`
  * `indices.memory.index_buffer_size` extends to `25%`
  * Adds three component templates, that will be described later
* production:
* Same features as staging, but the heap is instead set to: `max(1G, 50%
of RAM)`

The options above are set based on the following documents:
https://opensearch.org/docs/latest/tuning-your-cluster/performance/

https://opensearch.org/docs/latest/search-plugins/knn/performance-tuning/

The user can switch between the three options above, and depending on
the selected value, the templates are created or destroyed.
  • Loading branch information
phvalguima authored Oct 21, 2024
1 parent 58d208c commit cd1c034
Show file tree
Hide file tree
Showing 44 changed files with 844 additions and 167 deletions.
10 changes: 10 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,13 @@ options:
default: true
type: boolean
description: Enable opensearch-knn

profile:
type: string
default: "production"
description: |
Profile representing the scope of deployment, and used to tune resource allocation.
Allowed values are: "production", "staging" or "testing"
Production will tune opensearch for maximum performance while testing will tune for
minimal running performance.
Performance tuning is described on: https://opensearch.org/docs/latest/tuning-your-cluster/performance/
2 changes: 2 additions & 0 deletions lib/charms/opensearch/v0/constants_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,5 @@

# User-face Backup ID format
OPENSEARCH_BACKUP_ID_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

# Name of the charm config option that selects the performance profile
PERFORMANCE_PROFILE = "profile"
3 changes: 1 addition & 2 deletions lib/charms/opensearch/v0/helper_conf_setter.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,6 @@ def replace(
output_file: Target file for the result config, by default same as config_file
"""
path = f"{self.base_path}{config_file}"

if not exists(path):
raise FileNotFoundError(f"{path} not found.")

Expand All @@ -290,7 +289,7 @@ def replace(
logger.info(data)

if output_file is None:
output_file = config_file
output_file = path

with open(output_file, "w") as f:
f.write(data)
Expand Down
73 changes: 73 additions & 0 deletions lib/charms/opensearch/v0/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@
LIBPATCH = 1


# JVM heap bounds, expressed in kB (the unit /proc/meminfo reports its values in).
MIN_HEAP_SIZE = 1024 * 1024  # 1 GiB, in kB
MAX_HEAP_SIZE = 32 * MIN_HEAP_SIZE  # 32 GiB, in kB


class Model(ABC, BaseModel):
"""Base model class."""

Expand Down Expand Up @@ -153,6 +157,14 @@ class DeploymentType(BaseStrEnum):
OTHER = "other"


class PerformanceType(BaseStrEnum):
    """Performance profiles a deployment can select.

    The string values match the allowed values of the charm's `profile` config option.
    """

    # Heap sized from 50% of the host's total memory
    PRODUCTION = "production"
    # Heap sized from 25% of the host's total memory
    STAGING = "staging"
    # Heap fixed at MIN_HEAP_SIZE; no extra settings or index templates
    TESTING = "testing"


class StartMode(BaseStrEnum):
"""Mode of start of units in this deployment."""

Expand Down Expand Up @@ -204,6 +216,10 @@ class PeerClusterConfig(Model):
cluster_name: str
init_hold: bool
roles: List[str]
# We have a breaking change in the model
# For older charms, this field will not exist and they will be set in the
# profile called "testing".
profile: Optional[PerformanceType] = PerformanceType.TESTING
data_temperature: Optional[str] = None

@root_validator
Expand Down Expand Up @@ -346,3 +362,60 @@ def promote_failover(self) -> None:
self.main_app = self.failover_app
self.main_rel_id = self.failover_rel_id
self.delete("failover")


class OpenSearchPerfProfile(Model):
    """Settings derived from a performance profile.

    Only ``typ`` is meant to be supplied by the caller; the other fields are
    computed from it by :meth:`set_options`:

    * ``testing``: heap fixed at ``MIN_HEAP_SIZE``, no extra settings or templates.
    * ``staging`` / ``production``: heap is a share of the host's ``MemTotal``
      (25% / 50%) kept within ``[MIN_HEAP_SIZE, MAX_HEAP_SIZE]``, the index
      buffer size is raised to 25% and an index template matching every index
      sets one replica.
    """

    typ: PerformanceType
    heap_size_in_kb: int = MIN_HEAP_SIZE
    opensearch_yml: Dict[str, str] = {}
    charmed_index_template: Dict[str, str] = {}
    charmed_component_templates: Dict[str, str] = {}

    @root_validator
    def set_options(cls, values):  # noqa: N805
        """Generate the attributes depending on the selected profile type.

        Args:
            values: field values collected by pydantic for this model.

        Returns:
            The same mapping, with the derived fields filled in.

        Raises:
            AttributeError: if ``typ`` is absent from ``values``.
        """
        # Check if PerformanceType has been rendered correctly
        # if an user creates the OpenSearchPerfProfile
        if "typ" not in values:
            raise AttributeError("Missing 'typ' attribute.")

        if values["typ"] == PerformanceType.TESTING:
            values["heap_size_in_kb"] = MIN_HEAP_SIZE
            return values

        # From here on the profile is either staging or production.
        # MemTotal is reported in kB, the same unit as the heap bounds.
        mem_total = OpenSearchPerfProfile.meminfo()["MemTotal"]
        # NOTE(review): the change description mentions 10% of RAM for staging,
        # while the code uses 25% -- confirm which one is intended.
        mem_percent = 0.50 if values["typ"] == PerformanceType.PRODUCTION else 0.25

        # Clamp on both sides: never below the 1G floor (otherwise a small host
        # would get less heap than the testing profile) and never above the cap.
        requested = int(mem_percent * mem_total)
        values["heap_size_in_kb"] = max(MIN_HEAP_SIZE, min(requested, MAX_HEAP_SIZE))

        values["opensearch_yml"] = {"indices.memory.index_buffer_size": "25%"}

        values["charmed_index_template"] = {
            "charmed-index-tpl": {
                "index_patterns": ["*"],
                "template": {
                    "settings": {
                        "number_of_replicas": "1",
                    },
                },
            },
        }

        return values

    @staticmethod
    def meminfo() -> dict[str, float]:
        """Read the /proc/meminfo file and return the values.

        According to the kernel source code, the values are always in kB:
        https://github.com/torvalds/linux/blob/
        2a130b7e1fcdd83633c4aa70998c314d7c38b476/fs/proc/meminfo.c#L31

        Returns:
            Mapping of entry name (without the trailing colon) to its numeric value.
        """
        with open("/proc/meminfo") as f:
            # Blank or malformed lines (fewer than two fields) are skipped
            # instead of raising IndexError.
            return {
                fields[0].rstrip(":"): float(fields[1])
                for fields in (line.split() for line in f)
                if len(fields) >= 2
            }
58 changes: 41 additions & 17 deletions lib/charms/opensearch/v0/opensearch_base_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from charms.grafana_agent.v0.cos_agent import COSAgentProvider
from charms.opensearch.v0.constants_charm import (
PERFORMANCE_PROFILE,
AdminUser,
AdminUserInitProgress,
AdminUserNotConfigured,
Expand Down Expand Up @@ -49,7 +50,11 @@
generate_hashed_password,
generate_password,
)
from charms.opensearch.v0.models import DeploymentDescription, DeploymentType
from charms.opensearch.v0.models import (
DeploymentDescription,
DeploymentType,
PerformanceType,
)
from charms.opensearch.v0.opensearch_backups import backup
from charms.opensearch.v0.opensearch_config import OpenSearchConfig
from charms.opensearch.v0.opensearch_distro import OpenSearchDistribution
Expand All @@ -74,6 +79,7 @@
OpenSearchProvidedRolesException,
StartMode,
)
from charms.opensearch.v0.opensearch_performance_profile import OpenSearchPerformance
from charms.opensearch.v0.opensearch_plugin_manager import OpenSearchPluginManager
from charms.opensearch.v0.opensearch_plugins import OpenSearchPluginError
from charms.opensearch.v0.opensearch_relation_peer_cluster import (
Expand Down Expand Up @@ -246,6 +252,8 @@ def __init__(self, *args, distro: Type[OpenSearchDistribution] = None):
metrics_rules_dir="./src/alert_rules/prometheus",
log_slots=["opensearch:logs"],
)

self.performance_profile = OpenSearchPerformance(self)
# Ensure that only one instance of the `_on_peer_relation_changed` handler exists
# in the deferred event queue
self._is_peer_rel_changed_deferred = False
Expand Down Expand Up @@ -665,8 +673,19 @@ def _on_update_status(self, event: UpdateStatusEvent): # noqa: C901
# handle when/if certificates are expired
self._check_certs_expiration(event)

def trigger_restart(self):
    """Request a service restart by emitting the restart event."""
    restart_event = self._restart_opensearch_event
    restart_event.emit()

def _on_config_changed(self, event: ConfigChangedEvent): # noqa C901
"""On config changed event. Useful for IP changes or for user provided config changes."""
if not self.performance_profile.current:
# We are running (1) install or (2) an upgrade on instance that pre-dates profile
# First, we set this unit's effective profile -> 1G heap and no index templates.
# Our goal is to make sure this value exists once the refresh is finished
# and it represents the accurate value for this unit.
self.performance_profile.current = PerformanceType.TESTING

if self.opensearch_config.update_host_if_needed():
self.status.set(MaintenanceStatus(TLSNewCertsRequested))
self.tls.delete_stored_tls_resources()
Expand All @@ -688,27 +707,27 @@ def _on_config_changed(self, event: ConfigChangedEvent): # noqa C901
# handle cluster change to main-orchestrator (i.e: init_hold: true -> false)
self._handle_change_to_main_orchestrator_if_needed(event, previous_deployment_desc)

# todo: handle gracefully configuration setting at start of the charm
if not self.plugin_manager.check_plugin_manager_ready():
if self.upgrade_in_progress:
# The following changes in _on_config_changed are not supported during an upgrade
# Therefore, we leave now
logger.warning(
"Changing config during an upgrade is not supported. The charm may be in a broken, "
"unrecoverable state"
)
event.defer()
return

perf_profile_needs_restart = False
plugin_needs_restart = False

try:
if not self.plugin_manager.check_plugin_manager_ready():
raise OpenSearchNotFullyReadyError()

if self.unit.is_leader():
self.status.set(MaintenanceStatus(PluginConfigCheck), app=True)

if self.plugin_manager.run():
if self.upgrade_in_progress:
logger.warning(
"Changing config during an upgrade is not supported. The charm may be in a broken, "
"unrecoverable state"
)
event.defer()
return

self._restart_opensearch_event.emit()
plugin_needs_restart = self.plugin_manager.run()
except (OpenSearchNotFullyReadyError, OpenSearchPluginError) as e:
if isinstance(e, OpenSearchNotFullyReadyError):
logger.warning("Plugin management: cluster not ready yet at config changed")
Expand All @@ -719,11 +738,16 @@ def _on_config_changed(self, event: ConfigChangedEvent): # noqa C901
# config-changed is called again.
if self.unit.is_leader():
self.status.clear(PluginConfigCheck, app=True)
return
else:
if self.unit.is_leader():
self.status.clear(PluginConfigCheck, app=True)
self.status.clear(PluginConfigChangeError, app=True)

if self.unit.is_leader():
self.status.clear(PluginConfigCheck, app=True)
self.status.clear(PluginConfigChangeError, app=True)
perf_profile_needs_restart = self.performance_profile.apply(
self.config.get(PERFORMANCE_PROFILE)
)
if plugin_needs_restart or perf_profile_needs_restart:
self._restart_opensearch_event.emit()

def _on_set_password_action(self, event: ActionEvent):
"""Set new admin password from user input or generate if not passed."""
Expand Down
21 changes: 20 additions & 1 deletion lib/charms/opensearch/v0/opensearch_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from charms.opensearch.v0.constants_tls import CertType
from charms.opensearch.v0.helper_security import normalized_tls_subject
from charms.opensearch.v0.models import App
from charms.opensearch.v0.models import App, OpenSearchPerfProfile
from charms.opensearch.v0.opensearch_distro import OpenSearchDistribution

# The unique Charmhub library identifier, never change it
Expand Down Expand Up @@ -69,6 +69,25 @@ def set_client_auth(self):
"-Djdk.tls.client.protocols=TLSv1.2",
)

def apply_performance_profile(self, profile: OpenSearchPerfProfile):
    """Write the profile's heap size and extra settings into the config files.

    The `-Xms` and `-Xmx` entries of the file named by `JVM_OPTIONS` are both
    rewritten to the profile's heap size (in kB); every key/value pair in
    `profile.opensearch_yml` is then put into the file named by `CONFIG_YML`.
    """
    config = self._opensearch.config
    heap = f"{profile.heap_size_in_kb}k"

    # Both heap flags get the same value, -Xms first and then -Xmx.
    for flag in ("-Xms", "-Xmx"):
        config.replace(
            self.JVM_OPTIONS,
            f"{flag}[0-9]+[kmgKMG]",
            f"{flag}{heap}",
            regex=True,
        )

    for key, val in profile.opensearch_yml.items():
        config.put(self.CONFIG_YML, key, val)

def set_admin_tls_conf(self, secrets: Dict[str, any]):
"""Configures the admin certificate."""
self._opensearch.config.put(
Expand Down
3 changes: 3 additions & 0 deletions lib/charms/opensearch/v0/opensearch_peer_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ def _user_config(self):
for option in self._charm.config.get("roles", "").split(",")
if option
],
profile=self._charm.performance_profile.current.typ.value,
)

def _new_cluster_setup(self, config: PeerClusterConfig) -> DeploymentDescription:
Expand Down Expand Up @@ -222,6 +223,7 @@ def _new_cluster_setup(self, config: PeerClusterConfig) -> DeploymentDescription
init_hold=config.init_hold,
roles=config.roles,
data_temperature=config.data_temperature,
profile=self._charm.performance_profile.current.typ.value,
),
start=start_mode,
pending_directives=directives,
Expand Down Expand Up @@ -270,6 +272,7 @@ def _existing_cluster_setup(
init_hold=prev_deployment.config.init_hold,
roles=config.roles,
data_temperature=config.data_temperature,
profile=self._charm.performance_profile.current.typ.value,
),
start=start_mode,
state=deployment_state,
Expand Down
Loading

0 comments on commit cd1c034

Please sign in to comment.