Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DPE-4528] Cover large deployments with more upgrade testing #459

Draft
wants to merge 4 commits into
base: 2/edge
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions tests/integration/upgrades/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import subprocess
from typing import Optional

import pytest
from pytest_operator.plugin import OpsTest
from tenacity import Retrying, stop_after_attempt, wait_fixed

Expand All @@ -17,12 +18,43 @@
from ..helpers import APP_NAME, IDLE_PERIOD, app_name, run_action
from ..helpers_deployments import get_application_units, wait_until

# Name of the OpenSearch charm and the path of the systemd unit file for the
# snap's daemon service.
OPENSEARCH_ORIGINAL_CHARM_NAME = "opensearch"
OPENSEARCH_SERVICE_PATH = "/etc/systemd/system/snap.opensearch.daemon.service"
# NOTE(review): presumably the service restart delay in seconds -- confirm.
ORIGINAL_RESTART_DELAY = 20
SECOND_APP_NAME = "second-opensearch"
# NOTE(review): presumably the lengthened restart delay used by tests -- confirm.
RESTART_DELAY = 360


# Channel name for the OpenSearch charm.
OPENSEARCH_CHANNEL = "2/edge"


# First key of VERSION_TO_REVISION below, i.e. the oldest version listed there.
STARTING_VERSION = "2.15.0"


# Maps an OpenSearch version string to a charm revision number.
VERSION_TO_REVISION = {
    STARTING_VERSION: 144,
    "2.16.0": 160,
}


# NOTE(review): entries mix bare risk names ("edge", "beta") with a
# track/risk pair ("2/stable") -- confirm this is intentional.
CHANNELS = ["edge", "beta", "2/stable"]


# Template used for both the pytest id and the group name of the
# per-version parametrization; "{}" is filled with the version string.
FROM_VERSION_PREFIX = "from_v{}_to_local"


def _from_version_param(version):
    """Build the pytest parameter for one starting version.

    The same label (FROM_VERSION_PREFIX filled with *version*) is used as the
    parameter id and as the name of the ``group`` mark.
    """
    label = FROM_VERSION_PREFIX.format(version)
    return pytest.param(version, id=label, marks=pytest.mark.group(label))


# One pytest parameter per version listed in VERSION_TO_REVISION, in dict order.
UPGRADE_INITIAL_VERSION = [_from_version_param(version) for version in VERSION_TO_REVISION]


logger = logging.getLogger(__name__)


Expand Down
227 changes: 180 additions & 47 deletions tests/integration/upgrades/test_manual_large_deployment_upgrades.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,51 +10,68 @@

from ..ha.continuous_writes import ContinuousWrites
from ..ha.helpers import assert_continuous_writes_consistency
from ..helpers import APP_NAME, IDLE_PERIOD, MODEL_CONFIG, SERIES, run_action
from ..helpers_deployments import get_application_units, wait_until
from ..helpers import (
APP_NAME,
IDLE_PERIOD,
MODEL_CONFIG,
SERIES,
get_leader_unit_id,
run_action,
set_watermark,
)
from ..helpers_deployments import wait_until
from ..tls.test_tls import TLS_CERTIFICATES_APP_NAME
from .helpers import (
OPENSEARCH_CHANNEL,
OPENSEARCH_ORIGINAL_CHARM_NAME,
STARTING_VERSION,
UPGRADE_INITIAL_VERSION,
VERSION_TO_REVISION,
assert_upgrade_to_local,
refresh,
)

logger = logging.getLogger(__name__)


OPENSEARCH_ORIGINAL_CHARM_NAME = "opensearch"
OPENSEARCH_INITIAL_CHANNEL = "2/edge"
# Application names of the two orchestrator clusters.
OPENSEARCH_MAIN_APP_NAME = "main"
OPENSEARCH_FAILOVER_APP_NAME = "failover"
# Relation endpoint names used when integrating the applications with each other.
REL_ORCHESTRATOR = "peer-cluster-orchestrator"
REL_PEER = "peer-cluster"


# Module-level cache for the locally built charm; populated on first use by the
# tests so the charm is only built once per session.
charm = None


# Number of units per application. Each key appears exactly once: the earlier
# duplicated entries (3 / 2 / 1) were silently overwritten by the later ones,
# so only the effective values are kept.
WORKLOAD = {
    APP_NAME: 2,
    OPENSEARCH_FAILOVER_APP_NAME: 1,
    OPENSEARCH_MAIN_APP_NAME: 3,
}


@pytest.mark.skip(reason="Fix with DPE-4528")
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "xlarge"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
@pytest.mark.skip_if_deployed
async def test_large_deployment_deploy_original_charm(ops_test: OpsTest) -> None:
"""Build and deploy the charm for large deployment tests."""
#######################################################################
#
# Auxiliary functions
#
#######################################################################
async def _build_env(ops_test: OpsTest, version: str) -> None:
"""Deploy OpenSearch cluster from a given revision."""
await ops_test.model.set_config(MODEL_CONFIG)
# Deploy TLS Certificates operator.
tls_config = {"ca-common-name": "CN_CA"}

main_orchestrator_conf = {
"cluster_name": "backup-test",
"cluster_name": "upgrade-test",
"init_hold": False,
"roles": "cluster_manager",
"roles": "cluster_manager,data",
}
failover_orchestrator_conf = {
"cluster_name": "backup-test",
"cluster_name": "upgrade-test",
"init_hold": True,
"roles": "cluster_manager",
"roles": "cluster_manager,data",
}
data_hot_conf = {"cluster_name": "backup-test", "init_hold": True, "roles": "data.hot"}
data_conf = {"cluster_name": "upgrade-test", "init_hold": True, "roles": "data"}

await asyncio.gather(
ops_test.model.deploy(TLS_CERTIFICATES_APP_NAME, channel="stable", config=tls_config),
Expand All @@ -63,38 +80,42 @@ async def test_large_deployment_deploy_original_charm(ops_test: OpsTest) -> None
application_name=OPENSEARCH_MAIN_APP_NAME,
num_units=WORKLOAD[OPENSEARCH_MAIN_APP_NAME],
series=SERIES,
channel=OPENSEARCH_INITIAL_CHANNEL,
channel=OPENSEARCH_CHANNEL,
config=main_orchestrator_conf,
),
ops_test.model.deploy(
OPENSEARCH_ORIGINAL_CHARM_NAME,
application_name=OPENSEARCH_FAILOVER_APP_NAME,
num_units=WORKLOAD[OPENSEARCH_FAILOVER_APP_NAME],
series=SERIES,
channel=OPENSEARCH_INITIAL_CHANNEL,
channel=OPENSEARCH_CHANNEL,
config=failover_orchestrator_conf,
),
ops_test.model.deploy(
OPENSEARCH_ORIGINAL_CHARM_NAME,
application_name=APP_NAME,
num_units=WORKLOAD[APP_NAME],
series=SERIES,
channel=OPENSEARCH_INITIAL_CHANNEL,
config=data_hot_conf,
channel=OPENSEARCH_CHANNEL,
config=data_conf,
),
)

# Large deployment setup
await ops_test.model.integrate("main:peer-cluster-orchestrator", "failover:peer-cluster")
await ops_test.model.integrate("main:peer-cluster-orchestrator", f"{APP_NAME}:peer-cluster")
# integrate TLS to all applications
for app in [OPENSEARCH_MAIN_APP_NAME, OPENSEARCH_FAILOVER_APP_NAME, APP_NAME]:
await ops_test.model.integrate(app, TLS_CERTIFICATES_APP_NAME)

# create the peer-cluster-relation
await ops_test.model.integrate(
"failover:peer-cluster-orchestrator", f"{APP_NAME}:peer-cluster"
f"{APP_NAME}:{REL_PEER}", f"{OPENSEARCH_MAIN_APP_NAME}:{REL_ORCHESTRATOR}"
)
await ops_test.model.integrate(
f"{OPENSEARCH_FAILOVER_APP_NAME}:{REL_PEER}",
f"{OPENSEARCH_MAIN_APP_NAME}:{REL_ORCHESTRATOR}",
)
await ops_test.model.integrate(
f"{APP_NAME}:{REL_PEER}", f"{OPENSEARCH_FAILOVER_APP_NAME}:{REL_ORCHESTRATOR}"
)

# TLS setup
await ops_test.model.integrate("main", TLS_CERTIFICATES_APP_NAME)
await ops_test.model.integrate("failover", TLS_CERTIFICATES_APP_NAME)
await ops_test.model.integrate(APP_NAME, TLS_CERTIFICATES_APP_NAME)

# Charms except s3-integrator should be active
await wait_until(
Expand All @@ -117,18 +138,12 @@ async def test_large_deployment_deploy_original_charm(ops_test: OpsTest) -> None
timeout=3600,
)

await set_watermark(ops_test, APP_NAME)

@pytest.mark.skip(reason="Fix with DPE-4528")
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "xlarge"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_manually_upgrade_to_local(
ops_test: OpsTest, c_writes: ContinuousWrites, c_writes_runner
) -> None:
"""Test upgrade from usptream to currently locally built version."""
units = await get_application_units(ops_test, OPENSEARCH_MAIN_APP_NAME)
leader_id = [u.id for u in units if u.is_leader][0]

async def _upgrade(ops_test: OpsTest, local_build: bool = False, revision: str = None) -> None:
app = OPENSEARCH_MAIN_APP_NAME
leader_id = await get_leader_unit_id(ops_test, app)
action = await run_action(
ops_test,
leader_id,
Expand All @@ -144,13 +159,14 @@ async def test_manually_upgrade_to_local(

async with ops_test.fast_forward():
for app, unit_count in WORKLOAD.items():
application = ops_test.model.applications[app]
units = await get_application_units(ops_test, app)
leader_id = [u.id for u in units if u.is_leader][0]
leader_id = get_leader_unit_id(ops_test, app)

logger.info(f"Refresh app {app}, leader {leader_id}")

await application.refresh(path=charm)
if local_build:
await refresh(ops_test, app, path=charm)
else:
await refresh(ops_test, app, revision=revision)
logger.info("Refresh is over, waiting for the charm to settle")

if unit_count == 1:
Expand All @@ -166,10 +182,13 @@ async def test_manually_upgrade_to_local(
logger.info(f"Upgrade of app {app} finished")
continue

# Wait until we are set in an idle state and can rollback the revision.
# app status blocked: that will happen if we are jumping N-2 versions in our test
# app status active: that will happen if we are jumping N-1 in our test
await wait_until(
ops_test,
apps=[app],
apps_statuses=["blocked"],
apps_statuses=["active", "blocked"],
units_statuses=["active"],
wait_for_exact_units={
app: unit_count,
Expand Down Expand Up @@ -197,9 +216,123 @@ async def test_manually_upgrade_to_local(
)
logger.info(f"Upgrade of app {app} finished")


#######################################################################
#
# Tests
#
#######################################################################
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "xlarge"])
@pytest.mark.group("happy_path_upgrade")
@pytest.mark.abort_on_fail
@pytest.mark.skip_if_deployed
async def test_large_deployment_deploy_original_charm(ops_test: OpsTest) -> None:
    """Build the large-deployment environment, passing STARTING_VERSION to ``_build_env``."""
    await _build_env(ops_test, version=STARTING_VERSION)


@pytest.mark.group("happy_path_upgrade")
@pytest.mark.abort_on_fail
async def test_upgrade_between_versions(
    ops_test: OpsTest, c_writes: ContinuousWrites, c_writes_runner
) -> None:
    """Upgrade through every listed revision, then to the locally built charm.

    Walks VERSION_TO_REVISION in order, skipping STARTING_VERSION, and calls
    ``_upgrade`` with each revision. A final ``_upgrade`` call uses the local
    build, after which continuous-writes consistency is asserted.
    """
    for version, revision in VERSION_TO_REVISION.items():
        # The deployment already runs STARTING_VERSION; only later entries need a refresh.
        if version != STARTING_VERSION:
            logger.info(f"Upgrading to version {version}")
            await _upgrade(ops_test, revision=revision)

    await _upgrade(ops_test, local_build=True)

    # Continuous writes must have stayed consistent across the whole upgrade chain.
    checked_apps = [APP_NAME, OPENSEARCH_MAIN_APP_NAME]
    await assert_continuous_writes_consistency(ops_test, c_writes, checked_apps)


##################################################################################
#
# test scenarios from each version:
# Start with each version, moving to local and then rolling back mid-upgrade
# Once this test passes, the 2nd test will rerun the upgrade, this time to
# its end.
#
##################################################################################
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "xlarge"])
@pytest.mark.parametrize("version", UPGRADE_INITIAL_VERSION)
@pytest.mark.abort_on_fail
@pytest.mark.skip_if_deployed
async def test_deploy_from_version(ops_test: OpsTest, version) -> None:
    """Build the large-deployment environment for the parametrized starting *version*."""
    await _build_env(ops_test, version=version)


@pytest.mark.parametrize("version", UPGRADE_INITIAL_VERSION)
@pytest.mark.abort_on_fail
async def test_upgrade_rollback_from_local(
    ops_test: OpsTest, c_writes: ContinuousWrites, c_writes_runner, version
) -> None:
    """Refresh every application to the locally built charm and wait for it to settle.

    Steps:
      1. Run ``pre-upgrade-check`` on the leader of the main orchestrator app.
      2. Build the local charm once (cached in the module-level ``charm``).
      3. For each application in WORKLOAD, refresh to the local charm and wait
         until the app is active or blocked with all units active.
      4. Assert continuous-writes consistency.

    ``version`` is not read in the body; it only selects the parametrization
    (and therefore the test group) this run belongs to.

    NOTE(review): the name and the banner comment mention a rollback, but no
    rollback refresh is performed here -- confirm whether that step is still
    to be added.
    """
    leader_id = await get_leader_unit_id(ops_test, OPENSEARCH_MAIN_APP_NAME)
    action = await run_action(
        ops_test,
        leader_id,
        "pre-upgrade-check",
        app=OPENSEARCH_MAIN_APP_NAME,
    )
    assert action.status == "completed"

    logger.info("Build charm locally")
    global charm
    if not charm:
        charm = await ops_test.build_charm(".")

    # A single fast_forward context covers the whole refresh loop.
    async with ops_test.fast_forward():
        for app, unit_count in WORKLOAD.items():
            # get_leader_unit_id is a coroutine function: it must be awaited,
            # otherwise leader_id is a coroutine object rather than a unit id.
            leader_id = await get_leader_unit_id(ops_test, app)
            logger.info(f"Refresh app {app}, leader {leader_id}")

            await refresh(ops_test, app, path=charm)
            logger.info("Refresh is over, waiting for the charm to settle")

            # Wait until we are set in an idle state and can rollback the revision.
            # app status blocked: that will happen if we are jumping N-2 versions in our test
            # app status active: that will happen if we are jumping N-1 in our test
            await wait_until(
                ops_test,
                apps=[app],
                apps_statuses=["active", "blocked"],
                units_statuses=["active"],
                wait_for_exact_units={
                    app: unit_count,
                },
                idle_period=120,
                timeout=3600,
            )

    # continuous writes checks
    await assert_continuous_writes_consistency(
        ops_test,
        c_writes,
        [APP_NAME, OPENSEARCH_MAIN_APP_NAME],
    )


@pytest.mark.parametrize("version", UPGRADE_INITIAL_VERSION)
@pytest.mark.abort_on_fail
async def test_upgrade_from_version_to_local(
    ops_test: OpsTest, c_writes: ContinuousWrites, c_writes_runner, version
) -> None:
    """Test upgrade from upstream to the currently locally built version.

    Builds the local charm on first use (cached in the module-level ``charm``)
    and hands it to ``assert_upgrade_to_local``. ``version`` is not read in
    the body; it only selects the parametrization this run belongs to.
    """
    global charm

    logger.info("Build charm locally")
    if not charm:
        # Build once and reuse the artifact for every later test in the module.
        charm = await ops_test.build_charm(".")

    await assert_upgrade_to_local(ops_test, c_writes, charm)
Loading
Loading