Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[nexus][support-bundles] Avoid looking at expunged datasets, zones #7325

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion nexus/db-model/src/schema_versions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use std::collections::BTreeMap;
///
/// This must be updated when you change the database schema. Refer to
/// schema/crdb/README.adoc in the root of this repository for details.
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(118, 0, 0);
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(119, 0, 0);

/// List of all past database schema versions, in *reverse* order
///
Expand All @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
// | leaving the first copy as an example for the next person.
// v
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
KnownVersion::new(119, "support-bundle-dataset-index"),
KnownVersion::new(118, "support-bundles"),
KnownVersion::new(117, "add-completing-and-new-region-volume"),
KnownVersion::new(116, "bp-physical-disk-disposition"),
Expand Down
149 changes: 126 additions & 23 deletions nexus/db-queries/src/db/datastore/support_bundle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -225,22 +225,7 @@ impl DataStore {
) -> Result<SupportBundleExpungementReport, Error> {
opctx.authorize(authz::Action::Modify, &authz::FLEET).await?;

// For this blueprint: The set of all expunged Nexus zones
let invalid_nexus_zones = blueprint
.all_omicron_zones(
nexus_types::deployment::BlueprintZoneFilter::Expunged,
)
.filter_map(|(_sled, zone)| {
if matches!(
zone.zone_type,
nexus_types::deployment::BlueprintZoneType::Nexus(_)
) {
Some(zone.id.into_untyped_uuid())
} else {
None
}
})
.collect::<Vec<Uuid>>();
// For this blueprint: The set of all in-service Nexus zones.
let valid_nexus_zones = blueprint
.all_omicron_zones(
nexus_types::deployment::BlueprintZoneFilter::ShouldBeRunning,
Expand All @@ -257,10 +242,10 @@ impl DataStore {
})
.collect::<Vec<Uuid>>();

// For this blueprint: The set of expunged debug datasets
let invalid_datasets = blueprint
// For this blueprint: The set of in-service debug datasets
let valid_datasets = blueprint
.all_omicron_datasets(
nexus_types::deployment::BlueprintDatasetFilter::Expunged,
nexus_types::deployment::BlueprintDatasetFilter::InService,
)
.filter_map(|(_sled_id, dataset_config)| {
if matches!(
Expand All @@ -282,15 +267,14 @@ impl DataStore {
opctx,
BlueprintUuid::from_untyped_uuid(blueprint.id),
|conn| {
let invalid_nexus_zones = invalid_nexus_zones.clone();
let valid_nexus_zones = valid_nexus_zones.clone();
let invalid_datasets = invalid_datasets.clone();
let valid_datasets = valid_datasets.clone();
async move {
use db::schema::support_bundle::dsl;

// Find all bundles without backing storage.
let bundles_with_bad_datasets = dsl::support_bundle
.filter(dsl::dataset_id.eq_any(invalid_datasets))
.filter(dsl::dataset_id.ne_all(valid_datasets))
.select(SupportBundle::as_select())
.load_async(conn)
.await?;
Expand Down Expand Up @@ -347,7 +331,7 @@ impl DataStore {

// Find all bundles on nexuses that no longer exist.
let bundles_with_bad_nexuses = dsl::support_bundle
.filter(dsl::assigned_nexus.eq_any(invalid_nexus_zones))
.filter(dsl::assigned_nexus.ne_all(valid_nexus_zones))
.select(SupportBundle::as_select())
.load_async(conn)
.await?;
Expand Down Expand Up @@ -955,6 +939,15 @@ mod test {
}
}

/// Removes the bundle's backing dataset from every sled in the blueprint,
/// bumping the dataset generation on any sled that actually contained it.
///
/// Unlike expungement, this fully prunes the dataset record from the
/// blueprint, simulating a dataset that has been deleted outright.
fn delete_dataset_for_bundle(bp: &mut Blueprint, bundle: &SupportBundle) {
    let target: DatasetUuid = bundle.dataset_id.into();
    for datasets in bp.blueprint_datasets.values_mut() {
        // Only sleds that owned the dataset see a generation bump.
        if datasets.datasets.remove(&target).is_some() {
            datasets.generation = datasets.generation.next();
        }
    }
}

fn expunge_nexus_for_bundle(bp: &mut Blueprint, bundle: &SupportBundle) {
for zones in bp.blueprint_zones.values_mut() {
for (_, zone) in &mut zones.zones {
Expand Down Expand Up @@ -1187,6 +1180,116 @@ mod test {
logctx.cleanup_successful();
}

// This test is identical to "test_bundle_failed_from_expunged_dataset", but
// it fully deletes the dataset rather than marking it expunged in the
// blueprint.
#[tokio::test]
async fn test_bundle_failed_from_pruned_dataset() {
    static TEST_NAME: &str = "test_bundle_failed_from_pruned_dataset";
    let logctx = dev::test_setup_log(TEST_NAME);
    let db = TestDatabase::new_with_datastore(&logctx.log).await;
    let (opctx, datastore) = (db.opctx(), db.datastore());

    // Build a deterministic example system and blueprint, seeded from the
    // test's name so runs are reproducible.
    let mut rng = SimRngState::from_seed(TEST_NAME);
    let (_example, mut bp1) = ExampleSystemBuilder::new_with_rng(
        &logctx.log,
        rng.next_system_rng(),
    )
    .build();

    // Weirdly, the "ExampleSystemBuilder" blueprint has a parent blueprint,
    // but which isn't exposed through the API. Since we're only able to see
    // the blueprint it emits, that means we can't actually make it the
    // target because "the parent blueprint is not the current target".
    //
    // Instead of dealing with that, we lie: claim this is the primordial
    // blueprint, with no parent.
    //
    // Regardless, make this starter blueprint our target.
    bp1.parent_blueprint_id = None;
    bp_insert_and_make_target(&opctx, &datastore, &bp1).await;

    // Manually perform the equivalent of blueprint execution to populate
    // database records.
    let sleds = TestSled::new_from_blueprint(&bp1);
    for sled in &sleds {
        sled.create_database_records(&datastore, &opctx).await;
    }

    // Extract Nexus and Dataset information from the generated blueprint.
    let this_nexus_id = get_nexuses_from_blueprint(
        &bp1,
        BlueprintZoneFilter::ShouldBeRunning,
    )
    .get(0)
    .map(|id| *id)
    .expect("There should be a Nexus in the example blueprint");
    let debug_datasets = get_debug_datasets_from_blueprint(
        &bp1,
        BlueprintDatasetFilter::InService,
    );
    assert!(!debug_datasets.is_empty());

    // When we create a bundle, it should exist on a dataset provisioned by
    // the blueprint.
    let bundle = datastore
        .support_bundle_create(&opctx, "for the test", this_nexus_id)
        .await
        .expect("Should be able to create bundle");
    assert_eq!(bundle.assigned_nexus, Some(this_nexus_id.into()));
    assert!(
        debug_datasets.contains(&DatasetUuid::from(bundle.dataset_id)),
        "Bundle should have been allocated from a blueprint dataset"
    );

    // If we try to "fail support bundles" from expunged datasets/nexuses,
    // we should see a no-op. Nothing has been expunged yet!
    let report =
        datastore.support_bundle_fail_expunged(&opctx, &bp1).await.expect(
            "Should have been able to perform no-op support bundle failure",
        );
    assert_eq!(SupportBundleExpungementReport::default(), report);

    // Fully remove the bundle's dataset (manually)
    let bp2 = {
        let mut bp2 = bp1.clone();
        bp2.id = Uuid::new_v4();
        bp2.parent_blueprint_id = Some(bp1.id);
        delete_dataset_for_bundle(&mut bp2, &bundle);
        bp2
    };
    bp_insert_and_make_target(&opctx, &datastore, &bp2).await;

    // Expunged-bundle failure may only be performed against the current
    // target blueprint; bp1 has been superseded by bp2.
    datastore
        .support_bundle_fail_expunged(&opctx, &bp1)
        .await
        .expect_err("bp1 is no longer the target; this should fail");
    // With bp2 as the target, the bundle's dataset is no longer among the
    // in-service datasets, so the bundle should be marked as failed.
    let report = datastore
        .support_bundle_fail_expunged(&opctx, &bp2)
        .await
        .expect("Should have been able to mark bundle state as failed");
    assert_eq!(
        SupportBundleExpungementReport {
            bundles_failed_missing_datasets: 1,
            ..Default::default()
        },
        report
    );

    // Confirm the persisted bundle state and the recorded failure reason.
    let observed_bundle = datastore
        .support_bundle_get(&opctx, bundle.id.into())
        .await
        .expect("Should be able to get bundle we just failed");
    assert_eq!(SupportBundleState::Failed, observed_bundle.state);
    assert!(observed_bundle
        .reason_for_failure
        .unwrap()
        .contains(FAILURE_REASON_NO_DATASET));

    db.terminate().await;
    logctx.cleanup_successful();
}

#[tokio::test]
async fn test_bundle_failed_from_expunged_nexus_no_reassign() {
static TEST_NAME: &str =
Expand Down
4 changes: 2 additions & 2 deletions schema/crdb/dbinit.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2447,7 +2447,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.support_bundle (
-- for allocation changes to allocate more intelligently.
CREATE UNIQUE INDEX IF NOT EXISTS one_bundle_per_dataset ON omicron.public.support_bundle (
dataset_id
);
) WHERE dataset_id IS NOT NULL;
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is necessary to avoid a full-table scan


CREATE INDEX IF NOT EXISTS lookup_bundle_by_nexus ON omicron.public.support_bundle (
assigned_nexus
Expand Down Expand Up @@ -4757,7 +4757,7 @@ INSERT INTO omicron.public.db_metadata (
version,
target_version
) VALUES
(TRUE, NOW(), NOW(), '118.0.0', NULL)
(TRUE, NOW(), NOW(), '119.0.0', NULL)
ON CONFLICT DO NOTHING;

COMMIT;
2 changes: 2 additions & 0 deletions schema/crdb/support-bundle-dataset-index/up01.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- Drop the old unique index so it can be recreated as a partial index
-- (excluding rows whose dataset_id is NULL) in the next migration step.
DROP INDEX IF EXISTS one_bundle_per_dataset CASCADE;
4 changes: 4 additions & 0 deletions schema/crdb/support-bundle-dataset-index/up02.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- Create a new unique index, limited to rows with a backing dataset.
-- The WHERE clause makes this a partial index; per the accompanying review
-- note, this is needed to avoid a full-table scan on dataset_id lookups.
CREATE UNIQUE INDEX IF NOT EXISTS one_bundle_per_dataset ON omicron.public.support_bundle (
    dataset_id
) WHERE dataset_id IS NOT NULL;
Loading