From 09b150f17780ccd8e35051f4611ff9883ced71ff Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 19 Dec 2024 09:22:54 -0500 Subject: [PATCH] Handle region snapshot replacement volume deletes (#7046) Volumes can be deleted at any time, but the tasks and sagas that perform region snapshot replacement did not account for this. This commit adds checks in a few places for whether a volume is soft-deleted or hard-deleted, and bails out of any affected region snapshot replacement accordingly: - if a volume that has the region snapshot being replaced is soft-deleted, then skip making a region snapshot replacement step for it - if a region snapshot replacement step has the volume deleted after the step was created, transition it directly to the VolumeDeleted state - if a region snapshot replacement step has the volume deleted during the saga invocation, then skip notifying any Upstairs and allow the saga to transition the request to Complete, where the associated clean up can then proceed An interesting race condition emerged during unit testing: the read-only region allocated to replace a region snapshot would be swapped into the snapshot volume, but would be susceptible to being deleted by the user, and therefore unable to be swapped into other volumes that have that snapshot volume as a read-only parent. This requires an additional volume that uses that read-only region in order to bump the reference count associated with that region, so that the user cannot delete it before it is used to replace all other uses of the region snapshot it was meant to replace. This additional volume lives as long as the region snapshot replacement, and therefore needs to be deleted when the region snapshot replacement is finished. This required a new region snapshot replacement finish saga, which required a new "Completing" state to perform the same type of state-based lock on the replacement request as is done for all the other sagas. 
Testing also revealed that there were scenarios where `find_deleted_volume_regions` would return volumes for hard-deletion prematurely. The function now returns a struct instead of a list of tuples, and in that struct, regions freed for deletion are now distinct from volumes freed for deletion. Volumes are now only returned for hard-deletion when all associated read/write regions have been (or are going to be) deleted. Fixes #6353 --- dev-tools/omdb/src/bin/omdb/db.rs | 41 +- dev-tools/omdb/src/bin/omdb/nexus.rs | 23 +- dev-tools/omdb/tests/successes.out | 8 +- .../src/region_snapshot_replacement.rs | 20 +- nexus/db-model/src/schema.rs | 1 + nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-queries/src/db/datastore/mod.rs | 11 +- .../src/db/datastore/region_replacement.rs | 58 +- .../datastore/region_snapshot_replacement.rs | 575 +++++- nexus/db-queries/src/db/datastore/volume.rs | 130 +- .../src/db/datastore/volume_repair.rs | 165 +- nexus/src/app/background/init.rs | 2 +- .../background/tasks/region_replacement.rs | 54 +- .../tasks/region_replacement_driver.rs | 57 + .../region_snapshot_replacement_finish.rs | 118 +- ...on_snapshot_replacement_garbage_collect.rs | 41 +- .../region_snapshot_replacement_start.rs | 352 +++- .../tasks/region_snapshot_replacement_step.rs | 193 +- nexus/src/app/sagas/mod.rs | 4 +- .../app/sagas/region_replacement_finish.rs | 14 + .../src/app/sagas/region_replacement_start.rs | 9 +- .../region_snapshot_replacement_finish.rs | 211 ++ ...on_snapshot_replacement_garbage_collect.rs | 18 +- .../region_snapshot_replacement_start.rs | 118 +- .../sagas/region_snapshot_replacement_step.rs | 17 +- ...apshot_replacement_step_garbage_collect.rs | 18 +- nexus/src/app/sagas/volume_delete.rs | 42 +- nexus/test-utils/src/background.rs | 2 +- .../crucible_replacements.rs | 1706 ++++++++++++++--- .../integration_tests/volume_management.rs | 7 +- nexus/types/src/internal_api/background.rs | 4 +- .../up01.sql | 1 + .../up02.sql | 1 + 
schema/crdb/dbinit.sql | 9 +- 34 files changed, 3540 insertions(+), 493 deletions(-) create mode 100644 nexus/src/app/sagas/region_snapshot_replacement_finish.rs create mode 100644 schema/crdb/add-completing-and-new-region-volume/up01.sql create mode 100644 schema/crdb/add-completing-and-new-region-volume/up02.sql diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 5058739da0..e501e650b1 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -2591,39 +2591,48 @@ async fn cmd_db_region_used_by( async fn cmd_db_region_find_deleted( datastore: &DataStore, ) -> Result<(), anyhow::Error> { - let datasets_regions_volumes = + let freed_crucible_resources = datastore.find_deleted_volume_regions().await?; #[derive(Tabled)] - struct Row { + struct RegionRow { dataset_id: DatasetUuid, region_id: Uuid, - volume_id: String, } - let rows: Vec = datasets_regions_volumes - .into_iter() + #[derive(Tabled)] + struct VolumeRow { + volume_id: Uuid, + } + + let region_rows: Vec = freed_crucible_resources + .datasets_and_regions + .iter() .map(|row| { - let (dataset, region, volume) = row; + let (dataset, region) = row; - Row { - dataset_id: dataset.id(), - region_id: region.id(), - volume_id: if let Some(volume) = volume { - volume.id().to_string() - } else { - String::from("") - }, - } + RegionRow { dataset_id: dataset.id(), region_id: region.id() } }) .collect(); - let table = tabled::Table::new(rows) + let table = tabled::Table::new(region_rows) .with(tabled::settings::Style::psql()) .to_string(); println!("{}", table); + let volume_rows: Vec = freed_crucible_resources + .volumes + .iter() + .map(|volume_id| VolumeRow { volume_id: *volume_id }) + .collect(); + + let volume_table = tabled::Table::new(volume_rows) + .with(tabled::settings::Style::psql()) + .to_string(); + + println!("{}", volume_table); + Ok(()) } diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 
fe16b4076e..eb7d74b340 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -1655,6 +1655,14 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { println!(" > {line}"); } + println!( + " total requests completed ok: {}", + status.requests_completed_ok.len(), + ); + for line in &status.requests_completed_ok { + println!(" > {line}"); + } + println!(" errors: {}", status.errors.len()); for line in &status.errors { println!(" > {line}"); @@ -1720,6 +1728,14 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { println!(" > {line}"); } + println!( + " total steps set to volume_deleted ok: {}", + status.step_set_volume_deleted_ok.len(), + ); + for line in &status.step_set_volume_deleted_ok { + println!(" > {line}"); + } + println!(" errors: {}", status.errors.len()); for line in &status.errors { println!(" > {line}"); @@ -1831,10 +1847,11 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { Ok(status) => { println!( - " total records transitioned to done: {}", - status.records_set_to_done.len(), + " region snapshot replacement finish sagas started \ + ok: {}", + status.finish_invoked_ok.len() ); - for line in &status.records_set_to_done { + for line in &status.finish_invoked_ok { println!(" > {line}"); } diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index dcf1671d8f..75f6ce0d0e 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -627,7 +627,7 @@ task: "region_snapshot_replacement_finish" currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms - total records transitioned to done: 0 + region snapshot replacement finish sagas started ok: 0 errors: 0 task: "region_snapshot_replacement_garbage_collection" @@ -645,6 +645,7 @@ task: "region_snapshot_replacement_start" started at (s ago) and ran for ms total 
requests created ok: 0 total start saga invoked ok: 0 + total requests completed ok: 0 errors: 0 task: "region_snapshot_replacement_step" @@ -655,6 +656,7 @@ task: "region_snapshot_replacement_step" total step records created ok: 0 total step garbage collect saga invoked ok: 0 total step saga invoked ok: 0 + total steps set to volume_deleted ok: 0 errors: 0 task: "saga_recovery" @@ -1070,7 +1072,7 @@ task: "region_snapshot_replacement_finish" currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms - total records transitioned to done: 0 + region snapshot replacement finish sagas started ok: 0 errors: 0 task: "region_snapshot_replacement_garbage_collection" @@ -1088,6 +1090,7 @@ task: "region_snapshot_replacement_start" started at (s ago) and ran for ms total requests created ok: 0 total start saga invoked ok: 0 + total requests completed ok: 0 errors: 0 task: "region_snapshot_replacement_step" @@ -1098,6 +1101,7 @@ task: "region_snapshot_replacement_step" total step records created ok: 0 total step garbage collect saga invoked ok: 0 total step saga invoked ok: 0 + total steps set to volume_deleted ok: 0 errors: 0 task: "saga_recovery" diff --git a/nexus/db-model/src/region_snapshot_replacement.rs b/nexus/db-model/src/region_snapshot_replacement.rs index bcbd55028d..28627d8379 100644 --- a/nexus/db-model/src/region_snapshot_replacement.rs +++ b/nexus/db-model/src/region_snapshot_replacement.rs @@ -28,6 +28,7 @@ impl_enum_type!( ReplacementDone => b"replacement_done" DeletingOldVolume => b"deleting_old_volume" Running => b"running" + Completing => b"completing" Complete => b"complete" ); @@ -46,6 +47,7 @@ impl std::str::FromStr for RegionSnapshotReplacementState { Ok(RegionSnapshotReplacementState::DeletingOldVolume) } "running" => Ok(RegionSnapshotReplacementState::Running), + "completing" => Ok(RegionSnapshotReplacementState::Completing), "complete" => Ok(RegionSnapshotReplacementState::Complete), 
_ => Err(format!("unrecognized value {} for enum", s)), } @@ -79,9 +81,14 @@ impl std::str::FromStr for RegionSnapshotReplacementState { /// | | /// v --- /// --- -/// Running | -/// | set in region snapshot replacement -/// | | finish background task +/// Running <-- | +/// | | +/// | | | +/// v | | +/// | | responsibility of region snapshot +/// Completing -- | replacement finish saga +/// | +/// | | /// v | /// | /// Complete --- @@ -133,6 +140,12 @@ pub struct RegionSnapshotReplacement { pub replacement_state: RegionSnapshotReplacementState, pub operating_saga_id: Option, + + /// In order for the newly created region not to be deleted inadvertently, + /// an additional reference count bump is required. This volume should live + /// as long as this request so that all necessary replacements can be + /// completed. + pub new_region_volume_id: Option, } impl RegionSnapshotReplacement { @@ -157,6 +170,7 @@ impl RegionSnapshotReplacement { old_snapshot_id, old_snapshot_volume_id: None, new_region_id: None, + new_region_volume_id: None, replacement_state: RegionSnapshotReplacementState::Requested, operating_saga_id: None, } diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 399da81ea4..a54c2e3029 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1932,6 +1932,7 @@ table! { new_region_id -> Nullable, replacement_state -> crate::RegionSnapshotReplacementStateEnum, operating_saga_id -> Nullable, + new_region_volume_id -> Nullable, } } diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 02646bc6dd..8b663d2549 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. 
-pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(116, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(117, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(117, "add-completing-and-new-region-volume"), KnownVersion::new(116, "bp-physical-disk-disposition"), KnownVersion::new(115, "inv-omicron-physical-disks-generation"), KnownVersion::new(114, "crucible-ref-count-records"), diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index c90f1cc92e..d20f24e773 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -125,16 +125,7 @@ pub use sled::TransitionError; pub use switch_port::SwitchPortSettingsCombinedResult; pub use virtual_provisioning_collection::StorageType; pub use vmm::VmmStateUpdateResult; -pub use volume::read_only_resources_associated_with_volume; -pub use volume::CrucibleResources; -pub use volume::CrucibleTargets; -pub use volume::ExistingTarget; -pub use volume::ReplacementTarget; -pub use volume::VolumeCheckoutReason; -pub use volume::VolumeReplaceResult; -pub use volume::VolumeReplacementParams; -pub use volume::VolumeToDelete; -pub use volume::VolumeWithTarget; +pub use volume::*; // Number of unique datasets required to back a region. // TODO: This should likely turn into a configuration option. 
diff --git a/nexus/db-queries/src/db/datastore/region_replacement.rs b/nexus/db-queries/src/db/datastore/region_replacement.rs index 0d259ba47e..8aad7f2cfd 100644 --- a/nexus/db-queries/src/db/datastore/region_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_replacement.rs @@ -59,19 +59,22 @@ impl DataStore { opctx: &OpContext, request: RegionReplacement, ) -> Result<(), Error> { + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; self.transaction_retry_wrapper("insert_region_replacement_request") .transaction(&conn, |conn| { let request = request.clone(); + let err = err.clone(); async move { use db::schema::region_replacement::dsl; - Self::volume_repair_insert_query( + Self::volume_repair_insert_in_txn( + &conn, + err, request.volume_id, request.id, ) - .execute_async(&conn) .await?; diesel::insert_into(dsl::region_replacement) @@ -83,7 +86,13 @@ impl DataStore { } }) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } pub async fn get_region_replacement_request_by_id( @@ -915,6 +924,7 @@ mod test { use crate::db::pub_test_utils::TestDatabase; use omicron_test_utils::dev; + use sled_agent_client::types::VolumeConstructionRequest; #[tokio::test] async fn test_one_replacement_per_volume() { @@ -926,6 +936,20 @@ mod test { let region_2_id = Uuid::new_v4(); let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request_1 = RegionReplacement::new(region_1_id, volume_id); let request_2 = RegionReplacement::new(region_2_id, volume_id); @@ -957,6 +981,20 @@ mod test { let region_id = Uuid::new_v4(); let volume_id = 
Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = { let mut request = RegionReplacement::new(region_id, volume_id); request.replacement_state = RegionReplacementState::Running; @@ -1049,6 +1087,20 @@ mod test { let region_id = Uuid::new_v4(); let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = { let mut request = RegionReplacement::new(region_id, volume_id); request.replacement_state = RegionReplacementState::ReplacementDone; diff --git a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs index 76a83cca2a..90b014c582 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs @@ -16,7 +16,6 @@ use crate::db::model::RegionSnapshotReplacement; use crate::db::model::RegionSnapshotReplacementState; use crate::db::model::RegionSnapshotReplacementStep; use crate::db::model::RegionSnapshotReplacementStepState; -use crate::db::model::VolumeRepair; use crate::db::pagination::paginated; use crate::db::pagination::Paginator; use crate::db::update_and_check::UpdateAndCheck; @@ -93,6 +92,7 @@ impl DataStore { request: RegionSnapshotReplacement, volume_id: Uuid, ) -> Result<(), Error> { + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; self.transaction_retry_wrapper( @@ -100,20 +100,24 @@ impl DataStore { ) .transaction(&conn, |conn| { let request = request.clone(); + let err = 
err.clone(); async move { use db::schema::region_snapshot_replacement::dsl; - use db::schema::volume_repair::dsl as volume_repair_dsl; - // An associated volume repair record isn't _strictly_ needed: - // snapshot volumes should never be directly constructed, and - // therefore won't ever have an associated Upstairs that - // receives a volume replacement request. However it's being - // done in an attempt to be overly cautious. - - diesel::insert_into(volume_repair_dsl::volume_repair) - .values(VolumeRepair { volume_id, repair_id: request.id }) - .execute_async(&conn) - .await?; + // An associated volume repair record isn't _strictly_ + // needed: snapshot volumes should never be directly + // constructed, and therefore won't ever have an associated + // Upstairs that receives a volume replacement request. + // However it's being done in an attempt to be overly + // cautious, and it validates that the volume exists: + // otherwise it would be possible to create a region + // snapshot replacement request for a volume that didn't + // exist! 
+ + Self::volume_repair_insert_in_txn( + &conn, err, volume_id, request.id, + ) + .await?; diesel::insert_into(dsl::region_snapshot_replacement) .values(request) @@ -124,7 +128,13 @@ impl DataStore { } }) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } pub async fn get_region_snapshot_replacement_request_by_id( @@ -342,6 +352,7 @@ impl DataStore { region_snapshot_replacement_id: Uuid, operating_saga_id: Uuid, new_region_id: Uuid, + new_region_volume_id: Uuid, old_snapshot_volume_id: Uuid, ) -> Result<(), Error> { use db::schema::region_snapshot_replacement::dsl; @@ -357,6 +368,7 @@ impl DataStore { .eq(RegionSnapshotReplacementState::ReplacementDone), dsl::old_snapshot_volume_id.eq(Some(old_snapshot_volume_id)), dsl::new_region_id.eq(Some(new_region_id)), + dsl::new_region_volume_id.eq(Some(new_region_volume_id)), dsl::operating_saga_id.eq(Option::::None), )) .check_if_exists::( @@ -375,6 +387,8 @@ impl DataStore { && record.replacement_state == RegionSnapshotReplacementState::ReplacementDone && record.new_region_id == Some(new_region_id) + && record.new_region_volume_id + == Some(new_region_volume_id) && record.old_snapshot_volume_id == Some(old_snapshot_volume_id) { @@ -557,15 +571,201 @@ impl DataStore { } } - /// Transition a RegionSnapshotReplacement record from Running to Complete. - /// Also removes the `volume_repair` record that is taking a "lock" on the - /// Volume. Note this doesn't occur from a saga context, and therefore 1) - /// doesn't accept an operating saga id parameter, and 2) checks that - /// operating_saga_id is null for the corresponding record. + /// Transition a RegionSnapshotReplacement record from Running to + /// Completing, setting a unique id at the same time. 
+ pub async fn set_region_snapshot_replacement_completing( + &self, + opctx: &OpContext, + region_snapshot_replacement_id: Uuid, + operating_saga_id: Uuid, + ) -> Result<(), Error> { + use db::schema::region_snapshot_replacement::dsl; + let updated = diesel::update(dsl::region_snapshot_replacement) + .filter(dsl::id.eq(region_snapshot_replacement_id)) + .filter( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Running), + ) + .filter(dsl::operating_saga_id.is_null()) + .set(( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Completing), + dsl::operating_saga_id.eq(operating_saga_id), + )) + .check_if_exists::( + region_snapshot_replacement_id, + ) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await; + + match updated { + Ok(result) => match result.status { + UpdateStatus::Updated => Ok(()), + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.operating_saga_id == Some(operating_saga_id) + && record.replacement_state + == RegionSnapshotReplacementState::Completing + { + Ok(()) + } else { + Err(Error::conflict(format!( + "region snapshot replacement {} set to {:?} \ + (operating saga id {:?})", + region_snapshot_replacement_id, + record.replacement_state, + record.operating_saga_id, + ))) + } + } + }, + + Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)), + } + } + + /// Transition a RegionSnapshotReplacement record from Completing to Running, + clearing the operating saga id. 
+ pub async fn undo_set_region_snapshot_replacement_completing( + &self, + opctx: &OpContext, + region_snapshot_replacement_id: Uuid, + operating_saga_id: Uuid, + ) -> Result<(), Error> { + use db::schema::region_snapshot_replacement::dsl; + let updated = diesel::update(dsl::region_snapshot_replacement) + .filter(dsl::id.eq(region_snapshot_replacement_id)) + .filter( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Completing), + ) + .filter(dsl::operating_saga_id.eq(operating_saga_id)) + .set(( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Running), + dsl::operating_saga_id.eq(Option::::None), + )) + .check_if_exists::( + region_snapshot_replacement_id, + ) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await; + + match updated { + Ok(result) => match result.status { + UpdateStatus::Updated => Ok(()), + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.operating_saga_id == None + && record.replacement_state + == RegionSnapshotReplacementState::Running + { + Ok(()) + } else { + Err(Error::conflict(format!( + "region snapshot replacement {} set to {:?} \ + (operating saga id {:?})", + region_snapshot_replacement_id, + record.replacement_state, + record.operating_saga_id, + ))) + } + } + }, + + Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)), + } + } + + /// Transition a RegionSnapshotReplacement record from Completing to + /// Complete. Also removes the `volume_repair` record that is taking a + /// "lock" on the Volume. 
pub async fn set_region_snapshot_replacement_complete( &self, opctx: &OpContext, region_snapshot_replacement_id: Uuid, + operating_saga_id: Uuid, + ) -> Result<(), Error> { + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper( + "set_region_snapshot_replacement_complete", + ) + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + use db::schema::volume_repair::dsl as volume_repair_dsl; + + diesel::delete( + volume_repair_dsl::volume_repair.filter( + volume_repair_dsl::repair_id + .eq(region_snapshot_replacement_id), + ), + ) + .execute_async(&conn) + .await?; + + use db::schema::region_snapshot_replacement::dsl; + + let result = diesel::update(dsl::region_snapshot_replacement) + .filter(dsl::id.eq(region_snapshot_replacement_id)) + .filter( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Completing), + ) + .filter(dsl::operating_saga_id.eq(operating_saga_id)) + .set(( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Complete), + dsl::operating_saga_id.eq(Option::::None), + )) + .check_if_exists::( + region_snapshot_replacement_id, + ) + .execute_and_check(&conn) + .await?; + + match result.status { + UpdateStatus::Updated => Ok(()), + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.replacement_state + == RegionSnapshotReplacementState::Complete + { + Ok(()) + } else { + Err(err.bail(Error::conflict(format!( + "region snapshot replacement {} set to {:?} \ + (operating saga id {:?})", + region_snapshot_replacement_id, + record.replacement_state, + record.operating_saga_id, + )))) + } + } + } + } + }) + .await + .map_err(|e| match err.take() { + Some(error) => error, + None => public_error_from_diesel(e, ErrorHandler::Server), + }) + } + + /// Transition a RegionSnapshotReplacement record from Requested to Complete + /// - this is required when the region snapshot is hard-deleted, which means + /// that all volume 
references are gone and no replacement is required. Also + /// removes the `volume_repair` record that is taking a "lock" on the + /// Volume. + pub async fn set_region_snapshot_replacement_complete_from_requested( + &self, + opctx: &OpContext, + region_snapshot_replacement_id: Uuid, ) -> Result<(), Error> { type TxnError = TransactionError; @@ -577,6 +777,7 @@ impl DataStore { let err = err.clone(); async move { use db::schema::volume_repair::dsl as volume_repair_dsl; + use db::schema::region_snapshot_replacement::dsl; diesel::delete( volume_repair_dsl::volume_repair.filter( @@ -587,17 +788,16 @@ impl DataStore { .execute_async(&conn) .await?; - use db::schema::region_snapshot_replacement::dsl; - let result = diesel::update(dsl::region_snapshot_replacement) .filter(dsl::id.eq(region_snapshot_replacement_id)) .filter( dsl::replacement_state - .eq(RegionSnapshotReplacementState::Running), + .eq(RegionSnapshotReplacementState::Requested), ) .filter(dsl::operating_saga_id.is_null()) - .set((dsl::replacement_state - .eq(RegionSnapshotReplacementState::Complete),)) + .filter(dsl::new_region_volume_id.is_null()) + .set(dsl::replacement_state + .eq(RegionSnapshotReplacementState::Complete)) .check_if_exists::( region_snapshot_replacement_id, ) @@ -621,7 +821,7 @@ impl DataStore { region_snapshot_replacement_id, record.replacement_state, record.operating_saga_id, - ), + ) )))) } } @@ -651,6 +851,7 @@ impl DataStore { opctx: &OpContext, request: RegionSnapshotReplacementStep, ) -> Result { + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; self.transaction_retry_wrapper( @@ -658,10 +859,10 @@ impl DataStore { ) .transaction(&conn, |conn| { let request = request.clone(); + let err = err.clone(); async move { use db::schema::region_snapshot_replacement_step::dsl; - use db::schema::volume_repair::dsl as volume_repair_dsl; // Skip inserting this new record if we found another region // snapshot replacement step with this volume in the 
step's @@ -714,13 +915,13 @@ impl DataStore { // volume replacement: create an associated volume repair // record. - diesel::insert_into(volume_repair_dsl::volume_repair) - .values(VolumeRepair { - volume_id: request.volume_id, - repair_id: request.id, - }) - .execute_async(&conn) - .await?; + Self::volume_repair_insert_in_txn( + &conn, + err, + request.volume_id, + request.id, + ) + .await?; let request_id = request.id; @@ -733,7 +934,13 @@ impl DataStore { } }) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } pub async fn get_region_snapshot_replacement_step_by_id( @@ -1073,6 +1280,87 @@ impl DataStore { Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)), } } + + /// Transition from Requested to VolumeDeleted, and remove the associated + /// `volume_repair` record. This occurs when the associated snapshot's + /// volume is deleted. Note this doesn't occur from a saga context, and + /// therefore 1) doesn't accept an operating saga id parameter, and 2) + /// checks that operating_saga_id is null for the corresponding record. 
+ pub async fn set_region_snapshot_replacement_step_volume_deleted_from_requested( + &self, + opctx: &OpContext, + region_snapshot_replacement_step: RegionSnapshotReplacementStep, + ) -> Result<(), Error> { + let conn = self.pool_connection_authorized(opctx).await?; + let err = OptionalError::new(); + + self.transaction_retry_wrapper( + "set_region_snapshot_replacement_complete", + ) + .transaction(&conn, |conn| { + let err = err.clone(); + + async move { + use db::schema::volume_repair::dsl as volume_repair_dsl; + + diesel::delete( + volume_repair_dsl::volume_repair.filter( + volume_repair_dsl::repair_id + .eq(region_snapshot_replacement_step.id), + ), + ) + .execute_async(&conn) + .await?; + + use db::schema::region_snapshot_replacement_step::dsl; + let result = + diesel::update(dsl::region_snapshot_replacement_step) + .filter(dsl::id.eq(region_snapshot_replacement_step.id)) + .filter(dsl::operating_saga_id.is_null()) + .filter(dsl::old_snapshot_volume_id.is_null()) + .filter( + dsl::replacement_state.eq( + RegionSnapshotReplacementStepState::Requested, + ), + ) + .set(dsl::replacement_state.eq( + RegionSnapshotReplacementStepState::VolumeDeleted, + )) + .check_if_exists::( + region_snapshot_replacement_step.id, + ) + .execute_and_check(&conn) + .await?; + + match result.status { + UpdateStatus::Updated => Ok(()), + + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.replacement_state + == RegionSnapshotReplacementStepState::VolumeDeleted + { + Ok(()) + } else { + Err(err.bail(Error::conflict(format!( + "region snapshot replacement step {} set \ + to {:?} (operating saga id {:?})", + region_snapshot_replacement_step.id, + record.replacement_state, + record.operating_saga_id, + )))) + } + } + } + } + }) + .await + .map_err(|e| match err.take() { + Some(error) => error, + None => public_error_from_diesel(e, ErrorHandler::Server), + }) + } } #[cfg(test)] @@ -1083,6 +1371,7 @@ mod test { use crate::db::pub_test_utils::TestDatabase; use 
omicron_test_utils::dev; use omicron_uuid_kinds::DatasetUuid; + use sled_agent_client::types::VolumeConstructionRequest; #[tokio::test] async fn test_one_replacement_per_volume() { @@ -1100,6 +1389,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request_1 = RegionSnapshotReplacement::new( dataset_1_id, region_1_id, @@ -1146,6 +1449,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request_1 = RegionSnapshotReplacement::new( dataset_1_id, region_1_id, @@ -1182,6 +1499,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = RegionSnapshotReplacement::new(dataset_id, region_id, snapshot_id); @@ -1214,11 +1545,25 @@ mod test { // Insert some replacement steps, and make sure counting works + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + { - let step = RegionSnapshotReplacementStep::new( - request_id, - Uuid::new_v4(), // volume id - ); + let step = + RegionSnapshotReplacementStep::new(request_id, step_volume_id); let result = datastore .insert_region_snapshot_replacement_step(&opctx, step) @@ -1247,11 +1592,25 @@ mod test { 1, ); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + { - let mut step = RegionSnapshotReplacementStep::new( - request_id, - Uuid::new_v4(), // volume id - ); + let mut step = + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step.replacement_state = RegionSnapshotReplacementStepState::Running; @@ -1283,11 +1642,25 @@ mod test { 1, ); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + { - let mut step = RegionSnapshotReplacementStep::new( - request_id, - Uuid::new_v4(), // volume id - ); + let mut step = + RegionSnapshotReplacementStep::new(request_id, step_volume_id); // VolumeDeleted does not count as "in-progress" step.replacement_state = @@ -1337,6 +1710,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let step = RegionSnapshotReplacementStep::new(Uuid::new_v4(), volume_id); let first_request_id = step.id; @@ -1431,6 +1818,22 @@ mod test { let db = TestDatabase::new_with_datastore(&logctx.log).await; let (opctx, datastore) = (db.opctx(), db.datastore()); + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut request = RegionSnapshotReplacement::new( DatasetUuid::new_v4(), Uuid::new_v4(), @@ -1442,9 +1845,7 @@ mod test { datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -1457,8 +1858,24 @@ mod test { .unwrap() .is_empty()); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step = - RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step.replacement_state = RegionSnapshotReplacementStepState::Complete; let result = datastore @@ -1468,8 +1885,24 @@ mod test { assert!(matches!(result, InsertStepResult::Inserted { .. 
})); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step = - RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step.replacement_state = RegionSnapshotReplacementStepState::Complete; let result = datastore @@ -1509,6 +1942,34 @@ mod test { let volume_id = Uuid::new_v4(); let old_snapshot_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + datastore + .volume_create(nexus_db_model::Volume::new( + old_snapshot_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step = RegionSnapshotReplacementStep::new(request_id, volume_id); step.replacement_state = RegionSnapshotReplacementStepState::Complete; @@ -1558,6 +2019,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = RegionReplacement::new(Uuid::new_v4(), volume_id); datastore diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 4e0d1ccac1..505533da50 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -59,6 +59,7 @@ use serde::Deserialize; use serde::Deserializer; use serde::Serialize; use sled_agent_client::types::VolumeConstructionRequest; +use std::collections::HashSet; use std::collections::VecDeque; use std::net::AddrParseError; use std::net::SocketAddr; @@ -179,6 +180,23 @@ enum ReplaceSnapshotError { MultipleResourceUsageRecords(String), } +/// Crucible resources freed by previous volume deletes +#[derive(Debug, Serialize, Deserialize)] +pub struct FreedCrucibleResources { + /// Regions that previously could not be deleted (often due to region + /// snaphots) that were freed by a volume delete + pub datasets_and_regions: Vec<(Dataset, Region)>, + + /// Previously soft-deleted volumes that can now be hard-deleted + pub volumes: Vec, +} + +impl FreedCrucibleResources { + pub fn is_empty(&self) -> bool { + self.datasets_and_regions.is_empty() && self.volumes.is_empty() + } +} + impl DataStore { async fn volume_create_in_txn( conn: &async_bb8_diesel::Connection, @@ -412,7 +430,7 @@ impl DataStore { }) } - async fn volume_get_impl( + pub(super) 
async fn volume_get_impl( conn: &async_bb8_diesel::Connection, volume_id: Uuid, ) -> Result, diesel::result::Error> { @@ -1115,12 +1133,12 @@ impl DataStore { .await } - /// Find regions for deleted volumes that do not have associated region - /// snapshots and are not being used by any other non-deleted volumes, and - /// return them for garbage collection + /// Find read/write regions for deleted volumes that do not have associated + /// region snapshots and are not being used by any other non-deleted + /// volumes, and return them for garbage collection pub async fn find_deleted_volume_regions( &self, - ) -> ListResultVec<(Dataset, Region, Option)> { + ) -> LookupResult { let conn = self.pool_connection_unauthorized().await?; self.transaction_retry_wrapper("find_deleted_volume_regions") .transaction(&conn, |conn| async move { @@ -1132,8 +1150,7 @@ impl DataStore { async fn find_deleted_volume_regions_in_txn( conn: &async_bb8_diesel::Connection, - ) -> Result)>, diesel::result::Error> - { + ) -> Result { use db::schema::dataset::dsl as dataset_dsl; use db::schema::region::dsl as region_dsl; use db::schema::region_snapshot::dsl; @@ -1179,6 +1196,9 @@ impl DataStore { let mut deleted_regions = Vec::with_capacity(unfiltered_deleted_regions.len()); + let mut volume_set: HashSet = + HashSet::with_capacity(unfiltered_deleted_regions.len()); + for (dataset, region, region_snapshot, volume) in unfiltered_deleted_regions { @@ -1212,10 +1232,61 @@ impl DataStore { continue; } + if let Some(volume) = &volume { + volume_set.insert(volume.id()); + } + deleted_regions.push((dataset, region, volume)); } - Ok(deleted_regions) + let regions_for_deletion: HashSet = + deleted_regions.iter().map(|(_, region, _)| region.id()).collect(); + + let mut volumes = Vec::with_capacity(deleted_regions.len()); + + for volume_id in volume_set { + // Do not return a volume hard-deletion if there are still lingering + // read/write regions, unless all those lingering read/write regions + // will 
be deleted from the result of returning from this function. + let allocated_rw_regions: HashSet = + Self::get_allocated_regions_query(volume_id) + .get_results_async::<(Dataset, Region)>(conn) + .await? + .into_iter() + .filter_map(|(_, region)| { + if !region.read_only() { + Some(region.id()) + } else { + None + } + }) + .collect(); + + if allocated_rw_regions.is_subset(®ions_for_deletion) { + // If all the allocated rw regions for this volume are in the + // set of regions being returned for deletion, then we can + // hard-delete this volume. Read-only region accounting should + // have already been updated by soft-deleting this volume. + // + // Note: we'll be in this branch if allocated_rw_regions is + // empty. I believe the only time we'll hit this empty case is + // when the volume is fully populated with read-only resources + // (read-only regions and region snapshots). + volumes.push(volume_id); + } else { + // Not all r/w regions allocated to this volume are being + // deleted here, so we can't hard-delete the volume yet. + } + } + + Ok(FreedCrucibleResources { + datasets_and_regions: deleted_regions + .into_iter() + .map(|(d, r, _)| (d, r)) + .collect(), + + volumes, + }) } pub async fn read_only_resources_associated_with_volume( @@ -2621,8 +2692,11 @@ pub enum VolumeReplaceResult { // this call performed the replacement Done, - // the "existing" volume was deleted - ExistingVolumeDeleted, + // the "existing" volume was soft deleted + ExistingVolumeSoftDeleted, + + // the "existing" volume was hard deleted + ExistingVolumeHardDeleted, } impl DataStore { @@ -2747,14 +2821,14 @@ impl DataStore { // perform the region replacement now, and this will short-circuit // the rest of the process. 
- return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + return Ok(VolumeReplaceResult::ExistingVolumeHardDeleted); }; if old_volume.time_deleted.is_some() { // Existing volume was soft-deleted, so return here for the same // reason: the region replacement process should be short-circuited // now. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + return Ok(VolumeReplaceResult::ExistingVolumeSoftDeleted); } let old_vcr: VolumeConstructionRequest = @@ -3001,14 +3075,14 @@ impl DataStore { // perform the region replacement now, and this will short-circuit // the rest of the process. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + return Ok(VolumeReplaceResult::ExistingVolumeHardDeleted); }; if old_volume.time_deleted.is_some() { // Existing volume was soft-deleted, so return here for the same // reason: the region replacement process should be short-circuited // now. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + return Ok(VolumeReplaceResult::ExistingVolumeSoftDeleted); } let old_vcr: VolumeConstructionRequest = @@ -3682,7 +3756,15 @@ impl DataStore { let mut paginator = Paginator::new(SQL_BATCH_SIZE); let conn = self.pool_connection_authorized(opctx).await?; - let needle = address.to_string(); + let needle = match address { + SocketAddr::V4(_) => { + return Err(Error::internal_error(&format!( + "find_volumes_referencing_socket_addr not ipv6: {address}" + ))); + } + + SocketAddr::V6(addr) => addr, + }; while let Some(p) = paginator.next() { use db::schema::volume::dsl; @@ -3699,7 +3781,23 @@ impl DataStore { paginator = p.found_batch(&haystack, &|r| r.id()); for volume in haystack { - if volume.data().contains(&needle) { + let vcr: VolumeConstructionRequest = + match serde_json::from_str(&volume.data()) { + Ok(vcr) => vcr, + Err(e) => { + return Err(Error::internal_error(&format!( + "cannot deserialize volume data for {}: {e}", + volume.id(), + ))); + } + }; + + let rw_reference = region_in_vcr(&vcr, &needle) + .map_err(|e| 
Error::internal_error(&e.to_string()))?; + let ro_reference = read_only_target_in_vcr(&vcr, &needle) + .map_err(|e| Error::internal_error(&e.to_string()))?; + + if rw_reference || ro_reference { volumes.push(volume); } } diff --git a/nexus/db-queries/src/db/datastore/volume_repair.rs b/nexus/db-queries/src/db/datastore/volume_repair.rs index 7ea88c8542..598d9d77a2 100644 --- a/nexus/db-queries/src/db/datastore/volume_repair.rs +++ b/nexus/db-queries/src/db/datastore/volume_repair.rs @@ -11,6 +11,8 @@ use crate::db::datastore::RunnableQuery; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::model::VolumeRepair; +use crate::db::DbConnection; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use diesel::result::DatabaseErrorKind; @@ -19,14 +21,75 @@ use omicron_common::api::external::Error; use uuid::Uuid; impl DataStore { - pub(super) fn volume_repair_insert_query( + /// Insert a volume repair record, taking a "lock" on the volume pointed to + /// by volume id with some repair id. + /// + /// If there exists a record that has a matching volume id and repair id, + /// return Ok(()). + /// + /// If there is no volume that matches the given volume id, return an error: + /// it should not be possible to lock a volume that does not exist! Note + /// that it is possible to lock a soft-deleted volume. + /// + /// If there is already an existing record that has a matching volume id but + /// a different repair id, then this function returns an Error::conflict. 
+ pub(super) async fn volume_repair_insert_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, volume_id: Uuid, repair_id: Uuid, - ) -> impl RunnableQuery { + ) -> Result<(), diesel::result::Error> { use db::schema::volume_repair::dsl; - diesel::insert_into(dsl::volume_repair) + // If a lock that matches the arguments exists already, return Ok + // + // Note: if rerunning this function (for example if a saga node was + // rerun), the volume could have existed when this lock was inserted the + // first time, but have been deleted now. + let maybe_lock = dsl::volume_repair + .filter(dsl::repair_id.eq(repair_id)) + .filter(dsl::volume_id.eq(volume_id)) + .first_async::(conn) + .await + .optional()?; + + if maybe_lock.is_some() { + return Ok(()); + } + + // Do not allow a volume repair record to be created if the volume does + // not exist, or was hard-deleted! + let maybe_volume = Self::volume_get_impl(conn, volume_id).await?; + + if maybe_volume.is_none() { + return Err(err.bail(Error::invalid_request(format!( + "cannot create record: volume {volume_id} does not exist" + )))); + } + + // Do not check for soft-deletion here: We may want to request locks for + // soft-deleted volumes. 
+ + match diesel::insert_into(dsl::volume_repair) .values(VolumeRepair { volume_id, repair_id }) + .execute_async(conn) + .await + { + Ok(_) => Ok(()), + + Err(e) => match e { + DieselError::DatabaseError( + DatabaseErrorKind::UniqueViolation, + ref error_information, + ) if error_information.constraint_name() + == Some("volume_repair_pkey") => + { + Err(err.bail(Error::conflict("volume repair lock"))) + } + + _ => Err(e), + }, + } } pub async fn volume_repair_lock( @@ -36,21 +99,25 @@ impl DataStore { repair_id: Uuid, ) -> Result<(), Error> { let conn = self.pool_connection_authorized(opctx).await?; - Self::volume_repair_insert_query(volume_id, repair_id) - .execute_async(&*conn) + let err = OptionalError::new(); + + self.transaction_retry_wrapper("volume_repair_lock") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + Self::volume_repair_insert_in_txn( + &conn, err, volume_id, repair_id, + ) + .await + } + }) .await - .map(|_| ()) - .map_err(|e| match e { - DieselError::DatabaseError( - DatabaseErrorKind::UniqueViolation, - ref error_information, - ) if error_information.constraint_name() - == Some("volume_repair_pkey") => - { - Error::conflict("volume repair lock") + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) } - - _ => public_error_from_diesel(e, ErrorHandler::Server), }) } @@ -102,6 +169,7 @@ mod test { use crate::db::pub_test_utils::TestDatabase; use omicron_test_utils::dev; + use sled_agent_client::types::VolumeConstructionRequest; #[tokio::test] async fn volume_lock_conflict_error_returned() { @@ -113,6 +181,20 @@ mod test { let lock_2 = Uuid::new_v4(); let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + 
datastore.volume_repair_lock(&opctx, volume_id, lock_1).await.unwrap(); let err = datastore @@ -125,4 +207,55 @@ mod test { db.terminate().await; logctx.cleanup_successful(); } + + /// Assert that you can't take a volume repair lock if the volume does not + /// exist yet! + #[tokio::test] + async fn volume_lock_should_fail_without_volume() { + let logctx = + dev::test_setup_log("volume_lock_should_fail_without_volume"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let lock_1 = Uuid::new_v4(); + let volume_id = Uuid::new_v4(); + + datastore + .volume_repair_lock(&opctx, volume_id, lock_1) + .await + .unwrap_err(); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn volume_lock_relock_allowed() { + let logctx = dev::test_setup_log("volume_lock_relock_allowed"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let lock_id = Uuid::new_v4(); + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + datastore.volume_repair_lock(&opctx, volume_id, lock_id).await.unwrap(); + datastore.volume_repair_lock(&opctx, volume_id, lock_id).await.unwrap(); + + db.terminate().await; + logctx.cleanup_successful(); + } } diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index ad39777054..f107f2af71 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -825,7 +825,7 @@ impl BackgroundTasksInitializer { done", period: config.region_snapshot_replacement_finish.period_secs, task_impl: Box::new(RegionSnapshotReplacementFinishDetector::new( - datastore, + datastore, sagas, )), opctx: 
opctx.child(BTreeMap::new()), watchers: vec![], diff --git a/nexus/src/app/background/tasks/region_replacement.rs b/nexus/src/app/background/tasks/region_replacement.rs index 71b49c5a6f..caa262bc8c 100644 --- a/nexus/src/app/background/tasks/region_replacement.rs +++ b/nexus/src/app/background/tasks/region_replacement.rs @@ -23,6 +23,7 @@ use nexus_db_model::RegionReplacement; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::internal_api::background::RegionReplacementStatus; +use omicron_common::api::external::Error; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::TypedUuid; use serde_json::json; @@ -42,7 +43,7 @@ impl RegionReplacementDetector { &self, serialized_authn: authn::saga::Serialized, request: RegionReplacement, - ) -> Result<(), omicron_common::api::external::Error> { + ) -> Result<(), Error> { let params = sagas::region_replacement_start::Params { serialized_authn, request, @@ -135,15 +136,31 @@ impl BackgroundTask for RegionReplacementDetector { } Err(e) => { - let s = format!( - "error adding region replacement request for \ - region {} volume id {}: {e}", - region.id(), - region.volume_id(), - ); - error!(&log, "{s}"); + match e { + Error::Conflict { message } + if message.external_message() + == "volume repair lock" => + { + // This is not a fatal error! If there are + // competing region replacement and region + // snapshot replacements, then they are both + // attempting to lock volumes. 
+ } + + _ => { + let s = format!( + "error adding region replacement \ + request for region {} volume id {}: \ + {e}", + region.id(), + region.volume_id(), + ); + error!(&log, "{s}"); + + status.errors.push(s); + } + } - status.errors.push(s); continue; } } @@ -174,7 +191,9 @@ impl BackgroundTask for RegionReplacementDetector { // If the replacement request is in the `requested` state and // the request's volume was soft-deleted or hard-deleted, avoid // sending the start request and instead transition the request - // to completed + // to completed. Note the saga will do the right thing if the + // volume is deleted, but this avoids the overhead of starting + // it. let volume_deleted = match self .datastore @@ -315,6 +334,21 @@ mod test { // Add a region replacement request for a fake region let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = RegionReplacement::new(Uuid::new_v4(), volume_id); let request_id = request.id; diff --git a/nexus/src/app/background/tasks/region_replacement_driver.rs b/nexus/src/app/background/tasks/region_replacement_driver.rs index e7fe0d6338..6cc28f9dfd 100644 --- a/nexus/src/app/background/tasks/region_replacement_driver.rs +++ b/nexus/src/app/background/tasks/region_replacement_driver.rs @@ -258,6 +258,7 @@ mod test { use omicron_uuid_kinds::UpstairsKind; use omicron_uuid_kinds::UpstairsRepairKind; use omicron_uuid_kinds::UpstairsSessionKind; + use sled_agent_client::types::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = @@ -288,6 +289,20 @@ mod test { let new_region_id = Uuid::new_v4(); let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + 
id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = { let mut request = RegionReplacement::new(region_id, volume_id); request.replacement_state = RegionReplacementState::Running; @@ -382,6 +397,20 @@ mod test { .unwrap(); } + datastore + .volume_create(nexus_db_model::Volume::new( + old_region.volume_id(), + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: old_region.volume_id(), + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + // Add a region replacement request for that region, and change it to // state ReplacementDone. Set the new_region_id to the region created // above. @@ -481,6 +510,20 @@ mod test { .unwrap(); } + datastore + .volume_create(nexus_db_model::Volume::new( + old_region.volume_id(), + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: old_region.volume_id(), + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + // Add a region replacement request for that region, and change it to // state Running. Set the new_region_id to the region created above. let request = { @@ -630,6 +673,20 @@ mod test { .unwrap(); } + datastore + .volume_create(nexus_db_model::Volume::new( + old_region.volume_id(), + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: old_region.volume_id(), + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + // Add a region replacement request for that region, and change it to // state Running. Set the new_region_id to the region created above. 
let request = { diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs index 0eebd37fb7..61a84c579d 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs @@ -8,9 +8,15 @@ //! Once all related region snapshot replacement steps are done, the region //! snapshot replacement can be completed. +use crate::app::authn; use crate::app::background::BackgroundTask; +use crate::app::saga::StartSaga; +use crate::app::sagas; +use crate::app::sagas::region_snapshot_replacement_finish::*; +use crate::app::sagas::NexusSaga; use futures::future::BoxFuture; use futures::FutureExt; +use nexus_db_model::RegionSnapshotReplacement; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::internal_api::background::RegionSnapshotReplacementFinishStatus; @@ -19,11 +25,31 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementFinishDetector { datastore: Arc, + sagas: Arc, } impl RegionSnapshotReplacementFinishDetector { - pub fn new(datastore: Arc) -> Self { - RegionSnapshotReplacementFinishDetector { datastore } + pub fn new(datastore: Arc, sagas: Arc) -> Self { + RegionSnapshotReplacementFinishDetector { datastore, sagas } + } + + async fn send_finish_request( + &self, + opctx: &OpContext, + request: RegionSnapshotReplacement, + ) -> Result<(), omicron_common::api::external::Error> { + let params = sagas::region_snapshot_replacement_finish::Params { + serialized_authn: authn::saga::Serialized::for_opctx(opctx), + request, + }; + + let saga_dag = SagaRegionSnapshotReplacementFinish::prepare(¶ms)?; + + // We only care that the saga was started, and don't wish to wait for it + // to complete, so use `StartSaga::saga_start`, rather than `saga_run`. 
+ self.sagas.saga_start(saga_dag).await?; + + Ok(()) } async fn transition_requests_to_done( @@ -120,21 +146,23 @@ impl RegionSnapshotReplacementFinishDetector { } }; - // Transition region snapshot replacement to Complete - match self - .datastore - .set_region_snapshot_replacement_complete(opctx, request.id) - .await - { + let request_id = request.id; + + match self.send_finish_request(opctx, request).await { Ok(()) => { - let s = format!("set request {} to done", request.id); + let s = format!( + "region snapshot replacement finish invoked ok for \ + {request_id}" + ); + info!(&log, "{s}"); - status.records_set_to_done.push(s); + status.finish_invoked_ok.push(s); } Err(e) => { let s = format!( - "marking snapshot replacement as done failed: {e}" + "invoking region snapshot replacement finish for \ + {request_id} failed: {e}", ); error!(&log, "{s}"); status.errors.push(s); @@ -170,6 +198,7 @@ mod test { use nexus_db_queries::db::datastore::region_snapshot_replacement; use nexus_test_utils_macros::nexus_test; use omicron_uuid_kinds::DatasetUuid; + use sled_agent_client::types::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = @@ -186,8 +215,10 @@ mod test { datastore.clone(), ); - let mut task = - RegionSnapshotReplacementFinishDetector::new(datastore.clone()); + let mut task = RegionSnapshotReplacementFinishDetector::new( + datastore.clone(), + nexus.sagas.clone(), + ); // Noop test let result: RegionSnapshotReplacementFinishStatus = @@ -208,11 +239,25 @@ mod test { let request_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -232,6 +277,7 @@ mod test { .unwrap(); let new_region_id = Uuid::new_v4(); + let new_region_volume_id = Uuid::new_v4(); let old_snapshot_volume_id = Uuid::new_v4(); datastore @@ -240,6 +286,7 @@ mod test { request_id, operating_saga_id, new_region_id, + new_region_volume_id, old_snapshot_volume_id, ) .await @@ -267,14 +314,44 @@ mod test { let operating_saga_id = Uuid::new_v4(); + let step_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step_1 = - RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step_1.replacement_state = RegionSnapshotReplacementStepState::Complete; step_1.operating_saga_id = Some(operating_saga_id); let step_1_id = step_1.id; + let step_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step_2 = - RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step_2.replacement_state = RegionSnapshotReplacementStepState::Complete; step_2.operating_saga_id = Some(operating_saga_id); let step_2_id = step_2.id; @@ -335,8 +412,9 @@ mod test { assert_eq!( result, RegionSnapshotReplacementFinishStatus { - records_set_to_done: vec![format!( - "set request {request_id} to done" + finish_invoked_ok: vec![format!( + "region snapshot replacement finish invoked ok for \ + {request_id}" )], errors: vec![], }, diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs index db30e1d074..57bbf3741c 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs @@ -155,6 +155,7 @@ mod test { use nexus_db_model::RegionSnapshotReplacementState; use nexus_test_utils_macros::nexus_test; use omicron_uuid_kinds::DatasetUuid; + use sled_agent_client::types::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = @@ -199,11 +200,25 @@ mod test { let request_1_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -219,11 +234,25 @@ mod test { let request_2_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs index 04d86a1268..f2b82a3943 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs @@ -23,6 +23,7 @@ use nexus_db_model::RegionSnapshotReplacement; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus; +use omicron_common::api::external::Error; use serde_json::json; use std::sync::Arc; @@ -40,7 +41,7 @@ impl RegionSnapshotReplacementDetector { &self, serialized_authn: authn::saga::Serialized, request: RegionSnapshotReplacement, - ) -> Result<(), omicron_common::api::external::Error> { + ) -> Result<(), Error> { let params = sagas::region_snapshot_replacement_start::Params { serialized_authn, request, @@ -138,17 +139,33 @@ impl RegionSnapshotReplacementDetector { } Err(e) => { - let s = - format!("error creating replacement request: {e}"); - - error!( - 
&log, - "{s}"; - "snapshot_id" => %region_snapshot.snapshot_id, - "region_id" => %region_snapshot.region_id, - "dataset_id" => %region_snapshot.dataset_id, - ); - status.errors.push(s); + match e { + Error::Conflict { message } + if message.external_message() + == "volume repair lock" => + { + // This is not a fatal error! If there are + // competing region replacement and region + // snapshot replacements, then they are both + // attempting to lock volumes. + } + + _ => { + let s = format!( + "error creating replacement request: {e}" + ); + + error!( + &log, + "{s}"; + "snapshot_id" => %region_snapshot.snapshot_id, + "region_id" => %region_snapshot.region_id, + "dataset_id" => %region_snapshot.dataset_id, + ); + + status.errors.push(s); + } + } } } } @@ -185,6 +202,67 @@ impl RegionSnapshotReplacementDetector { for request in requests { let request_id = request.id; + // If the region snapshot is gone, then there are no more references + // in any volume, and the whole region snapshot replacement can be + // fast-tracked to Complete. 
+ + let maybe_region_snapshot = match self + .datastore + .region_snapshot_get( + request.old_dataset_id.into(), + request.old_region_id, + request.old_snapshot_id, + ) + .await + { + Ok(maybe_region_snapshot) => maybe_region_snapshot, + + Err(e) => { + let s = format!("query for region snapshot failed: {e}"); + + error!( + &log, + "{s}"; + "request.snapshot_id" => %request.old_snapshot_id, + "request.region_id" => %request.old_region_id, + "request.dataset_id" => %request.old_dataset_id, + ); + status.errors.push(s); + return; + } + }; + + if maybe_region_snapshot.is_none() { + match self + .datastore + .set_region_snapshot_replacement_complete_from_requested( + &opctx, request.id, + ) + .await + { + Ok(()) => { + let s = format!( + "region snapshot replacement {request_id} \ + completed ok" + ); + info!(&log, "{s}"); + status.requests_completed_ok.push(s); + } + + Err(e) => { + let s = format!( + "query to set region snapshot request state \ + to complete failed: {e}" + ); + + error!(&log, "{s}"; "request.id" => %request_id); + status.errors.push(s); + } + } + + continue; + } + let result = self .send_start_request( authn::saga::Serialized::for_opctx(opctx), @@ -269,6 +347,7 @@ mod test { use nexus_db_model::Snapshot; use nexus_db_model::SnapshotIdentity; use nexus_db_model::SnapshotState; + use nexus_db_model::VolumeResourceUsage; use nexus_db_queries::authz; use nexus_db_queries::db::lookup::LookupPath; use nexus_test_utils::resource_helpers::create_project; @@ -276,6 +355,8 @@ mod test { use omicron_common::api::external; use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; + use sled_agent_client::types::CrucibleOpts; + use sled_agent_client::types::VolumeConstructionRequest; use std::collections::BTreeMap; use uuid::Uuid; @@ -309,19 +390,43 @@ mod test { // Add a region snapshot replacement request for a fake region snapshot - let request = RegionSnapshotReplacement::new( - DatasetUuid::new_v4(), // dataset id - Uuid::new_v4(), // region 
id - Uuid::new_v4(), // snapshot id + let dataset_id = DatasetUuid::new_v4(); + let region_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + + let region_snapshot = RegionSnapshot::new( + dataset_id, + region_id, + snapshot_id, + "[::]:12345".to_string(), ); + datastore.region_snapshot_create(region_snapshot).await.unwrap(); + + let request = + RegionSnapshotReplacement::new(dataset_id, region_id, snapshot_id); + let request_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -339,6 +444,7 @@ mod test { "region snapshot replacement start invoked ok for \ {request_id}" )], + requests_completed_ok: vec![], errors: vec![], }, ); @@ -407,6 +513,22 @@ mod test { .await .unwrap(); + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .project_ensure_snapshot( &opctx, @@ -426,7 +548,7 @@ mod test { project_id, disk_id: Uuid::new_v4(), - volume_id: Uuid::new_v4(), + volume_id, destination_volume_id: Uuid::new_v4(), gen: Generation::new(), @@ -508,4 +630,194 @@ mod test { dataset_to_zpool.get(&first_zpool.id.to_string()).unwrap(); assert_eq!(&request.old_dataset_id.to_string(), dataset_id); } + + #[nexus_test(server = crate::Server)] + async fn test_delete_region_snapshot_replacement_volume_causes_complete( + cptestctx: 
&ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let starter = Arc::new(NoopStartSaga::new()); + let mut task = RegionSnapshotReplacementDetector::new( + datastore.clone(), + starter.clone(), + ); + + // Noop test + let result: RegionSnapshotReplacementStartStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + assert_eq!(result, RegionSnapshotReplacementStartStatus::default()); + assert_eq!(starter.count_reset(), 0); + + // The volume reference counting machinery needs a fake dataset to exist + // (region snapshots are joined with the dataset table when creating the + // CrucibleResources object) + + let disk_test = DiskTest::new(cptestctx).await; + + let dataset_id = disk_test.zpools().next().unwrap().datasets[0].id; + + // Add a region snapshot replacement request for a fake region snapshot + + let region_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + + let region_snapshot = RegionSnapshot::new( + dataset_id, + region_id, + snapshot_id, + "[::1]:12345".to_string(), + ); + + datastore + .region_snapshot_create(region_snapshot.clone()) + .await + .unwrap(); + + let request = + RegionSnapshotReplacement::new(dataset_id, region_id, snapshot_id); + + let request_id = request.id; + + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + // the region snapshot + String::from("[::1]:12345"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + + // Assert usage + + let records = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id.into(), + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert!(!records.is_empty()); + assert_eq!(records[0].volume_id, volume_id); + + datastore + .insert_region_snapshot_replacement_request_with_volume_id( + &opctx, request, volume_id, + ) + .await + .unwrap(); + + // Before the task starts, soft-delete the volume, and delete the + // region snapshot (like the volume delete saga would do). 
+ + let crucible_resources = + datastore.soft_delete_volume(volume_id).await.unwrap(); + + // Assert no more usage + + let records = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id.into(), + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert!(records.is_empty()); + + // The region snapshot should have been returned for deletion + + let datasets_and_snapshots = + datastore.snapshots_to_delete(&crucible_resources).await.unwrap(); + + assert!(!datasets_and_snapshots.is_empty()); + + let region_snapshot_to_delete = &datasets_and_snapshots[0].1; + + assert_eq!( + region_snapshot_to_delete.dataset_id, + region_snapshot.dataset_id, + ); + assert_eq!( + region_snapshot_to_delete.region_id, + region_snapshot.region_id, + ); + assert_eq!( + region_snapshot_to_delete.snapshot_id, + region_snapshot.snapshot_id, + ); + + // So delete it! + + datastore + .region_snapshot_remove( + region_snapshot_to_delete.dataset_id.into(), + region_snapshot_to_delete.region_id, + region_snapshot_to_delete.snapshot_id, + ) + .await + .unwrap(); + + // Activate the task - it should pick the request up but not attempt to + // run the start saga + + let result: RegionSnapshotReplacementStartStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + + assert_eq!( + result, + RegionSnapshotReplacementStartStatus { + requests_created_ok: vec![], + start_invoked_ok: vec![], + requests_completed_ok: vec![format!( + "region snapshot replacement {request_id} completed ok" + )], + errors: vec![], + }, + ); + + // Assert start saga not invoked + assert_eq!(starter.count_reset(), 0); + } } diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs index ac259ecba8..f481126312 100644 --- 
a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs @@ -32,10 +32,11 @@ use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_model::RegionSnapshotReplacementStep; use nexus_db_queries::context::OpContext; -use nexus_db_queries::db::datastore::region_snapshot_replacement; +use nexus_db_queries::db::datastore::region_snapshot_replacement::*; use nexus_db_queries::db::DataStore; use nexus_types::identity::Asset; use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus; +use omicron_common::api::external::Error; use serde_json::json; use std::sync::Arc; @@ -53,7 +54,7 @@ impl RegionSnapshotReplacementFindAffected { &self, opctx: &OpContext, request: RegionSnapshotReplacementStep, - ) -> Result<(), omicron_common::api::external::Error> { + ) -> Result<(), Error> { let params = sagas::region_snapshot_replacement_step::Params { serialized_authn: authn::saga::Serialized::for_opctx(opctx), request, @@ -70,7 +71,7 @@ impl RegionSnapshotReplacementFindAffected { &self, opctx: &OpContext, request: RegionSnapshotReplacementStep, - ) -> Result<(), omicron_common::api::external::Error> { + ) -> Result<(), Error> { let Some(old_snapshot_volume_id) = request.old_snapshot_volume_id else { // This state is illegal! @@ -79,9 +80,7 @@ impl RegionSnapshotReplacementFindAffected { request.id, ); - return Err(omicron_common::api::external::Error::internal_error( - &s, - )); + return Err(Error::internal_error(&s)); }; let params = @@ -315,6 +314,21 @@ impl RegionSnapshotReplacementFindAffected { // functions execute), an indefinite amount of work would be // created, continually "moving" the snapshot_addr from // temporary volume to temporary volume. + // + // If the volume was soft deleted, then skip making a step for + // it. 
+ + if volume.time_deleted.is_some() { + info!( + log, + "volume was soft-deleted, skipping creating a step for \ + it"; + "request id" => ?request.id, + "volume id" => ?volume.id(), + ); + + continue; + } match self .datastore @@ -326,7 +340,7 @@ impl RegionSnapshotReplacementFindAffected { .await { Ok(insertion_result) => match insertion_result { - region_snapshot_replacement::InsertStepResult::Inserted { step_id } => { + InsertStepResult::Inserted { step_id } => { let s = format!("created {step_id}"); info!( log, @@ -337,7 +351,7 @@ impl RegionSnapshotReplacementFindAffected { status.step_records_created_ok.push(s); } - region_snapshot_replacement::InsertStepResult::AlreadyHandled { .. } => { + InsertStepResult::AlreadyHandled { .. } => { info!( log, "step already exists for volume id"; @@ -345,17 +359,32 @@ impl RegionSnapshotReplacementFindAffected { "volume id" => ?volume.id(), ); } - } + }, Err(e) => { let s = format!("error creating step request: {e}"); - error!( + warn!( log, "{s}"; "request id" => ?request.id, "volume id" => ?volume.id(), ); - status.errors.push(s); + + match e { + Error::Conflict { message } + if message.external_message() + == "volume repair lock" => + { + // This is not a fatal error! If there are + // competing region replacement and region + // snapshot replacements, then they are both + // attempting to lock volumes. + } + + _ => { + status.errors.push(s); + } + } } } } @@ -392,13 +421,81 @@ impl RegionSnapshotReplacementFindAffected { }; for request in step_requests { - let request_id = request.id; + let request_step_id = request.id; + + // Check if the volume was deleted _after_ the replacement step was + // created. Avoid launching the region snapshot replacement step + // saga if it was deleted: the saga will do the right thing if it is + // deleted, but this avoids the overhead of starting it. 
+ + let volume_deleted = + match self.datastore.volume_deleted(request.volume_id).await { + Ok(volume_deleted) => volume_deleted, + + Err(e) => { + let s = format!( + "error checking if volume id {} was \ + deleted: {e}", + request.volume_id, + ); + error!(&log, "{s}"); + + status.errors.push(s); + continue; + } + }; + + if volume_deleted { + // Volume was soft or hard deleted, so proceed with clean up, + // which if this is in state Requested there won't be any + // additional associated state, so transition the record to + // Completed. + + info!( + &log, + "request {} step {} volume {} was soft or hard deleted!", + request.request_id, + request_step_id, + request.volume_id, + ); + + let result = self + .datastore + .set_region_snapshot_replacement_step_volume_deleted_from_requested( + opctx, request, + ) + .await; + + match result { + Ok(()) => { + let s = format!( + "request step {request_step_id} transitioned from \ + requested to volume_deleted" + ); + + info!(&log, "{s}"); + status.step_set_volume_deleted_ok.push(s); + } + + Err(e) => { + let s = format!( + "error transitioning {request_step_id} from \ + requested to complete: {e}" + ); + + error!(&log, "{s}"); + status.errors.push(s); + } + } + + continue; + } match self.send_start_request(opctx, request.clone()).await { Ok(()) => { let s = format!( "region snapshot replacement step saga invoked ok for \ - {request_id}" + {request_step_id}" ); info!( @@ -413,7 +510,7 @@ impl RegionSnapshotReplacementFindAffected { Err(e) => { let s = format!( "invoking region snapshot replacement step saga for \ - {request_id} failed: {e}" + {request_step_id} failed: {e}" ); error!( @@ -575,11 +672,25 @@ mod test { let request_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -599,6 +710,7 @@ mod test { .unwrap(); let new_region_id = Uuid::new_v4(); + let new_region_volume_id = Uuid::new_v4(); let old_snapshot_volume_id = Uuid::new_v4(); datastore @@ -607,6 +719,7 @@ mod test { request_id, operating_saga_id, new_region_id, + new_region_volume_id, old_snapshot_volume_id, ) .await @@ -731,11 +844,27 @@ mod test { // Now, add some Complete records and make sure the garbage collection // saga is invoked. + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let result = datastore .insert_region_snapshot_replacement_step(&opctx, { let mut record = RegionSnapshotReplacementStep::new( Uuid::new_v4(), - Uuid::new_v4(), + volume_id, ); record.replacement_state = @@ -747,16 +876,29 @@ mod test { .await .unwrap(); - assert!(matches!( - result, - region_snapshot_replacement::InsertStepResult::Inserted { .. } - )); + assert!(matches!(result, InsertStepResult::Inserted { .. 
})); + + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); let result = datastore .insert_region_snapshot_replacement_step(&opctx, { let mut record = RegionSnapshotReplacementStep::new( Uuid::new_v4(), - Uuid::new_v4(), + volume_id, ); record.replacement_state = @@ -768,10 +910,7 @@ mod test { .await .unwrap(); - assert!(matches!( - result, - region_snapshot_replacement::InsertStepResult::Inserted { .. } - )); + assert!(matches!(result, InsertStepResult::Inserted { .. })); // Activate the task - it should pick the complete steps up and try to // run the region snapshot replacement step garbage collect saga diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index 405a972976..d8ba6abbdd 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -39,6 +39,7 @@ pub mod project_create; pub mod region_replacement_drive; pub mod region_replacement_finish; pub mod region_replacement_start; +pub mod region_snapshot_replacement_finish; pub mod region_snapshot_replacement_garbage_collect; pub mod region_snapshot_replacement_start; pub mod region_snapshot_replacement_step; @@ -173,7 +174,8 @@ fn make_action_registry() -> ActionRegistry { region_snapshot_replacement_start::SagaRegionSnapshotReplacementStart, region_snapshot_replacement_garbage_collect::SagaRegionSnapshotReplacementGarbageCollect, region_snapshot_replacement_step::SagaRegionSnapshotReplacementStep, - region_snapshot_replacement_step_garbage_collect::SagaRegionSnapshotReplacementStepGarbageCollect + region_snapshot_replacement_step_garbage_collect::SagaRegionSnapshotReplacementStepGarbageCollect, + region_snapshot_replacement_finish::SagaRegionSnapshotReplacementFinish ]; #[cfg(test)] diff --git 
a/nexus/src/app/sagas/region_replacement_finish.rs b/nexus/src/app/sagas/region_replacement_finish.rs index c7efa2f03f..2212e6fdf3 100644 --- a/nexus/src/app/sagas/region_replacement_finish.rs +++ b/nexus/src/app/sagas/region_replacement_finish.rs @@ -311,6 +311,20 @@ pub(crate) mod test { operating_saga_id: None, }; + datastore + .volume_create(nexus_db_model::Volume::new( + new_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: new_volume_id, + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_replacement_request(&opctx, request.clone()) .await diff --git a/nexus/src/app/sagas/region_replacement_start.rs b/nexus/src/app/sagas/region_replacement_start.rs index ce063dc5be..aa9e83c037 100644 --- a/nexus/src/app/sagas/region_replacement_start.rs +++ b/nexus/src/app/sagas/region_replacement_start.rs @@ -491,7 +491,7 @@ async fn srrs_get_old_region_address( async fn srrs_replace_region_in_volume( sagactx: NexusActionContext, -) -> Result<(), ActionError> { +) -> Result { let log = sagactx.user_data().log(); let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; @@ -555,8 +555,6 @@ async fn srrs_replace_region_in_volume( .await .map_err(ActionError::action_failed)?; - debug!(log, "replacement returned {:?}", volume_replace_region_result); - match volume_replace_region_result { VolumeReplaceResult::AlreadyHappened | VolumeReplaceResult::Done => { // The replacement was done either by this run of this saga node, or @@ -565,10 +563,11 @@ async fn srrs_replace_region_in_volume( // with the rest of the saga (to properly clean up allocated // resources). 
- Ok(()) + Ok(volume_replace_region_result) } - VolumeReplaceResult::ExistingVolumeDeleted => { + VolumeReplaceResult::ExistingVolumeSoftDeleted + | VolumeReplaceResult::ExistingVolumeHardDeleted => { // Unwind the saga here to clean up the resources allocated during // this saga. The associated background task will transition this // request's state to Completed. diff --git a/nexus/src/app/sagas/region_snapshot_replacement_finish.rs b/nexus/src/app/sagas/region_snapshot_replacement_finish.rs new file mode 100644 index 0000000000..d992f753d6 --- /dev/null +++ b/nexus/src/app/sagas/region_snapshot_replacement_finish.rs @@ -0,0 +1,211 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! After the change to store a "new region volume" in the region snapshot +//! replacement request, that volume requires garbage collection before the +//! region snapshot replacement transitions to Complete. It's this saga's +//! responsibility to ensure that cleanup. This saga handles the following +//! region snapshot replacement request state transitions: +//! +//! ```text +//! Running <-- +//! | +//! | | +//! v | +//! | +//! Completing -- +//! +//! | +//! v +//! +//! Complete +//! ``` +//! +//! The first thing this saga does is set itself as the "operating saga" for the +//! request, and change the state to "Completing". Then, it performs the volume +//! delete sub-saga for the new region volume. Finally, it updates the region +//! snapshot replacement request by clearing the operating saga id and changing +//! the state to "Complete". +//! +//! Any unwind will place the state back into Running. 
+ +use super::{ + ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, + ACTION_GENERATE_ID, +}; +use crate::app::sagas::declare_saga_actions; +use crate::app::sagas::volume_delete; +use crate::app::{authn, db}; +use serde::Deserialize; +use serde::Serialize; +use steno::ActionError; +use steno::Node; +use uuid::Uuid; + +// region snapshot replacement finish saga: input parameters + +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct Params { + pub serialized_authn: authn::saga::Serialized, + pub request: db::model::RegionSnapshotReplacement, +} + +// region snapshot replacement finish saga: actions + +declare_saga_actions! { + region_snapshot_replacement_finish; + SET_SAGA_ID -> "unused_1" { + + rsrfs_set_saga_id + - rsrfs_set_saga_id_undo + } + UPDATE_REQUEST_RECORD -> "unused_4" { + + rsrfs_update_request_record + } +} + +// region snapshot replacement finish saga: definition + +#[derive(Debug)] +pub(crate) struct SagaRegionSnapshotReplacementFinish; +impl NexusSaga for SagaRegionSnapshotReplacementFinish { + const NAME: &'static str = "region-snapshot-replacement-finish"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + region_snapshot_replacement_finish_register_actions(registry); + } + + fn make_saga_dag( + params: &Self::Params, + mut builder: steno::DagBuilder, + ) -> Result { + builder.append(Node::action( + "saga_id", + "GenerateSagaId", + ACTION_GENERATE_ID.as_ref(), + )); + + builder.append(set_saga_id_action()); + + if let Some(new_region_volume_id) = params.request.new_region_volume_id + { + let subsaga_params = volume_delete::Params { + serialized_authn: params.serialized_authn.clone(), + volume_id: new_region_volume_id, + }; + + let subsaga_dag = { + let subsaga_builder = steno::DagBuilder::new( + steno::SagaName::new(volume_delete::SagaVolumeDelete::NAME), + ); + volume_delete::SagaVolumeDelete::make_saga_dag( + &subsaga_params, + subsaga_builder, + )? 
+ }; + + builder.append(Node::constant( + "params_for_volume_delete_subsaga", + serde_json::to_value(&subsaga_params).map_err(|e| { + SagaInitError::SerializeError( + "params_for_volume_delete_subsaga".to_string(), + e, + ) + })?, + )); + + builder.append(Node::subsaga( + "volume_delete_subsaga_no_result", + subsaga_dag, + "params_for_volume_delete_subsaga", + )); + } + + builder.append(update_request_record_action()); + + Ok(builder.build()?) + } +} + +// region snapshot replacement finish saga: action implementations + +async fn rsrfs_set_saga_id( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + + // Change the request record here to an intermediate "completing" state to + // block out other sagas that will be triggered for the same request. + osagactx + .datastore() + .set_region_snapshot_replacement_completing( + &opctx, + params.request.id, + saga_id, + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +async fn rsrfs_set_saga_id_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + + osagactx + .datastore() + .undo_set_region_snapshot_replacement_completing( + &opctx, + params.request.id, + saga_id, + ) + .await?; + + Ok(()) +} + +async fn rsrfs_update_request_record( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let params = sagactx.saga_params::()?; + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = 
sagactx.lookup::("saga_id")?; + + // Update the replacement request record to 'Complete' and clear the + // operating saga id. There is no undo step for this, it should succeed + // idempotently. + datastore + .set_region_snapshot_replacement_complete( + &opctx, + params.request.id, + saga_id, + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} diff --git a/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs b/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs index 762182724b..675b2b0cb3 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs @@ -284,11 +284,27 @@ pub(crate) mod test { RegionSnapshotReplacementState::ReplacementDone; request.old_snapshot_volume_id = Some(old_snapshot_volume_id); + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( &opctx, request.clone(), - Uuid::new_v4(), + volume_id, ) .await .unwrap(); diff --git a/nexus/src/app/sagas/region_snapshot_replacement_start.rs b/nexus/src/app/sagas/region_snapshot_replacement_start.rs index 2092c3065a..bb5fd60209 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_start.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_start.rs @@ -139,6 +139,10 @@ declare_saga_actions! 
{ NEW_REGION_ENSURE -> "ensured_dataset_and_region" { + rsrss_new_region_ensure } + NEW_REGION_VOLUME_CREATE -> "new_region_volume" { + + rsrss_new_region_volume_create + - rsrss_new_region_volume_create_undo + } GET_OLD_SNAPSHOT_VOLUME_ID -> "old_snapshot_volume_id" { + rsrss_get_old_snapshot_volume_id } @@ -183,12 +187,19 @@ impl NexusSaga for SagaRegionSnapshotReplacementStart { ACTION_GENERATE_ID.as_ref(), )); + builder.append(Node::action( + "new_region_volume_id", + "GenerateNewRegionVolumeId", + ACTION_GENERATE_ID.as_ref(), + )); + builder.append(set_saga_id_action()); builder.append(get_alloc_region_params_action()); builder.append(alloc_new_region_action()); builder.append(find_new_region_action()); builder.append(new_region_ensure_undo_action()); builder.append(new_region_ensure_action()); + builder.append(new_region_volume_create_action()); builder.append(get_old_snapshot_volume_id_action()); builder.append(create_fake_volume_action()); builder.append(replace_snapshot_in_volume_action()); @@ -509,6 +520,94 @@ async fn rsrss_new_region_ensure_undo( Ok(()) } +async fn rsrss_new_region_volume_create( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + + let new_region_volume_id = + sagactx.lookup::("new_region_volume_id")?; + + let (new_dataset, ensured_region) = sagactx.lookup::<( + db::model::Dataset, + crucible_agent_client::types::Region, + )>( + "ensured_dataset_and_region", + )?; + + let Some(new_dataset_address) = new_dataset.address() else { + return Err(ActionError::action_failed(format!( + "dataset {} does not have an address!", + new_dataset.id(), + ))); + }; + + let new_region_address = SocketAddrV6::new( + *new_dataset_address.ip(), + ensured_region.port_number, + 0, + 0, + ); + + // Create a volume to inflate the reference count of the newly created + // read-only region. 
If this is not done it's possible that a user could + // delete the snapshot volume _after_ the new read-only region was swapped + // in, removing the last reference to it and causing garbage collection. + + let volume_construction_request = VolumeConstructionRequest::Volume { + id: new_region_volume_id, + block_size: 0, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 0, + blocks_per_extent: 0, + extent_count: 0, + gen: 0, + opts: CrucibleOpts { + id: new_region_volume_id, + target: vec![new_region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], + read_only_parent: None, + }; + + let volume_data = serde_json::to_string(&volume_construction_request) + .map_err(|e| { + ActionError::action_failed(Error::internal_error(&e.to_string())) + })?; + + let volume = db::model::Volume::new(new_region_volume_id, volume_data); + + osagactx + .datastore() + .volume_create(volume) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +async fn rsrss_new_region_volume_create_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + + // Delete the volume. + + let new_region_volume_id = + sagactx.lookup::("new_region_volume_id")?; + osagactx.datastore().volume_hard_delete(new_region_volume_id).await?; + + Ok(()) +} + async fn rsrss_get_old_snapshot_volume_id( sagactx: NexusActionContext, ) -> Result { @@ -695,7 +794,7 @@ async fn get_replace_params( async fn rsrss_replace_snapshot_in_volume( sagactx: NexusActionContext, -) -> Result<(), ActionError> { +) -> Result { let log = sagactx.user_data().log(); let osagactx = sagactx.user_data(); @@ -729,10 +828,11 @@ async fn rsrss_replace_snapshot_in_volume( // if the transaction occurred on the non-deleted volume so proceed // with the rest of the saga. 
- Ok(()) + Ok(volume_replace_snapshot_result) } - VolumeReplaceResult::ExistingVolumeDeleted => { + VolumeReplaceResult::ExistingVolumeSoftDeleted + | VolumeReplaceResult::ExistingVolumeHardDeleted => { // If the snapshot volume was deleted, we still want to proceed with // replacing the rest of the uses of the region snapshot. Note this // also covers the case where this saga node runs (performing the @@ -741,7 +841,7 @@ async fn rsrss_replace_snapshot_in_volume( // deleted. If this saga unwound here, that would violate the // property of idempotency. - Ok(()) + Ok(volume_replace_snapshot_result) } } } @@ -815,6 +915,9 @@ async fn rsrss_update_request_record( let old_region_volume_id = sagactx.lookup::("new_volume_id")?; + let new_region_volume_id = + sagactx.lookup::("new_region_volume_id")?; + // Now that the region has been ensured and the construction request has // been updated, update the replacement request record to 'ReplacementDone' // and clear the operating saga id. There is no undo step for this, it @@ -825,6 +928,7 @@ async fn rsrss_update_request_record( params.request.id, saga_id, new_region_id, + new_region_volume_id, old_region_volume_id, ) .await @@ -1026,8 +1130,10 @@ pub(crate) mod test { .await .unwrap(); - assert_eq!(volumes.len(), 1); - assert_eq!(volumes[0].id(), db_snapshot.volume_id); + assert!(volumes + .iter() + .map(|v| v.id()) + .any(|vid| vid == db_snapshot.volume_id)); } fn new_test_params( diff --git a/nexus/src/app/sagas/region_snapshot_replacement_step.rs b/nexus/src/app/sagas/region_snapshot_replacement_step.rs index 7b1d598861..a236fcf62c 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_step.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_step.rs @@ -375,7 +375,8 @@ async fn rsrss_replace_snapshot_in_volume( // with the saga. 
} - VolumeReplaceResult::ExistingVolumeDeleted => { + VolumeReplaceResult::ExistingVolumeSoftDeleted + | VolumeReplaceResult::ExistingVolumeHardDeleted => { // Proceed with the saga but skip the notification step. } } @@ -423,6 +424,20 @@ async fn rsrss_notify_upstairs( let params = sagactx.saga_params::()?; let log = sagactx.user_data().log(); + // If the associated volume was deleted, then skip this notification step as + // there is no Upstairs to talk to. Continue with the saga to transition the + // step request to Complete, and then perform the associated clean up. + + let volume_replace_snapshot_result = sagactx + .lookup::("volume_replace_snapshot_result")?; + if matches!( + volume_replace_snapshot_result, + VolumeReplaceResult::ExistingVolumeSoftDeleted + | VolumeReplaceResult::ExistingVolumeHardDeleted + ) { + return Ok(()); + } + // Make an effort to notify a Propolis if one was booted for this volume. // This is best effort: if there is a failure, this saga will unwind and be // triggered again for the same request. 
If there is no Propolis booted for diff --git a/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs b/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs index b83f917a70..15c6a39651 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs @@ -187,8 +187,24 @@ pub(crate) mod test { .await .unwrap(); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut request = - RegionSnapshotReplacementStep::new(Uuid::new_v4(), Uuid::new_v4()); + RegionSnapshotReplacementStep::new(Uuid::new_v4(), step_volume_id); request.replacement_state = RegionSnapshotReplacementStepState::Complete; request.old_snapshot_volume_id = Some(old_snapshot_volume_id); diff --git a/nexus/src/app/sagas/volume_delete.rs b/nexus/src/app/sagas/volume_delete.rs index a9be1f34ee..a8ded4e33c 100644 --- a/nexus/src/app/sagas/volume_delete.rs +++ b/nexus/src/app/sagas/volume_delete.rs @@ -27,12 +27,9 @@ use super::ActionRegistry; use super::NexusActionContext; use super::NexusSaga; use crate::app::sagas::declare_saga_actions; -use nexus_db_model::Dataset; -use nexus_db_model::Region; -use nexus_db_model::Volume; use nexus_db_queries::authn; use nexus_db_queries::db::datastore::CrucibleResources; -use nexus_types::identity::Asset; +use nexus_db_queries::db::datastore::FreedCrucibleResources; use serde::Deserialize; use serde::Serialize; use steno::ActionError; @@ -330,8 +327,6 @@ async fn svd_delete_crucible_snapshot_records( Ok(()) } -type FreedCrucibleRegions = Vec<(Dataset, Region, Option)>; - /// Deleting region snapshots in a previous saga node may have freed up regions /// 
that were deleted in the DB but couldn't be deleted by the Crucible Agent /// because a snapshot existed. Look for those here. These will be a different @@ -417,7 +412,7 @@ type FreedCrucibleRegions = Vec<(Dataset, Region, Option)>; /// another snapshot delete. async fn svd_find_freed_crucible_regions( sagactx: NexusActionContext, -) -> Result { +) -> Result { let osagactx = sagactx.user_data(); // Find regions freed up for deletion by a previous saga node deleting the @@ -432,11 +427,7 @@ async fn svd_find_freed_crucible_regions( }, )?; - // Don't serialize the whole Volume, as the data field contains key material! - Ok(freed_datasets_regions_and_volumes - .into_iter() - .map(|x| (x.0, x.1, x.2.map(|v: Volume| v.id()))) - .collect()) + Ok(freed_datasets_regions_and_volumes) } async fn svd_delete_freed_crucible_regions( @@ -448,9 +439,11 @@ async fn svd_delete_freed_crucible_regions( // Find regions freed up for deletion by a previous saga node deleting the // region snapshots. let freed_datasets_regions_and_volumes = - sagactx.lookup::("freed_crucible_regions")?; + sagactx.lookup::("freed_crucible_regions")?; - for (dataset, region, volume_id) in freed_datasets_regions_and_volumes { + for (dataset, region) in + &freed_datasets_regions_and_volumes.datasets_and_regions + { // Send DELETE calls to the corresponding Crucible agents osagactx .nexus() @@ -477,18 +470,17 @@ async fn svd_delete_freed_crucible_regions( e, )) })?; + } - // Remove volume DB record - if let Some(volume_id) = volume_id { - osagactx.datastore().volume_hard_delete(volume_id).await.map_err( - |e| { - ActionError::action_failed(format!( - "failed to volume_hard_delete {}: {:?}", - volume_id, e, - )) - }, - )?; - } + for volume_id in &freed_datasets_regions_and_volumes.volumes { + osagactx.datastore().volume_hard_delete(*volume_id).await.map_err( + |e| { + ActionError::action_failed(format!( + "failed to volume_hard_delete {}: {:?}", + volume_id, e, + )) + }, + )?; } Ok(()) diff --git 
a/nexus/test-utils/src/background.rs b/nexus/test-utils/src/background.rs index 32a2f24d9d..7c8857123c 100644 --- a/nexus/test-utils/src/background.rs +++ b/nexus/test-utils/src/background.rs @@ -305,7 +305,7 @@ pub async fn run_region_snapshot_replacement_finish( assert!(status.errors.is_empty()); - status.records_set_to_done.len() + status.finish_invoked_ok.len() } /// Run all replacement related background tasks until they aren't doing diff --git a/nexus/tests/integration_tests/crucible_replacements.rs b/nexus/tests/integration_tests/crucible_replacements.rs index 5f7e92c393..c301372d5f 100644 --- a/nexus/tests/integration_tests/crucible_replacements.rs +++ b/nexus/tests/integration_tests/crucible_replacements.rs @@ -5,9 +5,12 @@ //! Tests related to region and region snapshot replacement use dropshot::test_util::ClientTestContext; +use nexus_client::types::LastResult; use nexus_db_model::PhysicalDiskPolicy; use nexus_db_model::RegionReplacementState; +use nexus_db_model::RegionSnapshotReplacementState; use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::datastore::region_snapshot_replacement::*; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::DataStore; use nexus_test_utils::background::*; @@ -15,11 +18,22 @@ use nexus_test_utils::http_testing::AuthnMode; use nexus_test_utils::http_testing::NexusRequest; use nexus_test_utils::resource_helpers::create_default_ip_pool; use nexus_test_utils::resource_helpers::create_disk; +use nexus_test_utils::resource_helpers::create_disk_from_snapshot; use nexus_test_utils::resource_helpers::create_project; +use nexus_test_utils::resource_helpers::create_snapshot; +use nexus_test_utils::resource_helpers::object_create; use nexus_test_utils_macros::nexus_test; +use nexus_types::external_api::params; +use nexus_types::external_api::views; +use nexus_types::identity::Asset; +use nexus_types::internal_api::background::*; +use omicron_common::api::external; +use 
omicron_common::api::external::IdentityMetadataCreateParams; use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError}; use omicron_uuid_kinds::GenericUuid; use slog::Logger; +use std::collections::HashSet; +use std::net::SocketAddr; use std::sync::Arc; use uuid::Uuid; @@ -40,12 +54,37 @@ fn get_disk_url(disk_name: &str) -> String { format!("/v1/disks/{disk_name}?project={}", PROJECT_NAME) } +fn get_disks_url() -> String { + format!("/v1/disks?project={}", PROJECT_NAME) +} + +fn get_snapshot_url(snapshot_name: &str) -> String { + format!("/v1/snapshots/{snapshot_name}?project={}", PROJECT_NAME) +} + +fn get_snapshots_url() -> String { + format!("/v1/snapshots?project={}", PROJECT_NAME) +} + async fn create_project_and_pool(client: &ClientTestContext) -> Uuid { create_default_ip_pool(client).await; let project = create_project(client, PROJECT_NAME).await; project.identity.id } +async fn collection_list( + client: &ClientTestContext, + list_url: &str, +) -> Vec +where + T: Clone + serde::de::DeserializeOwned, +{ + NexusRequest::iter_collection_authn(client, list_url, "", None) + .await + .expect("failed to list") + .all_items +} + /// Assert that the first part of region replacement does not create a freed /// crucible region (that would be picked up by a volume delete saga) #[nexus_test] @@ -118,280 +157,355 @@ async fn test_region_replacement_does_not_create_freed_region( assert!(datastore.find_deleted_volume_regions().await.unwrap().is_empty()); } -struct RegionReplacementDeletedVolumeTest<'a> { - log: Logger, - datastore: Arc, - disk_test: DiskTest<'a>, - client: ClientTestContext, - internal_client: ClientTestContext, - replacement_request_id: Uuid, -} - -#[derive(Debug)] -struct ExpectedEndState(pub RegionReplacementState); - -#[derive(Debug)] -struct ExpectedIntermediateState(pub RegionReplacementState); - -impl<'a> RegionReplacementDeletedVolumeTest<'a> { - pub async fn new(cptestctx: &'a ControlPlaneTestContext) -> Self { - let nexus = 
&cptestctx.server.server_context().nexus; - - // Create four zpools, each with one dataset. This is required for - // region and region snapshot replacement to have somewhere to move the - // data. - let disk_test = DiskTestBuilder::new(&cptestctx) - .on_specific_sled(cptestctx.first_sled()) - .with_zpool_count(4) - .build() - .await; - - let client = &cptestctx.external_client; - let internal_client = &cptestctx.internal_client; - let datastore = nexus.datastore().clone(); - - let opctx = OpContext::for_tests( - cptestctx.logctx.log.new(o!()), - datastore.clone(), - ); - - // Create a disk - let _project_id = create_project_and_pool(client).await; - - let disk = create_disk(&client, PROJECT_NAME, "disk").await; - - // Manually create the region replacement request for the first - // allocated region of that disk - - let (.., db_disk) = LookupPath::new(&opctx, &datastore) - .disk_id(disk.identity.id) - .fetch() - .await - .unwrap(); +mod region_replacement { + use super::*; - assert_eq!(db_disk.id(), disk.identity.id); + #[derive(Debug)] + struct ExpectedEndState(pub RegionReplacementState); - let disk_allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - let (_, region) = &disk_allocated_regions[0]; + #[derive(Debug)] + struct ExpectedIntermediateState(pub RegionReplacementState); - let replacement_request_id = datastore - .create_region_replacement_request_for_region(&opctx, ®ion) - .await - .unwrap(); + #[derive(Debug)] + struct ExpectedStartState(pub RegionReplacementState); - // Assert the request is in state Requested + pub(super) struct DeletedVolumeTest<'a> { + log: Logger, + datastore: Arc, + disk_test: DiskTest<'a>, + client: ClientTestContext, + internal_client: ClientTestContext, + replacement_request_id: Uuid, + } - let region_replacement = datastore - .get_region_replacement_request_by_id( - &opctx, + impl<'a> DeletedVolumeTest<'a> { + pub async fn new(cptestctx: &'a ControlPlaneTestContext) -> Self { + let nexus = 
&cptestctx.server.server_context().nexus; + + // Create four zpools, each with one dataset. This is required for + // region and region snapshot replacement to have somewhere to move + // the data. + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let datastore = nexus.datastore().clone(); + + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + // Create a disk + let _project_id = create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + // Manually create the region replacement request for the first + // allocated region of that disk + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap(); + + assert_eq!(db_disk.id(), disk.identity.id); + + let disk_allocated_regions = datastore + .get_allocated_regions(db_disk.volume_id) + .await + .unwrap(); + let (_, region) = &disk_allocated_regions[0]; + + let replacement_request_id = datastore + .create_region_replacement_request_for_region(&opctx, ®ion) + .await + .unwrap(); + + // Assert the request is in state Requested + + let region_replacement = datastore + .get_region_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_replacement.replacement_state, + RegionReplacementState::Requested, + ); + + DeletedVolumeTest { + log: cptestctx.logctx.log.new(o!()), + datastore, + disk_test, + client: client.clone(), + internal_client: internal_client.clone(), replacement_request_id, - ) - .await - .unwrap(); - - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Requested, - ); - - RegionReplacementDeletedVolumeTest { - log: cptestctx.logctx.log.new(o!()), - datastore, - disk_test, - client: 
client.clone(), - internal_client: internal_client.clone(), - replacement_request_id, + } } - } - pub fn opctx(&self) -> OpContext { - OpContext::for_tests(self.log.clone(), self.datastore.clone()) - } + pub fn opctx(&self) -> OpContext { + OpContext::for_tests(self.log.clone(), self.datastore.clone()) + } - pub async fn delete_the_disk(&self) { - let disk_url = get_disk_url("disk"); - NexusRequest::object_delete(&self.client, &disk_url) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .expect("failed to delete disk"); - } + pub async fn delete_the_disk(&self) { + let disk_url = get_disk_url("disk"); + NexusRequest::object_delete(&self.client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + } - /// Make sure: - /// - /// - all region replacement related background tasks run to completion - /// - this harness' region replacement request has transitioned to Complete - /// - no Crucible resources are leaked - pub async fn finish_test(&self) { - // Make sure that all the background tasks can run to completion. + /// Make sure: + /// + /// - all region replacement related background tasks run to completion + /// - this harness' region replacement request has transitioned to + /// Complete + /// - no Crucible resources are leaked + pub async fn finish_test(&self) { + // Make sure that all the background tasks can run to completion. 
- run_replacement_tasks_to_completion(&self.internal_client).await; + run_replacement_tasks_to_completion(&self.internal_client).await; - // Assert the request is in state Complete + // Assert the request is in state Complete - let region_replacement = self - .datastore - .get_region_replacement_request_by_id( - &self.opctx(), - self.replacement_request_id, - ) - .await - .unwrap(); + let region_replacement = self + .datastore + .get_region_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Complete, - ); + assert_eq!( + region_replacement.replacement_state, + RegionReplacementState::Complete, + ); - // Assert there are no more Crucible resources + // Assert there are no more Crucible resources - assert!(self.disk_test.crucible_resources_deleted().await); - } + assert!(self.disk_test.crucible_resources_deleted().await); + } - async fn wait_for_request_state( - &self, - expected_end_state: ExpectedEndState, - expected_intermediate_state: ExpectedIntermediateState, - ) { - wait_for_condition( - || { - let datastore = self.datastore.clone(); - let opctx = self.opctx(); - let replacement_request_id = self.replacement_request_id; - - async move { - let region_replacement = datastore - .get_region_replacement_request_by_id( - &opctx, - replacement_request_id, - ) - .await - .unwrap(); - - let state = region_replacement.replacement_state; - - if state == expected_end_state.0 { - // The saga transitioned the request ok - Ok(()) - } else if state == expected_intermediate_state.0 { - // The saga is still running - Err(CondCheckError::<()>::NotYet) - } else { - // Any other state is not expected - panic!("unexpected state {state:?}!"); + async fn wait_for_request_state( + &self, + expected_end_state: ExpectedEndState, + expected_intermediate_state: ExpectedIntermediateState, + expected_start_state: ExpectedStartState, + ) { + wait_for_condition( + 
|| { + let datastore = self.datastore.clone(); + let opctx = self.opctx(); + let replacement_request_id = self.replacement_request_id; + + async move { + let region_replacement = datastore + .get_region_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + let state = region_replacement.replacement_state; + + // If the expected start and end state are the same + // (i.e. there's a back edge in the associated request's + // state machine), then it's impossible to determine + // when the saga starts and stops based on the state. + if expected_end_state.0 == expected_start_state.0 { + if state == expected_end_state.0 { + // The saga transitioned the request ok, or + // hasn't started yet. Either way we have to + // return here, and the call site should perform + // an additional check for some associated + // expected result. + Ok(()) + } else if state == expected_intermediate_state.0 { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } else { + // Any other state is not expected + panic!("unexpected state {state:?}!"); + } + } else { + if state == expected_end_state.0 { + // The saga transitioned the request ok + Ok(()) + } else if state == expected_intermediate_state.0 + || state == expected_start_state.0 + { + // The saga is still running, or hasn't started + // yet. 
+ Err(CondCheckError::<()>::NotYet) + } else { + // Any other state is not expected + panic!("unexpected state {state:?}!"); + } + } } - } - }, - &std::time::Duration::from_millis(500), - &std::time::Duration::from_secs(60), - ) - .await - .expect("request transitioned to expected state"); - - // Assert the request state - - let region_replacement = self - .datastore - .get_region_replacement_request_by_id( - &self.opctx(), - self.replacement_request_id, + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), ) .await - .unwrap(); - - assert_eq!(region_replacement.replacement_state, expected_end_state.0); - } - - /// Run the "region replacement" task to transition the request to Running. - pub async fn transition_request_to_running(&self) { - // Activate the "region replacement" background task - - run_region_replacement(&self.internal_client).await; + .expect("request transitioned to expected state"); + + // Assert the request state + + let region_replacement = self + .datastore + .get_region_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_replacement.replacement_state, + expected_end_state.0 + ); + } - // The activation above could only have started the associated saga, so - // wait until the request is in state Running. + /// Run the "region replacement" task to transition the request to + /// Running. 
+ pub async fn transition_request_to_running(&self) { + // Activate the "region replacement" background task - self.wait_for_request_state( - ExpectedEndState(RegionReplacementState::Running), - ExpectedIntermediateState(RegionReplacementState::Allocating), - ) - .await; - } + run_region_replacement(&self.internal_client).await; - /// Call the region replacement drive task to attach the associated volume - /// to the simulated pantry, ostensibly for reconciliation - pub async fn attach_request_volume_to_pantry(&self) { - // Run the "region replacement driver" task to attach the associated - // volume to the simulated pantry. + // The activation above could only have started the associated saga, + // so wait until the request is in state Running. - run_region_replacement_driver(&self.internal_client).await; + self.wait_for_request_state( + ExpectedEndState(RegionReplacementState::Running), + ExpectedIntermediateState(RegionReplacementState::Allocating), + ExpectedStartState(RegionReplacementState::Requested), + ) + .await; + } - // The activation above could only have started the associated saga, so - // wait until the request is in the expected end state. + /// Call the region replacement drive task to attach the associated volume + /// to the simulated pantry, ostensibly for reconciliation + pub async fn attach_request_volume_to_pantry(&self) { + // Run the "region replacement driver" task to attach the associated + // volume to the simulated pantry. - self.wait_for_request_state( - ExpectedEndState(RegionReplacementState::Running), - ExpectedIntermediateState(RegionReplacementState::Driving), - ) - .await; + run_region_replacement_driver(&self.internal_client).await; - // Additionally, assert that the drive saga recorded that it sent the - // attachment request to the simulated pantry + // The activation above could only have started the associated saga, + // so wait until the request is in the expected end state. 
- let most_recent_step = self - .datastore - .current_region_replacement_request_step( - &self.opctx(), - self.replacement_request_id, + self.wait_for_request_state( + ExpectedEndState(RegionReplacementState::Running), + ExpectedIntermediateState(RegionReplacementState::Driving), + ExpectedStartState(RegionReplacementState::Running), ) - .await - .unwrap() - .unwrap(); - - assert!(most_recent_step.pantry_address().is_some()); - } + .await; - /// Manually activate the background attachment for the request volume - pub async fn manually_activate_attached_volume( - &self, - cptestctx: &'a ControlPlaneTestContext, - ) { - let pantry = - cptestctx.sled_agent.pantry_server.as_ref().unwrap().pantry.clone(); - - let region_replacement = self - .datastore - .get_region_replacement_request_by_id( - &self.opctx(), - self.replacement_request_id, + // Additionally, assert that the drive saga recorded that it sent + // the attachment request to the simulated pantry. + // + // If `wait_for_request_state` has the same expected start and end + // state (as it does above), it's possible to exit that function + // having not yet started the saga yet, and this requires an + // additional `wait_for_condition` to wait for the expected recorded + // step. + + let most_recent_step = wait_for_condition( + || { + let datastore = self.datastore.clone(); + let opctx = self.opctx(); + let replacement_request_id = self.replacement_request_id; + + async move { + match datastore + .current_region_replacement_request_step( + &opctx, + replacement_request_id, + ) + .await + .unwrap() + { + Some(step) => Ok(step), + + None => { + // The saga either has not started yet or is + // still running - see the comment before this + // check for more info. 
+ Err(CondCheckError::<()>::NotYet) + } + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(10), ) .await - .unwrap(); + .expect("most recent step"); - pantry - .activate_background_attachment( - region_replacement.volume_id.to_string(), - ) - .await - .unwrap(); - } + assert!(most_recent_step.pantry_address().is_some()); + } - /// Transition request to ReplacementDone via the region replacement drive - /// saga - pub async fn transition_request_to_replacement_done(&self) { - // Run the "region replacement driver" task + /// Manually activate the background attachment for the request volume + pub async fn manually_activate_attached_volume( + &self, + cptestctx: &'a ControlPlaneTestContext, + ) { + let pantry = cptestctx + .sled_agent + .pantry_server + .as_ref() + .unwrap() + .pantry + .clone(); + + let region_replacement = self + .datastore + .get_region_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); + + pantry + .activate_background_attachment( + region_replacement.volume_id.to_string(), + ) + .await + .unwrap(); + } - run_region_replacement_driver(&self.internal_client).await; + /// Transition request to ReplacementDone via the region replacement + /// drive saga + pub async fn transition_request_to_replacement_done(&self) { + // Run the "region replacement driver" task - // The activation above could only have started the associated saga, so - // wait until the request is in the expected end state. + run_region_replacement_driver(&self.internal_client).await; - self.wait_for_request_state( - ExpectedEndState(RegionReplacementState::ReplacementDone), - ExpectedIntermediateState(RegionReplacementState::Driving), - ) - .await; + // The activation above could only have started the associated saga, + // so wait until the request is in the expected end state. 
+ + self.wait_for_request_state( + ExpectedEndState(RegionReplacementState::ReplacementDone), + ExpectedIntermediateState(RegionReplacementState::Driving), + ExpectedStartState(RegionReplacementState::Running), + ) + .await; + } } } @@ -401,7 +515,8 @@ impl<'a> RegionReplacementDeletedVolumeTest<'a> { async fn test_delete_volume_region_replacement_state_requested( cptestctx: &ControlPlaneTestContext, ) { - let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; + let test_harness = + region_replacement::DeletedVolumeTest::new(cptestctx).await; // The request leaves the `new` function in state Requested: delete the // disk, then finish the test. @@ -417,7 +532,8 @@ async fn test_delete_volume_region_replacement_state_requested( async fn test_delete_volume_region_replacement_state_running( cptestctx: &ControlPlaneTestContext, ) { - let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; + let test_harness = + region_replacement::DeletedVolumeTest::new(cptestctx).await; // The request leaves the `new` function in state Requested: // - transition the request to "Running" @@ -437,7 +553,8 @@ async fn test_delete_volume_region_replacement_state_running( async fn test_delete_volume_region_replacement_state_running_on_pantry( cptestctx: &ControlPlaneTestContext, ) { - let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; + let test_harness = + region_replacement::DeletedVolumeTest::new(cptestctx).await; // The request leaves the `new` function in state Requested: // - transition the request to "Running" @@ -459,7 +576,8 @@ async fn test_delete_volume_region_replacement_state_running_on_pantry( async fn test_delete_volume_region_replacement_state_replacement_done( cptestctx: &ControlPlaneTestContext, ) { - let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; + let test_harness = + region_replacement::DeletedVolumeTest::new(cptestctx).await; // The request leaves the `new` function in 
state Requested: // - transition the request to "Running" @@ -481,3 +599,1109 @@ async fn test_delete_volume_region_replacement_state_replacement_done( test_harness.finish_test().await; } + +/// Assert that the problem experienced in issue 6353 is fixed +#[nexus_test] +async fn test_racing_replacements_for_soft_deleted_disk_volume( + cptestctx: &ControlPlaneTestContext, +) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Create four zpools, each with one dataset. This is required for region + // and region snapshot replacement to have somewhere to move the data. + let sled_id = cptestctx.first_sled(); + let mut disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(sled_id) + .with_zpool_count(4) + .build() + .await; + + // Create a disk, then a snapshot of that disk + let client = &cptestctx.external_client; + let _project_id = create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshots_url = format!("/v1/snapshots?project={}", PROJECT_NAME); + + let snapshot: views::Snapshot = object_create( + client, + &snapshots_url, + ¶ms::SnapshotCreate { + identity: IdentityMetadataCreateParams { + name: "snapshot".parse().unwrap(), + description: String::from("a snapshot"), + }, + disk: disk.identity.name.into(), + }, + ) + .await; + + // Before deleting the disk, save the DB model + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap(); + + assert_eq!(db_disk.id(), disk.identity.id); + + // Next, expunge a physical disk that contains a region snapshot (which + // means it'll have the region too) + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + let (dataset, region) = &disk_allocated_regions[0]; + let zpool = disk_test + .zpools() + .find(|x| 
*x.id.as_untyped_uuid() == dataset.pool_id) + .expect("Expected at least one zpool"); + + let (_, db_zpool) = LookupPath::new(&opctx, datastore) + .zpool_id(zpool.id.into_untyped_uuid()) + .fetch() + .await + .unwrap(); + + datastore + .physical_disk_update_policy( + &opctx, + db_zpool.physical_disk_id.into(), + PhysicalDiskPolicy::Expunged, + ) + .await + .unwrap(); + + // Only one region snapshot should be been returned by the following call + // due to the allocation policy. + + let expunged_region_snapshots = datastore + .find_region_snapshots_on_expunged_physical_disks(&opctx) + .await + .unwrap(); + + assert_eq!(expunged_region_snapshots.len(), 1); + + for expunged_region_snapshot in expunged_region_snapshots { + assert_eq!(expunged_region_snapshot.snapshot_id, snapshot.identity.id); + } + + // Either one or two regions can be returned, depending on if the snapshot + // destination volume was allocated on to the physical disk that was + // expunged. + + let expunged_regions = datastore + .find_regions_on_expunged_physical_disks(&opctx) + .await + .unwrap(); + + match expunged_regions.len() { + 1 => { + assert_eq!(expunged_regions[0].id(), region.id()); + } + + 2 => { + assert!(expunged_regions.iter().any(|r| r.id() == region.id())); + + let (.., db_snapshot) = LookupPath::new(&opctx, datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap(); + + let snapshot_allocated_datasets_and_regions = datastore + .get_allocated_regions(db_snapshot.destination_volume_id) + .await + .unwrap(); + + let snapshot_allocated_regions: Vec = + snapshot_allocated_datasets_and_regions + .into_iter() + .map(|(_, r)| r.id()) + .collect(); + + assert!(expunged_regions.iter().any(|region| { + snapshot_allocated_regions.contains(®ion.id()) + })); + } + + _ => { + panic!("unexpected number of expunged regions!"); + } + } + + // Now, race the region replacement with the region snapshot replacement: + // + // 1) region replacement will allocate a new region and swap 
it into the + // disk volume. + + let internal_client = &cptestctx.internal_client; + + let _ = + activate_background_task(&internal_client, "region_replacement").await; + + // After that task invocation, there should be one running region + // replacement for the disk's region. Filter out the replacement request for + // the snapshot destination volume if it's there. The above background task + // only starts the associated saga, so wait for it to complete. + + wait_for_condition( + || { + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + async move { + let region_replacements: Vec<_> = datastore + .get_running_region_replacements(&opctx) + .await + .unwrap() + .into_iter() + .filter(|x| x.old_region_id == region.id()) + .collect(); + + if region_replacements.len() == 1 { + // The saga transitioned the request ok + Ok(()) + } else { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(20), + ) + .await + .expect("request transitioned to expected state"); + + let region_replacements: Vec<_> = datastore + .get_running_region_replacements(&opctx) + .await + .unwrap() + .into_iter() + .filter(|x| x.old_region_id == region.id()) + .collect(); + + assert_eq!(region_replacements.len(), 1); + + // 2) region snapshot replacement start will replace the region snapshot in + // the snapshot volume + + let _ = activate_background_task( + &internal_client, + "region_snapshot_replacement_start", + ) + .await; + + // After that, there should be one "replacement done" region snapshot + // replacement for the associated region snapshot. The above background task + // only starts the associated saga, so wait for it to complete. 
+ wait_for_condition( + || { + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + async move { + let region_snapshot_replacements = datastore + .get_replacement_done_region_snapshot_replacements(&opctx) + .await + .unwrap(); + + if region_snapshot_replacements.len() == 1 { + // The saga transitioned the request ok + Ok(()) + } else { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(20), + ) + .await + .expect("request transitioned to expected state"); + + let region_snapshot_replacements = datastore + .get_replacement_done_region_snapshot_replacements(&opctx) + .await + .unwrap(); + + assert_eq!(region_snapshot_replacements.len(), 1); + assert_eq!( + region_snapshot_replacements[0].old_dataset_id, + dataset.id().into() + ); + assert_eq!(region_snapshot_replacements[0].old_region_id, region.id()); + assert_eq!( + region_snapshot_replacements[0].old_snapshot_id, + snapshot.identity.id + ); + assert_eq!( + region_snapshot_replacements[0].replacement_state, + RegionSnapshotReplacementState::ReplacementDone, + ); + + assert!(datastore.find_deleted_volume_regions().await.unwrap().is_empty()); + + // 3) Delete the disk + let disk_url = get_disk_url("disk"); + NexusRequest::object_delete(client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + // The volume should be soft-deleted now. The region snapshot replacement + // swapped out the region snapshot from the snapshot volume to the temporary + // volume for later deletion, but has not actually deleted that temporary + // volume yet, so the count will not have gone to 0. 
+ + let volume = datastore.volume_get(db_disk.volume_id).await.unwrap(); + assert!(volume.is_some()); + assert!(volume.unwrap().time_deleted.is_some()); + + // 4) region snapshot replacement garbage collect will delete the temporary + // volume with the stashed reference to the region snapshot, bringing the + // reference count to zero. + + let _ = activate_background_task( + &internal_client, + "region_snapshot_replacement_garbage_collection", + ) + .await; + + // Assert the region snapshot was deleted. + assert!(datastore + .region_snapshot_get(dataset.id(), region.id(), snapshot.identity.id) + .await + .unwrap() + .is_none()); + + // Assert that the disk's volume is still only soft-deleted, because the two + // other associated region snapshots still exist. + let volume = datastore.volume_get(db_disk.volume_id).await.unwrap(); + assert!(volume.is_some()); + + // Check on the old region id - it should not be deleted + let maybe_region = + datastore.get_region_optional(region.id()).await.unwrap(); + + eprintln!("old_region_id: {:?}", &maybe_region); + assert!(maybe_region.is_some()); + + // But the new region id will be! 
+ let maybe_region = datastore + .get_region_optional(region_replacements[0].new_region_id.unwrap()) + .await + .unwrap(); + + eprintln!("new region id: {:?}", &maybe_region); + assert!(maybe_region.is_none()); + + // The region_replacement drive task should invoke the drive saga now, which + // will skip over all notification steps and transition the request to + // ReplacementDone + + let last_background_task = + activate_background_task(&internal_client, "region_replacement_driver") + .await; + + assert!(match last_background_task.last { + LastResult::Completed(last_result_completed) => { + match serde_json::from_value::( + last_result_completed.details, + ) { + Err(e) => { + eprintln!("{e}"); + false + } + + Ok(v) => !v.drive_invoked_ok.is_empty(), + } + } + + _ => { + false + } + }); + + // wait for the drive saga to complete here + wait_for_condition( + || { + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + let replacement_request_id = region_replacements[0].id; + + async move { + let region_replacement = datastore + .get_region_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + let state = region_replacement.replacement_state; + + if state == RegionReplacementState::ReplacementDone { + // The saga transitioned the request ok + Ok(()) + } else if state == RegionReplacementState::Driving { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } else if state == RegionReplacementState::Completing { + // The saga transitioned the request ok, and it's now being + // finished by the region replacement finish saga + Ok(()) + } else { + // Any other state is not expected + panic!("unexpected state {state:?}!"); + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), + ) + .await + .expect("request transitioned to expected state"); + + // After the region snapshot replacement process runs to completion, there + // should be no 
more crucible resources left. Run the "region snapshot + // replacement step" background task until there's nothing left, then the + // "region snapshot replacement finish", then make sure there are no + // crucible resources left. + + let mut count = 0; + loop { + let actions_taken = + run_region_snapshot_replacement_step(&internal_client).await; + + if actions_taken == 0 { + break; + } + + count += 1; + + if count > 20 { + assert!(false); + } + } + + let _ = activate_background_task( + &internal_client, + "region_snapshot_replacement_finish", + ) + .await; + + // Ensure the region snapshot replacement request went to Complete + + wait_for_condition( + || { + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + let request_id = region_snapshot_replacements[0].id; + + async move { + let region_snapshot_replacement = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, request_id, + ) + .await + .unwrap(); + + let state = region_snapshot_replacement.replacement_state; + + if state == RegionSnapshotReplacementState::Complete { + Ok(()) + } else { + // Any other state is not expected + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), + ) + .await + .expect("request transitioned to expected state"); + + // Delete the snapshot + + let snapshot_url = get_snapshot_url("snapshot"); + NexusRequest::object_delete(client, &snapshot_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + // and now there should be no higher level resources left + + let disks_url = get_disks_url(); + assert_eq!( + collection_list::(&client, &disks_url).await.len(), + 0 + ); + + let snapshots_url = get_snapshots_url(); + assert_eq!( + collection_list::(&client, &snapshots_url).await.len(), + 0 + ); + + // Make sure that all the background tasks can run to completion. 
+ + run_replacement_tasks_to_completion(&internal_client).await; + + // The disk volume should be deleted by the snapshot delete: wait until this + // happens + + wait_for_condition( + || { + let datastore = datastore.clone(); + let volume_id = db_disk.volume_id; + + async move { + let volume = datastore.volume_get(volume_id).await.unwrap(); + if volume.is_none() { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(10), + ) + .await + .expect("disk volume deleted"); + + // There should be no more crucible resources left. Don't just check for + // `crucible_resources_deleted` here! We have set one of the physical disk + // policies to expunged, so Nexus will not attempt to clean up any resources + // on that physical disk. + + disk_test.remove_zpool(db_zpool.id()).await; + + // Now, assert that all crucible resources are cleaned up + + assert!(disk_test.crucible_resources_deleted().await); +} + +mod region_snapshot_replacement { + use super::*; + + #[derive(Debug)] + struct ExpectedEndState(pub RegionSnapshotReplacementState); + + #[derive(Debug)] + struct ExpectedIntermediateState(pub RegionSnapshotReplacementState); + + #[derive(Debug)] + struct ExpectedStartState(pub RegionSnapshotReplacementState); + + pub(super) struct DeletedVolumeTest<'a> { + log: Logger, + datastore: Arc, + disk_test: DiskTest<'a>, + client: ClientTestContext, + internal_client: ClientTestContext, + replacement_request_id: Uuid, + snapshot_socket_addr: SocketAddr, + } + + impl<'a> DeletedVolumeTest<'a> { + pub async fn new(cptestctx: &'a ControlPlaneTestContext) -> Self { + let nexus = &cptestctx.server.server_context().nexus; + + // Create four zpools, each with one dataset. This is required for + // region and region snapshot replacement to have somewhere to move + // the data. 
+ let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let datastore = nexus.datastore().clone(); + + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + // Create a disk, a snapshot of that disk, and a disk from that + // snapshot + let _project_id = create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot") + .await; + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Manually create the region snapshot replacement request for the + // first allocated region of that disk + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap(); + + assert_eq!(db_disk.id(), disk.identity.id); + + let disk_allocated_regions = datastore + .get_allocated_regions(db_disk.volume_id) + .await + .unwrap(); + let (_, region) = &disk_allocated_regions[0]; + + let region_snapshot = datastore + .region_snapshot_get( + region.dataset_id(), + region.id(), + snapshot.identity.id, + ) + .await + .expect("found region snapshot without error") + .unwrap(); + + let replacement_request_id = datastore + .create_region_snapshot_replacement_request( + &opctx, + ®ion_snapshot, + ) + .await + .unwrap(); + + // Assert the request is in state Requested + + let region_snapshot_replacement = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_snapshot_replacement.replacement_state, + RegionSnapshotReplacementState::Requested, + ); + + // Assert two volumes reference the snapshot addr + + let 
snapshot_socket_addr = + region_snapshot.snapshot_addr.parse().unwrap(); + + let volumes = datastore + .find_volumes_referencing_socket_addr( + &opctx, + snapshot_socket_addr, + ) + .await + .unwrap(); + + assert_eq!(volumes.len(), 2); + + // Validate that they are snapshot and disk from snapshot + + let volumes_set: HashSet = + volumes.into_iter().map(|v| v.id()).collect(); + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap(); + + let (.., db_disk_from_snapshot) = + LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap(); + + assert!(volumes_set.contains(&db_snapshot.volume_id)); + assert!(volumes_set.contains(&db_disk_from_snapshot.volume_id)); + + DeletedVolumeTest { + log: cptestctx.logctx.log.new(o!()), + datastore, + disk_test, + client: client.clone(), + internal_client: internal_client.clone(), + replacement_request_id, + snapshot_socket_addr, + } + } + + pub fn opctx(&self) -> OpContext { + OpContext::for_tests(self.log.clone(), self.datastore.clone()) + } + + pub async fn delete_the_disk(&self) { + let disk_url = get_disk_url("disk"); + NexusRequest::object_delete(&self.client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + } + + pub async fn delete_the_snapshot(&self) { + let snapshot_url = get_snapshot_url("snapshot"); + NexusRequest::object_delete(&self.client, &snapshot_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + } + + pub async fn delete_the_disk_from_snapshot(&self) { + let disk_url = get_disk_url("disk-from-snapshot"); + NexusRequest::object_delete(&self.client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk-from-snapshot"); + } + + /// Make sure: + /// + /// - all region snapshot replacement related background tasks run to + /// 
completion + /// - this harness' region snapshot replacement request has transitioned + /// to Complete + /// - there are no more volumes that reference the request's region + /// snapshot + pub async fn finish_test(&self) { + // Make sure that all the background tasks can run to completion. + + run_replacement_tasks_to_completion(&self.internal_client).await; + + // Assert the request is in state Complete + + wait_for_condition( + || { + let datastore = self.datastore.clone(); + let opctx = self.opctx(); + let replacement_request_id = self.replacement_request_id; + + async move { + let region_replacement = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + let state = region_replacement.replacement_state; + + if state == RegionSnapshotReplacementState::Complete { + // The saga transitioned the request ok + Ok(()) + } else { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(10), + ) + .await + .expect("request transitioned to expected state"); + + let region_snapshot_replacement = self + .datastore + .get_region_snapshot_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_snapshot_replacement.replacement_state, + RegionSnapshotReplacementState::Complete, + ); + + // Assert no volumes are referencing the snapshot address + + let volumes = self + .datastore + .find_volumes_referencing_socket_addr( + &self.opctx(), + self.snapshot_socket_addr, + ) + .await + .unwrap(); + + if !volumes.is_empty() { + eprintln!("{:?}", volumes); + } + + assert!(volumes.is_empty()); + } + + /// Assert no Crucible resources are leaked + pub async fn assert_no_crucible_resources_leaked(&self) { + assert!(self.disk_test.crucible_resources_deleted().await); + } + + async fn wait_for_request_state( + &self, + expected_end_state: 
ExpectedEndState, + expected_intermediate_state: ExpectedIntermediateState, + expected_start_state: ExpectedStartState, + ) { + wait_for_condition( + || { + let datastore = self.datastore.clone(); + let opctx = self.opctx(); + let replacement_request_id = self.replacement_request_id; + + async move { + let request = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + let state = request.replacement_state; + + if state == expected_end_state.0 { + // The saga transitioned the request ok + Ok(()) + } else if state == expected_intermediate_state.0 { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } else if state == expected_start_state.0 { + // The saga hasn't started yet + Err(CondCheckError::<()>::NotYet) + } else { + // Any other state is not expected + panic!("unexpected state {state:?}!"); + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), + ) + .await + .expect("request transitioned to expected state"); + + // Assert the request state + + let region_snapshot_replacement = self + .datastore + .get_region_snapshot_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_snapshot_replacement.replacement_state, + expected_end_state.0, + ); + } + + /// Run the "region snapshot replacement start" task to transition the request + /// to ReplacementDone. + pub async fn transition_request_to_replacement_done(&self) { + // Activate the "region snapshot replacement start" background task + + run_region_snapshot_replacement_start(&self.internal_client).await; + + // The activation above could only have started the associated saga, + // so wait until the request is in state ReplacementDone. 
+ + self.wait_for_request_state( + ExpectedEndState( + RegionSnapshotReplacementState::ReplacementDone, + ), + ExpectedIntermediateState( + RegionSnapshotReplacementState::Allocating, + ), + ExpectedStartState(RegionSnapshotReplacementState::Requested), + ) + .await; + } + + /// Run the "region snapshot replacement garbage collection" task to + /// transition the request to Running. + pub async fn transition_request_to_running(&self) { + // Activate the "region snapshot replacement garbage collection" + // background task + + run_region_snapshot_replacement_garbage_collection( + &self.internal_client, + ) + .await; + + // The activation above could only have started the associated saga, + // so wait until the request is in state Running. + + self.wait_for_request_state( + ExpectedEndState(RegionSnapshotReplacementState::Running), + ExpectedIntermediateState( + RegionSnapshotReplacementState::DeletingOldVolume, + ), + ExpectedStartState( + RegionSnapshotReplacementState::ReplacementDone, + ), + ) + .await; + } + + /// Manually create a region snapshot replacement step for the disk + /// created from the snapshot + pub async fn create_manual_region_snapshot_replacement_step(&self) { + let disk_url = get_disk_url("disk-from-snapshot"); + + let disk_from_snapshot: external::Disk = + NexusRequest::object_get(&self.client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap() + .parsed_body() + .unwrap(); + + let (.., db_disk_from_snapshot) = + LookupPath::new(&self.opctx(), &self.datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap(); + + let result = self + .datastore + .create_region_snapshot_replacement_step( + &self.opctx(), + self.replacement_request_id, + db_disk_from_snapshot.volume_id, + ) + .await + .unwrap(); + + match result { + InsertStepResult::Inserted { .. 
} => {} + + _ => { + assert!(false, "bad result from create_region_snapshot_replacement_step"); + } + } + } + } +} + +/// Assert that a region snapshot replacement request in state "Requested" can +/// have its snapshot deleted and still transition to Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_requested( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: delete + // the snapshot, then finish the test. + + test_harness.delete_the_snapshot().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources + + test_harness.delete_the_disk().await; + test_harness.delete_the_disk_from_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + +/// Assert that a region snapshot replacement request in state "Requested" can +/// have its snapshot deleted, and the snapshot's source disk can be deleted, +/// and the request will still transition to Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_requested_2( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - delete the snapshot + // - delete the snapshot's source disk + // - the only thing that will remain is the disk-from-snap that was created + // - finally, call finish_test + + test_harness.delete_the_snapshot().await; + test_harness.delete_the_disk().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources + + test_harness.delete_the_disk_from_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + +/// Assert that 
a region snapshot replacement request in state "Requested" can +/// have everything be deleted, and the request will still transition to +/// Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_requested_3( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - delete the snapshot + // - delete the snapshot's source disk + // - delete the disk created from the snapshot + // - finally, call finish_test + + test_harness.delete_the_snapshot().await; + test_harness.delete_the_disk().await; + test_harness.delete_the_disk_from_snapshot().await; + + test_harness.finish_test().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + +/// Assert that a region snapshot replacement request in state "ReplacementDone" +/// can have its snapshot deleted, and the request will still transition to +/// Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_replacement_done( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - transition the request to "ReplacementDone" + // - delete the snapshot + // - finally, call finish_test + + test_harness.transition_request_to_replacement_done().await; + + test_harness.delete_the_snapshot().await; + test_harness.delete_the_disk().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources + + test_harness.delete_the_disk_from_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + +/// Assert that a region snapshot replacement request in state "Running" +/// can have its snapshot 
deleted, and the request will still transition to +/// Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_running( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - transition the request to "ReplacementDone" + // - transition the request to "Running" + // - delete the snapshot + // - finally, call finish_test + + test_harness.transition_request_to_replacement_done().await; + test_harness.transition_request_to_running().await; + + test_harness.delete_the_snapshot().await; + test_harness.delete_the_disk().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources + + test_harness.delete_the_disk_from_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + +/// Assert that a region snapshot replacement step can have its associated +/// volume deleted and still transition to VolumeDeleted +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_step( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - transition the request to "ReplacementDone" + // - transition the request to "Running" + // - manually create a region snapshot replacement step for the disk created + // from the snapshot + // - delete the disk created from the snapshot + // - finally, call finish_test + + test_harness.transition_request_to_replacement_done().await; + test_harness.transition_request_to_running().await; + + test_harness.create_manual_region_snapshot_replacement_step().await; + test_harness.delete_the_disk_from_snapshot().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources 
+ + test_harness.delete_the_disk().await; + test_harness.delete_the_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 6a9ce28389..f0eb294e58 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -3764,12 +3764,13 @@ impl TestReadOnlyRegionReferenceUsage { // read-only regions should never be returned by find_deleted_volume_regions pub async fn region_not_returned_by_find_deleted_volume_regions(&self) { - let deleted_volume_regions = + let freed_crucible_resources = self.datastore.find_deleted_volume_regions().await.unwrap(); - assert!(!deleted_volume_regions + assert!(!freed_crucible_resources + .datasets_and_regions .into_iter() - .any(|(_, r, _)| r.id() == self.region.id())); + .any(|(_, r)| r.id() == self.region.id())); } pub async fn create_first_volume_region_in_rop(&self) { diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index cf3d652587..bee2f56c34 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -37,6 +37,7 @@ pub struct LookupRegionPortStatus { pub struct RegionSnapshotReplacementStartStatus { pub requests_created_ok: Vec, pub start_invoked_ok: Vec, + pub requests_completed_ok: Vec, pub errors: Vec, } @@ -55,13 +56,14 @@ pub struct RegionSnapshotReplacementStepStatus { pub step_records_created_ok: Vec, pub step_garbage_collect_invoked_ok: Vec, pub step_invoked_ok: Vec, + pub step_set_volume_deleted_ok: Vec, pub errors: Vec, } /// The status of a `region_snapshot_replacement_finish` background task activation #[derive(Serialize, Deserialize, Default, Debug, PartialEq, Eq)] pub struct RegionSnapshotReplacementFinishStatus { - pub records_set_to_done: Vec, + pub 
finish_invoked_ok: Vec, pub errors: Vec, } diff --git a/schema/crdb/add-completing-and-new-region-volume/up01.sql b/schema/crdb/add-completing-and-new-region-volume/up01.sql new file mode 100644 index 0000000000..6a973eb3c3 --- /dev/null +++ b/schema/crdb/add-completing-and-new-region-volume/up01.sql @@ -0,0 +1 @@ +ALTER TYPE omicron.public.region_snapshot_replacement_state ADD VALUE IF NOT EXISTS 'completing' AFTER 'complete'; diff --git a/schema/crdb/add-completing-and-new-region-volume/up02.sql b/schema/crdb/add-completing-and-new-region-volume/up02.sql new file mode 100644 index 0000000000..42c0028ff5 --- /dev/null +++ b/schema/crdb/add-completing-and-new-region-volume/up02.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.region_snapshot_replacement ADD COLUMN IF NOT EXISTS new_region_volume_id UUID; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 75b7dbaf08..67b8ab0041 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4427,7 +4427,8 @@ CREATE TYPE IF NOT EXISTS omicron.public.region_snapshot_replacement_state AS EN 'replacement_done', 'deleting_old_volume', 'running', - 'complete' + 'complete', + 'completing' ); CREATE TABLE IF NOT EXISTS omicron.public.region_snapshot_replacement ( @@ -4445,7 +4446,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.region_snapshot_replacement ( replacement_state omicron.public.region_snapshot_replacement_state NOT NULL, - operating_saga_id UUID + operating_saga_id UUID, + + new_region_volume_id UUID ); CREATE INDEX IF NOT EXISTS lookup_region_snapshot_replacement_by_state on omicron.public.region_snapshot_replacement (replacement_state); @@ -4694,7 +4697,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '116.0.0', NULL) + (TRUE, NOW(), NOW(), '117.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT;