Skip to content

Commit

Permalink
Handle region snapshot replacement volume deletes
Browse files Browse the repository at this point in the history
Volumes can be deleted at any time, but the tasks and sagas that perform
region snapshot replacement did not account for this. This commit adds
checks in a few places for whether a volume is soft-deleted or
hard-deleted, and bails out of any affected region snapshot replacement
accordingly:

- if a volume that has the region snapshot being replaced is
  soft-deleted, then skip making a region snapshot replacement step for
  it

- if a region snapshot replacement step has the volume deleted after the
  step was created, transition it directly to the VolumeDeleted state

- if a region snapshot replacement step has the volume deleted during
  the saga invocation, then skip notifying any Upstairs and allow the
  saga to transition the request to Complete, at which point the
  associated cleanup can proceed

An interesting race condition emerged during unit testing: the read-only
region allocated to replace a region snapshot would be swapped into the
snapshot volume, but would be susceptible to being deleted by the user,
and therefore unable to be swapped into other volumes that have that
snapshot volume as a read-only parent.

This requires an additional volume that uses that read-only region in
order to bump the reference count associated with that region, so that
the user cannot delete it before it has been used to replace all other
uses of the region snapshot it was meant to replace.

This additional volume lives as long as the region snapshot replacement,
and therefore needs to be deleted when the region snapshot replacement
is finished. This required a new region snapshot replacement finish
saga, which required a new "Completing" state to perform the same type
of state-based lock on the replacement request that is done for all the
other sagas.

Testing also revealed that there were scenarios where
`find_deleted_volume_regions` would return volumes for hard-deletion
prematurely. The function now returns a struct instead of a list of
tuples, and in that struct, regions freed for deletion are now distinct
from volumes freed for deletion. Volumes are now only returned for
hard-deletion when all associated read/write regions have been (or are
going to be) deleted.

Fixes oxidecomputer#6353
  • Loading branch information
jmpesp committed Nov 12, 2024
1 parent 7cf688c commit 92da0a9
Show file tree
Hide file tree
Showing 25 changed files with 2,234 additions and 363 deletions.
41 changes: 25 additions & 16 deletions dev-tools/omdb/src/bin/omdb/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2558,39 +2558,48 @@ async fn cmd_db_region_used_by(
async fn cmd_db_region_find_deleted(
datastore: &DataStore,
) -> Result<(), anyhow::Error> {
let datasets_regions_volumes =
let freed_crucible_resources =
datastore.find_deleted_volume_regions().await?;

#[derive(Tabled)]
struct Row {
struct RegionRow {
dataset_id: Uuid,
region_id: Uuid,
volume_id: String,
}

let rows: Vec<Row> = datasets_regions_volumes
.into_iter()
#[derive(Tabled)]
struct VolumeRow {
volume_id: Uuid,
}

let region_rows: Vec<RegionRow> = freed_crucible_resources
.datasets_and_regions
.iter()
.map(|row| {
let (dataset, region, volume) = row;
let (dataset, region) = row;

Row {
dataset_id: dataset.id(),
region_id: region.id(),
volume_id: if let Some(volume) = volume {
volume.id().to_string()
} else {
String::from("")
},
}
RegionRow { dataset_id: dataset.id(), region_id: region.id() }
})
.collect();

let table = tabled::Table::new(rows)
let table = tabled::Table::new(region_rows)
.with(tabled::settings::Style::psql())
.to_string();

println!("{}", table);

let volume_rows: Vec<VolumeRow> = freed_crucible_resources
.volumes
.iter()
.map(|volume_id| VolumeRow { volume_id: *volume_id })
.collect();

let volume_table = tabled::Table::new(volume_rows)
.with(tabled::settings::Style::psql())
.to_string();

println!("{}", volume_table);

Ok(())
}

Expand Down
23 changes: 20 additions & 3 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1655,6 +1655,14 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
println!(" > {line}");
}

println!(
" total requests completed ok: {}",
status.requests_completed_ok.len(),
);
for line in &status.requests_completed_ok {
println!(" > {line}");
}

println!(" errors: {}", status.errors.len());
for line in &status.errors {
println!(" > {line}");
Expand Down Expand Up @@ -1720,6 +1728,14 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
println!(" > {line}");
}

println!(
" total steps set to volume_deleted ok: {}",
status.step_set_volume_deleted_ok.len(),
);
for line in &status.step_set_volume_deleted_ok {
println!(" > {line}");
}

println!(" errors: {}", status.errors.len());
for line in &status.errors {
println!(" > {line}");
Expand Down Expand Up @@ -1831,10 +1847,11 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {

Ok(status) => {
println!(
" total records transitioned to done: {}",
status.records_set_to_done.len(),
" region snapshot replacement finish sagas started \
ok: {}",
status.finish_invoked_ok.len()
);
for line in &status.records_set_to_done {
for line in &status.finish_invoked_ok {
println!(" > {line}");
}

Expand Down
8 changes: 6 additions & 2 deletions dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,7 @@ task: "region_snapshot_replacement_finish"
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
total records transitioned to done: 0
region snapshot replacement finish sagas started ok: 0
errors: 0

task: "region_snapshot_replacement_garbage_collection"
Expand All @@ -645,6 +645,7 @@ task: "region_snapshot_replacement_start"
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
total requests created ok: 0
total start saga invoked ok: 0
total requests completed ok: 0
errors: 0

task: "region_snapshot_replacement_step"
Expand All @@ -655,6 +656,7 @@ task: "region_snapshot_replacement_step"
total step records created ok: 0
total step garbage collect saga invoked ok: 0
total step saga invoked ok: 0
total steps set to volume_deleted ok: 0
errors: 0

task: "saga_recovery"
Expand Down Expand Up @@ -1070,7 +1072,7 @@ task: "region_snapshot_replacement_finish"
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
total records transitioned to done: 0
region snapshot replacement finish sagas started ok: 0
errors: 0

task: "region_snapshot_replacement_garbage_collection"
Expand All @@ -1088,6 +1090,7 @@ task: "region_snapshot_replacement_start"
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
total requests created ok: 0
total start saga invoked ok: 0
total requests completed ok: 0
errors: 0

task: "region_snapshot_replacement_step"
Expand All @@ -1098,6 +1101,7 @@ task: "region_snapshot_replacement_step"
total step records created ok: 0
total step garbage collect saga invoked ok: 0
total step saga invoked ok: 0
total steps set to volume_deleted ok: 0
errors: 0

task: "saga_recovery"
Expand Down
18 changes: 16 additions & 2 deletions nexus/db-model/src/region_snapshot_replacement.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ impl_enum_type!(
ReplacementDone => b"replacement_done"
DeletingOldVolume => b"deleting_old_volume"
Running => b"running"
Completing => b"completing"
Complete => b"complete"
);

Expand All @@ -43,6 +44,7 @@ impl std::str::FromStr for RegionSnapshotReplacementState {
Ok(RegionSnapshotReplacementState::DeletingOldVolume)
}
"running" => Ok(RegionSnapshotReplacementState::Running),
"completing" => Ok(RegionSnapshotReplacementState::Completing),
"complete" => Ok(RegionSnapshotReplacementState::Complete),
_ => Err(format!("unrecognized value {} for enum", s)),
}
Expand Down Expand Up @@ -77,8 +79,13 @@ impl std::str::FromStr for RegionSnapshotReplacementState {
/// v ---
/// ---
/// Running |
/// | set in region snapshot replacement
/// | | finish background task
/// |
/// | |
/// v |
/// | responsibility of region snapshot
/// Completing | replacement finish saga
/// |
/// | |
/// v |
/// |
/// Complete ---
Expand Down Expand Up @@ -130,6 +137,12 @@ pub struct RegionSnapshotReplacement {
pub replacement_state: RegionSnapshotReplacementState,

pub operating_saga_id: Option<Uuid>,

/// In order for the newly created region not to be deleted inadvertently,
/// an additional reference count bump is required. This volume should live
/// as long as this request so that all necessary replacements can be
/// completed.
pub new_region_volume_id: Option<Uuid>,
}

impl RegionSnapshotReplacement {
Expand All @@ -154,6 +167,7 @@ impl RegionSnapshotReplacement {
old_snapshot_id,
old_snapshot_volume_id: None,
new_region_id: None,
new_region_volume_id: None,
replacement_state: RegionSnapshotReplacementState::Requested,
operating_saga_id: None,
}
Expand Down
1 change: 1 addition & 0 deletions nexus/db-model/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1929,6 +1929,7 @@ table! {
new_region_id -> Nullable<Uuid>,
replacement_state -> crate::RegionSnapshotReplacementStateEnum,
operating_saga_id -> Nullable<Uuid>,
new_region_volume_id -> Nullable<Uuid>,
}
}

Expand Down
3 changes: 2 additions & 1 deletion nexus/db-model/src/schema_versions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use std::collections::BTreeMap;
///
/// This must be updated when you change the database schema. Refer to
/// schema/crdb/README.adoc in the root of this repository for details.
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(114, 0, 0);
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(115, 0, 0);

/// List of all past database schema versions, in *reverse* order
///
Expand All @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
// | leaving the first copy as an example for the next person.
// v
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
KnownVersion::new(115, "add-completing-and-new-region-volume"),
KnownVersion::new(114, "crucible-ref-count-records"),
KnownVersion::new(113, "add-tx-eq"),
KnownVersion::new(112, "blueprint-dataset"),
Expand Down
11 changes: 1 addition & 10 deletions nexus/db-queries/src/db/datastore/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,16 +125,7 @@ pub use sled::TransitionError;
pub use switch_port::SwitchPortSettingsCombinedResult;
pub use virtual_provisioning_collection::StorageType;
pub use vmm::VmmStateUpdateResult;
pub use volume::read_only_resources_associated_with_volume;
pub use volume::CrucibleResources;
pub use volume::CrucibleTargets;
pub use volume::ExistingTarget;
pub use volume::ReplacementTarget;
pub use volume::VolumeCheckoutReason;
pub use volume::VolumeReplaceResult;
pub use volume::VolumeReplacementParams;
pub use volume::VolumeToDelete;
pub use volume::VolumeWithTarget;
pub use volume::*;

// Number of unique datasets required to back a region.
// TODO: This should likely turn into a configuration option.
Expand Down
Loading

0 comments on commit 92da0a9

Please sign in to comment.