Skip to content

Commit

Permalink
[sled-agent] Refactor service management out of StorageManager (#2946)
Browse files Browse the repository at this point in the history
## History

The Sled Agent has historically had two different "managers" responsible
for Zones:

1. `ServiceManager`, which resided over zones that do not operate on
Datasets
2. `StorageManager`, which manages disks, but also manages zones which
operate on those disks

This separation is even reflected in the sled agent API exposed to Nexus
- the Sled Agent exposes:

- `PUT /services`
- `PUT /filesystem`

For "add a service (within a zone) to this sled" vs "add a dataset (and
corresponding zone) to this sled within a particular zpool".

This has been kinda handy for Nexus, since "provision CRDB on this
dataset" and "start the CRDB service on that dataset" don't need to be
separate operations. Within the Sled Agent, however, it has been a
pain-in-the-butt from a perspective of diverging implementations. The
`StorageManager` and `ServiceManager` have evolved their own mechanisms
for storing configs, identifying filesystems on which to place zpools,
etc, even though their responsibilities (managing running zones) overlap
quite a lot.

## This PR

This PR migrates the responsibility for "service management" entirely
into the `ServiceManager`, leaving the `StorageManager` responsible for
monitoring disks.

In detail, this means:

- The responsibility for launching Clickhouse, CRDB, and Crucible zones
has moved from `storage_manager.rs` into `services.rs`
- Unfortunately, this also means we're taking a somewhat hacky approach
to formatting CRDB. This is fixed in
#2954.
- The `StorageManager` no longer requires an Etherstub device during
construction
- The `ServiceZoneRequest` can operate on an optional `dataset` argument
- The "config management" for datastore-based zones is now much more
aligned with non-dataset zones. Each sled stores
`/var/oxide/services.toml` and `/var/oxide/storage-services.toml` for
each group.
- These still need to be fixed with
#2888 , but it should be
simpler now.
- `filesystem_ensure` - which previously asked the `StorageManager` to
format a dataset and also launch a zone - now asks the `StorageManager`
to format a dataset, and separately asks the `ServiceManager` to launch
a zone.
- In the future, this may become vectorized ("ensure the sled has *all*
the datasets we want...") to have parity with the service management,
but this would require a more invasive change in Nexus.
  • Loading branch information
smklein authored May 1, 2023
1 parent 4261090 commit ccc28fe
Show file tree
Hide file tree
Showing 13 changed files with 777 additions and 758 deletions.
12 changes: 6 additions & 6 deletions illumos-utils/src/running_zone.rs
Original file line number Diff line number Diff line change
Expand Up @@ -592,8 +592,8 @@ impl InstalledZone {
///
/// This results in a zone name which is distinct across different zpools,
/// but stable and predictable across reboots.
pub fn get_zone_name(zone_name: &str, unique_name: Option<&str>) -> String {
let mut zone_name = format!("{}{}", ZONE_PREFIX, zone_name);
pub fn get_zone_name(zone_type: &str, unique_name: Option<&str>) -> String {
let mut zone_name = format!("{}{}", ZONE_PREFIX, zone_type);
if let Some(suffix) = unique_name {
zone_name.push_str(&format!("_{}", suffix));
}
Expand All @@ -618,7 +618,7 @@ impl InstalledZone {
log: &Logger,
underlay_vnic_allocator: &VnicAllocator<Etherstub>,
zone_root_path: &Path,
zone_name: &str,
zone_type: &str,
unique_name: Option<&str>,
datasets: &[zone::Dataset],
filesystems: &[zone::Fs],
Expand All @@ -631,14 +631,14 @@ impl InstalledZone {
let control_vnic =
underlay_vnic_allocator.new_control(None).map_err(|err| {
InstallZoneError::CreateVnic {
zone: zone_name.to_string(),
zone: zone_type.to_string(),
err,
}
})?;

let full_zone_name = Self::get_zone_name(zone_name, unique_name);
let full_zone_name = Self::get_zone_name(zone_type, unique_name);
let zone_image_path =
PathBuf::from(&format!("/opt/oxide/{}.tar.gz", zone_name));
PathBuf::from(&format!("/opt/oxide/{}.tar.gz", zone_type));

let net_device_names: Vec<String> = opte_ports
.iter()
Expand Down
1 change: 1 addition & 0 deletions illumos-utils/src/zpool.rs
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ impl Zpool {
}

#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum ZpoolKind {
// This zpool is used for external storage (u.2)
External,
Expand Down
110 changes: 98 additions & 12 deletions openapi/sled-agent.json
Original file line number Diff line number Diff line change
Expand Up @@ -640,13 +640,6 @@
{
"type": "object",
"properties": {
"all_addresses": {
"description": "The addresses of all nodes within the cluster.",
"type": "array",
"items": {
"type": "string"
}
},
"type": {
"type": "string",
"enum": [
Expand All @@ -655,7 +648,6 @@
}
},
"required": [
"all_addresses",
"type"
]
},
Expand Down Expand Up @@ -689,6 +681,21 @@
}
]
},
"DatasetName": {
"type": "object",
"properties": {
"kind": {
"$ref": "#/components/schemas/DatasetKind"
},
"pool_name": {
"$ref": "#/components/schemas/ZpoolName"
}
},
"required": [
"kind",
"pool_name"
]
},
"DendriteAsic": {
"type": "string",
"enum": [
Expand Down Expand Up @@ -1738,7 +1745,7 @@
"pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$"
},
"ServiceEnsureBody": {
"description": "Used to request that the Sled initialize certain services on initialization.\n\nThis may be used to record that certain sleds are responsible for launching services which may not be associated with a dataset, such as Nexus.",
"description": "Used to request that the Sled initialize multiple services.\n\nThis may be used to record that certain sleds are responsible for launching services which may not be associated with a dataset, such as Nexus.",
"type": "object",
"properties": {
"services": {
Expand Down Expand Up @@ -2036,6 +2043,48 @@
"mode",
"type"
]
},
{
"type": "object",
"properties": {
"type": {
"type": "string",
"enum": [
"clickhouse"
]
}
},
"required": [
"type"
]
},
{
"type": "object",
"properties": {
"type": {
"type": "string",
"enum": [
"cockroach_db"
]
}
},
"required": [
"type"
]
},
{
"type": "object",
"properties": {
"type": {
"type": "string",
"enum": [
"crucible"
]
}
},
"required": [
"type"
]
}
]
},
Expand All @@ -2050,6 +2099,15 @@
"format": "ipv6"
}
},
"dataset": {
"nullable": true,
"default": null,
"allOf": [
{
"$ref": "#/components/schemas/DatasetName"
}
]
},
"gz_addresses": {
"default": [],
"type": "array",
Expand Down Expand Up @@ -2080,6 +2138,7 @@
]
},
"ServiceZoneService": {
"description": "Used to request that the Sled initialize a single service.",
"type": "object",
"properties": {
"details": {
Expand Down Expand Up @@ -2481,13 +2540,16 @@
"description": "The type of zone which may be requested from Sled Agent",
"type": "string",
"enum": [
"clickhouse",
"cockroach_db",
"crucible_pantry",
"crucible",
"external_dns",
"internal_dns",
"nexus",
"ntp",
"oximeter",
"switch",
"crucible_pantry",
"ntp"
"switch"
]
},
"Zpool": {
Expand All @@ -2505,6 +2567,30 @@
"disk_type",
"id"
]
},
"ZpoolKind": {
"type": "string",
"enum": [
"external",
"internal"
]
},
"ZpoolName": {
"description": "A wrapper around a zpool name.\n\nThis expects that the format will be: `ox{i,p}_<UUID>` - we parse the prefix when reading the structure, and validate that the UUID can be utilized.",
"type": "object",
"properties": {
"id": {
"type": "string",
"format": "uuid"
},
"kind": {
"$ref": "#/components/schemas/ZpoolKind"
}
},
"required": [
"id",
"kind"
]
}
}
}
Expand Down
5 changes: 1 addition & 4 deletions sled-agent/src/bootstrap/hardware.rs
Original file line number Diff line number Diff line change
Expand Up @@ -178,10 +178,7 @@ impl HardwareMonitor {
let hardware = HardwareManager::new(log, sled_mode)
.map_err(|e| Error::Hardware(e))?;

// TODO: The coupling between the storage and service manager is growing
// pretty tight; we should consider merging them together.
let storage_manager =
StorageManager::new(&log, underlay_etherstub.clone()).await;
let storage_manager = StorageManager::new(&log).await;

let service_manager = ServiceManager::new(
log.clone(),
Expand Down
1 change: 1 addition & 0 deletions sled-agent/src/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ async fn filesystem_put(
let sa = rqctx.context();
let body_args = body.into_inner();
sa.filesystem_ensure(
body_args.id,
body_args.zpool_id,
body_args.dataset_kind,
body_args.address,
Expand Down
1 change: 1 addition & 0 deletions sled-agent/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ mod services;
mod sled_agent;
mod smf_helper;
pub mod sp;
pub(crate) mod storage;
mod storage_manager;
mod updates;

Expand Down
Loading

0 comments on commit ccc28fe

Please sign in to comment.