From 0c075fab3a73c89a6857b46c9b85afca968ee646 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 29 Oct 2024 18:40:10 +0200 Subject: [PATCH 01/32] Add --replica parameter to basebackup (#9553) ## Problem See https://github.com/neondatabase/neon/pull/9458 This PR separates PS related changes in #9458 from compute_ctl changes to enforce that PS is deployed before compute. ## Summary of changes This PR adds handling of the `--replica` parameter of basebackup to the page server. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pageserver/src/basebackup.rs | 11 ++++++++-- pageserver/src/page_service.rs | 37 +++++++++++++++++++++------------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 975318419f65..cae0ffb9805b 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -59,6 +59,7 @@ pub async fn send_basebackup_tarball<'a, W>( req_lsn: Option, prev_lsn: Option, full_backup: bool, + replica: bool, ctx: &'a RequestContext, ) -> Result<(), BasebackupError> where @@ -110,8 +111,8 @@ where }; info!( - "taking basebackup lsn={}, prev_lsn={} (full_backup={})", - backup_lsn, prev_lsn, full_backup + "taking basebackup lsn={}, prev_lsn={} (full_backup={}, replica={})", + backup_lsn, prev_lsn, full_backup, replica ); let basebackup = Basebackup { @@ -120,6 +121,7 @@ where lsn: backup_lsn, prev_record_lsn: prev_lsn, full_backup, + replica, ctx, }; basebackup @@ -140,6 +142,7 @@ where lsn: Lsn,
prev_record_lsn: Lsn, full_backup: bool, + replica: bool, ctx: &'a RequestContext, } @@ -372,6 +375,10 @@ where for (path, content) in aux_files { if path.starts_with("pg_replslot") { + // Do not create LR slots at standby because they are not used but prevent WAL truncation + if self.replica { + continue; + } let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; let restart_lsn = Lsn(u64::from_le_bytes( content[offs..offs + 8].try_into().unwrap(), diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 62b14cb83e55..aed8a878515f 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1080,6 +1080,7 @@ impl PageServerHandler { prev_lsn: Option, full_backup: bool, gzip: bool, + replica: bool, ctx: &RequestContext, ) -> Result<(), QueryError> where @@ -1132,6 +1133,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, + replica, ctx, ) .await @@ -1154,6 +1156,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, + replica, ctx, ) .await @@ -1170,6 +1173,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, + replica, ctx, ) .await @@ -1326,24 +1330,27 @@ where .for_command(ComputeCommandKind::Basebackup) .inc(); - let (lsn, gzip) = match (params.get(2), params.get(3)) { - (None, _) => (None, false), - (Some(&"--gzip"), _) => (None, true), - (Some(lsn_str), gzip_str_opt) => { - let lsn = Lsn::from_str(lsn_str) - .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?; - let gzip = match gzip_str_opt { - Some(&"--gzip") => true, - None => false, - Some(third_param) => { + let mut lsn = None; + let mut replica = false; + let mut gzip = false; + for param in ¶ms[2..] 
{ + if param.starts_with("--") { + match *param { + "--gzip" => gzip = true, + "--replica" => replica = true, + _ => { return Err(QueryError::Other(anyhow::anyhow!( - "Parameter in position 3 unknown {third_param}", + "Unknown parameter {param}", ))) } - }; - (Some(lsn), gzip) + } + } else { + lsn = Some( + Lsn::from_str(param) + .with_context(|| format!("Failed to parse Lsn from {param}"))?, + ); } - }; + } let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx); let res = async { @@ -1355,6 +1362,7 @@ where None, false, gzip, + replica, &ctx, ) .await?; @@ -1415,6 +1423,7 @@ where prev_lsn, true, false, + false, &ctx, ) .await?; From 88ff8a78032d4b58ac9d44efbd8cd4ae2be6040d Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 29 Oct 2024 14:25:32 -0400 Subject: [PATCH 02/32] feat(pageserver): support partial gc-compaction for lowest retain lsn (#9134) part of https://github.com/neondatabase/neon/issues/8921, https://github.com/neondatabase/neon/issues/9114 ## Summary of changes We start the partial compaction implementation with the image layer partial generation. The partial compaction API now takes a key range. We will only generate images for that key range for now, and remove layers fully included in the key range after compaction. 
--------- Signed-off-by: Alex Chi Z Co-authored-by: Christian Schwarz --- pageserver/src/tenant.rs | 264 +++++++++++++++++-- pageserver/src/tenant/timeline/compaction.rs | 221 ++++++++++++---- 2 files changed, 410 insertions(+), 75 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 64e4eb46ce5f..6ac11b0ae130 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5138,6 +5138,7 @@ mod tests { use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; use pageserver_api::value::Value; + use pageserver_compaction::helpers::overlaps_with; use rand::{thread_rng, Rng}; use storage_layer::PersistentLayerKey; use tests::storage_layer::ValuesReconstructState; @@ -7660,23 +7661,7 @@ mod tests { } // Check if old layers are removed / new layers have the expected LSN - let mut all_layers = tline.inspect_historic_layers().await.unwrap(); - all_layers.sort_by(|k1, k2| { - ( - k1.is_delta, - k1.key_range.start, - k1.key_range.end, - k1.lsn_range.start, - k1.lsn_range.end, - ) - .cmp(&( - k2.is_delta, - k2.key_range.start, - k2.key_range.end, - k2.lsn_range.start, - k2.lsn_range.end, - )) - }); + let all_layers = inspect_and_sort(&tline, None).await; assert_eq!( all_layers, vec![ @@ -9220,4 +9205,249 @@ mod tests { Ok(()) } + + async fn inspect_and_sort( + tline: &Arc, + filter: Option>, + ) -> Vec { + let mut all_layers = tline.inspect_historic_layers().await.unwrap(); + if let Some(filter) = filter { + all_layers.retain(|layer| overlaps_with(&layer.key_range, &filter)); + } + all_layers.sort_by(|k1, k2| { + ( + k1.is_delta, + k1.key_range.start, + k1.key_range.end, + k1.lsn_range.start, + k1.lsn_range.end, + ) + .cmp(&( + k2.is_delta, + k2.key_range.start, + k2.key_range.end, + k2.lsn_range.start, + k2.lsn_range.end, + )) + }); + all_layers + } + + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_simple_partial_bottom_most_compaction() -> anyhow::Result<()> 
{ + let harness = TenantHarness::create("test_simple_partial_bottom_most_compaction").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. + let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + // img layer at 0x10 + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::Image(Bytes::from("value 1@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::Image(Bytes::from("value 2@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::Image(Bytes::from("value 3@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::Image(Bytes::from("value 5@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::Image(Bytes::from("value 6@0x20")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::Image(Bytes::from("value 8@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::Image(Bytes::from("value 9@0x48")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No)], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let cancel = CancellationToken::new(); + + // Do a partial compaction on key 
range 0..4, we should generate a image layer; no other layers + // can be removed because they might be used for other key ranges. + tline + .partial_compact_with_gc(Some(get_key(0)..get_key(4)), &cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + assert_eq!( + all_layers, + vec![ + PersistentLayerKey { + key_range: get_key(0)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false + }, + PersistentLayerKey { + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false + }, + PersistentLayerKey { + key_range: get_key(1)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x48), + is_delta: true + }, + PersistentLayerKey { + key_range: get_key(5)..get_key(7), + lsn_range: Lsn(0x20)..Lsn(0x48), + is_delta: true + }, + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x48)..Lsn(0x50), + is_delta: true + } + ] + ); + + // Do a partial compaction on key range 4..10 + tline + .partial_compact_with_gc(Some(get_key(4)..get_key(10)), &cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + assert_eq!( + all_layers, + vec![ + PersistentLayerKey { + key_range: get_key(0)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false + }, + PersistentLayerKey { + // if (in the future) GC kicks in, this layer will be removed + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false + }, + PersistentLayerKey { + key_range: get_key(4)..get_key(10), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false + }, + PersistentLayerKey { + key_range: get_key(1)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x48), + is_delta: true + }, + PersistentLayerKey { + key_range: get_key(5)..get_key(7), + lsn_range: Lsn(0x20)..Lsn(0x48), + is_delta: true + }, + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: 
Lsn(0x48)..Lsn(0x50), + is_delta: true + } + ] + ); + + // Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones. + tline + .partial_compact_with_gc(Some(get_key(0)..get_key(10)), &cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + assert_eq!( + all_layers, + vec![ + PersistentLayerKey { + key_range: get_key(0)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false + }, + PersistentLayerKey { + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false + }, + PersistentLayerKey { + key_range: get_key(4)..get_key(10), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false + }, + PersistentLayerKey { + key_range: get_key(1)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x48), + is_delta: true + }, + PersistentLayerKey { + key_range: get_key(5)..get_key(7), + lsn_range: Lsn(0x20)..Lsn(0x48), + is_delta: true + }, + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x48)..Lsn(0x50), + is_delta: true + } + ] + ); + + Ok(()) + } } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 70f93656cdee..01c280388177 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1716,20 +1716,32 @@ impl Timeline { Ok(()) } + pub(crate) async fn compact_with_gc( + self: &Arc, + cancel: &CancellationToken, + flags: EnumSet, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.partial_compact_with_gc(None, cancel, flags, ctx).await + } + /// An experimental compaction building block that combines compaction with garbage collection. /// /// The current implementation picks all delta + image layers that are below or intersecting with /// the GC horizon without considering retain_lsns. 
Then, it does a full compaction over all these delta /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon, /// and create delta layers with all deltas >= gc horizon. - pub(crate) async fn compact_with_gc( + /// + /// If `key_range`, it will only compact the keys within the range, aka partial compaction. This functionality + /// is not complete yet, and if it is set, only image layers will be generated. + /// + pub(crate) async fn partial_compact_with_gc( self: &Arc, + compaction_key_range: Option>, cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, ) -> anyhow::Result<()> { - use std::collections::BTreeSet; - // Block other compaction/GC tasks from running for now. GC-compaction could run along // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. // Note that we already acquired the compaction lock when the outer `compact` function gets called. @@ -1750,8 +1762,13 @@ impl Timeline { .await?; let dry_run = flags.contains(CompactFlags::DryRun); + let partial_compaction = compaction_key_range.is_some(); - info!("running enhanced gc bottom-most compaction, dry_run={dry_run}"); + if let Some(ref compaction_key_range) = compaction_key_range { + info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compaction_key_range={}..{}", compaction_key_range.start, compaction_key_range.end); + } else { + info!("running enhanced gc bottom-most compaction, dry_run={dry_run}"); + } scopeguard::defer! { info!("done enhanced gc bottom-most compaction"); @@ -1763,7 +1780,7 @@ impl Timeline { // The layer selection has the following properties: // 1. If a layer is in the selection, all layers below it are in the selection. // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection. 
- let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = { + let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = if !partial_compaction { let guard = self.layers.read().await; let layers = guard.layer_map()?; let gc_info = self.gc_info.read().unwrap(); @@ -1779,7 +1796,7 @@ impl Timeline { retain_lsns_below_horizon.push(*lsn); } } - let mut selected_layers = Vec::new(); + let mut selected_layers: Vec = Vec::new(); drop(gc_info); // Pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers. let Some(max_layer_lsn) = layers @@ -1804,8 +1821,52 @@ impl Timeline { } retain_lsns_below_horizon.sort(); (selected_layers, gc_cutoff, retain_lsns_below_horizon) + } else { + // In case of partial compaction, we currently only support generating image layers, and therefore, + // we pick all layers that are below the lowest retain_lsn and does not intersect with any of the layers. + let guard = self.layers.read().await; + let layers = guard.layer_map()?; + let gc_info = self.gc_info.read().unwrap(); + let mut min_lsn = gc_info.cutoffs.select_min(); + for (lsn, _, _) in &gc_info.retain_lsns { + if lsn < &min_lsn { + min_lsn = *lsn; + } + } + for lsn in gc_info.leases.keys() { + if lsn < &min_lsn { + min_lsn = *lsn; + } + } + let mut selected_layers = Vec::new(); + drop(gc_info); + // |-------| |-------| |-------| + // | Delta | | Delta | | Delta | -- min_lsn could be intersecting with the layers + // |-------| |-------| |-------| <- we want to pick all the layers below min_lsn, so that + // | Delta | | Delta | | Delta | ...we can remove them after compaction + // |-------| |-------| |-------| + // Pick all the layers intersect or below the min_lsn, get the largest LSN in the selected layers. 
+ let Some(compaction_key_range) = compaction_key_range.as_ref() else { + unreachable!() + }; + for desc in layers.iter_historic_layers() { + if desc.get_lsn_range().end <= min_lsn + && overlaps_with(&desc.key_range, compaction_key_range) + { + selected_layers.push(guard.get_from_desc(&desc)); + } + } + if selected_layers.is_empty() { + info!("no layers to compact with gc"); + return Ok(()); + } + (selected_layers, min_lsn, Vec::new()) }; let lowest_retain_lsn = if self.ancestor_timeline.is_some() { + if partial_compaction { + warn!("partial compaction cannot run on child branches (for now)"); + return Ok(()); + } Lsn(self.ancestor_lsn.0 + 1) } else { let res = retain_lsns_below_horizon @@ -1833,23 +1894,18 @@ impl Timeline { self.check_compaction_space(&layer_selection).await?; - // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. - // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. - let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) + // Generate statistics for the compaction for layer in &layer_selection { let desc = layer.layer_desc(); if desc.is_delta() { - // ignore single-key layer files - if desc.key_range.start.next() != desc.key_range.end { - let lsn_range = &desc.lsn_range; - lsn_split_point.insert(lsn_range.start); - lsn_split_point.insert(lsn_range.end); - } stat.visit_delta_layer(desc.file_size()); } else { stat.visit_image_layer(desc.file_size()); } } + + // Step 1: construct a k-merge iterator over all layers. + // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. 
let layer_names: Vec = layer_selection .iter() .map(|layer| layer.layer_desc().layer_name()) @@ -1900,7 +1956,10 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, - Key::MIN, + compaction_key_range + .as_ref() + .map(|x| x.start) + .unwrap_or(Key::MIN), lowest_retain_lsn, self.get_compaction_target_size(), ctx, @@ -1961,55 +2020,71 @@ impl Timeline { } else { let last_key = last_key.as_mut().unwrap(); stat.on_unique_key_visited(); - let retention = self - .generate_key_retention( - *last_key, - &accumulated_values, - gc_cutoff, - &retain_lsns_below_horizon, - COMPACTION_DELTA_THRESHOLD, - get_ancestor_image(self, *last_key, ctx).await?, - ) - .await?; - // Put the image into the image layer. Currently we have a single big layer for the compaction. - retention - .pipe_to( - *last_key, - &mut delta_layer_writer, - image_layer_writer.as_mut(), - &mut stat, - ctx, - ) - .await?; + let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range { + !compaction_key_range.contains(last_key) + } else { + false + }; + if !skip_adding_key { + let retention = self + .generate_key_retention( + *last_key, + &accumulated_values, + gc_cutoff, + &retain_lsns_below_horizon, + COMPACTION_DELTA_THRESHOLD, + get_ancestor_image(self, *last_key, ctx).await?, + ) + .await?; + // Put the image into the image layer. Currently we have a single big layer for the compaction. 
+ retention + .pipe_to( + *last_key, + &mut delta_layer_writer, + image_layer_writer.as_mut(), + &mut stat, + ctx, + ) + .await?; + } accumulated_values.clear(); *last_key = key; accumulated_values.push((key, lsn, val)); } } + // TODO: move the below part to the loop body let last_key = last_key.expect("no keys produced during compaction"); - // TODO: move this part to the loop body stat.on_unique_key_visited(); - let retention = self - .generate_key_retention( - last_key, - &accumulated_values, - gc_cutoff, - &retain_lsns_below_horizon, - COMPACTION_DELTA_THRESHOLD, - get_ancestor_image(self, last_key, ctx).await?, - ) - .await?; - // Put the image into the image layer. Currently we have a single big layer for the compaction. - retention - .pipe_to( - last_key, - &mut delta_layer_writer, - image_layer_writer.as_mut(), - &mut stat, - ctx, - ) - .await?; + + let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range { + !compaction_key_range.contains(&last_key) + } else { + false + }; + if !skip_adding_key { + let retention = self + .generate_key_retention( + last_key, + &accumulated_values, + gc_cutoff, + &retain_lsns_below_horizon, + COMPACTION_DELTA_THRESHOLD, + get_ancestor_image(self, last_key, ctx).await?, + ) + .await?; + // Put the image into the image layer. Currently we have a single big layer for the compaction. + retention + .pipe_to( + last_key, + &mut delta_layer_writer, + image_layer_writer.as_mut(), + &mut stat, + ctx, + ) + .await?; + } + // end: move the above part to the loop body let discard = |key: &PersistentLayerKey| { let key = key.clone(); @@ -2018,8 +2093,12 @@ impl Timeline { let produced_image_layers = if let Some(writer) = image_layer_writer { if !dry_run { + let end_key = compaction_key_range + .as_ref() + .map(|x| x.end) + .unwrap_or(Key::MAX); writer - .finish_with_discard_fn(self, ctx, Key::MAX, discard) + .finish_with_discard_fn(self, ctx, end_key, discard) .await? 
} else { drop(writer); @@ -2038,6 +2117,10 @@ impl Timeline { Vec::new() }; + if partial_compaction && !produced_delta_layers.is_empty() { + bail!("implementation error: partial compaction should not be producing delta layers (for now)"); + } + let mut compact_to = Vec::new(); let mut keep_layers = HashSet::new(); let produced_delta_layers_len = produced_delta_layers.len(); @@ -2068,6 +2151,28 @@ impl Timeline { } let mut layer_selection = layer_selection; layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); + if let Some(ref compaction_key_range) = compaction_key_range { + // Partial compaction might select more data than it processes, e.g., if + // the compaction_key_range only partially overlaps: + // + // [---compaction_key_range---] + // [---A----][----B----][----C----][----D----] + // + // A,B,C,D are all in the `layer_selection`. The created image layers contain + // whatever is needed from B, C, and from `----]` of A, and from `[--` of D. + // + // In contrast, `[--A-` and `--D----]` have not been processed, so, we must + // keep that data. + // + // The solution for now is to keep A and D completely. + // (layer_selection is what we'll remove from the layer map, so, + // retain what is _not_ fully covered by compaction_key_range). + layer_selection.retain(|x| { + let key_range = &x.layer_desc().key_range; + key_range.start >= compaction_key_range.start + && key_range.end <= compaction_key_range.end + }); + } info!( "gc-compaction statistics: {}", From 81f9aba0057fb6efefa71aaa57fe2b4ec93899ad Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 29 Oct 2024 15:16:23 -0400 Subject: [PATCH 03/32] fix(pagectl): layer parsing and image layer dump (#9571) This patch contains various improvements for the pagectl tool. ## Summary of changes * Rewrite layer name parsing: LayerName now supports all variants we use now. * Drop pagectl's own layer parsing function, use LayerName in the pageserver crate. 
* Support image layer dumping in the layer dump command using ImageLayer::dump, drop the original implementation. Signed-off-by: Alex Chi Z --- pageserver/ctl/src/layer_map_analyzer.rs | 43 ++----- pageserver/ctl/src/layers.rs | 70 ++++-------- .../src/tenant/storage_layer/delta_layer.rs | 4 +- .../src/tenant/storage_layer/image_layer.rs | 2 +- .../src/tenant/storage_layer/layer_name.rs | 105 +++++++++--------- 5 files changed, 91 insertions(+), 133 deletions(-) diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 451d2a1d692c..11b8e98f57d0 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -2,7 +2,7 @@ //! //! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data. -use anyhow::Result; +use anyhow::{anyhow, Result}; use camino::{Utf8Path, Utf8PathBuf}; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::task_mgr::TaskKind; @@ -11,13 +11,14 @@ use pageserver::virtual_file::api::IoMode; use std::cmp::Ordering; use std::collections::BinaryHeap; use std::ops::Range; +use std::str::FromStr; use std::{fs, str}; use pageserver::page_cache::{self, PAGE_SZ}; use pageserver::tenant::block_io::FileBlockReader; use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection}; use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE}; -use pageserver::tenant::storage_layer::range_overlaps; +use pageserver::tenant::storage_layer::{range_overlaps, LayerName}; use pageserver::virtual_file::{self, VirtualFile}; use pageserver_api::key::{Key, KEY_SIZE}; @@ -74,35 +75,15 @@ impl LayerFile { } } -pub(crate) fn parse_filename(name: &str) -> Option { - let split: Vec<&str> = name.split("__").collect(); - if split.len() != 2 { - return None; - } - let keys: Vec<&str> = 
split[0].split('-').collect(); - let lsn_and_opt_generation: Vec<&str> = split[1].split('v').collect(); - let lsns: Vec<&str> = lsn_and_opt_generation[0].split('-').collect(); - let the_lsns: [&str; 2]; - - /* - * Generations add a -vX-XXXXXX postfix, which causes issues when we try to - * parse 'vX' as an LSN. - */ - let is_delta = if lsns.len() == 1 || lsns[1].is_empty() { - the_lsns = [lsns[0], lsns[0]]; - false - } else { - the_lsns = [lsns[0], lsns[1]]; - true - }; - - let key_range = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap(); - let lsn_range = Lsn::from_hex(the_lsns[0]).unwrap()..Lsn::from_hex(the_lsns[1]).unwrap(); +pub(crate) fn parse_filename(name: &str) -> anyhow::Result { + let layer_name = + LayerName::from_str(name).map_err(|e| anyhow!("failed to parse layer name: {e}"))?; + let holes = Vec::new(); - Some(LayerFile { - key_range, - lsn_range, - is_delta, + Ok(LayerFile { + key_range: layer_name.key_range().clone(), + lsn_range: layer_name.lsn_as_range(), + is_delta: layer_name.is_delta(), holes, }) } @@ -179,7 +160,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { for layer in fs::read_dir(timeline.path())? 
{ let layer = layer?; - if let Some(mut layer_file) = + if let Ok(mut layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) { if layer_file.is_delta { diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 22627d72c8e3..6f543dcaa9ff 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -5,24 +5,12 @@ use camino::{Utf8Path, Utf8PathBuf}; use clap::Subcommand; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::task_mgr::TaskKind; -use pageserver::tenant::block_io::BlockCursor; -use pageserver::tenant::disk_btree::DiskBtreeReader; -use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary}; use pageserver::tenant::storage_layer::{delta_layer, image_layer}; use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer}; use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use pageserver::virtual_file::api::IoMode; use pageserver::{page_cache, virtual_file}; -use pageserver::{ - tenant::{ - block_io::FileBlockReader, disk_btree::VisitDirection, - storage_layer::delta_layer::DELTA_KEY_SIZE, - }, - virtual_file::VirtualFile, -}; -use pageserver_api::key::{Key, KEY_SIZE}; -use std::fs; -use utils::bin_ser::BeSer; +use std::fs::{self, File}; use utils::id::{TenantId, TimelineId}; use crate::layer_map_analyzer::parse_filename; @@ -59,44 +47,30 @@ pub(crate) enum LayerCmd { } async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { - let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); virtual_file::init( 10, virtual_file::api::IoEngineKind::StdFs, IoMode::preferred(), ); page_cache::init(100); - let file = VirtualFile::open(path, ctx).await?; - let file_id = page_cache::next_file_id(); - let block_reader = FileBlockReader::new(&file, file_id); - let summary_blk = block_reader.read_blk(0, ctx).await?; - let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; - let tree_reader = DiskBtreeReader::<_, 
DELTA_KEY_SIZE>::new( - actual_summary.index_start_blk, - actual_summary.index_root_blk, - &block_reader, + let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); + let file = File::open(path)?; + let delta_layer = DeltaLayer::new_for_path(path, file)?; + delta_layer.dump(true, ctx).await?; + Ok(()) +} + +async fn read_image_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { + virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + IoMode::preferred(), ); - // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API. - let mut all = vec![]; - tree_reader - .visit( - &[0u8; DELTA_KEY_SIZE], - VisitDirection::Forwards, - |key, value_offset| { - let curr = Key::from_slice(&key[..KEY_SIZE]); - all.push((curr, BlobRef(value_offset))); - true - }, - ctx, - ) - .await?; - let cursor = BlockCursor::new_fileblockreader(&block_reader); - for (k, v) in all { - let value = cursor.read_blob(v.pos(), ctx).await?; - println!("key:{} value_len:{}", k, value.len()); - assert!(k.is_i128_representable(), "invalid key: "); - } - // TODO(chi): special handling for last key? + page_cache::init(100); + let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); + let file = File::open(path)?; + let image_layer = ImageLayer::new_for_path(path, file)?; + image_layer.dump(true, ctx).await?; Ok(()) } @@ -133,8 +107,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { let mut idx = 0; for layer in fs::read_dir(timeline_path)? { let layer = layer?; - if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) - { + if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) { println!( "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}", idx, @@ -163,8 +136,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { let mut idx = 0; for layer in fs::read_dir(timeline_path)? 
{ let layer = layer?; - if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) - { + if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) { if *id == idx { // TODO(chi): dedup code println!( @@ -180,7 +152,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { if layer_file.is_delta { read_delta_file(layer.path(), &ctx).await?; } else { - anyhow::bail!("not supported yet :("); + read_image_file(layer.path(), &ctx).await?; } break; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 10165b1d068e..664c00a6b1c3 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -270,7 +270,7 @@ impl AsLayerDesc for DeltaLayer { } impl DeltaLayer { - pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { + pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { self.desc.dump(); if !verbose { @@ -1438,7 +1438,7 @@ impl DeltaLayerInner { offset } - pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> { + pub fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> { let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index c0d183dc08ce..834d1931d00f 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -231,7 +231,7 @@ impl AsLayerDesc for ImageLayer { } impl ImageLayer { - pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { + pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { self.desc.dump(); if !verbose { diff --git 
a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index 2b98d74f9f66..addf3b85d9c7 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -2,13 +2,11 @@ //! Helper functions for dealing with filenames of the image and delta layer files. //! use pageserver_api::key::Key; -use std::borrow::Cow; use std::cmp::Ordering; use std::fmt; use std::ops::Range; use std::str::FromStr; -use regex::Regex; use utils::lsn::Lsn; use super::PersistentLayerDesc; @@ -60,32 +58,31 @@ impl Ord for DeltaLayerName { /// Represents the region of the LSN-Key space covered by a DeltaLayer /// /// ```text -/// -__- +/// -__-- /// ``` impl DeltaLayerName { /// Parse the part of a delta layer's file name that represents the LayerName. Returns None /// if the filename does not match the expected pattern. pub fn parse_str(fname: &str) -> Option { - let mut parts = fname.split("__"); - let mut key_parts = parts.next()?.split('-'); - let mut lsn_parts = parts.next()?.split('-'); - - let key_start_str = key_parts.next()?; - let key_end_str = key_parts.next()?; - let lsn_start_str = lsn_parts.next()?; - let lsn_end_str = lsn_parts.next()?; - - if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() { - return None; - } - - if key_start_str.len() != 36 - || key_end_str.len() != 36 - || lsn_start_str.len() != 16 - || lsn_end_str.len() != 16 + let (key_parts, lsn_generation_parts) = fname.split_once("__")?; + let (key_start_str, key_end_str) = key_parts.split_once('-')?; + let (lsn_start_str, lsn_end_generation_parts) = lsn_generation_parts.split_once('-')?; + let lsn_end_str = if let Some((lsn_end_str, maybe_generation)) = + lsn_end_generation_parts.split_once('-') { - return None; - } + if maybe_generation.starts_with("v") { + // vY-XXXXXXXX + lsn_end_str + } else if maybe_generation.len() == 8 { + // XXXXXXXX + lsn_end_str + } else { + // no idea what 
this is + return None; + } + } else { + lsn_end_generation_parts + }; let key_start = Key::from_hex(key_start_str).ok()?; let key_end = Key::from_hex(key_end_str).ok()?; @@ -173,25 +170,29 @@ impl ImageLayerName { /// Represents the part of the Key-LSN space covered by an ImageLayer /// /// ```text -/// -__ +/// -__- /// ``` impl ImageLayerName { /// Parse a string as then LayerName part of an image layer file name. Returns None if the /// filename does not match the expected pattern. pub fn parse_str(fname: &str) -> Option { - let mut parts = fname.split("__"); - let mut key_parts = parts.next()?.split('-'); - - let key_start_str = key_parts.next()?; - let key_end_str = key_parts.next()?; - let lsn_str = parts.next()?; - if parts.next().is_some() || key_parts.next().is_some() { - return None; - } - - if key_start_str.len() != 36 || key_end_str.len() != 36 || lsn_str.len() != 16 { - return None; - } + let (key_parts, lsn_generation_parts) = fname.split_once("__")?; + let (key_start_str, key_end_str) = key_parts.split_once('-')?; + let lsn_str = + if let Some((lsn_str, maybe_generation)) = lsn_generation_parts.split_once('-') { + if maybe_generation.starts_with("v") { + // vY-XXXXXXXX + lsn_str + } else if maybe_generation.len() == 8 { + // XXXXXXXX + lsn_str + } else { + // likely a delta layer + return None; + } + } else { + lsn_generation_parts + }; let key_start = Key::from_hex(key_start_str).ok()?; let key_end = Key::from_hex(key_end_str).ok()?; @@ -258,6 +259,14 @@ impl LayerName { } } + /// Gets the LSN range encoded in the layer name. + pub fn lsn_as_range(&self) -> Range { + match &self { + LayerName::Image(layer) => layer.lsn_as_range(), + LayerName::Delta(layer) => layer.lsn_range.clone(), + } + } + pub fn is_delta(&self) -> bool { matches!(self, LayerName::Delta(_)) } @@ -290,18 +299,8 @@ impl FromStr for LayerName { /// Self. When loading a physical layer filename, we drop any extra information /// not needed to build Self. 
fn from_str(value: &str) -> Result { - let gen_suffix_regex = Regex::new("^(?.+)(?-v1-[0-9a-f]{8})$").unwrap(); - let file_name: Cow = match gen_suffix_regex.captures(value) { - Some(captures) => captures - .name("base") - .expect("Non-optional group") - .as_str() - .into(), - None => value.into(), - }; - - let delta = DeltaLayerName::parse_str(&file_name); - let image = ImageLayerName::parse_str(&file_name); + let delta = DeltaLayerName::parse_str(value); + let image = ImageLayerName::parse_str(value); let ok = match (delta, image) { (None, None) => { return Err(format!( @@ -367,11 +366,14 @@ mod test { lsn: Lsn::from_hex("00000000014FED58").unwrap(), }); let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap(); - assert_eq!(parsed, expected,); + assert_eq!(parsed, expected); + + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-00000001").unwrap(); + assert_eq!(parsed, expected); // Omitting generation suffix is valid let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap(); - assert_eq!(parsed, expected,); + assert_eq!(parsed, expected); } #[test] @@ -385,6 +387,9 @@ mod test { let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap(); assert_eq!(parsed, expected); + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-00000001").unwrap(); + assert_eq!(parsed, expected); + // Omitting generation suffix is valid let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap(); assert_eq!(parsed, expected); From 
b77b9bdc9fd1ef7b1b3d86ca20877e22fd8928f9 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 29 Oct 2024 15:13:06 -0500 Subject: [PATCH 04/32] Add tests for sql-exporter metrics Should help us keep non-working metrics from hitting staging or production. Co-authored-by: Heikki Linnakangas Fixes: https://github.com/neondatabase/neon/issues/8569 Signed-off-by: Tristan Partin --- build-tools.Dockerfile | 12 + compute/Makefile | 3 +- compute/compute-node.Dockerfile | 5 +- compute/etc/sql_exporter.jsonnet | 4 +- poetry.lock | 278 +++++++----- pyproject.toml | 6 + test_runner/fixtures/paths.py | 11 +- test_runner/regress/test_compute_metrics.py | 448 +++++++++++++++++++- 8 files changed, 651 insertions(+), 116 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 818cc1b6db92..93f1e48afae7 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -57,6 +57,18 @@ RUN set -e \ zstd \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +# sql_exporter + +# Keep the version the same as in compute/compute-node.Dockerfile and +# test_runner/regress/test_compute_metrics.py. 
+ENV SQL_EXPORTER_VERSION=0.13.1 +RUN curl -fsSL \ + "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \ + --output sql_exporter.tar.gz \ + && mkdir /tmp/sql_exporter \ + && tar xzvf sql_exporter.tar.gz -C /tmp/sql_exporter --strip-components=1 \ + && mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter + # protobuf-compiler (protoc) ENV PROTOC_VERSION=25.1 RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ diff --git a/compute/Makefile b/compute/Makefile index 645880ce70ab..0036196160cf 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -22,6 +22,7 @@ sql_exporter.yml: $(jsonnet_files) --output-file etc/$@ \ --tla-str collector_name=neon_collector \ --tla-str collector_file=neon_collector.yml \ + --tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter' \ etc/sql_exporter.jsonnet sql_exporter_autoscaling.yml: $(jsonnet_files) @@ -29,7 +30,7 @@ sql_exporter_autoscaling.yml: $(jsonnet_files) --output-file etc/$@ \ --tla-str collector_name=neon_collector_autoscaling \ --tla-str collector_file=neon_collector_autoscaling.yml \ - --tla-str application_name=sql_exporter_autoscaling \ + --tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling' \ etc/sql_exporter.jsonnet .PHONY: clean diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 85fb9c441d56..7e38ef8221b7 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1298,7 +1298,10 @@ RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin l 
######################################################################################### FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter -FROM burningalchemist/sql_exporter:0.13 AS sql-exporter + +# Keep the version the same as in build-tools.Dockerfile and +# test_runner/regress/test_compute_metrics.py. +FROM burningalchemist/sql_exporter:0.13.1 AS sql-exporter ######################################################################################### # diff --git a/compute/etc/sql_exporter.jsonnet b/compute/etc/sql_exporter.jsonnet index 3c36fd4f68b9..e957dfd86e3d 100644 --- a/compute/etc/sql_exporter.jsonnet +++ b/compute/etc/sql_exporter.jsonnet @@ -1,4 +1,4 @@ -function(collector_name, collector_file, application_name='sql_exporter') { +function(collector_name, collector_file, connection_string) { // Configuration for sql_exporter for autoscaling-agent // Global defaults. global: { @@ -23,7 +23,7 @@ function(collector_name, collector_file, application_name='sql_exporter') { target: { // Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) // the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: std.format('postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=%s', [application_name]), + data_source_name: connection_string, // Collectors (referenced by name) to execute on the target. // Glob patterns are supported (see for syntax). diff --git a/poetry.lock b/poetry.lock index 36ea82a446da..e06950cb5252 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -1034,24 +1034,25 @@ test-randomorder = ["pytest-randomly"] [[package]] name = "docker" -version = "4.2.2" +version = "7.1.0" description = "A Python library for the Docker Engine API." optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = ">=3.8" files = [ - {file = "docker-4.2.2-py2.py3-none-any.whl", hash = "sha256:03a46400c4080cb6f7aa997f881ddd84fef855499ece219d75fbdb53289c17ab"}, - {file = "docker-4.2.2.tar.gz", hash = "sha256:26eebadce7e298f55b76a88c4f8802476c5eaddbdbe38dbc6cce8781c47c9b54"}, + {file = "docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0"}, + {file = "docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c"}, ] [package.dependencies] -pypiwin32 = {version = "223", markers = "sys_platform == \"win32\" and python_version >= \"3.6\""} -requests = ">=2.14.2,<2.18.0 || >2.18.0" -six = ">=1.4.0" -websocket-client = ">=0.32.0" +pywin32 = {version = ">=304", markers = "sys_platform == \"win32\""} +requests = ">=2.26.0" +urllib3 = ">=1.26.0" [package.extras] -ssh = ["paramiko (>=2.4.2)"] -tls = ["cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=17.5.0)"] +dev = ["coverage (==7.2.7)", "pytest (==7.4.2)", "pytest-cov (==4.1.0)", "pytest-timeout (==2.1.0)", "ruff (==0.1.8)"] +docs = ["myst-parser (==0.18.0)", "sphinx (==5.1.1)"] +ssh = ["paramiko (>=2.4.3)"] +websockets = ["websocket-client (>=1.3.0)"] [[package]] name = "exceptiongroup" @@ -1416,6 +1417,16 @@ files = [ {file = "jsondiff-2.0.0.tar.gz", hash = "sha256:2795844ef075ec8a2b8d385c4d59f5ea48b08e7180fce3cb2787be0db00b1fb4"}, ] +[[package]] +name = "jsonnet" +version = "0.20.0" +description = "Python bindings for Jsonnet - The data templating language" +optional = false +python-versions = "*" +files = [ + {file = "jsonnet-0.20.0.tar.gz", hash = 
"sha256:7e770c7bf3a366b97b650a39430450f77612e74406731eb75c5bd59f3f104d4f"}, +] + [[package]] name = "jsonpatch" version = "1.32" @@ -2126,6 +2137,7 @@ files = [ {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"}, {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"}, @@ -2134,6 +2146,8 @@ files = [ {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = 
"sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"}, @@ -2340,20 +2354,6 @@ files = [ [package.extras] diagrams = ["jinja2", "railroad-diagrams"] -[[package]] -name = "pypiwin32" -version = "223" -description = "" -optional = false -python-versions = "*" -files = [ - {file = "pypiwin32-223-py3-none-any.whl", hash = "sha256:67adf399debc1d5d14dffc1ab5acacb800da569754fafdc576b2a039485aa775"}, - {file = "pypiwin32-223.tar.gz", hash = "sha256:71be40c1fbd28594214ecaecb58e7aa8b708eabfa0125c8a109ebd51edbd776a"}, -] - -[package.dependencies] -pywin32 = ">=223" - [[package]] name = "pyrsistent" version = "0.18.1" @@ -2573,80 +2573,91 @@ files = [ [[package]] name = "pywin32" -version = "301" +version = "308" description = "Python for Window Extensions" optional = false python-versions = "*" files = [ - {file = "pywin32-301-cp35-cp35m-win32.whl", hash = "sha256:93367c96e3a76dfe5003d8291ae16454ca7d84bb24d721e0b74a07610b7be4a7"}, - {file = "pywin32-301-cp35-cp35m-win_amd64.whl", hash = "sha256:9635df6998a70282bd36e7ac2a5cef9ead1627b0a63b17c731312c7a0daebb72"}, - {file = "pywin32-301-cp36-cp36m-win32.whl", hash = "sha256:c866f04a182a8cb9b7855de065113bbd2e40524f570db73ef1ee99ff0a5cc2f0"}, - {file = "pywin32-301-cp36-cp36m-win_amd64.whl", hash = "sha256:dafa18e95bf2a92f298fe9c582b0e205aca45c55f989937c52c454ce65b93c78"}, - {file = "pywin32-301-cp37-cp37m-win32.whl", hash = "sha256:98f62a3f60aa64894a290fb7494bfa0bfa0a199e9e052e1ac293b2ad3cd2818b"}, - {file = 
"pywin32-301-cp37-cp37m-win_amd64.whl", hash = "sha256:fb3b4933e0382ba49305cc6cd3fb18525df7fd96aa434de19ce0878133bf8e4a"}, - {file = "pywin32-301-cp38-cp38-win32.whl", hash = "sha256:88981dd3cfb07432625b180f49bf4e179fb8cbb5704cd512e38dd63636af7a17"}, - {file = "pywin32-301-cp38-cp38-win_amd64.whl", hash = "sha256:8c9d33968aa7fcddf44e47750e18f3d034c3e443a707688a008a2e52bbef7e96"}, - {file = "pywin32-301-cp39-cp39-win32.whl", hash = "sha256:595d397df65f1b2e0beaca63a883ae6d8b6df1cdea85c16ae85f6d2e648133fe"}, - {file = "pywin32-301-cp39-cp39-win_amd64.whl", hash = "sha256:87604a4087434cd814ad8973bd47d6524bd1fa9e971ce428e76b62a5e0860fdf"}, + {file = "pywin32-308-cp310-cp310-win32.whl", hash = "sha256:796ff4426437896550d2981b9c2ac0ffd75238ad9ea2d3bfa67a1abd546d262e"}, + {file = "pywin32-308-cp310-cp310-win_amd64.whl", hash = "sha256:4fc888c59b3c0bef905ce7eb7e2106a07712015ea1c8234b703a088d46110e8e"}, + {file = "pywin32-308-cp310-cp310-win_arm64.whl", hash = "sha256:a5ab5381813b40f264fa3495b98af850098f814a25a63589a8e9eb12560f450c"}, + {file = "pywin32-308-cp311-cp311-win32.whl", hash = "sha256:5d8c8015b24a7d6855b1550d8e660d8daa09983c80e5daf89a273e5c6fb5095a"}, + {file = "pywin32-308-cp311-cp311-win_amd64.whl", hash = "sha256:575621b90f0dc2695fec346b2d6302faebd4f0f45c05ea29404cefe35d89442b"}, + {file = "pywin32-308-cp311-cp311-win_arm64.whl", hash = "sha256:100a5442b7332070983c4cd03f2e906a5648a5104b8a7f50175f7906efd16bb6"}, + {file = "pywin32-308-cp312-cp312-win32.whl", hash = "sha256:587f3e19696f4bf96fde9d8a57cec74a57021ad5f204c9e627e15c33ff568897"}, + {file = "pywin32-308-cp312-cp312-win_amd64.whl", hash = "sha256:00b3e11ef09ede56c6a43c71f2d31857cf7c54b0ab6e78ac659497abd2834f47"}, + {file = "pywin32-308-cp312-cp312-win_arm64.whl", hash = "sha256:9b4de86c8d909aed15b7011182c8cab38c8850de36e6afb1f0db22b8959e3091"}, + {file = "pywin32-308-cp313-cp313-win32.whl", hash = "sha256:1c44539a37a5b7b21d02ab34e6a4d314e0788f1690d65b48e9b0b89f31abbbed"}, + {file = 
"pywin32-308-cp313-cp313-win_amd64.whl", hash = "sha256:fd380990e792eaf6827fcb7e187b2b4b1cede0585e3d0c9e84201ec27b9905e4"}, + {file = "pywin32-308-cp313-cp313-win_arm64.whl", hash = "sha256:ef313c46d4c18dfb82a2431e3051ac8f112ccee1a34f29c263c583c568db63cd"}, + {file = "pywin32-308-cp37-cp37m-win32.whl", hash = "sha256:1f696ab352a2ddd63bd07430080dd598e6369152ea13a25ebcdd2f503a38f1ff"}, + {file = "pywin32-308-cp37-cp37m-win_amd64.whl", hash = "sha256:13dcb914ed4347019fbec6697a01a0aec61019c1046c2b905410d197856326a6"}, + {file = "pywin32-308-cp38-cp38-win32.whl", hash = "sha256:5794e764ebcabf4ff08c555b31bd348c9025929371763b2183172ff4708152f0"}, + {file = "pywin32-308-cp38-cp38-win_amd64.whl", hash = "sha256:3b92622e29d651c6b783e368ba7d6722b1634b8e70bd376fd7610fe1992e19de"}, + {file = "pywin32-308-cp39-cp39-win32.whl", hash = "sha256:7873ca4dc60ab3287919881a7d4f88baee4a6e639aa6962de25a98ba6b193341"}, + {file = "pywin32-308-cp39-cp39-win_amd64.whl", hash = "sha256:71b3322d949b4cc20776436a9c9ba0eeedcbc9c650daa536df63f0ff111bb920"}, ] [[package]] name = "pyyaml" -version = "6.0.1" +version = "6.0.2" description = "YAML parser and emitter for Python" optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" files = [ - {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, - {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"}, - {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, - {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, - {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, - {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, - {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, - {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, - {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, - {file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"}, - {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, - {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, - {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, - {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, - {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, - {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = 
"PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, - {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, - {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, - {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, - {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, - {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, - {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, - {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"}, - {file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"}, - {file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"}, - {file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"}, - {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"}, - {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"}, - {file = 
"PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"}, - {file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"}, - {file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"}, - {file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"}, - {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, - {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, - {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, - {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, - {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, - {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, - {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, - {file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"}, - {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, - {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", 
hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, - {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, - {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, - {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, - {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, - {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, + {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, + {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"}, + {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"}, + {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"}, + {file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"}, + {file = 
"PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"}, + {file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"}, + {file = "PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"}, + {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"}, + {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"}, + {file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"}, + {file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"}, + {file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"}, + {file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"}, + {file = 
"PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"}, + {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"}, + {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"}, + {file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"}, + {file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"}, + {file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"}, + {file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"}, + {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"}, + {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"}, + {file = 
"PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"}, + {file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"}, + {file = "PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083"}, + {file = "PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706"}, + {file = "PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a"}, + {file = "PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff"}, + {file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"}, + {file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", 
hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"}, + {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"}, + {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"}, + {file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"}, + {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"}, + {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, ] [[package]] @@ -2901,6 +2912,58 @@ files = [ [package.dependencies] mpmath = ">=0.19" +[[package]] +name = "testcontainers" +version = "4.8.1" +description = "Python library for throwaway instances of anything that can run in a Docker container" +optional = false +python-versions = "<4.0,>=3.9" +files = [ + {file = "testcontainers-4.8.1-py3-none-any.whl", hash = "sha256:d8ae43e8fe34060fcd5c3f494e0b7652b7774beabe94568a2283d0881e94d489"}, + {file = "testcontainers-4.8.1.tar.gz", hash = "sha256:5ded4820b7227ad526857eb3caaafcabce1bbac05d22ad194849b136ffae3cb0"}, +] + +[package.dependencies] +docker = "*" +typing-extensions = "*" +urllib3 = "*" +wrapt = "*" + +[package.extras] +arangodb = ["python-arango (>=7.8,<8.0)"] +aws = ["boto3", "httpx"] +azurite = ["azure-storage-blob (>=12.19,<13.0)"] +chroma = ["chromadb-client"] +clickhouse = ["clickhouse-driver"] +cosmosdb = ["azure-cosmos"] +db2 = ["ibm_db_sa", "sqlalchemy"] +generic = ["httpx", "redis"] +google = ["google-cloud-datastore (>=2)", "google-cloud-pubsub (>=2)"] +influxdb = ["influxdb", "influxdb-client"] +k3s = ["kubernetes", "pyyaml"] +keycloak = ["python-keycloak"] +localstack = ["boto3"] +mailpit = ["cryptography"] +minio = ["minio"] +mongodb = ["pymongo"] +mssql = 
["pymssql", "sqlalchemy"] +mysql = ["pymysql[rsa]", "sqlalchemy"] +nats = ["nats-py"] +neo4j = ["neo4j"] +opensearch = ["opensearch-py"] +oracle = ["oracledb", "sqlalchemy"] +oracle-free = ["oracledb", "sqlalchemy"] +qdrant = ["qdrant-client"] +rabbitmq = ["pika"] +redis = ["redis"] +registry = ["bcrypt"] +scylla = ["cassandra-driver (==3.29.1)"] +selenium = ["selenium"] +sftp = ["cryptography"] +test-module-import = ["httpx"] +trino = ["trino"] +weaviate = ["weaviate-client (>=4.5.4,<5.0.0)"] + [[package]] name = "toml" version = "0.10.2" @@ -2970,6 +3033,17 @@ files = [ {file = "types_pytest_lazy_fixture-0.6.3.3-py3-none-any.whl", hash = "sha256:a56a55649147ff960ff79d4b2c781a4f769351abc1876873f3116d0bd0c96353"}, ] +[[package]] +name = "types-pyyaml" +version = "6.0.12.20240917" +description = "Typing stubs for PyYAML" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-PyYAML-6.0.12.20240917.tar.gz", hash = "sha256:d1405a86f9576682234ef83bcb4e6fff7c9305c8b1fbad5e0bcd4f7dbdc9c587"}, + {file = "types_PyYAML-6.0.12.20240917-py3-none-any.whl", hash = "sha256:392b267f1c0fe6022952462bf5d6523f31e37f6cea49b14cee7ad634b6301570"}, +] + [[package]] name = "types-requests" version = "2.31.0.0" @@ -3044,22 +3118,6 @@ brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotl secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] -[[package]] -name = "websocket-client" -version = "1.3.3" -description = "WebSocket client for Python with low level API options" -optional = false -python-versions = ">=3.7" -files = [ - {file = "websocket-client-1.3.3.tar.gz", hash = "sha256:d58c5f284d6a9bf8379dab423259fe8f85b70d5fa5d2916d5791a84594b122b1"}, - {file = "websocket_client-1.3.3-py3-none-any.whl", hash = "sha256:5d55652dc1d0b3c734f044337d929aaf83f4f9138816ec680c1aefefb4dc4877"}, -] - -[package.extras] -docs = ["Sphinx (>=3.4)", 
"sphinx-rtd-theme (>=0.5)"] -optional = ["python-socks", "wsaccel"] -test = ["websockets"] - [[package]] name = "websockets" version = "12.0" @@ -3184,6 +3242,16 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, + {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = 
"sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, + {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -3421,4 +3489,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "ad5c9ee7723359af22bbd7fa41538dcf78913c02e947a13a8f9a87eb3a59039e" +content-hash = "13bfc7479aacfe051abb92252b8ddc2e0c429f4607b2d9d8c4b353d2f75c1927" diff --git a/pyproject.toml b/pyproject.toml index faa5f9123c8f..3f21094ba42f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,10 @@ kafka-python = "^2.0.2" jwcrypto = "^1.5.6" h2 = "^4.1.0" types-jwcrypto = "^1.5.0.20240925" +pyyaml = "^6.0.2" +types-pyyaml = "^6.0.12.20240917" +testcontainers = "^4.8.1" +jsonnet = "^0.20.0" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" @@ -73,12 +77,14 @@ strict = true [[tool.mypy.overrides]] module = [ + "_jsonnet.*", "asyncpg.*", "pg8000.*", "allure.*", "allure_commons.*", "allure_pytest.*", "kafka.*", + "testcontainers.*", ] ignore_missing_imports = true diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py index 65f8e432b05e..d950f2356d3a 100644 --- a/test_runner/fixtures/paths.py +++ b/test_runner/fixtures/paths.py @@ -21,6 +21,8 @@ from typing import Optional +BASE_DIR = Path(__file__).parents[2] +COMPUTE_CONFIG_DIR = BASE_DIR / "compute" / "etc" DEFAULT_OUTPUT_DIR: str = "test_output" @@ -64,18 +66,17 @@ def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path: 
@pytest.fixture(scope="session") def base_dir() -> Iterator[Path]: # find the base directory (currently this is the git root) - base_dir = Path(__file__).parents[2] - log.info(f"base_dir is {base_dir}") + log.info(f"base_dir is {BASE_DIR}") - yield base_dir + yield BASE_DIR @pytest.fixture(scope="session") -def compute_config_dir(base_dir: Path) -> Iterator[Path]: +def compute_config_dir() -> Iterator[Path]: """ Retrieve the path to the compute configuration directory. """ - yield base_dir / "compute" / "etc" + yield COMPUTE_CONFIG_DIR @pytest.fixture(scope="function") diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 6c757656321d..c5e3034591af 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -1,9 +1,453 @@ from __future__ import annotations -from fixtures.neon_fixtures import NeonEnv +import enum +import os +import shutil +from pathlib import Path +from typing import TYPE_CHECKING, cast +# Docs are available at https://jsonnet.org/ref/bindings.html#python_api +import _jsonnet +import pytest +import requests +import yaml +from fixtures.log_helper import log +from fixtures.paths import BASE_DIR, COMPUTE_CONFIG_DIR -def test_compute_metrics(neon_simple_env: NeonEnv): +if TYPE_CHECKING: + from types import TracebackType + from typing import Optional, TypedDict, Union + + from fixtures.neon_fixtures import NeonEnv + from fixtures.pg_version import PgVersion + from fixtures.port_distributor import PortDistributor + + class Metric(TypedDict): + metric_name: str + type: str + help: str + key_labels: Optional[list[str]] + values: Optional[list[str]] + query: Optional[str] + query_ref: Optional[str] + + class Collector(TypedDict): + collector_name: str + metrics: list[Metric] + queries: Optional[list[Query]] + + class Query(TypedDict): + query_name: str + query: str + + +JSONNET_IMPORT_CACHE: dict[str, bytes] = {} +JSONNET_PATH: list[Path] = [BASE_DIR / 
"compute" / "jsonnet", COMPUTE_CONFIG_DIR] + + +def __import_callback(dir: str, rel: str) -> tuple[str, bytes]: + """ + dir: The directory of the Jsonnet file which tried to import a file + rel: The actual import path from Jsonnet + """ + if not rel: + raise RuntimeError("Empty filename") + + full_path: Optional[str] = None + if os.path.isabs(rel): + full_path = rel + else: + for p in (dir, *JSONNET_PATH): + assert isinstance(p, (str, Path)), "for mypy" + full_path = os.path.join(p, rel) + + assert isinstance(full_path, str), "for mypy" + if not os.path.exists(full_path): + full_path = None + continue + + break + + if not full_path: + raise RuntimeError(f"Could not resolve import ({rel}) in {dir}") + + if os.path.isdir(full_path): + raise RuntimeError(f"Attempted to import directory: {full_path}") + + if full_path not in JSONNET_IMPORT_CACHE: + with open(full_path, encoding="utf-8") as f: + JSONNET_IMPORT_CACHE[full_path] = f.read().encode() + + return full_path, JSONNET_IMPORT_CACHE[full_path] + + +def jsonnet_evaluate_file( + jsonnet_file: Union[str, Path], + ext_vars: Optional[Union[str, dict[str, str]]] = None, + tla_vars: Optional[Union[str, dict[str, str]]] = None, +) -> str: + return cast( + "str", + _jsonnet.evaluate_file( + str(jsonnet_file), + ext_vars=ext_vars, + tla_vars=tla_vars, + import_callback=__import_callback, + ), + ) + + +def evaluate_collector(jsonnet_file: Path, pg_version: PgVersion) -> str: + return jsonnet_evaluate_file(jsonnet_file, ext_vars={"pg_version": str(pg_version)}) + + +def evaluate_config( + jsonnet_file: Path, collector_name: str, collector_file: Union[str, Path], connstr: str +) -> str: + return jsonnet_evaluate_file( + jsonnet_file, + tla_vars={ + "collector_name": collector_name, + "collector_file": str(collector_file), + "connection_string": connstr, + }, + ) + + +@enum.unique +class SqlExporterProcess(str, enum.Enum): + COMPUTE = "compute" + AUTOSCALING = "autoscaling" + + +@pytest.mark.parametrize( + "collector_name", + 
["neon_collector", "neon_collector_autoscaling"], + ids=[SqlExporterProcess.COMPUTE, SqlExporterProcess.AUTOSCALING], +) +def test_sql_exporter_metrics_smoke( + pg_version: PgVersion, + neon_simple_env: NeonEnv, + compute_config_dir: Path, + collector_name: str, +): + """ + This is a smoke test to ensure the metrics SQL queries for sql_exporter + work without errors. + """ + env = neon_simple_env + + endpoint = env.endpoints.create("main") + endpoint.respec(skip_pg_catalog_updates=False) + endpoint.start() + + # Extract all the SQL queries from the sql_exporter config files, and run + # them. + collector = cast( + "Collector", + yaml.safe_load( + jsonnet_evaluate_file( + str(compute_config_dir / f"{collector_name}.jsonnet"), + ext_vars={"pg_version": pg_version}, + ) + ), + ) + + for metric in collector["metrics"]: + query = metric.get("query") + if query is not None: + log.info("Checking query for metric %s in %s", metric["metric_name"], collector_name) + endpoint.safe_psql(query) + + queries = collector.get("queries") + if queries is not None: + # This variable is named q because mypy is too silly to understand it is + # different from the query above. 
+ # + # query: Optional[str] + # q: Metric + for q in queries: + log.info("Checking query %s in %s", q["query_name"], collector_name) + endpoint.safe_psql(q["query"]) + + +class SqlExporterRunner: + def __init__(self, test_output_dir: Path, sql_exporter_port: int) -> None: + self._log_file_name = test_output_dir / "sql_exporter.stderr" + self._sql_exporter_port = sql_exporter_port + + log.info(f"Starting sql_exporter at http://localhost:{self._sql_exporter_port}") + + def start(self) -> None: + raise NotImplementedError() + + def stop(self) -> None: + raise NotImplementedError() + + def __enter__(self) -> SqlExporterRunner: + self.start() + + return self + + def __exit__( + self, + exc_type: Optional[type[BaseException]], + exc: Optional[BaseException], + tb: Optional[TracebackType], + ): + self.stop() + + +SQL_EXPORTER = shutil.which("sql_exporter") + +if SQL_EXPORTER is None: + from testcontainers.core.container import DockerContainer + from testcontainers.core.waiting_utils import wait_for_logs + from typing_extensions import override + + class SqlExporterContainer(DockerContainer): # type: ignore + def __init__( + self, logs_dir: Path, config_file: Path, collector_file: Path, port: int + ) -> None: + # NOTE: Keep the version the same as in + # compute/Dockerfile.compute-node and Dockerfile.build-tools. + # + # The "host" network mode allows sql_exporter to talk to the + # endpoint which is running on the host. 
+ super().__init__("docker.io/burningalchemist/sql_exporter:0.13.1", network_mode="host") + + self.__logs_dir = logs_dir + self.__port = port + + config_file_name = config_file.name + collector_file_name = collector_file.name + + self.with_command(f"-config.file=/etc/{config_file_name} -web.listen-address=:{port}") + + container_config_file = f"/etc/{config_file_name}" + container_collector_file = f"/etc/{collector_file_name}" + log.info( + "Mapping %s to %s in sql_exporter container", config_file, container_config_file + ) + log.info( + "Mapping %s to %s in sql_exporter container", + collector_file, + container_collector_file, + ) + + # NOTE: z allows Podman to work with SELinux. Please don't change it. + # Ideally this would be a ro (read-only) mount, but I couldn't seem to + # get it to work. + self.with_volume_mapping(str(config_file), container_config_file, "z") + self.with_volume_mapping(str(collector_file), container_collector_file, "z") + + @override + def start(self) -> SqlExporterContainer: + super().start() + + log.info("Waiting for sql_exporter to be ready") + wait_for_logs( + self, + rf'level=info msg="Listening on" address=\[::\]:{self.__port}', + timeout=5, + ) + + return self + + class SqlExporterContainerRunner(SqlExporterRunner): + def __init__( + self, + test_output_dir: Path, + config_file: Path, + collector_file: Path, + sql_exporter_port: int, + ) -> None: + super().__init__(test_output_dir, sql_exporter_port) + + self.__container = SqlExporterContainer( + test_output_dir, config_file, collector_file, sql_exporter_port + ) + + @override + def start(self) -> None: + self.__container.start() + + @override + def stop(self) -> None: + try: + # sql_exporter doesn't print anything to stdout + with open(self._log_file_name, "w", encoding="utf-8") as f: + f.write(self.__container.get_logs()[1].decode()) + except Exception: + log.exception("Failed to write sql_exporter logs") + + # Stop the container *after* getting the logs + self.__container.stop() + 
+else: + import subprocess + import time + from signal import Signals + + from typing_extensions import override + + if TYPE_CHECKING: + from collections.abc import Mapping + + class SqlExporterNativeRunner(SqlExporterRunner): + def __init__( + self, + test_output_dir: Path, + config_file: Path, + collector_file: Path, + sql_exporter_port: int, + ) -> None: + super().__init__(test_output_dir, sql_exporter_port) + + self.__config_file = config_file + self.__collector_file = collector_file + self.__proc: subprocess.Popen[str] + + @override + def start(self) -> None: + assert SQL_EXPORTER is not None + + log_file = open(self._log_file_name, "w", encoding="utf-8") + self.__proc = subprocess.Popen( + [ + os.path.realpath(SQL_EXPORTER), + f"-config.file={self.__config_file}", + f"-web.listen-address=:{self._sql_exporter_port}", + ], + # If PGSERVICEFILE is set, sql_exporter won't launch. + env=cast("Mapping[str, str]", {}), + stderr=log_file, + bufsize=0, + text=True, + ) + + log.info("Waiting for sql_exporter to be ready") + + with open(self._log_file_name, encoding="utf-8") as f: + started = time.time() + while True: + if time.time() - started > 5: + self.__proc.kill() + raise RuntimeError("sql_exporter did not start up properly") + + line = f.readline() + if not line: + time.sleep(0.5) + continue + + if ( + f'level=info msg="Listening on" address=[::]:{self._sql_exporter_port}' + in line + ): + break + + @override + def stop(self) -> None: + self.__proc.send_signal(Signals.SIGINT) + self.__proc.wait() + + +@pytest.mark.parametrize( + "exporter", + [SqlExporterProcess.COMPUTE, SqlExporterProcess.AUTOSCALING], +) +def test_sql_exporter_metrics_e2e( + pg_version: PgVersion, + neon_simple_env: NeonEnv, + test_output_dir: Path, + compute_config_dir: Path, + exporter: SqlExporterProcess, + port_distributor: PortDistributor, +): + """ + This is a full E2E test of the sql_exporter setup to make sure it works + without error. 
+ + If you use Podman instead of Docker, you may run into issues. If you run + rootful Podman, you may need to add a ~/.testcontainers.properties file + with the following content: + + ryuk.container.privileged=true + + If you are not running rootful Podman, set the following environment + variable: + + TESTCONTAINERS_RYUK_DISABLED=true + + Note that you will need the Podman socket to be running. On a systemd-based + system, that command will look something like: + + # Use `enable --now` to start the socket on login and immediately. + systemctl --user start podman.socket + + Whether you use the user service manager or the system service manager is + up to you, but may have implications on the above ryuk related steps. Note + that you may also need the docker(1) Podman frontend. I am unsure if the + docker Python package supports Podman natively. + """ + env = neon_simple_env + + endpoint = env.endpoints.create("main") + endpoint.respec(skip_pg_catalog_updates=False) + endpoint.start() + + if exporter == SqlExporterProcess.COMPUTE: + stem_suffix = "" + elif exporter == SqlExporterProcess.AUTOSCALING: + stem_suffix = "_autoscaling" + + # Write the collector file + collector_file = test_output_dir / f"neon_collector{stem_suffix}.yml" + with open(collector_file, "w", encoding="utf-8") as o: + collector = evaluate_collector( + compute_config_dir / f"neon_collector{stem_suffix}.jsonnet", pg_version + ) + o.write(collector) + + conn_options = endpoint.conn_options() + pg_host = conn_options["host"] + pg_port = conn_options["port"] + pg_user = conn_options["user"] + pg_dbname = conn_options["dbname"] + pg_application_name = f"sql_exporter{stem_suffix}" + connstr = f"postgresql://{pg_user}@{pg_host}:{pg_port}/{pg_dbname}?sslmode=disable&application_name={pg_application_name}" + + def escape_go_filepath_match_characters(s: str) -> str: + """ + Unfortunately sql_exporter doesn't use plain file paths, so we need to + escape special characters. 
pytest encodes the parameters of a test using + [ and ], so we need to escape them with backslashes. + See https://pkg.go.dev/path/filepath#Match. + """ + return s.replace("[", r"\[").replace("]", r"\]") + + # Write the config file + config_file = test_output_dir / f"sql_exporter{stem_suffix}.yml" + with open(config_file, "w", encoding="utf-8") as o: + config = evaluate_config( + compute_config_dir / "sql_exporter.jsonnet", + collector_name=collector_file.stem, + collector_file=escape_go_filepath_match_characters(str(collector_file)) + if SQL_EXPORTER + else collector_file.name, + connstr=connstr, + ) + o.write(config) + + sql_exporter_port = port_distributor.get_port() + with (SqlExporterNativeRunner if SQL_EXPORTER else SqlExporterContainerRunner)( + test_output_dir, config_file, collector_file, sql_exporter_port + ) as _runner: + resp = requests.get(f"http://localhost:{sql_exporter_port}/metrics") + resp.raise_for_status() + + +def test_perf_counters(neon_simple_env: NeonEnv): """ Test compute metrics, exposed in the neon_backend_perf_counters and neon_perf_counters views From 8e2e9f0fed000c1204b84a8dc9702ba28046938b Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 29 Oct 2024 22:24:04 +0000 Subject: [PATCH 05/32] pageserver: generation-aware storage for TenantManifest (#9555) ## Problem When tenant manifest objects are written without a generation suffix, concurrently attached pageservers may stamp on each others writes of the manifest and cause undefined behavior. Closes: #9543 ## Summary of changes - Use download_generation_object helper when reading manifests, to search for the most recent generation - Use Tenant::generation as the generation suffix when writing manifests. 
--- pageserver/src/tenant.rs | 9 ++- .../src/tenant/remote_timeline_client.rs | 26 +++++--- .../tenant/remote_timeline_client/download.rs | 59 ++++++++++++++----- 3 files changed, 66 insertions(+), 28 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 6ac11b0ae130..90d9feeeb604 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1352,14 +1352,15 @@ impl Tenant { ) .await?; let (offloaded_add, tenant_manifest) = - match remote_timeline_client::do_download_tenant_manifest( + match remote_timeline_client::download_tenant_manifest( remote_storage, &self.tenant_shard_id, + self.generation, &cancel, ) .await { - Ok((tenant_manifest, _generation)) => ( + Ok((tenant_manifest, _generation, _manifest_mtime)) => ( format!("{} offloaded", tenant_manifest.offloaded_timelines.len()), tenant_manifest, ), @@ -3130,8 +3131,6 @@ impl Tenant { } let tenant_manifest = self.build_tenant_manifest(); - // TODO: generation support - let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; for child_shard in child_shards { tracing::info!( "Uploading tenant manifest for child {}", @@ -3140,7 +3139,7 @@ impl Tenant { upload_tenant_manifest( &self.remote_storage, child_shard, - generation, + self.generation, &tenant_manifest, &self.cancel, ) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 19e762b9fae8..03ec18c8822e 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -190,6 +190,7 @@ use chrono::{NaiveDateTime, Utc}; pub(crate) use download::download_initdb_tar_zst; use pageserver_api::models::TimelineArchivalState; use pageserver_api::shard::{ShardIndex, TenantShardId}; +use regex::Regex; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use utils::backoff::{ @@ -199,7 +200,7 @@ use utils::pausable_failpoint; use std::collections::{HashMap, VecDeque}; use 
std::sync::atomic::{AtomicU32, Ordering}; -use std::sync::{Arc, Mutex}; +use std::sync::{Arc, Mutex, OnceLock}; use std::time::Duration; use remote_storage::{ @@ -245,7 +246,7 @@ use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; use super::Generation; pub(crate) use download::{ - do_download_tenant_manifest, download_index_part, is_temp_download_file, + download_index_part, download_tenant_manifest, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines, }; pub(crate) use index::LayerFileMetadata; @@ -274,12 +275,6 @@ pub(crate) const BUFFER_SIZE: usize = 32 * 1024; /// which we warn and skip. const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10); -/// Hardcode a generation for the tenant manifest for now so that we don't -/// need to deal with generation-less manifests in the future. -/// -/// TODO: add proper generation support to all the places that use this. -pub(crate) const TENANT_MANIFEST_GENERATION: Generation = Generation::new(1); - pub enum MaybeDeletedIndexPart { IndexPart(IndexPart), Deleted(IndexPart), @@ -2239,6 +2234,12 @@ pub fn remote_tenant_manifest_path( RemotePath::from_string(&path).expect("Failed to construct path") } +/// Prefix to all generations' manifest objects in a tenant shard +pub fn remote_tenant_manifest_prefix(tenant_shard_id: &TenantShardId) -> RemotePath { + let path = format!("tenants/{tenant_shard_id}/tenant-manifest",); + RemotePath::from_string(&path).expect("Failed to construct path") +} + pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}"); RemotePath::from_string(&path).expect("Failed to construct path") @@ -2333,6 +2334,15 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option { } } +/// Given the key of a tenant manifest, parse out the generation number +pub(crate) fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option { + static RE: OnceLock = 
OnceLock::new(); + let re = RE.get_or_init(|| Regex::new(r".+tenant-manifest-([0-9a-f]{8}).json").unwrap()); + re.captures(path.get_path().as_str()) + .and_then(|c| c.get(1)) + .and_then(|m| Generation::parse_suffix(m.as_str())) +} + #[cfg(test)] mod tests { use super::*; diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 8679c68a2713..efcd20d1bf5c 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -20,7 +20,9 @@ use utils::backoff; use crate::config::PageServerConf; use crate::context::RequestContext; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::span::{ + debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_id, +}; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; @@ -36,9 +38,10 @@ use utils::pausable_failpoint; use super::index::{IndexPart, LayerFileMetadata}; use super::manifest::TenantManifest; use super::{ - parse_remote_index_path, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, remote_tenant_manifest_path, remote_tenant_path, - FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH, + parse_remote_index_path, parse_remote_tenant_manifest_path, remote_index_path, + remote_initdb_archive_path, remote_initdb_preserved_archive_path, remote_tenant_manifest_path, + remote_tenant_manifest_prefix, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, INITDB_PATH, }; /// @@ -365,32 +368,34 @@ async fn do_download_remote_path_retry_forever( .await } -pub async fn do_download_tenant_manifest( +async fn do_download_tenant_manifest( storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, + _timeline_id: Option<&TimelineId>, + 
generation: Generation, cancel: &CancellationToken, -) -> Result<(TenantManifest, Generation), DownloadError> { - // TODO: generation support - let generation = super::TENANT_MANIFEST_GENERATION; +) -> Result<(TenantManifest, Generation, SystemTime), DownloadError> { let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation); - let (manifest_bytes, _manifest_bytes_mtime) = + let (manifest_bytes, manifest_bytes_mtime) = do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?; let tenant_manifest = TenantManifest::from_json_bytes(&manifest_bytes) .with_context(|| format!("deserialize tenant manifest file at {remote_path:?}")) .map_err(DownloadError::Other)?; - Ok((tenant_manifest, generation)) + Ok((tenant_manifest, generation, manifest_bytes_mtime)) } async fn do_download_index_part( storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, + timeline_id: Option<&TimelineId>, index_generation: Generation, cancel: &CancellationToken, ) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { + let timeline_id = + timeline_id.expect("A timeline ID is always provided when downloading an index"); let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); let (index_part_bytes, index_part_mtime) = @@ -426,7 +431,7 @@ async fn do_download_index_part( pub(crate) async fn download_generation_object<'a, T, DF, DFF, PF>( storage: &'a GenericRemoteStorage, tenant_shard_id: &'a TenantShardId, - timeline_id: &'a TimelineId, + timeline_id: Option<&'a TimelineId>, my_generation: Generation, what: &str, prefix: RemotePath, @@ -438,7 +443,7 @@ where DF: Fn( &'a GenericRemoteStorage, &'a TenantShardId, - &'a TimelineId, + Option<&'a TimelineId>, Generation, &'a CancellationToken, ) -> DFF, @@ -446,7 +451,7 @@ where PF: Fn(RemotePath) -> Option, T: 'static, { - debug_assert_current_span_has_tenant_and_timeline_id(); + debug_assert_current_span_has_tenant_id(); if 
my_generation.is_none() { // Operating without generations: just fetch the generation-less path @@ -552,11 +557,13 @@ pub(crate) async fn download_index_part( my_generation: Generation, cancel: &CancellationToken, ) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { + debug_assert_current_span_has_tenant_and_timeline_id(); + let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); download_generation_object( storage, tenant_shard_id, - timeline_id, + Some(timeline_id), my_generation, "index_part", index_prefix, @@ -567,6 +574,28 @@ pub(crate) async fn download_index_part( .await } +pub(crate) async fn download_tenant_manifest( + storage: &GenericRemoteStorage, + tenant_shard_id: &TenantShardId, + my_generation: Generation, + cancel: &CancellationToken, +) -> Result<(TenantManifest, Generation, SystemTime), DownloadError> { + let manifest_prefix = remote_tenant_manifest_prefix(tenant_shard_id); + + download_generation_object( + storage, + tenant_shard_id, + None, + my_generation, + "tenant-manifest", + manifest_prefix, + do_download_tenant_manifest, + parse_remote_tenant_manifest_path, + cancel, + ) + .await +} + pub(crate) async fn download_initdb_tar_zst( conf: &'static PageServerConf, storage: &GenericRemoteStorage, From 0c828c57e2f82302f8261e7e9b58cef7f9f31f50 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 29 Oct 2024 23:03:45 -0500 Subject: [PATCH 06/32] Remove non-gzipped basebackup code path In July of 2023, Bojan and Chi authored 92aee7e07f347a0cc125462705811963ab5c78e9. Our in production pageservers are most definitely at a version where they all support gzipped basebackups. 
--- compute_tools/src/compute.rs | 32 ++++---------------------------- 1 file changed, 4 insertions(+), 28 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index c9dd4dcfc59e..d3e42fe61884 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,7 +1,6 @@ use std::collections::HashMap; use std::env; use std::fs; -use std::io::BufRead; use std::os::unix::fs::{symlink, PermissionsExt}; use std::path::Path; use std::process::{Command, Stdio}; @@ -365,8 +364,7 @@ impl ComputeNode { let pageserver_connect_micros = start_time.elapsed().as_micros() as u64; let basebackup_cmd = match lsn { - // HACK We don't use compression on first start (Lsn(0)) because there's no API for it - Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id), + Lsn(0) => format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id), _ => format!( "basebackup {} {} {} --gzip", spec.tenant_id, spec.timeline_id, lsn @@ -375,38 +373,16 @@ impl ComputeNode { let copyreader = client.copy_out(basebackup_cmd.as_str())?; let mut measured_reader = MeasuredReader::new(copyreader); - - // Check the magic number to see if it's a gzip or not. Even though - // we might explicitly ask for gzip, an old pageserver with no implementation - // of gzip compression might send us uncompressed data. After some time - // passes we can assume all pageservers know how to compress and we can - // delete this check. - // - // If the data is not gzip, it will be tar. It will not be mistakenly - // recognized as gzip because tar starts with an ascii encoding of a filename, - // and 0x1f and 0x8b are unlikely first characters for any filename. Moreover, - // we send the "global" directory first from the pageserver, so it definitely - // won't be recognized as gzip. 
let mut bufreader = std::io::BufReader::new(&mut measured_reader); - let gzip = { - let peek = bufreader.fill_buf().unwrap(); - peek[0] == 0x1f && peek[1] == 0x8b - }; // Read the archive directly from the `CopyOutReader` // // Set `ignore_zeros` so that unpack() reads all the Copy data and // doesn't stop at the end-of-archive marker. Otherwise, if the server // sends an Error after finishing the tarball, we will not notice it. - if gzip { - let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader)); - ar.set_ignore_zeros(true); - ar.unpack(&self.pgdata)?; - } else { - let mut ar = tar::Archive::new(&mut bufreader); - ar.set_ignore_zeros(true); - ar.unpack(&self.pgdata)?; - }; + let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader)); + ar.set_ignore_zeros(true); + ar.unpack(&self.pgdata)?; // Report metrics let mut state = self.state.lock().unwrap(); From 745061ddf862395894b34a9aa2e8c698d26cacd7 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 30 Oct 2024 11:07:02 +0100 Subject: [PATCH 07/32] chore(compute): Bump pg_mooncake to the latest version (#9576) ## Problem There were some critical breaking changes made in the upstream since Oct 29th morning. 
## Summary of changes Point it to the topmost commit in the `neon` branch at the time of writing this https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/ https://github.com/Mooncake-Labs/pg_mooncake/commit/c495cd17d6018a6fd170b3f47c645a89b23917fc --- compute/compute-node.Dockerfile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 7e38ef8221b7..c2333eda08e2 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1131,14 +1131,17 @@ FROM rust-extensions-build AS pg-mooncake-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PG_MOONCAKE_VERSION=882175dbba07ba2e6e59b1088d61bf325b910b9e +# The topmost commit in the `neon` branch at the time of writing this +# https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/ +# https://github.com/Mooncake-Labs/pg_mooncake/commit/568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d +ENV PG_MOONCAKE_VERSION=568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ 'v14') \ echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \ esac && \ - git clone --depth 1 --branch neon https://github.com/kelvich/pg_mooncake.git pg_mooncake-src && \ + git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \ cd pg_mooncake-src && \ git checkout "${PG_MOONCAKE_VERSION}" && \ git submodule update --init --depth 1 --recursive && \ From 96e35e11a6e092429015d78120f8d12dcc542077 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 30 Oct 2024 12:46:39 +0100 Subject: [PATCH 08/32] postgres_ffi: add WAL generator for tests/benchmarks (#9503) ## Problem We don't have a convenient way to generate WAL records for benchmarks and tests. ## Summary of changes Adds a WAL generator, exposed as an iterator. 
It currently only generates logical messages (noops), but will be extended to write actual table rows later. Some existing code for WAL generation has been replaced with this generator, to reduce duplication. --- libs/postgres_ffi/src/lib.rs | 1 + libs/postgres_ffi/src/wal_generator.rs | 203 +++++++++++++ libs/postgres_ffi/src/xlog_utils.rs | 76 +---- libs/utils/src/lsn.rs | 2 +- safekeeper/src/json_ctrl.rs | 3 +- .../tests/walproposer_sim/simulation.rs | 3 +- .../tests/walproposer_sim/walproposer_disk.rs | 270 +----------------- 7 files changed, 235 insertions(+), 323 deletions(-) create mode 100644 libs/postgres_ffi/src/wal_generator.rs diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 6b219488acfb..0239b56d9cf8 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -36,6 +36,7 @@ macro_rules! postgres_ffi { pub mod controlfile_utils; pub mod nonrelfile_utils; pub mod wal_craft_test_export; + pub mod wal_generator; pub mod waldecoder_handler; pub mod xlog_utils; diff --git a/libs/postgres_ffi/src/wal_generator.rs b/libs/postgres_ffi/src/wal_generator.rs new file mode 100644 index 000000000000..97968c269b49 --- /dev/null +++ b/libs/postgres_ffi/src/wal_generator.rs @@ -0,0 +1,203 @@ +use std::ffi::CStr; + +use bytes::{Bytes, BytesMut}; +use crc32c::crc32c_append; +use utils::lsn::Lsn; + +use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC}; +use super::xlog_utils::{ + XlLogicalMessage, XLOG_RECORD_CRC_OFFS, XLOG_SIZE_OF_XLOG_RECORD, XLP_BKP_REMOVABLE, + XLP_FIRST_IS_CONTRECORD, +}; +use super::XLogRecord; +use crate::pg_constants::{ + RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE, XLP_LONG_HEADER, XLR_BLOCK_ID_DATA_LONG, + XLR_BLOCK_ID_DATA_SHORT, +}; +use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; + +/// Generates binary WAL records for use in tests and benchmarks. Currently only generates logical +/// messages (effectively noops) with a fixed payload. 
It is used as an iterator which yields +/// encoded bytes for a single WAL record, including internal page headers if it spans pages. +/// Concatenating the bytes will yield a complete, well-formed WAL, which can be chunked at segment +/// boundaries if desired. Not optimized for performance. +/// +/// The WAL format is version-dependant (see e.g. `XLOG_PAGE_MAGIC`), so make sure to import this +/// for the appropriate Postgres version (e.g. `postgres_ffi::v17::wal_generator::WalGenerator`). +/// +/// A WAL is split into 16 MB segments. Each segment is split into 8 KB pages, with headers. +/// Records are arbitrary length, 8-byte aligned, and may span pages. The layout is e.g.: +/// +/// | Segment 1 | Segment 2 | Segment 3 | +/// | Page 1 | Page 2 | Page 3 | Page 4 | Page 5 | Page 6 | Page 7 | Page 8 | Page 9 | +/// | R1 | R2 |R3| R4 | R5 | R6 | R7 | R8 | +/// +/// TODO: support generating actual tables and rows. +#[derive(Default)] +pub struct WalGenerator { + /// Current LSN to append the next record at. + /// + /// Callers can modify this (and prev_lsn) to restart generation at a different LSN, but should + /// ensure that the LSN is on a valid record boundary (i.e. we can't start appending in the + /// middle on an existing record or header, or beyond the end of the existing WAL). + pub lsn: Lsn, + /// The starting LSN of the previous record. Used in WAL record headers. The Safekeeper doesn't + /// care about this, unlike Postgres, but we include it for completeness. + pub prev_lsn: Lsn, +} + +impl WalGenerator { + // For now, hardcode the message payload. + // TODO: support specifying the payload size. + const PREFIX: &CStr = c"prefix"; + const MESSAGE: &[u8] = b"message"; + + // Hardcode the sys, timeline, and DB IDs. We can make them configurable if we care about them. + const SYS_ID: u64 = 0; + const TIMELINE_ID: u32 = 1; + const DB_ID: u32 = 0; + + /// Creates a new WAL generator, which emits logical message records (noops). 
+ pub fn new() -> Self { + Self::default() + } + + /// Encodes a logical message (basically a noop), with the given prefix and message. + pub(crate) fn encode_logical_message(prefix: &CStr, message: &[u8]) -> Bytes { + let prefix = prefix.to_bytes_with_nul(); + let header = XlLogicalMessage { + db_id: Self::DB_ID, + transactional: 0, + prefix_size: prefix.len() as u64, + message_size: message.len() as u64, + }; + [&header.encode(), prefix, message].concat().into() + } + + /// Encode a WAL record with the given payload data (e.g. a logical message). + pub(crate) fn encode_record(data: Bytes, rmid: u8, info: u8, prev_lsn: Lsn) -> Bytes { + // Prefix data with block ID and length. + let data_header = Bytes::from(match data.len() { + 0 => vec![], + 1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, data.len() as u8], + 256.. => { + let len_bytes = (data.len() as u32).to_le_bytes(); + [&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat() + } + }); + + // Construct the WAL record header. + let mut header = XLogRecord { + xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + data.len()) as u32, + xl_xid: 0, + xl_prev: prev_lsn.into(), + xl_info: info, + xl_rmid: rmid, + __bindgen_padding_0: [0; 2], + xl_crc: 0, // see below + }; + + // Compute the CRC checksum for the data, and the header up to the CRC field. + let mut crc = 0; + crc = crc32c_append(crc, &data_header); + crc = crc32c_append(crc, &data); + crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]); + header.xl_crc = crc; + + // Encode the final header and record. + let header = header.encode().unwrap(); + + [header, data_header, data].concat().into() + } + + /// Injects page headers on 8KB page boundaries. Takes the current LSN position where the record + /// is to be appended. + fn encode_pages(record: Bytes, mut lsn: Lsn) -> Bytes { + // Fast path: record fits in current page, and the page already has a header. 
+ if lsn.remaining_in_block() as usize >= record.len() && lsn.block_offset() > 0 { + return record; + } + + let mut pages = BytesMut::new(); + let mut remaining = record.clone(); // Bytes::clone() is cheap + while !remaining.is_empty() { + // At new page boundary, inject page header. + if lsn.block_offset() == 0 { + let mut page_header = XLogPageHeaderData { + xlp_magic: XLOG_PAGE_MAGIC as u16, + xlp_info: XLP_BKP_REMOVABLE, + xlp_tli: Self::TIMELINE_ID, + xlp_pageaddr: lsn.0, + xlp_rem_len: 0, + __bindgen_padding_0: [0; 4], + }; + // If the record was split across page boundaries, mark as continuation. + if remaining.len() < record.len() { + page_header.xlp_rem_len = remaining.len() as u32; + page_header.xlp_info |= XLP_FIRST_IS_CONTRECORD; + } + // At start of segment, use a long page header. + let page_header = if lsn.segment_offset(WAL_SEGMENT_SIZE) == 0 { + page_header.xlp_info |= XLP_LONG_HEADER; + XLogLongPageHeaderData { + std: page_header, + xlp_sysid: Self::SYS_ID, + xlp_seg_size: WAL_SEGMENT_SIZE as u32, + xlp_xlog_blcksz: XLOG_BLCKSZ as u32, + } + .encode() + .unwrap() + } else { + page_header.encode().unwrap() + }; + pages.extend_from_slice(&page_header); + lsn += page_header.len() as u64; + } + + // Append the record up to the next page boundary, if any. + let page_free = lsn.remaining_in_block() as usize; + let chunk = remaining.split_to(std::cmp::min(page_free, remaining.len())); + pages.extend_from_slice(&chunk); + lsn += chunk.len() as u64; + } + pages.freeze() + } + + /// Records must be 8-byte aligned. Take an encoded record (including any injected page + /// boundaries), starting at the given LSN, and add any necessary padding at the end. 
+ fn pad_record(record: Bytes, mut lsn: Lsn) -> Bytes { + lsn += record.len() as u64; + let padding = lsn.calc_padding(8u64) as usize; + if padding == 0 { + return record; + } + [record, Bytes::from(vec![0; padding])].concat().into() + } + + /// Generates a record with an arbitrary payload at the current LSN, then increments the LSN. + pub fn generate_record(&mut self, data: Bytes, rmid: u8, info: u8) -> Bytes { + let record = Self::encode_record(data, rmid, info, self.prev_lsn); + let record = Self::encode_pages(record, self.lsn); + let record = Self::pad_record(record, self.lsn); + self.prev_lsn = self.lsn; + self.lsn += record.len() as u64; + record + } + + /// Generates a logical message at the current LSN. Can be used to construct arbitrary messages. + pub fn generate_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> Bytes { + let data = Self::encode_logical_message(prefix, message); + self.generate_record(data, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE) + } +} + +/// Generate WAL records as an iterator. +impl Iterator for WalGenerator { + type Item = (Lsn, Bytes); + + fn next(&mut self) -> Option { + let lsn = self.lsn; + let record = self.generate_logical_message(Self::PREFIX, Self::MESSAGE); + Some((lsn, record)) + } +} diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index a636bd2a97ef..78a965174f86 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -7,15 +7,14 @@ // have been named the same as the corresponding PostgreSQL functions instead. 
// -use crc32c::crc32c_append; - use super::super::waldecoder::WalStreamDecoder; use super::bindings::{ CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz, XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC, }; +use super::wal_generator::WalGenerator; use super::PG_MAJORVERSION; -use crate::pg_constants; +use crate::pg_constants::{self, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE}; use crate::PG_TLI; use crate::{uint32, uint64, Oid}; use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; @@ -26,7 +25,7 @@ use bytes::{Buf, Bytes}; use log::*; use serde::Serialize; -use std::ffi::OsStr; +use std::ffi::{CString, OsStr}; use std::fs::File; use std::io::prelude::*; use std::io::ErrorKind; @@ -39,6 +38,7 @@ use utils::bin_ser::SerializeError; use utils::lsn::Lsn; pub const XLOG_FNAME_LEN: usize = 24; +pub const XLP_BKP_REMOVABLE: u16 = 0x0004; pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8; pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2; @@ -489,64 +489,16 @@ impl XlLogicalMessage { /// Create new WAL record for non-transactional logical message. /// Used for creating artificial WAL for tests, as LogicalMessage /// record is basically no-op. -/// -/// NOTE: This leaves the xl_prev field zero. The safekeeper and -/// pageserver tolerate that, but PostgreSQL does not. 
-pub fn encode_logical_message(prefix: &str, message: &str) -> Vec { - let mut prefix_bytes: Vec = Vec::with_capacity(prefix.len() + 1); - prefix_bytes.write_all(prefix.as_bytes()).unwrap(); - prefix_bytes.push(0); - - let message_bytes = message.as_bytes(); - - let logical_message = XlLogicalMessage { - db_id: 0, - transactional: 0, - prefix_size: prefix_bytes.len() as u64, - message_size: message_bytes.len() as u64, - }; - - let mainrdata = logical_message.encode(); - let mainrdata_len: usize = mainrdata.len() + prefix_bytes.len() + message_bytes.len(); - // only short mainrdata is supported for now - assert!(mainrdata_len <= 255); - let mainrdata_len = mainrdata_len as u8; - - let mut data: Vec = vec![pg_constants::XLR_BLOCK_ID_DATA_SHORT, mainrdata_len]; - data.extend_from_slice(&mainrdata); - data.extend_from_slice(&prefix_bytes); - data.extend_from_slice(message_bytes); - - let total_len = XLOG_SIZE_OF_XLOG_RECORD + data.len(); - - let mut header = XLogRecord { - xl_tot_len: total_len as u32, - xl_xid: 0, - xl_prev: 0, - xl_info: 0, - xl_rmid: 21, - __bindgen_padding_0: [0u8; 2usize], - xl_crc: 0, // crc will be calculated later - }; - - let header_bytes = header.encode().expect("failed to encode header"); - let crc = crc32c_append(0, &data); - let crc = crc32c_append(crc, &header_bytes[0..XLOG_RECORD_CRC_OFFS]); - header.xl_crc = crc; - - let mut wal: Vec = Vec::new(); - wal.extend_from_slice(&header.encode().expect("failed to encode header")); - wal.extend_from_slice(&data); - - // WAL start position must be aligned at 8 bytes, - // this will add padding for the next WAL record. - const PADDING: usize = 8; - let padding_rem = wal.len() % PADDING; - if padding_rem != 0 { - wal.resize(wal.len() + PADDING - padding_rem, 0); - } - - wal +pub fn encode_logical_message(prefix: &str, message: &str) -> Bytes { + // This function can take untrusted input, so discard any NUL bytes in the prefix string. 
+ let prefix = CString::new(prefix.replace('\0', "")).expect("no NULs"); + let message = message.as_bytes(); + WalGenerator::encode_record( + WalGenerator::encode_logical_message(&prefix, message), + RM_LOGICALMSG_ID, + XLOG_LOGICAL_MESSAGE, + Lsn(0), + ) } #[cfg(test)] diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 3ec2c130bdb4..524f3604a1bb 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -12,7 +12,7 @@ use crate::seqwait::MonotonicCounter; pub const XLOG_BLCKSZ: u32 = 8192; /// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr -#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)] +#[derive(Clone, Copy, Default, Eq, Ord, PartialEq, PartialOrd, Hash)] pub struct Lsn(pub u64); impl Serialize for Lsn { diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 7fe924a08e79..0573ea81e799 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -7,7 +7,6 @@ //! use anyhow::Context; -use bytes::Bytes; use postgres_backend::QueryError; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; @@ -176,7 +175,7 @@ pub async fn append_logical_message( truncate_lsn: msg.truncate_lsn, proposer_uuid: [0u8; 16], }, - wal_data: Bytes::from(wal_data), + wal_data, }); let response = tli.process_msg(&append_request).await?; diff --git a/safekeeper/tests/walproposer_sim/simulation.rs b/safekeeper/tests/walproposer_sim/simulation.rs index 0d7aaf517bac..fabf450eefec 100644 --- a/safekeeper/tests/walproposer_sim/simulation.rs +++ b/safekeeper/tests/walproposer_sim/simulation.rs @@ -151,8 +151,7 @@ impl WalProposer { for _ in 0..cnt { self.disk .lock() - .insert_logical_message("prefix", b"message") - .expect("failed to generate logical message"); + .insert_logical_message(c"prefix", b"message"); } let end_lsn = self.disk.lock().flush_rec_ptr(); diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs index 
123cd6bad65c..f70cd65dfc77 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_disk.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -1,24 +1,7 @@ -use std::{ffi::CString, sync::Arc}; +use std::{ffi::CStr, sync::Arc}; -use byteorder::{LittleEndian, WriteBytesExt}; -use crc32c::crc32c_append; use parking_lot::{Mutex, MutexGuard}; -use postgres_ffi::{ - pg_constants::{ - RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE, XLP_LONG_HEADER, XLR_BLOCK_ID_DATA_LONG, - XLR_BLOCK_ID_DATA_SHORT, - }, - v16::{ - wal_craft_test_export::{XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC}, - xlog_utils::{ - XLogSegNoOffsetToRecPtr, XlLogicalMessage, XLOG_RECORD_CRC_OFFS, - XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, - XLP_FIRST_IS_CONTRECORD, - }, - XLogRecord, - }, - WAL_SEGMENT_SIZE, XLOG_BLCKSZ, -}; +use postgres_ffi::v16::wal_generator::WalGenerator; use utils::lsn::Lsn; use super::block_storage::BlockStorage; @@ -35,6 +18,7 @@ impl DiskWalProposer { internal_available_lsn: Lsn(0), prev_lsn: Lsn(0), disk: BlockStorage::new(), + wal_generator: WalGenerator::new(), }), }) } @@ -51,6 +35,8 @@ pub struct State { prev_lsn: Lsn, // actual WAL storage disk: BlockStorage, + // WAL record generator + wal_generator: WalGenerator, } impl State { @@ -66,6 +52,9 @@ impl State { /// Update the internal available LSN to the given value. pub fn reset_to(&mut self, lsn: Lsn) { self.internal_available_lsn = lsn; + self.prev_lsn = Lsn(0); // Safekeeper doesn't care if this is omitted + self.wal_generator.lsn = self.internal_available_lsn; + self.wal_generator.prev_lsn = self.prev_lsn; } /// Get current LSN. @@ -73,242 +62,11 @@ impl State { self.internal_available_lsn } - /// Generate a new WAL record at the current LSN. 
- pub fn insert_logical_message(&mut self, prefix: &str, msg: &[u8]) -> anyhow::Result<()> { - let prefix_cstr = CString::new(prefix)?; - let prefix_bytes = prefix_cstr.as_bytes_with_nul(); - - let lm = XlLogicalMessage { - db_id: 0, - transactional: 0, - prefix_size: prefix_bytes.len() as ::std::os::raw::c_ulong, - message_size: msg.len() as ::std::os::raw::c_ulong, - }; - - let record_bytes = lm.encode(); - let rdatas: Vec<&[u8]> = vec![&record_bytes, prefix_bytes, msg]; - insert_wal_record(self, rdatas, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE) - } -} - -fn insert_wal_record( - state: &mut State, - rdatas: Vec<&[u8]>, - rmid: u8, - info: u8, -) -> anyhow::Result<()> { - // bytes right after the header, in the same rdata block - let mut scratch = Vec::new(); - let mainrdata_len: usize = rdatas.iter().map(|rdata| rdata.len()).sum(); - - if mainrdata_len > 0 { - if mainrdata_len > 255 { - scratch.push(XLR_BLOCK_ID_DATA_LONG); - // TODO: verify endiness - let _ = scratch.write_u32::(mainrdata_len as u32); - } else { - scratch.push(XLR_BLOCK_ID_DATA_SHORT); - scratch.push(mainrdata_len as u8); - } - } - - let total_len: u32 = (XLOG_SIZE_OF_XLOG_RECORD + scratch.len() + mainrdata_len) as u32; - let size = maxalign(total_len); - assert!(size as usize > XLOG_SIZE_OF_XLOG_RECORD); - - let start_bytepos = recptr_to_bytepos(state.internal_available_lsn); - let end_bytepos = start_bytepos + size as u64; - - let start_recptr = bytepos_to_recptr(start_bytepos); - let end_recptr = bytepos_to_recptr(end_bytepos); - - assert!(recptr_to_bytepos(start_recptr) == start_bytepos); - assert!(recptr_to_bytepos(end_recptr) == end_bytepos); - - let mut crc = crc32c_append(0, &scratch); - for rdata in &rdatas { - crc = crc32c_append(crc, rdata); - } - - let mut header = XLogRecord { - xl_tot_len: total_len, - xl_xid: 0, - xl_prev: state.prev_lsn.0, - xl_info: info, - xl_rmid: rmid, - __bindgen_padding_0: [0u8; 2usize], - xl_crc: crc, - }; - - // now we have the header and can finish the crc 
- let header_bytes = header.encode()?; - let crc = crc32c_append(crc, &header_bytes[0..XLOG_RECORD_CRC_OFFS]); - header.xl_crc = crc; - - let mut header_bytes = header.encode()?.to_vec(); - assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_RECORD); - - header_bytes.extend_from_slice(&scratch); - - // finish rdatas - let mut rdatas = rdatas; - rdatas.insert(0, &header_bytes); - - write_walrecord_to_disk(state, total_len as u64, rdatas, start_recptr, end_recptr)?; - - state.internal_available_lsn = end_recptr; - state.prev_lsn = start_recptr; - Ok(()) -} - -fn write_walrecord_to_disk( - state: &mut State, - total_len: u64, - rdatas: Vec<&[u8]>, - start: Lsn, - end: Lsn, -) -> anyhow::Result<()> { - let mut curr_ptr = start; - let mut freespace = insert_freespace(curr_ptr); - let mut written: usize = 0; - - assert!(freespace >= size_of::()); - - for mut rdata in rdatas { - while rdata.len() >= freespace { - assert!( - curr_ptr.segment_offset(WAL_SEGMENT_SIZE) >= XLOG_SIZE_OF_XLOG_SHORT_PHD - || freespace == 0 - ); - - state.write(curr_ptr.0, &rdata[..freespace]); - rdata = &rdata[freespace..]; - written += freespace; - curr_ptr = Lsn(curr_ptr.0 + freespace as u64); - - let mut new_page = XLogPageHeaderData { - xlp_magic: XLOG_PAGE_MAGIC as u16, - xlp_info: XLP_BKP_REMOVABLE, - xlp_tli: 1, - xlp_pageaddr: curr_ptr.0, - xlp_rem_len: (total_len - written as u64) as u32, - ..Default::default() // Put 0 in padding fields. 
- }; - if new_page.xlp_rem_len > 0 { - new_page.xlp_info |= XLP_FIRST_IS_CONTRECORD; - } - - if curr_ptr.segment_offset(WAL_SEGMENT_SIZE) == 0 { - new_page.xlp_info |= XLP_LONG_HEADER; - let long_page = XLogLongPageHeaderData { - std: new_page, - xlp_sysid: 0, - xlp_seg_size: WAL_SEGMENT_SIZE as u32, - xlp_xlog_blcksz: XLOG_BLCKSZ as u32, - }; - let header_bytes = long_page.encode()?; - assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_LONG_PHD); - state.write(curr_ptr.0, &header_bytes); - curr_ptr = Lsn(curr_ptr.0 + header_bytes.len() as u64); - } else { - let header_bytes = new_page.encode()?; - assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_SHORT_PHD); - state.write(curr_ptr.0, &header_bytes); - curr_ptr = Lsn(curr_ptr.0 + header_bytes.len() as u64); - } - freespace = insert_freespace(curr_ptr); - } - - assert!( - curr_ptr.segment_offset(WAL_SEGMENT_SIZE) >= XLOG_SIZE_OF_XLOG_SHORT_PHD - || rdata.is_empty() - ); - state.write(curr_ptr.0, rdata); - curr_ptr = Lsn(curr_ptr.0 + rdata.len() as u64); - written += rdata.len(); - freespace -= rdata.len(); - } - - assert!(written == total_len as usize); - curr_ptr.0 = maxalign(curr_ptr.0); - assert!(curr_ptr == end); - Ok(()) -} - -fn maxalign(size: T) -> T -where - T: std::ops::BitAnd - + std::ops::Add - + std::ops::Not - + From, -{ - (size + T::from(7)) & !T::from(7) -} - -fn insert_freespace(ptr: Lsn) -> usize { - if ptr.block_offset() == 0 { - 0 - } else { - (XLOG_BLCKSZ as u64 - ptr.block_offset()) as usize - } -} - -const XLP_BKP_REMOVABLE: u16 = 0x0004; -const USABLE_BYTES_IN_PAGE: u64 = (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64; -const USABLE_BYTES_IN_SEGMENT: u64 = ((WAL_SEGMENT_SIZE / XLOG_BLCKSZ) as u64 - * USABLE_BYTES_IN_PAGE) - - (XLOG_SIZE_OF_XLOG_RECORD - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64; - -fn bytepos_to_recptr(bytepos: u64) -> Lsn { - let fullsegs = bytepos / USABLE_BYTES_IN_SEGMENT; - let mut bytesleft = bytepos % USABLE_BYTES_IN_SEGMENT; - - let seg_offset = if bytesleft < (XLOG_BLCKSZ - 
XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64 { - // fits on first page of segment - bytesleft + XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 - } else { - // account for the first page on segment with long header - bytesleft -= (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64; - let fullpages = bytesleft / USABLE_BYTES_IN_PAGE; - bytesleft %= USABLE_BYTES_IN_PAGE; - - XLOG_BLCKSZ as u64 - + fullpages * XLOG_BLCKSZ as u64 - + bytesleft - + XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 - }; - - Lsn(XLogSegNoOffsetToRecPtr( - fullsegs, - seg_offset as u32, - WAL_SEGMENT_SIZE, - )) -} - -fn recptr_to_bytepos(ptr: Lsn) -> u64 { - let fullsegs = ptr.segment_number(WAL_SEGMENT_SIZE); - let offset = ptr.segment_offset(WAL_SEGMENT_SIZE) as u64; - - let fullpages = offset / XLOG_BLCKSZ as u64; - let offset = offset % XLOG_BLCKSZ as u64; - - if fullpages == 0 { - fullsegs * USABLE_BYTES_IN_SEGMENT - + if offset > 0 { - assert!(offset >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64); - offset - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 - } else { - 0 - } - } else { - fullsegs * USABLE_BYTES_IN_SEGMENT - + (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64 - + (fullpages - 1) * USABLE_BYTES_IN_PAGE - + if offset > 0 { - assert!(offset >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64); - offset - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 - } else { - 0 - } + /// Inserts a logical record in the WAL at the current LSN. + pub fn insert_logical_message(&mut self, prefix: &CStr, msg: &[u8]) { + let record = self.wal_generator.generate_logical_message(prefix, msg); + self.disk.write(self.internal_available_lsn.into(), &record); + self.prev_lsn = self.internal_available_lsn; + self.internal_available_lsn += record.len() as u64; } } From 8af9412eb211093a2d43afe5036552f3271aadf4 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 30 Oct 2024 09:58:29 -0500 Subject: [PATCH 09/32] Collect compute backpressure throttling time This will tell us how much time the compute has spent throttled if pageserver/safekeeper cannot keep up with WAL generation. 
Signed-off-by: Tristan Partin --- compute/etc/neon_collector.jsonnet | 1 + .../compute_backpressure_throttling_ms.libsonnet | 10 ++++++++++ .../compute_backpressure_throttling_ms.sql | 1 + 3 files changed, 12 insertions(+) create mode 100644 compute/etc/sql_exporter/compute_backpressure_throttling_ms.libsonnet create mode 100644 compute/etc/sql_exporter/compute_backpressure_throttling_ms.sql diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index 8b43ebe7a388..e73fb132eedf 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -3,6 +3,7 @@ metrics: [ import 'sql_exporter/checkpoints_req.libsonnet', import 'sql_exporter/checkpoints_timed.libsonnet', + import 'sql_exporter/compute_backpressure_throttling_ms.libsonnet', import 'sql_exporter/compute_current_lsn.libsonnet', import 'sql_exporter/compute_logical_snapshot_files.libsonnet', import 'sql_exporter/compute_receive_lsn.libsonnet', diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_ms.libsonnet b/compute/etc/sql_exporter/compute_backpressure_throttling_ms.libsonnet new file mode 100644 index 000000000000..b25bb73d0f33 --- /dev/null +++ b/compute/etc/sql_exporter/compute_backpressure_throttling_ms.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'compute_backpressure_throttling_ms', + type: 'gauge', + help: 'Time compute has spent throttled', + key_labels: null, + values: [ + 'throttled', + ], + query: importstr 'sql_exporter/compute_backpressure_throttling_ms.sql', +} diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_ms.sql b/compute/etc/sql_exporter/compute_backpressure_throttling_ms.sql new file mode 100644 index 000000000000..1fa62d38a4ca --- /dev/null +++ b/compute/etc/sql_exporter/compute_backpressure_throttling_ms.sql @@ -0,0 +1 @@ +SELECT neon.backpressure_throttling_time() AS throttled; From d0a02f36494e83df2e6ba942dbe8673e24e33848 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: 
Wed, 30 Oct 2024 17:04:57 +0100 Subject: [PATCH 10/32] Disallow archived timelines to be detached or reparented (#9578) Disallow a request for timeline ancestor detach if either the to be detached timeline, or any of the to be reparented timelines are offloaded or archived. In theory we could support timelines that are archived but not offloaded, but archived timelines are at the risk of being offloaded, so we treat them like offloaded timelines. As for offloaded timelines, any code to "support" them would amount to unoffloading them, at which point we can just demand to have the timelines be unarchived. Part of #8088 --- .../src/tenant/timeline/detach_ancestor.rs | 45 ++++++++++++++++++- .../regress/test_timeline_detach_ancestor.py | 37 +++++++++++++-- 2 files changed, 78 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 641faada25d4..b4c0ab032944 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -29,6 +29,9 @@ pub(crate) enum Error { #[error("shutting down, please retry later")] ShuttingDown, + #[error("archived: {}", .0)] + Archived(TimelineId), + #[error(transparent)] NotFound(crate::tenant::GetTimelineError), @@ -79,8 +82,9 @@ impl From for ApiError { fn from(value: Error) -> Self { match value { Error::NoAncestor => ApiError::Conflict(value.to_string()), - Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", value)), + Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{value}")), Error::ShuttingDown => ApiError::ShuttingDown, + Error::Archived(_) => ApiError::BadRequest(anyhow::anyhow!("{value}")), Error::OtherTimelineDetachOngoing(_) | Error::FailedToReparentAll => { ApiError::ResourceUnavailable(value.to_string().into()) } @@ -201,12 +205,18 @@ pub(super) async fn prepare( })); }; + if detached.is_archived() != Some(false) { + return 
Err(Archived(detached.timeline_id)); + } + if !ancestor_lsn.is_valid() { // rare case, probably wouldn't even load tracing::error!("ancestor is set, but ancestor_lsn is invalid, this timeline needs fixing"); return Err(NoAncestor); } + check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?; + if ancestor.ancestor_timeline.is_some() { // non-technical requirement; we could flatten N ancestors just as easily but we chose // not to, at least initially @@ -950,3 +960,36 @@ where } }) } + +fn check_no_archived_children_of_ancestor( + tenant: &Tenant, + detached: &Arc, + ancestor: &Arc, + ancestor_lsn: Lsn, +) -> Result<(), Error> { + let timelines = tenant.timelines.lock().unwrap(); + let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); + for timeline in reparentable_timelines(timelines.values(), detached, ancestor, ancestor_lsn) { + if timeline.is_archived() == Some(true) { + return Err(Error::Archived(timeline.timeline_id)); + } + } + for timeline_offloaded in timelines_offloaded.values() { + if timeline_offloaded.ancestor_timeline_id != Some(ancestor.timeline_id) { + continue; + } + // This forbids the detach ancestor feature if flattened timelines are present, + // even if the ancestor_lsn is from after the branchpoint of the detached timeline. + // But as per current design, we don't record the ancestor_lsn of flattened timelines. + // This is a bit unfortunate, but as of writing this we don't support flattening + // anyway. Maybe we can evolve the data model in the future. 
+ if let Some(retain_lsn) = timeline_offloaded.ancestor_retain_lsn { + let is_earlier = retain_lsn <= ancestor_lsn; + if !is_earlier { + continue; + } + } + return Err(Error::Archived(timeline_offloaded.timeline_id)); + } + Ok(()) +} diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 0c8554bb5432..d467c59e62d0 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -9,7 +9,7 @@ from threading import Barrier import pytest -from fixtures.common_types import Lsn, TimelineId +from fixtures.common_types import Lsn, TimelineArchivalState, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( LogCursor, @@ -634,7 +634,13 @@ def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, shard shards = 2 if sharded else 1 neon_env_builder.num_pageservers = shards - env = neon_env_builder.init_start(initial_tenant_shard_count=shards if sharded else None) + env = neon_env_builder.init_start( + initial_tenant_shard_count=shards if sharded else None, + initial_tenant_conf={ + # turn off gc, we want to do manual offloading here. 
+ "gc_period": "0s", + }, + ) pageservers = dict((int(p.id), p) for p in env.pageservers) @@ -656,7 +662,9 @@ def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, shard client.detach_ancestor(env.initial_tenant, env.initial_timeline) assert info.value.status_code == 409 - _ = env.create_branch("first_branch") + early_branch = env.create_branch("early_branch") + + first_branch = env.create_branch("first_branch") second_branch = env.create_branch("second_branch", ancestor_branch_name="first_branch") @@ -665,6 +673,29 @@ def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, shard client.detach_ancestor(env.initial_tenant, second_branch) assert info.value.status_code == 400 + client.timeline_archival_config( + env.initial_tenant, second_branch, TimelineArchivalState.ARCHIVED + ) + + client.timeline_archival_config( + env.initial_tenant, early_branch, TimelineArchivalState.ARCHIVED + ) + + with pytest.raises(PageserverApiException, match=f".*archived: {early_branch}") as info: + client.detach_ancestor(env.initial_tenant, first_branch) + assert info.value.status_code == 400 + + if not sharded: + client.timeline_offload(env.initial_tenant, early_branch) + + client.timeline_archival_config( + env.initial_tenant, first_branch, TimelineArchivalState.ARCHIVED + ) + + with pytest.raises(PageserverApiException, match=f".*archived: {first_branch}") as info: + client.detach_ancestor(env.initial_tenant, first_branch) + assert info.value.status_code == 400 + def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): """ From bcfe013094a962a62c217fb41e7d02c01361505f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 30 Oct 2024 18:44:29 +0100 Subject: [PATCH 11/32] Don't keep around the timeline's remote_client (#9583) Constructing a remote client is no big deal. Yes, it means an extra download from S3 but it's not that expensive. This simplifies code paths and scenarios to test. 
This unifies timelines that have been recently offloaded with timelines that have been offloaded in an earlier invocation of the process. Part of #8088 --- pageserver/src/tenant.rs | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 90d9feeeb604..8237f4662cea 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -521,13 +521,6 @@ pub struct OffloadedTimeline { /// Present for future flattening deliberations. pub archived_at: NaiveDateTime, - /// Lazily constructed remote client for the timeline - /// - /// If we offload a timeline, we keep around the remote client - /// for the duration of the process. If we find it through the - /// manifest, we don't construct it up until it's needed (deletion). - pub remote_client: Option>, - /// Prevent two tasks from deleting the timeline at the same time. If held, the /// timeline is being deleted. If 'true', the timeline has already been deleted. pub delete_progress: TimelineDeleteProgress, @@ -554,7 +547,6 @@ impl OffloadedTimeline { ancestor_retain_lsn, archived_at, - remote_client: Some(timeline.remote_client.clone()), delete_progress: timeline.delete_progress.clone(), }) } @@ -571,7 +563,6 @@ impl OffloadedTimeline { ancestor_timeline_id, ancestor_retain_lsn, archived_at, - remote_client: None, delete_progress: TimelineDeleteProgress::default(), } } @@ -636,7 +627,7 @@ impl TimelineOrOffloaded { fn maybe_remote_client(&self) -> Option> { match self { TimelineOrOffloaded::Timeline(timeline) => Some(timeline.remote_client.clone()), - TimelineOrOffloaded::Offloaded(offloaded) => offloaded.remote_client.clone(), + TimelineOrOffloaded::Offloaded(_offloaded) => None, } } } From 8d70f88b3704f32e5abdb7e9580ff3bbc9c796b7 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:13:11 -0400 Subject: [PATCH 12/32] refactor(pageserver): use JSON field encoding for consumption metrics cache (#9470) In https://github.com/neondatabase/neon/issues/9032, I would like to eventually add a `generation` field to the consumption metrics cache. The current encoding is not backward compatible and it is hard to add another field into the cache. Therefore, this patch refactors the format to store "field -> value", and it's easier to maintain backward/forward compatibility with this new format. ## Summary of changes * Add `NewRawMetric` as the new format. * Add upgrade path. When opening the disk cache, the codepath first inspects the `version` field, and decide how to decode. * Refactor metrics generation code and tests. * Add tests on upgrade / compatibility with the old format. --------- Signed-off-by: Alex Chi Z --- pageserver/src/consumption_metrics.rs | 60 ++++++++- .../src/consumption_metrics/disk_cache.rs | 38 +++++- pageserver/src/consumption_metrics/metrics.rs | 94 +++++++++----- .../src/consumption_metrics/metrics/tests.rs | 67 +++++++--- pageserver/src/consumption_metrics/upload.rs | 116 +++++++++++++++--- 5 files changed, 300 insertions(+), 75 deletions(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 0c7630edca49..7e8c00c293a4 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -14,6 +14,7 @@ use itertools::Itertools as _; use pageserver_api::models::TenantState; use remote_storage::{GenericRemoteStorage, RemoteStorageConfig}; use reqwest::Url; +use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, SystemTime}; @@ -35,12 +36,62 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); /// upload attempts. 
type RawMetric = (MetricsKey, (EventType, u64)); +/// The new serializable metrics format +#[derive(Serialize, Deserialize)] +struct NewMetricsRoot { + version: usize, + metrics: Vec, +} + +impl NewMetricsRoot { + pub fn is_v2_metrics(json_value: &serde_json::Value) -> bool { + if let Some(ver) = json_value.get("version") { + if let Some(2) = ver.as_u64() { + return true; + } + } + false + } +} + +/// The new serializable metrics format +#[derive(Serialize)] +struct NewMetricsRefRoot<'a> { + version: usize, + metrics: &'a [NewRawMetric], +} + +impl<'a> NewMetricsRefRoot<'a> { + fn new(metrics: &'a [NewRawMetric]) -> Self { + Self { + version: 2, + metrics, + } + } +} + +/// The new serializable metrics format +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +struct NewRawMetric { + key: MetricsKey, + kind: EventType, + value: u64, + // TODO: add generation field and check against generations +} + +impl NewRawMetric { + #[cfg(test)] + fn to_kv_pair(&self) -> (MetricsKey, NewRawMetric) { + (self.key, self.clone()) + } +} + /// Caches the [`RawMetric`]s /// /// In practice, during startup, last sent values are stored here to be used in calculating new /// ones. After successful uploading, the cached values are updated to cache. This used to be used /// for deduplication, but that is no longer needed. 
-type Cache = HashMap; +type Cache = HashMap; pub async fn run( conf: &'static PageServerConf, @@ -231,11 +282,14 @@ async fn restore_and_reschedule( // collect_all_metrics let earlier_metric_at = found_some .iter() - .map(|(_, (et, _))| et.recorded_at()) + .map(|item| item.kind.recorded_at()) .copied() .next(); - let cached = found_some.into_iter().collect::(); + let cached = found_some + .into_iter() + .map(|item| (item.key, item)) + .collect::(); (cached, earlier_metric_at) } diff --git a/pageserver/src/consumption_metrics/disk_cache.rs b/pageserver/src/consumption_metrics/disk_cache.rs index 387bf7a0f930..54a505a134a2 100644 --- a/pageserver/src/consumption_metrics/disk_cache.rs +++ b/pageserver/src/consumption_metrics/disk_cache.rs @@ -2,11 +2,33 @@ use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use std::sync::Arc; -use super::RawMetric; +use crate::consumption_metrics::NewMetricsRefRoot; + +use super::{NewMetricsRoot, NewRawMetric, RawMetric}; + +pub(super) fn read_metrics_from_serde_value( + json_value: serde_json::Value, +) -> anyhow::Result> { + if NewMetricsRoot::is_v2_metrics(&json_value) { + let root = serde_json::from_value::(json_value)?; + Ok(root.metrics) + } else { + let all_metrics = serde_json::from_value::>(json_value)?; + let all_metrics = all_metrics + .into_iter() + .map(|(key, (event_type, value))| NewRawMetric { + key, + kind: event_type, + value, + }) + .collect(); + Ok(all_metrics) + } +} pub(super) async fn read_metrics_from_disk( path: Arc, -) -> anyhow::Result> { +) -> anyhow::Result> { // do not add context to each error, callsite will log with full path let span = tracing::Span::current(); tokio::task::spawn_blocking(move || { @@ -20,7 +42,8 @@ pub(super) async fn read_metrics_from_disk( let mut file = std::fs::File::open(&*path)?; let reader = std::io::BufReader::new(&mut file); - anyhow::Ok(serde_json::from_reader::<_, Vec>(reader)?) 
+ let json_value = serde_json::from_reader::<_, serde_json::Value>(reader)?; + read_metrics_from_serde_value(json_value) }) .await .context("read metrics join error") @@ -63,7 +86,7 @@ fn scan_and_delete_with_same_prefix(path: &Utf8Path) -> std::io::Result<()> { } pub(super) async fn flush_metrics_to_disk( - current_metrics: &Arc>, + current_metrics: &Arc>, path: &Arc, ) -> anyhow::Result<()> { use std::io::Write; @@ -93,8 +116,11 @@ pub(super) async fn flush_metrics_to_disk( // write out all of the raw metrics, to be read out later on restart as cached values { let mut writer = std::io::BufWriter::new(&mut tempfile); - serde_json::to_writer(&mut writer, &*current_metrics) - .context("serialize metrics")?; + serde_json::to_writer( + &mut writer, + &NewMetricsRefRoot::new(current_metrics.as_ref()), + ) + .context("serialize metrics")?; writer .into_inner() .map_err(|_| anyhow::anyhow!("flushing metrics failed"))?; diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 7ba2d04c4f7e..07fac09f6fcf 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -9,7 +9,7 @@ use utils::{ lsn::Lsn, }; -use super::{Cache, RawMetric}; +use super::{Cache, NewRawMetric}; /// Name of the metric, used by `MetricsKey` factory methods and `deserialize_cached_events` /// instead of static str. 
@@ -64,11 +64,21 @@ impl MetricsKey { struct AbsoluteValueFactory(MetricsKey); impl AbsoluteValueFactory { - const fn at(self, time: DateTime, val: u64) -> RawMetric { + #[cfg(test)] + const fn at_old_format(self, time: DateTime, val: u64) -> super::RawMetric { let key = self.0; (key, (EventType::Absolute { time }, val)) } + const fn at(self, time: DateTime, val: u64) -> NewRawMetric { + let key = self.0; + NewRawMetric { + key, + kind: EventType::Absolute { time }, + value: val, + } + } + fn key(&self) -> &MetricsKey { &self.0 } @@ -84,7 +94,28 @@ impl IncrementalValueFactory { prev_end: DateTime, up_to: DateTime, val: u64, - ) -> RawMetric { + ) -> NewRawMetric { + let key = self.0; + // cannot assert prev_end < up_to because these are realtime clock based + let when = EventType::Incremental { + start_time: prev_end, + stop_time: up_to, + }; + NewRawMetric { + key, + kind: when, + value: val, + } + } + + #[allow(clippy::wrong_self_convention)] + #[cfg(test)] + const fn from_until_old_format( + self, + prev_end: DateTime, + up_to: DateTime, + val: u64, + ) -> super::RawMetric { let key = self.0; // cannot assert prev_end < up_to because these are realtime clock based let when = EventType::Incremental { @@ -185,7 +216,7 @@ pub(super) async fn collect_all_metrics( tenant_manager: &Arc, cached_metrics: &Cache, ctx: &RequestContext, -) -> Vec { +) -> Vec { use pageserver_api::models::TenantState; let started_at = std::time::Instant::now(); @@ -220,11 +251,11 @@ pub(super) async fn collect_all_metrics( res } -async fn collect(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec +async fn collect(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec where S: futures::stream::Stream)>, { - let mut current_metrics: Vec = Vec::new(); + let mut current_metrics: Vec = Vec::new(); let mut tenants = std::pin::pin!(tenants); @@ -291,7 +322,7 @@ impl TenantSnapshot { tenant_id: TenantId, now: DateTime, cached: &Cache, - metrics: &mut Vec, + metrics: &mut Vec, ) { let 
remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size); @@ -302,9 +333,9 @@ impl TenantSnapshot { let mut synthetic_size = self.synthetic_size; if synthetic_size == 0 { - if let Some((_, value)) = cached.get(factory.key()) { - // use the latest value from previous session - synthetic_size = *value; + if let Some(item) = cached.get(factory.key()) { + // use the latest value from previous session, TODO: check generation number + synthetic_size = item.value; } } @@ -381,37 +412,36 @@ impl TimelineSnapshot { tenant_id: TenantId, timeline_id: TimelineId, now: DateTime, - metrics: &mut Vec, + metrics: &mut Vec, cache: &Cache, ) { let timeline_written_size = u64::from(self.last_record_lsn); let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id); - let last_stop_time = cache - .get(written_size_delta_key.key()) - .map(|(until, _val)| { - until - .incremental_timerange() - .expect("never create EventType::Absolute for written_size_delta") - .end - }); + let last_stop_time = cache.get(written_size_delta_key.key()).map(|item| { + item.kind + .incremental_timerange() + .expect("never create EventType::Absolute for written_size_delta") + .end + }); - let (key, written_size_now) = + let written_size_now = MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size); // by default, use the last sent written_size as the basis for // calculating the delta. if we don't yet have one, use the load time value. - let prev = cache - .get(&key) - .map(|(prev_at, prev)| { + let prev: (DateTime, u64) = cache + .get(&written_size_now.key) + .map(|item| { // use the prev time from our last incremental update, or default to latest // absolute update on the first round. 
- let prev_at = prev_at + let prev_at = item + .kind .absolute_time() .expect("never create EventType::Incremental for written_size"); let prev_at = last_stop_time.unwrap_or(prev_at); - (*prev_at, *prev) + (*prev_at, item.value) }) .unwrap_or_else(|| { // if we don't have a previous point of comparison, compare to the load time @@ -422,24 +452,28 @@ impl TimelineSnapshot { let up_to = now; - if let Some(delta) = written_size_now.1.checked_sub(prev.1) { + if let Some(delta) = written_size_now.value.checked_sub(prev.1) { let key_value = written_size_delta_key.from_until(prev.0, up_to, delta); // written_size_delta metrics.push(key_value); // written_size - metrics.push((key, written_size_now)); + metrics.push(written_size_now); } else { // the cached value was ahead of us, report zero until we've caught up metrics.push(written_size_delta_key.from_until(prev.0, up_to, 0)); // the cached value was ahead of us, report the same until we've caught up - metrics.push((key, (written_size_now.0, prev.1))); + metrics.push(NewRawMetric { + key: written_size_now.key, + kind: written_size_now.kind, + value: prev.1, + }); } { let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id); let current_or_previous = self .current_exact_logical_size - .or_else(|| cache.get(factory.key()).map(|(_, val)| *val)); + .or_else(|| cache.get(factory.key()).map(|item| item.value)); if let Some(size) = current_or_previous { metrics.push(factory.at(now, size)); @@ -452,4 +486,4 @@ impl TimelineSnapshot { mod tests; #[cfg(test)] -pub(crate) use tests::metric_examples; +pub(crate) use tests::{metric_examples, metric_examples_old}; diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs index f9cbcea565cf..3ed7b4412304 100644 --- a/pageserver/src/consumption_metrics/metrics/tests.rs +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -1,3 +1,5 @@ +use crate::consumption_metrics::RawMetric; + use super::*; use 
std::collections::HashMap; @@ -50,9 +52,9 @@ fn startup_collected_timeline_metrics_second_round() { let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); let mut metrics = Vec::new(); - let cache = HashMap::from([ - MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0) - ]); + let cache = HashMap::from([MetricsKey::written_size(tenant_id, timeline_id) + .at(before, disk_consistent_lsn.0) + .to_kv_pair()]); let snap = TimelineSnapshot { loaded_at: (disk_consistent_lsn, init), @@ -89,9 +91,13 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() { let mut metrics = Vec::new(); let cache = HashMap::from([ // at t=before was the last time the last_record_lsn changed - MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0), + MetricsKey::written_size(tenant_id, timeline_id) + .at(before, disk_consistent_lsn.0) + .to_kv_pair(), // end time of this event is used for the next ones - MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, just_before, 0), + MetricsKey::written_size_delta(tenant_id, timeline_id) + .from_until(before, just_before, 0) + .to_kv_pair(), ]); let snap = TimelineSnapshot { @@ -138,13 +144,17 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() { }; let mut cache = HashMap::from([ - MetricsKey::written_size(tenant_id, timeline_id).at(before_restart, 100), - MetricsKey::written_size_delta(tenant_id, timeline_id).from_until( - way_before, - before_restart, - // not taken into account, but the timestamps are important - 999_999_999, - ), + MetricsKey::written_size(tenant_id, timeline_id) + .at(before_restart, 100) + .to_kv_pair(), + MetricsKey::written_size_delta(tenant_id, timeline_id) + .from_until( + way_before, + before_restart, + // not taken into account, but the timestamps are important + 999_999_999, + ) + .to_kv_pair(), ]); let mut metrics = Vec::new(); @@ -163,7 +173,7 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() { ); // now 
if we cache these metrics, and re-run while "still in recovery" - cache.extend(metrics.drain(..)); + cache.extend(metrics.drain(..).map(|x| x.to_kv_pair())); // "still in recovery", because our snapshot did not change snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache); @@ -194,14 +204,14 @@ fn post_restart_current_exact_logical_size_uses_cached() { current_exact_logical_size: None, }; - let cache = HashMap::from([ - MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(before_restart, 100) - ]); + let cache = HashMap::from([MetricsKey::timeline_logical_size(tenant_id, timeline_id) + .at(before_restart, 100) + .to_kv_pair()]); let mut metrics = Vec::new(); snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); - metrics.retain(|(key, _)| key.metric == Name::LogicalSize); + metrics.retain(|item| item.key.metric == Name::LogicalSize); assert_eq!( metrics, @@ -224,7 +234,9 @@ fn post_restart_synthetic_size_uses_cached_if_available() { let before_restart = DateTime::::from(now - std::time::Duration::from_secs(5 * 60)); let now = DateTime::::from(now); - let cached = HashMap::from([MetricsKey::synthetic_size(tenant_id).at(before_restart, 1000)]); + let cached = HashMap::from([MetricsKey::synthetic_size(tenant_id) + .at(before_restart, 1000) + .to_kv_pair()]); let mut metrics = Vec::new(); ts.to_metrics(tenant_id, now, &cached, &mut metrics); @@ -278,12 +290,29 @@ fn time_backwards() -> [std::time::SystemTime; N] { times } -pub(crate) const fn metric_examples( +pub(crate) const fn metric_examples_old( tenant_id: TenantId, timeline_id: TimelineId, now: DateTime, before: DateTime, ) -> [RawMetric; 6] { + [ + MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0), + MetricsKey::written_size_delta(tenant_id, timeline_id) + .from_until_old_format(before, now, 0), + MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0), + MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0), + 
MetricsKey::resident_size(tenant_id).at_old_format(now, 0), + MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1), + ] +} + +pub(crate) const fn metric_examples( + tenant_id: TenantId, + timeline_id: TimelineId, + now: DateTime, + before: DateTime, +) -> [NewRawMetric; 6] { [ MetricsKey::written_size(tenant_id, timeline_id).at(now, 0), MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0), diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 1eb25d337b48..1cb4e917c081 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -7,7 +7,7 @@ use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::Instrument; -use super::{metrics::Name, Cache, MetricsKey, RawMetric}; +use super::{metrics::Name, Cache, MetricsKey, NewRawMetric, RawMetric}; use utils::id::{TenantId, TimelineId}; /// How the metrics from pageserver are identified. 
@@ -24,7 +24,7 @@ pub(super) async fn upload_metrics_http( client: &reqwest::Client, metric_collection_endpoint: &reqwest::Url, cancel: &CancellationToken, - metrics: &[RawMetric], + metrics: &[NewRawMetric], cached_metrics: &mut Cache, idempotency_keys: &[IdempotencyKey<'_>], ) -> anyhow::Result<()> { @@ -53,8 +53,8 @@ pub(super) async fn upload_metrics_http( match res { Ok(()) => { - for (curr_key, curr_val) in chunk { - cached_metrics.insert(*curr_key, *curr_val); + for item in chunk { + cached_metrics.insert(item.key, item.clone()); } uploaded += chunk.len(); } @@ -86,7 +86,7 @@ pub(super) async fn upload_metrics_bucket( client: &GenericRemoteStorage, cancel: &CancellationToken, node_id: &str, - metrics: &[RawMetric], + metrics: &[NewRawMetric], idempotency_keys: &[IdempotencyKey<'_>], ) -> anyhow::Result<()> { if metrics.is_empty() { @@ -140,16 +140,16 @@ pub(super) async fn upload_metrics_bucket( /// across different metrics sinks), and must have the same length as input. fn serialize_in_chunks<'a>( chunk_size: usize, - input: &'a [RawMetric], + input: &'a [NewRawMetric], idempotency_keys: &'a [IdempotencyKey<'a>], -) -> impl ExactSizeIterator> + 'a +) -> impl ExactSizeIterator> + 'a { use bytes::BufMut; assert_eq!(input.len(), idempotency_keys.len()); struct Iter<'a> { - inner: std::slice::Chunks<'a, RawMetric>, + inner: std::slice::Chunks<'a, NewRawMetric>, idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>, chunk_size: usize, @@ -160,7 +160,7 @@ fn serialize_in_chunks<'a>( } impl<'a> Iterator for Iter<'a> { - type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>; + type Item = Result<(&'a [NewRawMetric], bytes::Bytes), serde_json::Error>; fn next(&mut self) -> Option { let chunk = self.inner.next()?; @@ -269,6 +269,58 @@ impl RawMetricExt for RawMetric { } } +impl RawMetricExt for NewRawMetric { + fn as_event(&self, key: &IdempotencyKey<'_>) -> Event { + let MetricsKey { + metric, + tenant_id, + timeline_id, + } = self.key; + + let 
kind = self.kind; + let value = self.value; + + Event { + kind, + metric, + idempotency_key: key.to_string(), + value, + extra: Ids { + tenant_id, + timeline_id, + }, + } + } + + fn update_in_place(&self, event: &mut Event, key: &IdempotencyKey<'_>) { + use std::fmt::Write; + + let MetricsKey { + metric, + tenant_id, + timeline_id, + } = self.key; + + let kind = self.kind; + let value = self.value; + + *event = Event { + kind, + metric, + idempotency_key: { + event.idempotency_key.clear(); + write!(event.idempotency_key, "{key}").unwrap(); + std::mem::take(&mut event.idempotency_key) + }, + value, + extra: Ids { + tenant_id, + timeline_id, + }, + }; + } +} + pub(crate) trait KeyGen<'a> { fn generate(&self) -> IdempotencyKey<'a>; } @@ -381,6 +433,10 @@ async fn upload( #[cfg(test)] mod tests { + use crate::consumption_metrics::{ + disk_cache::read_metrics_from_serde_value, NewMetricsRefRoot, + }; + use super::*; use chrono::{DateTime, Utc}; use once_cell::sync::Lazy; @@ -473,23 +529,49 @@ mod tests { let idempotency_key = consumption_metrics::IdempotencyKey::for_tests(*SAMPLES_NOW, "1", 0); let examples = examples.into_iter().zip(metric_samples()); - for ((line, expected), (key, (kind, value))) in examples { + for ((line, expected), item) in examples { let e = consumption_metrics::Event { - kind, - metric: key.metric, + kind: item.kind, + metric: item.key.metric, idempotency_key: idempotency_key.to_string(), - value, + value: item.value, extra: Ids { - tenant_id: key.tenant_id, - timeline_id: key.timeline_id, + tenant_id: item.key.tenant_id, + timeline_id: item.key.timeline_id, }, }; let actual = serde_json::to_string(&e).unwrap(); - assert_eq!(expected, actual, "example for {kind:?} from line {line}"); + assert_eq!( + expected, actual, + "example for {:?} from line {line}", + item.kind + ); } } - fn metric_samples() -> [RawMetric; 6] { + #[test] + fn disk_format_upgrade() { + let old_samples_json = serde_json::to_value(metric_samples_old()).unwrap(); + let 
new_samples = + serde_json::to_value(NewMetricsRefRoot::new(metric_samples().as_ref())).unwrap(); + let upgraded_samples = read_metrics_from_serde_value(old_samples_json).unwrap(); + let new_samples = read_metrics_from_serde_value(new_samples).unwrap(); + assert_eq!(upgraded_samples, new_samples); + } + + fn metric_samples_old() -> [RawMetric; 6] { + let tenant_id = TenantId::from_array([0; 16]); + let timeline_id = TimelineId::from_array([0xff; 16]); + + let before = DateTime::parse_from_rfc3339("2023-09-14T00:00:00.123456789Z") + .unwrap() + .into(); + let [now, before] = [*SAMPLES_NOW, before]; + + super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before) + } + + fn metric_samples() -> [NewRawMetric; 6] { let tenant_id = TenantId::from_array([0; 16]); let timeline_id = TimelineId::from_array([0xff; 16]); From 65b69392ea156ff04a3b4fc1609ba7b990ddbe27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 30 Oct 2024 19:37:09 +0100 Subject: [PATCH 13/32] Disallow offloaded children during timeline deletion (#9582) If we delete a timeline that has childen, those children will have their data corrupted. Therefore, extend the already existing safety check to offloaded timelines as well. 
Part of #8088 --- pageserver/src/tenant/timeline/delete.rs | 39 ++++++++++---------- pageserver/src/tenant/timeline/offload.rs | 4 +- test_runner/regress/test_timeline_archive.py | 7 ++++ 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 2c6161da15b7..b0c4fa2bc995 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -214,7 +214,8 @@ impl DeleteTimelineFlow { ) -> Result<(), DeleteTimelineError> { super::debug_assert_current_span_has_tenant_and_timeline_id(); - let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?; + let allow_offloaded_children = false; + let (timeline, mut guard) = Self::prepare(tenant, timeline_id, allow_offloaded_children)?; guard.mark_in_progress()?; @@ -340,6 +341,7 @@ impl DeleteTimelineFlow { pub(super) fn prepare( tenant: &Tenant, timeline_id: TimelineId, + allow_offloaded_children: bool, ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> { // Note the interaction between this guard and deletion guard. // Here we attempt to lock deletion guard when we're holding a lock on timelines. 
@@ -352,30 +354,27 @@ impl DeleteTimelineFlow { // T1: acquire deletion lock, do another `DeleteTimelineFlow::run` // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346` let timelines = tenant.timelines.lock().unwrap(); + let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); let timeline = match timelines.get(&timeline_id) { Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)), - None => { - let offloaded_timelines = tenant.timelines_offloaded.lock().unwrap(); - match offloaded_timelines.get(&timeline_id) { - Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)), - None => return Err(DeleteTimelineError::NotFound), - } - } + None => match timelines_offloaded.get(&timeline_id) { + Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)), + None => return Err(DeleteTimelineError::NotFound), + }, }; - // Ensure that there are no child timelines **attached to that pageserver**, - // because detach removes files, which will break child branches - let children: Vec = timelines - .iter() - .filter_map(|(id, entry)| { - if entry.get_ancestor_timeline_id() == Some(timeline_id) { - Some(*id) - } else { - None - } - }) - .collect(); + // Ensure that there are no child timelines, because we are about to remove files, + // which will break child branches + let mut children = Vec::new(); + if !allow_offloaded_children { + children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| { + (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id) + })); + } + children.extend(timelines.iter().filter_map(|(id, entry)| { + (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id) + })); if !children.is_empty() { return Err(DeleteTimelineError::HasChildren(children)); diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 305c139b54e4..5b196cf8a79f 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ 
b/pageserver/src/tenant/timeline/offload.rs @@ -12,7 +12,9 @@ pub(crate) async fn offload_timeline( debug_assert_current_span_has_tenant_and_timeline_id(); tracing::info!("offloading archived timeline"); - let (timeline, guard) = DeleteTimelineFlow::prepare(tenant, timeline.timeline_id)?; + let allow_offloaded_children = true; + let (timeline, guard) = + DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children)?; let TimelineOrOffloaded::Timeline(timeline) = timeline else { tracing::error!("timeline already offloaded, but given timeline object"); diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 77efd7b74981..3e9812c38a27 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -213,6 +213,13 @@ def leaf_offloaded(): wait_until(30, 1, leaf_offloaded) wait_until(30, 1, parent_offloaded) + # Offloaded child timelines should still prevent deletion + with pytest.raises( + PageserverApiException, + match=f".* timeline which has child timelines: \\[{leaf_timeline_id}\\]", + ): + ps_http.timeline_delete(tenant_id, parent_timeline_id) + ps_http.timeline_archival_config( tenant_id, grandparent_timeline_id, From 411c3aa0d62a4d8a2e18b43dc03b677bf1969d66 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 31 Oct 2024 10:47:43 +0000 Subject: [PATCH 14/32] pageserver: lift decoding and interpreting of wal into wal_decoder (#9524) ## Problem Decoding and ingestion are still coupled in `pageserver::WalIngest`. ## Summary of changes A new type is added to `wal_decoder::models`, InterpretedWalRecord. This type contains everything that the pageserver requires in order to ingest a WAL record. The highlights are the `metadata_record` which is an optional special record type to be handled and `blocks` which stores key, value pairs to be persisted to storage. 
This type is produced by `wal_decoder::models::InterpretedWalRecord::from_bytes` from a raw PG wal record. The rest of this commit separates decoding and interpretation of the PG WAL record from its application in `WalIngest::ingest_record`. Related: https://github.com/neondatabase/neon/issues/9335 Epic: https://github.com/neondatabase/neon/issues/9329 --- libs/wal_decoder/src/decoder.rs | 969 ++++++++++++++ libs/wal_decoder/src/models.rs | 44 + pageserver/src/import_datadir.rs | 23 +- .../walreceiver/walreceiver_connection.rs | 16 +- pageserver/src/walingest.rs | 1188 ++--------------- 5 files changed, 1170 insertions(+), 1070 deletions(-) diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs index 8b137891791f..780fce3d6903 100644 --- a/libs/wal_decoder/src/decoder.rs +++ b/libs/wal_decoder/src/decoder.rs @@ -1 +1,970 @@ +//! This module contains logic for decoding and interpreting +//! raw bytes which represent a raw Postgres WAL record. +use crate::models::*; +use bytes::{Buf, Bytes, BytesMut}; +use pageserver_api::key::rel_block_to_key; +use pageserver_api::record::NeonWalRecord; +use pageserver_api::reltag::{RelTag, SlruKind}; +use pageserver_api::shard::ShardIdentity; +use pageserver_api::value::Value; +use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; +use postgres_ffi::walrecord::*; +use postgres_ffi::{page_is_new, page_set_lsn, pg_constants, BLCKSZ}; +use utils::lsn::Lsn; + +impl InterpretedWalRecord { + /// Decode and interpreted raw bytes which represent one Postgres WAL record. + /// Data blocks which do not match the provided shard identity are filtered out. + /// Shard 0 is a special case since it tracks all relation sizes. We only give it + /// the keys that are being written as that is enough for updating relation sizes. 
+ pub fn from_bytes_filtered( + buf: Bytes, + shard: &ShardIdentity, + lsn: Lsn, + pg_version: u32, + ) -> anyhow::Result { + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(buf, &mut decoded, pg_version)?; + + let flush_uncommitted = if decoded.is_dbase_create_copy(pg_version) { + FlushUncommittedRecords::Yes + } else { + FlushUncommittedRecords::No + }; + + let metadata_record = MetadataRecord::from_decoded(&decoded, lsn, pg_version)?; + + let mut blocks = Vec::default(); + for blk in decoded.blocks.iter() { + let rel = RelTag { + spcnode: blk.rnode_spcnode, + dbnode: blk.rnode_dbnode, + relnode: blk.rnode_relnode, + forknum: blk.forknum, + }; + + let key = rel_block_to_key(rel, blk.blkno); + + if !key.is_valid_key_on_write_path() { + anyhow::bail!("Unsupported key decoded at LSN {}: {}", lsn, key); + } + + let key_is_local = shard.is_key_local(&key); + + tracing::debug!( + lsn=%lsn, + key=%key, + "ingest: shard decision {}", + if !key_is_local { "drop" } else { "keep" }, + ); + + if !key_is_local { + if shard.is_shard_zero() { + // Shard 0 tracks relation sizes. Although we will not store this block, we will observe + // its blkno in case it implicitly extends a relation. + blocks.push((key.to_compact(), None)); + } + + continue; + } + + // Instead of storing full-page-image WAL record, + // it is better to store extracted image: we can skip wal-redo + // in this case. Also some FPI records may contain multiple (up to 32) pages, + // so them have to be copied multiple times. 
+ // + let value = if blk.apply_image + && blk.has_image + && decoded.xl_rmid == pg_constants::RM_XLOG_ID + && (decoded.xl_info == pg_constants::XLOG_FPI + || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) + // compression of WAL is not yet supported: fall back to storing the original WAL record + && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version) + // do not materialize null pages because them most likely be soon replaced with real data + && blk.bimg_len != 0 + { + // Extract page image from FPI record + let img_len = blk.bimg_len as usize; + let img_offs = blk.bimg_offset as usize; + let mut image = BytesMut::with_capacity(BLCKSZ as usize); + // TODO(vlad): skip the copy + image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]); + + if blk.hole_length != 0 { + let tail = image.split_off(blk.hole_offset as usize); + image.resize(image.len() + blk.hole_length as usize, 0u8); + image.unsplit(tail); + } + // + // Match the logic of XLogReadBufferForRedoExtended: + // The page may be uninitialized. If so, we can't set the LSN because + // that would corrupt the page. + // + if !page_is_new(&image) { + page_set_lsn(&mut image, lsn) + } + assert_eq!(image.len(), BLCKSZ as usize); + + Value::Image(image.freeze()) + } else { + Value::WalRecord(NeonWalRecord::Postgres { + will_init: blk.will_init || blk.apply_image, + rec: decoded.record.clone(), + }) + }; + + blocks.push((key.to_compact(), Some(value))); + } + + Ok(InterpretedWalRecord { + metadata_record, + blocks, + lsn, + flush_uncommitted, + xid: decoded.xl_xid, + }) + } +} + +impl MetadataRecord { + fn from_decoded( + decoded: &DecodedWALRecord, + lsn: Lsn, + pg_version: u32, + ) -> anyhow::Result> { + // Note: this doesn't actually copy the bytes since + // the [`Bytes`] type implements it via a level of indirection. 
+ let mut buf = decoded.record.clone(); + buf.advance(decoded.main_data_offset); + + match decoded.xl_rmid { + pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { + Self::decode_heapam_record(&mut buf, decoded, pg_version) + } + pg_constants::RM_NEON_ID => Self::decode_neonmgr_record(&mut buf, decoded, pg_version), + // Handle other special record types + pg_constants::RM_SMGR_ID => Self::decode_smgr_record(&mut buf, decoded), + pg_constants::RM_DBASE_ID => Self::decode_dbase_record(&mut buf, decoded, pg_version), + pg_constants::RM_TBLSPC_ID => { + tracing::trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet"); + Ok(None) + } + pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version), + pg_constants::RM_XACT_ID => Self::decode_xact_record(&mut buf, decoded, lsn), + pg_constants::RM_MULTIXACT_ID => { + Self::decode_multixact_record(&mut buf, decoded, pg_version) + } + pg_constants::RM_RELMAP_ID => Self::decode_relmap_record(&mut buf, decoded), + // This is an odd duck. It needs to go to all shards. + // Since it uses the checkpoint image (that's initialized from CHECKPOINT_KEY + // in WalIngest::new), we have to send the whole DecodedWalRecord::record to + // the pageserver and decode it there. + // + // Alternatively, one can make the checkpoint part of the subscription protocol + // to the pageserver. This should work fine, but can be done at a later point. 
+ pg_constants::RM_XLOG_ID => Self::decode_xlog_record(&mut buf, decoded, lsn), + pg_constants::RM_LOGICALMSG_ID => { + Self::decode_logical_message_record(&mut buf, decoded) + } + pg_constants::RM_STANDBY_ID => Self::decode_standby_record(&mut buf, decoded), + pg_constants::RM_REPLORIGIN_ID => Self::decode_replorigin_record(&mut buf, decoded), + _unexpected => { + // TODO: consider failing here instead of blindly doing something without + // understanding the protocol + Ok(None) + } + } + } + + fn decode_heapam_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + pg_version: u32, + ) -> anyhow::Result> { + // Handle VM bit updates that are implicitly part of heap records. + + // First, look at the record to determine which VM bits need + // to be cleared. If either of these variables is set, we + // need to clear the corresponding bits in the visibility map. + let mut new_heap_blkno: Option = None; + let mut old_heap_blkno: Option = None; + let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; + + match pg_version { + 14 => { + if decoded.xl_rmid == pg_constants::RM_HEAP_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + if info == pg_constants::XLOG_HEAP_INSERT { + let xlrec = v14::XlHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_DELETE { + let xlrec = v14::XlHeapDelete::decode(buf); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_UPDATE + || info == pg_constants::XLOG_HEAP_HOT_UPDATE + { + let xlrec = v14::XlHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. 
+ if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_LOCK { + let xlrec = v14::XlHeapLock::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { + let xlrec = v14::XlHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { + let xlrec = v14::XlHeapLockUpdated::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else { + anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); + } + } + 15 => { + if decoded.xl_rmid == pg_constants::RM_HEAP_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + if info == pg_constants::XLOG_HEAP_INSERT { + let xlrec = v15::XlHeapInsert::decode(buf); + assert_eq!(0, 
buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_DELETE { + let xlrec = v15::XlHeapDelete::decode(buf); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_UPDATE + || info == pg_constants::XLOG_HEAP_HOT_UPDATE + { + let xlrec = v15::XlHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. 
+ new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_LOCK { + let xlrec = v15::XlHeapLock::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { + let xlrec = v15::XlHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { + let xlrec = v15::XlHeapLockUpdated::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else { + anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); + } + } + 16 => { + if decoded.xl_rmid == pg_constants::RM_HEAP_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + if info == pg_constants::XLOG_HEAP_INSERT { + let xlrec = v16::XlHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_DELETE { + let xlrec = v16::XlHeapDelete::decode(buf); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_UPDATE + || info == 
pg_constants::XLOG_HEAP_HOT_UPDATE + { + let xlrec = v16::XlHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_LOCK { + let xlrec = v16::XlHeapLock::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { + let xlrec = v16::XlHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { + let xlrec = v16::XlHeapLockUpdated::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else { + anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); + } 
+ } + 17 => { + if decoded.xl_rmid == pg_constants::RM_HEAP_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + if info == pg_constants::XLOG_HEAP_INSERT { + let xlrec = v17::XlHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_DELETE { + let xlrec = v17::XlHeapDelete::decode(buf); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_UPDATE + || info == pg_constants::XLOG_HEAP_HOT_UPDATE + { + let xlrec = v17::XlHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. 
+ new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_LOCK { + let xlrec = v17::XlHeapLock::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { + let xlrec = v17::XlHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { + let xlrec = v17::XlHeapLockUpdated::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else { + anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); + } + } + _ => {} + } + + if new_heap_blkno.is_some() || old_heap_blkno.is_some() { + let vm_rel = RelTag { + forknum: VISIBILITYMAP_FORKNUM, + spcnode: decoded.blocks[0].rnode_spcnode, + dbnode: decoded.blocks[0].rnode_dbnode, + relnode: decoded.blocks[0].rnode_relnode, + }; + + Ok(Some(MetadataRecord::Heapam(HeapamRecord::ClearVmBits( + ClearVmBits { + new_heap_blkno, + old_heap_blkno, + vm_rel, + flags, + }, + )))) + } else { + Ok(None) + } + } + + fn decode_neonmgr_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + pg_version: u32, + ) -> anyhow::Result> { + // Handle VM bit updates that are implicitly part of heap records. 
+ + // First, look at the record to determine which VM bits need + // to be cleared. If either of these variables is set, we + // need to clear the corresponding bits in the visibility map. + let mut new_heap_blkno: Option = None; + let mut old_heap_blkno: Option = None; + let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; + + assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); + + match pg_version { + 16 | 17 => { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + match info { + pg_constants::XLOG_NEON_HEAP_INSERT => { + let xlrec = v17::rm_neon::XlNeonHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } + pg_constants::XLOG_NEON_HEAP_DELETE => { + let xlrec = v17::rm_neon::XlNeonHeapDelete::decode(buf); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } + pg_constants::XLOG_NEON_HEAP_UPDATE + | pg_constants::XLOG_NEON_HEAP_HOT_UPDATE => { + let xlrec = v17::rm_neon::XlNeonHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. 
+ new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } + pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => { + let xlrec = v17::rm_neon::XlNeonHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } + pg_constants::XLOG_NEON_HEAP_LOCK => { + let xlrec = v17::rm_neon::XlNeonHeapLock::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + info => anyhow::bail!("Unknown WAL record type for Neon RMGR: {}", info), + } + } + _ => anyhow::bail!( + "Neon RMGR has no known compatibility with PostgreSQL version {}", + pg_version + ), + } + + if new_heap_blkno.is_some() || old_heap_blkno.is_some() { + let vm_rel = RelTag { + forknum: VISIBILITYMAP_FORKNUM, + spcnode: decoded.blocks[0].rnode_spcnode, + dbnode: decoded.blocks[0].rnode_dbnode, + relnode: decoded.blocks[0].rnode_relnode, + }; + + Ok(Some(MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits( + ClearVmBits { + new_heap_blkno, + old_heap_blkno, + vm_rel, + flags, + }, + )))) + } else { + Ok(None) + } + } + + fn decode_smgr_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_SMGR_CREATE { + let create = XlSmgrCreate::decode(buf); + let rel = RelTag { + spcnode: create.rnode.spcnode, + dbnode: create.rnode.dbnode, + relnode: create.rnode.relnode, + forknum: create.forknum, + }; + + return Ok(Some(MetadataRecord::Smgr(SmgrRecord::Create(SmgrCreate { + rel, + })))); + } else if info == 
pg_constants::XLOG_SMGR_TRUNCATE { + let truncate = XlSmgrTruncate::decode(buf); + return Ok(Some(MetadataRecord::Smgr(SmgrRecord::Truncate(truncate)))); + } + + Ok(None) + } + + fn decode_dbase_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + pg_version: u32, + ) -> anyhow::Result> { + // TODO: Refactor this to avoid the duplication between postgres versions. + + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + tracing::debug!(%info, %pg_version, "handle RM_DBASE_ID"); + + if pg_version == 14 { + if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE { + let createdb = XlCreateDatabase::decode(buf); + tracing::debug!("XLOG_DBASE_CREATE v14"); + + let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + })); + + return Ok(Some(record)); + } else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(buf); + + let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + })); + + return Ok(Some(record)); + } + } else if pg_version == 15 { + if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG { + tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. 
+ tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY"); + + let createdb = XlCreateDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + })); + + return Ok(Some(record)); + } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + })); + + return Ok(Some(record)); + } + } else if pg_version == 16 { + if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG { + tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. + tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY"); + + let createdb = XlCreateDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + })); + + return Ok(Some(record)); + } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + })); + + return Ok(Some(record)); + } + } else if pg_version == 17 { + if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG { + tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. 
+ // So we can reuse XlCreateDatabase here. + tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY"); + + let createdb = XlCreateDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + })); + + return Ok(Some(record)); + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + })); + + return Ok(Some(record)); + } + } + + Ok(None) + } + + fn decode_clog_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + pg_version: u32, + ) -> anyhow::Result> { + let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK; + + if info == pg_constants::CLOG_ZEROPAGE { + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; + let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + + Ok(Some(MetadataRecord::Clog(ClogRecord::ZeroPage( + ClogZeroPage { segno, rpageno }, + )))) + } else { + assert!(info == pg_constants::CLOG_TRUNCATE); + let xlrec = XlClogTruncate::decode(buf, pg_version); + + Ok(Some(MetadataRecord::Clog(ClogRecord::Truncate( + ClogTruncate { + pageno: xlrec.pageno, + oldest_xid: xlrec.oldest_xid, + oldest_xid_db: xlrec.oldest_xid_db, + }, + )))) + } + } + + fn decode_xact_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + lsn: Lsn, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; + let origin_id = decoded.origin_id; + let xl_xid = decoded.xl_xid; + + if info == pg_constants::XLOG_XACT_COMMIT { + let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); + return Ok(Some(MetadataRecord::Xact(XactRecord::Commit(XactCommon { + parsed, 
+ origin_id, + xl_xid, + lsn, + })))); + } else if info == pg_constants::XLOG_XACT_ABORT { + let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); + return Ok(Some(MetadataRecord::Xact(XactRecord::Abort(XactCommon { + parsed, + origin_id, + xl_xid, + lsn, + })))); + } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED { + let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); + return Ok(Some(MetadataRecord::Xact(XactRecord::CommitPrepared( + XactCommon { + parsed, + origin_id, + xl_xid, + lsn, + }, + )))); + } else if info == pg_constants::XLOG_XACT_ABORT_PREPARED { + let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); + return Ok(Some(MetadataRecord::Xact(XactRecord::AbortPrepared( + XactCommon { + parsed, + origin_id, + xl_xid, + lsn, + }, + )))); + } else if info == pg_constants::XLOG_XACT_PREPARE { + return Ok(Some(MetadataRecord::Xact(XactRecord::Prepare( + XactPrepare { + xl_xid: decoded.xl_xid, + data: Bytes::copy_from_slice(&buf[..]), + }, + )))); + } + + Ok(None) + } + + fn decode_multixact_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + pg_version: u32, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + + if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE + || info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE + { + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; + let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + + let slru_kind = match info { + pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE => SlruKind::MultiXactOffsets, + pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE => SlruKind::MultiXactMembers, + _ => unreachable!(), + }; + + return Ok(Some(MetadataRecord::MultiXact(MultiXactRecord::ZeroPage( + MultiXactZeroPage { + slru_kind, + segno, + rpageno, + }, + )))); + } else if info == 
pg_constants::XLOG_MULTIXACT_CREATE_ID { + let xlrec = XlMultiXactCreate::decode(buf); + return Ok(Some(MetadataRecord::MultiXact(MultiXactRecord::Create( + xlrec, + )))); + } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { + let xlrec = XlMultiXactTruncate::decode(buf); + return Ok(Some(MetadataRecord::MultiXact(MultiXactRecord::Truncate( + xlrec, + )))); + } + + Ok(None) + } + + fn decode_relmap_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + ) -> anyhow::Result> { + let update = XlRelmapUpdate::decode(buf); + + let mut buf = decoded.record.clone(); + buf.advance(decoded.main_data_offset); + // skip xl_relmap_update + buf.advance(12); + + Ok(Some(MetadataRecord::Relmap(RelmapRecord::Update( + RelmapUpdate { + update, + buf: Bytes::copy_from_slice(&buf[..]), + }, + )))) + } + + fn decode_xlog_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + lsn: Lsn, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + Ok(Some(MetadataRecord::Xlog(XlogRecord::Raw(RawXlogRecord { + info, + lsn, + buf: buf.clone(), + })))) + } + + fn decode_logical_message_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_LOGICAL_MESSAGE { + let xlrec = XlLogicalMessage::decode(buf); + let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?; + + #[cfg(feature = "testing")] + if prefix == "neon-test" { + return Ok(Some(MetadataRecord::LogicalMessage( + LogicalMessageRecord::Failpoint, + ))); + } + + if let Some(path) = prefix.strip_prefix("neon-file:") { + let buf_size = xlrec.prefix_size + xlrec.message_size; + let buf = Bytes::copy_from_slice(&buf[xlrec.prefix_size..buf_size]); + return Ok(Some(MetadataRecord::LogicalMessage( + LogicalMessageRecord::Put(PutLogicalMessage { + path: path.to_string(), + buf, + }), + ))); + } + } + + Ok(None) + } + + fn decode_standby_record( + buf: &mut Bytes, + 
decoded: &DecodedWALRecord, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_RUNNING_XACTS { + let xlrec = XlRunningXacts::decode(buf); + return Ok(Some(MetadataRecord::Standby(StandbyRecord::RunningXacts( + StandbyRunningXacts { + oldest_running_xid: xlrec.oldest_running_xid, + }, + )))); + } + + Ok(None) + } + + fn decode_replorigin_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_REPLORIGIN_SET { + let xlrec = XlReploriginSet::decode(buf); + return Ok(Some(MetadataRecord::Replorigin(ReploriginRecord::Set( + xlrec, + )))); + } else if info == pg_constants::XLOG_REPLORIGIN_DROP { + let xlrec = XlReploriginDrop::decode(buf); + return Ok(Some(MetadataRecord::Replorigin(ReploriginRecord::Drop( + xlrec, + )))); + } + + Ok(None) + } +} diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index 58f8e1b2da42..92b66fcefdf0 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -25,7 +25,9 @@ //! |--> write to KV store within the pageserver use bytes::Bytes; +use pageserver_api::key::CompactKey; use pageserver_api::reltag::{RelTag, SlruKind}; +use pageserver_api::value::Value; use postgres_ffi::walrecord::{ XlMultiXactCreate, XlMultiXactTruncate, XlRelmapUpdate, XlReploriginDrop, XlReploriginSet, XlSmgrTruncate, XlXactParsedRecord, @@ -33,6 +35,48 @@ use postgres_ffi::walrecord::{ use postgres_ffi::{Oid, TransactionId}; use utils::lsn::Lsn; +pub enum FlushUncommittedRecords { + Yes, + No, +} + +/// An interpreted Postgres WAL record, ready to be handled by the pageserver +pub struct InterpretedWalRecord { + /// Optional metadata record - may cause writes to metadata keys + /// in the storage engine + pub metadata_record: Option, + /// Images or deltas for blocks modified in the original WAL record. 
+ /// The [`Value`] is optional to avoid sending superfluous data to + /// shard 0 for relation size tracking. + pub blocks: Vec<(CompactKey, Option)>, + /// Byte offset within WAL for the end of the original PG WAL record + pub lsn: Lsn, + /// Whether to flush all uncommitted modifications to the storage engine + /// before ingesting this record. This is currently only used for legacy PG + /// database creations which read pages from a template database. Such WAL + /// records require reading data blocks while ingesting, hence the need to flush. + pub flush_uncommitted: FlushUncommittedRecords, + /// Transaction id of the original PG WAL record + pub xid: TransactionId, +} + +/// The interpreted part of the Postgres WAL record which requires metadata +/// writes to the underlying storage engine. +pub enum MetadataRecord { + Heapam(HeapamRecord), + Neonrmgr(NeonrmgrRecord), + Smgr(SmgrRecord), + Dbase(DbaseRecord), + Clog(ClogRecord), + Xact(XactRecord), + MultiXact(MultiXactRecord), + Relmap(RelmapRecord), + Xlog(XlogRecord), + LogicalMessage(LogicalMessageRecord), + Standby(StandbyRecord), + Replorigin(ReploriginRecord), +} + pub enum HeapamRecord { ClearVmBits(ClearVmBits), } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 530c91c4da3e..06c4553e1c5b 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -12,6 +12,7 @@ use pageserver_api::key::rel_block_to_key; use tokio::io::{AsyncRead, AsyncReadExt}; use tokio_tar::Archive; use tracing::*; +use wal_decoder::models::InterpretedWalRecord; use walkdir::WalkDir; use crate::context::RequestContext; @@ -23,7 +24,6 @@ use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::*; use postgres_ffi::waldecoder::WalStreamDecoder; -use postgres_ffi::walrecord::{decode_wal_record, DecodedWALRecord}; use postgres_ffi::ControlFileData; use postgres_ffi::DBState_DB_SHUTDOWNED; use postgres_ffi::Oid; @@ 
-312,11 +312,15 @@ async fn import_wal( let mut modification = tline.begin_modification(last_lsn); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { - let mut decoded = DecodedWALRecord::default(); - decode_wal_record(recdata, &mut decoded, tline.pg_version)?; + let interpreted = InterpretedWalRecord::from_bytes_filtered( + recdata, + tline.get_shard_identity(), + lsn, + tline.pg_version, + )?; walingest - .ingest_record(decoded, lsn, &mut modification, ctx) + .ingest_record(interpreted, &mut modification, ctx) .await?; WAL_INGEST.records_committed.inc(); @@ -453,10 +457,15 @@ pub async fn import_wal_from_tar( let mut modification = tline.begin_modification(last_lsn); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { - let mut decoded = DecodedWALRecord::default(); - decode_wal_record(recdata, &mut decoded, tline.pg_version)?; + let interpreted = InterpretedWalRecord::from_bytes_filtered( + recdata, + tline.get_shard_identity(), + lsn, + tline.pg_version, + )?; + walingest - .ingest_record(decoded, lsn, &mut modification, ctx) + .ingest_record(interpreted, &mut modification, ctx) .await?; modification.commit(ctx).await?; last_lsn = lsn; diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 739fadbc6bf2..eb19fb691fe3 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -22,6 +22,7 @@ use tokio::{select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn, Instrument}; +use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord}; use super::TaskStateUpdate; use crate::{ @@ -35,7 +36,6 @@ use crate::{ use postgres_backend::is_expected_io_error; use 
postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use postgres_ffi::walrecord::{decode_wal_record, DecodedWALRecord}; use utils::{id::NodeId, lsn::Lsn}; use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; @@ -339,11 +339,15 @@ pub(super) async fn handle_walreceiver_connection( return Err(WalReceiverError::Other(anyhow!("LSN not aligned"))); } - // Deserialize WAL record - let mut decoded = DecodedWALRecord::default(); - decode_wal_record(recdata, &mut decoded, modification.tline.pg_version)?; + // Deserialize and interpret WAL record + let interpreted = InterpretedWalRecord::from_bytes_filtered( + recdata, + modification.tline.get_shard_identity(), + lsn, + modification.tline.pg_version, + )?; - if decoded.is_dbase_create_copy(timeline.pg_version) + if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) && uncommitted_records > 0 { // Special case: legacy PG database creations operate by reading pages from a 'template' database: @@ -360,7 +364,7 @@ pub(super) async fn handle_walreceiver_connection( // Ingest the records without immediately committing them. let ingested = walingest - .ingest_record(decoded, lsn, &mut modification, &ctx) + .ingest_record(interpreted, &mut modification, &ctx) .await .with_context(|| format!("could not ingest record at {lsn}"))?; if !ingested { diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 27b3f9384553..84353970b78b 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -3,17 +3,17 @@ //! //! The pipeline for ingesting WAL looks like this: //! -//! WAL receiver -> WalIngest -> Repository +//! WAL receiver -> [`wal_decoder`] -> WalIngest -> Repository //! -//! The WAL receiver receives a stream of WAL from the WAL safekeepers, -//! and decodes it to individual WAL records. It feeds the WAL records -//! to WalIngest, which parses them and stores them in the Repository. +//! 
The WAL receiver receives a stream of WAL from the WAL safekeepers. +//! Records get decoded and interpreted in the [`wal_decoder`] module +//! and then stored to the Repository by WalIngest. //! //! The neon Repository can store page versions in two formats: as -//! page images, or a WAL records. WalIngest::ingest_record() extracts -//! page images out of some WAL records, but most it stores as WAL +//! page images, or a WAL records. [`wal_decoder::models::InterpretedWalRecord::from_bytes_filtered`] +//! extracts page images out of some WAL records, but mostly it's WAL //! records. If a WAL record modifies multiple pages, WalIngest -//! will call Repository::put_wal_record or put_page_image functions +//! will call Repository::put_rel_wal_record or put_rel_page_image functions //! separately for each modified page. //! //! To reconstruct a page using a WAL record, the Repository calls the @@ -28,14 +28,15 @@ use std::time::Duration; use std::time::Instant; use std::time::SystemTime; +use pageserver_api::key::Key; use pageserver_api::shard::ShardIdentity; +use postgres_ffi::fsm_logical_to_physical; use postgres_ffi::walrecord::*; use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; -use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; use wal_decoder::models::*; use anyhow::{bail, Context, Result}; -use bytes::{Buf, Bytes, BytesMut}; +use bytes::{Buf, Bytes}; use tracing::*; use utils::failpoint_support; use utils::rate_limit::RateLimit; @@ -50,10 +51,10 @@ use crate::ZERO_PAGE; use pageserver_api::key::rel_block_to_key; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; +use pageserver_api::value::Value; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::TransactionId; -use postgres_ffi::BLCKSZ; use utils::bin_ser::SerializeError; use utils::lsn::Lsn; @@ 
-140,257 +141,161 @@ impl WalIngest { }) } - /// - /// Decode a PostgreSQL WAL record and store it in the repository, in the given timeline. + /// Ingest an interpreted PostgreSQL WAL record by doing writes to the underlying key value + /// storage of a given timeline. /// /// This function updates `lsn` field of `DatadirModification` /// - /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the - /// relations/pages that the record affects. - /// /// This function returns `true` if the record was ingested, and `false` if it was filtered out pub async fn ingest_record( &mut self, - decoded: DecodedWALRecord, - lsn: Lsn, + interpreted: InterpretedWalRecord, modification: &mut DatadirModification<'_>, ctx: &RequestContext, ) -> anyhow::Result { WAL_INGEST.records_received.inc(); - let pg_version = modification.tline.pg_version; let prev_len = modification.len(); - modification.set_lsn(lsn)?; + modification.set_lsn(interpreted.lsn)?; - if decoded.is_dbase_create_copy(pg_version) { + if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) { // Records of this type should always be preceded by a commit(), as they // rely on reading data pages back from the Timeline. assert!(!modification.has_dirty_data_pages()); } - let mut buf = decoded.record.clone(); - buf.advance(decoded.main_data_offset); - assert!(!self.checkpoint_modified); - if decoded.xl_xid != pg_constants::INVALID_TRANSACTION_ID - && self.checkpoint.update_next_xid(decoded.xl_xid) + if interpreted.xid != pg_constants::INVALID_TRANSACTION_ID + && self.checkpoint.update_next_xid(interpreted.xid) { self.checkpoint_modified = true; } failpoint_support::sleep_millis_async!("wal-ingest-record-sleep"); - match decoded.xl_rmid { - pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { - // Heap AM records need some special handling, because they modify VM pages - // without registering them with the standard mechanism. 
- let maybe_heapam_record = - Self::decode_heapam_record(&mut buf, &decoded, pg_version)?; - if let Some(heapam_record) = maybe_heapam_record { - match heapam_record { - HeapamRecord::ClearVmBits(clear_vm_bits) => { - self.ingest_clear_vm_bits(clear_vm_bits, modification, ctx) - .await?; - } - } + match interpreted.metadata_record { + Some(MetadataRecord::Heapam(rec)) => match rec { + HeapamRecord::ClearVmBits(clear_vm_bits) => { + self.ingest_clear_vm_bits(clear_vm_bits, modification, ctx) + .await?; } - } - pg_constants::RM_NEON_ID => { - let maybe_nenonrmgr_record = - Self::decode_neonmgr_record(&mut buf, &decoded, pg_version)?; - if let Some(neonrmgr_record) = maybe_nenonrmgr_record { - match neonrmgr_record { - NeonrmgrRecord::ClearVmBits(clear_vm_bits) => { - self.ingest_clear_vm_bits(clear_vm_bits, modification, ctx) - .await?; - } - } + }, + Some(MetadataRecord::Neonrmgr(rec)) => match rec { + NeonrmgrRecord::ClearVmBits(clear_vm_bits) => { + self.ingest_clear_vm_bits(clear_vm_bits, modification, ctx) + .await?; } - } - // Handle other special record types - pg_constants::RM_SMGR_ID => { - let maybe_smgr_record = - Self::decode_smgr_record(&mut buf, &decoded, pg_version).unwrap(); - if let Some(smgr_record) = maybe_smgr_record { - match smgr_record { - SmgrRecord::Create(create) => { - self.ingest_xlog_smgr_create(create, modification, ctx) - .await?; - } - SmgrRecord::Truncate(truncate) => { - self.ingest_xlog_smgr_truncate(truncate, modification, ctx) - .await?; - } - } + }, + Some(MetadataRecord::Smgr(rec)) => match rec { + SmgrRecord::Create(create) => { + self.ingest_xlog_smgr_create(create, modification, ctx) + .await?; } - } - pg_constants::RM_DBASE_ID => { - let maybe_dbase_record = - Self::decode_dbase_record(&mut buf, &decoded, pg_version).unwrap(); - - if let Some(dbase_record) = maybe_dbase_record { - match dbase_record { - DbaseRecord::Create(create) => { - self.ingest_xlog_dbase_create(create, modification, ctx) - .await?; - } - 
DbaseRecord::Drop(drop) => { - self.ingest_xlog_dbase_drop(drop, modification, ctx).await?; - } - } + SmgrRecord::Truncate(truncate) => { + self.ingest_xlog_smgr_truncate(truncate, modification, ctx) + .await?; } - } - pg_constants::RM_TBLSPC_ID => { - trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet"); - } - pg_constants::RM_CLOG_ID => { - // [`Self::decode_clog_record`] may never fail and always returns. - // It has this interface to match all the other decoding methods. - let clog_record = Self::decode_clog_record(&mut buf, &decoded, pg_version) - .unwrap() - .unwrap(); - - match clog_record { - ClogRecord::ZeroPage(zero_page) => { - self.ingest_clog_zero_page(zero_page, modification, ctx) - .await?; - } - ClogRecord::Truncate(truncate) => { - self.ingest_clog_truncate(truncate, modification, ctx) - .await?; - } + }, + Some(MetadataRecord::Dbase(rec)) => match rec { + DbaseRecord::Create(create) => { + self.ingest_xlog_dbase_create(create, modification, ctx) + .await?; } - } - pg_constants::RM_XACT_ID => { - let maybe_xact_record = - Self::decode_xact_record(&mut buf, &decoded, lsn, pg_version).unwrap(); - if let Some(xact_record) = maybe_xact_record { - self.ingest_xact_record(xact_record, modification, ctx) + DbaseRecord::Drop(drop) => { + self.ingest_xlog_dbase_drop(drop, modification, ctx).await?; + } + }, + Some(MetadataRecord::Clog(rec)) => match rec { + ClogRecord::ZeroPage(zero_page) => { + self.ingest_clog_zero_page(zero_page, modification, ctx) .await?; } - } - pg_constants::RM_MULTIXACT_ID => { - let maybe_multixact_record = - Self::decode_multixact_record(&mut buf, &decoded, pg_version).unwrap(); - if let Some(multixact_record) = maybe_multixact_record { - match multixact_record { - MultiXactRecord::ZeroPage(zero_page) => { - self.ingest_multixact_zero_page(zero_page, modification, ctx) - .await?; - } - MultiXactRecord::Create(create) => { - self.ingest_multixact_create(modification, &create)?; - } - MultiXactRecord::Truncate(truncate) => { - 
self.ingest_multixact_truncate(modification, &truncate, ctx) - .await?; - } - } + ClogRecord::Truncate(truncate) => { + self.ingest_clog_truncate(truncate, modification, ctx) + .await?; } + }, + Some(MetadataRecord::Xact(rec)) => { + self.ingest_xact_record(rec, modification, ctx).await?; } - pg_constants::RM_RELMAP_ID => { - let relmap_record = Self::decode_relmap_record(&mut buf, &decoded, pg_version) - .unwrap() - .unwrap(); - match relmap_record { - RelmapRecord::Update(update) => { - self.ingest_relmap_update(update, modification, ctx).await?; - } + Some(MetadataRecord::MultiXact(rec)) => match rec { + MultiXactRecord::ZeroPage(zero_page) => { + self.ingest_multixact_zero_page(zero_page, modification, ctx) + .await?; } - } - // This is an odd duck. It needs to go to all shards. - // Since it uses the checkpoint image (that's initialized from CHECKPOINT_KEY - // in WalIngest::new), we have to send the whole DecodedWalRecord::record to - // the pageserver and decode it there. - // - // Alternatively, one can make the checkpoint part of the subscription protocol - // to the pageserver. This should work fine, but can be done at a later point. 
- pg_constants::RM_XLOG_ID => { - let xlog_record = Self::decode_xlog_record(&mut buf, &decoded, lsn, pg_version) - .unwrap() - .unwrap(); - - match xlog_record { - XlogRecord::Raw(raw) => { - self.ingest_raw_xlog_record(raw, modification, ctx).await?; - } + MultiXactRecord::Create(create) => { + self.ingest_multixact_create(modification, &create)?; } - } - pg_constants::RM_LOGICALMSG_ID => { - let maybe_logical_message_record = - Self::decode_logical_message_record(&mut buf, &decoded, pg_version).unwrap(); - if let Some(logical_message_record) = maybe_logical_message_record { - match logical_message_record { - LogicalMessageRecord::Put(put) => { - self.ingest_logical_message_put(put, modification, ctx) - .await?; - } - #[cfg(feature = "testing")] - LogicalMessageRecord::Failpoint => { - // This is a convenient way to make the WAL ingestion pause at - // particular point in the WAL. For more fine-grained control, - // we could peek into the message and only pause if it contains - // a particular string, for example, but this is enough for now. 
- failpoint_support::sleep_millis_async!( - "pageserver-wal-ingest-logical-message-sleep" - ); - } - } + MultiXactRecord::Truncate(truncate) => { + self.ingest_multixact_truncate(modification, &truncate, ctx) + .await?; } - } - pg_constants::RM_STANDBY_ID => { - let maybe_standby_record = - Self::decode_standby_record(&mut buf, &decoded, pg_version).unwrap(); - if let Some(standby_record) = maybe_standby_record { - self.ingest_standby_record(standby_record).unwrap(); + }, + Some(MetadataRecord::Relmap(rec)) => match rec { + RelmapRecord::Update(update) => { + self.ingest_relmap_update(update, modification, ctx).await?; } - } - pg_constants::RM_REPLORIGIN_ID => { - let maybe_replorigin_record = - Self::decode_replorigin_record(&mut buf, &decoded, pg_version).unwrap(); - if let Some(replorigin_record) = maybe_replorigin_record { - self.ingest_replorigin_record(replorigin_record, modification) + }, + Some(MetadataRecord::Xlog(rec)) => match rec { + XlogRecord::Raw(raw) => { + self.ingest_raw_xlog_record(raw, modification, ctx).await?; + } + }, + Some(MetadataRecord::LogicalMessage(rec)) => match rec { + LogicalMessageRecord::Put(put) => { + self.ingest_logical_message_put(put, modification, ctx) .await?; } + #[cfg(feature = "testing")] + LogicalMessageRecord::Failpoint => { + // This is a convenient way to make the WAL ingestion pause at + // particular point in the WAL. For more fine-grained control, + // we could peek into the message and only pause if it contains + // a particular string, for example, but this is enough for now. 
+ failpoint_support::sleep_millis_async!( + "pageserver-wal-ingest-logical-message-sleep" + ); + } + }, + Some(MetadataRecord::Standby(rec)) => { + self.ingest_standby_record(rec).unwrap(); } - _x => { - // TODO: should probably log & fail here instead of blindly - // doing something without understanding the protocol + Some(MetadataRecord::Replorigin(rec)) => { + self.ingest_replorigin_record(rec, modification).await?; + } + None => { + // There are two cases through which we end up here: + // 1. The resource manager for the original PG WAL record + // is [`pg_constants::RM_TBLSPC_ID`]. This is not a supported + // record type within Neon. + // 2. The resource manager id was unknown to + // [`wal_decoder::decoder::MetadataRecord::from_decoded`]. + // TODO(vlad): Tighten this up more once we build confidence + // that case (2) does not happen in the field. } } - // Iterate through all the blocks that the record modifies, and - // "put" a separate copy of the record for each block. - for blk in decoded.blocks.iter() { - let rel = RelTag { - spcnode: blk.rnode_spcnode, - dbnode: blk.rnode_dbnode, - relnode: blk.rnode_relnode, - forknum: blk.forknum, - }; - - let key = rel_block_to_key(rel, blk.blkno); - let key_is_local = self.shard.is_key_local(&key); - - tracing::debug!( - lsn=%lsn, - key=%key, - "ingest: shard decision {} (checkpoint={})", - if !key_is_local { "drop" } else { "keep" }, - self.checkpoint_modified - ); - - if !key_is_local { - if self.shard.is_shard_zero() { - // Shard 0 tracks relation sizes. Although we will not store this block, we will observe + // Iterate through all the key value pairs provided in the interpreted block + // and update the modification currently in-flight to include them. 
+ for (compact_key, maybe_value) in interpreted.blocks.into_iter() { + let (rel, blk) = Key::from_compact(compact_key).to_rel_block()?; + match maybe_value { + Some(Value::Image(img)) => { + self.put_rel_page_image(modification, rel, blk, img, ctx) + .await?; + } + Some(Value::WalRecord(rec)) => { + self.put_rel_wal_record(modification, rel, blk, rec, ctx) + .await?; + } + None => { + // Shard 0 tracks relation sizes. We will observe // its blkno in case it implicitly extends a relation. - self.observe_decoded_block(modification, blk, ctx).await?; + assert!(self.shard.is_shard_zero()); + self.observe_decoded_block(modification, rel, blk, ctx) + .await?; } - - continue; } - self.ingest_decoded_block(modification, lsn, &decoded, blk, ctx) - .await?; } // If checkpoint data was updated, store the new version in the repository @@ -433,82 +338,11 @@ impl WalIngest { async fn observe_decoded_block( &mut self, modification: &mut DatadirModification<'_>, - blk: &DecodedBkpBlock, - ctx: &RequestContext, - ) -> Result<(), PageReconstructError> { - let rel = RelTag { - spcnode: blk.rnode_spcnode, - dbnode: blk.rnode_dbnode, - relnode: blk.rnode_relnode, - forknum: blk.forknum, - }; - self.handle_rel_extend(modification, rel, blk.blkno, ctx) - .await - } - - async fn ingest_decoded_block( - &mut self, - modification: &mut DatadirModification<'_>, - lsn: Lsn, - decoded: &DecodedWALRecord, - blk: &DecodedBkpBlock, + rel: RelTag, + blkno: BlockNumber, ctx: &RequestContext, ) -> Result<(), PageReconstructError> { - let rel = RelTag { - spcnode: blk.rnode_spcnode, - dbnode: blk.rnode_dbnode, - relnode: blk.rnode_relnode, - forknum: blk.forknum, - }; - - // - // Instead of storing full-page-image WAL record, - // it is better to store extracted image: we can skip wal-redo - // in this case. Also some FPI records may contain multiple (up to 32) pages, - // so them have to be copied multiple times. 
- // - if blk.apply_image - && blk.has_image - && decoded.xl_rmid == pg_constants::RM_XLOG_ID - && (decoded.xl_info == pg_constants::XLOG_FPI - || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) - // compression of WAL is not yet supported: fall back to storing the original WAL record - && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version) - // do not materialize null pages because them most likely be soon replaced with real data - && blk.bimg_len != 0 - { - // Extract page image from FPI record - let img_len = blk.bimg_len as usize; - let img_offs = blk.bimg_offset as usize; - let mut image = BytesMut::with_capacity(BLCKSZ as usize); - image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]); - - if blk.hole_length != 0 { - let tail = image.split_off(blk.hole_offset as usize); - image.resize(image.len() + blk.hole_length as usize, 0u8); - image.unsplit(tail); - } - // - // Match the logic of XLogReadBufferForRedoExtended: - // The page may be uninitialized. If so, we can't set the LSN because - // that would corrupt the page. - // - if !page_is_new(&image) { - page_set_lsn(&mut image, lsn) - } - assert_eq!(image.len(), BLCKSZ as usize); - - self.put_rel_page_image(modification, rel, blk.blkno, image.freeze(), ctx) - .await?; - } else { - let rec = NeonWalRecord::Postgres { - will_init: blk.will_init || blk.apply_image, - rec: decoded.record.clone(), - }; - self.put_rel_wal_record(modification, rel, blk.blkno, rec, ctx) - .await?; - } - Ok(()) + self.handle_rel_extend(modification, rel, blkno, ctx).await } async fn ingest_clear_vm_bits( @@ -599,413 +433,6 @@ impl WalIngest { Ok(()) } - fn decode_heapam_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - pg_version: u32, - ) -> anyhow::Result> { - // Handle VM bit updates that are implicitly part of heap records. - - // First, look at the record to determine which VM bits need - // to be cleared. 
If either of these variables is set, we - // need to clear the corresponding bits in the visibility map. - let mut new_heap_blkno: Option = None; - let mut old_heap_blkno: Option = None; - let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; - - match pg_version { - 14 => { - if decoded.xl_rmid == pg_constants::RM_HEAP_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - - if info == pg_constants::XLOG_HEAP_INSERT { - let xlrec = v14::XlHeapInsert::decode(buf); - assert_eq!(0, buf.remaining()); - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_DELETE { - let xlrec = v14::XlHeapDelete::decode(buf); - if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_UPDATE - || info == pg_constants::XLOG_HEAP_HOT_UPDATE - { - let xlrec = v14::XlHeapUpdate::decode(buf); - // the size of tuple data is inferred from the size of the record. - // we can't validate the remaining number of bytes without parsing - // the tuple data. - if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); - } - if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { - // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a - // non-HOT update where the new tuple goes to different page than - // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is - // set. 
- new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_LOCK { - let xlrec = v14::XlHeapLock::decode(buf); - if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; - } - } - } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { - let xlrec = v14::XlHeapMultiInsert::decode(buf); - - let offset_array_len = - if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { - // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set - 0 - } else { - size_of::() * xlrec.ntuples as usize - }; - assert_eq!(offset_array_len, buf.remaining()); - - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { - let xlrec = v14::XlHeapLockUpdated::decode(buf); - if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; - } - } - } else { - bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); - } - } - 15 => { - if decoded.xl_rmid == pg_constants::RM_HEAP_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - - if info == pg_constants::XLOG_HEAP_INSERT { - let xlrec = v15::XlHeapInsert::decode(buf); - assert_eq!(0, buf.remaining()); - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_DELETE { - let xlrec = v15::XlHeapDelete::decode(buf); - if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_UPDATE - || info == 
pg_constants::XLOG_HEAP_HOT_UPDATE - { - let xlrec = v15::XlHeapUpdate::decode(buf); - // the size of tuple data is inferred from the size of the record. - // we can't validate the remaining number of bytes without parsing - // the tuple data. - if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); - } - if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { - // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a - // non-HOT update where the new tuple goes to different page than - // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is - // set. - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_LOCK { - let xlrec = v15::XlHeapLock::decode(buf); - if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; - } - } - } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { - let xlrec = v15::XlHeapMultiInsert::decode(buf); - - let offset_array_len = - if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { - // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set - 0 - } else { - size_of::() * xlrec.ntuples as usize - }; - assert_eq!(offset_array_len, buf.remaining()); - - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { - let xlrec = v15::XlHeapLockUpdated::decode(buf); - if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; - } - } - } else { - bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); - } - } - 
16 => { - if decoded.xl_rmid == pg_constants::RM_HEAP_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - - if info == pg_constants::XLOG_HEAP_INSERT { - let xlrec = v16::XlHeapInsert::decode(buf); - assert_eq!(0, buf.remaining()); - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_DELETE { - let xlrec = v16::XlHeapDelete::decode(buf); - if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_UPDATE - || info == pg_constants::XLOG_HEAP_HOT_UPDATE - { - let xlrec = v16::XlHeapUpdate::decode(buf); - // the size of tuple data is inferred from the size of the record. - // we can't validate the remaining number of bytes without parsing - // the tuple data. - if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); - } - if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { - // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a - // non-HOT update where the new tuple goes to different page than - // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is - // set. 
- new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_LOCK { - let xlrec = v16::XlHeapLock::decode(buf); - if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; - } - } - } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { - let xlrec = v16::XlHeapMultiInsert::decode(buf); - - let offset_array_len = - if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { - // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set - 0 - } else { - size_of::() * xlrec.ntuples as usize - }; - assert_eq!(offset_array_len, buf.remaining()); - - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { - let xlrec = v16::XlHeapLockUpdated::decode(buf); - if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; - } - } - } else { - bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); - } - } - 17 => { - if decoded.xl_rmid == pg_constants::RM_HEAP_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - - if info == pg_constants::XLOG_HEAP_INSERT { - let xlrec = v17::XlHeapInsert::decode(buf); - assert_eq!(0, buf.remaining()); - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_DELETE { - let xlrec = v17::XlHeapDelete::decode(buf); - if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_UPDATE - || info == 
pg_constants::XLOG_HEAP_HOT_UPDATE - { - let xlrec = v17::XlHeapUpdate::decode(buf); - // the size of tuple data is inferred from the size of the record. - // we can't validate the remaining number of bytes without parsing - // the tuple data. - if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); - } - if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { - // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a - // non-HOT update where the new tuple goes to different page than - // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is - // set. - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_LOCK { - let xlrec = v17::XlHeapLock::decode(buf); - if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; - } - } - } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { - let xlrec = v17::XlHeapMultiInsert::decode(buf); - - let offset_array_len = - if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { - // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set - 0 - } else { - size_of::() * xlrec.ntuples as usize - }; - assert_eq!(offset_array_len, buf.remaining()); - - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { - let xlrec = v17::XlHeapLockUpdated::decode(buf); - if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; - } - } - } else { - bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); - } - } - _ 
=> {} - } - - if new_heap_blkno.is_some() || old_heap_blkno.is_some() { - let vm_rel = RelTag { - forknum: VISIBILITYMAP_FORKNUM, - spcnode: decoded.blocks[0].rnode_spcnode, - dbnode: decoded.blocks[0].rnode_dbnode, - relnode: decoded.blocks[0].rnode_relnode, - }; - - Ok(Some(HeapamRecord::ClearVmBits(ClearVmBits { - new_heap_blkno, - old_heap_blkno, - vm_rel, - flags, - }))) - } else { - Ok(None) - } - } - - fn decode_neonmgr_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - pg_version: u32, - ) -> anyhow::Result> { - // Handle VM bit updates that are implicitly part of heap records. - - // First, look at the record to determine which VM bits need - // to be cleared. If either of these variables is set, we - // need to clear the corresponding bits in the visibility map. - let mut new_heap_blkno: Option = None; - let mut old_heap_blkno: Option = None; - let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; - - assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); - - match pg_version { - 16 | 17 => { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - - match info { - pg_constants::XLOG_NEON_HEAP_INSERT => { - let xlrec = v17::rm_neon::XlNeonHeapInsert::decode(buf); - assert_eq!(0, buf.remaining()); - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } - pg_constants::XLOG_NEON_HEAP_DELETE => { - let xlrec = v17::rm_neon::XlNeonHeapDelete::decode(buf); - if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } - pg_constants::XLOG_NEON_HEAP_UPDATE - | pg_constants::XLOG_NEON_HEAP_HOT_UPDATE => { - let xlrec = v17::rm_neon::XlNeonHeapUpdate::decode(buf); - // the size of tuple data is inferred from the size of the record. - // we can't validate the remaining number of bytes without parsing - // the tuple data. 
- if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); - } - if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { - // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a - // non-HOT update where the new tuple goes to different page than - // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is - // set. - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } - pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => { - let xlrec = v17::rm_neon::XlNeonHeapMultiInsert::decode(buf); - - let offset_array_len = - if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { - // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set - 0 - } else { - size_of::() * xlrec.ntuples as usize - }; - assert_eq!(offset_array_len, buf.remaining()); - - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } - pg_constants::XLOG_NEON_HEAP_LOCK => { - let xlrec = v17::rm_neon::XlNeonHeapLock::decode(buf); - if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; - } - } - info => bail!("Unknown WAL record type for Neon RMGR: {}", info), - } - } - _ => bail!( - "Neon RMGR has no known compatibility with PostgreSQL version {}", - pg_version - ), - } - - if new_heap_blkno.is_some() || old_heap_blkno.is_some() { - let vm_rel = RelTag { - forknum: VISIBILITYMAP_FORKNUM, - spcnode: decoded.blocks[0].rnode_spcnode, - dbnode: decoded.blocks[0].rnode_dbnode, - relnode: decoded.blocks[0].rnode_relnode, - }; - - Ok(Some(NeonrmgrRecord::ClearVmBits(ClearVmBits { - new_heap_blkno, - old_heap_blkno, - vm_rel, - flags, - }))) - } else { - Ok(None) - } - } - /// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record. 
async fn ingest_xlog_dbase_create( &mut self, @@ -1122,125 +549,6 @@ impl WalIngest { Ok(()) } - fn decode_dbase_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - pg_version: u32, - ) -> anyhow::Result> { - // TODO: Refactor this to avoid the duplication between postgres versions. - - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - debug!(%info, %pg_version, "handle RM_DBASE_ID"); - - if pg_version == 14 { - if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE { - let createdb = XlCreateDatabase::decode(buf); - debug!("XLOG_DBASE_CREATE v14"); - - let record = DbaseRecord::Create(DbaseCreate { - db_id: createdb.db_id, - tablespace_id: createdb.tablespace_id, - src_db_id: createdb.src_db_id, - src_tablespace_id: createdb.src_tablespace_id, - }); - - return Ok(Some(record)); - } else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(buf); - - let record = DbaseRecord::Drop(DbaseDrop { - db_id: dropdb.db_id, - tablespace_ids: dropdb.tablespace_ids, - }); - - return Ok(Some(record)); - } - } else if pg_version == 15 { - if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG { - debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); - } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY { - // The XLOG record was renamed between v14 and v15, - // but the record format is the same. - // So we can reuse XlCreateDatabase here. 
- debug!("XLOG_DBASE_CREATE_FILE_COPY"); - - let createdb = XlCreateDatabase::decode(buf); - let record = DbaseRecord::Create(DbaseCreate { - db_id: createdb.db_id, - tablespace_id: createdb.tablespace_id, - src_db_id: createdb.src_db_id, - src_tablespace_id: createdb.src_tablespace_id, - }); - - return Ok(Some(record)); - } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(buf); - let record = DbaseRecord::Drop(DbaseDrop { - db_id: dropdb.db_id, - tablespace_ids: dropdb.tablespace_ids, - }); - - return Ok(Some(record)); - } - } else if pg_version == 16 { - if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG { - debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); - } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY { - // The XLOG record was renamed between v14 and v15, - // but the record format is the same. - // So we can reuse XlCreateDatabase here. - debug!("XLOG_DBASE_CREATE_FILE_COPY"); - - let createdb = XlCreateDatabase::decode(buf); - let record = DbaseRecord::Create(DbaseCreate { - db_id: createdb.db_id, - tablespace_id: createdb.tablespace_id, - src_db_id: createdb.src_db_id, - src_tablespace_id: createdb.src_tablespace_id, - }); - - return Ok(Some(record)); - } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(buf); - let record = DbaseRecord::Drop(DbaseDrop { - db_id: dropdb.db_id, - tablespace_ids: dropdb.tablespace_ids, - }); - - return Ok(Some(record)); - } - } else if pg_version == 17 { - if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG { - debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); - } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY { - // The XLOG record was renamed between v14 and v15, - // but the record format is the same. - // So we can reuse XlCreateDatabase here. 
- debug!("XLOG_DBASE_CREATE_FILE_COPY"); - - let createdb = XlCreateDatabase::decode(buf); - let record = DbaseRecord::Create(DbaseCreate { - db_id: createdb.db_id, - tablespace_id: createdb.tablespace_id, - src_db_id: createdb.src_db_id, - src_tablespace_id: createdb.src_tablespace_id, - }); - - return Ok(Some(record)); - } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(buf); - let record = DbaseRecord::Drop(DbaseDrop { - db_id: dropdb.db_id, - tablespace_ids: dropdb.tablespace_ids, - }); - - return Ok(Some(record)); - } - } - - Ok(None) - } - async fn ingest_xlog_smgr_create( &mut self, create: SmgrCreate, @@ -1252,30 +560,6 @@ impl WalIngest { Ok(()) } - fn decode_smgr_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - _pg_version: u32, - ) -> anyhow::Result> { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - if info == pg_constants::XLOG_SMGR_CREATE { - let create = XlSmgrCreate::decode(buf); - let rel = RelTag { - spcnode: create.rnode.spcnode, - dbnode: create.rnode.dbnode, - relnode: create.rnode.relnode, - forknum: create.forknum, - }; - - return Ok(Some(SmgrRecord::Create(SmgrCreate { rel }))); - } else if info == pg_constants::XLOG_SMGR_TRUNCATE { - let truncate = XlSmgrTruncate::decode(buf); - return Ok(Some(SmgrRecord::Truncate(truncate))); - } - - Ok(None) - } - /// Subroutine of ingest_record(), to handle an XLOG_SMGR_TRUNCATE record. /// /// This is the same logic as in PostgreSQL's smgr_redo() function. 
@@ -1535,59 +819,6 @@ impl WalIngest { Ok(()) } - // TODO(vlad): Standardise interface for `decode_...` - fn decode_xact_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - lsn: Lsn, - _pg_version: u32, - ) -> anyhow::Result> { - let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; - let origin_id = decoded.origin_id; - let xl_xid = decoded.xl_xid; - - if info == pg_constants::XLOG_XACT_COMMIT { - let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); - return Ok(Some(XactRecord::Commit(XactCommon { - parsed, - origin_id, - xl_xid, - lsn, - }))); - } else if info == pg_constants::XLOG_XACT_ABORT { - let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); - return Ok(Some(XactRecord::Abort(XactCommon { - parsed, - origin_id, - xl_xid, - lsn, - }))); - } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED { - let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); - return Ok(Some(XactRecord::CommitPrepared(XactCommon { - parsed, - origin_id, - xl_xid, - lsn, - }))); - } else if info == pg_constants::XLOG_XACT_ABORT_PREPARED { - let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); - return Ok(Some(XactRecord::AbortPrepared(XactCommon { - parsed, - origin_id, - xl_xid, - lsn, - }))); - } else if info == pg_constants::XLOG_XACT_PREPARE { - return Ok(Some(XactRecord::Prepare(XactPrepare { - xl_xid: decoded.xl_xid, - data: Bytes::copy_from_slice(&buf[..]), - }))); - } - - Ok(None) - } - async fn ingest_clog_truncate( &mut self, truncate: ClogTruncate, @@ -1681,35 +912,6 @@ impl WalIngest { .await } - fn decode_clog_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - pg_version: u32, - ) -> anyhow::Result> { - let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK; - - if info == pg_constants::CLOG_ZEROPAGE { - let pageno = if pg_version < 17 { - buf.get_u32_le() - } else { - buf.get_u64_le() as u32 - }; - let segno = pageno / 
pg_constants::SLRU_PAGES_PER_SEGMENT; - let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - - Ok(Some(ClogRecord::ZeroPage(ClogZeroPage { segno, rpageno }))) - } else { - assert!(info == pg_constants::CLOG_TRUNCATE); - let xlrec = XlClogTruncate::decode(buf, pg_version); - - Ok(Some(ClogRecord::Truncate(ClogTruncate { - pageno: xlrec.pageno, - oldest_xid: xlrec.oldest_xid, - oldest_xid_db: xlrec.oldest_xid_db, - }))) - } - } - fn ingest_multixact_create( &mut self, modification: &mut DatadirModification, @@ -1880,46 +1082,6 @@ impl WalIngest { .await } - fn decode_multixact_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - pg_version: u32, - ) -> anyhow::Result> { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - - if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE - || info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE - { - let pageno = if pg_version < 17 { - buf.get_u32_le() - } else { - buf.get_u64_le() as u32 - }; - let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - - let slru_kind = match info { - pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE => SlruKind::MultiXactOffsets, - pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE => SlruKind::MultiXactMembers, - _ => unreachable!(), - }; - - return Ok(Some(MultiXactRecord::ZeroPage(MultiXactZeroPage { - slru_kind, - segno, - rpageno, - }))); - } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { - let xlrec = XlMultiXactCreate::decode(buf); - return Ok(Some(MultiXactRecord::Create(xlrec))); - } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { - let xlrec = XlMultiXactTruncate::decode(buf); - return Ok(Some(MultiXactRecord::Truncate(xlrec))); - } - - Ok(None) - } - async fn ingest_relmap_update( &mut self, update: RelmapUpdate, @@ -1933,24 +1095,6 @@ impl WalIngest { .await } - fn decode_relmap_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - _pg_version: u32, - ) -> anyhow::Result> { - 
let update = XlRelmapUpdate::decode(buf); - - let mut buf = decoded.record.clone(); - buf.advance(decoded.main_data_offset); - // skip xl_relmap_update - buf.advance(12); - - Ok(Some(RelmapRecord::Update(RelmapUpdate { - update, - buf: Bytes::copy_from_slice(&buf[..]), - }))) - } - async fn ingest_raw_xlog_record( &mut self, raw_record: RawXlogRecord, @@ -2051,20 +1195,6 @@ impl WalIngest { Ok(()) } - fn decode_xlog_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - lsn: Lsn, - _pg_version: u32, - ) -> anyhow::Result> { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - Ok(Some(XlogRecord::Raw(RawXlogRecord { - info, - lsn, - buf: buf.clone(), - }))) - } - async fn ingest_logical_message_put( &mut self, put: PutLogicalMessage, @@ -2075,50 +1205,6 @@ impl WalIngest { modification.put_file(path.as_str(), &buf, ctx).await } - fn decode_logical_message_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - _pg_version: u32, - ) -> anyhow::Result> { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - if info == pg_constants::XLOG_LOGICAL_MESSAGE { - let xlrec = XlLogicalMessage::decode(buf); - let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?; - - #[cfg(feature = "testing")] - if prefix == "neon-test" { - return Ok(Some(LogicalMessageRecord::Failpoint)); - } - - if let Some(path) = prefix.strip_prefix("neon-file:") { - let buf_size = xlrec.prefix_size + xlrec.message_size; - let buf = Bytes::copy_from_slice(&buf[xlrec.prefix_size..buf_size]); - return Ok(Some(LogicalMessageRecord::Put(PutLogicalMessage { - path: path.to_string(), - buf, - }))); - } - } - - Ok(None) - } - - fn decode_standby_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - _pg_version: u32, - ) -> anyhow::Result> { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - if info == pg_constants::XLOG_RUNNING_XACTS { - let xlrec = XlRunningXacts::decode(buf); - return Ok(Some(StandbyRecord::RunningXacts(StandbyRunningXacts { - 
oldest_running_xid: xlrec.oldest_running_xid, - }))); - } - - Ok(None) - } - fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<()> { match record { StandbyRecord::RunningXacts(running_xacts) => { @@ -2133,23 +1219,6 @@ impl WalIngest { Ok(()) } - fn decode_replorigin_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - _pg_version: u32, - ) -> anyhow::Result> { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - if info == pg_constants::XLOG_REPLORIGIN_SET { - let xlrec = XlReploriginSet::decode(buf); - return Ok(Some(ReploriginRecord::Set(xlrec))); - } else if info == pg_constants::XLOG_REPLORIGIN_DROP { - let xlrec = XlReploriginDrop::decode(buf); - return Ok(Some(ReploriginRecord::Drop(xlrec))); - } - - Ok(None) - } - async fn ingest_replorigin_record( &mut self, record: ReploriginRecord, @@ -3010,7 +2079,6 @@ mod tests { async fn test_ingest_real_wal() { use crate::tenant::harness::*; use postgres_ffi::waldecoder::WalStreamDecoder; - use postgres_ffi::walrecord::decode_wal_record; use postgres_ffi::WAL_SEGMENT_SIZE; // Define test data path and constants. 
@@ -3082,10 +2150,16 @@ mod tests { for chunk in bytes[xlogoff..].chunks(50) { decoder.feed_bytes(chunk); while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() { - let mut decoded = DecodedWALRecord::default(); - decode_wal_record(recdata, &mut decoded, modification.tline.pg_version).unwrap(); + let interpreted = InterpretedWalRecord::from_bytes_filtered( + recdata, + modification.tline.get_shard_identity(), + lsn, + modification.tline.pg_version, + ) + .unwrap(); + walingest - .ingest_record(decoded, lsn, &mut modification, &ctx) + .ingest_record(interpreted, &mut modification, &ctx) .instrument(span.clone()) .await .unwrap(); From f9d8256d559cd767aab4c04106ef732aca8a2811 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 31 Oct 2024 11:51:58 +0100 Subject: [PATCH 15/32] pageserver: don't return option from `DeletionQueue::new` (#9588) `DeletionQueue::new()` always returns deletion workers, so the returned `Option` is redundant. --- pageserver/src/bin/pageserver.rs | 4 +--- pageserver/src/deletion_queue.rs | 13 ++++--------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index c6659345f94c..782122139e0d 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -398,9 +398,7 @@ fn start_pageserver( ControllerUpcallClient::new(conf, &shutdown_pageserver), conf, ); - if let Some(deletion_workers) = deletion_workers { - deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle()); - } + deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle()); // Up to this point no significant I/O has been done: this should have been fast. Record // duration prior to starting I/O intensive phase of startup. 
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 7733bdb640a0..37fa3004676a 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -618,13 +618,11 @@ impl DeletionQueue { /// Caller may use the returned object to construct clients with new_client. /// Caller should tokio::spawn the background() members of the two worker objects returned: /// we don't spawn those inside new() so that the caller can use their runtime/spans of choice. - /// - /// If remote_storage is None, then the returned workers will also be None. pub fn new( remote_storage: GenericRemoteStorage, controller_upcall_client: Option, conf: &'static PageServerConf, - ) -> (Self, Option>) + ) -> (Self, DeletionQueueWorkers) where C: ControlPlaneGenerationsApi + Send + Sync, { @@ -656,7 +654,7 @@ impl DeletionQueue { }, cancel: cancel.clone(), }, - Some(DeletionQueueWorkers { + DeletionQueueWorkers { frontend: ListWriter::new(conf, rx, backend_tx, cancel.clone()), backend: Validator::new( conf, @@ -667,7 +665,7 @@ impl DeletionQueue { cancel.clone(), ), executor: Deleter::new(remote_storage, executor_rx, cancel.clone()), - }), + }, ) } @@ -742,9 +740,7 @@ mod test { ); tracing::debug!("Spawning worker for new queue queue"); - let worker_join = workers - .unwrap() - .spawn_with(&tokio::runtime::Handle::current()); + let worker_join = workers.spawn_with(&tokio::runtime::Handle::current()); let old_worker_join = std::mem::replace(&mut self.worker_join, worker_join); let old_deletion_queue = std::mem::replace(&mut self.deletion_queue, deletion_queue); @@ -855,7 +851,6 @@ mod test { harness.conf, ); - let worker = worker.unwrap(); let worker_join = worker.spawn_with(&tokio::runtime::Handle::current()); Ok(TestSetup { From e96398a552ccd68acac58666665936f5c8cbe431 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 31 Oct 2024 13:05:24 +0000 Subject: [PATCH 16/32] Add support of extensions for v17 (part 4) (#9568) - pg_jsonschema 
0.3.3 - pg_graphql 1.5.9 - rum 65e0a752 - pg_tiktoken a5bc447e update support of extensions for v14-v16: - pg_jsonschema 0.3.1 -> 0.3.3 - pg_graphql 1.5.7 -> 1.5.9 - rum 6ab37053 -> 65e0a752 - pg_tiktoken e64e55aa -> a5bc447e --- compute/compute-node.Dockerfile | 78 +++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 28 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index c2333eda08e2..e4c6589c60ed 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -431,14 +431,11 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/rum.patch /rum.patch -# maybe version-specific -# support for v17 is unknown -# last release 1.3.13 - Sep 19, 2022 -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ - echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \ +# supports v17 since https://github.com/postgrespro/rum/commit/cb1edffc57736cd2a4455f8d0feab0d69928da25 +# doesn't use releases since 1.3.13 - Sep 19, 2022 +# use latest commit from the master branch +RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0feab0d69928da25.tar.gz -O rum.tar.gz && \ + echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . 
&& \ patch -p1 < /rum.patch && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -959,21 +956,31 @@ RUN apt-get install -y protobuf-compiler && \ # ######################################################################################### -FROM rust-extensions-build AS pg-jsonschema-pg-build +FROM rust-extensions-build-pgrx12 AS pg-jsonschema-pg-build ARG PG_VERSION - -RUN case "${PG_VERSION}" in "v17") \ - echo "pg_jsonschema does not yet have a release that supports pg17" && exit 0;; \ +# version 0.3.3 supports v17 +# last release v0.3.3 - Oct 16, 2024 +# +# there were no breaking changes +# so we can use the same version for all postgres versions +RUN case "${PG_VERSION}" in \ + "v14" | "v15" | "v16" | "v17") \ + export PG_JSONSCHEMA_VERSION=0.3.3 \ + export PG_JSONSCHEMA_CHECKSUM=40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ esac && \ - wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \ - echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \ + wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v${PG_JSONSCHEMA_VERSION}.tar.gz -O pg_jsonschema.tar.gz && \ + echo "${PG_JSONSCHEMA_CHECKSUM} pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 # `unsafe-postgres` feature allows to build pgx extensions # against postgres forks that decided to change their ABI name (like us). # With that we can build extensions without forking them and using stock # pgx. As this feature is new few manual version bumps were required. 
- sed -i 's/pgrx = "0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control @@ -984,16 +991,27 @@ RUN case "${PG_VERSION}" in "v17") \ # ######################################################################################### -FROM rust-extensions-build AS pg-graphql-pg-build +FROM rust-extensions-build-pgrx12 AS pg-graphql-pg-build ARG PG_VERSION -RUN case "${PG_VERSION}" in "v17") \ - echo "pg_graphql does not yet have a release that supports pg17 as of now" && exit 0;; \ +# version 1.5.9 supports v17 +# last release v1.5.9 - Oct 16, 2024 +# +# there were no breaking changes +# so we can use the same version for all postgres versions +RUN case "${PG_VERSION}" in \ + "v14" | "v15" | "v16" | "v17") \ + export PG_GRAPHQL_VERSION=1.5.9 \ + export PG_GRAPHQL_CHECKSUM=cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ esac && \ - wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \ - echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \ + wget https://github.com/supabase/pg_graphql/archive/refs/tags/v${PG_GRAPHQL_VERSION}.tar.gz -O pg_graphql.tar.gz && \ + echo "${PG_GRAPHQL_CHECKSUM} pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . 
&& \ - sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ # it's needed to enable extension because it uses untrusted C language sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ @@ -1006,15 +1024,13 @@ RUN case "${PG_VERSION}" in "v17") \ # ######################################################################################### -FROM rust-extensions-build AS pg-tiktoken-pg-build +FROM rust-extensions-build-pgrx12 AS pg-tiktoken-pg-build ARG PG_VERSION -# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023 -RUN case "${PG_VERSION}" in "v17") \ - echo "pg_tiktoken does not have versions, nor support for pg17" && exit 0;; \ - esac && \ - wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ - echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \ +# doesn't use releases +# 9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7 - on Oct 29, 2024 +RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7.tar.gz -O pg_tiktoken.tar.gz && \ + echo "a5bc447e7920ee149d3c064b8b9f0086c0e83939499753178f7d35788416f628 pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . 
&& \ # TODO update pgrx version in the pg_tiktoken repo and remove this line sed -i 's/pgrx = { version = "=0.10.2",/pgrx = { version = "0.11.3",/g' Cargo.toml && \ @@ -1032,6 +1048,8 @@ RUN case "${PG_VERSION}" in "v17") \ FROM rust-extensions-build AS pg-pgx-ulid-build ARG PG_VERSION +# doesn't support v17 yet +# https://github.com/pksunkara/pgx_ulid/pull/52 RUN case "${PG_VERSION}" in "v17") \ echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \ esac && \ @@ -1052,6 +1070,10 @@ RUN case "${PG_VERSION}" in "v17") \ FROM rust-extensions-build AS pg-session-jwt-build ARG PG_VERSION +# TODO use versioned releases +# add v17 support +# NOTE: local_proxy depends on the version of pg_session_jwt +# Do not update without approve from proxy team RUN case "${PG_VERSION}" in "v17") \ echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \ esac && \ From 51fda118f608271ce8f35662ccff4484d45778da Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 31 Oct 2024 14:34:50 +0100 Subject: [PATCH 17/32] increase lifetime of AWS session token to 12 hours (#9590) ## Problem clickbench regression causes clickbench to run >9 hours and the AWS session token is expired before the run completes ## Summary of changes extend lifetime of session token for this job to 12 hours --- .github/workflows/benchmarking.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 69b8bc5d7075..abc58733b347 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -683,7 +683,7 @@ jobs: with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 18000 # 5 hours + role-duration-seconds: 43200 # 12 hours - name: Download Neon artifact uses: ./.github/actions/download From 552088ac1635f88a65914f6fc540fd343db66e57 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 31 Oct 2024 
14:44:59 +0000 Subject: [PATCH 18/32] pageserver: fix spurious error logs in timeline lifecycle (#9589) ## Problem The final part of https://github.com/neondatabase/neon/issues/9543 will be a chaos test that creates/deletes/archives/offloads timelines while restarting pageservers and migrating tenants. Developing that test showed up a few places where we log errors during normal shutdown. ## Summary of changes - UninitializedTimeline's drop should log at info severity: this is a normal code path when some part of timeline creation encounters a cancellation `?` path. - When offloading and finding a `RemoteTimelineClient` in a non-initialized state, this is not an error and should not be logged as such. - The `offload_timeline` function returned an anyhow error, so callers couldn't gracefully pick out cancellation errors from real errors: update this to have a structured error type and use it throughout. --- pageserver/src/http/routes.rs | 9 +++- pageserver/src/tenant.rs | 8 ++- pageserver/src/tenant/tasks.rs | 1 + pageserver/src/tenant/timeline.rs | 13 +++++ pageserver/src/tenant/timeline/offload.rs | 56 ++++++++++++++------- pageserver/src/tenant/timeline/uninit.rs | 4 +- test_runner/regress/test_broken_timeline.py | 2 - test_runner/regress/test_import.py | 1 - test_runner/regress/test_tenant_delete.py | 2 - 9 files changed, 68 insertions(+), 28 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2d8f4309ca46..ef8efd3f277f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -80,6 +80,7 @@ use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::offload::offload_timeline; +use crate::tenant::timeline::offload::OffloadError; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; @@ -2004,7 +2005,12 @@ async fn 
timeline_offload_handler( } offload_timeline(&tenant, &timeline) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| { + match e { + OffloadError::Cancelled => ApiError::ResourceUnavailable("Timeline shutting down".into()), + _ => ApiError::InternalServerError(anyhow!(e)) + } + })?; json_response(StatusCode::OK, ()) } @@ -2060,6 +2066,7 @@ async fn timeline_checkpoint_handler( .map_err(|e| match e { CompactionError::ShuttingDown => ApiError::ShuttingDown, + CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), CompactionError::Other(e) => ApiError::InternalServerError(e) } )?; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8237f4662cea..68f8f7e13c72 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2529,6 +2529,11 @@ impl Tenant { .await .inspect_err(|e| match e { timeline::CompactionError::ShuttingDown => (), + timeline::CompactionError::Offload(_) => { + // Failures to offload timelines do not trip the circuit breaker, because + // they do not do lots of writes the way compaction itself does: it is cheap + // to retry, and it would be bad to stop all compaction because of an issue with offloading. 
+ } timeline::CompactionError::Other(e) => { self.compaction_circuit_breaker .lock() @@ -2544,8 +2549,7 @@ impl Tenant { if pending_task_left == Some(false) && *can_offload { offload_timeline(self, timeline) .instrument(info_span!("offload_timeline", %timeline_id)) - .await - .map_err(timeline::CompactionError::Other)?; + .await?; } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 547739e7734c..16dac10dca22 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -279,6 +279,7 @@ fn log_compaction_error( let decision = match e { ShuttingDown => None, + Offload(_) => Some(LooksLike::Error), _ if task_cancelled => Some(LooksLike::Info), Other(e) => { let root_cause = e.root_cause(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d765a7c987b6..12919866a374 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -20,6 +20,7 @@ use chrono::{DateTime, Utc}; use enumset::EnumSet; use fail::fail_point; use handle::ShardTimelineId; +use offload::OffloadError; use once_cell::sync::Lazy; use pageserver_api::{ key::{ @@ -4475,11 +4476,23 @@ impl Drop for Timeline { pub(crate) enum CompactionError { #[error("The timeline or pageserver is shutting down")] ShuttingDown, + /// Compaction tried to offload a timeline and failed + #[error("Failed to offload timeline: {0}")] + Offload(OffloadError), /// Compaction cannot be done right now; page reconstruction and so on. 
#[error(transparent)] Other(anyhow::Error), } +impl From for CompactionError { + fn from(e: OffloadError) -> Self { + match e { + OffloadError::Cancelled => Self::ShuttingDown, + _ => Self::Offload(e), + } + } +} + impl CompactionError { pub fn is_cancelled(&self) -> bool { matches!(self, CompactionError::ShuttingDown) diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 5b196cf8a79f..c77c2400007e 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -3,18 +3,40 @@ use std::sync::Arc; use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}; use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::{OffloadedTimeline, Tenant, TimelineOrOffloaded}; +use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded}; + +#[derive(thiserror::Error, Debug)] +pub(crate) enum OffloadError { + #[error("Cancelled")] + Cancelled, + #[error("Timeline is not archived")] + NotArchived, + #[error(transparent)] + RemoteStorage(anyhow::Error), + #[error("Unexpected offload error: {0}")] + Other(anyhow::Error), +} + +impl From for OffloadError { + fn from(e: TenantManifestError) -> Self { + match e { + TenantManifestError::Cancelled => Self::Cancelled, + TenantManifestError::RemoteStorage(e) => Self::RemoteStorage(e), + } + } +} pub(crate) async fn offload_timeline( tenant: &Tenant, timeline: &Arc, -) -> anyhow::Result<()> { +) -> Result<(), OffloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); tracing::info!("offloading archived timeline"); let allow_offloaded_children = true; let (timeline, guard) = - DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children)?; + DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children) + .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; let 
TimelineOrOffloaded::Timeline(timeline) = timeline else { tracing::error!("timeline already offloaded, but given timeline object"); @@ -26,14 +48,15 @@ pub(crate) async fn offload_timeline( Some(true) => (), Some(false) => { tracing::warn!(?is_archived, "tried offloading a non-archived timeline"); - anyhow::bail!("timeline isn't archived"); + return Err(OffloadError::NotArchived); } None => { - tracing::warn!( + // This is legal: calls to this function can race with the timeline shutting down + tracing::info!( ?is_archived, - "tried offloading a timeline where manifest is not yet available" + "tried offloading a timeline whose remote storage is not initialized" ); - anyhow::bail!("timeline manifest hasn't been loaded yet"); + return Err(OffloadError::Cancelled); } } @@ -44,9 +67,11 @@ pub(crate) async fn offload_timeline( // to make deletions possible while offloading is in progress let conf = &tenant.conf; - delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await?; + delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline) + .await + .map_err(OffloadError::Other)?; - remove_timeline_from_tenant(tenant, &timeline, &guard).await?; + remove_timeline_from_tenant(tenant, &timeline, &guard); { let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap(); @@ -65,21 +90,18 @@ pub(crate) async fn offload_timeline( // at the next restart attach it again. // For that to happen, we'd need to make the manifest reflect our *intended* state, // not our actual state of offloaded timelines. - tenant - .store_tenant_manifest() - .await - .map_err(|e| anyhow::anyhow!(e))?; + tenant.store_tenant_manifest().await?; Ok(()) } /// It is important that this gets called when DeletionGuard is being held. 
/// For more context see comments in [`DeleteTimelineFlow::prepare`] -async fn remove_timeline_from_tenant( +fn remove_timeline_from_tenant( tenant: &Tenant, timeline: &Timeline, _: &DeletionGuard, // using it as a witness -) -> anyhow::Result<()> { +) { // Remove the timeline from the map. let mut timelines = tenant.timelines.lock().unwrap(); let children_exist = timelines @@ -95,8 +117,4 @@ async fn remove_timeline_from_tenant( timelines .remove(&timeline.timeline_id) .expect("timeline that we were deleting was concurrently removed from 'timelines' map"); - - drop(timelines); - - Ok(()) } diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index c398289a5c8d..a93bdde3f8d3 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -141,7 +141,9 @@ impl Drop for UninitializedTimeline<'_> { fn drop(&mut self) { if let Some((_, create_guard)) = self.raw_timeline.take() { let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered(); - error!("Timeline got dropped without initializing, cleaning its files"); + // This is unusual, but can happen harmlessly if the pageserver is stopped while + // creating a timeline. 
+ info!("Timeline got dropped without initializing, cleaning its files"); cleanup_timeline_directory(create_guard); } } diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 99e0e23b4a63..124e62999abf 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -103,7 +103,6 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder) env.pageserver.allowed_errors.extend( [ ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*", - ".*Timeline got dropped without initializing, cleaning its files.*", ] ) @@ -145,7 +144,6 @@ def test_timeline_init_break_before_checkpoint_recreate( env.pageserver.allowed_errors.extend( [ ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*", - ".*Timeline got dropped without initializing, cleaning its files.*", ".*Failed to load index_part from remote storage, failed creation?.*", ] ) diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index e367db33ff88..743fa72aba79 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -91,7 +91,6 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build [ ".*Failed to import basebackup.*", ".*unexpected non-zero bytes after the tar archive.*", - ".*Timeline got dropped without initializing, cleaning its files.*", ".*InternalServerError.*timeline not found.*", ".*InternalServerError.*Tenant .* not found.*", ".*InternalServerError.*Timeline .* not found.*", diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index f4863274457d..47df3ead7020 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -146,8 +146,6 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE 
env.pageserver.allowed_errors.extend( [ - # happens with the cancellation bailing flushing loop earlier, leaving disk_consistent_lsn at zero - ".*Timeline got dropped without initializing, cleaning its files", # the response hit_pausable_failpoint_and_later_fail f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn", ] From 897cffb9d86389c990e65eba1f3ddac1213b0363 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 31 Oct 2024 14:57:55 +0000 Subject: [PATCH 19/32] auth_broker: fix local_proxy conn count (#9593) our current metrics for http pool opened connections is always negative :D oops --- proxy/src/serverless/http_conn_pool.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 934a50c14ff5..b92ae3131016 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -294,6 +294,11 @@ pub(crate) fn poll_http2_client( conn_id, aux: aux.clone(), }); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .inc(); Arc::downgrade(&pool) } @@ -306,7 +311,7 @@ pub(crate) fn poll_http2_client( let res = connection.await; match res { Ok(()) => info!("connection closed"), - Err(e) => error!(%session_id, "connection error: {}", e), + Err(e) => error!(%session_id, "connection error: {e:?}"), } // remove from connection pool From 9761b6a64e80a4e8bce4b00afce5c2c4f6b825bd Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 31 Oct 2024 15:50:41 +0000 Subject: [PATCH 20/32] update pg_session_jwt to use pgrx 0.12 for pg17 (#9595) Updates the extension to use pgrx 0.12. No changes to the extensions have been made, the only difference is the pgrx version. 
--- compute/compute-node.Dockerfile | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index e4c6589c60ed..30126de56c9a 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1067,20 +1067,16 @@ RUN case "${PG_VERSION}" in "v17") \ # ######################################################################################### -FROM rust-extensions-build AS pg-session-jwt-build +FROM rust-extensions-build-pgrx12 AS pg-session-jwt-build ARG PG_VERSION -# TODO use versioned releases -# add v17 support # NOTE: local_proxy depends on the version of pg_session_jwt # Do not update without approve from proxy team -RUN case "${PG_VERSION}" in "v17") \ - echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \ - esac && \ - wget https://github.com/neondatabase/pg_session_jwt/archive/e1310b08ba51377a19e0559e4d1194883b9b2ba2.tar.gz -O pg_session_jwt.tar.gz && \ - echo "837932a077888d5545fd54b0abcc79e5f8e37017c2769a930afc2f5c94df6f4e pg_session_jwt.tar.gz" | sha256sum --check && \ +# Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs +RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2-v17.tar.gz -O pg_session_jwt.tar.gz && \ + echo "c8ecbed9cb8c6441bce5134a176002b043018adf9d05a08e457dda233090a86e pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . 
&& \ - sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release ######################################################################################### From e589c2e5ecce03b169ef24d8639e42beeec48837 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 31 Oct 2024 18:29:16 +0000 Subject: [PATCH 21/32] storage_controller: allow deployment infra to use infra token (#9596) ## Problem We wish for the deployment orchestrator to use infra scoped tokens, but storcon endpoints it's using require admin scoped tokens. ## Summary of Changes Switch over all endpoints that are used by the deployment orchestrator to use an infra scoped token. This causes no breakage during mixed version scenarios because admin scoped tokens allow access to all endpoints. The deployment orchestrator can cut over to the infra token after this commit touches down in prod. Once this commit is released we should also update the tests code to use infra scoped tokens where appropriate. Currently it would fail on the [compat tests](https://github.com/neondatabase/neon/blob/9761b6a64e80a4e8bce4b00afce5c2c4f6b825bd/test_runner/regress/test_storage_controller.py#L69-L71). 
--- storage_controller/src/http.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index face3d2c2d29..f6ea1aedc626 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -658,7 +658,7 @@ async fn handle_node_register(req: Request) -> Result, ApiE } async fn handle_node_list(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Infra)?; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -737,7 +737,7 @@ async fn handle_node_configure(req: Request) -> Result, Api } async fn handle_node_status(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Infra)?; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -786,7 +786,7 @@ async fn handle_get_leader(req: Request) -> Result, ApiErro } async fn handle_node_drain(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Infra)?; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -804,7 +804,7 @@ async fn handle_node_drain(req: Request) -> Result, ApiErro } async fn handle_cancel_node_drain(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Infra)?; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -822,7 +822,7 @@ async fn handle_cancel_node_drain(req: Request) -> Result, } async fn handle_node_fill(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Infra)?; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -840,7 +840,7 @@ async fn handle_node_fill(req: Request) -> Result, ApiError } async fn handle_cancel_node_fill(req: Request) -> Result, ApiError> { - 
check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Infra)?; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { From 2d1366c8ee217da3c09a1d3d68a3cfc7e98f500a Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 1 Nov 2024 11:22:38 +0000 Subject: [PATCH 22/32] fix pre-commit hook with python stubs (#9602) fix #9601 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3f21094ba42f..92580ee156fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,7 +67,7 @@ exclude = [ check_untyped_defs = true # Help mypy find imports when running against list of individual files. # Without this line it would behave differently when executed on the entire project. -mypy_path = "$MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner" +mypy_path = "$MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner:$MYPY_CONFIG_FILE_DIR/test_runner/stubs" disallow_incomplete_defs = false disallow_untyped_calls = false From 4c2c8d67081ee7856246e92df68dc13b1009c1a6 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 1 Nov 2024 12:25:04 +0100 Subject: [PATCH 23/32] test_runner: fix `tenant_get_shards` with one pageserver (#9603) ## Problem `tenant_get_shards()` does not work with a sharded tenant on 1 pageserver, as it assumes an unsharded tenant in this case. This special case appears to have been added to handle e.g. `test_emergency_mode`, where the storage controller is stopped. This breaks e.g. the sharded ingest benchmark in #9591 when run with a single shard. ## Summary of changes Correctly look up shards even with a single pageserver, but add a special case that assumes an unsharded tenant if the storage controller is stopped and the caller provides an explicit pageserver, in order to accommodate `test_emergency_mode`.
--- test_runner/fixtures/neon_fixtures.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1b9bc873f4be..e4d6e6da5df7 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1397,7 +1397,7 @@ def neon_simple_env( pageserver_virtual_file_io_mode: Optional[str], ) -> Iterator[NeonEnv]: """ - Simple Neon environment, with no authentication and no safekeepers. + Simple Neon environment, with 1 safekeeper and 1 pageserver. No authentication, no fsync. This fixture will use RemoteStorageKind.LOCAL_FS with pageserver. """ @@ -4701,6 +4701,7 @@ def tenant_get_shards( If the caller provides `pageserver_id`, it will be used for all shards, even if the shard is indicated by storage controller to be on some other pageserver. + If the storage controller is not running, assume an unsharded tenant. Caller should over the response to apply their per-pageserver action to each shard @@ -4710,17 +4711,17 @@ def tenant_get_shards( else: override_pageserver = None - if len(env.pageservers) > 1: - return [ - ( - TenantShardId.parse(s["shard_id"]), - override_pageserver or env.get_pageserver(s["node_id"]), - ) - for s in env.storage_controller.locate(tenant_id) - ] - else: - # Assume an unsharded tenant - return [(TenantShardId(tenant_id, 0, 0), override_pageserver or env.pageserver)] + if not env.storage_controller.running and override_pageserver is not None: + log.warning(f"storage controller not running, assuming unsharded tenant {tenant_id}") + return [(TenantShardId(tenant_id, 0, 0), override_pageserver)] + + return [ + ( + TenantShardId.parse(s["shard_id"]), + override_pageserver or env.get_pageserver(s["node_id"]), + ) + for s in env.storage_controller.locate(tenant_id) + ] def wait_replica_caughtup(primary: Endpoint, secondary: Endpoint): From 8b3bcf71eee04dc24398b37b82e5d8e528c2d0c4 Mon Sep 17 00:00:00 2001 
From: Peter Bendel Date: Fri, 1 Nov 2024 12:46:02 +0100 Subject: [PATCH 24/32] revert higher token expiration (#9605) ## Problem The IAM role associated with our github action runner supports a max token expiration which is lower than the value we tried. ## Summary of changes Since we believe we have understood the performance regression (by ensuring availability zone affinity of compute and pageserver), the job should again run in less than 5 hours, so we revert this change instead of increasing the max session token expiration in the IAM role, which would reduce our security. --- .github/workflows/benchmarking.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index abc58733b347..69b8bc5d7075 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -683,7 +683,7 @@ jobs: with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 43200 # 12 hours + role-duration-seconds: 18000 # 5 hours - name: Download Neon artifact uses: ./.github/actions/download From 123816e99ac3b150aecfc71002f53cf0b1e64bf0 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 1 Nov 2024 13:47:03 +0100 Subject: [PATCH 25/32] safekeeper: log slow WalAcceptor sends (#9564) ## Problem We don't have any observability into full WalAcceptor queues per timeline. ## Summary of changes Logs a message when a WalAcceptor send has blocked for 5 seconds, and another message when the send completes. This implies that the log frequency is at most once every 5 seconds per timeline, so we don't need further throttling.
--- safekeeper/src/receive_wal.rs | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index f97e127a1724..2410e22f450f 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -26,10 +26,11 @@ use std::net::SocketAddr; use std::sync::Arc; use tokio::io::AsyncRead; use tokio::io::AsyncWrite; +use tokio::sync::mpsc::error::SendTimeoutError; use tokio::sync::mpsc::{channel, Receiver, Sender}; use tokio::task; use tokio::task::JoinHandle; -use tokio::time::{Duration, MissedTickBehavior}; +use tokio::time::{Duration, Instant, MissedTickBehavior}; use tracing::*; use utils::id::TenantTimelineId; use utils::lsn::Lsn; @@ -384,9 +385,29 @@ async fn read_network_loop( msg_tx: Sender, mut next_msg: ProposerAcceptorMessage, ) -> Result<(), CopyStreamHandlerEnd> { + /// Threshold for logging slow WalAcceptor sends. + const SLOW_THRESHOLD: Duration = Duration::from_secs(5); + loop { - if msg_tx.send(next_msg).await.is_err() { - return Ok(()); // chan closed, WalAcceptor terminated + let started = Instant::now(); + match msg_tx.send_timeout(next_msg, SLOW_THRESHOLD).await { + Ok(()) => {} + // Slow send, log a message and keep trying. Log context has timeline ID. + Err(SendTimeoutError::Timeout(next_msg)) => { + warn!( + "slow WalAcceptor send blocked for {:.3}s", + Instant::now().duration_since(started).as_secs_f64() + ); + if msg_tx.send(next_msg).await.is_err() { + return Ok(()); // WalAcceptor terminated + } + warn!( + "slow WalAcceptor send completed after {:.3}s", + Instant::now().duration_since(started).as_secs_f64() + ) + } + // WalAcceptor terminated. 
+ Err(SendTimeoutError::Closed(_)) => return Ok(()), } next_msg = read_message(pgb_reader).await?; } From 3c16bd6e0bbc5e39111188cfca571b5033d3a377 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Nov 2024 16:47:20 +0000 Subject: [PATCH 26/32] storcon: skip non-active projects in chaos injection (#9606) ## Problem We may sometimes use scheduling modes like `Pause` to pin a tenant in its current location for operational reasons. It is undesirable for the chaos task to make any changes to such projects. ## Summary of changes - Add a check for scheduling mode - Add a log line when we do choose to do a chaos action for a tenant: this will help us understand which operations originate from the chaos task. --- storage_controller/src/service/chaos_injector.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 99961d691c7c..0e551beaa717 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -1,5 +1,6 @@ use std::{sync::Arc, time::Duration}; +use pageserver_api::controller_api::ShardSchedulingPolicy; use rand::seq::SliceRandom; use rand::thread_rng; use tokio_util::sync::CancellationToken; @@ -47,6 +48,16 @@ impl ChaosInjector { .get_mut(victim) .expect("Held lock between choosing ID and this get"); + if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) { + // Skip non-active scheduling policies, so that a shard with a policy like Pause can + // be pinned without being disrupted by us. 
+ tracing::info!( + "Skipping shard {victim}: scheduling policy is {:?}", + shard.get_scheduling_policy() + ); + continue; + } + // Pick a secondary to promote let Some(new_location) = shard .intent @@ -63,6 +74,8 @@ impl ChaosInjector { continue; }; + tracing::info!("Injecting chaos: migrate {victim} {old_location}->{new_location}"); + shard.intent.demote_attached(scheduler, old_location); shard.intent.promote_attached(scheduler, new_location); self.service.maybe_reconcile_shard(shard, nodes); From 8ac523d2ee8e29ecc6891f6bd661fcf51b2147ba Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 1 Nov 2024 20:31:29 +0200 Subject: [PATCH 27/32] Do not assign page LSN to new (uninitialized) page in ClearVisibilityMapFlags redo handler (#9287) ## Problem https://neondb.slack.com/archives/C04DGM6SMTM/p1727872045252899 See https://github.com/neondatabase/neon/issues/9240 ## Summary of changes Add `!page_is_new` check before assigning page lsn. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pageserver/src/walredo/apply_neon.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index 7aaa357318bd..d712d8bf5efc 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -67,7 +67,10 @@ pub(crate) fn apply_in_neon( let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; map[map_byte as usize] &= !(flags << map_offset); - postgres_ffi::page_set_lsn(page, lsn); + // The page should never be empty, but we're checking it anyway as a precaution, so that if it is empty for some reason anyway, we don't make matters worse by setting the LSN on it. + if !postgres_ffi::page_is_new(page) { + postgres_ffi::page_set_lsn(page, lsn); + } } // Repeat for 'old_heap_blkno', if any @@ -81,7 +84,10 @@ pub(crate) fn apply_in_neon( let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; map[map_byte as usize] &= !(flags << map_offset); - postgres_ffi::page_set_lsn(page, lsn); + // The page should never be empty, but we're checking it anyway as a precaution, so that if it is empty for some reason anyway, we don't make matters worse by setting the LSN on it. + if !postgres_ffi::page_is_new(page) { + postgres_ffi::page_set_lsn(page, lsn); + } } } // Non-relational WAL records are handled here, with custom code that has the From 0058eb09df13ba13ead20a8a34ceefa4a3580f23 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sat, 2 Nov 2024 17:42:10 +0100 Subject: [PATCH 28/32] test_runner/performance: add sharded ingest benchmark (#9591) Adds a Python benchmark for sharded ingestion. This ingests 7 GB of WAL (100M rows) into a Safekeeper and fans out to 10 shards running on 10 different pageservers. The ingest volume and duration is recorded. 
--- .../performance/test_sharded_ingest.py | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 test_runner/performance/test_sharded_ingest.py diff --git a/test_runner/performance/test_sharded_ingest.py b/test_runner/performance/test_sharded_ingest.py new file mode 100644 index 000000000000..77e8f2cf17d6 --- /dev/null +++ b/test_runner/performance/test_sharded_ingest.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +from contextlib import closing + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.common_types import Lsn, TenantShardId +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + tenant_get_shards, + wait_for_last_flush_lsn, +) + + +@pytest.mark.timeout(600) +@pytest.mark.parametrize("shard_count", [1, 8, 32]) +def test_sharded_ingest( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + shard_count: int, +): + """ + Benchmarks sharded ingestion throughput, by ingesting a large amount of WAL into a Safekeeper + and fanning out to a large number of shards on dedicated Pageservers. Comparing the base case + (shard_count=1) to the sharded case indicates the overhead of sharding. + """ + + ROW_COUNT = 100_000_000 # about 7 GB of WAL + + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start() + + # Create a sharded tenant and timeline, and migrate it to the respective pageservers. Ensure + # the storage controller doesn't mess with shard placements. + # + # TODO: there should be a way to disable storage controller background reconciliations. + # Currently, disabling reconciliation also disables foreground operations. 
+ tenant_id, timeline_id = env.create_tenant(shard_count=shard_count) + + for shard_number in range(0, shard_count): + tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count) + pageserver_id = shard_number + 1 + env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_id) + + shards = tenant_get_shards(env, tenant_id) + env.storage_controller.reconcile_until_idle() + assert tenant_get_shards(env, tenant_id) == shards, "shards moved" + + # Start the endpoint. + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + + # Ingest data and measure WAL volume and duration. + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + log.info("Ingesting data") + cur.execute("set statement_timeout = 0") + cur.execute("create table huge (i int, j int)") + + with zenbenchmark.record_duration("pageserver_ingest"): + with zenbenchmark.record_duration("wal_ingest"): + cur.execute(f"insert into huge values (generate_series(1, {ROW_COUNT}), 0)") + + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + + end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024)) + zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM) + + assert tenant_get_shards(env, tenant_id) == shards, "shards moved" From 4534f5cdc663e81f9ccd233d0cc2b2db733658c6 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 4 Nov 2024 09:11:52 +0000 Subject: [PATCH 29/32] pageserver: make local timeline deletion infallible (#9594) ## Problem In https://github.com/neondatabase/neon/pull/9589, timeline offload code is modified to return an explicit error type rather than propagating anyhow::Error. 
One of the 'Other' cases there is I/O errors from local timeline deletion, which shouldn't need to exist, because our policy is not to try and continue running if the local disk gives us errors. ## Summary of changes - Make `delete_local_timeline_directory` infallible and use `.fatal_err()` on I/O errors --------- Co-authored-by: Erik Grinaker --- pageserver/src/tenant/timeline/delete.rs | 40 +++++++++-------------- pageserver/src/tenant/timeline/offload.rs | 4 +-- 2 files changed, 17 insertions(+), 27 deletions(-) diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index b0c4fa2bc995..5a4c2d9da34f 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -18,6 +18,7 @@ use crate::{ CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, TimelineOrOffloaded, }, + virtual_file::MaybeFatalIo, }; use super::{Timeline, TimelineResources}; @@ -62,10 +63,10 @@ pub(super) async fn delete_local_timeline_directory( conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline: &Timeline, -) -> anyhow::Result<()> { +) { // Always ensure the lock order is compaction -> gc. let compaction_lock = timeline.compaction_lock.lock(); - let compaction_lock = crate::timed( + let _compaction_lock = crate::timed( compaction_lock, "acquires compaction lock", std::time::Duration::from_secs(5), @@ -73,7 +74,7 @@ pub(super) async fn delete_local_timeline_directory( .await; let gc_lock = timeline.gc_lock.lock(); - let gc_lock = crate::timed( + let _gc_lock = crate::timed( gc_lock, "acquires gc lock", std::time::Duration::from_secs(5), @@ -85,24 +86,15 @@ pub(super) async fn delete_local_timeline_directory( let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id); - fail::fail_point!("timeline-delete-before-rm", |_| { - Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
- }); - // NB: This need not be atomic because the deleted flag in the IndexPart // will be observed during tenant/timeline load. The deletion will be resumed there. // - // Note that here we do not bail out on std::io::ErrorKind::NotFound. - // This can happen if we're called a second time, e.g., - // because of a previous failure/cancellation at/after - // failpoint timeline-delete-after-rm. - // - // ErrorKind::NotFound can also happen if we race with tenant detach, because, + // ErrorKind::NotFound can happen e.g. if we race with tenant detach, because, // no locks are shared. tokio::fs::remove_dir_all(local_timeline_directory) .await .or_else(fs_ext::ignore_not_found) - .context("remove local timeline directory")?; + .fatal_err("removing timeline directory"); // Make sure previous deletions are ordered before mark removal. // Otherwise there is no guarantee that they reach the disk before mark deletion. @@ -113,17 +105,9 @@ pub(super) async fn delete_local_timeline_directory( let timeline_path = conf.timelines_path(&tenant_shard_id); crashsafe::fsync_async(timeline_path) .await - .context("fsync_pre_mark_remove")?; + .fatal_err("fsync after removing timeline directory"); info!("finished deleting layer files, releasing locks"); - drop(gc_lock); - drop(compaction_lock); - - fail::fail_point!("timeline-delete-after-rm", |_| { - Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? - }); - - Ok(()) } /// Removes remote layers and an index file after them. @@ -440,12 +424,20 @@ impl DeleteTimelineFlow { timeline: &TimelineOrOffloaded, remote_client: Arc, ) -> Result<(), DeleteTimelineError> { + fail::fail_point!("timeline-delete-before-rm", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))? 
+ }); + // Offloaded timelines have no local state // TODO: once we persist offloaded information, delete the timeline from there, too if let TimelineOrOffloaded::Timeline(timeline) = timeline { - delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?; + delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await; } + fail::fail_point!("timeline-delete-after-rm", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? + }); + delete_remote_layers_and_index(&remote_client).await?; pausable_failpoint!("in_progress_delete"); diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index c77c2400007e..cccf24e303a9 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -67,9 +67,7 @@ pub(crate) async fn offload_timeline( // to make deletions possible while offloading is in progress let conf = &tenant.conf; - delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline) - .await - .map_err(OffloadError::Other)?; + delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await; remove_timeline_from_tenant(tenant, &timeline, &guard); From d5de63c6b86816b61f5997bff503f0eb20d5fa72 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Mon, 4 Nov 2024 13:10:32 +0100 Subject: [PATCH 30/32] Fix a time zone issue in a PG17 test case (#9618) The commit was cherry-picked and thus shouldn't cause issues once we merge the release tag for PostgreSQL 17.1 --- vendor/postgres-v17 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 68b5038f27e4..9ad2f3c5c37c 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 68b5038f27e493bde6ae552fe066f10cbdfe6a14 +Subproject commit 9ad2f3c5c37c08069a01c1e3f6b7cf275437e0cb diff --git a/vendor/revisions.json b/vendor/revisions.json index 
896a75814e93..18bde183590a 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.0", - "68b5038f27e493bde6ae552fe066f10cbdfe6a14" + "9ad2f3c5c37c08069a01c1e3f6b7cf275437e0cb" ], "v16": [ "16.4", From 3dcdbcc34dbf64a296a78a8252c0b42d7137cc3c Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 4 Nov 2024 13:29:13 +0000 Subject: [PATCH 31/32] remove aws-lc-rs dep and fix storage_broker tls (#9613) It seems the ecosystem is not so keen on moving to aws-lc-rs as it's build setup is more complicated than ring (requiring cmake). Eventually I expect the ecosystem should pivot to https://github.com/ctz/graviola/tree/main/rustls-graviola as it stabilises (it has a very simply build step and license), but for now let's try not have a headache of juggling two crypto libs. I also noticed that tonic will just fail with tls without a default provider, so I added some defensive code for that. --- Cargo.lock | 150 +++--------------- Cargo.toml | 4 +- libs/postgres_backend/tests/simple_select.rs | 6 +- proxy/src/bin/pg_sni_router.rs | 17 +- proxy/src/compute.rs | 18 +-- proxy/src/config.rs | 6 +- proxy/src/proxy/tests/mod.rs | 28 ++-- storage_broker/Cargo.toml | 1 + storage_broker/src/lib.rs | 6 + .../src/scan_safekeeper_metadata.rs | 6 +- workspace_hack/Cargo.toml | 9 +- 11 files changed, 72 insertions(+), 179 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c5af247e8be4..44ef6d960c69 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -310,33 +310,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "aws-lc-rs" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f95446d919226d587817a7d21379e6eb099b97b45110a7f272a444ca5c54070" -dependencies = [ - "aws-lc-sys", - "mirai-annotations", - "paste", - "zeroize", -] - -[[package]] -name = "aws-lc-sys" -version = "0.21.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b3ddc4a5b231dd6958b140ff3151b6412b3f4321fab354f399eec8f14b06df62" -dependencies = [ - "bindgen 0.69.5", - "cc", - "cmake", - "dunce", - "fs_extra", - "libc", - "paste", -] - [[package]] name = "aws-runtime" version = "1.4.3" @@ -942,29 +915,6 @@ dependencies = [ "serde", ] -[[package]] -name = "bindgen" -version = "0.69.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" -dependencies = [ - "bitflags 2.4.1", - "cexpr", - "clang-sys", - "itertools 0.10.5", - "lazy_static", - "lazycell", - "log", - "prettyplease", - "proc-macro2", - "quote", - "regex", - "rustc-hash", - "shlex", - "syn 2.0.52", - "which", -] - [[package]] name = "bindgen" version = "0.70.1" @@ -974,7 +924,7 @@ dependencies = [ "bitflags 2.4.1", "cexpr", "clang-sys", - "itertools 0.10.5", + "itertools 0.12.1", "log", "prettyplease", "proc-macro2", @@ -1220,15 +1170,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" -[[package]] -name = "cmake" -version = "0.1.51" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" -dependencies = [ - "cc", -] - [[package]] name = "colorchoice" version = "1.0.0" @@ -1329,9 +1270,9 @@ dependencies = [ [[package]] name = "const-oid" -version = "0.9.5" +version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" [[package]] name = "const-random" @@ -1815,12 +1756,6 @@ dependencies = [ "syn 2.0.52", ] -[[package]] -name = "dunce" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" - 
[[package]] name = "dyn-clone" version = "1.0.14" @@ -2125,12 +2060,6 @@ dependencies = [ "tokio-util", ] -[[package]] -name = "fs_extra" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" - [[package]] name = "fsevent-sys" version = "4.1.0" @@ -2484,15 +2413,6 @@ dependencies = [ "digest", ] -[[package]] -name = "home" -version = "0.5.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" -dependencies = [ - "windows-sys 0.52.0", -] - [[package]] name = "hostname" version = "0.4.0" @@ -2988,12 +2908,6 @@ dependencies = [ "spin", ] -[[package]] -name = "lazycell" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" - [[package]] name = "libc" version = "0.2.150" @@ -3224,12 +3138,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "mirai-annotations" -version = "1.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1" - [[package]] name = "multimap" version = "0.8.3" @@ -4147,7 +4055,7 @@ dependencies = [ "bytes", "once_cell", "pq_proto", - "rustls 0.23.7", + "rustls 0.23.16", "rustls-pemfile 2.1.1", "serde", "thiserror", @@ -4176,7 +4084,7 @@ name = "postgres_ffi" version = "0.1.0" dependencies = [ "anyhow", - "bindgen 0.70.1", + "bindgen", "bytes", "crc32c", "env_logger", @@ -4314,7 +4222,7 @@ checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15" dependencies = [ "bytes", "heck 0.5.0", - "itertools 0.10.5", + "itertools 0.12.1", "log", "multimap", "once_cell", @@ -4334,7 +4242,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" dependencies = [ "anyhow", - "itertools 0.10.5", + "itertools 0.12.1", "proc-macro2", "quote", "syn 2.0.52", @@ -4422,7 +4330,7 @@ dependencies = [ "rsa", "rstest", "rustc-hash", - "rustls 0.23.7", + "rustls 0.23.16", "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", "scopeguard", @@ -5106,23 +5014,22 @@ dependencies = [ "log", "ring", "rustls-pki-types", - "rustls-webpki 0.102.2", + "rustls-webpki 0.102.8", "subtle", "zeroize", ] [[package]] name = "rustls" -version = "0.23.7" +version = "0.23.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebbbdb961df0ad3f2652da8f3fdc4b36122f568f968f45ad3316f26c025c677b" +checksum = "eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e" dependencies = [ - "aws-lc-rs", "log", "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.102.2", + "rustls-webpki 0.102.8", "subtle", "zeroize", ] @@ -5202,11 +5109,10 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.102.2" +version = "0.102.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610" +checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" dependencies = [ - "aws-lc-rs", "ring", "rustls-pki-types", "untrusted", @@ -5823,6 +5729,7 @@ dependencies = [ "once_cell", "parking_lot 0.12.1", "prost", + "rustls 0.23.16", "tokio", "tonic", "tonic-build", @@ -5905,7 +5812,7 @@ dependencies = [ "postgres_ffi", "remote_storage", "reqwest 0.12.4", - "rustls 0.23.7", + "rustls 0.23.16", "rustls-native-certs 0.8.0", "serde", "serde_json", @@ -6338,7 +6245,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab" dependencies = [ "ring", - "rustls 0.23.7", + "rustls 0.23.16", "tokio", "tokio-postgres", "tokio-rustls 0.26.0", @@ -6372,7 +6279,7 @@ version = 
"0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" dependencies = [ - "rustls 0.23.7", + "rustls 0.23.16", "rustls-pki-types", "tokio", ] @@ -6781,7 +6688,7 @@ dependencies = [ "base64 0.22.1", "log", "once_cell", - "rustls 0.23.7", + "rustls 0.23.16", "rustls-pki-types", "url", "webpki-roots 0.26.1", @@ -6985,7 +6892,7 @@ name = "walproposer" version = "0.1.0" dependencies = [ "anyhow", - "bindgen 0.70.1", + "bindgen", "postgres_ffi", "utils", ] @@ -7160,18 +7067,6 @@ dependencies = [ "rustls-pki-types", ] -[[package]] -name = "which" -version = "4.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" -dependencies = [ - "either", - "home", - "once_cell", - "rustix", -] - [[package]] name = "whoami" version = "1.5.1" @@ -7431,7 +7326,7 @@ dependencies = [ "hyper-util", "indexmap 1.9.3", "indexmap 2.0.1", - "itertools 0.10.5", + "itertools 0.12.1", "lazy_static", "libc", "log", @@ -7452,8 +7347,7 @@ dependencies = [ "regex-automata 0.4.3", "regex-syntax 0.8.2", "reqwest 0.12.4", - "rustls 0.23.7", - "rustls-webpki 0.102.2", + "rustls 0.23.16", "scopeguard", "serde", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index 7f9a766ff954..e5f7719e7f3e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -143,7 +143,7 @@ reqwest-retry = "0.5" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" -rustls = "0.23" +rustls = { version = "0.23.16", default-features = false } rustls-pemfile = "2" scopeguard = "1.1" sysinfo = "0.29.2" @@ -174,7 +174,7 @@ tokio = { version = "1.17", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" tokio-postgres-rustls = "0.12.0" -tokio-rustls = "0.26" +tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]} tokio-stream 
= "0.1" tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 9d3031d6998b..3fcfbf4a0318 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -2,7 +2,7 @@ use once_cell::sync::Lazy; use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError}; use pq_proto::{BeMessage, RowDescriptor}; -use rustls::crypto::aws_lc_rs; +use rustls::crypto::ring; use std::io::Cursor; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; @@ -94,7 +94,7 @@ async fn simple_select_ssl() { let (client_sock, server_sock) = make_tcp_pair().await; let server_cfg = - rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() .expect("aws_lc_rs should support the default protocol versions") .with_no_client_auth() @@ -110,7 +110,7 @@ async fn simple_select_ssl() { }); let client_cfg = - rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() .expect("aws_lc_rs should support the default protocol versions") .with_root_certificates({ diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 025053d3cbf4..71783ee45256 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -15,7 +15,7 @@ use proxy::context::RequestMonitoring; use proxy::metrics::{Metrics, ThreadPoolMetrics}; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; use proxy::stream::{PqStream, Stream}; -use rustls::crypto::aws_lc_rs; +use rustls::crypto::ring; use rustls::pki_types::PrivateKeyDer; use tokio::io::{AsyncRead, AsyncWrite}; use 
tokio::net::TcpListener; @@ -105,14 +105,13 @@ async fn main() -> anyhow::Result<()> { let first_cert = cert_chain.first().context("missing certificate")?; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let tls_config = rustls::ServerConfig::builder_with_provider(Arc::new( - aws_lc_rs::default_provider(), - )) - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) - .context("aws_lc_rs should support TLS1.2 and TLS1.3")? - .with_no_client_auth() - .with_single_cert(cert_chain, key)? - .into(); + let tls_config = + rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("ring should support TLS1.2 and TLS1.3")? + .with_no_client_auth() + .with_single_cert(cert_chain, key)? + .into(); (tls_config, tls_server_end_point) } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index b97942ee5de8..65b6dd215bea 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -8,7 +8,7 @@ use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; use rustls::client::danger::ServerCertVerifier; -use rustls::crypto::aws_lc_rs; +use rustls::crypto::ring; use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; @@ -266,12 +266,12 @@ impl ConnCfg { } } +type RustlsStream = >::Stream; + pub(crate) struct PostgresConnection { /// Socket connected to a compute node. - pub(crate) stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream< - tokio::net::TcpStream, - tokio_postgres_rustls::RustlsStream, - >, + pub(crate) stream: + tokio_postgres::maybe_tls_stream::MaybeTlsStream, /// PostgreSQL connection parameters. pub(crate) params: std::collections::HashMap, /// Query cancellation token. 
@@ -298,9 +298,9 @@ impl ConnCfg { let client_config = if allow_self_signed_compute { // Allow all certificates for creating the connection let verifier = Arc::new(AcceptEverythingVerifier); - rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() - .expect("aws_lc_rs should support the default protocol versions") + .expect("ring should support the default protocol versions") .dangerous() .with_custom_certificate_verifier(verifier) } else { @@ -308,9 +308,9 @@ impl ConnCfg { .get_or_try_init(load_certs) .map_err(ConnectionError::TlsCertificateError)? .clone(); - rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() - .expect("aws_lc_rs should support the default protocol versions") + .expect("ring should support the default protocol versions") .with_root_certificates(root_store) }; let client_config = client_config.with_no_client_auth(); diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 5183f22fa3d0..2870e100b7ae 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -7,7 +7,7 @@ use anyhow::{bail, ensure, Context, Ok}; use clap::ValueEnum; use itertools::Itertools; use remote_storage::RemoteStorageConfig; -use rustls::crypto::aws_lc_rs::{self, sign}; +use rustls::crypto::ring::{self, sign}; use rustls::pki_types::{CertificateDer, PrivateKeyDer}; use sha2::{Digest, Sha256}; use tracing::{error, info}; @@ -127,9 +127,9 @@ pub fn configure_tls( // allow TLS 1.2 to be compatible with older client libraries let mut config = - rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_protocol_versions(&[&rustls::version::TLS13, 
&rustls::version::TLS12]) - .context("aws_lc_rs should support TLS1.2 and TLS1.3")? + .context("ring should support TLS1.2 and TLS1.3")? .with_no_client_auth() .with_cert_resolver(cert_resolver.clone()); diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index fe62fee20445..abb0599d0808 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -9,11 +9,12 @@ use async_trait::async_trait; use http::StatusCode; use retry::{retry_after, ShouldRetryWakeCompute}; use rstest::rstest; -use rustls::crypto::aws_lc_rs; +use rustls::crypto::ring; use rustls::pki_types; +use tokio::io::DuplexStream; use tokio_postgres::config::SslMode; use tokio_postgres::tls::{MakeTlsConnect, NoTls}; -use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; +use tokio_postgres_rustls::MakeRustlsConnect; use super::connect_compute::ConnectMechanism; use super::retry::CouldRetry; @@ -69,19 +70,12 @@ struct ClientConfig<'a> { hostname: &'a str, } +type TlsConnect = >::TlsConnect; + impl ClientConfig<'_> { - fn make_tls_connect( - self, - ) -> anyhow::Result< - impl tokio_postgres::tls::TlsConnect< - S, - Error = impl std::fmt::Debug + use, - Future = impl Send + use, - Stream = RustlsStream, - > + use, - > { + fn make_tls_connect(self) -> anyhow::Result> { let mut mk = MakeRustlsConnect::new(self.config); - let tls = MakeTlsConnect::::make_tls_connect(&mut mk, self.hostname)?; + let tls = MakeTlsConnect::::make_tls_connect(&mut mk, self.hostname)?; Ok(tls) } } @@ -95,9 +89,9 @@ fn generate_tls_config<'a>( let tls_config = { let config = - rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() - .context("aws_lc_rs should support the default protocol versions")? + .context("ring should support the default protocol versions")? 
.with_no_client_auth() .with_single_cert(vec![cert.clone()], key.clone_key())? .into(); @@ -116,9 +110,9 @@ fn generate_tls_config<'a>( let client_config = { let config = - rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() - .context("aws_lc_rs should support the default protocol versions")? + .context("ring should support the default protocol versions")? .with_root_certificates({ let mut store = rustls::RootCertStore::empty(); store.add(ca)?; diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml index 2d19472c362b..17d4aed63bba 100644 --- a/storage_broker/Cargo.toml +++ b/storage_broker/Cargo.toml @@ -28,6 +28,7 @@ tokio = { workspace = true, features = ["rt-multi-thread"] } tracing.workspace = true metrics.workspace = true utils.workspace = true +rustls.workspace = true workspace_hack.workspace = true diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index bc632a39f7bc..3ac40f6e142c 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -52,6 +52,12 @@ where // If schema starts with https, start encrypted connection; do plain text // otherwise. if let Some("https") = tonic_endpoint.uri().scheme_str() { + // if there's no default provider and both ring+aws-lc-rs are enabled + // this the tls settings on tonic will not work. + // erroring is ok. 
+ rustls::crypto::ring::default_provider() + .install_default() + .ok(); let tls = ClientTlsConfig::new(); tonic_endpoint = tonic_endpoint.tls_config(tls)?; } diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 6c312d003687..403b4590a825 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -6,7 +6,7 @@ use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use postgres_ffi::{XLogFileName, PG_TLI}; use remote_storage::GenericRemoteStorage; -use rustls::crypto::aws_lc_rs; +use rustls::crypto::ring; use serde::Serialize; use tokio_postgres::types::PgLsn; use tracing::{debug, error, info}; @@ -256,9 +256,9 @@ async fn load_timelines_from_db( // Use rustls (Neon requires TLS) let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); let client_config = - rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() - .context("aws_lc_rs should support the default protocol versions")? + .context("ring should support the default protocol versions")? 
.with_root_certificates(root_store) .with_no_client_auth(); let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 28c51b8ac120..8d83d9d9e28e 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -47,7 +47,7 @@ hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } -itertools = { version = "0.10" } +itertools = { version = "0.12" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -65,8 +65,7 @@ regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] } -rustls = { version = "0.23", features = ["ring"] } -rustls-webpki = { version = "0.102", default-features = false, features = ["aws_lc_rs", "ring", "std"] } +rustls = { version = "0.23", default-features = false, features = ["logging", "ring", "std", "tls12"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["alloc", "raw_value"] } @@ -80,7 +79,7 @@ tikv-jemalloc-sys = { version = "0.5" } time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", 
"rt-multi-thread", "signal", "test-util"] } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2", features = ["with-serde_json-1"] } -tokio-rustls = { version = "0.26", features = ["ring"] } +tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } tokio-stream = { version = "0.1", features = ["net"] } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } @@ -106,7 +105,7 @@ half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } -itertools = { version = "0.10" } +itertools = { version = "0.12" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } From 8ad1dbce7252d1ed91543ddda6cd4c7c8ade414d Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 4 Nov 2024 14:04:56 +0000 Subject: [PATCH 32/32] [proxy]: parse proxy protocol TLVs with aws/azure support (#9610) AWS/Azure Private Link shares extra information in the "TLV" values of the proxy protocol v2 header. This code doesn't act on it yet, but it parses it as appropriate. 
--- Cargo.lock | 2 + proxy/Cargo.toml | 2 + proxy/src/bin/pg_sni_router.rs | 6 +- proxy/src/console_redirect_proxy.rs | 6 +- proxy/src/context/mod.rs | 26 ++-- proxy/src/context/parquet.rs | 2 +- proxy/src/protocol2.rs | 186 ++++++++++++++++++++++++---- proxy/src/proxy/mod.rs | 10 +- proxy/src/serverless/mod.rs | 56 ++++++--- 9 files changed, 236 insertions(+), 60 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 44ef6d960c69..484769bd16fa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4341,6 +4341,8 @@ dependencies = [ "smallvec", "smol_str", "socket2", + "strum", + "strum_macros", "subtle", "thiserror", "tikv-jemalloc-ctl", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index e25d2fcbab03..2580b1cf8a4f 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -74,6 +74,8 @@ sha2 = { workspace = true, features = ["asm", "oid"] } smol_str.workspace = true smallvec.workspace = true socket2.workspace = true +strum.workspace = true +strum_macros.workspace = true subtle.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 71783ee45256..ef5b5e8509f1 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -13,6 +13,7 @@ use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; use proxy::metrics::{Metrics, ThreadPoolMetrics}; +use proxy::protocol2::ConnectionInfo; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; use proxy::stream::{PqStream, Stream}; use rustls::crypto::ring; @@ -178,7 +179,10 @@ async fn task_main( info!(%peer_addr, "serving"); let ctx = RequestMonitoring::new( session_id, - peer_addr.ip(), + ConnectionInfo { + addr: peer_addr, + extra: None, + }, proxy::metrics::Protocol::SniRouter, "sni", ); diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 81d1d70958d2..243ef078548d 100644 --- 
a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -11,7 +11,7 @@ use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestMonitoring; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; -use crate::protocol2::read_proxy_protocol; +use crate::protocol2::{read_proxy_protocol, ConnectionInfo}; use crate::proxy::connect_compute::{connect_to_compute, TcpMechanism}; use crate::proxy::handshake::{handshake, HandshakeData}; use crate::proxy::passthrough::ProxyPassthrough; @@ -65,8 +65,8 @@ pub async fn task_main( error!("proxy protocol header not supported"); return; } - Ok((socket, Some(addr))) => (socket, addr.ip()), - Ok((socket, None)) => (socket, peer_addr.ip()), + Ok((socket, Some(info))) => (socket, info), + Ok((socket, None)) => (socket, ConnectionInfo{ addr: peer_addr, extra: None }), }; match socket.inner.set_nodelay(true) { diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index ca3b808a1b41..2a6c9c596924 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -19,6 +19,7 @@ use crate::intern::{BranchIdInt, ProjectIdInt}; use crate::metrics::{ ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting, }; +use crate::protocol2::ConnectionInfo; use crate::types::{DbName, EndpointId, RoleName}; pub mod parquet; @@ -40,7 +41,7 @@ pub struct RequestMonitoring( ); struct RequestMonitoringInner { - pub(crate) peer_addr: IpAddr, + pub(crate) conn_info: ConnectionInfo, pub(crate) session_id: Uuid, pub(crate) protocol: Protocol, first_packet: chrono::DateTime, @@ -84,7 +85,7 @@ impl Clone for RequestMonitoring { fn clone(&self) -> Self { let inner = self.0.try_lock().expect("should not deadlock"); let new = RequestMonitoringInner { - peer_addr: inner.peer_addr, + conn_info: inner.conn_info.clone(), session_id: inner.session_id, protocol: inner.protocol, first_packet: inner.first_packet, @@ -117,7 +118,7 @@ impl Clone for 
RequestMonitoring { impl RequestMonitoring { pub fn new( session_id: Uuid, - peer_addr: IpAddr, + conn_info: ConnectionInfo, protocol: Protocol, region: &'static str, ) -> Self { @@ -125,13 +126,13 @@ impl RequestMonitoring { "connect_request", %protocol, ?session_id, - %peer_addr, + %conn_info, ep = tracing::field::Empty, role = tracing::field::Empty, ); let inner = RequestMonitoringInner { - peer_addr, + conn_info, session_id, protocol, first_packet: Utc::now(), @@ -162,7 +163,11 @@ impl RequestMonitoring { #[cfg(test)] pub(crate) fn test() -> Self { - RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test") + use std::net::SocketAddr; + let ip = IpAddr::from([127, 0, 0, 1]); + let addr = SocketAddr::new(ip, 5432); + let conn_info = ConnectionInfo { addr, extra: None }; + RequestMonitoring::new(Uuid::now_v7(), conn_info, Protocol::Tcp, "test") } pub(crate) fn console_application_name(&self) -> String { @@ -286,7 +291,12 @@ impl RequestMonitoring { } pub(crate) fn peer_addr(&self) -> IpAddr { - self.0.try_lock().expect("should not deadlock").peer_addr + self.0 + .try_lock() + .expect("should not deadlock") + .conn_info + .addr + .ip() } pub(crate) fn cold_start_info(&self) -> ColdStartInfo { @@ -362,7 +372,7 @@ impl RequestMonitoringInner { } fn has_private_peer_addr(&self) -> bool { - match self.peer_addr { + match self.conn_info.addr.ip() { IpAddr::V4(ip) => ip.is_private(), IpAddr::V6(_) => false, } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 3432ac5ff660..adbb74c8e526 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -121,7 +121,7 @@ impl From<&RequestMonitoringInner> for RequestData { fn from(value: &RequestMonitoringInner) -> Self { Self { session_id: value.session_id, - peer_addr: value.peer_addr.to_string(), + peer_addr: value.conn_info.addr.ip().to_string(), timestamp: value.first_packet.naive_utc(), username: value.user.as_deref().map(String::from), 
application_name: value.application.as_deref().map(String::from), diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index ef2391cdd805..d1084ca2ffd5 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -1,12 +1,15 @@ //! Proxy Protocol V2 implementation +//! Compatible with +use core::fmt; use std::io; -use std::net::SocketAddr; +use std::net::{Ipv4Addr, Ipv6Addr, SocketAddr}; use std::pin::Pin; use std::task::{Context, Poll}; -use bytes::BytesMut; +use bytes::{Buf, Bytes, BytesMut}; use pin_project_lite::pin_project; +use strum_macros::FromRepr; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; pin_project! { @@ -58,9 +61,35 @@ const HEADER: [u8; 12] = [ 0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A, ]; +#[derive(PartialEq, Eq, Clone, Debug)] +pub struct ConnectionInfo { + pub addr: SocketAddr, + pub extra: Option, +} + +impl fmt::Display for ConnectionInfo { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self.extra { + None => self.addr.ip().fmt(f), + Some(ConnectionInfoExtra::Aws { vpce_id }) => { + write!(f, "vpce_id[{vpce_id:?}]:addr[{}]", self.addr.ip()) + } + Some(ConnectionInfoExtra::Azure { link_id }) => { + write!(f, "link_id[{link_id}]:addr[{}]", self.addr.ip()) + } + } + } +} + +#[derive(PartialEq, Eq, Clone, Debug)] +pub enum ConnectionInfoExtra { + Aws { vpce_id: Bytes }, + Azure { link_id: u32 }, +} + pub(crate) async fn read_proxy_protocol( mut read: T, -) -> std::io::Result<(ChainRW, Option)> { +) -> std::io::Result<(ChainRW, Option)> { let mut buf = BytesMut::with_capacity(128); while buf.len() < 16 { let bytes_read = read.read_buf(&mut buf).await?; @@ -164,22 +193,107 @@ pub(crate) async fn read_proxy_protocol( // - destination layer 3 address in network byte order // - source layer 4 address if any, in network byte order (port) // - destination layer 4 address if any, in network byte order (port) - let addresses = buf.split_to(remaining_length as 
usize); - let socket = match address_length { + let mut header = buf.split_to(usize::from(remaining_length)); + let mut addr = header.split_to(usize::from(address_length)); + let socket = match addr.len() { 12 => { - let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap(); - let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap()); + let src_addr = Ipv4Addr::from_bits(addr.get_u32()); + let _dst_addr = Ipv4Addr::from_bits(addr.get_u32()); + let src_port = addr.get_u16(); + let _dst_port = addr.get_u16(); Some(SocketAddr::from((src_addr, src_port))) } 36 => { - let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap(); - let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap()); + let src_addr = Ipv6Addr::from_bits(addr.get_u128()); + let _dst_addr = Ipv6Addr::from_bits(addr.get_u128()); + let src_port = addr.get_u16(); + let _dst_port = addr.get_u16(); Some(SocketAddr::from((src_addr, src_port))) } _ => None, }; - Ok((ChainRW { inner: read, buf }, socket)) + let mut extra = None; + + while let Some(mut tlv) = read_tlv(&mut header) { + match Pp2Kind::from_repr(tlv.kind) { + Some(Pp2Kind::Aws) => { + if tlv.value.is_empty() { + tracing::warn!("invalid aws tlv: no subtype"); + } + let subtype = tlv.value.get_u8(); + match Pp2AwsType::from_repr(subtype) { + Some(Pp2AwsType::VpceId) => { + extra = Some(ConnectionInfoExtra::Aws { vpce_id: tlv.value }); + } + None => { + tracing::warn!("unknown aws tlv: subtype={subtype}"); + } + } + } + Some(Pp2Kind::Azure) => { + if tlv.value.is_empty() { + tracing::warn!("invalid azure tlv: no subtype"); + } + let subtype = tlv.value.get_u8(); + match Pp2AzureType::from_repr(subtype) { + Some(Pp2AzureType::PrivateEndpointLinkId) => { + if tlv.value.len() != 4 { + tracing::warn!("invalid azure link_id: {:?}", tlv.value); + } + extra = Some(ConnectionInfoExtra::Azure { + link_id: tlv.value.get_u32_le(), + }); + } + None => { + tracing::warn!("unknown azure tlv: subtype={subtype}"); + } + } + } + 
Some(kind) => { + tracing::debug!("unused tlv[{kind:?}]: {:?}", tlv.value); + } + None => { + tracing::debug!("unknown tlv: {tlv:?}"); + } + } + } + + let conn_info = socket.map(|addr| ConnectionInfo { addr, extra }); + + Ok((ChainRW { inner: read, buf }, conn_info)) +} + +#[derive(FromRepr, Debug, Copy, Clone)] +#[repr(u8)] +enum Pp2Kind { + // The following are defined by https://www.haproxy.org/download/3.1/doc/proxy-protocol.txt + // we don't use these but it would be interesting to know what's available + Alpn = 0x01, + Authority = 0x02, + Crc32C = 0x03, + Noop = 0x04, + UniqueId = 0x05, + Ssl = 0x20, + NetNs = 0x30, + + /// + Aws = 0xEA, + + /// + Azure = 0xEE, +} + +#[derive(FromRepr, Debug, Copy, Clone)] +#[repr(u8)] +enum Pp2AwsType { + VpceId = 0x01, +} + +#[derive(FromRepr, Debug, Copy, Clone)] +#[repr(u8)] +enum Pp2AzureType { + PrivateEndpointLinkId = 0x01, } impl AsyncRead for ChainRW { @@ -216,6 +330,25 @@ impl ChainRW { } } +#[derive(Debug)] +struct Tlv { + kind: u8, + value: Bytes, +} + +fn read_tlv(b: &mut BytesMut) -> Option { + if b.len() < 3 { + return None; + } + let kind = b.get_u8(); + let len = usize::from(b.get_u16()); + if b.len() < len { + return None; + } + let value = b.split_to(len).freeze(); + Some(Tlv { kind, value }) +} + #[cfg(test)] mod tests { use tokio::io::AsyncReadExt; @@ -242,7 +375,7 @@ mod tests { let extra_data = [0x55; 256]; - let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + let (mut read, info) = read_proxy_protocol(header.chain(extra_data.as_slice())) .await .unwrap(); @@ -250,7 +383,9 @@ mod tests { read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); - assert_eq!(addr, Some(([127, 0, 0, 1], 65535).into())); + + let info = info.unwrap(); + assert_eq!(info.addr, ([127, 0, 0, 1], 65535).into()); } #[tokio::test] @@ -273,7 +408,7 @@ mod tests { let extra_data = [0x55; 256]; - let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + let (mut 
read, info) = read_proxy_protocol(header.chain(extra_data.as_slice())) .await .unwrap(); @@ -281,9 +416,11 @@ mod tests { read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); + + let info = info.unwrap(); assert_eq!( - addr, - Some(([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into()) + info.addr, + ([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into() ); } @@ -291,30 +428,31 @@ mod tests { async fn test_invalid() { let data = [0x55; 256]; - let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap(); + let (mut read, info) = read_proxy_protocol(data.as_slice()).await.unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(addr, None); + assert_eq!(info, None); } #[tokio::test] async fn test_short() { let data = [0x55; 10]; - let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap(); + let (mut read, info) = read_proxy_protocol(data.as_slice()).await.unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(addr, None); + assert_eq!(info, None); } #[tokio::test] async fn test_large_tlv() { let tlv = vec![0x55; 32768]; - let len = (12 + tlv.len() as u16).to_be_bytes(); + let tlv_len = (tlv.len() as u16).to_be_bytes(); + let len = (12 + 3 + tlv.len() as u16).to_be_bytes(); let header = super::HEADER // Proxy command, Inet << 4 | Stream @@ -330,11 +468,13 @@ mod tests { // dst port .chain([1, 1].as_slice()) // TLV + .chain([255].as_slice()) + .chain(tlv_len.as_slice()) .chain(tlv.as_slice()); let extra_data = [0xaa; 256]; - let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + let (mut read, info) = read_proxy_protocol(header.chain(extra_data.as_slice())) .await .unwrap(); @@ -342,6 +482,8 @@ mod tests { read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); - assert_eq!(addr, Some(([55, 56, 57, 58], 65535).into())); 
+ + let info = info.unwrap(); + assert_eq!(info.addr, ([55, 56, 57, 58], 65535).into()); } } diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 2970d933934c..922646d88909 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -28,7 +28,7 @@ use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; use crate::context::RequestMonitoring; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; -use crate::protocol2::read_proxy_protocol; +use crate::protocol2::{read_proxy_protocol, ConnectionInfo}; use crate::proxy::handshake::{handshake, HandshakeData}; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::{PqStream, Stream}; @@ -87,7 +87,7 @@ pub async fn task_main( let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); connections.spawn(async move { - let (socket, peer_addr) = match read_proxy_protocol(socket).await { + let (socket, conn_info) = match read_proxy_protocol(socket).await { Err(e) => { warn!("per-client task finished with an error: {e:#}"); return; @@ -100,8 +100,8 @@ pub async fn task_main( warn!("proxy protocol header not supported"); return; } - Ok((socket, Some(addr))) => (socket, addr.ip()), - Ok((socket, None)) => (socket, peer_addr.ip()), + Ok((socket, Some(info))) => (socket, info), + Ok((socket, None)) => (socket, ConnectionInfo { addr: peer_addr, extra: None }), }; match socket.inner.set_nodelay(true) { @@ -114,7 +114,7 @@ pub async fn task_main( let ctx = RequestMonitoring::new( session_id, - peer_addr, + conn_info, crate::metrics::Protocol::Tcp, &config.region, ); diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index edbb0347d33c..4b60ddf60f7c 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -44,10 +44,10 @@ use tracing::{info, warn, Instrument}; use utils::http::error::ApiError; use crate::cancellation::CancellationHandlerMain; -use crate::config::ProxyConfig; +use crate::config::{ProxyConfig, 
ProxyProtocolV2}; use crate::context::RequestMonitoring; use crate::metrics::Metrics; -use crate::protocol2::{read_proxy_protocol, ChainRW}; +use crate::protocol2::{read_proxy_protocol, ChainRW, ConnectionInfo}; use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; @@ -180,7 +180,7 @@ pub async fn task_main( peer_addr, )) .await; - let Some((conn, peer_addr)) = startup_result else { + let Some((conn, conn_info)) = startup_result else { return; }; @@ -192,7 +192,7 @@ pub async fn task_main( endpoint_rate_limiter, conn_token, conn, - peer_addr, + conn_info, session_id, )) .await; @@ -240,7 +240,7 @@ async fn connection_startup( session_id: uuid::Uuid, conn: TcpStream, peer_addr: SocketAddr, -) -> Option<(AsyncRW, IpAddr)> { +) -> Option<(AsyncRW, ConnectionInfo)> { // handle PROXY protocol let (conn, peer) = match read_proxy_protocol(conn).await { Ok(c) => c, @@ -250,17 +250,32 @@ async fn connection_startup( } }; - let peer_addr = peer.unwrap_or(peer_addr).ip(); - let has_private_peer_addr = match peer_addr { + let conn_info = match peer { + None if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + tracing::warn!("missing required proxy protocol header"); + return None; + } + Some(_) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { + tracing::warn!("proxy protocol header not supported"); + return None; + } + Some(info) => info, + None => ConnectionInfo { + addr: peer_addr, + extra: None, + }, + }; + + let has_private_peer_addr = match conn_info.addr.ip() { IpAddr::V4(ip) => ip.is_private(), IpAddr::V6(_) => false, }; - info!(?session_id, %peer_addr, "accepted new TCP connection"); + info!(?session_id, %conn_info, "accepted new TCP connection"); // try upgrade to TLS, but with a timeout. 
let conn = match timeout(config.handshake_timeout, tls_acceptor.accept(conn)).await { Ok(Ok(conn)) => { - info!(?session_id, %peer_addr, "accepted new TLS connection"); + info!(?session_id, %conn_info, "accepted new TLS connection"); conn } // The handshake failed @@ -268,7 +283,7 @@ async fn connection_startup( if !has_private_peer_addr { Metrics::get().proxy.tls_handshake_failures.inc(); } - warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + warn!(?session_id, %conn_info, "failed to accept TLS connection: {e:?}"); return None; } // The handshake timed out @@ -276,12 +291,12 @@ async fn connection_startup( if !has_private_peer_addr { Metrics::get().proxy.tls_handshake_failures.inc(); } - warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + warn!(?session_id, %conn_info, "failed to accept TLS connection: {e:?}"); return None; } }; - Some((conn, peer_addr)) + Some((conn, conn_info)) } /// Handles HTTP connection @@ -297,7 +312,7 @@ async fn connection_handler( endpoint_rate_limiter: Arc, cancellation_token: CancellationToken, conn: AsyncRW, - peer_addr: IpAddr, + conn_info: ConnectionInfo, session_id: uuid::Uuid, ) { let session_id = AtomicTake::new(session_id); @@ -306,6 +321,7 @@ async fn connection_handler( let http_cancellation_token = CancellationToken::new(); let _cancel_connection = http_cancellation_token.clone().drop_guard(); + let conn_info2 = conn_info.clone(); let server = Builder::new(TokioExecutor::new()); let conn = server.serve_connection_with_upgrades( hyper_util::rt::TokioIo::new(conn), @@ -340,7 +356,7 @@ async fn connection_handler( connections.clone(), cancellation_handler.clone(), session_id, - peer_addr, + conn_info2.clone(), http_request_token, endpoint_rate_limiter.clone(), ) @@ -365,7 +381,7 @@ async fn connection_handler( // On cancellation, trigger the HTTP connection handler to shut down. 
let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await { Either::Left((_cancelled, mut conn)) => { - tracing::debug!(%peer_addr, "cancelling connection"); + tracing::debug!(%conn_info, "cancelling connection"); conn.as_mut().graceful_shutdown(); conn.await } @@ -373,8 +389,8 @@ async fn connection_handler( }; match res { - Ok(()) => tracing::info!(%peer_addr, "HTTP connection closed"), - Err(e) => tracing::warn!(%peer_addr, "HTTP connection error {e}"), + Ok(()) => tracing::info!(%conn_info, "HTTP connection closed"), + Err(e) => tracing::warn!(%conn_info, "HTTP connection error {e}"), } } @@ -386,7 +402,7 @@ async fn request_handler( ws_connections: TaskTracker, cancellation_handler: Arc, session_id: uuid::Uuid, - peer_addr: IpAddr, + conn_info: ConnectionInfo, // used to cancel in-flight HTTP requests. not used to cancel websockets http_cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, @@ -404,7 +420,7 @@ async fn request_handler( { let ctx = RequestMonitoring::new( session_id, - peer_addr, + conn_info, crate::metrics::Protocol::Ws, &config.region, ); @@ -439,7 +455,7 @@ async fn request_handler( } else if request.uri().path() == "/sql" && *request.method() == Method::POST { let ctx = RequestMonitoring::new( session_id, - peer_addr, + conn_info, crate::metrics::Protocol::Http, &config.region, );