// coordinode-lsm-tree 5.5.0

// SPDX-License-Identifier: Apache-2.0
// Copyright (c) 2024-present, fjall-rs
// Copyright (c) 2026-present, Structured World Foundation

mod blob_file_list;
mod diff;
pub mod edit;
pub mod edit_log;
// `framing` uses `std::io::{Read, Write}`. The whole `version`
// module (recovery, persist, super_version, this file's
// `Version::encode_into`) is also std-bound today and consumes
// the framing helpers unconditionally. Gating only `framing`
// behind `#[cfg(feature = "std")]` would NOT help the
// no-std-check job at all — the callers (recovery, persist,
// super_version) would then fail to resolve `framing` under
// `--no-default-features --features alloc`, producing a
// strictly-worse compile-error count than leaving it ungated
// (one missing-module error per call site × N call sites vs the
// existing std-only call sites failing to compile on their own
// merits). The `no_std-check` job's metric is "error count must
// not increase", and adding a feature gate here increases it.
//
// Migration plan: when the surrounding `version` submodules
// (`recovery`, `persist`, `super_version`) are themselves ported
// to `crate::io` traits + `crate::path` (tracked in the no-std
// epic #274 with PR #311 / #347 as the first prerequisite),
// `framing` gets migrated in the same pass so the whole
// directory transitions to no-std together. See
// `.github/workflows/coordinode-ci.yml` no-std-check job for the
// progress meter.
mod framing;
mod optimize;
mod persist;
pub mod recovery;
pub mod run;
mod super_version;

pub use blob_file_list::BlobFileList;
pub use persist::persist_version;
pub use run::Run;
pub use super_version::{SuperVersion, SuperVersions};

use crate::TreeType;
use crate::blob_tree::{FragmentationEntry, FragmentationMap};
use crate::checksum::ChecksumType;
use crate::coding::Encode;
use crate::compaction::state::hidden_set::HiddenSet;
use crate::version::recovery::Recovery;
use crate::{
    HashSet, KeyRange, Table, TableId,
    comparator::UserComparator,
    vlog::{BlobFile, BlobFileId},
};
use alloc::sync::Arc;
#[cfg(not(feature = "std"))]
use alloc::vec::Vec;
use core::ops::Deref;

use optimize::optimize_runs;
use run::Ranged;

/// Context threaded through [`Version`] transformation methods.
///
/// Bundles the user comparator required for maintaining correct table ordering
/// across level mutations. Passed to [`Version::with_new_l0_run`],
/// [`Version::with_merge`], [`Version::with_moved`],
/// [`Version::with_dropped`], and [`Version::with_tight_slice`].
pub struct TransformContext<'a> {
    /// Borrowed user comparator; the transformation methods hand it to
    /// `optimize_runs` whenever they rebuild a level's runs.
    comparator: &'a dyn UserComparator,
}

impl<'a> TransformContext<'a> {
    /// Creates a new context with the given user comparator.
    pub fn new(comparator: &'a dyn UserComparator) -> Self {
        Self { comparator }
    }

    /// Returns the user comparator.
    pub fn comparator(&self) -> &'a dyn UserComparator {
        self.comparator
    }
}

/// Number of (initially empty) levels that [`Version::new`] creates.
pub const DEFAULT_LEVEL_COUNT: u8 = 7;

/// Monotonically increasing ID of a version.
///
/// Every `with_*` transformation on [`Version`] produces a version whose ID
/// is the source version's ID plus one.
pub type VersionId = u64;

/// Exposes a table's key range through [`Ranged`], the bound that
/// [`GenericLevel`] places on its element type.
impl Ranged for Table {
    /// Returns the key range stored in the table's metadata.
    fn key_range(&self) -> &KeyRange {
        &self.metadata.key_range
    }
}

/// A level: an ordered list of runs over any [`Ranged`] element type.
pub struct GenericLevel<T: Ranged> {
    /// The level's runs, each held behind an `Arc`.
    runs: Vec<Arc<Run<T>>>,
}

/// Lets a level be used wherever a slice of its runs is expected.
impl<T: Ranged> core::ops::Deref for GenericLevel<T> {
    type Target = [Arc<Run<T>>];

    fn deref(&self) -> &Self::Target {
        self.runs.as_slice()
    }
}

impl<T: Ranged> GenericLevel<T> {
    pub fn new(runs: Vec<Arc<Run<T>>>) -> Self {
        Self { runs }
    }

    pub fn table_count(&self) -> usize {
        self.iter().map(|x| x.len()).sum()
    }

    pub fn run_count(&self) -> usize {
        self.runs.len()
    }

    pub fn is_disjoint(&self) -> bool {
        self.run_count() == 1
    }

    pub fn is_empty(&self) -> bool {
        self.runs.is_empty()
    }

    pub fn iter(&self) -> impl DoubleEndedIterator<Item = &Arc<Run<T>>> {
        self.runs.iter()
    }
}

/// A level of [`Table`] runs.
///
/// Wraps the [`GenericLevel`] in an `Arc`, so cloning a `Level` clones the
/// pointer rather than the runs.
#[derive(Clone)]
pub struct Level(Arc<GenericLevel<Table>>);

impl core::ops::Deref for Level {
    type Target = GenericLevel<Table>;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl Level {
    pub fn empty() -> Self {
        Self::from_runs(vec![])
    }

    pub fn from_runs(runs: Vec<Arc<Run<Table>>>) -> Self {
        Self(Arc::new(GenericLevel { runs }))
    }

    pub fn list_ids(&self) -> HashSet<TableId> {
        self.iter()
            .flat_map(|run| run.iter())
            .map(Table::id)
            .collect()
    }

    pub fn first_run(&self) -> Option<&Arc<Run<Table>>> {
        self.runs.first()
    }

    /// Returns the on-disk size of the level.
    pub fn size(&self) -> u64 {
        self.0
            .iter()
            .flat_map(|x| x.iter())
            .map(Table::file_size)
            .sum()
    }

    pub fn aggregate_key_range(&self) -> KeyRange {
        if self.run_count() == 1 {
            #[expect(
                clippy::expect_used,
                reason = "we check for run_count, so the first run must exist"
            )]
            self.runs
                .first()
                .expect("should exist")
                .aggregate_key_range()
        } else {
            let key_ranges = self
                .iter()
                .map(|x| Run::aggregate_key_range(x))
                .collect::<Vec<_>>();

            KeyRange::aggregate(key_ranges.iter())
        }
    }

    /// Like [`aggregate_key_range`], but uses a custom comparator for key ordering.
    ///
    /// Per-run aggregation via [`Run::aggregate_key_range`] is comparator-correct
    /// because runs are sorted in comparator order (ensured by `push_cmp`), so
    /// `first().min()` and `last().max()` yield the true extremes under the
    /// configured comparator. The cross-run aggregation then uses `aggregate_cmp`
    /// to find the global min/max.
    pub fn aggregate_key_range_cmp(&self, cmp: &dyn crate::comparator::UserComparator) -> KeyRange {
        if self.run_count() == 1 {
            #[expect(
                clippy::expect_used,
                reason = "we check for run_count, so the first run must exist"
            )]
            self.runs
                .first()
                .expect("should exist")
                .aggregate_key_range()
        } else {
            let key_ranges = self
                .iter()
                .map(|x| Run::aggregate_key_range(x))
                .collect::<Vec<_>>();

            KeyRange::aggregate_cmp(key_ranges.iter(), cmp)
        }
    }
}

/// The shared payload behind a [`Version`] handle.
pub struct VersionInner {
    /// The version's ID
    id: VersionId,

    /// The tree type; every `with_*` transformation copies it unchanged
    /// into the derived version
    tree_type: TreeType,

    /// The individual LSM-tree levels which consist of runs of tables
    levels: Vec<Level>,

    // NOTE: We purposefully use Arc<_> to avoid deep cloning the blob files again and again
    //
    // Changing the value log tends to happen way less often than other modifications to the
    // LSM-tree
    //
    /// Blob files for large values (value log)
    #[doc(hidden)]
    pub blob_files: Arc<BlobFileList>,

    /// Blob file fragmentation
    gc_stats: Arc<FragmentationMap>,
}

/// A version is an immutable, point-in-time view of a tree's structure
///
/// Any time a table is created or deleted, a new version is created.
///
/// Cloning a `Version` clones the `Arc` handle, not the underlying data.
#[derive(Clone)]
pub struct Version {
    /// Shared version payload; reachable through `Deref`.
    inner: Arc<VersionInner>,
}

impl core::ops::Deref for Version {
    type Target = VersionInner;

    fn deref(&self) -> &Self::Target {
        &self.inner
    }
}

// TODO: impl using generics so we can easily unit test Version transformation functions
impl Version {
    /// Returns the initial tree type recorded in this version.
    pub fn tree_type(&self) -> TreeType {
        self.inner.tree_type
    }

    /// Returns this version's ID.
    pub fn id(&self) -> VersionId {
        self.inner.id
    }

    pub fn gc_stats(&self) -> &FragmentationMap {
        &self.gc_stats
    }

    /// Returns the first level (L0).
    ///
    /// # Panics
    ///
    /// Panics if the version holds no levels at all.
    pub fn l0(&self) -> &Level {
        #[expect(clippy::expect_used)]
        self.inner.levels.first().expect("L0 should exist")
    }

    /// Returns `true` if at least one table of level `idx` is in `hidden_set`.
    ///
    /// A level index that does not exist is reported as not busy.
    #[must_use]
    pub fn level_is_busy(&self, idx: usize, hidden_set: &HiddenSet) -> bool {
        let Some(level) = self.level(idx) else {
            return false;
        };

        level
            .iter()
            .any(|run| run.iter().any(|table| hidden_set.is_hidden(table.id())))
    }

    /// Creates a new empty version.
    pub fn new(id: VersionId, tree_type: TreeType) -> Self {
        let levels = (0..DEFAULT_LEVEL_COUNT).map(|_| Level::empty()).collect();

        Self {
            inner: Arc::new(VersionInner {
                id,
                tree_type,
                levels,
                blob_files: Arc::default(),
                gc_stats: Arc::default(),
            }),
        }
    }

    pub(crate) fn from_recovery(
        recovery: Recovery,
        tables: &[Table],
        blob_files: &[BlobFile],
    ) -> crate::Result<Self> {
        let version_levels = recovery
            .table_ids
            .iter()
            .map(|level| {
                let level_runs = level
                    .iter()
                    .map(|run| {
                        let run_tables = run
                            .iter()
                            .map(|table| {
                                let opened = tables
                                    .iter()
                                    .find(|x| x.id() == table.id)
                                    .cloned()
                                    .ok_or(crate::Error::Unrecoverable)?;
                                // Rebuild the tight-space restricted view: the
                                // data below the bound was punched out, so reads
                                // must clamp to it (its index still references the
                                // punched prefix).
                                Ok(match recovery.restrictions.get(&table.id) {
                                    Some(bound) => opened.with_restriction(bound.clone()),
                                    None => opened,
                                })
                            })
                            .collect::<crate::Result<Vec<_>>>()?;

                        // Tables are in persisted order, which preserves the
                        // comparator-sorted order from when the run was written.
                        // No re-sort needed — the manifest faithfully round-trips
                        // the run's table sequence.
                        Ok(Arc::new(
                            #[expect(
                                clippy::expect_used,
                                reason = "empty runs should not exist, so there should not be any empty persisted runs"
                            )]
                            Run::new(run_tables).expect("persisted runs should not be empty"),
                        ))
                    })
                    .collect::<crate::Result<Vec<_>>>()?;

                Ok(Level::from_runs(level_runs))
            })
            .collect::<crate::Result<Vec<_>>>()?;

        Ok(Self::from_levels(
            recovery.curr_version_id,
            recovery.tree_type,
            version_levels,
            BlobFileList::new(blob_files.iter().cloned().map(|bf| (bf.id(), bf)).collect()),
            recovery.gc_stats,
        ))
    }

    /// Creates a new pre-populated version.
    ///
    /// The blob file list and the fragmentation map are each moved into a
    /// fresh `Arc`.
    pub fn from_levels(
        id: VersionId,
        tree_type: TreeType,
        levels: Vec<Level>,
        blob_files: BlobFileList,
        gc_stats: FragmentationMap,
    ) -> Self {
        let payload = VersionInner {
            id,
            tree_type,
            levels,
            blob_files: Arc::new(blob_files),
            gc_stats: Arc::new(gc_stats),
        };

        Self {
            inner: Arc::new(payload),
        }
    }

    /// Returns how many levels this version has.
    pub fn level_count(&self) -> usize {
        self.inner.levels.len()
    }

    /// Returns an iterator over all levels, starting with L0.
    pub fn iter_levels(&self) -> impl Iterator<Item = &Level> {
        self.inner.levels.iter()
    }

    /// Returns the number of tables, summed over all levels.
    pub fn table_count(&self) -> usize {
        self.iter_levels()
            .fold(0, |total, level| total + level.table_count())
    }

    /// Returns the number of blob files referenced by this version.
    pub fn blob_file_count(&self) -> usize {
        self.inner.blob_files.len()
    }

    /// Returns an iterator over all tables, level by level and run by run.
    pub fn iter_tables(&self) -> impl Iterator<Item = &Table> {
        self.inner
            .levels
            .iter()
            .flat_map(|level| level.iter().flat_map(|run| run.iter()))
    }

    /// Looks up a table by ID across all levels; `None` if no table matches.
    pub(crate) fn get_table(&self, id: TableId) -> Option<&Table> {
        let mut tables = self.iter_tables();
        tables.find(|table| id == table.metadata.id)
    }

    /// Gets the n-th level, or `None` if `n` is out of range.
    pub fn level(&self, n: usize) -> Option<&Level> {
        self.inner.levels.get(n)
    }

    /// Creates a new version with the additional run added to the "top" of L0.
    pub fn with_new_l0_run(
        &self,
        run: &[Table],
        blob_files: Option<&[BlobFile]>,
        diff: Option<FragmentationMap>,
        ctx: &TransformContext<'_>,
    ) -> Self {
        let comparator = ctx.comparator;
        let id = self.id + 1;

        let mut levels = vec![];

        // L0
        levels.push({
            // Copy-on-write the first level with new run at top

            #[expect(clippy::expect_used, reason = "L0 always exists")]
            let l0 = self.levels.first().expect("L0 should always exist");

            let prev_runs = l0
                .runs
                .iter()
                .map(|run| {
                    let run: Run<_> = run.deref().clone();
                    run
                })
                .collect::<Vec<_>>();

            let mut runs = Vec::with_capacity(prev_runs.len() + run.len());

            // Start each freshly flushed table as its own run. `optimize_runs`
            // will fuse them back together when their key ranges stay truly
            // disjoint, but RT-bearing flush tables may intentionally widen
            // their persisted key_range and must not be forced into a single
            // run where `Run::get_for_key` assumes non-overlap.
            runs.extend(run.iter().cloned().map(|table| {
                let Some(run) = Run::new(vec![table]) else {
                    unreachable!("single-table run should never be empty");
                };

                run
            }));

            runs.extend(prev_runs);

            let runs = optimize_runs(runs, comparator);

            Level::from_runs(runs.into_iter().map(Arc::new).collect())
        });

        // L1+
        levels.extend(self.levels.iter().skip(1).cloned());

        // Value log
        let value_log = if let Some(blob_files) = blob_files {
            let mut copy = self.blob_files.deref().clone();
            copy.extend(blob_files.iter().cloned().map(|bf| (bf.id(), bf)));
            copy.into()
        } else {
            self.blob_files.clone()
        };

        let gc_stats = if let Some(diff) = diff {
            let mut copy = self.gc_stats.deref().clone();
            diff.merge_into(&mut copy);
            copy.prune(&value_log);
            Arc::new(copy)
        } else {
            self.gc_stats.clone()
        };

        Self {
            inner: Arc::new(VersionInner {
                id,
                tree_type: self.tree_type,
                levels,
                blob_files: value_log,
                gc_stats,
            }),
        }
    }

    /// Returns a new version with a list of tables removed.
    ///
    /// The table files are not immediately deleted, this is handled by the version system's free list.
    ///
    /// The blob references of every removed table are added to the
    /// fragmentation statistics. Blob files that are dead under the updated
    /// statistics are removed from the value log and appended to
    /// `dropped_blob_files`.
    ///
    /// # Errors
    ///
    /// Propagates any error from listing a removed table's blob file references.
    pub fn with_dropped(
        &self,
        ids: &[TableId],
        dropped_blob_files: &mut Vec<BlobFile>,
        ctx: &TransformContext<'_>,
    ) -> crate::Result<Self> {
        let comparator = ctx.comparator;
        let id = self.id + 1;

        let mut levels = vec![];

        let mut dropped_tables: Vec<Table> = vec![];

        for level in &self.levels {
            let runs = level
                .runs
                .iter()
                .map(|run| {
                    // TODO: don't clone Arc inner if we don't need to modify
                    let mut run: Run<_> = run.deref().clone();

                    let removed_tables = run
                        .inner_mut()
                        .extract_if(.., |x| ids.contains(&x.metadata.id));

                    dropped_tables.extend(removed_tables);

                    run
                })
                // A run that lost all of its tables disappears from the level.
                .filter(|x| !x.is_empty())
                .collect::<Vec<_>>();

            let runs = optimize_runs(runs, comparator);

            levels.push(Level::from_runs(runs.into_iter().map(Arc::new).collect()));
        }

        let gc_stats = if dropped_tables.is_empty() {
            self.gc_stats.clone()
        } else {
            let mut copy = self.gc_stats.deref().clone();

            for table in &dropped_tables {
                let linked_blob_files = table.list_blob_file_references()?.unwrap_or_default();

                for blob_file in linked_blob_files {
                    copy.entry(blob_file.blob_file_id)
                        .and_modify(|counter| {
                            // Keep all three counters in step with what the
                            // insert arm below seeds for a fresh entry.
                            counter.bytes += blob_file.bytes;
                            counter.len += blob_file.len;
                            counter.on_disk_bytes += blob_file.on_disk_bytes;
                        })
                        .or_insert_with(|| {
                            FragmentationEntry::new(
                                blob_file.len,
                                blob_file.bytes,
                                blob_file.on_disk_bytes,
                            )
                        });
                }
            }

            Arc::new(copy)
        };

        // Must run after `gc_stats` is final: `prune_dead` decides based on it.
        let value_log = if dropped_tables.is_empty() {
            self.blob_files.clone()
        } else {
            let mut copy = self.blob_files.deref().clone();
            dropped_blob_files.extend(copy.prune_dead(&gc_stats));
            Arc::new(copy)
        };

        Ok(Self {
            inner: Arc::new(VersionInner {
                id,
                tree_type: self.tree_type,
                levels,
                blob_files: value_log,
                gc_stats,
            }),
        })
    }

    /// Returns a new version in which the tables `old_ids` are removed from
    /// every level and `new_tables` are installed as one run in `dest_level`.
    ///
    /// `new_blob_files` are added to the value log and `blob_files_to_drop`
    /// removed from it. `diff`, when given, is merged into the fragmentation
    /// statistics, which are pruned against the resulting value log.
    #[expect(
        clippy::too_many_arguments,
        reason = "merge requires blob/GC params alongside context; further bundling planned"
    )]
    pub fn with_merge(
        &self,
        old_ids: &[TableId],
        new_tables: &[Table],
        dest_level: usize,
        diff: Option<FragmentationMap>,
        new_blob_files: Vec<BlobFile>,
        blob_files_to_drop: &HashSet<BlobFileId>,
        ctx: &TransformContext<'_>,
    ) -> Self {
        let comparator = ctx.comparator;

        let levels = self
            .levels
            .iter()
            .enumerate()
            .map(|(level_idx, level)| {
                // TODO: don't clone Arc inner if we don't need to modify
                let mut runs = level
                    .runs
                    .iter()
                    .filter_map(|run| {
                        let mut survivor: Run<_> = run.deref().clone();
                        survivor.retain(|table| !old_ids.contains(&table.metadata.id));
                        (!survivor.is_empty()).then_some(survivor)
                    })
                    .collect::<Vec<_>>();

                if level_idx == dest_level
                    && let Some(merged) = Run::new(new_tables.to_vec())
                {
                    // NOTE: dest_level == 0 in with_merge only occurs for intra-L0
                    // compaction (memtable flushes use with_new_l0_run, not with_merge).
                    // The merged (older) run goes to the back so that any concurrently
                    // flushed (newer) runs remain at the front and are searched first
                    // during point reads. Every other level gets it at the front.
                    let position = if dest_level == 0 { runs.len() } else { 0 };
                    runs.insert(position, merged);
                }

                let runs = optimize_runs(runs, comparator);

                Level::from_runs(runs.into_iter().map(Arc::new).collect())
            })
            .collect::<Vec<_>>();

        let has_diff = diff.is_some();
        let drops_blob_files = !blob_files_to_drop.is_empty();

        let value_log = if has_diff || !new_blob_files.is_empty() || drops_blob_files {
            let mut list = self.blob_files.deref().clone();

            for added in new_blob_files {
                list.insert(added.id(), added);
            }

            for dead_id in blob_files_to_drop.iter().copied() {
                list.remove(dead_id);
            }

            Arc::new(list)
        } else {
            Arc::clone(&self.blob_files)
        };

        let gc_stats = if has_diff || drops_blob_files {
            let mut stats = self.gc_stats.deref().clone();

            if let Some(delta) = diff {
                delta.merge_into(&mut stats);
            }

            stats.prune(&value_log);

            Arc::new(stats)
        } else {
            Arc::clone(&self.gc_stats)
        };

        let payload = VersionInner {
            id: self.id + 1,
            tree_type: self.tree_type,
            levels,
            blob_files: value_log,
            gc_stats,
        };

        Self {
            inner: Arc::new(payload),
        }
    }

    pub fn with_moved(
        &self,
        ids: &[TableId],
        dest_level: usize,
        ctx: &TransformContext<'_>,
    ) -> Self {
        let comparator = ctx.comparator;
        let id = self.id + 1;

        let affected_tables = self
            .iter_tables()
            .filter(|x| ids.contains(&x.id()))
            .cloned()
            .collect::<Vec<_>>();

        assert_eq!(affected_tables.len(), ids.len(), "invalid table IDs");

        let mut levels = vec![];

        for (level_idx, level) in self.levels.iter().enumerate() {
            let mut runs = level
                .runs
                .iter()
                .map(|run| {
                    // TODO: don't clone Arc inner if we don't need to modify
                    let mut run: Run<_> = run.deref().clone();
                    run.retain(|x| !ids.contains(&x.metadata.id));
                    run
                })
                .filter(|x| !x.is_empty())
                .collect::<Vec<_>>();

            if level_idx == dest_level
                && let Some(run) = Run::new(affected_tables.clone())
            {
                runs.insert(0, run);
            }

            let runs = optimize_runs(runs, comparator);

            levels.push(Level::from_runs(runs.into_iter().map(Arc::new).collect()));
        }

        Self {
            inner: Arc::new(VersionInner {
                id,
                tree_type: self.tree_type,
                levels,
                blob_files: self.blob_files.clone(),
                gc_stats: self.gc_stats.clone(),
            }),
        }
    }

    /// Tight-space slice install for one or more inputs: replaces each
    /// `(id, restricted view)` in `restricted` with its clamped view, drops the
    /// fully-consumed inputs in `removed_ids`, and adds the slice `outputs` as a
    /// new run in `dest_level`. The restriction rides on the [`Table`] wrapper,
    /// so `diff` / `encode_into` persist it; a replaced / removed prior view is
    /// released by the version swap and (for a restricted view) punches its
    /// consumed prefix once its readers drain.
    ///
    /// For KV-separated trees the slice's `gc_diff` (newly dead blob entries)
    /// and any `new_blob_files` it produced are folded in, so the running GC
    /// stats stay accurate; globally-dead blob files are not dropped here (an
    /// unprocessed slice may still reference them) — that happens at the final
    /// removal.
    #[expect(
        clippy::too_many_arguments,
        reason = "a slice install carries the SST swaps/removes/outputs plus the \
                  KV-separation blob delta (new files + GC diff); bundling would \
                  just move the argument list"
    )]
    pub fn with_tight_slice(
        &self,
        restricted: &[(TableId, Table)],
        removed_ids: &[TableId],
        outputs: &[Table],
        new_blob_files: Vec<BlobFile>,
        gc_diff: Option<FragmentationMap>,
        dest_level: usize,
        ctx: &TransformContext<'_>,
    ) -> Self {
        let comparator = ctx.comparator;

        let levels = self
            .levels
            .iter()
            .enumerate()
            .map(|(level_idx, level)| {
                let mut runs = level
                    .runs
                    .iter()
                    .filter_map(|run| {
                        let mut rebuilt: Run<_> = run.deref().clone();

                        // Drop fully-consumed inputs.
                        rebuilt.retain(|table| !removed_ids.contains(&table.id()));

                        // Swap each restricted input for its clamped view (same id).
                        for slot in rebuilt.inner_mut().iter_mut() {
                            let slot_id = slot.id();
                            let clamped = restricted
                                .iter()
                                .find_map(|(rid, view)| (*rid == slot_id).then_some(view));

                            if let Some(view) = clamped {
                                *slot = view.clone();
                            }
                        }

                        (!rebuilt.is_empty()).then_some(rebuilt)
                    })
                    .collect::<Vec<_>>();

                if level_idx == dest_level
                    && let Some(slice_run) = Run::new(outputs.to_vec())
                {
                    // L0 receives the run at the back, every other level at the front.
                    let position = if dest_level == 0 { runs.len() } else { 0 };
                    runs.insert(position, slice_run);
                }

                let runs = optimize_runs(runs, comparator);

                Level::from_runs(runs.into_iter().map(Arc::new).collect())
            })
            .collect::<Vec<_>>();

        // KV-separation blob delta: add any newly written blob files and fold in
        // this slice's GC diff. Dead blob files are NOT pruned here — a later
        // slice may still reference them; the final removal does the drop.
        let value_log = if gc_diff.is_some() || !new_blob_files.is_empty() {
            let mut list = self.blob_files.deref().clone();
            for added in new_blob_files {
                list.insert(added.id(), added);
            }
            Arc::new(list)
        } else {
            Arc::clone(&self.blob_files)
        };

        let gc_stats = match gc_diff {
            Some(delta) => {
                let mut stats = self.gc_stats.deref().clone();
                delta.merge_into(&mut stats);
                Arc::new(stats)
            }
            None => Arc::clone(&self.gc_stats),
        };

        let payload = VersionInner {
            id: self.id + 1,
            tree_type: self.tree_type,
            levels,
            blob_files: value_log,
            gc_stats,
        };

        Self {
            inner: Arc::new(payload),
        }
    }
}

impl Version {
    pub(crate) fn encode_into(
        &self,
        writer: &mut crate::manifest_blocks::writer::ManifestArchiveWriter,
        comparator_name: &str,
    ) -> Result<(), crate::Error> {
        use crate::FormatVersion;
        #[cfg(not(feature = "std"))]
        use crate::io::Write;
        use crate::io::{LittleEndian, WriteBytesExt};
        #[cfg(feature = "std")]
        use std::io::Write;

        //
        // Manifest
        //

        writer.start("format_version")?;
        // V5 is currently pre-release (no published binary writes it yet),
        // so the manifest layout under V5 may still be amended in-place.
        // Once V5 ships, ANY on-disk byte change to the manifest under
        // V5 must bump FormatVersion to V6. Policy tracked in #351.
        writer.write_u8(FormatVersion::V5.into())?;

        writer.start("crate_version")?;
        writer.write_all(env!("CARGO_PKG_VERSION").as_bytes())?;

        writer.start("tree_type")?;
        writer.write_u8(self.tree_type.into())?;

        writer.start("level_count")?;
        #[expect(
            clippy::cast_possible_truncation,
            reason = "level count is bounded by 255"
        )]
        writer.write_u8(self.level_count() as u8)?;

        writer.start("filter_hash_type")?;
        writer.write_u8(u8::from(ChecksumType::Xxh3))?;

        writer.start("comparator_name")?;
        writer.write_all(comparator_name.as_bytes())?;

        //
        // Levels
        //

        writer.start("tables")?;

        // Shared scratch buffer for per-record framing payloads.
        // Reused across every table + blob record in this version,
        // so the framing helper grows the buffer once and reuses it
        // — no per-record heap allocation. The records are all
        // <= 33 bytes today; pre-sized to cover the larger of the
        // two so the first iteration doesn't trigger a realloc.
        let mut framing_scratch: Vec<u8> = Vec::with_capacity(64);

        // Per-record framing details live in src/version/framing.rs.
        // Top-level shape inside the `tables` section after framing:
        //   level_count: u8
        //   for each level:
        //     run_count: u8
        //     for each run:
        //       table_count: u32 LE
        //       for each table:
        //         FRAMED(table_record_payload)   // 12-byte header + payload
        //
        // The level / run / table_count counters stay unframed
        // because they ARE the section's own structural shape — the
        // pre-framing readers used them to walk the section and the
        // framing-aware readers continue to use them the same way.
        // Only the per-table record bytes (the 33-byte
        // id+checksum_type+checksum+global_seqno payload) become
        // framed so PointInTimeRecovery / SkipAnyCorruptedRecords
        // have exact byte boundaries to skip on.

        // Level count
        #[expect(
            clippy::cast_possible_truncation,
            reason = "there are always less than 256 levels"
        )]
        writer.write_u8(self.level_count() as u8)?;

        for level in self.iter_levels() {
            // Run count
            #[expect(
                clippy::cast_possible_truncation,
                reason = "there are always less than 256 runs"
            )]
            writer.write_u8(level.len() as u8)?;

            for run in level.iter() {
                // Table count
                #[expect(
                    clippy::cast_possible_truncation,
                    reason = "there are always less than 4 billion tables in a run"
                )]
                writer.write_u32::<LittleEndian>(run.len() as u32)?;

                // Tables — each one framed.
                for table in run.iter() {
                    framing::write_framed_record(writer, &mut framing_scratch, |payload| {
                        payload.write_u64::<LittleEndian>(table.id())?;
                        payload.write_u8(0)?; // Checksum type, 0 = XXH3
                        payload.write_u128::<LittleEndian>(table.checksum().into_u128())?;
                        payload.write_u64::<LittleEndian>(table.global_seqno())?;
                        Ok(())
                    })?;
                }
            }
        }

        writer.start("blob_files")?;

        // Blob file count
        #[expect(
            clippy::cast_possible_truncation,
            reason = "there are always less than 4 billion blob files"
        )]
        writer.write_u32::<LittleEndian>(self.blob_files.len() as u32)?;

        for file in self.blob_files.iter() {
            // Per-blob record framed, same rationale as tables
            // above; same scratch buffer keeps allocations at zero
            // across the section boundary.
            framing::write_framed_record(writer, &mut framing_scratch, |payload| {
                payload.write_u64::<LittleEndian>(file.id())?;
                payload.write_u8(0)?; // Checksum type, 0 = XXH3
                payload.write_u128::<LittleEndian>(file.0.checksum.into_u128())?;
                Ok(())
            })?;
        }

        writer.start("blob_gc_stats")?;

        self.gc_stats.encode_into(writer)?;

        // Tight-space restrictions: per-table key-range lower bounds for tables
        // whose prefix has been punched out and superseded by a merged output.
        // Empty (count 0) on versions that never ran tight-space reclaim, so the
        // section is one zero count in the common case. Each entry is the table
        // id then a length-prefixed key (variable length, hence its own section
        // rather than the fixed-length per-table `tables` record).
        writer.start("restrictions")?;
        let restricted: Vec<(TableId, crate::UserKey)> = self
            .iter_tables()
            .filter_map(|t| t.restrict_lower_bound().map(|b| (t.id(), b.clone())))
            .collect();
        writer.write_u32::<LittleEndian>(
            u32::try_from(restricted.len()).map_err(|_| crate::Error::Unrecoverable)?,
        )?;
        for (id, key) in &restricted {
            writer.write_u64::<LittleEndian>(*id)?;
            writer.write_u32::<LittleEndian>(
                u32::try_from(key.len()).map_err(|_| crate::Error::Unrecoverable)?,
            )?;
            writer.write_all(key)?;
        }

        Ok(())
    }
}