Skip to main content

obj_core/btree/
mod.rs

1//! B+tree (L3) — copy-on-write B+tree over the pager.
2//!
3//! The B+tree is the data structure both the primary store (M5) and
4//! every secondary index (M7) will compose over. M4 introduces the
5//! data structure itself; M5+ wire it into typed catalog and index
6//! layers.
7//!
8//! See `docs/format.md` § B+tree for the authoritative on-disk
9//! layout this module is the reference implementation of.
10//!
11//! # Power-of-ten posture
12//!
13//! - **Rule 1.** Every tree traversal uses an explicit `heapless::Vec
14//!   <PageId, MAX_BTREE_DEPTH>` stack; no recursive descent. Exceeding
15//!   the depth bound returns `Error::BTreeDepthExceeded` rather than
16//!   exhausting the call stack.
17//! - **Rule 2.** Per-node loops iterate over `key_count`, which is
18//!   bounded by the slot directory capacity (a function of
19//!   `PAGE_SIZE` known at compile time). Range scans carry an
20//!   explicit `MAX_RANGE_NODES` budget.
21//! - **Rule 5.** Node invariants (sorted keys, level monotonicity,
22//!   slot-count bounds) are encoded as `debug_assert!`s in
23//!   `validate_node`; release builds surface a tripped invariant as
24//!   `Error::BTreeInvariantViolated`.
25//! - **Rule 7.** No `unwrap` / `expect` in the production code path.
26//!   Decode errors surface as `Error::Corruption`; semantic errors
27//!   surface as one of the `BTree*` variants on `Error`.
28
29#![forbid(unsafe_code)]
30
31pub mod delete;
32pub mod insert;
33pub mod node;
34pub mod range;
35
36pub use crate::btree::range::{RangeIter, SnapshotRangeIter};
37
38use crate::error::{Error, Result};
39use crate::pager::page::PageId;
40use crate::pager::Pager;
41use crate::platform::{FileBackend, FileHandle};
42
43use heapless::Vec as HeaplessVec;
44
45/// Maximum B+tree depth.
46///
47/// At fanout ≥ 4 a 32-level tree holds 2^64 entries, which exceeds
48/// the page-id space. The bound exists so [`Rule 1`](crate) tree
49/// traversals always terminate; a tree that somehow grew past 32
50/// levels surfaces as [`Error::BTreeDepthExceeded`] rather than a
51/// stack overflow.
52///
53/// `crate` link target above is documented as `power-of-ten.md`
54/// Rule 1: no unbounded recursion.
55pub const MAX_BTREE_DEPTH: usize = 32;
56
57/// Maximum number of B+tree nodes visited in a single range scan.
58///
59/// Power-of-ten Rule 2: every loop has an explicit upper bound.
60/// 1 000 000 nodes × ~32 slots/node ≈ 32 M entries — comfortably
61/// above the design.md "collection scan" target. The bound is
62/// configurable in M5+ when the query planner needs higher ceilings.
63pub const MAX_RANGE_NODES: usize = 1_000_000;
64
65/// Borrowed view of the in-memory representation of a B+tree node.
66pub use crate::btree::node::{
67    decode_node, encode_node, max_inline_value, max_key_len, validate_node, ChildEntry,
68    DecodedNode, InternalEntry, LeafEntry, NodeKind, NODE_HEADER_SIZE,
69};
70
71/// A B+tree handle.
72///
73/// Construct via [`BTree::empty`] (fresh tree allocating a new root
74/// leaf) or [`BTree::open`] (attach to an existing root page-id).
75/// The handle does not own the pager; the caller passes a `&mut`
76/// reference into mutating methods.
77///
78/// Generic over `F: FileBackend` so the same code works against the
79/// production [`FileHandle`] and the fault-injection harness. M4 only
80/// uses the type parameter as a marker — it appears in the
81/// `Pager<F>` signature of every method.
82///
83/// Single-writer: every mutating method takes `&mut self`. The pager
84/// already enforces single-writer at the file level (M3); the B+tree
85/// inherits the same property.
86#[derive(Debug)]
87pub struct BTree<F: FileBackend = FileHandle> {
88    root: PageId,
89    _phantom: std::marker::PhantomData<fn() -> F>,
90}
91
92impl<F: FileBackend> BTree<F> {
93    /// Attach a B+tree handle to an existing root page-id.
94    ///
95    /// The root page is NOT read here — `BTree::open` is a pure
96    /// constructor that records the root id. The first mutating or
97    /// reading call will fault the root in via `Pager::read_page`.
98    ///
99    /// # Errors
100    ///
101    /// Returns [`Error::InvalidArgument`] if `root` is zero. (Zero
102    /// is the sentinel "no tree"; the caller should use
103    /// [`BTree::empty`] to bootstrap.)
104    pub fn open(_pager: &Pager<F>, root: PageId) -> Result<Self> {
105        Ok(Self {
106            root,
107            _phantom: std::marker::PhantomData,
108        })
109    }
110
111    /// Allocate a fresh empty B+tree.
112    ///
113    /// Allocates one new page through the pager, encodes an empty
114    /// leaf node into it, and writes it through the WAL transaction
115    /// buffer. The caller is responsible for calling
116    /// [`Pager::commit`] before relying on the tree being durable.
117    ///
118    /// # Errors
119    ///
120    /// Propagates pager errors from [`Pager::alloc_page`] and
121    /// [`Pager::write_page`].
122    pub fn empty(pager: &mut Pager<F>) -> Result<Self> {
123        let root = pager.alloc_page()?;
124        let mut page = crate::pager::page::Page::zeroed();
125        encode_node(
126            &DecodedNode {
127                kind: NodeKind::Leaf,
128                level: 0,
129                next_sibling: 0,
130                children: Vec::new(),
131                leaves: Vec::new(),
132                internals: Vec::new(),
133            },
134            &mut page,
135        )?;
136        pager.write_page(root, &page)?;
137        Ok(Self {
138            root,
139            _phantom: std::marker::PhantomData,
140        })
141    }
142
143    /// The current root page-id of this tree.
144    ///
145    /// Mutating methods update this in place after a successful
146    /// write. The caller can persist this id externally so that a
147    /// subsequent [`BTree::open`] reattaches to the same tree.
148    #[must_use]
149    pub fn root(&self) -> PageId {
150        self.root
151    }
152
153    // `set_root` and related insert/delete helpers land in issues
154    // #26 / #27. Per the project policy we add them in the commit
155    // that uses them rather than carrying dead code here.
156
157    /// Look up `key` in the tree. Returns `None` if absent.
158    ///
159    /// The traversal walks root → leaf using an explicit stack of
160    /// page-ids (no recursion, no heap allocation — the stack is a
161    /// `heapless::Vec`).
162    ///
163    /// # Errors
164    ///
165    /// - [`Error::Corruption`] if any page along the path fails to
166    ///   decode.
167    /// - [`Error::BTreeDepthExceeded`] if the depth bound trips
168    ///   (would require a tree height ≥ 32).
169    /// - [`Error::Io`] / [`Error::InvalidArgument`] propagated from
170    ///   the pager on a cache-miss read.
171    pub fn get(&self, pager: &mut Pager<F>, key: &[u8]) -> Result<Option<Vec<u8>>> {
172        let leaf_id = descend_to_leaf(pager, self.root, key)?;
173        let page_ref = pager.read_page(leaf_id)?;
174        // Find-only decode: copies out ONLY the matched value (an owned
175        // `Vec<u8>`), not all 2N entries. Retains every per-slot
176        // bounds/varint/order check on the slots it reads (see
177        // `decode_node_find`). Mutations and the integrity checker keep
178        // the full `decode_node` + `validate_node_release`.
179        node::decode_node_find(page_ref.as_bytes(), key)
180    }
181
182    /// Snapshot-consistent variant of [`BTree::get`] — descends the
183    /// tree using [`crate::pager::ReaderSnapshot::read_page`] rather
184    /// than the live `Pager::read_page` (which consults the WAL
185    /// overlay — including a concurrent writer's post-snapshot
186    /// commits).
187    ///
188    /// Used by the M6 #53 fix: a reader's `Collection<T>::get` must
189    /// walk the primary B+tree consistently with the snapshot's
190    /// pinned LSN. Without this, a writer that COW-frees a page and
191    /// reallocates the same page-id can serve the reader a page
192    /// whose `state.view` content is no longer a valid B+tree node,
193    /// surfacing as `Error::Corruption { page_id: 0 }` from
194    /// [`crate::btree::node::decode_node`].
195    ///
196    /// `root` is the B+tree root captured at the reader's relevant
197    /// snapshot-time view of the catalog (i.e. the descriptor's
198    /// `primary_root` read via [`crate::Catalog::lookup_via_snapshot`]).
199    /// Walking from the writer's live root would defeat the
200    /// snapshot read; the caller is responsible for handing in the
201    /// snapshot-time root.
202    ///
203    /// # Errors
204    ///
205    /// - [`Error::BTreeDepthExceeded`] if the tree height exceeds
206    ///   [`MAX_BTREE_DEPTH`].
207    /// - [`Error::Corruption`] / [`Error::BTreeInvariantViolated`]
208    ///   propagated from the descent.
209    /// - Snapshot read errors propagated from
210    ///   [`crate::pager::ReaderSnapshot::read_page`].
211    pub fn get_via_snapshot(
212        pager: &Pager<F>,
213        snapshot: &crate::pager::ReaderSnapshot<F>,
214        root: PageId,
215        key: &[u8],
216    ) -> Result<Option<Vec<u8>>> {
217        let mut path: HeaplessVec<PageId, MAX_BTREE_DEPTH> = HeaplessVec::new();
218        let mut current = root;
219        loop {
220            if path.push(current).is_err() {
221                return Err(Error::BTreeDepthExceeded {
222                    limit: MAX_BTREE_DEPTH,
223                });
224            }
225            let page = snapshot.read_page(pager, current)?;
226            match node::peek_node_kind(page.as_bytes())? {
227                // Find-only leaf decode: copies out ONLY the matched
228                // value, retaining every per-slot safety check (see
229                // `decode_node_find`).
230                NodeKind::Leaf => return node::decode_node_find(page.as_bytes(), key),
231                NodeKind::Internal => {
232                    let decoded = decode_node(page.as_bytes())?;
233                    current = choose_child(&decoded, key)?;
234                }
235            }
236        }
237    }
238}
239
240/// Walk root → leaf, returning the leaf page-id that **would** contain
241/// `key`. Used by every read/insert/delete path.
242///
243/// The walk is iterative and uses an explicit `heapless::Vec<PageId,
244/// MAX_BTREE_DEPTH>` stack of visited ancestor ids (Rule 1 + Rule
245/// 3). Exceeding the depth bound returns
246/// [`Error::BTreeDepthExceeded`] rather than recursing into a stack
247/// overflow.
248pub(crate) fn descend_to_leaf<F: FileBackend>(
249    pager: &mut Pager<F>,
250    root: PageId,
251    key: &[u8],
252) -> Result<PageId> {
253    let mut path: HeaplessVec<PageId, MAX_BTREE_DEPTH> = HeaplessVec::new();
254    let mut current = root;
255    loop {
256        if path.push(current).is_err() {
257            return Err(Error::BTreeDepthExceeded {
258                limit: MAX_BTREE_DEPTH,
259            });
260        }
261        let page_ref = pager.read_page(current)?;
262        let decoded = decode_node(page_ref.as_bytes())?;
263        match decoded.kind {
264            NodeKind::Leaf => return Ok(current),
265            NodeKind::Internal => {
266                current = choose_child(&decoded, key)?;
267            }
268        }
269    }
270}
271
272/// Pick the child page-id of `node` whose subtree contains `key`.
273/// Internal-node child selection: find the smallest `i` with
274/// `pivot[i] > key`; descend into `child[i]`. If no such `i`,
275/// descend into `child[K]` (the rightmost child).
276pub(crate) fn choose_child(node: &DecodedNode, key: &[u8]) -> Result<PageId> {
277    debug_assert!(matches!(node.kind, NodeKind::Internal));
278    debug_assert_eq!(node.children.len(), node.internals.len() + 1);
279    if node.children.is_empty() {
280        return Err(Error::BTreeInvariantViolated {
281            reason: "internal node with zero children",
282        });
283    }
284    let mut idx = node.internals.len();
285    for (i, pivot) in node.internals.iter().enumerate() {
286        if pivot.key.as_slice() > key {
287            idx = i;
288            break;
289        }
290    }
291    let raw = node.children[idx];
292    PageId::new(raw).ok_or(Error::BTreeInvariantViolated {
293        reason: "internal node had zero child page-id",
294    })
295}
296
297#[cfg(test)]
298mod tests;