obj_core/btree/mod.rs
1//! B+tree (L3) — copy-on-write B+tree over the pager.
2//!
3//! The B+tree is the data structure both the primary store (M5) and
4//! every secondary index (M7) will compose over. M4 introduces the
5//! data structure itself; M5+ wire it into typed catalog and index
6//! layers.
7//!
8//! See `docs/format.md` § B+tree for the authoritative on-disk
9//! layout this module is the reference implementation of.
10//!
11//! # Power-of-ten posture
12//!
13//! - **Rule 1.** Every tree traversal uses an explicit `heapless::Vec
14//! <PageId, MAX_BTREE_DEPTH>` stack; no recursive descent. Exceeding
15//! the depth bound returns `Error::BTreeDepthExceeded` rather than
16//! exhausting the call stack.
17//! - **Rule 2.** Per-node loops iterate over `key_count`, which is
18//! bounded by the slot directory capacity (a function of
19//! `PAGE_SIZE` known at compile time). Range scans carry an
20//! explicit `MAX_RANGE_NODES` budget.
21//! - **Rule 5.** Node invariants (sorted keys, level monotonicity,
22//! slot-count bounds) are encoded as `debug_assert!`s in
23//! `validate_node`; release builds surface a tripped invariant as
24//! `Error::BTreeInvariantViolated`.
25//! - **Rule 7.** No `unwrap` / `expect` in the production code path.
26//! Decode errors surface as `Error::Corruption`; semantic errors
27//! surface as one of the `BTree*` variants on `Error`.
28
29#![forbid(unsafe_code)]
30
31pub mod delete;
32pub mod insert;
33pub mod node;
34pub mod range;
35
36pub use crate::btree::range::{RangeIter, SnapshotRangeIter};
37
38use crate::error::{Error, Result};
39use crate::pager::page::PageId;
40use crate::pager::Pager;
41use crate::platform::{FileBackend, FileHandle};
42
43use heapless::Vec as HeaplessVec;
44
45/// Maximum B+tree depth.
46///
47/// At fanout ≥ 4 a 32-level tree holds 2^64 entries, which exceeds
48/// the page-id space. The bound exists so [`Rule 1`](crate) tree
49/// traversals always terminate; a tree that somehow grew past 32
50/// levels surfaces as [`Error::BTreeDepthExceeded`] rather than a
51/// stack overflow.
52///
53/// `crate` link target above is documented as `power-of-ten.md`
54/// Rule 1: no unbounded recursion.
55pub const MAX_BTREE_DEPTH: usize = 32;
56
57/// Maximum number of B+tree nodes visited in a single range scan.
58///
59/// Power-of-ten Rule 2: every loop has an explicit upper bound.
60/// 1 000 000 nodes × ~32 slots/node ≈ 32 M entries — comfortably
61/// above the design.md "collection scan" target. The bound is
62/// configurable in M5+ when the query planner needs higher ceilings.
63pub const MAX_RANGE_NODES: usize = 1_000_000;
64
65/// Borrowed view of the in-memory representation of a B+tree node.
66pub use crate::btree::node::{
67 decode_node, encode_node, max_inline_value, max_key_len, validate_node, ChildEntry,
68 DecodedNode, InternalEntry, LeafEntry, NodeKind, NODE_HEADER_SIZE,
69};
70
71/// A B+tree handle.
72///
73/// Construct via [`BTree::empty`] (fresh tree allocating a new root
74/// leaf) or [`BTree::open`] (attach to an existing root page-id).
75/// The handle does not own the pager; the caller passes a `&mut`
76/// reference into mutating methods.
77///
78/// Generic over `F: FileBackend` so the same code works against the
79/// production [`FileHandle`] and the fault-injection harness. M4 only
80/// uses the type parameter as a marker — it appears in the
81/// `Pager<F>` signature of every method.
82///
83/// Single-writer: every mutating method takes `&mut self`. The pager
84/// already enforces single-writer at the file level (M3); the B+tree
85/// inherits the same property.
86#[derive(Debug)]
87pub struct BTree<F: FileBackend = FileHandle> {
88 root: PageId,
89 _phantom: std::marker::PhantomData<fn() -> F>,
90}
91
92impl<F: FileBackend> BTree<F> {
93 /// Attach a B+tree handle to an existing root page-id.
94 ///
95 /// The root page is NOT read here — `BTree::open` is a pure
96 /// constructor that records the root id. The first mutating or
97 /// reading call will fault the root in via `Pager::read_page`.
98 ///
99 /// # Errors
100 ///
101 /// Returns [`Error::InvalidArgument`] if `root` is zero. (Zero
102 /// is the sentinel "no tree"; the caller should use
103 /// [`BTree::empty`] to bootstrap.)
104 pub fn open(_pager: &Pager<F>, root: PageId) -> Result<Self> {
105 Ok(Self {
106 root,
107 _phantom: std::marker::PhantomData,
108 })
109 }
110
111 /// Allocate a fresh empty B+tree.
112 ///
113 /// Allocates one new page through the pager, encodes an empty
114 /// leaf node into it, and writes it through the WAL transaction
115 /// buffer. The caller is responsible for calling
116 /// [`Pager::commit`] before relying on the tree being durable.
117 ///
118 /// # Errors
119 ///
120 /// Propagates pager errors from [`Pager::alloc_page`] and
121 /// [`Pager::write_page`].
122 pub fn empty(pager: &mut Pager<F>) -> Result<Self> {
123 let root = pager.alloc_page()?;
124 let mut page = crate::pager::page::Page::zeroed();
125 encode_node(
126 &DecodedNode {
127 kind: NodeKind::Leaf,
128 level: 0,
129 next_sibling: 0,
130 children: Vec::new(),
131 leaves: Vec::new(),
132 internals: Vec::new(),
133 },
134 &mut page,
135 )?;
136 pager.write_page(root, &page)?;
137 Ok(Self {
138 root,
139 _phantom: std::marker::PhantomData,
140 })
141 }
142
143 /// The current root page-id of this tree.
144 ///
145 /// Mutating methods update this in place after a successful
146 /// write. The caller can persist this id externally so that a
147 /// subsequent [`BTree::open`] reattaches to the same tree.
148 #[must_use]
149 pub fn root(&self) -> PageId {
150 self.root
151 }
152
153 // `set_root` and related insert/delete helpers land in issues
154 // #26 / #27. Per the project policy we add them in the commit
155 // that uses them rather than carrying dead code here.
156
157 /// Look up `key` in the tree. Returns `None` if absent.
158 ///
159 /// The traversal walks root → leaf using an explicit stack of
160 /// page-ids (no recursion, no heap allocation — the stack is a
161 /// `heapless::Vec`).
162 ///
163 /// # Errors
164 ///
165 /// - [`Error::Corruption`] if any page along the path fails to
166 /// decode.
167 /// - [`Error::BTreeDepthExceeded`] if the depth bound trips
168 /// (would require a tree height ≥ 32).
169 /// - [`Error::Io`] / [`Error::InvalidArgument`] propagated from
170 /// the pager on a cache-miss read.
171 pub fn get(&self, pager: &mut Pager<F>, key: &[u8]) -> Result<Option<Vec<u8>>> {
172 let leaf_id = descend_to_leaf(pager, self.root, key)?;
173 let page_ref = pager.read_page(leaf_id)?;
174 // Find-only decode: copies out ONLY the matched value (an owned
175 // `Vec<u8>`), not all 2N entries. Retains every per-slot
176 // bounds/varint/order check on the slots it reads (see
177 // `decode_node_find`). Mutations and the integrity checker keep
178 // the full `decode_node` + `validate_node_release`.
179 node::decode_node_find(page_ref.as_bytes(), key)
180 }
181
182 /// Snapshot-consistent variant of [`BTree::get`] — descends the
183 /// tree using [`crate::pager::ReaderSnapshot::read_page`] rather
184 /// than the live `Pager::read_page` (which consults the WAL
185 /// overlay — including a concurrent writer's post-snapshot
186 /// commits).
187 ///
188 /// Used by the M6 #53 fix: a reader's `Collection<T>::get` must
189 /// walk the primary B+tree consistently with the snapshot's
190 /// pinned LSN. Without this, a writer that COW-frees a page and
191 /// reallocates the same page-id can serve the reader a page
192 /// whose `state.view` content is no longer a valid B+tree node,
193 /// surfacing as `Error::Corruption { page_id: 0 }` from
194 /// [`crate::btree::node::decode_node`].
195 ///
196 /// `root` is the B+tree root captured at the reader's relevant
197 /// snapshot-time view of the catalog (i.e. the descriptor's
198 /// `primary_root` read via [`crate::Catalog::lookup_via_snapshot`]).
199 /// Walking from the writer's live root would defeat the
200 /// snapshot read; the caller is responsible for handing in the
201 /// snapshot-time root.
202 ///
203 /// # Errors
204 ///
205 /// - [`Error::BTreeDepthExceeded`] if the tree height exceeds
206 /// [`MAX_BTREE_DEPTH`].
207 /// - [`Error::Corruption`] / [`Error::BTreeInvariantViolated`]
208 /// propagated from the descent.
209 /// - Snapshot read errors propagated from
210 /// [`crate::pager::ReaderSnapshot::read_page`].
211 pub fn get_via_snapshot(
212 pager: &Pager<F>,
213 snapshot: &crate::pager::ReaderSnapshot<F>,
214 root: PageId,
215 key: &[u8],
216 ) -> Result<Option<Vec<u8>>> {
217 let mut path: HeaplessVec<PageId, MAX_BTREE_DEPTH> = HeaplessVec::new();
218 let mut current = root;
219 loop {
220 if path.push(current).is_err() {
221 return Err(Error::BTreeDepthExceeded {
222 limit: MAX_BTREE_DEPTH,
223 });
224 }
225 let page = snapshot.read_page(pager, current)?;
226 match node::peek_node_kind(page.as_bytes())? {
227 // Find-only leaf decode: copies out ONLY the matched
228 // value, retaining every per-slot safety check (see
229 // `decode_node_find`).
230 NodeKind::Leaf => return node::decode_node_find(page.as_bytes(), key),
231 NodeKind::Internal => {
232 let decoded = decode_node(page.as_bytes())?;
233 current = choose_child(&decoded, key)?;
234 }
235 }
236 }
237 }
238}
239
240/// Walk root → leaf, returning the leaf page-id that **would** contain
241/// `key`. Used by every read/insert/delete path.
242///
243/// The walk is iterative and uses an explicit `heapless::Vec<PageId,
244/// MAX_BTREE_DEPTH>` stack of visited ancestor ids (Rule 1 + Rule
245/// 3). Exceeding the depth bound returns
246/// [`Error::BTreeDepthExceeded`] rather than recursing into a stack
247/// overflow.
248pub(crate) fn descend_to_leaf<F: FileBackend>(
249 pager: &mut Pager<F>,
250 root: PageId,
251 key: &[u8],
252) -> Result<PageId> {
253 let mut path: HeaplessVec<PageId, MAX_BTREE_DEPTH> = HeaplessVec::new();
254 let mut current = root;
255 loop {
256 if path.push(current).is_err() {
257 return Err(Error::BTreeDepthExceeded {
258 limit: MAX_BTREE_DEPTH,
259 });
260 }
261 let page_ref = pager.read_page(current)?;
262 let decoded = decode_node(page_ref.as_bytes())?;
263 match decoded.kind {
264 NodeKind::Leaf => return Ok(current),
265 NodeKind::Internal => {
266 current = choose_child(&decoded, key)?;
267 }
268 }
269 }
270}
271
272/// Pick the child page-id of `node` whose subtree contains `key`.
273/// Internal-node child selection: find the smallest `i` with
274/// `pivot[i] > key`; descend into `child[i]`. If no such `i`,
275/// descend into `child[K]` (the rightmost child).
276pub(crate) fn choose_child(node: &DecodedNode, key: &[u8]) -> Result<PageId> {
277 debug_assert!(matches!(node.kind, NodeKind::Internal));
278 debug_assert_eq!(node.children.len(), node.internals.len() + 1);
279 if node.children.is_empty() {
280 return Err(Error::BTreeInvariantViolated {
281 reason: "internal node with zero children",
282 });
283 }
284 let mut idx = node.internals.len();
285 for (i, pivot) in node.internals.iter().enumerate() {
286 if pivot.key.as_slice() > key {
287 idx = i;
288 break;
289 }
290 }
291 let raw = node.children[idx];
292 PageId::new(raw).ok_or(Error::BTreeInvariantViolated {
293 reason: "internal node had zero child page-id",
294 })
295}
296
297#[cfg(test)]
298mod tests;